# Calculate the share of De Mauro's basic Italian vocabulary in a corpus of texts.
# Standard library
import multiprocessing as mp
import os
import re

# Third-party
import spacy
import torch

# Debug check: report whether a CUDA device is visible to torch.
print(torch.cuda.is_available())

# Load the spaCy model for Italian; raise max_length so very large corpora
# can be processed in a single nlp() call without spaCy's length guard firing.
nlp = spacy.load('it_core_news_sm')
nlp.max_length = 250000000


# Function to read De Mauro's basic vocabulary
# Function to read De Mauro's basic vocabulary
def load_de_mauro_vocabulary(file_path):
    """Load De Mauro's basic vocabulary into a set of normalized words.

    Each entry is stripped of surrounding whitespace and lowercased so it
    can match the lowercased tokens produced by ``preprocess_text``; blank
    lines are skipped. (The original raw ``splitlines`` kept trailing
    whitespace and case, which silently broke membership tests.)

    Args:
        file_path: Path to a UTF-8 text file with one word per line.

    Returns:
        set[str]: The normalized vocabulary.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return {word.strip().lower() for word in f if word.strip()}


# Function to preprocess the text
# Function to preprocess the text
def preprocess_text(text):
    """Return a lowercased copy of *text* with punctuation collapsed to spaces.

    Every run of non-word characters (Unicode-aware, so accented Italian
    letters survive) is replaced by a single space.
    """
    return re.sub(r'\W+', ' ', text.lower())


# Function to calculate the percentage of words from the basic vocabulary
# Function to calculate the percentage of words from the basic vocabulary
def calculate_de_mauro_percentage(corpus, vocabulary):
    """Return the percentage of tokens in *corpus* found in *vocabulary*.

    The corpus is tokenized with the module-level spaCy pipeline; punctuation
    and whitespace tokens are ignored. Returns 0 when the corpus yields no
    countable tokens.
    """
    doc = nlp(corpus)
    total_words = 0
    base_word_count = 0
    for token in doc:
        # Skip tokens that are not real words.
        if token.is_punct or token.is_space:
            continue
        total_words += 1
        if token.text in vocabulary:
            base_word_count += 1
    if total_words == 0:
        return 0
    return (base_word_count / total_words) * 100


# Function to read a single text file
# Function to read a single text file
def read_single_file(file_path):
    """Read and return the full UTF-8 contents of the file at *file_path*."""
    with open(file_path, 'r', encoding='utf-8') as handle:
        return handle.read()


# Function to read all text files from a folder and subfolders using multiprocessing
# Function to read all text files from a folder and subfolders using multiprocessing
def read_texts_from_folders(root_folder):
    """Recursively collect every file under *root_folder* and read them all.

    File discovery is sequential (``os.walk``); the reads themselves are
    farmed out to a process pool sized to the machine's CPU count.

    Returns:
        list[str]: One entry per file, in ``os.walk`` discovery order
        (``pool.map`` preserves input order).
    """
    file_paths = [
        os.path.join(parent, name)
        for parent, _dirs, names in os.walk(root_folder)
        for name in names
    ]

    # Read files concurrently across worker processes.
    with mp.Pool(mp.cpu_count()) as pool:
        return pool.map(read_single_file, file_paths)


if __name__ == '__main__':
    # Root folder whose subfolders contain the corpus texts.
    root_folder_path = 'C:/Users/kemal/PycharmProjects/la/docs'

    # De Mauro's basic vocabulary file, one word per line.
    vocabulary_file = 'nvdb.words.txt'

    # Build the lookup set once, before touching the corpus.
    de_mauro_vocabulary = load_de_mauro_vocabulary(vocabulary_file)

    # Load every document, then normalize (lowercase, punctuation removed).
    texts = read_texts_from_folders(root_folder_path)
    preprocessed_texts = [preprocess_text(text) for text in texts]

    print("Total Preprocessed Texts:", len(preprocessed_texts))

    # Per-document share of basic-vocabulary words, with a progress print.
    percentages = []
    for text in preprocessed_texts:
        percentages.append(calculate_de_mauro_percentage(text, de_mauro_vocabulary))
        print("Count:", len(percentages))

    # Macro average: mean of per-document percentages (documents weigh equally,
    # regardless of length).
    if percentages:
        overall_percentage = sum(percentages) / len(percentages)
        print(f'Overall percentage of De Mauro\'s basic vocabulary: {overall_percentage:.2f}%')
    else:
        overall_percentage = 0
        print('No texts found in the specified folder.')