# Lexical analysis: share of corpus tokens found in De Mauro's basic vocabulary.

import os
import re
import spacy

# Italian spaCy pipeline used by calculate_de_mauro_percentage.
nlp = spacy.load('it_core_news_sm')
# The whole concatenated corpus is passed to nlp() in a single call, so the
# per-text length limit is raised far above any realistic corpus size.
# NOTE(review): presumably done to avoid a length rejection — confirm that
# memory use is acceptable for the full corpus.
nlp.max_length = 1_000_000_000_000

# Function to read De Mauro's basic vocabulary
def load_de_mauro_vocabulary(file_path):
    """Load a one-word-per-line vocabulary file into a set.

    Args:
        file_path: Path to a UTF-8 text file with one vocabulary entry per line.

    Returns:
        A set of the entries, with surrounding whitespace removed and blank
        lines skipped.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # 'utf-8-sig' reads plain UTF-8 unchanged but drops a leading BOM, which
    # would otherwise be glued to the first entry and make it unmatchable.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        lines = f.read().splitlines()
    # Membership tests are exact string matches, so stray spaces around an
    # entry would silently prevent it from ever matching a token.
    stripped = (line.strip() for line in lines)
    return {entry for entry in stripped if entry}

# Function to preprocess the text
def preprocess_text(text):
    """Lowercase ``text`` and replace each run of non-word characters with one space.

    Word characters (letters, digits, underscore) are kept; punctuation,
    apostrophes and whitespace runs each become a single space.
    """
    lowered = text.lower()
    return re.sub(r'\W+', ' ', lowered)

# Function to calculate the percentage of words from the basic vocabulary
def calculate_de_mauro_percentage(corpus, vocabulary):
    """Return the percentage of corpus tokens that appear in ``vocabulary``.

    The corpus is tokenized with the module-level ``nlp`` pipeline.
    Punctuation and whitespace tokens are ignored; every other token is
    counted, and its exact text is looked up in ``vocabulary``.

    Args:
        corpus: The text to analyse, as a single string.
        vocabulary: A container of words supporting ``in`` (ideally a set).

    Returns:
        A float between 0 and 100. Returns 0.0 when the corpus contains no
        countable tokens, instead of raising ``ZeroDivisionError``.
    """
    # NOTE(review): only token.text / is_punct / is_space are read here; if no
    # pipeline component retokenizes, nlp.make_doc(corpus) might avoid running
    # the full pipeline on a very large text — confirm before switching.
    doc = nlp(corpus)

    # Count in a single pass so no intermediate token lists are kept in
    # memory for a large corpus.
    total_words = 0
    base_word_count = 0
    for token in doc:
        if token.is_punct or token.is_space:
            continue
        total_words += 1
        if token.text in vocabulary:
            base_word_count += 1

    # An empty corpus (e.g. no .txt files found) has no meaningful ratio.
    if total_words == 0:
        return 0.0
    return (base_word_count / total_words) * 100

# Function to read all text files from a folder and subfolders
def read_texts_from_folders(root_folder):
    """Concatenate the contents of every ``.txt`` file under ``root_folder``.

    The folder tree is walked recursively. Each file is read as UTF-8 and its
    text is followed by a single space in the returned string. Files without
    a ``.txt`` suffix are skipped. Returns an empty string when nothing
    matches.
    """
    pieces = []
    for current_dir, _subdirs, filenames in os.walk(root_folder):
        for name in filenames:
            if not name.endswith('.txt'):
                continue
            full_path = os.path.join(current_dir, name)
            with open(full_path, 'r', encoding='utf-8') as handle:
                pieces.append(handle.read())
            # Separator so the last word of one file never fuses with the
            # first word of the next.
            pieces.append(" ")
    return "".join(pieces)

# Path to the root folder containing subfolders with texts.
# NOTE: absolute, machine-specific path — adjust before running elsewhere.
root_folder_path = '/Users/lauraocchipinti/analisi_lessicale/analisi_lessicale/corpus_covicor copia 2'

# Path to De Mauro's basic vocabulary file (read as one entry per line).
vocabulary_file = '/Users/lauraocchipinti/analisi_lessicale/analisi_lessicale/nvdb.words.txt'

# Load De Mauro's basic vocabulary into a set for fast membership tests.
de_mauro_vocabulary = load_de_mauro_vocabulary(vocabulary_file)

# Read and concatenate all .txt files found under the root folder.
complete_corpus = read_texts_from_folders(root_folder_path)

# Preprocess the text: lowercase it and replace non-word characters with spaces.
# Tokens are later matched against the vocabulary by exact string, so
# vocabulary entries need to be lowercase to match.
preprocessed_corpus = preprocess_text(complete_corpus)

# Calculate the percentage of corpus tokens found in the basic vocabulary.
percentage = calculate_de_mauro_percentage(preprocessed_corpus, de_mauro_vocabulary)

# Report the result with two decimal places.
print(f"Percentage of words belonging to De Mauro's basic vocabulary: {percentage:.2f}%")
# Editor is loading...  (web-paste residue, not part of the script)