Untitled

 avatar
unknown
python
a year ago
2.1 kB
4
Indexable
import os
import spacy
import pandas as pd
from concurrent.futures import ProcessPoolExecutor

# Load the spaCy model for Italian; this pipeline object is shared by the
# functions below.
nlp = spacy.load('it_core_news_sm')
# Set a very large max_length on the pipeline -- presumably so that long
# corpus files are not rejected for exceeding spaCy's text-length limit.
nlp.max_length = 1000000000000


def calcola_ttr_100_parole(testo):
    """Return the type-token ratio (TTR) of the first 100 words of a text.

    The text is lower-cased and tokenized with the module-level spaCy
    pipeline ``nlp``. Punctuation tokens, digit-only tokens and
    whitespace-only tokens are discarded; the first 100 remaining tokens
    form the sample on which the ratio is computed.

    Args:
        testo: Raw text to analyse.

    Returns:
        The number of distinct tokens divided by the number of tokens in the
        sample, or 0 when the sample is empty.
    """
    # Only tokenization and lexical flags are read below, so run the
    # tokenizer alone instead of the full pipeline.
    # NOTE(review): assumes no component of the loaded pipeline retokenizes.
    doc = nlp.make_doc(testo.lower())

    # Whitespace-only tokens (newlines, repeated spaces) are neither
    # punctuation nor digits; without the is_space check they would be
    # counted as words and take up slots in the 100-word sample.
    parole = [
        token.text
        for token in doc
        if not (token.is_punct or token.is_digit or token.is_space)
    ]
    campione = parole[:100]  # keep only the first 100 words

    if not campione:
        return 0
    return len(set(campione)) / len(campione)


def process_file(file_path):
    """Read a UTF-8 text file and return the TTR of its first 100 words.

    Args:
        file_path: Path of the file to read.

    Returns:
        The value computed by ``calcola_ttr_100_parole`` on the file content.
    """
    with open(file_path, mode='r', encoding='utf-8') as sorgente:
        contenuto = sorgente.read()
    return calcola_ttr_100_parole(contenuto)

# Module-level default; the script entry point rebinds this name.
ttr_values = []


def analizza_cartelle(root_folder):
    """Compute the 100-word TTR of every file found under a folder.

    The folder is walked recursively and each file is handed to
    ``process_file`` in a pool of worker processes.

    Args:
        root_folder: Directory to scan recursively.

    Returns:
        A list with one TTR value per processed file, ordered by sorted
        file path. The list is empty when no file is found.
    """
    file_paths = []
    for cartella, _sottocartelle, nomi_file in os.walk(root_folder):
        for nome in nomi_file:
            # Hidden files (e.g. macOS ".DS_Store") are not corpus texts and
            # typically fail UTF-8 decoding, which would abort the whole run.
            if nome.startswith('.'):
                continue
            file_paths.append(os.path.join(cartella, nome))

    # os.walk yields entries in filesystem order; sorting makes the order of
    # the returned values reproducible across runs and machines.
    file_paths.sort()

    if not file_paths:
        # Nothing to do: avoid starting worker processes for an empty input.
        return []

    with ProcessPoolExecutor() as executor:
        # executor.map preserves input order, so results align with file_paths.
        return list(executor.map(process_file, file_paths))


if __name__ == '__main__':
    # Root folder of the corpus to analyse
    percorso_cartella_radice = "/Users/lauraocchipinti/analisi_lessicale/analisi_lessicale/docs_coris"
    ttr_values = analizza_cartelle(percorso_cartella_radice)

    # Mean and standard deviation of the per-file TTR values
    serie_ttr = pd.Series(ttr_values)
    mean_ttr = serie_ttr.mean()
    std_ttr = serie_ttr.std()

    print(f"Media TTR: {mean_ttr}")
    print(f"Deviazione Standard TTR: {std_ttr}")

    # Save the per-file values, one row per file
    df_ttr = pd.DataFrame({'TTR': ttr_values})
    df_ttr.to_csv('ttr_100_parole_Coris.csv', index=False)

    # Save the summary statistics as a single-row table
    summary_stats = {
        'Media TTR': mean_ttr,
        'Deviazione Standard TTR': std_ttr
    }
    df_summary = pd.DataFrame([summary_stats])
    df_summary.to_csv('ttr_summary_Coris.csv', index=False)

    print("Dettagli del TTR per le prime 100 parole salvati in 'ttr_100_parole_Coris.csv'.")
    print("Statistiche riassuntive del TTR salvate in 'ttr_summary_Coris.csv'.")
    print(summary_stats)
Editor is loading...
Leave a Comment