Untitled
unknown
python
a year ago
2.1 kB
4
Indexable
import os
from concurrent.futures import ProcessPoolExecutor
from itertools import islice

import pandas as pd
import spacy

# Load the spaCy model for Italian. It is loaded at module level so that it
# is available inside every worker process of the pool.
nlp = spacy.load('it_core_news_sm')
# Raise spaCy's input-length guard so that very large files are accepted.
nlp.max_length = 1000000000000


def calcola_ttr_100_parole(testo):
    """Return the type-token ratio (TTR) of the first 100 words of *testo*.

    The text is lower-cased and tokenized with spaCy. Punctuation tokens,
    digit-only tokens and whitespace-only tokens are discarded, then the
    first 100 remaining tokens are used.

    Args:
        testo: Raw text of one document.

    Returns:
        The number of distinct tokens divided by the number of tokens
        considered (at most 100), or 0 when there is no countable token.
    """
    # Only tokenizer-level lexical flags are read below, so tokenize without
    # running the rest of the pipeline over the whole document.
    doc = nlp.make_doc(testo.lower())
    # spaCy emits separate tokens for newlines and runs of spaces; they are
    # not words, so they must be filtered out alongside punctuation/digits.
    words = (
        token.text
        for token in doc
        if not (token.is_punct or token.is_digit or token.is_space)
    )
    tokens = list(islice(words, 100))  # Keep only the first 100 words
    total_tokens = len(tokens)
    total_types = len(set(tokens))
    return total_types / total_tokens if total_tokens > 0 else 0


def process_file(file_path):
    """Read *file_path* as UTF-8 text and return its 100-word TTR.

    Args:
        file_path: Path of the text file to analyze.

    Returns:
        The value computed by ``calcola_ttr_100_parole`` for the file content.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        testo = f.read()
    return calcola_ttr_100_parole(testo)


# Module-level placeholder kept for backward compatibility; it is replaced
# with the real results when the module is run as a script.
ttr_values = []


def analizza_cartelle(root_folder):
    """Compute the 100-word TTR of every file found under *root_folder*.

    The directory tree is walked recursively and the files are processed in
    parallel with a process pool.

    Args:
        root_folder: Root directory containing the documents.

    Returns:
        A list with one TTR value per file, ordered by sorted file path.
    """
    # Hidden files (e.g. macOS ".DS_Store") are not corpus documents and are
    # usually not valid UTF-8, so they are skipped. Sorting makes the order
    # of the results reproducible, since os.walk order is filesystem-defined.
    file_paths = sorted(
        os.path.join(root, name)
        for root, _dirs, files in os.walk(root_folder)
        for name in files
        if not name.startswith('.')
    )
    if not file_paths:
        # Nothing to analyze: avoid starting worker processes for no work.
        return []

    with ProcessPoolExecutor() as executor:
        return list(executor.map(process_file, file_paths))


if __name__ == '__main__':
    # Path of the root folder
    percorso_cartella_radice = "/Users/lauraocchipinti/analisi_lessicale/analisi_lessicale/docs_coris"

    ttr_values = analizza_cartelle(percorso_cartella_radice)

    # Mean and standard deviation of the TTR (the Series is built only once)
    ttr_series = pd.Series(ttr_values, dtype=float)
    mean_ttr = ttr_series.mean()
    std_ttr = ttr_series.std()

    print(f"Media TTR: {mean_ttr}")
    print(f"Deviazione Standard TTR: {std_ttr}")

    # Save the per-file results
    df_ttr = pd.DataFrame(ttr_values, columns=['TTR'])
    df_ttr.to_csv('ttr_100_parole_Coris.csv', index=False)

    # Save the summary statistics
    summary_stats = {
        'Media TTR': mean_ttr,
        'Deviazione Standard TTR': std_ttr
    }
    pd.DataFrame([summary_stats]).to_csv('ttr_summary_Coris.csv', index=False)

    print("Dettagli del TTR per le prime 100 parole salvati in 'ttr_100_parole_Coris.csv'.")
    print("Statistiche riassuntive del TTR salvate in 'ttr_summary_Coris.csv'.")
    print(summary_stats)
Editor is loading...
Leave a Comment