# Paste-site page metadata (residue from the web page, not part of the script):
# Untitled / unknown / python / a year ago / 2.1 kB / 25 / Indexable
import os
import spacy
import pandas as pd
from concurrent.futures import ProcessPoolExecutor
# Load spaCy's Italian model once at import time; the functions below use it
# through the module-level name `nlp`.
nlp = spacy.load("it_core_news_sm")
# Raise spaCy's per-text character limit so that very long documents are not
# rejected before tokenization.
nlp.max_length = 1_000_000_000_000
def calcola_ttr_100_parole(testo):
    """Return the type-token ratio (TTR) of the first 100 words of *testo*.

    The text is lower-cased and tokenized with the module-level spaCy
    pipeline ``nlp``. Punctuation, digit and whitespace tokens are discarded,
    and only the first 100 remaining tokens are considered.

    Args:
        testo: Raw text to analyse.

    Returns:
        The number of distinct tokens divided by the number of tokens in the
        100-word window, or 0.0 when the text contains no countable token.
    """
    # Only lexical attributes (text, is_punct, is_digit, is_space) are read,
    # so tokenization alone is sufficient: make_doc skips the statistical
    # pipeline components (tagger, parser, NER, ...) that __call__ would run.
    doc = nlp.make_doc(testo.lower())
    # spaCy emits newlines and runs of spaces as tokens of their own; they are
    # not words and must not count towards the window or the type inventory.
    tokens = [
        token.text
        for token in doc
        if not (token.is_punct or token.is_digit or token.is_space)
    ][:100]  # Keep only the first 100 words
    if not tokens:
        return 0.0
    return len(set(tokens)) / len(tokens)
def process_file(file_path):
    """Read *file_path* as UTF-8 text and return its 100-word TTR.

    Args:
        file_path: Path of the text file to analyse.

    Returns:
        The value computed by ``calcola_ttr_100_parole`` on the file content.
    """
    with open(file_path, mode='r', encoding='utf-8') as sorgente:
        contenuto = sorgente.read()
    # The file is closed before the (potentially slow) analysis starts.
    risultato = calcola_ttr_100_parole(contenuto)
    return risultato
# Module-level placeholder. analizza_cartelle assigns to its own local
# variable of the same name (no `global` statement), and the __main__ section
# rebinds this name with the analysis results.
ttr_values = []
def analizza_cartelle(root_folder):
    """Compute the 100-word TTR of every file found under *root_folder*.

    The folder is walked recursively and each file is analysed with
    ``process_file`` in a pool of worker processes.

    Args:
        root_folder: Root directory of the corpus.

    Returns:
        A list with one TTR value per analysed file, ordered by file path.
        The list is empty when no file is found.
    """
    file_paths = []
    for root, _dirs, files in os.walk(root_folder):
        for name in files:
            # Hidden files (e.g. macOS ".DS_Store") are filesystem metadata,
            # not corpus text; reading them as UTF-8 would abort the run.
            if name.startswith('.'):
                continue
            file_paths.append(os.path.join(root, name))

    # os.walk order depends on the filesystem; sorting makes the order of the
    # returned values reproducible across runs and machines.
    file_paths.sort()

    # Nothing to analyse: avoid starting a process pool for no work.
    if not file_paths:
        return []

    with ProcessPoolExecutor() as executor:
        # executor.map yields results in the same order as file_paths.
        return list(executor.map(process_file, file_paths))
if __name__ == '__main__':
    # Root folder of the corpus to analyse.
    percorso_cartella_radice = "/Users/lauraocchipinti/analisi_lessicale/analisi_lessicale/docs_coris"
    ttr_values = analizza_cartelle(percorso_cartella_radice)

    # Mean and standard deviation of the per-file TTR values. The Series is
    # built once; the explicit float dtype keeps an empty result numeric.
    serie_ttr = pd.Series(ttr_values, dtype=float)
    mean_ttr = serie_ttr.mean()
    std_ttr = serie_ttr.std()
    print(f"Media TTR: {mean_ttr}")
    print(f"Deviazione Standard TTR: {std_ttr}")

    # Save the per-file values, one row per analysed file.
    df_ttr = pd.DataFrame(ttr_values, columns=['TTR'])
    df_ttr.to_csv('ttr_100_parole_Coris.csv', index=False)

    # Save the summary statistics as a single-row CSV.
    summary_stats = {
        'Media TTR': mean_ttr,
        'Deviazione Standard TTR': std_ttr
    }
    pd.DataFrame([summary_stats]).to_csv('ttr_summary_Coris.csv', index=False)

    print("Dettagli del TTR per le prime 100 parole salvati in 'ttr_100_parole_Coris.csv'.")
    print("Statistiche riassuntive del TTR salvate in 'ttr_summary_Coris.csv'.")
    print(summary_stats)
# Leave a Comment  (paste-site page footer, not part of the script)