Untitled
unknown
python
a year ago
3.6 kB
8
Indexable
# Report how many labelled content words in a corpus are SIMPLE vs. COMPLEX.
#
# Every ``.txt`` file under the corpus directory is read, words annotated as
# ``word:LABEL`` are extracted, and for each file the script prints the share
# of ``SIMPLE`` and ``COMPLEX`` content words, their mutual ratios, and a final
# ``Mean`` row averaging each column over all files.

import os
import re
import sys
from functools import lru_cache

import pandas as pd

# Default corpus location; replace with the correct path, or pass a directory
# as the first command-line argument.
CORPUS_DIR = '/Users/lauraocchipinti/PycharmProjects/phd_thesis/labeled_corpus'

# POS tags treated as content words: nouns, verbs, adjectives, adverbs.
CONTENT_POS = frozenset({'NOUN', 'VERB', 'ADJ', 'ADV'})

# A labelled word has the form ``word:LABEL``.
LABELED_WORD_RE = re.compile(r'\b\w+:\w+\b')

RESULT_COLUMNS = [
    'Simple Percentage',
    'Complex Percentage',
    'Simple/Complex Ratio',
    'Complex/Simple Ratio',
    'Simple Total Percentage',
    'Complex Total Percentage',
]


@lru_cache(maxsize=1)
def _load_nlp():
    """Load the Italian spaCy pipeline on first use and reuse it afterwards."""
    # Imported lazily so that importing this module does not require loading
    # spaCy and its model unless a text is actually analysed.
    import spacy

    return spacy.load("it_core_news_sm")


def _safe_div(numerator, denominator):
    """Return ``numerator / denominator``, or 0 when the denominator is not positive."""
    return numerator / denominator if denominator > 0 else 0


def calculate_percentages_and_ratios(text, nlp=None):
    """Compute SIMPLE/COMPLEX statistics for one labelled text.

    Args:
        text: Text containing words annotated as ``word:LABEL``.
        nlp: Optional callable that turns ``text`` into a sized iterable of
            tokens exposing ``text`` and ``pos_``. Defaults to the spaCy
            ``it_core_news_sm`` pipeline.

    Returns:
        A 6-tuple ``(simple_percentage, complex_percentage,
        simple_to_complex_ratio, complex_to_simple_ratio,
        simple_total_percentage, complex_total_percentage)``. The first two
        are relative to the number of labelled content words, the last two to
        the number of tokens in the analysed text. Any value whose
        denominator is zero is reported as 0.
    """
    if nlp is None:
        nlp = _load_nlp()
    doc = nlp(text)

    # Surface forms tagged as a content word at least once in this text.
    # Building the set once avoids rescanning every token per labelled word.
    content_forms = {token.text for token in doc if token.pos_ in CONTENT_POS}

    # Keep only labelled words whose word part is a content word. Words with
    # labels other than SIMPLE/COMPLEX still count toward the labelled total.
    content_words = [
        word
        for word in LABELED_WORD_RE.findall(text)
        if word.partition(':')[0] in content_forms
    ]

    total_labeled_words = len(content_words)
    simple_words = sum(1 for word in content_words if word.endswith(':SIMPLE'))
    complex_words = sum(1 for word in content_words if word.endswith(':COMPLEX'))

    # NOTE(review): this counts every token produced for the *labelled* text,
    # which presumably includes punctuation and the label tokens themselves.
    # Confirm that this is the intended denominator.
    total_words = len(doc)

    simple_percentage = _safe_div(simple_words, total_labeled_words) * 100
    complex_percentage = _safe_div(complex_words, total_labeled_words) * 100
    simple_total_percentage = _safe_div(simple_words, total_words) * 100
    complex_total_percentage = _safe_div(complex_words, total_words) * 100

    simple_to_complex_ratio = _safe_div(simple_words, complex_words)
    complex_to_simple_ratio = _safe_div(complex_words, simple_words)

    return (
        simple_percentage,
        complex_percentage,
        simple_to_complex_ratio,
        complex_to_simple_ratio,
        simple_total_percentage,
        complex_total_percentage,
    )


def _read_corpus(corpus_dir):
    """Return the contents of every ``.txt`` file under ``corpus_dir``, in sorted order."""
    texts = []
    for root, dirs, files in os.walk(corpus_dir):
        # Sorting in place makes os.walk descend in a stable order, so the
        # rows of the report are reproducible across runs and filesystems.
        dirs.sort()
        for name in sorted(files):
            if name.endswith('.txt'):
                with open(os.path.join(root, name), 'r', encoding='utf-8') as f:
                    texts.append(f.read())
    return texts


def _build_report(results):
    """Build the per-text table and append a ``Mean`` row averaging each column."""
    if results:
        means = [sum(column) / len(results) for column in zip(*results)]
    else:
        means = [0] * len(RESULT_COLUMNS)
    df = pd.DataFrame(results, columns=RESULT_COLUMNS)
    df.loc['Mean'] = means
    return df


def main(corpus_dir=CORPUS_DIR):
    """Analyse every text in ``corpus_dir`` and print the resulting table."""
    texts = _read_corpus(corpus_dir)
    if not texts:
        # Keep printing the (all-zero) table, but say why it is empty.
        print(f"warning: no .txt files found under {corpus_dir!r}", file=sys.stderr)
    results = [calculate_percentages_and_ratios(text) for text in texts]
    print(_build_report(results))


if __name__ == "__main__":
    main(sys.argv[1] if len(sys.argv) > 1 else CORPUS_DIR)
Editor is loading...
Leave a Comment