Untitled
unknown
python
a year ago
3.3 kB
10
Indexable
import os
import re
import pandas as pd
import spacy
from multiprocessing import Pool, cpu_count
nlp = spacy.load("it_core_news_sm")
def calculate_percentages_and_ratios(text):
    """Compute SIMPLE/COMPLEX label statistics for one labeled text.

    Labeled words are the ``word:LABEL`` spans matched in ``text``. A labeled
    word is kept only when its word part equals the text of at least one
    token that the module-level ``nlp`` pipeline tags as NOUN, VERB, ADJ or
    ADV.

    Args:
        text: The labeled text to analyse.

    Returns:
        A 6-tuple, in this order:
        ``simple_percentage`` and ``complex_percentage`` (share of the kept
        labeled words ending in ``:SIMPLE`` / ``:COMPLEX``, in percent),
        ``simple_to_complex_ratio`` and ``complex_to_simple_ratio``,
        ``simple_total_percentage`` and ``complex_total_percentage`` (the
        same counts relative to ``len(doc)``, in percent).
        Any value whose denominator is zero is reported as 0.
    """
    # Parse the text with spaCy.
    doc = nlp(text)

    # Collect the surface forms of all content-word tokens once, so that each
    # labeled word needs a single set lookup instead of a scan over the doc.
    content_pos = {'NOUN', 'VERB', 'ADJ', 'ADV'}
    content_forms = {token.text for token in doc if token.pos_ in content_pos}

    # Find every labeled word and keep those whose word part is a content word.
    labeled_words = re.findall(r'\b\w+:\w+\b', text)
    content_words = [
        word for word in labeled_words
        if word.split(':')[0] in content_forms
    ]

    total_labeled_words = len(content_words)
    simple_words = sum(1 for word in content_words if word.endswith(':SIMPLE'))
    complex_words = sum(1 for word in content_words if word.endswith(':COMPLEX'))

    # NOTE(review): len(doc) is the number of spaCy tokens produced for the
    # labeled text, which presumably also counts punctuation and the label
    # fragments themselves - confirm this is the intended denominator.
    total_words = len(doc)

    def scaled_ratio(numerator, denominator, scale=1):
        # Guard against division by zero; an undefined value is reported as 0.
        return (numerator / denominator) * scale if denominator > 0 else 0

    # Percentages relative to the kept labeled words and to all tokens.
    simple_percentage = scaled_ratio(simple_words, total_labeled_words, 100)
    complex_percentage = scaled_ratio(complex_words, total_labeled_words, 100)
    simple_total_percentage = scaled_ratio(simple_words, total_words, 100)
    complex_total_percentage = scaled_ratio(complex_words, total_words, 100)

    # Ratios between the two label counts.
    simple_to_complex_ratio = scaled_ratio(simple_words, complex_words)
    complex_to_simple_ratio = scaled_ratio(complex_words, simple_words)

    return (
        simple_percentage,
        complex_percentage,
        simple_to_complex_ratio,
        complex_to_simple_ratio,
        simple_total_percentage,
        complex_total_percentage,
    )
def process_file(file_path):
    """Read one UTF-8 text file and return its label statistics.

    Args:
        file_path: Path of the labeled text file to read.

    Returns:
        Whatever ``calculate_percentages_and_ratios`` returns for the file's
        full contents.
    """
    with open(file_path, encoding='utf-8') as handle:
        contents = handle.read()
    return calculate_percentages_and_ratios(contents)
if __name__ == '__main__':
    # The corpus is expected in a 'labeled_corpus' folder next to this script.
    corpus_dir = os.path.join(os.path.dirname(__file__), 'labeled_corpus')

    # Gather every .txt file below the corpus directory. The paths are sorted
    # so the row order of the report does not depend on the order in which
    # the filesystem happens to list the entries.
    file_paths = sorted(
        os.path.join(root, file_name)
        for root, _dirs, files in os.walk(corpus_dir)
        for file_name in files
        if file_name.endswith('.txt')
    )

    # Use multiprocessing to process files in parallel.
    with Pool(processes=cpu_count()) as pool:
        results = pool.map(process_file, file_paths)

    columns = [
        'Simple Percentage',
        'Complex Percentage',
        'Simple/Complex Ratio',
        'Complex/Simple Ratio',
        'Simple Total Percentage',
        'Complex Total Percentage',
    ]

    # Average each statistic over all files; with no files every mean is 0.
    if results:
        means = [sum(column) / len(results) for column in zip(*results)]
    else:
        means = [0] * len(columns)

    # Show the per-file results followed by a final 'Mean' row.
    df = pd.DataFrame(results, columns=columns)
    df.loc['Mean'] = means
    print(df)
Leave a Comment