import os
import re
import multiprocessing as mp

import spacy
import torch

# Quick check that CUDA is visible to PyTorch (torch is not used elsewhere in this script)
print(torch.cuda.is_available())

# Load the spaCy model for Italian
nlp = spacy.load('it_core_news_sm')
nlp.max_length = 250000000


# Function to read De Mauro's basic vocabulary (one word per line)
def load_de_mauro_vocabulary(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        vocabulary = set(f.read().splitlines())
    return vocabulary


# Function to preprocess the text
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\W+', ' ', text)  # Remove punctuation
    return text


# Function to calculate the percentage of words from the basic vocabulary
def calculate_de_mauro_percentage(corpus, vocabulary):
    doc = nlp(corpus)
    tokens = [token.text for token in doc if not token.is_punct and not token.is_space]
    total_words = len(tokens)
    if total_words == 0:
        return 0
    base_words = [token for token in tokens if token in vocabulary]
    base_word_count = len(base_words)
    percentage = (base_word_count / total_words) * 100
    return percentage


# Function to read a single text file
def read_single_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()
    return text


# Function to read all text files from a folder and its subfolders using multiprocessing
def read_texts_from_folders(root_folder):
    file_paths = []
    for root, dirs, files in os.walk(root_folder):
        for file in files:
            file_path = os.path.join(root, file)
            file_paths.append(file_path)
    # Use multiprocessing to read files concurrently
    with mp.Pool(mp.cpu_count()) as pool:
        texts = pool.map(read_single_file, file_paths)
    return texts


if __name__ == '__main__':
    # Path to the root folder containing subfolders with texts
    root_folder_path = 'C:/Users/kemal/PycharmProjects/la/docs'
    # Path to De Mauro's basic vocabulary file
    vocabulary_file = 'nvdb.words.txt'

    # Load De Mauro's basic vocabulary
    de_mauro_vocabulary = load_de_mauro_vocabulary(vocabulary_file)

    # Read and preprocess all texts from the folders
    texts = read_texts_from_folders(root_folder_path)
    preprocessed_texts = [preprocess_text(text) for text in texts]

    # Calculate the percentage of words from the basic vocabulary for each text
    percentages = []
    print("Total Preprocessed Texts:", len(preprocessed_texts))
    for text in preprocessed_texts:
        percentage = calculate_de_mauro_percentage(text, de_mauro_vocabulary)
        percentages.append(percentage)
        print("Count:", len(percentages))  # Progress: number of texts processed so far

    # Calculate the overall percentage (unweighted mean across texts)
    if len(percentages) > 0:
        overall_percentage = sum(percentages) / len(percentages)
        print(f"Overall percentage of De Mauro's basic vocabulary: {overall_percentage:.2f}%")
    else:
        overall_percentage = 0
        print('No texts found in the specified folder.')
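For reference, a minimal standalone sketch of a single scoring call, separate from the script above. It assumes the it_core_news_sm model is installed and substitutes a small hypothetical word set for nvdb.words.txt, which load_de_mauro_vocabulary() expects to contain one word per line.

# Usage sketch: sample_vocabulary is hypothetical; a real run would load nvdb.words.txt instead.
sample_vocabulary = {'il', 'gatto', 'dorme', 'sul', 'divano'}
sample_text = preprocess_text('Il gatto dorme sul divano tutto il pomeriggio.')
# Prints the share of tokens found in sample_vocabulary (roughly 75 for this sentence)
print(calculate_de_mauro_percentage(sample_text, sample_vocabulary))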