# --- paste-site page residue (not part of the program) ---
# Untitled
# unknown
# plain_text
# 2 years ago
# 3.0 kB
# 16
# Indexable
from datasets import load_dataset
from collections import Counter
import pandas as pd
import numpy as np
import math
# Load the "papluca/language-identification" dataset. This statement is at
# module top level, so it runs as soon as the file is executed or imported.
dataset = load_dataset("papluca/language-identification")
def FilterDataset(dataset, language_labels):
    """Keep only the examples whose label is one of ``language_labels``.

    Args:
        dataset: Mapping of split name to a split object that exposes
            ``filter(predicate)``. Each example handed to the predicate
            must be indexable by ``'labels'``.
        language_labels: Container of label values to keep; membership is
            tested with ``in``.

    Returns:
        A plain ``dict`` mapping every split name in ``dataset`` to the
        result of filtering that split.
    """
    def has_selected_label(example):
        return example['labels'] in language_labels

    return {
        split: examples.filter(has_selected_label)
        for split, examples in dataset.items()
    }
# Label values to keep: FilterDataset drops every example whose 'labels'
# field is not in this list.
selected_labels = ['nl', 'de', 'en', 'es', 'pt', 'ru']
# Dict of split name -> filtered split, restricted to selected_labels.
filtered_dataset = FilterDataset(dataset, selected_labels)
def generate_trigrams(text):
    """Return the character trigrams of every word in ``text``.

    Characters that are neither alphanumeric nor whitespace are dropped and
    the remaining characters are lower-cased. The cleaned text is split on
    whitespace and each word contributes its overlapping 3-character
    windows, left to right. Trigrams never span a word boundary.

    Args:
        text: The string to analyse.

    Returns:
        A list of trigram strings in order of appearance (duplicates kept).
        Words shorter than three characters contribute nothing, so the list
        may be empty.
    """
    clean_text = ''.join(
        char.lower() for char in text if char.isalnum() or char.isspace()
    )
    # range(len(word) - 2) is empty for words shorter than 3 characters, so
    # no explicit length check is needed.
    return [
        word[start:start + 3]
        for word in clean_text.split()
        for start in range(len(word) - 2)
    ]
def generate_trigrams_for_dataset(dataset):
    """Collect the trigrams of every example, grouped by language label.

    Args:
        dataset: Mapping of split name to an iterable of examples. Each
            example must be indexable by ``'text'`` and ``'labels'``.

    Returns:
        A dict mapping each label value to one flat list holding the
        trigrams (from ``generate_trigrams``) of all examples carrying that
        label, pooled across every split of ``dataset``.
    """
    trigrams_per_language = {}
    for examples in dataset.values():
        for example in examples:
            trigrams = generate_trigrams(example['text'])
            # Every example must extend its language's list, not only the
            # first one seen for that label.
            trigrams_per_language.setdefault(example['labels'], []).extend(trigrams)
    return trigrams_per_language
# Pool the word-level character trigrams of every example, across all splits
# of the filtered dataset, into one list per language label.
trigrams_per_language = generate_trigrams_for_dataset(filtered_dataset)
def calculate_trigram_probabilities(trigrams_per_language):
    """Turn per-language trigram lists into add-one-smoothed probabilities.

    For each language, a trigram observed ``count`` times receives
    ``(count + 1) / (total + vocabulary)``, where ``total`` is the number of
    trigram occurrences and ``vocabulary`` the number of distinct trigrams
    for that language.

    Args:
        trigrams_per_language: Mapping of language label to an iterable of
            trigram strings (duplicates expected).

    Returns:
        A dict mapping each language to a dict of ``trigram -> probability``.
        Only observed trigrams get an entry; a language with no trigrams
        maps to an empty dict.
    """
    probabilities = {}
    for language, trigrams in trigrams_per_language.items():
        trigram_counts = Counter(trigrams)
        # Derive both terms from the Counter so the input is traversed only
        # once (this also keeps one-shot iterables correct).
        smoothed_total = sum(trigram_counts.values()) + len(trigram_counts)
        probabilities[language] = {
            trigram: (count + 1) / smoothed_total
            for trigram, count in trigram_counts.items()
        }
    return probabilities
# Per-language table of add-one-smoothed probabilities for each observed
# trigram; this is the model that identify_language scores against.
trigram_probabilities = calculate_trigram_probabilities(trigrams_per_language)
def identify_language(text, trigram_probabilities, default_probability=1e-6):
    """Return the language whose trigram model best explains ``text``.

    Each language is scored with the sum of ``log(p)`` over the trigrams of
    ``text``, where ``p`` is the language's probability for that trigram or
    ``default_probability`` when the trigram is absent from its table. The
    language with the highest score wins.

    Args:
        text: The string to classify.
        trigram_probabilities: Mapping of language label to a dict of
            ``trigram -> probability``.
        default_probability: Probability used for trigrams a language has
            no entry for. It is passed to ``math.log``, so it must be > 0.

    Returns:
        The label of the best-scoring language. If ``text`` yields no
        trigrams, every score is 0 and the first language in
        ``trigram_probabilities`` is returned.

    Raises:
        ValueError: If ``trigram_probabilities`` is empty, or if
            ``math.log`` is given a non-positive probability.
    """
    if not trigram_probabilities:
        # max() would raise an opaque "empty sequence" ValueError below.
        raise ValueError("trigram_probabilities must contain at least one language")

    input_trigrams = generate_trigrams(text)
    language_scores = {}
    for language, probabilities in trigram_probabilities.items():
        score = 0
        for trigram in input_trigrams:
            score += math.log(probabilities.get(trigram, default_probability))
        language_scores[language] = score
    return max(language_scores, key=language_scores.get)
# Demo run: classify one Spanish sample sentence and print the result.
# NOTE(review): the sample starts with "sta" — possibly "Esta"/"Ésta" with
# its first character lost; confirm against the intended input.
text = "sta es una muestra de una oración en español para probar el algoritmo."
most_likely_language = identify_language(text, trigram_probabilities)
# The printed message is Dutch for "The most likely language is: ...".
print(f"De meest waarschijnlijke taal is: {most_likely_language}")
# --- paste-site page residue (not part of the program) ---
# Editor is loading...
# Leave a Comment