# Untitled
#
# mail@pastecode.io avatar
# unknown
# plain_text
# 2 months ago
# 3.0 kB
# 1
# Indexable
# Never
from datasets import load_dataset
from collections import Counter
import pandas as pd 
import numpy as np 
import math 

# Load the "papluca/language-identification" dataset with the `datasets`
# library. The result is indexed by split name further down in this script.
# NOTE(review): this presumably downloads from the Hugging Face Hub on first
# use -- confirm network/cache expectations.
dataset = load_dataset("papluca/language-identification") 

def FilterDataset(dataset, language_labels):
    """Return a dict of splits containing only examples with wanted labels.

    Args:
        dataset: Mapping of split name to a split object that offers a
            ``filter(predicate)`` method.
        language_labels: Container of label values to keep; an example is
            retained when ``example['labels']`` is a member of it.

    Returns:
        A plain ``dict`` keyed by the same split names, in the same order,
        whose values are whatever each split's ``filter`` call returned.
    """
    def _has_selected_label(example):
        return example['labels'] in language_labels

    return {
        split_name: dataset[split_name].filter(_has_selected_label)
        for split_name in dataset.keys()
    }

# Label values to keep; every split of the loaded dataset is reduced to
# examples whose 'labels' field is one of these.
selected_labels = ['nl', 'de', 'en', 'es', 'pt', 'ru']
filtered_dataset = FilterDataset(dataset, selected_labels) 

def generate_trigrams(text):
    """Return the character trigrams of every word in *text*, in order.

    The text is normalised first: each character is lower-cased, and any
    character that is neither alphanumeric nor whitespace is dropped. The
    dropped characters are removed rather than replaced, so words separated
    only by punctuation (e.g. ``"a,bc"``) are joined into one word.

    Trigrams never span a word boundary, and words shorter than three
    characters contribute nothing.

    Args:
        text: The input string.

    Returns:
        A list of three-character strings; duplicates are kept.
    """
    kept_chars = [ch.lower() for ch in text if ch.isalnum() or ch.isspace()]
    normalized = ''.join(kept_chars)

    result = []
    for token in normalized.split():
        # range() is empty for tokens shorter than three characters, so no
        # explicit length check is required.
        result.extend(token[start:start + 3] for start in range(len(token) - 2))
    return result

def generate_trigrams_for_dataset(dataset):
    """Collect the trigrams of every example, grouped by language label.

    Every split in *dataset* is visited, so the result pools the trigrams
    of all splits together.

    Args:
        dataset: Mapping of split name to an iterable of examples, where
            each example provides a 'text' and a 'labels' entry.

    Returns:
        A dict mapping each label to one flat list holding the trigrams of
        all examples carrying that label, in encounter order.
    """
    grouped = {}

    for split_name in dataset.keys():
        for row in dataset[split_name]:
            row_trigrams = generate_trigrams(row['text'])
            # setdefault creates the label's list the first time it is seen.
            grouped.setdefault(row['labels'], []).extend(row_trigrams)

    return grouped

# Gather the trigrams of all filtered examples, grouped by language label.
trigrams_per_language = generate_trigrams_for_dataset(filtered_dataset)


def calculate_trigram_probabilities(trigrams_per_language):
    """Compute add-one smoothed trigram probabilities for each language.

    For a language with ``N`` trigram occurrences and ``V`` distinct
    trigrams, each observed trigram with count ``c`` receives the
    probability ``(c + 1) / (N + V)``.

    Only observed trigrams appear in the result; callers have to choose
    their own fallback value for trigrams that were never seen.

    Args:
        trigrams_per_language: Mapping of language label to an iterable of
            trigram strings (duplicates expected). One-shot iterables such
            as generators are supported.

    Returns:
        A dict mapping each language label to a dict of
        ``trigram -> probability``. A language with no trigrams maps to an
        empty dict.
    """
    probabilities = {}

    for language, trigrams in trigrams_per_language.items():
        trigram_counts = Counter(trigrams)
        total_trigrams = sum(trigram_counts.values())
        # The Counter's keys are exactly the distinct trigrams. Reading the
        # size from it avoids a second pass over `trigrams`, which would
        # build a redundant set and would see nothing at all if `trigrams`
        # were an iterator already exhausted by Counter().
        vocabulary_size = len(trigram_counts)

        smoothed_total = total_trigrams + vocabulary_size

        probabilities[language] = {
            trigram: (count + 1) / smoothed_total
            for trigram, count in trigram_counts.items()
        }

    return probabilities


# Convert the per-language trigram lists into per-language probability tables.
trigram_probabilities = calculate_trigram_probabilities(trigrams_per_language) 



def identify_language(text, trigram_probabilities, default_probability=1e-6):
    """Return the language whose trigram table best explains *text*.

    Each language is scored with the sum of the natural logarithms of the
    probabilities of the trigrams found in *text*; the language with the
    highest score is returned.

    Args:
        text: The string to classify.
        trigram_probabilities: Mapping of language label to a dict of
            ``trigram -> probability``.
        default_probability: Probability used for a trigram that is absent
            from a language's table.

    Returns:
        The label with the highest score. When *text* yields no trigrams
        every score is 0, and the first label of *trigram_probabilities*
        is returned.
    """
    observed = generate_trigrams(text)

    log_likelihoods = {}
    for candidate, table in trigram_probabilities.items():
        # Accumulate with a plain loop so the floating-point additions are
        # performed strictly left to right.
        running_total = 0
        for gram in observed:
            running_total += math.log(table.get(gram, default_probability))
        log_likelihoods[candidate] = running_total

    return max(log_likelihoods, key=log_likelihoods.get)

# Demo: classify a Spanish sample sentence and print the predicted label.
# The printed message is Dutch for "The most likely language is: ...".
# NOTE(review): the sample starts with "sta", which looks like a truncated
# "Esta"/"Ésta" -- confirm whether the first character was lost in a paste.
text = "sta es una muestra de una oración en español para probar el algoritmo."
most_likely_language = identify_language(text, trigram_probabilities)
print(f"De meest waarschijnlijke taal is: {most_likely_language}")
# Leave a Comment