Untitled

mail@pastecode.io avatar
unknown
python
2 months ago
6.1 kB
5
Indexable
Never
from datasets import load_dataset
from collections import Counter
import pandas as pd
import numpy as np
import math
import time

def filter_dataset(dataset, language_labels):
    """Restrict every split of *dataset* to examples whose 'labels' value is in *language_labels*.

    Returns a new dict mapping split name -> filtered split; relies on each
    split exposing a `.filter(predicate)` method (HF `datasets` API).
    """
    filtered = {}
    for split_name in dataset:
        filtered[split_name] = dataset[split_name].filter(
            lambda example: example['labels'] in language_labels
        )
    return filtered

def generate_ngrams(text, n):
    """Return all character n-grams of each word in *text*.

    Text is lowercased and stripped of non-alphanumeric, non-space characters
    before being split into words; words shorter than *n* contribute nothing.
    """
    kept_chars = [ch.lower() for ch in text if ch.isalnum() or ch.isspace()]
    ngrams = []
    for word in ''.join(kept_chars).split():
        # range() is empty when len(word) < n, so short words are skipped.
        for start in range(len(word) - n + 1):
            ngrams.append(word[start:start + n])
    return ngrams

def generate_ngrams_for_dataset(dataset):
    """Collect character 2- and 3-grams per language label.

    Accepts either a dict of splits (split name -> iterable of examples) or a
    single iterable of examples; each example is a mapping with 'text' and
    'labels' keys.  Returns {language_label: [ngram, ...]} where bigrams and
    trigrams are pooled into one list per language.

    The original duplicated the loop body for the dict and non-dict cases;
    here both are normalized to one sequence of splits.
    """
    splits = dataset.values() if isinstance(dataset, dict) else [dataset]

    ngrams_per_language = {}
    for split in splits:
        for example in split:
            bucket = ngrams_per_language.setdefault(example['labels'], [])
            bucket.extend(generate_ngrams(example['text'], 2))
            bucket.extend(generate_ngrams(example['text'], 3))

    return ngrams_per_language

def calculate_ngram_probabilities(ngrams_per_language):
    """Compute Laplace-smoothed n-gram probabilities per language.

    For each language, every observed n-gram gets probability
    (count + 1) / (total_ngrams + vocabulary_size).
    Returns {language: {ngram: probability}}.
    """
    probabilities = {}

    for language, ngram_list in ngrams_per_language.items():
        counts = Counter(ngram_list)
        # len(counts) is the number of distinct n-grams (vocabulary size).
        denominator = sum(counts.values()) + len(counts)
        probabilities[language] = {
            gram: (freq + 1) / denominator for gram, freq in counts.items()
        }

    return probabilities

def identify_language(text, ngram_probabilities, default_probability=0.0001):
    """Return the language whose model assigns *text* the highest log-likelihood.

    *ngram_probabilities* maps n (2 and 3) -> {language: {ngram: prob}};
    n-grams unseen in a model fall back to *default_probability*.
    """
    bigrams = generate_ngrams(text, 2)
    trigrams = generate_ngrams(text, 3)

    scores = {}
    for language in ngram_probabilities[2]:
        bigram_model = ngram_probabilities[2][language]
        trigram_model = ngram_probabilities[3][language]
        log_likelihood = 0.0
        for gram in bigrams:
            log_likelihood += math.log(bigram_model.get(gram, default_probability))
        for gram in trigrams:
            log_likelihood += math.log(trigram_model.get(gram, default_probability))
        scores[language] = log_likelihood

    return max(scores, key=scores.get)

def identify_language_with_probabilities(text, ngram_probabilities, default_probability=0.0001):
    """Score *text* against every language model and return (best_language, distribution).

    The per-language log-likelihoods are turned into a probability
    distribution with a numerically stable softmax (max-subtraction).
    """
    bigrams = generate_ngrams(text, 2)
    trigrams = generate_ngrams(text, 3)

    scores = {}
    for language in ngram_probabilities[3]:
        bigram_model = ngram_probabilities[2][language]
        trigram_model = ngram_probabilities[3][language]
        total = sum(math.log(bigram_model.get(g, default_probability)) for g in bigrams)
        total += sum(math.log(trigram_model.get(g, default_probability)) for g in trigrams)
        scores[language] = total

    # Softmax over the log scores; subtracting the peak avoids exp overflow.
    peak = max(scores.values())
    weights = {lang: math.exp(score - peak) for lang, score in scores.items()}
    normalizer = sum(weights.values())
    distribution = {lang: weight / normalizer for lang, weight in weights.items()}

    return max(distribution, key=distribution.get), distribution

def test_language_model(dataset_split, ngram_probabilities):
    """Evaluate identify_language on a labelled split.

    Iterates *dataset_split* (examples with 'text' and 'labels'), predicts
    each language, and returns (accuracy, elapsed_seconds).

    Fix: an empty split previously raised ZeroDivisionError; it now yields
    an accuracy of 0.0.
    """
    correct_predictions = 0
    total_predictions = 0

    start_time = time.time()
    for example in dataset_split:
        predicted = identify_language(
            example['text'], ngram_probabilities, default_probability=1e-4
        )
        if predicted == example['labels']:
            correct_predictions += 1
        total_predictions += 1
    elapsed_time = time.time() - start_time

    # Guard against an empty split instead of dividing by zero.
    accuracy = correct_predictions / total_predictions if total_predictions else 0.0
    return accuracy, elapsed_time

def evaluation(filtered_dataset):
    """Build the n-gram model from *filtered_dataset* and report test-split accuracy.

    Fix: the original built the model twice under the names
    "bigram_probabilities" and "trigram_probabilities", but both calls were
    identical (generate_ngrams_for_dataset pools 2- and 3-grams into one
    list per language), and then ran the exact same evaluation twice.  The
    model is now built once and the single result is reported under both
    labels, preserving the original output format.
    """
    model = calculate_ngram_probabilities(generate_ngrams_for_dataset(filtered_dataset))
    # Both keys intentionally point at the same combined 2+3-gram model.
    ngram_probabilities = {
        2: model,
        3: model
    }

    accuracy, elapsed = test_language_model(filtered_dataset['test'], ngram_probabilities)

    print(f"Bigram Accuracy: {accuracy * 100:.2f}%")
    print(f"Bigram Time: {elapsed:.2f} seconds")
    print(f"Trigram Accuracy: {accuracy * 100:.2f}%")
    print(f"Trigram Time: {elapsed:.2f} seconds")

if __name__ == "__main__":
    # Load the language-identification dataset and keep six languages.
    dataset = load_dataset("papluca/language-identification")
    selected_labels = ['nl', 'de', 'en', 'es', 'pt', 'ru']
    filtered_dataset = filter_dataset(dataset, selected_labels)

    input_text = input("Voer een zin in om de taal te identificeren: ")

    # Fix: the original built the identical combined 2+3-gram model twice
    # (once as "trigram_probabilities", once as "bigram_probabilities");
    # one pass over the dataset is enough.
    model = calculate_ngram_probabilities(generate_ngrams_for_dataset(filtered_dataset))
    ngram_probabilities = {
        2: model,
        3: model
    }

    most_likely_language, language_probabilities = identify_language_with_probabilities(
        input_text, ngram_probabilities
    )
    print(f"De meest waarschijnlijke taal is: {most_likely_language}")
    print("Kansen van alle talen:")
    for language, probability in language_probabilities.items():
        print(f"{language}: {probability:.4f}")

    evaluation(filtered_dataset)
Leave a Comment