from datasets import load_dataset
from collections import Counter
import math
import time
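
# Character n-gram language identifier: a bigram and a trigram model are
# trained per language with add-one (Laplace) smoothing, an input sentence is
# scored by summed log probabilities, and the scores are softmax-normalised
# into per-language probabilities.
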
def filter_dataset(dataset, language_labels):
    # Keep only the examples whose label is one of the selected languages.
    return {split: dataset[split].filter(lambda example: example['labels'] in language_labels)
            for split in dataset.keys()}

def generate_ngrams(text, n):
    # Lowercase, strip punctuation, and emit character n-grams within each word.
    clean_text = ''.join(char.lower() for char in text if char.isalnum() or char.isspace())
    words = clean_text.split()
    return [word[i:i + n] for word in words if len(word) >= n for i in range(len(word) - n + 1)]

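# generate_ngrams drops punctuation and never lets an n-gram cross a word
# boundary, e.g.:
#   generate_ngrams("Hello, world!", 2)
#   -> ['he', 'el', 'll', 'lo', 'wo', 'or', 'rl', 'ld']
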
def generate_ngrams_for_dataset(dataset, n):
    # Collect the n-grams of a single order n, grouped by language label,
    # from either one split or a dict of splits.
    ngrams_per_language = {}
    splits = dataset.values() if isinstance(dataset, dict) else [dataset]
    for split in splits:
        for example in split:
            language_label = example['labels']
            ngrams_per_language.setdefault(language_label, []).extend(
                generate_ngrams(example['text'], n))
    return ngrams_per_language

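# The result maps each language label to a flat list of n-grams, shaped like
# {'en': ['th', 'he', ...], 'nl': ['de', 'en', ...]} (the actual contents
# depend on the training texts).
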
def calculate_ngram_probabilities(ngrams_per_language):
    # Add-one (Laplace) smoothed probability for every observed n-gram:
    # P(g) = (count(g) + 1) / (total + V), where V is the number of distinct
    # observed n-grams. Unseen n-grams are handled at scoring time via
    # default_probability.
    probabilities = {}
    for language, ngrams in ngrams_per_language.items():
        ngram_counts = Counter(ngrams)
        total_ngrams = sum(ngram_counts.values())
        vocabulary_size = len(ngram_counts)
        smoothed_total = total_ngrams + vocabulary_size
        ngram_probabilities = {ngram: (count + 1) / smoothed_total
                               for ngram, count in ngram_counts.items()}
        probabilities[language] = ngram_probabilities
    return probabilities

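# Worked smoothing example: with counts {'th': 3, 'he': 1} we get total = 4
# and V = 2, so P('th') = (3 + 1) / (4 + 2) = 0.667 and
# P('he') = (1 + 1) / 6 = 0.333.
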
def identify_language(text, ngram_probabilities, ns=(2, 3), default_probability=1e-4):
    # Score each candidate language by the summed log probability of the
    # input's n-grams; unseen n-grams fall back to default_probability.
    # ns selects which n-gram orders contribute to the score.
    language_scores = {}
    for language in ngram_probabilities[ns[0]]:
        score = 0.0
        for n in ns:
            score += sum(math.log(ngram_probabilities[n][language].get(ngram, default_probability))
                         for ngram in generate_ngrams(text, n))
        language_scores[language] = score
    return max(language_scores, key=language_scores.get)

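# identify_language sums log probabilities instead of multiplying raw
# probabilities, so long inputs do not underflow to 0.0; the language with
# the highest total log score wins.
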
def identify_language_with_probabilities(text, ngram_probabilities, default_probability=1e-4):
    # Same combined bigram-plus-trigram scoring as identify_language, but also
    # returns a softmax-normalised probability per language.
    input_bigrams = generate_ngrams(text, 2)
    input_trigrams = generate_ngrams(text, 3)
    language_scores = {}
    for language in ngram_probabilities[2]:
        score = sum(math.log(ngram_probabilities[2][language].get(bigram, default_probability))
                    for bigram in input_bigrams)
        score += sum(math.log(ngram_probabilities[3][language].get(trigram, default_probability))
                     for trigram in input_trigrams)
        language_scores[language] = score
    # Subtract the maximum before exponentiating so the softmax stays
    # numerically stable for large negative log scores.
    max_score = max(language_scores.values())
    exp_scores = {language: math.exp(score - max_score) for language, score in language_scores.items()}
    total_exp_scores = sum(exp_scores.values())
    language_probabilities = {language: score / total_exp_scores for language, score in exp_scores.items()}
    most_likely_language = max(language_probabilities, key=language_probabilities.get)
    return most_likely_language, language_probabilities

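# Worked softmax example: log scores of -100 and -103 become exp(0) = 1.0 and
# exp(-3) ~= 0.0498 after subtracting the maximum, which normalise to roughly
# 0.95 and 0.05.
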
def test_language_model(dataset_split, ngram_probabilities, ns=(2, 3)):
    # Measure accuracy and wall-clock time of the model on a labelled split.
    correct_predictions = 0
    total_predictions = 0
    start_time = time.time()
    for example in dataset_split:
        predicted_language = identify_language(example['text'], ngram_probabilities, ns=ns)
        if predicted_language == example['labels']:
            correct_predictions += 1
        total_predictions += 1
    elapsed_time = time.time() - start_time
    accuracy = correct_predictions / total_predictions
    return accuracy, elapsed_time

def evaluation(filtered_dataset):
    # Fit the models on the train split only, so the test split stays unseen,
    # then evaluate the bigram and trigram models separately on the test split.
    bigram_probabilities = calculate_ngram_probabilities(generate_ngrams_for_dataset(filtered_dataset['train'], 2))
    trigram_probabilities = calculate_ngram_probabilities(generate_ngrams_for_dataset(filtered_dataset['train'], 3))
    ngram_probabilities = {
        2: bigram_probabilities,
        3: trigram_probabilities
    }
    accuracy_bigrams, time_bigrams = test_language_model(filtered_dataset['test'], ngram_probabilities, ns=(2,))
    accuracy_trigrams, time_trigrams = test_language_model(filtered_dataset['test'], ngram_probabilities, ns=(3,))
    print(f"Bigram Accuracy: {accuracy_bigrams * 100:.2f}%")
    print(f"Bigram Time: {time_bigrams:.2f} seconds")
    print(f"Trigram Accuracy: {accuracy_trigrams * 100:.2f}%")
    print(f"Trigram Time: {time_trigrams:.2f} seconds")

if __name__ == "__main__":
    dataset = load_dataset("papluca/language-identification")
    selected_labels = ['nl', 'de', 'en', 'es', 'pt', 'ru']
    filtered_dataset = filter_dataset(dataset, selected_labels)
    input_text = input("Enter a sentence to identify its language: ")
    bigram_probabilities = calculate_ngram_probabilities(generate_ngrams_for_dataset(filtered_dataset['train'], 2))
    trigram_probabilities = calculate_ngram_probabilities(generate_ngrams_for_dataset(filtered_dataset['train'], 3))
    ngram_probabilities = {
        2: bigram_probabilities,
        3: trigram_probabilities
    }
    most_likely_language, language_probabilities = identify_language_with_probabilities(input_text, ngram_probabilities)
    print(f"The most likely language is: {most_likely_language}")
    print("Probabilities for all languages:")
    for language, probability in language_probabilities.items():
        print(f"{language}: {probability:.4f}")
    evaluation(filtered_dataset)