Untitled
unknown
python
a year ago
6.1 kB
8
Indexable
from datasets import load_dataset from collections import Counter import pandas as pd import numpy as np import math import time def filter_dataset(dataset, language_labels): return {split: dataset[split].filter(lambda example: example['labels'] in language_labels) for split in dataset.keys()} def generate_ngrams(text, n): clean_text = ''.join(char.lower() for char in text if char.isalnum() or char.isspace()) words = clean_text.split() return [word[i:i+n] for word in words if len(word) >= n for i in range(len(word) - n + 1)] def generate_ngrams_for_dataset(dataset): ngrams_per_language = {} if isinstance(dataset, dict): for split in dataset.keys(): for example in dataset[split]: text = example['text'] language_label = example['labels'] ngrams_per_language.setdefault(language_label, []).extend(generate_ngrams(text, 2)) ngrams_per_language.setdefault(language_label, []).extend(generate_ngrams(text, 3)) else: for example in dataset: text = example['text'] language_label = example['labels'] ngrams_per_language.setdefault(language_label, []).extend(generate_ngrams(text, 2)) ngrams_per_language.setdefault(language_label, []).extend(generate_ngrams(text, 3)) return ngrams_per_language def calculate_ngram_probabilities(ngrams_per_language): probabilities = {} for language, ngrams in ngrams_per_language.items(): ngram_counts = Counter(ngrams) total_ngrams = sum(ngram_counts.values()) vocabulary_size = len(set(ngrams)) smoothed_total = total_ngrams + vocabulary_size ngram_probabilities = {ngram: (count + 1) / smoothed_total for ngram, count in ngram_counts.items()} probabilities[language] = ngram_probabilities return probabilities def identify_language(text, ngram_probabilities, default_probability=0.0001): input_bigrams = generate_ngrams(text, 2) input_trigrams = generate_ngrams(text, 3) language_scores = {language: 0 for language in ngram_probabilities[2]} for language in language_scores.keys(): language_scores[language] = sum(math.log(ngram_probabilities[2][language].get(bigram, default_probability)) for bigram in input_bigrams) language_scores[language] += sum(math.log(ngram_probabilities[3][language].get(trigram, default_probability)) for trigram in input_trigrams) return max(language_scores, key=language_scores.get) def identify_language_with_probabilities(text, ngram_probabilities, default_probability=0.0001): input_bigrams = generate_ngrams(text, 2) input_trigrams = generate_ngrams(text, 3) language_scores = {language: 0 for language in ngram_probabilities[3]} for language in language_scores.keys(): language_scores[language] = sum(math.log(ngram_probabilities[2][language].get(bigram, default_probability)) for bigram in input_bigrams) language_scores[language] += sum(math.log(ngram_probabilities[3][language].get(trigram, default_probability)) for trigram in input_trigrams) max_score = max(language_scores.values()) exp_scores = {language: math.exp(score - max_score) for language, score in language_scores.items()} total_exp_scores = sum(exp_scores.values()) language_probabilities = {language: score / total_exp_scores for language, score in exp_scores.items()} most_likely_language = max(language_probabilities, key=language_probabilities.get) return most_likely_language, language_probabilities def test_language_model(dataset_split, ngram_probabilities): correct_predictions = 0 total_predictions = 0 start_time = time.time() for example in dataset_split: text = example['text'] actual_language = example['labels'] predicted_language = identify_language(text, ngram_probabilities, default_probability=1e-4) if predicted_language == actual_language: correct_predictions += 1 total_predictions += 1 elapsed_time = time.time() - start_time accuracy = correct_predictions / total_predictions return accuracy, elapsed_time def evaluation(filtered_dataset): bigram_probabilities = calculate_ngram_probabilities(generate_ngrams_for_dataset(filtered_dataset)) trigram_probabilities = calculate_ngram_probabilities(generate_ngrams_for_dataset(filtered_dataset)) ngram_probabilities = { 2: bigram_probabilities, 3: trigram_probabilities } accuracy_bigrams, time_bigrams = test_language_model(filtered_dataset['test'], ngram_probabilities) accuracy_trigrams, time_trigrams = test_language_model(filtered_dataset['test'], ngram_probabilities) print(f"Bigram Accuracy: {accuracy_bigrams * 100:.2f}%") print(f"Bigram Time: {time_bigrams:.2f} seconds") print(f"Trigram Accuracy: {accuracy_trigrams * 100:.2f}%") print(f"Trigram Time: {time_trigrams:.2f} seconds") if __name__ == "__main__": dataset = load_dataset("papluca/language-identification") selected_labels = ['nl', 'de', 'en', 'es', 'pt', 'ru'] filtered_dataset = filter_dataset(dataset, selected_labels) input_text = input("Voer een zin in om de taal te identificeren: ") trigram_probabilities = calculate_ngram_probabilities(generate_ngrams_for_dataset(filtered_dataset)) bigram_probabilities = calculate_ngram_probabilities(generate_ngrams_for_dataset(filtered_dataset)) ngram_probabilities = { 2: bigram_probabilities, 3: trigram_probabilities } most_likely_language, language_probabilities = identify_language_with_probabilities(input_text, ngram_probabilities) print(f"De meest waarschijnlijke taal is: {most_likely_language}") print("Kansen van alle talen:") for language, probability in language_probabilities.items(): print(f"{language}: {probability:.4f}") evaluation(filtered_dataset)
Editor is loading...
Leave a Comment