import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
# Set random seeds (optional, for reproducibility)
np.random.seed(42)
tf.random.set_seed(42)
# ------------------------------------------------------------------------------
# 1. Load the GoEmotions dataset and define the label list
# ------------------------------------------------------------------------------
dataset = tfds.load('goemotions', split='train')
LABELS = [
    "admiration", "amusement", "anger", "annoyance", "approval",
    "caring", "confusion", "curiosity", "desire", "disappointment",
    "disapproval", "disgust", "embarrassment", "excitement", "fear",
    "gratitude", "grief", "joy", "love", "nervousness",
    "optimism", "pride", "realization", "relief", "remorse",
    "sadness", "surprise", "neutral"
]
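# Sanity check (added note): the Dense output layer below assumes exactly 28 labels.
assert len(LABELS) == 28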
def extract_text_and_labels(example):
    """
    From an example (dict), take 'comment_text' (string)
    and the 28 boolean label fields -> cast them to float (0/1) in LABELS order.
    """
    text = example["comment_text"]
    label_vector = tf.stack([tf.cast(example[label], tf.float32) for label in LABELS])
    return text, label_vector
# Map the dataset to (text, label_vector) pairs
dataset = dataset.map(extract_text_and_labels)
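# Optional sanity check (illustrative sketch, not part of the original flow):
# peek at one mapped element to confirm the (text, label_vector) structure.
for sample_text, sample_labels in dataset.take(1):
    print("Sample text:", sample_text.numpy().decode('utf-8'))
    print("Sample label vector:", sample_labels.numpy())  # 28-dim multi-hot vector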
# ------------------------------------------------------------------------------
# 2. Convert from the TF Dataset to Python lists (text + label vector)
# ------------------------------------------------------------------------------
text_data = []
labels_data = []
for example_text, example_labels in dataset:
    text_str = example_text.numpy().decode('utf-8')
    labels_arr = example_labels.numpy()  # shape=(28,)
    text_data.append(text_str)
    labels_data.append(labels_arr)
labels_data = np.array(labels_data, dtype=np.float32)
print(f"Number of loaded samples: {len(text_data)}")  # should be ~43k
# ------------------------------------------------------------------------------
# 3. Tokenize the text and build sequences
# ------------------------------------------------------------------------------
VOCAB_SIZE = 10000
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(text_data)
sequences = tokenizer.texts_to_sequences(text_data)
MAX_LEN = 50
X = pad_sequences(sequences, maxlen=MAX_LEN, padding='post', truncating='post')
y = labels_data
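# Optional check (illustrative): see how one raw comment maps to a padded id sequence.
print("Raw text:   ", text_data[0])
print("Padded ids: ", X[0])  # integer token ids, padded/truncated to MAX_LEN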
# ------------------------------------------------------------------------------
# 4. Split into training and test sets (80/20)
# ------------------------------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print("Shapes:")
print("X_train:", X_train.shape)
print("y_train:", y_train.shape)
print("X_test:", X_test.shape)
print("y_test:", y_test.shape)
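# Optional check (illustrative): per-label positive counts in the training split,
# to gauge class imbalance across the 28 emotions.
print(dict(zip(LABELS, y_train.sum(axis=0).astype(int))))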
# ------------------------------------------------------------------------------
# 5. Build the model (Embedding + LSTM + Dense with sigmoid)
#    'input_length=MAX_LEN' is passed so Keras knows the input shape and reports parameter counts.
# ------------------------------------------------------------------------------
model = Sequential([
    Embedding(input_dim=VOCAB_SIZE, output_dim=64, input_length=MAX_LEN),
    LSTM(128),
    Dropout(0.3),
    Dense(28, activation='sigmoid')  # multi-label => sigmoid
])
model.compile(
    loss='binary_crossentropy',  # for multi-label targets
    optimizer='adam',
    metrics=['accuracy']         # with multi-label data, F1 is the more informative metric anyway
)
# The model now knows its input shape, so:
model.summary()
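# Note (optional, an assumption rather than part of the original script): plain accuracy
# is a weak signal for multi-label targets; Keras metrics such as Precision/Recall
# could be compiled in instead, e.g.:
# model.compile(
#     loss='binary_crossentropy',
#     optimizer='adam',
#     metrics=[tf.keras.metrics.Precision(), tf.keras.metrics.Recall()]
# )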
# ------------------------------------------------------------------------------
# 6. Training
# ------------------------------------------------------------------------------
EPOCHS = 5
BATCH_SIZE = 32
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE
)
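# Optional variant (a sketch, not part of the original run): early stopping on
# validation loss could be added via a Keras callback, e.g.:
# early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2,
#                                               restore_best_weights=True)
# history = model.fit(X_train, y_train, validation_data=(X_test, y_test),
#                     epochs=EPOCHS, batch_size=BATCH_SIZE, callbacks=[early_stop])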
# ------------------------------------------------------------------------------
# 7. Evaluation – compute F1 (micro and macro)
# ------------------------------------------------------------------------------
predictions = model.predict(X_test)
pred_labels = (predictions >= 0.5).astype(int)
f1_micro = f1_score(y_test, pred_labels, average='micro')
f1_macro = f1_score(y_test, pred_labels, average='macro')
print("\n=== Evaluation (F1) ===")
print(f"F1 (micro): {f1_micro:.4f}")
print(f"F1 (macro): {f1_macro:.4f}")
# A detailed per-class report can also be printed:
print("\n=== Classification Report (summary) ===")
print(classification_report(y_test, pred_labels, zero_division=0))
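# Optionally, attach readable emotion names to the report (LABELS order matches
# the label columns by construction above):
print(classification_report(y_test, pred_labels, target_names=LABELS, zero_division=0))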
# ------------------------------------------------------------------------------
# 8. Sample predictions
# ------------------------------------------------------------------------------
SAMPLES_TO_PREDICT = 5
for i in range(SAMPLES_TO_PREDICT):
    print(f"\nExample #{i+1}")
    print("Text:", tokenizer.sequences_to_texts([X_test[i]]))
    print("True labels:", y_test[i].astype(int))
    print("Predicted labels:", pred_labels[i])
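# Optional end-to-end sketch (the example sentence is illustrative, not from the dataset):
# classify a brand-new comment with the trained pipeline.
new_texts = ["Thank you so much, this made my day!"]
new_seqs = pad_sequences(tokenizer.texts_to_sequences(new_texts),
                         maxlen=MAX_LEN, padding='post', truncating='post')
new_pred = (model.predict(new_seqs) >= 0.5).astype(int)[0]
print("Predicted emotions:", [LABELS[i] for i, flag in enumerate(new_pred) if flag == 1])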