import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Set random seeds (optional, for reproducibility)
np.random.seed(42)
tf.random.set_seed(42)

# ------------------------------------------------------------------------------
# 1. Load the GoEmotions dataset and define the label list
# ------------------------------------------------------------------------------
dataset = tfds.load('goemotions', split='train')

LABELS = [
    "admiration", "amusement", "anger", "annoyance", "approval",
    "caring", "confusion", "curiosity", "desire", "disappointment",
    "disapproval", "disgust", "embarrassment", "excitement", "fear",
    "gratitude", "grief", "joy", "love", "nervousness",
    "optimism", "pride", "realization", "relief", "remorse",
    "sadness", "surprise", "neutral"
]

def extract_text_and_labels(example):
    """
    Z example (dict) pobiera 'comment_text' (string)
    oraz 28 booli -> konwertuje je na float (0/1) w kolejności z LABELS.
    """
    text = example["comment_text"]
    label_vector = tf.stack([tf.cast(example[label], tf.float32) for label in LABELS])
    return text, label_vector

# Map the dataset to (text, label_vector) pairs
dataset = dataset.map(extract_text_and_labels)
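
# Optional sanity check (illustrative, not part of the original flow): inspect one
# mapped example to confirm the text is a scalar string tensor and the label vector
# has shape (28,).
for sample_text, sample_labels in dataset.take(1):
    print("Sample text:", sample_text.numpy().decode('utf-8'))
    print("Label vector shape:", sample_labels.shape)  # expected: (28,)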

# ------------------------------------------------------------------------------
# 2. Convert the TF Dataset to Python lists (text + label vectors)
# ------------------------------------------------------------------------------
text_data = []
labels_data = []

for example_text, example_labels in dataset:
    text_str = example_text.numpy().decode('utf-8')
    labels_arr = example_labels.numpy()  # shape=(28,)
    text_data.append(text_str)
    labels_data.append(labels_arr)

labels_data = np.array(labels_data, dtype=np.float32)
print(f"Liczba wczytanych próbek: {len(text_data)}")  # ~43k powinno wyjść

# ------------------------------------------------------------------------------
# 3. Tokenize the text and build padded sequences
# ------------------------------------------------------------------------------
VOCAB_SIZE = 10000
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(text_data)

sequences = tokenizer.texts_to_sequences(text_data)

MAX_LEN = 50
X = pad_sequences(sequences, maxlen=MAX_LEN, padding='post', truncating='post')
y = labels_data
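
# Optional sanity check (illustrative): show how one comment looks after tokenization
# and padding, and what it decodes back to (out-of-vocabulary words appear as "<OOV>",
# padding zeros are skipped when decoding).
print("Original text:   ", text_data[0])
print("Padded sequence: ", X[0])
print("Decoded back:    ", tokenizer.sequences_to_texts([X[0]])[0])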

# ------------------------------------------------------------------------------
# 4. Train/test split (80/20)
# ------------------------------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Shapes:")
print("X_train:", X_train.shape)
print("y_train:", y_train.shape)
print("X_test:", X_test.shape)
print("y_test:", y_test.shape)

# ------------------------------------------------------------------------------
# 5. Build the model (Embedding + LSTM + Dense with sigmoid output)
#    'input_length=MAX_LEN' is set so Keras knows the input shape and can report
#    parameter counts in model.summary().
# ------------------------------------------------------------------------------
model = Sequential([
    Embedding(input_dim=VOCAB_SIZE, output_dim=64, input_length=MAX_LEN),
    LSTM(128),
    Dropout(0.3),
    Dense(28, activation='sigmoid')  # Multi-label => sigmoid
])

model.compile(
    loss='binary_crossentropy',  # binary cross-entropy suits multi-label targets
    optimizer='adam',
    metrics=['accuracy']         # accuracy is only indicative; F1 matters more for multi-label
)

# The model now knows its input shape, so the summary can list layer shapes and parameters:
model.summary()
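
# Optional sketch (not part of the original training setup): since accuracy is a weak
# signal for multi-label data, a small Keras callback can report validation micro-F1
# after each epoch using sklearn's f1_score and a fixed 0.5 threshold.
# Usage would be: model.fit(..., callbacks=[F1Callback(X_test, y_test)])
class F1Callback(tf.keras.callbacks.Callback):
    def __init__(self, X_val, y_val, threshold=0.5):
        super().__init__()
        self.X_val = X_val
        self.y_val = y_val
        self.threshold = threshold

    def on_epoch_end(self, epoch, logs=None):
        # Predict probabilities, binarize at the threshold, and report micro-F1.
        probs = self.model.predict(self.X_val, verbose=0)
        preds = (probs >= self.threshold).astype(int)
        print(f" - val_f1_micro: {f1_score(self.y_val, preds, average='micro'):.4f}")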

# ------------------------------------------------------------------------------
# 6. Training
# ------------------------------------------------------------------------------
EPOCHS = 5
BATCH_SIZE = 32

history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE
)

# ------------------------------------------------------------------------------
# 7. Evaluation: compute F1 (micro and macro)
# ------------------------------------------------------------------------------
predictions = model.predict(X_test)
pred_labels = (predictions >= 0.5).astype(int)

f1_micro = f1_score(y_test, pred_labels, average='micro')
f1_macro = f1_score(y_test, pred_labels, average='macro')

print("\n=== Ewaluacja (F1) ===")
print(f"F1 (micro): {f1_micro:.4f}")
print(f"F1 (macro): {f1_macro:.4f}")

# A detailed per-label report can also be printed:
print("\n=== Classification Report ===")
print(classification_report(y_test, pred_labels, target_names=LABELS, zero_division=0))
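
# Optional sketch (illustrative): the 0.5 cutoff is arbitrary; scanning a few global
# thresholds shows how sensitive micro-F1 is to this choice. (A proper setup would
# tune the threshold on a separate validation split, not on the test set.)
for threshold in [0.2, 0.3, 0.4, 0.5, 0.6]:
    thresholded = (predictions >= threshold).astype(int)
    print(f"threshold={threshold:.1f}  micro-F1={f1_score(y_test, thresholded, average='micro'):.4f}")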

# ------------------------------------------------------------------------------
# 8. Sample predictions
# ------------------------------------------------------------------------------
SAMPLES_TO_PREDICT = 5
for i in range(SAMPLES_TO_PREDICT):
    print(f"\nPrzykład nr {i+1}")
    print("Tekst:", tokenizer.sequences_to_texts([X_test[i]]))
    print("Prawdziwe etykiety:", y_test[i].astype(int))
    print("Przewidywane etykiety:", pred_labels[i])