import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Set random seeds (optional, for reproducibility)
np.random.seed(42)
tf.random.set_seed(42)

# ------------------------------------------------------------------------------
# 1. Load the GoEmotions dataset and define the list of labels
# ------------------------------------------------------------------------------
dataset = tfds.load('goemotions', split='train')

LABELS = [
    "admiration", "amusement", "anger", "annoyance", "approval", "caring",
    "confusion", "curiosity", "desire", "disappointment", "disapproval",
    "disgust", "embarrassment", "excitement", "fear", "gratitude", "grief",
    "joy", "love", "nervousness", "optimism", "pride", "realization",
    "relief", "remorse", "sadness", "surprise", "neutral"
]

def extract_text_and_labels(example):
    """
    From an example (dict), takes 'comment_text' (string) and the 28 boolean
    label fields, converting them to float (0/1) in the order given by LABELS.
    """
    text = example["comment_text"]
    label_vector = tf.stack([tf.cast(example[label], tf.float32) for label in LABELS])
    return text, label_vector

# Map the dataset to (text, label_vector) pairs
dataset = dataset.map(extract_text_and_labels)

# ------------------------------------------------------------------------------
# 2. Convert the TF Dataset to Python lists (text + label vector)
# ------------------------------------------------------------------------------
text_data = []
labels_data = []

for example_text, example_labels in dataset:
    text_str = example_text.numpy().decode('utf-8')
    labels_arr = example_labels.numpy()  # shape=(28,)
    text_data.append(text_str)
    labels_data.append(labels_arr)

labels_data = np.array(labels_data, dtype=np.float32)
print(f"Number of loaded samples: {len(text_data)}")  # should come out to roughly 43k

# ------------------------------------------------------------------------------
# 3. Tokenize the text and build padded sequences
# ------------------------------------------------------------------------------
VOCAB_SIZE = 10000
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(text_data)

sequences = tokenizer.texts_to_sequences(text_data)

MAX_LEN = 50
X = pad_sequences(sequences, maxlen=MAX_LEN, padding='post', truncating='post')
y = labels_data

# ------------------------------------------------------------------------------
# 4. Split into training and test sets (80/20)
# ------------------------------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Shapes:")
print("X_train:", X_train.shape)
print("y_train:", y_train.shape)
print("X_test:", X_test.shape)
print("y_test:", y_test.shape)

# ------------------------------------------------------------------------------
# 5. Build the model (Embedding + LSTM + Dense sigmoid)
#    'input_length=MAX_LEN' is passed so Keras knows the input shape and can
#    report parameter counts in the summary.
# ------------------------------------------------------------------------------
model = Sequential([
    Embedding(input_dim=VOCAB_SIZE, output_dim=64, input_length=MAX_LEN),
    LSTM(128),
    Dropout(0.3),
    Dense(28, activation='sigmoid')  # multi-label => sigmoid
])

model.compile(
    loss='binary_crossentropy',  # for multi-label classification
    optimizer='adam',
    metrics=['accuracy']  # for multi-label, F1 is the more meaningful metric anyway
)

# The model now knows the input shape, so:
model.summary()

# ------------------------------------------------------------------------------
# 6. Training
# ------------------------------------------------------------------------------
EPOCHS = 5
BATCH_SIZE = 32

history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE
)

# ------------------------------------------------------------------------------
# 7. Evaluation -- compute F1 (micro and macro)
# ------------------------------------------------------------------------------
predictions = model.predict(X_test)
pred_labels = (predictions >= 0.5).astype(int)

f1_micro = f1_score(y_test, pred_labels, average='micro')
f1_macro = f1_score(y_test, pred_labels, average='macro')

print("\n=== Evaluation (F1) ===")
print(f"F1 (micro): {f1_micro:.4f}")
print(f"F1 (macro): {f1_macro:.4f}")

# A detailed per-label report can also be printed:
print("\n=== Classification Report (summary) ===")
print(classification_report(y_test, pred_labels, zero_division=0))

# ------------------------------------------------------------------------------
# 8. Sample predictions
# ------------------------------------------------------------------------------
SAMPLES_TO_PREDICT = 5
for i in range(SAMPLES_TO_PREDICT):
    print(f"\nSample no. {i+1}")
    print("Text:", tokenizer.sequences_to_texts([X_test[i]]))
    print("True labels:", y_test[i].astype(int))
    print("Predicted labels:", pred_labels[i])
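# ------------------------------------------------------------------------------
# 9. Inference on new, raw text -- a minimal sketch added for illustration,
#    not part of the original script. It assumes the `tokenizer` and `model`
#    objects trained above are still in memory; the input sentence is made up.
# ------------------------------------------------------------------------------
new_texts = ["Thank you so much, this made my day!"]  # hypothetical example input
new_seq = tokenizer.texts_to_sequences(new_texts)
new_pad = pad_sequences(new_seq, maxlen=MAX_LEN, padding='post', truncating='post')

probs = model.predict(new_pad)[0]  # shape=(28,), one probability per emotion label
predicted = [LABELS[i] for i, p in enumerate(probs) if p >= 0.5]
print("Predicted emotions:", predicted if predicted else ["(none above 0.5)"])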