Untitled
unknown
python
a year ago
5.3 kB
7
Indexable
# Assemble the final LSTM_US_SA classifier with the tuned hyper-parameters:
# a single LSTM layer -> Flatten -> 3-way softmax head.
model = Sequential(name="LSTM_US_SA_Model_Final")
for layer in (
    LSTM(
        best_params['lstm_units'],
        input_shape=(1, sequences.shape[1]),
        dropout=best_params['dropout_rate'],
        recurrent_dropout=best_params['recurrent_dropout_rate'],
        name="LSTM_Layer",
    ),
    Flatten(name="Flatten_Layer"),
    Dense(3, activation='softmax', name="Output_Layer"),
):
    model.add(layer)

# Integer class labels -> sparse categorical cross-entropy; AdamW uses the
# tuned learning rate.
optimizer = AdamW(learning_rate=best_params['learning_rate'])
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)
print(model.summary())
# Attach prediction-collection columns to us_df: one list of per-fold
# validation predictions per row, plus the final voted label (filled later).
us_df['predicted_fold_sentiment'] = [list() for _ in range(len(us_df))]
us_df['predicted_sentiment'] = None

# Stratified 10-fold cross-validation; splits are materialised up front so
# the fold loop iterates a stable {fold_number: (train_idx, val_idx)} mapping.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
n_folds = skf.get_n_splits()
split_indices = dict(enumerate(skf.split(sequences, labels)))

# Accumulators for the overall (weighted) metrics, one entry per fold.
train_accuracy_scores, val_accuracy_scores = [], []
train_precision_scores, val_precision_scores = [], []
train_recall_scores, val_recall_scores = [], []
train_f1_scores, val_f1_scores = [], []
train_roc_auc_scores, val_roc_auc_scores = [], []
fold_histories = []
all_y_true, all_y_pred = [], []

# Accumulators for per-class validation metrics.
class_names = list(range(3))  # labels are 0, 1 and 2
precision_per_class = {cls: [] for cls in class_names}
recall_per_class = {cls: [] for cls in class_names}
f1_per_class = {cls: [] for cls in class_names}
# Train and evaluate one freshly initialised model per fold.
#
# FIX: the original loop reused the single compiled `model` across all ten
# folds, so weights learned on fold k carried into the training of fold k+1.
# That leaks validation data of earlier folds into later folds and inflates
# every cross-validated metric. Each fold now rebuilds and recompiles the
# model from scratch with the same tuned hyper-parameters.
def _build_fold_model():
    """Return a fresh, compiled LSTM_US_SA model built from best_params."""
    m = Sequential(name="LSTM_US_SA_Model_Final")
    m.add(LSTM(best_params['lstm_units'],
               input_shape=(1, sequences.shape[1]),
               dropout=best_params['dropout_rate'],
               recurrent_dropout=best_params['recurrent_dropout_rate'],
               name="LSTM_Layer"))
    m.add(Flatten(name="Flatten_Layer"))
    m.add(Dense(3, activation='softmax', name="Output_Layer"))
    m.compile(loss='sparse_categorical_crossentropy',
              optimizer=AdamW(learning_rate=best_params['learning_rate']),
              metrics=['accuracy'])
    return m

for i, (train_index, val_index) in tqdm(split_indices.items(), total=n_folds, desc="Folds"):
    X_train_fold, X_val_fold = sequences[train_index], sequences[val_index]
    y_train_fold, y_val_fold = labels[train_index], labels[val_index]
    # Add a timestep axis: (samples, features) -> (samples, 1, features).
    X_train_fold = np.expand_dims(X_train_fold, axis=1)
    X_val_fold = np.expand_dims(X_val_fold, axis=1)

    model = _build_fold_model()  # fresh weights every fold

    # Fresh callbacks per fold — they keep internal state between fits.
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, min_delta=0.0001)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, min_lr=0.0001)

    # Re-balance the loss against class imbalance in this fold's train split.
    class_weight = dict(enumerate(compute_class_weight(
        class_weight="balanced", classes=np.unique(y_train_fold), y=y_train_fold)))

    history = model.fit(X_train_fold, y_train_fold,
                        validation_data=(X_val_fold, y_val_fold),
                        epochs=100,
                        batch_size=best_params['batch_size'],
                        class_weight=class_weight,
                        callbacks=[early_stopping, reduce_lr])

    y_train_pred = np.argmax(model.predict(X_train_fold), axis=1)
    y_val_pred = np.argmax(model.predict(X_val_fold), axis=1)
    all_y_true.extend(y_val_fold)
    all_y_pred.extend(y_val_pred)

    # Overall (weighted) metrics for this fold.
    train_accuracy_scores.append(accuracy_score(y_train_fold, y_train_pred))
    val_accuracy_scores.append(accuracy_score(y_val_fold, y_val_pred))
    train_precision_scores.append(precision_score(y_train_fold, y_train_pred, average='weighted'))
    val_precision_scores.append(precision_score(y_val_fold, y_val_pred, average='weighted'))
    train_recall_scores.append(recall_score(y_train_fold, y_train_pred, average='weighted'))
    val_recall_scores.append(recall_score(y_val_fold, y_val_pred, average='weighted'))
    train_f1_scores.append(f1_score(y_train_fold, y_train_pred, average='weighted'))
    val_f1_scores.append(f1_score(y_val_fold, y_val_pred, average='weighted'))
    # NOTE(review): ROC-AUC here is computed from binarized hard argmax
    # labels, not predicted probabilities, which understates the true AUC;
    # kept as-is to preserve comparability with earlier runs.
    train_roc_auc_scores.append(roc_auc_score(
        label_binarize(y_train_fold, classes=[0, 1, 2]),
        label_binarize(y_train_pred, classes=[0, 1, 2]),
        average='weighted', multi_class='ovr'))
    val_roc_auc_scores.append(roc_auc_score(
        label_binarize(y_val_fold, classes=[0, 1, 2]),
        label_binarize(y_val_pred, classes=[0, 1, 2]),
        average='weighted', multi_class='ovr'))

    # Per-class validation metrics for this fold.
    for cls in class_names:
        precision_per_class[cls].append(precision_score(y_val_fold, y_val_pred, labels=[cls], average='macro', zero_division=0))
        recall_per_class[cls].append(recall_score(y_val_fold, y_val_pred, labels=[cls], average='macro', zero_division=0))
        f1_per_class[cls].append(f1_score(y_val_fold, y_val_pred, labels=[cls], average='macro', zero_division=0))

    fold_histories.append(history)

    # Record this fold's validation predictions against the original rows.
    # NOTE(review): val_index holds POSITIONAL indices from skf.split, while
    # .at indexes by LABEL — this is only correct if us_df has a default
    # 0..n-1 RangeIndex; verify upstream (reset_index(drop=True) if unsure).
    for idx, val_idx in enumerate(val_index):
        us_df.at[val_idx, 'predicted_fold_sentiment'].append(y_val_pred[idx])
# Majority voting for predicted sentiment
def majority_vote(sentiments):
    """Return the most common label in *sentiments*.

    Ties are broken by first occurrence (Counter.most_common preserves
    insertion order for equal counts). Returns None for an empty sequence
    instead of raising IndexError — the original crashed if a row had
    collected no fold predictions.
    """
    if not sentiments:
        return None
    return Counter(sentiments).most_common(1)[0][0]
# Collapse each row's per-fold predictions into a single voted label.
us_df['predicted_sentiment'] = us_df['predicted_fold_sentiment'].apply(majority_vote)

# Persist per-row predictions for downstream analysis.
# FIX: the original to_csv line had pasted editor residue
# ("Editor is loading...") fused onto it, which is not valid Python.
# NOTE(review): the filename assumes `path` ends with a separator — confirm.
output_df = us_df[['id_str', 'full_text', 'sentiment',
                   'predicted_sentiment', 'predicted_fold_sentiment']].copy()
output_df.to_csv(f'{path}{model.name}_prediction.csv', index=False)
Leave a Comment