Untitled
unknown
python
a year ago
1.9 kB
3
Indexable
Never
"""Train a SimpleRNN sarcasm classifier on the News-Headlines dataset.

Downloads the dataset as JSON-lines, makes a reproducible shuffled 80/20
train/test split, tokenizes and pads the headlines, then trains a small
Embedding -> SimpleRNN -> Dense(sigmoid) binary classifier and reports
held-out accuracy.
"""
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd

# NOTE(review): network fetch at import time — fine for a script, but there is
# no error handling if the download fails.
url = 'https://raw.githubusercontent.com/rishabhmisra/News-Headlines-Dataset-For-Sarcasm-Detection/master/Sarcasm_Headlines_Dataset.json'
df = pd.read_json(url, lines=True)

headlines = df['headline'].values
labels = df['is_sarcastic'].values

# Reproducible shuffled 80/20 split. Seeding the generator fixes the previous
# run-to-run nondeterminism, and NumPy fancy indexing replaces the manual
# per-element Python loop.
rng = np.random.default_rng(seed=42)
indices = rng.permutation(len(headlines))

train_ratio = 0.8
split_index = int(train_ratio * len(headlines))
train_idx, test_idx = indices[:split_index], indices[split_index:]

train_headlines = headlines[train_idx]
test_headlines = headlines[test_idx]
train_labels = labels[train_idx]
test_labels = labels[test_idx]

max_words = 10000  # vocabulary cap for the tokenizer
maxlen = 100       # pad/truncate every headline to this many tokens

# Fit the vocabulary on the training split only, so no information from the
# test set leaks into preprocessing. Words unseen in training are silently
# dropped from test sequences (no oov_token configured — TODO: consider one).
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_headlines)

train_sequences = tokenizer.texts_to_sequences(train_headlines)
test_sequences = tokenizer.texts_to_sequences(test_headlines)
train_padded = pad_sequences(train_sequences, maxlen=maxlen)
test_padded = pad_sequences(test_sequences, maxlen=maxlen)

# Build model: 32-dim embeddings -> single SimpleRNN layer -> sigmoid logit.
model = Sequential([
    Embedding(max_words, 32, input_length=maxlen),
    SimpleRNN(32),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train the model; the last 20% of the training split is used for validation.
model.fit(train_padded, train_labels,
          epochs=5, batch_size=32, validation_split=0.2)

# Evaluate on the held-out test split — previously built but never used.
test_loss, test_acc = model.evaluate(test_padded, test_labels, verbose=0)
print(f'Test loss: {test_loss:.4f} - Test accuracy: {test_acc:.4f}')