Untitled

mail@pastecode.io avatar
unknown
python
a year ago
1.9 kB
3
Indexable
Never
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd

# Load the sarcasm-headlines dataset (one JSON record per line) and split it
# into train/test partitions by shuffled index.
url = 'https://raw.githubusercontent.com/rishabhmisra/News-Headlines-Dataset-For-Sarcasm-Detection/master/Sarcasm_Headlines_Dataset.json'
df = pd.read_json(url, lines=True)
headlines = df['headline'].values
labels = df['is_sarcastic'].values

# Shuffle indices so the split is random with respect to record order.
# NOTE(review): the shuffle is unseeded, so the split differs on every run —
# consider np.random.seed(...) or a Generator if reproducibility matters.
indices = np.arange(len(headlines))
np.random.shuffle(indices)

# 80/20 train/test split via NumPy fancy indexing — replaces the former
# per-element Python loop with a single vectorized gather per array.
train_ratio = 0.8
split_index = int(train_ratio * len(headlines))
train_idx, test_idx = indices[:split_index], indices[split_index:]

train_headlines = headlines[train_idx]
test_headlines = headlines[test_idx]
train_labels = labels[train_idx]
test_labels = labels[test_idx]

# Vocabulary cap and fixed sequence length for the model input.
max_words = 10000
maxlen = 100

# Fit the vocabulary on the training headlines only, so the test set
# cannot leak words into the tokenizer.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_headlines)

def _encode(texts):
    # Map raw strings to integer id sequences, then pad/truncate to maxlen.
    return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=maxlen)

train_padded = _encode(train_headlines)
test_padded = _encode(test_headlines)

# Build model: embedding -> simple recurrent layer -> sigmoid binary output.
model = Sequential()
model.add(Embedding(max_words, 32, input_length=maxlen))
model.add(SimpleRNN(32))
model.add(Dense(1, activation='sigmoid'))

# Binary cross-entropy matches the single-sigmoid-unit output.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model, holding out 20% of the training data for validation.
model.fit(train_padded, train_labels, epochs=5, batch_size=32, validation_split=0.2)