# Paste metadata (web-page residue, kept as comments so the file parses):
#   Untitled
#   unknown
#   python
#   3 years ago
#   2.7 kB
#   17
#   Indexable
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.data import BucketIterator, Field, TabularDataset
# Field definitions: script text is tokenized with spaCy and lower-cased;
# genre is a single non-sequential value used without a vocabulary
# (assumes the CSV already stores genres as integer ids -- TODO confirm).
script = Field(tokenize='spacy', lower=True, sequential=True)
genre = Field(use_vocab=False, sequential=False)

# Read the CSV, skipping its header row; columns map onto the fields in order.
data = TabularDataset(
    path='movie_scripts.csv',
    format='csv',
    skip_header=True,
    fields=[('script', script), ('genre', genre)],
)

# Partition into train/test subsets using the dataset's default split settings.
train_data, test_data = data.split()

# The vocabulary is derived from the training subset only.
script.build_vocab(train_data)
# Define the CNN architecture
class CNN(nn.Module):
    """Convolutional text classifier with several parallel filter heights.

    Token indices are embedded, passed through one 2-D convolution per entry
    of ``filter_sizes`` (each kernel spans the full embedding width),
    max-pooled over the sequence axis, concatenated and projected to
    ``output_dim`` logits.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        num_filters: Output channels produced by every convolution.
        filter_sizes: Sequence of kernel heights (in tokens); one convolution
            is created per entry.
        output_dim: Number of logits returned per example.
    """

    def __init__(self, vocab_size, embedding_dim, num_filters, filter_sizes, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.convs = nn.ModuleList([
            nn.Conv2d(
                in_channels=1,
                out_channels=num_filters,
                kernel_size=(fs, embedding_dim),
            )
            for fs in filter_sizes
        ])
        self.fc = nn.Linear(num_filters * len(filter_sizes), output_dim)

    def forward(self, x):
        """Return logits of shape (batch_size, output_dim).

        Args:
            x: Integer tensor of token indices laid out as
                (seq_len, batch_size). The convolutions use no padding, so
                seq_len must be at least the largest filter size.
        """
        # The original code called `F.relu` / `F.max_pool1d`, but `F` was never
        # imported; `nn.functional` is the same module and is already in scope.
        functional = nn.functional

        x = x.permute(1, 0)  # put the batch size first: (batch_size, seq_len)
        x = self.embedding(x)  # (batch_size, seq_len, emb_dim)
        x = x.unsqueeze(1)  # (batch_size, 1, seq_len, emb_dim)

        # Each kernel covers the whole embedding width, so the last axis
        # collapses to size 1 and is squeezed away.
        conved = [functional.relu(conv(x)).squeeze(3) for conv in self.convs]
        # Max over the remaining sequence axis: (batch_size, num_filters).
        pooled = [functional.max_pool1d(c, c.size(2)).squeeze(2) for c in conved]

        x = torch.cat(pooled, 1)
        x = self.fc(x)
        return x
# Instantiate the model
VOCAB_SIZE = len(script.vocab)
EMBEDDING_DIM = 100
NUM_FILTERS = 100
FILTER_SIZES = [3, 4, 5]
# One output logit per distinct genre label found in the dataset.
# The previous expression was missing a closing parenthesis (SyntaxError) and
# built a set from the tensor returned by `genre.process(...)`; tensor elements
# hash by identity, so that set would never de-duplicate equal labels.
# Counting the raw per-example label values avoids both problems.
# NOTE(review): nn.CrossEntropyLoss expects class indices in 0..OUTPUT_DIM-1 --
# confirm the CSV encodes genres that way.
OUTPUT_DIM = len({example.genre for example in data.examples})
model = CNN(VOCAB_SIZE, EMBEDDING_DIM, NUM_FILTERS, FILTER_SIZES, OUTPUT_DIM)

# Define the loss function and the optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())
# Build the batch iterators. The loops below referenced `train_iter` and
# `test_iter`, but nothing in this file ever created them.
# NOTE(review): 64 is a reviewer-chosen batch size -- tune as needed.
BATCH_SIZE = 64
train_iter, test_iter = BucketIterator.splits(
    (train_data, test_data),
    batch_size=BATCH_SIZE,
    # TabularDataset defines no sort key; bucketing by script length needs one.
    sort_key=lambda example: len(example.script),
)

# Train the model
model.train()
for epoch in range(10):
    for batch in train_iter:
        # Use names distinct from the module-level `script` / `genre` Field
        # objects so those are not overwritten by batch tensors.
        scripts, labels = batch.script, batch.genre
        optimizer.zero_grad()
        logits = model(scripts)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

# Evaluate the model on the test data
model.eval()
test_loss = 0.0
test_acc = 0.0
with torch.no_grad():
    for batch in test_iter:
        scripts, labels = batch.script, batch.genre
        logits = model(scripts)
        # .item() keeps the running totals as plain Python floats, not tensors.
        test_loss += criterion(logits, labels).item()
        test_acc += (logits.argmax(1) == labels).float().mean().item()

# Both metrics are averages of per-batch values.
num_test_batches = len(test_iter)
print('Test Loss:', test_loss / num_test_batches)
print('Test Acc:', test_acc / num_test_batches)
# Editor is loading... (web-page residue, not part of the script)