# Untitled
#
# mail@pastecode.io avatar
# unknown
# python
# 2 years ago
# 2.7 kB
# 4
# Indexable
# Never
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext.data import BucketIterator, Field, TabularDataset

# Describe how each CSV column is parsed.
# `script` is sequential text: lower-cased and tokenized with the 'spacy' option.
script = Field(
    tokenize='spacy',
    lower=True,
    sequential=True,
)
# `genre` is a single non-sequential value that bypasses vocabulary lookup.
genre = Field(use_vocab=False, sequential=False)

# Read movie_scripts.csv (header row skipped), binding its columns to the
# two fields in order.
data = TabularDataset(
    fields=[('script', script), ('genre', genre)],
    skip_header=True,
    format='csv',
    path='movie_scripts.csv',
)

# Partition the examples into a training part and a held-out test part
# (split() is called with its defaults).
train_data, test_data = data.split()

# The vocabulary is derived from the training part only.
script.build_vocab(train_data)

# Define the CNN architecture
class CNN(nn.Module):
    """Convolutional text classifier over token-id sequences.

    Token ids are embedded, passed through one convolution per filter size
    (each window spans the full embedding width), max-pooled over the
    sequence axis, concatenated, and projected to ``output_dim`` scores.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        num_filters: Output channels of every convolution.
        filter_sizes: Sequence of window heights in tokens; one convolution
            is created per entry.
        output_dim: Number of scores produced per example.
    """

    def __init__(self, vocab_size, embedding_dim, num_filters, filter_sizes, output_dim):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1,
                      out_channels=num_filters,
                      kernel_size=(fs, embedding_dim))
            for fs in filter_sizes
        ])
        self.fc = nn.Linear(num_filters * len(filter_sizes), output_dim)
        # Shortest sequence that every convolution window still fits into.
        self._min_seq_len = max(filter_sizes, default=0)

    def forward(self, x):
        """Return unnormalized scores of shape (batch_size, output_dim).

        Args:
            x: Integer token ids laid out as (seq_len, batch_size).
        """
        x = x.permute(1, 0)    # put the batch size first: (batch_size, seq_len)
        x = self.embedding(x)  # (batch_size, seq_len, emb_dim)

        # A convolution cannot run on a sequence shorter than its window, so
        # zero-pad the sequence axis up to the largest filter size if needed.
        shortfall = self._min_seq_len - x.size(1)
        if shortfall > 0:
            x = F.pad(x, (0, 0, 0, shortfall))

        x = x.unsqueeze(1)     # (batch_size, 1, seq_len, emb_dim)

        pooled = []
        for conv in self.convs:
            # The window covers the whole embedding width, so the last axis
            # collapses to size 1 and is dropped.
            feature_map = F.relu(conv(x)).squeeze(3)
            # Max over every position -> (batch_size, num_filters)
            pooled.append(F.max_pool1d(feature_map, feature_map.size(2)).squeeze(2))

        x = torch.cat(pooled, 1)
        return self.fc(x)

# Instantiate the model
VOCAB_SIZE = len(script.vocab)
EMBEDDING_DIM = 100
NUM_FILTERS = 100
FILTER_SIZES = [3, 4, 5]
# `genre` is declared with use_vocab=False, so each label is expected to be an
# integer class id already; int() also accepts it if it was loaded as text.
# nn.CrossEntropyLoss needs every target to lie in [0, OUTPUT_DIM - 1], so the
# output layer is sized from the largest id. This equals the number of distinct
# ids whenever they are contiguous and start at 0.
OUTPUT_DIM = max(int(example.genre) for example in data.examples) + 1
model = CNN(VOCAB_SIZE, EMBEDDING_DIM, NUM_FILTERS, FILTER_SIZES, OUTPUT_DIM)

# Define the loss function and the optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())  # Adam with its default settings

# Batch both splits. `sort_key` tells the bucketing iterator how to compare
# examples so that scripts of similar length end up in the same batch.
BATCH_SIZE = 32
train_iter, test_iter = BucketIterator.splits(
    (train_data, test_data),
    batch_size=BATCH_SIZE,
    sort_key=lambda example: len(example.script),
)

# Train the model
model.train()
for epoch in range(10):
    for batch in train_iter:
        # Use fresh names: rebinding `script`/`genre` here would overwrite
        # the module-level Field objects.
        scripts, labels = batch.script, batch.genre
        optimizer.zero_grad()
        output = model(scripts)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()

# Evaluate the model on the test data
model.eval()
loss_sum = 0.0
num_correct = 0
num_examples = 0
with torch.no_grad():
    for batch in test_iter:
        scripts, labels = batch.script, batch.genre
        output = model(scripts)
        batch_size = labels.size(0)
        # Weight each batch by its size so a smaller final batch does not
        # skew the averages; .item() keeps plain Python numbers.
        loss_sum += criterion(output, labels).item() * batch_size
        num_correct += (output.argmax(1) == labels).sum().item()
        num_examples += batch_size

# Guard against an empty test split.
denominator = max(num_examples, 1)
test_loss = loss_sum / denominator
test_acc = num_correct / denominator
print('Test Loss:', test_loss)
print('Test Acc:', test_acc)