Untitled
unknown
python
2 years ago
2.7 kB
7
Indexable
# Train and evaluate a CNN text classifier that predicts a movie's genre from
# its script.
#
# Input: ``movie_scripts.csv`` with a header row and two columns, in order:
# the script text and the genre. The genre field is declared with
# ``use_vocab=False``, so genre values must already be integer class ids.

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

EMBEDDING_DIM = 100
NUM_FILTERS = 100
FILTER_SIZES = [3, 4, 5]
BATCH_SIZE = 32
NUM_EPOCHS = 10


class CNN(nn.Module):
    """Text classifier using parallel convolutions with max-over-time pooling.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        num_filters: Output channels produced by each convolution.
        filter_sizes: Window heights (in tokens), one convolution per entry.
        output_dim: Number of classes (size of the returned logit vector).
    """

    def __init__(self, vocab_size, embedding_dim, num_filters, filter_sizes, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.convs = nn.ModuleList(
            [
                nn.Conv2d(
                    in_channels=1,
                    out_channels=num_filters,
                    kernel_size=(fs, embedding_dim),
                )
                for fs in filter_sizes
            ]
        )
        self.fc = nn.Linear(num_filters * len(filter_sizes), output_dim)
        # Shortest sequence every convolution can slide over at least once.
        self.min_seq_len = max(filter_sizes, default=0)

    def forward(self, x):
        """Return class logits of shape (batch_size, output_dim).

        Args:
            x: Integer token ids laid out as (seq_len, batch_size).
        """
        x = x.permute(1, 0)  # put the batch size first: (batch_size, seq_len)
        x = self.embedding(x)  # (batch_size, seq_len, emb_dim)

        # A batch shorter than the widest filter would make Conv2d raise, so
        # zero-pad the embedded sequence up to the minimum usable length.
        shortfall = self.min_seq_len - x.size(1)
        if shortfall > 0:
            x = F.pad(x, (0, 0, 0, shortfall))

        x = x.unsqueeze(1)  # (batch_size, 1, seq_len, emb_dim)
        conved = [F.relu(conv(x)).squeeze(3) for conv in self.convs]
        pooled = [F.max_pool1d(c, c.size(2)).squeeze(2) for c in conved]
        return self.fc(torch.cat(pooled, 1))


def train(model, iterator, criterion, optimizer, num_epochs):
    """Run ``num_epochs`` passes of gradient descent over ``iterator``.

    Each batch must expose ``script`` (token ids) and ``genre`` (class ids).
    """
    model.train()
    for _ in range(num_epochs):
        for batch in iterator:
            optimizer.zero_grad()
            loss = criterion(model(batch.script), batch.genre)
            loss.backward()
            optimizer.step()


def evaluate(model, iterator, criterion):
    """Return ``(mean_loss, accuracy)`` as floats over every example seen.

    Both values are weighted by batch size, so a smaller final batch does not
    skew the result. This assumes ``criterion`` returns a per-batch mean (the
    default reduction). Returns ``(nan, nan)`` when ``iterator`` is empty.
    The model's training/eval mode is restored before returning.
    """
    was_training = model.training
    model.eval()
    total_loss = 0.0
    correct = 0
    seen = 0
    with torch.no_grad():
        for batch in iterator:
            tokens, labels = batch.script, batch.genre
            output = model(tokens)
            count = labels.size(0)
            total_loss += criterion(output, labels).item() * count
            correct += (output.argmax(1) == labels).sum().item()
            seen += count
    model.train(was_training)
    if seen == 0:
        return float('nan'), float('nan')
    return total_loss / seen, correct / seen


def main():
    """Load the dataset, train the CNN, and print test loss and accuracy."""
    # Field/TabularDataset/BucketIterator live in torchtext.legacy.data for
    # torchtext 0.9-0.11 and in torchtext.data for earlier releases.
    try:
        from torchtext.legacy.data import BucketIterator, Field, TabularDataset
    except ImportError:
        from torchtext.data import BucketIterator, Field, TabularDataset

    # Define the fields for the dataset
    script = Field(sequential=True, tokenize='spacy', lower=True)
    genre = Field(sequential=False, use_vocab=False)

    # Load the dataset
    data = TabularDataset(
        path='movie_scripts.csv',
        format='csv',
        fields=[('script', script), ('genre', genre)],
        skip_header=True,
    )

    # Split the dataset into training and testing sets
    train_data, test_data = data.split()

    # Build the vocabulary
    script.build_vocab(train_data)

    # CrossEntropyLoss needs every label in [0, OUTPUT_DIM), so size the
    # output layer from the largest id rather than the count of distinct ids.
    label_ids = [int(example.genre) for example in data.examples]
    if not label_ids:
        raise ValueError('movie_scripts.csv contains no examples')
    if min(label_ids) < 0:
        raise ValueError('genre ids must be non-negative integers')

    VOCAB_SIZE = len(script.vocab)
    OUTPUT_DIM = max(label_ids) + 1

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # sort_key lets the iterator bucket scripts of similar length together.
    train_iter, test_iter = BucketIterator.splits(
        (train_data, test_data),
        batch_size=BATCH_SIZE,
        sort_key=lambda example: len(example.script),
        device=device,
    )

    # Instantiate the model
    model = CNN(VOCAB_SIZE, EMBEDDING_DIM, NUM_FILTERS, FILTER_SIZES, OUTPUT_DIM).to(device)

    # Define the loss function and the optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters())

    # Train the model
    train(model, train_iter, criterion, optimizer, NUM_EPOCHS)

    # Evaluate the model on the test data
    test_loss, test_acc = evaluate(model, test_iter, criterion)
    print('Test Loss:', test_loss)
    print('Test Acc:', test_acc)


if __name__ == '__main__':
    main()
Editor is loading...