a month ago
4.5 kB
"""Profile a small CNN (Conv2D -> multiply -> maximum -> max-pool -> linear).

The script trains the model on random data and then times a single inference
pass.  Each stage is wrapped in an NVTX range so it can be located in an
NVIDIA profiler timeline.
"""

import contextlib
import subprocess
import time

import numpy as np
import torch
import torch.cuda.nvtx as nvtx
import torch.nn as nn
import torch.optim as optim

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
print(torch.version.cuda)

# NVTX markers are only emitted when a CUDA device is usable.
# NOTE(review): torch.cuda.nvtx is believed to be unavailable on CPU-only
# PyTorch builds; skipping the calls keeps the CPU fallback above working.
_NVTX_ENABLED = torch.cuda.is_available()


@contextlib.contextmanager
def _nvtx_range(name):
    """Wrap the enclosed statements in an NVTX range called *name*.

    The range is always popped, even if the body raises, so push/pop calls
    can never become unbalanced.  Without CUDA this is a no-op.
    """
    if not _NVTX_ENABLED:
        yield
        return
    nvtx.range_push(name)
    try:
        yield
    finally:
        nvtx.range_pop()


def _synchronize():
    """Wait for queued CUDA work to finish; do nothing when running on CPU."""
    if device.type == "cuda":
        torch.cuda.synchronize()


class TFLiteMaximum(nn.Module):
    """Element-wise maximum of the input and a learnable per-channel threshold."""

    def __init__(self, channels):
        super().__init__()
        # Learnable threshold of shape (1, channels, 1, 1), initialised to
        # zero; it broadcasts against the input in forward().
        self.threshold = nn.Parameter(torch.zeros(1, channels, 1, 1))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``torch.maximum(x, threshold)``."""
        return torch.maximum(x, self.threshold)


class CNNModel(nn.Module):
    """Conv2D -> scale by 0.1 -> TFLiteMaximum -> MaxPool2D -> Linear(10)."""

    def __init__(self):
        super().__init__()
        self.conv = nn.Conv2d(1, 16, kernel_size=3, padding=1, bias=True)
        nn.init.constant_(self.conv.bias, 16)  # Set bias to 16
        self.maximum = TFLiteMaximum(16)  # 16 channels from conv layer
        self.max_pool = nn.MaxPool2d(kernel_size=2, stride=2)
        # 16 channels * 14 * 14 after pooling; this matches the 28x28
        # single-channel inputs generated in main().
        self.fc = nn.Linear(16 * 14 * 14, 10)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the network, tagging each stage with its own NVTX range."""
        with _nvtx_range("Conv2D"):
            x = self.conv(x)

        with _nvtx_range("Multiplication"):
            x = x * 0.10000000  # Multiplication layer

        with _nvtx_range("MAXIMUM"):
            x = self.maximum(x)

        with _nvtx_range("MaxPool2D"):
            x = self.max_pool(x)

        # Flatten everything except the batch dimension for the linear layer.
        x = x.view(x.size(0), -1)
        return self.fc(x)


def get_gpu_memory_usage() -> int:
    """Return the used memory of the first GPU listed by ``nvidia-smi``.

    The value is the integer printed by
    ``nvidia-smi --query-gpu=memory.used --format=csv,nounits,noheader``
    (nvidia-smi documents this figure as MiB).  Returns 0 when the tool is
    missing, exits with an error, or prints something that is not an integer.
    """
    command = [
        "nvidia-smi",
        "--query-gpu=memory.used",
        "--format=csv,nounits,noheader",
    ]
    try:
        output = subprocess.check_output(command).decode("ascii")
        # One line per GPU; only the first one is reported.
        return int(output.strip().split("\n")[0])
    except (OSError, subprocess.SubprocessError, ValueError):
        # Best-effort probe: a missing binary, a failing call or an
        # unparsable answer all mean "unknown", reported as 0.
        return 0


def train_model(model, criterion, optimizer, x_train, y_train, num_epochs):
    """Train *model* full-batch for *num_epochs* epochs with NVTX ranges.

    Args:
        model: Module to optimise; switched to training mode every epoch.
        criterion: Loss callable invoked as ``criterion(outputs, y_train)``.
        optimizer: Optimizer holding the parameters of *model*.
        x_train: Inputs, passed to the model in a single batch.
        y_train: Targets handed to *criterion*.
        num_epochs: Number of optimisation steps to run.

    Returns:
        None.  The model and optimizer are updated in place.
    """
    for epoch in range(num_epochs):
        with _nvtx_range(f"Epoch {epoch}"):
            print(f"Epoch {epoch+1}/{num_epochs}")

            with _nvtx_range("Forward Pass"):
                model.train()
                outputs = model(x_train)

            with _nvtx_range("Loss Calculation"):
                loss = criterion(outputs, y_train)

            with _nvtx_range("Backward Pass"):
                optimizer.zero_grad()
                loss.backward()

            with _nvtx_range("Optimizer Step"):
                optimizer.step()

            # Optional per-epoch metrics, left disabled as in the original:
            # _, predicted = torch.max(outputs.data, 1)
            # accuracy = (predicted == y_train).sum().item() / y_train.size(0)
            # print(f"Loss: {loss.item():.4f}, Accuracy: {accuracy:.4f}")
            # print(f"GPU Memory Usage: {get_gpu_memory_usage()} MB")
            # print("------")


def main() -> None:
    """Build the model, train it on random data and time one inference pass."""
    with _nvtx_range("Model Creation and Data Generation"):
        # Create the model and move it to GPU if available
        model = CNNModel().to(device)

        # Define loss function and optimizer
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters())

        # Generate some dummy data directly on the target device
        x_train = torch.randn(10000, 1, 28, 28, device=device)
        y_train = torch.randint(0, 10, (10000,), device=device)

    with _nvtx_range("Training"):
        train_model(model, criterion, optimizer, x_train, y_train, num_epochs=50)

    with _nvtx_range("GPU Inference"):
        model.eval()
        with torch.no_grad():
            # CUDA kernels run asynchronously: drain pending work before
            # starting the clock and wait for the forward pass before
            # stopping it, otherwise only the launch overhead is measured.
            _synchronize()
            start_time = time.perf_counter()
            model(x_train)
            _synchronize()
            elapsed = time.perf_counter() - start_time
        print(f"GPU Inference time: {elapsed:.4f} seconds")

    # Optional CPU comparison, left disabled as in the original:
    # model_cpu = model.to('cpu')
    # x_train_cpu = x_train.to('cpu')
    # with torch.no_grad():
    #     start_time = time.time()
    #     predictions_cpu = model_cpu(x_train_cpu)
    #     print(f"CPU Inference time: {time.time() - start_time:.4f} seconds")


if __name__ == "__main__":
    main()
Leave a Comment