# 20240823_CONV
import torch
import torch.nn as nn
import torch.optim as optim
import time
import subprocess
import numpy as np
import torch.cuda.nvtx as nvtx
# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
print(torch.version.cuda)
class TFLiteMaximum(nn.Module):
    def __init__(self, channels):
        super(TFLiteMaximum, self).__init__()
        # Initialize a learnable per-channel parameter for the maximum operation
        self.threshold = nn.Parameter(torch.zeros(1, channels, 1, 1))

    def forward(self, x):
        return torch.maximum(x, self.threshold)
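# A quick sanity check (illustrative only, not part of the original script) of how the
# (1, channels, 1, 1) threshold broadcasts against an (N, C, H, W) activation:
#
#   layer = TFLiteMaximum(16)
#   out = layer(torch.randn(2, 16, 28, 28))
#   assert out.shape == (2, 16, 28, 28)
#   assert torch.all(out >= 0)  # thresholds start at zero, so this initially behaves like ReLU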
class CNNModel(nn.Module):
    def __init__(self):
        super(CNNModel, self).__init__()
        self.conv = nn.Conv2d(1, 16, kernel_size=3, padding=1, bias=True)
        nn.init.constant_(self.conv.bias, 16)  # Set bias to 16
        self.maximum = TFLiteMaximum(16)  # 16 channels from conv layer
        self.max_pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.fc = nn.Linear(16 * 14 * 14, 10)  # Adjust based on input size

    def forward(self, x):
        nvtx.range_push("Conv2D")
        x = self.conv(x)
        nvtx.range_pop()

        nvtx.range_push("Multiplication")
        x = x * 0.10000000  # Multiplication layer
        nvtx.range_pop()

        nvtx.range_push("MAXIMUM")
        x = self.maximum(x)
        nvtx.range_pop()

        nvtx.range_push("MaxPool2D")
        x = self.max_pool(x)
        nvtx.range_pop()

        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x
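# Shape trace for the 28x28 single-channel inputs used below (assuming the MNIST-sized dummy
# data generated in __main__):
#   conv:     (N, 1, 28, 28) -> (N, 16, 28, 28)   (kernel_size=3 with padding=1 keeps spatial size)
#   maximum:  elementwise, shape unchanged
#   max_pool: (N, 16, 28, 28) -> (N, 16, 14, 14)
#   flatten:  (N, 16, 14, 14) -> (N, 3136)        (16 * 14 * 14 matches the fc in_features)
#   fc:       (N, 3136) -> (N, 10)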
# Function to check GPU memory usage
def get_gpu_memory_usage():
    try:
        command = "nvidia-smi --query-gpu=memory.used --format=csv,nounits,noheader"
        memory_use = subprocess.check_output(command.split()).decode('ascii').strip().split('\n')[0]
        return int(memory_use)
    except (subprocess.CalledProcessError, FileNotFoundError, ValueError):
        # nvidia-smi missing, failed, or returned unparseable output
        return 0
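# Note: nvidia-smi reports device-wide usage in MiB. If only this process's tensor allocations
# are of interest, PyTorch's own counters can be queried instead -- a minimal alternative
# (the helper name is illustrative, not from the original paste):
#
#   def get_torch_memory_mb():
#       if not torch.cuda.is_available():
#           return 0.0
#       return torch.cuda.memory_allocated() / (1024 ** 2)  # bytes -> MiB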
# Training function with NVTX ranges
def train_model(model, criterion, optimizer, x_train, y_train, num_epochs):
    for epoch in range(num_epochs):
        nvtx.range_push(f"Epoch {epoch}")
        print(f"Epoch {epoch+1}/{num_epochs}")
        start_time = time.time()

        nvtx.range_push("Forward Pass")
        model.train()
        outputs = model(x_train)
        nvtx.range_pop()  # End of Forward Pass

        nvtx.range_push("Loss Calculation")
        loss = criterion(outputs, y_train)
        nvtx.range_pop()  # End of Loss Calculation

        nvtx.range_push("Backward Pass")
        optimizer.zero_grad()
        loss.backward()
        nvtx.range_pop()  # End of Backward Pass

        nvtx.range_push("Optimizer Step")
        optimizer.step()
        nvtx.range_pop()  # End of Optimizer Step

        # Calculate accuracy
        # _, predicted = torch.max(outputs.data, 1)
        # accuracy = (predicted == y_train).sum().item() / y_train.size(0)

        # Print metrics
        # print(f"Loss: {loss.item():.4f}, Accuracy: {accuracy:.4f}")
        # print(f"GPU Memory Usage: {get_gpu_memory_usage()} MB")
        # print(f"Time taken: {time.time() - start_time:.2f} seconds")
        # print("------")

        nvtx.range_pop()  # End of Epoch
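# train_model above takes one full-batch gradient step per epoch on the entire 10,000-sample
# tensor. A mini-batch variant is sketched below (illustrative only, with an assumed batch size
# of 256; it is not what the original script runs):
#
#   from torch.utils.data import TensorDataset, DataLoader
#
#   loader = DataLoader(TensorDataset(x_train, y_train), batch_size=256, shuffle=True)
#   for xb, yb in loader:
#       optimizer.zero_grad()
#       loss = criterion(model(xb), yb)
#       loss.backward()
#       optimizer.step()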
if __name__ == "__main__":
    nvtx.range_push("Model Creation and Data Generation")
    # Create the model and move it to GPU if available
    model = CNNModel().to(device)

    # Define loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters())

    # Generate some dummy data
    x_train = torch.randn(10000, 1, 28, 28).to(device)
    y_train = torch.randint(0, 10, (10000,)).to(device)
    nvtx.range_pop()

    nvtx.range_push("Training")
    train_model(model, criterion, optimizer, x_train, y_train, num_epochs=50)
    nvtx.range_pop()
nvtx.range_push("GPU Inference")
model.eval()
with torch.no_grad():
start_time = time.time()
predictions = model(x_train)
print(f"GPU Inference time: {time.time() - start_time:.4f} seconds")
nvtx.range_pop()
# nvtx.range_push("CPU Inference")
# model_cpu = model.to('cpu')
# x_train_cpu = x_train.to('cpu')
# with torch.no_grad():
# start_time = time.time()
# predictions_cpu = model_cpu(x_train_cpu)
# print(f"CPU Inference time: {time.time() - start_time:.4f} seconds")
# nvtx.range_pop()Editor is loading...
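# The NVTX ranges only become visible when the script is run under a profiler; a typical
# Nsight Systems invocation (output name and script filename are placeholders, not from the
# original paste) is:
#
#   nsys profile --trace=cuda,nvtx -o cnn_nvtx_profile python train_nvtx.py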