Untitled
# Run a CUTLASS Conv2d forward-propagation kernel with a leaky-ReLU epilogue
# and check the result against a PyTorch reference.

import torch
import cutlass
from cutlass.epilogue import leaky_relu, relu
from cutlass import Tensor as FakeTensor
from cutlass.utils.profiler import CUDAEventProfiler

# Controls whether the C++ declaration of the kernel is printed when the plan
# is run. Set to `False` to omit this information.
print_module = True

# Problem size.
# Input tensor: [N, H, W, C] under channel-last layout.
# Larger alternatives that were tried: [32, 832, 832, 60], [32, 1664, 1664, 60].
N, H, W, C = [32, 416, 416, 64]
# Weight tensor: [K, R, S, C] under channel-last layout.
K, R, S = [16, 3, 3]
stride = (2, 2)
padding = (1, 1)
dilation = (1, 1)

# Compute the output size [N, P, Q, K].
N, P, Q, K = cutlass.Conv2d.output_size(
    (N, H, W, C), (K, R, S, C), padding, stride, dilation
)
# Print the computed size so it can be checked by eye.
print(f"Computed output size: N={N}, P={P}, Q={Q}, K={K}")

# Data types.
dtype = torch.float16
type_A = torch.float16
type_B = torch.float16
type_C = torch.float16
type_D = torch.float16

# Create the operands. They are allocated in PyTorch's logical NCHW shape and
# then converted to channels-last memory format. `ceil` makes every value an
# integer, which keeps the fp16 comparison against PyTorch well-behaved.
torch.manual_seed(1234)

# NOTE: `input` shadows the builtin of the same name; the name is kept so that
# existing references to it continue to work.
input = torch.ceil(
    torch.empty(size=(N, C, H, W), dtype=type_A, device="cuda").uniform_(-4.5, 3.5)
).to(memory_format=torch.channels_last)
weight = torch.ceil(
    torch.empty(size=(K, C, R, S), dtype=type_B, device="cuda").uniform_(-4.5, 3.5)
).to(memory_format=torch.channels_last)
tensor_C = torch.ceil(
    torch.empty(size=(N, K, P, Q), dtype=type_C, device="cuda").uniform_(-4.5, 3.5)
).to(memory_format=torch.channels_last)
output = torch.zeros_like(tensor_C)

alpha = 1.0
beta = 0.0

# Create and configure the Conv2d forward-propagation plan.
plan = cutlass.Conv2dFprop(element=dtype, element_accumulator=torch.float32)

# Select leaky ReLU as the epilogue activation; the second tuple element is
# the activation's argument (the negative slope).
negative_slope = 0.5
plan.activation = ("leaky_relu", negative_slope)

# Run the convolution with leaky ReLU; the result is written into `output`.
plan.run(
    input,
    weight,
    tensor_C,
    output,
    stride,
    padding,
    dilation,
    alpha,
    beta,
    print_module=print_module,
)


def verify_results():
    """Check the CUTLASS result in `output` against a PyTorch reference.

    The reference is ``leaky_relu(alpha * conv2d(input, weight) + beta * tensor_C)``
    computed with the module-level operands, stride, padding, dilation and
    negative slope. The maximum absolute difference is printed before the
    tolerance check.

    Raises:
        AssertionError: If the two results differ by more than
            ``rtol=1e-2, atol=1e-2``.
    """
    reference = alpha * torch.ops.aten.conv2d(
        input, weight, stride=stride, padding=padding, dilation=dilation
    ) + beta * tensor_C
    reference = torch.nn.functional.leaky_relu(
        reference, negative_slope=negative_slope
    )

    max_diff = torch.max(torch.abs(reference - output))
    print(f"Maximum difference between CUTLASS and PyTorch: {max_diff}")

    # Raise explicitly rather than using `assert`, so the check still runs
    # under `python -O`.
    if not torch.allclose(reference, output, rtol=1e-2, atol=1e-2):
        raise AssertionError(
            "CUTLASS output does not match the PyTorch reference "
            f"(max abs diff: {max_diff})"
        )
    print("Verification successful!")


verify_results()
Leave a Comment