Fusing Conv2d + LeakyReLU with the CUTLASS Python Interface
import torch

import cutlass
from cutlass.epilogue import leaky_relu, relu
from cutlass import Tensor as FakeTensor
from cutlass.utils.profiler import CUDAEventProfiler

# This controls whether the C++ GEMM declaration will be printed at each step.
# Set to `False` to omit this information.
print_module = True

dtype = torch.float16
acc_type = torch.float32  # accumulator data type

# plan = cutlass.op.GroupedGemm(element=dtype, layout=cutlass.LayoutType.RowMajor)
# plan = cutlass.Conv2dFprop(element=dtype, element_accumulator=acc_type)
plan = cutlass.op.Conv2dFprop(
    element=dtype,                 # input/output data type
    element_accumulator=acc_type,  # accumulator data type
)
# plan = cutlass.op.Conv2d(
#     element=dtype,                 # input/output data type
#     element_accumulator=acc_type,  # accumulator data type
# )

negative_slope = 0.1

# plan.conv_kind = 0
# class ConvKind(enum.IntEnum):
#     Fprop = 0
#     Dgrad = 1
#     Wgrad = 2
plan.activation = ("leaky_relu", negative_slope)

op = plan.construct()
conv2d_leakyrelu = cutlass.emit.pytorch(
    op, name='conv2d', cc=plan.cc, sourcedir='conv2d+leakyrelu', jit=True
)

# Set up the problem size
N, H, W, C = [32, 416, 416, 64]    # Input tensor: [N, H, W, C] under channel-last layout
# N, H, W, C = [32, 832, 832, 60]    # Input tensor: [N, H, W, C] under channel-last layout
# N, H, W, C = [32, 1664, 1664, 60]  # Input tensor: [N, H, W, C] under channel-last layout
K, R, S = [16, 3, 3]               # Weight tensor: [K, R, S, C] under channel-last layout
stride = (2, 2)
padding = (1, 1)
dilation = (1, 1)

# Compute the output size [N, P, Q, K]
N, P, Q, K = cutlass.Conv2d.output_size((N, H, W, C), (K, R, S, C), padding, stride, dilation)
print(f"Computed output size: N={N}, P={P}, Q={Q}, K={K}")  # add this line to confirm the computed size

# Set up data types
dtype = torch.float16
type_A = torch.float16
type_B = torch.float16
type_C = torch.float16
type_D = torch.float16

# Create input tensors
torch.manual_seed(1234)
input = torch.ceil(
    torch.empty(size=(N, C, H, W), dtype=type_A, device="cuda").uniform_(-4.5, 3.5)
).to(memory_format=torch.channels_last)
weight = torch.ceil(
    torch.empty(size=(K, C, R, S), dtype=type_B, device="cuda").uniform_(-4.5, 3.5)
).to(memory_format=torch.channels_last)
tensor_C = torch.ceil(
    torch.empty(size=(N, K, P, Q), dtype=type_C, device="cuda").uniform_(-4.5, 3.5)
).to(memory_format=torch.channels_last)
output = torch.zeros_like(tensor_C)
temp = torch.ceil(
    torch.empty(size=(K, P, Q), dtype=type_C, device="cuda").uniform_(-4.5, 3.5)
)

alpha = 1.0
beta = 0.0
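The script above builds and JIT-compiles the fused kernel but never actually launches it or checks the result. The sketch below runs the plan, compares the output against a plain PyTorch reference (F.conv2d followed by leaky_relu), and times the fused kernel with the CUDAEventProfiler already imported at the top. The plan.run argument order and the CUDAEventProfiler (warmup_iterations, iterations) positions follow the CUTLASS Python conv2d example; treat them as assumptions and verify against your installed CUTLASS version.

import torch.nn.functional as F

# Run the fused kernel:
#   output = leaky_relu(alpha * conv(input, weight) + beta * tensor_C)
# NOTE: the argument order here mirrors the CUTLASS Python conv2d example
# and is an assumption; check it against your CUTLASS version.
plan.run(input, weight, tensor_C, output,
         stride, padding, dilation, alpha, beta, print_module=print_module)

# PyTorch reference, computed in float32 for a tolerant comparison.
ref = F.leaky_relu(
    alpha * F.conv2d(input.float(), weight.float(),
                     stride=stride, padding=padding, dilation=dilation)
    + beta * tensor_C.float(),
    negative_slope=negative_slope,
)
print("max |cutlass - pytorch| =", (output.float() - ref).abs().max().item())

# Time the fused kernel.
# NOTE: the (warmup_iterations, iterations) positions are an assumption.
profiler = CUDAEventProfiler(plan, 100, 100, input, weight, tensor_C, output,
                             stride, padding, dilation, alpha, beta)
print(f"CUTLASS conv2d+leaky_relu duration: {profiler()} ms")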