# Paste-site header (page residue, not part of the script):
# Untitled | unknown | python | 10 months ago | 2.6 kB | 3 | Indexable
import torch
import cutlass
from cutlass.epilogue import leaky_relu,relu
from cutlass import Tensor as FakeTensor
from cutlass.utils.profiler import CUDAEventProfiler
# Flag controlling whether the C++ GEMM declaration is printed at each step;
# set to False to omit it. (Not referenced in this part of the script.)
print_module = True

# Element type of the convolution's inputs/outputs, and the accumulator type.
dtype = torch.float16
acc_type = torch.float32

# Argument paired with the "leaky_relu" activation below.
negative_slope = 0.1

# Forward-propagation (fprop) Conv2d plan. Alternatives tried earlier:
# cutlass.op.GroupedGemm(element=dtype, layout=cutlass.LayoutType.RowMajor),
# cutlass.Conv2dFprop(...) and cutlass.op.Conv2d(...) with the same
# element / accumulator arguments.
# For reference, the ConvKind values are Fprop = 0, Dgrad = 1, Wgrad = 2.
plan = cutlass.op.Conv2dFprop(element=dtype, element_accumulator=acc_type)

# The activation is given as an (activation name, argument) tuple.
plan.activation = ("leaky_relu", negative_slope)

# Build the operation from the plan, then hand it to the PyTorch emitter.
# NOTE(review): per the cutlass.emit.pytorch docs, sources are written to
# `sourcedir` and jit=True compiles and loads the module on the fly - confirm
# against the installed CUTLASS version.
op = plan.construct()
conv2d_leakyrelu = cutlass.emit.pytorch(
    op,
    name="conv2d",
    cc=plan.cc,
    sourcedir="conv2d+leakyrelu",
    jit=True,
)
# Problem size.
# Input tensor is [N, H, W, C] under the channels-last layout. Larger inputs
# tried previously: [32, 832, 832, 60] and [32, 1664, 1664, 60].
N, H, W, C = 32, 416, 416, 64
# Weight tensor is [K, R, S, C] under the channels-last layout.
K, R, S = 16, 3, 3

stride = (2, 2)
padding = (1, 1)
dilation = (1, 1)

# Output size [N, P, Q, K]. Note that N and K are rebound to the returned
# values here.
N, P, Q, K = cutlass.Conv2d.output_size(
    (N, H, W, C),
    (K, R, S, C),
    padding,
    stride,
    dilation,
)
# Print the computed size as a sanity check.
print(f"Computed output size: N={N}, P={P}, Q={Q}, K={K}")
# Data types. `dtype` is re-stated here with the same value it has at the top
# of the script, and every operand type is derived from it so they cannot
# drift apart silently.
dtype = torch.float16
type_A = dtype  # used for `input`
type_B = dtype  # used for `weight`
type_C = dtype  # used for `tensor_C` and `temp`
type_D = dtype  # used for `output`

# Create the tensors. Seeding first makes the random fill reproducible.
# Each tensor is filled uniformly from [-4.5, 3.5) and rounded up with ceil,
# so every element is integer-valued.
torch.manual_seed(1234)

# NOTE: `input` shadows the Python builtin; the name is kept because code
# outside this section may refer to it.
# Allocated as (N, C, H, W), then converted to channels-last memory format.
input = torch.ceil(
    torch.empty(size=(N, C, H, W), dtype=type_A, device="cuda").uniform_(-4.5, 3.5)
).to(memory_format=torch.channels_last)

# Allocated as (K, C, R, S), then converted to channels-last memory format.
weight = torch.ceil(
    torch.empty(size=(K, C, R, S), dtype=type_B, device="cuda").uniform_(-4.5, 3.5)
).to(memory_format=torch.channels_last)

# Allocated as (N, K, P, Q). Uses type_C (the original passed type_B here,
# which only worked because all four types are identical).
tensor_C = torch.ceil(
    torch.empty(size=(N, K, P, Q), dtype=type_C, device="cuda").uniform_(-4.5, 3.5)
).to(memory_format=torch.channels_last)

# Zero-initialised tensor with tensor_C's shape and memory layout, explicitly
# typed as type_D instead of inheriting tensor_C's dtype.
# NOTE(review): assumes `output` is the kernel's D operand - confirm.
output = torch.zeros_like(tensor_C, dtype=type_D)

# Extra (K, P, Q) tensor of random integer values; not referenced within this
# section of the script.
temp = torch.ceil(
    torch.empty(size=(K, P, Q), dtype=type_C, device="cuda").uniform_(-4.5, 3.5)
)

# Scalars; presumably the epilogue scaling factors (D = alpha * conv + beta * C)
# - TODO confirm against the call site.
alpha = 1.0
beta = 0.0