Fusing Conv2d + LeakyReLU with the CUTLASS Python Interface
import torch

import cutlass
from cutlass.epilogue import leaky_relu, relu
from cutlass import Tensor as FakeTensor
from cutlass.utils.profiler import CUDAEventProfiler

# This controls whether the C++ GEMM declaration will be printed at each step.
# Set to `False` to omit this information.
print_module = True

dtype = torch.float16
acc_type = torch.float32  # accumulator data type

# plan = cutlass.op.GroupedGemm(element=dtype, layout=cutlass.LayoutType.RowMajor)
# plan = cutlass.Conv2dFprop(element=dtype, element_accumulator=acc_type)
plan = cutlass.op.Conv2dFprop(
    element=dtype,                 # input/output data type
    element_accumulator=acc_type,  # accumulator data type
)
# plan = cutlass.op.Conv2d(
#     element=dtype,                 # input/output data type
#     element_accumulator=acc_type,  # accumulator data type
# )

negative_slope = 0.1

# plan.conv_kind = 0
# class ConvKind(enum.IntEnum):
#     Fprop = 0
#     Dgrad = 1
#     Wgrad = 2
plan.activation = ("leaky_relu", negative_slope)

op = plan.construct()
conv2d_leakyrelu = cutlass.emit.pytorch(
    op, name='conv2d', cc=plan.cc, sourcedir='conv2d+leakyrelu', jit=True
)

# Set up the problem size
N, H, W, C = [32, 416, 416, 64]    # Input tensor: [N, H, W, C] under channel-last layout
# N, H, W, C = [32, 832, 832, 60]    # Input tensor: [N, H, W, C] under channel-last layout
# N, H, W, C = [32, 1664, 1664, 60]  # Input tensor: [N, H, W, C] under channel-last layout
K, R, S = [16, 3, 3]               # Weight tensor: [K, R, S, C] under channel-last layout
stride = (2, 2)
padding = (1, 1)
dilation = (1, 1)

# Compute the output size [N, P, Q, K]
N, P, Q, K = cutlass.Conv2d.output_size((N, H, W, C), (K, R, S, C), padding, stride, dilation)
print(f"Computed output size: N={N}, P={P}, Q={Q}, K={K}")  # add this line to confirm the computed size

# Set up data types
dtype = torch.float16
type_A = torch.float16
type_B = torch.float16
type_C = torch.float16
type_D = torch.float16

# Create input tensors
torch.manual_seed(1234)
input = torch.ceil(
    torch.empty(size=(N, C, H, W), dtype=type_A, device="cuda").uniform_(-4.5, 3.5)
).to(memory_format=torch.channels_last)
weight = torch.ceil(
    torch.empty(size=(K, C, R, S), dtype=type_B, device="cuda").uniform_(-4.5, 3.5)
).to(memory_format=torch.channels_last)
tensor_C = torch.ceil(
    torch.empty(size=(N, K, P, Q), dtype=type_C, device="cuda").uniform_(-4.5, 3.5)
).to(memory_format=torch.channels_last)
output = torch.zeros_like(tensor_C)
temp = torch.ceil(
    torch.empty(size=(K, P, Q), dtype=type_C, device="cuda").uniform_(-4.5, 3.5)
)

alpha = 1.0
beta = 0.0
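The script above builds and JIT-compiles the fused kernel but never actually launches it or checks the result. The sketch below runs the plan, compares the output against a plain PyTorch reference (F.conv2d followed by leaky_relu), and times the fused kernel with the CUDAEventProfiler already imported at the top. The plan.run argument order and the CUDAEventProfiler (warmup_iterations, iterations) positions follow the CUTLASS Python conv2d example; treat them as assumptions and verify against your installed CUTLASS version.

import torch.nn.functional as F

# Run the fused kernel:
#   output = leaky_relu(alpha * conv(input, weight) + beta * tensor_C)
# NOTE: the argument order here mirrors the CUTLASS Python conv2d example
# and is an assumption; check it against your CUTLASS version.
plan.run(input, weight, tensor_C, output,
         stride, padding, dilation, alpha, beta, print_module=print_module)

# PyTorch reference, computed in float32 for a tolerant comparison.
ref = F.leaky_relu(
    alpha * F.conv2d(input.float(), weight.float(),
                     stride=stride, padding=padding, dilation=dilation)
    + beta * tensor_C.float(),
    negative_slope=negative_slope,
)
print("max |cutlass - pytorch| =", (output.float() - ref).abs().max().item())

# Time the fused kernel.
# NOTE: the (warmup_iterations, iterations) positions are an assumption.
profiler = CUDAEventProfiler(plan, 100, 100, input, weight, tensor_C, output,
                             stride, padding, dilation, alpha, beta)
print(f"CUTLASS conv2d+leaky_relu duration: {profiler()} ms")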