import torch
import cutlass
from cutlass.epilogue import leaky_relu, relu
from cutlass import Tensor as FakeTensor
from cutlass.utils.profiler import CUDAEventProfiler
# This controls whether the C++ GEMM declaration will be printed at each step. Set to `False` to
# omit this information.
print_module = True
# Set up the problem size
N, H, W, C = [32, 416, 416, 64] # Input tensor: [N, H, W, C] under channel-last layout
# N, H, W, C = [32, 832, 832, 60] # Input tensor: [N, H, W, C] under channel-last layout
# N, H, W, C = [32, 1664, 1664, 60] # Input tensor: [N, H, W, C] under channel-last layout
K, R, S = [16, 3, 3] # Weight tensor: [K, R, S, C] under channel-last layout
stride = (2, 2)
padding = (1, 1)
dilation = (1, 1)
# Compute the output size [N, P, Q, K]
N, P, Q, K = cutlass.Conv2d.output_size((N, H, W, C), (K, R, S, C), padding, stride, dilation)
print(f"Computed output size: N={N}, P={P}, Q={Q}, K={K}") # 加入這行來確認計算的大小
# Set up data types
dtype = torch.float16
type_A = torch.float16
type_B = torch.float16
type_C = torch.float16
type_D = torch.float16
# Create input tensors
torch.manual_seed(1234)
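# Rounding the uniform draws up yields small integer values, so the fp16
# convolution below compares tightly against the PyTorch reference later on.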
input = torch.ceil(
    torch.empty(size=(N, C, H, W), dtype=type_A, device="cuda").uniform_(-4.5, 3.5)
).to(memory_format=torch.channels_last)
weight = torch.ceil(
    torch.empty(size=(K, C, R, S), dtype=type_B, device="cuda").uniform_(-4.5, 3.5)
).to(memory_format=torch.channels_last)
tensor_C = torch.ceil(
    torch.empty(size=(N, K, P, Q), dtype=type_C, device="cuda").uniform_(-4.5, 3.5)
).to(memory_format=torch.channels_last)
output = torch.zeros_like(tensor_C)
temp = torch.ceil(
    torch.empty(size=(K, P, Q), dtype=type_C, device="cuda").uniform_(-4.5, 3.5)
)
alpha = 1.0
beta = 0.0
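# The epilogue computes D = activation(alpha * conv(input, weight) + beta * C);
# with beta = 0.0, the source tensor tensor_C does not contribute to the result.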
# Alternative: build the plan through the generic Conv2d op with a plain ReLU epilogue
# plan = cutlass.op.Conv2d(kind="fprop", element=torch.float32)
# plan.activation = cutlass.epilogue.relu
# Create and configure the Conv2d fprop plan: fp16 operands, fp32 accumulation
plan = cutlass.Conv2dFprop(element=dtype, element_accumulator=torch.float32)
# Set the epilogue activation to leaky ReLU with the given negative slope
negative_slope = 0.5
plan.activation = ("leaky_relu", negative_slope)
# Run the convolution with leaky ReLU
# visitor_args = {"alpha": alpha, "D": output}
plan.run(
input, weight, tensor_C, output,
stride, padding, dilation,
alpha, beta, print_module=print_module
)
# Verify the results against PyTorch
def verify_results():
    # Compute reference result using PyTorch
    # output_torch = torch.nn.functional.conv2d(
    #     input, weight,
    #     stride=stride, padding=padding, dilation=dilation
    # )
    output_torch = alpha * torch.ops.aten.conv2d(
        input, weight, stride=stride, padding=padding, dilation=dilation
    ) + beta * tensor_C
    output_torch = torch.nn.functional.leaky_relu(output_torch, negative_slope=negative_slope)
    # Compare results
    max_diff = torch.max(torch.abs(output_torch - output))
    print(f"Maximum difference between CUTLASS and PyTorch: {max_diff}")
    assert torch.allclose(output_torch, output, rtol=1e-2, atol=1e-2)
    print("Verification successful!")

verify_results()
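# Optional: a rough timing sketch with the CUDAEventProfiler imported above.
# This assumes the profiler takes the plan, warmup/iteration counts, and the same
# arguments as plan.run(); adjust to the API of the installed CUTLASS version.
num_warmup, num_iterations = 100, 100
profiler = CUDAEventProfiler(
    plan, num_warmup, num_iterations,
    input, weight, tensor_C, output,
    stride, padding, dilation, alpha, beta,
)
print(f"CUTLASS fprop duration: {profiler()} ms")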