# Untitled

import torch
import cutlass
from cutlass.epilogue import leaky_relu,relu
from cutlass import Tensor as FakeTensor
from cutlass.utils.profiler import CUDAEventProfiler
# Toggle for printing the generated C++ kernel declaration when the plan is
# run. Set to `False` to suppress that output.
print_module = True

# Problem size.
# Activation tensor: [N, H, W, C] under channel-last layout.
N, H, W, C = 32, 416, 416, 64
# Alternative activation sizes:
# N, H, W, C = 32, 832, 832, 60
# N, H, W, C = 32, 1664, 1664, 60
# Weight tensor: [K, R, S, C] under channel-last layout.
K, R, S = 16, 3, 3
stride = (2, 2)
padding = (1, 1)
dilation = (1, 1)

# Derive the output extent [N, P, Q, K] from the convolution parameters.
# N and K are rebound to the values reported by CUTLASS.
N, P, Q, K = cutlass.Conv2d.output_size(
    (N, H, W, C),
    (K, R, S, C),
    padding,
    stride,
    dilation,
)
print(f"Computed output size: N={N}, P={P}, Q={Q}, K={K}")  # sanity check of the computed size


# Set up data types. One type per operand so they can be varied independently;
# all are half precision here.
dtype = torch.float16
type_A = torch.float16  # activations (`input`)
type_B = torch.float16  # weights (`weight`)
type_C = torch.float16  # source tensor (`tensor_C`)
type_D = torch.float16  # result tensor (`output`)

# Create input tensors. Samples are drawn uniformly between -4.5 and 3.5 and
# rounded up with ceil, so every element holds an integer value. Shapes are
# given in NCHW order and then converted to channels-last memory format.
torch.manual_seed(1234)
# NOTE: `input` shadows the builtin of the same name; the name is kept because
# the rest of the script refers to this tensor by it.
input = torch.ceil(
    torch.empty(size=(N, C, H, W), dtype=type_A, device="cuda").uniform_(-4.5, 3.5)
).to(memory_format=torch.channels_last)

weight = torch.ceil(
    torch.empty(size=(K, C, R, S), dtype=type_B, device="cuda").uniform_(-4.5, 3.5)
).to(memory_format=torch.channels_last)

# The source tensor must use its own element type (type_C), not the weight
# type, so that changing type_B alone does not silently change this tensor.
tensor_C = torch.ceil(
    torch.empty(size=(N, K, P, Q), dtype=type_C, device="cuda").uniform_(-4.5, 3.5)
).to(memory_format=torch.channels_last)

# Result buffer: same shape and memory format as tensor_C, typed as type_D.
output = torch.zeros_like(tensor_C, dtype=type_D)

# NOTE(review): `temp` is not referenced anywhere else in the visible script;
# confirm it is unused before removing it.
temp = torch.ceil(
    torch.empty(size=(K, P, Q), dtype=type_C, device="cuda").uniform_(-4.5, 3.5)
)

# Scalars passed to the convolution; the PyTorch reference applies them as
# alpha * conv(input, weight) + beta * tensor_C.
alpha = 1.0
beta = 0.0


# Create the forward-propagation Conv2d plan: operands of type `dtype` with
# float32 accumulation.
plan = cutlass.Conv2dFprop(element=dtype, element_accumulator=torch.float32)

# Select leaky ReLU as the plan's activation. The activation is given as a
# (name, argument) pair; the argument here is the negative slope.
negative_slope = 0.5
plan.activation = ("leaky_relu", negative_slope)

# Run the convolution with leaky ReLU. `output` is the tensor later compared
# against the PyTorch reference.
plan.run(
    input, weight, tensor_C, output,
    stride, padding, dilation,
    alpha, beta, print_module=print_module,
)


# Verify the results against PyTorch
def verify_results():
    """Check the CUTLASS result in ``output`` against a PyTorch reference.

    The reference is
    ``leaky_relu(alpha * conv2d(input, weight) + beta * tensor_C)``, built
    from the module-level tensors, scalars and convolution parameters. The
    maximum absolute difference is printed, followed by a success message
    when the tensors agree.

    Raises:
        AssertionError: If the results differ by more than ``rtol=1e-2`` /
            ``atol=1e-2``.
    """
    # Linear part of the reference: alpha * conv(input, weight) + beta * C.
    conv_reference = torch.ops.aten.conv2d(
        input, weight, stride=stride, padding=padding, dilation=dilation
    )
    reference = alpha * conv_reference + beta * tensor_C

    # Apply the same activation that was configured on the CUTLASS plan.
    reference = torch.nn.functional.leaky_relu(
        reference, negative_slope=negative_slope
    )

    # Compare results
    max_diff = torch.max(torch.abs(reference - output))
    print(f"Maximum difference between CUTLASS and PyTorch: {max_diff}")

    # Raise explicitly instead of using `assert`: assert statements are
    # removed under `python -O`, which would let a mismatch pass silently.
    if not torch.allclose(reference, output, rtol=1e-2, atol=1e-2):
        raise AssertionError(
            f"CUTLASS and PyTorch results differ (max abs diff: {max_diff})"
        )
    print("Verification successful!")

# Execute the comparison now that the CUTLASS convolution has been run.
verify_results()
# Editor is loading...