0913_4kernels_separate

#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <iostream>
#include <vector>
#include <random>

// Utility function for CUDA error checking
#define checkCudaErrors(call)                                              \
  do {                                                                     \
    cudaError_t err = call;                                                \
    if (err != cudaSuccess) {                                              \
      std::cerr << "CUDA error in " << __FILE__ << ":" << __LINE__ << ": " \
                << cudaGetErrorString(err) << std::endl;                   \
      exit(EXIT_FAILURE);                                                  \
    }                                                                      \
  } while (0)

// Constants
const int INPUT_CHANNELS = 3;
const int OUTPUT_CHANNELS = 16;  // conv-stage channel count; unused by the pooling-only path in this file
const int INPUT_SIZE = 128;
const int OUTPUT_SIZE = 64;
const int BLOCK_SIZE = 16;  // unused in this file
const int pool_size = 2;
const int stride = 2;
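
// Note: OUTPUT_SIZE follows from the pooling geometry:
// (INPUT_SIZE - pool_size) / stride + 1 = (128 - 2) / 2 + 1 = 64.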

// Assume we're using a 2D grid and 3D blocks
#define BLOCK_DIM_X 16
#define BLOCK_DIM_Y 16
#define BLOCK_DIM_Z 3  // Process 3 channels per thread
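
// Dynamic shared-memory footprint per block (computed again at launch time):
// BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z * pool_size * pool_size
// = 16 * 16 * 3 * 2 * 2 = 3072 half values = 6 KiB, well under the
// 48 KiB default per-block limit.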

// MaxPooling kernel (earlier variant without shared memory, kept for reference)
// __global__ void optimized_maxpool2d(const half* __restrict__ input,
//                                     half* __restrict__ output, int batch_size,
//                                     int in_height, int in_width, int channels,
//                                     int pool_size, int stride) {
//   int tx = threadIdx.x, ty = threadIdx.y, tz = threadIdx.z;
//   int bx = blockIdx.x, by = blockIdx.y;

//   int out_height = (in_height - pool_size) / stride + 1;
//   int out_width = (in_width - pool_size) / stride + 1;

//   int out_x = bx * BLOCK_DIM_X + tx;
//   int out_y = by * BLOCK_DIM_Y + ty;

//   if (out_x >= out_width || out_y >= out_height) return;

//   // Each thread processes BLOCK_DIM_Z channels
//   for (int c_offset = 0; c_offset < channels; c_offset += BLOCK_DIM_Z) {
//     int c = c_offset + tz;
//     if (c >= channels) break;

//     // Max pooling operation for each thread, no shared memory
//     half max_val = __float2half(-INFINITY);

//     // Process each element in the pool window
//     for (int py = 0; py < pool_size; ++py) {
//       for (int px = 0; px < pool_size; ++px) {
//         int in_y = out_y * stride + py;
//         int in_x = out_x * stride + px;
//         if (in_y < in_height && in_x < in_width) {
//           int in_idx = (c * in_height + in_y) * in_width + in_x;
//           half val = input[in_idx];
//           max_val = fmaxf(val, max_val);
//         }
//       }
//     }

//     int out_idx = (c * out_height + out_y) * out_width + out_x;
//     output[out_idx] = max_val;
//   }
// }

// MaxPooling kernel: each thread stages its own pool window in dynamic
// shared memory, then reduces it to the window maximum.
// batch_size is accepted for interface parity; this kernel assumes
// batch_size == 1.
__global__ void optimized_maxpool2d(const half* __restrict__ input,
                                    half* __restrict__ output, int batch_size,
                                    int in_height, int in_width, int channels,
                                    int pool_size, int stride) {
  int tx = threadIdx.x, ty = threadIdx.y, tz = threadIdx.z;
  int bx = blockIdx.x, by = blockIdx.y;

  int out_height = (in_height - pool_size) / stride + 1;
  int out_width = (in_width - pool_size) / stride + 1;

  int out_x = bx * BLOCK_DIM_X + tx;
  int out_y = by * BLOCK_DIM_Y + ty;

  // No early return: every thread must reach the __syncthreads() calls
  // below, so out-of-range threads simply skip the loads and stores.
  bool active = (out_x < out_width && out_y < out_height);

  extern __shared__ half shared_input[];

  // Each thread owns a pool_size * pool_size slot; the tz slices must be
  // strided by the full per-slice footprint (BLOCK_DIM_X * BLOCK_DIM_Y
  // whole windows), otherwise neighboring z-slices overwrite each other.
  int slot = (tz * BLOCK_DIM_X * BLOCK_DIM_Y + ty * BLOCK_DIM_X + tx) *
             pool_size * pool_size;

  // Each z-layer of the block handles one of BLOCK_DIM_Z channels per pass
  for (int c_offset = 0; c_offset < channels; c_offset += BLOCK_DIM_Z) {
    int c = c_offset + tz;

    // Stage this thread's pool window in shared memory, padding
    // out-of-range taps with -inf so they never win the max
    if (active && c < channels) {
      for (int py = 0; py < pool_size; ++py) {
        for (int px = 0; px < pool_size; ++px) {
          int in_y = out_y * stride + py;
          int in_x = out_x * stride + px;
          half val = __float2half(-INFINITY);
          if (in_y < in_height && in_x < in_width) {
            int in_idx = (c * in_height + in_y) * in_width + in_x;
            val = input[in_idx];
          }
          shared_input[slot + py * pool_size + px] = val;
        }
      }
    }

    __syncthreads();

    if (active && c < channels) {
      // Reduce the staged window in float: fmaxf is a float intrinsic, and
      // the native half __hmax requires compute capability 8.0+
      float max_val = -INFINITY;
      for (int py = 0; py < pool_size; ++py) {
        for (int px = 0; px < pool_size; ++px) {
          max_val =
              fmaxf(__half2float(shared_input[slot + py * pool_size + px]),
                    max_val);
        }
      }
      int out_idx = (c * out_height + out_y) * out_width + out_x;
      output[out_idx] = __float2half(max_val);
    }

    // Keep the block in lockstep before the next pass reuses the buffer
    __syncthreads();
  }
}
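
// A CPU reference for sanity-checking the kernel output (an added sketch,
// not part of the original four-kernel pipeline): plain float max pooling
// over the same CHW layout the kernel assumes.
void cpu_maxpool2d_reference(const float* input, float* output, int in_height,
                             int in_width, int channels, int pool_size,
                             int stride) {
  int out_height = (in_height - pool_size) / stride + 1;
  int out_width = (in_width - pool_size) / stride + 1;
  for (int c = 0; c < channels; ++c) {
    for (int oy = 0; oy < out_height; ++oy) {
      for (int ox = 0; ox < out_width; ++ox) {
        // Seed the running max with the first tap; every window is fully
        // in-bounds given the floor-mode output dimensions above
        float max_val =
            input[(c * in_height + oy * stride) * in_width + ox * stride];
        for (int py = 0; py < pool_size; ++py) {
          for (int px = 0; px < pool_size; ++px) {
            int in_y = oy * stride + py;
            int in_x = ox * stride + px;
            float val = input[(c * in_height + in_y) * in_width + in_x];
            if (val > max_val) max_val = val;
          }
        }
        output[(c * out_height + oy) * out_width + ox] = max_val;
      }
    }
  }
}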

int main() {
  // Initialize input and output data
  std::vector<float> h_input(INPUT_SIZE * INPUT_SIZE * INPUT_CHANNELS);
  // Max pooling preserves the channel count, so the pooled output has
  // INPUT_CHANNELS planes; OUTPUT_CHANNELS would only apply after the
  // convolution stage, which this file does not run
  std::vector<float> h_output(OUTPUT_SIZE * OUTPUT_SIZE * INPUT_CHANNELS);

  // Use a fixed seed for reproducibility
  unsigned int seed = 1234;
  std::mt19937 gen(seed);
  std::uniform_real_distribution<float> dis(-1.0f, 1.0f);

  for (auto& val : h_input) val = dis(gen);

  half *d_input, *d_pool_output;
  size_t input_size = INPUT_SIZE * INPUT_SIZE * INPUT_CHANNELS * sizeof(half);
  size_t pool_output_size =
      OUTPUT_SIZE * OUTPUT_SIZE * INPUT_CHANNELS * sizeof(half);

  checkCudaErrors(cudaMalloc(&d_input, input_size));
  checkCudaErrors(cudaMalloc(&d_pool_output, pool_output_size));

  // Convert input to half precision and copy to device
  std::vector<half> h_input_half(h_input.size());
  for (size_t i = 0; i < h_input.size(); ++i) {
    h_input_half[i] = __float2half(h_input[i]);
  }
  checkCudaErrors(cudaMemcpy(d_input, h_input_half.data(), input_size, cudaMemcpyHostToDevice));

  // Configure grid and block dimensions for MaxPooling
  dim3 block_dim(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z);
  dim3 grid_dim((OUTPUT_SIZE + BLOCK_DIM_X - 1) / BLOCK_DIM_X,
                (OUTPUT_SIZE + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y,
                1);  // Assuming batch size of 1 for simplicity
  size_t shared_mem_size = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z * pool_size *
                           pool_size * sizeof(half);

  // Launch MaxPooling kernel
  optimized_maxpool2d<<<grid_dim, block_dim, shared_mem_size>>>(
      d_input, d_pool_output, /*batch_size=*/1, INPUT_SIZE, INPUT_SIZE,
      INPUT_CHANNELS, pool_size, stride);
  checkCudaErrors(cudaGetLastError());
  checkCudaErrors(cudaDeviceSynchronize());

  // Copy result back to host
  std::vector<half> h_output_half(h_output.size());
  checkCudaErrors(cudaMemcpy(h_output_half.data(), d_pool_output, pool_output_size, cudaMemcpyDeviceToHost));

  for (size_t i = 0; i < h_output.size(); ++i) {
    h_output[i] = __half2float(h_output_half[i]);
  }
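
  // Optional sanity check (an added sketch using the CPU reference above):
  // compare GPU and CPU results; the only expected discrepancy is fp16
  // rounding of the inputs, so the error should be on the order of 1e-3
  std::vector<float> h_reference(h_output.size());
  cpu_maxpool2d_reference(h_input.data(), h_reference.data(), INPUT_SIZE,
                          INPUT_SIZE, INPUT_CHANNELS, pool_size, stride);
  float max_abs_err = 0.0f;
  for (size_t i = 0; i < h_output.size(); ++i) {
    float err = h_output[i] - h_reference[i];
    if (err < 0.0f) err = -err;
    if (err > max_abs_err) max_abs_err = err;
  }
  std::cout << "Max abs error vs CPU reference: " << max_abs_err << std::endl;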

  // Output results
  std::cout << "Output (first few values):" << std::endl;
  for (int i = 0; i < 10; ++i) {
    std::cout << h_output[i] << " ";
  }
  std::cout << std::endl;

  // Clean up
  checkCudaErrors(cudaFree(d_input));
  checkCudaErrors(cudaFree(d_pool_output));

  std::cout << "Program completed." << std::endl;
  return 0;
}