maxpool0913
#include <cuda_fp16.h>
#include <cuda_runtime.h>

#include <cmath>
#include <cstdlib>
#include <iostream>
#include <random>
#include <vector>

// Utility macro for CUDA error checking
#define checkCudaErrors(call)                                              \
  do {                                                                     \
    cudaError_t err = call;                                                \
    if (err != cudaSuccess) {                                              \
      std::cerr << "CUDA error in " << __FILE__ << ":" << __LINE__ << ": " \
                << cudaGetErrorString(err) << std::endl;                   \
      exit(EXIT_FAILURE);                                                  \
    }                                                                      \
  } while (0)

// Problem constants. Max pooling preserves the channel count, so the output
// must have as many channels as the input; the original value of 16 left
// most of the output buffer unwritten.
const int INPUT_CHANNELS = 3;
const int OUTPUT_CHANNELS = INPUT_CHANNELS;
const int INPUT_SIZE = 416;
const int OUTPUT_SIZE = 208;  // (416 - 2) / 2 + 1
const int pool_size = 2;
const int stride = 2;

// 2D grid of 3D blocks: each block covers a 16x16 output tile and
// processes 3 channels per thread along z.
#define BLOCK_DIM_X 16
#define BLOCK_DIM_Y 16
#define BLOCK_DIM_Z 3

// MaxPooling kernel. batch_size is accepted for generality, but a batch of
// 1 is assumed: indexing is plain CHW with no batch term.
__global__ void optimized_maxpool2d(const half* __restrict__ input,
                                    half* __restrict__ output, int batch_size,
                                    int in_height, int in_width, int channels,
                                    int pool_size, int stride) {
  int tx = threadIdx.x, ty = threadIdx.y, tz = threadIdx.z;
  int bx = blockIdx.x, by = blockIdx.y;

  int out_height = (in_height - pool_size) / stride + 1;
  int out_width = (in_width - pool_size) / stride + 1;

  int out_x = bx * BLOCK_DIM_X + tx;
  int out_y = by * BLOCK_DIM_Y + ty;
  if (out_x >= out_width || out_y >= out_height) return;

  // Each thread processes BLOCK_DIM_Z channels
  for (int c_offset = 0; c_offset < channels; c_offset += BLOCK_DIM_Z) {
    int c = c_offset + tz;
    if (c >= channels) break;

    // Max pooling over the window. No shared memory: with stride ==
    // pool_size each thread reads a disjoint patch, so there is no reuse.
    half max_val = __float2half(-INFINITY);
    for (int py = 0; py < pool_size; ++py) {
      for (int px = 0; px < pool_size; ++px) {
        int in_y = out_y * stride + py;
        int in_x = out_x * stride + px;
        if (in_y < in_height && in_x < in_width) {
          int in_idx = (c * in_height + in_y) * in_width + in_x;
          half val = input[in_idx];
          // Compare in float explicitly; calling fmaxf() on raw half
          // arguments relies on implicit half<->float conversions that are
          // not available under every architecture/compiler setting.
          max_val =
              __float2half(fmaxf(__half2float(val), __half2float(max_val)));
        }
      }
    }

    int out_idx = (c * out_height + out_y) * out_width + out_x;
    output[out_idx] = max_val;
  }
}

// Earlier shared-memory variant, kept for reference (renamed from the
// duplicate name optimized_maxpool2d so both could coexist if re-enabled).
// Each thread stages its own pool window in shared memory before reducing
// it, which adds traffic without any reuse, so the simpler kernel above is
// preferred. Note the per-thread offset must scale the whole (tz, ty, tx)
// term by pool_size * pool_size; the original indexing multiplied only the
// (ty, tx) part, so channels collided in shared memory.
//
// __global__ void optimized_maxpool2d_shared(
//     const half* __restrict__ input, half* __restrict__ output,
//     int batch_size, int in_height, int in_width, int channels,
//     int pool_size, int stride) {
//   int tx = threadIdx.x, ty = threadIdx.y, tz = threadIdx.z;
//   int bx = blockIdx.x, by = blockIdx.y;
//
//   int out_height = (in_height - pool_size) / stride + 1;
//   int out_width = (in_width - pool_size) / stride + 1;
//
//   int out_x = bx * BLOCK_DIM_X + tx;
//   int out_y = by * BLOCK_DIM_Y + ty;
//   if (out_x >= out_width || out_y >= out_height) return;
//
//   extern __shared__ half shared_input[];
//
//   // Each thread processes BLOCK_DIM_Z channels
//   for (int c_offset = 0; c_offset < channels; c_offset += BLOCK_DIM_Z) {
//     int c = c_offset + tz;
//     if (c >= channels) break;
//
//     int base = (tz * BLOCK_DIM_X * BLOCK_DIM_Y + ty * BLOCK_DIM_X + tx) *
//                pool_size * pool_size;
//
//     // Load this thread's input patch into shared memory
//     for (int py = 0; py < pool_size; ++py) {
//       for (int px = 0; px < pool_size; ++px) {
//         int in_y = out_y * stride + py;
//         int in_x = out_x * stride + px;
//         shared_input[base + py * pool_size + px] =
//             (in_y < in_height && in_x < in_width)
//                 ? input[(c * in_height + in_y) * in_width + in_x]
//                 : __float2half(-INFINITY);
//       }
//     }
//     __syncthreads();
//
//     // Max pooling over the staged window
//     half max_val = __float2half(-INFINITY);
// #pragma unroll
//     for (int py = 0; py < pool_size; ++py) {
// #pragma unroll
//       for (int px = 0; px < pool_size; ++px) {
//         half val = shared_input[base + py * pool_size + px];
//         max_val =
//             __float2half(fmaxf(__half2float(val), __half2float(max_val)));
//       }
//     }
//
//     int out_idx = (c * out_height + out_y) * out_width + out_x;
//     output[out_idx] = max_val;
//     __syncthreads();
//   }
// }

int main() {
  // Host buffers (float for convenient initialization and printing)
  std::vector<float> h_input(INPUT_SIZE * INPUT_SIZE * INPUT_CHANNELS);
  std::vector<float> h_output(OUTPUT_SIZE * OUTPUT_SIZE * OUTPUT_CHANNELS);

  // Use a fixed seed for reproducibility
  unsigned int seed = 1234;
  std::mt19937 gen(seed);
  std::uniform_real_distribution<float> dis(-1.0f, 1.0f);
  for (auto& val : h_input) val = dis(gen);

  half *d_input, *d_pool_output;
  size_t input_size = INPUT_SIZE * INPUT_SIZE * INPUT_CHANNELS * sizeof(half);
  size_t pool_output_size =
      OUTPUT_SIZE * OUTPUT_SIZE * OUTPUT_CHANNELS * sizeof(half);
  checkCudaErrors(cudaMalloc(&d_input, input_size));
  checkCudaErrors(cudaMalloc(&d_pool_output, pool_output_size));

  // Convert input to half precision and copy to device
  std::vector<half> h_input_half(h_input.size());
  for (size_t i = 0; i < h_input.size(); ++i) {
    h_input_half[i] = __float2half(h_input[i]);
  }
  checkCudaErrors(cudaMemcpy(d_input, h_input_half.data(), input_size,
                             cudaMemcpyHostToDevice));

  // Grid and block dimensions for MaxPooling; batch size of 1 for simplicity
  dim3 block_dim(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z);
  dim3 grid_dim((OUTPUT_SIZE + BLOCK_DIM_X - 1) / BLOCK_DIM_X,
                (OUTPUT_SIZE + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y, 1);

  // Dynamic shared memory is only needed by the commented-out variant; the
  // active kernel uses none, so none is requested at launch.
  // size_t shared_mem_size = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z *
  //                          pool_size * pool_size * sizeof(half);

  // Launch MaxPooling kernel
  optimized_maxpool2d<<<grid_dim, block_dim>>>(d_input, d_pool_output, 1,
                                               INPUT_SIZE, INPUT_SIZE,
                                               INPUT_CHANNELS, pool_size,
                                               stride);
  checkCudaErrors(cudaGetLastError());
  checkCudaErrors(cudaDeviceSynchronize());

  // Copy result back to host and widen to float
  std::vector<half> h_output_half(h_output.size());
  checkCudaErrors(cudaMemcpy(h_output_half.data(), d_pool_output,
                             pool_output_size, cudaMemcpyDeviceToHost));
  for (size_t i = 0; i < h_output.size(); ++i) {
    h_output[i] = __half2float(h_output_half[i]);
  }

  // Output results
  std::cout << "Output (first few values):" << std::endl;
  for (int i = 0; i < 10; ++i) {
    std::cout << h_output[i] << " ";
  }
  std::cout << std::endl;

  // Clean up
  checkCudaErrors(cudaFree(d_input));
  checkCudaErrors(cudaFree(d_pool_output));

  std::cout << "Program completed." << std::endl;
  return 0;
}
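// ---------------------------------------------------------------------------
// Build/run sketch (the compiler flags are assumptions, not from the paste;
// any sm_53+ target supports the half conversions used here):
//
//   nvcc -O2 -arch=sm_70 maxpool0913.cu -o maxpool
//   ./maxpool
//
// Spot check: with pool_size == stride == 2, output element (c, y, x) should
// equal the max of the four inputs at rows 2y..2y+1, cols 2x..2x+1 of
// channel c. An illustrative host-side check against the float input
// (requires <algorithm> for std::max):
//
//   float ref = std::max(
//       std::max(h_input[(c * INPUT_SIZE + 2 * y) * INPUT_SIZE + 2 * x],
//                h_input[(c * INPUT_SIZE + 2 * y) * INPUT_SIZE + 2 * x + 1]),
//       std::max(h_input[(c * INPUT_SIZE + 2 * y + 1) * INPUT_SIZE + 2 * x],
//                h_input[(c * INPUT_SIZE + 2 * y + 1) * INPUT_SIZE + 2 * x + 1]));
//   // |ref - h_output[(c * OUTPUT_SIZE + y) * OUTPUT_SIZE + x]| should be
//   // within half-precision rounding (~1e-3 for inputs in [-1, 1]).
// ---------------------------------------------------------------------------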