maxpool_cudnn_0913
user_5573880
c_cpp
a month ago
5.5 kB
2
Indexable
Never
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <cudnn.h>
#include <stdio.h>

#include <algorithm>
#include <cstddef>
#include <cstdlib>
#include <iostream>
#include <random>
#include <vector>

// Error-checking helpers for the CUDA runtime and cuDNN.
//
// Each macro evaluates `call` exactly once, and on failure prints the source
// location plus the library's error string to stderr and terminates the
// process with EXIT_FAILURE. The argument is parenthesized and the status
// local carries a trailing-underscore name so that it cannot capture a
// variable called `err` used inside the checked expression.
#define checkCudaErrors(call)                                                  \
    do {                                                                       \
        const cudaError_t cuda_status_ = (call);                               \
        if (cuda_status_ != cudaSuccess) {                                     \
            std::cerr << "CUDA error in " << __FILE__ << ":" << __LINE__       \
                      << ": " << cudaGetErrorString(cuda_status_)              \
                      << std::endl;                                            \
            std::exit(EXIT_FAILURE);                                           \
        }                                                                      \
    } while (0)

#define checkCudnnErrors(call)                                                 \
    do {                                                                       \
        const cudnnStatus_t cudnn_status_ = (call);                            \
        if (cudnn_status_ != CUDNN_STATUS_SUCCESS) {                           \
            std::cerr << "cuDNN error in " << __FILE__ << ":" << __LINE__      \
                      << ": " << cudnnGetErrorString(cudnn_status_)            \
                      << std::endl;                                            \
            std::exit(EXIT_FAILURE);                                           \
        }                                                                      \
    } while (0)

// Problem dimensions. The tensors are described to cuDNN as NCHW with N = 1.
const int INPUT_CHANNELS = 3;
const int OUTPUT_CHANNELS = 16;  // Not referenced anywhere in this program.
const int INPUT_SIZE = 416;      // Input height and width.
const int pool_size = 2;         // Pooling window height and width.
const int stride = 2;            // Pooling stride along both axes.

// Output height/width for zero padding, following the cuDNN pooling formula
// outputDim = 1 + (inputDim + 2 * padding - windowDim) / stride.
// Deriving it keeps the output buffers and descriptor consistent with the
// constants above instead of relying on a separately maintained literal.
const int OUTPUT_SIZE = (INPUT_SIZE - pool_size) / stride + 1;

static_assert(stride > 0, "pooling stride must be positive");
static_assert(pool_size > 0 && pool_size <= INPUT_SIZE,
              "pooling window must fit inside the input");

// Creates `pooling_desc` and configures it for 2D max pooling.
//
// pooling_desc: receives the newly created descriptor; the caller is
//               responsible for destroying it.
// pool_size:    window height and width.
// stride:       vertical and horizontal stride.
//
// Padding is fixed at zero and NaN propagation is enabled. Any cuDNN failure
// terminates the process through checkCudnnErrors.
void setupPoolingDescriptor(cudnnPoolingDescriptor_t& pooling_desc,
                            int pool_size, int stride) {
    checkCudnnErrors(cudnnCreatePoolingDescriptor(&pooling_desc));
    checkCudnnErrors(cudnnSetPooling2dDescriptor(
        pooling_desc,
        CUDNN_POOLING_MAX,     // Max pooling
        CUDNN_PROPAGATE_NAN,   // Propagate NaNs
        pool_size, pool_size,  // Pooling window size
        0, 0,                  // Padding
        stride, stride         // Stride
    ));
}

// Runs one FP16 max-pooling forward pass over random input through cuDNN and
// prints the first few output values.
int main() {
    // Initialize cuDNN
    cudnnHandle_t cudnn;
    checkCudnnErrors(cudnnCreate(&cudnn));

    // Element counts are computed in size_t so the products cannot overflow
    // int if the dimension constants are enlarged.
    const size_t input_elems =
        static_cast<size_t>(INPUT_SIZE) * INPUT_SIZE * INPUT_CHANNELS;
    const size_t output_elems =
        static_cast<size_t>(OUTPUT_SIZE) * OUTPUT_SIZE * INPUT_CHANNELS;

    // Host-side float buffers for the input and the pooled result
    std::vector<float> h_input(input_elems);
    std::vector<float> h_output(output_elems);

    // Fill the input with uniform values in [-1, 1); the seed is fixed so
    // every run uses the same data.
    unsigned int seed = 1234;
    std::mt19937 gen(seed);
    std::uniform_real_distribution<float> dis(-1.0f, 1.0f);
    for (auto& val : h_input) val = dis(gen);

    // Device buffers hold half-precision values
    half* d_input = nullptr;
    half* d_pool_output = nullptr;
    const size_t input_size = input_elems * sizeof(half);
    const size_t pool_output_size = output_elems * sizeof(half);
    checkCudaErrors(cudaMalloc(&d_input, input_size));
    checkCudaErrors(cudaMalloc(&d_pool_output, pool_output_size));

    // Convert input to half precision and copy to device
    std::vector<half> h_input_half(h_input.size());
    for (size_t i = 0; i < h_input.size(); ++i) {
        h_input_half[i] = __float2half(h_input[i]);
    }
    checkCudaErrors(cudaMemcpy(d_input, h_input_half.data(), input_size,
                               cudaMemcpyHostToDevice));

    // Set up input tensor descriptor
    cudnnTensorDescriptor_t input_desc;
    checkCudnnErrors(cudnnCreateTensorDescriptor(&input_desc));
    checkCudnnErrors(cudnnSetTensor4dDescriptor(
        input_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF,
        1, INPUT_CHANNELS, INPUT_SIZE, INPUT_SIZE  // N, C, H, W
    ));

    // Set up output tensor descriptor; pooling keeps the channel count
    cudnnTensorDescriptor_t output_desc;
    checkCudnnErrors(cudnnCreateTensorDescriptor(&output_desc));
    checkCudnnErrors(cudnnSetTensor4dDescriptor(
        output_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF,
        1, INPUT_CHANNELS, OUTPUT_SIZE, OUTPUT_SIZE  // N, C, H, W
    ));

    // Set up pooling descriptor
    cudnnPoolingDescriptor_t pooling_desc;
    setupPoolingDescriptor(pooling_desc, pool_size, stride);

    // Execute MaxPooling. alpha scales the pooled result and beta scales the
    // prior contents of the output buffer, so 1 and 0 overwrite the output.
    float alpha = 1.0f, beta = 0.0f;
    checkCudnnErrors(cudnnPoolingForward(
        cudnn, pooling_desc,
        &alpha, input_desc, d_input,        // Input tensor
        &beta, output_desc, d_pool_output   // Output tensor
    ));

    // Copy result back to host and widen it to float
    std::vector<half> h_output_half(h_output.size());
    checkCudaErrors(cudaMemcpy(h_output_half.data(), d_pool_output,
                               pool_output_size, cudaMemcpyDeviceToHost));
    for (size_t i = 0; i < h_output.size(); ++i) {
        h_output[i] = __half2float(h_output_half[i]);
    }

    // Output results; the preview is capped at the output length so a tiny
    // configuration cannot read past the end of the buffer.
    std::cout << "MaxPooling output (first few values):" << std::endl;
    const size_t preview_count = std::min<size_t>(10, h_output.size());
    for (size_t i = 0; i < preview_count; ++i) {
        std::cout << h_output[i] << " ";
    }
    std::cout << std::endl;

    // Clean up cuDNN resources
    checkCudnnErrors(cudnnDestroyPoolingDescriptor(pooling_desc));
    checkCudnnErrors(cudnnDestroyTensorDescriptor(input_desc));
    checkCudnnErrors(cudnnDestroyTensorDescriptor(output_desc));
    checkCudnnErrors(cudnnDestroy(cudnn));

    // Clean up CUDA memory
    checkCudaErrors(cudaFree(d_input));
    checkCudaErrors(cudaFree(d_pool_output));

    std::cout << "Program completed." << std::endl;
    return 0;
}
Leave a Comment