maxpool0913
user_5573880
c_cpp
a year ago
7.6 kB
10
Indexable
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <iostream>
#include <vector>
#include <random>
// Utility function for CUDA error checking
#define checkCudaErrors(call) \
do { \
cudaError_t err = call; \
if (err != cudaSuccess) { \
std::cerr << "CUDA error in " << __FILE__ << ":" << __LINE__ << ": " \
<< cudaGetErrorString(err) << std::endl; \
exit(EXIT_FAILURE); \
} \
} while (0)
// Constants
const int INPUT_CHANNELS = 3;
const int OUTPUT_CHANNELS = 16;
const int INPUT_SIZE = 416;
const int OUTPUT_SIZE = 208;
const int BLOCK_SIZE = 16;
const int pool_size = 2;
const int stride = 2;
// Assume we're using a 2D grid and 3D blocks
#define BLOCK_DIM_X 16
#define BLOCK_DIM_Y 16
#define BLOCK_DIM_Z 3 // Process 3 channels per thread
// MaxPooling kernel
__global__ void optimized_maxpool2d(const half* __restrict__ input,
half* __restrict__ output, int batch_size,
int in_height, int in_width, int channels,
int pool_size, int stride) {
int tx = threadIdx.x, ty = threadIdx.y, tz = threadIdx.z;
int bx = blockIdx.x, by = blockIdx.y;
int out_height = (in_height - pool_size) / stride + 1;
int out_width = (in_width - pool_size) / stride + 1;
int out_x = bx * BLOCK_DIM_X + tx;
int out_y = by * BLOCK_DIM_Y + ty;
if (out_x >= out_width || out_y >= out_height) return;
// Each thread processes BLOCK_DIM_Z channels
for (int c_offset = 0; c_offset < channels; c_offset += BLOCK_DIM_Z) {
int c = c_offset + tz;
if (c >= channels) break;
// Max pooling operation for each thread, no shared memory
half max_val = __float2half(-INFINITY);
// Process each element in the pool window
for (int py = 0; py < pool_size; ++py) {
for (int px = 0; px < pool_size; ++px) {
int in_y = out_y * stride + py;
int in_x = out_x * stride + px;
if (in_y < in_height && in_x < in_width) {
int in_idx = (c * in_height + in_y) * in_width + in_x;
half val = input[in_idx];
max_val = fmaxf(val, max_val);
}
}
}
int out_idx = (c * out_height + out_y) * out_width + out_x;
output[out_idx] = max_val;
}
}
// __global__ void optimized_maxpool2d(const half* __restrict__ input,
// half* __restrict__ output, int batch_size,
// int in_height, int in_width, int channels,
// int pool_size, int stride) {
// int tx = threadIdx.x, ty = threadIdx.y, tz = threadIdx.z;
// int bx = blockIdx.x, by = blockIdx.y;
// int out_height = (in_height - pool_size) / stride + 1;
// int out_width = (in_width - pool_size) / stride + 1;
// int out_x = bx * BLOCK_DIM_X + tx;
// int out_y = by * BLOCK_DIM_Y + ty;
// if (out_x >= out_width || out_y >= out_height) return;
// extern __shared__ half shared_input[];
// // Each thread processes BLOCK_DIM_Z channels
// for (int c_offset = 0; c_offset < channels; c_offset += BLOCK_DIM_Z) {
// int c = c_offset + tz;
// if (c >= channels) break;
// // Load input patch into shared memory
// for (int py = 0; py < pool_size; ++py) {
// for (int px = 0; px < pool_size; ++px) {
// int in_y = out_y * stride + py;
// int in_x = out_x * stride + px;
// if (in_y < in_height && in_x < in_width) {
// int in_idx = (c * in_height + in_y) * in_width + in_x;
// shared_input[(tz * BLOCK_DIM_X * BLOCK_DIM_Y) +
// (ty * BLOCK_DIM_X + tx) * pool_size * pool_size +
// py * pool_size + px] = input[in_idx];
// } else {
// shared_input[(tz * BLOCK_DIM_X * BLOCK_DIM_Y) +
// (ty * BLOCK_DIM_X + tx) * pool_size * pool_size +
// py * pool_size + px] = __float2half(-INFINITY);
// }
// }
// }
// __syncthreads();
// // Max pooling operation
// half max_val = __float2half(-INFINITY);
// #pragma unroll
// for (int py = 0; py < pool_size; ++py) {
// #pragma unroll
// for (int px = 0; px < pool_size; ++px) {
// half val =
// shared_input[(tz * BLOCK_DIM_X * BLOCK_DIM_Y) +
// (ty * BLOCK_DIM_X + tx) * pool_size * pool_size +
// py * pool_size + px];
// max_val = fmaxf(val, max_val);
// }
// }
// int out_idx = (c * out_height + out_y) * out_width + out_x;
// output[out_idx] = max_val;
// __syncthreads();
// }
// }
int main() {
// Initialize input and output data
std::vector<float> h_input(INPUT_SIZE * INPUT_SIZE * INPUT_CHANNELS);
std::vector<float> h_output(OUTPUT_SIZE * OUTPUT_SIZE * OUTPUT_CHANNELS);
// Use a fixed seed for reproducibility
unsigned int seed = 1234;
std::mt19937 gen(seed);
std::uniform_real_distribution<float> dis(-1.0f, 1.0f);
for (auto& val : h_input) val = dis(gen);
half *d_input, *d_pool_output;
size_t input_size = INPUT_SIZE * INPUT_SIZE * INPUT_CHANNELS * sizeof(half);
size_t pool_output_size = OUTPUT_SIZE * OUTPUT_SIZE * OUTPUT_CHANNELS * sizeof(half);
checkCudaErrors(cudaMalloc(&d_input, input_size));
checkCudaErrors(cudaMalloc(&d_pool_output, pool_output_size));
// Convert input to half precision and copy to device
std::vector<half> h_input_half(h_input.size());
for (size_t i = 0; i < h_input.size(); ++i) {
h_input_half[i] = __float2half(h_input[i]);
}
checkCudaErrors(cudaMemcpy(d_input, h_input_half.data(), input_size, cudaMemcpyHostToDevice));
// Configure grid and block dimensions for MaxPooling
dim3 block_dim(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z);
dim3 grid_dim((OUTPUT_SIZE + BLOCK_DIM_X - 1) / BLOCK_DIM_X,
(OUTPUT_SIZE + BLOCK_DIM_Y - 1) / BLOCK_DIM_Y,
1); // Assuming batch size of 1 for simplicity
size_t shared_mem_size = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z * pool_size *
pool_size * sizeof(half);
// Launch MaxPooling kernel
optimized_maxpool2d<<<grid_dim, block_dim, shared_mem_size>>>(d_input, d_pool_output, 1, INPUT_SIZE, INPUT_SIZE, INPUT_CHANNELS, pool_size, stride);
checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaDeviceSynchronize());
// Copy result back to host
std::vector<half> h_output_half(h_output.size());
checkCudaErrors(cudaMemcpy(h_output_half.data(), d_pool_output, pool_output_size, cudaMemcpyDeviceToHost));
for (size_t i = 0; i < h_output.size(); ++i) {
h_output[i] = __half2float(h_output_half[i]);
}
// Output results
std::cout << "Output (first few values):" << std::endl;
for (int i = 0; i < 10; ++i) {
std::cout << h_output[i] << " ";
}
std::cout << std::endl;
// Clean up
checkCudaErrors(cudaFree(d_input));
checkCudaErrors(cudaFree(d_pool_output));
std::cout << "Program completed." << std::endl;
return 0;
}
Editor is loading...
Leave a Comment