cuda
unknown
c_cpp
2 years ago
2.5 kB
5
Indexable
#include <cstdlib>
#include <iostream>
#include <vector>

#include <cuda_runtime.h>

// Evaluates a CUDA runtime call and terminates the program with a readable
// message (file, line, CUDA error string) if it did not return cudaSuccess.
// Ignoring these return codes hides failures: a failed allocation or copy
// would otherwise only show up as garbage output much later.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        const cudaError_t err_ = (call);                                    \
        if (err_ != cudaSuccess) {                                          \
            std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__    \
                      << ": " << cudaGetErrorString(err_) << std::endl;     \
            std::exit(EXIT_FAILURE);                                        \
        }                                                                   \
    } while (0)

// 2D "valid" sliding-window filter: each output element is the sum of
// products of a k_height x k_width window of `image` with `kernel`.
// The filter is applied without flipping (i.e. cross-correlation indexing).
//
// Parameters:
//   image    - device pointer, row-major, width * height floats (read-only)
//   kernel   - device pointer, row-major, k_width * k_height floats (read-only)
//   output   - device pointer, row-major,
//              (width - k_width + 1) * (height - k_height + 1) floats
//   width, height       - input image dimensions in elements
//   k_width, k_height   - filter dimensions in elements
//
// Launch layout: 2D grid of 2D blocks; one thread per output element, with
// thread (x, y) producing output column x, row y. Threads that fall outside
// the output extent return without touching memory, so the grid may
// over-cover the output. No shared memory is used.
//
// Preconditions: the three buffers must not overlap (pointers are declared
// __restrict__).
__global__ void convolve2D(const float* __restrict__ image,
                           const float* __restrict__ kernel,
                           float* __restrict__ output,
                           int width, int height, int k_width, int k_height)
{
    const int out_width  = width - k_width + 1;
    const int out_height = height - k_height + 1;

    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Grid tail (and the degenerate case of a filter larger than the image,
    // where the output extent is non-positive): nothing to compute.
    if (x >= out_width || y >= out_height) {
        return;
    }

    float sum = 0.0f;
    for (int i = 0; i < k_height; ++i) {
        // Row offset is computed in size_t so (y + i) * width cannot
        // overflow int for large images.
        const size_t row = static_cast<size_t>(y + i) * width;
        for (int j = 0; j < k_width; ++j) {
            sum += image[row + x + j] * kernel[i * k_width + j];
        }
    }
    output[static_cast<size_t>(y) * out_width + x] = sum;
}

// Host driver: allocates host/device buffers for a 1024x1024 image and a 3x3
// filter, runs convolve2D once, copies the result back and releases all
// resources. Returns 0 on success; exits with EXIT_FAILURE on invalid
// dimensions or any CUDA error.
int main()
{
    // Input image and kernel dimensions
    const int width    = 1024;
    const int height   = 1024;
    const int k_width  = 3;
    const int k_height = 3;

    // Calculate output image dimensions
    const int out_width  = width - k_width + 1;
    const int out_height = height - k_height + 1;

    // A filter larger than the image would yield a non-positive output size,
    // which in turn produces a zero-sized grid and an invalid launch.
    if (k_width <= 0 || k_height <= 0 || out_width <= 0 || out_height <= 0) {
        std::cerr << "Invalid dimensions: kernel must be non-empty and no "
                     "larger than the image" << std::endl;
        return EXIT_FAILURE;
    }

    // Element and byte counts, computed in size_t to avoid int overflow.
    const size_t image_count  = static_cast<size_t>(width) * height;
    const size_t kernel_count = static_cast<size_t>(k_width) * k_height;
    const size_t output_count = static_cast<size_t>(out_width) * out_height;

    const size_t image_bytes  = image_count * sizeof(float);
    const size_t kernel_bytes = kernel_count * sizeof(float);
    const size_t output_bytes = output_count * sizeof(float);

    // Host buffers. std::vector zero-initializes the data (so nothing
    // indeterminate is ever copied to the device) and frees it automatically
    // on every exit path.
    std::vector<float> image(image_count, 0.0f);
    std::vector<float> kernel(kernel_count, 0.0f);
    std::vector<float> output(output_count, 0.0f);

    // Load image and kernel data (not shown)

    // Allocate device memory for input image, kernel, and output image
    float* d_image  = NULL;
    float* d_kernel = NULL;
    float* d_output = NULL;
    CUDA_CHECK(cudaMalloc((void**)&d_image, image_bytes));
    CUDA_CHECK(cudaMalloc((void**)&d_kernel, kernel_bytes));
    CUDA_CHECK(cudaMalloc((void**)&d_output, output_bytes));

    // Copy input image and kernel data to device memory
    CUDA_CHECK(cudaMemcpy(d_image, image.data(), image_bytes,
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_kernel, kernel.data(), kernel_bytes,
                          cudaMemcpyHostToDevice));

    // Configure thread block and grid dimensions (ceil-div so the grid
    // covers the whole output even when it is not a multiple of the block).
    // Named block/grid rather than blockDim/gridDim to avoid confusion with
    // the device built-ins of the same name.
    const dim3 block(16, 16);
    const dim3 grid((out_width + block.x - 1) / block.x,
                    (out_height + block.y - 1) / block.y);

    // Launch the kernel
    convolve2D<<<grid, block>>>(d_image, d_kernel, d_output,
                                width, height, k_width, k_height);
    // A launch does not return an error code itself: configuration errors
    // are reported by cudaGetLastError(), execution errors by the next
    // synchronizing call.
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    // Copy the output data back to the host
    CUDA_CHECK(cudaMemcpy(output.data(), d_output, output_bytes,
                          cudaMemcpyDeviceToHost));

    // Deallocate device memory
    CUDA_CHECK(cudaFree(d_image));
    CUDA_CHECK(cudaFree(d_kernel));
    CUDA_CHECK(cudaFree(d_output));

    // Host memory is released by the std::vector destructors.
    return 0;
}
Editor is loading...