// Paste-site metadata (not part of the program): cuda, unknown, c_cpp, 3 years ago, 2.5 kB, 6, Indexable
#include <iostream>
#include <cuda_runtime.h>
// Sliding-window sum of products of `kernel` over `image`, restricted to the
// positions where the window fits entirely inside the image (no padding).
// The kernel is applied without flipping, i.e. output(y, x) is
//   sum over i, j of image(y + i, x + j) * kernel(i, j).
//
// Parameters:
//   image    - row-major input, width * height floats
//   kernel   - row-major filter, k_width * k_height floats
//   output   - row-major result,
//              (width - k_width + 1) * (height - k_height + 1) floats
//   width, height     - input dimensions in elements
//   k_width, k_height - filter dimensions in elements
//
// Launch layout: 2D grid of 2D blocks, one thread per output element; the
// grid must cover at least (width - k_width + 1) x (height - k_height + 1)
// threads. Surplus threads exit without touching memory. No shared memory
// is used.
__global__ void convolve2D(const float* image, const float* kernel, float* output, int width, int height, int k_width, int k_height) {
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Output extents; non-positive when the filter is larger than the image,
    // in which case every thread falls through the guard below.
    const int out_width = width - k_width + 1;
    const int out_height = height - k_height + 1;
    if (x >= out_width || y >= out_height) {
        return;
    }

    float sum = 0.0f;
    for (int i = 0; i < k_height; ++i) {
        // Offsets are formed in size_t so that images with more than 2^31
        // elements do not overflow the flat index.
        const float* image_row = image + static_cast<size_t>(y + i) * width + x;
        const float* kernel_row = kernel + static_cast<size_t>(i) * k_width;
        for (int j = 0; j < k_width; ++j) {
            sum += image_row[j] * kernel_row[j];
        }
    }
    output[static_cast<size_t>(y) * out_width + x] = sum;
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaSucceeded(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << "CUDA error during " << what << ": "
              << cudaGetErrorString(status) << std::endl;
    return false;
}

// Runs convolve2D on a 1024x1024 image with a 3x3 filter and copies the
// result back to the host. Returns 0 on success and 1 if any CUDA call fails.
int main() {
    // Input image and kernel dimensions
    const int width = 1024;
    const int height = 1024;
    const int k_width = 3;
    const int k_height = 3;

    // Output dimensions: only positions where the filter fits entirely
    const int out_width = width - k_width + 1;
    const int out_height = height - k_height + 1;

    // Element and byte counts, widened to size_t before multiplying
    const size_t image_count = static_cast<size_t>(width) * height;
    const size_t kernel_count = static_cast<size_t>(k_width) * k_height;
    const size_t output_count = static_cast<size_t>(out_width) * out_height;
    const size_t image_bytes = image_count * sizeof(float);
    const size_t kernel_bytes = kernel_count * sizeof(float);
    const size_t output_bytes = output_count * sizeof(float);

    // Allocate host memory. The inputs are zero-initialized so that, until
    // real data is loaded below, the device never receives indeterminate values.
    float* image = new float[image_count]();
    float* kernel = new float[kernel_count]();
    float* output = new float[output_count];

    // Load image and kernel data (not shown)

    // Device buffers start as null so the cleanup below is safe even if an
    // allocation fails part-way (cudaFree on a null pointer does nothing).
    float* d_image = nullptr;
    float* d_kernel = nullptr;
    float* d_output = nullptr;

    // Allocate device memory and upload the inputs; && stops at the first failure.
    bool ok =
        cudaSucceeded(cudaMalloc((void**)&d_image, image_bytes), "cudaMalloc(d_image)") &&
        cudaSucceeded(cudaMalloc((void**)&d_kernel, kernel_bytes), "cudaMalloc(d_kernel)") &&
        cudaSucceeded(cudaMalloc((void**)&d_output, output_bytes), "cudaMalloc(d_output)") &&
        cudaSucceeded(cudaMemcpy(d_image, image, image_bytes, cudaMemcpyHostToDevice),
                      "cudaMemcpy(image -> device)") &&
        cudaSucceeded(cudaMemcpy(d_kernel, kernel, kernel_bytes, cudaMemcpyHostToDevice),
                      "cudaMemcpy(kernel -> device)");

    if (ok) {
        // One thread per output element; the grid is rounded up to whole blocks.
        const dim3 block(16, 16);
        const dim3 grid((out_width + block.x - 1) / block.x,
                        (out_height + block.y - 1) / block.y);

        convolve2D<<<grid, block>>>(d_image, d_kernel, d_output, width, height, k_width, k_height);

        // A launch does not return a status; bad configurations surface via
        // cudaGetLastError(), and execution faults via the blocking copy below.
        ok = cudaSucceeded(cudaGetLastError(), "convolve2D launch") &&
             cudaSucceeded(cudaMemcpy(output, d_output, output_bytes, cudaMemcpyDeviceToHost),
                           "cudaMemcpy(output -> host)");
    }

    // Deallocate device memory
    cudaFree(d_image);
    cudaFree(d_kernel);
    cudaFree(d_output);

    // Deallocate host memory
    delete[] image;
    delete[] kernel;
    delete[] output;

    return ok ? 0 : 1;
}