// Language: CUDA C++

#include <iostream>
#include <vector>

#include <cuda_runtime.h>

// "Valid"-region 2D sliding-window filter: every thread produces one output
// element by multiplying a k_width x k_height window of `image` element-wise
// with `kernel` and summing the products. The kernel is applied as stored
// (it is not flipped), i.e. this is a cross-correlation.
//
// Layout: `image` is row-major with `width` elements per row, `kernel` is
// row-major with `k_width` elements per row, and `output` is row-major with
// (width - k_width + 1) elements per row and (height - k_height + 1) rows.
//
// Launch: 2D grid of 2D blocks whose x/y extents cover at least the output
// width/height; surplus threads exit without touching memory. No shared
// memory is used.
//
// All three pointers must be dereferenceable from device code.
__global__ void convolve2D(const float* image, const float* kernel, float* output, int width, int height, int k_width, int k_height) {
    // Output extent of the "valid" region (no padding).
    const int out_width = width - k_width + 1;
    const int out_height = height - k_height + 1;

    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    // The grid is rounded up to whole blocks, so tail threads have no work.
    if (x >= out_width || y >= out_height) {
        return;
    }

    float sum = 0.0f;
    for (int i = 0; i < k_height; ++i) {
        // Flat offsets are computed in size_t: a product such as
        // (y + i) * width overflows 32-bit int once the image holds more
        // than 2^31 elements.
        const size_t image_row = static_cast<size_t>(y + i) * width + x;
        const size_t kernel_row = static_cast<size_t>(i) * k_width;

        for (int j = 0; j < k_width; ++j) {
            sum += image[image_row + j] * kernel[kernel_row + j];
        }
    }

    output[static_cast<size_t>(y) * out_width + x] = sum;
}

// Prints a diagnostic to stderr when a CUDA runtime call has failed.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << "CUDA error in " << what << ": " << cudaGetErrorString(status) << std::endl;
    return false;
}

// Host driver: allocates host and device buffers for a 1024x1024 image and a
// 3x3 kernel, runs convolve2D over the valid output region, and copies the
// result back to the host.
// Returns 0 on success and 1 if any CUDA call or the kernel launch fails.
int main() {
    // Input image and kernel dimensions
    int width = 1024;
    int height = 1024;
    int k_width = 3;
    int k_height = 3;

    // Output dimensions of the "valid" region (no padding)
    int out_width = width - k_width + 1;
    int out_height = height - k_height + 1;

    // Element and byte counts, computed in size_t so the products cannot
    // overflow int for larger dimensions.
    const std::size_t image_count = static_cast<std::size_t>(width) * height;
    const std::size_t kernel_count = static_cast<std::size_t>(k_width) * k_height;
    const std::size_t output_count = static_cast<std::size_t>(out_width) * out_height;
    const std::size_t image_bytes = image_count * sizeof(float);
    const std::size_t kernel_bytes = kernel_count * sizeof(float);
    const std::size_t output_bytes = output_count * sizeof(float);

    // Host buffers. std::vector value-initializes to 0.0f, so the data copied
    // to the device is well-defined even before real input is loaded, and the
    // storage is released automatically on every return path.
    std::vector<float> image(image_count);
    std::vector<float> kernel(kernel_count);
    std::vector<float> output(output_count);

    // Load image and kernel data (not shown)

    // Device buffers start as nullptr so the unconditional cudaFree calls
    // below are safe even when an allocation failed part-way through.
    float *d_image = nullptr;
    float *d_kernel = nullptr;
    float *d_output = nullptr;

    // Allocate device memory and upload the inputs; && short-circuits at the
    // first failure so later steps never run on invalid state.
    bool ok = cudaOk(cudaMalloc((void **)&d_image, image_bytes), "cudaMalloc(d_image)")
           && cudaOk(cudaMalloc((void **)&d_kernel, kernel_bytes), "cudaMalloc(d_kernel)")
           && cudaOk(cudaMalloc((void **)&d_output, output_bytes), "cudaMalloc(d_output)")
           && cudaOk(cudaMemcpy(d_image, image.data(), image_bytes, cudaMemcpyHostToDevice),
                     "cudaMemcpy(image -> device)")
           && cudaOk(cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice),
                     "cudaMemcpy(kernel -> device)");

    if (ok) {
        // One thread per output element; the grid is rounded up to whole
        // blocks. Named block/grid to avoid confusion with the device-side
        // built-ins blockDim/gridDim.
        dim3 block(16, 16);
        dim3 grid((out_width + block.x - 1) / block.x, (out_height + block.y - 1) / block.y);

        convolve2D<<<grid, block>>>(d_image, d_kernel, d_output, width, height, k_width, k_height);

        // A launch does not return a status; configuration errors are only
        // visible through cudaGetLastError().
        ok = cudaOk(cudaGetLastError(), "convolve2D launch");
    }

    if (ok) {
        // This blocking copy waits for the kernel to finish, so errors raised
        // during kernel execution are reported here as well.
        ok = cudaOk(cudaMemcpy(output.data(), d_output, output_bytes, cudaMemcpyDeviceToHost),
                    "cudaMemcpy(output -> host)");
    }

    // Deallocate device memory (cudaFree(nullptr) is a no-op)
    cudaFree(d_image);
    cudaFree(d_kernel);
    cudaFree(d_output);

    return ok ? 0 : 1;
}
// End of example.