// add

#include <cuda_runtime.h>
#include <cstdlib>
#include <iostream>
#include <vector>

// CUDA kernel for element-wise addition of two vectors: C[i] = A[i] + B[i].
//
// Launch layout: 1D grid of 1D blocks. Any grid/block size is valid because
// each thread walks the data with a stride equal to the total number of
// launched threads, so a grid smaller than N still covers every element.
// Shared memory: none.
//
// Parameters:
//   A, B - device pointers to the input vectors (read-only), at least N floats
//   C    - device pointer to the output vector, at least N floats; it may be
//          the same buffer as A or B since every element is read and written
//          by the same thread
//   N    - number of elements to process; values <= 0 make the kernel a no-op
__global__ void addVectorsKernel(const float* A, const float* B, float* C, int N) {
    // Guard first: converting a negative N to size_t would yield a huge bound.
    if (N <= 0) {
        return;
    }
    const size_t count = static_cast<size_t>(N);

    // Index arithmetic is done in size_t so that blockIdx.x * blockDim.x cannot
    // wrap and then slip past the bounds check as a negative int.
    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
    for (size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
         i < count;
         i += stride) {
        C[i] = A[i] + B[i];
    }
}

// Reports a failed CUDA runtime call on stderr and terminates the process.
// `what` names the operation so the message identifies which call failed.
// Exiting immediately is acceptable here: the driver releases device
// allocations when the process ends.
static void checkCuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        std::cerr << "CUDA error during " << what << ": "
                  << cudaGetErrorString(status) << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

// Demo driver: fills two host vectors, adds them on the GPU with
// addVectorsKernel, prints the first few results and checks every element
// against a host-side reference.
// Returns EXIT_SUCCESS when all elements match, EXIT_FAILURE otherwise.
int main() {
    // Number of elements in the vectors
    const int N = 1024;

    // Size of the arrays in bytes
    const size_t size = static_cast<size_t>(N) * sizeof(float);

    // Host (CPU) buffers; std::vector owns the memory, so no explicit free
    // is needed and allocation failure throws instead of returning NULL.
    std::vector<float> h_A(N);
    std::vector<float> h_B(N);
    std::vector<float> h_C(N);

    // Initialize host arrays with some values
    for (int i = 0; i < N; i++) {
        h_A[i] = static_cast<float>(i);
        h_B[i] = static_cast<float>(i * 2);
    }

    // Device (GPU) memory allocation
    float* d_A = nullptr;
    float* d_B = nullptr;
    float* d_C = nullptr;
    checkCuda(cudaMalloc(&d_A, size), "cudaMalloc(d_A)");
    checkCuda(cudaMalloc(&d_B, size), "cudaMalloc(d_B)");
    checkCuda(cudaMalloc(&d_C, size), "cudaMalloc(d_C)");

    // Copy data from host to device
    checkCuda(cudaMemcpy(d_A, h_A.data(), size, cudaMemcpyHostToDevice),
              "cudaMemcpy(h_A -> d_A)");
    checkCuda(cudaMemcpy(d_B, h_B.data(), size, cudaMemcpyHostToDevice),
              "cudaMemcpy(h_B -> d_B)");

    // Define the number of threads per block and number of blocks
    // (ceil-division so a partial last block still covers the tail).
    const int threadsPerBlock = 256;
    const int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    // Launch the kernel
    addVectorsKernel<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
    // A launch returns no status itself: configuration errors are reported by
    // cudaGetLastError(), execution faults by the next synchronizing call.
    checkCuda(cudaGetLastError(), "addVectorsKernel launch");
    checkCuda(cudaDeviceSynchronize(), "addVectorsKernel execution");

    // Copy the result from device to host
    checkCuda(cudaMemcpy(h_C.data(), d_C, size, cudaMemcpyDeviceToHost),
              "cudaMemcpy(d_C -> h_C)");

    // Print a few values; clamp the count so the loop never reads past N.
    const int preview = (N < 10) ? N : 10;
    for (int i = 0; i < preview; i++) {
        std::cout << "h_A[" << i << "] + h_B[" << i << "] = " << h_A[i] << " + " << h_B[i]
                  << " = " << h_C[i] << std::endl;
    }

    // Verify every element against the same single-precision sum computed on
    // the host.
    int mismatches = 0;
    for (int i = 0; i < N; i++) {
        const float expected = h_A[i] + h_B[i];
        if (h_C[i] != expected) {
            if (mismatches == 0) {
                std::cerr << "Mismatch at index " << i << ": expected " << expected
                          << ", got " << h_C[i] << std::endl;
            }
            ++mismatches;
        }
    }
    if (mismatches != 0) {
        std::cerr << mismatches << " of " << N << " elements are incorrect" << std::endl;
    }

    // Free device memory
    checkCuda(cudaFree(d_A), "cudaFree(d_A)");
    checkCuda(cudaFree(d_B), "cudaFree(d_B)");
    checkCuda(cudaFree(d_C), "cudaFree(d_C)");

    return (mismatches == 0) ? EXIT_SUCCESS : EXIT_FAILURE;
}
// Editor is loading...