mul
unknown
c_cpp
a month ago
2.0 kB
3
Indexable
Never
#include <cuda_runtime.h>

#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <vector>

// Evaluates a CUDA runtime call; on any result other than cudaSuccess, prints
// the source location and the runtime's error string, then exits the process.
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        const cudaError_t err_ = (call);                                     \
        if (err_ != cudaSuccess) {                                           \
            std::fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__,         \
                         __LINE__, cudaGetErrorString(err_));                \
            std::exit(EXIT_FAILURE);                                         \
        }                                                                    \
    } while (0)

/**
 * Element-wise product of two vectors: C[i] = A[i] * B[i] for 0 <= i < N.
 *
 * Launch layout: 1D grid of 1D blocks. Each thread handles at most one
 * element, so the launch must supply at least N threads in total; threads
 * whose index is >= N do nothing. No shared memory is used.
 *
 * Each thread only touches index i of every array, so C may be the same
 * buffer as A or B (in-place use).
 *
 * @param A  device pointer to the first input vector (N floats, read-only)
 * @param B  device pointer to the second input vector (N floats, read-only)
 * @param C  device pointer to the output vector (N floats)
 * @param N  number of elements in each vector
 */
__global__ void elementwiseMulKernel(const float* A, const float* B, float* C, int N) {
    // Flat global index of this thread across the whole 1D grid.
    const int i = blockIdx.x * blockDim.x + threadIdx.x;

    // The grid is rounded up to whole blocks, so the tail threads must be masked off.
    if (i < N) {
        C[i] = A[i] * B[i];
    }
}

/**
 * Demo driver: fills two host vectors, multiplies them element-wise on the
 * GPU, prints the first few products and checks every product against a
 * host-computed reference.
 *
 * @return EXIT_SUCCESS when all products match, EXIT_FAILURE otherwise
 *         (CUDA runtime failures also terminate with EXIT_FAILURE).
 */
int main() {
    // Number of elements in the vectors and the matching byte count.
    const int N = 64;
    const size_t size = static_cast<size_t>(N) * sizeof(float);

    // Host buffers; std::vector releases them on every exit path.
    std::vector<float> h_A(N);
    std::vector<float> h_B(N);
    std::vector<float> h_C(N);

    // Start at 1 and 2 so no product is trivially zero.
    for (int i = 0; i < N; i++) {
        h_A[i] = static_cast<float>(i + 1);
        h_B[i] = static_cast<float>(i + 2);
    }

    // Device buffers.
    float* d_A = nullptr;
    float* d_B = nullptr;
    float* d_C = nullptr;
    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_A), size));
    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_B), size));
    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_C), size));

    // Upload the inputs.
    CUDA_CHECK(cudaMemcpy(d_A, h_A.data(), size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, h_B.data(), size, cudaMemcpyHostToDevice));

    // Ceil-divide so every element gets a thread even when N is not a
    // multiple of the block size.
    const int threadsPerBlock = 256;
    const int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    elementwiseMulKernel<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
    // A launch returns no status of its own; configuration errors are only
    // visible through cudaGetLastError().
    CUDA_CHECK(cudaGetLastError());

    // Blocking copy on the default stream: it waits for the kernel, and any
    // fault raised during kernel execution is reported by this call.
    CUDA_CHECK(cudaMemcpy(h_C.data(), d_C, size, cudaMemcpyDeviceToHost));

    // Print a sample of the results, never reading past the end of the vectors.
    const int shown = (N < 10) ? N : 10;
    for (int i = 0; i < shown; i++) {
        std::cout << "h_A[" << i << "] * h_B[" << i << "] = " << h_A[i] << " * "
                  << h_B[i] << " = " << h_C[i] << std::endl;
    }

    // Check every element against the host product. The inputs are small
    // integers whose products are exactly representable in float, so an exact
    // comparison is valid here.
    int mismatches = 0;
    for (int i = 0; i < N; i++) {
        if (h_C[i] != h_A[i] * h_B[i]) {
            ++mismatches;
        }
    }
    if (mismatches != 0) {
        std::cerr << "Verification failed: " << mismatches << " of " << N
                  << " elements differ from the host reference" << std::endl;
    }

    // Release device memory.
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));

    return (mismatches == 0) ? EXIT_SUCCESS : EXIT_FAILURE;
}
Leave a Comment