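// Paste-site metadata captured with the code (not part of the program):
// title "mul" · author unknown · syntax c_cpp · posted a year ago · 2.0 kB · 9 · Indexable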
#include <cuda_runtime.h>

#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <vector>
// CUDA kernel: element-wise product of two vectors, C[i] = A[i] * B[i].
//
// Parameters:
//   A, B  input vectors, only read; each must hold at least N floats.
//   C     output vector of at least N floats. Every element is read from
//         A/B and then written to C at the same index, so C may be the same
//         buffer as A or B.
//   N     number of elements to process; N <= 0 does nothing.
//
// Launch layout: 1D grid of 1D blocks. Each thread starts at its global index
// and advances by the total number of launched threads, so every element is
// covered even when the grid holds fewer than N threads. No shared memory is
// used.
__global__ void elementwiseMulKernel(const float* A, const float* B, float* C, int N) {
    if (N <= 0) {
        return;
    }

    // Index math is done in size_t. With an int index, a launch spanning more
    // than INT_MAX threads produced negative indices that still satisfied
    // `i < N` and accessed memory out of bounds.
    const size_t count = static_cast<size_t>(N);
    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;

    for (size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
         i < count;
         i += stride) {
        C[i] = A[i] * B[i];
    }
}
// Terminates the program with a diagnostic when a CUDA runtime call failed.
// `what` names the operation so the message identifies the failing step.
static void checkCuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        std::fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        std::exit(EXIT_FAILURE);
    }
}

// Demo driver: fills two host vectors, multiplies them element-wise on the
// GPU with elementwiseMulKernel, and prints the first few products.
// Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
int main() {
    // Number of elements in the vectors
    const int N = 64;
    // Size of the arrays in bytes
    const size_t size = static_cast<size_t>(N) * sizeof(float);

    // Host (CPU) buffers; std::vector throws on allocation failure instead of
    // handing back a null pointer, and frees itself on return.
    std::vector<float> h_A(N);
    std::vector<float> h_B(N);
    std::vector<float> h_C(N);

    // Initialize host arrays with some values
    for (int i = 0; i < N; i++) {
        h_A[i] = static_cast<float>(i + 1);  // Avoid zeros for meaningful multiplication
        h_B[i] = static_cast<float>(i + 2);
    }

    // Device (GPU) memory allocation
    float* d_A = NULL;
    float* d_B = NULL;
    float* d_C = NULL;
    checkCuda(cudaMalloc((void**)&d_A, size), "cudaMalloc(d_A)");
    checkCuda(cudaMalloc((void**)&d_B, size), "cudaMalloc(d_B)");
    checkCuda(cudaMalloc((void**)&d_C, size), "cudaMalloc(d_C)");

    // Copy data from host to device
    checkCuda(cudaMemcpy(d_A, &h_A[0], size, cudaMemcpyHostToDevice), "copy of A to device");
    checkCuda(cudaMemcpy(d_B, &h_B[0], size, cudaMemcpyHostToDevice), "copy of B to device");

    // Threads per block, and enough blocks to cover N (ceiling division)
    const int threadsPerBlock = 256;
    const int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    // Launch the kernel. A launch returns no status itself: configuration
    // errors are read back with cudaGetLastError(), and faults raised while
    // the kernel runs surface at the next synchronizing call.
    elementwiseMulKernel<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
    checkCuda(cudaGetLastError(), "kernel launch");
    checkCuda(cudaDeviceSynchronize(), "kernel execution");

    // Copy the result from device to host
    checkCuda(cudaMemcpy(&h_C[0], d_C, size, cudaMemcpyDeviceToHost), "copy of C to host");

    // Show the first few results; clamp to N so a smaller N cannot read past
    // the end of the buffers.
    const int shown = (N < 10) ? N : 10;
    for (int i = 0; i < shown; i++) {
        std::cout << "h_A[" << i << "] * h_B[" << i << "] = " << h_A[i] << " * " << h_B[i]
                  << " = " << h_C[i] << std::endl;
    }

    // Free device memory
    checkCuda(cudaFree(d_A), "cudaFree(d_A)");
    checkCuda(cudaFree(d_B), "cudaFree(d_B)");
    checkCuda(cudaFree(d_C), "cudaFree(d_C)");

    return 0;
}
// Paste-site page residue (not part of the program): "Editor is loading...", "Leave a Comment"