#include "cuda_runtime.h" #include "device_launch_parameters.h" #include <stdio.h> #include <math.h> #include <ctime> #define MAX_CUDA_BLOCKS 1024 cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size); __global__ void addKernel(int *c, const int *a, const int *b) { // Get current thread "position" x and y int innerX = threadIdx.x; int innerY = threadIdx.y; int blockYOffset = blockIdx.x; // Get block size (to simulate matrix behaviour) int blockWidth = blockDim.x * blockDim.y; // Calculate final element index (for A, B and output C matrices) int elementIndex = blockYOffset * blockWidth + innerX * blockDim.x + innerY; // Return calculated result to the output pointer c[elementIndex] = a[elementIndex] + b[elementIndex]; } int main() { // Initialize randomness srand((unsigned int) time (NULL)); // Define Matrix/Vector dimensions (X x Y) (X x 1 for vectors) const int SIZE_X = 17; const int SIZE_Y = 1; // Cache size for later calculations const int VEC_LEN = SIZE_X * SIZE_Y; // Define input and output vectors int a[VEC_LEN]; int b[VEC_LEN]; // Initialize empty output vector int c[VEC_LEN] = { 0 }; // Fill in input vectors with some initial data for (int i = 0; i < VEC_LEN; i++) { } } } a[i] = rand() % 100; b[i] = rand() % 100; // Add matrices (or vectors) using cuda cudaError_t cudaStatus = addWithCuda(c, a, b, VEC_LEN); if (cudaStatus != cudaSuccess) { fprintf(stderr, "addWithCuda failed!"); return 1; } // Print output result for (int i = 0; i < VEC_LEN; i++) { if (i && i % SIZE_X == 0) { printf("\n"); printf("%5d ", c[i]); // Close CUDA interface cudaStatus = cudaThreadExit(); if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaThreadExit failed!"); return 1; } // Pause system on finish system("pause"); return 0; } // Helper function for using CUDA to add vectors in parallel. cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size) { int *dev_a = 0; int *dev_b = 0; int *dev_c = 0; cudaError_t cudaStatus; // Choose which GPU to run on, change this on a multi-GPU system. cudaStatus = cudaSetDevice(0); if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?"); goto Error; } // Allocate GPU buffers for three vectors (two input, one output) . cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int)); if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaMalloc failed!"); goto Error; } cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int)); if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaMalloc failed!"); goto Error; } cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int)); if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaMalloc failed!"); goto Error; } // Copy input vectors from host memory to GPU buffers. cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice); if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaMemcpy failed!"); goto Error; } cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice); if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaMemcpy failed!"); goto Error; } // Split work into NxN block sector (max CUDA threads count per block shouldn't exceed 1024) const int N = sqrt(1024.0f); dim3 blockCount = size / MAX_CUDA_BLOCKS + (size % MAX_CUDA_BLOCKS == 0 ? 0 : 1); dim3 threadsPerBlock(N, N); // Initialize calculations addKernel<<<blockCount, threadsPerBlock>>>(dev_c, dev_a, dev_b); // cudaThreadSynchronize waits for the kernel to finish, and returns // any errors encountered during the launch. 
    cudaStatus = cudaThreadSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaThreadSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return cudaStatus;
}
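
For reference, a small host-side check can be appended to the file above to confirm that the GPU result matches a plain CPU addition. The verifyAdd helper below is not part of the original code; it is only a sketch of what such a check might look like, and it reuses the stdio.h include already present at the top of the file.

// Hypothetical helper (not in the original code): compare the GPU output
// against a CPU-side addition and report the first mismatch, if any.
static bool verifyAdd(const int *c, const int *a, const int *b, size_t size)
{
    for (size_t i = 0; i < size; i++) {
        if (c[i] != a[i] + b[i]) {
            fprintf(stderr, "Mismatch at element %d: %d + %d != %d\n",
                    (int) i, a[i], b[i], c[i]);
            return false;
        }
    }
    return true;
}

In main, it could be called right after the addWithCuda error check, for example: if (!verifyAdd(c, a, b, VEC_LEN)) return 1;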