// Untitled -- plain_text paste by an unknown author (2 years ago, 4.4 kB).
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <ctime>
#define MAX_CUDA_BLOCKS           1024
cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size);
/**
 * Element-wise addition kernel: c[i] = a[i] + b[i].
 *
 * Launch layout: a 1D grid (only blockIdx.x is used) of 1D or 2D blocks.
 * Each thread handles exactly one element; the element index is
 *   blockIdx.x * (blockDim.x * blockDim.y) + threadIdx.y * blockDim.x + threadIdx.x.
 * No shared memory is used.
 *
 * @param c     Output device buffer.
 * @param a     First input device buffer.
 * @param b     Second input device buffer.
 * @param size  Number of valid elements. Threads whose index is >= size do
 *              nothing. The default (largest size_t) disables the guard, which
 *              keeps three-argument launches working; in that case all three
 *              buffers must hold at least
 *              gridDim.x * blockDim.x * blockDim.y elements.
 */
__global__ void addKernel(int *c, const int *a, const int *b, size_t size = (size_t)-1) {
    // Number of elements covered by one block.
    const size_t elementsPerBlock = (size_t)blockDim.x * blockDim.y;

    // Row-major position of this thread inside its block. Using
    // threadIdx.y * blockDim.x + threadIdx.x (rather than x * blockDim.x + y)
    // is a bijection for any block shape, not just square ones, and makes
    // threads that are adjacent in x touch adjacent addresses.
    const size_t indexInBlock = (size_t)threadIdx.y * blockDim.x + threadIdx.x;

    // Final element index, computed in size_t so large grids cannot overflow int.
    const size_t elementIndex = (size_t)blockIdx.x * elementsPerBlock + indexInBlock;

    // The grid rarely divides the data evenly; skip the tail threads.
    if (elementIndex < size) {
        c[elementIndex] = a[elementIndex] + b[elementIndex];
    }
}
/**
 * Demo entry point: fills two SIZE_X x SIZE_Y integer matrices (stored as flat
 * vectors) with random values in [0, 99], adds them on the GPU through
 * addWithCuda, and prints the result one matrix row per line.
 *
 * @return 0 on success, 1 if the addition or the device reset fails.
 */
int main() {
    // Initialize randomness
    srand((unsigned int) time(NULL));

    // Define Matrix/Vector dimensions (X x Y) (X x 1 for vectors)
    const int SIZE_X = 17;
    const int SIZE_Y = 1;
    // Cache size for later calculations
    const int VEC_LEN = SIZE_X * SIZE_Y;

    // Define input vectors and a zero-initialized output vector
    int a[VEC_LEN];
    int b[VEC_LEN];
    int c[VEC_LEN] = { 0 };

    // Fill in input vectors with some initial data
    for (int i = 0; i < VEC_LEN; i++) {
        a[i] = rand() % 100;
        b[i] = rand() % 100;
    }

    // Add matrices (or vectors) using cuda
    cudaError_t cudaStatus = addWithCuda(c, a, b, VEC_LEN);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }

    // Print output result, starting a new line after every SIZE_X elements
    for (int i = 0; i < VEC_LEN; i++) {
        if (i && i % SIZE_X == 0) {
            printf("\n");
        }
        printf("%5d ", c[i]);
    }
    printf("\n");

    // Close CUDA interface. cudaDeviceReset is the replacement for the
    // deprecated cudaThreadExit.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    // Pause system on finish (relies on the Windows "pause" shell command)
    system("pause");
    return 0;
}
/**
 * Helper function for using CUDA to add vectors in parallel:
 * c[i] = a[i] + b[i] for i in [0, size).
 *
 * The work is split into blocks of 32 x 32 = 1024 threads, one element per
 * thread. The device buffers are padded up to a whole number of blocks so that
 * every launched thread has a valid element to read and write even when
 * `size` is not a multiple of 1024; only the first `size` results are copied
 * back to the host.
 *
 * @param c     Host output buffer with room for `size` ints.
 * @param a     Host input buffer holding `size` ints.
 * @param b     Host input buffer holding `size` ints.
 * @param size  Number of elements to add. Zero is a no-op.
 * @return cudaSuccess, or the first CUDA error encountered (also reported on
 *         stderr). Device buffers are released on every path.
 */
cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size) {
    int *dev_a = 0;
    int *dev_b = 0;
    int *dev_c = 0;
    cudaError_t cudaStatus = cudaSuccess;

    // Every local is declared before the first `goto Error`: C++ forbids a
    // jump that skips over an initialization still in scope at the label.

    // Split work into NxN block sectors (threads per block must not exceed 1024).
    const unsigned int blockSide = 32;
    const size_t elementsPerBlock = (size_t)blockSide * blockSide;
    // Ceil-division: one extra block for a partial tail.
    const size_t blockCount = size / elementsPerBlock + (size % elementsPerBlock == 0 ? 0 : 1);
    const size_t payloadBytes = size * sizeof(int);
    // Buffers cover every launched thread, not just the `size` real elements.
    const size_t paddedBytes = blockCount * elementsPerBlock * sizeof(int);
    const dim3 grid((unsigned int)blockCount);
    const dim3 threadsPerBlock(blockSide, blockSide);

    // Nothing to do; a zero-block launch would be an invalid configuration.
    if (size == 0) {
        return cudaSuccess;
    }

    // The x-dimension of a grid is limited to 2^31 - 1 blocks; refuse instead
    // of silently truncating the block count.
    if (blockCount > 2147483647u) {
        fprintf(stderr, "addWithCuda: vector too large for a single launch!");
        return cudaErrorInvalidValue;
    }

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for three vectors (two input, one output).
    cudaStatus = cudaMalloc((void**)&dev_c, paddedBytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_a, paddedBytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_b, paddedBytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Zero the inputs so the padding past `size` holds defined values.
    cudaStatus = cudaMemset(dev_a, 0, paddedBytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemset failed!");
        goto Error;
    }
    cudaStatus = cudaMemset(dev_b, 0, paddedBytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemset failed!");
        goto Error;
    }

    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, payloadBytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    cudaStatus = cudaMemcpy(dev_b, b, payloadBytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Initialize calculations
    addKernel<<<grid, threadsPerBlock>>>(dev_c, dev_a, dev_b);

    // A kernel launch returns nothing; launch-configuration errors are only
    // visible through cudaGetLastError.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize (replacement for the deprecated
    // cudaThreadSynchronize) waits for the kernel to finish, and returns
    // any errors encountered during execution.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy output vector from GPU buffer to host memory (real elements only).
    cudaStatus = cudaMemcpy(c, dev_c, payloadBytes, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    // cudaFree accepts null pointers, so this is safe after a partial setup.
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);
    return cudaStatus;
}
// Editor is loading...