#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <helper_cuda.h>
#include "hw3_header.h"

// Zero out the temporary histogram owned by this thread.
__global__ void initTemp(int** temp_hist_arr)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    for (int i = 0; i < NUM_RANGE; i++)
    {
        temp_hist_arr[index][i] = 0;
    }
}

// Each thread fills its own temporary histogram from its chunk of the input.
// Assumes ARR_SIZE is the number of elements handled per thread and that every
// input value lies in [0, NUM_RANGE).
__global__ void buildTemp(int** temp_hist_arr, int* arr)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    for (int i = 0; i < ARR_SIZE; i++)
    {
        int value = arr[index * ARR_SIZE + i];
        temp_hist_arr[index][value]++;
    }
}

// Accumulate per-thread histograms stored in a flat NUM_CUDA_THREADS x NUM_RANGE
// buffer into the final histogram, one thread per bin. Kept from the earlier
// flat-layout design; not launched yet.
__global__ void buildHist(int *h, int *temp)
{
    int index = threadIdx.x;
    for (int i = 0; i < NUM_CUDA_THREADS; i++)
        h[index] += temp[index + i * NUM_RANGE];
}

int computeOnGPU(int* h_array, int numElements, int* h_hist_temp)
{
    // Error code to check return values for CUDA calls
    cudaError_t err = cudaSuccess;

    size_t arr_size = numElements * sizeof(int);
    size_t hist_size = NUM_RANGE * sizeof(int);
    size_t temp_hist_arr_size = NUM_CUDA_BLOCKS * NUM_CUDA_THREADS * sizeof(int*);

    // Allocate memory on the GPU for the data array copied from the host
    int* d_array;
    err = cudaMalloc((void **)&d_array, arr_size);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to allocate device memory for device array - %s\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Copy the data array from the host to GPU memory
    err = cudaMemcpy(d_array, h_array, arr_size, cudaMemcpyHostToDevice);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy data from host to device for device array - %s\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Allocate memory on the GPU for the histogram that will be returned to the host
    int* d_hist;
    err = cudaMalloc((void **)&d_hist, hist_size);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to allocate device memory for device hist - %s\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Copy the initial histogram values from the host to GPU memory
    err = cudaMemcpy(d_hist, h_hist_temp, hist_size, cudaMemcpyHostToDevice);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy data from host to device for device hist - %s\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Allocate memory on the host for an array of pointers to the temporary histogram of each thread
    int** h_temp_hist_arr = (int**)malloc(temp_hist_arr_size);

    // Allocate memory on the GPU for each of the temporary histogram arrays
    for (int i = 0; i < NUM_CUDA_BLOCKS * NUM_CUDA_THREADS; i++)
    {
        err = cudaMalloc((void **)&h_temp_hist_arr[i], hist_size);
        if (err != cudaSuccess)
        {
            fprintf(stderr, "Failed to allocate device memory for device temp hist arr - %s\n", cudaGetErrorString(err));
            exit(EXIT_FAILURE);
        }
    }

    // Create the device-side array of pointers to the temporary histograms
    int** d_temp_hist_arr;
    err = cudaMalloc((void **)&d_temp_hist_arr, temp_hist_arr_size);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to allocate device memory for device temp hist - %s\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Copy the array of device pointers from the host to GPU memory
    err = cudaMemcpy(d_temp_hist_arr, h_temp_hist_arr, temp_hist_arr_size, cudaMemcpyHostToDevice);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy data from host to device for temp hist arr - %s\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Initialize the temporary histogram arrays
    initTemp<<<NUM_CUDA_BLOCKS, NUM_CUDA_THREADS>>>(d_temp_hist_arr);
    err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to launch initTemp kernel - %s\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Launch kernel: building and reducing the histogram (buildTemp plus a reduction)
    // is not wired in yet; see the sketch at the end of this file.

    // Free device arrays
    cudaFree(d_array);
    cudaFree(d_hist);
    for (int i = 0; i < NUM_CUDA_BLOCKS * NUM_CUDA_THREADS; i++)
    {
        err = cudaFree(h_temp_hist_arr[i]);
        if (err != cudaSuccess)
        {
            fprintf(stderr, "Failed to free device memory for device temp hist arr - %s\n", cudaGetErrorString(err));
            exit(EXIT_FAILURE);
        }
    }
    cudaFree(d_temp_hist_arr);
    free(h_temp_hist_arr);

    return 0;
}

/*
// Earlier draft of computeOnGPU, kept commented out for reference.
int computeOnGPU(int *data, int numElements)
{
    // Error code to check return values for CUDA calls
    cudaError_t err = cudaSuccess;
    size_t size = numElements * sizeof(float);

    // Allocate memory on GPU to copy the data from the host
    int *d_A;
    err = cudaMalloc((void **)&d_A, size);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to allocate device memory - %s\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Copy data from host to the GPU memory
    err = cudaMemcpy(d_A, data, size, cudaMemcpyHostToDevice);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy data from host to device - %s\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Launch the kernel
    int threadsPerBlock = 256;
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    //incrementByOne<<<blocksPerGrid, threadsPerBlock>>>(d_A, numElements);
    err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to launch vectorAdd kernel - %s\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Copy the result from GPU to the host memory
    err = cudaMemcpy(data, d_A, size, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy result array from device to host - %s\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Free allocated memory on GPU
    if (cudaFree(d_A) != cudaSuccess)
    {
        fprintf(stderr, "Failed to free device data - %s\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    return 0;

    // Allocate memory on device
    int *d_A = NULL;
    err = cudaMalloc((void **)&d_A, size);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Error in line %d (error code %s)!\n", __LINE__, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    int *d_h = NULL;
    err = cudaMalloc((void **)&d_h, NUM_RANGE * sizeof(int));
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Error in line %d (error code %s)!\n", __LINE__, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    int *d_temp = NULL;
    err = cudaMalloc((void **)&d_temp, NUM_CUDA_THREADS * NUM_RANGE * sizeof(int));
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Error in line %d (error code %s)!\n", __LINE__, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Copy the vector A to the device
    //err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Error in line %d (error code %s)!\n", __LINE__, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Initialize vectors on device
    initHist <<< 1, NUM_RANGE >>> (d_h);
    err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Error in line %d (error code %s)!\n", __LINE__, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    initTemp <<< 1, NUM_CUDA_THREADS >>> (d_temp);
    err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Error in line %d (error code %s)!\n", __LINE__, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Build partial histograms for each thread
    buildTemp <<< 1, NUM_CUDA_THREADS >>> (d_A, d_temp);
    err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Error in line %d (error code %s)!\n", __LINE__, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Unify the results
    buildHist <<< 1, RANGE >>> (d_h, d_temp);
    err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Error in line %d (error code %s)!\n", __LINE__, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Copy the final histogram to the host
    err = cudaMemcpy(h_H, d_h, RANGE * sizeof(int), cudaMemcpyDeviceToHost);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Error in line %d (error code %s)!\n", __LINE__, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Verify that the result histogram is correct
    int result = 0;
    for (int i = 0; i < RANGE; i++)
        result += h_H[i];
    if (result == SIZE)
        printf("Test PASSED\n");
    else
        printf("Test FAILED!!!!!\n");

    for (int i = 0; i < RANGE; i++)
    {
        printf("%d:[%d]\n", i, h_H[i]);
    }

    // Free device global memory
    err = cudaFree(d_A);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Error in line %d (error code %s)!\n", __LINE__, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    err = cudaFree(d_h);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Error in line %d (error code %s)!\n", __LINE__, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Error in line %d (error code %s)!\n", __LINE__, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    return 0;
}
*/
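/*
 * A minimal sketch of the step still missing at the "Launch kernel" placeholder in
 * computeOnGPU: fill the per-thread histograms, reduce them into d_hist, and copy the
 * result back to the host before the device buffers are freed. This is not part of the
 * original file; the function name launchHistogramKernels, the sumTemp reduction kernel,
 * and the assumption that hw3_header.h defines NUM_RANGE, NUM_CUDA_BLOCKS and
 * NUM_CUDA_THREADS are all hypothetical.
 */

// Hypothetical reduction kernel: one thread per bin sums that bin over all
// per-thread temporary histograms (the int** layout used above).
__global__ void sumTemp(int* hist, int** temp_hist_arr)
{
    int bin = blockDim.x * blockIdx.x + threadIdx.x;
    if (bin >= NUM_RANGE)
        return;
    for (int t = 0; t < NUM_CUDA_BLOCKS * NUM_CUDA_THREADS; t++)
        hist[bin] += temp_hist_arr[t][bin];
}

// Hypothetical wiring of the missing step, intended to be called right after the
// initTemp launch in computeOnGPU, before the device buffers are freed.
int launchHistogramKernels(int** d_temp_hist_arr, int* d_array, int* d_hist, int* h_hist)
{
    cudaError_t err = cudaSuccess;

    // Fill the per-thread temporary histograms from the input array.
    buildTemp<<<NUM_CUDA_BLOCKS, NUM_CUDA_THREADS>>>(d_temp_hist_arr, d_array);
    err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to launch buildTemp kernel - %s\n", cudaGetErrorString(err));
        return 1;
    }

    // Reduce the temporary histograms into the final device histogram
    // (one block of NUM_RANGE threads, one thread per bin).
    sumTemp<<<1, NUM_RANGE>>>(d_hist, d_temp_hist_arr);
    err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to launch sumTemp kernel - %s\n", cudaGetErrorString(err));
        return 1;
    }

    // Copy the final histogram back to the host.
    err = cudaMemcpy(h_hist, d_hist, NUM_RANGE * sizeof(int), cudaMemcpyDeviceToHost);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy histogram from device to host - %s\n", cudaGetErrorString(err));
        return 1;
    }

    return 0;
}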