#include <cuda_runtime.h>
#include <helper_cuda.h>
#include <stdio.h>
#include <stdlib.h>
#include "hw3_header.h"
// Zero every bin of each thread's private (temporary) histogram.
__global__ void initTemp(int** temp_hist_arr)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    for (int i = 0; i < NUM_RANGE; i++)
    {
        temp_hist_arr[index][i] = 0;
    }
}
// Each thread fills its private histogram from its own contiguous slice of arr.
// Assumes ARR_SIZE is the total number of input elements, that it divides evenly
// by NUM_CUDA_BLOCKS * NUM_CUDA_THREADS, and that every value lies in [0, NUM_RANGE).
__global__ void buildTemp(int** temp_hist_arr, int* arr)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    int slice = ARR_SIZE / (NUM_CUDA_BLOCKS * NUM_CUDA_THREADS);
    int offset = index * slice;
    for (int i = 0; i < slice; i++) {
        int value = arr[offset + i];
        temp_hist_arr[index][value]++;
    }
}
// Reduce the per-thread histograms into the final histogram.
// Intended launch: <<<1, NUM_RANGE>>>, one thread per bin.
__global__ void buildHist(int* h, int** temp_hist_arr) {
    int index = threadIdx.x;
    for (int i = 0; i < NUM_CUDA_BLOCKS * NUM_CUDA_THREADS; i++)
        h[index] += temp_hist_arr[i][index];
}
int computeOnGPU(int* h_array, int numElements, int* h_hist_temp) {
    // Error code to check return values for CUDA calls
    cudaError_t err = cudaSuccess;
    size_t arr_size = numElements * sizeof(int);
    size_t hist_size = NUM_RANGE * sizeof(int);
    size_t temp_hist_arr_size = NUM_CUDA_BLOCKS * NUM_CUDA_THREADS * sizeof(int*);

    // Allocate memory on GPU to copy the data array from the host
    int* d_array;
    err = cudaMalloc((void **)&d_array, arr_size);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to allocate device memory for device array - %s\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    // Copy the data array from the host to GPU memory
    err = cudaMemcpy(d_array, h_array, arr_size, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to copy data from host to device for device array - %s\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    // Allocate memory on the GPU for the final histogram
    int* d_hist;
    err = cudaMalloc((void **)&d_hist, hist_size);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to allocate device memory for device hist - %s\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Copy the initial histogram contents from the host to GPU memory
    err = cudaMemcpy(d_hist, h_hist_temp, hist_size, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to copy data from host to device for device hist - %s\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    // Allocate a host-side array of pointers to the per-thread temporary histograms
    int** h_temp_hist_arr = (int**)malloc(temp_hist_arr_size);
    if (h_temp_hist_arr == NULL) {
        fprintf(stderr, "Failed to allocate host memory for temp hist pointer array\n");
        exit(EXIT_FAILURE);
    }

    // Allocate memory on the GPU for each thread's temporary histogram
    for (int i = 0; i < NUM_CUDA_BLOCKS * NUM_CUDA_THREADS; i++)
    {
        err = cudaMalloc((void **)&h_temp_hist_arr[i], hist_size);
        if (err != cudaSuccess) {
            fprintf(stderr, "Failed to allocate device memory for device temp hist arr - %s\n", cudaGetErrorString(err));
            exit(EXIT_FAILURE);
        }
    }
    // Allocate the device-side array of pointers to the temporary histograms
    void* d_temp_hist_arr;
    err = cudaMalloc((void **)&d_temp_hist_arr, temp_hist_arr_size);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to allocate device memory for device temp hist - %s\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Copy the array of device pointers from the host to GPU memory
    err = cudaMemcpy(d_temp_hist_arr, h_temp_hist_arr, temp_hist_arr_size, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to copy data from host to device for temp hist arr - %s\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    // Initialize each thread's temporary histogram on the device
    initTemp<<<NUM_CUDA_BLOCKS, NUM_CUDA_THREADS>>>((int**)d_temp_hist_arr);
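    /*
     * Launch the histogram kernels and copy the result back.
     * This is a minimal sketch of the missing launch section, assuming that
     * buildTemp splits the input evenly across NUM_CUDA_BLOCKS * NUM_CUDA_THREADS
     * threads, that buildHist reduces the per-thread histograms with one thread
     * per bin, and that the caller expects the final histogram in h_hist_temp.
     */
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to launch initTemp kernel - %s\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Build a private histogram per thread
    buildTemp<<<NUM_CUDA_BLOCKS, NUM_CUDA_THREADS>>>((int**)d_temp_hist_arr, d_array);
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to launch buildTemp kernel - %s\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Reduce the private histograms into the final histogram, one thread per bin
    buildHist<<<1, NUM_RANGE>>>(d_hist, (int**)d_temp_hist_arr);
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to launch buildHist kernel - %s\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // The blocking copy on the default stream waits for the kernels to finish,
    // then returns the histogram to the host buffer
    err = cudaMemcpy(h_hist_temp, d_hist, hist_size, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to copy hist from device to host - %s\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }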
    // Free device arrays
    cudaFree(d_array);
    cudaFree(d_hist);
    for (int i = 0; i < NUM_CUDA_BLOCKS * NUM_CUDA_THREADS; i++)
    {
        err = cudaFree(h_temp_hist_arr[i]);
        if (err != cudaSuccess) {
            fprintf(stderr, "Failed to free device memory for device temp hist arr - %s\n", cudaGetErrorString(err));
            exit(EXIT_FAILURE);
        }
    }
    cudaFree(d_temp_hist_arr);
    free(h_temp_hist_arr);

    return 0;
}
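/*
 * Hypothetical usage sketch (not part of the original file): how a caller
 * might drive computeOnGPU, assuming hw3_header.h defines ARR_SIZE and
 * NUM_RANGE. The names h_data and h_hist below are illustrative only.
 *
 *     int h_data[ARR_SIZE];          // filled with values in [0, NUM_RANGE)
 *     int h_hist[NUM_RANGE] = {0};   // must start zeroed; buildHist accumulates
 *     computeOnGPU(h_data, ARR_SIZE, h_hist);
 *     for (int i = 0; i < NUM_RANGE; i++)
 *         printf("%d:[%d]\n", i, h_hist[i]);
 */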
/*
int computeOnGPU(int *data, int numElements) {
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
size_t size = numElements * sizeof(float);
// Allocate memory on GPU to copy the data from the host
int *d_A;
err = cudaMalloc((void **)&d_A, size);
if (err != cudaSuccess) {
fprintf(stderr, "Failed to allocate device memory - %s\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
// Copy data from host to the GPU memory
err = cudaMemcpy(d_A, data, size, cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
fprintf(stderr, "Failed to copy data from host to device - %s\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
// Launch the Kernel
int threadsPerBlock = 256;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
//incrementByOne<<<blocksPerGrid, threadsPerBlock>>>(d_A, numElements);
err = cudaGetLastError();
if (err != cudaSuccess) {
fprintf(stderr, "Failed to launch vectorAdd kernel - %s\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
// Copy the result from GPU to the host memory.
err = cudaMemcpy(data, d_A, size, cudaMemcpyDeviceToHost);
if (err != cudaSuccess) {
fprintf(stderr, "Failed to copy result array from device to host -%s\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
// Free allocated memory on GPU
if (cudaFree(d_A) != cudaSuccess) {
fprintf(stderr, "Failed to free device data - %s\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
return 0;
// Allocate memory on device
int *d_A = NULL;
err = cudaMalloc((void **)&d_A, size);
if (err != cudaSuccess) {
fprintf(stderr, "Error in line %d (error code %s)!\n", __LINE__, cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
int *d_h = NULL;
err = cudaMalloc((void **)&d_h, NUM_RANGE * sizeof(int));
if (err != cudaSuccess) {
fprintf(stderr, "Error in line %d (error code %s)!\n", __LINE__, cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
int *d_temp = NULL;
err = cudaMalloc((void **)&d_temp, NUM_CUDA_THREADS * NUM_RANGE * sizeof(int));
if (err != cudaSuccess) {
fprintf(stderr, "Error in line %d (error code %s)!\n", __LINE__, cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
// Copy the vector A to the device
//err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
fprintf(stderr, "Error in line %d (error code %s)!\n", __LINE__, cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
// Initialize vectors on device
initHist <<< 1 , NUM_RANGE >>> (d_h);
err = cudaGetLastError();
if (err != cudaSuccess) {
fprintf(stderr, "Error in line %d (error code %s)!\n", __LINE__, cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
initTemp <<< 1 , NUM_CUDA_THREADS >>> (d_temp);
err = cudaGetLastError();
if (err != cudaSuccess) {
fprintf(stderr, "Error in line %d (error code %s)!\n", __LINE__, cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
// Build partial histograms for each thread
buildTemp <<< 1, NUM_CUDA_THREADS >>> (d_A, d_temp);
err = cudaGetLastError();
if (err != cudaSuccess) {
fprintf(stderr, "Error in line %d (error code %s)!\n", __LINE__, cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
// Unify the results
buildHist<<< 1, RANGE>>>(d_h, d_temp);
err = cudaGetLastError();
if (err != cudaSuccess) {
fprintf(stderr, "Error in line %d (error code %s)!\n", __LINE__, cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
// Copy the final histogram to the host
err = cudaMemcpy(h_H, d_h, RANGE * sizeof(int), cudaMemcpyDeviceToHost);
if (err != cudaSuccess) {
fprintf(stderr, "Error in line %d (error code %s)!\n", __LINE__, cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
// Verify that the result vector is correct
int result = 0;
for (int i = 0; i < RANGE; i++)
result += h_H[i];
if (result == SIZE)
printf("Test PASSED\n");
else
printf("Test FAILED!!!!!\n");
for(int i=0;i<RANGE;i++)
{
printf("%d:[%d]\n", i, h_H[i]);
}
// Free device global memory
err = cudaFree(d_A);
if (err != cudaSuccess) {
fprintf(stderr, "Error in line %d (error code %s)!\n", __LINE__, cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
err = cudaFree(d_h);
if (err != cudaSuccess) {
fprintf(stderr, "Error in line %d (error code %s)!\n", __LINE__, cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
if (err != cudaSuccess) {
fprintf(stderr, "Error in line %d (error code %s)!\n", __LINE__, cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
return 0;
}*/