#include <mpi.h>
#include <stdio.h>
#include <omp.h>
#include <time.h>
#include <stdlib.h>
#include <cuda_runtime.h>

#define SIZE 300000     // Size of the input array
#define RANGE 256       // Range of the numbers in the array (number of histogram bins)
#define NUM_BLOCKS 10   // Number of CUDA blocks
#define NUM_THREADS 20  // Number of threads per CUDA block

// Function to handle CUDA errors
void cudaExceptionHandler(cudaError_t err, const char *customMessage)
{
    if (err != cudaSuccess)
    {
        fprintf(stderr, "CUDA error: %s %s\n", customMessage, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

// Function to allocate memory on the device
void allocateDeviceMemory(int **deviceData, int **deviceHist, int dataSize)
{
    cudaExceptionHandler(cudaMalloc((void **)deviceData, dataSize * sizeof(int)), "Failed to allocate device memory");
    cudaExceptionHandler(cudaMalloc((void **)deviceHist, RANGE * sizeof(int)), "Failed to allocate device memory");
}

// Function to copy data from host to device
void copyDataToDevice(int *deviceData, int *data, int dataSize)
{
    cudaExceptionHandler(cudaMemcpy(deviceData, data, dataSize * sizeof(int), cudaMemcpyHostToDevice), "Failed to copy data from host to device");
}

// Function to copy the histogram result from device to host
void copyHistogramResult(int *histogram, int *deviceHist)
{
    cudaExceptionHandler(cudaMemcpy(histogram, deviceHist, RANGE * sizeof(int), cudaMemcpyDeviceToHost), "Failed to copy the histogram result from the device to the host");
}

// CUDA kernel to initialize the histogram array with zeros
__global__ void initArray(int *deviceHist)
{
    deviceHist[threadIdx.x] = 0;
}

// CUDA kernel to calculate the histogram
__global__ void histogramKernel(int *deviceData, int *deviceHist, int dataSize)
{
    // Index of the current thread across the whole grid
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    // Stride each thread uses to jump to the next chunk of data (grid-stride loop)
    int next = blockDim.x * gridDim.x;
    while (tid < dataSize)
    {
        atomicAdd(&(deviceHist[deviceData[tid]]), 1); // Increment the corresponding histogram bin atomically
        tid += next;                                  // Jump to the next chunk of data
    }
}

// Function to free device memory
void freeDeviceMemory(int *deviceData, int *deviceHist)
{
    cudaExceptionHandler(cudaFree(deviceData), "Failed to free memory on the device");
    cudaExceptionHandler(cudaFree(deviceHist), "Failed to free memory on the device");
}

// Function to allocate memory for an integer array on the host
int *allocateIntArray(int size)
{
    int *array = (int *)malloc(size * sizeof(int));
    if (array == NULL)
    {
        printf("Memory allocation failed\n");
        MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER);
    }
    return array;
}

// Function to allocate memory for an integer array on the host, initialized with zeros
int *allocateCallocArray(int size)
{
    int *array = (int *)calloc(size, sizeof(int));
    if (array == NULL)
    {
        printf("Memory allocation failed\n");
        MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER);
    }
    return array;
}

int main(int argc, char **argv)
{
    int *h_data;               // host array filled with numbers on rank 0 and scattered to the 2 processes
    int *h_result;             // host array holding this process's histogram result, copied back from the GPU
    int *scatterd_data_part;   // the part of the input data each MPI process receives after scattering
    int *d_Data;               // device copy of this process's data
    int *d_Hist;               // device histogram
    int *global_result = NULL; // final reduced histogram (allocated on rank 0 only)
    int rank, size, split_size;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    if (size != 2)
    {
        // Ensure there are exactly two MPI processes
        printf("Run the example with two processes only\n");
        MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER); // MPI_ERR_OTHER is a more meaningful error code than __LINE__
    }
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    split_size = SIZE / size; // Split size for scattering (150,000 elements per process)

    scatterd_data_part = allocateIntArray(split_size); // Each process's part of the data
    h_result = allocateIntArray(RANGE);                // Array to store this process's histogram result
    h_data = allocateIntArray(SIZE);                   // Full input data (filled on rank 0)

    allocateDeviceMemory(&d_Data, &d_Hist, split_size); // Allocate memory on the device

    if (rank == 0)
    {
        global_result = allocateCallocArray(RANGE); // Allocate memory for the final result

        // Initialize the input data. Note: 4 OpenMP threads are used, as required.
        // rand() is not thread-safe, so each thread uses rand_r() with its own fixed seed.
#pragma omp parallel num_threads(4)
        {
            unsigned int seed = 1234u + (unsigned int)omp_get_thread_num();
#pragma omp for
            for (int i = 0; i < SIZE; i++)
                h_data[i] = rand_r(&seed) % RANGE;
        }
    }

    // Initialize the histogram array on the device with zeros
    initArray<<<1, RANGE>>>(d_Hist);
    cudaExceptionHandler(cudaGetLastError(), "Failed to initialize the histogram array on the device with zeros");

    // Scatter the data among the processes
    MPI_Scatter(h_data, split_size, MPI_INT, scatterd_data_part, split_size, MPI_INT, 0, MPI_COMM_WORLD);

    // Copy this process's part of the data from host to device (must happen after the scatter)
    copyDataToDevice(d_Data, scatterd_data_part, split_size);

    // Calculate the histogram on the GPU
    histogramKernel<<<NUM_BLOCKS, NUM_THREADS>>>(d_Data, d_Hist, split_size);
    cudaExceptionHandler(cudaGetLastError(), "Failed to launch the kernel, error code:");

    // Copy the histogram result from device to host
    copyHistogramResult(h_result, d_Hist);

    // Reduce the histograms from all processes to get the global result
    MPI_Reduce(h_result, global_result, RANGE, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);

    // Print the global histogram result on rank 0
    if (rank == 0)
    {
        for (int i = 0; i < RANGE; i++)
            printf("Histogram[%d]: %d\n", i, global_result[i]);
        free(global_result);
    }

    // Free allocated memory on host and device
    free(h_data);
    free(scatterd_data_part);
    free(h_result);
    freeDeviceMemory(d_Data, d_Hist);

    MPI_Finalize();
    return 0;
}
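To build and run the program above on a typical Linux setup (a sketch only: it assumes the source is saved as task3.cu and that nvcc plus an MPI compiler wrapper such as mpicxx are installed; exact flags vary by installation), something like nvcc -ccbin mpicxx -Xcompiler -fopenmp task3.cu -o task3 followed by mpirun -np 2 ./task3 should work. The program aborts unless exactly two MPI processes are launched, and rank 0 prints the 256 histogram bins.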
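As a possible follow-up (a sketch only, not part of the program above): the global-memory atomics in histogramKernel can become a contention point, and a common CUDA pattern is to accumulate into a per-block shared-memory histogram first and merge it into the global histogram once per block. The kernel name histogramKernelShared below is introduced here for illustration; it assumes the same RANGE of 256 bins, relies on initArray having zeroed the global histogram, and could be launched with the same configuration as histogramKernel.

// Sketch of a shared-memory variant of histogramKernel (hypothetical name)
__global__ void histogramKernelShared(const int *deviceData, int *deviceHist, int dataSize)
{
    __shared__ int localHist[RANGE]; // one private histogram per block

    // Zero the shared histogram cooperatively
    for (int i = threadIdx.x; i < RANGE; i += blockDim.x)
        localHist[i] = 0;
    __syncthreads();

    // Grid-stride loop, as in histogramKernel, but the atomics hit shared memory
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    while (tid < dataSize)
    {
        atomicAdd(&localHist[deviceData[tid]], 1);
        tid += stride;
    }
    __syncthreads();

    // Merge this block's histogram into the global one
    for (int i = threadIdx.x; i < RANGE; i += blockDim.x)
        atomicAdd(&deviceHist[i], localHist[i]);
}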