#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include <cuda_runtime.h>

#define SIZE 300000     // Size of the input array
#define RANGE 256       // Range of the values in the array (number of histogram bins)
#define NUM_BLOCKS 10   // Number of CUDA blocks
#define NUM_THREADS 20  // Number of threads per CUDA block
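/*
 * Overview: rank 0 fills an array of SIZE random values in [0, RANGE) using 4 OpenMP
 * threads, the data is scattered across the 2 MPI processes, each process computes a
 * partial histogram on the GPU, and the partial histograms are summed on rank 0 with
 * MPI_Reduce.
 */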
// Function to handle CUDA errors
void cudaExceptionHandler(cudaError_t err, const char *customMessage) {
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s %s\n", customMessage, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}
// Function to allocate memory on the device
void allocateDeviceMemory(int **deviceData, int **deviceHist, int dataSize) {
    cudaExceptionHandler(cudaMalloc((void **)deviceData, dataSize * sizeof(int)),
                         "Failed to allocate device memory for the data array");
    cudaExceptionHandler(cudaMalloc((void **)deviceHist, RANGE * sizeof(int)),
                         "Failed to allocate device memory for the histogram");
}
// Function to copy data from host to device
void copyDataToDevice(int *deviceData, int *data, int dataSize) {
    cudaExceptionHandler(cudaMemcpy(deviceData, data, dataSize * sizeof(int), cudaMemcpyHostToDevice),
                         "Failed to copy data from host to device");
}
// Function to copy the histogram result from device to host
void copyHistogramResult(int *histogram, int *deviceHist) {
    cudaExceptionHandler(cudaMemcpy(histogram, deviceHist, RANGE * sizeof(int), cudaMemcpyDeviceToHost),
                         "Failed to copy the histogram result from the device to the host");
}
// CUDA kernel to initialize the histogram array with zeros
__global__ void initArray(int *deviceHist) {
    deviceHist[threadIdx.x] = 0;
}
// CUDA kernel to calculate the histogram using a grid-stride loop
__global__ void histogramKernel(int *deviceData, int *deviceHist, int dataSize)
{
    // Global index of the current thread
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    // Stride: the total number of threads in the grid, so each thread jumps to the next chunk of data
    int next = blockDim.x * gridDim.x;
    while (tid < dataSize)
    {
        atomicAdd(&(deviceHist[deviceData[tid]]), 1); // Increment the corresponding histogram bin atomically
        tid += next;                                  // Jump to the next chunk of data
    }
}
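/*
 * Note: with NUM_BLOCKS * NUM_THREADS = 200 threads and split_size = 150000 elements per
 * process, each thread handles 750 elements through the grid-stride loop above; atomicAdd
 * keeps the shared bin counters consistent across threads.
 */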
// Function to free device memory
void freeDeviceMemory(int *deviceData, int *deviceHist) {
    cudaExceptionHandler(cudaFree(deviceData), "Failed to free the data memory on the device");
    cudaExceptionHandler(cudaFree(deviceHist), "Failed to free the histogram memory on the device");
}
// Function to allocate memory for an integer array on the host
int *allocateMemoryArray(int size, int mallocOrCalloc) {
    int *array = (int *)(mallocOrCalloc == 0 ? calloc(size, sizeof(int)) : malloc(size * sizeof(int)));
    if (array == NULL) {
        printf("Memory allocation failed\n");
        MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER);
    }
    return array;
}
int main(int argc, char **argv) {
    int *h_data;             // Host array filled with numbers on rank 0 and scattered to the two processes
    int *h_result;           // Host array that holds each process's histogram result copied back from the GPU
    int *scatterd_data_part; // The part of the input data that each MPI process receives after scattering
    int *d_Data;
    int *d_Hist;
    int *global_result = NULL; // Final histogram; only allocated and used on rank 0
    int rank, size, split_size;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    if (size != 2) { // Ensure there are exactly two MPI processes
        printf("Run the example with two processes only\n");
        MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER); // MPI_ERR_OTHER is a more meaningful error code than __LINE__
    }
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    split_size = SIZE / size;                                // Split size for scattering (150 000 elements per process)
    scatterd_data_part = allocateMemoryArray(split_size, 1); // Each process's part of the data to loop over
    h_result = allocateMemoryArray(RANGE, 1);                // Array to store the per-process histogram result
    h_data = allocateMemoryArray(SIZE, 1);                   // Full input data to count
    allocateDeviceMemory(&d_Data, &d_Hist, split_size);      // Allocate memory on the device
    if (rank == 0) {
        global_result = allocateMemoryArray(RANGE, 0); // Zero-initialized array for the final result
        // Note: 4 OpenMP threads are used here, as required.
        #pragma omp parallel num_threads(4)
        {
            unsigned int seed = 1234u + omp_get_thread_num(); // Per-thread seed; rand() is not guaranteed to be thread-safe
            #pragma omp for
            for (int i = 0; i < SIZE; i++)
                h_data[i] = rand_r(&seed) % RANGE; // Initialize the input data in parallel
        }
    }
    initArray<<<1, RANGE>>>(d_Hist); // Initialize the histogram array on the device with zeros
    cudaExceptionHandler(cudaGetLastError(), "Failed to initialize the histogram array on the device with zeros");

    // Scatter the data among the processes
    MPI_Scatter(h_data, split_size, MPI_INT, scatterd_data_part, split_size, MPI_INT, 0, MPI_COMM_WORLD);
    // Copy this process's part of the data from host to device
    copyDataToDevice(d_Data, scatterd_data_part, split_size);

    // Calculate the histogram on the GPU
    histogramKernel<<<NUM_BLOCKS, NUM_THREADS>>>(d_Data, d_Hist, split_size);
    cudaExceptionHandler(cudaGetLastError(), "Failed to launch kernel");

    // Copy the histogram result from device to host (the synchronous cudaMemcpy also waits for the kernel)
    copyHistogramResult(h_result, d_Hist);

    // Reduce the per-process histograms to get the global result on rank 0
    MPI_Reduce(h_result, global_result, RANGE, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
    // Print the global histogram result on rank 0
    if (rank == 0) {
        for (int i = 0; i < RANGE; i++)
            printf("Histogram[%d]: %d\n", i, global_result[i]);
        free(global_result);
    }

    // Free the remaining allocated memory on host & device
    free(h_data);
    free(scatterd_data_part);
    free(h_result);
    freeDeviceMemory(d_Data, d_Hist);

    MPI_Finalize();
    return 0;
}
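/*
 * Build/run sketch (the file name "task3.cu" and the CUDA library path are assumptions,
 * adjust them to your environment):
 *   nvcc -c -Xcompiler -fopenmp task3.cu -o task3.o
 *   mpicxx task3.o -o task3 -fopenmp -L/usr/local/cuda/lib64 -lcudart
 *   mpirun -np 2 ./task3
 */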