// task 3
// mail@pastecode.io avatar
// 22 days ago
// 5.8 kB
#include <mpi.h>
#include <stdio.h>
#include <omp.h>
#include <time.h>
#include <stdlib.h>

#define SIZE  300000       // Size of the array
#define RANGE  256         // Range of the numbers in the array
#define NUM_BLOCKS 10      // Number of CUDA blocks
#define NUM_THREADS 20     // Number of threads per CUDA block

// Report a failed CUDA runtime call on stderr.
//   err           - status returned by a CUDA runtime call or by cudaGetLastError()
//   customMessage - caller-supplied context printed before the CUDA error string
// Nothing is printed when err == cudaSuccess.
// NOTE(review): this handler only reports; it returns normally after printing, so
// the caller keeps running after a failed CUDA call. Confirm that is intended.
void cudaExceptionHandler(cudaError_t err, const char *customMessage) {
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s %s\n", customMessage, cudaGetErrorString(err));
    }
}

// Allocate the two device buffers used by the histogram computation.
//   deviceData - out: receives a device buffer holding dataSize ints
//   deviceHist - out: receives a device buffer holding RANGE ints
//   dataSize   - number of ints to reserve for *deviceData
// Each cudaMalloc status is passed to cudaExceptionHandler. The buffers are not
// initialised here.
void allocateDeviceMemory(int **deviceData, int **deviceHist, int dataSize) {
    cudaExceptionHandler(cudaMalloc((void **)deviceData, dataSize * sizeof(int)),"Failed to allocate device memory");
    cudaExceptionHandler(cudaMalloc((void **)deviceHist, RANGE * sizeof(int)),"Failed to allocate device memory");
}

// Copy dataSize ints from the host array `data` into the device buffer `deviceData`.
// The cudaMemcpy status is passed to cudaExceptionHandler.
//   deviceData - destination device buffer (at least dataSize ints)
//   data       - source host array (at least dataSize ints)
//   dataSize   - number of ints to copy
void copyDataToDevice(int *deviceData, int *data, int dataSize) {
    cudaExceptionHandler(cudaMemcpy(deviceData, data, dataSize * sizeof(int), cudaMemcpyHostToDevice),"Failed to copy data from host to device");
}

// Copy the RANGE histogram bins from the device buffer `deviceHist` into the host
// array `histogram`. The cudaMemcpy status is passed to cudaExceptionHandler.
//   histogram  - destination host array (at least RANGE ints)
//   deviceHist - source device buffer (at least RANGE ints)
void copyHistogramResult(int *histogram, int *deviceHist) {
    cudaExceptionHandler(cudaMemcpy(histogram, deviceHist, RANGE * sizeof(int), cudaMemcpyDeviceToHost),"Failed to Copy the histogram result from the device to the host");
}

// CUDA kernel that sets the RANGE histogram bins in deviceHist to zero.
// Launch with at least RANGE threads in total (for example <<<1, RANGE>>>).
// Each thread clears the bin matching its global index; threads whose index is
// RANGE or more do nothing, so an oversized launch cannot write past the buffer.
__global__ void initArray(int *deviceHist) {
    int bin = blockIdx.x * blockDim.x + threadIdx.x;
    if (bin < RANGE) {
        deviceHist[bin] = 0;
    }
}

// CUDA kernel that counts how often each value occurs in deviceData.
//   deviceData - device buffer with dataSize input values
//   deviceHist - device buffer with RANGE bins; must be zeroed before the launch
//   dataSize   - number of elements in deviceData
// Works for any 1D grid/block size: each thread starts at its global index and
// advances by the total number of launched threads until it passes dataSize.
__global__ void histogramKernel(int *deviceData, int *deviceHist, int dataSize) {
    // Total number of threads in the grid = distance between two elements
    // handled by the same thread.
    int stride = blockDim.x * gridDim.x;

    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < dataSize; i += stride) {
        int bin = deviceData[i];
        // Skip values outside [0, RANGE): they would index past the histogram buffer.
        if (bin >= 0 && bin < RANGE) {
            atomicAdd(&(deviceHist[bin]), 1); // several threads may hit the same bin
        }
    }
}

// Release the two device buffers with cudaFree.
// Each cudaFree status is passed to cudaExceptionHandler.
//   deviceData - device buffer holding the input values
//   deviceHist - device buffer holding the histogram bins
void freeDeviceMemory(int *deviceData, int *deviceHist) {
    cudaExceptionHandler(cudaFree(deviceData),"Failed Free memory on the device-");
    cudaExceptionHandler(cudaFree(deviceHist),"Failed Free memory on the device-");
}

// Allocate an array of `size` ints on the host.
//   size           - number of ints to allocate
//   mallocOrCalloc - 0: use calloc (elements are zero-initialised);
//                    any other value: use malloc (elements are uninitialised)
// Returns the new array, or NULL when the allocation fails. A failure is also
// reported on stderr. The caller owns the memory and releases it with free().
int *allocateMemoryArray(int size, int mallocOrCalloc) {
    int *array = (int *)(mallocOrCalloc == 0 ? calloc(size, sizeof(int)) : malloc(size * sizeof(int)));
    if (array == NULL) {
        fprintf(stderr, "Memory allocation failed\n");
    }
    return array;
}

// Program entry point: builds a RANGE-bin histogram of SIZE values with exactly
// two MPI processes, each counting its share of the data on the GPU.
//
// Flow:
//   1. rank 0 fills h_data with values in [0, RANGE) using 4 OpenMP threads;
//   2. MPI_Scatter gives every rank split_size values;
//   3. every rank uploads its part, zeroes the device histogram and runs
//      histogramKernel;
//   4. MPI_Reduce sums the per-rank histograms onto rank 0, which prints them.
// Returns 0 after MPI_Finalize.
int main(int argc, char **argv) {
    int *h_data = NULL;             // full input array; allocated and filled on rank 0 only
    int *h_result = NULL;           // this rank's histogram, copied back from the GPU
    int *scatterd_data_part = NULL; // the slice of the input this rank receives
    int *global_result = NULL;      // summed histogram; allocated on rank 0 only

    int *d_Data = NULL;
    int *d_Hist = NULL;

    int rank, size, split_size;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    if (size != 2) { // Ensure there are exactly two MPI processes
        printf("Run the example with two processes only\n");
        MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER);
    }
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    split_size = SIZE / size; // number of elements handled by each process

    scatterd_data_part = allocateMemoryArray(split_size, 1);
    h_result = allocateMemoryArray(RANGE, 1);
    if (scatterd_data_part == NULL || h_result == NULL) {
        // allocateMemoryArray has already reported the failure
        MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER);
    }

    if (rank == 0) {
        // Only the root supplies the send buffer of MPI_Scatter and receives the
        // result of MPI_Reduce, so only the root allocates them.
        h_data = allocateMemoryArray(SIZE, 1);
        global_result = allocateMemoryArray(RANGE, 0);
        if (h_data == NULL || global_result == NULL) {
            MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER);
        }

        // Note : here we used 4 cores as required.
        // rand() keeps hidden shared state and is not required to be thread-safe,
        // so every thread advances its own small linear congruential generator.
        #pragma omp parallel num_threads(4)
        {
            unsigned int state = ((unsigned int)omp_get_thread_num() + 1u) * 2654435761u;
            #pragma omp for
            for (int i = 0; i < SIZE; i++) {
                state = state * 1103515245u + 12345u;
                h_data[i] = (int)((state >> 16) % RANGE); // drop the weak low bits
            }
        }
    }

    // Hand each process its slice of the input.
    MPI_Scatter(h_data, split_size, MPI_INT, scatterd_data_part, split_size, MPI_INT, 0, MPI_COMM_WORLD);

    // Upload this rank's slice only after it has been received.
    allocateDeviceMemory(&d_Data, &d_Hist, split_size);
    copyDataToDevice(d_Data, scatterd_data_part, split_size);

    initArray<<<1, RANGE>>>(d_Hist); // Initialize the histogram array on the device with zeros
    cudaExceptionHandler(cudaGetLastError(),"Failed to Initialize the histogram array on the device with zeros");

    // Calculate histogram on GPU
    histogramKernel<<<NUM_BLOCKS, NUM_THREADS>>>(d_Data, d_Hist, split_size);
    cudaExceptionHandler(cudaGetLastError(),"Failed to launch kernel error code : ");

    // Copy histogram result from device to host
    copyHistogramResult(h_result, d_Hist);

    // Reduce histograms from all processes to get the global result
    // (global_result is only used on rank 0).
    MPI_Reduce(h_result, global_result, RANGE, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);

    // Print global histogram result on rank 0
    if (rank == 0) {
        for (int i = 0; i < RANGE; i++) {
            printf("Histogram[%d]: %d\n", i, global_result[i]);
        }
    }

    // Free allocated memory in Host & Device
    freeDeviceMemory(d_Data, d_Hist);
    free(scatterd_data_part);
    free(h_result);
    free(h_data);        // NULL on rank 1; free(NULL) is a no-op
    free(global_result); // NULL on rank 1

    MPI_Finalize();
    return 0;
}
// Leave a Comment