// Assignment 1b: GPU skyline/permissible-point filter (CUDA C++).
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include "util.h"
// qsort comparator: orders Points ascending by their first coordinate.
// Returns -1/0/+1 so large float differences cannot overflow an int cast.
int compare(const void* p, const void* q) {
    const float a = ((Point*)p)->values[0];
    const float b = ((Point*)q)->values[0];
    if (a < b) return -1;
    if (a > b) return 1;
    return 0;
}
// Check if the point indexed by the kernel thread is permissible.
// Flags each point as "permissible": flags[index] is true iff no
// lower-indexed point prevails over point `index` (i.e. is <= it in every
// dimension). The host sorts points ascending by dimension 0 first, so only
// earlier points need to be checked.
//
// Launch assumptions: 1D grid, blockDim.x <= 256, dim <= 8 (the static
// shared buffer holds 256 * 8 floats); point_dims is row-major, point i at
// point_dims[i*dim .. i*dim + dim - 1].
__global__ void checkPermissible(float* point_dims, bool* flags, int dim, int number) {
    __shared__ float shared_points[256 * 8]; // threads per block * max dimensions
    const int blockIndex = blockIdx.x;
    const int threadIndex = threadIdx.x;
    const int index = blockIndex * blockDim.x + threadIndex;
    // Out-of-range threads in the grid's last partial block must NOT return
    // before the barrier: __syncthreads() in divergent control flow is
    // undefined behavior. Guard the work instead of early-returning.
    const bool inRange = index < number;
    if (inRange) {
        // Stage this thread's point into its block-local shared-memory slot.
        for (int j = 0; j < dim; j++) {
            shared_points[threadIndex * dim + j] = point_dims[index * dim + j];
        }
    }
    __syncthreads(); // all threads of the block reach this, divergent or not
    if (!inRange) {
        return;
    }
    bool permissible = true;
    for (int i = 0; i < index && permissible; ++i) {
        bool prevails = true;
        // Loop-invariant: which block loaded point i (hoisted out of the dim loop).
        const int i_block = i / blockDim.x;
        for (int j = 0; j < dim && prevails; j++) {
            float point_i_dim_value;
            // If point i lives in this block, another thread already staged it
            // in shared memory; otherwise fall back to global memory.
            if (i_block == blockIndex) {
                point_i_dim_value = shared_points[(i % blockDim.x) * dim + j];
            } else {
                point_i_dim_value = point_dims[i * dim + j];
            }
            if (point_i_dim_value > shared_points[threadIndex * dim + j]) {
                prevails = false; // point i exceeds us in some dimension
            }
        }
        if (prevails) {
            permissible = false; // point i dominates: we are not permissible
        }
    }
    flags[index] = permissible;
}
extern "C" int asgn1b(Point* h_points, Point** pPermissiblePoints, int number, int dim, int gpuid) {
    // h_points           -- input data; sorted in place by first coordinate
    // pPermissiblePoints -- out: newly malloc'd array of permissible points
    // number             -- number of points in the dataset
    // dim                -- dimension of the dataset
    // gpuid              -- the GPU used to run the kernel
    // Returns the number of permissible points (0 on allocation/kernel failure).
    int permissiblePointNum = 0;
    Point* permissiblePoints = (Point*)malloc(sizeof(Point) * number);
    if (permissiblePoints == NULL) {
        fprintf(stderr, "asgn1b: host malloc of %d points failed\n", number);
        *pPermissiblePoints = NULL;
        return 0;
    }
    // Sorting by dimension 0 means each point only needs to be compared
    // against lower-indexed points in the kernel.
    qsort(h_points, number, sizeof(Point), compare);
    cudaSetDevice(gpuid);
    // Managed memory is visible to both host and device: flags plus the
    // flattened (SoA-style, row-major) point coordinates.
    bool* h_flags = NULL;
    float* h_point_dims = NULL;
    cudaError_t err = cudaMallocManaged(&h_flags, sizeof(bool) * number);
    if (err == cudaSuccess) {
        err = cudaMallocManaged(&h_point_dims, sizeof(float) * dim * number);
    }
    if (err != cudaSuccess) {
        fprintf(stderr, "asgn1b: cudaMallocManaged failed: %s\n", cudaGetErrorString(err));
        cudaFree(h_point_dims);
        cudaFree(h_flags);
        free(permissiblePoints);
        *pPermissiblePoints = NULL;
        return 0;
    }
    // Flatten the array-of-structs Point layout into a dense float array.
    for (int i = 0; i < number; ++i) {
        h_flags[i] = false;
        for (int j = 0; j < dim; ++j) {
            h_point_dims[i * dim + j] = h_points[i].values[j];
        }
    }
    // One thread per point; ceil-divide so a partial last block covers the tail.
    const int BLOCK_SIZE = 256;
    const int NUM_BLOCKS = (number + BLOCK_SIZE - 1) / BLOCK_SIZE;
    checkPermissible<<<NUM_BLOCKS, BLOCK_SIZE>>>(h_point_dims, h_flags, dim, number);
    err = cudaGetLastError();           // catches launch-configuration errors
    if (err == cudaSuccess) {
        err = cudaDeviceSynchronize();  // wait for the kernel; surfaces execution errors
    }
    if (err != cudaSuccess) {
        fprintf(stderr, "asgn1b: kernel failed: %s\n", cudaGetErrorString(err));
    } else {
        // Collect the permissible points in sorted order.
        for (int i = 0; i < number; ++i) {
            if (h_flags[i]) {
                permissiblePoints[permissiblePointNum++] = h_points[i];
            }
        }
    }
    // Cleanup CUDA memory.
    cudaFree(h_point_dims);
    cudaFree(h_flags);
    *pPermissiblePoints = permissiblePoints;
    return permissiblePointNum;
}