Untitled
unknown
c_cpp
5 months ago
3.8 kB
7
Indexable
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#include "util.h"

// qsort comparator: orders Points ascending by their first coordinate
// (values[0]). Returns -1, 0 or 1.
//
// NOTE(review): only coordinate 0 is compared, so the relative order of points
// that tie on it is unspecified. checkPermissible only compares a point against
// points sorted before it; if a point can be prevailed over by one that ties on
// coordinate 0 but was sorted after it, that case would be missed -- confirm
// against the intended definition of "prevails".
int compare(const void* p, const void* q) {
    const float* val_p = ((const Point*)p)->values;
    const float* val_q = ((const Point*)q)->values;
    return (val_p[0] > val_q[0]) - (val_p[0] < val_q[0]);
}

// Check if the point indexed by the kernel thread is permissible.
//
// Launch layout: 1D grid of 1D blocks, one thread per point; the grid must
// cover at least `number` threads.
//
//   point_dims  row-major coordinates, point i occupies [i * dim, (i + 1) * dim)
//   flags       output, flags[i] is set to true iff point i is permissible
//   dim         number of coordinates per point
//   number      number of points
//
// Point `index` is flagged not permissible as soon as some earlier point i
// (i < index) has every coordinate <= the corresponding coordinate of point
// `index`. Point 0 has no predecessors and is therefore always permissible.
//
// Shared memory: a static buffer of 256 * 8 floats caches the block's own
// points. It is only used when blockDim.x * dim fits in it; otherwise every
// read goes to global memory, so the kernel stays correct for any dim and
// block size.
__global__ void checkPermissible(float* point_dims, bool* flags, int dim, int number) {
    // threads per block * dimensions (tests have dim <= 7)
    const int SHARED_POINT_FLOATS = 256 * 8;
    __shared__ float shared_points[SHARED_POINT_FLOATS];

    const int blockIndex = blockIdx.x;
    const int threadIndex = threadIdx.x;
    const int index = blockIndex * blockDim.x + threadIndex;
    const int block_first = blockIndex * blockDim.x;  // global index of this block's thread 0

    const bool in_range = index < number;
    // Uniform across the block: does the whole block's data fit in the cache?
    const bool use_shared =
        dim > 0 && (size_t)blockDim.x * (size_t)dim <= (size_t)SHARED_POINT_FLOATS;

    // Load this thread's point into shared memory (block-local index).
    if (in_range && use_shared) {
        for (int j = 0; j < dim; j++) {
            shared_points[threadIndex * dim + j] = point_dims[(size_t)index * dim + j];
        }
    }

    // Every thread of the block must reach the barrier, including the
    // out-of-range threads of the last block, so they only exit afterwards.
    __syncthreads();
    if (!in_range) {
        return;
    }

    const float* self = use_shared ? &shared_points[threadIndex * dim]
                                   : &point_dims[(size_t)index * dim];

    bool permissible = true;
    for (int i = 0; i < index && permissible; ++i) {
        // i < index, so i >= block_first means point i belongs to this block
        // and another thread of the block has loaded it into shared memory.
        // Otherwise read it from global memory.
        const float* other = (use_shared && i >= block_first)
                                 ? &shared_points[(i - block_first) * dim]
                                 : &point_dims[(size_t)i * dim];

        bool prevails = true;
        for (int j = 0; j < dim && prevails; j++) {
            if (other[j] > self[j]) {
                prevails = false;
            }
        }
        if (prevails) {
            permissible = false;
        }
    }

    flags[index] = permissible;
}

// Computes the permissible points of a dataset on the GPU.
//
//   h_points            input data; sorted IN PLACE by first coordinate
//   pPermissiblePoints  receives a malloc'ed array holding the answer
//                       (NULL if the input is empty or host allocation fails)
//   number              number of points in dataset
//   dim                 the dimension of the dataset
//   gpuid               the gpu used to run the program
//
// Returns the number of permissible points written to *pPermissiblePoints.
// Returns 0 for empty/invalid input or when a CUDA call fails (the error is
// reported on stderr); a successful run with number > 0 always yields at
// least one point, because the first sorted point has no predecessor.
extern "C" int asgn1b(Point* h_points, Point** pPermissiblePoints, int number, int dim, int gpuid) {
    *pPermissiblePoints = NULL;
    if (number <= 0 || dim <= 0) {
        return 0;
    }

    Point* permissiblePoints = (Point*)malloc(sizeof(Point) * (size_t)number);
    if (permissiblePoints == NULL) {
        fprintf(stderr, "asgn1b: failed to allocate host memory for the result\n");
        return 0;
    }

    qsort(h_points, number, sizeof(Point), compare);

    const size_t POINT_BYTES = sizeof(float) * (size_t)dim * (size_t)number;
    const size_t FLAG_BYTES = sizeof(bool) * (size_t)number;

    // Managed (unified) buffers: filled on the host, read/written by the
    // kernel, read back on the host after synchronization.
    bool* flags = NULL;
    float* point_dims = NULL;
    int permissiblePointNum = 0;

    cudaError_t status = cudaSetDevice(gpuid);
    if (status == cudaSuccess) {
        status = cudaMallocManaged(&flags, FLAG_BYTES);
    }
    if (status == cudaSuccess) {
        status = cudaMallocManaged(&point_dims, POINT_BYTES);
    }

    if (status == cudaSuccess) {
        // Flatten the points into a contiguous row-major float array.
        for (int i = 0; i < number; ++i) {
            flags[i] = false;
            for (int j = 0; j < dim; ++j) {
                point_dims[(size_t)i * dim + j] = h_points[i].values[j];
            }
        }

        // Execute the kernel: one thread per point.
        const int BLOCK_SIZE = 256;
        const int NUM_BLOCKS = number / BLOCK_SIZE + (number % BLOCK_SIZE != 0 ? 1 : 0);
        checkPermissible<<<NUM_BLOCKS, BLOCK_SIZE>>>(point_dims, flags, dim, number);

        status = cudaGetLastError();  // launch-configuration errors
        if (status == cudaSuccess) {
            status = cudaDeviceSynchronize();  // wait for the kernel; surfaces execution errors
        }
    }

    if (status == cudaSuccess) {
        // Collect the permissible points.
        for (int i = 0; i < number; ++i) {
            if (flags[i]) {
                permissiblePoints[permissiblePointNum++] = h_points[i];
            }
        }
    } else {
        fprintf(stderr, "asgn1b: CUDA error: %s\n", cudaGetErrorString(status));
    }

    // Cleanup CUDA memory (cudaFree on a null pointer is a no-op).
    cudaFree(point_dims);
    cudaFree(flags);

    *pPermissiblePoints = permissiblePoints;
    return permissiblePointNum;
}
Editor is loading...
Leave a Comment