// Assignment 1b: GPU skyline/permissible-point filter (CUDA C++).
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include "util.h"
// qsort comparator: orders Points ascending by their first coordinate.
// Returns -1/0/+1 so large float differences cannot overflow an int cast.
int compare(const void* p, const void* q) {
    const float a = ((Point*)p)->values[0];
    const float b = ((Point*)q)->values[0];
    if (a < b) return -1;
    if (a > b) return 1;
    return 0;
}
// Check if the point indexed by the kernel thread is permissible.
// Flags each point as "permissible": flags[index] is true iff no
// lower-indexed point prevails over point `index` (i.e. is <= it in every
// dimension). The host sorts points ascending by dimension 0 first, so only
// earlier points need to be checked.
//
// Launch assumptions: 1D grid, blockDim.x <= 256, dim <= 8 (the static
// shared buffer holds 256 * 8 floats); point_dims is row-major, point i at
// point_dims[i*dim .. i*dim + dim - 1].
__global__ void checkPermissible(float* point_dims, bool* flags, int dim, int number) {
    __shared__ float shared_points[256 * 8]; // threads per block * max dimensions
    const int blockIndex = blockIdx.x;
    const int threadIndex = threadIdx.x;
    const int index = blockIndex * blockDim.x + threadIndex;
    // Out-of-range threads in the grid's last partial block must NOT return
    // before the barrier: __syncthreads() in divergent control flow is
    // undefined behavior. Guard the work instead of early-returning.
    const bool inRange = index < number;
    if (inRange) {
        // Stage this thread's point into its block-local shared-memory slot.
        for (int j = 0; j < dim; j++) {
            shared_points[threadIndex * dim + j] = point_dims[index * dim + j];
        }
    }
    __syncthreads(); // all threads of the block reach this, divergent or not
    if (!inRange) {
        return;
    }
    bool permissible = true;
    for (int i = 0; i < index && permissible; ++i) {
        bool prevails = true;
        // Loop-invariant: which block loaded point i (hoisted out of the dim loop).
        const int i_block = i / blockDim.x;
        for (int j = 0; j < dim && prevails; j++) {
            float point_i_dim_value;
            // If point i lives in this block, another thread already staged it
            // in shared memory; otherwise fall back to global memory.
            if (i_block == blockIndex) {
                point_i_dim_value = shared_points[(i % blockDim.x) * dim + j];
            } else {
                point_i_dim_value = point_dims[i * dim + j];
            }
            if (point_i_dim_value > shared_points[threadIndex * dim + j]) {
                prevails = false; // point i exceeds us in some dimension
            }
        }
        if (prevails) {
            permissible = false; // point i dominates: we are not permissible
        }
    }
    flags[index] = permissible;
}
extern "C" int asgn1b(Point* h_points, Point** pPermissiblePoints, int number, int dim, int gpuid) {
    // h_points           -- input data; sorted in place by first coordinate
    // pPermissiblePoints -- out: newly malloc'd array of permissible points
    // number             -- number of points in the dataset
    // dim                -- dimension of the dataset
    // gpuid              -- the GPU used to run the kernel
    // Returns the number of permissible points (0 on allocation/kernel failure).
    int permissiblePointNum = 0;
    Point* permissiblePoints = (Point*)malloc(sizeof(Point) * number);
    if (permissiblePoints == NULL) {
        fprintf(stderr, "asgn1b: host malloc of %d points failed\n", number);
        *pPermissiblePoints = NULL;
        return 0;
    }
    // Sorting by dimension 0 means each point only needs to be compared
    // against lower-indexed points in the kernel.
    qsort(h_points, number, sizeof(Point), compare);
    cudaSetDevice(gpuid);
    // Managed memory is visible to both host and device: flags plus the
    // flattened (SoA-style, row-major) point coordinates.
    bool* h_flags = NULL;
    float* h_point_dims = NULL;
    cudaError_t err = cudaMallocManaged(&h_flags, sizeof(bool) * number);
    if (err == cudaSuccess) {
        err = cudaMallocManaged(&h_point_dims, sizeof(float) * dim * number);
    }
    if (err != cudaSuccess) {
        fprintf(stderr, "asgn1b: cudaMallocManaged failed: %s\n", cudaGetErrorString(err));
        cudaFree(h_point_dims);
        cudaFree(h_flags);
        free(permissiblePoints);
        *pPermissiblePoints = NULL;
        return 0;
    }
    // Flatten the array-of-structs Point layout into a dense float array.
    for (int i = 0; i < number; ++i) {
        h_flags[i] = false;
        for (int j = 0; j < dim; ++j) {
            h_point_dims[i * dim + j] = h_points[i].values[j];
        }
    }
    // One thread per point; ceil-divide so a partial last block covers the tail.
    const int BLOCK_SIZE = 256;
    const int NUM_BLOCKS = (number + BLOCK_SIZE - 1) / BLOCK_SIZE;
    checkPermissible<<<NUM_BLOCKS, BLOCK_SIZE>>>(h_point_dims, h_flags, dim, number);
    err = cudaGetLastError();           // catches launch-configuration errors
    if (err == cudaSuccess) {
        err = cudaDeviceSynchronize();  // wait for the kernel; surfaces execution errors
    }
    if (err != cudaSuccess) {
        fprintf(stderr, "asgn1b: kernel failed: %s\n", cudaGetErrorString(err));
    } else {
        // Collect the permissible points in sorted order.
        for (int i = 0; i < number; ++i) {
            if (h_flags[i]) {
                permissiblePoints[permissiblePointNum++] = h_points[i];
            }
        }
    }
    // Cleanup CUDA memory.
    cudaFree(h_point_dims);
    cudaFree(h_flags);
    *pPermissiblePoints = permissiblePoints;
    return permissiblePointNum;
}