cuda
unknown
c_cpp
a year ago
2.8 kB
22
Indexable
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include "timer.h"
#include "files.h"
#define SOFTENING 1e-9f
typedef struct { float x, y, z, vx, vy, vz; } Body;
__global__
void bodyForceKernel(Body *p, float dt, int n) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i >= n) return;
float Fx = 0.0f, Fy = 0.0f, Fz = 0.0f;
for (int j = 0; j < n; j++) {
float dx = p[j].x - p[i].x;
float dy = p[j].y - p[i].y;
float dz = p[j].z - p[i].z;
float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING;
float invDist = rsqrtf(distSqr);
float invDist3 = invDist * invDist * invDist;
Fx += dx * invDist3;
Fy += dy * invDist3;
Fz += dz * invDist3;
}
p[i].vx += dt * Fx;
p[i].vy += dt * Fy;
p[i].vz += dt * Fz;
}
__global__
void integratePositionsKernel(Body *p, float dt, int n) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i >= n) return;
p[i].x += p[i].vx * dt;
p[i].y += p[i].vy * dt;
p[i].z += p[i].vz * dt;
}
int main(const int argc, const char** argv) {
int nBodies = 2 << 11;
if (argc > 1) nBodies = 2 << atoi(argv[1]);
const char *initialized_values;
const char *solution_values;
if (nBodies == 2<<11) {
initialized_values = "09-nbody/files/initialized_4096";
solution_values = "09-nbody/files/solution_4096";
} else {
initialized_values = "09-nbody/files/initialized_65536";
solution_values = "09-nbody/files/solution_65536";
}
if (argc > 2) initialized_values = argv[2];
if (argc > 3) solution_values = argv[3];
const float dt = 0.01f;
const int nIters = 10;
int bytes = nBodies * sizeof(Body);
Body *p;
cudaMallocManaged(&p, bytes);
cudaMemPrefetchAsync(p, bytes, cudaCpuDeviceId);
read_values_from_file(initialized_values, (float*)p, bytes);
int deviceId;
int numberOfSMs;
cudaGetDevice(&deviceId);
cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId);
size_t threadsPerBlock = 256;
size_t numberOfBlocks = (nBodies + threadsPerBlock - 1) / threadsPerBlock;
double totalTime = 0.0;
for (int iter = 0; iter < nIters; iter++) {
StartTimer();
cudaMemPrefetchAsync(p, bytes, deviceId);
bodyForceKernel<<<numberOfBlocks, threadsPerBlock>>>(p, dt, nBodies);
cudaDeviceSynchronize();
integratePositionsKernel<<<numberOfBlocks, threadsPerBlock>>>(p, dt, nBodies);
cudaDeviceSynchronize();
const double tElapsed = GetTimer() / 1000.0;
totalTime += tElapsed;
}
double avgTime = totalTime / (double)(nIters);
float billionsOfOpsPerSecond = 1e-9 * nBodies * nBodies / avgTime;
printf("%0.3f Billion Interactions / second\n", billionsOfOpsPerSecond);
write_values_to_file(solution_values, (float*)p, bytes);
cudaFree(p);
return 0;
}
Editor is loading...
Leave a Comment