cuda

 avatar
unknown
c_cpp
6 months ago
2.8 kB
6
Indexable
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include "timer.h"
#include "files.h"

#define SOFTENING 1e-9f

typedef struct { float x, y, z, vx, vy, vz; } Body;

__global__
void bodyForceKernel(Body *p, float dt, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i >= n) return;

  float Fx = 0.0f, Fy = 0.0f, Fz = 0.0f;
  for (int j = 0; j < n; j++) {
    float dx = p[j].x - p[i].x;
    float dy = p[j].y - p[i].y;
    float dz = p[j].z - p[i].z;
    float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING;
    float invDist = rsqrtf(distSqr);
    float invDist3 = invDist * invDist * invDist;

    Fx += dx * invDist3;
    Fy += dy * invDist3;
    Fz += dz * invDist3;
  }

  p[i].vx += dt * Fx;
  p[i].vy += dt * Fy;
  p[i].vz += dt * Fz;
}

__global__
void integratePositionsKernel(Body *p, float dt, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i >= n) return;

  p[i].x += p[i].vx * dt;
  p[i].y += p[i].vy * dt;
  p[i].z += p[i].vz * dt;
}

int main(const int argc, const char** argv) {
  int nBodies = 2 << 11;
  if (argc > 1) nBodies = 2 << atoi(argv[1]);

  const char *initialized_values;
  const char *solution_values;
  if (nBodies == 2<<11) {
    initialized_values = "09-nbody/files/initialized_4096";
    solution_values = "09-nbody/files/solution_4096";
  } else {
    initialized_values = "09-nbody/files/initialized_65536";
    solution_values = "09-nbody/files/solution_65536";
  }

  if (argc > 2) initialized_values = argv[2];
  if (argc > 3) solution_values = argv[3];

  const float dt = 0.01f;
  const int nIters = 10;

  int bytes = nBodies * sizeof(Body);
  Body *p;
  cudaMallocManaged(&p, bytes);
  cudaMemPrefetchAsync(p, bytes, cudaCpuDeviceId);
  read_values_from_file(initialized_values, (float*)p, bytes);

  int deviceId;
  int numberOfSMs;
  cudaGetDevice(&deviceId);
  cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId);

  size_t threadsPerBlock = 256;
  size_t numberOfBlocks = (nBodies + threadsPerBlock - 1) / threadsPerBlock;

  double totalTime = 0.0;

  for (int iter = 0; iter < nIters; iter++) {
    StartTimer();

    cudaMemPrefetchAsync(p, bytes, deviceId);

    bodyForceKernel<<<numberOfBlocks, threadsPerBlock>>>(p, dt, nBodies);
    cudaDeviceSynchronize();

    integratePositionsKernel<<<numberOfBlocks, threadsPerBlock>>>(p, dt, nBodies);
    cudaDeviceSynchronize();

    const double tElapsed = GetTimer() / 1000.0;
    totalTime += tElapsed;
  }

  double avgTime = totalTime / (double)(nIters);
  float billionsOfOpsPerSecond = 1e-9 * nBodies * nBodies / avgTime;
  printf("%0.3f Billion Interactions / second\n", billionsOfOpsPerSecond);

  write_values_to_file(solution_values, (float*)p, bytes);

  cudaFree(p);
  return 0;
}
Editor is loading...
Leave a Comment