bnsmfft.cuh

#ifndef GPU_BOOTSTRAP_FFT_CUH
#define GPU_BOOTSTRAP_FFT_CUH

#include "polynomial/functions.cuh"
#include "polynomial/parameters.cuh"
#include "twiddles.cuh"
#include "types/complex/operations.cuh"

#define DUFFS_DEVICE(count, loop_body) \
  do {                                 \
    int _count = (count);              \
    int _n = (_count + 7) / 8;         \
    switch (_count % 8) {              \
    case 0: do { loop_body;            \
    case 7:      loop_body;            \
    case 6:      loop_body;            \
    case 5:      loop_body;            \
    case 4:      loop_body;            \
    case 3:      loop_body;            \
    case 2:      loop_body;            \
    case 1:      loop_body;            \
            } while (--_n > 0);        \
    }                                  \
  } while (false)

#define NSMFFT_DIRECT_LEVEL_1_LOOP_BODY                      \
  do {                                                       \
    i1 = tid;                                                \
    i2 = tid + params::degree / 2;                           \
                                                             \
    u = A[i1];                                               \
    v = A[i2] * (double2){0.707106781186547461715008466854,  \
                          0.707106781186547461715008466854}; \
                                                             \
    A[i1] += v;                                              \
    A[i2] = u - v;                                           \
                                                             \
    tid += params::degree / params::opt;                     \
  } while (false)

#define NSMFFT_DIRECT_LEVEL_1()                                     \
  do {                                                              \
    DUFFS_DEVICE(params::opt / 2, NSMFFT_DIRECT_LEVEL_1_LOOP_BODY); \
    __syncthreads();                                                \
  } while (false)

#define NSMFFT_DIRECT_LEVEL_LOOP_BODY(level)       \
  do {                                             \
    int _shift = 1 << (level);                     \
    twid_id = tid / (params::degree / _shift);     \
    i1 = 2 * (params::degree / _shift) * twid_id + \
        (tid & (params::degree / _shift - 1));     \
    i2 = i1 + params::degree / _shift;             \
                                                   \
    w = negtwiddles[twid_id + _shift / 2];         \
    u = A[i1];                                     \
    v = A[i2] * w;                                 \
                                                   \
    A[i1] += v;                                    \
    A[i2] = u - v;                                 \
                                                   \
    tid += params::degree / params::opt;           \
  } while (false)


#define NSMFFT_DIRECT_LEVEL(level)                                       \
  do {                                                                   \
    tid = threadIdx.x;                                                   \
    DUFFS_DEVICE(params::opt / 2, NSMFFT_DIRECT_LEVEL_LOOP_BODY(level)); \
    __syncthreads();                                                     \
  } while (false)

/*
 * Direct negacyclic FFT:
 *   - before the FFT the N real coefficients are stored into a
 *     N/2 sized complex with the even coefficients in the real part
 *     and the odd coefficients in the imaginary part. This is referred to
 *     as the half-size FFT
 *   - when calling BNSMFFT_direct for the forward negacyclic FFT of PBS,
 *     opt is divided by 2 because the butterfly pattern is always applied
 *     between pairs of coefficients
 *   - instead of twisting each coefficient A_j before the FFT by
 *     multiplying by the w^j roots of unity (aka twiddles, w=exp(-i pi /N)),
 *     the FFT is modified, and for each level k of the FFT the twiddle:
 *     w_j,k = exp(-i pi j/2^k)
 *     is replaced with:
 *     \zeta_j,k = exp(-i pi (2j-1)/2^k)
 */
template <class params> __device__ void NSMFFT_direct(double2 *A) {

  /* We don't make bit reverse here, since twiddles are already reversed
   *  Each thread is always in charge of "opt/2" pairs of coefficients,
   *  which is why we always loop through N/2 by N/opt strides
   *  The pragma unroll instruction tells the compiler to unroll the
   *  full loop, which should increase performance
   */

  size_t tid = threadIdx.x;
  size_t twid_id;
  size_t i1, i2;
  double2 u, v, w;
  // level 1
  // we don't make actual complex multiplication on level1 since we have only
  // one twiddle, it's real and image parts are equal, so we can multiply
  // it with simpler operations
  NSMFFT_DIRECT_LEVEL_1();

  // level 2
  // from this level there are more than one twiddles and none of them has equal
  // real and imag parts, so complete complex multiplication is needed
  // for each level params::degree / 2^level represents number of coefficients
  // inside divided chunk of specific level
  //
  NSMFFT_DIRECT_LEVEL(2);

  // level 3
  NSMFFT_DIRECT_LEVEL(3);

  // level 4
  NSMFFT_DIRECT_LEVEL(4);

  // level 5
  NSMFFT_DIRECT_LEVEL(5);

  // level 6
  NSMFFT_DIRECT_LEVEL(6);

  // level 7
  NSMFFT_DIRECT_LEVEL(7);

  // from level 8, we need to check size of params degree, because we support
  // minimum actual polynomial size = 256,  when compressed size is halfed and
  // minimum supported compressed size is 128, so we always need first 7
  // levels of butterfly operation, since butterfly levels are hardcoded
  // we need to check if polynomial size is big enough to require specific level
  // of butterfly.
  if constexpr (params::degree >= 256) {
    // level 8
    NSMFFT_DIRECT_LEVEL(8);
  }

  if constexpr (params::degree >= 512) {
    // level 9
    NSMFFT_DIRECT_LEVEL(9);
  }

  if constexpr (params::degree >= 1024) {
    // level 10
    NSMFFT_DIRECT_LEVEL(10);
  }

  if constexpr (params::degree >= 2048) {
    // level 11
    NSMFFT_DIRECT_LEVEL(11);
  }

  if constexpr (params::degree >= 4096) {
    // level 12
    NSMFFT_DIRECT_LEVEL(12);
  }

  if constexpr (params::degree >= 8192) {
    // level 13
    NSMFFT_DIRECT_LEVEL(13);
  }
}

#define NSMFFT_INVERSE_LEVEL_LOOP_BODY(level)      \
  do {                                             \
    int _shift = 1 << level;                       \
    twid_id = tid / (params::degree / _shift);     \
    i1 = 2 * (params::degree / _shift) * twid_id + \
        (tid & (params::degree / _shift - 1));     \
    i2 = i1 + params::degree / _shift;             \
                                                   \
    w = negtwiddles[twid_id + _shift / 2];         \
    u = A[i1] - A[i2];                             \
                                                   \
    A[i1] += A[i2];                                \
    A[i2] = u * conjugate(w);                      \
                                                   \
    tid += params::degree / params::opt;           \
  } while (false)

#define NSMFFT_INVERSE_LEVEL(level)                                       \
  do {                                                                    \
    tid = threadIdx.x;                                                    \
    DUFFS_DEVICE(params::opt / 2, NSMFFT_INVERSE_LEVEL_LOOP_BODY(level)); \
    __syncthreads();                                                      \
  } while (false)

/*
 * negacyclic inverse fft
 */
template <class params> __device__ void NSMFFT_inverse(double2 *A) {

  /* We don't make bit reverse here, since twiddles are already reversed
   *  Each thread is always in charge of "opt/2" pairs of coefficients,
   *  which is why we always loop through N/2 by N/opt strides
   *  The pragma unroll instruction tells the compiler to unroll the
   *  full loop, which should increase performance
   */

  size_t tid = threadIdx.x;
  size_t twid_id;
  size_t i1, i2;
  double2 u, w;

  // divide input by compressed polynomial size
  tid = threadIdx.x;
  for (size_t i = 0; i < params::opt; ++i) {
    A[tid] /= params::degree;
    tid += params::degree / params::opt;
  }
  __syncthreads();

  // none of the twiddles have equal real and imag part, so
  // complete complex multiplication has to be done
  // here we have more than one twiddle
  // mapping in backward fft is reversed
  // butterfly operation is started from last level

  if constexpr (params::degree >= 8192) {
    // level 13
    NSMFFT_INVERSE_LEVEL(13);
  }

  if constexpr (params::degree >= 4096) {
    // level 12
    NSMFFT_INVERSE_LEVEL(12);
  }

  if constexpr (params::degree >= 2048) {
    // level 11
    NSMFFT_INVERSE_LEVEL(11);
  }

  if constexpr (params::degree >= 1024) {
    // level 10
    NSMFFT_INVERSE_LEVEL(10);
  }

  if constexpr (params::degree >= 512) {
    // level 9
    NSMFFT_INVERSE_LEVEL(9);
  }

  if constexpr (params::degree >= 256) {
    // level 8
    NSMFFT_INVERSE_LEVEL(8);
  }

  // below level 8, we don't need to check size of params degree, because we
  // support minimum actual polynomial size = 256,  when compressed size is
  // halfed and minimum supported compressed size is 128, so we always need
  // last 7 levels of butterfly operation, since butterfly levels are hardcoded
  // we don't need to check if polynomial size is big enough to require
  // specific level of butterfly.
  // level 7
  NSMFFT_INVERSE_LEVEL(7);

  // level 6
  NSMFFT_INVERSE_LEVEL(6);

  // level 5
  NSMFFT_INVERSE_LEVEL(5);

  // level 4
  NSMFFT_INVERSE_LEVEL(4);

  // level 3
  NSMFFT_INVERSE_LEVEL(3);

  // level 2
  NSMFFT_INVERSE_LEVEL(2);

  // level 1
  NSMFFT_INVERSE_LEVEL(1);
}

#define BATCH_NSMFFT_LOOP_BODY(expr)          \
  do {                                        \
    expr;                                     \
    tid = tid + params::degree / params::opt; \
  } while (false)

#define BATCH_NSMFFT(expr)                                       \
  do {                                                           \
    tid = threadIdx.x;                                           \
    DUFFS_DEVICE(params::opt / 2, BATCH_NSMFFT_LOOP_BODY(expr)); \
  } while (false)

/*
 * global batch fft
 * does fft in half size
 * unrolling half size fft result in half size + 1 elements
 * this function must be called with actual degree
 * function takes as input already compressed input
 */
template <class params, sharedMemDegree SMD>
__global__ void batch_NSMFFT(double2 *d_input, double2 *d_output,
                             double2 *buffer) {
  extern __shared__ double2 sharedMemoryFFT[];
  double2 *fft = (SMD == NOSM) ? &buffer[blockIdx.x * params::degree / 2]
                               : sharedMemoryFFT;
  int tid;

  BATCH_NSMFFT(fft[tid] = d_input[blockIdx.x * (params::degree / 2) + tid]);
  __syncthreads();
  NSMFFT_direct<HalfDegree<params>>(fft);
  __syncthreads();

  BATCH_NSMFFT(d_output[blockIdx.x * (params::degree / 2) + tid] = fft[tid]);
}

/*
 * global batch polynomial multiplication
 * only used for fft tests
 * d_input1 and d_output must not have the same pointer
 * d_input1 can be modified inside the function
 */
template <class params, sharedMemDegree SMD>
__global__ void batch_polynomial_mul(double2 *d_input1, double2 *d_input2,
                                     double2 *d_output, double2 *buffer) {
  extern __shared__ double2 sharedMemoryFFT[];
  double2 *fft = (SMD == NOSM) ? &buffer[blockIdx.x * params::degree / 2]
                               : sharedMemoryFFT;

  // Move first polynomial into shared memory(if possible otherwise it will
  // be moved in device buffer)
  int tid;
  BATCH_NSMFFT(fft[tid] = d_input1[blockIdx.x * (params::degree / 2) + tid]);

  // Perform direct negacyclic fourier transform
  __syncthreads();
  NSMFFT_direct<HalfDegree<params>>(fft);
  __syncthreads();

  // Put the result of direct fft inside input1
  BATCH_NSMFFT(d_input1[blockIdx.x * (params::degree / 2) + tid] = fft[tid]);
  __syncthreads();

  // Move first polynomial into shared memory(if possible otherwise it will
  // be moved in device buffer)
  BATCH_NSMFFT(fft[tid] = d_input2[blockIdx.x * (params::degree / 2) + tid]);

  // Perform direct negacyclic fourier transform on the second polynomial
  __syncthreads();
  NSMFFT_direct<HalfDegree<params>>(fft);
  __syncthreads();

  // calculate pointwise multiplication inside fft buffer
  BATCH_NSMFFT(fft[tid] *= d_input1[blockIdx.x * (params::degree / 2) + tid]);

  // Perform backward negacyclic fourier transform
  __syncthreads();
  NSMFFT_inverse<HalfDegree<params>>(fft);
  __syncthreads();

  // copy results in output buffer
  BATCH_NSMFFT(d_output[blockIdx.x * (params::degree / 2) + tid] = fft[tid]);
}

#endif // GPU_BOOTSTRAP_FFT_CUH
Editor is loading...