bnsmfft.cuh
unknown
c_cpp
a year ago
13 kB
9
Indexable
#ifndef GPU_BOOTSTRAP_FFT_CUH
#define GPU_BOOTSTRAP_FFT_CUH
#include "polynomial/functions.cuh"
#include "polynomial/parameters.cuh"
#include "twiddles.cuh"
#include "types/complex/operations.cuh"
#define DUFFS_DEVICE(count, loop_body) \
do { \
int _count = (count); \
int _n = (_count + 7) / 8; \
switch (_count % 8) { \
case 0: do { loop_body; \
case 7: loop_body; \
case 6: loop_body; \
case 5: loop_body; \
case 4: loop_body; \
case 3: loop_body; \
case 2: loop_body; \
case 1: loop_body; \
} while (--_n > 0); \
} \
} while (false)
#define NSMFFT_DIRECT_LEVEL_1_LOOP_BODY \
do { \
i1 = tid; \
i2 = tid + params::degree / 2; \
\
u = A[i1]; \
v = A[i2] * (double2){0.707106781186547461715008466854, \
0.707106781186547461715008466854}; \
\
A[i1] += v; \
A[i2] = u - v; \
\
tid += params::degree / params::opt; \
} while (false)
#define NSMFFT_DIRECT_LEVEL_1() \
do { \
DUFFS_DEVICE(params::opt / 2, NSMFFT_DIRECT_LEVEL_1_LOOP_BODY); \
__syncthreads(); \
} while (false)
#define NSMFFT_DIRECT_LEVEL_LOOP_BODY(level) \
do { \
int _shift = 1 << (level); \
twid_id = tid / (params::degree / _shift); \
i1 = 2 * (params::degree / _shift) * twid_id + \
(tid & (params::degree / _shift - 1)); \
i2 = i1 + params::degree / _shift; \
\
w = negtwiddles[twid_id + _shift / 2]; \
u = A[i1]; \
v = A[i2] * w; \
\
A[i1] += v; \
A[i2] = u - v; \
\
tid += params::degree / params::opt; \
} while (false)
#define NSMFFT_DIRECT_LEVEL(level) \
do { \
tid = threadIdx.x; \
DUFFS_DEVICE(params::opt / 2, NSMFFT_DIRECT_LEVEL_LOOP_BODY(level)); \
__syncthreads(); \
} while (false)
/*
* Direct negacyclic FFT:
* - before the FFT the N real coefficients are stored into a
* N/2 sized complex with the even coefficients in the real part
* and the odd coefficients in the imaginary part. This is referred to
* as the half-size FFT
* - when calling BNSMFFT_direct for the forward negacyclic FFT of PBS,
* opt is divided by 2 because the butterfly pattern is always applied
* between pairs of coefficients
* - instead of twisting each coefficient A_j before the FFT by
* multiplying by the w^j roots of unity (aka twiddles, w=exp(-i pi /N)),
* the FFT is modified, and for each level k of the FFT the twiddle:
* w_j,k = exp(-i pi j/2^k)
* is replaced with:
* \zeta_j,k = exp(-i pi (2j-1)/2^k)
*/
template <class params> __device__ void NSMFFT_direct(double2 *A) {
/* We don't make bit reverse here, since twiddles are already reversed
* Each thread is always in charge of "opt/2" pairs of coefficients,
* which is why we always loop through N/2 by N/opt strides
* The pragma unroll instruction tells the compiler to unroll the
* full loop, which should increase performance
*/
size_t tid = threadIdx.x;
size_t twid_id;
size_t i1, i2;
double2 u, v, w;
// level 1
// we don't make actual complex multiplication on level1 since we have only
// one twiddle, it's real and image parts are equal, so we can multiply
// it with simpler operations
NSMFFT_DIRECT_LEVEL_1();
// level 2
// from this level there are more than one twiddles and none of them has equal
// real and imag parts, so complete complex multiplication is needed
// for each level params::degree / 2^level represents number of coefficients
// inside divided chunk of specific level
//
NSMFFT_DIRECT_LEVEL(2);
// level 3
NSMFFT_DIRECT_LEVEL(3);
// level 4
NSMFFT_DIRECT_LEVEL(4);
// level 5
NSMFFT_DIRECT_LEVEL(5);
// level 6
NSMFFT_DIRECT_LEVEL(6);
// level 7
NSMFFT_DIRECT_LEVEL(7);
// from level 8, we need to check size of params degree, because we support
// minimum actual polynomial size = 256, when compressed size is halfed and
// minimum supported compressed size is 128, so we always need first 7
// levels of butterfly operation, since butterfly levels are hardcoded
// we need to check if polynomial size is big enough to require specific level
// of butterfly.
if constexpr (params::degree >= 256) {
// level 8
NSMFFT_DIRECT_LEVEL(8);
}
if constexpr (params::degree >= 512) {
// level 9
NSMFFT_DIRECT_LEVEL(9);
}
if constexpr (params::degree >= 1024) {
// level 10
NSMFFT_DIRECT_LEVEL(10);
}
if constexpr (params::degree >= 2048) {
// level 11
NSMFFT_DIRECT_LEVEL(11);
}
if constexpr (params::degree >= 4096) {
// level 12
NSMFFT_DIRECT_LEVEL(12);
}
if constexpr (params::degree >= 8192) {
// level 13
NSMFFT_DIRECT_LEVEL(13);
}
}
#define NSMFFT_INVERSE_LEVEL_LOOP_BODY(level) \
do { \
int _shift = 1 << level; \
twid_id = tid / (params::degree / _shift); \
i1 = 2 * (params::degree / _shift) * twid_id + \
(tid & (params::degree / _shift - 1)); \
i2 = i1 + params::degree / _shift; \
\
w = negtwiddles[twid_id + _shift / 2]; \
u = A[i1] - A[i2]; \
\
A[i1] += A[i2]; \
A[i2] = u * conjugate(w); \
\
tid += params::degree / params::opt; \
} while (false)
#define NSMFFT_INVERSE_LEVEL(level) \
do { \
tid = threadIdx.x; \
DUFFS_DEVICE(params::opt / 2, NSMFFT_INVERSE_LEVEL_LOOP_BODY(level)); \
__syncthreads(); \
} while (false)
/*
* negacyclic inverse fft
*/
template <class params> __device__ void NSMFFT_inverse(double2 *A) {
/* We don't make bit reverse here, since twiddles are already reversed
* Each thread is always in charge of "opt/2" pairs of coefficients,
* which is why we always loop through N/2 by N/opt strides
* The pragma unroll instruction tells the compiler to unroll the
* full loop, which should increase performance
*/
size_t tid = threadIdx.x;
size_t twid_id;
size_t i1, i2;
double2 u, w;
// divide input by compressed polynomial size
tid = threadIdx.x;
for (size_t i = 0; i < params::opt; ++i) {
A[tid] /= params::degree;
tid += params::degree / params::opt;
}
__syncthreads();
// none of the twiddles have equal real and imag part, so
// complete complex multiplication has to be done
// here we have more than one twiddle
// mapping in backward fft is reversed
// butterfly operation is started from last level
if constexpr (params::degree >= 8192) {
// level 13
NSMFFT_INVERSE_LEVEL(13);
}
if constexpr (params::degree >= 4096) {
// level 12
NSMFFT_INVERSE_LEVEL(12);
}
if constexpr (params::degree >= 2048) {
// level 11
NSMFFT_INVERSE_LEVEL(11);
}
if constexpr (params::degree >= 1024) {
// level 10
NSMFFT_INVERSE_LEVEL(10);
}
if constexpr (params::degree >= 512) {
// level 9
NSMFFT_INVERSE_LEVEL(9);
}
if constexpr (params::degree >= 256) {
// level 8
NSMFFT_INVERSE_LEVEL(8);
}
// below level 8, we don't need to check size of params degree, because we
// support minimum actual polynomial size = 256, when compressed size is
// halfed and minimum supported compressed size is 128, so we always need
// last 7 levels of butterfly operation, since butterfly levels are hardcoded
// we don't need to check if polynomial size is big enough to require
// specific level of butterfly.
// level 7
NSMFFT_INVERSE_LEVEL(7);
// level 6
NSMFFT_INVERSE_LEVEL(6);
// level 5
NSMFFT_INVERSE_LEVEL(5);
// level 4
NSMFFT_INVERSE_LEVEL(4);
// level 3
NSMFFT_INVERSE_LEVEL(3);
// level 2
NSMFFT_INVERSE_LEVEL(2);
// level 1
NSMFFT_INVERSE_LEVEL(1);
}
#define BATCH_NSMFFT_LOOP_BODY(expr) \
do { \
expr; \
tid = tid + params::degree / params::opt; \
} while (false)
#define BATCH_NSMFFT(expr) \
do { \
tid = threadIdx.x; \
DUFFS_DEVICE(params::opt / 2, BATCH_NSMFFT_LOOP_BODY(expr)); \
} while (false)
/*
* global batch fft
* does fft in half size
* unrolling half size fft result in half size + 1 elements
* this function must be called with actual degree
* function takes as input already compressed input
*/
template <class params, sharedMemDegree SMD>
__global__ void batch_NSMFFT(double2 *d_input, double2 *d_output,
double2 *buffer) {
extern __shared__ double2 sharedMemoryFFT[];
double2 *fft = (SMD == NOSM) ? &buffer[blockIdx.x * params::degree / 2]
: sharedMemoryFFT;
int tid;
BATCH_NSMFFT(fft[tid] = d_input[blockIdx.x * (params::degree / 2) + tid]);
__syncthreads();
NSMFFT_direct<HalfDegree<params>>(fft);
__syncthreads();
BATCH_NSMFFT(d_output[blockIdx.x * (params::degree / 2) + tid] = fft[tid]);
}
/*
* global batch polynomial multiplication
* only used for fft tests
* d_input1 and d_output must not have the same pointer
* d_input1 can be modified inside the function
*/
template <class params, sharedMemDegree SMD>
__global__ void batch_polynomial_mul(double2 *d_input1, double2 *d_input2,
double2 *d_output, double2 *buffer) {
extern __shared__ double2 sharedMemoryFFT[];
double2 *fft = (SMD == NOSM) ? &buffer[blockIdx.x * params::degree / 2]
: sharedMemoryFFT;
// Move first polynomial into shared memory(if possible otherwise it will
// be moved in device buffer)
int tid;
BATCH_NSMFFT(fft[tid] = d_input1[blockIdx.x * (params::degree / 2) + tid]);
// Perform direct negacyclic fourier transform
__syncthreads();
NSMFFT_direct<HalfDegree<params>>(fft);
__syncthreads();
// Put the result of direct fft inside input1
BATCH_NSMFFT(d_input1[blockIdx.x * (params::degree / 2) + tid] = fft[tid]);
__syncthreads();
// Move first polynomial into shared memory(if possible otherwise it will
// be moved in device buffer)
BATCH_NSMFFT(fft[tid] = d_input2[blockIdx.x * (params::degree / 2) + tid]);
// Perform direct negacyclic fourier transform on the second polynomial
__syncthreads();
NSMFFT_direct<HalfDegree<params>>(fft);
__syncthreads();
// calculate pointwise multiplication inside fft buffer
BATCH_NSMFFT(fft[tid] *= d_input1[blockIdx.x * (params::degree / 2) + tid]);
// Perform backward negacyclic fourier transform
__syncthreads();
NSMFFT_inverse<HalfDegree<params>>(fft);
__syncthreads();
// copy results in output buffer
BATCH_NSMFFT(d_output[blockIdx.x * (params::degree / 2) + tid] = fft[tid]);
}
#endif // GPU_BOOTSTRAP_FFT_CUH
Editor is loading...
Leave a Comment