Untitled
unknown
c_cpp
9 months ago
3.1 kB
6
Indexable
#include <cstdio>
#include <cuda_runtime.h>
// Texture for input data
texture<float> input_tex;
__global__ void polyphase_upsampler_kernel(float *d_coeffs, float *d_output, int input_length) {
extern __shared__ float phase_coeffs[];
int phase = blockIdx.x; // Phase index (0..15)
int tid = blockIdx.y * blockDim.x + threadIdx.x;
int k = tid; // Input index
// Load phase coefficients into shared memory
for (int i = threadIdx.x; i < 64; i += blockDim.x) {
phase_coeffs[i] = d_coeffs[phase + i * 16];
}
__syncthreads();
if (k >= input_length) return;
float sum = 0.0f;
for (int i = 0; i < 64; ++i) {
int input_idx = k - i;
float x = tex1Dfetch(input_tex, input_idx);
sum += x * phase_coeffs[i];
}
int output_idx = k * 16 + phase;
if (output_idx < input_length * 16) {
d_output[output_idx] = sum;
}
}
void launch_polyphase_kernel(float *h_input, int input_length, float *h_coeffs, float *h_output) {
float *d_input, *d_coeffs, *d_output;
// Allocate device memory
cudaMalloc(&d_input, input_length * sizeof(float));
cudaMalloc(&d_coeffs, 1024 * sizeof(float));
cudaMalloc(&d_output, input_length * 16 * sizeof(float));
// Copy input and coefficients to device
cudaMemcpy(d_input, h_input, input_length * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_coeffs, h_coeffs, 1024 * sizeof(float), cudaMemcpyHostToDevice);
// Configure texture
cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc<float>();
cudaBindTexture(0, input_tex, d_input, channel_desc, input_length * sizeof(float));
input_tex.addressMode[0] = cudaAddressModeBorder;
input_tex.filterMode = cudaFilterModePoint;
input_tex.normalized = 0;
// Kernel launch parameters
const int threads_per_block = 256;
dim3 grid(16, (input_length + threads_per_block - 1) / threads_per_block); // 16 phases, blocks cover input_length
dim3 block(threads_per_block);
size_t shared_mem_size = 64 * sizeof(float); // Each phase has 64 coefficients
// Launch kernel
polyphase_upsampler_kernel<<<grid, block, shared_mem_size>>>(d_coeffs, d_output, input_length);
// Copy output back to host
cudaMemcpy(h_output, d_output, input_length * 16 * sizeof(float), cudaMemcpyDeviceToHost);
// Cleanup
cudaUnbindTexture(input_tex);
cudaFree(d_input);
cudaFree(d_coeffs);
cudaFree(d_output);
}
// Example usage
int main() {
const int input_length = 1024; // Example input size
const int output_length = input_length * 16;
float *h_input = new float[input_length];
float *h_coeffs = new float[1024];
float *h_output = new float[output_length];
// Initialize input and coefficients (example values)
for (int i = 0; i < input_length; ++i) h_input[i] = 1.0f;
for (int i = 0; i < 1024; ++i) h_coeffs[i] = 0.01f; // Replace with actual coefficients
launch_polyphase_kernel(h_input, input_length, h_coeffs, h_output);
// Cleanup host memory
delete[] h_input;
delete[] h_coeffs;
delete[] h_output;
return 0;
}Editor is loading...
Leave a Comment