Untitled
#include <cstdio> #include <cuda_runtime.h> // Texture for input data texture<float> input_tex; __global__ void polyphase_upsampler_kernel(float *d_coeffs, float *d_output, int input_length) { extern __shared__ float phase_coeffs[]; int phase = blockIdx.x; // Phase index (0..15) int tid = blockIdx.y * blockDim.x + threadIdx.x; int k = tid; // Input index // Load phase coefficients into shared memory for (int i = threadIdx.x; i < 64; i += blockDim.x) { phase_coeffs[i] = d_coeffs[phase + i * 16]; } __syncthreads(); if (k >= input_length) return; float sum = 0.0f; for (int i = 0; i < 64; ++i) { int input_idx = k - i; float x = tex1Dfetch(input_tex, input_idx); sum += x * phase_coeffs[i]; } int output_idx = k * 16 + phase; if (output_idx < input_length * 16) { d_output[output_idx] = sum; } } void launch_polyphase_kernel(float *h_input, int input_length, float *h_coeffs, float *h_output) { float *d_input, *d_coeffs, *d_output; // Allocate device memory cudaMalloc(&d_input, input_length * sizeof(float)); cudaMalloc(&d_coeffs, 1024 * sizeof(float)); cudaMalloc(&d_output, input_length * 16 * sizeof(float)); // Copy input and coefficients to device cudaMemcpy(d_input, h_input, input_length * sizeof(float), cudaMemcpyHostToDevice); cudaMemcpy(d_coeffs, h_coeffs, 1024 * sizeof(float), cudaMemcpyHostToDevice); // Configure texture cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc<float>(); cudaBindTexture(0, input_tex, d_input, channel_desc, input_length * sizeof(float)); input_tex.addressMode[0] = cudaAddressModeBorder; input_tex.filterMode = cudaFilterModePoint; input_tex.normalized = 0; // Kernel launch parameters const int threads_per_block = 256; dim3 grid(16, (input_length + threads_per_block - 1) / threads_per_block); // 16 phases, blocks cover input_length dim3 block(threads_per_block); size_t shared_mem_size = 64 * sizeof(float); // Each phase has 64 coefficients // Launch kernel polyphase_upsampler_kernel<<<grid, block, shared_mem_size>>>(d_coeffs, d_output, input_length); // Copy output back to host cudaMemcpy(h_output, d_output, input_length * 16 * sizeof(float), cudaMemcpyDeviceToHost); // Cleanup cudaUnbindTexture(input_tex); cudaFree(d_input); cudaFree(d_coeffs); cudaFree(d_output); } // Example usage int main() { const int input_length = 1024; // Example input size const int output_length = input_length * 16; float *h_input = new float[input_length]; float *h_coeffs = new float[1024]; float *h_output = new float[output_length]; // Initialize input and coefficients (example values) for (int i = 0; i < input_length; ++i) h_input[i] = 1.0f; for (int i = 0; i < 1024; ++i) h_coeffs[i] = 0.01f; // Replace with actual coefficients launch_polyphase_kernel(h_input, input_length, h_coeffs, h_output); // Cleanup host memory delete[] h_input; delete[] h_coeffs; delete[] h_output; return 0; }
Leave a Comment