Untitled

 avatar
unknown
c_cpp
14 days ago
3.1 kB
4
Indexable
#include <cstdio>
#include <cuda_runtime.h>

// Texture for input data
texture<float> input_tex;

__global__ void polyphase_upsampler_kernel(float *d_coeffs, float *d_output, int input_length) {
    extern __shared__ float phase_coeffs[];

    int phase = blockIdx.x; // Phase index (0..15)
    int tid = blockIdx.y * blockDim.x + threadIdx.x;
    int k = tid; // Input index

    // Load phase coefficients into shared memory
    for (int i = threadIdx.x; i < 64; i += blockDim.x) {
        phase_coeffs[i] = d_coeffs[phase + i * 16];
    }
    __syncthreads();

    if (k >= input_length) return;

    float sum = 0.0f;
    for (int i = 0; i < 64; ++i) {
        int input_idx = k - i;
        float x = tex1Dfetch(input_tex, input_idx);
        sum += x * phase_coeffs[i];
    }

    int output_idx = k * 16 + phase;
    if (output_idx < input_length * 16) {
        d_output[output_idx] = sum;
    }
}

void launch_polyphase_kernel(float *h_input, int input_length, float *h_coeffs, float *h_output) {
    float *d_input, *d_coeffs, *d_output;

    // Allocate device memory
    cudaMalloc(&d_input, input_length * sizeof(float));
    cudaMalloc(&d_coeffs, 1024 * sizeof(float));
    cudaMalloc(&d_output, input_length * 16 * sizeof(float));

    // Copy input and coefficients to device
    cudaMemcpy(d_input, h_input, input_length * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_coeffs, h_coeffs, 1024 * sizeof(float), cudaMemcpyHostToDevice);

    // Configure texture
    cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc<float>();
    cudaBindTexture(0, input_tex, d_input, channel_desc, input_length * sizeof(float));
    input_tex.addressMode[0] = cudaAddressModeBorder;
    input_tex.filterMode = cudaFilterModePoint;
    input_tex.normalized = 0;

    // Kernel launch parameters
    const int threads_per_block = 256;
    dim3 grid(16, (input_length + threads_per_block - 1) / threads_per_block); // 16 phases, blocks cover input_length
    dim3 block(threads_per_block);
    size_t shared_mem_size = 64 * sizeof(float); // Each phase has 64 coefficients

    // Launch kernel
    polyphase_upsampler_kernel<<<grid, block, shared_mem_size>>>(d_coeffs, d_output, input_length);

    // Copy output back to host
    cudaMemcpy(h_output, d_output, input_length * 16 * sizeof(float), cudaMemcpyDeviceToHost);

    // Cleanup
    cudaUnbindTexture(input_tex);
    cudaFree(d_input);
    cudaFree(d_coeffs);
    cudaFree(d_output);
}

// Example usage
int main() {
    const int input_length = 1024; // Example input size
    const int output_length = input_length * 16;

    float *h_input = new float[input_length];
    float *h_coeffs = new float[1024];
    float *h_output = new float[output_length];

    // Initialize input and coefficients (example values)
    for (int i = 0; i < input_length; ++i) h_input[i] = 1.0f;
    for (int i = 0; i < 1024; ++i) h_coeffs[i] = 0.01f; // Replace with actual coefficients

    launch_polyphase_kernel(h_input, input_length, h_coeffs, h_output);

    // Cleanup host memory
    delete[] h_input;
    delete[] h_coeffs;
    delete[] h_output;

    return 0;
}
Leave a Comment