softmax

#include <stdio.h>
#include <cuda_runtime.h>
#include <iostream>
#include <cmath>

// Utility function for CUDA error checking
#define checkCudaErrors(call) do { \
    cudaError_t err = call; \
    if (err != cudaSuccess) { \
        std::cerr << "CUDA error in " << __FILE__ << ":" << __LINE__ << ": " << cudaGetErrorString(err) << std::endl; \
        exit(EXIT_FAILURE); \
    } \
} while(0)

const int SIZE = 128;  // The matrix is SIZE x SIZE (128x128)

// CUDA kernel to compute row-wise softmax for a square SIZE x SIZE matrix; one thread handles one full row
__global__ void softmaxKernel(float* input, float* output, int width) {
    // Get the row index for this thread
    int row = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < width) {
        float max_val = -INFINITY;
        
        // Find the row maximum; subtracting it before expf keeps every exponent <= 0, which avoids overflow
        for (int col = 0; col < width; ++col) {
            max_val = fmaxf(max_val, input[row * width + col]);
        }

        // Compute the exponentials and sum them
        float sum = 0.0f;
        for (int col = 0; col < width; ++col) {
            output[row * width + col] = expf(input[row * width + col] - max_val);  // Subtract max for numerical stability
            sum += output[row * width + col];
        }

        // Normalize the output by dividing by the sum of exponentials
        for (int col = 0; col < width; ++col) {
            output[row * width + col] /= sum;
        }
    }
}
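
// ---------------------------------------------------------------------------
// Optional variant (a sketch, not part of the original paste): the kernel
// above gives each thread a whole row, so the column loops run serially.
// A common alternative is one block per row, with the block's threads
// cooperating through shared memory to reduce the row maximum and the
// exponential sum. This sketch assumes blockDim.x is a power of two and
// width <= blockDim.x; the name softmaxKernelBlockPerRow is made up here.
// ---------------------------------------------------------------------------
__global__ void softmaxKernelBlockPerRow(const float* input, float* output, int width) {
    extern __shared__ float scratch[];  // blockDim.x floats, sized at launch
    int row = blockIdx.x;
    int col = threadIdx.x;

    // Load one element per thread (-inf padding past the end of the row)
    float x = (col < width) ? input[row * width + col] : -INFINITY;

    // Tree reduction for the row maximum
    scratch[col] = x;
    __syncthreads();
    for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
        if (col < stride) scratch[col] = fmaxf(scratch[col], scratch[col + stride]);
        __syncthreads();
    }
    float max_val = scratch[0];
    __syncthreads();  // scratch is about to be reused for the sum

    // Tree reduction for the sum of exponentials
    float e = (col < width) ? expf(x - max_val) : 0.0f;
    scratch[col] = e;
    __syncthreads();
    for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
        if (col < stride) scratch[col] += scratch[col + stride];
        __syncthreads();
    }
    float sum = scratch[0];

    if (col < width) output[row * width + col] = e / sum;
}
// Possible launch for the 128x128 case:
//   softmaxKernelBlockPerRow<<<SIZE, 128, 128 * sizeof(float)>>>(d_input, d_output, SIZE);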

int main() {
    const int matrix_size = SIZE * SIZE * sizeof(float);

    // Host buffers (plain stack arrays; two 128x128 float arrays fit well within default stack limits)
    float h_input[SIZE][SIZE];
    float h_output[SIZE][SIZE];

    // Initialize input matrix with some values
    for (int i = 0; i < SIZE; ++i) {
        for (int j = 0; j < SIZE; ++j) {
            h_input[i][j] = static_cast<float>(i + j);  // Simple initialization
        }
    }
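
    // Note: softmax is invariant to adding a constant to every element, and
    // row i here is just row 0 shifted by i, so every output row should end
    // up with the same distribution.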

    // Device memory allocation
    float* d_input;
    float* d_output;
    checkCudaErrors(cudaMalloc(&d_input, matrix_size));
    checkCudaErrors(cudaMalloc(&d_output, matrix_size));

    // Copy input data from host to device
    checkCudaErrors(cudaMemcpy(d_input, h_input, matrix_size, cudaMemcpyHostToDevice));

    // Launch the softmax kernel
    int threads_per_block = 64;
    int blocks_per_grid = (SIZE + threads_per_block - 1) / threads_per_block;
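    // With SIZE = 128 rows and 64 threads per block, this rounds up to
    // 2 blocks of 64 threads: one thread per row.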

    softmaxKernel<<<blocks_per_grid, threads_per_block>>>(d_input, d_output, SIZE);

    // Check for any errors launching the kernel
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaDeviceSynchronize());

    // Copy result back to host
    checkCudaErrors(cudaMemcpy(h_output, d_output, matrix_size, cudaMemcpyDeviceToHost));

    // Print the first row of the output to verify
    std::cout << "Softmax output (first row): ";
    for (int j = 0; j < SIZE; ++j) {
        std::cout << h_output[0][j] << " ";
    }
    std::cout << std::endl;
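
    // Sanity check (an addition to the original paste): every row of a
    // softmax output should sum to ~1.0 up to floating-point rounding.
    for (int i = 0; i < SIZE; ++i) {
        float row_sum = 0.0f;
        for (int j = 0; j < SIZE; ++j) {
            row_sum += h_output[i][j];
        }
        if (std::fabs(row_sum - 1.0f) > 1e-4f) {
            std::cerr << "Row " << i << " sums to " << row_sum << ", expected ~1" << std::endl;
        }
    }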

    // Free device memory
    checkCudaErrors(cudaFree(d_input));
    checkCudaErrors(cudaFree(d_output));

    return 0;
}
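
// Build and run (assuming the file is saved as softmax.cu):
//   nvcc -o softmax softmax.cu
//   ./softmax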