softmax
unknown
c_cpp
a month ago
2.9 kB
5
Indexable
Never
#include <stdio.h>
#include <cuda_runtime.h>
#include <iostream>
#include <cmath>
#include <cstdlib>
#include <vector>

// Utility macro for CUDA error checking.
// Evaluates `call` exactly once; if the result is not cudaSuccess, prints the
// file, line and CUDA error string to stderr and terminates the process with
// EXIT_FAILURE.
#define checkCudaErrors(call) do { \
    cudaError_t err_ = (call); \
    if (err_ != cudaSuccess) { \
        std::cerr << "CUDA error in " << __FILE__ << ":" << __LINE__ << ": " << cudaGetErrorString(err_) << std::endl; \
        exit(EXIT_FAILURE); \
    } \
} while(0)

// Edge length of the square matrix processed by main() (SIZE x SIZE elements).
const int SIZE = 128;

// CUDA kernel computing a row-wise softmax of a square, row-major
// width x width matrix.
//
// Launch layout: 1D grid of 1D blocks. Each thread processes one whole row,
// so gridDim.x * blockDim.x must be >= width; surplus threads return
// immediately. No shared memory is used.
//
// input  - device pointer to width * width floats (read only).
// output - device pointer to width * width floats (written). It may be the
//          same buffer as input: every element is read before it is
//          overwritten by the same thread.
// width  - number of rows and of columns; nothing is done when width <= 0.
//
// NOTE(review): neighbouring threads read addresses `width` floats apart, so
// global loads are not coalesced. That is harmless for this small demo, but a
// block-per-row reduction would be the better layout for large matrices.
__global__ void softmaxKernel(const float* input, float* output, int width) {
    // Compute the global row index in size_t so that neither the index nor
    // the row offset below can overflow a 32-bit int.
    const size_t row = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (width <= 0 || row >= static_cast<size_t>(width)) {
        return;
    }

    const size_t row_offset = row * static_cast<size_t>(width);
    const float* in_row = input + row_offset;
    float* out_row = output + row_offset;

    // Pass 1: find the maximum value in the row for numerical stability.
    float max_val = -INFINITY;
    for (int col = 0; col < width; ++col) {
        max_val = fmaxf(max_val, in_row[col]);
    }

    // Pass 2: store exp(x - max) and accumulate the sum of those values.
    // Subtracting the row maximum keeps every exponent <= 0.
    float sum = 0.0f;
    for (int col = 0; col < width; ++col) {
        const float e = expf(in_row[col] - max_val);
        out_row[col] = e;
        sum += e;
    }

    // Pass 3: normalize so that the row sums to 1.
    for (int col = 0; col < width; ++col) {
        out_row[col] /= sum;
    }
}

// Builds a SIZE x SIZE test matrix on the host, runs softmaxKernel on the
// device, copies the result back and prints the first output row.
// Returns 0 on success; any CUDA failure terminates the process through
// checkCudaErrors.
int main() {
    const size_t element_count = static_cast<size_t>(SIZE) * SIZE;
    const size_t matrix_size = element_count * sizeof(float);  // bytes per matrix

    // Host memory allocation (heap-backed, row-major) so that the buffers do
    // not depend on the available stack size.
    std::vector<float> h_input(element_count);
    std::vector<float> h_output(element_count);

    // Initialize input matrix with some values: element (i, j) = i + j.
    for (int i = 0; i < SIZE; ++i) {
        for (int j = 0; j < SIZE; ++j) {
            h_input[static_cast<size_t>(i) * SIZE + j] = static_cast<float>(i + j);
        }
    }

    // Device memory allocation
    float* d_input;
    float* d_output;
    checkCudaErrors(cudaMalloc(&d_input, matrix_size));
    checkCudaErrors(cudaMalloc(&d_output, matrix_size));

    // Copy input data from host to device
    checkCudaErrors(cudaMemcpy(d_input, h_input.data(), matrix_size, cudaMemcpyHostToDevice));

    // Launch the softmax kernel: one thread per row, ceil-div for the grid.
    const int threads_per_block = 64;
    const int blocks_per_grid = (SIZE + threads_per_block - 1) / threads_per_block;
    softmaxKernel<<<blocks_per_grid, threads_per_block>>>(d_input, d_output, SIZE);

    // Check for launch errors, then for errors raised while the kernel ran.
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaDeviceSynchronize());

    // Copy result back to host
    checkCudaErrors(cudaMemcpy(h_output.data(), d_output, matrix_size, cudaMemcpyDeviceToHost));

    // Print the first row of the output to verify
    std::cout << "Softmax output (first row): ";
    for (int j = 0; j < SIZE; ++j) {
        std::cout << h_output[j] << " ";
    }
    std::cout << std::endl;

    // Free device memory
    checkCudaErrors(cudaFree(d_input));
    checkCudaErrors(cudaFree(d_output));

    return 0;
}
Leave a Comment