// softmax
// Paste-page metadata carried over with the source (not part of the program):
// unknown | c_cpp | a year ago | 2.9 kB | 12 | Indexable
#include <stdio.h>
#include <cuda_runtime.h>
#include <iostream>
#include <cmath>
// Utility macro for CUDA error checking.
//
// Evaluates `call` exactly once. If the result is not cudaSuccess, prints the
// source file, line number and the cudaGetErrorString() text to std::cerr and
// terminates the process with EXIT_FAILURE.
//
// Hygiene notes:
//  - The argument is parenthesized so any expression can be passed safely.
//  - The temporary has a macro-specific name so that an argument mentioning a
//    caller variable called `err` is not shadowed by the macro's own local
//    (which would otherwise read an uninitialized value).
//  - The do { ... } while(0) wrapper makes the macro behave as one statement,
//    so it is safe inside an unbraced if/else.
#define checkCudaErrors(call) do { \
    const cudaError_t checkCudaErrors_status_ = (call); \
    if (checkCudaErrors_status_ != cudaSuccess) { \
        std::cerr << "CUDA error in " << __FILE__ << ":" << __LINE__ << ": " << cudaGetErrorString(checkCudaErrors_status_) << std::endl; \
        exit(EXIT_FAILURE); \
    } \
} while(0)
const int SIZE = 128; // Matrix dimension: the matrix is SIZE x SIZE (128x128)
// CUDA kernel: numerically stable row-wise softmax of a square, row-major
// width x width matrix.
//
// Launch layout: 1D grid of 1D blocks. Each thread handles one whole row,
// with row = blockIdx.x * blockDim.x + threadIdx.x. Threads whose row index
// is >= width do nothing, so the launch only has to cover at least `width`
// threads. No shared memory is used.
//
// Parameters:
//   input  - pointer to width * width floats, only read by this kernel
//   output - pointer to width * width floats, receives the softmax of each row
//   width  - number of rows and number of columns of the matrix
__global__ void softmaxKernel(float* input, float* output, int width) {
    // Get the row index for this thread
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= width) {
        return;
    }

    // Compute the row offset in size_t: `row * width` in int overflows once
    // width exceeds 46340.
    const size_t row_offset = static_cast<size_t>(row) * static_cast<size_t>(width);
    const float* in_row = input + row_offset;
    float* out_row = output + row_offset;

    // Find the maximum value in the row for numerical stability
    float max_val = -INFINITY;
    for (int col = 0; col < width; ++col) {
        max_val = fmaxf(max_val, in_row[col]);
    }

    // Compute the exponentials (shifted by the row maximum) and sum them.
    // The value is kept in a local so the sum does not re-read global memory.
    float sum = 0.0f;
    for (int col = 0; col < width; ++col) {
        const float e = expf(in_row[col] - max_val);
        out_row[col] = e;
        sum += e;
    }

    // Normalize the output by dividing by the sum of exponentials
    for (int col = 0; col < width; ++col) {
        out_row[col] /= sum;
    }
}
// Host driver: fills a SIZE x SIZE matrix with h_input[i][j] = i + j, runs
// softmaxKernel on it on the device, copies the result back and prints the
// first row. Any CUDA failure terminates the program via checkCudaErrors.
int main() {
    // Byte count of one matrix. Computed in size_t (the type cudaMalloc and
    // cudaMemcpy take) so the product is neither multiplied in int nor
    // narrowed back to int.
    const size_t matrix_size = static_cast<size_t>(SIZE) * SIZE * sizeof(float);

    // Host memory allocation
    float h_input[SIZE][SIZE];
    float h_output[SIZE][SIZE];

    // Initialize input matrix with some values
    for (int i = 0; i < SIZE; ++i) {
        for (int j = 0; j < SIZE; ++j) {
            h_input[i][j] = static_cast<float>(i + j); // Simple initialization
        }
    }

    // Device memory allocation
    float* d_input;
    float* d_output;
    checkCudaErrors(cudaMalloc(&d_input, matrix_size));
    checkCudaErrors(cudaMalloc(&d_output, matrix_size));

    // Copy input data from host to device
    checkCudaErrors(cudaMemcpy(d_input, h_input, matrix_size, cudaMemcpyHostToDevice));

    // Launch the softmax kernel: one thread per row, with the block count
    // rounded up so every row is covered.
    int threads_per_block = 64;
    int blocks_per_grid = (SIZE + threads_per_block - 1) / threads_per_block;
    softmaxKernel<<<blocks_per_grid, threads_per_block>>>(d_input, d_output, SIZE);

    // cudaGetLastError reports launch-configuration errors; the synchronize
    // call surfaces errors raised while the kernel was executing.
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaDeviceSynchronize());

    // Copy result back to host
    checkCudaErrors(cudaMemcpy(h_output, d_output, matrix_size, cudaMemcpyDeviceToHost));

    // Print the first row of the output to verify
    std::cout << "Softmax output (first row): ";
    for (int j = 0; j < SIZE; ++j) {
        std::cout << h_output[0][j] << " ";
    }
    std::cout << std::endl;

    // Free device memory
    checkCudaErrors(cudaFree(d_input));
    checkCudaErrors(cudaFree(d_output));
    return 0;
}
// Paste-page footer carried over with the source (not part of the program):
// "Editor is loading..." | "Leave a Comment"