sequential
/* Copyright (c) 1993-2017, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

// System includes
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
// #include <curand.h>
// #include <cublas_v2.h>

// CUDA runtime
#include <cuda_runtime.h>
#include <cuda_profiler_api.h>

// Define some error checking macros.
#define cudaErrCheck(stat)                         \
    {                                              \
        cudaErrCheck_((stat), __FILE__, __LINE__); \
    }
void cudaErrCheck_(cudaError_t stat, const char *file, int line)
{
    if (stat != cudaSuccess)
    {
        fprintf(stderr, "CUDA Error: %s %s %d\n", cudaGetErrorString(stat), file, line);
    }
}

// #define cublasErrCheck(stat) { cublasErrCheck_((stat), __FILE__, __LINE__); }
// void cublasErrCheck_(cublasStatus_t stat, const char *file, int line) {
//     if (stat != CUBLAS_STATUS_SUCCESS) {
//         fprintf(stderr, "cuBLAS Error: %d %s %d\n", stat, file, line);
//     }
// }

// #define curandErrCheck(stat) { curandErrCheck_((stat), __FILE__, __LINE__); }
// void curandErrCheck_(curandStatus_t stat, const char *file, int line) {
//     if (stat != CURAND_STATUS_SUCCESS) {
//         fprintf(stderr, "cuRand Error: %d %s %d\n", stat, file, line);
//     }
// }

#include <mma.h>
using namespace nvcuda;

// Must be multiples of 16 for the wmma code to work
#define MATRIX_M 32
#define MATRIX_N 32
#define MATRIX_K 32

// The only dimensions currently supported by WMMA
const int WMMA_M = 16;
const int WMMA_N = 16;
const int WMMA_K = 16;

// cuda core: CUDA kernel for matrix multiplication
__global__ void matrixMulKernel(float *A, float *B, float *C, int N)
{
    // printf("----start--kernel---sp--\n");
    int ROW = blockIdx.y * blockDim.y + threadIdx.y;
    int COL = blockIdx.x * blockDim.x + threadIdx.x;

    if (ROW < N && COL < N)
    {
        float tmpSum = 0.0f;
        for (int i = 0; i < N; i++)
        {
            tmpSum += A[ROW * N + i] * B[i * N + COL];
        }
        // Store only inside the bounds check to avoid an out-of-range write.
        C[ROW * N + COL] = tmpSum;
    }
    // printf("---end---kernel---sp--\n");
}

// Function to initialize the matrix with some values
// void initializeMatrix(float *mat, int N)
// {
//     for (int i = 0; i < N * N; i++)
//     {
//         mat[i] = rand() % 100; // Assign a random value for simplicity
//     }
// }

// Performs an MxNxK GEMM (C = alpha*A*B + beta*C) assuming:
//  1) Matrices are packed in memory.
//  2) M, N and K are multiples of 16.
//  3) Neither A nor B are transposed.
// Note: This is NOT a high performance example; it is for demonstration purposes only.
// For high performance code please use the GEMM provided in cuBLAS.
__global__ void wmma_example(half *a, half *b, float *c, int M, int N, int K, float alpha, float beta)
{
    // Leading dimensions. Packed with no transpositions.
    int lda = M;
    int ldb = K;
    int ldc = M;

    // Tile using a 2D grid
    int warpM = (blockIdx.x * blockDim.x + threadIdx.x) / warpSize;
    int warpN = (blockIdx.y * blockDim.y + threadIdx.y);

    // Declare the fragments
    wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_N, WMMA_K, half, wmma::col_major> a_frag;
    wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_N, WMMA_K, half, wmma::col_major> b_frag;
    wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K, float> acc_frag;
    wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K, float> c_frag;

    wmma::fill_fragment(acc_frag, 0.0f);

    // Loop over k
    // printf("----start--forloop-- threadIdx.x: %d, threadIdx.y: %d\n", threadIdx.x, threadIdx.y);
    for (int i = 0; i < K; i += WMMA_K)
    {
        // printf("----start--target---sp--\n");
        int aRow = warpM * WMMA_M;
        int aCol = i;
        int bRow = i;
        int bCol = warpN * WMMA_N;
        // printf("aRow: %d, aCol: %d, bRow: %d, bCol: %d, i: %d\n", aRow, aCol, bRow, bCol, i);
        // printf("---end---target---sp--\n");

        // Bounds checking
        if (aRow < M && aCol < K && bRow < K && bCol < N)
        {
            // Load the inputs
            // printf("----start--target---tensor--\n");
            wmma::load_matrix_sync(a_frag, a + aRow + aCol * lda, lda);
            wmma::load_matrix_sync(b_frag, b + bRow + bCol * ldb, ldb);

            // Perform the matrix multiplication
            wmma::mma_sync(acc_frag, a_frag, b_frag, acc_frag);
            // printf("---end---target---tensor--\n");
        }
    }

    // Load in the current value of c, scale it by beta, and add our result scaled by alpha
    int cRow = warpM * WMMA_M;
    int cCol = warpN * WMMA_N;

    if (cRow < M && cCol < N)
    {
        wmma::load_matrix_sync(c_frag, c + cRow + cCol * ldc, ldc, wmma::mem_col_major);

#pragma unroll
        for (int i = 0; i < c_frag.num_elements; i++)
        {
            c_frag.x[i] = alpha * acc_frag.x[i] + beta * c_frag.x[i];
        }

        // Store the output
        wmma::store_matrix_sync(c + cRow + cCol * ldc, c_frag, ldc, wmma::mem_col_major);
    }
}

__global__ void convertFp32ToFp16(half *out, float *in, int n)
{
    // count cycles
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if (idx < n)
    {
        out[idx] = in[idx];
    }
}

int main(int argc, char *argv[])
{
    // Initialize the problem
    int nkernels = 2;            // number of concurrent kernels
    int nstreams = nkernels + 1; // use one more stream than concurrent kernels
    // int nbytes = nkernels * sizeof(float); // number of data bytes
    float kernel_time = 10; // time the kernel should run in ms
    float elapsed_time;     // timing variables
    int cuda_device = 0;

    cudaDeviceProp deviceProp;
    cudaErrCheck(cudaGetDevice(&cuda_device));
    cudaErrCheck(cudaGetDeviceProperties(&deviceProp, cuda_device));

    if (deviceProp.concurrentKernels == 0)
    {
        // printf("> GPU does not support concurrent kernel execution\n");
        // printf("  CUDA kernel runs will be serialized\n");
    }
    else
    {
        // printf("concurrent kernel: %d\n", deviceProp.concurrentKernels);
    }
    // printf("> Detected Compute SM %d.%d hardware with %d multi-processors\n",
    //        deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);

    //--------------------
    // cuda core
    int N = MATRIX_N; // Define the size of the matrix
    size_t size = N * N * sizeof(float);

    float *h_A, *h_B, *h_C; // host copies of A, B, C
    float *d_A, *d_B, *d_C; // device copies of A, B, C

    float *a_fp32;
    float *b_fp32;
    half *a_fp16;
    half *b_fp16;

    // printf("WMMA Example2\n");
    float *c;
    // float *c_cublas;
    float *c_wmma;

    // float *c_host_cublas;
    float *c_host_wmma;
    // printf("WMMA Example3\n");

    // cuda core: Allocate pinned host copies and set up values
    cudaErrCheck(cudaMallocHost((void **)&h_A, size));
    cudaErrCheck(cudaMallocHost((void **)&h_B, size));
    cudaErrCheck(cudaMallocHost((void **)&h_C, size));
    // h_A = (float *)malloc(size);
    // h_B = (float *)malloc(size);
    // h_C = (float *)malloc(size);

    // Allocate space for device copies
    cudaErrCheck(cudaMalloc((void **)&d_A, size));
    cudaErrCheck(cudaMalloc((void **)&d_B, size));
    cudaErrCheck(cudaMalloc((void **)&d_C, size));
    // cudaMalloc((void **)&d_A, size);
    // cudaMalloc((void **)&d_B, size);
    // cudaMalloc((void **)&d_C, size);

    // Initialize matrices A and B with random values
    for (int i = 0; i < N * N; i++)
    {
        h_A[i] = (float)rand() / (float)RAND_MAX * 100.0f; // random float value between 0 and 100
        h_B[i] = (float)rand() / (float)RAND_MAX * 100.0f; // random float value between 0 and 100
        h_C[i] = 1.5f;
    }
    // initializeMatrix(h_A, N);
    // initializeMatrix(h_B, N);

    // Copy inputs to device
    cudaErrCheck(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice));
    cudaErrCheck(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice));
    cudaErrCheck(cudaMemcpy(d_C, h_C, size, cudaMemcpyHostToDevice));
    // cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    // cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

    // stream create
    // allocate and initialize an array of stream handles
    // cudaStream_t *streams =
    //     (cudaStream_t *)malloc(nstreams * sizeof(cudaStream_t));
    // for (int i = 0; i < nstreams; i++) {
    //     cudaErrCheck(cudaStreamCreate(&(streams[i])));
    // }
    // cudaStream_t stream1, stream2;
    // cudaStreamCreate(&stream1);
    // cudaStreamCreate(&stream2);

    // cudaEvent_t startWMMA;
    // cudaEvent_t stopWMMA;
    // cudaErrCheck(cudaEventCreate(&startWMMA));
    // cudaErrCheck(cudaEventCreate(&stopWMMA));

    // cudaErrCheck(cudaMalloc((void **)&a_fp32, MATRIX_M * MATRIX_K * sizeof(float)));
    // cudaErrCheck(cudaMalloc((void **)&b_fp32, MATRIX_K * MATRIX_N * sizeof(float)));
    cudaErrCheck(cudaMalloc((void **)&a_fp16, MATRIX_M * MATRIX_K * sizeof(half)));
    cudaErrCheck(cudaMalloc((void **)&b_fp16, MATRIX_K * MATRIX_N * sizeof(half)));

    // cudaErrCheck(cudaMalloc((void **)&c, MATRIX_M * MATRIX_N * sizeof(float)));
    // cudaErrCheck(cudaMalloc((void **)&c_cublas, MATRIX_M * MATRIX_N * sizeof(float)));
    // cudaErrCheck(cudaMalloc((void **)&c_wmma, MATRIX_M * MATRIX_N * sizeof(float)));

    // c_host_cublas = (float *)malloc(MATRIX_M * MATRIX_N * sizeof(float));
    // c_host_wmma = (float *)malloc(MATRIX_M * MATRIX_N * sizeof(float));

    // curandErrCheck(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT));
    // curandErrCheck(curandSetPseudoRandomGeneratorSeed(gen, 1337ULL));
    // curandErrCheck(curandGenerateUniform(gen, a_fp32, MATRIX_M * MATRIX_K));
    // curandErrCheck(curandGenerateUniform(gen, b_fp32, MATRIX_K * MATRIX_N));

    // curand doesn't currently support fp16, so we generate in fp32 and convert to fp16.
    // printf("Converting to fp16...\n");
    // printf("Current cycle time:");

    // Note: h_A/h_B are pinned host buffers (cudaMallocHost); the conversion kernels
    // below read them directly from the device rather than from d_A/d_B.
    //! concurrent
    // convertFp32ToFp16<<<(MATRIX_M * MATRIX_K + 127) / 128, 128, 0, streams[0]>>>(a_fp16, h_A, MATRIX_M * MATRIX_K);
    //* sequential
    // convertFp32ToFp16<<<(MATRIX_M * MATRIX_K + 127) / 128, 128, 0, 0>>>(a_fp16, h_A, MATRIX_M * MATRIX_K);
    convertFp32ToFp16<<<(MATRIX_M * MATRIX_K + 31) / 32, 32, 0, 0>>>(a_fp16, h_A, MATRIX_M * MATRIX_K);
    // convertFp32ToFp16<<<(MATRIX_M * MATRIX_K + 255) / 256, 256>>>(a_fp16, a_fp32, MATRIX_M * MATRIX_K);
    // printf("Current cycle time done:");
    // printf("Current cycle time: %f\n", getCycleTime());

    //! concurrent
    // convertFp32ToFp16<<<(MATRIX_K * MATRIX_N + 127) / 128, 128, 0, streams[1]>>>(b_fp16, h_B, MATRIX_K * MATRIX_N);
    //* sequential
    convertFp32ToFp16<<<(MATRIX_K * MATRIX_N + 31) / 32, 32, 0, 0>>>(b_fp16, h_B, MATRIX_K * MATRIX_N);
    // convertFp32ToFp16<<<(MATRIX_K * MATRIX_N + 127) / 128, 128, 0, 0>>>(b_fp16, h_B, MATRIX_K * MATRIX_N);
    // convertFp32ToFp16<<<(MATRIX_K * MATRIX_N + 255) / 256, 256>>>(b_fp16, b_fp32, MATRIX_K * MATRIX_N);
    // printf("Converting to fp16... DONE\n");

    // curandErrCheck(curandGenerateUniform(gen, c, MATRIX_M * MATRIX_N));
    // curandErrCheck(curandDestroyGenerator(gen));

    // cudaErrCheck(cudaMemcpy(c_cublas, c, MATRIX_M * MATRIX_N * sizeof(float), cudaMemcpyDeviceToDevice));
    // cudaErrCheck(cudaMemcpy(c_wmma, h_C, MATRIX_M * MATRIX_N * sizeof(float), cudaMemcpyHostToDevice));

    float alpha = 2.0f;
    float beta = 2.0f;
    // printf("\nM = %d, N = %d, K = %d. alpha = %f, beta = %f\n\n", MATRIX_M, MATRIX_N, MATRIX_K, alpha, beta);

    // First: using WMMA
    dim3 gridDim;
    dim3 blockDim;

    // blockDim.x must be a multiple of warpSize
    // 128x4 means we have 16 warps and a block computes a 64x64 output tile
    blockDim.x = 32;
    blockDim.y = 1;
    // blockDim.x = 128;
    // blockDim.y = 1;
    // blockDim.x = 64;
    // blockDim.y = 1;
    // blockDim.x = 128;
    // blockDim.y = 4;

    gridDim.x = (MATRIX_M + (WMMA_M * blockDim.x / 32 - 1)) / (WMMA_M * blockDim.x / 32);
    gridDim.y = (MATRIX_N + WMMA_N * blockDim.y - 1) / (WMMA_N * blockDim.y);
    // printf("gridDim.x = %d, gridDim.y = %d\n", gridDim.x, gridDim.y);

    // printf("Running with wmma...\n");
    // cudaErrCheck(cudaEventRecord(startWMMA));
    // wmma_example<<<gridDim, blockDim>>>(a_fp16, b_fp16, c_wmma, MATRIX_M, MATRIX_N, MATRIX_K, alpha, beta);
    // matrixMulKernel<<<gridDim, blockDim>>>(d_A, d_B, d_C, N);

    //* sequential: both kernels go to the default stream, so they run back to back.
    // Note that both kernels write to d_C, and the launch geometry is sized for
    // wmma_example; with blockDim.y = 1, matrixMulKernel only covers
    // gridDim.y * blockDim.y rows of C.
    wmma_example<<<gridDim, blockDim, 0, 0>>>(a_fp16, b_fp16, d_C, MATRIX_M, MATRIX_N, MATRIX_K, alpha, beta);
    matrixMulKernel<<<gridDim, blockDim, 0, 0>>>(d_A, d_B, d_C, N);

    //! concurrent
    // wmma_example<<<gridDim, blockDim, 0, streams[0]>>>(a_fp16, b_fp16, d_C, MATRIX_M, MATRIX_N, MATRIX_K, alpha, beta);
    // matrixMulKernel<<<gridDim, blockDim, 0, streams[1]>>>(d_A, d_B, d_C, N);

    // printf("Running with wmma...done\n");
    // cudaErrCheck(cudaEventRecord(stopWMMA));
    // cudaErrCheck(cudaEventSynchronize(stopWMMA));
    // cudaStreamSynchronize(streams[0]);
    // cudaStreamSynchronize(streams[1]);

    // Error checking
    // printf("\nChecking results...\n");
    // cudaErrCheck(cudaMemcpy(c_host_wmma, c_wmma, MATRIX_M * MATRIX_N * sizeof(float), cudaMemcpyDeviceToHost));

    //* sequential
    cudaErrCheck(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost));
    //! concurrent
    // cudaErrCheck(cudaMemcpyAsync(h_C, d_C, size, cudaMemcpyDeviceToHost, streams[nstreams - 1]));
    // cudaErrCheck(cudaMemcpy(c_host_cublas, c_cublas, MATRIX_M * MATRIX_N * sizeof(float), cudaMemcpyDeviceToHost));

    // cudaStreamDestroy(streams[0]);
    // cudaStreamDestroy(streams[1]);

    // cudaErrCheck(cudaEventDestroy(startWMMA));
    // cudaErrCheck(cudaEventDestroy(stopWMMA));
    // cudaErrCheck(cudaEventDestroy(startcublas));
    // cudaErrCheck(cudaEventDestroy(stopcublas));

    // cudaErrCheck(cudaFree(a_fp32));
    // cudaErrCheck(cudaFree(b_fp32));
    cudaErrCheck(cudaFree(a_fp16));
    cudaErrCheck(cudaFree(b_fp16));
    // cudaErrCheck(cudaFree(c));
    // cudaErrCheck(cudaFree(c_cublas));
    // cudaErrCheck(cudaFree(c_wmma));

    // cuda core
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    // h_A/h_B/h_C were allocated with cudaMallocHost, so release them with cudaFreeHost.
    cudaErrCheck(cudaFreeHost(h_A));
    cudaErrCheck(cudaFreeHost(h_B));
    cudaErrCheck(cudaFreeHost(h_C));
    // free(c_host_cublas);
    // free(c_host_wmma);

    cudaErrCheck(cudaDeviceReset());

    return 0;
}
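A minimal build-and-run sketch, assuming the paste is saved as sequential.cu and the target GPU has Tensor Cores (the WMMA API requires compute capability 7.0 or newer); the file name and architecture flag are assumptions, not part of the original paste:

    nvcc -arch=sm_70 -o sequential sequential.cu
    ./sequential

The commented "//! concurrent" lines hint at a stream-based variant of the same run. Pieced together as a hypothetical helper, assuming the kernels, the cudaErrCheck macro, and the buffers defined above are in scope (it is not part of the original paste), it would look roughly like this:

    // Hypothetical concurrent launch, reassembled from the "//! concurrent"
    // comments in the paste. Both kernels still write to d_C, exactly as in
    // the sequential path, so the final contents depend on ordering.
    void launchConcurrent(half *a_fp16, half *b_fp16,
                          float *d_A, float *d_B, float *d_C,
                          float *h_C, size_t size,
                          dim3 gridDim, dim3 blockDim,
                          int M, int N, int K, float alpha, float beta)
    {
        const int nstreams = 3; // nkernels + 1, as in main()
        cudaStream_t streams[nstreams];
        for (int i = 0; i < nstreams; i++)
            cudaErrCheck(cudaStreamCreate(&streams[i]));

        // One stream per kernel so they may overlap on devices where
        // deviceProp.concurrentKernels != 0.
        wmma_example<<<gridDim, blockDim, 0, streams[0]>>>(a_fp16, b_fp16, d_C, M, N, K, alpha, beta);
        matrixMulKernel<<<gridDim, blockDim, 0, streams[1]>>>(d_A, d_B, d_C, N);

        // Wait for both kernels, then copy the result back on the extra stream.
        cudaErrCheck(cudaStreamSynchronize(streams[0]));
        cudaErrCheck(cudaStreamSynchronize(streams[1]));
        cudaErrCheck(cudaMemcpyAsync(h_C, d_C, size, cudaMemcpyDeviceToHost, streams[nstreams - 1]));
        cudaErrCheck(cudaStreamSynchronize(streams[nstreams - 1]));

        for (int i = 0; i < nstreams; i++)
            cudaErrCheck(cudaStreamDestroy(streams[i]));
    }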