/* Copyright (c) 1993-2017, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

// System includes
#include <stdio.h>
#include <assert.h>
// #include <curand.h>
// #include <cublas_v2.h>

// CUDA runtime
#include <cuda_runtime.h>
#include <cuda_profiler_api.h>

// Define some error checking macros.
#define cudaErrCheck(stat)                       \
   {                                             \
      cudaErrCheck_((stat), __FILE__, __LINE__); \
   }
void cudaErrCheck_(cudaError_t stat, const char *file, int line)
{
   if (stat != cudaSuccess)
   {
      fprintf(stderr, "CUDA Error: %s %s %d\n", cudaGetErrorString(stat), file, line);
   }
}
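// Note: cudaErrCheck_ only reports the error; it does not abort, so execution
// continues past a failing CUDA call.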

// #define cublasErrCheck(stat) { cublasErrCheck_((stat), __FILE__, __LINE__); }
// void cublasErrCheck_(cublasStatus_t stat, const char *file, int line) {
//    if (stat != CUBLAS_STATUS_SUCCESS) {
//       fprintf(stderr, "cuBLAS Error: %d %s %d\n", stat, file, line);
//    }
// }

// #define curandErrCheck(stat) { curandErrCheck_((stat), __FILE__, __LINE__); }
// void curandErrCheck_(curandStatus_t stat, const char *file, int line) {
//    if (stat != CURAND_STATUS_SUCCESS) {
//       fprintf(stderr, "cuRand Error: %d %s %d\n", stat, file, line);
//    }
// }

#include <mma.h>
using namespace nvcuda;
// Must be multiples of 16 for wmma code to work
#define MATRIX_M 32
#define MATRIX_N 32
#define MATRIX_K 32

// The WMMA tile dimensions used by this example (16x16x16 for half inputs)
const int WMMA_M = 16;
const int WMMA_N = 16;
const int WMMA_K = 16;
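// In the WMMA kernel below, each warp cooperatively computes one WMMA_M x WMMA_N
// (16x16) tile of the output, accumulating over K in WMMA_K-wide (16) steps.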

// CUDA cores: naive matrix multiplication kernel, one thread per element of C (row-major)
__global__ void matrixMulKernel(float *A, float *B, float *C, int N)
{
   // printf("----start--kernel---sp--\n");

   int ROW = blockIdx.y * blockDim.y + threadIdx.y;
   int COL = blockIdx.x * blockDim.x + threadIdx.x;

   float tmpSum = 0.0f;
   if (ROW < N && COL < N)
   {
      for (int i = 0; i < N; i++)
      {
         tmpSum += A[ROW * N + i] * B[i * N + COL];
      }
      // Only threads that map to a valid element write a result (avoids out-of-bounds stores)
      C[ROW * N + COL] = tmpSum;
   }
   // printf("---end---kernel---sp--\n");
}
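// Example launch for matrixMulKernel (a sketch; one thread per output element):
//    dim3 block(16, 16);
//    dim3 grid((N + block.x - 1) / block.x, (N + block.y - 1) / block.y);
//    matrixMulKernel<<<grid, block>>>(d_A, d_B, d_C, N);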

// Function to initialize the matrix with some values
// void initializeMatrix(float *mat, int N)
// {
//    for (int i = 0; i < N * N; i++)
//    {
//       mat[i] = rand() % 100; // Assign a random value for simplicity
//    }
// }

// Performs an MxNxK GEMM (C=alpha*A*B + beta*C) assuming:
//  1) Matrices are packed in memory.
//  2) M, N and K are multiples of 16.
//  3) Neither A nor B are transposed.
// Note: This is NOT a high performance example but is for demonstration purposes only
//       For high-performance code please use the GEMM provided in cuBLAS.
__global__ void wmma_example(half *a, half *b, float *c, int M, int N, int K, float alpha, float beta)
{
   // Leading dimensions. Packed with no transpositions.
   int lda = M;
   int ldb = K;
   int ldc = M;

   // Tile using a 2D grid
   int warpM = (blockIdx.x * blockDim.x + threadIdx.x) / warpSize;
   int warpN = (blockIdx.y * blockDim.y + threadIdx.y);
   
   // Declare the fragments
   wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_N, WMMA_K, half, wmma::col_major> a_frag;
   wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_N, WMMA_K, half, wmma::col_major> b_frag;
   wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K, float> acc_frag;
   wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K, float> c_frag;

   wmma::fill_fragment(acc_frag, 0.0f);

   // Loop over k
   // printf("----start--forloop-- threadIdx.x: %d, threadIdx.y: %d\n", threadIdx.x, threadIdx.y);
   for (int i = 0; i < K; i += WMMA_K)
   {
      // printf("----start--target---sp--\n");

      int aRow = warpM * WMMA_M;
      int aCol = i;

      int bRow = i;
      int bCol = warpN * WMMA_N;

      // printf("aRow: %d, aCol: %d, bRow: %d, bCol: %d, i: %d\n", aRow, aCol, bRow, bCol, i);
      // printf("---end---target---sp--\n");

      // Bounds checking
      if (aRow < M && aCol < K && bRow < K && bCol < N)
      {
         // Load the inputs
         // printf("----start--target---tensor--\n");
         wmma::load_matrix_sync(a_frag, a + aRow + aCol * lda, lda);

         wmma::load_matrix_sync(b_frag, b + bRow + bCol * ldb, ldb);

         // Perform the matrix multiplication
         wmma::mma_sync(acc_frag, a_frag, b_frag, acc_frag);
         // printf("---end---target---tensor--\n");
      }
   }

   // Load in the current value of c, scale it by beta, and add our result scaled by alpha
   int cRow = warpM * WMMA_M;
   int cCol = warpN * WMMA_N;

   if (cRow < M && cCol < N)
   {
      wmma::load_matrix_sync(c_frag, c + cRow + cCol * ldc, ldc, wmma::mem_col_major);

#pragma unroll
      for (int i = 0; i < c_frag.num_elements; i++)
      {
         c_frag.x[i] = alpha * acc_frag.x[i] + beta * c_frag.x[i];
      }

      // Store the output
      wmma::store_matrix_sync(c + cRow + cCol * ldc, c_frag, ldc, wmma::mem_col_major);
   }
}
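// Example launch for wmma_example (a sketch; mirrors the grid math used in main()):
//    dim3 block(32, 1);  // one warp per block -> one 16x16 output tile per block
//    dim3 grid((M + WMMA_M * (block.x / 32) - 1) / (WMMA_M * (block.x / 32)),
//              (N + WMMA_N * block.y - 1) / (WMMA_N * block.y));
//    wmma_example<<<grid, block>>>(a_fp16, b_fp16, c_wmma, M, N, K, alpha, beta);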

// Convert an array of fp32 values to fp16, one thread per element
__global__ void convertFp32ToFp16(half *out, float *in, int n)
{
   int idx = blockDim.x * blockIdx.x + threadIdx.x;
   if (idx < n)
   {
      out[idx] = __float2half(in[idx]);
   }
}

int main(int argc, char *argv[])
{
   // Initialize the problem
   //! concurrent
   int nkernels = 2;            // number of concurrent kernels
   int nstreams = nkernels + 1; // use one more stream than concurrent kernels
   // int nbytes = nkernels * sizeof(float_t);  // number of data bytes
   // float kernel_time = 10; // time the kernel should run in ms
   // float elapsed_time;     // timing variables
   int cuda_device = 0;

   cudaDeviceProp deviceProp;
   cudaErrCheck(cudaGetDevice(&cuda_device));

   cudaErrCheck(cudaGetDeviceProperties(&deviceProp, cuda_device));

   if (deviceProp.concurrentKernels == 0)
   {
      printf("> GPU does not support concurrent kernel execution\n");
      printf("  CUDA kernel runs will be serialized\n");
   }
   else
   {
      printf("concurrent kernel: %d\n", deviceProp.concurrentKernels);
   }
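   // Note: even without concurrent kernel support, the per-stream launches below
   // still produce correct results; the kernels are simply serialized.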

   // printf("> Detected Compute SM %d.%d hardware with %d multi-processors\n",
   //   deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);
   //--------------------

   // cuda core
   int N = MATRIX_N; // Define the size of the matrix
   size_t size = N * N * sizeof(float);
   float *h_A, *h_B, *h_C; // host copies of A, B, C
   float *d_A, *d_B, *d_C; // device copies of A, B, C

   // float *a_fp32;
   // float *b_fp32;
   half *a_fp16;
   half *b_fp16;
   // printf("WMMA Example2\n");

   float *c;
   //    float *c_cublas;
   float *c_wmma;

   //    float *c_host_cublas;
   float *c_host_wmma;
   // printf("WMMA Example3\n");

   // cuda core: Allocate space for host copies and setup values
   cudaErrCheck(cudaMallocHost((void **)&h_A, size));
   cudaErrCheck(cudaMallocHost((void **)&h_B, size));
   cudaErrCheck(cudaMallocHost((void **)&h_C, size));
   //    h_A = (float *)malloc(size);
   //    h_B = (float *)malloc(size);
   //    h_C = (float *)malloc(size);

   // Allocate space for device copies
   cudaErrCheck(cudaMalloc((void **)&d_A, size));
   cudaErrCheck(cudaMalloc((void **)&d_B, size));
   cudaErrCheck(cudaMalloc((void **)&d_C, size));
   //    cudaMalloc((void **)&d_A, size);
   //    cudaMalloc((void **)&d_B, size);
   //    cudaMalloc((void **)&d_C, size);

   // Initialize matrices A and B with random values
   for (int i = 0; i < N * N; i++)
   {
      h_A[i] = (float)rand() / (float)RAND_MAX * 100.0; // Assign a random float value between 0 and 100
      h_B[i] = (float)rand() / (float)RAND_MAX * 100.0; // Assign a random float value between 0 and 100
      h_C[i] = 0;
   }
   // initializeMatrix(h_A, N);
   // initializeMatrix(h_B, N);

   // Copy inputs to device
   cudaErrCheck(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice));
   cudaErrCheck(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice));
   cudaErrCheck(cudaMemcpy(d_C, h_C, size, cudaMemcpyHostToDevice));
   //    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
   //    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
   
   // stream create
   // allocate and initialize an array of stream handles

   //! concurrent
   cudaStream_t *streams =
       (cudaStream_t *)malloc(nstreams * sizeof(cudaStream_t));

   for (int i = 0; i < nstreams; i++)
   {
      cudaErrCheck(cudaStreamCreate(&(streams[i])));
   }
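   // Streams created with cudaStreamCreate are "blocking" streams: work in different
   // non-default streams may overlap with each other, but still synchronizes
   // implicitly with the legacy default (NULL) stream.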

   //    curandGenerator_t gen;
   //    cublasHandle_t cublasHandle;

   cudaEvent_t startWMMA;
   cudaEvent_t stopWMMA;

   //    cudaEvent_t startcublas;
   //    cudaEvent_t stopcublas;

   cudaErrCheck(cudaEventCreate(&startWMMA));
   cudaErrCheck(cudaEventCreate(&stopWMMA));

   //    cudaErrCheck(cudaEventCreate(&startcublas));
   //    cudaErrCheck(cudaEventCreate(&stopcublas));

   //    cublasErrCheck(cublasCreate(&cublasHandle));

   // Use tensor cores
   //    cublasErrCheck(cublasSetMathMode(cublasHandle, CUBLAS_TENSOR_OP_MATH));

   // cudaErrCheck(cudaMalloc((void **)&a_fp32, MATRIX_M * MATRIX_K * sizeof(float)));
   // cudaErrCheck(cudaMalloc((void **)&b_fp32, MATRIX_K * MATRIX_N * sizeof(float)));
   cudaErrCheck(cudaMalloc((void **)&a_fp16, MATRIX_M * MATRIX_K * sizeof(half)));
   cudaErrCheck(cudaMalloc((void **)&b_fp16, MATRIX_K * MATRIX_N * sizeof(half)));

   cudaErrCheck(cudaMalloc((void **)&c, MATRIX_M * MATRIX_N * sizeof(float)));
   //    cudaErrCheck(cudaMalloc((void**)&c_cublas, MATRIX_M * MATRIX_N * sizeof(float)));
   cudaErrCheck(cudaMalloc((void **)&c_wmma, MATRIX_M * MATRIX_N * sizeof(float)));

   //    c_host_cublas = (float*)malloc(MATRIX_M * MATRIX_N * sizeof(float));
   c_host_wmma = (float *)malloc(MATRIX_M * MATRIX_N * sizeof(float));

   //    curandErrCheck(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT));
   //    curandErrCheck(curandSetPseudoRandomGeneratorSeed(gen, 1337ULL));

   //    curandErrCheck(curandGenerateUniform(gen, a_fp32, MATRIX_M * MATRIX_K));
   //    curandErrCheck(curandGenerateUniform(gen, b_fp32, MATRIX_K * MATRIX_N));

   // The WMMA path needs fp16 inputs, so convert the fp32 matrices.
   // (The original sample generated fp32 data with cuRAND because cuRAND does not generate fp16 directly.)
   printf("Converting to fp16...\n");
   // printf("Current cycle time:");

   //* sequential
   // convertFp32ToFp16<<<(MATRIX_M * MATRIX_K + 31) / 32, 32, 0, 0>>>(a_fp16, d_A, MATRIX_M * MATRIX_K);
   // convertFp32ToFp16<<<(MATRIX_K * MATRIX_N + 31) / 32, 32, 0, 0>>>(b_fp16, d_B, MATRIX_K * MATRIX_N);

   //! concurrent
   // Convert the device copies (d_A, d_B) uploaded above; the two conversions run in separate streams
   convertFp32ToFp16<<<(MATRIX_M * MATRIX_K + 31) / 32, 32, 0, streams[0]>>>(a_fp16, d_A, MATRIX_M * MATRIX_K);
   convertFp32ToFp16<<<(MATRIX_K * MATRIX_N + 31) / 32, 32, 0, streams[1]>>>(b_fp16, d_B, MATRIX_K * MATRIX_N);
   
   // printf("Current cycle time done:");

   // printf("Current cycle time: %f\n", getCycleTime());
   
   printf("Converting to fp16... DONE\n");

   //    curandErrCheck(curandGenerateUniform(gen, c, MATRIX_M * MATRIX_N));

   //    curandErrCheck(curandDestroyGenerator(gen));

   //    cudaErrCheck(cudaMemcpy(c_cublas, c, MATRIX_M * MATRIX_N * sizeof(float), cudaMemcpyDeviceToDevice));
   // The cuRAND initialization of c is disabled above, so zero it explicitly before seeding c_wmma
   cudaErrCheck(cudaMemset(c, 0, MATRIX_M * MATRIX_N * sizeof(float)));
   cudaErrCheck(cudaMemcpy(c_wmma, c, MATRIX_M * MATRIX_N * sizeof(float), cudaMemcpyDeviceToDevice));

   float alpha = 2.0f;
   float beta = 2.0f;

   printf("\nM = %d, N = %d, K = %d. alpha = %f, beta = %f\n\n", MATRIX_M, MATRIX_N, MATRIX_K, alpha, beta);


   // First: using WMMA
   dim3 gridDim;
   dim3 blockDim;

   // blockDim.x must be a multiple of warpSize
   // 32x1 gives one warp per block, so each block computes a single 16x16 output tile
   // (128x4 would give 16 warps per block, i.e. a 64x64 output tile)
   blockDim.x = 32;
   blockDim.y = 1;
   // blockDim.x = 64;
   // blockDim.y = 1;
   // blockDim.x = 128;
   // blockDim.y = 4;

   gridDim.x = (MATRIX_M + (WMMA_M * blockDim.x / 32 - 1)) / (WMMA_M * blockDim.x / 32);
   gridDim.y = (MATRIX_N + WMMA_N * blockDim.y - 1) / (WMMA_N * blockDim.y);
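   // With MATRIX_M = MATRIX_N = 32, WMMA_M = WMMA_N = 16 and a 32x1 block (one warp),
   // this evaluates to gridDim.x = 2 and gridDim.y = 2: a 2x2 grid of 16x16 output tiles.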
   printf("gridDim.x = %d, gridDim.y = %d\n", gridDim.x, gridDim.y);
   printf("Running with wmma...\n");
   cudaErrCheck(cudaEventRecord(startWMMA));

   //* sequential
   // wmma_example<<<gridDim, blockDim, 0, 0>>>(a_fp16, b_fp16, c_wmma, MATRIX_M, MATRIX_N, MATRIX_K, alpha, beta);
   // matrixMulKernel<<<gridDim, blockDim, 0, 0>>>(d_A, d_B, d_C, N);

   //! concurrent
   // The two kernels write to different buffers (c_wmma vs. d_C), so they can safely overlap
   wmma_example<<<gridDim, blockDim, 0, streams[0]>>>(a_fp16, b_fp16, c_wmma, MATRIX_M, MATRIX_N, MATRIX_K, alpha, beta);

   // matrixMulKernel needs one thread per element of the NxN output, so give it its own launch configuration
   dim3 mulBlock(16, 16);
   dim3 mulGrid((N + mulBlock.x - 1) / mulBlock.x, (N + mulBlock.y - 1) / mulBlock.y);
   matrixMulKernel<<<mulGrid, mulBlock, 0, streams[1]>>>(d_A, d_B, d_C, N);
   
   printf("Running with wmma...done\n");
   cudaErrCheck(cudaEventRecord(stopWMMA));
   cudaErrCheck(cudaEventSynchronize(stopWMMA));

   //! concurrent
   cudaErrCheck(cudaStreamSynchronize(streams[0]));
   cudaErrCheck(cudaStreamSynchronize(streams[1]));
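   // Note: stopWMMA was recorded on the legacy default (NULL) stream, which does not
   // begin until previously issued work in the blocking streams above has completed,
   // so the event timing spans both concurrent kernels; the explicit stream
   // synchronizations are kept for clarity.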

   // Now using cuBLAS
   //    printf("Running with cuBLAS...\n");
   // Warm up cuBLAS run starts
   //    cublasErrCheck(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
   //                 MATRIX_M, MATRIX_N, MATRIX_K,
   //                 &alpha,
   //                 a_fp16, CUDA_R_16F, MATRIX_M,
   //                 b_fp16, CUDA_R_16F, MATRIX_K,
   //                 &beta,
   //                 c_cublas, CUDA_R_32F, MATRIX_M,
   //                 CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
   // Warm up cuBLAS run ends

   // reset the c_cublas buffer
   //    cudaErrCheck(cudaMemcpy(c_cublas, c, MATRIX_M * MATRIX_N * sizeof(float), cudaMemcpyDeviceToDevice));

   //    cudaErrCheck(cudaEventRecord(startcublas));
   //    cublasErrCheck(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
   //                 MATRIX_M, MATRIX_N, MATRIX_K,
   //                 &alpha,
   //                 a_fp16, CUDA_R_16F, MATRIX_M,
   //                 b_fp16, CUDA_R_16F, MATRIX_K,
   //                 &beta,
   //                 c_cublas, CUDA_R_32F, MATRIX_M,
   //                 CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
   //    cudaErrCheck(cudaEventRecord(stopcublas));
   //    cudaErrCheck(cudaEventSynchronize(stopcublas));

   // Copy results back to the host (the cuBLAS comparison below is commented out)
   printf("\nChecking results...\n");

   //* sequential
   // cudaErrCheck(cudaMemcpy(c_host_wmma, c_wmma, MATRIX_M * MATRIX_N * sizeof(float), cudaMemcpyDeviceToHost));
   // cudaErrCheck(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost));

   //! concurrent
   // c_host_wmma is pageable (plain malloc), so its async copy falls back to a synchronous
   // transfer; h_C is pinned (cudaMallocHost), so its copy can overlap with other work.
   cudaErrCheck(cudaMemcpyAsync(c_host_wmma, c_wmma, MATRIX_M * MATRIX_N * sizeof(float), cudaMemcpyDeviceToHost, streams[nstreams - 1]));
   cudaErrCheck(cudaMemcpyAsync(h_C, d_C, size, cudaMemcpyDeviceToHost, streams[nstreams - 1]));
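   // Make sure the asynchronous result copies have completed before the host
   // buffers are read or freed below.
   cudaErrCheck(cudaStreamSynchronize(streams[nstreams - 1]));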
   
   //    cudaErrCheck(cudaMemcpy(c_host_cublas, c_cublas, MATRIX_M * MATRIX_N * sizeof(float), cudaMemcpyDeviceToHost));

   // 0.01% relative tolerance. 1e-5 absolute tolerance.
   //    int errors = 0;
   //    for (int i = 0; i < MATRIX_M * MATRIX_N; i++) {
   //       float v1 = c_host_wmma[i];
   //       float v2 = c_host_cublas[i];
   //       float diff  = fabs(v1 - v2);
   //       float relative_err = diff / v2;
   //       float eps = 1e-4;
   //       if ((relative_err >= eps)) {
   //          errors++;
   //          if (errors < 10) printf("%f %f\n", v1, v2);
   //       }
   //    }

   //    if (errors > 0) {
   //       printf("WMMA does not agree with cuBLAS! %d errors!\n", errors);
   //    }
   //    else {
   //       printf("Results verified: cublas and WMMA agree.\n\n");
   //       float wmmaTime;
   //       float cublasTime;
   //       cudaErrCheck(cudaEventElapsedTime(&wmmaTime, startWMMA, stopWMMA));
   //       cudaErrCheck(cudaEventElapsedTime(&cublasTime, startcublas, stopcublas));
   //       printf("wmma took %fms\n", wmmaTime);
   //       printf("cublas took %fms\n", cublasTime);

   //       printf("\nFor a faster code using wmma you should check out the cudaTensorCoreGemm sample in the CUDA Toolkit.\nThis code was written as a demo only!\n\n");
   //    }
   float milliseconds = 0;
   cudaErrCheck(cudaEventElapsedTime(&milliseconds, startWMMA, stopWMMA));
   printf("Kernel execution time: %f ms\n", milliseconds);

   //! concurrent
   for (int i = 0; i < nstreams; i++)
   {
      cudaErrCheck(cudaStreamDestroy(streams[i]));
   }
   free(streams);

   cudaErrCheck(cudaEventDestroy(startWMMA));
   cudaErrCheck(cudaEventDestroy(stopWMMA));

   //    cudaErrCheck(cudaEventDestroy(startcublas));
   //    cudaErrCheck(cudaEventDestroy(stopcublas));

   // cudaErrCheck(cudaFree(a_fp32));
   // cudaErrCheck(cudaFree(b_fp32));
   cudaErrCheck(cudaFree(a_fp16));
   cudaErrCheck(cudaFree(b_fp16));

   cudaErrCheck(cudaFree(c));
   //    cudaErrCheck(cudaFree(c_cublas));
   cudaErrCheck(cudaFree(c_wmma));

   // cuda core
   cudaErrCheck(cudaFree(d_A));
   cudaErrCheck(cudaFree(d_B));
   cudaErrCheck(cudaFree(d_C));
   cudaErrCheck(cudaFreeHost(h_A)); // h_A/h_B/h_C were allocated with cudaMallocHost
   cudaErrCheck(cudaFreeHost(h_B));
   cudaErrCheck(cudaFreeHost(h_C));

   //    free(c_host_cublas);
   free(c_host_wmma);

   cudaErrCheck(cudaDeviceReset());
   return 0;
}