/* Copyright (c) 1993-2017, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

// System includes
#include <stdio.h>
#include <assert.h>
// #include <curand.h>
// #include <cublas_v2.h>

// CUDA runtime
#include <cuda_runtime.h>
#include <cuda_profiler_api.h>

// Define some error checking macros.
#define cudaErrCheck(stat)                       \
   do {                                          \
      cudaErrCheck_((stat), __FILE__, __LINE__); \
   } while (0)
void cudaErrCheck_(cudaError_t stat, const char *file, int line)
{
   if (stat != cudaSuccess)
   {
      fprintf(stderr, "CUDA Error: %s %s %d\n", cudaGetErrorString(stat), file, line);
   }
}
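
// Typical usage (a minimal sketch): wrap any call that returns cudaError_t,
// e.g. cudaErrCheck(cudaMalloc((void **)&p, bytes)); on failure the error
// string plus the file and line of the failing call are printed to stderr.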

// #define cublasErrCheck(stat) { cublasErrCheck_((stat), __FILE__, __LINE__); }
// void cublasErrCheck_(cublasStatus_t stat, const char *file, int line) {
//    if (stat != CUBLAS_STATUS_SUCCESS) {
//       fprintf(stderr, "cuBLAS Error: %d %s %d\n", stat, file, line);
//    }
// }

// #define curandErrCheck(stat) { curandErrCheck_((stat), __FILE__, __LINE__); }
// void curandErrCheck_(curandStatus_t stat, const char *file, int line) {
//    if (stat != CURAND_STATUS_SUCCESS) {
//       fprintf(stderr, "cuRand Error: %d %s %d\n", stat, file, line);
//    }
// }

#include <mma.h>
using namespace nvcuda;

// Must be multiples of 16 for wmma code to work
#define MATRIX_M 32
#define MATRIX_N 32
#define MATRIX_K 32

// The only dimensions currently supported by WMMA
const int WMMA_M = 16;
const int WMMA_N = 16;
const int WMMA_K = 16;
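
// In wmma_example below, each warp computes one WMMA_M x WMMA_N (16x16) output
// tile, so this 32x32 problem needs (32/16) * (32/16) = 4 warps in total.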

// CUDA-core kernel: naive matrix multiplication, one thread per output element
__global__ void matrixMulKernel(float *A, float *B, float *C, int N)
{
   int ROW = blockIdx.y * blockDim.y + threadIdx.y;
   int COL = blockIdx.x * blockDim.x + threadIdx.x;

   if (ROW < N && COL < N)
   {
      float tmpSum = 0.0f;
      for (int i = 0; i < N; i++)
      {
         tmpSum += A[ROW * N + i] * B[i * N + COL];
      }
      // Store inside the bounds check so out-of-range threads
      // don't write past the end of C.
      C[ROW * N + COL] = tmpSum;
   }
}
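
// Illustrative only (not called from main): a minimal sketch of the launch
// configuration matrixMulKernel expects, i.e. one thread per output element.
// The 16x16 block size is an assumption for illustration; main() below reuses
// the WMMA launch configuration instead, which covers only part of the matrix.
static void launchMatrixMulNaive(float *d_A, float *d_B, float *d_C, int N)
{
   dim3 threads(16, 16);
   dim3 blocks((N + threads.x - 1) / threads.x, (N + threads.y - 1) / threads.y);
   matrixMulKernel<<<blocks, threads>>>(d_A, d_B, d_C, N);
}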

// Function to initialize the matrix with some values
// void initializeMatrix(float *mat, int N)
// {
//    for (int i = 0; i < N * N; i++)
//    {
//       mat[i] = rand() % 100; // Assign a random value for simplicity
//    }
// }

// Performs an MxNxK GEMM (C=alpha*A*B + beta*C) assuming:
//  1) Matrices are packed in memory.
//  2) M, N and K are multiples of 16.
//  3) Neither A nor B are transposed.
// Note: This is NOT a high performance example but is for demonstration purposes only
//       For a high performance code please use the GEMM provided in cuBLAS.
__global__ void wmma_example(half *a, half *b, float *c, int M, int N, int K, float alpha, float beta)
{
   // Leading dimensions. Packed with no transpositions.
   int lda = M;
   int ldb = K;
   int ldc = M;

   // Tile using a 2D grid
   int warpM = (blockIdx.x * blockDim.x + threadIdx.x) / warpSize;
   int warpN = (blockIdx.y * blockDim.y + threadIdx.y);

   // Declare the fragments
   wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_N, WMMA_K, half, wmma::col_major> a_frag;
   wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_N, WMMA_K, half, wmma::col_major> b_frag;
   wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K, float> acc_frag;
   wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K, float> c_frag;

   wmma::fill_fragment(acc_frag, 0.0f);

   // Loop over K in steps of WMMA_K
   for (int i = 0; i < K; i += WMMA_K)
   {
      int aRow = warpM * WMMA_M;
      int aCol = i;

      int bRow = i;
      int bCol = warpN * WMMA_N;

      //printf("aRow: %d, aCol: %d, bRow: %d, bCol: %d, i: %d\n", aRow, aCol, bRow, bCol, i);
      //printf("---end---target---sp--\n");

      // Bounds checking
      if (aRow < M && aCol < K && bRow < K && bCol < N)
      {
         // Load the input tiles
         wmma::load_matrix_sync(a_frag, a + aRow + aCol * lda, lda);
         wmma::load_matrix_sync(b_frag, b + bRow + bCol * ldb, ldb);

         // Multiply-accumulate on tensor cores
         wmma::mma_sync(acc_frag, a_frag, b_frag, acc_frag);
      }
   }

   // Load the current value of c, scale it by beta, and add our result scaled by alpha
   int cRow = warpM * WMMA_M;
   int cCol = warpN * WMMA_N;

   if (cRow < M && cCol < N)
   {
      wmma::load_matrix_sync(c_frag, c + cRow + cCol * ldc, ldc, wmma::mem_col_major);

#pragma unroll
      for (int i = 0; i < c_frag.num_elements; i++)
      {
         c_frag.x[i] = alpha * acc_frag.x[i] + beta * c_frag.x[i];
      }

      // Store the output
      wmma::store_matrix_sync(c + cRow + cCol * ldc, c_frag, ldc, wmma::mem_col_major);
   }
}

// Elementwise fp32 -> fp16 conversion kernel
__global__ void convertFp32ToFp16(half *out, float *in, int n)
{
   int idx = blockDim.x * blockIdx.x + threadIdx.x;
   if (idx < n)
   {
      out[idx] = __float2half(in[idx]);
   }
}
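
// Illustrative only (not called from main): a minimal sketch that derives a
// launch configuration for convertFp32ToFp16. The 256-thread block size is an
// assumption for illustration; the launches in main() use 32-thread blocks.
static void convertOnStream(half *out, float *in, int n, cudaStream_t s)
{
   int threads = 256;
   int blocks = (n + threads - 1) / threads; // round up so all n elements are covered
   convertFp32ToFp16<<<blocks, threads, 0, s>>>(out, in, n);
}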

int main(int argc, char *argv[])
{
   // Initialize the problem
   int nkernels = 2;            // number of concurrent kernels
   int nstreams = nkernels + 1; // use one more stream than concurrent kernels
   int cuda_device = 0;

   cudaDeviceProp deviceProp;
   cudaErrCheck(cudaGetDevice(&cuda_device));
   cudaErrCheck(cudaGetDeviceProperties(&deviceProp, cuda_device));

   if (deviceProp.concurrentKernels == 0)
   {
      printf("> GPU does not support concurrent kernel execution\n");
      printf("  CUDA kernel runs will be serialized\n");
   }
   else
   {
      printf("concurrent kernels supported: %d\n", deviceProp.concurrentKernels);
   }
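
   // Note: device support is necessary but not sufficient. Kernels only overlap
   // when they are launched into different non-default streams and the GPU has
   // enough free resources (SMs, registers, shared memory) to run them together.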

    //printf("> Detected Compute SM %d.%d hardware with %d multi-processors\n",
        //  deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);
    //--------------------
   // cuda core
   int N = MATRIX_N; // matrix dimension (N x N)
   size_t size = N * N * sizeof(float);
   float *h_A, *h_B, *h_C; // host copies of A, B, C
   float *d_A, *d_B, *d_C; // device copies of A, B, C

   // fp32 staging buffers and extra outputs are unused in this variant;
   // their declarations are disabled along with the allocations below.
//    float *a_fp32;
//    float *b_fp32;
   half *a_fp16;
   half *b_fp16;

//    float *c;
   //    float *c_cublas;
//    float *c_wmma;

   //    float *c_host_cublas;
//    float *c_host_wmma;

   // cuda core: Allocate space for host copies and setup values
   cudaErrCheck(cudaMallocHost((void **)&h_A, size));
   cudaErrCheck(cudaMallocHost((void **)&h_B, size));
   cudaErrCheck(cudaMallocHost((void **)&h_C, size));
//    h_A = (float *)malloc(size);
//    h_B = (float *)malloc(size);
//    h_C = (float *)malloc(size);

   // Allocate space for device copies
   cudaErrCheck(cudaMalloc((void **)&d_A, size));
   cudaErrCheck(cudaMalloc((void **)&d_B, size));
   cudaErrCheck(cudaMalloc((void **)&d_C, size));
//    cudaMalloc((void **)&d_A, size);
//    cudaMalloc((void **)&d_B, size);
//    cudaMalloc((void **)&d_C, size);

   // Initialize A and B with random values in [0, 100); C with a constant
   for (int i = 0; i < N * N; i++)
   {
      h_A[i] = (float)rand() / (float)RAND_MAX * 100.0f;
      h_B[i] = (float)rand() / (float)RAND_MAX * 100.0f;
      h_C[i] = 1.5f;
   }
//    initializeMatrix(h_A, N);
//    initializeMatrix(h_B, N);


   // Copy inputs to device
   cudaErrCheck(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice));
   cudaErrCheck(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice));
   cudaErrCheck(cudaMemcpy(d_C, h_C, size, cudaMemcpyHostToDevice));
//    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
//    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

   // Allocate and initialize an array of stream handles
   cudaStream_t *streams = (cudaStream_t *)malloc(nstreams * sizeof(cudaStream_t));

   for (int i = 0; i < nstreams; i++)
   {
      cudaErrCheck(cudaStreamCreate(&streams[i]));
   }

//    cudaStream_t stream1, stream2;
//    cudaStreamCreate(&stream1);
//    cudaStreamCreate(&stream2);


//    cudaEvent_t startWMMA;
//    cudaEvent_t stopWMMA;


//    cudaErrCheck(cudaEventCreate(&startWMMA));
//    cudaErrCheck(cudaEventCreate(&stopWMMA));


//    cudaErrCheck(cudaMalloc((void **)&a_fp32, MATRIX_M * MATRIX_K * sizeof(float)));
//    cudaErrCheck(cudaMalloc((void **)&b_fp32, MATRIX_K * MATRIX_N * sizeof(float)));
   cudaErrCheck(cudaMalloc((void **)&a_fp16, MATRIX_M * MATRIX_K * sizeof(half)));
   cudaErrCheck(cudaMalloc((void **)&b_fp16, MATRIX_K * MATRIX_N * sizeof(half)));

//    cudaErrCheck(cudaMalloc((void **)&c, MATRIX_M * MATRIX_N * sizeof(float)));
   //    cudaErrCheck(cudaMalloc((void**)&c_cublas, MATRIX_M * MATRIX_N * sizeof(float)));
//    cudaErrCheck(cudaMalloc((void **)&c_wmma, MATRIX_M * MATRIX_N * sizeof(float)));

   //    c_host_cublas = (float*)malloc(MATRIX_M * MATRIX_N * sizeof(float));
//    c_host_wmma = (float *)malloc(MATRIX_M * MATRIX_N * sizeof(float));

   //    curandErrCheck(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT));
   //    curandErrCheck(curandSetPseudoRandomGeneratorSeed(gen, 1337ULL));

   //    curandErrCheck(curandGenerateUniform(gen, a_fp32, MATRIX_M * MATRIX_K));
   //    curandErrCheck(curandGenerateUniform(gen, b_fp32, MATRIX_K * MATRIX_N));

   // cuRAND doesn't currently support fp16, so we generate in fp32 and convert to fp16.
   // h_A and h_B were allocated with cudaMallocHost (pinned), so on platforms with
   // unified virtual addressing the kernel can read them directly (zero-copy).
   //! concurrent
   printf("Converting to fp16...a_fp16\n");
   convertFp32ToFp16<<<(MATRIX_M * MATRIX_K + 31) / 32, 32, 0, streams[0]>>>(a_fp16, h_A, MATRIX_M * MATRIX_K);
   //* sequential
//    convertFp32ToFp16<<<(MATRIX_M * MATRIX_K + 127) / 128, 128, 0, 0>>>(a_fp16, h_A, MATRIX_M * MATRIX_K);

   //! concurrent
   printf("Converting to fp16...b_fp16\n");
   convertFp32ToFp16<<<(MATRIX_K * MATRIX_N + 31) / 32, 32, 0, streams[1]>>>(b_fp16, h_B, MATRIX_K * MATRIX_N);
   //* sequential
//    convertFp32ToFp16<<<(MATRIX_K * MATRIX_N + 127) / 128, 128, 0, 0>>>(b_fp16, h_B, MATRIX_K * MATRIX_N);

   //    curandErrCheck(curandGenerateUniform(gen, c, MATRIX_M * MATRIX_N));

   //    curandErrCheck(curandDestroyGenerator(gen));

   //    cudaErrCheck(cudaMemcpy(c_cublas, c, MATRIX_M * MATRIX_N * sizeof(float), cudaMemcpyDeviceToDevice));
//    cudaErrCheck(cudaMemcpy(c_wmma, h_C, MATRIX_M * MATRIX_N * sizeof(float), cudaMemcpyHostToDevice));

   float alpha = 2.0f;
   float beta = 2.0f;

   //printf("\nM = %d, N = %d, K = %d. alpha = %f, beta = %f\n\n", MATRIX_M, MATRIX_N, MATRIX_K, alpha, beta);


   // First: using WMMA
   dim3 gridDim;
   dim3 blockDim;

   // blockDim.x must be a multiple of warpSize.
   // 32x1 gives one warp per block, so each block computes one 16x16 output tile.
   // (A 128x4 block would hold 16 warps and compute a 64x64 output tile.)
   blockDim.x = 32;
   blockDim.y = 1;
//    blockDim.x = 128;
//    blockDim.y = 4;

   gridDim.x = (MATRIX_M + (WMMA_M * blockDim.x / 32 - 1)) / (WMMA_M * blockDim.x / 32);
   gridDim.y = (MATRIX_N + WMMA_N * blockDim.y - 1) / (WMMA_N * blockDim.y);
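   // Worked example: with blockDim = 32x1 there is 32/32 = 1 warp per block in x,
   // so gridDim.x = (32 + 16 - 1) / 16 = 2 and gridDim.y = (32 + 16 - 1) / 16 = 2;
   // a 2x2 grid of one-warp blocks covers the 32x32 output.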
   printf("gridDim.x = %d, gridDim.y = %d\n", gridDim.x, gridDim.y);
   
       //printf("Running with wmma...\n");
//    cudaErrCheck(cudaEventRecord(startWMMA));
   // wmma_example<<<gridDim, blockDim>>>(a_fp16, b_fp16, c_wmma, MATRIX_M, MATRIX_N, MATRIX_K, alpha, beta);
   // matrixMulKernel<<<gridDim, blockDim>>>(d_A, d_B, d_C, N);
   
   //* sequential
//    wmma_example<<<gridDim, blockDim, 0, 0>>>(a_fp16, b_fp16, d_C, MATRIX_M, MATRIX_N, MATRIX_K, alpha, beta);
//    matrixMulKernel<<<gridDim, blockDim, 0, 0>>>(d_A, d_B, d_C, N);
   //! concurrent
   printf("Running with wmma...\n");
   wmma_example<<<gridDim, blockDim, 0, streams[0]>>>(a_fp16, b_fp16, d_C, MATRIX_M, MATRIX_N, MATRIX_K, alpha, beta);
   printf("Running with wmma...done\n");
   printf("Running with matrixMulKernel\n");
   matrixMulKernel<<<gridDim, blockDim, 0, streams[1]>>>(d_A, d_B, d_C, N);
   printf("Running with matrixMulKernel...done\n");

   // //printf("Running with wmma...done\n");
//    cudaErrCheck(cudaEventRecord(stopWMMA));
//    cudaErrCheck(cudaEventSynchronize(stopWMMA));

   cudaStreamSynchronize(streams[0]);
   cudaStreamSynchronize(streams[1]);

   // Copy the result back to the host
   //* sequential
//    cudaErrCheck(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost));
   //! concurrent
   cudaErrCheck(cudaMemcpyAsync(h_C, d_C, size, cudaMemcpyDeviceToHost, streams[nstreams - 1]));
   // Make sure the async copy has finished before h_C is read or freed
   cudaErrCheck(cudaStreamSynchronize(streams[nstreams - 1]));


   for (int i = 0; i < nstreams; i++)
   {
      cudaErrCheck(cudaStreamDestroy(streams[i]));
   }
   free(streams);

//    cudaErrCheck(cudaEventDestroy(startWMMA));
//    cudaErrCheck(cudaEventDestroy(stopWMMA));

   //    cudaErrCheck(cudaEventDestroy(startcublas));
   //    cudaErrCheck(cudaEventDestroy(stopcublas));

//    cudaErrCheck(cudaFree(a_fp32));
//    cudaErrCheck(cudaFree(b_fp32));
   cudaErrCheck(cudaFree(a_fp16));
   cudaErrCheck(cudaFree(b_fp16));

//    cudaErrCheck(cudaFree(c));
   //    cudaErrCheck(cudaFree(c_cublas));
//    cudaErrCheck(cudaFree(c_wmma));

   // cuda core
   cudaErrCheck(cudaFree(d_A));
   cudaErrCheck(cudaFree(d_B));
   cudaErrCheck(cudaFree(d_C));
   // h_A/h_B/h_C were allocated with cudaMallocHost, so release with cudaFreeHost
   cudaErrCheck(cudaFreeHost(h_A));
   cudaErrCheck(cudaFreeHost(h_B));
   cudaErrCheck(cudaFreeHost(h_C));

   //    free(c_host_cublas);
//    free(c_host_wmma);

   cudaErrCheck(cudaDeviceReset());
   return 0;
}