Untitled

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
#include <fcntl.h>
#include <iostream>
#include <algorithm>

//#define __DEBUG__
#define BLOCKSIZE 32
#define MEMSIZE 64
#define ROW 0
#define COL 1

#define HANDLE_ERROR(status) \
{ \
	if (status != cudaSuccess) \
	{ \
		fprintf(stderr, "%s failed  at line %d \nError message: %s \n", \
			__FILE__, __LINE__ ,cudaGetErrorString(status)); \
		exit(EXIT_FAILURE); \
	} \
}

const int INF = ((1 << 30) - 1);
//const int V = 50010;
void input(char* inFileName, int B);
void output(char* outFileName);

void block_FW(int B);
int ceil(int a, int b);
void cal(int B, int Round, int block_start_x, int block_start_y, int block_width, int block_height);
__global__ void block_FW_phase_1(int B, int *dist, int round, int vertex);
__global__ void block_FW_phase_2(int B, int *dist, int round, int vertex, int mode);
__global__ void block_FW_phase_3(int B, int *dist, int round, int vertex);

__global__ void dist_print(int *dist, int v) {
    for( int i = 0; i < v * v; ++i){
        printf("%d ", dist[i]);
    }
    printf("\n");
}

using namespace std;

int n, m;
int *Dist;
int *Dist_result;
int *Dist_cuda;
int vertex, edge, o_v;
struct stat fileInfo = {0};



int main(int argc, char* argv[]) {
    int B = MEMSIZE;
    input(argv[1], B);
    block_FW(B);
    output(argv[2]);
    return 0;
}

void input(char* infile, int B){
    int fd = open(infile, O_RDWR);
    fstat(fd, &fileInfo);
    int* file_map = (int*)mmap(NULL, fileInfo.st_size, PROT_READ, MAP_SHARED, fd, 0);
    
    vertex = file_map[0];
    edge = file_map[1];
    
    #ifdef __DEBUG__
        cout << "vertex= " << vertex << ", edge= " << edge << endl;
    #endif

    //vertex = vertex + ( BLOCK_FACTOR - (   (vertex-1)  % BLOCK_FACTOR + 1 )      );
    o_v = vertex;
    vertex = (vertex % B != 0) ? (vertex/B+1)*B : vertex;
    Dist = new int[vertex * vertex];
    //cudaHostAlloc( &Dist, vertex * vertex * sizeof (int), cudaHostAllocDefault);
    //cudaHostAlloc( &Dist, vertex * vertex * sizeof (int), cudaHostAllocMapped);

    for(int i = 0; i < vertex; i++) {
        for(int j = 0; j < vertex; j++) {
            if(i != j)
                Dist[i * vertex + j] = INF;
            else
                Dist[i * vertex + j] = 0;       // i == j
        }
    }

    int now_position = 2;
    int src, dst, weight;
    for(int i = 0; i < edge; i++) {
        src = file_map[now_position++];
        dst = file_map[now_position++];
        weight = file_map[now_position++];
        #ifdef __DEBUG__
            cout << "Frome: " << src << " to " << dst << " weight = " << weight << endl; 
        #endif
        Dist[src * vertex + dst] = weight;
    }
    close(fd);

}

void output(char* outFileName) {
    FILE* outfile = fopen(outFileName, "w");
    for (int i = 0; i < o_v; ++i) {
        for (int j = 0; j < o_v; ++j) {
            if (Dist[i * vertex + j] >= INF)
                Dist[i * vertex + j] = INF;
        }
        fwrite(Dist + i * vertex, sizeof(int), o_v, outfile);
    }
    fclose(outfile);
    //cudaFreeHost(Dist);
}

int ceil(int a, int b) { return (a + b - 1) / b; }


void block_FW(int B) {
#ifdef __DEBUG__
    float k_time;
    cudaEvent_t start, stop;
    cudaError_t cudaStatus;
    cudaStatus = cudaSetDevice(0);
    HANDLE_ERROR(cudaStatus);
#endif


    int round = ceil(vertex, B);
    // Init and copy data to vram
    //cudaHostGetDevicePointer( &Dist_cuda, Dist, 0);
    
#ifdef __DEBUG__
    // Create cuda event
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
#endif 

    cudaMalloc(&Dist_cuda, vertex*vertex*sizeof(int));
    cudaMemcpy(Dist_cuda, Dist, vertex*vertex*sizeof(int), cudaMemcpyHostToDevice);
    //cudaFreeHost(Dist);

    // Create block and thread
    dim3 num_blocks_phase_1(1, 1);
    dim3 num_blocks_phase_2(round, 1);
    dim3 num_blocks_phase_3(round, round);
    dim3 num_threads(BLOCKSIZE,BLOCKSIZE);

    cudaFuncSetCacheConfig(block_FW_phase_1, cudaFuncCachePreferL1);
    cudaFuncSetCacheConfig(block_FW_phase_2, cudaFuncCachePreferL1);
    cudaFuncSetCacheConfig(block_FW_phase_3, cudaFuncCachePreferL1);
    
    cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);
#ifdef __DEBUG__
    cudaEventRecord(start, 0);
#endif

    for (int r = 0; r < round; ++r) {
        //printf("%d %d\n", r, round);
        //fflush(stdout);
        /* Phase 1*/
        block_FW_phase_1 <<< num_blocks_phase_1, num_threads>>>(B,  Dist_cuda, r, vertex);
        /* Phase 2*/
        block_FW_phase_2 <<< num_blocks_phase_2, num_threads>>> (B, Dist_cuda, r, vertex, ROW);
        block_FW_phase_2 <<< num_blocks_phase_2, num_threads>>> (B, Dist_cuda, r, vertex, COL);

        /* Phase 3*/
        block_FW_phase_3 <<< num_blocks_phase_3, num_threads>>> (B, Dist_cuda, r, vertex);
        //dist_print <<< 1, 1>>> (Dist_cuda, vertex);
    }
    
#ifdef __DEBUG__
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&k_time, start, stop);
#endif
    //cudaThreadSynchronize();
    
    // Move back
    cudaMemcpy(Dist, Dist_cuda, vertex*vertex*sizeof(int), cudaMemcpyDeviceToHost);
    
    // Free cuda memory and destroy cuda event
    //cudaFree(Dist_cuda);
#ifdef __DEBUG__
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
#endif
}

__global__ void block_FW_phase_1(int B, int *dist, int round, int vertex){
    //  Declared Shared Memory 
    __shared__ int mys[MEMSIZE][MEMSIZE];
    // Register
    int x = threadIdx.x;
    int y = threadIdx.y;
    int dummy = B * round;
    int d_x = threadIdx.x + dummy;
    int d_y = threadIdx.y + dummy;

    // load shared memory
    mys[y][x]       = dist[d_x + d_y * vertex];
    mys[y+BLOCKSIZE][x+BLOCKSIZE] = dist[(d_x + BLOCKSIZE) + (d_y + BLOCKSIZE) * vertex];

    mys[y+BLOCKSIZE][x]    = dist[d_x + (d_y+BLOCKSIZE) * vertex];
    mys[y][x+BLOCKSIZE]    = dist[(d_x + BLOCKSIZE) + d_y * vertex];

    // int a = mys[y][x];
    // int b = mys[y+BLOCKSIZE][x+BLOCKSIZE];
    // int c = mys[y+BLOCKSIZE][x];
    // int d = mys[y][x+BLOCKSIZE];  
    __syncthreads();

    #pragma unroll 12
    for (int i=0; i < B; ++i){
        // a = min(a, (mys[y][i] + mys[i][x]));
        // b = min(b, (mys[y+BLOCKSIZE][i] + mys[i][x+BLOCKSIZE]));
        // c = min(c, (mys[y+BLOCKSIZE][i] + mys[i][x]));
        // d = min(d, (mys[y][i] + mys[i][x+BLOCKSIZE]));

        mys[y][x]       = min(mys[y][x],        (mys[y][i] + mys[i][x])   );
        mys[y+BLOCKSIZE][x+BLOCKSIZE] = min(mys[y+BLOCKSIZE][x+BLOCKSIZE],  (mys[y+BLOCKSIZE][i] + mys[i][x+BLOCKSIZE]) );

        mys[y+BLOCKSIZE][x]    = min(mys[y+BLOCKSIZE][x], (mys[y+BLOCKSIZE][i] + mys[i][x]) );
        mys[y][x+BLOCKSIZE]    = min(mys[y][x+BLOCKSIZE], (mys[y][i] + mys[i][x+BLOCKSIZE]) );
    }
    __syncthreads();
    // Move back
    // dist[d_x + d_y * vertex]  = a;
    // dist[(d_x + BLOCKSIZE) + (d_y + BLOCKSIZE) * vertex] = b;
    // dist[d_x + (d_y+BLOCKSIZE) * vertex] = c;
    // dist[(d_x + BLOCKSIZE) + d_y * vertex] = d;

    dist[d_x + d_y * vertex]                = mys[y][x];
    dist[(d_x + BLOCKSIZE) + (d_y + BLOCKSIZE) * vertex]  = mys[y+BLOCKSIZE][x+BLOCKSIZE];
    
    dist[d_x + (d_y+BLOCKSIZE) * vertex]           = mys[y+BLOCKSIZE][x];
    dist[(d_x + BLOCKSIZE) + d_y * vertex]         = mys[y][x+BLOCKSIZE];
}

__global__ void block_FW_phase_2(int B, int *dist, int round, int vertex, int mode){
    //if (blockIdx.x == round) return;
    __shared__ int mys_bas[MEMSIZE][MEMSIZE];
    __shared__ int mys_pre[MEMSIZE][MEMSIZE];

    int x = threadIdx.x;
    int y = threadIdx.y;
    int dummy = round * B;

    /* 
        mode = 0 row 
        mode = 1 col
    */
    int d_x = (mode == ROW) ? (threadIdx.x + blockIdx.x * B) : (threadIdx.x + dummy);
    int d_y = (mode == ROW) ? (threadIdx.y + dummy) : (threadIdx.y + blockIdx.x * B);

    // load shared memory
    mys_bas[y][x]                       = dist[d_x + d_y * vertex];
    mys_bas[y+BLOCKSIZE][x+BLOCKSIZE]   = dist[(d_x + BLOCKSIZE) + (d_y + BLOCKSIZE) * vertex];

    mys_bas[y+BLOCKSIZE][x]         = dist[d_x + (d_y+BLOCKSIZE) * vertex];
    mys_bas[y][x+BLOCKSIZE]         = dist[(d_x + BLOCKSIZE) + d_y * vertex];

    // Update d_x d_y for ref block
    d_x = threadIdx.x + dummy;
    d_y = threadIdx.y + dummy;

    mys_pre[y][x]                       = dist[d_x + d_y * vertex];
    mys_pre[y+BLOCKSIZE][x+BLOCKSIZE]   = dist[(d_x + BLOCKSIZE) + (d_y + BLOCKSIZE) * vertex];

    mys_pre[y+BLOCKSIZE][x]         = dist[d_x + (d_y+BLOCKSIZE) * vertex];
    mys_pre[y][x+BLOCKSIZE]         = dist[(d_x + BLOCKSIZE) + d_y * vertex];
    __syncthreads();

    int a = mys_bas[y][x];
    int b = mys_bas[y+BLOCKSIZE][x+BLOCKSIZE];
    int c = mys_bas[y+BLOCKSIZE][x];
    int d = mys_bas[y][x+BLOCKSIZE];  

    #pragma unroll 12
    for ( int i = 0; i < B; ++i){
        if (mode == ROW) {
            a = min(a, (mys_pre[y][i] + mys_bas[i][x]) );
            b = min(b, (mys_pre[y+BLOCKSIZE][i] + mys_bas[i][x+BLOCKSIZE]) );

            c = min(c, (mys_pre[y+BLOCKSIZE][i] + mys_bas[i][x]));
            d = min(d, (mys_pre[y][i] + mys_bas[i][x+BLOCKSIZE]));

            // mys_bas[y][x]                       = min(mys_bas[y][x],        (mys_pre[y][i] + mys_bas[i][x]) );
            // mys_bas[y+BLOCKSIZE][x+BLOCKSIZE]   = min(mys_bas[y+BLOCKSIZE][x+BLOCKSIZE],  (mys_pre[y+BLOCKSIZE][i] + mys_bas[i][x+BLOCKSIZE]) );

            // mys_bas[y+BLOCKSIZE][x]    = min(mys_bas[y+BLOCKSIZE][x], (mys_pre[y+BLOCKSIZE][i] + mys_bas[i][x]));
            // mys_bas[y][x+BLOCKSIZE]    = min(mys_bas[y][x+BLOCKSIZE], (mys_pre[y][i] + mys_bas[i][x+BLOCKSIZE]));
        }else {
            a = min(a, (mys_bas[y][i] + mys_pre[i][x]) );
            b = min(b, (mys_bas[y+BLOCKSIZE][i] + mys_pre[i][x+BLOCKSIZE]) );
            c = min(c, (mys_bas[y+BLOCKSIZE][i] + mys_pre[i][x]));
            d = min(d, (mys_bas[y][i] + mys_pre[i][x+BLOCKSIZE]));

            // mys_bas[y][x]                       = min(mys_bas[y][x],        (mys_bas[y][i] + mys_pre[i][x]) );
            // mys_bas[y+BLOCKSIZE][x+BLOCKSIZE]   = min(mys_bas[y+BLOCKSIZE][x+BLOCKSIZE],  (mys_bas[y+BLOCKSIZE][i] + mys_pre[i][x+BLOCKSIZE]) );

            // mys_bas[y+BLOCKSIZE][x]    = min(mys_bas[y+BLOCKSIZE][x], (mys_bas[y+BLOCKSIZE][i] + mys_pre[i][x]));
            // mys_bas[y][x+BLOCKSIZE]    = min(mys_bas[y][x+BLOCKSIZE], (mys_bas[y][i] + mys_pre[i][x+BLOCKSIZE]));
        }
    }
    
    // reset d_x, d_y
    d_x = (mode == ROW) ? (threadIdx.x + blockIdx.x * B) : (threadIdx.x + dummy);
    d_y = (mode == ROW) ? (threadIdx.y + dummy) : (threadIdx.y + blockIdx.x * B);

    // Move back
    dist[d_x + d_y * vertex]  = a;
    dist[(d_x + BLOCKSIZE) + (d_y + BLOCKSIZE) * vertex] = b;
    dist[d_x + (d_y+BLOCKSIZE) * vertex] = c;
    dist[(d_x + BLOCKSIZE) + d_y * vertex] = d;
    // dist[d_x + d_y * vertex]                                = mys_bas[y][x];
    // dist[(d_x + BLOCKSIZE) + (d_y + BLOCKSIZE) * vertex]    = mys_bas[y+BLOCKSIZE][x+BLOCKSIZE];
    
    // dist[d_x + (d_y+BLOCKSIZE) * vertex]           = mys_bas[y+BLOCKSIZE][x];
    // dist[(d_x + BLOCKSIZE) + d_y * vertex]         = mys_bas[y][x+BLOCKSIZE];
}


__global__ void block_FW_phase_3(int B, int *dist, int round, int vertex){
    __shared__ int mys_bas[MEMSIZE][MEMSIZE];
    __shared__ int mys_pre_x[MEMSIZE][MEMSIZE];
    __shared__ int mys_pre_y[MEMSIZE][MEMSIZE];

    int x = threadIdx.x;
    int y = threadIdx.y;
    int dummy = round * B;

    int d_x = threadIdx.x + (blockIdx.x * B);
    int d_y = threadIdx.y + (blockIdx.y * B);

    int d_x_pre = dummy + threadIdx.x;
    int d_y_pre = dummy + threadIdx.y;

    // load data to share memory
    mys_bas[y][x]                       = dist[d_x + d_y * vertex];
    mys_bas[y+BLOCKSIZE][x+BLOCKSIZE]   = dist[(d_x + BLOCKSIZE) + (d_y + BLOCKSIZE) * vertex];

    mys_bas[y+BLOCKSIZE][x]    = dist[d_x + (d_y + BLOCKSIZE) * vertex];
    mys_bas[y][x+BLOCKSIZE]    = dist[(d_x + BLOCKSIZE) + d_y * vertex];

    // load reference X axis
    mys_pre_x[y][x]                     = dist[d_x_pre + d_y * vertex];
    mys_pre_x[y+BLOCKSIZE][x+BLOCKSIZE] = dist[(d_x_pre + BLOCKSIZE) + (d_y + BLOCKSIZE) * vertex];

    mys_pre_x[y+BLOCKSIZE][x]    = dist[d_x_pre + (d_y + BLOCKSIZE) * vertex];
    mys_pre_x[y][x+BLOCKSIZE]    = dist[(d_x_pre + BLOCKSIZE) + d_y * vertex];

    // load reference Y axis
    mys_pre_y[y][x]                     = dist[d_x + d_y_pre * vertex];
    mys_pre_y[y+BLOCKSIZE][x+BLOCKSIZE] = dist[(d_x + BLOCKSIZE) + (d_y_pre + BLOCKSIZE) * vertex];

    mys_pre_y[y+BLOCKSIZE][x]    = dist[d_x + (d_y_pre + BLOCKSIZE) * vertex];
    mys_pre_y[y][x+BLOCKSIZE]    = dist[(d_x + BLOCKSIZE) + d_y_pre * vertex];

    int a = mys_bas[y][x];
    int b = mys_bas[y+BLOCKSIZE][x+BLOCKSIZE];
    int c = mys_bas[y+BLOCKSIZE][x];
    int d = mys_bas[y][x+BLOCKSIZE];  

    __syncthreads();
                   
    #pragma unroll 12
    for ( int i = 0; i < B; ++i){
        
        a = min(a,        (mys_pre_x[y][i] + mys_pre_y[i][x]) );
        b = min(b,  (mys_pre_x[y+BLOCKSIZE][i] + mys_pre_y[i][x+BLOCKSIZE]) );
        c = min(c, (mys_pre_x[y+BLOCKSIZE][i] + mys_pre_y[i][x]));
        d = min(d, (mys_pre_x[y][i] + mys_pre_y[i][x+BLOCKSIZE]));

        // mys_bas[y][x]                     = min(mys_bas[y][x],        (mys_pre_x[y][i] + mys_pre_y[i][x]) );
        // mys_bas[y+BLOCKSIZE][x+BLOCKSIZE] = min(mys_bas[y+BLOCKSIZE][x+BLOCKSIZE],  (mys_pre_x[y+BLOCKSIZE][i] + mys_pre_y[i][x+BLOCKSIZE]) );

        // mys_bas[y+BLOCKSIZE][x]   = min(mys_bas[y+BLOCKSIZE][x], (mys_pre_x[y+BLOCKSIZE][i] + mys_pre_y[i][x]));
        // mys_bas[y][x+BLOCKSIZE]   = min(mys_bas[y][x+BLOCKSIZE], (mys_pre_x[y][i] + mys_pre_y[i][x+BLOCKSIZE]));
    }
    // Move back
    
    dist[d_x + d_y * vertex]  = a;
    dist[(d_x + BLOCKSIZE) + (d_y + BLOCKSIZE) * vertex] = b;
    dist[d_x + (d_y+BLOCKSIZE) * vertex] = c;
    dist[(d_x + BLOCKSIZE) + d_y * vertex] = d;
    
    // dist[d_x + d_y * vertex]                                = mys_bas[y][x];
    // dist[(d_x + BLOCKSIZE) + (d_y + BLOCKSIZE) * vertex]    = mys_bas[y+BLOCKSIZE][x+BLOCKSIZE];
    
    // dist[d_x + (d_y+BLOCKSIZE) * vertex]           = mys_bas[y+BLOCKSIZE][x];
    // dist[(d_x + BLOCKSIZE) + d_y * vertex]         = mys_bas[y][x+BLOCKSIZE];
}
Editor is loading...