Untitled
unknown
haxe
4 years ago
14 kB
5
Indexable
#include <stdio.h> #include <stdlib.h> #include <cuda.h> #include <cuda_runtime.h> #include <sys/mman.h> #include <sys/stat.h> #include <unistd.h> #include <fcntl.h> #include <iostream> #include <algorithm> //#define __DEBUG__ #define BLOCKSIZE 32 #define MEMSIZE 64 #define ROW 0 #define COL 1 #define HANDLE_ERROR(status) \ { \ if (status != cudaSuccess) \ { \ fprintf(stderr, "%s failed at line %d \nError message: %s \n", \ __FILE__, __LINE__ ,cudaGetErrorString(status)); \ exit(EXIT_FAILURE); \ } \ } const int INF = ((1 << 30) - 1); //const int V = 50010; void input(char* inFileName, int B); void output(char* outFileName); void block_FW(int B); int ceil(int a, int b); void cal(int B, int Round, int block_start_x, int block_start_y, int block_width, int block_height); __global__ void block_FW_phase_1(int B, int *dist, int round, int vertex); __global__ void block_FW_phase_2(int B, int *dist, int round, int vertex, int mode); __global__ void block_FW_phase_3(int B, int *dist, int round, int vertex); __global__ void dist_print(int *dist, int v) { for( int i = 0; i < v * v; ++i){ printf("%d ", dist[i]); } printf("\n"); } using namespace std; int n, m; int *Dist; int *Dist_result; int *Dist_cuda; int vertex, edge, o_v; struct stat fileInfo = {0}; int main(int argc, char* argv[]) { int B = MEMSIZE; input(argv[1], B); block_FW(B); output(argv[2]); return 0; } void input(char* infile, int B){ int fd = open(infile, O_RDWR); fstat(fd, &fileInfo); int* file_map = (int*)mmap(NULL, fileInfo.st_size, PROT_READ, MAP_SHARED, fd, 0); vertex = file_map[0]; edge = file_map[1]; #ifdef __DEBUG__ cout << "vertex= " << vertex << ", edge= " << edge << endl; #endif //vertex = vertex + ( BLOCK_FACTOR - ( (vertex-1) % BLOCK_FACTOR + 1 ) ); o_v = vertex; vertex = (vertex % B != 0) ? (vertex/B+1)*B : vertex; Dist = new int[vertex * vertex]; //cudaHostAlloc( &Dist, vertex * vertex * sizeof (int), cudaHostAllocDefault); //cudaHostAlloc( &Dist, vertex * vertex * sizeof (int), cudaHostAllocMapped); for(int i = 0; i < vertex; i++) { for(int j = 0; j < vertex; j++) { if(i != j) Dist[i * vertex + j] = INF; else Dist[i * vertex + j] = 0; // i == j } } int now_position = 2; int src, dst, weight; for(int i = 0; i < edge; i++) { src = file_map[now_position++]; dst = file_map[now_position++]; weight = file_map[now_position++]; #ifdef __DEBUG__ cout << "Frome: " << src << " to " << dst << " weight = " << weight << endl; #endif Dist[src * vertex + dst] = weight; } close(fd); } void output(char* outFileName) { FILE* outfile = fopen(outFileName, "w"); for (int i = 0; i < o_v; ++i) { for (int j = 0; j < o_v; ++j) { if (Dist[i * vertex + j] >= INF) Dist[i * vertex + j] = INF; } fwrite(Dist + i * vertex, sizeof(int), o_v, outfile); } fclose(outfile); //cudaFreeHost(Dist); } int ceil(int a, int b) { return (a + b - 1) / b; } void block_FW(int B) { #ifdef __DEBUG__ float k_time; cudaEvent_t start, stop; cudaError_t cudaStatus; cudaStatus = cudaSetDevice(0); HANDLE_ERROR(cudaStatus); #endif int round = ceil(vertex, B); // Init and copy data to vram //cudaHostGetDevicePointer( &Dist_cuda, Dist, 0); #ifdef __DEBUG__ // Create cuda event cudaEventCreate(&start); cudaEventCreate(&stop); #endif cudaMalloc(&Dist_cuda, vertex*vertex*sizeof(int)); cudaMemcpy(Dist_cuda, Dist, vertex*vertex*sizeof(int), cudaMemcpyHostToDevice); //cudaFreeHost(Dist); // Create block and thread dim3 num_blocks_phase_1(1, 1); dim3 num_blocks_phase_2(round, 1); dim3 num_blocks_phase_3(round, round); dim3 num_threads(BLOCKSIZE,BLOCKSIZE); cudaFuncSetCacheConfig(block_FW_phase_1, cudaFuncCachePreferL1); cudaFuncSetCacheConfig(block_FW_phase_2, cudaFuncCachePreferL1); cudaFuncSetCacheConfig(block_FW_phase_3, cudaFuncCachePreferL1); cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte); #ifdef __DEBUG__ cudaEventRecord(start, 0); #endif for (int r = 0; r < round; ++r) { //printf("%d %d\n", r, round); //fflush(stdout); /* Phase 1*/ block_FW_phase_1 <<< num_blocks_phase_1, num_threads>>>(B, Dist_cuda, r, vertex); /* Phase 2*/ block_FW_phase_2 <<< num_blocks_phase_2, num_threads>>> (B, Dist_cuda, r, vertex, ROW); block_FW_phase_2 <<< num_blocks_phase_2, num_threads>>> (B, Dist_cuda, r, vertex, COL); /* Phase 3*/ block_FW_phase_3 <<< num_blocks_phase_3, num_threads>>> (B, Dist_cuda, r, vertex); //dist_print <<< 1, 1>>> (Dist_cuda, vertex); } #ifdef __DEBUG__ cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&k_time, start, stop); #endif //cudaThreadSynchronize(); // Move back cudaMemcpy(Dist, Dist_cuda, vertex*vertex*sizeof(int), cudaMemcpyDeviceToHost); // Free cuda memory and destroy cuda event //cudaFree(Dist_cuda); #ifdef __DEBUG__ cudaEventDestroy(start); cudaEventDestroy(stop); #endif } __global__ void block_FW_phase_1(int B, int *dist, int round, int vertex){ // Declared Shared Memory __shared__ int mys[MEMSIZE][MEMSIZE]; // Register int x = threadIdx.x; int y = threadIdx.y; int dummy = B * round; int d_x = threadIdx.x + dummy; int d_y = threadIdx.y + dummy; // load shared memory mys[y][x] = dist[d_x + d_y * vertex]; mys[y+BLOCKSIZE][x+BLOCKSIZE] = dist[(d_x + BLOCKSIZE) + (d_y + BLOCKSIZE) * vertex]; mys[y+BLOCKSIZE][x] = dist[d_x + (d_y+BLOCKSIZE) * vertex]; mys[y][x+BLOCKSIZE] = dist[(d_x + BLOCKSIZE) + d_y * vertex]; // int a = mys[y][x]; // int b = mys[y+BLOCKSIZE][x+BLOCKSIZE]; // int c = mys[y+BLOCKSIZE][x]; // int d = mys[y][x+BLOCKSIZE]; __syncthreads(); #pragma unroll 12 for (int i=0; i < B; ++i){ // a = min(a, (mys[y][i] + mys[i][x])); // b = min(b, (mys[y+BLOCKSIZE][i] + mys[i][x+BLOCKSIZE])); // c = min(c, (mys[y+BLOCKSIZE][i] + mys[i][x])); // d = min(d, (mys[y][i] + mys[i][x+BLOCKSIZE])); mys[y][x] = min(mys[y][x], (mys[y][i] + mys[i][x]) ); mys[y+BLOCKSIZE][x+BLOCKSIZE] = min(mys[y+BLOCKSIZE][x+BLOCKSIZE], (mys[y+BLOCKSIZE][i] + mys[i][x+BLOCKSIZE]) ); mys[y+BLOCKSIZE][x] = min(mys[y+BLOCKSIZE][x], (mys[y+BLOCKSIZE][i] + mys[i][x]) ); mys[y][x+BLOCKSIZE] = min(mys[y][x+BLOCKSIZE], (mys[y][i] + mys[i][x+BLOCKSIZE]) ); } __syncthreads(); // Move back // dist[d_x + d_y * vertex] = a; // dist[(d_x + BLOCKSIZE) + (d_y + BLOCKSIZE) * vertex] = b; // dist[d_x + (d_y+BLOCKSIZE) * vertex] = c; // dist[(d_x + BLOCKSIZE) + d_y * vertex] = d; dist[d_x + d_y * vertex] = mys[y][x]; dist[(d_x + BLOCKSIZE) + (d_y + BLOCKSIZE) * vertex] = mys[y+BLOCKSIZE][x+BLOCKSIZE]; dist[d_x + (d_y+BLOCKSIZE) * vertex] = mys[y+BLOCKSIZE][x]; dist[(d_x + BLOCKSIZE) + d_y * vertex] = mys[y][x+BLOCKSIZE]; } __global__ void block_FW_phase_2(int B, int *dist, int round, int vertex, int mode){ //if (blockIdx.x == round) return; __shared__ int mys_bas[MEMSIZE][MEMSIZE]; __shared__ int mys_pre[MEMSIZE][MEMSIZE]; int x = threadIdx.x; int y = threadIdx.y; int dummy = round * B; /* mode = 0 row mode = 1 col */ int d_x = (mode == ROW) ? (threadIdx.x + blockIdx.x * B) : (threadIdx.x + dummy); int d_y = (mode == ROW) ? (threadIdx.y + dummy) : (threadIdx.y + blockIdx.x * B); // load shared memory mys_bas[y][x] = dist[d_x + d_y * vertex]; mys_bas[y+BLOCKSIZE][x+BLOCKSIZE] = dist[(d_x + BLOCKSIZE) + (d_y + BLOCKSIZE) * vertex]; mys_bas[y+BLOCKSIZE][x] = dist[d_x + (d_y+BLOCKSIZE) * vertex]; mys_bas[y][x+BLOCKSIZE] = dist[(d_x + BLOCKSIZE) + d_y * vertex]; // Update d_x d_y for ref block d_x = threadIdx.x + dummy; d_y = threadIdx.y + dummy; mys_pre[y][x] = dist[d_x + d_y * vertex]; mys_pre[y+BLOCKSIZE][x+BLOCKSIZE] = dist[(d_x + BLOCKSIZE) + (d_y + BLOCKSIZE) * vertex]; mys_pre[y+BLOCKSIZE][x] = dist[d_x + (d_y+BLOCKSIZE) * vertex]; mys_pre[y][x+BLOCKSIZE] = dist[(d_x + BLOCKSIZE) + d_y * vertex]; __syncthreads(); int a = mys_bas[y][x]; int b = mys_bas[y+BLOCKSIZE][x+BLOCKSIZE]; int c = mys_bas[y+BLOCKSIZE][x]; int d = mys_bas[y][x+BLOCKSIZE]; #pragma unroll 12 for ( int i = 0; i < B; ++i){ if (mode == ROW) { a = min(a, (mys_pre[y][i] + mys_bas[i][x]) ); b = min(b, (mys_pre[y+BLOCKSIZE][i] + mys_bas[i][x+BLOCKSIZE]) ); c = min(c, (mys_pre[y+BLOCKSIZE][i] + mys_bas[i][x])); d = min(d, (mys_pre[y][i] + mys_bas[i][x+BLOCKSIZE])); // mys_bas[y][x] = min(mys_bas[y][x], (mys_pre[y][i] + mys_bas[i][x]) ); // mys_bas[y+BLOCKSIZE][x+BLOCKSIZE] = min(mys_bas[y+BLOCKSIZE][x+BLOCKSIZE], (mys_pre[y+BLOCKSIZE][i] + mys_bas[i][x+BLOCKSIZE]) ); // mys_bas[y+BLOCKSIZE][x] = min(mys_bas[y+BLOCKSIZE][x], (mys_pre[y+BLOCKSIZE][i] + mys_bas[i][x])); // mys_bas[y][x+BLOCKSIZE] = min(mys_bas[y][x+BLOCKSIZE], (mys_pre[y][i] + mys_bas[i][x+BLOCKSIZE])); }else { a = min(a, (mys_bas[y][i] + mys_pre[i][x]) ); b = min(b, (mys_bas[y+BLOCKSIZE][i] + mys_pre[i][x+BLOCKSIZE]) ); c = min(c, (mys_bas[y+BLOCKSIZE][i] + mys_pre[i][x])); d = min(d, (mys_bas[y][i] + mys_pre[i][x+BLOCKSIZE])); // mys_bas[y][x] = min(mys_bas[y][x], (mys_bas[y][i] + mys_pre[i][x]) ); // mys_bas[y+BLOCKSIZE][x+BLOCKSIZE] = min(mys_bas[y+BLOCKSIZE][x+BLOCKSIZE], (mys_bas[y+BLOCKSIZE][i] + mys_pre[i][x+BLOCKSIZE]) ); // mys_bas[y+BLOCKSIZE][x] = min(mys_bas[y+BLOCKSIZE][x], (mys_bas[y+BLOCKSIZE][i] + mys_pre[i][x])); // mys_bas[y][x+BLOCKSIZE] = min(mys_bas[y][x+BLOCKSIZE], (mys_bas[y][i] + mys_pre[i][x+BLOCKSIZE])); } } // reset d_x, d_y d_x = (mode == ROW) ? (threadIdx.x + blockIdx.x * B) : (threadIdx.x + dummy); d_y = (mode == ROW) ? (threadIdx.y + dummy) : (threadIdx.y + blockIdx.x * B); // Move back dist[d_x + d_y * vertex] = a; dist[(d_x + BLOCKSIZE) + (d_y + BLOCKSIZE) * vertex] = b; dist[d_x + (d_y+BLOCKSIZE) * vertex] = c; dist[(d_x + BLOCKSIZE) + d_y * vertex] = d; // dist[d_x + d_y * vertex] = mys_bas[y][x]; // dist[(d_x + BLOCKSIZE) + (d_y + BLOCKSIZE) * vertex] = mys_bas[y+BLOCKSIZE][x+BLOCKSIZE]; // dist[d_x + (d_y+BLOCKSIZE) * vertex] = mys_bas[y+BLOCKSIZE][x]; // dist[(d_x + BLOCKSIZE) + d_y * vertex] = mys_bas[y][x+BLOCKSIZE]; } __global__ void block_FW_phase_3(int B, int *dist, int round, int vertex){ __shared__ int mys_bas[MEMSIZE][MEMSIZE]; __shared__ int mys_pre_x[MEMSIZE][MEMSIZE]; __shared__ int mys_pre_y[MEMSIZE][MEMSIZE]; int x = threadIdx.x; int y = threadIdx.y; int dummy = round * B; int d_x = threadIdx.x + (blockIdx.x * B); int d_y = threadIdx.y + (blockIdx.y * B); int d_x_pre = dummy + threadIdx.x; int d_y_pre = dummy + threadIdx.y; // load data to share memory mys_bas[y][x] = dist[d_x + d_y * vertex]; mys_bas[y+BLOCKSIZE][x+BLOCKSIZE] = dist[(d_x + BLOCKSIZE) + (d_y + BLOCKSIZE) * vertex]; mys_bas[y+BLOCKSIZE][x] = dist[d_x + (d_y + BLOCKSIZE) * vertex]; mys_bas[y][x+BLOCKSIZE] = dist[(d_x + BLOCKSIZE) + d_y * vertex]; // load reference X axis mys_pre_x[y][x] = dist[d_x_pre + d_y * vertex]; mys_pre_x[y+BLOCKSIZE][x+BLOCKSIZE] = dist[(d_x_pre + BLOCKSIZE) + (d_y + BLOCKSIZE) * vertex]; mys_pre_x[y+BLOCKSIZE][x] = dist[d_x_pre + (d_y + BLOCKSIZE) * vertex]; mys_pre_x[y][x+BLOCKSIZE] = dist[(d_x_pre + BLOCKSIZE) + d_y * vertex]; // load reference Y axis mys_pre_y[y][x] = dist[d_x + d_y_pre * vertex]; mys_pre_y[y+BLOCKSIZE][x+BLOCKSIZE] = dist[(d_x + BLOCKSIZE) + (d_y_pre + BLOCKSIZE) * vertex]; mys_pre_y[y+BLOCKSIZE][x] = dist[d_x + (d_y_pre + BLOCKSIZE) * vertex]; mys_pre_y[y][x+BLOCKSIZE] = dist[(d_x + BLOCKSIZE) + d_y_pre * vertex]; int a = mys_bas[y][x]; int b = mys_bas[y+BLOCKSIZE][x+BLOCKSIZE]; int c = mys_bas[y+BLOCKSIZE][x]; int d = mys_bas[y][x+BLOCKSIZE]; __syncthreads(); #pragma unroll 12 for ( int i = 0; i < B; ++i){ a = min(a, (mys_pre_x[y][i] + mys_pre_y[i][x]) ); b = min(b, (mys_pre_x[y+BLOCKSIZE][i] + mys_pre_y[i][x+BLOCKSIZE]) ); c = min(c, (mys_pre_x[y+BLOCKSIZE][i] + mys_pre_y[i][x])); d = min(d, (mys_pre_x[y][i] + mys_pre_y[i][x+BLOCKSIZE])); // mys_bas[y][x] = min(mys_bas[y][x], (mys_pre_x[y][i] + mys_pre_y[i][x]) ); // mys_bas[y+BLOCKSIZE][x+BLOCKSIZE] = min(mys_bas[y+BLOCKSIZE][x+BLOCKSIZE], (mys_pre_x[y+BLOCKSIZE][i] + mys_pre_y[i][x+BLOCKSIZE]) ); // mys_bas[y+BLOCKSIZE][x] = min(mys_bas[y+BLOCKSIZE][x], (mys_pre_x[y+BLOCKSIZE][i] + mys_pre_y[i][x])); // mys_bas[y][x+BLOCKSIZE] = min(mys_bas[y][x+BLOCKSIZE], (mys_pre_x[y][i] + mys_pre_y[i][x+BLOCKSIZE])); } // Move back dist[d_x + d_y * vertex] = a; dist[(d_x + BLOCKSIZE) + (d_y + BLOCKSIZE) * vertex] = b; dist[d_x + (d_y+BLOCKSIZE) * vertex] = c; dist[(d_x + BLOCKSIZE) + d_y * vertex] = d; // dist[d_x + d_y * vertex] = mys_bas[y][x]; // dist[(d_x + BLOCKSIZE) + (d_y + BLOCKSIZE) * vertex] = mys_bas[y+BLOCKSIZE][x+BLOCKSIZE]; // dist[d_x + (d_y+BLOCKSIZE) * vertex] = mys_bas[y+BLOCKSIZE][x]; // dist[(d_x + BLOCKSIZE) + d_y * vertex] = mys_bas[y][x+BLOCKSIZE]; }
Editor is loading...