/************************************************** * All-pairs-shortest paths * Host code. * Recursive in-place implementation * Copyright by Ceren Budak, Aydin Buluc, Fenglin Liao, Arda Atali * NOTES: Continues the recursion until all of the problem fits into one block. * DATE: December 2007 * ROW-WISE (OLD) TIMINGS: [Max BLOCK_SIZE possible is 16] ~5100/5900 ms with 4K vertices and BLOCK_SIZE=16 ~7500 ms with 4K vertices and BLOCK_SIZE=8 * COL-WISE (NEW) TIMINGS: [Using Volkov's GEMM] ~1014 ms with 4K vertices ***************************************************/ /** * An implementation of the recursive APSP algorithm * A is an adjacency matrix of a graph, nonzeros represents edge, zeros represent no edges * Matrices are laid out in column-major order */ // System includes #include #include #include #include #include // Project includes #include // Kernel includes #include "apsp_kernel.h" using namespace std; void runTest(int argc, char** argv); void printDiff(float *, float*, int, int); void floydWarshall(float *, int, int); extern "C" void computeGoldCol(float *, const float *, unsigned int); int main(int argc, char** argv) { runTest(argc, argv); CUT_EXIT(argc, argv); } void Load(FILE * fid, float * distMatrix, int size) { int read = 0; int v1, v2; float value; for (int j=0; j>>(data, width,start); } else if(width <= FAST_GEMM) { int nw = width/2; // new width floydWarshall(data, start, nw); // setup execution parameters dim3 threadsmult(BLOCK_SIZE, BLOCK_SIZE); dim3 gridmult(nw / BLOCK_SIZE, nw / BLOCK_SIZE); // execute the kernel B = AB matrixMul<<< gridmult, threadsmult >>>(data, data, data, nw, start+nw, start, start,start,start+nw, start,0); // execute the kernel C = CA matrixMul<<< gridmult, threadsmult >>>(data, data, data, nw, start, start+nw,start,start+nw,start, start,0); // execute the kernel D += CB matrixMul<<< gridmult, threadsmult >>>(data, data, data, nw, start+nw,start+nw,start,start+nw, start+nw, start,1); // do FW for D floydWarshall(data, start+nw, nw); // execute the kernel B = BD matrixMul<<< gridmult, threadsmult >>>(data, data, data, nw, start+nw, start, start+nw,start,start+nw, start+nw,0); // execute the kernel C = DC matrixMul<<< gridmult, threadsmult >>>(data, data, data, nw, start, start+nw,start+nw,start+nw,start, start+nw,0); // execute the kernel A += BC matrixMul<<< gridmult, threadsmult >>>(data, data, data, nw, start,start,start+nw,start, start, start+nw,1); } else { /*A=floyd-warshall(A); B=AB; C=CA; D=D+CB; D=floyd-warshall(D); B=BD; C=DC; A=A+BC;*/ int nw = width/2; // new width floydWarshall(data, start, nw); // setup execution parameters dim3 gemmgrid( nw/64, nw/16 ); dim3 gemmthreads( 16, 4 ); // Remember: Column-major float * A = data + start * WA + start; float * B = data + (start+nw) * WA + start; float * C = data + start * WA + (start+nw); float * D = data + (start+nw) * WA + (start+nw); // sgemmNN_MinPlus( const float *A, int lda, const float *B, int ldb, float* C, int ldc, int k, float alpha, float beta ) // no need to send m & n since they are known through grid dimensions ! // execute the parallel multiplication kernel B = AB sgemmNN_MinPlus<<>>(A, WA, B, WA, B, WA, nw, FLOATINF ); // execute the parallel multiplication kernel C = CA sgemmNN_MinPlus<<>>(C, WA, A, WA, C, WA, nw, FLOATINF ); // execute the parallel multiplication kernel D += CB sgemmNN_MinPlus<<>>(C, WA, B, WA, D, WA, nw, 1 ); // do FW for D floydWarshall(data, start+nw, nw); // execute the parallel multiplication kernel B = BD sgemmNN_MinPlus<<>>(B, WA, D, WA, B, WA, nw, FLOATINF ); // execute the parallel multiplication kernel C = DC sgemmNN_MinPlus<<>>(D, WA, C, WA, C, WA, nw, FLOATINF ); // execute the parallel multiplication kernel A += BC sgemmNN_MinPlus<<>>(B, WA, C, WA, A, WA, nw, 1 ); } } void printDiff(float *data1, float *data2, int width, int height) { fprintf(stderr,"Verifying..."); int i,j,k; int error_count=0; for (i=0; i 0.01 ) { fprintf(stderr,"diff(%d,%d) CPU=%f, GPU=%f\n", i,j, data1[k], data2[k]); error_count++; } } } printf("\nTotal Errors = %d\n", error_count); /* printf("Writing output to disk...\n"); FILE * fp; if((fp=fopen("result.txt", "w")) == NULL) { printf("Cannot open file.\n"); exit(1); } for (int i=0; i