/** * All-pairs-shortest paths * Device code (column-wise) * Recursive in-place implementation * Copyright by Aydin Buluc * June 2008 */ #ifndef _APSP_KERNEL_H_ #define _APSP_KERNEL_H_ #include #include "apsp.h" #define CHECK_BANK_CONFLICTS 0 #if CHECK_BANK_CONFLICTS #define AS(i, j) CUT_BANK_CHECKER(((float*)&As[0][0]), (BLOCK_SIZE * i + j)) #define BS(i, j) CUT_BANK_CHECKER(((float*)&Bs[0][0]), (BLOCK_SIZE * i + j)) #else #define AS(i, j) As[i][j] #define BS(i, j) Bs[i][j] #endif /** * APSP using a single block (column version) * Iteration is within this kernel function, * So, no looping is necessary when calling apsp_seq * start is the starting offset of nboth dimensions */ __global__ void apsp_seq( float * A, int wA, int start) { // Thread index int tx = threadIdx.x; int ty = threadIdx.y; // Csub is used to store the element of the result // that is computed by the thread int offset = start * WA + start; // memory offset for (int k = 0; k < wA; ++k) { float M1 = A[offset+ty*WA+k]; // kth row float M2 = A[offset+k*WA + tx]; // kth column A[offset+ty*WA + tx] = fminf(M1+M2, A[offset+ty*WA + tx]); __syncthreads(); } } template __device__ void saxpy_MinPlus(float a, float *b, float *c) { for(int i=0; i( a[0], &bs[0][0], c ); a[0] = A[0*lda]; saxpy_MinPlus<16>( a[1], &bs[1][0], c ); a[1] = A[1*lda]; saxpy_MinPlus<16>( a[2], &bs[2][0], c ); a[2] = A[2*lda]; saxpy_MinPlus<16>( a[3], &bs[3][0], c ); a[3] = A[3*lda]; A += 4*lda; saxpy_MinPlus<16>( a[0], &bs[4][0], c ); a[0] = A[0*lda]; saxpy_MinPlus<16>( a[1], &bs[5][0], c ); a[1] = A[1*lda]; saxpy_MinPlus<16>( a[2], &bs[6][0], c ); a[2] = A[2*lda]; saxpy_MinPlus<16>( a[3], &bs[7][0], c ); a[3] = A[3*lda]; A += 4*lda; saxpy_MinPlus<16>( a[0], &bs[8][0], c ); a[0] = A[0*lda]; saxpy_MinPlus<16>( a[1], &bs[9][0], c ); a[1] = A[1*lda]; saxpy_MinPlus<16>( a[2], &bs[10][0], c ); a[2] = A[2*lda]; saxpy_MinPlus<16>( a[3], &bs[11][0], c ); a[3] = A[3*lda]; A += 4*lda; saxpy_MinPlus<16>( a[0], &bs[12][0], c ); saxpy_MinPlus<16>( a[1], &bs[13][0], c ); saxpy_MinPlus<16>( a[2], &bs[14][0], c ); saxpy_MinPlus<16>( a[3], &bs[15][0], c ); B += 16; __syncthreads(); } while( B < Blast ); for( int i = 0; i < 16; ++i, C += ldc ) { C[0] = fminf(c[i],beta*C[0]); } } #endif // #ifndef _APSP_KERNEL_H_