#define BLOCK 64 void mm_inner(int, const double *, const double *, double *); void mm_arbit7(int X, int Y, int Z, const double *A, const double *B, double *C, int Skip) { int ROW, i, j, k, stop; #define ICHUNK 8 stop= Z/ICHUNK; stop= stop * ICHUNK; for(ROW= 0; ROW < X; ROW++) { register double c0, c1, c2, c3, c4, c5, c6, c7; for(i= 0; i < stop; i+= ICHUNK) { /* c0= c1= c2= c3= c4= c5= c6= c7= 0; */ c0= C[ROW * Skip + i + 0]; c1= C[ROW * Skip + i + 1]; c2= C[ROW * Skip + i + 2]; c3= C[ROW * Skip + i + 3]; c4= C[ROW * Skip + i + 4]; c5= C[ROW * Skip + i + 5]; c6= C[ROW * Skip + i + 6]; c7= C[ROW * Skip + i + 7]; for(j= 0; j < Y; j++) { register double A_row_j= A[ROW * Skip + j]; c0+= A_row_j * B[j * Skip + i]; c1+= A_row_j * B[j * Skip + i+1]; c2+= A_row_j * B[j * Skip + i+2]; c3+= A_row_j * B[j * Skip + i+3]; c4+= A_row_j * B[j * Skip + i+4]; c5+= A_row_j * B[j * Skip + i+5]; c6+= A_row_j * B[j * Skip + i+6]; c7+= A_row_j * B[j * Skip + i+7]; } C[ROW * Skip + i + 0]= c0; C[ROW * Skip + i + 1]= c1; C[ROW * Skip + i + 2]= c2; C[ROW * Skip + i + 3]= c3; C[ROW * Skip + i + 4]= c4; C[ROW * Skip + i + 5]= c5; C[ROW * Skip + i + 6]= c6; C[ROW * Skip + i + 7]= c7; } /**************************************************************/ c0= C[ROW * Skip + i + 0]; c1= C[ROW * Skip + i + 1]; c2= C[ROW * Skip + i + 2]; c3= C[ROW * Skip + i + 3]; c4= C[ROW * Skip + i + 4]; c5= C[ROW * Skip + i + 5]; c6= C[ROW * Skip + i + 6]; for(j= 0; j < Y; j++) { register double A_row_j= A[ROW * Skip + j]; c0+= A_row_j * B[j * Skip + i]; c1+= A_row_j * B[j * Skip + i+1]; c2+= A_row_j * B[j * Skip + i+2]; c3+= A_row_j * B[j * Skip + i+3]; c4+= A_row_j * B[j * Skip + i+4]; c5+= A_row_j * B[j * Skip + i+5]; c6+= A_row_j * B[j * Skip + i+6]; } C[ROW * Skip + i + 0]= c0; C[ROW * Skip + i + 1]= c1; C[ROW * Skip + i + 2]= c2; C[ROW * Skip + i + 3]= c3; C[ROW * Skip + i + 4]= c4; C[ROW * Skip + i + 5]= c5; C[ROW * Skip + i + 6]= c6; /****************************************************/ } } void mm_arbit6(int X, int Y, int Z, const double *A, const double *B, double *C, int Skip) { int ROW, i, j, k, stop; #define ICHUNK 8 stop= Z/ICHUNK; stop= stop * ICHUNK; for(ROW= 0; ROW < X; ROW++) { register double c0, c1, c2, c3, c4, c5, c6, c7; for(i= 0; i < stop; i+= ICHUNK) { /* c0= c1= c2= c3= c4= c5= c6= c7= 0; */ c0= C[ROW * Skip + i + 0]; c1= C[ROW * Skip + i + 1]; c2= C[ROW * Skip + i + 2]; c3= C[ROW * Skip + i + 3]; c4= C[ROW * Skip + i + 4]; c5= C[ROW * Skip + i + 5]; c6= C[ROW * Skip + i + 6]; c7= C[ROW * Skip + i + 7]; for(j= 0; j < Y; j++) { register double A_row_j= A[ROW * Skip + j]; c0+= A_row_j * B[j * Skip + i]; c1+= A_row_j * B[j * Skip + i+1]; c2+= A_row_j * B[j * Skip + i+2]; c3+= A_row_j * B[j * Skip + i+3]; c4+= A_row_j * B[j * Skip + i+4]; c5+= A_row_j * B[j * Skip + i+5]; c6+= A_row_j * B[j * Skip + i+6]; c7+= A_row_j * B[j * Skip + i+7]; } C[ROW * Skip + i + 0]= c0; C[ROW * Skip + i + 1]= c1; C[ROW * Skip + i + 2]= c2; C[ROW * Skip + i + 3]= c3; C[ROW * Skip + i + 4]= c4; C[ROW * Skip + i + 5]= c5; C[ROW * Skip + i + 6]= c6; C[ROW * Skip + i + 7]= c7; } /**************************************************************/ c0= C[ROW * Skip + i + 0]; c1= C[ROW * Skip + i + 1]; c2= C[ROW * Skip + i + 2]; c3= C[ROW * Skip + i + 3]; c4= C[ROW * Skip + i + 4]; c5= C[ROW * Skip + i + 5]; for(j= 0; j < Y; j++) { register double A_row_j= A[ROW * Skip + j]; c0+= A_row_j * B[j * Skip + i]; c1+= A_row_j * B[j * Skip + i+1]; c2+= A_row_j * B[j * Skip + i+2]; c3+= A_row_j * B[j * Skip + i+3]; c4+= A_row_j * B[j * Skip + i+4]; c5+= A_row_j * B[j * Skip + i+5]; } C[ROW * Skip + i + 0]= c0; C[ROW * Skip + i + 1]= c1; C[ROW * Skip + i + 2]= c2; C[ROW * Skip + i + 3]= c3; C[ROW * Skip + i + 4]= c4; C[ROW * Skip + i + 5]= c5; /****************************************************/ } } void mm_arbit5(int X, int Y, int Z, const double *A, const double *B, double *C, int Skip) { int ROW, i, j, k, stop; #define ICHUNK 8 stop= Z/ICHUNK; stop= stop * ICHUNK; for(ROW= 0; ROW < X; ROW++) { register double c0, c1, c2, c3, c4, c5, c6, c7; for(i= 0; i < stop; i+= ICHUNK) { /* c0= c1= c2= c3= c4= c5= c6= c7= 0; */ c0= C[ROW * Skip + i + 0]; c1= C[ROW * Skip + i + 1]; c2= C[ROW * Skip + i + 2]; c3= C[ROW * Skip + i + 3]; c4= C[ROW * Skip + i + 4]; c5= C[ROW * Skip + i + 5]; c6= C[ROW * Skip + i + 6]; c7= C[ROW * Skip + i + 7]; for(j= 0; j < Y; j++) { register double A_row_j= A[ROW * Skip + j]; c0+= A_row_j * B[j * Skip + i]; c1+= A_row_j * B[j * Skip + i+1]; c2+= A_row_j * B[j * Skip + i+2]; c3+= A_row_j * B[j * Skip + i+3]; c4+= A_row_j * B[j * Skip + i+4]; c5+= A_row_j * B[j * Skip + i+5]; c6+= A_row_j * B[j * Skip + i+6]; c7+= A_row_j * B[j * Skip + i+7]; } C[ROW * Skip + i + 0]= c0; C[ROW * Skip + i + 1]= c1; C[ROW * Skip + i + 2]= c2; C[ROW * Skip + i + 3]= c3; C[ROW * Skip + i + 4]= c4; C[ROW * Skip + i + 5]= c5; C[ROW * Skip + i + 6]= c6; C[ROW * Skip + i + 7]= c7; } /**************************************************************/ c0= C[ROW * Skip + i + 0]; c1= C[ROW * Skip + i + 1]; c2= C[ROW * Skip + i + 2]; c3= C[ROW * Skip + i + 3]; c4= C[ROW * Skip + i + 4]; for(j= 0; j < Y; j++) { register double A_row_j= A[ROW * Skip + j]; c0+= A_row_j * B[j * Skip + i]; c1+= A_row_j * B[j * Skip + i+1]; c2+= A_row_j * B[j * Skip + i+2]; c3+= A_row_j * B[j * Skip + i+3]; c4+= A_row_j * B[j * Skip + i+4]; } C[ROW * Skip + i + 0]= c0; C[ROW * Skip + i + 1]= c1; C[ROW * Skip + i + 2]= c2; C[ROW * Skip + i + 3]= c3; C[ROW * Skip + i + 4]= c4; /****************************************************/ } } void mm_arbit4(int X, int Y, int Z, const double *A, const double *B, double *C, int Skip) { int ROW, i, j, k, stop; #define ICHUNK 8 stop= Z/ICHUNK; stop= stop * ICHUNK; for(ROW= 0; ROW < X; ROW++) { register double c0, c1, c2, c3, c4, c5, c6, c7; for(i= 0; i < stop; i+= ICHUNK) { /* c0= c1= c2= c3= c4= c5= c6= c7= 0; */ c0= C[ROW * Skip + i + 0]; c1= C[ROW * Skip + i + 1]; c2= C[ROW * Skip + i + 2]; c3= C[ROW * Skip + i + 3]; c4= C[ROW * Skip + i + 4]; c5= C[ROW * Skip + i + 5]; c6= C[ROW * Skip + i + 6]; c7= C[ROW * Skip + i + 7]; for(j= 0; j < Y; j++) { register double A_row_j= A[ROW * Skip + j]; c0+= A_row_j * B[j * Skip + i]; c1+= A_row_j * B[j * Skip + i+1]; c2+= A_row_j * B[j * Skip + i+2]; c3+= A_row_j * B[j * Skip + i+3]; c4+= A_row_j * B[j * Skip + i+4]; c5+= A_row_j * B[j * Skip + i+5]; c6+= A_row_j * B[j * Skip + i+6]; c7+= A_row_j * B[j * Skip + i+7]; } C[ROW * Skip + i + 0]= c0; C[ROW * Skip + i + 1]= c1; C[ROW * Skip + i + 2]= c2; C[ROW * Skip + i + 3]= c3; C[ROW * Skip + i + 4]= c4; C[ROW * Skip + i + 5]= c5; C[ROW * Skip + i + 6]= c6; C[ROW * Skip + i + 7]= c7; } /**************************************************************/ c0= C[ROW * Skip + i + 0]; c1= C[ROW * Skip + i + 1]; c2= C[ROW * Skip + i + 2]; c3= C[ROW * Skip + i + 3]; for(j= 0; j < Y; j++) { register double A_row_j= A[ROW * Skip + j]; c0+= A_row_j * B[j * Skip + i]; c1+= A_row_j * B[j * Skip + i+1]; c2+= A_row_j * B[j * Skip + i+2]; c3+= A_row_j * B[j * Skip + i+3]; } C[ROW * Skip + i + 0]= c0; C[ROW * Skip + i + 1]= c1; C[ROW * Skip + i + 2]= c2; C[ROW * Skip + i + 3]= c3; /****************************************************/ } } void mm_arbit3(int X, int Y, int Z, const double *A, const double *B, double *C, int Skip) { int ROW, i, j, k, stop; #define ICHUNK 8 stop= Z/ICHUNK; stop= stop * ICHUNK; for(ROW= 0; ROW < X; ROW++) { register double c0, c1, c2, c3, c4, c5, c6, c7; for(i= 0; i < stop; i+= ICHUNK) { /* c0= c1= c2= c3= c4= c5= c6= c7= 0; */ c0= C[ROW * Skip + i + 0]; c1= C[ROW * Skip + i + 1]; c2= C[ROW * Skip + i + 2]; c3= C[ROW * Skip + i + 3]; c4= C[ROW * Skip + i + 4]; c5= C[ROW * Skip + i + 5]; c6= C[ROW * Skip + i + 6]; c7= C[ROW * Skip + i + 7]; for(j= 0; j < Y; j++) { register double A_row_j= A[ROW * Skip + j]; c0+= A_row_j * B[j * Skip + i]; c1+= A_row_j * B[j * Skip + i+1]; c2+= A_row_j * B[j * Skip + i+2]; c3+= A_row_j * B[j * Skip + i+3]; c4+= A_row_j * B[j * Skip + i+4]; c5+= A_row_j * B[j * Skip + i+5]; c6+= A_row_j * B[j * Skip + i+6]; c7+= A_row_j * B[j * Skip + i+7]; } C[ROW * Skip + i + 0]= c0; C[ROW * Skip + i + 1]= c1; C[ROW * Skip + i + 2]= c2; C[ROW * Skip + i + 3]= c3; C[ROW * Skip + i + 4]= c4; C[ROW * Skip + i + 5]= c5; C[ROW * Skip + i + 6]= c6; C[ROW * Skip + i + 7]= c7; } /**************************************************************/ c0= C[ROW * Skip + i + 0]; c1= C[ROW * Skip + i + 1]; c2= C[ROW * Skip + i + 2]; for(j= 0; j < Y; j++) { register double A_row_j= A[ROW * Skip + j]; c0+= A_row_j * B[j * Skip + i]; c1+= A_row_j * B[j * Skip + i+1]; c2+= A_row_j * B[j * Skip + i+2]; } C[ROW * Skip + i + 0]= c0; C[ROW * Skip + i + 1]= c1; C[ROW * Skip + i + 2]= c2; /****************************************************/ } } void mm_arbit2(int X, int Y, int Z, const double *A, const double *B, double *C, int Skip) { int ROW, i, j, k, stop; #define ICHUNK 8 stop= Z/ICHUNK; stop= stop * ICHUNK; for(ROW= 0; ROW < X; ROW++) { register double c0, c1, c2, c3, c4, c5, c6, c7; for(i= 0; i < stop; i+= ICHUNK) { /* c0= c1= c2= c3= c4= c5= c6= c7= 0; */ c0= C[ROW * Skip + i + 0]; c1= C[ROW * Skip + i + 1]; c2= C[ROW * Skip + i + 2]; c3= C[ROW * Skip + i + 3]; c4= C[ROW * Skip + i + 4]; c5= C[ROW * Skip + i + 5]; c6= C[ROW * Skip + i + 6]; c7= C[ROW * Skip + i + 7]; for(j= 0; j < Y; j++) { register double A_row_j= A[ROW * Skip + j]; c0+= A_row_j * B[j * Skip + i]; c1+= A_row_j * B[j * Skip + i+1]; c2+= A_row_j * B[j * Skip + i+2]; c3+= A_row_j * B[j * Skip + i+3]; c4+= A_row_j * B[j * Skip + i+4]; c5+= A_row_j * B[j * Skip + i+5]; c6+= A_row_j * B[j * Skip + i+6]; c7+= A_row_j * B[j * Skip + i+7]; } C[ROW * Skip + i + 0]= c0; C[ROW * Skip + i + 1]= c1; C[ROW * Skip + i + 2]= c2; C[ROW * Skip + i + 3]= c3; C[ROW * Skip + i + 4]= c4; C[ROW * Skip + i + 5]= c5; C[ROW * Skip + i + 6]= c6; C[ROW * Skip + i + 7]= c7; } /**************************************************************/ c0= C[ROW * Skip + i + 0]; c1= C[ROW * Skip + i + 1]; for(j= 0; j < Y; j++) { register double A_row_j= A[ROW * Skip + j]; c0+= A_row_j * B[j * Skip + i]; c1+= A_row_j * B[j * Skip + i+1]; } C[ROW * Skip + i + 0]= c0; C[ROW * Skip + i + 1]= c1; /****************************************************/ } } void mm_arbit1(int X, int Y, int Z, const double *A, const double *B, double *C, int Skip) { int ROW, i, j, k, stop; #define ICHUNK 8 stop= Z/ICHUNK; stop= stop * ICHUNK; for(ROW= 0; ROW < X; ROW++) { register double c0, c1, c2, c3, c4, c5, c6, c7; for(i= 0; i < stop; i+= ICHUNK) { /* c0= c1= c2= c3= c4= c5= c6= c7= 0; */ c0= C[ROW * Skip + i + 0]; c1= C[ROW * Skip + i + 1]; c2= C[ROW * Skip + i + 2]; c3= C[ROW * Skip + i + 3]; c4= C[ROW * Skip + i + 4]; c5= C[ROW * Skip + i + 5]; c6= C[ROW * Skip + i + 6]; c7= C[ROW * Skip + i + 7]; for(j= 0; j < Y; j++) { register double A_row_j= A[ROW * Skip + j]; c0+= A_row_j * B[j * Skip + i]; c1+= A_row_j * B[j * Skip + i+1]; c2+= A_row_j * B[j * Skip + i+2]; c3+= A_row_j * B[j * Skip + i+3]; c4+= A_row_j * B[j * Skip + i+4]; c5+= A_row_j * B[j * Skip + i+5]; c6+= A_row_j * B[j * Skip + i+6]; c7+= A_row_j * B[j * Skip + i+7]; } C[ROW * Skip + i + 0]= c0; C[ROW * Skip + i + 1]= c1; C[ROW * Skip + i + 2]= c2; C[ROW * Skip + i + 3]= c3; C[ROW * Skip + i + 4]= c4; C[ROW * Skip + i + 5]= c5; C[ROW * Skip + i + 6]= c6; C[ROW * Skip + i + 7]= c7; } /**************************************************************/ c0= C[ROW * Skip + i + 0]; for(j= 0; j < Y; j++) { register double A_row_j= A[ROW * Skip + j]; c0+= A_row_j * B[j * Skip + i]; } C[ROW * Skip + i + 0]= c0; /****************************************************/ } } void mm_arbit0(int X, int Y, int Z, const double *A, const double *B, double *C, int Skip) { int ROW, i, j, k, stop; #define ICHUNK 8 stop= Z/ICHUNK; stop= stop * ICHUNK; for(ROW= 0; ROW < X; ROW++) { register double c0, c1, c2, c3, c4, c5, c6, c7; for(i= 0; i < stop; i+= ICHUNK) { /* c0= c1= c2= c3= c4= c5= c6= c7= 0; */ c0= C[ROW * Skip + i + 0]; c1= C[ROW * Skip + i + 1]; c2= C[ROW * Skip + i + 2]; c3= C[ROW * Skip + i + 3]; c4= C[ROW * Skip + i + 4]; c5= C[ROW * Skip + i + 5]; c6= C[ROW * Skip + i + 6]; c7= C[ROW * Skip + i + 7]; for(j= 0; j < Y; j++) { register double A_row_j= A[ROW * Skip + j]; c0+= A_row_j * B[j * Skip + i]; c1+= A_row_j * B[j * Skip + i+1]; c2+= A_row_j * B[j * Skip + i+2]; c3+= A_row_j * B[j * Skip + i+3]; c4+= A_row_j * B[j * Skip + i+4]; c5+= A_row_j * B[j * Skip + i+5]; c6+= A_row_j * B[j * Skip + i+6]; c7+= A_row_j * B[j * Skip + i+7]; } C[ROW * Skip + i + 0]= c0; C[ROW * Skip + i + 1]= c1; C[ROW * Skip + i + 2]= c2; C[ROW * Skip + i + 3]= c3; C[ROW * Skip + i + 4]= c4; C[ROW * Skip + i + 5]= c5; C[ROW * Skip + i + 6]= c6; C[ROW * Skip + i + 7]= c7; } /**************************************************************/ /****************************************************/ } } #define max(a, b) ((a) > (b) ? (a) : (b)) #define min(a, b) ((a) < (b) ? (a) : (b)) typedef void (* mm_fun)(int, int, int, const double *, const double *, double *, int); mm_fun mm_functions[ICHUNK]= { mm_arbit0, mm_arbit1, mm_arbit2, mm_arbit3, mm_arbit4, mm_arbit5, mm_arbit6, mm_arbit7}; #define mm_arbit(X, Y, Z, A, B, C, Skip) (mm_functions[Z % ICHUNK])\ (X, Y, Z, A, B, C, Skip) void mul_mfmf_mf(int S, const double *A, const double *B, double *C) { register int i, j, k; if(S <= 160) { (mm_functions[S % ICHUNK])(S, S, S, A, B, C, S); return; } for(i= 0; i < S; i+= BLOCK) for(j= 0; j < S; j+= BLOCK) for(k= 0; k < S; k+= BLOCK) { int X, Y, Z; const double *a, *b; double *c; X= min(BLOCK, S - i); Y= min(BLOCK, S - k); Z= min(BLOCK, S - j); /* printf("%d %d %d, %d %d %d\n", X, Y, Z, i, j, k); */ a= &A[i * S + k]; b= &B[k * S + j]; c= &C[i * S + j]; mm_arbit(X, Y, Z, a, b, c, S); } }