/********************************************************************** * * Hongyun Wang & Paul Milligan, January 1995. * * CS 267 "Applications of Parallel Computers" * Square matrix multiplication: * C = C + A*B * Optimised for the IBM RS6000 Power2 * **********************************************************************/ void mul_mfmf_mf( int m, const double *A, const double *B, double *C ) { #define N3 32 int m_block, m_chunk, m_rem, N2 ; int i1, j1, k1; int m_row, m_col_a, m_col_b; double dummy; int row_rem, col_rem; register int i2, j2_b, j2_a; register int row1, col1, row2, j; register int ind1, ind2; register double C11,C12,C13,C14,C15,C16,C17,C18, C21,C22,C23,C24,C25,C26,C27,C28; register double A1,A2; register double B1,B2,B3,B4,B5,B6,B7,B8; if (m < 161) N2 = 160; else N2 = 96; m_block = (m - N3) / N2; m_chunk = m_block * N2; m_rem = m - m_block * N2; m_row = N2; for (i1 = 0; i1 < m; i1 += m_row) { if (i1 == m_chunk) m_row = m_rem; m_col_b = N2; for (j1 = 0; j1 < m; j1 += m_col_b) { if (j1 == m_chunk) m_col_b = m_rem; m_col_a = N2; for ( k1 = 0; k1 < m; k1 += m_col_a) { if (k1 == m_chunk) m_col_a = m_rem; i2 = (m_row / 2) * 2; row_rem = m_row - i2; i2 += i1; j2_b = (m_col_b / 8) * 8; col_rem = m_col_b - j2_b; j2_b += j1; j2_a = m_col_a + k1; for (row1=i1; row1 < i2; row1 += 2 ) { row2 = row1 + 1; /* then operate on 8 col blocks: */ for (col1=j1; col1 < j2_b; col1 += 8 ) { C11=C12=C13=C14=C15=C16=C17=C18=0.0; C21=C22=C23=C24=C25=C26=C27=C28=0.0; for (j=k1; j < j2_a; j++ ) { ind1 = j*m + col1; A1 = A[row1*m+j]; A2 = A[row2*m+j]; dummy = B[ind1 + m]; B1 = B[ind1]; B2 = B[ind1+1]; C11 += A1 * B1; C12 += A1 * B2; C21 += A2 * B1; C22 += A2 * B2; B3 = B[ind1+2]; B4 = B[ind1+3]; C13 += A1 * B3; C14 += A1 * B4; C23 += A2 * B3; C24 += A2 * B4; B5 = B[ind1+4]; B6 = B[ind1+5]; C15 += A1 * B5; C16 += A1 * B6; C25 += A2 * B5; C26 += A2 * B6; B7 = B[ind1+6]; B8 = B[ind1+7]; C17 += A1 * B7; C18 += A1 * B8; C27 += A2 * B7; C28 += A2 * B8; } ind1 = row1 * m + col1; ind2 = row2 * m + col1; C[ind1] += C11; C[ind1+1] += C12; C[ind1+2] += C13; C[ind1+3] += C14; C[ind1+4] += C15; C[ind1+5] += C16; C[ind1+6] += C17; C[ind1+7] += C18; C[ind2] += C21; C[ind2+1] += C22; C[ind2+2] += C23; C[ind2+3] += C24; C[ind2+4] += C25; C[ind2+5] += C26; C[ind2+6] += C27; C[ind2+7] += C28; } /* next block of 8 cols */ switch (col_rem) { case 0: break; case 7: C11=C12=C13=C14=C15=C16=C17=0.0; C21=C22=C23=C24=C25=C26=C27=0.0; for (j=k1; j < j2_a; j++ ) { ind1 = j*m + j2_b; A1 = A[row1*m+j]; A2 = A[row2*m+j]; dummy = B[ind1 + m]; B1 = B[ind1]; B2 = B[ind1+1]; C11 += A1 * B1; C12 += A1 * B2; C21 += A2 * B1; C22 += A2 * B2; B3 = B[ind1+2]; B4 = B[ind1+3]; C13 += A1 * B3; C14 += A1 * B4; C23 += A2 * B3; C24 += A2 * B4; B5 = B[ind1+4]; B6 = B[ind1+5]; C15 += A1 * B5; C16 += A1 * B6; C25 += A2 * B5; C26 += A2 * B6; B7 = B[ind1+6]; C17 += A1 * B7; C27 += A2 * B7; } ind1 = row1 * m + j2_b; ind2 = row2 * m + j2_b; C[ind1] += C11; C[ind1+1] += C12; C[ind1+2] += C13; C[ind1+3] += C14; C[ind1+4] += C15; C[ind1+5] += C16; C[ind1+6] += C17; C[ind2] += C21; C[ind2+1] += C22; C[ind2+2] += C23; C[ind2+3] += C24; C[ind2+4] += C25; C[ind2+5] += C26; C[ind2+6] += C27; break; case 6: C11=C12=C13=C14=C15=C16=0.0; C21=C22=C23=C24=C25=C26=0.0; for (j=k1; j < j2_a ;j++ ) { ind1 = j*m + j2_b; A1 = A[row1*m+j]; A2 = A[row2*m+j]; dummy = B[ind1 + m]; B1 = B[ind1]; B2 = B[ind1+1]; C11 += A1 * B1; C12 += A1 * B2; C21 += A2 * B1; C22 += A2 * B2; B3 = B[ind1+2]; B4 = B[ind1+3]; C13 += A1 * B3; C14 += A1 * B4; C23 += A2 * B3; C24 += A2 * B4; B5 = B[ind1+4]; B6 = B[ind1+5]; C15 += A1 * B5; C16 += A1 * B6; C25 += A2 * B5; C26 += A2 * B6; } ind1 = row1 * m + j2_b; ind2 = row2 * m + j2_b; C[ind1] += C11; C[ind1+1] += C12; C[ind1+2] += C13; C[ind1+3] += C14; C[ind1+4] += C15; C[ind1+5] += C16; C[ind2] += C21; C[ind2+1] += C22; C[ind2+2] += C23; C[ind2+3] += C24; C[ind2+4] += C25; C[ind2+5] += C26; break; case 5: C11=C12=C13=C14=C15=0.0; C21=C22=C23=C24=C25=0.0; for (j=k1; j < j2_a; j++ ) { ind1 = j*m + j2_b; A1 = A[row1*m+j]; A2 = A[row2*m+j]; dummy = B[ind1 + m]; B1 = B[ind1]; B2 = B[ind1+1]; C11 += A1 * B1; C12 += A1 * B2; C21 += A2 * B1; C22 += A2 * B2; B3 = B[ind1+2]; B4 = B[ind1+3]; C13 += A1 * B3; C14 += A1 * B4; C23 += A2 * B3; C24 += A2 * B4; B5 = B[ind1+4]; C15 += A1 * B5; C25 += A2 * B5; } ind1 = row1 * m + j2_b; ind2 = row2 * m + j2_b; C[ind1] += C11; C[ind1+1] += C12; C[ind1+2] += C13; C[ind1+3] += C14; C[ind1+4] += C15; C[ind2] += C21; C[ind2+1] += C22; C[ind2+2] += C23; C[ind2+3] += C24; C[ind2+4] += C25; break; case 4: C11=C12=C13=C14=0.0; C21=C22=C23=C24=0.0; for (j=k1; j < j2_a; j++ ) { ind1 = j*m + j2_b; A1 = A[row1*m+j]; A2 = A[row2*m+j]; dummy = B[ind1 + m]; B1 = B[ind1]; B2 = B[ind1+1]; C11 += A1 * B1; C12 += A1 * B2; C21 += A2 * B1; C22 += A2 * B2; B3 = B[ind1+2]; B4 = B[ind1+3]; C13 += A1 * B3; C14 += A1 * B4; C23 += A2 * B3; C24 += A2 * B4; } ind1 = row1 * m + j2_b; ind2 = row2 * m + j2_b; C[ind1] += C11; C[ind1+1] += C12; C[ind1+2] += C13; C[ind1+3] += C14; C[ind2] += C21; C[ind2+1] += C22; C[ind2+2] += C23; C[ind2+3] += C24; break; case 3: C11=C12=C13=0.0; C21=C22=C23=0.0; for (j=k1; j < j2_a; j++ ) { ind1 = j*m + j2_b; A1 = A[row1*m+j]; A2 = A[row2*m+j]; dummy = B[ind1 + m]; B1 = B[ind1]; B2 = B[ind1+1]; C11 += A1 * B1; C12 += A1 * B2; C21 += A2 * B1; C22 += A2 * B2; B3 = B[ind1+2]; C13 += A1 * B3; C23 += A2 * B3; } ind1 = row1 * m + j2_b; ind2 = row2 * m + j2_b; C[ind1] += C11; C[ind1+1] += C12; C[ind1+2] += C13; C[ind2] += C21; C[ind2+1] += C22; C[ind2+2] += C23; break; case 2: C11=C12=0.0; C21=C22=0.0; for (j=k1; j < j2_a; j++ ) { ind1 = j*m + j2_b; A1 = A[row1*m+j]; A2 = A[row2*m+j]; dummy = B[ind1 + m]; B1 = B[ind1]; B2 = B[ind1+1]; C11 += A1 * B1; C12 += A1 * B2; C21 += A2 * B1; C22 += A2 * B2; } ind1 = row1 * m + j2_b; ind2 = row2 * m + j2_b; C[ind1] += C11; C[ind1+1] += C12; C[ind2] += C21; C[ind2+1] += C22; break; case 1: C11=0.0; C21=0.0; for (j=k1; j < j2_a; j++ ) { ind1 = j*m + j2_b; A1 = A[row1*m+j]; A2 = A[row2*m+j]; dummy = B[ind1 + m]; B1 = B[ind1]; C11 += A1 * B1; C21 += A2 * B1; } ind1 = row1 * m + j2_b; ind2 = row2 * m + j2_b; C[ind1] += C11; C[ind2] += C21; break; default : exit(1); } } /* next pair of rows */ if (row_rem) { for (col1=j1; col1 < j2_b ; col1 += 8 ) { C11=C12=C13=C14=C15=C16=C17=C18=0.0; for (j=k1; j < j2_a; j++ ) { ind1 = j*m + col1; A1 = A[i2*m+j]; dummy = B[ind1 + m]; B1 = B[ind1]; B2 = B[ind1+1]; C11 += A1 * B1; C12 += A1 * B2; B3 = B[ind1+2]; B4 = B[ind1+3]; C13 += A1 * B3; C14 += A1 * B4; B5 = B[ind1+4]; B6 = B[ind1+5]; C15 += A1 * B5; C16 += A1 * B6; B7 = B[ind1+6]; B8 = B[ind1+7]; C17 += A1 * B7; C18 += A1 * B8; } ind1 = i2 * m + col1; C[ind1] += C11; C[ind1+1] += C12; C[ind1+2] += C13; C[ind1+3] += C14; C[ind1+4] += C15; C[ind1+5] += C16; C[ind1+6] += C17; C[ind1+7] += C18; } /* next block of 8 cols */ switch (col_rem) { case 0: break; case 7: C11=C12=C13=C14=C15=C16=C17=0.0; for (j=k1; j < j2_a; j++ ) { ind1 = j*m + j2_b; A1 = A[i2*m+j]; dummy = B[ind1 + m]; B1 = B[ind1]; B2 = B[ind1+1]; C11 += A1 * B1; C12 += A1 * B2; B3 = B[ind1+2]; B4 = B[ind1+3]; C13 += A1 * B3; C14 += A1 * B4; B5 = B[ind1+4]; B6 = B[ind1+5]; C15 += A1 * B5; C16 += A1 * B6; B7 = B[ind1+6]; C17 += A1 * B7; } ind1 = i2 * m + j2_b; C[ind1] += C11; C[ind1+1] += C12; C[ind1+2] += C13; C[ind1+3] += C14; C[ind1+4] += C15; C[ind1+5] += C16; C[ind1+6] += C17; break; case 6: C11=C12=C13=C14=C15=C16=0.0; for (j=k1; j < j2_a; j++ ) { ind1 = j*m + j2_b; A1 = A[i2*m+j]; dummy = B[ind1 + m]; B1 = B[ind1]; B2 = B[ind1+1]; C11 += A1 * B1; C12 += A1 * B2; B3 = B[ind1+2]; B4 = B[ind1+3]; C13 += A1 * B3; C14 += A1 * B4; B5 = B[ind1+4]; B6 = B[ind1+5]; C15 += A1 * B5; C16 += A1 * B6; } ind1 = i2 * m + j2_b; C[ind1] += C11; C[ind1+1] += C12; C[ind1+2] += C13; C[ind1+3] += C14; C[ind1+4] += C15; C[ind1+5] += C16; break; case 5: C11=C12=C13=C14=C15=0.0; for (j=k1; j < j2_a ;j++ ) { ind1 = j*m + j2_b; A1 = A[i2*m+j]; dummy = B[ind1 + m]; B1 = B[ind1]; B2 = B[ind1+1]; C11 += A1 * B1; C12 += A1 * B2; B3 = B[ind1+2]; B4 = B[ind1+3]; C13 += A1 * B3; C14 += A1 * B4; B5 = B[ind1+4]; C15 += A1 * B5; } ind1 = i2 * m + j2_b; C[ind1] += C11; C[ind1+1] += C12; C[ind1+2] += C13; C[ind1+3] += C14; C[ind1+4] += C15; break; case 4: C11=C12=C13=C14=0.0; for (j=k1; j < j2_a ;j++ ) { ind1 = j*m + j2_b; A1 = A[i2*m+j]; dummy = B[ind1 + m]; B1 = B[ind1]; B2 = B[ind1+1]; C11 += A1 * B1; C12 += A1 * B2; B3 = B[ind1+2]; B4 = B[ind1+3]; C13 += A1 * B3; C14 += A1 * B4; } ind1 = i2 * m + j2_b; C[ind1] += C11; C[ind1+1] += C12; C[ind1+2] += C13; C[ind1+3] += C14; break; case 3: C11=C12=C13=0.0; for (j=k1; j < j2_a ;j++ ) { ind1 = j*m + j2_b; A1 = A[i2*m+j]; dummy = B[ind1 + m]; B1 = B[ind1]; B2 = B[ind1+1]; C11 += A1 * B1; C12 += A1 * B2; B3 = B[ind1+2]; C13 += A1 * B3; } ind1 = i2 * m + j2_b; C[ind1] += C11; C[ind1+1] += C12; C[ind1+2] += C13; break; case 2: C11=C12=0.0; for (j=k1; j < j2_a; j++ ) { ind1 = j*m + j2_b; A1 = A[i2*m+j]; dummy = B[ind1 + m]; B1 = B[ind1]; B2 = B[ind1+1]; C11 += A1 * B1; C12 += A1 * B2; } ind1 = i2 * m + j2_b; C[ind1] += C11; C[ind1+1] += C12; break; case 1: C11=0.0; for (j=k1; j < j2_a; j++ ) { ind1 = j*m + j2_b; A1 = A[i2*m+j]; dummy = B[ind1 + m]; B1 = B[ind1]; C11 += A1 * B1; } ind1 = i2 * m + j2_b; C[ind1] += C11; break; default: exit(1); } } /* next pair of rows */ } } } } /* end mult_mfmf_mf() */