/**********************************************************************
 *
 * Hongyun Wang & Paul Milligan, January 1995.
 *
 * CS 267 "Applications of Parallel Computers"
 * Square matrix multiplication:
 * C = C + A*B
 * Optimised for the IBM RS6000 Power2 
 *
**********************************************************************/

void mul_mfmf_mf( int m,                
                  const double *A,     
                  const double *B,     
                  double *C )
{
#define N3 32

  int m_block, m_chunk, m_rem, N2 ;
  int i1, j1, k1;
  int m_row, m_col_a, m_col_b;

  double dummy;

  int row_rem, col_rem;
  register int i2, j2_b, j2_a;
  register int row1, col1, row2, j;
  register int ind1, ind2;

  register double C11,C12,C13,C14,C15,C16,C17,C18,
		  C21,C22,C23,C24,C25,C26,C27,C28;
  register double A1,A2;
  register double B1,B2,B3,B4,B5,B6,B7,B8;

  if (m < 161) 
    N2 = 160;
  else
    N2 = 96;
  
  m_block = (m - N3) / N2;
  m_chunk = m_block * N2;
  m_rem = m - m_block * N2;

  m_row = N2;
  for (i1 = 0; i1 < m; i1 += m_row)
  {
    if (i1 == m_chunk) m_row = m_rem;
    m_col_b = N2;
    for (j1 = 0; j1 < m; j1 += m_col_b)
    {
      if (j1 == m_chunk) m_col_b = m_rem;
      m_col_a = N2;
      for ( k1 = 0; k1 < m; k1 += m_col_a)
      {
      if (k1 == m_chunk) m_col_a = m_rem;

  i2 = (m_row / 2) * 2;
  row_rem = m_row - i2;
  i2 += i1;

  j2_b = (m_col_b / 8) * 8;
  col_rem = m_col_b - j2_b;
  j2_b += j1;
  j2_a = m_col_a + k1;

  for (row1=i1; row1 < i2; row1 += 2 ) 
  {
     row2 = row1 + 1;

     /* then operate on 8 col blocks: */

     for (col1=j1; col1 < j2_b; col1 += 8 ) 
     {
        C11=C12=C13=C14=C15=C16=C17=C18=0.0;
	C21=C22=C23=C24=C25=C26=C27=C28=0.0;
        for (j=k1; j < j2_a; j++ ) {
	   ind1 = j*m + col1;
           A1 = A[row1*m+j];
           A2 = A[row2*m+j];

	   dummy = B[ind1 + m];
           B1 = B[ind1];
           B2 = B[ind1+1];
           C11 += A1 * B1;
           C12 += A1 * B2;
           C21 += A2 * B1;
           C22 += A2 * B2;

           B3 = B[ind1+2];
           B4 = B[ind1+3];
           C13 += A1 * B3;
           C14 += A1 * B4;
           C23 += A2 * B3;
           C24 += A2 * B4;

           B5 = B[ind1+4];
           B6 = B[ind1+5];
           C15 += A1 * B5;
           C16 += A1 * B6;
           C25 += A2 * B5;
           C26 += A2 * B6;

           B7 = B[ind1+6];
           B8 = B[ind1+7];
           C17 += A1 * B7;
           C18 += A1 * B8;
           C27 += A2 * B7;
           C28 += A2 * B8;
        }
        ind1 = row1 * m + col1;
	ind2 = row2 * m + col1;
        C[ind1] += C11;
        C[ind1+1] += C12;
        C[ind1+2] += C13;
        C[ind1+3] += C14;
        C[ind1+4] += C15;
        C[ind1+5] += C16;
        C[ind1+6] += C17;
        C[ind1+7] += C18;

	C[ind2] += C21;
	C[ind2+1] += C22;
	C[ind2+2] += C23;
	C[ind2+3] += C24;
	C[ind2+4] += C25;
	C[ind2+5] += C26;
	C[ind2+6] += C27;
	C[ind2+7] += C28;

    } /* next block of 8 cols */
    switch (col_rem) 
    {
      case 0:
	break;

      case 7:
        C11=C12=C13=C14=C15=C16=C17=0.0;
	C21=C22=C23=C24=C25=C26=C27=0.0;
        for (j=k1; j < j2_a; j++ ) {
	   ind1 = j*m + j2_b;
           A1 = A[row1*m+j];
           A2 = A[row2*m+j];

	   dummy = B[ind1 + m];
           B1 = B[ind1];
           B2 = B[ind1+1];
           C11 += A1 * B1;
           C12 += A1 * B2;
           C21 += A2 * B1;
           C22 += A2 * B2;

           B3 = B[ind1+2];
           B4 = B[ind1+3];
           C13 += A1 * B3;
           C14 += A1 * B4;
           C23 += A2 * B3;
           C24 += A2 * B4;

           B5 = B[ind1+4];
           B6 = B[ind1+5];
           C15 += A1 * B5;
           C16 += A1 * B6;
           C25 += A2 * B5;
           C26 += A2 * B6;

           B7 = B[ind1+6];
           C17 += A1 * B7;
           C27 += A2 * B7;
        }
        ind1 = row1 * m + j2_b;
	ind2 = row2 * m + j2_b;
        C[ind1] += C11;
        C[ind1+1] += C12;
        C[ind1+2] += C13;
        C[ind1+3] += C14;
        C[ind1+4] += C15;
        C[ind1+5] += C16;
        C[ind1+6] += C17;

	C[ind2] += C21;
	C[ind2+1] += C22;
	C[ind2+2] += C23;
	C[ind2+3] += C24;
	C[ind2+4] += C25;
	C[ind2+5] += C26;
	C[ind2+6] += C27;

	break;

      case 6:
        C11=C12=C13=C14=C15=C16=0.0;
	C21=C22=C23=C24=C25=C26=0.0;
        for (j=k1; j < j2_a ;j++ ) {
	   ind1 = j*m + j2_b;
           A1 = A[row1*m+j];
           A2 = A[row2*m+j];

	   dummy = B[ind1 + m];
           B1 = B[ind1];
           B2 = B[ind1+1];
           C11 += A1 * B1;
           C12 += A1 * B2;
           C21 += A2 * B1;
           C22 += A2 * B2;

           B3 = B[ind1+2];
           B4 = B[ind1+3];
           C13 += A1 * B3;
           C14 += A1 * B4;
           C23 += A2 * B3;
           C24 += A2 * B4;

           B5 = B[ind1+4];
           B6 = B[ind1+5];
           C15 += A1 * B5;
           C16 += A1 * B6;
           C25 += A2 * B5;
           C26 += A2 * B6;

        }
        ind1 = row1 * m + j2_b;
	ind2 = row2 * m + j2_b;
        C[ind1] += C11;
        C[ind1+1] += C12;
        C[ind1+2] += C13;
        C[ind1+3] += C14;
        C[ind1+4] += C15;
        C[ind1+5] += C16;

	C[ind2] += C21;
	C[ind2+1] += C22;
	C[ind2+2] += C23;
	C[ind2+3] += C24;
	C[ind2+4] += C25;
	C[ind2+5] += C26;

	break;

      case 5:
        C11=C12=C13=C14=C15=0.0;
	C21=C22=C23=C24=C25=0.0;
        for (j=k1; j < j2_a; j++ ) {
	   ind1 = j*m + j2_b;
           A1 = A[row1*m+j];
           A2 = A[row2*m+j];

	   dummy = B[ind1 + m];
           B1 = B[ind1];
           B2 = B[ind1+1];
           C11 += A1 * B1;
           C12 += A1 * B2;
           C21 += A2 * B1;
           C22 += A2 * B2;

           B3 = B[ind1+2];
           B4 = B[ind1+3];
           C13 += A1 * B3;
           C14 += A1 * B4;
           C23 += A2 * B3;
           C24 += A2 * B4;

           B5 = B[ind1+4];
           C15 += A1 * B5;
           C25 += A2 * B5;

        }
        ind1 = row1 * m + j2_b;
	ind2 = row2 * m + j2_b;
        C[ind1] += C11;
        C[ind1+1] += C12;
        C[ind1+2] += C13;
        C[ind1+3] += C14;
        C[ind1+4] += C15;

	C[ind2] += C21;
	C[ind2+1] += C22;
	C[ind2+2] += C23;
	C[ind2+3] += C24;
	C[ind2+4] += C25;

	break;

      case 4:
        C11=C12=C13=C14=0.0;
	C21=C22=C23=C24=0.0;
        for (j=k1; j < j2_a; j++ ) {
	   ind1 = j*m + j2_b;
           A1 = A[row1*m+j];
           A2 = A[row2*m+j];

	   dummy = B[ind1 + m];
           B1 = B[ind1];
           B2 = B[ind1+1];
           C11 += A1 * B1;
           C12 += A1 * B2;
           C21 += A2 * B1;
           C22 += A2 * B2;

           B3 = B[ind1+2];
           B4 = B[ind1+3];
           C13 += A1 * B3;
           C14 += A1 * B4;
           C23 += A2 * B3;
           C24 += A2 * B4;

        }
        ind1 = row1 * m + j2_b;
	ind2 = row2 * m + j2_b;
        C[ind1] += C11;
        C[ind1+1] += C12;
        C[ind1+2] += C13;
        C[ind1+3] += C14;

	C[ind2] += C21;
	C[ind2+1] += C22;
	C[ind2+2] += C23;
	C[ind2+3] += C24;

	break;

      case 3:
        C11=C12=C13=0.0;
	C21=C22=C23=0.0;
        for (j=k1; j < j2_a; j++ ) {
	   ind1 = j*m + j2_b;
           A1 = A[row1*m+j];
           A2 = A[row2*m+j];

	   dummy = B[ind1 + m];
           B1 = B[ind1];
           B2 = B[ind1+1];
           C11 += A1 * B1;
           C12 += A1 * B2;
           C21 += A2 * B1;
           C22 += A2 * B2;

           B3 = B[ind1+2];
           C13 += A1 * B3;
           C23 += A2 * B3;

        }
        ind1 = row1 * m + j2_b;
	ind2 = row2 * m + j2_b;
        C[ind1] += C11;
        C[ind1+1] += C12;
        C[ind1+2] += C13;

	C[ind2] += C21;
	C[ind2+1] += C22;
	C[ind2+2] += C23;

	break;

      case 2:
        C11=C12=0.0;
	C21=C22=0.0;
        for (j=k1; j < j2_a; j++ ) {
	   ind1 = j*m + j2_b;
           A1 = A[row1*m+j];
           A2 = A[row2*m+j];

	   dummy = B[ind1 + m];
           B1 = B[ind1];
           B2 = B[ind1+1];
           C11 += A1 * B1;
           C12 += A1 * B2;
           C21 += A2 * B1;
           C22 += A2 * B2;

        }
        ind1 = row1 * m + j2_b;
	ind2 = row2 * m + j2_b;
        C[ind1] += C11;
        C[ind1+1] += C12;

	C[ind2] += C21;
	C[ind2+1] += C22;

	break;

      case 1:
        C11=0.0;
	C21=0.0;
        for (j=k1; j < j2_a; j++ ) 
	{
	   ind1 = j*m + j2_b;
           A1 = A[row1*m+j];
           A2 = A[row2*m+j];

	   dummy = B[ind1 + m];
           B1 = B[ind1];
           C11 += A1 * B1;
           C21 += A2 * B1;

        }
        ind1 = row1 * m + j2_b;
	ind2 = row2 * m + j2_b;
        C[ind1] += C11;

	C[ind2] += C21;

	break;

      default :
	exit(1);
    }
  } /* next pair of rows */
  if (row_rem) 
  {
     for (col1=j1; col1 < j2_b ; col1 += 8 ) {
        C11=C12=C13=C14=C15=C16=C17=C18=0.0;
        for (j=k1; j < j2_a; j++ ) {
	   ind1 = j*m + col1;
           A1 = A[i2*m+j];

	   dummy = B[ind1 + m];
           B1 = B[ind1];
           B2 = B[ind1+1];
           C11 += A1 * B1;
           C12 += A1 * B2;

           B3 = B[ind1+2];
           B4 = B[ind1+3];
           C13 += A1 * B3;
           C14 += A1 * B4;

           B5 = B[ind1+4];
           B6 = B[ind1+5];
           C15 += A1 * B5;
           C16 += A1 * B6;

           B7 = B[ind1+6];
           B8 = B[ind1+7];
           C17 += A1 * B7;
           C18 += A1 * B8;
        }
        ind1 = i2 * m + col1;
        C[ind1] += C11;
        C[ind1+1] += C12;
        C[ind1+2] += C13;
        C[ind1+3] += C14;
        C[ind1+4] += C15;
        C[ind1+5] += C16;
        C[ind1+6] += C17;
        C[ind1+7] += C18;

    } /* next block of 8 cols */
    switch (col_rem) 
    {
      case 0:
	break;

      case 7:
        C11=C12=C13=C14=C15=C16=C17=0.0;
        for (j=k1; j < j2_a; j++ ) {
	   ind1 = j*m + j2_b;
           A1 = A[i2*m+j];

	   dummy = B[ind1 + m];
           B1 = B[ind1];
           B2 = B[ind1+1];
           C11 += A1 * B1;
           C12 += A1 * B2;

           B3 = B[ind1+2];
           B4 = B[ind1+3];
           C13 += A1 * B3;
           C14 += A1 * B4;

           B5 = B[ind1+4];
           B6 = B[ind1+5];
           C15 += A1 * B5;
           C16 += A1 * B6;

           B7 = B[ind1+6];
           C17 += A1 * B7;
        }
        ind1 = i2 * m + j2_b;
        C[ind1] += C11;
        C[ind1+1] += C12;
        C[ind1+2] += C13;
        C[ind1+3] += C14;
        C[ind1+4] += C15;
        C[ind1+5] += C16;
        C[ind1+6] += C17;

	break;

      case 6:
        C11=C12=C13=C14=C15=C16=0.0;
        for (j=k1; j < j2_a; j++ ) {
	   ind1 = j*m + j2_b;
           A1 = A[i2*m+j];

	   dummy = B[ind1 + m];
           B1 = B[ind1];
           B2 = B[ind1+1];
           C11 += A1 * B1;
           C12 += A1 * B2;

           B3 = B[ind1+2];
           B4 = B[ind1+3];
           C13 += A1 * B3;
           C14 += A1 * B4;

           B5 = B[ind1+4];
           B6 = B[ind1+5];
           C15 += A1 * B5;
           C16 += A1 * B6;

        }
        ind1 = i2 * m + j2_b;
        C[ind1] += C11;
        C[ind1+1] += C12;
        C[ind1+2] += C13;
        C[ind1+3] += C14;
        C[ind1+4] += C15;
        C[ind1+5] += C16;

	break;

      case 5:
        C11=C12=C13=C14=C15=0.0;
        for (j=k1; j < j2_a ;j++ ) {
	   ind1 = j*m + j2_b;
           A1 = A[i2*m+j];

	   dummy = B[ind1 + m];
           B1 = B[ind1];
           B2 = B[ind1+1];
           C11 += A1 * B1;
           C12 += A1 * B2;

           B3 = B[ind1+2];
           B4 = B[ind1+3];
           C13 += A1 * B3;
           C14 += A1 * B4;

           B5 = B[ind1+4];
           C15 += A1 * B5;

        }
        ind1 = i2 * m + j2_b;
        C[ind1] += C11;
        C[ind1+1] += C12;
        C[ind1+2] += C13;
        C[ind1+3] += C14;
        C[ind1+4] += C15;

	break;

      case 4:
        C11=C12=C13=C14=0.0;
        for (j=k1; j < j2_a ;j++ ) {
	   ind1 = j*m + j2_b;
           A1 = A[i2*m+j];

	   dummy = B[ind1 + m];
           B1 = B[ind1];
           B2 = B[ind1+1];
           C11 += A1 * B1;
           C12 += A1 * B2;

           B3 = B[ind1+2];
           B4 = B[ind1+3];
           C13 += A1 * B3;
           C14 += A1 * B4;

        }
        ind1 = i2 * m + j2_b;
        C[ind1] += C11;
        C[ind1+1] += C12;
        C[ind1+2] += C13;
        C[ind1+3] += C14;

	break;

      case 3:
        C11=C12=C13=0.0;
        for (j=k1; j < j2_a ;j++ ) {
	   ind1 = j*m + j2_b;
           A1 = A[i2*m+j];

	   dummy = B[ind1 + m];
           B1 = B[ind1];
           B2 = B[ind1+1];
           C11 += A1 * B1;
           C12 += A1 * B2;

           B3 = B[ind1+2];
           C13 += A1 * B3;

        }
        ind1 = i2 * m + j2_b;
        C[ind1] += C11;
        C[ind1+1] += C12;
        C[ind1+2] += C13;

	break;

      case 2:
        C11=C12=0.0;
        for (j=k1; j < j2_a; j++ ) {
	   ind1 = j*m + j2_b;
           A1 = A[i2*m+j];

	   dummy = B[ind1 + m];
           B1 = B[ind1];
           B2 = B[ind1+1];
           C11 += A1 * B1;
           C12 += A1 * B2;

        }
        ind1 = i2 * m + j2_b;
        C[ind1] += C11;
        C[ind1+1] += C12;

	break;

      case 1:
        C11=0.0;
        for (j=k1; j < j2_a; j++ ) 
	{
	   ind1 = j*m + j2_b;
           A1 = A[i2*m+j];

	   dummy = B[ind1 + m];
           B1 = B[ind1];
           C11 += A1 * B1;

        }
        ind1 = i2 * m + j2_b;
        C[ind1] += C11;

	break;

      default:
	exit(1);
    }
  } /* next pair of rows */
      }
    }
  }
} /* end mult_mfmf_mf() */