/*
**
** PHiPAC Matrix-Matrix Code for the operation:
**    C = A*B + C
**
** Automatically Generated by mm_gen ($Revision: 1.29 $) using the command:
**    /db/bilmes/phipac/mm_gen/mm_gen -cb 1 1 10 -cb 200 100 40 -prec doub -file mm.c 
**
** Run '/db/bilmes/phipac/mm_gen/mm_gen -help' for help.
**
** Generated on: Monday January 29 1996, 17:27:02 PST
** Created by: Jeff Bilmes <bilmes@cs.berkeley.edu>
**             http://www.icsi.berkeley.edu/~bilmes/phipac
**
**
** Usage:
**    mul_mdmd_md(const int M, const int K, const int N, const double *const A, const double *const B, double *const C, const int Astride, const int Bstride, const int Cstride)
** where
**  A is an MxK matrix
**  B is an KxN matrix
**  C is an MxN matrix
**  Astride is the number of entries between the start of each row of A
**  Bstride is the number of entries between the start of each row of B
**  Cstride is the number of entries between the start of each row of C
**
**
** "Copyright (c) 1995 The Regents of the University of California.  All
** rights reserved."  Permission to use, copy, modify, and distribute
** this software and its documentation for any purpose, without fee, and
** without written agreement is hereby granted, provided that the above
** copyright notice and the following two paragraphs appear in all copies
** of this software.
**
** IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR
** DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT
** OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF THE UNIVERSITY OF
** CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**
** THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES,
** INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
** AND FITNESS FOR A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS
** ON AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATION TO
** PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
**
*/

#define LOAD1x1(c00,C,Cstride) \
{\
   double * _cp = C; \
   c00 = _cp[0]; \
}

#define STORE1x1(c00,C,Cstride) \
{\
   double *_cp = C; \
   _cp[0] = c00; \
}

/* Fixed M,K,N = 1,1,1 fully-unrolled matrix matrix multiply. */
#define mul_md1x1md1x1_md1x1(c00,A0,B,Bstride) \
{ \
   register double _b0; \
   register double _a0; \
   \
   \
   _b0 = B[0]; \
   B += Bstride; \
   _a0 = A0[0]; \
   c00 += _a0*_b0; \
}


#define LOAD1x2(c00,c01,C,Cstride) \
{\
   double * _cp = C; \
   c00 = _cp[0]; c01 = _cp[1]; \
}

#define STORE1x2(c00,c01,C,Cstride) \
{\
   double *_cp = C; \
   _cp[0] = c00; _cp[1] = c01; \
}

/* Fixed M,K,N = 1,1,2 fully-unrolled matrix matrix multiply. */
#define mul_md1x1md1x2_md1x2(c00,c01,A0,B,Bstride) \
{ \
   register double _b0,_b1; \
   register double _a0; \
   \
   \
   _b0 = B[0]; _b1 = B[1]; \
   B += Bstride; \
   _a0 = A0[0]; \
   c00 += _a0*_b0; c01 += _a0*_b1; \
}


#define LOAD1x4(c00,c01,c02,c03,C,Cstride) \
{\
   double * _cp = C; \
   c00 = _cp[0]; c01 = _cp[1]; c02 = _cp[2]; c03 = _cp[3]; \
}

#define STORE1x4(c00,c01,c02,c03,C,Cstride) \
{\
   double *_cp = C; \
   _cp[0] = c00; _cp[1] = c01; _cp[2] = c02; _cp[3] = c03; \
}

/* Fixed M,K,N = 1,1,4 fully-unrolled matrix matrix multiply. */
#define mul_md1x1md1x4_md1x4(c00,c01,c02,c03,A0,B,Bstride) \
{ \
   register double _b0,_b1,_b2,_b3; \
   register double _a0; \
   \
   \
   _b0 = B[0]; _b1 = B[1]; _b2 = B[2]; _b3 = B[3]; \
   B += Bstride; \
   _a0 = A0[0]; \
   c00 += _a0*_b0; c01 += _a0*_b1; c02 += _a0*_b2; c03 += _a0*_b3; \
}


#define LOAD1x8(c00,c01,c02,c03,c04,c05,c06,c07,C,Cstride) \
{\
   double * _cp = C; \
   c00 = _cp[0]; c01 = _cp[1]; c02 = _cp[2]; c03 = _cp[3]; c04 = _cp[4]; c05 = _cp[5]; c06 = _cp[6]; c07 = _cp[7]; \
}

#define STORE1x8(c00,c01,c02,c03,c04,c05,c06,c07,C,Cstride) \
{\
   double *_cp = C; \
   _cp[0] = c00; _cp[1] = c01; _cp[2] = c02; _cp[3] = c03; _cp[4] = c04; _cp[5] = c05; _cp[6] = c06; _cp[7] = c07; \
}

/* Fixed M,K,N = 1,1,8 fully-unrolled matrix matrix multiply. */
#define mul_md1x1md1x8_md1x8(c00,c01,c02,c03,c04,c05,c06,c07,A0,B,Bstride) \
{ \
   register double _b0,_b1,_b2,_b3,_b4,_b5,_b6,_b7; \
   register double _a0; \
   \
   \
   _b0 = B[0]; _b1 = B[1]; _b2 = B[2]; _b3 = B[3]; _b4 = B[4]; _b5 = B[5]; _b6 = B[6]; _b7 = B[7]; \
   B += Bstride; \
   _a0 = A0[0]; \
   c00 += _a0*_b0; c01 += _a0*_b1; c02 += _a0*_b2; c03 += _a0*_b3; c04 += _a0*_b4; c05 += _a0*_b5; c06 += _a0*_b6; c07 += _a0*_b7; \
}


#define LOAD1x10(c00,c01,c02,c03,c04,c05,c06,c07,c08,c09,C,Cstride) \
{\
   double * _cp = C; \
   c00 = _cp[0]; c01 = _cp[1]; c02 = _cp[2]; c03 = _cp[3]; c04 = _cp[4]; c05 = _cp[5]; c06 = _cp[6]; c07 = _cp[7]; c08 = _cp[8]; c09 = _cp[9]; \
}

#define STORE1x10(c00,c01,c02,c03,c04,c05,c06,c07,c08,c09,C,Cstride) \
{\
   double *_cp = C; \
   _cp[0] = c00; _cp[1] = c01; _cp[2] = c02; _cp[3] = c03; _cp[4] = c04; _cp[5] = c05; _cp[6] = c06; _cp[7] = c07; _cp[8] = c08; _cp[9] = c09; \
}

/* Fixed M,K,N = 1,1,10 fully-unrolled matrix matrix multiply. */
#define mul_md1x1md1x10_md1x10(c00,c01,c02,c03,c04,c05,c06,c07,c08,c09,A0,B,Bstride) \
{ \
   register double _b0,_b1,_b2,_b3,_b4,_b5,_b6,_b7,_b8,_b9; \
   register double _a0; \
   \
   \
   _b0 = B[0]; _b1 = B[1]; _b2 = B[2]; _b3 = B[3]; _b4 = B[4]; _b5 = B[5]; _b6 = B[6]; _b7 = B[7]; _b8 = B[8]; _b9 = B[9]; \
   B += Bstride; \
   _a0 = A0[0]; \
   c00 += _a0*_b0; c01 += _a0*_b1; c02 += _a0*_b2; c03 += _a0*_b3; c04 += _a0*_b4; c05 += _a0*_b5; c06 += _a0*_b6; c07 += _a0*_b7; c08 += _a0*_b8; c09 += _a0*_b9; \
}


/* Fixed M,N = 200,400, Arbitrary K L0-blocked matrix matrix multiply. */
static void
mul_mdmd_md_l1_arb_k(int K, const double *const A, const double *const B, double *const C, const int Astride, const int Bstride, const int Cstride)
{
   const double *a0,*b0;
   double *c0;
   const double *ap0_0;
   const double *bp0;
   double *cp0;
   const int A_sbs_stride = Astride*1;
   const int C_sbs_stride = Cstride*1;
   const int k_marg_el = K & 0;
   const int k_norm = K - k_marg_el;
   double *const c0_endp = C+200*Cstride;
   register double c00,c01,c02,c03,c04,c05,c06,c07,c08,c09;
   for (c0=C,a0=A; c0!= c0_endp; c0+=C_sbs_stride,a0+=A_sbs_stride) {
      const double* const ap0_endp = a0 + k_norm;
      double* const cp0_endp = c0 + 400;
      for (b0=B,cp0=c0; cp0!=cp0_endp; b0+=10,cp0+=10) {
         ap0_0 = a0;
         bp0=b0;
         LOAD1x10(c00,c01,c02,c03,c04,c05,c06,c07,c08,c09,cp0,Cstride);
         for (; ap0_0!=ap0_endp; ap0_0+=1) {
            mul_md1x1md1x10_md1x10(c00,c01,c02,c03,c04,c05,c06,c07,c08,c09,ap0_0,bp0,Bstride);
         }
         STORE1x10(c00,c01,c02,c03,c04,c05,c06,c07,c08,c09,cp0,Cstride);
      }
   }
}

/* Arbitrary M,K,N L0-blocked matrix matrix multiply. */
static void
mul_mdmd_md_l1_arb_all(const int M, const int K, const int N, const double *const A, const double *const B, double *const C, const int Astride, const int Bstride, const int Cstride)
{
   const double *a0,*b0;
   double *c0;
   const double *ap0_0;
   const double *bp0;
   double *cp0;
   const int A_sbs_stride = Astride*1;
   const int C_sbs_stride = Cstride*1;
   const int k_marg_el = K & 0;
   const int k_norm = K - k_marg_el;
   const int m_marg_el = M & 0;
   const int m_norm = M - m_marg_el;
   const int n_marg_el = N % 10;
   const int n_norm = N - n_marg_el;
   double *const c0_endp = C+m_norm*Cstride;
   register double c00,c01,c02,c03,c04,c05,c06,c07,c08,c09;
   for (c0=C,a0=A; c0!= c0_endp; c0+=C_sbs_stride,a0+=A_sbs_stride) {
      const double* const ap0_endp = a0 + k_norm;
      double* const cp0_endp = c0 + n_norm;
      for (b0=B,cp0=c0; cp0!=cp0_endp; b0+=10,cp0+=10) {
         ap0_0 = a0;
         bp0=b0;
         LOAD1x10(c00,c01,c02,c03,c04,c05,c06,c07,c08,c09,cp0,Cstride);
         for (; ap0_0!=ap0_endp; ap0_0+=1) {
            mul_md1x1md1x10_md1x10(c00,c01,c02,c03,c04,c05,c06,c07,c08,c09,ap0_0,bp0,Bstride);
         }
         STORE1x10(c00,c01,c02,c03,c04,c05,c06,c07,c08,c09,cp0,Cstride);
      }
   }
   for (c0=C,a0=A; c0!= c0_endp; c0+=C_sbs_stride,a0+=A_sbs_stride) {
      const double* const ap0_endp = a0 + k_norm;
      b0 = B+n_norm;
      cp0 = c0+n_norm;
      if (n_marg_el & 0x8) {
         ap0_0 = a0;
         bp0=b0;
         LOAD1x8(c00,c01,c02,c03,c04,c05,c06,c07,cp0,Cstride);
         for (; ap0_0!=ap0_endp; ap0_0+=1) {
            mul_md1x1md1x8_md1x8(c00,c01,c02,c03,c04,c05,c06,c07,ap0_0,bp0,Bstride);
         }
         STORE1x8(c00,c01,c02,c03,c04,c05,c06,c07,cp0,Cstride);
         b0 += 8;
         cp0 += 8;
      }
      if (n_marg_el & 0x4) {
         ap0_0 = a0;
         bp0=b0;
         LOAD1x4(c00,c01,c02,c03,cp0,Cstride);
         for (; ap0_0!=ap0_endp; ap0_0+=1) {
            mul_md1x1md1x4_md1x4(c00,c01,c02,c03,ap0_0,bp0,Bstride);
         }
         STORE1x4(c00,c01,c02,c03,cp0,Cstride);
         b0 += 4;
         cp0 += 4;
      }
      if (n_marg_el & 0x2) {
         ap0_0 = a0;
         bp0=b0;
         LOAD1x2(c00,c01,cp0,Cstride);
         for (; ap0_0!=ap0_endp; ap0_0+=1) {
            mul_md1x1md1x2_md1x2(c00,c01,ap0_0,bp0,Bstride);
         }
         STORE1x2(c00,c01,cp0,Cstride);
         b0 += 2;
         cp0 += 2;
      }
      if (n_marg_el & 0x1) {
         ap0_0 = a0;
         bp0=b0;
         LOAD1x1(c00,cp0,Cstride);
         for (; ap0_0!=ap0_endp; ap0_0+=1) {
            mul_md1x1md1x1_md1x1(c00,ap0_0,bp0,Bstride);
         }
         STORE1x1(c00,cp0,Cstride);
      }
   }
}

/* Fixed M,K,N = 200,100,400 L0-blocked matrix matrix multiply. */
static void
mul_mdmd_md_l1(const double *const A, const double *const B, double *const C, const int Astride, const int Bstride, const int Cstride)
{
   const double *a0,*b0;
   double *c0;
   const double *ap0_0;
   const double *bp0;
   double *cp0;
   const int A_sbs_stride = Astride*1;
   const int C_sbs_stride = Cstride*1;
   double *const c0_endp = C+200*Cstride;
   register double c00,c01,c02,c03,c04,c05,c06,c07,c08,c09;
   for (c0=C,a0=A; c0!= c0_endp; c0+=C_sbs_stride,a0+=A_sbs_stride) {
      const double* const ap0_endp = a0 + 100;
      double* const cp0_endp = c0 + 400;
      for (b0=B,cp0=c0; cp0!=cp0_endp; b0+=10,cp0+=10) {
         ap0_0 = a0;
         bp0=b0;
         LOAD1x10(c00,c01,c02,c03,c04,c05,c06,c07,c08,c09,cp0,Cstride);
         for (; ap0_0!=ap0_endp; ap0_0+=1) {
            mul_md1x1md1x10_md1x10(c00,c01,c02,c03,c04,c05,c06,c07,c08,c09,ap0_0,bp0,Bstride);
         }
         STORE1x10(c00,c01,c02,c03,c04,c05,c06,c07,c08,c09,cp0,Cstride);
      }
   }
}

void
mul_mdmd_md(const int M, const int K, const int N, const double *const A, const double *const B, double *const C, const int Astride, const int Bstride, const int Cstride)
{
   /* Code for L1-blocked routine. */
   int m2,k2,n2;
   const double *a2,*b2;
   double *c2;
   const double *ap2,*bp2;
   double *cp2;
   if (M < 201 && K < 101 && N < 401) {
      mul_mdmd_md_l1_arb_all(M,K,N,A,B,C,Astride,Bstride,Cstride);
      return;
   }
   for (m2=0; m2<=M-200; m2+=200) {
      c2 = C + m2*Cstride;
      a2 = A + m2*Astride;
      for (n2=0,b2=B,cp2=c2; n2<=N-400; n2+=400,b2+=400,cp2+=400) {
         for (k2=0,bp2=b2,ap2=a2; k2<=K-100; k2+=100,bp2+=100*Bstride,ap2+=100) {
            mul_mdmd_md_l1(ap2,bp2,cp2,Astride,Bstride,Cstride);
         }
         if (k2 < K) {
            mul_mdmd_md_l1_arb_k(K-k2,ap2,bp2,cp2,Astride,Bstride,Cstride);
         }
      }
      if (n2 < N) {
         for (k2=0,bp2=b2,ap2=a2; k2<=K-100; k2+=100,bp2+=100*Bstride,ap2+=100) {
            mul_mdmd_md_l1_arb_all(200,100,N-n2,ap2,bp2,cp2,Astride,Bstride,Cstride);
         }
         if (k2 < K) {
            mul_mdmd_md_l1_arb_all(200,K-k2,N-n2,ap2,bp2,cp2,Astride,Bstride,Cstride);
         }
      }
   }
   if (m2 < M) {
      c2 = C + m2*Cstride;
      a2 = A + m2*Astride;
      for (n2=0,b2=B,cp2=c2; n2<=N-400; n2+=400,b2+=400,cp2+=400) {
         for (k2=0,bp2=b2,ap2=a2; k2<=K-100; k2+=100,bp2+=100*Bstride,ap2+=100) {
            mul_mdmd_md_l1_arb_all(M-m2,100,400,ap2,bp2,cp2,Astride,Bstride,Cstride);
         }
         if (k2 < K) {
            mul_mdmd_md_l1_arb_all(M-m2,K-k2,400,ap2,bp2,cp2,Astride,Bstride,Cstride);
         }
      }
      if (n2 < N) {
         for (k2=0,bp2=b2,ap2=a2; k2<=K-100; k2+=100,bp2+=100*Bstride,ap2+=100) {
            mul_mdmd_md_l1_arb_all(M-m2,100,N-n2,ap2,bp2,cp2,Astride,Bstride,Cstride);
         }
         if (k2 < K) {
            mul_mdmd_md_l1_arb_all(M-m2,K-k2,N-n2,ap2,bp2,cp2,Astride,Bstride,Cstride);
         }
      }
   }
}