#define BLOCK 64


void
	mm_inner(int, const double *, const double *, double *);




void
mm_arbit7(int X, int Y, int Z, const double *A, const double *B, double *C,
    int Skip)
{
int
  ROW,
  i,
  j,
  k,
  stop;

#define ICHUNK 8

stop= Z/ICHUNK;
stop= stop * ICHUNK;

for(ROW= 0; ROW < X; ROW++)
 	{
	register double
		c0,
		c1,
		c2,
		c3,
		c4,
		c5,
		c6,
		c7;

	for(i= 0; i < stop; i+= ICHUNK)
		{
		/* c0= c1= c2= c3= c4= c5= c6= c7= 0; */

		c0= C[ROW * Skip + i + 0];
		c1= C[ROW * Skip + i + 1];
		c2= C[ROW * Skip + i + 2];
		c3= C[ROW * Skip + i + 3]; 

		c4= C[ROW * Skip + i + 4]; 
		c5= C[ROW * Skip + i + 5]; 
		c6= C[ROW * Skip + i + 6]; 
		c7= C[ROW * Skip + i + 7]; 

		for(j= 0; j < Y; j++)
			{
			register double
				A_row_j= A[ROW * Skip + j];

			c0+= A_row_j * B[j * Skip + i];
		   	c1+= A_row_j * B[j * Skip + i+1];
		   	c2+= A_row_j * B[j * Skip + i+2];
		   	c3+= A_row_j * B[j * Skip + i+3]; 

		   	c4+= A_row_j * B[j * Skip + i+4]; 
		   	c5+= A_row_j * B[j * Skip + i+5]; 
		   	c6+= A_row_j * B[j * Skip + i+6]; 
		   	c7+= A_row_j * B[j * Skip + i+7]; 
			}
		
		C[ROW * Skip + i + 0]= c0;
		C[ROW * Skip + i + 1]= c1;
		C[ROW * Skip + i + 2]= c2;
		C[ROW * Skip + i + 3]= c3; 

		C[ROW * Skip + i + 4]= c4; 
		C[ROW * Skip + i + 5]= c5; 
		C[ROW * Skip + i + 6]= c6; 
		C[ROW * Skip + i + 7]= c7; 
		}

/**************************************************************/
		c0= C[ROW * Skip + i + 0];
		c1= C[ROW * Skip + i + 1];
		c2= C[ROW * Skip + i + 2];
		c3= C[ROW * Skip + i + 3]; 

		c4= C[ROW * Skip + i + 4]; 
		c5= C[ROW * Skip + i + 5]; 
		c6= C[ROW * Skip + i + 6]; 

		for(j= 0; j < Y; j++)
			{
			register double
				A_row_j= A[ROW * Skip + j];

			c0+= A_row_j * B[j * Skip + i];
		   	c1+= A_row_j * B[j * Skip + i+1];
		   	c2+= A_row_j * B[j * Skip + i+2];
		   	c3+= A_row_j * B[j * Skip + i+3]; 

		   	c4+= A_row_j * B[j * Skip + i+4]; 
		   	c5+= A_row_j * B[j * Skip + i+5]; 
		   	c6+= A_row_j * B[j * Skip + i+6]; 
			}
		
		C[ROW * Skip + i + 0]= c0;
		C[ROW * Skip + i + 1]= c1;
		C[ROW * Skip + i + 2]= c2;
		C[ROW * Skip + i + 3]= c3; 

		C[ROW * Skip + i + 4]= c4; 
		C[ROW * Skip + i + 5]= c5; 
		C[ROW * Skip + i + 6]= c6; 
/****************************************************/

	}
}

void
mm_arbit6(int X, int Y, int Z, const double *A, const double *B, double *C,
    int Skip)
{
int
  ROW,
  i,
  j,
  k,
  stop;

#define ICHUNK 8

stop= Z/ICHUNK;
stop= stop * ICHUNK;

for(ROW= 0; ROW < X; ROW++)
 	{
	register double
		c0,
		c1,
		c2,
		c3,
		c4,
		c5,
		c6,
		c7;

	for(i= 0; i < stop; i+= ICHUNK)
		{
		/* c0= c1= c2= c3= c4= c5= c6= c7= 0; */

		c0= C[ROW * Skip + i + 0];
		c1= C[ROW * Skip + i + 1];
		c2= C[ROW * Skip + i + 2];
		c3= C[ROW * Skip + i + 3]; 

		c4= C[ROW * Skip + i + 4]; 
		c5= C[ROW * Skip + i + 5]; 
		c6= C[ROW * Skip + i + 6]; 
		c7= C[ROW * Skip + i + 7]; 

		for(j= 0; j < Y; j++)
			{
			register double
				A_row_j= A[ROW * Skip + j];

			c0+= A_row_j * B[j * Skip + i];
		   	c1+= A_row_j * B[j * Skip + i+1];
		   	c2+= A_row_j * B[j * Skip + i+2];
		   	c3+= A_row_j * B[j * Skip + i+3]; 

		   	c4+= A_row_j * B[j * Skip + i+4]; 
		   	c5+= A_row_j * B[j * Skip + i+5]; 
		   	c6+= A_row_j * B[j * Skip + i+6]; 
		   	c7+= A_row_j * B[j * Skip + i+7]; 
			}
		
		C[ROW * Skip + i + 0]= c0;
		C[ROW * Skip + i + 1]= c1;
		C[ROW * Skip + i + 2]= c2;
		C[ROW * Skip + i + 3]= c3; 

		C[ROW * Skip + i + 4]= c4; 
		C[ROW * Skip + i + 5]= c5; 
		C[ROW * Skip + i + 6]= c6; 
		C[ROW * Skip + i + 7]= c7; 
		}

/**************************************************************/
		c0= C[ROW * Skip + i + 0];
		c1= C[ROW * Skip + i + 1];
		c2= C[ROW * Skip + i + 2];
		c3= C[ROW * Skip + i + 3]; 

		c4= C[ROW * Skip + i + 4]; 
		c5= C[ROW * Skip + i + 5]; 

		for(j= 0; j < Y; j++)
			{
			register double
				A_row_j= A[ROW * Skip + j];

			c0+= A_row_j * B[j * Skip + i];
		   	c1+= A_row_j * B[j * Skip + i+1];
		   	c2+= A_row_j * B[j * Skip + i+2];
		   	c3+= A_row_j * B[j * Skip + i+3]; 

		   	c4+= A_row_j * B[j * Skip + i+4]; 
		   	c5+= A_row_j * B[j * Skip + i+5]; 
			}
		
		C[ROW * Skip + i + 0]= c0;
		C[ROW * Skip + i + 1]= c1;
		C[ROW * Skip + i + 2]= c2;
		C[ROW * Skip + i + 3]= c3; 

		C[ROW * Skip + i + 4]= c4; 
		C[ROW * Skip + i + 5]= c5; 
/****************************************************/

	}
}

void
mm_arbit5(int X, int Y, int Z, const double *A, const double *B, double *C,
    int Skip)
{
int
  ROW,
  i,
  j,
  k,
  stop;

#define ICHUNK 8

stop= Z/ICHUNK;
stop= stop * ICHUNK;

for(ROW= 0; ROW < X; ROW++)
 	{
	register double
		c0,
		c1,
		c2,
		c3,
		c4,
		c5,
		c6,
		c7;

	for(i= 0; i < stop; i+= ICHUNK)
		{
		/* c0= c1= c2= c3= c4= c5= c6= c7= 0; */

		c0= C[ROW * Skip + i + 0];
		c1= C[ROW * Skip + i + 1];
		c2= C[ROW * Skip + i + 2];
		c3= C[ROW * Skip + i + 3]; 

		c4= C[ROW * Skip + i + 4]; 
		c5= C[ROW * Skip + i + 5]; 
		c6= C[ROW * Skip + i + 6]; 
		c7= C[ROW * Skip + i + 7]; 

		for(j= 0; j < Y; j++)
			{
			register double
				A_row_j= A[ROW * Skip + j];

			c0+= A_row_j * B[j * Skip + i];
		   	c1+= A_row_j * B[j * Skip + i+1];
		   	c2+= A_row_j * B[j * Skip + i+2];
		   	c3+= A_row_j * B[j * Skip + i+3]; 

		   	c4+= A_row_j * B[j * Skip + i+4]; 
		   	c5+= A_row_j * B[j * Skip + i+5]; 
		   	c6+= A_row_j * B[j * Skip + i+6]; 
		   	c7+= A_row_j * B[j * Skip + i+7]; 
			}
		
		C[ROW * Skip + i + 0]= c0;
		C[ROW * Skip + i + 1]= c1;
		C[ROW * Skip + i + 2]= c2;
		C[ROW * Skip + i + 3]= c3; 

		C[ROW * Skip + i + 4]= c4; 
		C[ROW * Skip + i + 5]= c5; 
		C[ROW * Skip + i + 6]= c6; 
		C[ROW * Skip + i + 7]= c7; 
		}

/**************************************************************/
		c0= C[ROW * Skip + i + 0];
		c1= C[ROW * Skip + i + 1];
		c2= C[ROW * Skip + i + 2];
		c3= C[ROW * Skip + i + 3]; 

		c4= C[ROW * Skip + i + 4]; 

		for(j= 0; j < Y; j++)
			{
			register double
				A_row_j= A[ROW * Skip + j];

			c0+= A_row_j * B[j * Skip + i];
		   	c1+= A_row_j * B[j * Skip + i+1];
		   	c2+= A_row_j * B[j * Skip + i+2];
		   	c3+= A_row_j * B[j * Skip + i+3]; 

		   	c4+= A_row_j * B[j * Skip + i+4]; 
			}
		
		C[ROW * Skip + i + 0]= c0;
		C[ROW * Skip + i + 1]= c1;
		C[ROW * Skip + i + 2]= c2;
		C[ROW * Skip + i + 3]= c3; 

		C[ROW * Skip + i + 4]= c4; 
/****************************************************/

	}
}

void
mm_arbit4(int X, int Y, int Z, const double *A, const double *B, double *C,
    int Skip)
{
int
  ROW,
  i,
  j,
  k,
  stop;

#define ICHUNK 8

stop= Z/ICHUNK;
stop= stop * ICHUNK;

for(ROW= 0; ROW < X; ROW++)
 	{
	register double
		c0,
		c1,
		c2,
		c3,
		c4,
		c5,
		c6,
		c7;

	for(i= 0; i < stop; i+= ICHUNK)
		{
		/* c0= c1= c2= c3= c4= c5= c6= c7= 0; */

		c0= C[ROW * Skip + i + 0];
		c1= C[ROW * Skip + i + 1];
		c2= C[ROW * Skip + i + 2];
		c3= C[ROW * Skip + i + 3]; 

		c4= C[ROW * Skip + i + 4]; 
		c5= C[ROW * Skip + i + 5]; 
		c6= C[ROW * Skip + i + 6]; 
		c7= C[ROW * Skip + i + 7]; 

		for(j= 0; j < Y; j++)
			{
			register double
				A_row_j= A[ROW * Skip + j];

			c0+= A_row_j * B[j * Skip + i];
		   	c1+= A_row_j * B[j * Skip + i+1];
		   	c2+= A_row_j * B[j * Skip + i+2];
		   	c3+= A_row_j * B[j * Skip + i+3]; 

		   	c4+= A_row_j * B[j * Skip + i+4]; 
		   	c5+= A_row_j * B[j * Skip + i+5]; 
		   	c6+= A_row_j * B[j * Skip + i+6]; 
		   	c7+= A_row_j * B[j * Skip + i+7]; 
			}
		
		C[ROW * Skip + i + 0]= c0;
		C[ROW * Skip + i + 1]= c1;
		C[ROW * Skip + i + 2]= c2;
		C[ROW * Skip + i + 3]= c3; 

		C[ROW * Skip + i + 4]= c4; 
		C[ROW * Skip + i + 5]= c5; 
		C[ROW * Skip + i + 6]= c6; 
		C[ROW * Skip + i + 7]= c7; 
		}

/**************************************************************/
		c0= C[ROW * Skip + i + 0];
		c1= C[ROW * Skip + i + 1];
		c2= C[ROW * Skip + i + 2];
		c3= C[ROW * Skip + i + 3]; 

		for(j= 0; j < Y; j++)
			{
			register double
				A_row_j= A[ROW * Skip + j];

			c0+= A_row_j * B[j * Skip + i];
		   	c1+= A_row_j * B[j * Skip + i+1];
		   	c2+= A_row_j * B[j * Skip + i+2];
		   	c3+= A_row_j * B[j * Skip + i+3]; 
			}
		
		C[ROW * Skip + i + 0]= c0;
		C[ROW * Skip + i + 1]= c1;
		C[ROW * Skip + i + 2]= c2;
		C[ROW * Skip + i + 3]= c3; 
/****************************************************/

	}
}

void
mm_arbit3(int X, int Y, int Z, const double *A, const double *B, double *C,
    int Skip)
{
int
  ROW,
  i,
  j,
  k,
  stop;

#define ICHUNK 8

stop= Z/ICHUNK;
stop= stop * ICHUNK;

for(ROW= 0; ROW < X; ROW++)
 	{
	register double
		c0,
		c1,
		c2,
		c3,
		c4,
		c5,
		c6,
		c7;

	for(i= 0; i < stop; i+= ICHUNK)
		{
		/* c0= c1= c2= c3= c4= c5= c6= c7= 0; */

		c0= C[ROW * Skip + i + 0];
		c1= C[ROW * Skip + i + 1];
		c2= C[ROW * Skip + i + 2];
		c3= C[ROW * Skip + i + 3]; 

		c4= C[ROW * Skip + i + 4]; 
		c5= C[ROW * Skip + i + 5]; 
		c6= C[ROW * Skip + i + 6]; 
		c7= C[ROW * Skip + i + 7]; 

		for(j= 0; j < Y; j++)
			{
			register double
				A_row_j= A[ROW * Skip + j];

			c0+= A_row_j * B[j * Skip + i];
		   	c1+= A_row_j * B[j * Skip + i+1];
		   	c2+= A_row_j * B[j * Skip + i+2];
		   	c3+= A_row_j * B[j * Skip + i+3]; 

		   	c4+= A_row_j * B[j * Skip + i+4]; 
		   	c5+= A_row_j * B[j * Skip + i+5]; 
		   	c6+= A_row_j * B[j * Skip + i+6]; 
		   	c7+= A_row_j * B[j * Skip + i+7]; 
			}
		
		C[ROW * Skip + i + 0]= c0;
		C[ROW * Skip + i + 1]= c1;
		C[ROW * Skip + i + 2]= c2;
		C[ROW * Skip + i + 3]= c3; 

		C[ROW * Skip + i + 4]= c4; 
		C[ROW * Skip + i + 5]= c5; 
		C[ROW * Skip + i + 6]= c6; 
		C[ROW * Skip + i + 7]= c7; 
		}

/**************************************************************/
		c0= C[ROW * Skip + i + 0];
		c1= C[ROW * Skip + i + 1];
		c2= C[ROW * Skip + i + 2];

		for(j= 0; j < Y; j++)
			{
			register double
				A_row_j= A[ROW * Skip + j];

			c0+= A_row_j * B[j * Skip + i];
		   	c1+= A_row_j * B[j * Skip + i+1];
		   	c2+= A_row_j * B[j * Skip + i+2];
			}
		
		C[ROW * Skip + i + 0]= c0;
		C[ROW * Skip + i + 1]= c1;
		C[ROW * Skip + i + 2]= c2;
/****************************************************/

	}
}

void
mm_arbit2(int X, int Y, int Z, const double *A, const double *B, double *C,
    int Skip)
{
int
  ROW,
  i,
  j,
  k,
  stop;

#define ICHUNK 8

stop= Z/ICHUNK;
stop= stop * ICHUNK;

for(ROW= 0; ROW < X; ROW++)
 	{
	register double
		c0,
		c1,
		c2,
		c3,
		c4,
		c5,
		c6,
		c7;

	for(i= 0; i < stop; i+= ICHUNK)
		{
		/* c0= c1= c2= c3= c4= c5= c6= c7= 0; */

		c0= C[ROW * Skip + i + 0];
		c1= C[ROW * Skip + i + 1];
		c2= C[ROW * Skip + i + 2];
		c3= C[ROW * Skip + i + 3]; 

		c4= C[ROW * Skip + i + 4]; 
		c5= C[ROW * Skip + i + 5]; 
		c6= C[ROW * Skip + i + 6]; 
		c7= C[ROW * Skip + i + 7]; 

		for(j= 0; j < Y; j++)
			{
			register double
				A_row_j= A[ROW * Skip + j];

			c0+= A_row_j * B[j * Skip + i];
		   	c1+= A_row_j * B[j * Skip + i+1];
		   	c2+= A_row_j * B[j * Skip + i+2];
		   	c3+= A_row_j * B[j * Skip + i+3]; 

		   	c4+= A_row_j * B[j * Skip + i+4]; 
		   	c5+= A_row_j * B[j * Skip + i+5]; 
		   	c6+= A_row_j * B[j * Skip + i+6]; 
		   	c7+= A_row_j * B[j * Skip + i+7]; 
			}
		
		C[ROW * Skip + i + 0]= c0;
		C[ROW * Skip + i + 1]= c1;
		C[ROW * Skip + i + 2]= c2;
		C[ROW * Skip + i + 3]= c3; 

		C[ROW * Skip + i + 4]= c4; 
		C[ROW * Skip + i + 5]= c5; 
		C[ROW * Skip + i + 6]= c6; 
		C[ROW * Skip + i + 7]= c7; 
		}

/**************************************************************/
		c0= C[ROW * Skip + i + 0];
		c1= C[ROW * Skip + i + 1];

		for(j= 0; j < Y; j++)
			{
			register double
				A_row_j= A[ROW * Skip + j];

			c0+= A_row_j * B[j * Skip + i];
		   	c1+= A_row_j * B[j * Skip + i+1];
			}
		
		C[ROW * Skip + i + 0]= c0;
		C[ROW * Skip + i + 1]= c1;
/****************************************************/

	}
}

void
mm_arbit1(int X, int Y, int Z, const double *A, const double *B, double *C,
    int Skip)
{
int
  ROW,
  i,
  j,
  k,
  stop;

#define ICHUNK 8

stop= Z/ICHUNK;
stop= stop * ICHUNK;

for(ROW= 0; ROW < X; ROW++)
 	{
	register double
		c0,
		c1,
		c2,
		c3,
		c4,
		c5,
		c6,
		c7;

	for(i= 0; i < stop; i+= ICHUNK)
		{
		/* c0= c1= c2= c3= c4= c5= c6= c7= 0; */

		c0= C[ROW * Skip + i + 0];
		c1= C[ROW * Skip + i + 1];
		c2= C[ROW * Skip + i + 2];
		c3= C[ROW * Skip + i + 3]; 

		c4= C[ROW * Skip + i + 4]; 
		c5= C[ROW * Skip + i + 5]; 
		c6= C[ROW * Skip + i + 6]; 
		c7= C[ROW * Skip + i + 7]; 

		for(j= 0; j < Y; j++)
			{
			register double
				A_row_j= A[ROW * Skip + j];

			c0+= A_row_j * B[j * Skip + i];
		   	c1+= A_row_j * B[j * Skip + i+1];
		   	c2+= A_row_j * B[j * Skip + i+2];
		   	c3+= A_row_j * B[j * Skip + i+3]; 

		   	c4+= A_row_j * B[j * Skip + i+4]; 
		   	c5+= A_row_j * B[j * Skip + i+5]; 
		   	c6+= A_row_j * B[j * Skip + i+6]; 
		   	c7+= A_row_j * B[j * Skip + i+7]; 
			}
		
		C[ROW * Skip + i + 0]= c0;
		C[ROW * Skip + i + 1]= c1;
		C[ROW * Skip + i + 2]= c2;
		C[ROW * Skip + i + 3]= c3; 

		C[ROW * Skip + i + 4]= c4; 
		C[ROW * Skip + i + 5]= c5; 
		C[ROW * Skip + i + 6]= c6; 
		C[ROW * Skip + i + 7]= c7; 
		}

/**************************************************************/
		c0= C[ROW * Skip + i + 0];

		for(j= 0; j < Y; j++)
			{
			register double
				A_row_j= A[ROW * Skip + j];

			c0+= A_row_j * B[j * Skip + i];
			}
		
		C[ROW * Skip + i + 0]= c0;
/****************************************************/

	}
}
void
mm_arbit0(int X, int Y, int Z, const double *A, const double *B, double *C,
    int Skip)
{
int
  ROW,
  i,
  j,
  k,
  stop;

#define ICHUNK 8

stop= Z/ICHUNK;
stop= stop * ICHUNK;

for(ROW= 0; ROW < X; ROW++)
 	{
	register double
		c0,
		c1,
		c2,
		c3,
		c4,
		c5,
		c6,
		c7;

	for(i= 0; i < stop; i+= ICHUNK)
		{
		/* c0= c1= c2= c3= c4= c5= c6= c7= 0; */

		c0= C[ROW * Skip + i + 0];
		c1= C[ROW * Skip + i + 1];
		c2= C[ROW * Skip + i + 2];
		c3= C[ROW * Skip + i + 3]; 

		c4= C[ROW * Skip + i + 4]; 
		c5= C[ROW * Skip + i + 5]; 
		c6= C[ROW * Skip + i + 6]; 
		c7= C[ROW * Skip + i + 7]; 

		for(j= 0; j < Y; j++)
			{
			register double
				A_row_j= A[ROW * Skip + j];

			c0+= A_row_j * B[j * Skip + i];
		   	c1+= A_row_j * B[j * Skip + i+1];
		   	c2+= A_row_j * B[j * Skip + i+2];
		   	c3+= A_row_j * B[j * Skip + i+3]; 

		   	c4+= A_row_j * B[j * Skip + i+4]; 
		   	c5+= A_row_j * B[j * Skip + i+5]; 
		   	c6+= A_row_j * B[j * Skip + i+6]; 
		   	c7+= A_row_j * B[j * Skip + i+7]; 
			}
		
		C[ROW * Skip + i + 0]= c0;
		C[ROW * Skip + i + 1]= c1;
		C[ROW * Skip + i + 2]= c2;
		C[ROW * Skip + i + 3]= c3; 

		C[ROW * Skip + i + 4]= c4; 
		C[ROW * Skip + i + 5]= c5; 
		C[ROW * Skip + i + 6]= c6; 
		C[ROW * Skip + i + 7]= c7; 
		}

/**************************************************************/
/****************************************************/

	}
}






#define max(a, b) ((a) > (b) ? (a) : (b))
#define min(a, b) ((a) < (b) ? (a) : (b))

typedef void (* mm_fun)(int, int, int, const double *, const double *, double *, int);

mm_fun
	mm_functions[ICHUNK]= {
		mm_arbit0,
		mm_arbit1,
		mm_arbit2,
		mm_arbit3,
		mm_arbit4,
		mm_arbit5,
		mm_arbit6,
		mm_arbit7};

#define mm_arbit(X, Y, Z, A, B, C, Skip) (mm_functions[Z % ICHUNK])\
	(X, Y, Z, A, B, C, Skip)


void
mul_mfmf_mf(int S, const double *A, const double *B, double *C)
{
register int
	i,
	j,
	k;

if(S <= 160)
	{
	(mm_functions[S % ICHUNK])(S, S, S, A, B, C, S);
	return;
	}

for(i= 0; i < S; i+= BLOCK)
	for(j= 0; j < S; j+= BLOCK)
		for(k= 0; k < S; k+= BLOCK)
			{
			int
				X, Y, Z;
			const double
				*a, *b;
			double
				*c;

			X= min(BLOCK, S - i);
			Y= min(BLOCK, S - k);
			Z= min(BLOCK, S - j);

/*			printf("%d %d %d, %d %d %d\n",
				X, Y, Z, i, j, k); */

			a= &A[i * S + k];
			b= &B[k * S + j];
			c= &C[i * S + j];

			mm_arbit(X, Y, Z, a, b, c, S);
			}
}