//////////////////////////////////////////////////////////////////////
// $Id: driver.c,v 1.9 1998/02/24 22:55:33 dmartin Exp $
//////////////////////////////////////////////////////////////////////
//
// Matrix Multiply Contest Test Driver
// U.C. Berkeley, Department of EECS, Computer Science Division
// CS 267, Spring 1998
// Based on code from Chad Yoshikawa
// Extended by David Martin
//
//////////////////////////////////////////////////////////////////////

#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/resource.h>

#define ABS(val) ((val) > 0 ? (val) : -(val))
#define MIN(a,b) ((a) < (b) ? (a) : (b))
#define MAX(a,b) ((a) > (b) ? (a) : (b))
#define SQR(a)   ((a) * (a))
#define CUBE(a)  ((a) * (a) * (a))

//////////////////////////////////////////////////////////////////////
// BEGIN CONFIGURATION

#define NUM_CORRECTNESS_CHECKS 10
#define RANDOM_TESTS 0
#define MAX_ERROR 2.0

#define TEST_RUNS 30
#define CALC_ITERS(n) (10 + 1e8 / CUBE (n))

int qtest_sizes[] = { 16, 24, 32, 48, 64, 96, 128, 192, 256 }; // aligned sizes
int atest_sizes[] = { 23, 31, 47, 73, 97, 127, 163, 191, 211, 229, 251 };  // odd sizes

/* primes from 16 to 256:
17 19 23 29 31 37 41 43 47 53 59 61 67 71 73 79 83 89 97 101 103 107 109
113 127 131 137 139 149 151 157 163 167 173 179 181 191 193 197 199 211
223 227 229 233 239 241 251 
*/

// END CONFIGURATION
//////////////////////////////////////////////////////////////////////

#define NUM_QTESTS (sizeof(qtest_sizes) / sizeof(int))
#define NUM_ATESTS (sizeof(atest_sizes) / sizeof(int))

int* test_sizes[2] = { qtest_sizes, atest_sizes };
int num_tests[2] = { NUM_QTESTS, NUM_ATESTS };

extern void matmul (int i_matdim,
		    const double* pd_A,
		    const double* pd_B,
		    double* pd_C);
#define MUL_MFMF_MF(size,A,B,C) matmul(size,A,B,C)

extern double drand48();
extern unsigned short* seed48();
extern int getrussage(int,struct rusage*);
struct rusage rus; /* starting time */
struct rusage rue; /* ending time */
#define START_TIMING  getrusage(RUSAGE_SELF,&rus);
#define STOP_TIMING   getrusage(RUSAGE_SELF,&rue);

double 
reportTiming() {
  struct timeval utime;
  utime.tv_sec = rue.ru_utime.tv_sec - rus.ru_utime.tv_sec ;
  if ( rue.ru_utime.tv_usec < rus.ru_utime.tv_usec ) {
    utime.tv_sec--;
    utime.tv_usec = 1000000l - rus.ru_utime.tv_usec +
      rue.ru_utime.tv_usec;
  } else
    utime.tv_usec = rue.ru_utime.tv_usec -
      rus.ru_utime.tv_usec ;
  return ((double)utime.tv_sec + (double)utime.tv_usec*1e-6);
}

void
myseed()
{
  int i;
  unsigned short seed16v[3];
  for (i=0;i<3;i++)
     seed16v[i] = time(0);
  seed48(seed16v);
}


/*
** A naive matrix multiply routine.
** Used to test for correctness.
*/
void
naive_mm(int Sm,int Sk,int Sn,
	 const double *A,const double *B,double *C)
{
  int i,j,k;
  for (i=0;i<Sm;i++)
    for (j=0;j<Sn;j++) {
      double tmp = C[i*Sn+j];
      for (k=0;k<Sk;k++)
	tmp += A[i*Sk+k]*B[k*Sn+j];
      C[i*Sn+j] = tmp;
    }
}

/* return a random number uniformly in [l,u] inclusive, l < u */
int 
rrand(int l, int u) 
{ 
  return (l + (int)((1+u-l)*drand48())); 
}


double 
l1_norm(double *mat,int rows,int cols)
{
  double sum=0;
  int i;
  for (i=0;i<rows*cols;i++) {
    double val = *mat++;
    sum += ABS(val);
  }
  return sum;
}

double 
l1_norm_diff(double *mat1,double *mat2,int rows,int cols)
{
  double sum=0;
  int i;
  for (i=0;i<rows*cols;i++) {
    double val = *mat1++ - *mat2++;
    sum += ABS(val);
  }
  return sum;
}


/*
** error: Error formula to compare two 
**        matrix multiplies.
** norm(C1-C2)/(macheps*norm(A)*norm(B)),
** Ci=float(A*B)
** macheps=2^(-24) in single prec.
**        =2^(-53) in double prec.
*/
double 
error(double *mat1,double *mat2,int rows,int cols)
{
  const double macheps = 1.110223024625157e-16; /* = 2^(-53) */
  return l1_norm_diff(mat1,mat2,rows,cols)/
    (macheps*l1_norm(mat1,rows,cols)*l1_norm(mat2,rows,cols));
}


void 
mat_init(double *mat,int rows,int cols)
{
  int i;
  for (i=0;i<rows*cols;i++)
    *mat++ = 2.0*drand48()-1.0;
}

void 
checkCorrect ()
{
  double *A,*B,*C,*cA,*cB,*cC;
  int minDim = 1;
  int maxDim = 256;
  int maxnbytes = sizeof(double) * SQR (maxDim);
  int i, j;

  A = (double*) malloc (maxnbytes);
  B = (double*) malloc (maxnbytes);
  C = (double*) malloc (maxnbytes);
  
  cA = (double*) malloc (maxnbytes);
  cB = (double*) malloc (maxnbytes);
  cC = (double*) malloc (maxnbytes);
  
  fprintf (stderr, "Checking for correctness on sizes:"); 

#if !RANDOM_TESTS
  for (i = 0; i < 2; i++) for (j = 0; j < num_tests[i]; j++) 
    {
      int matdim = test_sizes[i][j];
#else
  for (i = 0; i < NUM_CORRECTNESS_CHECKS; i++) 
    {
      int matdim = rrand (minDim, maxDim);
#endif
      double err;
      int nbytes = sizeof(double) * SQR(matdim);

      fprintf (stderr, " %d", matdim); 

      mat_init (A, matdim, matdim);
      mat_init (B, matdim, matdim);
      mat_init (C, matdim, matdim);

      bcopy ((void*)A, (void*)cA, nbytes);
      bcopy ((void*)B, (void*)cB, nbytes);
      bcopy ((void*)C, (void*)cC, nbytes);

      naive_mm (matdim, matdim, matdim, cA, cB, cC);
      MUL_MFMF_MF (matdim, A, B, C);

      if (bcmp ((void*)A, (void*)cA, nbytes) != 0 ||
	  bcmp ((void*)B, (void*)cB, nbytes) != 0) 
	{
	  fprintf (stderr, "Source matrices were modified.  DISQUALIFIED!!!\n");
	  //exit (0);
	}

      if ((err = error (C, cC, matdim, matdim)) > MAX_ERROR)
	{
	  fprintf (stderr, "Error for test case %dx%d is %f > %f. DISQUALIFIED!!!\n",
		   matdim, matdim, err, MAX_ERROR);
	  //exit (0);
	}

    }
  fprintf (stderr,"\n"); 

  free (A); free (B); free (C); 
  free (cA); free (cB); free (cC);
}

void
timeIt ()
{
  double *A, *B, *C;
  double *oA[TEST_RUNS], *oB[TEST_RUNS], *oC[TEST_RUNS];
  int i, j, k;
  int test;

  for (k = 0; k < 2; k++)
    {
      if (k > 0) printf ("\n");

      for (test = 0; test < num_tests[k]; test++) 
	{
	  int matdim = test_sizes[k][test];
	  const int num_iters = CALC_ITERS (matdim);
	  double max_mflops = 0.0;
	  int run;
	
	  /* make sure these are quad-word (i.e., 16-byte) aligned */
#if 0
	  A = oA = (double*) malloc ((SQR(matdim)+1) * sizeof(double));
	  B = oB = (double*) malloc ((SQR(matdim)+1) * sizeof(double));
	  C = oC = (double*) malloc ((SQR(matdim)+1) * sizeof(double));
#endif
	  
	  for (run = 0; run < TEST_RUNS; run++) 
	    {
	      int iter;
	      double mflops;
	      double utime;

	      /* use different matricies for each trial so that the OS page mapping */
	      /* won't affect the results... */
	      A = oA[run] = (double*) malloc ((SQR(matdim)+rrand(1,10)) * sizeof(double));
	      B = oB[run] = (double*) malloc ((SQR(matdim)+rrand(1,10)) * sizeof(double));
	      C = oC[run] = (double*) malloc ((SQR(matdim)+rrand(1,10)) * sizeof(double));

	      if (((unsigned)A) & 0x8) A = (double*)(((unsigned)A)+0x8);
	      if (((unsigned)B) & 0x8) B = (double*)(((unsigned)B)+0x8);
	      if (((unsigned)C) & 0x8) C = (double*)(((unsigned)C)+0x8);

	      mat_init (A, matdim, matdim);
	      mat_init (B, matdim, matdim);
	      mat_init (C, matdim, matdim);

	      START_TIMING;
	      for (iter=0;iter<num_iters;iter++) 
		{
		  // iteratively accumulate into C
		  MUL_MFMF_MF (matdim, A, B, C);
		}
	      STOP_TIMING;

	      utime = reportTiming();
	      // (2 * n^3) FLOPs (n^3 mul-adds)
	      mflops = 2.0 * CUBE(matdim) * num_iters * 1e-6 / utime;
	      max_mflops = MAX (max_mflops, mflops);
	    }

	  printf("%d %.0f\n", matdim, max_mflops); 
	  fflush(stdout);

	  for (run = 0; run < TEST_RUNS; run++) 
	    {
	      free (oA[run]); 
	      free (oB[run]); 
	      free (oC[run]); 
	    }
	}
    }
}

int 
main()
{
  myseed();
  checkCorrect ();
  timeIt ();
  return 0;
}

//////////////////////////////////////////////////////////////////////
// $Log: driver.c,v $
// Revision 1.9  1998/02/24 22:55:33  dmartin
// *** empty log message ***
//
// Revision 1.8  1998/02/24 22:52:23  dmartin
// better correctness checks
//  - checks that A,B not modified
//  - initializes C to non-zero
// checks for correctness on the sizes that are timed
//
// Revision 1.7  1998/02/07 02:04:53  dmartin
// fixed possible accuracy bug in CALC_ITERS
//
// Revision 1.6  1998/02/05 21:28:11  dmartin
// *** empty log message ***
//
// Revision 1.5  1998/02/05 21:27:49  dmartin
// fixed num_tests bug
//
// Revision 1.4  1998/02/05 21:13:11  dmartin
// *** empty log message ***
//
// Revision 1.3  1998/02/05 21:11:23  dmartin
// iterations calcuated from size
// code cleaned up a bit
// no functional changes
//
// Revision 1.2  1998/01/27 02:14:23  dmartin
// *** empty log message ***
//
// Revision 1.1  1998/01/19 00:49:45  dmartin
// Initial revision
//
//////////////////////////////////////////////////////////////////////