/* Number of entries in codetable, in each metrics row and in each */
/* traceback buffer; nextbit() loops over all N of them. */
#define N 64
/* Low six bits set (0x3f == N-1); used to mask state indices. */
#define SRMASK 0x3f

/* Octal bit masks.  nextbit() shifts each right by one bit and ANDs it */
/* with the state index to pick the bits fed to the codetable lookup. */
/* Presumably the two generator polynomials of the code -- confirm. */
#define G0 0171
#define G1 0133

/* The shift-and-look parity method below has fallen out of favour */
/* against a simpler table lookup (codetable), */
/* as long as the table can be kept in cache. */
/* May need to revert to the shift-and-look method if that isn't so. */
/* The whole section is therefore disabled with #if 0. */
#if 0
/* quick lookup of the parity (number of 1 bits, modulo 2) of a 5-bit quantity */
/* nth bit of PARITYBIT == parity of the quantity n */
/* only useful on StrongArm with arbitrary shift */
/* 1001 0110 0110 1001 0110 1001 1001 0110 */
#define PARITYBIT 0x96696996
/* CODE finds parity of 6-bit quantity: */
/* the upper five bits index PARITYBIT, the low bit is XORed in */
#if 0
/* this is the simple approach */
#define CODE(n) ( ((n)&1) ^ ((PARITYBIT >> ((n)>>1)) & 1) )
#endif
/* this is 1 instruction more efficient: XOR first, mask once at the end */
#define CODE(n) ( ((n) ^ (PARITYBIT >> ((n)>>1))) & 1)
#endif

/* Lookup table indexed by a masked state value in nextbit(); it stands in */
/* for the disabled CODE() parity macro.  No code in this section of the */
/* file fills it -- NOTE(review): confirm where it is initialised. */
int codetable[N];

/* constant TBD */
/* Fixed-point representation of 1.0.  FIXED_POINT_ONE is not defined in */
/* this section of the file. */
#define FPONE FIXED_POINT_ONE

/* fixed point mpy */
/* Multiplies first, then shifts the product right by FP_SHIFT (also not */
/* defined in this section of the file). */
#define FPTIMES(a,b) ((a)*(b)>>FP_SHIFT)

/* Two rows of N path metrics; nextbit() reads one row through om and */
/* writes the other through nm. */
int metrics[2][N];

/* this is 4*32 bit traceback */
/* i.e. SIZETB buffers, each holding one int of decision bits per state; */
/* main() feeds 32 samples into a buffer before moving to the next one. */
#define SIZETB 4

int tbbuf[SIZETB][N];

/* Metric row pointers: om is read and nm is written inside nextbit(), */
/* which swaps them at the start of every call. */
int *om, *nm;

/* if it was hard to write, it should be hard to read */
void nextbit(int i, int q, int *tb) {
/* rework this to avoid pipeline stall on the mpy */
#if 0
  im1 = i - FPONE;
  qm1 = q - FPONE;
  im1 = FPTIMES(im1, im1);
  qm1 = FPTIMES(qm1, qm1);
  /* note (x+1)^2 = (x-1)^2 + 4x */
  /* save us the computation cause shifts are quick */
  fouri = i<<2;
  fourq = q<<2;
#endif

  im1 = i - FPONE;
  im1 = im1*im1;
  qm1 = q - FPONE;
  qm1 = qm1*qm1;
  fouri = i<<2;
  fourq = q<<2;
  im1 = im1>>FP_SHIFT;
  qm1 = qm1>>FP_SHIFT;		/* CCC=8 */

  /* swap metric buffers */
  /* using pointers avoids index arithmetic */
  t = om;
  om = nm;
  nm = t;			/* CCC=11 */

  for (i=N; i--; ) {		/* loop overhd, CCC=12+3N */
    /* one loop unroll across MSB of i should give giant win */
    /* since MSB(G0)=MSB(G1)=1 */
    /* do another inversion job like the one below */
    /* too complicated for now;  need sleep */
/*
    ONLY A NOTE:  DO NOT USE
    prev0 = (i << 1);
    cg0 = prev0 & G0;
    cg0 = CODE(cg0);  <- can simplify since LSB=0 (invalid anyways, 7 bits)
*/
/*  table lookup replaces the following */
#if 0
    cg0 = i & (G0>>1);	/* G0>>1 should be precomputed */
    cg0 = CODE(cg0);
    cg1 = i & (G1>>1);
    cg1 = CODE(cg1);
#endif

    cg0 = codetable[i & (G0>>1)];
    cg1 = codetable[i & (G1>>1)];	/* CCC=12+9N */

    d0 = im1 + qm1;
    d1 = d0;				/* CCC=12+11N */

    /* ONLY VALID SINCE LSB(G0)=LSB(G1)=1 */
    /* ensures codes are inverses between m=0 and m=1 */
    /* think about it if you don't get it */
    if (cg0)
      d1 += fouri;	/* check this;  may be backwards */
    else
      d0 += fouri;			/* CCC=12+14N */
    if (cg1)
      d1 += fourq;	/* check this;  may be backwards */
    else
      d0 += fourq;			/* CCC=12+17N */

    prev0 = (i&SRMASK) << 1;
    prev1 = prev0 | 1;			/* CCC=12+20N */

/*  t = &tb[i]; */
    t = tb+i;
    td = *t;
    d0 += om[prev0];
    d1 += om[prev1];			/* CCC=12+26N */

    td=td<<1;		/* serialize traceback */
    if (d0 < d1) {	/* easy since metrics positive! */ /* CCC=12+28N */
      nm[i] = d0;
    }
    else {
      td|=1;
      nm[i] = d1;			/* worst case: CCC=12+31N */
    }
    *t = td;				/* CCC=12+32N */
  }
}

/*
 * traceback - walk the stored decisions backwards through all SIZETB
 * traceback buffers and emit the bits found in the last buffer visited.
 *
 *   tbnum  index of the buffer to start from; the walk then proceeds to
 *          tbnum-1, tbnum-2, ... (modulo SIZETB), so the caller should
 *          pass the most recently filled buffer.
 *
 * Within a buffer, bit 0 of each word is the newest decision (nextbit()
 * shifts older ones left), so bits are visited -- and emitted -- newest
 * first.  OUTPUT is defined outside this function.
 *
 * The original estimate for one call was CCC=2075.
 */
void traceback(int tbnum) {		/* CCC=1 for pass by value */
  int state;
  int i, j;
  int bit;

  /* choose arbitrary start point for TB, output trailing 32 bits */
  state=0;
  for (i=SIZETB; i--; ) {
    for (j=0; j<32; j++) {
      /* read as unsigned so the top bit shifts down cleanly */
      bit = (int)(((unsigned)tbbuf[tbnum][state]>>j)&1u);
      if (bit) {
        if(!i)
          OUTPUT 1;	/* assume one cycle for this */
      }
      else {
        if(!i)
          OUTPUT 0;
      }
      /* Step to the predecessor recorded by nextbit(): the decision is */
      /* the predecessor's low bit.  The original masked before shifting */
      /* right, which threw away the bit it had just OR-ed in at */
      /* SRMASK+1, so the walk never left state 0. */
      state = ((state<<1)|bit)&SRMASK;
    }
    tbnum=(SIZETB+tbnum-1)%SIZETB;
  }
}

main() {
  thisbuf=0;
  startoutput=0;
  /* CCC for this section is cycles to decode 32 bits */
  while (1) {				/* CCC=1 */
    for (i=32; i--; ) {			/* CCC=2+3*32=98 */
      READ(&i,&q);			/* CCC=98+2*32=162 */
      nextbit(i,q,tbbuf[thisbuf]);	/* CCC=162+32*2060+32*1=65952 */
    }
    thisbuf = (thisbuf+1)%SIZETB;	/* CCC=65954 */
    if(!thisbuf)
      startoutput=1;
    if(startoutput)			/* CCC=65957 */
      traceback(thisbuf);		/* CCC=65957+2075=68032 */
  }
}