Compressed Sparse Blocks  1.2
bmcsb.cpp
1 #include "bmcsb.h"
2 #include "utility.h"
3 
4 // Choose the block size as large as possible, subject to the following constraints:
5 // 1) The bot array is addressable by IT
6 // 2) The parts of the x & y vectors that a block touches fit into L2 cache [assuming a saxpy() operation]
7 // 3) There's enough parallel slackness for block rows (at least SLACKNESS * CILK_NPROC)
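// In short: the block dimension is beta = 2^rowlowbits = 2^collowbits (i.e. lowrowmask+1).
// The loops below shrink beta until the pieces of x and y that one block touches fit in L2
// and there is enough slackness across block rows; unless a beta is forced, log2(beta) is
// finally capped at ceil(log2((m*n)^(1/4))) + 2.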
8 template <class NT, class IT, unsigned TTDIM>
9 void BmCsb<NT, IT, TTDIM>::Init(int workers, IT forcelogbeta)
10 {
11  ispar = (workers > 1);
12  IT roundrowup = nextpoweroftwo(m);
13  IT roundcolup = nextpoweroftwo(n);
14 
15  // if indices are negative, highestbitset returns -1,
16  // but that will be caught by the sizereq below
17  IT rowbits = highestbitset(roundrowup);
18  IT colbits = highestbitset(roundcolup);
19  bool sizereq;
20  if (ispar)
21  {
22  sizereq = ((IntPower<2>(rowbits) > SLACKNESS * workers)
23  && (IntPower<2>(colbits) > SLACKNESS * workers));
24  }
25  else
26  {
27  sizereq = ((rowbits > 1) && (colbits > 1));
28  }
29  if(!sizereq)
30  {
31  cerr << "Matrix too small for this library" << endl;
32  return;
33  }
34 
35  rowlowbits = rowbits-1;
36  collowbits = colbits-1;
37  IT inf = numeric_limits<IT>::max();
38  IT maxbits = highestbitset(inf);
39 
40  rowhighbits = rowbits-rowlowbits; // # higher order bits for rows (has at least one bit)
41  colhighbits = colbits-collowbits; // # higher order bits for cols (has at least one bit)
42  if(ispar)
43  {
44  while(IntPower<2>(rowhighbits) < SLACKNESS * workers)
45  {
46  rowhighbits++;
47  rowlowbits--;
48  }
49  }
50 
51  // calculate the space that suby occupies in L2 cache
52  IT yL2 = IntPower<2>(rowlowbits) * sizeof(NT);
53  while(yL2 > L2SIZE)
54  {
55  yL2 /= 2;
56  rowhighbits++;
57  rowlowbits--;
58  }
59 
60  // calculate the space that subx occupies in L2 cache
61  IT xL2 = IntPower<2>(collowbits) * sizeof(NT);
62  while(xL2 > L2SIZE)
63  {
64  xL2 /= 2;
65  colhighbits++;
66  collowbits--;
67  }
68 
69  // blocks need to be square for correctness (maybe generalize this later?)
70  while(rowlowbits+collowbits > maxbits)
71  {
72  if(rowlowbits > collowbits)
73  {
74  rowhighbits++;
75  rowlowbits--;
76  }
77  else
78  {
79  colhighbits++;
80  collowbits--;
81  }
82  }
83  while(rowlowbits > collowbits)
84  {
85  rowhighbits++;
86  rowlowbits--;
87  }
88  while(rowlowbits < collowbits)
89  {
90  colhighbits++;
91  collowbits--;
92  }
93  assert (collowbits == rowlowbits);
94  lowrowmask = IntPower<2>(rowlowbits) - 1;
95  lowcolmask = IntPower<2>(collowbits) - 1;
96  if(forcelogbeta != 0)
97  {
98  IT candlowmask = IntPower<2>(forcelogbeta) -1;
99  cout << "Forcing beta to "<< (candlowmask+1) << " instead of the chosen " << (lowrowmask+1) << endl;
100  cout << "Warning : No checks are performed on the beta you have forced, anything can happen !" << endl;
101  lowrowmask = lowcolmask = candlowmask;
102  rowlowbits = collowbits = forcelogbeta;
103  rowhighbits = rowbits-rowlowbits;
104  colhighbits = colbits-collowbits;
105  }
106  else
107  {
108  double sqrtn = sqrt(sqrt(static_cast<double>(m) * static_cast<double>(n)));
109  IT logbeta = static_cast<IT>(ceil(log2(sqrtn))) + 2;
110  if(rowlowbits > logbeta)
111  {
112  rowlowbits = collowbits = logbeta;
113  lowrowmask = lowcolmask = IntPower<2>(logbeta) -1;
114  rowhighbits = rowbits-rowlowbits;
115  colhighbits = colbits-collowbits;
116  }
117  cout << "Beta chosen to be "<< (lowrowmask+1) << endl;
118  }
119  highrowmask = ((roundrowup - 1) ^ lowrowmask);
120  highcolmask = ((roundcolup - 1) ^ lowcolmask);
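 // The high masks cover the bits above the low (within-block) bits, so
 // (r & highrowmask) >> rowlowbits is the block-row index of row r, and likewise for columns.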
121 
122  // nbc = #{block columns} = #{blocks in any block row}, nbr = #{block rows}
123  IT blcdimrow = lowrowmask + 1;
124  IT blcdimcol = lowcolmask + 1;
125  nbr = static_cast<IT>(ceil(static_cast<double>(m) / static_cast<double>(blcdimrow)));
126  nbc = static_cast<IT>(ceil(static_cast<double>(n) / static_cast<double>(blcdimcol)));
127 
128  blcrange = (lowrowmask+1) * (lowcolmask+1); // range indexed by one block
129  mortoncmp = MortonCompare<IT>(rowlowbits, collowbits, lowrowmask, lowcolmask);
130 }
131 
132 
133 // copy constructor
134 template <class NT, class IT, unsigned TTDIM>
135 BmCsb<NT, IT, TTDIM>::BmCsb (const BmCsb<NT, IT, TTDIM> & rhs)
136 : nz(rhs.nz), m(rhs.m), n(rhs.n), blcrange(rhs.blcrange), nbr(rhs.nbr), nbc(rhs.nbc), nrb(rhs.nrb),
137 rowhighbits(rhs.rowhighbits), rowlowbits(rhs.rowlowbits), highrowmask(rhs.highrowmask), lowrowmask(rhs.lowrowmask),
138 colhighbits(rhs.colhighbits), collowbits(rhs.collowbits), highcolmask(rhs.highcolmask), lowcolmask(rhs.lowcolmask),
139 mortoncmp(rhs.mortoncmp), ispar(rhs.ispar)
140 {
141  if(nz > 0) // nz > 0 iff nrb > 0
142  {
143  num = new NT[nz+2](); num++;
144  bot = new IT[nrb];
145  masks = new MTYPE[nrb];
146 
147  copy ( rhs.num, rhs.num+nz+1, num);
148  copy ( rhs.bot, rhs.bot+nrb, bot );
149  copy ( rhs.masks, rhs.masks+nrb, masks );
150  }
151  if ( nbr > 0)
152  {
153  top = new IT* [nbr];
154  for(IT i=0; i<nbr; ++i)
155  top[i] = new IT[nbc+1];
156  for(IT i=0; i<nbr; ++i)
157  for(IT j=0; j <= nbc; ++j)
158  top[i][j] = rhs.top[i][j];
159  }
160 }
161 
162 template <class NT, class IT, unsigned TTDIM>
163 BmCsb<NT, IT, TTDIM> & BmCsb<NT, IT, TTDIM>::operator=(const BmCsb<NT, IT, TTDIM> & rhs)
164 {
165  if(this != &rhs)
166  {
167  if(nz > 0) // if the existing object is not empty
168  {
169  // make it empty
170  delete [] masks;
171  delete [] bot;
172  delete [] (--num);
173  }
174  if(nbr > 0)
175  {
176  for(IT i=0; i<nbr; ++i)
177  delete [] top[i];
178  delete [] top;
179  }
180 
181  ispar = rhs.ispar;
182  nz = rhs.nz;
183  nrb = rhs.nrb;
184  n = rhs.n;
185  m = rhs.m;
186  nbr = rhs.nbr;
187  nbc = rhs.nbc;
188  blcrange = rhs.blcrange;
189 
190  rowhighbits = rhs.rowhighbits;
191  rowlowbits = rhs.rowlowbits;
192  highrowmask = rhs.highrowmask;
193  lowrowmask = rhs.lowrowmask;
194 
195  colhighbits = rhs.colhighbits;
196  collowbits = rhs.collowbits;
197  highcolmask = rhs.highcolmask;
198  lowcolmask= rhs.lowcolmask;
199  mortoncmp = rhs.mortoncmp;
200 
201  if(nz > 0) // if the copied object is not empty
202  {
203  num = new NT[nz+2](); num++;
204  bot = new IT[nrb];
205  masks = new MTYPE[nrb];
206 
207  copy ( rhs.num, rhs.num+nz+1, num);
208  copy ( rhs.bot, rhs.bot+nrb, bot );
209  copy ( rhs.masks, rhs.masks+nrb, masks );
210  }
211  if(nbr > 0)
212  {
213  top = new IT* [nbr];
214  for(IT i=0; i<nbr; ++i)
215  top[i] = new IT[nbc+1];
216  for(IT i=0; i<nbr; ++i)
217  for(IT j=0; j <= nbc; ++j)
218  top[i][j] = rhs.top[i][j];
219  }
220  }
221  return *this;
222 }
223 
224 template <class NT, class IT, unsigned TTDIM>
225 BmCsb<NT, IT, TTDIM>::~BmCsb()
226 {
227  if( nz > 0)
228  {
229  delete [] masks;
230  delete [] bot;
231  delete [] (--num);
232  }
233  if ( nbr > 0)
234  {
235  for(IT i=0; i<nbr; ++i)
236  delete [] top[i];
237  delete [] top;
238  }
239 }
240 
241 template <class NT, class IT, unsigned TTDIM>
242 BmCsb<NT, IT, TTDIM>::BmCsb (Csc<NT, IT> & csc, int workers):nz(csc.nz), m(csc.m),n(csc.n)
243 {
244  typedef std::pair<IT, IT> ipair;
245  typedef std::pair<IT, ipair> mypair;
246 
247  assert(nz != 0 && n != 0 && m != 0);
248  Init(workers);
249 
250  num = new NT[nz+2](); num++; // Padding for SSEspmv (the blendv operation)
251  // bot will later be sized to nrb (the number of register blocks);
252  // nrb <= nz, since in the worst case each register block contains only one nonzero
253 
254  top = allocate2D<IT>(nbr, nbc+1);
255  mypair * pairarray = new mypair[nz];
256  IT k = 0;
257  for(IT j = 0; j < n; ++j)
258  {
259  for (IT i = csc.jc [j] ; i < csc.jc[j+1] ; ++i) // scan the jth column
260  {
261  // concatenate the higher/lower order half of both row (first) index and col (second) index bits
262  IT hindex = (((highrowmask & csc.ir[i] ) >> rowlowbits) << colhighbits)
263  | ((highcolmask & j) >> collowbits);
264  IT lindex = ((lowrowmask & csc.ir[i]) << collowbits) | (lowcolmask & j) ;
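 // Illustrative example with 2 low bits per dimension (beta = 4): row 13 = 0b1101, col 6 = 0b0110
 // -> block coordinates (0b11, 0b01) are packed into hindex, and the within-block
 // coordinates (0b01, 0b10) give lindex = 0b0110.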
265 
266  // i => location of that nonzero in csc.ir and csc.num arrays
267  pairarray[k++] = mypair(hindex, ipair(lindex,i));
268  }
269  }
270  sort(pairarray, pairarray+nz); // sort according to hindex
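 // hindex enumerates blocks in row-major order over the (block row, block column) grid
 // (leading dimension 2^colhighbits), so after this sort all nonzeros of a block are
 // consecutive and blocks appear in the order SortBlocks scans them.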
271  SortBlocks(pairarray, csc.num);
272  delete [] pairarray;
273 }
274 
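// SortBlocks packs the (already block-sorted) nonzeros into the final data structure:
// top[i][j]..top[i][j+1] delimit the register blocks of block (i,j) inside bot/masks,
// bot holds the top-left within-block offset of each register block, masks its nonzero
// pattern (one bit per position), and num the values in the same order.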
275 template <class NT, class IT, unsigned TTDIM>
276 void BmCsb<NT, IT, TTDIM>::SortBlocks(pair<IT, pair<IT,IT> > * pairarray, NT * val)
277 {
278  typedef pair<IT, pair<IT, IT> > mypair;
279  IT cnz = 0;
280  IT crb = 0; // current register block
281  IT ldim = IntPower<2>(colhighbits); // leading dimension (not always equal to nbc)
282  vector<IT> tempbot;
283  vector<MTYPE> M;
284  for(IT i = 0; i < nbr; ++i)
285  {
286  for(IT j = 0; j < nbc; ++j)
287  {
288  top[i][j] = tempbot.size(); // top array now points to register blocks (instead of nonzeros)
289  IT prevcnz = cnz;
290  std::vector<mypair> blocknz;
291  while(cnz < nz && pairarray[cnz].first == ((i*ldim)+j) ) // as long as we're in this block
292  {
293  IT lowbits = pairarray[cnz].second.first;
294  IT rlowbits = ((lowbits >> collowbits) & lowrowmask);
295  IT clowbits = (lowbits & lowcolmask);
296  IT bikey = BitInterleaveLow(rlowbits, clowbits);
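 // BitInterleaveLow interleaves the within-block row and column bits, so sorting by
 // bikey below puts this block's nonzeros into Morton (Z-order) sequence.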
297 
298  blocknz.push_back(mypair(bikey, pairarray[cnz++].second));
299  }
300  // sort the block into bit-interleaved order
301  sort(blocknz.begin(), blocknz.end());
302 
303  int lastregblk = -1;
304  IT bnz = blocknz.size();
305 
306  for(IT bcur=0; bcur < bnz; ++bcur)
307  {
308  int curregblk = getDivident(blocknz[bcur].first, RBSIZE);
309  if(curregblk > lastregblk) // new register block
310  {
311  lastregblk = curregblk;
312  M.push_back((MTYPE) 0);
313 
314  // The following lines implement a get_head function that returns
315  // the top-left index of the register block that this nonzero belongs to
316  IT Ci = blocknz[bcur].second.first & lowcolmask;
317  IT Ri = (blocknz[bcur].second.first >> collowbits) & lowrowmask;
318  Ci -= getModulo(Ci,RBDIM);
319  Ri -= getModulo(Ri,RBDIM);
320  IT lefttop = ((lowrowmask & Ri) << collowbits) | (lowcolmask & Ci);
321 
322  tempbot.push_back(lefttop);
323  }
324  M.back() |= GetMaskTable<MTYPE>(getModulo(blocknz[bcur].first, RBSIZE));
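 // One machine word summarizes each register block: OR-ing in GetMaskTable's bit marks
 // the position (0..RBSIZE-1, in bit-interleaved order) that this nonzero occupies.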
325  }
326  for(IT k=prevcnz; k<cnz ; ++k)
327  {
328  num[k] = val[blocknz[k-prevcnz].second.second];
329  }
330  }
331  top[i][nbc] = tempbot.size();
332  }
333  assert(M.size() == tempbot.size());
334  masks = new MTYPE[M.size()];
335  copy(M.begin(), M.end(), masks);
336 
337  bot = new IT[tempbot.size()];
338  copy(tempbot.begin(), tempbot.end(), bot);
339  nrb = tempbot.size();
340 
341  assert(cnz == nz);
342 }
343 
344 
345 
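// BMult multiplies the blocks of one block row by x and accumulates into y (of size ysize).
// chunks[start..end] delimit groups of consecutive blocks; the recursion halves the chunk
// range, and if the spawned half may still be running (!SYNCHED) the second half writes
// into a temporary vector that is added to y after the sync, avoiding races.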
352 template <class NT, class IT, unsigned TTDIM>
353 void BmCsb<NT, IT, TTDIM>::BMult(IT** chunks, IT start, IT end, const NT * x, NT * y, IT ysize, IT * __restrict sumscan) const
354 {
355  assert(end-start > 0); // there should be at least one chunk
356  if (end-start == 1) // single chunk
357  {
358  if((chunks[end] - chunks[start]) == 1) // chunk consists of a single (normally dense) block
359  {
360  IT chi = ( (chunks[start] - chunks[0]) << collowbits);
361 
362  // m-chi > lowcolmask for all blocks except the last skinny tall one.
363  // if the last one is regular too, then it has m-chi = lowcolmask+1
364  if(ysize == (lowrowmask+1) && (m-chi) > lowcolmask ) // parallelize if it is a regular/complete block
365  {
366  const NT * __restrict subx = &x[chi];
367  BlockPar( *(chunks[start]) , *(chunks[end]), subx, y, 0, blcrange, BREAKNRB * ysize, sumscan);
368  }
369  else // otherwise block parallelization will fail
370  {
371  SubSpMV(chunks[0], chunks[start]-chunks[0], chunks[end]-chunks[0], x, y, sumscan);
372  }
373  }
374  else // a number of sparse blocks with a total of at most O(\beta) nonzeros
375  {
376  SubSpMV(chunks[0], chunks[start]-chunks[0], chunks[end]-chunks[0], x, y, sumscan);
377  }
378  }
379  else
380  {
381  IT mid = (start+end)/2; // divide the chunk range in half
382  cilk_spawn BMult(chunks, start, mid, x, y, ysize, sumscan);
383  if(SYNCHED)
384  {
385  BMult(chunks, mid, end, x, y, ysize, sumscan);
386  }
387  else
388  {
389  NT * temp = new NT[ysize];
390  std::fill_n(temp, ysize, 0.0);
391  BMult(chunks, mid, end, x, temp, ysize, sumscan);
392  cilk_sync;
393  for(IT i=0; i<ysize; ++i)
394  y[i] += temp[i];
395  delete [] temp;
396  }
397  }
398 }
399 
400 
401 
402 // Parallelize the block itself (A*x version)
403 // start/end: element start/end positions (indices to the bot array)
404 // bot[start...end] always fall in the same block
405 // PRECONDITION: rangeend-rangebeg is a power of two
406 // TODO: we rely on the particular implementation of lower_bound for correctness, which is dangerous !
407 // what if lhs (instead of rhs) parameter to the comparison object is the splitter?
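// sumscan[k] gives the offset into num of the first nonzero of register block k
// (presumably a prefix scan over the per-register-block nonzero counts, supplied by the caller)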
408 template <class NT, class IT, unsigned TTDIM>
409 void BmCsb<NT, IT, TTDIM>::BlockPar(IT start, IT end, const NT * __restrict subx, NT * __restrict suby,
410  IT rangebeg, IT rangeend, IT cutoff, IT * __restrict sumscan) const
411 {
412  assert(IsPower2(rangeend-rangebeg));
413  if(end - start < cutoff)
414  {
415  SSEspmv(num + sumscan[start], masks + start, bot + start, end-start, subx, suby, lowcolmask, lowrowmask, collowbits);
416  }
417  else
418  {
419  // lower_bound performs a binary search over the ordered range [first, last):
420  // it returns the first position where value could be inserted without violating the ordering
421  IT halfrange = (rangebeg+rangeend)/2;
422  IT qrt1range = (rangebeg+halfrange)/2;
423  IT qrt3range = (halfrange+rangeend)/2;
424 
425  IT * mid = std::lower_bound(&bot[start], &bot[end], halfrange, mortoncmp);
426  IT * left = std::lower_bound(&bot[start], mid, qrt1range, mortoncmp);
427  IT * right = std::lower_bound(mid, &bot[end], qrt3range, mortoncmp);
428 
429  /* -------
430  | 0 2 |
431  | 1 3 |
432  ------- */
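 // bot[start..end) is sorted in Morton order, so each quadrant of the block occupies a
 // contiguous subrange; the three lower_bound calls above find the quadrant boundaries.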
433  // subtracting two pointers pointing to the same array gives you the # of elements separating them
434  // we're *sure* that the differences are 1) non-negative, 2) small enough to be indexed by an IT
435  IT size0 = static_cast<IT> (left - &bot[start]);
436  IT size1 = static_cast<IT> (mid - left);
437  IT size2 = static_cast<IT> (right - mid);
438  IT size3 = static_cast<IT> (&bot[end] - right);
439 
440  IT ncutoff = std::max<IT>(cutoff/2, MINNRBTOPAR);
441 
442  // We can choose to perform [0,3] in parallel and then [1,2] in parallel
443  // or perform [0,1] in parallel and then [2,3] in parallel
444  // Decision is based on the balance, i.e. we pick the more balanced parallelism
445  if( ( absdiff(size0,size3) + absdiff(size1,size2) ) < ( absdiff(size0,size1) + absdiff(size2,size3) ) )
446  {
447  cilk_spawn BlockPar(start, start+size0, subx, suby, rangebeg, qrt1range, ncutoff,sumscan); // multiply subblock_0
448  BlockPar(end-size3, end, subx, suby, qrt3range, rangeend, ncutoff,sumscan); // multiply subblock_3
449  cilk_sync;
450 
451  cilk_spawn BlockPar(start+size0, start+size0+size1, subx, suby, qrt1range, halfrange, ncutoff,sumscan); // multiply subblock_1
452  BlockPar(start+size0+size1, end-size3, subx, suby, halfrange, qrt3range, ncutoff,sumscan); // multiply subblock_2
453  cilk_sync;
454  }
455  else
456  {
457  cilk_spawn BlockPar(start, start+size0, subx, suby, rangebeg, qrt1range, ncutoff,sumscan); // multiply subblock_0
458  BlockPar(start+size0, start+size0+size1, subx, suby, qrt1range, halfrange, ncutoff,sumscan); // multiply subblock_1
459  cilk_sync;
460 
461  cilk_spawn BlockPar(start+size0+size1, end-size3, subx, suby, halfrange, qrt3range, ncutoff,sumscan); // multiply subblock_2
462  BlockPar(end-size3, end, subx, suby, qrt3range, rangeend, ncutoff,sumscan); // multiply subblock_3
463  cilk_sync;
464  }
465  }
466 }
467 
468 
469 // double* restrict a; --> No aliases for a[0], a[1], ...
470 // bstart/bend: block start/end index (to the top array)
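// SubSpMV is the serial per-chunk kernel: it walks the given blocks of this block row in
// order and calls the SSE kernel once per nonempty block.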
471 template <class NT, class IT, unsigned TTDIM>
472 void BmCsb<NT, IT, TTDIM>::SubSpMV(IT * __restrict btop, IT bstart, IT bend, const NT * __restrict x, NT * __restrict suby, IT * __restrict sumscan) const
473 {
474  for (IT j = bstart ; j < bend ; ++j) // for all blocks inside that block row
475  {
476  IT chi = (j << collowbits); // starting column of block j (from its high-order column bits); &x[chi] is the slice of x this block touches
477 
478  if(btop[j+1] - btop[j] > 0)
479  {
480  SSEspmv(num + sumscan[btop[j]], masks + btop[j], bot + btop[j], btop[j+1]-btop[j], x+chi, suby, lowcolmask, lowrowmask, collowbits);
481  }
482  }
483 }
484 
485 
486 // Print stats to an ofstream object
487 template <class NT, class IT, unsigned TTDIM>
488 ofstream & BmCsb<NT, IT, TTDIM>::PrintStats(ofstream & outfile) const
489 {
490  if(nz == 0)
491  {
492  outfile << "## Matrix Doesn't have any nonzeros" <<endl;
493  return outfile;
494  }
495  const IT ntop = nbr * nbc;
496 
497  outfile << "## Average block is of dimensions "<< lowrowmask+1 << "-by-" << lowcolmask+1 << endl;
498  outfile << "## Number of real blocks is "<< ntop << endl;
499  outfile << "## Row imbalance is " << RowImbalance(*this) << endl;
500 
501  std::vector<int> blocksizes(ntop);
502  for(IT i=0; i<nbr; ++i)
503  {
504  for(IT j=0; j < nbc; ++j)
505  {
506  blocksizes[i*nbc+j] = static_cast<int> (top[i][j+1]-top[i][j]);
507  }
508  }
509  sort(blocksizes.begin(), blocksizes.end());
510  outfile<< "## Total number of nonzeros: " << nz << endl;
511  outfile<< "## Total number of register blocks: "<< accumulate(blocksizes.begin(), blocksizes.end(), 0) << endl;
512  outfile<< "## Average fill ratio is: " << static_cast<double>(nz) / static_cast<double>((RBSIZE * nrb)) << endl;
513  outfile<< "## The histogram of fill ratios within register blocks:" << endl;
514 
515  unsigned * counts = new unsigned[nrb];
516  popcountall(masks, counts, nrb);
517  printhistogram(counts, nrb, RBSIZE);
518  delete [] counts;
519 
520  outfile << "## Nonzero distribution (sorted) of blocks follows: \n" ;
521  for(IT i=0; i< ntop; ++i)
522  {
523  outfile << blocksizes[i] << "\n";
524  }
525  outfile << endl;
526  return outfile;
527 }