8 template <
class NT,
class IT,
unsigned TTDIM>
11 ispar = (workers > 1);
27 sizereq = ((rowbits > 1) && (colbits > 1));
31 cerr <<
"Matrix too small for this library" << endl;
35 rowlowbits = rowbits-1;
36 collowbits = colbits-1;
37 IT inf = numeric_limits<IT>::max();
40 rowhighbits = rowbits-rowlowbits;
41 colhighbits = colbits-collowbits;
70 while(rowlowbits+collowbits > maxbits)
72 if(rowlowbits > collowbits)
83 while(rowlowbits > collowbits)
88 while(rowlowbits < collowbits)
93 assert (collowbits == rowlowbits);
99 cout <<
"Forcing beta to "<< (candlowmask+1) <<
" instead of the chosen " << (lowrowmask+1) << endl;
100 cout <<
"Warning : No checks are performed on the beta you have forced, anything can happen !" << endl;
101 lowrowmask = lowcolmask = candlowmask;
102 rowlowbits = collowbits = forcelogbeta;
103 rowhighbits = rowbits-rowlowbits;
104 colhighbits = colbits-collowbits;
108 double sqrtn = sqrt(sqrt(static_cast<double>(m) * static_cast<double>(n)));
109 IT logbeta =
static_cast<IT
>(ceil(log2(sqrtn))) + 2;
110 if(rowlowbits > logbeta)
112 rowlowbits = collowbits = logbeta;
114 rowhighbits = rowbits-rowlowbits;
115 colhighbits = colbits-collowbits;
117 cout <<
"Beta chosen to be "<< (lowrowmask+1) << endl;
119 highrowmask = ((roundrowup - 1) ^ lowrowmask);
120 highcolmask = ((roundcolup - 1) ^ lowcolmask);
123 IT blcdimrow = lowrowmask + 1;
124 IT blcdimcol = lowcolmask + 1;
125 nbr =
static_cast<IT
>(ceil(static_cast<double>(m) / static_cast<double>(blcdimrow)));
126 nbc =
static_cast<IT
>(ceil(static_cast<double>(n) / static_cast<double>(blcdimcol)));
128 blcrange = (lowrowmask+1) * (lowcolmask+1);
134 template <
class NT,
class IT,
unsigned TTDIM>
136 : nz(rhs.nz), m(rhs.m), n(rhs.n), blcrange(rhs.blcrange), nbr(rhs.nbr), nbc(rhs.nbc), nrb(rhs.nrb),
137 rowhighbits(rhs.rowhighbits), rowlowbits(rhs.rowlowbits), highrowmask(rhs.highrowmask), lowrowmask(rhs.lowrowmask),
138 colhighbits(rhs.colhighbits), collowbits(rhs.collowbits), highcolmask(rhs.highcolmask), lowcolmask(rhs.lowcolmask),
139 mortoncmp(rhs.mortoncmp), ispar(rhs.ispar)
143 num =
new NT[nz+2](); num++;
145 masks =
new MTYPE[nrb];
147 copy ( rhs.num, rhs.num+nz+1, num);
148 copy ( rhs.bot, rhs.bot+nrb, bot );
149 copy ( rhs.masks, rhs.masks+nrb, masks );
154 for(IT i=0; i<nbr; ++i)
155 top[i] =
new IT[nbc+1];
156 for(IT i=0; i<nbr; ++i)
157 for(IT j=0; j <= nbc; ++j)
158 top[i][j] = rhs.top[i][j];
162 template <
class NT,
class IT,
unsigned TTDIM>
176 for(IT i=0; i<nbr; ++i)
188 blcrange = rhs.blcrange;
190 rowhighbits = rhs.rowhighbits;
191 rowlowbits = rhs.rowlowbits;
192 highrowmask = rhs.highrowmask;
193 lowrowmask = rhs.lowrowmask;
195 colhighbits = rhs.colhighbits;
196 collowbits = rhs.collowbits;
197 highcolmask = rhs.highcolmask;
198 lowcolmask= rhs.lowcolmask;
199 mortoncmp = rhs.mortoncmp;
203 num =
new NT[nz+2](); num++;
205 masks =
new MTYPE[nrb];
207 copy ( rhs.num, rhs.num+nz+1, num);
208 copy ( rhs.bot, rhs.bot+nrb, bot );
209 copy ( rhs.masks, rhs.masks+nrb, masks );
214 for(IT i=0; i<nbr; ++i)
215 top[i] =
new IT[nbc+1];
216 for(IT i=0; i<nbr; ++i)
217 for(IT j=0; j <= nbc; ++j)
218 top[i][j] = rhs.top[i][j];
224 template <
class NT,
class IT,
unsigned TTDIM>
235 for(IT i=0; i<nbr; ++i)
241 template <
class NT,
class IT,
unsigned TTDIM>
244 typedef std::pair<IT, IT> ipair;
245 typedef std::pair<IT, ipair> mypair;
247 assert(nz != 0 && n != 0 && m != 0);
250 num =
new NT[nz+2](); num++;
254 top = allocate2D<IT>(nbr, nbc+1);
255 mypair * pairarray =
new mypair[nz];
257 for(IT j = 0; j < n; ++j)
259 for (IT i = csc.jc [j] ; i < csc.jc[j+1] ; ++i)
262 IT hindex = (((highrowmask & csc.ir[i] ) >> rowlowbits) << colhighbits)
263 | ((highcolmask & j) >> collowbits);
264 IT lindex = ((lowrowmask & csc.ir[i]) << collowbits) | (lowcolmask & j) ;
267 pairarray[k++] = mypair(hindex, ipair(lindex,i));
270 sort(pairarray, pairarray+nz);
271 SortBlocks(pairarray, csc.num);
275 template <
class NT,
class IT,
unsigned TTDIM>
278 typedef pair<IT, pair<IT, IT> > mypair;
284 for(IT i = 0; i < nbr; ++i)
286 for(IT j = 0; j < nbc; ++j)
288 top[i][j] = tempbot.size();
290 std::vector<mypair> blocknz;
291 while(cnz < nz && pairarray[cnz].first == ((i*ldim)+j) )
293 IT lowbits = pairarray[cnz].second.first;
294 IT rlowbits = ((lowbits >> collowbits) & lowrowmask);
295 IT clowbits = (lowbits & lowcolmask);
298 blocknz.push_back(mypair(bikey, pairarray[cnz++].second));
301 sort(blocknz.begin(), blocknz.end());
304 IT bnz = blocknz.size();
306 for(IT bcur=0; bcur < bnz; ++bcur)
309 if(curregblk > lastregblk)
311 lastregblk = curregblk;
312 M.push_back((MTYPE) 0);
316 IT Ci = blocknz[bcur].second.first & lowcolmask;
317 IT Ri = (blocknz[bcur].second.first >> collowbits) & lowrowmask;
320 IT lefttop = ((lowrowmask & Ri) << collowbits) | (lowcolmask & Ci);
322 tempbot.push_back(lefttop);
324 M.back() |= GetMaskTable<MTYPE>(
getModulo(blocknz[bcur].first,
RBSIZE));
326 for(IT k=prevcnz; k<cnz ; ++k)
328 num[k] = val[blocknz[k-prevcnz].second.second];
331 top[i][nbc] = tempbot.size();
333 assert(M.size() == tempbot.size());
334 masks =
new MTYPE[M.size()];
335 copy(M.begin(), M.end(), masks);
337 bot =
new IT[tempbot.size()];
338 copy(tempbot.begin(), tempbot.end(), bot);
339 nrb = tempbot.size();
352 template <
class NT,
class IT,
unsigned TTDIM>
355 assert(end-start > 0);
358 if((chunks[end] - chunks[start]) == 1)
360 IT chi = ( (chunks[start] - chunks[0]) << collowbits);
364 if(ysize == (lowrowmask+1) && (m-chi) > lowcolmask )
366 const NT * __restrict subx = &x[chi];
367 BlockPar( *(chunks[start]) , *(chunks[end]), subx, y, 0, blcrange,
BREAKNRB * ysize, sumscan);
371 SubSpMV(chunks[0], chunks[start]-chunks[0], chunks[end]-chunks[0], x, y, sumscan);
376 SubSpMV(chunks[0], chunks[start]-chunks[0], chunks[end]-chunks[0], x, y, sumscan);
381 IT mid = (start+end)/2;
382 cilk_spawn BMult(chunks, start, mid, x, y, ysize, sumscan);
385 BMult(chunks, mid, end, x, y, ysize, sumscan);
389 NT * temp =
new NT[ysize];
390 std::fill_n(temp, ysize, 0.0);
391 BMult(chunks, mid, end, x, temp, ysize, sumscan);
393 for(IT i=0; i<ysize; ++i)
408 template <
class NT,
class IT,
unsigned TTDIM>
410 IT rangebeg, IT rangeend, IT cutoff, IT * __restrict sumscan)
const
412 assert(
IsPower2(rangeend-rangebeg));
413 if(end - start < cutoff)
415 SSEspmv(num + sumscan[start], masks + start, bot + start, end-start, subx, suby, lowcolmask, lowrowmask, collowbits);
421 IT halfrange = (rangebeg+rangeend)/2;
422 IT qrt1range = (rangebeg+halfrange)/2;
423 IT qrt3range = (halfrange+rangeend)/2;
425 IT * mid = std::lower_bound(&bot[start], &bot[end], halfrange, mortoncmp);
426 IT * left = std::lower_bound(&bot[start], mid, qrt1range, mortoncmp);
427 IT * right = std::lower_bound(mid, &bot[end], qrt3range, mortoncmp);
435 IT size0 =
static_cast<IT
> (left - &bot[start]);
436 IT size1 =
static_cast<IT
> (mid - left);
437 IT size2 =
static_cast<IT
> (right - mid);
438 IT size3 =
static_cast<IT
> (&bot[end] - right);
447 cilk_spawn BlockPar(start, start+size0, subx, suby, rangebeg, qrt1range, ncutoff,sumscan);
448 BlockPar(end-size3, end, subx, suby, qrt3range, rangeend, ncutoff,sumscan);
451 cilk_spawn BlockPar(start+size0, start+size0+size1, subx, suby, qrt1range, halfrange, ncutoff,sumscan);
452 BlockPar(start+size0+size1, end-size3, subx, suby, halfrange, qrt3range, ncutoff,sumscan);
457 cilk_spawn BlockPar(start, start+size0, subx, suby, rangebeg, qrt1range, ncutoff,sumscan);
458 BlockPar(start+size0, start+size0+size1, subx, suby, qrt1range, halfrange, ncutoff,sumscan);
461 cilk_spawn BlockPar(start+size0+size1, end-size3, subx, suby, halfrange, qrt3range, ncutoff,sumscan);
462 BlockPar(end-size3, end, subx, suby, qrt3range, rangeend, ncutoff,sumscan);
471 template <
class NT,
class IT,
unsigned TTDIM>
472 void BmCsb<NT, IT, TTDIM>::SubSpMV(IT * __restrict btop, IT bstart, IT bend,
const NT * __restrict x, NT * __restrict suby, IT * __restrict sumscan)
const
474 for (IT j = bstart ; j < bend ; ++j)
476 IT chi = (j << collowbits);
478 if(btop[j+1] - btop[j] > 0)
480 SSEspmv(num + sumscan[btop[j]], masks + btop[j], bot + btop[j], btop[j+1]-btop[j], x+chi, suby, lowcolmask, lowrowmask, collowbits);
487 template <
class NT,
class IT,
unsigned TTDIM>
492 outfile <<
"## Matrix Doesn't have any nonzeros" <<endl;
495 const IT ntop = nbr * nbc;
497 outfile <<
"## Average block is of dimensions "<< lowrowmask+1 <<
"-by-" << lowcolmask+1 << endl;
498 outfile <<
"## Number of real blocks is "<< ntop << endl;
499 outfile <<
"## Row imbalance is " <<
RowImbalance(*
this) << endl;
501 std::vector<int> blocksizes(ntop);
502 for(IT i=0; i<nbr; ++i)
504 for(IT j=0; j < nbc; ++j)
506 blocksizes[i*nbc+j] =
static_cast<int> (top[i][j+1]-top[i][j]);
509 sort(blocksizes.begin(), blocksizes.end());
510 outfile<<
"## Total number of nonzeros: " << nz << endl;
511 outfile<<
"## Total number of register blocks: "<< accumulate(blocksizes.begin(), blocksizes.end(), 0) << endl;
512 outfile<<
"## Average fill ratio is: " <<
static_cast<double>(nz) / static_cast<double>((
RBSIZE * nrb)) << endl;
513 outfile<<
"## The histogram of fill ratios within register blocks:" << endl;
515 unsigned * counts =
new unsigned[nrb];
520 outfile <<
"## Nonzero distribution (sorted) of blocks follows: \n" ;
521 for(IT i=0; i< ntop; ++i)
523 outfile << blocksizes[i] <<
"\n";
ofstream & PrintStats(ofstream &outfile) const
void printhistogram(const T *scansum, size_t size, unsigned bins)
unsigned int getModulo(unsigned int n, unsigned int d)
unsigned int getDivident(unsigned int n, unsigned int d)
unsigned int nextpoweroftwo(unsigned int v)
ITYPE BitInterleaveLow(ITYPE x, ITYPE y)
unsigned IntPower< 2 >(unsigned exponent)
void SSEspmv(const double *__restrict V, const uint64_t *__restrict M, const unsigned *__restrict bot, const unsigned nrb, const double *__restrict X, double *Y, unsigned lcmask, unsigned lrmask, unsigned clbits)
unsigned int highestbitset(unsigned __int64 v)
float RowImbalance(const CSB &A)
void popcountall(const unsigned char *__restrict M, unsigned *__restrict counts, size_t n)
BmCsb< NT, IT, TTDIM > & operator=(const BmCsb< NT, IT, TTDIM > &rhs)