8 template <
class NT,
class IT,
unsigned TTDIM>
11 ispar = (workers > 1);
24 sizereq = (nbits > 1);
28 cerr <<
"Matrix too small for this library" << endl;
33 IT inf = numeric_limits<IT>::max();
36 nhighbits = nbits-nlowbits;
59 cout <<
"Forcing beta to "<< (candlowmask+1) <<
" instead of the chosen " << (lowmask+1) << endl;
60 cout <<
"Warning : No checks are performed on the beta you have forced, anything can happen !" << endl;
61 lowmask = candlowmask;
62 nlowbits = forcelogbeta;
63 nhighbits = nbits-nlowbits;
67 double sqrtn = sqrt(static_cast<double>(n)); IT logbeta =
static_cast<IT
>(ceil(log2(sqrtn))) + 2;
68 if(nlowbits > logbeta)
72 nhighbits = nbits-nlowbits;
74 cout <<
"Beta chosen to be "<< (lowmask+1) << endl;
76 highmask = ((roundup - 1) ^ lowmask);
78 IT blcdim = lowmask + 1;
79 ncsb =
static_cast<IT
>(ceil(static_cast<double>(n) / static_cast<double>(blcdim)));
81 blcrange = (lowmask+1) * (lowmask+1);
87 template <
class NT,
class IT,
unsigned TTDIM>
89 : nz(rhs.nz), n(rhs.n), blcrange(rhs.blcrange), ncsb(rhs.ncsb), nrb(rhs.nrb),
90 nhighbits(rhs.nhighbits), nlowbits(rhs.nlowbits), diagonal(rhs.diagonal),
91 highmask(rhs.highmask), lowmask(rhs.lowmask), mortoncmp(rhs.mortoncmp), ispar(rhs.ispar)
99 masks =
new MTYPE[nrb];
100 scansum =
new IT[nrb];
102 copy ( rhs.num, rhs.num+nz+1, num);
103 copy ( rhs.bot, rhs.bot+nrb, bot );
104 copy ( rhs.masks, rhs.masks+nrb, masks );
105 copy ( rhs.scansum, rhs.scansum+nrb, scansum );
109 top =
new IT* [ncsb];
110 for(IT i=0; i<ncsb; ++i)
111 top[i] =
new IT[ncsb-i+1];
112 for(IT i=0; i<ncsb; ++i)
113 for(IT j=0; j <= (ncsb-i); ++j)
114 top[i][j] = rhs.top[i][j];
118 template <
class NT,
class IT,
unsigned TTDIM>
133 for(IT i=0; i<ncsb; ++i)
142 blcrange = rhs.blcrange;
143 mortoncmp = rhs.mortoncmp;
144 diagonal = rhs.diagonal;
146 nhighbits = rhs.nhighbits;
147 nlowbits = rhs.nlowbits;
148 highmask = rhs.highmask;
149 lowmask = rhs.lowmask;
154 num =
new NT[nz+2](); num++;
156 masks =
new MTYPE[nrb];
157 scansum =
new IT[nrb];
159 copy ( rhs.num, rhs.num+nz+1, num);
160 copy ( rhs.bot, rhs.bot+nrb, bot );
161 copy ( rhs.masks, rhs.masks+nrb, masks );
162 copy ( rhs.scansum, rhs.scansum+nrb, scansum );
166 top =
new IT* [ncsb];
167 for(IT i=0; i<ncsb; ++i)
168 top[i] =
new IT[ncsb-i+1];
169 for(IT i=0; i<ncsb; ++i)
170 for(IT j=0; j <= (ncsb-i); ++j)
171 top[i][j] = rhs.top[i][j];
177 template <
class NT,
class IT,
unsigned TTDIM>
189 for(IT i=0; i<ncsb; ++i)
195 template <
class NT,
class IT,
unsigned TTDIM>
198 typedef std::pair<IT, IT> ipair;
199 typedef std::pair<IT, ipair> mypair;
201 assert(nz != 0 && n != 0);
204 top =
new IT* [ncsb];
205 for(IT i=0; i<ncsb; ++i)
206 top[i] =
new IT[ncsb-i+1];
208 mypair * pairarray =
new mypair[nz];
210 for(IT j = 0; j < n; ++j)
212 for (IT i = csc.jc [j] ; i < csc.jc[j+1] ; ++i)
215 IT hindex = (((highmask & csc.ir[i] ) >> nlowbits) << nhighbits) | ((highmask & j) >> nlowbits);
216 IT lindex = ((lowmask & csc.ir[i]) << nlowbits) | (lowmask & j) ;
219 pairarray[k++] = mypair(hindex, ipair(lindex,i));
222 sort(pairarray, pairarray+nz);
223 SortBlocks(pairarray, csc.num);
227 template <
class NT,
class IT,
unsigned TTDIM>
230 typedef pair<IT, pair<IT, IT> > mypair;
237 for(IT i = 0; i < ncsb; ++i)
239 for(IT j = 0; j < (ncsb-i); ++j)
241 top[i][j] = tempbot.size();
243 std::vector<mypair> blocknz;
244 while(cnz < nz && pairarray[cnz].first == ((i*ldim)+(j+i)) )
246 IT interlowbits = pairarray[cnz].second.first;
247 IT rlowbits = ((interlowbits >> nlowbits) & lowmask);
248 IT clowbits = (interlowbits & lowmask);
251 if(j == 0 && rlowbits == clowbits)
253 diagonal.push_back(make_pair((i << nlowbits)+rlowbits, val[pairarray[cnz++].second.second]));
257 blocknz.push_back(mypair(bikey, pairarray[cnz++].second));
261 sort(blocknz.begin(), blocknz.end());
264 for(
typename vector<mypair>::iterator itr = blocknz.begin(); itr != blocknz.end(); ++itr)
266 tempnum.push_back( val[itr->second.second] );
269 if(curregblk > lastregblk)
271 lastregblk = curregblk;
272 M.push_back((MTYPE) 0);
276 IT Ci = itr->second.first & lowmask;
277 IT Ri = (itr->second.first >> nlowbits) & lowmask;
280 IT lefttop = ((lowmask & Ri) << nlowbits) | (lowmask & Ci);
282 tempbot.push_back(lefttop);
287 top[i][ncsb-i] = tempbot.size();
292 nrb = tempbot.size();
293 masks =
new MTYPE[nrb];
294 scansum =
new IT[nrb];
296 num =
new NT[nz+2](); num++;
298 copy(M.begin(), M.end(), masks);
300 copy(tempbot.begin(), tempbot.end(), bot);
301 copy(tempnum.begin(), tempnum.end(), num);
304 template<
class NT,
class IT,
unsigned TTDIM>
311 lspace =
new IT[lsize];
312 rspace =
new IT[rsize];
313 for(IT i=0; i<rsize; ++i)
320 lspace[lsize-1] = size-1;
325 IT chunksfour = size/4;
326 IT rest = size - 4*chunksfour;
327 lsize = 2*chunksfour;
328 rsize = 2*chunksfour;
339 lspace =
new IT[lsize];
340 rspace =
new IT[rsize];
342 for(IT i=0; i<chunksfour; ++i)
345 lspace[2*i+1] = 4*i+1;
347 rspace[2*i+1] = 4*i+3;
351 lspace[lsize-2] = size-3;
352 lspace[lsize-1] = size-2;
353 rspace[rsize-1] = size-1;
357 lspace[lsize-2] = size-2;
358 lspace[lsize-1] = size-1;
362 lspace[lsize-1] = size-1;
367 template<
class NT,
class IT,
unsigned TTDIM>
370 cilk_for(IT i=0; i< ncsb-d; ++i)
372 IT rhi = (i << nlowbits);
374 cilk_for(IT j=d; j < (ncsb-i); ++j)
376 IT chi = ((j+i) << nlowbits);
377 symcsr(num+scansum[top[i][j]], masks+top[i][j], bot+top[i][j], top[i][j+1]-top[i][j], x+chi, x+rhi, y+rhi, y+chi, lowmask, nlowbits);
383 template <
class NT,
class IT,
unsigned TTDIM>
388 cilk_for(IT i=0; i< ncsb; ++i)
390 IT hi = (i << nlowbits);
392 if(i == (ncsb-1) && (n-hi) <= lowmask)
394 SSEsym(num + scansum[top[i][0]], masks + top[i][0], bot + top[i][0], top[i][1]-top[i][0], x+hi, y+hi, lowmask, nlowbits);
398 BlockTriPar(top[i][0], top[i][1], x+hi, y+hi, 0, blcrange,
BREAKNRB * (nlowbits+1));
404 cilk_for(IT i=0; i< ncsb; ++i)
406 IT hi = (i << nlowbits);
407 SSEsym(num + scansum[top[i][0]], masks + top[i][0], bot + top[i][0], top[i][1]-top[i][0], x+hi, y+hi, lowmask, nlowbits);
411 const IT diagsize = diagonal.size();
412 cilk_for(IT i=0; i < diagsize; ++i)
414 y[diagonal[i].first] += diagonal[i].second * x[diagonal[i].first];
421 template <
class NT,
class IT,
unsigned TTDIM>
427 DivideIterationSpace(lspace, rspace, lsize, rsize, ncsb-d, d);
431 for(IT k=0; k<lsize; ++k)
433 lsum += top[lspace[k]][d+1] - top[lspace[k]][d];
435 for(IT k=0; k<rsize; ++k)
437 rsum += top[rspace[k]][d+1] - top[rspace[k]][d];
439 float lave = lsum / lsize;
440 float rave = rsum / rsize;
442 cilk_for(IT i=0; i< lsize; ++i)
444 IT rhi = (lspace[i] << nlowbits) ;
445 IT chi = ((lspace[i]+d) << nlowbits);
446 IT start = top[lspace[i]][d];
447 IT end = top[lspace[i]][d+1];
449 if((top[lspace[i]][d+1] - top[lspace[i]][d] >
BALANCETH * lave)
450 && (!(lspace[i] == (ncsb-d-1) && (n-chi) <= lowmask)))
452 BlockPar(start, end, x+chi, x+rhi, y+rhi, y+chi, 0, blcrange,
BREAKNRB * (nlowbits+1));
456 SSEsym(num + scansum[start], masks + start, bot + start, end-start, x+chi, x+rhi, y+rhi, y+chi, lowmask, nlowbits);
459 cilk_for(IT j=0; j< rsize; ++j)
461 IT rhi = (rspace[j] << nlowbits) ;
462 IT chi = ((rspace[j]+d) << nlowbits);
463 IT start = top[rspace[j]][d];
464 IT end = top[rspace[j]][d+1];
466 if((top[rspace[j]][d+1] - top[rspace[j]][d] >
BALANCETH * rave)
467 && (!(rspace[j] == (ncsb-d-1) && (n-chi) <= lowmask)))
469 BlockPar(start, end, x+chi, x+rhi, y+rhi, y+chi, 0, blcrange,
BREAKNRB * (nlowbits+1));
473 SSEsym(num + scansum[start], masks + start, bot + start, end-start, x+chi, x+rhi, y+rhi, y+chi, lowmask, nlowbits);
484 template <
class NT,
class IT,
unsigned TTDIM>
486 IT rangebeg, IT rangeend, IT cutoff)
const
488 assert(
IsPower2(rangeend-rangebeg));
489 if(end - start < cutoff)
491 SSEsym(num + scansum[start], masks + start, bot + start, end-start, subx, suby, lowmask, nlowbits);
497 IT halfrange = (rangebeg+rangeend)/2;
498 IT qrt1range = (rangebeg+halfrange)/2;
499 IT qrt3range = (halfrange+rangeend)/2;
501 IT * mid = std::lower_bound(&bot[start], &bot[end], halfrange, mortoncmp);
502 IT * right = std::lower_bound(mid, &bot[end], qrt3range, mortoncmp);
510 IT size0 =
static_cast<IT
> (mid - &bot[start]);
511 IT size2 =
static_cast<IT
> (right - mid);
512 IT size3 =
static_cast<IT
> (&bot[end] - right);
516 cilk_spawn BlockTriPar(start, start+size0, subx, suby, rangebeg, qrt1range, ncutoff);
517 BlockTriPar(end-size3, end, subx, suby, qrt3range, rangeend, ncutoff);
520 BlockPar(start+size0, end-size3, subx, subx, suby, suby, halfrange, qrt3range, ncutoff);
530 template <
class NT,
class IT,
unsigned TTDIM>
532 NT * __restrict suby, NT * __restrict suby_mirror, IT rangebeg, IT rangeend, IT cutoff)
const
534 assert(
IsPower2(rangeend-rangebeg));
535 if(end - start < cutoff)
538 SSEsym(num + scansum[start], masks + start, bot + start, end-start, subx, subx_mirror, suby, suby_mirror, lowmask, nlowbits);
544 IT halfrange = (rangebeg+rangeend)/2;
545 IT qrt1range = (rangebeg+halfrange)/2;
546 IT qrt3range = (halfrange+rangeend)/2;
548 IT * mid = std::lower_bound(&bot[start], &bot[end], halfrange, mortoncmp);
549 IT * left = std::lower_bound(&bot[start], mid, qrt1range, mortoncmp);
550 IT * right = std::lower_bound(mid, &bot[end], qrt3range, mortoncmp);
558 IT size0 =
static_cast<IT
> (left - &bot[start]);
559 IT size1 =
static_cast<IT
> (mid - left);
560 IT size2 =
static_cast<IT
> (right - mid);
561 IT size3 =
static_cast<IT
> (&bot[end] - right);
567 cilk_spawn BlockPar(start, start+size0, subx, subx_mirror, suby, suby_mirror, rangebeg, qrt1range, ncutoff);
568 BlockPar(end-size3, end, subx, subx_mirror, suby, suby_mirror, qrt3range, rangeend, ncutoff);
571 cilk_spawn BlockPar(start+size0, start+size0+size1, subx, subx_mirror, suby, suby_mirror, qrt1range, halfrange, ncutoff);
572 BlockPar(start+size0+size1, end-size3, subx, subx_mirror, suby, suby_mirror, halfrange, qrt3range, ncutoff);
580 template <
class NT,
class IT,
unsigned TTDIM>
583 const IT diagsize = diagonal.size();
584 for(IT i=0; i < diagsize; ++i)
586 y[diagonal[i].first] += diagonal[i].second * x[diagonal[i].first];
588 for (IT i = 0 ; i < ncsb ; ++i)
590 IT rhi = (i << nlowbits);
591 for (IT j = 1 ; j < (ncsb-i) ; ++j)
593 IT chi = ((j+i) << nlowbits);
594 SSEsym(num + scansum[top[i][j]], masks+top[i][j], bot+top[i][j], top[i][j+1]-top[i][j], x+chi, x+rhi, y+rhi, y+chi, lowmask, nlowbits);
597 SSEsym(num + scansum[top[i][0]], masks+top[i][0], bot+top[i][0], top[i][1]-top[i][0], x+rhi, y+rhi, lowmask, nlowbits);
602 template <
class NT,
class IT,
unsigned TTDIM>
611 IT * sums =
new IT[size];
612 for(
size_t i=0; i< size; ++i)
614 sums[i] = top[i][d+1] - top[i][d];
616 IT max = *max_element(sums, sums+size);
617 IT mean = accumulate(sums, sums+size, 0.0) / size;
620 return static_cast<float>(max) / mean;
625 template <
class NT,
class IT,
unsigned TTDIM>
629 for(
size_t i=0; i< ncsb-d; ++i)
631 sum += (top[i][d+1] - top[i][d]);
637 template <
class NT,
class IT,
unsigned TTDIM>
642 outfile <<
"## Matrix Doesn't have any nonzeros" <<endl;
645 const IT ntop = ncsb * ncsb;
647 outfile <<
"## Average block is of dimensions "<< lowmask+1 <<
"-by-" << lowmask+1 << endl;
648 outfile <<
"## Average fill ratio is: " <<
static_cast<double>(nz) / static_cast<double>((
RBSIZE * nrb)) << endl;
649 outfile <<
"## Number of real blocks is "<< ntop << endl;
650 outfile <<
"## Main (0th) block diagonal imbalance: " << Imbalance(0) << endl;
651 outfile <<
"## 1st block diagonal imbalance: " << Imbalance(1) << endl;
652 outfile <<
"## 2nd block diagonal imbalance: " << Imbalance(2) << endl;
654 outfile <<
"## nrb ratios (block diagonal 0,1,2): " <<
static_cast<float>(nrbsum(0)) / nrb <<
", "
655 << static_cast<float>(nrbsum(1)) / nrb <<
", " << static_cast<float>(nrbsum(2)) / nrb << endl;
656 outfile <<
"## atomics ratio: " <<
static_cast<float>(nrb-nrbsum(0)-nrbsum(1)-nrbsum(2))/nrb << endl;
658 outfile<<
"## Total number of nonzeros: " << nz << endl;
659 outfile<<
"## Total number of register blocks: "<< nrb << endl;
664 template <
class NT,
class IT,
unsigned TTDIM>
667 for(IT i =0; i<ncsb; ++i)
669 for(IT j=0; j< (ncsb-i); ++j)
671 outfile <<
"Dumping A.top(" << i <<
"," << j <<
")" << endl;
672 for(IT k=top[i][j]; k< top[i][j+1]; ++k)
674 IT rli = ((bot[k] >> nlowbits) & lowmask);
675 IT cli = bot[k] & lowmask;
676 outfile <<
"A(" << rli <<
"," << cli <<
")=" << num[k] << endl;
unsigned int getModulo(unsigned int n, unsigned int d)
unsigned int getDivident(unsigned int n, unsigned int d)
unsigned int nextpoweroftwo(unsigned int v)
unsigned prescan(unsigned *a, MTYPE *const M, int n)
ofstream & Dump(ofstream &outfile) const
BmSym< NT, IT, TTDIM > & operator=(const BmSym< NT, IT, TTDIM > &rhs)
ITYPE BitInterleaveLow(ITYPE x, ITYPE y)
unsigned IntPower< 2 >(unsigned exponent)
unsigned int highestbitset(unsigned __int64 v)
void SSEsym(const double *__restrict V, const unsigned char *__restrict M, const unsigned *__restrict bot, const unsigned nrb, const double *__restrict X, const double *__restrict XT, double *Y, double *YT, unsigned lowmask, unsigned nlbits)
void symcsr(const double *__restrict V, const unsigned char *__restrict M, const unsigned *__restrict bot, const unsigned nrb, const double *__restrict X, const double *__restrict XT, double *Y, double *YT, unsigned lcmask, unsigned nlbits)
ofstream & PrintStats(ofstream &outfile) const