9 template <
class NT,
class IT>
12 ispar = (workers > 1);
25 sizereq = (nbits > 1);
29 cerr <<
"Matrix too small for this library" << endl;
34 IT inf = numeric_limits<IT>::max();
37 nhighbits = nbits-nlowbits;
60 cout <<
"Forcing beta to "<< (candlowmask+1) <<
" instead of the chosen " << (lowmask+1) << endl;
61 cout <<
"Warning : No checks are performed on the beta you have forced, anything can happen !" << endl;
62 lowmask = candlowmask;
63 nlowbits = forcelogbeta;
64 nhighbits = nbits-nlowbits;
68 double sqrtn = sqrt(static_cast<double>(n));
69 IT logbeta =
static_cast<IT
>(ceil(log2(sqrtn))) + 2;
70 if(nlowbits > logbeta)
74 nhighbits = nbits-nlowbits;
76 cout <<
"Beta chosen to be "<< (lowmask+1) << endl;
78 highmask = ((roundup - 1) ^ lowmask);
80 IT blcdim = lowmask + 1;
81 ncsb =
static_cast<IT
>(ceil(static_cast<double>(n) / static_cast<double>(blcdim)));
83 blcrange = (lowmask+1) * (lowmask+1);
89 template <
class NT,
class IT>
91 : nz(rhs.nz), n(rhs.n), blcrange(rhs.blcrange), ncsb(rhs.ncsb), nhighbits(rhs.nhighbits), nlowbits(rhs.nlowbits),
92 highmask(rhs.highmask), lowmask(rhs.lowmask), mortoncmp(rhs.mortoncmp), ispar(rhs.ispar), diagonal(rhs.diagonal)
99 copy ( rhs.num, rhs.num+nz, num);
100 copy ( rhs.bot, rhs.bot+nz, bot );
104 top =
new IT* [ncsb];
105 for(IT i=0; i<ncsb; ++i)
106 top[i] =
new IT[ncsb-i+1];
107 for(IT i=0; i<ncsb; ++i)
108 for(IT j=0; j <= (ncsb-i); ++j)
109 top[i][j] = rhs.top[i][j];
113 template <
class NT,
class IT>
126 for(IT i=0; i<ncsb; ++i)
134 blcrange = rhs.blcrange;
135 mortoncmp = rhs.mortoncmp;
137 nhighbits = rhs.nhighbits;
138 nlowbits = rhs.nlowbits;
139 highmask = rhs.highmask;
140 lowmask = rhs.lowmask;
141 diagonal = rhs.diagonal;
148 copy ( rhs.num, rhs.num+nz, num);
149 copy ( rhs.bot, rhs.bot+nz, bot );
153 top =
new IT* [ncsb];
154 for(IT i=0; i<ncsb; ++i)
155 top[i] =
new IT[ncsb-i+1];
156 for(IT i=0; i<ncsb; ++i)
157 for(IT j=0; j <= (ncsb-i); ++j)
158 top[i][j] = rhs.top[i][j];
164 template <
class NT,
class IT>
174 for(IT i=0; i<ncsb; ++i)
180 template <
class NT,
class IT>
183 typedef std::pair<IT, IT> ipair;
184 typedef std::pair<IT, ipair> mypair;
186 assert(nz != 0 && n != 0);
189 top =
new IT* [ncsb];
190 for(IT i=0; i<ncsb; ++i)
191 top[i] =
new IT[ncsb-i+1];
193 mypair * pairarray =
new mypair[nz];
195 for(IT j = 0; j < n; ++j)
197 for (IT i = csc.jc [j] ; i < csc.jc[j+1] ; ++i)
200 IT hindex = (((highmask & csc.ir[i] ) >> nlowbits) << nhighbits) | ((highmask & j) >> nlowbits);
201 IT lindex = ((lowmask & csc.ir[i]) << nlowbits) | (lowmask & j) ;
204 pairarray[k++] = mypair(hindex, ipair(lindex,i));
207 sort(pairarray, pairarray+nz);
208 SortBlocks(pairarray, csc.num);
212 template <
class NT,
class IT>
215 typedef pair<IT, pair<IT, IT> > mypair;
220 for(IT i = 0; i < ncsb; ++i)
222 for(IT j = 0; j < (ncsb-i); ++j)
224 top[i][j] = tempbot.size();
226 std::vector<mypair> blocknz;
227 while(cnz < nz && pairarray[cnz].first == ((i*ldim)+(j+i)) )
229 IT interlowbits = pairarray[cnz].second.first;
230 IT rlowbits = ((interlowbits >> nlowbits) & lowmask);
231 IT clowbits = (interlowbits & lowmask);
234 if(j == 0 && rlowbits == clowbits)
236 diagonal.push_back(make_pair((i << nlowbits)+rlowbits, val[pairarray[cnz++].second.second]));
240 blocknz.push_back(mypair(bikey, pairarray[cnz++].second));
244 sort(blocknz.begin(), blocknz.end());
245 typename vector<mypair>::iterator itr;
247 for( itr = blocknz.begin(); itr != blocknz.end(); ++itr)
249 tempbot.push_back( itr->second.first );
250 tempnum.push_back( val[itr->second.second] );
253 top[i][ncsb-i] = tempbot.size();
256 assert (cnz == (tempbot.size() + diagonal.size()));
261 copy(tempbot.begin(), tempbot.end(), bot);
262 copy(tempnum.begin(), tempnum.end(), num);
263 sort(diagonal.begin(), diagonal.end());
266 template<
class NT,
class IT>
273 lspace =
new IT[lsize];
274 rspace =
new IT[rsize];
275 for(IT i=0; i<rsize; ++i)
282 lspace[lsize-1] = size-1;
287 IT chunks = size / (2*d);
288 int rest = size - (2*d*chunks);
300 lspace =
new IT[lsize];
301 rspace =
new IT[rsize];
302 int remrest = (int) rest;
305 for(IT i=0; i<chunks; ++i)
307 lspace[2*i+0] = 4*i+0;
308 lspace[2*i+1] = 4*i+1;
309 rspace[2*i+0] = 4*i+2;
310 rspace[2*i+1] = 4*i+3;
312 if(remrest-- > 0) lspace[2*chunks+0] = 4*chunks+0;
313 if(remrest-- > 0) lspace[2*chunks+1] = 4*chunks+1;
314 if(remrest-- > 0) rspace[2*chunks+0] = 4*chunks+2;
318 for(IT i=0; i<chunks; ++i)
320 lspace[3*i+0] = 6*i+0;
321 lspace[3*i+1] = 6*i+1;
322 lspace[3*i+2] = 6*i+2;
323 rspace[3*i+0] = 6*i+3;
324 rspace[3*i+1] = 6*i+4;
325 rspace[3*i+2] = 6*i+5;
327 if(remrest-- > 0) lspace[3*chunks+0] = 6*chunks+0;
328 if(remrest-- > 0) lspace[3*chunks+1] = 6*chunks+1;
329 if(remrest-- > 0) lspace[3*chunks+2] = 6*chunks+2;
330 if(remrest-- > 0) rspace[3*chunks+0] = 6*chunks+3;
331 if(remrest-- > 0) rspace[3*chunks+1] = 6*chunks+4;
336 for(IT i=0; i<chunks; ++i)
338 lspace[4*i+0] = 8*i+0;
339 lspace[4*i+1] = 8*i+1;
340 lspace[4*i+2] = 8*i+2;
341 lspace[4*i+3] = 8*i+3;
342 rspace[4*i+0] = 8*i+4;
343 rspace[4*i+1] = 8*i+5;
344 rspace[4*i+2] = 8*i+6;
345 rspace[4*i+3] = 8*i+7;
347 if(remrest-- > 0) lspace[4*chunks+0] = 8*chunks+0;
348 if(remrest-- > 0) lspace[4*chunks+1] = 8*chunks+1;
349 if(remrest-- > 0) lspace[4*chunks+2] = 8*chunks+2;
350 if(remrest-- > 0) lspace[4*chunks+3] = 8*chunks+3;
351 if(remrest-- > 0) rspace[4*chunks+0] = 8*chunks+4;
352 if(remrest-- > 0) rspace[4*chunks+1] = 8*chunks+5;
353 if(remrest-- > 0) rspace[4*chunks+2] = 8*chunks+6;
357 cout <<
"Diagonal d = " << d <<
" is not yet supported" << endl;
362 template<
class NT,
class IT>
365 cilk_for(IT i=0; i< ncsb-d; ++i)
367 IT rhi = (i << nlowbits);
368 NT * __restrict suby = &y[rhi];
369 const NT * __restrict subx_mirror = &x[rhi];
371 cilk_for(IT j=d; j < (ncsb-i); ++j)
373 IT chi = ((j+i) << nlowbits);
374 const NT * __restrict subx = &x[chi];
375 NT * __restrict suby_mirror = &y[chi];
377 IT * __restrict r_bot = bot;
378 NT * __restrict r_num = num;
379 for(IT k=top[i][j]; k<top[i][j+1]; ++k)
384 IT rli = ((r_bot[k] >> nlowbits) & lowmask);
385 IT cli = (r_bot[k] & lowmask);
398 template <
class NT,
class IT>
403 cilk_for(IT i=0; i< ncsb; ++i)
405 IT hi = (i << nlowbits);
406 NT * __restrict suby = &y[hi];
407 const NT * __restrict subx = &x[hi];
409 if(i == (ncsb-1) && (n-hi) <= lowmask)
411 IT * __restrict r_bot = bot;
412 NT * __restrict r_num = num;
413 for(IT k=top[i][0]; k<top[i][1]; ++k)
418 IT rli = ((ind >> nlowbits) & lowmask);
419 IT cli = (ind & lowmask);
421 suby[rli] += val * subx[cli];
422 suby[cli] += val * subx[rli];
427 BlockTriPar(top[i][0], top[i][1], subx, suby, 0, blcrange,
BREAKEVEN * (nlowbits+1));
433 cilk_for(IT i=0; i< ncsb; ++i)
435 IT hi = (i << nlowbits);
436 NT * __restrict suby = &y[hi];
437 const NT * __restrict subx = &x[hi];
439 IT * __restrict r_bot = bot;
440 NT * __restrict r_num = num;
441 for(IT k=top[i][0]; k<top[i][1]; ++k)
446 IT rli = ((ind >> nlowbits) & lowmask);
447 IT cli = (ind & lowmask);
449 suby[rli] += val * subx[cli];
450 suby[cli] += val * subx[rli];
454 const IT diagsize = diagonal.size();
455 cilk_for(IT i=0; i < diagsize; ++i)
457 y[diagonal[i].first] += diagonal[i].second * x[diagonal[i].first];
464 template <
class NT,
class IT>
475 DivideIterationSpace(lspace, rspace, lsize, rsize, ncsb-d, d);
478 for(IT k=0; k<lsize; ++k)
480 lsum += top[lspace[k]][d+1] - top[lspace[k]][d];
482 for(IT k=0; k<rsize; ++k)
484 rsum += top[rspace[k]][d+1] - top[rspace[k]][d];
486 float lave = lsum / lsize;
487 float rave = rsum / rsize;
489 cilk_for(IT i=0; i< lsize; ++i)
491 IT rhi = (lspace[i] << nlowbits) ;
492 IT chi = ((lspace[i]+d) << nlowbits);
493 NT * __restrict suby = &y[rhi];
494 NT * __restrict suby_mirror = &y[chi];
495 const NT * __restrict subx = &x[chi];
496 const NT * __restrict subx_mirror = &x[rhi];
498 if((top[lspace[i]][d+1] - top[lspace[i]][d] >
BALANCETH * lave)
499 && (!(lspace[i] == (ncsb-d-1) && (n-chi) <= lowmask)))
501 BlockPar(top[lspace[i]][d], top[lspace[i]][d+1], subx, subx_mirror, suby, suby_mirror, 0, blcrange,
BREAKEVEN * (nlowbits+1));
505 IT * __restrict r_bot = bot;
506 NT * __restrict r_num = num;
507 for(IT k=top[lspace[i]][d]; k<top[lspace[i]][d+1]; ++k)
509 IT rli = ((r_bot[k] >> nlowbits) & lowmask);
510 IT cli = (r_bot[k] & lowmask);
511 suby[rli] += r_num[k] * subx[cli];
512 suby_mirror[cli] += r_num[k] * subx_mirror[rli];
517 cilk_for(IT j=0; j< rsize; ++j)
519 IT rhi = (rspace[j] << nlowbits) ;
520 IT chi = ((rspace[j]+d) << nlowbits);
521 NT * __restrict suby = &y[rhi];
522 NT * __restrict suby_mirror = &y[chi];
523 const NT * __restrict subx = &x[chi];
524 const NT * __restrict subx_mirror = &x[rhi];
526 if((top[rspace[j]][d+1] - top[rspace[j]][d] >
BALANCETH * rave)
527 && (!(rspace[j] == (ncsb-d-1) && (n-chi) <= lowmask)))
529 BlockPar(top[rspace[j]][d], top[rspace[j]][d+1], subx, subx_mirror, suby, suby_mirror, 0, blcrange,
BREAKEVEN * (nlowbits+1));
533 IT * __restrict r_bot = bot;
534 NT * __restrict r_num = num;
535 for(IT k=top[rspace[j]][d]; k<top[rspace[j]][d+1]; ++k)
537 IT rli = ((r_bot[k] >> nlowbits) & lowmask);
538 IT cli = (r_bot[k] & lowmask);
539 suby[rli] += r_num[k] * subx[cli];
540 suby_mirror[cli] += r_num[k] * subx_mirror[rli];
552 template <
class NT,
class IT>
554 IT rangebeg, IT rangeend, IT cutoff)
const
556 assert(
IsPower2(rangeend-rangebeg));
557 if(end - start < cutoff)
559 IT * __restrict r_bot = bot;
560 NT * __restrict r_num = num;
561 for(IT k=start; k<end; ++k)
566 IT rli = ((ind >> nlowbits) & lowmask);
567 IT cli = (ind & lowmask);
569 suby[rli] += val * subx[cli];
570 suby[cli] += val * subx[rli];
577 IT halfrange = (rangebeg+rangeend)/2;
578 IT qrt1range = (rangebeg+halfrange)/2;
579 IT qrt3range = (halfrange+rangeend)/2;
581 IT * mid = std::lower_bound(&bot[start], &bot[end], halfrange, mortoncmp);
582 IT * right = std::lower_bound(mid, &bot[end], qrt3range, mortoncmp);
590 IT size0 =
static_cast<IT
> (mid - &bot[start]);
591 IT size2 =
static_cast<IT
> (right - mid);
592 IT size3 =
static_cast<IT
> (&bot[end] - right);
596 cilk_spawn BlockTriPar(start, start+size0, subx, suby, rangebeg, qrt1range, ncutoff);
597 BlockTriPar(end-size3, end, subx, suby, qrt3range, rangeend, ncutoff);
600 BlockPar(start+size0, end-size3, subx, subx, suby, suby, halfrange, qrt3range, ncutoff);
610 template <
class NT,
class IT>
612 NT * __restrict suby, NT * __restrict suby_mirror, IT rangebeg, IT rangeend, IT cutoff)
const
614 assert(
IsPower2(rangeend-rangebeg));
615 if(end - start < cutoff)
617 IT * __restrict r_bot = bot;
618 NT * __restrict r_num = num;
619 for(IT k=start; k<end; ++k)
624 IT rli = ((ind >> nlowbits) & lowmask);
625 IT cli = (ind & lowmask);
627 suby[rli] += val * subx[cli];
628 suby_mirror[cli] += val * subx_mirror[rli];
635 IT halfrange = (rangebeg+rangeend)/2;
636 IT qrt1range = (rangebeg+halfrange)/2;
637 IT qrt3range = (halfrange+rangeend)/2;
639 IT * mid = std::lower_bound(&bot[start], &bot[end], halfrange, mortoncmp);
640 IT * left = std::lower_bound(&bot[start], mid, qrt1range, mortoncmp);
641 IT * right = std::lower_bound(mid, &bot[end], qrt3range, mortoncmp);
649 IT size0 =
static_cast<IT
> (left - &bot[start]);
650 IT size1 =
static_cast<IT
> (mid - left);
651 IT size2 =
static_cast<IT
> (right - mid);
652 IT size3 =
static_cast<IT
> (&bot[end] - right);
658 cilk_spawn BlockPar(start, start+size0, subx, subx_mirror, suby, suby_mirror, rangebeg, qrt1range, ncutoff);
659 BlockPar(end-size3, end, subx, subx_mirror, suby, suby_mirror, qrt3range, rangeend, ncutoff);
662 cilk_spawn BlockPar(start+size0, start+size0+size1, subx, subx_mirror, suby, suby_mirror, qrt1range, halfrange, ncutoff);
663 BlockPar(start+size0+size1, end-size3, subx, subx_mirror, suby, suby_mirror, halfrange, qrt3range, ncutoff);
671 template <
class NT,
class IT>
674 const IT diagsize = diagonal.size();
675 for(IT i=0; i < diagsize; ++i)
677 y[diagonal[i].first] += diagonal[i].second * x[diagonal[i].first];
679 for (IT i = 0 ; i < ncsb ; ++i)
681 IT rhi = (i << nlowbits);
683 const NT * subx_mirror = &x[rhi];
685 IT * __restrict r_bot = bot;
686 NT * __restrict r_num = num;
687 for (IT j = 0 ; j < (ncsb-i) ; ++j)
690 IT chi = ((j+i) << nlowbits);
691 const NT * __restrict subx = &x[chi];
692 NT * __restrict suby_mirror = &y[chi];
694 for(IT k=top[i][j]; k<top[i][j+1]; ++k)
696 IT rli = ((r_bot[k] >> nlowbits) & lowmask);
697 IT cli = (r_bot[k] & lowmask);
699 suby[rli] += val * subx[cli];
700 suby_mirror[cli] += val * subx_mirror[rli];
707 template <
class NT,
class IT>
716 IT * sums =
new IT[size];
717 for(
size_t i=0; i< size; ++i)
719 sums[i] = top[i][d+1] - top[i][d];
721 IT max = *max_element(sums, sums+size);
722 IT mean = accumulate(sums, sums+size, 0.0) / size;
725 return static_cast<float>(max) / mean;
730 template <
class NT,
class IT>
734 for(
size_t i=0; i< ncsb-d; ++i)
736 sum += (top[i][d+1] - top[i][d]);
742 template <
class NT,
class IT>
747 outfile <<
"## Matrix Doesn't have any nonzeros" <<endl;
750 const IT ntop = ncsb * ncsb;
752 outfile <<
"## Average block is of dimensions "<< lowmask+1 <<
"-by-" << lowmask+1 << endl;
753 outfile <<
"## Number of real blocks is "<< ntop << endl;
754 outfile <<
"## Main (0th) block diagonal imbalance: " << Imbalance(0) << endl;
755 outfile <<
"## 1st block diagonal imbalance: " << Imbalance(1) << endl;
756 outfile <<
"## 2nd block diagonal imbalance: " << Imbalance(2) << endl;
758 outfile <<
"## nnz ratios (block diagonal 0,1,2): " <<
static_cast<float>(nzsum(0)) / nz <<
", "
759 << static_cast<float>(nzsum(1)) / nz <<
", " << static_cast<float>(nzsum(2)) / nz << endl;
760 outfile <<
"## atomics ratio: " <<
static_cast<float>(nz-nzsum(0)-nzsum(1)-nzsum(2))/nz << endl;
762 std::vector<int> blocksizes;
763 for(IT i=0; i<ncsb; ++i)
765 for(IT j=0; j < (ncsb-i); ++j)
767 blocksizes.push_back(static_cast<int> (top[i][j+1]-top[i][j]));
770 sort(blocksizes.begin(), blocksizes.end());
771 outfile<<
"## Total number of nonzeros: " << 2*nz +diagonal.size()<< endl;
772 outfile<<
"## Total number of stored nonzeros: "<< nz+diagonal.size() << endl;
773 outfile<<
"## Size of diagonal: " << diagonal.size() << endl;
775 outfile <<
"## Nonzero distribution (sorted) of blocks follows: \n" ;
776 std::copy(blocksizes.begin(), blocksizes.end(), ostream_iterator<int>(outfile,
"\n"));
782 template <
class NT,
class IT>
785 for(
typename vector< pair<IT,NT> >::const_iterator itr = diagonal.begin(); itr != diagonal.end(); ++itr)
787 outfile << itr->first <<
" " << itr->second <<
"\n";
789 for(IT i =0; i<ncsb; ++i)
791 for(IT j=0; j< (ncsb-i); ++j)
793 outfile <<
"Dumping A.top(" << i <<
"," << j <<
")" << endl;
794 for(IT k=top[i][j]; k< top[i][j+1]; ++k)
796 IT rli = ((bot[k] >> nlowbits) & lowmask);
797 IT cli = bot[k] & lowmask;
798 outfile <<
"A(" << rli <<
"," << cli <<
")=" << num[k] << endl;
unsigned int nextpoweroftwo(unsigned int v)
ofstream & Dump(ofstream &outfile) const
CsbSym< NT, IT > & operator=(const CsbSym< NT, IT > &rhs)
ofstream & PrintStats(ofstream &outfile) const
void atomicallyIncrementDouble(volatile double *target, const double by)
ITYPE BitInterleaveLow(ITYPE x, ITYPE y)
unsigned IntPower< 2 >(unsigned exponent)
unsigned int highestbitset(unsigned __int64 v)