9 template <
class NT,
class IT>
12 ispar = (workers > 1);
28 sizereq = ((rowbits > 1) && (colbits > 1));
33 cerr <<
"Matrix too small for this library" << endl;
37 rowlowbits = rowbits-1;
38 collowbits = colbits-1;
39 IT inf = numeric_limits<IT>::max();
42 rowhighbits = rowbits-rowlowbits;
43 colhighbits = colbits-collowbits;
72 while(rowlowbits+collowbits > maxbits)
74 if(rowlowbits > collowbits)
85 while(rowlowbits > collowbits)
90 while(rowlowbits < collowbits)
95 assert (collowbits == rowlowbits);
102 cout <<
"Forcing beta to "<< (candlowmask+1) <<
" instead of the chosen " << (lowrowmask+1) << endl;
103 cout <<
"Warning : No checks are performed on the beta you have forced, anything can happen !" << endl;
104 lowrowmask = lowcolmask = candlowmask;
105 rowlowbits = collowbits = forcelogbeta;
106 rowhighbits = rowbits-rowlowbits;
107 colhighbits = colbits-collowbits;
111 double sqrtn = sqrt(sqrt(static_cast<double>(m) * static_cast<double>(n)));
112 IT logbeta =
static_cast<IT
>(ceil(log2(sqrtn))) + 2;
113 if(rowlowbits > logbeta)
115 rowlowbits = collowbits = logbeta;
117 rowhighbits = rowbits-rowlowbits;
118 colhighbits = colbits-collowbits;
120 cout <<
"Beta chosen to be "<< (lowrowmask+1) << endl;
122 highrowmask = ((roundrowup - 1) ^ lowrowmask);
123 highcolmask = ((roundcolup - 1) ^ lowcolmask);
126 IT blcdimrow = lowrowmask + 1;
127 IT blcdimcol = lowcolmask + 1;
128 nbr =
static_cast<IT
>(ceil(static_cast<double>(m) / static_cast<double>(blcdimrow)));
129 nbc =
static_cast<IT
>(ceil(static_cast<double>(n) / static_cast<double>(blcdimcol)));
131 blcrange = (lowrowmask+1) * (lowcolmask+1);
141 ispar = (workers > 1);
157 sizereq = ((rowbits > 1) && (colbits > 1));
162 cerr <<
"Matrix too small for this library" << endl;
166 rowlowbits = rowbits-1;
167 collowbits = colbits-1;
168 IT inf = numeric_limits<IT>::max();
171 rowhighbits = rowbits-rowlowbits;
172 colhighbits = colbits-collowbits;
183 while(rowlowbits+collowbits > maxbits)
185 if(rowlowbits > collowbits)
196 while(rowlowbits > collowbits)
201 while(rowlowbits < collowbits)
206 assert (collowbits == rowlowbits);
210 if(forcelogbeta != 0)
213 cout <<
"Forcing beta to "<< (candlowmask+1) <<
" instead of the chosen " << (lowrowmask+1) << endl;
214 cout <<
"Warning : No checks are performed on the beta you have forced, anything can happen !" << endl;
215 lowrowmask = lowcolmask = candlowmask;
216 rowlowbits = collowbits = forcelogbeta;
217 rowhighbits = rowbits-rowlowbits;
218 colhighbits = colbits-collowbits;
222 double sqrtn = sqrt(sqrt(static_cast<double>(m) * static_cast<double>(n)));
223 IT logbeta =
static_cast<IT
>(ceil(log2(sqrtn))) + 2;
224 if(rowlowbits > logbeta)
226 rowlowbits = collowbits = logbeta;
228 rowhighbits = rowbits-rowlowbits;
229 colhighbits = colbits-collowbits;
231 cout <<
"Beta chosen to be "<< (lowrowmask+1) << endl;
233 highrowmask = ((roundrowup - 1) ^ lowrowmask);
234 highcolmask = ((roundcolup - 1) ^ lowcolmask);
237 IT blcdimrow = lowrowmask + 1;
238 IT blcdimcol = lowcolmask + 1;
239 nbr =
static_cast<IT
>(ceil(static_cast<double>(m) / static_cast<double>(blcdimrow)));
240 nbc =
static_cast<IT
>(ceil(static_cast<double>(n) / static_cast<double>(blcdimcol)));
242 blcrange = (lowrowmask+1) * (lowcolmask+1);
248 template <
class NT,
class IT>
251 assert(nz != 0 && n != 0 && m != 0);
256 top = allocate2D<IT>(nbr, nbc+1);
263 assert(nz != 0 && n != 0 && m != 0);
266 top = allocate2D<IT>(nbr, nbc+1);
270 template <
class NT,
class IT>
272 : nz(rhs.nz), m(rhs.m), n(rhs.n), blcrange(rhs.blcrange), nbr(rhs.nbr), nbc(rhs.nbc),
273 rowhighbits(rhs.rowhighbits), rowlowbits(rhs.rowlowbits), highrowmask(rhs.highrowmask), lowrowmask(rhs.lowrowmask),
274 colhighbits(rhs.colhighbits), collowbits(rhs.collowbits), highcolmask(rhs.highcolmask), lowcolmask(rhs.lowcolmask),
275 mortoncmp(rhs.mortoncmp), ispar(rhs.ispar)
282 copy (rhs.num, rhs.num + nz, num);
283 copy (rhs.bot, rhs.bot + nz, bot);
287 top = allocate2D<IT>(nbr, nbc+1);
288 for(IT i=0; i<nbr; ++i)
289 copy (rhs.top[i], rhs.top[i] + nbc + 1, top[i]);
296 : nz(rhs.nz), m(rhs.m), n(rhs.n), blcrange(rhs.blcrange), nbr(rhs.nbr), nbc(rhs.nbc),
297 rowhighbits(rhs.rowhighbits), rowlowbits(rhs.rowlowbits), highrowmask(rhs.highrowmask), lowrowmask(rhs.lowrowmask),
298 colhighbits(rhs.colhighbits), collowbits(rhs.collowbits), highcolmask(rhs.highcolmask), lowcolmask(rhs.lowcolmask),
299 mortoncmp(rhs.mortoncmp), ispar(rhs.ispar)
304 copy (rhs.bot, rhs.bot + nz, bot);
308 top = allocate2D<IT>(nbr, nbc+1);
309 for(IT i=0; i<nbr; ++i)
310 copy (rhs.top[i], rhs.top[i] + nbc + 1, top[i]);
314 template <
class NT,
class IT>
334 blcrange = rhs.blcrange;
335 rowhighbits = rhs.rowhighbits;
336 rowlowbits = rhs.rowlowbits;
337 highrowmask = rhs.highrowmask;
338 lowrowmask = rhs.lowrowmask;
339 colhighbits = rhs.colhighbits;
340 collowbits = rhs.collowbits;
341 highcolmask = rhs.highcolmask;
342 lowcolmask= rhs.lowcolmask;
343 mortoncmp = rhs.mortoncmp;
348 copy (rhs.num, rhs.num + nz, num);
349 copy (rhs.bot, rhs.bot + nz, bot);
353 top = allocate2D<IT>(nbr, nbc+1);
354 for(IT i=0; i<nbr; ++i)
355 copy (rhs.top[i], rhs.top[i] + nbc + 1, top[i]);
380 blcrange = rhs.blcrange;
381 rowhighbits = rhs.rowhighbits;
382 rowlowbits = rhs.rowlowbits;
383 highrowmask = rhs.highrowmask;
384 lowrowmask = rhs.lowrowmask;
385 colhighbits = rhs.colhighbits;
386 collowbits = rhs.collowbits;
387 highcolmask = rhs.highcolmask;
388 lowcolmask= rhs.lowcolmask;
389 mortoncmp = rhs.mortoncmp;
393 copy (rhs.bot, rhs.bot + nz, bot);
397 top = allocate2D<IT>(nbr, nbc+1);
398 for(IT i=0; i<nbr; ++i)
399 copy (rhs.top[i], rhs.top[i] + nbc + 1, top[i]);
405 template <
class NT,
class IT>
432 template <
class NT,
class IT>
435 typedef std::pair<IT, IT> ipair;
436 typedef std::pair<IT, ipair> mypair;
437 assert(nz != 0 && n != 0 && m != 0);
438 if(forcelogbeta == 0)
441 Init(workers, forcelogbeta);
445 top = allocate2D<IT>(nbr, nbc+1);
446 mypair * pairarray =
new mypair[nz];
448 for(IT j = 0; j < n; ++j)
450 for (IT i = csc.jc [j] ; i < csc.jc[j+1] ; ++i)
453 IT hindex = (((highrowmask & csc.ir[i] ) >> rowlowbits) << colhighbits)
454 | ((highcolmask & j) >> collowbits);
455 IT lindex = ((lowrowmask & csc.ir[i]) << collowbits) | (lowcolmask & j) ;
458 pairarray[k++] = mypair(hindex, ipair(lindex,i));
461 sort(pairarray, pairarray+nz);
462 SortBlocks(pairarray, csc.num);
467 template <
typename NT>
470 typedef std::pair<IT, IT> ipair;
471 typedef std::pair<IT, ipair> mypair;
472 assert(nz != 0 && n != 0 && m != 0);
476 top = allocate2D<IT>(nbr, nbc+1);
477 mypair * pairarray =
new mypair[nz];
479 for(IT j = 0; j < n; ++j)
481 for (IT i = csc.jc [j] ; i < csc.jc[j+1] ; ++i)
484 IT hindex = (((highrowmask & csc.ir[i] ) >> rowlowbits) << colhighbits)
485 | ((highcolmask & j) >> collowbits);
486 IT lindex = ((lowrowmask & csc.ir[i]) << collowbits) | (lowcolmask & j) ;
489 pairarray[k++] = mypair(hindex, ipair(lindex,i));
492 sort(pairarray, pairarray+nz);
493 SortBlocks(pairarray);
498 template <
class NT,
class IT>
500 :nz(size),m(rows),n(cols)
502 typedef std::pair<IT, IT> ipair;
503 typedef std::pair<IT, ipair> mypair;
504 assert(nz != 0 && n != 0 && m != 0);
505 Init(workers, forcelogbeta);
509 top = allocate2D<IT>(nbr, nbc+1);
510 mypair * pairarray =
new mypair[nz];
511 for(IT k = 0; k < nz; ++k)
514 IT hindex = (((highrowmask & ri[k] ) >> rowlowbits) << colhighbits) | ((highcolmask & ci[k]) >> collowbits);
515 IT lindex = ((lowrowmask & ri[k]) << collowbits) | (lowcolmask & ci[k]) ;
518 pairarray[k] = mypair(hindex, ipair(lindex, k));
520 sort(pairarray, pairarray+nz);
521 SortBlocks(pairarray, val);
527 :nz(size),m(rows),n(cols)
529 typedef std::pair<IT, IT> ipair;
530 typedef std::pair<IT, ipair> mypair;
531 assert(nz != 0 && n != 0 && m != 0);
532 Init(workers, forcelogbeta);
535 top = allocate2D<IT>(nbr, nbc+1);
536 mypair * pairarray =
new mypair[nz];
537 for(IT k = 0; k < nz; ++k)
540 IT hindex = (((highrowmask & ri[k] ) >> rowlowbits) << colhighbits) | ((highcolmask & ci[k]) >> collowbits);
541 IT lindex = ((lowrowmask & ri[k]) << collowbits) | (lowcolmask & ci[k]) ;
544 pairarray[k] = mypair(hindex, ipair(lindex, k));
546 sort(pairarray, pairarray+nz);
547 SortBlocks(pairarray);
551 template <
class NT,
class IT>
554 typedef typename std::pair<IT, std::pair<IT, IT> > mypair;
557 for(IT i = 0; i < nbr; ++i)
559 for(IT j = 0; j < nbc; ++j)
563 vector< mypair > blocknz;
564 while(cnz < nz && pairarray[cnz].first == ((i*ldim)+j) )
566 IT lowbits = pairarray[cnz].second.first;
567 IT rlowbits = ((lowbits >> collowbits) & lowrowmask);
568 IT clowbits = (lowbits & lowcolmask);
571 blocknz.push_back(mypair(bikey, pairarray[cnz++].second));
574 sort(blocknz.begin(), blocknz.end());
576 for(IT k=prevcnz; k<cnz ; ++k)
578 bot[k] = blocknz[k-prevcnz].second.first;
579 num[k] = val[blocknz[k-prevcnz].second.second];
590 typedef pair<IT, pair<IT, IT> > mypair;
593 for(IT i = 0; i < nbr; ++i)
595 for(IT j = 0; j < nbc; ++j)
599 std::vector<mypair> blocknz;
600 while(cnz < nz && pairarray[cnz].first == ((i*ldim)+j) )
602 IT lowbits = pairarray[cnz].second.first;
603 IT rlowbits = ((lowbits >> collowbits) & lowrowmask);
604 IT clowbits = (lowbits & lowcolmask);
607 blocknz.push_back(mypair(bikey, pairarray[cnz++].second));
610 sort(blocknz.begin(), blocknz.end());
612 for(IT k=prevcnz; k<cnz ; ++k)
613 bot[k] = blocknz[k-prevcnz].second.first;
626 template <
class NT,
class IT>
627 template <
typename SR,
typename RHS,
typename LHS>
628 void BiCsb<NT, IT>::BMult(IT** chunks, IT start, IT end,
const RHS * __restrict x, LHS * __restrict y, IT ysize)
const
630 assert(end-start > 0);
633 if((chunks[end] - chunks[start]) == 1)
635 IT chi = ( (chunks[start] - chunks[0]) << collowbits);
639 if(ysize == (lowrowmask+1) && (m-chi) > lowcolmask )
641 const RHS * __restrict subx = &x[chi];
642 BlockPar<SR>( *(chunks[start]) , *(chunks[end]), subx, y, 0, blcrange,
BREAKEVEN * ysize);
646 SubSpMV<SR>(chunks[0], chunks[start]-chunks[0], chunks[end]-chunks[0], x, y);
651 SubSpMV<SR>(chunks[0], chunks[start]-chunks[0], chunks[end]-chunks[0], x, y);
657 IT mid = (start+end)/2;
659 cilk_spawn BMult<SR>(chunks, start, mid, x, y, ysize);
662 BMult<SR>(chunks, mid, end, x, y, ysize);
666 LHS * temp =
new LHS[ysize]();
672 BMult<SR>(chunks, mid, end, x, temp, ysize);
676 for(IT i=0; i<ysize; ++i)
677 SR::axpy(temp[i], y[i]);
686 template <
typename SR,
typename RHS,
typename LHS>
687 void BiCsb<bool, IT>::BMult(IT** chunks, IT start, IT end,
const RHS * __restrict x, LHS * __restrict y, IT ysize)
const
689 assert(end-start > 0);
692 if((chunks[end] - chunks[start]) == 1)
694 IT chi = ( (chunks[start] - chunks[0]) << collowbits);
698 if(ysize == (lowrowmask+1) && (m-chi) > lowcolmask )
700 const RHS * __restrict subx = &x[chi];
701 BlockPar<SR>( *(chunks[start]) , *(chunks[end]), subx, y, 0, blcrange,
BREAKEVEN * ysize);
705 SubSpMV<SR>(chunks[0], chunks[start]-chunks[0], chunks[end]-chunks[0], x, y);
710 SubSpMV<SR>(chunks[0], chunks[start]-chunks[0], chunks[end]-chunks[0], x, y);
716 IT mid = (start+end)/2;
718 cilk_spawn BMult<SR>(chunks, start, mid, x, y, ysize);
721 BMult<SR>(chunks, mid, end, x, y, ysize);
725 LHS * temp =
new LHS[ysize]();
731 BMult<SR>(chunks, mid, end, x, temp, ysize);
735 for(IT i=0; i<ysize; ++i)
736 SR::axpy(temp[i], y[i]);
751 template <
class NT,
class IT>
752 template <
typename SR,
typename RHS,
typename LHS>
753 void BiCsb<NT, IT>::BTransMult(vector< vector< tuple<IT,IT,IT> > * > & chunks, IT start, IT end,
const RHS * __restrict x, LHS * __restrict y, IT ysize)
const
758 assert(end-start > 0);
761 if(chunks[start]->size() == 1)
764 auto block = chunks[start]->front();
765 IT chi = ( get<2>(block) << rowlowbits);
771 if(ysize == (lowrowmask+1) && (m-chi) > lowrowmask && (get<1>(block)-get<0>(block)) >
BREAKEVEN * ysize)
773 const RHS * __restrict subx = &x[chi];
774 BlockParT<SR>( get<0>(block) , get<1>(block), subx, y, 0, blcrange,
BREAKEVEN * ysize);
778 SubSpMVTrans<SR>(*(chunks[start]), x, y);
783 SubSpMVTrans<SR>(*(chunks[start]), x, y);
788 IT mid = (start+end)/2;
789 cilk_spawn BTransMult<SR>(chunks, start, mid, x, y, ysize);
792 BTransMult<SR>(chunks, mid, end, x, y, ysize);
796 LHS * temp =
new LHS[ysize]();
797 BTransMult<SR>(chunks, mid, end, x, temp, ysize);
801 for(IT i=0; i<ysize; ++i)
802 SR::axpy(temp[i], y[i]);
811 template <
typename SR,
typename RHS,
typename LHS>
812 void BiCsb<bool, IT>::BTransMult(vector< vector< tuple<IT,IT,IT> > * > & chunks, IT start, IT end,
const RHS * __restrict x, LHS * __restrict y, IT ysize)
const
814 assert(end-start > 0);
817 if(chunks[start]->size() == 1)
820 auto block = chunks[start]->front();
821 IT chi = ( get<2>(block) << rowlowbits);
825 if(ysize == (lowrowmask+1) && (m-chi) > lowrowmask )
827 const RHS * __restrict subx = &x[chi];
828 BlockParT<SR>( get<0>(block) , get<1>(block), subx, y, 0, blcrange,
BREAKEVEN * ysize);
832 SubSpMVTrans<SR>(*(chunks[start]), x, y);
837 SubSpMVTrans<SR>(*(chunks[start]), x, y);
842 IT mid = (start+end)/2;
843 cilk_spawn BTransMult<SR>(chunks, start, mid, x, y, ysize);
846 BTransMult<SR>(chunks, mid, end, x, y, ysize);
850 LHS * temp =
new LHS[ysize]();
851 BTransMult<SR>(chunks, mid, end, x, temp, ysize);
855 for(IT i=0; i<ysize; ++i)
856 SR::axpy(temp[i], y[i]);
865 template <
class NT,
class IT>
866 template <
typename SR,
typename RHS,
typename LHS>
867 void BiCsb<NT, IT>::SubSpMV(IT * __restrict btop, IT bstart, IT bend,
const RHS * __restrict x, LHS * __restrict suby)
const
869 IT * __restrict r_bot = bot;
870 NT * __restrict r_num = num;
872 __m128i lcms = _mm_set1_epi32 (lowcolmask);
873 __m128i lrms = _mm_set1_epi32 (lowrowmask);
875 for (IT j = bstart ; j < bend ; ++j)
878 IT chi = (j << collowbits);
879 const RHS * __restrict subx = &x[chi];
883 IT range = (btop[j+1]-btop[j]) >> 2;
887 for (IT k = 0 ; k < range ; ++k)
891 #define ALIGN16 __attribute__((aligned(16)))
893 IT ALIGN16 rli4[4]; IT ALIGN16 cli4[4];
894 NT ALIGN16 x4[4]; NT ALIGN16 y4[4];
897 IT pin = start + (k << 2);
899 __m128i bots = _mm_loadu_si128((__m128i*) &r_bot[pin]);
900 __m128i clis = _mm_and_si128( bots, lcms);
901 __m128i rlis = _mm_and_si128( _mm_srli_epi32(bots, collowbits), lrms);
902 _mm_store_si128 ((__m128i*) cli4, clis);
903 _mm_store_si128 ((__m128i*) rli4, rlis);
905 x4[0] = subx[cli4[0]];
906 x4[1] = subx[cli4[1]];
907 x4[2] = subx[cli4[2]];
908 x4[3] = subx[cli4[3]];
910 __m128d Y01QW = _mm_mul_pd((__m128d)_mm_loadu_pd(&r_num[pin]), (__m128d)_mm_load_pd(&x4[0]));
911 __m128d Y23QW = _mm_mul_pd((__m128d)_mm_loadu_pd(&r_num[pin+2]), (__m128d)_mm_load_pd(&x4[2]));
913 _mm_store_pd(&y4[0],Y01QW);
914 _mm_store_pd(&y4[2],Y23QW);
916 suby[rli4[0]] += y4[0];
917 suby[rli4[1]] += y4[1];
918 suby[rli4[2]] += y4[2];
919 suby[rli4[3]] += y4[3];
921 for(IT k=start+4*range; k<btop[j+1]; ++k)
923 IT rli = ((r_bot[k] >> collowbits) & lowrowmask);
924 IT cli = (r_bot[k] & lowcolmask);
925 SR::axpy(r_num[k], subx[cli], suby[rli]);
931 for(IT k=btop[j]; k<btop[j+1]; ++k)
933 IT rli = ((r_bot[k] >> collowbits) & lowrowmask);
934 IT cli = (r_bot[k] & lowcolmask);
935 SR::axpy(r_num[k], subx[cli], suby[rli]);
945 template <
typename SR,
typename RHS,
typename LHS>
948 IT * __restrict r_bot = bot;
949 for (IT j = bstart ; j < bend ; ++j)
952 IT chi = (j << collowbits);
953 const RHS * __restrict subx = &x[chi];
954 for (IT k = btop[j] ; k < btop[j+1] ; ++k)
956 IT rli = ((r_bot[k] >> collowbits) & lowrowmask);
957 IT cli = (r_bot[k] & lowcolmask);
958 SR::axpy(subx[cli], suby[rli]);
964 template <
class NT,
class IT>
965 template <
typename SR,
typename RHS,
typename LHS>
968 IT * __restrict r_bot = bot;
969 NT * __restrict r_num = num;
970 for(
auto itr = chunk.begin(); itr != chunk.end(); ++itr)
973 IT chi = ( get<2>(*itr) << rowlowbits);
974 const RHS * __restrict subx = &x[chi];
976 IT nzbeg = get<0>(*itr);
977 IT nzend = get<1>(*itr);
979 for (IT k = nzbeg ; k < nzend ; ++k)
982 IT cli = ((r_bot[k] >> collowbits) & lowrowmask);
983 IT rli = (r_bot[k] & lowcolmask);
984 SR::axpy(r_num[k], subx[cli], suby[rli]);
991 template <
typename SR,
typename RHS,
typename LHS>
994 IT * __restrict r_bot = bot;
995 for(
auto itr = chunk.begin(); itr != chunk.end(); ++itr)
998 IT chi = ( get<2>(*itr) << rowlowbits);
999 const RHS * __restrict subx = &x[chi];
1001 IT nzbeg = get<0>(*itr);
1002 IT nzend = get<1>(*itr);
1004 for (IT k = nzbeg ; k < nzend ; ++k)
1007 IT cli = ((r_bot[k] >> collowbits) & lowrowmask);
1008 IT rli = (r_bot[k] & lowcolmask);
1009 SR::axpy(subx[cli], suby[rli]);
1014 template <
class NT,
class IT>
1015 template <
typename SR,
typename RHS,
typename LHS>
1018 IT * __restrict r_bot = bot;
1019 NT * __restrict r_num = num;
1020 for(IT i= rowstart; i < rowend; ++i)
1023 IT chi = (i << rowlowbits);
1024 const RHS * __restrict subx = &x[chi];
1026 for (IT k = top[i][col] ; k < top[i][col+1] ; ++k)
1029 IT cli = ((r_bot[k] >> collowbits) & lowrowmask);
1030 IT rli = (r_bot[k] & lowcolmask);
1031 SR::axpy(r_num[k], subx[cli], suby[rli]);
1038 template <
typename SR,
typename RHS,
typename LHS>
1041 IT * __restrict r_bot = bot;
1042 for(IT i= rowstart; i < rowend; ++i)
1045 IT chi = (i << rowlowbits);
1046 const RHS * __restrict subx = &x[chi];
1047 for (IT k = top[i][col] ; k < top[i][col+1] ; ++k)
1050 IT cli = ((r_bot[k] >> collowbits) & lowrowmask);
1051 IT rli = (r_bot[k] & lowcolmask);
1052 SR::axpy(subx[cli], suby[rli]);
1063 template <
class NT,
class IT>
1064 template <
typename SR,
typename RHS,
typename LHS>
1066 IT rangebeg, IT rangeend, IT cutoff)
const
1068 assert(
IsPower2(rangeend-rangebeg));
1069 if(end - start < cutoff)
1071 IT * __restrict r_bot = bot;
1072 NT * __restrict r_num = num;
1073 for (IT k = start ; k < end ; ++k)
1075 IT rli = ((r_bot[k] >> collowbits) & lowrowmask);
1076 IT cli = (r_bot[k] & lowcolmask);
1077 SR::axpy(r_num[k], subx[cli], suby[rli]);
1084 IT halfrange = (rangebeg+rangeend)/2;
1085 IT qrt1range = (rangebeg+halfrange)/2;
1086 IT qrt3range = (halfrange+rangeend)/2;
1088 IT * mid = std::lower_bound(&bot[start], &bot[end], halfrange, mortoncmp);
1089 IT * left = std::lower_bound(&bot[start], mid, qrt1range, mortoncmp);
1090 IT * right = std::lower_bound(mid, &bot[end], qrt3range, mortoncmp);
1098 IT size0 =
static_cast<IT
> (left - &bot[start]);
1099 IT size1 =
static_cast<IT
> (mid - left);
1100 IT size2 =
static_cast<IT
> (right - mid);
1101 IT size3 =
static_cast<IT
> (&bot[end] - right);
1110 cilk_spawn BlockPar<SR>(start, start+size0, subx, suby, rangebeg, qrt1range, ncutoff);
1111 BlockPar<SR>(end-size3, end, subx, suby, qrt3range, rangeend, ncutoff);
1114 cilk_spawn BlockPar<SR>(start+size0, start+size0+size1, subx, suby, qrt1range, halfrange, ncutoff);
1115 BlockPar<SR>(start+size0+size1, end-size3, subx, suby, halfrange, qrt3range, ncutoff);
1120 cilk_spawn BlockPar<SR>(start, start+size0, subx, suby, rangebeg, qrt1range, ncutoff);
1121 BlockPar<SR>(start+size0, start+size0+size1, subx, suby, qrt1range, halfrange, ncutoff);
1124 cilk_spawn BlockPar<SR>(start+size0+size1, end-size3, subx, suby, halfrange, qrt3range, ncutoff);
1125 BlockPar<SR>(end-size3, end, subx, suby, qrt3range, rangeend, ncutoff);
1133 template <
typename SR,
typename RHS,
typename LHS>
1135 IT rangebeg, IT rangeend, IT cutoff)
const
1137 assert(
IsPower2(rangeend-rangebeg));
1138 if(end - start < cutoff)
1140 IT * __restrict r_bot = bot;
1141 for (IT k = start ; k < end ; ++k)
1143 IT rli = ((r_bot[k] >> collowbits) & lowrowmask);
1144 IT cli = (r_bot[k] & lowcolmask);
1145 SR::axpy(subx[cli], suby[rli]);
1152 IT halfrange = (rangebeg+rangeend)/2;
1153 IT qrt1range = (rangebeg+halfrange)/2;
1154 IT qrt3range = (halfrange+rangeend)/2;
1156 IT * mid = std::lower_bound(&bot[start], &bot[end], halfrange, mortoncmp);
1157 IT * left = std::lower_bound(&bot[start], mid, qrt1range, mortoncmp);
1158 IT * right = std::lower_bound(mid, &bot[end], qrt3range, mortoncmp);
1166 IT size0 =
static_cast<IT
> (left - &bot[start]);
1167 IT size1 =
static_cast<IT
> (mid - left);
1168 IT size2 =
static_cast<IT
> (right - mid);
1169 IT size3 =
static_cast<IT
> (&bot[end] - right);
1178 cilk_spawn BlockPar<SR>(start, start+size0, subx, suby, rangebeg, qrt1range, ncutoff);
1179 BlockPar<SR>(end-size3, end, subx, suby, qrt3range, rangeend, ncutoff);
1182 cilk_spawn BlockPar<SR>(start+size0, start+size0+size1, subx, suby, qrt1range, halfrange, ncutoff);
1183 BlockPar<SR>(start+size0+size1, end-size3, subx, suby, halfrange, qrt3range, ncutoff);
1188 cilk_spawn BlockPar<SR>(start, start+size0, subx, suby, rangebeg, qrt1range, ncutoff);
1189 BlockPar<SR>(start+size0, start+size0+size1, subx, suby, qrt1range, halfrange, ncutoff);
1192 cilk_spawn BlockPar<SR>(start+size0+size1, end-size3, subx, suby, halfrange, qrt3range, ncutoff);
1193 BlockPar<SR>(end-size3, end, subx, suby, qrt3range, rangeend, ncutoff);
1202 template <
class NT,
class IT>
1203 template <
typename SR,
typename RHS,
typename LHS>
1205 IT rangebeg, IT rangeend, IT cutoff)
const
1207 if(end - start < cutoff)
1209 IT * __restrict r_bot = bot;
1210 NT * __restrict r_num = num;
1211 for (IT k = start ; k < end ; ++k)
1214 IT cli = ((r_bot[k] >> collowbits) & lowrowmask);
1215 IT rli = (r_bot[k] & lowcolmask);
1216 SR::axpy(r_num[k], subx[cli], suby[rli]);
1221 IT halfrange = (rangebeg+rangeend)/2;
1222 IT qrt1range = (rangebeg+halfrange)/2;
1223 IT qrt3range = (halfrange+rangeend)/2;
1227 IT * mid = std::lower_bound(&bot[start], &bot[end], halfrange, mortoncmp);
1228 IT * left = std::lower_bound(&bot[start], mid, qrt1range, mortoncmp);
1229 IT * right = std::lower_bound(mid, &bot[end], qrt3range, mortoncmp);
1237 IT size0 =
static_cast<IT
> (left - &bot[start]);
1238 IT size1 =
static_cast<IT
> (mid - left);
1239 IT size2 =
static_cast<IT
> (right - mid);
1240 IT size3 =
static_cast<IT
> (&bot[end] - right);
1249 cilk_spawn BlockParT<SR>(start, start+size0, subx, suby, rangebeg, qrt1range, ncutoff);
1250 BlockParT<SR>(end-size3, end, subx, suby, qrt3range, rangeend, ncutoff);
1253 cilk_spawn BlockParT<SR>(start+size0, start+size0+size1, subx, suby, qrt1range, halfrange, ncutoff);
1254 BlockParT<SR>(start+size0+size1, end-size3, subx, suby, halfrange, qrt3range, ncutoff);
1259 cilk_spawn BlockParT<SR>(start, start+size0, subx, suby, rangebeg, qrt1range, ncutoff);
1260 BlockParT<SR>(start+size0+size1, end-size3, subx, suby, halfrange, qrt3range, ncutoff);
1263 cilk_spawn BlockParT<SR>(start+size0, start+size0+size1, subx, suby, qrt1range, halfrange, ncutoff);
1264 BlockParT<SR>(end-size3, end, subx, suby, qrt3range, rangeend, ncutoff);
1272 template <
typename SR,
typename RHS,
typename LHS>
1274 IT rangebeg, IT rangeend, IT cutoff)
const
1276 if(end - start < cutoff)
1278 IT * __restrict r_bot = bot;
1279 for (IT k = start ; k < end ; ++k)
1282 IT cli = ((r_bot[k] >> collowbits) & lowrowmask);
1283 IT rli = (r_bot[k] & lowcolmask);
1284 SR::axpy(subx[cli], suby[rli]);
1289 IT halfrange = (rangebeg+rangeend)/2;
1290 IT qrt1range = (rangebeg+halfrange)/2;
1291 IT qrt3range = (halfrange+rangeend)/2;
1295 IT * mid = std::lower_bound(&bot[start], &bot[end], halfrange, mortoncmp);
1296 IT * left = std::lower_bound(&bot[start], mid, qrt1range, mortoncmp);
1297 IT * right = std::lower_bound(mid, &bot[end], qrt3range, mortoncmp);
1305 IT size0 =
static_cast<IT
> (left - &bot[start]);
1306 IT size1 =
static_cast<IT
> (mid - left);
1307 IT size2 =
static_cast<IT
> (right - mid);
1308 IT size3 =
static_cast<IT
> (&bot[end] - right);
1317 cilk_spawn BlockParT<SR>(start, start+size0, subx, suby, rangebeg, qrt1range, ncutoff);
1318 BlockParT<SR>(end-size3, end, subx, suby, qrt3range, rangeend, ncutoff);
1321 cilk_spawn BlockParT<SR>(start+size0, start+size0+size1, subx, suby, qrt1range, halfrange, ncutoff);
1322 BlockParT<SR>(start+size0+size1, end-size3, subx, suby, halfrange, qrt3range, ncutoff);
1327 cilk_spawn BlockParT<SR>(start, start+size0, subx, suby, rangebeg, qrt1range, ncutoff);
1328 BlockParT<SR>(start+size0+size1, end-size3, subx, suby, halfrange, qrt3range, ncutoff);
1331 cilk_spawn BlockParT<SR>(start+size0, start+size0+size1, subx, suby, qrt1range, halfrange, ncutoff);
1332 BlockParT<SR>(end-size3, end, subx, suby, qrt3range, rangeend, ncutoff);
1339 template <
class NT,
class IT>
1344 outfile <<
"## Matrix Doesn't have any nonzeros" <<endl;
1347 const IT ntop = nbr * nbc;
1349 outfile <<
"## Average block is of dimensions "<< lowrowmask+1 <<
"-by-" << lowcolmask+1 << endl;
1350 outfile <<
"## Number of real blocks is "<< ntop << endl;
1351 outfile <<
"## Row imbalance is " <<
RowImbalance(*
this) << endl;
1352 outfile <<
"## Col imbalance is " <<
ColImbalance(*
this) << endl;
1353 outfile <<
"## Block parallel calls is " << blockparcalls.get_value() << endl;
1355 std::vector<int> blocksizes(ntop);
1356 for(IT i=0; i<nbr; ++i)
1358 for(IT j=0; j < nbc; ++j)
1360 blocksizes[i*nbc+j] =
static_cast<int> (top[i][j+1]-top[i][j]);
1363 sort(blocksizes.begin(), blocksizes.end());
1364 outfile<<
"## Total nonzeros: "<< accumulate(blocksizes.begin(), blocksizes.end(), 0) << endl;
1366 outfile <<
"## Nonzero distribution (sorted) of blocks follows: \n" ;
1367 for(IT i=0; i< ntop; ++i)
1369 outfile << blocksizes[i] <<
"\n";
ofstream & PrintStats(ofstream &outfile) const
unsigned int nextpoweroftwo(unsigned int v)
BiCsb< NT, IT > & operator=(const BiCsb< NT, IT > &rhs)
float ColImbalance(const BiCsb< NT, IT > &A)
ITYPE BitInterleaveLow(ITYPE x, ITYPE y)
unsigned char * aligned_malloc(uint64_t size)
unsigned IntPower< 2 >(unsigned exponent)
void aligned_free(unsigned char *ptr)
unsigned int highestbitset(unsigned __int64 v)
float RowImbalance(const CSB &A)
void deallocate2D(T **array, I m)