bicsb.cpp
1 #include <cassert>
2 #include "bicsb.h"
3 #include "utility.h"
4 
5 // Choose the block size as big as possible given the following constraints:
6 // 1) The bot array is addressable by IT
7 // 2) The parts of the x & y vectors that a block touches fit into L2 cache [assuming a saxpy() operation]
8 // 3) There's enough parallel slackness for block rows (at least SLACKNESS * CILK_NPROC)
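// Worked example (illustrative values, not from the source): for an m = n = 2^20 matrix with
// 8-byte values, constraint (2) caps beta = 2^rowlowbits at L2SIZE/sizeof(NT) entries per block,
// and the heuristic in Init() further limits logbeta to ceil(log2((m*n)^(1/4))) + 2 = 12,
// i.e. 4096-by-4096 blocks.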
9 template <class NT, class IT>
10 void BiCsb<NT, IT>::Init(int workers, IT forcelogbeta)
11 {
12  ispar = (workers > 1);
13  IT roundrowup = nextpoweroftwo(m);
14  IT roundcolup = nextpoweroftwo(n);
15 
16  // if indices are negative, highestbitset returns -1,
17  // but that will be caught by the sizereq below
18  IT rowbits = highestbitset(roundrowup);
19  IT colbits = highestbitset(roundcolup);
20  bool sizereq;
21  if (ispar)
22  {
23  sizereq = ((IntPower<2>(rowbits) > SLACKNESS * workers)
24  && (IntPower<2>(colbits) > SLACKNESS * workers));
25  }
26  else
27  {
28  sizereq = ((rowbits > 1) && (colbits > 1));
29  }
30 
31  if(!sizereq)
32  {
33  cerr << "Matrix too small for this library" << endl;
34  return;
35  }
36 
37  rowlowbits = rowbits-1;
38  collowbits = colbits-1;
39  IT inf = numeric_limits<IT>::max();
40  IT maxbits = highestbitset(inf);
41 
42  rowhighbits = rowbits-rowlowbits; // # higher order bits for rows (has at least one bit)
43  colhighbits = colbits-collowbits; // # higher order bits for cols (has at least one bit)
44  if(ispar)
45  {
46  while(IntPower<2>(rowhighbits) < SLACKNESS * workers)
47  {
48  rowhighbits++;
49  rowlowbits--;
50  }
51  }
52 
53  // calculate the space that suby occupies in L2 cache
54  IT yL2 = IntPower<2>(rowlowbits) * sizeof(NT);
55  while(yL2 > L2SIZE)
56  {
57  yL2 /= 2;
58  rowhighbits++;
59  rowlowbits--;
60  }
61 
62  // calculate the space that subx occupies in L2 cache
63  IT xL2 = IntPower<2>(collowbits) * sizeof(NT);
64  while(xL2 > L2SIZE)
65  {
66  xL2 /= 2;
67  colhighbits++;
68  collowbits--;
69  }
70 
71  // blocks need to be square for correctness (maybe generalize this later?)
72  while(rowlowbits+collowbits > maxbits)
73  {
74  if(rowlowbits > collowbits)
75  {
76  rowhighbits++;
77  rowlowbits--;
78  }
79  else
80  {
81  colhighbits++;
82  collowbits--;
83  }
84  }
85  while(rowlowbits > collowbits)
86  {
87  rowhighbits++;
88  rowlowbits--;
89  }
90  while(rowlowbits < collowbits)
91  {
92  colhighbits++;
93  collowbits--;
94  }
95  assert (collowbits == rowlowbits);
96 
97  lowrowmask = IntPower<2>(rowlowbits) - 1;
98  lowcolmask = IntPower<2>(collowbits) - 1;
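// Example: with rowlowbits = collowbits = 3, lowrowmask = lowcolmask = 0b111; for a row
// index r, (r & lowrowmask) is the within-block row and ((r & highrowmask) >> rowlowbits)
// is the block-row id.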
99  if(forcelogbeta != 0)
100  {
101  IT candlowmask = IntPower<2>(forcelogbeta) -1;
102  cout << "Forcing beta to "<< (candlowmask+1) << " instead of the chosen " << (lowrowmask+1) << endl;
103  cout << "Warning : No checks are performed on the beta you have forced, anything can happen !" << endl;
104  lowrowmask = lowcolmask = candlowmask;
105  rowlowbits = collowbits = forcelogbeta;
106  rowhighbits = rowbits-rowlowbits;
107  colhighbits = colbits-collowbits;
108  }
109  else
110  {
111  double sqrtn = sqrt(sqrt(static_cast<double>(m) * static_cast<double>(n)));
112  IT logbeta = static_cast<IT>(ceil(log2(sqrtn))) + 2;
113  if(rowlowbits > logbeta)
114  {
115  rowlowbits = collowbits = logbeta;
116  lowrowmask = lowcolmask = IntPower<2>(logbeta) -1;
117  rowhighbits = rowbits-rowlowbits;
118  colhighbits = colbits-collowbits;
119  }
120  cout << "Beta chosen to be "<< (lowrowmask+1) << endl;
121  }
122  highrowmask = ((roundrowup - 1) ^ lowrowmask);
123  highcolmask = ((roundcolup - 1) ^ lowcolmask);
124 
125  // nbc = #{block columns} = #{blocks in any block row}, nbr = #{block rows}
126  IT blcdimrow = lowrowmask + 1;
127  IT blcdimcol = lowcolmask + 1;
128  nbr = static_cast<IT>(ceil(static_cast<double>(m) / static_cast<double>(blcdimrow)));
129  nbc = static_cast<IT>(ceil(static_cast<double>(n) / static_cast<double>(blcdimcol)));
130 
131  blcrange = (lowrowmask+1) * (lowcolmask+1); // range indexed by one block
132  mortoncmp = MortonCompare<IT>(rowlowbits, collowbits, lowrowmask, lowcolmask);
133 }
134 
135 // Partial template specialization for booleans
136 // Cache considerations are not checked, as this specialization is most likely
137 // to be used for gaxpy() with multiple rhs vectors (we don't know how many, or of what type, at this point)
138 template <class IT>
139 void BiCsb<bool,IT>::Init(int workers, IT forcelogbeta)
140 {
141  ispar = (workers > 1);
142  IT roundrowup = nextpoweroftwo(m);
143  IT roundcolup = nextpoweroftwo(n);
144 
145  // if indices are negative, highestbitset returns -1,
146  // but that will be caught by the sizereq below
147  IT rowbits = highestbitset(roundrowup);
148  IT colbits = highestbitset(roundcolup);
149  bool sizereq;
150  if (ispar)
151  {
152  sizereq = ((IntPower<2>(rowbits) > SLACKNESS * workers)
153  && (IntPower<2>(colbits) > SLACKNESS * workers));
154  }
155  else
156  {
157  sizereq = ((rowbits > 1) && (colbits > 1));
158  }
159 
160  if(!sizereq)
161  {
162  cerr << "Matrix too small for this library" << endl;
163  return;
164  }
165 
166  rowlowbits = rowbits-1;
167  collowbits = colbits-1;
168  IT inf = numeric_limits<IT>::max();
169  IT maxbits = highestbitset(inf);
170 
171  rowhighbits = rowbits-rowlowbits; // # higher order bits for rows (has at least one bit)
172  colhighbits = colbits-collowbits; // # higher order bits for cols (has at least one bit)
173  if(ispar)
174  {
175  while(IntPower<2>(rowhighbits) < SLACKNESS * workers)
176  {
177  rowhighbits++;
178  rowlowbits--;
179  }
180  }
181 
182  // blocks need to be square for correctness (maybe generalize this later?)
183  while(rowlowbits+collowbits > maxbits)
184  {
185  if(rowlowbits > collowbits)
186  {
187  rowhighbits++;
188  rowlowbits--;
189  }
190  else
191  {
192  colhighbits++;
193  collowbits--;
194  }
195  }
196  while(rowlowbits > collowbits)
197  {
198  rowhighbits++;
199  rowlowbits--;
200  }
201  while(rowlowbits < collowbits)
202  {
203  colhighbits++;
204  collowbits--;
205  }
206  assert (collowbits == rowlowbits);
207 
208  lowrowmask = IntPower<2>(rowlowbits) - 1;
209  lowcolmask = IntPower<2>(collowbits) - 1;
210  if(forcelogbeta != 0)
211  {
212  IT candlowmask = IntPower<2>(forcelogbeta) -1;
213  cout << "Forcing beta to "<< (candlowmask+1) << " instead of the chosen " << (lowrowmask+1) << endl;
214  cout << "Warning : No checks are performed on the beta you have forced, anything can happen !" << endl;
215  lowrowmask = lowcolmask = candlowmask;
216  rowlowbits = collowbits = forcelogbeta;
217  rowhighbits = rowbits-rowlowbits;
218  colhighbits = colbits-collowbits;
219  }
220  else
221  {
222  double sqrtn = sqrt(sqrt(static_cast<double>(m) * static_cast<double>(n)));
223  IT logbeta = static_cast<IT>(ceil(log2(sqrtn))) + 2;
224  if(rowlowbits > logbeta)
225  {
226  rowlowbits = collowbits = logbeta;
227  lowrowmask = lowcolmask = IntPower<2>(logbeta) -1;
228  rowhighbits = rowbits-rowlowbits;
229  colhighbits = colbits-collowbits;
230  }
231  cout << "Beta chosen to be "<< (lowrowmask+1) << endl;
232  }
233  highrowmask = ((roundrowup - 1) ^ lowrowmask);
234  highcolmask = ((roundcolup - 1) ^ lowcolmask);
235 
236  // nbc = #{block columns} = #{blocks in any block row}, nbr = #{block rows}
237  IT blcdimrow = lowrowmask + 1;
238  IT blcdimcol = lowcolmask + 1;
239  nbr = static_cast<IT>(ceil(static_cast<double>(m) / static_cast<double>(blcdimrow)));
240  nbc = static_cast<IT>(ceil(static_cast<double>(n) / static_cast<double>(blcdimcol)));
241 
242  blcrange = (lowrowmask+1) * (lowcolmask+1); // range indexed by one block
243  mortoncmp = MortonCompare<IT>(rowlowbits, collowbits, lowrowmask, lowcolmask);
244 }
245 
246 
247 // Constructing an empty BiCsb object (size = 0) is not allowed.
248 template <class NT, class IT>
249 BiCsb<NT, IT>::BiCsb (IT size, IT rows, IT cols, int workers): nz(size),m(rows),n(cols)
250 {
251  assert(nz != 0 && n != 0 && m != 0);
252  Init(workers);
253 
254  num = (NT*) aligned_malloc( nz * sizeof(NT));
255  bot = (IT*) aligned_malloc( nz * sizeof(IT));
256  top = allocate2D<IT>(nbr, nbc+1);
257 }
258 
259 // Partial template specialization for booleans
260 template <class IT>
261 BiCsb<bool, IT>::BiCsb (IT size, IT rows, IT cols, int workers): nz(size),m(rows),n(cols)
262 {
263  assert(nz != 0 && n != 0 && m != 0);
264  Init(workers);
265  bot = (IT*) aligned_malloc( nz * sizeof(IT));
266  top = allocate2D<IT>(nbr, nbc+1);
267 }
268 
269 // copy constructor
270 template <class NT, class IT>
271 BiCsb<NT, IT>::BiCsb (const BiCsb<NT, IT> & rhs)
272 : nz(rhs.nz), m(rhs.m), n(rhs.n), blcrange(rhs.blcrange), nbr(rhs.nbr), nbc(rhs.nbc),
273 rowhighbits(rhs.rowhighbits), rowlowbits(rhs.rowlowbits), highrowmask(rhs.highrowmask), lowrowmask(rhs.lowrowmask),
274 colhighbits(rhs.colhighbits), collowbits(rhs.collowbits), highcolmask(rhs.highcolmask), lowcolmask(rhs.lowcolmask),
275 mortoncmp(rhs.mortoncmp), ispar(rhs.ispar)
276 {
277  if(nz > 0)
278  {
279  num = (NT*) aligned_malloc( nz * sizeof(NT));
280  bot = (IT*) aligned_malloc( nz * sizeof(IT));
281 
282  copy (rhs.num, rhs.num + nz, num);
283  copy (rhs.bot, rhs.bot + nz, bot);
284  }
285  if ( nbr > 0)
286  {
287  top = allocate2D<IT>(nbr, nbc+1);
288  for(IT i=0; i<nbr; ++i)
289  copy (rhs.top[i], rhs.top[i] + nbc + 1, top[i]);
290  }
291 }
292 
293 // copy constructor for partial NT=boolean specialization
294 template <class IT>
295 BiCsb<bool, IT>::BiCsb (const BiCsb<bool, IT> & rhs)
296 : nz(rhs.nz), m(rhs.m), n(rhs.n), blcrange(rhs.blcrange), nbr(rhs.nbr), nbc(rhs.nbc),
297 rowhighbits(rhs.rowhighbits), rowlowbits(rhs.rowlowbits), highrowmask(rhs.highrowmask), lowrowmask(rhs.lowrowmask),
298 colhighbits(rhs.colhighbits), collowbits(rhs.collowbits), highcolmask(rhs.highcolmask), lowcolmask(rhs.lowcolmask),
299 mortoncmp(rhs.mortoncmp), ispar(rhs.ispar)
300 {
301  if(nz > 0)
302  {
303  bot = (IT*) aligned_malloc( nz * sizeof(IT));
304  copy (rhs.bot, rhs.bot + nz, bot);
305  }
306  if ( nbr > 0)
307  {
308  top = allocate2D<IT>(nbr, nbc+1);
309  for(IT i=0; i<nbr; ++i)
310  copy (rhs.top[i], rhs.top[i] + nbc + 1, top[i]);
311  }
312 }
313 
314 template <class NT, class IT>
315 BiCsb<NT, IT> & BiCsb<NT, IT>::operator=(const BiCsb<NT, IT> & rhs)
316 {
317  if(this != &rhs)
318  {
319  if(nz > 0) // if the existing object is not empty, make it empty
320  {
321  aligned_free(bot);
322  aligned_free(num);
323  }
324  if(nbr > 0)
325  {
326  deallocate2D(top, nbr);
327  }
328  ispar = rhs.ispar;
329  nz = rhs.nz;
330  n = rhs.n;
331  m = rhs.m;
332  nbr = rhs.nbr;
333  nbc = rhs.nbc;
334  blcrange = rhs.blcrange;
335  rowhighbits = rhs.rowhighbits;
336  rowlowbits = rhs.rowlowbits;
337  highrowmask = rhs.highrowmask;
338  lowrowmask = rhs.lowrowmask;
339  colhighbits = rhs.colhighbits;
340  collowbits = rhs.collowbits;
341  highcolmask = rhs.highcolmask;
342  lowcolmask= rhs.lowcolmask;
343  mortoncmp = rhs.mortoncmp;
344  if(nz > 0) // if the copied object is not empty
345  {
346  num = (NT*) aligned_malloc( nz * sizeof(NT));
347  bot = (IT*) aligned_malloc( nz * sizeof(IT));
348  copy (rhs.num, rhs.num + nz, num);
349  copy (rhs.bot, rhs.bot + nz, bot);
350  }
351  if ( nbr > 0)
352  {
353  top = allocate2D<IT>(nbr, nbc+1);
354  for(IT i=0; i<nbr; ++i)
355  copy (rhs.top[i], rhs.top[i] + nbc + 1, top[i]);
356  }
357  }
358  return *this;
359 }
360 
361 template <class IT>
362 BiCsb<bool, IT> & BiCsb<bool, IT>::operator=(const BiCsb<bool, IT> & rhs)
363 {
364  if(this != &rhs)
365  {
366  if(nz > 0) // if the existing object is not empty, make it empty
367  {
368  aligned_free(bot);
369  }
370  if(nbr > 0)
371  {
372  deallocate2D(top, nbr);
373  }
374  ispar = rhs.ispar;
375  nz = rhs.nz;
376  n = rhs.n;
377  m = rhs.m;
378  nbr = rhs.nbr;
379  nbc = rhs.nbc;
380  blcrange = rhs.blcrange;
381  rowhighbits = rhs.rowhighbits;
382  rowlowbits = rhs.rowlowbits;
383  highrowmask = rhs.highrowmask;
384  lowrowmask = rhs.lowrowmask;
385  colhighbits = rhs.colhighbits;
386  collowbits = rhs.collowbits;
387  highcolmask = rhs.highcolmask;
388  lowcolmask= rhs.lowcolmask;
389  mortoncmp = rhs.mortoncmp;
390  if(nz > 0) // if the copied object is not empty
391  {
392  bot = (IT*) aligned_malloc( nz * sizeof(IT));
393  copy (rhs.bot, rhs.bot + nz, bot);
394  }
395  if ( nbr > 0)
396  {
397  top = allocate2D<IT>(nbr, nbc+1);
398  for(IT i=0; i<nbr; ++i)
399  copy (rhs.top[i], rhs.top[i] + nbc + 1, top[i]);
400  }
401  }
402  return *this;
403 }
404 
405 template <class NT, class IT>
406 BiCsb<NT, IT>::~BiCsb()
407 {
408  if( nz > 0)
409  {
410  aligned_free((unsigned char*) num);
411  aligned_free((unsigned char*) bot);
412  }
413  if ( nbr > 0)
414  {
415  deallocate2D(top, nbr);
416  }
417 }
418 
419 template <class IT>
420 BiCsb<bool, IT>::~BiCsb()
421 {
422  if( nz > 0)
423  {
424  aligned_free((unsigned char*) bot);
425  }
426  if ( nbr > 0)
427  {
428  deallocate2D(top, nbr);
429  }
430 }
431 
432 template <class NT, class IT>
433 BiCsb<NT, IT>::BiCsb (Csc<NT, IT> & csc, int workers, IT forcelogbeta):nz(csc.nz),m(csc.m),n(csc.n)
434 {
435  typedef std::pair<IT, IT> ipair;
436  typedef std::pair<IT, ipair> mypair;
437  assert(nz != 0 && n != 0 && m != 0);
438  if(forcelogbeta == 0)
439  Init(workers);
440  else
441  Init(workers, forcelogbeta);
442 
443  num = (NT*) aligned_malloc( nz * sizeof(NT));
444  bot = (IT*) aligned_malloc( nz * sizeof(IT));
445  top = allocate2D<IT>(nbr, nbc+1);
446  mypair * pairarray = new mypair[nz];
447  IT k = 0;
448  for(IT j = 0; j < n; ++j)
449  {
450  for (IT i = csc.jc [j] ; i < csc.jc[j+1] ; ++i) // scan the jth column
451  {
452  // concatenate the higher/lower order half of both row (first) index and col (second) index bits
453  IT hindex = (((highrowmask & csc.ir[i] ) >> rowlowbits) << colhighbits)
454  | ((highcolmask & j) >> collowbits);
455  IT lindex = ((lowrowmask & csc.ir[i]) << collowbits) | (lowcolmask & j) ;
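// Worked example (values assumed): with rowlowbits = collowbits = 2 (4-by-4 blocks),
// a nonzero at (row 6, col 3) has row 6 = 0b110 -> block-row 1, in-block row 0b10, and
// col 3 = 0b011 -> block-col 0, in-block col 0b11, so lindex = (0b10 << 2) | 0b11 = 0b1011.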
456 
457  // i => location of that nonzero in csc.ir and csc.num arrays
458  pairarray[k++] = mypair(hindex, ipair(lindex,i));
459  }
460  }
461  sort(pairarray, pairarray+nz); // sort according to hindex
462  SortBlocks(pairarray, csc.num);
463  delete [] pairarray;
464 }
465 
466 template <class IT>
467 template <typename NT> // to provide conversion from arbitrary Csc<> to specialized BiCsb<bool>
468 BiCsb<bool, IT>::BiCsb (Csc<NT, IT> & csc, int workers):nz(csc.nz),m(csc.m),n(csc.n)
469 {
470  typedef std::pair<IT, IT> ipair;
471  typedef std::pair<IT, ipair> mypair;
472  assert(nz != 0 && n != 0 && m != 0);
473  Init(workers);
474 
475  bot = (IT*) aligned_malloc( nz * sizeof(IT));
476  top = allocate2D<IT>(nbr, nbc+1);
477  mypair * pairarray = new mypair[nz];
478  IT k = 0;
479  for(IT j = 0; j < n; ++j)
480  {
481  for (IT i = csc.jc [j] ; i < csc.jc[j+1] ; ++i) // scan the jth column
482  {
483  // concatenate the higher/lower order half of both row (first) index and col (second) index bits
484  IT hindex = (((highrowmask & csc.ir[i] ) >> rowlowbits) << colhighbits)
485  | ((highcolmask & j) >> collowbits);
486  IT lindex = ((lowrowmask & csc.ir[i]) << collowbits) | (lowcolmask & j) ;
487 
488  // i => location of that nonzero in csc.ir and csc.num arrays
489  pairarray[k++] = mypair(hindex, ipair(lindex,i));
490  }
491  }
492  sort(pairarray, pairarray+nz); // sort according to hindex
493  SortBlocks(pairarray);
494  delete [] pairarray;
495 }
496 
497 // Assumption: rowindices (ri) and colindices (ci) are "parallel arrays" sorted w.r.t. column index values
498 template <class NT, class IT>
499 BiCsb<NT, IT>::BiCsb (IT size, IT rows, IT cols, IT * ri, IT * ci, NT * val, int workers, IT forcelogbeta)
500 :nz(size),m(rows),n(cols)
501 {
502  typedef std::pair<IT, IT> ipair;
503  typedef std::pair<IT, ipair> mypair;
504  assert(nz != 0 && n != 0 && m != 0);
505  Init(workers, forcelogbeta);
506 
507  num = (NT*) aligned_malloc( nz * sizeof(NT));
508  bot = (IT*) aligned_malloc( nz * sizeof(IT));
509  top = allocate2D<IT>(nbr, nbc+1);
510  mypair * pairarray = new mypair[nz];
511  for(IT k = 0; k < nz; ++k)
512  {
513  // concatenate the higher/lower order half of both row (first) index and col (second) index bits
514  IT hindex = (((highrowmask & ri[k] ) >> rowlowbits) << colhighbits) | ((highcolmask & ci[k]) >> collowbits);
515  IT lindex = ((lowrowmask & ri[k]) << collowbits) | (lowcolmask & ci[k]) ;
516 
517  // k is stored in order to retrieve the location of this nonzero in val array
518  pairarray[k] = mypair(hindex, ipair(lindex, k));
519  }
520  sort(pairarray, pairarray+nz); // sort according to hindex
521  SortBlocks(pairarray, val);
522  delete [] pairarray;
523 }
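/* A minimal usage sketch for this constructor (hypothetical sizes and values):

   uint32_t ri[]  = {5, 1000, 7};     // row indices
   uint32_t ci[]  = {2, 2, 40000};    // column indices, sorted as assumed above
   double   val[] = {1.0, 2.0, 3.0};
   BiCsb<double, uint32_t> A(3, 100000, 100000, ri, ci, val, 1, 0);  // 1 worker, beta chosen automatically
*/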
524 
525 template <class IT>
526 BiCsb<bool, IT>::BiCsb (IT size, IT rows, IT cols, IT * ri, IT * ci, int workers, IT forcelogbeta)
527 :nz(size),m(rows),n(cols)
528 {
529  typedef std::pair<IT, IT> ipair;
530  typedef std::pair<IT, ipair> mypair;
531  assert(nz != 0 && n != 0 && m != 0);
532  Init(workers, forcelogbeta);
533 
534  bot = (IT*) aligned_malloc( nz * sizeof(IT));
535  top = allocate2D<IT>(nbr, nbc+1);
536  mypair * pairarray = new mypair[nz];
537  for(IT k = 0; k < nz; ++k)
538  {
539  // concatenate the higher/lower order half of both row (first) index and col (second) index bits
540  IT hindex = (((highrowmask & ri[k] ) >> rowlowbits) << colhighbits) | ((highcolmask & ci[k]) >> collowbits);
541  IT lindex = ((lowrowmask & ri[k]) << collowbits) | (lowcolmask & ci[k]) ;
542 
543  // k is stored in order to retrieve the location of this nonzero in val array
544  pairarray[k] = mypair(hindex, ipair(lindex, k));
545  }
546  sort(pairarray, pairarray+nz); // sort according to hindex
547  SortBlocks(pairarray);
548  delete [] pairarray;
549 }
550 
551 template <class NT, class IT>
552 void BiCsb<NT, IT>::SortBlocks(pair<IT, pair<IT,IT> > * pairarray, NT * val)
553 {
554  typedef typename std::pair<IT, std::pair<IT, IT> > mypair;
555  IT cnz = 0;
556  IT ldim = IntPower<2>(colhighbits); // leading dimension (not always equal to nbc)
557  for(IT i = 0; i < nbr; ++i)
558  {
559  for(IT j = 0; j < nbc; ++j)
560  {
561  top[i][j] = cnz;
562  IT prevcnz = cnz;
563  vector< mypair > blocknz;
564  while(cnz < nz && pairarray[cnz].first == ((i*ldim)+j) ) // as long as we're in this block
565  {
566  IT lowbits = pairarray[cnz].second.first;
567  IT rlowbits = ((lowbits >> collowbits) & lowrowmask);
568  IT clowbits = (lowbits & lowcolmask);
569  IT bikey = BitInterleaveLow(rlowbits, clowbits);
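// e.g. interleaving in-block row bits r1 r0 with column bits c1 c0 yields a Z-Morton
// key (exact bit order as defined by BitInterleaveLow in utility.h), so the sort below
// lays each block's nonzeros out in Z-Morton order.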
570 
571  blocknz.push_back(mypair(bikey, pairarray[cnz++].second));
572  }
573  // sort the block into bitinterleaved order
574  sort(blocknz.begin(), blocknz.end());
575 
576  for(IT k=prevcnz; k<cnz ; ++k)
577  {
578  bot[k] = blocknz[k-prevcnz].second.first;
579  num[k] = val[blocknz[k-prevcnz].second.second];
580  }
581  }
582  top[i][nbc] = cnz; // hence equal to top[i+1][0] if i+1 < nbr
583  }
584  assert(cnz == nz);
585 }
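// For illustration: after SortBlocks, the nonzeros of block (i,j) occupy
// bot[top[i][j] .. top[i][j+1]) in Z-Morton order, which is what allows BlockPar
// (below) to bisect a block recursively by binary-searching bot.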
586 
587 template <class IT>
588 void BiCsb<bool, IT>::SortBlocks(pair<IT, pair<IT,IT> > * pairarray)
589 {
590  typedef pair<IT, pair<IT, IT> > mypair;
591  IT cnz = 0;
592  IT ldim = IntPower<2>(colhighbits); // leading dimension (not always equal to nbc)
593  for(IT i = 0; i < nbr; ++i)
594  {
595  for(IT j = 0; j < nbc; ++j)
596  {
597  top[i][j] = cnz;
598  IT prevcnz = cnz;
599  std::vector<mypair> blocknz;
600  while(cnz < nz && pairarray[cnz].first == ((i*ldim)+j) ) // as long as we're in this block
601  {
602  IT lowbits = pairarray[cnz].second.first;
603  IT rlowbits = ((lowbits >> collowbits) & lowrowmask);
604  IT clowbits = (lowbits & lowcolmask);
605  IT bikey = BitInterleaveLow(rlowbits, clowbits);
606 
607  blocknz.push_back(mypair(bikey, pairarray[cnz++].second));
608  }
609  // sort the block into bitinterleaved order
610  sort(blocknz.begin(), blocknz.end());
611 
612  for(IT k=prevcnz; k<cnz ; ++k)
613  bot[k] = blocknz[k-prevcnz].second.first;
614  }
615  top[i][nbc] = cnz;
616  }
617  assert(cnz == nz);
618 }
619 
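620 /**
621   * Multiply the chunk range [start,end) with the dense vector x, accumulating into y (y += A*x).
622   * chunks is an array of pointers into the top array; chunks[0] = btop, and chunks[i] points to
623   * the first block of the ith chunk. The range is halved recursively, spawning the first half;
624   * a single dense chunk is instead parallelized within the block via BlockPar.
625   **/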
626 template <class NT, class IT>
627 template <typename SR, typename RHS, typename LHS>
628 void BiCsb<NT, IT>::BMult(IT** chunks, IT start, IT end, const RHS * __restrict x, LHS * __restrict y, IT ysize) const
629 {
630  assert(end-start > 0); // there should be at least one chunk
631  if (end-start == 1) // single chunk
632  {
633  if((chunks[end] - chunks[start]) == 1) // chunk consists of a single (normally dense) block
634  {
635  IT chi = ( (chunks[start] - chunks[0]) << collowbits);
636 
637  // m-chi > lowcolmask for all blocks except the last skinny tall one.
638  // if the last one is regular too, then it has m-chi = lowcolmask+1
639  if(ysize == (lowrowmask+1) && (m-chi) > lowcolmask ) // parallelize if it is a regular/complete block
640  {
641  const RHS * __restrict subx = &x[chi];
642  BlockPar<SR>( *(chunks[start]) , *(chunks[end]), subx, y, 0, blcrange, BREAKEVEN * ysize);
643  }
644  else // otherwise block parallelization will fail
645  {
646  SubSpMV<SR>(chunks[0], chunks[start]-chunks[0], chunks[end]-chunks[0], x, y);
647  }
648  }
649  else // a number of sparse blocks with a total of at most O(\beta) nonzeros
650  {
651  SubSpMV<SR>(chunks[0], chunks[start]-chunks[0], chunks[end]-chunks[0], x, y);
652  }
653  }
654  else
655  {
656  // divide the chunks in half
657  IT mid = (start+end)/2;
658 
659  cilk_spawn BMult<SR>(chunks, start, mid, x, y, ysize);
660  if(SYNCHED)
661  {
662  BMult<SR>(chunks, mid, end, x, y, ysize);
663  }
664  else
665  {
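// The spawned first half may still be writing into y on another worker, so the
// second half accumulates into a zero-initialized temporary, which is merged
// into y with axpy after the cilk_sync.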
666  LHS * temp = new LHS[ysize]();
667  // note the empty set of parentheses as the initializer: because of it,
668  // even if LHS is a built-in type (such as double or int) it will be value-initialized,
669  // and the C++ standard says a value-initialized POD type is zero-initialized.
670  // For non-POD types (such as std::array), the caller should make sure default construction yields zero
671 
672  BMult<SR>(chunks, mid, end, x, temp, ysize);
673  cilk_sync;
674 
675  #pragma simd
676  for(IT i=0; i<ysize; ++i)
677  SR::axpy(temp[i], y[i]);
678 
679  delete [] temp;
680  }
681  }
682 }
683 
684 // partial template specialization for NT=bool
685 template <class IT>
686 template <typename SR, typename RHS, typename LHS>
687 void BiCsb<bool, IT>::BMult(IT** chunks, IT start, IT end, const RHS * __restrict x, LHS * __restrict y, IT ysize) const
688 {
689  assert(end-start > 0); // there should be at least one chunk
690  if (end-start == 1) // single chunk
691  {
692  if((chunks[end] - chunks[start]) == 1) // chunk consists of a single (normally dense) block
693  {
694  IT chi = ( (chunks[start] - chunks[0]) << collowbits);
695 
696  // m-chi > lowcolmask for all blocks except the last skinny tall one.
697  // if the last one is regular too, then it has m-chi = lowcolmask+1
698  if(ysize == (lowrowmask+1) && (m-chi) > lowcolmask ) // parallelize if it is a regular/complete block
699  {
700  const RHS * __restrict subx = &x[chi];
701  BlockPar<SR>( *(chunks[start]) , *(chunks[end]), subx, y, 0, blcrange, BREAKEVEN * ysize);
702  }
703  else // otherwise block parallelization will fail
704  {
705  SubSpMV<SR>(chunks[0], chunks[start]-chunks[0], chunks[end]-chunks[0], x, y);
706  }
707  }
708  else // a number of sparse blocks with a total of at most O(\beta) nonzeros
709  {
710  SubSpMV<SR>(chunks[0], chunks[start]-chunks[0], chunks[end]-chunks[0], x, y);
711  }
712  }
713  else
714  {
715  // divide the chunks in half
716  IT mid = (start+end)/2;
717 
718  cilk_spawn BMult<SR>(chunks, start, mid, x, y, ysize);
719  if(SYNCHED)
720  {
721  BMult<SR>(chunks, mid, end, x, y, ysize);
722  }
723  else
724  {
725  LHS * temp = new LHS[ysize]();
726  // note the empty set of parentheses as the initializer: because of it,
727  // even if LHS is a built-in type (such as double or int) it will be value-initialized,
728  // and the C++ standard says a value-initialized POD type is zero-initialized.
729  // For non-POD types (such as std::array), the caller should make sure default construction yields zero
730 
731  BMult<SR>(chunks, mid, end, x, temp, ysize);
732  cilk_sync;
733 
734  #pragma simd
735  for(IT i=0; i<ysize; ++i)
736  SR::axpy(temp[i], y[i]);
737 
738  delete [] temp;
739  }
740  }
741 }
742 
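743 /**
744   * Multiply the chunk range [start,end) with x for the transposed matrix, accumulating into y (y += A'*x).
745   * Each chunk is a vector of (nzbeg, nzend, blockrowid) tuples listing the blocks that fall into the
746   * same block column; the block-row id supplies the higher order bits used to index x.
747   * The range is halved recursively, spawning the first half; if the spawned half is still
748   * running (not SYNCHED), the second half accumulates into a temporary that is merged after
749   * the cilk_sync, mirroring BMult above.
750   **/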
751 template <class NT, class IT>
752 template <typename SR, typename RHS, typename LHS>
753 void BiCsb<NT, IT>::BTransMult(vector< vector< tuple<IT,IT,IT> > * > & chunks, IT start, IT end, const RHS * __restrict x, LHS * __restrict y, IT ysize) const
754 {
755 #ifdef STATS
756  blockparcalls += 1;
757 #endif
758  assert(end-start > 0); // there should be at least one chunk
759  if (end-start == 1) // single chunk (note that single chunk does not mean single block)
760  {
761  if(chunks[start]->size() == 1) // chunk consists of a single (normally dense) block
762  {
763  // get the block row id higher order bits to index x (because this is A'x)
764  auto block = chunks[start]->front(); // get the tuple representing this compressed sparse block
765  IT chi = ( get<2>(block) << rowlowbits);
766 
767  // m-chi > lowrowmask for all blocks except the last skinny tall one.
768  // if the last one is regular too, then it has m-chi = lowrowmask+1
769  // parallelize if it is a regular/complete block (and it is worth it)
770 
771  if(ysize == (lowrowmask+1) && (m-chi) > lowrowmask && (get<1>(block)-get<0>(block)) > BREAKEVEN * ysize)
772  {
773  const RHS * __restrict subx = &x[chi];
774  BlockParT<SR>( get<0>(block) , get<1>(block), subx, y, 0, blcrange, BREAKEVEN * ysize);
775  }
776  else // otherwise block parallelization will fail
777  {
778  SubSpMVTrans<SR>(*(chunks[start]), x, y);
779  }
780  }
781  else // a number of sparse blocks with a total of at most O(\beta) nonzeros
782  {
783  SubSpMVTrans<SR>(*(chunks[start]), x, y);
784  }
785  }
786  else // multiple chunks
787  {
788  IT mid = (start+end)/2;
789  cilk_spawn BTransMult<SR>(chunks, start, mid, x, y, ysize);
790  if(SYNCHED)
791  {
792  BTransMult<SR>(chunks, mid, end, x, y, ysize);
793  }
794  else
795  {
796  LHS * temp = new LHS[ysize]();
797  BTransMult<SR>(chunks, mid, end, x, temp, ysize);
798  cilk_sync;
799 
800  #pragma simd
801  for(IT i=0; i<ysize; ++i)
802  SR::axpy(temp[i], y[i]);
803 
804  delete [] temp;
805  }
806  }
807 }
808 
809 // Partial template specialization on NT=bool
810 template <class IT>
811 template <typename SR, typename RHS, typename LHS>
812 void BiCsb<bool, IT>::BTransMult(vector< vector< tuple<IT,IT,IT> > * > & chunks, IT start, IT end, const RHS * __restrict x, LHS * __restrict y, IT ysize) const
813 {
814  assert(end-start > 0); // there should be at least one chunk
815  if (end-start == 1) // single chunk (note that single chunk does not mean single block)
816  {
817  if(chunks[start]->size() == 1) // chunk consists of a single (normally dense) block
818  {
819  // get the block row id higher order bits to index x (because this is A'x)
820  auto block = chunks[start]->front(); // get the tuple representing this compressed sparse block
821  IT chi = ( get<2>(block) << rowlowbits);
822 
823  // m-chi > lowrowmask for all blocks except the last skinny tall one.
824  // if the last one is regular too, then it has m-chi = lowrowmask+1
825  if(ysize == (lowrowmask+1) && (m-chi) > lowrowmask ) // parallelize if it is a regular/complete block
826  {
827  const RHS * __restrict subx = &x[chi];
828  BlockParT<SR>( get<0>(block) , get<1>(block), subx, y, 0, blcrange, BREAKEVEN * ysize);
829  }
830  else // otherwise block parallelization will fail
831  {
832  SubSpMVTrans<SR>(*(chunks[start]), x, y);
833  }
834  }
835  else // a number of sparse blocks with a total of at most O(\beta) nonzeros
836  {
837  SubSpMVTrans<SR>(*(chunks[start]), x, y);
838  }
839  }
840  else // multiple chunks
841  {
842  IT mid = (start+end)/2;
843  cilk_spawn BTransMult<SR>(chunks, start, mid, x, y, ysize);
844  if(SYNCHED)
845  {
846  BTransMult<SR>(chunks, mid, end, x, y, ysize);
847  }
848  else
849  {
850  LHS * temp = new LHS[ysize]();
851  BTransMult<SR>(chunks, mid, end, x, temp, ysize);
852  cilk_sync;
853 
854  #pragma simd
855  for(IT i=0; i<ysize; ++i)
856  SR::axpy(temp[i], y[i]);
857 
858  delete [] temp;
859  }
860  }
861 }
862 
863 // double* restrict a; --> No aliases for a[0], a[1], ...
864 // bstart/bend: block start/end index (to the top array)
865 template <class NT, class IT>
866 template <typename SR, typename RHS, typename LHS>
867 void BiCsb<NT, IT>::SubSpMV(IT * __restrict btop, IT bstart, IT bend, const RHS * __restrict x, LHS * __restrict suby) const
868 {
869  IT * __restrict r_bot = bot;
870  NT * __restrict r_num = num;
871 
872  __m128i lcms = _mm_set1_epi32 (lowcolmask);
873  __m128i lrms = _mm_set1_epi32 (lowrowmask);
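// Note: this unrolled path effectively assumes NT = double (it applies _mm_loadu_pd /
// _mm_mul_pd to r_num); it decodes four bot entries at once with vectorized mask/shift
// operations and performs the four multiplies in two SSE2 instructions.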
874 
875  for (IT j = bstart ; j < bend ; ++j) // for all blocks inside that block row
876  {
877  // get higher order bits for column indices
878  IT chi = (j << collowbits);
879  const RHS * __restrict subx = &x[chi];
880 
881 #ifdef SIMDUNROLL
882  IT start = btop[j];
883  IT range = (btop[j+1]-btop[j]) >> 2;
884 
885  if(range > ROLLING)
886  {
887  for (IT k = 0 ; k < range ; ++k) // for all nonzeros within ith block (expected =~ nnz/n = c)
888  {
889  // ABAB: how to ensure alignment on the stack?
890  // float a[4] __attribute__((aligned(0x1000)));
891  #define ALIGN16 __attribute__((aligned(16)))
892 
893  IT ALIGN16 rli4[4]; IT ALIGN16 cli4[4];
894  NT ALIGN16 x4[4]; NT ALIGN16 y4[4];
895 
896  // _mm_srli_epi32: shifts the 4 signed or unsigned 32-bit integers to the right, shifting in zeros.
897  IT pin = start + (k << 2);
898 
899  __m128i bots = _mm_loadu_si128((__m128i*) &r_bot[pin]); // load 4 consecutive r_bot elements
900  __m128i clis = _mm_and_si128( bots, lcms);
901  __m128i rlis = _mm_and_si128( _mm_srli_epi32(bots, collowbits), lrms);
902  _mm_store_si128 ((__m128i*) cli4, clis);
903  _mm_store_si128 ((__m128i*) rli4, rlis);
904 
905  x4[0] = subx[cli4[0]];
906  x4[1] = subx[cli4[1]];
907  x4[2] = subx[cli4[2]];
908  x4[3] = subx[cli4[3]];
909 
910  __m128d Y01QW = _mm_mul_pd((__m128d)_mm_loadu_pd(&r_num[pin]), (__m128d)_mm_load_pd(&x4[0]));
911  __m128d Y23QW = _mm_mul_pd((__m128d)_mm_loadu_pd(&r_num[pin+2]), (__m128d)_mm_load_pd(&x4[2]));
912 
913  _mm_store_pd(&y4[0],Y01QW);
914  _mm_store_pd(&y4[2],Y23QW);
915 
916  suby[rli4[0]] += y4[0];
917  suby[rli4[1]] += y4[1];
918  suby[rli4[2]] += y4[2];
919  suby[rli4[3]] += y4[3];
920  }
921  for(IT k=start+4*range; k<btop[j+1]; ++k)
922  {
923  IT rli = ((r_bot[k] >> collowbits) & lowrowmask);
924  IT cli = (r_bot[k] & lowcolmask);
925  SR::axpy(r_num[k], subx[cli], suby[rli]);
926  }
927  }
928  else
929  {
930 #endif
931  for(IT k=btop[j]; k<btop[j+1]; ++k)
932  {
933  IT rli = ((r_bot[k] >> collowbits) & lowrowmask);
934  IT cli = (r_bot[k] & lowcolmask);
935  SR::axpy(r_num[k], subx[cli], suby[rli]);
936  }
937 #ifdef SIMDUNROLL
938  }
939 #endif
940  }
941 }
942 
943 // Partial boolean specialization on NT=bool
944 template <class IT>
945 template <typename SR, typename RHS, typename LHS>
946 void BiCsb<bool, IT>::SubSpMV(IT * __restrict btop, IT bstart, IT bend, const RHS * __restrict x, LHS * __restrict suby) const
947 {
948  IT * __restrict r_bot = bot;
949  for (IT j = bstart ; j < bend ; ++j) // for all blocks inside that block row or chunk
950  {
951  // get higher order bits for column indices
952  IT chi = (j << collowbits);
953  const RHS * __restrict subx = &x[chi];
954  for (IT k = btop[j] ; k < btop[j+1] ; ++k) // for all nonzeros within ith block (expected =~ nnz/n = c)
955  {
956  IT rli = ((r_bot[k] >> collowbits) & lowrowmask);
957  IT cli = (r_bot[k] & lowcolmask);
958  SR::axpy(subx[cli], suby[rli]); // suby [rli] += subx [cli] where subx and suby are vectors.
959  }
960  }
961 }
962 
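963 // Multiply the transpose of the blocks listed in a chunk with x, accumulating into suby (a y += A'*x contribution)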
964 template <class NT, class IT>
965 template <typename SR, typename RHS, typename LHS>
966 void BiCsb<NT, IT>::SubSpMVTrans(const vector< tuple<IT,IT,IT> > & chunk, const RHS * __restrict x, LHS * __restrict suby) const
967 {
968  IT * __restrict r_bot = bot;
969  NT * __restrict r_num = num;
970  for(auto itr = chunk.begin(); itr != chunk.end(); ++itr) // over all blocks within this chunk
971  {
972  // get the starting point for accessing x
973  IT chi = ( get<2>(*itr) << rowlowbits);
974  const RHS * __restrict subx = &x[chi];
975 
976  IT nzbeg = get<0>(*itr);
977  IT nzend = get<1>(*itr);
978 
979  for (IT k = nzbeg ; k < nzend ; ++k)
980  {
981  // Note the swap in cli/rli
982  IT cli = ((r_bot[k] >> collowbits) & lowrowmask);
983  IT rli = (r_bot[k] & lowcolmask);
984  SR::axpy(r_num[k], subx[cli], suby[rli]); // suby [rli] += r_num[k] * subx [cli] where subx and suby are vectors.
985  }
986  }
987 }
988 
989 // Partial template specialization for booleans
990 template <class IT>
991 template <typename SR, typename RHS, typename LHS>
992 void BiCsb<bool, IT>::SubSpMVTrans(const vector< tuple<IT,IT,IT> > & chunk, const RHS * __restrict x, LHS * __restrict suby) const
993 {
994  IT * __restrict r_bot = bot;
995  for(auto itr = chunk.begin(); itr != chunk.end(); ++itr)
996  {
997  // get the starting point for accessing x
998  IT chi = ( get<2>(*itr) << rowlowbits);
999  const RHS * __restrict subx = &x[chi];
1000 
1001  IT nzbeg = get<0>(*itr);
1002  IT nzend = get<1>(*itr);
1003 
1004  for (IT k = nzbeg ; k < nzend ; ++k)
1005  {
1006  // Note the swap in cli/rli
1007  IT cli = ((r_bot[k] >> collowbits) & lowrowmask);
1008  IT rli = (r_bot[k] & lowcolmask);
1009  SR::axpy(subx[cli], suby[rli]); // suby [rli] += subx [cli] where subx and suby are vectors.
1010  }
1011  }
1012 }
1013 
1014 template <class NT, class IT>
1015 template <typename SR, typename RHS, typename LHS>
1016 void BiCsb<NT, IT>::SubSpMVTrans(IT col, IT rowstart, IT rowend, const RHS * __restrict x, LHS * __restrict suby) const
1017 {
1018  IT * __restrict r_bot = bot;
1019  NT * __restrict r_num = num;
1020  for(IT i= rowstart; i < rowend; ++i)
1021  {
1022  // get the starting point for accessing x
1023  IT chi = (i << rowlowbits);
1024  const RHS * __restrict subx = &x[chi];
1025 
1026  for (IT k = top[i][col] ; k < top[i][col+1] ; ++k)
1027  {
1028  // Note the swap in cli/rli
1029  IT cli = ((r_bot[k] >> collowbits) & lowrowmask);
1030  IT rli = (r_bot[k] & lowcolmask);
1031  SR::axpy(r_num[k], subx[cli], suby[rli]); // suby [rli] += r_num[k] * subx [cli] where subx and suby are vectors.
1032  }
1033  }
1034 }
1035 
1036 
1037 template <class IT>
1038 template <typename SR, typename RHS, typename LHS>
1039 void BiCsb<bool, IT>::SubSpMVTrans(IT col, IT rowstart, IT rowend, const RHS * __restrict x, LHS * __restrict suby) const
1040 {
1041  IT * __restrict r_bot = bot;
1042  for(IT i= rowstart; i < rowend; ++i)
1043  {
1044  // get the starting point for accessing x
1045  IT chi = (i << rowlowbits);
1046  const RHS * __restrict subx = &x[chi];
1047  for (IT k = top[i][col] ; k < top[i][col+1] ; ++k)
1048  {
1049  // Note the swap in cli/rli
1050  IT cli = ((r_bot[k] >> collowbits) & lowrowmask);
1051  IT rli = (r_bot[k] & lowcolmask);
1052  SR::axpy(subx[cli], suby[rli]); // suby [rli] += subx [cli] where subx and suby are vectors.
1053  }
1054  }
1055 }
1056 
1057 // Parallelize the block itself (A*x version)
1058 // start/end: element start/end positions (indices to the bot array)
1059 // bot[start...end] always fall in the same block
1060 // PRECONDITION: rangeend-rangebeg is a power of two
1061 // TODO: we rely on the particular implementation of lower_bound for correctness, which is dangerous!
1062 // what if the lhs (instead of the rhs) parameter to the comparison object were the splitter?
1063 template <class NT, class IT>
1064 template <typename SR, typename RHS, typename LHS>
1065 void BiCsb<NT, IT>::BlockPar(IT start, IT end, const RHS * __restrict subx, LHS * __restrict suby,
1066  IT rangebeg, IT rangeend, IT cutoff) const
1067 {
1068  assert(IsPower2(rangeend-rangebeg));
1069  if(end - start < cutoff)
1070  {
1071  IT * __restrict r_bot = bot;
1072  NT * __restrict r_num = num;
1073  for (IT k = start ; k < end ; ++k)
1074  {
1075  IT rli = ((r_bot[k] >> collowbits) & lowrowmask);
1076  IT cli = (r_bot[k] & lowcolmask);
1077  SR::axpy(r_num[k], subx[cli], suby[rli]); // suby [rli] += r_num[k] * subx [cli] where subx and suby are vectors.
1078  }
1079  }
1080  else
1081  {
1082  // Lower_bound is a version of binary search: it attempts to find the element value in an ordered range [first, last)
1083  // Specifically, it returns the first position where value could be inserted without violating the ordering
1084  IT halfrange = (rangebeg+rangeend)/2;
1085  IT qrt1range = (rangebeg+halfrange)/2;
1086  IT qrt3range = (halfrange+rangeend)/2;
1087 
1088  IT * mid = std::lower_bound(&bot[start], &bot[end], halfrange, mortoncmp);
1089  IT * left = std::lower_bound(&bot[start], mid, qrt1range, mortoncmp);
1090  IT * right = std::lower_bound(mid, &bot[end], qrt3range, mortoncmp);
1091 
1092  /* -------
1093  | 0 2 |
1094  | 1 3 |
1095  ------- */
1096  // subtracting two pointers pointing to the same array gives you the # of elements separating them
1097  // we're *sure* that the differences are 1) non-negative, 2) small enough to be indexed by an IT
1098  IT size0 = static_cast<IT> (left - &bot[start]);
1099  IT size1 = static_cast<IT> (mid - left);
1100  IT size2 = static_cast<IT> (right - mid);
1101  IT size3 = static_cast<IT> (&bot[end] - right);
1102 
1103  IT ncutoff = std::max<IT>(cutoff/2, MINNNZTOPAR);
1104 
1105  // We can choose to perform [0,3] in parallel and then [1,2] in parallel
1106  // or perform [0,1] in parallel and then [2,3] in parallel
1107  // Decision is based on the balance, i.e. we pick the more balanced parallelism
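// e.g. if (size0, size1, size2, size3) = (10, 2, 3, 9), pairing [0,3] then [1,2]
// costs |10-9| + |2-3| = 2, while pairing [0,1] then [2,3] costs |10-2| + |3-9| = 14,
// so the first schedule is chosen.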
1108  if( ( absdiff(size0,size3) + absdiff(size1,size2) ) < ( absdiff(size0,size1) + absdiff(size2,size3) ) )
1109  {
1110  cilk_spawn BlockPar<SR>(start, start+size0, subx, suby, rangebeg, qrt1range, ncutoff); // multiply subblock_0
1111  BlockPar<SR>(end-size3, end, subx, suby, qrt3range, rangeend, ncutoff); // multiply subblock_3
1112  cilk_sync;
1113 
1114  cilk_spawn BlockPar<SR>(start+size0, start+size0+size1, subx, suby, qrt1range, halfrange, ncutoff); // multiply subblock_1
1115  BlockPar<SR>(start+size0+size1, end-size3, subx, suby, halfrange, qrt3range, ncutoff); // multiply subblock_2
1116  cilk_sync;
1117  }
1118  else
1119  {
1120  cilk_spawn BlockPar<SR>(start, start+size0, subx, suby, rangebeg, qrt1range, ncutoff); // multiply subblock_0
1121  BlockPar<SR>(start+size0, start+size0+size1, subx, suby, qrt1range, halfrange, ncutoff); // multiply subblock_1
1122  cilk_sync;
1123 
1124  cilk_spawn BlockPar<SR>(start+size0+size1, end-size3, subx, suby, halfrange, qrt3range, ncutoff); // multiply subblock_2
1125  BlockPar<SR>(end-size3, end, subx, suby, qrt3range, rangeend, ncutoff); // multiply subblock_3
1126  cilk_sync;
1127  }
1128  }
1129 }
1130 
1131 
1132 template <class IT>
1133 template <typename SR, typename RHS, typename LHS>
1134 void BiCsb<bool, IT>::BlockPar(IT start, IT end, const RHS * __restrict subx, LHS * __restrict suby,
1135  IT rangebeg, IT rangeend, IT cutoff) const
1136 {
1137  assert(IsPower2(rangeend-rangebeg));
1138  if(end - start < cutoff)
1139  {
1140  IT * __restrict r_bot = bot;
1141  for (IT k = start ; k < end ; ++k)
1142  {
1143  IT rli = ((r_bot[k] >> collowbits) & lowrowmask);
1144  IT cli = (r_bot[k] & lowcolmask);
1145  SR::axpy(subx[cli], suby[rli]); // suby [rli] += subx [cli] where subx and suby are vectors.
1146  }
1147  }
1148  else
1149  {
1150  // Lower_bound is a version of binary search: it attempts to find the element value in an ordered range [first, last)
1151  // Specifically, it returns the first position where value could be inserted without violating the ordering
1152  IT halfrange = (rangebeg+rangeend)/2;
1153  IT qrt1range = (rangebeg+halfrange)/2;
1154  IT qrt3range = (halfrange+rangeend)/2;
1155 
1156  IT * mid = std::lower_bound(&bot[start], &bot[end], halfrange, mortoncmp);
1157  IT * left = std::lower_bound(&bot[start], mid, qrt1range, mortoncmp);
1158  IT * right = std::lower_bound(mid, &bot[end], qrt3range, mortoncmp);
1159 
1160  /* -------
1161  | 0 2 |
1162  | 1 3 |
1163  ------- */
1164  // subtracting two pointers pointing to the same array gives you the # of elements separating them
1165  // we're *sure* that the differences are 1) non-negative, 2) small enough to be indexed by an IT
1166  IT size0 = static_cast<IT> (left - &bot[start]);
1167  IT size1 = static_cast<IT> (mid - left);
1168  IT size2 = static_cast<IT> (right - mid);
1169  IT size3 = static_cast<IT> (&bot[end] - right);
1170 
1171  IT ncutoff = std::max<IT>(cutoff/2, MINNNZTOPAR);
1172 
1173  // We can choose to perform [0,3] in parallel and then [1,2] in parallel
1174  // or perform [0,1] in parallel and then [2,3] in parallel
1175  // Decision is based on the balance, i.e. we pick the more balanced parallelism
1176  if( ( absdiff(size0,size3) + absdiff(size1,size2) ) < ( absdiff(size0,size1) + absdiff(size2,size3) ) )
1177  {
1178  cilk_spawn BlockPar<SR>(start, start+size0, subx, suby, rangebeg, qrt1range, ncutoff); // multiply subblock_0
1179  BlockPar<SR>(end-size3, end, subx, suby, qrt3range, rangeend, ncutoff); // multiply subblock_3
1180  cilk_sync;
1181 
1182  cilk_spawn BlockPar<SR>(start+size0, start+size0+size1, subx, suby, qrt1range, halfrange, ncutoff); // multiply subblock_1
1183  BlockPar<SR>(start+size0+size1, end-size3, subx, suby, halfrange, qrt3range, ncutoff); // multiply subblock_2
1184  cilk_sync;
1185  }
1186  else
1187  {
1188  cilk_spawn BlockPar<SR>(start, start+size0, subx, suby, rangebeg, qrt1range, ncutoff); // multiply subblock_0
1189  BlockPar<SR>(start+size0, start+size0+size1, subx, suby, qrt1range, halfrange, ncutoff); // multiply subblock_1
1190  cilk_sync;
1191 
1192  cilk_spawn BlockPar<SR>(start+size0+size1, end-size3, subx, suby, halfrange, qrt3range, ncutoff); // multiply subblock_2
1193  BlockPar<SR>(end-size3, end, subx, suby, qrt3range, rangeend, ncutoff); // multiply subblock_3
1194  cilk_sync;
1195  }
1196  }
1197 }
1198 
1199 // Parallelize the block itself (A'*x version)
1200 // start/end: element start/end positions (indices to the bot array)
1201 // bot[start...end] always fall in the same block
1202 template <class NT, class IT>
1203 template <typename SR, typename RHS, typename LHS>
1204 void BiCsb<NT, IT>::BlockParT(IT start, IT end, const RHS * __restrict subx, LHS * __restrict suby,
1205  IT rangebeg, IT rangeend, IT cutoff) const
1206 {
1207  if(end - start < cutoff)
1208  {
1209  IT * __restrict r_bot = bot;
1210  NT * __restrict r_num = num;
1211  for (IT k = start ; k < end ; ++k)
1212  {
1213  // Note the swap in cli/rli
1214  IT cli = ((r_bot[k] >> collowbits) & lowrowmask);
1215  IT rli = (r_bot[k] & lowcolmask);
1216  SR::axpy(r_num[k], subx[cli], suby[rli]); // suby [rli] += r_num[k] * subx [cli] where subx and suby are vectors.
1217  }
1218  }
1219  else
1220  {
1221  IT halfrange = (rangebeg+rangeend)/2;
1222  IT qrt1range = (rangebeg+halfrange)/2;
1223  IT qrt3range = (halfrange+rangeend)/2;
1224 
1225  // Lower_bound is a version of binary search: it attempts to find the element value in an ordered range [first, last)
1226  // Specifically, it returns the first position where value could be inserted without violating the ordering
1227  IT * mid = std::lower_bound(&bot[start], &bot[end], halfrange, mortoncmp);
1228  IT * left = std::lower_bound(&bot[start], mid, qrt1range, mortoncmp);
1229  IT * right = std::lower_bound(mid, &bot[end], qrt3range, mortoncmp);
1230 
1231  /* -------
1232  | 0 1 |
1233  | 2 3 |
1234  ------- */
1235  // subtracting two pointers pointing to the same array gives you the # of elements separating them
1236  // we're *sure* that the differences are 1) non-negative, 2) small enough to be indexed by an IT
1237  IT size0 = static_cast<IT> (left - &bot[start]);
1238  IT size1 = static_cast<IT> (mid - left);
1239  IT size2 = static_cast<IT> (right - mid);
1240  IT size3 = static_cast<IT> (&bot[end] - right);
1241 
1242  IT ncutoff = std::max<IT>(cutoff/2, MINNNZTOPAR);
1243 
1244  // We can choose to perform [0,3] in parallel and then [1,2] in parallel
1245  // or perform [0,2] in parallel and then [1,3] in parallel
1246  // Decision is based on the balance, i.e. we pick the more balanced parallelism
1247  if( ( absdiff(size0,size3) + absdiff(size1,size2) ) < ( absdiff(size0,size2) + absdiff(size1,size3) ) )
1248  {
1249  cilk_spawn BlockParT<SR>(start, start+size0, subx, suby, rangebeg, qrt1range, ncutoff); // multiply subblock_0
1250  BlockParT<SR>(end-size3, end, subx, suby, qrt3range, rangeend, ncutoff); // multiply subblock_3
1251  cilk_sync;
1252 
1253  cilk_spawn BlockParT<SR>(start+size0, start+size0+size1, subx, suby, qrt1range, halfrange, ncutoff);// multiply subblock_1
1254  BlockParT<SR>(start+size0+size1, end-size3, subx, suby, halfrange, qrt3range, ncutoff); // multiply subblock_2
1255  cilk_sync;
1256  }
1257  else
1258  {
1259  cilk_spawn BlockParT<SR>(start, start+size0, subx, suby, rangebeg, qrt1range, ncutoff); // multiply subblock_0
1260  BlockParT<SR>(start+size0+size1, end-size3, subx, suby, halfrange, qrt3range, ncutoff); // multiply subblock_2
1261  cilk_sync;
1262 
1263  cilk_spawn BlockParT<SR>(start+size0, start+size0+size1, subx, suby, qrt1range, halfrange, ncutoff);// multiply subblock_1
1264  BlockParT<SR>(end-size3, end, subx, suby, qrt3range, rangeend, ncutoff); // multiply subblock_3
1265  cilk_sync;
1266  }
1267  }
1268 }
1269 
1270 
1271 template <class IT>
1272 template <typename SR, typename RHS, typename LHS>
1273 void BiCsb<bool, IT>::BlockParT(IT start, IT end, const RHS * __restrict subx, LHS * __restrict suby,
1274  IT rangebeg, IT rangeend, IT cutoff) const
1275 {
1276  if(end - start < cutoff)
1277  {
1278  IT * __restrict r_bot = bot;
1279  for (IT k = start ; k < end ; ++k)
1280  {
1281  // Note the swap in cli/rli
1282  IT cli = ((r_bot[k] >> collowbits) & lowrowmask);
1283  IT rli = (r_bot[k] & lowcolmask);
1284  SR::axpy(subx[cli], suby[rli]); // suby [rli] += subx [cli] where subx and suby are vectors.
1285  }
1286  }
1287  else
1288  {
1289  IT halfrange = (rangebeg+rangeend)/2;
1290  IT qrt1range = (rangebeg+halfrange)/2;
1291  IT qrt3range = (halfrange+rangeend)/2;
1292 
1293  // Lower_bound is a version of binary search: it attempts to find the element value in an ordered range [first, last)
1294  // Specifically, it returns the first position where value could be inserted without violating the ordering
1295  IT * mid = std::lower_bound(&bot[start], &bot[end], halfrange, mortoncmp);
1296  IT * left = std::lower_bound(&bot[start], mid, qrt1range, mortoncmp);
1297  IT * right = std::lower_bound(mid, &bot[end], qrt3range, mortoncmp);
1298 
1299  /* -------
1300  | 0 1 |
1301  | 2 3 |
1302  ------- */
1303  // subtracting two pointers pointing to the same array gives you the # of elements separating them
1304  // we're *sure* that the differences are 1) non-negative, 2) small enough to be indexed by an IT
1305  IT size0 = static_cast<IT> (left - &bot[start]);
1306  IT size1 = static_cast<IT> (mid - left);
1307  IT size2 = static_cast<IT> (right - mid);
1308  IT size3 = static_cast<IT> (&bot[end] - right);
1309 
1310  IT ncutoff = std::max<IT>(cutoff/2, MINNNZTOPAR);
1311 
1312  // We can choose to perform [0,3] in parallel and then [1,2] in parallel
1313  // or perform [0,2] in parallel and then [1,3] in parallel
1314  // Decision is based on the balance, i.e. we pick the more balanced parallelism
1315  if( ( absdiff(size0,size3) + absdiff(size1,size2) ) < ( absdiff(size0,size2) + absdiff(size1,size3) ) )
1316  {
1317  cilk_spawn BlockParT<SR>(start, start+size0, subx, suby, rangebeg, qrt1range, ncutoff); // multiply subblock_0
1318  BlockParT<SR>(end-size3, end, subx, suby, qrt3range, rangeend, ncutoff); // multiply subblock_3
1319  cilk_sync;
1320 
1321  cilk_spawn BlockParT<SR>(start+size0, start+size0+size1, subx, suby, qrt1range, halfrange, ncutoff);// multiply subblock_1
1322  BlockParT<SR>(start+size0+size1, end-size3, subx, suby, halfrange, qrt3range, ncutoff); // multiply subblock_2
1323  cilk_sync;
1324  }
1325  else
1326  {
1327  cilk_spawn BlockParT<SR>(start, start+size0, subx, suby, rangebeg, qrt1range, ncutoff); // multiply subblock_0
1328  BlockParT<SR>(start+size0+size1, end-size3, subx, suby, halfrange, qrt3range, ncutoff); // multiply subblock_2
1329  cilk_sync;
1330 
1331  cilk_spawn BlockParT<SR>(start+size0, start+size0+size1, subx, suby, qrt1range, halfrange, ncutoff);// multiply subblock_1
1332  BlockParT<SR>(end-size3, end, subx, suby, qrt3range, rangeend, ncutoff); // multiply subblock_3
1333  cilk_sync;
1334  }
1335  }
1336 }
1337 
1338 // Print stats to an ofstream object
1339 template <class NT, class IT>
1340 ofstream & BiCsb<NT, IT>::PrintStats(ofstream & outfile) const
1341 {
1342  if(nz == 0)
1343  {
1344  outfile << "## Matrix doesn't have any nonzeros" << endl;
1345  return outfile;
1346  }
1347  const IT ntop = nbr * nbc;
1348 
1349  outfile << "## Average block is of dimensions "<< lowrowmask+1 << "-by-" << lowcolmask+1 << endl;
1350  outfile << "## Number of real blocks is "<< ntop << endl;
1351  outfile << "## Row imbalance is " << RowImbalance(*this) << endl;
1352  outfile << "## Col imbalance is " << ColImbalance(*this) << endl;
1353  outfile << "## Block parallel calls is " << blockparcalls.get_value() << endl;
1354 
1355  std::vector<int> blocksizes(ntop);
1356  for(IT i=0; i<nbr; ++i)
1357  {
1358  for(IT j=0; j < nbc; ++j)
1359  {
1360  blocksizes[i*nbc+j] = static_cast<int> (top[i][j+1]-top[i][j]);
1361  }
1362  }
1363  sort(blocksizes.begin(), blocksizes.end());
1364  outfile<< "## Total nonzeros: "<< accumulate(blocksizes.begin(), blocksizes.end(), 0) << endl;
1365 
1366  outfile << "## Nonzero distribution (sorted) of blocks follows: \n" ;
1367  for(IT i=0; i< ntop; ++i)
1368  {
1369  outfile << blocksizes[i] << "\n";
1370  }
1371  outfile << endl;
1372  return outfile;
1373 }
1374 
1375 