Compressed Sparse Blocks  1.2
bmcsb.cpp
1 #include "bmcsb.h"
2 #include "utility.h"
3 
4 // Choose the block size as large as possible, subject to the following constraints:
5 // 1) The bot array is addressable by IT
6 // 2) The parts of the x & y vectors that a block touches fit into L2 cache [assuming a saxpy() operation]
7 // 3) There's enough parallel slackness for block rows (at least SLACKNESS * CILK_NPROC)
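// In short: the block dimension is beta = 2^rowlowbits = 2^collowbits (i.e. lowrowmask+1).
// The loops below shrink beta until the pieces of x and y that one block touches fit in L2
// and there is enough slackness across block rows; unless a beta is forced, log2(beta) is
// finally capped at ceil(log2((m*n)^(1/4))) + 2.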
8 template <class NT, class IT, unsigned TTDIM>
9 void BmCsb<NT, IT, TTDIM>::Init(int workers, IT forcelogbeta)
10 {
11  ispar = (workers > 1);
12  IT roundrowup = nextpoweroftwo(m);
13  IT roundcolup = nextpoweroftwo(n);
14 
15  // if indices are negative, highestbitset returns -1,
16  // but that will be caught by the sizereq below
17  IT rowbits = highestbitset(roundrowup);
18  IT colbits = highestbitset(roundcolup);
19  bool sizereq;
20  if (ispar)
21  {
22  sizereq = ((IntPower<2>(rowbits) > SLACKNESS * workers)
23  && (IntPower<2>(colbits) > SLACKNESS * workers));
24  }
25  else
26  {
27  sizereq = ((rowbits > 1) && (colbits > 1));
28  }
29  if(!sizereq)
30  {
31  cerr << "Matrix too small for this library" << endl;
32  return;
33  }
34 
35  rowlowbits = rowbits-1;
36  collowbits = colbits-1;
37  IT inf = numeric_limits<IT>::max();
38  IT maxbits = highestbitset(inf);
39 
40  rowhighbits = rowbits-rowlowbits; // # higher order bits for rows (has at least one bit)
41  colhighbits = colbits-collowbits; // # higher order bits for cols (has at least one bit)
42  if(ispar)
43  {
44  while(IntPower<2>(rowhighbits) < SLACKNESS * workers)
45  {
46  rowhighbits++;
47  rowlowbits--;
48  }
49  }
50 
51  // calculate the space that suby occupies in L2 cache
52  IT yL2 = IntPower<2>(rowlowbits) * sizeof(NT);
53  while(yL2 > L2SIZE)
54  {
55  yL2 /= 2;
56  rowhighbits++;
57  rowlowbits--;
58  }
59 
60  // calculate the space that subx occupies in L2 cache
61  IT xL2 = IntPower<2>(collowbits) * sizeof(NT);
62  while(xL2 > L2SIZE)
63  {
64  xL2 /= 2;
65  colhighbits++;
66  collowbits--;
67  }
68 
69  // blocks need to be square for correctness (maybe generalize this later?)
70  while(rowlowbits+collowbits > maxbits)
71  {
72  if(rowlowbits > collowbits)
73  {
74  rowhighbits++;
75  rowlowbits--;
76  }
77  else
78  {
79  colhighbits++;
80  collowbits--;
81  }
82  }
83  while(rowlowbits > collowbits)
84  {
85  rowhighbits++;
86  rowlowbits--;
87  }
88  while(rowlowbits < collowbits)
89  {
90  colhighbits++;
91  collowbits--;
92  }
93  assert (collowbits == rowlowbits);
94  lowrowmask = IntPower<2>(rowlowbits) - 1;
95  lowcolmask = IntPower<2>(collowbits) - 1;
96  if(forcelogbeta != 0)
97  {
98  IT candlowmask = IntPower<2>(forcelogbeta) -1;
99  cout << "Forcing beta to "<< (candlowmask+1) << " instead of the chosen " << (lowrowmask+1) << endl;
100  cout << "Warning : No checks are performed on the beta you have forced, anything can happen !" << endl;
101  lowrowmask = lowcolmask = candlowmask;
102  rowlowbits = collowbits = forcelogbeta;
103  rowhighbits = rowbits-rowlowbits;
104  colhighbits = colbits-collowbits;
105  }
106  else
107  {
108  double sqrtn = sqrt(sqrt(static_cast<double>(m) * static_cast<double>(n)));
109  IT logbeta = static_cast<IT>(ceil(log2(sqrtn))) + 2;
110  if(rowlowbits > logbeta)
111  {
112  rowlowbits = collowbits = logbeta;
113  lowrowmask = lowcolmask = IntPower<2>(logbeta) -1;
114  rowhighbits = rowbits-rowlowbits;
115  colhighbits = colbits-collowbits;
116  }
117  cout << "Beta chosen to be "<< (lowrowmask+1) << endl;
118  }
119  highrowmask = ((roundrowup - 1) ^ lowrowmask);
120  highcolmask = ((roundcolup - 1) ^ lowcolmask);
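 // The high masks cover the bits above the low (within-block) bits, so
 // (r & highrowmask) >> rowlowbits is the block-row index of row r, and likewise for columns.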
121 
122  // nbc = #{block columns} = #{blocks in any block row}, nbr = #{block rows}
123  IT blcdimrow = lowrowmask + 1;
124  IT blcdimcol = lowcolmask + 1;
125  nbr = static_cast<IT>(ceil(static_cast<double>(m) / static_cast<double>(blcdimrow)));
126  nbc = static_cast<IT>(ceil(static_cast<double>(n) / static_cast<double>(blcdimcol)));
127 
128  blcrange = (lowrowmask+1) * (lowcolmask+1); // range indexed by one block
129  mortoncmp = MortonCompare<IT>(rowlowbits, collowbits, lowrowmask, lowcolmask);
130 }
131 
132 
133 // copy constructor
134 template <class NT, class IT, unsigned TTDIM>
135 BmCsb<NT, IT, TTDIM>::BmCsb (const BmCsb<NT, IT, TTDIM> & rhs)
136 : nz(rhs.nz), m(rhs.m), n(rhs.n), blcrange(rhs.blcrange), nbr(rhs.nbr), nbc(rhs.nbc), nrb(rhs.nrb),
137 rowhighbits(rhs.rowhighbits), rowlowbits(rhs.rowlowbits), highrowmask(rhs.highrowmask), lowrowmask(rhs.lowrowmask),
138 colhighbits(rhs.colhighbits), collowbits(rhs.collowbits), highcolmask(rhs.highcolmask), lowcolmask(rhs.lowcolmask),
139 mortoncmp(rhs.mortoncmp), ispar(rhs.ispar)
140 {
141  if(nz > 0) // nz > 0 iff nrb > 0
142  {
143  num = new NT[nz+2](); num++;
144  bot = new IT[nrb];
145  masks = new MTYPE[nrb];
146 
147  copy ( rhs.num, rhs.num+nz+1, num);
148  copy ( rhs.bot, rhs.bot+nrb, bot );
149  copy ( rhs.masks, rhs.masks+nrb, masks );
150  }
151  if ( nbr > 0)
152  {
153  top = new IT* [nbr];
154  for(IT i=0; i<nbr; ++i)
155  top[i] = new IT[nbc+1];
156  for(IT i=0; i<nbr; ++i)
157  for(IT j=0; j <= nbc; ++j)
158  top[i][j] = rhs.top[i][j];
159  }
160 }
161 
162 template <class NT, class IT, unsigned TTDIM>
163 BmCsb<NT, IT, TTDIM> & BmCsb<NT, IT, TTDIM>::operator=(const BmCsb<NT, IT, TTDIM> & rhs)
164 {
165  if(this != &rhs)
166  {
167  if(nz > 0) // if the existing object is not empty
168  {
169  // make it empty
170  delete [] masks;
171  delete [] bot;
172  delete [] (--num);
173  }
174  if(nbr > 0)
175  {
176  for(IT i=0; i<nbr; ++i)
177  delete [] top[i];
178  delete [] top;
179  }
180 
181  ispar = rhs.ispar;
182  nz = rhs.nz;
183  nrb = rhs.nrb;
184  n = rhs.n;
185  m = rhs.m;
186  nbr = rhs.nbr;
187  nbc = rhs.nbc;
188  blcrange = rhs.blcrange;
189 
190  rowhighbits = rhs.rowhighbits;
191  rowlowbits = rhs.rowlowbits;
192  highrowmask = rhs.highrowmask;
193  lowrowmask = rhs.lowrowmask;
194 
195  colhighbits = rhs.colhighbits;
196  collowbits = rhs.collowbits;
197  highcolmask = rhs.highcolmask;
198  lowcolmask= rhs.lowcolmask;
199  mortoncmp = rhs.mortoncmp;
200 
201  if(nz > 0) // if the copied object is not empty
202  {
203  num = new NT[nz+2](); num++;
204  bot = new IT[nrb];
205  masks = new MTYPE[nrb];
206 
207  copy ( rhs.num, rhs.num+nz+1, num);
208  copy ( rhs.bot, rhs.bot+nrb, bot );
209  copy ( rhs.masks, rhs.masks+nrb, masks );
210  }
211  if(nbr > 0)
212  {
213  top = new IT* [nbr];
214  for(IT i=0; i<nbr; ++i)
215  top[i] = new IT[nbc+1];
216  for(IT i=0; i<nbr; ++i)
217  for(IT j=0; j <= nbc; ++j)
218  top[i][j] = rhs.top[i][j];
219  }
220  }
221  return *this;
222 }
223 
224 template <class NT, class IT, unsigned TTDIM>
225 BmCsb<NT, IT, TTDIM>::~BmCsb()
226 {
227  if( nz > 0)
228  {
229  delete [] masks;
230  delete [] bot;
231  delete [] (--num);
232  }
233  if ( nbr > 0)
234  {
235  for(IT i=0; i<nbr; ++i)
236  delete [] top[i];
237  delete [] top;
238  }
239 }
240 
241 template <class NT, class IT, unsigned TTDIM>
242 BmCsb<NT, IT, TTDIM>::BmCsb (Csc<NT, IT> & csc, int workers):nz(csc.nz), m(csc.m),n(csc.n)
243 {
244  typedef std::pair<IT, IT> ipair;
245  typedef std::pair<IT, ipair> mypair;
246 
247  assert(nz != 0 && n != 0 && m != 0);
248  Init(workers);
249 
250  num = new NT[nz+2](); num++; // Padding for SSEspmv (the blendv operation)
251  // bot will later be sized to nrb (the number of register blocks);
252  // nrb <= nz, since in the worst case each register block contains only one nonzero
253 
254  top = allocate2D<IT>(nbr, nbc+1);
255  mypair * pairarray = new mypair[nz];
256  IT k = 0;
257  for(IT j = 0; j < n; ++j)
258  {
259  for (IT i = csc.jc [j] ; i < csc.jc[j+1] ; ++i) // scan the jth column
260  {
261  // concatenate the higher/lower order half of both row (first) index and col (second) index bits
262  IT hindex = (((highrowmask & csc.ir[i] ) >> rowlowbits) << colhighbits)
263  | ((highcolmask & j) >> collowbits);
264  IT lindex = ((lowrowmask & csc.ir[i]) << collowbits) | (lowcolmask & j) ;
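 // Illustrative example with 2 low bits per dimension (beta = 4): row 13 = 0b1101, col 6 = 0b0110
 // -> block coordinates (0b11, 0b01) are packed into hindex, and the within-block
 // coordinates (0b01, 0b10) give lindex = 0b0110.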
265 
266  // i => location of that nonzero in csc.ir and csc.num arrays
267  pairarray[k++] = mypair(hindex, ipair(lindex,i));
268  }
269  }
270  sort(pairarray, pairarray+nz); // sort according to hindex
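 // hindex enumerates blocks in row-major order over the (block row, block column) grid
 // (leading dimension 2^colhighbits), so after this sort all nonzeros of a block are
 // consecutive and blocks appear in the order SortBlocks scans them.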
271  SortBlocks(pairarray, csc.num);
272  delete [] pairarray;
273 }
274 
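// SortBlocks packs the (already block-sorted) nonzeros into the final data structure:
// top[i][j]..top[i][j+1] delimit the register blocks of block (i,j) inside bot/masks,
// bot holds the top-left within-block offset of each register block, masks its nonzero
// pattern (one bit per position), and num the values in the same order.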
275 template <class NT, class IT, unsigned TTDIM>
276 void BmCsb<NT, IT, TTDIM>::SortBlocks(pair<IT, pair<IT,IT> > * pairarray, NT * val)
277 {
278  typedef pair<IT, pair<IT, IT> > mypair;
279  IT cnz = 0;
280  IT crb = 0; // current register block
281  IT ldim = IntPower<2>(colhighbits); // leading dimension (not always equal to nbc)
282  vector<IT> tempbot;
283  vector<MTYPE> M;
284  for(IT i = 0; i < nbr; ++i)
285  {
286  for(IT j = 0; j < nbc; ++j)
287  {
288  top[i][j] = tempbot.size(); // top array now points to register blocks (instead of nonzeros)
289  IT prevcnz = cnz;
290  std::vector<mypair> blocknz;
291  while(cnz < nz && pairarray[cnz].first == ((i*ldim)+j) ) // as long as we're in this block
292  {
293  IT lowbits = pairarray[cnz].second.first;
294  IT rlowbits = ((lowbits >> collowbits) & lowrowmask);
295  IT clowbits = (lowbits & lowcolmask);
296  IT bikey = BitInterleaveLow(rlowbits, clowbits);
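 // BitInterleaveLow interleaves the within-block row and column bits, so sorting by
 // bikey below puts this block's nonzeros into Morton (Z-order) sequence.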
297 
298  blocknz.push_back(mypair(bikey, pairarray[cnz++].second));
299  }
300  // sort the block into bit-interleaved order
301  sort(blocknz.begin(), blocknz.end());
302 
303  int lastregblk = -1;
304  IT bnz = blocknz.size();
305 
306  for(IT bcur=0; bcur < bnz; ++bcur)
307  {
308  int curregblk = getDivident(blocknz[bcur].first, RBSIZE);
309  if(curregblk > lastregblk) // new register block
310  {
311  lastregblk = curregblk;
312  M.push_back((MTYPE) 0);
313 
314  // The following lines implement a get_head function that returns
315  // the top-left index of the register block that this nonzero belongs to
316  IT Ci = blocknz[bcur].second.first & lowcolmask;
317  IT Ri = (blocknz[bcur].second.first >> collowbits) & lowrowmask;
318  Ci -= getModulo(Ci,RBDIM);
319  Ri -= getModulo(Ri,RBDIM);
320  IT lefttop = ((lowrowmask & Ri) << collowbits) | (lowcolmask & Ci);
321 
322  tempbot.push_back(lefttop);
323  }
324  M.back() |= GetMaskTable<MTYPE>(getModulo(blocknz[bcur].first, RBSIZE));
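 // One machine word summarizes each register block: OR-ing in GetMaskTable's bit marks
 // the position (0..RBSIZE-1, in bit-interleaved order) that this nonzero occupies.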
325  }
326  for(IT k=prevcnz; k<cnz ; ++k)
327  {
328  num[k] = val[blocknz[k-prevcnz].second.second];
329  }
330  }
331  top[i][nbc] = tempbot.size();
332  }
333  assert(M.size() == tempbot.size());
334  masks = new MTYPE[M.size()];
335  copy(M.begin(), M.end(), masks);
336 
337  bot = new IT[tempbot.size()];
338  copy(tempbot.begin(), tempbot.end(), bot);
339  nrb = tempbot.size();
340 
341  assert(cnz == nz);
342 }
343 
344 
345 
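// BMult multiplies the blocks of one block row by x and accumulates into y (of size ysize).
// chunks[start..end] delimit groups of consecutive blocks; the recursion halves the chunk
// range, and if the spawned half may still be running (!SYNCHED) the second half writes
// into a temporary vector that is added to y after the sync, avoiding races.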
352 template <class NT, class IT, unsigned TTDIM>
353 void BmCsb<NT, IT, TTDIM>::BMult(IT** chunks, IT start, IT end, const NT * x, NT * y, IT ysize, IT * __restrict sumscan) const
354 {
355  assert(end-start > 0); // there should be at least one chunk
356  if (end-start == 1) // single chunk
357  {
358  if((chunks[end] - chunks[start]) == 1) // chunk consists of a single (normally dense) block
359  {
360  IT chi = ( (chunks[start] - chunks[0]) << collowbits);
361 
362  // m-chi > lowcolmask for all blocks except the last skinny tall one.
363  // if the last one is regular too, then it has m-chi = lowcolmask+1
364  if(ysize == (lowrowmask+1) && (m-chi) > lowcolmask ) // parallelize if it is a regular/complete block
365  {
366  const NT * __restrict subx = &x[chi];
367  BlockPar( *(chunks[start]) , *(chunks[end]), subx, y, 0, blcrange, BREAKNRB * ysize, sumscan);
368  }
369  else // otherwise block parallelization will fail
370  {
371  SubSpMV(chunks[0], chunks[start]-chunks[0], chunks[end]-chunks[0], x, y, sumscan);
372  }
373  }
374  else // a number of sparse blocks with a total of at most O(\beta) nonzeros
375  {
376  SubSpMV(chunks[0], chunks[start]-chunks[0], chunks[end]-chunks[0], x, y, sumscan);
377  }
378  }
379  else
380  {
381  IT mid = (start+end)/2; // divide the chunk range in half
382  cilk_spawn BMult(chunks, start, mid, x, y, ysize, sumscan);
383  if(SYNCHED)
384  {
385  BMult(chunks, mid, end, x, y, ysize, sumscan);
386  }
387  else
388  {
389  NT * temp = new NT[ysize];
390  std::fill_n(temp, ysize, 0.0);
391  BMult(chunks, mid, end, x, temp, ysize, sumscan);
392  cilk_sync;
393  for(IT i=0; i<ysize; ++i)
394  y[i] += temp[i];
395  delete [] temp;
396  }
397  }
398 }
399 
400 
401 
402 // Parallelize the block itself (A*x version)
403 // start/end: element start/end positions (indices to the bot array)
404 // bot[start...end] always fall in the same block
405 // PRECONDITION: rangeend-rangebeg is a power of two
406 // TODO: we rely on the particular implementation of lower_bound for correctness, which is dangerous !
407 // what if lhs (instead of rhs) parameter to the comparison object is the splitter?
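// sumscan[k] gives the offset into num of the first nonzero of register block k
// (presumably a prefix scan over the per-register-block nonzero counts, supplied by the caller)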
408 template <class NT, class IT, unsigned TTDIM>
409 void BmCsb<NT, IT, TTDIM>::BlockPar(IT start, IT end, const NT * __restrict subx, NT * __restrict suby,
410  IT rangebeg, IT rangeend, IT cutoff, IT * __restrict sumscan) const
411 {
412  assert(IsPower2(rangeend-rangebeg));
413  if(end - start < cutoff)
414  {
415  SSEspmv(num + sumscan[start], masks + start, bot + start, end-start, subx, suby, lowcolmask, lowrowmask, collowbits);
416  }
417  else
418  {
419  // lower_bound performs a binary search over the ordered range [first, last):
420  // it returns the first position where value could be inserted without violating the ordering
421  IT halfrange = (rangebeg+rangeend)/2;
422  IT qrt1range = (rangebeg+halfrange)/2;
423  IT qrt3range = (halfrange+rangeend)/2;
424 
425  IT * mid = std::lower_bound(&bot[start], &bot[end], halfrange, mortoncmp);
426  IT * left = std::lower_bound(&bot[start], mid, qrt1range, mortoncmp);
427  IT * right = std::lower_bound(mid, &bot[end], qrt3range, mortoncmp);
428 
429  /* -------
430  | 0 2 |
431  | 1 3 |
432  ------- */
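 // bot[start..end) is sorted in Morton order, so each quadrant of the block occupies a
 // contiguous subrange; the three lower_bound calls above find the quadrant boundaries.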
433  // subtracting two pointers pointing to the same array gives you the # of elements separating them
434  // we're *sure* that the differences are 1) non-negative, 2) small enough to be indexed by an IT
435  IT size0 = static_cast<IT> (left - &bot[start]);
436  IT size1 = static_cast<IT> (mid - left);
437  IT size2 = static_cast<IT> (right - mid);
438  IT size3 = static_cast<IT> (&bot[end] - right);
439 
440  IT ncutoff = std::max<IT>(cutoff/2, MINNRBTOPAR);
441 
442  // We can choose to perform [0,3] in parallel and then [1,2] in parallel
443  // or perform [0,1] in parallel and then [2,3] in parallel
444  // Decision is based on the balance, i.e. we pick the more balanced parallelism
445  if( ( absdiff(size0,size3) + absdiff(size1,size2) ) < ( absdiff(size0,size1) + absdiff(size2,size3) ) )
446  {
447  cilk_spawn BlockPar(start, start+size0, subx, suby, rangebeg, qrt1range, ncutoff,sumscan); // multiply subblock_0
448  BlockPar(end-size3, end, subx, suby, qrt3range, rangeend, ncutoff,sumscan); // multiply subblock_3
449  cilk_sync;
450 
451  cilk_spawn BlockPar(start+size0, start+size0+size1, subx, suby, qrt1range, halfrange, ncutoff,sumscan); // multiply subblock_1
452  BlockPar(start+size0+size1, end-size3, subx, suby, halfrange, qrt3range, ncutoff,sumscan); // multiply subblock_2
453  cilk_sync;
454  }
455  else
456  {
457  cilk_spawn BlockPar(start, start+size0, subx, suby, rangebeg, qrt1range, ncutoff,sumscan); // multiply subblock_0
458  BlockPar(start+size0, start+size0+size1, subx, suby, qrt1range, halfrange, ncutoff,sumscan); // multiply subblock_1
459  cilk_sync;
460 
461  cilk_spawn BlockPar(start+size0+size1, end-size3, subx, suby, halfrange, qrt3range, ncutoff,sumscan); // multiply subblock_2
462  BlockPar(end-size3, end, subx, suby, qrt3range, rangeend, ncutoff,sumscan); // multiply subblock_3
463  cilk_sync;
464  }
465  }
466 }
467 
468 
469 // double* restrict a; --> No aliases for a[0], a[1], ...
470 // bstart/bend: block start/end index (to the top array)
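// SubSpMV is the serial per-chunk kernel: it walks the given blocks of this block row in
// order and calls the SSE kernel once per nonempty block.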
471 template <class NT, class IT, unsigned TTDIM>
472 void BmCsb<NT, IT, TTDIM>::SubSpMV(IT * __restrict btop, IT bstart, IT bend, const NT * __restrict x, NT * __restrict suby, IT * __restrict sumscan) const
473 {
474  for (IT j = bstart ; j < bend ; ++j) // for all blocks inside that block row
475  {
476  IT chi = (j << collowbits); // starting column of block j (from its high-order column bits); &x[chi] is the slice of x this block touches
477 
478  if(btop[j+1] - btop[j] > 0)
479  {
480  SSEspmv(num + sumscan[btop[j]], masks + btop[j], bot + btop[j], btop[j+1]-btop[j], x+chi, suby, lowcolmask, lowrowmask, collowbits);
481  }
482  }
483 }
484 
485 
486 // Print stats to an ofstream object
487 template <class NT, class IT, unsigned TTDIM>
488 ofstream & BmCsb<NT, IT, TTDIM>::PrintStats(ofstream & outfile) const
489 {
490  if(nz == 0)
491  {
492  outfile << "## Matrix Doesn't have any nonzeros" <<endl;
493  return outfile;
494  }
495  const IT ntop = nbr * nbc;
496 
497  outfile << "## Average block is of dimensions "<< lowrowmask+1 << "-by-" << lowcolmask+1 << endl;
498  outfile << "## Number of real blocks is "<< ntop << endl;
499  outfile << "## Row imbalance is " << RowImbalance(*this) << endl;
500 
501  std::vector<int> blocksizes(ntop);
502  for(IT i=0; i<nbr; ++i)
503  {
504  for(IT j=0; j < nbc; ++j)
505  {
506  blocksizes[i*nbc+j] = static_cast<int> (top[i][j+1]-top[i][j]);
507  }
508  }
509  sort(blocksizes.begin(), blocksizes.end());
510  outfile<< "## Total number of nonzeros: " << nz << endl;
511  outfile<< "## Total number of register blocks: "<< accumulate(blocksizes.begin(), blocksizes.end(), 0) << endl;
512  outfile<< "## Average fill ratio is: " << static_cast<double>(nz) / static_cast<double>((RBSIZE * nrb)) << endl;
513  outfile<< "## The histogram of fill ratios within register blocks:" << endl;
514 
515  unsigned * counts = new unsigned[nrb];
516  popcountall(masks, counts, nrb);
517  printhistogram(counts, nrb, RBSIZE);
518  delete [] counts;
519 
520  outfile << "## Nonzero distribution (sorted) of blocks follows: \n" ;
521  for(IT i=0; i< ntop; ++i)
522  {
523  outfile << blocksizes[i] << "\n";
524  }
525  outfile << endl;
526  return outfile;
527 }