Compressed Sparse Blocks  1.2
 All Classes Files Functions Variables Typedefs Friends Macros Pages
bmsym.cpp
Go to the documentation of this file.
1 #include "bmsym.h"
2 #include "utility.h"
3 
4 // Choose block size as big as possible given the following constraints
5 // 1) The bot array is addressible by IT
6 // 2) The parts of x & y vectors that a block touches fits into L2 cache [assuming a saxpy() operation]
7 // 3) There's enough parallel slackness for block rows (at least SLACKNESS * CILK_NPROC)
8 template <class NT, class IT, unsigned TTDIM>
9 void BmSym<NT, IT, TTDIM>::Init(int workers, IT forcelogbeta)
10 {
11  ispar = (workers > 1);
12  IT roundup = nextpoweroftwo(n);
13 
14  // if indices are negative, highestbitset returns -1,
15  // but that will be caught by the sizereq below
16  IT nbits = highestbitset(roundup);
17  bool sizereq;
18  if (ispar)
19  {
20  sizereq = (IntPower<2>(nbits) > SLACKNESS * workers);
21  }
22  else
23  {
24  sizereq = (nbits > 1);
25  }
26  if(!sizereq)
27  {
28  cerr << "Matrix too small for this library" << endl;
29  return;
30  }
31 
32  nlowbits = nbits-1;
33  IT inf = numeric_limits<IT>::max();
34  IT maxbits = highestbitset(inf);
35 
36  nhighbits = nbits-nlowbits; // # higher order bits for rows (has at least one bit)
37  if(ispar)
38  {
39  while(IntPower<2>(nhighbits) < SLACKNESS * workers)
40  {
41  nhighbits++;
42  nlowbits--;
43  }
44  }
45 
46  // calculate the space that suby and subx occupy in L2 cache
47  IT yL2 = IntPower<2>(nlowbits) * sizeof(NT);
48  while(yL2 > L2SIZE)
49  {
50  yL2 /= 2;
51  nhighbits++;
52  nlowbits--;
53  }
54 
55  lowmask = IntPower<2>(nlowbits) - 1;
56  if(forcelogbeta != 0)
57  {
58  IT candlowmask = IntPower<2>(forcelogbeta) -1;
59  cout << "Forcing beta to "<< (candlowmask+1) << " instead of the chosen " << (lowmask+1) << endl;
60  cout << "Warning : No checks are performed on the beta you have forced, anything can happen !" << endl;
61  lowmask = candlowmask;
62  nlowbits = forcelogbeta;
63  nhighbits = nbits-nlowbits;
64  }
65  else
66  {
67  double sqrtn = sqrt(static_cast<double>(n)); IT logbeta = static_cast<IT>(ceil(log2(sqrtn))) + 2;
68  if(nlowbits > logbeta)
69  {
70  nlowbits = logbeta;
71  lowmask = IntPower<2>(logbeta) -1;
72  nhighbits = nbits-nlowbits;
73  }
74  cout << "Beta chosen to be "<< (lowmask+1) << endl;
75  }
76  highmask = ((roundup - 1) ^ lowmask);
77 
78  IT blcdim = lowmask + 1;
79  ncsb = static_cast<IT>(ceil(static_cast<double>(n) / static_cast<double>(blcdim)));
80 
81  blcrange = (lowmask+1) * (lowmask+1); // range indexed by one block
82  mortoncmp = MortCompSym<IT>(nlowbits, lowmask);
83 }
84 
85 
86 // copy constructor
87 template <class NT, class IT, unsigned TTDIM>
89 : nz(rhs.nz), n(rhs.n), blcrange(rhs.blcrange), ncsb(rhs.ncsb), nrb(rhs.nrb),
90 nhighbits(rhs.nhighbits), nlowbits(rhs.nlowbits), diagonal(rhs.diagonal),
91 highmask(rhs.highmask), lowmask(rhs.lowmask), mortoncmp(rhs.mortoncmp), ispar(rhs.ispar)
92 {
93  if(nz > 0) // nz > 0 iff nrb > 0
94  {
95  num = new NT[nz+2](); // pad from both sides
96  num++;
97 
98  bot = new IT[nrb];
99  masks = new MTYPE[nrb];
100  scansum = new IT[nrb];
101 
102  copy ( rhs.num, rhs.num+nz+1, num);
103  copy ( rhs.bot, rhs.bot+nrb, bot );
104  copy ( rhs.masks, rhs.masks+nrb, masks );
105  copy ( rhs.scansum, rhs.scansum+nrb, scansum );
106  }
107  if ( ncsb > 0)
108  {
109  top = new IT* [ncsb];
110  for(IT i=0; i<ncsb; ++i)
111  top[i] = new IT[ncsb-i+1];
112  for(IT i=0; i<ncsb; ++i)
113  for(IT j=0; j <= (ncsb-i); ++j)
114  top[i][j] = rhs.top[i][j];
115  }
116 }
117 
118 template <class NT, class IT, unsigned TTDIM>
120 {
121  if(this != &rhs)
122  {
123  if(nz > 0) // if the existing object is not empty
124  {
125  // make it empty
126  delete [] scansum;
127  delete [] masks;
128  delete [] bot;
129  delete [] (--num);
130  }
131  if(ncsb > 0)
132  {
133  for(IT i=0; i<ncsb; ++i)
134  delete [] top[i];
135  delete [] top;
136  }
137  ispar = rhs.ispar;
138  nz = rhs.nz;
139  nrb = rhs.nrb;
140  n = rhs.n;
141  ncsb = rhs.ncsb;
142  blcrange = rhs.blcrange;
143  mortoncmp = rhs.mortoncmp;
144  diagonal = rhs.diagonal;
145 
146  nhighbits = rhs.nhighbits;
147  nlowbits = rhs.nlowbits;
148  highmask = rhs.highmask;
149  lowmask = rhs.lowmask;
150 
151 
152  if(nz > 0) // if the copied object is not empty
153  {
154  num = new NT[nz+2](); num++;
155  bot = new IT[nrb];
156  masks = new MTYPE[nrb];
157  scansum = new IT[nrb];
158 
159  copy ( rhs.num, rhs.num+nz+1, num);
160  copy ( rhs.bot, rhs.bot+nrb, bot );
161  copy ( rhs.masks, rhs.masks+nrb, masks );
162  copy ( rhs.scansum, rhs.scansum+nrb, scansum );
163  }
164  if(ncsb > 0)
165  {
166  top = new IT* [ncsb];
167  for(IT i=0; i<ncsb; ++i)
168  top[i] = new IT[ncsb-i+1];
169  for(IT i=0; i<ncsb; ++i)
170  for(IT j=0; j <= (ncsb-i); ++j)
171  top[i][j] = rhs.top[i][j];
172  }
173  }
174  return *this;
175 }
176 
177 template <class NT, class IT, unsigned TTDIM>
179 {
180  if( nz > 0)
181  {
182  delete [] scansum;
183  delete [] masks;
184  delete [] bot;
185  delete [] (--num);
186  }
187  if ( ncsb > 0)
188  {
189  for(IT i=0; i<ncsb; ++i)
190  delete [] top[i];
191  delete [] top;
192  }
193 }
194 
195 template <class NT, class IT, unsigned TTDIM>
196 BmSym<NT, IT, TTDIM>::BmSym (Csc<NT, IT> & csc, int workers):nz(csc.nz), n(csc.n)
197 {
198  typedef std::pair<IT, IT> ipair;
199  typedef std::pair<IT, ipair> mypair;
200 
201  assert(nz != 0 && n != 0);
202  Init(workers);
203 
204  top = new IT* [ncsb];
205  for(IT i=0; i<ncsb; ++i)
206  top[i] = new IT[ncsb-i+1];
207 
208  mypair * pairarray = new mypair[nz];
209  IT k = 0;
210  for(IT j = 0; j < n; ++j)
211  {
212  for (IT i = csc.jc [j] ; i < csc.jc[j+1] ; ++i) // scan the jth column
213  {
214  // concatenate the higher/lower order half of both row (first) index and col (second) index bits
215  IT hindex = (((highmask & csc.ir[i] ) >> nlowbits) << nhighbits) | ((highmask & j) >> nlowbits);
216  IT lindex = ((lowmask & csc.ir[i]) << nlowbits) | (lowmask & j) ;
217 
218  // i => location of that nonzero in csc.ir and csc.num arrays^M
219  pairarray[k++] = mypair(hindex, ipair(lindex,i));
220  }
221  }
222  sort(pairarray, pairarray+nz); // sort according to hindex
223  SortBlocks(pairarray, csc.num);
224  delete [] pairarray;
225 }
226 
227 template <class NT, class IT, unsigned TTDIM>
228 void BmSym<NT, IT, TTDIM>::SortBlocks(pair<IT, pair<IT,IT> > * pairarray, NT * val)
229 {
230  typedef pair<IT, pair<IT, IT> > mypair;
231  IT cnz = 0;
232  IT crb = 0; // current register block
233  IT ldim = IntPower<2>(nhighbits); // leading dimension (not always equal to ncsb)
234  vector<NT> tempnum;
235  vector<IT> tempbot;
236  vector<MTYPE> M;
237  for(IT i = 0; i < ncsb; ++i)
238  {
239  for(IT j = 0; j < (ncsb-i); ++j)
240  {
241  top[i][j] = tempbot.size(); // top points to register blocks
242  IT prevcnz = cnz;
243  std::vector<mypair> blocknz;
244  while(cnz < nz && pairarray[cnz].first == ((i*ldim)+(j+i)) ) // as long as we're in this block
245  {
246  IT interlowbits = pairarray[cnz].second.first;
247  IT rlowbits = ((interlowbits >> nlowbits) & lowmask);
248  IT clowbits = (interlowbits & lowmask);
249  IT bikey = BitInterleaveLow(rlowbits, clowbits);
250 
251  if(j == 0 && rlowbits == clowbits)
252  {
253  diagonal.push_back(make_pair((i << nlowbits)+rlowbits, val[pairarray[cnz++].second.second]));
254  }
255  else
256  {
257  blocknz.push_back(mypair(bikey, pairarray[cnz++].second));
258  }
259  }
260  // sort the block into bitinterleaved order
261  sort(blocknz.begin(), blocknz.end());
262 
263  int lastregblk = -1;
264  for(typename vector<mypair>::iterator itr = blocknz.begin(); itr != blocknz.end(); ++itr)
265  {
266  tempnum.push_back( val[itr->second.second] );
267 
268  int curregblk = getDivident(itr->first, RBSIZE);
269  if(curregblk > lastregblk) // new register block
270  {
271  lastregblk = curregblk;
272  M.push_back((MTYPE) 0);
273 
274  // The following lines implement a get_head function that returns
275  // the top-left index of the register block that this nonzero belongs
276  IT Ci = itr->second.first & lowmask;
277  IT Ri = (itr->second.first >> nlowbits) & lowmask;
278  Ci -= getModulo(Ci,RBDIM);
279  Ri -= getModulo(Ri,RBDIM);
280  IT lefttop = ((lowmask & Ri) << nlowbits) | (lowmask & Ci);
281 
282  tempbot.push_back(lefttop);
283  }
284  M.back() |= GetMaskTable<MTYPE>(getModulo(itr->first, RBSIZE));
285  }
286  }
287  top[i][ncsb-i] = tempbot.size();
288  }
289 
290  assert (cnz == nz);
291  nz = tempnum.size(); // update the number of off-diagonal nonzeros
292  nrb = tempbot.size(); // update the number of off-diagonal register blocks
293  masks = new MTYPE[nrb];
294  scansum = new IT[nrb];
295  bot = new IT[nrb];
296  num = new NT[nz+2](); num++; // padded for blendv in both sides
297 
298  copy(M.begin(), M.end(), masks);
299  prescan(scansum, masks, nrb);
300  copy(tempbot.begin(), tempbot.end(), bot);
301  copy(tempnum.begin(), tempnum.end(), num);
302 }
303 
304 template<class NT, class IT, unsigned TTDIM>
305 void BmSym<NT, IT, TTDIM>::DivideIterationSpace(IT * & lspace, IT * & rspace, IT & lsize, IT & rsize, IT size, IT d) const
306 {
307  if(d == 1)
308  {
309  lsize = size-size/2;
310  rsize = size/2;
311  lspace = new IT[lsize];
312  rspace = new IT[rsize];
313  for(IT i=0; i<rsize; ++i) // we alternate indices
314  {
315  lspace[i] = 2*i;
316  rspace[i] = 2*i+1;
317  }
318  if(lsize > rsize)
319  {
320  lspace[lsize-1] = size-1;
321  }
322  }
323  else if(d == 2)
324  {
325  IT chunksfour = size/4; // we alternate chunks of two
326  IT rest = size - 4*chunksfour; // rest is modulus 4
327  lsize = 2*chunksfour;
328  rsize = 2*chunksfour;
329  if(rest > 2)
330  {
331  rsize += (rest-2);
332  lsize += 2;
333  }
334  else
335  {
336  lsize += rest;
337  }
338 
339  lspace = new IT[lsize];
340  rspace = new IT[rsize];
341 
342  for(IT i=0; i<chunksfour; ++i) // we alternate indices
343  {
344  lspace[2*i] = 4*i;
345  lspace[2*i+1] = 4*i+1;
346  rspace[2*i] = 4*i+2;
347  rspace[2*i+1] = 4*i+3;
348  }
349  if(rest == 3)
350  {
351  lspace[lsize-2] = size-3;
352  lspace[lsize-1] = size-2;
353  rspace[rsize-1] = size-1;
354  }
355  else if(rest == 2)
356  {
357  lspace[lsize-2] = size-2;
358  lspace[lsize-1] = size-1;
359  }
360  else if(rest == 1)
361  {
362  lspace[lsize-1] = size-1;
363  }
364  }
365 }
366 
367 template<class NT, class IT, unsigned TTDIM>
368 void BmSym<NT, IT, TTDIM>::MultAddAtomics(NT * __restrict y, const NT * __restrict x, const IT d) const
369 {
370  cilk_for(IT i=0; i< ncsb-d; ++i) // all blocks at the dth diagonal and beyond
371  {
372  IT rhi = (i << nlowbits);
373 
374  cilk_for(IT j=d; j < (ncsb-i); ++j)
375  {
376  IT chi = ((j+i) << nlowbits);
377  symcsr(num+scansum[top[i][j]], masks+top[i][j], bot+top[i][j], top[i][j+1]-top[i][j], x+chi, x+rhi, y+rhi, y+chi, lowmask, nlowbits);
378  }
379  }
380 }
381 
382 
383 template <class NT, class IT, unsigned TTDIM>
384 void BmSym<NT, IT, TTDIM>::MultMainDiag(NT * __restrict y, const NT * __restrict x) const
385 {
386  if(Imbalance(0) > 2 * BALANCETH) // factor of 2: main diagonal has twice as much parallelism as other diagonals
387  {
388  cilk_for(IT i=0; i< ncsb; ++i) // in main diagonal, j = i
389  {
390  IT hi = (i << nlowbits);
391 
392  if(i == (ncsb-1) && (n-hi) <= lowmask) // last iteration and it's irregular (can't parallelize)
393  {
394  SSEsym(num + scansum[top[i][0]], masks + top[i][0], bot + top[i][0], top[i][1]-top[i][0], x+hi, y+hi, lowmask, nlowbits);
395  }
396  else
397  {
398  BlockTriPar(top[i][0], top[i][1], x+hi, y+hi, 0, blcrange, BREAKNRB * (nlowbits+1));
399  }
400  }
401  }
402  else // No need for block parallelization
403  {
404  cilk_for(IT i=0; i< ncsb; ++i) // in main diagonal, j = i
405  {
406  IT hi = (i << nlowbits);
407  SSEsym(num + scansum[top[i][0]], masks + top[i][0], bot + top[i][0], top[i][1]-top[i][0], x+hi, y+hi, lowmask, nlowbits);
408  }
409  }
410 
411  const IT diagsize = diagonal.size();
412  cilk_for(IT i=0; i < diagsize; ++i)
413  {
414  y[diagonal[i].first] += diagonal[i].second * x[diagonal[i].first]; // process the diagonal
415  }
416 }
417 
418 
419 // Multiply the nth block diagonal
420 // which is composed of blocks A[i][i+n]
421 template <class NT, class IT, unsigned TTDIM>
422 void BmSym<NT, IT, TTDIM>::MultDiag(NT * __restrict y, const NT * __restrict x, const IT d) const
423 {
424  IT * lspace;
425  IT * rspace;
426  IT lsize, rsize;
427  DivideIterationSpace(lspace, rspace, lsize, rsize, ncsb-d, d);
428 
429  IT lsum = 0;
430  IT rsum = 0;
431  for(IT k=0; k<lsize; ++k)
432  {
433  lsum += top[lspace[k]][d+1] - top[lspace[k]][d];
434  }
435  for(IT k=0; k<rsize; ++k)
436  {
437  rsum += top[rspace[k]][d+1] - top[rspace[k]][d];
438  }
439  float lave = lsum / lsize;
440  float rave = rsum / rsize;
441 
442  cilk_for(IT i=0; i< lsize; ++i) // in the dth diagonal, j = i+d
443  {
444  IT rhi = (lspace[i] << nlowbits) ;
445  IT chi = ((lspace[i]+d) << nlowbits);
446  IT start = top[lspace[i]][d];
447  IT end = top[lspace[i]][d+1];
448 
449  if((top[lspace[i]][d+1] - top[lspace[i]][d] > BALANCETH * lave) // relative denser block
450  && (!(lspace[i] == (ncsb-d-1) && (n-chi) <= lowmask))) // and parallelizable
451  {
452  BlockPar(start, end, x+chi, x+rhi, y+rhi, y+chi, 0, blcrange, BREAKNRB * (nlowbits+1));
453  }
454  else
455  {
456  SSEsym(num + scansum[start], masks + start, bot + start, end-start, x+chi, x+rhi, y+rhi, y+chi, lowmask, nlowbits);
457  }
458  }
459  cilk_for(IT j=0; j< rsize; ++j)
460  {
461  IT rhi = (rspace[j] << nlowbits) ;
462  IT chi = ((rspace[j]+d) << nlowbits);
463  IT start = top[rspace[j]][d];
464  IT end = top[rspace[j]][d+1];
465 
466  if((top[rspace[j]][d+1] - top[rspace[j]][d] > BALANCETH * rave) // relative denser block
467  && (!(rspace[j] == (ncsb-d-1) && (n-chi) <= lowmask))) // and parallelizable
468  {
469  BlockPar(start, end, x+chi, x+rhi, y+rhi, y+chi, 0, blcrange, BREAKNRB * (nlowbits+1));
470  }
471  else
472  {
473  SSEsym(num + scansum[start], masks + start, bot + start, end-start, x+chi, x+rhi, y+rhi, y+chi, lowmask, nlowbits);
474  }
475  }
476  delete [] lspace;
477  delete [] rspace;
478 }
479 
480 // Block parallelization for upper triangular compressed sparse blocks
481 // start/end: element start/end positions (indices to the bot array)
482 // bot[start...end] always fall in the `same block
483 // PRECONDITION: rangeend-rangebeg is a power of two
484 template <class NT, class IT, unsigned TTDIM>
485 void BmSym<NT, IT, TTDIM>::BlockTriPar(IT start, IT end, const NT * __restrict subx, NT * __restrict suby,
486  IT rangebeg, IT rangeend, IT cutoff) const
487 {
488  assert(IsPower2(rangeend-rangebeg));
489  if(end - start < cutoff)
490  {
491  SSEsym(num + scansum[start], masks + start, bot + start, end-start, subx, suby, lowmask, nlowbits);
492  }
493  else
494  {
495  // Lower_bound is a version of binary search: it attempts to find the element value in an ordered range [first, last)
496  // Specifically, it returns the first position where value could be inserted without violating the ordering
497  IT halfrange = (rangebeg+rangeend)/2;
498  IT qrt1range = (rangebeg+halfrange)/2;
499  IT qrt3range = (halfrange+rangeend)/2;
500 
501  IT * mid = std::lower_bound(&bot[start], &bot[end], halfrange, mortoncmp); // divides in mid column
502  IT * right = std::lower_bound(mid, &bot[end], qrt3range, mortoncmp);
503 
504  /* -------
505  | 0 2 |
506  | 1 3 |
507  ------- */
508  // subtracting two pointers pointing to the same array gives you the # of elements separating them
509  // In the symmetric case, quadrant "1" doesn't exist (size1 = 0)
510  IT size0 = static_cast<IT> (mid - &bot[start]);
511  IT size2 = static_cast<IT> (right - mid);
512  IT size3 = static_cast<IT> (&bot[end] - right);
513 
514  IT ncutoff = std::max<IT>(cutoff/2, MINNRBTOPAR);
515 
516  cilk_spawn BlockTriPar(start, start+size0, subx, suby, rangebeg, qrt1range, ncutoff); // multiply subblock_0
517  BlockTriPar(end-size3, end, subx, suby, qrt3range, rangeend, ncutoff); // multiply subblock_3
518  cilk_sync;
519 
520  BlockPar(start+size0, end-size3, subx, subx, suby, suby, halfrange, qrt3range, ncutoff); // multiply subblock_2
521  }
522 }
523 
524 // Parallelize the block itself
525 // start/end: element start/end positions (indices to the bot array)
526 // bot[start...end] always fall in the same block
527 // PRECONDITION: rangeend-rangebeg is a power of two
528 // TODO: we rely on the particular implementation of lower_bound for correctness, which is dangerous !
529 // what if lhs (instead of rhs) parameter to the comparison object is the splitter?
530 template <class NT, class IT, unsigned TTDIM>
531 void BmSym<NT, IT, TTDIM>::BlockPar(IT start, IT end, const NT * __restrict subx, const NT * __restrict subx_mirror,
532  NT * __restrict suby, NT * __restrict suby_mirror, IT rangebeg, IT rangeend, IT cutoff) const
533 {
534  assert(IsPower2(rangeend-rangebeg));
535  if(end - start < cutoff)
536  {
537  // Aliasing is not an issue here. BlockPar is only called on off-diagonal register blocks
538  SSEsym(num + scansum[start], masks + start, bot + start, end-start, subx, subx_mirror, suby, suby_mirror, lowmask, nlowbits);
539  }
540  else
541  {
542  // Lower_bound is a version of binary search: it attempts to find the element value in an ordered range [first, last)
543  // Specifically, it returns the first position where value could be inserted without violating the ordering
544  IT halfrange = (rangebeg+rangeend)/2;
545  IT qrt1range = (rangebeg+halfrange)/2;
546  IT qrt3range = (halfrange+rangeend)/2;
547 
548  IT * mid = std::lower_bound(&bot[start], &bot[end], halfrange, mortoncmp);
549  IT * left = std::lower_bound(&bot[start], mid, qrt1range, mortoncmp);
550  IT * right = std::lower_bound(mid, &bot[end], qrt3range, mortoncmp);
551 
552  /* -------
553  | 0 2 |
554  | 1 3 |
555  ------- */
556  // subtracting two pointers pointing to the same array gives you the # of elements separating them
557  // we're *sure* that the differences are 1) non-negative, 2) small enough to be indexed by an IT
558  IT size0 = static_cast<IT> (left - &bot[start]);
559  IT size1 = static_cast<IT> (mid - left);
560  IT size2 = static_cast<IT> (right - mid);
561  IT size3 = static_cast<IT> (&bot[end] - right);
562 
563  IT ncutoff = std::max<IT>(cutoff/2, MINNRBTOPAR);
564 
565  // We only perform [0,3] in parallel and then [1,2] in parallel because the symmetric update causes races when
566  // performing [0,1] in parallel (as it would perform [0,2] in the fictitious lower triangular part)
567  cilk_spawn BlockPar(start, start+size0, subx, subx_mirror, suby, suby_mirror, rangebeg, qrt1range, ncutoff); // multiply subblock_0
568  BlockPar(end-size3, end, subx, subx_mirror, suby, suby_mirror, qrt3range, rangeend, ncutoff); // multiply subblock_3
569  cilk_sync;
570 
571  cilk_spawn BlockPar(start+size0, start+size0+size1, subx, subx_mirror, suby, suby_mirror, qrt1range, halfrange, ncutoff); // multiply subblock_1
572  BlockPar(start+size0+size1, end-size3, subx, subx_mirror, suby, suby_mirror, halfrange, qrt3range, ncutoff); // multiply subblock_2
573  cilk_sync;
574  }
575 }
576 
577 
578 // double* restrict a; --> No aliases for a[0], a[1], ...
579 // bstart/bend: block start/end index (to the top array)
580 template <class NT, class IT, unsigned TTDIM>
581 void BmSym<NT, IT, TTDIM>::SeqSpMV(const NT * __restrict x, NT * __restrict y) const
582 {
583  const IT diagsize = diagonal.size();
584  for(IT i=0; i < diagsize; ++i)
585  {
586  y[diagonal[i].first] += diagonal[i].second * x[diagonal[i].first]; // process the diagonal
587  }
588  for (IT i = 0 ; i < ncsb ; ++i) // for all block rows of A
589  {
590  IT rhi = (i << nlowbits);
591  for (IT j = 1 ; j < (ncsb-i) ; ++j) // for all blocks inside that block row
592  {
593  IT chi = ((j+i) << nlowbits);
594  SSEsym(num + scansum[top[i][j]], masks+top[i][j], bot+top[i][j], top[i][j+1]-top[i][j], x+chi, x+rhi, y+rhi, y+chi, lowmask, nlowbits);
595  }
596 
597  SSEsym(num + scansum[top[i][0]], masks+top[i][0], bot+top[i][0], top[i][1]-top[i][0], x+rhi, y+rhi, lowmask, nlowbits);
598  }
599 }
600 
601 // Imbalance in the dth block diagonal (the main diagonal is the 0th)
602 template <class NT, class IT,unsigned TTDIM>
603 float BmSym<NT, IT,TTDIM>::Imbalance(IT d) const
604 {
605  if(ncsb <= d+1)
606  {
607  return 0.0; // no such diagonal exist
608  }
609  // get the average without the last left-over blockrow
610  IT size = ncsb-d-1;
611  IT * sums = new IT[size];
612  for(size_t i=0; i< size; ++i)
613  {
614  sums[i] = top[i][d+1] - top[i][d];
615  }
616  IT max = *max_element(sums, sums+size);
617  IT mean = accumulate(sums, sums+size, 0.0) / size;
618  delete [] sums;
619 
620  return static_cast<float>(max) / mean;
621 }
622 
623 
624 // Total number of register blocks in the dth block diagonal (the main diagonal is the 0th)
625 template <class NT, class IT, unsigned TTDIM>
626 IT BmSym<NT, IT,TTDIM>::nrbsum(IT d) const
627 {
628  IT sum = 0;
629  for(size_t i=0; i< ncsb-d; ++i)
630  {
631  sum += (top[i][d+1] - top[i][d]);
632  }
633  return sum;
634 }
635 
636 // Print stats to an ofstream object
637 template <class NT, class IT, unsigned TTDIM>
638 ofstream & BmSym<NT, IT, TTDIM>::PrintStats(ofstream & outfile) const
639 {
640  if(nz == 0)
641  {
642  outfile << "## Matrix Doesn't have any nonzeros" <<endl;
643  return outfile;
644  }
645  const IT ntop = ncsb * ncsb;
646 
647  outfile << "## Average block is of dimensions "<< lowmask+1 << "-by-" << lowmask+1 << endl;
648  outfile << "## Average fill ratio is: " << static_cast<double>(nz) / static_cast<double>((RBSIZE * nrb)) << endl;
649  outfile << "## Number of real blocks is "<< ntop << endl;
650  outfile << "## Main (0th) block diagonal imbalance: " << Imbalance(0) << endl;
651  outfile << "## 1st block diagonal imbalance: " << Imbalance(1) << endl;
652  outfile << "## 2nd block diagonal imbalance: " << Imbalance(2) << endl;
653 
654  outfile << "## nrb ratios (block diagonal 0,1,2): " << static_cast<float>(nrbsum(0)) / nrb << ", "
655  << static_cast<float>(nrbsum(1)) / nrb << ", " << static_cast<float>(nrbsum(2)) / nrb << endl;
656  outfile << "## atomics ratio: " << static_cast<float>(nrb-nrbsum(0)-nrbsum(1)-nrbsum(2))/nrb << endl;
657 
658  outfile<< "## Total number of nonzeros: " << nz << endl;
659  outfile<< "## Total number of register blocks: "<< nrb << endl;
660  return outfile;
661 }
662 
663 
664 template <class NT, class IT, unsigned TTDIM>
665 ofstream & BmSym<NT, IT, TTDIM>::Dump(ofstream & outfile) const
666 {
667  for(IT i =0; i<ncsb; ++i)
668  {
669  for(IT j=0; j< (ncsb-i); ++j)
670  {
671  outfile << "Dumping A.top(" << i << "," << j << ")" << endl;
672  for(IT k=top[i][j]; k< top[i][j+1]; ++k)
673  {
674  IT rli = ((bot[k] >> nlowbits) & lowmask);
675  IT cli = bot[k] & lowmask;
676  outfile << "A(" << rli << "," << cli << ")=" << num[k] << endl;
677  }
678  }
679  }
680  return outfile;
681 }
unsigned int getModulo(unsigned int n, unsigned int d)
Definition: utility.h:496
unsigned int getDivident(unsigned int n, unsigned int d)
Definition: utility.h:502
unsigned int nextpoweroftwo(unsigned int v)
Definition: utility.h:401
#define MINNRBTOPAR
Definition: utility.h:139
~BmSym()
Definition: bmsym.cpp:178
#define SLACKNESS
Definition: utility.h:130
#define RBDIM
Definition: utility.h:128
unsigned prescan(unsigned *a, MTYPE *const M, int n)
Definition: utility.h:191
ofstream & Dump(ofstream &outfile) const
Definition: bmsym.cpp:665
#define BREAKNRB
Definition: utility.h:138
BmSym< NT, IT, TTDIM > & operator=(const BmSym< NT, IT, TTDIM > &rhs)
Definition: bmsym.cpp:119
Definition: bmsym.h:50
ITYPE BitInterleaveLow(ITYPE x, ITYPE y)
Definition: utility.h:344
unsigned IntPower< 2 >(unsigned exponent)
Definition: utility.h:387
#define BALANCETH
Definition: utility.h:127
bool IsPower2(T x)
Definition: utility.h:396
unsigned int highestbitset(unsigned __int64 v)
Definition: utility.h:423
Definition: csc.h:15
#define RBSIZE
Definition: utility.h:129
#define L2SIZE
Definition: utility.h:132
void SSEsym(const double *__restrict V, const unsigned char *__restrict M, const unsigned *__restrict bot, const unsigned nrb, const double *__restrict X, const double *__restrict XT, double *Y, double *YT, unsigned lowmask, unsigned nlbits)
Definition: SSEspmv.cpp:329
void symcsr(const double *__restrict V, const unsigned char *__restrict M, const unsigned *__restrict bot, const unsigned nrb, const double *__restrict X, const double *__restrict XT, double *Y, double *YT, unsigned lcmask, unsigned nlbits)
Definition: SSEspmv.cpp:222
BmSym()
Definition: bmsym.h:53
ofstream & PrintStats(ofstream &outfile) const
Definition: bmsym.cpp:638