Compressed Sparse Blocks  1.2
friends.h
Go to the documentation of this file.
1 #ifndef _FRIENDS_H_
2 #define _FRIENDS_H_
3
4 #include <iostream>
5 #include <algorithm>
6 #include "bicsb.h"
7 #include "bmcsb.h"
8 #include "bmsym.h"
9 #include "csbsym.h"
10 #include "utility.h"
11 #include "timer.gettimeofday.c"
12
13 using namespace std;
14
15 template <class NU, class IU>
16 class BiCsb;
17
18 template <class NU, class IU, unsigned UUDIM>
19 class BmCsb;
20
21 double prescantime;
22
23
24 #if (__GNUC__ == 4 && (__GNUC_MINOR__ < 7) )
25 #define emplace_back push_back
26 #endif
27
28
29
30 // SpMV with Bit-Masked CSB
31 // No semiring or type promotion support yet
32 template <typename NT, typename IT, unsigned TTDIM>
33 void bmcsb_gespmv (const BmCsb<NT, IT, TTDIM> & A, const NT * __restrict x, NT * __restrict y)
34 {
35  double t0 = timer_seconds_since_init();
36
37  unsigned * scansum = new unsigned[A.nrb];
38  unsigned sum = prescan(scansum, A.masks, A.nrb);
39
40  double t1 = timer_seconds_since_init();
41  prescantime += (t1-t0);
42
43  IT ysize = A.lowrowmask + 1; // size of the output subarray (per block row - except the last)
44
45  if( A.isPar() )
46  {
47  float rowave = static_cast<float>(A.numnonzeros()) / (A.nbr-1);
48  cilk_for (IT i = 0 ; i < A.nbr ; ++i) // for all block rows of A
49  {
50  IT * btop = A.top [i]; // get the pointer to this block row
51  IT rhi = ((i << A.rowlowbits) & A.highrowmask);
52  NT * suby = &y[rhi];
53  if( A.top[i][A.nbc] - A.top[i][0] > BALANCETH * rowave)
54  {
55  IT thsh = ysize * BREAKNRB;
56  vector<IT*> chunks;
57  chunks.push_back(btop);
58  for(IT j =0; j < A.nbc; )
59  {
60  IT count = btop[j+1] - btop[j];
61  if(count < thsh && j < A.nbc)
62  {
63  while(count < thsh && j < A.nbc)
64  {
65  count += btop[(++j)+1] - btop[j];
66  }
67  chunks.push_back(btop+j); // push, but exclude the block that caused the overflow
68  }
69  else
70  {
71  chunks.push_back(btop+(++j)); // don't exclude the overflow block if it is the only block in that chunk
72  }
73  }
74  // In std:vector, the elements are stored contiguously so that we can
75  // treat &chunks[0] as an array of pointers to IT w/out literally copying it to IT**
76  if(i==(A.nbr-1)) // last iteration
77  {
78  A.BMult(&chunks[0], 0, chunks.size()-1, x, suby, A.rowsize() - ysize*i, scansum);
79  }
80  else
81  {
82  A.BMult(&chunks[0], 0, chunks.size()-1, x, suby, ysize, scansum);
83  }
84  }
85  else
86  {
87  A.SubSpMV(btop, 0, A.nbc, x, suby, scansum);
88  }
89  }
90  }
91
92  else
93  {
94  cilk_for (IT i = 0 ; i < A.nbr ; ++i) // for all block rows of A
95  {
96  IT * btop = A.top [i]; // get the pointer to this block row
97  IT rhi = ((i << A.rowlowbits) & A.highrowmask);
98  NT * suby = &y[rhi];
99
100  A.SubSpMV(btop, 0, A.nbc, x, suby, scansum);
101  }
102  }
103  delete [] scansum;
104 }
105
112 template <typename SR, typename NT, typename IT, typename RHS, typename LHS>
113 void bicsb_gespmv (const BiCsb<NT, IT> & A, const RHS * __restrict x, LHS * __restrict y)
114 {
115  IT ysize = A.lowrowmask + 1; // size of the output subarray (per block row - except the last)
116
117  if(A.isPar() )
118  {
119  float rowave = static_cast<float>(A.numnonzeros()) / (A.nbr-1);
120  cilk_for (IT i = 0 ; i < A.nbr ; ++i) // for all block rows of A
121  {
122  IT * btop = A.top [i]; // get the pointer to this block row
123  IT rhi = ((i << A.rowlowbits) & A.highrowmask);
124  LHS * suby = &y[rhi];
125
126  if(A.top[i][A.nbc] - A.top[i][0] > std::max( static_cast<NT>(BALANCETH * rowave), static_cast<NT>(BREAKEVEN * ysize) ) )
127  {
128  IT thsh = BREAKEVEN * ysize;
129  vector<IT*> chunks;
130  chunks.push_back(btop);
131  for(IT j =0; j < A.nbc; )
132  {
133  IT count = btop[j+1] - btop[j];
134  if(count < thsh && j < A.nbc)
135  {
136  while(count < thsh && j < A.nbc)
137  {
138  count += btop[(++j)+1] - btop[j];
139  }
140  chunks.push_back(btop+j); // push, but exclude the block that caused the overflow
141  }
142  else
143  {
144  chunks.push_back(btop+(++j)); // don't exclude the overflow block if it is the only block in that chunk
145  }
146  }
147  // In std:vector, the elements are stored contiguously so that we can
148  // treat &chunks[0] as an array of pointers to IT w/out literally copying it to IT**
149  if(i==(A.nbr-1)) // last iteration
150  {
151  A.template BMult<SR>(&chunks[0], 0, chunks.size()-1, x, suby, A.rowsize() - ysize*i);
152  }
153  else
154  {
155  A.template BMult<SR>(&chunks[0], 0, chunks.size()-1, x, suby, ysize); // chunksize-1 because we always insert a dummy chunk
156  }
157  }
158  else
159  {
160  A.template SubSpMV<SR>(btop, 0, A.nbc, x, suby);
161  }
162  }
163  }
164  else
165  {
166  cilk_for (IT i = 0 ; i < A.nbr ; ++i) // for all block rows of A
167  {
168  IT * btop = A.top [i]; // get the pointer to this block row
169  IT rhi = ((i << A.rowlowbits) & A.highrowmask);
170  LHS * suby = &y[rhi];
171  A.template SubSpMV<SR>(btop, 0, A.nbc, x, suby);
172  }
173  }
174 }
175
176
183 template <typename SR, typename NT, typename IT, typename RHS, typename LHS>
184 void bicsb_gespmvt (const BiCsb<NT, IT> & A, const RHS * __restrict x, LHS * __restrict y)
185 {
186  IT ysize = A.lowcolmask + 1; // size of the output subarray (per block column - except the last)
187
188  // A.top (nbr=3, nbc=4):
189  // 0 5 17 21 24
190  // 24 28 33 39 53
191  // 53 60 61 70 72
192
193  vector<IT> colsums(A.nbc,0);
194  cilk_for(IT j=0; j<A.nbc; ++j)
195  {
196  for(IT i=0; i< A.nbr; ++i)
197  {
198  colsums[j] += (A.top[i][j+1] - A.top[i][j]);
199  }
200  }
201
202  if( A.isPar() )
203  {
204  float colave = static_cast<float>(A.numnonzeros()) / (A.nbc-1);
205  cilk_for (IT j = 0 ; j < A.nbc ; ++j) // for all block columns of A
206  {
207  IT rhi = ((j << A.rowlowbits) & A.highcolmask);
208  LHS * suby = &y[rhi];
209  typedef typename std::tuple<IT,IT,IT> IntTriple;
210  typedef typename std::vector< IntTriple > ChunkType;
211  vector< ChunkType * > chunks; // we will have to manage
212
213  // the second condition is == natural == because if colsums[j] < BREAKEVEN * ysize,
214  // then the whole row will be a single chunk of sparse blocks that runs as a single strand
215  if( colsums[j] > BALANCETH * colave && colsums[j] > BREAKEVEN * ysize)
216  {
217  IT thsh = BREAKEVEN * ysize;
218  // each chunk is represented by a vector of blocks
219  // each block is represented by its {begin, end} pointers to bot array AND its -row- block id (within the block column)
220  // get<0>(tuple): begin pointer to bot, get<1>(tuple): end pointer to bot, get<2>(tuple): row block id
221
222  for(IT i =0; i < A.nbr; ++i )
223  {
224  ChunkType * chunk = new ChunkType();
225  chunk->emplace_back( IntTriple (A.top[i][j], A.top[i][j+1], i));
226  IT count = A.top[i][j+1] - A.top[i][j];
227
228  if(count < thsh)
229  {
230  // while adding the next (i+1) element wouldn't exceed the chunk limit
231  while(i < A.nbr-1 && (count+A.top[i+1][j+1] - A.top[i+1][j]) < thsh )
232  {
233  i++; // move to next one before push
234  if(A.top[i][j+1] - A.top[i][j] > 0)
235  {
236  chunk->emplace_back( IntTriple (A.top[i][j], A.top[i][j+1], i));
237  count += A.top[i][j+1] - A.top[i][j];
238  }
239  }
240  // push, but exclude the block that caused the overflow
241  chunks.push_back(chunk); // emplace_back wouldn't buy anything for simple structures like pointers
242  }
243  else // already above the limit by itself => single dense block
244  {
245  chunks.push_back(chunk);
246  }
247  }
248  if(j==(A.nbc-1)) // last iteration
249  {
250  A.template BTransMult<SR>(chunks, 0, chunks.size(), x, suby, A.colsize() - ysize*j);
251  }
252  else
253  {
254  A.template BTransMult<SR>(chunks, 0, chunks.size(), x, suby, ysize); // chunksize (no -1) as there is no dummy chunk
255  }
256
257  // call the destructor of each chunk vector
258  for_each(chunks.begin(), chunks.end(), [](ChunkType * pPtr){ delete pPtr; });
259  }
260  else
261  {
262  A.template SubSpMVTrans<SR>(j, 0, A.nbr, x, suby);
263  }
264  }
265  }
266  else
267  {
268  cilk_for (IT j =0; j< A.nbc; ++j) // for all block columns of A
269  {
270  IT rhi = ((j << A.collowbits) & A.highcolmask);
271  LHS * suby = &y[rhi];
272
273  A.template SubSpMVTrans<SR>(j, 0, A.nbr, x, suby);
274  }
275  }
276 }
277
278 // SpMV with symmetric CSB
279 // No semiring or type promotion support yet
280 template <typename NT, typename IT>
281 void csbsym_gespmv (const CsbSym<NT, IT> & A, const NT * __restrict x, NT * __restrict y)
282 {
283  #pragma isat marker SM2_begin
284  //if( A.isPar() )
285  //{
286  #pragma isat tuning name(tune_tempy) scope(SM1_begin, SM1_end) measure(SM2_begin, SM2_end) variable(SPAWNS, range(1,6)) variable(NDIAGS, range(1,11)) search(dependent)
287  #pragma isat marker SM1_begin
288  #define SPAWNS 1 // how many you do in parallel at a time
289  #define NDIAGS 3 // how many you do in total
290  NT ** t_y = new NT* [SPAWNS];
291  t_y[0] = y; // alias t_y[0] to y
292  for(int i=1; i<SPAWNS; ++i)
293  {
294  t_y[i] = new NT[A.n]();
295  }
296  if(NDIAGS < SPAWNS)
297  {
298  cout << "Impossible to execute" << endl;
299  return;
300  }
301  int syncs = NDIAGS / SPAWNS;
302  int remdiags = NDIAGS;
303  for(int j=0; j < syncs; ++j)
304  {
305  if(remdiags > 1)
306  {
307  A.MultDiag(t_y[0], x, j*SPAWNS); // maps to A.MultMainDiag(y,x) if j = 0
308  --remdiags; // decrease remaining diagonals
309  int i = 1;
310  for(; (i < SPAWNS) && (remdiags > 1) ; ++i)
311  {
312  cilk_spawn A.MultDiag(t_y[i], x, j*SPAWNS + i);
313  --remdiags;
314  }
315  if(i < SPAWNS && remdiags == 1)
316  {
317  cilk_spawn A.MultAddAtomics(t_y[i], x, j*SPAWNS + i);
318  --remdiags;
319  }
320  cilk_sync;
321  }
322  else if(remdiags == 1)
323  {
324  A.MultAddAtomics(t_y[0], x, j*SPAWNS); // will only happen is remdiags is 1 when the outerloop started
325  --remdiags;
326  }
327  }
328
329  cilk_for(int j=0; j< A.n; ++j)
330  {
331  for(int i=1; i<SPAWNS; ++i) // report if this doesn't get unrolled
332  y[j] += t_y[i][j];
333  }
334  for(int i=1; i<SPAWNS; ++i) // don't delete t_y[0]
335  delete [] t_y[i];
336  delete [] t_y;
337  #pragma isat marker SM1_end
338  //}
339  //else
340  //{
341  // A.SeqSpMV(x, y);
342  //}
343  #pragma isat marker SM2_end
344 }
345
346
347 // SpMV with symmetric register blocked CSB
348 template <typename NT, typename IT, unsigned TTDIM>
349 void bmsym_gespmv (const BmSym<NT, IT, TTDIM> & A, const NT * __restrict x, NT * __restrict y)
350 {
351  if( A.isPar() )
352  {
353  NT * y1 = new NT[A.n]();
354  NT * y2 = new NT[A.n]();
355  NT * y3;
356
357  IT size0 = A.nrbsum(0);
358  IT size1 = A.nrbsum(1);
359  IT size2 = A.nrbsum(2);
360
361  if(size0+size1+size2 != A.nrb)
362  {
363  y3 = new NT[A.n]();
365  }
366
367  cilk_spawn A.MultDiag(y1,x,1);
368  cilk_spawn A.MultDiag(y2,x,2);
369  A.MultMainDiag(y, x);
370
371  cilk_sync;
372
373  if(size0+size1+size2 != A.nrb)
374  {
375  cilk_for(int i=0; i< A.n; ++i)
376  {
377  y[i] += y1[i] + y2[i] + y3[i];
378  }
379  delete [] y3;
380  }
381  else
382  {
383  cilk_for(int i=0; i< A.n; ++i)
384  {
385  y[i] += y1[i] + y2[i];
386  }
387  }
388
389  delete [] y1;
390  delete [] y2;
391  }
392  else
393  {
394  A.SeqSpMV(x, y);
395  }
396 }
397
398 // Works on any CSB-like data structure
399 template <class CSB>
400 float RowImbalance(const CSB & A)
401 {
402  // get the average without the last left-over blockrow
403  float rowave = static_cast<float>(*(A.top[A.nbr-1])) / (A.nbr-1);
404  unsigned rowmax = 0;
405  for(size_t i=1; i< A.nbr; ++i)
406  {
407  rowmax = std::max(rowmax, *(A.top[i]) - *(A.top[i-1]));
408  }
409  return static_cast<float>(rowmax) / rowave;
410 }
411
412
413 template <class NT, class IT>
414 float ColImbalance(const BiCsb<NT,IT> & A)
415 {
416  vector<float> sum(A.nbc-1);
417  cilk_for(IT j=1; j< A.nbc; ++j) // ignore the last block column
418  {
419  IT * blocknnz = new IT[A.nbr]; // nnz per block responsible
420  for(IT i=0; i<A.nbr; ++i)
421  {
422  blocknnz[i] = A.top[i][j] - A.top[i][j-1];
423  }
424  sum[j-1] = std::accumulate(blocknnz, blocknnz + (A.nbr-1), 0); // ignore the last block row
425  delete [] blocknnz;
426  }
427  float colave = std::accumulate(sum.begin(), sum.end(), 0.0) / static_cast<float>(A.nbc-1);
428  vector<float>::iterator colmax = std::max_element(sum.begin(), sum.end());
429  return (*colmax) / colave;
430 }
431
432
433 #endif
434
void bicsb_gespmv(const BiCsb< NT, IT > &A, const RHS *__restrict x, LHS *__restrict y)
Definition: friends.h:113
IT rowsize() const
Definition: bmcsb.h:33
bool isPar() const
Definition: bmsym.h:65
Definition: bmcsb.h:21
IT rowsize() const
Definition: bicsb.h:34
void csbsym_gespmv(const CsbSym< NT, IT > &A, const NT *__restrict x, NT *__restrict y)
Definition: friends.h:281
unsigned prescan(unsigned *a, MTYPE *const M, int n)
Definition: utility.h:191
float ColImbalance(const BiCsb< NT, IT > &A)
Definition: friends.h:414
#define NDIAGS
#define BREAKNRB
Definition: utility.h:138
IT numnonzeros() const
Definition: bicsb.h:35
#define SPAWNS
IT colsize() const
Definition: bicsb.h:33
Definition: bmsym.h:50
#define BREAKEVEN
Definition: utility.h:136
#define BALANCETH
Definition: utility.h:127
bool isPar() const
Definition: bmcsb.h:35
void bmcsb_gespmv(const BmCsb< NT, IT, TTDIM > &A, const NT *__restrict x, NT *__restrict y)
Definition: friends.h:33
void bicsb_gespmvt(const BiCsb< NT, IT > &A, const RHS *__restrict x, LHS *__restrict y)
Definition: friends.h:184
void bmsym_gespmv(const BmSym< NT, IT, TTDIM > &A, const NT *__restrict x, NT *__restrict y)
Definition: friends.h:349
Definition: csbsym.h:43
double prescantime
Definition: friends.h:19
float RowImbalance(const CSB &A)
Definition: friends.h:400
Definition: bicsb.h:19
bool isPar() const
Definition: bicsb.h:36