11 #include "timer.gettimeofday.c"
15 template <
class NU,
class IU>
18 template <
class NU,
class IU,
unsigned UUDIM>
24 #if (__GNUC__ == 4 && (__GNUC_MINOR__ < 7) )
25 #define emplace_back push_back
32 template <
typename NT,
typename IT,
unsigned TTDIM>
35 double t0 = timer_seconds_since_init();
37 unsigned * scansum =
new unsigned[A.nrb];
38 unsigned sum =
prescan(scansum, A.masks, A.nrb);
40 double t1 = timer_seconds_since_init();
43 IT ysize = A.lowrowmask + 1;
47 float rowave =
static_cast<float>(A.numnonzeros()) / (A.nbr-1);
48 cilk_for (IT i = 0 ; i < A.nbr ; ++i)
50 IT * btop = A.top [i];
51 IT rhi = ((i << A.rowlowbits) & A.highrowmask);
53 if( A.top[i][A.nbc] - A.top[i][0] >
BALANCETH * rowave)
57 chunks.push_back(btop);
58 for(IT j =0; j < A.nbc; )
60 IT count = btop[j+1] - btop[j];
61 if(count < thsh && j < A.nbc)
63 while(count < thsh && j < A.nbc)
65 count += btop[(++j)+1] - btop[j];
67 chunks.push_back(btop+j);
71 chunks.push_back(btop+(++j));
78 A.BMult(&chunks[0], 0, chunks.size()-1, x, suby, A.
rowsize() - ysize*i, scansum);
82 A.BMult(&chunks[0], 0, chunks.size()-1, x, suby, ysize, scansum);
87 A.SubSpMV(btop, 0, A.nbc, x, suby, scansum);
94 cilk_for (IT i = 0 ; i < A.nbr ; ++i)
96 IT * btop = A.top [i];
97 IT rhi = ((i << A.rowlowbits) & A.highrowmask);
100 A.SubSpMV(btop, 0, A.nbc, x, suby, scansum);
112 template <
typename SR,
typename NT,
typename IT,
typename RHS,
typename LHS>
115 IT ysize = A.lowrowmask + 1;
119 float rowave =
static_cast<float>(A.
numnonzeros()) / (A.nbr-1);
120 cilk_for (IT i = 0 ; i < A.nbr ; ++i)
122 IT * btop = A.top [i];
123 IT rhi = ((i << A.rowlowbits) & A.highrowmask);
124 LHS * suby = &y[rhi];
126 if(A.top[i][A.nbc] - A.top[i][0] > std::max( static_cast<NT>(
BALANCETH * rowave),
static_cast<NT
>(
BREAKEVEN * ysize) ) )
130 chunks.push_back(btop);
131 for(IT j =0; j < A.nbc; )
133 IT count = btop[j+1] - btop[j];
134 if(count < thsh && j < A.nbc)
136 while(count < thsh && j < A.nbc)
138 count += btop[(++j)+1] - btop[j];
140 chunks.push_back(btop+j);
144 chunks.push_back(btop+(++j));
151 A.template BMult<SR>(&chunks[0], 0, chunks.size()-1, x, suby, A.
rowsize() - ysize*i);
155 A.template BMult<SR>(&chunks[0], 0, chunks.size()-1, x, suby, ysize);
160 A.template SubSpMV<SR>(btop, 0, A.nbc, x, suby);
166 cilk_for (IT i = 0 ; i < A.nbr ; ++i)
168 IT * btop = A.top [i];
169 IT rhi = ((i << A.rowlowbits) & A.highrowmask);
170 LHS * suby = &y[rhi];
171 A.template SubSpMV<SR>(btop, 0, A.nbc, x, suby);
183 template <
typename SR,
typename NT,
typename IT,
typename RHS,
typename LHS>
186 IT ysize = A.lowcolmask + 1;
193 vector<IT> colsums(A.nbc,0);
194 cilk_for(IT j=0; j<A.nbc; ++j)
196 for(IT i=0; i< A.nbr; ++i)
198 colsums[j] += (A.top[i][j+1] - A.top[i][j]);
204 float colave =
static_cast<float>(A.
numnonzeros()) / (A.nbc-1);
205 cilk_for (IT j = 0 ; j < A.nbc ; ++j)
207 IT rhi = ((j << A.rowlowbits) & A.highcolmask);
208 LHS * suby = &y[rhi];
209 typedef typename std::tuple<IT,IT,IT> IntTriple;
210 typedef typename std::vector< IntTriple > ChunkType;
211 vector< ChunkType * > chunks;
222 for(IT i =0; i < A.nbr; ++i )
224 ChunkType * chunk =
new ChunkType();
225 chunk->emplace_back( IntTriple (A.top[i][j], A.top[i][j+1], i));
226 IT count = A.top[i][j+1] - A.top[i][j];
231 while(i < A.nbr-1 && (count+A.top[i+1][j+1] - A.top[i+1][j]) < thsh )
234 if(A.top[i][j+1] - A.top[i][j] > 0)
236 chunk->emplace_back( IntTriple (A.top[i][j], A.top[i][j+1], i));
237 count += A.top[i][j+1] - A.top[i][j];
241 chunks.push_back(chunk);
245 chunks.push_back(chunk);
250 A.template BTransMult<SR>(chunks, 0, chunks.size(), x, suby, A.
colsize() - ysize*j);
254 A.template BTransMult<SR>(chunks, 0, chunks.size(), x, suby, ysize);
258 for_each(chunks.begin(), chunks.end(), [](ChunkType * pPtr){
delete pPtr; });
262 A.template SubSpMVTrans<SR>(j, 0, A.nbr, x, suby);
268 cilk_for (IT j =0; j< A.nbc; ++j)
270 IT rhi = ((j << A.collowbits) & A.highcolmask);
271 LHS * suby = &y[rhi];
273 A.template SubSpMVTrans<SR>(j, 0, A.nbr, x, suby);
280 template <
typename NT,
typename IT>
283 #pragma isat marker SM2_begin
286 #pragma isat tuning name(tune_tempy) scope(SM1_begin, SM1_end) measure(SM2_begin, SM2_end) variable(SPAWNS, range(1,6)) variable(NDIAGS, range(1,11)) search(dependent)
287 #pragma isat marker SM1_begin
288 #define SPAWNS 1 // how many you do in parallel at a time
289 #define NDIAGS 3 // how many you do in total
290 NT ** t_y =
new NT* [
SPAWNS];
292 for(
int i=1; i<
SPAWNS; ++i)
294 t_y[i] =
new NT[A.n]();
298 cout <<
"Impossible to execute" << endl;
303 for(
int j=0; j < syncs; ++j)
307 A.MultDiag(t_y[0], x, j*SPAWNS);
310 for(; (i < SPAWNS) && (remdiags > 1) ; ++i)
312 cilk_spawn A.MultDiag(t_y[i], x, j*SPAWNS + i);
315 if(i < SPAWNS && remdiags == 1)
317 cilk_spawn A.MultAddAtomics(t_y[i], x, j*SPAWNS + i);
322 else if(remdiags == 1)
324 A.MultAddAtomics(t_y[0], x, j*SPAWNS);
329 cilk_for(
int j=0; j< A.n; ++j)
331 for(
int i=1; i<
SPAWNS; ++i)
334 for(
int i=1; i<
SPAWNS; ++i)
337 #pragma isat marker SM1_end
343 #pragma isat marker SM2_end
348 template <
typename NT,
typename IT,
unsigned TTDIM>
353 NT * y1 =
new NT[A.n]();
354 NT * y2 =
new NT[A.n]();
357 IT size0 = A.nrbsum(0);
358 IT size1 = A.nrbsum(1);
359 IT size2 = A.nrbsum(2);
361 if(size0+size1+size2 != A.nrb)
364 cilk_spawn A.MultAddAtomics(y3,x,3);
367 cilk_spawn A.MultDiag(y1,x,1);
368 cilk_spawn A.MultDiag(y2,x,2);
369 A.MultMainDiag(y, x);
373 if(size0+size1+size2 != A.nrb)
375 cilk_for(
int i=0; i< A.n; ++i)
377 y[i] += y1[i] + y2[i] + y3[i];
383 cilk_for(
int i=0; i< A.n; ++i)
385 y[i] += y1[i] + y2[i];
403 float rowave =
static_cast<float>(*(A.top[A.nbr-1])) / (A.nbr-1);
405 for(
size_t i=1; i< A.nbr; ++i)
407 rowmax = std::max(rowmax, *(A.top[i]) - *(A.top[i-1]));
409 return static_cast<float>(rowmax) / rowave;
413 template <
class NT,
class IT>
416 vector<float> sum(A.nbc-1);
417 cilk_for(IT j=1; j< A.nbc; ++j)
419 IT * blocknnz =
new IT[A.nbr];
420 for(IT i=0; i<A.nbr; ++i)
422 blocknnz[i] = A.top[i][j] - A.top[i][j-1];
424 sum[j-1] = std::accumulate(blocknnz, blocknnz + (A.nbr-1), 0);
427 float colave = std::accumulate(sum.begin(), sum.end(), 0.0) /
static_cast<float>(A.nbc-1);
428 vector<float>::iterator colmax = std::max_element(sum.begin(), sum.end());
429 return (*colmax) / colave;
void bicsb_gespmv(const BiCsb< NT, IT > &A, const RHS *__restrict x, LHS *__restrict y)
void csbsym_gespmv(const CsbSym< NT, IT > &A, const NT *__restrict x, NT *__restrict y)
unsigned prescan(unsigned *a, MTYPE *const M, int n)
float ColImbalance(const BiCsb< NT, IT > &A)
void bmcsb_gespmv(const BmCsb< NT, IT, TTDIM > &A, const NT *__restrict x, NT *__restrict y)
void bicsb_gespmvt(const BiCsb< NT, IT > &A, const RHS *__restrict x, LHS *__restrict y)
void bmsym_gespmv(const BmSym< NT, IT, TTDIM > &A, const NT *__restrict x, NT *__restrict y)
float RowImbalance(const CSB &A)