template <class SR, class IT, class NUM, class IVT, class OVT>
void SpImpl<SR,IT,NUM,IVT,OVT>::SpMXSpV(const Dcsc<IT,NUM> & Adcsc, int32_t mA, const int32_t * indx, const IVT * numx, int32_t veclen,
                                        std::vector<int32_t> & indy, std::vector<OVT> & numy)
{
    int32_t hsize = 0;
    std::vector< std::pair<IT,IT> > colinds( (IT) veclen);
    Adcsc.FillColInds(indx, (IT) veclen, colinds, NULL, 0);    // csize is irrelevant when aux is NULL

    if(sizeof(NUM) > sizeof(OVT))    // multiply early, so the heap stores the narrower output type
    {
        HeapEntry<IT,OVT> * wset = new HeapEntry<IT,OVT>[veclen];
        for(IT j = 0; j < veclen; ++j)    // create the initial heap
        {
            while(colinds[j].first != colinds[j].second)    // find the first entry in this column that passes the filter
            {
                OVT mrhs = SR::multiply(Adcsc.numx[colinds[j].first], numx[j]);
                if(SR::returnedSAID())
                {
                    ++(colinds[j].first);    // filtered out; try the next entry
                }
                else
                {
                    wset[hsize++] = HeapEntry<IT,OVT>(Adcsc.ir[colinds[j].first], j, mrhs);
                    break;
                }
            }
        }
        std::make_heap(wset, wset+hsize);
        while(hsize > 0)
        {
            std::pop_heap(wset, wset + hsize);    // result is stored in wset[hsize-1]
            IT locv = wset[hsize-1].runr;    // the column this entry came from
            if((!indy.empty()) && indy.back() == wset[hsize-1].key)
            {
                numy.back() = SR::add(numy.back(), wset[hsize-1].num);
            }
            else
            {
                indy.push_back( (int32_t) wset[hsize-1].key);
                numy.push_back(wset[hsize-1].num);
            }
            bool pushed = false;
            while( (++(colinds[locv].first)) != colinds[locv].second )    // advance to the next passing entry
            {
                OVT mrhs = SR::multiply(Adcsc.numx[colinds[locv].first], numx[locv]);
                if(!SR::returnedSAID())
                {
                    wset[hsize-1].key = Adcsc.ir[colinds[locv].first];
                    wset[hsize-1].num = mrhs;
                    std::push_heap(wset, wset+hsize);
                    pushed = true;
                    break;
                }
            }
            if(!pushed) --hsize;    // this column is exhausted
        }
        delete [] wset;
    }
    else    // the heap stores matrix values directly; multiply on extraction
    {
        HeapEntry<IT,NUM> * wset = new HeapEntry<IT,NUM>[veclen];
        for(IT j = 0; j < veclen; ++j)    // create the initial heap
        {
            if(colinds[j].first != colinds[j].second)    // current != end
                wset[hsize++] = HeapEntry<IT,NUM>(Adcsc.ir[colinds[j].first], j, Adcsc.numx[colinds[j].first]);
        }
        std::make_heap(wset, wset+hsize);
        while(hsize > 0)
        {
            std::pop_heap(wset, wset + hsize);    // result is stored in wset[hsize-1]
            IT locv = wset[hsize-1].runr;    // the column this entry came from
            OVT mrhs = SR::multiply(wset[hsize-1].num, numx[locv]);
            if(!SR::returnedSAID())
            {
                if((!indy.empty()) && indy.back() == wset[hsize-1].key)
                {
                    numy.back() = SR::add(numy.back(), mrhs);
                }
                else
                {
                    indy.push_back( (int32_t) wset[hsize-1].key);
                    numy.push_back(mrhs);
                }
            }
            if( (++(colinds[locv].first)) != colinds[locv].second )    // more entries in this column
            {
                wset[hsize-1].key = Adcsc.ir[colinds[locv].first];
                wset[hsize-1].num = Adcsc.numx[colinds[locv].first];
                std::push_heap(wset, wset+hsize);
            }
            else
            {
                --hsize;    // this column is exhausted
            }
        }
        delete [] wset;
    }
}
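// ----------------------------------------------------------------------
// Illustration (not CombBLAS code; the name PlusTimesSRExample is made up):
// a minimal semiring type satisfying the interface the kernel above relies
// on. SR::multiply combines a matrix entry with a vector entry, SR::add
// merges two contributions to the same output row, and SR::returnedSAID()
// reports whether the last multiply asked for its result to be discarded,
// which is the filtering hook checked above.
struct PlusTimesSRExample
{
    static double multiply(double a, double b) { return a * b; }
    static double add(double a, double b)      { return a + b; }
    static bool   returnedSAID()               { return false; }   // never filters
};
// ----------------------------------------------------------------------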
template <class SR, class IT, class IVT, class OVT>
void SpImpl<SR,IT,bool,IVT,OVT>::SpMXSpV(const Dcsc<IT,bool> & Adcsc, int32_t mA, const int32_t * indx, const IVT * numx, int32_t veclen,
                                         std::vector<int32_t> & indy, std::vector<OVT> & numy)
{
    IT inf = std::numeric_limits<IT>::min();
    IT sup = std::numeric_limits<IT>::max();
    KNHeap< IT, IVT > sHeap(sup, inf);

    IT i = 0;       // current column of the matrix
    int32_t k = 0;  // current entry of the sparse vector
    while(i < Adcsc.nzc && k < veclen)
    {
        if(Adcsc.jc[i] < indx[k]) ++i;
        else if(indx[k] < Adcsc.jc[i]) ++k;
        else    // matched column: push all its nonzeros into the heap
        {
            for(IT j = Adcsc.cp[i]; j < Adcsc.cp[i+1]; ++j)
            {
                sHeap.insert(Adcsc.ir[j], numx[k]);    // A is boolean, so the product is numx[k] itself
            }
            ++i; ++k;
        }
    }

    IT row;
    IVT num;
    if(sHeap.getSize() > 0)    // the first entry can be appended unconditionally
    {
        sHeap.deleteMin(&row, &num);
        indy.push_back( (int32_t) row);
        numy.push_back( num );
    }
    while(sHeap.getSize() > 0)    // extract in sorted order, merging duplicate rows
    {
        sHeap.deleteMin(&row, &num);
        if(indy.back() == row)
        {
            numy.back() = SR::add(numy.back(), num);
        }
        else
        {
            indy.push_back( (int32_t) row);
            numy.push_back(num);
        }
    }
}
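// ----------------------------------------------------------------------
// Standalone sketch (assumed, not library code) of the merge pattern used
// above: the heap releases entries in nondecreasing row order, so a repeated
// row index can only collide with the entry most recently appended to the
// output, and a single back() check suffices to combine duplicates.
#include <cstdint>
#include <functional>
#include <queue>
#include <utility>
#include <vector>

inline void merge_by_row_example(std::vector<std::pair<int32_t,double>> entries,
                                 std::vector<int32_t> & indy, std::vector<double> & numy)
{
    std::priority_queue<std::pair<int32_t,double>,
                        std::vector<std::pair<int32_t,double>>,
                        std::greater<>> heap(entries.begin(), entries.end());   // min-heap on row index
    while(!heap.empty())
    {
        auto [row, num] = heap.top();
        heap.pop();
        if(!indy.empty() && indy.back() == row)
            numy.back() += num;   // stands in for SR::add
        else
        {
            indy.push_back(row);
            numy.push_back(num);
        }
    }
}
// ----------------------------------------------------------------------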
template <typename SR, typename IT, typename IVT, typename OVT>
void SpImpl<SR,IT,bool,IVT,OVT>::SpMXSpV(const Dcsc<IT,bool> & Adcsc, int32_t mA, const int32_t * indx, const IVT * numx, int32_t veclen,
                                         int32_t * indy, OVT * numy, int * cnts, int * dspls, int p_c)
{
    OVT * localy = new OVT[mA];
    BitMap isthere(mA);    // tracks which rows already hold a value in localy
    std::vector< std::vector<int32_t> > nzinds(p_c);    // nonzero indices, bucketed by destination processor

    int32_t perproc = mA / p_c;
    IT i = 0;       // current column of the matrix
    int32_t k = 0;  // current entry of the sparse vector
    while(i < Adcsc.nzc && k < veclen)
    {
        if(Adcsc.jc[i] < indx[k]) ++i;
        else if(indx[k] < Adcsc.jc[i]) ++k;
        else    // matched column
        {
            for(IT j = Adcsc.cp[i]; j < Adcsc.cp[i+1]; ++j)
            {
                int32_t rowid = (int32_t) Adcsc.ir[j];
                if(!isthere.get_bit(rowid))    // first contribution to this row
                {
                    int32_t owner = std::min(rowid / perproc, static_cast<int32_t>(p_c-1));
                    localy[rowid] = numx[k];
                    nzinds[owner].push_back(rowid);
                    isthere.set_bit(rowid);
                }
                else
                {
                    localy[rowid] = SR::add(localy[rowid], numx[k]);
                }
            }
            ++i; ++k;
        }
    }

    for(int p = 0; p < p_c; ++p)    // pack the per-processor outputs
    {
        sort(nzinds[p].begin(), nzinds[p].end());
        cnts[p] = nzinds[p].size();
        int32_t * locnzinds = &nzinds[p][0];
        int32_t offset = perproc * p;
        for(int i = 0; i < cnts[p]; ++i)
        {
            indy[dspls[p]+i] = locnzinds[i] - offset;    // convert to processor-local indices
            numy[dspls[p]+i] = localy[locnzinds[i]];
        }
    }
    delete [] localy;
}
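// ----------------------------------------------------------------------
// Quick check (illustrative only, not library code) of the owner computation
// above: rows are dealt out in contiguous blocks of perproc = mA / p_c rows,
// and std::min routes the remainder rows at the tail to the last processor.
#include <algorithm>
#include <cassert>
#include <cstdint>

inline void owner_computation_example()
{
    int32_t mA = 10, p_c = 3;
    int32_t perproc = mA / p_c;   // 3 rows per processor, 1 spill row
    auto owner = [&](int32_t rowid) { return std::min(rowid / perproc, p_c - 1); };
    assert(owner(0) == 0 && owner(2) == 0);   // rows 0..2 -> processor 0
    assert(owner(3) == 1 && owner(5) == 1);   // rows 3..5 -> processor 1
    assert(owner(6) == 2 && owner(9) == 2);   // rows 6..9 -> processor 2 (absorbs the spill)
}
// ----------------------------------------------------------------------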
template <typename SR, typename IT, typename IVT, typename OVT>
void SpImpl<SR,IT,bool,IVT,OVT>::SpMXSpV_ForThreading(const Dcsc<IT,bool> & Adcsc, int32_t mA, const int32_t * indx, const IVT * numx, int32_t veclen,
                                                      std::vector<int32_t> & indy, std::vector<OVT> & numy, int32_t offset)
{
    std::vector<OVT> localy(mA);
    BitMap isthere(mA);
    std::vector<uint32_t> nzinds;    // nonzero indices

    SpMXSpV_ForThreading(Adcsc, mA, indx, numx, veclen, indy, numy, offset, localy, isthere, nzinds);
}
template <typename SR, typename IT, typename IVT, typename OVT>
void SpImpl<SR,IT,bool,IVT,OVT>::SpMXSpV_ForThreading(const Dcsc<IT,bool> & Adcsc, int32_t mA, const int32_t * indx, const IVT * numx, int32_t veclen,
                                                      std::vector<int32_t> & indy, std::vector<OVT> & numy, int32_t offset,
                                                      std::vector<OVT> & localy, BitMap & isthere, std::vector<uint32_t> & nzinds)
{
    // intersect the nonzero columns of A with the nonzero entries of x
    IT i = 0;       // current column of the matrix
    int32_t k = 0;  // current entry of the sparse vector
    while(i < Adcsc.nzc && k < veclen)
    {
        if(Adcsc.jc[i] < indx[k]) ++i;
        else if(indx[k] < Adcsc.jc[i]) ++k;
        else    // matched column
        {
            for(IT j = Adcsc.cp[i]; j < Adcsc.cp[i+1]; ++j)
            {
                uint32_t rowid = (uint32_t) Adcsc.ir[j];
                if(!isthere.get_bit(rowid))    // first contribution to this row
                {
                    localy[rowid] = numx[k];
                    nzinds.push_back(rowid);
                    isthere.set_bit(rowid);
                }
                else
                {
                    localy[rowid] = SR::add(localy[rowid], numx[k]);
                }
            }
            ++i; ++k;
        }
    }

    int nnzy = nzinds.size();
    integerSort(nzinds.data(), nnzy);    // the output must be sorted by row index
    indy.resize(nnzy);
    numy.resize(nnzy);
    for(int i = 0; i < nnzy; ++i)
    {
        indy[i] = nzinds[i] + offset;    // return to global indices
        numy[i] = localy[nzinds[i]];
    }
}
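// ----------------------------------------------------------------------
// Compact sketch (assumptions: std::vector<bool> stands in for the library's
// BitMap, a callable for SR::add) of the sparse-accumulator (SPA) idea the
// two functions above implement: the first touch of a row stores the value
// and records the index; every later touch only combines values, so the
// nonzero structure is discovered exactly once per row.
#include <cstdint>
#include <vector>

template <class OVT, class Add>
void spa_accumulate_example(uint32_t rowid, OVT val,
                            std::vector<OVT> & localy, std::vector<bool> & isthere,
                            std::vector<uint32_t> & nzinds, Add add)
{
    if(!isthere[rowid])
    {
        localy[rowid] = val;        // first contribution to this row
        nzinds.push_back(rowid);    // the row just became structurally nonzero
        isthere[rowid] = true;
    }
    else
    {
        localy[rowid] = add(localy[rowid], val);   // SR::add in the kernels above
    }
}
// ----------------------------------------------------------------------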
template <typename SR, typename IT, typename NT, typename IVT, typename OVT>
void SpMXSpV_HeapSort(const Csc<IT,NT> & Acsc, int32_t mA, const int32_t * indx, const IVT * numx, int32_t veclen,
                      std::vector<int32_t> & indy, std::vector<OVT> & numy, int32_t offset)
{
    IT inf = std::numeric_limits<IT>::min();
    IT sup = std::numeric_limits<IT>::max();
    KNHeap< IT, OVT > sHeap(sup, inf);

    for(int32_t k = 0; k < veclen; ++k)
    {
        int32_t colid = indx[k];
        for(IT j = Acsc.jc[colid]; j < Acsc.jc[colid+1]; ++j)
        {
            OVT val = SR::multiply(Acsc.num[j], numx[k]);
            sHeap.insert(Acsc.ir[j], val);
        }
    }

    IT row;
    OVT num;
    if(sHeap.getSize() > 0)    // the first entry can be appended unconditionally
    {
        sHeap.deleteMin(&row, &num);
        row += offset;    // return to global indices
        indy.push_back( (int32_t) row);
        numy.push_back( num );
    }
    while(sHeap.getSize() > 0)    // extract in sorted order, merging duplicate rows
    {
        sHeap.deleteMin(&row, &num);
        row += offset;
        if(indy.back() == row)
        {
            numy.back() = SR::add(numy.back(), num);
        }
        else
        {
            indy.push_back( (int32_t) row);
            numy.push_back(num);
        }
    }
}
template <typename SR, typename IT, typename NT, typename IVT, typename OVT>
void SpMXSpV_Bucket(const Csc<IT,NT> & Acsc, int32_t mA, const int32_t * indx, const IVT * numx, int32_t veclen,
                    std::vector<int32_t> & indy, std::vector<OVT> & numy, PreAllocatedSPA<OVT> & SPA)
{
    double tstart = MPI_Wtime();
    int rowSplits = SPA.buckets;    // number of buckets created with the PreAllocatedSPA
    int nthreads = 1;
#ifdef _OPENMP
#pragma omp parallel
    {
        nthreads = omp_get_num_threads();
    }
#endif
    if(rowSplits < nthreads)
    {
        std::ostringstream outs;
        outs << "Warning in SpMXSpV_Bucket: " << rowSplits << " buckets are supplied for " << nthreads << " threads\n";
        outs << "4 times the number of threads are recommended when creating PreAllocatedSPA\n";
        SpParHelper::Print(outs.str());
    }
    int32_t rowPerSplit = mA / rowSplits;

    // Step 1: count the nonzeros every (vector bucket, row split) pair will produce
    std::vector<std::vector<int32_t>> bSize(rowSplits, std::vector<int32_t>(rowSplits, 0));
    std::vector<std::vector<int32_t>> bOffset(rowSplits, std::vector<int32_t>(rowSplits, 0));
    std::vector<int32_t> sendSize(rowSplits);
    double t0, t1, t2, t3, t4;
#ifdef BENCHMARK_SPMSPV
    t0 = MPI_Wtime();
#endif
#pragma omp parallel for schedule(dynamic, 1)
    for(int b = 0; b < rowSplits; b++)
    {
        // bucket b processes an even share of the input vector, with the
        // remainder spilled one entry at a time across the leading buckets
        int perBucket = veclen / rowSplits;
        int spill = veclen % rowSplits;
        int32_t xstart = b * perBucket + std::min(spill, b);
        int32_t xend = (b+1) * perBucket + std::min(spill, b+1);
        std::vector<int32_t> temp(rowSplits, 0);
        for(int32_t i = xstart; i < xend; ++i)
        {
            int32_t colid = indx[i];
            for(IT j = Acsc.jc[colid]; j < Acsc.jc[colid+1]; ++j)
            {
                uint32_t rowid = (uint32_t) Acsc.ir[j];
                int32_t splitId = rowSplits - 1;
                if(rowPerSplit != 0) splitId = (rowid/rowPerSplit > rowSplits-1) ? rowSplits-1 : rowid/rowPerSplit;
                temp[splitId]++;
            }
        }
        int32_t totSend = 0;
        for(int k = 0; k < rowSplits; k++)
        {
            bSize[b][k] = temp[k];
            totSend += temp[k];
        }
        sendSize[b] = totSend;
    }
#ifdef BENCHMARK_SPMSPV
    t1 = MPI_Wtime() - t0;
    t0 = MPI_Wtime();
#endif
    // Step 2: column-wise prefix sums turn the counts into disjoint write offsets
    for(int i = 1; i < rowSplits; i++)
    {
        for(int j = 0; j < rowSplits; j++)
        {
            bOffset[i][j] = bOffset[i-1][j] + bSize[i-1][j];
            bSize[i-1][j] = 0;
        }
    }
    std::vector<uint32_t> disp(rowSplits+1);
    disp[0] = 0;
    int maxBucketSize = -1;    // size of the largest row split
    for(int j = 0; j < rowSplits; j++)
    {
        int thisBucketSize = bOffset[rowSplits-1][j] + bSize[rowSplits-1][j];
        disp[j+1] = disp[j] + thisBucketSize;
        bSize[rowSplits-1][j] = 0;
        maxBucketSize = std::max(thisBucketSize, maxBucketSize);
    }
#ifdef BENCHMARK_SPMSPV
    double tseq = MPI_Wtime() - t0;
    t0 = MPI_Wtime();
#endif
#ifndef L2_CACHE_SIZE
#define L2_CACHE_SIZE 256000
#endif
    int THREAD_BUF_LEN = 256;    // sized so all per-thread staging buffers fit in L2 cache
    int itemsize = sizeof(int32_t) + sizeof(OVT);
    while(true)
    {
        int bufferMem = THREAD_BUF_LEN * rowSplits * itemsize + 8 * rowSplits;
        if(bufferMem > L2_CACHE_SIZE) THREAD_BUF_LEN /= 2;
        else break;
    }
    THREAD_BUF_LEN = std::min(maxBucketSize+1, THREAD_BUF_LEN);
#ifdef BENCHMARK_SPMSPV
    t0 = MPI_Wtime();
#endif
    // Step 3: bucketing. Every thread stages (rowid, value) pairs in private
    // L2-resident buffers and flushes them to the shared SPA arrays in bulk.
#pragma omp parallel
    {
        int32_t* tIndSplitA = new int32_t[rowSplits*THREAD_BUF_LEN];
        OVT* tNumSplitA = new OVT[rowSplits*THREAD_BUF_LEN];
        std::vector<int32_t> tBucketSize(rowSplits);
        std::vector<int32_t> tOffset(rowSplits);
#pragma omp for schedule(dynamic,1)
        for(int b = 0; b < rowSplits; b++)
        {
            std::fill(tBucketSize.begin(), tBucketSize.end(), 0);
            std::fill(tOffset.begin(), tOffset.end(), 0);
            int perBucket = veclen / rowSplits;
            int spill = veclen % rowSplits;
            int32_t xstart = b * perBucket + std::min(spill, b);
            int32_t xend = (b+1) * perBucket + std::min(spill, b+1);
            for(int32_t i = xstart; i < xend; ++i)
            {
                int32_t colid = indx[i];
                for(IT j = Acsc.jc[colid]; j < Acsc.jc[colid+1]; ++j)
                {
                    OVT val = SR::multiply(Acsc.num[j], numx[i]);
                    uint32_t rowid = (uint32_t) Acsc.ir[j];
                    int32_t splitId = rowSplits - 1;
                    if(rowPerSplit != 0) splitId = (rowid/rowPerSplit > rowSplits-1) ? rowSplits-1 : rowid/rowPerSplit;
                    if(tBucketSize[splitId] < THREAD_BUF_LEN)    // room left in the staging buffer
                    {
                        tIndSplitA[splitId*THREAD_BUF_LEN + tBucketSize[splitId]] = rowid;
                        tNumSplitA[splitId*THREAD_BUF_LEN + tBucketSize[splitId]++] = val;
                    }
                    else    // staging buffer full: flush it to the shared arrays
                    {
                        std::copy(tIndSplitA + splitId*THREAD_BUF_LEN, tIndSplitA + (splitId+1)*THREAD_BUF_LEN, &SPA.indSplitA[disp[splitId] + bOffset[b][splitId]] + tOffset[splitId]);
                        std::copy(tNumSplitA + splitId*THREAD_BUF_LEN, tNumSplitA + (splitId+1)*THREAD_BUF_LEN, &SPA.numSplitA[disp[splitId] + bOffset[b][splitId]] + tOffset[splitId]);
                        tIndSplitA[splitId*THREAD_BUF_LEN] = rowid;
                        tNumSplitA[splitId*THREAD_BUF_LEN] = val;
                        tOffset[splitId] += THREAD_BUF_LEN;
                        tBucketSize[splitId] = 1;
                    }
                }
            }
            // flush whatever remains in the staging buffers
            for(int splitId = 0; splitId < rowSplits; ++splitId)
            {
                if(tBucketSize[splitId] > 0)
                {
                    std::copy(tIndSplitA + splitId*THREAD_BUF_LEN, tIndSplitA + splitId*THREAD_BUF_LEN + tBucketSize[splitId], &SPA.indSplitA[disp[splitId] + bOffset[b][splitId]] + tOffset[splitId]);
                    std::copy(tNumSplitA + splitId*THREAD_BUF_LEN, tNumSplitA + splitId*THREAD_BUF_LEN + tBucketSize[splitId], &SPA.numSplitA[disp[splitId] + bOffset[b][splitId]] + tOffset[splitId]);
                }
            }
        }
        delete [] tIndSplitA;
        delete [] tNumSplitA;
    }
#ifdef BENCHMARK_SPMSPV
    t2 = MPI_Wtime() - t0;
    t0 = MPI_Wtime();
#endif
    std::vector<uint32_t> nzInRowSplits(rowSplits);
    uint32_t* nzinds = new uint32_t[disp[rowSplits]];    // Step 4: SPA-merge within each row split
#pragma omp parallel for schedule(dynamic,1)
    for(int rs = 0; rs < rowSplits; ++rs)
    {
        int threadId = omp_get_thread_num();
        for(uint32_t i = disp[rs]; i < disp[rs+1]; i++)    // reset this thread's visited flags
        {
            int32_t lrowid = SPA.indSplitA[i] - rs * rowPerSplit;
            SPA.V_isthereBool[threadId][lrowid] = false;
        }
        uint32_t tMergeDisp = disp[rs];
        for(uint32_t i = disp[rs]; i < disp[rs+1]; i++)
        {
            int32_t rowid = SPA.indSplitA[i];
            int32_t lrowid = rowid - rs * rowPerSplit;
            if(!SPA.V_isthereBool[threadId][lrowid])    // first contribution to this row
            {
                SPA.V_localy[0][rowid] = SPA.numSplitA[i];
                nzinds[tMergeDisp++] = rowid;
                SPA.V_isthereBool[threadId][lrowid] = true;
            }
            else
            {
                SPA.V_localy[0][rowid] = SR::add(SPA.V_localy[0][rowid], SPA.numSplitA[i]);
            }
        }
        integerSort(nzinds + disp[rs], tMergeDisp - disp[rs]);
        nzInRowSplits[rs] = tMergeDisp - disp[rs];
    }
#ifdef BENCHMARK_SPMSPV
    t3 = MPI_Wtime() - t0;
    t0 = MPI_Wtime();
#endif
    std::vector<uint32_t> dispRowSplits(rowSplits+1);    // Step 5: concatenate the row splits
    dispRowSplits[0] = 0;
    for(int i = 0; i < rowSplits; i++)
    {
        dispRowSplits[i+1] = dispRowSplits[i] + nzInRowSplits[i];
    }
    int nnzy = dispRowSplits[rowSplits];
    indy.resize(nnzy);
    numy.resize(nnzy);
#ifdef BENCHMARK_SPMSPV
    tseq = MPI_Wtime() - t0;
    t0 = MPI_Wtime();
#endif
    int maxNnzInSplit = *std::max_element(nzInRowSplits.begin(), nzInRowSplits.end());
    THREAD_BUF_LEN = std::min(maxNnzInSplit+1, 256);    // shrink the staging buffers for the output pass
#pragma omp parallel
    {
        OVT* tnumy = new OVT[THREAD_BUF_LEN];
        int32_t* tindy = new int32_t[THREAD_BUF_LEN];
#pragma omp for schedule(dynamic,1)
        for(int rs = 0; rs < rowSplits; rs++)
        {
            int32_t curSize = 0;
            int32_t tdisp = 0;
            uint32_t * thisind = nzinds + disp[rs];
            std::copy(nzinds+disp[rs], nzinds+disp[rs]+nzInRowSplits[rs], indy.begin()+dispRowSplits[rs]);
            for(uint32_t j = 0; j < nzInRowSplits[rs]; j++)
            {
                if(curSize < THREAD_BUF_LEN)
                {
                    tnumy[curSize++] = SPA.V_localy[0][thisind[j]];
                }
                else    // staging buffer full: flush it to the output
                {
                    std::copy(tnumy, tnumy+curSize, numy.begin()+dispRowSplits[rs]+tdisp);
                    tdisp += curSize;
                    tnumy[0] = SPA.V_localy[0][thisind[j]];
                    curSize = 1;
                }
            }
            if(curSize > 0)
                std::copy(tnumy, tnumy+curSize, numy.begin()+dispRowSplits[rs]+tdisp);
        }
        delete [] tnumy;
        delete [] tindy;
    }
#ifdef BENCHMARK_SPMSPV
    t4 = MPI_Wtime() - t0;
#endif
    delete [] nzinds;
#ifdef BENCHMARK_SPMSPV
    double tall = MPI_Wtime() - tstart;
    std::ostringstream outs1;
    outs1 << "Time breakdown of SpMSpV-bucket." << std::endl;
    outs1 << "Estimate buckets: " << t1 << " Bucketing: " << t2 << " SPA-merge: " << t3 << " Output: " << t4 << " Total: " << tall << std::endl;
    SpParHelper::Print(outs1.str());
#endif
}