#ifndef _PAR_FRIENDS_H_
#define _PAR_FRIENDS_H_

template <class IT, class NT, class DER>
class SpParMat;    // forward declaration

//! Concatenate the distributed vectors in 'vecs' into a single distributed vector.
template <typename IT, typename NT>
FullyDistVec<IT,NT> Concatenate(std::vector< FullyDistVec<IT,NT> > & vecs)
{
    if(vecs.size() < 1)
    {
        return FullyDistVec<IT,NT>();    // nothing to concatenate: return an empty vector
    }
    else if (vecs.size() < 2)
    {
        return vecs[0];    // a single vector: return it as-is
    }
    else
    {
        typename std::vector< FullyDistVec<IT,NT> >::iterator it = vecs.begin();
        std::shared_ptr<CommGrid> commGridPtr = it->getcommgrid();
        MPI_Comm World = commGridPtr->GetWorld();

        IT nglen = it->TotalLength();      // global length of the concatenation
        IT cumloclen = it->MyLocLength();  // cumulative local lengths
        ++it;
        for(; it != vecs.end(); ++it)
        {
            if(*(commGridPtr) != *(it->getcommgrid()))
            {
                SpParHelper::Print("Grids are not comparable for concatenation, returning empty vector\n");
                return FullyDistVec<IT,NT>();
            }
            nglen += it->TotalLength();
            cumloclen += it->MyLocLength();
        }
        FullyDistVec<IT,NT> ConCat(commGridPtr, nglen, NT());
        int nprocs = commGridPtr->GetSize();

        // bucket every local entry by the process that owns its position in the concatenation
        std::vector< std::vector< NT > > data(nprocs);
        std::vector< std::vector< IT > > inds(nprocs);
        IT gloffset = 0;    // global offset of the current vector within the concatenation
        for(it = vecs.begin(); it != vecs.end(); ++it)
        {
            IT loclen = it->LocArrSize();
            IT loffset = it->LengthUntil();
            for(IT i=0; i < loclen; ++i)
            {
                IT locind;
                int owner = ConCat.Owner(gloffset+loffset+i, locind);
                data[owner].push_back(it->arr[i]);
                inds[owner].push_back(locind);
            }
            gloffset += it->TotalLength();
        }

        int * sendcnt = new int[nprocs];
        int * sdispls = new int[nprocs];
        for(int i=0; i<nprocs; ++i)
            sendcnt[i] = (int) data[i].size();

        int * rdispls = new int[nprocs];
        int * recvcnt = new int[nprocs];
        MPI_Alltoall(sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, World);    // share the counts

        sdispls[0] = 0;
        rdispls[0] = 0;
        for(int i=0; i<nprocs-1; ++i)
        {
            sdispls[i+1] = sdispls[i] + sendcnt[i];
            rdispls[i+1] = rdispls[i] + recvcnt[i];
        }
        IT totrecv = std::accumulate(recvcnt, recvcnt+nprocs, static_cast<IT>(0));
        NT * senddatabuf = new NT[cumloclen];
        for(int i=0; i<nprocs; ++i)
        {
            std::copy(data[i].begin(), data[i].end(), senddatabuf+sdispls[i]);
            std::vector<NT>().swap(data[i]);    // free each bucket as soon as it is copied
        }
        NT * recvdatabuf = new NT[totrecv];
        MPI_Alltoallv(senddatabuf, sendcnt, sdispls, MPIType<NT>(), recvdatabuf, recvcnt, rdispls, MPIType<NT>(), World);
        delete [] senddatabuf;

        IT * sendindsbuf = new IT[cumloclen];
        for(int i=0; i<nprocs; ++i)
        {
            std::copy(inds[i].begin(), inds[i].end(), sendindsbuf+sdispls[i]);
            std::vector<IT>().swap(inds[i]);
        }
        IT * recvindsbuf = new IT[totrecv];
        MPI_Alltoallv(sendindsbuf, sendcnt, sdispls, MPIType<IT>(), recvindsbuf, recvcnt, rdispls, MPIType<IT>(), World);
        DeleteAll(sendindsbuf, sendcnt, sdispls);

        for(int i=0; i<nprocs; ++i)    // received entries are not necessarily sorted
        {
            for(int j = rdispls[i]; j < rdispls[i] + recvcnt[i]; ++j)
            {
                ConCat.arr[recvindsbuf[j]] = recvdatabuf[j];
            }
        }
        DeleteAll(recvindsbuf, recvcnt, rdispls);
        delete [] recvdatabuf;
        return ConCat;
    }
}
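// Illustrative usage (a sketch, not part of the library; assumes a CommGrid
// shared_ptr named grid and the (grid, length, initial value) constructor used
// elsewhere in this file):
//
//   std::vector< FullyDistVec<int64_t,double> > pieces;
//   pieces.push_back(FullyDistVec<int64_t,double>(grid, 100, 0.0));
//   pieces.push_back(FullyDistVec<int64_t,double>(grid, 50, 1.0));
//   FullyDistVec<int64_t,double> whole = Concatenate(pieces);  // TotalLength() == 150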
template <typename MATRIXA, typename MATRIXB>
bool CheckSpGEMMCompliance(const MATRIXA & A, const MATRIXB & B)
{
    if(A.getncol() != B.getnrow())
    {
        std::ostringstream outs;
        outs << "Cannot multiply, dimensions do not match" << std::endl;
        outs << A.getncol() << " != " << B.getnrow() << std::endl;
        SpParHelper::Print(outs.str());
        return false;
    }
    if((void*) &A == (void*) &B)
    {
        std::ostringstream outs;
        outs << "Cannot multiply, inputs alias (make a temporary copy of one of them first)" << std::endl;
        SpParHelper::Print(outs.str());
        return false;
    }
    return true;
}
/**
 * MCL-style column pruning: apply the hard threshold, then per column
 * (1) recover entries when too much mass was lost and (2) keep only the
 * largest entries when too many survived, finally pruning A against the
 * per-column cutoffs accumulated in pruneCols.
 */
template <typename IT, typename NT, typename DER>
void MCLPruneRecoverySelect(SpParMat<IT,NT,DER> & A, NT hardThreshold, IT selectNum, IT recoverNum, NT recoverPct, int kselectVersion)
{
    // hard-threshold prune, then per-column statistics
    A.Prune([hardThreshold](NT val){ return val <= hardThreshold; }, true);
    FullyDistVec<IT,NT> colSums = A.Reduce(Column, std::plus<NT>(), static_cast<NT>(0));
    FullyDistVec<IT,NT> nnzPerColumn = A.Reduce(Column, std::plus<NT>(), static_cast<NT>(0), [](NT val){ return static_cast<NT>(1); });
    FullyDistVec<IT,NT> pruneCols(A.getcommgrid(), A.getncol(), hardThreshold);

    // Recovery: a column whose sum fell below recoverPct keeps its largest recoverNum entries
    FullyDistSpVec<IT,NT> recoverCols(colSums, [recoverPct](NT sum){ return sum < recoverPct; });
    recoverCols = recoverPct;
    recoverCols = EWiseApply<NT>(recoverCols, colSums,
            [](NT spval, NT dval){ return spval; },          // keep the candidate marker
            [](NT spval, NT dval){ return dval < spval; },   // only columns whose sum is below recoverPct
            false, static_cast<NT>(-1));

    IT nrecover = recoverCols.getnnz();
    if(nrecover > 0)
    {
        A.Kselect(recoverCols, recoverNum, kselectVersion);  // recoverNum-th largest value of each candidate column
        pruneCols.Set(recoverCols);
#ifdef COMBBLAS_DEBUG
        std::ostringstream outs;
        outs << "Number of columns needing recovery: " << nrecover << std::endl;
        SpParHelper::Print(outs.str());
#endif
    }

    if(selectNum > 0)
    {
        // Selection: columns not already recovered (marked -1 by the substitution
        // below) whose nonzero count exceeds selectNum keep only their largest
        // selectNum entries.
        FullyDistSpVec<IT,NT> selectCols = EWiseApply<NT>(recoverCols, colSums,
                [](NT spval, NT dval){ return spval; },
                [](NT spval, NT dval){ return spval == -1; },
                true, static_cast<NT>(-1));
        selectCols = selectNum;
        selectCols = EWiseApply<NT>(selectCols, nnzPerColumn,
                [](NT spval, NT dval){ return spval; },
                [](NT spval, NT dval){ return dval > spval; },   // only columns with more than selectNum entries
                false, static_cast<NT>(-1));
        IT nselect = selectCols.getnnz();
        if(nselect > 0)
        {
            A.Kselect(selectCols, selectNum, kselectVersion);    // selectNum-th largest value of each candidate column
            pruneCols.Set(selectCols);
#ifdef COMBBLAS_DEBUG
            std::ostringstream outs;
            outs << "Number of columns needing selection: " << nselect << std::endl;
            SpParHelper::Print(outs.str());
#endif
            // Selection can push a column's sum below the recovery threshold
            // again; rebuild the statistics and re-check.
            FullyDistVec<IT,NT> colSums1 = A.Reduce(Column, std::plus<NT>(), static_cast<NT>(0));
            FullyDistVec<IT,NT> nnzPerColumn1 = A.Reduce(Column, std::plus<NT>(), static_cast<NT>(0), [](NT val){ return static_cast<NT>(1); });
            selectCols = recoverNum;
            selectCols = EWiseApply<NT>(selectCols, nnzPerColumn1,
                    [](NT spval, NT dval){ return spval; },
                    [](NT spval, NT dval){ return dval < spval; },
                    false, static_cast<NT>(-1));
            selectCols = recoverPct;
            selectCols = EWiseApply<NT>(selectCols, colSums1,
                    [](NT spval, NT dval){ return spval; },
                    [](NT spval, NT dval){ return dval < spval; },
                    false, static_cast<NT>(-1));
            IT n_recovery_after_select = selectCols.getnnz();
            if(n_recovery_after_select > 0)
            {
                A.Kselect(selectCols, recoverNum, kselectVersion);
                pruneCols.Set(selectCols);
#ifdef COMBBLAS_DEBUG
                std::ostringstream outs1;
                outs1 << "Number of columns needing recovery after selection: " << n_recovery_after_select << std::endl;
                SpParHelper::Print(outs1.str());
#endif
            }
        }
    }
    A.PruneColumn(pruneCols, std::less<NT>(), true);    // prune every column of A against its cutoff
}
/**
 * Memory-efficient parallel SpGEMM (the HipMCL workhorse): B is split into
 * 'phases' column batches; each batch is multiplied with A via SUMMA, merged,
 * and immediately shrunk with MCLPruneRecoverySelect, so only about 1/phases
 * of the unmerged product is alive at any time.
 */
template <typename SR, typename NUO, typename UDERO, typename IU, typename NU1, typename NU2, typename UDERA, typename UDERB>
SpParMat<IU,NUO,UDERO> MemEfficientSpGEMM (SpParMat<IU,NU1,UDERA> & A, SpParMat<IU,NU2,UDERB> & B,
        int phases, NUO hardThreshold, IU selectNum, IU recoverNum, NUO recoverPct,
        int kselectVersion, int64_t perProcessMemory)
{
    int myrank;
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
    if(A.getncol() != B.getnrow())
    {
        std::ostringstream outs;
        outs << "Cannot multiply, dimensions do not match" << std::endl;
        outs << A.getncol() << " != " << B.getnrow() << std::endl;
        SpParHelper::Print(outs.str());
    }
    if(phases < 1 || phases >= A.getncol())
    {
        SpParHelper::Print("MemEfficientSpGEMM: The value of phases is too small or too large. Resetting to 1.\n");
        phases = 1;
    }

    int stages, dummy;    // the last two parameters of ProductGrid are ignored for this multiplication
    std::shared_ptr<CommGrid> GridC = ProductGrid((A.commGrid).get(), (B.commGrid).get(), stages, dummy, dummy);

    if(perProcessMemory > 0)    // a memory budget was given: re-estimate 'phases'
    {
        int p;
        MPI_Comm World = GridC->GetWorld();
        MPI_Comm_size(World, &p);

        int64_t perNNZMem_in  = sizeof(IU)*2 + sizeof(NU1);   // bytes per input nonzero
        int64_t perNNZMem_out = sizeof(IU)*2 + sizeof(NUO);   // bytes per output nonzero

        // max per-process input nnz; four pieces live at once (local A and B
        // plus their received copies)
        int64_t lannz = A.getlocalnnz();
        int64_t gannz = 0;
        MPI_Allreduce(&lannz, &gannz, 1, MPIType<int64_t>(), MPI_MAX, World);
        int64_t inputMem = gannz * perNNZMem_in * 4;

        // estimated per-process nnz of the unmerged product; one extra copy is
        // alive during the multiway merge
        int64_t asquareNNZ = EstPerProcessNnzSUMMA(A, B);
        int64_t asquareMem = asquareNNZ * perNNZMem_out * 2;

        // k-select scratch memory; d overestimates the average nonzeros per
        // output column because it is based on the unmerged product
        int64_t d = ceil( (asquareNNZ * sqrt((double)p)) / B.getlocalcols() );
        int64_t k = std::min( int64_t(std::max(selectNum, recoverNum)), d );
        int64_t kselectmem = B.getlocalcols() * k * 8 * 3;

        // output memory after select/recovery
        int64_t outputNNZ = (B.getlocalcols() * k) / sqrt((double)p);
        int64_t outputMem = outputNNZ * perNNZMem_in * 2;

        // required: inputMem + outputMem + (asquareMem + kselectmem)/phases <= budget
        int64_t remainingMem = perProcessMemory*1000000000 - inputMem - outputMem;
        if(remainingMem > 0)
        {
            phases = 1 + (asquareMem + kselectmem) / remainingMem;
        }
        else if(myrank == 0)
        {
            std::cout << "Warning: the input and output memory requirement already exceeds the per-process available memory. "
                      << "Keeping phases at the value supplied on the command line; the program may run out of memory and crash!" << std::endl;
        }

#ifdef SHOW_MEMORY_USAGE
        int64_t maxMemory = kselectmem/phases + inputMem + outputMem + asquareMem/phases;
        if(maxMemory > 1000000000)
            std::cout << "phases: " << phases << ": per process memory: " << perProcessMemory
                      << " GB asquareMem: " << asquareMem/1000000000.00 << " GB"
                      << " inputMem: "  << inputMem/1000000000.00  << " GB"
                      << " outputMem: " << outputMem/1000000000.00 << " GB"
                      << " kselectmem: " << kselectmem/1000000000.00 << " GB" << std::endl;
        else
            std::cout << "phases: " << phases << ": per process memory: " << perProcessMemory
                      << " GB asquareMem: " << asquareMem/1000000.00 << " MB"
                      << " inputMem: "  << inputMem/1000000.00  << " MB"
                      << " outputMem: " << outputMem/1000000.00 << " MB"
                      << " kselectmem: " << kselectmem/1000000.00 << " MB" << std::endl;
#endif
    }

    IU C_m = A.spSeq->getnrow();
    IU C_n = B.spSeq->getncol();

    std::vector< UDERB > PiecesOfB;
    UDERB CopyB = *(B.spSeq);             // local copy, so aliased inputs (e.g. B == A) stay intact
    CopyB.ColSplit(phases, PiecesOfB);    // CopyB's memory is handed over to the pieces
    MPI_Barrier(GridC->GetWorld());

    IU ** ARecvSizes = SpHelper::allocate2D<IU>(UDERA::esscount, stages);
    IU ** BRecvSizes = SpHelper::allocate2D<IU>(UDERB::esscount, stages);
    SpParHelper::GetSetSizes(*(A.spSeq), ARecvSizes, (A.commGrid)->GetRowWorld());

    std::vector< UDERO > toconcatenate;    // one finished piece of C per phase

    int Aself = (A.commGrid)->GetRankInProcRow();
    int Bself = (B.commGrid)->GetRankInProcCol();

    for(int p = 0; p < phases; ++p)
    {
        SpParHelper::GetSetSizes(PiecesOfB[p], BRecvSizes, (B.commGrid)->GetColWorld());
        std::vector< SpTuples<IU,NUO> *> tomerge;
        for(int i = 0; i < stages; ++i)    // SUMMA loop
        {
            std::vector<IU> ess;
            UDERA * ARecv;
            if(i == Aself)
                ARecv = A.spSeq;    // shallow copy of the local piece
            else
            {
                ess.resize(UDERA::esscount);
                for(int j=0; j< UDERA::esscount; ++j)
                    ess[j] = ARecvSizes[j][i];    // essentials of the i-th matrix in this row
                ARecv = new UDERA();
            }
            double t0 = MPI_Wtime();
            SpParHelper::BCastMatrix(GridC->GetRowWorld(), *ARecv, ess, i);
            double t1 = MPI_Wtime();
            mcl_Abcasttime += (t1-t0);

            ess.clear();
            UDERB * BRecv;
            if(i == Bself)
                BRecv = &(PiecesOfB[p]);    // shallow copy of the local piece
            else
            {
                ess.resize(UDERB::esscount);
                for(int j=0; j< UDERB::esscount; ++j)
                    ess[j] = BRecvSizes[j][i];
                BRecv = new UDERB();
            }
            double t2 = MPI_Wtime();
            SpParHelper::BCastMatrix(GridC->GetColWorld(), *BRecv, ess, i);
            double t3 = MPI_Wtime();
            mcl_Bbcasttime += (t3-t2);

            double t4 = MPI_Wtime();
            // the last two arguments let LocalSpGEMM free the received
            // (non-local) operands when it is done with them
            SpTuples<IU,NUO> * C_cont = LocalSpGEMM<SR, NUO>(*ARecv, *BRecv, i != Aself, i != Bself);
            double t5 = MPI_Wtime();
            mcl_localspgemmtime += (t5-t4);

            if(!C_cont->isZero())
                tomerge.push_back(C_cont);
            else
                delete C_cont;
        }

#ifdef SHOW_MEMORY_USAGE
        int64_t gcnnz_unmerged, lcnnz_unmerged = 0;
        for(size_t i = 0; i < tomerge.size(); ++i)
        {
            lcnnz_unmerged += tomerge[i]->getnnz();
        }
        MPI_Allreduce(&lcnnz_unmerged, &gcnnz_unmerged, 1, MPIType<int64_t>(), MPI_MAX, MPI_COMM_WORLD);
        int64_t summa_memory = gcnnz_unmerged*20;    // ~20 bytes per unmerged nonzero
        if(summa_memory > 1000000000)
            std::cout << p+1 << ". unmerged: " << summa_memory/1000000000.00 << " GB ";
        else
            std::cout << p+1 << ". unmerged: " << summa_memory/1000000.00 << " MB ";
#endif

        double t6 = MPI_Wtime();
        SpTuples<IU,NUO> * OnePieceOfC_tuples = MultiwayMerge<SR>(tomerge, C_m, PiecesOfB[p].getncol(), true);

#ifdef SHOW_MEMORY_USAGE
        int64_t gcnnz_merged, lcnnz_merged;
        lcnnz_merged = OnePieceOfC_tuples->getnnz();
        MPI_Allreduce(&lcnnz_merged, &gcnnz_merged, 1, MPIType<int64_t>(), MPI_MAX, MPI_COMM_WORLD);
        int64_t merge_memory = gcnnz_merged*2*20;
        if(merge_memory > 1000000000)
            std::cout << " merged: " << merge_memory/1000000000.00 << " GB ";
        else
            std::cout << " merged: " << merge_memory/1000000.00 << " MB ";
#endif
        double t7 = MPI_Wtime();
        mcl_multiwaymergetime += (t7-t6);

        UDERO * OnePieceOfC = new UDERO(*OnePieceOfC_tuples, false);
        delete OnePieceOfC_tuples;

        SpParMat<IU,NUO,UDERO> OnePieceOfC_mat(OnePieceOfC, GridC);
        MCLPruneRecoverySelect(OnePieceOfC_mat, hardThreshold, selectNum, recoverNum, recoverPct, kselectVersion);

#ifdef SHOW_MEMORY_USAGE
        int64_t gcnnz_pruned, lcnnz_pruned;
        lcnnz_pruned = OnePieceOfC_mat.getlocalnnz();
        MPI_Allreduce(&lcnnz_pruned, &gcnnz_pruned, 1, MPIType<int64_t>(), MPI_MAX, MPI_COMM_WORLD);
        int64_t prune_memory = gcnnz_pruned*2*20;
        if(prune_memory > 1000000000)
            std::cout << "Prune: " << prune_memory/1000000000.00 << " GB " << std::endl;
        else
            std::cout << "Prune: " << prune_memory/1000000.00 << " MB " << std::endl;
#endif

        toconcatenate.push_back(OnePieceOfC_mat.seq());    // seq() hands over the local data
    }

    SpHelper::deallocate2D(ARecvSizes, UDERA::esscount);
    SpHelper::deallocate2D(BRecvSizes, UDERB::esscount);

    UDERO * C = new UDERO(0, C_m, C_n, 0);
    C->ColConcatenate(toconcatenate);    // reassemble C from the per-phase pieces
    return SpParMat<IU,NUO,UDERO>(C, GridC);
}
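// Illustrative call (a sketch; PlusTimesSRing is the standard (+,*) semiring
// shipped with CombBLAS, and all numeric arguments are examples only):
//
//   typedef PlusTimesSRing<double,double> PTFF;
//   auto C = MemEfficientSpGEMM<PTFF, double, SpDCCols<int64_t,double>>(
//                A, A, 4 /*phases*/, 1e-8 /*hardThreshold*/, 1100 /*selectNum*/,
//                1400 /*recoverNum*/, 0.9 /*recoverPct*/, 1 /*kselectVersion*/,
//                8 /*perProcessMemory in GB*/);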
/**
 * Double-buffered parallel SpGEMM: A and B are each split in half and
 * pipelined through two SUMMA sweeps, halving the peak size of the
 * broadcast buffers.
 */
template <typename SR, typename NUO, typename UDERO, typename IU, typename NU1, typename NU2, typename UDERA, typename UDERB>
SpParMat<IU,NUO,UDERO> Mult_AnXBn_DoubleBuff (SpParMat<IU,NU1,UDERA> & A, SpParMat<IU,NU2,UDERB> & B, bool clearA = false, bool clearB = false)
{
    if(!CheckSpGEMMCompliance(A,B))
    {
        return SpParMat<IU,NUO,UDERO>();
    }
    int stages, dummy;    // the last two parameters of ProductGrid are ignored for this multiplication
    std::shared_ptr<CommGrid> GridC = ProductGrid((A.commGrid).get(), (B.commGrid).get(), stages, dummy, dummy);
    IU C_m = A.spSeq->getnrow();
    IU C_n = B.spSeq->getncol();

    UDERA * A1seq = new UDERA();
    UDERA * A2seq = new UDERA();
    UDERB * B1seq = new UDERB();
    UDERB * B2seq = new UDERB();
    (A.spSeq)->Split(*A1seq, *A2seq);
    const_cast< UDERB* >(B.spSeq)->Transpose();    // local multiply expects B transposed
    (B.spSeq)->Split(*B1seq, *B2seq);
    MPI_Barrier(GridC->GetWorld());

    IU ** ARecvSizes = SpHelper::allocate2D<IU>(UDERA::esscount, stages);
    IU ** BRecvSizes = SpHelper::allocate2D<IU>(UDERB::esscount, stages);

    SpParHelper::GetSetSizes(*A1seq, ARecvSizes, (A.commGrid)->GetRowWorld());
    SpParHelper::GetSetSizes(*B1seq, BRecvSizes, (B.commGrid)->GetColWorld());

    std::vector< SpTuples<IU,NUO> *> tomerge;

    int Aself = (A.commGrid)->GetRankInProcRow();
    int Bself = (B.commGrid)->GetRankInProcCol();

    // first sweep: the A1/B1 halves
    for(int i = 0; i < stages; ++i)
    {
        std::vector<IU> ess;
        UDERA * ARecv;
        if(i == Aself)
            ARecv = A1seq;    // shallow copy
        else
        {
            ess.resize(UDERA::esscount);
            for(int j=0; j< UDERA::esscount; ++j)
                ess[j] = ARecvSizes[j][i];
            ARecv = new UDERA();
        }
        SpParHelper::BCastMatrix(GridC->GetRowWorld(), *ARecv, ess, i);

        ess.clear();
        UDERB * BRecv;
        if(i == Bself)
            BRecv = B1seq;
        else
        {
            ess.resize(UDERB::esscount);
            for(int j=0; j< UDERB::esscount; ++j)
                ess[j] = BRecvSizes[j][i];
            BRecv = new UDERB();
        }
        SpParHelper::BCastMatrix(GridC->GetColWorld(), *BRecv, ess, i);

        SpTuples<IU,NUO> * C_cont = MultiplyReturnTuples<SR, NUO>(*ARecv, *BRecv, false, true, i != Aself, i != Bself);
        if(!C_cont->isZero())
            tomerge.push_back(C_cont);
    }
    if(clearA)  delete A1seq;
    if(clearB)  delete B1seq;

    // second sweep: the A2/B2 halves
    SpParHelper::GetSetSizes(*A2seq, ARecvSizes, (A.commGrid)->GetRowWorld());
    SpParHelper::GetSetSizes(*B2seq, BRecvSizes, (B.commGrid)->GetColWorld());
    for(int i = 0; i < stages; ++i)
    {
        std::vector<IU> ess;
        UDERA * ARecv;
        if(i == Aself)
            ARecv = A2seq;
        else
        {
            ess.resize(UDERA::esscount);
            for(int j=0; j< UDERA::esscount; ++j)
                ess[j] = ARecvSizes[j][i];
            ARecv = new UDERA();
        }
        SpParHelper::BCastMatrix(GridC->GetRowWorld(), *ARecv, ess, i);

        ess.clear();
        UDERB * BRecv;
        if(i == Bself)
            BRecv = B2seq;
        else
        {
            ess.resize(UDERB::esscount);
            for(int j=0; j< UDERB::esscount; ++j)
                ess[j] = BRecvSizes[j][i];
            BRecv = new UDERB();
        }
        SpParHelper::BCastMatrix(GridC->GetColWorld(), *BRecv, ess, i);

        SpTuples<IU,NUO> * C_cont = MultiplyReturnTuples<SR, NUO>(*ARecv, *BRecv, false, true, i != Aself, i != Bself);
        if(!C_cont->isZero())
            tomerge.push_back(C_cont);
    }

    if(clearA)
    {
        delete A2seq;
        delete A.spSeq;
        A.spSeq = NULL;
    }
    else
    {
        (A.spSeq)->Merge(*A1seq, *A2seq);    // restore the original input
        delete A1seq;
        delete A2seq;
    }
    if(clearB)
    {
        delete B2seq;
        delete B.spSeq;
        B.spSeq = NULL;
    }
    else
    {
        (B.spSeq)->Merge(*B1seq, *B2seq);
        delete B1seq;
        delete B2seq;
        const_cast< UDERB* >(B.spSeq)->Transpose();    // transpose back to the original orientation
    }

    SpHelper::deallocate2D(ARecvSizes, UDERA::esscount);
    SpHelper::deallocate2D(BRecvSizes, UDERB::esscount);

    UDERO * C = new UDERO(MergeAll<SR>(tomerge, C_m, C_n, true), false);    // the last parameter of MergeAll frees the tuples
    return SpParMat<IU,NUO,UDERO>(C, GridC);
}
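// Typical use (sketch): one-shot distributed SpGEMM over the (+,*) semiring.
//
//   typedef PlusTimesSRing<double,double> PTFF;
//   auto C = Mult_AnXBn_DoubleBuff<PTFF, double, SpDCCols<int64_t,double>>(A, B);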
/**
 * Synchronous SUMMA: parallel C = A*B with blocking broadcasts at every stage.
 */
template <typename SR, typename NUO, typename UDERO, typename IU, typename NU1, typename NU2, typename UDERA, typename UDERB>
SpParMat<IU,NUO,UDERO> Mult_AnXBn_Synch (SpParMat<IU,NU1,UDERA> & A, SpParMat<IU,NU2,UDERB> & B, bool clearA = false, bool clearB = false)
{
    if(!CheckSpGEMMCompliance(A,B))
    {
        return SpParMat<IU,NUO,UDERO>();
    }
    int stages, dummy;    // the last two parameters of ProductGrid are ignored for Synch multiplication
    std::shared_ptr<CommGrid> GridC = ProductGrid((A.commGrid).get(), (B.commGrid).get(), stages, dummy, dummy);
    IU C_m = A.spSeq->getnrow();
    IU C_n = B.spSeq->getncol();

    const_cast< UDERB* >(B.spSeq)->Transpose();    // local multiply expects B transposed
    MPI_Barrier(GridC->GetWorld());

    IU ** ARecvSizes = SpHelper::allocate2D<IU>(UDERA::esscount, stages);
    IU ** BRecvSizes = SpHelper::allocate2D<IU>(UDERB::esscount, stages);

    SpParHelper::GetSetSizes(*(A.spSeq), ARecvSizes, (A.commGrid)->GetRowWorld());
    SpParHelper::GetSetSizes(*(B.spSeq), BRecvSizes, (B.commGrid)->GetColWorld());

    std::vector< SpTuples<IU,NUO> *> tomerge;

    int Aself = (A.commGrid)->GetRankInProcRow();
    int Bself = (B.commGrid)->GetRankInProcCol();

    for(int i = 0; i < stages; ++i)
    {
        std::vector<IU> ess;
        UDERA * ARecv;
        if(i == Aself)
            ARecv = A.spSeq;    // shallow copy
        else
        {
            ess.resize(UDERA::esscount);
            for(int j=0; j< UDERA::esscount; ++j)
                ess[j] = ARecvSizes[j][i];
            ARecv = new UDERA();
        }
        SpParHelper::BCastMatrix(GridC->GetRowWorld(), *ARecv, ess, i);

        ess.clear();
        UDERB * BRecv;
        if(i == Bself)
            BRecv = B.spSeq;
        else
        {
            ess.resize(UDERB::esscount);
            for(int j=0; j< UDERB::esscount; ++j)
                ess[j] = BRecvSizes[j][i];
            BRecv = new UDERB();
        }
        SpParHelper::BCastMatrix(GridC->GetColWorld(), *BRecv, ess, i);

        SpTuples<IU,NUO> * C_cont = MultiplyReturnTuples<SR, NUO>(*ARecv, *BRecv, false, true, i != Aself, i != Bself);
        if(!C_cont->isZero())
            tomerge.push_back(C_cont);

#ifdef COMBBLAS_DEBUG
        std::ostringstream outs;
        outs << i << "th SUMMA iteration" << std::endl;
        SpParHelper::Print(outs.str());
#endif
    }
    if(clearA && A.spSeq != NULL)
    {
        delete A.spSeq;
        A.spSeq = NULL;
    }
    if(clearB && B.spSeq != NULL)
    {
        delete B.spSeq;
        B.spSeq = NULL;
    }
    else if(B.spSeq != NULL)
    {
        const_cast< UDERB* >(B.spSeq)->Transpose();    // transpose back to the original orientation
    }

    SpHelper::deallocate2D(ARecvSizes, UDERA::esscount);
    SpHelper::deallocate2D(BRecvSizes, UDERB::esscount);

    SpTuples<IU,NUO> * C_tuples = MergeAll<SR>(tomerge, C_m, C_n, true);    // the last parameter frees the tuples
    UDERO * C = new UDERO(*C_tuples, false);
    delete C_tuples;
    return SpParMat<IU,NUO,UDERO>(C, GridC);
}
/**
 * Estimate the maximum per-process nonzero count of C = A*B without computing
 * the product: run the SUMMA communication pattern, but replace the local
 * multiply with a symbolic per-column nnz count (estimateNNZ).
 */
template <typename IU, typename NU1, typename NU2, typename UDERA, typename UDERB>
int64_t EstPerProcessNnzSUMMA(SpParMat<IU,NU1,UDERA> & A, SpParMat<IU,NU2,UDERB> & B)
{
    if(A.getncol() != B.getnrow())
    {
        std::ostringstream outs;
        outs << "Cannot multiply, dimensions do not match" << std::endl;
        outs << A.getncol() << " != " << B.getnrow() << std::endl;
        SpParHelper::Print(outs.str());
    }
    int64_t nnzC_SUMMA = 0;
    int stages, dummy;
    std::shared_ptr<CommGrid> GridC = ProductGrid((A.commGrid).get(), (B.commGrid).get(), stages, dummy, dummy);
    MPI_Barrier(GridC->GetWorld());

    IU ** ARecvSizes = SpHelper::allocate2D<IU>(UDERA::esscount, stages);
    IU ** BRecvSizes = SpHelper::allocate2D<IU>(UDERB::esscount, stages);

    SpParHelper::GetSetSizes(*(A.spSeq), ARecvSizes, (A.commGrid)->GetRowWorld());
    SpParHelper::GetSetSizes(*(B.spSeq), BRecvSizes, (B.commGrid)->GetColWorld());

    int Aself = (A.commGrid)->GetRankInProcRow();
    int Bself = (B.commGrid)->GetRankInProcCol();

    for(int i = 0; i < stages; ++i)
    {
        std::vector<IU> ess;
        UDERA * ARecv;
        if(i == Aself)
            ARecv = A.spSeq;
        else
        {
            ess.resize(UDERA::esscount);
            for(int j=0; j< UDERA::esscount; ++j)
                ess[j] = ARecvSizes[j][i];
            ARecv = new UDERA();
        }
        SpParHelper::BCastMatrix(GridC->GetRowWorld(), *ARecv, ess, i);

        ess.clear();
        UDERB * BRecv;
        if(i == Bself)
            BRecv = B.spSeq;
        else
        {
            ess.resize(UDERB::esscount);
            for(int j=0; j< UDERB::esscount; ++j)
                ess[j] = BRecvSizes[j][i];
            BRecv = new UDERB();
        }
        SpParHelper::BCastMatrix(GridC->GetColWorld(), *BRecv, ess, i);

        // symbolic multiply: nonzeros per (nonempty) column of this stage's output
        IU * colnnzC = estimateNNZ(*ARecv, *BRecv);
        IU nzc = BRecv->GetDCSC()->nzc;
        int64_t nnzC_stage = 0;
#ifdef THREADED
#pragma omp parallel for reduction (+:nnzC_stage)
#endif
        for (IU k=0; k<nzc; k++)
        {
            nnzC_stage = nnzC_stage + colnnzC[k];
        }
        nnzC_SUMMA += nnzC_stage;

        delete [] colnnzC;
        if(i != Aself)  delete ARecv;
        if(i != Bself)  delete BRecv;
    }

    SpHelper::deallocate2D(ARecvSizes, UDERA::esscount);
    SpHelper::deallocate2D(BRecvSizes, UDERB::esscount);

    int64_t nnzC_SUMMA_max = 0;
    MPI_Allreduce(&nnzC_SUMMA, &nnzC_SUMMA_max, 1, MPIType<int64_t>(), MPI_MAX, GridC->GetWorld());
    return nnzC_SUMMA_max;
}
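// The MPI_MAX reduction above reports the busiest process, which is the number
// that matters for a per-process memory budget. For example, with
// perNNZMem_out = 2*sizeof(int64_t) + sizeof(double) = 24 bytes (as in
// MemEfficientSpGEMM above), an estimate of 1e8 unmerged output nonzeros
// implies roughly 2.4 GB on that process before merging.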
template <typename MATRIX, typename VECTOR>
void CheckSpMVCompliance(const MATRIX & A, const VECTOR & x)
{
    if(A.getncol() != x.TotalLength())
    {
        std::ostringstream outs;
        outs << "Cannot multiply, dimensions do not match" << std::endl;
        outs << A.getncol() << " != " << x.TotalLength() << std::endl;
        SpParHelper::Print(outs.str());
        MPI_Abort(MPI_COMM_WORLD, DIMMISMATCH);
    }
    if(! ( *(A.getcommgrid()) == *(x.getcommgrid())) )
    {
        std::cout << "Grids are not comparable for SpMV" << std::endl;
        MPI_Abort(MPI_COMM_WORLD, GRIDMISMATCH);
    }
}
// Forward declaration; the definition follows the generic SpMV machinery below.
template <typename SR, typename IU, typename NUM, typename UDER>
FullyDistSpVec<IU, typename promote_trait<NUM,IU>::T_promote> SpMV
    (const SpParMat<IU,NUM,UDER> & A, const FullyDistSpVec<IU,IU> & x, bool indexisvalue,
     OptBuf<int32_t, typename promote_trait<NUM,IU>::T_promote> & optbuf);

// Convenience overload that supplies an empty OptBuf (no pre-allocated buffers).
template <typename SR, typename IU, typename NUM, typename UDER>
FullyDistSpVec<IU, typename promote_trait<NUM,IU>::T_promote> SpMV
    (const SpParMat<IU,NUM,UDER> & A, const FullyDistSpVec<IU,IU> & x, bool indexisvalue)
{
    typedef typename promote_trait<NUM,IU>::T_promote T_promote;
    OptBuf<int32_t, T_promote> optbuf = OptBuf<int32_t, T_promote>();
    return SpMV<SR>(A, x, indexisvalue, optbuf);
}
/**
 * Step 1 of sparse SpMV: exchange the local piece of x with the diagonal-mirror
 * process, aligning the vector with the matrix's column distribution.
 */
template <typename IU, typename NV>
void TransposeVector(MPI_Comm & World, const FullyDistSpVec<IU,NV> & x, int32_t & trxlocnz, IU & lenuntil,
                     int32_t * & trxinds, NV * & trxnums, bool indexisvalue)
{
    int32_t xlocnz = (int32_t) x.getlocnnz();
    int32_t roffst = (int32_t) x.RowLenUntil();    // 32-bit offsets: local indices fit in 32 bits
    int32_t roffset;
    IU luntil = x.LengthUntil();
    int diagneigh = x.commGrid->GetComplementRank();

    MPI_Status status;
    MPI_Sendrecv(&roffst, 1, MPIType<int32_t>(), diagneigh, TROST, &roffset, 1, MPIType<int32_t>(), diagneigh, TROST, World, &status);
    MPI_Sendrecv(&xlocnz, 1, MPIType<int32_t>(), diagneigh, TRNNZ, &trxlocnz, 1, MPIType<int32_t>(), diagneigh, TRNNZ, World, &status);
    MPI_Sendrecv(&luntil, 1, MPIType<IU>(), diagneigh, TRLUT, &lenuntil, 1, MPIType<IU>(), diagneigh, TRLUT, World, &status);

    // do not touch x's own arrays: copy its indices into a 32-bit scratch buffer
    trxinds = new int32_t[trxlocnz];
    int32_t * temp_xind = new int32_t[xlocnz];
#ifdef THREADED
#pragma omp parallel for
#endif
    for(int i=0; i< xlocnz; ++i)
        temp_xind[i] = (int32_t) x.ind[i];
    MPI_Sendrecv(temp_xind, xlocnz, MPIType<int32_t>(), diagneigh, TRI, trxinds, trxlocnz, MPIType<int32_t>(), diagneigh, TRI, World, &status);
    delete [] temp_xind;

    if(!indexisvalue)    // values only travel when they are not implied by the indices
    {
        trxnums = new NV[trxlocnz];
        MPI_Sendrecv(const_cast<NV*>(SpHelper::p2a(x.num)), xlocnz, MPIType<NV>(), diagneigh, TRX, trxnums, trxlocnz, MPIType<NV>(), diagneigh, TRX, World, &status);
    }
    // convert fullydist (p-piece) indexing to matrix (sqrt(p)-piece) indexing
    std::transform(trxinds, trxinds+trxlocnz, trxinds, [roffset](int32_t val){ return val + roffset; });
}
/**
 * Step 2 of sparse SpMV: allgather the transposed vector pieces along the
 * processor column, so each process gets every x entry its block-column of A needs.
 */
template <typename IU, typename NV>
void AllGatherVector(MPI_Comm & ColWorld, int trxlocnz, IU lenuntil, int32_t * & trxinds, NV * & trxnums,
                     int32_t * & indacc, NV * & numacc, int & accnz, bool indexisvalue)
{
    int colneighs, colrank;
    MPI_Comm_size(ColWorld, &colneighs);
    MPI_Comm_rank(ColWorld, &colrank);
    int * colnz = new int[colneighs];
    colnz[colrank] = trxlocnz;
    MPI_Allgather(MPI_IN_PLACE, 1, MPI_INT, colnz, 1, MPI_INT, ColWorld);
    int * dpls = new int[colneighs]();    // displacements (zero initialized)
    std::partial_sum(colnz, colnz+colneighs-1, dpls+1);
    accnz = std::accumulate(colnz, colnz+colneighs, 0);
    indacc = new int32_t[accnz];
    numacc = new NV[accnz];

#ifdef TIMING
    double t0 = MPI_Wtime();
#endif
    MPI_Allgatherv(trxinds, trxlocnz, MPIType<int32_t>(), indacc, colnz, dpls, MPIType<int32_t>(), ColWorld);
    if(indexisvalue)
    {
        // values are implied by the indices: reconstruct them locally instead of communicating them
        IU lenuntilcol;
        if(colrank == 0)  lenuntilcol = lenuntil;
        MPI_Bcast(&lenuntilcol, 1, MPIType<IU>(), 0, ColWorld);
        for(int i=0; i< accnz; ++i)
        {
            numacc[i] = indacc[i] + lenuntilcol;
        }
    }
    else
    {
        MPI_Allgatherv(trxnums, trxlocnz, MPIType<NV>(), numacc, colnz, dpls, MPIType<NV>(), ColWorld);
        delete [] trxnums;
    }
#ifdef TIMING
    double t1 = MPI_Wtime();
    cblas_allgathertime += (t1-t0);
#endif
    DeleteAll(colnz, dpls, trxinds);
}
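// The sparse SpMV below is assembled from these helpers in four steps:
//   1. TransposeVector    : each local piece of x travels to its diagonal-mirror process
//   2. AllGatherVector    : the pieces are gathered along every processor column
//   3. LocalSpMV          : each process multiplies its block of A by the gathered x
//   4. MergeContributions : partial outputs are exchanged along processor rows and merged into y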
/**
 * Step 3 of sparse SpMV: the local multiply, producing per-destination send
 * buffers for the row-wise exchange.
 */
template <typename SR, typename IVT, typename OVT, typename IU, typename NUM, typename UDER>
void LocalSpMV(const SpParMat<IU,NUM,UDER> & A, int rowneighs, OptBuf<int32_t, OVT> & optbuf, int32_t * & indacc, IVT * & numacc,
               int32_t * & sendindbuf, OVT * & sendnumbuf, int * & sdispls, int * sendcnt, int accnz, bool indexisvalue, PreAllocatedSPA<OVT> & SPA)
{
    if(optbuf.totmax > 0)    // graph500 optimization enabled
    {
        if(A.spSeq->getnsplit() > 0)
        {
            // optbuf.{inds,nums,dspls} and sendcnt are pre-allocated and only filled here
            generic_gespmv_threaded_setbuffers<SR>(*(A.spSeq), indacc, numacc, accnz, optbuf.inds, optbuf.nums, sendcnt, optbuf.dspls, rowneighs);
        }
        else
        {
            generic_gespmv<SR>(*(A.spSeq), indacc, numacc, accnz, optbuf.inds, optbuf.nums, sendcnt, optbuf.dspls, rowneighs, indexisvalue);
        }
        DeleteAll(indacc, numacc);
    }
    else
    {
        if(A.spSeq->getnsplit() > 0)
        {
            // sendindbuf, sendnumbuf and sdispls are all allocated and filled by the threaded kernel
            int totalsent = generic_gespmv_threaded<SR>(*(A.spSeq), indacc, numacc, accnz, sendindbuf, sendnumbuf, sdispls, rowneighs, SPA);
            DeleteAll(indacc, numacc);
            for(int i=0; i<rowneighs-1; ++i)
                sendcnt[i] = sdispls[i+1] - sdispls[i];
            sendcnt[rowneighs-1] = totalsent - sdispls[rowneighs-1];
        }
        else
        {
            std::vector< int32_t > indy;
            std::vector< OVT > numy;
            generic_gespmv<SR>(*(A.spSeq), indacc, numacc, accnz, indy, numy, SPA);
            DeleteAll(indacc, numacc);

            int32_t bufsize = indy.size();    // indy and numy have the same size
            sendindbuf = new int32_t[bufsize];
            sendnumbuf = new OVT[bufsize];

            // walk the sorted output once, converting global row indices to
            // each receiver's local indices and counting per-destination sizes
            int32_t perproc = A.getlocalrows() / rowneighs;
            int k = 0;
            for(int i=0; i<rowneighs; ++i)
            {
                int32_t end_this = (i == rowneighs-1) ? A.getlocalrows() : (i+1)*perproc;
                while(k < bufsize && indy[k] < end_this)
                {
                    sendindbuf[k] = indy[k] - i*perproc;
                    sendnumbuf[k] = numy[k];
                    ++sendcnt[i];
                    ++k;
                }
            }
            sdispls = new int[rowneighs]();
            std::partial_sum(sendcnt, sendcnt+rowneighs-1, sdispls+1);
        }
    }
}
/**
 * Step 4 of sparse SpMV: a k-way merge over the received (sorted) index lists,
 * combining values that land on the same index with the semiring addition.
 */
template <typename SR, typename IU, typename OVT>
void MergeContributions(int * listSizes, std::vector<int32_t *> & indsvec, std::vector<OVT *> & numsvec, std::vector<IU> & mergedind, std::vector<OVT> & mergednum)
{
    int nlists = indsvec.size();
    if(nlists == 1)    // nothing to merge: copy the single list through
    {
        int veclen = listSizes[0];
        mergedind.resize(veclen);
        mergednum.resize(veclen);
        for(int i=0; i<veclen; i++)
        {
            mergedind[i] = indsvec[0][i];
            mergednum[i] = numsvec[0][i];
        }
        return;
    }

    int32_t inf = std::numeric_limits<int32_t>::min();
    int32_t sup = std::numeric_limits<int32_t>::max();
    KNHeap< int32_t, int32_t > sHeap(sup, inf);    // keyed by index, carrying the list id
    int * processed = new int[nlists]();
    for(int i=0; i<nlists; ++i)
    {
        if(listSizes[i] > 0)
        {
            sHeap.insert(indsvec[i][0], i);    // key, list id
        }
    }

    int32_t key, locv;
    sHeap.deleteMin(&key, &locv);    // seed the output with the global minimum
    mergedind.push_back(static_cast<IU>(key));
    mergednum.push_back(numsvec[locv][0]);
    if( (++(processed[locv])) < listSizes[locv] )
        sHeap.insert(indsvec[locv][processed[locv]], locv);

    while(sHeap.getSize() > 0)
    {
        sHeap.deleteMin(&key, &locv);
        if(mergedind.back() == static_cast<IU>(key))    // duplicate index: combine via the semiring
        {
            mergednum.back() = SR::add(mergednum.back(), numsvec[locv][processed[locv]]);
        }
        else
        {
            mergedind.push_back(static_cast<IU>(key));
            mergednum.push_back(numsvec[locv][processed[locv]]);
        }
        if( (++(processed[locv])) < listSizes[locv] )
            sHeap.insert(indsvec[locv][processed[locv]], locv);
    }
    delete [] processed;
}
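// Merge semantics, illustrated (with SR::add being ordinary addition):
//   list 0: (2,1.0) (5,2.0)     list 1: (2,3.0) (7,1.0)
//   merged: (2,4.0) (5,2.0) (7,1.0)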
/**
 * Threaded merge: split the index range [0, maxindex) into ~4*nthreads chunks,
 * locate the chunk boundaries in every sorted input list by binary search,
 * merge the chunks independently, then stitch the per-chunk outputs together.
 * Splitting every list at the same global index is what makes the per-chunk
 * merges independent: each output index lands in exactly one chunk.
 */
template <typename SR, typename IU, typename OVT>
void MergeContributions_threaded(int * & listSizes, std::vector<int32_t *> & indsvec, std::vector<OVT *> & numsvec, std::vector<IU> & mergedind, std::vector<OVT> & mergednum, IU maxindex)
{
    int nlists = indsvec.size();
    if(nlists == 1)    // nothing to merge: copy the single list through in parallel
    {
        int veclen = listSizes[0];
        mergedind.resize(veclen);
        mergednum.resize(veclen);
#ifdef THREADED
#pragma omp parallel for
#endif
        for(int i=0; i<veclen; i++)
        {
            mergedind[i] = indsvec[0][i];
            mergednum[i] = numsvec[0][i];
        }
        return;
    }

    int nthreads = 1;
#ifdef THREADED
#pragma omp parallel
    {
        nthreads = omp_get_num_threads();
    }
#endif
    int nsplits = 4*nthreads;    // oversplit for load balance
    nsplits = std::min(nsplits, (int)maxindex);
    std::vector< std::vector<int32_t> > splitters(nlists);
    for(int k=0; k< nlists; k++)
    {
        splitters[k].resize(nsplits+1);
        splitters[k][0] = static_cast<int32_t>(0);
#pragma omp parallel for
        for(int i=1; i< nsplits; i++)
        {
            IU cur_idx = i * (maxindex/nsplits);
            auto it = std::lower_bound(indsvec[k], indsvec[k] + listSizes[k], cur_idx);
            splitters[k][i] = (int32_t) (it - indsvec[k]);
        }
        splitters[k][nsplits] = listSizes[k];
    }

    // serial merge of each chunk, in parallel across chunks
    std::vector<std::vector<IU>> indsBuf(nsplits);
    std::vector<std::vector<OVT>> numsBuf(nsplits);
#pragma omp parallel for schedule(dynamic)
    for(int i=0; i< nsplits; i++)
    {
        std::vector<int32_t *> tIndsVec(nlists);
        std::vector<OVT *> tNumsVec(nlists);
        std::vector<int> tLengths(nlists);
        for(int j=0; j< nlists; ++j)
        {
            tIndsVec[j] = indsvec[j] + splitters[j][i];
            tNumsVec[j] = numsvec[j] + splitters[j][i];
            tLengths[j] = splitters[j][i+1] - splitters[j][i];
        }
        MergeContributions<SR>(tLengths.data(), tIndsVec, tNumsVec, indsBuf[i], numsBuf[i]);
    }

    // stitch the per-chunk outputs at their exclusive-prefix-sum offsets
    std::vector<IU> tdisp(nsplits+1);
    tdisp[0] = 0;
    for(int i=0; i<nsplits; ++i)
    {
        tdisp[i+1] = tdisp[i] + indsBuf[i].size();
    }
    mergedind.resize(tdisp[nsplits]);
    mergednum.resize(tdisp[nsplits]);

#pragma omp parallel for schedule(dynamic)
    for(int i=0; i< nsplits; i++)
    {
        std::copy(indsBuf[i].data(), indsBuf[i].data() + indsBuf[i].size(), mergedind.data() + tdisp[i]);
        std::copy(numsBuf[i].data(), numsBuf[i].data() + numsBuf[i].size(), mergednum.data() + tdisp[i]);
    }
}
/**
 * Distributed sparse-matrix sparse-vector multiply, y = A*x over semiring SR.
 * IVT is the input vector's value type; OVT is the output type of SR.
 */
template <typename SR, typename IVT, typename OVT, typename IU, typename NUM, typename UDER>
void SpMV (const SpParMat<IU,NUM,UDER> & A, const FullyDistSpVec<IU,IVT> & x, FullyDistSpVec<IU,OVT> & y,
           bool indexisvalue, OptBuf<int32_t, OVT> & optbuf, PreAllocatedSPA<OVT> & SPA)
{
    CheckSpMVCompliance(A, x);

    MPI_Comm World = x.commGrid->GetWorld();
    MPI_Comm ColWorld = x.commGrid->GetColWorld();
    MPI_Comm RowWorld = x.commGrid->GetRowWorld();

    int accnz;
    int32_t trxlocnz;
    IU lenuntil;
    int32_t *trxinds, *indacc;
    IVT *trxnums, *numacc;

#ifdef TIMING
    double t0 = MPI_Wtime();
#endif
    TransposeVector(World, x, trxlocnz, lenuntil, trxinds, trxnums, indexisvalue);
#ifdef TIMING
    double t1 = MPI_Wtime();
    cblas_transvectime += (t1-t0);
#endif

    if(x.commGrid->GetGridRows() > 1)
    {
        AllGatherVector(ColWorld, trxlocnz, lenuntil, trxinds, trxnums, indacc, numacc, accnz, indexisvalue);
    }
    else
    {
        accnz = trxlocnz;
        indacc = trxinds;    // aliasing pointers
        numacc = trxnums;
    }

    int rowneighs;
    MPI_Comm_size(RowWorld, &rowneighs);
    int * sendcnt = new int[rowneighs]();
    int32_t * sendindbuf;
    OVT * sendnumbuf;
    int * sdispls;
#ifdef TIMING
    double t2 = MPI_Wtime();
#endif
    LocalSpMV<SR>(A, rowneighs, optbuf, indacc, numacc, sendindbuf, sendnumbuf, sdispls, sendcnt, accnz, indexisvalue, SPA);
#ifdef TIMING
    double t3 = MPI_Wtime();
    cblas_localspmvtime += (t3-t2);
#endif

    if(x.commGrid->GetGridCols() == 1)    // single processor column: no row-wise exchange needed
    {
        y.ind.resize(sendcnt[0]);
        y.num.resize(sendcnt[0]);

        if(optbuf.totmax > 0)    // graph500 optimization enabled
        {
#ifdef THREADED
#pragma omp parallel for
#endif
            for(int i=0; i<sendcnt[0]; i++)
            {
                y.ind[i] = optbuf.inds[i];
                y.num[i] = optbuf.nums[i];
            }
        }
        else
        {
#ifdef THREADED
#pragma omp parallel for
#endif
            for(int i=0; i<sendcnt[0]; i++)
            {
                y.ind[i] = sendindbuf[i];
                y.num[i] = sendnumbuf[i];
            }
            DeleteAll(sendindbuf, sendnumbuf, sdispls);
        }
        delete [] sendcnt;
        return;
    }

    int * rdispls = new int[rowneighs];
    int * recvcnt = new int[rowneighs];
    MPI_Alltoall(sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, RowWorld);    // share the counts

    rdispls[0] = 0;
    for(int i=0; i<rowneighs-1; ++i)
    {
        rdispls[i+1] = rdispls[i] + recvcnt[i];
    }
    int totrecv = std::accumulate(recvcnt, recvcnt+rowneighs, 0);
    int32_t * recvindbuf = new int32_t[totrecv];
    OVT * recvnumbuf = new OVT[totrecv];

#ifdef TIMING
    double t4 = MPI_Wtime();
#endif
    if(optbuf.totmax > 0)    // graph500 optimization enabled
    {
        MPI_Alltoallv(optbuf.inds, sendcnt, optbuf.dspls, MPIType<int32_t>(), recvindbuf, recvcnt, rdispls, MPIType<int32_t>(), RowWorld);
        MPI_Alltoallv(optbuf.nums, sendcnt, optbuf.dspls, MPIType<OVT>(), recvnumbuf, recvcnt, rdispls, MPIType<OVT>(), RowWorld);
        delete [] sendcnt;
    }
    else
    {
        MPI_Alltoallv(sendindbuf, sendcnt, sdispls, MPIType<int32_t>(), recvindbuf, recvcnt, rdispls, MPIType<int32_t>(), RowWorld);
        MPI_Alltoallv(sendnumbuf, sendcnt, sdispls, MPIType<OVT>(), recvnumbuf, recvcnt, rdispls, MPIType<OVT>(), RowWorld);
        DeleteAll(sendindbuf, sendnumbuf, sendcnt, sdispls);
    }
#ifdef TIMING
    double t5 = MPI_Wtime();
    cblas_alltoalltime += (t5-t4);
#endif

#ifdef TIMING
    double t6 = MPI_Wtime();
#endif
    // release y's memory, in case it was aliased with x
    std::vector<IU>().swap(y.ind);
    std::vector<OVT>().swap(y.num);

    std::vector<int32_t *> indsvec(rowneighs);
    std::vector<OVT *> numsvec(rowneighs);
#ifdef THREADED
#pragma omp parallel for
#endif
    for(int i=0; i<rowneighs; i++)
    {
        indsvec[i] = recvindbuf+rdispls[i];
        numsvec[i] = recvnumbuf+rdispls[i];
    }
#ifdef THREADED
    MergeContributions_threaded<SR>(recvcnt, indsvec, numsvec, y.ind, y.num, y.MyLocLength());
#else
    MergeContributions<SR>(recvcnt, indsvec, numsvec, y.ind, y.num);
#endif
    DeleteAll(recvcnt, rdispls, recvindbuf, recvnumbuf);
#ifdef TIMING
    double t7 = MPI_Wtime();
    cblas_mergeconttime += (t7-t6);
#endif
}
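// Typical call pattern (a sketch in the style of the BFS codes built on this
// header; SelectMaxSRing and the boolean matrix are assumptions of the example,
// not requirements of this overload):
//
//   FullyDistSpVec<int64_t,int64_t> fringe(A.getcommgrid(), A.getncol());
//   OptBuf<int32_t,int64_t> optbuf;   // optionally pre-sized via A.OptimizeForGraph500(optbuf)
//   fringe = SpMV<SelectMaxSRing<bool,int64_t>>(A, fringe, true /*indexisvalue*/, optbuf);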
//! Overload without a PreAllocatedSPA: creates an empty one.
template <typename SR, typename IVT, typename OVT, typename IU, typename NUM, typename UDER>
void SpMV (const SpParMat<IU,NUM,UDER> & A, const FullyDistSpVec<IU,IVT> & x, FullyDistSpVec<IU,OVT> & y, bool indexisvalue, OptBuf<int32_t, OVT> & optbuf)
{
    PreAllocatedSPA<OVT> SPA;
    SpMV<SR>(A, x, y, indexisvalue, optbuf, SPA);
}

//! Overload without an OptBuf or a PreAllocatedSPA.
template <typename SR, typename IVT, typename OVT, typename IU, typename NUM, typename UDER>
void SpMV (const SpParMat<IU,NUM,UDER> & A, const FullyDistSpVec<IU,IVT> & x, FullyDistSpVec<IU,OVT> & y, bool indexisvalue)
{
    OptBuf<int32_t, OVT> optbuf = OptBuf<int32_t, OVT>();
    PreAllocatedSPA<OVT> SPA;
    SpMV<SR>(A, x, y, indexisvalue, optbuf, SPA);
}

//! Overload without an OptBuf.
template <typename SR, typename IVT, typename OVT, typename IU, typename NUM, typename UDER>
void SpMV (const SpParMat<IU,NUM,UDER> & A, const FullyDistSpVec<IU,IVT> & x, FullyDistSpVec<IU,OVT> & y, bool indexisvalue, PreAllocatedSPA<OVT> & SPA)
{
    OptBuf<int32_t, OVT> optbuf = OptBuf<int32_t, OVT>();
    SpMV<SR>(A, x, y, indexisvalue, optbuf, SPA);
}

//! Returning overload: the output type is promoted from the matrix and index types.
template <typename SR, typename IU, typename NUM, typename UDER>
FullyDistSpVec<IU, typename promote_trait<NUM,IU>::T_promote> SpMV
    (const SpParMat<IU,NUM,UDER> & A, const FullyDistSpVec<IU,IU> & x, bool indexisvalue,
     OptBuf<int32_t, typename promote_trait<NUM,IU>::T_promote> & optbuf)
{
    typedef typename promote_trait<NUM,IU>::T_promote T_promote;
    FullyDistSpVec<IU, T_promote> y(x.getcommgrid(), A.getnrow());
    SpMV<SR>(A, x, y, indexisvalue, optbuf);
    return y;
}
/**
 * Parallel SpMV with a dense (FullyDistVec) input vector over semiring SR.
 */
template <typename SR, typename IU, typename NUM, typename NUV, typename UDER>
FullyDistVec<IU,typename promote_trait<NUM,NUV>::T_promote> SpMV
    (const SpParMat<IU,NUM,UDER> & A, const FullyDistVec<IU,NUV> & x)
{
    typedef typename promote_trait<NUM,NUV>::T_promote T_promote;
    CheckSpMVCompliance(A, x);

    MPI_Comm World = x.commGrid->GetWorld();
    MPI_Comm ColWorld = x.commGrid->GetColWorld();
    MPI_Comm RowWorld = x.commGrid->GetRowWorld();

    int xsize = (int) x.LocArrSize();
    int trxsize = 0;
    int diagneigh = x.commGrid->GetComplementRank();
    MPI_Status status;
    MPI_Sendrecv(&xsize, 1, MPI_INT, diagneigh, TRX, &trxsize, 1, MPI_INT, diagneigh, TRX, World, &status);

    NUV * trxnums = new NUV[trxsize];
    MPI_Sendrecv(const_cast<NUV*>(SpHelper::p2a(x.arr)), xsize, MPIType<NUV>(), diagneigh, TRX, trxnums, trxsize, MPIType<NUV>(), diagneigh, TRX, World, &status);

    int colneighs, colrank;
    MPI_Comm_size(ColWorld, &colneighs);
    MPI_Comm_rank(ColWorld, &colrank);
    int * colsize = new int[colneighs];
    colsize[colrank] = trxsize;
    MPI_Allgather(MPI_IN_PLACE, 1, MPI_INT, colsize, 1, MPI_INT, ColWorld);
    int * dpls = new int[colneighs]();
    std::partial_sum(colsize, colsize+colneighs-1, dpls+1);
    int accsize = std::accumulate(colsize, colsize+colneighs, 0);
    NUV * numacc = new NUV[accsize];

    MPI_Allgatherv(trxnums, trxsize, MPIType<NUV>(), numacc, colsize, dpls, MPIType<NUV>(), ColWorld);
    DeleteAll(trxnums, colsize, dpls);

    // serial SpMV with the gathered dense vector
    T_promote id = SR::id();
    IU ysize = A.getlocalrows();
    T_promote * localy = new T_promote[ysize];
    std::fill_n(localy, ysize, id);

#ifdef THREADED
    dcsc_gespmv_threaded<SR>(*(A.spSeq), numacc, localy);
#else
    dcsc_gespmv<SR>(*(A.spSeq), numacc, localy);
#endif
    delete [] numacc;

    // scatter-reduce the partial results along each processor row
    FullyDistVec<IU, T_promote> y(x.commGrid, A.getnrow(), id);
    int rowneighs;
    MPI_Comm_size(RowWorld, &rowneighs);
    IU begptr, endptr;
    for(int i=0; i< rowneighs; ++i)
    {
        begptr = y.RowLenUntil(i);
        if(i == rowneighs-1)
        {
            endptr = ysize;
        }
        else
        {
            endptr = y.RowLenUntil(i+1);
        }
        MPI_Reduce(localy+begptr, SpHelper::p2a(y.arr), endptr-begptr, MPIType<T_promote>(), SR::mpi_op(), i, RowWorld);
    }
    delete [] localy;
    return y;
}
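// Example (sketch): y = A*x over the (+,*) semiring with a dense input vector.
//
//   typedef PlusTimesSRing<double,double> PTFF;
//   FullyDistVec<int64_t,double> x(A.getcommgrid(), A.getncol(), 1.0);
//   FullyDistVec<int64_t,double> y = SpMV<PTFF>(A, x);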
/**
 * Older sparse-input SpMV: output contributions are merged through a dense
 * scratch array plus a sorted index list, rather than a heap-based merge.
 */
template <typename SR, typename IU, typename NUM, typename NUV, typename UDER>
FullyDistSpVec<IU,typename promote_trait<NUM,NUV>::T_promote> SpMV
    (const SpParMat<IU,NUM,UDER> & A, const FullyDistSpVec<IU,NUV> & x)
{
    typedef typename promote_trait<NUM,NUV>::T_promote T_promote;
    CheckSpMVCompliance(A, x);

    MPI_Comm World = x.commGrid->GetWorld();
    MPI_Comm ColWorld = x.commGrid->GetColWorld();
    MPI_Comm RowWorld = x.commGrid->GetRowWorld();

    int xlocnz = (int) x.getlocnnz();
    int trxlocnz = 0;
    int roffst = x.RowLenUntil();
    int offset;
    int diagneigh = x.commGrid->GetComplementRank();

    MPI_Status status;
    MPI_Sendrecv(&xlocnz, 1, MPI_INT, diagneigh, TRX, &trxlocnz, 1, MPI_INT, diagneigh, TRX, World, &status);
    MPI_Sendrecv(&roffst, 1, MPI_INT, diagneigh, TROST, &offset, 1, MPI_INT, diagneigh, TROST, World, &status);

    IU * trxinds = new IU[trxlocnz];
    NUV * trxnums = new NUV[trxlocnz];
    MPI_Sendrecv(const_cast<IU*>(SpHelper::p2a(x.ind)), xlocnz, MPIType<IU>(), diagneigh, TRX, trxinds, trxlocnz, MPIType<IU>(), diagneigh, TRX, World, &status);
    MPI_Sendrecv(const_cast<NUV*>(SpHelper::p2a(x.num)), xlocnz, MPIType<NUV>(), diagneigh, TRX, trxnums, trxlocnz, MPIType<NUV>(), diagneigh, TRX, World, &status);
    std::transform(trxinds, trxinds+trxlocnz, trxinds, [offset](IU val){ return val + offset; });

    int colneighs, colrank;
    MPI_Comm_size(ColWorld, &colneighs);
    MPI_Comm_rank(ColWorld, &colrank);
    int * colnz = new int[colneighs];
    colnz[colrank] = trxlocnz;
    MPI_Allgather(MPI_IN_PLACE, 1, MPI_INT, colnz, 1, MPI_INT, ColWorld);
    int * dpls = new int[colneighs]();
    std::partial_sum(colnz, colnz+colneighs-1, dpls+1);
    int accnz = std::accumulate(colnz, colnz+colneighs, 0);
    IU * indacc = new IU[accnz];
    NUV * numacc = new NUV[accnz];

    MPI_Allgatherv(trxinds, trxlocnz, MPIType<IU>(), indacc, colnz, dpls, MPIType<IU>(), ColWorld);
    MPI_Allgatherv(trxnums, trxlocnz, MPIType<NUV>(), numacc, colnz, dpls, MPIType<NUV>(), ColWorld);
    DeleteAll(trxinds, trxnums, colnz, dpls);

    // serial SpMV with the gathered sparse vector
    std::vector< int32_t > indy;
    std::vector< T_promote > numy;

    int32_t * tmpindacc = new int32_t[accnz];    // the local kernel expects 32-bit indices
    for(int i=0; i< accnz; ++i)  tmpindacc[i] = indacc[i];
    delete [] indacc;

    dcsc_gespmv<SR>(*(A.spSeq), tmpindacc, numacc, accnz, indy, numy);
    DeleteAll(tmpindacc, numacc);

    FullyDistSpVec<IU, T_promote> y(x.commGrid, A.getnrow());
    IU yintlen = y.MyRowLength();

    int rowneighs;
    MPI_Comm_size(RowWorld, &rowneighs);
    std::vector< std::vector<IU> > sendind(rowneighs);
    std::vector< std::vector<T_promote> > sendnum(rowneighs);
    typename std::vector<int32_t>::size_type outnz = indy.size();
    for(typename std::vector<IU>::size_type i=0; i< outnz; ++i)
    {
        IU locind;
        int rown = y.OwnerWithinRow(yintlen, static_cast<IU>(indy[i]), locind);
        sendind[rown].push_back(locind);
        sendnum[rown].push_back(numy[i]);
    }

    IU * sendindbuf = new IU[outnz];
    T_promote * sendnumbuf = new T_promote[outnz];
    int * sendcnt = new int[rowneighs];
    int * sdispls = new int[rowneighs];
    for(int i=0; i<rowneighs; ++i)
        sendcnt[i] = sendind[i].size();

    int * rdispls = new int[rowneighs];
    int * recvcnt = new int[rowneighs];
    MPI_Alltoall(sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, RowWorld);    // share the counts

    sdispls[0] = 0;
    rdispls[0] = 0;
    for(int i=0; i<rowneighs-1; ++i)
    {
        sdispls[i+1] = sdispls[i] + sendcnt[i];
        rdispls[i+1] = rdispls[i] + recvcnt[i];
    }
    int totrecv = std::accumulate(recvcnt, recvcnt+rowneighs, 0);
    IU * recvindbuf = new IU[totrecv];
    T_promote * recvnumbuf = new T_promote[totrecv];

    for(int i=0; i<rowneighs; ++i)
    {
        std::copy(sendind[i].begin(), sendind[i].end(), sendindbuf+sdispls[i]);
        std::vector<IU>().swap(sendind[i]);
    }
    for(int i=0; i<rowneighs; ++i)
    {
        std::copy(sendnum[i].begin(), sendnum[i].end(), sendnumbuf+sdispls[i]);
        std::vector<T_promote>().swap(sendnum[i]);
    }
    MPI_Alltoallv(sendindbuf, sendcnt, sdispls, MPIType<IU>(), recvindbuf, recvcnt, rdispls, MPIType<IU>(), RowWorld);
    MPI_Alltoallv(sendnumbuf, sendcnt, sdispls, MPIType<T_promote>(), recvnumbuf, recvcnt, rdispls, MPIType<T_promote>(), RowWorld);
    DeleteAll(sendindbuf, sendnumbuf);
    DeleteAll(sendcnt, recvcnt, sdispls, rdispls);

    // merge duplicate destinations through a dense scratch array
    IU ysize = y.MyLocLength();
    T_promote * localy = new T_promote[ysize];
    bool * isthere = new bool[ysize];
    std::vector<IU> nzinds;    // nonzero indices
    std::fill_n(isthere, ysize, false);

    for(int i=0; i< totrecv; ++i)
    {
        if(!isthere[recvindbuf[i]])
        {
            localy[recvindbuf[i]] = recvnumbuf[i];    // initial assignment
            nzinds.push_back(recvindbuf[i]);
            isthere[recvindbuf[i]] = true;
        }
        else
        {
            localy[recvindbuf[i]] = SR::add(localy[recvindbuf[i]], recvnumbuf[i]);
        }
    }
    DeleteAll(isthere, recvindbuf, recvnumbuf);
    std::sort(nzinds.begin(), nzinds.end());
    int nnzy = nzinds.size();
    y.ind.resize(nnzy);
    y.num.resize(nnzy);
    for(int i=0; i< nnzy; ++i)
    {
        y.ind[i] = nzinds[i];
        y.num[i] = localy[nzinds[i]];
    }
    delete [] localy;
    return y;
}
/**
 * Elementwise multiply of two distributed matrices. With exclude == false the
 * result is A .* B; with exclude == true it is A .* not(B), i.e. entries of A
 * are kept only where B has no entry.
 */
template <typename IU, typename NU1, typename NU2, typename UDERA, typename UDERB>
SpParMat<IU, typename promote_trait<NU1,NU2>::T_promote, typename promote_trait<UDERA,UDERB>::T_promote> EWiseMult
    (const SpParMat<IU,NU1,UDERA> & A, const SpParMat<IU,NU2,UDERB> & B, bool exclude)
{
    typedef typename promote_trait<NU1,NU2>::T_promote N_promote;
    typedef typename promote_trait<UDERA,UDERB>::T_promote DER_promote;

    if(*(A.commGrid) == *(B.commGrid))
    {
        DER_promote * result = new DER_promote( EWiseMult(*(A.spSeq), *(B.spSeq), exclude) );
        return SpParMat<IU, N_promote, DER_promote>(result, A.commGrid);
    }
    else
    {
        std::cout << "Grids are not comparable, elementwise multiplication fails!" << std::endl;
        MPI_Abort(MPI_COMM_WORLD, GRIDMISMATCH);
        return SpParMat<IU, N_promote, DER_promote>();
    }
}
template <typename RETT, typename RETDER, typename IU, typename NU1, typename NU2, typename UDERA, typename UDERB, typename _BinaryOperation>
SpParMat<IU,RETT,RETDER> EWiseApply
    (const SpParMat<IU,NU1,UDERA> & A, const SpParMat<IU,NU2,UDERB> & B, _BinaryOperation __binary_op, bool notB, const NU2 & defaultBVal)
{
    if(*(A.commGrid) == *(B.commGrid))
    {
        RETDER * result = new RETDER( EWiseApply<RETT>(*(A.spSeq), *(B.spSeq), __binary_op, notB, defaultBVal) );
        return SpParMat<IU, RETT, RETDER>(result, A.commGrid);
    }
    else
    {
        std::cout << "Grids are not comparable, elementwise apply fails!" << std::endl;
        MPI_Abort(MPI_COMM_WORLD, GRIDMISMATCH);
        return SpParMat<IU, RETT, RETDER>();
    }
}

template <typename RETT, typename RETDER, typename IU, typename NU1, typename NU2, typename UDERA, typename UDERB, typename _BinaryOperation, typename _BinaryPredicate>
SpParMat<IU,RETT,RETDER> EWiseApply
    (const SpParMat<IU,NU1,UDERA> & A, const SpParMat<IU,NU2,UDERB> & B, _BinaryOperation __binary_op, _BinaryPredicate do_op,
     bool allowANulls, bool allowBNulls, const NU1 & ANullVal, const NU2 & BNullVal,
     const bool allowIntersect, const bool useExtendedBinOp)
{
    if(*(A.commGrid) == *(B.commGrid))
    {
        RETDER * result = new RETDER( EWiseApply<RETT>(*(A.spSeq), *(B.spSeq), __binary_op, do_op, allowANulls, allowBNulls, ANullVal, BNullVal, allowIntersect) );
        return SpParMat<IU, RETT, RETDER>(result, A.commGrid);
    }
    else
    {
        std::cout << "Grids are not comparable, elementwise apply fails!" << std::endl;
        MPI_Abort(MPI_COMM_WORLD, GRIDMISMATCH);
        return SpParMat<IU, RETT, RETDER>();
    }
}

//! Plain-functor entry point: wraps the user's binary operation and predicate
//! into their extended (null-aware) forms before dispatching.
template <typename RETT, typename RETDER, typename IU, typename NU1, typename NU2, typename UDERA, typename UDERB, typename _BinaryOperation, typename _BinaryPredicate>
SpParMat<IU,RETT,RETDER> EWiseApply
    (const SpParMat<IU,NU1,UDERA> & A, const SpParMat<IU,NU2,UDERB> & B, _BinaryOperation __binary_op, _BinaryPredicate do_op,
     bool allowANulls, bool allowBNulls, const NU1 & ANullVal, const NU2 & BNullVal, const bool allowIntersect = true)
{
    return EWiseApply<RETT, RETDER>(A, B,
            EWiseExtToPlainAdapter<RETT, NU1, NU2, _BinaryOperation>(__binary_op),
            EWiseExtToPlainAdapter<bool, NU1, NU2, _BinaryPredicate>(do_op),
            allowANulls, allowBNulls, ANullVal, BNullVal, allowIntersect, true);
}
/**
 * Elementwise multiply of a sparse vector V with a dense vector W. With
 * exclude == true, keep V's entries where W equals 'zero'; otherwise multiply
 * pointwise wherever W is nonzero.
 */
template <typename IU, typename NU1, typename NU2>
FullyDistSpVec<IU,typename promote_trait<NU1,NU2>::T_promote> EWiseMult
    (const FullyDistSpVec<IU,NU1> & V, const FullyDistVec<IU,NU2> & W, bool exclude, NU2 zero)
{
    typedef typename promote_trait<NU1,NU2>::T_promote T_promote;

    if(*(V.commGrid) == *(W.commGrid))
    {
        FullyDistSpVec< IU, T_promote> Product(V.commGrid);
        if(V.glen != W.glen)
        {
            std::cerr << "Vector dimensions don't match for EWiseMult\n";
            MPI_Abort(MPI_COMM_WORLD, DIMMISMATCH);
        }
        else
        {
            Product.glen = V.glen;
            IU size = V.getlocnnz();
            if(exclude)
            {
#if defined(_OPENMP) && defined(CBLAS_EXPERIMENTAL)    // not faster than the serial loop
                int actual_splits = cblas_splits;
                std::vector <IU> tlosizes(actual_splits, 0);
                std::vector < std::vector<IU> > tlinds(actual_splits);
                std::vector < std::vector<T_promote> > tlnums(actual_splits);
                IU tlsize = size / actual_splits;
#pragma omp parallel for
                for(IU t = 0; t < actual_splits; ++t)
                {
                    IU tlbegin = t*tlsize;
                    IU tlend = (t == actual_splits-1) ? size : (t+1)*tlsize;
                    for(IU i=tlbegin; i<tlend; ++i)
                    {
                        if(W.arr[V.ind[i]] == zero)    // keep only entries where W is zero
                        {
                            tlinds[t].push_back(V.ind[i]);
                            tlnums[t].push_back(V.num[i]);
                            tlosizes[t]++;
                        }
                    }
                }
                std::vector<IU> prefix_sum(actual_splits+1, 0);
                std::partial_sum(tlosizes.begin(), tlosizes.end(), prefix_sum.begin()+1);
                Product.ind.resize(prefix_sum[actual_splits]);
                Product.num.resize(prefix_sum[actual_splits]);
#pragma omp parallel for
                for(IU t=0; t< actual_splits; ++t)
                {
                    std::copy(tlinds[t].begin(), tlinds[t].end(), Product.ind.begin()+prefix_sum[t]);
                    std::copy(tlnums[t].begin(), tlnums[t].end(), Product.num.begin()+prefix_sum[t]);
                }
#else
                for(IU i=0; i<size; ++i)
                {
                    if(W.arr[V.ind[i]] == zero)
                    {
                        Product.ind.push_back(V.ind[i]);
                        Product.num.push_back(V.num[i]);
                    }
                }
#endif
            }
            else
            {
                for(IU i=0; i<size; ++i)
                {
                    if(W.arr[V.ind[i]] != zero)
                    {
                        Product.ind.push_back(V.ind[i]);
                        Product.num.push_back(V.num[i] * W.arr[V.ind[i]]);
                    }
                }
            }
        }
        return Product;
    }
    else
    {
        std::cout << "Grids are not comparable, elementwise multiplication fails!" << std::endl;
        MPI_Abort(MPI_COMM_WORLD, GRIDMISMATCH);
        return FullyDistSpVec< IU, T_promote>();
    }
}
/**
 * Threaded sparse-vector / dense-vector EWiseApply ("short" version). When
 * allowVNulls is set, the dense vector drives the iteration and missing V
 * entries are substituted with Vzero; otherwise only V's nonzeros are visited.
 */
template <typename RET, typename IU, typename NU1, typename NU2, typename _BinaryOperation, typename _BinaryPredicate>
FullyDistSpVec<IU,RET> EWiseApply_threaded
    (const FullyDistSpVec<IU,NU1> & V, const FullyDistVec<IU,NU2> & W, _BinaryOperation _binary_op, _BinaryPredicate _doOp,
     bool allowVNulls, NU1 Vzero, const bool useExtendedBinOp)
{
    typedef RET T_promote;
    if(*(V.commGrid) == *(W.commGrid))
    {
        FullyDistSpVec< IU, T_promote> Product(V.commGrid);
        if(V.TotalLength() != W.TotalLength())
        {
            std::ostringstream outs;
            outs << "Vector dimensions don't match (" << V.TotalLength() << " vs " << W.TotalLength() << ") for EWiseApply (short version)\n";
            SpParHelper::Print(outs.str());
            MPI_Abort(MPI_COMM_WORLD, DIMMISMATCH);
        }
        else
        {
            int nthreads = 1;
#ifdef THREADED
#pragma omp parallel
            {
                nthreads = omp_get_num_threads();
            }
#endif
            Product.glen = V.glen;
            IU size = W.LocArrSize();
            IU spsize = V.getlocnnz();

            // every thread fills its own output buffers; they are stitched afterwards
            std::vector<std::vector<IU>> tProductInd(nthreads);
            std::vector<std::vector<T_promote>> tProductVal(nthreads);
            IU perthread;
            if(allowVNulls)
                perthread = size/nthreads;      // iterate the dense vector
            else
                perthread = spsize/nthreads;    // iterate only V's nonzeros

#ifdef THREADED
#pragma omp parallel
#endif
            {
                int curthread = 0;
#ifdef THREADED
                curthread = omp_get_thread_num();
#endif
                IU tStartIdx = perthread * curthread;
                IU tNextIdx = perthread * (curthread+1);

                if(allowVNulls)
                {
                    if(curthread == nthreads-1)  tNextIdx = size;
                    // locate V's first nonzero within this thread's dense range
                    auto it = std::lower_bound(V.ind.begin(), V.ind.end(), tStartIdx);
                    IU tSpIdx = (IU) std::distance(V.ind.begin(), it);
                    for(IU tIdx=tStartIdx; tIdx < tNextIdx; ++tIdx)
                    {
                        if(tSpIdx < spsize && V.ind[tSpIdx] < tNextIdx && V.ind[tSpIdx] == tIdx)
                        {
                            if (_doOp(V.num[tSpIdx], W.arr[tIdx], false, false))
                            {
                                tProductInd[curthread].push_back(tIdx);
                                tProductVal[curthread].push_back(_binary_op(V.num[tSpIdx], W.arr[tIdx], false, false));
                            }
                            ++tSpIdx;
                        }
                        else    // V has no entry here: substitute Vzero
                        {
                            if (_doOp(Vzero, W.arr[tIdx], true, false))
                            {
                                tProductInd[curthread].push_back(tIdx);
                                tProductVal[curthread].push_back(_binary_op(Vzero, W.arr[tIdx], true, false));
                            }
                        }
                    }
                }
                else    // visit only V's nonzeros
                {
                    if(curthread == nthreads-1)  tNextIdx = spsize;
                    for(IU tSpIdx=tStartIdx; tSpIdx < tNextIdx; ++tSpIdx)
                    {
                        if (_doOp(V.num[tSpIdx], W.arr[V.ind[tSpIdx]], false, false))
                        {
                            tProductInd[curthread].push_back(V.ind[tSpIdx]);
                            tProductVal[curthread].push_back(_binary_op(V.num[tSpIdx], W.arr[V.ind[tSpIdx]], false, false));
                        }
                    }
                }
            }

            // stitch the per-thread outputs at their exclusive-prefix-sum offsets
            std::vector<IU> tdisp(nthreads+1);
            tdisp[0] = 0;
            for(int i=0; i<nthreads; ++i)
            {
                tdisp[i+1] = tdisp[i] + tProductInd[i].size();
            }
            Product.ind.resize(tdisp[nthreads]);
            Product.num.resize(tdisp[nthreads]);

#ifdef THREADED
#pragma omp parallel
#endif
            {
                int curthread = 0;
#ifdef THREADED
                curthread = omp_get_thread_num();
#endif
                std::copy(tProductInd[curthread].begin(), tProductInd[curthread].end(), Product.ind.data() + tdisp[curthread]);
                std::copy(tProductVal[curthread].begin(), tProductVal[curthread].end(), Product.num.data() + tdisp[curthread]);
            }
        }
        return Product;
    }
    else
    {
        std::cout << "Grids are not comparable for EWiseApply" << std::endl;
        MPI_Abort(MPI_COMM_WORLD, GRIDMISMATCH);
        return FullyDistSpVec< IU, T_promote>();
    }
}
/**
 * Sparse-vector / dense-vector EWiseApply ("short" version). Dispatches to the
 * threaded implementation when OpenMP support is compiled in.
 */
template <typename RET, typename IU, typename NU1, typename NU2, typename _BinaryOperation, typename _BinaryPredicate>
FullyDistSpVec<IU,RET> EWiseApply
    (const FullyDistSpVec<IU,NU1> & V, const FullyDistVec<IU,NU2> & W, _BinaryOperation _binary_op, _BinaryPredicate _doOp,
     bool allowVNulls, NU1 Vzero, const bool useExtendedBinOp)
{
#ifdef THREADED
    return EWiseApply_threaded<RET>(V, W, _binary_op, _doOp, allowVNulls, Vzero, useExtendedBinOp);
#else
    typedef RET T_promote;
    if(*(V.commGrid) == *(W.commGrid))
    {
        FullyDistSpVec< IU, T_promote> Product(V.commGrid);
        if(V.TotalLength() != W.TotalLength())
        {
            std::ostringstream outs;
            outs << "Vector dimensions don't match (" << V.TotalLength() << " vs " << W.TotalLength() << ") for EWiseApply (short version)\n";
            SpParHelper::Print(outs.str());
            MPI_Abort(MPI_COMM_WORLD, DIMMISMATCH);
        }
        else
        {
            Product.glen = V.glen;
            IU size = W.LocArrSize();
            IU spsize = V.getlocnnz();
            IU sp_iter = 0;
            if(allowVNulls)
            {
                // iterate the dense vector, substituting Vzero where V has no entry
                for(IU i=0; i<size; ++i)
                {
                    if(sp_iter < spsize && V.ind[sp_iter] == i)
                    {
                        if (_doOp(V.num[sp_iter], W.arr[i], false, false))
                        {
                            Product.ind.push_back(i);
                            Product.num.push_back(_binary_op(V.num[sp_iter], W.arr[i], false, false));
                        }
                        ++sp_iter;
                    }
                    else
                    {
                        if (_doOp(Vzero, W.arr[i], true, false))
                        {
                            Product.ind.push_back(i);
                            Product.num.push_back(_binary_op(Vzero, W.arr[i], true, false));
                        }
                    }
                }
            }
            else
            {
                // visit only V's nonzeros
                for(sp_iter = 0; sp_iter < spsize; ++sp_iter)
                {
                    if (_doOp(V.num[sp_iter], W.arr[V.ind[sp_iter]], false, false))
                    {
                        Product.ind.push_back(V.ind[sp_iter]);
                        Product.num.push_back(_binary_op(V.num[sp_iter], W.arr[V.ind[sp_iter]], false, false));
                    }
                }
            }
        }
        return Product;
    }
    else
    {
        std::cout << "Grids are not comparable for EWiseApply" << std::endl;
        MPI_Abort(MPI_COMM_WORLD, GRIDMISMATCH);
        return FullyDistSpVec< IU, T_promote>();
    }
#endif
}
/**
 * Sparse-vector / sparse-vector EWiseApply ("full" version): a merge-join over
 * the two sorted index lists, with Vzero/Wzero substituted where one side is null.
 */
template <typename RET, typename IU, typename NU1, typename NU2, typename _BinaryOperation, typename _BinaryPredicate>
FullyDistSpVec<IU,RET> EWiseApply
    (const FullyDistSpVec<IU,NU1> & V, const FullyDistSpVec<IU,NU2> & W, _BinaryOperation _binary_op, _BinaryPredicate _doOp,
     bool allowVNulls, bool allowWNulls, NU1 Vzero, NU2 Wzero, const bool allowIntersect, const bool useExtendedBinOp)
{
    typedef RET T_promote;
    if(*(V.commGrid) == *(W.commGrid))
    {
        FullyDistSpVec< IU, T_promote> Product(V.commGrid);
        if(V.glen != W.glen)
        {
            std::ostringstream outs;
            outs << "Vector dimensions don't match (" << V.glen << " vs " << W.glen << ") for EWiseApply (full version)\n";
            SpParHelper::Print(outs.str());
            MPI_Abort(MPI_COMM_WORLD, DIMMISMATCH);
        }
        else
        {
            Product.glen = V.glen;
            typename std::vector< IU  >::const_iterator indV = V.ind.begin();
            typename std::vector< NU1 >::const_iterator numV = V.num.begin();
            typename std::vector< IU  >::const_iterator indW = W.ind.begin();
            typename std::vector< NU2 >::const_iterator numW = W.num.begin();

            while (indV < V.ind.end() && indW < W.ind.end())
            {
                if (*indV == *indW)    // both vectors have this index
                {
                    if (allowIntersect && _doOp(*numV, *numW, false, false))
                    {
                        Product.ind.push_back(*indV);
                        Product.num.push_back(_binary_op(*numV, *numW, false, false));
                    }
                    ++indV; ++numV;
                    ++indW; ++numW;
                }
                else if (*indV < *indW)    // only V has this index
                {
                    if (allowWNulls && _doOp(*numV, Wzero, false, true))
                    {
                        Product.ind.push_back(*indV);
                        Product.num.push_back(_binary_op(*numV, Wzero, false, true));
                    }
                    ++indV; ++numV;
                }
                else    // only W has this index
                {
                    if (allowVNulls && _doOp(Vzero, *numW, true, false))
                    {
                        Product.ind.push_back(*indW);
                        Product.num.push_back(_binary_op(Vzero, *numW, true, false));
                    }
                    ++indW; ++numW;
                }
            }
            while (allowWNulls && indV < V.ind.end())    // W is exhausted
            {
                if (_doOp(*numV, Wzero, false, true))
                {
                    Product.ind.push_back(*indV);
                    Product.num.push_back(_binary_op(*numV, Wzero, false, true));
                }
                ++indV; ++numV;
            }
            while (allowVNulls && indW < W.ind.end())    // V is exhausted
            {
                if (_doOp(Vzero, *numW, true, false))
                {
                    Product.ind.push_back(*indW);
                    Product.num.push_back(_binary_op(Vzero, *numW, true, false));
                }
                ++indW; ++numW;
            }
        }
        return Product;
    }
    else
    {
        std::cout << "Grids are not comparable for EWiseApply" << std::endl;
        MPI_Abort(MPI_COMM_WORLD, GRIDMISMATCH);
        return FullyDistSpVec< IU, T_promote>();
    }
}
//! Plain-functor wrapper for the sparse / dense "short" version.
template <typename RET, typename IU, typename NU1, typename NU2, typename _BinaryOperation, typename _BinaryPredicate>
FullyDistSpVec<IU,RET> EWiseApply
    (const FullyDistSpVec<IU,NU1> & V, const FullyDistVec<IU,NU2> & W, _BinaryOperation _binary_op, _BinaryPredicate _doOp,
     bool allowVNulls, NU1 Vzero)
{
    return EWiseApply<RET>(V, W,
            EWiseExtToPlainAdapter<RET, NU1, NU2, _BinaryOperation>(_binary_op),
            EWiseExtToPlainAdapter<bool, NU1, NU2, _BinaryPredicate>(_doOp),
            allowVNulls, Vzero, true);
}

//! Plain-functor wrapper for the sparse / sparse "full" version.
template <typename RET, typename IU, typename NU1, typename NU2, typename _BinaryOperation, typename _BinaryPredicate>
FullyDistSpVec<IU,RET> EWiseApply
    (const FullyDistSpVec<IU,NU1> & V, const FullyDistSpVec<IU,NU2> & W, _BinaryOperation _binary_op, _BinaryPredicate _doOp,
     bool allowVNulls, bool allowWNulls, NU1 Vzero, NU2 Wzero, const bool allowIntersect = true)
{
    return EWiseApply<RET>(V, W,
            EWiseExtToPlainAdapter<RET, NU1, NU2, _BinaryOperation>(_binary_op),
            EWiseExtToPlainAdapter<bool, NU1, NU2, _BinaryPredicate>(_doOp),
            allowVNulls, allowWNulls, Vzero, Wzero, allowIntersect, true);
}

#endif  // _PAR_FRIENDS_H_