#include <sys/types.h>

// Constructor fragments (original signatures are not present in this excerpt).
template <class IT, class NT, class DER>
    assert( (sizeof(IT) >= sizeof(typename DER::LocalIT)) );
    perror("Input file doesn't exist\n");
    commGrid.reset(new CommGrid(world, 0, 0));
template <class IT, class NT, class DER>
    assert( (sizeof(IT) >= sizeof(typename DER::LocalIT)) );
    commGrid.reset(new CommGrid(world, 0, 0));
template <class IT, class NT, class DER>
    assert( (sizeof(IT) >= sizeof(typename DER::LocalIT)) );
template <class IT, class NT, class DER>
    assert( (sizeof(IT) >= sizeof(typename DER::LocalIT)) );
template <class IT, class NT, class DER>
    SpParHelper::Print("COMBBLAS Warning: It is dangerous to create (matrix) objects without specifying the communicator, are you sure you want to create this object in MPI_COMM_WORLD?\n");
    assert( (sizeof(IT) >= sizeof(typename DER::LocalIT)) );
    commGrid.reset(new CommGrid(MPI_COMM_WORLD, 0, 0));
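
// Usage sketch (illustrative, not part of the original file): constructing the matrix on
// an explicit communicator avoids the MPI_COMM_WORLD warning above. The CommGrid-based
// constructor and type names are assumed from the public CombBLAS API.
//
//   typedef combblas::SpDCCols<int64_t, double> LocalMat;
//   MPI_Comm comm = MPI_COMM_WORLD;                                   // or any sub-communicator
//   std::shared_ptr<combblas::CommGrid> grid(new combblas::CommGrid(comm, 0, 0));
//   combblas::SpParMat<int64_t, double, LocalMat> A(grid);            // explicit grid, no warning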
template <class IT, class NT, class DER>
    assert( (sizeof(IT) >= sizeof(typename DER::LocalIT)) );
    commGrid.reset(new CommGrid(world, 0, 0));
// Destructor / FreeMemory fragments: release the local sequential matrix if allocated.
template <class IT, class NT, class DER>
    if(spSeq != NULL)
        delete spSeq;

template <class IT, class NT, class DER>
    if(spSeq != NULL)
        delete spSeq;
// SpParMat<IT,NT,DER>::TopKGather (name taken from the call sites further below); only
// the trailing part of the parameter list survives in this excerpt.
template <class IT, class NT, class DER>
template <typename VT, typename GIT>
        const std::vector<NT> & activemedians, const std::vector<IT> & activennzperc, int itersuntil,
        std::vector< std::vector<NT> > & localmat, const std::vector<IT> & actcolsmap, std::vector<IT> & klimits,
        std::vector<IT> & toretain, std::vector<std::vector<std::pair<IT,NT>>> & tmppair, IT coffset,
        const FullyDistVec<GIT,VT> & rvec) const

    int rankincol = commGrid->GetRankInProcCol();
    int colneighs = commGrid->GetGridRows();
    int nprocs = commGrid->GetSize();
    std::vector<double> finalWeightedMedians(thischunk, 0.0);

    MPI_Gather(activemedians.data() + itersuntil*chunksize, thischunk, MPIType<NT>(), all_medians.data(), thischunk, MPIType<NT>(), 0, commGrid->GetColWorld());
    MPI_Gather(activennzperc.data() + itersuntil*chunksize, thischunk, MPIType<IT>(), nnz_per_col.data(), thischunk, MPIType<IT>(), 0, commGrid->GetColWorld());

    // Weighted-median computation (presumably executed on the column root, which
    // received the gathers above).
    std::vector<double> columnCounts(thischunk, 0.0);
    std::vector< std::pair<NT, double> > mediansNweights(colneighs);

    for(int j = 0; j < thischunk; ++j)
    {
        for(int k = 0; k < colneighs; ++k)
        {
            size_t fetchindex = k*thischunk+j;
            columnCounts[j] += static_cast<double>(nnz_per_col[fetchindex]);
        }
        for(int k = 0; k < colneighs; ++k)
        {
            size_t fetchindex = k*thischunk+j;
            mediansNweights[k] = std::make_pair(all_medians[fetchindex], static_cast<double>(nnz_per_col[fetchindex]) / columnCounts[j]);
        }
        sort(mediansNweights.begin(), mediansNweights.end());

        double sumofweights = 0;
        int k = 0;    // declaration filled in; the excerpt uses k below without showing it
        while( k < colneighs && sumofweights < 0.5)
            sumofweights += mediansNweights[k++].second;
        finalWeightedMedians[j] = mediansNweights[k-1].first;
    }

    MPI_Bcast(finalWeightedMedians.data(), thischunk, MPIType<double>(), 0, commGrid->GetColWorld());

    std::vector<IT> larger(thischunk, 0);
    std::vector<IT> smaller(thischunk, 0);
    std::vector<IT> equal(thischunk, 0);
#pragma omp parallel for
    for(int j = 0; j < thischunk; ++j)
    {
        size_t fetchindex = actcolsmap[j+itersuntil*chunksize];
        for(size_t k = 0; k < localmat[fetchindex].size(); ++k)
        {
            if(localmat[fetchindex][k] > finalWeightedMedians[j])
                ++larger[j];
            else if(localmat[fetchindex][k] < finalWeightedMedians[j])
                ++smaller[j];
            else
                ++equal[j];    // counter updates filled in; implied by the SUM reductions below
        }
    }
    MPI_Allreduce(MPI_IN_PLACE, larger.data(), thischunk, MPIType<IT>(), MPI_SUM, commGrid->GetColWorld());
    MPI_Allreduce(MPI_IN_PLACE, smaller.data(), thischunk, MPIType<IT>(), MPI_SUM, commGrid->GetColWorld());
    MPI_Allreduce(MPI_IN_PLACE, equal.data(), thischunk, MPIType<IT>(), MPI_SUM, commGrid->GetColWorld());

    omp_lock_t lock[nprocs];
    for (int i=0; i<nprocs; i++)
        omp_init_lock(&(lock[i]));
    int numThreads = 1;    // declaration filled in; set from OpenMP below
#pragma omp parallel
    {
        numThreads = omp_get_num_threads();
    }

    std::vector < std::vector<IT> > perthread2retain(numThreads);

#pragma omp parallel for
    for(int j = 0; j < thischunk; ++j)
    {
        int myThread = omp_get_thread_num();
        size_t clmapindex = j+itersuntil*chunksize;
        size_t fetchindex = actcolsmap[clmapindex];

        if(klimits[clmapindex] <= larger[j])    // the k-th entry lies above the weighted median
        {
            std::vector<NT> survivors;
            for(size_t k = 0; k < localmat[fetchindex].size(); ++k)
            {
                if(localmat[fetchindex][k] > finalWeightedMedians[j])
                    survivors.push_back(localmat[fetchindex][k]);
            }
            localmat[fetchindex].swap(survivors);
            perthread2retain[myThread].push_back(clmapindex);
        }
        else if (klimits[clmapindex] > larger[j] + equal[j])    // the k-th entry lies below the weighted median
        {
            std::vector<NT> survivors;
            for(size_t k = 0; k < localmat[fetchindex].size(); ++k)
            {
                if(localmat[fetchindex][k] < finalWeightedMedians[j])
                    survivors.push_back(localmat[fetchindex][k]);
            }
            localmat[fetchindex].swap(survivors);
            klimits[clmapindex] -= (larger[j] + equal[j]);
            perthread2retain[myThread].push_back(clmapindex);
        }
        else    // converged: the weighted median itself is the k-th value of this column
        {
            std::vector<NT> survivors;
            for(size_t k = 0; k < localmat[fetchindex].size(); ++k)
            {
                if(localmat[fetchindex][k] >= finalWeightedMedians[j])
                    survivors.push_back(localmat[fetchindex][k]);
            }
            localmat[fetchindex].swap(survivors);

            IT n_perproc = getlocalcols() / colneighs;
            int assigned = std::max(static_cast<int>(fetchindex/n_perproc), colneighs-1);
            if( assigned == rankincol)
            {
                IT locid;    // declaration filled in; written by rvec.Owner()
                int owner = rvec.Owner(coffset + fetchindex, locid);
                omp_set_lock(&(lock[owner]));
                tmppair[owner].emplace_back(std::make_pair(locid, finalWeightedMedians[j]));
                omp_unset_lock(&(lock[owner]));
            }
        }
    }
    std::vector<IT> tdisp(numThreads+1);
    for(int i=0; i<numThreads; ++i)
        tdisp[i+1] = tdisp[i] + perthread2retain[i].size();
    toretain.resize(tdisp[numThreads]);

#pragma omp parallel for
    for(int i=0; i< numThreads; i++)
        std::copy(perthread2retain[i].data(), perthread2retain[i].data() + perthread2retain[i].size(), toretain.data() + tdisp[i]);

    for (int i=0; i<nprocs; i++)
        omp_destroy_lock(&(lock[i]));
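
// Illustrative sketch (not from the original source): the weighted-median rule used at
// the top of TopKGather, extracted into a tiny standalone helper. Each contributor
// supplies its local median and a weight proportional to its nonzero share; the selected
// value is the first median at which the cumulative weight reaches 1/2. Names are
// hypothetical.
//
//   #include <algorithm>
//   #include <utility>
//   #include <vector>
//
//   template <typename NT>
//   NT WeightedMedian(std::vector<std::pair<NT, double>> medianWeightPairs)
//   {
//       std::sort(medianWeightPairs.begin(), medianWeightPairs.end());   // sort by value
//       double cumulative = 0.0;
//       for (const auto & mw : medianWeightPairs)
//       {
//           cumulative += mw.second;           // weights are assumed to sum to 1
//           if (cumulative >= 0.5) return mw.first;
//       }
//       return medianWeightPairs.back().first; // fallback for rounding issues
//   }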
// SpParMat<IT,NT,DER>::Kselect2 fragment (per the debug message near the end of this
// function): finds the k-th largest entry of every column via iterative weighted medians.
template <class IT, class NT, class DER>
template <typename VT, typename GIT>

    if(*rvec.commGrid != *commGrid)
    {
        SpParHelper::Print("Grids are not comparable, SpParMat::Kselect() fails!", commGrid->GetWorld());
    }

    int rankincol = commGrid->GetRankInProcCol();
    int rankinrow = commGrid->GetRankInProcRow();
    int rowneighs = commGrid->GetGridCols();
    int colneighs = commGrid->GetGridRows();
    int myrank = commGrid->GetRank();
    int nprocs = commGrid->GetSize();

    Reduce(colcnt, Column, std::plus<IT>(), (IT) 0, [](NT i){ return (IT) 1; });

    int diagneigh = colcnt.commGrid->GetComplementRank();
    MPI_Sendrecv(&xsize, 1, MPI_INT, diagneigh, TRX, &trxsize, 1, MPI_INT, diagneigh, TRX, commGrid->GetWorld(), &status);
    IT * trxnums = new IT[trxsize];
    MPI_Sendrecv(const_cast<IT*>(SpHelper::p2a(colcnt.arr)), xsize, MPIType<IT>(), diagneigh, TRX, trxnums, trxsize, MPIType<IT>(), diagneigh, TRX, commGrid->GetWorld(), &status);

    int * colsize = new int[colneighs];
    colsize[rankincol] = trxsize;
    MPI_Allgather(MPI_IN_PLACE, 1, MPI_INT, colsize, 1, MPI_INT, commGrid->GetColWorld());
    int * dpls = new int[colneighs]();
    std::partial_sum(colsize, colsize+colneighs-1, dpls+1);
    int accsize = std::accumulate(colsize, colsize+colneighs, 0);
    std::vector<IT> percsum(accsize);
    MPI_Allgatherv(trxnums, trxsize, MPIType<IT>(), percsum.data(), colsize, dpls, MPIType<IT>(), commGrid->GetColWorld());    // recv datatype corrected to MPIType<IT>() to match percsum (the excerpt had MPIType<VT>())

    IT locm = getlocalcols();
    std::vector< std::vector<NT> > localmat(locm);

#ifdef COMBBLAS_DEBUG
    if(accsize != locm) std::cout << "Gather vector along columns logic is wrong" << std::endl;
#endif

    for(typename DER::SpColIter colit = spSeq->begcol(); colit != spSeq->endcol(); ++colit)
    {
        if(percsum[colit.colid()] >= k_limit)
        {
            localmat[colit.colid()].reserve(colit.nnz());
            for(typename DER::SpColIter::NzIter nzit = spSeq->begnz(colit); nzit < spSeq->endnz(colit); ++nzit)
                localmat[colit.colid()].push_back(nzit.value());
        }
    }

    int64_t activecols = std::count_if(percsum.begin(), percsum.end(), [k_limit](IT i){ return i >= k_limit; });
    int64_t activennz = std::accumulate(percsum.begin(), percsum.end(), (int64_t) 0);
    int64_t totactcols, totactnnzs;
    MPI_Allreduce(&activecols, &totactcols, 1, MPIType<int64_t>(), MPI_SUM, commGrid->GetRowWorld());
    if(myrank == 0) std::cout << "Number of initial active columns are " << totactcols << std::endl;

    MPI_Allreduce(&activennz, &totactnnzs, 1, MPIType<int64_t>(), MPI_SUM, commGrid->GetRowWorld());
    if(myrank == 0) std::cout << "Number of initial nonzeros are " << totactnnzs << std::endl;

#ifdef COMBBLAS_DEBUG
    IT glactcols = colcnt.Count([k_limit](IT i){ return i >= k_limit; });
    if(myrank == 0) std::cout << "Number of initial active columns are " << glactcols << std::endl;
    if(glactcols != totactcols)
        if(myrank == 0) std::cout << "Wrong number of active columns are computed" << std::endl;
#endif

#ifdef COMBBLAS_DEBUG
    // ... (additional debug output elided in this excerpt)
#endif
    // Early-exit path when k_limit is at least as large as the densest column
    // (surrounding control flow elided in this excerpt):
    std::ostringstream ss;
    ss << "TopK: k_limit (" << k_limit << ")" << " >= maxNnzInColumn. Returning the result of Reduce(Column, minimum<NT>()) instead..." << std::endl;
    SpParHelper::Print(ss.str());
    std::vector<IT> actcolsmap(activecols);
    for (IT i=0, j=0; i < locm; ++i)
    {
        if(percsum[i] >= k_limit)
            actcolsmap[j++] = i;    // body filled in: map active-column slots back to local column ids
    }

    std::vector<NT> all_medians;
    std::vector<IT> nnz_per_col;
    std::vector<IT> klimits(activecols, k_limit);
    int activecols_lowerbound = 10*colneighs;

    IT * locncols = new IT[rowneighs];
    locncols[rankinrow] = locm;
    MPI_Allgather(MPI_IN_PLACE, 0, MPIType<IT>(), locncols, 1, MPIType<IT>(), commGrid->GetRowWorld());
    IT coffset = std::accumulate(locncols, locncols+rankinrow, static_cast<IT>(0));

    MPI_Datatype MPI_pair;
    MPI_Type_contiguous(sizeof(std::pair<IT,NT>), MPI_CHAR, &MPI_pair);
    MPI_Type_commit(&MPI_pair);

    int * sendcnt = new int[nprocs];
    int * recvcnt = new int[nprocs];
    int * sdispls = new int[nprocs]();
    int * rdispls = new int[nprocs]();
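
// Illustrative sketch (not from the original source): the exchange pattern used inside
// the selection loop below, reduced to its essentials. A POD pair is shipped as raw
// bytes via a committed contiguous datatype, and counts/displacements drive
// MPI_Alltoallv. Given MPI_Comm comm and int nprocs; all other names are hypothetical.
//
//   std::vector<std::vector<std::pair<int,double>>> outgoing(nprocs);  // filled per destination
//   std::vector<int> scnt(nprocs), rcnt(nprocs), sdsp(nprocs, 0), rdsp(nprocs, 0);
//   for (int p = 0; p < nprocs; ++p) scnt[p] = static_cast<int>(outgoing[p].size());
//   MPI_Alltoall(scnt.data(), 1, MPI_INT, rcnt.data(), 1, MPI_INT, comm);
//   std::partial_sum(scnt.begin(), scnt.end()-1, sdsp.begin()+1);
//   std::partial_sum(rcnt.begin(), rcnt.end()-1, rdsp.begin()+1);
//
//   MPI_Datatype pairtype;
//   MPI_Type_contiguous(sizeof(std::pair<int,double>), MPI_CHAR, &pairtype);
//   MPI_Type_commit(&pairtype);
//
//   std::vector<std::pair<int,double>> sendbuf(sdsp.back() + scnt.back());
//   for (int p = 0; p < nprocs; ++p)
//       std::copy(outgoing[p].begin(), outgoing[p].end(), sendbuf.begin() + sdsp[p]);
//   std::vector<std::pair<int,double>> recvbuf(rdsp.back() + rcnt.back());
//   MPI_Alltoallv(sendbuf.data(), scnt.data(), sdsp.data(), pairtype,
//                 recvbuf.data(), rcnt.data(), rdsp.data(), pairtype, comm);
//   MPI_Type_free(&pairtype);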
    while(totactcols > 0)
    {
        int chunksize, iterations, lastchunk;
        if(activecols > activecols_lowerbound)
        {
            chunksize = static_cast<int>(activecols/colneighs);
            iterations = std::max(static_cast<int>(activecols/chunksize), 1);
            lastchunk = activecols - (iterations-1)*chunksize;
        }
        else
        {
            chunksize = activecols;
            iterations = 1;    // filled in: a single chunk when there are few active columns
            lastchunk = activecols;
        }
        std::vector<NT> activemedians(activecols);
        std::vector<IT> activennzperc(activecols);

#pragma omp parallel for
        for(IT i=0; i < activecols; ++i)
        {
            size_t orgindex = actcolsmap[i];
            if(localmat[orgindex].empty())
            {
                activemedians[i] = (NT) 0;
                activennzperc[i] = 0;
            }
            else
            {
                auto entriesincol(localmat[orgindex]);    // a copy, so nth_element does not reorder the original
                std::nth_element(entriesincol.begin(), entriesincol.begin() + entriesincol.size()/2, entriesincol.end());
                activemedians[i] = entriesincol[entriesincol.size()/2];
                activennzperc[i] = entriesincol.size();
            }
        }

        percsum.resize(activecols, 0);
        MPI_Allreduce(activennzperc.data(), percsum.data(), activecols, MPIType<IT>(), MPI_SUM, commGrid->GetColWorld());
        activennz = std::accumulate(percsum.begin(), percsum.end(), (int64_t) 0);

#ifdef COMBBLAS_DEBUG
        MPI_Allreduce(&activennz, &totactnnzs, 1, MPIType<int64_t>(), MPI_SUM, commGrid->GetRowWorld());
        if(myrank == 0) std::cout << "Number of active nonzeros are " << totactnnzs << std::endl;
#endif

        std::vector<IT> toretain;
        all_medians.resize(lastchunk*colneighs);
        nnz_per_col.resize(lastchunk*colneighs);
        std::vector< std::vector< std::pair<IT,NT> > > tmppair(nprocs);

        for(int i=0; i < iterations-1; ++i)
        {
            TopKGather(all_medians, nnz_per_col, chunksize, chunksize, activemedians, activennzperc, i, localmat, actcolsmap, klimits, toretain, tmppair, coffset, rvec);
        }
        TopKGather(all_medians, nnz_per_col, lastchunk, chunksize, activemedians, activennzperc, iterations-1, localmat, actcolsmap, klimits, toretain, tmppair, coffset, rvec);
        IT totsend = 0;    // declaration filled in
        for(IT i=0; i<nprocs; ++i)
        {
            sendcnt[i] = tmppair[i].size();
            totsend += tmppair[i].size();
        }
        MPI_Alltoall(sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, commGrid->GetWorld());
        std::partial_sum(sendcnt, sendcnt+nprocs-1, sdispls+1);
        std::partial_sum(recvcnt, recvcnt+nprocs-1, rdispls+1);
        IT totrecv = std::accumulate(recvcnt, recvcnt+nprocs, static_cast<IT>(0));
        assert((totsend < std::numeric_limits<int>::max()));
        assert((totrecv < std::numeric_limits<int>::max()));

        std::pair<IT,NT> * sendpair = new std::pair<IT,NT>[totsend];
        for(int i=0; i<nprocs; ++i)
        {
            std::copy(tmppair[i].begin(), tmppair[i].end(), sendpair+sdispls[i]);
            std::vector< std::pair<IT,NT> >().swap(tmppair[i]);    // release per-destination buffers
        }
        std::vector< std::pair<IT,NT> > recvpair(totrecv);
        MPI_Alltoallv(sendpair, sendcnt, sdispls, MPI_pair, recvpair.data(), recvcnt, rdispls, MPI_pair, commGrid->GetWorld());

        IT updated = 0;    // declaration filled in (used by the debug reduction below)
        for(auto & update : recvpair)
        {
            rvec.arr[update.first] = update.second;
            ++updated;
        }
#ifdef COMBBLAS_DEBUG
        MPI_Allreduce(MPI_IN_PLACE, &updated, 1, MPIType<IT>(), MPI_SUM, commGrid->GetWorld());
        if(myrank == 0) std::cout << "Total vector entries updated " << updated << std::endl;
#endif
        std::vector<IT> newactivecols(toretain.size());
        std::vector<IT> newklimits(toretain.size());
        IT newindex = 0;    // declaration filled in
        for(auto & retind : toretain)
        {
            newactivecols[newindex] = actcolsmap[retind];
            newklimits[newindex++] = klimits[retind];
        }
        actcolsmap.swap(newactivecols);
        klimits.swap(newklimits);
        activecols = actcolsmap.size();

        MPI_Allreduce(&activecols, &totactcols, 1, MPIType<int64_t>(), MPI_SUM, commGrid->GetRowWorld());
#ifdef COMBBLAS_DEBUG
        if(myrank == 0) std::cout << "Number of active columns are " << totactcols << std::endl;
#endif
    }   // end while(totactcols > 0)

    DeleteAll(sendcnt, recvcnt, sdispls, rdispls);
    MPI_Type_free(&MPI_pair);

#ifdef COMBBLAS_DEBUG
    if(myrank == 0) std::cout << "Exiting kselect2" << std::endl;
#endif
// Parallel binary dump fragment: every process writes its local edges as 32-bit
// (row, column) index pairs into a shared file via MPI-IO.
template <class IT, class NT, class DER>

    MPI_Comm World = commGrid->GetWorld();
    int rank = commGrid->GetRank();
    int nprocs = commGrid->GetSize();

    MPI_File thefile;    // declaration filled in
    char * _fn = const_cast<char*>(filename.c_str());
    MPI_File_open(World, _fn, MPI_MODE_CREATE | MPI_MODE_WRONLY, MPI_INFO_NULL, &thefile);

    int rankinrow = commGrid->GetRankInProcRow();
    int rankincol = commGrid->GetRankInProcCol();
    int rowneighs = commGrid->GetGridCols();
    int colneighs = commGrid->GetGridRows();

    IT * colcnts = new IT[rowneighs];
    IT * rowcnts = new IT[colneighs];
    rowcnts[rankincol] = getlocalrows();
    colcnts[rankinrow] = getlocalcols();

    MPI_Allgather(MPI_IN_PLACE, 0, MPIType<IT>(), colcnts, 1, MPIType<IT>(), commGrid->GetRowWorld());
    IT coloffset = std::accumulate(colcnts, colcnts+rankinrow, static_cast<IT>(0));

    MPI_Allgather(MPI_IN_PLACE, 0, MPIType<IT>(), rowcnts, 1, MPIType<IT>(), commGrid->GetColWorld());
    IT rowoffset = std::accumulate(rowcnts, rowcnts+rankincol, static_cast<IT>(0));

    IT * prelens = new IT[nprocs];
    prelens[rank] = 2*getlocalnnz();
    MPI_Allgather(MPI_IN_PLACE, 0, MPIType<IT>(), prelens, 1, MPIType<IT>(), commGrid->GetWorld());
    IT lengthuntil = std::accumulate(prelens, prelens+rank, static_cast<IT>(0));

    MPI_Offset disp = lengthuntil * sizeof(uint32_t);
    char native[] = "native";
    MPI_File_set_view(thefile, disp, MPI_UNSIGNED, MPI_UNSIGNED, native, MPI_INFO_NULL);
    uint32_t * gen_edges = new uint32_t[prelens[rank]];

    IT k = 0;    // declaration filled in
    for(typename DER::SpColIter colit = spSeq->begcol(); colit != spSeq->endcol(); ++colit)
    {
        for(typename DER::SpColIter::NzIter nzit = spSeq->begnz(colit); nzit != spSeq->endnz(colit); ++nzit)
        {
            gen_edges[k++] = (uint32_t) (nzit.rowid() + rowoffset);
            gen_edges[k++] = (uint32_t) (colit.colid() + coloffset);
        }
    }
    assert(k == prelens[rank]);
    MPI_File_write(thefile, gen_edges, prelens[rank], MPI_UNSIGNED, NULL);
    MPI_File_close(&thefile);
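
// Illustrative sketch (not from the original source): the MPI-IO idiom used above,
// reduced to writing one uint32_t block per rank at a rank-specific offset. All names
// are hypothetical.
//
//   void WriteBlock(MPI_Comm comm, const char * path, const std::vector<uint32_t> & local)
//   {
//       int rank;
//       MPI_Comm_rank(comm, &rank);
//
//       // Exclusive prefix sum of local lengths gives each rank its element offset.
//       uint64_t mylen = local.size(), offset = 0;
//       MPI_Exscan(&mylen, &offset, 1, MPI_UINT64_T, MPI_SUM, comm);
//       if (rank == 0) offset = 0;           // MPI_Exscan leaves rank 0's result undefined
//
//       MPI_File fh;
//       MPI_File_open(comm, path, MPI_MODE_CREATE | MPI_MODE_WRONLY, MPI_INFO_NULL, &fh);
//       MPI_File_set_view(fh, offset * sizeof(uint32_t), MPI_UNSIGNED, MPI_UNSIGNED,
//                         "native", MPI_INFO_NULL);
//       MPI_File_write(fh, local.data(), (int) local.size(), MPI_UNSIGNED, MPI_STATUS_IGNORE);
//       MPI_File_close(&fh);
//   }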
// Copy constructor fragment
template <class IT, class NT, class DER>
    if(rhs.spSeq != NULL)
        spSeq = new DER(*(rhs.spSeq));
    commGrid = rhs.commGrid;

// Assignment operator fragment
template <class IT, class NT, class DER>
    if(spSeq != NULL)
        delete spSeq;
    if(rhs.spSeq != NULL)
        spSeq = new DER(*(rhs.spSeq));
    commGrid = rhs.commGrid;

// operator+= fragment
template <class IT, class NT, class DER>
    if(*commGrid == *rhs.commGrid)
    {
        (*spSeq) += (*(rhs.spSeq));
    }
    else
    {
        std::cout << "Grids are not comparable for parallel addition (A+B)" << std::endl;
    }
    // Self-addition is rejected in the full source:
    std::cout << "Missing feature (A+A): Use multiply with 2 instead !" << std::endl;

// LoadImbalance(): ratio of the heaviest processor's nonzero count to the average.
template <class IT, class NT, class DER>
    IT totnnz = getnnz();
    IT maxnnz = 0;    // declaration filled in
    IT localnnz = (IT) spSeq->getnnz();
    MPI_Allreduce( &localnnz, &maxnnz, 1, MPIType<IT>(), MPI_MAX, commGrid->GetWorld());
    if(totnnz == 0) return 1;
    return static_cast<float>((commGrid->GetSize() * maxnnz)) / static_cast<float>(totnnz);
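
// Worked example (illustrative): with p = 4 processes holding 10, 10, 10 and 30 local
// nonzeros, totnnz = 60 and maxnnz = 30, so the imbalance is p*maxnnz/totnnz =
// 4*30/60 = 2.0, i.e. the busiest process carries twice the average load. A value of
// 1.0 would mean a perfectly balanced distribution.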
// getnnz() / getnrow() / getncol() fragments: sum the local counts over the
// appropriate communicator.
template <class IT, class NT, class DER>
    IT totalnnz = 0;    // declaration filled in
    IT localnnz = spSeq->getnnz();
    MPI_Allreduce( &localnnz, &totalnnz, 1, MPIType<IT>(), MPI_SUM, commGrid->GetWorld());

template <class IT, class NT, class DER>
    IT totalrows = 0;    // declaration filled in
    IT localrows = spSeq->getnrow();
    MPI_Allreduce( &localrows, &totalrows, 1, MPIType<IT>(), MPI_SUM, commGrid->GetColWorld());

template <class IT, class NT, class DER>
    IT totalcols = 0;    // declaration filled in
    IT localcols = spSeq->getncol();
    MPI_Allreduce( &localcols, &totalcols, 1, MPIType<IT>(), MPI_SUM, commGrid->GetRowWorld());
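
// Usage sketch for the DimApply fragment that follows (illustrative, not part of the
// original file): scaling every column of A by the reciprocal of its column sum, e.g.
// for normalization. The Dim enum value, Reduce/DimApply signatures and functors are
// assumed from the CombBLAS API.
//
//   FullyDistVec<int64_t, double> colsums(A.getcommgrid());
//   A.Reduce(colsums, Column, std::plus<double>(), 0.0);      // per-column sums
//   colsums.Apply([](double s){ return s == 0.0 ? 1.0 : 1.0/s; });
//   A.DimApply(Column, colsums, std::multiplies<double>());   // A(:,j) *= 1/colsum(j)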
// DimApply fragment: combine every nonzero with the matching entry of a distributed
// vector, either column-wise or row-wise.
template <class IT, class NT, class DER>
template <typename _BinaryOperation>

    if(!(*commGrid == *(x.commGrid)))
    {
        std::cout << "Grids are not comparable for SpParMat::DimApply" << std::endl;
    }

    MPI_Comm World = x.commGrid->GetWorld();
    MPI_Comm ColWorld = x.commGrid->GetColWorld();
    MPI_Comm RowWorld = x.commGrid->GetRowWorld();

    // Column case: transpose-exchange the vector with the diagonal neighbor, then
    // allgather it along the processor column so each process holds the scalers for
    // all of its local columns.
    {
        int diagneigh = x.commGrid->GetComplementRank();
        MPI_Sendrecv(&xsize, 1, MPI_INT, diagneigh, TRX, &trxsize, 1, MPI_INT, diagneigh, TRX, World, &status);

        NT * trxnums = new NT[trxsize];
        MPI_Sendrecv(const_cast<NT*>(SpHelper::p2a(x.arr)), xsize, MPIType<NT>(), diagneigh, TRX, trxnums, trxsize, MPIType<NT>(), diagneigh, TRX, World, &status);

        int colneighs, colrank;
        MPI_Comm_size(ColWorld, &colneighs);
        MPI_Comm_rank(ColWorld, &colrank);
        int * colsize = new int[colneighs];
        colsize[colrank] = trxsize;
        MPI_Allgather(MPI_IN_PLACE, 1, MPI_INT, colsize, 1, MPI_INT, ColWorld);
        int * dpls = new int[colneighs]();
        std::partial_sum(colsize, colsize+colneighs-1, dpls+1);
        int accsize = std::accumulate(colsize, colsize+colneighs, 0);
        NT * scaler = new NT[accsize];
        MPI_Allgatherv(trxnums, trxsize, MPIType<NT>(), scaler, colsize, dpls, MPIType<NT>(), ColWorld);

        for(typename DER::SpColIter colit = spSeq->begcol(); colit != spSeq->endcol(); ++colit)
        {
            for(typename DER::SpColIter::NzIter nzit = spSeq->begnz(colit); nzit != spSeq->endnz(colit); ++nzit)
            {
                nzit.value() = __binary_op(nzit.value(), scaler[colit.colid()]);
            }
        }
    }

    // Row case: the vector is already distributed along processor rows, so an
    // allgather along the row suffices.
    {
        int rowneighs, rowrank;
        MPI_Comm_size(RowWorld, &rowneighs);
        MPI_Comm_rank(RowWorld, &rowrank);
        int * rowsize = new int[rowneighs];
        rowsize[rowrank] = xsize;
        MPI_Allgather(MPI_IN_PLACE, 1, MPI_INT, rowsize, 1, MPI_INT, RowWorld);
        int * dpls = new int[rowneighs]();
        std::partial_sum(rowsize, rowsize+rowneighs-1, dpls+1);
        int accsize = std::accumulate(rowsize, rowsize+rowneighs, 0);
        NT * scaler = new NT[accsize];
        MPI_Allgatherv(const_cast<NT*>(SpHelper::p2a(x.arr)), xsize, MPIType<NT>(), scaler, rowsize, dpls, MPIType<NT>(), RowWorld);

        for(typename DER::SpColIter colit = spSeq->begcol(); colit != spSeq->endcol(); ++colit)
        {
            for(typename DER::SpColIter::NzIter nzit = spSeq->begnz(colit); nzit != spSeq->endnz(colit); ++nzit)
            {
                nzit.value() = __binary_op(nzit.value(), scaler[nzit.rowid()]);
            }
        }
    }

    std::cout << "Unknown scaling dimension, returning..." << std::endl;
// Reduce() wrappers and declarations: dispatch on the reduction dimension and delegate
// to the full implementation that follows.
template <class IT, class NT, class DER>
template <typename _BinaryOperation, typename _UnaryOperation>
    std::cout << "Unknown reduction dimension, returning empty vector" << std::endl;
    Reduce(parvec, dim, __binary_op, id, __unary_op);

template <class IT, class NT, class DER>
template <typename _BinaryOperation>
    std::cout << "Unknown reduction dimension, returning empty vector" << std::endl;

template <class IT, class NT, class DER>
template <typename VT, typename GIT, typename _BinaryOperation>

template <class IT, class NT, class DER>
template <typename VT, typename GIT, typename _BinaryOperation, typename _UnaryOperation>

// Main Reduce implementation
template <class IT, class NT, class DER>
template <typename VT, typename GIT, typename _BinaryOperation, typename _UnaryOperation>
    if(*rvec.commGrid != *commGrid)
    {
        SpParHelper::Print("Grids are not comparable, SpParMat::Reduce() fails!", commGrid->GetWorld());
    }

    // Column reduction: every processor column cooperatively reduces its local columns,
    // then the result is transposed back onto the vector's owners.
    {
        IT n_thiscol = getlocalcols();
        int colneighs = commGrid->GetGridRows();
        int colrank = commGrid->GetRankInProcCol();

        GIT * loclens = new GIT[colneighs];
        GIT * lensums = new GIT[colneighs+1]();

        GIT n_perproc = n_thiscol / colneighs;
        if(colrank == colneighs-1)
            loclens[colrank] = n_thiscol - (n_perproc*colrank);
        else
            loclens[colrank] = n_perproc;

        MPI_Allgather(MPI_IN_PLACE, 0, MPIType<GIT>(), loclens, 1, MPIType<GIT>(), commGrid->GetColWorld());
        std::partial_sum(loclens, loclens+colneighs, lensums+1);

        std::vector<VT> trarr;
        typename DER::SpColIter colit = spSeq->begcol();
        for(int i=0; i < colneighs; ++i)
        {
            VT * sendbuf = new VT[loclens[i]];
            std::fill(sendbuf, sendbuf+loclens[i], id);

            for(; colit != spSeq->endcol() && colit.colid() < lensums[i+1]; ++colit)
            {
                for(typename DER::SpColIter::NzIter nzit = spSeq->begnz(colit); nzit != spSeq->endnz(colit); ++nzit)
                {
                    sendbuf[colit.colid()-lensums[i]] = __binary_op(static_cast<VT>(__unary_op(nzit.value())), sendbuf[colit.colid()-lensums[i]]);
                }
            }

            VT * recvbuf = NULL;    // declaration filled in
            if(i == colrank)        // guard reconstructed: only the root of the reduction needs a receive buffer
            {
                trarr.resize(loclens[i]);
                recvbuf = SpHelper::p2a(trarr);
            }
            MPI_Reduce(sendbuf, recvbuf, loclens[i], MPIType<VT>(), mympiop, i, commGrid->GetColWorld());
            delete [] sendbuf;    // filled in: release the per-target buffer
        }

        GIT trlen = trarr.size();
        int diagneigh = commGrid->GetComplementRank();
        GIT reallen;    // declaration filled in
        MPI_Sendrecv(&trlen, 1, MPIType<IT>(), diagneigh, TRNNZ, &reallen, 1, MPIType<IT>(), diagneigh, TRNNZ, commGrid->GetWorld(), &status);
        rvec.arr.resize(reallen);
        MPI_Sendrecv(SpHelper::p2a(trarr), trlen, MPIType<VT>(), diagneigh, TRX, SpHelper::p2a(rvec.arr), reallen, MPIType<VT>(), diagneigh, TRX, commGrid->GetWorld(), &status);
        rvec.glen = getncol();
    }
    // Row reduction: reduce across processor rows, batching columns to bound memory.
    {
        rvec.glen = getnrow();
        int rowneighs = commGrid->GetGridCols();
        int rowrank = commGrid->GetRankInProcRow();
        GIT * loclens = new GIT[rowneighs];
        GIT * lensums = new GIT[rowneighs+1]();
        loclens[rowrank] = rvec.MyLocLength();
        MPI_Allgather(MPI_IN_PLACE, 0, MPIType<GIT>(), loclens, 1, MPIType<GIT>(), commGrid->GetRowWorld());
        std::partial_sum(loclens, loclens+rowneighs, lensums+1);
        rvec.arr.resize(loclens[rowrank], id);

#define MAXCOLUMNBATCH 5 * 1024 * 1024
        try
        {
            typename DER::SpColIter begfinger = spSeq->begcol();

            int maxreducecalls;    // declaration filled in
            int numreducecalls = (int) ceil(static_cast<float>(spSeq->getnzc()) / static_cast<float>(MAXCOLUMNBATCH));
            MPI_Allreduce( &numreducecalls, &maxreducecalls, 1, MPI_INT, MPI_MAX, commGrid->GetRowWorld());

            for(int k=0; k < maxreducecalls; ++k)
            {
                std::vector<typename DER::SpColIter::NzIter> nziters;
                typename DER::SpColIter curfinger = begfinger;
                for(; curfinger != spSeq->endcol() && nziters.size() < MAXCOLUMNBATCH ; ++curfinger)
                {
                    nziters.push_back(spSeq->begnz(curfinger));
                }
                for(int i=0; i < rowneighs; ++i)
                {
                    VT * sendbuf = new VT[loclens[i]];
                    std::fill(sendbuf, sendbuf+loclens[i], id);

                    typename DER::SpColIter colit = begfinger;
                    IT colcnt = 0;    // declaration filled in: index into nziters
                    for(; colit != curfinger; ++colit, ++colcnt)
                    {
                        typename DER::SpColIter::NzIter nzit = nziters[colcnt];
                        for(; nzit != spSeq->endnz(colit) && nzit.rowid() < lensums[i+1]; ++nzit)
                        {
                            sendbuf[nzit.rowid()-lensums[i]] = __binary_op(static_cast<VT>(__unary_op(nzit.value())), sendbuf[nzit.rowid()-lensums[i]]);
                        }
                        nziters[colcnt] = nzit;    // remember where this column's scan stopped
                    }

                    VT * recvbuf = NULL;    // declaration filled in
                    if(i == rowrank)        // guard reconstructed: only the root folds in previous batches
                    {
                        for(int j=0; j < loclens[i]; ++j)
                        {
                            sendbuf[j] = __binary_op(rvec.arr[j], sendbuf[j]);
                        }
                        recvbuf = SpHelper::p2a(rvec.arr);
                    }
                    MPI_Reduce(sendbuf, recvbuf, loclens[i], MPIType<VT>(), mympiop, i, commGrid->GetRowWorld());
                    delete [] sendbuf;    // filled in
                }
                begfinger = curfinger;
            }
        }
        catch (std::length_error& le)
        {
            std::cerr << "Length error: " << le.what() << std::endl;
        }
    }

    std::cout << "Unknown reduction dimension, returning empty vector" << std::endl;
#ifndef KSELECTLIMIT
#define KSELECTLIMIT 10000
#endif

// Kselect dispatcher (dense result vector): in debug builds the two implementations are
// cross-checked, then Kselect1 or Kselect2 is chosen depending on k_limit.
template <class IT, class NT, class DER>
template <typename VT, typename GIT>
#ifdef COMBBLAS_DEBUG
    Kselect2(test2, k_limit);
    // (comparison of test1 and test2 elided in this excerpt; the two branches print:)
    SpParHelper::Print("Kselect1 and Kselect2 producing same results\n");
    SpParHelper::Print("WARNING: Kselect1 and Kselect2 producing DIFFERENT results\n");
    test1.PrintToFile("test1");
    test2.PrintToFile("test2");
#endif
    if(kselectVersion==1 || k_limit < KSELECTLIMIT)
    {
        // ... (Kselect1 path elided in this excerpt)
    }
    bool ret = Kselect2(kthAll, k_limit);
    // The k-th values are then merged into the output vector (surrounding call elided);
    // only its lambdas survive in the excerpt:
        [](VT spval, VT dval){ return dval; },
        [](VT spval, VT dval){ return true; },

// Kselect dispatcher (sparse result vector)
template <class IT, class NT, class DER>
template <typename VT, typename GIT>
#ifdef COMBBLAS_DEBUG
    Kselect2(test2, k_limit);
    // (comparison elided, as above)
    SpParHelper::Print("Kselect1 and Kselect2 producing same results\n");
    SpParHelper::Print("WARNING: Kselect1 and Kselect2 producing DIFFERENT results\n");
#endif
    if(kselectVersion==1 || k_limit < KSELECTLIMIT)
    {
        // ... (Kselect1 path elided)
    }
    return Kselect2(rvec, k_limit);
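
// Usage sketch (illustrative, not part of the original file): asking for the k-th
// largest value of every column, e.g. as a per-column threshold for sparsification.
// The exact Kselect signature (including the version-selection argument) is assumed.
//
//   FullyDistVec<int64_t, double> kth(A.getcommgrid());
//   A.Kselect(kth, 5, 1);    // kth[j] = 5th largest entry of column j; last arg picks Kselect1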
// Kselect1 fragment (dense output): per-column top-k by merging sorted per-column
// prefixes up a binary tree over the processor column.
template <class IT, class NT, class DER>
template <typename VT, typename GIT, typename _UnaryOperation>

    if(*rvec.commGrid != *commGrid)
    {
        SpParHelper::Print("Grids are not comparable, SpParMat::Kselect() fails!", commGrid->GetWorld());
    }

    Reduce(nnzPerColumn, Column, std::plus<IT>(), (IT)0, [](NT val){ return (IT)1; });
    if(k > maxnnzPerColumn)
    {
        SpParHelper::Print("Kselect: k is greater then maxNnzInColumn. Calling Reduce instead...\n");
    }

    IT n_thiscol = getlocalcols();

    std::vector<VT> sendbuf(n_thiscol*k);
    std::vector<IT> send_coldisp(n_thiscol+1, 0);
    std::vector<IT> local_coldisp(n_thiscol+1, 0);
    // Build column displacements: local_coldisp for the packed local values,
    // send_coldisp for the (at most k per column) values that will be merged upward.
    if(spSeq->getnnz() > 0)
    {
        typename DER::SpColIter colit = spSeq->begcol();
        for(IT i=0; i<n_thiscol; ++i)
        {
            local_coldisp[i+1] = local_coldisp[i];
            send_coldisp[i+1] = send_coldisp[i];
            if(i == colit.colid())
            {
                local_coldisp[i+1] += colit.nnz();
                if(colit.nnz() >= k)          // condition reconstructed: keep at most k values per column
                    send_coldisp[i+1] += k;
                else
                    send_coldisp[i+1] += colit.nnz();
                ++colit;                      // filled in: advance to the next nonempty column
            }
        }
    }
    assert(local_coldisp[n_thiscol] == spSeq->getnnz());

    // Pack the local nonzeros column by column and keep the k largest of each column.
    std::vector<VT> localmat(spSeq->getnnz());

    IT nzc = spSeq->getnzc();    // declaration filled in: number of nonempty columns
#pragma omp parallel for
    for(IT i=0; i<nzc; i++)
    {
        typename DER::SpColIter colit = spSeq->begcol() + i;
        IT colid = colit.colid();
        IT idx = local_coldisp[colid];
        for(typename DER::SpColIter::NzIter nzit = spSeq->begnz(colit); nzit < spSeq->endnz(colit); ++nzit)
        {
            localmat[idx++] = static_cast<VT>(__unary_op(nzit.value()));
        }

        if(colit.nnz() <= k)    // condition reconstructed: short columns are fully sorted
        {
            std::sort(localmat.begin()+local_coldisp[colid], localmat.begin()+local_coldisp[colid+1], std::greater<VT>());
            std::copy(localmat.begin()+local_coldisp[colid], localmat.begin()+local_coldisp[colid+1], sendbuf.begin()+send_coldisp[colid]);
        }
        else
        {
            std::partial_sort(localmat.begin()+local_coldisp[colid], localmat.begin()+local_coldisp[colid]+k, localmat.begin()+local_coldisp[colid+1], std::greater<VT>());
            std::copy(localmat.begin()+local_coldisp[colid], localmat.begin()+local_coldisp[colid]+k, sendbuf.begin()+send_coldisp[colid]);
        }
    }

    std::vector<VT>().swap(localmat);
    std::vector<IT>().swap(local_coldisp);

    std::vector<VT> recvbuf(n_thiscol*k);
    std::vector<VT> tempbuf(n_thiscol*k);
    std::vector<IT> recv_coldisp(n_thiscol+1);
    std::vector<IT> templen(n_thiscol);

    int colneighs = commGrid->GetGridRows();
    int colrank = commGrid->GetRankInProcCol();
    // Pairwise (binary-tree) merge up the processor column: at step p, ranks whose id is
    // an odd multiple of p/2 send their k-per-column buffers to the rank p/2 below them.
    for(int p=2; p <= colneighs; p*=2)
    {
        if(colrank%p == p/2)    // this rank sends and drops out
        {
            int receiver = colrank - ceil(p/2);
            MPI_Send(send_coldisp.data(), n_thiscol+1, MPIType<IT>(), receiver, 0, commGrid->GetColWorld());
            MPI_Send(sendbuf.data(), send_coldisp[n_thiscol], MPIType<VT>(), receiver, 1, commGrid->GetColWorld());
        }
        else if(colrank%p == 0)    // this rank receives and merges
        {
            int sender = colrank + ceil(p/2);
            if(sender < colneighs)
            {
                MPI_Recv(recv_coldisp.data(), n_thiscol+1, MPIType<IT>(), sender, 0, commGrid->GetColWorld(), MPI_STATUS_IGNORE);
                MPI_Recv(recvbuf.data(), recv_coldisp[n_thiscol], MPIType<VT>(), sender, 1, commGrid->GetColWorld(), MPI_STATUS_IGNORE);

#pragma omp parallel for
                for(IT i=0; i<n_thiscol; ++i)
                {
                    // Merge the two sorted runs of column i, keeping at most k values.
                    IT j=send_coldisp[i], l=recv_coldisp[i];
                    IT offset = k*i;    // declaration filled in
                    IT lid = 0;         // declaration filled in
                    for(; j<send_coldisp[i+1] && l<recv_coldisp[i+1] && lid<k;)
                    {
                        if(sendbuf[j] > recvbuf[l])
                            tempbuf[offset+lid++] = sendbuf[j++];
                        else
                            tempbuf[offset+lid++] = recvbuf[l++];
                    }
                    while(j<send_coldisp[i+1] && lid<k) tempbuf[offset+lid++] = sendbuf[j++];
                    while(l<recv_coldisp[i+1] && lid<k) tempbuf[offset+lid++] = recvbuf[l++];
                    templen[i] = lid;    // filled in: merged length of column i
                }

                send_coldisp[0] = 0;
                for(IT i=0; i<n_thiscol; i++)
                {
                    send_coldisp[i+1] = send_coldisp[i] + templen[i];
                }

#pragma omp parallel for
                for(IT i=0; i<n_thiscol; i++)
                {
                    IT offset = k*i;    // declaration filled in
                    std::copy(tempbuf.begin()+offset, tempbuf.begin()+offset+templen[i], sendbuf.begin() + send_coldisp[i]);
                }
            }
        }
    }
    MPI_Barrier(commGrid->GetWorld());
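
// Illustrative sketch (not from the original source): the binary-tree reduction pattern
// used above, shown for a single integer instead of per-column buffers. At step p,
// ranks with rank % p == p/2 send to the rank p/2 below them; ranks with rank % p == 0
// receive and combine, so rank 0 ends up with the result. Given int rank, int nprocs,
// MPI_Comm comm and int local_value; the loop bound is generalized here so that process
// counts that are not powers of two are also covered.
//
//   int value = local_value;
//   for (int p = 2; p / 2 < nprocs; p *= 2)
//   {
//       if (rank % p == p/2)
//       {
//           MPI_Send(&value, 1, MPI_INT, rank - p/2, 0, comm);   // send once, then idle
//       }
//       else if (rank % p == 0 && rank + p/2 < nprocs)
//       {
//           int incoming;
//           MPI_Recv(&incoming, 1, MPI_INT, rank + p/2, 0, comm, MPI_STATUS_IGNORE);
//           value += incoming;   // the real code merges two sorted top-k lists here
//       }
//   }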
    // Rank 0 of each processor column now holds the merged top-k of every local column;
    // extract the k-th entry and move it to the diagonal process of that column.
    std::vector<VT> kthItem(n_thiscol);

    int root = commGrid->GetDiagOfProcCol();
    if(root==0 && colrank==0)    // already on the diagonal
    {
#pragma omp parallel for
        for(IT i=0; i<n_thiscol; i++)
        {
            IT nitems = send_coldisp[i+1]-send_coldisp[i];
            if(nitems >= k)    // condition reconstructed
                kthItem[i] = sendbuf[send_coldisp[i]+k-1];
            else
                kthItem[i] = std::numeric_limits<VT>::min();    // fewer than k entries in this column
        }
    }
    else if(root>0 && colrank==0)    // compute, then ship to the diagonal process
    {
#pragma omp parallel for
        for(IT i=0; i<n_thiscol; i++)
        {
            IT nitems = send_coldisp[i+1]-send_coldisp[i];
            if(nitems >= k)
                kthItem[i] = sendbuf[send_coldisp[i]+k-1];
            else
                kthItem[i] = std::numeric_limits<VT>::min();
        }
        MPI_Send(kthItem.data(), n_thiscol, MPIType<VT>(), root, 0, commGrid->GetColWorld());
    }
    else if(root>0 && colrank==root)    // diagonal process receives
    {
        MPI_Recv(kthItem.data(), n_thiscol, MPIType<VT>(), 0, 0, commGrid->GetColWorld(), MPI_STATUS_IGNORE);
    }

    // Scatter the k-th values from the diagonal across the processor row into rvec
    // (these counts are prepared on the diagonal process; the guard is elided here).
    std::vector<int> sendcnts;
    std::vector<int> dpls;
    int proccols = commGrid->GetGridCols();
    IT n_perproc = n_thiscol / proccols;
    sendcnts.resize(proccols);
    std::fill(sendcnts.data(), sendcnts.data()+proccols-1, n_perproc);
    sendcnts[proccols-1] = n_thiscol - (n_perproc * (proccols-1));
    dpls.resize(proccols, 0);
    std::partial_sum(sendcnts.data(), sendcnts.data()+proccols-1, dpls.data()+1);

    int rowroot = commGrid->GetDiagOfProcRow();
    int recvcnts = 0;    // declaration filled in
    MPI_Scatter(sendcnts.data(), 1, MPI_INT, &recvcnts, 1, MPI_INT, rowroot, commGrid->GetRowWorld());
    rvec.arr.resize(recvcnts);
    MPI_Scatterv(kthItem.data(), sendcnts.data(), dpls.data(), MPIType<VT>(), rvec.arr.data(), rvec.arr.size(), MPIType<VT>(), rowroot, commGrid->GetRowWorld());
    rvec.glen = getncol();
// Kselect1 fragment (sparse output): same per-column top-k machinery, but only the
// columns that appear in the sparse result vector are considered active.
template <class IT, class NT, class DER>
template <typename VT, typename GIT, typename _UnaryOperation>

    if(*rvec.commGrid != *commGrid)
    {
        SpParHelper::Print("Grids are not comparable, SpParMat::Kselect() fails!", commGrid->GetWorld());
    }

    IT n_thiscol = getlocalcols();
    MPI_Comm World = rvec.commGrid->GetWorld();
    MPI_Comm ColWorld = rvec.commGrid->GetColWorld();
    MPI_Comm RowWorld = rvec.commGrid->GetRowWorld();
    int colneighs = commGrid->GetGridRows();
    int colrank = commGrid->GetRankInProcCol();
    int coldiagrank = commGrid->GetDiagOfProcCol();

    int32_t *trxinds, *activeCols;
    VT *trxnums, *numacc;
    TransposeVector(World, rvec, trxlocnz, lenuntil, trxinds, trxnums, true);

    if(rvec.commGrid->GetGridRows() > 1)
    {
        AllGatherVector(ColWorld, trxlocnz, lenuntil, trxinds, trxnums, activeCols, numacc, accnz, true);
    }
    else
    {
        activeCols = trxinds;
    }

    std::vector<bool> isactive(n_thiscol, false);
    for(int i=0; i<accnz ; i++)
    {
        isactive[activeCols[i]] = true;
    }
    IT nActiveCols = accnz;

    std::vector<IT> send_coldisp(n_thiscol+1, 0);
    std::vector<IT> local_coldisp(n_thiscol+1, 0);
    // Raw storage: elements are filled on demand, hence plain operator new here.
    VT * sendbuf = static_cast<VT *> (::operator new (n_thiscol*k*sizeof(VT)));
    if(spSeq->getnnz() > 0)
    {
        typename DER::SpColIter colit = spSeq->begcol();
        for(IT i=0; i<n_thiscol; ++i)
        {
            local_coldisp[i+1] = local_coldisp[i];
            send_coldisp[i+1] = send_coldisp[i];
            if(i == colit.colid())
            {
                // (the inactive-column branch is elided in this excerpt)
                local_coldisp[i+1] += colit.nnz();
                if(colit.nnz() >= k)
                    send_coldisp[i+1] += k;
                else
                    send_coldisp[i+1] += colit.nnz();
                ++colit;    // filled in
            }
        }
    }

    VT * localmat = static_cast<VT *> (::operator new (local_coldisp[n_thiscol]*sizeof(VT)));

    IT nzc = spSeq->getnzc();    // declaration filled in
#pragma omp parallel for
    for(IT i=0; i<nzc; i++)
    {
        typename DER::SpColIter colit = spSeq->begcol() + i;
        IT colid = colit.colid();
        IT idx = local_coldisp[colid];
        for(typename DER::SpColIter::NzIter nzit = spSeq->begnz(colit); nzit < spSeq->endnz(colit); ++nzit)
        {
            localmat[idx++] = static_cast<VT>(__unary_op(nzit.value()));
        }

        if(colit.nnz() <= k)    // condition reconstructed: keep at most k values per column
        {
            sort(localmat+local_coldisp[colid], localmat+local_coldisp[colid+1], std::greater<VT>());
            std::copy(localmat+local_coldisp[colid], localmat+local_coldisp[colid+1], sendbuf+send_coldisp[colid]);
        }
        else
        {
            partial_sort(localmat+local_coldisp[colid], localmat+local_coldisp[colid]+k, localmat+local_coldisp[colid+1], std::greater<VT>());
            std::copy(localmat+local_coldisp[colid], localmat+local_coldisp[colid]+k, sendbuf+send_coldisp[colid]);
        }
    }

    ::operator delete(localmat);
    std::vector<IT>().swap(local_coldisp);

    VT * recvbuf = static_cast<VT *> (::operator new (n_thiscol*k*sizeof(VT)));
    VT * tempbuf = static_cast<VT *> (::operator new (n_thiscol*k*sizeof(VT)));
    std::vector<IT> recv_coldisp(n_thiscol+1);
    std::vector<IT> templen(n_thiscol);
    // Same pairwise merge up the processor column as in the dense Kselect1 above.
    for(int p=2; p <= colneighs; p*=2)
    {
        if(colrank%p == p/2)
        {
            int receiver = colrank - ceil(p/2);
            MPI_Send(send_coldisp.data(), n_thiscol+1, MPIType<IT>(), receiver, 0, commGrid->GetColWorld());
            MPI_Send(sendbuf, send_coldisp[n_thiscol], MPIType<VT>(), receiver, 1, commGrid->GetColWorld());
        }
        else if(colrank%p == 0)
        {
            int sender = colrank + ceil(p/2);
            if(sender < colneighs)
            {
                MPI_Recv(recv_coldisp.data(), n_thiscol+1, MPIType<IT>(), sender, 0, commGrid->GetColWorld(), MPI_STATUS_IGNORE);
                MPI_Recv(recvbuf, recv_coldisp[n_thiscol], MPIType<VT>(), sender, 1, commGrid->GetColWorld(), MPI_STATUS_IGNORE);

#pragma omp parallel for
                for(IT i=0; i<n_thiscol; ++i)
                {
                    IT j=send_coldisp[i], l=recv_coldisp[i];
                    IT offset = k*i;    // declaration filled in
                    IT lid = 0;         // declaration filled in
                    for(; j<send_coldisp[i+1] && l<recv_coldisp[i+1] && lid<k;)
                    {
                        if(sendbuf[j] > recvbuf[l])
                            tempbuf[offset+lid++] = sendbuf[j++];
                        else
                            tempbuf[offset+lid++] = recvbuf[l++];
                    }
                    while(j<send_coldisp[i+1] && lid<k) tempbuf[offset+lid++] = sendbuf[j++];
                    while(l<recv_coldisp[i+1] && lid<k) tempbuf[offset+lid++] = recvbuf[l++];
                    templen[i] = lid;    // filled in
                }

                send_coldisp[0] = 0;
                for(IT i=0; i<n_thiscol; i++)
                {
                    send_coldisp[i+1] = send_coldisp[i] + templen[i];
                }

#pragma omp parallel for
                for(IT i=0; i<n_thiscol; i++)
                {
                    IT offset = k*i;    // declaration filled in
                    std::copy(tempbuf+offset, tempbuf+offset+templen[i], sendbuf + send_coldisp[i]);
                }
            }
        }
    }
    MPI_Barrier(commGrid->GetWorld());
    // Extract the k-th value of every active column, move it to the diagonal process,
    // and scatter it into the sparse result vector across the processor row.
    std::vector<VT> kthItem(nActiveCols);

#pragma omp parallel for
    for(IT i=0; i<nActiveCols; i++)
    {
        IT ai = activeCols[i];
        IT nitems = send_coldisp[ai+1]-send_coldisp[ai];
        if(nitems >= k)              // branch structure reconstructed from the three assignments
            kthItem[i] = sendbuf[send_coldisp[ai]+k-1];
        else if(nitems == 0)
            kthItem[i] = std::numeric_limits<VT>::min();     // no entries at all
        else
            kthItem[i] = sendbuf[send_coldisp[ai+1]-1];      // fewer than k entries: take the smallest kept value
    }

    if(coldiagrank>0 && colrank==0)
    {
        MPI_Send(kthItem.data(), nActiveCols, MPIType<VT>(), coldiagrank, 0, commGrid->GetColWorld());
    }
    else if(coldiagrank>0 && colrank==coldiagrank)
    {
        MPI_Recv(kthItem.data(), nActiveCols, MPIType<VT>(), 0, 0, commGrid->GetColWorld(), MPI_STATUS_IGNORE);
    }

    int rowroot = commGrid->GetDiagOfProcRow();
    int proccols = commGrid->GetGridCols();
    std::vector<int> sendcnts(proccols, 0);
    std::vector<int> dpls(proccols, 0);
    int lsize = rvec.ind.size();
    MPI_Gather(&lsize, 1, MPI_INT, sendcnts.data(), 1, MPI_INT, rowroot, RowWorld);
    std::partial_sum(sendcnts.data(), sendcnts.data()+proccols-1, dpls.data()+1);
    MPI_Scatterv(kthItem.data(), sendcnts.data(), dpls.data(), MPIType<VT>(), rvec.num.data(), rvec.num.size(), MPIType<VT>(), rowroot, RowWorld);

    ::operator delete(sendbuf);
    ::operator delete(recvbuf);
    ::operator delete(tempbuf);
// Bandwidth fragment: per-column distance of the first/last stored row from the
// diagonal, reduced to a global upper/lower bandwidth.
template <class IT, class NT, class DER>

    IT upperlBW = 0, lowerlBW = 0, upperBW = 0;    // declarations filled in
    IT m_perproc = getnrow() / commGrid->GetGridRows();
    IT n_perproc = getncol() / commGrid->GetGridCols();
    IT moffset = commGrid->GetRankInProcCol() * m_perproc;
    IT noffset = commGrid->GetRankInProcRow() * n_perproc;

    for(typename DER::SpColIter colit = spSeq->begcol(); colit != spSeq->endcol(); ++colit)
    {
        IT diagrow = colit.colid() + noffset;
        typename DER::SpColIter::NzIter nzit = spSeq->begnz(colit);
        if(nzit != spSeq->endnz(colit))    // nonempty column
        {
            IT firstrow = nzit.rowid() + moffset;
            IT lastrow = (nzit + colit.nnz()-1).rowid() + moffset;

            if(firstrow <= diagrow)    // upper-triangular part is nonempty
            {
                IT dev = diagrow - firstrow;
                if(upperlBW < dev) upperlBW = dev;
            }
            if(lastrow >= diagrow)     // lower-triangular part is nonempty
            {
                IT dev = lastrow - diagrow;
                if(lowerlBW < dev) lowerlBW = dev;
            }
        }
    }
    MPI_Allreduce( &upperlBW, &upperBW, 1, MPIType<IT>(), MPI_MAX, commGrid->GetWorld());

// Profile fragment: sum over all columns of the distance from the first stored row to
// the diagonal (a skyline/profile measure).
template <class IT, class NT, class DER>

    int colrank = commGrid->GetRankInProcRow();
    IT cols = getncol();
    IT rows = getnrow();
    IT m_perproc = cols / commGrid->GetGridRows();
    IT n_perproc = rows / commGrid->GetGridCols();
    IT moffset = commGrid->GetRankInProcCol() * m_perproc;
    IT noffset = colrank * n_perproc;

    int pc = commGrid->GetGridCols();
    IT n_thisproc;    // declaration filled in
    if(colrank != pc-1) n_thisproc = n_perproc;
    else n_thisproc = cols - (pc-1)*n_perproc;

    std::vector<IT> firstRowInCol(n_thisproc, getnrow());
    std::vector<IT> lastRowInCol(n_thisproc, -1);

    for(typename DER::SpColIter colit = spSeq->begcol(); colit != spSeq->endcol(); ++colit)
    {
        IT diagrow = colit.colid() + noffset;
        typename DER::SpColIter::NzIter nzit = spSeq->begnz(colit);
        if(nzit != spSeq->endnz(colit))
        {
            IT firstrow = nzit.rowid() + moffset;
            IT lastrow = (nzit + colit.nnz()-1).rowid() + moffset;
            if(firstrow <= diagrow)
                firstRowInCol[colit.colid()] = firstrow;
            if(lastrow >= diagrow)
                lastRowInCol[colit.colid()] = lastrow;
        }
    }

    std::vector<IT> firstRowInCol_global(n_thisproc, getnrow());
    MPI_Allreduce( firstRowInCol.data(), firstRowInCol_global.data(), n_thisproc, MPIType<IT>(), MPI_MIN, commGrid->colWorld);

    IT profile = 0;    // declaration filled in
    for(IT i=0; i<n_thisproc; i++)
    {
        if(firstRowInCol_global[i]==rows)
            continue;    // no stored entry above the diagonal in this column
        else
            profile += (i + noffset - firstRowInCol_global[i]);
    }

    IT profile_global = 0;
    MPI_Allreduce( &profile, &profile_global, 1, MPIType<IT>(), MPI_SUM, commGrid->rowWorld);
    return (profile_global);
// MaskedReduce wrapper: checks the dimension (only Column is supported) and forwards to
// the full version below with an identity unary operation.
template <class IT, class NT, class DER>
template <typename VT, typename GIT, typename _BinaryOperation>
    SpParHelper::Print("SpParMat::MaskedReduce() is only implemented for Colum\n");
    MaskedReduce(rvec, mask, dim, __binary_op, id, myidentity<NT>(), exclude);

// Full MaskedReduce: reduce each column while including (or, with exclude == true,
// excluding) only the rows listed in the sparse mask vector.
template <class IT, class NT, class DER>
template <typename VT, typename GIT, typename _BinaryOperation, typename _UnaryOperation>
void SpParMat<IT,NT,DER>::MaskedReduce(FullyDistVec<GIT,VT> & rvec, FullyDistSpVec<GIT,VT> & mask, Dim dim, _BinaryOperation __binary_op, VT id, _UnaryOperation __unary_op, bool exclude) const
{
    MPI_Comm World = commGrid->GetWorld();
    MPI_Comm ColWorld = commGrid->GetColWorld();
    MPI_Comm RowWorld = commGrid->GetRowWorld();

    // (dimension guard elided in this excerpt)
    SpParHelper::Print("SpParMat::MaskedReduce() is only implemented for Colum\n");

    if(*rvec.commGrid != *commGrid)
    {
        SpParHelper::Print("Grids are not comparable, SpParMat::MaskedReduce() fails!", commGrid->GetWorld());
    }

    // Gather the mask indices along the processor row so every process knows the full
    // set of masked rows for its local columns.
    int rowneighs = commGrid->GetGridCols();
    int rowrank = commGrid->GetRankInProcRow();
    std::vector<int> rownz(rowneighs);
    int locnnzMask = static_cast<int> (mask.getlocnnz());
    rownz[rowrank] = locnnzMask;
    MPI_Allgather(MPI_IN_PLACE, 1, MPI_INT, rownz.data(), 1, MPI_INT, RowWorld);
    std::vector<int> dpls(rowneighs+1, 0);
    std::partial_sum(rownz.begin(), rownz.end(), dpls.data()+1);
    int accnz = std::accumulate(rownz.begin(), rownz.end(), 0);
    std::vector<GIT> sendInd(locnnzMask);
    std::transform(mask.ind.begin(), mask.ind.end(), sendInd.begin(), bind2nd(std::plus<GIT>(), mask.RowLenUntil()));

    std::vector<GIT> indMask(accnz);
    MPI_Allgatherv(sendInd.data(), rownz[rowrank], MPIType<GIT>(), indMask.data(), rownz.data(), dpls.data(), MPIType<GIT>(), RowWorld);
    IT n_thiscol = getlocalcols();
    int colneighs = commGrid->GetGridRows();
    int colrank = commGrid->GetRankInProcCol();

    GIT * loclens = new GIT[colneighs];
    GIT * lensums = new GIT[colneighs+1]();

    GIT n_perproc = n_thiscol / colneighs;
    if(colrank == colneighs-1)
        loclens[colrank] = n_thiscol - (n_perproc*colrank);
    else
        loclens[colrank] = n_perproc;

    MPI_Allgather(MPI_IN_PLACE, 0, MPIType<GIT>(), loclens, 1, MPIType<GIT>(), commGrid->GetColWorld());
    std::partial_sum(loclens, loclens+colneighs, lensums+1);

    std::vector<VT> trarr;
    typename DER::SpColIter colit = spSeq->begcol();
    for(int i=0; i < colneighs; ++i)
    {
        VT * sendbuf = new VT[loclens[i]];
        std::fill(sendbuf, sendbuf+loclens[i], id);

        for(; colit != spSeq->endcol() && colit.colid() < lensums[i+1]; ++colit)
        {
            typename DER::SpColIter::NzIter nzit = spSeq->begnz(colit);
            size_t k = 0;    // declaration filled in: cursor into the gathered mask indices
            // Walk the column and the mask together; with exclude, only rows NOT in the
            // mask are folded in, otherwise only rows that ARE in the mask
            // (placement of the exclude checks reconstructed).
            for(; nzit != spSeq->endnz(colit) && k < indMask.size(); )
            {
                if(nzit.rowid() < indMask[k])
                {
                    if(exclude)
                        sendbuf[colit.colid()-lensums[i]] = __binary_op(static_cast<VT>(__unary_op(nzit.value())), sendbuf[colit.colid()-lensums[i]]);
                    ++nzit;
                }
                else if(nzit.rowid() > indMask[k]) ++k;
                else    // row is present in the mask
                {
                    if(!exclude)
                        sendbuf[colit.colid()-lensums[i]] = __binary_op(static_cast<VT>(__unary_op(nzit.value())), sendbuf[colit.colid()-lensums[i]]);
                    ++nzit;
                    ++k;
                }
            }
            if(exclude)    // remaining rows lie past the end of the mask, hence are not masked
            {
                while(nzit != spSeq->endnz(colit))
                {
                    sendbuf[colit.colid()-lensums[i]] = __binary_op(static_cast<VT>(__unary_op(nzit.value())), sendbuf[colit.colid()-lensums[i]]);
                    ++nzit;
                }
            }
        }

        VT * recvbuf = NULL;
        if(i == colrank)
        {
            trarr.resize(loclens[i]);
            recvbuf = SpHelper::p2a(trarr);
        }
        // ... (MPI_Reduce onto recvbuf and buffer cleanup, as in Reduce() above, elided in this excerpt)
    }

    GIT trlen = trarr.size();
    int diagneigh = commGrid->GetComplementRank();
    GIT reallen;    // declaration filled in
    MPI_Sendrecv(&trlen, 1, MPIType<IT>(), diagneigh, TRNNZ, &reallen, 1, MPIType<IT>(), diagneigh, TRNNZ, commGrid->GetWorld(), &status);
    rvec.arr.resize(reallen);
    MPI_Sendrecv(SpHelper::p2a(trarr), trlen, MPIType<VT>(), diagneigh, TRX, SpHelper::p2a(rvec.arr), reallen, MPIType<VT>(), diagneigh, TRX, commGrid->GetWorld(), &status);
    rvec.glen = getncol();
// Type/format conversion fragments: build a new local matrix of the target type from
// the existing one.
template <class IT, class NT, class DER>
template <typename NNT, typename NDER>
    NDER * convert = new NDER(*spSeq);

template <class IT, class NT, class DER>
template <typename NIT, typename NNT, typename NDER>
    NDER * convert = new NDER(*spSeq);

// Submatrix extraction fragment
template <class IT, class NT, class DER>
    DER * tempseq = new DER((*spSeq)(ri, ci));
// Generalized sparse reference A(ri,ci) fragment: implemented as a triple product with
// row- and column-selection (permutation-like) matrices P and Q.
template <class IT, class NT, class DER>
template <typename PTNTBOOL, typename PTBOOLNT>

    if((*(ri.commGrid) != *(commGrid)) || (*(ci.commGrid) != *(commGrid)))
    {
        SpParHelper::Print("Grids are not comparable, SpRef fails !");
    }

    // Safety check: selection indices must be within the matrix dimensions.
    IT locmax_ri = 0, locmax_ci = 0;    // declarations filled in
    if(!ri.arr.empty())
        locmax_ri = *std::max_element(ri.arr.begin(), ri.arr.end());
    if(!ci.arr.empty())
        locmax_ci = *std::max_element(ci.arr.begin(), ci.arr.end());

    IT total_m = getnrow();
    IT total_n = getncol();
    if(locmax_ri > total_m || locmax_ci > total_n)
    {
        // ... (out-of-bounds error handling elided in this excerpt)
    }

    IT roffset = ri.RowLenUntil();
    IT rrowlen = ri.MyRowLength();
    IT coffset = ci.RowLenUntil();
    IT crowlen = ci.MyRowLength();

    IT rowneighs = commGrid->GetGridCols();
    IT totalm = getnrow();
    IT totaln = getncol();
    IT m_perproccol = totalm / rowneighs;
    IT n_perproccol = totaln / rowneighs;

    IT diagneigh = commGrid->GetComplementRank();
    IT mylocalrows = getlocalrows();
    IT mylocalcols = getlocalcols();
    IT trlocalrows;    // declaration filled in
    MPI_Sendrecv(&mylocalrows, 1, MPIType<IT>(), diagneigh, TRROWX, &trlocalrows, 1, MPIType<IT>(), diagneigh, TRROWX, commGrid->GetWorld(), &status);

    // Build the row-selection matrix P from ri: P(i, ri[i]) = 1.
    std::vector< std::vector<IT> > rowid(rowneighs);
    std::vector< std::vector<IT> > colid(rowneighs);

    IT locvec = ri.arr.size();
    for(typename std::vector<IT>::size_type i=0; i < (unsigned)locvec; ++i)
    {
        IT rowrec = (m_perproccol!=0) ? std::min(ri.arr[i] / m_perproccol, rowneighs-1) : (rowneighs-1);
        rowid[rowrec].push_back( i + roffset);
        colid[rowrec].push_back(ri.arr[i] - (rowrec * m_perproccol));
    }

    int * sendcnt = new int[rowneighs];
    int * recvcnt = new int[rowneighs];
    for(IT i=0; i<rowneighs; ++i)
        sendcnt[i] = rowid[i].size();

    MPI_Alltoall(sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, commGrid->GetRowWorld());
    int * sdispls = new int[rowneighs]();
    int * rdispls = new int[rowneighs]();
    std::partial_sum(sendcnt, sendcnt+rowneighs-1, sdispls+1);
    std::partial_sum(recvcnt, recvcnt+rowneighs-1, rdispls+1);
    IT p_nnz = std::accumulate(recvcnt, recvcnt+rowneighs, static_cast<IT>(0));
new IT[p_nnz];
1968 IT * p_cols =
new IT[p_nnz];
1969 IT * senddata =
new IT[locvec];
1970 for(
int i=0; i<rowneighs; ++i)
1972 std::copy(rowid[i].begin(), rowid[i].end(), senddata+sdispls[i]);
1973 std::vector<IT>().swap(rowid[i]);
1975 MPI_Alltoallv(senddata, sendcnt, sdispls, MPIType<IT>(), p_rows, recvcnt, rdispls, MPIType<IT>(), commGrid->GetRowWorld());
1977 for(
int i=0; i<rowneighs; ++i)
1979 std::copy(colid[i].begin(), colid[i].end(), senddata+sdispls[i]);
1980 std::vector<IT>().swap(colid[i]);
1982 MPI_Alltoallv(senddata, sendcnt, sdispls, MPIType<IT>(), p_cols, recvcnt, rdispls, MPIType<IT>(), commGrid->GetRowWorld());
1985 std::tuple<IT,IT,bool> * p_tuples =
new std::tuple<IT,IT,bool>[p_nnz];
1986 for(IT i=0; i< p_nnz; ++i)
1988 p_tuples[i] = std::make_tuple(p_rows[i], p_cols[i], 1);
1992 DER_IT * PSeq =
new DER_IT();
1993 PSeq->Create( p_nnz, rrowlen, trlocalrows, p_tuples);
1998 DeleteAll(sendcnt, recvcnt, sdispls, rdispls);
2000 SpParHelper::Print(
"Symmetric permutation\n", commGrid->GetWorld());
2006 SpParHelper::Print(
"In place multiplication\n", commGrid->GetWorld());
2008 *
this = Mult_AnXBn_DoubleBuff<PTBOOLNT, NT, DER>(P, *
this,
false,
true);
2016 *
this = Mult_AnXBn_DoubleBuff<PTNTBOOL, NT, DER>(*
this, P,
true,
true);
2021 PA = Mult_AnXBn_DoubleBuff<PTBOOLNT, NT, DER>(P,*
this);
2023 return Mult_AnXBn_DoubleBuff<PTNTBOOL, NT, DER>(PA, P);
    // General case: build the column-selection matrix Q from ci and form (P*A)*Q'.
    PA = Mult_AnXBn_DoubleBuff<PTBOOLNT, NT, DER>(P, *this);

    locvec = ci.arr.size();
    for(typename std::vector<IT>::size_type i=0; i < (unsigned)locvec; ++i)
    {
        IT rowrec = (n_perproccol!=0) ? std::min(ci.arr[i] / n_perproccol, rowneighs-1) : (rowneighs-1);
        rowid[rowrec].push_back( i + coffset);
        colid[rowrec].push_back(ci.arr[i] - (rowrec * n_perproccol));
    }

    for(IT i=0; i<rowneighs; ++i)
        sendcnt[i] = rowid[i].size();

    MPI_Alltoall(sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, commGrid->GetRowWorld());
    std::fill(sdispls, sdispls+rowneighs, 0);
    std::fill(rdispls, rdispls+rowneighs, 0);
    std::partial_sum(sendcnt, sendcnt+rowneighs-1, sdispls+1);
    std::partial_sum(recvcnt, recvcnt+rowneighs-1, rdispls+1);
    IT q_nnz = std::accumulate(recvcnt, recvcnt+rowneighs, static_cast<IT>(0));

    IT * q_rows = new IT[q_nnz];
    IT * q_cols = new IT[q_nnz];
    senddata = new IT[locvec];
    for(int i=0; i<rowneighs; ++i)
    {
        std::copy(rowid[i].begin(), rowid[i].end(), senddata+sdispls[i]);
        std::vector<IT>().swap(rowid[i]);
    }
    MPI_Alltoallv(senddata, sendcnt, sdispls, MPIType<IT>(), q_rows, recvcnt, rdispls, MPIType<IT>(), commGrid->GetRowWorld());

    for(int i=0; i<rowneighs; ++i)
    {
        std::copy(colid[i].begin(), colid[i].end(), senddata+sdispls[i]);
        std::vector<IT>().swap(colid[i]);
    }
    MPI_Alltoallv(senddata, sendcnt, sdispls, MPIType<IT>(), q_cols, recvcnt, rdispls, MPIType<IT>(), commGrid->GetRowWorld());
    DeleteAll(senddata, sendcnt, recvcnt, sdispls, rdispls);

    std::tuple<IT,IT,bool> * q_tuples = new std::tuple<IT,IT,bool>[q_nnz];
    for(IT i=0; i< q_nnz; ++i)
    {
        q_tuples[i] = std::make_tuple(q_rows[i], q_cols[i], 1);
    }

    DER_IT * QSeq = new DER_IT();
    QSeq->Create( q_nnz, crowlen, mylocalcols, q_tuples);

    // In-place vs. value-returning variants (surrounding control flow elided):
    *this = Mult_AnXBn_DoubleBuff<PTNTBOOL, NT, DER>(PA, Q, true, true);
    return Mult_AnXBn_DoubleBuff<PTNTBOOL, NT, DER>(PA, Q);
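
// Usage sketch (illustrative, not part of the original file): extracting an induced
// submatrix with the generalized indexing operator, i.e. B = A(ri, ci). Vector contents
// and types are hypothetical; iota() fills 0..n-1 as used elsewhere in this file.
//
//   FullyDistVec<int64_t, int64_t> ri(A.getcommgrid());
//   FullyDistVec<int64_t, int64_t> ci(A.getcommgrid());
//   ri.iota(100, 0);                  // first 100 row indices: 0..99
//   ci.iota(100, 0);                  // first 100 column indices: 0..99
//   auto B = A(ri, ci);               // semantically B = P*A*Q', as built above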
// SpAsgn fragment: A(ri,ci) = B, validated against the dimensions of B.
template <class IT, class NT, class DER>

    if((*(ri.commGrid) != *(B.commGrid)) || (*(ci.commGrid) != *(B.commGrid)))
    {
        SpParHelper::Print("Grids are not comparable, SpAsgn fails !", commGrid->GetWorld());
    }
    IT total_m_A = getnrow();
    IT total_n_A = getncol();

    if(total_m_B != ri.TotalLength())
    {
        SpParHelper::Print("First dimension of B does NOT match the length of ri, SpAsgn fails !", commGrid->GetWorld());
    }
    if(total_n_B != ci.TotalLength())
    {
        SpParHelper::Print("Second dimension of B does NOT match the length of ci, SpAsgn fails !", commGrid->GetWorld());
    }

    rvec->iota(total_m_B, 0);
    qvec->iota(total_n_B, 0);

// Prune (by index set) fragment
template <class IT, class NT, class DER>

    if((*(ri.commGrid) != *(commGrid)) || (*(ci.commGrid) != *(commGrid)))
    {
        SpParHelper::Print("Grids are not comparable, Prune fails!\n", commGrid->GetWorld());
    }

    IT locmax_ri = 0, locmax_ci = 0;    // declarations filled in
    if(!ri.arr.empty())
        locmax_ri = *std::max_element(ri.arr.begin(), ri.arr.end());
    if(!ci.arr.empty())
        locmax_ci = *std::max_element(ci.arr.begin(), ci.arr.end());

    IT total_m = getnrow();
    IT total_n = getncol();
    if(locmax_ri > total_m || locmax_ci > total_n)
    {
        // ... (out-of-bounds error handling elided in this excerpt)
    }
// PruneColumn fragment (dense threshold vector): gather the per-column thresholds
// along the processor column, then prune the local matrix.
template <class IT, class NT, class DER>
template <typename _BinaryOperation>

    MPI_Barrier(MPI_COMM_WORLD);
    if(getncol() != pvals.TotalLength())
    {
        std::ostringstream outs;
        outs << "Can not prune column-by-column, dimensions does not match"<< std::endl;
        outs << getncol() << " != " << pvals.TotalLength() << std::endl;
        SpParHelper::Print(outs.str());
    }
    if(! ( *(getcommgrid()) == *(pvals.getcommgrid())) )
    {
        std::cout << "Grids are not comparable for PurneColumn" << std::endl;
    }

    MPI_Comm World = pvals.commGrid->GetWorld();
    MPI_Comm ColWorld = pvals.commGrid->GetColWorld();

    int diagneigh = pvals.commGrid->GetComplementRank();
    MPI_Sendrecv(&xsize, 1, MPI_INT, diagneigh, TRX, &trxsize, 1, MPI_INT, diagneigh, TRX, World, &status);

    NT * trxnums = new NT[trxsize];
    MPI_Sendrecv(const_cast<NT*>(SpHelper::p2a(pvals.arr)), xsize, MPIType<NT>(), diagneigh, TRX, trxnums, trxsize, MPIType<NT>(), diagneigh, TRX, World, &status);

    int colneighs, colrank;
    MPI_Comm_size(ColWorld, &colneighs);
    MPI_Comm_rank(ColWorld, &colrank);
    int * colsize = new int[colneighs];
    colsize[colrank] = trxsize;
    MPI_Allgather(MPI_IN_PLACE, 1, MPI_INT, colsize, 1, MPI_INT, ColWorld);
    int * dpls = new int[colneighs]();
    std::partial_sum(colsize, colsize+colneighs-1, dpls+1);
    int accsize = std::accumulate(colsize, colsize+colneighs, 0);
    std::vector<NT> numacc(accsize);

#ifdef COMBBLAS_DEBUG
    std::ostringstream outs2;
    outs2 << "PruneColumn displacements: ";
    for(int i=0; i< colneighs; ++i)
    {
        outs2 << dpls[i] << " ";
    }
    SpParHelper::Print(outs2.str());
#endif

    MPI_Allgatherv(trxnums, trxsize, MPIType<NT>(), numacc.data(), colsize, dpls, MPIType<NT>(), ColWorld);
    assert(accsize == getlocalcols());

    if(inPlace)    // guard reconstructed: both calls appear in the excerpt
    {
        spSeq->PruneColumn(numacc.data(), __binary_op, inPlace);
    }
    else
    {
        return SpParMat<IT,NT,DER>(spSeq->PruneColumn(numacc.data(), __binary_op, inPlace), commGrid);
    }
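
// Usage sketch (illustrative, not part of the original file): pruning, in place, every
// entry that falls below 10% of its column's maximum. The predicate semantics of
// PruneColumn (which entries the functor removes) and the inPlace flag are assumed
// from common CombBLAS usage.
//
//   FullyDistVec<int64_t, double> colmax(A.getcommgrid());
//   A.Reduce(colmax, Column, maximum<double>(), 0.0);
//   colmax.Apply([](double m){ return 0.1 * m; });            // per-column threshold
//   A.PruneColumn(colmax, std::less<double>(), true);         // drop a(i,j) with a(i,j) < threshold(j)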
// PruneColumn fragment (sparse threshold vector): only the columns present in pvals are
// pruned; their indices and values are gathered along the processor column first.
template <class IT, class NT, class DER>
template <typename _BinaryOperation>

    MPI_Barrier(MPI_COMM_WORLD);
    if(getncol() != pvals.TotalLength())
    {
        std::ostringstream outs;
        outs << "Can not prune column-by-column, dimensions does not match"<< std::endl;
        outs << getncol() << " != " << pvals.TotalLength() << std::endl;
        SpParHelper::Print(outs.str());
    }
    if(! ( *(getcommgrid()) == *(pvals.getcommgrid())) )
    {
        std::cout << "Grids are not comparable for PurneColumn" << std::endl;
    }

    MPI_Comm World = pvals.commGrid->GetWorld();
    MPI_Comm ColWorld = pvals.commGrid->GetColWorld();
    int diagneigh = pvals.commGrid->GetComplementRank();

    IT roffst = pvals.RowLenUntil();
    IT roffset, trxlocnz;            // declarations filled in
    IT xlocnz = pvals.getlocnnz();   // declaration filled in (local nonzero count of pvals)
    MPI_Sendrecv(&roffst, 1, MPIType<IT>(), diagneigh, TROST, &roffset, 1, MPIType<IT>(), diagneigh, TROST, World, &status);
    MPI_Sendrecv(&xlocnz, 1, MPIType<IT>(), diagneigh, TRNNZ, &trxlocnz, 1, MPIType<IT>(), diagneigh, TRNNZ, World, &status);

    std::vector<IT> trxinds (trxlocnz);
    std::vector<NT> trxnums (trxlocnz);
    MPI_Sendrecv(pvals.ind.data(), xlocnz, MPIType<IT>(), diagneigh, TRI, trxinds.data(), trxlocnz, MPIType<IT>(), diagneigh, TRI, World, &status);
    MPI_Sendrecv(pvals.num.data(), xlocnz, MPIType<NT>(), diagneigh, TRX, trxnums.data(), trxlocnz, MPIType<NT>(), diagneigh, TRX, World, &status);
    std::transform(trxinds.data(), trxinds.data()+trxlocnz, trxinds.data(), std::bind2nd(std::plus<IT>(), roffset));

    int colneighs, colrank;
    MPI_Comm_size(ColWorld, &colneighs);
    MPI_Comm_rank(ColWorld, &colrank);
    int * colnz = new int[colneighs];
    colnz[colrank] = trxlocnz;
    MPI_Allgather(MPI_IN_PLACE, 1, MPI_INT, colnz, 1, MPI_INT, ColWorld);
    int * dpls = new int[colneighs]();
    std::partial_sum(colnz, colnz+colneighs-1, dpls+1);
    IT accnz = std::accumulate(colnz, colnz+colneighs, 0);

    std::vector<IT> indacc(accnz);
    std::vector<NT> numacc(accnz);
    MPI_Allgatherv(trxinds.data(), trxlocnz, MPIType<IT>(), indacc.data(), colnz, dpls, MPIType<IT>(), ColWorld);
    MPI_Allgatherv(trxnums.data(), trxlocnz, MPIType<NT>(), numacc.data(), colnz, dpls, MPIType<NT>(), ColWorld);

    if(inPlace)    // guard reconstructed: both calls appear in the excerpt
    {
        spSeq->PruneColumn(indacc.data(), numacc.data(), __binary_op, inPlace);
    }
    else
    {
        return SpParMat<IT,NT,DER>(spSeq->PruneColumn(indacc.data(), numacc.data(), __binary_op, inPlace), commGrid);
    }
// EWiseMult fragment
template <class IT, class NT, class DER>
    if(*commGrid == *rhs.commGrid)
    {
        spSeq->EWiseMult(*(rhs.spSeq), exclude);
    }
    else
    {
        std::cout << "Grids are not comparable, EWiseMult() fails !" << std::endl;
    }

// EWiseScale fragment
template <class IT, class NT, class DER>
    if(*commGrid == *rhs.commGrid)
    {
        // ... (element-wise scaling elided in this excerpt)
    }
    else
    {
        std::cout << "Grids are not comparable, EWiseScale() fails !" << std::endl;
    }

// UpdateDense fragment
template <class IT, class NT, class DER>
template <typename _BinaryOperation>
    if(*commGrid == *rhs.commGrid)
    {
        if(getlocalrows() == rhs.m && getlocalcols() == rhs.n)
        {
            // ... (dense update elided in this excerpt)
        }
        else
        {
            std::cout << "Matrices have different dimensions, UpdateDense() fails !" << std::endl;
        }
    }
    else
    {
        std::cout << "Grids are not comparable, UpdateDense() fails !" << std::endl;
    }

// PrintInfo fragment
template <class IT, class NT, class DER>
    if (commGrid->myrank == 0)
        std::cout << "As a whole: " << mm << " rows and "<< nn <<" columns and "<< nznz << " nonzeros" << std::endl;

    IT allprocs = commGrid->grrows * commGrid->grcols;
    for(IT i=0; i< allprocs; ++i)
    {
        if (commGrid->myrank == i)
        {
            std::cout << "Processor (" << commGrid->GetRankInProcRow() << "," << commGrid->GetRankInProcCol() << ")'s data: " << std::endl;
        }
        MPI_Barrier(commGrid->GetWorld());
    }

// operator== : the local comparisons are AND-ed across all processes.
template <class IT, class NT, class DER>
    int local = static_cast<int>((*spSeq) == (*(rhs.spSeq)));
    int whole = 1;    // declaration filled in
    MPI_Allreduce( &local, &whole, 1, MPI_INT, MPI_BAND, commGrid->GetWorld());
    return static_cast<bool>(whole);
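
// Illustrative sketch (not from the original source): the agreement-check idiom used by
// operator== above. Each rank contributes a 0/1 flag and a bitwise-AND reduction yields
// 1 only if every rank agreed; MPI_LAND on an int would work equally well.
//
//   bool AllRanksAgree(bool localok, MPI_Comm comm)
//   {
//       int local = localok ? 1 : 0;
//       int global = 0;
//       MPI_Allreduce(&local, &global, 1, MPI_INT, MPI_BAND, comm);
//       return global != 0;
//   }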
2424 template <
class IT,
class NT,
class DER>
2425 template <
typename _BinaryOperation,
typename LIT>
2428 int nprocs = commGrid->GetSize();
2429 int * sendcnt =
new int[nprocs];
2430 int * recvcnt =
new int[nprocs];
2431 for(
int i=0; i<nprocs; ++i)
2432 sendcnt[i] = data[i].
size();
2434 MPI_Alltoall(sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, commGrid->GetWorld());
2435 int * sdispls =
new int[nprocs]();
2436 int * rdispls =
new int[nprocs]();
2437 std::partial_sum(sendcnt, sendcnt+nprocs-1, sdispls+1);
2438 std::partial_sum(recvcnt, recvcnt+nprocs-1, rdispls+1);
2439 IT totrecv = std::accumulate(recvcnt,recvcnt+nprocs, static_cast<IT>(0));
2440 IT totsent = std::accumulate(sendcnt,sendcnt+nprocs, static_cast<IT>(0));
2442 assert((totsent < std::numeric_limits<int>::max()));
2443 assert((totrecv < std::numeric_limits<int>::max()));
2448 commGrid->OpenDebugFile("Displacements", oput);
2449 copy(sdispls, sdispls+nprocs, ostream_iterator<int>(oput, " ")); oput << endl;
2450 copy(rdispls, rdispls+nprocs, ostream_iterator<int>(oput, " ")); oput << endl;
2454 if(commGrid->GetRank() == 0) gsizes = new IT[nprocs];
2455 MPI_Gather(&totrecv, 1, MPIType<IT>(), gsizes, 1, MPIType<IT>(), 0, commGrid->GetWorld());
2456 if(commGrid->GetRank() == 0) { std::copy(gsizes, gsizes+nprocs, std::ostream_iterator<IT>(std::cout, " ")); std::cout << std::endl; }
2457 MPI_Barrier(commGrid->GetWorld());
2458 MPI_Gather(&totsent, 1, MPIType<IT>(), gsizes, 1, MPIType<IT>(), 0, commGrid->GetWorld());
2459 if(commGrid->GetRank() == 0) { copy(gsizes, gsizes+nprocs, ostream_iterator<IT>(cout, " ")); cout << endl; }
2460 MPI_Barrier(commGrid->GetWorld());
2461 if(commGrid->GetRank() == 0) delete [] gsizes;
2464 std::tuple<LIT,LIT,NT> * senddata = new std::tuple<LIT,LIT,NT>[locsize];
2465 for(int i=0; i<nprocs; ++i)
2467     std::copy(data[i].begin(), data[i].end(), senddata+sdispls[i]);
2469     data[i].shrink_to_fit();
2471 MPI_Datatype MPI_triple;
2472 MPI_Type_contiguous(sizeof(std::tuple<LIT,LIT,NT>), MPI_CHAR, &MPI_triple);
2473 MPI_Type_commit(&MPI_triple);
2475 std::tuple<LIT,LIT,NT> * recvdata = new std::tuple<LIT,LIT,NT>[totrecv];
2476 MPI_Alltoallv(senddata, sendcnt, sdispls, MPI_triple, recvdata, recvcnt, rdispls, MPI_triple, commGrid->GetWorld());
2478 DeleteAll(senddata, sendcnt, recvcnt, sdispls, rdispls);
2479 MPI_Type_free(&MPI_triple);
2481 int r = commGrid->GetGridRows();
2482 int s = commGrid->GetGridCols();
2483 IT m_perproc = total_m / r;
2484 IT n_perproc = total_n / s;
2485 int myprocrow = commGrid->GetRankInProcCol();
2486 int myproccol = commGrid->GetRankInProcRow();
2487 IT locrows, loccols;
2488 if(myprocrow != r-1) locrows = m_perproc;
2489 else locrows = total_m - myprocrow * m_perproc;
2490 if(myproccol != s-1) loccols = n_perproc;
2491 else loccols = total_n - myproccol * n_perproc;
2498 spSeq = new DER(A, false);
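// Constructor from three FullyDistVec objects (row indices, column indices, values):
// every local entry is mapped to its owner with Owner(), buffered per destination, and
// handed to SparseCommon; duplicates are combined either with std::plus or with maximum,
// depending on how duplicate resolution was requested.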
2503 template <class IT, class NT, class DER>
2507 if((*(distrows.commGrid) != *(distcols.commGrid)) || (*(distcols.commGrid) != *(distvals.commGrid)))
2509     SpParHelper::Print("Grids are not comparable, Sparse() fails!\n");
2512 if((distrows.TotalLength() != distcols.TotalLength()) || (distcols.TotalLength() != distvals.TotalLength()))
2514     SpParHelper::Print("Vectors have different sizes, Sparse() fails!");
2518 commGrid = distrows.commGrid;
2519 int nprocs = commGrid->GetSize();
2520 std::vector< std::vector < std::tuple<IT,IT,NT> > > data(nprocs);
2523 for(IT i=0; i<locsize; ++i)
2526 int owner = Owner(total_m, total_n, distrows.arr[i], distcols.arr[i], lrow, lcol);
2527 data[owner].push_back(std::make_tuple(lrow,lcol,distvals.arr[i]));
2531 SparseCommon(data, locsize, total_m, total_n, std::plus<NT>());
2535 SparseCommon(data, locsize, total_m, total_n, maximum<NT>());
2541 template <class IT, class NT, class DER>
2545 if((*(distrows.commGrid) != *(distcols.commGrid)) )
2547     SpParHelper::Print("Grids are not comparable, Sparse() fails!\n");
2550 if((distrows.TotalLength() != distcols.TotalLength()) )
2552     SpParHelper::Print("Vectors have different sizes, Sparse() fails!\n");
2555 commGrid = distrows.commGrid;
2556 int nprocs = commGrid->GetSize();
2557 std::vector< std::vector < std::tuple<IT,IT,NT> > > data(nprocs);
2560 for(IT i=0; i<locsize; ++i)
2563 int owner = Owner(total_m, total_n, distrows.arr[i], distcols.arr[i], lrow, lcol);
2564 data[owner].push_back(std::make_tuple(lrow,lcol,val));
2568 SparseCommon(data, locsize, total_m, total_n, std::plus<NT>());
2572 SparseCommon(data, locsize, total_m, total_n, maximum<NT>());
2576 template <class IT, class NT, class DER>
2577 template <class DELIT>
2581 typedef typename DER::LocalIT LIT;
2583 int nprocs = commGrid->GetSize();
2584 int gridrows = commGrid->GetGridRows();
2585 int gridcols = commGrid->GetGridCols();
2586 std::vector< std::vector<LIT> > data(nprocs);
2591 if(sizeof(LIT) < sizeof(DELIT))
2593     std::ostringstream outs;
2594     outs << "Warning: Using smaller indices for the matrix than DistEdgeList\n";
2595     outs << "Local matrices are " << m_perproc << "-by-" << n_perproc << std::endl;
2596 SpParHelper::Print(outs.str(), commGrid->GetWorld());
2602 int64_t perstage = DEL.nedges / stages;
2604 std::vector<LIT> alledges;
2606 for(LIT s=0; s< stages; ++s)
2609 int64_t n_after= ((s==(stages-1))? DEL.nedges : ((s+1)*perstage));
2616 for (int64_t i = n_befor; i < n_after; i++)
2618 int64_t fr = get_v0_from_edge(&(DEL.pedges[i]));
2619 int64_t to = get_v1_from_edge(&(DEL.pedges[i]));
2621 if(fr >= 0 && to >= 0)
2625 data[owner].push_back(lrow);
2626 data[owner].push_back(lcol);
2633 for (int64_t i = n_befor; i < n_after; i++)
2635     if(DEL.edges[2*i+0] >= 0 && DEL.edges[2*i+1] >= 0)
2638         int owner = Owner(DEL.getGlobalV(), DEL.getGlobalV(), DEL.edges[2*i+0], DEL.edges[2*i+1], lrow, lcol);
2639 data[owner].push_back(lrow);
2640 data[owner].push_back(lcol);
2646 LIT * sendbuf = new LIT[2*realedges];
2647 int * sendcnt = new int[nprocs];
2648 int * sdispls = new int[nprocs];
2649 for(int i=0; i<nprocs; ++i)
2650     sendcnt[i] = data[i].size();
2652 int * rdispls = new int[nprocs];
2653 int * recvcnt = new int[nprocs];
2654 MPI_Alltoall(sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT,commGrid->GetWorld());
2658 for(int i=0; i<nprocs-1; ++i)
2660     sdispls[i+1] = sdispls[i] + sendcnt[i];
2661     rdispls[i+1] = rdispls[i] + recvcnt[i];
2663 for(int i=0; i<nprocs; ++i)
2664     std::copy(data[i].begin(), data[i].end(), sendbuf+sdispls[i]);
2667 for(int i=0; i<nprocs; ++i)
2668     std::vector<LIT>().swap(data[i]);
2672 IT thisrecv = std::accumulate(recvcnt,recvcnt+nprocs, static_cast<IT>(0));
2673 LIT * recvbuf = new LIT[thisrecv];
2674 totrecv += thisrecv;
2676 MPI_Alltoallv(sendbuf, sendcnt, sdispls, MPIType<LIT>(), recvbuf, recvcnt, rdispls, MPIType<LIT>(), commGrid->GetWorld());
2677 DeleteAll(sendcnt, recvcnt, sdispls, rdispls,sendbuf);
2678 std::copy (recvbuf,recvbuf+thisrecv,std::back_inserter(alledges));
2682 int myprocrow = commGrid->GetRankInProcCol();
2683 int myproccol = commGrid->GetRankInProcRow();
2684 LIT locrows, loccols;
2685 if(myprocrow != gridrows-1) locrows = m_perproc;
2686 else locrows = DEL.getGlobalV() - myprocrow * m_perproc;
2687 if(myproccol != gridcols-1) loccols = n_perproc;
2688 else loccols = DEL.getGlobalV() - myproccol * n_perproc;
2691 spSeq = new DER(A, false);
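// RemoveLoops: diagonal entries can only live on processes that sit on the grid diagonal,
// so the actual removal happens inside DiagWorld; the number of removed loops is then
// summed over all processes with MPI_Allreduce.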
2694 template <class IT, class NT, class DER>
2697 MPI_Comm DiagWorld = commGrid->GetDiagWorld();
2700 if(DiagWorld != MPI_COMM_NULL)
2705 spSeq = new DER(tuples, false);
2707 MPI_Allreduce( &removed, & totrem, 1, MPIType<IT>(), MPI_SUM, commGrid->GetWorld());
2713 template <class IT, class NT, class DER>
2716 MPI_Comm DiagWorld = commGrid->GetDiagWorld();
2717 if(DiagWorld != MPI_COMM_NULL)
2719 typedef typename DER::LocalIT LIT;
2722 tuples.AddLoops(loopval, replaceExisting);
2724 spSeq = new DER(tuples, false);
2730 template <class IT, class NT, class DER>
2735 if(*loopvals.commGrid != *commGrid)
2737     SpParHelper::Print("Grids are not comparable, SpParMat::AddLoops() fails!\n", commGrid->GetWorld());
2740 if (getncol()!= loopvals.TotalLength())
2742     SpParHelper::Print("The number of entries in loopvals is not equal to the number of diagonal entries.\n");
2748 int rowProcs = commGrid->GetGridCols();
2749 std::vector<int> recvcnt(rowProcs, 0);
2750 std::vector<int> rdpls(rowProcs, 0);
2751 MPI_Gather(&locsize, 1, MPI_INT, recvcnt.data(), 1, MPI_INT, commGrid->GetDiagOfProcRow(), commGrid->GetRowWorld());
2752 std::partial_sum(recvcnt.data(), recvcnt.data()+rowProcs-1, rdpls.data()+1);
2754 IT totrecv = rdpls[rowProcs-1] + recvcnt[rowProcs-1];
2755 assert((totrecv < std::numeric_limits<int>::max()));
2757 std::vector<NT> rowvals(totrecv);
2758 MPI_Gatherv(loopvals.arr.data(), locsize, MPIType<NT>(), rowvals.data(), recvcnt.data(), rdpls.data(),
2759 MPIType<NT>(), commGrid->GetDiagOfProcRow(), commGrid->GetRowWorld());
2762 MPI_Comm DiagWorld = commGrid->GetDiagWorld();
2763 if(DiagWorld != MPI_COMM_NULL)
2765 typedef typename DER::LocalIT LIT;
2768 tuples.AddLoops(rowvals, replaceExisting);
2770 spSeq = new DER(tuples, false);
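// OptimizeForGraph500: precomputes per-neighbor receive-buffer sizes for the SpMV
// communication pattern and stores them in optbuf. The preallocated buffers are
// incompatible with a row-split (multithreaded) local matrix, hence the getnsplit() check.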
2778 template <class IT, class NT, class DER>
2779 template <typename LIT, typename OT>
2782 if(spSeq->getnsplit() > 0)
2784     SpParHelper::Print("Can not declare preallocated buffers for multithreaded execution\n", commGrid->GetWorld());
2788 typedef typename DER::LocalIT LocIT;
2792 LocIT nA = spSeq->getncol();
2794 int p_c = commGrid->GetGridCols();
2795 int p_r = commGrid->GetGridRows();
2797 LocIT rwperproc = mA / p_c;
2798 LocIT cwperproc = nA / p_r;
2801 LocIT * colinds = seq->GetDCSC()->jc;
2802 LocIT locnzc = seq->getnzc();
2804 int * gsizes = NULL;
2807 std::vector<LocIT> pack2send;
2812 for(int pid = 1; pid <= p_r; pid++)
2816 IT offset = dummyRHS.RowLenUntil(pid-1);
2817 int diagneigh = commGrid->GetComplementRank();
2818 MPI_Sendrecv(&offset, 1, MPIType<IT>(), diagneigh, TRTAGNZ, &diagoffset, 1, MPIType<IT>(), diagneigh, TRTAGNZ, commGrid->GetWorld(), &status);
2820 LocIT endind = (pid == p_r)? nA : static_cast<LocIT>(pid) * cwperproc;
2821 while(cci < locnzc && colinds[cci] < endind)
2823     pack2send.push_back(colinds[cci++]-diagoffset);
2825 if(pid-1 == myrank) gsizes = new int[p_r];
2826 MPI_Gather(&mysize, 1, MPI_INT, gsizes, 1, MPI_INT, pid-1, commGrid->GetColWorld());
2829 IT colcnt = std::accumulate(gsizes, gsizes+p_r, static_cast<IT>(0));
2830 recvbuf = new IT[colcnt];
2831 dpls = new IT[p_r]();
2832 std::partial_sum(gsizes, gsizes+p_r-1, dpls+1);
2836 MPI_Gatherv(SpHelper::p2a(pack2send), mysize, MPIType<LocIT>(), recvbuf, gsizes, dpls, MPIType<LocIT>(), pid-1, commGrid->GetColWorld());
2837 std::vector<LocIT>().swap(pack2send);
2841 recveclen = dummyRHS.MyLocLength();
2842 std::vector< std::vector<LocIT> > service(recveclen);
2843 for(int i=0; i< p_r; ++i)
2845     for(int j=0; j< gsizes[i]; ++j)
2847         IT colid2update = recvbuf[dpls[i]+j];
2848         if(service[colid2update].size() < GATHERVNEIGHLIMIT)
2850             service.push_back(i);
2860 std::vector<bool> isthere(mA, false);
2861 std::vector<int> maxlens(p_c,0);
2863 for(typename DER::SpColIter colit = spSeq->begcol(); colit != spSeq->endcol(); ++colit)
2865     for(typename DER::SpColIter::NzIter nzit = spSeq->begnz(colit); nzit != spSeq->endnz(colit); ++nzit)
2867         LocIT rowid = nzit.rowid();
2870         LocIT owner = std::min(nzit.rowid() / rwperproc, (LocIT) p_c-1);
2872         isthere[rowid] = true;
2876 SpParHelper::Print("Optimization buffers set\n", commGrid->GetWorld());
2877 optbuf.Set(maxlens,mA);
2880 template <class IT, class NT, class DER>
2883 spSeq->RowSplit(numsplits);
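// Squaring<SR>: multiplies the matrix with itself over semiring SR using SUMMA-style
// staging: at each stage one process row and one process column broadcast their pieces
// (the second operand is the locally transposed copy so both broadcasts stay inside the
// grid), partial products are collected as SpTuples, and everything is combined with
// MergeAll at the end.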
2891 template <class IT, class NT, class DER>
2892 template <typename SR>
2896 std::shared_ptr<CommGrid> Grid = ProductGrid(commGrid.get(), commGrid.get(), stages, dummy, dummy);
2898 typedef typename DER::LocalIT LIT;
2900 LIT AA_m = spSeq->getnrow();
2901 LIT AA_n = spSeq->getncol();
2903 DER seqTrn = spSeq->TransposeConst();
2905 MPI_Barrier(commGrid->GetWorld());
2907 LIT ** NRecvSizes = SpHelper::allocate2D<LIT>(DER::esscount, stages);
2908 LIT ** TRecvSizes = SpHelper::allocate2D<LIT>(DER::esscount, stages);
2910 SpParHelper::GetSetSizes( *spSeq, NRecvSizes, commGrid->GetRowWorld());
2911 SpParHelper::GetSetSizes( seqTrn, TRecvSizes, commGrid->GetColWorld());
2916 std::vector< SpTuples<LIT,NT> *> tomerge;
2918 int Nself = commGrid->GetRankInProcRow();
2919 int Tself = commGrid->GetRankInProcCol();
2921 for(int i = 0; i < stages; ++i)
2923     std::vector<LIT> ess;
2924     if(i == Nself) NRecv = spSeq;
2927         ess.resize(DER::esscount);
2928         for(int j=0; j< DER::esscount; ++j)
2929             ess[j] = NRecvSizes[j][i];
2933     SpParHelper::BCastMatrix(Grid->GetRowWorld(), *NRecv, ess, i);
2936     if(i == Tself) TRecv = &seqTrn;
2939         ess.resize(DER::esscount);
2940         for(int j=0; j< DER::esscount; ++j)
2941             ess[j] = TRecvSizes[j][i];
2944     SpParHelper::BCastMatrix(Grid->GetColWorld(), *TRecv, ess, i);
2946     SpTuples<LIT,NT> * AA_cont = MultiplyReturnTuples<SR, NT>(*NRecv, *TRecv, false, true);
2947 if(!AA_cont->isZero())
2948 tomerge.push_back(AA_cont);
2950     if(i != Nself) delete NRecv;
2951     if(i != Tself) delete TRecv;
2954 SpHelper::deallocate2D(NRecvSizes, DER::esscount);
2955 SpHelper::deallocate2D(TRecvSizes, DER::esscount);
2958 spSeq = new DER(MergeAll<SR>(tomerge, AA_m, AA_n), false);
2959 for(unsigned int i=0; i<tomerge.size(); ++i)
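// Transpose: each process converts its submatrix to tuples, exchanges dimensions and
// tuple arrays with the process at the complementary (mirrored) grid position through
// pairwise MPI_Sendrecv calls, swaps row and column indices, sorts the received tuples
// column-major, and rebuilds the local submatrix; processes on the grid diagonal only
// need a local transpose.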
2964 template <class IT, class NT, class DER>
2967 if(commGrid->myproccol == commGrid->myprocrow)
2973 typedef typename DER::LocalIT LIT;
2975 LIT locnnz = Atuples.getnnz();
2976 LIT * rows = new LIT[locnnz];
2977 LIT * cols = new LIT[locnnz];
2978 NT * vals = new NT[locnnz];
2979 for(LIT i=0; i < locnnz; ++i)
2985 LIT locm = getlocalcols();
2986 LIT locn = getlocalrows();
2989 LIT remotem, remoten, remotennz;
2990 std::swap(locm,locn);
2991 int diagneigh = commGrid->GetComplementRank();
2994 MPI_Sendrecv(&locnnz, 1, MPIType<LIT>(), diagneigh, TRTAGNZ, &remotennz, 1, MPIType<LIT>(), diagneigh, TRTAGNZ, commGrid->GetWorld(), &status);
2995 MPI_Sendrecv(&locn, 1, MPIType<LIT>(), diagneigh, TRTAGM, &remotem, 1, MPIType<LIT>(), diagneigh, TRTAGM, commGrid->GetWorld(), &status);
2996 MPI_Sendrecv(&locm, 1, MPIType<LIT>(), diagneigh, TRTAGN, &remoten, 1, MPIType<LIT>(), diagneigh, TRTAGN, commGrid->GetWorld(), &status);
2998 LIT * rowsrecv = new LIT[remotennz];
2999 MPI_Sendrecv(rows, locnnz, MPIType<LIT>(), diagneigh, TRTAGROWS, rowsrecv, remotennz, MPIType<LIT>(), diagneigh, TRTAGROWS, commGrid->GetWorld(), &status);
3002 LIT * colsrecv = new LIT[remotennz];
3003 MPI_Sendrecv(cols, locnnz, MPIType<LIT>(), diagneigh, TRTAGCOLS, colsrecv, remotennz, MPIType<LIT>(), diagneigh, TRTAGCOLS, commGrid->GetWorld(), &status);
3006 NT * valsrecv = new NT[remotennz];
3007 MPI_Sendrecv(vals, locnnz, MPIType<NT>(), diagneigh, TRTAGVALS, valsrecv, remotennz, MPIType<NT>(), diagneigh, TRTAGVALS, commGrid->GetWorld(), &status);
3010 std::tuple<LIT,LIT,NT> * arrtuples = new std::tuple<LIT,LIT,NT>[remotennz];
3011 for(LIT i=0; i< remotennz; ++i)
3013 arrtuples[i] = std::make_tuple(rowsrecv[i], colsrecv[i], valsrecv[i]);
3015 DeleteAll(rowsrecv, colsrecv, valsrecv);
3017 sort(arrtuples , arrtuples+remotennz, collexicogcmp );
3020 spSeq->Create( remotennz, remotem, remoten, arrtuples);
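// SaveGathered: writes the whole matrix as a Matrix Market coordinate file. Rank 0 writes
// the banner and size line; then, one process row at a time, the leftmost process gathers
// each local row's (column, value) pairs from its row neighbors with MPI_Gatherv and
// appends them, sorted and converted to 1-based indices, through the user-supplied handler.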
3025 template <class IT, class NT, class DER>
3026 template <class HANDLER>
3029 int proccols = commGrid->GetGridCols();
3030 int procrows = commGrid->GetGridRows();
3031 IT totalm = getnrow();
3032 IT totaln = getncol();
3033 IT totnnz = getnnz();
3036 if(commGrid->GetRank() == 0)
3039 std::stringstream strm;
3040 strm << "%%MatrixMarket matrix coordinate real general" << std::endl;
3041 strm << totalm << " " << totaln << " " << totnnz << std::endl;
3043 out.open(filename.c_str(),std::ios_base::trunc);
3044 flinelen = s.length();
3045 out.write(s.c_str(), flinelen);
3048 int colrank = commGrid->GetRankInProcCol();
3049 int colneighs = commGrid->GetGridRows();
3050 IT * locnrows = new IT[colneighs];
3051 locnrows[colrank] = (IT) getlocalrows();
3052 MPI_Allgather(MPI_IN_PLACE, 0, MPIType<IT>(),locnrows, 1, MPIType<IT>(), commGrid->GetColWorld());
3053 IT roffset = std::accumulate(locnrows, locnrows+colrank, 0);
3056 MPI_Datatype datatype;
3057 MPI_Type_contiguous(sizeof(std::pair<IT,NT>), MPI_CHAR, &datatype);
3058 MPI_Type_commit(&datatype);
3060 for(int i = 0; i < procrows; i++)
3062 if(commGrid->GetRankInProcCol() == i)
3064 IT localrows = spSeq->getnrow();
3065 std::vector< std::vector< std::pair<IT,NT> > > csr(localrows);
3066 if(commGrid->GetRankInProcRow() == 0)
3068 IT localcols = spSeq->getncol();
3069 MPI_Bcast(&localcols, 1, MPIType<IT>(), 0, commGrid->GetRowWorld());
3070 for(typename DER::SpColIter colit = spSeq->begcol(); colit != spSeq->endcol(); ++colit)
3072     for(typename DER::SpColIter::NzIter nzit = spSeq->begnz(colit); nzit != spSeq->endnz(colit); ++nzit)
3074         csr[nzit.rowid()].push_back( std::make_pair(colit.colid(), nzit.value()) );
3081 MPI_Bcast(&n_perproc, 1, MPIType<IT>(), 0, commGrid->GetRowWorld());
3082 IT noffset = commGrid->GetRankInProcRow() * n_perproc;
3083 for(typename DER::SpColIter colit = spSeq->begcol(); colit != spSeq->endcol(); ++colit)
3085     for(typename DER::SpColIter::NzIter nzit = spSeq->begnz(colit); nzit != spSeq->endnz(colit); ++nzit)
3087         csr[nzit.rowid()].push_back( std::make_pair(colit.colid() + noffset, nzit.value()) );
3091 std::pair<IT,NT> * ents = NULL;
3092 int * gsizes = NULL, * dpls = NULL;
3093 if(commGrid->GetRankInProcRow() == 0)
3095 out.open(filename.c_str(),std::ios_base::app);
3096     gsizes = new int[proccols];
3097     dpls = new int[proccols]();
3099 for(int j = 0; j < localrows; ++j)
3102 sort(csr[j].begin(), csr[j].end());
3103 int mysize = csr[j].size();
3104 MPI_Gather(&mysize, 1, MPI_INT, gsizes, 1, MPI_INT, 0, commGrid->GetRowWorld());
3105 if(commGrid->GetRankInProcRow() == 0)
3107 rowcnt = std::accumulate(gsizes, gsizes+proccols, static_cast<IT>(0));
3108 std::partial_sum(gsizes, gsizes+proccols-1, dpls+1);
3109     ents = new std::pair<IT,NT>[rowcnt];
3114 MPI_Gatherv(SpHelper::p2a(csr[j]), mysize, datatype, ents, gsizes, dpls, datatype, 0, commGrid->GetRowWorld());
3115 if(commGrid->GetRankInProcRow() == 0)
3117     for(int k=0; k< rowcnt; ++k)
3122         out << j + roffset + 1 << "\t" << ents[k].first + 1 << "\t";
3125         out << ents[k].first + 1 << "\t" << j + roffset + 1 << "\t";
3126 handler.save(out, ents[k].second, j + roffset, ents[k].first);
3132 if(commGrid->GetRankInProcRow() == 0)
3138 MPI_Barrier(commGrid->GetWorld());
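// First pass over string-keyed tuple input: every process reads its byte range with
// FetchBatch, hashes the string vertex names to owner processes, exchanges the
// (name, hash) pairs with MPI_Alltoallv, deduplicates them in a sorted set, assigns
// global indices with MPI_Exscan, and ships the assigned ids back so that the second
// pass can translate names into numeric matrix indices.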
3145 template <class IT, class NT, class DER>
3149 int myrank = commGrid->GetRank();
3150 int nprocs = commGrid->GetSize();
3152 MPI_Offset fpos, end_fpos;
3154 if (stat(filename.c_str(), &st) == -1)
3156     MPI_Abort(MPI_COMM_WORLD, NOFILE);
3158 int64_t file_size = st.st_size;
3161 std::cout << "File is " << file_size << " bytes" << std::endl;
3163 fpos = myrank * file_size / nprocs;
3165 if(myrank != (nprocs-1)) end_fpos = (myrank + 1) * file_size / nprocs;
3166 else end_fpos = file_size;
3169 MPI_File_open (commGrid->commWorld, const_cast<char*>(filename.c_str()), MPI_MODE_RDONLY, MPI_INFO_NULL, &mpi_fh);
3171 typedef std::map<std::string, uint64_t> KEYMAP;
3172 std::vector< KEYMAP > allkeys(nprocs);
3174 std::vector<std::string> lines;
3175 bool finished = SpParHelper::FetchBatch(mpi_fh, fpos, end_fpos, true, lines, myrank);
3176 int64_t entriesread = lines.size();
3177 SpHelper::ProcessLinesWithStringKeys(allkeys, lines,nprocs);
3181 finished = SpParHelper::FetchBatch(mpi_fh, fpos, end_fpos, false, lines, myrank);
3183 entriesread += lines.size();
3184 SpHelper::ProcessLinesWithStringKeys(allkeys, lines,nprocs);
3187 MPI_Reduce(&entriesread, &allentriesread, 1, MPIType<int64_t>(), MPI_SUM, 0, commGrid->commWorld);
3188 #ifdef COMBBLAS_DEBUG
3190 std::cout << "Initial reading finished. Total number of entries read across all processors is " << allentriesread << std::endl;
3193 int * sendcnt = new int[nprocs];
3194 int * recvcnt = new int[nprocs];
3195 for(int i=0; i<nprocs; ++i)
3196     sendcnt[i] = allkeys[i].size();
3198 MPI_Alltoall(sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, commGrid->GetWorld());
3199 int * sdispls = new int[nprocs]();
3200 int * rdispls = new int[nprocs]();
3201 std::partial_sum(sendcnt, sendcnt+nprocs-1, sdispls+1);
3202 std::partial_sum(recvcnt, recvcnt+nprocs-1, rdispls+1);
3203 totsend = std::accumulate(sendcnt,sendcnt+nprocs, static_cast<IT>(0));
3204 IT totrecv = std::accumulate(recvcnt,recvcnt+nprocs, static_cast<IT>(0));
3206 assert((totsend < std::numeric_limits<int>::max()));
3207 assert((totrecv < std::numeric_limits<int>::max()));
3212 senddata = new TYPE2SEND[totsend];
3214 #pragma omp parallel for
3215 for(int i=0; i<nprocs; ++i)
3218     for(auto pobj:allkeys[i])
3221         std::array<char, MAXVERTNAME> vname;
3222         std::copy( pobj.first.begin(), pobj.first.end(), vname.begin() );
3223         if(pobj.first.length() < MAXVERTNAME) vname[pobj.first.length()] = '\0';
3225         senddata[sdispls[i]+j] = TYPE2SEND(vname, pobj.second);
3231 MPI_Datatype MPI_HASH;
3232 MPI_Type_contiguous(sizeof(TYPE2SEND), MPI_CHAR, &MPI_HASH);
3233 MPI_Type_commit(&MPI_HASH);
3235 TYPE2SEND * recvdata = new TYPE2SEND[totrecv];
3236 MPI_Alltoallv(senddata, sendcnt, sdispls, MPI_HASH, recvdata, recvcnt, rdispls, MPI_HASH, commGrid->GetWorld());
3239 std::set< std::pair<uint64_t, std::string> > uniqsorted;
3240 for(IT i=0; i< totrecv; ++i)
3242 auto locnull = std::find(recvdata[i].first.begin(), recvdata[i].first.end(), '\0');
3243 std::string strtmp(recvdata[i].first.begin(), locnull);
3245 uniqsorted.insert(std::make_pair(recvdata[i].second, strtmp));
3247 uint64_t uniqsize = uniqsorted.size();
3249 #ifdef COMBBLAS_DEBUG
3251 std::cout << "out of " << totrecv << " vertices received, " << uniqsize << " were unique" << std::endl;
3253 uint64_t sizeuntil = 0;
3255 MPI_Exscan( &uniqsize, &sizeuntil, 1, MPIType<uint64_t>(), MPI_SUM, commGrid->GetWorld() );
3256 MPI_Allreduce(&uniqsize, &totallength, 1, MPIType<uint64_t>(), MPI_SUM, commGrid->GetWorld());
3257 if(myrank == 0) sizeuntil = 0;
3263 uint64_t locindex = 0;
3264 std::vector< std::vector< IT > > locs_send(nprocs);
3265 std::vector< std::vector< std::string > > data_send(nprocs);
3266 int * map_scnt = new int[nprocs]();
3267 for(auto itr = uniqsorted.begin(); itr != uniqsorted.end(); ++itr)
3269 uint64_t globalindex = sizeuntil + locindex;
3270 invindex.insert(std::make_pair(itr->second, globalindex));
3273 int owner = distmapper.Owner(globalindex, newlocid);
3278 locs_send[owner].push_back(newlocid);
3279 data_send[owner].push_back(itr->second);
3287 SpParHelper::ReDistributeToVector(map_scnt, locs_send, data_send, distmapper.arr, commGrid->GetWorld());
3290 for(IT i=0; i< totrecv; ++i)
3292 auto locnull = std::find(recvdata[i].first.begin(), recvdata[i].first.end(), '\0');
3293 std::string searchstr(recvdata[i].first.begin(), locnull);
3295 auto resp = invindex.find(searchstr);
3296 if (resp != invindex.end())
3298 recvdata[i].second = resp->second;
3301 std::cout << "Assertion failed at proc " << myrank << ": the absence of the entry in invindex is unexpected!!!" << std::endl;
3303 MPI_Alltoallv(recvdata, recvcnt, rdispls, MPI_HASH, senddata, sendcnt, sdispls, MPI_HASH, commGrid->GetWorld());
3304 DeleteAll(recvdata, sendcnt, recvcnt, sdispls, rdispls);
3305 MPI_Type_free(&MPI_HASH);
3317 template <class IT, class NT, class DER>
3318 template <typename _BinaryOperation>
3321 int myrank = commGrid->GetRank();
3322 int nprocs = commGrid->GetSize();
3323 TYPE2SEND * senddata;
3325 uint64_t totallength;
3328 MPI_File mpi_fh = TupleRead1stPassNExchange(filename, senddata, totsend, distmapper, totallength);
3330 typedef std::map<std::string, uint64_t> KEYMAP;
3331 KEYMAP ultimateperm;
3332 for(IT i=0; i< totsend; ++i)
3334 auto locnull = std::find(senddata[i].first.begin(), senddata[i].first.end(), '\0');
3336 std::string searchstr(senddata[i].first.begin(), locnull);
3337 auto ret = ultimateperm.emplace(std::make_pair(searchstr, senddata[i].second));
3341 std::cout << "the duplication in ultimateperm is unexpected!!!" << std::endl;
3347 MPI_Offset fpos, end_fpos;
3349 if (stat(filename.c_str(), &st) == -1)
3351     MPI_Abort(MPI_COMM_WORLD, NOFILE);
3353 int64_t file_size = st.st_size;
3355 fpos = myrank * file_size / nprocs;
3357 if(myrank != (nprocs-1)) end_fpos = (myrank + 1) * file_size / nprocs;
3358 else end_fpos = file_size;
3360 typedef typename DER::LocalIT LIT;
3361 std::vector<LIT> rows;
3362 std::vector<LIT> cols;
3363 std::vector<NT> vals;
3365 std::vector<std::string> lines;
3366 bool finished = SpParHelper::FetchBatch(mpi_fh, fpos, end_fpos, true, lines, myrank);
3367 int64_t entriesread = lines.size();
3369 SpHelper::ProcessStrLinesNPermute(rows, cols, vals, lines, ultimateperm);
3373 finished = SpParHelper::FetchBatch(mpi_fh, fpos, end_fpos, false, lines, myrank);
3374 entriesread += lines.size();
3375 SpHelper::ProcessStrLinesNPermute(rows, cols, vals, lines, ultimateperm);
3378 MPI_Reduce(&entriesread, &allentriesread, 1, MPIType<int64_t>(), MPI_SUM, 0, commGrid->commWorld);
3379 #ifdef COMBBLAS_DEBUG
3381 std::cout << "Second reading finished. Total number of entries read across all processors is " << allentriesread << std::endl;
3384 MPI_File_close(&mpi_fh);
3385 std::vector< std::vector < std::tuple<LIT,LIT,NT> > > data(nprocs);
3387 LIT locsize = rows.size();
3388 for(LIT i=0; i<locsize; ++i)
3391 int owner = Owner(totallength, totallength, rows[i], cols[i], lrow, lcol);
3392 data[owner].push_back(std::make_tuple(lrow,lcol,vals[i]));
3394 std::vector<LIT>().swap(rows);
3395 std::vector<LIT>().swap(cols);
3396 std::vector<NT>().swap(vals);
3398 #ifdef COMBBLAS_DEBUG
3400 std::cout << "Packing to recipients finished, about to send..." << std::endl;
3403 if(spSeq) delete spSeq;
3404 SparseCommon(data, locsize, totallength, totallength, BinOp);
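// ParallelReadMM: rank 0 parses the Matrix Market banner and size line and broadcasts the
// field type and symmetry flag together with the header offset; every process then reads
// its own byte range with FetchBatch, converts the lines into (row, column, value) triples,
// routes each triple to its owner, and finishes through SparseCommon with BinOp resolving
// duplicates.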
3415 template <class IT, class NT, class DER>
3416 template <typename _BinaryOperation>
3420 int32_t symmetric = 0;
3421 int64_t nrows, ncols, nonzeros;
3425 int myrank = commGrid->GetRank();
3426 int nprocs = commGrid->GetSize();
3430 if ((f = fopen(filename.c_str(), "r")) == NULL)
3432     printf("COMBBLAS: Matrix-market file %s can not be found\n", filename.c_str());
3433     MPI_Abort(MPI_COMM_WORLD, NOFILE);
3437 printf("Could not process Matrix Market banner.\n");
3444 printf("Sorry, this application does not support complex types");
3449 std::cout << "Matrix is Float" << std::endl;
3454 std::cout << "Matrix is Integer" << std::endl;
3459 std::cout << "Matrix is Boolean" << std::endl;
3464 std::cout << "Matrix is symmetric" << std::endl;
3471 std::cout << "Total number of nonzeros expected across all processors is " << nonzeros << std::endl;
3474 MPI_Bcast(&type, 1, MPI_INT, 0, commGrid->commWorld);
3475 MPI_Bcast(&symmetric, 1, MPI_INT, 0, commGrid->commWorld);
3482 if (stat(filename.c_str(), &st) == -1)
3484     MPI_Abort(MPI_COMM_WORLD, NOFILE);
3486 int64_t file_size = st.st_size;
3487 MPI_Offset fpos, end_fpos, endofheader;
3488 if(commGrid->GetRank() == 0)
3490 std::cout << "File is " << file_size << " bytes" << std::endl;
3493 MPI_Bcast(&endofheader, 1, MPIType<MPI_Offset>(), 0, commGrid->commWorld);
3498 MPI_Bcast(&endofheader, 1, MPIType<MPI_Offset>(), 0, commGrid->commWorld);
3499 fpos = endofheader + myrank * (file_size-endofheader) / nprocs;
3501 if(myrank != (nprocs-1)) end_fpos = endofheader + (myrank + 1) * (file_size-endofheader) / nprocs;
3502 else end_fpos = file_size;
3505 MPI_File_open (commGrid->commWorld, const_cast<char*>(filename.c_str()), MPI_MODE_RDONLY, MPI_INFO_NULL, &mpi_fh);
3508 typedef typename DER::LocalIT LIT;
3509 std::vector<LIT> rows;
3510 std::vector<LIT> cols;
3511 std::vector<NT> vals;
3513 std::vector<std::string> lines;
3514 bool finished = SpParHelper::FetchBatch(mpi_fh, fpos, end_fpos, true, lines, myrank);
3515 int64_t entriesread = lines.size();
3516 SpHelper::ProcessLines(rows, cols, vals, lines, symmetric, type, onebased);
3517 MPI_Barrier(commGrid->commWorld);
3521 finished = SpParHelper::FetchBatch(mpi_fh, fpos, end_fpos, false, lines, myrank);
3522 entriesread += lines.size();
3523 SpHelper::ProcessLines(rows, cols, vals, lines, symmetric, type, onebased);
3526 MPI_Reduce(&entriesread, &allentriesread, 1, MPIType<int64_t>(), MPI_SUM, 0, commGrid->commWorld);
3527 #ifdef COMBBLAS_DEBUG
3529 std::cout << "Reading finished. Total number of entries read across all processors is " << allentriesread << std::endl;
3532 std::vector< std::vector < std::tuple<LIT,LIT,NT> > > data(nprocs);
3534 LIT locsize = rows.size();
3535 for(LIT i=0; i<locsize; ++i)
3538 int owner = Owner(nrows, ncols, rows[i], cols[i], lrow, lcol);
3539 data[owner].push_back(std::make_tuple(lrow,lcol,vals[i]));
3541 std::vector<LIT>().swap(rows);
3542 std::vector<LIT>().swap(cols);
3543 std::vector<NT>().swap(vals);
3545 #ifdef COMBBLAS_DEBUG
3547 std::cout << "Packing to recipients finished, about to send..." << std::endl;
3550 if(spSeq) delete spSeq;
3551 SparseCommon(data, locsize, nrows, ncols, BinOp);
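// ReadDistribute: legacy reader. A master process (or, with parallel binary I/O, the first
// process of each processor column) fills fixed-size buffers with (row, column, value)
// entries and scatters them down its processor column and then across the processor row;
// a scatter count of numeric_limits<int>::max() is the sentinel that tells the receivers
// the data is exhausted.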
3558 template <class IT, class NT, class DER>
3559 template <class HANDLER>
3563 TAU_PROFILE_TIMER(rdtimer, "ReadDistribute", "void SpParMat::ReadDistribute (const string & , int, bool, HANDLER, bool)", TAU_DEFAULT);
3564 TAU_PROFILE_START(rdtimer);
3567 std::ifstream infile;
3568 FILE * binfile = NULL;
3571 if(commGrid->GetRank() == master)
3573 hfile = ParseHeader(filename, binfile, seeklength);
3575 MPI_Bcast(&seeklength, 1, MPI_INT, master, commGrid->commWorld);
3577 IT total_m, total_n, total_nnz;
3578 IT m_perproc = 0, n_perproc = 0;
3580 int colneighs = commGrid->GetGridRows();
3581 int rowneighs = commGrid->GetGridCols();
3583 IT buffpercolneigh = MEMORYINBYTES / (colneighs * (2 * sizeof(IT) + sizeof(NT)));
3584 IT buffperrowneigh = MEMORYINBYTES / (rowneighs * (2 * sizeof(IT) + sizeof(NT)));
3591 buffpercolneigh /= colneighs;
3593     SpParHelper::Print("COMBBLAS: Parallel I/O requested but binary header is corrupted\n", commGrid->GetWorld());
3599 buffperrowneigh = std::max(buffperrowneigh, buffpercolneigh);
3600 if(std::max(buffpercolneigh * colneighs, buffperrowneigh * rowneighs) > std::numeric_limits<int>::max())
3602     SpParHelper::Print("COMBBLAS: MPI doesn't support sending int64_t send/recv counts or displacements\n", commGrid->GetWorld());
3605 int * cdispls = new int[colneighs];
3606 for (IT i=0; i<colneighs; ++i) cdispls[i] = i*buffpercolneigh;
3607 int * rdispls = new int[rowneighs];
3608 for (IT i=0; i<rowneighs; ++i) rdispls[i] = i*buffperrowneigh;
3610 int *ccurptrs = NULL, *rcurptrs = NULL;
3617 int rankincol = commGrid->GetRankInProcCol(master);
3618 int rankinrow = commGrid->GetRankInProcRow(master);
3619 std::vector< std::tuple<IT, IT, NT> > localtuples;
3621 if(commGrid->GetRank() == master)
3625     SpParHelper::Print("COMBBLAS: Input file doesn't exist\n", commGrid->GetWorld());
3626     total_n = 0; total_m = 0;
3627     BcastEssentials(commGrid->commWorld, total_m, total_n, total_nnz, master);
3632     SpParHelper::Print("COMBBLAS: Ascii input with binary headers is not supported\n", commGrid->GetWorld());
3633 total_n = 0; total_m = 0;
3634 BcastEssentials(commGrid->commWorld, total_m, total_n, total_nnz, master);
3639 infile.open(filename.c_str());
3641 infile.getline(comment,256);
3642 while(comment[0] == '%')
3644     infile.getline(comment,256);
3646 std::stringstream ss;
3647 ss << std::string(comment);
3648 ss >> total_m >> total_n >> total_nnz;
3651 SpParHelper::Print("COMBBLAS: Trying to read binary headerless file in parallel, aborting\n", commGrid->GetWorld());
3652 total_n = 0; total_m = 0;
3653 BcastEssentials(commGrid->commWorld, total_m, total_n, total_nnz, master);
3661 total_nnz = hfile.nnz;
3663 m_perproc = total_m / colneighs;
3664 n_perproc = total_n / rowneighs;
3665 BcastEssentials(commGrid->commWorld, total_m, total_n, total_nnz, master);
3666 AllocateSetBuffers(rows, cols, vals, rcurptrs, ccurptrs, rowneighs, colneighs, buffpercolneigh);
3668 if(seeklength > 0 && pario)
3670 IT entriestoread = total_nnz / colneighs;
3673 commGrid->OpenDebugFile("Read", oput);
3674 oput << "Total nnz: " << total_nnz << " entries to read: " << entriestoread << std::endl;
3677 ReadAllMine(binfile, rows, cols, vals, localtuples, rcurptrs, ccurptrs, rdispls, cdispls, m_perproc, n_perproc,
3678 rowneighs, colneighs, buffperrowneigh, buffpercolneigh, entriestoread, handler, rankinrow, transpose);
3682 IT temprow, tempcol;
3684 IT ntrow = 0, ntcol = 0;
3686 bool nonumline = nonum;
3688 for(; cnz < total_nnz; ++cnz)
3692 std::stringstream linestream;
3696 infile.getline(line, 1024);
3698 linestream >> temprow >> tempcol;
3702 linestream >> std::skipws;
3703 nonumline = linestream.eof();
3712 handler.binaryfill(binfile, temprow , tempcol, tempval);
3720 colrec = std::min(static_cast<int>(temprow / m_perproc), colneighs-1);
3721 commonindex = colrec * buffpercolneigh + ccurptrs[colrec];
3723 rows[ commonindex ] = temprow;
3724 cols[ commonindex ] = tempcol;
3727 vals[ commonindex ] = nonumline ? handler.getNoNum(ntrow, ntcol) : handler.read(linestream, ntrow, ntcol);
3731 vals[ commonindex ] = tempval;
3733 ++ (ccurptrs[colrec]);
3734 if(ccurptrs[colrec] == buffpercolneigh || (cnz == (total_nnz-1)) )
3736 MPI_Scatter(ccurptrs, 1, MPI_INT, &recvcount, 1, MPI_INT, rankincol, commGrid->colWorld);
3739 IT * temprows = new IT[recvcount];
3740 IT * tempcols = new IT[recvcount];
3741 NT * tempvals = new NT[recvcount];
3744 MPI_Scatterv(rows, ccurptrs, cdispls, MPIType<IT>(), temprows, recvcount, MPIType<IT>(), rankincol, commGrid->colWorld);
3745 MPI_Scatterv(cols, ccurptrs, cdispls, MPIType<IT>(), tempcols, recvcount, MPIType<IT>(), rankincol, commGrid->colWorld);
3746 MPI_Scatterv(vals, ccurptrs, cdispls, MPIType<NT>(), tempvals, recvcount, MPIType<NT>(), rankincol, commGrid->colWorld);
3748 std::fill_n(ccurptrs, colneighs, 0);
3751 HorizontalSend(rows, cols, vals,temprows, tempcols, tempvals, localtuples, rcurptrs, rdispls,
3752 buffperrowneigh, rowneighs, recvcount, m_perproc, n_perproc, rankinrow);
3754 if( cnz != (total_nnz-1) )
3757 rows = new IT [ buffpercolneigh * colneighs ];
3758 cols = new IT [ buffpercolneigh * colneighs ];
3759 vals = new NT [ buffpercolneigh * colneighs ];
3763 assert (cnz == total_nnz);
3766 std::fill_n(ccurptrs, colneighs, std::numeric_limits<int>::max());
3767 MPI_Scatter(ccurptrs, 1, MPI_INT, &recvcount, 1, MPI_INT, rankincol, commGrid->colWorld);
3770 std::fill_n(rcurptrs, rowneighs, std::numeric_limits<int>::max());
3771 MPI_Scatter(rcurptrs, 1, MPI_INT, &recvcount, 1, MPI_INT, rankinrow, commGrid->rowWorld);
3774 else if( commGrid->OnSameProcCol(master) )
3776 BcastEssentials(commGrid->commWorld, total_m, total_n, total_nnz, master);
3777 m_perproc = total_m / colneighs;
3778 n_perproc = total_n / rowneighs;
3780 if(seeklength > 0 && pario)
3782 binfile = fopen(filename.c_str(), "rb");
3783 IT entrysize = handler.entrylength();
3784 int myrankincol = commGrid->GetRankInProcCol();
3785 IT perreader = total_nnz / colneighs;
3786 IT read_offset = entrysize * static_cast<IT>(myrankincol) * perreader + seeklength;
3787 IT entriestoread = perreader;
3788 if (myrankincol == colneighs-1)
3789     entriestoread = total_nnz - static_cast<IT>(myrankincol) * perreader;
3790 fseek(binfile, read_offset, SEEK_SET);
3794 commGrid->OpenDebugFile("Read", oput);
3795 oput << "Total nnz: " << total_nnz << " OFFSET : " << read_offset << " entries to read: " << entriestoread << std::endl;
3799 AllocateSetBuffers(rows, cols, vals, rcurptrs, ccurptrs, rowneighs, colneighs, buffpercolneigh);
3800 ReadAllMine(binfile, rows, cols, vals, localtuples, rcurptrs, ccurptrs, rdispls, cdispls, m_perproc, n_perproc,
3801 rowneighs, colneighs, buffperrowneigh, buffpercolneigh, entriestoread, handler, rankinrow, transpose);
3805 while(total_n > 0 || total_m > 0)
3815 MPI_Scatter(ccurptrs, 1, MPI_INT, &recvcount, 1, MPI_INT, rankincol, commGrid->colWorld);
3816 if( recvcount == std::numeric_limits<int>::max()) break;
3819 IT * temprows = new IT[recvcount];
3820 IT * tempcols = new IT[recvcount];
3821 NT * tempvals = new NT[recvcount];
3824 MPI_Scatterv(rows, ccurptrs, cdispls, MPIType<IT>(), temprows, recvcount, MPIType<IT>(), rankincol, commGrid->colWorld);
3825 MPI_Scatterv(cols, ccurptrs, cdispls, MPIType<IT>(), tempcols, recvcount, MPIType<IT>(), rankincol, commGrid->colWorld);
3826 MPI_Scatterv(vals, ccurptrs, cdispls, MPIType<NT>(), tempvals, recvcount, MPIType<NT>(), rankincol, commGrid->colWorld);
3829 rcurptrs = new int[rowneighs];
3830 std::fill_n(rcurptrs, rowneighs, 0);
3833 HorizontalSend(rows, cols, vals,temprows, tempcols, tempvals, localtuples, rcurptrs, rdispls,
3834 buffperrowneigh, rowneighs, recvcount, m_perproc, n_perproc, rankinrow);
3839 std::fill_n(rcurptrs, rowneighs, std::numeric_limits<int>::max());
3840 MPI_Scatter(rcurptrs, 1, MPI_INT, &recvcount, 1, MPI_INT, rankinrow, commGrid->rowWorld);
3845 BcastEssentials(commGrid->commWorld, total_m, total_n, total_nnz, master);
3846 m_perproc = total_m / colneighs;
3847 n_perproc = total_n / rowneighs;
3848 while(total_n > 0 || total_m > 0)
3851 MPI_Scatter(rcurptrs, 1, MPI_INT, &recvcount, 1, MPI_INT, rankinrow, commGrid->rowWorld);
3852 if( recvcount == std::numeric_limits<int>::max())
3856 IT * temprows = new IT[recvcount];
3857 IT * tempcols = new IT[recvcount];
3858 NT * tempvals = new NT[recvcount];
3860 MPI_Scatterv(rows, rcurptrs, rdispls, MPIType<IT>(), temprows, recvcount, MPIType<IT>(), rankinrow, commGrid->rowWorld);
3861 MPI_Scatterv(cols, rcurptrs, rdispls, MPIType<IT>(), tempcols, recvcount, MPIType<IT>(), rankinrow, commGrid->rowWorld);
3862 MPI_Scatterv(vals, rcurptrs, rdispls, MPIType<NT>(), tempvals, recvcount, MPIType<NT>(), rankinrow, commGrid->rowWorld);
3865 IT moffset = commGrid->myprocrow * m_perproc;
3866 IT noffset = commGrid->myproccol * n_perproc;
3868 for(IT i=0; i< recvcount; ++i)
3870 localtuples.push_back( std::make_tuple(temprows[i]-moffset, tempcols[i]-noffset, tempvals[i]) );
3876 std::tuple<IT,IT,NT> * arrtuples = new std::tuple<IT,IT,NT>[localtuples.size()];
3877 std::copy(localtuples.begin(), localtuples.end(), arrtuples);
3879 IT localm = (commGrid->myprocrow != (commGrid->grrows-1))? m_perproc: (total_m - (m_perproc * (commGrid->grrows-1)));
3880 IT localn = (commGrid->myproccol != (commGrid->grcols-1))? n_perproc: (total_n - (n_perproc * (commGrid->grcols-1)));
3881 spSeq->Create( localtuples.size(), localm, localn, arrtuples);
3884 TAU_PROFILE_STOP(rdtimer);
3889 template <class IT, class NT, class DER>
3893 rows = new IT [ buffpercolneigh * colneighs ];
3894 cols = new IT [ buffpercolneigh * colneighs ];
3895 vals = new NT [ buffpercolneigh * colneighs ];
3897 ccurptrs = new int[colneighs];
3898 rcurptrs = new int[rowneighs];
3899 std::fill_n(ccurptrs, colneighs, 0);
3900 std::fill_n(rcurptrs, rowneighs, 0);
3903 template <class IT, class NT, class DER>
3906 MPI_Bcast(&total_m, 1, MPIType<IT>(), master, world);
3907 MPI_Bcast(&total_n, 1, MPIType<IT>(), master, world);
3908 MPI_Bcast(&total_nnz, 1, MPIType<IT>(), master, world);
3915 template <class IT, class NT, class DER>
3916 void SpParMat<IT,NT,DER>::VerticalSend(IT * & rows, IT * & cols, NT * & vals, std::vector< std::tuple<IT,IT,NT> > & localtuples, int * rcurptrs, int * ccurptrs, int * rdispls, int * cdispls,
3917     IT m_perproc, IT n_perproc, int rowneighs, int colneighs, IT buffperrowneigh, IT buffpercolneigh, int rankinrow)
3920 int * colrecvdispls = new int[colneighs];
3921 int * colrecvcounts = new int[colneighs];
3922 MPI_Alltoall(ccurptrs, 1, MPI_INT, colrecvcounts, 1, MPI_INT, commGrid->colWorld);
3923 int totrecv = std::accumulate(colrecvcounts,colrecvcounts+colneighs,0);
3924 colrecvdispls[0] = 0;
3925 for(int i=0; i<colneighs-1; ++i)
3926     colrecvdispls[i+1] = colrecvdispls[i] + colrecvcounts[i];
3929 IT * temprows = new IT[totrecv];
3930 IT * tempcols = new IT[totrecv];
3931 NT * tempvals = new NT[totrecv];
3934 MPI_Alltoallv(rows, ccurptrs, cdispls, MPIType<IT>(), temprows, colrecvcounts, colrecvdispls, MPIType<IT>(), commGrid->colWorld);
3935 MPI_Alltoallv(cols, ccurptrs, cdispls, MPIType<IT>(), tempcols, colrecvcounts, colrecvdispls, MPIType<IT>(), commGrid->colWorld);
3936 MPI_Alltoallv(vals, ccurptrs, cdispls, MPIType<NT>(), tempvals, colrecvcounts, colrecvdispls, MPIType<NT>(), commGrid->colWorld);
3939 std::fill_n(ccurptrs, colneighs, 0);
3940 DeleteAll(colrecvdispls, colrecvcounts);
3944 HorizontalSend(rows, cols, vals,temprows, tempcols, tempvals, localtuples, rcurptrs, rdispls, buffperrowneigh, rowneighs, totrecv, m_perproc, n_perproc, rankinrow);
3947 rows = new IT [ buffpercolneigh * colneighs ];
3948 cols = new IT [ buffpercolneigh * colneighs ];
3949 vals = new NT [ buffpercolneigh * colneighs ];
3959 template <class IT, class NT, class DER>
3960 template <class HANDLER>
3961 void SpParMat<IT,NT,DER>::ReadAllMine(FILE * binfile, IT * & rows, IT * & cols, NT * & vals, std::vector< std::tuple<IT,IT,NT> > & localtuples, int * rcurptrs, int * ccurptrs, int * rdispls, int * cdispls,
3962     IT m_perproc, IT n_perproc, int rowneighs, int colneighs, IT buffperrowneigh, IT buffpercolneigh, IT entriestoread, HANDLER handler, int rankinrow, bool transpose)
3964 assert(entriestoread != 0);
3966 IT temprow, tempcol;
3968 int finishedglobal = 1;
3969 while(cnz < entriestoread && !feof(binfile))
3971 handler.binaryfill(binfile, temprow , tempcol, tempval);
3978 int colrec = std::min(static_cast<int>(temprow / m_perproc), colneighs-1);
3979 size_t commonindex = colrec * buffpercolneigh + ccurptrs[colrec];
3980 rows[ commonindex ] = temprow;
3981 cols[ commonindex ] = tempcol;
3982 vals[ commonindex ] = tempval;
3983 ++ (ccurptrs[colrec]);
3984 if(ccurptrs[colrec] == buffpercolneigh || (cnz == (entriestoread-1)) )
3988 commGrid->OpenDebugFile("Read", oput);
3989 oput << "To column neighbors: ";
3990 std::copy(ccurptrs, ccurptrs+colneighs, std::ostream_iterator<int>(oput, " ")); oput << std::endl;
3994 VerticalSend(rows, cols, vals, localtuples, rcurptrs, ccurptrs, rdispls, cdispls, m_perproc, n_perproc,
3995 rowneighs, colneighs, buffperrowneigh, buffpercolneigh, rankinrow);
3997 if(cnz == (entriestoread-1))
3999 int finishedlocal = 1;
4000 MPI_Allreduce( &finishedlocal, &finishedglobal, 1, MPI_INT, MPI_BAND, commGrid->colWorld);
4001 while(!finishedglobal)
4005 commGrid->OpenDebugFile("Read", oput);
4006 oput << "To column neighbors: ";
4007 std::copy(ccurptrs, ccurptrs+colneighs, std::ostream_iterator<int>(oput, " ")); oput << std::endl;
4013 VerticalSend(rows, cols, vals, localtuples, rcurptrs, ccurptrs, rdispls, cdispls, m_perproc, n_perproc,
4014 rowneighs, colneighs, buffperrowneigh, buffpercolneigh, rankinrow);
4016 MPI_Allreduce( &finishedlocal, &finishedglobal, 1, MPI_INT, MPI_BAND, commGrid->colWorld);
4021 int finishedlocal = 0;
4022 MPI_Allreduce( &finishedlocal, &finishedglobal, 1, MPI_INT, MPI_BAND, commGrid->colWorld);
4029 std::fill_n(rcurptrs, rowneighs, std::numeric_limits<int>::max());
4031 MPI_Scatter(rcurptrs, 1, MPI_INT, &recvcount, 1, MPI_INT, rankinrow, commGrid->rowWorld);
4041 template <class IT, class NT, class DER>
4042 void SpParMat<IT,NT,DER>::HorizontalSend(IT * & rows, IT * & cols, NT * & vals, IT * & temprows, IT * & tempcols, NT * & tempvals, std::vector < std::tuple <IT,IT,NT> > & localtuples,
4043     int * rcurptrs, int * rdispls, IT buffperrowneigh, int rowneighs, int recvcount, IT m_perproc, IT n_perproc, int rankinrow)
4045 rows = new IT [ buffperrowneigh * rowneighs ];
4046 cols = new IT [ buffperrowneigh * rowneighs ];
4047 vals = new NT [ buffperrowneigh * rowneighs ];
4050 for(int i=0; i< recvcount; ++i)
4052 int rowrec = std::min(static_cast<int>(tempcols[i] / n_perproc), rowneighs-1);
4053 rows[ rowrec * buffperrowneigh + rcurptrs[rowrec] ] = temprows[i];
4054 cols[ rowrec * buffperrowneigh + rcurptrs[rowrec] ] = tempcols[i];
4055 vals[ rowrec * buffperrowneigh + rcurptrs[rowrec] ] = tempvals[i];
4056 ++ (rcurptrs[rowrec]);
4061 commGrid->OpenDebugFile("Read", oput);
4062 oput << "To row neighbors: ";
4063 std::copy(rcurptrs, rcurptrs+rowneighs, std::ostream_iterator<int>(oput, " ")); oput << std::endl;
4064 oput << "Row displacements were: ";
4065 std::copy(rdispls, rdispls+rowneighs, std::ostream_iterator<int>(oput, " ")); oput << std::endl;
4069 MPI_Scatter(rcurptrs, 1, MPI_INT, &recvcount, 1, MPI_INT, rankinrow, commGrid->rowWorld);
4073 DeleteAll(temprows, tempcols, tempvals);
4074 temprows = new IT[recvcount];
4075 tempcols = new IT[recvcount];
4076 tempvals = new NT[recvcount];
4079 MPI_Scatterv(rows, rcurptrs, rdispls, MPIType<IT>(), temprows, recvcount, MPIType<IT>(), rankinrow, commGrid->rowWorld);
4080 MPI_Scatterv(cols, rcurptrs, rdispls, MPIType<IT>(), tempcols, recvcount, MPIType<IT>(), rankinrow, commGrid->rowWorld);
4081 MPI_Scatterv(vals, rcurptrs, rdispls, MPIType<NT>(), tempvals, recvcount, MPIType<NT>(), rankinrow, commGrid->rowWorld);
4084 IT moffset = commGrid->myprocrow * m_perproc;
4085 IT noffset = commGrid->myproccol * n_perproc;
4087 for(int i=0; i< recvcount; ++i)
4089 localtuples.push_back( std::make_tuple(temprows[i]-moffset, tempcols[i]-noffset, tempvals[i]) );
4092 std::fill_n(rcurptrs, rowneighs, 0);
4093 DeleteAll(rows, cols, vals, temprows, tempcols, tempvals);
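// Find: returns the nonzeros as three (or two) FullyDistVec objects of length nnz. Each
// process converts its local tuples to global indices using row/column offsets gathered
// with MPI_Allgather, determines the destination process from the output vectors' own
// distribution, and redistributes the triples with MPI_Alltoallv.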
4099 template <class IT, class NT, class DER>
4102 if((*(distrows.commGrid) != *(distcols.commGrid)) || (*(distcols.commGrid) != *(distvals.commGrid)))
4104     SpParHelper::Print("Grids are not comparable, Find() fails!", commGrid->GetWorld());
4107 IT globallen = getnnz();
4114 IT prelen = Atuples.getnnz();
4117 int rank = commGrid->GetRank();
4118 int nprocs = commGrid->GetSize();
4119 IT * prelens = new IT[nprocs];
4120 prelens[rank] = prelen;
4121 MPI_Allgather(MPI_IN_PLACE, 0, MPIType<IT>(), prelens, 1, MPIType<IT>(), commGrid->GetWorld());
4122 IT prelenuntil = std::accumulate(prelens, prelens+rank, static_cast<IT>(0));
4124 int * sendcnt = new int[nprocs]();
4125 IT * rows = new IT[prelen];
4126 IT * cols = new IT[prelen];
4127 NT * vals = new NT[prelen];
4129 int rowrank = commGrid->GetRankInProcRow();
4130 int colrank = commGrid->GetRankInProcCol();
4131 int rowneighs = commGrid->GetGridCols();
4132 int colneighs = commGrid->GetGridRows();
4133 IT * locnrows = new IT[colneighs];
4134 IT * locncols = new IT[rowneighs];
4135 locnrows[colrank] = getlocalrows();
4136 locncols[rowrank] = getlocalcols();
4138 MPI_Allgather(MPI_IN_PLACE, 0, MPIType<IT>(),locnrows, 1, MPIType<IT>(), commGrid->GetColWorld());
4139 MPI_Allgather(MPI_IN_PLACE, 0, MPIType<IT>(),locncols, 1, MPIType<IT>(), commGrid->GetRowWorld());
4141 IT roffset = std::accumulate(locnrows, locnrows+colrank, static_cast<IT>(0));
4142 IT coffset = std::accumulate(locncols, locncols+rowrank, static_cast<IT>(0));
4145 for(int i=0; i< prelen; ++i)
4148     int owner = nrows.Owner(prelenuntil+i, locid);
4151     rows[i] = Atuples.rowindex(i) + roffset;
4152     cols[i] = Atuples.colindex(i) + coffset;
4156 int * recvcnt = new int[nprocs];
4157 MPI_Alltoall(sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, commGrid->GetWorld());
4159 int * sdpls = new int[nprocs]();
4160 int * rdpls = new int[nprocs]();
4161 std::partial_sum(sendcnt, sendcnt+nprocs-1, sdpls+1);
4162 std::partial_sum(recvcnt, recvcnt+nprocs-1, rdpls+1);
4164 MPI_Alltoallv(rows, sendcnt, sdpls, MPIType<IT>(), SpHelper::p2a(nrows.arr), recvcnt, rdpls, MPIType<IT>(), commGrid->GetWorld());
4165 MPI_Alltoallv(cols, sendcnt, sdpls, MPIType<IT>(), SpHelper::p2a(ncols.arr), recvcnt, rdpls, MPIType<IT>(), commGrid->GetWorld());
4166 MPI_Alltoallv(vals, sendcnt, sdpls, MPIType<NT>(), SpHelper::p2a(nvals.arr), recvcnt, rdpls, MPIType<NT>(), commGrid->GetWorld());
4168 DeleteAll(sendcnt, recvcnt, sdpls, rdpls);
4177 template <class IT, class NT, class DER>
4180 if((*(distrows.commGrid) != *(distcols.commGrid)) )
4182     SpParHelper::Print("Grids are not comparable, Find() fails!", commGrid->GetWorld());
4185 IT globallen = getnnz();
4191 IT prelen = Atuples.getnnz();
4193 int rank = commGrid->GetRank();
4194 int nprocs = commGrid->GetSize();
4195 IT * prelens = new IT[nprocs];
4196 prelens[rank] = prelen;
4197 MPI_Allgather(MPI_IN_PLACE, 0, MPIType<IT>(), prelens, 1, MPIType<IT>(), commGrid->GetWorld());
4198 IT prelenuntil = std::accumulate(prelens, prelens+rank, static_cast<IT>(0));
4200 int * sendcnt = new int[nprocs]();
4201 IT * rows = new IT[prelen];
4202 IT * cols = new IT[prelen];
4203 NT * vals = new NT[prelen];
4205 int rowrank = commGrid->GetRankInProcRow();
4206 int colrank = commGrid->GetRankInProcCol();
4207 int rowneighs = commGrid->GetGridCols();
4208 int colneighs = commGrid->GetGridRows();
4209 IT * locnrows = new IT[colneighs];
4210 IT * locncols = new IT[rowneighs];
4211 locnrows[colrank] = getlocalrows();
4212 locncols[rowrank] = getlocalcols();
4214 MPI_Allgather(MPI_IN_PLACE, 0, MPIType<IT>(),locnrows, 1, MPIType<IT>(), commGrid->GetColWorld());
4215 MPI_Allgather(MPI_IN_PLACE, 0, MPIType<IT>(),locncols, 1, MPIType<IT>(), commGrid->GetRowWorld());  // local column counts are per row neighbor, so gather them across the processor row (as in the Find overload above)
4216 IT roffset = std::accumulate(locnrows, locnrows+colrank, static_cast<IT>(0));
4217 IT coffset = std::accumulate(locncols, locncols+rowrank, static_cast<IT>(0));
4220 for(int i=0; i< prelen; ++i)
4223     int owner = nrows.Owner(prelenuntil+i, locid);
4226     rows[i] = Atuples.rowindex(i) + roffset;
4227     cols[i] = Atuples.colindex(i) + coffset;
4230 int * recvcnt = new int[nprocs];
4231 MPI_Alltoall(sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, commGrid->GetWorld());
4233 int * sdpls = new int[nprocs]();
4234 int * rdpls = new int[nprocs]();
4235 std::partial_sum(sendcnt, sendcnt+nprocs-1, sdpls+1);
4236 std::partial_sum(recvcnt, recvcnt+nprocs-1, rdpls+1);
4238 MPI_Alltoallv(rows, sendcnt, sdpls, MPIType<IT>(), SpHelper::p2a(nrows.arr), recvcnt, rdpls, MPIType<IT>(), commGrid->GetWorld());
4239 MPI_Alltoallv(cols, sendcnt, sdpls, MPIType<IT>(), SpHelper::p2a(ncols.arr), recvcnt, rdpls, MPIType<IT>(), commGrid->GetWorld());
4241 DeleteAll(sendcnt, recvcnt, sdpls, rdpls);
4247 template <class IT, class NT, class DER>
4250 outfile << (*spSeq) << std::endl;
4254 template <class IU, class NU, class UDER>
4255 std::ofstream& operator<<(std::ofstream& outfile, const SpParMat<IU, NU, UDER> & s)
4257     return s.put(outfile);
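// Owner: maps a global (grow, gcol) coordinate to the rank that stores it in the 2D block
// distribution and returns the local (lrow, lcol) coordinates through the out-parameters.
// Blocks are total_m/procrows by total_n/proccols, with the last process row/column
// absorbing the remainder. For example, on a 2x2 grid holding an 8x8 matrix (4x4 blocks),
// global entry (5,2) lands on process row 1, process column 0, with local coordinates (1,2).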
4268 template <class IT, class NT, class DER>
4269 template <typename LIT>
4272 int procrows = commGrid->GetGridRows();
4273 int proccols = commGrid->GetGridCols();
4274 IT m_perproc = total_m / procrows;
4275 IT n_perproc = total_n / proccols;
4280 own_procrow = std::min(static_cast<int>(grow / m_perproc), procrows-1);
4284 own_procrow = procrows -1;
4289 own_proccol = std::min(static_cast<int>(gcol / n_perproc), proccols-1);
4293 own_proccol = proccols-1;
4295 lrow = grow - (own_procrow * m_perproc);
4296 lcol = gcol - (own_proccol * n_perproc);
4297 return commGrid->GetRank(own_procrow, own_proccol);
4304 template <class IT, class NT, class DER>
4307 IT total_rows = getnrow();
4308 IT total_cols = getncol();
4310 int procrows = commGrid->GetGridRows();
4311 int proccols = commGrid->GetGridCols();
4312 IT rows_perproc = total_rows / procrows;
4313 IT cols_perproc = total_cols / proccols;
4315 rowOffset = commGrid->GetRankInProcCol()*rows_perproc;
4316 colOffset = commGrid->GetRankInProcRow()*cols_perproc;