#ifndef ApproxWeightPerfectMatching_h
#define ApproxWeightPerfectMatching_h

#include "../CombBLAS.h"
#include <parallel/algorithm>
#include <parallel/numeric>

// Distribution and process-grid parameters shared by the matching phases below.
// Every field is read through param.<name> in Phase1/Phase2 and friends.
template <class IT>
struct AWPM_param
{
    int nprocs;
    int pr;               // process grid rows
    int pc;               // process grid columns
    IT lnrow;             // local number of rows
    IT lncol;             // local number of columns
    IT m_perproc;         // rows per process row
    IT n_perproc;         // columns per process column
    IT localRowStart;     // global index of the first local row
    IT localColStart;     // global index of the first local column
    std::shared_ptr<CommGrid> commGrid;
};

// Per-phase timing breakdown (computation / communication), filled by the phase routines.
double t1Comp, t1Comm, t2Comp, t2Comm, t3Comp, t3Comm,
       t4Comp, t4Comm, t5Comp, t5Comm, tUpdateMateComp;
template <class IT, class NT>
std::vector<std::tuple<IT,IT,NT>> ExchangeData(std::vector<std::vector<std::tuple<IT,IT,NT>>> & tempTuples, MPI_Comm World)
{
    // Treat each tuple as an opaque block of bytes for MPI
    MPI_Datatype MPI_tuple;
    MPI_Type_contiguous(sizeof(std::tuple<IT,IT,NT>), MPI_CHAR, &MPI_tuple);
    MPI_Type_commit(&MPI_tuple);

    int nprocs;
    MPI_Comm_size(World, &nprocs);

    int * sendcnt = new int[nprocs];
    int * recvcnt = new int[nprocs];
    int * sdispls = new int[nprocs]();
    int * rdispls = new int[nprocs]();

    IT totsend = 0;
    for(IT i=0; i<nprocs; ++i)
    {
        sendcnt[i] = tempTuples[i].size();
        totsend += tempTuples[i].size();
    }

    MPI_Alltoall(sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, World);

    std::partial_sum(sendcnt, sendcnt+nprocs-1, sdispls+1);
    std::partial_sum(recvcnt, recvcnt+nprocs-1, rdispls+1);
    IT totrecv = std::accumulate(recvcnt, recvcnt+nprocs, static_cast<IT>(0));

    // Pack the per-destination buckets into one contiguous send buffer
    std::vector< std::tuple<IT,IT,NT> > sendTuples(totsend);
    for(int i=0; i<nprocs; ++i)
    {
        std::copy(tempTuples[i].begin(), tempTuples[i].end(), sendTuples.data()+sdispls[i]);
        std::vector< std::tuple<IT,IT,NT> >().swap(tempTuples[i]);    // release the bucket's memory
    }

    std::vector< std::tuple<IT,IT,NT> > recvTuples(totrecv);
    MPI_Alltoallv(sendTuples.data(), sendcnt, sdispls, MPI_tuple, recvTuples.data(), recvcnt, rdispls, MPI_tuple, World);
    DeleteAll(sendcnt, recvcnt, sdispls, rdispls);
    MPI_Type_free(&MPI_tuple);
    return recvTuples;
}
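/*
 * Usage sketch (illustrative only; the bucket contents are hypothetical, the call itself is not):
 * each rank fills one bucket of (row, column, weight) tuples per destination rank, then a single
 * call returns every tuple addressed to this rank. The input buckets are emptied on return.
 *
 *   int myrank, nprocs;
 *   MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
 *   MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
 *   std::vector<std::vector<std::tuple<int64_t,int64_t,double>>> buckets(nprocs);
 *   buckets[(myrank + 1) % nprocs].emplace_back(0, 0, 1.0);   // one tuple for the next rank
 *   auto received = ExchangeData(buckets, MPI_COMM_WORLD);    // tuples sent to this rank
 */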
template <class IT, class NT>
std::vector<std::tuple<IT,IT,IT,NT>> ExchangeData1(std::vector<std::vector<std::tuple<IT,IT,IT,NT>>> & tempTuples, MPI_Comm World)
{
    // Same all-to-all pattern as ExchangeData, but for 4-tuples
    MPI_Datatype MPI_tuple;
    MPI_Type_contiguous(sizeof(std::tuple<IT,IT,IT,NT>), MPI_CHAR, &MPI_tuple);
    MPI_Type_commit(&MPI_tuple);

    int nprocs;
    MPI_Comm_size(World, &nprocs);

    int * sendcnt = new int[nprocs];
    int * recvcnt = new int[nprocs];
    int * sdispls = new int[nprocs]();
    int * rdispls = new int[nprocs]();

    IT totsend = 0;
    for(IT i=0; i<nprocs; ++i)
    {
        sendcnt[i] = tempTuples[i].size();
        totsend += tempTuples[i].size();
    }

    MPI_Alltoall(sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, World);

    std::partial_sum(sendcnt, sendcnt+nprocs-1, sdispls+1);
    std::partial_sum(recvcnt, recvcnt+nprocs-1, rdispls+1);
    IT totrecv = std::accumulate(recvcnt, recvcnt+nprocs, static_cast<IT>(0));

    std::vector< std::tuple<IT,IT,IT,NT> > sendTuples(totsend);
    for(int i=0; i<nprocs; ++i)
    {
        std::copy(tempTuples[i].begin(), tempTuples[i].end(), sendTuples.data()+sdispls[i]);
        std::vector< std::tuple<IT,IT,IT,NT> >().swap(tempTuples[i]);    // release the bucket's memory
    }

    std::vector< std::tuple<IT,IT,IT,NT> > recvTuples(totrecv);
    MPI_Alltoallv(sendTuples.data(), sendcnt, sdispls, MPI_tuple, recvTuples.data(), recvcnt, rdispls, MPI_tuple, World);
    DeleteAll(sendcnt, recvcnt, sdispls, rdispls);
    MPI_Type_free(&MPI_tuple);
    return recvTuples;
}
// Returns the rank of the process that owns the global entry (grow, gcol) of A
template <class IT, class NT, class DER>
int OwnerProcs(SpParMat<IT,NT,DER> & A, IT grow, IT gcol, IT nrows, IT ncols)
{
    auto commGrid = A.getcommgrid();
    int procrows = commGrid->GetGridRows();
    int proccols = commGrid->GetGridCols();

    IT m_perproc = nrows / procrows;
    IT n_perproc = ncols / proccols;

    int pr = std::min(static_cast<int>(grow / m_perproc), procrows-1);
    int pc = std::min(static_cast<int>(gcol / n_perproc), proccols-1);
    return commGrid->GetRank(pr, pc);
}
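/*
 * Worked example (numbers are illustrative): on a 2x2 process grid with nrows = ncols = 10,
 * m_perproc = n_perproc = 10/2 = 5, so the global entry (grow = 7, gcol = 3) maps to process
 * row min(7/5, 1) = 1 and process column min(3/5, 1) = 0, i.e. OwnerProcs returns
 * commGrid->GetRank(1, 0). The min() clamps to the last process row/column, which absorbs the
 * remainder rows/columns when the dimensions do not divide evenly.
 */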
// Allgather (index, mate) update pairs within a process row or column
template <class IT>
std::vector<std::tuple<IT,IT>> MateBcast(std::vector<std::tuple<IT,IT>> sendTuples, MPI_Comm World)
{
    MPI_Datatype MPI_tuple;
    MPI_Type_contiguous(sizeof(std::tuple<IT,IT>), MPI_CHAR, &MPI_tuple);
    MPI_Type_commit(&MPI_tuple);

    int nprocs;
    MPI_Comm_size(World, &nprocs);

    int * recvcnt = new int[nprocs];
    int * rdispls = new int[nprocs]();
    int sendcnt = sendTuples.size();

    MPI_Allgather(&sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, World);

    std::partial_sum(recvcnt, recvcnt+nprocs-1, rdispls+1);
    IT totrecv = std::accumulate(recvcnt, recvcnt+nprocs, static_cast<IT>(0));
    std::vector< std::tuple<IT,IT> > recvTuples(totrecv);

    MPI_Allgatherv(sendTuples.data(), sendcnt, MPI_tuple,
                   recvTuples.data(), recvcnt, rdispls, MPI_tuple, World);

    DeleteAll(recvcnt, rdispls);
    MPI_Type_free(&MPI_tuple);
    return recvTuples;
}
// Store, for every locally replicated matched pair, the weight of its matched edge:
// RepMateWC2R[lj] = RepMateWR2C[li] = A(mj, j), where row mj is the mate of column j.
template <class IT, class NT>
void ReplicateMateWeights(const AWPM_param<IT>& param, Dcsc<IT, NT>* dcsc, const std::vector<IT>& colptr,
                          std::vector<IT>& RepMateC2R, std::vector<NT>& RepMateWR2C, std::vector<NT>& RepMateWC2R)
{
    std::fill(RepMateWC2R.begin(), RepMateWC2R.end(), static_cast<NT>(0));
    std::fill(RepMateWR2C.begin(), RepMateWR2C.end(), static_cast<NT>(0));

#pragma omp parallel for
    for(int k=0; k<param.lncol; ++k)
    {
        IT lj = k;                    // local column index
        IT mj = RepMateC2R[lj];       // global row matched to this column

        for(IT cp = colptr[k]; cp < colptr[k+1]; ++cp)
        {
            IT li = dcsc->ir[cp];
            IT i = li + param.localRowStart;
            if(i == mj)               // this nonzero is the matched edge of column lj
            {
                RepMateWC2R[lj] = dcsc->numx[cp];
                RepMateWR2C[li] = dcsc->numx[cp];
            }
        }
    }

    MPI_Comm ColWorld = param.commGrid->GetColWorld();
    MPI_Comm RowWorld = param.commGrid->GetRowWorld();

    // Each process only sees its local nonzeros; sum-reduce to replicate the weights
    MPI_Allreduce(MPI_IN_PLACE, RepMateWC2R.data(), RepMateWC2R.size(), MPIType<NT>(), MPI_SUM, ColWorld);
    MPI_Allreduce(MPI_IN_PLACE, RepMateWR2C.data(), RepMateWR2C.size(), MPIType<NT>(), MPI_SUM, RowWorld);
}
// Sum of the diagonal entries of A; also returns the number of diagonal nonzeros in rettrnnz
template <class IT, class NT, class DER>
NT Trace(SpParMat<IT,NT,DER> & A, IT& rettrnnz)
{
    auto commGrid = A.getcommgrid();
    MPI_Comm World = commGrid->GetWorld();
    int pr = commGrid->GetGridRows();
    int pc = commGrid->GetGridCols();

    IT nrows = A.getnrow();
    IT ncols = A.getncol();
    int rowrank = commGrid->GetRankInProcRow();
    int colrank = commGrid->GetRankInProcCol();
    IT m_perproc = nrows / pr;
    IT n_perproc = ncols / pc;
    IT localRowStart = colrank * m_perproc;
    IT localColStart = rowrank * n_perproc;

    IT trnnz = 0;
    NT trace = static_cast<NT>(0);
    DER* spSeq = A.seqptr();
    for(auto colit = spSeq->begcol(); colit != spSeq->endcol(); ++colit)
    {
        IT lj = colit.colid();
        IT j = lj + localColStart;
        for(auto nzit = spSeq->begnz(colit); nzit < spSeq->endnz(colit); ++nzit)
        {
            IT li = nzit.rowid();
            IT i = li + localRowStart;
            if(i == j)          // diagonal entry
            {
                trnnz++;
                trace += nzit.value();
            }
        }
    }
    MPI_Allreduce(MPI_IN_PLACE, &trnnz, 1, MPIType<IT>(), MPI_SUM, World);
    MPI_Allreduce(MPI_IN_PLACE, &trace, 1, MPIType<NT>(), MPI_SUM, World);
    rettrnnz = trnnz;
    return trace;
}
// Sum of the replicated matched-edge weights; also returns their minimum in minw
template <class NT>
NT MatchingWeight(std::vector<NT>& RepMateWC2R, MPI_Comm RowWorld, NT& minw)
{
    NT w = static_cast<NT>(0);
    minw = 99999999999999.0;
    for(int i=0; i<RepMateWC2R.size(); i++)
    {
        w += RepMateWC2R[i];
        minw = std::min(minw, RepMateWC2R[i]);
    }
    MPI_Allreduce(MPI_IN_PLACE, &w, 1, MPIType<NT>(), MPI_SUM, RowWorld);
    MPI_Allreduce(MPI_IN_PLACE, &minw, 1, MPIType<NT>(), MPI_MIN, RowWorld);
    return w;
}
// Write the replicated mate arrays back into the distributed mate vectors
template <class IT>
void UpdateMatching(FullyDistVec<IT,IT>& mateRow2Col, FullyDistVec<IT,IT>& mateCol2Row,
                    std::vector<IT>& RepMateR2C, std::vector<IT>& RepMateC2R)
{
    auto commGrid = mateRow2Col.getcommgrid();
    MPI_Comm RowWorld = commGrid->GetRowWorld();
    int rowroot = commGrid->GetDiagOfProcRow();
    int pc = commGrid->GetGridCols();

    // mateRow2Col: each process copies its own slice out of the row-replicated array
    IT localLenR2C = mateRow2Col.LocArrSize();
    for(IT i=0, j = mateRow2Col.RowLenUntil(); i<localLenR2C; i++, j++)
    {
        mateRow2Col.SetLocalElement(i, RepMateR2C[j]);
    }

    // mateCol2Row: the diagonal process of each process row scatters the replicated array
    std::vector<int> sendcnts(pc);
    std::vector<int> dpls(pc);
    for(int i=1; i<pc; i++)
    {
        dpls[i] = mateCol2Row.RowLenUntil(i);
        sendcnts[i-1] = dpls[i] - dpls[i-1];
    }
    sendcnts[pc-1] = RepMateC2R.size() - dpls[pc-1];

    IT localLenC2R = mateCol2Row.LocArrSize();
    IT* localC2R = (IT*) mateCol2Row.GetLocArr();
    MPI_Scatterv(RepMateC2R.data(), sendcnts.data(), dpls.data(), MPIType<IT>(), localC2R, localLenC2R, MPIType<IT>(), rowroot, RowWorld);
}
#ifndef L2_CACHE_SIZE
#define L2_CACHE_SIZE 256000
#endif

// Pick a per-thread, per-destination buffer length whose total footprint fits in the L2 cache
inline int ThreadBuffLenForBinning(int itemsize, int nbins)
{
    int THREAD_BUF_LEN = 256;
    int bufferMem = THREAD_BUF_LEN * nbins * itemsize;
    while(bufferMem > L2_CACHE_SIZE)
    {
        THREAD_BUF_LEN /= 2;
        bufferMem = THREAD_BUF_LEN * nbins * itemsize;
    }
    THREAD_BUF_LEN = std::min(nbins+1, THREAD_BUF_LEN);
    return THREAD_BUF_LEN;
}
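/*
 * Worked example (sizes are illustrative and platform-dependent): with nbins = 64 destination
 * processes and itemsize = sizeof(std::tuple<int64_t,int64_t,double>) (typically 24 bytes), the
 * initial footprint is 256 * 64 * 24 = 393216 bytes > L2_CACHE_SIZE (256000), so the length is
 * halved once to 128 (128 * 64 * 24 = 196608 bytes fits), and the final result is
 * min(nbins + 1, 128) = 65 tuples per destination buffer.
 */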
// Phase 1: every locally owned nonzero A(i, j) proposes a cycle through the matched edges of
// row i and column j. The proposal (mi, mj, w), with w = A(i,j) - RepMateWR2C[li] - RepMateWC2R[lj],
// is sent to the process that owns the entry (mj, mi).
template <class IT, class NT>
std::vector< std::tuple<IT,IT,NT> > Phase1(const AWPM_param<IT>& param, Dcsc<IT, NT>* dcsc, const std::vector<IT>& colptr,
                                           const std::vector<IT>& RepMateR2C, const std::vector<IT>& RepMateC2R,
                                           const std::vector<NT>& RepMateWR2C, const std::vector<NT>& RepMateWC2R )
{
    double tstart = MPI_Wtime();
    MPI_Comm World = param.commGrid->GetWorld();

    // ---- first pass: count how many tuples go to every destination process ----
    std::vector<int> sendcnt(param.nprocs, 0);
#pragma omp parallel
    {
        std::vector<int> tsendcnt(param.nprocs, 0);
#pragma omp for
        for(int k=0; k<param.lncol; ++k)
        {
            IT lj = k;
            IT mj = RepMateC2R[lj];          // mate (global row) of column lj

            for(IT cp = colptr[k]; cp < colptr[k+1]; ++cp)
            {
                IT li = dcsc->ir[cp];
                IT mi = RepMateR2C[li];      // mate (global column) of row li

                int rrank = param.m_perproc != 0 ? std::min(static_cast<int>(mj / param.m_perproc), param.pr-1) : (param.pr-1);
                int crank = param.n_perproc != 0 ? std::min(static_cast<int>(mi / param.n_perproc), param.pc-1) : (param.pc-1);
                int owner = param.commGrid->GetRank(rrank , crank);
                tsendcnt[owner]++;
            }
        }
        for(int i=0; i<param.nprocs; i++)
            __sync_fetch_and_add(sendcnt.data()+i, tsendcnt[i]);
    }

    IT totsend = std::accumulate(sendcnt.data(), sendcnt.data()+param.nprocs, static_cast<IT>(0));
    std::vector<int> sdispls (param.nprocs, 0);
    std::partial_sum(sendcnt.data(), sendcnt.data()+param.nprocs-1, sdispls.data()+1);

    std::vector< std::tuple<IT,IT,NT> > sendTuples(totsend);
    std::vector<int> transferCount(param.nprocs, 0);
    int THREAD_BUF_LEN = ThreadBuffLenForBinning(sizeof(std::tuple<IT,IT,NT>), param.nprocs);

    // ---- second pass: regenerate the tuples and write them through per-thread buffers ----
#pragma omp parallel
    {
        std::vector<int> tsendcnt(param.nprocs, 0);
        std::vector<std::tuple<IT,IT, NT>> tsendTuples (param.nprocs*THREAD_BUF_LEN);
#pragma omp for
        for(int k=0; k<param.lncol; ++k)
        {
            IT lj = k;
            IT mj = RepMateC2R[lj];

            for(IT cp = colptr[k]; cp < colptr[k+1]; ++cp)
            {
                IT li = dcsc->ir[cp];
                IT mi = RepMateR2C[li];

                double w = dcsc->numx[cp] - RepMateWR2C[li] - RepMateWC2R[lj];
                int rrank = param.m_perproc != 0 ? std::min(static_cast<int>(mj / param.m_perproc), param.pr-1) : (param.pr-1);
                int crank = param.n_perproc != 0 ? std::min(static_cast<int>(mi / param.n_perproc), param.pc-1) : (param.pc-1);
                int owner = param.commGrid->GetRank(rrank , crank);

                if (tsendcnt[owner] < THREAD_BUF_LEN)
                {
                    tsendTuples[THREAD_BUF_LEN * owner + tsendcnt[owner]] = std::make_tuple(mi, mj, w);
                    tsendcnt[owner]++;
                }
                else    // buffer full: flush it into the shared send buffer, then restart it
                {
                    int tt = __sync_fetch_and_add(transferCount.data()+owner, THREAD_BUF_LEN);
                    std::copy(tsendTuples.data()+THREAD_BUF_LEN * owner, tsendTuples.data()+THREAD_BUF_LEN * (owner+1), sendTuples.data() + sdispls[owner] + tt);
                    tsendTuples[THREAD_BUF_LEN * owner] = std::make_tuple(mi, mj, w);
                    tsendcnt[owner] = 1;
                }
            }
        }
        // flush the partially filled buffers
        for(int owner=0; owner < param.nprocs; owner++)
        {
            if (tsendcnt[owner] > 0)
            {
                int tt = __sync_fetch_and_add(transferCount.data()+owner, tsendcnt[owner]);
                std::copy(tsendTuples.data()+THREAD_BUF_LEN * owner, tsendTuples.data()+THREAD_BUF_LEN * owner + tsendcnt[owner], sendTuples.data() + sdispls[owner] + tt);
            }
        }
    }
    t1Comp = MPI_Wtime() - tstart;
    tstart = MPI_Wtime();

    // ---- exchange the proposals ----
    std::vector<int> recvcnt (param.nprocs);
    std::vector<int> rdispls (param.nprocs, 0);
    MPI_Alltoall(sendcnt.data(), 1, MPI_INT, recvcnt.data(), 1, MPI_INT, World);
    std::partial_sum(recvcnt.data(), recvcnt.data()+param.nprocs-1, rdispls.data()+1);
    IT totrecv = std::accumulate(recvcnt.data(), recvcnt.data()+param.nprocs, static_cast<IT>(0));

    MPI_Datatype MPI_tuple;
    MPI_Type_contiguous(sizeof(std::tuple<IT,IT,NT>), MPI_CHAR, &MPI_tuple);
    MPI_Type_commit(&MPI_tuple);

    std::vector< std::tuple<IT,IT,NT> > recvTuples1(totrecv);
    MPI_Alltoallv(sendTuples.data(), sendcnt.data(), sdispls.data(), MPI_tuple, recvTuples1.data(), recvcnt.data(), rdispls.data(), MPI_tuple, World);
    MPI_Type_free(&MPI_tuple);
    t1Comm = MPI_Wtime() - tstart;
    return recvTuples1;
}
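/*
 * Note on the two-pass pattern above (also used in Phase2): the first parallel pass only counts,
 * per destination rank, how many tuples this process will send, so that sendTuples can be sized
 * and sdispls computed once. The second pass regenerates the same tuples and writes them through
 * small per-thread, per-destination buffers of THREAD_BUF_LEN entries; a buffer is flushed into
 * sendTuples at offset sdispls[owner] + transferCount[owner] whenever it fills up, with
 * __sync_fetch_and_add reserving a disjoint slot for each flush.
 */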
// Phase 2: the received proposals are sorted by target column and merged against the local
// matrix columns; a proposal survives only if the closing entry A(mj, mi) exists locally.
// Surviving cycles (mj, mi, i, cw) are forwarded to the owner of entry (mj, j), i.e. the
// matched edge of row mj.
template <class IT, class NT>
std::vector< std::tuple<IT,IT,IT,NT> > Phase2(const AWPM_param<IT>& param, std::vector<std::tuple<IT,IT,NT>>& recvTuples,
                                              Dcsc<IT, NT>* dcsc, const std::vector<IT>& colptr,
                                              const std::vector<IT>& RepMateR2C, const std::vector<IT>& RepMateC2R,
                                              const std::vector<NT>& RepMateWR2C, const std::vector<NT>& RepMateWC2R )
{
    MPI_Comm World = param.commGrid->GetWorld();
    double tstart = MPI_Wtime();

    __gnu_parallel::sort(recvTuples.begin(), recvTuples.end());
    std::vector<std::vector<std::tuple<IT,IT, IT, NT>>> tempTuples1 (param.nprocs);
    std::vector<int> sendcnt(param.nprocs, 0);

    int nBins = 1;
#pragma omp parallel
    {
        nBins = omp_get_num_threads() * 4;
    }

    // ---- first pass over the bins: count tuples per destination process ----
#pragma omp parallel for
    for(int i=0; i<nBins; i++)
    {
        int perBin = recvTuples.size()/nBins;
        int startBinIndex = perBin * i;
        int endBinIndex = perBin * (i+1);
        if(i==nBins-1) endBinIndex = recvTuples.size();

        std::vector<int> tsendcnt(param.nprocs, 0);
        for(int k=startBinIndex; k<endBinIndex;)
        {
            IT mi = std::get<0>(recvTuples[k]);
            IT lcol = mi - param.localColStart;
            IT i1 = RepMateC2R[lcol];        // row currently matched to column mi
            IT idx1 = k;
            IT idx2 = colptr[lcol];

            // merge the tuples targeting column mi with the nonzeros of that column
            for(; std::get<0>(recvTuples[idx1]) == mi && idx2 < colptr[lcol+1];)
            {
                IT mj = std::get<1>(recvTuples[idx1]);
                IT lrow = mj - param.localRowStart;
                IT j = RepMateR2C[lrow];
                IT lrowMat = dcsc->ir[idx2];
                if(lrowMat == lrow)          // the closing entry A(mj, mi) exists
                {
                    NT weight = std::get<2>(recvTuples[idx1]);
                    NT cw = weight + RepMateWR2C[lrow];

                    int rrank = (param.m_perproc != 0) ? std::min(static_cast<int>(mj / param.m_perproc), param.pr-1) : (param.pr-1);
                    int crank = (param.n_perproc != 0) ? std::min(static_cast<int>(j / param.n_perproc), param.pc-1) : (param.pc-1);
                    int owner = param.commGrid->GetRank(rrank , crank);
                    tsendcnt[owner]++;
                    idx1++;
                }
                else if(lrowMat > lrow)
                    idx1++;                  // proposed row is not present in this column
                else
                    idx2++;                  // advance within the column
            }
            for(; std::get<0>(recvTuples[idx1]) == mi; idx1++);   // skip leftover tuples of this column
            k = idx1;
        }
        for(int p=0; p<param.nprocs; p++)
            __sync_fetch_and_add(sendcnt.data()+p, tsendcnt[p]);
    }

    IT totsend = std::accumulate(sendcnt.data(), sendcnt.data()+param.nprocs, static_cast<IT>(0));
    std::vector<int> sdispls (param.nprocs, 0);
    std::partial_sum(sendcnt.data(), sendcnt.data()+param.nprocs-1, sdispls.data()+1);

    std::vector< std::tuple<IT,IT,IT,NT> > sendTuples(totsend);
    std::vector<int> transferCount(param.nprocs, 0);
    int THREAD_BUF_LEN = ThreadBuffLenForBinning(sizeof(std::tuple<IT,IT,IT,NT>), param.nprocs);

    // ---- second pass over the bins: fill the send buffer through per-thread buffers ----
#pragma omp parallel for
    for(int i=0; i<nBins; i++)
    {
        int perBin = recvTuples.size()/nBins;
        int startBinIndex = perBin * i;
        int endBinIndex = perBin * (i+1);
        if(i==nBins-1) endBinIndex = recvTuples.size();

        std::vector<int> tsendcnt(param.nprocs, 0);
        std::vector<std::tuple<IT,IT, IT, NT>> tsendTuples (param.nprocs*THREAD_BUF_LEN);
        for(int k=startBinIndex; k<endBinIndex;)
        {
            IT mi = std::get<0>(recvTuples[k]);
            IT lcol = mi - param.localColStart;
            IT i1 = RepMateC2R[lcol];
            IT idx1 = k;
            IT idx2 = colptr[lcol];

            for(; std::get<0>(recvTuples[idx1]) == mi && idx2 < colptr[lcol+1];)
            {
                IT mj = std::get<1>(recvTuples[idx1]);
                IT lrow = mj - param.localRowStart;
                IT j = RepMateR2C[lrow];
                IT lrowMat = dcsc->ir[idx2];
                if(lrowMat == lrow)
                {
                    NT weight = std::get<2>(recvTuples[idx1]);
                    NT cw = weight + RepMateWR2C[lrow];

                    int rrank = (param.m_perproc != 0) ? std::min(static_cast<int>(mj / param.m_perproc), param.pr-1) : (param.pr-1);
                    int crank = (param.n_perproc != 0) ? std::min(static_cast<int>(j / param.n_perproc), param.pc-1) : (param.pc-1);
                    int owner = param.commGrid->GetRank(rrank , crank);

                    if (tsendcnt[owner] < THREAD_BUF_LEN)
                    {
                        tsendTuples[THREAD_BUF_LEN * owner + tsendcnt[owner]] = std::make_tuple(mj, mi, i1, cw);
                        tsendcnt[owner]++;
                    }
                    else
                    {
                        int tt = __sync_fetch_and_add(transferCount.data()+owner, THREAD_BUF_LEN);
                        std::copy( tsendTuples.data()+THREAD_BUF_LEN * owner, tsendTuples.data()+THREAD_BUF_LEN * (owner+1) , sendTuples.data() + sdispls[owner]+ tt);
                        tsendTuples[THREAD_BUF_LEN * owner] = std::make_tuple(mj, mi, i1, cw);
                        tsendcnt[owner] = 1;
                    }
                    idx1++;
                }
                else if(lrowMat > lrow)
                    idx1++;
                else
                    idx2++;
            }
            for(; std::get<0>(recvTuples[idx1]) == mi; idx1++);
            k = idx1;
        }
        for(int owner=0; owner < param.nprocs; owner++)
        {
            if (tsendcnt[owner] > 0)
            {
                int tt = __sync_fetch_and_add(transferCount.data()+owner, tsendcnt[owner]);
                std::copy( tsendTuples.data()+THREAD_BUF_LEN * owner, tsendTuples.data()+THREAD_BUF_LEN * owner + tsendcnt[owner], sendTuples.data() + sdispls[owner]+ tt);
            }
        }
    }
    t2Comp = MPI_Wtime() - tstart;
    tstart = MPI_Wtime();

    // ---- exchange the surviving cycles ----
    std::vector<int> recvcnt (param.nprocs);
    std::vector<int> rdispls (param.nprocs, 0);
    MPI_Alltoall(sendcnt.data(), 1, MPI_INT, recvcnt.data(), 1, MPI_INT, World);
    std::partial_sum(recvcnt.data(), recvcnt.data()+param.nprocs-1, rdispls.data()+1);
    IT totrecv = std::accumulate(recvcnt.data(), recvcnt.data()+param.nprocs, static_cast<IT>(0));

    MPI_Datatype MPI_tuple;
    MPI_Type_contiguous(sizeof(std::tuple<IT,IT,IT,NT>), MPI_CHAR, &MPI_tuple);
    MPI_Type_commit(&MPI_tuple);

    std::vector< std::tuple<IT,IT,IT,NT> > recvTuples1(totrecv);
    MPI_Alltoallv(sendTuples.data(), sendcnt.data(), sdispls.data(), MPI_tuple, recvTuples1.data(), recvcnt.data(), rdispls.data(), MPI_tuple, World);
    MPI_Type_free(&MPI_tuple);
    t2Comm = MPI_Wtime() - tstart;
    return recvTuples1;
}
// Earlier, unthreaded formulation of Phase 2, kept for reference. It produces the same
// (mj, mi, i, cw) tuples as Phase2 but buckets them per destination process directly.
template <class IT, class NT>
std::vector<std::vector<std::tuple<IT,IT, IT, NT>>> Phase2_old(const AWPM_param<IT>& param, std::vector<std::tuple<IT,IT,NT>>& recvTuples,
                                                               Dcsc<IT, NT>* dcsc, const std::vector<IT>& colptr,
                                                               const std::vector<IT>& RepMateR2C, const std::vector<IT>& RepMateC2R,
                                                               const std::vector<NT>& RepMateWR2C, const std::vector<NT>& RepMateWC2R )
{
    std::vector<std::vector<std::tuple<IT,IT, IT, NT>>> tempTuples1 (param.nprocs);
    for(int k=0; k<recvTuples.size(); ++k)
    {
        IT mi = std::get<0>(recvTuples[k]);
        IT mj = std::get<1>(recvTuples[k]);
        NT weight = std::get<2>(recvTuples[k]);

        IT lcol = mi - param.localColStart;
        IT lrow = mj - param.localRowStart;
        IT i = RepMateC2R[lcol];
        IT j = RepMateR2C[lrow];

        // look for the closing entry A(mj, mi) in column lcol of the local matrix
        for(IT cp = colptr[lcol]; cp < colptr[lcol+1]; ++cp)
        {
            if(dcsc->ir[cp] != lrow) continue;
            NT cw = weight + RepMateWR2C[lrow];

            int rrank = (param.m_perproc != 0) ? std::min(static_cast<int>(mj / param.m_perproc), param.pr-1) : (param.pr-1);
            int crank = (param.n_perproc != 0) ? std::min(static_cast<int>(j / param.n_perproc), param.pc-1) : (param.pc-1);
            int owner = param.commGrid->GetRank(rrank , crank);
            tempTuples1[owner].push_back(std::make_tuple(mj, mi, i, cw));
            break;
        }
    }
    return tempTuples1;
}
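/*
 * TwoThirdApprox (below) iteratively improves the weight of a perfect matching by searching for
 * weight-increasing alternating cycles. Each iteration runs:
 *   Phase 1   - every nonzero A(i, j) generates a cycle proposal, sent to the owner of (mj, mi);
 *   Phase 2   - proposals are merged against the local matrix to verify the closing edge exists;
 *   Phase 3/4 - each column keeps only its heaviest surviving proposal and conflicts are resolved;
 *   Phase 5   - the accepted swaps are broadcast along process rows/columns (MateBcast) and the
 *               replicated mate arrays RepMateR2C / RepMateC2R and their weights are refreshed.
 * The loop stops when the matching weight no longer increases or after 10 iterations.
 */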
template <class IT, class NT, class DER>
void TwoThirdApprox(SpParMat<IT, NT, DER> & A, FullyDistVec<IT, IT>& mateRow2Col, FullyDistVec<IT, IT>& mateCol2Row)
{
    auto commGrid = A.getcommgrid();
    int myrank = commGrid->GetRank();
    MPI_Comm World = commGrid->GetWorld();
    MPI_Comm ColWorld = commGrid->GetColWorld();
    MPI_Comm RowWorld = commGrid->GetRowWorld();
    int nprocs = commGrid->GetSize();
    int pr = commGrid->GetGridRows();
    int pc = commGrid->GetGridCols();
    int rowrank = commGrid->GetRankInProcRow();
    int colrank = commGrid->GetRankInProcCol();
    int diagneigh = commGrid->GetComplementRank();

    IT nrows = A.getnrow();
    IT ncols = A.getncol();
    IT m_perproc = nrows / pr;
    IT n_perproc = ncols / pc;

    DER* spSeq = A.seqptr();
    Dcsc<IT, NT>* dcsc = spSeq->GetDCSC();
    IT lnrow = spSeq->getnrow();
    IT lncol = spSeq->getncol();
    IT localRowStart = colrank * m_perproc;
    IT localColStart = rowrank * n_perproc;

    // Bundle the distribution parameters consumed by Phase1/Phase2
    AWPM_param<IT> param;
    param.commGrid = commGrid;
    param.nprocs = nprocs;
    param.pr = pr;
    param.pc = pc;
    param.m_perproc = m_perproc;
    param.n_perproc = n_perproc;
    param.lnrow = lnrow;
    param.lncol = lncol;
    param.localRowStart = localRowStart;
    param.localColStart = localColStart;

    double t1CompAll = 0, t1CommAll = 0, t2CompAll = 0, t2CommAll = 0, t3CompAll = 0, t3CommAll = 0,
           t4CompAll = 0, t4CommAll = 0, t5CompAll = 0, t5CommAll = 0, tUpdateMateCompAll = 0, tUpdateWeightAll = 0;
    // Replicate the current matching: RepMateC2R along process columns and RepMateR2C along
    // process rows. The exchange with the complement process transposes mateCol2Row's chunk first.
    MPI_Status status;
    int xsize = (int) mateCol2Row.LocArrSize();
    int trxsize = 0;
    MPI_Sendrecv(&xsize, 1, MPI_INT, diagneigh, TRX, &trxsize, 1, MPI_INT, diagneigh, TRX, World, &status);
    std::vector<IT> trxnums(trxsize);
    MPI_Sendrecv(mateCol2Row.GetLocArr(), xsize, MPIType<IT>(), diagneigh, TRX, trxnums.data(), trxsize, MPIType<IT>(), diagneigh, TRX, World, &status);

    std::vector<int> colsize(pc);
    colsize[colrank] = trxsize;
    MPI_Allgather(MPI_IN_PLACE, 1, MPI_INT, colsize.data(), 1, MPI_INT, ColWorld);
    std::vector<int> dpls(pc, 0);
    std::partial_sum(colsize.data(), colsize.data()+pc-1, dpls.data()+1);
    int accsize = std::accumulate(colsize.data(), colsize.data()+pc, 0);
    std::vector<IT> RepMateC2R(accsize);
    MPI_Allgatherv(trxnums.data(), trxsize, MPIType<IT>(), RepMateC2R.data(), colsize.data(), dpls.data(), MPIType<IT>(), ColWorld);

    xsize = (int) mateRow2Col.LocArrSize();
    std::vector<int> rowsize(pr);
    rowsize[rowrank] = xsize;
    MPI_Allgather(MPI_IN_PLACE, 1, MPI_INT, rowsize.data(), 1, MPI_INT, RowWorld);
    std::vector<int> rdpls(pr, 0);
    std::partial_sum(rowsize.data(), rowsize.data()+pr-1, rdpls.data()+1);
    accsize = std::accumulate(rowsize.data(), rowsize.data()+pr, 0);
    std::vector<IT> RepMateR2C(accsize);
    MPI_Allgatherv(mateRow2Col.GetLocArr(), xsize, MPIType<IT>(), RepMateR2C.data(), rowsize.data(), rdpls.data(), MPIType<IT>(), RowWorld);
    // Build a dense column-pointer array over all lncol columns (the DCSC only stores nonempty
    // columns); empty columns point at the start of the next nonempty one.
    std::vector<IT> colptr (lncol+1, -1);
    for(auto colit = spSeq->begcol(); colit != spSeq->endcol(); ++colit)
    {
        IT lj = colit.colid();
        colptr[lj] = colit.colptr();
    }
    colptr[lncol] = spSeq->getnnz();
    for(IT k=lncol-1; k>=0; k--)
    {
        if(colptr[k] == -1)
            colptr[k] = colptr[k+1];
    }
    // Replicated weights of the matched edges, indexed like RepMateR2C / RepMateC2R
    std::vector<NT> RepMateWR2C(lnrow);
    std::vector<NT> RepMateWC2R(lncol);
    ReplicateMateWeights(param, dcsc, colptr, RepMateC2R, RepMateWR2C, RepMateWC2R);

    int iterations = 0;
    NT minw;
    NT weightCur = MatchingWeight(RepMateWC2R, RowWorld, minw);
    NT weightPrev = weightCur - 999999999999;

    // Iterate while the matching weight keeps improving, up to 10 iterations
    while(weightCur > weightPrev && iterations++ < 10)
    {
        if(myrank==0) std::cout << "Iteration " << iterations << ". matching weight: sum = " << weightCur << " min = " << minw << std::endl;

        // Phases 1 and 2: generate cycle proposals and validate them against the local matrix
        std::vector<std::tuple<IT,IT,NT>> recvTuples = Phase1(param, dcsc, colptr, RepMateR2C, RepMateC2R, RepMateWR2C, RepMateWC2R );
        std::vector<std::tuple<IT,IT,IT,NT>> recvTuples1 = Phase2(param, recvTuples, dcsc, colptr, RepMateR2C, RepMateC2R, RepMateWR2C, RepMateWC2R );
        std::vector< std::tuple<IT,IT,NT> >().swap(recvTuples);   // free the Phase 1 buffers
        // ---------------- Phase 3: every column keeps its heaviest incoming proposal ----------------
        double tstart = MPI_Wtime();

        std::vector<std::tuple<IT,IT,IT,NT>> bestTuplesPhase3 (lncol);
#pragma omp parallel for
        for(int k=0; k<lncol; ++k)
        {
            bestTuplesPhase3[k] = std::make_tuple(-1,-1,-1,0);
        }

        for(int k=0; k<recvTuples1.size(); ++k)
        {
            IT mj = std::get<0>(recvTuples1[k]);
            IT mi = std::get<1>(recvTuples1[k]);
            IT i = std::get<2>(recvTuples1[k]);
            NT weight = std::get<3>(recvTuples1[k]);
            IT j = RepMateR2C[mj - localRowStart];
            IT lj = j - localColStart;

            if( (std::get<0>(bestTuplesPhase3[lj]) == -1) || (weight > std::get<3>(bestTuplesPhase3[lj])) )
            {
                bestTuplesPhase3[lj] = std::make_tuple(i, mi, mj, weight);
            }
        }

        // Forward each column's winner to the owner of the entry (i, mi)
        std::vector<std::vector<std::tuple<IT,IT, IT, NT>>> tempTuples1 (nprocs);
        for(int k=0; k<lncol; ++k)
        {
            if( std::get<0>(bestTuplesPhase3[k]) != -1)
            {
                IT i = std::get<0>(bestTuplesPhase3[k]);
                IT mi = std::get<1>(bestTuplesPhase3[k]);
                IT mj = std::get<2>(bestTuplesPhase3[k]);
                IT j = RepMateR2C[mj - localRowStart];
                NT weight = std::get<3>(bestTuplesPhase3[k]);
                int owner = OwnerProcs(A, i, mi, nrows, ncols);
                tempTuples1[owner].push_back(std::make_tuple(i, j, mj, weight));
            }
        }

        double t3Comp = MPI_Wtime() - tstart;
        tstart = MPI_Wtime();
        recvTuples1 = ExchangeData1(tempTuples1, World);
        double t3Comm = MPI_Wtime() - tstart;
        tstart = MPI_Wtime();
        // ---------------- Phase 4: resolve conflicts among the Phase-3 winners ----------------
        std::vector<std::tuple<IT,IT,IT,IT, NT>> bestTuplesPhase4 (lncol);
#pragma omp parallel for
        for(int k=0; k<lncol; ++k)
        {
            bestTuplesPhase4[k] = std::make_tuple(-1,-1,-1,-1,0);
        }

        for(int k=0; k<recvTuples1.size(); ++k)
        {
            IT i = std::get<0>(recvTuples1[k]);
            IT j = std::get<1>(recvTuples1[k]);
            IT mj = std::get<2>(recvTuples1[k]);
            IT mi = RepMateR2C[i - localRowStart];
            NT weight = std::get<3>(recvTuples1[k]);
            IT lmi = mi - localColStart;

            // keep the heaviest proposal for column mi, unless that column already won in Phase 3
            if( ((std::get<0>(bestTuplesPhase4[lmi]) == -1) || (weight > std::get<4>(bestTuplesPhase4[lmi]))) && std::get<0>(bestTuplesPhase3[lmi]) == -1 )
            {
                bestTuplesPhase4[lmi] = std::make_tuple(i,j,mi,mj,weight);
            }
        }

        // Notify the owners of both affected matched edges about every accepted swap
        std::vector<std::vector<std::tuple<IT,IT,IT, IT>>> winnerTuples (nprocs);
        for(int k=0; k<lncol; ++k)
        {
            if( std::get<0>(bestTuplesPhase4[k]) != -1)
            {
                IT i = std::get<0>(bestTuplesPhase4[k]);
                IT j = std::get<1>(bestTuplesPhase4[k]);
                IT mi = std::get<2>(bestTuplesPhase4[k]);
                IT mj = std::get<3>(bestTuplesPhase4[k]);

                int owner = OwnerProcs(A, mj, j, nrows, ncols);
                winnerTuples[owner].push_back(std::make_tuple(i, j, mi, mj));

                owner = OwnerProcs(A, i, mi, nrows, ncols);
                winnerTuples[owner].push_back(std::make_tuple(mj, mi, j, i));
            }
        }

        double t4Comp = MPI_Wtime() - tstart;
        tstart = MPI_Wtime();

        std::vector<std::tuple<IT,IT,IT,IT>> recvWinnerTuples = ExchangeData1(winnerTuples, World);

        double t4Comm = MPI_Wtime() - tstart;
        tstart = MPI_Wtime();
        // ---------------- Phase 5: broadcast the accepted swaps along rows and columns ----------------
        std::vector<std::tuple<IT,IT>> rowBcastTuples(recvWinnerTuples.size());
        std::vector<std::tuple<IT,IT>> colBcastTuples(recvWinnerTuples.size());

#pragma omp parallel for
        for(int k=0; k<recvWinnerTuples.size(); ++k)
        {
            IT i = std::get<0>(recvWinnerTuples[k]);
            IT j = std::get<1>(recvWinnerTuples[k]);
            IT mi = std::get<2>(recvWinnerTuples[k]);
            IT mj = std::get<3>(recvWinnerTuples[k]);

            colBcastTuples[k] = std::make_tuple(j,i);
            rowBcastTuples[k] = std::make_tuple(mj,mi);
        }
        double t5Comp = MPI_Wtime() - tstart;
        tstart = MPI_Wtime();

        std::vector<std::tuple<IT,IT>> updatedR2C = MateBcast(rowBcastTuples, RowWorld);
        std::vector<std::tuple<IT,IT>> updatedC2R = MateBcast(colBcastTuples, ColWorld);

        double t5Comm = MPI_Wtime() - tstart;
        tstart = MPI_Wtime();
        // Apply the broadcast updates to the replicated mate arrays
#pragma omp parallel for
        for(int k=0; k<updatedR2C.size(); k++)
        {
            IT row = std::get<0>(updatedR2C[k]);
            IT mate = std::get<1>(updatedR2C[k]);
            RepMateR2C[row-localRowStart] = mate;
        }

#pragma omp parallel for
        for(int k=0; k<updatedC2R.size(); k++)
        {
            IT col = std::get<0>(updatedC2R[k]);
            IT mate = std::get<1>(updatedC2R[k]);
            RepMateC2R[col-localColStart] = mate;
        }

        double tUpdateMateComp = MPI_Wtime() - tstart;
        tstart = MPI_Wtime();

        // Recompute the replicated matched-edge weights and the matching weight
        ReplicateMateWeights(param, dcsc, colptr, RepMateC2R, RepMateWR2C, RepMateWC2R);
        double tUpdateWeight = MPI_Wtime() - tstart;

        weightPrev = weightCur;
        weightCur = MatchingWeight(RepMateWC2R, RowWorld, minw);
        if(myrank==0)
        {
            std::cout << t1Comp << " " << t1Comm << " " << t2Comp << " " << t2Comm << " "
                      << t3Comp << " " << t3Comm << " " << t4Comp << " " << t4Comm << " "
                      << t5Comp << " " << t5Comm << " " << tUpdateMateComp << " " << tUpdateWeight << std::endl;
        }

        t1CompAll += t1Comp;                    t1CommAll += t1Comm;
        t2CompAll += t2Comp;                    t2CommAll += t2Comm;
        t3CompAll += t3Comp;                    t3CommAll += t3Comm;
        t4CompAll += t4Comp;                    t4CommAll += t4Comm;
        t5CompAll += t5Comp;                    t5CommAll += t5Comm;
        tUpdateMateCompAll += tUpdateMateComp;
        tUpdateWeightAll += tUpdateWeight;
    }
    if(myrank==0)
    {
        std::cout << "=========== overall timing ==========" << std::endl;
        std::cout << t1CompAll << " " << t1CommAll << " " << t2CompAll << " " << t2CommAll << " "
                  << t3CompAll << " " << t3CommAll << " " << t4CompAll << " " << t4CommAll << " "
                  << t5CompAll << " " << t5CommAll << " " << tUpdateMateCompAll << " " << tUpdateWeightAll << std::endl;
    }

    // Write the improved matching back into the distributed mate vectors
    UpdateMatching(mateRow2Col, mateCol2Row, RepMateR2C, RepMateC2R);
}
// Rescale the matrix so that its entries form a well-behaved matching weight: take absolute
// values, normalize rows and columns by their maxima, and optionally work with log-weights.
template <class IT, class NT, class DER>
void TransformWeight(SpParMat<IT, NT, DER> & A, bool applylog)
{
    A.Apply([](NT val){return (fabs(val));});

    FullyDistVec<IT, NT> maxvRow(A.getcommgrid());
    A.Reduce(maxvRow, Row, maximum<NT>(), static_cast<NT>(0));
    A.DimApply(Row, maxvRow, [](NT val, NT maxval){return val/maxval;});

    FullyDistVec<IT, NT> maxvCol(A.getcommgrid());
    A.Reduce(maxvCol, Column, maximum<NT>(), static_cast<NT>(0));
    A.DimApply(Column, maxvCol, [](NT val, NT maxval){return val/maxval;});

    if(applylog)
        A.Apply([](NT val){return log(val);});
}
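/*
 * Typical call of the AWPM driver defined below (a sketch; the file name and index types are
 * placeholders, not part of this header):
 *
 *   SpParMat<int64_t, double, SpDCCols<int64_t, double>> A(MPI_COMM_WORLD);
 *   A.ParallelReadMM("matrix.mtx", true, maximum<double>());
 *   FullyDistVec<int64_t, int64_t> mateRow2Col(A.getcommgrid(), A.getnrow(), (int64_t) -1);
 *   FullyDistVec<int64_t, int64_t> mateCol2Row(A.getcommgrid(), A.getncol(), (int64_t) -1);
 *   AWPM(A, mateRow2Col, mateCol2Row);    // mateRow2Col[i] = column matched to row i
 */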
// Driver: compute an approximate-weight perfect matching of A1.
// Sequence: transform weights, greedy maximal matching, extend to a perfect matching, then
// improve its weight with TwoThirdApprox; at each checkpoint fall back to the natural
// (diagonal) ordering if that ordering is perfect and at least as heavy.
template <class IT, class NT>
void AWPM(SpParMat<IT, NT, SpDCCols<IT,NT>> & A1, FullyDistVec<IT, IT>& mateRow2Col, FullyDistVec<IT, IT>& mateCol2Row, bool optimizeProd = true)
{
    // Work on a transformed copy so that A1 itself is left untouched
    SpParMat<IT, NT, SpDCCols<IT, NT>> A(A1);
    TransformWeight(A, optimizeProd);

    // Column degrees are needed by the greedy initialization
    FullyDistVec<IT, IT> degCol(A.getcommgrid());
    SpParMat<IT, bool, SpDCCols<IT, bool>> Abool = A;
    Abool.Reduce(degCol, Column, plus<IT>(), static_cast<IT>(0));

    // Weight and perfection of the natural (diagonal) ordering
    IT diagnnz;
    double origWeight = Trace(A, diagnnz);
    bool isOriginalPerfect = diagnnz == A.getnrow();

    // Step 1: weighted greedy maximal matching
    double ts = MPI_Wtime();
    WeightedGreedy(A, mateRow2Col, mateCol2Row, degCol);
    double tmcl = MPI_Wtime() - ts;
    double mclWeight = MatchingWeight(A, mateRow2Col, mateCol2Row);
    SpParHelper::Print("After Greedy sanity check\n");
    bool isPerfectMCL = CheckMatching(mateRow2Col, mateCol2Row);

    // Keep the natural ordering if it is perfect and the greedy matching is no heavier
    if(isOriginalPerfect && mclWeight <= origWeight)
    {
        SpParHelper::Print("Maximal matching is not better than the natural ordering. Hence, keeping the natural ordering.\n");
        mateRow2Col.iota(A.getnrow(), 0);
        mateCol2Row.iota(A.getncol(), 0);
        mclWeight = origWeight;
        isPerfectMCL = true;
    }

    // Step 2: extend the maximal matching to a perfect (maximum-cardinality) matching
    double mcmWeight = mclWeight;
    ts = MPI_Wtime();
    if(!isPerfectMCL)
    {
        maximumMatching(A, mateRow2Col, mateCol2Row);
        mcmWeight = MatchingWeight(A, mateRow2Col, mateCol2Row);
    }
    double tmcm = MPI_Wtime() - ts;
    SpParHelper::Print("After MCM sanity check\n");

    // Step 3: improve the weight of the perfect matching
    ts = MPI_Wtime();
    TwoThirdApprox(A, mateRow2Col, mateCol2Row);
    double tawpm = MPI_Wtime() - ts;

    double awpmWeight = MatchingWeight( A, mateRow2Col, mateCol2Row);
    SpParHelper::Print("After AWPM sanity check\n");
    if(isOriginalPerfect && awpmWeight < origWeight)
    {
        SpParHelper::Print("AWPM is not better than the natural ordering. Hence, keeping the natural ordering.\n");
        mateRow2Col.iota(A.getnrow(), 0);
        mateCol2Row.iota(A.getncol(), 0);
        awpmWeight = origWeight;
    }
}

#endif // ApproxWeightPerfectMatching_h