COMBINATORIAL_BLAS  1.6
ApproxWeightPerfectMatching.h
1 //
2 // ApproxWeightPerfectMatching.h
3 //
4 //
5 // Created by Ariful Azad on 8/22/17.
6 //
7 //
8 
9 #ifndef ApproxWeightPerfectMatching_h
10 #define ApproxWeightPerfectMatching_h
11 
12 #include "../CombBLAS.h"
13 #include "BPMaximalMatching.h"
14 #include "BPMaximumMatching.h"
15 #include <parallel/algorithm>
16 #include <parallel/numeric>
17 #include <memory>
18 #include <limits>
19 
20 
21 using namespace std;
22 
23 namespace combblas {
24 
25 
26 template <class IT>
27 struct AWPM_param
28 {
29  int nprocs;
30  int myrank;
31  int pr;
32  int pc;
33  IT lncol;
34  IT lnrow;
35  IT localRowStart;
36  IT localColStart;
37  IT m_perproc;
38  IT n_perproc;
39  std::shared_ptr<CommGrid> commGrid;
40 };
41 
42 double t1Comp, t1Comm, t2Comp, t2Comm; // file-scope timers set in Phase1/Phase2 and reported by TwoThirdApprox
43 
44 
45 template <class IT, class NT>
46 std::vector<std::tuple<IT,IT,NT>> ExchangeData(std::vector<std::vector<std::tuple<IT,IT,NT>>> & tempTuples, MPI_Comm World)
47 {
48 
49  /* Create an MPI datatype matching the tuples being exchanged */
50  MPI_Datatype MPI_tuple;
51  MPI_Type_contiguous(sizeof(std::tuple<IT,IT,NT>), MPI_CHAR, &MPI_tuple);
52  MPI_Type_commit(&MPI_tuple);
53 
54  int nprocs;
55  MPI_Comm_size(World, &nprocs);
56 
57  int * sendcnt = new int[nprocs];
58  int * recvcnt = new int[nprocs];
59  int * sdispls = new int[nprocs]();
60  int * rdispls = new int[nprocs]();
61 
62  // Count how many tuples will be sent to each processor
63  IT totsend = 0;
64  for(IT i=0; i<nprocs; ++i)
65  {
66  sendcnt[i] = tempTuples[i].size();
67  totsend += tempTuples[i].size();
68  }
69 
70  MPI_Alltoall(sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, World);
71 
72  std::partial_sum(sendcnt, sendcnt+nprocs-1, sdispls+1);
73  std::partial_sum(recvcnt, recvcnt+nprocs-1, rdispls+1);
74  IT totrecv = accumulate(recvcnt,recvcnt+nprocs, static_cast<IT>(0));
75 
76 
77  std::vector< std::tuple<IT,IT,NT> > sendTuples(totsend);
78  for(int i=0; i<nprocs; ++i)
79  {
80  copy(tempTuples[i].begin(), tempTuples[i].end(), sendTuples.data()+sdispls[i]);
81  std::vector< std::tuple<IT,IT,NT> >().swap(tempTuples[i]); // clear memory
82  }
83  std::vector< std::tuple<IT,IT,NT> > recvTuples(totrecv);
84  MPI_Alltoallv(sendTuples.data(), sendcnt, sdispls, MPI_tuple, recvTuples.data(), recvcnt, rdispls, MPI_tuple, World);
85  DeleteAll(sendcnt, recvcnt, sdispls, rdispls); // free all memory
86  MPI_Type_free(&MPI_tuple);
87  return recvTuples;
88 
89 }
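// --- Editor's illustration (not part of the original header) ----------------
// Minimal usage sketch of ExchangeData, assuming 64-bit indices and double
// weights: each rank buckets tuples by destination rank, and the routine
// performs the variable-size personalized all-to-all and returns what arrived.
//
//   int myrank, nprocs;
//   MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
//   MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
//   std::vector<std::vector<std::tuple<int64_t,int64_t,double>>> out(nprocs);
//   out[(myrank + 1) % nprocs].emplace_back(myrank, myrank, 1.0); // to right neighbor
//   auto in = ExchangeData(out, MPI_COMM_WORLD); // one tuple arrives from the left neighbor
// -----------------------------------------------------------------------------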
90 
91 
92 
93 template <class IT, class NT>
94 std::vector<std::tuple<IT,IT,IT,NT>> ExchangeData1(std::vector<std::vector<std::tuple<IT,IT,IT,NT>>> & tempTuples, MPI_Comm World)
95 {
96 
97  /* Create an MPI datatype matching the tuples being exchanged */
98  MPI_Datatype MPI_tuple;
99  MPI_Type_contiguous(sizeof(std::tuple<IT,IT,IT,NT>), MPI_CHAR, &MPI_tuple);
100  MPI_Type_commit(&MPI_tuple);
101 
102  int nprocs;
103  MPI_Comm_size(World, &nprocs);
104 
105  int * sendcnt = new int[nprocs];
106  int * recvcnt = new int[nprocs];
107  int * sdispls = new int[nprocs]();
108  int * rdispls = new int[nprocs]();
109 
110  // Count how many tuples will be sent to each processor
111  IT totsend = 0;
112  for(IT i=0; i<nprocs; ++i)
113  {
114  sendcnt[i] = tempTuples[i].size();
115  totsend += tempTuples[i].size();
116  }
117 
118  MPI_Alltoall(sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, World);
119 
120  std::partial_sum(sendcnt, sendcnt+nprocs-1, sdispls+1);
121  std::partial_sum(recvcnt, recvcnt+nprocs-1, rdispls+1);
122  IT totrecv = std::accumulate(recvcnt,recvcnt+nprocs, static_cast<IT>(0));
123 
124  std::vector< std::tuple<IT,IT,IT,NT> > sendTuples(totsend);
125  for(int i=0; i<nprocs; ++i)
126  {
127  copy(tempTuples[i].begin(), tempTuples[i].end(), sendTuples.data()+sdispls[i]);
128  std::vector< std::tuple<IT,IT,IT,NT> >().swap(tempTuples[i]); // clear memory
129  }
130  std::vector< std::tuple<IT,IT,IT,NT> > recvTuples(totrecv);
131  MPI_Alltoallv(sendTuples.data(), sendcnt, sdispls, MPI_tuple, recvTuples.data(), recvcnt, rdispls, MPI_tuple, World);
132  DeleteAll(sendcnt, recvcnt, sdispls, rdispls); // free all memory
133  MPI_Type_free(&MPI_tuple);
134  return recvTuples;
135 }
136 
137 
138 
139 
140 // remember that getnrow() and getncol() require collectives
141 // Hence, we save them once and pass them to this function
142 template <class IT, class NT,class DER>
143 int OwnerProcs(SpParMat < IT, NT, DER > & A, IT grow, IT gcol, IT nrows, IT ncols)
144 {
145  auto commGrid = A.getcommgrid();
146  int procrows = commGrid->GetGridRows();
147  int proccols = commGrid->GetGridCols();
148 
149  IT m_perproc = nrows / procrows;
150  IT n_perproc = ncols / proccols;
151  int pr, pc;
152  if(m_perproc != 0)
153  pr = std::min(static_cast<int>(grow / m_perproc), procrows-1);
154  else // all owned by the last processor row
155  pr = procrows -1;
156  if(n_perproc != 0)
157  pc = std::min(static_cast<int>(gcol / n_perproc), proccols-1);
158  else
159  pc = proccols-1;
160  return commGrid->GetRank(pr, pc);
161 }
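// --- Editor's illustration (not part of the original header) ----------------
// Worked example of the owner computation above: on a 4 x 4 process grid with
// nrows = ncols = 1000, m_perproc = n_perproc = 250, the nonzero at global
// coordinates (grow, gcol) = (620, 130) maps to process row min(620/250, 3) = 2
// and process column min(130/250, 3) = 0, i.e. commGrid->GetRank(2, 0).
// When a dimension is not divisible by the grid size, the leftover rows or
// columns belong to the last process row/column, which is what the
// min(..., procrows-1) clamp enforces.
// -----------------------------------------------------------------------------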
162 
163 
164 /*
165 // Hence, we save them once and pass them to this function
166 template <class IT, class NT,class DER>
167 int OwnerProcs(SpParMat < IT, NT, DER > & A, IT grow, IT gcol, IT nrows, IT ncols)
168 {
169 
170 
171 
172  int pr1, pc1;
173  if(m_perproc != 0)
174  pr1 = std::min(static_cast<int>(grow / m_perproc), pr-1);
175  else // all owned by the last processor row
176  pr1 = pr -1;
177  if(n_perproc != 0)
178  pc1 = std::min(static_cast<int>(gcol / n_perproc), pc-1);
179  else
180  pc1 = pc-1;
181  return commGrid->GetRank(pr1, pc1);
182 }
183  */
184 
185 
186 template <class IT>
187 std::vector<std::tuple<IT,IT>> MateBcast(std::vector<std::tuple<IT,IT>> sendTuples, MPI_Comm World)
188 {
189 
190  /* Create an MPI datatype matching the tuples being broadcast */
191  MPI_Datatype MPI_tuple;
192  MPI_Type_contiguous(sizeof(std::tuple<IT,IT>) , MPI_CHAR, &MPI_tuple);
193  MPI_Type_commit(&MPI_tuple);
194 
195 
196  int nprocs;
197  MPI_Comm_size(World, &nprocs);
198 
199  int * recvcnt = new int[nprocs];
200  int * rdispls = new int[nprocs]();
201  int sendcnt = sendTuples.size();
202 
203 
204  MPI_Allgather(&sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, World);
205 
206  std::partial_sum(recvcnt, recvcnt+nprocs-1, rdispls+1);
207  IT totrecv = std::accumulate(recvcnt,recvcnt+nprocs, static_cast<IT>(0));
208 
209  std::vector< std::tuple<IT,IT> > recvTuples(totrecv);
210 
211 
212  MPI_Allgatherv(sendTuples.data(), sendcnt, MPI_tuple,
213  recvTuples.data(), recvcnt, rdispls,MPI_tuple,World );
214 
215  DeleteAll(recvcnt, rdispls); // free all memory
216  MPI_Type_free(&MPI_tuple);
217  return recvTuples;
218 
219 }
220 
221 
222 // -----------------------------------------------------------
223 // replicate weights of mates
224 // Can be improved by replacing the AllReduce with an All2All
225 // -----------------------------------------------------------
226 
227 template <class IT, class NT>
228 void ReplicateMateWeights( const AWPM_param<IT>& param, Dcsc<IT, NT>*dcsc, const std::vector<IT>& colptr, std::vector<IT>& RepMateC2R, std::vector<NT>& RepMateWR2C, std::vector<NT>& RepMateWC2R)
229 {
230 
231 
232  fill(RepMateWC2R.begin(), RepMateWC2R.end(), static_cast<NT>(0));
233  fill(RepMateWR2C.begin(), RepMateWR2C.end(), static_cast<NT>(0));
234 
235 
236 #ifdef THREADED
237 #pragma omp parallel for
238 #endif
239  for(int k=0; k<param.lncol; ++k)
240  {
241 
242  IT lj = k; // local numbering
243  IT mj = RepMateC2R[lj]; // mate of j
244 
245  if(mj >= param.localRowStart && mj < (param.localRowStart+param.lnrow) )
246  {
247  for(IT cp = colptr[k]; cp < colptr[k+1]; ++cp)
248  {
249  IT li = dcsc->ir[cp];
250  IT i = li + param.localRowStart;
251  // TODO: use binary search to jump directly to the mj-th entry if there are more than 32 nonzeros in this column
252  if( i == mj)
253  {
254  RepMateWC2R[lj] = dcsc->numx[cp];
255  RepMateWR2C[mj-param.localRowStart] = dcsc->numx[cp];
256  //break;
257  }
258  }
259  }
260  }
261 
262 
263 
264  MPI_Comm ColWorld = param.commGrid->GetColWorld();
265  MPI_Comm RowWorld = param.commGrid->GetRowWorld();
266 
267  MPI_Allreduce(MPI_IN_PLACE, RepMateWC2R.data(), RepMateWC2R.size(), MPIType<NT>(), MPI_SUM, ColWorld);
268  MPI_Allreduce(MPI_IN_PLACE, RepMateWR2C.data(), RepMateWR2C.size(), MPIType<NT>(), MPI_SUM, RowWorld);
269 }
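// --- Editor's illustration (not part of the original header) ----------------
// Why the MPI_SUM reductions above recover the matched weights: for a given
// local column j, exactly one process in this process column owns the matrix
// row mj = RepMateC2R[j], so at most one process contributes a value to
// RepMateWC2R[j]; every other contribution is the 0 it was initialized to.
// E.g. with pr = 2 and matched weight a(mj, j) = 3.5 stored on the upper
// process, the column-wise Allreduce yields 0 + 3.5 = 3.5 on both processes.
// -----------------------------------------------------------------------------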
270 
271 
272 
273 
274 template <class IT, class NT,class DER>
275 NT Trace( SpParMat < IT, NT, DER > & A, IT& rettrnnz=0)
276 {
277 
278  IT nrows = A.getnrow();
279  IT ncols = A.getncol();
280  auto commGrid = A.getcommgrid();
281  MPI_Comm World = commGrid->GetWorld();
282  int myrank=commGrid->GetRank();
283  int pr = commGrid->GetGridRows();
284  int pc = commGrid->GetGridCols();
285 
286 
287  //Information about the matrix distribution
288  //Assume that A is an nrow x ncol matrix
289  //The local submatrix is an lnrow x lncol matrix
290  int rowrank = commGrid->GetRankInProcRow();
291  int colrank = commGrid->GetRankInProcCol();
292  IT m_perproc = nrows / pr;
293  IT n_perproc = ncols / pc;
294  DER* spSeq = A.seqptr(); // local submatrix
295  IT localRowStart = colrank * m_perproc; // first row in this process
296  IT localColStart = rowrank * n_perproc; // first col in this process
297 
298 
299  IT trnnz = 0;
300  NT trace = 0.0;
301  for(auto colit = spSeq->begcol(); colit != spSeq->endcol(); ++colit) // iterate over columns
302  {
303  IT lj = colit.colid(); // local numbering
304  IT j = lj + localColStart;
305 
306  for(auto nzit = spSeq->begnz(colit); nzit < spSeq->endnz(colit); ++nzit)
307  {
308 
309  IT li = nzit.rowid();
310  IT i = li + localRowStart;
311  if( i == j)
312  {
313  trnnz ++;
314  trace += nzit.value();
315 
316  }
317  }
318 
319  }
320  MPI_Allreduce(MPI_IN_PLACE, &trnnz, 1, MPIType<IT>(), MPI_SUM, World);
321  MPI_Allreduce(MPI_IN_PLACE, &trace, 1, MPIType<NT>(), MPI_SUM, World);
322  rettrnnz = trnnz;
323  /*
324  if(myrank==0)
325  cout <<"nrows: " << nrows << " Nnz in the diag: " << trnnz << " sum of diag: " << trace << endl;
326  */
327  return trace;
328 
329 }
330 
331 
332 template <class NT>
333 NT MatchingWeight( std::vector<NT>& RepMateWC2R, MPI_Comm RowWorld, NT& minw)
334 {
335  NT w = 0;
336  minw = 99999999999999.0;
337  for(int i=0; i<RepMateWC2R.size(); i++)
338  {
339  //w += fabs(RepMateWC2R[i]);
340  //w += exp(RepMateWC2R[i]);
341  //minw = min(minw, exp(RepMateWC2R[i]));
342 
343  w += RepMateWC2R[i];
344  minw = std::min(minw, RepMateWC2R[i]);
345  }
346 
347  MPI_Allreduce(MPI_IN_PLACE, &w, 1, MPIType<NT>(), MPI_SUM, RowWorld);
348  MPI_Allreduce(MPI_IN_PLACE, &minw, 1, MPIType<NT>(), MPI_MIN, RowWorld);
349  return w;
350 }
351 
352 
353 
354 
355 
356 // update the distributed mate vectors from replicated mate vectors
357 template <class IT>
358 void UpdateMatching(FullyDistVec<IT, IT>& mateRow2Col, FullyDistVec<IT, IT>& mateCol2Row, std::vector<IT>& RepMateR2C, std::vector<IT>& RepMateC2R)
359 {
360 
361  auto commGrid = mateRow2Col.getcommgrid();
362  MPI_Comm RowWorld = commGrid->GetRowWorld();
363  int rowroot = commGrid->GetDiagOfProcRow();
364  int pc = commGrid->GetGridCols();
365 
366  // mateRow2Col is easy
367  IT localLenR2C = mateRow2Col.LocArrSize();
368  //IT* localR2C = mateRow2Col.GetLocArr();
369  for(IT i=0, j = mateRow2Col.RowLenUntil(); i<localLenR2C; i++, j++)
370  {
371  mateRow2Col.SetLocalElement(i, RepMateR2C[j]);
372  //localR2C[i] = RepMateR2C[j];
373  }
374 
375 
376  // mateCol2Row requires communication
377  std::vector <int> sendcnts(pc);
378  std::vector <int> dpls(pc);
379  dpls[0] = 0;
380  for(int i=1; i<pc; i++)
381  {
382  dpls[i] = mateCol2Row.RowLenUntil(i);
383  sendcnts[i-1] = dpls[i] - dpls[i-1];
384  }
385  sendcnts[pc-1] = RepMateC2R.size() - dpls[pc-1];
386 
387  IT localLenC2R = mateCol2Row.LocArrSize();
388  IT* localC2R = (IT*) mateCol2Row.GetLocArr();
389  MPI_Scatterv(RepMateC2R.data(),sendcnts.data(), dpls.data(), MPIType<IT>(), localC2R, localLenC2R, MPIType<IT>(),rowroot, RowWorld);
390 }
391 
392 
393 
394 int ThreadBuffLenForBinning(int itemsize, int nbins)
395 {
396  // KNL has a 1MB L2 cache shared by 2 cores; the default below is a conservative per-thread budget
397 #ifndef L2_CACHE_SIZE
398 #define L2_CACHE_SIZE 256000
399 #endif
400  int THREAD_BUF_LEN = 256;
401  while(true)
402  {
403  int bufferMem = THREAD_BUF_LEN * nbins * itemsize ;
404  if(bufferMem>L2_CACHE_SIZE ) THREAD_BUF_LEN/=2;
405  else break;
406  }
407  THREAD_BUF_LEN = std::min(nbins+1,THREAD_BUF_LEN);
408 
409  return THREAD_BUF_LEN;
410 }
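// --- Editor's illustration (not part of the original header) ----------------
// Worked example of the buffer sizing above, assuming the default
// L2_CACHE_SIZE of 256000 bytes, itemsize = 24 (a std::tuple<IT,IT,NT> with
// 64-bit IT and NT, as used in Phase1) and nbins = 1024 processes:
//   256 * 1024 * 24 = 6291456 > 256000  -> halve to 128
//   128 * 1024 * 24 = 3145728 > 256000  -> halve to 64
//    ...
//     8 * 1024 * 24 =  196608 <= 256000 -> stop
// Each thread therefore buffers up to 8 tuples per destination before flushing
// them into the shared send buffer with __sync_fetch_and_add.
// -----------------------------------------------------------------------------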
411 
412 
413 
414 template <class IT, class NT>
415 std::vector< std::tuple<IT,IT,NT> > Phase1(const AWPM_param<IT>& param, Dcsc<IT, NT>* dcsc, const std::vector<IT>& colptr, const std::vector<IT>& RepMateR2C, const std::vector<IT>& RepMateC2R, const std::vector<NT>& RepMateWR2C, const std::vector<NT>& RepMateWC2R )
416 {
417 
418 
419 
420  double tstart = MPI_Wtime();
421 
422 
423  MPI_Comm World = param.commGrid->GetWorld();
424 
425 
426 
427  //Step 1: Count the amount of data to be sent to different processors
428  std::vector<int> sendcnt(param.nprocs,0); // number items to be sent to each processor
429 
430 
431 #ifdef THREADED
432 #pragma omp parallel
433 #endif
434  {
435  std::vector<int> tsendcnt(param.nprocs,0);
436 #ifdef THREADED
437 #pragma omp for
438 #endif
439  for(int k=0; k<param.lncol; ++k)
440  {
441  IT mj = RepMateC2R[k]; // lj = k
442  IT j = k + param.localColStart;
443 
444  for(IT cp = colptr[k]; cp < colptr[k+1]; ++cp)
445  {
446  IT li = dcsc->ir[cp];
447  IT i = li + param.localRowStart;
448  IT mi = RepMateR2C[li];
449  if( i > mj) // TODO: stop when we first reach this case; maybe use < instead
450  {
451 
452  int rrank = param.m_perproc != 0 ? std::min(static_cast<int>(mj / param.m_perproc), param.pr-1) : (param.pr-1);
453  int crank = param.n_perproc != 0 ? std::min(static_cast<int>(mi / param.n_perproc), param.pc-1) : (param.pc-1);
454  int owner = param.commGrid->GetRank(rrank , crank);
455  tsendcnt[owner]++;
456  }
457  }
458  }
459  for(int i=0; i<param.nprocs; i++)
460  {
461  __sync_fetch_and_add(sendcnt.data()+i, tsendcnt[i]);
462  }
463  }
464 
465 
466 
467 
468  IT totsend = std::accumulate(sendcnt.data(), sendcnt.data()+param.nprocs, static_cast<IT>(0));
469  std::vector<int> sdispls (param.nprocs, 0);
470  std::partial_sum(sendcnt.data(), sendcnt.data()+param.nprocs-1, sdispls.data()+1);
471 
472  std::vector< std::tuple<IT,IT,NT> > sendTuples(totsend);
473  std::vector<int> transferCount(param.nprocs,0);
474  int THREAD_BUF_LEN = ThreadBuffLenForBinning(24, param.nprocs);
475 
476 
477  //Step 2: Compile data to be sent to different processors
478 #ifdef THREADED
479 #pragma omp parallel
480 #endif
481  {
482  std::vector<int> tsendcnt(param.nprocs,0);
483  std::vector<std::tuple<IT,IT, NT>> tsendTuples (param.nprocs*THREAD_BUF_LEN);
484 #ifdef THREADED
485 #pragma omp for
486 #endif
487  for(int k=0; k<param.lncol; ++k)
488  {
489  IT mj = RepMateC2R[k];
490  IT lj = k;
491  IT j = k + param.localColStart;
492 
493  for(IT cp = colptr[k]; cp < colptr[k+1]; ++cp)
494  {
495  IT li = dcsc->ir[cp];
496  IT i = li + param.localRowStart;
497  IT mi = RepMateR2C[li];
498  if( i > mj) // TODO: stop when we first reach this case; maybe use < instead
499  {
500  double w = dcsc->numx[cp]- RepMateWR2C[li] - RepMateWC2R[lj];
501  int rrank = param.m_perproc != 0 ? std::min(static_cast<int>(mj / param.m_perproc), param.pr-1) : (param.pr-1);
502  int crank = param.n_perproc != 0 ? std::min(static_cast<int>(mi / param.n_perproc), param.pc-1) : (param.pc-1);
503  int owner = param.commGrid->GetRank(rrank , crank);
504 
505  if (tsendcnt[owner] < THREAD_BUF_LEN)
506  {
507  tsendTuples[THREAD_BUF_LEN * owner + tsendcnt[owner]] = std::make_tuple(mi, mj, w);
508  tsendcnt[owner]++;
509  }
510  else
511  {
512  int tt = __sync_fetch_and_add(transferCount.data()+owner, THREAD_BUF_LEN);
513  copy( tsendTuples.data()+THREAD_BUF_LEN * owner, tsendTuples.data()+THREAD_BUF_LEN * (owner+1) , sendTuples.data() + sdispls[owner]+ tt);
514 
515  tsendTuples[THREAD_BUF_LEN * owner] = std::make_tuple(mi, mj, w);
516  tsendcnt[owner] = 1;
517  }
518  }
519  }
520  }
521  for(int owner=0; owner < param.nprocs; owner++)
522  {
523  if (tsendcnt[owner] >0)
524  {
525  int tt = __sync_fetch_and_add(transferCount.data()+owner, tsendcnt[owner]);
526  copy( tsendTuples.data()+THREAD_BUF_LEN * owner, tsendTuples.data()+THREAD_BUF_LEN * owner + tsendcnt[owner], sendTuples.data() + sdispls[owner]+ tt);
527  }
528  }
529  }
530 
531  t1Comp = MPI_Wtime() - tstart;
532  tstart = MPI_Wtime();
533 
534  // Step 3: Communicate data
535 
536  std::vector<int> recvcnt (param.nprocs);
537  std::vector<int> rdispls (param.nprocs, 0);
538 
539  MPI_Alltoall(sendcnt.data(), 1, MPI_INT, recvcnt.data(), 1, MPI_INT, World);
540  std::partial_sum(recvcnt.data(), recvcnt.data()+param.nprocs-1, rdispls.data()+1);
541  IT totrecv = std::accumulate(recvcnt.data(), recvcnt.data()+param.nprocs, static_cast<IT>(0));
542 
543 
544  MPI_Datatype MPI_tuple;
545  MPI_Type_contiguous(sizeof(std::tuple<IT,IT,NT>), MPI_CHAR, &MPI_tuple);
546  MPI_Type_commit(&MPI_tuple);
547 
548  std::vector< std::tuple<IT,IT,NT> > recvTuples1(totrecv);
549  MPI_Alltoallv(sendTuples.data(), sendcnt.data(), sdispls.data(), MPI_tuple, recvTuples1.data(), recvcnt.data(), rdispls.data(), MPI_tuple, World);
550  MPI_Type_free(&MPI_tuple);
551  t1Comm = MPI_Wtime() - tstart;
552  return recvTuples1;
553 }
554 
555 
556 
557 
558 
559 template <class IT, class NT>
560 std::vector< std::tuple<IT,IT,IT,NT> > Phase2(const AWPM_param<IT>& param, std::vector<std::tuple<IT,IT,NT>>& recvTuples, Dcsc<IT, NT>* dcsc, const std::vector<IT>& colptr, const std::vector<IT>& RepMateR2C, const std::vector<IT>& RepMateC2R, const std::vector<NT>& RepMateWR2C, const std::vector<NT>& RepMateWC2R )
561 {
562 
563  MPI_Comm World = param.commGrid->GetWorld();
564 
565  double tstart = MPI_Wtime();
566 
567  // Step 1: Sort for efficient searching of indices
568  __gnu_parallel::sort(recvTuples.begin(), recvTuples.end());
569  std::vector<std::vector<std::tuple<IT,IT, IT, NT>>> tempTuples1 (param.nprocs);
570 
571  std::vector<int> sendcnt(param.nprocs,0); // number items to be sent to each processor
572 
573  //Step 2: Count the amount of data to be sent to different processors
574  // Instead of a binary search in each column, we do a linear search
575  // Linear search is faster here because we need to visit 40%-50% of the nonzeros anyway
576  int nBins = 1;
577 #ifdef THREADED
578 #pragma omp parallel
579  {
580  nBins = omp_get_num_threads() * 4;
581  }
582 #endif
583 
584 #ifdef THREADED
585 #pragma omp parallel for
586 #endif
587  for(int i=0; i<nBins; i++)
588  {
589  int perBin = recvTuples.size()/nBins;
590  int startBinIndex = perBin * i;
591  int endBinIndex = perBin * (i+1);
592  if(i==nBins-1) endBinIndex = recvTuples.size();
593 
594 
595  std::vector<int> tsendcnt(param.nprocs,0);
596  for(int k=startBinIndex; k<endBinIndex;)
597  {
598 
599  IT mi = std::get<0>(recvTuples[k]);
600  IT lcol = mi - param.localColStart;
601  IT i = RepMateC2R[lcol];
602  IT idx1 = k;
603  IT idx2 = colptr[lcol];
604 
605  for(; std::get<0>(recvTuples[idx1]) == mi && idx2 < colptr[lcol+1];) //**
606  {
607 
608  IT mj = std::get<1>(recvTuples[idx1]) ;
609  IT lrow = mj - param.localRowStart;
610  IT j = RepMateR2C[lrow];
611  IT lrowMat = dcsc->ir[idx2];
612  if(lrowMat == lrow)
613  {
614  NT weight = std::get<2>(recvTuples[idx1]);
615  NT cw = weight + RepMateWR2C[lrow]; //w+W[M'[j],M[i]];
616  if (cw > 0)
617  {
618  int rrank = (param.m_perproc != 0) ? std::min(static_cast<int>(mj / param.m_perproc), param.pr-1) : (param.pr-1);
619  int crank = (param.n_perproc != 0) ? std::min(static_cast<int>(j / param.n_perproc), param.pc-1) : (param.pc-1);
620  int owner = param.commGrid->GetRank(rrank , crank);
621  tsendcnt[owner]++;
622  }
623 
624  idx1++; idx2++;
625  }
626  else if(lrowMat > lrow)
627  idx1 ++;
628  else
629  idx2 ++;
630  }
631 
632  for(;std::get<0>(recvTuples[idx1]) == mi ; idx1++);
633  k = idx1;
634 
635  }
636  for(int i=0; i<param.nprocs; i++)
637  {
638  __sync_fetch_and_add(sendcnt.data()+i, tsendcnt[i]);
639  }
640  }
641 
642 
643 
644 
645  IT totsend = std::accumulate(sendcnt.data(), sendcnt.data()+param.nprocs, static_cast<IT>(0));
646  std::vector<int> sdispls (param.nprocs, 0);
647  std::partial_sum(sendcnt.data(), sendcnt.data()+param.nprocs-1, sdispls.data()+1);
648 
649  std::vector< std::tuple<IT,IT,IT,NT> > sendTuples(totsend);
650  std::vector<int> transferCount(param.nprocs,0);
651  int THREAD_BUF_LEN = ThreadBuffLenForBinning(32, param.nprocs);
652 
653 
654  //Step 3: Compile data to be sent to different processors
655 #ifdef THREADED
656 #pragma omp parallel for
657 #endif
658  for(int i=0; i<nBins; i++)
659  {
660  int perBin = recvTuples.size()/nBins;
661  int startBinIndex = perBin * i;
662  int endBinIndex = perBin * (i+1);
663  if(i==nBins-1) endBinIndex = recvTuples.size();
664 
665 
666  std::vector<int> tsendcnt(param.nprocs,0);
667  std::vector<std::tuple<IT,IT, IT, NT>> tsendTuples (param.nprocs*THREAD_BUF_LEN);
668  for(int k=startBinIndex; k<endBinIndex;)
669  {
670  IT mi = std::get<0>(recvTuples[k]);
671  IT lcol = mi - param.localColStart;
672  IT i = RepMateC2R[lcol];
673  IT idx1 = k;
674  IT idx2 = colptr[lcol];
675 
676  for(; std::get<0>(recvTuples[idx1]) == mi && idx2 < colptr[lcol+1];) //**
677  {
678 
679  IT mj = std::get<1>(recvTuples[idx1]) ;
680  IT lrow = mj - param.localRowStart;
681  IT j = RepMateR2C[lrow];
682  IT lrowMat = dcsc->ir[idx2];
683  if(lrowMat == lrow)
684  {
685  NT weight = std::get<2>(recvTuples[idx1]);
686  NT cw = weight + RepMateWR2C[lrow]; //w+W[M'[j],M[i]];
687  if (cw > 0)
688  {
689  int rrank = (param.m_perproc != 0) ? std::min(static_cast<int>(mj / param.m_perproc), param.pr-1) : (param.pr-1);
690  int crank = (param.n_perproc != 0) ? std::min(static_cast<int>(j / param.n_perproc), param.pc-1) : (param.pc-1);
691  int owner = param.commGrid->GetRank(rrank , crank);
692 
693  if (tsendcnt[owner] < THREAD_BUF_LEN)
694  {
695  tsendTuples[THREAD_BUF_LEN * owner + tsendcnt[owner]] = std::make_tuple(mj, mi, i, cw);
696  tsendcnt[owner]++;
697  }
698  else
699  {
700  int tt = __sync_fetch_and_add(transferCount.data()+owner, THREAD_BUF_LEN);
701  std::copy( tsendTuples.data()+THREAD_BUF_LEN * owner, tsendTuples.data()+THREAD_BUF_LEN * (owner+1) , sendTuples.data() + sdispls[owner]+ tt);
702 
703  tsendTuples[THREAD_BUF_LEN * owner] = std::make_tuple(mj, mi, i, cw);
704  tsendcnt[owner] = 1;
705  }
706 
707  }
708 
709  idx1++; idx2++;
710  }
711  else if(lrowMat > lrow)
712  idx1 ++;
713  else
714  idx2 ++;
715  }
716 
717  for(;std::get<0>(recvTuples[idx1]) == mi ; idx1++);
718  k = idx1;
719  }
720 
721  for(int owner=0; owner < param.nprocs; owner++)
722  {
723  if (tsendcnt[owner] >0)
724  {
725  int tt = __sync_fetch_and_add(transferCount.data()+owner, tsendcnt[owner]);
726  std::copy( tsendTuples.data()+THREAD_BUF_LEN * owner, tsendTuples.data()+THREAD_BUF_LEN * owner + tsendcnt[owner], sendTuples.data() + sdispls[owner]+ tt);
727  }
728  }
729  }
730 
731  // Step 4: Communicate data
732 
733  t2Comp = MPI_Wtime() - tstart;
734  tstart = MPI_Wtime();
735 
736  std::vector<int> recvcnt (param.nprocs);
737  std::vector<int> rdispls (param.nprocs, 0);
738 
739  MPI_Alltoall(sendcnt.data(), 1, MPI_INT, recvcnt.data(), 1, MPI_INT, World);
740  std::partial_sum(recvcnt.data(), recvcnt.data()+param.nprocs-1, rdispls.data()+1);
741  IT totrecv = std::accumulate(recvcnt.data(), recvcnt.data()+param.nprocs, static_cast<IT>(0));
742 
743 
744  MPI_Datatype MPI_tuple;
745  MPI_Type_contiguous(sizeof(std::tuple<IT,IT,IT,NT>), MPI_CHAR, &MPI_tuple);
746  MPI_Type_commit(&MPI_tuple);
747 
748  std::vector< std::tuple<IT,IT,IT,NT> > recvTuples1(totrecv);
749  MPI_Alltoallv(sendTuples.data(), sendcnt.data(), sdispls.data(), MPI_tuple, recvTuples1.data(), recvcnt.data(), rdispls.data(), MPI_tuple, World);
750  MPI_Type_free(&MPI_tuple);
751  t2Comm = MPI_Wtime() - tstart;
752  return recvTuples1;
753 }
754 
755 
756 // Old version of Phase 2
757 // Not multithreaded (uses binary search)
758 template <class IT, class NT>
759 std::vector<std::vector<std::tuple<IT,IT, IT, NT>>> Phase2_old(const AWPM_param<IT>& param, std::vector<std::tuple<IT,IT,NT>>& recvTuples, Dcsc<IT, NT>* dcsc, const std::vector<IT>& colptr, const std::vector<IT>& RepMateR2C, const std::vector<IT>& RepMateC2R, const std::vector<NT>& RepMateWR2C, const std::vector<NT>& RepMateWC2R )
760 {
761 
762  std::vector<std::vector<std::tuple<IT,IT, IT, NT>>> tempTuples1 (param.nprocs);
763  for(int k=0; k<recvTuples.size(); ++k)
764  {
765  IT mi = std::get<0>(recvTuples[k]) ;
766  IT mj = std::get<1>(recvTuples[k]) ;
767  IT i = RepMateC2R[mi - param.localColStart];
768  NT weight = std::get<2>(recvTuples[k]);
769 
770  if(colptr[mi- param.localColStart+1] > colptr[mi- param.localColStart] )
771  {
772  IT * ele = find(dcsc->ir+colptr[mi - param.localColStart], dcsc->ir+colptr[mi - param.localColStart+1], mj - param.localRowStart);
773 
774  // TODO: Add a function that returns the edge weight directly
775  if (ele != dcsc->ir+colptr[mi - param.localColStart+1])
776  {
777  NT cw = weight + RepMateWR2C[mj - param.localRowStart]; //w+W[M'[j],M[i]];
778  if (cw > 0)
779  {
780  IT j = RepMateR2C[mj - param.localRowStart];
781  int rrank = (param.m_perproc != 0) ? std::min(static_cast<int>(mj / param.m_perproc), param.pr-1) : (param.pr-1);
782  int crank = (param.n_perproc != 0) ? std::min(static_cast<int>(j / param.n_perproc), param.pc-1) : (param.pc-1);
783  int owner = param.commGrid->GetRank(rrank , crank);
784  tempTuples1[owner].push_back(make_tuple(mj, mi, i, cw));
785  }
786  }
787  }
788  }
789 
790  return tempTuples1;
791 }
792 
793 template <class IT, class NT, class DER>
794 void TwoThirdApprox(SpParMat < IT, NT, DER > & A, FullyDistVec<IT, IT>& mateRow2Col, FullyDistVec<IT, IT>& mateCol2Row)
795 {
796 
797  // Information about CommGrid and matrix layout
798  // Assume that processes are laid in (pr x pc) process grid
799  auto commGrid = A.getcommgrid();
800  int myrank=commGrid->GetRank();
801  MPI_Comm World = commGrid->GetWorld();
802  MPI_Comm ColWorld = commGrid->GetColWorld();
803  MPI_Comm RowWorld = commGrid->GetRowWorld();
804  int nprocs = commGrid->GetSize();
805  int pr = commGrid->GetGridRows();
806  int pc = commGrid->GetGridCols();
807  int rowrank = commGrid->GetRankInProcRow();
808  int colrank = commGrid->GetRankInProcCol();
809  int diagneigh = commGrid->GetComplementRank();
810 
811  //Information about the matrix distribution
812  //Assume that A is an nrow x ncol matrix
813  //The local submatrix is an lnrow x lncol matrix
814  IT nrows = A.getnrow();
815  IT ncols = A.getncol();
816  IT nnz = A.getnnz();
817  IT m_perproc = nrows / pr;
818  IT n_perproc = ncols / pc;
819  DER* spSeq = A.seqptr(); // local submatrix
820  Dcsc<IT, NT>* dcsc = spSeq->GetDCSC();
821  IT lnrow = spSeq->getnrow();
822  IT lncol = spSeq->getncol();
823  IT localRowStart = colrank * m_perproc; // first row in this process
824  IT localColStart = rowrank * n_perproc; // first col in this process
825 
826  AWPM_param<IT> param;
827  param.nprocs = nprocs;
828  param.pr = pr;
829  param.pc = pc;
830  param.lncol = lncol;
831  param.lnrow = lnrow;
832  param.m_perproc = m_perproc;
833  param.n_perproc = n_perproc;
834  param.localRowStart = localRowStart;
835  param.localColStart = localColStart;
836  param.myrank = myrank;
837  param.commGrid = commGrid;
838 
839  double t1CompAll = 0, t1CommAll = 0, t2CompAll = 0, t2CommAll = 0, t3CompAll = 0, t3CommAll = 0, t4CompAll = 0, t4CommAll = 0, t5CompAll = 0, t5CommAll = 0, tUpdateMateCompAll = 0, tUpdateWeightAll = 0;
840 
841  // -----------------------------------------------------------
842  // replicate mate vectors for mateCol2Row
843  // Communication cost: same as the first communication of SpMV
844  // -----------------------------------------------------------
845  int xsize = (int) mateCol2Row.LocArrSize();
846  int trxsize = 0;
847  MPI_Status status;
848  MPI_Sendrecv(&xsize, 1, MPI_INT, diagneigh, TRX, &trxsize, 1, MPI_INT, diagneigh, TRX, World, &status);
849  std::vector<IT> trxnums(trxsize);
850  MPI_Sendrecv(mateCol2Row.GetLocArr(), xsize, MPIType<IT>(), diagneigh, TRX, trxnums.data(), trxsize, MPIType<IT>(), diagneigh, TRX, World, &status);
851 
852 
853  std::vector<int> colsize(pc);
854  colsize[colrank] = trxsize;
855  MPI_Allgather(MPI_IN_PLACE, 1, MPI_INT, colsize.data(), 1, MPI_INT, ColWorld);
856  std::vector<int> dpls(pc,0); // displacements (zero initialized pid)
857  std::partial_sum(colsize.data(), colsize.data()+pc-1, dpls.data()+1);
858  int accsize = std::accumulate(colsize.data(), colsize.data()+pc, 0);
859  std::vector<IT> RepMateC2R(accsize);
860  MPI_Allgatherv(trxnums.data(), trxsize, MPIType<IT>(), RepMateC2R.data(), colsize.data(), dpls.data(), MPIType<IT>(), ColWorld);
861  // -----------------------------------------------------------
862 
863 
864  // -----------------------------------------------------------
865  // replicate mate vectors for mateRow2Col
866  // Communication cost: same as the first communication of SpMV
867  // (minus the cost of transposing the vector)
868  // -----------------------------------------------------------
869 
870 
871  xsize = (int) mateRow2Col.LocArrSize();
872 
873  std::vector<int> rowsize(pr);
874  rowsize[rowrank] = xsize;
875  MPI_Allgather(MPI_IN_PLACE, 1, MPI_INT, rowsize.data(), 1, MPI_INT, RowWorld);
876  std::vector<int> rdpls(pr,0); // displacements (zero initialized pid)
877  std::partial_sum(rowsize.data(), rowsize.data()+pr-1, rdpls.data()+1);
878  accsize = std::accumulate(rowsize.data(), rowsize.data()+pr, 0);
879  std::vector<IT> RepMateR2C(accsize);
880  MPI_Allgatherv(mateRow2Col.GetLocArr(), xsize, MPIType<IT>(), RepMateR2C.data(), rowsize.data(), rdpls.data(), MPIType<IT>(), RowWorld);
881  // -----------------------------------------------------------
882 
883 
884 
885  // Getting column pointers for all columns (for CSC-style access)
886  std::vector<IT> colptr (lncol+1,-1);
887  for(auto colit = spSeq->begcol(); colit != spSeq->endcol(); ++colit) // iterate over all columns
888  {
889  IT lj = colit.colid(); // local numbering
890 
891  colptr[lj] = colit.colptr();
892  }
893  colptr[lncol] = spSeq->getnnz();
894  for(IT k=lncol-1; k>=0; k--)
895  {
896  if(colptr[k] == -1)
897  {
898  colptr[k] = colptr[k+1];
899  }
900  }
901  // TODO: will this fail for an empty local matrix, where every entry of colptr will be zero?
902 
903 
904  // -----------------------------------------------------------
905  // replicate weights of mates
906  // -----------------------------------------------------------
907  std::vector<NT> RepMateWR2C(lnrow);
908  std::vector<NT> RepMateWC2R(lncol);
909  ReplicateMateWeights(param, dcsc, colptr, RepMateC2R, RepMateWR2C, RepMateWC2R);
910 
911 
912  int iterations = 0;
913  NT minw;
914  NT weightCur = MatchingWeight(RepMateWC2R, RowWorld, minw);
915  NT weightPrev = weightCur - 999999999999;
916  while(weightCur > weightPrev && iterations++ < 10)
917  {
918 
919 
920  if(myrank==0) std::cout << "Iteration " << iterations << ". matching weight: sum = "<< weightCur << " min = " << minw << std::endl;
921  // C requests
922  // each row is for a processor where C requests will be sent to
923  double tstart;
924 
925 
926  std::vector<std::tuple<IT,IT,NT>> recvTuples = Phase1(param, dcsc, colptr, RepMateR2C, RepMateC2R, RepMateWR2C, RepMateWC2R );
927  std::vector<std::tuple<IT,IT,IT,NT>> recvTuples1 = Phase2(param, recvTuples, dcsc, colptr, RepMateR2C, RepMateC2R, RepMateWR2C, RepMateWC2R );
928  std::vector< std::tuple<IT,IT,NT> >().swap(recvTuples);
929 
930 
931 
932  tstart = MPI_Wtime();
933 
934  std::vector<std::tuple<IT,IT,IT,NT>> bestTuplesPhase3 (lncol);
935 #ifdef THREADED
936 #pragma omp parallel for
937 #endif
938  for(int k=0; k<lncol; ++k)
939  {
940  bestTuplesPhase3[k] = std::make_tuple(-1,-1,-1,0); // fix this
941  }
942 
943  for(int k=0; k<recvTuples1.size(); ++k)
944  {
945  IT mj = std::get<0>(recvTuples1[k]) ;
946  IT mi = std::get<1>(recvTuples1[k]) ;
947  IT i = std::get<2>(recvTuples1[k]) ;
948  NT weight = std::get<3>(recvTuples1[k]);
949  IT j = RepMateR2C[mj - localRowStart];
950  IT lj = j - localColStart;
951 
952  // we can get rid of the first check if edge weights are non negative
953  if( (std::get<0>(bestTuplesPhase3[lj]) == -1) || (weight > std::get<3>(bestTuplesPhase3[lj])) )
954  {
955  bestTuplesPhase3[lj] = std::make_tuple(i,mi,mj,weight);
956  }
957  }
958 
959  std::vector<std::vector<std::tuple<IT,IT, IT, NT>>> tempTuples1 (nprocs);
960  for(int k=0; k<lncol; ++k)
961  {
962  if( std::get<0>(bestTuplesPhase3[k]) != -1)
963  {
964  //IT j = RepMateR2C[mj - localRowStart]; /// fix me
965 
966  IT i = std::get<0>(bestTuplesPhase3[k]) ;
967  IT mi = std::get<1>(bestTuplesPhase3[k]) ;
968  IT mj = std::get<2>(bestTuplesPhase3[k]) ;
969  IT j = RepMateR2C[mj - localRowStart];
970  NT weight = std::get<3>(bestTuplesPhase3[k]);
971  int owner = OwnerProcs(A, i, mi, nrows, ncols);
972 
973  tempTuples1[owner].push_back(std::make_tuple(i, j, mj, weight));
974  }
975  }
976 
977  //vector< tuple<IT,IT,IT, NT> >().swap(recvTuples1);
978  double t3Comp = MPI_Wtime() - tstart;
979  tstart = MPI_Wtime();
980  recvTuples1 = ExchangeData1(tempTuples1, World);
981  double t3Comm = MPI_Wtime() - tstart;
982  tstart = MPI_Wtime();
983 
984  std::vector<std::tuple<IT,IT,IT,IT, NT>> bestTuplesPhase4 (lncol);
985  // we could have used lnrow in both bestTuplesPhase3 and bestTuplesPhase4
986 
987  // Phase 4
988  // at the owner of (i,mi)
989 #ifdef THREADED
990 #pragma omp parallel for
991 #endif
992  for(int k=0; k<lncol; ++k)
993  {
994  bestTuplesPhase4[k] = std::make_tuple(-1,-1,-1,-1,0);
995  }
996 
997  for(int k=0; k<recvTuples1.size(); ++k)
998  {
999  IT i = std::get<0>(recvTuples1[k]) ;
1000  IT j = std::get<1>(recvTuples1[k]) ;
1001  IT mj = std::get<2>(recvTuples1[k]) ;
1002  IT mi = RepMateR2C[i-localRowStart];
1003  NT weight = std::get<3>(recvTuples1[k]);
1004  IT lmi = mi - localColStart;
1005  //IT lj = j - localColStart;
1006 
1007  // cout <<"****" << i << " " << mi << " "<< j << " " << mj << " " << get<0>(bestTuplesPhase4[lj]) << endl;
1008  // we can get rid of the first check if edge weights are non negative
1009  if( ((std::get<0>(bestTuplesPhase4[lmi]) == -1) || (weight > std::get<4>(bestTuplesPhase4[lmi]))) && std::get<0>(bestTuplesPhase3[lmi])==-1 )
1010  {
1011  bestTuplesPhase4[lmi] = std::make_tuple(i,j,mi,mj,weight);
1012  //cout << "(("<< i << " " << mi << " "<< j << " " << mj << "))"<< endl;
1013  }
1014  }
1015 
1016 
1017  std::vector<std::vector<std::tuple<IT,IT,IT, IT>>> winnerTuples (nprocs);
1018 
1019 
1020  for(int k=0; k<lncol; ++k)
1021  {
1022  if( std::get<0>(bestTuplesPhase4[k]) != -1)
1023  {
1024  //int owner = OwnerProcs(A, get<0>(bestTuples[k]), get<1>(bestTuples[k]), nrows, ncols); // (i,mi)
1025  //tempTuples[owner].push_back(bestTuples[k]);
1026  IT i = std::get<0>(bestTuplesPhase4[k]) ;
1027  IT j = std::get<1>(bestTuplesPhase4[k]) ;
1028  IT mi = std::get<2>(bestTuplesPhase4[k]) ;
1029  IT mj = std::get<3>(bestTuplesPhase4[k]) ;
1030 
1031 
1032  int owner = OwnerProcs(A, mj, j, nrows, ncols);
1033  winnerTuples[owner].push_back(std::make_tuple(i, j, mi, mj));
1034 
1036  // passing the opposite of the matching to the owner of (i,mi)
1037  owner = OwnerProcs(A, i, mi, nrows, ncols);
1038  winnerTuples[owner].push_back(std::make_tuple(mj, mi, j, i));
1039  }
1040  }
1041 
1042 
1043  //vector< tuple<IT,IT,IT, NT> >().swap(recvTuples1);
1044  double t4Comp = MPI_Wtime() - tstart;
1045  tstart = MPI_Wtime();
1046 
1047  std::vector<std::tuple<IT,IT,IT,IT>> recvWinnerTuples = ExchangeData1(winnerTuples, World);
1048 
1049  double t4Comm = MPI_Wtime() - tstart;
1050  tstart = MPI_Wtime();
1051 
1052  // at the owner of (mj,j)
1053  std::vector<std::tuple<IT,IT>> rowBcastTuples(recvWinnerTuples.size()); //(mi,mj)
1054  std::vector<std::tuple<IT,IT>> colBcastTuples(recvWinnerTuples.size()); //(j,i)
1055 #ifdef THREADED
1056 #pragma omp parallel for
1057 #endif
1058  for(int k=0; k<recvWinnerTuples.size(); ++k)
1059  {
1060  IT i = std::get<0>(recvWinnerTuples[k]) ;
1061  IT j = std::get<1>(recvWinnerTuples[k]) ;
1062  IT mi = std::get<2>(recvWinnerTuples[k]) ;
1063  IT mj = std::get<3>(recvWinnerTuples[k]);
1064 
1065  colBcastTuples[k] = std::make_tuple(j,i);
1066  rowBcastTuples[k] = std::make_tuple(mj,mi);
1067  }
1068  double t5Comp = MPI_Wtime() - tstart;
1069  tstart = MPI_Wtime();
1070 
1071  std::vector<std::tuple<IT,IT>> updatedR2C = MateBcast(rowBcastTuples, RowWorld);
1072  std::vector<std::tuple<IT,IT>> updatedC2R = MateBcast(colBcastTuples, ColWorld);
1073 
1074  double t5Comm = MPI_Wtime() - tstart;
1075  tstart = MPI_Wtime();
1076 
1077 #ifdef THREADED
1078 #pragma omp parallel for
1079 #endif
1080  for(int k=0; k<updatedR2C.size(); k++)
1081  {
1082  IT row = std::get<0>(updatedR2C[k]);
1083  IT mate = std::get<1>(updatedR2C[k]);
1084  RepMateR2C[row-localRowStart] = mate;
1085  }
1086 
1087 #ifdef THREADED
1088 #pragma omp parallel for
1089 #endif
1090  for(int k=0; k<updatedC2R.size(); k++)
1091  {
1092  IT col = std::get<0>(updatedC2R[k]);
1093  IT mate = std::get<1>(updatedC2R[k]);
1094  RepMateC2R[col-localColStart] = mate;
1095  }
1096 
1097 
1098  double tUpdateMateComp = MPI_Wtime() - tstart;
1099  tstart = MPI_Wtime();
1100  // update weights of matched edges
1101  // we can do better than this since we are doing sparse updates
1102  ReplicateMateWeights(param, dcsc, colptr, RepMateC2R, RepMateWR2C, RepMateWC2R);
1103  double tUpdateWeight = MPI_Wtime() - tstart;
1104 
1105 
1106  weightPrev = weightCur;
1107  weightCur = MatchingWeight(RepMateWC2R, RowWorld, minw);
1108 
1109 
1110  //UpdateMatching(mateRow2Col, mateCol2Row, RepMateR2C, RepMateC2R);
1111  //CheckMatching(mateRow2Col,mateCol2Row);
1112 
1113  if(myrank==0)
1114  {
1115  std::cout << t1Comp << " " << t1Comm << " "<< t2Comp << " " << t2Comm << " " << t3Comp << " " << t3Comm << " " << t4Comp << " " << t4Comm << " " << t5Comp << " " << t5Comm << " " << tUpdateMateComp << " " << tUpdateWeight << std::endl;
1116 
1117  t1CompAll += t1Comp;
1118  t1CommAll += t1Comm;
1119  t2CompAll += t2Comp;
1120  t2CommAll += t2Comm;
1121  t3CompAll += t3Comp;
1122  t3CommAll += t3Comm;
1123  t4CompAll += t4Comp;
1124  t4CommAll += t4Comm;
1125  t5CompAll += t5Comp;
1126  t5CommAll += t5Comm;
1127  tUpdateMateCompAll += tUpdateMateComp;
1128  tUpdateWeightAll += tUpdateWeight;
1129 
1130  }
1131  }
1132 
1133  if(myrank==0)
1134  {
1135  std::cout << "=========== overall timing ==========" << std::endl;
1136  std::cout << t1CompAll << " " << t1CommAll << " " << t2CompAll << " " << t2CommAll << " " << t3CompAll << " " << t3CommAll << " " << t4CompAll << " " << t4CommAll << " " << t5CompAll << " " << t5CommAll << " " << tUpdateMateCompAll << " " << tUpdateWeightAll << std::endl;
1137  }
1138 
1139  // update the distributed mate vectors from replicated mate vectors
1140  UpdateMatching(mateRow2Col, mateCol2Row, RepMateR2C, RepMateC2R);
1141  //weightCur = MatchingWeight(RepMateWC2R, RowWorld);
1142 
1143 
1144 
1145 }
1146 
1147 
1148  template <class IT, class NT, class DER>
1149  void TransformWeight(SpParMat < IT, NT, DER > & A, bool applylog)
1150  {
1151  //A.Apply([](NT val){return log(1+abs(val));});
1152  // if the matrix has explicit zero entries, we can still have problem.
1153  // One solution is to remove explicit zero entries before cardinality matching (to be tested)
1154  //A.Apply([](NT val){if(val==0) return log(numeric_limits<NT>::min()); else return log(fabs(val));});
1155  A.Apply([](NT val){return (fabs(val));});
1156 
1157  FullyDistVec<IT, NT> maxvRow(A.getcommgrid());
1158  A.Reduce(maxvRow, Row, maximum<NT>(), static_cast<NT>(numeric_limits<NT>::lowest()));
1159  A.DimApply(Row, maxvRow, [](NT val, NT maxval){return val/maxval;});
1160 
1161  FullyDistVec<IT, NT> maxvCol(A.getcommgrid());
1162  A.Reduce(maxvCol, Column, maximum<NT>(), static_cast<NT>(numeric_limits<NT>::lowest()));
1163  A.DimApply(Column, maxvCol, [](NT val, NT maxval){return val/maxval;});
1164 
1165  if(applylog)
1166  A.Apply([](NT val){return log(val);});
1167 
1168  }
1169 
1170  template <class IT, class NT>
1171  void AWPM(SpParMat < IT, NT, SpDCCols<IT, NT> > & A1, FullyDistVec<IT, IT>& mateRow2Col, FullyDistVec<IT, IT>& mateCol2Row, bool optimizeProd=true)
1172  {
1173  SpParMat < IT, NT, SpDCCols<IT, NT> > A(A1); // creating a copy because it is being transformed
1174 
1175  if(optimizeProd)
1176  TransformWeight(A, true);
1177  else
1178  TransformWeight(A, false);
1181  FullyDistVec<IT, IT> degCol(A.getcommgrid());
1182  Abool.Reduce(degCol, Column, plus<IT>(), static_cast<IT>(0));
1183  double ts;
1184 
1185  // Compute the initial trace
1186  IT diagnnz;
1187  double origWeight = Trace(A, diagnnz);
1188  bool isOriginalPerfect = diagnnz==A.getnrow();
1189 
1190  // compute the maximal matching
1191  WeightedGreedy(Acsc, mateRow2Col, mateCol2Row, degCol);
1192  double mclWeight = MatchingWeight( A, mateRow2Col, mateCol2Row);
1193  SpParHelper::Print("After Greedy sanity check\n");
1194  bool isPerfectMCL = CheckMatching(mateRow2Col,mateCol2Row);
1195 
1196  // if the original matrix has a perfect matching and better weight
1197  if(isOriginalPerfect && mclWeight<=origWeight)
1198  {
1199  SpParHelper::Print("Maximal matching is not better than the natural ordering. Hence, keeping the natural ordering.\n");
1200  mateRow2Col.iota(A.getnrow(), 0);
1201  mateCol2Row.iota(A.getncol(), 0);
1202  mclWeight = origWeight;
1203  isPerfectMCL = true;
1204  }
1205 
1206 
1207  // MCM
1208  double tmcm = 0;
1209  double mcmWeight = mclWeight;
1210  if(!isPerfectMCL) // run MCM only if we don't have a perfect matching
1211  {
1212  ts = MPI_Wtime();
1213  maximumMatching(Acsc, mateRow2Col, mateCol2Row, true, false, true);
1214  tmcm = MPI_Wtime() - ts;
1215  mcmWeight = MatchingWeight( A, mateRow2Col, mateCol2Row) ;
1216  SpParHelper::Print("After MCM sanity check\n");
1217  CheckMatching(mateRow2Col,mateCol2Row);
1218  }
1219 
1220 
1221  // AWPM
1222  ts = MPI_Wtime();
1223  TwoThirdApprox(A, mateRow2Col, mateCol2Row);
1224  double tawpm = MPI_Wtime() - ts;
1225 
1226  double awpmWeight = MatchingWeight( A, mateRow2Col, mateCol2Row) ;
1227  SpParHelper::Print("After AWPM sanity check\n");
1228  CheckMatching(mateRow2Col,mateCol2Row);
1229  if(isOriginalPerfect && awpmWeight<origWeight) // keep original
1230  {
1231  SpParHelper::Print("AWPM is not better than the natural ordering. Hence, keeping the natural ordering.\n");
1232  mateRow2Col.iota(A.getnrow(), 0);
1233  mateCol2Row.iota(A.getncol(), 0);
1234  awpmWeight = origWeight;
1235  }
1236 
1237  }
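// --- Editor's illustration (not part of the original header) ----------------
// Minimal end-to-end usage sketch of AWPM, assuming a square matrix stored in
// a Matrix Market file and MPI already initialized; the CommGrid / I/O calls
// follow the usual CombBLAS driver pattern and the concrete file name is
// hypothetical.
//
//   std::shared_ptr<CommGrid> fullWorld(new CommGrid(MPI_COMM_WORLD, 0, 0));
//   SpParMat<int64_t, double, SpDCCols<int64_t, double>> A(fullWorld);
//   A.ParallelReadMM("matrix.mtx", true, maximum<double>());
//   FullyDistVec<int64_t, int64_t> mateRow2Col(fullWorld, A.getnrow(), (int64_t) -1);
//   FullyDistVec<int64_t, int64_t> mateCol2Row(fullWorld, A.getncol(), (int64_t) -1);
//   AWPM(A, mateRow2Col, mateCol2Row);   // A is copied internally before being transformed
// -----------------------------------------------------------------------------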
1238 
1239 }
1240 
1241 #endif /* ApproxWeightPerfectMatching_h */