COMBINATORIAL_BLAS  1.6
BPMaximumMatching.h
#ifndef BP_MAXIMUM_MATCHING_H
#define BP_MAXIMUM_MATCHING_H

#include "../CombBLAS.h"
#include <mpi.h>
#include <sys/time.h>
#include <iostream>
#include <functional>
#include <algorithm>
#include <vector>
#include <string>
#include <sstream>
#include "MatchingDefs.h"

namespace combblas {
// Build a boolean permutation-like matrix P from the dense vector ri:
// P has ri.TotalLength() rows and ncol columns, with P(i, ri[i]) = 1 for every i with 0 <= ri[i] < ncol.
template <class IT, class DER>
SpParMat<IT, bool, DER> PermMat (const FullyDistVec<IT, IT> & ri, const IT ncol)
{

    IT procsPerRow = ri.commGrid->GetGridCols();    // the number of processors in a row of the processor grid
    IT procsPerCol = ri.commGrid->GetGridRows();    // the number of processors in a column of the processor grid


    IT global_nrow = ri.TotalLength();
    IT global_ncol = ncol;
    IT m_perprocrow = global_nrow / procsPerRow;
    IT n_perproccol = global_ncol / procsPerCol;


    // The indices for FullyDistVec are offset'd to 1/p pieces
    // The matrix indices are offset'd to 1/sqrt(p) pieces
    // Add the corresponding offset before sending the data

    std::vector< std::vector<IT> > rowid(procsPerRow);  // rowid in the local matrix of each vector entry
    std::vector< std::vector<IT> > colid(procsPerRow);  // colid in the local matrix of each vector entry

    IT locvec = ri.arr.size();      // nnz in local vector
    IT roffset = ri.RowLenUntil();  // the number of vector elements in this processor row before the current processor
    for(typename std::vector<IT>::size_type i=0; i< (unsigned)locvec; ++i)
    {
        if(ri.arr[i]>=0 && ri.arr[i]<ncol)  // this is specialized for matching. TODO: make it general purpose by passing a function
        {
            IT rowrec = (n_perproccol!=0) ? std::min(ri.arr[i] / n_perproccol, procsPerRow-1) : (procsPerRow-1);
            // ri's numerical values give the colids and its local indices give rowids
            rowid[rowrec].push_back( i + roffset);
            colid[rowrec].push_back(ri.arr[i] - (rowrec * n_perproccol));
        }
    }


    int * sendcnt = new int[procsPerRow];
    int * recvcnt = new int[procsPerRow];
    for(IT i=0; i<procsPerRow; ++i)
    {
        sendcnt[i] = rowid[i].size();
    }

    MPI_Alltoall(sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, ri.commGrid->GetRowWorld());  // share the counts

    int * sdispls = new int[procsPerRow]();
    int * rdispls = new int[procsPerRow]();
    std::partial_sum(sendcnt, sendcnt+procsPerRow-1, sdispls+1);
    std::partial_sum(recvcnt, recvcnt+procsPerRow-1, rdispls+1);
    IT p_nnz = std::accumulate(recvcnt, recvcnt+procsPerRow, static_cast<IT>(0));


    IT * p_rows = new IT[p_nnz];
    IT * p_cols = new IT[p_nnz];
    IT * senddata = new IT[locvec];
    for(int i=0; i<procsPerRow; ++i)
    {
        std::copy(rowid[i].begin(), rowid[i].end(), senddata+sdispls[i]);
        std::vector<IT>().swap(rowid[i]);  // clear memory of rowid
    }
    MPI_Alltoallv(senddata, sendcnt, sdispls, MPIType<IT>(), p_rows, recvcnt, rdispls, MPIType<IT>(), ri.commGrid->GetRowWorld());

    for(int i=0; i<procsPerRow; ++i)
    {
        std::copy(colid[i].begin(), colid[i].end(), senddata+sdispls[i]);
        std::vector<IT>().swap(colid[i]);  // clear memory of colid
    }
    MPI_Alltoallv(senddata, sendcnt, sdispls, MPIType<IT>(), p_cols, recvcnt, rdispls, MPIType<IT>(), ri.commGrid->GetRowWorld());
    delete [] senddata;

    std::tuple<IT,IT,bool> * p_tuples = new std::tuple<IT,IT,bool>[p_nnz];
    for(IT i=0; i< p_nnz; ++i)
    {
        p_tuples[i] = std::make_tuple(p_rows[i], p_cols[i], 1);
    }
    DeleteAll(p_rows, p_cols);


    // Now create the local matrix
    IT local_nrow = ri.MyRowLength();
    int my_proccol = ri.commGrid->GetRankInProcRow();
    IT local_ncol = (my_proccol<(procsPerCol-1)) ? (n_perproccol) : (global_ncol - (n_perproccol*(procsPerCol-1)));

    // infer the concrete type SpMat<IT,bool>
    typedef typename create_trait<DER, IT, bool>::T_inferred DER_IT;
    DER_IT * PSeq = new DER_IT();
    PSeq->Create( p_nnz, local_nrow, local_ncol, p_tuples);  // deletion of p_tuples[] is handled by SpMat::Create

    SpParMat<IT,bool,DER_IT> P (PSeq, ri.commGrid);
    //Par_DCSC_Bool P (PSeq, ri.commGrid);
    return P;
}
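/*
 * Usage sketch (illustrative only; the concrete sequential type SpDCCols and
 * the 64-bit index type are assumptions made for this example, not requirements
 * of PermMat): given a matching vector over the rows of a distributed matrix A,
 * build the boolean matrix P with P(i, mateRow2Col[i]) = 1 for every matched row i.
 *
 *   typedef SpDCCols<int64_t, bool> BoolDCCols;
 *   // mateRow2Col[i] == -1 means row i is unmatched; such rows contribute no entry to P
 *   FullyDistVec<int64_t, int64_t> mateRow2Col(A.getcommgrid(), A.getnrow(), (int64_t) -1);
 *   SpParMat<int64_t, bool, BoolDCCols> P =
 *       PermMat<int64_t, BoolDCCols>(mateRow2Col, A.getncol());
 */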



/***************************************************************************
// Augment a matching by a set of vertex-disjoint augmenting paths.
// The paths are explored level by level, similar to a level-synchronous BFS.
// This approach is more efficient when we have many short augmenting paths.
***************************************************************************/

template <typename IT>
void AugmentLevel(FullyDistVec<IT, IT>& mateRow2Col, FullyDistVec<IT, IT>& mateCol2Row, FullyDistVec<IT, IT>& parentsRow, FullyDistVec<IT, IT>& leaves)
{

    IT nrow = mateRow2Col.TotalLength();
    IT ncol = mateCol2Row.TotalLength();
    FullyDistSpVec<IT, IT> col(leaves, [](IT leaf){return leaf!=-1;});  // indices are root columns, values are the unmatched leaf rows
    FullyDistSpVec<IT, IT> row(mateRow2Col.getcommgrid(), nrow);
    FullyDistSpVec<IT, IT> nextcol(col.getcommgrid(), ncol);

    while(col.getnnz()!=0)
    {

        row = col.Invert(nrow);
        row = EWiseApply<IT>(row, parentsRow,
                             [](IT root, IT parent){return parent;},
                             [](IT root, IT parent){return true;},
                             false, (IT)-1);

        col = row.Invert(ncol);  // children array
        nextcol = EWiseApply<IT>(col, mateCol2Row,
                                 [](IT child, IT mate){return mate;},
                                 [](IT child, IT mate){return mate!=-1;},
                                 false, (IT)-1);
        mateRow2Col.Set(row);
        mateCol2Row.Set(col);
        col = nextcol;
    }
}
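/*
 * Worked sketch of AugmentLevel (illustrative): consider one augmenting path
 *     c0 - r1 - c1 - r2,   with column c0 and row r2 unmatched and (r1,c1) matched,
 * discovered by the BFS so that parentsRow[r2] = c1, parentsRow[r1] = c0 and
 * leaves[c0] = r2.  Iteration 1 of the while loop matches (r2,c1) and queues
 * c1's former mate r1 via nextcol; iteration 2 matches (r1,c0), and since c0
 * had no mate, nextcol becomes empty and the loop terminates.  All augmenting
 * paths are flipped this way simultaneously, so the number of iterations is
 * proportional to the length of the longest augmenting path.
 */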


/***************************************************************************
// Augment a matching by a set of vertex-disjoint augmenting paths.
// An MPI process is responsible for a complete path.
// This approach is more efficient when we have few long augmenting paths.
// We used one-sided MPI. Any PGAS language should be fine as well.
// This function is not thread safe, hence multithreading is not used here.
 ***************************************************************************/

template <typename IT>
void AugmentPath(FullyDistVec<IT, IT>& mateRow2Col, FullyDistVec<IT, IT>& mateCol2Row, FullyDistVec<IT, IT>& parentsRow, FullyDistVec<IT, IT>& leaves)
{
    MPI_Win win_mateRow2Col, win_mateCol2Row, win_parentsRow;
    MPI_Win_create((IT*)mateRow2Col.GetLocArr(), mateRow2Col.LocArrSize() * sizeof(IT), sizeof(IT), MPI_INFO_NULL, mateRow2Col.commGrid->GetWorld(), &win_mateRow2Col);
    MPI_Win_create((IT*)mateCol2Row.GetLocArr(), mateCol2Row.LocArrSize() * sizeof(IT), sizeof(IT), MPI_INFO_NULL, mateCol2Row.commGrid->GetWorld(), &win_mateCol2Row);
    MPI_Win_create((IT*)parentsRow.GetLocArr(), parentsRow.LocArrSize() * sizeof(IT), sizeof(IT), MPI_INFO_NULL, parentsRow.commGrid->GetWorld(), &win_parentsRow);


    IT* leaves_ptr = (IT*) leaves.GetLocArr();
    //MPI_Win_fence(0, win_mateRow2Col);
    //MPI_Win_fence(0, win_mateCol2Row);
    //MPI_Win_fence(0, win_parentsRow);

    IT row, col=100, nextrow;
    int owner_row, owner_col;
    IT locind_row, locind_col;
    int myrank;

    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);


    for(IT i=0; i<leaves.LocArrSize(); i++)
    {
        int depth=0;
        row = *(leaves_ptr+i);
        while(row != -1)
        {
            // read the parent column of the current row
            owner_row = mateRow2Col.Owner(row, locind_row);
            MPI_Win_lock(MPI_LOCK_SHARED, owner_row, 0, win_parentsRow);
            MPI_Get(&col, 1, MPIType<IT>(), owner_row, locind_row, 1, MPIType<IT>(), win_parentsRow);
            MPI_Win_unlock(owner_row, win_parentsRow);

            // atomically swap the new mate into mateCol2Row; the old mate is the next row on the path
            owner_col = mateCol2Row.Owner(col, locind_col);
            MPI_Win_lock(MPI_LOCK_SHARED, owner_col, 0, win_mateCol2Row);
            MPI_Fetch_and_op(&row, &nextrow, MPIType<IT>(), owner_col, locind_col, MPI_REPLACE, win_mateCol2Row);
            MPI_Win_unlock(owner_col, win_mateCol2Row);

            MPI_Win_lock(MPI_LOCK_SHARED, owner_row, 0, win_mateRow2Col);
            MPI_Put(&col, 1, MPIType<IT>(), owner_row, locind_row, 1, MPIType<IT>(), win_mateRow2Col);
            MPI_Win_unlock(owner_row, win_mateRow2Col);  // we need this, otherwise col might get overwritten before the communication completes!
            row = nextrow;
        }
    }

    //MPI_Win_fence(0, win_mateRow2Col);
    //MPI_Win_fence(0, win_mateCol2Row);
    //MPI_Win_fence(0, win_parentsRow);

    MPI_Win_free(&win_mateRow2Col);
    MPI_Win_free(&win_mateCol2Row);
    MPI_Win_free(&win_parentsRow);
}
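/*
 * The per-step RMA protocol above, in isolation (a minimal sketch; the window
 * and displacement names are placeholders, not CombBLAS API): the key trick is
 * MPI_Fetch_and_op with MPI_REPLACE, which writes the new mate and returns the
 * old one in a single atomic operation, so the old mate doubles as the next
 * row on the alternating path.
 *
 *   IT newrow = row, oldrow;
 *   MPI_Win_lock(MPI_LOCK_SHARED, owner, 0, win_mate);
 *   MPI_Fetch_and_op(&newrow, &oldrow, MPIType<IT>(), owner, disp, MPI_REPLACE, win_mate);
 *   MPI_Win_unlock(owner, win_mate);
 *   // oldrow == -1 means the unmatched root column was reached; the path ends here.
 */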




// Maximum cardinality matching
// Output: mateRow2Col and mateCol2Row
template <typename IT, typename NT, typename DER>
void maximumMatching(SpParMat < IT, NT, DER > & A, FullyDistVec<IT, IT>& mateRow2Col,
                     FullyDistVec<IT, IT>& mateCol2Row, bool prune=true, bool randMM = false, bool maximizeWeight = false)
{

    typedef VertexTypeMM <IT> VertexType;  // vertex payload used during the BFS (defined in MatchingDefs.h)

    int nthreads=1;
#ifdef THREADED
#pragma omp parallel
    {
        nthreads = omp_get_num_threads();
    }
#endif
    PreAllocatedSPA<VertexType> SPA(A.seq(), nthreads*4);

    double tstart = MPI_Wtime();
    int nprocs, myrank;
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);

    IT nrow = A.getnrow();
    IT ncol = A.getncol();

    FullyDistSpVec<IT, VertexType> fringeRow(A.getcommgrid(), nrow);
    FullyDistSpVec<IT, IT> umFringeRow(A.getcommgrid(), nrow);
    FullyDistVec<IT, IT> leaves ( A.getcommgrid(), ncol, (IT) -1);

    std::vector<std::vector<double> > timing;
    std::vector<int> layers;
    std::vector<int64_t> phaseMatched;
    double t1, time_search, time_augment, time_phase;

    bool matched = true;
    int phase = 0;
    int totalLayer = 0;
    IT numUnmatchedCol;


    MPI_Win winLeaves;
    MPI_Win_create((IT*)leaves.GetLocArr(), leaves.LocArrSize() * sizeof(IT), sizeof(IT), MPI_INFO_NULL, A.getcommgrid()->GetWorld(), &winLeaves);


    while(matched)
    {
        time_phase = MPI_Wtime();

        std::vector<double> phase_timing(8,0);
        leaves.Apply ( [](IT val){return (IT) -1;});
        FullyDistVec<IT, IT> parentsRow ( A.getcommgrid(), nrow, (IT) -1);
        FullyDistSpVec<IT, VertexType> fringeCol(A.getcommgrid(), ncol);
        fringeCol = EWiseApply<VertexType>(fringeCol, mateCol2Row,
                                           [](VertexType vtx, IT mate){return vtx;},
                                           [](VertexType vtx, IT mate){return mate==-1;},
                                           true, VertexType());


        if(randMM)  // use random values for tie-breaking when selecting parents
        {
            fringeCol.ApplyInd([](VertexType vtx, IT idx){return VertexType(idx,idx,GlobalMT.rand());});
        }
        else
        {
            fringeCol.ApplyInd([](VertexType vtx, IT idx){return VertexType(idx,idx);});
        }

        ++phase;
        numUnmatchedCol = fringeCol.getnnz();
        int layer = 0;


        time_search = MPI_Wtime();
        while(fringeCol.getnnz() > 0)
        {
            layer++;
            t1 = MPI_Wtime();

            // TODO: think about this semiring
            if(maximizeWeight)
                SpMV<WeightMaxMMSR<NT, VertexType>>(A, fringeCol, fringeRow, false, SPA);
            else
                SpMV<Select2ndMinSR<NT, VertexType>>(A, fringeCol, fringeRow, false, SPA);
            phase_timing[0] += MPI_Wtime()-t1;



            // remove vertices that already have parents

            t1 = MPI_Wtime();
            fringeRow = EWiseApply<VertexType>(fringeRow, parentsRow,
                                               [](VertexType vtx, IT parent){return vtx;},
                                               [](VertexType vtx, IT parent){return parent==-1;},
                                               false, VertexType());

            // Set parent pointer
            parentsRow.EWiseApply(fringeRow,
                                  [](IT dval, VertexType svtx){return svtx.parent;},
                                  [](IT dval, VertexType svtx){return true;},
                                  false, VertexType());


            umFringeRow = EWiseApply<IT>(fringeRow, mateRow2Col,
                                         [](VertexType vtx, IT mate){return vtx.root;},
                                         [](VertexType vtx, IT mate){return mate==-1;},
                                         false, VertexType());

            phase_timing[1] += MPI_Wtime()-t1;


            IT nnz_umFringeRow = umFringeRow.getnnz();  // careful about this timing

            t1 = MPI_Wtime();
            if(nnz_umFringeRow > 0)
            {
                /*
                if(nnz_umFringeRow < 25*nprocs)
                {
                    leaves.GSet(umFringeRow,
                                [](IT valRoot, IT idxLeaf){return valRoot;},
                                [](IT valRoot, IT idxLeaf){return idxLeaf;},
                                winLeaves);
                    // There might be a bug here. It does not return the same output for different numbers of processes
                    // e.g., check with the g7jac200sc.mtx matrix
                }
                else*/
                {
                    FullyDistSpVec<IT, IT> temp1(A.getcommgrid(), ncol);
                    temp1 = umFringeRow.Invert(ncol);
                    leaves.Set(temp1);
                }
            }

            phase_timing[2] += MPI_Wtime()-t1;




            // matched row vertices in the fringe
            fringeRow = EWiseApply<VertexType>(fringeRow, mateRow2Col,
                                               [](VertexType vtx, IT mate){return VertexType(mate, vtx.root);},
                                               [](VertexType vtx, IT mate){return mate!=-1;},
                                               false, VertexType());

            t1 = MPI_Wtime();
            if(nnz_umFringeRow>0 && prune)
            {
                fringeRow.FilterByVal (umFringeRow, [](VertexType vtx){return vtx.root;}, false);
            }
            double tprune = MPI_Wtime()-t1;
            phase_timing[3] += tprune;


            // Go to matched column from matched row in the fringe. parent is automatically set to itself.
            t1 = MPI_Wtime();
            fringeCol = fringeRow.Invert(ncol,
                                         [](VertexType& vtx, const IT & index){return vtx.parent;},
                                         [](VertexType& vtx, const IT & index){return vtx;},
                                         [](VertexType& vtx1, VertexType& vtx2){return vtx1;});
            phase_timing[4] += MPI_Wtime()-t1;




        }
        time_search = MPI_Wtime() - time_search;
        phase_timing[5] += time_search;

        IT numMatchedCol = leaves.Count([](IT leaf){return leaf!=-1;});
        phaseMatched.push_back(numMatchedCol);
        time_augment = MPI_Wtime();
        if (numMatchedCol == 0) matched = false;
        else
        {
            if(numMatchedCol < (2 * nprocs * nprocs))  // few (likely long) paths: one process per path
                AugmentPath(mateRow2Col, mateCol2Row, parentsRow, leaves);
            else  // many (likely short) paths: level-synchronous augmentation
                AugmentLevel(mateRow2Col, mateCol2Row, parentsRow, leaves);
        }
        time_augment = MPI_Wtime() - time_augment;
        phase_timing[6] += time_augment;

        time_phase = MPI_Wtime() - time_phase;
        phase_timing[7] += time_phase;
        timing.push_back(phase_timing);
        totalLayer += layer;
        layers.push_back(layer);

    }


    MPI_Win_free(&winLeaves);

    tTotalMaximum = MPI_Wtime() - tstart;

    //isMaximalmatching(A, mateRow2Col, mateCol2Row, unmatchedRow, unmatchedCol);
    //isMatching(mateCol2Row, mateRow2Col); //todo: there is a better way to check this


    // print statistics
    double combTime;
    if(myrank == 0)
    {
        std::cout << "****** maximum matching runtime ********\n";
        std::cout << std::endl;
        std::cout << "========================================================================\n";
        std::cout << "                                BFS Search                              \n";
        std::cout << "===================== ==================================================\n";
        std::cout << "Phase Layer    Match   SpMV EWOpp CmUqL  Prun CmMC   BFS   Aug   Total\n";
        std::cout << "===================== ===================================================\n";

        std::vector<double> totalTimes(timing[0].size(),0);
        int nphases = timing.size();
        for(int i=0; i<timing.size(); i++)
        {
            printf(" %3d  %3d  %8lld  ", i+1, layers[i], phaseMatched[i]);
            for(int j=0; j<timing[i].size(); j++)
            {
                totalTimes[j] += timing[i][j];
                //timing[i][j] /= timing[i].back();
                printf("%.2lf  ", timing[i][j]);
            }

            printf("\n");
        }

        std::cout << "-----------------------------------------------------------------------\n";
        std::cout << "Phase Layer    UnMat   SpMV EWOpp CmUqL  Prun CmMC   BFS   Aug   Total \n";
        std::cout << "-----------------------------------------------------------------------\n";

        combTime = totalTimes.back();
        printf(" %3d  %3d  %8lld  ", nphases, totalLayer/nphases, numUnmatchedCol);
        for(int j=0; j<totalTimes.size()-1; j++)
        {
            printf("%.2lf  ", totalTimes[j]);
        }
        printf("%.2lf\n", combTime);
    }

    IT nrows = A.getnrow();
    IT matchedRow = mateRow2Col.Count([](IT mate){return mate!=-1;});
    if(myrank==0)
    {
        std::cout << "***Final Maximum Matching***\n";
        std::cout << "***Total-Rows Matched-Rows  Total Time***\n";
        printf("%lld %lld %lf \n", nrows, matchedRow, combTime);
        printf("matched rows: %lld , which is: %lf percent \n", matchedRow, 100*(double)matchedRow/(nrows));
        std::cout << "-------------------------------------------------------\n\n";
    }

}
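/*
 * Driver sketch (illustrative; the int64_t index type and the assumption that a
 * distributed matrix A already exists are choices made for this example):
 * initialize both mate vectors to -1 (meaning "unmatched"), optionally seed them
 * with a maximal matching first (see BPMaximalMatching.h), then call
 * maximumMatching and read the cardinality off mateRow2Col.
 *
 *   FullyDistVec<int64_t, int64_t> mateRow2Col(A.getcommgrid(), A.getnrow(), (int64_t) -1);
 *   FullyDistVec<int64_t, int64_t> mateCol2Row(A.getcommgrid(), A.getncol(), (int64_t) -1);
 *   maximumMatching(A, mateRow2Col, mateCol2Row, true, false, false);
 *   int64_t cardinality = mateRow2Col.Count([](int64_t mate){return mate != -1;});
 */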

}

#endif