COMBINATORIAL_BLAS  1.6
RCM.cpp
1 #ifdef THREADED
2 #ifndef _OPENMP
3 #define _OPENMP // should be defined before any COMBBLAS header is included
4 #endif
5 #include <omp.h>
6 #endif
7 
8 #include "CombBLAS/CombBLAS.h"
9 #include <mpi.h>
10 #include <sys/time.h>
11 #include <iostream>
12 #include <functional>
13 #include <algorithm>
14 #include <vector>
15 #include <string>
16 #include <sstream>
17 
18 
19 #define EDGEFACTOR 16
20 
21 #ifdef DETERMINISTIC
22 MTRand GlobalMT(1);
23 #else
24 MTRand GlobalMT; // generate random numbers with Mersenne Twister
25 #endif
26 
27 double cblas_alltoalltime;
28 double cblas_allgathertime;
29 double cblas_localspmvtime;
30 double cblas_mergeconttime;
31 double cblas_transvectime;
32 
33 
34 
35 using namespace std;
36 using namespace combblas;
37 
38 
39 
40 int threads, processors;
41 string base_filename;
42 
43 
44 
45 
46 template <typename PARMAT>
47 void Symmetricize(PARMAT & A)
48 {
49  // boolean addition is effectively a logical "or",
50  // so adding the transpose does not destroy any existing edges
51  PARMAT AT = A;
52  AT.Transpose();
53  AT.RemoveLoops(); // not needed for boolean matrices, but no harm in keeping it
54  A += AT;
55 }
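// RCM operates on an undirected graph, so the input pattern is symmetrized up
// front: A |= A^T over the boolean semiring keeps every existing edge and adds
// the missing reverse edges without changing any nonzero values.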
56 
57 
58 struct VertexType
59 {
60 public:
61  VertexType(int64_t ord=-1, int64_t deg=-1){order=ord; degree = deg;};
62 
63  friend bool operator<(const VertexType & vtx1, const VertexType & vtx2 )
64  {
65  if(vtx1.order==vtx2.order) return vtx1.degree < vtx2.degree;
66  else return vtx1.order<vtx2.order;
67  };
68  friend bool operator<=(const VertexType & vtx1, const VertexType & vtx2 )
69  {
70  if(vtx1.order==vtx2.order) return vtx1.degree <= vtx2.degree;
71  else return vtx1.order<vtx2.order;
72  };
73  friend bool operator>(const VertexType & vtx1, const VertexType & vtx2 )
74  {
75  if(vtx1.order==vtx2.order) return vtx1.degree > vtx2.degree;
76  else return vtx1.order>vtx2.order;
77  };
78  friend bool operator>=(const VertexType & vtx1, const VertexType & vtx2 )
79  {
80  if(vtx1.order==vtx2.order) return vtx1.degree >= vtx2.degree;
81  else return vtx1.order>vtx2.order;
82 
83  //if(vtx1.order==vtx2.order) return vtx1.degree <= vtx2.degree;
84  //else return vtx1.order<vtx2.order;
85  };
86  friend bool operator==(const VertexType & vtx1, const VertexType & vtx2 ){return vtx1.order==vtx2.order && vtx1.degree==vtx2.degree;};
87  friend ostream& operator<<(ostream& os, const VertexType & vertex ){os << "(" << vertex.order << "," << vertex.degree << ")"; return os;};
88  //private:
89  int64_t order;
90  int64_t degree;
91 };
92 
93 
94 
95 struct SelectMinSR
96 {
97  typedef int64_t T_promote;
98  static T_promote id(){ return -1; };
99  static bool returnedSAID() { return false; }
100  //static MPI_Op mpi_op() { return MPI_MIN; };
101 
102  static T_promote add(const T_promote & arg1, const T_promote & arg2)
103  {
104  return std::min(arg1, arg2);
105  }
106 
107  static T_promote multiply(const bool & arg1, const T_promote & arg2)
108  {
109  return arg2;
110  }
111 
112  static void axpy(bool a, const T_promote & x, T_promote & y)
113  {
114  y = std::min(y, x);
115  }
116 };
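// SelectMinSR is the (select2nd, min) semiring used by the BFS-style SpMV calls
// below: multiply() passes the fringe value through a boolean nonzero of A, and
// add() keeps the minimum, so each newly discovered vertex receives the smallest
// value (the order of its lowest-numbered parent) among all fringe neighbors.
// Worked example (hypothetical values): if the fringe holds {v3 -> 5, v7 -> 2}
// and vertex u is adjacent to both, the SpMV assigns u the value min(5, 2) = 2.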
117 
118 
119 typedef SpParMat<int64_t, bool, SpDCCols<int64_t, bool>> Par_DCSC_Bool;
120 typedef SpParMat<int64_t, int64_t, SpDCCols<int64_t, int64_t>> Par_DCSC_int64_t;
121 typedef SpParMat<int64_t, double, SpDCCols<int64_t, double>> Par_DCSC_Double;
122 typedef SpParMat<int64_t, bool, SpCCols<int64_t, bool>> Par_CSC_Bool;
123 
124 
125 
126 
127 FullyDistSpVec<int64_t, int64_t> getOrder(FullyDistSpVec<int64_t, VertexType> & fringeRow, int64_t startLabel, int64_t endLabel)
128 {
129 
130  int myrank, nprocs;
131  MPI_Comm_rank(MPI_COMM_WORLD,&myrank);
132  MPI_Comm_size(MPI_COMM_WORLD,&nprocs);
133 
134 
135  vector<int64_t> lind = fringeRow.GetLocalInd ();
136  vector<VertexType> lnum = fringeRow.GetLocalNum ();
137  int64_t ploclen = lind.size();
138 
139 
140  int64_t nparents = endLabel - startLabel + 1;
141  int64_t perproc = nparents/nprocs;
142 
143  int * rdispls = new int[nprocs+1];
144  int * recvcnt = new int[nprocs];
145  int * sendcnt = new int[nprocs](); // initialize to 0
146  int * sdispls = new int[nprocs+1];
147 
148  MPI_Barrier(MPI_COMM_WORLD);
149 
150 #ifdef _OPENMP
151 #pragma omp parallel for
152 #endif
153  for(int64_t k=0; k < ploclen; ++k)
154  {
155  int64_t temp = lnum[k].order-startLabel;
156  int owner;
157  if(perproc==0 || temp/perproc > nprocs-1)
158  owner = nprocs-1;
159  else
160  owner = temp/perproc;
161 
162 #ifdef _OPENMP
163  __sync_fetch_and_add(&sendcnt[owner], 1);
164 #else
165  sendcnt[owner]++;
166 #endif
167  }
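// The loop above computes, for every fringe entry, which process owns its
// parent's label under a block distribution: labels startLabel..endLabel are
// split into nprocs blocks of size perproc, with any remainder handled by the
// last process (hence the nprocs-1 fallback when temp/perproc overshoots).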
168 
169  MPI_Barrier(MPI_COMM_WORLD);
170 
171 
172  MPI_Alltoall(sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, MPI_COMM_WORLD); // share the request counts
173 
174  sdispls[0] = 0;
175  rdispls[0] = 0;
176  for(int i=0; i<nprocs; ++i)
177  {
178  sdispls[i+1] = sdispls[i] + sendcnt[i];
179  rdispls[i+1] = rdispls[i] + recvcnt[i];
180  }
181 
182 
183  int64_t * datbuf1 = new int64_t[ploclen];
184  int64_t * datbuf2 = new int64_t[ploclen];
185  int64_t * indbuf = new int64_t[ploclen];
186  int *count = new int[nprocs](); //current position
187 #ifdef _OPENMP
188 #pragma omp parallel for
189 #endif
190  for(int64_t i=0; i < ploclen; ++i)
191  {
192 
193  int64_t temp = lnum[i].order-startLabel;
194  int owner;
195  if(perproc==0 || temp/perproc > nprocs-1)
196  owner = nprocs-1;
197  else
198  owner = temp/perproc;
199 
200 
201  int id;
202 #ifdef _OPENMP
203  id = sdispls[owner] + __sync_fetch_and_add(&count[owner], 1);
204 #else
205  id = sdispls[owner] + count[owner];
206  count[owner]++;
207 #endif
208 
209  datbuf1[id] = temp;
210  datbuf2[id] = lnum[i].degree;
211  indbuf[id] = lind[i] + fringeRow.LengthUntil();
212  }
213  delete [] count;
214 
215  int64_t totrecv = rdispls[nprocs];
216  int64_t * recvdatbuf1 = new int64_t[totrecv];
217  int64_t * recvdatbuf2 = new int64_t[totrecv];
218  MPI_Alltoallv(datbuf1, sendcnt, sdispls, MPIType<int64_t>(), recvdatbuf1, recvcnt, rdispls, MPIType<int64_t>(), MPI_COMM_WORLD);
219  delete [] datbuf1;
220  MPI_Alltoallv(datbuf2, sendcnt, sdispls, MPIType<int64_t>(), recvdatbuf2, recvcnt, rdispls, MPIType<int64_t>(), MPI_COMM_WORLD);
221  delete [] datbuf2;
222 
223  int64_t * recvindbuf = new int64_t[totrecv];
224  MPI_Alltoallv(indbuf, sendcnt, sdispls, MPIType<int64_t>(), recvindbuf, recvcnt, rdispls, MPIType<int64_t>(), MPI_COMM_WORLD);
225  delete [] indbuf;
226 
227  tuple<int64_t,int64_t, int64_t>* tosort = static_cast<tuple<int64_t,int64_t, int64_t>*> (::operator new (sizeof(tuple<int64_t,int64_t, int64_t>)*totrecv));
228 
229 #ifdef _OPENMP
230 #pragma omp parallel for
231 #endif
232  for(int64_t i=0; i<totrecv; ++i)
233  {
234  tosort[i] = make_tuple(recvdatbuf1[i], recvdatbuf2[i], recvindbuf[i]);
235  }
236 
237 
238 #if defined(GNU_PARALLEL) && defined(_OPENMP)
239  __gnu_parallel::sort(tosort, tosort+totrecv);
240 #else
241  std::sort(tosort, tosort+totrecv);
242 #endif
243 
244  // send order back
245  int * sendcnt1 = new int[nprocs]();
246 
247 #ifdef _OPENMP
248 #pragma omp parallel for
249 #endif
250  for(int64_t k=0; k < totrecv; ++k)
251  {
252  int64_t locind;
253  int owner = fringeRow.Owner(get<2>(tosort[k]), locind);
254 #ifdef _OPENMP
255  __sync_fetch_and_add(&sendcnt1[owner], 1);
256 #else
257  sendcnt1[owner]++;
258 #endif
259  }
260 
261  MPI_Alltoall(sendcnt1, 1, MPI_INT, recvcnt, 1, MPI_INT, MPI_COMM_WORLD); // share the request counts
262 
263  sdispls[0] = 0;
264  rdispls[0] = 0;
265  for(int i=0; i<nprocs; ++i)
266  {
267  sdispls[i+1] = sdispls[i] + sendcnt1[i];
268  rdispls[i+1] = rdispls[i] + recvcnt[i];
269  }
270 
271 
272  vector<int64_t> sortperproc (nprocs);
273  sortperproc[myrank] = totrecv;
274  MPI_Allgather(MPI_IN_PLACE, 1, MPIType<int64_t>(), sortperproc.data(), 1, MPIType<int64_t>(), MPI_COMM_WORLD);
275 
276  vector<int64_t> disp(nprocs+1);
277  disp[0] = 0;
278  for(int i=0; i<nprocs; ++i)
279  {
280  disp[i+1] = disp[i] + sortperproc[i];
281  }
282 
283 
284 
285  ploclen = totrecv;
286 
287  int64_t * datbuf = new int64_t[ploclen];
288  indbuf = new int64_t[ploclen];
289  count = new int[nprocs](); //current position
290 #ifdef _OPENMP
291 #pragma omp parallel for
292 #endif
293  for(int64_t i=0; i < ploclen; ++i)
294  {
295  int64_t locind;
296  int owner = fringeRow.Owner(get<2>(tosort[i]), locind);
297  int id;
298 #ifdef _OPENMP
299  id = sdispls[owner] + __sync_fetch_and_add(&count[owner], 1);
300 #else
301  id = sdispls[owner] + count[owner];
302  count[owner]++;
303 #endif
304  datbuf[id] = i + disp[myrank] + endLabel + 1;
305  //cout << datbuf[id] << endl;
306  indbuf[id] = locind;
307  }
308  delete [] count;
309 
310 
311  totrecv = rdispls[nprocs];
312  vector<int64_t> recvdatbuf3 (totrecv);
313  MPI_Alltoallv(datbuf, sendcnt1, sdispls, MPIType<int64_t>(), recvdatbuf3.data(), recvcnt, rdispls, MPIType<int64_t>(), MPI_COMM_WORLD);
314  delete [] datbuf;
315 
316  vector<int64_t> recvindbuf3 (totrecv);
317  MPI_Alltoallv(indbuf, sendcnt1, sdispls, MPIType<int64_t>(), recvindbuf3.data(), recvcnt, rdispls, MPIType<int64_t>(), MPI_COMM_WORLD);
318  delete [] indbuf;
319 
320 
321  FullyDistSpVec<int64_t, int64_t> order(fringeRow.getcommgrid(), fringeRow.TotalLength(), recvindbuf3, recvdatbuf3);
322  DeleteAll(recvindbuf, recvdatbuf1, recvdatbuf2);
323  DeleteAll(sdispls, rdispls, sendcnt, sendcnt1, recvcnt);
324  ::operator delete(tosort);
325 
326  return order;
327 }
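// getOrder labels one BFS level: fringe vertices are shipped to the owners of
// their parents' labels, globally sorted by (parent order, degree, vertex index),
// and then assigned consecutive labels starting at endLabel+1. The result is a
// sparse vector mapping each fringe vertex to its new label, which the caller
// scatters back into the global ordering.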
328 
329 double torderSpMV=0, torderSort=0, torderOther=0;
330 // perform ordering from a pseudo-peripheral vertex
331 template <typename PARMAT>
332 void RCMOrder(PARMAT & A, int64_t source, FullyDistVec<int64_t, int64_t> & order, int64_t startOrder, FullyDistVec<int64_t, int64_t> degrees, PreAllocatedSPA<int64_t> & SPA)
333 {
334  int myrank;
335  MPI_Comm_rank(MPI_COMM_WORLD,&myrank);
336  double tSpMV=0, tOrder, tOther, tSpMV1, tsort=0, tsort1;
337  tOrder = MPI_Wtime();
338 
339  int64_t nv = A.getnrow();
340  FullyDistSpVec<int64_t, int64_t> fringe(A.getcommgrid(), nv );
341  order.SetElement(source, startOrder);
342  fringe.SetElement(source, startOrder);
343  int64_t curOrder = startOrder+1;
344 
345  if(myrank == 0) cout << " Computing the RCM ordering:" << endl;
346 
347 
348  int64_t startLabel = startOrder;
349  int64_t endLabel = startOrder;
350 
351 
352  while(startLabel <= endLabel) // continue until the frontier is empty
353  {
354 
355  fringe = EWiseApply<int64_t>(fringe, order,
356  [](int64_t parent_order, int64_t ord){return ord;},
357  [](int64_t parent_order, int64_t ord){return true;},
358  false, (int64_t) -1);
359 
360  tSpMV1 = MPI_Wtime();
361  SpMV<SelectMinSR>(A, fringe, fringe, false, SPA);
362  tSpMV += MPI_Wtime() - tSpMV1;
363  fringe = EWiseMult(fringe, order, true, (int64_t) -1);
364 
365  FullyDistSpVec<int64_t, VertexType> fringeRow = EWiseApply<VertexType>(fringe, degrees,
366  [](int64_t parent_order, int64_t degree){return VertexType(parent_order, degree);},
367  [](int64_t parent_order, int64_t degree){return true;},
368  false, (int64_t) -1);
369 
370  tsort1 = MPI_Wtime();
371  FullyDistSpVec<int64_t, int64_t> levelOrder = getOrder(fringeRow, startLabel, endLabel);
372  tsort += MPI_Wtime()-tsort1;
373  order.Set(levelOrder);
374  startLabel = endLabel + 1;
375  endLabel += fringe.getnnz();
376  }
377 
378  tOrder = MPI_Wtime() - tOrder;
379  tOther = tOrder - tSpMV - tsort;
380  if(myrank == 0)
381  {
382  cout << " Total time: " << tOrder << " seconds [SpMV: " << tSpMV << ", sorting: " << tsort << ", other: " << tOther << "]" << endl << endl;
383  }
384 
385  torderSpMV+=tSpMV; torderSort+=tsort; torderOther+=tOther;
386 }
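// RCMOrder performs the level-by-level Cuthill-McKee numbering: starting from
// the given source, each SpMV step discovers the next BFS level, already-ordered
// vertices are masked out with EWiseMult, and getOrder numbers the new level so
// that children of lower-numbered parents come first, with ties broken by degree.
// startLabel and endLabel bracket the labels held by the current frontier.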
387 
388 
389 template <typename PARMAT>
390 int64_t PseudoPeripheralVertex(PARMAT & A, FullyDistSpVec<int64_t, pair<int64_t, int64_t>>& unvisitedVertices, FullyDistVec<int64_t, int64_t> degrees, PreAllocatedSPA<int64_t>& SPA)
391 {
392 
393 
394  int myrank;
395  MPI_Comm_rank(MPI_COMM_WORLD,&myrank);
396  double tpvSpMV=0, tpvOther=0;
397  double tstart = MPI_Wtime();
398  int64_t prevLevel=-1, curLevel=0; // initialized just to get the first iteration going
399 
400 
401  // Select a minimum-degree unvisited vertex as the initial source
402  pair<int64_t, int64_t> mindegree_vertex = unvisitedVertices.Reduce(minimum<pair<int64_t, int64_t> >(), make_pair(LLONG_MAX, (int64_t)-1));
403  int64_t source = mindegree_vertex.second;
404 
405  // level structure of the current BFS tree
406  // the level values themselves are not used; the vector only serves as a visited flag
407  FullyDistVec<int64_t, int64_t> level ( A.getcommgrid(), A.getnrow(), (int64_t) -1);
408 
409  int iterations = 0;
410  double tSpMV=0, tOther=0, tSpMV1;
411  if(myrank == 0) cout << " Computing a pseudo-peripheral vertex:" << endl;
412  while(curLevel > prevLevel)
413  {
414  double tItr = MPI_Wtime();
415  prevLevel = curLevel;
416  FullyDistSpVec<int64_t, int64_t> fringe(A.getcommgrid(), A.getnrow() );
417  level = (int64_t)-1; // reset level structure in every iteration
418  level.SetElement(source, 1); // place source at level 1
419  fringe.SetElement(source, 1); // include source to the initial fringe
420  curLevel = 2;
421  while(fringe.getnnz() > 0) // continue until the frontier is empty
422  {
423  tSpMV1 = MPI_Wtime();
424  SpMV<SelectMinSR>(A, fringe, fringe, false, SPA);
425  tSpMV += MPI_Wtime() - tSpMV1;
426  fringe = EWiseMult(fringe, level, true, (int64_t) -1);
427  // set value to the current level
428  fringe=curLevel;
429  curLevel++;
430  level.Set(fringe);
431  }
432  curLevel = curLevel-2;
433 
434 
435  // last non-empty level (we can avoid this by keeping the last nonempty fringe)
436  fringe = level.Find(curLevel);
437  fringe.setNumToInd();
438 
439  // find a minimum degree vertex in the last level
440  FullyDistSpVec<int64_t, pair<int64_t, int64_t>> fringe_degree =
441  EWiseApply<pair<int64_t, int64_t>>(fringe, degrees,
442  [](int64_t vtx, int64_t deg){return make_pair(deg, vtx);},
443  [](int64_t vtx, int64_t deg){return true;},
444  false, (int64_t) -1);
445  mindegree_vertex = fringe_degree.Reduce(minimum<pair<int64_t, int64_t> >(), make_pair(LLONG_MAX, (int64_t)-1));
446  if (curLevel > prevLevel)
447  source = mindegree_vertex.second;
448  iterations++;
449 
450 
451  if(myrank == 0)
452  {
453  cout <<" iteration: "<< iterations << " BFS levels: " << curLevel << " Time: " << MPI_Wtime() - tItr << " seconds." << endl;
454  }
455 
456  }
457 
458  // remove vertices in the current connected component
459  //unvisitedVertices = EWiseMult(unvisitedVertices, level, true, (int64_t) -1);
460  unvisitedVertices = EWiseApply<pair<int64_t, int64_t>>(unvisitedVertices, level,
461  [](pair<int64_t, int64_t> vtx, int64_t visited){return vtx;},
462  [](pair<int64_t, int64_t> vtx, int64_t visited){return visited==-1;},
463  false, make_pair((int64_t)-1, (int64_t)0));
464 
465  tOther = MPI_Wtime() - tstart - tSpMV;
466  tpvSpMV += tSpMV;
467  tpvOther += tOther;
468  if(myrank == 0)
469  {
470  cout << " vertex " << source << " is a pseudo peripheral vertex" << endl;
471  cout << " pseudo diameter: " << curLevel << ", #iterations: "<< iterations << endl;
472  cout << " Total time: " << MPI_Wtime() - tstart << " seconds [SpMV: " << tSpMV << ", other: " << tOther << "]" << endl << endl;
473 
474  }
475  return source;
476 
477 }
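// The loop above is essentially the George-Liu heuristic for finding a
// pseudo-peripheral vertex: run a BFS from a minimum-degree vertex, pick a
// minimum-degree vertex from the deepest level, and repeat until the number of
// BFS levels (the pseudo-diameter) stops growing. Starting RCM from such a
// vertex tends to produce long, narrow level sets and hence a small bandwidth.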
478 
479 template <typename PARMAT>
480 FullyDistVec<int64_t, int64_t> RCM(PARMAT & A, FullyDistVec<int64_t, int64_t> degrees, PreAllocatedSPA<int64_t> & SPA)
481 {
482 
483 #ifdef TIMING
484  cblas_allgathertime = 0;
485  cblas_alltoalltime = 0;
486  cblas_mergeconttime = 0;
487  cblas_transvectime = 0;
488  cblas_localspmvtime = 0;
489 #endif
490  int myrank, nprocs;
491  MPI_Comm_rank(MPI_COMM_WORLD,&myrank);
492  MPI_Comm_size(MPI_COMM_WORLD,&nprocs);
493 
494  FullyDistSpVec<int64_t, int64_t> unvisited ( A.getcommgrid(), A.getnrow());
495  unvisited.iota(A.getnrow(), (int64_t) 0); // index and values become the same
496  // The list of unvisited vertices. The value is (degree, vertex index) pair
497  FullyDistSpVec<int64_t, pair<int64_t, int64_t>> unvisitedVertices =
498  EWiseApply<pair<int64_t, int64_t>>(unvisited, degrees,
499  [](int64_t vtx, int64_t deg){return make_pair(deg, vtx);},
500  [](int64_t vtx, int64_t deg){return true;},
501  false, (int64_t) -1);
502 
503 
504  // The RCM order will be stored here
505  FullyDistVec<int64_t, int64_t> rcmorder ( A.getcommgrid(), A.getnrow(), (int64_t) -1);
506 
507  int cc = 1; // current connected component
508  int64_t numUnvisited = unvisitedVertices.getnnz();
509  while(numUnvisited>0) // for each connected component
510  {
511 
512  if(myrank==0) cout << "Connected component: " << cc++ << endl;
513  // Get a pseudo-peripheral vertex to start the RCM algorithm
514  int64_t source = PseudoPeripheralVertex(A, unvisitedVertices, degrees,SPA);
515 
516  // Get the RCM ordering in this connected component
517  int64_t curOrder = A.getnrow() - numUnvisited;
518  RCMOrder(A, source, rcmorder, curOrder, degrees, SPA);
519 
520  numUnvisited = unvisitedVertices.getnnz();
521  }
522 
523 #ifdef TIMING
524  double *td_ag_all, *td_a2a_all, *td_tv_all, *td_mc_all, *td_spmv_all;
525  if(myrank == 0)
526  {
527  td_ag_all = new double[nprocs];
528  td_a2a_all = new double[nprocs];
529  td_tv_all = new double[nprocs];
530  td_mc_all = new double[nprocs];
531  td_spmv_all = new double[nprocs];
532  }
533 
534  MPI_Gather(&cblas_allgathertime, 1, MPI_DOUBLE, td_ag_all, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
535  MPI_Gather(&cblas_alltoalltime, 1, MPI_DOUBLE, td_a2a_all, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
536  MPI_Gather(&cblas_transvectime, 1, MPI_DOUBLE, td_tv_all, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
537  MPI_Gather(&cblas_mergeconttime, 1, MPI_DOUBLE, td_mc_all, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
538  MPI_Gather(&cblas_localspmvtime, 1, MPI_DOUBLE, td_spmv_all, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
539 
540 
541  double td_ag_all1=0, td_a2a_all1=0, td_tv_all1=0, td_mc_all1=0,td_spmv_all1 = 0;
542 
543 
544  if(myrank == 0)
545  {
546 
547  vector<double> total_time(nprocs, 0);
548  for(int i=0; i< nprocs; ++i) // accumulate each process's total accounted time
549  total_time[i] += td_ag_all[i] + td_a2a_all[i] + td_tv_all[i] + td_mc_all[i] + td_spmv_all[i];
550 
551  // order
552  vector<pair<double, int>> tosort;
553  for(int i=0; i<nprocs; i++) tosort.push_back(make_pair(total_time[i], i));
554  sort(tosort.begin(), tosort.end());
555  //vector<int> permutation = SpHelper::order(total_time);
556  vector<int> permutation(nprocs);
557  for(int i=0; i<nprocs; i++) permutation[i] = tosort[i].second;
558 
559  int smallest = permutation[0];
560  int largest = permutation[nprocs-1];
561  int median = permutation[nprocs/2];
562  cout << "------ Detail timing --------" << endl;
563  cout << "TOTAL (accounted) MEAN: " << accumulate( total_time.begin(), total_time.end(), 0.0 )/ static_cast<double> (nprocs) << endl;
564  cout << "TOTAL (accounted) MAX: " << total_time[largest] << endl;
565  cout << "TOTAL (accounted) MIN: " << total_time[smallest] << endl;
566  cout << "TOTAL (accounted) MEDIAN: " << total_time[median] << endl;
567  cout << "-------------------------------" << endl;
568 
569  cout << "allgather median: " << td_ag_all[median] << endl;
570  cout << "all2all median: " << td_a2a_all[median] << endl;
571  cout << "transposevector median: " << td_tv_all[median] << endl;
572  cout << "mergecontributions median: " << td_mc_all[median] << endl;
573  cout << "spmsv median: " << td_spmv_all[median] << endl;
574  cout << "-------------------------------" << endl;
575  td_ag_all1=td_ag_all[median]; td_a2a_all1=td_a2a_all[median];
576  td_tv_all1=td_tv_all[median]; td_mc_all1=td_mc_all[median];
577  td_spmv_all1 = td_spmv_all[median];
578 
579  cout << "allgather fastest: " << td_ag_all[smallest] << endl;
580  cout << "all2all fastest: " << td_a2a_all[smallest] << endl;
581  cout << "transposevector fastest: " << td_tv_all[smallest] << endl;
582  cout << "mergecontributions fastest: " << td_mc_all[smallest] << endl;
583  cout << "spmsv fastest: " << td_spmv_all[smallest] << endl;
584  cout << "-------------------------------" << endl;
585 
586 
587  cout << "allgather slowest: " << td_ag_all[largest] << endl;
588  cout << "all2all slowest: " << td_a2a_all[largest] << endl;
589  cout << "transposevector slowest: " << td_tv_all[largest] << endl;
590  cout << "mergecontributions slowest: " << td_mc_all[largest] << endl;
591  cout << "spmsv slowest: " << td_spmv_all[largest] << endl;
592  }
593 
594 
595 
596  if(myrank == 0)
597  {
598 
599  cout << "summary statistics" << endl;
600  cout << base_filename << " " << processors << " " << threads << " " << processors * threads << " "<< torderSpMV << " "<< torderSort<< " "<< torderOther<< " "<< td_ag_all1 << " "<< td_a2a_all1 << " "<< td_tv_all1 << " "<< td_mc_all1 << " "<< td_spmv_all1 << " "<< endl;
601 
602  }
603  #endif
604 
605 
606  return rcmorder;
607 }
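// RCM drives the whole algorithm: it repeatedly picks an unvisited connected
// component, finds a pseudo-peripheral start vertex in it, and orders that
// component with RCMOrder, offsetting the labels by the number of vertices
// already visited. The TIMING block only gathers and reports per-process
// communication/computation breakdowns; it does not affect the ordering.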
608 
609 
610 int main(int argc, char* argv[])
611 {
612  int provided;
613  MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &provided);
614  if (provided < MPI_THREAD_SERIALIZED)
615  {
616  printf("ERROR: The MPI library does not have MPI_THREAD_SERIALIZED support\n");
617  MPI_Abort(MPI_COMM_WORLD, 1);
618  }
619 
620  int nprocs, myrank;
621  MPI_Comm_size(MPI_COMM_WORLD,&nprocs);
622  MPI_Comm_rank(MPI_COMM_WORLD,&myrank);
623 
624  if(argc < 3)
625  {
626  if(myrank == 0)
627  {
628 
629  cout << "Usage: ./rcm <rmat|er|input> <scale|filename> " << "-permute" << " -savercm" << endl;
630  cout << "Example with a user supplied matrix:" << endl;
631  cout << " mpirun -np 4 ./rcm input a.mtx" << endl;
632  cout << "Example with a user supplied matrix (pre-permute the input matrix for load balance):" << endl;
633  cout << " mpirun -np 4 ./rcm input a.mtx -permute " << endl;
634  cout << "Example with a user supplied matrix (pre-permute the input matrix for load balance) and save the RCM order to input_file_name.rcm.txt:" << endl;
635  cout << " mpirun -np 4 ./rcm input a.mtx -permute -savercm" << endl;
636  cout << "Example with RMAT matrix: mpirun -np 4 ./rcm rmat 20" << endl;
637  cout << "Example with an Erdos-Renyi matrix: mpirun -np 4 ./rcm er 20" << endl;
638 
639  }
640  MPI_Finalize();
641  return -1;
642  }
643  {
644 
645  string filename="";
646  bool randpermute = false;
647  bool savercm = false;
648  for (int i = 1; i < argc; i++)
649  {
650  if (strcmp(argv[i],"-permute")==0)
651  randpermute = true;
652  if (strcmp(argv[i],"-savercm")==0)
653  savercm = true;
654  }
655 
656 
657  Par_DCSC_Bool * ABool;
658  ostringstream tinfo;
659 
660  if(string(argv[1]) == string("input")) // input option
661  {
662  ABool = new Par_DCSC_Bool();
663  filename = argv[2];
664  tinfo.str("");
665  tinfo << "**** Reading input matrix: " << filename << " ******* " << endl;
666 
667  base_filename = filename.substr(filename.find_last_of("/\\") + 1);
668 
669  SpParHelper::Print(tinfo.str());
670  double t01 = MPI_Wtime();
671  ABool->ParallelReadMM(filename, true, maximum<bool>());
672  double t02 = MPI_Wtime();
673  Symmetricize(*ABool);
674  tinfo.str("");
675  tinfo << "matrix read and symmetricized " << endl;
676  tinfo << "Reader took " << t02-t01 << " seconds" << endl;
677  SpParHelper::Print(tinfo.str());
678 
679  }
680  else if(string(argv[1]) == string("rmat"))
681  {
682  unsigned scale;
683  scale = static_cast<unsigned>(atoi(argv[2]));
684  double initiator[4] = {.57, .19, .19, .05};
685  DistEdgeList<int64_t> * DEL = new DistEdgeList<int64_t>();
686  DEL->GenGraph500Data(initiator, scale, EDGEFACTOR, true, false );
687  MPI_Barrier(MPI_COMM_WORLD);
688 
689  ABool = new Par_DCSC_Bool(*DEL, false);
690  Symmetricize(*ABool);
691  delete DEL;
692  }
693  else if(string(argv[1]) == string("er"))
694  {
695  unsigned scale;
696  scale = static_cast<unsigned>(atoi(argv[2]));
697  double initiator[4] = {.25, .25, .25, .25};
698  DistEdgeList<int64_t> * DEL = new DistEdgeList<int64_t>();
699  DEL->GenGraph500Data(initiator, scale, EDGEFACTOR, true, false );
700  MPI_Barrier(MPI_COMM_WORLD);
701 
702  ABool = new Par_DCSC_Bool(*DEL, false);
703  Symmetricize(*ABool);
704  delete DEL;
705  }
706  else
707  {
708  SpParHelper::Print("Unknown input option\n");
709  MPI_Finalize();
710  return -1;
711  }
712 
713 
714  Par_DCSC_Bool ABoolOld(ABool->getcommgrid());
715  // needed for random permutation
716  FullyDistVec<int64_t, int64_t> randp ( ABool->getcommgrid());
717  if(randpermute && string(argv[1]) == string("input")) // do this only for user provided matrices
718  {
719  if(ABool->getnrow() == ABool->getncol())
720  {
721  ABoolOld = *ABool; // create a copy for bandwidth computation
722  randp.iota(ABool->getnrow(), 0);
723  randp.RandPerm();
724  (*ABool)(randp,randp,true);// in-place permute to save memory
725  SpParHelper::Print("Matrix is randomly permuted for load balance.\n");
726  }
727  else
728  {
729  SpParHelper::Print("Rectangular matrix: Cannot apply a symmetric permutation.\n");
730  }
731  }
732 
733  ABool->RemoveLoops();
734  Par_CSC_Bool * ABoolCSC;
735  FullyDistVec<int64_t, int64_t> degrees ( ABool->getcommgrid());
736  float balance;
737  balance = ABool->LoadImbalance();
738  ABool->Reduce(degrees, Column, plus<int64_t>(), static_cast<int64_t>(0));
739  ABoolCSC = new Par_CSC_Bool(*ABool);
740 
741  int nthreads = 1;
742 #ifdef THREADED
743 #pragma omp parallel
744  {
745  nthreads = omp_get_num_threads();
746  }
747 #endif
748 
749  threads = nthreads;
750  processors = nprocs;
751 
752  ostringstream outs;
753  outs << "--------------------------------------" << endl;
754  outs << "Number of MPI processes: " << nprocs << endl;
755  outs << "Number of threads per process: " << nthreads << endl;
756  outs << "Load balance: " << balance << endl;
757  outs << "--------------------------------------" << endl;
758  SpParHelper::Print(outs.str());
759 
760 
761  // create Pre allocated SPA for SpMSpV
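// PreAllocatedSPA holds the sparse-accumulator buffers that SpMV<SelectMinSR>
// reuses on every call, so the BFS loops do not reallocate them each iteration.
// The second constructor argument sets how many buckets/splits are pre-allocated;
// here it is presumably sized as a multiple of the thread count for threaded SpMSpV.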
762  PreAllocatedSPA<int64_t> SPA(ABoolCSC->seq(), nthreads*4);
763  // Compute the RCM ordering
764  FullyDistVec<int64_t, int64_t> rcmorder = RCM(*ABoolCSC, degrees, SPA);
765 
766 
767  FullyDistVec<int64_t, int64_t> reverseOrder = rcmorder;
768  // comment out the next two lines if you want the Cuthill-McKee ordering
769  reverseOrder= rcmorder.TotalLength();
770  reverseOrder -= rcmorder;
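// The two statements above turn the Cuthill-McKee numbering into the Reverse
// Cuthill-McKee numbering: reverseOrder[i] = n - rcmorder[i], i.e. the labels are
// simply reversed. This leaves the bandwidth unchanged but is the conventional
// choice because it tends to reduce profile/fill in envelope factorizations.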
771 
772 
773 
774  //revert random permutation if applied before
775  if(randpermute==true && randp.TotalLength() >0)
776  {
777  // inverse permutation
778  FullyDistVec<int64_t, int64_t>invRandp = randp.sort();
779  reverseOrder = reverseOrder(invRandp);
780  }
781 
782  // Write the RCM ordering
783  // TODO: should we save the permutation instead?
784  if(savercm && filename!="")
785  {
786  string ofName = filename + ".rcm.txt";
787  reverseOrder.ParallelWrite(ofName, 1, false);
788  }
789 
790  // get permutation from the ordering
791  // sort() returns the permutation that sorts the vector
792  // and turns the original vector into a sequence (like iota)
793  // TODO: Can we use invert() ?
794  FullyDistVec<int64_t, int64_t>rcmorder1 = reverseOrder.sort();
795 
796 
797 
798 
799  // Permute the original matrix with the RCM order
800  // this part is not timed as it is needed for sanity check only
801  if(randpermute==true && randp.TotalLength() >0)
802  {
803  int64_t bw_before1 = ABoolOld.Bandwidth();
804  int64_t bw_before2 = ABool->Bandwidth();
805  ABoolOld(rcmorder1,rcmorder1,true);
806  int64_t bw_after = ABoolOld.Bandwidth();
807 
808  ostringstream outs1;
809  outs1 << "Original Bandwidth: " << bw_before1 << endl;
810  outs1 << "Bandwidth after randomly permuting the matrix: " << bw_before2 << endl;
811  outs1 << "Bandwidth after the matrix is permuted by RCM: " << bw_after << endl << endl;
812  SpParHelper::Print(outs1.str());
813  }
814  else
815  {
816  int64_t bw_before1 = ABool->Bandwidth();
817  (*ABool)(rcmorder1,rcmorder1,true);
818  int64_t bw_after = ABool->Bandwidth();
819 
820  ostringstream outs1;
821  outs1 << "Original Bandwidth: " << bw_before1 << endl;
822  outs1 << "Bandwidth after the matrix is permuted by RCM: " << bw_after << endl << endl;
823  SpParHelper::Print(outs1.str());
824  }
825 
826 
827  delete ABool;
828  delete ABoolCSC;
829 
830  }
831  MPI_Finalize();
832  return 0;
833 }
834 