COMBINATORIAL_BLAS  1.6
DirOptBFS.cpp
1 /****************************************************************/
2 /* Parallel Combinatorial BLAS Library (for Graph Computations) */
3 /* version 1.6 -------------------------------------------------*/
4 /* date: 6/15/2017 ---------------------------------------------*/
5 /* authors: Ariful Azad, Aydin Buluc --------------------------*/
6 /****************************************************************/
7 /*
8  Copyright (c) 2010-2017, The Regents of the University of California
9 
10  Permission is hereby granted, free of charge, to any person obtaining a copy
11  of this software and associated documentation files (the "Software"), to deal
12  in the Software without restriction, including without limitation the rights
13  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14  copies of the Software, and to permit persons to whom the Software is
15  furnished to do so, subject to the following conditions:
16 
17  The above copyright notice and this permission notice shall be included in
18  all copies or substantial portions of the Software.
19 
20  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26  THE SOFTWARE.
27  */
28 
29 
30 #define DETERMINISTIC
31 #define BOTTOMUPTIME
32 #include <mpi.h>
33 #include <sys/time.h>
34 #include <iostream>
35 #include <iomanip>
36 #include <functional>
37 #include <algorithm>
38 #include <vector>
39 #include <string>
40 #include <sstream>
41 #ifdef THREADED
42  #ifndef _OPENMP
43  #define _OPENMP
44  #endif
45  #include <omp.h>
46 #endif
47 
48 // These macros should be defined before stdint.h is included
49 #ifndef __STDC_CONSTANT_MACROS
50 #define __STDC_CONSTANT_MACROS
51 #endif
52 #ifndef __STDC_LIMIT_MACROS
53 #define __STDC_LIMIT_MACROS
54 #endif
55 #include <stdint.h>
56 
57 double cblas_alltoalltime;
58 double cblas_allgathertime;
59 double cblas_mergeconttime;
60 double cblas_transvectime;
61 double cblas_localspmvtime;
62 double cblas_ewisemulttime;
63 
64 double bottomup_sendrecv;
65 double bottomup_allgather;
66 double bottomup_total;
67 double bottomup_convert;
68 
69 double bu_local;
70 double bu_update;
71 double bu_rotate;
72 int cblas_splits;
73 
74 
75 #include "CombBLAS/CombBLAS.h"
76 
77 using namespace combblas;
78 using namespace std;
79 
80 #define ITERS 64
81 #define EDGEFACTOR 16
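// Graph500 conventions used by this driver: a scale-s R-MAT graph has 2^s vertices
// and roughly EDGEFACTOR * 2^s undirected edges, and ITERS randomly chosen roots
// are searched per run.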
82 
83 // 64-bit floor(log2(x)) function
84 // note: least significant bit is the "zeroth" bit
85 // pre: v > 0
86 unsigned int highestbitset(uint64_t v)
87 {
88  // b in binary is {10,1100, 11110000, 1111111100000000 ...}
89  const uint64_t b[] = {0x2ULL, 0xCULL, 0xF0ULL, 0xFF00ULL, 0xFFFF0000ULL, 0xFFFFFFFF00000000ULL};
90  const unsigned int S[] = {1, 2, 4, 8, 16, 32};
91  int i;
92 
93  unsigned int r = 0; // result of log2(v) will go here
94  for (i = 5; i >= 0; i--)
95  {
96  if (v & b[i]) // the highest set bit is in the upper half, i.e., v >= 2^S[i]
97  {
98  v >>= S[i];
99  r |= S[i];
100  }
101  }
102  return r;
103 }
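// Example: highestbitset(1) == 0, highestbitset(20) == 4 (since 2^4 <= 20 < 2^5),
// and highestbitset(1ULL << 40) == 40.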
104 
105 template <class T>
106 bool from_string(T & t, const string& s, ios_base& (*f)(ios_base&))
107 {
108  istringstream iss(s);
109  return !(iss >> f >> t).fail();
110 }
111 
112 
113 template <typename PARMAT>
114 void Symmetricize(PARMAT & A)
115 {
116  // boolean addition is practically a "logical or"
117  // therefore this doesn't destruct any links
118  PARMAT AT = A;
119  AT.Transpose();
120  A += AT;
121 }
122 
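// Keeps the candidate parent x only while the corresponding entry y is still
// undiscovered (y == -1); returns -1 otherwise, so already-discovered vertices
// can be pruned from a frontier.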
127 struct prunediscovered: public binary_function<int64_t, int64_t, int64_t >
128 {
129  int64_t operator()(int64_t x, const int64_t & y) const
130  {
131  return ( y == -1 ) ? x: -1;
132  }
133 };
134 
135 int main(int argc, char* argv[])
136 {
137 #ifdef THREADED
138  cblas_splits = omp_get_max_threads();
139 #else
140  cblas_splits = 1;
141 #endif
142 
143 
144  int nprocs, myrank;
145 #ifdef _OPENMP
146  int provided, flag, claimed;
147  MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &provided );
148  MPI_Is_thread_main( &flag );
149  if (!flag)
150  SpParHelper::Print("This thread called init_thread but Is_thread_main gave false\n");
151  MPI_Query_thread( &claimed );
152  if (claimed != provided)
153  SpParHelper::Print("Query thread gave different thread level than requested\n");
154 #else
155  MPI_Init(&argc, &argv);
156 #endif
157 
158  MPI_Comm_size(MPI_COMM_WORLD,&nprocs);
159  MPI_Comm_rank(MPI_COMM_WORLD,&myrank);
160  if(argc < 2)
161  {
162  if(myrank == 0)
163  {
164  cout << "Usage: ./dobfs <Scale>" << endl;
165  cout << "Example: ./dobfs 25" << endl;
166  }
167  MPI_Finalize();
168  return -1;
169  }
170  {
172  typedef SpParMat < int64_t, bool, SpDCCols<int32_t,bool> > PSpMat_s32p64; // sequentially use 32-bits for local matrices, but parallel semantics are 64-bits
173  typedef SpParMat < int64_t, int, SpDCCols<int32_t,int> > PSpMat_s32p64_Int; // similarly mixed, but holds integers as opposed to booleans
174 
175  // Declare objects
176  PSpMat_Bool A;
177  PSpMat_s32p64 Aeff;
178  PSpMat_s32p64 ALocalT;
179  shared_ptr<CommGrid> fullWorld;
180  fullWorld.reset( new CommGrid(MPI_COMM_WORLD, 0, 0) );
181  FullyDistVec<int64_t, int64_t> degrees(fullWorld); // degrees of vertices (including multi-edges and self-loops)
182  FullyDistVec<int64_t, int64_t> nonisov(fullWorld); // id's of non-isolated (connected) vertices
183  unsigned scale;
184  OptBuf<int32_t, int64_t> optbuf; // let indices be 32-bits
185 
186  scale = static_cast<unsigned>(atoi(argv[1]));
187  ostringstream outs;
188  outs << "Forcing scale to : " << scale << endl;
189  SpParHelper::Print(outs.str());
190 
191  SpParHelper::Print("Using fast vertex permutations; skipping edge permutations (like v2.1)\n");
192 
193  // this is an undirected graph, so A*x does indeed BFS
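 // {a, b, c, d} = {0.57, 0.19, 0.19, 0.05} are the R-MAT initiator probabilities
 // prescribed by the Graph500 benchmark.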
194  double initiator[4] = {.57, .19, .19, .05};
195 
196  double t01 = MPI_Wtime();
197  double t02;
198  DistEdgeList<int64_t> * DEL = new DistEdgeList<int64_t>();
199  DEL->GenGraph500Data(initiator, scale, EDGEFACTOR, true, true ); // generate packed edges
200  SpParHelper::Print("Generated renamed edge lists\n");
201  t02 = MPI_Wtime();
202  ostringstream tinfo;
203  tinfo << "Generation took " << t02-t01 << " seconds" << endl;
204  SpParHelper::Print(tinfo.str());
205 
206 
207  // Start Kernel #1
208  MPI_Barrier(MPI_COMM_WORLD);
209  double t1 = MPI_Wtime();
210 
211  // conversion from distributed edge list, keeps self-loops, sums duplicates
212  PSpMat_s32p64_Int * G = new PSpMat_s32p64_Int(*DEL, false);
213  delete DEL; // free memory before symmetricizing
214  SpParHelper::Print("Created Sparse Matrix (with int32 local indices and values)\n");
215 
216  MPI_Barrier(MPI_COMM_WORLD);
217  double redts = MPI_Wtime();
218  G->Reduce(degrees, Row, plus<int64_t>(), static_cast<int64_t>(0)); // Identity is 0
219  MPI_Barrier(MPI_COMM_WORLD);
220  double redtf = MPI_Wtime();
221 
222  ostringstream redtimeinfo;
223  redtimeinfo << "Calculated degrees in " << redtf-redts << " seconds" << endl;
224  SpParHelper::Print(redtimeinfo.str());
225  A = PSpMat_Bool(*G); // Convert to Boolean
226  delete G;
227  int64_t removed = A.RemoveLoops();
228 
229  ostringstream loopinfo;
230  loopinfo << "Converted to Boolean and removed " << removed << " loops" << endl;
231  SpParHelper::Print(loopinfo.str());
232  A.PrintInfo();
233 
234  FullyDistVec<int64_t, int64_t> * ColSums = new FullyDistVec<int64_t, int64_t>(A.getcommgrid());
235  FullyDistVec<int64_t, int64_t> * RowSums = new FullyDistVec<int64_t, int64_t>(A.getcommgrid());
236  A.Reduce(*ColSums, Column, plus<int64_t>(), static_cast<int64_t>(0));
237  A.Reduce(*RowSums, Row, plus<int64_t>(), static_cast<int64_t>(0));
238  SpParHelper::Print("Reductions done\n");
239  ColSums->EWiseApply(*RowSums, plus<int64_t>());
240  SpParHelper::Print("Intersection of colsums and rowsums found\n");
241  delete RowSums;
242 
243  nonisov = ColSums->FindInds(bind2nd(greater<int64_t>(), 0)); // only the indices of non-isolated vertices
244  delete ColSums;
245 
246  nonisov.RandPerm(); // so that A(v,v) is load-balanced (both memory and time wise)
247  SpParHelper::Print("Found non-isolated vertices\n");
248  A.PrintInfo();
249 
250 #ifndef NOPERMUTE
251  A(nonisov, nonisov, true); // in-place permute to save memory
252  SpParHelper::Print("Dropped isolated vertices from input\n");
253  A.PrintInfo();
254 #endif
255 
256  Aeff = PSpMat_s32p64(A); // Convert to 32-bit local integers
257  A.FreeMemory();
258  SpParHelper::Print("Converted to 32-bit integers\n");
259 
260  Symmetricize(Aeff); // A += A';
261  SpParHelper::Print("Symmetricized\n");
262 
263  Aeff.OptimizeForGraph500(optbuf); // Should be called before threading is activated
264  ALocalT = PSpMat_s32p64(Aeff.seq().TransposeConstPtr(), Aeff.getcommgrid()); // this should be copied before the threading is activated
265  #ifdef THREADED
266  tinfo << "Threading activated with " << cblas_splits << " threads" << endl;
267  SpParHelper::Print(tinfo.str());
268  Aeff.ActivateThreading(cblas_splits);
269  #endif
270  Aeff.PrintInfo();
271 
272  MPI_Barrier(MPI_COMM_WORLD);
273  double t2=MPI_Wtime();
274 
275  ostringstream k1timeinfo;
276  k1timeinfo << (t2-t1) - (redtf-redts) << " seconds elapsed for Kernel #1" << endl;
277  SpParHelper::Print(k1timeinfo.str());
278 
279  Aeff.PrintInfo();
280  float balance = Aeff.LoadImbalance();
281  ostringstream lbout;
282  lbout << "Load balance: " << balance << endl;
283  SpParHelper::Print(lbout.str());
284 
285  MPI_Barrier(MPI_COMM_WORLD);
286  t1 = MPI_Wtime();
287 
288  // Now that every remaining vertex is non-isolated, randomly pick ITERS many of them as starting vertices
289  #ifndef NOPERMUTE
290  degrees = degrees(nonisov); // fix the degrees array too
291  degrees.PrintInfo("Degrees array");
292  #endif
293  // degrees.DebugPrint();
294  FullyDistVec<int64_t, int64_t> Cands(A.getcommgrid(), ITERS, 0);
295  double nver = (double) degrees.TotalLength();
296 
297  #ifdef DETERMINISTIC
298  uint64_t seed = 1383098845;
299  #else
300  uint64_t seed= time(NULL);
301  #endif
302  MTRand M(seed); // generate random numbers with Mersenne Twister
303 
304  vector<double> loccands(ITERS);
305  vector<int64_t> loccandints(ITERS);
306  if(myrank == 0)
307  {
308  for(int i=0; i<ITERS; ++i)
309  loccands[i] = M.rand();
310  copy(loccands.begin(), loccands.end(), ostream_iterator<double>(cout," ")); cout << endl;
311  transform(loccands.begin(), loccands.end(), loccands.begin(), bind2nd( multiplies<double>(), nver ));
312 
313  for(int i=0; i<ITERS; ++i)
314  loccandints[i] = static_cast<int64_t>(loccands[i]);
315  copy(loccandints.begin(), loccandints.end(), ostream_iterator<int64_t>(cout," ")); cout << endl;
316  }
317 
318  MPI_Bcast(&(loccandints[0]), ITERS, MPIType<int64_t>(),0,MPI_COMM_WORLD);
319  for(int i=0; i<ITERS; ++i)
320  {
321  Cands.SetElement(i,loccandints[i]);
322  }
323 
324  #define MAXTRIALS 1
325  for(int trials =0; trials < MAXTRIALS; trials++) // try different algorithms for BFS if MAXTRIALS > 1
326  {
327  cblas_allgathertime = 0;
328  cblas_alltoalltime = 0;
329  cblas_mergeconttime = 0;
330  cblas_transvectime = 0;
331  cblas_localspmvtime = 0;
332  cblas_ewisemulttime = 0;
333  bottomup_sendrecv = 0;
334  bottomup_allgather = 0;
335  bottomup_total = 0;
336  bottomup_convert = 0;
337 
338  bu_local = 0;
339  bu_update = 0;
340  bu_rotate = 0;
341 
342  MPI_Pcontrol(1,"BFS");
343 
344  double MTEPS[ITERS]; double INVMTEPS[ITERS]; double TIMES[ITERS]; double EDGES[ITERS];
345 
346  for(int i=0; i<ITERS; ++i)
347  {
348  SpParHelper::Print("A BFS iteration is starting\n");
349 
350  // FullyDistVec ( shared_ptr<CommGrid> grid, IT globallen, NT initval);
351  FullyDistVec<int64_t, int64_t> parents ( Aeff.getcommgrid(), Aeff.getncol(), (int64_t) -1); // identity is -1
352 
353  // FullyDistSpVec ( shared_ptr<CommGrid> grid, IT glen);
354  FullyDistSpVec<int64_t, int64_t> fringe(Aeff.getcommgrid(), Aeff.getncol()); // numerical values are stored 0-based
355 
356  ostringstream devout;
357  devout.setf(ios::fixed);
358 
359  MPI_Barrier(MPI_COMM_WORLD);
360  double t1 = MPI_Wtime();
361 
362  int64_t num_edges = Aeff.getnnz();
363  int64_t num_nodes = Aeff.getncol();
364  int64_t up_cutoff = num_edges / 20;
365  int64_t down_cutoff = (((double) num_nodes) * ((double)num_nodes)) / ((double) num_edges * 12.0);
366 
367  devout << "param " << num_nodes << " vertices with " << num_edges << " edges" << endl;
368  devout << up_cutoff << " up and " << down_cutoff << " down" << endl;
369 
370  fringe.SetElement(Cands[i], Cands[i]);
371  parents.SetElement(Cands[i], Cands[i]);
372  int iterations = 0;
373 
374  BitMapFringe<int64_t,int64_t> bm_fringe(fringe.getcommgrid(), fringe);
375  BitMapCarousel<int64_t,int64_t> done(Aeff.getcommgrid(), parents.TotalLength(), bm_fringe.GetSubWordDisp());
376  SpDCCols<int,bool>::SpColIter *starts = CalcSubStarts(ALocalT, fringe, done);
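 // Bottom-up machinery: bm_fringe is a bitmap view of the frontier, done is the
 // visited-vertex bitmap that BottomUpStep rotates across the processor grid
 // (timed by bu_rotate), and starts caches per-slice column iterators into the
 // locally transposed matrix ALocalT.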
377  int64_t fringe_size = fringe.getnnz();
378  int64_t last_fringe_size = 0;
379  double pred_start = MPI_Wtime();
380  fringe.Apply(myset<int64_t>(1));
381  int64_t pred = EWiseMult(fringe, degrees, false, (int64_t) 0).Reduce(plus<int64_t>(), (int64_t) 0);
382  double pred_end = MPI_Wtime();
383  devout << " s" << setw(15) << pred << setw(15) << setprecision(5) << (pred_end - pred_start) << endl;
384  cblas_ewisemulttime += (pred_end - pred_start);
385 
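 // Direction-optimizing switch (in the spirit of Beamer et al.): stay top-down while
 // the frontier is small; switch to the bottom-up kernel once the frontier touches
 // more than up_cutoff edges (pred) and is still growing, and fall back to top-down
 // when the frontier shrinks below down_cutoff vertices.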
386  while(fringe_size > 0)
387  {
388  if ((pred > up_cutoff) && (last_fringe_size < fringe_size))
389  { // Bottom-up
390  MPI_Barrier(MPI_COMM_WORLD);
391  double conv_start = MPI_Wtime();
392  done.LoadVec(parents);
393  bm_fringe.LoadFromSpVec(fringe);
394  double conv_end = MPI_Wtime();
395  devout << " c" << setw(30) << setprecision(5) << (conv_end - conv_start) << endl;
396  bottomup_convert += (conv_end - conv_start);
397 
398  while (fringe_size > 0)
399  {
400  double step_start = MPI_Wtime();
401  BottomUpStep(ALocalT, fringe, bm_fringe, parents, done, starts);
402  double step_end = MPI_Wtime();
403 
404  devout << setw(2) << iterations << "u" << setw(15) << fringe_size << setprecision(5) << setw(15) << (step_end-step_start) << endl;
405  bottomup_total += (step_end-step_start);
406  iterations++;
407  last_fringe_size = fringe_size;
408  fringe_size = bm_fringe.GetNumSet();
409  if ((fringe_size < down_cutoff) && (last_fringe_size > fringe_size))
410  {
411  conv_start = MPI_Wtime();
412  bm_fringe.UpdateSpVec(fringe);
413  conv_end = MPI_Wtime();
414  devout << " c" << setw(30) << setprecision(5) << (conv_end - conv_start) << endl;
415  bottomup_convert += (conv_end - conv_start);
416  break;
417  }
418  }
419  }
420  else
421  { // Top-down
422  double step_start = MPI_Wtime();
423  fringe.setNumToInd();
424  fringe = SpMV(Aeff, fringe,optbuf);
425  double ewise_start = MPI_Wtime();
426  fringe = EWiseMult(fringe, parents, true, (int64_t) -1);
427  parents.Set(fringe);
428  double step_end = MPI_Wtime();
429  devout << setw(2) << iterations << "d" << setw(15) << fringe.getnnz() << setw(15) << setprecision(5) << (step_end-step_start) << endl;
430  cblas_ewisemulttime += (step_end - ewise_start);
431 
432  pred_start = MPI_Wtime();
433  fringe.Apply(myset<int64_t>(1));
434  pred = EWiseMult(fringe, degrees, false, (int64_t) 0).Reduce(plus<int64_t>(), (int64_t) 0);
435  pred_end = MPI_Wtime();
436  devout << " s" << setw(15) << pred << setw(15) << setprecision(5) << (pred_end - pred_start) << endl;
437  cblas_ewisemulttime += (pred_end - pred_start);
438  iterations++;
439  last_fringe_size = fringe_size;
440  fringe_size = fringe.getnnz();
441  }
442  }
443  MPI_Barrier(MPI_COMM_WORLD);
444  double t2 = MPI_Wtime();
445  delete[] starts;
446  SpParHelper::Print(devout.str());
447 
448  FullyDistSpVec<int64_t, int64_t> parentsp = parents.Find(bind2nd(greater<int64_t>(), -1));
449  parentsp.Apply(myset<int64_t>(1));
450  // we use degrees on the directed graph, so that we don't count the reverse edges in the teps score
451  int64_t nedges = EWiseMult(parentsp, degrees, false, (int64_t) 0).Reduce(plus<int64_t>(), (int64_t) 0);
452  int64_t nverts = parentsp.Reduce(plus<int64_t>(), (int64_t) 0);
453 
454  ostringstream outnew;
455  outnew << i << "th starting vertex was " << Cands[i] << endl;
456  outnew << "Number of iterations: " << iterations << endl;
457  outnew << "Number of vertices found: " << nverts << endl;
458  outnew << "Number of edges traversed: " << nedges << endl;
459  outnew << "BFS time: " << t2-t1 << " seconds" << endl;
460  outnew << "MTEPS: " << static_cast<double>(nedges) / (t2-t1) / 1000000.0 << endl;
461  outnew << "Total communication (average so far): " << (cblas_allgathertime + cblas_alltoalltime) / (i+1) << endl;
462  TIMES[i] = t2-t1;
463  EDGES[i] = nedges;
464  MTEPS[i] = static_cast<double>(nedges) / (t2-t1) / 1000000.0;
465  SpParHelper::Print(outnew.str());
466  }
467  MPI_Pcontrol(-1,"BFS");
468  SpParHelper::Print("Finished\n");
469 #ifdef TIMING
470  double * bu_total, *bu_ag_all, *bu_sr_all, *bu_convert, *td_ag_all, *td_a2a_all, *td_tv_all, *td_mc_all, *td_spmv_all, *td_ewm_all;
471  if(myrank == 0)
472  {
473  bu_total = new double[nprocs];
474  bu_ag_all = new double[nprocs];
475  bu_sr_all = new double[nprocs];
476  bu_convert = new double[nprocs];
477  td_ag_all = new double[nprocs];
478  td_a2a_all = new double[nprocs];
479  td_tv_all = new double[nprocs];
480  td_mc_all = new double[nprocs];
481  td_spmv_all = new double[nprocs];
482  td_ewm_all = new double[nprocs];
483  }
484  bottomup_allgather /= static_cast<double>(ITERS);
485  bottomup_sendrecv /= static_cast<double>(ITERS);
486  bottomup_total /= static_cast<double>(ITERS);
487  bottomup_convert /= static_cast<double>(ITERS); // conversion not included in total time
488 
489  cblas_allgathertime /= static_cast<double>(ITERS);
490  cblas_alltoalltime /= static_cast<double>(ITERS);
491  cblas_transvectime /= static_cast<double>(ITERS);
492  cblas_mergeconttime /= static_cast<double>(ITERS);
493  cblas_localspmvtime /= static_cast<double>(ITERS);
494  cblas_ewisemulttime /= static_cast<double>(ITERS);
495 
496  MPI_Gather(&bottomup_convert, 1, MPI_DOUBLE, bu_convert, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
497  MPI_Gather(&bottomup_total, 1, MPI_DOUBLE, bu_total, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
498  MPI_Gather(&bottomup_allgather, 1, MPI_DOUBLE, bu_ag_all, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
499  MPI_Gather(&bottomup_sendrecv, 1, MPI_DOUBLE, bu_sr_all, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
500  MPI_Gather(&cblas_allgathertime, 1, MPI_DOUBLE, td_ag_all, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
501  MPI_Gather(&cblas_alltoalltime, 1, MPI_DOUBLE, td_a2a_all, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
502  MPI_Gather(&cblas_transvectime, 1, MPI_DOUBLE, td_tv_all, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
503  MPI_Gather(&cblas_mergeconttime, 1, MPI_DOUBLE, td_mc_all, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
504  MPI_Gather(&cblas_localspmvtime, 1, MPI_DOUBLE, td_spmv_all, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
505  MPI_Gather(&cblas_ewisemulttime, 1, MPI_DOUBLE, td_ewm_all, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
506 
507  double bu_local_total = 0;
508  double bu_update_total = 0;
509  double bu_rotate_total = 0;
510 
511  MPI_Allreduce(&bu_local, &bu_local_total, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
512  MPI_Allreduce(&bu_update, &bu_update_total, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
513  MPI_Allreduce(&bu_rotate, &bu_rotate_total, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
514 
515  if(myrank == 0)
516  {
517  cout << "BU Local: " << bu_local_total/nprocs << endl;
518  cout << "BU Update: " << bu_update_total/nprocs << endl;
519  cout << "BU Rotate: " << bu_rotate_total/nprocs << endl;
520 
521  vector<double> total_time(nprocs, 0);
522  for(int i=0; i< nprocs; ++i) // find the mean performing guy
523  total_time[i] += bu_total[i] + bu_convert[i] + td_ag_all[i] + td_a2a_all[i] + td_tv_all[i] + td_mc_all[i] + td_spmv_all[i] + td_ewm_all[i];
524 
525  vector<size_t> permutation = SpHelper::find_order(total_time);
526  size_t smallest = permutation[0];
527  size_t largest = permutation[nprocs-1];
528  size_t median = permutation[nprocs/2];
529 
530  cout << "TOTAL (accounted) MEAN: " << accumulate( total_time.begin(), total_time.end(), 0.0 )/ static_cast<double> (nprocs) << endl;
531  cout << "TOTAL (accounted) MAX: " << total_time[largest] << endl; // total_time is unsorted; index it through the permutation
532  cout << "TOTAL (accounted) MIN: " << total_time[smallest] << endl;
533  cout << "TOTAL (accounted) MEDIAN: " << total_time[median] << endl;
534  cout << "-------------------------------" << endl;
535 
536  cout << "Convert median: " << bu_convert[median] << endl;
537  cout << "Bottom-up allgather median: " << bu_ag_all[median] << endl;
538  cout << "Bottom-up send-recv median: " << bu_sr_all[median] << endl;
539  cout << "Bottom-up compute median: " << bu_total[median] - (bu_ag_all[median] + bu_sr_all[median]) << endl;
540  cout << "Top-down allgather median: " << td_ag_all[median] << endl;
541  cout << "Top-down all2all median: " << td_a2a_all[median] << endl;
542  cout << "Top-down transposevector median: " << td_tv_all[median] << endl;
543  cout << "Top-down mergecontributions median: " << td_mc_all[median] << endl;
544  cout << "Top-down spmsv median: " << td_spmv_all[median] << endl;
545  cout << "-------------------------------" << endl;
546 
547  cout << "Convert MEAN: " << accumulate( bu_convert, bu_convert+nprocs, 0.0 )/ static_cast<double> (nprocs) << endl;
548  cout << "Bottom-up total MEAN: " << accumulate( bu_total, bu_total+nprocs, 0.0 )/ static_cast<double> (nprocs) << endl;
549  cout << "Bottom-up allgather MEAN: " << accumulate( bu_ag_all, bu_ag_all+nprocs, 0.0 )/ static_cast<double> (nprocs) << endl;
550  cout << "Bottom-up send-recv MEAN: " << accumulate( bu_sr_all, bu_sr_all+nprocs, 0.0 )/ static_cast<double> (nprocs) << endl;
551  cout << "Top-down allgather MEAN: " << accumulate( td_ag_all, td_ag_all+nprocs, 0.0 )/ static_cast<double> (nprocs) << endl;
552  cout << "Top-down all2all MEAN: " << accumulate( td_a2a_all, td_a2a_all+nprocs, 0.0 )/ static_cast<double> (nprocs) << endl;
553  cout << "Top-down transposevector MEAN: " << accumulate( td_tv_all, td_tv_all+nprocs, 0.0 )/ static_cast<double> (nprocs) << endl;
554  cout << "Top-down mergecontributions MEAN: " << accumulate( td_mc_all, td_mc_all+nprocs, 0.0 )/ static_cast<double> (nprocs) << endl;
555  cout << "Top-down spmsv MEAN: " << accumulate( td_spmv_all, td_spmv_all+nprocs, 0.0 )/ static_cast<double> (nprocs) << endl;
556  cout << "-------------------------------" << endl;
557 
558 
559  cout << "Bottom-up allgather fastest: " << bu_ag_all[smallest] << endl;
560  cout << "Bottom-up send-recv fastest: " << bu_sr_all[smallest] << endl;
561  cout << "Bottom-up compute fastest: " << bu_total[smallest] - (bu_ag_all[smallest] + bu_sr_all[smallest]) << endl;
562  cout << "Top-down allgather fastest: " << td_ag_all[smallest] << endl;
563  cout << "Top-down all2all fastest: " << td_a2a_all[smallest] << endl;
564  cout << "Top-down transposevector fastest: " << td_tv_all[smallest] << endl;
565  cout << "Top-down mergecontributions fastest: " << td_mc_all[smallest] << endl;
566  cout << "Top-down spmsv fastest: " << td_spmv_all[smallest] << endl;
567  cout << "-------------------------------" << endl;
568 
569 
570  cout << "Bottom-up allgather slowest: " << bu_ag_all[largest] << endl;
571  cout << "Bottom-up send-recv slowest: " << bu_sr_all[largest] << endl;
572  cout << "Bottom-up compute slowest: " << bu_total[largest] - (bu_ag_all[largest] + bu_sr_all[largest]) << endl;
573  cout << "Top-down allgather slowest: " << td_ag_all[largest] << endl;
574  cout << "Top-down all2all slowest: " << td_a2a_all[largest] << endl;
575  cout << "Top-down transposevector slowest: " << td_tv_all[largest] << endl;
576  cout << "Top-down mergecontributions slowest: " << td_mc_all[largest] << endl;
577  cout << "Top-down spmsv slowest: " << td_spmv_all[largest] << endl;
578  }
579 #endif
580  ostringstream os;
581  sort(EDGES, EDGES+ITERS);
582  os << "--------------------------" << endl;
583  os << "Min nedges: " << EDGES[0] << endl;
584  os << "First Quartile nedges: " << (EDGES[(ITERS/4)-1] + EDGES[ITERS/4])/2 << endl;
585  os << "Median nedges: " << (EDGES[(ITERS/2)-1] + EDGES[ITERS/2])/2 << endl;
586  os << "Third Quartile nedges: " << (EDGES[(3*ITERS/4) -1 ] + EDGES[3*ITERS/4])/2 << endl;
587  os << "Max nedges: " << EDGES[ITERS-1] << endl;
588  double mean = accumulate( EDGES, EDGES+ITERS, 0.0 )/ ITERS;
589  vector<double> zero_mean(ITERS); // find distances to the mean
590  transform(EDGES, EDGES+ITERS, zero_mean.begin(), bind2nd( minus<double>(), mean ));
591  // self inner-product is the sum of squares
592  double deviation = inner_product( zero_mean.begin(),zero_mean.end(), zero_mean.begin(), 0.0 );
593  deviation = sqrt( deviation / (ITERS-1) );
594  os << "Mean nedges: " << mean << endl;
595  os << "STDDEV nedges: " << deviation << endl;
596  os << "--------------------------" << endl;
597 
598  sort(TIMES,TIMES+ITERS);
599  os << "Min time: " << TIMES[0] << " seconds" << endl;
600  os << "First Quartile time: " << (TIMES[(ITERS/4)-1] + TIMES[ITERS/4])/2 << " seconds" << endl;
601  os << "Median time: " << (TIMES[(ITERS/2)-1] + TIMES[ITERS/2])/2 << " seconds" << endl;
602  os << "Third Quartile time: " << (TIMES[(3*ITERS/4)-1] + TIMES[3*ITERS/4])/2 << " seconds" << endl;
603  os << "Max time: " << TIMES[ITERS-1] << " seconds" << endl;
604  mean = accumulate( TIMES, TIMES+ITERS, 0.0 )/ ITERS;
605  transform(TIMES, TIMES+ITERS, zero_mean.begin(), bind2nd( minus<double>(), mean ));
606  deviation = inner_product( zero_mean.begin(),zero_mean.end(), zero_mean.begin(), 0.0 );
607  deviation = sqrt( deviation / (ITERS-1) );
608  os << "Mean time: " << mean << " seconds" << endl;
609  os << "STDDEV time: " << deviation << " seconds" << endl;
610  os << "--------------------------" << endl;
611 
612  sort(MTEPS, MTEPS+ITERS);
613  os << "Min MTEPS: " << MTEPS[0] << endl;
614  os << "First Quartile MTEPS: " << (MTEPS[(ITERS/4)-1] + MTEPS[ITERS/4])/2 << endl;
615  os << "Median MTEPS: " << (MTEPS[(ITERS/2)-1] + MTEPS[ITERS/2])/2 << endl;
616  os << "Third Quartile MTEPS: " << (MTEPS[(3*ITERS/4)-1] + MTEPS[3*ITERS/4])/2 << endl;
617  os << "Max MTEPS: " << MTEPS[ITERS-1] << endl;
618  transform(MTEPS, MTEPS+ITERS, INVMTEPS, safemultinv<double>()); // returns inf for zero teps
619  double hteps = static_cast<double>(ITERS) / accumulate(INVMTEPS, INVMTEPS+ITERS, 0.0);
620  os << "Harmonic mean of MTEPS: " << hteps << endl;
621  transform(INVMTEPS, INVMTEPS+ITERS, zero_mean.begin(), bind2nd(minus<double>(), 1/hteps));
622  deviation = inner_product( zero_mean.begin(),zero_mean.end(), zero_mean.begin(), 0.0 );
623  deviation = sqrt( deviation / (ITERS-1) ) * (hteps*hteps); // harmonic_std_dev
624  os << "Harmonic standard deviation of MTEPS: " << deviation << endl;
625  SpParHelper::Print(os.str());
626  }
627  }
628  MPI_Finalize();
629  return 0;
630 }
631 