#ifndef __STDC_CONSTANT_MACROS
#define __STDC_CONSTANT_MACROS
#endif
#ifndef __STDC_LIMIT_MACROS
#define __STDC_LIMIT_MACROS
#endif

// lookup tables used by highestbitset() to find the most significant set bit of a 64-bit value
const uint64_t b[] = {0x2ULL, 0xCULL, 0xF0ULL, 0xFF00ULL, 0xFFFF0000ULL, 0xFFFFFFFF00000000ULL};
const unsigned int S[] = {1, 2, 4, 8, 16, 32};
for (i = 5; i >= 0; i--)
template <typename T>
bool from_string(T & t, const string & s, ios_base & (*f)(ios_base &))
{
    istringstream iss(s);
    return !(iss >> f >> t).fail();
}

template <typename PARMAT>
void Symmetricize(PARMAT & A)
// (body elided: makes the input matrix structurally symmetric, e.g. A += A^T)

// parent-selection functor used during the BFS: keep the candidate parent x only
// while the vertex is still undiscovered (current parent y == -1)
int64_t operator()(int64_t x, const int64_t & y) const
{
    return ( y == -1 ) ? x : -1;
}
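// --------------------------------------------------------------------------
// main(): driver for a Graph500-style direction-optimizing BFS benchmark.
// It generates an R-MAT graph (Kernel #1), then runs repeated breadth-first
// searches from randomly chosen start vertices (Kernel #2), switching between
// top-down (sparse SpMV) and bottom-up (bitmap) steps, and finally reports
// per-phase timings and TEPS statistics.
// --------------------------------------------------------------------------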
int main(int argc, char* argv[])

#ifdef THREADED
int provided, flag, claimed;
MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &provided);
MPI_Is_thread_main(&flag);
MPI_Query_thread(&claimed);
if (claimed != provided)
    // ... (warning print elided) ...
#else
MPI_Init(&argc, &argv);
#endif

int nprocs, myrank;
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);

// usage message, printed when no <Scale> argument is supplied
cout << "Usage: ./dobfs <Scale>" << endl;
cout << "Example: ./dobfs 25" << endl;
PSpMat_s32p64 ALocalT;
shared_ptr<CommGrid> fullWorld;
fullWorld.reset( new CommGrid(MPI_COMM_WORLD, 0, 0) );

scale = static_cast<unsigned>(atoi(argv[1]));
outs << "Forcing scale to : " << scale << endl;

SpParHelper::Print("Using fast vertex permutations; skipping edge permutations (like v2.1)\n");
double initiator[4] = {.57, .19, .19, .05};

double t01 = MPI_Wtime();
// ... (edge list generation) ...
tinfo << "Generation took " << t02 - t01 << " seconds" << endl;

MPI_Barrier(MPI_COMM_WORLD);
double t1 = MPI_Wtime();

MPI_Barrier(MPI_COMM_WORLD);
double redts = MPI_Wtime();
G->Reduce(degrees, Row, plus<int64_t>(), static_cast<int64_t>(0));
MPI_Barrier(MPI_COMM_WORLD);
double redtf = MPI_Wtime();

ostringstream redtimeinfo;
redtimeinfo << "Calculated degrees in " << redtf - redts << " seconds" << endl;

ostringstream loopinfo;
loopinfo << "Converted to Boolean and removed " << removed << " loops" << endl;
A.Reduce(*ColSums, Column, plus<int64_t>(), static_cast<int64_t>(0));
A.Reduce(*RowSums, Row, plus<int64_t>(), static_cast<int64_t>(0));
ColSums->EWiseApply(*RowSums, plus<int64_t>());

nonisov = ColSums->FindInds(bind2nd(greater<int64_t>(), 0));

A(nonisov, nonisov, true);
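// ColSums now holds the combined in+out degree of every vertex; FindInds keeps
// only vertices of nonzero degree, and indexing A with nonisov on both sides
// discards isolated vertices so they are never drawn as BFS start vertices.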
tinfo << "Threading activated with " << cblas_splits << " threads" << endl;

MPI_Barrier(MPI_COMM_WORLD);
double t2 = MPI_Wtime();

ostringstream k1timeinfo;
k1timeinfo << (t2 - t1) - (redtf - redts) << " seconds elapsed for Kernel #1" << endl;

lbout << "Load balance: " << balance << endl;

MPI_Barrier(MPI_COMM_WORLD);

degrees = degrees(nonisov);    // align the degree vector with the isolated-vertex-free matrix

double nver = (double) degrees.TotalLength();

uint64_t seed = 1383098845;    // fixed seed for deterministic, reproducible runs
uint64_t seed = time(NULL);    // otherwise: seed from the clock
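// Draw ITERS start-vertex candidates: each value from the random generator M
// (assumed uniform in [0,1)) is scaled by the vertex count and truncated to an
// index, then stored in the distributed candidate vector Cands.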
vector<double> loccands(ITERS);
vector<int64_t> loccandints(ITERS);

for(int i = 0; i < ITERS; ++i)
    loccands[i] = M.rand();
copy(loccands.begin(), loccands.end(), ostream_iterator<double>(cout, " ")); cout << endl;
transform(loccands.begin(), loccands.end(), loccands.begin(), bind2nd(multiplies<double>(), nver));

for(int i = 0; i < ITERS; ++i)
    loccandints[i] = static_cast<int64_t>(loccands[i]);
copy(loccandints.begin(), loccandints.end(), ostream_iterator<int64_t>(cout, " ")); cout << endl;

for(int i = 0; i < ITERS; ++i)
    Cands.SetElement(i, loccandints[i]);
for(int trials = 0; trials < MAXTRIALS; trials++)

MPI_Pcontrol(1, "BFS");

for(int i = 0; i < ITERS; ++i)
ostringstream devout;
devout.setf(ios::fixed);

MPI_Barrier(MPI_COMM_WORLD);
double t1 = MPI_Wtime();

int64_t up_cutoff = num_edges / 20;
int64_t down_cutoff = (((double) num_nodes) * ((double) num_nodes)) / ((double) num_edges * 12.0);
devout << "param " << num_nodes << " vertices with " << num_edges << " edges" << endl;
devout << up_cutoff << " up and " << down_cutoff << " down" << endl;

int64_t fringe_size = fringe.getnnz();

double pred_start = MPI_Wtime();
// ...
double pred_end = MPI_Wtime();
devout << " s" << setw(15) << pred << setw(15) << setprecision(5) << (pred_end - pred_start) << endl;
while (fringe_size > 0)
{
    if ((pred > up_cutoff) && (last_fringe_size < fringe_size))
    {
        // frontier is large and still growing: convert to bitmap form for bottom-up steps
        MPI_Barrier(MPI_COMM_WORLD);
        double conv_start = MPI_Wtime();
        done.LoadVec(parents);
        bm_fringe.LoadFromSpVec(fringe);
        double conv_end = MPI_Wtime();
        devout << " c" << setw(30) << setprecision(5) << (conv_end - conv_start) << endl;

        while (fringe_size > 0)
        {
            double step_start = MPI_Wtime();
            BottomUpStep(ALocalT, fringe, bm_fringe, parents, done, starts);
            double step_end = MPI_Wtime();
            devout << setw(2) << iterations << "u" << setw(15) << fringe_size << setprecision(5) << setw(15) << (step_end - step_start) << endl;

            last_fringe_size = fringe_size;
            fringe_size = bm_fringe.GetNumSet();
            if ((fringe_size < down_cutoff) && (last_fringe_size > fringe_size))
            {
                // frontier is shrinking again: convert the bitmap back to a sparse vector
                conv_start = MPI_Wtime();
                bm_fringe.UpdateSpVec(fringe);
                conv_end = MPI_Wtime();
                devout << " c" << setw(30) << setprecision(5) << (conv_end - conv_start) << endl;
                bottomup_convert += (conv_end - conv_start);
            }
        }
    }

    // ... (top-down step) ...
    double step_start = MPI_Wtime();
    fringe.setNumToInd();
    fringe = SpMV(Aeff, fringe, optbuf);
    double ewise_start = MPI_Wtime();
    // ...
    double step_end = MPI_Wtime();
    devout << setw(2) << iterations << "d" << setw(15) << fringe.getnnz() << setw(15) << setprecision(5) << (step_end - step_start) << endl;
    cblas_ewisemulttime += (step_end - ewise_start);

    pred_start = MPI_Wtime();
    // ...
    pred_end = MPI_Wtime();
    devout << " s" << setw(15) << pred << setw(15) << setprecision(5) << (pred_end - pred_start) << endl;
    cblas_ewisemulttime += (pred_end - pred_start);

    last_fringe_size = fringe_size;
    fringe_size = fringe.getnnz();
}
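// The loop above ends when the frontier is empty; t2 - t1 below therefore
// times one complete BFS from a single start vertex.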
MPI_Barrier(MPI_COMM_WORLD);
double t2 = MPI_Wtime();

ostringstream outnew;
outnew << i << "th starting vertex was " << Cands[i] << endl;
outnew << "Number of iterations: " << iterations << endl;
outnew << "Number of vertices found: " << nverts << endl;
outnew << "Number of edges traversed: " << nedges << endl;
outnew << "BFS time: " << t2 - t1 << " seconds" << endl;
outnew << "MTEPS: " << static_cast<double>(nedges) / (t2 - t1) / 1000000.0 << endl;

MTEPS[i] = static_cast<double>(nedges) / (t2 - t1) / 1000000.0;

MPI_Pcontrol(-1, "BFS");
double *bu_total, *bu_ag_all, *bu_sr_all, *bu_convert, *td_ag_all, *td_a2a_all, *td_tv_all, *td_mc_all, *td_spmv_all, *td_ewm_all;

bu_total    = new double[nprocs];
bu_ag_all   = new double[nprocs];
bu_sr_all   = new double[nprocs];
bu_convert  = new double[nprocs];
td_ag_all   = new double[nprocs];
td_a2a_all  = new double[nprocs];
td_tv_all   = new double[nprocs];
td_mc_all   = new double[nprocs];
td_spmv_all = new double[nprocs];
td_ewm_all  = new double[nprocs];
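// One MPI_Gather per counter: each rank contributes a single double and rank 0
// receives an nprocs-long array (one slot per rank).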
MPI_Gather(&bottomup_convert,    1, MPI_DOUBLE, bu_convert,  1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
MPI_Gather(&bottomup_total,      1, MPI_DOUBLE, bu_total,    1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
MPI_Gather(&bottomup_allgather,  1, MPI_DOUBLE, bu_ag_all,   1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
MPI_Gather(&bottomup_sendrecv,   1, MPI_DOUBLE, bu_sr_all,   1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
MPI_Gather(&cblas_allgathertime, 1, MPI_DOUBLE, td_ag_all,   1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
MPI_Gather(&cblas_alltoalltime,  1, MPI_DOUBLE, td_a2a_all,  1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
MPI_Gather(&cblas_transvectime,  1, MPI_DOUBLE, td_tv_all,   1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
MPI_Gather(&cblas_mergeconttime, 1, MPI_DOUBLE, td_mc_all,   1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
MPI_Gather(&cblas_localspmvtime, 1, MPI_DOUBLE, td_spmv_all, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
MPI_Gather(&cblas_ewisemulttime, 1, MPI_DOUBLE, td_ewm_all,  1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
double bu_local_total = 0;
double bu_update_total = 0;
double bu_rotate_total = 0;

MPI_Allreduce(&bu_local,  &bu_local_total,  1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
MPI_Allreduce(&bu_update, &bu_update_total, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
MPI_Allreduce(&bu_rotate, &bu_rotate_total, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);

cout << "BU Local: "  << bu_local_total/nprocs  << endl;
cout << "BU Update: " << bu_update_total/nprocs << endl;
cout << "BU Rotate: " << bu_rotate_total/nprocs << endl;

vector<double> total_time(nprocs, 0);
for(int i = 0; i < nprocs; ++i)
    total_time[i] += bu_total[i] + bu_convert[i] + td_ag_all[i] + td_a2a_all[i] + td_tv_all[i] + td_mc_all[i] + td_spmv_all[i] + td_ewm_all[i];
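// permutation is assumed to come from find_order(total_time), i.e. rank indices
// ordered by their accounted total time, so smallest/median/largest pick the
// fastest, median and slowest ranks for the per-rank breakdowns printed below.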
size_t smallest = permutation[0];
size_t largest = permutation[nprocs-1];
size_t median = permutation[nprocs/2];

cout << "TOTAL (accounted) MEAN: " << accumulate(total_time.begin(), total_time.end(), 0.0) / static_cast<double>(nprocs) << endl;
cout << "TOTAL (accounted) MAX: " << total_time[0] << endl;
cout << "TOTAL (accounted) MIN: " << total_time[nprocs-1] << endl;
cout << "TOTAL (accounted) MEDIAN: " << total_time[nprocs/2] << endl;
cout << "-------------------------------" << endl;
cout << "Convert median: " << bu_convert[median] << endl;
cout << "Bottom-up allgather median: " << bu_ag_all[median] << endl;
cout << "Bottom-up send-recv median: " << bu_sr_all[median] << endl;
cout << "Bottom-up compute median: " << bu_total[median] - (bu_ag_all[median] + bu_sr_all[median]) << endl;
cout << "Top-down allgather median: " << td_ag_all[median] << endl;
cout << "Top-down all2all median: " << td_a2a_all[median] << endl;
cout << "Top-down transposevector median: " << td_tv_all[median] << endl;
cout << "Top-down mergecontributions median: " << td_mc_all[median] << endl;
cout << "Top-down spmsv median: " << td_spmv_all[median] << endl;
cout << "-------------------------------" << endl;
cout << "Convert MEAN: " << accumulate(bu_convert, bu_convert+nprocs, 0.0) / static_cast<double>(nprocs) << endl;
cout << "Bottom-up total MEAN: " << accumulate(bu_total, bu_total+nprocs, 0.0) / static_cast<double>(nprocs) << endl;
cout << "Bottom-up allgather MEAN: " << accumulate(bu_ag_all, bu_ag_all+nprocs, 0.0) / static_cast<double>(nprocs) << endl;
cout << "Bottom-up send-recv MEAN: " << accumulate(bu_sr_all, bu_sr_all+nprocs, 0.0) / static_cast<double>(nprocs) << endl;
cout << "Top-down allgather MEAN: " << accumulate(td_ag_all, td_ag_all+nprocs, 0.0) / static_cast<double>(nprocs) << endl;
cout << "Top-down all2all MEAN: " << accumulate(td_a2a_all, td_a2a_all+nprocs, 0.0) / static_cast<double>(nprocs) << endl;
cout << "Top-down transposevector MEAN: " << accumulate(td_tv_all, td_tv_all+nprocs, 0.0) / static_cast<double>(nprocs) << endl;
cout << "Top-down mergecontributions MEAN: " << accumulate(td_mc_all, td_mc_all+nprocs, 0.0) / static_cast<double>(nprocs) << endl;
cout << "Top-down spmsv MEAN: " << accumulate(td_spmv_all, td_spmv_all+nprocs, 0.0) / static_cast<double>(nprocs) << endl;
cout << "-------------------------------" << endl;
cout << "Bottom-up allgather fastest: " << bu_ag_all[smallest] << endl;
cout << "Bottom-up send-recv fastest: " << bu_sr_all[smallest] << endl;
cout << "Bottom-up compute fastest: " << bu_total[smallest] - (bu_ag_all[smallest] + bu_sr_all[smallest]) << endl;
cout << "Top-down allgather fastest: " << td_ag_all[smallest] << endl;
cout << "Top-down all2all fastest: " << td_a2a_all[smallest] << endl;
cout << "Top-down transposevector fastest: " << td_tv_all[smallest] << endl;
cout << "Top-down mergecontributions fastest: " << td_mc_all[smallest] << endl;
cout << "Top-down spmsv fastest: " << td_spmv_all[smallest] << endl;
cout << "-------------------------------" << endl;

cout << "Bottom-up allgather slowest: " << bu_ag_all[largest] << endl;
cout << "Bottom-up send-recv slowest: " << bu_sr_all[largest] << endl;
cout << "Bottom-up compute slowest: " << bu_total[largest] - (bu_ag_all[largest] + bu_sr_all[largest]) << endl;
cout << "Top-down allgather slowest: " << td_ag_all[largest] << endl;
cout << "Top-down all2all slowest: " << td_a2a_all[largest] << endl;
cout << "Top-down transposevector slowest: " << td_tv_all[largest] << endl;
cout << "Top-down mergecontributions slowest: " << td_mc_all[largest] << endl;
cout << "Top-down spmsv slowest: " << td_spmv_all[largest] << endl;
sort(EDGES, EDGES+ITERS);
os << "--------------------------" << endl;
os << "Min nedges: " << EDGES[0] << endl;
os << "First Quartile nedges: " << (EDGES[(ITERS/4)-1] + EDGES[ITERS/4]) / 2 << endl;
os << "Median nedges: " << (EDGES[(ITERS/2)-1] + EDGES[ITERS/2]) / 2 << endl;
os << "Third Quartile nedges: " << (EDGES[(3*ITERS/4)-1] + EDGES[3*ITERS/4]) / 2 << endl;
os << "Max nedges: " << EDGES[ITERS-1] << endl;

double mean = accumulate(EDGES, EDGES+ITERS, 0.0) / ITERS;
vector<double> zero_mean(ITERS);
transform(EDGES, EDGES+ITERS, zero_mean.begin(), bind2nd(minus<double>(), mean));   // x_i - mean
double deviation = inner_product(zero_mean.begin(), zero_mean.end(), zero_mean.begin(), 0.0);
deviation = sqrt(deviation / (ITERS-1));   // sample standard deviation

os << "Mean nedges: " << mean << endl;
os << "STDDEV nedges: " << deviation << endl;
os << "--------------------------" << endl;
sort(TIMES, TIMES+ITERS);
os << "Min time: " << TIMES[0] << " seconds" << endl;
os << "First Quartile time: " << (TIMES[(ITERS/4)-1] + TIMES[ITERS/4]) / 2 << " seconds" << endl;
os << "Median time: " << (TIMES[(ITERS/2)-1] + TIMES[ITERS/2]) / 2 << " seconds" << endl;
os << "Third Quartile time: " << (TIMES[(3*ITERS/4)-1] + TIMES[3*ITERS/4]) / 2 << " seconds" << endl;
os << "Max time: " << TIMES[ITERS-1] << " seconds" << endl;

mean = accumulate(TIMES, TIMES+ITERS, 0.0) / ITERS;
transform(TIMES, TIMES+ITERS, zero_mean.begin(), bind2nd(minus<double>(), mean));
deviation = inner_product(zero_mean.begin(), zero_mean.end(), zero_mean.begin(), 0.0);
deviation = sqrt(deviation / (ITERS-1));

os << "Mean time: " << mean << " seconds" << endl;
os << "STDDEV time: " << deviation << " seconds" << endl;
os << "--------------------------" << endl;
sort(MTEPS, MTEPS+ITERS);
os << "Min MTEPS: " << MTEPS[0] << endl;
os << "First Quartile MTEPS: " << (MTEPS[(ITERS/4)-1] + MTEPS[ITERS/4]) / 2 << endl;
os << "Median MTEPS: " << (MTEPS[(ITERS/2)-1] + MTEPS[ITERS/2]) / 2 << endl;
os << "Third Quartile MTEPS: " << (MTEPS[(3*ITERS/4)-1] + MTEPS[3*ITERS/4]) / 2 << endl;
os << "Max MTEPS: " << MTEPS[ITERS-1] << endl;

double hteps = static_cast<double>(ITERS) / accumulate(INVMTEPS, INVMTEPS+ITERS, 0.0);
os << "Harmonic mean of MTEPS: " << hteps << endl;
transform(INVMTEPS, INVMTEPS+ITERS, zero_mean.begin(), bind2nd(minus<double>(), 1/hteps));
deviation = inner_product(zero_mean.begin(), zero_mean.end(), zero_mean.begin(), 0.0);
deviation = sqrt(deviation / (ITERS-1)) * (hteps*hteps);
os << "Harmonic standard deviation of MTEPS: " << deviation << endl;