#define _OPENMP // should be defined before any CombBLAS header is included

// Symmetricize: make the input matrix structurally symmetric before ordering
template <typename PARMAT>
void Symmetricize(PARMAT & A)
// Vertices compare by BFS order (parent label) first, then by degree, so ties break by ascending degree.
friend bool operator<  (const VertexType & vtx1, const VertexType & vtx2)
{
    if(vtx1.order==vtx2.order) return vtx1.degree <  vtx2.degree;
    else return vtx1.order < vtx2.order;
}
friend bool operator<= (const VertexType & vtx1, const VertexType & vtx2)
{
    if(vtx1.order==vtx2.order) return vtx1.degree <= vtx2.degree;
    else return vtx1.order < vtx2.order;
}
friend bool operator>  (const VertexType & vtx1, const VertexType & vtx2)
{
    if(vtx1.order==vtx2.order) return vtx1.degree >  vtx2.degree;
    else return vtx1.order > vtx2.order;
}
friend bool operator>= (const VertexType & vtx1, const VertexType & vtx2)
{
    if(vtx1.order==vtx2.order) return vtx1.degree >= vtx2.degree;
    else return vtx1.order > vtx2.order;
}
friend ostream& operator<<(ostream& os, const VertexType & vertex)
{
    os << "(" << vertex.order << "," << vertex.degree << ")";
    return os;
}
static T_promote id() { return -1; };

static T_promote add(const T_promote & arg1, const T_promote & arg2)
{
    return std::min(arg1, arg2);
}

static T_promote multiply(const bool & arg1, const T_promote & arg2)
{
    return arg2;
}

static void axpy(bool a, const T_promote & x, T_promote & y)
{
    y = add(y, multiply(a, x));
}
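// With this semiring, SpMV<SelectMinSR>(A, fringe, fringe, ...) performs one BFS step:
// multiply() forwards the label carried by a frontier vertex along each boolean edge, and
// add() keeps the minimum of all labels arriving at a vertex, so every newly discovered
// vertex ends up holding the smallest label among its already-labeled neighbors.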
// Globally sort the current BFS level by (parent label, degree) and give it consecutive labels
FullyDistSpVec<int64_t, int64_t> getOrder(FullyDistSpVec<int64_t, VertexType> & fringeRow, int64_t startLabel, int64_t endLabel)

int myrank, nprocs;
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

vector<VertexType> lnum = fringeRow.GetLocalNum();   // local (order, degree) values of the frontier
vector<int64_t> lind = fringeRow.GetLocalInd();      // their local indices
int64_t ploclen = lnum.size();

int64_t nparents = endLabel - startLabel + 1;   // number of parent labels in the previous level
int64_t perproc = nparents / nprocs;            // parent labels are block-distributed over processes

int * rdispls = new int[nprocs+1];
int * recvcnt = new int[nprocs];
int * sendcnt = new int[nprocs]();   // zero-initialized
int * sdispls = new int[nprocs+1];

MPI_Barrier(MPI_COMM_WORLD);
#pragma omp parallel for
for(int64_t k=0; k < ploclen; ++k)
{
    int64_t temp = lnum[k].order - startLabel;   // offset of the parent label within [startLabel, endLabel]
    int owner;
    if(perproc==0 || temp/perproc > nprocs-1)
        owner = nprocs-1;                        // clamp the remainder onto the last process
    else
        owner = temp/perproc;
    __sync_fetch_and_add(&sendcnt[owner], 1);    // atomic: threads count destinations concurrently
}
MPI_Barrier(MPI_COMM_WORLD);
MPI_Alltoall(sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, MPI_COMM_WORLD);

sdispls[0] = 0;
rdispls[0] = 0;
for(int i=0; i<nprocs; ++i)
{
    sdispls[i+1] = sdispls[i] + sendcnt[i];
    rdispls[i+1] = rdispls[i] + recvcnt[i];
}
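// This is the usual personalized all-to-all pattern: every process first announces how many
// entries it will send to each peer (MPI_Alltoall on the counts), both sides turn the counts
// into offsets by prefix sums, and the frontier entries themselves are then exchanged in one
// variable-sized all-to-all (MPI_Alltoallv) driven by these counts and displacements.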
int * count = new int[nprocs]();   // per-destination fill cursor

#pragma omp parallel for
for(int64_t i=0; i < ploclen; ++i)
{
    int64_t temp = lnum[i].order - startLabel;
    int owner;
    if(perproc==0 || temp/perproc > nprocs-1)
        owner = nprocs-1;
    else
        owner = temp/perproc;
    int64_t id;
#ifdef _OPENMP
    id = sdispls[owner] + __sync_fetch_and_add(&count[owner], 1);
#else
    id = sdispls[owner] + count[owner]++;
#endif
    datbuf1[id] = lnum[i].order;                      // parent label
    datbuf2[id] = lnum[i].degree;                     // degree of the frontier vertex
    indbuf[id]  = lind[i] + fringeRow.LengthUntil();  // global index of the frontier vertex
}
int64_t totrecv = rdispls[nprocs];

tuple<int64_t, int64_t, int64_t> * tosort = static_cast<tuple<int64_t, int64_t, int64_t> *>
        (::operator new (sizeof(tuple<int64_t, int64_t, int64_t>) * totrecv));
#pragma omp parallel for
for(int i=0; i<totrecv; ++i)
{
    // (parent label, degree, global vertex index)
    tosort[i] = make_tuple(recvdatbuf1[i], recvdatbuf2[i], recvindbuf[i]);
}

#if defined(GNU_PARALLEL) && defined(_OPENMP)
__gnu_parallel::sort(tosort, tosort + totrecv);
#else
std::sort(tosort, tosort + totrecv);
#endif
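// Sorting the received (parent label, degree, vertex index) triples lexicographically realizes the
// Cuthill-McKee visiting rule: children of earlier-labeled parents come first, and among children
// of the same parent, vertices of smaller degree come first.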
int * sendcnt1 = new int[nprocs]();

#pragma omp parallel for
for(int64_t k=0; k < totrecv; ++k)
{
    int64_t locind;
    int owner = fringeRow.Owner(get<2>(tosort[k]), locind);   // process owning this vertex in the distributed vector
    __sync_fetch_and_add(&sendcnt1[owner], 1);
}

MPI_Alltoall(sendcnt1, 1, MPI_INT, recvcnt, 1, MPI_INT, MPI_COMM_WORLD);

sdispls[0] = 0;
rdispls[0] = 0;
for(int i=0; i<nprocs; ++i)
{
    sdispls[i+1] = sdispls[i] + sendcnt1[i];
    rdispls[i+1] = rdispls[i] + recvcnt[i];
}
vector<int64_t> sortperproc(nprocs);
sortperproc[myrank] = totrecv;
// (the per-process totrecv counts are gathered from all ranks before the prefix sum below)

vector<int64_t> disp(nprocs+1);
for(int i=0; i<nprocs; ++i)
{
    disp[i+1] = disp[i] + sortperproc[i];
}
count = new int[nprocs]();   // reset the per-destination cursors for the second exchange

#pragma omp parallel for
for(int64_t i=0; i < totrecv; ++i)
{
    int64_t locind, id;
    int owner = fringeRow.Owner(get<2>(tosort[i]), locind);
#ifdef _OPENMP
    id = sdispls[owner] + __sync_fetch_and_add(&count[owner], 1);
#else
    id = sdispls[owner] + count[owner]++;
#endif
    datbuf[id] = i + disp[myrank] + endLabel + 1;   // new label of the i-th vertex in the global sorted order
}
totrecv = rdispls[nprocs];
vector<int64_t> recvdatbuf3(totrecv);   // new labels arriving back at the vertex owners
vector<int64_t> recvindbuf3(totrecv);   // the matching global vertex indices

DeleteAll(recvindbuf, recvdatbuf1, recvdatbuf2);
DeleteAll(sdispls, rdispls, sendcnt, sendcnt1, recvcnt);
::operator delete(tosort);
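// recvindbuf3 and recvdatbuf3 now pair each locally owned vertex with its new label;
// getOrder assembles these pairs into the FullyDistSpVec<int64_t, int64_t> of labels it returns.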
// Label one connected component level by level, starting the numbering at startOrder
template <typename PARMAT>
void RCMOrder(PARMAT & A, int64_t source, FullyDistVec<int64_t, int64_t> & order, int64_t startOrder,
              FullyDistVec<int64_t, int64_t> degrees, PreAllocatedSPA<int64_t> & SPA)

int myrank;
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
double tSpMV=0, tOrder, tOther, tSpMV1, tsort=0, tsort1;
tOrder = MPI_Wtime();

fringe.SetElement(source, startOrder);   // the source vertex receives the first label of this component
int64_t curOrder = startOrder + 1;

if(myrank == 0) cout << " Computing the RCM ordering:" << endl;

int64_t startLabel = startOrder;
int64_t endLabel = startOrder;           // initially the labeled level contains only the source
while(startLabel <= endLabel)
{
    // load each frontier vertex's assigned label so that SpMV propagates parent labels
    fringe = EWiseApply<int64_t>(fringe, order,

    tSpMV1 = MPI_Wtime();
    SpMV<SelectMinSR>(A, fringe, fringe, false, SPA);
    tSpMV += MPI_Wtime() - tSpMV1;

    tsort1 = MPI_Wtime();
    // the new level is globally sorted and labeled by getOrder(), yielding levelOrder
    tsort += MPI_Wtime() - tsort1;

    order.Set(levelOrder);               // commit the labels of the newly discovered level
    startLabel = endLabel + 1;
    endLabel  += fringe.getnnz();        // the next level occupies the following block of labels
}

tOrder = MPI_Wtime() - tOrder;
tOther = tOrder - tSpMV - tsort;
cout << " Total time: " << tOrder << " seconds [SpMV: " << tSpMV
     << ", sorting: " << tsort << ", other: " << tOther << "]" << endl << endl;
// Find a pseudo-peripheral vertex of the component containing unvisitedVertices:
// repeatedly run a BFS from a minimum-degree vertex of the deepest level until the
// number of BFS levels stops growing.
template <typename PARMAT>
int64_t PseudoPeripheralVertex(PARMAT & A, FullyDistSpVec<int64_t, pair<int64_t, int64_t>> & unvisitedVertices,
                               FullyDistVec<int64_t, int64_t> degrees, PreAllocatedSPA<int64_t> & SPA)

int myrank;
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
double tpvSpMV=0, tpvOther=0;
double tstart = MPI_Wtime();
int64_t prevLevel = -1, curLevel = 0;

// start from an unvisited vertex of minimum degree; the values are (degree, vertex id) pairs
pair<int64_t, int64_t> mindegree_vertex = unvisitedVertices.Reduce(
        minimum<pair<int64_t, int64_t> >(), make_pair(LLONG_MAX, (int64_t) -1));
int64_t source = mindegree_vertex.second;

double tSpMV=0, tOther=0, tSpMV1;
if(myrank == 0) cout << " Computing a pseudo-peripheral vertex:" << endl;
while(curLevel > prevLevel)   // iterate until another BFS no longer increases the number of levels
{
    double tItr = MPI_Wtime();
    prevLevel = curLevel;

    fringe.SetElement(source, 1);   // (re)start a BFS from the current candidate source

    while(fringe.getnnz() > 0)      // level-synchronous BFS via SpMV over the boolean adjacency matrix
        tSpMV1 = MPI_Wtime();
        SpMV<SelectMinSR>(A, fringe, fringe, false, SPA);
        tSpMV += MPI_Wtime() - tSpMV1;

    curLevel = curLevel - 2;

    fringe = level.Find(curLevel);   // vertices in the deepest BFS level
    fringe_degree = EWiseApply<pair<int64_t, int64_t>>(fringe, degrees,

    mindegree_vertex = fringe_degree.Reduce(
            minimum<pair<int64_t, int64_t> >(), make_pair(LLONG_MAX, (int64_t) -1));
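    // Heuristic: restarting the BFS from a minimum-degree vertex of the deepest level tends to
    // increase the number of levels, so the loop continues until the level count stops growing;
    // the vertex it finally settles on approximates a peripheral (maximum-eccentricity) vertex.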
    if (curLevel > prevLevel)
        source = mindegree_vertex.second;   // the BFS got deeper: restart from this minimum-degree vertex

    cout << " iteration: " << iterations << " BFS levels: " << curLevel
         << " Time: " << MPI_Wtime() - tItr << " seconds." << endl;
}
// keep only the vertices this BFS never reached (level == -1); they belong to other components
unvisitedVertices = EWiseApply<pair<int64_t, int64_t>>(unvisitedVertices, level,
        [](pair<int64_t, int64_t> vtx, int64_t visited){ return vtx; },
        [](pair<int64_t, int64_t> vtx, int64_t visited){ return visited == -1; },

tOther = MPI_Wtime() - tstart - tSpMV;

cout << " vertex " << source << " is a pseudo peripheral vertex" << endl;
cout << " pseudo diameter: " << curLevel << ", #iterations: " << iterations << endl;
cout << " Total time: " << MPI_Wtime() - tstart << " seconds [SpMV: " << tSpMV
     << ", other: " << tOther << "]" << endl << endl;
// Driver: order the whole graph, one connected component at a time
template <typename PARMAT>
FullyDistVec<int64_t, int64_t> RCM(PARMAT & A, FullyDistVec<int64_t, int64_t> degrees, PreAllocatedSPA<int64_t> & SPA)

int myrank, nprocs;
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

// pair every still-unvisited vertex with its degree
unvisitedVertices = EWiseApply<pair<int64_t, int64_t>>(unvisited, degrees,

while(numUnvisited > 0)
{
    if(myrank==0) cout << "Connected component: " << cc++ << endl;

    // PseudoPeripheralVertex() supplies the BFS source for the current component
    int64_t curOrder = A.getnrow() - numUnvisited;   // labels already assigned to earlier components
    RCMOrder(A, source, rcmorder, curOrder, degrees, SPA);

    numUnvisited = unvisitedVertices.getnnz();       // vertices remaining in not-yet-ordered components
}
double *td_ag_all, *td_a2a_all, *td_tv_all, *td_mc_all, *td_spmv_all;

td_ag_all   = new double[nprocs];
td_a2a_all  = new double[nprocs];
td_tv_all   = new double[nprocs];
td_mc_all   = new double[nprocs];
td_spmv_all = new double[nprocs];

// gather the per-process CombBLAS timers on rank 0
MPI_Gather(&cblas_allgathertime, 1, MPI_DOUBLE, td_ag_all,   1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
MPI_Gather(&cblas_alltoalltime,  1, MPI_DOUBLE, td_a2a_all,  1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
MPI_Gather(&cblas_transvectime,  1, MPI_DOUBLE, td_tv_all,   1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
MPI_Gather(&cblas_mergeconttime, 1, MPI_DOUBLE, td_mc_all,   1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
MPI_Gather(&cblas_localspmvtime, 1, MPI_DOUBLE, td_spmv_all, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
double td_ag_all1=0, td_a2a_all1=0, td_tv_all1=0, td_mc_all1=0, td_spmv_all1=0;

vector<double> total_time(nprocs, 0);
for(int i=0; i<nprocs; ++i)
    total_time[i] += td_ag_all[i] + td_a2a_all[i] + td_tv_all[i] + td_mc_all[i] + td_spmv_all[i];

vector<pair<double, int>> tosort;
for(int i=0; i<nprocs; i++) tosort.push_back(make_pair(total_time[i], i));
sort(tosort.begin(), tosort.end());

vector<int> permutation(nprocs);
for(int i=0; i<nprocs; i++) permutation[i] = tosort[i].second;

int smallest = permutation[0];
int largest = permutation[nprocs-1];
int median = permutation[nprocs/2];
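// Ranks are sorted by their total accounted time: 'smallest', 'largest' and 'median' are the
// ranks with the lowest, highest and median totals, and the component timings reported below
// are taken from these representative ranks.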
cout << "------ Detail timing --------" << endl;
cout << "TOTAL (accounted) MEAN: " << accumulate(total_time.begin(), total_time.end(), 0.0) / static_cast<double>(nprocs) << endl;
cout << "TOTAL (accounted) MAX: " << total_time[largest] << endl;
cout << "TOTAL (accounted) MIN: " << total_time[smallest] << endl;
cout << "TOTAL (accounted) MEDIAN: " << total_time[median] << endl;
cout << "-------------------------------" << endl;
cout << "allgather median: " << td_ag_all[median] << endl;
cout << "all2all median: " << td_a2a_all[median] << endl;
cout << "transposevector median: " << td_tv_all[median] << endl;
cout << "mergecontributions median: " << td_mc_all[median] << endl;
cout << "spmsv median: " << td_spmv_all[median] << endl;
cout << "-------------------------------" << endl;

td_ag_all1 = td_ag_all[median];
td_a2a_all1 = td_a2a_all[median];
td_tv_all1 = td_tv_all[median];
td_mc_all1 = td_mc_all[median];
td_spmv_all1 = td_spmv_all[median];
cout << "allgather fastest: " << td_ag_all[smallest] << endl;
cout << "all2all fastest: " << td_a2a_all[smallest] << endl;
cout << "transposevector fastest: " << td_tv_all[smallest] << endl;
cout << "mergecontributions fastest: " << td_mc_all[smallest] << endl;
cout << "spmsv fastest: " << td_spmv_all[smallest] << endl;
cout << "-------------------------------" << endl;

cout << "allgather slowest: " << td_ag_all[largest] << endl;
cout << "all2all slowest: " << td_a2a_all[largest] << endl;
cout << "transposevector slowest: " << td_tv_all[largest] << endl;
cout << "mergecontributions slowest: " << td_mc_all[largest] << endl;
cout << "spmsv slowest: " << td_spmv_all[largest] << endl;
cout << "summary statistics" << endl;
int main(int argc, char* argv[])

int provided;
MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &provided);
if (provided < MPI_THREAD_SERIALIZED)
{
    printf("ERROR: The MPI library does not have MPI_THREAD_SERIALIZED support\n");
    MPI_Abort(MPI_COMM_WORLD, 1);
}
int nprocs, myrank;
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
cout << "Usage: ./rcm <rmat|er|input> <scale|filename> -permute -savercm" << endl;
cout << "Example with a user supplied matrix:" << endl;
cout << "  mpirun -np 4 ./rcm input a.mtx" << endl;
cout << "Example with a user supplied matrix (pre-permute the input matrix for load balance):" << endl;
cout << "  mpirun -np 4 ./rcm input a.mtx -permute" << endl;
cout << "Example with a user supplied matrix (pre-permute for load balance and save the RCM order to input_file_name.rcm.txt):" << endl;
cout << "  mpirun -np 4 ./rcm input a.mtx -permute -savercm" << endl;
cout << "Example with an RMAT matrix: mpirun -np 4 ./rcm rmat 20" << endl;
cout << "Example with an Erdos-Renyi matrix: mpirun -np 4 ./rcm er 20" << endl;
bool randpermute = false;
bool savercm = false;
for (int i = 1; i < argc; i++)
{
    if (strcmp(argv[i], "-permute") == 0)
        randpermute = true;
    if (strcmp(argv[i], "-savercm") == 0)
        savercm = true;
}
if(string(argv[1]) == string("input"))   // read the matrix from a Matrix Market file
{
    tinfo << "**** Reading input matrix: " << filename << " ******* " << endl;
    base_filename = filename.substr(filename.find_last_of("/\\") + 1);
    SpParHelper::Print(tinfo.str());

    double t01 = MPI_Wtime();
    // the matrix is read (ParallelReadMM) and symmetricized between these two timestamps
    double t02 = MPI_Wtime();

    tinfo << "matrix read and symmetricized" << endl;
    tinfo << "Reader took " << t02 - t01 << " seconds" << endl;
    SpParHelper::Print(tinfo.str());
}
else if(string(argv[1]) == string("rmat"))
    scale = static_cast<unsigned>(atoi(argv[2]));
    double initiator[4] = {.57, .19, .19, .05};   // Graph500 RMAT initiator probabilities
    MPI_Barrier(MPI_COMM_WORLD);
else if(string(argv[1]) == string("er"))
    scale = static_cast<unsigned>(atoi(argv[2]));
    double initiator[4] = {.25, .25, .25, .25};   // uniform initiator: Erdos-Renyi
    MPI_Barrier(MPI_COMM_WORLD);
SpParHelper::Print("Unknown input option\n");
if(randpermute && string(argv[1]) == string("input"))
{
    if(ABool->getnrow() == ABool->getncol())   // a symmetric permutation needs a square matrix
    {
        randp.iota(ABool->getnrow(), 0);       // identity permutation over all rows, randomly shuffled before use
        (*ABool)(randp, randp, true);          // in-place symmetric permutation A <- A(randp, randp)
        SpParHelper::Print("Matrix is randomly permuted for load balance.\n");
    }
    else
    {
        SpParHelper::Print("Rectangular matrix: Can not apply symmetric permutation.\n");
    }
}
ABool->Reduce(degrees, Column, plus<int64_t>(), static_cast<int64_t>(0));
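// Each column sum of the boolean (symmetric) adjacency matrix is the degree of the corresponding
// vertex; the degree vector drives both the pseudo-peripheral search and the RCM tie-breaking.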
nthreads = omp_get_num_threads();
outs << "--------------------------------------" << endl;
outs << "Number of MPI processes: " << nprocs << endl;
outs << "Number of threads per process: " << nthreads << endl;
outs << "Load balance: " << balance << endl;
outs << "--------------------------------------" << endl;
SpParHelper::Print(outs.str());
reverseOrder = rcmorder.TotalLength();
reverseOrder -= rcmorder;
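// Reversing the Cuthill-McKee labels (new label = N - old label) turns the ordering into the
// reverse Cuthill-McKee (RCM) ordering that is saved and used to permute the matrix below.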
// map the ordering back to the original vertex numbering if the matrix was randomly permuted
if(randpermute == true && randp.TotalLength() > 0)
    reverseOrder = reverseOrder(invRandp);   // invRandp: inverse of the random permutation randp
if(savercm && filename != "")
{
    string ofName = filename + ".rcm.txt";   // the ordering is then written to this file via ParallelWrite
if(randpermute == true && randp.TotalLength() > 0)
{
    int64_t bw_before1 = ABoolOld.Bandwidth();   // bandwidth of the original (unpermuted) matrix
    ABoolOld(rcmorder1, rcmorder1, true);        // permute the original matrix by the RCM order
    int64_t bw_after = ABoolOld.Bandwidth();

    outs1 << "Original Bandwidth: " << bw_before1 << endl;
    outs1 << "Bandwidth after randomly permuting the matrix: " << bw_before2 << endl;
    outs1 << "Bandwidth after the matrix is permuted by RCM: " << bw_after << endl << endl;
    SpParHelper::Print(outs1.str());
}
else
{
    (*ABool)(rcmorder1, rcmorder1, true);        // permute by the RCM order

    outs1 << "Original Bandwidth: " << bw_before1 << endl;
    outs1 << "Bandwidth after the matrix is permuted by RCM: " << bw_after << endl << endl;
    SpParHelper::Print(outs1.str());
}
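// Bandwidth() reports the matrix bandwidth, i.e. how far nonzeros stray from the diagonal;
// the reduction from the original bandwidth to the RCM-permuted bandwidth is the figure of
// merit for the ordering computed above.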