COMBINATORIAL_BLAS  1.6
BFSFriends.h
/****************************************************************/
/* Parallel Combinatorial BLAS Library (for Graph Computations) */
/* version 1.4 -------------------------------------------------*/
/* date: 1/17/2014 ---------------------------------------------*/
/* authors: Aydin Buluc (abuluc@lbl.gov), Adam Lugowski --------*/
/****************************************************************/
/*
 Copyright (c) 2010-2014, The Regents of the University of California

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:

 The above copyright notice and this permission notice shall be included in
 all copies or substantial portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */

#ifndef _BFS_FRIENDS_H_
#define _BFS_FRIENDS_H_

#include "mpi.h"
#include <iostream>
#include "SpParMat.h"
#include "SpParHelper.h"
#include "MPIType.h"
#include "Friends.h"
#include "OptBuf.h"
#include "ParFriends.h"
#include "BitMap.h"
#include "BitMapCarousel.h"
#include "BitMapFringe.h"

namespace combblas {

template <class IT, class NT, class DER>
class SpParMat;

/*************************************************************************************************/
/*********************** FRIEND FUNCTIONS FOR BFS ONLY (NO SEMIRINGS) RUNS **********************/
/***************************** BOTH PARALLEL AND SEQUENTIAL FUNCTIONS ****************************/
/*************************************************************************************************/

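/**
 * Multithreaded SpMV over a Boolean matrix and a sparse right-hand-side vector given as
 * (index, value) arrays, writing its output directly into pre-allocated send buffers.
 * The matrix must be split row-wise (getnsplit() > 0); each split is multiplied independently,
 * and the per-split results are then converted to receiver-local indices and packed into
 * sendindbuf/sendnumbuf at the offsets given by dspls, while cnts accumulates how many
 * entries go to each of the p_c recipient processors.
 */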
template <typename IT, typename VT>
void dcsc_gespmv_threaded_setbuffers (const SpDCCols<IT, bool> & A, const int32_t * indx, const VT * numx, int32_t nnzx,
                                      int32_t * sendindbuf, VT * sendnumbuf, int * cnts, int * dspls, int p_c)
{
    if(A.getnnz() > 0 && nnzx > 0)
    {
        int splits = A.getnsplit();
        if(splits > 0)
        {
            std::vector< std::vector<int32_t> > indy(splits);
            std::vector< std::vector< VT > > numy(splits);
            int32_t nlocrows = static_cast<int32_t>(A.getnrow());
            int32_t perpiece = nlocrows / splits;

            #ifdef _OPENMP
            #pragma omp parallel for
            #endif
            for(int i=0; i<splits; ++i)
            {
                if(i != splits-1)
                    SpMXSpV_ForThreading<BFSsring>(*(A.GetDCSC(i)), perpiece, indx, numx, nnzx, indy[i], numy[i], i*perpiece);
                else
                    SpMXSpV_ForThreading<BFSsring>(*(A.GetDCSC(i)), nlocrows - perpiece*i, indx, numx, nnzx, indy[i], numy[i], i*perpiece);
            }

            int32_t perproc = nlocrows / p_c;
            int32_t last_rec = p_c-1;

            // keep recipients of last entries in each split (-1 for an empty split)
            // so that we can delete indy[] and numy[] contents as soon as they are processed
            std::vector<int32_t> end_recs(splits);
            for(int i=0; i<splits; ++i)
            {
                if(indy[i].empty())
                    end_recs[i] = -1;
                else
                    end_recs[i] = std::min(indy[i].back() / perproc, last_rec);
            }

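            // First pass: count, per split, how many result entries are destined to each
            // recipient; these counts let later splits compute their write offsets into
            // the shared send buffers.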
            int ** loc_rec_cnts = new int *[splits];
            #ifdef _OPENMP
            #pragma omp parallel for
            #endif
            for(int i=0; i<splits; ++i)
            {
                loc_rec_cnts[i] = new int[p_c](); // thread-local recipient data
                if(!indy[i].empty()) // guarantee that .begin() and .end() are not null
                {
                    int32_t cur_rec = std::min( indy[i].front() / perproc, last_rec);
                    int32_t lastdata = (cur_rec+1) * perproc; // one past last entry that goes to this current recipient
                    for(typename std::vector<int32_t>::iterator it = indy[i].begin(); it != indy[i].end(); ++it)
                    {
                        if( ( (*it) >= lastdata ) && cur_rec != last_rec)
                        {
                            cur_rec = std::min( (*it) / perproc, last_rec);
                            lastdata = (cur_rec+1) * perproc;
                        }
                        ++loc_rec_cnts[i][cur_rec];
                    }
                }
            }
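            // Second pass: each split writes its (receiver-local) indices and values into the
            // shared send buffers; the write offset for a recipient is dspls[] plus whatever
            // earlier splits have already contributed to that same recipient.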
            #ifdef _OPENMP
            #pragma omp parallel for
            #endif
            for(int i=0; i<splits; ++i)
            {
                if(!indy[i].empty()) // guarantee that .begin() and .end() are not null
                {
                    // FACT: Data is sorted, so if the recipient of begin is the same as the owner of end,
                    // then the whole data is sent to the same processor
                    int32_t beg_rec = std::min( indy[i].front() / perproc, last_rec);
                    int32_t alreadysent = 0; // already sent per recipient
                    for(int before = i-1; before >= 0; before--)
                        alreadysent += loc_rec_cnts[before][beg_rec];

                    if(beg_rec == end_recs[i]) // fast case
                    {
                        std::transform(indy[i].begin(), indy[i].end(), indy[i].begin(), std::bind2nd(std::minus<int32_t>(), perproc*beg_rec));
                        std::copy(indy[i].begin(), indy[i].end(), sendindbuf + dspls[beg_rec] + alreadysent);
                        std::copy(numy[i].begin(), numy[i].end(), sendnumbuf + dspls[beg_rec] + alreadysent);
                    }
                    else // slow case
                    {
                        int32_t cur_rec = beg_rec;
                        int32_t lastdata = (cur_rec+1) * perproc; // one past last entry that goes to this current recipient
                        for(typename std::vector<int32_t>::iterator it = indy[i].begin(); it != indy[i].end(); ++it)
                        {
                            if( ( (*it) >= lastdata ) && cur_rec != last_rec )
                            {
                                cur_rec = std::min( (*it) / perproc, last_rec);
                                lastdata = (cur_rec+1) * perproc;

                                // if this split switches to a new recipient after sending some data,
                                // then it is certain that no data has been sent to that recipient yet
                                alreadysent = 0;
                            }
                            sendindbuf[ dspls[cur_rec] + alreadysent ] = (*it) - perproc*cur_rec; // convert to receiver's local index
                            sendnumbuf[ dspls[cur_rec] + (alreadysent++) ] = *(numy[i].begin() + (it-indy[i].begin()));
                        }
                    }
                }
            }
            // Accumulate and deallocate the thread-local recipient counts serially once all threads complete
            for(int i=0; i< splits; ++i)
            {
                for(int j=0; j< p_c; ++j)
                    cnts[j] += loc_rec_cnts[i][j];
                delete [] loc_rec_cnts[i];
            }
            delete [] loc_rec_cnts;
        }
        else
        {
            std::cout << "Something is wrong, splits should be nonzero for multithreaded execution" << std::endl;
        }
    }
}

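/**
 * Local part of the distributed SpMV used by BFS: multiplies the locally stored matrix A.spSeq
 * with the accumulated (all-gathered) fringe given by indacc/numacc, writing the results into
 * the pre-allocated optimization buffers (optbuf.inds, optbuf.nums, optbuf.dspls) and recording
 * the per-recipient counts in sendcnt. Requires the optimization buffers to be allocated
 * (optbuf.totmax > 0); indacc and numacc are deallocated on return.
 */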
template<typename VT, typename IT, typename UDER>
void LocalSpMV(const SpParMat<IT,bool,UDER> & A, int rowneighs, OptBuf<int32_t, VT > & optbuf, int32_t * & indacc, VT * & numacc, int * sendcnt, int accnz)
{
#ifdef TIMING
    double t0=MPI_Wtime();
#endif
    if(optbuf.totmax > 0) // graph500 optimization enabled
    {
        if(A.spSeq->getnsplit() > 0)
        {
            // optbuf.{inds/nums/dspls} and sendcnt are all pre-allocated and only filled by dcsc_gespmv_threaded
            generic_gespmv_threaded_setbuffers< Select2ndSRing<bool, VT, VT> > (*(A.spSeq), indacc, numacc, (int32_t) accnz, optbuf.inds, optbuf.nums, sendcnt, optbuf.dspls, rowneighs);
        }
        else
        {
            // by-pass dcsc_gespmv call
            if(A.getlocalnnz() > 0 && accnz > 0)
            {
                // ABAB: ignoring optbuf.isthere here
                // \TODO: Remove .isthere from optbuf definition
                SpMXSpV< Select2ndSRing<bool, VT, VT> >(*((A.spSeq)->GetInternal()), (int32_t) A.getlocalrows(), indacc, numacc,
                        accnz, optbuf.inds, optbuf.nums, sendcnt, optbuf.dspls, rowneighs);
            }
        }
        DeleteAll(indacc,numacc);
    }
    else
    {
        SpParHelper::Print("BFS-only (no semiring) functions only work with optimization buffers\n");
    }

#ifdef TIMING
    double t1=MPI_Wtime();
    cblas_localspmvtime += (t1-t0);
#endif
}

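/**
 * Merge the contributions received from the rowneighs processors in the processor row into the
 * output sparse vector y; duplicate indices are resolved by keeping only the first contribution
 * encountered. Two implementations are provided: a flat sweep followed by a sort (default) and
 * a heap-based multiway merge (enabled with HEAPMERGE). All receive buffers and
 * count/displacement arrays are deallocated on return.
 */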
template <typename IU, typename VT>
void MergeContributions(FullyDistSpVec<IU,VT> & y, int * & recvcnt, int * & rdispls, int32_t * & recvindbuf, VT * & recvnumbuf, int rowneighs)
{
#ifdef TIMING
    double t0=MPI_Wtime();
#endif
    // free memory of y, in case it was aliased
    std::vector<IU>().swap(y.ind);
    std::vector<VT>().swap(y.num);

#ifndef HEAPMERGE
    IU ysize = y.MyLocLength(); // my local length is only O(n/p)
    bool * isthere = new bool[ysize];
    std::vector< std::pair<IU,VT> > ts_pairs;
    std::fill_n(isthere, ysize, false);

    // We don't need to keep a "merger" because minimum will always come from the processor
    // with the smallest rank; so a linear sweep over the received buffer is enough
    for(int i=0; i<rowneighs; ++i)
    {
        for(int j=0; j< recvcnt[i]; ++j)
        {
            int32_t index = recvindbuf[rdispls[i] + j];
            if(!isthere[index])
                ts_pairs.push_back(std::make_pair(index, recvnumbuf[rdispls[i] + j]));
        }
    }
    DeleteAll(recvcnt, rdispls);
    DeleteAll(isthere, recvindbuf, recvnumbuf);
    __gnu_parallel::sort(ts_pairs.begin(), ts_pairs.end());
    int nnzy = ts_pairs.size();
    y.ind.resize(nnzy);
    y.num.resize(nnzy);
    for(int i=0; i< nnzy; ++i)
    {
        y.ind[i] = ts_pairs[i].first;
        y.num[i] = ts_pairs[i].second;
    }

#else
    // Alternative 2: Heap-merge
    int32_t hsize = 0;
    int32_t inf = std::numeric_limits<int32_t>::min();
    int32_t sup = std::numeric_limits<int32_t>::max();
    KNHeap< int32_t, int32_t > sHeap(sup, inf);
    int * processed = new int[rowneighs]();
    for(int32_t i=0; i<rowneighs; ++i)
    {
        if(recvcnt[i] > 0)
        {
            // key, proc_id
            sHeap.insert(recvindbuf[rdispls[i]], i);
            ++hsize;
        }
    }
    int32_t key, locv;
    if(hsize > 0)
    {
        sHeap.deleteMin(&key, &locv);
        y.ind.push_back( static_cast<IU>(key));
        y.num.push_back(recvnumbuf[rdispls[locv]]); // nothing is processed yet

        if( (++(processed[locv])) < recvcnt[locv] )
            sHeap.insert(recvindbuf[rdispls[locv]+processed[locv]], locv);
        else
            --hsize;
    }

    // ofstream oput;
    // y.commGrid->OpenDebugFile("Merge", oput);
    // oput << "From displacements: "; copy(rdispls, rdispls+rowneighs, ostream_iterator<int>(oput, " ")); oput << endl;
    // oput << "From counts: "; copy(recvcnt, recvcnt+rowneighs, ostream_iterator<int>(oput, " ")); oput << endl;
    while(hsize > 0)
    {
        sHeap.deleteMin(&key, &locv);
        IU deref = rdispls[locv] + processed[locv];
        if(y.ind.back() != static_cast<IU>(key)) // y.ind is surely not empty
        {
            y.ind.push_back(static_cast<IU>(key));
            y.num.push_back(recvnumbuf[deref]);
        }

        if( (++(processed[locv])) < recvcnt[locv] )
            sHeap.insert(recvindbuf[rdispls[locv]+processed[locv]], locv);
        else
            --hsize;
    }
    DeleteAll(recvcnt, rdispls, processed);
    DeleteAll(recvindbuf, recvnumbuf);
#endif

#ifdef TIMING
    double t1=MPI_Wtime();
    cblas_mergeconttime += (t1-t0);
#endif
}

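/**
 * Parallel SpMV specialized for BFS: computes y = A * x over a Boolean matrix using the
 * pre-allocated optimization buffers in optbuf instead of a general semiring. The steps are:
 * TransposeVector, AllGatherVector along the processor column, LocalSpMV on the local
 * submatrix, an MPI_Alltoallv exchange of the results along the processor row, and
 * MergeContributions to assemble the output vector y.
 */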
template <typename VT, typename IT, typename UDER>
FullyDistSpVec<IT,VT> SpMV(const SpParMat<IT,bool,UDER> & A, const FullyDistSpVec<IT,VT> & x, OptBuf<int32_t,VT> & optbuf)
{
    CheckSpMVCompliance(A,x);
    optbuf.MarkEmpty();

    MPI_Comm World = x.commGrid->GetWorld();
    MPI_Comm ColWorld = x.commGrid->GetColWorld();
    MPI_Comm RowWorld = x.commGrid->GetRowWorld();

    int accnz;
    int32_t trxlocnz;
    IT lenuntil;
    int32_t *trxinds, *indacc;
    VT *trxnums, *numacc;

#ifdef TIMING
    double t0=MPI_Wtime();
#endif
    TransposeVector(World, x, trxlocnz, lenuntil, trxinds, trxnums, true); // trxinds (and potentially trxnums) is allocated
#ifdef TIMING
    double t1=MPI_Wtime();
    cblas_transvectime += (t1-t0);
#endif
    AllGatherVector(ColWorld, trxlocnz, lenuntil, trxinds, trxnums, indacc, numacc, accnz, true); // trxinds (and potentially trxnums) is deallocated, indacc/numacc allocated

    FullyDistSpVec<IT, VT> y ( x.commGrid, A.getnrow()); // identity doesn't matter for sparse vectors
    int rowneighs; MPI_Comm_size(RowWorld, &rowneighs);
    int * sendcnt = new int[rowneighs]();

    LocalSpMV(A, rowneighs, optbuf, indacc, numacc, sendcnt, accnz); // indacc/numacc deallocated

    int * rdispls = new int[rowneighs];
    int * recvcnt = new int[rowneighs];
    MPI_Alltoall(sendcnt, 1, MPI_INT, recvcnt, 1, MPI_INT, RowWorld); // share the request counts

    // receive displacements are exact whereas send displacements have slack
    rdispls[0] = 0;
    for(int i=0; i<rowneighs-1; ++i)
    {
        rdispls[i+1] = rdispls[i] + recvcnt[i];
    }
    int totrecv = std::accumulate(recvcnt, recvcnt+rowneighs, 0);
    int32_t * recvindbuf = new int32_t[totrecv];
    VT * recvnumbuf = new VT[totrecv];

#ifdef TIMING
    double t2=MPI_Wtime();
#endif
    if(optbuf.totmax > 0 ) // graph500 optimization enabled
    {
        MPI_Alltoallv(optbuf.inds, sendcnt, optbuf.dspls, MPIType<int32_t>(), recvindbuf, recvcnt, rdispls, MPIType<int32_t>(), RowWorld);
        MPI_Alltoallv(optbuf.nums, sendcnt, optbuf.dspls, MPIType<VT>(), recvnumbuf, recvcnt, rdispls, MPIType<VT>(), RowWorld);
        delete [] sendcnt;
    }
    else
    {
        SpParHelper::Print("BFS-only (no semiring) functions only work with optimization buffers\n");
    }
#ifdef TIMING
    double t3=MPI_Wtime();
    cblas_alltoalltime += (t3-t2);
#endif

    MergeContributions(y, recvcnt, rdispls, recvindbuf, recvnumbuf, rowneighs);
    return y;
}

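/**
 * Compute, for every processor column (and for every thread when THREADED is defined), the
 * column iterator at which that chunk of the bitmap carousel starts within the local matrix.
 * The returned array of column iterators is consumed by BottomUpStep and has
 * numcols*cblas_splits+1 entries (numcols+1 when not threaded), the last one being endcol().
 */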
template <typename VT, typename IT, typename UDER>
SpDCCols<int,bool>::SpColIter* CalcSubStarts(SpParMat<IT,bool,UDER> & A, FullyDistSpVec<IT,VT> & x, BitMapCarousel<IT,VT> & done) {
    std::shared_ptr<CommGrid> cg = A.getcommgrid();
    IT rowuntil = x.LengthUntil();
    MPI_Comm RowWorld = cg->GetRowWorld();
    MPI_Bcast(&rowuntil, 1, MPIType<IT>(), 0, RowWorld);
    int numcols = cg->GetGridCols();
    SpDCCols<int,bool>::SpColIter colit = A.seq().begcol();
#ifdef THREADED
    SpDCCols<int,bool>::SpColIter* starts = new SpDCCols<int,bool>::SpColIter[numcols*cblas_splits + 1]; // allocation inferred from the indexing and return type below
    for(int c=0; c<numcols; c++) {
        IT curr_sub_start = done.GetGlobalStartOfLocal(c) - rowuntil;
        IT next_sub_start = done.GetGlobalEndOfLocal(c) - rowuntil;
        IT sub_range = next_sub_start - curr_sub_start;
        IT per_thread = (sub_range + cblas_splits - 1) / cblas_splits;
        IT curr_thread_start = curr_sub_start;
        for (int t=0; t<cblas_splits; t++) {
            while(colit.colid() < curr_thread_start) {
                ++colit;
            }
            starts[c*cblas_splits + t] = colit;
            curr_thread_start = std::min(curr_thread_start + per_thread, next_sub_start);
        }
    }
    starts[numcols*cblas_splits] = A.seq().endcol();
#else
    SpDCCols<int,bool>::SpColIter* starts = new SpDCCols<int,bool>::SpColIter[numcols + 1]; // allocation inferred from the indexing and return type below
    for(int c=0; c<numcols; c++) {
        IT next_start = done.GetGlobalStartOfLocal(c) - rowuntil;
        while(colit.colid() < next_start) {
            ++colit;
        }
        starts[c] = colit;
    }
    starts[numcols] = A.seq().endcol();
#endif
    return starts;
}

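/**
 * Exchange parent updates with the partner processors in the processor row: the number of words
 * and then the (local index, parent) pairs are swapped with MPI_Sendrecv, after which the
 * received pairs are written into the local portion of the parents vector and the fringe
 * counter is incremented by the number of received updates.
 */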
template <typename VT, typename IT>
void UpdateParents(MPI_Comm & RowWorld, std::pair<IT,IT> *updates, int num_updates, FullyDistVec<IT,VT> &parents, int source, int dest, BitMapFringe<int64_t,int64_t> &bm_fringe) {
    int send_words = num_updates<<1, recv_words;
    MPI_Status status;
    MPI_Sendrecv(&send_words, 1, MPI_INT, dest, PUPSIZE,
                 &recv_words, 1, MPI_INT, source, PUPSIZE, RowWorld, &status);
    std::pair<IT,IT>* recv_buff = new std::pair<IT,IT>[recv_words>>1];
    MPI_Sendrecv(updates, send_words, MPIType<IT>(), dest, PUPDATA,
                 recv_buff, recv_words, MPIType<IT>(), source, PUPDATA, RowWorld, &status);

#ifdef THREADED
#pragma omp parallel for
#endif
    for (int i=0; i<recv_words>>1; i++) {
        parents.SetLocalElement(recv_buff[i].first, recv_buff[i].second);
    }

    bm_fringe.IncrementNumSet((recv_words>>1));
    delete[] recv_buff;
}

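/**
 * One bottom-up step of direction-optimized BFS. Every unvisited local row scans its adjacency
 * for a neighbor that is set in the transpose-gathered frontier bitmap; the first such neighbor
 * becomes its parent. The work is partitioned into numcols sub-steps: in each sub-step the
 * processor handles the chunk of rows described by the bitmap carousel, rotates the 'done'
 * bitmap along the processor row, and exchanges parent updates via UpdateParents.
 */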
template <typename VT, typename IT, typename UDER>
void BottomUpStep(SpParMat<IT,bool,UDER> & A, FullyDistSpVec<IT,VT> & x, BitMapFringe<int64_t,int64_t> & bm_fringe, FullyDistVec<IT,VT> & parents, BitMapCarousel<IT,VT> & done, SpDCCols<int,bool>::SpColIter* starts)
{
    std::shared_ptr<CommGrid> cg = A.getcommgrid();
    MPI_Comm World = cg->GetWorld();
    MPI_Comm ColWorld = cg->GetColWorld();
    MPI_Comm RowWorld = cg->GetRowWorld();
    MPI_Status status;

    // get row and column offsets
    IT rowuntil = x.LengthUntil(), my_coluntil = x.LengthUntil(), coluntil;
    int diagneigh = cg->GetComplementRank();
    MPI_Sendrecv(&my_coluntil, 1, MPIType<IT>(), diagneigh, TROST, &coluntil, 1, MPIType<IT>(), diagneigh, TROST, World, &status);
    MPI_Bcast(&coluntil, 1, MPIType<IT>(), 0, ColWorld);
    MPI_Bcast(&rowuntil, 1, MPIType<IT>(), 0, RowWorld);

    BitMap* frontier = bm_fringe.TransposeGather();
    done.SaveOld();

#ifdef THREADED
    const int buff_size = 8192;
    std::pair<IT,IT>* local_update_heads[cblas_splits];
    for (int t=0; t<cblas_splits; t++)
        local_update_heads[t] = new std::pair<IT,IT>[buff_size];
#endif

    // do bottom up work
    int numcols = cg->GetGridCols();
    int mycol = cg->GetRankInProcRow();
    std::pair<IT,IT>* parent_updates = new std::pair<IT,IT>[done.SizeOfChunk()<<1]; // over-allocated

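    // Each sub-step processes the rows whose 'done' bits this processor currently holds,
    // sends the resulting parent updates to dest_slice, and receives updates from source_slice.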
    for (int sub_step=0; sub_step<numcols; sub_step++) {
        int num_updates = 0;
        IT sub_start = done.GetGlobalStartOfLocal();
        int dest_slice = (mycol + sub_step) % numcols;
        int source_slice = (mycol - sub_step + numcols) % numcols;
#ifdef BOTTOMUPTIME
        double t1 = MPI_Wtime();
#endif
#ifdef THREADED
#pragma omp parallel
        {
            int id = omp_get_thread_num();
            int num_locals = 0;
            SpDCCols<int,bool>::SpColIter::NzIter nzit, nzit_end; // nonzero iterators (declaration inferred from their use below)
            SpDCCols<int,bool>::SpColIter colit, colit_end;
            std::pair<IT,IT>* local_updates = local_update_heads[id];
            // vector<pair<IT,IT> > local_updates;
            colit_end = starts[dest_slice*cblas_splits + id + 1];
            for(colit = starts[dest_slice*cblas_splits + id]; colit != colit_end; ++colit) {
                int32_t local_row_ind = colit.colid();
                IT row = local_row_ind + rowuntil;
                if (!done.GetBit(row)) {
                    nzit_end = A.seq().endnz(colit);
                    for(nzit = A.seq().begnz(colit); nzit != nzit_end; ++nzit) {
                        int32_t local_col_ind = nzit.rowid();
                        IT col = local_col_ind + coluntil;
                        if (frontier->get_bit(local_col_ind)) {
                            // local_updates.push_back(make_pair(row-sub_start, col));
                            if (num_locals == buff_size) {
                                int copy_start = __sync_fetch_and_add(&num_updates, buff_size);
                                std::copy(local_updates, local_updates + buff_size, parent_updates + copy_start);
                                num_locals = 0;
                            }
                            local_updates[num_locals++] = std::make_pair(row-sub_start, col);
                            done.SetBit(row);
                            break;
                        }
                    }
                }
            }
            int copy_start = __sync_fetch_and_add(&num_updates, num_locals);
            std::copy(local_updates, local_updates + num_locals, parent_updates + copy_start);
        }
#else
        SpDCCols<int,bool>::SpColIter::NzIter nzit, nzit_end; // nonzero iterators (declaration inferred from their use below)
        SpDCCols<int,bool>::SpColIter colit, colit_end;
        colit_end = starts[dest_slice+1];
        for(colit = starts[dest_slice]; colit != colit_end; ++colit)
        {
            int32_t local_row_ind = colit.colid();
            IT row = local_row_ind + rowuntil;
            if (!done.GetBit(row))
            {
                nzit_end = A.seq().endnz(colit);
                for(nzit = A.seq().begnz(colit); nzit != nzit_end; ++nzit)
                {
                    int32_t local_col_ind = nzit.rowid();
                    IT col = local_col_ind + coluntil;
                    if (frontier->get_bit(local_col_ind))
                    {
                        parent_updates[num_updates++] = std::make_pair(row-sub_start, col);
                        done.SetBit(row);
                        break;
                    }
                } // end_for
            } // end_if
        } // end_for
#endif

#ifdef BOTTOMUPTIME
        double t2 = MPI_Wtime();
        bu_local += (t2-t1);
        t1 = MPI_Wtime();
#endif
        done.RotateAlongRow();

#ifdef BOTTOMUPTIME
        t2 = MPI_Wtime();
        bu_rotate += (t2-t1);
        t1 = MPI_Wtime();
#endif
        UpdateParents(RowWorld, parent_updates, num_updates, parents, source_slice, dest_slice, bm_fringe);
#ifdef BOTTOMUPTIME
        t2 = MPI_Wtime();
        bu_update += (t2-t1);
#endif
    }
    bm_fringe.LoadFromNext();
    done.UpdateFringe(bm_fringe);
#ifdef THREADED
    for (int t=0; t<cblas_splits; t++)
        delete[] local_update_heads[t];
#endif
    delete[] parent_updates;
}

}

#endif