COMBINATORIAL_BLAS  1.6
ThreadedFriends.h
Go to the documentation of this file.
1 /****************************************************************/
2 /* Parallel Combinatorial BLAS Library (for Graph Computations) */
3 /* version 1.6 -------------------------------------------------*/
4 /* date: 6/15/2017 ---------------------------------------------*/
5 /* authors: Ariful Azad, Aydin Buluc --------------------------*/
6 /****************************************************************/
7 /*
8  Copyright (c) 2010-2017, The Regents of the University of California
9 
10  Permission is hereby granted, free of charge, to any person obtaining a copy
11  of this software and associated documentation files (the "Software"), to deal
12  in the Software without restriction, including without limitation the rights
13  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14  copies of the Software, and to permit persons to whom the Software is
15  furnished to do so, subject to the following conditions:
16 
17  The above copyright notice and this permission notice shall be included in
18  all copies or substantial portions of the Software.
19 
20  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26  THE SOFTWARE.
27  */
28 
29 
30 #ifndef _THREADED_FRIENDS_H_
31 #define _THREADED_FRIENDS_H_
32 
33 #include <iostream>
34 #include "SpMat.h" // Best to include the base class first
35 #include "SpHelper.h"
36 #include "StackEntry.h"
37 #include "Isect.h"
38 #include "Deleter.h"
39 #include "SpImpl.h"
40 #include "SpParHelper.h"
41 #include "Compare.h"
42 #include "CombBLAS.h"
43 #include "PreAllocatedSPA.h"
44 
45 namespace combblas {
46 
47 template <class IU, class NU>
48 class SpTuples;
49 
50 template <class IU, class NU>
51 class SpDCCols;
52 
53 template <class IU, class NU>
54 class Dcsc;
55 
56 
57 // multithreaded HeapSpGEMM
58 template <typename SR, typename NTO, typename IT, typename NT1, typename NT2>
59 SpTuples<IT, NTO> * LocalSpGEMM
60 (const SpDCCols<IT, NT1> & A,
61  const SpDCCols<IT, NT2> & B,
62  bool clearA, bool clearB)
63 {
64  IT mdim = A.getnrow();
65  IT ndim = B.getncol();
66  IT nnzA = A.getnnz();
67  if(A.isZero() || B.isZero())
68  {
69  return new SpTuples<IT, NTO>(0, mdim, ndim);
70  }
71 
72  Dcsc<IT,NT1>* Adcsc = A.GetDCSC();
73  Dcsc<IT,NT2>* Bdcsc = B.GetDCSC();
74  IT nA = A.getncol();
75  IT cnzmax = Adcsc->nz + Bdcsc->nz; // estimate on the size of resulting matrix C
76  float cf = static_cast<float>(nA+1) / static_cast<float>(Adcsc->nzc);
77  IT csize = static_cast<IT>(ceil(cf)); // chunk size
78  IT * aux;
79  Adcsc->ConstructAux(nA, aux);
80 
81  int numThreads = 1; // default case
82 #ifdef THREADED
83 #pragma omp parallel
84  {
85  numThreads = omp_get_num_threads();
86  }
87 #endif
88 
89  IT* colnnzC = estimateNNZ(A, B);
90  IT* colptrC = prefixsum<IT>(colnnzC, Bdcsc->nzc, numThreads);
91  delete [] colnnzC;
92  IT nnzc = colptrC[Bdcsc->nzc];
93  std::tuple<IT,IT,NTO> * tuplesC = static_cast<std::tuple<IT,IT,NTO> *> (::operator new (sizeof(std::tuple<IT,IT,NTO>[nnzc])));
94 
95  // thread private space for heap and colinds
96  std::vector<std::vector< std::pair<IT,IT>>> colindsVec(numThreads);
97  std::vector<std::vector<HeapEntry<IT,NT1>>> globalheapVec(numThreads);
98 
99  for(int i=0; i<numThreads; i++) //inital allocation per thread, may be an overestimate, but does not require more memoty than inputs
100  {
101  colindsVec[i].resize(nnzA/numThreads);
102  globalheapVec[i].resize(nnzA/numThreads);
103  }
104 
105 
106 #pragma omp parallel for
107  for(int i=0; i < Bdcsc->nzc; ++i)
108  {
109  IT nnzcolB = Bdcsc->cp[i+1] - Bdcsc->cp[i]; //nnz in the current column of B
110  int myThread = omp_get_thread_num();
111  if(colindsVec[myThread].size() < nnzcolB) //resize thread private vectors if needed
112  {
113  colindsVec[myThread].resize(nnzcolB);
114  globalheapVec[myThread].resize(nnzcolB);
115  }
116 
117 
118  // colinds.first vector keeps indices to A.cp, i.e. it dereferences "colnums" vector (above),
119  // colinds.second vector keeps the end indices (i.e. it gives the index to the last valid element of A.cpnack)
120  Adcsc->FillColInds(Bdcsc->ir + Bdcsc->cp[i], nnzcolB, colindsVec[myThread], aux, csize);
121  std::pair<IT,IT> * colinds = colindsVec[myThread].data();
122  HeapEntry<IT,NT1> * wset = globalheapVec[myThread].data();
123  IT hsize = 0;
124 
125 
126  for(IT j = 0; (unsigned)j < nnzcolB; ++j) // create the initial heap
127  {
128  if(colinds[j].first != colinds[j].second) // current != end
129  {
130  wset[hsize++] = HeapEntry< IT,NT1 > (Adcsc->ir[colinds[j].first], j, Adcsc->numx[colinds[j].first]);
131  }
132  }
133  std:make_heap(wset, wset+hsize);
134 
135  IT curptr = colptrC[i];
136  while(hsize > 0)
137  {
138  std::pop_heap(wset, wset + hsize); // result is stored in wset[hsize-1]
139  IT locb = wset[hsize-1].runr; // relative location of the nonzero in B's current column
140 
141  NTO mrhs = SR::multiply(wset[hsize-1].num, Bdcsc->numx[Bdcsc->cp[i]+locb]);
142  if (!SR::returnedSAID())
143  {
144  if( (curptr > colptrC[i]) && std::get<0>(tuplesC[curptr-1]) == wset[hsize-1].key)
145  {
146  std::get<2>(tuplesC[curptr-1]) = SR::add(std::get<2>(tuplesC[curptr-1]), mrhs);
147  }
148  else
149  {
150  tuplesC[curptr++]= std::make_tuple(wset[hsize-1].key, Bdcsc->jc[i], mrhs) ;
151  }
152 
153  }
154 
155  if( (++(colinds[locb].first)) != colinds[locb].second) // current != end
156  {
157  // runr stays the same !
158  wset[hsize-1].key = Adcsc->ir[colinds[locb].first];
159  wset[hsize-1].num = Adcsc->numx[colinds[locb].first];
160  std::push_heap(wset, wset+hsize);
161  }
162  else
163  {
164  --hsize;
165  }
166  }
167  }
168 
169  if(clearA)
170  delete const_cast<SpDCCols<IT, NT1> *>(&A);
171  if(clearB)
172  delete const_cast<SpDCCols<IT, NT2> *>(&B);
173 
174  delete [] colptrC;
175  delete [] aux;
176 
177  SpTuples<IT, NTO>* spTuplesC = new SpTuples<IT, NTO> (nnzc, mdim, ndim, tuplesC, true);
178  return spTuplesC;
179 
180 }
181 
182 }
183 
184 #endif
double B
int size
SpTuples< IT, NTO > * LocalSpGEMM(const SpDCCols< IT, NT1 > &A, const SpDCCols< IT, NT2 > &B, bool clearA, bool clearB)
Definition: mtSpGEMM.h:78
double A
IT * estimateNNZ(const SpDCCols< IT, NT1 > &A, const SpDCCols< IT, NT2 > &B)
Definition: mtSpGEMM.h:211
Definition: CCGrid.h:4
SpDCCols< IT, NT > * multiply(SpDCCols< IT, NT > &splitA, SpDCCols< IT, NT > &splitB, CCGrid &CMG, bool isBT, bool threaded)
Definition: Multiplier.h:11