// prefixsum: threaded exclusive prefix sum; out[k] holds the sum of the first k inputs.
// (Requires <vector> and <omp.h>.)
template <typename T>
T* prefixsum(T* in, int size, int nthreads)
{
    std::vector<T> tsum(nthreads + 1);          // per-thread chunk totals (zero-initialized)
    T* out = new T[size + 1];
    out[0] = 0;
    T* psum = &out[1];
#pragma omp parallel
    {
        int ithread = omp_get_thread_num();
        T sum = 0;
#pragma omp for schedule(static)
        for (int i = 0; i < size; i++) { sum += in[i]; psum[i] = sum; }   // local running sums per chunk
        tsum[ithread + 1] = sum;                // publish this thread's chunk total
#pragma omp barrier
        T offset = 0;
        for (int i = 0; i < (ithread + 1); i++) offset += tsum[i];        // totals of all earlier chunks
#pragma omp for schedule(static)
        for (int i = 0; i < size; i++) psum[i] += offset;   // identical static schedule => same chunks
    }
    return out;
}
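A minimal usage sketch of the scan above, assuming the prefixsum template is in scope; the input values and the use of omp_get_max_threads() are illustrative only, not taken from the library:

#include <cstdio>
#include <omp.h>

int main()
{
    int in[5] = {3, 1, 4, 1, 5};
    int* out = prefixsum<int>(in, 5, omp_get_max_threads());
    // out has size+1 entries: 0 3 4 8 9 14, i.e. out[k] = sum of the first k inputs.
    for (int k = 0; k <= 5; ++k) std::printf("%d ", out[k]);
    std::printf("\n");
    delete [] out;
    return 0;
}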
// LocalSpGEMM: multithreaded heap-based SpGEMM kernel (C = A*B over semiring SR).
// Each thread handles whole columns of B; every output column is produced by a
// multiway merge over the columns of A selected by that column's nonzeros.
template <typename SR, typename NTO, typename IT, typename NT1, typename NT2>
SpTuples<IT, NTO> * LocalSpGEMM(const SpDCCols<IT, NT1> & A, const SpDCCols<IT, NT2> & B, bool clearA, bool clearB)
{
    // ... (empty-operand handling elided)
    Dcsc<IT,NT1>* Adcsc = A.GetDCSC();
    Dcsc<IT,NT2>* Bdcsc = B.GetDCSC();
    IT nA   = A.getncol();
    IT nnzA = A.getnnz();

    float cf  = static_cast<float>(nA+1) / static_cast<float>(Adcsc->nzc);
    IT csize  = static_cast<IT>(ceil(cf));      // chunk size of the aux (column-search) array
    IT * aux;
    Adcsc->ConstructAux(nA, aux);

    int numThreads = 1;
#pragma omp parallel
    {
        numThreads = omp_get_num_threads();
    }

    // Symbolic phase: per-column nonzero counts of C become column pointers into tuplesC.
    IT* colnnzC = estimateNNZ(A, B);
    IT* colptrC = prefixsum<IT>(colnnzC, Bdcsc->nzc, numThreads);
    delete [] colnnzC;
    IT nnzc = colptrC[Bdcsc->nzc];
    std::tuple<IT,IT,NTO> * tuplesC = static_cast<std::tuple<IT,IT,NTO> *>
        (::operator new (sizeof(std::tuple<IT,IT,NTO>[nnzc])));

    // Thread-private scratch space for column ranges and heaps.
    std::vector<std::vector< std::pair<IT,IT>>> colindsVec(numThreads);
    std::vector<std::vector<HeapEntry<IT,NT1>>> globalheapVec(numThreads);
    for(int i=0; i<numThreads; i++)
    {
        colindsVec[i].resize(nnzA/numThreads);
        globalheapVec[i].resize(nnzA/numThreads);
    }

#pragma omp parallel for
    for(int i=0; i < Bdcsc->nzc; ++i)           // for every nonempty column of B
    {
        size_t nnzcolB = Bdcsc->cp[i+1] - Bdcsc->cp[i];  // nnz in B's current column
        int myThread = omp_get_thread_num();

        if(colindsVec[myThread].size() < nnzcolB)        // grow thread-private buffers if needed
        {
            colindsVec[myThread].resize(nnzcolB);
            globalheapVec[myThread].resize(nnzcolB);
        }

        // colinds[j].first / .second are the current and one-past-the-end positions inside
        // the column of A selected by the j-th nonzero of B's current column.
        Adcsc->FillColInds(Bdcsc->ir + Bdcsc->cp[i], nnzcolB, colindsVec[myThread], aux, csize);
        std::pair<IT,IT> * colinds = colindsVec[myThread].data();
        HeapEntry<IT,NT1> * wset = globalheapVec[myThread].data();
        IT hsize = 0;

        for(IT j = 0; (unsigned)j < nnzcolB; ++j)        // build the initial heap
        {
            if(colinds[j].first != colinds[j].second)    // selected column of A is nonempty
            {
                wset[hsize++] = HeapEntry< IT,NT1 > (Adcsc->ir[colinds[j].first], j, Adcsc->numx[colinds[j].first]);
            }
        }
        std::make_heap(wset, wset+hsize);

        IT curptr = colptrC[i];
        while(hsize > 0)
        {
            std::pop_heap(wset, wset + hsize);           // winner now sits at wset[hsize-1]
            IT locb = wset[hsize-1].runr;                // which nonzero of B's column it came from

            NTO mrhs = SR::multiply(wset[hsize-1].num, Bdcsc->numx[Bdcsc->cp[i]+locb]);
            if (!SR::returnedSAID())
            {
                if( (curptr > colptrC[i]) && std::get<0>(tuplesC[curptr-1]) == wset[hsize-1].key)
                {
                    // Same row as the previous output entry: accumulate on the semiring.
                    std::get<2>(tuplesC[curptr-1]) = SR::add(std::get<2>(tuplesC[curptr-1]), mrhs);
                }
                else
                {
                    tuplesC[curptr++] = std::make_tuple(wset[hsize-1].key, Bdcsc->jc[i], mrhs);
                }
            }

            if( (++(colinds[locb].first)) != colinds[locb].second)  // more entries left in that column of A
            {
                wset[hsize-1].key = Adcsc->ir[colinds[locb].first];
                wset[hsize-1].num = Adcsc->numx[colinds[locb].first];
                std::push_heap(wset, wset+hsize);        // reinsert with the next entry of that column
            }
            else
            {
                --hsize;                                 // that column of A is exhausted
            }
        }
    }
    // ... (clearA/clearB handling, cleanup of aux/colptrC, and construction of the
    //      returned SpTuples<IT,NTO> from tuplesC elided)
}
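The per-column merge above follows the standard std::make_heap / std::pop_heap / std::push_heap multiway-merge pattern. The self-contained sketch below illustrates that pattern on made-up sorted runs, with plain addition standing in for SR::add and a small local Entry struct standing in for the library's HeapEntry; it is an illustration, not the library code:

#include <algorithm>
#include <cstdio>
#include <utility>
#include <vector>

int main()
{
    // k sorted runs of (row, value) pairs; values are invented for the example.
    std::vector<std::vector<std::pair<int,double>>> runs = {
        {{1, 1.0}, {4, 2.0}},
        {{1, 3.0}, {2, 5.0}},
        {{4, 7.0}}
    };

    struct Entry { int row; size_t run; double val; };
    // std::make_heap builds a max-heap, so invert the comparison to pop the smallest row first.
    auto cmp = [](const Entry& a, const Entry& b) { return a.row > b.row; };

    std::vector<Entry> heap;
    std::vector<size_t> pos(runs.size(), 0);
    for (size_t r = 0; r < runs.size(); ++r)
        if (!runs[r].empty())
            heap.push_back({runs[r][0].first, r, runs[r][0].second});
    std::make_heap(heap.begin(), heap.end(), cmp);

    std::vector<std::pair<int,double>> merged;   // output column: rows in order, duplicates summed
    while (!heap.empty())
    {
        std::pop_heap(heap.begin(), heap.end(), cmp);   // smallest row now at heap.back()
        Entry top = heap.back();

        if (!merged.empty() && merged.back().first == top.row)
            merged.back().second += top.val;            // same row as previous output: accumulate
        else
            merged.push_back({top.row, top.val});

        if (++pos[top.run] < runs[top.run].size())      // advance within the winning run
        {
            heap.back() = {runs[top.run][pos[top.run]].first, top.run,
                           runs[top.run][pos[top.run]].second};
            std::push_heap(heap.begin(), heap.end(), cmp);
        }
        else
        {
            heap.pop_back();                            // run exhausted
        }
    }

    for (auto& e : merged) std::printf("row %d -> %.1f\n", e.first, e.second);
    // Expected: row 1 -> 4.0, row 2 -> 5.0, row 4 -> 9.0
    return 0;
}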
// estimateNNZ: symbolic phase. For every nonempty column of B it counts the distinct
// row indices that the numeric merge will produce, using a min-heap of (row, position) pairs.
template <typename IT, typename NT1, typename NT2>
IT * estimateNNZ(const SpDCCols<IT, NT1> & A, const SpDCCols<IT, NT2> & B)
{
    // ... (empty-operand handling elided)
    Dcsc<IT,NT1>* Adcsc = A.GetDCSC();
    Dcsc<IT,NT2>* Bdcsc = B.GetDCSC();
    IT nnzA = A.getnnz();

    float cf  = static_cast<float>(A.getncol()+1) / static_cast<float>(Adcsc->nzc);
    IT csize  = static_cast<IT>(ceil(cf));       // chunk size of the aux (column-search) array
    IT * aux;
    Adcsc->ConstructAux(A.getncol(), aux);

    int numThreads = 1;
#pragma omp parallel
    {
        numThreads = omp_get_num_threads();
    }

    IT* colnnzC = new IT[Bdcsc->nzc];            // nnz in every nonempty column of C

#pragma omp parallel for
    for(IT i=0; i< Bdcsc->nzc; ++i)
    {
        colnnzC[i] = 0;
    }

    // Thread-private scratch space for column ranges and heaps.
    std::vector<std::vector< std::pair<IT,IT>>> colindsVec(numThreads);
    std::vector<std::vector<std::pair<IT,IT>>> globalheapVec(numThreads);
    for(int i=0; i<numThreads; i++)
    {
        colindsVec[i].resize(nnzA/numThreads);
        globalheapVec[i].resize(nnzA/numThreads);
    }

#pragma omp parallel for
    for(int i=0; i < Bdcsc->nzc; ++i)            // for every nonempty column of B
    {
        size_t nnzcolB = Bdcsc->cp[i+1] - Bdcsc->cp[i];  // nnz in B's current column
        int myThread = omp_get_thread_num();

        if(colindsVec[myThread].size() < nnzcolB)        // grow thread-private buffers if needed
        {
            colindsVec[myThread].resize(nnzcolB);
            globalheapVec[myThread].resize(nnzcolB);
        }

        Adcsc->FillColInds(Bdcsc->ir + Bdcsc->cp[i], nnzcolB, colindsVec[myThread], aux, csize);
        std::pair<IT,IT> * colinds = colindsVec[myThread].data();
        std::pair<IT,IT> * curheap = globalheapVec[myThread].data();
        IT hsize = 0;

        for(IT j = 0; (unsigned)j < nnzcolB; ++j)        // build the initial min-heap of (row, position) pairs
        {
            if(colinds[j].first != colinds[j].second)
            {
                curheap[hsize++] = std::make_pair(Adcsc->ir[colinds[j].first], j);
            }
        }
        std::make_heap(curheap, curheap+hsize, std::greater<std::pair<IT,IT>>());

        IT prevRow = -1;                         // sentinel: no row popped yet for this column
        while(hsize > 0)
        {
            std::pop_heap(curheap, curheap + hsize, std::greater<std::pair<IT,IT>>());
            IT locb = curheap[hsize-1].second;

            if( curheap[hsize-1].first != prevRow)       // first time this row appears in the column
            {
                prevRow = curheap[hsize-1].first;
                colnnzC[i]++;
            }

            if( (++(colinds[locb].first)) != colinds[locb].second)
            {
                curheap[hsize-1].first = Adcsc->ir[colinds[locb].first];
                std::push_heap(curheap, curheap+hsize, std::greater<std::pair<IT,IT>>());
            }
            else
            {
                --hsize;                                 // that column of A is exhausted
            }
        }
    }
    // ... (aux cleanup elided)
    return colnnzC;
}
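The counts returned by estimateNNZ are what LocalSpGEMM converts into column pointers via prefixsum. A tiny worked example of that handoff, with made-up counts and a serial scan standing in for the threaded prefixsum:

#include <cstdio>

int main()
{
    const int nzc = 4;
    int colnnzC[nzc] = {2, 0, 3, 1};    // hypothetical nnz per nonempty column of C
    int colptrC[nzc + 1];
    colptrC[0] = 0;
    for (int i = 0; i < nzc; ++i)
        colptrC[i + 1] = colptrC[i] + colnnzC[i];   // serial equivalent of prefixsum()
    // colptrC = {0, 2, 2, 5, 6}; column i of C is written to tuplesC[colptrC[i] .. colptrC[i+1])
    std::printf("total nnz(C) = %d\n", colptrC[nzc]);  // prints 6
    return 0;
}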
Referenced members and functions:

Dcsc<IT,NT> data members:
    IT * cp;        // the master array, size nzc+1 (keeps column pointers)
    IT * jc;        // column indices, size nzc
    IT * ir;        // row indices, size nz
    NT * numx;      // generic values, size nz
    IT   nzc;       // number of columns with at least one nonzero in them

Member functions:
    IT   ConstructAux(IT ndim, IT *&aux) const;
    void FillColInds(const VT *colnums, IT nind, std::vector< std::pair< IT, IT > > &colinds, IT *aux, IT csize) const;
    Dcsc< IT, NT > * GetDCSC() const;

Functions:
    T * prefixsum(T *in, int size, int nthreads);
    IT * estimateNNZ(const SpDCCols< IT, NT1 > &A, const SpDCCols< IT, NT2 > &B);
    SpTuples< IT, NTO > * LocalSpGEMM(const SpDCCols< IT, NT1 > &A, const SpDCCols< IT, NT2 > &B, bool clearA, bool clearB);
    SpDCCols< IT, NT > * multiply(SpDCCols< IT, NT > &splitA, SpDCCols< IT, NT > &splitB, CCGrid &CMG, bool isBT, bool threaded);
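As a reading aid for the Dcsc members listed above, the sketch below lays out a small hypothetical 4x4 matrix in the same four arrays (cp, jc, ir, numx) and walks it column by column; the matrix and its values are made up for illustration and do not come from the library:

#include <cstdio>

int main()
{
    // Hypothetical 4x4 matrix with nonzeros only in columns 0 and 2:
    //   | 1 . 5 . |
    //   | . . . . |
    //   | 2 . . . |
    //   | . . 6 . |
    const int nzc = 2;                   // number of columns with at least one nonzero
    int    jc[nzc]     = {0, 2};         // indices of the nonempty columns, size nzc
    int    cp[nzc + 1] = {0, 2, 4};      // column pointers into ir/numx, size nzc+1
    int    ir[]        = {0, 2, 0, 3};   // row indices, size nz
    double numx[]      = {1, 2, 5, 6};   // values, size nz

    // Walk the structure column by column, as the SpGEMM loops above do for B.
    for (int k = 0; k < nzc; ++k)
        for (int p = cp[k]; p < cp[k + 1]; ++p)
            std::printf("A(%d,%d) = %g\n", ir[p], jc[k], numx[p]);
    return 0;
}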