30 #ifndef _THREADED_FRIENDS_H_ 31 #define _THREADED_FRIENDS_H_ 47 template <
class IU,
class NU>
50 template <
class IU,
class NU>
53 template <
class IU,
class NU>
58 template <
typename SR,
typename NTO,
typename IT,
typename NT1,
typename NT2>
60 (
const SpDCCols<IT, NT1> &
A,
61 const SpDCCols<IT, NT2> &
B,
62 bool clearA,
bool clearB)
64 IT mdim = A.getnrow();
65 IT ndim = B.getncol();
67 if(A.isZero() || B.isZero())
69 return new SpTuples<IT, NTO>(0, mdim, ndim);
72 Dcsc<IT,NT1>* Adcsc = A.GetDCSC();
73 Dcsc<IT,NT2>* Bdcsc = B.GetDCSC();
75 IT cnzmax = Adcsc->nz + Bdcsc->nz;
76 float cf =
static_cast<float>(nA+1) / static_cast<float>(Adcsc->nzc);
77 IT csize =
static_cast<IT
>(ceil(cf));
79 Adcsc->ConstructAux(nA, aux);
85 numThreads = omp_get_num_threads();
90 IT* colptrC = prefixsum<IT>(colnnzC, Bdcsc->nzc, numThreads);
92 IT nnzc = colptrC[Bdcsc->nzc];
93 std::tuple<IT,IT,NTO> * tuplesC =
static_cast<std::tuple<IT,IT,NTO> *
> (::operator
new (
sizeof(std::tuple<IT,IT,NTO>[nnzc])));
96 std::vector<std::vector< std::pair<IT,IT>>> colindsVec(numThreads);
97 std::vector<std::vector<HeapEntry<IT,NT1>>> globalheapVec(numThreads);
99 for(
int i=0; i<numThreads; i++)
101 colindsVec[i].resize(nnzA/numThreads);
102 globalheapVec[i].resize(nnzA/numThreads);
106 #pragma omp parallel for 107 for(
int i=0; i < Bdcsc->nzc; ++i)
109 IT nnzcolB = Bdcsc->cp[i+1] - Bdcsc->cp[i];
110 int myThread = omp_get_thread_num();
111 if(colindsVec[myThread].
size() < nnzcolB)
113 colindsVec[myThread].resize(nnzcolB);
114 globalheapVec[myThread].resize(nnzcolB);
120 Adcsc->FillColInds(Bdcsc->ir + Bdcsc->cp[i], nnzcolB, colindsVec[myThread], aux, csize);
121 std::pair<IT,IT> * colinds = colindsVec[myThread].data();
122 HeapEntry<IT,NT1> * wset = globalheapVec[myThread].data();
126 for(IT j = 0; (unsigned)j < nnzcolB; ++j)
128 if(colinds[j].first != colinds[j].second)
130 wset[hsize++] = HeapEntry< IT,NT1 > (Adcsc->ir[colinds[j].first], j, Adcsc->numx[colinds[j].first]);
133 std:make_heap(wset, wset+hsize);
135 IT curptr = colptrC[i];
138 std::pop_heap(wset, wset + hsize);
139 IT locb = wset[hsize-1].runr;
141 NTO mrhs =
SR::multiply(wset[hsize-1].num, Bdcsc->numx[Bdcsc->cp[i]+locb]);
142 if (!SR::returnedSAID())
144 if( (curptr > colptrC[i]) && std::get<0>(tuplesC[curptr-1]) == wset[hsize-1].key)
146 std::get<2>(tuplesC[curptr-1]) = SR::add(std::get<2>(tuplesC[curptr-1]), mrhs);
150 tuplesC[curptr++]= std::make_tuple(wset[hsize-1].key, Bdcsc->jc[i], mrhs) ;
155 if( (++(colinds[locb].first)) != colinds[locb].second)
158 wset[hsize-1].key = Adcsc->ir[colinds[locb].first];
159 wset[hsize-1].num = Adcsc->numx[colinds[locb].first];
160 std::push_heap(wset, wset+hsize);
170 delete const_cast<SpDCCols<IT, NT1> *
>(&
A);
172 delete const_cast<SpDCCols<IT, NT2> *
>(&
B);
177 SpTuples<IT, NTO>* spTuplesC =
new SpTuples<IT, NTO> (nnzc, mdim, ndim, tuplesC,
true);
SpTuples< IT, NTO > * LocalSpGEMM(const SpDCCols< IT, NT1 > &A, const SpDCCols< IT, NT2 > &B, bool clearA, bool clearB)
IT * estimateNNZ(const SpDCCols< IT, NT1 > &A, const SpDCCols< IT, NT2 > &B)
SpDCCols< IT, NT > * multiply(SpDCCols< IT, NT > &splitA, SpDCCols< IT, NT > &splitB, CCGrid &CMG, bool isBT, bool threaded)