// Non-contiguous excerpt from prefixsum<T>(T* in, int size, int nthreads), a
// two-pass OpenMP prefix sum; omitted source lines are marked with "// ...".
    std::vector<T> tsum(nthreads+1);          // per-thread partial sums
    T* out = new T[size+1];
    // ...
        ithread = omp_get_thread_num();
        // ...
        // Pass 1: each thread accumulates its static chunk of the input.
#pragma omp for schedule(static)
        for (int i=0; i<size; i++)
        // ...
        tsum[ithread+1] = sum;                // publish this thread's chunk total
        // ...
        // Offset for this thread = sum of the totals of all preceding chunks.
        for(int i=0; i<(ithread+1); i++)
        // ...
        // Pass 2: shift the local running sums by the per-thread offset.
#pragma omp for schedule(static)
        for (int i=0; i<size; i++)
        // ...
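Because the excerpt omits the loop bodies, the following self-contained sketch shows the same two-pass technique end to end, assuming the usual exclusive-scan semantics (out[0] = 0, out[i+1] = in[0] + ... + in[i]). The function name prefixsum_sketch and the main() driver are illustrative, not part of CombBLAS.

#include <omp.h>
#include <cstdio>
#include <vector>

template <typename T>
T* prefixsum_sketch(T* in, int size, int nthreads)
{
    std::vector<T> tsum(nthreads + 1, T());   // tsum[t+1] = total of thread t's chunk
    T* out = new T[size + 1];
    out[0] = T();

    #pragma omp parallel num_threads(nthreads)
    {
        int ithread = omp_get_thread_num();
        T sum = T();

        // Pass 1: each thread sums its static chunk, writing chunk-local running sums.
        #pragma omp for schedule(static)
        for (int i = 0; i < size; i++)
        {
            sum += in[i];
            out[i + 1] = sum;
        }
        tsum[ithread + 1] = sum;              // publish this chunk's total
        #pragma omp barrier                   // all totals visible before they are read

        // Offset of this thread's chunk = sum of all preceding chunk totals (tsum[0] is 0).
        T offset = T();
        for (int i = 0; i < (ithread + 1); i++)
            offset += tsum[i];

        // Pass 2: shift the local running sums. schedule(static) with the same
        // bounds guarantees the same iteration-to-thread mapping as pass 1.
        #pragma omp for schedule(static)
        for (int i = 0; i < size; i++)
            out[i + 1] += offset;
    }
    return out;
}

int main()
{
    int in[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    int* out = prefixsum_sketch(in, 8, 4);
    for (int i = 0; i <= 8; i++) std::printf("%d ", out[i]);   // 0 1 3 6 10 15 21 28 36
    std::printf("\n");
    delete[] out;
    return 0;
}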
// Non-contiguous excerpt from LocalSpGEMM: heap-based, column-by-column SpGEMM
// over the semiring SR. Omitted source lines (declarations of Adcsc, Bdcsc, aux,
// nA, nnzA, colnnzC, wset, hsize, mrhs, etc.) are marked with "// ...".
template <typename SR, typename NTO, typename IT, typename NT1, typename NT2>
SpTuples<IT, NTO> * LocalSpGEMM(const SpDCCols<IT, NT1> & A, const SpDCCols<IT, NT2> & B,
                                bool clearA, bool clearB)
{
    // ...
    // Bucket size for the aux array that accelerates column lookups in A.
    float cf  = static_cast<float>(nA+1) / static_cast<float>(Adcsc->nzc);
    IT csize = static_cast<IT>(ceil(cf));
    // ...
    Adcsc->ConstructAux(nA, aux);
    // ...
        numThreads = omp_get_num_threads();
    // ...
    // Turn the per-column nonzero counts (from estimateNNZ) into output offsets.
    IT* colptrC = prefixsum<IT>(colnnzC, Bdcsc->nzc, numThreads);
    // ...
    IT nnzc = colptrC[Bdcsc->nzc];
    std::tuple<IT,IT,NTO> * tuplesC = static_cast<std::tuple<IT,IT,NTO> *>
        (::operator new (sizeof(std::tuple<IT,IT,NTO>[nnzc])));
    // ...
    // Per-thread scratch space, reused across columns to avoid reallocation.
    std::vector<std::vector<std::pair<IT,IT>>> colindsVec(numThreads);
    std::vector<std::vector<HeapEntry<IT,NT1>>> globalheapVec(numThreads);
    // ...
    for(int i=0; i<numThreads; i++)
    {
        // ...
        colindsVec[i].resize(nnzA/numThreads);
        globalheapVec[i].resize(nnzA/numThreads);
    }
    // ...
#pragma omp parallel for
    for(int i=0; i < Bdcsc->nzc; ++i)      // one nonzero column of B (= one column of C) per iteration
    {
        size_t nnzcolB = Bdcsc->cp[i+1] - Bdcsc->cp[i];
        // ...
        myThread = omp_get_thread_num();
        // ...
        if(colindsVec[myThread].size() < nnzcolB)
        {
            colindsVec[myThread].resize(nnzcolB);
            globalheapVec[myThread].resize(nnzcolB);
        }
        // ...
        // For each nonzero B(k,i), locate column k of A: colinds[j] holds the
        // [begin, end) range of that column inside Adcsc->ir / Adcsc->numx.
        Adcsc->FillColInds(Bdcsc->ir + Bdcsc->cp[i], nnzcolB, colindsVec[myThread], aux, csize);
        std::pair<IT,IT> * colinds = colindsVec[myThread].data();
        // ...
        // Seed the heap with the first entry of every non-empty column of A.
        for(IT j = 0; (unsigned)j < nnzcolB; ++j)
        {
            if(colinds[j].first != colinds[j].second)
                wset[hsize++] = HeapEntry<IT,NT1>(Adcsc->ir[colinds[j].first], j, Adcsc->numx[colinds[j].first]);
        }
        // ...
        std::make_heap(wset, wset+hsize);
        // ...
        IT curptr = colptrC[i];
        // ...
            std::pop_heap(wset, wset + hsize);
            IT locb = wset[hsize-1].runr;      // which column of A the popped entry came from
            // ...
            if (!SR::returnedSAID())
            {
                if( (curptr > colptrC[i]) && std::get<0>(tuplesC[curptr-1]) == wset[hsize-1].key)
                {
                    // Same row as the previously emitted tuple: accumulate.
                    std::get<2>(tuplesC[curptr-1]) = SR::add(std::get<2>(tuplesC[curptr-1]), mrhs);
                }
                else
                {
                    tuplesC[curptr++] = std::make_tuple(wset[hsize-1].key, Bdcsc->jc[i], mrhs);
                }
            }
            // ...
            // Advance the source column; if it is not exhausted, push its next
            // entry back onto the heap.
            if( (++(colinds[locb].first)) != colinds[locb].second)
            {
                // ...
                wset[hsize-1].key = Adcsc->ir[colinds[locb].first];
                wset[hsize-1].num = Adcsc->numx[colinds[locb].first];
                std::push_heap(wset, wset+hsize);
            }
            // ...
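To see the heap pattern in isolation, here is a hedged, self-contained sketch of the core multiway merge: the (row, value) lists stand in for the columns of A selected (and already scaled) by one column of B, and plain double addition stands in for SR::add. All names (Entry, HeapItem, mergeColumn) are illustrative, not CombBLAS API.

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <vector>

// One nonzero of an already-scaled column of A: (row index, value).
struct Entry { int row; double val; };

// Current head of one input list plus the index of the list it came from.
struct HeapItem { int row; double val; std::size_t list; };

// Merge k row-sorted lists into one row-sorted output column, summing
// entries that share a row index.
std::vector<Entry> mergeColumn(const std::vector<std::vector<Entry>>& lists)
{
    std::vector<std::size_t> pos(lists.size(), 0);     // next unread entry per list
    std::vector<HeapItem> heap;
    auto byRowDesc = [](const HeapItem& a, const HeapItem& b) { return a.row > b.row; };

    // Seed the heap with the first entry of every non-empty list.
    for (std::size_t j = 0; j < lists.size(); ++j)
        if (!lists[j].empty())
            heap.push_back({lists[j][0].row, lists[j][0].val, j});
    std::make_heap(heap.begin(), heap.end(), byRowDesc);    // behaves as a min-heap on row

    std::vector<Entry> out;
    while (!heap.empty())
    {
        std::pop_heap(heap.begin(), heap.end(), byRowDesc); // smallest row moves to the back
        HeapItem top = heap.back();

        if (!out.empty() && out.back().row == top.row)
            out.back().val += top.val;                      // duplicate row: accumulate ("SR::add")
        else
            out.push_back({top.row, top.val});              // new row: append

        // Advance the list the popped entry came from; refill or shrink the heap.
        std::size_t j = top.list;
        if (++pos[j] < lists[j].size())
        {
            heap.back() = {lists[j][pos[j]].row, lists[j][pos[j]].val, j};
            std::push_heap(heap.begin(), heap.end(), byRowDesc);
        }
        else
        {
            heap.pop_back();
        }
    }
    return out;
}

int main()
{
    // Two already-scaled columns of A contributing to one output column of C.
    std::vector<std::vector<Entry>> lists = {
        {{0, 1.0}, {2, 2.0}, {5, 3.0}},
        {{2, 4.0}, {4, 5.0}}
    };
    for (const Entry& e : mergeColumn(lists))
        std::printf("row %d -> %g\n", e.row, e.val);        // rows 0, 2, 4, 5; row 2 merged to 6
    return 0;
}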
// Non-contiguous excerpt from estimateNNZ: the symbolic phase that counts the
// nonzeros of every output column, so LocalSpGEMM can allocate exactly.
// Omitted source lines (declarations of Adcsc, Bdcsc, aux, nnzA, hsize,
// prevRow, etc.) are marked with "// ...".
template <typename IT, typename NT1, typename NT2>
IT * estimateNNZ(const SpDCCols<IT, NT1> & A, const SpDCCols<IT, NT2> & B)
{
    // ...
    float cf  = static_cast<float>(A.getncol()+1) / static_cast<float>(Adcsc->nzc);
    IT csize = static_cast<IT>(ceil(cf));
    // ...
        numThreads = omp_get_num_threads();
    // ...
    IT* colnnzC = new IT[Bdcsc->nzc];        // per-column nonzero counts of C
    // ...
#pragma omp parallel for
    for(IT i=0; i< Bdcsc->nzc; ++i)
        // ... (loop body omitted)
    // ...
    // Per-thread scratch space, reused across columns to avoid reallocation.
    std::vector<std::vector<std::pair<IT,IT>>> colindsVec(numThreads);
    std::vector<std::vector<std::pair<IT,IT>>> globalheapVec(numThreads);
    // ...
    for(int i=0; i<numThreads; i++)
    {
        // ...
        colindsVec[i].resize(nnzA/numThreads);
        globalheapVec[i].resize(nnzA/numThreads);
    }
    // ...
#pragma omp parallel for
    for(int i=0; i < Bdcsc->nzc; ++i)        // one nonzero column of B (= one column of C) per iteration
    {
        size_t nnzcolB = Bdcsc->cp[i+1] - Bdcsc->cp[i];
        // ...
        myThread = omp_get_thread_num();
        // ...
        if(colindsVec[myThread].size() < nnzcolB)
        {
            colindsVec[myThread].resize(nnzcolB);
            globalheapVec[myThread].resize(nnzcolB);
        }
        // ...
        Adcsc->FillColInds(Bdcsc->ir + Bdcsc->cp[i], nnzcolB, colindsVec[myThread], aux, csize);
        std::pair<IT,IT> * colinds = colindsVec[myThread].data();
        std::pair<IT,IT> * curheap = globalheapVec[myThread].data();
        // ...
        // Seed a (row, source-column) min-heap with the first entry of every
        // non-empty column of A touched by column i of B.
        for(IT j = 0; (unsigned)j < nnzcolB; ++j)
        {
            if(colinds[j].first != colinds[j].second)
                curheap[hsize++] = std::make_pair(Adcsc->ir[colinds[j].first], j);
        }
        // ...
        std::make_heap(curheap, curheap+hsize, std::greater<std::pair<IT,IT>>());
        // ...
            std::pop_heap(curheap, curheap + hsize, std::greater<std::pair<IT,IT>>());
            IT locb = curheap[hsize-1].second;   // which column of A the popped entry came from
            // ...
            if( curheap[hsize-1].first != prevRow)
            {
                // A row index not yet seen in this output column: count it.
                prevRow = curheap[hsize-1].first;
                // ...
            }
            // ...
            // Advance the source column; if it is not exhausted, push its next
            // row index back onto the heap.
            if( (++(colinds[locb].first)) != colinds[locb].second)
            {
                // ...
                curheap[hsize-1].first = Adcsc->ir[colinds[locb].first];
                std::push_heap(curheap, curheap+hsize, std::greater<std::pair<IT,IT>>());
            }
            // ...
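The counting loop above uses the same heap idiom as LocalSpGEMM, but carries no values: it only needs to notice when the popped row index changes (prevRow). Below is a hedged, self-contained sketch of that symbolic step, counting the distinct row indices in the union of several row-sorted lists; countMergedRows and the test data are illustrative, not CombBLAS API.

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <functional>
#include <limits>
#include <utility>
#include <vector>

// Count the number of distinct row indices in the union of row-sorted lists.
int countMergedRows(const std::vector<std::vector<int>>& lists)
{
    std::vector<std::size_t> pos(lists.size(), 0);       // next unread entry per list
    std::vector<std::pair<int, std::size_t>> heap;       // (row index, source list)

    for (std::size_t j = 0; j < lists.size(); ++j)
        if (!lists[j].empty())
            heap.emplace_back(lists[j][0], j);
    std::make_heap(heap.begin(), heap.end(), std::greater<std::pair<int, std::size_t>>());

    int count = 0;
    int prevRow = std::numeric_limits<int>::min();        // sentinel: no row seen yet
    while (!heap.empty())
    {
        std::pop_heap(heap.begin(), heap.end(), std::greater<std::pair<int, std::size_t>>());
        std::pair<int, std::size_t> top = heap.back();

        if (top.first != prevRow)                          // first time this row appears
        {
            ++count;
            prevRow = top.first;
        }

        std::size_t j = top.second;
        if (++pos[j] < lists[j].size())                    // advance the source list
        {
            heap.back() = std::make_pair(lists[j][pos[j]], j);
            std::push_heap(heap.begin(), heap.end(), std::greater<std::pair<int, std::size_t>>());
        }
        else
        {
            heap.pop_back();
        }
    }
    return count;
}

int main()
{
    std::vector<std::vector<int>> lists = {{0, 2, 5}, {2, 4}, {4, 7}};
    std::printf("distinct rows: %d\n", countMergedRows(lists));   // prints 5
    return 0;
}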
 
Referenced members and functions:

  Dcsc<IT,NT> (accessed through Adcsc / Bdcsc):
    IT * cp     -- the master array, size nzc+1 (keeps column pointers)
    IT * jc     -- col indices, size nzc
    IT * ir     -- row indices, size nz
    NT * numx   -- generic values, size nz
    IT   nzc    -- number of columns with at least one non-zero in them
    IT   ConstructAux(IT ndim, IT *&aux) const
    void FillColInds(const VT *colnums, IT nind, std::vector<std::pair<IT,IT>> &colinds, IT *aux, IT csize) const

  Other referenced functions:
    T * prefixsum(T *in, int size, int nthreads)
    IT * estimateNNZ(const SpDCCols<IT,NT1> &A, const SpDCCols<IT,NT2> &B)
    SpTuples<IT,NTO> * LocalSpGEMM(const SpDCCols<IT,NT1> &A, const SpDCCols<IT,NT2> &B, bool clearA, bool clearB)
    SpDCCols<IT,NT> * multiply(SpDCCols<IT,NT> &splitA, SpDCCols<IT,NT> &splitB, CCGrid &CMG, bool isBT, bool threaded)
    Dcsc<IT,NT> * GetDCSC() const
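As a quick orientation for the Dcsc arrays listed above, here is a tiny worked example; the 4x4 matrix and its values are assumptions made up for illustration, not data from the library.

#include <cstdio>

int main()
{
    // A 4x4 matrix whose only nonzeros are A(0,1)=10, A(2,1)=20, A(3,3)=30,
    // i.e. column 1 holds rows {0,2} and column 3 holds row {3}.
    const int nzc = 2;                        // columns with at least one nonzero
    int    jc[nzc]     = {1, 3};              // indices of those columns, size nzc
    int    cp[nzc + 1] = {0, 2, 3};           // column pointers into ir/numx, size nzc+1
    int    ir[3]       = {0, 2, 3};           // row indices, size nz = 3
    double numx[3]     = {10.0, 20.0, 30.0};  // values, size nz = 3

    // Column jc[k] holds the nonzeros ir[cp[k]] .. ir[cp[k+1]-1].
    for (int k = 0; k < nzc; ++k)
        for (int p = cp[k]; p < cp[k + 1]; ++p)
            std::printf("A(%d,%d) = %g\n", ir[p], jc[k], numx[p]);
    return 0;
}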