10 template <
class T,
class ITYPE>
14 template <
class T,
class ITYPE>
25 if(!(rowbits > 1 && colbits > 1))
27 cerr <<
"Matrix too small for this library" << endl;
31 rowhighbits =
static_cast<ITYPE
>(rowbits/2)-1;
32 colhighbits =
static_cast<ITYPE
>(colbits/2)-1;
34 rowlowbits = rowbits - rowhighbits;
35 collowbits = colbits - colhighbits;
37 lowrowmask = IntPower<ITYPE>(2,rowlowbits) - 1;
38 highrowmask = ((roundrowup - 1) ^ lowrowmask);
39 lowcolmask = IntPower<ITYPE>(2,collowbits) - 1;
40 highcolmask = ((roundcolup - 1) ^ lowcolmask);
42 nbc = IntPower<ITYPE>(2,colhighbits);
43 nbr = IntPower<ITYPE>(2,rowhighbits);
48 template <
class T,
class ITYPE>
51 assert(nz != 0 && n != 0 && m != 0);
56 top =
new ITYPE* [nbr];
58 for(ITYPE i=0; i<nbr; ++i)
59 top[i] =
new ITYPE[nbc+1];
64 template <
class T,
class ITYPE>
66 : nz(rhs.nz), m(rhs.m), n(rhs.n), ntop(rhs.ntop), nbr(rhs.nbr), nbc(rhs.nbc), rowhighbits(rhs.rowhighbits),rowlowbits(rhs.rowlowbits),
67 highrowmask(rhs.highrowmask), lowrowmask(rhs.lowrowmask), colhighbits(rhs.colhighbits), collowbits(rhs.collowbits),
68 highcolmask(rhs.highcolmask), lowcolmask(rhs.lowcolmask)
75 for(ITYPE i=0; i< nz; ++i)
77 for(ITYPE i=0; i< nz; ++i)
82 top =
new ITYPE* [nbr];
84 for(ITYPE i=0; i<nbr; ++i)
85 top[i] =
new ITYPE[nbc+1];
87 for(ITYPE i=0; i<nbr; ++i)
88 for(ITYPE j=0; j <= nbc; ++j)
89 top[i][j] = rhs.top[i][j];
93 template <
class T,
class ITYPE>
106 for(ITYPE i=0; i<nbr; ++i)
118 rowhighbits = rhs.rowhighbits;
119 rowlowbits = rhs.rowlowbits;
120 highrowmask = rhs.highrowmask;
121 lowrowmask = rhs.lowrowmask;
123 colhighbits = rhs.colhighbits;
124 collowbits = rhs.collowbits;
125 highcolmask = rhs.highcolmask;
126 lowcolmask = rhs.lowcolmask;
133 for(ITYPE i=0; i< nz; ++i)
135 for(ITYPE i=0; i< nz; ++i)
140 top =
new ITYPE* [nbr];
142 for(ITYPE i=0; i<nbr; ++i)
143 top[i] =
new ITYPE[nbc+1];
145 for(ITYPE i=0; i<nbr; ++i)
146 for(ITYPE j=0; j <= nbc; ++j)
147 top[i][j] = rhs.top[i][j];
153 template <
class T,
class ITYPE>
163 for(ITYPE i=0; i<nbr; ++i)
170 template <
class T,
class ITYPE>
173 typedef std::pair<ITYPE, ITYPE> ipair;
174 typedef std::pair<ITYPE, ipair> mypair;
176 assert(nz != 0 && n != 0 && m != 0);
182 top =
new ITYPE* [nbr];
183 for(ITYPE i=0; i<nbr; ++i)
184 top[i] =
new ITYPE[nbc+1];
186 mypair * pairarray =
new mypair[nz];
188 for(ITYPE j = 0; j < n; ++j)
190 for (ITYPE i = csc.jc [j] ; i < csc.jc[j+1] ; ++i)
193 ITYPE hindex = (((highrowmask & csc.ir[i] ) >> rowlowbits) << colhighbits)
194 | ((highcolmask & j) >> collowbits);
195 ITYPE lindex = ((lowrowmask & csc.ir[i]) << collowbits) | (lowcolmask & j) ;
198 pairarray[k++] = mypair(hindex, ipair(lindex,i));
201 sort(pairarray, pairarray+nz);
209 for(ITYPE i = 0; i < nbr; ++i)
211 for(ITYPE j = 0; j < nbc; ++j)
214 while(cnz < nz && pairarray[cnz].first == ((i*nbc)+j) )
216 bot[cnz] = (pairarray[cnz].second).first;
217 num[cnz++] = csc.num[(pairarray[cnz].second).second];
226 template <
typename T,
typename ITYPE>
227 void Sym<T, ITYPE>::BMult(ITYPE * btop, ITYPE bstart, ITYPE bend,
const T * x, T * y, ITYPE ysize)
const
229 if((btop[bend] - btop[bstart] <
BREAKEVEN * ysize) || (bend-bstart == 1))
232 SubSpMV(btop, bstart, bend, x, y);
235 ITYPE nnb = (bend+bstart)/2;
237 cilk_spawn BMult(btop, bstart, nnb, x, y, ysize);
240 BMult(btop, nnb, bend, x, y, ysize);
244 T * temp =
new T[ysize];
245 for(ITYPE i=0; i<ysize; ++i)
248 BMult(btop, nnb, bend, x, temp, ysize);
251 for(ITYPE i=0; i<ysize; ++i)
261 template <
class T,
class ITYPE>
263 const T * __restrict x, T * __restrict suby)
const
265 for (ITYPE j = bstart ; j < bend ; ++j)
268 ITYPE chi = ((j << collowbits) & highcolmask);
269 const T * subx = &x[chi];
276 ITYPE blocknz = btop[j+1] - btop[j];
277 if(blocknz > 8 && blocknz < 4096)
280 const int botinc =
CLSIZE/
sizeof(ITYPE);
281 const int numinc =
CLSIZE/
sizeof(T);
283 for (ITYPE i=0; i < blocknz; i+=botinc)
285 __builtin_prefetch (&bot[btop[j]+i], 0, 0);
287 for (ITYPE i=0; i < blocknz; i+=numinc)
289 __builtin_prefetch (&num[btop[j]+i], 0, 0);
295 __m128i m1 = _mm_set_epi32 (mu1, mu1, mu1, mu1);
296 __m128i m2 = _mm_set_epi64x(mlong1, mlong2);
300 ofstream stat(
"Perfdeb.txt", ios::app);
301 stat << btop[j+1] - btop[j] << endl;
304 for (ITYPE k = btop[j] ; k < btop[j+1] ; ++k)
306 ITYPE rli = ((bot[k] >> collowbits) & lowrowmask);
307 ITYPE cli = (bot[k] & lowcolmask);
309 suby [rli] += num[k] * subx [cli] ;
315 template <
class T,
class ITYPE>
319 int * __restrict a = (
int*) addr;
320 const int inc =
CLSIZE/ssize;
323 for(
int i=0; i<total; i+=inc)
331 template <
class T,
class ITYPE>
336 ITYPE leaddim = lowcolmask+1;
342 for(ITYPE j = 0; j < leaddim; ++j)
344 for(ITYPE i = j; i < ntop ; i += leaddim)
347 cnz += top[i+1]-top[i];
unsigned int nextpoweroftwo(unsigned int v)
Sym< T, ITYPE > & operator=(const Sym< T, ITYPE > &rhs)
unsigned int highestbitset(unsigned __int64 v)