Compressed Sparse Blocks  1.2
 All Classes Files Functions Variables Typedefs Friends Macros Pages
sym.cpp
Go to the documentation of this file.
1 #include "sym.h"
2 #include "utility.h"
3 #include <cassert>
4 
5 
6 // You must declare "static data members" inside the class body,
7 // and you also need to define them outside the class body.
8 // When the program refers to a static data member that is only declared but not defined,
9 // the "unresolved external symbol" error occurs.
10 template <class T, class ITYPE>
12 
13 
14 template <class T, class ITYPE>
16 {
17  ITYPE roundrowup = nextpoweroftwo(m);
18  ITYPE roundcolup = nextpoweroftwo(n);
19  ITYPE roundnnzup = nextpoweroftwo(nz);
20 
21  ITYPE rowbits = highestbitset(roundrowup);
22  ITYPE colbits = highestbitset(roundcolup);
23  ITYPE nnzbits = highestbitset(roundnnzup);
24 
25  if(!(rowbits > 1 && colbits > 1))
26  {
27  cerr << "Matrix too small for this library" << endl;
28  return;
29  }
30 
31  rowhighbits = static_cast<ITYPE>(rowbits/2)-1; // # higher order bits for rows
32  colhighbits = static_cast<ITYPE>(colbits/2)-1; // # higher order bits for columns
33 
34  rowlowbits = rowbits - rowhighbits; // rowlowbits = # lower order bits for rows
35  collowbits = colbits - colhighbits; // collowbits = # lower order bits for columns
36 
37  lowrowmask = IntPower<ITYPE>(2,rowlowbits) - 1;
38  highrowmask = ((roundrowup - 1) ^ lowrowmask);
39  lowcolmask = IntPower<ITYPE>(2,collowbits) - 1;
40  highcolmask = ((roundcolup - 1) ^ lowcolmask);
41 
42  nbc = IntPower<ITYPE>(2,colhighbits); // #{block columns} = #{blocks in any block row}
43  nbr = IntPower<ITYPE>(2,rowhighbits); // #{block rows)
44  ntop = nbc * nbr; // size of top array
45 }
46 
47 // Constructing empty Sym objects (size = 0) are not allowed.
48 template <class T, class ITYPE>
49 Sym<T, ITYPE>::Sym (ITYPE size, ITYPE rows, ITYPE cols): nz(size),m(rows),n(cols)
50 {
51  assert(nz != 0 && n != 0 && m != 0);
52  Init();
53 
54  num = new T[nz];
55  bot = new ITYPE[nz];
56  top = new ITYPE* [nbr];
57 
58  for(ITYPE i=0; i<nbr; ++i)
59  top[i] = new ITYPE[nbc+1];
60 }
61 
62 
63 // copy constructor
64 template <class T, class ITYPE>
66 : nz(rhs.nz), m(rhs.m), n(rhs.n), ntop(rhs.ntop), nbr(rhs.nbr), nbc(rhs.nbc), rowhighbits(rhs.rowhighbits),rowlowbits(rhs.rowlowbits),
67 highrowmask(rhs.highrowmask), lowrowmask(rhs.lowrowmask), colhighbits(rhs.colhighbits), collowbits(rhs.collowbits),
68 highcolmask(rhs.highcolmask), lowcolmask(rhs.lowcolmask)
69 {
70  if(nz > 0)
71  {
72  num = new T[nz];
73  bot = new ITYPE[nz];
74 
75  for(ITYPE i=0; i< nz; ++i)
76  num[i]= rhs.num[i];
77  for(ITYPE i=0; i< nz; ++i)
78  bot[i]= rhs.bot[i];
79  }
80  if ( ntop > 0)
81  {
82  top = new ITYPE* [nbr];
83 
84  for(ITYPE i=0; i<nbr; ++i)
85  top[i] = new ITYPE[nbc+1];
86 
87  for(ITYPE i=0; i<nbr; ++i)
88  for(ITYPE j=0; j <= nbc; ++j)
89  top[i][j] = rhs.top[i][j];
90  }
91 }
92 
93 template <class T, class ITYPE>
95 {
96  if(this != &rhs)
97  {
98  if(nz > 0) // if the existing object is not empty
99  {
100  // make it empty
101  delete [] bot;
102  delete [] num;
103  }
104  if(ntop > 0)
105  {
106  for(ITYPE i=0; i<nbr; ++i)
107  delete [] top[i];
108  delete [] top;
109  }
110 
111  nz = rhs.nz;
112  n = rhs.n;
113  m = rhs.m;
114  ntop = rhs.ntop;
115  nbr = rhs.nbr;
116  nbc = rhs.nbc;
117 
118  rowhighbits = rhs.rowhighbits;
119  rowlowbits = rhs.rowlowbits;
120  highrowmask = rhs.highrowmask;
121  lowrowmask = rhs.lowrowmask;
122 
123  colhighbits = rhs.colhighbits;
124  collowbits = rhs.collowbits;
125  highcolmask = rhs.highcolmask;
126  lowcolmask = rhs.lowcolmask;
127 
128  if(nz > 0) // if the copied object is not empty
129  {
130  num = new T[nz];
131  bot = new ITYPE[nz];
132 
133  for(ITYPE i=0; i< nz; ++i)
134  num[i]= rhs.num[i];
135  for(ITYPE i=0; i< nz; ++i)
136  bot[i]= rhs.bot[i];
137  }
138  if(ntop > 0)
139  {
140  top = new ITYPE* [nbr];
141 
142  for(ITYPE i=0; i<nbr; ++i)
143  top[i] = new ITYPE[nbc+1];
144 
145  for(ITYPE i=0; i<nbr; ++i)
146  for(ITYPE j=0; j <= nbc; ++j)
147  top[i][j] = rhs.top[i][j];
148  }
149  }
150  return *this;
151 }
152 
153 template <class T, class ITYPE>
155 {
156  if( nz > 0)
157  {
158  delete [] bot;
159  delete [] num;
160  }
161  if ( ntop > 0)
162  {
163  for(ITYPE i=0; i<nbr; ++i)
164  delete [] top[i];
165  delete [] top;
166  }
167 }
168 
169 
170 template <class T, class ITYPE>
171 Sym<T, ITYPE>::Sym (Csc<T, ITYPE> & csc):nz(csc.nz),m(csc.m),n(csc.n)
172 {
173  typedef std::pair<ITYPE, ITYPE> ipair;
174  typedef std::pair<ITYPE, ipair> mypair;
175 
176  assert(nz != 0 && n != 0 && m != 0);
177  Init();
178 
179  num = new T[nz];
180  bot = new ITYPE[nz];
181 
182  top = new ITYPE* [nbr];
183  for(ITYPE i=0; i<nbr; ++i)
184  top[i] = new ITYPE[nbc+1];
185 
186  mypair * pairarray = new mypair[nz];
187  ITYPE k = 0;
188  for(ITYPE j = 0; j < n; ++j)
189  {
190  for (ITYPE i = csc.jc [j] ; i < csc.jc[j+1] ; ++i) // scan the jth column
191  {
192  // concatenate the higher/lower order half of both row (first) index and col (second) index bits
193  ITYPE hindex = (((highrowmask & csc.ir[i] ) >> rowlowbits) << colhighbits)
194  | ((highcolmask & j) >> collowbits);
195  ITYPE lindex = ((lowrowmask & csc.ir[i]) << collowbits) | (lowcolmask & j) ;
196 
197  // i => location of that nonzero in csc.ir and csc.num arrays
198  pairarray[k++] = mypair(hindex, ipair(lindex,i));
199  }
200  }
201  sort(pairarray, pairarray+nz); // sort according to hindex
202 
203  // Now, elements within each block are sorted with respect to their lindex (i.e. there are in row-major order)
204  // This is because the default comparison operator of pair<T1,T2> uses lexicographic comparison:
205  // within each block, hindex is the same, so they are sorted w.r.t ipair(lindex,i) lexicographically.
206 
207  ITYPE cnz = 0;
208 
209  for(ITYPE i = 0; i < nbr; ++i)
210  {
211  for(ITYPE j = 0; j < nbc; ++j)
212  {
213  top[i][j] = cnz;
214  while(cnz < nz && pairarray[cnz].first == ((i*nbc)+j) ) // as long as we're in that block
215  {
216  bot[cnz] = (pairarray[cnz].second).first;
217  num[cnz++] = csc.num[(pairarray[cnz].second).second];
218  }
219  }
220  top[i][nbc] = cnz;
221  }
222 
223  delete [] pairarray;
224 }
225 
226 template <typename T, typename ITYPE>
227 void Sym<T, ITYPE>::BMult(ITYPE * btop, ITYPE bstart, ITYPE bend, const T * x, T * y, ITYPE ysize) const
228 {
229  if((btop[bend] - btop[bstart] < BREAKEVEN * ysize) || (bend-bstart == 1))
230  {
231  // not enough nonzeros to amortize new array formation
232  SubSpMV(btop, bstart, bend, x, y);
233  return;
234  }
235  ITYPE nnb = (bend+bstart)/2;
236 
237  cilk_spawn BMult(btop, bstart, nnb, x, y, ysize);
238  if(SYNCHED)
239  {
240  BMult(btop, nnb, bend, x, y, ysize);
241  }
242  else
243  {
244  T * temp = new T[ysize];
245  for(ITYPE i=0; i<ysize; ++i)
246  temp[i] = 0.0;
247 
248  BMult(btop, nnb, bend, x, temp, ysize);
249  cilk_sync;
250 
251  for(ITYPE i=0; i<ysize; ++i)
252  y[i] += temp[i];
253 
254  delete [] temp;
255  }
256 }
257 
258 // double* restrict a; --> No aliases for a[0], a[1], ...
259 // bstart: block start index
260 // bend: block end index
261 template <class T, class ITYPE>
262 inline void Sym<T, ITYPE>::SubSpMV(ITYPE * __restrict btop, ITYPE bstart, ITYPE bend,
263  const T * __restrict x, T * __restrict suby) const
264 {
265  for (ITYPE j = bstart ; j < bend ; ++j) // for all blocks inside that block row
266  {
267  // get higher order bits for column indices
268  ITYPE chi = ((j << collowbits) & highcolmask);
269  const T * subx = &x[chi];
270 
271 #ifdef AMDPREOPT
272  //BlockPrefetch(&bot[btop[j]], btop[j+1]-btop[j], sizeof(ITYPE));
273  //BlockPrefetch(&num[btop[j]], btop[j+1]-btop[j], sizeof(T));
274  // optimization: block prefetch
275 
276  ITYPE blocknz = btop[j+1] - btop[j];
277  if(blocknz > 8 && blocknz < 4096)
278  {
279  // number of elements in one cache line
280  const int botinc = CLSIZE/sizeof(ITYPE);
281  const int numinc = CLSIZE/sizeof(T);
282 
283  for (ITYPE i=0; i < blocknz; i+=botinc) // prefetch once for every cache line
284  {
285  __builtin_prefetch (&bot[btop[j]+i], 0, 0); // prefetch read-only, with no temporal locality
286  }
287  for (ITYPE i=0; i < blocknz; i+=numinc)
288  {
289  __builtin_prefetch (&num[btop[j]+i], 0, 0);
290  }
291  }
292 #endif
293 
294 #ifdef SSEOPT
295  __m128i m1 = _mm_set_epi32 (mu1, mu1, mu1, mu1);
296  __m128i m2 = _mm_set_epi64x(mlong1, mlong2); // pack two __int64 integers (for x86_64)
297 #endif
298 
299 #ifdef PERFDEB
300  ofstream stat("Perfdeb.txt", ios::app);
301  stat << btop[j+1] - btop[j] << endl;
302 #endif
303 
304  for (ITYPE k = btop[j] ; k < btop[j+1] ; ++k) // for all nonzeros within ith block (expected =~ nnz/n = c)
305  {
306  ITYPE rli = ((bot[k] >> collowbits) & lowrowmask);
307  ITYPE cli = (bot[k] & lowcolmask);
308 
309  suby [rli] += num[k] * subx [cli] ;
310  }
311  }
312 }
313 
314 // This is a static member function does not have a "this" pointer
315 template <class T, class ITYPE>
316 void Sym<T, ITYPE>::BlockPrefetch(void * addr, int total, int ssize)
317 {
318  p_fetch = 0;
319  int * __restrict a = (int*) addr;
320  const int inc = CLSIZE/ssize; // number of elements in one cache line
321 
322  // Grab every 64th address
323  for(int i=0; i<total; i+=inc)
324  {
325  p_fetch += a[i];
326  }
327 }
328 
329 
330 
331 template <class T, class ITYPE>
333 {
334  // when we jump to the next block in the same block-column, we move leaddim positions inside "top" array
335  // leadim ~= sqrt(n) => number of blocks in each block-row
336  ITYPE leaddim = lowcolmask+1;
337  Sym symT(nz, m, n); // create empty transposed object
338 
339  ITYPE k = 0;
340  ITYPE cnz = 0;
341 
342  for(ITYPE j = 0; j < leaddim; ++j) // scan columns of top-level structure (~sqrt(n) iterations)
343  {
344  for(ITYPE i = j; i < ntop ; i += leaddim) // iterates ~ sqrt(m) times within the block column
345  {
346  symT.top[k++] = cnz;
347  cnz += top[i+1]-top[i];
348  }
349  }
350  symT.top[k] = cnz;
351 
352  // Embarrassingly parallel sort of indices to get new bottom array
353  // ITYPE nindex = (highmask & csc.ir [i]) | ((highmask & bot) >> 4);
354 }
355 
unsigned int nextpoweroftwo(unsigned int v)
Definition: utility.h:401
Definition: sym.h:14
Sym< T, ITYPE > & operator=(const Sym< T, ITYPE > &rhs)
Definition: sym.cpp:94
#define SYNCHED
Definition: utility.h:21
#define BREAKEVEN
Definition: utility.h:136
~Sym()
Definition: sym.cpp:154
#define CLSIZE
Definition: utility.h:133
void Transpose()
Definition: sym.cpp:332
unsigned int highestbitset(unsigned __int64 v)
Definition: utility.h:423
Definition: csc.h:15
Sym()
Definition: sym.h:17