4 #define __int64 long long
13 #include <xmmintrin.h>
14 #include <emmintrin.h>
15 #include <pmmintrin.h>
19 #include <cilk/cilk_api.h>
20 #include <cilk/cilk.h>
21 #define SYNCHED __cilkrts_synched()
22 #define DETECT __cilkscreen_enable_checking()
23 #define ENDDETECT __cilkscreen_disable_checking()
24 #define WORKERS __cilkrts_get_nworkers()
44 CILK_EXPORT __CILKRTS_NOTHROW
52 #define __cilkrts_synched() (1)
56 #include <cilk/reducer_opadd.h>
57 cilk::reducer_opadd<__int64> blockparcalls;
58 cilk::reducer_opadd<__int64> subspmvcalls;
59 cilk::reducer_opadd<__int64> atomicflops;
73 const uint64_t
masktable64[64] = {0x8000000000000000, 0x4000000000000000, 0x2000000000000000, 0x1000000000000000,
74 0x0800000000000000, 0x0400000000000000, 0x0200000000000000, 0x0100000000000000,
75 0x0080000000000000, 0x0040000000000000, 0x0020000000000000, 0x0010000000000000,
76 0x0008000000000000, 0x0004000000000000, 0x0002000000000000, 0x0001000000000000,
77 0x0000800000000000, 0x0000400000000000, 0x0000200000000000, 0x0000100000000000,
78 0x0000080000000000, 0x0000040000000000, 0x0000020000000000, 0x0000010000000000,
79 0x0000008000000000, 0x0000004000000000, 0x0000002000000000, 0x0000001000000000,
80 0x0000000800000000, 0x0000000400000000, 0x0000000200000000, 0x0000000100000000,
81 0x0000000080000000, 0x0000000040000000, 0x0000000020000000, 0x0000000010000000,
82 0x0000000008000000, 0x0000000004000000, 0x0000000002000000, 0x0000000001000000,
83 0x0000000000800000, 0x0000000000400000, 0x0000000000200000, 0x0000000000100000,
84 0x0000000000080000, 0x0000000000040000, 0x0000000000020000, 0x0000000000010000,
85 0x0000000000008000, 0x0000000000004000, 0x0000000000002000, 0x0000000000001000,
86 0x0000000000000800, 0x0000000000000400, 0x0000000000000200, 0x0000000000000100,
87 0x0000000000000080, 0x0000000000000040, 0x0000000000000020, 0x0000000000000010,
88 0x0000000000000008, 0x0000000000000004, 0x0000000000000002, 0x0000000000000001 };
91 const unsigned short masktable16[16] = {0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100,
92 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001 };
95 const unsigned char masktable4[4] = { 0x08, 0x04, 0x02, 0x01 };
98 template <
typename MTYPE>
129 #define RBSIZE (RBDIM*RBDIM) // size of a register block (8x8 in this case)
132 #define L2SIZE (256*KBYTE / RHSDIM) // less than half of the L2 Cache (L2 should hold x & y at the same time) - scaled back by RHSDIM
133 #define CLSIZE 64 // cache line size
136 #define BREAKEVEN 4 // A block (or subblock) with less than (BREAKEVEN * dimension) nonzeros won't be parallelized
137 #define MINNNZTOPAR 128 // A block (or subblock) with less than MINNNZTOPAR nonzeros won't be parallelized
138 #define BREAKNRB (8/RBDIM) // register blocked version of BREAKEVEN
139 #define MINNRBTOPAR (256/RBDIM) // register blocked version of MINNNZPAR
143 #define EPSILON 0.0001
147 #define absdiff(x,y) ( (x) > (y) ? (x-y) : (y-x))
150 unsigned rmasks[32] = { 0x00000001, 0x00000002, 0x00000004, 0x00000008,
151 0x00000010, 0x00000020, 0x00000040, 0x00000080,
152 0x00000100, 0x00000200, 0x00000400, 0x00000800,
153 0x00001000, 0x00002000, 0x00004000, 0x00008000,
154 0x00010000, 0x00020000, 0x00040000, 0x00080000,
155 0x00100000, 0x00200000, 0x00400000, 0x00800000,
156 0x01000000, 0x02000000, 0x04000000, 0x08000000,
157 0x10000000, 0x20000000, 0x40000000, 0x80000000 };
160 void popcountall(
const uint64_t * __restrict M,
unsigned * __restrict count,
size_t size);
161 void popcountall(
const unsigned short * __restrict M,
unsigned * __restrict count,
size_t size);
162 void popcountall(
const unsigned char * __restrict M,
unsigned * __restrict count,
size_t size);
165 template <
typename T>
169 outfile.open(
"hist.csv");
170 vector<T> hist(bins);
171 for(
size_t i=0; i< size; ++i)
174 outfile <<
"Fill_ratio" <<
"," <<
"count" << endl;
175 for(
size_t i=0; i< bins; ++i)
177 outfile << static_cast<float>(i) / bins <<
"," << hist[i] <<
"\n";
190 template <
typename MTYPE>
191 unsigned prescan(
unsigned * a, MTYPE *
const M,
int n)
193 unsigned * end = a+n;
195 MTYPE * __restrict _M = M;
200 unsigned _n =
rmasks[lgn];
203 unsigned share = _n/numthreads;
204 cilk_for(
int t=0; t < numthreads; ++t)
206 popcountall(_M+t*share, _a+t*share, ((t+1)==numthreads)?(_n-t*share):share);
208 thdatas[t].
beg = _a + t*share;
209 thdatas[t].
end = _a + (((t+1)==numthreads)?_n:((t+1)*share));
210 thdatas[t].
sum = accumulate(thdatas[t].beg, thdatas[t].end, thdatas[t].sum);
212 for(
int t=0; t<numthreads; ++t)
214 unsigned temp = thdatas[t].
sum;
215 thdatas[t].
sum = sum;
218 cilk_for(
int tt=0; tt<numthreads; ++tt)
220 unsigned * beg = thdatas[tt].
beg;
221 unsigned * end = thdatas[tt].
end;
222 unsigned locsum = thdatas[tt].
sum;
226 unsigned temp = *beg;
249 unsigned char *ret_ptr = (
unsigned char *)malloc( size + 16 );
250 int temp = (
unsigned long)ret_ptr & 0xF;
251 int shift = 16 - temp;
253 ret_ptr[ -1 ] = shift;
264 template <
typename ITYPE>
269 for (ITYPE i = 0 ; i < size ; ++i)
279 template <
typename T>
284 machEps /=
static_cast<T
>(2.0);
288 while ((T)(
static_cast<T
>(1.0) + (machEps/
static_cast<T
>(2.0))) != 1.0);
294 template<
typename _ForwardIter,
typename T>
295 void iota(_ForwardIter __first, _ForwardIter __last, T __value)
297 while (__first != __last)
298 *__first++ = __value++;
301 template<
typename T,
typename I>
304 T ** array =
new T*[m];
305 for(I i = 0; i<m; ++i)
306 array[i] =
new T[n]();
310 template<
typename T,
typename I>
313 for(I i = 0; i<m; ++i)
319 template <
typename T >
322 T operator () ( T
const &arg1, T
const &arg2 )
const
325 return abs( arg1 - arg2 );
332 void MultAdd(
double & a,
const double & b,
const double & c)
334 for(
int i=0; i<D; i++)
343 template <
typename ITYPE>
347 int ite =
sizeof(z) * CHAR_BIT / 2;
349 for (
int i = 0; i < ite; ++i)
352 z |= (x & (1 << i)) << i | (y & (1 << i)) << (i + 1);
358 template <
typename ITYPE,
typename OTYPE>
362 int ite =
sizeof(x) * CHAR_BIT;
364 for (
int i = 0; i < ite; ++i)
367 z |= (x & (1 << i)) << i | (y & (1 << i)) << (i + 1);
372 template <
unsigned BASE>
378 while ( i <= exponent )
395 template <
typename T>
398 return ( (x>0) && ((x & (x-1)) == 0));
406 unsigned int n = v-1;
426 const unsigned __int64 b[] = {0x2ULL, 0xCULL, 0xF0ULL, 0xFF00ULL, 0xFFFF0000ULL, 0xFFFFFFFF00000000ULL};
427 const unsigned int S[] = {1, 2, 4, 8, 16, 32};
431 for (i = 5; i >= 0; i--)
446 cerr <<
"Indices can not be negative, aborting..." << endl;
453 return static_cast< __int64 > (ur);
463 const unsigned int b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000};
464 const unsigned int S[] = {1, 2, 4, 8, 16};
468 for (i = 4; i >= 0; i--)
483 cerr <<
"Indices can not be negative, aborting..." << endl;
488 unsigned int uv =
static_cast< unsigned int> (v);
490 return static_cast< int > (ur);
496 inline unsigned int getModulo(
unsigned int n,
unsigned int d)
498 return ( n & (d-1) );
const unsigned short masktable16[16]
void printhistogram(const T *scansum, size_t size, unsigned bins)
unsigned int getModulo(unsigned int n, unsigned int d)
unsigned int getDivident(unsigned int n, unsigned int d)
unsigned int nextpoweroftwo(unsigned int v)
const unsigned char masktable4[4]
const uint64_t masktable64[64]
void MultAdd(double &a, const double &b, const double &c)
T ** allocate2D(I m, I n)
unsigned char GetMaskTable< unsigned char >(unsigned int index)
uint64_t GetMaskTable< uint64_t >(unsigned int index)
unsigned prescan(unsigned *a, MTYPE *const M, int n)
unsigned short GetMaskTable< unsigned short >(unsigned int index)
OTYPE BitInterleave(ITYPE x, ITYPE y)
ITYPE BitInterleaveLow(ITYPE x, ITYPE y)
unsigned char * aligned_malloc(uint64_t size)
unsigned IntPower< 2 >(unsigned exponent)
void iota(_ForwardIter __first, _ForwardIter __last, T __value)
ITYPE CumulativeSum(ITYPE *arr, ITYPE size)
void aligned_free(unsigned char *ptr)
unsigned int highestbitset(unsigned __int64 v)
CILK_EXPORT __CILKRTS_NOTHROW int __cilkrts_synched(void)
unsigned IntPower(unsigned exponent)
void deallocate2D(T **array, I m)
MTYPE GetMaskTable(unsigned int index)
void popcountall(const uint64_t *__restrict M, unsigned *__restrict count, size_t size)