Compressed Sparse Blocks  1.2
SSEspmv.h
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <mmintrin.h>  // MMX
#include <xmmintrin.h> // SSE
#include <emmintrin.h> // SSE2
#include <pmmintrin.h> // SSE3

#ifndef AMD
 #include <tmmintrin.h> // SSSE3
 #include <smmintrin.h> // SSE4.1
 #include <nmmintrin.h> // SSE4.2
 #include <wmmintrin.h> // AES / PCLMUL
#else
 #include <ammintrin.h> // SSE4A
 #include <x86intrin.h> // everything else, including POPCNT
#endif

#include "timer.clock_gettime.c"

//---------------------------------------
// Type Definitions
//---------------------------------------

typedef signed char        ssp_s8;
typedef unsigned char      ssp_u8;

typedef signed short       ssp_s16;
typedef unsigned short     ssp_u16;

typedef signed int         ssp_s32;
typedef unsigned int       ssp_u32;

typedef float              ssp_f32;
typedef double             ssp_f64;

typedef signed long long   ssp_s64;
typedef unsigned long long ssp_u64;

typedef union
{
    __m128   f;
    __m128d  d;
    __m128i  i;
    __m64    m64[ 2];
    ssp_u64  u64[ 2];
    ssp_s64  s64[ 2];
    ssp_f64  f64[ 2];
    ssp_u32  u32[ 4];
    ssp_s32  s32[ 4];
    ssp_f32  f32[ 4];
    ssp_u16  u16[ 8];
    ssp_s16  s16[ 8];
    ssp_u8   u8 [16];
    ssp_s8   s8 [16];
} ssp_m128;
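
// Illustration only (not part of the original header): the union above gives one
// 128-bit register both an integer and a floating-point view, so bit-level tests
// can be mixed with double-precision data without intrinsic casts. A minimal
// sketch of that pattern (the helper name is hypothetical):
inline int ssp_count_negative_pd(__m128d v)
{
    ssp_m128 t;
    t.d = v;                                  // write through the double view
    return (t.s64[0] < 0) + (t.s64[1] < 0);   // read the sign bits through the integer view
}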

inline __m128d ssp_blendv_pd_SSE2( __m128d a, __m128d b, __m128d mask )
{
  ssp_m128 A, B, Mask;
  A.d = a;
  B.d = b;
  Mask.d = mask;

  // _MM_SHUFFLE(z,y,x,w) does not itself select anything; it only builds the
  // shuffle immediate (z<<6) | (y<<4) | (x<<2) | w used below. Replicating the
  // high dword of each mask lane and then arithmetically shifting right by 31
  // broadcasts the sign bit of each double across its full 64-bit lane.
  Mask.i = _mm_shuffle_epi32( Mask.i, _MM_SHUFFLE(3, 3, 1, 1) );
  Mask.i = _mm_srai_epi32 ( Mask.i, 31 );

  B.i = _mm_and_si128( B.i, Mask.i );
  A.i = _mm_andnot_si128( Mask.i, A.i );
  A.i = _mm_or_si128( A.i, B.i );
  return A.d;
}


#ifdef AMD
 #define _mm_blendv_pd ssp_blendv_pd_SSE2
#endif
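
// Example (illustration only, not part of the original header): _mm_blendv_pd
// copies a lane from 'b' wherever the sign bit of the corresponding 'mask' lane
// is set, and from 'a' otherwise; the SSE2 fallback above reproduces exactly
// that behaviour, which is why it can stand in for the intrinsic on AMD.
// A minimal check, with a hypothetical helper name:
inline void ssp_blendv_pd_SSE2_example()
{
    __m128d a    = _mm_set_pd(1.0, 2.0);   // lanes: high = 1.0, low = 2.0
    __m128d b    = _mm_set_pd(3.0, 4.0);   // lanes: high = 3.0, low = 4.0
    __m128d mask = _mm_set_pd(-0.0, 0.0);  // sign bit set in the high lane only
    double out[2];
    _mm_storeu_pd(out, ssp_blendv_pd_SSE2(a, b, mask));
    printf("low = %g (from a), high = %g (from b)\n", out[0], out[1]); // expects 2 and 3
}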

template <typename IT>
void SSEspmv(const double *V, const uint64_t *M, const IT *bot, const IT nrb, const double *X, double *Y){
  const double *_V = V-1;
  __m128d Y01QW = _mm_setzero_pd();
  __m128d Y23QW = _mm_setzero_pd();
  __m128d Y45QW = _mm_setzero_pd();
  __m128d Y67QW = _mm_setzero_pd();
  // use popcnt to index into nonzero stream
  // use blendv where 1 = zero
  for(IT i=0; i<nrb; ++i){
    const uint64_t Ci = bot[i];   // column offset of this 8x8 block
    const uint64_t Zi = ~M[i];    // a 1 denotes a zero

#ifdef AMD
    const uint64_t Zil = Zi << 1;
    __m128i Z01QW = _mm_unpacklo_epi64 (_mm_loadl_epi64((__m128i*)&Zi), _mm_loadl_epi64((__m128i*)&Zil));
#else
    __m128i Z01QW = _mm_insert_epi64(_mm_loadl_epi64((__m128i*)&Zi),Zi<<1,1); // lane 0 = Zi, lane 1 = Zi<<1
#endif
    __m128i Z23QW = _mm_slli_epi64(Z01QW, 2);
    __m128i Z45QW = _mm_slli_epi64(Z01QW, 4);
    __m128i Z67QW = _mm_slli_epi64(Z01QW, 6);
    //--------------------------------------------------------------------------
    const __m128d X00QW = _mm_loaddup_pd(&X[0+Ci]);
    Y01QW = _mm_add_pd(_mm_mul_pd(X00QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((float*)&(_V[__builtin_popcountll(M[i]&0x8000000000000000)])),_mm_setzero_pd(),(__m128d)Z01QW)),Y01QW);Z01QW=_mm_slli_epi64(Z01QW,8);
    Y23QW = _mm_add_pd(_mm_mul_pd(X00QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((float*)&(_V[__builtin_popcountll(M[i]&0xE000000000000000)])),_mm_setzero_pd(),(__m128d)Z23QW)),Y23QW);Z23QW=_mm_slli_epi64(Z23QW,8);
    Y45QW = _mm_add_pd(_mm_mul_pd(X00QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((float*)&(_V[__builtin_popcountll(M[i]&0xF800000000000000)])),_mm_setzero_pd(),(__m128d)Z45QW)),Y45QW);Z45QW=_mm_slli_epi64(Z45QW,8);
    Y67QW = _mm_add_pd(_mm_mul_pd(X00QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((float*)&(_V[__builtin_popcountll(M[i]&0xFE00000000000000)])),_mm_setzero_pd(),(__m128d)Z67QW)),Y67QW);Z67QW=_mm_slli_epi64(Z67QW,8);
    //--------------------------------------------------------------------------
    const __m128d X11QW = _mm_loaddup_pd(&X[1+Ci]);
    Y01QW = _mm_add_pd(_mm_mul_pd(X11QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((float*)&(_V[__builtin_popcountll(M[i]&0xFF80000000000000)])),_mm_setzero_pd(),(__m128d)Z01QW)),Y01QW);Z01QW=_mm_slli_epi64(Z01QW,8);
    Y23QW = _mm_add_pd(_mm_mul_pd(X11QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((float*)&(_V[__builtin_popcountll(M[i]&0xFFE0000000000000)])),_mm_setzero_pd(),(__m128d)Z23QW)),Y23QW);Z23QW=_mm_slli_epi64(Z23QW,8);
    Y45QW = _mm_add_pd(_mm_mul_pd(X11QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((float*)&(_V[__builtin_popcountll(M[i]&0xFFF8000000000000)])),_mm_setzero_pd(),(__m128d)Z45QW)),Y45QW);Z45QW=_mm_slli_epi64(Z45QW,8);
    Y67QW = _mm_add_pd(_mm_mul_pd(X11QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((float*)&(_V[__builtin_popcountll(M[i]&0xFFFE000000000000)])),_mm_setzero_pd(),(__m128d)Z67QW)),Y67QW);Z67QW=_mm_slli_epi64(Z67QW,8);
    //--------------------------------------------------------------------------
    const __m128d X22QW = _mm_loaddup_pd(&X[2+Ci]);
    Y01QW = _mm_add_pd(_mm_mul_pd(X22QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((float*)&(_V[__builtin_popcountll(M[i]&0xFFFF800000000000)])),_mm_setzero_pd(),(__m128d)Z01QW)),Y01QW);Z01QW=_mm_slli_epi64(Z01QW,8);
    Y23QW = _mm_add_pd(_mm_mul_pd(X22QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((float*)&(_V[__builtin_popcountll(M[i]&0xFFFFE00000000000)])),_mm_setzero_pd(),(__m128d)Z23QW)),Y23QW);Z23QW=_mm_slli_epi64(Z23QW,8);
    Y45QW = _mm_add_pd(_mm_mul_pd(X22QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((float*)&(_V[__builtin_popcountll(M[i]&0xFFFFF80000000000)])),_mm_setzero_pd(),(__m128d)Z45QW)),Y45QW);Z45QW=_mm_slli_epi64(Z45QW,8);
    Y67QW = _mm_add_pd(_mm_mul_pd(X22QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((float*)&(_V[__builtin_popcountll(M[i]&0xFFFFFE0000000000)])),_mm_setzero_pd(),(__m128d)Z67QW)),Y67QW);Z67QW=_mm_slli_epi64(Z67QW,8);
    //--------------------------------------------------------------------------
    const __m128d X33QW = _mm_loaddup_pd(&X[3+Ci]);
    Y01QW = _mm_add_pd(_mm_mul_pd(X33QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((float*)&(_V[__builtin_popcountll(M[i]&0xFFFFFF8000000000)])),_mm_setzero_pd(),(__m128d)Z01QW)),Y01QW);Z01QW=_mm_slli_epi64(Z01QW,8);
    Y23QW = _mm_add_pd(_mm_mul_pd(X33QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((float*)&(_V[__builtin_popcountll(M[i]&0xFFFFFFE000000000)])),_mm_setzero_pd(),(__m128d)Z23QW)),Y23QW);Z23QW=_mm_slli_epi64(Z23QW,8);
    Y45QW = _mm_add_pd(_mm_mul_pd(X33QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((float*)&(_V[__builtin_popcountll(M[i]&0xFFFFFFF800000000)])),_mm_setzero_pd(),(__m128d)Z45QW)),Y45QW);Z45QW=_mm_slli_epi64(Z45QW,8);
    Y67QW = _mm_add_pd(_mm_mul_pd(X33QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((float*)&(_V[__builtin_popcountll(M[i]&0xFFFFFFFE00000000)])),_mm_setzero_pd(),(__m128d)Z67QW)),Y67QW);Z67QW=_mm_slli_epi64(Z67QW,8);
    //--------------------------------------------------------------------------
    const __m128d X44QW = _mm_loaddup_pd(&X[4+Ci]);
    Y01QW = _mm_add_pd(_mm_mul_pd(X44QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((float*)&(_V[__builtin_popcountll(M[i]&0xFFFFFFFF80000000)])),_mm_setzero_pd(),(__m128d)Z01QW)),Y01QW);Z01QW=_mm_slli_epi64(Z01QW,8);
    Y23QW = _mm_add_pd(_mm_mul_pd(X44QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((float*)&(_V[__builtin_popcountll(M[i]&0xFFFFFFFFE0000000)])),_mm_setzero_pd(),(__m128d)Z23QW)),Y23QW);Z23QW=_mm_slli_epi64(Z23QW,8);
    Y45QW = _mm_add_pd(_mm_mul_pd(X44QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((float*)&(_V[__builtin_popcountll(M[i]&0xFFFFFFFFF8000000)])),_mm_setzero_pd(),(__m128d)Z45QW)),Y45QW);Z45QW=_mm_slli_epi64(Z45QW,8);
    Y67QW = _mm_add_pd(_mm_mul_pd(X44QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((float*)&(_V[__builtin_popcountll(M[i]&0xFFFFFFFFFE000000)])),_mm_setzero_pd(),(__m128d)Z67QW)),Y67QW);Z67QW=_mm_slli_epi64(Z67QW,8);
    //--------------------------------------------------------------------------
    const __m128d X55QW = _mm_loaddup_pd(&X[5+Ci]);
    Y01QW = _mm_add_pd(_mm_mul_pd(X55QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((float*)&(_V[__builtin_popcountll(M[i]&0xFFFFFFFFFF800000)])),_mm_setzero_pd(),(__m128d)Z01QW)),Y01QW);Z01QW=_mm_slli_epi64(Z01QW,8);
    Y23QW = _mm_add_pd(_mm_mul_pd(X55QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((float*)&(_V[__builtin_popcountll(M[i]&0xFFFFFFFFFFE00000)])),_mm_setzero_pd(),(__m128d)Z23QW)),Y23QW);Z23QW=_mm_slli_epi64(Z23QW,8);
    Y45QW = _mm_add_pd(_mm_mul_pd(X55QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((float*)&(_V[__builtin_popcountll(M[i]&0xFFFFFFFFFFF80000)])),_mm_setzero_pd(),(__m128d)Z45QW)),Y45QW);Z45QW=_mm_slli_epi64(Z45QW,8);
    Y67QW = _mm_add_pd(_mm_mul_pd(X55QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((float*)&(_V[__builtin_popcountll(M[i]&0xFFFFFFFFFFFE0000)])),_mm_setzero_pd(),(__m128d)Z67QW)),Y67QW);Z67QW=_mm_slli_epi64(Z67QW,8);
    //--------------------------------------------------------------------------
    const __m128d X66QW = _mm_loaddup_pd(&X[6+Ci]);
    Y01QW = _mm_add_pd(_mm_mul_pd(X66QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((float*)&(_V[__builtin_popcountll(M[i]&0xFFFFFFFFFFFF8000)])),_mm_setzero_pd(),(__m128d)Z01QW)),Y01QW);Z01QW=_mm_slli_epi64(Z01QW,8);
    Y23QW = _mm_add_pd(_mm_mul_pd(X66QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((float*)&(_V[__builtin_popcountll(M[i]&0xFFFFFFFFFFFFE000)])),_mm_setzero_pd(),(__m128d)Z23QW)),Y23QW);Z23QW=_mm_slli_epi64(Z23QW,8);
    Y45QW = _mm_add_pd(_mm_mul_pd(X66QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((float*)&(_V[__builtin_popcountll(M[i]&0xFFFFFFFFFFFFF800)])),_mm_setzero_pd(),(__m128d)Z45QW)),Y45QW);Z45QW=_mm_slli_epi64(Z45QW,8);
    Y67QW = _mm_add_pd(_mm_mul_pd(X66QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((float*)&(_V[__builtin_popcountll(M[i]&0xFFFFFFFFFFFFFE00)])),_mm_setzero_pd(),(__m128d)Z67QW)),Y67QW);Z67QW=_mm_slli_epi64(Z67QW,8);
    //--------------------------------------------------------------------------
    const __m128d X77QW = _mm_loaddup_pd(&X[7+Ci]);
    Y01QW = _mm_add_pd(_mm_mul_pd(X77QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((float*)&(_V[__builtin_popcountll(M[i]&0xFFFFFFFFFFFFFF80)])),_mm_setzero_pd(),(__m128d)Z01QW)),Y01QW);
    Y23QW = _mm_add_pd(_mm_mul_pd(X77QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((float*)&(_V[__builtin_popcountll(M[i]&0xFFFFFFFFFFFFFFE0)])),_mm_setzero_pd(),(__m128d)Z23QW)),Y23QW);
    Y45QW = _mm_add_pd(_mm_mul_pd(X77QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((float*)&(_V[__builtin_popcountll(M[i]&0xFFFFFFFFFFFFFFF8)])),_mm_setzero_pd(),(__m128d)Z45QW)),Y45QW);
    Y67QW = _mm_add_pd(_mm_mul_pd(X77QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((float*)&(_V[__builtin_popcountll(M[i]&0xFFFFFFFFFFFFFFFE)])),_mm_setzero_pd(),(__m128d)Z67QW)),Y67QW);
    //--------------------------------------------------------------------------
    _V += __builtin_popcountll(M[i]);   // advance the value pointer past this block's nonzeros
    //--------------------------------------------------------------------------
  }
  _mm_store_pd(&Y[0],Y01QW);
  _mm_store_pd(&Y[2],Y23QW);
  _mm_store_pd(&Y[4],Y45QW);
  _mm_store_pd(&Y[6],Y67QW);
}
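
// Reference sketch (not part of the original header): a plain scalar version of
// the same computation, handy for checking the vector kernel above. It assumes
// the bit layout implied by the masks in SSEspmv: bit (63 - (8*c + r)) of M[i]
// flags element (row r, column c) of the i-th 8x8 block, and V stores the
// nonzeros in that same order. The function name is hypothetical.
template <typename IT>
void SSEspmv_scalar_reference(const double *V, const uint64_t *M, const IT *bot,
                              const IT nrb, const double *X, double *Y)
{
    for (int r = 0; r < 8; ++r)
        Y[r] = 0.0;
    const double *v = V;                       // next unconsumed nonzero value
    for (IT i = 0; i < nrb; ++i) {
        const uint64_t Ci = bot[i];            // column offset of this block
        for (int c = 0; c < 8; ++c)            // columns, most significant bits first
            for (int r = 0; r < 8; ++r)        // rows within the column
                if (M[i] & (0x8000000000000000ULL >> (8*c + r)))
                    Y[r] += (*v++) * X[Ci + c];
    }
}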

int main (){
  timer_init();
  uint64_t i,trial,trials=1000;

  uint64_t N = 10000;
  double   *V = (  double*)malloc(N*64*sizeof(  double));for(i=0;i<N*64;i++)V[i]=1.0*(i+1);
  uint64_t *M = (uint64_t*)malloc(N* 1*sizeof(uint64_t));for(i=0;i<N* 1;i++)M[i]=0x0000000000000000;
  uint32_t *C = (uint32_t*)malloc(N* 1*sizeof(uint32_t));for(i=0;i<N* 1;i++)C[i]=0;
  uint32_t *P = (uint32_t*)malloc(N* 1*sizeof(uint32_t));for(i=0;i<N* 1;i++)P[i]=0; // not used by SSEspmv

  double *X = (double*)malloc(1024*sizeof(double));for(i=0;i<1024;i++)X[i]=1.0;
  double *Y = (double*)malloc(1024*sizeof(double));for(i=0;i<1024;i++)Y[i]=1.0;

  double t0 = timer_seconds_since_init();
  for(trial=0;trial<trials;trial++){
    SSEspmv<uint32_t>(V, M, C, (uint32_t)N, X, Y);
    double *temp=Y;Y=X;X=temp;
  }
  double t1 = timer_seconds_since_init();

  printf("%0.9f seconds\n",t1-t0);
  printf("%4.3f GFlop/s\n",128*N*trials/(t1-t0)/1e9);
  for(i=0;i<8;i++)printf("%6.3f ",X[i]);printf("\n");
  for(i=0;i<8;i++)printf("%6.3f ",Y[i]);printf("\n");
  return 0;
}