10 #include <tmmintrin.h>
11 #include <smmintrin.h>
12 #include <nmmintrin.h>
13 #include <wmmintrin.h>
15 #include <ammintrin.h>
16 #include <x86intrin.h>
19 #include "timer.clock_gettime.c"
71 Mask.
i = _mm_shuffle_epi32( Mask.
i, _MM_SHUFFLE(3, 3, 1, 1) );
72 Mask.
i = _mm_srai_epi32 ( Mask.
i, 31 );
74 B.
i = _mm_and_si128( B.
i, Mask.
i );
75 A.
i = _mm_andnot_si128( Mask.
i, A.
i );
76 A.
i = _mm_or_si128( A.
i, B.
i );
82 #define _mm_blendv_pd ssp_blendv_pd_SSE2
91 template <
typename IT>
92 void SSEspmv(
const double *V,
const uint64_t *M,
const IT *bot,
const IT nrb,
const double *X,
double *Y){
93 const double *_V = V-1;
94 __m128d Y01QW = _mm_setzero_pd();
95 __m128d Y23QW = _mm_setzero_pd();
96 __m128d Y45QW = _mm_setzero_pd();
97 __m128d Y67QW = _mm_setzero_pd();
100 for(IT i=0;i<nbr;++i){
101 const uint64_t Ci = C[i];
102 const uint64_t Zi = ~M[i];
105 const uint64_t Zil = Zi << 1;
106 __m128i Z01QW = _mm_unpacklo_epi64 (_mm_loadl_epi64((__m128i*)&Zi), _mm_loadl_epi64((__m128i*)&Zil));
108 __m128i Z01QW = _mm_insert_epi64(_mm_loadl_epi64((__m128i*)&Zi),Zi<<1,1);
110 __m128i Z23QW = _mm_slli_epi64(Z01QW, 2);
111 __m128i Z45QW = _mm_slli_epi64(Z01QW, 4);
112 __m128i Z67QW = _mm_slli_epi64(Z01QW, 6);
114 const __m128d X00QW = _mm_loaddup_pd(&X[0+Ci]);
115 Y01QW = _mm_add_pd(_mm_mul_pd(X00QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[i]&0x8000000000000000)])),_mm_setzero_pd(),(__m128d)Z01QW)),Y01QW);Z01QW=_mm_slli_epi64(Z01QW,8);
116 Y23QW = _mm_add_pd(_mm_mul_pd(X00QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[i]&0xE000000000000000)])),_mm_setzero_pd(),(__m128d)Z23QW)),Y23QW);Z23QW=_mm_slli_epi64(Z23QW,8);
117 Y45QW = _mm_add_pd(_mm_mul_pd(X00QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[i]&0xF800000000000000)])),_mm_setzero_pd(),(__m128d)Z45QW)),Y45QW);Z45QW=_mm_slli_epi64(Z45QW,8);
118 Y67QW = _mm_add_pd(_mm_mul_pd(X00QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[i]&0xFE00000000000000)])),_mm_setzero_pd(),(__m128d)Z67QW)),Y67QW);Z67QW=_mm_slli_epi64(Z67QW,8);
120 const __m128d X11QW = _mm_loaddup_pd(&X[1+Ci]);
121 Y01QW = _mm_add_pd(_mm_mul_pd(X11QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[i]&0xFF80000000000000)])),_mm_setzero_pd(),(__m128d)Z01QW)),Y01QW);Z01QW=_mm_slli_epi64(Z01QW,8);
122 Y23QW = _mm_add_pd(_mm_mul_pd(X11QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[i]&0xFFE0000000000000)])),_mm_setzero_pd(),(__m128d)Z23QW)),Y23QW);Z23QW=_mm_slli_epi64(Z23QW,8);
123 Y45QW = _mm_add_pd(_mm_mul_pd(X11QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[i]&0xFFF8000000000000)])),_mm_setzero_pd(),(__m128d)Z45QW)),Y45QW);Z45QW=_mm_slli_epi64(Z45QW,8);
124 Y67QW = _mm_add_pd(_mm_mul_pd(X11QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[i]&0xFFFE000000000000)])),_mm_setzero_pd(),(__m128d)Z67QW)),Y67QW);Z67QW=_mm_slli_epi64(Z67QW,8);
126 const __m128d X22QW = _mm_loaddup_pd(&X[2+Ci]);
127 Y01QW = _mm_add_pd(_mm_mul_pd(X22QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[i]&0xFFFF800000000000)])),_mm_setzero_pd(),(__m128d)Z01QW)),Y01QW);Z01QW=_mm_slli_epi64(Z01QW,8);
128 Y23QW = _mm_add_pd(_mm_mul_pd(X22QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[i]&0xFFFFE00000000000)])),_mm_setzero_pd(),(__m128d)Z23QW)),Y23QW);Z23QW=_mm_slli_epi64(Z23QW,8);
129 Y45QW = _mm_add_pd(_mm_mul_pd(X22QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[i]&0xFFFFF80000000000)])),_mm_setzero_pd(),(__m128d)Z45QW)),Y45QW);Z45QW=_mm_slli_epi64(Z45QW,8);
130 Y67QW = _mm_add_pd(_mm_mul_pd(X22QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[i]&0xFFFFFE0000000000)])),_mm_setzero_pd(),(__m128d)Z67QW)),Y67QW);Z67QW=_mm_slli_epi64(Z67QW,8);
132 const __m128d X33QW = _mm_loaddup_pd(&X[3+Ci]);
133 Y01QW = _mm_add_pd(_mm_mul_pd(X33QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[i]&0xFFFFFF8000000000)])),_mm_setzero_pd(),(__m128d)Z01QW)),Y01QW);Z01QW=_mm_slli_epi64(Z01QW,8);
134 Y23QW = _mm_add_pd(_mm_mul_pd(X33QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[i]&0xFFFFFFE000000000)])),_mm_setzero_pd(),(__m128d)Z23QW)),Y23QW);Z23QW=_mm_slli_epi64(Z23QW,8);
135 Y45QW = _mm_add_pd(_mm_mul_pd(X33QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[i]&0xFFFFFFF800000000)])),_mm_setzero_pd(),(__m128d)Z45QW)),Y45QW);Z45QW=_mm_slli_epi64(Z45QW,8);
136 Y67QW = _mm_add_pd(_mm_mul_pd(X33QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[i]&0xFFFFFFFE00000000)])),_mm_setzero_pd(),(__m128d)Z67QW)),Y67QW);Z67QW=_mm_slli_epi64(Z67QW,8);
138 const __m128d X44QW = _mm_loaddup_pd(&X[4+Ci]);
139 Y01QW = _mm_add_pd(_mm_mul_pd(X44QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[i]&0xFFFFFFFF80000000)])),_mm_setzero_pd(),(__m128d)Z01QW)),Y01QW);Z01QW=_mm_slli_epi64(Z01QW,8);
140 Y23QW = _mm_add_pd(_mm_mul_pd(X44QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[i]&0xFFFFFFFFE0000000)])),_mm_setzero_pd(),(__m128d)Z23QW)),Y23QW);Z23QW=_mm_slli_epi64(Z23QW,8);
141 Y45QW = _mm_add_pd(_mm_mul_pd(X44QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[i]&0xFFFFFFFFF8000000)])),_mm_setzero_pd(),(__m128d)Z45QW)),Y45QW);Z45QW=_mm_slli_epi64(Z45QW,8);
142 Y67QW = _mm_add_pd(_mm_mul_pd(X44QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[i]&0xFFFFFFFFFE000000)])),_mm_setzero_pd(),(__m128d)Z67QW)),Y67QW);Z67QW=_mm_slli_epi64(Z67QW,8);
144 const __m128d X55QW = _mm_loaddup_pd(&X[5+Ci]);
145 Y01QW = _mm_add_pd(_mm_mul_pd(X55QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[i]&0xFFFFFFFFFF800000)])),_mm_setzero_pd(),(__m128d)Z01QW)),Y01QW);Z01QW=_mm_slli_epi64(Z01QW,8);
146 Y23QW = _mm_add_pd(_mm_mul_pd(X55QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[i]&0xFFFFFFFFFFE00000)])),_mm_setzero_pd(),(__m128d)Z23QW)),Y23QW);Z23QW=_mm_slli_epi64(Z23QW,8);
147 Y45QW = _mm_add_pd(_mm_mul_pd(X55QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[i]&0xFFFFFFFFFFF80000)])),_mm_setzero_pd(),(__m128d)Z45QW)),Y45QW);Z45QW=_mm_slli_epi64(Z45QW,8);
148 Y67QW = _mm_add_pd(_mm_mul_pd(X55QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[i]&0xFFFFFFFFFFFE0000)])),_mm_setzero_pd(),(__m128d)Z67QW)),Y67QW);Z67QW=_mm_slli_epi64(Z67QW,8);
150 const __m128d X66QW = _mm_loaddup_pd(&X[6+Ci]);
151 Y01QW = _mm_add_pd(_mm_mul_pd(X66QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[i]&0xFFFFFFFFFFFF8000)])),_mm_setzero_pd(),(__m128d)Z01QW)),Y01QW);Z01QW=_mm_slli_epi64(Z01QW,8);
152 Y23QW = _mm_add_pd(_mm_mul_pd(X66QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[i]&0xFFFFFFFFFFFFE000)])),_mm_setzero_pd(),(__m128d)Z23QW)),Y23QW);Z23QW=_mm_slli_epi64(Z23QW,8);
153 Y45QW = _mm_add_pd(_mm_mul_pd(X66QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[i]&0xFFFFFFFFFFFFF800)])),_mm_setzero_pd(),(__m128d)Z45QW)),Y45QW);Z45QW=_mm_slli_epi64(Z45QW,8);
154 Y67QW = _mm_add_pd(_mm_mul_pd(X66QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[i]&0xFFFFFFFFFFFFFE00)])),_mm_setzero_pd(),(__m128d)Z67QW)),Y67QW);Z67QW=_mm_slli_epi64(Z67QW,8);
156 const __m128d X77QW = _mm_loaddup_pd(&X[7+Ci]);
157 Y01QW = _mm_add_pd(_mm_mul_pd(X77QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[i]&0xFFFFFFFFFFFFFF80)])),_mm_setzero_pd(),(__m128d)Z01QW)),Y01QW);
158 Y23QW = _mm_add_pd(_mm_mul_pd(X77QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[i]&0xFFFFFFFFFFFFFFE0)])),_mm_setzero_pd(),(__m128d)Z23QW)),Y23QW);
159 Y45QW = _mm_add_pd(_mm_mul_pd(X77QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[i]&0xFFFFFFFFFFFFFFF8)])),_mm_setzero_pd(),(__m128d)Z45QW)),Y45QW);
160 Y67QW = _mm_add_pd(_mm_mul_pd(X77QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[i]&0xFFFFFFFFFFFFFFFE)])),_mm_setzero_pd(),(__m128d)Z67QW)),Y67QW);
162 _V = (
double*)((uint64_t)_V+__builtin_popcountll(M[i]));
165 _mm_store_pd(&Y[0],Y01QW);
166 _mm_store_pd(&Y[2],Y23QW);
167 _mm_store_pd(&Y[4],Y45QW);
168 _mm_store_pd(&Y[6],Y67QW);
173 uint64_t i,trial,trials=1000;
176 double *V = (
double*)malloc(N*64*
sizeof(
double));
for(i=0;i<N*64;i++)V[i]=1.0*(i+1);
177 uint64_t *M = (uint64_t*)malloc(N* 1*
sizeof(uint64_t));
for(i=0;i<N* 1;i++)M[i]=0x0000000000000000;
178 uint32_t *C = (uint32_t*)malloc(N* 1*
sizeof(uint32_t));
for(i=0;i<N* 1;i++)C[i]=0;
179 uint32_t *P = (uint32_t*)malloc(N* 1*
sizeof(uint32_t));
for(i=0;i<N* 1;i++)P[i]=0;
181 double *X = (
double*)malloc(1024*
sizeof(
double));
for(i=0;i<1024;i++)X[i]=1.0;
182 double *Y = (
double*)malloc(1024*
sizeof(
double));
for(i=0;i<1024;i++)Y[i]=1.0;
184 double t0 = timer_seconds_since_init();
185 for(trial=0;trial<trials;trial++){
187 double *temp=Y;Y=X;X=temp;
189 double t1 = timer_seconds_since_init();
191 printf(
"%0.9f seconds\n",t1-t0);
192 printf(
"%4.3f GFlop/s\n",128*N*trials/(t1-t0)/1e9);
193 for(i=0;i<8;i++)printf(
"%6.3f ",X[i]);printf(
"\n");
194 for(i=0;i<8;i++)printf(
"%6.3f ",Y[i]);printf(
"\n");
unsigned long long ssp_u64
unsigned long long ssp_u64
__m128d ssp_blendv_pd_SSE2(__m128d a, __m128d b, __m128d mask)
void SSEspmv(const double *V, const uint64_t *M, const IT *bot, const IT nrb, const double *X, double *Y)