10 #include <pmmintrin.h>
13 #include <tmmintrin.h>
14 #include <smmintrin.h>
15 #include <nmmintrin.h>
16 #include <wmmintrin.h>
20 #include <nmmintrin.h>
22 #include <ammintrin.h>
/* masktable64[i] is a 64-bit value with exactly one bit set: bit (63 - i).
 * That is, the table lists the single-bit masks ordered from the most
 * significant bit down to the least significant bit, so that
 * (m & masktable64[i]) != 0  <=>  bit i of the register-block bitmask m,
 * counted from the MSB, is set.
 * NOTE(review): re-emitted without the stray line-number tokens that were
 * embedded in the extracted source; values are unchanged.  ULL suffixes
 * added so every initializer is unambiguously 64-bit. */
const uint64_t masktable64[64] = {
    0x8000000000000000ULL, 0x4000000000000000ULL, 0x2000000000000000ULL, 0x1000000000000000ULL,
    0x0800000000000000ULL, 0x0400000000000000ULL, 0x0200000000000000ULL, 0x0100000000000000ULL,
    0x0080000000000000ULL, 0x0040000000000000ULL, 0x0020000000000000ULL, 0x0010000000000000ULL,
    0x0008000000000000ULL, 0x0004000000000000ULL, 0x0002000000000000ULL, 0x0001000000000000ULL,
    0x0000800000000000ULL, 0x0000400000000000ULL, 0x0000200000000000ULL, 0x0000100000000000ULL,
    0x0000080000000000ULL, 0x0000040000000000ULL, 0x0000020000000000ULL, 0x0000010000000000ULL,
    0x0000008000000000ULL, 0x0000004000000000ULL, 0x0000002000000000ULL, 0x0000001000000000ULL,
    0x0000000800000000ULL, 0x0000000400000000ULL, 0x0000000200000000ULL, 0x0000000100000000ULL,
    0x0000000080000000ULL, 0x0000000040000000ULL, 0x0000000020000000ULL, 0x0000000010000000ULL,
    0x0000000008000000ULL, 0x0000000004000000ULL, 0x0000000002000000ULL, 0x0000000001000000ULL,
    0x0000000000800000ULL, 0x0000000000400000ULL, 0x0000000000200000ULL, 0x0000000000100000ULL,
    0x0000000000080000ULL, 0x0000000000040000ULL, 0x0000000000020000ULL, 0x0000000000010000ULL,
    0x0000000000008000ULL, 0x0000000000004000ULL, 0x0000000000002000ULL, 0x0000000000001000ULL,
    0x0000000000000800ULL, 0x0000000000000400ULL, 0x0000000000000200ULL, 0x0000000000000100ULL,
    0x0000000000000080ULL, 0x0000000000000040ULL, 0x0000000000000020ULL, 0x0000000000000010ULL,
    0x0000000000000008ULL, 0x0000000000000004ULL, 0x0000000000000002ULL, 0x0000000000000001ULL };
/* masktable16[i] is a 16-bit value with exactly one bit set: bit (15 - i),
 * i.e. the single-bit masks ordered from the MSB down to the LSB.  Used to
 * probe individual positions of a 16-bit register-block bitmask, the same
 * way masktable64 is used for 64-bit masks.
 * NOTE(review): re-emitted without the stray line-number tokens that were
 * embedded in the extracted source; values are unchanged. */
const unsigned short masktable16[16] = {
    0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100,
    0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001 };
102 Mask.
i = _mm_shuffle_epi32( Mask.
i, _MM_SHUFFLE(3, 3, 1, 1) );
103 Mask.
i = _mm_srai_epi32 ( Mask.
i, 31 );
105 B.
i = _mm_and_si128( B.
i, Mask.
i );
106 A.
i = _mm_andnot_si128( Mask.
i, A.
i );
107 A.
i = _mm_or_si128( A.
i, B.
i );
/* Portability shims.
 * NOTE(review): in the full file these #defines are presumably inside
 * #if/#else branches that select the SSE level and the compiler -- only
 * these lines are visible in this chunk; confirm the guarding conditions
 * before relying on them together. */
/* No SSE4.1: route _mm_blendv_pd to the SSE2 software emulation above. */
113 #define _mm_blendv_pd ssp_blendv_pd_SSE2
/* Map the GCC popcount builtins to the SSE4.2 POPCNT intrinsics
 * (presumably the non-GCC branch, where __builtin_popcount* do not exist;
 * requires hardware POPCNT support). */
117 #define __builtin_popcountll _mm_popcnt_u64
118 #define __builtin_popcount _mm_popcnt_u32
124 0x0, 0x20, 0x10, 0x30, 0x8, 0x28, 0x18, 0x38,
125 0x4, 0x24, 0x14, 0x34, 0xc, 0x2c, 0x1c, 0x3c,
126 0x2, 0x22, 0x12, 0x32, 0xa, 0x2a, 0x1a, 0x3a,
127 0x6, 0x26, 0x16, 0x36, 0xe, 0x2e, 0x1e, 0x3e,
128 0x1, 0x21, 0x11, 0x31, 0x9, 0x29, 0x19, 0x39,
129 0x5, 0x25, 0x15, 0x35, 0xd, 0x2d, 0x1d, 0x3d,
130 0x3, 0x23, 0x13, 0x33, 0xb, 0x2b, 0x1b, 0x3b,
131 0x7, 0x27, 0x17, 0x37, 0xf, 0x2f, 0x1f, 0x3f
148 "movq %0, %%rax \n\t"
149 "xorpd %%xmm0, %%xmm0 \n\t"
150 "movsd %1, %%xmm0\n\t"
153 "movq %%rax, %%xmm1\n\t"
154 "addsd %%xmm0, %%xmm1\n\t"
155 "movq %%xmm1, %%r8 \n\t"
156 "lock cmpxchgq %%r8, %0\n\t"
160 :
"cc",
"memory",
"%rax",
"%r8",
"%xmm0",
"%xmm1"
165 void symcsr(
const double * __restrict V,
const uint64_t * __restrict M,
const unsigned * __restrict bot,
const unsigned nrb,
166 const double * __restrict X,
const double * __restrict XT,
double * Y,
double * YT,
unsigned lowmask,
unsigned nlowbits)
168 static const size_t NMortonRows64[] =
170 0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3,
171 4, 5, 4, 5, 6, 7, 6, 7, 4, 5, 4, 5, 6, 7, 6, 7,
172 0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3,
173 4, 5, 4, 5, 6, 7, 6, 7, 4, 5, 4, 5, 6, 7, 6, 7
175 static const size_t NMortonCols64[] =
177 0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3,
178 0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3,
179 4, 4, 5, 5, 4, 4, 5, 5, 6, 6, 7, 7, 6, 6, 7, 7,
180 4, 4, 5, 5, 4, 4, 5, 5, 6, 6, 7, 7, 6, 6, 7, 7
183 for(
unsigned i=0; i<nrb;++i)
185 const unsigned Ci = bot[i] & lowmask;
186 const unsigned Ri = (bot[i] >> nlowbits) & lowmask;
187 uint64_t mask = M[i];
188 for(
size_t j=0; j<64; ++j)
200 void symcsr(
const double * __restrict V,
const unsigned short * __restrict M,
const unsigned * __restrict bot,
const unsigned nrb,
201 const double * __restrict X,
const double * __restrict XT,
double * Y,
double * YT,
unsigned lowmask,
unsigned nlowbits)
203 static const size_t NMortonRows16[] = { 0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3 };
204 static const size_t NMortonCols16[] = { 0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3 };
206 for(
unsigned i=0; i<nrb;++i)
208 const unsigned Ci = bot[i] & lowmask;
209 const unsigned Ri = (bot[i] >> nlowbits) & lowmask;
210 unsigned short mask = M[i];
211 for(
size_t j=0; j<16; ++j)
222 void symcsr(
const double * __restrict V,
const unsigned char * __restrict M,
const unsigned * __restrict bot,
const unsigned nrb,
223 const double * __restrict X,
const double * __restrict XT,
double * Y,
double * YT,
unsigned lowmask,
unsigned nlowbits)
225 for(
unsigned i=0; i<nrb;++i)
227 const unsigned Ci = bot[i] & lowmask;
228 const unsigned Ri = (bot[i] >> nlowbits) & lowmask;
229 unsigned char mask = M[i];
266 void SSEsym(
const double * __restrict V,
const unsigned char * __restrict M,
const unsigned * __restrict bot,
const unsigned nrb,
267 const double * __restrict X,
double * __restrict Y,
unsigned lowmask,
unsigned nlbits)
269 const double * __restrict _V = V-1;
273 for(
unsigned ind=0;ind<nrb;++ind)
275 const unsigned Ci = bot[ind] & lowmask;
276 const unsigned Ri = (bot[ind] >> nlbits) & lowmask;
278 const uint64_t m64 = (uint64_t) M[ind];
279 const uint64_t Zi = ((~m64) << 60);
280 const uint64_t Zil = Zi << 1;
283 __m128i Z01QW = _mm_unpacklo_epi64 (_mm_loadl_epi64((__m128i*)&Zi), _mm_loadl_epi64((__m128i*)&Zil));
285 __m128i Z01QW = _mm_insert_epi64(_mm_loadl_epi64((__m128i*)&Zi),Zil,1);
287 __m128i Z23QW = _mm_slli_epi64(Z01QW, 2);
289 __m128d Y01QW = _mm_loadu_pd(&Y[Ri]);
292 __m128d X00QW = _mm_loaddup_pd(&X[0+Ci]);
293 __m128d X11QW = _mm_loaddup_pd(&X[1+Ci]);
294 __m128d X01QW = _mm_loadu_pd(&X[Ri]);
299 __m128d A01QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcount(M[ind]&0x8)])), _mm_setzero_pd(),(__m128d)Z01QW);
300 __m128d A23QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcount(M[ind]&0xE)])), _mm_setzero_pd(),(__m128d)Z23QW);
302 Y01QW = _mm_add_pd(_mm_mul_pd(X00QW,A01QW),Y01QW);
303 Y01QW = _mm_add_pd(_mm_mul_pd(X11QW,A23QW),Y01QW);
304 __m128d Y00QW = _mm_mul_pd(X01QW, A01QW);
305 __m128d Y11QW = _mm_mul_pd(X01QW, A23QW);
308 _V += __builtin_popcount(M[ind] & 0x0F);
315 _mm_store_pd(&Y[Ri],Y01QW);
318 Y[Ci+0] += yt0.
f64[0] + yt0.
f64[1];
319 Y[Ci+1] += yt1.
f64[0] + yt1.
f64[1];
329 void SSEsym(
const double * __restrict V,
const unsigned char * __restrict M,
const unsigned * __restrict bot,
const unsigned nrb,
330 const double * __restrict X,
const double * __restrict XT,
double * Y,
double * YT,
unsigned lowmask,
unsigned nlbits)
332 const double * __restrict _V = V-1;
336 for(
unsigned ind=0;ind<nrb;++ind)
338 const unsigned Ci = bot[ind] & lowmask;
339 const unsigned Ri = (bot[ind] >> nlbits) & lowmask;
341 const uint64_t m64 = (uint64_t) M[ind];
342 const uint64_t Zi = ((~m64) << 60);
343 const uint64_t Zil = Zi << 1;
346 __m128i Z01QW = _mm_unpacklo_epi64 (_mm_loadl_epi64((__m128i*)&Zi), _mm_loadl_epi64((__m128i*)&Zil));
348 __m128i Z01QW = _mm_insert_epi64(_mm_loadl_epi64((__m128i*)&Zi),Zil,1);
350 __m128i Z23QW = _mm_slli_epi64(Z01QW, 2);
352 __m128d Y01QW = _mm_loadu_pd(&Y[Ri]);
355 __m128d X00QW = _mm_loaddup_pd(&X[0+Ci]);
356 __m128d X11QW = _mm_loaddup_pd(&X[1+Ci]);
357 __m128d X01QW = _mm_loadu_pd(&XT[Ri]);
362 __m128d A01QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcount(M[ind]&0x8)])), _mm_setzero_pd(),(__m128d)Z01QW);
363 __m128d A23QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcount(M[ind]&0xE)])), _mm_setzero_pd(),(__m128d)Z23QW);
365 Y01QW = _mm_add_pd(_mm_mul_pd(X00QW,A01QW),Y01QW);
366 Y01QW = _mm_add_pd(_mm_mul_pd(X11QW,A23QW),Y01QW);
367 __m128d YT0QW = _mm_mul_pd(X01QW, A01QW);
368 __m128d YT1QW = _mm_mul_pd(X01QW, A23QW);
371 _V += __builtin_popcount(M[ind] & 0x0F);
378 YT[Ci+0] += yt0.
f64[0] + yt0.
f64[1];
379 YT[Ci+1] += yt1.
f64[0] + yt1.
f64[1];
380 _mm_store_pd(&Y[Ri],Y01QW);
395 void SSEspmv(
const double * __restrict V,
const unsigned char * __restrict M,
const unsigned * __restrict bot,
const unsigned nrb,
const double * __restrict X,
double * Y,
unsigned lcmask,
unsigned lrmask,
unsigned clbits)
397 const double * __restrict _V = V-1;
401 for(
unsigned ind=0;ind<nrb;++ind)
403 const unsigned Ci = bot[ind] & lcmask;
404 const unsigned Ri = (bot[ind] >> clbits) & lrmask;
406 const uint64_t m64 = (uint64_t) M[ind];
407 const uint64_t Zi = ((~m64) << 60);
408 const uint64_t Zil = Zi << 1;
411 __m128i Z01QW = _mm_unpacklo_epi64 (_mm_loadl_epi64((__m128i*)&Zi), _mm_loadl_epi64((__m128i*)&Zil));
413 __m128i Z01QW = _mm_insert_epi64(_mm_loadl_epi64((__m128i*)&Zi),Zil,1);
415 __m128i Z23QW = _mm_slli_epi64(Z01QW, 2);
417 __m128d Y01QW = _mm_loadu_pd(&Y[Ri]);
420 __m128d X00QW = _mm_loaddup_pd(&X[0+Ci]);
421 __m128d X11QW = _mm_loaddup_pd(&X[1+Ci]);
426 Y01QW = _mm_add_pd(_mm_mul_pd(X00QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcount(M[ind]&0x8)])),_mm_setzero_pd(),(__m128d)Z01QW)),Y01QW);
427 Y01QW = _mm_add_pd(_mm_mul_pd(X11QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcount(M[ind]&0xE)])),_mm_setzero_pd(),(__m128d)Z23QW)),Y01QW);
430 _V += __builtin_popcount(M[ind] & 0x0F);
433 _mm_store_pd(&Y[Ri],Y01QW);
439 void SSEsym(
const double * __restrict V,
const uint64_t * __restrict M,
const unsigned * __restrict bot,
const unsigned nrb,
440 const double * __restrict X,
double * Y,
unsigned lowmask,
unsigned nlbits)
442 const double * __restrict _V = V-1;
444 for(
unsigned ind=0;ind<nrb;++ind)
446 const unsigned Ci = bot[ind] & lowmask;
447 const unsigned Ri = (bot[ind] >> nlbits) & lowmask;
448 const uint64_t Zi = ~M[ind];
449 const uint64_t Zil = Zi << 1;
452 __m128i Z01QW = _mm_unpacklo_epi64 (_mm_loadl_epi64((__m128i*)&Zi), _mm_loadl_epi64((__m128i*)&Zil));
454 __m128i Z01QW = _mm_insert_epi64(_mm_loadl_epi64((__m128i*)&Zi),Zil,1);
456 __m128i Z23QW = _mm_slli_epi64(Z01QW, 2);
457 __m128i Z45QW = _mm_slli_epi64(Z01QW, 4);
458 __m128i Z67QW = _mm_slli_epi64(Z01QW, 6);
460 __m128d Y01QW = _mm_loadu_pd(&Y[Ri]);
461 __m128d Y23QW = _mm_loadu_pd(&Y[Ri+2]);
462 __m128d Y45QW = _mm_loadu_pd(&Y[Ri+4]);
463 __m128d Y67QW = _mm_loadu_pd(&Y[Ri+6]);
466 __m128d X00QW = _mm_loaddup_pd(&X[0+Ci]);
467 __m128d X11QW = _mm_loaddup_pd(&X[1+Ci]);
468 __m128d X22QW = _mm_loaddup_pd(&X[2+Ci]);
469 __m128d X33QW = _mm_loaddup_pd(&X[3+Ci]);
471 __m128d X01QW = _mm_loadu_pd(&X[Ri]);
472 __m128d X23QW = _mm_loadu_pd(&X[Ri+2]);
473 __m128d X45QW = _mm_loadu_pd(&X[Ri+4]);
474 __m128d X67QW = _mm_loadu_pd(&X[Ri+6]);
476 __m128d A01QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0x8000000000000000)])), _mm_setzero_pd(),(__m128d)Z01QW);
477 __m128d A23QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xE000000000000000)])), _mm_setzero_pd(),(__m128d)Z23QW);
478 __m128d A45QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xF800000000000000)])), _mm_setzero_pd(),(__m128d)Z45QW);
479 __m128d A67QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFE00000000000000)])), _mm_setzero_pd(),(__m128d)Z67QW);
481 Y01QW = _mm_add_pd(_mm_mul_pd(X00QW, A01QW), Y01QW); Z01QW=_mm_slli_epi64(Z01QW,8);
482 Y23QW = _mm_add_pd(_mm_mul_pd(X00QW, A45QW), Y23QW); Z45QW=_mm_slli_epi64(Z45QW,8);
483 Y01QW = _mm_add_pd(_mm_mul_pd(X11QW, A23QW), Y01QW); Z23QW=_mm_slli_epi64(Z23QW,8);
484 Y23QW = _mm_add_pd(_mm_mul_pd(X11QW, A67QW), Y23QW); Z67QW=_mm_slli_epi64(Z67QW,8);
486 __m128d Y00QW = _mm_mul_pd(X01QW, A01QW);
487 __m128d Y11QW = _mm_mul_pd(X01QW, A23QW);
488 Y00QW = _mm_add_pd(_mm_mul_pd(X23QW, A45QW), Y00QW);
489 Y11QW = _mm_add_pd(_mm_mul_pd(X23QW, A67QW), Y11QW);
492 A01QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFF80000000000000)])), _mm_setzero_pd(),(__m128d)Z01QW);
493 A23QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFE0000000000000)])), _mm_setzero_pd(),(__m128d)Z23QW);
494 A45QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFF8000000000000)])), _mm_setzero_pd(),(__m128d)Z45QW);
495 A67QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFE000000000000)])), _mm_setzero_pd(),(__m128d)Z67QW);
497 Y01QW = _mm_add_pd(_mm_mul_pd(X22QW, A01QW), Y01QW); Z01QW=_mm_slli_epi64(Z01QW,8);
498 Y23QW = _mm_add_pd(_mm_mul_pd(X22QW, A45QW), Y23QW); Z45QW=_mm_slli_epi64(Z45QW,8);
499 Y01QW = _mm_add_pd(_mm_mul_pd(X33QW, A23QW), Y01QW); Z23QW=_mm_slli_epi64(Z23QW,8);
500 Y23QW = _mm_add_pd(_mm_mul_pd(X33QW, A67QW), Y23QW); Z67QW=_mm_slli_epi64(Z67QW,8);
502 __m128d Y22QW = _mm_mul_pd(X01QW, A01QW);
503 __m128d Y33QW = _mm_mul_pd(X01QW, A23QW);
504 Y22QW = _mm_add_pd(_mm_mul_pd(X23QW, A45QW), Y22QW);
505 Y33QW = _mm_add_pd(_mm_mul_pd(X23QW, A67QW), Y33QW);
508 A01QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFF800000000000)])), _mm_setzero_pd(),(__m128d)Z01QW);
509 A23QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFE00000000000)])), _mm_setzero_pd(),(__m128d)Z23QW);
510 A45QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFF80000000000)])), _mm_setzero_pd(),(__m128d)Z45QW);
511 A67QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFE0000000000)])), _mm_setzero_pd(),(__m128d)Z67QW);
513 Y45QW = _mm_add_pd(_mm_mul_pd(X00QW, A01QW), Y45QW); Z01QW=_mm_slli_epi64(Z01QW,8);
514 Y67QW = _mm_add_pd(_mm_mul_pd(X00QW, A45QW), Y67QW); Z45QW=_mm_slli_epi64(Z45QW,8);
515 Y45QW = _mm_add_pd(_mm_mul_pd(X11QW, A23QW), Y45QW); Z23QW=_mm_slli_epi64(Z23QW,8);
516 Y67QW = _mm_add_pd(_mm_mul_pd(X11QW, A67QW), Y67QW); Z67QW=_mm_slli_epi64(Z67QW,8);
518 Y00QW = _mm_add_pd(_mm_mul_pd(X45QW, A01QW), Y00QW);
519 Y11QW = _mm_add_pd(_mm_mul_pd(X45QW, A23QW), Y11QW);
520 Y00QW = _mm_add_pd(_mm_mul_pd(X67QW, A45QW), Y00QW);
521 Y11QW = _mm_add_pd(_mm_mul_pd(X67QW, A67QW), Y11QW);
523 A01QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFF8000000000)])), _mm_setzero_pd(),(__m128d)Z01QW);
524 A23QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFFE000000000)])), _mm_setzero_pd(),(__m128d)Z23QW);
525 A45QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFFF800000000)])), _mm_setzero_pd(),(__m128d)Z45QW);
526 A67QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFFFE00000000)])), _mm_setzero_pd(),(__m128d)Z67QW);
528 Y45QW = _mm_add_pd(_mm_mul_pd(X22QW, A01QW), Y45QW); Z01QW=_mm_slli_epi64(Z01QW,8);
529 Y67QW = _mm_add_pd(_mm_mul_pd(X22QW, A45QW), Y67QW); Z45QW=_mm_slli_epi64(Z45QW,8);
530 Y45QW = _mm_add_pd(_mm_mul_pd(X33QW, A23QW), Y45QW); Z23QW=_mm_slli_epi64(Z23QW,8);
531 Y67QW = _mm_add_pd(_mm_mul_pd(X33QW, A67QW), Y67QW); Z67QW=_mm_slli_epi64(Z67QW,8);
533 Y22QW = _mm_add_pd(_mm_mul_pd(X45QW, A01QW), Y22QW);
534 Y33QW = _mm_add_pd(_mm_mul_pd(X45QW, A23QW), Y33QW);
535 Y22QW = _mm_add_pd(_mm_mul_pd(X67QW, A45QW), Y22QW);
536 Y33QW = _mm_add_pd(_mm_mul_pd(X67QW, A67QW), Y33QW);
540 X00QW = _mm_loaddup_pd(&X[4+Ci]);
541 X11QW = _mm_loaddup_pd(&X[5+Ci]);
542 X22QW = _mm_loaddup_pd(&X[6+Ci]);
543 X33QW = _mm_loaddup_pd(&X[7+Ci]);
546 A01QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFFFF80000000)])), _mm_setzero_pd(),(__m128d)Z01QW);
547 A23QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFFFFE0000000)])), _mm_setzero_pd(),(__m128d)Z23QW);
548 A45QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFFFFF8000000)])), _mm_setzero_pd(),(__m128d)Z45QW);
549 A67QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFFFFFE000000)])), _mm_setzero_pd(),(__m128d)Z67QW);
551 Y01QW = _mm_add_pd(_mm_mul_pd(X00QW, A01QW), Y01QW); Z01QW=_mm_slli_epi64(Z01QW,8);
552 Y23QW = _mm_add_pd(_mm_mul_pd(X00QW, A45QW), Y23QW); Z45QW=_mm_slli_epi64(Z45QW,8);
553 Y01QW = _mm_add_pd(_mm_mul_pd(X11QW, A23QW), Y01QW); Z23QW=_mm_slli_epi64(Z23QW,8);
554 Y23QW = _mm_add_pd(_mm_mul_pd(X11QW, A67QW), Y23QW); Z67QW=_mm_slli_epi64(Z67QW,8);
556 __m128d Y44QW = _mm_mul_pd(X01QW, A01QW);
557 __m128d Y55QW = _mm_mul_pd(X01QW, A23QW);
558 Y44QW = _mm_add_pd(_mm_mul_pd(X23QW, A45QW), Y44QW);
559 Y55QW = _mm_add_pd(_mm_mul_pd(X23QW, A67QW), Y55QW);
561 A01QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFFFFFF800000)])), _mm_setzero_pd(),(__m128d)Z01QW);
562 A23QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFFFFFFE00000)])), _mm_setzero_pd(),(__m128d)Z23QW);
563 A45QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFFFFFFF80000)])), _mm_setzero_pd(),(__m128d)Z45QW);
564 A67QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFFFFFFFE0000)])), _mm_setzero_pd(),(__m128d)Z67QW);
566 Y01QW = _mm_add_pd(_mm_mul_pd(X22QW, A01QW), Y01QW); Z01QW=_mm_slli_epi64(Z01QW,8);
567 Y23QW = _mm_add_pd(_mm_mul_pd(X22QW, A45QW), Y23QW); Z45QW=_mm_slli_epi64(Z45QW,8);
568 Y01QW = _mm_add_pd(_mm_mul_pd(X33QW, A23QW), Y01QW); Z23QW=_mm_slli_epi64(Z23QW,8);
569 Y23QW = _mm_add_pd(_mm_mul_pd(X33QW, A67QW), Y23QW); Z67QW=_mm_slli_epi64(Z67QW,8);
571 __m128d Y66QW = _mm_mul_pd(X01QW, A01QW);
572 __m128d Y77QW = _mm_mul_pd(X01QW, A23QW);
573 Y66QW = _mm_add_pd(_mm_mul_pd(X23QW, A45QW), Y66QW);
574 Y77QW = _mm_add_pd(_mm_mul_pd(X23QW, A67QW), Y77QW);
577 A01QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFFFFFFFF8000)])), _mm_setzero_pd(),(__m128d)Z01QW);
578 A23QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFFFFFFFFE000)])), _mm_setzero_pd(),(__m128d)Z23QW);
579 A45QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFFFFFFFFF800)])), _mm_setzero_pd(),(__m128d)Z45QW);
580 A67QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFFFFFFFFFE00)])), _mm_setzero_pd(),(__m128d)Z67QW);
582 Y45QW = _mm_add_pd(_mm_mul_pd(X00QW, A01QW), Y45QW); Z01QW=_mm_slli_epi64(Z01QW,8);
583 Y67QW = _mm_add_pd(_mm_mul_pd(X00QW, A45QW), Y67QW); Z45QW=_mm_slli_epi64(Z45QW,8);
584 Y45QW = _mm_add_pd(_mm_mul_pd(X11QW, A23QW), Y45QW); Z23QW=_mm_slli_epi64(Z23QW,8);
585 Y67QW = _mm_add_pd(_mm_mul_pd(X11QW, A67QW), Y67QW); Z67QW=_mm_slli_epi64(Z67QW,8);
587 Y44QW = _mm_add_pd(_mm_mul_pd(X45QW, A01QW), Y44QW);
588 Y55QW = _mm_add_pd(_mm_mul_pd(X45QW, A23QW), Y55QW);
589 Y44QW = _mm_add_pd(_mm_mul_pd(X67QW, A45QW), Y44QW);
590 Y55QW = _mm_add_pd(_mm_mul_pd(X67QW, A67QW), Y55QW);
592 A01QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFFFFFFFFFF80)])), _mm_setzero_pd(),(__m128d)Z01QW);
593 A23QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFFFFFFFFFFE0)])), _mm_setzero_pd(),(__m128d)Z23QW);
594 A45QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFFFFFFFFFFF8)])), _mm_setzero_pd(),(__m128d)Z45QW);
595 A67QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFFFFFFFFFFFE)])), _mm_setzero_pd(),(__m128d)Z67QW);
597 Y45QW = _mm_add_pd(_mm_mul_pd(X22QW, A01QW), Y45QW);
598 Y67QW = _mm_add_pd(_mm_mul_pd(X22QW, A45QW), Y67QW);
599 Y45QW = _mm_add_pd(_mm_mul_pd(X33QW, A23QW), Y45QW);
600 Y67QW = _mm_add_pd(_mm_mul_pd(X33QW, A67QW), Y67QW);
602 Y66QW = _mm_add_pd(_mm_mul_pd(X45QW, A01QW), Y66QW);
603 Y77QW = _mm_add_pd(_mm_mul_pd(X45QW, A23QW), Y77QW);
604 Y66QW = _mm_add_pd(_mm_mul_pd(X67QW, A45QW), Y66QW);
605 Y77QW = _mm_add_pd(_mm_mul_pd(X67QW, A67QW), Y77QW);
608 _V += __builtin_popcountll(M[ind]);
611 _mm_store_pd(&Y[Ri],Y01QW);
612 _mm_store_pd(&Y[Ri+2],Y23QW);
613 _mm_store_pd(&Y[Ri+4],Y45QW);
614 _mm_store_pd(&Y[Ri+6],Y67QW);
618 ssp_m128 yt0, yt1, yt2, yt3,yt4,yt5,yt6,yt7;
628 Y[Ci+0] += yt0.
f64[0] + yt0.
f64[1];
629 Y[Ci+1] += yt1.
f64[0] + yt1.
f64[1];
630 Y[Ci+2] += yt2.
f64[0] + yt2.
f64[1];
631 Y[Ci+3] += yt3.
f64[0] + yt3.
f64[1];
632 Y[Ci+4] += yt4.
f64[0] + yt4.
f64[1];
633 Y[Ci+5] += yt5.
f64[0] + yt5.
f64[1];
634 Y[Ci+6] += yt6.
f64[0] + yt6.
f64[1];
635 Y[Ci+7] += yt7.
f64[0] + yt7.
f64[1];
642 void SSEsym(
const double * __restrict V,
const uint64_t * __restrict M,
const unsigned * __restrict bot,
const unsigned nrb,
643 const double * __restrict X,
const double * __restrict XT,
double * __restrict Y,
double * __restrict YT,
unsigned lowmask,
unsigned nlbits)
645 const double * __restrict _V = V-1;
647 for(
unsigned ind=0;ind<nrb;++ind)
649 const unsigned Ci = bot[ind] & lowmask;
650 const unsigned Ri = (bot[ind] >> nlbits) & lowmask;
651 const uint64_t Zi = ~M[ind];
652 const uint64_t Zil = Zi << 1;
655 __m128i Z01QW = _mm_unpacklo_epi64 (_mm_loadl_epi64((__m128i*)&Zi), _mm_loadl_epi64((__m128i*)&Zil));
657 __m128i Z01QW = _mm_insert_epi64(_mm_loadl_epi64((__m128i*)&Zi),Zil,1);
659 __m128i Z23QW = _mm_slli_epi64(Z01QW, 2);
660 __m128i Z45QW = _mm_slli_epi64(Z01QW, 4);
661 __m128i Z67QW = _mm_slli_epi64(Z01QW, 6);
663 __m128d Y01QW = _mm_loadu_pd(&Y[Ri]);
664 __m128d Y23QW = _mm_loadu_pd(&Y[Ri+2]);
665 __m128d Y45QW = _mm_loadu_pd(&Y[Ri+4]);
666 __m128d Y67QW = _mm_loadu_pd(&Y[Ri+6]);
669 __m128d X00QW = _mm_loaddup_pd(&X[0+Ci]);
670 __m128d X11QW = _mm_loaddup_pd(&X[1+Ci]);
671 __m128d X22QW = _mm_loaddup_pd(&X[2+Ci]);
672 __m128d X33QW = _mm_loaddup_pd(&X[3+Ci]);
674 __m128d X01QW = _mm_loadu_pd(&XT[Ri]);
675 __m128d X23QW = _mm_loadu_pd(&XT[Ri+2]);
676 __m128d X45QW = _mm_loadu_pd(&XT[Ri+4]);
677 __m128d X67QW = _mm_loadu_pd(&XT[Ri+6]);
679 __m128d A01QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0x8000000000000000)])), _mm_setzero_pd(),(__m128d)Z01QW);
680 __m128d A23QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xE000000000000000)])), _mm_setzero_pd(),(__m128d)Z23QW);
681 __m128d A45QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xF800000000000000)])), _mm_setzero_pd(),(__m128d)Z45QW);
682 __m128d A67QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFE00000000000000)])), _mm_setzero_pd(),(__m128d)Z67QW);
684 Y01QW = _mm_add_pd(_mm_mul_pd(X00QW, A01QW), Y01QW); Z01QW=_mm_slli_epi64(Z01QW,8);
685 Y23QW = _mm_add_pd(_mm_mul_pd(X00QW, A45QW), Y23QW); Z45QW=_mm_slli_epi64(Z45QW,8);
686 Y01QW = _mm_add_pd(_mm_mul_pd(X11QW, A23QW), Y01QW); Z23QW=_mm_slli_epi64(Z23QW,8);
687 Y23QW = _mm_add_pd(_mm_mul_pd(X11QW, A67QW), Y23QW); Z67QW=_mm_slli_epi64(Z67QW,8);
689 __m128d YT0QW = _mm_mul_pd(X01QW, A01QW);
690 __m128d YT1QW = _mm_mul_pd(X01QW, A23QW);
691 YT0QW = _mm_add_pd(_mm_mul_pd(X23QW, A45QW), YT0QW);
692 YT1QW = _mm_add_pd(_mm_mul_pd(X23QW, A67QW), YT1QW);
695 A01QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFF80000000000000)])), _mm_setzero_pd(),(__m128d)Z01QW);
696 A23QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFE0000000000000)])), _mm_setzero_pd(),(__m128d)Z23QW);
697 A45QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFF8000000000000)])), _mm_setzero_pd(),(__m128d)Z45QW);
698 A67QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFE000000000000)])), _mm_setzero_pd(),(__m128d)Z67QW);
700 Y01QW = _mm_add_pd(_mm_mul_pd(X22QW, A01QW), Y01QW); Z01QW=_mm_slli_epi64(Z01QW,8);
701 Y23QW = _mm_add_pd(_mm_mul_pd(X22QW, A45QW), Y23QW); Z45QW=_mm_slli_epi64(Z45QW,8);
702 Y01QW = _mm_add_pd(_mm_mul_pd(X33QW, A23QW), Y01QW); Z23QW=_mm_slli_epi64(Z23QW,8);
703 Y23QW = _mm_add_pd(_mm_mul_pd(X33QW, A67QW), Y23QW); Z67QW=_mm_slli_epi64(Z67QW,8);
705 __m128d YT2QW = _mm_mul_pd(X01QW, A01QW);
706 __m128d YT3QW = _mm_mul_pd(X01QW, A23QW);
707 YT2QW = _mm_add_pd(_mm_mul_pd(X23QW, A45QW), YT2QW);
708 YT3QW = _mm_add_pd(_mm_mul_pd(X23QW, A67QW), YT3QW);
711 A01QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFF800000000000)])), _mm_setzero_pd(),(__m128d)Z01QW);
712 A23QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFE00000000000)])), _mm_setzero_pd(),(__m128d)Z23QW);
713 A45QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFF80000000000)])), _mm_setzero_pd(),(__m128d)Z45QW);
714 A67QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFE0000000000)])), _mm_setzero_pd(),(__m128d)Z67QW);
716 Y45QW = _mm_add_pd(_mm_mul_pd(X00QW, A01QW), Y45QW); Z01QW=_mm_slli_epi64(Z01QW,8);
717 Y67QW = _mm_add_pd(_mm_mul_pd(X00QW, A45QW), Y67QW); Z45QW=_mm_slli_epi64(Z45QW,8);
718 Y45QW = _mm_add_pd(_mm_mul_pd(X11QW, A23QW), Y45QW); Z23QW=_mm_slli_epi64(Z23QW,8);
719 Y67QW = _mm_add_pd(_mm_mul_pd(X11QW, A67QW), Y67QW); Z67QW=_mm_slli_epi64(Z67QW,8);
721 YT0QW = _mm_add_pd(_mm_mul_pd(X45QW, A01QW), YT0QW);
722 YT1QW = _mm_add_pd(_mm_mul_pd(X45QW, A23QW), YT1QW);
723 YT0QW = _mm_add_pd(_mm_mul_pd(X67QW, A45QW), YT0QW);
724 YT1QW = _mm_add_pd(_mm_mul_pd(X67QW, A67QW), YT1QW);
726 A01QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFF8000000000)])), _mm_setzero_pd(),(__m128d)Z01QW);
727 A23QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFFE000000000)])), _mm_setzero_pd(),(__m128d)Z23QW);
728 A45QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFFF800000000)])), _mm_setzero_pd(),(__m128d)Z45QW);
729 A67QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFFFE00000000)])), _mm_setzero_pd(),(__m128d)Z67QW);
731 Y45QW = _mm_add_pd(_mm_mul_pd(X22QW, A01QW), Y45QW); Z01QW=_mm_slli_epi64(Z01QW,8);
732 Y67QW = _mm_add_pd(_mm_mul_pd(X22QW, A45QW), Y67QW); Z45QW=_mm_slli_epi64(Z45QW,8);
733 Y45QW = _mm_add_pd(_mm_mul_pd(X33QW, A23QW), Y45QW); Z23QW=_mm_slli_epi64(Z23QW,8);
734 Y67QW = _mm_add_pd(_mm_mul_pd(X33QW, A67QW), Y67QW); Z67QW=_mm_slli_epi64(Z67QW,8);
736 YT2QW = _mm_add_pd(_mm_mul_pd(X45QW, A01QW), YT2QW);
737 YT3QW = _mm_add_pd(_mm_mul_pd(X45QW, A23QW), YT3QW);
738 YT2QW = _mm_add_pd(_mm_mul_pd(X67QW, A45QW), YT2QW);
739 YT3QW = _mm_add_pd(_mm_mul_pd(X67QW, A67QW), YT3QW);
747 YT[Ci+0] += yt0.
f64[0] + yt0.
f64[1];
748 YT[Ci+1] += yt1.
f64[0] + yt1.
f64[1];
749 YT[Ci+2] += yt2.
f64[0] + yt2.
f64[1];
750 YT[Ci+3] += yt3.
f64[0] + yt3.
f64[1];
753 X00QW = _mm_loaddup_pd(&X[4+Ci]);
754 X11QW = _mm_loaddup_pd(&X[5+Ci]);
755 X22QW = _mm_loaddup_pd(&X[6+Ci]);
756 X33QW = _mm_loaddup_pd(&X[7+Ci]);
759 A01QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFFFF80000000)])), _mm_setzero_pd(),(__m128d)Z01QW);
760 A23QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFFFFE0000000)])), _mm_setzero_pd(),(__m128d)Z23QW);
761 A45QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFFFFF8000000)])), _mm_setzero_pd(),(__m128d)Z45QW);
762 A67QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFFFFFE000000)])), _mm_setzero_pd(),(__m128d)Z67QW);
764 Y01QW = _mm_add_pd(_mm_mul_pd(X00QW, A01QW), Y01QW); Z01QW=_mm_slli_epi64(Z01QW,8);
765 Y23QW = _mm_add_pd(_mm_mul_pd(X00QW, A45QW), Y23QW); Z45QW=_mm_slli_epi64(Z45QW,8);
766 Y01QW = _mm_add_pd(_mm_mul_pd(X11QW, A23QW), Y01QW); Z23QW=_mm_slli_epi64(Z23QW,8);
767 Y23QW = _mm_add_pd(_mm_mul_pd(X11QW, A67QW), Y23QW); Z67QW=_mm_slli_epi64(Z67QW,8);
769 YT0QW = _mm_mul_pd(X01QW, A01QW);
770 YT1QW = _mm_mul_pd(X01QW, A23QW);
771 YT0QW = _mm_add_pd(_mm_mul_pd(X23QW, A45QW), YT0QW);
772 YT1QW = _mm_add_pd(_mm_mul_pd(X23QW, A67QW), YT1QW);
774 A01QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFFFFFF800000)])), _mm_setzero_pd(),(__m128d)Z01QW);
775 A23QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFFFFFFE00000)])), _mm_setzero_pd(),(__m128d)Z23QW);
776 A45QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFFFFFFF80000)])), _mm_setzero_pd(),(__m128d)Z45QW);
777 A67QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFFFFFFFE0000)])), _mm_setzero_pd(),(__m128d)Z67QW);
779 Y01QW = _mm_add_pd(_mm_mul_pd(X22QW, A01QW), Y01QW); Z01QW=_mm_slli_epi64(Z01QW,8);
780 Y23QW = _mm_add_pd(_mm_mul_pd(X22QW, A45QW), Y23QW); Z45QW=_mm_slli_epi64(Z45QW,8);
781 Y01QW = _mm_add_pd(_mm_mul_pd(X33QW, A23QW), Y01QW); Z23QW=_mm_slli_epi64(Z23QW,8);
782 Y23QW = _mm_add_pd(_mm_mul_pd(X33QW, A67QW), Y23QW); Z67QW=_mm_slli_epi64(Z67QW,8);
784 YT2QW = _mm_mul_pd(X01QW, A01QW);
785 YT3QW = _mm_mul_pd(X01QW, A23QW);
786 YT2QW = _mm_add_pd(_mm_mul_pd(X23QW, A45QW), YT2QW);
787 YT3QW = _mm_add_pd(_mm_mul_pd(X23QW, A67QW), YT3QW);
790 A01QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFFFFFFFF8000)])), _mm_setzero_pd(),(__m128d)Z01QW);
791 A23QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFFFFFFFFE000)])), _mm_setzero_pd(),(__m128d)Z23QW);
792 A45QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFFFFFFFFF800)])), _mm_setzero_pd(),(__m128d)Z45QW);
793 A67QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFFFFFFFFFE00)])), _mm_setzero_pd(),(__m128d)Z67QW);
795 Y45QW = _mm_add_pd(_mm_mul_pd(X00QW, A01QW), Y45QW); Z01QW=_mm_slli_epi64(Z01QW,8);
796 Y67QW = _mm_add_pd(_mm_mul_pd(X00QW, A45QW), Y67QW); Z45QW=_mm_slli_epi64(Z45QW,8);
797 Y45QW = _mm_add_pd(_mm_mul_pd(X11QW, A23QW), Y45QW); Z23QW=_mm_slli_epi64(Z23QW,8);
798 Y67QW = _mm_add_pd(_mm_mul_pd(X11QW, A67QW), Y67QW); Z67QW=_mm_slli_epi64(Z67QW,8);
800 YT0QW = _mm_add_pd(_mm_mul_pd(X45QW, A01QW), YT0QW);
801 YT1QW = _mm_add_pd(_mm_mul_pd(X45QW, A23QW), YT1QW);
802 YT0QW = _mm_add_pd(_mm_mul_pd(X67QW, A45QW), YT0QW);
803 YT1QW = _mm_add_pd(_mm_mul_pd(X67QW, A67QW), YT1QW);
805 A01QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFFFFFFFFFF80)])), _mm_setzero_pd(),(__m128d)Z01QW);
806 A23QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFFFFFFFFFFE0)])), _mm_setzero_pd(),(__m128d)Z23QW);
807 A45QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFFFFFFFFFFF8)])), _mm_setzero_pd(),(__m128d)Z45QW);
808 A67QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFFFFFFFFFFFE)])), _mm_setzero_pd(),(__m128d)Z67QW);
810 Y45QW = _mm_add_pd(_mm_mul_pd(X22QW, A01QW), Y45QW);
811 Y67QW = _mm_add_pd(_mm_mul_pd(X22QW, A45QW), Y67QW);
812 Y45QW = _mm_add_pd(_mm_mul_pd(X33QW, A23QW), Y45QW);
813 Y67QW = _mm_add_pd(_mm_mul_pd(X33QW, A67QW), Y67QW);
815 YT2QW = _mm_add_pd(_mm_mul_pd(X45QW, A01QW), YT2QW);
816 YT3QW = _mm_add_pd(_mm_mul_pd(X45QW, A23QW), YT3QW);
817 YT2QW = _mm_add_pd(_mm_mul_pd(X67QW, A45QW), YT2QW);
818 YT3QW = _mm_add_pd(_mm_mul_pd(X67QW, A67QW), YT3QW);
821 _V += __builtin_popcountll(M[ind]);
824 _mm_store_pd(&Y[Ri],Y01QW);
825 _mm_store_pd(&Y[Ri+2],Y23QW);
826 _mm_store_pd(&Y[Ri+4],Y45QW);
827 _mm_store_pd(&Y[Ri+6],Y67QW);
834 YT[Ci+4] += yt0.
f64[0] + yt0.
f64[1];
835 YT[Ci+5] += yt1.
f64[0] + yt1.
f64[1];
836 YT[Ci+6] += yt2.
f64[0] + yt2.
f64[1];
837 YT[Ci+7] += yt3.
f64[0] + yt3.
f64[1];
// ---------------------------------------------------------------------------
// SSEsym (16-bit mask, in-place variant): one symmetric SpMV pass over 4x4
// register blocks using SSE2/SSE4.1 double-precision intrinsics.
//
// NOTE(review): this text is a mangled extraction of the original file — the
// bare numbers prefixing many lines ("844", "851", ...) are original source
// line numbers fused into the code, statements are wrapped mid-token, and
// several interior lines (the function's braces, the preprocessor conditional
// around the two Z01QW constructions, and the yt0..yt3 horizontal-sum union
// declarations used at the bottom) are missing from this view.  Code is left
// byte-identical; only comments were added.
//
//   V       packed nonzero values, one double per set bit of each mask word
//   M[ind]  16-bit nonzero pattern of register block `ind' (bit 15 first)
//   bot     packed block coordinates: column index in the low bits
//           (& lowmask), row index in the next field (>> nlbits, & lowmask)
//   nrb     number of register blocks
//   X, Y    input / output vectors; the transposed (symmetric) contribution
//           is accumulated back into Y[Ci..Ci+3] at the end of each block
844 void SSEsym(
const double * __restrict V,
const unsigned short * __restrict M,
const unsigned * __restrict bot,
const unsigned nrb,
845 const double * __restrict X,
double * Y,
unsigned lowmask,
unsigned nlbits)
// _V is biased one element back: the inclusive popcount of the mask prefix
// (see the A??QW loads below) is a 1-based offset, so _V[1] == V[0].
847 const double * __restrict _V = V-1;
849 for(
unsigned ind=0;ind<nrb;++ind)
// Decode the block's column (Ci) and row (Ri) base indices from bot[ind].
851 const unsigned Ci = bot[ind] & lowmask;
852 const unsigned Ri = (bot[ind] >> nlbits) & lowmask;
// Complement the mask and shift it to the top 16 bits of a 64-bit word so
// that, after the staggered 1/2/4/6-bit shifts below, each pattern bit sits
// in the sign bit of a 64-bit lane.  _mm_blendv_pd selects the zero operand
// wherever that sign bit is set, i.e. wherever the block stores no entry.
854 const uint64_t m64 = (uint64_t) M[ind];
855 const uint64_t Zi = ((~m64) << 48);
856 const uint64_t Zil = Zi << 1;
// Two alternative constructions of Z01QW = {Zi, Zil}; the preprocessor
// conditional that picks one (SSE2 vs SSE4.1 path) is missing from this view.
859 __m128i Z01QW = _mm_unpacklo_epi64 (_mm_loadl_epi64((__m128i*)&Zi), _mm_loadl_epi64((__m128i*)&Zil));
861 __m128i Z01QW = _mm_insert_epi64(_mm_loadl_epi64((__m128i*)&Zi),Zil,1);
// Selector masks for element pairs 2/3, 4/5 and 6/7 of the block pattern.
863 __m128i Z23QW = _mm_slli_epi64(Z01QW, 2);
864 __m128i Z45QW = _mm_slli_epi64(Z01QW, 4);
865 __m128i Z67QW = _mm_slli_epi64(Z01QW, 6);
// Running output for the block's row range.
867 __m128d Y01QW = _mm_loadu_pd(&Y[Ri]);
868 __m128d Y23QW = _mm_loadu_pd(&Y[Ri+2]);
// Broadcast the four X entries of the block's column range...
871 __m128d X00QW = _mm_loaddup_pd(&X[0+Ci]);
872 __m128d X11QW = _mm_loaddup_pd(&X[1+Ci]);
873 __m128d X22QW = _mm_loaddup_pd(&X[2+Ci]);
874 __m128d X33QW = _mm_loaddup_pd(&X[3+Ci]);
// ...and the row-range X pairs used for the transposed contribution.
876 __m128d X01QW = _mm_loadu_pd(&X[Ri]);
877 __m128d X23QW = _mm_loadu_pd(&X[Ri+2]);
// Gather two packed values per pair: __builtin_popcount of the mask bits
// at-or-before each position indexes into the 1-biased _V; lanes with no
// stored entry are blended to 0.0.  The (float*)/_mm_loadu_ps cast is just
// an unaligned 16-byte load reinterpreted as two doubles.
879 __m128d A01QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcount(M[ind]&0x8000)])), _mm_setzero_pd(),(__m128d)Z01QW);
880 __m128d A23QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcount(M[ind]&0xE000)])), _mm_setzero_pd(),(__m128d)Z23QW);
881 __m128d A45QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcount(M[ind]&0xF800)])), _mm_setzero_pd(),(__m128d)Z45QW);
882 __m128d A67QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcount(M[ind]&0xFE00)])), _mm_setzero_pd(),(__m128d)Z67QW);
// y(rows) += A * x(cols) for block columns 0 and 1; shift the Z selectors
// forward 8 bits as their lanes are consumed.
884 Y01QW = _mm_add_pd(_mm_mul_pd(X00QW, A01QW), Y01QW); Z01QW=_mm_slli_epi64(Z01QW,8);
885 Y23QW = _mm_add_pd(_mm_mul_pd(X00QW, A45QW), Y23QW); Z45QW=_mm_slli_epi64(Z45QW,8);
886 Y01QW = _mm_add_pd(_mm_mul_pd(X11QW, A23QW), Y01QW); Z23QW=_mm_slli_epi64(Z23QW,8);
887 Y23QW = _mm_add_pd(_mm_mul_pd(X11QW, A67QW), Y23QW); Z67QW=_mm_slli_epi64(Z67QW,8);
// Transposed contribution for columns 0/1: partial dot products of the same
// A values against the row-range X.
889 __m128d Y00QW = _mm_mul_pd(X01QW, A01QW);
890 __m128d Y11QW = _mm_mul_pd(X01QW, A23QW);
891 Y00QW = _mm_add_pd(_mm_mul_pd(X23QW, A45QW), Y00QW);
892 Y11QW = _mm_add_pd(_mm_mul_pd(X23QW, A67QW), Y11QW);
// Second gather: block columns 2 and 3 (mask prefixes 0xFF80..0xFFFE).
895 A01QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcount(M[ind]&0xFF80)])), _mm_setzero_pd(),(__m128d)Z01QW);
896 A23QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcount(M[ind]&0xFFE0)])), _mm_setzero_pd(),(__m128d)Z23QW);
897 A45QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcount(M[ind]&0xFFF8)])), _mm_setzero_pd(),(__m128d)Z45QW);
898 A67QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcount(M[ind]&0xFFFE)])), _mm_setzero_pd(),(__m128d)Z67QW);
900 Y01QW = _mm_add_pd(_mm_mul_pd(X22QW, A01QW), Y01QW);
901 Y23QW = _mm_add_pd(_mm_mul_pd(X22QW, A45QW), Y23QW);
902 Y01QW = _mm_add_pd(_mm_mul_pd(X33QW, A23QW), Y01QW);
903 Y23QW = _mm_add_pd(_mm_mul_pd(X33QW, A67QW), Y23QW);
// Transposed contribution for columns 2/3.
905 __m128d Y22QW = _mm_mul_pd(X01QW, A01QW);
906 __m128d Y33QW = _mm_mul_pd(X01QW, A23QW);
907 Y22QW = _mm_add_pd(_mm_mul_pd(X23QW, A45QW), Y22QW);
908 Y33QW = _mm_add_pd(_mm_mul_pd(X23QW, A67QW), Y33QW);
// Advance _V past every value this block consumed.
911 _V += __builtin_popcount(M[ind]);
// NOTE(review): these stores assume &Y[Ri] is 16-byte aligned (store_pd,
// not storeu_pd) — confirm against the caller's Y allocation.
914 _mm_store_pd(&Y[Ri],Y01QW);
915 _mm_store_pd(&Y[Ri+2],Y23QW);
// Horizontal sums of the transposed partials.  yt0..yt3 are SIMD/scalar
// unions declared on lines missing from this view — presumably loaded from
// Y00QW..Y33QW just above; TODO confirm against the full source.
924 Y[Ci+0] += yt0.
f64[0] + yt0.
f64[1];
925 Y[Ci+1] += yt1.
f64[0] + yt1.
f64[1];
926 Y[Ci+2] += yt2.
f64[0] + yt2.
f64[1];
927 Y[Ci+3] += yt3.
f64[0] + yt3.
f64[1];
// ---------------------------------------------------------------------------
// SSEsym (16-bit mask, split-output variant): symmetric SpMV pass over 4x4
// register blocks where the direct product accumulates into Y (indexed by the
// block row Ri) and the transposed product reads XT and accumulates into YT
// (indexed by the block column Ci).
//
// NOTE(review): mangled extraction — the bare numeric prefixes are original
// source line numbers fused into the code, statements are wrapped mid-token,
// and interior lines (braces, the preprocessor conditional around the two
// Z01QW constructions, the yt0..yt3 union declarations) are missing from
// this view.  Code left byte-identical; only comments added.
931 void SSEsym(
const double * __restrict V,
const unsigned short * __restrict M,
const unsigned * __restrict bot,
const unsigned nrb,
932 const double * __restrict X,
const double * __restrict XT,
double * Y,
double * YT,
unsigned lowmask,
unsigned nlbits)
// One-element bias so the inclusive mask-prefix popcount below is a valid
// 1-based index into the packed value stream.
934 const double * __restrict _V = V-1;
938 for(
unsigned ind=0;ind<nrb;++ind)
// Block column (low bits) and row (next field) decoded from bot[ind].
940 const unsigned Ci = bot[ind] & lowmask;
941 const unsigned Ri = (bot[ind] >> nlbits) & lowmask;
// Complemented mask moved to the top bits: after the staggered shifts each
// pattern bit occupies a 64-bit lane's sign bit, which _mm_blendv_pd uses to
// substitute 0.0 for absent entries.
943 const uint64_t m64 = (uint64_t) M[ind];
944 const uint64_t Zi = ((~m64) << 48);
945 const uint64_t Zil = Zi << 1;
// Two alternative builds of Z01QW = {Zi, Zil}; the #if choosing between the
// SSE2 and SSE4.1 forms is missing from this view.
948 __m128i Z01QW = _mm_unpacklo_epi64 (_mm_loadl_epi64((__m128i*)&Zi), _mm_loadl_epi64((__m128i*)&Zil));
950 __m128i Z01QW = _mm_insert_epi64(_mm_loadl_epi64((__m128i*)&Zi),Zil,1);
952 __m128i Z23QW = _mm_slli_epi64(Z01QW, 2);
953 __m128i Z45QW = _mm_slli_epi64(Z01QW, 4);
954 __m128i Z67QW = _mm_slli_epi64(Z01QW, 6);
// Direct-product accumulators for the row range.
956 __m128d Y01QW = _mm_loadu_pd(&Y[Ri]);
957 __m128d Y23QW = _mm_loadu_pd(&Y[Ri+2]);
// Broadcast column-range X entries (direct product) ...
960 __m128d X00QW = _mm_loaddup_pd(&X[0+Ci]);
961 __m128d X11QW = _mm_loaddup_pd(&X[1+Ci]);
962 __m128d X22QW = _mm_loaddup_pd(&X[2+Ci]);
963 __m128d X33QW = _mm_loaddup_pd(&X[3+Ci]);
// ... and row-range pairs from XT (transposed product).
965 __m128d X01QW = _mm_loadu_pd(&XT[Ri]);
966 __m128d X23QW = _mm_loadu_pd(&XT[Ri+2]);
// Gather packed values: inclusive mask-prefix popcount indexes _V; lanes
// without a stored entry blend to 0.0.  (float*)/_mm_loadu_ps is just an
// unaligned 16-byte load reinterpreted as two doubles.
968 __m128d A01QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcount(M[ind]&0x8000)])), _mm_setzero_pd(),(__m128d)Z01QW);
969 __m128d A23QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcount(M[ind]&0xE000)])), _mm_setzero_pd(),(__m128d)Z23QW);
970 __m128d A45QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcount(M[ind]&0xF800)])), _mm_setzero_pd(),(__m128d)Z45QW);
971 __m128d A67QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcount(M[ind]&0xFE00)])), _mm_setzero_pd(),(__m128d)Z67QW);
// Direct product, block columns 0/1; Z selectors advance 8 bits as consumed.
983 Y01QW = _mm_add_pd(_mm_mul_pd(X00QW, A01QW), Y01QW); Z01QW=_mm_slli_epi64(Z01QW,8);
984 Y23QW = _mm_add_pd(_mm_mul_pd(X00QW, A45QW), Y23QW); Z45QW=_mm_slli_epi64(Z45QW,8);
985 Y01QW = _mm_add_pd(_mm_mul_pd(X11QW, A23QW), Y01QW); Z23QW=_mm_slli_epi64(Z23QW,8);
986 Y23QW = _mm_add_pd(_mm_mul_pd(X11QW, A67QW), Y23QW); Z67QW=_mm_slli_epi64(Z67QW,8);
// Transposed partials for columns 0/1.
988 __m128d YT0QW = _mm_mul_pd(X01QW, A01QW);
989 __m128d YT1QW = _mm_mul_pd(X01QW, A23QW);
990 YT0QW = _mm_add_pd(_mm_mul_pd(X23QW, A45QW), YT0QW);
991 YT1QW = _mm_add_pd(_mm_mul_pd(X23QW, A67QW), YT1QW);
// Horizontal sums into YT.  yt0/yt1 are SIMD/scalar unions declared on lines
// missing from this view — presumably filled from YT0QW/YT1QW; TODO confirm
// against the full source.
998 YT[Ci+0] += yt0.
f64[0] + yt0.
f64[1];
999 YT[Ci+1] += yt1.
f64[0] + yt1.
f64[1];
// Second gather: block columns 2/3 (mask prefixes 0xFF80..0xFFFE).
1003 A01QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcount(M[ind]&0xFF80)])), _mm_setzero_pd(),(__m128d)Z01QW);
1004 A23QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcount(M[ind]&0xFFE0)])), _mm_setzero_pd(),(__m128d)Z23QW);
1005 A45QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcount(M[ind]&0xFFF8)])), _mm_setzero_pd(),(__m128d)Z45QW);
1006 A67QW = _mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcount(M[ind]&0xFFFE)])), _mm_setzero_pd(),(__m128d)Z67QW);
1008 Y01QW = _mm_add_pd(_mm_mul_pd(X22QW, A01QW), Y01QW);
1009 Y23QW = _mm_add_pd(_mm_mul_pd(X22QW, A45QW), Y23QW);
1010 Y01QW = _mm_add_pd(_mm_mul_pd(X33QW, A23QW), Y01QW);
1011 Y23QW = _mm_add_pd(_mm_mul_pd(X33QW, A67QW), Y23QW);
// Transposed partials for columns 2/3.
1013 __m128d YT2QW = _mm_mul_pd(X01QW, A01QW);
1014 __m128d YT3QW = _mm_mul_pd(X01QW, A23QW);
1015 YT2QW = _mm_add_pd(_mm_mul_pd(X23QW, A45QW), YT2QW);
1016 YT3QW = _mm_add_pd(_mm_mul_pd(X23QW, A67QW), YT3QW);
// Horizontal sums into YT (yt2/yt3 unions declared on missing lines).
1023 YT[Ci+2] += yt2.
f64[0] + yt2.
f64[1];
1024 YT[Ci+3] += yt3.
f64[0] + yt3.
f64[1];
// Advance past this block's packed values.
1028 _V += __builtin_popcount(M[ind]);
// NOTE(review): aligned stores — assumes &Y[Ri] is 16-byte aligned; confirm
// against the caller's Y allocation.
1031 _mm_store_pd(&Y[Ri],Y01QW);
1032 _mm_store_pd(&Y[Ri+2],Y23QW);
// ---------------------------------------------------------------------------
// SSEspmv (16-bit mask variant): non-symmetric SpMV over 4x4 register blocks.
// Y[Ri..Ri+3] += A(block) * X[Ci..Ci+3], where the block's nonzero pattern is
// the 16-bit word M[ind] and its values are packed consecutively in V.
//
// NOTE(review): mangled extraction — the bare numeric prefixes are original
// source line numbers fused into the code, statements are wrapped mid-token,
// and interior lines (braces, the preprocessor conditional around the two
// Z01QW constructions) are missing from this view.  Code left byte-identical;
// only comments added.
1047 void SSEspmv(
const double * __restrict V,
const unsigned short * __restrict M,
const unsigned * __restrict bot,
const unsigned nrb,
const double * __restrict X,
double * Y,
unsigned lcmask,
unsigned lrmask,
unsigned clbits)
// One-element bias so the inclusive mask-prefix popcount is a 1-based index.
1049 const double * __restrict _V = V-1;
1053 for(
unsigned ind=0;ind<nrb;++ind)
// Column (low clbits field) and row (next field) of this register block.
1055 const unsigned Ci = bot[ind] & lcmask;
1056 const unsigned Ri = (bot[ind] >> clbits) & lrmask;
// Complemented mask shifted to the top bits; after the staggered shifts each
// pattern bit lands in a 64-bit lane's sign bit, which _mm_blendv_pd uses to
// substitute 0.0 for lanes with no stored entry.
1058 const uint64_t m64 = (uint64_t) M[ind];
1059 const uint64_t Zi = ((~m64) << 48);
1060 const uint64_t Zil = Zi << 1;
// Two alternative builds of Z01QW = {Zi, Zil}; the #if choosing between the
// SSE2 and SSE4.1 forms is missing from this view.
1063 __m128i Z01QW = _mm_unpacklo_epi64 (_mm_loadl_epi64((__m128i*)&Zi), _mm_loadl_epi64((__m128i*)&Zil));
1065 __m128i Z01QW = _mm_insert_epi64(_mm_loadl_epi64((__m128i*)&Zi),Zil,1);
1067 __m128i Z23QW = _mm_slli_epi64(Z01QW, 2);
1068 __m128i Z45QW = _mm_slli_epi64(Z01QW, 4);
1069 __m128i Z67QW = _mm_slli_epi64(Z01QW, 6);
// Output accumulators for the block's row range.
1071 __m128d Y01QW = _mm_loadu_pd(&Y[Ri]);
1072 __m128d Y23QW = _mm_loadu_pd(&Y[Ri+2]);
// Broadcast the four X entries of the block's column range.
1075 __m128d X00QW = _mm_loaddup_pd(&X[0+Ci]);
1076 __m128d X11QW = _mm_loaddup_pd(&X[1+Ci]);
1077 __m128d X22QW = _mm_loaddup_pd(&X[2+Ci]);
1078 __m128d X33QW = _mm_loaddup_pd(&X[3+Ci]);
// Fused gather/blend/multiply/accumulate, columns 0 and 1: the inclusive
// mask-prefix popcount indexes _V, absent lanes blend to 0.0, and the Z
// selectors shift forward 8 bits as they are consumed.  (float*)/loadu_ps is
// just an unaligned 16-byte load reinterpreted as two doubles.
1090 Y01QW = _mm_add_pd(_mm_mul_pd(X00QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcount(M[ind]&0x8000)])),_mm_setzero_pd(),(__m128d)Z01QW)),Y01QW);Z01QW=_mm_slli_epi64(Z01QW,8);
1091 Y23QW = _mm_add_pd(_mm_mul_pd(X00QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcount(M[ind]&0xF800)])),_mm_setzero_pd(),(__m128d)Z45QW)),Y23QW);Z45QW=_mm_slli_epi64(Z45QW,8);
1092 Y01QW = _mm_add_pd(_mm_mul_pd(X11QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcount(M[ind]&0xE000)])),_mm_setzero_pd(),(__m128d)Z23QW)),Y01QW);Z23QW=_mm_slli_epi64(Z23QW,8);
1093 Y23QW = _mm_add_pd(_mm_mul_pd(X11QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcount(M[ind]&0xFE00)])),_mm_setzero_pd(),(__m128d)Z67QW)),Y23QW);Z67QW=_mm_slli_epi64(Z67QW,8);
// Columns 2 and 3 (mask prefixes 0xFF80..0xFFFE).
1104 Y01QW = _mm_add_pd(_mm_mul_pd(X22QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcount(M[ind]&0xFF80)])),_mm_setzero_pd(),(__m128d)Z01QW)),Y01QW);
1105 Y23QW = _mm_add_pd(_mm_mul_pd(X22QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcount(M[ind]&0xFFF8)])),_mm_setzero_pd(),(__m128d)Z45QW)),Y23QW);
1106 Y01QW = _mm_add_pd(_mm_mul_pd(X33QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcount(M[ind]&0xFFE0)])),_mm_setzero_pd(),(__m128d)Z23QW)),Y01QW);
1107 Y23QW = _mm_add_pd(_mm_mul_pd(X33QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcount(M[ind]&0xFFFE)])),_mm_setzero_pd(),(__m128d)Z67QW)),Y23QW);
// Advance past this block's packed values.
1110 _V += __builtin_popcount(M[ind]);
// NOTE(review): aligned stores — assumes &Y[Ri] is 16-byte aligned; confirm
// against the caller's Y allocation.
1113 _mm_store_pd(&Y[Ri],Y01QW);
1114 _mm_store_pd(&Y[Ri+2],Y23QW);
// ---------------------------------------------------------------------------
// SSEspmv (64-bit mask variant): non-symmetric SpMV over 8x8 register blocks.
// Y[Ri..Ri+7] += A(block) * X[Ci..Ci+7]; M[ind] is the 64-bit nonzero
// pattern (MSB first), V the packed values.  The block is processed as two
// 8x4 halves: columns 0..3 feed Y[Ri..Ri+7], then the X broadcasts are
// reloaded for columns 4..7.
//
// NOTE(review): mangled extraction — the bare numeric prefixes are original
// source line numbers fused into the code, statements are wrapped mid-token,
// and interior lines (braces, the preprocessor conditional around the two
// Z01QW constructions) are missing from this view.  Code left byte-identical;
// only comments added.
1120 void SSEspmv(
const double * __restrict V,
const uint64_t * __restrict M,
const unsigned * __restrict bot,
const unsigned nrb,
const double * __restrict X,
double * Y,
unsigned lcmask,
unsigned lrmask,
unsigned clbits)
// One-element bias so the inclusive mask-prefix popcount is a 1-based index.
1122 const double * __restrict _V = V-1;
1126 for(
unsigned ind=0;ind<nrb;++ind)
// Column (low field) and row (next field) of this register block.
1128 const unsigned Ci = bot[ind] & lcmask;
1129 const unsigned Ri = (bot[ind] >> clbits) & lrmask;
// Complemented 64-bit mask: each pattern bit reaches a lane sign bit via the
// staggered shifts below, steering _mm_blendv_pd to 0.0 for absent entries.
1130 const uint64_t Zi = ~M[ind];
// Eight output accumulators' worth of Y (four __m128d pairs).
1132 __m128d Y01QW = _mm_loadu_pd(&Y[Ri]);
1133 __m128d Y23QW = _mm_loadu_pd(&Y[Ri+2]);
1134 __m128d Y45QW = _mm_loadu_pd(&Y[Ri+4]);
1135 __m128d Y67QW = _mm_loadu_pd(&Y[Ri+6]);
// Two alternative builds of Z01QW = {Zi, Zi<<1}; the #if choosing between
// the SSE2 and SSE4.1 forms is missing from this view.
1138 const uint64_t Zil = Zi << 1;
1139 __m128i Z01QW = _mm_unpacklo_epi64 (_mm_loadl_epi64((__m128i*)&Zi), _mm_loadl_epi64((__m128i*)&Zil));
1141 __m128i Z01QW = _mm_insert_epi64(_mm_loadl_epi64((__m128i*)&Zi),Zi<<1,1);
1143 __m128i Z23QW = _mm_slli_epi64(Z01QW, 2);
1144 __m128i Z45QW = _mm_slli_epi64(Z01QW, 4);
1145 __m128i Z67QW = _mm_slli_epi64(Z01QW, 6);
// Broadcasts of X for block columns 0..3 (reloaded later for 4..7).
1148 __m128d X00QW = _mm_loaddup_pd(&X[0+Ci]);
1149 __m128d X11QW = _mm_loaddup_pd(&X[1+Ci]);
1150 __m128d X22QW = _mm_loaddup_pd(&X[2+Ci]);
1151 __m128d X33QW = _mm_loaddup_pd(&X[3+Ci]);
// Rows 0..3 x columns 0..1: fused gather (inclusive mask-prefix popcount
// into the 1-biased _V), blend-to-zero, multiply, accumulate; Z selectors
// shift forward 8 bits as consumed.  (float*)/loadu_ps is an unaligned
// 16-byte load reinterpreted as two doubles.
1163 Y01QW = _mm_add_pd(_mm_mul_pd(X00QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0x8000000000000000)])),_mm_setzero_pd(),(__m128d)Z01QW)),Y01QW);Z01QW=_mm_slli_epi64(Z01QW,8);
1164 Y23QW = _mm_add_pd(_mm_mul_pd(X00QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xF800000000000000)])),_mm_setzero_pd(),(__m128d)Z45QW)),Y23QW);Z45QW=_mm_slli_epi64(Z45QW,8);
1165 Y01QW = _mm_add_pd(_mm_mul_pd(X11QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xE000000000000000)])),_mm_setzero_pd(),(__m128d)Z23QW)),Y01QW);Z23QW=_mm_slli_epi64(Z23QW,8);
1166 Y23QW = _mm_add_pd(_mm_mul_pd(X11QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFE00000000000000)])),_mm_setzero_pd(),(__m128d)Z67QW)),Y23QW);Z67QW=_mm_slli_epi64(Z67QW,8);
// Rows 0..3 x columns 2..3.
1177 Y01QW = _mm_add_pd(_mm_mul_pd(X22QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFF80000000000000)])),_mm_setzero_pd(),(__m128d)Z01QW)),Y01QW);Z01QW=_mm_slli_epi64(Z01QW,8);
1178 Y23QW = _mm_add_pd(_mm_mul_pd(X22QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFF8000000000000)])),_mm_setzero_pd(),(__m128d)Z45QW)),Y23QW);Z45QW=_mm_slli_epi64(Z45QW,8);
1179 Y01QW = _mm_add_pd(_mm_mul_pd(X33QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFE0000000000000)])),_mm_setzero_pd(),(__m128d)Z23QW)),Y01QW);Z23QW=_mm_slli_epi64(Z23QW,8);
1180 Y23QW = _mm_add_pd(_mm_mul_pd(X33QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFE000000000000)])),_mm_setzero_pd(),(__m128d)Z67QW)),Y23QW);Z67QW=_mm_slli_epi64(Z67QW,8);
// Rows 4..7 x columns 0..1.
1183 Y45QW = _mm_add_pd(_mm_mul_pd(X00QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFF800000000000)])),_mm_setzero_pd(),(__m128d)Z01QW)),Y45QW);Z01QW=_mm_slli_epi64(Z01QW,8);
1184 Y67QW = _mm_add_pd(_mm_mul_pd(X00QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFF80000000000)])),_mm_setzero_pd(),(__m128d)Z45QW)),Y67QW);Z45QW=_mm_slli_epi64(Z45QW,8);
1185 Y45QW = _mm_add_pd(_mm_mul_pd(X11QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFE00000000000)])),_mm_setzero_pd(),(__m128d)Z23QW)),Y45QW);Z23QW=_mm_slli_epi64(Z23QW,8);
1186 Y67QW = _mm_add_pd(_mm_mul_pd(X11QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFE0000000000)])),_mm_setzero_pd(),(__m128d)Z67QW)),Y67QW);Z67QW=_mm_slli_epi64(Z67QW,8);
// Rows 4..7 x columns 2..3.
1188 Y45QW = _mm_add_pd(_mm_mul_pd(X22QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFF8000000000)])),_mm_setzero_pd(),(__m128d)Z01QW)),Y45QW);Z01QW=_mm_slli_epi64(Z01QW,8);
1189 Y45QW = _mm_add_pd(_mm_mul_pd(X33QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFFE000000000)])),_mm_setzero_pd(),(__m128d)Z23QW)),Y45QW);Z23QW=_mm_slli_epi64(Z23QW,8);
1190 Y67QW = _mm_add_pd(_mm_mul_pd(X22QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFFF800000000)])),_mm_setzero_pd(),(__m128d)Z45QW)),Y67QW);Z45QW=_mm_slli_epi64(Z45QW,8);
1191 Y67QW = _mm_add_pd(_mm_mul_pd(X33QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFFFE00000000)])),_mm_setzero_pd(),(__m128d)Z67QW)),Y67QW);Z67QW=_mm_slli_epi64(Z67QW,8);
// Reload X broadcasts for the block's second column half (columns 4..7).
1196 X00QW = _mm_loaddup_pd(&X[4+Ci]);
1197 X11QW = _mm_loaddup_pd(&X[5+Ci]);
1198 X22QW = _mm_loaddup_pd(&X[6+Ci]);
1199 X33QW = _mm_loaddup_pd(&X[7+Ci]);
// Rows 0..3 x columns 4..5.
1201 Y01QW = _mm_add_pd(_mm_mul_pd(X00QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFFFF80000000)])),_mm_setzero_pd(),(__m128d)Z01QW)),Y01QW);Z01QW=_mm_slli_epi64(Z01QW,8);
1202 Y23QW = _mm_add_pd(_mm_mul_pd(X00QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFFFFF8000000)])),_mm_setzero_pd(),(__m128d)Z45QW)),Y23QW);Z45QW=_mm_slli_epi64(Z45QW,8);
1203 Y01QW = _mm_add_pd(_mm_mul_pd(X11QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFFFFE0000000)])),_mm_setzero_pd(),(__m128d)Z23QW)),Y01QW);Z23QW=_mm_slli_epi64(Z23QW,8);
1204 Y23QW = _mm_add_pd(_mm_mul_pd(X11QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFFFFFE000000)])),_mm_setzero_pd(),(__m128d)Z67QW)),Y23QW);Z67QW=_mm_slli_epi64(Z67QW,8);
// Rows 0..3 x columns 6..7.
1206 Y01QW = _mm_add_pd(_mm_mul_pd(X22QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFFFFFF800000)])),_mm_setzero_pd(),(__m128d)Z01QW)),Y01QW);Z01QW=_mm_slli_epi64(Z01QW,8);
1207 Y23QW = _mm_add_pd(_mm_mul_pd(X22QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFFFFFFF80000)])),_mm_setzero_pd(),(__m128d)Z45QW)),Y23QW);Z45QW=_mm_slli_epi64(Z45QW,8);
1208 Y01QW = _mm_add_pd(_mm_mul_pd(X33QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFFFFFFE00000)])),_mm_setzero_pd(),(__m128d)Z23QW)),Y01QW);Z23QW=_mm_slli_epi64(Z23QW,8);
1209 Y23QW = _mm_add_pd(_mm_mul_pd(X33QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFFFFFFFE0000)])),_mm_setzero_pd(),(__m128d)Z67QW)),Y23QW);Z67QW=_mm_slli_epi64(Z67QW,8);
// Rows 4..7 x columns 4..5.
1211 Y45QW = _mm_add_pd(_mm_mul_pd(X00QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFFFFFFFF8000)])),_mm_setzero_pd(),(__m128d)Z01QW)),Y45QW);Z01QW=_mm_slli_epi64(Z01QW,8);
1212 Y67QW = _mm_add_pd(_mm_mul_pd(X00QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFFFFFFFFF800)])),_mm_setzero_pd(),(__m128d)Z45QW)),Y67QW);Z45QW=_mm_slli_epi64(Z45QW,8);
1213 Y45QW = _mm_add_pd(_mm_mul_pd(X11QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFFFFFFFFE000)])),_mm_setzero_pd(),(__m128d)Z23QW)),Y45QW);Z23QW=_mm_slli_epi64(Z23QW,8);
1214 Y67QW = _mm_add_pd(_mm_mul_pd(X11QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFFFFFFFFFE00)])),_mm_setzero_pd(),(__m128d)Z67QW)),Y67QW);Z67QW=_mm_slli_epi64(Z67QW,8);
// Rows 4..7 x columns 6..7 (last group: Z selectors no longer advanced).
1216 Y45QW = _mm_add_pd(_mm_mul_pd(X22QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFFFFFFFFFF80)])),_mm_setzero_pd(),(__m128d)Z01QW)),Y45QW);
1217 Y67QW = _mm_add_pd(_mm_mul_pd(X22QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFFFFFFFFFFF8)])),_mm_setzero_pd(),(__m128d)Z45QW)),Y67QW);
1218 Y45QW = _mm_add_pd(_mm_mul_pd(X33QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFFFFFFFFFFE0)])),_mm_setzero_pd(),(__m128d)Z23QW)),Y45QW);
1219 Y67QW = _mm_add_pd(_mm_mul_pd(X33QW,_mm_blendv_pd((__m128d)_mm_loadu_ps((
float*)&(_V[__builtin_popcountll(M[ind]&0xFFFFFFFFFFFFFFFE)])),_mm_setzero_pd(),(__m128d)Z67QW)),Y67QW);
// Advance past this block's packed values.
1222 _V += __builtin_popcountll(M[ind]);
// NOTE(review): aligned stores — assumes &Y[Ri] is 16-byte aligned; confirm
// against the caller's Y allocation.
1225 _mm_store_pd(&Y[Ri],Y01QW);
1226 _mm_store_pd(&Y[Ri+2],Y23QW);
1227 _mm_store_pd(&Y[Ri+4],Y45QW);
1228 _mm_store_pd(&Y[Ri+6],Y67QW);
/**
 * popcountall (byte variant): counts[i] = number of set bits in the low
 * nibble of M[i], for i in [0, n).  Only the low 4 bits of each byte mask
 * are meaningful for the 4x4 register-block kernels, hence `& 0x0F'.
 *
 * The extracted original was an 8-way manually unrolled loop driven by a
 * helper count `nn' whose declaration is missing from this view; the unroll
 * duplicated the same statement eight times for no semantic gain.  A single
 * loop computes the identical result and lets the compiler unroll/vectorize.
 *
 * @param M       input mask bytes (read-only, no aliasing with counts)
 * @param counts  output array of n popcounts
 * @param n       number of elements; n == 0 is a no-op
 */
void popcountall(
const unsigned char * __restrict M,
unsigned * __restrict counts,
size_t n)
{
	for (size_t i = 0; i < n; ++i)
		counts[i] = (unsigned) __builtin_popcount(M[i] & 0x0F);
}
/**
 * popcountall (16-bit variant): counts[i] = number of set bits in M[i],
 * for i in [0, n).  Used to size the packed value stream consumed by the
 * 16-bit-mask register-block kernels.
 *
 * The extracted original was an 8-way manually unrolled loop driven by a
 * helper count `nn' whose declaration is missing from this view; the unroll
 * duplicated the same statement eight times for no semantic gain.  A single
 * loop computes the identical result and lets the compiler unroll/vectorize.
 *
 * @param M       input 16-bit masks (read-only, no aliasing with counts)
 * @param counts  output array of n popcounts
 * @param n       number of elements; n == 0 is a no-op
 */
void popcountall(
const unsigned short * __restrict M,
unsigned * __restrict counts,
size_t n)
{
	for (size_t i = 0; i < n; ++i)
		counts[i] = (unsigned) __builtin_popcount(M[i]);
}
/**
 * popcountall (64-bit variant): counts[i] = number of set bits in M[i],
 * for i in [0, n).  Used to size the packed value stream consumed by the
 * 64-bit-mask (8x8 block) kernels.
 *
 * Fixes: the original called __builtin_popcountl, which takes `unsigned
 * long' — only 32 bits on LLP64 targets (e.g. 64-bit Windows), silently
 * truncating the uint64_t argument and undercounting.  __builtin_popcountll
 * (`unsigned long long', >= 64 bits everywhere) is the portable choice.
 * Also collapses the 8-way manual unroll (driven by a helper count `nn'
 * missing from this view) into a single equivalent loop.
 *
 * @param M       input 64-bit masks (read-only, no aliasing with counts)
 * @param counts  output array of n popcounts
 * @param n       number of elements; n == 0 is a no-op
 */
void popcountall(
const uint64_t * __restrict M,
unsigned * __restrict counts,
size_t n)
{
	for (size_t i = 0; i < n; ++i)
		counts[i] = (unsigned) __builtin_popcountll(M[i]);
}
void SSEspmv(const double *__restrict V, const unsigned char *__restrict M, const unsigned *__restrict bot, const unsigned nrb, const double *__restrict X, double *Y, unsigned lcmask, unsigned lrmask, unsigned clbits)
const unsigned short masktable16[16]
const uint64_t masktable64[64]
void symcsr(const double *__restrict V, const uint64_t *__restrict M, const unsigned *__restrict bot, const unsigned nrb, const double *__restrict X, const double *__restrict XT, double *Y, double *YT, unsigned lowmask, unsigned nlowbits)
__m128d ssp_blendv_pd_SSE2(__m128d a, __m128d b, __m128d mask)
void SSEsym(const double *__restrict V, const unsigned char *__restrict M, const unsigned *__restrict bot, const unsigned nrb, const double *__restrict X, double *__restrict Y, unsigned lowmask, unsigned nlbits)
unsigned short BitReverse(unsigned short v)
unsigned long long ssp_u64
const unsigned char BitReverseTable64[]
void atomicallyIncrementDouble(volatile double *target, const double by)
void popcountall(const unsigned char *__restrict M, unsigned *__restrict counts, size_t n)