void atomicallyIncrementDouble(volatile double *target, const double by)
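The body of atomicallyIncrementDouble is not shown in this listing. A minimal sketch of the usual way to get an atomic add on a double, a compare-and-swap loop over the value's 64-bit pattern using the GCC/Clang __sync builtins (an assumption for illustration, not necessarily this library's implementation):

#include <cstdint>
#include <cstring>

// Hypothetical sketch: retry a 64-bit compare-and-swap until the addition
// lands without interference from other threads.
void atomicallyIncrementDouble_sketch(volatile double *target, const double by)
{
	volatile uint64_t *bits = reinterpret_cast<volatile uint64_t*>(target);
	for (;;)
	{
		uint64_t oldbits = *bits;               // snapshot the current bit pattern
		double oldval;
		std::memcpy(&oldval, &oldbits, sizeof(double));
		const double newval = oldval + by;
		uint64_t newbits;
		std::memcpy(&newbits, &newval, sizeof(double));
		// succeeds only if *target still holds the snapshotted pattern
		if (__sync_bool_compare_and_swap(bits, oldbits, newbits))
			return;
	}
}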
void popcountall(const unsigned char * __restrict M, unsigned * __restrict counts, size_t n)
{
	const size_t nn = n/8;	// number of fully unrolled groups of eight (implied by the remainder loop below)
	for(size_t i=0; i<nn; ++i)
	{
		// only the low four bits of each byte are counted (& 0x0F)
		counts[i*8]   = __builtin_popcount(M[i*8]   & 0x0F);
		counts[1+i*8] = __builtin_popcount(M[1+i*8] & 0x0F);
		counts[2+i*8] = __builtin_popcount(M[2+i*8] & 0x0F);
		counts[3+i*8] = __builtin_popcount(M[3+i*8] & 0x0F);
		counts[4+i*8] = __builtin_popcount(M[4+i*8] & 0x0F);
		counts[5+i*8] = __builtin_popcount(M[5+i*8] & 0x0F);
		counts[6+i*8] = __builtin_popcount(M[6+i*8] & 0x0F);
		counts[7+i*8] = __builtin_popcount(M[7+i*8] & 0x0F);
	}
	for(size_t i=nn*8; i<n; ++i)	// remainder loop
		counts[i] = __builtin_popcount(M[i] & 0x0F);
}

void popcountall(const unsigned short * __restrict M, unsigned * __restrict counts, size_t n)
{
	const size_t nn = n/8;
	for(size_t i=0; i<nn; ++i)
	{
		counts[i*8]   = __builtin_popcount(M[i*8]);
		counts[1+i*8] = __builtin_popcount(M[1+i*8]);
		counts[2+i*8] = __builtin_popcount(M[2+i*8]);
		counts[3+i*8] = __builtin_popcount(M[3+i*8]);
		counts[4+i*8] = __builtin_popcount(M[4+i*8]);
		counts[5+i*8] = __builtin_popcount(M[5+i*8]);
		counts[6+i*8] = __builtin_popcount(M[6+i*8]);
		counts[7+i*8] = __builtin_popcount(M[7+i*8]);
	}
	for(size_t i=nn*8; i<n; ++i)	// remainder loop
		counts[i] = __builtin_popcount(M[i]);
}

void popcountall(const uint64_t * __restrict M, unsigned * __restrict counts, size_t n)
{
	const size_t nn = n/8;
	for(size_t i=0; i<nn; ++i)
	{
		counts[i*8]   = __builtin_popcountl(M[i*8]);
		counts[1+i*8] = __builtin_popcountl(M[1+i*8]);
		counts[2+i*8] = __builtin_popcountl(M[2+i*8]);
		counts[3+i*8] = __builtin_popcountl(M[3+i*8]);
		counts[4+i*8] = __builtin_popcountl(M[4+i*8]);
		counts[5+i*8] = __builtin_popcountl(M[5+i*8]);
		counts[6+i*8] = __builtin_popcountl(M[6+i*8]);
		counts[7+i*8] = __builtin_popcountl(M[7+i*8]);
	}
	for(size_t i=nn*8; i<n; ++i)	// remainder loop
		counts[i] = __builtin_popcountl(M[i]);
}
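A minimal usage sketch (not part of the library) showing the contract assumed for the 64-bit overload above: each counts[i] receives the population count of M[i]. The sample masks are made up.

#include <cstdint>
#include <cstdio>

int main()
{
	const uint64_t masks[3] = { 0x0ULL, 0xFFULL, 0xF0F0F0F0F0F0F0F0ULL };	// hypothetical bitmasks
	unsigned counts[3];
	popcountall(masks, counts, 3);	// the uint64_t overload defined above
	for (int i = 0; i < 3; ++i)
		std::printf("counts[%d] = %u\n", i, counts[i]);	// expected: 0, 8, 32
	return 0;
}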
void SSEspmv(const double *__restrict V, const unsigned char *__restrict M, const unsigned *__restrict bot, const unsigned nrb, const double *__restrict X, double *Y, unsigned lcmask, unsigned lrmask, unsigned clbits)
const unsigned short masktable16[16]
const uint64_t masktable64[64]
void symcsr(const double *__restrict V, const uint64_t *__restrict M, const unsigned *__restrict bot, const unsigned nrb, const double *__restrict X, const double *__restrict XT, double *Y, double *YT, unsigned lowmask, unsigned nlowbits)
__m128d ssp_blendv_pd_SSE2(__m128d a, __m128d b, __m128d mask)
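ssp_blendv_pd_SSE2 presumably emulates the SSE4.1 _mm_blendv_pd intrinsic on SSE2-only hardware; its body is not shown here. The usual and/andnot/or idiom is sketched below, under the assumption that each mask lane is either all ones or all zeros (e.g., the result of an SSE2 compare):

#include <emmintrin.h>

// Hypothetical SSE2 blend: take b where the mask lane is all ones, a where it is all zeros.
static inline __m128d blendv_pd_sse2_sketch(__m128d a, __m128d b, __m128d mask)
{
	return _mm_or_pd(_mm_and_pd(mask, b), _mm_andnot_pd(mask, a));
}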
void SSEsym(const double *__restrict V, const unsigned char *__restrict M, const unsigned *__restrict bot, const unsigned nrb, const double *__restrict X, double *__restrict Y, unsigned lowmask, unsigned nlbits)
unsigned short BitReverse(unsigned short v)
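BitReverse reverses the bit order of a 16-bit value; the BitReverseTable64 declared below suggests the library's version is table driven. A standard shift-and-mask equivalent, shown only to illustrate the operation, not the library's implementation:

// Hypothetical equivalent of BitReverse: swap progressively larger bit groups.
static inline unsigned short bitreverse16_sketch(unsigned short v)
{
	v = (unsigned short)(((v >> 1) & 0x5555) | ((v & 0x5555) << 1));	// swap adjacent bits
	v = (unsigned short)(((v >> 2) & 0x3333) | ((v & 0x3333) << 2));	// swap bit pairs
	v = (unsigned short)(((v >> 4) & 0x0F0F) | ((v & 0x0F0F) << 4));	// swap nibbles
	v = (unsigned short)((v >> 8) | (v << 8));				// swap bytes
	return v;
}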
unsigned long long ssp_u64
const unsigned char BitReverseTable64[]