20 #ifndef VC_AVX_INTRINSICS_H 21 #define VC_AVX_INTRINSICS_H 23 #include "../common/windows_fix_intrin.h" 30 #include <immintrin.h> 32 #if (defined(VC_IMPL_XOP) || defined(VC_IMPL_FMA4)) && !defined(VC_MSVC) 33 #include <x86intrin.h> 37 #include "../common/fix_clang_emmintrin.h" 39 #if defined(VC_CLANG) && VC_CLANG < 0x30100 42 #define _mm_permute_ps(A, C) __extension__ ({ \ 44 (m128)__builtin_shufflevector((__v4sf)__A, (__v4sf) _mm_setzero_ps(), \ 45 (C) & 0x3, ((C) & 0xc) >> 2, \ 46 ((C) & 0x30) >> 4, ((C) & 0xc0) >> 6); }) 53 #if defined(VC_CLANG) || defined(VC_MSVC) || (defined(VC_GCC) && !defined(__OPTIMIZE__)) 54 #define VC_REQUIRES_MACRO_FOR_IMMEDIATE_ARGUMENT 57 #if defined(VC_CLANG) && VC_CLANG <= 0x30000 59 #undef _mm_alignr_epi8 60 #define _mm_alignr_epi8(a, b, n) ((m128i)__builtin_ia32_palignr128((a), (b), (n))) 90 #ifdef VC_UNCONDITIONAL_AVX2_INTRINSICS 91 template<
typename T>
struct Alias
103 typedef Alias<__m128 >
m128 ;
104 typedef Alias<__m128d>
m128d;
105 typedef Alias<__m128i>
m128i;
106 typedef Alias<__m256 >
m256 ;
107 typedef Alias<__m256d>
m256d;
108 typedef Alias<__m256i>
m256i;
// NOTE(review): the numbered preprocessor directives fused onto the next line look like
// extraction residue — confirm against the pristine header. The function that follows
// forwards its param256 argument to the global ::_mm256_castps_si256 intrinsic and
// returns the result as m256i.
117 #if defined(VC_UNCONDITIONAL_AVX2_INTRINSICS) && defined(VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN) 133 #ifdef VC_UNCONDITIONAL_AVX2_INTRINSICS 136 static Vc_INTRINSIC m256i
Vc_CONST _mm256_castps_si256(param256
a) { return ::_mm256_castps_si256(a); }
138 static Vc_INTRINSIC m256i
Vc_CONST _mm256_castpd_si256(param256d a) { return ::_mm256_castpd_si256(a); }
139 static Vc_INTRINSIC m256
Vc_CONST _mm256_castpd_ps (param256d a) { return ::_mm256_castpd_ps (a); }
140 static Vc_INTRINSIC m256
Vc_CONST _mm256_castsi256_ps(param256i a) { return ::_mm256_castsi256_ps(a); }
141 static Vc_INTRINSIC m256d
Vc_CONST _mm256_castsi256_pd(param256i a) { return ::_mm256_castsi256_pd(a); }
147 static Vc_INTRINSIC Vc_CONST m256d _mm256_mul_pd(m256d a, m256d b) {
return static_cast<m256d
>(
static_cast<__v4df
>(
a) * static_cast<__v4df>(b)); }
148 static Vc_INTRINSIC Vc_CONST m256d _mm256_add_pd(m256d a, m256d b) {
return static_cast<m256d
>(
static_cast<__v4df
>(
a) + static_cast<__v4df>(b)); }
149 static Vc_INTRINSIC Vc_CONST m256d _mm256_sub_pd(m256d a, m256d b) {
return static_cast<m256d
>(
static_cast<__v4df
>(
a) - static_cast<__v4df>(b)); }
150 static Vc_INTRINSIC Vc_CONST m256 _mm256_mul_ps(m256 a, m256 b) {
return static_cast<m256
>(
static_cast<__v8sf
>(
a) * static_cast<__v8sf>(b)); }
151 static Vc_INTRINSIC Vc_CONST m256 _mm256_add_ps(m256 a, m256 b) {
return static_cast<m256
>(
static_cast<__v8sf
>(
a) + static_cast<__v8sf>(b)); }
152 static Vc_INTRINSIC Vc_CONST m256 _mm256_sub_ps(m256 a, m256 b) {
return static_cast<m256
>(
static_cast<__v8sf
>(
a) - static_cast<__v8sf>(b)); }
160 #if defined(VC_GNU_ASM) && !defined(NVALGRIND) 175 #if defined(VC_GNU_ASM) && !defined(NVALGRIND) 177 #elif defined(VC_MSVC) 212 #ifdef VC_REQUIRES_MACRO_FOR_IMMEDIATE_ARGUMENT 213 #define _mm_extract_epu8 (x, i) (static_cast<unsigned char> (_mm_extract_epi8 ((x), (i)))) 214 #define _mm_extract_epu16(x, i) (static_cast<unsigned short>(_mm_extract_epi16((x), (i)))) 215 #define _mm_extract_epu32(x, i) (static_cast<unsigned int> (_mm_extract_epi32((x), (i)))) 251 #define AVX_TO_SSE_2(name) \ 252 static Vc_INTRINSIC m256i Vc_CONST _mm256_##name(param256i a0, param256i b0) { \ 253 m128i a1 = _mm256_extractf128_si256(a0, 1); \ 254 m128i b1 = _mm256_extractf128_si256(b0, 1); \ 255 m128i r0 = _mm_##name(_mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0)); \ 256 m128i r1 = _mm_##name(a1, b1); \ 257 return _mm256_insertf128_si256(_mm256_castsi128_si256(r0), r1, 1); \ 259 #define AVX_TO_SSE_2_si128_si256(name) \ 260 static Vc_INTRINSIC m256i Vc_CONST _mm256_##name##_si256(param256i a0, param256i b0) { \ 261 m128i a1 = _mm256_extractf128_si256(a0, 1); \ 262 m128i b1 = _mm256_extractf128_si256(b0, 1); \ 263 m128i r0 = _mm_##name##_si128(_mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0)); \ 264 m128i r1 = _mm_##name##_si128(a1, b1); \ 265 return _mm256_insertf128_si256(_mm256_castsi128_si256(r0), r1, 1); \ 267 #define AVX_TO_SSE_1(name) \ 268 static Vc_INTRINSIC m256i Vc_CONST _mm256_##name(param256i a0) { \ 269 m128i a1 = _mm256_extractf128_si256(a0, 1); \ 270 m128i r0 = _mm_##name(_mm256_castsi256_si128(a0)); \ 271 m128i r1 = _mm_##name(a1); \ 272 return _mm256_insertf128_si256(_mm256_castsi128_si256(r0), r1, 1); \ 274 #define AVX_TO_SSE_1i(name) \ 275 static Vc_INTRINSIC m256i Vc_CONST _mm256_##name(param256i a0, const int i) { \ 276 m128i a1 = _mm256_extractf128_si256(a0, 1); \ 277 m128i r0 = _mm_##name(_mm256_castsi256_si128(a0), i); \ 278 m128i r1 = _mm_##name(a1, i); \ 279 return _mm256_insertf128_si256(_mm256_castsi128_si256(r0), r1, 1); \ 293 #if defined 
_mm256_srli_si256 294 #undef _mm256_srli_si256 296 #if defined _mm256_slli_si256 297 #undef _mm256_slli_si256 299 #if defined _mm256_blend_epi16 300 #undef _mm256_blend_epi16 303 const m128i vLo = _mm256_castsi256_si128(a0);
304 const m128i vHi = _mm256_extractf128_si256(a0, 1);
307 case 1:
return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 1)), _mm_srli_si128(vHi, 1), 1);
308 case 2:
return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 2)), _mm_srli_si128(vHi, 2), 1);
309 case 3:
return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 3)), _mm_srli_si128(vHi, 3), 1);
310 case 4:
return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 4)), _mm_srli_si128(vHi, 4), 1);
311 case 5:
return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 5)), _mm_srli_si128(vHi, 5), 1);
312 case 6:
return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 6)), _mm_srli_si128(vHi, 6), 1);
313 case 7:
return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 7)), _mm_srli_si128(vHi, 7), 1);
314 case 8:
return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 8)), _mm_srli_si128(vHi, 8), 1);
315 case 9:
return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 9)), _mm_srli_si128(vHi, 9), 1);
316 case 10:
return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 10)), _mm_srli_si128(vHi, 10), 1);
317 case 11:
return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 11)), _mm_srli_si128(vHi, 11), 1);
318 case 12:
return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 12)), _mm_srli_si128(vHi, 12), 1);
319 case 13:
return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 13)), _mm_srli_si128(vHi, 13), 1);
320 case 14:
return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 14)), _mm_srli_si128(vHi, 14), 1);
321 case 15:
return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_alignr_epi8(vHi, vLo, 15)), _mm_srli_si128(vHi, 15), 1);
322 case 16:
return _mm256_permute2f128_si256(a0, a0, 0x81);
323 case 17:
return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 1)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 1)), 0x80);
324 case 18:
return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 2)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 2)), 0x80);
325 case 19:
return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 3)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 3)), 0x80);
326 case 20:
return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 4)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 4)), 0x80);
327 case 21:
return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 5)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 5)), 0x80);
328 case 22:
return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 6)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 6)), 0x80);
329 case 23:
return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 7)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 7)), 0x80);
330 case 24:
return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 8)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 8)), 0x80);
331 case 25:
return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 9)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 9)), 0x80);
332 case 26:
return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 10)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 10)), 0x80);
333 case 27:
return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 11)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 11)), 0x80);
334 case 28:
return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 12)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 12)), 0x80);
335 case 29:
return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 13)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 13)), 0x80);
336 case 30:
return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 14)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 14)), 0x80);
337 case 31:
return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_srli_si128(vHi, 15)), _mm256_castsi128_si256(_mm_srli_si128(vHi, 15)), 0x80);
339 return _mm256_setzero_si256();
342 const m128i vLo = _mm256_castsi256_si128(a0);
343 const m128i vHi = _mm256_extractf128_si256(a0, 1);
346 case 1:
return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 1)), _mm_alignr_epi8(vHi, vLo, 15), 1);
347 case 2:
return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 2)), _mm_alignr_epi8(vHi, vLo, 14), 1);
348 case 3:
return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 3)), _mm_alignr_epi8(vHi, vLo, 13), 1);
349 case 4:
return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 4)), _mm_alignr_epi8(vHi, vLo, 12), 1);
350 case 5:
return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 5)), _mm_alignr_epi8(vHi, vLo, 11), 1);
351 case 6:
return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 6)), _mm_alignr_epi8(vHi, vLo, 10), 1);
352 case 7:
return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 7)), _mm_alignr_epi8(vHi, vLo, 9), 1);
353 case 8:
return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 8)), _mm_alignr_epi8(vHi, vLo, 8), 1);
354 case 9:
return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 9)), _mm_alignr_epi8(vHi, vLo, 7), 1);
355 case 10:
return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 10)), _mm_alignr_epi8(vHi, vLo, 6), 1);
356 case 11:
return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 11)), _mm_alignr_epi8(vHi, vLo, 5), 1);
357 case 12:
return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 12)), _mm_alignr_epi8(vHi, vLo, 4), 1);
358 case 13:
return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 13)), _mm_alignr_epi8(vHi, vLo, 3), 1);
359 case 14:
return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 14)), _mm_alignr_epi8(vHi, vLo, 2), 1);
360 case 15:
return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 15)), _mm_alignr_epi8(vHi, vLo, 1), 1);
361 case 16:
return _mm256_permute2f128_si256(a0, a0, 0x8);
362 case 17:
return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 1)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 1)), 0x8);
363 case 18:
return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 2)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 2)), 0x8);
364 case 19:
return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 3)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 3)), 0x8);
365 case 20:
return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 4)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 4)), 0x8);
366 case 21:
return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 5)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 5)), 0x8);
367 case 22:
return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 6)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 6)), 0x8);
368 case 23:
return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 7)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 7)), 0x8);
369 case 24:
return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 8)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 8)), 0x8);
370 case 25:
return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 9)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 9)), 0x8);
371 case 26:
return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 10)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 10)), 0x8);
372 case 27:
return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 11)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 11)), 0x8);
373 case 28:
return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 12)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 12)), 0x8);
374 case 29:
return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 13)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 13)), 0x8);
375 case 30:
return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 14)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 14)), 0x8);
376 case 31:
return _mm256_permute2f128_si256(_mm256_castsi128_si256(_mm_slli_si128(vLo, 15)), _mm256_castsi128_si256(_mm_slli_si128(vLo, 15)), 0x8);
378 return _mm256_setzero_si256();
382 return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y)));
385 return _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y)));
388 return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y)));
391 return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y)));
447 m128i a1 = _mm256_extractf128_si256(a0, 1);
448 return (_mm_movemask_epi8(a1) << 16) | _mm_movemask_epi8(_mm256_castsi256_si128(a0));
479 #if !defined(VC_REQUIRES_MACRO_FOR_IMMEDIATE_ARGUMENT) 481 m128i a1 = _mm256_extractf128_si256(a0, 1);
482 m128i b1 = _mm256_extractf128_si256(b0, 1);
483 m128i r0 = _mm_blend_epi16(_mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0), m & 0xff);
484 m128i
r1 = _mm_blend_epi16(a1, b1, m >> 8);
485 return _mm256_insertf128_si256(_mm256_castsi128_si256(r0), r1, 1);
488 # define _mm256_blend_epi16(a0, b0, m) \ 489 _mm256_insertf128_si256( \ 490 _mm256_castsi128_si256( \ 492 _mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0), m & 0xff)), \ 493 _mm_blend_epi16(_mm256_extractf128_si256(a0, 1), _mm256_extractf128_si256(b0, 1), m >> 8);, 1) 496 m128i a1 = _mm256_extractf128_si256(a0, 1);
497 m128i b1 = _mm256_extractf128_si256(b0, 1);
498 m128i m1 = _mm256_extractf128_si256(m0, 1);
499 m128i r0 = _mm_blendv_epi8(_mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0), _mm256_castsi256_si128(m0));
500 m128i
r1 = _mm_blendv_epi8(a1, b1, m1);
501 return _mm256_insertf128_si256(_mm256_castsi128_si256(r0), r1, 1);
514 #if !defined(VC_CLANG) || VC_CLANG > 0x30100 541 m256i a = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_a), _mm256_castsi256_ps(
_mm256_setmin_epi32())));
542 m256i b = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_b), _mm256_castsi256_ps(
_mm256_setmin_epi32())));
543 return _mm256_insertf128_si256(_mm256_castsi128_si256(
544 _mm_cmplt_epi32(_mm256_castsi256_si128(a), _mm256_castsi256_si128(b))),
545 _mm_cmplt_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1)), 1);
550 return _mm256_insertf128_si256(_mm256_castsi128_si256(
551 _mm_cmpgt_epi32(_mm256_castsi256_si128(a), _mm256_castsi256_si128(b))),
552 _mm_cmpgt_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1)), 1);
556 #ifndef VC_MM256_MASKSTORE_WRONG_MASK_TYPE 557 _mm256_maskstore_ps(mem, _mm256_castps_si256(mask), v);
559 _mm256_maskstore_ps(mem, mask, v);
563 #ifndef VC_MM256_MASKSTORE_WRONG_MASK_TYPE 564 _mm256_maskstore_pd(mem, _mm256_castpd_si256(mask), v);
566 _mm256_maskstore_pd(mem, mask, v);
570 #ifndef VC_MM256_MASKSTORE_WRONG_MASK_TYPE 571 _mm256_maskstore_ps(reinterpret_cast<float *>(mem), mask, _mm256_castsi256_ps(v));
573 _mm256_maskstore_ps(reinterpret_cast<float *>(mem), _mm256_castsi256_ps(mask), _mm256_castsi256_ps(v));
580 #if defined(VC_IMPL_FMA4) && defined(VC_CLANG) && VC_CLANG < 0x30300 582 static Vc_INTRINSIC __m256 my256_macc_ps(__m256
a, __m256 b, __m256
c) {
585 asm(
"vfmaddps %[c], %[b], %[a], %[r]" : [
r]
"=x"(
r) : [a]
"x"(a), [b]
"x"(b), [c]
"x"(c));
588 #ifdef _mm256_macc_ps 589 #undef _mm256_macc_ps 591 #define _mm256_macc_ps(a, b, c) Vc::AVX::my256_macc_ps(a, b, c) 593 static Vc_INTRINSIC __m256d my256_macc_pd(__m256d a, __m256d b, __m256d c) {
596 asm(
"vfmaddpd %[c], %[b], %[a], %[r]" : [
r]
"=x"(
r) : [a]
"x"(a), [b]
"x"(b), [c]
"x"(c));
599 #ifdef _mm256_macc_pd 600 #undef _mm256_macc_pd 602 #define _mm256_macc_pd(a, b, c) Vc::AVX::my256_macc_pd(a, b, c) 611 #endif // VC_AVX_INTRINSICS_H static Vc_INTRINSIC m256i Vc_CONST _mm256_setmin_epi32()
static Vc_INTRINSIC m256i Vc_CONST _mm256_setone_epi32()
m256i Vc_INTRINSIC Vc_CONST _mm256_blend_epi16(param256i a0, param256i b0, const int m)
static Vc_INTRINSIC m256i Vc_CONST _mm256_setone_epi8()
static Vc_INTRINSIC m256 Vc_CONST _mm256_setabsmask_ps()
static Vc_INTRINSIC unsigned int Vc_CONST _mm_extract_epu32(param128i x, const int i)
static Vc_INTRINSIC m256i Vc_CONST _mm256_andnot_si256(param256i x, param256i y)
Namespace for new ROOT classes and functions.
static Vc_INTRINSIC m256 Vc_CONST _mm256_cmpnle_ps(param256 a, param256 b)
static Vc_INTRINSIC m128i _mm_cmplt_epu16(param128i a, param128i b)
static Vc_INTRINSIC m128i Vc_CONST _mm_setallone_si128()
static Vc_INTRINSIC m128i Vc_CONST _mm_setallone()
static Vc_INTRINSIC m256i Vc_CONST _mm256_slli_si256(param256i a0, const int i)
static Vc_INTRINSIC m128i Vc_CONST _mm_setmin_epi16()
static Vc_INTRINSIC m128i Vc_CONST _mm_setmin_epi32()
static Vc_INTRINSIC m256i Vc_CONST _mm256_setone_epu16()
static Vc_INTRINSIC m256i Vc_CONST _mm256_setone_epi16()
static Vc_INTRINSIC m256i Vc_CONST _mm256_set2power31_epu32()
static Vc_INTRINSIC m256 Vc_CONST _mm256_cmpge_ps(param256 a, param256 b)
static Vc_INTRINSIC m128i Vc_CONST _mm_setone_epi16()
static Vc_INTRINSIC m128i Vc_CONST _mm_setone_epi8()
static Vc_INTRINSIC unsigned char Vc_CONST _mm_extract_epu8(param128i x, const int i)
static Vc_INTRINSIC m256i Vc_CONST _mm256_setone_epu8()
static Vc_INTRINSIC m256d Vc_CONST _mm256_setsignmask_pd()
Vc_INTRINSIC m256i Vc_CONST _mm256_blendv_epi8(param256i a0, param256i b0, param256i m0)
static Vc_INTRINSIC m256d Vc_CONST _mm256_cmpunord_pd(param256d a, param256d b)
static Vc_INTRINSIC m256d Vc_CONST _mm256_cmpeq_pd(param256d a, param256d b)
static Vc_INTRINSIC unsigned short Vc_CONST _mm_extract_epu16(param128i x, const int i)
static Vc_INTRINSIC m128d Vc_CONST _mm_setallone_pd()
static Vc_INTRINSIC m128i Vc_CONST _mm_setone_epu16()
static Vc_INTRINSIC m256d Vc_CONST _mm256_setabsmask_pd()
static Vc_INTRINSIC m256i Vc_CONST _mm256_cmplt_epu32(param256i _a, param256i _b)
static Vc_INTRINSIC m256d Vc_CONST _mm256_cmpnle_pd(param256d a, param256d b)
static Vc_INTRINSIC m256d Vc_CONST _mm256_cmpneq_pd(param256d a, param256d b)
static Vc_INTRINSIC m256 Vc_CONST _mm256_cmple_ps(param256 a, param256 b)
static Vc_INTRINSIC m256 Vc_CONST _mm256_cmpgt_ps(param256 a, param256 b)
static Vc_INTRINSIC m256 Vc_CONST _mm256_cmplt_ps(param256 a, param256 b)
static Vc_INTRINSIC m128i Vc_CONST _mm_setone_epi32()
static Vc_INTRINSIC m256 Vc_CONST _mm256_cmpunord_ps(param256 a, param256 b)
static Vc_INTRINSIC m256 Vc_CONST _mm256_set1_ps(float a)
#define AVX_TO_SSE_1(name)
static Vc_INTRINSIC m256d Vc_CONST _mm256_cmple_pd(param256d a, param256d b)
static Vc_INTRINSIC m256 Vc_CONST _mm256_set2power31_ps()
unsigned int r1[N_CITIES]
static Vc_INTRINSIC m256 Vc_CONST _mm256_cmpneq_ps(param256 a, param256 b)
static Vc_INTRINSIC m256d Vc_CONST _mm256_set1_pd(double a)
AVX_TO_SSE_1i(slli_epi16) AVX_TO_SSE_1i(slli_epi32) AVX_TO_SSE_1i(slli_epi64) AVX_TO_SSE_1i(srai_epi16) AVX_TO_SSE_1i(srai_epi32) AVX_TO_SSE_1i(srli_epi16) AVX_TO_SSE_1i(srli_epi32) AVX_TO_SSE_1i(srli_epi64) Vc_INTRINSIC int Vc_CONST _mm256_movemask_epi8(param256i a0)
static Vc_INTRINSIC m256i Vc_CONST _mm256_setone_epu32()
static Vc_INTRINSIC m256 Vc_CONST _mm256_setallone()
static Vc_INTRINSIC m128i _mm_cmpgt_epu16(param128i a, param128i b)
static Vc_INTRINSIC m256i Vc_CONST _mm256_srli_si256(param256i a0, const int i)
static Vc_INTRINSIC m256i Vc_CONST _mm256_cmpgt_epu32(param256i _a, param256i _b)
static Vc_INTRINSIC m256i Vc_CONST _mm256_set1_epi32(int a)
static Vc_INTRINSIC m256 Vc_CONST _mm256_setsignmask_ps()
static Vc_INTRINSIC m256 Vc_CONST _mm256_setallone_ps()
static Vc_INTRINSIC m256 Vc_CONST _mm256_cmpeq_ps(param256 a, param256 b)
static Vc_INTRINSIC m128 Vc_CONST _mm_setallone_ps()
Binding & operator=(OUT(*fun)(void))
static Vc_INTRINSIC m256i Vc_CONST _mm256_and_si256(param256i x, param256i y)
static Vc_INTRINSIC m256d Vc_CONST _mm256_setallone_pd()
static Vc_INTRINSIC m256 Vc_CONST _mm256_cmpord_ps(param256 a, param256 b)
static Vc_INTRINSIC void _mm256_maskstore(float *mem, const param256 mask, const param256 v)
static Vc_INTRINSIC m256d Vc_CONST _mm256_cmpnlt_pd(param256d a, param256d b)
static Vc_INTRINSIC m128i Vc_CONST _mm_setone_epu8()
static Vc_INTRINSIC m256 Vc_CONST _mm256_setone_ps()
static Vc_INTRINSIC m256i Vc_CONST _mm256_xor_si256(param256i x, param256i y)
static Vc_INTRINSIC m256i Vc_CONST _mm256_setmin_epi16()
static Vc_INTRINSIC m256d Vc_CONST _mm256_cmplt_pd(param256d a, param256d b)
static Vc_INTRINSIC m256d Vc_CONST _mm256_cmpord_pd(param256d a, param256d b)
static Vc_INTRINSIC m256i Vc_CONST _mm256_or_si256(param256i x, param256i y)
static Vc_INTRINSIC m256d Vc_CONST _mm256_setone_pd()
static Vc_INTRINSIC m256 Vc_CONST _mm256_cmpnlt_ps(param256 a, param256 b)
#define AVX_TO_SSE_2(name)
static Vc_INTRINSIC m256i Vc_CONST _mm256_setallone_si256()