/*
 * Overview of this part of the file (review documentation).
 *
 * Data type: each `union SIMD` overlays two 256-bit registers `m[2]`
 * (__m256 for float, __m256d for double, __m256i for the integer element
 * types) with the scalar arrays `v[SIZE]` (value_type) and `u[SIZE]`
 * (uint_type). SIZE is 16 for float/int32_t/uint32_t, 8 for
 * double/int64_t/uint64_t, 64 for int8_t/uint8_t and 32 for
 * int16_t/uint16_t.
 *
 * Every function below builds a local `tmp`, applies one intrinsic
 * expression to m[0] and m[1] in a two-iteration loop and returns `tmp`
 * by value.
 *
 * Operation families, in order of appearance:
 *   vreinterpret  C-style cast of each half to __m256 / __m256d / __m256i.
 *   vdup          broadcast a scalar with _mm256_set1_{ps,pd,epi8,epi16,epi32,epi64x}.
 *   vzero         _mm256_setzero_{ps,pd,si256}.
 *   vadd, vsub    lane-wise add / subtract (ps, pd, epi8, epi16, epi32, epi64).
 *   vqadd         saturating add via _mm256_adds_epi8 / _mm256_adds_epi16.
 *   vqsub         saturating subtract via _mm256_subs_{epi8,epi16,epu8,epu16}.
 *   vmul          lane-wise multiply (ps, pd only).
 *   vabs          float/double absolute value: and-not with set1(-0) clears the sign bit.
 *   vqabs         integer absolute value; the input is first raised to at least
 *                 -INT8_MAX / -INT16_MAX / -INT32_MAX with _mm256_max_*, so the
 *                 most negative value is never passed to _mm256_abs_*.
 *   vsignum       float/double: 1.0 carrying a's sign bit, forced to all-zero bits
 *                 where a compares equal to zero (_CMP_EQ_OQ).
 *                 integers: _mm256_sign_*(set1(1), a).
 *   vsign         float/double: a with its sign bit XOR-ed with b's sign bit, forced
 *                 to all-zero bits where b compares equal to zero.
 *                 integers: _mm256_sign_*(a, b).
 *   vcopysign     (a with sign bit cleared) OR (sign bit of b).
 *   vorr/vand/veor  bitwise OR / AND / XOR on __m256i.
 *   vbic          _mm256_andnot_si256(b, a), i.e. a AND NOT b.
 *   vbsl          bit select: (a AND b) OR (NOT a AND c).
 *   vceqz, vceq   equality masks against zero / against b; the float and double
 *                 variants use _CMP_EQ_OQ and cast the result to __m256i.
 *   vcgtz         greater-than-zero mask (only the _mm256_cmp_ps variant is in
 *                 this part; it uses _CMP_GT_OQ).
 *
 * NOTE(review): the text looks as if everything between angle brackets was
 * stripped: `#include` names no header, every specialization reads plain
 * `SIMD`, and `vdup>(float a)` has an unmatched `>`. Presumably these were
 * <immintrin.h> and SIMD<type, size> argument lists; restore them from the
 * upstream file before building. TODO confirm.
 *
 * NOTE(review): `#pragma once`, `#include` and the first definitions share
 * one physical line, so the rest of that line belongs to the directive as
 * far as the preprocessor is concerned. The line breaks need restoring too.
 *
 * NOTE(review): the (__m256) / (__m256d) / (__m256i) casts between vector
 * types rely on a GCC/Clang vector extension; the _mm256_cast* intrinsics
 * are the portable spelling. Verify which compilers must be supported.
 */
/* Intel AVX2 acceleration times two Copyright 2024 Ahmet Inan */ #pragma once #include template <> union SIMD { static const int SIZE = 16; typedef float value_type; typedef uint32_t uint_type; __m256 m[2]; value_type v[SIZE]; uint_type u[SIZE]; }; template <> union SIMD { static const int SIZE = 8; typedef double value_type; typedef uint64_t uint_type; __m256d m[2]; value_type v[SIZE]; uint_type u[SIZE]; }; template <> union SIMD { static const int SIZE = 64; typedef int8_t value_type; typedef uint8_t uint_type; __m256i m[2]; value_type v[SIZE]; uint_type u[SIZE]; }; template <> union SIMD { static const int SIZE = 32; typedef int16_t value_type; typedef uint16_t uint_type; __m256i m[2]; value_type v[SIZE]; uint_type u[SIZE]; }; template <> union SIMD { static const int SIZE = 16; typedef int32_t value_type; typedef uint32_t uint_type; __m256i m[2]; value_type v[SIZE]; uint_type u[SIZE]; }; template <> union SIMD { static const int SIZE = 8; typedef int64_t value_type; typedef uint64_t uint_type; __m256i m[2]; value_type v[SIZE]; uint_type u[SIZE]; }; template <> union SIMD { static const int SIZE = 64; typedef uint8_t value_type; typedef uint8_t uint_type; __m256i m[2]; value_type v[SIZE]; uint_type u[SIZE]; }; template <> union SIMD { static const int SIZE = 32; typedef uint16_t value_type; typedef uint16_t uint_type; __m256i m[2]; value_type v[SIZE]; uint_type u[SIZE]; }; template <> union SIMD { static const int SIZE = 16; typedef uint32_t value_type; typedef uint32_t uint_type; __m256i m[2]; value_type v[SIZE]; uint_type u[SIZE]; }; template <> union SIMD { static const int SIZE = 8; typedef uint64_t value_type; typedef uint64_t uint_type; __m256i m[2]; value_type v[SIZE]; uint_type u[SIZE]; }; template <> inline SIMD vreinterpret(SIMD a) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = (__m256)a.m[i]; return tmp; } template <> inline SIMD vreinterpret(SIMD a) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = (__m256i)a.m[i]; return tmp; } template <> 
inline SIMD vreinterpret(SIMD a) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = (__m256d)a.m[i]; return tmp; } template <> inline SIMD vreinterpret(SIMD a) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = (__m256i)a.m[i]; return tmp; } template <> inline SIMD vreinterpret(SIMD a) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = (__m256i)a.m[i]; return tmp; } template <> inline SIMD vreinterpret(SIMD a) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = (__m256i)a.m[i]; return tmp; } template <> inline SIMD vreinterpret(SIMD a) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = (__m256i)a.m[i]; return tmp; } template <> inline SIMD vreinterpret(SIMD a) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = (__m256i)a.m[i]; return tmp; } template <> inline SIMD vreinterpret(SIMD a) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = (__m256i)a.m[i]; return tmp; } template <> inline SIMD vreinterpret(SIMD a) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = (__m256i)a.m[i]; return tmp; } template <> inline SIMD vreinterpret(SIMD a) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = (__m256i)a.m[i]; return tmp; } template <> inline SIMD vreinterpret(SIMD a) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = (__m256i)a.m[i]; return tmp; } template <> inline SIMD vdup>(float a) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_set1_ps(a); return tmp; } template <> inline SIMD vdup>(double a) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_set1_pd(a); return tmp; } template <> inline SIMD vdup>(int8_t a) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_set1_epi8(a); return tmp; } template <> inline SIMD vdup>(int16_t a) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_set1_epi16(a); return tmp; } template <> inline SIMD vdup>(int32_t a) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_set1_epi32(a); return tmp; } template <> inline SIMD vdup>(int64_t a) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = 
_mm256_set1_epi64x(a); return tmp; } template <> inline SIMD vzero() { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_setzero_ps(); return tmp; } template <> inline SIMD vzero() { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_setzero_pd(); return tmp; } template <> inline SIMD vzero() { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_setzero_si256(); return tmp; } template <> inline SIMD vzero() { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_setzero_si256(); return tmp; } template <> inline SIMD vzero() { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_setzero_si256(); return tmp; } template <> inline SIMD vzero() { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_setzero_si256(); return tmp; } template <> inline SIMD vadd(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_add_ps(a.m[i], b.m[i]); return tmp; } template <> inline SIMD vadd(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_add_pd(a.m[i], b.m[i]); return tmp; } template <> inline SIMD vadd(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_add_epi8(a.m[i], b.m[i]); return tmp; } template <> inline SIMD vadd(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_add_epi16(a.m[i], b.m[i]); return tmp; } template <> inline SIMD vadd(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_add_epi32(a.m[i], b.m[i]); return tmp; } template <> inline SIMD vadd(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_add_epi64(a.m[i], b.m[i]); return tmp; } template <> inline SIMD vqadd(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_adds_epi8(a.m[i], b.m[i]); return tmp; } template <> inline SIMD vqadd(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_adds_epi16(a.m[i], b.m[i]); return tmp; } template <> inline SIMD vsub(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] 
= _mm256_sub_ps(a.m[i], b.m[i]); return tmp; } template <> inline SIMD vsub(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_sub_pd(a.m[i], b.m[i]); return tmp; } template <> inline SIMD vsub(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_sub_epi8(a.m[i], b.m[i]); return tmp; } template <> inline SIMD vsub(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_sub_epi16(a.m[i], b.m[i]); return tmp; } template <> inline SIMD vsub(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_sub_epi32(a.m[i], b.m[i]); return tmp; } template <> inline SIMD vsub(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_sub_epi64(a.m[i], b.m[i]); return tmp; } template <> inline SIMD vqsub(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_subs_epi8(a.m[i], b.m[i]); return tmp; } template <> inline SIMD vqsub(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_subs_epi16(a.m[i], b.m[i]); return tmp; } template <> inline SIMD vqsub(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_subs_epu8(a.m[i], b.m[i]); return tmp; } template <> inline SIMD vqsub(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_subs_epu16(a.m[i], b.m[i]); return tmp; } template <> inline SIMD vmul(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_mul_ps(a.m[i], b.m[i]); return tmp; } template <> inline SIMD vmul(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_mul_pd(a.m[i], b.m[i]); return tmp; } template <> inline SIMD vabs(SIMD a) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_andnot_ps(_mm256_set1_ps(-0.f), a.m[i]); return tmp; } template <> inline SIMD vabs(SIMD a) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_andnot_pd(_mm256_set1_pd(-0.), a.m[i]); return tmp; } template <> inline SIMD vqabs(SIMD a) { SIMD tmp; for (int i = 0; i < 2; 
++i) tmp.m[i] = _mm256_abs_epi8(_mm256_max_epi8(a.m[i], _mm256_set1_epi8(-INT8_MAX))); return tmp; } template <> inline SIMD vqabs(SIMD a) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_abs_epi16(_mm256_max_epi16(a.m[i], _mm256_set1_epi16(-INT16_MAX))); return tmp; } template <> inline SIMD vqabs(SIMD a) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_abs_epi32(_mm256_max_epi32(a.m[i], _mm256_set1_epi32(-INT32_MAX))); return tmp; } template <> inline SIMD vsignum(SIMD a) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_andnot_ps( _mm256_cmp_ps(a.m[i], _mm256_setzero_ps(), _CMP_EQ_OQ), _mm256_or_ps(_mm256_set1_ps(1.f), _mm256_and_ps(_mm256_set1_ps(-0.f), a.m[i]))); return tmp; } template <> inline SIMD vsignum(SIMD a) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_andnot_pd( _mm256_cmp_pd(a.m[i], _mm256_setzero_pd(), _CMP_EQ_OQ), _mm256_or_pd(_mm256_set1_pd(1.), _mm256_and_pd(_mm256_set1_pd(-0.), a.m[i]))); return tmp; } template <> inline SIMD vsignum(SIMD a) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_sign_epi8(_mm256_set1_epi8(1), a.m[i]); return tmp; } template <> inline SIMD vsignum(SIMD a) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_sign_epi16(_mm256_set1_epi16(1), a.m[i]); return tmp; } template <> inline SIMD vsignum(SIMD a) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_sign_epi32(_mm256_set1_epi32(1), a.m[i]); return tmp; } template <> inline SIMD vsign(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_andnot_ps( _mm256_cmp_ps(b.m[i], _mm256_setzero_ps(), _CMP_EQ_OQ), _mm256_xor_ps(a.m[i], _mm256_and_ps(_mm256_set1_ps(-0.f), b.m[i]))); return tmp; } template <> inline SIMD vsign(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_andnot_pd( _mm256_cmp_pd(b.m[i], _mm256_setzero_pd(), _CMP_EQ_OQ), _mm256_xor_pd(a.m[i], _mm256_and_pd(_mm256_set1_pd(-0.), b.m[i]))); return tmp; } template <> inline SIMD vsign(SIMD a, SIMD b) { SIMD tmp; 
for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_sign_epi8(a.m[i], b.m[i]); return tmp; } template <> inline SIMD vsign(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_sign_epi16(a.m[i], b.m[i]); return tmp; } template <> inline SIMD vsign(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_sign_epi32(a.m[i], b.m[i]); return tmp; } template <> inline SIMD vcopysign(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_or_ps( _mm256_andnot_ps(_mm256_set1_ps(-0.f), a.m[i]), _mm256_and_ps(_mm256_set1_ps(-0.f), b.m[i])); return tmp; } template <> inline SIMD vcopysign(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_or_pd( _mm256_andnot_pd(_mm256_set1_pd(-0.), a.m[i]), _mm256_and_pd(_mm256_set1_pd(-0.), b.m[i])); return tmp; } template <> inline SIMD vorr(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_or_si256(a.m[i], b.m[i]); return tmp; } template <> inline SIMD vorr(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_or_si256(a.m[i], b.m[i]); return tmp; } template <> inline SIMD vorr(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_or_si256(a.m[i], b.m[i]); return tmp; } template <> inline SIMD vorr(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_or_si256(a.m[i], b.m[i]); return tmp; } template <> inline SIMD vand(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_and_si256(a.m[i], b.m[i]); return tmp; } template <> inline SIMD vand(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_and_si256(a.m[i], b.m[i]); return tmp; } template <> inline SIMD vand(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_and_si256(a.m[i], b.m[i]); return tmp; } template <> inline SIMD vand(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_and_si256(a.m[i], b.m[i]); return tmp; } template <> inline SIMD 
veor(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_xor_si256(a.m[i], b.m[i]); return tmp; } template <> inline SIMD veor(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_xor_si256(a.m[i], b.m[i]); return tmp; } template <> inline SIMD veor(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_xor_si256(a.m[i], b.m[i]); return tmp; } template <> inline SIMD veor(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_xor_si256(a.m[i], b.m[i]); return tmp; } template <> inline SIMD vbic(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_andnot_si256(b.m[i], a.m[i]); return tmp; } template <> inline SIMD vbic(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_andnot_si256(b.m[i], a.m[i]); return tmp; } template <> inline SIMD vbic(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_andnot_si256(b.m[i], a.m[i]); return tmp; } template <> inline SIMD vbic(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_andnot_si256(b.m[i], a.m[i]); return tmp; } template <> inline SIMD vbsl(SIMD a, SIMD b, SIMD c) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_or_si256(_mm256_and_si256(a.m[i], b.m[i]), _mm256_andnot_si256(a.m[i], c.m[i])); return tmp; } template <> inline SIMD vbsl(SIMD a, SIMD b, SIMD c) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_or_si256(_mm256_and_si256(a.m[i], b.m[i]), _mm256_andnot_si256(a.m[i], c.m[i])); return tmp; } template <> inline SIMD vbsl(SIMD a, SIMD b, SIMD c) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_or_si256(_mm256_and_si256(a.m[i], b.m[i]), _mm256_andnot_si256(a.m[i], c.m[i])); return tmp; } template <> inline SIMD vbsl(SIMD a, SIMD b, SIMD c) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_or_si256(_mm256_and_si256(a.m[i], b.m[i]), _mm256_andnot_si256(a.m[i], c.m[i])); return tmp; } template <> inline SIMD vceqz(SIMD 
a) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = (__m256i)_mm256_cmp_ps(a.m[i], _mm256_setzero_ps(), _CMP_EQ_OQ); return tmp; } template <> inline SIMD vceqz(SIMD a) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = (__m256i)_mm256_cmp_pd(a.m[i], _mm256_setzero_pd(), _CMP_EQ_OQ); return tmp; } template <> inline SIMD vceqz(SIMD a) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_cmpeq_epi8(a.m[i], _mm256_setzero_si256()); return tmp; } template <> inline SIMD vceqz(SIMD a) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_cmpeq_epi16(a.m[i], _mm256_setzero_si256()); return tmp; } template <> inline SIMD vceqz(SIMD a) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_cmpeq_epi32(a.m[i], _mm256_setzero_si256()); return tmp; } template <> inline SIMD vceqz(SIMD a) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_cmpeq_epi64(a.m[i], _mm256_setzero_si256()); return tmp; } template <> inline SIMD vceq(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = (__m256i)_mm256_cmp_ps(a.m[i], b.m[i], _CMP_EQ_OQ); return tmp; } template <> inline SIMD vceq(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = (__m256i)_mm256_cmp_pd(a.m[i], b.m[i], _CMP_EQ_OQ); return tmp; } template <> inline SIMD vceq(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_cmpeq_epi8(a.m[i], b.m[i]); return tmp; } template <> inline SIMD vceq(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_cmpeq_epi16(a.m[i], b.m[i]); return tmp; } template <> inline SIMD vceq(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_cmpeq_epi32(a.m[i], b.m[i]); return tmp; } template <> inline SIMD vceq(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = _mm256_cmpeq_epi64(a.m[i], b.m[i]); return tmp; } template <> inline SIMD vcgtz(SIMD a) { SIMD tmp; for (int i = 0; i < 2; ++i) tmp.m[i] = (__m256i)_mm256_cmp_ps(a.m[i], _mm256_setzero_ps(), _CMP_GT_OQ); return tmp; } 
// ---------------------------------------------------------------------------
// Comparison, min/max, clamp and shuffle specializations.
//
// Every function processes the two 256-bit halves m[0] and m[1] of its
// arguments and returns the result by value. Which element type a given
// specialization serves is only visible here through the intrinsic suffix
// (ps, pd, epi8, epi16, epi32, epi64).
//
// NOTE(review): the template argument lists look stripped from this text
// (every type reads plain `SIMD`); presumably they were SIMD<type, size>.
// Restore them from the upstream file before building. TODO confirm.
// ---------------------------------------------------------------------------

// vcgtz: lane mask for a > 0. The double variant uses the ordered, quiet
// _CMP_GT_OQ predicate and casts the mask to __m256i.
template <>
inline SIMD vcgtz(SIMD a)
{
    SIMD tmp;
    for (int i = 0; i < 2; ++i)
        tmp.m[i] = (__m256i)_mm256_cmp_pd(a.m[i], _mm256_setzero_pd(), _CMP_GT_OQ);
    return tmp;
}

template <>
inline SIMD vcgtz(SIMD a)
{
    SIMD tmp;
    for (int i = 0; i < 2; ++i)
        tmp.m[i] = _mm256_cmpgt_epi8(a.m[i], _mm256_setzero_si256());
    return tmp;
}

template <>
inline SIMD vcgtz(SIMD a)
{
    SIMD tmp;
    for (int i = 0; i < 2; ++i)
        tmp.m[i] = _mm256_cmpgt_epi16(a.m[i], _mm256_setzero_si256());
    return tmp;
}

template <>
inline SIMD vcgtz(SIMD a)
{
    SIMD tmp;
    for (int i = 0; i < 2; ++i)
        tmp.m[i] = _mm256_cmpgt_epi32(a.m[i], _mm256_setzero_si256());
    return tmp;
}

template <>
inline SIMD vcgtz(SIMD a)
{
    SIMD tmp;
    for (int i = 0; i < 2; ++i)
        tmp.m[i] = _mm256_cmpgt_epi64(a.m[i], _mm256_setzero_si256());
    return tmp;
}

// vcltz: lane mask for a < 0. Floating point uses _CMP_LT_OQ; the integer
// variants swap the operands of the signed greater-than compare (0 > a).
template <>
inline SIMD vcltz(SIMD a)
{
    SIMD tmp;
    for (int i = 0; i < 2; ++i)
        tmp.m[i] = (__m256i)_mm256_cmp_ps(a.m[i], _mm256_setzero_ps(), _CMP_LT_OQ);
    return tmp;
}

template <>
inline SIMD vcltz(SIMD a)
{
    SIMD tmp;
    for (int i = 0; i < 2; ++i)
        tmp.m[i] = (__m256i)_mm256_cmp_pd(a.m[i], _mm256_setzero_pd(), _CMP_LT_OQ);
    return tmp;
}

template <>
inline SIMD vcltz(SIMD a)
{
    SIMD tmp;
    for (int i = 0; i < 2; ++i)
        tmp.m[i] = _mm256_cmpgt_epi8(_mm256_setzero_si256(), a.m[i]);
    return tmp;
}

template <>
inline SIMD vcltz(SIMD a)
{
    SIMD tmp;
    for (int i = 0; i < 2; ++i)
        tmp.m[i] = _mm256_cmpgt_epi16(_mm256_setzero_si256(), a.m[i]);
    return tmp;
}

template <>
inline SIMD vcltz(SIMD a)
{
    SIMD tmp;
    for (int i = 0; i < 2; ++i)
        tmp.m[i] = _mm256_cmpgt_epi32(_mm256_setzero_si256(), a.m[i]);
    return tmp;
}

template <>
inline SIMD vcltz(SIMD a)
{
    SIMD tmp;
    for (int i = 0; i < 2; ++i)
        tmp.m[i] = _mm256_cmpgt_epi64(_mm256_setzero_si256(), a.m[i]);
    return tmp;
}

// vclez: lane mask for a <= 0. Floating point uses _CMP_LE_OQ; the integer
// variants OR together the (a == 0) and (0 > a) masks.
template <>
inline SIMD vclez(SIMD a)
{
    SIMD tmp;
    for (int i = 0; i < 2; ++i)
        tmp.m[i] = (__m256i)_mm256_cmp_ps(a.m[i], _mm256_setzero_ps(), _CMP_LE_OQ);
    return tmp;
}

template <>
inline SIMD vclez(SIMD a)
{
    SIMD tmp;
    for (int i = 0; i < 2; ++i)
        tmp.m[i] = (__m256i)_mm256_cmp_pd(a.m[i], _mm256_setzero_pd(), _CMP_LE_OQ);
    return tmp;
}

template <>
inline SIMD vclez(SIMD a)
{
    SIMD tmp;
    for (int i = 0; i < 2; ++i)
        tmp.m[i] = _mm256_or_si256(
            _mm256_cmpeq_epi8(a.m[i], _mm256_setzero_si256()),
            _mm256_cmpgt_epi8(_mm256_setzero_si256(), a.m[i]));
    return tmp;
}

template <>
inline SIMD vclez(SIMD a)
{
    SIMD tmp;
    for (int i = 0; i < 2; ++i)
        tmp.m[i] = _mm256_or_si256(
            _mm256_cmpeq_epi16(a.m[i], _mm256_setzero_si256()),
            _mm256_cmpgt_epi16(_mm256_setzero_si256(), a.m[i]));
    return tmp;
}

template <>
inline SIMD vclez(SIMD a)
{
    SIMD tmp;
    for (int i = 0; i < 2; ++i)
        tmp.m[i] = _mm256_or_si256(
            _mm256_cmpeq_epi32(a.m[i], _mm256_setzero_si256()),
            _mm256_cmpgt_epi32(_mm256_setzero_si256(), a.m[i]));
    return tmp;
}

template <>
inline SIMD vclez(SIMD a)
{
    SIMD tmp;
    for (int i = 0; i < 2; ++i)
        tmp.m[i] = _mm256_or_si256(
            _mm256_cmpeq_epi64(a.m[i], _mm256_setzero_si256()),
            _mm256_cmpgt_epi64(_mm256_setzero_si256(), a.m[i]));
    return tmp;
}

// vmin: lane-wise minimum (ps, pd, signed epi8/epi16/epi32).
template <>
inline SIMD vmin(SIMD a, SIMD b)
{
    SIMD tmp;
    for (int i = 0; i < 2; ++i)
        tmp.m[i] = _mm256_min_ps(a.m[i], b.m[i]);
    return tmp;
}

template <>
inline SIMD vmin(SIMD a, SIMD b)
{
    SIMD tmp;
    for (int i = 0; i < 2; ++i)
        tmp.m[i] = _mm256_min_pd(a.m[i], b.m[i]);
    return tmp;
}

template <>
inline SIMD vmin(SIMD a, SIMD b)
{
    SIMD tmp;
    for (int i = 0; i < 2; ++i)
        tmp.m[i] = _mm256_min_epi8(a.m[i], b.m[i]);
    return tmp;
}

template <>
inline SIMD vmin(SIMD a, SIMD b)
{
    SIMD tmp;
    for (int i = 0; i < 2; ++i)
        tmp.m[i] = _mm256_min_epi16(a.m[i], b.m[i]);
    return tmp;
}

template <>
inline SIMD vmin(SIMD a, SIMD b)
{
    SIMD tmp;
    for (int i = 0; i < 2; ++i)
        tmp.m[i] = _mm256_min_epi32(a.m[i], b.m[i]);
    return tmp;
}

// vmax: lane-wise maximum (ps, pd, signed epi8/epi16/epi32).
template <>
inline SIMD vmax(SIMD a, SIMD b)
{
    SIMD tmp;
    for (int i = 0; i < 2; ++i)
        tmp.m[i] = _mm256_max_ps(a.m[i], b.m[i]);
    return tmp;
}

template <>
inline SIMD vmax(SIMD a, SIMD b)
{
    SIMD tmp;
    for (int i = 0; i < 2; ++i)
        tmp.m[i] = _mm256_max_pd(a.m[i], b.m[i]);
    return tmp;
}

template <>
inline SIMD vmax(SIMD a, SIMD b)
{
    SIMD tmp;
    for (int i = 0; i < 2; ++i)
        tmp.m[i] = _mm256_max_epi8(a.m[i], b.m[i]);
    return tmp;
}

template <>
inline SIMD vmax(SIMD a, SIMD b)
{
    SIMD tmp;
    for (int i = 0; i < 2; ++i)
        tmp.m[i] = _mm256_max_epi16(a.m[i], b.m[i]);
    return tmp;
}

template <>
inline SIMD vmax(SIMD a, SIMD b)
{
    SIMD tmp;
    for (int i = 0; i < 2; ++i)
        tmp.m[i] = _mm256_max_epi32(a.m[i], b.m[i]);
    return tmp;
}

// vclamp: min(max(x, a), b) per lane, with the scalar bounds a (lower) and
// b (upper) broadcast to all lanes.
template <>
inline SIMD vclamp(SIMD x, float a, float b)
{
    SIMD tmp;
    for (int i = 0; i < 2; ++i)
        tmp.m[i] = _mm256_min_ps(_mm256_max_ps(x.m[i], _mm256_set1_ps(a)), _mm256_set1_ps(b));
    return tmp;
}

template <>
inline SIMD vclamp(SIMD x, double a, double b)
{
    SIMD tmp;
    for (int i = 0; i < 2; ++i)
        tmp.m[i] = _mm256_min_pd(_mm256_max_pd(x.m[i], _mm256_set1_pd(a)), _mm256_set1_pd(b));
    return tmp;
}

template <>
inline SIMD vclamp(SIMD x, int8_t a, int8_t b)
{
    SIMD tmp;
    for (int i = 0; i < 2; ++i)
        tmp.m[i] = _mm256_min_epi8(_mm256_max_epi8(x.m[i], _mm256_set1_epi8(a)), _mm256_set1_epi8(b));
    return tmp;
}

template <>
inline SIMD vclamp(SIMD x, int16_t a, int16_t b)
{
    SIMD tmp;
    for (int i = 0; i < 2; ++i)
        tmp.m[i] = _mm256_min_epi16(_mm256_max_epi16(x.m[i], _mm256_set1_epi16(a)), _mm256_set1_epi16(b));
    return tmp;
}

template <>
inline SIMD vclamp(SIMD x, int32_t a, int32_t b)
{
    SIMD tmp;
    for (int i = 0; i < 2; ++i)
        tmp.m[i] = _mm256_min_epi32(_mm256_max_epi32(x.m[i], _mm256_set1_epi32(a)), _mm256_set1_epi32(b));
    return tmp;
}

// 64-bit clamp. _mm256_min_epi64 / _mm256_max_epi64 require AVX-512F+VL and
// are therefore not usable in an AVX2-only build. The same result is
// produced with the AVX2 signed compare _mm256_cmpgt_epi64 and a byte blend
// (the compare mask is all-ones or all-zeros per 64-bit lane, so every byte
// of a lane selects the same source).
template <>
inline SIMD vclamp(SIMD x, int64_t a, int64_t b)
{
    SIMD tmp;
    const __m256i lo = _mm256_set1_epi64x(a);
    const __m256i hi = _mm256_set1_epi64x(b);
    for (int i = 0; i < 2; ++i) {
        // max(x, lo): take lo wherever lo > x
        const __m256i raised = _mm256_blendv_epi8(x.m[i], lo, _mm256_cmpgt_epi64(lo, x.m[i]));
        // min(raised, hi): take hi wherever raised > hi
        tmp.m[i] = _mm256_blendv_epi8(raised, hi, _mm256_cmpgt_epi64(raised, hi));
    }
    return tmp;
}

// vshuf (byte lanes): result byte = a[b] with b indexing the 64 bytes held in
// a.m[0] (0..31) and a.m[1] (32..63).
// _mm256_shuffle_epi8 only indexes within a 128-bit lane and writes zero when
// the index byte has its top bit set, so each 16-byte window of `a` is
// broadcast to both lanes (_mm256_permute2x128_si256 with 0 or 17) and looked
// up with an index vector (c, d, e, f) that, for indices 0..63, has the top
// bit set outside that window. The four partial results are OR-ed together.
// Indices outside 0..63 are not analyzed here.
template <>
inline SIMD vshuf(SIMD a, SIMD b)
{
    SIMD tmp;
    for (int i = 0; i < 2; ++i) {
        __m256i c = _mm256_or_si256(b.m[i], _mm256_cmpgt_epi8(b.m[i], _mm256_set1_epi8(15)));
        __m256i d = _mm256_or_si256(_mm256_sub_epi8(b.m[i], _mm256_set1_epi8(16)), _mm256_cmpgt_epi8(b.m[i], _mm256_set1_epi8(31)));
        __m256i e = _mm256_or_si256(_mm256_sub_epi8(b.m[i], _mm256_set1_epi8(32)), _mm256_cmpgt_epi8(b.m[i], _mm256_set1_epi8(47)));
        __m256i f = _mm256_sub_epi8(b.m[i], _mm256_set1_epi8(48));
        __m256i g = _mm256_or_si256(
            _mm256_shuffle_epi8(_mm256_permute2x128_si256(a.m[0], a.m[0], 0), c),
            _mm256_shuffle_epi8(_mm256_permute2x128_si256(a.m[0], a.m[0], 17), d));
        __m256i h = _mm256_or_si256(
            _mm256_shuffle_epi8(_mm256_permute2x128_si256(a.m[1], a.m[1], 0), e),
            _mm256_shuffle_epi8(_mm256_permute2x128_si256(a.m[1], a.m[1], 17), f));
        tmp.m[i] = _mm256_or_si256(g, h);
    }
    return tmp;
}

// Second byte-lane vshuf specialization; the body is the same as above.
template <>
inline SIMD vshuf(SIMD a, SIMD b)
{
    SIMD tmp;
    for (int i = 0; i < 2; ++i) {
        __m256i c = _mm256_or_si256(b.m[i], _mm256_cmpgt_epi8(b.m[i], _mm256_set1_epi8(15)));
        __m256i d = _mm256_or_si256(_mm256_sub_epi8(b.m[i], _mm256_set1_epi8(16)), _mm256_cmpgt_epi8(b.m[i], _mm256_set1_epi8(31)));
        __m256i e = _mm256_or_si256(_mm256_sub_epi8(b.m[i], _mm256_set1_epi8(32)), _mm256_cmpgt_epi8(b.m[i], _mm256_set1_epi8(47)));
        __m256i f = _mm256_sub_epi8(b.m[i], _mm256_set1_epi8(48));
        __m256i g = _mm256_or_si256(
            _mm256_shuffle_epi8(_mm256_permute2x128_si256(a.m[0], a.m[0], 0), c),
            _mm256_shuffle_epi8(_mm256_permute2x128_si256(a.m[0], a.m[0], 17), d));
        __m256i h = _mm256_or_si256(
            _mm256_shuffle_epi8(_mm256_permute2x128_si256(a.m[1], a.m[1], 0), e),
            _mm256_shuffle_epi8(_mm256_permute2x128_si256(a.m[1], a.m[1], 17), f));
        tmp.m[i] = _mm256_or_si256(g, h);
    }
    return tmp;
}

// vshuf (32-bit integer lanes): indices 0..7 pick from a.m[0], 8..15 from
// a.m[1]. Both halves are permuted and each result is kept only where the
// signed compare says the index belongs to that half (8 > b, resp. b > 7).
template <>
inline SIMD vshuf(SIMD a, SIMD b)
{
    SIMD tmp;
    for (int i = 0; i < 2; ++i)
        tmp.m[i] = _mm256_or_si256(
            _mm256_and_si256(_mm256_permutevar8x32_epi32(a.m[0], b.m[i]),
                _mm256_cmpgt_epi32(_mm256_set1_epi32(8), b.m[i])),
            _mm256_and_si256(_mm256_permutevar8x32_epi32(a.m[1], _mm256_sub_epi32(b.m[i], _mm256_set1_epi32(8))),
                _mm256_cmpgt_epi32(b.m[i], _mm256_set1_epi32(7))));
    return tmp;
}

// vshuf (32-bit float lanes): same selection scheme as the integer variant,
// using the ps permute and the integer compare masks cast to __m256.
template <>
inline SIMD vshuf(SIMD a, SIMD b)
{
    SIMD tmp;
    for (int i = 0; i < 2; ++i)
        tmp.m[i] = _mm256_or_ps(
            _mm256_and_ps(_mm256_permutevar8x32_ps(a.m[0], b.m[i]),
                (__m256)_mm256_cmpgt_epi32(_mm256_set1_epi32(8), b.m[i])),
            _mm256_and_ps(_mm256_permutevar8x32_ps(a.m[1], _mm256_sub_epi32(b.m[i], _mm256_set1_epi32(8))),
                (__m256)_mm256_cmpgt_epi32(b.m[i], _mm256_set1_epi32(7))));
    return tmp;
}