diff --git a/simd.hh b/simd.hh index 4a1fd28..e4ffa25 100644 --- a/simd.hh +++ b/simd.hh @@ -1502,6 +1502,9 @@ static inline SIMD vshuf(SIMD a, SIMD +*/ + +#pragma once + +#include + +template <> +union SIMD +{ + static const int SIZE = 8; + typedef float value_type; + typedef uint32_t uint_type; + __m128 m[2]; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template <> +union SIMD +{ + static const int SIZE = 4; + typedef double value_type; + typedef uint64_t uint_type; + __m128d m[2]; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template <> +union SIMD +{ + static const int SIZE = 32; + typedef int8_t value_type; + typedef uint8_t uint_type; + __m128i m[2]; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template <> +union SIMD +{ + static const int SIZE = 16; + typedef int16_t value_type; + typedef uint16_t uint_type; + __m128i m[2]; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template <> +union SIMD +{ + static const int SIZE = 8; + typedef int32_t value_type; + typedef uint32_t uint_type; + __m128i m[2]; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template <> +union SIMD +{ + static const int SIZE = 4; + typedef int64_t value_type; + typedef uint64_t uint_type; + __m128i m[2]; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template <> +union SIMD +{ + static const int SIZE = 32; + typedef uint8_t value_type; + typedef uint8_t uint_type; + __m128i m[2]; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template <> +union SIMD +{ + static const int SIZE = 16; + typedef uint16_t value_type; + typedef uint16_t uint_type; + __m128i m[2]; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template <> +union SIMD +{ + static const int SIZE = 8; + typedef uint32_t value_type; + typedef uint32_t uint_type; + __m128i m[2]; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template <> +union SIMD +{ + static const int SIZE = 4; + typedef uint64_t value_type; + typedef uint64_t uint_type; + __m128i m[2]; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = (__m128)a.m[i]; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = (__m128i)a.m[i]; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = (__m128d)a.m[i]; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = (__m128i)a.m[i]; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = (__m128i)a.m[i]; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = (__m128i)a.m[i]; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = (__m128i)a.m[i]; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = (__m128i)a.m[i]; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = (__m128i)a.m[i]; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = (__m128i)a.m[i]; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = (__m128i)a.m[i]; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = (__m128i)a.m[i]; + return tmp; +} + +template <> +inline SIMD vdup>(float a) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_set1_ps(a); + return tmp; +} + +template <> +inline SIMD vdup>(double a) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_set1_pd(a); + return tmp; +} + +template <> +inline SIMD vdup>(int8_t a) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_set1_epi8(a); + return tmp; +} + +template <> +inline SIMD vdup>(int16_t a) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_set1_epi16(a); + return tmp; +} + +template <> +inline SIMD vdup>(int32_t a) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_set1_epi32(a); + return tmp; +} + +template <> +inline SIMD vdup>(int64_t a) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_set1_epi64x(a); + return tmp; +} + +template <> +inline SIMD vzero() +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_setzero_ps(); + return tmp; +} + +template <> +inline SIMD vzero() +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_setzero_pd(); + return tmp; +} + +template <> +inline SIMD vzero() +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_setzero_si128(); + return tmp; +} + +template <> +inline SIMD vzero() +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_setzero_si128(); + return tmp; +} + +template <> +inline SIMD vzero() +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_setzero_si128(); + return tmp; +} + +template <> +inline SIMD vzero() +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_setzero_si128(); + return tmp; +} + +template <> +inline SIMD vadd(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_add_ps(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vadd(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_add_pd(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vadd(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_add_epi8(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vadd(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_add_epi16(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vadd(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_add_epi32(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vadd(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_add_epi64(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vqadd(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_adds_epi8(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vqadd(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_adds_epi16(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vsub(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_sub_ps(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vsub(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_sub_pd(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vsub(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_sub_epi8(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vsub(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_sub_epi16(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vsub(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_sub_epi32(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vsub(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_sub_epi64(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vqsub(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_subs_epi8(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vqsub(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_subs_epi16(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vqsub(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_subs_epu8(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vqsub(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_subs_epu16(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vmul(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_mul_ps(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vmul(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_mul_pd(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vabs(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_andnot_ps(_mm_set1_ps(-0.f), a.m[i]); + return tmp; +} + +template <> +inline SIMD vabs(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_andnot_pd(_mm_set1_pd(-0.), a.m[i]); + return tmp; +} + +template <> +inline SIMD vqabs(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_abs_epi8(_mm_max_epi8(a.m[i], _mm_set1_epi8(-INT8_MAX))); + return tmp; +} + +template <> +inline SIMD vqabs(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_abs_epi16(_mm_max_epi16(a.m[i], _mm_set1_epi16(-INT16_MAX))); + return tmp; +} + +template <> +inline SIMD vqabs(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_abs_epi32(_mm_max_epi32(a.m[i], _mm_set1_epi32(-INT32_MAX))); + return tmp; +} + +template <> +inline SIMD vsignum(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_andnot_ps( + _mm_cmpeq_ps(a.m[i], _mm_setzero_ps()), + _mm_or_ps(_mm_set1_ps(1.f), _mm_and_ps(_mm_set1_ps(-0.f), a.m[i]))); + return tmp; +} + +template <> +inline SIMD vsignum(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_andnot_pd( + _mm_cmpeq_pd(a.m[i], _mm_setzero_pd()), + _mm_or_pd(_mm_set1_pd(1.), _mm_and_pd(_mm_set1_pd(-0.), a.m[i]))); + return tmp; +} + +template <> +inline SIMD vsignum(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_sign_epi8(_mm_set1_epi8(1), a.m[i]); + return tmp; +} + +template <> +inline SIMD vsignum(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_sign_epi16(_mm_set1_epi16(1), a.m[i]); + return tmp; +} + +template <> +inline SIMD vsignum(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_sign_epi32(_mm_set1_epi32(1), a.m[i]); + return tmp; +} + +template <> +inline SIMD vsign(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_andnot_ps( + _mm_cmpeq_ps(b.m[i], _mm_setzero_ps()), + _mm_xor_ps(a.m[i], _mm_and_ps(_mm_set1_ps(-0.f), b.m[i]))); + return tmp; +} + +template <> +inline SIMD vsign(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_andnot_pd( + _mm_cmpeq_pd(b.m[i], _mm_setzero_pd()), + _mm_xor_pd(a.m[i], _mm_and_pd(_mm_set1_pd(-0.), b.m[i]))); + return tmp; +} + +template <> +inline SIMD vsign(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_sign_epi8(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vsign(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_sign_epi16(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vsign(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_sign_epi32(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vcopysign(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_or_ps( + _mm_andnot_ps(_mm_set1_ps(-0.f), a.m[i]), + _mm_and_ps(_mm_set1_ps(-0.f), b.m[i])); + return tmp; +} + +template <> +inline SIMD vcopysign(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_or_pd( + _mm_andnot_pd(_mm_set1_pd(-0.), a.m[i]), + _mm_and_pd(_mm_set1_pd(-0.), b.m[i])); + return tmp; +} + +template <> +inline SIMD vorr(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_or_si128(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vorr(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_or_si128(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vorr(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_or_si128(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vorr(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_or_si128(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vand(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_and_si128(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vand(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_and_si128(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vand(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_and_si128(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vand(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_and_si128(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD veor(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_xor_si128(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD veor(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_xor_si128(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD veor(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_xor_si128(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD veor(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_xor_si128(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vbic(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_andnot_si128(b.m[i], a.m[i]); + return tmp; +} + +template <> +inline SIMD vbic(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_andnot_si128(b.m[i], a.m[i]); + return tmp; +} + +template <> +inline SIMD vbic(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_andnot_si128(b.m[i], a.m[i]); + return tmp; +} + +template <> +inline SIMD vbic(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_andnot_si128(b.m[i], a.m[i]); + return tmp; +} + +template <> +inline SIMD vbsl(SIMD a, SIMD b, SIMD c) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_or_si128(_mm_and_si128(a.m[i], b.m[i]), _mm_andnot_si128(a.m[i], c.m[i])); + return tmp; +} + +template <> +inline SIMD vbsl(SIMD a, SIMD b, SIMD c) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_or_si128(_mm_and_si128(a.m[i], b.m[i]), _mm_andnot_si128(a.m[i], c.m[i])); + return tmp; +} + +template <> +inline SIMD vbsl(SIMD a, SIMD b, SIMD c) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_or_si128(_mm_and_si128(a.m[i], b.m[i]), _mm_andnot_si128(a.m[i], c.m[i])); + return tmp; +} + +template <> +inline SIMD vbsl(SIMD a, SIMD b, SIMD c) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_or_si128(_mm_and_si128(a.m[i], b.m[i]), _mm_andnot_si128(a.m[i], c.m[i])); + return tmp; +} + +template <> +inline SIMD vceqz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = (__m128i)_mm_cmpeq_ps(a.m[i], _mm_setzero_ps()); + return tmp; +} + +template <> +inline SIMD vceqz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = (__m128i)_mm_cmpeq_pd(a.m[i], _mm_setzero_pd()); + return tmp; +} + +template <> +inline SIMD vceqz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_cmpeq_epi8(a.m[i], _mm_setzero_si128()); + return tmp; +} + +template <> +inline SIMD vceqz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_cmpeq_epi16(a.m[i], _mm_setzero_si128()); + return tmp; +} + +template <> +inline SIMD vceqz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_cmpeq_epi32(a.m[i], _mm_setzero_si128()); + return tmp; +} + +template <> +inline SIMD vceqz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_cmpeq_epi64(a.m[i], _mm_setzero_si128()); + return tmp; +} + +template <> +inline SIMD vceq(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = (__m128i)_mm_cmpeq_ps(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vceq(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = (__m128i)_mm_cmpeq_pd(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vceq(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_cmpeq_epi8(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vceq(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_cmpeq_epi16(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vceq(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_cmpeq_epi32(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vceq(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_cmpeq_epi64(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vcgtz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = (__m128i)_mm_cmpgt_ps(a.m[i], _mm_setzero_ps()); + return tmp; +} + +template <> +inline SIMD vcgtz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = (__m128i)_mm_cmpgt_pd(a.m[i], _mm_setzero_pd()); + return tmp; +} + +template <> +inline SIMD vcgtz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_cmpgt_epi8(a.m[i], _mm_setzero_si128()); + return tmp; +} + +template <> +inline SIMD vcgtz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_cmpgt_epi16(a.m[i], _mm_setzero_si128()); + return tmp; +} + +template <> +inline SIMD vcgtz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_cmpgt_epi32(a.m[i], _mm_setzero_si128()); + return tmp; +} + +template <> +inline SIMD vcgtz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_cmpgt_epi64(a.m[i], _mm_setzero_si128()); + return tmp; +} + +template <> +inline SIMD vcltz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = (__m128i)_mm_cmplt_ps(a.m[i], _mm_setzero_ps()); + return tmp; +} + +template <> +inline SIMD vcltz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = (__m128i)_mm_cmplt_pd(a.m[i], _mm_setzero_pd()); + return tmp; +} + +template <> +inline SIMD vcltz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_cmpgt_epi8(_mm_setzero_si128(), a.m[i]); + return tmp; +} + +template <> +inline SIMD vcltz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_cmpgt_epi16(_mm_setzero_si128(), a.m[i]); + return tmp; +} + +template <> +inline SIMD vcltz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_cmpgt_epi32(_mm_setzero_si128(), a.m[i]); + return tmp; +} + +template <> +inline SIMD vcltz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_cmpgt_epi64(_mm_setzero_si128(), a.m[i]); + return tmp; +} + +template <> +inline SIMD vclez(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = (__m128i)_mm_cmple_ps(a.m[i], _mm_setzero_ps()); + return tmp; +} + +template <> +inline SIMD vclez(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = (__m128i)_mm_cmple_pd(a.m[i], _mm_setzero_pd()); + return tmp; +} + +template <> +inline SIMD vclez(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_or_si128( + _mm_cmpeq_epi8(a.m[i], _mm_setzero_si128()), + _mm_cmpgt_epi8(_mm_setzero_si128(), a.m[i])); + return tmp; +} + +template <> +inline SIMD vclez(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_or_si128( + _mm_cmpeq_epi16(a.m[i], _mm_setzero_si128()), + _mm_cmpgt_epi16(_mm_setzero_si128(), a.m[i])); + return tmp; +} + +template <> +inline SIMD vclez(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_or_si128( + _mm_cmpeq_epi32(a.m[i], _mm_setzero_si128()), + _mm_cmpgt_epi32(_mm_setzero_si128(), a.m[i])); + return tmp; +} + +template <> +inline SIMD vclez(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_or_si128( + _mm_cmpeq_epi64(a.m[i], _mm_setzero_si128()), + _mm_cmpgt_epi64(_mm_setzero_si128(), a.m[i])); + return tmp; +} + +template <> +inline SIMD vmin(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_min_ps(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vmin(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_min_pd(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vmin(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_min_epi8(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vmin(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_min_epi16(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vmin(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_min_epi32(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vmax(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_max_ps(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vmax(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_max_pd(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vmax(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_max_epi8(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vmax(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_max_epi16(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vmax(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_max_epi32(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vclamp(SIMD x, float a, float b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_min_ps(_mm_max_ps(x.m[i], _mm_set1_ps(a)), _mm_set1_ps(b)); + return tmp; +} + +template <> +inline SIMD vclamp(SIMD x, double a, double b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_min_pd(_mm_max_pd(x.m[i], _mm_set1_pd(a)), _mm_set1_pd(b)); + return tmp; +} + +template <> +inline SIMD vclamp(SIMD x, int8_t a, int8_t b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_min_epi8(_mm_max_epi8(x.m[i], _mm_set1_epi8(a)), _mm_set1_epi8(b)); + return tmp; +} + +template <> +inline SIMD vclamp(SIMD x, int16_t a, int16_t b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_min_epi16(_mm_max_epi16(x.m[i], _mm_set1_epi16(a)), _mm_set1_epi16(b)); + return tmp; +} + +template <> +inline SIMD vclamp(SIMD x, int32_t a, int32_t b) +{ + SIMD tmp; + for (int i = 0; i < 2; ++i) + tmp.m[i] = _mm_min_epi32(_mm_max_epi32(x.m[i], _mm_set1_epi32(a)), _mm_set1_epi32(b)); + return tmp; +} + diff --git a/sse4_1_quadruple.hh b/sse4_1_quadruple.hh new file mode 100644 index 0000000..71d8fa9 --- /dev/null +++ b/sse4_1_quadruple.hh @@ -0,0 +1,1274 @@ +/* +Intel SSE4.1 acceleration times four + +Copyright 2024 Ahmet Inan +*/ + +#pragma once + +#include + +template <> +union SIMD +{ + static const int SIZE = 16; + typedef float value_type; + typedef uint32_t uint_type; + __m128 m[4]; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template <> +union SIMD +{ + static const int SIZE = 8; + typedef double value_type; + typedef uint64_t uint_type; + __m128d m[4]; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template <> +union SIMD +{ + static const int SIZE = 64; + typedef int8_t value_type; + typedef uint8_t uint_type; + __m128i m[4]; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template <> +union SIMD +{ + static const int SIZE = 32; + typedef int16_t value_type; + typedef uint16_t uint_type; + __m128i m[4]; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template <> +union SIMD +{ + static const int SIZE = 16; + typedef int32_t value_type; + typedef uint32_t uint_type; + __m128i m[4]; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template <> +union SIMD +{ + static const int SIZE = 8; + typedef int64_t value_type; + typedef uint64_t uint_type; + __m128i m[4]; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template <> +union SIMD +{ + static const int SIZE = 64; + typedef uint8_t value_type; + typedef uint8_t uint_type; + __m128i m[4]; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template <> +union SIMD +{ + static const int SIZE = 32; + typedef uint16_t value_type; + typedef uint16_t uint_type; + __m128i m[4]; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template <> +union SIMD +{ + static const int SIZE = 16; + typedef uint32_t value_type; + typedef uint32_t uint_type; + __m128i m[4]; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template <> +union SIMD +{ + static const int SIZE = 8; + typedef uint64_t value_type; + typedef uint64_t uint_type; + __m128i m[4]; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = (__m128)a.m[i]; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = (__m128i)a.m[i]; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = (__m128d)a.m[i]; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = (__m128i)a.m[i]; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = (__m128i)a.m[i]; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = (__m128i)a.m[i]; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = (__m128i)a.m[i]; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = (__m128i)a.m[i]; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = (__m128i)a.m[i]; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = (__m128i)a.m[i]; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = (__m128i)a.m[i]; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = (__m128i)a.m[i]; + return tmp; +} + +template <> +inline SIMD vdup>(float a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_set1_ps(a); + return tmp; +} + +template <> +inline SIMD vdup>(double a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_set1_pd(a); + return tmp; +} + +template <> +inline SIMD vdup>(int8_t a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_set1_epi8(a); + return tmp; +} + +template <> +inline SIMD vdup>(int16_t a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_set1_epi16(a); + return tmp; +} + +template <> +inline SIMD vdup>(int32_t a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_set1_epi32(a); + return tmp; +} + +template <> +inline SIMD vdup>(int64_t a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_set1_epi64x(a); + return tmp; +} + +template <> +inline SIMD vzero() +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_setzero_ps(); + return tmp; +} + +template <> +inline SIMD vzero() +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_setzero_pd(); + return tmp; +} + +template <> +inline SIMD vzero() +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_setzero_si128(); + return tmp; +} + +template <> +inline SIMD vzero() +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_setzero_si128(); + return tmp; +} + +template <> +inline SIMD vzero() +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_setzero_si128(); + return tmp; +} + +template <> +inline SIMD vzero() +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_setzero_si128(); + return tmp; +} + +template <> +inline SIMD vadd(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_add_ps(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vadd(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_add_pd(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vadd(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_add_epi8(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vadd(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_add_epi16(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vadd(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_add_epi32(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vadd(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_add_epi64(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vqadd(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_adds_epi8(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vqadd(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_adds_epi16(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vsub(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_sub_ps(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vsub(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_sub_pd(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vsub(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_sub_epi8(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vsub(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_sub_epi16(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vsub(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_sub_epi32(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vsub(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_sub_epi64(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vqsub(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_subs_epi8(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vqsub(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_subs_epi16(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vqsub(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_subs_epu8(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vqsub(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_subs_epu16(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vmul(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_mul_ps(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vmul(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_mul_pd(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vabs(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_andnot_ps(_mm_set1_ps(-0.f), a.m[i]); + return tmp; +} + +template <> +inline SIMD vabs(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_andnot_pd(_mm_set1_pd(-0.), a.m[i]); + return tmp; +} + +template <> +inline SIMD vqabs(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_abs_epi8(_mm_max_epi8(a.m[i], _mm_set1_epi8(-INT8_MAX))); + return tmp; +} + +template <> +inline SIMD vqabs(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_abs_epi16(_mm_max_epi16(a.m[i], _mm_set1_epi16(-INT16_MAX))); + return tmp; +} + +template <> +inline SIMD vqabs(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_abs_epi32(_mm_max_epi32(a.m[i], _mm_set1_epi32(-INT32_MAX))); + return tmp; +} + +template <> +inline SIMD vsignum(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_andnot_ps( + _mm_cmpeq_ps(a.m[i], _mm_setzero_ps()), + _mm_or_ps(_mm_set1_ps(1.f), _mm_and_ps(_mm_set1_ps(-0.f), a.m[i]))); + return tmp; +} + +template <> +inline SIMD vsignum(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_andnot_pd( + _mm_cmpeq_pd(a.m[i], _mm_setzero_pd()), + _mm_or_pd(_mm_set1_pd(1.), _mm_and_pd(_mm_set1_pd(-0.), a.m[i]))); + return tmp; +} + +template <> +inline SIMD vsignum(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_sign_epi8(_mm_set1_epi8(1), a.m[i]); + return tmp; +} + +template <> +inline SIMD vsignum(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_sign_epi16(_mm_set1_epi16(1), a.m[i]); + return tmp; +} + +template <> +inline SIMD vsignum(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_sign_epi32(_mm_set1_epi32(1), a.m[i]); + return tmp; +} + +template <> +inline SIMD vsign(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_andnot_ps( + _mm_cmpeq_ps(b.m[i], _mm_setzero_ps()), + _mm_xor_ps(a.m[i], _mm_and_ps(_mm_set1_ps(-0.f), b.m[i]))); + return tmp; +} + +template <> +inline SIMD vsign(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_andnot_pd( + _mm_cmpeq_pd(b.m[i], _mm_setzero_pd()), + _mm_xor_pd(a.m[i], _mm_and_pd(_mm_set1_pd(-0.), b.m[i]))); + return tmp; +} + +template <> +inline SIMD vsign(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_sign_epi8(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vsign(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_sign_epi16(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vsign(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_sign_epi32(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vcopysign(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_or_ps( + _mm_andnot_ps(_mm_set1_ps(-0.f), a.m[i]), + _mm_and_ps(_mm_set1_ps(-0.f), b.m[i])); + return tmp; +} + +template <> +inline SIMD vcopysign(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_or_pd( + _mm_andnot_pd(_mm_set1_pd(-0.), a.m[i]), + _mm_and_pd(_mm_set1_pd(-0.), b.m[i])); + return tmp; +} + +template <> +inline SIMD vorr(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_or_si128(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vorr(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_or_si128(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vorr(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_or_si128(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vorr(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_or_si128(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vand(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_and_si128(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vand(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_and_si128(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vand(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_and_si128(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vand(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_and_si128(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD veor(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_xor_si128(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD veor(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_xor_si128(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD veor(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_xor_si128(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD veor(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_xor_si128(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vbic(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_andnot_si128(b.m[i], a.m[i]); + return tmp; +} + +template <> +inline SIMD vbic(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_andnot_si128(b.m[i], a.m[i]); + return tmp; +} + +template <> +inline SIMD vbic(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_andnot_si128(b.m[i], a.m[i]); + return tmp; +} + +template <> +inline SIMD vbic(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_andnot_si128(b.m[i], a.m[i]); + return tmp; +} + +template <> +inline SIMD vbsl(SIMD a, SIMD b, SIMD c) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_or_si128(_mm_and_si128(a.m[i], b.m[i]), _mm_andnot_si128(a.m[i], c.m[i])); + return tmp; +} + +template <> +inline SIMD vbsl(SIMD a, SIMD b, SIMD c) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_or_si128(_mm_and_si128(a.m[i], b.m[i]), _mm_andnot_si128(a.m[i], c.m[i])); + return tmp; +} + +template <> +inline SIMD vbsl(SIMD a, SIMD b, SIMD c) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_or_si128(_mm_and_si128(a.m[i], b.m[i]), _mm_andnot_si128(a.m[i], c.m[i])); + return tmp; +} + +template <> +inline SIMD vbsl(SIMD a, SIMD b, SIMD c) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_or_si128(_mm_and_si128(a.m[i], b.m[i]), _mm_andnot_si128(a.m[i], c.m[i])); + return tmp; +} + +template <> +inline SIMD vceqz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = (__m128i)_mm_cmpeq_ps(a.m[i], _mm_setzero_ps()); + return tmp; +} + +template <> +inline SIMD vceqz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = (__m128i)_mm_cmpeq_pd(a.m[i], _mm_setzero_pd()); + return tmp; +} + +template <> +inline SIMD vceqz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_cmpeq_epi8(a.m[i], _mm_setzero_si128()); + return tmp; +} + +template <> +inline SIMD vceqz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_cmpeq_epi16(a.m[i], _mm_setzero_si128()); + return tmp; +} + +template <> +inline SIMD vceqz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_cmpeq_epi32(a.m[i], _mm_setzero_si128()); + return tmp; +} + +template <> +inline SIMD vceqz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_cmpeq_epi64(a.m[i], _mm_setzero_si128()); + return tmp; +} + +template <> +inline SIMD vceq(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = (__m128i)_mm_cmpeq_ps(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vceq(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = (__m128i)_mm_cmpeq_pd(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vceq(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_cmpeq_epi8(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vceq(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_cmpeq_epi16(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vceq(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_cmpeq_epi32(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vceq(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_cmpeq_epi64(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vcgtz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = (__m128i)_mm_cmpgt_ps(a.m[i], _mm_setzero_ps()); + return tmp; +} + +template <> +inline SIMD vcgtz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = (__m128i)_mm_cmpgt_pd(a.m[i], _mm_setzero_pd()); + return tmp; +} + +template <> +inline SIMD vcgtz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_cmpgt_epi8(a.m[i], _mm_setzero_si128()); + return tmp; +} + +template <> +inline SIMD vcgtz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_cmpgt_epi16(a.m[i], _mm_setzero_si128()); + return tmp; +} + +template <> +inline SIMD vcgtz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_cmpgt_epi32(a.m[i], _mm_setzero_si128()); + return tmp; +} + +template <> +inline SIMD vcgtz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_cmpgt_epi64(a.m[i], _mm_setzero_si128()); + return tmp; +} + +template <> +inline SIMD vcltz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = (__m128i)_mm_cmplt_ps(a.m[i], _mm_setzero_ps()); + return tmp; +} + +template <> +inline SIMD vcltz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = (__m128i)_mm_cmplt_pd(a.m[i], _mm_setzero_pd()); + return tmp; +} + +template <> +inline SIMD vcltz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_cmpgt_epi8(_mm_setzero_si128(), a.m[i]); + return tmp; +} + +template <> +inline SIMD vcltz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_cmpgt_epi16(_mm_setzero_si128(), a.m[i]); + return tmp; +} + +template <> +inline SIMD vcltz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_cmpgt_epi32(_mm_setzero_si128(), a.m[i]); + return tmp; +} + +template <> +inline SIMD vcltz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_cmpgt_epi64(_mm_setzero_si128(), a.m[i]); + return tmp; +} + +template <> +inline SIMD vclez(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = (__m128i)_mm_cmple_ps(a.m[i], _mm_setzero_ps()); + return tmp; +} + +template <> +inline SIMD vclez(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = (__m128i)_mm_cmple_pd(a.m[i], _mm_setzero_pd()); + return tmp; +} + +template <> +inline SIMD vclez(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_or_si128( + _mm_cmpeq_epi8(a.m[i], _mm_setzero_si128()), + _mm_cmpgt_epi8(_mm_setzero_si128(), a.m[i])); + return tmp; +} + +template <> +inline SIMD vclez(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_or_si128( + _mm_cmpeq_epi16(a.m[i], _mm_setzero_si128()), + _mm_cmpgt_epi16(_mm_setzero_si128(), a.m[i])); + return tmp; +} + +template <> +inline SIMD vclez(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_or_si128( + _mm_cmpeq_epi32(a.m[i], _mm_setzero_si128()), + _mm_cmpgt_epi32(_mm_setzero_si128(), a.m[i])); + return tmp; +} + +template <> +inline SIMD vclez(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_or_si128( + _mm_cmpeq_epi64(a.m[i], _mm_setzero_si128()), + _mm_cmpgt_epi64(_mm_setzero_si128(), a.m[i])); + return tmp; +} + +template <> +inline SIMD vmin(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_min_ps(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vmin(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_min_pd(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vmin(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_min_epi8(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vmin(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_min_epi16(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vmin(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_min_epi32(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vmax(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_max_ps(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vmax(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_max_pd(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vmax(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_max_epi8(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vmax(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_max_epi16(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vmax(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_max_epi32(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vclamp(SIMD x, float a, float b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_min_ps(_mm_max_ps(x.m[i], _mm_set1_ps(a)), _mm_set1_ps(b)); + return tmp; +} + +template <> +inline SIMD vclamp(SIMD x, double a, double b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_min_pd(_mm_max_pd(x.m[i], _mm_set1_pd(a)), _mm_set1_pd(b)); + return tmp; +} + +template <> +inline SIMD vclamp(SIMD x, int8_t a, int8_t b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_min_epi8(_mm_max_epi8(x.m[i], _mm_set1_epi8(a)), _mm_set1_epi8(b)); + return tmp; +} + +template <> +inline SIMD vclamp(SIMD x, int16_t a, int16_t b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_min_epi16(_mm_max_epi16(x.m[i], _mm_set1_epi16(a)), _mm_set1_epi16(b)); + return tmp; +} + +template <> +inline SIMD vclamp(SIMD x, int32_t a, int32_t b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = _mm_min_epi32(_mm_max_epi32(x.m[i], _mm_set1_epi32(a)), _mm_set1_epi32(b)); + return tmp; +} + diff --git a/sse4_1_triple.hh b/sse4_1_triple.hh new file mode 100644 index 0000000..5b5818d --- /dev/null +++ b/sse4_1_triple.hh @@ -0,0 +1,1274 @@ +/* +Intel SSE4.1 acceleration times three + +Copyright 2024 Ahmet Inan +*/ + +#pragma once + +#include + +template <> +union SIMD +{ + static const int SIZE = 12; + typedef float value_type; + typedef uint32_t uint_type; + __m128 m[3]; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template <> +union SIMD +{ + static const int SIZE = 6; + typedef double value_type; + typedef uint64_t uint_type; + __m128d m[3]; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template <> +union SIMD +{ + static const int SIZE = 48; + typedef int8_t value_type; + typedef uint8_t uint_type; + __m128i m[3]; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template <> +union SIMD +{ + static const int SIZE = 24; + typedef int16_t value_type; + typedef uint16_t uint_type; + __m128i m[3]; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template <> +union SIMD +{ + static const int SIZE = 12; + typedef int32_t value_type; + typedef uint32_t uint_type; + __m128i m[3]; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template <> +union SIMD +{ + static const int SIZE = 6; + typedef int64_t value_type; + typedef uint64_t uint_type; + __m128i m[3]; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template <> +union SIMD +{ + static const int SIZE = 48; + typedef uint8_t value_type; + typedef uint8_t uint_type; + __m128i m[3]; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template <> +union SIMD +{ + static const int SIZE = 24; + typedef uint16_t value_type; + typedef uint16_t uint_type; + __m128i m[3]; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template <> +union SIMD +{ + static const int SIZE = 12; + typedef uint32_t value_type; + typedef uint32_t uint_type; + __m128i m[3]; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template <> +union SIMD +{ + static const int SIZE = 6; + typedef uint64_t value_type; + typedef uint64_t uint_type; + __m128i m[3]; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = (__m128)a.m[i]; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = (__m128i)a.m[i]; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = (__m128d)a.m[i]; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = (__m128i)a.m[i]; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = (__m128i)a.m[i]; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = (__m128i)a.m[i]; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = (__m128i)a.m[i]; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = (__m128i)a.m[i]; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = (__m128i)a.m[i]; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = (__m128i)a.m[i]; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = (__m128i)a.m[i]; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = (__m128i)a.m[i]; + return tmp; +} + +template <> +inline SIMD vdup>(float a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_set1_ps(a); + return tmp; +} + +template <> +inline SIMD vdup>(double a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_set1_pd(a); + return tmp; +} + +template <> +inline SIMD vdup>(int8_t a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_set1_epi8(a); + return tmp; +} + +template <> +inline SIMD vdup>(int16_t a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_set1_epi16(a); + return tmp; +} + +template <> +inline SIMD vdup>(int32_t a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_set1_epi32(a); + return tmp; +} + +template <> +inline SIMD vdup>(int64_t a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_set1_epi64x(a); + return tmp; +} + +template <> +inline SIMD vzero() +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_setzero_ps(); + return tmp; +} + +template <> +inline SIMD vzero() +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_setzero_pd(); + return tmp; +} + +template <> +inline SIMD vzero() +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_setzero_si128(); + return tmp; +} + +template <> +inline SIMD vzero() +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_setzero_si128(); + return tmp; +} + +template <> +inline SIMD vzero() +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_setzero_si128(); + return tmp; +} + +template <> +inline SIMD vzero() +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_setzero_si128(); + return tmp; +} + +template <> +inline SIMD vadd(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_add_ps(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vadd(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_add_pd(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vadd(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_add_epi8(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vadd(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_add_epi16(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vadd(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_add_epi32(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vadd(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_add_epi64(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vqadd(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_adds_epi8(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vqadd(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_adds_epi16(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vsub(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_sub_ps(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vsub(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_sub_pd(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vsub(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_sub_epi8(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vsub(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_sub_epi16(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vsub(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_sub_epi32(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vsub(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_sub_epi64(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vqsub(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_subs_epi8(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vqsub(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_subs_epi16(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vqsub(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_subs_epu8(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vqsub(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_subs_epu16(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vmul(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_mul_ps(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vmul(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_mul_pd(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vabs(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_andnot_ps(_mm_set1_ps(-0.f), a.m[i]); + return tmp; +} + +template <> +inline SIMD vabs(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_andnot_pd(_mm_set1_pd(-0.), a.m[i]); + return tmp; +} + +template <> +inline SIMD vqabs(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_abs_epi8(_mm_max_epi8(a.m[i], _mm_set1_epi8(-INT8_MAX))); + return tmp; +} + +template <> +inline SIMD vqabs(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_abs_epi16(_mm_max_epi16(a.m[i], _mm_set1_epi16(-INT16_MAX))); + return tmp; +} + +template <> +inline SIMD vqabs(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_abs_epi32(_mm_max_epi32(a.m[i], _mm_set1_epi32(-INT32_MAX))); + return tmp; +} + +template <> +inline SIMD vsignum(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_andnot_ps( + _mm_cmpeq_ps(a.m[i], _mm_setzero_ps()), + _mm_or_ps(_mm_set1_ps(1.f), _mm_and_ps(_mm_set1_ps(-0.f), a.m[i]))); + return tmp; +} + +template <> +inline SIMD vsignum(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_andnot_pd( + _mm_cmpeq_pd(a.m[i], _mm_setzero_pd()), + _mm_or_pd(_mm_set1_pd(1.), _mm_and_pd(_mm_set1_pd(-0.), a.m[i]))); + return tmp; +} + +template <> +inline SIMD vsignum(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_sign_epi8(_mm_set1_epi8(1), a.m[i]); + return tmp; +} + +template <> +inline SIMD vsignum(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_sign_epi16(_mm_set1_epi16(1), a.m[i]); + return tmp; +} + +template <> +inline SIMD vsignum(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_sign_epi32(_mm_set1_epi32(1), a.m[i]); + return tmp; +} + +template <> +inline SIMD vsign(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_andnot_ps( + _mm_cmpeq_ps(b.m[i], _mm_setzero_ps()), + _mm_xor_ps(a.m[i], _mm_and_ps(_mm_set1_ps(-0.f), b.m[i]))); + return tmp; +} + +template <> +inline SIMD vsign(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_andnot_pd( + _mm_cmpeq_pd(b.m[i], _mm_setzero_pd()), + _mm_xor_pd(a.m[i], _mm_and_pd(_mm_set1_pd(-0.), b.m[i]))); + return tmp; +} + +template <> +inline SIMD vsign(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_sign_epi8(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vsign(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_sign_epi16(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vsign(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_sign_epi32(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vcopysign(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_or_ps( + _mm_andnot_ps(_mm_set1_ps(-0.f), a.m[i]), + _mm_and_ps(_mm_set1_ps(-0.f), b.m[i])); + return tmp; +} + +template <> +inline SIMD vcopysign(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_or_pd( + _mm_andnot_pd(_mm_set1_pd(-0.), a.m[i]), + _mm_and_pd(_mm_set1_pd(-0.), b.m[i])); + return tmp; +} + +template <> +inline SIMD vorr(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_or_si128(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vorr(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_or_si128(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vorr(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_or_si128(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vorr(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_or_si128(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vand(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_and_si128(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vand(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_and_si128(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vand(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_and_si128(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vand(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_and_si128(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD veor(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_xor_si128(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD veor(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_xor_si128(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD veor(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_xor_si128(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD veor(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_xor_si128(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vbic(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_andnot_si128(b.m[i], a.m[i]); + return tmp; +} + +template <> +inline SIMD vbic(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_andnot_si128(b.m[i], a.m[i]); + return tmp; +} + +template <> +inline SIMD vbic(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_andnot_si128(b.m[i], a.m[i]); + return tmp; +} + +template <> +inline SIMD vbic(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_andnot_si128(b.m[i], a.m[i]); + return tmp; +} + +template <> +inline SIMD vbsl(SIMD a, SIMD b, SIMD c) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_or_si128(_mm_and_si128(a.m[i], b.m[i]), _mm_andnot_si128(a.m[i], c.m[i])); + return tmp; +} + +template <> +inline SIMD vbsl(SIMD a, SIMD b, SIMD c) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_or_si128(_mm_and_si128(a.m[i], b.m[i]), _mm_andnot_si128(a.m[i], c.m[i])); + return tmp; +} + +template <> +inline SIMD vbsl(SIMD a, SIMD b, SIMD c) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_or_si128(_mm_and_si128(a.m[i], b.m[i]), _mm_andnot_si128(a.m[i], c.m[i])); + return tmp; +} + +template <> +inline SIMD vbsl(SIMD a, SIMD b, SIMD c) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_or_si128(_mm_and_si128(a.m[i], b.m[i]), _mm_andnot_si128(a.m[i], c.m[i])); + return tmp; +} + +template <> +inline SIMD vceqz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = (__m128i)_mm_cmpeq_ps(a.m[i], _mm_setzero_ps()); + return tmp; +} + +template <> +inline SIMD vceqz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = (__m128i)_mm_cmpeq_pd(a.m[i], _mm_setzero_pd()); + return tmp; +} + +template <> +inline SIMD vceqz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_cmpeq_epi8(a.m[i], _mm_setzero_si128()); + return tmp; +} + +template <> +inline SIMD vceqz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_cmpeq_epi16(a.m[i], _mm_setzero_si128()); + return tmp; +} + +template <> +inline SIMD vceqz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_cmpeq_epi32(a.m[i], _mm_setzero_si128()); + return tmp; +} + +template <> +inline SIMD vceqz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_cmpeq_epi64(a.m[i], _mm_setzero_si128()); + return tmp; +} + +template <> +inline SIMD vceq(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = (__m128i)_mm_cmpeq_ps(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vceq(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = (__m128i)_mm_cmpeq_pd(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vceq(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_cmpeq_epi8(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vceq(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_cmpeq_epi16(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vceq(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_cmpeq_epi32(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vceq(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_cmpeq_epi64(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vcgtz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = (__m128i)_mm_cmpgt_ps(a.m[i], _mm_setzero_ps()); + return tmp; +} + +template <> +inline SIMD vcgtz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = (__m128i)_mm_cmpgt_pd(a.m[i], _mm_setzero_pd()); + return tmp; +} + +template <> +inline SIMD vcgtz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_cmpgt_epi8(a.m[i], _mm_setzero_si128()); + return tmp; +} + +template <> +inline SIMD vcgtz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_cmpgt_epi16(a.m[i], _mm_setzero_si128()); + return tmp; +} + +template <> +inline SIMD vcgtz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_cmpgt_epi32(a.m[i], _mm_setzero_si128()); + return tmp; +} + +template <> +inline SIMD vcgtz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_cmpgt_epi64(a.m[i], _mm_setzero_si128()); + return tmp; +} + +template <> +inline SIMD vcltz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = (__m128i)_mm_cmplt_ps(a.m[i], _mm_setzero_ps()); + return tmp; +} + +template <> +inline SIMD vcltz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = (__m128i)_mm_cmplt_pd(a.m[i], _mm_setzero_pd()); + return tmp; +} + +template <> +inline SIMD vcltz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_cmpgt_epi8(_mm_setzero_si128(), a.m[i]); + return tmp; +} + +template <> +inline SIMD vcltz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_cmpgt_epi16(_mm_setzero_si128(), a.m[i]); + return tmp; +} + +template <> +inline SIMD vcltz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_cmpgt_epi32(_mm_setzero_si128(), a.m[i]); + return tmp; +} + +template <> +inline SIMD vcltz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_cmpgt_epi64(_mm_setzero_si128(), a.m[i]); + return tmp; +} + +template <> +inline SIMD vclez(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = (__m128i)_mm_cmple_ps(a.m[i], _mm_setzero_ps()); + return tmp; +} + +template <> +inline SIMD vclez(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = (__m128i)_mm_cmple_pd(a.m[i], _mm_setzero_pd()); + return tmp; +} + +template <> +inline SIMD vclez(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_or_si128( + _mm_cmpeq_epi8(a.m[i], _mm_setzero_si128()), + _mm_cmpgt_epi8(_mm_setzero_si128(), a.m[i])); + return tmp; +} + +template <> +inline SIMD vclez(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_or_si128( + _mm_cmpeq_epi16(a.m[i], _mm_setzero_si128()), + _mm_cmpgt_epi16(_mm_setzero_si128(), a.m[i])); + return tmp; +} + +template <> +inline SIMD vclez(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_or_si128( + _mm_cmpeq_epi32(a.m[i], _mm_setzero_si128()), + _mm_cmpgt_epi32(_mm_setzero_si128(), a.m[i])); + return tmp; +} + +template <> +inline SIMD vclez(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_or_si128( + _mm_cmpeq_epi64(a.m[i], _mm_setzero_si128()), + _mm_cmpgt_epi64(_mm_setzero_si128(), a.m[i])); + return tmp; +} + +template <> +inline SIMD vmin(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_min_ps(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vmin(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_min_pd(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vmin(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_min_epi8(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vmin(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_min_epi16(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vmin(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_min_epi32(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vmax(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_max_ps(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vmax(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_max_pd(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vmax(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_max_epi8(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vmax(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_max_epi16(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vmax(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_max_epi32(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vclamp(SIMD x, float a, float b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_min_ps(_mm_max_ps(x.m[i], _mm_set1_ps(a)), _mm_set1_ps(b)); + return tmp; +} + +template <> +inline SIMD vclamp(SIMD x, double a, double b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_min_pd(_mm_max_pd(x.m[i], _mm_set1_pd(a)), _mm_set1_pd(b)); + return tmp; +} + +template <> +inline SIMD vclamp(SIMD x, int8_t a, int8_t b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_min_epi8(_mm_max_epi8(x.m[i], _mm_set1_epi8(a)), _mm_set1_epi8(b)); + return tmp; +} + +template <> +inline SIMD vclamp(SIMD x, int16_t a, int16_t b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_min_epi16(_mm_max_epi16(x.m[i], _mm_set1_epi16(a)), _mm_set1_epi16(b)); + return tmp; +} + +template <> +inline SIMD vclamp(SIMD x, int32_t a, int32_t b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = _mm_min_epi32(_mm_max_epi32(x.m[i], _mm_set1_epi32(a)), _mm_set1_epi32(b)); + return tmp; +} +