From 4f11a22311b8db3130eace2478c553050a2b7ec0 Mon Sep 17 00:00:00 2001
From: Ahmet Inan
Date: Tue, 11 Dec 2018 09:05:12 +0100
Subject: [PATCH] added SIMD wrappers for ARM NEON, Intel SSE4.1 and AVX2

---
 README.md |    7 +
 avx2.hh   |  990 ++++++++++++++++++++++++++++++++++++++++
 neon.hh   |  843 +++++++++++++++++++++++++++++++++++++
 simd.hh   | 1189 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 sse4_1.hh |  975 +++++++++++++++++++++++++++++++++++++++
 5 files changed, 4004 insertions(+)
 create mode 100644 avx2.hh
 create mode 100644 neon.hh
 create mode 100644 simd.hh
 create mode 100644 sse4_1.hh

diff --git a/README.md b/README.md
index 2f292bf..967da60 100644
--- a/README.md
+++ b/README.md
@@ -65,3 +65,10 @@ It computes the following, but having only O(N) complexity and using O(1) extra
 	output[i] = op(output[i], input[j]);
 ```
 
+### [simd.hh](simd.hh)
+
+Single instruction, multiple data ([SIMD](https://en.wikipedia.org/wiki/SIMD)) wrappers for:
+* [ARM NEON](https://en.wikipedia.org/wiki/ARM_architecture#Advanced_SIMD_(NEON)) ([neon.hh](neon.hh))
+* [Intel SSE4.1](https://en.wikipedia.org/wiki/SSE4) ([sse4_1.hh](sse4_1.hh))
+* [Intel AVX2](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions) ([avx2.hh](avx2.hh))
+
diff --git a/avx2.hh b/avx2.hh
new file mode 100644
index 0000000..281db56
--- /dev/null
+++ b/avx2.hh
@@ -0,0 +1,990 @@
+/*
+Intel AVX2 acceleration
+
+Copyright 2018 Ahmet Inan
+*/
+
+#ifndef AVX2_HH
+#define AVX2_HH
+
+#include <immintrin.h>
+
+template <>
+union SIMD<float, 8>
+{
+	static const int SIZE = 8;
+	typedef float value_type;
+	typedef uint32_t uint_type;
+	__m256 m;
+	value_type v[SIZE];
+	uint_type u[SIZE];
+};
+
+template <>
+union SIMD<double, 4>
+{
+	static const int SIZE = 4;
+	typedef double value_type;
+	typedef uint64_t uint_type;
+	__m256d m;
+	value_type v[SIZE];
+	uint_type u[SIZE];
+};
+
+template <>
+union SIMD<int8_t, 32>
+{
+	static const int SIZE = 32;
+	typedef int8_t value_type;
+	typedef uint8_t uint_type;
+	__m256i m;
+	value_type v[SIZE];
+	uint_type u[SIZE];
+};
+
+template <>
+union SIMD<int16_t, 16>
+{
+	static const int SIZE = 16;
+	typedef int16_t value_type;
+	typedef uint16_t uint_type;
+	__m256i m;
+	value_type v[SIZE];
+	uint_type u[SIZE];
+};
+
+template <>
+union SIMD<int32_t, 8>
+{
+	static const int SIZE = 8;
+	typedef int32_t value_type;
+	typedef uint32_t uint_type;
+	__m256i m;
+	value_type v[SIZE];
+	uint_type u[SIZE];
+};
+
+template <>
+union SIMD<int64_t, 4>
+{
+	static const int SIZE = 4;
+	typedef int64_t value_type;
+	typedef uint64_t uint_type;
+	__m256i m;
+	value_type v[SIZE];
+	uint_type u[SIZE];
+};
+
+template <>
+union SIMD<uint8_t, 32>
+{
+	static const int SIZE = 32;
+	typedef uint8_t value_type;
+	typedef uint8_t uint_type;
+	__m256i m;
+	value_type v[SIZE];
+	uint_type u[SIZE];
+};
+
+template <>
+union SIMD<uint16_t, 16>
+{
+	static const int SIZE = 16;
+	typedef uint16_t value_type;
+	typedef uint16_t uint_type;
+	__m256i m;
+	value_type v[SIZE];
+	uint_type u[SIZE];
+};
+
+template <>
+union SIMD<uint32_t, 8>
+{
+	static const int SIZE = 8;
+	typedef uint32_t value_type;
+	typedef uint32_t uint_type;
+	__m256i m;
+	value_type v[SIZE];
+	uint_type u[SIZE];
+};
+
+template <>
+union SIMD<uint64_t, 4>
+{
+	static const int SIZE = 4;
+	typedef uint64_t value_type;
+	typedef uint64_t uint_type;
+	__m256i m;
+	value_type v[SIZE];
+	uint_type u[SIZE];
+};
+
+template <>
+inline SIMD<float, 8> vreinterpret(SIMD<uint32_t, 8> a)
+{
+	SIMD<float, 8> tmp;
+	tmp.m = (__m256)a.m;
+	return tmp;
+}
+
+template <>
+inline SIMD<uint32_t, 8> vreinterpret(SIMD<float, 8> a)
+{
+	SIMD<uint32_t, 8> tmp;
+	tmp.m = (__m256i)a.m;
+	return tmp;
+}
+
+template <>
+inline SIMD<double, 4> vreinterpret(SIMD<uint64_t, 4> a)
+{
+	SIMD<double, 4> tmp;
+	tmp.m =
(__m256d)a.m; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + tmp.m = (__m256i)a.m; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + tmp.m = (__m256i)a.m; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + tmp.m = (__m256i)a.m; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + tmp.m = (__m256i)a.m; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + tmp.m = (__m256i)a.m; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + tmp.m = (__m256i)a.m; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + tmp.m = (__m256i)a.m; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + tmp.m = (__m256i)a.m; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + tmp.m = (__m256i)a.m; + return tmp; +} + +template <> +inline SIMD vdup>(float a) +{ + SIMD tmp; + tmp.m = _mm256_set1_ps(a); + return tmp; +} + +template <> +inline SIMD vdup>(double a) +{ + SIMD tmp; + tmp.m = _mm256_set1_pd(a); + return tmp; +} + +template <> +inline SIMD vdup>(int8_t a) +{ + SIMD tmp; + tmp.m = _mm256_set1_epi8(a); + return tmp; +} + +template <> +inline SIMD vdup>(int16_t a) +{ + SIMD tmp; + tmp.m = _mm256_set1_epi16(a); + return tmp; +} + +template <> +inline SIMD vdup>(int32_t a) +{ + SIMD tmp; + tmp.m = _mm256_set1_epi32(a); + return tmp; +} + +template <> +inline SIMD vdup>(int64_t a) +{ + SIMD tmp; + tmp.m = _mm256_set1_epi64x(a); + return tmp; +} + +template <> +inline SIMD vzero() +{ + SIMD tmp; + tmp.m = _mm256_setzero_ps(); + return tmp; +} + +template <> +inline SIMD vzero() +{ + SIMD tmp; + tmp.m = _mm256_setzero_pd(); + return tmp; +} + +template <> +inline SIMD vzero() +{ + SIMD tmp; + tmp.m = _mm256_setzero_si256(); + return tmp; +} + +template <> +inline SIMD vzero() +{ + SIMD tmp; + tmp.m = _mm256_setzero_si256(); + return tmp; +} + +template <> +inline SIMD vzero() +{ + SIMD tmp; + tmp.m = _mm256_setzero_si256(); + return tmp; +} + +template <> +inline SIMD vzero() +{ + SIMD tmp; + tmp.m = _mm256_setzero_si256(); + return tmp; +} + +template <> +inline SIMD vadd(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm256_add_ps(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vadd(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm256_add_pd(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vadd(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm256_add_epi8(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vadd(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm256_add_epi16(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vadd(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm256_add_epi32(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vadd(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm256_add_epi64(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vqadd(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm256_adds_epi8(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vqadd(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm256_adds_epi16(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vsub(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm256_sub_ps(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vsub(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm256_sub_pd(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vsub(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm256_sub_epi8(a.m, b.m); + return tmp; +} + +template <> +inline SIMD 
vsub(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm256_sub_epi16(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vsub(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm256_sub_epi32(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vsub(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm256_sub_epi64(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vqsub(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm256_subs_epi8(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vqsub(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm256_subs_epi16(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vqsub(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm256_subs_epu8(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vqsub(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm256_subs_epu16(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vabs(SIMD a) +{ + SIMD tmp; + tmp.m = _mm256_andnot_ps(_mm256_set1_ps(-0.f), a.m); + return tmp; +} + +template <> +inline SIMD vabs(SIMD a) +{ + SIMD tmp; + tmp.m = _mm256_andnot_pd(_mm256_set1_pd(-0.), a.m); + return tmp; +} + +template <> +inline SIMD vqabs(SIMD a) +{ + SIMD tmp; + tmp.m = _mm256_abs_epi8(_mm256_max_epi8(a.m, _mm256_set1_epi8(-INT8_MAX))); + return tmp; +} + +template <> +inline SIMD vqabs(SIMD a) +{ + SIMD tmp; + tmp.m = _mm256_abs_epi16(_mm256_max_epi16(a.m, _mm256_set1_epi16(-INT16_MAX))); + return tmp; +} + +template <> +inline SIMD vqabs(SIMD a) +{ + SIMD tmp; + tmp.m = _mm256_abs_epi32(_mm256_max_epi32(a.m, _mm256_set1_epi32(-INT32_MAX))); + return tmp; +} + +template <> +inline SIMD vsign(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm256_andnot_ps( + _mm256_cmp_ps(b.m, _mm256_setzero_ps(), _CMP_EQ_OQ), + _mm256_xor_ps(a.m, _mm256_and_ps(_mm256_set1_ps(-0.f), b.m))); + return tmp; +} + +template <> +inline SIMD vsign(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm256_andnot_pd( + _mm256_cmp_pd(b.m, _mm256_setzero_pd(), _CMP_EQ_OQ), + _mm256_xor_pd(a.m, _mm256_and_pd(_mm256_set1_pd(-0.), b.m))); + return tmp; +} + +template <> +inline SIMD vsign(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm256_sign_epi8(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vsign(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm256_sign_epi16(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vsign(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm256_sign_epi32(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vorr(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm256_or_si256(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vorr(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm256_or_si256(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vorr(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm256_or_si256(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vorr(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm256_or_si256(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vand(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm256_and_si256(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vand(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm256_and_si256(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vand(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm256_and_si256(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vand(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm256_and_si256(a.m, b.m); + return tmp; +} + +template <> +inline SIMD veor(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm256_xor_si256(a.m, b.m); + return tmp; +} + +template <> +inline SIMD veor(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm256_xor_si256(a.m, b.m); + return tmp; +} + +template <> 
+inline SIMD veor(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm256_xor_si256(a.m, b.m); + return tmp; +} + +template <> +inline SIMD veor(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm256_xor_si256(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vbic(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm256_andnot_si256(b.m, a.m); + return tmp; +} + +template <> +inline SIMD vbic(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm256_andnot_si256(b.m, a.m); + return tmp; +} + +template <> +inline SIMD vbic(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm256_andnot_si256(b.m, a.m); + return tmp; +} + +template <> +inline SIMD vbic(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm256_andnot_si256(b.m, a.m); + return tmp; +} + +template <> +inline SIMD vbsl(SIMD a, SIMD b, SIMD c) +{ + SIMD tmp; + tmp.m = _mm256_or_si256(_mm256_and_si256(a.m, b.m), _mm256_andnot_si256(a.m, c.m)); + return tmp; +} + +template <> +inline SIMD vbsl(SIMD a, SIMD b, SIMD c) +{ + SIMD tmp; + tmp.m = _mm256_or_si256(_mm256_and_si256(a.m, b.m), _mm256_andnot_si256(a.m, c.m)); + return tmp; +} + +template <> +inline SIMD vbsl(SIMD a, SIMD b, SIMD c) +{ + SIMD tmp; + tmp.m = _mm256_or_si256(_mm256_and_si256(a.m, b.m), _mm256_andnot_si256(a.m, c.m)); + return tmp; +} + +template <> +inline SIMD vbsl(SIMD a, SIMD b, SIMD c) +{ + SIMD tmp; + tmp.m = _mm256_or_si256(_mm256_and_si256(a.m, b.m), _mm256_andnot_si256(a.m, c.m)); + return tmp; +} + +template <> +inline SIMD vceqz(SIMD a) +{ + SIMD tmp; + tmp.m = (__m256i)_mm256_cmp_ps(a.m, _mm256_setzero_ps(), _CMP_EQ_OQ); + return tmp; +} + +template <> +inline SIMD vceqz(SIMD a) +{ + SIMD tmp; + tmp.m = (__m256i)_mm256_cmp_pd(a.m, _mm256_setzero_pd(), _CMP_EQ_OQ); + return tmp; +} + +template <> +inline SIMD vceqz(SIMD a) +{ + SIMD tmp; + tmp.m = _mm256_cmpeq_epi8(a.m, _mm256_setzero_si256()); + return tmp; +} + +template <> +inline SIMD vceqz(SIMD a) +{ + SIMD tmp; + tmp.m = _mm256_cmpeq_epi16(a.m, _mm256_setzero_si256()); + return tmp; +} + +template <> +inline SIMD vceqz(SIMD a) +{ + SIMD tmp; + tmp.m = _mm256_cmpeq_epi32(a.m, _mm256_setzero_si256()); + return tmp; +} + +template <> +inline SIMD vceqz(SIMD a) +{ + SIMD tmp; + tmp.m = _mm256_cmpeq_epi64(a.m, _mm256_setzero_si256()); + return tmp; +} + +template <> +inline SIMD vceq(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = (__m256i)_mm256_cmp_ps(a.m, b.m, _CMP_EQ_OQ); + return tmp; +} + +template <> +inline SIMD vceq(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = (__m256i)_mm256_cmp_pd(a.m, b.m, _CMP_EQ_OQ); + return tmp; +} + +template <> +inline SIMD vceq(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm256_cmpeq_epi8(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vceq(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm256_cmpeq_epi16(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vceq(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm256_cmpeq_epi32(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vceq(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm256_cmpeq_epi64(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vcgtz(SIMD a) +{ + SIMD tmp; + tmp.m = (__m256i)_mm256_cmp_ps(a.m, _mm256_setzero_ps(), _CMP_GT_OQ); + return tmp; +} + +template <> +inline SIMD vcgtz(SIMD a) +{ + SIMD tmp; + tmp.m = (__m256i)_mm256_cmp_pd(a.m, _mm256_setzero_pd(), _CMP_GT_OQ); + return tmp; +} + +template <> +inline SIMD vcgtz(SIMD a) +{ + SIMD tmp; + tmp.m = _mm256_cmpgt_epi8(a.m, _mm256_setzero_si256()); + return tmp; +} + +template <> +inline SIMD vcgtz(SIMD a) +{ + SIMD tmp; + tmp.m = _mm256_cmpgt_epi16(a.m, _mm256_setzero_si256()); + return tmp; +} + 
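+// Note on the comparison helpers (vceqz/vceq/vcgtz/vcltz): each result lane
+// holds all-one bits where the predicate is true and all-zero bits where it
+// is false, returned as the matching unsigned lane type, so the masks can be
+// fed straight into vand/vorr/vbic/vbsl. The float and double variants use
+// ordered, non-signalling predicates (_CMP_*_OQ) and cast the compare result
+// to the integer register type.
+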
+template <>
+inline SIMD<uint32_t, 8> vcgtz(SIMD<int32_t, 8> a)
+{
+	SIMD<uint32_t, 8> tmp;
+	tmp.m = _mm256_cmpgt_epi32(a.m, _mm256_setzero_si256());
+	return tmp;
+}
+
+template <>
+inline SIMD<uint64_t, 4> vcgtz(SIMD<int64_t, 4> a)
+{
+	SIMD<uint64_t, 4> tmp;
+	tmp.m = _mm256_cmpgt_epi64(a.m, _mm256_setzero_si256());
+	return tmp;
+}
+
+template <>
+inline SIMD<uint32_t, 8> vcltz(SIMD<float, 8> a)
+{
+	SIMD<uint32_t, 8> tmp;
+	tmp.m = (__m256i)_mm256_cmp_ps(a.m, _mm256_setzero_ps(), _CMP_LT_OQ);
+	return tmp;
+}
+
+template <>
+inline SIMD<uint64_t, 4> vcltz(SIMD<double, 4> a)
+{
+	SIMD<uint64_t, 4> tmp;
+	tmp.m = (__m256i)_mm256_cmp_pd(a.m, _mm256_setzero_pd(), _CMP_LT_OQ);
+	return tmp;
+}
+
+template <>
+inline SIMD<uint8_t, 32> vcltz(SIMD<int8_t, 32> a)
+{
+	SIMD<uint8_t, 32> tmp;
+	tmp.m = _mm256_cmpgt_epi8(_mm256_setzero_si256(), a.m);
+	return tmp;
+}
+
+template <>
+inline SIMD<uint16_t, 16> vcltz(SIMD<int16_t, 16> a)
+{
+	SIMD<uint16_t, 16> tmp;
+	tmp.m = _mm256_cmpgt_epi16(_mm256_setzero_si256(), a.m);
+	return tmp;
+}
+
+template <>
+inline SIMD<uint32_t, 8> vcltz(SIMD<int32_t, 8> a)
+{
+	SIMD<uint32_t, 8> tmp;
+	tmp.m = _mm256_cmpgt_epi32(_mm256_setzero_si256(), a.m);
+	return tmp;
+}
+
+template <>
+inline SIMD<uint64_t, 4> vcltz(SIMD<int64_t, 4> a)
+{
+	SIMD<uint64_t, 4> tmp;
+	tmp.m = _mm256_cmpgt_epi64(_mm256_setzero_si256(), a.m);
+	return tmp;
+}
+
+template <>
+inline SIMD<float, 8> vmin(SIMD<float, 8> a, SIMD<float, 8> b)
+{
+	SIMD<float, 8> tmp;
+	tmp.m = _mm256_min_ps(a.m, b.m);
+	return tmp;
+}
+
+template <>
+inline SIMD<double, 4> vmin(SIMD<double, 4> a, SIMD<double, 4> b)
+{
+	SIMD<double, 4> tmp;
+	tmp.m = _mm256_min_pd(a.m, b.m);
+	return tmp;
+}
+
+template <>
+inline SIMD<int8_t, 32> vmin(SIMD<int8_t, 32> a, SIMD<int8_t, 32> b)
+{
+	SIMD<int8_t, 32> tmp;
+	tmp.m = _mm256_min_epi8(a.m, b.m);
+	return tmp;
+}
+
+template <>
+inline SIMD<int16_t, 16> vmin(SIMD<int16_t, 16> a, SIMD<int16_t, 16> b)
+{
+	SIMD<int16_t, 16> tmp;
+	tmp.m = _mm256_min_epi16(a.m, b.m);
+	return tmp;
+}
+
+template <>
+inline SIMD<int32_t, 8> vmin(SIMD<int32_t, 8> a, SIMD<int32_t, 8> b)
+{
+	SIMD<int32_t, 8> tmp;
+	tmp.m = _mm256_min_epi32(a.m, b.m);
+	return tmp;
+}
+
+template <>
+inline SIMD<int64_t, 4> vmin(SIMD<int64_t, 4> a, SIMD<int64_t, 4> b)
+{
+	SIMD<int64_t, 4> tmp;
+	// AVX2 provides no _mm256_min_epi64 (that intrinsic needs AVX-512), so
+	// emulate the 64-bit minimum with a signed compare and a byte blend.
+	tmp.m = _mm256_blendv_epi8(a.m, b.m, _mm256_cmpgt_epi64(a.m, b.m));
+	return tmp;
+}
+
+template <>
+inline SIMD<float, 8> vmax(SIMD<float, 8> a, SIMD<float, 8> b)
+{
+	SIMD<float, 8> tmp;
+	tmp.m = _mm256_max_ps(a.m, b.m);
+	return tmp;
+}
+
+template <>
+inline SIMD<double, 4> vmax(SIMD<double, 4> a, SIMD<double, 4> b)
+{
+	SIMD<double, 4> tmp;
+	tmp.m = _mm256_max_pd(a.m, b.m);
+	return tmp;
+}
+
+template <>
+inline SIMD<int8_t, 32> vmax(SIMD<int8_t, 32> a, SIMD<int8_t, 32> b)
+{
+	SIMD<int8_t, 32> tmp;
+	tmp.m = _mm256_max_epi8(a.m, b.m);
+	return tmp;
+}
+
+template <>
+inline SIMD<int16_t, 16> vmax(SIMD<int16_t, 16> a, SIMD<int16_t, 16> b)
+{
+	SIMD<int16_t, 16> tmp;
+	tmp.m = _mm256_max_epi16(a.m, b.m);
+	return tmp;
+}
+
+template <>
+inline SIMD<int32_t, 8> vmax(SIMD<int32_t, 8> a, SIMD<int32_t, 8> b)
+{
+	SIMD<int32_t, 8> tmp;
+	tmp.m = _mm256_max_epi32(a.m, b.m);
+	return tmp;
+}
+
+template <>
+inline SIMD<int64_t, 4> vmax(SIMD<int64_t, 4> a, SIMD<int64_t, 4> b)
+{
+	SIMD<int64_t, 4> tmp;
+	// AVX2 provides no _mm256_max_epi64 either; same compare-and-blend trick.
+	tmp.m = _mm256_blendv_epi8(a.m, b.m, _mm256_cmpgt_epi64(b.m, a.m));
+	return tmp;
+}
+
+#endif
diff --git a/neon.hh b/neon.hh
new file mode 100644
index 0000000..ac5aa92
--- /dev/null
+++ b/neon.hh
@@ -0,0 +1,843 @@
+/*
+ARM NEON acceleration
+
+Copyright 2018 Ahmet Inan
+*/
+
+#ifndef NEON_HH
+#define NEON_HH
+
+#include <arm_neon.h>
+
+template <>
+union SIMD<float, 4>
+{
+	static const int SIZE = 4;
+	typedef float value_type;
+	typedef uint32_t uint_type;
+	float32x4_t m;
+	value_type v[SIZE];
+	uint_type u[SIZE];
+};
+
+template <>
+union SIMD<int8_t, 16>
+{
+	static const int SIZE = 16;
+	typedef int8_t value_type;
+	typedef uint8_t uint_type;
+	int8x16_t m;
+	value_type v[SIZE];
+	uint_type u[SIZE];
+};
+
+template <>
+union SIMD<int16_t, 8>
+{
+	static const int SIZE = 8;
+	typedef int16_t value_type;
+	typedef uint16_t uint_type;
+	int16x8_t m;
+	value_type v[SIZE];
+	uint_type u[SIZE];
+};
+
+template <>
+union SIMD<int32_t, 4>
+{
+	static const int SIZE = 4;
+	typedef int32_t value_type;
+	typedef uint32_t uint_type;
+	int32x4_t m;
+	value_type v[SIZE];
+	uint_type u[SIZE];
+};
+
+template <>
+union SIMD<int64_t, 2>
+{
+	static const int SIZE = 2;
+	typedef int64_t value_type;
typedef uint64_t uint_type; + int64x2_t m; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template <> +union SIMD +{ + static const int SIZE = 16; + typedef uint8_t value_type; + typedef uint8_t uint_type; + uint8x16_t m; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template <> +union SIMD +{ + static const int SIZE = 8; + typedef uint16_t value_type; + typedef uint16_t uint_type; + uint16x8_t m; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template <> +union SIMD +{ + static const int SIZE = 4; + typedef uint32_t value_type; + typedef uint32_t uint_type; + uint32x4_t m; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template <> +union SIMD +{ + static const int SIZE = 2; + typedef uint64_t value_type; + typedef uint64_t uint_type; + uint64x2_t m; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + tmp.m = (float32x4_t)a.m; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + tmp.m = (uint32x4_t)a.m; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + tmp.m = (int8x16_t)a.m; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + tmp.m = (uint8x16_t)a.m; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + tmp.m = (int16x8_t)a.m; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + tmp.m = (uint16x8_t)a.m; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + tmp.m = (int32x4_t)a.m; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + tmp.m = (uint32x4_t)a.m; + return tmp; +} + +template <> +inline SIMD vdup(float a) +{ + SIMD tmp; + tmp.m = vdupq_n_f32(a); + return tmp; +} + +template <> +inline SIMD vdup(int8_t a) +{ + SIMD tmp; + tmp.m = vdupq_n_s8(a); + return tmp; +} + +template <> +inline SIMD vdup(int16_t a) +{ + SIMD tmp; + tmp.m = vdupq_n_s16(a); + return tmp; +} + +template <> +inline SIMD vdup(int32_t a) +{ + SIMD tmp; + tmp.m = vdupq_n_s32(a); + return tmp; +} + +template <> +inline SIMD vdup(int64_t a) +{ + SIMD tmp; + tmp.m = vdupq_n_s64(a); + return tmp; +} + +template <> +inline SIMD vdup(uint8_t a) +{ + SIMD tmp; + tmp.m = vdupq_n_u8(a); + return tmp; +} + +template <> +inline SIMD vdup(uint16_t a) +{ + SIMD tmp; + tmp.m = vdupq_n_u16(a); + return tmp; +} + +template <> +inline SIMD vdup(uint32_t a) +{ + SIMD tmp; + tmp.m = vdupq_n_u32(a); + return tmp; +} + +template <> +inline SIMD vdup(uint64_t a) +{ + SIMD tmp; + tmp.m = vdupq_n_u64(a); + return tmp; +} + +template <> +inline SIMD vzero() +{ + SIMD tmp; + tmp.m = (float32x4_t)veorq_u32((uint32x4_t)tmp.m, (uint32x4_t)tmp.m); + return tmp; +} + +template <> +inline SIMD vzero() +{ + SIMD tmp; + tmp.m = veorq_s8(tmp.m, tmp.m); + return tmp; +} + +template <> +inline SIMD vzero() +{ + SIMD tmp; + tmp.m = veorq_s16(tmp.m, tmp.m); + return tmp; +} + +template <> +inline SIMD vzero() +{ + SIMD tmp; + tmp.m = veorq_s32(tmp.m, tmp.m); + return tmp; +} + +template <> +inline SIMD vzero() +{ + SIMD tmp; + tmp.m = veorq_s64(tmp.m, tmp.m); + return tmp; +} + +template <> +inline SIMD vzero() +{ + SIMD tmp; + tmp.m = veorq_u8(tmp.m, tmp.m); + return tmp; +} + +template <> +inline SIMD vzero() +{ + SIMD tmp; + tmp.m = veorq_u16(tmp.m, tmp.m); + return tmp; +} + +template <> +inline SIMD vzero() +{ + SIMD tmp; + tmp.m = veorq_u32(tmp.m, tmp.m); + return tmp; +} + +template <> +inline SIMD vzero() +{ + SIMD tmp; + tmp.m = veorq_u64(tmp.m, tmp.m); + return 
tmp; +} + +template <> +inline SIMD vadd(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = vaddq_f32(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vadd(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = vaddq_s8(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vadd(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = vaddq_s16(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vadd(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = vaddq_s32(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vadd(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = vaddq_s64(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vqadd(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = vqaddq_s8(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vqadd(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = vqaddq_s16(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vsub(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = vsubq_f32(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vsub(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = vsubq_s8(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vsub(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = vsubq_s16(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vsub(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = vsubq_s32(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vsub(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = vsubq_s64(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vqsub(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = vqsubq_s8(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vqsub(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = vqsubq_s16(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vqsub(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = vqsubq_u8(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vqsub(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = vqsubq_u16(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vabs(SIMD a) +{ + SIMD tmp; + tmp.m = vabsq_f32(a.m); + return tmp; +} + +template <> +inline SIMD vqabs(SIMD a) +{ + SIMD tmp; + tmp.m = vqabsq_s8(a.m); + return tmp; +} + +template <> +inline SIMD vqabs(SIMD a) +{ + SIMD tmp; + tmp.m = vqabsq_s16(a.m); + return tmp; +} + +template <> +inline SIMD vsign(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = (float32x4_t)vbicq_u32( + veorq_u32((uint32x4_t)a.m, vandq_u32((uint32x4_t)vdupq_n_f32(-0.f), (uint32x4_t)b.m)), + vceqq_f32(b.m, vdupq_n_f32(0.f))); + return tmp; +} + +template <> +inline SIMD vsign(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = (int8x16_t)vorrq_u8( + vandq_u8(vcgtq_s8(vdupq_n_s8(0), b.m), (uint8x16_t)vnegq_s8(a.m)), + vandq_u8(vcgtq_s8(b.m, vdupq_n_s8(0)), (uint8x16_t)a.m)); + return tmp; +} + +template <> +inline SIMD vorr(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = vorrq_u8(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vorr(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = vorrq_u16(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vorr(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = vorrq_u32(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vorr(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = vorrq_u64(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vand(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = vandq_u8(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vand(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = vandq_u16(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vand(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = vandq_u32(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vand(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = vandq_u64(a.m, b.m); + return tmp; +} + +template <> +inline SIMD veor(SIMD a, SIMD b) +{ + SIMD tmp; + 
tmp.m = veorq_u8(a.m, b.m); + return tmp; +} + +template <> +inline SIMD veor(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = veorq_u16(a.m, b.m); + return tmp; +} + +template <> +inline SIMD veor(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = veorq_u32(a.m, b.m); + return tmp; +} + +template <> +inline SIMD veor(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = veorq_u64(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vbic(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = vbicq_u8(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vbic(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = vbicq_u16(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vbic(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = vbicq_u32(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vbic(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = vbicq_u64(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vbsl(SIMD a, SIMD b, SIMD c) +{ + SIMD tmp; + tmp.m = vbslq_u8(a.m, b.m, c.m); + return tmp; +} + +template <> +inline SIMD vbsl(SIMD a, SIMD b, SIMD c) +{ + SIMD tmp; + tmp.m = vbslq_u16(a.m, b.m, c.m); + return tmp; +} + +template <> +inline SIMD vbsl(SIMD a, SIMD b, SIMD c) +{ + SIMD tmp; + tmp.m = vbslq_u32(a.m, b.m, c.m); + return tmp; +} + +template <> +inline SIMD vbsl(SIMD a, SIMD b, SIMD c) +{ + SIMD tmp; + tmp.m = vbslq_u64(a.m, b.m, c.m); + return tmp; +} + +template <> +inline SIMD vceqz(SIMD a) +{ + SIMD tmp; + tmp.m = vceqq_f32(a.m, vdupq_n_f32(0.f)); + return tmp; +} + +template <> +inline SIMD vceqz(SIMD a) +{ + SIMD tmp; + tmp.m = vceqq_s8(a.m, vdupq_n_s8(0)); + return tmp; +} + +template <> +inline SIMD vceqz(SIMD a) +{ + SIMD tmp; + tmp.m = vceqq_s16(a.m, vdupq_n_s16(0)); + return tmp; +} + +template <> +inline SIMD vceqz(SIMD a) +{ + SIMD tmp; + tmp.m = vceqq_s32(a.m, vdupq_n_s32(0)); + return tmp; +} + +template <> +inline SIMD vceq(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = vceqq_f32(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vceq(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = vceqq_s8(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vceq(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = vceqq_s16(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vceq(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = vceqq_s32(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vcgtz(SIMD a) +{ + SIMD tmp; + tmp.m = vcgtq_f32(a.m, vdupq_n_f32(0.f)); + return tmp; +} + +template <> +inline SIMD vcgtz(SIMD a) +{ + SIMD tmp; + tmp.m = vcgtq_s8(a.m, vdupq_n_s8(0)); + return tmp; +} + +template <> +inline SIMD vcgtz(SIMD a) +{ + SIMD tmp; + tmp.m = vcgtq_s16(a.m, vdupq_n_s16(0)); + return tmp; +} + +template <> +inline SIMD vcgtz(SIMD a) +{ + SIMD tmp; + tmp.m = vcgtq_s32(a.m, vdupq_n_s32(0)); + return tmp; +} + +template <> +inline SIMD vcltz(SIMD a) +{ + SIMD tmp; + tmp.m = vcltq_f32(a.m, vdupq_n_f32(0.f)); + return tmp; +} + +template <> +inline SIMD vcltz(SIMD a) +{ + SIMD tmp; + tmp.m = vcltq_s8(a.m, vdupq_n_s8(0)); + return tmp; +} + +template <> +inline SIMD vcltz(SIMD a) +{ + SIMD tmp; + tmp.m = vcltq_s16(a.m, vdupq_n_s16(0)); + return tmp; +} + +template <> +inline SIMD vcltz(SIMD a) +{ + SIMD tmp; + tmp.m = vcltq_s32(a.m, vdupq_n_s32(0)); + return tmp; +} + +template <> +inline SIMD vmin(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = vminq_f32(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vmin(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = vminq_s8(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vmin(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = vminq_s16(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vmin(SIMD a, SIMD b) +{ + 
SIMD tmp; + tmp.m = vminq_s32(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vmax(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = vmaxq_f32(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vmax(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = vmaxq_s8(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vmax(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = vmaxq_s16(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vmax(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = vmaxq_s32(a.m, b.m); + return tmp; +} + +#endif diff --git a/simd.hh b/simd.hh new file mode 100644 index 0000000..197ca75 --- /dev/null +++ b/simd.hh @@ -0,0 +1,1189 @@ +/* +Single instruction, multiple data + +Copyright 2018 Ahmet Inan +*/ + +#ifndef SIMD_HH +#define SIMD_HH + +#include +#include +#include + +template +union SIMD; + +template +union SIMD +{ + static const int SIZE = WIDTH; + typedef float value_type; + typedef uint32_t uint_type; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template +union SIMD +{ + static const int SIZE = WIDTH; + typedef double value_type; + typedef uint64_t uint_type; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template +union SIMD +{ + static const int SIZE = WIDTH; + typedef int8_t value_type; + typedef uint8_t uint_type; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template +union SIMD +{ + static const int SIZE = WIDTH; + typedef int16_t value_type; + typedef uint16_t uint_type; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template +union SIMD +{ + static const int SIZE = WIDTH; + typedef int32_t value_type; + typedef uint32_t uint_type; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template +union SIMD +{ + static const int SIZE = WIDTH; + typedef int64_t value_type; + typedef uint64_t uint_type; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template +union SIMD +{ + static const int SIZE = WIDTH; + typedef uint8_t value_type; + typedef uint8_t uint_type; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template +union SIMD +{ + static const int SIZE = WIDTH; + typedef uint16_t value_type; + typedef uint16_t uint_type; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template +union SIMD +{ + static const int SIZE = WIDTH; + typedef uint32_t value_type; + typedef uint32_t uint_type; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template +union SIMD +{ + static const int SIZE = WIDTH; + typedef uint64_t value_type; + typedef uint64_t uint_type; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template +static inline TYPE vdup(typename TYPE::value_type a) +{ + TYPE tmp; + for (int i = 0; i < TYPE::SIZE; ++i) + tmp.v[i] = a; + return tmp; +} + +template +static inline TYPE vzero() +{ + TYPE tmp; + for (int i = 0; i < TYPE::SIZE; ++i) + tmp.u[i] ^= tmp.u[i]; + return tmp; +} + +template +static inline DST vreinterpret(SRC a) +{ + static_assert(SRC::SIZE == DST::SIZE, "source and destination width must be same"); + static_assert(sizeof(typename SRC::value_type) == sizeof(typename DST::value_type), "source and destination value type sizes must be same"); + DST tmp; + for (int i = 0; i < DST::SIZE; ++i) + tmp.u[i] = a.u[i]; + return tmp; +} + +template +static inline SIMD vmask(SIMD a) +{ + return vreinterpret>(a); +} + +template +static inline SIMD vmask(SIMD a) +{ + return vreinterpret>(a); +} + +template +static inline SIMD vmask(SIMD a) +{ + return vreinterpret>(a); +} + +template +static inline SIMD vmask(SIMD a) +{ + return vreinterpret>(a); +} + +template +static inline SIMD vmask(SIMD a) +{ + return vreinterpret>(a); +} + +template +static inline SIMD 
vmask(SIMD a) +{ + return vreinterpret>(a); +} + +template +static inline SIMD vunsigned(SIMD a) +{ + return vreinterpret>(a); +} + +template +static inline SIMD vunsigned(SIMD a) +{ + return vreinterpret>(a); +} + +template +static inline SIMD vunsigned(SIMD a) +{ + return vreinterpret>(a); +} + +template +static inline SIMD vunsigned(SIMD a) +{ + return vreinterpret>(a); +} + +template +static inline SIMD vsigned(SIMD a) +{ + return vreinterpret>(a); +} + +template +static inline SIMD vsigned(SIMD a) +{ + return vreinterpret>(a); +} + +template +static inline SIMD vsigned(SIMD a) +{ + return vreinterpret>(a); +} + +template +static inline SIMD vsigned(SIMD a) +{ + return vreinterpret>(a); +} + +template +static inline SIMD vneg(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = -a.v[i]; + return tmp; +} + +template +static inline SIMD vneg(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = -a.v[i]; + return tmp; +} + +template +static inline SIMD vneg(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = -a.v[i]; + return tmp; +} + +template +static inline SIMD vneg(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = -a.v[i]; + return tmp; +} + +template +static inline SIMD vneg(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = -a.v[i]; + return tmp; +} + +template +static inline SIMD vneg(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = -a.v[i]; + return tmp; +} + +template +static inline SIMD vabs(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = std::abs(a.v[i]); + return tmp; +} + +template +static inline SIMD vqabs(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = std::abs(std::max(a.v[i], -INT8_MAX)); + return tmp; +} + +template +static inline SIMD vqabs(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = std::abs(std::max(a.v[i], -INT16_MAX)); + return tmp; +} + +template +static inline SIMD vqabs(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = std::abs(std::max(a.v[i], -INT32_MAX)); + return tmp; +} + +template +static inline SIMD vqabs(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = std::abs(std::max(a.v[i], -INT64_MAX)); + return tmp; +} + +template +static inline SIMD vnot(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = ~a.v[i]; + return tmp; +} + +template +static inline SIMD vnot(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = ~a.v[i]; + return tmp; +} + +template +static inline SIMD vnot(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = ~a.v[i]; + return tmp; +} + +template +static inline SIMD vnot(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = ~a.v[i]; + return tmp; +} + +template +static inline SIMD vorr(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = a.v[i] | b.v[i]; + return tmp; +} + +template +static inline SIMD vorr(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = a.v[i] | b.v[i]; + return tmp; +} + +template +static inline SIMD vorr(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = a.v[i] | b.v[i]; + return tmp; +} + +template +static inline SIMD vorr(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = a.v[i] | b.v[i]; + return tmp; +} + +template +static inline SIMD vand(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = a.v[i] & 
b.v[i]; + return tmp; +} + +template +static inline SIMD vand(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = a.v[i] & b.v[i]; + return tmp; +} + +template +static inline SIMD vand(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = a.v[i] & b.v[i]; + return tmp; +} + +template +static inline SIMD vand(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = a.v[i] & b.v[i]; + return tmp; +} + +template +static inline SIMD veor(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = a.v[i] ^ b.v[i]; + return tmp; +} + +template +static inline SIMD veor(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = a.v[i] ^ b.v[i]; + return tmp; +} + +template +static inline SIMD veor(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = a.v[i] ^ b.v[i]; + return tmp; +} + +template +static inline SIMD veor(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = a.v[i] ^ b.v[i]; + return tmp; +} + +template +static inline SIMD vbic(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = a.v[i] & ~b.v[i]; + return tmp; +} + +template +static inline SIMD vbic(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = a.v[i] & ~b.v[i]; + return tmp; +} + +template +static inline SIMD vbic(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = a.v[i] & ~b.v[i]; + return tmp; +} + +template +static inline SIMD vbic(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = a.v[i] & ~b.v[i]; + return tmp; +} + +template +static inline SIMD vbsl(SIMD a, SIMD b, SIMD c) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = (a.v[i] & b.v[i]) | (~a.v[i] & c.v[i]); + return tmp; +} + +template +static inline SIMD vbsl(SIMD a, SIMD b, SIMD c) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = (a.v[i] & b.v[i]) | (~a.v[i] & c.v[i]); + return tmp; +} + +template +static inline SIMD vbsl(SIMD a, SIMD b, SIMD c) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = (a.v[i] & b.v[i]) | (~a.v[i] & c.v[i]); + return tmp; +} + +template +static inline SIMD vbsl(SIMD a, SIMD b, SIMD c) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = (a.v[i] & b.v[i]) | (~a.v[i] & c.v[i]); + return tmp; +} + +template +static inline SIMD vcgtz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = -(a.v[i] > 0.f); + return tmp; +} + +template +static inline SIMD vcgtz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = -(a.v[i] > 0.); + return tmp; +} + +template +static inline SIMD vcgtz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = -(a.v[i] > 0); + return tmp; +} + +template +static inline SIMD vcgtz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = -(a.v[i] > 0); + return tmp; +} + +template +static inline SIMD vcgtz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = -(a.v[i] > 0); + return tmp; +} + +template +static inline SIMD vcgtz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = -(a.v[i] > 0); + return tmp; +} + +template +static inline SIMD vceqz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = -(a.v[i] == 0.f); + return tmp; +} + +template +static inline SIMD vceqz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = -(a.v[i] == 0.); + return tmp; +} + +template +static inline SIMD 
vceqz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = -!a.v[i]; + return tmp; +} + +template +static inline SIMD vceqz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = -!a.v[i]; + return tmp; +} + +template +static inline SIMD vceqz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = -!a.v[i]; + return tmp; +} + +template +static inline SIMD vceqz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = -!a.v[i]; + return tmp; +} + +template +static inline SIMD vcltz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = -(a.v[i] < 0.f); + return tmp; +} + +template +static inline SIMD vcltz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = -(a.v[i] < 0.); + return tmp; +} + +template +static inline SIMD vcltz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = -(a.v[i] < 0); + return tmp; +} + +template +static inline SIMD vcltz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = -(a.v[i] < 0); + return tmp; +} + +template +static inline SIMD vcltz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = -(a.v[i] < 0); + return tmp; +} + +template +static inline SIMD vcltz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = -(a.v[i] < 0); + return tmp; +} + +template +static inline SIMD vcgt(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = -(a.v[i] > b.v[i]); + return tmp; +} + +template +static inline SIMD vcgt(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = -(a.v[i] > b.v[i]); + return tmp; +} + +template +static inline SIMD vcgt(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = -(a.v[i] > b.v[i]); + return tmp; +} + +template +static inline SIMD vcgt(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = -(a.v[i] > b.v[i]); + return tmp; +} + +template +static inline SIMD vcgt(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = -(a.v[i] > b.v[i]); + return tmp; +} + +template +static inline SIMD vcgt(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = -(a.v[i] > b.v[i]); + return tmp; +} + +template +static inline SIMD vcgt(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = -(a.v[i] > b.v[i]); + return tmp; +} + +template +static inline SIMD vcgt(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = -(a.v[i] > b.v[i]); + return tmp; +} + +template +static inline SIMD vcgt(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = -(a.v[i] > b.v[i]); + return tmp; +} + +template +static inline SIMD vcgt(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = -(a.v[i] > b.v[i]); + return tmp; +} + +template +static inline SIMD vceq(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = -(a.v[i] == b.v[i]); + return tmp; +} + +template +static inline SIMD vceq(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = -(a.v[i] == b.v[i]); + return tmp; +} + +template +static inline SIMD vceq(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = -(a.v[i] == b.v[i]); + return tmp; +} + +template +static inline SIMD vceq(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = -(a.v[i] == b.v[i]); + return tmp; +} + +template +static inline SIMD vceq(SIMD a, SIMD b) +{ + SIMD tmp; + 
for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = -(a.v[i] == b.v[i]); + return tmp; +} + +template +static inline SIMD vceq(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = -(a.v[i] == b.v[i]); + return tmp; +} + +template +static inline SIMD vmin(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = std::min(a.v[i], b.v[i]); + return tmp; +} + +template +static inline SIMD vmin(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = std::min(a.v[i], b.v[i]); + return tmp; +} + +template +static inline SIMD vmin(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = std::min(a.v[i], b.v[i]); + return tmp; +} + +template +static inline SIMD vmin(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = std::min(a.v[i], b.v[i]); + return tmp; +} + +template +static inline SIMD vmin(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = std::min(a.v[i], b.v[i]); + return tmp; +} + +template +static inline SIMD vmin(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = std::min(a.v[i], b.v[i]); + return tmp; +} + +template +static inline SIMD vmax(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = std::max(a.v[i], b.v[i]); + return tmp; +} + +template +static inline SIMD vmax(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = std::max(a.v[i], b.v[i]); + return tmp; +} + +template +static inline SIMD vmax(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = std::max(a.v[i], b.v[i]); + return tmp; +} + +template +static inline SIMD vmax(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = std::max(a.v[i], b.v[i]); + return tmp; +} + +template +static inline SIMD vmax(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = std::max(a.v[i], b.v[i]); + return tmp; +} + +template +static inline SIMD vmax(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = std::max(a.v[i], b.v[i]); + return tmp; +} + +template +static inline SIMD vadd(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = a.v[i] + b.v[i]; + return tmp; +} + +template +static inline SIMD vadd(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = a.v[i] + b.v[i]; + return tmp; +} + +template +static inline SIMD vadd(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = a.v[i] + b.v[i]; + return tmp; +} + +template +static inline SIMD vadd(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = a.v[i] + b.v[i]; + return tmp; +} + +template +static inline SIMD vadd(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = a.v[i] + b.v[i]; + return tmp; +} + +template +static inline SIMD vadd(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = a.v[i] + b.v[i]; + return tmp; +} + +template +static inline SIMD vqadd(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = std::min(std::max(int16_t(a.v[i]) + int16_t(b.v[i]), INT8_MIN), INT8_MAX); + return tmp; +} + +template +static inline SIMD vqadd(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = std::min(std::max(int32_t(a.v[i]) + int32_t(b.v[i]), INT16_MIN), INT16_MAX); + return tmp; +} + +template +static inline SIMD vsub(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + 
tmp.v[i] = a.v[i] - b.v[i]; + return tmp; +} + +template +static inline SIMD vsub(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = a.v[i] - b.v[i]; + return tmp; +} + +template +static inline SIMD vsub(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = a.v[i] - b.v[i]; + return tmp; +} + +template +static inline SIMD vsub(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = a.v[i] - b.v[i]; + return tmp; +} + +template +static inline SIMD vsub(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = a.v[i] - b.v[i]; + return tmp; +} + +template +static inline SIMD vsub(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = a.v[i] - b.v[i]; + return tmp; +} + +template +static inline SIMD vqsub(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = std::min(std::max(int16_t(a.v[i]) - int16_t(b.v[i]), INT8_MIN), INT8_MAX); + return tmp; +} + +template +static inline SIMD vqsub(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = std::min(std::max(int32_t(a.v[i]) - int32_t(b.v[i]), INT16_MIN), INT16_MAX); + return tmp; +} + +template +static inline SIMD vqsub(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = std::max(int16_t(a.v[i]) - int16_t(b.v[i]), 0); + return tmp; +} + +template +static inline SIMD vqsub(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = std::max(int32_t(a.v[i]) - int32_t(b.v[i]), 0); + return tmp; +} + +template +static inline SIMD vsign(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = ((b.v[i] > 0.f) - (b.v[i] < 0.f)) * a.v[i]; + return tmp; +} + +template +static inline SIMD vsign(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = ((b.v[i] > 0.) 
- (b.v[i] < 0.)) * a.v[i];
+	return tmp;
+}
+
+template <int WIDTH>
+static inline SIMD<int8_t, WIDTH> vsign(SIMD<int8_t, WIDTH> a, SIMD<int8_t, WIDTH> b)
+{
+	SIMD<int8_t, WIDTH> tmp;
+	for (int i = 0; i < WIDTH; ++i)
+		tmp.v[i] = ((b.v[i] > 0) - (b.v[i] < 0)) * a.v[i];
+	return tmp;
+}
+
+template <int WIDTH>
+static inline SIMD<int16_t, WIDTH> vsign(SIMD<int16_t, WIDTH> a, SIMD<int16_t, WIDTH> b)
+{
+	SIMD<int16_t, WIDTH> tmp;
+	for (int i = 0; i < WIDTH; ++i)
+		tmp.v[i] = ((b.v[i] > 0) - (b.v[i] < 0)) * a.v[i];
+	return tmp;
+}
+
+template <int WIDTH>
+static inline SIMD<int32_t, WIDTH> vsign(SIMD<int32_t, WIDTH> a, SIMD<int32_t, WIDTH> b)
+{
+	SIMD<int32_t, WIDTH> tmp;
+	for (int i = 0; i < WIDTH; ++i)
+		tmp.v[i] = ((b.v[i] > 0) - (b.v[i] < 0)) * a.v[i];
+	return tmp;
+}
+
+template <int WIDTH>
+static inline SIMD<int64_t, WIDTH> vsign(SIMD<int64_t, WIDTH> a, SIMD<int64_t, WIDTH> b)
+{
+	SIMD<int64_t, WIDTH> tmp;
+	for (int i = 0; i < WIDTH; ++i)
+		tmp.v[i] = ((b.v[i] > 0) - (b.v[i] < 0)) * a.v[i];
+	return tmp;
+}
+
+#if 1
+#ifdef __AVX2__
+#include "avx2.hh"
+#else
+#ifdef __SSE4_1__
+#include "sse4_1.hh"
+#endif
+#endif
+
+#ifdef __ARM_NEON__
+#include "neon.hh"
+#endif
+#endif
+
+#endif
diff --git a/sse4_1.hh b/sse4_1.hh
new file mode 100644
index 0000000..42ec37d
--- /dev/null
+++ b/sse4_1.hh
@@ -0,0 +1,975 @@
+/*
+Intel SSE4.1 acceleration
+
+Copyright 2018 Ahmet Inan
+*/
+
+#ifndef SSE4_1_HH
+#define SSE4_1_HH
+
+#include <smmintrin.h>
+
+template <>
+union SIMD<float, 4>
+{
+	static const int SIZE = 4;
+	typedef float value_type;
+	typedef uint32_t uint_type;
+	__m128 m;
+	value_type v[SIZE];
+	uint_type u[SIZE];
+};
+
+template <>
+union SIMD<double, 2>
+{
+	static const int SIZE = 2;
+	typedef double value_type;
+	typedef uint64_t uint_type;
+	__m128d m;
+	value_type v[SIZE];
+	uint_type u[SIZE];
+};
+
+template <>
+union SIMD<int8_t, 16>
+{
+	static const int SIZE = 16;
+	typedef int8_t value_type;
+	typedef uint8_t uint_type;
+	__m128i m;
+	value_type v[SIZE];
+	uint_type u[SIZE];
+};
+
+template <>
+union SIMD<int16_t, 8>
+{
+	static const int SIZE = 8;
+	typedef int16_t value_type;
+	typedef uint16_t uint_type;
+	__m128i m;
+	value_type v[SIZE];
+	uint_type u[SIZE];
+};
+
+template <>
+union SIMD<int32_t, 4>
+{
+	static const int SIZE = 4;
+	typedef int32_t value_type;
+	typedef uint32_t uint_type;
+	__m128i m;
+	value_type v[SIZE];
+	uint_type u[SIZE];
+};
+
+template <>
+union SIMD<int64_t, 2>
+{
+	static const int SIZE = 2;
+	typedef int64_t value_type;
+	typedef uint64_t uint_type;
+	__m128i m;
+	value_type v[SIZE];
+	uint_type u[SIZE];
+};
+
+template <>
+union SIMD<uint8_t, 16>
+{
+	static const int SIZE = 16;
+	typedef uint8_t value_type;
+	typedef uint8_t uint_type;
+	__m128i m;
+	value_type v[SIZE];
+	uint_type u[SIZE];
+};
+
+template <>
+union SIMD<uint16_t, 8>
+{
+	static const int SIZE = 8;
+	typedef uint16_t value_type;
+	typedef uint16_t uint_type;
+	__m128i m;
+	value_type v[SIZE];
+	uint_type u[SIZE];
+};
+
+template <>
+union SIMD<uint32_t, 4>
+{
+	static const int SIZE = 4;
+	typedef uint32_t value_type;
+	typedef uint32_t uint_type;
+	__m128i m;
+	value_type v[SIZE];
+	uint_type u[SIZE];
+};
+
+template <>
+union SIMD<uint64_t, 2>
+{
+	static const int SIZE = 2;
+	typedef uint64_t value_type;
+	typedef uint64_t uint_type;
+	__m128i m;
+	value_type v[SIZE];
+	uint_type u[SIZE];
+};
+
+template <>
+inline SIMD<float, 4> vreinterpret(SIMD<uint32_t, 4> a)
+{
+	SIMD<float, 4> tmp;
+	tmp.m = (__m128)a.m;
+	return tmp;
+}
+
+template <>
+inline SIMD<uint32_t, 4> vreinterpret(SIMD<float, 4> a)
+{
+	SIMD<uint32_t, 4> tmp;
+	tmp.m = (__m128i)a.m;
+	return tmp;
+}
+
+template <>
+inline SIMD<double, 2> vreinterpret(SIMD<uint64_t, 2> a)
+{
+	SIMD<double, 2> tmp;
+	tmp.m = (__m128d)a.m;
+	return tmp;
+}
+
+template <>
+inline SIMD<uint64_t, 2> vreinterpret(SIMD<double, 2> a)
+{
+	SIMD<uint64_t, 2> tmp;
+	tmp.m = (__m128i)a.m;
+	return tmp;
+}
+
+template <>
+inline SIMD<int8_t, 16> vreinterpret(SIMD<uint8_t, 16> a)
+{
+	SIMD<int8_t, 16> tmp;
+	tmp.m = (__m128i)a.m;
+	return tmp;
+}
+
+template <>
+inline SIMD<uint8_t, 16> vreinterpret(SIMD<int8_t, 16> a)
+{
+	SIMD<uint8_t, 16> tmp;
+	tmp.m =
(__m128i)a.m; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + tmp.m = (__m128i)a.m; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + tmp.m = (__m128i)a.m; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + tmp.m = (__m128i)a.m; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + tmp.m = (__m128i)a.m; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + tmp.m = (__m128i)a.m; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + tmp.m = (__m128i)a.m; + return tmp; +} + +template <> +inline SIMD vdup>(float a) +{ + SIMD tmp; + tmp.m = _mm_set1_ps(a); + return tmp; +} + +template <> +inline SIMD vdup>(double a) +{ + SIMD tmp; + tmp.m = _mm_set1_pd(a); + return tmp; +} + +template <> +inline SIMD vdup>(int8_t a) +{ + SIMD tmp; + tmp.m = _mm_set1_epi8(a); + return tmp; +} + +template <> +inline SIMD vdup>(int16_t a) +{ + SIMD tmp; + tmp.m = _mm_set1_epi16(a); + return tmp; +} + +template <> +inline SIMD vdup>(int32_t a) +{ + SIMD tmp; + tmp.m = _mm_set1_epi32(a); + return tmp; +} + +template <> +inline SIMD vdup>(int64_t a) +{ + SIMD tmp; + tmp.m = _mm_set1_epi64x(a); + return tmp; +} + +template <> +inline SIMD vzero() +{ + SIMD tmp; + tmp.m = _mm_setzero_ps(); + return tmp; +} + +template <> +inline SIMD vzero() +{ + SIMD tmp; + tmp.m = _mm_setzero_pd(); + return tmp; +} + +template <> +inline SIMD vzero() +{ + SIMD tmp; + tmp.m = _mm_setzero_si128(); + return tmp; +} + +template <> +inline SIMD vzero() +{ + SIMD tmp; + tmp.m = _mm_setzero_si128(); + return tmp; +} + +template <> +inline SIMD vzero() +{ + SIMD tmp; + tmp.m = _mm_setzero_si128(); + return tmp; +} + +template <> +inline SIMD vzero() +{ + SIMD tmp; + tmp.m = _mm_setzero_si128(); + return tmp; +} + +template <> +inline SIMD vadd(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm_add_ps(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vadd(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm_add_pd(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vadd(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm_add_epi8(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vadd(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm_add_epi16(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vadd(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm_add_epi32(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vadd(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm_add_epi64(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vqadd(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm_adds_epi8(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vqadd(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm_adds_epi16(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vsub(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm_sub_ps(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vsub(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm_sub_pd(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vsub(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm_sub_epi8(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vsub(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm_sub_epi16(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vsub(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm_sub_epi32(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vsub(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm_sub_epi64(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vqsub(SIMD a, SIMD b) +{ + SIMD tmp; + 
tmp.m = _mm_subs_epi8(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vqsub(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm_subs_epi16(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vqsub(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm_subs_epu8(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vqsub(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm_subs_epu16(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vabs(SIMD a) +{ + SIMD tmp; + tmp.m = _mm_andnot_ps(_mm_set1_ps(-0.f), a.m); + return tmp; +} + +template <> +inline SIMD vabs(SIMD a) +{ + SIMD tmp; + tmp.m = _mm_andnot_pd(_mm_set1_pd(-0.), a.m); + return tmp; +} + +template <> +inline SIMD vqabs(SIMD a) +{ + SIMD tmp; + tmp.m = _mm_abs_epi8(_mm_max_epi8(a.m, _mm_set1_epi8(-INT8_MAX))); + return tmp; +} + +template <> +inline SIMD vqabs(SIMD a) +{ + SIMD tmp; + tmp.m = _mm_abs_epi16(_mm_max_epi16(a.m, _mm_set1_epi16(-INT16_MAX))); + return tmp; +} + +template <> +inline SIMD vqabs(SIMD a) +{ + SIMD tmp; + tmp.m = _mm_abs_epi32(_mm_max_epi32(a.m, _mm_set1_epi32(-INT32_MAX))); + return tmp; +} + +template <> +inline SIMD vsign(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm_andnot_ps( + _mm_cmpeq_ps(b.m, _mm_setzero_ps()), + _mm_xor_ps(a.m, _mm_and_ps(_mm_set1_ps(-0.f), b.m))); + return tmp; +} + +template <> +inline SIMD vsign(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm_andnot_pd( + _mm_cmpeq_pd(b.m, _mm_setzero_pd()), + _mm_xor_pd(a.m, _mm_and_pd(_mm_set1_pd(-0.), b.m))); + return tmp; +} + +template <> +inline SIMD vsign(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm_sign_epi8(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vsign(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm_sign_epi16(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vsign(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm_sign_epi32(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vorr(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm_or_si128(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vorr(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm_or_si128(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vorr(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm_or_si128(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vorr(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm_or_si128(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vand(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm_and_si128(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vand(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm_and_si128(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vand(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm_and_si128(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vand(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm_and_si128(a.m, b.m); + return tmp; +} + +template <> +inline SIMD veor(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm_xor_si128(a.m, b.m); + return tmp; +} + +template <> +inline SIMD veor(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm_xor_si128(a.m, b.m); + return tmp; +} + +template <> +inline SIMD veor(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm_xor_si128(a.m, b.m); + return tmp; +} + +template <> +inline SIMD veor(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm_xor_si128(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vbic(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm_andnot_si128(b.m, a.m); + return tmp; +} + +template <> +inline SIMD vbic(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm_andnot_si128(b.m, a.m); + return tmp; +} + +template <> +inline SIMD vbic(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = 
_mm_andnot_si128(b.m, a.m); + return tmp; +} + +template <> +inline SIMD vbic(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm_andnot_si128(b.m, a.m); + return tmp; +} + +template <> +inline SIMD vbsl(SIMD a, SIMD b, SIMD c) +{ + SIMD tmp; + tmp.m = _mm_or_si128(_mm_and_si128(a.m, b.m), _mm_andnot_si128(a.m, c.m)); + return tmp; +} + +template <> +inline SIMD vbsl(SIMD a, SIMD b, SIMD c) +{ + SIMD tmp; + tmp.m = _mm_or_si128(_mm_and_si128(a.m, b.m), _mm_andnot_si128(a.m, c.m)); + return tmp; +} + +template <> +inline SIMD vbsl(SIMD a, SIMD b, SIMD c) +{ + SIMD tmp; + tmp.m = _mm_or_si128(_mm_and_si128(a.m, b.m), _mm_andnot_si128(a.m, c.m)); + return tmp; +} + +template <> +inline SIMD vbsl(SIMD a, SIMD b, SIMD c) +{ + SIMD tmp; + tmp.m = _mm_or_si128(_mm_and_si128(a.m, b.m), _mm_andnot_si128(a.m, c.m)); + return tmp; +} + +template <> +inline SIMD vceqz(SIMD a) +{ + SIMD tmp; + tmp.m = (__m128i)_mm_cmpeq_ps(a.m, _mm_setzero_ps()); + return tmp; +} + +template <> +inline SIMD vceqz(SIMD a) +{ + SIMD tmp; + tmp.m = (__m128i)_mm_cmpeq_pd(a.m, _mm_setzero_pd()); + return tmp; +} + +template <> +inline SIMD vceqz(SIMD a) +{ + SIMD tmp; + tmp.m = _mm_cmpeq_epi8(a.m, _mm_setzero_si128()); + return tmp; +} + +template <> +inline SIMD vceqz(SIMD a) +{ + SIMD tmp; + tmp.m = _mm_cmpeq_epi16(a.m, _mm_setzero_si128()); + return tmp; +} + +template <> +inline SIMD vceqz(SIMD a) +{ + SIMD tmp; + tmp.m = _mm_cmpeq_epi32(a.m, _mm_setzero_si128()); + return tmp; +} + +template <> +inline SIMD vceqz(SIMD a) +{ + SIMD tmp; + tmp.m = _mm_cmpeq_epi64(a.m, _mm_setzero_si128()); + return tmp; +} + + +template <> +inline SIMD vceq(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = (__m128i)_mm_cmpeq_ps(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vceq(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = (__m128i)_mm_cmpeq_pd(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vceq(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm_cmpeq_epi8(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vceq(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm_cmpeq_epi16(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vceq(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm_cmpeq_epi32(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vceq(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm_cmpeq_epi64(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vcgtz(SIMD a) +{ + SIMD tmp; + tmp.m = (__m128i)_mm_cmpgt_ps(a.m, _mm_setzero_ps()); + return tmp; +} + +template <> +inline SIMD vcgtz(SIMD a) +{ + SIMD tmp; + tmp.m = (__m128i)_mm_cmpgt_pd(a.m, _mm_setzero_pd()); + return tmp; +} + +template <> +inline SIMD vcgtz(SIMD a) +{ + SIMD tmp; + tmp.m = _mm_cmpgt_epi8(a.m, _mm_setzero_si128()); + return tmp; +} + +template <> +inline SIMD vcgtz(SIMD a) +{ + SIMD tmp; + tmp.m = _mm_cmpgt_epi16(a.m, _mm_setzero_si128()); + return tmp; +} + +template <> +inline SIMD vcgtz(SIMD a) +{ + SIMD tmp; + tmp.m = _mm_cmpgt_epi32(a.m, _mm_setzero_si128()); + return tmp; +} + +template <> +inline SIMD vcgtz(SIMD a) +{ + SIMD tmp; + tmp.m = _mm_cmpgt_epi64(a.m, _mm_setzero_si128()); + return tmp; +} + +template <> +inline SIMD vcltz(SIMD a) +{ + SIMD tmp; + tmp.m = (__m128i)_mm_cmplt_ps(a.m, _mm_setzero_ps()); + return tmp; +} + +template <> +inline SIMD vcltz(SIMD a) +{ + SIMD tmp; + tmp.m = (__m128i)_mm_cmplt_pd(a.m, _mm_setzero_pd()); + return tmp; +} + +template <> +inline SIMD vcltz(SIMD a) +{ + SIMD tmp; + tmp.m = _mm_cmpgt_epi8(_mm_setzero_si128(), a.m); + return tmp; +} + +template <> +inline SIMD vcltz(SIMD a) +{ + SIMD tmp; + tmp.m = 
_mm_cmpgt_epi16(_mm_setzero_si128(), a.m); + return tmp; +} + +template <> +inline SIMD vcltz(SIMD a) +{ + SIMD tmp; + tmp.m = _mm_cmpgt_epi32(_mm_setzero_si128(), a.m); + return tmp; +} + +template <> +inline SIMD vcltz(SIMD a) +{ + SIMD tmp; + tmp.m = _mm_cmpgt_epi64(_mm_setzero_si128(), a.m); + return tmp; +} + +template <> +inline SIMD vmin(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm_min_ps(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vmin(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm_min_pd(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vmin(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm_min_epi8(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vmin(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm_min_epi16(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vmin(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm_min_epi32(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vmax(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm_max_ps(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vmax(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm_max_pd(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vmax(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm_max_epi8(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vmax(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm_max_epi16(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vmax(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm_max_epi32(a.m, b.m); + return tmp; +} + +#endif
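
For reference, a minimal usage sketch of the wrappers added by this patch (not part of the diff): it shows the broadcast/arithmetic/mask/select style the headers are built around. It assumes an AVX2 build (e.g. `g++ -mavx2`), so `simd.hh` pulls in `avx2.hh` and `SIMD<float, 8>` maps to `__m256`; the `main()` scaffolding is illustrative only.

```
#include <cstdio>
#include "simd.hh"

int main()
{
	typedef SIMD<float, 8> vec;
	vec a = vdup<vec>(-1.5f);           // broadcast scalar to all 8 lanes
	vec b = vdup<vec>(2.f);
	vec sum = vadd(a, b);               // lane-wise addition: 0.5 everywhere
	SIMD<uint32_t, 8> pos = vcgtz(sum); // all-one bits where sum > 0
	// keep sum where positive, zero elsewhere (branch-free select);
	// equivalent to vmax(sum, vzero<vec>())
	vec relu = vreinterpret<vec>(vbsl(pos, vmask(sum), vmask(vzero<vec>())));
	for (int i = 0; i < vec::SIZE; ++i)
		printf("%g ", relu.v[i]);
	printf("\n");
	return 0;
}
```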