From 5721da8bf7733fa1596b032b6adc752e5dc93dac Mon Sep 17 00:00:00 2001
From: Ahmet Inan
Date: Tue, 13 Feb 2024 13:27:55 +0100
Subject: [PATCH] added vshuf versions for SSE

---
 sse4_1_double.hh    | 22 ++++++++++++++++++++++
 sse4_1_quadruple.hh | 26 ++++++++++++++++++++++++++
 sse4_1_triple.hh    | 24 ++++++++++++++++++++++++
 3 files changed, 72 insertions(+)

diff --git a/sse4_1_double.hh b/sse4_1_double.hh
index 4daa09d..41dc193 100644
--- a/sse4_1_double.hh
+++ b/sse4_1_double.hh
@@ -1272,3 +1272,25 @@ inline SIMD vclamp(SIMD x, int32_t a, int32_t b)
 	return tmp;
 }
 
+template <>
+inline SIMD vshuf(SIMD a, SIMD b)
+{
+	SIMD tmp;
+	for (int i = 0; i < 2; ++i)
+		tmp.m[i] = _mm_or_si128(
+			_mm_shuffle_epi8(a.m[0], _mm_or_si128(b.m[i], _mm_cmpgt_epi8(b.m[i], _mm_set1_epi8(15)))),
+			_mm_shuffle_epi8(a.m[1], _mm_sub_epi8(b.m[i], _mm_set1_epi8(16))));
+	return tmp;
+}
+
+template <>
+inline SIMD vshuf(SIMD a, SIMD b)
+{
+	SIMD tmp;
+	for (int i = 0; i < 2; ++i)
+		tmp.m[i] = _mm_or_si128(
+			_mm_shuffle_epi8(a.m[0], _mm_or_si128(b.m[i], _mm_cmpgt_epi8(b.m[i], _mm_set1_epi8(15)))),
+			_mm_shuffle_epi8(a.m[1], _mm_sub_epi8(b.m[i], _mm_set1_epi8(16))));
+	return tmp;
+}
+
diff --git a/sse4_1_quadruple.hh b/sse4_1_quadruple.hh
index 71d8fa9..3b4db08 100644
--- a/sse4_1_quadruple.hh
+++ b/sse4_1_quadruple.hh
@@ -1272,3 +1272,29 @@ inline SIMD vclamp(SIMD x, int32_t a, int32_t b)
 	return tmp;
 }
 
+template <>
+inline SIMD vshuf(SIMD a, SIMD b)
+{
+	SIMD tmp;
+	for (int i = 0; i < 4; ++i)
+		tmp.m[i] = _mm_or_si128(_mm_or_si128(_mm_or_si128(
+			_mm_shuffle_epi8(a.m[0], _mm_or_si128(b.m[i], _mm_cmpgt_epi8(b.m[i], _mm_set1_epi8(15)))),
+			_mm_shuffle_epi8(a.m[1], _mm_or_si128(_mm_sub_epi8(b.m[i], _mm_set1_epi8(16)), _mm_cmpgt_epi8(b.m[i], _mm_set1_epi8(31))))),
+			_mm_shuffle_epi8(a.m[2], _mm_or_si128(_mm_sub_epi8(b.m[i], _mm_set1_epi8(32)), _mm_cmpgt_epi8(b.m[i], _mm_set1_epi8(47))))),
+			_mm_shuffle_epi8(a.m[3], _mm_sub_epi8(b.m[i], _mm_set1_epi8(48))));
+	return tmp;
+}
+
+template <>
+inline SIMD vshuf(SIMD a, SIMD b)
+{
+	SIMD tmp;
+	for (int i = 0; i < 4; ++i)
+		tmp.m[i] = _mm_or_si128(_mm_or_si128(_mm_or_si128(
+			_mm_shuffle_epi8(a.m[0], _mm_or_si128(b.m[i], _mm_cmpgt_epi8(b.m[i], _mm_set1_epi8(15)))),
+			_mm_shuffle_epi8(a.m[1], _mm_or_si128(_mm_sub_epi8(b.m[i], _mm_set1_epi8(16)), _mm_cmpgt_epi8(b.m[i], _mm_set1_epi8(31))))),
+			_mm_shuffle_epi8(a.m[2], _mm_or_si128(_mm_sub_epi8(b.m[i], _mm_set1_epi8(32)), _mm_cmpgt_epi8(b.m[i], _mm_set1_epi8(47))))),
+			_mm_shuffle_epi8(a.m[3], _mm_sub_epi8(b.m[i], _mm_set1_epi8(48))));
+	return tmp;
+}
+
diff --git a/sse4_1_triple.hh b/sse4_1_triple.hh
index 5b5818d..162d16c 100644
--- a/sse4_1_triple.hh
+++ b/sse4_1_triple.hh
@@ -1272,3 +1272,27 @@ inline SIMD vclamp(SIMD x, int32_t a, int32_t b)
 	return tmp;
 }
 
+template <>
+inline SIMD vshuf(SIMD a, SIMD b)
+{
+	SIMD tmp;
+	for (int i = 0; i < 3; ++i)
+		tmp.m[i] = _mm_or_si128(_mm_or_si128(
+			_mm_shuffle_epi8(a.m[0], _mm_or_si128(b.m[i], _mm_cmpgt_epi8(b.m[i], _mm_set1_epi8(15)))),
+			_mm_shuffle_epi8(a.m[1], _mm_or_si128(_mm_sub_epi8(b.m[i], _mm_set1_epi8(16)), _mm_cmpgt_epi8(b.m[i], _mm_set1_epi8(31))))),
+			_mm_shuffle_epi8(a.m[2], _mm_sub_epi8(b.m[i], _mm_set1_epi8(32))));
+	return tmp;
+}
+
+template <>
+inline SIMD vshuf(SIMD a, SIMD b)
+{
+	SIMD tmp;
+	for (int i = 0; i < 3; ++i)
+		tmp.m[i] = _mm_or_si128(_mm_or_si128(
+			_mm_shuffle_epi8(a.m[0], _mm_or_si128(b.m[i], _mm_cmpgt_epi8(b.m[i], _mm_set1_epi8(15)))),
+			_mm_shuffle_epi8(a.m[1], _mm_or_si128(_mm_sub_epi8(b.m[i], _mm_set1_epi8(16)), _mm_cmpgt_epi8(b.m[i], _mm_set1_epi8(31))))),
+			_mm_shuffle_epi8(a.m[2], _mm_sub_epi8(b.m[i], _mm_set1_epi8(32))));
+	return tmp;
+}
+