From 2a20850635db1162a47f1c4484535f747bda73ad Mon Sep 17 00:00:00 2001
From: Ahmet Inan
Date: Tue, 13 Feb 2024 14:04:50 +0100
Subject: [PATCH] added doubled versions of vshuf for AVX2

---
 avx2_double.hh | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/avx2_double.hh b/avx2_double.hh
index a9fcfcd..376963f 100644
--- a/avx2_double.hh
+++ b/avx2_double.hh
@@ -1281,3 +1281,43 @@ inline SIMD vclamp(SIMD x, int64_t a, int64_t b)
 	return tmp;
 }
 
+template <> // Explicit specialization of vshuf; its template arguments are not visible in this view.
+inline SIMD vshuf(SIMD a, SIMD b) // Byte table lookup: a.m[0..1] hold the 64 table bytes, b.m[0..1] hold one index byte per output byte.
+{
+	SIMD tmp;
+	for (int i = 0; i < 2; ++i) { // One pass per 256-bit half of the index vector b.
+		__m256i c = _mm256_or_si256(b.m[i], _mm256_cmpgt_epi8(b.m[i], _mm256_set1_epi8(15))); // Control for table bytes 0..15: b itself, forced to 0xFF where b > 15 (signed compare).
+		__m256i d = _mm256_or_si256(_mm256_sub_epi8(b.m[i], _mm256_set1_epi8(16)), _mm256_cmpgt_epi8(b.m[i], _mm256_set1_epi8(31))); // Control for table bytes 16..31: b - 16 (bit 7 set when b < 16), forced to 0xFF where b > 31.
+		__m256i e = _mm256_or_si256(_mm256_sub_epi8(b.m[i], _mm256_set1_epi8(32)), _mm256_cmpgt_epi8(b.m[i], _mm256_set1_epi8(47))); // Control for table bytes 32..47: b - 32 (bit 7 set when b < 32), forced to 0xFF where b > 47.
+		__m256i f = _mm256_sub_epi8(b.m[i], _mm256_set1_epi8(48)); // Control for table bytes 48..63: b - 48 (bit 7 set when b < 48); no upper mask is applied here.
+		__m256i g = _mm256_or_si256( // Lookups into a.m[0]; _mm256_shuffle_epi8 yields 0 for control bytes with bit 7 set, else indexes by the low 4 bits within each 128-bit lane.
+			_mm256_shuffle_epi8(_mm256_permute2x128_si256(a.m[0], a.m[0], 0), c), // imm 0: low 128 bits of a.m[0] copied into both lanes, so every lane sees table bytes 0..15.
+			_mm256_shuffle_epi8(_mm256_permute2x128_si256(a.m[0], a.m[0], 17), d)); // imm 17 (0x11): high 128 bits of a.m[0] copied into both lanes (table bytes 16..31).
+		__m256i h = _mm256_or_si256( // Same two lookups, but into a.m[1].
+			_mm256_shuffle_epi8(_mm256_permute2x128_si256(a.m[1], a.m[1], 0), e), // Low 128 bits of a.m[1] in both lanes (table bytes 32..47).
+			_mm256_shuffle_epi8(_mm256_permute2x128_si256(a.m[1], a.m[1], 17), f)); // High 128 bits of a.m[1] in both lanes (table bytes 48..63).
+		tmp.m[i] = _mm256_or_si256(g, h); // For an index in 0..63 only one of the four lookups is non-zero, so OR merges them. NOTE(review): indices 64..127 still select through f - confirm the intended index range with callers.
+	}
+	return tmp;
+}
+
+template <> // Second explicit specialization of vshuf; as shown here its signature is textually identical to the one before it because the template arguments are not visible in this view.
+inline SIMD vshuf(SIMD a, SIMD b) // Byte table lookup: a.m[0..1] hold the 64 table bytes, b.m[0..1] hold one index byte per output byte.
+{
+	SIMD tmp;
+	for (int i = 0; i < 2; ++i) { // One pass per 256-bit half of the index vector b.
+		__m256i c = _mm256_or_si256(b.m[i], _mm256_cmpgt_epi8(b.m[i], _mm256_set1_epi8(15))); // Control for table bytes 0..15: b itself, forced to 0xFF where b > 15 (signed compare).
+		__m256i d = _mm256_or_si256(_mm256_sub_epi8(b.m[i], _mm256_set1_epi8(16)), _mm256_cmpgt_epi8(b.m[i], _mm256_set1_epi8(31))); // Control for table bytes 16..31: b - 16 (bit 7 set when b < 16), forced to 0xFF where b > 31.
+		__m256i e = _mm256_or_si256(_mm256_sub_epi8(b.m[i], _mm256_set1_epi8(32)), _mm256_cmpgt_epi8(b.m[i], _mm256_set1_epi8(47))); // Control for table bytes 32..47: b - 32 (bit 7 set when b < 32), forced to 0xFF where b > 47.
+		__m256i f = _mm256_sub_epi8(b.m[i], _mm256_set1_epi8(48)); // Control for table bytes 48..63: b - 48 (bit 7 set when b < 48); no upper mask is applied here.
+		__m256i g = _mm256_or_si256( // Lookups into a.m[0]; _mm256_shuffle_epi8 yields 0 for control bytes with bit 7 set, else indexes by the low 4 bits within each 128-bit lane.
+			_mm256_shuffle_epi8(_mm256_permute2x128_si256(a.m[0], a.m[0], 0), c), // imm 0: low 128 bits of a.m[0] copied into both lanes, so every lane sees table bytes 0..15.
+			_mm256_shuffle_epi8(_mm256_permute2x128_si256(a.m[0], a.m[0], 17), d)); // imm 17 (0x11): high 128 bits of a.m[0] copied into both lanes (table bytes 16..31).
+		__m256i h = _mm256_or_si256( // Same two lookups, but into a.m[1].
+			_mm256_shuffle_epi8(_mm256_permute2x128_si256(a.m[1], a.m[1], 0), e), // Low 128 bits of a.m[1] in both lanes (table bytes 32..47).
+			_mm256_shuffle_epi8(_mm256_permute2x128_si256(a.m[1], a.m[1], 17), f)); // High 128 bits of a.m[1] in both lanes (table bytes 48..63).
+		tmp.m[i] = _mm256_or_si256(g, h); // For an index in 0..63 only one of the four lookups is non-zero, so OR merges them. NOTE(review): indices 64..127 still select through f - confirm the intended index range with callers.
+	}
+	return tmp;
+}
+