From 6cddcef660f74d8d4d41e08096ad8004aaa0a153 Mon Sep 17 00:00:00 2001 From: Ahmet Inan Date: Sat, 26 Jun 2021 08:00:52 +0200 Subject: [PATCH] added vcopysign() --- avx2.hh | 22 ++++++++++++++++++++++ neon.hh | 11 +++++++++++ simd.hh | 18 ++++++++++++++++++ sse4_1.hh | 22 ++++++++++++++++++++++ 4 files changed, 73 insertions(+) diff --git a/avx2.hh b/avx2.hh index 0d89776..9b12334 100644 --- a/avx2.hh +++ b/avx2.hh @@ -598,6 +598,28 @@ inline SIMD vsign(SIMD a, SIMD b) return tmp; } +template <> +inline SIMD vcopysign(SIMD a, SIMD b) /* Per lane: keeps the bits of a outside the negz mask and takes the bits of b inside it. NOTE(review): SIMD appears here without template arguments, presumably stripped from this text - confirm against the repository. */ +{ + SIMD tmp; + __m256 negz = _mm256_set1_ps(-0.f); /* -0.f broadcast to every lane; in IEEE-754 binary32 this sets only the sign bit */ + tmp.m = _mm256_or_ps( + _mm256_andnot_ps(negz, a.m), /* (~negz) & a */ + _mm256_and_ps(negz, b.m)); /* negz & b */ + return tmp; +} + +template <> +inline SIMD vcopysign(SIMD a, SIMD b) /* Same mask-select as the _ps variant, using the _pd intrinsics and a -0. constant. */ +{ + SIMD tmp; + __m256d negz = _mm256_set1_pd(-0.); + tmp.m = _mm256_or_pd( + _mm256_andnot_pd(negz, a.m), + _mm256_and_pd(negz, b.m)); + return tmp; +} + template <> inline SIMD vorr(SIMD a, SIMD b) { diff --git a/neon.hh b/neon.hh index a94cc6c..6fe4aa6 100644 --- a/neon.hh +++ b/neon.hh @@ -522,6 +522,17 @@ inline SIMD vsign(SIMD a, SIMD b) return tmp; } +template <> +inline SIMD vcopysign(SIMD a, SIMD b) /* Same mask-select applied to the lanes cast to uint32x4_t; the result is cast back to float32x4_t. */ +{ + SIMD tmp; + uint32x4_t negz = (uint32x4_t)vdupq_n_f32(-0.f); /* bit pattern of -0.f in every lane, viewed as uint32 */ + tmp.m = (float32x4_t)vorrq_u32( + vbicq_u32((uint32x4_t)a.m, negz), /* a & ~negz */ + vandq_u32((uint32x4_t)b.m, negz)); /* b & negz */ + return tmp; +} + template <> inline SIMD vorr(SIMD a, SIMD b) { diff --git a/simd.hh b/simd.hh index 9b223b4..21dc163 100644 --- a/simd.hh +++ b/simd.hh @@ -1387,6 +1387,24 @@ static inline SIMD vsign(SIMD a, SIMD +static inline SIMD vcopysign(SIMD a, SIMD b) /* Lane-by-lane form of the same expression over WIDTH elements of the .u members (presumably the integer view of each lane - confirm in simd.hh). NOTE(review): the hunk text before this line looks truncated (context lines and the template header are missing) - confirm against the repository. */ +{ + SIMD tmp, negz = vdup>(-0.f); + for (int i = 0; i < WIDTH; ++i) + tmp.u[i] = (a.u[i] & ~negz.u[i]) | (negz.u[i] & b.u[i]); /* bits of a outside the mask, bits of b inside it */ + return tmp; +} + +template +static inline SIMD vcopysign(SIMD a, SIMD b) /* Same loop with a -0. constant. */ +{ + SIMD tmp, negz = vdup>(-0.); + for (int i = 0; i < WIDTH; ++i) + tmp.u[i] = (a.u[i] & ~negz.u[i]) | (negz.u[i] & b.u[i]); + return tmp; +} + template static inline SIMD vshuf(SIMD a, SIMD b) { 
diff --git a/sse4_1.hh b/sse4_1.hh index 0e61b9f..1b9d8c7 100644 --- a/sse4_1.hh +++ b/sse4_1.hh @@ -598,6 +598,28 @@ inline SIMD vsign(SIMD a, SIMD b) return tmp; } +template <> +inline SIMD vcopysign(SIMD a, SIMD b) /* Per lane: keeps the bits of a outside the negz mask and takes the bits of b inside it. NOTE(review): SIMD appears here without template arguments, presumably stripped from this text - confirm against the repository. */ +{ + SIMD tmp; + __m128 negz = _mm_set1_ps(-0.f); /* -0.f broadcast to every lane; in IEEE-754 binary32 this sets only the sign bit */ + tmp.m = _mm_or_ps( + _mm_andnot_ps(negz, a.m), /* (~negz) & a */ + _mm_and_ps(negz, b.m)); /* negz & b */ + return tmp; +} + +template <> +inline SIMD vcopysign(SIMD a, SIMD b) /* Same mask-select as the _ps variant, using the _pd intrinsics and a -0. constant. */ +{ + SIMD tmp; + __m128d negz = _mm_set1_pd(-0.); /* _mm_set1_pd yields __m128d, so the mask is declared __m128d to match the _pd operations below */ + tmp.m = _mm_or_pd( + _mm_andnot_pd(negz, a.m), /* (~negz) & a */ + _mm_and_pd(negz, b.m)); /* negz & b */ + return tmp; +} + template <> inline SIMD vorr(SIMD a, SIMD b) {