added vcopysign()

2026-04-27 14:30:36 +00:00 · 2021-06-26 08:00:52 +02:00 · 2021-06-26 08:00:52 +02:00 · 6cddcef660
commit 6cddcef660
parent 778d51ac0f
4 changed files with 73 additions and 0 deletions
--- a/avx2.hh
+++ b/avx2.hh
@ -598,6 +598,28 @@ inline SIMD<int32_t, 8> vsign(SIMD<int32_t, 8> a, SIMD<int32_t, 8> b)
 	return tmp;
 }

+// Per lane: magnitude of a combined with the sign bit of b.
+template <>
+inline SIMD<float, 8> vcopysign(SIMD<float, 8> a, SIMD<float, 8> b)
+{
+	const __m256 sign_mask = _mm256_set1_ps(-0.f); // only the sign bit is set
+	const __m256 magnitude = _mm256_andnot_ps(sign_mask, a.m); // ~mask & a
+	SIMD<float, 8> result;
+	result.m = _mm256_or_ps(magnitude, _mm256_and_ps(sign_mask, b.m));
+	return result;
+}
+
+// Per lane: magnitude of a combined with the sign bit of b.
+template <>
+inline SIMD<double, 4> vcopysign(SIMD<double, 4> a, SIMD<double, 4> b)
+{
+	const __m256d sign_mask = _mm256_set1_pd(-0.); // only the sign bit is set
+	const __m256d magnitude = _mm256_andnot_pd(sign_mask, a.m); // ~mask & a
+	SIMD<double, 4> result;
+	result.m = _mm256_or_pd(magnitude, _mm256_and_pd(sign_mask, b.m));
+	return result;
+}
+
 template <>
 inline SIMD<uint8_t, 32> vorr(SIMD<uint8_t, 32> a, SIMD<uint8_t, 32> b)
 {
--- a/neon.hh
+++ b/neon.hh
@ -522,6 +522,17 @@ inline SIMD<int8_t, 16> vsign(SIMD<int8_t, 16> a, SIMD<int8_t, 16> b)
 	return tmp;
 }

+// Per lane: magnitude of a combined with the sign bit of b.
+template <>
+inline SIMD<float, 4> vcopysign(SIMD<float, 4> a, SIMD<float, 4> b)
+{
+	const uint32x4_t sign_mask = (uint32x4_t)vdupq_n_f32(-0.f); // only the sign bit is set
+	const uint32x4_t magnitude = vbicq_u32((uint32x4_t)a.m, sign_mask); // a & ~mask
+	SIMD<float, 4> result;
+	result.m = (float32x4_t)vorrq_u32(magnitude, vandq_u32((uint32x4_t)b.m, sign_mask));
+	return result;
+}
+
 template <>
 inline SIMD<uint8_t, 16> vorr(SIMD<uint8_t, 16> a, SIMD<uint8_t, 16> b)
 {
--- a/simd.hh
+++ b/simd.hh
@ -1387,6 +1387,24 @@ static inline SIMD<int64_t, WIDTH> vsign(SIMD<int64_t, WIDTH> a, SIMD<int64_t, W
 	return tmp;
 }

+template <int WIDTH>
+static inline SIMD<float, WIDTH> vcopysign(SIMD<float, WIDTH> a, SIMD<float, WIDTH> b)
+{
+	SIMD<float, WIDTH> result, sign_mask = vdup<SIMD<float, WIDTH>>(-0.f); // -0.f: sign bit only (IEEE 754)
+	for (int lane = 0; lane < WIDTH; ++lane) // keep a's magnitude bits, take b's sign bit
+		result.u[lane] = (b.u[lane] & sign_mask.u[lane]) | (a.u[lane] & ~sign_mask.u[lane]);
+	return result;
+}
+
+template <int WIDTH>
+static inline SIMD<double, WIDTH> vcopysign(SIMD<double, WIDTH> a, SIMD<double, WIDTH> b)
+{
+	SIMD<double, WIDTH> result, sign_mask = vdup<SIMD<double, WIDTH>>(-0.); // -0.: sign bit only (IEEE 754)
+	for (int lane = 0; lane < WIDTH; ++lane) // keep a's magnitude bits, take b's sign bit
+		result.u[lane] = (b.u[lane] & sign_mask.u[lane]) | (a.u[lane] & ~sign_mask.u[lane]);
+	return result;
+}
+
 template <int WIDTH>
 static inline SIMD<uint8_t, WIDTH> vshuf(SIMD<uint8_t, WIDTH> a, SIMD<uint8_t, WIDTH> b)
 {
--- a/sse4_1.hh
+++ b/sse4_1.hh
@ -598,6 +598,28 @@ inline SIMD<int32_t, 4> vsign(SIMD<int32_t, 4> a, SIMD<int32_t, 4> b)
 	return tmp;
 }

+// Per lane: magnitude of a combined with the sign bit of b.
+template <>
+inline SIMD<float, 4> vcopysign(SIMD<float, 4> a, SIMD<float, 4> b)
+{
+	const __m128 sign_mask = _mm_set1_ps(-0.f); // only the sign bit is set
+	const __m128 magnitude = _mm_andnot_ps(sign_mask, a.m); // ~mask & a
+	SIMD<float, 4> result;
+	result.m = _mm_or_ps(magnitude, _mm_and_ps(sign_mask, b.m));
+	return result;
+}
+
+// Per lane: the magnitude of a combined with the sign bit of b.
+template <>
+inline SIMD<double, 2> vcopysign(SIMD<double, 2> a, SIMD<double, 2> b)
+{
+	// -0. has only the sign bit set; __m128d is the type _mm_set1_pd() returns
+	__m128d negz = _mm_set1_pd(-0.);
+	SIMD<double, 2> tmp;
+	tmp.m = _mm_or_pd(_mm_andnot_pd(negz, a.m), _mm_and_pd(negz, b.m));
+	return tmp;
+}
+
 template <>
 inline SIMD<uint8_t, 16> vorr(SIMD<uint8_t, 16> a, SIMD<uint8_t, 16> b)
 {