From 48303e7f58ba88a5dc4f5addfe580d40bdfcfb14 Mon Sep 17 00:00:00 2001 From: Ahmet Inan Date: Sun, 12 Jul 2020 22:44:46 +0200 Subject: [PATCH] added vmul() and vsignum() --- avx2.hh | 60 ++++++++++++++++++++++++++++++ neon.hh | 35 ++++++++++++++++++ simd.hh | 108 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ sse4_1.hh | 60 ++++++++++++++++++++++++++++++ 4 files changed, 263 insertions(+) diff --git a/avx2.hh b/avx2.hh index f767037..172918f 100644 --- a/avx2.hh +++ b/avx2.hh @@ -455,6 +455,22 @@ inline SIMD vqsub(SIMD a, SIMD b) return tmp; } +template <> +inline SIMD vmul(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm256_mul_ps(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vmul(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm256_mul_pd(a.m, b.m); + return tmp; +} + template <> inline SIMD vabs(SIMD a) { @@ -495,6 +511,50 @@ inline SIMD vqabs(SIMD a) return tmp; } +template <> +inline SIMD vsignum(SIMD a) +{ + SIMD tmp; + tmp.m = _mm256_andnot_ps( + _mm256_cmp_ps(a.m, _mm256_setzero_ps(), _CMP_EQ_OQ), + _mm256_or_ps(_mm256_set1_ps(1.f), _mm256_and_ps(_mm256_set1_ps(-0.f), a.m))); + return tmp; +} + +template <> +inline SIMD vsignum(SIMD a) +{ + SIMD tmp; + tmp.m = _mm256_andnot_pd( + _mm256_cmp_pd(a.m, _mm256_setzero_pd(), _CMP_EQ_OQ), + _mm256_or_pd(_mm256_set1_pd(1.), _mm256_and_pd(_mm256_set1_pd(-0.), a.m))); + return tmp; +} + +template <> +inline SIMD vsignum(SIMD a) +{ + SIMD tmp; + tmp.m = _mm256_sign_epi8(_mm256_set1_epi8(1), a.m); + return tmp; +} + +template <> +inline SIMD vsignum(SIMD a) +{ + SIMD tmp; + tmp.m = _mm256_sign_epi16(_mm256_set1_epi16(1), a.m); + return tmp; +} + +template <> +inline SIMD vsignum(SIMD a) +{ + SIMD tmp; + tmp.m = _mm256_sign_epi32(_mm256_set1_epi32(1), a.m); + return tmp; +} + template <> inline SIMD vsign(SIMD a, SIMD b) { diff --git a/neon.hh b/neon.hh index 9a48bd5..a9cf1a9 100644 --- a/neon.hh +++ b/neon.hh @@ -444,6 +444,22 @@ inline SIMD vqsub(SIMD a, SIMD b) return tmp; } +template <> +inline SIMD vmul(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = vmulq_f32(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vmul(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = vmulq_s8(a.m, b.m); + return tmp; +} + template <> inline SIMD vabs(SIMD a) { @@ -468,6 +484,25 @@ inline SIMD vqabs(SIMD a) return tmp; } +template <> +inline SIMD vsignum(SIMD a) +{ + SIMD tmp; + tmp.m = (float32x4_t)vbicq_u32( + veorq_u32((uint32x4_t)vdupq_n_f32(1.f), vandq_u32((uint32x4_t)vdupq_n_f32(-0.f), (uint32x4_t)a.m)), + vceqq_f32(a.m, vdupq_n_f32(0.f))); + return tmp; +} + +template <> +inline SIMD vsignum(SIMD a) +{ + SIMD tmp; + tmp.m = (int8x16_t)vorrq_u8(vcgtq_s8(vdupq_n_s8(0), a.m), + vandq_u8(vcgtq_s8(a.m, vdupq_n_s8(0)), (uint8x16_t)vdupq_n_s8(1))); + return tmp; +} + template <> inline SIMD vsign(SIMD a, SIMD b) { diff --git a/simd.hh b/simd.hh index f3d095c..8e1d05d 100644 --- a/simd.hh +++ b/simd.hh @@ -1226,6 +1226,114 @@ static inline SIMD vqsub(SIMD a, SIMD +static inline SIMD vmul(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = a.v[i] * b.v[i]; + return tmp; +} + +template +static inline SIMD vmul(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = a.v[i] * b.v[i]; + return tmp; +} + +template +static inline SIMD vmul(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = a.v[i] * b.v[i]; + return tmp; +} + +template +static inline SIMD vmul(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = a.v[i] * b.v[i]; + return tmp; +} + +template +static inline SIMD vmul(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = a.v[i] * b.v[i]; + return tmp; +} + +template +static inline SIMD vmul(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = a.v[i] * b.v[i]; + return tmp; +} + +template +static inline SIMD vsignum(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = (a.v[i] > 0.f) - (a.v[i] < 0.f); + return tmp; +} + +template +static inline SIMD vsignum(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = (a.v[i] > 0.) - (a.v[i] < 0.); + return tmp; +} + +template +static inline SIMD vsignum(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = (a.v[i] > 0) - (a.v[i] < 0); + return tmp; +} + +template +static inline SIMD vsignum(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = (a.v[i] > 0) - (a.v[i] < 0); + return tmp; +} + +template +static inline SIMD vsignum(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = (a.v[i] > 0) - (a.v[i] < 0); + return tmp; +} + +template +static inline SIMD vsignum(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < WIDTH; ++i) + tmp.v[i] = (a.v[i] > 0) - (a.v[i] < 0); + return tmp; +} + template static inline SIMD vsign(SIMD a, SIMD b) { diff --git a/sse4_1.hh b/sse4_1.hh index 18cde4d..d69c63a 100644 --- a/sse4_1.hh +++ b/sse4_1.hh @@ -455,6 +455,22 @@ inline SIMD vqsub(SIMD a, SIMD b) return tmp; } +template <> +inline SIMD vmul(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm_mul_ps(a.m, b.m); + return tmp; +} + +template <> +inline SIMD vmul(SIMD a, SIMD b) +{ + SIMD tmp; + tmp.m = _mm_mul_pd(a.m, b.m); + return tmp; +} + template <> inline SIMD vabs(SIMD a) { @@ -495,6 +511,50 @@ inline SIMD vqabs(SIMD a) return tmp; } +template <> +inline SIMD vsignum(SIMD a) +{ + SIMD tmp; + tmp.m = _mm_andnot_ps( + _mm_cmpeq_ps(a.m, _mm_setzero_ps()), + _mm_or_ps(_mm_set1_ps(1.f), _mm_and_ps(_mm_set1_ps(-0.f), a.m))); + return tmp; +} + +template <> +inline SIMD vsignum(SIMD a) +{ + SIMD tmp; + tmp.m = _mm_andnot_pd( + _mm_cmpeq_pd(a.m, _mm_setzero_pd()), + _mm_or_pd(_mm_set1_pd(1.), _mm_and_pd(_mm_set1_pd(-0.), a.m))); + return tmp; +} + +template <> +inline SIMD vsignum(SIMD a) +{ + SIMD tmp; + tmp.m = _mm_sign_epi8(_mm_set1_epi8(1), a.m); + return tmp; +} + +template <> +inline SIMD vsignum(SIMD a) +{ + SIMD tmp; + tmp.m = _mm_sign_epi16(_mm_set1_epi16(1), a.m); + return tmp; +} + +template <> +inline SIMD vsignum(SIMD a) +{ + SIMD tmp; + tmp.m = _mm_sign_epi32(_mm_set1_epi32(1), a.m); + return tmp; +} + template <> inline SIMD vsign(SIMD a, SIMD b) {