From 992c83933d9e2ae44af93ff785daef67ad3e34b9 Mon Sep 17 00:00:00 2001 From: Ahmet Inan Date: Tue, 13 Feb 2024 11:30:11 +0100 Subject: [PATCH] added tripled and quadrupled versions --- neon_quadruple.hh | 1081 +++++++++++++++++++++++++++++++++++++++++++++ neon_triple.hh | 1079 ++++++++++++++++++++++++++++++++++++++++++++ simd.hh | 2 + 3 files changed, 2162 insertions(+) create mode 100644 neon_quadruple.hh create mode 100644 neon_triple.hh diff --git a/neon_quadruple.hh b/neon_quadruple.hh new file mode 100644 index 0000000..943ed71 --- /dev/null +++ b/neon_quadruple.hh @@ -0,0 +1,1081 @@ +/* +ARM NEON acceleration times four + +Copyright 2024 Ahmet Inan +*/ + +#pragma once + +#include + +template <> +union SIMD +{ + static const int SIZE = 16; + typedef float value_type; + typedef uint32_t uint_type; + float32x4_t m[4]; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template <> +union SIMD +{ + static const int SIZE = 64; + typedef int8_t value_type; + typedef uint8_t uint_type; + int8x16_t m[4]; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template <> +union SIMD +{ + static const int SIZE = 32; + typedef int16_t value_type; + typedef uint16_t uint_type; + int16x8_t m[4]; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template <> +union SIMD +{ + static const int SIZE = 16; + typedef int32_t value_type; + typedef uint32_t uint_type; + int32x4_t m[4]; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template <> +union SIMD +{ + static const int SIZE = 8; + typedef int64_t value_type; + typedef uint64_t uint_type; + int64x2_t m[4]; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template <> +union SIMD +{ + static const int SIZE = 64; + typedef uint8_t value_type; + typedef uint8_t uint_type; + uint8x16_t m[4]; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template <> +union SIMD +{ + static const int SIZE = 32; + typedef uint16_t value_type; + typedef uint16_t uint_type; + uint16x8_t m[4]; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + 
+template <> +union SIMD +{ + static const int SIZE = 16; + typedef uint32_t value_type; + typedef uint32_t uint_type; + uint32x4_t m[4]; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template <> +union SIMD +{ + static const int SIZE = 8; + typedef uint64_t value_type; + typedef uint64_t uint_type; + uint64x2_t m[4]; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = (float32x4_t)a.m[i]; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = (uint32x4_t)a.m[i]; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = (int8x16_t)a.m[i]; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = (uint8x16_t)a.m[i]; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = (int16x8_t)a.m[i]; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = (uint16x8_t)a.m[i]; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = (int32x4_t)a.m[i]; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = (uint32x4_t)a.m[i]; + return tmp; +} + +template <> +inline SIMD vdup(float a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vdupq_n_f32(a); + return tmp; +} + +template <> +inline SIMD vdup(int8_t a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vdupq_n_s8(a); + return tmp; +} + +template <> +inline SIMD vdup(int16_t a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vdupq_n_s16(a); + return tmp; +} + +template <> +inline SIMD vdup(int32_t a) +{ + SIMD tmp; + for (int i = 0; 
i < 4; ++i) + tmp.m[i] = vdupq_n_s32(a); + return tmp; +} + +template <> +inline SIMD vdup(int64_t a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vdupq_n_s64(a); + return tmp; +} + +template <> +inline SIMD vdup(uint8_t a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vdupq_n_u8(a); + return tmp; +} + +template <> +inline SIMD vdup(uint16_t a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vdupq_n_u16(a); + return tmp; +} + +template <> +inline SIMD vdup(uint32_t a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vdupq_n_u32(a); + return tmp; +} + +template <> +inline SIMD vdup(uint64_t a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vdupq_n_u64(a); + return tmp; +} + +template <> +inline SIMD vzero() +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = (float32x4_t)veorq_u32((uint32x4_t)tmp.m[i], (uint32x4_t)tmp.m[i]); + return tmp; +} + +template <> +inline SIMD vzero() +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = veorq_s8(tmp.m[i], tmp.m[i]); + return tmp; +} + +template <> +inline SIMD vzero() +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = veorq_s16(tmp.m[i], tmp.m[i]); + return tmp; +} + +template <> +inline SIMD vzero() +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = veorq_s32(tmp.m[i], tmp.m[i]); + return tmp; +} + +template <> +inline SIMD vzero() +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = veorq_s64(tmp.m[i], tmp.m[i]); + return tmp; +} + +template <> +inline SIMD vzero() +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = veorq_u8(tmp.m[i], tmp.m[i]); + return tmp; +} + +template <> +inline SIMD vzero() +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = veorq_u16(tmp.m[i], tmp.m[i]); + return tmp; +} + +template <> +inline SIMD vzero() +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = veorq_u32(tmp.m[i], tmp.m[i]); + return tmp; +} + +template <> +inline SIMD vzero() +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = 
veorq_u64(tmp.m[i], tmp.m[i]); + return tmp; +} + +template <> +inline SIMD vadd(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vaddq_f32(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vadd(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vaddq_s8(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vadd(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vaddq_s16(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vadd(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vaddq_s32(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vadd(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vaddq_s64(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vqadd(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vqaddq_s8(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vqadd(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vqaddq_s16(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vsub(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vsubq_f32(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vsub(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vsubq_s8(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vsub(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vsubq_s16(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vsub(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vsubq_s32(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vsub(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vsubq_s64(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vqsub(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vqsubq_s8(a.m[i], b.m[i]); + 
return tmp; +} + +template <> +inline SIMD vqsub(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vqsubq_s16(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vqsub(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vqsubq_u8(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vqsub(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vqsubq_u16(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vmul(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vmulq_f32(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vmul(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vmulq_s8(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vabs(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vabsq_f32(a.m[i]); + return tmp; +} + +template <> +inline SIMD vqabs(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vqabsq_s8(a.m[i]); + return tmp; +} + +template <> +inline SIMD vqabs(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vqabsq_s16(a.m[i]); + return tmp; +} + +template <> +inline SIMD vsignum(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = (float32x4_t)vbicq_u32( + veorq_u32((uint32x4_t)vdupq_n_f32(1.f), vandq_u32((uint32x4_t)vdupq_n_f32(-0.f), (uint32x4_t)a.m[i])), + vceqq_f32(a.m[i], vdupq_n_f32(0.f))); + return tmp; +} + +template <> +inline SIMD vsignum(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = (int8x16_t)vorrq_u8(vcgtq_s8(vdupq_n_s8(0), a.m[i]), + vandq_u8(vcgtq_s8(a.m[i], vdupq_n_s8(0)), (uint8x16_t)vdupq_n_s8(1))); + return tmp; +} + +template <> +inline SIMD vsign(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = (float32x4_t)vbicq_u32( + veorq_u32((uint32x4_t)a.m[i], vandq_u32((uint32x4_t)vdupq_n_f32(-0.f), (uint32x4_t)b.m[i])), + vceqq_f32(b.m[i], vdupq_n_f32(0.f))); + return tmp; +} 
+ +template <> +inline SIMD vsign(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = (int8x16_t)vorrq_u8( + vandq_u8(vcgtq_s8(vdupq_n_s8(0), b.m[i]), (uint8x16_t)vnegq_s8(a.m[i])), + vandq_u8(vcgtq_s8(b.m[i], vdupq_n_s8(0)), (uint8x16_t)a.m[i])); + return tmp; +} + +template <> +inline SIMD vcopysign(SIMD a, SIMD b) +{ + SIMD tmp; + uint32x4_t negz = (uint32x4_t)vdupq_n_f32(-0.f); + for (int i = 0; i < 4; ++i) + tmp.m[i] = (float32x4_t)vorrq_u32( + vbicq_u32((uint32x4_t)a.m[i], negz), + vandq_u32((uint32x4_t)b.m[i], negz)); + return tmp; +} + +template <> +inline SIMD vorr(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vorrq_u8(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vorr(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vorrq_u16(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vorr(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vorrq_u32(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vorr(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vorrq_u64(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vand(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vandq_u8(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vand(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vandq_u16(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vand(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vandq_u32(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vand(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vandq_u64(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD veor(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = veorq_u8(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD veor(SIMD a, SIMD b) +{ + SIMD tmp; + for 
(int i = 0; i < 4; ++i) + tmp.m[i] = veorq_u16(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD veor(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = veorq_u32(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD veor(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = veorq_u64(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vbic(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vbicq_u8(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vbic(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vbicq_u16(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vbic(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vbicq_u32(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vbic(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vbicq_u64(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vbsl(SIMD a, SIMD b, SIMD c) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vbslq_u8(a.m[i], b.m[i], c.m[i]); + return tmp; +} + +template <> +inline SIMD vbsl(SIMD a, SIMD b, SIMD c) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vbslq_u16(a.m[i], b.m[i], c.m[i]); + return tmp; +} + +template <> +inline SIMD vbsl(SIMD a, SIMD b, SIMD c) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vbslq_u32(a.m[i], b.m[i], c.m[i]); + return tmp; +} + +template <> +inline SIMD vbsl(SIMD a, SIMD b, SIMD c) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vbslq_u64(a.m[i], b.m[i], c.m[i]); + return tmp; +} + +template <> +inline SIMD vceqz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vceqq_f32(a.m[i], vdupq_n_f32(0.f)); + return tmp; +} + +template <> +inline SIMD vceqz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vceqq_s8(a.m[i], vdupq_n_s8(0)); + return tmp; +} + +template <> +inline SIMD vceqz(SIMD a) 
+{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vceqq_s16(a.m[i], vdupq_n_s16(0)); + return tmp; +} + +template <> +inline SIMD vceqz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vceqq_s32(a.m[i], vdupq_n_s32(0)); + return tmp; +} + +template <> +inline SIMD vceq(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vceqq_f32(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vceq(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vceqq_s8(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vceq(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vceqq_s16(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vceq(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vceqq_s32(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vcgtz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vcgtq_f32(a.m[i], vdupq_n_f32(0.f)); + return tmp; +} + +template <> +inline SIMD vcgtz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vcgtq_s8(a.m[i], vdupq_n_s8(0)); + return tmp; +} + +template <> +inline SIMD vcgtz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vcgtq_s16(a.m[i], vdupq_n_s16(0)); + return tmp; +} + +template <> +inline SIMD vcgtz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vcgtq_s32(a.m[i], vdupq_n_s32(0)); + return tmp; +} + +template <> +inline SIMD vcltz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vcltq_f32(a.m[i], vdupq_n_f32(0.f)); + return tmp; +} + +template <> +inline SIMD vcltz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vcltq_s8(a.m[i], vdupq_n_s8(0)); + return tmp; +} + +template <> +inline SIMD vcltz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vcltq_s16(a.m[i], vdupq_n_s16(0)); + return tmp; +} + +template <> +inline SIMD vcltz(SIMD a) +{ + SIMD tmp; + for (int i 
= 0; i < 4; ++i) + tmp.m[i] = vcltq_s32(a.m[i], vdupq_n_s32(0)); + return tmp; +} + +template <> +inline SIMD vclez(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vcleq_f32(a.m[i], vdupq_n_f32(0.f)); + return tmp; +} + +template <> +inline SIMD vclez(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vcleq_s8(a.m[i], vdupq_n_s8(0)); + return tmp; +} + +template <> +inline SIMD vclez(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vcleq_s16(a.m[i], vdupq_n_s16(0)); + return tmp; +} + +template <> +inline SIMD vclez(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vcleq_s32(a.m[i], vdupq_n_s32(0)); + return tmp; +} + +template <> +inline SIMD vmin(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vminq_f32(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vmin(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vminq_s8(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vmin(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vminq_s16(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vmin(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vminq_s32(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vmax(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vmaxq_f32(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vmax(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vmaxq_s8(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vmax(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vmaxq_s16(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vmax(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vmaxq_s32(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vclamp(SIMD x, float a, float b) +{ + SIMD tmp; + for (int i = 0; i < 4; 
++i) + tmp.m[i] = vminq_f32(vmaxq_f32(x.m[i], vdupq_n_f32(a)), vdupq_n_f32(b)); + return tmp; +} + +template <> +inline SIMD vclamp(SIMD x, int8_t a, int8_t b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vminq_s8(vmaxq_s8(x.m[i], vdupq_n_s8(a)), vdupq_n_s8(b)); + return tmp; +} + +template <> +inline SIMD vclamp(SIMD x, int16_t a, int16_t b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vminq_s16(vmaxq_s16(x.m[i], vdupq_n_s16(a)), vdupq_n_s16(b)); + return tmp; +} + +template <> +inline SIMD vclamp(SIMD x, int32_t a, int32_t b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vminq_s32(vmaxq_s32(x.m[i], vdupq_n_s32(a)), vdupq_n_s32(b)); + return tmp; +} + +#ifdef __aarch64__ +template <> +inline SIMD vshuf(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vorrq_u8(vorrq_u8(vorrq_u8(vqtbl1q_u8(a.m[0], b.m[i]), + vqtbl1q_u8(a.m[1], vsubq_u8(b.m[i], vdupq_n_u8(16)))), + vqtbl1q_u8(a.m[2], vsubq_u8(b.m[i], vdupq_n_u8(32)))), + vqtbl1q_u8(a.m[3], vsubq_u8(b.m[i], vdupq_n_u8(48)))); + return tmp; +} + +template <> +inline SIMD vshuf(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 4; ++i) + tmp.m[i] = vorrq_s8(vorrq_s8(vorrq_s8(vqtbl1q_s8(a.m[0], b.m[i]), + vqtbl1q_s8(a.m[1], vsubq_u8(b.m[i], vdupq_n_u8(16)))), + vqtbl1q_s8(a.m[2], vsubq_u8(b.m[i], vdupq_n_u8(32)))), + vqtbl1q_s8(a.m[3], vsubq_u8(b.m[i], vdupq_n_u8(48)))); + return tmp; +} +#endif + diff --git a/neon_triple.hh b/neon_triple.hh new file mode 100644 index 0000000..f22c74b --- /dev/null +++ b/neon_triple.hh @@ -0,0 +1,1079 @@ +/* +ARM NEON acceleration times three + +Copyright 2024 Ahmet Inan +*/ + +#pragma once + +#include + +template <> +union SIMD +{ + static const int SIZE = 12; + typedef float value_type; + typedef uint32_t uint_type; + float32x4_t m[3]; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template <> +union SIMD +{ + static const int SIZE = 48; + typedef int8_t value_type; + typedef uint8_t uint_type; + int8x16_t m[3]; + 
value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template <> +union SIMD +{ + static const int SIZE = 24; + typedef int16_t value_type; + typedef uint16_t uint_type; + int16x8_t m[3]; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template <> +union SIMD +{ + static const int SIZE = 12; + typedef int32_t value_type; + typedef uint32_t uint_type; + int32x4_t m[3]; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template <> +union SIMD +{ + static const int SIZE = 6; + typedef int64_t value_type; + typedef uint64_t uint_type; + int64x2_t m[3]; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template <> +union SIMD +{ + static const int SIZE = 48; + typedef uint8_t value_type; + typedef uint8_t uint_type; + uint8x16_t m[3]; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template <> +union SIMD +{ + static const int SIZE = 24; + typedef uint16_t value_type; + typedef uint16_t uint_type; + uint16x8_t m[3]; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template <> +union SIMD +{ + static const int SIZE = 12; + typedef uint32_t value_type; + typedef uint32_t uint_type; + uint32x4_t m[3]; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template <> +union SIMD +{ + static const int SIZE = 6; + typedef uint64_t value_type; + typedef uint64_t uint_type; + uint64x2_t m[3]; + value_type v[SIZE]; + uint_type u[SIZE]; +}; + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = (float32x4_t)a.m[i]; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = (uint32x4_t)a.m[i]; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = (int8x16_t)a.m[i]; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = (uint8x16_t)a.m[i]; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + for (int i = 
0; i < 3; ++i) + tmp.m[i] = (int16x8_t)a.m[i]; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = (uint16x8_t)a.m[i]; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = (int32x4_t)a.m[i]; + return tmp; +} + +template <> +inline SIMD vreinterpret(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = (uint32x4_t)a.m[i]; + return tmp; +} + +template <> +inline SIMD vdup(float a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vdupq_n_f32(a); + return tmp; +} + +template <> +inline SIMD vdup(int8_t a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vdupq_n_s8(a); + return tmp; +} + +template <> +inline SIMD vdup(int16_t a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vdupq_n_s16(a); + return tmp; +} + +template <> +inline SIMD vdup(int32_t a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vdupq_n_s32(a); + return tmp; +} + +template <> +inline SIMD vdup(int64_t a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vdupq_n_s64(a); + return tmp; +} + +template <> +inline SIMD vdup(uint8_t a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vdupq_n_u8(a); + return tmp; +} + +template <> +inline SIMD vdup(uint16_t a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vdupq_n_u16(a); + return tmp; +} + +template <> +inline SIMD vdup(uint32_t a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vdupq_n_u32(a); + return tmp; +} + +template <> +inline SIMD vdup(uint64_t a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vdupq_n_u64(a); + return tmp; +} + +template <> +inline SIMD vzero() +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = (float32x4_t)veorq_u32((uint32x4_t)tmp.m[i], (uint32x4_t)tmp.m[i]); + return tmp; +} + +template <> +inline SIMD vzero() +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = veorq_s8(tmp.m[i], 
tmp.m[i]); + return tmp; +} + +template <> +inline SIMD vzero() +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = veorq_s16(tmp.m[i], tmp.m[i]); + return tmp; +} + +template <> +inline SIMD vzero() +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = veorq_s32(tmp.m[i], tmp.m[i]); + return tmp; +} + +template <> +inline SIMD vzero() +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = veorq_s64(tmp.m[i], tmp.m[i]); + return tmp; +} + +template <> +inline SIMD vzero() +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = veorq_u8(tmp.m[i], tmp.m[i]); + return tmp; +} + +template <> +inline SIMD vzero() +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = veorq_u16(tmp.m[i], tmp.m[i]); + return tmp; +} + +template <> +inline SIMD vzero() +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = veorq_u32(tmp.m[i], tmp.m[i]); + return tmp; +} + +template <> +inline SIMD vzero() +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = veorq_u64(tmp.m[i], tmp.m[i]); + return tmp; +} + +template <> +inline SIMD vadd(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vaddq_f32(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vadd(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vaddq_s8(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vadd(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vaddq_s16(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vadd(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vaddq_s32(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vadd(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vaddq_s64(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vqadd(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vqaddq_s8(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vqadd(SIMD a, SIMD b) +{ + SIMD tmp; + for 
(int i = 0; i < 3; ++i) + tmp.m[i] = vqaddq_s16(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vsub(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vsubq_f32(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vsub(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vsubq_s8(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vsub(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vsubq_s16(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vsub(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vsubq_s32(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vsub(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vsubq_s64(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vqsub(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vqsubq_s8(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vqsub(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vqsubq_s16(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vqsub(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vqsubq_u8(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vqsub(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vqsubq_u16(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vmul(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vmulq_f32(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vmul(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vmulq_s8(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vabs(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vabsq_f32(a.m[i]); + return tmp; +} + +template <> +inline SIMD vqabs(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = 
vqabsq_s8(a.m[i]); + return tmp; +} + +template <> +inline SIMD vqabs(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vqabsq_s16(a.m[i]); + return tmp; +} + +template <> +inline SIMD vsignum(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = (float32x4_t)vbicq_u32( + veorq_u32((uint32x4_t)vdupq_n_f32(1.f), vandq_u32((uint32x4_t)vdupq_n_f32(-0.f), (uint32x4_t)a.m[i])), + vceqq_f32(a.m[i], vdupq_n_f32(0.f))); + return tmp; +} + +template <> +inline SIMD vsignum(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = (int8x16_t)vorrq_u8(vcgtq_s8(vdupq_n_s8(0), a.m[i]), + vandq_u8(vcgtq_s8(a.m[i], vdupq_n_s8(0)), (uint8x16_t)vdupq_n_s8(1))); + return tmp; +} + +template <> +inline SIMD vsign(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = (float32x4_t)vbicq_u32( + veorq_u32((uint32x4_t)a.m[i], vandq_u32((uint32x4_t)vdupq_n_f32(-0.f), (uint32x4_t)b.m[i])), + vceqq_f32(b.m[i], vdupq_n_f32(0.f))); + return tmp; +} + +template <> +inline SIMD vsign(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = (int8x16_t)vorrq_u8( + vandq_u8(vcgtq_s8(vdupq_n_s8(0), b.m[i]), (uint8x16_t)vnegq_s8(a.m[i])), + vandq_u8(vcgtq_s8(b.m[i], vdupq_n_s8(0)), (uint8x16_t)a.m[i])); + return tmp; +} + +template <> +inline SIMD vcopysign(SIMD a, SIMD b) +{ + SIMD tmp; + uint32x4_t negz = (uint32x4_t)vdupq_n_f32(-0.f); + for (int i = 0; i < 3; ++i) + tmp.m[i] = (float32x4_t)vorrq_u32( + vbicq_u32((uint32x4_t)a.m[i], negz), + vandq_u32((uint32x4_t)b.m[i], negz)); + return tmp; +} + +template <> +inline SIMD vorr(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vorrq_u8(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vorr(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vorrq_u16(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vorr(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vorrq_u32(a.m[i], b.m[i]); + 
return tmp; +} + +template <> +inline SIMD vorr(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vorrq_u64(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vand(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vandq_u8(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vand(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vandq_u16(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vand(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vandq_u32(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vand(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vandq_u64(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD veor(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = veorq_u8(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD veor(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = veorq_u16(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD veor(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = veorq_u32(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD veor(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = veorq_u64(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vbic(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vbicq_u8(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vbic(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vbicq_u16(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vbic(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vbicq_u32(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vbic(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vbicq_u64(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline 
SIMD vbsl(SIMD a, SIMD b, SIMD c) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vbslq_u8(a.m[i], b.m[i], c.m[i]); + return tmp; +} + +template <> +inline SIMD vbsl(SIMD a, SIMD b, SIMD c) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vbslq_u16(a.m[i], b.m[i], c.m[i]); + return tmp; +} + +template <> +inline SIMD vbsl(SIMD a, SIMD b, SIMD c) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vbslq_u32(a.m[i], b.m[i], c.m[i]); + return tmp; +} + +template <> +inline SIMD vbsl(SIMD a, SIMD b, SIMD c) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vbslq_u64(a.m[i], b.m[i], c.m[i]); + return tmp; +} + +template <> +inline SIMD vceqz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vceqq_f32(a.m[i], vdupq_n_f32(0.f)); + return tmp; +} + +template <> +inline SIMD vceqz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vceqq_s8(a.m[i], vdupq_n_s8(0)); + return tmp; +} + +template <> +inline SIMD vceqz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vceqq_s16(a.m[i], vdupq_n_s16(0)); + return tmp; +} + +template <> +inline SIMD vceqz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vceqq_s32(a.m[i], vdupq_n_s32(0)); + return tmp; +} + +template <> +inline SIMD vceq(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vceqq_f32(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vceq(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vceqq_s8(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vceq(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vceqq_s16(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vceq(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vceqq_s32(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vcgtz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vcgtq_f32(a.m[i], vdupq_n_f32(0.f)); + 
return tmp; +} + +template <> +inline SIMD vcgtz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vcgtq_s8(a.m[i], vdupq_n_s8(0)); + return tmp; +} + +template <> +inline SIMD vcgtz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vcgtq_s16(a.m[i], vdupq_n_s16(0)); + return tmp; +} + +template <> +inline SIMD vcgtz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vcgtq_s32(a.m[i], vdupq_n_s32(0)); + return tmp; +} + +template <> +inline SIMD vcltz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vcltq_f32(a.m[i], vdupq_n_f32(0.f)); + return tmp; +} + +template <> +inline SIMD vcltz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vcltq_s8(a.m[i], vdupq_n_s8(0)); + return tmp; +} + +template <> +inline SIMD vcltz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vcltq_s16(a.m[i], vdupq_n_s16(0)); + return tmp; +} + +template <> +inline SIMD vcltz(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vcltq_s32(a.m[i], vdupq_n_s32(0)); + return tmp; +} + +template <> +inline SIMD vclez(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vcleq_f32(a.m[i], vdupq_n_f32(0.f)); + return tmp; +} + +template <> +inline SIMD vclez(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vcleq_s8(a.m[i], vdupq_n_s8(0)); + return tmp; +} + +template <> +inline SIMD vclez(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vcleq_s16(a.m[i], vdupq_n_s16(0)); + return tmp; +} + +template <> +inline SIMD vclez(SIMD a) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vcleq_s32(a.m[i], vdupq_n_s32(0)); + return tmp; +} + +template <> +inline SIMD vmin(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vminq_f32(a.m[i], b.m[i]); + return tmp; +} + +template <> +inline SIMD vmin(SIMD a, SIMD b) +{ + SIMD tmp; + for (int i = 0; i < 3; ++i) + tmp.m[i] = vminq_s8(a.m[i], b.m[i]); + return tmp; +} + +template 
<>
inline SIMD<int16_t, 24> vmin(SIMD<int16_t, 24> a, SIMD<int16_t, 24> b)
{
	SIMD<int16_t, 24> r;
	for (int k = 0; k < 3; ++k)
		r.m[k] = vminq_s16(a.m[k], b.m[k]);
	return r;
}

template <>
inline SIMD<int32_t, 12> vmin(SIMD<int32_t, 12> a, SIMD<int32_t, 12> b)
{
	SIMD<int32_t, 12> r;
	for (int k = 0; k < 3; ++k)
		r.m[k] = vminq_s32(a.m[k], b.m[k]);
	return r;
}

// Lane-wise maximum.
template <>
inline SIMD<float, 12> vmax(SIMD<float, 12> a, SIMD<float, 12> b)
{
	SIMD<float, 12> r;
	for (int k = 0; k < 3; ++k)
		r.m[k] = vmaxq_f32(a.m[k], b.m[k]);
	return r;
}

template <>
inline SIMD<int8_t, 48> vmax(SIMD<int8_t, 48> a, SIMD<int8_t, 48> b)
{
	SIMD<int8_t, 48> r;
	for (int k = 0; k < 3; ++k)
		r.m[k] = vmaxq_s8(a.m[k], b.m[k]);
	return r;
}

template <>
inline SIMD<int16_t, 24> vmax(SIMD<int16_t, 24> a, SIMD<int16_t, 24> b)
{
	SIMD<int16_t, 24> r;
	for (int k = 0; k < 3; ++k)
		r.m[k] = vmaxq_s16(a.m[k], b.m[k]);
	return r;
}

template <>
inline SIMD<int32_t, 12> vmax(SIMD<int32_t, 12> a, SIMD<int32_t, 12> b)
{
	SIMD<int32_t, 12> r;
	for (int k = 0; k < 3; ++k)
		r.m[k] = vmaxq_s32(a.m[k], b.m[k]);
	return r;
}

// Clamp every lane of x into [a, b] via max then min.
template <>
inline SIMD<float, 12> vclamp(SIMD<float, 12> x, float a, float b)
{
	SIMD<float, 12> r;
	for (int k = 0; k < 3; ++k)
		r.m[k] = vminq_f32(vmaxq_f32(x.m[k], vdupq_n_f32(a)), vdupq_n_f32(b));
	return r;
}

template <>
inline SIMD<int8_t, 48> vclamp(SIMD<int8_t, 48> x, int8_t a, int8_t b)
{
	SIMD<int8_t, 48> r;
	for (int k = 0; k < 3; ++k)
		r.m[k] = vminq_s8(vmaxq_s8(x.m[k], vdupq_n_s8(a)), vdupq_n_s8(b));
	return r;
}

template <>
inline SIMD<int16_t, 24> vclamp(SIMD<int16_t, 24> x, int16_t a, int16_t b)
{
	SIMD<int16_t, 24> r;
	for (int k = 0; k < 3; ++k)
		r.m[k] = vminq_s16(vmaxq_s16(x.m[k], vdupq_n_s16(a)), vdupq_n_s16(b));
	return r;
}

template <>
inline SIMD<int32_t, 12> vclamp(SIMD<int32_t, 12> x, int32_t a, int32_t b)
{
	SIMD<int32_t, 12> r;
	for (int k = 0; k < 3; ++k)
		r.m[k] = vminq_s32(vmaxq_s32(x.m[k], vdupq_n_s32(a)), vdupq_n_s32(b));
	return r;
}

#ifdef __aarch64__
// 48-entry table lookup: three VQTBL1 lookups, one per source register.
// VQTBL1 yields zero for out-of-range indices, so shifting the index down
// by 16 and 32 and OR-ing the three partial results selects exactly one
// source byte per lane.
template <>
inline SIMD<uint8_t, 48> vshuf(SIMD<uint8_t, 48> a, SIMD<uint8_t, 48> b)
{
	SIMD<uint8_t, 48> r;
	for (int k = 0; k < 3; ++k)
		r.m[k] = vorrq_u8(vorrq_u8(vqtbl1q_u8(a.m[0], b.m[k]),
			vqtbl1q_u8(a.m[1], vsubq_u8(b.m[k], vdupq_n_u8(16)))),
			vqtbl1q_u8(a.m[2], vsubq_u8(b.m[k], vdupq_n_u8(32))));
	return r;
}


// Signed variant of the 48-entry table lookup; indices stay unsigned.
// Out-of-range indices produce zero bytes, so the ORs merge the three
// 16-entry partial lookups without interference.
template <>
inline SIMD<int8_t, 48> vshuf(SIMD<int8_t, 48> a, SIMD<uint8_t, 48> b)
{
	SIMD<int8_t, 48> r;
	for (int k = 0; k < 3; ++k)
		r.m[k] = vorrq_s8(vorrq_s8(vqtbl1q_s8(a.m[0], b.m[k]),
			vqtbl1q_s8(a.m[1], vsubq_u8(b.m[k], vdupq_n_u8(16)))),
			vqtbl1q_s8(a.m[2], vsubq_u8(b.m[k], vdupq_n_u8(32))));
	return r;
}
#endif

diff --git a/simd.hh b/simd.hh
index 45b3d99..e269d5e 100644
--- a/simd.hh
+++ b/simd.hh
@@ -1507,6 +1507,8 @@ static inline SIMD vshuf(SIMD a, SIMD