/* ARM NEON acceleration times two Copyright 2024 Ahmet Inan */

#pragma once

#include <arm_neon.h>

// "Times two" layout: each SIMD<T, N> specialization packs TWO quad-word
// (128-bit) NEON registers in m[0]/m[1], doubling the lane count of the
// underlying vector type. The union members v[] and u[] alias m[] for
// per-lane scalar access and for bit-pattern access respectively.
// NOTE(review): template argument lists below are reconstructed from the
// intrinsic types (e.g. vaddq_f32 -> SIMD<float, 8>); they must match the
// primary template declared in the project's generic simd header.

template <>
union SIMD<float, 8>
{
	static const int SIZE = 8;
	typedef float value_type;
	typedef uint32_t uint_type;
	float32x4_t m[2];
	value_type v[SIZE];
	uint_type u[SIZE];
};

template <>
union SIMD<int8_t, 32>
{
	static const int SIZE = 32;
	typedef int8_t value_type;
	typedef uint8_t uint_type;
	int8x16_t m[2];
	value_type v[SIZE];
	uint_type u[SIZE];
};

template <>
union SIMD<int16_t, 16>
{
	static const int SIZE = 16;
	typedef int16_t value_type;
	typedef uint16_t uint_type;
	int16x8_t m[2];
	value_type v[SIZE];
	uint_type u[SIZE];
};

template <>
union SIMD<int32_t, 8>
{
	static const int SIZE = 8;
	typedef int32_t value_type;
	typedef uint32_t uint_type;
	int32x4_t m[2];
	value_type v[SIZE];
	uint_type u[SIZE];
};

template <>
union SIMD<int64_t, 4>
{
	static const int SIZE = 4;
	typedef int64_t value_type;
	typedef uint64_t uint_type;
	int64x2_t m[2];
	value_type v[SIZE];
	uint_type u[SIZE];
};

template <>
union SIMD<uint8_t, 32>
{
	static const int SIZE = 32;
	typedef uint8_t value_type;
	typedef uint8_t uint_type;
	uint8x16_t m[2];
	value_type v[SIZE];
	uint_type u[SIZE];
};

template <>
union SIMD<uint16_t, 16>
{
	static const int SIZE = 16;
	typedef uint16_t value_type;
	typedef uint16_t uint_type;
	uint16x8_t m[2];
	value_type v[SIZE];
	uint_type u[SIZE];
};

template <>
union SIMD<uint32_t, 8>
{
	static const int SIZE = 8;
	typedef uint32_t value_type;
	typedef uint32_t uint_type;
	uint32x4_t m[2];
	value_type v[SIZE];
	uint_type u[SIZE];
};

template <>
union SIMD<uint64_t, 4>
{
	static const int SIZE = 4;
	typedef uint64_t value_type;
	typedef uint64_t uint_type;
	uint64x2_t m[2];
	value_type v[SIZE];
	uint_type u[SIZE];
};

// Bit-pattern reinterpretation between same-width element types.

template <>
inline SIMD<float, 8> vreinterpret(SIMD<uint32_t, 8> a)
{
	SIMD<float, 8> tmp;
	tmp.m[0] = (float32x4_t)a.m[0];
	tmp.m[1] = (float32x4_t)a.m[1];
	return tmp;
}

template <>
inline SIMD<uint32_t, 8> vreinterpret(SIMD<float, 8> a)
{
	SIMD<uint32_t, 8> tmp;
	tmp.m[0] = (uint32x4_t)a.m[0];
	tmp.m[1] = (uint32x4_t)a.m[1];
	return tmp;
}

template <>
inline SIMD<int8_t, 32> vreinterpret(SIMD<uint8_t, 32> a)
{
	SIMD<int8_t, 32> tmp;
	tmp.m[0] = (int8x16_t)a.m[0];
	tmp.m[1] = (int8x16_t)a.m[1];
	return tmp;
}

template <>
inline SIMD<uint8_t, 32> vreinterpret(SIMD<int8_t, 32> a)
{
	SIMD<uint8_t, 32> tmp;
	tmp.m[0] = (uint8x16_t)a.m[0];
	tmp.m[1] = (uint8x16_t)a.m[1];
	return tmp;
}

template <>
inline SIMD<int16_t, 16> vreinterpret(SIMD<uint16_t, 16> a)
{
	SIMD<int16_t, 16> tmp;
	tmp.m[0] = (int16x8_t)a.m[0];
	tmp.m[1] = (int16x8_t)a.m[1];
	return tmp;
}

template <>
inline SIMD<uint16_t, 16> vreinterpret(SIMD<int16_t, 16> a)
{
	SIMD<uint16_t, 16> tmp;
	tmp.m[0] = (uint16x8_t)a.m[0];
	tmp.m[1] = (uint16x8_t)a.m[1];
	return tmp;
}

template <>
inline SIMD<int32_t, 8> vreinterpret(SIMD<uint32_t, 8> a)
{
	SIMD<int32_t, 8> tmp;
	tmp.m[0] = (int32x4_t)a.m[0];
	tmp.m[1] = (int32x4_t)a.m[1];
	return tmp;
}

template <>
inline SIMD<uint32_t, 8> vreinterpret(SIMD<int32_t, 8> a)
{
	SIMD<uint32_t, 8> tmp;
	tmp.m[0] = (uint32x4_t)a.m[0];
	tmp.m[1] = (uint32x4_t)a.m[1];
	return tmp;
}

// Broadcast a scalar into every lane.

template <>
inline SIMD<float, 8> vdup(float a)
{
	SIMD<float, 8> tmp;
	tmp.m[0] = vdupq_n_f32(a);
	tmp.m[1] = vdupq_n_f32(a);
	return tmp;
}

template <>
inline SIMD<int8_t, 32> vdup(int8_t a)
{
	SIMD<int8_t, 32> tmp;
	tmp.m[0] = vdupq_n_s8(a);
	tmp.m[1] = vdupq_n_s8(a);
	return tmp;
}

template <>
inline SIMD<int16_t, 16> vdup(int16_t a)
{
	SIMD<int16_t, 16> tmp;
	tmp.m[0] = vdupq_n_s16(a);
	tmp.m[1] = vdupq_n_s16(a);
	return tmp;
}

template <>
inline SIMD<int32_t, 8> vdup(int32_t a)
{
	SIMD<int32_t, 8> tmp;
	tmp.m[0] = vdupq_n_s32(a);
	tmp.m[1] = vdupq_n_s32(a);
	return tmp;
}

template <>
inline SIMD<int64_t, 4> vdup(int64_t a)
{
	SIMD<int64_t, 4> tmp;
	tmp.m[0] = vdupq_n_s64(a);
	tmp.m[1] = vdupq_n_s64(a);
	return tmp;
}

template <>
inline SIMD<uint8_t, 32> vdup(uint8_t a)
{
	SIMD<uint8_t, 32> tmp;
	tmp.m[0] = vdupq_n_u8(a);
	tmp.m[1] = vdupq_n_u8(a);
	return tmp;
}

template <>
inline SIMD<uint16_t, 16> vdup(uint16_t a)
{
	SIMD<uint16_t, 16> tmp;
	tmp.m[0] = vdupq_n_u16(a);
	tmp.m[1] = vdupq_n_u16(a);
	return tmp;
}

template <>
inline SIMD<uint32_t, 8> vdup(uint32_t a)
{
	SIMD<uint32_t, 8> tmp;
	tmp.m[0] = vdupq_n_u32(a);
	tmp.m[1] = vdupq_n_u32(a);
	return tmp;
}

template <>
inline SIMD<uint64_t, 4> vdup(uint64_t a)
{
	SIMD<uint64_t, 4> tmp;
	tmp.m[0] = vdupq_n_u64(a);
	tmp.m[1] = vdupq_n_u64(a);
	return tmp;
}

// Zero a vector via xor-with-self (the operand value is irrelevant:
// x ^ x == 0 for any bit pattern).

template <>
inline SIMD<float, 8> vzero()
{
	SIMD<float, 8> tmp;
	tmp.m[0] = (float32x4_t)veorq_u32((uint32x4_t)tmp.m[0], (uint32x4_t)tmp.m[0]);
	tmp.m[1] = (float32x4_t)veorq_u32((uint32x4_t)tmp.m[1], (uint32x4_t)tmp.m[1]);
	return tmp;
}

template <>
inline SIMD<int8_t, 32> vzero()
{
	SIMD<int8_t, 32> tmp;
	tmp.m[0] = veorq_s8(tmp.m[0], tmp.m[0]);
	tmp.m[1] = veorq_s8(tmp.m[1], tmp.m[1]);
	return tmp;
}

template <>
inline SIMD<int16_t, 16> vzero()
{
	SIMD<int16_t, 16> tmp;
	tmp.m[0] = veorq_s16(tmp.m[0], tmp.m[0]);
	tmp.m[1] = veorq_s16(tmp.m[1], tmp.m[1]);
	return tmp;
}

template <>
inline SIMD<int32_t, 8> vzero()
{
	SIMD<int32_t, 8> tmp;
	tmp.m[0] = veorq_s32(tmp.m[0], tmp.m[0]);
	tmp.m[1] = veorq_s32(tmp.m[1], tmp.m[1]);
	return tmp;
}

template <>
inline SIMD<int64_t, 4> vzero()
{
	SIMD<int64_t, 4> tmp;
	tmp.m[0] = veorq_s64(tmp.m[0], tmp.m[0]);
	tmp.m[1] = veorq_s64(tmp.m[1], tmp.m[1]);
	return tmp;
}

template <>
inline SIMD<uint8_t, 32> vzero()
{
	SIMD<uint8_t, 32> tmp;
	tmp.m[0] = veorq_u8(tmp.m[0], tmp.m[0]);
	tmp.m[1] = veorq_u8(tmp.m[1], tmp.m[1]);
	return tmp;
}

template <>
inline SIMD<uint16_t, 16> vzero()
{
	SIMD<uint16_t, 16> tmp;
	tmp.m[0] = veorq_u16(tmp.m[0], tmp.m[0]);
	tmp.m[1] = veorq_u16(tmp.m[1], tmp.m[1]);
	return tmp;
}

template <>
inline SIMD<uint32_t, 8> vzero()
{
	SIMD<uint32_t, 8> tmp;
	tmp.m[0] = veorq_u32(tmp.m[0], tmp.m[0]);
	tmp.m[1] = veorq_u32(tmp.m[1], tmp.m[1]);
	return tmp;
}

template <>
inline SIMD<uint64_t, 4> vzero()
{
	SIMD<uint64_t, 4> tmp;
	tmp.m[0] = veorq_u64(tmp.m[0], tmp.m[0]);
	tmp.m[1] = veorq_u64(tmp.m[1], tmp.m[1]);
	return tmp;
}

// Lane-wise addition (wrapping for integers).

template <>
inline SIMD<float, 8> vadd(SIMD<float, 8> a, SIMD<float, 8> b)
{
	SIMD<float, 8> tmp;
	tmp.m[0] = vaddq_f32(a.m[0], b.m[0]);
	tmp.m[1] = vaddq_f32(a.m[1], b.m[1]);
	return tmp;
}

template <>
inline SIMD<int8_t, 32> vadd(SIMD<int8_t, 32> a, SIMD<int8_t, 32> b)
{
	SIMD<int8_t, 32> tmp;
	tmp.m[0] = vaddq_s8(a.m[0], b.m[0]);
	tmp.m[1] = vaddq_s8(a.m[1], b.m[1]);
	return tmp;
}

template <>
inline SIMD<int16_t, 16> vadd(SIMD<int16_t, 16> a, SIMD<int16_t, 16> b)
{
	SIMD<int16_t, 16> tmp;
	tmp.m[0] = vaddq_s16(a.m[0], b.m[0]);
	tmp.m[1] = vaddq_s16(a.m[1], b.m[1]);
	return tmp;
}

template <>
inline SIMD<int32_t, 8> vadd(SIMD<int32_t, 8> a, SIMD<int32_t, 8> b)
{
	SIMD<int32_t, 8> tmp;
	tmp.m[0] = vaddq_s32(a.m[0], b.m[0]);
	tmp.m[1] = vaddq_s32(a.m[1], b.m[1]);
	return tmp;
}

template <>
inline SIMD<int64_t, 4> vadd(SIMD<int64_t, 4> a, SIMD<int64_t, 4> b)
{
	SIMD<int64_t, 4> tmp;
	// BUGFIX: the original only computed m[1], leaving the low half
	// of the result uninitialized.
	tmp.m[0] = vaddq_s64(a.m[0], b.m[0]);
	tmp.m[1] = vaddq_s64(a.m[1], b.m[1]);
	return tmp;
}

// Lane-wise saturating addition.

template <>
inline SIMD<int8_t, 32> vqadd(SIMD<int8_t, 32> a, SIMD<int8_t, 32> b)
{
	SIMD<int8_t, 32> tmp;
	tmp.m[0] = vqaddq_s8(a.m[0], b.m[0]);
	tmp.m[1] = vqaddq_s8(a.m[1], b.m[1]);
	return tmp;
}

template <>
inline SIMD<int16_t, 16> vqadd(SIMD<int16_t, 16> a, SIMD<int16_t, 16> b)
{
	SIMD<int16_t, 16> tmp;
	tmp.m[0] = vqaddq_s16(a.m[0], b.m[0]);
	tmp.m[1] = vqaddq_s16(a.m[1], b.m[1]);
	return tmp;
}

// Lane-wise subtraction (wrapping for integers).

template <>
inline SIMD<float, 8> vsub(SIMD<float, 8> a, SIMD<float, 8> b)
{
	SIMD<float, 8> tmp;
	tmp.m[0] = vsubq_f32(a.m[0], b.m[0]);
	tmp.m[1] = vsubq_f32(a.m[1], b.m[1]);
	return tmp;
}

template <>
inline SIMD<int8_t, 32> vsub(SIMD<int8_t, 32> a, SIMD<int8_t, 32> b)
{
	SIMD<int8_t, 32> tmp;
	tmp.m[0] = vsubq_s8(a.m[0], b.m[0]);
	tmp.m[1] = vsubq_s8(a.m[1], b.m[1]);
	return tmp;
}

template <>
inline SIMD<int16_t, 16> vsub(SIMD<int16_t, 16> a, SIMD<int16_t, 16> b)
{
	SIMD<int16_t, 16> tmp;
	tmp.m[0] = vsubq_s16(a.m[0], b.m[0]);
	tmp.m[1] = vsubq_s16(a.m[1], b.m[1]);
	return tmp;
}

template <>
inline SIMD<int32_t, 8> vsub(SIMD<int32_t, 8> a, SIMD<int32_t, 8> b)
{
	SIMD<int32_t, 8> tmp;
	tmp.m[0] = vsubq_s32(a.m[0], b.m[0]);
	tmp.m[1] = vsubq_s32(a.m[1], b.m[1]);
	return tmp;
}

template <>
inline SIMD<int64_t, 4> vsub(SIMD<int64_t, 4> a, SIMD<int64_t, 4> b)
{
	SIMD<int64_t, 4> tmp;
	tmp.m[0] = vsubq_s64(a.m[0], b.m[0]);
	tmp.m[1] = vsubq_s64(a.m[1], b.m[1]);
	return tmp;
}

// Lane-wise saturating subtraction.

template <>
inline SIMD<int8_t, 32> vqsub(SIMD<int8_t, 32> a, SIMD<int8_t, 32> b)
{
	SIMD<int8_t, 32> tmp;
	tmp.m[0] = vqsubq_s8(a.m[0], b.m[0]);
	tmp.m[1] = vqsubq_s8(a.m[1], b.m[1]);
	return tmp;
}

template <>
inline SIMD<int16_t, 16> vqsub(SIMD<int16_t, 16> a, SIMD<int16_t, 16> b)
{
	SIMD<int16_t, 16> tmp;
	tmp.m[0] = vqsubq_s16(a.m[0], b.m[0]);
	tmp.m[1] = vqsubq_s16(a.m[1], b.m[1]);
	return tmp;
}

template <>
inline SIMD<uint8_t, 32> vqsub(SIMD<uint8_t, 32> a, SIMD<uint8_t, 32> b)
{
	SIMD<uint8_t, 32> tmp;
	tmp.m[0] = vqsubq_u8(a.m[0], b.m[0]);
	tmp.m[1] = vqsubq_u8(a.m[1], b.m[1]);
	return tmp;
}

template <>
inline SIMD<uint16_t, 16> vqsub(SIMD<uint16_t, 16> a, SIMD<uint16_t, 16> b)
{
	SIMD<uint16_t, 16> tmp;
	tmp.m[0] = vqsubq_u16(a.m[0], b.m[0]);
	tmp.m[1] = vqsubq_u16(a.m[1], b.m[1]);
	return tmp;
}

// Lane-wise multiplication.

template <>
inline SIMD<float, 8> vmul(SIMD<float, 8> a, SIMD<float, 8> b)
{
	SIMD<float, 8> tmp;
	tmp.m[0] = vmulq_f32(a.m[0], b.m[0]);
	tmp.m[1] = vmulq_f32(a.m[1], b.m[1]);
	return tmp;
}

template <>
inline SIMD<int8_t, 32> vmul(SIMD<int8_t, 32> a, SIMD<int8_t, 32> b)
{
	SIMD<int8_t, 32> tmp;
	tmp.m[0] = vmulq_s8(a.m[0], b.m[0]);
	tmp.m[1] = vmulq_s8(a.m[1], b.m[1]);
	return tmp;
}

// Absolute value; vqabs saturates (so INT8_MIN maps to INT8_MAX).

template <>
inline SIMD<float, 8> vabs(SIMD<float, 8> a)
{
	SIMD<float, 8> tmp;
	tmp.m[0] = vabsq_f32(a.m[0]);
	tmp.m[1] = vabsq_f32(a.m[1]);
	return tmp;
}

template <>
inline SIMD<int8_t, 32> vqabs(SIMD<int8_t, 32> a)
{
	SIMD<int8_t, 32> tmp;
	tmp.m[0] = vqabsq_s8(a.m[0]);
	tmp.m[1] = vqabsq_s8(a.m[1]);
	return tmp;
}

template <>
inline SIMD<int16_t, 16> vqabs(SIMD<int16_t, 16> a)
{
	SIMD<int16_t, 16> tmp;
	tmp.m[0] = vqabsq_s16(a.m[0]);
	tmp.m[1] = vqabsq_s16(a.m[1]);
	return tmp;
}

// vsignum(a): -1, 0 or +1 per lane, matching the sign of a.
// Float version: XOR a's sign bit onto 1.f, then clear lanes where a == 0.

template <>
inline SIMD<float, 8> vsignum(SIMD<float, 8> a)
{
	SIMD<float, 8> tmp;
	tmp.m[0] = (float32x4_t)vbicq_u32(
		veorq_u32((uint32x4_t)vdupq_n_f32(1.f),
			vandq_u32((uint32x4_t)vdupq_n_f32(-0.f), (uint32x4_t)a.m[0])),
		vceqq_f32(a.m[0], vdupq_n_f32(0.f)));
	tmp.m[1] = (float32x4_t)vbicq_u32(
		veorq_u32((uint32x4_t)vdupq_n_f32(1.f),
			vandq_u32((uint32x4_t)vdupq_n_f32(-0.f), (uint32x4_t)a.m[1])),
		vceqq_f32(a.m[1], vdupq_n_f32(0.f)));
	return tmp;
}

template <>
inline SIMD<int8_t, 32> vsignum(SIMD<int8_t, 32> a)
{
	SIMD<int8_t, 32> tmp;
	// (0 > a) gives all-ones (-1) where negative; (a > 0) & 1 gives +1 where positive.
	tmp.m[0] = (int8x16_t)vorrq_u8(
		vcgtq_s8(vdupq_n_s8(0), a.m[0]),
		vandq_u8(vcgtq_s8(a.m[0], vdupq_n_s8(0)), (uint8x16_t)vdupq_n_s8(1)));
	tmp.m[1] = (int8x16_t)vorrq_u8(
		vcgtq_s8(vdupq_n_s8(0), a.m[1]),
		vandq_u8(vcgtq_s8(a.m[1], vdupq_n_s8(0)), (uint8x16_t)vdupq_n_s8(1)));
	return tmp;
}

// vsign(a, b): a * signum(b) per lane (zero where b == 0).

template <>
inline SIMD<float, 8> vsign(SIMD<float, 8> a, SIMD<float, 8> b)
{
	SIMD<float, 8> tmp;
	tmp.m[0] = (float32x4_t)vbicq_u32(
		veorq_u32((uint32x4_t)a.m[0],
			vandq_u32((uint32x4_t)vdupq_n_f32(-0.f), (uint32x4_t)b.m[0])),
		vceqq_f32(b.m[0], vdupq_n_f32(0.f)));
	tmp.m[1] = (float32x4_t)vbicq_u32(
		veorq_u32((uint32x4_t)a.m[1],
			vandq_u32((uint32x4_t)vdupq_n_f32(-0.f), (uint32x4_t)b.m[1])),
		vceqq_f32(b.m[1], vdupq_n_f32(0.f)));
	return tmp;
}

template <>
inline SIMD<int8_t, 32> vsign(SIMD<int8_t, 32> a, SIMD<int8_t, 32> b)
{
	SIMD<int8_t, 32> tmp;
	tmp.m[0] = (int8x16_t)vorrq_u8(
		vandq_u8(vcgtq_s8(vdupq_n_s8(0), b.m[0]), (uint8x16_t)vnegq_s8(a.m[0])),
		vandq_u8(vcgtq_s8(b.m[0], vdupq_n_s8(0)), (uint8x16_t)a.m[0]));
	tmp.m[1] = (int8x16_t)vorrq_u8(
		vandq_u8(vcgtq_s8(vdupq_n_s8(0), b.m[1]), (uint8x16_t)vnegq_s8(a.m[1])),
		vandq_u8(vcgtq_s8(b.m[1], vdupq_n_s8(0)), (uint8x16_t)a.m[1]));
	return tmp;
}

// vcopysign(a, b): magnitude of a with the sign bit of b.

template <>
inline SIMD<float, 8> vcopysign(SIMD<float, 8> a, SIMD<float, 8> b)
{
	SIMD<float, 8> tmp;
	uint32x4_t negz = (uint32x4_t)vdupq_n_f32(-0.f);
	tmp.m[0] = (float32x4_t)vorrq_u32(
		vbicq_u32((uint32x4_t)a.m[0], negz),
		vandq_u32((uint32x4_t)b.m[0], negz));
	tmp.m[1] = (float32x4_t)vorrq_u32(
		vbicq_u32((uint32x4_t)a.m[1], negz),
		vandq_u32((uint32x4_t)b.m[1], negz));
	return tmp;
}

// Bitwise OR.

template <>
inline SIMD<uint8_t, 32> vorr(SIMD<uint8_t, 32> a, SIMD<uint8_t, 32> b)
{
	SIMD<uint8_t, 32> tmp;
	tmp.m[0] = vorrq_u8(a.m[0], b.m[0]);
	tmp.m[1] = vorrq_u8(a.m[1], b.m[1]);
	return tmp;
}

template <>
inline SIMD<uint16_t, 16> vorr(SIMD<uint16_t, 16> a, SIMD<uint16_t, 16> b)
{
	SIMD<uint16_t, 16> tmp;
	tmp.m[0] = vorrq_u16(a.m[0], b.m[0]);
	tmp.m[1] = vorrq_u16(a.m[1], b.m[1]);
	return tmp;
}

template <>
inline SIMD<uint32_t, 8> vorr(SIMD<uint32_t, 8> a, SIMD<uint32_t, 8> b)
{
	SIMD<uint32_t, 8> tmp;
	tmp.m[0] = vorrq_u32(a.m[0], b.m[0]);
	tmp.m[1] = vorrq_u32(a.m[1], b.m[1]);
	return tmp;
}

template <>
inline SIMD<uint64_t, 4> vorr(SIMD<uint64_t, 4> a, SIMD<uint64_t, 4> b)
{
	SIMD<uint64_t, 4> tmp;
	tmp.m[0] = vorrq_u64(a.m[0], b.m[0]);
	tmp.m[1] = vorrq_u64(a.m[1], b.m[1]);
	return tmp;
}

// Bitwise AND.

template <>
inline SIMD<uint8_t, 32> vand(SIMD<uint8_t, 32> a, SIMD<uint8_t, 32> b)
{
	SIMD<uint8_t, 32> tmp;
	tmp.m[0] = vandq_u8(a.m[0], b.m[0]);
	tmp.m[1] = vandq_u8(a.m[1], b.m[1]);
	return tmp;
}

template <>
inline SIMD<uint16_t, 16> vand(SIMD<uint16_t, 16> a, SIMD<uint16_t, 16> b)
{
	SIMD<uint16_t, 16> tmp;
	tmp.m[0] = vandq_u16(a.m[0], b.m[0]);
	tmp.m[1] = vandq_u16(a.m[1], b.m[1]);
	return tmp;
}

template <>
inline SIMD<uint32_t, 8> vand(SIMD<uint32_t, 8> a, SIMD<uint32_t, 8> b)
{
	SIMD<uint32_t, 8> tmp;
	tmp.m[0] = vandq_u32(a.m[0], b.m[0]);
	tmp.m[1] = vandq_u32(a.m[1], b.m[1]);
	return tmp;
}

template <>
inline SIMD<uint64_t, 4> vand(SIMD<uint64_t, 4> a, SIMD<uint64_t, 4> b)
{
	SIMD<uint64_t, 4> tmp;
	tmp.m[0] = vandq_u64(a.m[0], b.m[0]);
	tmp.m[1] = vandq_u64(a.m[1], b.m[1]);
	return tmp;
}

// Bitwise XOR.

template <>
inline SIMD<uint8_t, 32> veor(SIMD<uint8_t, 32> a, SIMD<uint8_t, 32> b)
{
	SIMD<uint8_t, 32> tmp;
	tmp.m[0] = veorq_u8(a.m[0], b.m[0]);
	tmp.m[1] = veorq_u8(a.m[1], b.m[1]);
	return tmp;
}

template <>
inline SIMD<uint16_t, 16> veor(SIMD<uint16_t, 16> a, SIMD<uint16_t, 16> b)
{
	SIMD<uint16_t, 16> tmp;
	tmp.m[0] = veorq_u16(a.m[0], b.m[0]);
	tmp.m[1] = veorq_u16(a.m[1], b.m[1]);
	return tmp;
}

template <>
inline SIMD<uint32_t, 8> veor(SIMD<uint32_t, 8> a, SIMD<uint32_t, 8> b)
{
	SIMD<uint32_t, 8> tmp;
	tmp.m[0] = veorq_u32(a.m[0], b.m[0]);
	tmp.m[1] = veorq_u32(a.m[1], b.m[1]);
	return tmp;
}

template <>
inline SIMD<uint64_t, 4> veor(SIMD<uint64_t, 4> a, SIMD<uint64_t, 4> b)
{
	SIMD<uint64_t, 4> tmp;
	tmp.m[0] = veorq_u64(a.m[0], b.m[0]);
	tmp.m[1] = veorq_u64(a.m[1], b.m[1]);
	return tmp;
}

// Bit clear: a AND NOT b.

template <>
inline SIMD<uint8_t, 32> vbic(SIMD<uint8_t, 32> a, SIMD<uint8_t, 32> b)
{
	SIMD<uint8_t, 32> tmp;
	tmp.m[0] = vbicq_u8(a.m[0], b.m[0]);
	tmp.m[1] = vbicq_u8(a.m[1], b.m[1]);
	return tmp;
}

template <>
inline SIMD<uint16_t, 16> vbic(SIMD<uint16_t, 16> a, SIMD<uint16_t, 16> b)
{
	SIMD<uint16_t, 16> tmp;
	tmp.m[0] = vbicq_u16(a.m[0], b.m[0]);
	tmp.m[1] = vbicq_u16(a.m[1], b.m[1]);
	return tmp;
}

template <>
inline SIMD<uint32_t, 8> vbic(SIMD<uint32_t, 8> a, SIMD<uint32_t, 8> b)
{
	SIMD<uint32_t, 8> tmp;
	tmp.m[0] = vbicq_u32(a.m[0], b.m[0]);
	tmp.m[1] = vbicq_u32(a.m[1], b.m[1]);
	return tmp;
}

template <>
inline SIMD<uint64_t, 4> vbic(SIMD<uint64_t, 4> a, SIMD<uint64_t, 4> b)
{
	SIMD<uint64_t, 4> tmp;
	tmp.m[0] = vbicq_u64(a.m[0], b.m[0]);
	tmp.m[1] = vbicq_u64(a.m[1], b.m[1]);
	return tmp;
}

// Bitwise select: a chooses bits from b (set) or c (clear).

template <>
inline SIMD<uint8_t, 32> vbsl(SIMD<uint8_t, 32> a, SIMD<uint8_t, 32> b, SIMD<uint8_t, 32> c)
{
	SIMD<uint8_t, 32> tmp;
	tmp.m[0] = vbslq_u8(a.m[0], b.m[0], c.m[0]);
	tmp.m[1] = vbslq_u8(a.m[1], b.m[1], c.m[1]);
	return tmp;
}

template <>
inline SIMD<uint16_t, 16> vbsl(SIMD<uint16_t, 16> a, SIMD<uint16_t, 16> b, SIMD<uint16_t, 16> c)
{
	SIMD<uint16_t, 16> tmp;
	tmp.m[0] = vbslq_u16(a.m[0], b.m[0], c.m[0]);
	tmp.m[1] = vbslq_u16(a.m[1], b.m[1], c.m[1]);
	return tmp;
}

template <>
inline SIMD<uint32_t, 8> vbsl(SIMD<uint32_t, 8> a, SIMD<uint32_t, 8> b, SIMD<uint32_t, 8> c)
{
	SIMD<uint32_t, 8> tmp;
	tmp.m[0] = vbslq_u32(a.m[0], b.m[0], c.m[0]);
	tmp.m[1] = vbslq_u32(a.m[1], b.m[1], c.m[1]);
	return tmp;
}

template <>
inline SIMD<uint64_t, 4> vbsl(SIMD<uint64_t, 4> a, SIMD<uint64_t, 4> b, SIMD<uint64_t, 4> c)
{
	SIMD<uint64_t, 4> tmp;
	tmp.m[0] = vbslq_u64(a.m[0], b.m[0], c.m[0]);
	tmp.m[1] = vbslq_u64(a.m[1], b.m[1], c.m[1]);
	return tmp;
}

// Comparisons produce all-ones/all-zeros lane masks of the unsigned type.

template <>
inline SIMD<uint32_t, 8> vceqz(SIMD<float, 8> a)
{
	SIMD<uint32_t, 8> tmp;
	tmp.m[0] = vceqq_f32(a.m[0], vdupq_n_f32(0.f));
	tmp.m[1] = vceqq_f32(a.m[1], vdupq_n_f32(0.f));
	return tmp;
}

template <>
inline SIMD<uint8_t, 32> vceqz(SIMD<int8_t, 32> a)
{
	SIMD<uint8_t, 32> tmp;
	tmp.m[0] = vceqq_s8(a.m[0], vdupq_n_s8(0));
	tmp.m[1] = vceqq_s8(a.m[1], vdupq_n_s8(0));
	return tmp;
}

template <>
inline SIMD<uint16_t, 16> vceqz(SIMD<int16_t, 16> a)
{
	SIMD<uint16_t, 16> tmp;
	tmp.m[0] = vceqq_s16(a.m[0], vdupq_n_s16(0));
	tmp.m[1] = vceqq_s16(a.m[1], vdupq_n_s16(0));
	return tmp;
}

template <>
inline SIMD<uint32_t, 8> vceqz(SIMD<int32_t, 8> a)
{
	SIMD<uint32_t, 8> tmp;
	tmp.m[0] = vceqq_s32(a.m[0], vdupq_n_s32(0));
	tmp.m[1] = vceqq_s32(a.m[1], vdupq_n_s32(0));
	return tmp;
}

template <>
inline SIMD<uint32_t, 8> vceq(SIMD<float, 8> a, SIMD<float, 8> b)
{
	SIMD<uint32_t, 8> tmp;
	tmp.m[0] = vceqq_f32(a.m[0], b.m[0]);
	tmp.m[1] = vceqq_f32(a.m[1], b.m[1]);
	return tmp;
}

template <>
inline SIMD<uint8_t, 32> vceq(SIMD<int8_t, 32> a, SIMD<int8_t, 32> b)
{
	SIMD<uint8_t, 32> tmp;
	tmp.m[0] = vceqq_s8(a.m[0], b.m[0]);
	tmp.m[1] = vceqq_s8(a.m[1], b.m[1]);
	return tmp;
}

template <>
inline SIMD<uint16_t, 16> vceq(SIMD<int16_t, 16> a, SIMD<int16_t, 16> b)
{
	SIMD<uint16_t, 16> tmp;
	tmp.m[0] = vceqq_s16(a.m[0], b.m[0]);
	tmp.m[1] = vceqq_s16(a.m[1], b.m[1]);
	return tmp;
}

template <>
inline SIMD<uint32_t, 8> vceq(SIMD<int32_t, 8> a, SIMD<int32_t, 8> b)
{
	SIMD<uint32_t, 8> tmp;
	tmp.m[0] = vceqq_s32(a.m[0], b.m[0]);
	tmp.m[1] = vceqq_s32(a.m[1], b.m[1]);
	return tmp;
}

template <>
inline SIMD<uint32_t, 8> vcgtz(SIMD<float, 8> a)
{
	SIMD<uint32_t, 8> tmp;
	tmp.m[0] = vcgtq_f32(a.m[0], vdupq_n_f32(0.f));
	tmp.m[1] = vcgtq_f32(a.m[1], vdupq_n_f32(0.f));
	return tmp;
}

template <>
inline SIMD<uint8_t, 32> vcgtz(SIMD<int8_t, 32> a)
{
	SIMD<uint8_t, 32> tmp;
	tmp.m[0] = vcgtq_s8(a.m[0], vdupq_n_s8(0));
	tmp.m[1] = vcgtq_s8(a.m[1], vdupq_n_s8(0));
	return tmp;
}

template <>
inline SIMD<uint16_t, 16> vcgtz(SIMD<int16_t, 16> a)
{
	SIMD<uint16_t, 16> tmp;
	tmp.m[0] = vcgtq_s16(a.m[0], vdupq_n_s16(0));
	tmp.m[1] = vcgtq_s16(a.m[1], vdupq_n_s16(0));
	return tmp;
}

template <>
inline SIMD<uint32_t, 8> vcgtz(SIMD<int32_t, 8> a)
{
	SIMD<uint32_t, 8> tmp;
	tmp.m[0] = vcgtq_s32(a.m[0], vdupq_n_s32(0));
	tmp.m[1] = vcgtq_s32(a.m[1], vdupq_n_s32(0));
	return tmp;
}

template <>
inline SIMD<uint32_t, 8> vcltz(SIMD<float, 8> a)
{
	SIMD<uint32_t, 8> tmp;
	tmp.m[0] = vcltq_f32(a.m[0], vdupq_n_f32(0.f));
	tmp.m[1] = vcltq_f32(a.m[1], vdupq_n_f32(0.f));
	return tmp;
}

template <>
inline SIMD<uint8_t, 32> vcltz(SIMD<int8_t, 32> a)
{
	SIMD<uint8_t, 32> tmp;
	tmp.m[0] = vcltq_s8(a.m[0], vdupq_n_s8(0));
	tmp.m[1] = vcltq_s8(a.m[1], vdupq_n_s8(0));
	return tmp;
}

template <>
inline SIMD<uint16_t, 16> vcltz(SIMD<int16_t, 16> a)
{
	SIMD<uint16_t, 16> tmp;
	tmp.m[0] = vcltq_s16(a.m[0], vdupq_n_s16(0));
	tmp.m[1] = vcltq_s16(a.m[1], vdupq_n_s16(0));
	return tmp;
}

template <>
inline SIMD<uint32_t, 8> vcltz(SIMD<int32_t, 8> a)
{
	SIMD<uint32_t, 8> tmp;
	tmp.m[0] = vcltq_s32(a.m[0], vdupq_n_s32(0));
	tmp.m[1] = vcltq_s32(a.m[1], vdupq_n_s32(0));
	return tmp;
}

template <>
inline SIMD<uint32_t, 8> vclez(SIMD<float, 8> a)
{
	SIMD<uint32_t, 8> tmp;
	tmp.m[0] = vcleq_f32(a.m[0], vdupq_n_f32(0.f));
	tmp.m[1] = vcleq_f32(a.m[1], vdupq_n_f32(0.f));
	return tmp;
}

template <>
inline SIMD<uint8_t, 32> vclez(SIMD<int8_t, 32> a)
{
	SIMD<uint8_t, 32> tmp;
	tmp.m[0] = vcleq_s8(a.m[0], vdupq_n_s8(0));
	tmp.m[1] = vcleq_s8(a.m[1], vdupq_n_s8(0));
	return tmp;
}

template <>
inline SIMD<uint16_t, 16> vclez(SIMD<int16_t, 16> a)
{
	SIMD<uint16_t, 16> tmp;
	tmp.m[0] = vcleq_s16(a.m[0], vdupq_n_s16(0));
	tmp.m[1] = vcleq_s16(a.m[1], vdupq_n_s16(0));
	return tmp;
}

template <>
inline SIMD<uint32_t, 8> vclez(SIMD<int32_t, 8> a)
{
	SIMD<uint32_t, 8> tmp;
	tmp.m[0] = vcleq_s32(a.m[0], vdupq_n_s32(0));
	tmp.m[1] = vcleq_s32(a.m[1], vdupq_n_s32(0));
	return tmp;
}

// Lane-wise minimum / maximum.

template <>
inline SIMD<float, 8> vmin(SIMD<float, 8> a, SIMD<float, 8> b)
{
	SIMD<float, 8> tmp;
	tmp.m[0] = vminq_f32(a.m[0], b.m[0]);
	tmp.m[1] = vminq_f32(a.m[1], b.m[1]);
	return tmp;
}

template <>
inline SIMD<int8_t, 32> vmin(SIMD<int8_t, 32> a, SIMD<int8_t, 32> b)
{
	SIMD<int8_t, 32> tmp;
	tmp.m[0] = vminq_s8(a.m[0], b.m[0]);
	tmp.m[1] = vminq_s8(a.m[1], b.m[1]);
	return tmp;
}

template <>
inline SIMD<int16_t, 16> vmin(SIMD<int16_t, 16> a, SIMD<int16_t, 16> b)
{
	SIMD<int16_t, 16> tmp;
	tmp.m[0] = vminq_s16(a.m[0], b.m[0]);
	tmp.m[1] = vminq_s16(a.m[1], b.m[1]);
	return tmp;
}

template <>
inline SIMD<int32_t, 8> vmin(SIMD<int32_t, 8> a, SIMD<int32_t, 8> b)
{
	SIMD<int32_t, 8> tmp;
	tmp.m[0] = vminq_s32(a.m[0], b.m[0]);
	tmp.m[1] = vminq_s32(a.m[1], b.m[1]);
	return tmp;
}

template <>
inline SIMD<float, 8> vmax(SIMD<float, 8> a, SIMD<float, 8> b)
{
	SIMD<float, 8> tmp;
	tmp.m[0] = vmaxq_f32(a.m[0], b.m[0]);
	tmp.m[1] = vmaxq_f32(a.m[1], b.m[1]);
	return tmp;
}

template <>
inline SIMD<int8_t, 32> vmax(SIMD<int8_t, 32> a, SIMD<int8_t, 32> b)
{
	SIMD<int8_t, 32> tmp;
	tmp.m[0] = vmaxq_s8(a.m[0], b.m[0]);
	tmp.m[1] = vmaxq_s8(a.m[1], b.m[1]);
	return tmp;
}

template <>
inline SIMD<int16_t, 16> vmax(SIMD<int16_t, 16> a, SIMD<int16_t, 16> b)
{
	SIMD<int16_t, 16> tmp;
	tmp.m[0] = vmaxq_s16(a.m[0], b.m[0]);
	tmp.m[1] = vmaxq_s16(a.m[1], b.m[1]);
	return tmp;
}

template <>
inline SIMD<int32_t, 8> vmax(SIMD<int32_t, 8> a, SIMD<int32_t, 8> b)
{
	SIMD<int32_t, 8> tmp;
	tmp.m[0] = vmaxq_s32(a.m[0], b.m[0]);
	tmp.m[1] = vmaxq_s32(a.m[1], b.m[1]);
	return tmp;
}

// Clamp each lane of x into the inclusive range [a, b].

template <>
inline SIMD<float, 8> vclamp(SIMD<float, 8> x, float a, float b)
{
	SIMD<float, 8> tmp;
	tmp.m[0] = vminq_f32(vmaxq_f32(x.m[0], vdupq_n_f32(a)), vdupq_n_f32(b));
	tmp.m[1] = vminq_f32(vmaxq_f32(x.m[1], vdupq_n_f32(a)), vdupq_n_f32(b));
	return tmp;
}

template <>
inline SIMD<int8_t, 32> vclamp(SIMD<int8_t, 32> x, int8_t a, int8_t b)
{
	SIMD<int8_t, 32> tmp;
	tmp.m[0] = vminq_s8(vmaxq_s8(x.m[0], vdupq_n_s8(a)), vdupq_n_s8(b));
	tmp.m[1] = vminq_s8(vmaxq_s8(x.m[1], vdupq_n_s8(a)), vdupq_n_s8(b));
	return tmp;
}

template <>
inline SIMD<int16_t, 16> vclamp(SIMD<int16_t, 16> x, int16_t a, int16_t b)
{
	SIMD<int16_t, 16> tmp;
	tmp.m[0] = vminq_s16(vmaxq_s16(x.m[0], vdupq_n_s16(a)), vdupq_n_s16(b));
	tmp.m[1] = vminq_s16(vmaxq_s16(x.m[1], vdupq_n_s16(a)), vdupq_n_s16(b));
	return tmp;
}

template <>
inline SIMD<int32_t, 8> vclamp(SIMD<int32_t, 8> x, int32_t a, int32_t b)
{
	SIMD<int32_t, 8> tmp;
	tmp.m[0] = vminq_s32(vmaxq_s32(x.m[0], vdupq_n_s32(a)), vdupq_n_s32(b));
	tmp.m[1] = vminq_s32(vmaxq_s32(x.m[1], vdupq_n_s32(a)), vdupq_n_s32(b));
	return tmp;
}