From 0bd5166a4a9e7e6d508147d01c3011960b60e3b2 Mon Sep 17 00:00:00 2001 From: Ahmet Inan Date: Tue, 20 Jan 2026 10:48:49 +0100 Subject: [PATCH] use neon struct types --- neon_double.hh | 248 +++++++++++++++++++++---------------------- neon_quadruple.hh | 260 +++++++++++++++++++++++----------------------- neon_triple.hh | 256 ++++++++++++++++++++++----------------------- 3 files changed, 382 insertions(+), 382 deletions(-) diff --git a/neon_double.hh b/neon_double.hh index e5ae2fd..35ea118 100644 --- a/neon_double.hh +++ b/neon_double.hh @@ -14,7 +14,7 @@ union SIMD static const int SIZE = 8; typedef float value_type; typedef uint32_t uint_type; - float32x4_t m[2]; + float32x4x2_t m; value_type v[SIZE]; uint_type u[SIZE]; }; @@ -25,7 +25,7 @@ union SIMD static const int SIZE = 32; typedef int8_t value_type; typedef uint8_t uint_type; - int8x16_t m[2]; + int8x16x2_t m; value_type v[SIZE]; uint_type u[SIZE]; }; @@ -36,7 +36,7 @@ union SIMD static const int SIZE = 16; typedef int16_t value_type; typedef uint16_t uint_type; - int16x8_t m[2]; + int16x8x2_t m; value_type v[SIZE]; uint_type u[SIZE]; }; @@ -47,7 +47,7 @@ union SIMD static const int SIZE = 8; typedef int32_t value_type; typedef uint32_t uint_type; - int32x4_t m[2]; + int32x4x2_t m; value_type v[SIZE]; uint_type u[SIZE]; }; @@ -58,7 +58,7 @@ union SIMD static const int SIZE = 4; typedef int64_t value_type; typedef uint64_t uint_type; - int64x2_t m[2]; + int64x2x2_t m; value_type v[SIZE]; uint_type u[SIZE]; }; @@ -69,7 +69,7 @@ union SIMD static const int SIZE = 32; typedef uint8_t value_type; typedef uint8_t uint_type; - uint8x16_t m[2]; + uint8x16x2_t m; value_type v[SIZE]; uint_type u[SIZE]; }; @@ -80,7 +80,7 @@ union SIMD static const int SIZE = 16; typedef uint16_t value_type; typedef uint16_t uint_type; - uint16x8_t m[2]; + uint16x8x2_t m; value_type v[SIZE]; uint_type u[SIZE]; }; @@ -91,7 +91,7 @@ union SIMD static const int SIZE = 8; typedef uint32_t value_type; typedef uint32_t uint_type; - uint32x4_t m[2]; + uint32x4x2_t m; value_type v[SIZE]; uint_type u[SIZE]; }; @@ -102,7 +102,7 @@ union SIMD static const int SIZE = 4; typedef uint64_t value_type; typedef uint64_t uint_type; - uint64x2_t m[2]; + uint64x2x2_t m; value_type v[SIZE]; uint_type u[SIZE]; }; @@ -112,7 +112,7 @@ inline SIMD vreinterpret(SIMD a) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = (float32x4_t)a.m[i]; + tmp.m.val[i] = (float32x4_t)a.m.val[i]; return tmp; } @@ -121,7 +121,7 @@ inline SIMD vreinterpret(SIMD a) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = (uint32x4_t)a.m[i]; + tmp.m.val[i] = (uint32x4_t)a.m.val[i]; return tmp; } @@ -130,7 +130,7 @@ inline SIMD vreinterpret(SIMD a) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = (int8x16_t)a.m[i]; + tmp.m.val[i] = (int8x16_t)a.m.val[i]; return tmp; } @@ -139,7 +139,7 @@ inline SIMD vreinterpret(SIMD a) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = (uint8x16_t)a.m[i]; + tmp.m.val[i] = (uint8x16_t)a.m.val[i]; return tmp; } @@ -148,7 +148,7 @@ inline SIMD vreinterpret(SIMD a) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = (int16x8_t)a.m[i]; + tmp.m.val[i] = (int16x8_t)a.m.val[i]; return tmp; } @@ -157,7 +157,7 @@ inline SIMD vreinterpret(SIMD a) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = (uint16x8_t)a.m[i]; + tmp.m.val[i] = (uint16x8_t)a.m.val[i]; return tmp; } @@ -166,7 +166,7 @@ inline SIMD vreinterpret(SIMD a) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = (int32x4_t)a.m[i]; + tmp.m.val[i] = (int32x4_t)a.m.val[i]; return tmp; } @@ -175,7 +175,7 @@ inline SIMD vreinterpret(SIMD a) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = (uint32x4_t)a.m[i]; + tmp.m.val[i] = (uint32x4_t)a.m.val[i]; return tmp; } @@ -184,7 +184,7 @@ inline SIMD vdup(float a) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vdupq_n_f32(a); + tmp.m.val[i] = vdupq_n_f32(a); return tmp; } @@ -193,7 +193,7 @@ inline SIMD vdup(int8_t a) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vdupq_n_s8(a); + tmp.m.val[i] = vdupq_n_s8(a); return tmp; } @@ -202,7 +202,7 @@ inline SIMD vdup(int16_t a) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vdupq_n_s16(a); + tmp.m.val[i] = vdupq_n_s16(a); return tmp; } @@ -211,7 +211,7 @@ inline SIMD vdup(int32_t a) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vdupq_n_s32(a); + tmp.m.val[i] = vdupq_n_s32(a); return tmp; } @@ -220,7 +220,7 @@ inline SIMD vdup(int64_t a) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vdupq_n_s64(a); + tmp.m.val[i] = vdupq_n_s64(a); return tmp; } @@ -229,7 +229,7 @@ inline SIMD vdup(uint8_t a) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vdupq_n_u8(a); + tmp.m.val[i] = vdupq_n_u8(a); return tmp; } @@ -238,7 +238,7 @@ inline SIMD vdup(uint16_t a) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vdupq_n_u16(a); + tmp.m.val[i] = vdupq_n_u16(a); return tmp; } @@ -247,7 +247,7 @@ inline SIMD vdup(uint32_t a) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vdupq_n_u32(a); + tmp.m.val[i] = vdupq_n_u32(a); return tmp; } @@ -256,7 +256,7 @@ inline SIMD vdup(uint64_t a) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vdupq_n_u64(a); + tmp.m.val[i] = vdupq_n_u64(a); return tmp; } @@ -265,7 +265,7 @@ inline SIMD vzero() { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = (float32x4_t)veorq_u32((uint32x4_t)tmp.m[i], (uint32x4_t)tmp.m[i]); + tmp.m.val[i] = (float32x4_t)veorq_u32((uint32x4_t)tmp.m.val[i], (uint32x4_t)tmp.m.val[i]); return tmp; } @@ -274,7 +274,7 @@ inline SIMD vzero() { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = veorq_s8(tmp.m[i], tmp.m[i]); + tmp.m.val[i] = veorq_s8(tmp.m.val[i], tmp.m.val[i]); return tmp; } @@ -283,7 +283,7 @@ inline SIMD vzero() { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = veorq_s16(tmp.m[i], tmp.m[i]); + tmp.m.val[i] = veorq_s16(tmp.m.val[i], tmp.m.val[i]); return tmp; } @@ -292,7 +292,7 @@ inline SIMD vzero() { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = veorq_s32(tmp.m[i], tmp.m[i]); + tmp.m.val[i] = veorq_s32(tmp.m.val[i], tmp.m.val[i]); return tmp; } @@ -301,7 +301,7 @@ inline SIMD vzero() { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = veorq_s64(tmp.m[i], tmp.m[i]); + tmp.m.val[i] = veorq_s64(tmp.m.val[i], tmp.m.val[i]); return tmp; } @@ -310,7 +310,7 @@ inline SIMD vzero() { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = veorq_u8(tmp.m[i], tmp.m[i]); + tmp.m.val[i] = veorq_u8(tmp.m.val[i], tmp.m.val[i]); return tmp; } @@ -319,7 +319,7 @@ inline SIMD vzero() { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = veorq_u16(tmp.m[i], tmp.m[i]); + tmp.m.val[i] = veorq_u16(tmp.m.val[i], tmp.m.val[i]); return tmp; } @@ -328,7 +328,7 @@ inline SIMD vzero() { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = veorq_u32(tmp.m[i], tmp.m[i]); + tmp.m.val[i] = veorq_u32(tmp.m.val[i], tmp.m.val[i]); return tmp; } @@ -337,7 +337,7 @@ inline SIMD vzero() { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = veorq_u64(tmp.m[i], tmp.m[i]); + tmp.m.val[i] = veorq_u64(tmp.m.val[i], tmp.m.val[i]); return tmp; } @@ -346,7 +346,7 @@ inline SIMD vadd(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vaddq_f32(a.m[i], b.m[i]); + tmp.m.val[i] = vaddq_f32(a.m.val[i], b.m.val[i]); return tmp; } @@ -355,7 +355,7 @@ inline SIMD vadd(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vaddq_s8(a.m[i], b.m[i]); + tmp.m.val[i] = vaddq_s8(a.m.val[i], b.m.val[i]); return tmp; } @@ -364,7 +364,7 @@ inline SIMD vadd(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vaddq_s16(a.m[i], b.m[i]); + tmp.m.val[i] = vaddq_s16(a.m.val[i], b.m.val[i]); return tmp; } @@ -373,7 +373,7 @@ inline SIMD vadd(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vaddq_s32(a.m[i], b.m[i]); + tmp.m.val[i] = vaddq_s32(a.m.val[i], b.m.val[i]); return tmp; } @@ -382,7 +382,7 @@ inline SIMD vadd(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vaddq_s64(a.m[i], b.m[i]); + tmp.m.val[i] = vaddq_s64(a.m.val[i], b.m.val[i]); return tmp; } @@ -391,7 +391,7 @@ inline SIMD vqadd(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vqaddq_s8(a.m[i], b.m[i]); + tmp.m.val[i] = vqaddq_s8(a.m.val[i], b.m.val[i]); return tmp; } @@ -400,7 +400,7 @@ inline SIMD vqadd(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vqaddq_s16(a.m[i], b.m[i]); + tmp.m.val[i] = vqaddq_s16(a.m.val[i], b.m.val[i]); return tmp; } @@ -409,7 +409,7 @@ inline SIMD vsub(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vsubq_f32(a.m[i], b.m[i]); + tmp.m.val[i] = vsubq_f32(a.m.val[i], b.m.val[i]); return tmp; } @@ -418,7 +418,7 @@ inline SIMD vsub(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vsubq_s8(a.m[i], b.m[i]); + tmp.m.val[i] = vsubq_s8(a.m.val[i], b.m.val[i]); return tmp; } @@ -427,7 +427,7 @@ inline SIMD vsub(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vsubq_s16(a.m[i], b.m[i]); + tmp.m.val[i] = vsubq_s16(a.m.val[i], b.m.val[i]); return tmp; } @@ -436,7 +436,7 @@ inline SIMD vsub(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vsubq_s32(a.m[i], b.m[i]); + tmp.m.val[i] = vsubq_s32(a.m.val[i], b.m.val[i]); return tmp; } @@ -445,7 +445,7 @@ inline SIMD vsub(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vsubq_s64(a.m[i], b.m[i]); + tmp.m.val[i] = vsubq_s64(a.m.val[i], b.m.val[i]); return tmp; } @@ -454,7 +454,7 @@ inline SIMD vqsub(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vqsubq_s8(a.m[i], b.m[i]); + tmp.m.val[i] = vqsubq_s8(a.m.val[i], b.m.val[i]); return tmp; } @@ -463,7 +463,7 @@ inline SIMD vqsub(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vqsubq_s16(a.m[i], b.m[i]); + tmp.m.val[i] = vqsubq_s16(a.m.val[i], b.m.val[i]); return tmp; } @@ -472,7 +472,7 @@ inline SIMD vqsub(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vqsubq_u8(a.m[i], b.m[i]); + tmp.m.val[i] = vqsubq_u8(a.m.val[i], b.m.val[i]); return tmp; } @@ -481,7 +481,7 @@ inline SIMD vqsub(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vqsubq_u16(a.m[i], b.m[i]); + tmp.m.val[i] = vqsubq_u16(a.m.val[i], b.m.val[i]); return tmp; } @@ -490,7 +490,7 @@ inline SIMD vmul(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vmulq_f32(a.m[i], b.m[i]); + tmp.m.val[i] = vmulq_f32(a.m.val[i], b.m.val[i]); return tmp; } @@ -499,7 +499,7 @@ inline SIMD vmul(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vmulq_s8(a.m[i], b.m[i]); + tmp.m.val[i] = vmulq_s8(a.m.val[i], b.m.val[i]); return tmp; } @@ -508,7 +508,7 @@ inline SIMD vabs(SIMD a) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vabsq_f32(a.m[i]); + tmp.m.val[i] = vabsq_f32(a.m.val[i]); return tmp; } @@ -517,7 +517,7 @@ inline SIMD vqabs(SIMD a) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vqabsq_s8(a.m[i]); + tmp.m.val[i] = vqabsq_s8(a.m.val[i]); return tmp; } @@ -526,7 +526,7 @@ inline SIMD vqabs(SIMD a) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vqabsq_s16(a.m[i]); + tmp.m.val[i] = vqabsq_s16(a.m.val[i]); return tmp; } @@ -535,9 +535,9 @@ inline SIMD vsignum(SIMD a) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = (float32x4_t)vbicq_u32( - veorq_u32((uint32x4_t)vdupq_n_f32(1.f), vandq_u32((uint32x4_t)vdupq_n_f32(-0.f), (uint32x4_t)a.m[i])), - vceqq_f32(a.m[i], vdupq_n_f32(0.f))); + tmp.m.val[i] = (float32x4_t)vbicq_u32( + veorq_u32((uint32x4_t)vdupq_n_f32(1.f), vandq_u32((uint32x4_t)vdupq_n_f32(-0.f), (uint32x4_t)a.m.val[i])), + vceqq_f32(a.m.val[i], vdupq_n_f32(0.f))); return tmp; } @@ -546,8 +546,8 @@ inline SIMD vsignum(SIMD a) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = (int8x16_t)vorrq_u8(vcgtq_s8(vdupq_n_s8(0), a.m[i]), - vandq_u8(vcgtq_s8(a.m[i], vdupq_n_s8(0)), (uint8x16_t)vdupq_n_s8(1))); + tmp.m.val[i] = (int8x16_t)vorrq_u8(vcgtq_s8(vdupq_n_s8(0), a.m.val[i]), + vandq_u8(vcgtq_s8(a.m.val[i], vdupq_n_s8(0)), (uint8x16_t)vdupq_n_s8(1))); return tmp; } @@ -556,9 +556,9 @@ inline SIMD vsign(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = (float32x4_t)vbicq_u32( - veorq_u32((uint32x4_t)a.m[i], vandq_u32((uint32x4_t)vdupq_n_f32(-0.f), (uint32x4_t)b.m[i])), - vceqq_f32(b.m[i], vdupq_n_f32(0.f))); + tmp.m.val[i] = (float32x4_t)vbicq_u32( + veorq_u32((uint32x4_t)a.m.val[i], vandq_u32((uint32x4_t)vdupq_n_f32(-0.f), (uint32x4_t)b.m.val[i])), + vceqq_f32(b.m.val[i], vdupq_n_f32(0.f))); return tmp; } @@ -567,9 +567,9 @@ inline SIMD vsign(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = (int8x16_t)vorrq_u8( - vandq_u8(vcgtq_s8(vdupq_n_s8(0), b.m[i]), (uint8x16_t)vnegq_s8(a.m[i])), - vandq_u8(vcgtq_s8(b.m[i], vdupq_n_s8(0)), (uint8x16_t)a.m[i])); + tmp.m.val[i] = (int8x16_t)vorrq_u8( + vandq_u8(vcgtq_s8(vdupq_n_s8(0), b.m.val[i]), (uint8x16_t)vnegq_s8(a.m.val[i])), + vandq_u8(vcgtq_s8(b.m.val[i], vdupq_n_s8(0)), (uint8x16_t)a.m.val[i])); return tmp; } @@ -578,9 +578,9 @@ inline SIMD vcopysign(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = (float32x4_t)vorrq_u32( - vbicq_u32((uint32x4_t)a.m[i], (uint32x4_t)vdupq_n_f32(-0.f)), - vandq_u32((uint32x4_t)b.m[i], (uint32x4_t)vdupq_n_f32(-0.f))); + tmp.m.val[i] = (float32x4_t)vorrq_u32( + vbicq_u32((uint32x4_t)a.m.val[i], (uint32x4_t)vdupq_n_f32(-0.f)), + vandq_u32((uint32x4_t)b.m.val[i], (uint32x4_t)vdupq_n_f32(-0.f))); return tmp; } @@ -589,7 +589,7 @@ inline SIMD vorr(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vorrq_u8(a.m[i], b.m[i]); + tmp.m.val[i] = vorrq_u8(a.m.val[i], b.m.val[i]); return tmp; } @@ -598,7 +598,7 @@ inline SIMD vorr(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vorrq_u16(a.m[i], b.m[i]); + tmp.m.val[i] = vorrq_u16(a.m.val[i], b.m.val[i]); return tmp; } @@ -607,7 +607,7 @@ inline SIMD vorr(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vorrq_u32(a.m[i], b.m[i]); + tmp.m.val[i] = vorrq_u32(a.m.val[i], b.m.val[i]); return tmp; } @@ -616,7 +616,7 @@ inline SIMD vorr(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vorrq_u64(a.m[i], b.m[i]); + tmp.m.val[i] = vorrq_u64(a.m.val[i], b.m.val[i]); return tmp; } @@ -625,7 +625,7 @@ inline SIMD vand(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vandq_u8(a.m[i], b.m[i]); + tmp.m.val[i] = vandq_u8(a.m.val[i], b.m.val[i]); return tmp; } @@ -634,7 +634,7 @@ inline SIMD vand(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vandq_u16(a.m[i], b.m[i]); + tmp.m.val[i] = vandq_u16(a.m.val[i], b.m.val[i]); return tmp; } @@ -643,7 +643,7 @@ inline SIMD vand(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vandq_u32(a.m[i], b.m[i]); + tmp.m.val[i] = vandq_u32(a.m.val[i], b.m.val[i]); return tmp; } @@ -652,7 +652,7 @@ inline SIMD vand(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vandq_u64(a.m[i], b.m[i]); + tmp.m.val[i] = vandq_u64(a.m.val[i], b.m.val[i]); return tmp; } @@ -661,7 +661,7 @@ inline SIMD veor(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = veorq_u8(a.m[i], b.m[i]); + tmp.m.val[i] = veorq_u8(a.m.val[i], b.m.val[i]); return tmp; } @@ -670,7 +670,7 @@ inline SIMD veor(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = veorq_u16(a.m[i], b.m[i]); + tmp.m.val[i] = veorq_u16(a.m.val[i], b.m.val[i]); return tmp; } @@ -679,7 +679,7 @@ inline SIMD veor(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = veorq_u32(a.m[i], b.m[i]); + tmp.m.val[i] = veorq_u32(a.m.val[i], b.m.val[i]); return tmp; } @@ -688,7 +688,7 @@ inline SIMD veor(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = veorq_u64(a.m[i], b.m[i]); + tmp.m.val[i] = veorq_u64(a.m.val[i], b.m.val[i]); return tmp; } @@ -697,7 +697,7 @@ inline SIMD vbic(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vbicq_u8(a.m[i], b.m[i]); + tmp.m.val[i] = vbicq_u8(a.m.val[i], b.m.val[i]); return tmp; } @@ -706,7 +706,7 @@ inline SIMD vbic(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vbicq_u16(a.m[i], b.m[i]); + tmp.m.val[i] = vbicq_u16(a.m.val[i], b.m.val[i]); return tmp; } @@ -715,7 +715,7 @@ inline SIMD vbic(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vbicq_u32(a.m[i], b.m[i]); + tmp.m.val[i] = vbicq_u32(a.m.val[i], b.m.val[i]); return tmp; } @@ -724,7 +724,7 @@ inline SIMD vbic(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vbicq_u64(a.m[i], b.m[i]); + tmp.m.val[i] = vbicq_u64(a.m.val[i], b.m.val[i]); return tmp; } @@ -733,7 +733,7 @@ inline SIMD vbsl(SIMD a, SIMD b, SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vbslq_u8(a.m[i], b.m[i], c.m[i]); + tmp.m.val[i] = vbslq_u8(a.m.val[i], b.m.val[i], c.m.val[i]); return tmp; } @@ -742,7 +742,7 @@ inline SIMD vbsl(SIMD a, SIMD b, SIMD< { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vbslq_u16(a.m[i], b.m[i], c.m[i]); + tmp.m.val[i] = vbslq_u16(a.m.val[i], b.m.val[i], c.m.val[i]); return tmp; } @@ -751,7 +751,7 @@ inline SIMD vbsl(SIMD a, SIMD b, SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vbslq_u32(a.m[i], b.m[i], c.m[i]); + tmp.m.val[i] = vbslq_u32(a.m.val[i], b.m.val[i], c.m.val[i]); return tmp; } @@ -760,7 +760,7 @@ inline SIMD vbsl(SIMD a, SIMD b, SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vbslq_u64(a.m[i], b.m[i], c.m[i]); + tmp.m.val[i] = vbslq_u64(a.m.val[i], b.m.val[i], c.m.val[i]); return tmp; } @@ -769,7 +769,7 @@ inline SIMD vceqz(SIMD a) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vceqq_f32(a.m[i], vdupq_n_f32(0.f)); + tmp.m.val[i] = vceqq_f32(a.m.val[i], vdupq_n_f32(0.f)); return tmp; } @@ -778,7 +778,7 @@ inline SIMD vceqz(SIMD a) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vceqq_s8(a.m[i], vdupq_n_s8(0)); + tmp.m.val[i] = vceqq_s8(a.m.val[i], vdupq_n_s8(0)); return tmp; } @@ -787,7 +787,7 @@ inline SIMD vceqz(SIMD a) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vceqq_s16(a.m[i], vdupq_n_s16(0)); + tmp.m.val[i] = vceqq_s16(a.m.val[i], vdupq_n_s16(0)); return tmp; } @@ -796,7 +796,7 @@ inline SIMD vceqz(SIMD a) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vceqq_s32(a.m[i], vdupq_n_s32(0)); + tmp.m.val[i] = vceqq_s32(a.m.val[i], vdupq_n_s32(0)); return tmp; } @@ -805,7 +805,7 @@ inline SIMD vceq(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vceqq_f32(a.m[i], b.m[i]); + tmp.m.val[i] = vceqq_f32(a.m.val[i], b.m.val[i]); return tmp; } @@ -814,7 +814,7 @@ inline SIMD vceq(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vceqq_s8(a.m[i], b.m[i]); + tmp.m.val[i] = vceqq_s8(a.m.val[i], b.m.val[i]); return tmp; } @@ -823,7 +823,7 @@ inline SIMD vceq(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vceqq_s16(a.m[i], b.m[i]); + tmp.m.val[i] = vceqq_s16(a.m.val[i], b.m.val[i]); return tmp; } @@ -832,7 +832,7 @@ inline SIMD vceq(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vceqq_s32(a.m[i], b.m[i]); + tmp.m.val[i] = vceqq_s32(a.m.val[i], b.m.val[i]); return tmp; } @@ -841,7 +841,7 @@ inline SIMD vcgtz(SIMD a) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vcgtq_f32(a.m[i], vdupq_n_f32(0.f)); + tmp.m.val[i] = vcgtq_f32(a.m.val[i], vdupq_n_f32(0.f)); return tmp; } @@ -850,7 +850,7 @@ inline SIMD vcgtz(SIMD a) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vcgtq_s8(a.m[i], vdupq_n_s8(0)); + tmp.m.val[i] = vcgtq_s8(a.m.val[i], vdupq_n_s8(0)); return tmp; } @@ -859,7 +859,7 @@ inline SIMD vcgtz(SIMD a) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vcgtq_s16(a.m[i], vdupq_n_s16(0)); + tmp.m.val[i] = vcgtq_s16(a.m.val[i], vdupq_n_s16(0)); return tmp; } @@ -868,7 +868,7 @@ inline SIMD vcgtz(SIMD a) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vcgtq_s32(a.m[i], vdupq_n_s32(0)); + tmp.m.val[i] = vcgtq_s32(a.m.val[i], vdupq_n_s32(0)); return tmp; } @@ -877,7 +877,7 @@ inline SIMD vcltz(SIMD a) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vcltq_f32(a.m[i], vdupq_n_f32(0.f)); + tmp.m.val[i] = vcltq_f32(a.m.val[i], vdupq_n_f32(0.f)); return tmp; } @@ -886,7 +886,7 @@ inline SIMD vcltz(SIMD a) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vcltq_s8(a.m[i], vdupq_n_s8(0)); + tmp.m.val[i] = vcltq_s8(a.m.val[i], vdupq_n_s8(0)); return tmp; } @@ -895,7 +895,7 @@ inline SIMD vcltz(SIMD a) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vcltq_s16(a.m[i], vdupq_n_s16(0)); + tmp.m.val[i] = vcltq_s16(a.m.val[i], vdupq_n_s16(0)); return tmp; } @@ -904,7 +904,7 @@ inline SIMD vcltz(SIMD a) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vcltq_s32(a.m[i], vdupq_n_s32(0)); + tmp.m.val[i] = vcltq_s32(a.m.val[i], vdupq_n_s32(0)); return tmp; } @@ -913,7 +913,7 @@ inline SIMD vclez(SIMD a) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vcleq_f32(a.m[i], vdupq_n_f32(0.f)); + tmp.m.val[i] = vcleq_f32(a.m.val[i], vdupq_n_f32(0.f)); return tmp; } @@ -922,7 +922,7 @@ inline SIMD vclez(SIMD a) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vcleq_s8(a.m[i], vdupq_n_s8(0)); + tmp.m.val[i] = vcleq_s8(a.m.val[i], vdupq_n_s8(0)); return tmp; } @@ -931,7 +931,7 @@ inline SIMD vclez(SIMD a) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vcleq_s16(a.m[i], vdupq_n_s16(0)); + tmp.m.val[i] = vcleq_s16(a.m.val[i], vdupq_n_s16(0)); return tmp; } @@ -940,7 +940,7 @@ inline SIMD vclez(SIMD a) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vcleq_s32(a.m[i], vdupq_n_s32(0)); + tmp.m.val[i] = vcleq_s32(a.m.val[i], vdupq_n_s32(0)); return tmp; } @@ -949,7 +949,7 @@ inline SIMD vmin(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vminq_f32(a.m[i], b.m[i]); + tmp.m.val[i] = vminq_f32(a.m.val[i], b.m.val[i]); return tmp; } @@ -958,7 +958,7 @@ inline SIMD vmin(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vminq_s8(a.m[i], b.m[i]); + tmp.m.val[i] = vminq_s8(a.m.val[i], b.m.val[i]); return tmp; } @@ -967,7 +967,7 @@ inline SIMD vmin(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vminq_s16(a.m[i], b.m[i]); + tmp.m.val[i] = vminq_s16(a.m.val[i], b.m.val[i]); return tmp; } @@ -976,7 +976,7 @@ inline SIMD vmin(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vminq_s32(a.m[i], b.m[i]); + tmp.m.val[i] = vminq_s32(a.m.val[i], b.m.val[i]); return tmp; } @@ -985,7 +985,7 @@ inline SIMD vmax(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vmaxq_f32(a.m[i], b.m[i]); + tmp.m.val[i] = vmaxq_f32(a.m.val[i], b.m.val[i]); return tmp; } @@ -994,7 +994,7 @@ inline SIMD vmax(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vmaxq_s8(a.m[i], b.m[i]); + tmp.m.val[i] = vmaxq_s8(a.m.val[i], b.m.val[i]); return tmp; } @@ -1003,7 +1003,7 @@ inline SIMD vmax(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vmaxq_s16(a.m[i], b.m[i]); + tmp.m.val[i] = vmaxq_s16(a.m.val[i], b.m.val[i]); return tmp; } @@ -1012,7 +1012,7 @@ inline SIMD vmax(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vmaxq_s32(a.m[i], b.m[i]); + tmp.m.val[i] = vmaxq_s32(a.m.val[i], b.m.val[i]); return tmp; } @@ -1021,7 +1021,7 @@ inline SIMD vclamp(SIMD x, float a, float b) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vminq_f32(vmaxq_f32(x.m[i], vdupq_n_f32(a)), vdupq_n_f32(b)); + tmp.m.val[i] = vminq_f32(vmaxq_f32(x.m.val[i], vdupq_n_f32(a)), vdupq_n_f32(b)); return tmp; } @@ -1030,7 +1030,7 @@ inline SIMD vclamp(SIMD x, int8_t a, int8_t b) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vminq_s8(vmaxq_s8(x.m[i], vdupq_n_s8(a)), vdupq_n_s8(b)); + tmp.m.val[i] = vminq_s8(vmaxq_s8(x.m.val[i], vdupq_n_s8(a)), vdupq_n_s8(b)); return tmp; } @@ -1039,7 +1039,7 @@ inline SIMD vclamp(SIMD x, int16_t a, int16_t b) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vminq_s16(vmaxq_s16(x.m[i], vdupq_n_s16(a)), vdupq_n_s16(b)); + tmp.m.val[i] = vminq_s16(vmaxq_s16(x.m.val[i], vdupq_n_s16(a)), vdupq_n_s16(b)); return tmp; } @@ -1048,7 +1048,7 @@ inline SIMD vclamp(SIMD x, int32_t a, int32_t b) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vminq_s32(vmaxq_s32(x.m[i], vdupq_n_s32(a)), vdupq_n_s32(b)); + tmp.m.val[i] = vminq_s32(vmaxq_s32(x.m.val[i], vdupq_n_s32(a)), vdupq_n_s32(b)); return tmp; } @@ -1058,7 +1058,7 @@ inline SIMD vshuf(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vorrq_u8(vqtbl1q_u8(a.m[0], b.m[i]), vqtbl1q_u8(a.m[1], vsubq_u8(b.m[i], vdupq_n_u8(16)))); + tmp.m.val[i] = vorrq_u8(vqtbl1q_u8(a.m.val[0], b.m.val[i]), vqtbl1q_u8(a.m.val[1], vsubq_u8(b.m.val[i], vdupq_n_u8(16)))); return tmp; } @@ -1067,7 +1067,7 @@ inline SIMD vshuf(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 2; ++i) - tmp.m[i] = vorrq_s8(vqtbl1q_s8(a.m[0], b.m[i]), vqtbl1q_s8(a.m[1], vsubq_u8(b.m[i], vdupq_n_u8(16)))); + tmp.m.val[i] = vorrq_s8(vqtbl1q_s8(a.m.val[0], b.m.val[i]), vqtbl1q_s8(a.m.val[1], vsubq_u8(b.m.val[i], vdupq_n_u8(16)))); return tmp; } #endif diff --git a/neon_quadruple.hh b/neon_quadruple.hh index 0585d24..69bde6f 100644 --- a/neon_quadruple.hh +++ b/neon_quadruple.hh @@ -14,7 +14,7 @@ union SIMD static const int SIZE = 16; typedef float value_type; typedef uint32_t uint_type; - float32x4_t m[4]; + float32x4x4_t m; value_type v[SIZE]; uint_type u[SIZE]; }; @@ -25,7 +25,7 @@ union SIMD static const int SIZE = 64; typedef int8_t value_type; typedef uint8_t uint_type; - int8x16_t m[4]; + int8x16x4_t m; value_type v[SIZE]; uint_type u[SIZE]; }; @@ -36,7 +36,7 @@ union SIMD static const int SIZE = 32; typedef int16_t value_type; typedef uint16_t uint_type; - int16x8_t m[4]; + int16x8x4_t m; value_type v[SIZE]; uint_type u[SIZE]; }; @@ -47,7 +47,7 @@ union SIMD static const int SIZE = 16; typedef int32_t value_type; typedef uint32_t uint_type; - int32x4_t m[4]; + int32x4x4_t m; value_type v[SIZE]; uint_type u[SIZE]; }; @@ -58,7 +58,7 @@ union SIMD static const int SIZE = 8; typedef int64_t value_type; typedef uint64_t uint_type; - int64x2_t m[4]; + int64x2x4_t m; value_type v[SIZE]; uint_type u[SIZE]; }; @@ -69,7 +69,7 @@ union SIMD static const int SIZE = 64; typedef uint8_t value_type; typedef uint8_t uint_type; - uint8x16_t m[4]; + uint8x16x4_t m; value_type v[SIZE]; uint_type u[SIZE]; }; @@ -80,7 +80,7 @@ union SIMD static const int SIZE = 32; typedef uint16_t value_type; typedef uint16_t uint_type; - uint16x8_t m[4]; + uint16x8x4_t m; value_type v[SIZE]; uint_type u[SIZE]; }; @@ -91,7 +91,7 @@ union SIMD static const int SIZE = 16; typedef uint32_t value_type; typedef uint32_t uint_type; - uint32x4_t m[4]; + uint32x4x4_t m; value_type v[SIZE]; uint_type u[SIZE]; }; @@ -102,7 +102,7 @@ union SIMD static const int SIZE = 8; typedef uint64_t value_type; typedef uint64_t uint_type; - uint64x2_t m[4]; + uint64x2x4_t m; value_type v[SIZE]; uint_type u[SIZE]; }; @@ -112,7 +112,7 @@ inline SIMD vreinterpret(SIMD a) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = (float32x4_t)a.m[i]; + tmp.m.val[i] = (float32x4_t)a.m.val[i]; return tmp; } @@ -121,7 +121,7 @@ inline SIMD vreinterpret(SIMD a) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = (uint32x4_t)a.m[i]; + tmp.m.val[i] = (uint32x4_t)a.m.val[i]; return tmp; } @@ -130,7 +130,7 @@ inline SIMD vreinterpret(SIMD a) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = (int8x16_t)a.m[i]; + tmp.m.val[i] = (int8x16_t)a.m.val[i]; return tmp; } @@ -139,7 +139,7 @@ inline SIMD vreinterpret(SIMD a) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = (uint8x16_t)a.m[i]; + tmp.m.val[i] = (uint8x16_t)a.m.val[i]; return tmp; } @@ -148,7 +148,7 @@ inline SIMD vreinterpret(SIMD a) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = (int16x8_t)a.m[i]; + tmp.m.val[i] = (int16x8_t)a.m.val[i]; return tmp; } @@ -157,7 +157,7 @@ inline SIMD vreinterpret(SIMD a) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = (uint16x8_t)a.m[i]; + tmp.m.val[i] = (uint16x8_t)a.m.val[i]; return tmp; } @@ -166,7 +166,7 @@ inline SIMD vreinterpret(SIMD a) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = (int32x4_t)a.m[i]; + tmp.m.val[i] = (int32x4_t)a.m.val[i]; return tmp; } @@ -175,7 +175,7 @@ inline SIMD vreinterpret(SIMD a) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = (uint32x4_t)a.m[i]; + tmp.m.val[i] = (uint32x4_t)a.m.val[i]; return tmp; } @@ -184,7 +184,7 @@ inline SIMD vdup(float a) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vdupq_n_f32(a); + tmp.m.val[i] = vdupq_n_f32(a); return tmp; } @@ -193,7 +193,7 @@ inline SIMD vdup(int8_t a) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vdupq_n_s8(a); + tmp.m.val[i] = vdupq_n_s8(a); return tmp; } @@ -202,7 +202,7 @@ inline SIMD vdup(int16_t a) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vdupq_n_s16(a); + tmp.m.val[i] = vdupq_n_s16(a); return tmp; } @@ -211,7 +211,7 @@ inline SIMD vdup(int32_t a) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vdupq_n_s32(a); + tmp.m.val[i] = vdupq_n_s32(a); return tmp; } @@ -220,7 +220,7 @@ inline SIMD vdup(int64_t a) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vdupq_n_s64(a); + tmp.m.val[i] = vdupq_n_s64(a); return tmp; } @@ -229,7 +229,7 @@ inline SIMD vdup(uint8_t a) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vdupq_n_u8(a); + tmp.m.val[i] = vdupq_n_u8(a); return tmp; } @@ -238,7 +238,7 @@ inline SIMD vdup(uint16_t a) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vdupq_n_u16(a); + tmp.m.val[i] = vdupq_n_u16(a); return tmp; } @@ -247,7 +247,7 @@ inline SIMD vdup(uint32_t a) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vdupq_n_u32(a); + tmp.m.val[i] = vdupq_n_u32(a); return tmp; } @@ -256,7 +256,7 @@ inline SIMD vdup(uint64_t a) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vdupq_n_u64(a); + tmp.m.val[i] = vdupq_n_u64(a); return tmp; } @@ -265,7 +265,7 @@ inline SIMD vzero() { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = (float32x4_t)veorq_u32((uint32x4_t)tmp.m[i], (uint32x4_t)tmp.m[i]); + tmp.m.val[i] = (float32x4_t)veorq_u32((uint32x4_t)tmp.m.val[i], (uint32x4_t)tmp.m.val[i]); return tmp; } @@ -274,7 +274,7 @@ inline SIMD vzero() { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = veorq_s8(tmp.m[i], tmp.m[i]); + tmp.m.val[i] = veorq_s8(tmp.m.val[i], tmp.m.val[i]); return tmp; } @@ -283,7 +283,7 @@ inline SIMD vzero() { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = veorq_s16(tmp.m[i], tmp.m[i]); + tmp.m.val[i] = veorq_s16(tmp.m.val[i], tmp.m.val[i]); return tmp; } @@ -292,7 +292,7 @@ inline SIMD vzero() { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = veorq_s32(tmp.m[i], tmp.m[i]); + tmp.m.val[i] = veorq_s32(tmp.m.val[i], tmp.m.val[i]); return tmp; } @@ -301,7 +301,7 @@ inline SIMD vzero() { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = veorq_s64(tmp.m[i], tmp.m[i]); + tmp.m.val[i] = veorq_s64(tmp.m.val[i], tmp.m.val[i]); return tmp; } @@ -310,7 +310,7 @@ inline SIMD vzero() { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = veorq_u8(tmp.m[i], tmp.m[i]); + tmp.m.val[i] = veorq_u8(tmp.m.val[i], tmp.m.val[i]); return tmp; } @@ -319,7 +319,7 @@ inline SIMD vzero() { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = veorq_u16(tmp.m[i], tmp.m[i]); + tmp.m.val[i] = veorq_u16(tmp.m.val[i], tmp.m.val[i]); return tmp; } @@ -328,7 +328,7 @@ inline SIMD vzero() { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = veorq_u32(tmp.m[i], tmp.m[i]); + tmp.m.val[i] = veorq_u32(tmp.m.val[i], tmp.m.val[i]); return tmp; } @@ -337,7 +337,7 @@ inline SIMD vzero() { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = veorq_u64(tmp.m[i], tmp.m[i]); + tmp.m.val[i] = veorq_u64(tmp.m.val[i], tmp.m.val[i]); return tmp; } @@ -346,7 +346,7 @@ inline SIMD vadd(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vaddq_f32(a.m[i], b.m[i]); + tmp.m.val[i] = vaddq_f32(a.m.val[i], b.m.val[i]); return tmp; } @@ -355,7 +355,7 @@ inline SIMD vadd(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vaddq_s8(a.m[i], b.m[i]); + tmp.m.val[i] = vaddq_s8(a.m.val[i], b.m.val[i]); return tmp; } @@ -364,7 +364,7 @@ inline SIMD vadd(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vaddq_s16(a.m[i], b.m[i]); + tmp.m.val[i] = vaddq_s16(a.m.val[i], b.m.val[i]); return tmp; } @@ -373,7 +373,7 @@ inline SIMD vadd(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vaddq_s32(a.m[i], b.m[i]); + tmp.m.val[i] = vaddq_s32(a.m.val[i], b.m.val[i]); return tmp; } @@ -382,7 +382,7 @@ inline SIMD vadd(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vaddq_s64(a.m[i], b.m[i]); + tmp.m.val[i] = vaddq_s64(a.m.val[i], b.m.val[i]); return tmp; } @@ -391,7 +391,7 @@ inline SIMD vqadd(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vqaddq_s8(a.m[i], b.m[i]); + tmp.m.val[i] = vqaddq_s8(a.m.val[i], b.m.val[i]); return tmp; } @@ -400,7 +400,7 @@ inline SIMD vqadd(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vqaddq_s16(a.m[i], b.m[i]); + tmp.m.val[i] = vqaddq_s16(a.m.val[i], b.m.val[i]); return tmp; } @@ -409,7 +409,7 @@ inline SIMD vsub(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vsubq_f32(a.m[i], b.m[i]); + tmp.m.val[i] = vsubq_f32(a.m.val[i], b.m.val[i]); return tmp; } @@ -418,7 +418,7 @@ inline SIMD vsub(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vsubq_s8(a.m[i], b.m[i]); + tmp.m.val[i] = vsubq_s8(a.m.val[i], b.m.val[i]); return tmp; } @@ -427,7 +427,7 @@ inline SIMD vsub(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vsubq_s16(a.m[i], b.m[i]); + tmp.m.val[i] = vsubq_s16(a.m.val[i], b.m.val[i]); return tmp; } @@ -436,7 +436,7 @@ inline SIMD vsub(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vsubq_s32(a.m[i], b.m[i]); + tmp.m.val[i] = vsubq_s32(a.m.val[i], b.m.val[i]); return tmp; } @@ -445,7 +445,7 @@ inline SIMD vsub(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vsubq_s64(a.m[i], b.m[i]); + tmp.m.val[i] = vsubq_s64(a.m.val[i], b.m.val[i]); return tmp; } @@ -454,7 +454,7 @@ inline SIMD vqsub(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vqsubq_s8(a.m[i], b.m[i]); + tmp.m.val[i] = vqsubq_s8(a.m.val[i], b.m.val[i]); return tmp; } @@ -463,7 +463,7 @@ inline SIMD vqsub(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vqsubq_s16(a.m[i], b.m[i]); + tmp.m.val[i] = vqsubq_s16(a.m.val[i], b.m.val[i]); return tmp; } @@ -472,7 +472,7 @@ inline SIMD vqsub(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vqsubq_u8(a.m[i], b.m[i]); + tmp.m.val[i] = vqsubq_u8(a.m.val[i], b.m.val[i]); return tmp; } @@ -481,7 +481,7 @@ inline SIMD vqsub(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vqsubq_u16(a.m[i], b.m[i]); + tmp.m.val[i] = vqsubq_u16(a.m.val[i], b.m.val[i]); return tmp; } @@ -490,7 +490,7 @@ inline SIMD vmul(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vmulq_f32(a.m[i], b.m[i]); + tmp.m.val[i] = vmulq_f32(a.m.val[i], b.m.val[i]); return tmp; } @@ -499,7 +499,7 @@ inline SIMD vmul(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vmulq_s8(a.m[i], b.m[i]); + tmp.m.val[i] = vmulq_s8(a.m.val[i], b.m.val[i]); return tmp; } @@ -508,7 +508,7 @@ inline SIMD vabs(SIMD a) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vabsq_f32(a.m[i]); + tmp.m.val[i] = vabsq_f32(a.m.val[i]); return tmp; } @@ -517,7 +517,7 @@ inline SIMD vqabs(SIMD a) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vqabsq_s8(a.m[i]); + tmp.m.val[i] = vqabsq_s8(a.m.val[i]); return tmp; } @@ -526,7 +526,7 @@ inline SIMD vqabs(SIMD a) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vqabsq_s16(a.m[i]); + tmp.m.val[i] = vqabsq_s16(a.m.val[i]); return tmp; } @@ -535,9 +535,9 @@ inline SIMD vsignum(SIMD a) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = (float32x4_t)vbicq_u32( - veorq_u32((uint32x4_t)vdupq_n_f32(1.f), vandq_u32((uint32x4_t)vdupq_n_f32(-0.f), (uint32x4_t)a.m[i])), - vceqq_f32(a.m[i], vdupq_n_f32(0.f))); + tmp.m.val[i] = (float32x4_t)vbicq_u32( + veorq_u32((uint32x4_t)vdupq_n_f32(1.f), vandq_u32((uint32x4_t)vdupq_n_f32(-0.f), (uint32x4_t)a.m.val[i])), + vceqq_f32(a.m.val[i], vdupq_n_f32(0.f))); return tmp; } @@ -546,8 +546,8 @@ inline SIMD vsignum(SIMD a) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = (int8x16_t)vorrq_u8(vcgtq_s8(vdupq_n_s8(0), a.m[i]), - vandq_u8(vcgtq_s8(a.m[i], vdupq_n_s8(0)), (uint8x16_t)vdupq_n_s8(1))); + tmp.m.val[i] = (int8x16_t)vorrq_u8(vcgtq_s8(vdupq_n_s8(0), a.m.val[i]), + vandq_u8(vcgtq_s8(a.m.val[i], vdupq_n_s8(0)), (uint8x16_t)vdupq_n_s8(1))); return tmp; } @@ -556,9 +556,9 @@ inline SIMD vsign(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = (float32x4_t)vbicq_u32( - veorq_u32((uint32x4_t)a.m[i], vandq_u32((uint32x4_t)vdupq_n_f32(-0.f), (uint32x4_t)b.m[i])), - vceqq_f32(b.m[i], vdupq_n_f32(0.f))); + tmp.m.val[i] = (float32x4_t)vbicq_u32( + veorq_u32((uint32x4_t)a.m.val[i], vandq_u32((uint32x4_t)vdupq_n_f32(-0.f), (uint32x4_t)b.m.val[i])), + vceqq_f32(b.m.val[i], vdupq_n_f32(0.f))); return tmp; } @@ -567,9 +567,9 @@ inline SIMD vsign(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = (int8x16_t)vorrq_u8( - vandq_u8(vcgtq_s8(vdupq_n_s8(0), b.m[i]), (uint8x16_t)vnegq_s8(a.m[i])), - vandq_u8(vcgtq_s8(b.m[i], vdupq_n_s8(0)), (uint8x16_t)a.m[i])); + tmp.m.val[i] = (int8x16_t)vorrq_u8( + vandq_u8(vcgtq_s8(vdupq_n_s8(0), b.m.val[i]), (uint8x16_t)vnegq_s8(a.m.val[i])), + vandq_u8(vcgtq_s8(b.m.val[i], vdupq_n_s8(0)), (uint8x16_t)a.m.val[i])); return tmp; } @@ -578,9 +578,9 @@ inline SIMD vcopysign(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = (float32x4_t)vorrq_u32( - vbicq_u32((uint32x4_t)a.m[i], (uint32x4_t)vdupq_n_f32(-0.f)), - vandq_u32((uint32x4_t)b.m[i], (uint32x4_t)vdupq_n_f32(-0.f))); + tmp.m.val[i] = (float32x4_t)vorrq_u32( + vbicq_u32((uint32x4_t)a.m.val[i], (uint32x4_t)vdupq_n_f32(-0.f)), + vandq_u32((uint32x4_t)b.m.val[i], (uint32x4_t)vdupq_n_f32(-0.f))); return tmp; } @@ -589,7 +589,7 @@ inline SIMD vorr(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vorrq_u8(a.m[i], b.m[i]); + tmp.m.val[i] = vorrq_u8(a.m.val[i], b.m.val[i]); return tmp; } @@ -598,7 +598,7 @@ inline SIMD vorr(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vorrq_u16(a.m[i], b.m[i]); + tmp.m.val[i] = vorrq_u16(a.m.val[i], b.m.val[i]); return tmp; } @@ -607,7 +607,7 @@ inline SIMD vorr(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vorrq_u32(a.m[i], b.m[i]); + tmp.m.val[i] = vorrq_u32(a.m.val[i], b.m.val[i]); return tmp; } @@ -616,7 +616,7 @@ inline SIMD vorr(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vorrq_u64(a.m[i], b.m[i]); + tmp.m.val[i] = vorrq_u64(a.m.val[i], b.m.val[i]); return tmp; } @@ -625,7 +625,7 @@ inline SIMD vand(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vandq_u8(a.m[i], b.m[i]); + tmp.m.val[i] = vandq_u8(a.m.val[i], b.m.val[i]); return tmp; } @@ -634,7 +634,7 @@ inline SIMD vand(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vandq_u16(a.m[i], b.m[i]); + tmp.m.val[i] = vandq_u16(a.m.val[i], b.m.val[i]); return tmp; } @@ -643,7 +643,7 @@ inline SIMD vand(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vandq_u32(a.m[i], b.m[i]); + tmp.m.val[i] = vandq_u32(a.m.val[i], b.m.val[i]); return tmp; } @@ -652,7 +652,7 @@ inline SIMD vand(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vandq_u64(a.m[i], b.m[i]); + tmp.m.val[i] = vandq_u64(a.m.val[i], b.m.val[i]); return tmp; } @@ -661,7 +661,7 @@ inline SIMD veor(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = veorq_u8(a.m[i], b.m[i]); + tmp.m.val[i] = veorq_u8(a.m.val[i], b.m.val[i]); return tmp; } @@ -670,7 +670,7 @@ inline SIMD veor(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = veorq_u16(a.m[i], b.m[i]); + tmp.m.val[i] = veorq_u16(a.m.val[i], b.m.val[i]); return tmp; } @@ -679,7 +679,7 @@ inline SIMD veor(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = veorq_u32(a.m[i], b.m[i]); + tmp.m.val[i] = veorq_u32(a.m.val[i], b.m.val[i]); return tmp; } @@ -688,7 +688,7 @@ inline SIMD veor(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = veorq_u64(a.m[i], b.m[i]); + tmp.m.val[i] = veorq_u64(a.m.val[i], b.m.val[i]); return tmp; } @@ -697,7 +697,7 @@ inline SIMD vbic(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vbicq_u8(a.m[i], b.m[i]); + tmp.m.val[i] = vbicq_u8(a.m.val[i], b.m.val[i]); return tmp; } @@ -706,7 +706,7 @@ inline SIMD vbic(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vbicq_u16(a.m[i], b.m[i]); + tmp.m.val[i] = vbicq_u16(a.m.val[i], b.m.val[i]); return tmp; } @@ -715,7 +715,7 @@ inline SIMD vbic(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vbicq_u32(a.m[i], b.m[i]); + tmp.m.val[i] = vbicq_u32(a.m.val[i], b.m.val[i]); return tmp; } @@ -724,7 +724,7 @@ inline SIMD vbic(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vbicq_u64(a.m[i], b.m[i]); + tmp.m.val[i] = vbicq_u64(a.m.val[i], b.m.val[i]); return tmp; } @@ -733,7 +733,7 @@ inline SIMD vbsl(SIMD a, SIMD b, SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vbslq_u8(a.m[i], b.m[i], c.m[i]); + tmp.m.val[i] = vbslq_u8(a.m.val[i], b.m.val[i], c.m.val[i]); return tmp; } @@ -742,7 +742,7 @@ inline SIMD vbsl(SIMD a, SIMD b, SIMD< { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vbslq_u16(a.m[i], b.m[i], c.m[i]); + tmp.m.val[i] = vbslq_u16(a.m.val[i], b.m.val[i], c.m.val[i]); return tmp; } @@ -751,7 +751,7 @@ inline SIMD vbsl(SIMD a, SIMD b, SIMD< { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vbslq_u32(a.m[i], b.m[i], c.m[i]); + tmp.m.val[i] = vbslq_u32(a.m.val[i], b.m.val[i], c.m.val[i]); return tmp; } @@ -760,7 +760,7 @@ inline SIMD vbsl(SIMD a, SIMD b, SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vbslq_u64(a.m[i], b.m[i], c.m[i]); + tmp.m.val[i] = vbslq_u64(a.m.val[i], b.m.val[i], c.m.val[i]); return tmp; } @@ -769,7 +769,7 @@ inline SIMD vceqz(SIMD a) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vceqq_f32(a.m[i], vdupq_n_f32(0.f)); + tmp.m.val[i] = vceqq_f32(a.m.val[i], vdupq_n_f32(0.f)); return tmp; } @@ -778,7 +778,7 @@ inline SIMD vceqz(SIMD a) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vceqq_s8(a.m[i], vdupq_n_s8(0)); + tmp.m.val[i] = vceqq_s8(a.m.val[i], vdupq_n_s8(0)); return tmp; } @@ -787,7 +787,7 @@ inline SIMD vceqz(SIMD a) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vceqq_s16(a.m[i], vdupq_n_s16(0)); + tmp.m.val[i] = vceqq_s16(a.m.val[i], vdupq_n_s16(0)); return tmp; } @@ -796,7 +796,7 @@ inline SIMD vceqz(SIMD a) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vceqq_s32(a.m[i], vdupq_n_s32(0)); + tmp.m.val[i] = vceqq_s32(a.m.val[i], vdupq_n_s32(0)); return tmp; } @@ -805,7 +805,7 @@ inline SIMD vceq(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vceqq_f32(a.m[i], b.m[i]); + tmp.m.val[i] = vceqq_f32(a.m.val[i], b.m.val[i]); return tmp; } @@ -814,7 +814,7 @@ inline SIMD vceq(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vceqq_s8(a.m[i], b.m[i]); + tmp.m.val[i] = vceqq_s8(a.m.val[i], b.m.val[i]); return tmp; } @@ -823,7 +823,7 @@ inline SIMD vceq(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vceqq_s16(a.m[i], b.m[i]); + tmp.m.val[i] = vceqq_s16(a.m.val[i], b.m.val[i]); return tmp; } @@ -832,7 +832,7 @@ inline SIMD vceq(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vceqq_s32(a.m[i], b.m[i]); + tmp.m.val[i] = vceqq_s32(a.m.val[i], b.m.val[i]); return tmp; } @@ -841,7 +841,7 @@ inline SIMD vcgtz(SIMD a) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vcgtq_f32(a.m[i], vdupq_n_f32(0.f)); + tmp.m.val[i] = vcgtq_f32(a.m.val[i], vdupq_n_f32(0.f)); return tmp; } @@ -850,7 +850,7 @@ inline SIMD vcgtz(SIMD a) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vcgtq_s8(a.m[i], vdupq_n_s8(0)); + tmp.m.val[i] = vcgtq_s8(a.m.val[i], vdupq_n_s8(0)); return tmp; } @@ -859,7 +859,7 @@ inline SIMD vcgtz(SIMD a) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vcgtq_s16(a.m[i], vdupq_n_s16(0)); + tmp.m.val[i] = vcgtq_s16(a.m.val[i], vdupq_n_s16(0)); return tmp; } @@ -868,7 +868,7 @@ inline SIMD vcgtz(SIMD a) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vcgtq_s32(a.m[i], vdupq_n_s32(0)); + tmp.m.val[i] = vcgtq_s32(a.m.val[i], vdupq_n_s32(0)); return tmp; } @@ -877,7 +877,7 @@ inline SIMD vcltz(SIMD a) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vcltq_f32(a.m[i], vdupq_n_f32(0.f)); + tmp.m.val[i] = vcltq_f32(a.m.val[i], vdupq_n_f32(0.f)); return tmp; } @@ -886,7 +886,7 @@ inline SIMD vcltz(SIMD a) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vcltq_s8(a.m[i], vdupq_n_s8(0)); + tmp.m.val[i] = vcltq_s8(a.m.val[i], vdupq_n_s8(0)); return tmp; } @@ -895,7 +895,7 @@ inline SIMD vcltz(SIMD a) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vcltq_s16(a.m[i], vdupq_n_s16(0)); + tmp.m.val[i] = vcltq_s16(a.m.val[i], vdupq_n_s16(0)); return tmp; } @@ -904,7 +904,7 @@ inline SIMD vcltz(SIMD a) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vcltq_s32(a.m[i], vdupq_n_s32(0)); + tmp.m.val[i] = vcltq_s32(a.m.val[i], vdupq_n_s32(0)); return tmp; } @@ -913,7 +913,7 @@ inline SIMD vclez(SIMD a) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vcleq_f32(a.m[i], vdupq_n_f32(0.f)); + tmp.m.val[i] = vcleq_f32(a.m.val[i], vdupq_n_f32(0.f)); return tmp; } @@ -922,7 +922,7 @@ inline SIMD vclez(SIMD a) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vcleq_s8(a.m[i], vdupq_n_s8(0)); + tmp.m.val[i] = vcleq_s8(a.m.val[i], vdupq_n_s8(0)); return tmp; } @@ -931,7 +931,7 @@ inline SIMD vclez(SIMD a) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vcleq_s16(a.m[i], vdupq_n_s16(0)); + tmp.m.val[i] = vcleq_s16(a.m.val[i], vdupq_n_s16(0)); return tmp; } @@ -940,7 +940,7 @@ inline SIMD vclez(SIMD a) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vcleq_s32(a.m[i], vdupq_n_s32(0)); + tmp.m.val[i] = vcleq_s32(a.m.val[i], vdupq_n_s32(0)); return tmp; } @@ -949,7 +949,7 @@ inline SIMD vmin(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vminq_f32(a.m[i], b.m[i]); + tmp.m.val[i] = vminq_f32(a.m.val[i], b.m.val[i]); return tmp; } @@ -958,7 +958,7 @@ inline SIMD vmin(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vminq_s8(a.m[i], b.m[i]); + tmp.m.val[i] = vminq_s8(a.m.val[i], b.m.val[i]); return tmp; } @@ -967,7 +967,7 @@ inline SIMD vmin(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vminq_s16(a.m[i], b.m[i]); + tmp.m.val[i] = vminq_s16(a.m.val[i], b.m.val[i]); return tmp; } @@ -976,7 +976,7 @@ inline SIMD vmin(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vminq_s32(a.m[i], b.m[i]); + tmp.m.val[i] = vminq_s32(a.m.val[i], b.m.val[i]); return tmp; } @@ -985,7 +985,7 @@ inline SIMD vmax(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vmaxq_f32(a.m[i], b.m[i]); + tmp.m.val[i] = vmaxq_f32(a.m.val[i], b.m.val[i]); return tmp; } @@ -994,7 +994,7 @@ inline SIMD vmax(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vmaxq_s8(a.m[i], b.m[i]); + tmp.m.val[i] = vmaxq_s8(a.m.val[i], b.m.val[i]); return tmp; } @@ -1003,7 +1003,7 @@ inline SIMD vmax(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vmaxq_s16(a.m[i], b.m[i]); + tmp.m.val[i] = vmaxq_s16(a.m.val[i], b.m.val[i]); return tmp; } @@ -1012,7 +1012,7 @@ inline SIMD vmax(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vmaxq_s32(a.m[i], b.m[i]); + tmp.m.val[i] = vmaxq_s32(a.m.val[i], b.m.val[i]); return tmp; } @@ -1021,7 +1021,7 @@ inline SIMD vclamp(SIMD x, float a, float b) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vminq_f32(vmaxq_f32(x.m[i], vdupq_n_f32(a)), vdupq_n_f32(b)); + tmp.m.val[i] = vminq_f32(vmaxq_f32(x.m.val[i], vdupq_n_f32(a)), vdupq_n_f32(b)); return tmp; } @@ -1030,7 +1030,7 @@ inline SIMD vclamp(SIMD x, int8_t a, int8_t b) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vminq_s8(vmaxq_s8(x.m[i], vdupq_n_s8(a)), vdupq_n_s8(b)); + tmp.m.val[i] = vminq_s8(vmaxq_s8(x.m.val[i], vdupq_n_s8(a)), vdupq_n_s8(b)); return tmp; } @@ -1039,7 +1039,7 @@ inline SIMD vclamp(SIMD x, int16_t a, int16_t b) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vminq_s16(vmaxq_s16(x.m[i], vdupq_n_s16(a)), vdupq_n_s16(b)); + tmp.m.val[i] = vminq_s16(vmaxq_s16(x.m.val[i], vdupq_n_s16(a)), vdupq_n_s16(b)); return tmp; } @@ -1048,7 +1048,7 @@ inline SIMD vclamp(SIMD x, int32_t a, int32_t b) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vminq_s32(vmaxq_s32(x.m[i], vdupq_n_s32(a)), vdupq_n_s32(b)); + tmp.m.val[i] = vminq_s32(vmaxq_s32(x.m.val[i], vdupq_n_s32(a)), vdupq_n_s32(b)); return tmp; } @@ -1058,10 +1058,10 @@ inline SIMD vshuf(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vorrq_u8(vorrq_u8(vorrq_u8(vqtbl1q_u8(a.m[0], b.m[i]), - vqtbl1q_u8(a.m[1], vsubq_u8(b.m[i], vdupq_n_u8(16)))), - vqtbl1q_u8(a.m[2], vsubq_u8(b.m[i], vdupq_n_u8(32)))), - vqtbl1q_u8(a.m[3], vsubq_u8(b.m[i], vdupq_n_u8(48)))); + tmp.m.val[i] = vorrq_u8(vorrq_u8(vorrq_u8(vqtbl1q_u8(a.m.val[0], b.m.val[i]), + vqtbl1q_u8(a.m.val[1], vsubq_u8(b.m.val[i], vdupq_n_u8(16)))), + vqtbl1q_u8(a.m.val[2], vsubq_u8(b.m.val[i], vdupq_n_u8(32)))), + vqtbl1q_u8(a.m.val[3], vsubq_u8(b.m.val[i], vdupq_n_u8(48)))); return tmp; } @@ -1070,10 +1070,10 @@ inline SIMD vshuf(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 4; ++i) - tmp.m[i] = vorrq_s8(vorrq_s8(vorrq_s8(vqtbl1q_s8(a.m[0], b.m[i]), - vqtbl1q_s8(a.m[1], vsubq_u8(b.m[i], vdupq_n_u8(16)))), - vqtbl1q_s8(a.m[2], vsubq_u8(b.m[i], vdupq_n_u8(32)))), - vqtbl1q_s8(a.m[3], vsubq_u8(b.m[i], vdupq_n_u8(48)))); + tmp.m.val[i] = vorrq_s8(vorrq_s8(vorrq_s8(vqtbl1q_s8(a.m.val[0], b.m.val[i]), + vqtbl1q_s8(a.m.val[1], vsubq_u8(b.m.val[i], vdupq_n_u8(16)))), + vqtbl1q_s8(a.m.val[2], vsubq_u8(b.m.val[i], vdupq_n_u8(32)))), + vqtbl1q_s8(a.m.val[3], vsubq_u8(b.m.val[i], vdupq_n_u8(48)))); return tmp; } #endif diff --git a/neon_triple.hh b/neon_triple.hh index fc7fffd..4dc6e1d 100644 --- a/neon_triple.hh +++ b/neon_triple.hh @@ -14,7 +14,7 @@ union SIMD static const int SIZE = 12; typedef float value_type; typedef uint32_t uint_type; - float32x4_t m[3]; + float32x4x3_t m; value_type v[SIZE]; uint_type u[SIZE]; }; @@ -25,7 +25,7 @@ union SIMD static const int SIZE = 48; typedef int8_t value_type; typedef uint8_t uint_type; - int8x16_t m[3]; + int8x16x3_t m; value_type v[SIZE]; uint_type u[SIZE]; }; @@ -36,7 +36,7 @@ union SIMD static const int SIZE = 24; typedef int16_t value_type; typedef uint16_t uint_type; - int16x8_t m[3]; + int16x8x3_t m; value_type v[SIZE]; uint_type u[SIZE]; }; @@ -47,7 +47,7 @@ union SIMD static const int SIZE = 12; typedef int32_t value_type; typedef uint32_t uint_type; - int32x4_t m[3]; + int32x4x3_t m; value_type v[SIZE]; uint_type u[SIZE]; }; @@ -58,7 +58,7 @@ union SIMD static const int SIZE = 6; typedef int64_t value_type; typedef uint64_t uint_type; - int64x2_t m[3]; + int64x2x3_t m; value_type v[SIZE]; uint_type u[SIZE]; }; @@ -69,7 +69,7 @@ union SIMD static const int SIZE = 48; typedef uint8_t value_type; typedef uint8_t uint_type; - uint8x16_t m[3]; + uint8x16x3_t m; value_type v[SIZE]; uint_type u[SIZE]; }; @@ -80,7 +80,7 @@ union SIMD static const int SIZE = 24; typedef uint16_t value_type; typedef uint16_t uint_type; - uint16x8_t m[3]; + uint16x8x3_t m; value_type v[SIZE]; uint_type u[SIZE]; }; @@ -91,7 +91,7 @@ union SIMD static const int SIZE = 12; typedef uint32_t value_type; typedef uint32_t uint_type; - uint32x4_t m[3]; + uint32x4x3_t m; value_type v[SIZE]; uint_type u[SIZE]; }; @@ -102,7 +102,7 @@ union SIMD static const int SIZE = 6; typedef uint64_t value_type; typedef uint64_t uint_type; - uint64x2_t m[3]; + uint64x2x3_t m; value_type v[SIZE]; uint_type u[SIZE]; }; @@ -112,7 +112,7 @@ inline SIMD vreinterpret(SIMD a) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = (float32x4_t)a.m[i]; + tmp.m.val[i] = (float32x4_t)a.m.val[i]; return tmp; } @@ -121,7 +121,7 @@ inline SIMD vreinterpret(SIMD a) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = (uint32x4_t)a.m[i]; + tmp.m.val[i] = (uint32x4_t)a.m.val[i]; return tmp; } @@ -130,7 +130,7 @@ inline SIMD vreinterpret(SIMD a) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = (int8x16_t)a.m[i]; + tmp.m.val[i] = (int8x16_t)a.m.val[i]; return tmp; } @@ -139,7 +139,7 @@ inline SIMD vreinterpret(SIMD a) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = (uint8x16_t)a.m[i]; + tmp.m.val[i] = (uint8x16_t)a.m.val[i]; return tmp; } @@ -148,7 +148,7 @@ inline SIMD vreinterpret(SIMD a) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = (int16x8_t)a.m[i]; + tmp.m.val[i] = (int16x8_t)a.m.val[i]; return tmp; } @@ -157,7 +157,7 @@ inline SIMD vreinterpret(SIMD a) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = (uint16x8_t)a.m[i]; + tmp.m.val[i] = (uint16x8_t)a.m.val[i]; return tmp; } @@ -166,7 +166,7 @@ inline SIMD vreinterpret(SIMD a) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = (int32x4_t)a.m[i]; + tmp.m.val[i] = (int32x4_t)a.m.val[i]; return tmp; } @@ -175,7 +175,7 @@ inline SIMD vreinterpret(SIMD a) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = (uint32x4_t)a.m[i]; + tmp.m.val[i] = (uint32x4_t)a.m.val[i]; return tmp; } @@ -184,7 +184,7 @@ inline SIMD vdup(float a) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vdupq_n_f32(a); + tmp.m.val[i] = vdupq_n_f32(a); return tmp; } @@ -193,7 +193,7 @@ inline SIMD vdup(int8_t a) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vdupq_n_s8(a); + tmp.m.val[i] = vdupq_n_s8(a); return tmp; } @@ -202,7 +202,7 @@ inline SIMD vdup(int16_t a) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vdupq_n_s16(a); + tmp.m.val[i] = vdupq_n_s16(a); return tmp; } @@ -211,7 +211,7 @@ inline SIMD vdup(int32_t a) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vdupq_n_s32(a); + tmp.m.val[i] = vdupq_n_s32(a); return tmp; } @@ -220,7 +220,7 @@ inline SIMD vdup(int64_t a) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vdupq_n_s64(a); + tmp.m.val[i] = vdupq_n_s64(a); return tmp; } @@ -229,7 +229,7 @@ inline SIMD vdup(uint8_t a) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vdupq_n_u8(a); + tmp.m.val[i] = vdupq_n_u8(a); return tmp; } @@ -238,7 +238,7 @@ inline SIMD vdup(uint16_t a) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vdupq_n_u16(a); + tmp.m.val[i] = vdupq_n_u16(a); return tmp; } @@ -247,7 +247,7 @@ inline SIMD vdup(uint32_t a) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vdupq_n_u32(a); + tmp.m.val[i] = vdupq_n_u32(a); return tmp; } @@ -256,7 +256,7 @@ inline SIMD vdup(uint64_t a) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vdupq_n_u64(a); + tmp.m.val[i] = vdupq_n_u64(a); return tmp; } @@ -265,7 +265,7 @@ inline SIMD vzero() { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = (float32x4_t)veorq_u32((uint32x4_t)tmp.m[i], (uint32x4_t)tmp.m[i]); + tmp.m.val[i] = (float32x4_t)veorq_u32((uint32x4_t)tmp.m.val[i], (uint32x4_t)tmp.m.val[i]); return tmp; } @@ -274,7 +274,7 @@ inline SIMD vzero() { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = veorq_s8(tmp.m[i], tmp.m[i]); + tmp.m.val[i] = veorq_s8(tmp.m.val[i], tmp.m.val[i]); return tmp; } @@ -283,7 +283,7 @@ inline SIMD vzero() { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = veorq_s16(tmp.m[i], tmp.m[i]); + tmp.m.val[i] = veorq_s16(tmp.m.val[i], tmp.m.val[i]); return tmp; } @@ -292,7 +292,7 @@ inline SIMD vzero() { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = veorq_s32(tmp.m[i], tmp.m[i]); + tmp.m.val[i] = veorq_s32(tmp.m.val[i], tmp.m.val[i]); return tmp; } @@ -301,7 +301,7 @@ inline SIMD vzero() { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = veorq_s64(tmp.m[i], tmp.m[i]); + tmp.m.val[i] = veorq_s64(tmp.m.val[i], tmp.m.val[i]); return tmp; } @@ -310,7 +310,7 @@ inline SIMD vzero() { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = veorq_u8(tmp.m[i], tmp.m[i]); + tmp.m.val[i] = veorq_u8(tmp.m.val[i], tmp.m.val[i]); return tmp; } @@ -319,7 +319,7 @@ inline SIMD vzero() { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = veorq_u16(tmp.m[i], tmp.m[i]); + tmp.m.val[i] = veorq_u16(tmp.m.val[i], tmp.m.val[i]); return tmp; } @@ -328,7 +328,7 @@ inline SIMD vzero() { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = veorq_u32(tmp.m[i], tmp.m[i]); + tmp.m.val[i] = veorq_u32(tmp.m.val[i], tmp.m.val[i]); return tmp; } @@ -337,7 +337,7 @@ inline SIMD vzero() { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = veorq_u64(tmp.m[i], tmp.m[i]); + tmp.m.val[i] = veorq_u64(tmp.m.val[i], tmp.m.val[i]); return tmp; } @@ -346,7 +346,7 @@ inline SIMD vadd(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vaddq_f32(a.m[i], b.m[i]); + tmp.m.val[i] = vaddq_f32(a.m.val[i], b.m.val[i]); return tmp; } @@ -355,7 +355,7 @@ inline SIMD vadd(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vaddq_s8(a.m[i], b.m[i]); + tmp.m.val[i] = vaddq_s8(a.m.val[i], b.m.val[i]); return tmp; } @@ -364,7 +364,7 @@ inline SIMD vadd(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vaddq_s16(a.m[i], b.m[i]); + tmp.m.val[i] = vaddq_s16(a.m.val[i], b.m.val[i]); return tmp; } @@ -373,7 +373,7 @@ inline SIMD vadd(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vaddq_s32(a.m[i], b.m[i]); + tmp.m.val[i] = vaddq_s32(a.m.val[i], b.m.val[i]); return tmp; } @@ -382,7 +382,7 @@ inline SIMD vadd(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vaddq_s64(a.m[i], b.m[i]); + tmp.m.val[i] = vaddq_s64(a.m.val[i], b.m.val[i]); return tmp; } @@ -391,7 +391,7 @@ inline SIMD vqadd(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vqaddq_s8(a.m[i], b.m[i]); + tmp.m.val[i] = vqaddq_s8(a.m.val[i], b.m.val[i]); return tmp; } @@ -400,7 +400,7 @@ inline SIMD vqadd(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vqaddq_s16(a.m[i], b.m[i]); + tmp.m.val[i] = vqaddq_s16(a.m.val[i], b.m.val[i]); return tmp; } @@ -409,7 +409,7 @@ inline SIMD vsub(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vsubq_f32(a.m[i], b.m[i]); + tmp.m.val[i] = vsubq_f32(a.m.val[i], b.m.val[i]); return tmp; } @@ -418,7 +418,7 @@ inline SIMD vsub(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vsubq_s8(a.m[i], b.m[i]); + tmp.m.val[i] = vsubq_s8(a.m.val[i], b.m.val[i]); return tmp; } @@ -427,7 +427,7 @@ inline SIMD vsub(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vsubq_s16(a.m[i], b.m[i]); + tmp.m.val[i] = vsubq_s16(a.m.val[i], b.m.val[i]); return tmp; } @@ -436,7 +436,7 @@ inline SIMD vsub(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vsubq_s32(a.m[i], b.m[i]); + tmp.m.val[i] = vsubq_s32(a.m.val[i], b.m.val[i]); return tmp; } @@ -445,7 +445,7 @@ inline SIMD vsub(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vsubq_s64(a.m[i], b.m[i]); + tmp.m.val[i] = vsubq_s64(a.m.val[i], b.m.val[i]); return tmp; } @@ -454,7 +454,7 @@ inline SIMD vqsub(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vqsubq_s8(a.m[i], b.m[i]); + tmp.m.val[i] = vqsubq_s8(a.m.val[i], b.m.val[i]); return tmp; } @@ -463,7 +463,7 @@ inline SIMD vqsub(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vqsubq_s16(a.m[i], b.m[i]); + tmp.m.val[i] = vqsubq_s16(a.m.val[i], b.m.val[i]); return tmp; } @@ -472,7 +472,7 @@ inline SIMD vqsub(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vqsubq_u8(a.m[i], b.m[i]); + tmp.m.val[i] = vqsubq_u8(a.m.val[i], b.m.val[i]); return tmp; } @@ -481,7 +481,7 @@ inline SIMD vqsub(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vqsubq_u16(a.m[i], b.m[i]); + tmp.m.val[i] = vqsubq_u16(a.m.val[i], b.m.val[i]); return tmp; } @@ -490,7 +490,7 @@ inline SIMD vmul(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vmulq_f32(a.m[i], b.m[i]); + tmp.m.val[i] = vmulq_f32(a.m.val[i], b.m.val[i]); return tmp; } @@ -499,7 +499,7 @@ inline SIMD vmul(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vmulq_s8(a.m[i], b.m[i]); + tmp.m.val[i] = vmulq_s8(a.m.val[i], b.m.val[i]); return tmp; } @@ -508,7 +508,7 @@ inline SIMD vabs(SIMD a) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vabsq_f32(a.m[i]); + tmp.m.val[i] = vabsq_f32(a.m.val[i]); return tmp; } @@ -517,7 +517,7 @@ inline SIMD vqabs(SIMD a) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vqabsq_s8(a.m[i]); + tmp.m.val[i] = vqabsq_s8(a.m.val[i]); return tmp; } @@ -526,7 +526,7 @@ inline SIMD vqabs(SIMD a) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vqabsq_s16(a.m[i]); + tmp.m.val[i] = vqabsq_s16(a.m.val[i]); return tmp; } @@ -535,9 +535,9 @@ inline SIMD vsignum(SIMD a) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = (float32x4_t)vbicq_u32( - veorq_u32((uint32x4_t)vdupq_n_f32(1.f), vandq_u32((uint32x4_t)vdupq_n_f32(-0.f), (uint32x4_t)a.m[i])), - vceqq_f32(a.m[i], vdupq_n_f32(0.f))); + tmp.m.val[i] = (float32x4_t)vbicq_u32( + veorq_u32((uint32x4_t)vdupq_n_f32(1.f), vandq_u32((uint32x4_t)vdupq_n_f32(-0.f), (uint32x4_t)a.m.val[i])), + vceqq_f32(a.m.val[i], vdupq_n_f32(0.f))); return tmp; } @@ -546,8 +546,8 @@ inline SIMD vsignum(SIMD a) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = (int8x16_t)vorrq_u8(vcgtq_s8(vdupq_n_s8(0), a.m[i]), - vandq_u8(vcgtq_s8(a.m[i], vdupq_n_s8(0)), (uint8x16_t)vdupq_n_s8(1))); + tmp.m.val[i] = (int8x16_t)vorrq_u8(vcgtq_s8(vdupq_n_s8(0), a.m.val[i]), + vandq_u8(vcgtq_s8(a.m.val[i], vdupq_n_s8(0)), (uint8x16_t)vdupq_n_s8(1))); return tmp; } @@ -556,9 +556,9 @@ inline SIMD vsign(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = (float32x4_t)vbicq_u32( - veorq_u32((uint32x4_t)a.m[i], vandq_u32((uint32x4_t)vdupq_n_f32(-0.f), (uint32x4_t)b.m[i])), - vceqq_f32(b.m[i], vdupq_n_f32(0.f))); + tmp.m.val[i] = (float32x4_t)vbicq_u32( + veorq_u32((uint32x4_t)a.m.val[i], vandq_u32((uint32x4_t)vdupq_n_f32(-0.f), (uint32x4_t)b.m.val[i])), + vceqq_f32(b.m.val[i], vdupq_n_f32(0.f))); return tmp; } @@ -567,9 +567,9 @@ inline SIMD vsign(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = (int8x16_t)vorrq_u8( - vandq_u8(vcgtq_s8(vdupq_n_s8(0), b.m[i]), (uint8x16_t)vnegq_s8(a.m[i])), - vandq_u8(vcgtq_s8(b.m[i], vdupq_n_s8(0)), (uint8x16_t)a.m[i])); + tmp.m.val[i] = (int8x16_t)vorrq_u8( + vandq_u8(vcgtq_s8(vdupq_n_s8(0), b.m.val[i]), (uint8x16_t)vnegq_s8(a.m.val[i])), + vandq_u8(vcgtq_s8(b.m.val[i], vdupq_n_s8(0)), (uint8x16_t)a.m.val[i])); return tmp; } @@ -578,9 +578,9 @@ inline SIMD vcopysign(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = (float32x4_t)vorrq_u32( - vbicq_u32((uint32x4_t)a.m[i], (uint32x4_t)vdupq_n_f32(-0.f)), - vandq_u32((uint32x4_t)b.m[i], (uint32x4_t)vdupq_n_f32(-0.f))); + tmp.m.val[i] = (float32x4_t)vorrq_u32( + vbicq_u32((uint32x4_t)a.m.val[i], (uint32x4_t)vdupq_n_f32(-0.f)), + vandq_u32((uint32x4_t)b.m.val[i], (uint32x4_t)vdupq_n_f32(-0.f))); return tmp; } @@ -589,7 +589,7 @@ inline SIMD vorr(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vorrq_u8(a.m[i], b.m[i]); + tmp.m.val[i] = vorrq_u8(a.m.val[i], b.m.val[i]); return tmp; } @@ -598,7 +598,7 @@ inline SIMD vorr(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vorrq_u16(a.m[i], b.m[i]); + tmp.m.val[i] = vorrq_u16(a.m.val[i], b.m.val[i]); return tmp; } @@ -607,7 +607,7 @@ inline SIMD vorr(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vorrq_u32(a.m[i], b.m[i]); + tmp.m.val[i] = vorrq_u32(a.m.val[i], b.m.val[i]); return tmp; } @@ -616,7 +616,7 @@ inline SIMD vorr(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vorrq_u64(a.m[i], b.m[i]); + tmp.m.val[i] = vorrq_u64(a.m.val[i], b.m.val[i]); return tmp; } @@ -625,7 +625,7 @@ inline SIMD vand(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vandq_u8(a.m[i], b.m[i]); + tmp.m.val[i] = vandq_u8(a.m.val[i], b.m.val[i]); return tmp; } @@ -634,7 +634,7 @@ inline SIMD vand(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vandq_u16(a.m[i], b.m[i]); + tmp.m.val[i] = vandq_u16(a.m.val[i], b.m.val[i]); return tmp; } @@ -643,7 +643,7 @@ inline SIMD vand(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vandq_u32(a.m[i], b.m[i]); + tmp.m.val[i] = vandq_u32(a.m.val[i], b.m.val[i]); return tmp; } @@ -652,7 +652,7 @@ inline SIMD vand(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vandq_u64(a.m[i], b.m[i]); + tmp.m.val[i] = vandq_u64(a.m.val[i], b.m.val[i]); return tmp; } @@ -661,7 +661,7 @@ inline SIMD veor(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = veorq_u8(a.m[i], b.m[i]); + tmp.m.val[i] = veorq_u8(a.m.val[i], b.m.val[i]); return tmp; } @@ -670,7 +670,7 @@ inline SIMD veor(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = veorq_u16(a.m[i], b.m[i]); + tmp.m.val[i] = veorq_u16(a.m.val[i], b.m.val[i]); return tmp; } @@ -679,7 +679,7 @@ inline SIMD veor(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = veorq_u32(a.m[i], b.m[i]); + tmp.m.val[i] = veorq_u32(a.m.val[i], b.m.val[i]); return tmp; } @@ -688,7 +688,7 @@ inline SIMD veor(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = veorq_u64(a.m[i], b.m[i]); + tmp.m.val[i] = veorq_u64(a.m.val[i], b.m.val[i]); return tmp; } @@ -697,7 +697,7 @@ inline SIMD vbic(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vbicq_u8(a.m[i], b.m[i]); + tmp.m.val[i] = vbicq_u8(a.m.val[i], b.m.val[i]); return tmp; } @@ -706,7 +706,7 @@ inline SIMD vbic(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vbicq_u16(a.m[i], b.m[i]); + tmp.m.val[i] = vbicq_u16(a.m.val[i], b.m.val[i]); return tmp; } @@ -715,7 +715,7 @@ inline SIMD vbic(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vbicq_u32(a.m[i], b.m[i]); + tmp.m.val[i] = vbicq_u32(a.m.val[i], b.m.val[i]); return tmp; } @@ -724,7 +724,7 @@ inline SIMD vbic(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vbicq_u64(a.m[i], b.m[i]); + tmp.m.val[i] = vbicq_u64(a.m.val[i], b.m.val[i]); return tmp; } @@ -733,7 +733,7 @@ inline SIMD vbsl(SIMD a, SIMD b, SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vbslq_u8(a.m[i], b.m[i], c.m[i]); + tmp.m.val[i] = vbslq_u8(a.m.val[i], b.m.val[i], c.m.val[i]); return tmp; } @@ -742,7 +742,7 @@ inline SIMD vbsl(SIMD a, SIMD b, SIMD< { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vbslq_u16(a.m[i], b.m[i], c.m[i]); + tmp.m.val[i] = vbslq_u16(a.m.val[i], b.m.val[i], c.m.val[i]); return tmp; } @@ -751,7 +751,7 @@ inline SIMD vbsl(SIMD a, SIMD b, SIMD< { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vbslq_u32(a.m[i], b.m[i], c.m[i]); + tmp.m.val[i] = vbslq_u32(a.m.val[i], b.m.val[i], c.m.val[i]); return tmp; } @@ -760,7 +760,7 @@ inline SIMD vbsl(SIMD a, SIMD b, SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vbslq_u64(a.m[i], b.m[i], c.m[i]); + tmp.m.val[i] = vbslq_u64(a.m.val[i], b.m.val[i], c.m.val[i]); return tmp; } @@ -769,7 +769,7 @@ inline SIMD vceqz(SIMD a) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vceqq_f32(a.m[i], vdupq_n_f32(0.f)); + tmp.m.val[i] = vceqq_f32(a.m.val[i], vdupq_n_f32(0.f)); return tmp; } @@ -778,7 +778,7 @@ inline SIMD vceqz(SIMD a) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vceqq_s8(a.m[i], vdupq_n_s8(0)); + tmp.m.val[i] = vceqq_s8(a.m.val[i], vdupq_n_s8(0)); return tmp; } @@ -787,7 +787,7 @@ inline SIMD vceqz(SIMD a) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vceqq_s16(a.m[i], vdupq_n_s16(0)); + tmp.m.val[i] = vceqq_s16(a.m.val[i], vdupq_n_s16(0)); return tmp; } @@ -796,7 +796,7 @@ inline SIMD vceqz(SIMD a) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vceqq_s32(a.m[i], vdupq_n_s32(0)); + tmp.m.val[i] = vceqq_s32(a.m.val[i], vdupq_n_s32(0)); return tmp; } @@ -805,7 +805,7 @@ inline SIMD vceq(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vceqq_f32(a.m[i], b.m[i]); + tmp.m.val[i] = vceqq_f32(a.m.val[i], b.m.val[i]); return tmp; } @@ -814,7 +814,7 @@ inline SIMD vceq(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vceqq_s8(a.m[i], b.m[i]); + tmp.m.val[i] = vceqq_s8(a.m.val[i], b.m.val[i]); return tmp; } @@ -823,7 +823,7 @@ inline SIMD vceq(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vceqq_s16(a.m[i], b.m[i]); + tmp.m.val[i] = vceqq_s16(a.m.val[i], b.m.val[i]); return tmp; } @@ -832,7 +832,7 @@ inline SIMD vceq(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vceqq_s32(a.m[i], b.m[i]); + tmp.m.val[i] = vceqq_s32(a.m.val[i], b.m.val[i]); return tmp; } @@ -841,7 +841,7 @@ inline SIMD vcgtz(SIMD a) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vcgtq_f32(a.m[i], vdupq_n_f32(0.f)); + tmp.m.val[i] = vcgtq_f32(a.m.val[i], vdupq_n_f32(0.f)); return tmp; } @@ -850,7 +850,7 @@ inline SIMD vcgtz(SIMD a) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vcgtq_s8(a.m[i], vdupq_n_s8(0)); + tmp.m.val[i] = vcgtq_s8(a.m.val[i], vdupq_n_s8(0)); return tmp; } @@ -859,7 +859,7 @@ inline SIMD vcgtz(SIMD a) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vcgtq_s16(a.m[i], vdupq_n_s16(0)); + tmp.m.val[i] = vcgtq_s16(a.m.val[i], vdupq_n_s16(0)); return tmp; } @@ -868,7 +868,7 @@ inline SIMD vcgtz(SIMD a) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vcgtq_s32(a.m[i], vdupq_n_s32(0)); + tmp.m.val[i] = vcgtq_s32(a.m.val[i], vdupq_n_s32(0)); return tmp; } @@ -877,7 +877,7 @@ inline SIMD vcltz(SIMD a) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vcltq_f32(a.m[i], vdupq_n_f32(0.f)); + tmp.m.val[i] = vcltq_f32(a.m.val[i], vdupq_n_f32(0.f)); return tmp; } @@ -886,7 +886,7 @@ inline SIMD vcltz(SIMD a) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vcltq_s8(a.m[i], vdupq_n_s8(0)); + tmp.m.val[i] = vcltq_s8(a.m.val[i], vdupq_n_s8(0)); return tmp; } @@ -895,7 +895,7 @@ inline SIMD vcltz(SIMD a) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vcltq_s16(a.m[i], vdupq_n_s16(0)); + tmp.m.val[i] = vcltq_s16(a.m.val[i], vdupq_n_s16(0)); return tmp; } @@ -904,7 +904,7 @@ inline SIMD vcltz(SIMD a) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vcltq_s32(a.m[i], vdupq_n_s32(0)); + tmp.m.val[i] = vcltq_s32(a.m.val[i], vdupq_n_s32(0)); return tmp; } @@ -913,7 +913,7 @@ inline SIMD vclez(SIMD a) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vcleq_f32(a.m[i], vdupq_n_f32(0.f)); + tmp.m.val[i] = vcleq_f32(a.m.val[i], vdupq_n_f32(0.f)); return tmp; } @@ -922,7 +922,7 @@ inline SIMD vclez(SIMD a) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vcleq_s8(a.m[i], vdupq_n_s8(0)); + tmp.m.val[i] = vcleq_s8(a.m.val[i], vdupq_n_s8(0)); return tmp; } @@ -931,7 +931,7 @@ inline SIMD vclez(SIMD a) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vcleq_s16(a.m[i], vdupq_n_s16(0)); + tmp.m.val[i] = vcleq_s16(a.m.val[i], vdupq_n_s16(0)); return tmp; } @@ -940,7 +940,7 @@ inline SIMD vclez(SIMD a) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vcleq_s32(a.m[i], vdupq_n_s32(0)); + tmp.m.val[i] = vcleq_s32(a.m.val[i], vdupq_n_s32(0)); return tmp; } @@ -949,7 +949,7 @@ inline SIMD vmin(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vminq_f32(a.m[i], b.m[i]); + tmp.m.val[i] = vminq_f32(a.m.val[i], b.m.val[i]); return tmp; } @@ -958,7 +958,7 @@ inline SIMD vmin(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vminq_s8(a.m[i], b.m[i]); + tmp.m.val[i] = vminq_s8(a.m.val[i], b.m.val[i]); return tmp; } @@ -967,7 +967,7 @@ inline SIMD vmin(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vminq_s16(a.m[i], b.m[i]); + tmp.m.val[i] = vminq_s16(a.m.val[i], b.m.val[i]); return tmp; } @@ -976,7 +976,7 @@ inline SIMD vmin(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vminq_s32(a.m[i], b.m[i]); + tmp.m.val[i] = vminq_s32(a.m.val[i], b.m.val[i]); return tmp; } @@ -985,7 +985,7 @@ inline SIMD vmax(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vmaxq_f32(a.m[i], b.m[i]); + tmp.m.val[i] = vmaxq_f32(a.m.val[i], b.m.val[i]); return tmp; } @@ -994,7 +994,7 @@ inline SIMD vmax(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vmaxq_s8(a.m[i], b.m[i]); + tmp.m.val[i] = vmaxq_s8(a.m.val[i], b.m.val[i]); return tmp; } @@ -1003,7 +1003,7 @@ inline SIMD vmax(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vmaxq_s16(a.m[i], b.m[i]); + tmp.m.val[i] = vmaxq_s16(a.m.val[i], b.m.val[i]); return tmp; } @@ -1012,7 +1012,7 @@ inline SIMD vmax(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vmaxq_s32(a.m[i], b.m[i]); + tmp.m.val[i] = vmaxq_s32(a.m.val[i], b.m.val[i]); return tmp; } @@ -1021,7 +1021,7 @@ inline SIMD vclamp(SIMD x, float a, float b) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vminq_f32(vmaxq_f32(x.m[i], vdupq_n_f32(a)), vdupq_n_f32(b)); + tmp.m.val[i] = vminq_f32(vmaxq_f32(x.m.val[i], vdupq_n_f32(a)), vdupq_n_f32(b)); return tmp; } @@ -1030,7 +1030,7 @@ inline SIMD vclamp(SIMD x, int8_t a, int8_t b) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vminq_s8(vmaxq_s8(x.m[i], vdupq_n_s8(a)), vdupq_n_s8(b)); + tmp.m.val[i] = vminq_s8(vmaxq_s8(x.m.val[i], vdupq_n_s8(a)), vdupq_n_s8(b)); return tmp; } @@ -1039,7 +1039,7 @@ inline SIMD vclamp(SIMD x, int16_t a, int16_t b) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vminq_s16(vmaxq_s16(x.m[i], vdupq_n_s16(a)), vdupq_n_s16(b)); + tmp.m.val[i] = vminq_s16(vmaxq_s16(x.m.val[i], vdupq_n_s16(a)), vdupq_n_s16(b)); return tmp; } @@ -1048,7 +1048,7 @@ inline SIMD vclamp(SIMD x, int32_t a, int32_t b) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vminq_s32(vmaxq_s32(x.m[i], vdupq_n_s32(a)), vdupq_n_s32(b)); + tmp.m.val[i] = vminq_s32(vmaxq_s32(x.m.val[i], vdupq_n_s32(a)), vdupq_n_s32(b)); return tmp; } @@ -1058,9 +1058,9 @@ inline SIMD vshuf(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vorrq_u8(vorrq_u8(vqtbl1q_u8(a.m[0], b.m[i]), - vqtbl1q_u8(a.m[1], vsubq_u8(b.m[i], vdupq_n_u8(16)))), - vqtbl1q_u8(a.m[2], vsubq_u8(b.m[i], vdupq_n_u8(32)))); + tmp.m.val[i] = vorrq_u8(vorrq_u8(vqtbl1q_u8(a.m.val[0], b.m.val[i]), + vqtbl1q_u8(a.m.val[1], vsubq_u8(b.m.val[i], vdupq_n_u8(16)))), + vqtbl1q_u8(a.m.val[2], vsubq_u8(b.m.val[i], vdupq_n_u8(32)))); return tmp; } @@ -1069,9 +1069,9 @@ inline SIMD vshuf(SIMD a, SIMD b) { SIMD tmp; for (int i = 0; i < 3; ++i) - tmp.m[i] = vorrq_s8(vorrq_s8(vqtbl1q_s8(a.m[0], b.m[i]), - vqtbl1q_s8(a.m[1], vsubq_u8(b.m[i], vdupq_n_u8(16)))), - vqtbl1q_s8(a.m[2], vsubq_u8(b.m[i], vdupq_n_u8(32)))); + tmp.m.val[i] = vorrq_s8(vorrq_s8(vqtbl1q_s8(a.m.val[0], b.m.val[i]), + vqtbl1q_s8(a.m.val[1], vsubq_u8(b.m.val[i], vdupq_n_u8(16)))), + vqtbl1q_s8(a.m.val[2], vsubq_u8(b.m.val[i], vdupq_n_u8(32)))); return tmp; } #endif