diff --git a/neon_double.hh b/neon_double.hh index e559dce..141b95e 100644 --- a/neon_double.hh +++ b/neon_double.hh @@ -111,8 +111,8 @@ template <> inline SIMD vreinterpret(SIMD a) { SIMD tmp; - tmp.m[0] = (float32x4_t)a.m[0]; - tmp.m[1] = (float32x4_t)a.m[1]; + for (int i = 0; i < 2; ++i) + tmp.m[i] = (float32x4_t)a.m[i]; return tmp; } @@ -120,8 +120,8 @@ template <> inline SIMD vreinterpret(SIMD a) { SIMD tmp; - tmp.m[0] = (uint32x4_t)a.m[0]; - tmp.m[1] = (uint32x4_t)a.m[1]; + for (int i = 0; i < 2; ++i) + tmp.m[i] = (uint32x4_t)a.m[i]; return tmp; } @@ -129,8 +129,8 @@ template <> inline SIMD vreinterpret(SIMD a) { SIMD tmp; - tmp.m[0] = (int8x16_t)a.m[0]; - tmp.m[1] = (int8x16_t)a.m[1]; + for (int i = 0; i < 2; ++i) + tmp.m[i] = (int8x16_t)a.m[i]; return tmp; } @@ -138,8 +138,8 @@ template <> inline SIMD vreinterpret(SIMD a) { SIMD tmp; - tmp.m[0] = (uint8x16_t)a.m[0]; - tmp.m[1] = (uint8x16_t)a.m[1]; + for (int i = 0; i < 2; ++i) + tmp.m[i] = (uint8x16_t)a.m[i]; return tmp; } @@ -147,8 +147,8 @@ template <> inline SIMD vreinterpret(SIMD a) { SIMD tmp; - tmp.m[0] = (int16x8_t)a.m[0]; - tmp.m[1] = (int16x8_t)a.m[1]; + for (int i = 0; i < 2; ++i) + tmp.m[i] = (int16x8_t)a.m[i]; return tmp; } @@ -156,8 +156,8 @@ template <> inline SIMD vreinterpret(SIMD a) { SIMD tmp; - tmp.m[0] = (uint16x8_t)a.m[0]; - tmp.m[1] = (uint16x8_t)a.m[1]; + for (int i = 0; i < 2; ++i) + tmp.m[i] = (uint16x8_t)a.m[i]; return tmp; } @@ -165,8 +165,8 @@ template <> inline SIMD vreinterpret(SIMD a) { SIMD tmp; - tmp.m[0] = (int32x4_t)a.m[0]; - tmp.m[1] = (int32x4_t)a.m[1]; + for (int i = 0; i < 2; ++i) + tmp.m[i] = (int32x4_t)a.m[i]; return tmp; } @@ -174,8 +174,8 @@ template <> inline SIMD vreinterpret(SIMD a) { SIMD tmp; - tmp.m[0] = (uint32x4_t)a.m[0]; - tmp.m[1] = (uint32x4_t)a.m[1]; + for (int i = 0; i < 2; ++i) + tmp.m[i] = (uint32x4_t)a.m[i]; return tmp; } @@ -183,8 +183,8 @@ template <> inline SIMD vdup(float a) { SIMD tmp; - tmp.m[0] = vdupq_n_f32(a); - tmp.m[1] = vdupq_n_f32(a); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vdupq_n_f32(a); return tmp; } @@ -192,8 +192,8 @@ template <> inline SIMD vdup(int8_t a) { SIMD tmp; - tmp.m[0] = vdupq_n_s8(a); - tmp.m[1] = vdupq_n_s8(a); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vdupq_n_s8(a); return tmp; } @@ -201,8 +201,8 @@ template <> inline SIMD vdup(int16_t a) { SIMD tmp; - tmp.m[0] = vdupq_n_s16(a); - tmp.m[1] = vdupq_n_s16(a); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vdupq_n_s16(a); return tmp; } @@ -210,8 +210,8 @@ template <> inline SIMD vdup(int32_t a) { SIMD tmp; - tmp.m[0] = vdupq_n_s32(a); - tmp.m[1] = vdupq_n_s32(a); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vdupq_n_s32(a); return tmp; } @@ -219,8 +219,8 @@ template <> inline SIMD vdup(int64_t a) { SIMD tmp; - tmp.m[0] = vdupq_n_s64(a); - tmp.m[1] = vdupq_n_s64(a); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vdupq_n_s64(a); return tmp; } @@ -228,8 +228,8 @@ template <> inline SIMD vdup(uint8_t a) { SIMD tmp; - tmp.m[0] = vdupq_n_u8(a); - tmp.m[1] = vdupq_n_u8(a); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vdupq_n_u8(a); return tmp; } @@ -237,8 +237,8 @@ template <> inline SIMD vdup(uint16_t a) { SIMD tmp; - tmp.m[0] = vdupq_n_u16(a); - tmp.m[1] = vdupq_n_u16(a); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vdupq_n_u16(a); return tmp; } @@ -246,8 +246,8 @@ template <> inline SIMD vdup(uint32_t a) { SIMD tmp; - tmp.m[0] = vdupq_n_u32(a); - tmp.m[1] = vdupq_n_u32(a); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vdupq_n_u32(a); return tmp; } @@ -255,8 +255,8 @@ template <> inline SIMD vdup(uint64_t a) { SIMD tmp; - tmp.m[0] = vdupq_n_u64(a); - tmp.m[1] = vdupq_n_u64(a); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vdupq_n_u64(a); return tmp; } @@ -264,8 +264,8 @@ template <> inline SIMD vzero() { SIMD tmp; - tmp.m[0] = (float32x4_t)veorq_u32((uint32x4_t)tmp.m[0], (uint32x4_t)tmp.m[0]); - tmp.m[1] = (float32x4_t)veorq_u32((uint32x4_t)tmp.m[1], (uint32x4_t)tmp.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = (float32x4_t)veorq_u32((uint32x4_t)tmp.m[i], (uint32x4_t)tmp.m[i]); return tmp; } @@ -273,8 +273,8 @@ template <> inline SIMD vzero() { SIMD tmp; - tmp.m[0] = veorq_s8(tmp.m[0], tmp.m[0]); - tmp.m[1] = veorq_s8(tmp.m[1], tmp.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = veorq_s8(tmp.m[i], tmp.m[i]); return tmp; } @@ -282,8 +282,8 @@ template <> inline SIMD vzero() { SIMD tmp; - tmp.m[0] = veorq_s16(tmp.m[0], tmp.m[0]); - tmp.m[1] = veorq_s16(tmp.m[1], tmp.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = veorq_s16(tmp.m[i], tmp.m[i]); return tmp; } @@ -291,8 +291,8 @@ template <> inline SIMD vzero() { SIMD tmp; - tmp.m[0] = veorq_s32(tmp.m[0], tmp.m[0]); - tmp.m[1] = veorq_s32(tmp.m[1], tmp.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = veorq_s32(tmp.m[i], tmp.m[i]); return tmp; } @@ -300,8 +300,8 @@ template <> inline SIMD vzero() { SIMD tmp; - tmp.m[0] = veorq_s64(tmp.m[0], tmp.m[0]); - tmp.m[1] = veorq_s64(tmp.m[1], tmp.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = veorq_s64(tmp.m[i], tmp.m[i]); return tmp; } @@ -309,8 +309,8 @@ template <> inline SIMD vzero() { SIMD tmp; - tmp.m[0] = veorq_u8(tmp.m[0], tmp.m[0]); - tmp.m[1] = veorq_u8(tmp.m[1], tmp.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = veorq_u8(tmp.m[i], tmp.m[i]); return tmp; } @@ -318,8 +318,8 @@ template <> inline SIMD vzero() { SIMD tmp; - tmp.m[0] = veorq_u16(tmp.m[0], tmp.m[0]); - tmp.m[1] = veorq_u16(tmp.m[1], tmp.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = veorq_u16(tmp.m[i], tmp.m[i]); return tmp; } @@ -327,8 +327,8 @@ template <> inline SIMD vzero() { SIMD tmp; - tmp.m[0] = veorq_u32(tmp.m[0], tmp.m[0]); - tmp.m[1] = veorq_u32(tmp.m[1], tmp.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = veorq_u32(tmp.m[i], tmp.m[i]); return tmp; } @@ -336,8 +336,8 @@ template <> inline SIMD vzero() { SIMD tmp; - tmp.m[0] = veorq_u64(tmp.m[0], tmp.m[0]); - tmp.m[1] = veorq_u64(tmp.m[1], tmp.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = veorq_u64(tmp.m[i], tmp.m[i]); return tmp; } @@ -345,8 +345,8 @@ template <> inline SIMD vadd(SIMD a, SIMD b) { SIMD tmp; - tmp.m[0] = vaddq_f32(a.m[0], b.m[0]); - tmp.m[1] = vaddq_f32(a.m[1], b.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vaddq_f32(a.m[i], b.m[i]); return tmp; } @@ -354,8 +354,8 @@ template <> inline SIMD vadd(SIMD a, SIMD b) { SIMD tmp; - tmp.m[0] = vaddq_s8(a.m[0], b.m[0]); - tmp.m[1] = vaddq_s8(a.m[1], b.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vaddq_s8(a.m[i], b.m[i]); return tmp; } @@ -363,8 +363,8 @@ template <> inline SIMD vadd(SIMD a, SIMD b) { SIMD tmp; - tmp.m[0] = vaddq_s16(a.m[0], b.m[0]); - tmp.m[1] = vaddq_s16(a.m[1], b.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vaddq_s16(a.m[i], b.m[i]); return tmp; } @@ -372,8 +372,8 @@ template <> inline SIMD vadd(SIMD a, SIMD b) { SIMD tmp; - tmp.m[0] = vaddq_s32(a.m[0], b.m[0]); - tmp.m[1] = vaddq_s32(a.m[1], b.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vaddq_s32(a.m[i], b.m[i]); return tmp; } @@ -381,7 +381,8 @@ template <> inline SIMD vadd(SIMD a, SIMD b) { SIMD tmp; - tmp.m[1] = vaddq_s64(a.m[1], b.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vaddq_s64(a.m[i], b.m[i]); return tmp; } @@ -389,8 +390,8 @@ template <> inline SIMD vqadd(SIMD a, SIMD b) { SIMD tmp; - tmp.m[0] = vqaddq_s8(a.m[0], b.m[0]); - tmp.m[1] = vqaddq_s8(a.m[1], b.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vqaddq_s8(a.m[i], b.m[i]); return tmp; } @@ -398,8 +399,8 @@ template <> inline SIMD vqadd(SIMD a, SIMD b) { SIMD tmp; - tmp.m[0] = vqaddq_s16(a.m[0], b.m[0]); - tmp.m[1] = vqaddq_s16(a.m[1], b.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vqaddq_s16(a.m[i], b.m[i]); return tmp; } @@ -407,8 +408,8 @@ template <> inline SIMD vsub(SIMD a, SIMD b) { SIMD tmp; - tmp.m[0] = vsubq_f32(a.m[0], b.m[0]); - tmp.m[1] = vsubq_f32(a.m[1], b.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vsubq_f32(a.m[i], b.m[i]); return tmp; } @@ -416,8 +417,8 @@ template <> inline SIMD vsub(SIMD a, SIMD b) { SIMD tmp; - tmp.m[0] = vsubq_s8(a.m[0], b.m[0]); - tmp.m[1] = vsubq_s8(a.m[1], b.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vsubq_s8(a.m[i], b.m[i]); return tmp; } @@ -425,8 +426,8 @@ template <> inline SIMD vsub(SIMD a, SIMD b) { SIMD tmp; - tmp.m[0] = vsubq_s16(a.m[0], b.m[0]); - tmp.m[1] = vsubq_s16(a.m[1], b.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vsubq_s16(a.m[i], b.m[i]); return tmp; } @@ -434,8 +435,8 @@ template <> inline SIMD vsub(SIMD a, SIMD b) { SIMD tmp; - tmp.m[0] = vsubq_s32(a.m[0], b.m[0]); - tmp.m[1] = vsubq_s32(a.m[1], b.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vsubq_s32(a.m[i], b.m[i]); return tmp; } @@ -443,8 +444,8 @@ template <> inline SIMD vsub(SIMD a, SIMD b) { SIMD tmp; - tmp.m[0] = vsubq_s64(a.m[0], b.m[0]); - tmp.m[1] = vsubq_s64(a.m[1], b.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vsubq_s64(a.m[i], b.m[i]); return tmp; } @@ -452,8 +453,8 @@ template <> inline SIMD vqsub(SIMD a, SIMD b) { SIMD tmp; - tmp.m[0] = vqsubq_s8(a.m[0], b.m[0]); - tmp.m[1] = vqsubq_s8(a.m[1], b.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vqsubq_s8(a.m[i], b.m[i]); return tmp; } @@ -461,8 +462,8 @@ template <> inline SIMD vqsub(SIMD a, SIMD b) { SIMD tmp; - tmp.m[0] = vqsubq_s16(a.m[0], b.m[0]); - tmp.m[1] = vqsubq_s16(a.m[1], b.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vqsubq_s16(a.m[i], b.m[i]); return tmp; } @@ -470,8 +471,8 @@ template <> inline SIMD vqsub(SIMD a, SIMD b) { SIMD tmp; - tmp.m[0] = vqsubq_u8(a.m[0], b.m[0]); - tmp.m[1] = vqsubq_u8(a.m[1], b.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vqsubq_u8(a.m[i], b.m[i]); return tmp; } @@ -479,8 +480,8 @@ template <> inline SIMD vqsub(SIMD a, SIMD b) { SIMD tmp; - tmp.m[0] = vqsubq_u16(a.m[0], b.m[0]); - tmp.m[1] = vqsubq_u16(a.m[1], b.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vqsubq_u16(a.m[i], b.m[i]); return tmp; } @@ -488,8 +489,8 @@ template <> inline SIMD vmul(SIMD a, SIMD b) { SIMD tmp; - tmp.m[0] = vmulq_f32(a.m[0], b.m[0]); - tmp.m[1] = vmulq_f32(a.m[1], b.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vmulq_f32(a.m[i], b.m[i]); return tmp; } @@ -497,8 +498,8 @@ template <> inline SIMD vmul(SIMD a, SIMD b) { SIMD tmp; - tmp.m[0] = vmulq_s8(a.m[0], b.m[0]); - tmp.m[1] = vmulq_s8(a.m[1], b.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vmulq_s8(a.m[i], b.m[i]); return tmp; } @@ -506,8 +507,8 @@ template <> inline SIMD vabs(SIMD a) { SIMD tmp; - tmp.m[0] = vabsq_f32(a.m[0]); - tmp.m[1] = vabsq_f32(a.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vabsq_f32(a.m[i]); return tmp; } @@ -515,8 +516,8 @@ template <> inline SIMD vqabs(SIMD a) { SIMD tmp; - tmp.m[0] = vqabsq_s8(a.m[0]); - tmp.m[1] = vqabsq_s8(a.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vqabsq_s8(a.m[i]); return tmp; } @@ -524,8 +525,8 @@ template <> inline SIMD vqabs(SIMD a) { SIMD tmp; - tmp.m[0] = vqabsq_s16(a.m[0]); - tmp.m[1] = vqabsq_s16(a.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vqabsq_s16(a.m[i]); return tmp; } @@ -533,12 +534,10 @@ template <> inline SIMD vsignum(SIMD a) { SIMD tmp; - tmp.m[0] = (float32x4_t)vbicq_u32( - veorq_u32((uint32x4_t)vdupq_n_f32(1.f), vandq_u32((uint32x4_t)vdupq_n_f32(-0.f), (uint32x4_t)a.m[0])), - vceqq_f32(a.m[0], vdupq_n_f32(0.f))); - tmp.m[1] = (float32x4_t)vbicq_u32( - veorq_u32((uint32x4_t)vdupq_n_f32(1.f), vandq_u32((uint32x4_t)vdupq_n_f32(-0.f), (uint32x4_t)a.m[1])), - vceqq_f32(a.m[1], vdupq_n_f32(0.f))); + for (int i = 0; i < 2; ++i) + tmp.m[i] = (float32x4_t)vbicq_u32( + veorq_u32((uint32x4_t)vdupq_n_f32(1.f), vandq_u32((uint32x4_t)vdupq_n_f32(-0.f), (uint32x4_t)a.m[i])), + vceqq_f32(a.m[i], vdupq_n_f32(0.f))); return tmp; } @@ -546,10 +545,9 @@ template <> inline SIMD vsignum(SIMD a) { SIMD tmp; - tmp.m[0] = (int8x16_t)vorrq_u8(vcgtq_s8(vdupq_n_s8(0), a.m[0]), - vandq_u8(vcgtq_s8(a.m[0], vdupq_n_s8(0)), (uint8x16_t)vdupq_n_s8(1))); - tmp.m[1] = (int8x16_t)vorrq_u8(vcgtq_s8(vdupq_n_s8(0), a.m[1]), - vandq_u8(vcgtq_s8(a.m[1], vdupq_n_s8(0)), (uint8x16_t)vdupq_n_s8(1))); + for (int i = 0; i < 2; ++i) + tmp.m[i] = (int8x16_t)vorrq_u8(vcgtq_s8(vdupq_n_s8(0), a.m[i]), + vandq_u8(vcgtq_s8(a.m[i], vdupq_n_s8(0)), (uint8x16_t)vdupq_n_s8(1))); return tmp; } @@ -557,12 +555,10 @@ template <> inline SIMD vsign(SIMD a, SIMD b) { SIMD tmp; - tmp.m[0] = (float32x4_t)vbicq_u32( - veorq_u32((uint32x4_t)a.m[0], vandq_u32((uint32x4_t)vdupq_n_f32(-0.f), (uint32x4_t)b.m[0])), - vceqq_f32(b.m[0], vdupq_n_f32(0.f))); - tmp.m[1] = (float32x4_t)vbicq_u32( - veorq_u32((uint32x4_t)a.m[1], vandq_u32((uint32x4_t)vdupq_n_f32(-0.f), (uint32x4_t)b.m[1])), - vceqq_f32(b.m[1], vdupq_n_f32(0.f))); + for (int i = 0; i < 2; ++i) + tmp.m[i] = (float32x4_t)vbicq_u32( + veorq_u32((uint32x4_t)a.m[i], vandq_u32((uint32x4_t)vdupq_n_f32(-0.f), (uint32x4_t)b.m[i])), + vceqq_f32(b.m[i], vdupq_n_f32(0.f))); return tmp; } @@ -570,12 +566,10 @@ template <> inline SIMD vsign(SIMD a, SIMD b) { SIMD tmp; - tmp.m[0] = (int8x16_t)vorrq_u8( - vandq_u8(vcgtq_s8(vdupq_n_s8(0), b.m[0]), (uint8x16_t)vnegq_s8(a.m[0])), - vandq_u8(vcgtq_s8(b.m[0], vdupq_n_s8(0)), (uint8x16_t)a.m[0])); - tmp.m[1] = (int8x16_t)vorrq_u8( - vandq_u8(vcgtq_s8(vdupq_n_s8(0), b.m[1]), (uint8x16_t)vnegq_s8(a.m[1])), - vandq_u8(vcgtq_s8(b.m[1], vdupq_n_s8(0)), (uint8x16_t)a.m[1])); + for (int i = 0; i < 2; ++i) + tmp.m[i] = (int8x16_t)vorrq_u8( + vandq_u8(vcgtq_s8(vdupq_n_s8(0), b.m[i]), (uint8x16_t)vnegq_s8(a.m[i])), + vandq_u8(vcgtq_s8(b.m[i], vdupq_n_s8(0)), (uint8x16_t)a.m[i])); return tmp; } @@ -584,12 +578,10 @@ inline SIMD vcopysign(SIMD a, SIMD b) { SIMD tmp; uint32x4_t negz = (uint32x4_t)vdupq_n_f32(-0.f); - tmp.m[0] = (float32x4_t)vorrq_u32( - vbicq_u32((uint32x4_t)a.m[0], negz), - vandq_u32((uint32x4_t)b.m[0], negz)); - tmp.m[1] = (float32x4_t)vorrq_u32( - vbicq_u32((uint32x4_t)a.m[1], negz), - vandq_u32((uint32x4_t)b.m[1], negz)); + for (int i = 0; i < 2; ++i) + tmp.m[i] = (float32x4_t)vorrq_u32( + vbicq_u32((uint32x4_t)a.m[i], negz), + vandq_u32((uint32x4_t)b.m[i], negz)); return tmp; } @@ -597,8 +589,8 @@ template <> inline SIMD vorr(SIMD a, SIMD b) { SIMD tmp; - tmp.m[0] = vorrq_u8(a.m[0], b.m[0]); - tmp.m[1] = vorrq_u8(a.m[1], b.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vorrq_u8(a.m[i], b.m[i]); return tmp; } @@ -606,8 +598,8 @@ template <> inline SIMD vorr(SIMD a, SIMD b) { SIMD tmp; - tmp.m[0] = vorrq_u16(a.m[0], b.m[0]); - tmp.m[1] = vorrq_u16(a.m[1], b.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vorrq_u16(a.m[i], b.m[i]); return tmp; } @@ -615,8 +607,8 @@ template <> inline SIMD vorr(SIMD a, SIMD b) { SIMD tmp; - tmp.m[0] = vorrq_u32(a.m[0], b.m[0]); - tmp.m[1] = vorrq_u32(a.m[1], b.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vorrq_u32(a.m[i], b.m[i]); return tmp; } @@ -624,8 +616,8 @@ template <> inline SIMD vorr(SIMD a, SIMD b) { SIMD tmp; - tmp.m[0] = vorrq_u64(a.m[0], b.m[0]); - tmp.m[1] = vorrq_u64(a.m[1], b.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vorrq_u64(a.m[i], b.m[i]); return tmp; } @@ -633,8 +625,8 @@ template <> inline SIMD vand(SIMD a, SIMD b) { SIMD tmp; - tmp.m[0] = vandq_u8(a.m[0], b.m[0]); - tmp.m[1] = vandq_u8(a.m[1], b.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vandq_u8(a.m[i], b.m[i]); return tmp; } @@ -642,8 +634,8 @@ template <> inline SIMD vand(SIMD a, SIMD b) { SIMD tmp; - tmp.m[0] = vandq_u16(a.m[0], b.m[0]); - tmp.m[1] = vandq_u16(a.m[1], b.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vandq_u16(a.m[i], b.m[i]); return tmp; } @@ -651,8 +643,8 @@ template <> inline SIMD vand(SIMD a, SIMD b) { SIMD tmp; - tmp.m[0] = vandq_u32(a.m[0], b.m[0]); - tmp.m[1] = vandq_u32(a.m[1], b.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vandq_u32(a.m[i], b.m[i]); return tmp; } @@ -660,8 +652,8 @@ template <> inline SIMD vand(SIMD a, SIMD b) { SIMD tmp; - tmp.m[0] = vandq_u64(a.m[0], b.m[0]); - tmp.m[1] = vandq_u64(a.m[1], b.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vandq_u64(a.m[i], b.m[i]); return tmp; } @@ -669,8 +661,8 @@ template <> inline SIMD veor(SIMD a, SIMD b) { SIMD tmp; - tmp.m[0] = veorq_u8(a.m[0], b.m[0]); - tmp.m[1] = veorq_u8(a.m[1], b.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = veorq_u8(a.m[i], b.m[i]); return tmp; } @@ -678,8 +670,8 @@ template <> inline SIMD veor(SIMD a, SIMD b) { SIMD tmp; - tmp.m[0] = veorq_u16(a.m[0], b.m[0]); - tmp.m[1] = veorq_u16(a.m[1], b.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = veorq_u16(a.m[i], b.m[i]); return tmp; } @@ -687,8 +679,8 @@ template <> inline SIMD veor(SIMD a, SIMD b) { SIMD tmp; - tmp.m[0] = veorq_u32(a.m[0], b.m[0]); - tmp.m[1] = veorq_u32(a.m[1], b.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = veorq_u32(a.m[i], b.m[i]); return tmp; } @@ -696,8 +688,8 @@ template <> inline SIMD veor(SIMD a, SIMD b) { SIMD tmp; - tmp.m[0] = veorq_u64(a.m[0], b.m[0]); - tmp.m[1] = veorq_u64(a.m[1], b.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = veorq_u64(a.m[i], b.m[i]); return tmp; } @@ -705,8 +697,8 @@ template <> inline SIMD vbic(SIMD a, SIMD b) { SIMD tmp; - tmp.m[0] = vbicq_u8(a.m[0], b.m[0]); - tmp.m[1] = vbicq_u8(a.m[1], b.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vbicq_u8(a.m[i], b.m[i]); return tmp; } @@ -714,8 +706,8 @@ template <> inline SIMD vbic(SIMD a, SIMD b) { SIMD tmp; - tmp.m[0] = vbicq_u16(a.m[0], b.m[0]); - tmp.m[1] = vbicq_u16(a.m[1], b.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vbicq_u16(a.m[i], b.m[i]); return tmp; } @@ -723,8 +715,8 @@ template <> inline SIMD vbic(SIMD a, SIMD b) { SIMD tmp; - tmp.m[0] = vbicq_u32(a.m[0], b.m[0]); - tmp.m[1] = vbicq_u32(a.m[1], b.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vbicq_u32(a.m[i], b.m[i]); return tmp; } @@ -732,8 +724,8 @@ template <> inline SIMD vbic(SIMD a, SIMD b) { SIMD tmp; - tmp.m[0] = vbicq_u64(a.m[0], b.m[0]); - tmp.m[1] = vbicq_u64(a.m[1], b.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vbicq_u64(a.m[i], b.m[i]); return tmp; } @@ -741,8 +733,8 @@ template <> inline SIMD vbsl(SIMD a, SIMD b, SIMD c) { SIMD tmp; - tmp.m[0] = vbslq_u8(a.m[0], b.m[0], c.m[0]); - tmp.m[1] = vbslq_u8(a.m[1], b.m[1], c.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vbslq_u8(a.m[i], b.m[i], c.m[i]); return tmp; } @@ -750,8 +742,8 @@ template <> inline SIMD vbsl(SIMD a, SIMD b, SIMD c) { SIMD tmp; - tmp.m[0] = vbslq_u16(a.m[0], b.m[0], c.m[0]); - tmp.m[1] = vbslq_u16(a.m[1], b.m[1], c.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vbslq_u16(a.m[i], b.m[i], c.m[i]); return tmp; } @@ -759,8 +751,8 @@ template <> inline SIMD vbsl(SIMD a, SIMD b, SIMD c) { SIMD tmp; - tmp.m[0] = vbslq_u32(a.m[0], b.m[0], c.m[0]); - tmp.m[1] = vbslq_u32(a.m[1], b.m[1], c.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vbslq_u32(a.m[i], b.m[i], c.m[i]); return tmp; } @@ -768,8 +760,8 @@ template <> inline SIMD vbsl(SIMD a, SIMD b, SIMD c) { SIMD tmp; - tmp.m[0] = vbslq_u64(a.m[0], b.m[0], c.m[0]); - tmp.m[1] = vbslq_u64(a.m[1], b.m[1], c.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vbslq_u64(a.m[i], b.m[i], c.m[i]); return tmp; } @@ -777,8 +769,8 @@ template <> inline SIMD vceqz(SIMD a) { SIMD tmp; - tmp.m[0] = vceqq_f32(a.m[0], vdupq_n_f32(0.f)); - tmp.m[1] = vceqq_f32(a.m[1], vdupq_n_f32(0.f)); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vceqq_f32(a.m[i], vdupq_n_f32(0.f)); return tmp; } @@ -786,8 +778,8 @@ template <> inline SIMD vceqz(SIMD a) { SIMD tmp; - tmp.m[0] = vceqq_s8(a.m[0], vdupq_n_s8(0)); - tmp.m[1] = vceqq_s8(a.m[1], vdupq_n_s8(0)); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vceqq_s8(a.m[i], vdupq_n_s8(0)); return tmp; } @@ -795,8 +787,8 @@ template <> inline SIMD vceqz(SIMD a) { SIMD tmp; - tmp.m[0] = vceqq_s16(a.m[0], vdupq_n_s16(0)); - tmp.m[1] = vceqq_s16(a.m[1], vdupq_n_s16(0)); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vceqq_s16(a.m[i], vdupq_n_s16(0)); return tmp; } @@ -804,8 +796,8 @@ template <> inline SIMD vceqz(SIMD a) { SIMD tmp; - tmp.m[0] = vceqq_s32(a.m[0], vdupq_n_s32(0)); - tmp.m[1] = vceqq_s32(a.m[1], vdupq_n_s32(0)); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vceqq_s32(a.m[i], vdupq_n_s32(0)); return tmp; } @@ -813,8 +805,8 @@ template <> inline SIMD vceq(SIMD a, SIMD b) { SIMD tmp; - tmp.m[0] = vceqq_f32(a.m[0], b.m[0]); - tmp.m[1] = vceqq_f32(a.m[1], b.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vceqq_f32(a.m[i], b.m[i]); return tmp; } @@ -822,8 +814,8 @@ template <> inline SIMD vceq(SIMD a, SIMD b) { SIMD tmp; - tmp.m[0] = vceqq_s8(a.m[0], b.m[0]); - tmp.m[1] = vceqq_s8(a.m[1], b.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vceqq_s8(a.m[i], b.m[i]); return tmp; } @@ -831,8 +823,8 @@ template <> inline SIMD vceq(SIMD a, SIMD b) { SIMD tmp; - tmp.m[0] = vceqq_s16(a.m[0], b.m[0]); - tmp.m[1] = vceqq_s16(a.m[1], b.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vceqq_s16(a.m[i], b.m[i]); return tmp; } @@ -840,8 +832,8 @@ template <> inline SIMD vceq(SIMD a, SIMD b) { SIMD tmp; - tmp.m[0] = vceqq_s32(a.m[0], b.m[0]); - tmp.m[1] = vceqq_s32(a.m[1], b.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vceqq_s32(a.m[i], b.m[i]); return tmp; } @@ -849,8 +841,8 @@ template <> inline SIMD vcgtz(SIMD a) { SIMD tmp; - tmp.m[0] = vcgtq_f32(a.m[0], vdupq_n_f32(0.f)); - tmp.m[1] = vcgtq_f32(a.m[1], vdupq_n_f32(0.f)); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vcgtq_f32(a.m[i], vdupq_n_f32(0.f)); return tmp; } @@ -858,8 +850,8 @@ template <> inline SIMD vcgtz(SIMD a) { SIMD tmp; - tmp.m[0] = vcgtq_s8(a.m[0], vdupq_n_s8(0)); - tmp.m[1] = vcgtq_s8(a.m[1], vdupq_n_s8(0)); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vcgtq_s8(a.m[i], vdupq_n_s8(0)); return tmp; } @@ -867,8 +859,8 @@ template <> inline SIMD vcgtz(SIMD a) { SIMD tmp; - tmp.m[0] = vcgtq_s16(a.m[0], vdupq_n_s16(0)); - tmp.m[1] = vcgtq_s16(a.m[1], vdupq_n_s16(0)); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vcgtq_s16(a.m[i], vdupq_n_s16(0)); return tmp; } @@ -876,8 +868,8 @@ template <> inline SIMD vcgtz(SIMD a) { SIMD tmp; - tmp.m[0] = vcgtq_s32(a.m[0], vdupq_n_s32(0)); - tmp.m[1] = vcgtq_s32(a.m[1], vdupq_n_s32(0)); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vcgtq_s32(a.m[i], vdupq_n_s32(0)); return tmp; } @@ -885,8 +877,8 @@ template <> inline SIMD vcltz(SIMD a) { SIMD tmp; - tmp.m[0] = vcltq_f32(a.m[0], vdupq_n_f32(0.f)); - tmp.m[1] = vcltq_f32(a.m[1], vdupq_n_f32(0.f)); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vcltq_f32(a.m[i], vdupq_n_f32(0.f)); return tmp; } @@ -894,8 +886,8 @@ template <> inline SIMD vcltz(SIMD a) { SIMD tmp; - tmp.m[0] = vcltq_s8(a.m[0], vdupq_n_s8(0)); - tmp.m[1] = vcltq_s8(a.m[1], vdupq_n_s8(0)); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vcltq_s8(a.m[i], vdupq_n_s8(0)); return tmp; } @@ -903,8 +895,8 @@ template <> inline SIMD vcltz(SIMD a) { SIMD tmp; - tmp.m[0] = vcltq_s16(a.m[0], vdupq_n_s16(0)); - tmp.m[1] = vcltq_s16(a.m[1], vdupq_n_s16(0)); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vcltq_s16(a.m[i], vdupq_n_s16(0)); return tmp; } @@ -912,8 +904,8 @@ template <> inline SIMD vcltz(SIMD a) { SIMD tmp; - tmp.m[0] = vcltq_s32(a.m[0], vdupq_n_s32(0)); - tmp.m[1] = vcltq_s32(a.m[1], vdupq_n_s32(0)); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vcltq_s32(a.m[i], vdupq_n_s32(0)); return tmp; } @@ -921,8 +913,8 @@ template <> inline SIMD vclez(SIMD a) { SIMD tmp; - tmp.m[0] = vcleq_f32(a.m[0], vdupq_n_f32(0.f)); - tmp.m[1] = vcleq_f32(a.m[1], vdupq_n_f32(0.f)); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vcleq_f32(a.m[i], vdupq_n_f32(0.f)); return tmp; } @@ -930,8 +922,8 @@ template <> inline SIMD vclez(SIMD a) { SIMD tmp; - tmp.m[0] = vcleq_s8(a.m[0], vdupq_n_s8(0)); - tmp.m[1] = vcleq_s8(a.m[1], vdupq_n_s8(0)); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vcleq_s8(a.m[i], vdupq_n_s8(0)); return tmp; } @@ -939,8 +931,8 @@ template <> inline SIMD vclez(SIMD a) { SIMD tmp; - tmp.m[0] = vcleq_s16(a.m[0], vdupq_n_s16(0)); - tmp.m[1] = vcleq_s16(a.m[1], vdupq_n_s16(0)); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vcleq_s16(a.m[i], vdupq_n_s16(0)); return tmp; } @@ -948,8 +940,8 @@ template <> inline SIMD vclez(SIMD a) { SIMD tmp; - tmp.m[0] = vcleq_s32(a.m[0], vdupq_n_s32(0)); - tmp.m[1] = vcleq_s32(a.m[1], vdupq_n_s32(0)); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vcleq_s32(a.m[i], vdupq_n_s32(0)); return tmp; } @@ -957,8 +949,8 @@ template <> inline SIMD vmin(SIMD a, SIMD b) { SIMD tmp; - tmp.m[0] = vminq_f32(a.m[0], b.m[0]); - tmp.m[1] = vminq_f32(a.m[1], b.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vminq_f32(a.m[i], b.m[i]); return tmp; } @@ -966,8 +958,8 @@ template <> inline SIMD vmin(SIMD a, SIMD b) { SIMD tmp; - tmp.m[0] = vminq_s8(a.m[0], b.m[0]); - tmp.m[1] = vminq_s8(a.m[1], b.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vminq_s8(a.m[i], b.m[i]); return tmp; } @@ -975,8 +967,8 @@ template <> inline SIMD vmin(SIMD a, SIMD b) { SIMD tmp; - tmp.m[0] = vminq_s16(a.m[0], b.m[0]); - tmp.m[1] = vminq_s16(a.m[1], b.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vminq_s16(a.m[i], b.m[i]); return tmp; } @@ -984,8 +976,8 @@ template <> inline SIMD vmin(SIMD a, SIMD b) { SIMD tmp; - tmp.m[0] = vminq_s32(a.m[0], b.m[0]); - tmp.m[1] = vminq_s32(a.m[1], b.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vminq_s32(a.m[i], b.m[i]); return tmp; } @@ -993,8 +985,8 @@ template <> inline SIMD vmax(SIMD a, SIMD b) { SIMD tmp; - tmp.m[0] = vmaxq_f32(a.m[0], b.m[0]); - tmp.m[1] = vmaxq_f32(a.m[1], b.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vmaxq_f32(a.m[i], b.m[i]); return tmp; } @@ -1002,8 +994,8 @@ template <> inline SIMD vmax(SIMD a, SIMD b) { SIMD tmp; - tmp.m[0] = vmaxq_s8(a.m[0], b.m[0]); - tmp.m[1] = vmaxq_s8(a.m[1], b.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vmaxq_s8(a.m[i], b.m[i]); return tmp; } @@ -1011,8 +1003,8 @@ template <> inline SIMD vmax(SIMD a, SIMD b) { SIMD tmp; - tmp.m[0] = vmaxq_s16(a.m[0], b.m[0]); - tmp.m[1] = vmaxq_s16(a.m[1], b.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vmaxq_s16(a.m[i], b.m[i]); return tmp; } @@ -1020,8 +1012,8 @@ template <> inline SIMD vmax(SIMD a, SIMD b) { SIMD tmp; - tmp.m[0] = vmaxq_s32(a.m[0], b.m[0]); - tmp.m[1] = vmaxq_s32(a.m[1], b.m[1]); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vmaxq_s32(a.m[i], b.m[i]); return tmp; } @@ -1029,8 +1021,8 @@ template <> inline SIMD vclamp(SIMD x, float a, float b) { SIMD tmp; - tmp.m[0] = vminq_f32(vmaxq_f32(x.m[0], vdupq_n_f32(a)), vdupq_n_f32(b)); - tmp.m[1] = vminq_f32(vmaxq_f32(x.m[1], vdupq_n_f32(a)), vdupq_n_f32(b)); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vminq_f32(vmaxq_f32(x.m[i], vdupq_n_f32(a)), vdupq_n_f32(b)); return tmp; } @@ -1038,8 +1030,8 @@ template <> inline SIMD vclamp(SIMD x, int8_t a, int8_t b) { SIMD tmp; - tmp.m[0] = vminq_s8(vmaxq_s8(x.m[0], vdupq_n_s8(a)), vdupq_n_s8(b)); - tmp.m[1] = vminq_s8(vmaxq_s8(x.m[1], vdupq_n_s8(a)), vdupq_n_s8(b)); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vminq_s8(vmaxq_s8(x.m[i], vdupq_n_s8(a)), vdupq_n_s8(b)); return tmp; } @@ -1047,8 +1039,8 @@ template <> inline SIMD vclamp(SIMD x, int16_t a, int16_t b) { SIMD tmp; - tmp.m[0] = vminq_s16(vmaxq_s16(x.m[0], vdupq_n_s16(a)), vdupq_n_s16(b)); - tmp.m[1] = vminq_s16(vmaxq_s16(x.m[1], vdupq_n_s16(a)), vdupq_n_s16(b)); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vminq_s16(vmaxq_s16(x.m[i], vdupq_n_s16(a)), vdupq_n_s16(b)); return tmp; } @@ -1056,8 +1048,8 @@ template <> inline SIMD vclamp(SIMD x, int32_t a, int32_t b) { SIMD tmp; - tmp.m[0] = vminq_s32(vmaxq_s32(x.m[0], vdupq_n_s32(a)), vdupq_n_s32(b)); - tmp.m[1] = vminq_s32(vmaxq_s32(x.m[1], vdupq_n_s32(a)), vdupq_n_s32(b)); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vminq_s32(vmaxq_s32(x.m[i], vdupq_n_s32(a)), vdupq_n_s32(b)); return tmp; } @@ -1066,8 +1058,8 @@ template <> inline SIMD vshuf(SIMD a, SIMD b) { SIMD tmp; - tmp.m[0] = vorrq_u8(vqtbl1q_u8(a.m[0], b.m[0]), vqtbl1q_u8(a.m[1], vsubq_u8(b.m[0], vdupq_n_u8(16)))); - tmp.m[1] = vorrq_u8(vqtbl1q_u8(a.m[0], b.m[1]), vqtbl1q_u8(a.m[1], vsubq_u8(b.m[1], vdupq_n_u8(16)))); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vorrq_u8(vqtbl1q_u8(a.m[0], b.m[i]), vqtbl1q_u8(a.m[1], vsubq_u8(b.m[i], vdupq_n_u8(16)))); return tmp; } @@ -1075,8 +1067,8 @@ template <> inline SIMD vshuf(SIMD a, SIMD b) { SIMD tmp; - tmp.m[0] = vorrq_s8(vqtbl1q_s8(a.m[0], b.m[0]), vqtbl1q_s8(a.m[1], vsubq_u8(b.m[0], vdupq_n_u8(16)))); - tmp.m[1] = vorrq_s8(vqtbl1q_s8(a.m[0], b.m[1]), vqtbl1q_s8(a.m[1], vsubq_u8(b.m[1], vdupq_n_u8(16)))); + for (int i = 0; i < 2; ++i) + tmp.m[i] = vorrq_s8(vqtbl1q_s8(a.m[0], b.m[i]), vqtbl1q_s8(a.m[1], vsubq_u8(b.m[i], vdupq_n_u8(16)))); return tmp; } #endif