use neon struct types

This commit is contained in:
Ahmet Inan 2026-01-20 10:48:49 +01:00
commit 0bd5166a4a
3 changed files with 382 additions and 382 deletions

View file

@ -14,7 +14,7 @@ union SIMD<float, 8>
static const int SIZE = 8;
typedef float value_type;
typedef uint32_t uint_type;
float32x4_t m[2];
float32x4x2_t m;
value_type v[SIZE];
uint_type u[SIZE];
};
@ -25,7 +25,7 @@ union SIMD<int8_t, 32>
static const int SIZE = 32;
typedef int8_t value_type;
typedef uint8_t uint_type;
int8x16_t m[2];
int8x16x2_t m;
value_type v[SIZE];
uint_type u[SIZE];
};
@ -36,7 +36,7 @@ union SIMD<int16_t, 16>
static const int SIZE = 16;
typedef int16_t value_type;
typedef uint16_t uint_type;
int16x8_t m[2];
int16x8x2_t m;
value_type v[SIZE];
uint_type u[SIZE];
};
@ -47,7 +47,7 @@ union SIMD<int32_t, 8>
static const int SIZE = 8;
typedef int32_t value_type;
typedef uint32_t uint_type;
int32x4_t m[2];
int32x4x2_t m;
value_type v[SIZE];
uint_type u[SIZE];
};
@ -58,7 +58,7 @@ union SIMD<int64_t, 4>
static const int SIZE = 4;
typedef int64_t value_type;
typedef uint64_t uint_type;
int64x2_t m[2];
int64x2x2_t m;
value_type v[SIZE];
uint_type u[SIZE];
};
@ -69,7 +69,7 @@ union SIMD<uint8_t, 32>
static const int SIZE = 32;
typedef uint8_t value_type;
typedef uint8_t uint_type;
uint8x16_t m[2];
uint8x16x2_t m;
value_type v[SIZE];
uint_type u[SIZE];
};
@ -80,7 +80,7 @@ union SIMD<uint16_t, 16>
static const int SIZE = 16;
typedef uint16_t value_type;
typedef uint16_t uint_type;
uint16x8_t m[2];
uint16x8x2_t m;
value_type v[SIZE];
uint_type u[SIZE];
};
@ -91,7 +91,7 @@ union SIMD<uint32_t, 8>
static const int SIZE = 8;
typedef uint32_t value_type;
typedef uint32_t uint_type;
uint32x4_t m[2];
uint32x4x2_t m;
value_type v[SIZE];
uint_type u[SIZE];
};
@ -102,7 +102,7 @@ union SIMD<uint64_t, 4>
static const int SIZE = 4;
typedef uint64_t value_type;
typedef uint64_t uint_type;
uint64x2_t m[2];
uint64x2x2_t m;
value_type v[SIZE];
uint_type u[SIZE];
};
@ -112,7 +112,7 @@ inline SIMD<float, 8> vreinterpret(SIMD<uint32_t, 8> a)
{
SIMD<float, 8> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = (float32x4_t)a.m[i];
tmp.m.val[i] = (float32x4_t)a.m.val[i];
return tmp;
}
@ -121,7 +121,7 @@ inline SIMD<uint32_t, 8> vreinterpret(SIMD<float, 8> a)
{
SIMD<uint32_t, 8> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = (uint32x4_t)a.m[i];
tmp.m.val[i] = (uint32x4_t)a.m.val[i];
return tmp;
}
@ -130,7 +130,7 @@ inline SIMD<int8_t, 32> vreinterpret(SIMD<uint8_t, 32> a)
{
SIMD<int8_t, 32> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = (int8x16_t)a.m[i];
tmp.m.val[i] = (int8x16_t)a.m.val[i];
return tmp;
}
@ -139,7 +139,7 @@ inline SIMD<uint8_t, 32> vreinterpret(SIMD<int8_t, 32> a)
{
SIMD<uint8_t, 32> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = (uint8x16_t)a.m[i];
tmp.m.val[i] = (uint8x16_t)a.m.val[i];
return tmp;
}
@ -148,7 +148,7 @@ inline SIMD<int16_t, 16> vreinterpret(SIMD<uint16_t, 16> a)
{
SIMD<int16_t, 16> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = (int16x8_t)a.m[i];
tmp.m.val[i] = (int16x8_t)a.m.val[i];
return tmp;
}
@ -157,7 +157,7 @@ inline SIMD<uint16_t, 16> vreinterpret(SIMD<int16_t, 16> a)
{
SIMD<uint16_t, 16> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = (uint16x8_t)a.m[i];
tmp.m.val[i] = (uint16x8_t)a.m.val[i];
return tmp;
}
@ -166,7 +166,7 @@ inline SIMD<int32_t, 8> vreinterpret(SIMD<uint32_t, 8> a)
{
SIMD<int32_t, 8> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = (int32x4_t)a.m[i];
tmp.m.val[i] = (int32x4_t)a.m.val[i];
return tmp;
}
@ -175,7 +175,7 @@ inline SIMD<uint32_t, 8> vreinterpret(SIMD<int32_t, 8> a)
{
SIMD<uint32_t, 8> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = (uint32x4_t)a.m[i];
tmp.m.val[i] = (uint32x4_t)a.m.val[i];
return tmp;
}
@ -184,7 +184,7 @@ inline SIMD<float, 8> vdup(float a)
{
SIMD<float, 8> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vdupq_n_f32(a);
tmp.m.val[i] = vdupq_n_f32(a);
return tmp;
}
@ -193,7 +193,7 @@ inline SIMD<int8_t, 32> vdup(int8_t a)
{
SIMD<int8_t, 32> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vdupq_n_s8(a);
tmp.m.val[i] = vdupq_n_s8(a);
return tmp;
}
@ -202,7 +202,7 @@ inline SIMD<int16_t, 16> vdup(int16_t a)
{
SIMD<int16_t, 16> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vdupq_n_s16(a);
tmp.m.val[i] = vdupq_n_s16(a);
return tmp;
}
@ -211,7 +211,7 @@ inline SIMD<int32_t, 8> vdup(int32_t a)
{
SIMD<int32_t, 8> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vdupq_n_s32(a);
tmp.m.val[i] = vdupq_n_s32(a);
return tmp;
}
@ -220,7 +220,7 @@ inline SIMD<int64_t, 4> vdup(int64_t a)
{
SIMD<int64_t, 4> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vdupq_n_s64(a);
tmp.m.val[i] = vdupq_n_s64(a);
return tmp;
}
@ -229,7 +229,7 @@ inline SIMD<uint8_t, 32> vdup(uint8_t a)
{
SIMD<uint8_t, 32> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vdupq_n_u8(a);
tmp.m.val[i] = vdupq_n_u8(a);
return tmp;
}
@ -238,7 +238,7 @@ inline SIMD<uint16_t, 16> vdup(uint16_t a)
{
SIMD<uint16_t, 16> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vdupq_n_u16(a);
tmp.m.val[i] = vdupq_n_u16(a);
return tmp;
}
@ -247,7 +247,7 @@ inline SIMD<uint32_t, 8> vdup(uint32_t a)
{
SIMD<uint32_t, 8> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vdupq_n_u32(a);
tmp.m.val[i] = vdupq_n_u32(a);
return tmp;
}
@ -256,7 +256,7 @@ inline SIMD<uint64_t, 4> vdup(uint64_t a)
{
SIMD<uint64_t, 4> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vdupq_n_u64(a);
tmp.m.val[i] = vdupq_n_u64(a);
return tmp;
}
@ -265,7 +265,7 @@ inline SIMD<float, 8> vzero()
{
SIMD<float, 8> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = (float32x4_t)veorq_u32((uint32x4_t)tmp.m[i], (uint32x4_t)tmp.m[i]);
tmp.m.val[i] = (float32x4_t)veorq_u32((uint32x4_t)tmp.m.val[i], (uint32x4_t)tmp.m.val[i]);
return tmp;
}
@ -274,7 +274,7 @@ inline SIMD<int8_t, 32> vzero()
{
SIMD<int8_t, 32> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = veorq_s8(tmp.m[i], tmp.m[i]);
tmp.m.val[i] = veorq_s8(tmp.m.val[i], tmp.m.val[i]);
return tmp;
}
@ -283,7 +283,7 @@ inline SIMD<int16_t, 16> vzero()
{
SIMD<int16_t, 16> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = veorq_s16(tmp.m[i], tmp.m[i]);
tmp.m.val[i] = veorq_s16(tmp.m.val[i], tmp.m.val[i]);
return tmp;
}
@ -292,7 +292,7 @@ inline SIMD<int32_t, 8> vzero()
{
SIMD<int32_t, 8> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = veorq_s32(tmp.m[i], tmp.m[i]);
tmp.m.val[i] = veorq_s32(tmp.m.val[i], tmp.m.val[i]);
return tmp;
}
@ -301,7 +301,7 @@ inline SIMD<int64_t, 4> vzero()
{
SIMD<int64_t, 4> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = veorq_s64(tmp.m[i], tmp.m[i]);
tmp.m.val[i] = veorq_s64(tmp.m.val[i], tmp.m.val[i]);
return tmp;
}
@ -310,7 +310,7 @@ inline SIMD<uint8_t, 32> vzero()
{
SIMD<uint8_t, 32> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = veorq_u8(tmp.m[i], tmp.m[i]);
tmp.m.val[i] = veorq_u8(tmp.m.val[i], tmp.m.val[i]);
return tmp;
}
@ -319,7 +319,7 @@ inline SIMD<uint16_t, 16> vzero()
{
SIMD<uint16_t, 16> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = veorq_u16(tmp.m[i], tmp.m[i]);
tmp.m.val[i] = veorq_u16(tmp.m.val[i], tmp.m.val[i]);
return tmp;
}
@ -328,7 +328,7 @@ inline SIMD<uint32_t, 8> vzero()
{
SIMD<uint32_t, 8> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = veorq_u32(tmp.m[i], tmp.m[i]);
tmp.m.val[i] = veorq_u32(tmp.m.val[i], tmp.m.val[i]);
return tmp;
}
@ -337,7 +337,7 @@ inline SIMD<uint64_t, 4> vzero()
{
SIMD<uint64_t, 4> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = veorq_u64(tmp.m[i], tmp.m[i]);
tmp.m.val[i] = veorq_u64(tmp.m.val[i], tmp.m.val[i]);
return tmp;
}
@ -346,7 +346,7 @@ inline SIMD<float, 8> vadd(SIMD<float, 8> a, SIMD<float, 8> b)
{
SIMD<float, 8> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vaddq_f32(a.m[i], b.m[i]);
tmp.m.val[i] = vaddq_f32(a.m.val[i], b.m.val[i]);
return tmp;
}
@ -355,7 +355,7 @@ inline SIMD<int8_t, 32> vadd(SIMD<int8_t, 32> a, SIMD<int8_t, 32> b)
{
SIMD<int8_t, 32> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vaddq_s8(a.m[i], b.m[i]);
tmp.m.val[i] = vaddq_s8(a.m.val[i], b.m.val[i]);
return tmp;
}
@ -364,7 +364,7 @@ inline SIMD<int16_t, 16> vadd(SIMD<int16_t, 16> a, SIMD<int16_t, 16> b)
{
SIMD<int16_t, 16> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vaddq_s16(a.m[i], b.m[i]);
tmp.m.val[i] = vaddq_s16(a.m.val[i], b.m.val[i]);
return tmp;
}
@ -373,7 +373,7 @@ inline SIMD<int32_t, 8> vadd(SIMD<int32_t, 8> a, SIMD<int32_t, 8> b)
{
SIMD<int32_t, 8> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vaddq_s32(a.m[i], b.m[i]);
tmp.m.val[i] = vaddq_s32(a.m.val[i], b.m.val[i]);
return tmp;
}
@ -382,7 +382,7 @@ inline SIMD<int64_t, 4> vadd(SIMD<int64_t, 4> a, SIMD<int64_t, 4> b)
{
SIMD<int64_t, 4> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vaddq_s64(a.m[i], b.m[i]);
tmp.m.val[i] = vaddq_s64(a.m.val[i], b.m.val[i]);
return tmp;
}
@ -391,7 +391,7 @@ inline SIMD<int8_t, 32> vqadd(SIMD<int8_t, 32> a, SIMD<int8_t, 32> b)
{
SIMD<int8_t, 32> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vqaddq_s8(a.m[i], b.m[i]);
tmp.m.val[i] = vqaddq_s8(a.m.val[i], b.m.val[i]);
return tmp;
}
@ -400,7 +400,7 @@ inline SIMD<int16_t, 16> vqadd(SIMD<int16_t, 16> a, SIMD<int16_t, 16> b)
{
SIMD<int16_t, 16> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vqaddq_s16(a.m[i], b.m[i]);
tmp.m.val[i] = vqaddq_s16(a.m.val[i], b.m.val[i]);
return tmp;
}
@ -409,7 +409,7 @@ inline SIMD<float, 8> vsub(SIMD<float, 8> a, SIMD<float, 8> b)
{
SIMD<float, 8> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vsubq_f32(a.m[i], b.m[i]);
tmp.m.val[i] = vsubq_f32(a.m.val[i], b.m.val[i]);
return tmp;
}
@ -418,7 +418,7 @@ inline SIMD<int8_t, 32> vsub(SIMD<int8_t, 32> a, SIMD<int8_t, 32> b)
{
SIMD<int8_t, 32> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vsubq_s8(a.m[i], b.m[i]);
tmp.m.val[i] = vsubq_s8(a.m.val[i], b.m.val[i]);
return tmp;
}
@ -427,7 +427,7 @@ inline SIMD<int16_t, 16> vsub(SIMD<int16_t, 16> a, SIMD<int16_t, 16> b)
{
SIMD<int16_t, 16> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vsubq_s16(a.m[i], b.m[i]);
tmp.m.val[i] = vsubq_s16(a.m.val[i], b.m.val[i]);
return tmp;
}
@ -436,7 +436,7 @@ inline SIMD<int32_t, 8> vsub(SIMD<int32_t, 8> a, SIMD<int32_t, 8> b)
{
SIMD<int32_t, 8> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vsubq_s32(a.m[i], b.m[i]);
tmp.m.val[i] = vsubq_s32(a.m.val[i], b.m.val[i]);
return tmp;
}
@ -445,7 +445,7 @@ inline SIMD<int64_t, 4> vsub(SIMD<int64_t, 4> a, SIMD<int64_t, 4> b)
{
SIMD<int64_t, 4> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vsubq_s64(a.m[i], b.m[i]);
tmp.m.val[i] = vsubq_s64(a.m.val[i], b.m.val[i]);
return tmp;
}
@ -454,7 +454,7 @@ inline SIMD<int8_t, 32> vqsub(SIMD<int8_t, 32> a, SIMD<int8_t, 32> b)
{
SIMD<int8_t, 32> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vqsubq_s8(a.m[i], b.m[i]);
tmp.m.val[i] = vqsubq_s8(a.m.val[i], b.m.val[i]);
return tmp;
}
@ -463,7 +463,7 @@ inline SIMD<int16_t, 16> vqsub(SIMD<int16_t, 16> a, SIMD<int16_t, 16> b)
{
SIMD<int16_t, 16> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vqsubq_s16(a.m[i], b.m[i]);
tmp.m.val[i] = vqsubq_s16(a.m.val[i], b.m.val[i]);
return tmp;
}
@ -472,7 +472,7 @@ inline SIMD<uint8_t, 32> vqsub(SIMD<uint8_t, 32> a, SIMD<uint8_t, 32> b)
{
SIMD<uint8_t, 32> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vqsubq_u8(a.m[i], b.m[i]);
tmp.m.val[i] = vqsubq_u8(a.m.val[i], b.m.val[i]);
return tmp;
}
@ -481,7 +481,7 @@ inline SIMD<uint16_t, 16> vqsub(SIMD<uint16_t, 16> a, SIMD<uint16_t, 16> b)
{
SIMD<uint16_t, 16> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vqsubq_u16(a.m[i], b.m[i]);
tmp.m.val[i] = vqsubq_u16(a.m.val[i], b.m.val[i]);
return tmp;
}
@ -490,7 +490,7 @@ inline SIMD<float, 8> vmul(SIMD<float, 8> a, SIMD<float, 8> b)
{
SIMD<float, 8> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vmulq_f32(a.m[i], b.m[i]);
tmp.m.val[i] = vmulq_f32(a.m.val[i], b.m.val[i]);
return tmp;
}
@ -499,7 +499,7 @@ inline SIMD<int8_t, 32> vmul(SIMD<int8_t, 32> a, SIMD<int8_t, 32> b)
{
SIMD<int8_t, 32> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vmulq_s8(a.m[i], b.m[i]);
tmp.m.val[i] = vmulq_s8(a.m.val[i], b.m.val[i]);
return tmp;
}
@ -508,7 +508,7 @@ inline SIMD<float, 8> vabs(SIMD<float, 8> a)
{
SIMD<float, 8> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vabsq_f32(a.m[i]);
tmp.m.val[i] = vabsq_f32(a.m.val[i]);
return tmp;
}
@ -517,7 +517,7 @@ inline SIMD<int8_t, 32> vqabs(SIMD<int8_t, 32> a)
{
SIMD<int8_t, 32> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vqabsq_s8(a.m[i]);
tmp.m.val[i] = vqabsq_s8(a.m.val[i]);
return tmp;
}
@ -526,7 +526,7 @@ inline SIMD<int16_t, 16> vqabs(SIMD<int16_t, 16> a)
{
SIMD<int16_t, 16> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vqabsq_s16(a.m[i]);
tmp.m.val[i] = vqabsq_s16(a.m.val[i]);
return tmp;
}
@ -535,9 +535,9 @@ inline SIMD<float, 8> vsignum(SIMD<float, 8> a)
{
SIMD<float, 8> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = (float32x4_t)vbicq_u32(
veorq_u32((uint32x4_t)vdupq_n_f32(1.f), vandq_u32((uint32x4_t)vdupq_n_f32(-0.f), (uint32x4_t)a.m[i])),
vceqq_f32(a.m[i], vdupq_n_f32(0.f)));
tmp.m.val[i] = (float32x4_t)vbicq_u32(
veorq_u32((uint32x4_t)vdupq_n_f32(1.f), vandq_u32((uint32x4_t)vdupq_n_f32(-0.f), (uint32x4_t)a.m.val[i])),
vceqq_f32(a.m.val[i], vdupq_n_f32(0.f)));
return tmp;
}
@ -546,8 +546,8 @@ inline SIMD<int8_t, 32> vsignum(SIMD<int8_t, 32> a)
{
SIMD<int8_t, 32> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = (int8x16_t)vorrq_u8(vcgtq_s8(vdupq_n_s8(0), a.m[i]),
vandq_u8(vcgtq_s8(a.m[i], vdupq_n_s8(0)), (uint8x16_t)vdupq_n_s8(1)));
tmp.m.val[i] = (int8x16_t)vorrq_u8(vcgtq_s8(vdupq_n_s8(0), a.m.val[i]),
vandq_u8(vcgtq_s8(a.m.val[i], vdupq_n_s8(0)), (uint8x16_t)vdupq_n_s8(1)));
return tmp;
}
@ -556,9 +556,9 @@ inline SIMD<float, 8> vsign(SIMD<float, 8> a, SIMD<float, 8> b)
{
SIMD<float, 8> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = (float32x4_t)vbicq_u32(
veorq_u32((uint32x4_t)a.m[i], vandq_u32((uint32x4_t)vdupq_n_f32(-0.f), (uint32x4_t)b.m[i])),
vceqq_f32(b.m[i], vdupq_n_f32(0.f)));
tmp.m.val[i] = (float32x4_t)vbicq_u32(
veorq_u32((uint32x4_t)a.m.val[i], vandq_u32((uint32x4_t)vdupq_n_f32(-0.f), (uint32x4_t)b.m.val[i])),
vceqq_f32(b.m.val[i], vdupq_n_f32(0.f)));
return tmp;
}
@ -567,9 +567,9 @@ inline SIMD<int8_t, 32> vsign(SIMD<int8_t, 32> a, SIMD<int8_t, 32> b)
{
SIMD<int8_t, 32> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = (int8x16_t)vorrq_u8(
vandq_u8(vcgtq_s8(vdupq_n_s8(0), b.m[i]), (uint8x16_t)vnegq_s8(a.m[i])),
vandq_u8(vcgtq_s8(b.m[i], vdupq_n_s8(0)), (uint8x16_t)a.m[i]));
tmp.m.val[i] = (int8x16_t)vorrq_u8(
vandq_u8(vcgtq_s8(vdupq_n_s8(0), b.m.val[i]), (uint8x16_t)vnegq_s8(a.m.val[i])),
vandq_u8(vcgtq_s8(b.m.val[i], vdupq_n_s8(0)), (uint8x16_t)a.m.val[i]));
return tmp;
}
@ -578,9 +578,9 @@ inline SIMD<float, 8> vcopysign(SIMD<float, 8> a, SIMD<float, 8> b)
{
SIMD<float, 8> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = (float32x4_t)vorrq_u32(
vbicq_u32((uint32x4_t)a.m[i], (uint32x4_t)vdupq_n_f32(-0.f)),
vandq_u32((uint32x4_t)b.m[i], (uint32x4_t)vdupq_n_f32(-0.f)));
tmp.m.val[i] = (float32x4_t)vorrq_u32(
vbicq_u32((uint32x4_t)a.m.val[i], (uint32x4_t)vdupq_n_f32(-0.f)),
vandq_u32((uint32x4_t)b.m.val[i], (uint32x4_t)vdupq_n_f32(-0.f)));
return tmp;
}
@ -589,7 +589,7 @@ inline SIMD<uint8_t, 32> vorr(SIMD<uint8_t, 32> a, SIMD<uint8_t, 32> b)
{
SIMD<uint8_t, 32> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vorrq_u8(a.m[i], b.m[i]);
tmp.m.val[i] = vorrq_u8(a.m.val[i], b.m.val[i]);
return tmp;
}
@ -598,7 +598,7 @@ inline SIMD<uint16_t, 16> vorr(SIMD<uint16_t, 16> a, SIMD<uint16_t, 16> b)
{
SIMD<uint16_t, 16> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vorrq_u16(a.m[i], b.m[i]);
tmp.m.val[i] = vorrq_u16(a.m.val[i], b.m.val[i]);
return tmp;
}
@ -607,7 +607,7 @@ inline SIMD<uint32_t, 8> vorr(SIMD<uint32_t, 8> a, SIMD<uint32_t, 8> b)
{
SIMD<uint32_t, 8> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vorrq_u32(a.m[i], b.m[i]);
tmp.m.val[i] = vorrq_u32(a.m.val[i], b.m.val[i]);
return tmp;
}
@ -616,7 +616,7 @@ inline SIMD<uint64_t, 4> vorr(SIMD<uint64_t, 4> a, SIMD<uint64_t, 4> b)
{
SIMD<uint64_t, 4> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vorrq_u64(a.m[i], b.m[i]);
tmp.m.val[i] = vorrq_u64(a.m.val[i], b.m.val[i]);
return tmp;
}
@ -625,7 +625,7 @@ inline SIMD<uint8_t, 32> vand(SIMD<uint8_t, 32> a, SIMD<uint8_t, 32> b)
{
SIMD<uint8_t, 32> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vandq_u8(a.m[i], b.m[i]);
tmp.m.val[i] = vandq_u8(a.m.val[i], b.m.val[i]);
return tmp;
}
@ -634,7 +634,7 @@ inline SIMD<uint16_t, 16> vand(SIMD<uint16_t, 16> a, SIMD<uint16_t, 16> b)
{
SIMD<uint16_t, 16> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vandq_u16(a.m[i], b.m[i]);
tmp.m.val[i] = vandq_u16(a.m.val[i], b.m.val[i]);
return tmp;
}
@ -643,7 +643,7 @@ inline SIMD<uint32_t, 8> vand(SIMD<uint32_t, 8> a, SIMD<uint32_t, 8> b)
{
SIMD<uint32_t, 8> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vandq_u32(a.m[i], b.m[i]);
tmp.m.val[i] = vandq_u32(a.m.val[i], b.m.val[i]);
return tmp;
}
@ -652,7 +652,7 @@ inline SIMD<uint64_t, 4> vand(SIMD<uint64_t, 4> a, SIMD<uint64_t, 4> b)
{
SIMD<uint64_t, 4> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vandq_u64(a.m[i], b.m[i]);
tmp.m.val[i] = vandq_u64(a.m.val[i], b.m.val[i]);
return tmp;
}
@ -661,7 +661,7 @@ inline SIMD<uint8_t, 32> veor(SIMD<uint8_t, 32> a, SIMD<uint8_t, 32> b)
{
SIMD<uint8_t, 32> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = veorq_u8(a.m[i], b.m[i]);
tmp.m.val[i] = veorq_u8(a.m.val[i], b.m.val[i]);
return tmp;
}
@ -670,7 +670,7 @@ inline SIMD<uint16_t, 16> veor(SIMD<uint16_t, 16> a, SIMD<uint16_t, 16> b)
{
SIMD<uint16_t, 16> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = veorq_u16(a.m[i], b.m[i]);
tmp.m.val[i] = veorq_u16(a.m.val[i], b.m.val[i]);
return tmp;
}
@ -679,7 +679,7 @@ inline SIMD<uint32_t, 8> veor(SIMD<uint32_t, 8> a, SIMD<uint32_t, 8> b)
{
SIMD<uint32_t, 8> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = veorq_u32(a.m[i], b.m[i]);
tmp.m.val[i] = veorq_u32(a.m.val[i], b.m.val[i]);
return tmp;
}
@ -688,7 +688,7 @@ inline SIMD<uint64_t, 4> veor(SIMD<uint64_t, 4> a, SIMD<uint64_t, 4> b)
{
SIMD<uint64_t, 4> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = veorq_u64(a.m[i], b.m[i]);
tmp.m.val[i] = veorq_u64(a.m.val[i], b.m.val[i]);
return tmp;
}
@ -697,7 +697,7 @@ inline SIMD<uint8_t, 32> vbic(SIMD<uint8_t, 32> a, SIMD<uint8_t, 32> b)
{
SIMD<uint8_t, 32> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vbicq_u8(a.m[i], b.m[i]);
tmp.m.val[i] = vbicq_u8(a.m.val[i], b.m.val[i]);
return tmp;
}
@ -706,7 +706,7 @@ inline SIMD<uint16_t, 16> vbic(SIMD<uint16_t, 16> a, SIMD<uint16_t, 16> b)
{
SIMD<uint16_t, 16> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vbicq_u16(a.m[i], b.m[i]);
tmp.m.val[i] = vbicq_u16(a.m.val[i], b.m.val[i]);
return tmp;
}
@ -715,7 +715,7 @@ inline SIMD<uint32_t, 8> vbic(SIMD<uint32_t, 8> a, SIMD<uint32_t, 8> b)
{
SIMD<uint32_t, 8> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vbicq_u32(a.m[i], b.m[i]);
tmp.m.val[i] = vbicq_u32(a.m.val[i], b.m.val[i]);
return tmp;
}
@ -724,7 +724,7 @@ inline SIMD<uint64_t, 4> vbic(SIMD<uint64_t, 4> a, SIMD<uint64_t, 4> b)
{
SIMD<uint64_t, 4> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vbicq_u64(a.m[i], b.m[i]);
tmp.m.val[i] = vbicq_u64(a.m.val[i], b.m.val[i]);
return tmp;
}
@ -733,7 +733,7 @@ inline SIMD<uint8_t, 32> vbsl(SIMD<uint8_t, 32> a, SIMD<uint8_t, 32> b, SIMD<uin
{
SIMD<uint8_t, 32> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vbslq_u8(a.m[i], b.m[i], c.m[i]);
tmp.m.val[i] = vbslq_u8(a.m.val[i], b.m.val[i], c.m.val[i]);
return tmp;
}
@ -742,7 +742,7 @@ inline SIMD<uint16_t, 16> vbsl(SIMD<uint16_t, 16> a, SIMD<uint16_t, 16> b, SIMD<
{
SIMD<uint16_t, 16> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vbslq_u16(a.m[i], b.m[i], c.m[i]);
tmp.m.val[i] = vbslq_u16(a.m.val[i], b.m.val[i], c.m.val[i]);
return tmp;
}
@ -751,7 +751,7 @@ inline SIMD<uint32_t, 8> vbsl(SIMD<uint32_t, 8> a, SIMD<uint32_t, 8> b, SIMD<uin
{
SIMD<uint32_t, 8> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vbslq_u32(a.m[i], b.m[i], c.m[i]);
tmp.m.val[i] = vbslq_u32(a.m.val[i], b.m.val[i], c.m.val[i]);
return tmp;
}
@ -760,7 +760,7 @@ inline SIMD<uint64_t, 4> vbsl(SIMD<uint64_t, 4> a, SIMD<uint64_t, 4> b, SIMD<uin
{
SIMD<uint64_t, 4> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vbslq_u64(a.m[i], b.m[i], c.m[i]);
tmp.m.val[i] = vbslq_u64(a.m.val[i], b.m.val[i], c.m.val[i]);
return tmp;
}
@ -769,7 +769,7 @@ inline SIMD<uint32_t, 8> vceqz(SIMD<float, 8> a)
{
SIMD<uint32_t, 8> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vceqq_f32(a.m[i], vdupq_n_f32(0.f));
tmp.m.val[i] = vceqq_f32(a.m.val[i], vdupq_n_f32(0.f));
return tmp;
}
@ -778,7 +778,7 @@ inline SIMD<uint8_t, 32> vceqz(SIMD<int8_t, 32> a)
{
SIMD<uint8_t, 32> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vceqq_s8(a.m[i], vdupq_n_s8(0));
tmp.m.val[i] = vceqq_s8(a.m.val[i], vdupq_n_s8(0));
return tmp;
}
@ -787,7 +787,7 @@ inline SIMD<uint16_t, 16> vceqz(SIMD<int16_t, 16> a)
{
SIMD<uint16_t, 16> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vceqq_s16(a.m[i], vdupq_n_s16(0));
tmp.m.val[i] = vceqq_s16(a.m.val[i], vdupq_n_s16(0));
return tmp;
}
@ -796,7 +796,7 @@ inline SIMD<uint32_t, 8> vceqz(SIMD<int32_t, 8> a)
{
SIMD<uint32_t, 8> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vceqq_s32(a.m[i], vdupq_n_s32(0));
tmp.m.val[i] = vceqq_s32(a.m.val[i], vdupq_n_s32(0));
return tmp;
}
@ -805,7 +805,7 @@ inline SIMD<uint32_t, 8> vceq(SIMD<float, 8> a, SIMD<float, 8> b)
{
SIMD<uint32_t, 8> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vceqq_f32(a.m[i], b.m[i]);
tmp.m.val[i] = vceqq_f32(a.m.val[i], b.m.val[i]);
return tmp;
}
@ -814,7 +814,7 @@ inline SIMD<uint8_t, 32> vceq(SIMD<int8_t, 32> a, SIMD<int8_t, 32> b)
{
SIMD<uint8_t, 32> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vceqq_s8(a.m[i], b.m[i]);
tmp.m.val[i] = vceqq_s8(a.m.val[i], b.m.val[i]);
return tmp;
}
@ -823,7 +823,7 @@ inline SIMD<uint16_t, 16> vceq(SIMD<int16_t, 16> a, SIMD<int16_t, 16> b)
{
SIMD<uint16_t, 16> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vceqq_s16(a.m[i], b.m[i]);
tmp.m.val[i] = vceqq_s16(a.m.val[i], b.m.val[i]);
return tmp;
}
@ -832,7 +832,7 @@ inline SIMD<uint32_t, 8> vceq(SIMD<int32_t, 8> a, SIMD<int32_t, 8> b)
{
SIMD<uint32_t, 8> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vceqq_s32(a.m[i], b.m[i]);
tmp.m.val[i] = vceqq_s32(a.m.val[i], b.m.val[i]);
return tmp;
}
@ -841,7 +841,7 @@ inline SIMD<uint32_t, 8> vcgtz(SIMD<float, 8> a)
{
SIMD<uint32_t, 8> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vcgtq_f32(a.m[i], vdupq_n_f32(0.f));
tmp.m.val[i] = vcgtq_f32(a.m.val[i], vdupq_n_f32(0.f));
return tmp;
}
@ -850,7 +850,7 @@ inline SIMD<uint8_t, 32> vcgtz(SIMD<int8_t, 32> a)
{
SIMD<uint8_t, 32> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vcgtq_s8(a.m[i], vdupq_n_s8(0));
tmp.m.val[i] = vcgtq_s8(a.m.val[i], vdupq_n_s8(0));
return tmp;
}
@ -859,7 +859,7 @@ inline SIMD<uint16_t, 16> vcgtz(SIMD<int16_t, 16> a)
{
SIMD<uint16_t, 16> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vcgtq_s16(a.m[i], vdupq_n_s16(0));
tmp.m.val[i] = vcgtq_s16(a.m.val[i], vdupq_n_s16(0));
return tmp;
}
@ -868,7 +868,7 @@ inline SIMD<uint32_t, 8> vcgtz(SIMD<int32_t, 8> a)
{
SIMD<uint32_t, 8> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vcgtq_s32(a.m[i], vdupq_n_s32(0));
tmp.m.val[i] = vcgtq_s32(a.m.val[i], vdupq_n_s32(0));
return tmp;
}
@ -877,7 +877,7 @@ inline SIMD<uint32_t, 8> vcltz(SIMD<float, 8> a)
{
SIMD<uint32_t, 8> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vcltq_f32(a.m[i], vdupq_n_f32(0.f));
tmp.m.val[i] = vcltq_f32(a.m.val[i], vdupq_n_f32(0.f));
return tmp;
}
@ -886,7 +886,7 @@ inline SIMD<uint8_t, 32> vcltz(SIMD<int8_t, 32> a)
{
SIMD<uint8_t, 32> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vcltq_s8(a.m[i], vdupq_n_s8(0));
tmp.m.val[i] = vcltq_s8(a.m.val[i], vdupq_n_s8(0));
return tmp;
}
@ -895,7 +895,7 @@ inline SIMD<uint16_t, 16> vcltz(SIMD<int16_t, 16> a)
{
SIMD<uint16_t, 16> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vcltq_s16(a.m[i], vdupq_n_s16(0));
tmp.m.val[i] = vcltq_s16(a.m.val[i], vdupq_n_s16(0));
return tmp;
}
@ -904,7 +904,7 @@ inline SIMD<uint32_t, 8> vcltz(SIMD<int32_t, 8> a)
{
SIMD<uint32_t, 8> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vcltq_s32(a.m[i], vdupq_n_s32(0));
tmp.m.val[i] = vcltq_s32(a.m.val[i], vdupq_n_s32(0));
return tmp;
}
@ -913,7 +913,7 @@ inline SIMD<uint32_t, 8> vclez(SIMD<float, 8> a)
{
SIMD<uint32_t, 8> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vcleq_f32(a.m[i], vdupq_n_f32(0.f));
tmp.m.val[i] = vcleq_f32(a.m.val[i], vdupq_n_f32(0.f));
return tmp;
}
@ -922,7 +922,7 @@ inline SIMD<uint8_t, 32> vclez(SIMD<int8_t, 32> a)
{
SIMD<uint8_t, 32> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vcleq_s8(a.m[i], vdupq_n_s8(0));
tmp.m.val[i] = vcleq_s8(a.m.val[i], vdupq_n_s8(0));
return tmp;
}
@ -931,7 +931,7 @@ inline SIMD<uint16_t, 16> vclez(SIMD<int16_t, 16> a)
{
SIMD<uint16_t, 16> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vcleq_s16(a.m[i], vdupq_n_s16(0));
tmp.m.val[i] = vcleq_s16(a.m.val[i], vdupq_n_s16(0));
return tmp;
}
@ -940,7 +940,7 @@ inline SIMD<uint32_t, 8> vclez(SIMD<int32_t, 8> a)
{
SIMD<uint32_t, 8> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vcleq_s32(a.m[i], vdupq_n_s32(0));
tmp.m.val[i] = vcleq_s32(a.m.val[i], vdupq_n_s32(0));
return tmp;
}
@ -949,7 +949,7 @@ inline SIMD<float, 8> vmin(SIMD<float, 8> a, SIMD<float, 8> b)
{
SIMD<float, 8> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vminq_f32(a.m[i], b.m[i]);
tmp.m.val[i] = vminq_f32(a.m.val[i], b.m.val[i]);
return tmp;
}
@ -958,7 +958,7 @@ inline SIMD<int8_t, 32> vmin(SIMD<int8_t, 32> a, SIMD<int8_t, 32> b)
{
SIMD<int8_t, 32> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vminq_s8(a.m[i], b.m[i]);
tmp.m.val[i] = vminq_s8(a.m.val[i], b.m.val[i]);
return tmp;
}
@ -967,7 +967,7 @@ inline SIMD<int16_t, 16> vmin(SIMD<int16_t, 16> a, SIMD<int16_t, 16> b)
{
SIMD<int16_t, 16> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vminq_s16(a.m[i], b.m[i]);
tmp.m.val[i] = vminq_s16(a.m.val[i], b.m.val[i]);
return tmp;
}
@ -976,7 +976,7 @@ inline SIMD<int32_t, 8> vmin(SIMD<int32_t, 8> a, SIMD<int32_t, 8> b)
{
SIMD<int32_t, 8> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vminq_s32(a.m[i], b.m[i]);
tmp.m.val[i] = vminq_s32(a.m.val[i], b.m.val[i]);
return tmp;
}
@ -985,7 +985,7 @@ inline SIMD<float, 8> vmax(SIMD<float, 8> a, SIMD<float, 8> b)
{
SIMD<float, 8> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vmaxq_f32(a.m[i], b.m[i]);
tmp.m.val[i] = vmaxq_f32(a.m.val[i], b.m.val[i]);
return tmp;
}
@ -994,7 +994,7 @@ inline SIMD<int8_t, 32> vmax(SIMD<int8_t, 32> a, SIMD<int8_t, 32> b)
{
SIMD<int8_t, 32> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vmaxq_s8(a.m[i], b.m[i]);
tmp.m.val[i] = vmaxq_s8(a.m.val[i], b.m.val[i]);
return tmp;
}
@ -1003,7 +1003,7 @@ inline SIMD<int16_t, 16> vmax(SIMD<int16_t, 16> a, SIMD<int16_t, 16> b)
{
SIMD<int16_t, 16> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vmaxq_s16(a.m[i], b.m[i]);
tmp.m.val[i] = vmaxq_s16(a.m.val[i], b.m.val[i]);
return tmp;
}
@ -1012,7 +1012,7 @@ inline SIMD<int32_t, 8> vmax(SIMD<int32_t, 8> a, SIMD<int32_t, 8> b)
{
SIMD<int32_t, 8> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vmaxq_s32(a.m[i], b.m[i]);
tmp.m.val[i] = vmaxq_s32(a.m.val[i], b.m.val[i]);
return tmp;
}
@ -1021,7 +1021,7 @@ inline SIMD<float, 8> vclamp(SIMD<float, 8> x, float a, float b)
{
SIMD<float, 8> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vminq_f32(vmaxq_f32(x.m[i], vdupq_n_f32(a)), vdupq_n_f32(b));
tmp.m.val[i] = vminq_f32(vmaxq_f32(x.m.val[i], vdupq_n_f32(a)), vdupq_n_f32(b));
return tmp;
}
@ -1030,7 +1030,7 @@ inline SIMD<int8_t, 32> vclamp(SIMD<int8_t, 32> x, int8_t a, int8_t b)
{
SIMD<int8_t, 32> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vminq_s8(vmaxq_s8(x.m[i], vdupq_n_s8(a)), vdupq_n_s8(b));
tmp.m.val[i] = vminq_s8(vmaxq_s8(x.m.val[i], vdupq_n_s8(a)), vdupq_n_s8(b));
return tmp;
}
@ -1039,7 +1039,7 @@ inline SIMD<int16_t, 16> vclamp(SIMD<int16_t, 16> x, int16_t a, int16_t b)
{
SIMD<int16_t, 16> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vminq_s16(vmaxq_s16(x.m[i], vdupq_n_s16(a)), vdupq_n_s16(b));
tmp.m.val[i] = vminq_s16(vmaxq_s16(x.m.val[i], vdupq_n_s16(a)), vdupq_n_s16(b));
return tmp;
}
@ -1048,7 +1048,7 @@ inline SIMD<int32_t, 8> vclamp(SIMD<int32_t, 8> x, int32_t a, int32_t b)
{
SIMD<int32_t, 8> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vminq_s32(vmaxq_s32(x.m[i], vdupq_n_s32(a)), vdupq_n_s32(b));
tmp.m.val[i] = vminq_s32(vmaxq_s32(x.m.val[i], vdupq_n_s32(a)), vdupq_n_s32(b));
return tmp;
}
@ -1058,7 +1058,7 @@ inline SIMD<uint8_t, 32> vshuf(SIMD<uint8_t, 32> a, SIMD<uint8_t, 32> b)
{
SIMD<uint8_t, 32> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vorrq_u8(vqtbl1q_u8(a.m[0], b.m[i]), vqtbl1q_u8(a.m[1], vsubq_u8(b.m[i], vdupq_n_u8(16))));
tmp.m.val[i] = vorrq_u8(vqtbl1q_u8(a.m.val[0], b.m.val[i]), vqtbl1q_u8(a.m.val[1], vsubq_u8(b.m.val[i], vdupq_n_u8(16))));
return tmp;
}
@ -1067,7 +1067,7 @@ inline SIMD<int8_t, 32> vshuf(SIMD<int8_t, 32> a, SIMD<uint8_t, 32> b)
{
SIMD<int8_t, 32> tmp;
for (int i = 0; i < 2; ++i)
tmp.m[i] = vorrq_s8(vqtbl1q_s8(a.m[0], b.m[i]), vqtbl1q_s8(a.m[1], vsubq_u8(b.m[i], vdupq_n_u8(16))));
tmp.m.val[i] = vorrq_s8(vqtbl1q_s8(a.m.val[0], b.m.val[i]), vqtbl1q_s8(a.m.val[1], vsubq_u8(b.m.val[i], vdupq_n_u8(16))));
return tmp;
}
#endif