diff --git a/neon.hh b/neon.hh
index 6fe4aa6..e7b235e 100644
--- a/neon.hh
+++ b/neon.hh
@@ -953,10 +953,14 @@ template <>
 inline SIMD vshuf(SIMD a, SIMD b)
 {
 	SIMD tmp;
+#ifdef __aarch64__
+	tmp.m = vqtbl1q_u8(a.m, b.m);
+#else
 	uint8x8x2_t c { vget_low_u8(a.m), vget_high_u8(a.m) };
 	uint8x8_t d = vtbl2_u8(c, vget_low_u8(b.m));
 	uint8x8_t e = vtbl2_u8(c, vget_high_u8(b.m));
 	tmp.m = vcombine_u8(d, e);
+#endif
 	return tmp;
 }
 
@@ -964,10 +968,14 @@ template <>
 inline SIMD vshuf(SIMD a, SIMD b)
 {
 	SIMD tmp;
+#ifdef __aarch64__
+	tmp.m = vqtbl1q_s8(a.m, b.m);
+#else
 	int8x8x2_t c { vget_low_s8(a.m), vget_high_s8(a.m) };
 	int8x8_t d = vtbl2_s8(c, vget_low_s8((int8x16_t)b.m));
 	int8x8_t e = vtbl2_s8(c, vget_high_s8((int8x16_t)b.m));
 	tmp.m = vcombine_s8(d, e);
+#endif
 	return tmp;
 }
 
diff --git a/rotate.hh b/rotate.hh
index 93fc188..9275a96 100644
--- a/rotate.hh
+++ b/rotate.hh
@@ -51,11 +51,15 @@ public:
 	{
 		if (s < 0)
 			s += WIDTH;
+		TYPE ret;
+#ifdef __aarch64__
+		ret.m = vqtbl1q_s8(a.m, vunsigned(rot[s]).m);
+#else
 		int8x8x2_t b { vget_low_s8(a.m), vget_high_s8(a.m) };
 		int8x8_t c = vtbl2_s8(b, vget_low_s8(rot[s].m));
 		int8x8_t d = vtbl2_s8(b, vget_high_s8(rot[s].m));
-		TYPE ret;
 		ret.m = vcombine_s8(c, d);
+#endif
 		return ret;
 	}
 };