mirror of
https://github.com/aicodix/code.git
synced 2026-04-27 14:30:36 +00:00
Use the faster vqtbl1q_u8 / vqtbl1q_s8 table lookups on ARM64
This commit is contained in:
parent
986059694e
commit
2b7645ced3
2 changed files with 13 additions and 1 deletions
8
neon.hh
8
neon.hh
|
|
@ -953,10 +953,14 @@ template <>
|
||||||
// Byte shuffle for unsigned lanes: each of the 16 index bytes in `b` selects
// one byte out of `a`.
// On AArch64 the whole 128-bit lookup is a single vqtbl1q_u8. On other ARM
// targets `a` is split into a two-register 64-bit table and the lower and
// upper halves of `b` are looked up separately with vtbl2_u8, then rejoined.
inline SIMD<uint8_t, 16> vshuf(SIMD<uint8_t, 16> a, SIMD<uint8_t, 16> b)
{
	SIMD<uint8_t, 16> result;
#if defined(__aarch64__)
	result.m = vqtbl1q_u8(a.m, b.m);
#else
	// 16-byte table made of the two 64-bit halves of `a`.
	const uint8x8x2_t table = { { vget_low_u8(a.m), vget_high_u8(a.m) } };
	const uint8x8_t lower = vtbl2_u8(table, vget_low_u8(b.m));
	const uint8x8_t upper = vtbl2_u8(table, vget_high_u8(b.m));
	result.m = vcombine_u8(lower, upper);
#endif
	return result;
}
|
||||||
|
|
||||||
|
|
@ -964,10 +968,14 @@ template <>
|
||||||
// Byte shuffle for signed lanes: each of the 16 unsigned index bytes in `b`
// selects one byte out of `a`.
// On AArch64 the whole 128-bit lookup is a single vqtbl1q_s8, which takes the
// unsigned index vector directly. On other ARM targets `a` is split into a
// two-register 64-bit table and the two halves of `b` are looked up with
// vtbl2_s8, which expects signed index vectors.
inline SIMD<int8_t, 16> vshuf(SIMD<int8_t, 16> a, SIMD<uint8_t, 16> b)
{
	SIMD<int8_t, 16> result;
#if defined(__aarch64__)
	result.m = vqtbl1q_s8(a.m, b.m);
#else
	// Reinterpret the index bits as signed via the ACLE intrinsic instead of a
	// C-style vector cast, which is a compiler extension; the bits are unchanged.
	const int8x16_t index = vreinterpretq_s8_u8(b.m);
	// 16-byte table made of the two 64-bit halves of `a`.
	const int8x8x2_t table = { { vget_low_s8(a.m), vget_high_s8(a.m) } };
	const int8x8_t lower = vtbl2_s8(table, vget_low_s8(index));
	const int8x8_t upper = vtbl2_s8(table, vget_high_s8(index));
	result.m = vcombine_s8(lower, upper);
#endif
	return result;
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -51,11 +51,15 @@ public:
|
||||||
{
|
{
|
||||||
if (s < 0)
|
if (s < 0)
|
||||||
s += WIDTH;
|
s += WIDTH;
|
||||||
|
TYPE ret;
|
||||||
|
#ifdef __aarch64__
|
||||||
|
ret.m = vqtbl1q_s8(a.m, vunsigned(rot[s]).m);
|
||||||
|
#else
|
||||||
int8x8x2_t b { vget_low_s8(a.m), vget_high_s8(a.m) };
|
int8x8x2_t b { vget_low_s8(a.m), vget_high_s8(a.m) };
|
||||||
int8x8_t c = vtbl2_s8(b, vget_low_s8(rot[s].m));
|
int8x8_t c = vtbl2_s8(b, vget_low_s8(rot[s].m));
|
||||||
int8x8_t d = vtbl2_s8(b, vget_high_s8(rot[s].m));
|
int8x8_t d = vtbl2_s8(b, vget_high_s8(rot[s].m));
|
||||||
TYPE ret;
|
|
||||||
ret.m = vcombine_s8(c, d);
|
ret.m = vcombine_s8(c, d);
|
||||||
|
#endif
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue