mirror of
https://github.com/aicodix/code.git
synced 2026-04-27 14:30:36 +00:00
added vshuf()
This commit is contained in:
parent
369decd8c3
commit
778d51ac0f
4 changed files with 160 additions and 0 deletions
32
avx2.hh
32
avx2.hh
|
|
@ -1134,3 +1134,35 @@ inline SIMD<int64_t, 4> vclamp(SIMD<int64_t, 4> x, int64_t a, int64_t b)
|
||||||
return tmp;
|
return tmp;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <>
|
||||||
|
inline SIMD<uint8_t, 32> vshuf(SIMD<uint8_t, 32> a, SIMD<uint8_t, 32> b)
|
||||||
|
{
|
||||||
|
SIMD<uint8_t, 32> tmp;
|
||||||
|
__m256i c = _mm256_sub_epi8(b.m, _mm256_set1_epi8(16));
|
||||||
|
__m256i d = _mm256_or_si256(b.m, _mm256_cmpgt_epi8(b.m, _mm256_set1_epi8(15)));
|
||||||
|
__m256i e = _mm256_shuffle_epi8(_mm256_permute2x128_si256(a.m, a.m, 0), d);
|
||||||
|
__m256i f = _mm256_shuffle_epi8(_mm256_permute2x128_si256(a.m, a.m, 17), c);
|
||||||
|
tmp.m = _mm256_or_si256(e, f);
|
||||||
|
return tmp;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <>
|
||||||
|
inline SIMD<int8_t, 32> vshuf(SIMD<int8_t, 32> a, SIMD<uint8_t, 32> b)
|
||||||
|
{
|
||||||
|
SIMD<int8_t, 32> tmp;
|
||||||
|
__m256i c = _mm256_sub_epi8(b.m, _mm256_set1_epi8(16));
|
||||||
|
__m256i d = _mm256_or_si256(b.m, _mm256_cmpgt_epi8(b.m, _mm256_set1_epi8(15)));
|
||||||
|
__m256i e = _mm256_shuffle_epi8(_mm256_permute2x128_si256(a.m, a.m, 0), d);
|
||||||
|
__m256i f = _mm256_shuffle_epi8(_mm256_permute2x128_si256(a.m, a.m, 17), c);
|
||||||
|
tmp.m = _mm256_or_si256(e, f);
|
||||||
|
return tmp;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <>
|
||||||
|
inline SIMD<float, 8> vshuf(SIMD<float, 8> a, SIMD<uint32_t, 8> b)
|
||||||
|
{
|
||||||
|
SIMD<float, 8> tmp;
|
||||||
|
tmp.m = _mm256_permutevar8x32_ps(a.m, b.m);
|
||||||
|
return tmp;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
|
||||||
22
neon.hh
22
neon.hh
|
|
@ -938,3 +938,25 @@ inline SIMD<int32_t, 4> vclamp(SIMD<int32_t, 4> x, int32_t a, int32_t b)
|
||||||
return tmp;
|
return tmp;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <>
inline SIMD<uint8_t, 16> vshuf(SIMD<uint8_t, 16> a, SIMD<uint8_t, 16> b)
{
	// Byte shuffle: tmp[i] = a[b[i]] for b[i] < 16, else 0.  VTBL2
	// already returns zero for indices beyond the 16-byte table, so this
	// matches the generic scalar fallback with no extra masking.
	SIMD<uint8_t, 16> tmp;
	uint8x8x2_t table { vget_low_u8(a.m), vget_high_u8(a.m) };
	// Look up the two halves of the index vector independently.
	uint8x8_t lower = vtbl2_u8(table, vget_low_u8(b.m));
	uint8x8_t upper = vtbl2_u8(table, vget_high_u8(b.m));
	tmp.m = vcombine_u8(lower, upper);
	return tmp;
}
|
||||||
|
|
||||||
|
template <>
inline SIMD<int8_t, 16> vshuf(SIMD<int8_t, 16> a, SIMD<uint8_t, 16> b)
{
	// Signed-byte shuffle: tmp[i] = a[b[i]] for b[i] < 16, else 0.
	// VTBL2 returns zero for indices beyond the 16-byte table, matching
	// the generic scalar fallback.
	SIMD<int8_t, 16> tmp;
	int8x8x2_t table { vget_low_s8(a.m), vget_high_s8(a.m) };
	// Reinterpret the unsigned indices for the signed table lookup.
	int8x16_t index = (int8x16_t)b.m;
	int8x8_t lower = vtbl2_s8(table, vget_low_s8(index));
	int8x8_t upper = vtbl2_s8(table, vget_high_s8(index));
	tmp.m = vcombine_s8(lower, upper);
	return tmp;
}
|
||||||
|
|
||||||
|
|
|
||||||
90
simd.hh
90
simd.hh
|
|
@ -1387,6 +1387,96 @@ static inline SIMD<int64_t, WIDTH> vsign(SIMD<int64_t, WIDTH> a, SIMD<int64_t, W
|
||||||
return tmp;
|
return tmp;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <int WIDTH>
|
||||||
|
static inline SIMD<uint8_t, WIDTH> vshuf(SIMD<uint8_t, WIDTH> a, SIMD<uint8_t, WIDTH> b)
|
||||||
|
{
|
||||||
|
SIMD<uint8_t, WIDTH> tmp;
|
||||||
|
for (int i = 0; i < WIDTH; ++i)
|
||||||
|
tmp.v[i] = b.v[i] < WIDTH ? a.v[b.v[i]] : 0;
|
||||||
|
return tmp;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <int WIDTH>
|
||||||
|
static inline SIMD<uint16_t, WIDTH> vshuf(SIMD<uint16_t, WIDTH> a, SIMD<uint16_t, WIDTH> b)
|
||||||
|
{
|
||||||
|
SIMD<uint16_t, WIDTH> tmp;
|
||||||
|
for (int i = 0; i < WIDTH; ++i)
|
||||||
|
tmp.v[i] = b.v[i] < WIDTH ? a.v[b.v[i]] : 0;
|
||||||
|
return tmp;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <int WIDTH>
|
||||||
|
static inline SIMD<uint32_t, WIDTH> vshuf(SIMD<uint32_t, WIDTH> a, SIMD<uint32_t, WIDTH> b)
|
||||||
|
{
|
||||||
|
SIMD<uint32_t, WIDTH> tmp;
|
||||||
|
for (int i = 0; i < WIDTH; ++i)
|
||||||
|
tmp.v[i] = b.v[i] < WIDTH ? a.v[b.v[i]] : 0;
|
||||||
|
return tmp;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <int WIDTH>
|
||||||
|
static inline SIMD<uint64_t, WIDTH> vshuf(SIMD<uint64_t, WIDTH> a, SIMD<uint64_t, WIDTH> b)
|
||||||
|
{
|
||||||
|
SIMD<uint64_t, WIDTH> tmp;
|
||||||
|
for (int i = 0; i < WIDTH; ++i)
|
||||||
|
tmp.v[i] = b.v[i] < WIDTH ? a.v[b.v[i]] : 0;
|
||||||
|
return tmp;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <int WIDTH>
|
||||||
|
static inline SIMD<int8_t, WIDTH> vshuf(SIMD<int8_t, WIDTH> a, SIMD<uint8_t, WIDTH> b)
|
||||||
|
{
|
||||||
|
SIMD<int8_t, WIDTH> tmp;
|
||||||
|
for (int i = 0; i < WIDTH; ++i)
|
||||||
|
tmp.v[i] = b.v[i] < WIDTH ? a.v[b.v[i]] : 0;
|
||||||
|
return tmp;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <int WIDTH>
|
||||||
|
static inline SIMD<int16_t, WIDTH> vshuf(SIMD<int16_t, WIDTH> a, SIMD<uint16_t, WIDTH> b)
|
||||||
|
{
|
||||||
|
SIMD<int16_t, WIDTH> tmp;
|
||||||
|
for (int i = 0; i < WIDTH; ++i)
|
||||||
|
tmp.v[i] = b.v[i] < WIDTH ? a.v[b.v[i]] : 0;
|
||||||
|
return tmp;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <int WIDTH>
|
||||||
|
static inline SIMD<int32_t, WIDTH> vshuf(SIMD<int32_t, WIDTH> a, SIMD<uint32_t, WIDTH> b)
|
||||||
|
{
|
||||||
|
SIMD<int32_t, WIDTH> tmp;
|
||||||
|
for (int i = 0; i < WIDTH; ++i)
|
||||||
|
tmp.v[i] = b.v[i] < WIDTH ? a.v[b.v[i]] : 0;
|
||||||
|
return tmp;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <int WIDTH>
|
||||||
|
static inline SIMD<int64_t, WIDTH> vshuf(SIMD<int64_t, WIDTH> a, SIMD<uint64_t, WIDTH> b)
|
||||||
|
{
|
||||||
|
SIMD<int64_t, WIDTH> tmp;
|
||||||
|
for (int i = 0; i < WIDTH; ++i)
|
||||||
|
tmp.v[i] = b.v[i] < WIDTH ? a.v[b.v[i]] : 0;
|
||||||
|
return tmp;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <int WIDTH>
|
||||||
|
static inline SIMD<float, WIDTH> vshuf(SIMD<float, WIDTH> a, SIMD<uint32_t, WIDTH> b)
|
||||||
|
{
|
||||||
|
SIMD<float, WIDTH> tmp;
|
||||||
|
for (int i = 0; i < WIDTH; ++i)
|
||||||
|
tmp.v[i] = b.v[i] < WIDTH ? a.v[b.v[i]] : 0.f;
|
||||||
|
return tmp;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <int WIDTH>
|
||||||
|
static inline SIMD<double, WIDTH> vshuf(SIMD<double, WIDTH> a, SIMD<uint64_t, WIDTH> b)
|
||||||
|
{
|
||||||
|
SIMD<double, WIDTH> tmp;
|
||||||
|
for (int i = 0; i < WIDTH; ++i)
|
||||||
|
tmp.v[i] = b.v[i] < WIDTH ? a.v[b.v[i]] : 0.;
|
||||||
|
return tmp;
|
||||||
|
}
|
||||||
|
|
||||||
#if 1
|
#if 1
|
||||||
#ifdef __AVX2__
|
#ifdef __AVX2__
|
||||||
#include "avx2.hh"
|
#include "avx2.hh"
|
||||||
|
|
|
||||||
16
sse4_1.hh
16
sse4_1.hh
|
|
@ -1127,3 +1127,19 @@ inline SIMD<int32_t, 4> vclamp(SIMD<int32_t, 4> x, int32_t a, int32_t b)
|
||||||
return tmp;
|
return tmp;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <>
|
||||||
|
inline SIMD<uint8_t, 16> vshuf(SIMD<uint8_t, 16> a, SIMD<uint8_t, 16> b)
|
||||||
|
{
|
||||||
|
SIMD<uint8_t, 16> tmp;
|
||||||
|
tmp.m = _mm_shuffle_epi8(a.m, b.m);
|
||||||
|
return tmp;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <>
|
||||||
|
inline SIMD<int8_t, 16> vshuf(SIMD<int8_t, 16> a, SIMD<uint8_t, 16> b)
|
||||||
|
{
|
||||||
|
SIMD<int8_t, 16> tmp;
|
||||||
|
tmp.m = _mm_shuffle_epi8(a.m, b.m);
|
||||||
|
return tmp;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue