mirror of
https://github.com/aicodix/code.git
synced 2026-04-27 14:30:36 +00:00
139 lines
2.7 KiB
C++
139 lines
2.7 KiB
C++
/*
|
|
SIMD element wise horizontal rotation
|
|
|
|
Copyright 2019 Ahmet Inan <inan@aicodix.de>
|
|
*/
|
|
|
|
#pragma once
|
|
|
|
#include "simd.hh"
|
|
|
|
namespace CODE {
|
|
|
|
template <typename TYPE, int WIDTH>
|
|
class Rotate
|
|
{
|
|
static const int SIZE = TYPE::SIZE;
|
|
static_assert(WIDTH <= SIZE, "width must be smaller or equal to SIMD size");
|
|
public:
|
|
TYPE operator()(TYPE a, int s)
|
|
{
|
|
if (s < 0)
|
|
s += WIDTH;
|
|
int t = WIDTH - s;
|
|
TYPE ret;
|
|
for (int n = 0; n < s; ++n)
|
|
ret.v[n] = a.v[n+t];
|
|
for (int n = 0; n < t; ++n)
|
|
ret.v[n+s] = a.v[n];
|
|
return ret;
|
|
}
|
|
};
|
|
|
|
#ifdef __ARM_NEON__
|
|
template <int WIDTH>
|
|
class Rotate<SIMD<int8_t, 16>, WIDTH>
|
|
{
|
|
static const int SIZE = 16;
|
|
static_assert(WIDTH <= SIZE, "width must be smaller or equal to SIMD size");
|
|
typedef SIMD<int8_t, SIZE> TYPE;
|
|
TYPE rot[WIDTH];
|
|
public:
|
|
Rotate()
|
|
{
|
|
for (int i = 0; i < WIDTH; ++i) {
|
|
rot[i] = vdup<TYPE>(0x80);
|
|
for (int j = 0; j < WIDTH; ++j)
|
|
rot[i].v[j] = (j - i + WIDTH) % WIDTH;
|
|
}
|
|
}
|
|
TYPE operator()(TYPE a, int s)
|
|
{
|
|
if (s < 0)
|
|
s += WIDTH;
|
|
int8x8x2_t b { vget_low_s8(a.m), vget_high_s8(a.m) };
|
|
int8x8_t c = vtbl2_s8(b, vget_low_s8(rot[s].m));
|
|
int8x8_t d = vtbl2_s8(b, vget_high_s8(rot[s].m));
|
|
TYPE ret;
|
|
ret.m = vcombine_s8(c, d);
|
|
return ret;
|
|
}
|
|
};
|
|
#endif
|
|
|
|
#ifdef __AVX2__
|
|
template <int WIDTH>
|
|
class Rotate<SIMD<int8_t, 32>, WIDTH>
|
|
{
|
|
static const int SIZE = 32;
|
|
static_assert(WIDTH <= SIZE, "width must be smaller or equal to SIMD size");
|
|
typedef SIMD<int8_t, SIZE> TYPE;
|
|
TYPE rot0[WIDTH], rot1[WIDTH];
|
|
public:
|
|
Rotate()
|
|
{
|
|
for (int i = 0; i < WIDTH; ++i) {
|
|
rot0[i] = vdup<TYPE>(0x80);
|
|
rot1[i] = vdup<TYPE>(0x80);
|
|
for (int j = 0; j < WIDTH; ++j) {
|
|
int pos = (j - i + WIDTH) % WIDTH;
|
|
if (j < 16) {
|
|
if (pos < 16) {
|
|
rot0[i].v[j] = pos;
|
|
} else {
|
|
rot1[i].v[j+16] = pos-16;
|
|
}
|
|
} else {
|
|
if (pos < 16) {
|
|
rot1[i].v[j-16] = pos;
|
|
} else {
|
|
rot0[i].v[j] = pos-16;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
TYPE operator()(TYPE a, int s)
|
|
{
|
|
if (s < 0)
|
|
s += WIDTH;
|
|
__m256i b = _mm256_shuffle_epi8(a.m, rot0[s].m);
|
|
__m256i c = _mm256_shuffle_epi8(a.m, rot1[s].m);
|
|
__m256i d = _mm256_permute2x128_si256(c, c, 1);
|
|
TYPE ret;
|
|
ret.m = _mm256_or_si256(b, d);
|
|
return ret;
|
|
}
|
|
};
|
|
#else
|
|
#ifdef __SSE4_1__
|
|
template <int WIDTH>
|
|
class Rotate<SIMD<int8_t, 16>, WIDTH>
|
|
{
|
|
static const int SIZE = 16;
|
|
static_assert(WIDTH <= SIZE, "width must be smaller or equal to SIMD size");
|
|
typedef SIMD<int8_t, SIZE> TYPE;
|
|
TYPE rot[WIDTH];
|
|
public:
|
|
Rotate()
|
|
{
|
|
for (int i = 0; i < WIDTH; ++i) {
|
|
rot[i] = vdup<TYPE>(0x80);
|
|
for (int j = 0; j < WIDTH; ++j)
|
|
rot[i].v[j] = (j - i + WIDTH) % WIDTH;
|
|
}
|
|
}
|
|
TYPE operator()(TYPE a, int s)
|
|
{
|
|
if (s < 0)
|
|
s += WIDTH;
|
|
TYPE ret;
|
|
ret.m = _mm_shuffle_epi8(a.m, rot[s].m);
|
|
return ret;
|
|
}
|
|
};
|
|
#endif
|
|
#endif
|
|
|
|
}
|
|
|