added SIMD horizontal rotation helper

2026-04-27 14:30:36 +00:00 · 2019-09-24 00:05:57 +02:00 · 2019-09-24 00:05:57 +02:00 · 8de487af88
commit 8de487af88
parent a10cc6afcb
2 changed files with 112 additions and 0 deletions
--- a/README.md
+++ b/README.md
@ -87,3 +87,7 @@ Single instruction, multiple data ([SIMD](https://en.wikipedia.org/wiki/SIMD)) w
 * [Intel SSE4.1](https://en.wikipedia.org/wiki/SSE4) ([sse4_1.hh](sse4_1.hh))
 * [Intel AVX2](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions) ([avx2.hh](avx2.hh))

+### [rotate.hh](rotate.hh)
+
+([SIMD](https://en.wikipedia.org/wiki/SIMD)) element wise horizontal rotation
+
--- a/rotate.hh
+++ b/rotate.hh
@ -0,0 +1,108 @@
+/*
+SIMD element wise horizontal rotation
+
+Copyright 2019 Ahmet Inan <inan@aicodix.de>
+*/
+
+#pragma once
+
+#include "simd.hh"
+
+namespace CODE {
+
+template <typename TYPE, int WIDTH>
+class Rotate
+{
+	static const int SIZE = TYPE::SIZE;
+	static_assert(WIDTH <= SIZE, "width must be smaller or equal to SIMD size");
+public:
+	TYPE operator()(TYPE a, int s)
+	{
+		if (s < 0)
+			s += WIDTH;
+		int t = WIDTH - s;
+		TYPE ret;
+		for (int n = 0; n < s; ++n)
+			ret.v[n] = a.v[n+t];
+		for (int n = 0; n < t; ++n)
+			ret.v[n+s] = a.v[n];
+		return ret;
+	}
+};
+
+#ifdef __SSE4_1__
+template <int WIDTH>
+class Rotate<SIMD<int8_t, 16>, WIDTH>
+{
+	static const int SIZE = 16;
+	static_assert(WIDTH <= SIZE, "width must be smaller or equal to SIMD size");
+	typedef SIMD<int8_t, SIZE> TYPE;
+	TYPE rot[WIDTH];
+public:
+	Rotate()
+	{
+		for (int i = 0; i < WIDTH; ++i) {
+			rot[i] = vdup<TYPE>(0x80);
+			for (int j = 0; j < WIDTH; ++j)
+				rot[i].v[j] = (j - i + WIDTH) % WIDTH;
+		}
+	}
+	TYPE operator()(TYPE a, int s)
+	{
+		if (s < 0)
+			s += WIDTH;
+		TYPE ret;
+		ret.m = _mm_shuffle_epi8(a.m, rot[s].m);
+		return ret;
+	}
+};
+#endif
+
+#ifdef __AVX2__
+template <int WIDTH>
+class Rotate<SIMD<int8_t, 32>, WIDTH>
+{
+	static const int SIZE = 32;
+	static_assert(WIDTH <= SIZE, "width must be smaller or equal to SIMD size");
+	typedef SIMD<int8_t, SIZE> TYPE;
+	TYPE rot0[WIDTH], rot1[WIDTH];
+public:
+	Rotate()
+	{
+		for (int i = 0; i < WIDTH; ++i) {
+			rot0[i] = vdup<TYPE>(0x80);
+			rot1[i] = vdup<TYPE>(0x80);
+			for (int j = 0; j < WIDTH; ++j) {
+				int pos = (j - i + WIDTH) % WIDTH;
+				if (j < 16) {
+					if (pos < 16) {
+						rot0[i].v[j] = pos;
+					} else {
+						rot1[i].v[j+16] = pos-16;
+					}
+				} else {
+					if (pos < 16) {
+						rot1[i].v[j-16] = pos;
+					} else {
+						rot0[i].v[j] = pos-16;
+					}
+				}
+			}
+		}
+	}
+	TYPE operator()(TYPE a, int s)
+	{
+		if (s < 0)
+			s += WIDTH;
+		__m256i b = _mm256_shuffle_epi8(a.m, rot0[s].m);
+		__m256i c = _mm256_shuffle_epi8(a.m, rot1[s].m);
+		__m256i d = _mm256_permute2x128_si256(c, c, 1);
+		TYPE ret;
+		ret.m = _mm256_or_si256(b, d);
+		return ret;
+	}
+};
+#endif
+
+}
+