From fe25fb2a696b3160cbd2230c10c1ef22e6d5c648 Mon Sep 17 00:00:00 2001
From: Ahmet Inan <inan@aicodix.de>
Date: Mon, 5 Jun 2023 12:18:27 +0200
Subject: [PATCH] flatten hot functions

---
 cauchy_reed_solomon_erasure_coding.hh | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/cauchy_reed_solomon_erasure_coding.hh b/cauchy_reed_solomon_erasure_coding.hh
index a02163c..e598c8f 100644
--- a/cauchy_reed_solomon_erasure_coding.hh
+++ b/cauchy_reed_solomon_erasure_coding.hh
@@ -27,12 +27,14 @@ struct CauchyReedSolomonErasureCoding
 	typedef typename GF::IndexType IndexType;
 	IndexType row_num, row_den;
 	// $a_{ij} = \frac{1}{x_i + y_j}$
+	__attribute__((flatten))
 	IndexType cauchy_matrix(int i, int j)
 	{
 		ValueType row(i), col(ValueType::N - j);
 		return rcp(index(row + col));
 	}
 	// $b_{ij} = \frac{\prod_{k=1}^{n}{(x_j + y_k)(x_k + y_i)}}{(x_j + y_i)\prod_{k \ne j}^{n}{(x_j - x_k)}\prod_{k \ne i}^{n}{(y_i - y_k)}}$
+	__attribute__((flatten))
 	IndexType inverse_cauchy_matrix(const ValueType *rows, int i, int j, int n)
 	{
 #if 0
@@ -80,7 +82,8 @@ struct CauchyReedSolomonErasureCoding
 	}
 #endif
 #if defined(__ARM_NEON) || defined(__AVX2__) || defined(__SSE4_1__)
-	void mac_simd(uint8_t *c, const uint8_t *a, IndexType b, int size, bool init)
+	__attribute__((flatten))
+	static inline void mac_simd(uint8_t *c, const uint8_t *a, IndexType b, int size, bool init)
 	{
 		alignas(16) uint8_t bln[16], bhn[16];
 		for (int i = 0; i < 16; ++i) {
@@ -135,7 +138,8 @@ struct CauchyReedSolomonErasureCoding
 #endif
 #endif
 	}
-	void mac_simd(uint16_t *c, const uint16_t *a, IndexType b, int size, bool init)
+	__attribute__((flatten))
+	static inline void mac_simd(uint16_t *c, const uint16_t *a, IndexType b, int size, bool init)
 	{
 		alignas(16) uint8_t blll[16], bllh[16], blhl[16], blhh[16], bhll[16], bhlh[16], bhhl[16], bhhh[16];
 		for (int i = 0; i < 16; ++i) {
@@ -272,7 +276,8 @@ struct CauchyReedSolomonErasureCoding
 #endif
 	}
 #endif
-	void multiply_accumulate(ValueType *c, const ValueType *a, IndexType b, int len, bool init)
+	__attribute__((flatten))
+	static inline void multiply_accumulate(ValueType *c, const ValueType *a, IndexType b, int len, bool init)
 	{
 #if defined(__ARM_NEON) || defined(__AVX2__) || defined(__SSE4_1__)
 #ifdef __AVX2__