16#ifndef INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_
17#define INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_
// NOTE(review): Doxygen-extracted fragment of _m256_arctan_poly_avx
// (definition index below says it starts at volk_avx_intrinsics.h:27).
// The number fused onto the front of each line is the original source
// line number left over from the extraction, not part of the code.
//
// Odd-order polynomial coefficients (hex-float literals). The Horner
// chain below evaluates
//   x * (a1 + x^2*(a3 + x^2*(a5 + x^2*(a7 + x^2*(a9 + x^2*(a11 + x^2*a13))))))
// i.e. an odd polynomial in x, consistent with an arctan approximation.
29 const __m256 a1 = _mm256_set1_ps(+0x1.ffffeap-1f);
30 const __m256 a3 = _mm256_set1_ps(-0x1.55437p-2f);
31 const __m256 a5 = _mm256_set1_ps(+0x1.972be6p-3f);
32 const __m256 a7 = _mm256_set1_ps(-0x1.1436ap-3f);
33 const __m256 a9 = _mm256_set1_ps(+0x1.5785aap-4f);
34 const __m256 a11 = _mm256_set1_ps(-0x1.2f3004p-5f);
35 const __m256 a13 = _mm256_set1_ps(+0x1.01a37cp-7f);
// x^2, reused at every Horner step.
37 const __m256 x_times_x = _mm256_mul_ps(x, x);
// NOTE(review): original lines 38-39 are missing from this extraction;
// presumably they declare `arctan` and initialize it to a13 (the chain
// below first multiplies `arctan` by x^2 and adds a11) -- confirm
// against the upstream header.
40 arctan = _mm256_mul_ps(x_times_x, arctan);
41 arctan = _mm256_add_ps(arctan, a11);
42 arctan = _mm256_mul_ps(x_times_x, arctan);
43 arctan = _mm256_add_ps(arctan, a9);
44 arctan = _mm256_mul_ps(x_times_x, arctan);
45 arctan = _mm256_add_ps(arctan, a7);
46 arctan = _mm256_mul_ps(x_times_x, arctan);
47 arctan = _mm256_add_ps(arctan, a5);
48 arctan = _mm256_mul_ps(x_times_x, arctan);
49 arctan = _mm256_add_ps(arctan, a3);
50 arctan = _mm256_mul_ps(x_times_x, arctan);
51 arctan = _mm256_add_ps(arctan, a1);
// Final multiply by x makes the polynomial odd (arctan is an odd function).
52 arctan = _mm256_mul_ps(x, arctan);
// NOTE(review): body fragment of _mm256_complexmul_ps (index: definition
// at volk_avx_intrinsics.h:57); the signature line was lost in extraction.
// Standard AVX interleaved complex multiply: four complex floats per
// 256-bit register, laid out (re, im, re, im, ...).
59 __m256 yl, yh, tmp1, tmp2;
60 yl = _mm256_moveldup_ps(y);  // duplicate the real parts of y
61 yh = _mm256_movehdup_ps(y);  // duplicate the imaginary parts of y
62 tmp1 = _mm256_mul_ps(x, yl); // (xr*yr, xi*yr) per pair
63 x = _mm256_shuffle_ps(x, x, 0xB1); // swap re/im within each pair
64 tmp2 = _mm256_mul_ps(x, yh); // (xi*yi, xr*yi) per pair
// addsub: subtract in even lanes, add in odd lanes ->
// (xr*yr - xi*yi, xi*yr + xr*yi) = x*y.
67 return _mm256_addsub_ps(tmp1, tmp2);
// NOTE(review): body fragment of _mm256_conjugate_ps (index: definition at
// volk_avx_intrinsics.h:70). XOR-ing with -0.0f in the odd (imaginary)
// lanes flips the sign bit of each imaginary component -> complex conjugate.
72 const __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
73 return _mm256_xor_ps(x, conjugator);
// NOTE(review): body fragment of _mm256_complexconjugatemul_ps (index:
// definition at volk_avx_intrinsics.h:76); signature line lost in extraction.
// Computes x * conj(y) for four interleaved complex floats.
78 const __m256 nswap = _mm256_permute_ps(x, 0xb1); // x with re/im swapped per pair
79 const __m256 dreal = _mm256_moveldup_ps(y);      // real parts of y, duplicated
80 const __m256 dimag = _mm256_movehdup_ps(y);      // imag parts of y, duplicated
// Negate the imaginary parts in the odd lanes (the conjugation).
82 const __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
83 const __m256 dimagconj = _mm256_xor_ps(dimag, conjugator);
84 const __m256 multreal = _mm256_mul_ps(x, dreal);        // (xr*yr, xi*yr)
85 const __m256 multimag = _mm256_mul_ps(nswap, dimagconj); // (xi*yi, -xr*yi)
86 return _mm256_add_ps(multreal, multimag); // (xr*yr + xi*yi, xi*yr - xr*yi)
// NOTE(review): body fragment of _mm256_normalize_ps (index: definition at
// volk_avx_intrinsics.h:89). Normalizes four interleaved complex floats to
// unit magnitude: val / |val| per complex pair.
91 __m256 tmp1 = _mm256_mul_ps(val, val); // (re^2, im^2) per lane
92 tmp1 = _mm256_hadd_ps(tmp1, tmp1);     // pairwise sums: re^2 + im^2
// Re-spread the hadd result so each magnitude^2 sits in both lanes of
// its complex pair before the divide.
93 tmp1 = _mm256_shuffle_ps(tmp1, tmp1,
_MM_SHUFFLE(3, 1, 2, 0));
94 tmp1 = _mm256_sqrt_ps(tmp1);  // |val| per pair
95 return _mm256_div_ps(val, tmp1);
// NOTE(review): body fragment of _mm256_magnitudesquared_ps (index:
// definition at volk_avx_intrinsics.h:98). Takes two registers of
// interleaved complex floats and returns eight packed |z|^2 values.
100 __m256 complex1, complex2;
101 cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // square re and im
102 cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2);
// Regroup 128-bit halves so the hadd below produces results in order:
// 0x20 = low halves of both inputs, 0x31 = high halves of both.
103 complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
104 complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
105 return _mm256_hadd_ps(complex1, complex2); // re^2 + im^2 per pair
// NOTE(review): fragment of _mm256_scaled_norm_dist_ps (index: definition at
// volk_avx_intrinsics.h:113); the first signature line (symbols0) was lost
// in extraction.
114 const __m256 symbols1,
115 const __m256 points0,
116 const __m256 points1,
// Component-wise differences between received symbols and constellation points.
124 const __m256 diff0 = _mm256_sub_ps(symbols0, points0);
125 const __m256 diff1 = _mm256_sub_ps(symbols1, points1);
// NOTE(review): original line 126, which computes `norms` (presumably
// diff0*diff0 + diff1*diff1, i.e. squared Euclidean distance), is missing
// from this extraction -- confirm against the upstream header.
127 return _mm256_mul_ps(norms, scalar);
// NOTE(review): only this first statement of _mm256_polar_sign_mask
// (index: definition at volk_avx_intrinsics.h:130, taking an __m128i fbits)
// survived the extraction; original lines 133+ are missing.
132 __m256 sign_mask_dummy = _mm256_setzero_ps();
// NOTE(review): body fragment of _mm256_polar_deinterleave (index:
// definition at volk_avx_intrinsics.h:183); signature lost in extraction.
// Splits two registers of interleaved values into even-indexed (*llr0)
// and odd-indexed (*llr1) streams.
186 __m256 part0 = _mm256_permute2f128_ps(src0, src1, 0x20); // low 128-bit halves
187 __m256 part1 = _mm256_permute2f128_ps(src0, src1, 0x31); // high 128-bit halves
188 *llr0 = _mm256_shuffle_ps(part0, part1, 0x88); // even elements (lanes 0,2)
189 *llr1 = _mm256_shuffle_ps(part0, part1, 0xdd); // odd elements (lanes 1,3)
// NOTE(review): body fragment of _mm256_polar_minsum_llrs (index: definition
// at volk_avx_intrinsics.h:192); signature and the deinterleave prologue
// producing llr0/llr1 were lost in extraction.
// Min-sum combination: result = sign(llr0)*sign(llr1) * min(|llr0|, |llr1|).
194 const __m256 sign_mask = _mm256_set1_ps(-0.0f); // isolates the sign bit
// abs_mask = all-ones ANDNOT sign bit = 0x7fffffff per lane.
195 const __m256 abs_mask =
196 _mm256_andnot_ps(sign_mask, _mm256_castsi256_ps(_mm256_set1_epi8(0xff)));
// XOR of the two sign bits = sign of the product.
203 _mm256_xor_ps(_mm256_and_ps(llr0, sign_mask), _mm256_and_ps(llr1, sign_mask));
// Minimum of the two magnitudes.
205 _mm256_min_ps(_mm256_and_ps(llr0, abs_mask), _mm256_and_ps(llr1, abs_mask));
206 return _mm256_or_ps(dst, sign); // reattach the combined sign
// NOTE(review): tail fragment of _mm256_polar_fsign_add_llrs (index:
// definition at volk_avx_intrinsics.h:209); the signature, deinterleave, and
// computation of sign_mask from fbits (orig. lines 210-217) are missing.
218 llr0 = _mm256_xor_ps(llr0, sign_mask); // conditionally flip llr0 signs
219 __m256 dst = _mm256_add_ps(llr0, llr1);
// NOTE(review): fragment of _mm256_accumulate_square_sum_ps (index:
// definition at volk_avx_intrinsics.h:223); the `static __m256 ...(` half of
// the signature was lost in extraction.
// Returns sq_acc + rec * (aux*val - acc)^2, element-wise -- a running
// squared-deviation accumulation with `rec` as a scaling factor.
224 __m256 sq_acc, __m256 acc, __m256 val, __m256 rec, __m256 aux)
226 aux = _mm256_mul_ps(aux, val);
227 aux = _mm256_sub_ps(aux, acc);
228 aux = _mm256_mul_ps(aux, aux); // squared deviation
229 aux = _mm256_mul_ps(aux, rec);
230 return _mm256_add_ps(sq_acc, aux);
FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
Definition sse2neon.h:3281
FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i)
Definition sse2neon.h:3034
FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
Definition sse2neon.h:4948
FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
Definition sse2neon.h:6445
FORCE_INLINE __m128i _mm_setr_epi8(signed char b0, signed char b1, signed char b2, signed char b3, signed char b4, signed char b5, signed char b6, signed char b7, signed char b8, signed char b9, signed char b10, signed char b11, signed char b12, signed char b13, signed char b14, signed char b15)
Definition sse2neon.h:4997
FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
Definition sse2neon.h:3135
#define _MM_SHUFFLE(fp3, fp2, fp1, fp0)
Definition sse2neon.h:306
int64x2_t __m128i
Definition sse2neon.h:375
static __m256 _mm256_magnitudesquared_ps(__m256 cplxValue1, __m256 cplxValue2)
Definition volk_avx_intrinsics.h:98
static __m256 _mm256_magnitude_ps(__m256 cplxValue1, __m256 cplxValue2)
Definition volk_avx_intrinsics.h:108
static void _mm256_polar_deinterleave(__m256 *llr0, __m256 *llr1, __m256 src0, __m256 src1)
Definition volk_avx_intrinsics.h:183
static __m256 _mm256_complexconjugatemul_ps(const __m256 x, const __m256 y)
Definition volk_avx_intrinsics.h:76
static __m256 _mm256_accumulate_square_sum_ps(__m256 sq_acc, __m256 acc, __m256 val, __m256 rec, __m256 aux)
Definition volk_avx_intrinsics.h:223
static __m256 _mm256_complexmul_ps(__m256 x, __m256 y)
Definition volk_avx_intrinsics.h:57
static __m256 _mm256_polar_minsum_llrs(__m256 src0, __m256 src1)
Definition volk_avx_intrinsics.h:192
static __m256 _mm256_conjugate_ps(__m256 x)
Definition volk_avx_intrinsics.h:70
static __m256 _mm256_normalize_ps(__m256 val)
Definition volk_avx_intrinsics.h:89
static __m256 _mm256_scaled_norm_dist_ps(const __m256 symbols0, const __m256 symbols1, const __m256 points0, const __m256 points1, const __m256 scalar)
Definition volk_avx_intrinsics.h:113
static __m256 _mm256_polar_sign_mask(__m128i fbits)
Definition volk_avx_intrinsics.h:130
static __m256 _m256_arctan_poly_avx(const __m256 x)
Definition volk_avx_intrinsics.h:27
static __m256 _mm256_polar_fsign_add_llrs(__m256 src0, __m256 src1, __m128i fbits)
Definition volk_avx_intrinsics.h:209