Vector Optimized Library of Kernels 3.1.1
Architecture-tuned implementations of math kernels
 
Loading...
Searching...
No Matches
volk_avx_intrinsics.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2015 Free Software Foundation, Inc.
4 * Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
5 *
6 * This file is part of VOLK
7 *
8 * SPDX-License-Identifier: LGPL-3.0-or-later
9 */
10
11/*
12 * This file is intended to hold AVX intrinsics of intrinsics.
13 * They should be used in VOLK kernels to avoid copy-pasta.
14 */
15
16#ifndef INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_
17#define INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_
18#include <immintrin.h>
19
/*
 * Polynomial approximation of arctan(x) on the interval [-1, 1].
 *
 * Maximum relative error ~6.5e-7.
 * Odd-power coefficients evaluated via Horner's rule in x^2,
 * followed by one final multiply by x.
 */
static inline __m256 _m256_arctan_poly_avx(const __m256 x)
{
    /* Coefficients, highest order first (hex float literals). */
    const __m256 c13 = _mm256_set1_ps(+0x1.01a37cp-7f);
    const __m256 c11 = _mm256_set1_ps(-0x1.2f3004p-5f);
    const __m256 c9 = _mm256_set1_ps(+0x1.5785aap-4f);
    const __m256 c7 = _mm256_set1_ps(-0x1.1436ap-3f);
    const __m256 c5 = _mm256_set1_ps(+0x1.972be6p-3f);
    const __m256 c3 = _mm256_set1_ps(-0x1.55437p-2f);
    const __m256 c1 = _mm256_set1_ps(+0x1.ffffeap-1f);

    const __m256 xx = _mm256_mul_ps(x, x);

    __m256 p = c13;
    p = _mm256_add_ps(_mm256_mul_ps(xx, p), c11);
    p = _mm256_add_ps(_mm256_mul_ps(xx, p), c9);
    p = _mm256_add_ps(_mm256_mul_ps(xx, p), c7);
    p = _mm256_add_ps(_mm256_mul_ps(xx, p), c5);
    p = _mm256_add_ps(_mm256_mul_ps(xx, p), c3);
    p = _mm256_add_ps(_mm256_mul_ps(xx, p), c1);
    return _mm256_mul_ps(x, p);
}
56
/* Multiply 4 pairs of interleaved complex floats: returns x * y. */
static inline __m256 _mm256_complexmul_ps(__m256 x, __m256 y)
{
    // Duplicate the real and imaginary parts of each value in y.
    const __m256 y_re = _mm256_moveldup_ps(y); // cr,cr,dr,dr ...
    const __m256 y_im = _mm256_movehdup_ps(y); // ci,ci,di,di ...

    const __m256 prod_re = _mm256_mul_ps(x, y_re); // ar*cr,ai*cr,br*dr,bi*dr ...
    const __m256 x_swapped = _mm256_shuffle_ps(x, x, 0xB1); // ai,ar,bi,br ...
    const __m256 prod_im = _mm256_mul_ps(x_swapped, y_im); // ai*ci,ar*ci,bi*di,br*di

    // Interleaved sub/add: ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
    return _mm256_addsub_ps(prod_re, prod_im);
}
69
/* Conjugate 4 interleaved complex floats by flipping the sign bit of
 * every odd (imaginary) lane via XOR with -0.0f. */
static inline __m256 _mm256_conjugate_ps(__m256 x)
{
    const __m256 imag_sign_flip =
        _mm256_set_ps(-0.f, 0, -0.f, 0, -0.f, 0, -0.f, 0);
    return _mm256_xor_ps(x, imag_sign_flip);
}
75
/* Multiply 4 pairs of interleaved complex floats: returns x * conj(y). */
static inline __m256 _mm256_complexconjugatemul_ps(const __m256 x, const __m256 y)
{
    const __m256 x_swapped = _mm256_permute_ps(x, 0xb1); // swap re/im of x
    const __m256 y_re = _mm256_moveldup_ps(y);           // duplicated real parts
    const __m256 y_im = _mm256_movehdup_ps(y);           // duplicated imag parts

    // Negate the odd lanes of y's imaginary parts (the conjugation).
    const __m256 flip_odd = _mm256_set_ps(-0.f, 0, -0.f, 0, -0.f, 0, -0.f, 0);
    const __m256 y_im_conj = _mm256_xor_ps(y_im, flip_odd);

    const __m256 real_terms = _mm256_mul_ps(x, y_re);
    const __m256 cross_terms = _mm256_mul_ps(x_swapped, y_im_conj);
    return _mm256_add_ps(real_terms, cross_terms);
}
88
/* Scale 4 interleaved complex floats to unit magnitude. */
static inline __m256 _mm256_normalize_ps(__m256 val)
{
    __m256 mag = _mm256_mul_ps(val, val); // re^2, im^2 per lane
    mag = _mm256_hadd_ps(mag, mag);       // pairwise re^2 + im^2
    // 0xD8 shuffle duplicates each squared magnitude into both lanes
    // of its complex value, matching the layout of 'val'.
    mag = _mm256_shuffle_ps(mag, mag, _MM_SHUFFLE(3, 1, 2, 0));
    mag = _mm256_sqrt_ps(mag);
    return _mm256_div_ps(val, mag);
}
97
/* Squared magnitudes (re^2 + im^2) of 8 interleaved complex floats
 * spread across two registers. */
static inline __m256 _mm256_magnitudesquared_ps(__m256 cplxValue1, __m256 cplxValue2)
{
    const __m256 sq1 = _mm256_mul_ps(cplxValue1, cplxValue1);
    const __m256 sq2 = _mm256_mul_ps(cplxValue2, cplxValue2);
    // Regroup the 128-bit halves so hadd emits results in input order.
    const __m256 lo = _mm256_permute2f128_ps(sq1, sq2, 0x20);
    const __m256 hi = _mm256_permute2f128_ps(sq1, sq2, 0x31);
    return _mm256_hadd_ps(lo, hi); // add the I^2 and Q^2 values
}
107
108static inline __m256 _mm256_magnitude_ps(__m256 cplxValue1, __m256 cplxValue2)
109{
110 return _mm256_sqrt_ps(_mm256_magnitudesquared_ps(cplxValue1, cplxValue2));
111}
112
113static inline __m256 _mm256_scaled_norm_dist_ps(const __m256 symbols0,
114 const __m256 symbols1,
115 const __m256 points0,
116 const __m256 points1,
117 const __m256 scalar)
118{
119 /*
120 * Calculate: |y - x|^2 * SNR_lin
121 * Consider 'symbolsX' and 'pointsX' to be complex float
122 * 'symbolsX' are 'y' and 'pointsX' are 'x'
123 */
124 const __m256 diff0 = _mm256_sub_ps(symbols0, points0);
125 const __m256 diff1 = _mm256_sub_ps(symbols1, points1);
126 const __m256 norms = _mm256_magnitudesquared_ps(diff0, diff1);
127 return _mm256_mul_ps(norms, scalar);
128}
129
/* Expand 8 bytes of frozen-bit flags into 8 packed float sign masks
 * (0x80000000 where the byte is > 0, otherwise 0). */
static inline __m256 _mm256_polar_sign_mask(__m128i fbits)
{
    const __m128i zeros = _mm_set1_epi8(0x00);
    const __m128i sign_extract = _mm_set1_epi8(0x80);
    // pshufb masks: place source byte k into the top byte of dword k;
    // 0xff selectors zero-fill the remaining bytes.
    const __m128i expand_lo = _mm_setr_epi8(0xff, 0xff, 0xff, 0x00,
                                            0xff, 0xff, 0xff, 0x01,
                                            0xff, 0xff, 0xff, 0x02,
                                            0xff, 0xff, 0xff, 0x03);
    const __m128i expand_hi = _mm_setr_epi8(0xff, 0xff, 0xff, 0x04,
                                            0xff, 0xff, 0xff, 0x05,
                                            0xff, 0xff, 0xff, 0x06,
                                            0xff, 0xff, 0xff, 0x07);

    fbits = _mm_cmpgt_epi8(fbits, zeros);       // 0xff where byte > 0, else 0x00
    fbits = _mm_and_si128(fbits, sign_extract); // keep only the sign bit
    const __m128i lo_signs = _mm_shuffle_epi8(fbits, expand_lo);
    const __m128i hi_signs = _mm_shuffle_epi8(fbits, expand_hi);

    // _mm256_set_m128 is missing from some GCC versions
    // (compare https://software.intel.com/sites/landingpage/IntrinsicsGuide/#),
    // so assemble the 256-bit result with two insertf128 steps instead.
    const __m256 mask_lo =
        _mm256_insertf128_ps(_mm256_setzero_ps(), _mm_castsi128_ps(lo_signs), 0x0);
    return _mm256_insertf128_ps(mask_lo, _mm_castsi128_ps(hi_signs), 0x1);
}
181
/* Deinterleave two registers into even-index (llr0) and odd-index (llr1)
 * element streams, preserving source order. */
static inline void
_mm256_polar_deinterleave(__m256* llr0, __m256* llr1, __m256 src0, __m256 src1)
{
    const __m256 lo_halves = _mm256_permute2f128_ps(src0, src1, 0x20);
    const __m256 hi_halves = _mm256_permute2f128_ps(src0, src1, 0x31);
    *llr0 = _mm256_shuffle_ps(lo_halves, hi_halves, 0x88); // even elements
    *llr1 = _mm256_shuffle_ps(lo_halves, hi_halves, 0xdd); // odd elements
}
191
/* Min-sum LLR combination: for each deinterleaved pair (a, b) the result
 * carries sign(a) XOR sign(b) and magnitude min(|a|, |b|). */
static inline __m256 _mm256_polar_minsum_llrs(__m256 src0, __m256 src1)
{
    const __m256 sign_mask = _mm256_set1_ps(-0.0f);
    // All bits except the sign bit.
    const __m256 abs_mask =
        _mm256_andnot_ps(sign_mask, _mm256_castsi256_ps(_mm256_set1_epi8(0xff)));

    __m256 llr0, llr1;
    _mm256_polar_deinterleave(&llr0, &llr1, src0, src1);

    const __m256 combined_sign =
        _mm256_xor_ps(_mm256_and_ps(llr0, sign_mask), _mm256_and_ps(llr1, sign_mask));
    const __m256 min_abs =
        _mm256_min_ps(_mm256_and_ps(llr0, abs_mask), _mm256_and_ps(llr1, abs_mask));
    return _mm256_or_ps(min_abs, combined_sign);
}
208
209static inline __m256 _mm256_polar_fsign_add_llrs(__m256 src0, __m256 src1, __m128i fbits)
210{
211 // prepare sign mask for correct +-
212 __m256 sign_mask = _mm256_polar_sign_mask(fbits);
213
214 __m256 llr0, llr1;
215 _mm256_polar_deinterleave(&llr0, &llr1, src0, src1);
216
217 // calculate result
218 llr0 = _mm256_xor_ps(llr0, sign_mask);
219 __m256 dst = _mm256_add_ps(llr0, llr1);
220 return dst;
221}
222
/* Accumulate (aux * val - acc)^2 * rec into sq_acc, element-wise.
 * NOTE(review): the function's signature line was missing from this
 * rendering of the file; restored to match the declared prototype
 * (see the file's symbol index). */
static inline __m256 _mm256_accumulate_square_sum_ps(
    __m256 sq_acc, __m256 acc, __m256 val, __m256 rec, __m256 aux)
{
    aux = _mm256_mul_ps(aux, val);
    aux = _mm256_sub_ps(aux, acc);
    aux = _mm256_mul_ps(aux, aux);
    aux = _mm256_mul_ps(aux, rec);
    return _mm256_add_ps(sq_acc, aux);
}
232
233#endif /* INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_ */
FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
Definition sse2neon.h:3281
FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i)
Definition sse2neon.h:3034
FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
Definition sse2neon.h:4948
FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
Definition sse2neon.h:6445
FORCE_INLINE __m128i _mm_setr_epi8(signed char b0, signed char b1, signed char b2, signed char b3, signed char b4, signed char b5, signed char b6, signed char b7, signed char b8, signed char b9, signed char b10, signed char b11, signed char b12, signed char b13, signed char b14, signed char b15)
Definition sse2neon.h:4997
FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
Definition sse2neon.h:3135
#define _MM_SHUFFLE(fp3, fp2, fp1, fp0)
Definition sse2neon.h:306
int64x2_t __m128i
Definition sse2neon.h:375
static __m256 _mm256_magnitudesquared_ps(__m256 cplxValue1, __m256 cplxValue2)
Definition volk_avx_intrinsics.h:98
static __m256 _mm256_magnitude_ps(__m256 cplxValue1, __m256 cplxValue2)
Definition volk_avx_intrinsics.h:108
static void _mm256_polar_deinterleave(__m256 *llr0, __m256 *llr1, __m256 src0, __m256 src1)
Definition volk_avx_intrinsics.h:183
static __m256 _mm256_complexconjugatemul_ps(const __m256 x, const __m256 y)
Definition volk_avx_intrinsics.h:76
static __m256 _mm256_accumulate_square_sum_ps(__m256 sq_acc, __m256 acc, __m256 val, __m256 rec, __m256 aux)
Definition volk_avx_intrinsics.h:223
static __m256 _mm256_complexmul_ps(__m256 x, __m256 y)
Definition volk_avx_intrinsics.h:57
static __m256 _mm256_polar_minsum_llrs(__m256 src0, __m256 src1)
Definition volk_avx_intrinsics.h:192
static __m256 _mm256_conjugate_ps(__m256 x)
Definition volk_avx_intrinsics.h:70
static __m256 _mm256_normalize_ps(__m256 val)
Definition volk_avx_intrinsics.h:89
static __m256 _mm256_scaled_norm_dist_ps(const __m256 symbols0, const __m256 symbols1, const __m256 points0, const __m256 points1, const __m256 scalar)
Definition volk_avx_intrinsics.h:113
static __m256 _mm256_polar_sign_mask(__m128i fbits)
Definition volk_avx_intrinsics.h:130
static __m256 _m256_arctan_poly_avx(const __m256 x)
Definition volk_avx_intrinsics.h:27
static __m256 _mm256_polar_fsign_add_llrs(__m256 src0, __m256 src1, __m128i fbits)
Definition volk_avx_intrinsics.h:209