Vector Optimized Library of Kernels 2.5.2
Architecture-tuned implementations of math kernels
 
Loading...
Searching...
No Matches
volk_avx_intrinsics.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2015 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: GPL-3.0-or-later
8 */
9
10/*
11 * This file is intended to hold AVX intrinsics of intrinsics.
12 * They should be used in VOLK kernels to avoid copy-pasta.
13 */
14
15#ifndef INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_
16#define INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_
17#include <immintrin.h>
18
/*
 * Complex multiply of interleaved (re, im) float pairs: returns x * y.
 * With x = ar,ai,br,bi ... and y = cr,ci,dr,di ... each register holds four
 * complex values.
 */
static inline __m256 _mm256_complexmul_ps(__m256 x, __m256 y)
{
    const __m256 y_re = _mm256_moveldup_ps(y);              // cr,cr,dr,dr ...
    const __m256 y_im = _mm256_movehdup_ps(y);              // ci,ci,di,di ...
    const __m256 x_swapped = _mm256_shuffle_ps(x, x, 0xB1); // ai,ar,bi,br ...

    const __m256 prod_re = _mm256_mul_ps(x, y_re);         // ar*cr,ai*cr,br*dr,bi*dr ...
    const __m256 prod_im = _mm256_mul_ps(x_swapped, y_im); // ai*ci,ar*ci,bi*di,br*di ...

    // addsub subtracts in the even slots and adds in the odd slots:
    // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ...
    return _mm256_addsub_ps(prod_re, prod_im);
}
31
/*
 * Complex conjugate of interleaved (re, im) float pairs: flips the sign bit
 * of every odd-indexed element of x and leaves the even-indexed ones as is.
 */
static inline __m256 _mm256_conjugate_ps(__m256 x)
{
    // -0.f has only the sign bit set, so the XOR negates exactly those slots
    const __m256 imag_sign_bits = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
    return _mm256_xor_ps(x, imag_sign_bits);
}
37
/*
 * Multiply x by the complex conjugate of y for interleaved (re, im) float
 * pairs. Per complex element the result is
 *   (xr*yr + xi*yi, xi*yr - xr*yi)
 */
static inline __m256 _mm256_complexconjugatemul_ps(const __m256 x, const __m256 y)
{
    // Duplicate the imaginary parts of y and negate the copy in the odd slots:
    // yi,-yi ...
    const __m256 imag_sign_bits = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
    const __m256 y_im_signed = _mm256_xor_ps(_mm256_movehdup_ps(y), imag_sign_bits);

    const __m256 y_re = _mm256_moveldup_ps(y);           // yr,yr ...
    const __m256 x_swapped = _mm256_permute_ps(x, 0xb1); // xi,xr ...

    const __m256 prod_re = _mm256_mul_ps(x, y_re);                // xr*yr, xi*yr ...
    const __m256 prod_im = _mm256_mul_ps(x_swapped, y_im_signed); // xi*yi,-xr*yi ...
    return _mm256_add_ps(prod_re, prod_im);
}
50
/*
 * Scale each interleaved (re, im) pair in val by 1 / sqrt(re^2 + im^2).
 * There is no guard for a zero-magnitude pair; the division is performed
 * unconditionally.
 */
static inline __m256 _mm256_normalize_ps(__m256 val)
{
    const __m256 squared = _mm256_mul_ps(val, val);

    // Per 128-bit lane: m0, m1, m0, m1 where mk = re_k^2 + im_k^2
    const __m256 pair_sums = _mm256_hadd_ps(squared, squared);

    // Reorder to m0, m0, m1, m1 so each magnitude lines up with its own
    // re and im slot (_MM_SHUFFLE(3, 1, 2, 0) equals 0xD8)
    const __m256 mag_sq =
        _mm256_shuffle_ps(pair_sums, pair_sums, _MM_SHUFFLE(3, 1, 2, 0));

    return _mm256_div_ps(val, _mm256_sqrt_ps(mag_sq));
}
59
/*
 * Squared magnitude of eight interleaved complex floats.
 * cplxValue1 holds complex elements 0..3 and cplxValue2 elements 4..7; the
 * returned register holds re^2 + im^2 for elements 0..7 in that order.
 */
static inline __m256 _mm256_magnitudesquared_ps(__m256 cplxValue1, __m256 cplxValue2)
{
    const __m256 sq1 = _mm256_mul_ps(cplxValue1, cplxValue1);
    const __m256 sq2 = _mm256_mul_ps(cplxValue2, cplxValue2);

    // hadd works within each 128-bit lane, so first regroup the lanes:
    // low_lanes = [sq1.lo | sq2.lo], high_lanes = [sq1.hi | sq2.hi].
    // That makes the pairwise sums come out in input order.
    const __m256 low_lanes = _mm256_permute2f128_ps(sq1, sq2, 0x20);
    const __m256 high_lanes = _mm256_permute2f128_ps(sq1, sq2, 0x31);

    return _mm256_hadd_ps(low_lanes, high_lanes); // re^2 + im^2 per element
}
69
70static inline __m256 _mm256_magnitude_ps(__m256 cplxValue1, __m256 cplxValue2)
71{
72 return _mm256_sqrt_ps(_mm256_magnitudesquared_ps(cplxValue1, cplxValue2));
73}
74
75static inline __m256 _mm256_scaled_norm_dist_ps(const __m256 symbols0,
76 const __m256 symbols1,
77 const __m256 points0,
78 const __m256 points1,
79 const __m256 scalar)
80{
81 /*
82 * Calculate: |y - x|^2 * SNR_lin
83 * Consider 'symbolsX' and 'pointsX' to be complex float
84 * 'symbolsX' are 'y' and 'pointsX' are 'x'
85 */
86 const __m256 diff0 = _mm256_sub_ps(symbols0, points0);
87 const __m256 diff1 = _mm256_sub_ps(symbols1, points1);
88 const __m256 norms = _mm256_magnitudesquared_ps(diff0, diff1);
89 return _mm256_mul_ps(norms, scalar);
90}
91
/*
 * Build a float sign mask from the first eight bytes of fbits.
 * Float slot k of the result has only its sign bit set when byte k of fbits
 * is greater than zero (signed compare) and is all-zero otherwise. XOR-ing
 * the mask onto a float vector therefore negates the selected slots.
 */
static inline __m256 _mm256_polar_sign_mask(__m128i fbits)
{
    // Byte-shuffle controls: 0xff zeroes the destination byte, an index k
    // copies source byte k. Each source byte lands in the most significant
    // byte of one 32-bit slot.
    const __m128i spread_bytes_0_3 = _mm_setr_epi8(0xff, 0xff, 0xff, 0x00,
                                                   0xff, 0xff, 0xff, 0x01,
                                                   0xff, 0xff, 0xff, 0x02,
                                                   0xff, 0xff, 0xff, 0x03);
    const __m128i spread_bytes_4_7 = _mm_setr_epi8(0xff, 0xff, 0xff, 0x04,
                                                   0xff, 0xff, 0xff, 0x05,
                                                   0xff, 0xff, 0xff, 0x06,
                                                   0xff, 0xff, 0xff, 0x07);

    // 0x80 in every byte of fbits that is > 0, 0x00 everywhere else
    const __m128i is_positive = _mm_cmpgt_epi8(fbits, _mm_set1_epi8(0x00));
    const __m128i sign_flags = _mm_and_si128(is_positive, _mm_set1_epi8(0x80));

    const __m128 lower_half =
        _mm_castsi128_ps(_mm_shuffle_epi8(sign_flags, spread_bytes_0_3));
    const __m128 upper_half =
        _mm_castsi128_ps(_mm_shuffle_epi8(sign_flags, spread_bytes_4_7));

    // The two halves are inserted one after the other into a zeroed register.
    // The direct call would be _mm256_set_m128(upper_half, lower_half), but
    // that intrinsic was noted to be missing in GCC (compare the Intel
    // Intrinsics Guide).
    __m256 sign_mask = _mm256_setzero_ps();
    sign_mask = _mm256_insertf128_ps(sign_mask, lower_half, 0x0);
    return _mm256_insertf128_ps(sign_mask, upper_half, 0x1);
}
143
/*
 * Split the 16 floats held in src0 followed by src1 into their even-indexed
 * and odd-indexed elements. *llr0 receives elements 0, 2, 4, ..., 14 and
 * *llr1 receives elements 1, 3, 5, ..., 15, both in ascending order.
 */
static inline void
_mm256_polar_deinterleave(__m256* llr0, __m256* llr1, __m256 src0, __m256 src1)
{
    // shuffle_ps operates per 128-bit lane, so regroup the lanes first:
    // low_lanes = [src0.lo | src1.lo], high_lanes = [src0.hi | src1.hi]
    const __m256 low_lanes = _mm256_permute2f128_ps(src0, src1, 0x20);
    const __m256 high_lanes = _mm256_permute2f128_ps(src0, src1, 0x31);

    *llr0 = _mm256_shuffle_ps(low_lanes, high_lanes, 0x88); // elements 0 and 2 of each lane
    *llr1 = _mm256_shuffle_ps(low_lanes, high_lanes, 0xdd); // elements 1 and 3 of each lane
}
153
/*
 * Min-sum combination of neighbouring values: after deinterleaving src0/src1
 * into even-indexed (a) and odd-indexed (b) elements, each result element is
 *   min(|a|, |b|) carrying the sign bit sign(a) XOR sign(b).
 */
static inline __m256 _mm256_polar_minsum_llrs(__m256 src0, __m256 src1)
{
    __m256 even, odd;
    _mm256_polar_deinterleave(&even, &odd, src0, src1);

    // -0.0f selects the sign bit only; its complement selects everything else
    const __m256 sign_bits = _mm256_set1_ps(-0.0f);
    const __m256 all_ones = _mm256_castsi256_ps(_mm256_set1_epi8(0xff));
    const __m256 magnitude_bits = _mm256_andnot_ps(sign_bits, all_ones);

    const __m256 even_sign = _mm256_and_ps(even, sign_bits);
    const __m256 odd_sign = _mm256_and_ps(odd, sign_bits);
    const __m256 even_abs = _mm256_and_ps(even, magnitude_bits);
    const __m256 odd_abs = _mm256_and_ps(odd, magnitude_bits);

    const __m256 result_sign = _mm256_xor_ps(even_sign, odd_sign);
    const __m256 smaller_abs = _mm256_min_ps(even_abs, odd_abs);
    return _mm256_or_ps(smaller_abs, result_sign);
}
170
171static inline __m256 _mm256_polar_fsign_add_llrs(__m256 src0, __m256 src1, __m128i fbits)
172{
173 // prepare sign mask for correct +-
174 __m256 sign_mask = _mm256_polar_sign_mask(fbits);
175
176 __m256 llr0, llr1;
177 _mm256_polar_deinterleave(&llr0, &llr1, src0, src1);
178
179 // calculate result
180 llr0 = _mm256_xor_ps(llr0, sign_mask);
181 __m256 dst = _mm256_add_ps(llr0, llr1);
182 return dst;
183}
184
/*
 * Add a weighted squared deviation to a running sum. Element-wise, returns
 *   sq_acc + rec * (aux * val - acc)^2
 *
 * sq_acc: running sum that the new term is added to
 * acc:    value subtracted from the scaled input
 * val:    input values
 * rec:    weight applied to the squared difference
 * aux:    factor applied to val before the subtraction
 */
static inline __m256 _mm256_accumulate_square_sum_ps(
    __m256 sq_acc, __m256 acc, __m256 val, __m256 rec, __m256 aux)
{
    aux = _mm256_mul_ps(aux, val); // aux * val
    aux = _mm256_sub_ps(aux, acc); // aux * val - acc
    aux = _mm256_mul_ps(aux, aux); // (aux * val - acc)^2
    aux = _mm256_mul_ps(aux, rec); // rec * (aux * val - acc)^2
    return _mm256_add_ps(sq_acc, aux);
}
194
195#endif /* INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_ */
FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
Definition: sse2neon.h:3391
FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i)
Definition: sse2neon.h:3128
FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
Definition: sse2neon.h:5239
FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
Definition: sse2neon.h:7069
FORCE_INLINE __m128i _mm_setr_epi8(signed char b0, signed char b1, signed char b2, signed char b3, signed char b4, signed char b5, signed char b6, signed char b7, signed char b8, signed char b9, signed char b10, signed char b11, signed char b12, signed char b13, signed char b14, signed char b15)
Definition: sse2neon.h:5293
FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
Definition: sse2neon.h:3250
#define _MM_SHUFFLE(fp3, fp2, fp1, fp0)
Definition: sse2neon.h:195
int64x2_t __m128i
Definition: sse2neon.h:244
static __m256 _mm256_magnitudesquared_ps(__m256 cplxValue1, __m256 cplxValue2)
Definition: volk_avx_intrinsics.h:60
static __m256 _mm256_magnitude_ps(__m256 cplxValue1, __m256 cplxValue2)
Definition: volk_avx_intrinsics.h:70
static void _mm256_polar_deinterleave(__m256 *llr0, __m256 *llr1, __m256 src0, __m256 src1)
Definition: volk_avx_intrinsics.h:145
static __m256 _mm256_complexconjugatemul_ps(const __m256 x, const __m256 y)
Definition: volk_avx_intrinsics.h:38
static __m256 _mm256_accumulate_square_sum_ps(__m256 sq_acc, __m256 acc, __m256 val, __m256 rec, __m256 aux)
Definition: volk_avx_intrinsics.h:185
static __m256 _mm256_complexmul_ps(__m256 x, __m256 y)
Definition: volk_avx_intrinsics.h:19
static __m256 _mm256_polar_minsum_llrs(__m256 src0, __m256 src1)
Definition: volk_avx_intrinsics.h:154
static __m256 _mm256_conjugate_ps(__m256 x)
Definition: volk_avx_intrinsics.h:32
static __m256 _mm256_normalize_ps(__m256 val)
Definition: volk_avx_intrinsics.h:51
static __m256 _mm256_scaled_norm_dist_ps(const __m256 symbols0, const __m256 symbols1, const __m256 points0, const __m256 points1, const __m256 scalar)
Definition: volk_avx_intrinsics.h:75
static __m256 _mm256_polar_sign_mask(__m128i fbits)
Definition: volk_avx_intrinsics.h:92
static __m256 _mm256_polar_fsign_add_llrs(__m256 src0, __m256 src1, __m128i fbits)
Definition: volk_avx_intrinsics.h:171