59#ifndef INCLUDED_volk_32f_atan_32f_a_H
60#define INCLUDED_volk_32f_atan_32f_a_H
62#if LV_HAVE_AVX2 && LV_HAVE_FMA
66volk_32f_atan_32f_a_avx2_fma(
float* out,
const float* in,
unsigned int num_points)
68 const __m256 one = _mm256_set1_ps(1.f);
69 const __m256 pi_over_2 = _mm256_set1_ps(0x1.921fb6p0f);
70 const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
71 const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
73 unsigned int number = 0;
74 unsigned int eighth_points = num_points / 8;
75 for (; number < eighth_points; number++) {
76 __m256 x = _mm256_load_ps(in);
77 __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
78 __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
79 _mm256_blendv_ps(one, x, swap_mask));
81 __m256 term = _mm256_and_ps(x_star, sign_mask);
82 term = _mm256_or_ps(pi_over_2, term);
83 term = _mm256_sub_ps(term, result);
84 result = _mm256_blendv_ps(result, term, swap_mask);
85 _mm256_store_ps(out, result);
90 number = eighth_points * 8;
91 for (; number < num_points; number++) {
103 const __m256 one = _mm256_set1_ps(1.f);
104 const __m256 pi_over_2 = _mm256_set1_ps(0x1.921fb6p0f);
105 const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
106 const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
108 unsigned int number = 0;
109 unsigned int eighth_points = num_points / 8;
110 for (; number < eighth_points; number++) {
111 __m256 x = _mm256_load_ps(in);
112 __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
113 __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
114 _mm256_blendv_ps(one, x, swap_mask));
116 __m256 term = _mm256_and_ps(x_star, sign_mask);
117 term = _mm256_or_ps(pi_over_2, term);
118 term = _mm256_sub_ps(term, result);
119 result = _mm256_blendv_ps(result, term, swap_mask);
120 _mm256_store_ps(out, result);
125 number = eighth_points * 8;
126 for (; number < num_points; number++) {
133#include <smmintrin.h>
136volk_32f_atan_32f_a_sse4_1(
float* out,
const float* in,
unsigned int num_points)
143 unsigned int number = 0;
144 unsigned int quarter_points = num_points / 4;
145 for (; number < quarter_points; number++) {
160 number = quarter_points * 4;
161 for (; number < num_points; number++) {
168#ifndef INCLUDED_volk_32f_atan_32f_u_H
169#define INCLUDED_volk_32f_atan_32f_u_H
171#if LV_HAVE_AVX2 && LV_HAVE_FMA
172#include <immintrin.h>
174volk_32f_atan_32f_u_avx2_fma(
float* out,
const float* in,
unsigned int num_points)
176 const __m256 one = _mm256_set1_ps(1.f);
177 const __m256 pi_over_2 = _mm256_set1_ps(0x1.921fb6p0f);
178 const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
179 const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
181 unsigned int number = 0;
182 unsigned int eighth_points = num_points / 8;
183 for (; number < eighth_points; number++) {
184 __m256 x = _mm256_loadu_ps(in);
185 __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
186 __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
187 _mm256_blendv_ps(one, x, swap_mask));
189 __m256 term = _mm256_and_ps(x_star, sign_mask);
190 term = _mm256_or_ps(pi_over_2, term);
191 term = _mm256_sub_ps(term, result);
192 result = _mm256_blendv_ps(result, term, swap_mask);
193 _mm256_storeu_ps(out, result);
198 number = eighth_points * 8;
199 for (; number < num_points; number++) {
206#include <immintrin.h>
210 const __m256 one = _mm256_set1_ps(1.f);
211 const __m256 pi_over_2 = _mm256_set1_ps(0x1.921fb6p0f);
212 const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
213 const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
215 unsigned int number = 0;
216 unsigned int eighth_points = num_points / 8;
217 for (; number < eighth_points; number++) {
218 __m256 x = _mm256_loadu_ps(in);
219 __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
220 __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
221 _mm256_blendv_ps(one, x, swap_mask));
223 __m256 term = _mm256_and_ps(x_star, sign_mask);
224 term = _mm256_or_ps(pi_over_2, term);
225 term = _mm256_sub_ps(term, result);
226 result = _mm256_blendv_ps(result, term, swap_mask);
227 _mm256_storeu_ps(out, result);
232 number = eighth_points * 8;
233 for (; number < num_points; number++) {
240#include <smmintrin.h>
243volk_32f_atan_32f_u_sse4_1(
float* out,
const float* in,
unsigned int num_points)
250 unsigned int number = 0;
251 unsigned int quarter_points = num_points / 4;
252 for (; number < quarter_points; number++) {
267 number = quarter_points * 4;
268 for (; number < num_points; number++) {
274#ifdef LV_HAVE_GENERIC
278 unsigned int number = 0;
279 for (; number < num_points; number++) {
285#ifdef LV_HAVE_GENERIC
289 unsigned int number = 0;
290 for (; number < num_points; number++) {
291 *out++ = atanf(*in++);
FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
Definition sse2neon.h:2834
float32x4_t __m128
Definition sse2neon.h:235
FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
Definition sse2neon.h:1756
FORCE_INLINE __m128i _mm_set1_epi32(int)
Definition sse2neon.h:5212
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
Definition sse2neon.h:2787
FORCE_INLINE __m128 _mm_set1_ps(float _w)
Definition sse2neon.h:2503
FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
Definition sse2neon.h:1154
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition sse2neon.h:1941
FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
Definition sse2neon.h:1064
FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
Definition sse2neon.h:3250
FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)
Definition sse2neon.h:7458
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition sse2neon.h:1858
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition sse2neon.h:2704
FORCE_INLINE __m128 _mm_or_ps(__m128, __m128)
Definition sse2neon.h:2237
static void volk_32f_atan_32f_u_avx2(float *out, const float *in, unsigned int num_points)
Definition volk_32f_atan_32f.h:208
static void volk_32f_atan_32f_generic(float *out, const float *in, unsigned int num_points)
Definition volk_32f_atan_32f.h:287
static void volk_32f_atan_32f_polynomial(float *out, const float *in, unsigned int num_points)
Definition volk_32f_atan_32f.h:276
static void volk_32f_atan_32f_a_avx2(float *out, const float *in, unsigned int num_points)
Definition volk_32f_atan_32f.h:101
static __m256 _m256_arctan_poly_avx2_fma(const __m256 x)
Definition volk_avx2_fma_intrinsics.h:26
static __m256 _m256_arctan_poly_avx(const __m256 x)
Definition volk_avx_intrinsics.h:27
static float volk_arctan(const float x)
Definition volk_common.h:204
static __m128 _mm_arctan_poly_sse(const __m128 x)
Definition volk_sse_intrinsics.h:27