Vector Optimized Library of Kernels 2.5.2
Architecture-tuned implementations of math kernels
 
volk_32f_atan_32f.h
/* -*- c++ -*- */
/*
 * Copyright 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: GPL-3.0-or-later
 */

#include <inttypes.h>
#include <math.h>
#include <stdio.h>

/* Number of terms of the arctangent Taylor series to evaluate;
 * increase this for more accuracy. */
#define TERMS 2
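
For reference, a sketch of the math the kernels below implement (my reading of the code; the identities are standard, but this note is not part of the original header). The series being truncated is the Maclaurin series of the arctangent,

\[ \arctan(t) = \sum_{j=0}^{\infty} \frac{(-1)^j}{2j+1}\, t^{2j+1}, \qquad |t| \le 1, \]

cut off after TERMS terms. To keep the truncation error small, each kernel first reduces its argument: negative inputs are reflected through \(\arctan(-z) = -\arctan(z)\), inputs below one are folded through \(\arctan(z) = \pi/2 - \arctan(1/z)\), and two applications of

\[ x \mapsto x + \sqrt{1 + x^2}, \qquad \text{which maps } \tan\theta \mapsto \tan\!\left(\tfrac{\pi}{4} + \tfrac{\theta}{2}\right), \]

followed by a reciprocal leave the series argument in \((0, \tan(\pi/16)] \approx (0, 0.199]\). The multiplication by 4 in the code undoes the two half-angle steps.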

#ifndef INCLUDED_volk_32f_atan_32f_a_H
#define INCLUDED_volk_32f_atan_32f_a_H

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32f_atan_32f_a_avx2_fma(float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, pio2, x, y, z, arctangent;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        z = aVal;
        /* Work on |z|; the sign is restored at the end. */
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
        /* Fold z < 1 to its reciprocal, using atan(z) = pi/2 - atan(1/z). */
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(
            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        /* Two half-angle steps plus a reciprocal shrink the series argument. */
        for (i = 0; i < 2; i++) {
            x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
        }
        x = _mm256_div_ps(fones, x);
        y = fzeroes;
        /* TERMS-term Taylor series evaluated by Horner's rule. */
        for (j = TERMS - 1; j >= 0; j--) {
            y = _mm256_fmadd_ps(
                y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
        /* Where z > 1, y currently holds pi/2 - atan(z); flip it back. */
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);

        y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
        arctangent = y;
        /* Restore the sign of negative inputs. */
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arctangent = _mm256_sub_ps(
            arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));

        _mm256_store_ps(bPtr, arctangent);
        aPtr += 8;
        bPtr += 8;
    }

    /* Finish the remainder with the scalar libm routine. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = atanf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
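
The FMA variant above differs from the plain AVX kernel below only in contracting multiply-add pairs into fused instructions, which round once instead of twice. For example (illustrative fragment, with t an __m256 temporary; not from the file):

    /* Plain AVX: multiply, then add; two roundings. */
    t = _mm256_add_ps(fones, _mm256_mul_ps(x, x));
    /* AVX2+FMA: x*x + 1 in one instruction with a single rounding. */
    t = _mm256_fmadd_ps(x, x, fones);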


#ifdef LV_HAVE_AVX
#include <immintrin.h>

/* Same algorithm as volk_32f_atan_32f_a_avx2_fma() above, without FMA. */
static inline void
volk_32f_atan_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, pio2, x, y, z, arctangent;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        z = aVal;
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(
            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        for (i = 0; i < 2; i++) {
            x = _mm256_add_ps(x,
                              _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
        }
        x = _mm256_div_ps(fones, x);
        y = fzeroes;
        for (j = TERMS - 1; j >= 0; j--) {
            y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
                              _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);

        y = _mm256_add_ps(
            y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
        arctangent = y;
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arctangent = _mm256_sub_ps(
            arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));

        _mm256_store_ps(bPtr, arctangent);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = atanf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX for aligned */

#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

/* Same algorithm as above, on four-wide SSE vectors. */
static inline void
volk_32f_atan_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int quarterPoints = num_points / 4;
    int i, j;

    __m128 aVal, pio2, x, y, z, arctangent;
    __m128 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm_setzero_ps();
    fones = _mm_set1_ps(1.0);
    ftwos = _mm_set1_ps(2.0);
    ffours = _mm_set1_ps(4.0);

    for (; number < quarterPoints; number++) {
        aVal = _mm_load_ps(aPtr);
        z = aVal;
        condition = _mm_cmplt_ps(z, fzeroes);
        z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
        condition = _mm_cmplt_ps(z, fones);
        x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));

        for (i = 0; i < 2; i++) {
            x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
        }
        x = _mm_div_ps(fones, x);
        y = fzeroes;
        for (j = TERMS - 1; j >= 0; j--) {
            y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
                           _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
        condition = _mm_cmpgt_ps(z, fones);

        y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
        arctangent = y;
        condition = _mm_cmplt_ps(aVal, fzeroes);
        arctangent =
            _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition));

        _mm_store_ps(bPtr, arctangent);
        aPtr += 4;
        bPtr += 4;
    }

    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *bPtr++ = atanf(*aPtr++);
    }
}

#endif /* LV_HAVE_SSE4_1 for aligned */

#endif /* INCLUDED_volk_32f_atan_32f_a_H */
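
Application code normally calls the generated volk_32f_atan_32f() dispatcher, which picks the best of the kernels in this file at runtime, rather than a per-ISA entry point. A minimal usage sketch (example() is a hypothetical caller; it assumes the standard VOLK allocation helpers):

    #include <volk/volk.h>

    void example(void)
    {
        unsigned int num_points = 1024;
        size_t alignment = volk_get_alignment();
        /* volk_malloc() returns memory aligned for the widest enabled ISA,
         * so the aligned (_a_) kernels above are eligible. */
        float* in = (float*)volk_malloc(num_points * sizeof(float), alignment);
        float* out = (float*)volk_malloc(num_points * sizeof(float), alignment);

        for (unsigned int n = 0; n < num_points; n++)
            in[n] = (float)n - 512.0f;

        volk_32f_atan_32f(out, in, num_points); /* out[n] = atan(in[n]) */

        volk_free(in);
        volk_free(out);
    }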

#ifndef INCLUDED_volk_32f_atan_32f_u_H
#define INCLUDED_volk_32f_atan_32f_u_H
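
The unaligned (_u_) kernels in this second half repeat the aligned (_a_) kernels above verbatim except for the memory access flavor, e.g. (illustrative fragment, variables as in the kernels):

    aVal = _mm256_load_ps(aPtr);  /* _a_: aPtr must be 32-byte aligned */
    aVal = _mm256_loadu_ps(aPtr); /* _u_: any address is accepted */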

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

/* Unaligned counterpart of volk_32f_atan_32f_a_avx2_fma() above. */
static inline void volk_32f_atan_32f_u_avx2_fma(float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, pio2, x, y, z, arctangent;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        z = aVal;
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(
            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        for (i = 0; i < 2; i++) {
            x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
        }
        x = _mm256_div_ps(fones, x);
        y = fzeroes;
        for (j = TERMS - 1; j >= 0; j--) {
            y = _mm256_fmadd_ps(
                y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);

        y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
        arctangent = y;
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arctangent = _mm256_sub_ps(
            arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));

        _mm256_storeu_ps(bPtr, arctangent);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = atanf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */


#ifdef LV_HAVE_AVX
#include <immintrin.h>

/* Unaligned counterpart of volk_32f_atan_32f_a_avx() above. */
static inline void
volk_32f_atan_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, pio2, x, y, z, arctangent;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        z = aVal;
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(
            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        for (i = 0; i < 2; i++) {
            x = _mm256_add_ps(x,
                              _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
        }
        x = _mm256_div_ps(fones, x);
        y = fzeroes;
        for (j = TERMS - 1; j >= 0; j--) {
            y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
                              _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);

        y = _mm256_add_ps(
            y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
        arctangent = y;
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arctangent = _mm256_sub_ps(
            arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));

        _mm256_storeu_ps(bPtr, arctangent);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = atanf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX for unaligned */

#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

/* Unaligned counterpart of volk_32f_atan_32f_a_sse4_1() above. */
static inline void
volk_32f_atan_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int quarterPoints = num_points / 4;
    int i, j;

    __m128 aVal, pio2, x, y, z, arctangent;
    __m128 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm_setzero_ps();
    fones = _mm_set1_ps(1.0);
    ftwos = _mm_set1_ps(2.0);
    ffours = _mm_set1_ps(4.0);

    for (; number < quarterPoints; number++) {
        aVal = _mm_loadu_ps(aPtr);
        z = aVal;
        condition = _mm_cmplt_ps(z, fzeroes);
        z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
        condition = _mm_cmplt_ps(z, fones);
        x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));

        for (i = 0; i < 2; i++) {
            x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
        }
        x = _mm_div_ps(fones, x);
        y = fzeroes;
        for (j = TERMS - 1; j >= 0; j--) {
            y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
                           _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
        condition = _mm_cmpgt_ps(z, fones);

        y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
        arctangent = y;
        condition = _mm_cmplt_ps(aVal, fzeroes);
        arctangent =
            _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition));

        _mm_storeu_ps(bPtr, arctangent);
        aPtr += 4;
        bPtr += 4;
    }

    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *bPtr++ = atanf(*aPtr++);
    }
}

#endif /* LV_HAVE_SSE4_1 for unaligned */

#ifdef LV_HAVE_GENERIC

static inline void
volk_32f_atan_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;
    unsigned int number = 0;

    /* Scalar fallback: defer to libm for every point. */
    for (number = 0; number < num_points; number++) {
        *bPtr++ = atanf(*aPtr++);
    }
}
#endif /* LV_HAVE_GENERIC */
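
To make the lane-wise logic of the SIMD kernels easier to follow, here is a scalar model of the same reduction-plus-series scheme (a hypothetical illustration, not part of VOLK; atan_scalar_model is my name, and it relies on the math.h include and TERMS macro from the top of this file):

    static float atan_scalar_model(float a)
    {
        const float pio2 = 3.14159265358979323846f / 2.0f;
        float z = fabsf(a);                  /* sign is restored at the end */
        float x = (z < 1.0f) ? 1.0f / z : z; /* fold into [1, inf) */
        float y = 0.0f;
        int i, j;

        /* Two half-angle steps, then a reciprocal: the series argument
         * ends up in (0, tan(pi/16)]. */
        for (i = 0; i < 2; i++)
            x += sqrtf(x * x + 1.0f);
        x = 1.0f / x;

        /* TERMS-term Taylor series evaluated by Horner's rule. */
        for (j = TERMS - 1; j >= 0; j--)
            y = y * (x * x) + (float)(pow(-1, j) / (2 * j + 1));
        y *= x * 4.0f;

        /* Here y is approximately pi/2 - atan(max(z, 1/z)); undo the fold. */
        if (z > 1.0f)
            y = pio2 - y;
        return (a < 0.0f) ? -y : y;
    }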

#endif /* INCLUDED_volk_32f_atan_32f_u_H */