52#ifndef INCLUDED_volk_32f_invsqrt_32f_a_H
53#define INCLUDED_volk_32f_invsqrt_32f_a_H
63 const float threehalfs = 1.5F;
71 u.i = 0x5f3759df - (u.i >> 1);
72 u.f = u.f * (threehalfs - (x2 * u.f * u.f));
85 unsigned int number = 0;
86 const unsigned int eighthPoints = num_points / 8;
88 float* cPtr = cVector;
89 const float* aPtr = aVector;
91 for (; number < eighthPoints; number++) {
92 aVal = _mm256_load_ps(aPtr);
93 cVal = _mm256_rsqrt_ps(aVal);
94 _mm256_store_ps(cPtr, cVal);
99 number = eighthPoints * 8;
100 for (; number < num_points; number++)
107#include <xmmintrin.h>
112 unsigned int number = 0;
113 const unsigned int quarterPoints = num_points / 4;
115 float* cPtr = cVector;
116 const float* aPtr = aVector;
119 for (; number < quarterPoints; number++) {
131 number = quarterPoints * 4;
132 for (; number < num_points; number++) {
146 const unsigned int quarter_points = num_points / 4;
148 float* cPtr = cVector;
149 const float* aPtr = aVector;
150 float32x4_t a_val, c_val;
151 for (number = 0; number < quarter_points; ++number) {
152 a_val = vld1q_f32(aPtr);
153 c_val = vrsqrteq_f32(a_val);
154 vst1q_f32(cPtr, c_val);
159 for (number = quarter_points * 4; number < num_points; number++)
165#ifdef LV_HAVE_GENERIC
168 const float* aVector,
169 unsigned int num_points)
171 float* cPtr = cVector;
172 const float* aPtr = aVector;
173 unsigned int number = 0;
174 for (number = 0; number < num_points; number++) {
181#include <immintrin.h>
186 unsigned int number = 0;
187 const unsigned int eighthPoints = num_points / 8;
189 float* cPtr = cVector;
190 const float* aPtr = aVector;
192 for (; number < eighthPoints; number++) {
193 aVal = _mm256_loadu_ps(aPtr);
194 cVal = _mm256_rsqrt_ps(aVal);
195 _mm256_storeu_ps(cPtr, cVal);
200 number = eighthPoints * 8;
201 for (; number < num_points; number++)
float32x4_t __m128
Definition sse2neon.h:366
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition sse2neon.h:1843
FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
Definition sse2neon.h:2313
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition sse2neon.h:2685
static void volk_32f_invsqrt_32f_neon(float *cVector, const float *aVector, unsigned int num_points)
Definition volk_32f_invsqrt_32f.h:143
static void volk_32f_invsqrt_32f_a_avx(float *cVector, const float *aVector, unsigned int num_points)
Definition volk_32f_invsqrt_32f.h:83
static void volk_32f_invsqrt_32f_generic(float *cVector, const float *aVector, unsigned int num_points)
Definition volk_32f_invsqrt_32f.h:167
static void volk_32f_invsqrt_32f_a_sse(float *cVector, const float *aVector, unsigned int num_points)
Definition volk_32f_invsqrt_32f.h:110
static void volk_32f_invsqrt_32f_u_avx(float *cVector, const float *aVector, unsigned int num_points)
Definition volk_32f_invsqrt_32f.h:184
static float Q_rsqrt(float number)
Definition volk_32f_invsqrt_32f.h:60
for i
Definition volk_config_fixed.tmpl.h:13