86#ifndef INCLUDED_volk_32f_exp_32f_a_H
87#define INCLUDED_volk_32f_exp_32f_a_H
/*
 * Body of volk_32f_exp_32f_a_sse2(bVector, aVector, num_points) as shown in
 * this listing; the leading number on each line is the listing's own line
 * number, not part of the code.
 *
 * Writes an SSE2 approximation of exp(aVector[i]) to bVector[i], four floats
 * per iteration, then uses expf() for the remaining num_points % 4 elements.
 * The vector path uses _mm_load_ps / _mm_store_ps, which require 16-byte
 * aligned addresses.
 *
 * NOTE(review): the signature and the closing braces are not part of this
 * listing; the function name is taken from the cross-reference entry that
 * places the definition at volk_32f_exp_32f.h:93 -- confirm against the header.
 */
95 float* bPtr = bVector;
96 const float* aPtr = aVector;
98 unsigned int number = 0;
/* Number of complete 4-float vectors in the input. */
99 unsigned int quarterPoints = num_points / 4;
/* Working registers and broadcast constants. */
102 __m128 aVal, bVal, tmp, fx, mask, pow2n, z, y;
103 __m128 one, exp_hi, exp_lo, log2EF, half, exp_C1, exp_C2;
104 __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
105 __m128i emm0, pi32_0x7f;
107 one = _mm_set1_ps(1.0);
/* Clamp bounds for the input; presumably chosen so the exponent built from
 * fx further down stays representable -- TODO confirm. */
108 exp_hi = _mm_set1_ps(88.3762626647949);
109 exp_lo = _mm_set1_ps(-88.3762626647949);
/* log2(e) */
110 log2EF = _mm_set1_ps(1.44269504088896341);
111 half = _mm_set1_ps(0.5);
/* exp_C1 + exp_C2 ~= 0.69314718 (ln 2), applied as two separate subtractions
 * in the loop; presumably split to limit rounding error -- TODO confirm. */
112 exp_C1 = _mm_set1_ps(0.693359375);
113 exp_C2 = _mm_set1_ps(-2.12194440e-4);
/* 0x7f = 127, added to the integer exponent before it is shifted into place. */
114 pi32_0x7f = _mm_set1_epi32(0x7f);
/* Polynomial coefficients, highest-order term first. */
116 exp_p0 = _mm_set1_ps(1.9875691500e-4);
117 exp_p1 = _mm_set1_ps(1.3981999507e-3);
118 exp_p2 = _mm_set1_ps(8.3334519073e-3);
119 exp_p3 = _mm_set1_ps(4.1665795894e-2);
120 exp_p4 = _mm_set1_ps(1.6666665459e-1);
121 exp_p5 = _mm_set1_ps(5.0000001201e-1);
/* Vector loop: one group of four floats per iteration. */
123 for (; number < quarterPoints; number++) {
124 aVal = _mm_load_ps(aPtr);
/* NOTE(review): this zeroing is overwritten at listing line 133 before tmp
 * is ever read, so it has no effect on the result. */
125 tmp = _mm_setzero_ps();
/* Clamp every lane to [exp_lo, exp_hi]. */
127 aVal = _mm_max_ps(_mm_min_ps(aVal, exp_hi), exp_lo);
/* fx = x * log2(e) + 0.5 */
130 fx = _mm_add_ps(_mm_mul_ps(aVal, log2EF), half);
/* fx = floor(fx): truncate toward zero, convert back to float, then subtract
 * 1.0 in the lanes where the truncated value ended up greater than fx. */
132 emm0 = _mm_cvttps_epi32(fx);
133 tmp = _mm_cvtepi32_ps(emm0);
135 mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
136 fx = _mm_sub_ps(tmp, mask);
/* Reduce the argument: x = (x - fx * exp_C1) - fx * exp_C2, then z = x * x. */
138 tmp = _mm_mul_ps(fx, exp_C1);
139 z = _mm_mul_ps(fx, exp_C2);
140 aVal = _mm_sub_ps(_mm_sub_ps(aVal, tmp), z);
141 z = _mm_mul_ps(aVal, aVal);
/* Horner evaluation of
 *   y = (((((p0 x + p1) x + p2) x + p3) x + p4) x + p5) z + x + 1
 * on the reduced argument x. */
143 y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, aVal), exp_p1), aVal);
144 y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), aVal), exp_p3);
145 y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, aVal), exp_p4), aVal);
146 y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), aVal);
147 y = _mm_add_ps(y, one);
/* Build 2^fx per lane: (int)fx + 0x7f, shifted left by 23 bits so it sits in
 * the exponent field of a single-precision float, then reinterpreted as float. */
149 emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);
151 pow2n = _mm_castsi128_ps(emm0);
/* Result = polynomial value scaled by 2^fx. */
152 bVal = _mm_mul_ps(y, pow2n);
154 _mm_store_ps(bPtr, bVal);
/* NOTE(review): listing lines 155-158 are absent from this listing, so no
 * advance of aPtr/bPtr and no end of the vector loop is visible here;
 * presumably they are on the missing lines -- confirm against the header. */
/* Index of the first element not handled by the vector loop. */
159 number = quarterPoints * 4;
/* Scalar tail: remaining elements go through the C library expf(). */
160 for (; number < num_points; number++) {
161 *bPtr++ = expf(*aPtr++);
170#ifndef INCLUDED_volk_32f_exp_32f_u_H
171#define INCLUDED_volk_32f_exp_32f_u_H
174#include <emmintrin.h>
/*
 * Body of volk_32f_exp_32f_u_sse2(bVector, aVector, num_points) as shown in
 * this listing; the leading number on each line is the listing's own line
 * number, not part of the code.
 *
 * Writes an SSE2 approximation of exp(aVector[i]) to bVector[i], four floats
 * per iteration, then uses expf() for the remaining num_points % 4 elements.
 * The vector path uses _mm_loadu_ps / _mm_storeu_ps, which place no alignment
 * requirement on the addresses.
 *
 * NOTE(review): the signature and the closing braces are not part of this
 * listing; the function name is taken from the cross-reference entry that
 * places the definition at volk_32f_exp_32f.h:177 -- confirm against the header.
 */
179 float* bPtr = bVector;
180 const float* aPtr = aVector;
182 unsigned int number = 0;
/* Number of complete 4-float vectors in the input. */
183 unsigned int quarterPoints = num_points / 4;
/* Working registers and broadcast constants. */
186 __m128 aVal, bVal, tmp, fx, mask, pow2n, z, y;
187 __m128 one, exp_hi, exp_lo, log2EF, half, exp_C1, exp_C2;
188 __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
189 __m128i emm0, pi32_0x7f;
191 one = _mm_set1_ps(1.0);
/* Clamp bounds for the input; presumably chosen so the exponent built from
 * fx further down stays representable -- TODO confirm. */
192 exp_hi = _mm_set1_ps(88.3762626647949);
193 exp_lo = _mm_set1_ps(-88.3762626647949);
/* log2(e) */
194 log2EF = _mm_set1_ps(1.44269504088896341);
195 half = _mm_set1_ps(0.5);
/* exp_C1 + exp_C2 ~= 0.69314718 (ln 2), applied as two separate subtractions
 * in the loop; presumably split to limit rounding error -- TODO confirm. */
196 exp_C1 = _mm_set1_ps(0.693359375);
197 exp_C2 = _mm_set1_ps(-2.12194440e-4);
/* 0x7f = 127, added to the integer exponent before it is shifted into place. */
198 pi32_0x7f = _mm_set1_epi32(0x7f);
/* Polynomial coefficients, highest-order term first. */
200 exp_p0 = _mm_set1_ps(1.9875691500e-4);
201 exp_p1 = _mm_set1_ps(1.3981999507e-3);
202 exp_p2 = _mm_set1_ps(8.3334519073e-3);
203 exp_p3 = _mm_set1_ps(4.1665795894e-2);
204 exp_p4 = _mm_set1_ps(1.6666665459e-1);
205 exp_p5 = _mm_set1_ps(5.0000001201e-1);
/* Vector loop: one group of four floats per iteration. */
208 for (; number < quarterPoints; number++) {
209 aVal = _mm_loadu_ps(aPtr);
/* NOTE(review): this zeroing is overwritten at listing line 218 before tmp
 * is ever read, so it has no effect on the result. */
210 tmp = _mm_setzero_ps();
/* Clamp every lane to [exp_lo, exp_hi]. */
212 aVal = _mm_max_ps(_mm_min_ps(aVal, exp_hi), exp_lo);
/* fx = x * log2(e) + 0.5 */
215 fx = _mm_add_ps(_mm_mul_ps(aVal, log2EF), half);
/* fx = floor(fx): truncate toward zero, convert back to float, then subtract
 * 1.0 in the lanes where the truncated value ended up greater than fx. */
217 emm0 = _mm_cvttps_epi32(fx);
218 tmp = _mm_cvtepi32_ps(emm0);
220 mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
221 fx = _mm_sub_ps(tmp, mask);
/* Reduce the argument: x = (x - fx * exp_C1) - fx * exp_C2, then z = x * x. */
223 tmp = _mm_mul_ps(fx, exp_C1);
224 z = _mm_mul_ps(fx, exp_C2);
225 aVal = _mm_sub_ps(_mm_sub_ps(aVal, tmp), z);
226 z = _mm_mul_ps(aVal, aVal);
/* Horner evaluation of
 *   y = (((((p0 x + p1) x + p2) x + p3) x + p4) x + p5) z + x + 1
 * on the reduced argument x. */
228 y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, aVal), exp_p1), aVal);
229 y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), aVal), exp_p3);
230 y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, aVal), exp_p4), aVal);
231 y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), aVal);
232 y = _mm_add_ps(y, one);
/* Build 2^fx per lane: (int)fx + 0x7f, shifted left by 23 bits so it sits in
 * the exponent field of a single-precision float, then reinterpreted as float. */
234 emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);
236 pow2n = _mm_castsi128_ps(emm0);
/* Result = polynomial value scaled by 2^fx. */
237 bVal = _mm_mul_ps(y, pow2n);
239 _mm_storeu_ps(bPtr, bVal);
/* NOTE(review): listing lines 240-243 are absent from this listing, so no
 * advance of aPtr/bPtr and no end of the vector loop is visible here;
 * presumably they are on the missing lines -- confirm against the header. */
/* Index of the first element not handled by the vector loop. */
244 number = quarterPoints * 4;
/* Scalar tail: remaining elements go through the C library expf(). */
245 for (; number < num_points; number++) {
246 *bPtr++ = expf(*aPtr++);
253#ifdef LV_HAVE_GENERIC
/*
 * Body of volk_32f_exp_32f_generic(bVector, aVector, num_points) as shown in
 * this listing; the leading number on each line is the listing's own line
 * number, not part of the code.
 *
 * Scalar version: stores expf(aVector[i]) into bVector[i] for every
 * i in [0, num_points), walking both arrays with post-incremented pointers.
 *
 * NOTE(review): the signature and the closing braces are not part of this
 * listing; the function name is taken from the cross-reference entry that
 * places the definition at volk_32f_exp_32f.h:256 -- confirm against the header.
 */
258 float* bPtr = bVector;
259 const float* aPtr = aVector;
/* The initialiser here is redundant: the for statement resets number to 0. */
260 unsigned int number = 0;
262 for (number = 0; number < num_points; number++) {
263 *bPtr++ = expf(*aPtr++);
static void volk_32f_exp_32f_a_sse2(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_exp_32f.h:93
static void volk_32f_exp_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_exp_32f.h:256
static void volk_32f_exp_32f_u_sse2(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_exp_32f.h:177