Vector Optimized Library of Kernels 3.0.0
Architecture-tuned implementations of math kernels
 
Loading...
Searching...
No Matches
volk_32f_sqrt_32f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
52#ifndef INCLUDED_volk_32f_sqrt_32f_a_H
53#define INCLUDED_volk_32f_sqrt_32f_a_H
54
55#include <inttypes.h>
56#include <math.h>
57#include <stdio.h>
58
59#ifdef LV_HAVE_SSE
60#include <xmmintrin.h>
61
62static inline void
63volk_32f_sqrt_32f_a_sse(float* cVector, const float* aVector, unsigned int num_points)
64{
65 unsigned int number = 0;
66 const unsigned int quarterPoints = num_points / 4;
67
68 float* cPtr = cVector;
69 const float* aPtr = aVector;
70
71 __m128 aVal, cVal;
72 for (; number < quarterPoints; number++) {
73 aVal = _mm_load_ps(aPtr);
74
75 cVal = _mm_sqrt_ps(aVal);
76
77 _mm_store_ps(cPtr, cVal); // Store the results back into the C container
78
79 aPtr += 4;
80 cPtr += 4;
81 }
82
83 number = quarterPoints * 4;
84 for (; number < num_points; number++) {
85 *cPtr++ = sqrtf(*aPtr++);
86 }
87}
88
89#endif /* LV_HAVE_SSE */
90
91#ifdef LV_HAVE_AVX
92#include <immintrin.h>
93
94static inline void
95volk_32f_sqrt_32f_a_avx(float* cVector, const float* aVector, unsigned int num_points)
96{
97 unsigned int number = 0;
98 const unsigned int eighthPoints = num_points / 8;
99
100 float* cPtr = cVector;
101 const float* aPtr = aVector;
102
103 __m256 aVal, cVal;
104 for (; number < eighthPoints; number++) {
105 aVal = _mm256_load_ps(aPtr);
106
107 cVal = _mm256_sqrt_ps(aVal);
108
109 _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
110
111 aPtr += 8;
112 cPtr += 8;
113 }
114
115 number = eighthPoints * 8;
116 for (; number < num_points; number++) {
117 *cPtr++ = sqrtf(*aPtr++);
118 }
119}
120
121#endif /* LV_HAVE_AVX */
122
123
124#ifdef LV_HAVE_NEON
125#include <arm_neon.h>
126
/*
 * Element-wise square root of a float buffer, NEON variant.
 *
 * cVector    - destination; receives num_points results.
 * aVector    - source; num_points input values.
 * num_points - number of elements to process.
 *
 * Four elements are processed per vector iteration; the remaining
 * (num_points % 4) elements go through sqrtf().
 *
 * On AArch64 the vector body uses vsqrtq_f32, a true square root. On other
 * ARM targets that intrinsic is not available, so the vector body falls back
 * to chaining two hardware estimates, recip_estimate(rsqrt_estimate(x)).
 * Those are estimate instructions: in the fallback the vector lanes are only
 * approximate, while the scalar tail is computed with sqrtf().
 */
static inline void
volk_32f_sqrt_32f_neon(float* cVector, const float* aVector, unsigned int num_points)
{
    float* cPtr = cVector;
    const float* aPtr = aVector;
    unsigned int number = 0;
    const unsigned int quarter_points = num_points / 4;

    for (number = 0; number < quarter_points; number++) {
        const float32x4_t in_vec = vld1q_f32(aPtr);
#if defined(__aarch64__) || defined(_M_ARM64)
        /* Full-precision vector square root (A64 only). */
        const float32x4_t out_vec = vsqrtq_f32(in_vec);
#else
        /* No vector sqrt instruction here: 1 / (1 / sqrt(x)) via estimates. */
        const float32x4_t out_vec = vrecpeq_f32(vrsqrteq_f32(in_vec));
#endif
        vst1q_f32(cPtr, out_vec);
        aPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail. */
    for (number = quarter_points * 4; number < num_points; number++) {
        *cPtr++ = sqrtf(*aPtr++);
    }
}
149
150#endif /* LV_HAVE_NEON */
151
152
153#ifdef LV_HAVE_GENERIC
154
155static inline void
156volk_32f_sqrt_32f_generic(float* cVector, const float* aVector, unsigned int num_points)
157{
158 float* cPtr = cVector;
159 const float* aPtr = aVector;
160 unsigned int number = 0;
161
162 for (number = 0; number < num_points; number++) {
163 *cPtr++ = sqrtf(*aPtr++);
164 }
165}
166
167#endif /* LV_HAVE_GENERIC */
168
169
170#ifdef LV_HAVE_ORC
171
/*
 * Implementation provided outside this header (presumably generated from an
 * ORC program -- confirm against the build).
 */
extern void volk_32f_sqrt_32f_a_orc_impl(float* cVector,
                                         const float* aVector,
                                         unsigned int num_points);

/*
 * Thin wrapper: forwards all three arguments unchanged to
 * volk_32f_sqrt_32f_a_orc_impl.
 *
 * NOTE(review): this wrapper is named "_u_" while the implementation it calls
 * is named "_a_"; verify that the pairing is intentional.
 */
static inline void
volk_32f_sqrt_32f_u_orc(float* cVector, const float* aVector, unsigned int num_points)
{
    volk_32f_sqrt_32f_a_orc_impl(cVector, aVector, num_points);
}
179
180#endif /* LV_HAVE_ORC */
181
182#endif /* INCLUDED_volk_32f_sqrt_32f_a_H */
183
184#ifndef INCLUDED_volk_32f_sqrt_32f_u_H
185#define INCLUDED_volk_32f_sqrt_32f_u_H
186
187#include <inttypes.h>
188#include <math.h>
189#include <stdio.h>
190#ifdef LV_HAVE_AVX
191#include <immintrin.h>
192
193static inline void
194volk_32f_sqrt_32f_u_avx(float* cVector, const float* aVector, unsigned int num_points)
195{
196 unsigned int number = 0;
197 const unsigned int eighthPoints = num_points / 8;
198
199 float* cPtr = cVector;
200 const float* aPtr = aVector;
201
202 __m256 aVal, cVal;
203 for (; number < eighthPoints; number++) {
204 aVal = _mm256_loadu_ps(aPtr);
205
206 cVal = _mm256_sqrt_ps(aVal);
207
208 _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
209
210 aPtr += 8;
211 cPtr += 8;
212 }
213
214 number = eighthPoints * 8;
215 for (; number < num_points; number++) {
216 *cPtr++ = sqrtf(*aPtr++);
217 }
218}
219
220#endif /* LV_HAVE_AVX */
221#endif /* INCLUDED_volk_32f_sqrt_32f_u_H */
float32x4_t __m128
Definition sse2neon.h:235
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition sse2neon.h:1858
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition sse2neon.h:2704
FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
Definition sse2neon.h:2659
static void volk_32f_sqrt_32f_neon(float *cVector, const float *aVector, unsigned int num_points)
Definition volk_32f_sqrt_32f.h:128
static void volk_32f_sqrt_32f_a_avx(float *cVector, const float *aVector, unsigned int num_points)
Definition volk_32f_sqrt_32f.h:95
static void volk_32f_sqrt_32f_a_sse(float *cVector, const float *aVector, unsigned int num_points)
Definition volk_32f_sqrt_32f.h:63
static void volk_32f_sqrt_32f_u_avx(float *cVector, const float *aVector, unsigned int num_points)
Definition volk_32f_sqrt_32f.h:194
static void volk_32f_sqrt_32f_generic(float *cVector, const float *aVector, unsigned int num_points)
Definition volk_32f_sqrt_32f.h:156