Vector Optimized Library of Kernels 2.5.2
Architecture-tuned implementations of math kernels
 
volk_32fc_x2_square_dist_32f.h
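For every entry of points, this kernel computes the squared Euclidean distance in the complex plane from the single reference sample src0[0], i.e. target[i] = (re(src0[0] - points[i]))^2 + (im(src0[0] - points[i]))^2, as the generic implementation below spells out. The following minimal usage sketch calls that generic kernel directly; the constellation data is hypothetical, and the lv_cmake helper is assumed to come from volk/volk_complex.h rather than this header.

#include <stdio.h>
#include <volk/volk_complex.h>                 /* lv_32fc_t, lv_cmake (assumed) */
#include <volk/volk_32fc_x2_square_dist_32f.h> /* this header */

int main(void)
{
    /* Hypothetical data: one received sample and a 4-point constellation. */
    lv_32fc_t rx = lv_cmake(0.9f, -1.1f);
    lv_32fc_t constellation[4] = { lv_cmake(1.0f, 1.0f),
                                   lv_cmake(1.0f, -1.0f),
                                   lv_cmake(-1.0f, 1.0f),
                                   lv_cmake(-1.0f, -1.0f) };
    float sq_dist[4];

    /* One squared distance per constellation point. */
    volk_32fc_x2_square_dist_32f_generic(sq_dist, &rx, constellation, 4);

    for (unsigned int i = 0; i < 4; ++i)
        printf("point %u: squared distance = %f\n", i, sq_dist[i]);
    return 0;
}

Any of the SIMD variants in this file can be substituted for the generic one, provided the buffers meet the alignment the _a kernels expect.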
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: GPL-3.0-or-later
 */

#ifndef INCLUDED_volk_32fc_x2_square_dist_32f_a_H
#define INCLUDED_volk_32fc_x2_square_dist_32f_a_H

#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_32fc_x2_square_dist_32f_a_avx2(float* target,
                                                        lv_32fc_t* src0,
                                                        lv_32fc_t* points,
                                                        unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 8;
    __m128 xmm0, xmm9, xmm10;
    __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

    lv_32fc_t diff;
    float sq_dist;
    int bound = num_bytes >> 6;            /* full iterations of 8 complex points */
    int leftovers0 = (num_bytes >> 5) & 1; /* one remaining group of 4 points */
    int leftovers1 = (num_bytes >> 4) & 1; /* one remaining group of 2 points */
    int leftovers2 = (num_bytes >> 3) & 1; /* one remaining single point */
    int i = 0;

    __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    xmm1 = _mm256_setzero_ps();
    /* Broadcast src0[0] (re, im) into all four complex lanes of the 256-bit register. */
    xmm0 = _mm_load_ps((float*)src0);
    xmm0 = _mm_permute_ps(xmm0, 0b01000100);
    xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0);
    xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1);

    for (; i < bound; ++i) {
        xmm2 = _mm256_load_ps((float*)&points[0]);
        xmm3 = _mm256_load_ps((float*)&points[4]);
        points += 8;

        xmm4 = _mm256_sub_ps(xmm1, xmm2);
        xmm5 = _mm256_sub_ps(xmm1, xmm3);
        xmm6 = _mm256_mul_ps(xmm4, xmm4);
        xmm7 = _mm256_mul_ps(xmm5, xmm5);

        /* Horizontal add pairs re^2 + im^2; the permute restores output order. */
        xmm4 = _mm256_hadd_ps(xmm6, xmm7);
        xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

        _mm256_store_ps(target, xmm4);

        target += 8;
    }

    for (i = 0; i < leftovers0; ++i) {

        xmm2 = _mm256_load_ps((float*)&points[0]);

        xmm4 = _mm256_sub_ps(xmm1, xmm2);

        points += 4;

        xmm6 = _mm256_mul_ps(xmm4, xmm4);

        xmm4 = _mm256_hadd_ps(xmm6, xmm6);
        xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

        xmm9 = _mm256_extractf128_ps(xmm4, 1);
        _mm_store_ps(target, xmm9);

        target += 4;
    }

    for (i = 0; i < leftovers1; ++i) {
        xmm9 = _mm_load_ps((float*)&points[0]);

        xmm10 = _mm_sub_ps(xmm0, xmm9);

        points += 2;

        xmm9 = _mm_mul_ps(xmm10, xmm10);

        xmm10 = _mm_hadd_ps(xmm9, xmm9);

        _mm_storeh_pi((__m64*)target, xmm10);

        target += 2;
    }

    for (i = 0; i < leftovers2; ++i) {

        diff = src0[0] - points[0];

        sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);

        target[0] = sq_dist;
    }
}

#endif /*LV_HAVE_AVX2*/

#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <xmmintrin.h>

static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target,
                                                        lv_32fc_t* src0,
                                                        lv_32fc_t* points,
                                                        unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 8;

    __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

    lv_32fc_t diff;
    float sq_dist;
    int bound = num_bytes >> 5; /* groups of 4 complex points */
    int i = 0;

    /* Duplicate src0[0] (re, im) into both halves of xmm1 and preload the first
     * group of 4 points. */
    xmm1 = _mm_setzero_ps();
    xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);
    xmm2 = _mm_load_ps((float*)&points[0]);
    xmm1 = _mm_movelh_ps(xmm1, xmm1);
    xmm3 = _mm_load_ps((float*)&points[2]);

    for (; i < bound - 1; ++i) {
        xmm4 = _mm_sub_ps(xmm1, xmm2);
        xmm5 = _mm_sub_ps(xmm1, xmm3);
        points += 4;
        xmm6 = _mm_mul_ps(xmm4, xmm4);
        xmm7 = _mm_mul_ps(xmm5, xmm5);

        xmm2 = _mm_load_ps((float*)&points[0]);

        xmm4 = _mm_hadd_ps(xmm6, xmm7);

        xmm3 = _mm_load_ps((float*)&points[2]);

        _mm_store_ps(target, xmm4);

        target += 4;
    }

    /* Last full group of 4 points, already loaded above. */
    xmm4 = _mm_sub_ps(xmm1, xmm2);
    xmm5 = _mm_sub_ps(xmm1, xmm3);

    points += 4;
    xmm6 = _mm_mul_ps(xmm4, xmm4);
    xmm7 = _mm_mul_ps(xmm5, xmm5);

    xmm4 = _mm_hadd_ps(xmm6, xmm7);

    _mm_store_ps(target, xmm4);

    target += 4;

    if (num_bytes >> 4 & 1) {

        xmm2 = _mm_load_ps((float*)&points[0]);

        xmm4 = _mm_sub_ps(xmm1, xmm2);

        points += 2;

        xmm6 = _mm_mul_ps(xmm4, xmm4);

        xmm4 = _mm_hadd_ps(xmm6, xmm6);

        _mm_storeh_pi((__m64*)target, xmm4);

        target += 2;
    }

    if (num_bytes >> 3 & 1) {

        diff = src0[0] - points[0];

        sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);

        target[0] = sq_dist;
    }
}

#endif /*LV_HAVE_SSE3*/

#ifdef LV_HAVE_NEON
#include <arm_neon.h>
static inline void volk_32fc_x2_square_dist_32f_neon(float* target,
                                                     lv_32fc_t* src0,
                                                     lv_32fc_t* points,
                                                     unsigned int num_points)
{
    const unsigned int quarter_points = num_points / 4;
    unsigned int number;

    float32x4x2_t a_vec, b_vec;
    float32x4x2_t diff_vec;
    float32x4_t tmp, tmp1, dist_sq;
    /* Splat the real and imaginary parts of src0[0] across two vectors. */
    a_vec.val[0] = vdupq_n_f32(lv_creal(src0[0]));
    a_vec.val[1] = vdupq_n_f32(lv_cimag(src0[0]));
    for (number = 0; number < quarter_points; ++number) {
        /* De-interleaved load: val[0] holds four real parts, val[1] four imaginary parts. */
        b_vec = vld2q_f32((float*)points);
        diff_vec.val[0] = vsubq_f32(a_vec.val[0], b_vec.val[0]);
        diff_vec.val[1] = vsubq_f32(a_vec.val[1], b_vec.val[1]);
        tmp = vmulq_f32(diff_vec.val[0], diff_vec.val[0]);
        tmp1 = vmulq_f32(diff_vec.val[1], diff_vec.val[1]);

        dist_sq = vaddq_f32(tmp, tmp1);
        vst1q_f32(target, dist_sq);
        points += 4;
        target += 4;
    }
    /* Handle the remaining points one at a time. */
    for (number = quarter_points * 4; number < num_points; ++number) {
        lv_32fc_t diff = src0[0] - *points++;
        *target++ = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
    }
}
#endif /* LV_HAVE_NEON */

#ifdef LV_HAVE_GENERIC
static inline void volk_32fc_x2_square_dist_32f_generic(float* target,
                                                        lv_32fc_t* src0,
                                                        lv_32fc_t* points,
                                                        unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 8;

    lv_32fc_t diff;
    float sq_dist;
    unsigned int i = 0;

    /* num_bytes >> 3 == num_points: one squared distance per point. */
    for (; i < num_bytes >> 3; ++i) {
        diff = src0[0] - points[i];

        sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);

        target[i] = sq_dist;
    }
}

#endif /*LV_HAVE_GENERIC*/

#endif /*INCLUDED_volk_32fc_x2_square_dist_32f_a_H*/

#ifndef INCLUDED_volk_32fc_x2_square_dist_32f_u_H
#define INCLUDED_volk_32fc_x2_square_dist_32f_u_H

#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_32fc_x2_square_dist_32f_u_avx2(float* target,
                                                        lv_32fc_t* src0,
                                                        lv_32fc_t* points,
                                                        unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 8;
    __m128 xmm0, xmm9;
    __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

    lv_32fc_t diff;
    float sq_dist;
    int bound = num_bytes >> 6;               /* full iterations of 8 complex points */
    int leftovers1 = (num_bytes >> 3) & 0b11; /* 0-3 points handled one at a time */
    int i = 0;

    __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    xmm1 = _mm256_setzero_ps();
    /* Broadcast src0[0] (re, im) into all four complex lanes of the 256-bit register. */
    xmm0 = _mm_loadu_ps((float*)src0);
    xmm0 = _mm_permute_ps(xmm0, 0b01000100);
    xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0);
    xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1);

    for (; i < bound; ++i) {
        xmm2 = _mm256_loadu_ps((float*)&points[0]);
        xmm3 = _mm256_loadu_ps((float*)&points[4]);
        points += 8;

        xmm4 = _mm256_sub_ps(xmm1, xmm2);
        xmm5 = _mm256_sub_ps(xmm1, xmm3);
        xmm6 = _mm256_mul_ps(xmm4, xmm4);
        xmm7 = _mm256_mul_ps(xmm5, xmm5);

        /* Horizontal add pairs re^2 + im^2; the permute restores output order. */
        xmm4 = _mm256_hadd_ps(xmm6, xmm7);
        xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

        _mm256_storeu_ps(target, xmm4);

        target += 8;
    }

    if (num_bytes >> 5 & 1) {

        xmm2 = _mm256_loadu_ps((float*)&points[0]);

        xmm4 = _mm256_sub_ps(xmm1, xmm2);

        points += 4;

        xmm6 = _mm256_mul_ps(xmm4, xmm4);

        xmm4 = _mm256_hadd_ps(xmm6, xmm6);
        xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

        xmm9 = _mm256_extractf128_ps(xmm4, 1);
        _mm_storeu_ps(target, xmm9);

        target += 4;
    }

    for (i = 0; i < leftovers1; ++i) {

        diff = src0[0] - points[0];
        points += 1;

        sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);

        target[0] = sq_dist;
        target += 1;
    }
}

#endif /*LV_HAVE_AVX2*/

#endif /*INCLUDED_volk_32fc_x2_square_dist_32f_u_H*/
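
The _a kernels in the first half of this file assume VOLK-aligned buffers, while the _u kernels accept unaligned pointers. In typical applications neither is called directly; the volk_32fc_x2_square_dist_32f dispatcher generated by the library picks the fastest variant for the running CPU. A short sketch of that pattern, assuming the standard VOLK runtime helpers volk_get_alignment, volk_malloc and volk_free from <volk/volk.h>, none of which are defined in this header:

#include <volk/volk.h> /* dispatcher and allocation helpers (assumed) */

int main(void)
{
    const unsigned int n = 64;
    /* volk_malloc returns buffers aligned for the widest enabled SIMD ISA,
     * so the dispatcher is free to select an aligned (_a) kernel. */
    lv_32fc_t* points = (lv_32fc_t*)volk_malloc(n * sizeof(lv_32fc_t), volk_get_alignment());
    float* sq_dist = (float*)volk_malloc(n * sizeof(float), volk_get_alignment());
    lv_32fc_t ref = lv_cmake(0.5f, 0.25f); /* lv_cmake assumed from volk_complex.h */

    /* ... fill points ... */
    volk_32fc_x2_square_dist_32f(sq_dist, &ref, points, n);

    volk_free(sq_dist);
    volk_free(points);
    return 0;
}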