Vector Optimized Library of Kernels 2.5.2
Architecture-tuned implementations of math kernels
volk_32fc_x2_dot_prod_32fc.h
/* -*- c++ -*- */
/*
 * Copyright 2012, 2013, 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: GPL-3.0-or-later
 */

#ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H
#define INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H

#include <stdio.h>
#include <string.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>


#ifdef LV_HAVE_GENERIC

static inline void volk_32fc_x2_dot_prod_32fc_generic(lv_32fc_t* result,
                                                      const lv_32fc_t* input,
                                                      const lv_32fc_t* taps,
                                                      unsigned int num_points)
{

    float* res = (float*)result;
    float* in = (float*)input;
    float* tp = (float*)taps;
    unsigned int n_2_ccomplex_blocks = num_points / 2;

    float sum0[2] = { 0, 0 };
    float sum1[2] = { 0, 0 };
    unsigned int i = 0;

    for (i = 0; i < n_2_ccomplex_blocks; ++i) {
        sum0[0] += in[0] * tp[0] - in[1] * tp[1];
        sum0[1] += in[0] * tp[1] + in[1] * tp[0];
        sum1[0] += in[2] * tp[2] - in[3] * tp[3];
        sum1[1] += in[2] * tp[3] + in[3] * tp[2];

        in += 4;
        tp += 4;
    }

    res[0] = sum0[0] + sum1[0];
    res[1] = sum0[1] + sum1[1];

    // Cleanup if we had an odd number of points
    if (num_points & 1) {
        *result += input[num_points - 1] * taps[num_points - 1];
    }
}

#endif /*LV_HAVE_GENERIC*/
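
/* Editor's usage sketch (hypothetical helper, not part of the VOLK API):
 * applications would normally call the volk_32fc_x2_dot_prod_32fc()
 * dispatcher from <volk/volk.h>, which selects the fastest variant at
 * runtime; the _generic kernel is invoked directly here only to show the
 * contract: *result = sum over i of input[i] * taps[i], with neither
 * operand conjugated. */
#ifdef LV_HAVE_GENERIC
static inline void volk_32fc_x2_dot_prod_32fc_usage_sketch(void)
{
    lv_32fc_t input[3] = { lv_cmake(1.f, 2.f), lv_cmake(3.f, 4.f),
                           lv_cmake(5.f, 6.f) };
    lv_32fc_t taps[3] = { lv_cmake(1.f, 0.f), lv_cmake(0.f, 1.f),
                          lv_cmake(1.f, 1.f) };
    lv_32fc_t result;
    volk_32fc_x2_dot_prod_32fc_generic(&result, input, taps, 3);
    // (1+2j)*1 + (3+4j)*j + (5+6j)*(1+j) = (1+2j) + (-4+3j) + (-1+11j) = -4+16j
}
#endif /* LV_HAVE_GENERIC (usage sketch) */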


#if LV_HAVE_SSE && LV_HAVE_64

static inline void volk_32fc_x2_dot_prod_32fc_u_sse_64(lv_32fc_t* result,
                                                       const lv_32fc_t* input,
                                                       const lv_32fc_t* taps,
                                                       unsigned int num_points)
{

    const unsigned int num_bytes = num_points * 8;
    unsigned int isodd = num_points & 1;

106 "# ccomplex_dotprod_generic (float* result, const float *input,\n\t"
107 "# const float *taps, unsigned num_bytes)\n\t"
108 "# float sum0 = 0;\n\t"
109 "# float sum1 = 0;\n\t"
110 "# float sum2 = 0;\n\t"
111 "# float sum3 = 0;\n\t"
112 "# do {\n\t"
113 "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
114 "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
115 "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
116 "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
117 "# input += 4;\n\t"
118 "# taps += 4; \n\t"
119 "# } while (--n_2_ccomplex_blocks != 0);\n\t"
120 "# result[0] = sum0 + sum2;\n\t"
121 "# result[1] = sum1 + sum3;\n\t"
122 "# TODO: prefetch and better scheduling\n\t"
123 " xor %%r9, %%r9\n\t"
124 " xor %%r10, %%r10\n\t"
125 " movq %%rcx, %%rax\n\t"
126 " movq %%rcx, %%r8\n\t"
127 " movq %[rsi], %%r9\n\t"
128 " movq %[rdx], %%r10\n\t"
129 " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
130 " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
131 " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t"
132 " shr $4, %%r8\n\t"
133 " jmp .%=L1_test\n\t"
134 " # 4 taps / loop\n\t"
135 " # something like ?? cycles / loop\n\t"
136 ".%=Loop1: \n\t"
137 "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
138 "# movups (%%r9), %%xmmA\n\t"
139 "# movups (%%r10), %%xmmB\n\t"
140 "# movups %%xmmA, %%xmmZ\n\t"
141 "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
142 "# mulps %%xmmB, %%xmmA\n\t"
143 "# mulps %%xmmZ, %%xmmB\n\t"
144 "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
145 "# xorps %%xmmPN, %%xmmA\n\t"
146 "# movups %%xmmA, %%xmmZ\n\t"
147 "# unpcklps %%xmmB, %%xmmA\n\t"
148 "# unpckhps %%xmmB, %%xmmZ\n\t"
149 "# movups %%xmmZ, %%xmmY\n\t"
150 "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
151 "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
152 "# addps %%xmmZ, %%xmmA\n\t"
153 "# addps %%xmmA, %%xmmC\n\t"
154 "# A=xmm0, B=xmm2, Z=xmm4\n\t"
155 "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
156 " movups 0(%%r9), %%xmm0\n\t"
157 " movups 16(%%r9), %%xmm1\n\t"
158 " movups %%xmm0, %%xmm4\n\t"
159 " movups 0(%%r10), %%xmm2\n\t"
160 " mulps %%xmm2, %%xmm0\n\t"
161 " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
162 " movups 16(%%r10), %%xmm3\n\t"
163 " movups %%xmm1, %%xmm5\n\t"
164 " addps %%xmm0, %%xmm6\n\t"
165 " mulps %%xmm3, %%xmm1\n\t"
166 " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
167 " addps %%xmm1, %%xmm6\n\t"
168 " mulps %%xmm4, %%xmm2\n\t"
169 " addps %%xmm2, %%xmm7\n\t"
170 " mulps %%xmm5, %%xmm3\n\t"
171 " add $32, %%r9\n\t"
172 " addps %%xmm3, %%xmm7\n\t"
173 " add $32, %%r10\n\t"
174 ".%=L1_test:\n\t"
175 " dec %%rax\n\t"
176 " jge .%=Loop1\n\t"
177 " # We've handled the bulk of multiplies up to here.\n\t"
178 " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
179 " # If so, we've got 2 more taps to do.\n\t"
180 " and $1, %%r8\n\t"
181 " je .%=Leven\n\t"
182 " # The count was odd, do 2 more taps.\n\t"
183 " movups 0(%%r9), %%xmm0\n\t"
184 " movups %%xmm0, %%xmm4\n\t"
185 " movups 0(%%r10), %%xmm2\n\t"
186 " mulps %%xmm2, %%xmm0\n\t"
187 " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
188 " addps %%xmm0, %%xmm6\n\t"
189 " mulps %%xmm4, %%xmm2\n\t"
190 " addps %%xmm2, %%xmm7\n\t"
191 ".%=Leven:\n\t"
192 " # neg inversor\n\t"
193 " xorps %%xmm1, %%xmm1\n\t"
194 " mov $0x80000000, %%r9\n\t"
195 " movd %%r9, %%xmm1\n\t"
196 " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
197 " # pfpnacc\n\t"
198 " xorps %%xmm1, %%xmm6\n\t"
199 " movups %%xmm6, %%xmm2\n\t"
200 " unpcklps %%xmm7, %%xmm6\n\t"
201 " unpckhps %%xmm7, %%xmm2\n\t"
202 " movups %%xmm2, %%xmm3\n\t"
203 " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
204 " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
205 " addps %%xmm2, %%xmm6\n\t"
206 " # xmm6 = r1 i2 r3 i4\n\t"
207 " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
208 " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
209 " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) "
210 "to memory\n\t"
211 :
212 : [rsi] "r"(input), [rdx] "r"(taps), "c"(num_bytes), [rdi] "r"(result)
213 : "rax", "r8", "r9", "r10");


    if (isodd) {
        *result += input[num_points - 1] * taps[num_points - 1];
    }

    return;
}

#endif /* LV_HAVE_SSE && LV_HAVE_64 */
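
/* Editor's note (interpretation of the asm above): the loop keeps two
 * 4-float accumulators, xmm6 for the straight products and xmm7 for the
 * products against the re/im-swapped inputs, and consumes two complex
 * points per iteration. The .%=Leven tail handles a leftover pair, and the
 * xorps against the 0/-0/0/-0 sign mask plus the unpack/shuffle/add
 * sequence emulates the 3DNow! pfpnacc negate-accumulate step before the
 * final horizontal reduction into a single complex result. */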


#ifdef LV_HAVE_SSE3

#include <pmmintrin.h>

static inline void volk_32fc_x2_dot_prod_32fc_u_sse3(lv_32fc_t* result,
                                                     const lv_32fc_t* input,
                                                     const lv_32fc_t* taps,
                                                     unsigned int num_points)
{

    lv_32fc_t dotProduct;
    memset(&dotProduct, 0x0, 2 * sizeof(float));

    unsigned int number = 0;
    const unsigned int halfPoints = num_points / 2;
    unsigned int isodd = num_points & 1;

    __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;

    const lv_32fc_t* a = input;
    const lv_32fc_t* b = taps;

    dotProdVal = _mm_setzero_ps();

    for (; number < halfPoints; number++) {

        x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
        y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di

        yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
        yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di

        tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr

        x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br

        tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di

        z = _mm_addsub_ps(tmp1,
                          tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

        dotProdVal =
            _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together

        a += 2;
        b += 2;
    }

    __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2];

    _mm_storeu_ps((float*)dotProductVector,
                  dotProdVal); // Store the results back into the dot product vector

    dotProduct += (dotProductVector[0] + dotProductVector[1]);

    if (isodd) {
        dotProduct += input[num_points - 1] * taps[num_points - 1];
    }

    *result = dotProduct;
}

#endif /*LV_HAVE_SSE3*/
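
/* Editor's sketch (hypothetical helper, not VOLK code): the scalar
 * arithmetic behind the SIMD lane operations above. moveldup/movehdup
 * broadcast the real and imaginary parts of y, the 0xB1 shuffle swaps
 * re/im within each complex value, and addsub subtracts in even lanes and
 * adds in odd lanes, which together form the standard complex product. */
static inline lv_32fc_t volk_32fc_x2_dot_prod_32fc_scalar_madd_sketch(lv_32fc_t acc,
                                                                      lv_32fc_t a,
                                                                      lv_32fc_t b)
{
    const float ar = lv_creal(a), ai = lv_cimag(a);
    const float br = lv_creal(b), bi = lv_cimag(b);
    // even lane of z: ar*br - ai*bi; odd lane of z: ai*br + ar*bi
    return acc + lv_cmake(ar * br - ai * bi, ai * br + ar * bi);
}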

// #ifdef LV_HAVE_SSE4_1

// #include <smmintrin.h>

// static inline void volk_32fc_x2_dot_prod_32fc_u_sse4_1(lv_32fc_t* result,
//                                                        const lv_32fc_t* input,
//                                                        const lv_32fc_t* taps,
//                                                        unsigned int num_points)
// {

//     unsigned int i = 0;
//     const unsigned int qtr_points = num_points / 4;
//     const unsigned int isodd = num_points & 3;

//     __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1;
//     float *p_input, *p_taps;
//     __m64* p_result;

//     p_result = (__m64*)result;
//     p_input = (float*)input;
//     p_taps = (float*)taps;

//     static const __m128i neg = { 0x000000000000000080000000 };

//     real0 = _mm_setzero_ps();
//     real1 = _mm_setzero_ps();
//     im0 = _mm_setzero_ps();
//     im1 = _mm_setzero_ps();

//     for (; i < qtr_points; ++i) {
//         xmm0 = _mm_loadu_ps(p_input);
//         xmm1 = _mm_loadu_ps(p_taps);

//         p_input += 4;
//         p_taps += 4;

//         xmm2 = _mm_loadu_ps(p_input);
//         xmm3 = _mm_loadu_ps(p_taps);

//         p_input += 4;
//         p_taps += 4;

//         xmm4 = _mm_unpackhi_ps(xmm0, xmm2);
//         xmm5 = _mm_unpackhi_ps(xmm1, xmm3);
//         xmm0 = _mm_unpacklo_ps(xmm0, xmm2);
//         xmm2 = _mm_unpacklo_ps(xmm1, xmm3);

//         // imaginary vector from input
//         xmm1 = _mm_unpackhi_ps(xmm0, xmm4);
//         // real vector from input
//         xmm3 = _mm_unpacklo_ps(xmm0, xmm4);
//         // imaginary vector from taps
//         xmm0 = _mm_unpackhi_ps(xmm2, xmm5);
//         // real vector from taps
//         xmm2 = _mm_unpacklo_ps(xmm2, xmm5);

//         xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1);
//         xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1);

//         xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2);
//         xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2);

//         real0 = _mm_add_ps(xmm4, real0);
//         real1 = _mm_add_ps(xmm5, real1);
//         im0 = _mm_add_ps(xmm6, im0);
//         im1 = _mm_add_ps(xmm7, im1);
//     }

//     real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec);

//     im0 = _mm_add_ps(im0, im1);
//     real0 = _mm_add_ps(real0, real1);

//     im0 = _mm_add_ps(im0, real0);

//     _mm_storel_pi(p_result, im0);

//     for (i = num_points - isodd; i < num_points; i++) {
//         *result += input[i] * taps[i];
//     }
// }

// #endif /*LV_HAVE_SSE4_1*/

#ifdef LV_HAVE_AVX

#include <immintrin.h>

static inline void volk_32fc_x2_dot_prod_32fc_u_avx(lv_32fc_t* result,
                                                    const lv_32fc_t* input,
                                                    const lv_32fc_t* taps,
                                                    unsigned int num_points)
{

    unsigned int isodd = num_points & 3;
    unsigned int i = 0;
    lv_32fc_t dotProduct;
    memset(&dotProduct, 0x0, 2 * sizeof(float));

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;

    const lv_32fc_t* a = input;
    const lv_32fc_t* b = taps;

    dotProdVal = _mm256_setzero_ps();

    for (; number < quarterPoints; number++) {
        x = _mm256_loadu_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
        y = _mm256_loadu_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi

        yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
        yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi

        tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...

        x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr

        tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...

        z = _mm256_addsub_ps(tmp1,
                             tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

        dotProdVal = _mm256_add_ps(dotProdVal,
                                   z); // Add the complex multiplication results together

        a += 4;
        b += 4;
    }

    __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];

    _mm256_storeu_ps((float*)dotProductVector,
                     dotProdVal); // Store the results back into the dot product vector

    dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
                   dotProductVector[3]);

    for (i = num_points - isodd; i < num_points; i++) {
        dotProduct += input[i] * taps[i];
    }

    *result = dotProduct;
}

#endif /*LV_HAVE_AVX*/

#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32fc_x2_dot_prod_32fc_u_avx_fma(lv_32fc_t* result,
                                                        const lv_32fc_t* input,
                                                        const lv_32fc_t* taps,
                                                        unsigned int num_points)
{

    unsigned int isodd = num_points & 3;
    unsigned int i = 0;
    lv_32fc_t dotProduct;
    memset(&dotProduct, 0x0, 2 * sizeof(float));

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;

    const lv_32fc_t* a = input;
    const lv_32fc_t* b = taps;

    dotProdVal = _mm256_setzero_ps();

    for (; number < quarterPoints; number++) {

        x = _mm256_loadu_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
        y = _mm256_loadu_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi

        yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
        yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi

        tmp1 = x;

        x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr

        tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...

        z = _mm256_fmaddsub_ps(
            tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

        dotProdVal = _mm256_add_ps(dotProdVal,
                                   z); // Add the complex multiplication results together

        a += 4;
        b += 4;
    }

    __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];

    _mm256_storeu_ps((float*)dotProductVector,
                     dotProdVal); // Store the results back into the dot product vector

    dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
                   dotProductVector[3]);

    for (i = num_points - isodd; i < num_points; i++) {
        dotProduct += input[i] * taps[i];
    }

    *result = dotProduct;
}

#endif /*LV_HAVE_AVX && LV_HAVE_FMA*/
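
/* Editor's note: _mm256_fmaddsub_ps(a, b, c) computes a*b - c in even
 * lanes and a*b + c in odd lanes with a single rounding, so the FMA
 * variant above fuses the _mm256_mul_ps/_mm256_addsub_ps pair used by the
 * plain AVX kernel into one instruction. */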

#endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H*/

#ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H
#define INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H

#include <stdio.h>
#include <string.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>

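
/* Editor's note: the kernels in this aligned (_a) half of the header
 * mirror the unaligned (_u) variants above, differing mainly in the use of
 * aligned loads/stores (_mm_load_ps/_mm_store_ps, movaps, and their AVX
 * counterparts), which require buffers aligned to the vector width, e.g.
 * as returned by volk_malloc(). */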

#ifdef LV_HAVE_GENERIC

static inline void volk_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result,
                                                        const lv_32fc_t* input,
                                                        const lv_32fc_t* taps,
                                                        unsigned int num_points)
{

    const unsigned int num_bytes = num_points * 8;

    float* res = (float*)result;
    float* in = (float*)input;
    float* tp = (float*)taps;
    unsigned int n_2_ccomplex_blocks = num_bytes >> 4;

    float sum0[2] = { 0, 0 };
    float sum1[2] = { 0, 0 };
    unsigned int i = 0;

    for (i = 0; i < n_2_ccomplex_blocks; ++i) {
        sum0[0] += in[0] * tp[0] - in[1] * tp[1];
        sum0[1] += in[0] * tp[1] + in[1] * tp[0];
        sum1[0] += in[2] * tp[2] - in[3] * tp[3];
        sum1[1] += in[2] * tp[3] + in[3] * tp[2];

        in += 4;
        tp += 4;
    }

    res[0] = sum0[0] + sum1[0];
    res[1] = sum0[1] + sum1[1];

    if (num_points & 1) {
        *result += input[num_points - 1] * taps[num_points - 1];
    }
}

#endif /*LV_HAVE_GENERIC*/


#if LV_HAVE_SSE && LV_HAVE_64

static inline void volk_32fc_x2_dot_prod_32fc_a_sse_64(lv_32fc_t* result,
                                                       const lv_32fc_t* input,
                                                       const lv_32fc_t* taps,
                                                       unsigned int num_points)
{

    const unsigned int num_bytes = num_points * 8;
    unsigned int isodd = num_points & 1;

569 "# ccomplex_dotprod_generic (float* result, const float *input,\n\t"
570 "# const float *taps, unsigned num_bytes)\n\t"
571 "# float sum0 = 0;\n\t"
572 "# float sum1 = 0;\n\t"
573 "# float sum2 = 0;\n\t"
574 "# float sum3 = 0;\n\t"
575 "# do {\n\t"
576 "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
577 "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
578 "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
579 "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
580 "# input += 4;\n\t"
581 "# taps += 4; \n\t"
582 "# } while (--n_2_ccomplex_blocks != 0);\n\t"
583 "# result[0] = sum0 + sum2;\n\t"
584 "# result[1] = sum1 + sum3;\n\t"
585 "# TODO: prefetch and better scheduling\n\t"
586 " xor %%r9, %%r9\n\t"
587 " xor %%r10, %%r10\n\t"
588 " movq %%rcx, %%rax\n\t"
589 " movq %%rcx, %%r8\n\t"
590 " movq %[rsi], %%r9\n\t"
591 " movq %[rdx], %%r10\n\t"
592 " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
593 " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
594 " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t"
595 " shr $4, %%r8\n\t"
596 " jmp .%=L1_test\n\t"
597 " # 4 taps / loop\n\t"
598 " # something like ?? cycles / loop\n\t"
599 ".%=Loop1: \n\t"
600 "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
601 "# movaps (%%r9), %%xmmA\n\t"
602 "# movaps (%%r10), %%xmmB\n\t"
603 "# movaps %%xmmA, %%xmmZ\n\t"
604 "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
605 "# mulps %%xmmB, %%xmmA\n\t"
606 "# mulps %%xmmZ, %%xmmB\n\t"
607 "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
608 "# xorps %%xmmPN, %%xmmA\n\t"
609 "# movaps %%xmmA, %%xmmZ\n\t"
610 "# unpcklps %%xmmB, %%xmmA\n\t"
611 "# unpckhps %%xmmB, %%xmmZ\n\t"
612 "# movaps %%xmmZ, %%xmmY\n\t"
613 "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
614 "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
615 "# addps %%xmmZ, %%xmmA\n\t"
616 "# addps %%xmmA, %%xmmC\n\t"
617 "# A=xmm0, B=xmm2, Z=xmm4\n\t"
618 "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
619 " movaps 0(%%r9), %%xmm0\n\t"
620 " movaps 16(%%r9), %%xmm1\n\t"
621 " movaps %%xmm0, %%xmm4\n\t"
622 " movaps 0(%%r10), %%xmm2\n\t"
623 " mulps %%xmm2, %%xmm0\n\t"
624 " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
625 " movaps 16(%%r10), %%xmm3\n\t"
626 " movaps %%xmm1, %%xmm5\n\t"
627 " addps %%xmm0, %%xmm6\n\t"
628 " mulps %%xmm3, %%xmm1\n\t"
629 " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
630 " addps %%xmm1, %%xmm6\n\t"
631 " mulps %%xmm4, %%xmm2\n\t"
632 " addps %%xmm2, %%xmm7\n\t"
633 " mulps %%xmm5, %%xmm3\n\t"
634 " add $32, %%r9\n\t"
635 " addps %%xmm3, %%xmm7\n\t"
636 " add $32, %%r10\n\t"
637 ".%=L1_test:\n\t"
638 " dec %%rax\n\t"
639 " jge .%=Loop1\n\t"
640 " # We've handled the bulk of multiplies up to here.\n\t"
641 " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
642 " # If so, we've got 2 more taps to do.\n\t"
643 " and $1, %%r8\n\t"
644 " je .%=Leven\n\t"
645 " # The count was odd, do 2 more taps.\n\t"
646 " movaps 0(%%r9), %%xmm0\n\t"
647 " movaps %%xmm0, %%xmm4\n\t"
648 " movaps 0(%%r10), %%xmm2\n\t"
649 " mulps %%xmm2, %%xmm0\n\t"
650 " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
651 " addps %%xmm0, %%xmm6\n\t"
652 " mulps %%xmm4, %%xmm2\n\t"
653 " addps %%xmm2, %%xmm7\n\t"
654 ".%=Leven:\n\t"
655 " # neg inversor\n\t"
656 " xorps %%xmm1, %%xmm1\n\t"
657 " mov $0x80000000, %%r9\n\t"
658 " movd %%r9, %%xmm1\n\t"
659 " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
660 " # pfpnacc\n\t"
661 " xorps %%xmm1, %%xmm6\n\t"
662 " movaps %%xmm6, %%xmm2\n\t"
663 " unpcklps %%xmm7, %%xmm6\n\t"
664 " unpckhps %%xmm7, %%xmm2\n\t"
665 " movaps %%xmm2, %%xmm3\n\t"
666 " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
667 " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
668 " addps %%xmm2, %%xmm6\n\t"
669 " # xmm6 = r1 i2 r3 i4\n\t"
670 " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
671 " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
672 " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) "
673 "to memory\n\t"
674 :
675 : [rsi] "r"(input), [rdx] "r"(taps), "c"(num_bytes), [rdi] "r"(result)
676 : "rax", "r8", "r9", "r10");


    if (isodd) {
        *result += input[num_points - 1] * taps[num_points - 1];
    }

    return;
}

#endif /* LV_HAVE_SSE && LV_HAVE_64 */

#if LV_HAVE_SSE && LV_HAVE_32

static inline void volk_32fc_x2_dot_prod_32fc_a_sse_32(lv_32fc_t* result,
                                                       const lv_32fc_t* input,
                                                       const lv_32fc_t* taps,
                                                       unsigned int num_points)
{

    volk_32fc_x2_dot_prod_32fc_a_generic(result, input, taps, num_points);

#if 0
    const unsigned int num_bytes = num_points*8;
    unsigned int isodd = num_points & 1;

    __VOLK_ASM __VOLK_VOLATILE
        (
704 " #pushl %%ebp\n\t"
705 " #movl %%esp, %%ebp\n\t"
706 " movl 12(%%ebp), %%eax # input\n\t"
707 " movl 16(%%ebp), %%edx # taps\n\t"
708 " movl 20(%%ebp), %%ecx # n_bytes\n\t"
709 " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
710 " movaps 0(%%eax), %%xmm0\n\t"
711 " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
712 " movaps 0(%%edx), %%xmm2\n\t"
713 " shrl $5, %%ecx # ecx = n_2_ccomplex_blocks / 2\n\t"
714 " jmp .%=L1_test\n\t"
715 " # 4 taps / loop\n\t"
716 " # something like ?? cycles / loop\n\t"
717 ".%=Loop1: \n\t"
718 "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
719 "# movaps (%%eax), %%xmmA\n\t"
720 "# movaps (%%edx), %%xmmB\n\t"
721 "# movaps %%xmmA, %%xmmZ\n\t"
722 "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
723 "# mulps %%xmmB, %%xmmA\n\t"
724 "# mulps %%xmmZ, %%xmmB\n\t"
725 "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
726 "# xorps %%xmmPN, %%xmmA\n\t"
727 "# movaps %%xmmA, %%xmmZ\n\t"
728 "# unpcklps %%xmmB, %%xmmA\n\t"
729 "# unpckhps %%xmmB, %%xmmZ\n\t"
730 "# movaps %%xmmZ, %%xmmY\n\t"
731 "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
732 "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
733 "# addps %%xmmZ, %%xmmA\n\t"
734 "# addps %%xmmA, %%xmmC\n\t"
735 "# A=xmm0, B=xmm2, Z=xmm4\n\t"
736 "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
737 " movaps 16(%%eax), %%xmm1\n\t"
738 " movaps %%xmm0, %%xmm4\n\t"
739 " mulps %%xmm2, %%xmm0\n\t"
740 " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
741 " movaps 16(%%edx), %%xmm3\n\t"
742 " movaps %%xmm1, %%xmm5\n\t"
743 " addps %%xmm0, %%xmm6\n\t"
744 " mulps %%xmm3, %%xmm1\n\t"
745 " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
746 " addps %%xmm1, %%xmm6\n\t"
747 " mulps %%xmm4, %%xmm2\n\t"
748 " movaps 32(%%eax), %%xmm0\n\t"
749 " addps %%xmm2, %%xmm7\n\t"
750 " mulps %%xmm5, %%xmm3\n\t"
751 " addl $32, %%eax\n\t"
752 " movaps 32(%%edx), %%xmm2\n\t"
753 " addps %%xmm3, %%xmm7\n\t"
754 " addl $32, %%edx\n\t"
755 ".%=L1_test:\n\t"
756 " decl %%ecx\n\t"
757 " jge .%=Loop1\n\t"
758 " # We've handled the bulk of multiplies up to here.\n\t"
759 " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
760 " # If so, we've got 2 more taps to do.\n\t"
761 " movl 20(%%ebp), %%ecx # n_2_ccomplex_blocks\n\t"
762 " shrl $4, %%ecx\n\t"
763 " andl $1, %%ecx\n\t"
764 " je .%=Leven\n\t"
765 " # The count was odd, do 2 more taps.\n\t"
766 " # Note that we've already got mm0/mm2 preloaded\n\t"
767 " # from the main loop.\n\t"
768 " movaps %%xmm0, %%xmm4\n\t"
769 " mulps %%xmm2, %%xmm0\n\t"
770 " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
771 " addps %%xmm0, %%xmm6\n\t"
772 " mulps %%xmm4, %%xmm2\n\t"
773 " addps %%xmm2, %%xmm7\n\t"
774 ".%=Leven:\n\t"
775 " # neg inversor\n\t"
776 " movl 8(%%ebp), %%eax \n\t"
777 " xorps %%xmm1, %%xmm1\n\t"
778 " movl $0x80000000, (%%eax)\n\t"
779 " movss (%%eax), %%xmm1\n\t"
780 " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
781 " # pfpnacc\n\t"
782 " xorps %%xmm1, %%xmm6\n\t"
783 " movaps %%xmm6, %%xmm2\n\t"
784 " unpcklps %%xmm7, %%xmm6\n\t"
785 " unpckhps %%xmm7, %%xmm2\n\t"
786 " movaps %%xmm2, %%xmm3\n\t"
787 " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
788 " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
789 " addps %%xmm2, %%xmm6\n\t"
790 " # xmm6 = r1 i2 r3 i4\n\t"
791 " #movl 8(%%ebp), %%eax # @result\n\t"
792 " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
793 " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
794 " movlps %%xmm6, (%%eax) # store low 2x32 bits (complex) to memory\n\t"
795 " #popl %%ebp\n\t"
796 :
797 :
798 : "eax", "ecx", "edx"
799 );


    int getem = num_bytes % 16;

    if(isodd) {
        *result += (input[num_points - 1] * taps[num_points - 1]);
    }

    return;
#endif
}

#endif /* LV_HAVE_SSE && LV_HAVE_32 */

#ifdef LV_HAVE_SSE3

#include <pmmintrin.h>

static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result,
                                                     const lv_32fc_t* input,
                                                     const lv_32fc_t* taps,
                                                     unsigned int num_points)
{

    const unsigned int num_bytes = num_points * 8;
    unsigned int isodd = num_points & 1;

    lv_32fc_t dotProduct;
    memset(&dotProduct, 0x0, 2 * sizeof(float));

    unsigned int number = 0;
    const unsigned int halfPoints = num_bytes >> 4;

    __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;

    const lv_32fc_t* a = input;
    const lv_32fc_t* b = taps;

    dotProdVal = _mm_setzero_ps();

    for (; number < halfPoints; number++) {

        x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
        y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di

        yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
        yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di

        tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr

        x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br

        tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di

        z = _mm_addsub_ps(tmp1,
                          tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

        dotProdVal =
            _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together

        a += 2;
        b += 2;
    }

    __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2];

    _mm_store_ps((float*)dotProductVector,
                 dotProdVal); // Store the results back into the dot product vector

    dotProduct += (dotProductVector[0] + dotProductVector[1]);

    if (isodd) {
        dotProduct += input[num_points - 1] * taps[num_points - 1];
    }

    *result = dotProduct;
}

#endif /*LV_HAVE_SSE3*/


// #ifdef LV_HAVE_SSE4_1

// #include <smmintrin.h>

// static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result,
//                                                        const lv_32fc_t* input,
//                                                        const lv_32fc_t* taps,
//                                                        unsigned int num_points)
// {

//     unsigned int i = 0;
//     const unsigned int qtr_points = num_points / 4;
//     const unsigned int isodd = num_points & 3;

//     __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1;
//     float *p_input, *p_taps;
//     __m64* p_result;

//     static const __m128i neg = { 0x000000000000000080000000 };

//     p_result = (__m64*)result;
//     p_input = (float*)input;
//     p_taps = (float*)taps;

//     real0 = _mm_setzero_ps();
//     real1 = _mm_setzero_ps();
//     im0 = _mm_setzero_ps();
//     im1 = _mm_setzero_ps();

//     for (; i < qtr_points; ++i) {
//         xmm0 = _mm_load_ps(p_input);
//         xmm1 = _mm_load_ps(p_taps);

//         p_input += 4;
//         p_taps += 4;

//         xmm2 = _mm_load_ps(p_input);
//         xmm3 = _mm_load_ps(p_taps);

//         p_input += 4;
//         p_taps += 4;

//         xmm4 = _mm_unpackhi_ps(xmm0, xmm2);
//         xmm5 = _mm_unpackhi_ps(xmm1, xmm3);
//         xmm0 = _mm_unpacklo_ps(xmm0, xmm2);
//         xmm2 = _mm_unpacklo_ps(xmm1, xmm3);

//         // imaginary vector from input
//         xmm1 = _mm_unpackhi_ps(xmm0, xmm4);
//         // real vector from input
//         xmm3 = _mm_unpacklo_ps(xmm0, xmm4);
//         // imaginary vector from taps
//         xmm0 = _mm_unpackhi_ps(xmm2, xmm5);
//         // real vector from taps
//         xmm2 = _mm_unpacklo_ps(xmm2, xmm5);

//         xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1);
//         xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1);

//         xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2);
//         xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2);

//         real0 = _mm_add_ps(xmm4, real0);
//         real1 = _mm_add_ps(xmm5, real1);
//         im0 = _mm_add_ps(xmm6, im0);
//         im1 = _mm_add_ps(xmm7, im1);
//     }

//     real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec);

//     im0 = _mm_add_ps(im0, im1);
//     real0 = _mm_add_ps(real0, real1);

//     im0 = _mm_add_ps(im0, real0);

//     _mm_storel_pi(p_result, im0);

//     for (i = num_points - isodd; i < num_points; i++) {
//         *result += input[i] * taps[i];
//     }
// }

// #endif /*LV_HAVE_SSE4_1*/

#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_32fc_x2_dot_prod_32fc_neon(lv_32fc_t* result,
                                                   const lv_32fc_t* input,
                                                   const lv_32fc_t* taps,
                                                   unsigned int num_points)
{

    unsigned int quarter_points = num_points / 4;
    unsigned int number;

    lv_32fc_t* a_ptr = (lv_32fc_t*)taps;
    lv_32fc_t* b_ptr = (lv_32fc_t*)input;
    // for 2-lane vectors, 1st lane holds the real part,
    // 2nd lane holds the imaginary part
    float32x4x2_t a_val, b_val, c_val, accumulator;
    float32x4x2_t tmp_real, tmp_imag;
    accumulator.val[0] = vdupq_n_f32(0);
    accumulator.val[1] = vdupq_n_f32(0);

    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
        b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
        __VOLK_PREFETCH(a_ptr + 8);
        __VOLK_PREFETCH(b_ptr + 8);

        // multiply the real*real and imag*imag to get real result
        // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
        tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
        // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
        tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);

        // Multiply cross terms to get the imaginary result
        // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
        tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
        // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
        tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);

        c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
        c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);

        accumulator.val[0] = vaddq_f32(accumulator.val[0], c_val.val[0]);
        accumulator.val[1] = vaddq_f32(accumulator.val[1], c_val.val[1]);

        a_ptr += 4;
        b_ptr += 4;
    }
    lv_32fc_t accum_result[4];
    vst2q_f32((float*)accum_result, accumulator);
    *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];

    // tail case
    for (number = quarter_points * 4; number < num_points; ++number) {
        *result += (*a_ptr++) * (*b_ptr++);
    }
}
#endif /*LV_HAVE_NEON*/
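
/* Editor's note: vld2q_f32 performs a de-interleaving load, so val[0]
 * holds four consecutive real parts and val[1] the four matching imaginary
 * parts. The complex arithmetic then reduces to purely real vector ops per
 * lane: re = ar*br - ai*bi and im = ar*bi + ai*br. */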

#ifdef LV_HAVE_NEON
#include <arm_neon.h>
static inline void volk_32fc_x2_dot_prod_32fc_neon_opttests(lv_32fc_t* result,
                                                            const lv_32fc_t* input,
                                                            const lv_32fc_t* taps,
                                                            unsigned int num_points)
{

    unsigned int quarter_points = num_points / 4;
    unsigned int number;

    lv_32fc_t* a_ptr = (lv_32fc_t*)taps;
    lv_32fc_t* b_ptr = (lv_32fc_t*)input;
    // for 2-lane vectors, 1st lane holds the real part,
    // 2nd lane holds the imaginary part
    float32x4x2_t a_val, b_val, accumulator;
    float32x4x2_t tmp_imag;
    accumulator.val[0] = vdupq_n_f32(0);
    accumulator.val[1] = vdupq_n_f32(0);

    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
        b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
        __VOLK_PREFETCH(a_ptr + 8);
        __VOLK_PREFETCH(b_ptr + 8);

        // do the first multiply
        tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
        tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);

        // use multiply accumulate/subtract to get result
        tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]);
        tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]);

        accumulator.val[0] = vaddq_f32(accumulator.val[0], tmp_imag.val[0]);
        accumulator.val[1] = vaddq_f32(accumulator.val[1], tmp_imag.val[1]);

        // increment pointers
        a_ptr += 4;
        b_ptr += 4;
    }
    lv_32fc_t accum_result[4];
    vst2q_f32((float*)accum_result, accumulator);
    *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];

    // tail case
    for (number = quarter_points * 4; number < num_points; ++number) {
        *result += (*a_ptr++) * (*b_ptr++);
    }
}
#endif /*LV_HAVE_NEON*/
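
/* Editor's note: relative to volk_32fc_x2_dot_prod_32fc_neon above, the
 * _opttests variant folds the separate multiply and add/subtract steps
 * into fused vmlaq_f32/vmlsq_f32 multiply-accumulates, reducing the
 * arithmetic instruction count per iteration from eight to six. */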

#ifdef LV_HAVE_NEON
static inline void volk_32fc_x2_dot_prod_32fc_neon_optfma(lv_32fc_t* result,
                                                          const lv_32fc_t* input,
                                                          const lv_32fc_t* taps,
                                                          unsigned int num_points)
{

    unsigned int quarter_points = num_points / 4;
    unsigned int number;

    lv_32fc_t* a_ptr = (lv_32fc_t*)taps;
    lv_32fc_t* b_ptr = (lv_32fc_t*)input;
    // for 2-lane vectors, 1st lane holds the real part,
    // 2nd lane holds the imaginary part
    float32x4x2_t a_val, b_val, accumulator1, accumulator2;
    accumulator1.val[0] = vdupq_n_f32(0);
    accumulator1.val[1] = vdupq_n_f32(0);
    accumulator2.val[0] = vdupq_n_f32(0);
    accumulator2.val[1] = vdupq_n_f32(0);

    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
        b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
        __VOLK_PREFETCH(a_ptr + 8);
        __VOLK_PREFETCH(b_ptr + 8);

        // use 2 accumulators to remove inter-instruction data dependencies
        accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]);
        accumulator1.val[1] = vmlaq_f32(accumulator1.val[1], a_val.val[0], b_val.val[1]);
        accumulator2.val[0] = vmlsq_f32(accumulator2.val[0], a_val.val[1], b_val.val[1]);
        accumulator2.val[1] = vmlaq_f32(accumulator2.val[1], a_val.val[1], b_val.val[0]);
        // increment pointers
        a_ptr += 4;
        b_ptr += 4;
    }
    accumulator1.val[0] = vaddq_f32(accumulator1.val[0], accumulator2.val[0]);
    accumulator1.val[1] = vaddq_f32(accumulator1.val[1], accumulator2.val[1]);
    lv_32fc_t accum_result[4];
    vst2q_f32((float*)accum_result, accumulator1);
    *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];

    // tail case
    for (number = quarter_points * 4; number < num_points; ++number) {
        *result += (*a_ptr++) * (*b_ptr++);
    }
}
#endif /*LV_HAVE_NEON*/
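
/* Editor's note: _optfma accumulates directly into the running sums with
 * vmlaq_f32/vmlsq_f32 (four fused ops per iteration, no separate adds) and
 * splits the work across two accumulator pairs so consecutive iterations
 * do not serialize on the same destination registers. */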

#ifdef LV_HAVE_NEON
static inline void volk_32fc_x2_dot_prod_32fc_neon_optfmaunroll(lv_32fc_t* result,
                                                                const lv_32fc_t* input,
                                                                const lv_32fc_t* taps,
                                                                unsigned int num_points)
{
    // NOTE: GCC does a poor job with this kernel, but the equivalent ASM code is very
    // fast

    unsigned int quarter_points = num_points / 8; // despite the name: 8 points/iteration
    unsigned int number;

    lv_32fc_t* a_ptr = (lv_32fc_t*)taps;
    lv_32fc_t* b_ptr = (lv_32fc_t*)input;
    float32x4x4_t a_val, b_val, accumulator1, accumulator2;
    float32x4x2_t reduced_accumulator;
    accumulator1.val[0] = vdupq_n_f32(0);
    accumulator1.val[1] = vdupq_n_f32(0);
    accumulator1.val[2] = vdupq_n_f32(0);
    accumulator1.val[3] = vdupq_n_f32(0);
    accumulator2.val[0] = vdupq_n_f32(0);
    accumulator2.val[1] = vdupq_n_f32(0);
    accumulator2.val[2] = vdupq_n_f32(0);
    accumulator2.val[3] = vdupq_n_f32(0);

    // 8 input regs, 8 accumulators -> 16/16 neon regs are used
    for (number = 0; number < quarter_points; ++number) {
        // vld4q de-interleaves with stride 4:
        // val[0]=a0r|a2r|a4r|a6r, val[1]=a0i|a2i|a4i|a6i,
        // val[2]=a1r|a3r|a5r|a7r, val[3]=a1i|a3i|a5i|a7i
        a_val = vld4q_f32((float*)a_ptr);
        b_val = vld4q_f32((float*)b_ptr);
        __VOLK_PREFETCH(a_ptr + 8);
        __VOLK_PREFETCH(b_ptr + 8);

        // use 2 accumulators to remove inter-instruction data dependencies
        accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]);
        accumulator1.val[1] = vmlaq_f32(accumulator1.val[1], a_val.val[0], b_val.val[1]);

        accumulator1.val[2] = vmlaq_f32(accumulator1.val[2], a_val.val[2], b_val.val[2]);
        accumulator1.val[3] = vmlaq_f32(accumulator1.val[3], a_val.val[2], b_val.val[3]);

        accumulator2.val[0] = vmlsq_f32(accumulator2.val[0], a_val.val[1], b_val.val[1]);
        accumulator2.val[1] = vmlaq_f32(accumulator2.val[1], a_val.val[1], b_val.val[0]);

        accumulator2.val[2] = vmlsq_f32(accumulator2.val[2], a_val.val[3], b_val.val[3]);
        accumulator2.val[3] = vmlaq_f32(accumulator2.val[3], a_val.val[3], b_val.val[2]);
        // increment pointers
        a_ptr += 8;
        b_ptr += 8;
    }
    // reduce 8 accumulator lanes down to 2 (1 real and 1 imag)
    accumulator1.val[0] = vaddq_f32(accumulator1.val[0], accumulator1.val[2]);
    accumulator1.val[1] = vaddq_f32(accumulator1.val[1], accumulator1.val[3]);
    accumulator2.val[0] = vaddq_f32(accumulator2.val[0], accumulator2.val[2]);
    accumulator2.val[1] = vaddq_f32(accumulator2.val[1], accumulator2.val[3]);
    reduced_accumulator.val[0] = vaddq_f32(accumulator1.val[0], accumulator2.val[0]);
    reduced_accumulator.val[1] = vaddq_f32(accumulator1.val[1], accumulator2.val[1]);
    // now reduce accumulators to scalars
    lv_32fc_t accum_result[4];
    vst2q_f32((float*)accum_result, reduced_accumulator);
    *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];

    // tail case
    for (number = quarter_points * 8; number < num_points; ++number) {
        *result += (*a_ptr++) * (*b_ptr++);
    }
}
#endif /*LV_HAVE_NEON*/


#ifdef LV_HAVE_AVX

#include <immintrin.h>

static inline void volk_32fc_x2_dot_prod_32fc_a_avx(lv_32fc_t* result,
                                                    const lv_32fc_t* input,
                                                    const lv_32fc_t* taps,
                                                    unsigned int num_points)
{

    unsigned int isodd = num_points & 3;
    unsigned int i = 0;
    lv_32fc_t dotProduct;
    memset(&dotProduct, 0x0, 2 * sizeof(float));

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;

    const lv_32fc_t* a = input;
    const lv_32fc_t* b = taps;

    dotProdVal = _mm256_setzero_ps();

    for (; number < quarterPoints; number++) {

        x = _mm256_load_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
        y = _mm256_load_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi

        yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
        yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi

        tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...

        x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr

        tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...

        z = _mm256_addsub_ps(tmp1,
                             tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

        dotProdVal = _mm256_add_ps(dotProdVal,
                                   z); // Add the complex multiplication results together

        a += 4;
        b += 4;
    }

    __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];

    _mm256_store_ps((float*)dotProductVector,
                    dotProdVal); // Store the results back into the dot product vector

    dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
                   dotProductVector[3]);

    for (i = num_points - isodd; i < num_points; i++) {
        dotProduct += input[i] * taps[i];
    }

    *result = dotProduct;
}

#endif /*LV_HAVE_AVX*/

#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32fc_x2_dot_prod_32fc_a_avx_fma(lv_32fc_t* result,
                                                        const lv_32fc_t* input,
                                                        const lv_32fc_t* taps,
                                                        unsigned int num_points)
{

    unsigned int isodd = num_points & 3;
    unsigned int i = 0;
    lv_32fc_t dotProduct;
    memset(&dotProduct, 0x0, 2 * sizeof(float));

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;

    const lv_32fc_t* a = input;
    const lv_32fc_t* b = taps;

    dotProdVal = _mm256_setzero_ps();

    for (; number < quarterPoints; number++) {

        x = _mm256_load_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
        y = _mm256_load_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi

        yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
        yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi

        tmp1 = x;

        x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr

        tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...

        z = _mm256_fmaddsub_ps(
            tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

        dotProdVal = _mm256_add_ps(dotProdVal,
                                   z); // Add the complex multiplication results together

        a += 4;
        b += 4;
    }

    __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];

    _mm256_store_ps((float*)dotProductVector,
                    dotProdVal); // Store the results back into the dot product vector

    dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
                   dotProductVector[3]);

    for (i = num_points - isodd; i < num_points; i++) {
        dotProduct += input[i] * taps[i];
    }

    *result = dotProduct;
}

#endif /*LV_HAVE_AVX && LV_HAVE_FMA*/


#endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H*/