#ifndef INCLUDED_volk_32fc_index_max_16u_a_H
#define INCLUDED_volk_32fc_index_max_16u_a_H

#include <inttypes.h>
#include <limits.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>
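/*
 * Overview: find the index of the sample with the largest magnitude squared
 * (|x|^2 = re^2 + im^2) in a vector of lv_32fc_t samples. Because the result
 * is returned through a uint16_t, every kernel below clamps num_points to
 * USHRT_MAX; samples beyond that bound are not searched.
 *
 * Hypothetical usage sketch (buffer size and fill loop are illustrative;
 * volk_32fc_index_max_16u is the public dispatcher for these kernels):
 *
 *   uint16_t index = 0;
 *   unsigned int N = 1024;
 *   lv_32fc_t* in = (lv_32fc_t*)volk_malloc(N * sizeof(lv_32fc_t),
 *                                           volk_get_alignment());
 *   for (unsigned int n = 0; n < N; n++) {
 *       in[n] = lv_cmake((float)n / N, 0.f); // fill with test data
 *   }
 *   volk_32fc_index_max_16u(&index, in, N);
 *   volk_free(in);
 */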
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_index_max_16u_a_avx2_variant_0(uint16_t* target,
                                                            lv_32fc_t* src0,
                                                            uint32_t num_points)
{
    // The result index must fit into a uint16_t.
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. The explanation for the odd order
     * is given in the implementation of vector_32fc_index_max_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();

    // Process eight complex samples (two AVX registers) per iteration.
    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_load_ps((float*)src0);
        __m256 in1 = _mm256_load_ps((float*)(src0 + 4));
        vector_32fc_index_max_variant0(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
        src0 += 8;
    }

    // Determine the overall maximum and its index from the per-lane results.
    __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    _mm256_store_ps(max_values_buffer, max_values);
    _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);

    float max = 0.f;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (max_values_buffer[i] > max) {
            max = max_values_buffer[i];
            index = max_indices_buffer[i];
        }
    }

    // Handle the tail of up to seven samples the vectorized loop left over.
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
        if (abs_squared > max) {
            max = abs_squared;
            index = i;
        }
        ++src0;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/
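/*
 * Variant 1 below is identical in structure; it only delegates to
 * vector_32fc_index_max_variant1(), which realizes the same per-lane
 * maximum/index update with a different intrinsic sequence.
 */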
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_index_max_16u_a_avx2_variant_1(uint16_t* target,
                                                            lv_32fc_t* src0,
                                                            uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. The explanation for the odd order
     * is given in the implementation of vector_32fc_index_max_variant1().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_load_ps((float*)src0);
        __m256 in1 = _mm256_load_ps((float*)(src0 + 4));
        vector_32fc_index_max_variant1(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
        src0 += 8;
    }

    // Determine the overall maximum and its index from the per-lane results.
    __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    _mm256_store_ps(max_values_buffer, max_values);
    _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);

    float max = 0.f;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (max_values_buffer[i] > max) {
            max = max_values_buffer[i];
            index = max_indices_buffer[i];
        }
    }

    // Handle the tail of up to seven samples the vectorized loop left over.
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
        if (abs_squared > max) {
            max = abs_squared;
            index = i;
        }
        ++src0;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <xmmintrin.h>

static inline void
volk_32fc_index_max_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
    const uint32_t num_bytes = num_points * 8; // 8 bytes per complex sample

    union bit128 holderf;
    union bit128 holderi;
    float sq_dist = 0.0;

    union bit128 xmm5, xmm4;
    __m128 xmm1, xmm2, xmm3;
    __m128i xmm8, xmm11, xmm12, xmm9, xmm10;

    xmm5.int_vec = _mm_setzero_si128();
    xmm4.int_vec = _mm_setzero_si128();
    holderf.int_vec = _mm_setzero_si128();
    holderi.int_vec = _mm_setzero_si128();

    int bound = num_bytes >> 5; // number of full 4-sample (32-byte) chunks
    int i = 0;

    xmm8 = _mm_setr_epi32(0, 1, 2, 3); // indices of the current chunk
    xmm9 = _mm_setzero_si128();        // indices of the running maxima
    xmm10 = _mm_setr_epi32(4, 4, 4, 4);
    xmm3 = _mm_setzero_ps();           // running maxima (squared magnitudes)

    for (; i < bound; ++i) {
        xmm1 = _mm_load_ps((float*)src0);
        xmm2 = _mm_load_ps((float*)&src0[2]);

        src0 += 4;

        xmm1 = _mm_mul_ps(xmm1, xmm1);
        xmm2 = _mm_mul_ps(xmm2, xmm2);

        // Horizontal add yields re^2 + im^2 for each of the four samples.
        xmm1 = _mm_hadd_ps(xmm1, xmm2);

        xmm3 = _mm_max_ps(xmm1, xmm3);

        xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); // lanes where the old max survived
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); // lanes where the new value won

        // Branchless select: keep either the new or the old index per lane.
        xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);

        xmm8 = _mm_add_epi32(xmm8, xmm10);
    }

    // Two samples (16 bytes) remaining?
    if (num_bytes >> 4 & 1) {
        xmm2 = _mm_load_ps((float*)src0);

        xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
        xmm8 = bit128_p(&xmm1)->int_vec;

        xmm2 = _mm_mul_ps(xmm2, xmm2);

        src0 += 2;

        xmm1 = _mm_hadd_ps(xmm2, xmm2);

        xmm3 = _mm_max_ps(xmm1, xmm3);

        xmm10 = _mm_setr_epi32(2, 2, 2, 2);

        xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);

        xmm8 = _mm_add_epi32(xmm8, xmm10);
    }

    // One sample (8 bytes) remaining?
    if (num_bytes >> 3 & 1) {
        sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) +
                  lv_cimag(src0[0]) * lv_cimag(src0[0]);

        xmm2 = _mm_load1_ps(&sq_dist);

        xmm1 = xmm3;

        xmm3 = _mm_max_ss(xmm3, xmm2);

        xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm8 = _mm_shuffle_epi32(xmm8, 0x00);

        xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);
    }

    _mm_store_ps((float*)&(holderf.f), xmm3);
    _mm_store_si128(&(holderi.int_vec), xmm9);

    // Reduce the four per-lane maxima to the single result.
    target[0] = holderi.i[0];
    sq_dist = holderf.f[0];
    target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
    sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
    target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
    sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
    target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
    sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
}

#endif /*LV_HAVE_SSE3*/
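/*
 * Illustrative scalar model (an assumption for exposition, not part of VOLK)
 * of the branchless per-lane index update performed by the SSE3 loops above.
 * Each SIMD lane effectively computes:
 */
static inline uint32_t volk_index_max_update_sketch(float candidate,
                                                    float current_max,
                                                    uint32_t candidate_index,
                                                    uint32_t current_index)
{
    const float new_max = (candidate > current_max) ? candidate : current_max;
    const uint32_t eq = (candidate == new_max) ? 0xFFFFFFFFu : 0u; /* _mm_cmpeq_ps */
    const uint32_t lt = (candidate < new_max) ? 0xFFFFFFFFu : 0u;  /* _mm_cmplt_ps */
    /* Exactly one of the two masks is set, so adding the disjoint masked terms
     * (xmm11 + xmm12 above) selects the index of the surviving maximum. */
    return (candidate_index & eq) + (current_index & lt);
}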
#ifdef LV_HAVE_GENERIC
static inline void
volk_32fc_index_max_16u_generic(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const uint32_t num_bytes = num_points * 8;

    float sq_dist = 0.0;
    float max = 0.0;
    uint16_t index = 0;

    uint32_t i = 0;

    // One complex sample occupies 8 bytes, so there are num_bytes >> 3 samples.
    for (; i < num_bytes >> 3; ++i) {
        sq_dist = lv_creal(src0[i]) * lv_creal(src0[i]) +
                  lv_cimag(src0[i]) * lv_cimag(src0[i]);

        if (sq_dist > max) {
            index = i;
            max = sq_dist;
        }
    }
    target[0] = index;
}

#endif /*LV_HAVE_GENERIC*/
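/*
 * Note: all kernels compare squared magnitudes (re^2 + im^2) rather than
 * magnitudes. Since sqrt() is monotonic, the index of the maximum |x|^2 is
 * also the index of the maximum |x|, so one sqrt per sample is saved.
 */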
#endif /*INCLUDED_volk_32fc_index_max_16u_a_H*/


#ifndef INCLUDED_volk_32fc_index_max_16u_u_H
#define INCLUDED_volk_32fc_index_max_16u_u_H

#include <inttypes.h>
#include <limits.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_index_max_16u_u_avx2_variant_0(uint16_t* target,
                                                            lv_32fc_t* src0,
                                                            uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. The explanation for the odd order
     * is given in the implementation of vector_32fc_index_max_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_loadu_ps((float*)src0);
        __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
        vector_32fc_index_max_variant0(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
        src0 += 8;
    }

    // Determine the overall maximum and its index from the per-lane results.
    __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    _mm256_store_ps(max_values_buffer, max_values);
    _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);

    float max = 0.f;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (max_values_buffer[i] > max) {
            max = max_values_buffer[i];
            index = max_indices_buffer[i];
        }
    }

    // Handle the tail of up to seven samples the vectorized loop left over.
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
        if (abs_squared > max) {
            max = abs_squared;
            index = i;
        }
        ++src0;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/
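/*
 * The unaligned kernels differ from their aligned counterparts only in using
 * _mm256_loadu_ps, which tolerates input buffers without 32-byte alignment;
 * the local reduction buffers are declared aligned, so those stores stay aligned.
 */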
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_index_max_16u_u_avx2_variant_1(uint16_t* target,
                                                            lv_32fc_t* src0,
                                                            uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. The explanation for the odd order
     * is given in the implementation of vector_32fc_index_max_variant1().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_loadu_ps((float*)src0);
        __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
        vector_32fc_index_max_variant1(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
        src0 += 8;
    }

    // Determine the overall maximum and its index from the per-lane results.
    __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    _mm256_store_ps(max_values_buffer, max_values);
    _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);

    float max = 0.f;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (max_values_buffer[i] > max) {
            max = max_values_buffer[i];
            index = max_indices_buffer[i];
        }
    }

    // Handle the tail of up to seven samples the vectorized loop left over.
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
        if (abs_squared > max) {
            max = abs_squared;
            index = i;
        }
        ++src0;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/

#endif /*INCLUDED_volk_32fc_index_max_16u_u_H*/