/* -*- c++ -*- */
/*
 * Copyright 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

#ifndef INCLUDED_volk_8u_x4_conv_k7_r2_8u_H
#define INCLUDED_volk_8u_x4_conv_k7_r2_8u_H

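/* Traceback decisions for one trellis stage: one bit per state (64 states),
 * viewable as bytes, shorts, or words so the scalar and SIMD variants can
 * store their decision bits and movemask results directly. */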
typedef union {
    unsigned char /*DECISIONTYPE*/ t[64 /*NUMSTATES*/ / 8 /*DECISIONTYPE_BITSIZE*/];
    unsigned int w[64 /*NUMSTATES*/ / 32];
    unsigned short s[64 /*NUMSTATES*/ / 16];
    unsigned char c[64 /*NUMSTATES*/ / 8];
#ifdef _MSC_VER
} decision_t;
#else
} decision_t __attribute__((aligned(16)));
#endif


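/* Subtract the minimum path metric from all 64 states so the 8-bit metrics
 * cannot overflow. The threshold parameter is currently unused because the
 * early-out test is commented out in the body. */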
static inline void renormalize(unsigned char* X, unsigned char threshold)
{
    int NUMSTATES = 64;
    int i;

    unsigned char min = X[0];
    // if(min > threshold) {
    for (i = 0; i < NUMSTATES; i++)
        if (min > X[i])
            min = X[i];
    for (i = 0; i < NUMSTATES; i++)
        X[i] -= min;
    //}
}


// Butterfly (add-compare-select) helper for the GENERIC version: computes the
// branch metric for trellis stage s, updates the two successor path metrics of
// butterfly i, and records the surviving-path decision bits in d.
static inline void BFLY(int i,
                        int s,
                        unsigned char* syms,
                        unsigned char* Y,
                        unsigned char* X,
                        decision_t* d,
                        unsigned char* Branchtab)
{
    int j, decision0, decision1;
    unsigned char metric, m0, m1, m2, m3;

    int NUMSTATES = 64;
    int RATE = 2;
    int METRICSHIFT = 2;
    int PRECISIONSHIFT = 2;

    // Branch metric: distance between the received symbols and the expected
    // branch symbols, scaled down so the sums stay within 8 bits.
    metric = 0;
    for (j = 0; j < RATE; j++)
        metric += (Branchtab[i + j * NUMSTATES / 2] ^ syms[s * RATE + j]) >> METRICSHIFT;
    metric = metric >> PRECISIONSHIFT;

    unsigned char max = ((RATE * ((256 - 1) >> METRICSHIFT)) >> PRECISIONSHIFT);

    // Candidate path metrics for the two successor states of this butterfly.
    m0 = X[i] + metric;
    m1 = X[i + NUMSTATES / 2] + (max - metric);
    m2 = X[i] + (max - metric);
    m3 = X[i + NUMSTATES / 2] + metric;

    // Compare-select: keep the smaller metric on each successor path.
    decision0 = (signed int)(m0 - m1) > 0;
    decision1 = (signed int)(m2 - m3) > 0;

    Y[2 * i] = decision0 ? m1 : m0;
    Y[2 * i + 1] = decision1 ? m3 : m2;

    // Pack the two decision bits for this butterfly into the traceback word.
    d->w[i / (sizeof(unsigned int) * 8 / 2) +
         s * (sizeof(decision_t) / sizeof(unsigned int))] |=
        (decision0 | decision1 << 1) << ((2 * i) & (sizeof(unsigned int) * 8 - 1));
}


//#if LV_HAVE_AVX2
//
//#include <immintrin.h>
//#include <stdio.h>
//
// static inline void volk_8u_x4_conv_k7_r2_8u_avx2(unsigned char* Y,
//                                                  unsigned char* X,
//                                                  unsigned char* syms,
//                                                  unsigned char* dec,
//                                                  unsigned int framebits,
//                                                  unsigned int excess,
//                                                  unsigned char* Branchtab)
//{
//    unsigned int i9;
//    for (i9 = 0; i9 < ((framebits + excess) >> 1); i9++) {
//        unsigned char a75, a81;
//        int a73, a92;
//        int s20, s21;
//        unsigned char *a80, *b6;
//        int *a110, *a91, *a93;
//        __m256i *a112, *a71, *a72, *a77, *a83, *a95;
//        __m256i a86, a87;
//        __m256i a76, a78, a79, a82, a84, a85, a88, a89, a90, d10, d9, m23, m24, m25,
//            m26, s18, s19, s22, s23, s24, s25, t13, t14, t15;
//        a71 = ((__m256i*)X);
//        s18 = *(a71);
//        a72 = (a71 + 1);
//        s19 = *(a72);
//        s22 = _mm256_permute2x128_si256(s18, s19, 0x20);
//        s19 = _mm256_permute2x128_si256(s18, s19, 0x31);
//        s18 = s22;
//        a73 = (4 * i9);
//        b6 = (syms + a73);
//        a75 = *(b6);
//        a76 = _mm256_set1_epi8(a75);
//        a77 = ((__m256i*)Branchtab);
//        a78 = *(a77);
//        a79 = _mm256_xor_si256(a76, a78);
//        a80 = (b6 + 1);
//        a81 = *(a80);
//        a82 = _mm256_set1_epi8(a81);
//        a83 = (a77 + 1);
//        a84 = *(a83);
//        a85 = _mm256_xor_si256(a82, a84);
//        t13 = _mm256_avg_epu8(a79, a85);
//        a86 = ((__m256i)t13);
//        a87 = _mm256_srli_epi16(a86, 2);
//        a88 = ((__m256i)a87);
//        t14 = _mm256_and_si256(a88, _mm256_set1_epi8(63));
//        t15 = _mm256_subs_epu8(_mm256_set1_epi8(63), t14);
//        m23 = _mm256_adds_epu8(s18, t14);
//        m24 = _mm256_adds_epu8(s19, t15);
//        m25 = _mm256_adds_epu8(s18, t15);
//        m26 = _mm256_adds_epu8(s19, t14);
//        a89 = _mm256_min_epu8(m24, m23);
//        d9 = _mm256_cmpeq_epi8(a89, m24);
//        a90 = _mm256_min_epu8(m26, m25);
//        d10 = _mm256_cmpeq_epi8(a90, m26);
//        s22 = _mm256_unpacklo_epi8(d9, d10);
//        s23 = _mm256_unpackhi_epi8(d9, d10);
//        s20 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x20));
//        a91 = ((int*)dec);
//        a92 = (4 * i9);
//        a93 = (a91 + a92);
//        *(a93) = s20;
//        s21 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x31));
//        a110 = (a93 + 1);
//        *(a110) = s21;
//        s22 = _mm256_unpacklo_epi8(a89, a90);
//        s23 = _mm256_unpackhi_epi8(a89, a90);
//        a95 = ((__m256i*)Y);
//        s24 = _mm256_permute2x128_si256(s22, s23, 0x20);
//        *(a95) = s24;
//        s23 = _mm256_permute2x128_si256(s22, s23, 0x31);
//        a112 = (a95 + 1);
//        *(a112) = s23;
//        if ((((unsigned char*)Y)[0] > 210)) {
//            __m256i m5, m6;
//            m5 = ((__m256i*)Y)[0];
//            m5 = _mm256_min_epu8(m5, ((__m256i*)Y)[1]);
//            __m256i m7;
//            m7 = _mm256_min_epu8(_mm256_srli_si256(m5, 8), m5);
//            m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 32)),
//                                           ((__m256i)m7)));
//            m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 16)),
//                                           ((__m256i)m7)));
//            m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 8)),
//                                           ((__m256i)m7)));
//            m7 = _mm256_unpacklo_epi8(m7, m7);
//            m7 = _mm256_shufflelo_epi16(m7, 0);
//            m6 = _mm256_unpacklo_epi64(m7, m7);
//            m6 = _mm256_permute2x128_si256(
//                m6, m6, 0); // copy lower half of m6 to upper half, since above ops
//                            // operate on 128 bit lanes
//            ((__m256i*)Y)[0] = _mm256_subs_epu8(((__m256i*)Y)[0], m6);
//            ((__m256i*)Y)[1] = _mm256_subs_epu8(((__m256i*)Y)[1], m6);
//        }
//        unsigned char a188, a194;
//        int a205;
//        int s48, s54;
//        unsigned char *a187, *a193;
//        int *a204, *a206, *a223, *b16;
//        __m256i *a184, *a185, *a190, *a196, *a208, *a225;
//        __m256i a199, a200;
//        __m256i a189, a191, a192, a195, a197, a198, a201, a202, a203, d17, d18, m39,
//            m40, m41, m42, s46, s47, s50, s51, t25, t26, t27;
//        a184 = ((__m256i*)Y);
//        s46 = *(a184);
//        a185 = (a184 + 1);
//        s47 = *(a185);
//        s50 = _mm256_permute2x128_si256(s46, s47, 0x20);
//        s47 = _mm256_permute2x128_si256(s46, s47, 0x31);
//        s46 = s50;
//        a187 = (b6 + 2);
//        a188 = *(a187);
//        a189 = _mm256_set1_epi8(a188);
//        a190 = ((__m256i*)Branchtab);
//        a191 = *(a190);
//        a192 = _mm256_xor_si256(a189, a191);
//        a193 = (b6 + 3);
//        a194 = *(a193);
//        a195 = _mm256_set1_epi8(a194);
//        a196 = (a190 + 1);
//        a197 = *(a196);
//        a198 = _mm256_xor_si256(a195, a197);
//        t25 = _mm256_avg_epu8(a192, a198);
//        a199 = ((__m256i)t25);
//        a200 = _mm256_srli_epi16(a199, 2);
//        a201 = ((__m256i)a200);
//        t26 = _mm256_and_si256(a201, _mm256_set1_epi8(63));
//        t27 = _mm256_subs_epu8(_mm256_set1_epi8(63), t26);
//        m39 = _mm256_adds_epu8(s46, t26);
//        m40 = _mm256_adds_epu8(s47, t27);
//        m41 = _mm256_adds_epu8(s46, t27);
//        m42 = _mm256_adds_epu8(s47, t26);
//        a202 = _mm256_min_epu8(m40, m39);
//        d17 = _mm256_cmpeq_epi8(a202, m40);
//        a203 = _mm256_min_epu8(m42, m41);
//        d18 = _mm256_cmpeq_epi8(a203, m42);
//        s24 = _mm256_unpacklo_epi8(d17, d18);
//        s25 = _mm256_unpackhi_epi8(d17, d18);
//        s48 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x20));
//        a204 = ((int*)dec);
//        a205 = (4 * i9);
//        b16 = (a204 + a205);
//        a206 = (b16 + 2);
//        *(a206) = s48;
//        s54 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x31));
//        a223 = (b16 + 3);
//        *(a223) = s54;
//        s50 = _mm256_unpacklo_epi8(a202, a203);
//        s51 = _mm256_unpackhi_epi8(a202, a203);
//        s25 = _mm256_permute2x128_si256(s50, s51, 0x20);
//        s51 = _mm256_permute2x128_si256(s50, s51, 0x31);
//        a208 = ((__m256i*)X);
//        *(a208) = s25;
//        a225 = (a208 + 1);
//        *(a225) = s51;
//
//        if ((((unsigned char*)X)[0] > 210)) {
//            __m256i m12, m13;
//            m12 = ((__m256i*)X)[0];
//            m12 = _mm256_min_epu8(m12, ((__m256i*)X)[1]);
//            __m256i m14;
//            m14 = _mm256_min_epu8(_mm256_srli_si256(m12, 8), m12);
//            m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 32)),
//                                            ((__m256i)m14)));
//            m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 16)),
//                                            ((__m256i)m14)));
//            m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 8)),
//                                            ((__m256i)m14)));
//            m14 = _mm256_unpacklo_epi8(m14, m14);
//            m14 = _mm256_shufflelo_epi16(m14, 0);
//            m13 = _mm256_unpacklo_epi64(m14, m14);
//            m13 = _mm256_permute2x128_si256(m13, m13, 0);
//            ((__m256i*)X)[0] = _mm256_subs_epu8(((__m256i*)X)[0], m13);
//            ((__m256i*)X)[1] = _mm256_subs_epu8(((__m256i*)X)[1], m13);
//        }
//    }
//
//    renormalize(X, 210);
//
//    unsigned int j;
//    for (j = 0; j < (framebits + excess) % 2; ++j) {
//        int i;
//        for (i = 0; i < 64 / 2; i++) {
//            BFLY(i,
//                 (((framebits + excess) >> 1) << 1) + j,
//                 syms,
//                 Y,
//                 X,
//                 (decision_t*)dec,
//                 Branchtab);
//        }
//
//        renormalize(Y, 210);
//    }
//    /*skip*/
//}
//
//#endif /*LV_HAVE_AVX2*/


#if LV_HAVE_SSE3

#include <emmintrin.h>
#include <mmintrin.h>
#include <pmmintrin.h>
#include <stdio.h>
#include <xmmintrin.h>

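// The _spiral variant below appears to be machine-generated (SPIRAL-style
// flattened straight-line code). Each iteration performs the add-compare-select
// step for two trellis stages with 128-bit SIMD, renormalizing the metrics once
// they exceed 210; any odd trailing stage falls back to the scalar BFLY.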
static inline void volk_8u_x4_conv_k7_r2_8u_spiral(unsigned char* Y,
                                                   unsigned char* X,
                                                   unsigned char* syms,
                                                   unsigned char* dec,
                                                   unsigned int framebits,
                                                   unsigned int excess,
                                                   unsigned char* Branchtab)
{
    unsigned int i9;
    for (i9 = 0; i9 < ((framebits + excess) >> 1); i9++) {
        unsigned char a75, a81;
        int a73, a92;
        short int s20, s21, s26, s27;
        unsigned char *a74, *a80, *b6;
        short int *a110, *a111, *a91, *a93, *a94;
        __m128i *a102, *a112, *a113, *a71, *a72, *a77, *a83, *a95, *a96, *a97, *a98, *a99;
        __m128i a105, a106, a86, a87;
        __m128i a100, a101, a103, a104, a107, a108, a109, a76, a78, a79, a82, a84, a85,
            a88, a89, a90, d10, d11, d12, d9, m23, m24, m25, m26, m27, m28, m29, m30, s18,
            s19, s22, s23, s24, s25, s28, s29, t13, t14, t15, t16, t17, t18;
        a71 = ((__m128i*)X);
        s18 = *(a71);
        a72 = (a71 + 2);
        s19 = *(a72);
        a73 = (4 * i9);
        a74 = (syms + a73);
        a75 = *(a74);
        a76 = _mm_set1_epi8(a75);
        a77 = ((__m128i*)Branchtab);
        a78 = *(a77);
        a79 = _mm_xor_si128(a76, a78);
        b6 = (a73 + syms);
        a80 = (b6 + 1);
        a81 = *(a80);
        a82 = _mm_set1_epi8(a81);
        a83 = (a77 + 2);
        a84 = *(a83);
        a85 = _mm_xor_si128(a82, a84);
        t13 = _mm_avg_epu8(a79, a85);
        a86 = ((__m128i)t13);
        a87 = _mm_srli_epi16(a86, 2);
        a88 = ((__m128i)a87);
        t14 = _mm_and_si128(
            a88,
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
        t15 = _mm_subs_epu8(
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
            t14);
        m23 = _mm_adds_epu8(s18, t14);
        m24 = _mm_adds_epu8(s19, t15);
        m25 = _mm_adds_epu8(s18, t15);
        m26 = _mm_adds_epu8(s19, t14);
        a89 = _mm_min_epu8(m24, m23);
        d9 = _mm_cmpeq_epi8(a89, m24);
        a90 = _mm_min_epu8(m26, m25);
        d10 = _mm_cmpeq_epi8(a90, m26);
        s20 = _mm_movemask_epi8(_mm_unpacklo_epi8(d9, d10));
        a91 = ((short int*)dec);
        a92 = (8 * i9);
        a93 = (a91 + a92);
        *(a93) = s20;
        s21 = _mm_movemask_epi8(_mm_unpackhi_epi8(d9, d10));
        a94 = (a93 + 1);
        *(a94) = s21;
        s22 = _mm_unpacklo_epi8(a89, a90);
        s23 = _mm_unpackhi_epi8(a89, a90);
        a95 = ((__m128i*)Y);
        *(a95) = s22;
        a96 = (a95 + 1);
        *(a96) = s23;
        a97 = (a71 + 1);
        s24 = *(a97);
        a98 = (a71 + 3);
        s25 = *(a98);
        a99 = (a77 + 1);
        a100 = *(a99);
        a101 = _mm_xor_si128(a76, a100);
        a102 = (a77 + 3);
        a103 = *(a102);
        a104 = _mm_xor_si128(a82, a103);
        t16 = _mm_avg_epu8(a101, a104);
        a105 = ((__m128i)t16);
        a106 = _mm_srli_epi16(a105, 2);
        a107 = ((__m128i)a106);
        t17 = _mm_and_si128(
            a107,
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
        t18 = _mm_subs_epu8(
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
            t17);
        m27 = _mm_adds_epu8(s24, t17);
        m28 = _mm_adds_epu8(s25, t18);
        m29 = _mm_adds_epu8(s24, t18);
        m30 = _mm_adds_epu8(s25, t17);
        a108 = _mm_min_epu8(m28, m27);
        d11 = _mm_cmpeq_epi8(a108, m28);
        a109 = _mm_min_epu8(m30, m29);
        d12 = _mm_cmpeq_epi8(a109, m30);
        s26 = _mm_movemask_epi8(_mm_unpacklo_epi8(d11, d12));
        a110 = (a93 + 2);
        *(a110) = s26;
        s27 = _mm_movemask_epi8(_mm_unpackhi_epi8(d11, d12));
        a111 = (a93 + 3);
        *(a111) = s27;
        s28 = _mm_unpacklo_epi8(a108, a109);
        s29 = _mm_unpackhi_epi8(a108, a109);
        a112 = (a95 + 2);
        *(a112) = s28;
        a113 = (a95 + 3);
        *(a113) = s29;
        if ((((unsigned char*)Y)[0] > 210)) {
            __m128i m5, m6;
            m5 = ((__m128i*)Y)[0];
            m5 = _mm_min_epu8(m5, ((__m128i*)Y)[1]);
            m5 = _mm_min_epu8(m5, ((__m128i*)Y)[2]);
            m5 = _mm_min_epu8(m5, ((__m128i*)Y)[3]);
            __m128i m7;
            m7 = _mm_min_epu8(_mm_srli_si128(m5, 8), m5);
            m7 =
                ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 32)), ((__m128i)m7)));
            m7 =
                ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 16)), ((__m128i)m7)));
            m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 8)), ((__m128i)m7)));
            m7 = _mm_unpacklo_epi8(m7, m7);
            m7 = _mm_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0));
            m6 = _mm_unpacklo_epi64(m7, m7);
            ((__m128i*)Y)[0] = _mm_subs_epu8(((__m128i*)Y)[0], m6);
            ((__m128i*)Y)[1] = _mm_subs_epu8(((__m128i*)Y)[1], m6);
            ((__m128i*)Y)[2] = _mm_subs_epu8(((__m128i*)Y)[2], m6);
            ((__m128i*)Y)[3] = _mm_subs_epu8(((__m128i*)Y)[3], m6);
        }
        unsigned char a188, a194;
        int a186, a205;
        short int s48, s49, s54, s55;
        unsigned char *a187, *a193, *b15;
        short int *a204, *a206, *a207, *a223, *a224, *b16;
        __m128i *a184, *a185, *a190, *a196, *a208, *a209, *a210, *a211, *a212, *a215,
            *a225, *a226;
        __m128i a199, a200, a218, a219;
        __m128i a189, a191, a192, a195, a197, a198, a201, a202, a203, a213, a214, a216,
            a217, a220, a221, a222, d17, d18, d19, d20, m39, m40, m41, m42, m43, m44, m45,
            m46, s46, s47, s50, s51, s52, s53, s56, s57, t25, t26, t27, t28, t29, t30;
        a184 = ((__m128i*)Y);
        s46 = *(a184);
        a185 = (a184 + 2);
        s47 = *(a185);
        a186 = (4 * i9);
        b15 = (a186 + syms);
        a187 = (b15 + 2);
        a188 = *(a187);
        a189 = _mm_set1_epi8(a188);
        a190 = ((__m128i*)Branchtab);
        a191 = *(a190);
        a192 = _mm_xor_si128(a189, a191);
        a193 = (b15 + 3);
        a194 = *(a193);
        a195 = _mm_set1_epi8(a194);
        a196 = (a190 + 2);
        a197 = *(a196);
        a198 = _mm_xor_si128(a195, a197);
        t25 = _mm_avg_epu8(a192, a198);
        a199 = ((__m128i)t25);
        a200 = _mm_srli_epi16(a199, 2);
        a201 = ((__m128i)a200);
        t26 = _mm_and_si128(
            a201,
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
        t27 = _mm_subs_epu8(
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
            t26);
        m39 = _mm_adds_epu8(s46, t26);
        m40 = _mm_adds_epu8(s47, t27);
        m41 = _mm_adds_epu8(s46, t27);
        m42 = _mm_adds_epu8(s47, t26);
        a202 = _mm_min_epu8(m40, m39);
        d17 = _mm_cmpeq_epi8(a202, m40);
        a203 = _mm_min_epu8(m42, m41);
        d18 = _mm_cmpeq_epi8(a203, m42);
        s48 = _mm_movemask_epi8(_mm_unpacklo_epi8(d17, d18));
        a204 = ((short int*)dec);
        a205 = (8 * i9);
        b16 = (a204 + a205);
        a206 = (b16 + 4);
        *(a206) = s48;
        s49 = _mm_movemask_epi8(_mm_unpackhi_epi8(d17, d18));
        a207 = (b16 + 5);
        *(a207) = s49;
        s50 = _mm_unpacklo_epi8(a202, a203);
        s51 = _mm_unpackhi_epi8(a202, a203);
        a208 = ((__m128i*)X);
        *(a208) = s50;
        a209 = (a208 + 1);
        *(a209) = s51;
        a210 = (a184 + 1);
        s52 = *(a210);
        a211 = (a184 + 3);
        s53 = *(a211);
        a212 = (a190 + 1);
        a213 = *(a212);
        a214 = _mm_xor_si128(a189, a213);
        a215 = (a190 + 3);
        a216 = *(a215);
        a217 = _mm_xor_si128(a195, a216);
        t28 = _mm_avg_epu8(a214, a217);
        a218 = ((__m128i)t28);
        a219 = _mm_srli_epi16(a218, 2);
        a220 = ((__m128i)a219);
        t29 = _mm_and_si128(
            a220,
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
        t30 = _mm_subs_epu8(
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
            t29);
        m43 = _mm_adds_epu8(s52, t29);
        m44 = _mm_adds_epu8(s53, t30);
        m45 = _mm_adds_epu8(s52, t30);
        m46 = _mm_adds_epu8(s53, t29);
        a221 = _mm_min_epu8(m44, m43);
        d19 = _mm_cmpeq_epi8(a221, m44);
        a222 = _mm_min_epu8(m46, m45);
        d20 = _mm_cmpeq_epi8(a222, m46);
        s54 = _mm_movemask_epi8(_mm_unpacklo_epi8(d19, d20));
        a223 = (b16 + 6);
        *(a223) = s54;
        s55 = _mm_movemask_epi8(_mm_unpackhi_epi8(d19, d20));
        a224 = (b16 + 7);
        *(a224) = s55;
        s56 = _mm_unpacklo_epi8(a221, a222);
        s57 = _mm_unpackhi_epi8(a221, a222);
        a225 = (a208 + 2);
        *(a225) = s56;
        a226 = (a208 + 3);
        *(a226) = s57;
        if ((((unsigned char*)X)[0] > 210)) {
            __m128i m12, m13;
            m12 = ((__m128i*)X)[0];
            m12 = _mm_min_epu8(m12, ((__m128i*)X)[1]);
            m12 = _mm_min_epu8(m12, ((__m128i*)X)[2]);
            m12 = _mm_min_epu8(m12, ((__m128i*)X)[3]);
            __m128i m14;
            m14 = _mm_min_epu8(_mm_srli_si128(m12, 8), m12);
            m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 32)),
                                         ((__m128i)m14)));
            m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 16)),
                                         ((__m128i)m14)));
            m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 8)),
                                         ((__m128i)m14)));
            m14 = _mm_unpacklo_epi8(m14, m14);
            m14 = _mm_shufflelo_epi16(m14, _MM_SHUFFLE(0, 0, 0, 0));
            m13 = _mm_unpacklo_epi64(m14, m14);
            ((__m128i*)X)[0] = _mm_subs_epu8(((__m128i*)X)[0], m13);
            ((__m128i*)X)[1] = _mm_subs_epu8(((__m128i*)X)[1], m13);
            ((__m128i*)X)[2] = _mm_subs_epu8(((__m128i*)X)[2], m13);
            ((__m128i*)X)[3] = _mm_subs_epu8(((__m128i*)X)[3], m13);
        }
    }

    renormalize(X, 210);

    /*int ch;
    for(ch = 0; ch < 64; ch++) {
        printf("%d,", X[ch]);
    }
    printf("\n");*/

    unsigned int j;
    for (j = 0; j < (framebits + excess) % 2; ++j) {
        int i;
        for (i = 0; i < 64 / 2; i++) {
            BFLY(i,
                 (((framebits + excess) >> 1) << 1) + j,
                 syms,
                 Y,
                 X,
                 (decision_t*)dec,
                 Branchtab);
        }

        renormalize(Y, 210);

        /*printf("\n");
        for(ch = 0; ch < 64; ch++) {
            printf("%d,", Y[ch]);
        }
        printf("\n");*/
    }
    /*skip*/
}

#endif /*LV_HAVE_SSE3*/

#if LV_HAVE_NEON

#include "volk/sse2neon.h"

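// Same generated kernel as the SSE3 variant: sse2neon.h maps each SSE
// intrinsic onto its NEON equivalent, so the body is identical.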
static inline void volk_8u_x4_conv_k7_r2_8u_neonspiral(unsigned char* Y,
                                                       unsigned char* X,
                                                       unsigned char* syms,
                                                       unsigned char* dec,
                                                       unsigned int framebits,
                                                       unsigned int excess,
                                                       unsigned char* Branchtab)
{
    unsigned int i9;
    for (i9 = 0; i9 < ((framebits + excess) >> 1); i9++) {
        unsigned char a75, a81;
        int a73, a92;
        short int s20, s21, s26, s27;
        unsigned char *a74, *a80, *b6;
        short int *a110, *a111, *a91, *a93, *a94;
        __m128i *a102, *a112, *a113, *a71, *a72, *a77, *a83, *a95, *a96, *a97, *a98, *a99;
        __m128i a105, a106, a86, a87;
        __m128i a100, a101, a103, a104, a107, a108, a109, a76, a78, a79, a82, a84, a85,
            a88, a89, a90, d10, d11, d12, d9, m23, m24, m25, m26, m27, m28, m29, m30, s18,
            s19, s22, s23, s24, s25, s28, s29, t13, t14, t15, t16, t17, t18;
        a71 = ((__m128i*)X);
        s18 = *(a71);
        a72 = (a71 + 2);
        s19 = *(a72);
        a73 = (4 * i9);
        a74 = (syms + a73);
        a75 = *(a74);
        a76 = _mm_set1_epi8(a75);
        a77 = ((__m128i*)Branchtab);
        a78 = *(a77);
        a79 = _mm_xor_si128(a76, a78);
        b6 = (a73 + syms);
        a80 = (b6 + 1);
        a81 = *(a80);
        a82 = _mm_set1_epi8(a81);
        a83 = (a77 + 2);
        a84 = *(a83);
        a85 = _mm_xor_si128(a82, a84);
        t13 = _mm_avg_epu8(a79, a85);
        a86 = ((__m128i)t13);
        a87 = _mm_srli_epi16(a86, 2);
        a88 = ((__m128i)a87);
        t14 = _mm_and_si128(
            a88,
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
        t15 = _mm_subs_epu8(
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
            t14);
        m23 = _mm_adds_epu8(s18, t14);
        m24 = _mm_adds_epu8(s19, t15);
        m25 = _mm_adds_epu8(s18, t15);
        m26 = _mm_adds_epu8(s19, t14);
        a89 = _mm_min_epu8(m24, m23);
        d9 = _mm_cmpeq_epi8(a89, m24);
        a90 = _mm_min_epu8(m26, m25);
        d10 = _mm_cmpeq_epi8(a90, m26);
        s20 = _mm_movemask_epi8(_mm_unpacklo_epi8(d9, d10));
        a91 = ((short int*)dec);
        a92 = (8 * i9);
        a93 = (a91 + a92);
        *(a93) = s20;
        s21 = _mm_movemask_epi8(_mm_unpackhi_epi8(d9, d10));
        a94 = (a93 + 1);
        *(a94) = s21;
        s22 = _mm_unpacklo_epi8(a89, a90);
        s23 = _mm_unpackhi_epi8(a89, a90);
        a95 = ((__m128i*)Y);
        *(a95) = s22;
        a96 = (a95 + 1);
        *(a96) = s23;
        a97 = (a71 + 1);
        s24 = *(a97);
        a98 = (a71 + 3);
        s25 = *(a98);
        a99 = (a77 + 1);
        a100 = *(a99);
        a101 = _mm_xor_si128(a76, a100);
        a102 = (a77 + 3);
        a103 = *(a102);
        a104 = _mm_xor_si128(a82, a103);
        t16 = _mm_avg_epu8(a101, a104);
        a105 = ((__m128i)t16);
        a106 = _mm_srli_epi16(a105, 2);
        a107 = ((__m128i)a106);
        t17 = _mm_and_si128(
            a107,
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
        t18 = _mm_subs_epu8(
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
            t17);
        m27 = _mm_adds_epu8(s24, t17);
        m28 = _mm_adds_epu8(s25, t18);
        m29 = _mm_adds_epu8(s24, t18);
        m30 = _mm_adds_epu8(s25, t17);
        a108 = _mm_min_epu8(m28, m27);
        d11 = _mm_cmpeq_epi8(a108, m28);
        a109 = _mm_min_epu8(m30, m29);
        d12 = _mm_cmpeq_epi8(a109, m30);
        s26 = _mm_movemask_epi8(_mm_unpacklo_epi8(d11, d12));
        a110 = (a93 + 2);
        *(a110) = s26;
        s27 = _mm_movemask_epi8(_mm_unpackhi_epi8(d11, d12));
        a111 = (a93 + 3);
        *(a111) = s27;
        s28 = _mm_unpacklo_epi8(a108, a109);
        s29 = _mm_unpackhi_epi8(a108, a109);
        a112 = (a95 + 2);
        *(a112) = s28;
        a113 = (a95 + 3);
        *(a113) = s29;
        if ((((unsigned char*)Y)[0] > 210)) {
            __m128i m5, m6;
            m5 = ((__m128i*)Y)[0];
            m5 = _mm_min_epu8(m5, ((__m128i*)Y)[1]);
            m5 = _mm_min_epu8(m5, ((__m128i*)Y)[2]);
            m5 = _mm_min_epu8(m5, ((__m128i*)Y)[3]);
            __m128i m7;
            m7 = _mm_min_epu8(_mm_srli_si128(m5, 8), m5);
            m7 =
                ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 32)), ((__m128i)m7)));
            m7 =
                ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 16)), ((__m128i)m7)));
            m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 8)), ((__m128i)m7)));
            m7 = _mm_unpacklo_epi8(m7, m7);
            m7 = _mm_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0));
            m6 = _mm_unpacklo_epi64(m7, m7);
            ((__m128i*)Y)[0] = _mm_subs_epu8(((__m128i*)Y)[0], m6);
            ((__m128i*)Y)[1] = _mm_subs_epu8(((__m128i*)Y)[1], m6);
            ((__m128i*)Y)[2] = _mm_subs_epu8(((__m128i*)Y)[2], m6);
            ((__m128i*)Y)[3] = _mm_subs_epu8(((__m128i*)Y)[3], m6);
        }
        unsigned char a188, a194;
        int a186, a205;
        short int s48, s49, s54, s55;
        unsigned char *a187, *a193, *b15;
        short int *a204, *a206, *a207, *a223, *a224, *b16;
        __m128i *a184, *a185, *a190, *a196, *a208, *a209, *a210, *a211, *a212, *a215,
            *a225, *a226;
        __m128i a199, a200, a218, a219;
        __m128i a189, a191, a192, a195, a197, a198, a201, a202, a203, a213, a214, a216,
            a217, a220, a221, a222, d17, d18, d19, d20, m39, m40, m41, m42, m43, m44, m45,
            m46, s46, s47, s50, s51, s52, s53, s56, s57, t25, t26, t27, t28, t29, t30;
        a184 = ((__m128i*)Y);
        s46 = *(a184);
        a185 = (a184 + 2);
        s47 = *(a185);
        a186 = (4 * i9);
        b15 = (a186 + syms);
        a187 = (b15 + 2);
        a188 = *(a187);
        a189 = _mm_set1_epi8(a188);
        a190 = ((__m128i*)Branchtab);
        a191 = *(a190);
        a192 = _mm_xor_si128(a189, a191);
        a193 = (b15 + 3);
        a194 = *(a193);
        a195 = _mm_set1_epi8(a194);
        a196 = (a190 + 2);
        a197 = *(a196);
        a198 = _mm_xor_si128(a195, a197);
        t25 = _mm_avg_epu8(a192, a198);
        a199 = ((__m128i)t25);
        a200 = _mm_srli_epi16(a199, 2);
        a201 = ((__m128i)a200);
        t26 = _mm_and_si128(
            a201,
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
        t27 = _mm_subs_epu8(
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
            t26);
        m39 = _mm_adds_epu8(s46, t26);
        m40 = _mm_adds_epu8(s47, t27);
        m41 = _mm_adds_epu8(s46, t27);
        m42 = _mm_adds_epu8(s47, t26);
        a202 = _mm_min_epu8(m40, m39);
        d17 = _mm_cmpeq_epi8(a202, m40);
        a203 = _mm_min_epu8(m42, m41);
        d18 = _mm_cmpeq_epi8(a203, m42);
        s48 = _mm_movemask_epi8(_mm_unpacklo_epi8(d17, d18));
        a204 = ((short int*)dec);
        a205 = (8 * i9);
        b16 = (a204 + a205);
        a206 = (b16 + 4);
        *(a206) = s48;
        s49 = _mm_movemask_epi8(_mm_unpackhi_epi8(d17, d18));
        a207 = (b16 + 5);
        *(a207) = s49;
        s50 = _mm_unpacklo_epi8(a202, a203);
        s51 = _mm_unpackhi_epi8(a202, a203);
        a208 = ((__m128i*)X);
        *(a208) = s50;
        a209 = (a208 + 1);
        *(a209) = s51;
        a210 = (a184 + 1);
        s52 = *(a210);
        a211 = (a184 + 3);
        s53 = *(a211);
        a212 = (a190 + 1);
        a213 = *(a212);
        a214 = _mm_xor_si128(a189, a213);
        a215 = (a190 + 3);
        a216 = *(a215);
        a217 = _mm_xor_si128(a195, a216);
        t28 = _mm_avg_epu8(a214, a217);
        a218 = ((__m128i)t28);
        a219 = _mm_srli_epi16(a218, 2);
        a220 = ((__m128i)a219);
        t29 = _mm_and_si128(
            a220,
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
        t30 = _mm_subs_epu8(
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
            t29);
        m43 = _mm_adds_epu8(s52, t29);
        m44 = _mm_adds_epu8(s53, t30);
        m45 = _mm_adds_epu8(s52, t30);
        m46 = _mm_adds_epu8(s53, t29);
        a221 = _mm_min_epu8(m44, m43);
        d19 = _mm_cmpeq_epi8(a221, m44);
        a222 = _mm_min_epu8(m46, m45);
        d20 = _mm_cmpeq_epi8(a222, m46);
        s54 = _mm_movemask_epi8(_mm_unpacklo_epi8(d19, d20));
        a223 = (b16 + 6);
        *(a223) = s54;
        s55 = _mm_movemask_epi8(_mm_unpackhi_epi8(d19, d20));
        a224 = (b16 + 7);
        *(a224) = s55;
        s56 = _mm_unpacklo_epi8(a221, a222);
        s57 = _mm_unpackhi_epi8(a221, a222);
        a225 = (a208 + 2);
        *(a225) = s56;
        a226 = (a208 + 3);
        *(a226) = s57;
        if ((((unsigned char*)X)[0] > 210)) {
            __m128i m12, m13;
            m12 = ((__m128i*)X)[0];
            m12 = _mm_min_epu8(m12, ((__m128i*)X)[1]);
            m12 = _mm_min_epu8(m12, ((__m128i*)X)[2]);
            m12 = _mm_min_epu8(m12, ((__m128i*)X)[3]);
            __m128i m14;
            m14 = _mm_min_epu8(_mm_srli_si128(m12, 8), m12);
            m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 32)),
                                         ((__m128i)m14)));
            m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 16)),
                                         ((__m128i)m14)));
            m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 8)),
                                         ((__m128i)m14)));
            m14 = _mm_unpacklo_epi8(m14, m14);
            m14 = _mm_shufflelo_epi16(m14, _MM_SHUFFLE(0, 0, 0, 0));
            m13 = _mm_unpacklo_epi64(m14, m14);
            ((__m128i*)X)[0] = _mm_subs_epu8(((__m128i*)X)[0], m13);
            ((__m128i*)X)[1] = _mm_subs_epu8(((__m128i*)X)[1], m13);
            ((__m128i*)X)[2] = _mm_subs_epu8(((__m128i*)X)[2], m13);
            ((__m128i*)X)[3] = _mm_subs_epu8(((__m128i*)X)[3], m13);
        }
    }

    renormalize(X, 210);

    /*int ch;
    for(ch = 0; ch < 64; ch++) {
        printf("%d,", X[ch]);
    }
    printf("\n");*/

    unsigned int j;
    for (j = 0; j < (framebits + excess) % 2; ++j) {
        int i;
        for (i = 0; i < 64 / 2; i++) {
            BFLY(i,
                 (((framebits + excess) >> 1) << 1) + j,
                 syms,
                 Y,
                 X,
                 (decision_t*)dec,
                 Branchtab);
        }

        renormalize(Y, 210);

        /*printf("\n");
        for(ch = 0; ch < 64; ch++) {
            printf("%d,", Y[ch]);
        }
        printf("\n");*/
    }
    /*skip*/
}

#endif /*LV_HAVE_NEON*/

#if LV_HAVE_GENERIC

static inline void volk_8u_x4_conv_k7_r2_8u_generic(unsigned char* Y,
                                                    unsigned char* X,
                                                    unsigned char* syms,
                                                    unsigned char* dec,
                                                    unsigned int framebits,
                                                    unsigned int excess,
                                                    unsigned char* Branchtab)
{
    int nbits = framebits + excess;
    int NUMSTATES = 64;
    int RENORMALIZE_THRESHOLD = 210;

    int s, i;
    for (s = 0; s < nbits; s++) {
        void* tmp;
        for (i = 0; i < NUMSTATES / 2; i++) {
            BFLY(i, s, syms, Y, X, (decision_t*)dec, Branchtab);
        }

        renormalize(Y, RENORMALIZE_THRESHOLD);

        // Swap the old and new path-metric buffers for the next stage.
        tmp = (void*)X;
        X = Y;
        Y = (unsigned char*)tmp;
    }
}

#endif /* LV_HAVE_GENERIC */
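
/* Usage sketch (illustrative only; the buffer names and setup below are the
 * editor's assumptions, not part of the VOLK API). X and Y hold the 64 path
 * metrics for the previous and current stage, Branchtab holds the 64 expected
 * branch symbols for the K=7, rate-1/2 code, syms holds two received soft
 * symbols per decoded bit, and dec must be zero-initialized with room for one
 * decision_t (8 bytes) per trellis stage, since decisions are OR-ed in:
 *
 *     unsigned char X[64] = { 0 };
 *     unsigned char Y[64] = { 0 };
 *     unsigned char Branchtab[64];                  // filled from the code polynomials
 *     unsigned char syms[2 * (framebits + excess)]; // received soft symbols
 *     unsigned char* dec = calloc(framebits + excess, sizeof(decision_t));
 *
 *     volk_8u_x4_conv_k7_r2_8u_generic(Y, X, syms, dec, framebits, excess, Branchtab);
 */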

#endif /*INCLUDED_volk_8u_x4_conv_k7_r2_8u_H*/
Definition: volk_config_fixed.tmpl.h:13