Vector Optimized Library of Kernels 3.1.2
Architecture-tuned implementations of math kernels
 
volk_8u_x4_conv_k7_r2_8u.h
/* -*- c++ -*- */
/*
 * Copyright 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

#ifndef INCLUDED_volk_8u_x4_conv_k7_r2_8u_H
#define INCLUDED_volk_8u_x4_conv_k7_r2_8u_H

typedef union {
    unsigned char /*DECISIONTYPE*/ t[64 /*NUMSTATES*/ / 8 /*DECISIONTYPE_BITSIZE*/];
    unsigned int w[64 /*NUMSTATES*/ / 32];
    unsigned short s[64 /*NUMSTATES*/ / 16];
    unsigned char c[64 /*NUMSTATES*/ / 8];
#ifdef _MSC_VER
} decision_t; /* MSVC: no __attribute__; union keeps default alignment */
#else
} decision_t __attribute__((aligned(16)));
#endif

static inline void renormalize(unsigned char* X)
{
    int NUMSTATES = 64;
    int i;

    unsigned char min = X[0];
    for (i = 0; i < NUMSTATES; i++)
        if (min > X[i])
            min = X[i];
    for (i = 0; i < NUMSTATES; i++)
        X[i] -= min;
}


// helper BFLY for GENERIC version
static inline void BFLY(int i,
                        int s,
                        unsigned char* syms,
                        unsigned char* Y,
                        unsigned char* X,
                        decision_t* d,
                        unsigned char* Branchtab)
{
    int j;
    unsigned int decision0, decision1;
    unsigned char metric, m0, m1, m2, m3;
    unsigned short metricsum;

    int NUMSTATES = 64;
    int RATE = 2;
    int METRICSHIFT = 1;
    int PRECISIONSHIFT = 2;

    metricsum = 1;
    for (j = 0; j < RATE; j++)
        metricsum += (Branchtab[i + j * NUMSTATES / 2] ^ syms[s * RATE + j]);
    metric = (metricsum >> METRICSHIFT) >> PRECISIONSHIFT;

    unsigned char max = ((RATE * ((256 - 1) >> METRICSHIFT)) >> PRECISIONSHIFT);

    m0 = X[i] + metric;
    m1 = X[i + NUMSTATES / 2] + (max - metric);
    m2 = X[i] + (max - metric);
    m3 = X[i + NUMSTATES / 2] + metric;

    decision0 = (signed int)(m0 - m1) >= 0;
    decision1 = (signed int)(m2 - m3) >= 0;

    Y[2 * i] = decision0 ? m1 : m0;
    Y[2 * i + 1] = decision1 ? m3 : m2;

    d->w[i / (sizeof(unsigned int) * 8 / 2) +
         s * (sizeof(decision_t) / sizeof(unsigned int))] |=
        (decision0 | decision1 << 1) << ((2 * i) & (sizeof(unsigned int) * 8 - 1));
}

#if LV_HAVE_AVX2

#include <immintrin.h>
#include <stdio.h>

static inline void volk_8u_x4_conv_k7_r2_8u_avx2(unsigned char* Y,
                                                 unsigned char* X,
                                                 unsigned char* syms,
                                                 unsigned char* dec,
                                                 unsigned int framebits,
                                                 unsigned int excess,
                                                 unsigned char* Branchtab)
{
    unsigned int i;
    for (i = 0; i < framebits + excess; i++) {
        unsigned char* tmp;
        unsigned int* dec_int = (unsigned int*)dec;
        __m256i a76, a78, a79, a82, a84, a85, a86, a88, a89, a90, d10, d9, m23, m24, m25,
            m26, s18, s19, s22, s23, t14, t15;

        // Butterfly
        s18 = ((__m256i*)X)[0];
        s19 = ((__m256i*)X)[1];
        a76 = _mm256_set1_epi8(syms[2 * i]);
        a78 = ((__m256i*)Branchtab)[0];
        a79 = _mm256_xor_si256(a76, a78);
        a82 = _mm256_set1_epi8(syms[2 * i + 1]);
        a84 = ((__m256i*)Branchtab)[1];
        a85 = _mm256_xor_si256(a82, a84);
        a86 = _mm256_avg_epu8(a79, a85);
        a88 = _mm256_srli_epi16(a86, 2);
        t14 = _mm256_and_si256(a88, _mm256_set1_epi8(63));
        t15 = _mm256_subs_epu8(_mm256_set1_epi8(63), t14);
        m23 = _mm256_adds_epu8(s18, t14);
        m24 = _mm256_adds_epu8(s19, t15);
        m25 = _mm256_adds_epu8(s18, t15);
        m26 = _mm256_adds_epu8(s19, t14);
        a89 = _mm256_min_epu8(m24, m23);
        d9 = _mm256_cmpeq_epi8(a89, m24);
        a90 = _mm256_min_epu8(m26, m25);
        d10 = _mm256_cmpeq_epi8(a90, m26);
        s22 = _mm256_unpacklo_epi8(d9, d10);
        s23 = _mm256_unpackhi_epi8(d9, d10);
        dec_int[2 * i] = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x20));
        dec_int[2 * i + 1] =
            _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x31));
        s22 = _mm256_unpacklo_epi8(a89, a90);
        s23 = _mm256_unpackhi_epi8(a89, a90);
        ((__m256i*)Y)[0] = _mm256_permute2x128_si256(s22, s23, 0x20);
        ((__m256i*)Y)[1] = _mm256_permute2x128_si256(s22, s23, 0x31);

        // Renormalize
        __m256i m5, m6;
        m5 = ((__m256i*)Y)[0];
        m5 = _mm256_min_epu8(m5, ((__m256i*)Y)[1]);
        m5 = ((__m256i)_mm256_min_epu8(_mm256_permute2x128_si256(m5, m5, 0x21), m5));
        __m256i m7;
        m7 = _mm256_min_epu8(_mm256_srli_si256(m5, 8), m5);
        m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 32)),
                                       ((__m256i)m7)));
        m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 16)),
                                       ((__m256i)m7)));
        m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 8)),
                                       ((__m256i)m7)));
        m7 = _mm256_unpacklo_epi8(m7, m7);
        m7 = _mm256_shufflelo_epi16(m7, 0);
        m6 = _mm256_unpacklo_epi64(m7, m7);
        m6 = _mm256_permute2x128_si256(
            m6, m6, 0); // copy lower half of m6 to upper half, since above ops
                        // operate on 128 bit lanes
        ((__m256i*)Y)[0] = _mm256_subs_epu8(((__m256i*)Y)[0], m6);
        ((__m256i*)Y)[1] = _mm256_subs_epu8(((__m256i*)Y)[1], m6);

        // Swap pointers to old and new metrics
        tmp = X;
        X = Y;
        Y = tmp;
    }
}

#endif /*LV_HAVE_AVX2*/


#if LV_HAVE_SSE3

#include <emmintrin.h>
#include <mmintrin.h>
#include <pmmintrin.h>
#include <stdio.h>
#include <xmmintrin.h>

static inline void volk_8u_x4_conv_k7_r2_8u_spiral(unsigned char* Y,
                                                   unsigned char* X,
                                                   unsigned char* syms,
                                                   unsigned char* dec,
                                                   unsigned int framebits,
                                                   unsigned int excess,
                                                   unsigned char* Branchtab)
{
    unsigned int i;
    for (i = 0; i < framebits + excess; i++) {
        unsigned char* tmp;
        unsigned short* dec_short = (unsigned short*)dec;
        __m128i a100, a101, a103, a104, a105, a107, a108, a109, a76, a78, a79, a82, a84,
            a85, a86, a88, a89, a90, d10, d11, d12, d9, m23, m24, m25, m26, m27, m28, m29,
            m30, s18, s19, s24, s25, t14, t15, t17, t18;

        // First half of butterfly
        s18 = ((__m128i*)X)[0];
        s19 = ((__m128i*)X)[2];
        a76 = _mm_set1_epi8(syms[2 * i]);
        a78 = ((__m128i*)Branchtab)[0];
        a79 = _mm_xor_si128(a76, a78);
        a82 = _mm_set1_epi8(syms[2 * i + 1]);
        a84 = ((__m128i*)Branchtab)[2];
        a85 = _mm_xor_si128(a82, a84);
        a86 = _mm_avg_epu8(a79, a85);
        a88 = _mm_srli_epi16(a86, 2);
        t14 = _mm_and_si128(a88, _mm_set1_epi8(63));
        t15 = _mm_subs_epu8(_mm_set1_epi8(63), t14);
        m23 = _mm_adds_epu8(s18, t14);
        m24 = _mm_adds_epu8(s19, t15);
        m25 = _mm_adds_epu8(s18, t15);
        m26 = _mm_adds_epu8(s19, t14);
        a89 = _mm_min_epu8(m24, m23);
        d9 = _mm_cmpeq_epi8(a89, m24);
        a90 = _mm_min_epu8(m26, m25);
        d10 = _mm_cmpeq_epi8(a90, m26);
        dec_short[4 * i] = _mm_movemask_epi8(_mm_unpacklo_epi8(d9, d10));
        dec_short[4 * i + 1] = _mm_movemask_epi8(_mm_unpackhi_epi8(d9, d10));
        ((__m128i*)Y)[0] = _mm_unpacklo_epi8(a89, a90);
        ((__m128i*)Y)[1] = _mm_unpackhi_epi8(a89, a90);

        // Second half of butterfly
        s24 = ((__m128i*)X)[1];
        s25 = ((__m128i*)X)[3];
        a100 = ((__m128i*)Branchtab)[1];
        a101 = _mm_xor_si128(a76, a100);
        a103 = ((__m128i*)Branchtab)[3];
        a104 = _mm_xor_si128(a82, a103);
        a105 = _mm_avg_epu8(a101, a104);
        a107 = _mm_srli_epi16(a105, 2);
        t17 = _mm_and_si128(a107, _mm_set1_epi8(63));
        t18 = _mm_subs_epu8(_mm_set1_epi8(63), t17);
        m27 = _mm_adds_epu8(s24, t17);
        m28 = _mm_adds_epu8(s25, t18);
        m29 = _mm_adds_epu8(s24, t18);
        m30 = _mm_adds_epu8(s25, t17);
        a108 = _mm_min_epu8(m28, m27);
        d11 = _mm_cmpeq_epi8(a108, m28);
        a109 = _mm_min_epu8(m30, m29);
        d12 = _mm_cmpeq_epi8(a109, m30);
        dec_short[4 * i + 2] = _mm_movemask_epi8(_mm_unpacklo_epi8(d11, d12));
        dec_short[4 * i + 3] = _mm_movemask_epi8(_mm_unpackhi_epi8(d11, d12));
        ((__m128i*)Y)[2] = _mm_unpacklo_epi8(a108, a109);
        ((__m128i*)Y)[3] = _mm_unpackhi_epi8(a108, a109);

        // Renormalize
        __m128i m5, m6;
        m5 = ((__m128i*)Y)[0];
        m5 = _mm_min_epu8(m5, ((__m128i*)Y)[1]);
        m5 = _mm_min_epu8(m5, ((__m128i*)Y)[2]);
        m5 = _mm_min_epu8(m5, ((__m128i*)Y)[3]);
        __m128i m7;
        m7 = _mm_min_epu8(_mm_srli_si128(m5, 8), m5);
        m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 32)), ((__m128i)m7)));
        m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 16)), ((__m128i)m7)));
        m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 8)), ((__m128i)m7)));
        m7 = _mm_unpacklo_epi8(m7, m7);
        m7 = _mm_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0));
        m6 = _mm_unpacklo_epi64(m7, m7);
        ((__m128i*)Y)[0] = _mm_subs_epu8(((__m128i*)Y)[0], m6);
        ((__m128i*)Y)[1] = _mm_subs_epu8(((__m128i*)Y)[1], m6);
        ((__m128i*)Y)[2] = _mm_subs_epu8(((__m128i*)Y)[2], m6);
        ((__m128i*)Y)[3] = _mm_subs_epu8(((__m128i*)Y)[3], m6);

        // Swap pointers to old and new metrics
        tmp = X;
        X = Y;
        Y = tmp;
    }
}

#endif /*LV_HAVE_SSE3*/

#if LV_HAVE_NEON

#include <arm_neon.h>

static inline void volk_8u_x4_conv_k7_r2_8u_neonspiral(unsigned char* Y,
                                                        unsigned char* X,
                                                        unsigned char* syms,
                                                        unsigned char* dec,
                                                        unsigned int framebits,
                                                        unsigned int excess,
                                                        unsigned char* Branchtab)
{
    unsigned int i;
    for (i = 0; i < framebits + excess; i++) {
        unsigned char* tmp;
        unsigned int* dec_int = (unsigned int*)dec;
        uint8x16_t a100, a101, a103, a104, a105, a108, a109, a76, a78, a79, a82, a84, a85,
            a86, a89, a90, d10, d11, d12, d9, m23, m24, m25, m26, m27, m28, m29, m30, s18,
            s19, s24, s25, t14, t15, t17, t18;
        uint16x8_t high_bits;
        uint32x4_t paired16;
        uint8x16_t paired32;
        uint8x8_t left, right;
        uint8x8x2_t both;

        // First half of butterfly
        s18 = ((uint8x16_t*)X)[0];
        s19 = ((uint8x16_t*)X)[2];
        a76 = vdupq_n_u8(syms[2 * i]);
        a78 = ((uint8x16_t*)Branchtab)[0];
        a79 = veorq_u8(a76, a78);
        a82 = vdupq_n_u8(syms[2 * i + 1]);
        a84 = ((uint8x16_t*)Branchtab)[2];
        a85 = veorq_u8(a82, a84);
        a86 = vrhaddq_u8(a79, a85);
        t14 = vshrq_n_u8(a86, 2);
        t15 = vqsubq_u8(vdupq_n_u8(63), t14);
        m23 = vqaddq_u8(s18, t14);
        m24 = vqaddq_u8(s19, t15);
        m25 = vqaddq_u8(s18, t15);
        m26 = vqaddq_u8(s19, t14);
        a89 = vminq_u8(m24, m23);
        d9 = vceqq_u8(a89, m24);
        a90 = vminq_u8(m26, m25);
        d10 = vceqq_u8(a90, m26);
        high_bits = vreinterpretq_u16_u8(vshrq_n_u8(d9, 7));
        paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 6));
        paired32 = vreinterpretq_u8_u32(vsraq_n_u32(paired16, paired16, 12));
        dec_int[2 * i] = ((unsigned int)vgetq_lane_u8(paired32, 0) << 0) |
                         ((unsigned int)vgetq_lane_u8(paired32, 4) << 8) |
                         ((unsigned int)vgetq_lane_u8(paired32, 8) << 16) |
                         ((unsigned int)vgetq_lane_u8(paired32, 12) << 24);
        high_bits = vreinterpretq_u16_u8(vshrq_n_u8(d10, 7));
        paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 6));
        paired32 = vreinterpretq_u8_u32(vsraq_n_u32(paired16, paired16, 12));
        dec_int[2 * i] |= ((unsigned int)vgetq_lane_u8(paired32, 0) << 1) |
                          ((unsigned int)vgetq_lane_u8(paired32, 4) << 9) |
                          ((unsigned int)vgetq_lane_u8(paired32, 8) << 17) |
                          ((unsigned int)vgetq_lane_u8(paired32, 12) << 25);
        left = vget_low_u8(a89);
        right = vget_low_u8(a90);
        both = vzip_u8(left, right);
        ((uint8x16_t*)Y)[0] = vcombine_u8(both.val[0], both.val[1]);
        left = vget_high_u8(a89);
        right = vget_high_u8(a90);
        both = vzip_u8(left, right);
        ((uint8x16_t*)Y)[1] = vcombine_u8(both.val[0], both.val[1]);

        // Second half of butterfly
        s24 = ((uint8x16_t*)X)[1];
        s25 = ((uint8x16_t*)X)[3];
        a100 = ((uint8x16_t*)Branchtab)[1];
        a101 = veorq_u8(a76, a100);
        a103 = ((uint8x16_t*)Branchtab)[3];
        a104 = veorq_u8(a82, a103);
        a105 = vrhaddq_u8(a101, a104);
        t17 = vshrq_n_u8(a105, 2);
        t18 = vqsubq_u8(vdupq_n_u8(63), t17);
        m27 = vqaddq_u8(s24, t17);
        m28 = vqaddq_u8(s25, t18);
        m29 = vqaddq_u8(s24, t18);
        m30 = vqaddq_u8(s25, t17);
        a108 = vminq_u8(m28, m27);
        d11 = vceqq_u8(a108, m28);
        a109 = vminq_u8(m30, m29);
        d12 = vceqq_u8(a109, m30);
        high_bits = vreinterpretq_u16_u8(vshrq_n_u8(d11, 7));
        paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 6));
        paired32 = vreinterpretq_u8_u32(vsraq_n_u32(paired16, paired16, 12));
        dec_int[2 * i + 1] = ((unsigned int)vgetq_lane_u8(paired32, 0) << 0) |
                             ((unsigned int)vgetq_lane_u8(paired32, 4) << 8) |
                             ((unsigned int)vgetq_lane_u8(paired32, 8) << 16) |
                             ((unsigned int)vgetq_lane_u8(paired32, 12) << 24);
        high_bits = vreinterpretq_u16_u8(vshrq_n_u8(d12, 7));
        paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 6));
        paired32 = vreinterpretq_u8_u32(vsraq_n_u32(paired16, paired16, 12));
        dec_int[2 * i + 1] |= ((unsigned int)vgetq_lane_u8(paired32, 0) << 1) |
                              ((unsigned int)vgetq_lane_u8(paired32, 4) << 9) |
                              ((unsigned int)vgetq_lane_u8(paired32, 8) << 17) |
                              ((unsigned int)vgetq_lane_u8(paired32, 12) << 25);
        left = vget_low_u8(a108);
        right = vget_low_u8(a109);
        both = vzip_u8(left, right);
        ((uint8x16_t*)Y)[2] = vcombine_u8(both.val[0], both.val[1]);
        left = vget_high_u8(a108);
        right = vget_high_u8(a109);
        both = vzip_u8(left, right);
        ((uint8x16_t*)Y)[3] = vcombine_u8(both.val[0], both.val[1]);

        // Renormalize
        uint8x16_t m5, m6;
        m5 = ((uint8x16_t*)Y)[0];
        m5 = vminq_u8(m5, ((uint8x16_t*)Y)[1]);
        m5 = vminq_u8(m5, ((uint8x16_t*)Y)[2]);
        m5 = vminq_u8(m5, ((uint8x16_t*)Y)[3]);
        uint8x8_t m7;
        m7 = vpmin_u8(vget_low_u8(m5), vget_high_u8(m5));
        m7 = vpmin_u8(m7, m7);
        m7 = vpmin_u8(m7, m7);
        m7 = vpmin_u8(m7, m7);
        m6 = vcombine_u8(m7, m7);
        ((uint8x16_t*)Y)[0] = vqsubq_u8(((uint8x16_t*)Y)[0], m6);
        ((uint8x16_t*)Y)[1] = vqsubq_u8(((uint8x16_t*)Y)[1], m6);
        ((uint8x16_t*)Y)[2] = vqsubq_u8(((uint8x16_t*)Y)[2], m6);
        ((uint8x16_t*)Y)[3] = vqsubq_u8(((uint8x16_t*)Y)[3], m6);

        // Swap pointers to old and new metrics
        tmp = X;
        X = Y;
        Y = tmp;
    }
}

#endif /*LV_HAVE_NEON*/

#if LV_HAVE_GENERIC

static inline void volk_8u_x4_conv_k7_r2_8u_generic(unsigned char* Y,
                                                    unsigned char* X,
                                                    unsigned char* syms,
                                                    unsigned char* dec,
                                                    unsigned int framebits,
                                                    unsigned int excess,
                                                    unsigned char* Branchtab)
{
    int nbits = framebits + excess;
    int NUMSTATES = 64;

    int s, i;
    for (s = 0; s < nbits; s++) {
        void* tmp;
        for (i = 0; i < NUMSTATES / 2; i++) {
            BFLY(i, s, syms, Y, X, (decision_t*)dec, Branchtab);
        }

        renormalize(Y);

        // Swap pointers to old and new metrics
        tmp = (void*)X;
        X = Y;
        Y = (unsigned char*)tmp;
    }
}

#endif /* LV_HAVE_GENERIC */

#endif /*INCLUDED_volk_8u_x4_conv_k7_r2_8u_H*/
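
Usage sketch: the snippet below illustrates, under stated assumptions, how a caller might drive this kernel through the VOLK dispatcher. Buffer sizes follow the constants visible in the header (64 path metrics, rate-1/2 symbols, one 8-byte decision_t per trellis step). The helper name example_decode, the branch-table argument, and the all-zero metric initialization are illustrative assumptions, not part of this file; the traceback step is only hinted at.

/* Illustrative only -- not taken from this header. */
#include <volk/volk.h>
#include <volk/volk_malloc.h>
#include <string.h>

static void example_decode(const unsigned char* syms, /* 2*(framebits+excess) soft symbols */
                           unsigned char* branchtab,  /* 64-byte branch table (RATE * NUMSTATES/2) */
                           unsigned int framebits,
                           unsigned int excess)
{
    const size_t alignment = volk_get_alignment();
    const size_t nsteps = framebits + excess;

    unsigned char* X = (unsigned char*)volk_malloc(64, alignment);           /* old path metrics */
    unsigned char* Y = (unsigned char*)volk_malloc(64, alignment);           /* new path metrics */
    unsigned char* dec = (unsigned char*)volk_malloc(8 * nsteps, alignment); /* packed decision bits */

    memset(X, 0, 64); /* placeholder init; real decoders bias toward the known start state */
    memset(Y, 0, 64);
    memset(dec, 0, 8 * nsteps); /* the generic kernel ORs bits into dec, so it must start cleared */

    /* The dispatcher picks the fastest enabled implementation (AVX2, SSE3, NEON, generic). */
    volk_8u_x4_conv_k7_r2_8u(Y, X, (unsigned char*)syms, dec, framebits, excess, branchtab);

    /* ... a traceback pass over dec would recover the decoded bits ... */

    volk_free(dec);
    volk_free(Y);
    volk_free(X);
}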