#ifndef INCLUDED_volk_8u_x4_conv_k7_r2_8u_H
#define INCLUDED_volk_8u_x4_conv_k7_r2_8u_H

typedef union {
    unsigned char t[64 / 8];
    unsigned int w[64 / 32];
    unsigned short s[64 / 16];
    unsigned char c[64 / 8];
} decision_t;
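/*
 * One decision_t holds the 64 survivor decisions of a single trellis stage,
 * one bit per state, viewable as bytes, shorts, or 32-bit words.  A minimal
 * sketch of reading a decision bit back out during traceback, given the
 * decision_t of one stage (e.g. ((decision_t*)dec)[s]); the helper name is
 * illustrative, not part of this kernel's API, and it assumes the bit layout
 * written by BFLY() below:
 */
static inline unsigned int decision_bit(const decision_t* d, int state)
{
    /* The stage's bit for 'state' lives in word state / 32, bit state % 32. */
    return (d->w[state / 32] >> (state & 31)) & 1;
}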
/* Path metrics are kept in unsigned bytes, so each stage subtracts the
   running minimum to keep the metrics from wrapping (or saturating in the
   SIMD kernels below), which would blind the butterfly comparisons. */
static inline void renormalize(unsigned char* X)
{
    int NUMSTATES = 64;
    int i;

    /* Find the smallest path metric... */
    unsigned char min = X[0];
    for (i = 0; i < NUMSTATES; i++)
        if (X[i] < min)
            min = X[i];
    /* ...and subtract it from every state. */
    for (i = 0; i < NUMSTATES; i++)
        X[i] -= min;
}
static inline void BFLY(int i,
                        int s,
                        unsigned char* syms,
                        unsigned char* Y,
                        unsigned char* X,
                        decision_t* d,
                        unsigned char* Branchtab)
{
    int j;
    unsigned int decision0, decision1;
    unsigned char metric, m0, m1, m2, m3;
    unsigned short metricsum;

    int NUMSTATES = 64;
    int RATE = 2;
    int METRICSHIFT = 2;
    int PRECISIONSHIFT = 2;

    /* Branch metric: distance between the received symbols and the expected
       symbols for this butterfly, scaled down to fit a byte. */
    metricsum = 1;
    for (j = 0; j < RATE; j++)
        metricsum += (Branchtab[i + j * NUMSTATES / 2] ^ syms[s * RATE + j]);
    metric = (metricsum >> METRICSHIFT) >> PRECISIONSHIFT;

    unsigned char max = ((RATE * ((256 - 1) >> METRICSHIFT)) >> PRECISIONSHIFT);

    /* Candidate path metrics for the two successor states. */
    m0 = X[i] + metric;
    m1 = X[i + NUMSTATES / 2] + (max - metric);
    m2 = X[i] + (max - metric);
    m3 = X[i + NUMSTATES / 2] + metric;

    decision0 = (signed int)(m0 - m1) >= 0;
    decision1 = (signed int)(m2 - m3) >= 0;

    /* Keep the smaller candidate for each successor state. */
    Y[2 * i] = decision0 ? m1 : m0;
    Y[2 * i + 1] = decision1 ? m3 : m2;

    /* Pack the two decision bits into the stage's 64-bit decision word. */
    d->w[i / (sizeof(unsigned int) * 8 / 2) +
         s * (sizeof(decision_t) / sizeof(unsigned int))] |=
        (decision0 | decision1 << 1) << ((2 * i) & (sizeof(unsigned int) * 8 - 1));
}
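/*
 * Note on the butterfly above: states i and i + 32 of the 64-state (K = 7)
 * trellis both transition into states 2i and 2i + 1, one branch costing
 * 'metric' and the other 'max - metric'.  BFLY() keeps the cheaper candidate
 * for each successor (add-compare-select) and records which branch won as two
 * bits in the stage's decision word; traceback later follows those bits
 * backwards to recover the transmitted sequence.
 */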
#if LV_HAVE_AVX2

#include <immintrin.h>
#include <stdio.h>
static inline void volk_8u_x4_conv_k7_r2_8u_avx2(unsigned char* Y,
                                                 unsigned char* X,
                                                 unsigned char* syms,
                                                 unsigned char* dec,
                                                 unsigned int framebits,
                                                 unsigned int excess,
                                                 unsigned char* Branchtab)
{
    unsigned int i;
    for (i = 0; i < framebits + excess; i++) {
        unsigned char* tmp;
        unsigned int* dec_int = (unsigned int*)dec;
        __m256i a76, a78, a79, a82, a84, a85, a86, a88, a89, a90, d10, d9, m23, m24,
            m25, m26, s18, s19, s22, s23, t14, t15;
        // All 32 butterflies: branch metrics, candidate sums, survivor select.
        s18 = ((__m256i*)X)[0];
        s19 = ((__m256i*)X)[1];
        a76 = _mm256_set1_epi8(syms[2 * i]);
        a78 = ((__m256i*)Branchtab)[0];
        a79 = _mm256_xor_si256(a76, a78);
        a82 = _mm256_set1_epi8(syms[2 * i + 1]);
        a84 = ((__m256i*)Branchtab)[1];
        a85 = _mm256_xor_si256(a82, a84);
        a86 = _mm256_avg_epu8(a79, a85);
        a88 = _mm256_srli_epi16(a86, 2);
        t14 = _mm256_and_si256(a88, _mm256_set1_epi8(63));
        t15 = _mm256_subs_epu8(_mm256_set1_epi8(63), t14);
        m23 = _mm256_adds_epu8(s18, t14);
        m24 = _mm256_adds_epu8(s19, t15);
        m25 = _mm256_adds_epu8(s18, t15);
        m26 = _mm256_adds_epu8(s19, t14);
        a89 = _mm256_min_epu8(m24, m23);
        d9 = _mm256_cmpeq_epi8(a89, m24);
        a90 = _mm256_min_epu8(m26, m25);
        d10 = _mm256_cmpeq_epi8(a90, m26);
        s22 = _mm256_unpacklo_epi8(d9, d10);
        s23 = _mm256_unpackhi_epi8(d9, d10);
        dec_int[2 * i] = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x20));
        dec_int[2 * i + 1] =
            _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x31));
        s22 = _mm256_unpacklo_epi8(a89, a90);
        s23 = _mm256_unpackhi_epi8(a89, a90);
        ((__m256i*)Y)[0] = _mm256_permute2x128_si256(s22, s23, 0x20);
        ((__m256i*)Y)[1] = _mm256_permute2x128_si256(s22, s23, 0x31);
        // Renormalize: reduce to the global minimum metric, broadcast it,
        // and subtract it from all 64 states.
        __m256i m5, m6, m7;
        m5 = ((__m256i*)Y)[0];
        m5 = _mm256_min_epu8(m5, ((__m256i*)Y)[1]);
        m5 = _mm256_min_epu8(_mm256_permute2x128_si256(m5, m5, 0x21), m5);
        m7 = _mm256_min_epu8(_mm256_srli_si256(m5, 8), m5);
        m7 = _mm256_min_epu8(_mm256_srli_epi64(m7, 32), m7);
        m7 = _mm256_min_epu8(_mm256_srli_epi64(m7, 16), m7);
        m7 = _mm256_min_epu8(_mm256_srli_epi64(m7, 8), m7);
        m7 = _mm256_unpacklo_epi8(m7, m7);
        m7 = _mm256_shufflelo_epi16(m7, 0);
        m6 = _mm256_unpacklo_epi64(m7, m7);
        m6 = _mm256_permute2x128_si256(m6, m6, 0);
        ((__m256i*)Y)[0] = _mm256_subs_epu8(((__m256i*)Y)[0], m6);
        ((__m256i*)Y)[1] = _mm256_subs_epu8(((__m256i*)Y)[1], m6);

        // Swap pointers to the old and new metric buffers.
        tmp = X;
        X = Y;
        Y = tmp;
    }
}
#endif /* LV_HAVE_AVX2 */
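/*
 * In the AVX2 kernel above, one 256-bit register covers half of the 64
 * states, so a whole stage takes only two register loads.  The decision bits
 * come out almost for free: _mm256_cmpeq_epi8 leaves 0x00/0xFF per state and
 * _mm256_movemask_epi8 compresses those sign bits into a 32-bit word, while
 * the permute2x128 steps repair the lane-local interleaving produced by
 * unpacklo/unpackhi on AVX2.
 */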
#if LV_HAVE_SSE3

#include <emmintrin.h>
#include <mmintrin.h>
#include <pmmintrin.h>
#include <stdio.h>
#include <xmmintrin.h>
static inline void volk_8u_x4_conv_k7_r2_8u_spiral(unsigned char* Y,
                                                   unsigned char* X,
                                                   unsigned char* syms,
                                                   unsigned char* dec,
                                                   unsigned int framebits,
                                                   unsigned int excess,
                                                   unsigned char* Branchtab)
{
    unsigned int i;
    for (i = 0; i < framebits + excess; i++) {
        unsigned char* tmp;
        unsigned short* dec_short = (unsigned short*)dec;
        __m128i a100, a101, a103, a104, a105, a107, a108, a109, a76, a78, a79, a82,
            a84, a85, a86, a88, a89, a90, d10, d11, d12, d9, m23, m24, m25, m26, m27,
            m28, m29, m30, s18, s19, s24, s25, t14, t15, t17, t18;
        // First 16 butterflies.
        s18 = ((__m128i*)X)[0];
        s19 = ((__m128i*)X)[2];
        a76 = _mm_set1_epi8(syms[2 * i]);
        a78 = ((__m128i*)Branchtab)[0];
        a79 = _mm_xor_si128(a76, a78);
        a82 = _mm_set1_epi8(syms[2 * i + 1]);
        a84 = ((__m128i*)Branchtab)[2];
        a85 = _mm_xor_si128(a82, a84);
        a86 = _mm_avg_epu8(a79, a85);
        a88 = _mm_srli_epi16(a86, 2);
        t14 = _mm_and_si128(a88, _mm_set1_epi8(63));
        t15 = _mm_subs_epu8(_mm_set1_epi8(63), t14);
        m23 = _mm_adds_epu8(s18, t14);
        m24 = _mm_adds_epu8(s19, t15);
        m25 = _mm_adds_epu8(s18, t15);
        m26 = _mm_adds_epu8(s19, t14);
        a89 = _mm_min_epu8(m24, m23);
        d9 = _mm_cmpeq_epi8(a89, m24);
        a90 = _mm_min_epu8(m26, m25);
        d10 = _mm_cmpeq_epi8(a90, m26);
        dec_short[4 * i] = _mm_movemask_epi8(_mm_unpacklo_epi8(d9, d10));
        dec_short[4 * i + 1] = _mm_movemask_epi8(_mm_unpackhi_epi8(d9, d10));
        ((__m128i*)Y)[0] = _mm_unpacklo_epi8(a89, a90);
        ((__m128i*)Y)[1] = _mm_unpackhi_epi8(a89, a90);
        s24 = ((__m128i*)X)[1];
        s25 = ((__m128i*)X)[3];
        a100 = ((__m128i*)Branchtab)[1];
        a101 = _mm_xor_si128(a76, a100);
        a103 = ((__m128i*)Branchtab)[3];
        a104 = _mm_xor_si128(a82, a103);
        a105 = _mm_avg_epu8(a101, a104);
        a107 = _mm_srli_epi16(a105, 2);
        t17 = _mm_and_si128(a107, _mm_set1_epi8(63));
        t18 = _mm_subs_epu8(_mm_set1_epi8(63), t17);
        m27 = _mm_adds_epu8(s24, t17);
        m28 = _mm_adds_epu8(s25, t18);
        m29 = _mm_adds_epu8(s24, t18);
        m30 = _mm_adds_epu8(s25, t17);
        a108 = _mm_min_epu8(m28, m27);
        d11 = _mm_cmpeq_epi8(a108, m28);
        a109 = _mm_min_epu8(m30, m29);
        d12 = _mm_cmpeq_epi8(a109, m30);
        dec_short[4 * i + 2] = _mm_movemask_epi8(_mm_unpacklo_epi8(d11, d12));
        dec_short[4 * i + 3] = _mm_movemask_epi8(_mm_unpackhi_epi8(d11, d12));
        ((__m128i*)Y)[2] = _mm_unpacklo_epi8(a108, a109);
        ((__m128i*)Y)[3] = _mm_unpackhi_epi8(a108, a109);
        // Renormalize: reduce to the global minimum metric, broadcast it,
        // and subtract it from all 64 states.
        __m128i m5, m6, m7;
        m5 = ((__m128i*)Y)[0];
        m5 = _mm_min_epu8(m5, ((__m128i*)Y)[1]);
        m5 = _mm_min_epu8(m5, ((__m128i*)Y)[2]);
        m5 = _mm_min_epu8(m5, ((__m128i*)Y)[3]);
        m7 = _mm_min_epu8(_mm_srli_si128(m5, 8), m5);
        m7 = _mm_min_epu8(_mm_srli_epi64(m7, 32), m7);
        m7 = _mm_min_epu8(_mm_srli_epi64(m7, 16), m7);
        m7 = _mm_min_epu8(_mm_srli_epi64(m7, 8), m7);
        m7 = _mm_unpacklo_epi8(m7, m7);
        m7 = _mm_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0));
        m6 = _mm_unpacklo_epi64(m7, m7);
        ((__m128i*)Y)[0] = _mm_subs_epu8(((__m128i*)Y)[0], m6);
        ((__m128i*)Y)[1] = _mm_subs_epu8(((__m128i*)Y)[1], m6);
        ((__m128i*)Y)[2] = _mm_subs_epu8(((__m128i*)Y)[2], m6);
        ((__m128i*)Y)[3] = _mm_subs_epu8(((__m128i*)Y)[3], m6);

        // Swap pointers to the old and new metric buffers.
        tmp = X;
        X = Y;
        Y = tmp;
    }
}
#endif /* LV_HAVE_SSE3 */
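/*
 * The SSE3 kernel handles 16 states per register, so each stage runs the
 * identical butterfly block twice: once against ((__m128i*)Branchtab)[0]/[2]
 * for the first 16 butterflies and once against [1]/[3] for the rest,
 * emitting the decisions as four 16-bit movemask words per stage.
 */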
#if LV_HAVE_NEON

#include <arm_neon.h>

static inline void volk_8u_x4_conv_k7_r2_8u_neonspiral(unsigned char* Y,
                                                       unsigned char* X,
                                                       unsigned char* syms,
                                                       unsigned char* dec,
                                                       unsigned int framebits,
                                                       unsigned int excess,
                                                       unsigned char* Branchtab)
{
    unsigned int i;
    for (i = 0; i < framebits + excess; i++) {
        unsigned char* tmp;
        unsigned int* dec_int = (unsigned int*)dec;
        uint8x16_t a100, a101, a103, a104, a105, a108, a109, a76, a78, a79, a82, a84,
            a85, a86, a89, a90, d10, d11, d12, d9, m23, m24, m25, m26, m27, m28, m29,
            m30, s18, s19, s24, s25, t14, t15, t17, t18;
        uint16x8_t high_bits;
        uint32x4_t paired16;
        uint8x16_t paired32;
        uint8x8_t left, right;
        uint8x8x2_t both;
        // First 16 butterflies.
        s18 = ((uint8x16_t*)X)[0];
        s19 = ((uint8x16_t*)X)[2];
        a76 = vdupq_n_u8(syms[2 * i]);
        a78 = ((uint8x16_t*)Branchtab)[0];
        a79 = veorq_u8(a76, a78);
        a82 = vdupq_n_u8(syms[2 * i + 1]);
        a84 = ((uint8x16_t*)Branchtab)[2];
        a85 = veorq_u8(a82, a84);
        a86 = vrhaddq_u8(a79, a85);
        t14 = vshrq_n_u8(a86, 2);
        t15 = vqsubq_u8(vdupq_n_u8(63), t14);
        m23 = vqaddq_u8(s18, t14);
        m24 = vqaddq_u8(s19, t15);
        m25 = vqaddq_u8(s18, t15);
        m26 = vqaddq_u8(s19, t14);
        a89 = vminq_u8(m24, m23);
        d9 = vceqq_u8(a89, m24);
        a90 = vminq_u8(m26, m25);
        d10 = vceqq_u8(a90, m26);
        // Emulated movemask: fold the decision bytes down, then splice the
        // surviving lanes into the 32-bit decision word (see note below).
        high_bits = vreinterpretq_u16_u8(vshrq_n_u8(d9, 7));
        paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 6));
        paired32 = vreinterpretq_u8_u32(vsraq_n_u32(paired16, paired16, 12));
        dec_int[2 * i] = ((unsigned int)vgetq_lane_u8(paired32, 0) << 0) |
                         ((unsigned int)vgetq_lane_u8(paired32, 4) << 8) |
                         ((unsigned int)vgetq_lane_u8(paired32, 8) << 16) |
                         ((unsigned int)vgetq_lane_u8(paired32, 12) << 24);
        high_bits = vreinterpretq_u16_u8(vshrq_n_u8(d10, 7));
        paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 6));
        paired32 = vreinterpretq_u8_u32(vsraq_n_u32(paired16, paired16, 12));
        dec_int[2 * i] |= ((unsigned int)vgetq_lane_u8(paired32, 0) << 1) |
                          ((unsigned int)vgetq_lane_u8(paired32, 4) << 9) |
                          ((unsigned int)vgetq_lane_u8(paired32, 8) << 17) |
                          ((unsigned int)vgetq_lane_u8(paired32, 12) << 25);
        left = vget_low_u8(a89);
        right = vget_low_u8(a90);
        both = vzip_u8(left, right);
        ((uint8x16_t*)Y)[0] = vcombine_u8(both.val[0], both.val[1]);
        left = vget_high_u8(a89);
        right = vget_high_u8(a90);
        both = vzip_u8(left, right);
        ((uint8x16_t*)Y)[1] = vcombine_u8(both.val[0], both.val[1]);
        s24 = ((uint8x16_t*)X)[1];
        s25 = ((uint8x16_t*)X)[3];
        a100 = ((uint8x16_t*)Branchtab)[1];
        a101 = veorq_u8(a76, a100);
        a103 = ((uint8x16_t*)Branchtab)[3];
        a104 = veorq_u8(a82, a103);
        a105 = vrhaddq_u8(a101, a104);
        t17 = vshrq_n_u8(a105, 2);
        t18 = vqsubq_u8(vdupq_n_u8(63), t17);
        m27 = vqaddq_u8(s24, t17);
        m28 = vqaddq_u8(s25, t18);
        m29 = vqaddq_u8(s24, t18);
        m30 = vqaddq_u8(s25, t17);
        a108 = vminq_u8(m28, m27);
        d11 = vceqq_u8(a108, m28);
        a109 = vminq_u8(m30, m29);
        d12 = vceqq_u8(a109, m30);
        high_bits = vreinterpretq_u16_u8(vshrq_n_u8(d11, 7));
        paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 6));
        paired32 = vreinterpretq_u8_u32(vsraq_n_u32(paired16, paired16, 12));
        dec_int[2 * i + 1] = ((unsigned int)vgetq_lane_u8(paired32, 0) << 0) |
                             ((unsigned int)vgetq_lane_u8(paired32, 4) << 8) |
                             ((unsigned int)vgetq_lane_u8(paired32, 8) << 16) |
                             ((unsigned int)vgetq_lane_u8(paired32, 12) << 24);
        high_bits = vreinterpretq_u16_u8(vshrq_n_u8(d12, 7));
        paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 6));
        paired32 = vreinterpretq_u8_u32(vsraq_n_u32(paired16, paired16, 12));
        dec_int[2 * i + 1] |= ((unsigned int)vgetq_lane_u8(paired32, 0) << 1) |
                              ((unsigned int)vgetq_lane_u8(paired32, 4) << 9) |
                              ((unsigned int)vgetq_lane_u8(paired32, 8) << 17) |
                              ((unsigned int)vgetq_lane_u8(paired32, 12) << 25);
        left = vget_low_u8(a108);
        right = vget_low_u8(a109);
        both = vzip_u8(left, right);
        ((uint8x16_t*)Y)[2] = vcombine_u8(both.val[0], both.val[1]);
        left = vget_high_u8(a108);
        right = vget_high_u8(a109);
        both = vzip_u8(left, right);
        ((uint8x16_t*)Y)[3] = vcombine_u8(both.val[0], both.val[1]);
        // Renormalize: pairwise-min reduction to the global minimum metric,
        // broadcast it, and subtract it from all 64 states.
        uint8x16_t m5, m6;
        uint8x8_t m7;
        m5 = ((uint8x16_t*)Y)[0];
        m5 = vminq_u8(m5, ((uint8x16_t*)Y)[1]);
        m5 = vminq_u8(m5, ((uint8x16_t*)Y)[2]);
        m5 = vminq_u8(m5, ((uint8x16_t*)Y)[3]);
        m7 = vpmin_u8(vget_low_u8(m5), vget_high_u8(m5));
        m7 = vpmin_u8(m7, m7);
        m7 = vpmin_u8(m7, m7);
        m7 = vpmin_u8(m7, m7);
        m6 = vcombine_u8(m7, m7);
        ((uint8x16_t*)Y)[0] = vqsubq_u8(((uint8x16_t*)Y)[0], m6);
        ((uint8x16_t*)Y)[1] = vqsubq_u8(((uint8x16_t*)Y)[1], m6);
        ((uint8x16_t*)Y)[2] = vqsubq_u8(((uint8x16_t*)Y)[2], m6);
        ((uint8x16_t*)Y)[3] = vqsubq_u8(((uint8x16_t*)Y)[3], m6);

        // Swap pointers to the old and new metric buffers.
        tmp = X;
        X = Y;
        Y = tmp;
    }
}
#endif /* LV_HAVE_NEON */
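/*
 * NEON lacks a movemask instruction, so the kernel above emulates it with
 * shift-accumulate: vshrq_n_u8(d, 7) reduces each 0x00/0xFF decision byte to
 * 0/1, then vsraq_n_u16(x, x, 6) and vsraq_n_u32(x, x, 12) fold each group of
 * four flag bytes into the group's lowest byte at even bit positions, so that
 * lanes 0/4/8/12 can be spliced into one 32-bit decision word.  The d10/d12
 * pass lands one bit higher, interleaving the two decision streams exactly as
 * the unpacklo-based movemask does in the SSE3 kernel.
 */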
#if LV_HAVE_GENERIC

static inline void volk_8u_x4_conv_k7_r2_8u_generic(unsigned char* Y,
                                                    unsigned char* X,
                                                    unsigned char* syms,
                                                    unsigned char* dec,
                                                    unsigned int framebits,
                                                    unsigned int excess,
                                                    unsigned char* Branchtab)
{
    int nbits = framebits + excess;
    int NUMSTATES = 64;
    int s, i;

    for (s = 0; s < nbits; s++) {
        void* tmp;
        // One add-compare-select butterfly per pair of states.
        for (i = 0; i < NUMSTATES / 2; i++) {
            BFLY(i, s, syms, Y, X, (decision_t*)dec, Branchtab);
        }

        renormalize(Y);

        // Swap pointers to the old and new metric buffers.
        tmp = (void*)X;
        X = Y;
        Y = (unsigned char*)tmp;
    }
}
#endif /* LV_HAVE_GENERIC */
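/*
 * A minimal usage sketch for the generic kernel.  Buffer sizes follow from
 * the kernel's signature: 64 metric bytes per buffer, RATE symbol bytes and
 * one 8-byte decision_t per decoded bit.  Branchtab contents, the value of
 * 'excess', and the traceback pass are the caller's responsibility, and the
 * names and values here are illustrative only:
 *
 *   enum { NUMSTATES = 64, RATE = 2, K = 7 };
 *   unsigned int framebits = 1024;
 *   unsigned int excess = K - 1;                   // e.g. tail bits flushing the trellis
 *   unsigned char X[NUMSTATES] = { 0 };            // metrics entering each stage
 *   unsigned char Y[NUMSTATES];                    // metrics leaving each stage
 *   unsigned char branchtab[NUMSTATES / 2 * RATE]; // expected symbols per butterfly
 *   unsigned char* syms = ...;                     // RATE soft symbols per bit
 *   unsigned char* dec = calloc(framebits + excess, sizeof(decision_t));
 *   volk_8u_x4_conv_k7_r2_8u_generic(Y, X, syms, dec, framebits, excess, branchtab);
 */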
#endif /* INCLUDED_volk_8u_x4_conv_k7_r2_8u_H */