#ifndef VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_
#define VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_
#include <string.h>

static inline unsigned int log2_of_power_of_2(unsigned int val)
{
    static const unsigned int b[] = {
        0xAAAAAAAA, 0xCCCCCCCC, 0xF0F0F0F0, 0xFF00FF00, 0xFFFF0000
    };

    unsigned int res = (val & b[0]) != 0;
    res |= ((val & b[4]) != 0) << 4;
    res |= ((val & b[3]) != 0) << 3;
    res |= ((val & b[2]) != 0) << 2;
    res |= ((val & b[1]) != 0) << 1;
    return res;
}
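/*
 * Worked example: log2_of_power_of_2(16) = 4. Bit 4 of the input intersects
 * only b[2] = 0xF0F0F0F0, so res = 1 << 2 = 4. Each mask b[k] covers exactly
 * the bit positions whose index has bit k set, so every test recovers one
 * bit of the exponent. The result is only meaningful for power-of-2 inputs.
 */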
static inline void encodepolar_single_stage(unsigned char* frame_ptr,
                                            const unsigned char* temp_ptr,
                                            const unsigned int num_branches,
                                            const unsigned int frame_half)
{
    unsigned int branch, bit;
    for (branch = 0; branch < num_branches; ++branch) {
        for (bit = 0; bit < frame_half; ++bit) {
            *frame_ptr = *temp_ptr ^ *(temp_ptr + 1);
            *(frame_ptr + frame_half) = *(temp_ptr + 1);
            ++frame_ptr;
            temp_ptr += 2;
        }
        frame_ptr += frame_half;
    }
}
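/*
 * One encoding stage: each input pair (a, b) maps to the polar butterfly
 * (a ^ b, b). The XOR results fill the lower half of each branch and the
 * pass-through bits the upper half, e.g. the branch (1, 0, 1, 1) with
 * frame_half = 2 becomes (1^0, 1^1, 0, 1) = (1, 0, 0, 1).
 */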
#ifdef LV_HAVE_GENERIC

static inline void volk_8u_x2_encodeframepolar_8u_generic(unsigned char* frame,
                                                          unsigned char* temp,
                                                          unsigned int frame_size)
{
    unsigned int stage = log2_of_power_of_2(frame_size);
    unsigned int frame_half = frame_size >> 1;
    unsigned int num_branches = 1;

    while (stage) {
        // encode stage
        encodepolar_single_stage(frame, temp, num_branches, frame_half);
        memcpy(temp, frame, sizeof(unsigned char) * frame_size);

        // update all the parameters.
        num_branches = num_branches << 1;
        frame_half = frame_half >> 1;
        --stage;
    }
}
#endif /* LV_HAVE_GENERIC */
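/*
 * Usage sketch (assumptions drawn from the code above, not from VOLK
 * documentation): 'temp' holds the frame_size input bytes and is clobbered
 * during encoding, 'frame' receives the encoded frame, and frame_size must
 * be a power of two.
 *
 *   unsigned char frame[8];
 *   unsigned char temp[8] = { 1, 0, 1, 1, 0, 0, 1, 0 };  // example bits
 *   volk_8u_x2_encodeframepolar_8u_generic(frame, temp, 8);
 *   // frame now holds the 8-bit polar codeword
 */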
#ifdef LV_HAVE_SSSE3
#include <tmmintrin.h>

static inline void volk_8u_x2_encodeframepolar_8u_u_ssse3(unsigned char* frame,
                                                          unsigned char* temp,
                                                          unsigned int frame_size)
{
    if (frame_size < 16) {
        volk_8u_x2_encodeframepolar_8u_generic(frame, temp, frame_size);
        return;
    }

    const unsigned int po2 = log2_of_power_of_2(frame_size);

    unsigned int stage = po2;
    unsigned char* frame_ptr = frame;
    unsigned char* temp_ptr = temp;

    unsigned int frame_half = frame_size >> 1;
    unsigned int num_branches = 1;
    unsigned int branch;
    unsigned int bit;
    // prepare constants: mask_stage1 keeps every second byte
    const __m128i mask_stage1 = _mm_set_epi8(
        0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF,
        0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);

    __m128i r_frame0, r_temp0, shifted;

    {
        __m128i r_frame1, r_temp1;
        const __m128i shuffle_separate =
            _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
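        /*
         * In-register stage trick: _mm_srli_si128 slides each pair's second
         * byte under its first, mask_stage1 clears the odd positions, and the
         * XOR leaves a ^ b in the even bytes while the odd bytes still hold
         * b. shuffle_separate then gathers the even bytes into the low half
         * and the odd bytes into the high half, matching the scalar butterfly
         * in encodepolar_single_stage above.
         */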
        while (stage > 4) {
            frame_ptr = frame;
            temp_ptr = temp;

            // for stage = 5 a branch has 32 elements, so upper stages are bigger
            for (branch = 0; branch < num_branches; ++branch) {
                for (bit = 0; bit < frame_half; bit += 16) {
                    r_temp0 = _mm_loadu_si128((__m128i*)temp_ptr);
                    temp_ptr += 16;
                    r_temp1 = _mm_loadu_si128((__m128i*)temp_ptr);
                    temp_ptr += 16;

                    shifted = _mm_srli_si128(r_temp0, 1);
                    shifted = _mm_and_si128(shifted, mask_stage1);
                    r_temp0 = _mm_xor_si128(shifted, r_temp0);
                    r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate);

                    shifted = _mm_srli_si128(r_temp1, 1);
                    shifted = _mm_and_si128(shifted, mask_stage1);
                    r_temp1 = _mm_xor_si128(shifted, r_temp1);
                    r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate);

                    r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1);
                    _mm_storeu_si128((__m128i*)frame_ptr, r_frame0);

                    r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1);
                    _mm_storeu_si128((__m128i*)(frame_ptr + frame_half), r_frame1);
                    frame_ptr += 16;
                }
                frame_ptr += frame_half;
            }
            memcpy(temp, frame, sizeof(unsigned char) * frame_size);

            num_branches = num_branches << 1;
            frame_half = frame_half >> 1;
            --stage;
        }
    }
    // Last 4 stages: an entire 16-byte branch fits in one register.
    // reset pointers to the frame start for the in-register passes.
    frame_ptr = frame;
    temp_ptr = temp;

    const __m128i shuffle_stage4 =
        _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
    const __m128i mask_stage4 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
                                             0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
    const __m128i mask_stage3 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF,
                                             0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF);
    const __m128i mask_stage2 = _mm_set_epi8(0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF,
                                             0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF);
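    /*
     * shuffle_stage4 is the bit-reversal permutation on 16 byte indices
     * (0, 8, 4, 12, ...): output position i takes the input byte whose 4-bit
     * index is i reversed. Applying it once up front lets the remaining four
     * stages run as plain XOR cascades with byte shifts of 8, 4, 2 and 1.
     */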
    for (branch = 0; branch < num_branches; ++branch) {
        r_temp0 = _mm_loadu_si128((__m128i*)temp_ptr);

        // shuffle once for bit-reversal, then stages 4..1 as XOR cascades
        r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4);

        shifted = _mm_srli_si128(r_temp0, 8);
        shifted = _mm_and_si128(shifted, mask_stage4);
        r_frame0 = _mm_xor_si128(shifted, r_temp0);

        shifted = _mm_srli_si128(r_frame0, 4);
        shifted = _mm_and_si128(shifted, mask_stage3);
        r_frame0 = _mm_xor_si128(shifted, r_frame0);

        shifted = _mm_srli_si128(r_frame0, 2);
        shifted = _mm_and_si128(shifted, mask_stage2);
        r_frame0 = _mm_xor_si128(shifted, r_frame0);

        shifted = _mm_srli_si128(r_frame0, 1);
        shifted = _mm_and_si128(shifted, mask_stage1);
        r_frame0 = _mm_xor_si128(shifted, r_frame0);

        _mm_storeu_si128((__m128i*)frame_ptr, r_frame0);
        frame_ptr += 16;
        temp_ptr += 16;
    }
}
#endif /* LV_HAVE_SSSE3 */

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_8u_x2_encodeframepolar_8u_u_avx2(unsigned char* frame,
                                                         unsigned char* temp,
                                                         unsigned int frame_size)
{
    if (frame_size < 32) {
        volk_8u_x2_encodeframepolar_8u_generic(frame, temp, frame_size);
        return;
    }

    const unsigned int po2 = log2_of_power_of_2(frame_size);

    unsigned int stage = po2;
    unsigned char* frame_ptr = frame;
    unsigned char* temp_ptr = temp;

    unsigned int frame_half = frame_size >> 1;
    unsigned int num_branches = 1;
    unsigned int branch;
    unsigned int bit;
    // prepare constants: keep every second byte (pattern repeats per lane)
    const __m256i mask_stage1 = _mm256_set_epi8(
        0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF,
        0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF,
        0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF,
        0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);
    __m256i r_frame0, r_temp0, shifted;
    __m128i r_temp2, r_frame2, shifted2;

    {
        __m256i r_frame1, r_temp1;
        const __m256i shuffle_separate = _mm256_setr_epi8(
            0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
            0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
        const __m128i shuffle_separate128 =
            _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
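        /*
         * Both register widths are needed: once the outer loop reaches stage
         * 5, frame_half has shrunk to 16 bytes, so the final pass through the
         * bit loop always drops to the 128-bit path (r_temp2, r_frame2,
         * shifted2 and shuffle_separate128) instead of loading 32 bytes.
         */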
        while (stage > 4) {
            frame_ptr = frame;
            temp_ptr = temp;

            // for stage = 5 a branch has 32 elements, so upper stages are bigger
            for (branch = 0; branch < num_branches; ++branch) {
                for (bit = 0; bit < frame_half; bit += 32) {
                    if ((frame_half - bit) < 32) // only 16 bytes left, not 32
                    {
                        // 128-bit fallback: the u_ssse3 stage-1 body on
                        // r_temp2/r_frame2/shifted2 with shuffle_separate128
                        // (elided in this listing)
                        continue;
                    }
                    r_temp0 = _mm256_loadu_si256((__m256i*)temp_ptr);
                    temp_ptr += 32;
                    r_temp1 = _mm256_loadu_si256((__m256i*)temp_ptr);
                    temp_ptr += 32;

                    shifted = _mm256_srli_si256(r_temp0, 1);
                    shifted = _mm256_and_si256(shifted, mask_stage1);
                    r_temp0 = _mm256_xor_si256(shifted, r_temp0);
                    r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate);

                    shifted = _mm256_srli_si256(r_temp1, 1);
                    shifted = _mm256_and_si256(shifted, mask_stage1);
                    r_temp1 = _mm256_xor_si256(shifted, r_temp1);
                    r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate);

                    r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1);
                    r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1);
                    r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8);
                    r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8);

                    _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0);
                    _mm256_storeu_si256((__m256i*)(frame_ptr + frame_half), r_frame1);
                    frame_ptr += 32;
                }
                frame_ptr += frame_half;
            }
            memcpy(temp, frame, sizeof(unsigned char) * frame_size);

            num_branches = num_branches << 1;
            frame_half = frame_half >> 1;
            --stage;
        }
    }
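    /*
     * Note on the 256-bit path above: _mm256_shuffle_epi8 and the 64-bit
     * unpacks operate per 128-bit lane, so after unpacklo/unpackhi the XOR
     * and pass-through quarters come out interleaved across lanes.
     * _mm256_permute4x64_epi64(x, 0xd8) selects the qwords in order
     * (0, 2, 1, 3), which puts each 32-byte output half back in sequence
     * before the two stores.
     */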
    // reset pointers for the final in-register passes.
    frame_ptr = frame;
    temp_ptr = temp;

    // Constants for the final 4 stages; each pattern repeats per 128-bit lane.
    const __m256i shuffle_stage4 = _mm256_setr_epi8(
        0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
        0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
    const __m256i mask_stage4 = _mm256_set_epi8(
        0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
        0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
    const __m256i mask_stage3 = _mm256_set_epi8(
        0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF,
        0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF,
        0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF,
        0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF);
    const __m256i mask_stage2 = _mm256_set_epi8(
        0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF,
        0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF,
        0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF,
        0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF);
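    /*
     * The final four stages stay entirely inside one register: after the
     * bit-reversal shuffle, stage k XORs each byte with the byte 2^(k-1)
     * positions above it, implemented as shift-right by 8/4/2/1 bytes, mask,
     * XOR. Since _mm256_srli_si256 shifts within each 128-bit lane, each lane
     * holds one 16-byte branch and a single 256-bit register finishes two
     * branches per iteration, hence num_branches / 2 below.
     */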
    for (branch = 0; branch < num_branches / 2; ++branch) {
        r_temp0 = _mm256_loadu_si256((__m256i*)temp_ptr);
        temp_ptr += 32;

        // shuffle once for bit-reversal.
        r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4);

        shifted = _mm256_srli_si256(r_temp0, 8);
        shifted = _mm256_and_si256(shifted, mask_stage4);
        r_frame0 = _mm256_xor_si256(shifted, r_temp0);

        shifted = _mm256_srli_si256(r_frame0, 4);
        shifted = _mm256_and_si256(shifted, mask_stage3);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);

        shifted = _mm256_srli_si256(r_frame0, 2);
        shifted = _mm256_and_si256(shifted, mask_stage2);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);

        shifted = _mm256_srli_si256(r_frame0, 1);
        shifted = _mm256_and_si256(shifted, mask_stage1);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);

        // store result of chunk.
        _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0);
        frame_ptr += 32;
    }
}
#endif /* LV_HAVE_AVX2 */
#endif /* VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_ */
#ifndef VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_
#define VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_
#include <string.h>
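/*
 * The _a_ kernels below mirror the unaligned _u_ kernels above; the only
 * difference is that 'frame' and 'temp' must satisfy SSE/AVX alignment
 * (16/32 bytes, e.g. buffers from volk_malloc) so that the aligned
 * _mm_load_si128 / _mm256_load_si256 and the matching stores can be used.
 */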
#ifdef LV_HAVE_SSSE3
#include <tmmintrin.h>

static inline void volk_8u_x2_encodeframepolar_8u_a_ssse3(unsigned char* frame,
                                                          unsigned char* temp,
                                                          unsigned int frame_size)
{
    if (frame_size < 16) {
        volk_8u_x2_encodeframepolar_8u_generic(frame, temp, frame_size);
        return;
    }

    const unsigned int po2 = log2_of_power_of_2(frame_size);

    unsigned int stage = po2;
    unsigned char* frame_ptr = frame;
    unsigned char* temp_ptr = temp;

    unsigned int frame_half = frame_size >> 1;
    unsigned int num_branches = 1;
    unsigned int branch;
    unsigned int bit;
    // prepare constants: mask_stage1 keeps every second byte
    const __m128i mask_stage1 = _mm_set_epi8(
        0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF,
        0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);

    __m128i r_frame0, r_temp0, shifted;

    {
        __m128i r_frame1, r_temp1;
        const __m128i shuffle_separate =
            _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
        while (stage > 4) {
            frame_ptr = frame;
            temp_ptr = temp;

            for (branch = 0; branch < num_branches; ++branch) {
                for (bit = 0; bit < frame_half; bit += 16) {
                    // same per-16-byte body as the unaligned u_ssse3 kernel
                    // (loads into r_temp0/r_temp1, stage-1 shift/mask/xor,
                    // shuffle_separate, unpack into r_frame0/r_frame1), but
                    // with aligned _mm_load_si128/_mm_store_si128
                    // (elided in this listing)
                }
                frame_ptr += frame_half;
            }
            memcpy(temp, frame, sizeof(unsigned char) * frame_size);

            num_branches = num_branches << 1;
            frame_half = frame_half >> 1;
            --stage;
        }
    }
    // Last 4 stages in-register, as in the unaligned kernel above.
    frame_ptr = frame;
    temp_ptr = temp;

    const __m128i shuffle_stage4 =
        _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
    const __m128i mask_stage4 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
                                             0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
    const __m128i mask_stage3 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF,
                                             0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF);
    const __m128i mask_stage2 = _mm_set_epi8(0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF,
                                             0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF);

    for (branch = 0; branch < num_branches; ++branch) {
        // bit-reversal shuffle plus the srli-by-8/4/2/1 mask-and-XOR cascade,
        // with aligned load/store (elided in this listing)
        frame_ptr += 16;
        temp_ptr += 16;
    }
}
#endif /* LV_HAVE_SSSE3 */

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_8u_x2_encodeframepolar_8u_a_avx2(unsigned char* frame,
                                                         unsigned char* temp,
                                                         unsigned int frame_size)
{
    if (frame_size < 32) {
        volk_8u_x2_encodeframepolar_8u_generic(frame, temp, frame_size);
        return;
    }

    const unsigned int po2 = log2_of_power_of_2(frame_size);

    unsigned int stage = po2;
    unsigned char* frame_ptr = frame;
    unsigned char* temp_ptr = temp;

    unsigned int frame_half = frame_size >> 1;
    unsigned int num_branches = 1;
    unsigned int branch;
    unsigned int bit;
    // prepare constants: keep every second byte (pattern repeats per lane)
    const __m256i mask_stage1 = _mm256_set_epi8(
        0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF,
        0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF,
        0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF,
        0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);
    __m256i r_frame0, r_temp0, shifted;
    __m128i r_temp2, r_frame2, shifted2;

    {
        __m256i r_frame1, r_temp1;
        const __m256i shuffle_separate = _mm256_setr_epi8(
            0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
            0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
        const __m128i shuffle_separate128 =
            _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
        while (stage > 4) {
            frame_ptr = frame;
            temp_ptr = temp;

            for (branch = 0; branch < num_branches; ++branch) {
                for (bit = 0; bit < frame_half; bit += 32) {
                    if ((frame_half - bit) < 32) // only 16 bytes left, not 32
                    {
                        // 128-bit fallback as in the unaligned kernel
                        // (elided in this listing)
                        continue;
                    }
                    r_temp0 = _mm256_load_si256((__m256i*)temp_ptr);
                    temp_ptr += 32;
                    r_temp1 = _mm256_load_si256((__m256i*)temp_ptr);
                    temp_ptr += 32;

                    shifted = _mm256_srli_si256(r_temp0, 1);
                    shifted = _mm256_and_si256(shifted, mask_stage1);
                    r_temp0 = _mm256_xor_si256(shifted, r_temp0);
                    r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate);

                    shifted = _mm256_srli_si256(r_temp1, 1);
                    shifted = _mm256_and_si256(shifted, mask_stage1);
                    r_temp1 = _mm256_xor_si256(shifted, r_temp1);
                    r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate);

                    r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1);
                    r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1);
                    r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8);
                    r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8);

                    _mm256_store_si256((__m256i*)frame_ptr, r_frame0);
                    _mm256_store_si256((__m256i*)(frame_ptr + frame_half), r_frame1);
                    frame_ptr += 32;
                }
                frame_ptr += frame_half;
            }
            memcpy(temp, frame, sizeof(unsigned char) * frame_size);

            num_branches = num_branches << 1;
            frame_half = frame_half >> 1;
            --stage;
        }
    }
    // reset pointers for the final in-register passes.
    frame_ptr = frame;
    temp_ptr = temp;

    // Constants for the final 4 stages (per 128-bit lane), as in the
    // unaligned kernel.
    const __m256i shuffle_stage4 = _mm256_setr_epi8(
        0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
        0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
    const __m256i mask_stage4 = _mm256_set_epi8(
        0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
        0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
    const __m256i mask_stage3 = _mm256_set_epi8(
        0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF,
        0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF,
        0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF,
        0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF);
    const __m256i mask_stage2 = _mm256_set_epi8(
        0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF,
        0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF,
        0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF,
        0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF);
    for (branch = 0; branch < num_branches / 2; ++branch) {
        r_temp0 = _mm256_load_si256((__m256i*)temp_ptr);
        temp_ptr += 32;

        // shuffle once for bit-reversal.
        r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4);

        shifted = _mm256_srli_si256(r_temp0, 8);
        shifted = _mm256_and_si256(shifted, mask_stage4);
        r_frame0 = _mm256_xor_si256(shifted, r_temp0);

        shifted = _mm256_srli_si256(r_frame0, 4);
        shifted = _mm256_and_si256(shifted, mask_stage3);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);

        shifted = _mm256_srli_si256(r_frame0, 2);
        shifted = _mm256_and_si256(shifted, mask_stage2);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);

        shifted = _mm256_srli_si256(r_frame0, 1);
        shifted = _mm256_and_si256(shifted, mask_stage1);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);

        // store result of chunk.
        _mm256_store_si256((__m256i*)frame_ptr, r_frame0);
        frame_ptr += 32;
    }
}
#endif /* LV_HAVE_AVX2 */
#endif /* VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_ */