Vector Optimized Library of Kernels 3.1.1
Architecture-tuned implementations of math kernels
 
sse2neon.h
1#ifndef SSE2NEON_H
2#define SSE2NEON_H
3
4// This header file provides a simple API translation layer
5// between SSE intrinsics and their corresponding Arm/AArch64 NEON versions
6//
7// Contributors to this work are:
8// John W. Ratcliff <jratcliffscarab@gmail.com>
9// Brandon Rowlett <browlett@nvidia.com>
10// Ken Fast <kfast@gdeb.com>
11// Eric van Beurden <evanbeurden@nvidia.com>
12// Alexander Potylitsin <apotylitsin@nvidia.com>
13// Hasindu Gamaarachchi <hasindu2008@gmail.com>
14// Jim Huang <jserv@ccns.ncku.edu.tw>
15// Mark Cheng <marktwtn@gmail.com>
16// Malcolm James MacLeod <malcolm@gulden.com>
17// Devin Hussey (easyaspi314) <husseydevin@gmail.com>
18// Sebastian Pop <spop@amazon.com>
19// Developer Ecosystem Engineering <DeveloperEcosystemEngineering@apple.com>
20// Danila Kutenin <danilak@google.com>
21// François Turban (JishinMaster) <francois.turban@gmail.com>
22// Pei-Hsuan Hung <afcidk@gmail.com>
23// Yang-Hao Yuan <yuanyanghau@gmail.com>
24// Syoyo Fujita <syoyo@lighttransport.com>
25// Brecht Van Lommel <brecht@blender.org>
26// Jonathan Hue <jhue@adobe.com>
27// Cuda Chen <clh960524@gmail.com>
28// Aymen Qader <aymen.qader@arm.com>
29// Anthony Roberts <anthony.roberts@linaro.org>
30
31/*
32 * sse2neon is freely redistributable under the MIT License.
33 *
34 * Permission is hereby granted, free of charge, to any person obtaining a copy
35 * of this software and associated documentation files (the "Software"), to deal
36 * in the Software without restriction, including without limitation the rights
37 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
38 * copies of the Software, and to permit persons to whom the Software is
39 * furnished to do so, subject to the following conditions:
40 *
41 * The above copyright notice and this permission notice shall be included in
42 * all copies or substantial portions of the Software.
43 *
44 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
45 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
46 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
47 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
48 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
49 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
50 * SOFTWARE.
51 */
52
53/* Tunable configurations */
54
55/* Enable precise implementations of math operations.
56 * This slows the computation down a bit, but gives results consistent with
57 * x86 SSE (e.g. it can fix a hole or NaN pixel in a rendering result).
58 */
59/* _mm_min|max_ps|ss|pd|sd */
60#ifndef SSE2NEON_PRECISE_MINMAX
61#define SSE2NEON_PRECISE_MINMAX (0)
62#endif
63/* _mm_rcp_ps and _mm_div_ps */
64#ifndef SSE2NEON_PRECISE_DIV
65#define SSE2NEON_PRECISE_DIV (0)
66#endif
67/* _mm_sqrt_ps and _mm_rsqrt_ps */
68#ifndef SSE2NEON_PRECISE_SQRT
69#define SSE2NEON_PRECISE_SQRT (0)
70#endif
71/* _mm_dp_pd */
72#ifndef SSE2NEON_PRECISE_DP
73#define SSE2NEON_PRECISE_DP (0)
74#endif
75
76/* Enable inclusion of windows.h on MSVC platforms.
77 * This makes _mm_clflush functional on Windows, as there is no builtin.
78 */
79#ifndef SSE2NEON_INCLUDE_WINDOWS_H
80#define SSE2NEON_INCLUDE_WINDOWS_H (0)
81#endif
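/* Illustrative usage sketch (not part of the upstream header): the tunables
 * above are guarded with #ifndef, so they are meant to be defined before this
 * header is included, e.g. in a translation unit that needs bit-exact
 * agreement with x86 SSE results:
 *
 *   #define SSE2NEON_PRECISE_MINMAX 1
 *   #define SSE2NEON_PRECISE_DIV 1
 *   #define SSE2NEON_PRECISE_SQRT 1
 *   #include "sse2neon.h"
 *
 * Leaving them at 0 keeps the faster, less precise NEON code paths.
 */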
82
83/* compiler specific definitions */
84#if defined(__GNUC__) || defined(__clang__)
85#pragma push_macro("FORCE_INLINE")
86#pragma push_macro("ALIGN_STRUCT")
87#define FORCE_INLINE static inline __attribute__((always_inline))
88#define ALIGN_STRUCT(x) __attribute__((aligned(x)))
89#define _sse2neon_likely(x) __builtin_expect(!!(x), 1)
90#define _sse2neon_unlikely(x) __builtin_expect(!!(x), 0)
91#elif defined(_MSC_VER)
92#if _MSVC_TRADITIONAL
93#error Using the traditional MSVC preprocessor is not supported! Use /Zc:preprocessor instead.
94#endif
95#ifndef FORCE_INLINE
96#define FORCE_INLINE static inline
97#endif
98#ifndef ALIGN_STRUCT
99#define ALIGN_STRUCT(x) __declspec(align(x))
100#endif
101#define _sse2neon_likely(x) (x)
102#define _sse2neon_unlikely(x) (x)
103#else
104#pragma message("Macro name collisions may happen with unsupported compilers.")
105#endif
106
107/* C language does not allow initializing a variable with a function call. */
108#ifdef __cplusplus
109#define _sse2neon_const static const
110#else
111#define _sse2neon_const const
112#endif
113
114#include <stdint.h>
115#include <stdlib.h>
116
117#if defined(_WIN32)
118/* Definitions for _mm_{malloc,free} are provided by <malloc.h>
119 * from both MinGW-w64 and MSVC.
120 */
121#define SSE2NEON_ALLOC_DEFINED
122#endif
123
124/* If using MSVC */
125#ifdef _MSC_VER
126#include <intrin.h>
127#if SSE2NEON_INCLUDE_WINDOWS_H
128#include <processthreadsapi.h>
129#include <windows.h>
130#endif
131
132#if !defined(__cplusplus)
133#error sse2neon only supports C++ compilation with this compiler
134#endif
135
136#ifdef SSE2NEON_ALLOC_DEFINED
137#include <malloc.h>
138#endif
139
140#if (defined(_M_AMD64) || defined(__x86_64__)) || \
141 (defined(_M_ARM64) || defined(__arm64__))
142#define SSE2NEON_HAS_BITSCAN64
143#endif
144#endif
145
146#if defined(__GNUC__) || defined(__clang__)
147#define _sse2neon_define0(type, s, body) \
148 __extension__({ \
149 type _a = (s); \
150 body \
151 })
152#define _sse2neon_define1(type, s, body) \
153 __extension__({ \
154 type _a = (s); \
155 body \
156 })
157#define _sse2neon_define2(type, a, b, body) \
158 __extension__({ \
159 type _a = (a), _b = (b); \
160 body \
161 })
162#define _sse2neon_return(ret) (ret)
163#else
164#define _sse2neon_define0(type, a, body) [=](type _a) { body }(a)
165#define _sse2neon_define1(type, a, body) [](type _a) { body }(a)
166#define _sse2neon_define2(type, a, b, body) \
167 [](type _a, type _b) { body }((a), (b))
168#define _sse2neon_return(ret) return ret
169#endif
170
171#define _sse2neon_init(...) \
172 { \
173 __VA_ARGS__ \
174 }
175
176/* Compiler barrier */
177#if defined(_MSC_VER)
178#define SSE2NEON_BARRIER() _ReadWriteBarrier()
179#else
180#define SSE2NEON_BARRIER() \
181 do { \
182 __asm__ __volatile__("" ::: "memory"); \
183 (void) 0; \
184 } while (0)
185#endif
186
187/* Memory barriers
188 * __atomic_thread_fence does not include a compiler barrier; instead,
189 * the barrier is part of __atomic_load/__atomic_store's "volatile-like"
190 * semantics.
191 */
192#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
193#include <stdatomic.h>
194#endif
195
196FORCE_INLINE void _sse2neon_smp_mb(void)
197{
199#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && \
200 !defined(__STDC_NO_ATOMICS__)
201 atomic_thread_fence(memory_order_seq_cst);
202#elif defined(__GNUC__) || defined(__clang__)
203 __atomic_thread_fence(__ATOMIC_SEQ_CST);
204#else /* MSVC */
205 __dmb(_ARM64_BARRIER_ISH);
206#endif
207}
208
209/* Architecture-specific build options */
210/* FIXME: #pragma GCC push_options is only available on GCC */
211#if defined(__GNUC__)
212#if defined(__arm__) && __ARM_ARCH == 7
213/* According to the ARM C Language Extensions (ACLE) specification,
214 * __ARM_NEON is defined to a value indicating that the Advanced SIMD (NEON)
215 * architecture is supported.
216 */
217#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
218#error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON."
219#endif
220#if !defined(__clang__)
221#pragma GCC push_options
222#pragma GCC target("fpu=neon")
223#endif
224#elif defined(__aarch64__) || defined(_M_ARM64)
225#if !defined(__clang__) && !defined(_MSC_VER)
226#pragma GCC push_options
227#pragma GCC target("+simd")
228#endif
229#elif __ARM_ARCH == 8
230#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
231#error \
232 "You must enable NEON instructions (e.g. -mfpu=neon-fp-armv8) to use SSE2NEON."
233#endif
234#if !defined(__clang__) && !defined(_MSC_VER)
235#pragma GCC push_options
236#endif
237#else
238#error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A."
239#endif
240#endif
241
242#include <arm_neon.h>
243#if (!defined(__aarch64__) && !defined(_M_ARM64)) && (__ARM_ARCH == 8)
244#if defined __has_include && __has_include(<arm_acle.h>)
245#include <arm_acle.h>
246#endif
247#endif
248
249/* Apple Silicon cache lines are double the size of what is commonly used by
250 * Intel, AMD and other Arm microarchitectures.
251 * From sysctl -a on an Apple M1:
252 * hw.cachelinesize: 128
253 */
254#if defined(__APPLE__) && (defined(__aarch64__) || defined(__arm64__))
255#define SSE2NEON_CACHELINE_SIZE 128
256#else
257#define SSE2NEON_CACHELINE_SIZE 64
258#endif
259
260/* Rounding functions require either AArch64 instructions or a libm fallback */
261#if !defined(__aarch64__) && !defined(_M_ARM64)
262#include <math.h>
263#endif
264
265/* On ARMv7, some registers, such as PMUSERENR and PMCCNTR, are read-only
266 * or not accessible at all in user mode.
267 * To write to or read these registers from user mode,
268 * we have to use a syscall instead.
269 */
270#if (!defined(__aarch64__) && !defined(_M_ARM64))
271#include <sys/time.h>
272#endif
273
274/* "__has_builtin" can be used to query support for built-in functions
275 * provided by gcc/clang and other compilers that support it.
276 */
277#ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */
278/* Compatibility with gcc <= 9 */
279#if defined(__GNUC__) && (__GNUC__ <= 9)
280#define __has_builtin(x) HAS##x
281#define HAS__builtin_popcount 1
282#define HAS__builtin_popcountll 1
283
284// __builtin_shuffle introduced in GCC 4.7.0
285#if (__GNUC__ >= 5) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 7))
286#define HAS__builtin_shuffle 1
287#else
288#define HAS__builtin_shuffle 0
289#endif
290
291#define HAS__builtin_shufflevector 0
292#define HAS__builtin_nontemporal_store 0
293#else
294#define __has_builtin(x) 0
295#endif
296#endif
297
306#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
307 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
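/* Illustrative sketch (not part of the upstream header): _MM_SHUFFLE packs
 * four 2-bit lane selectors into one 8-bit immediate, highest selector first.
 * For example:
 *
 *   _MM_SHUFFLE(3, 2, 1, 0) == 0xE4   // identity: dst[i] = src[i]
 *   _MM_SHUFFLE(0, 1, 2, 3) == 0x1B   // reverse the four 32-bit lanes
 *
 * The immediate is later consumed by shuffles such as _mm_shuffle_ps and
 * _mm_shuffle_epi32, which read two bits per destination lane.
 */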
308
309#if __has_builtin(__builtin_shufflevector)
310#define _sse2neon_shuffle(type, a, b, ...) \
311 __builtin_shufflevector(a, b, __VA_ARGS__)
312#elif __has_builtin(__builtin_shuffle)
313#define _sse2neon_shuffle(type, a, b, ...) \
314 __extension__({ \
315 type tmp = {__VA_ARGS__}; \
316 __builtin_shuffle(a, b, tmp); \
317 })
318#endif
319
320#ifdef _sse2neon_shuffle
321#define vshuffle_s16(a, b, ...) _sse2neon_shuffle(int16x4_t, a, b, __VA_ARGS__)
322#define vshuffleq_s16(a, b, ...) _sse2neon_shuffle(int16x8_t, a, b, __VA_ARGS__)
323#define vshuffle_s32(a, b, ...) _sse2neon_shuffle(int32x2_t, a, b, __VA_ARGS__)
324#define vshuffleq_s32(a, b, ...) _sse2neon_shuffle(int32x4_t, a, b, __VA_ARGS__)
325#define vshuffle_s64(a, b, ...) _sse2neon_shuffle(int64x1_t, a, b, __VA_ARGS__)
326#define vshuffleq_s64(a, b, ...) _sse2neon_shuffle(int64x2_t, a, b, __VA_ARGS__)
327#endif
328
329/* Rounding mode macros. */
330#define _MM_FROUND_TO_NEAREST_INT 0x00
331#define _MM_FROUND_TO_NEG_INF 0x01
332#define _MM_FROUND_TO_POS_INF 0x02
333#define _MM_FROUND_TO_ZERO 0x03
334#define _MM_FROUND_CUR_DIRECTION 0x04
335#define _MM_FROUND_NO_EXC 0x08
336#define _MM_FROUND_RAISE_EXC 0x00
337#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
338#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
339#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
340#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
341#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
342#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)
343#define _MM_ROUND_NEAREST 0x0000
344#define _MM_ROUND_DOWN 0x2000
345#define _MM_ROUND_UP 0x4000
346#define _MM_ROUND_TOWARD_ZERO 0x6000
347/* Flush zero mode macros. */
348#define _MM_FLUSH_ZERO_MASK 0x8000
349#define _MM_FLUSH_ZERO_ON 0x8000
350#define _MM_FLUSH_ZERO_OFF 0x0000
351/* Denormals are zeros mode macros. */
352#define _MM_DENORMALS_ZERO_MASK 0x0040
353#define _MM_DENORMALS_ZERO_ON 0x0040
354#define _MM_DENORMALS_ZERO_OFF 0x0000
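/* Illustrative usage sketch (not part of the upstream header): these macro
 * values follow the x86 MXCSR conventions and are consumed by the rounding
 * and FP-environment helpers implemented further down in this header, e.g.:
 *
 *   __m128 v = _mm_set_ps(2.5f, -1.5f, 0.5f, 1.4f);
 *   __m128 n = _mm_round_ps(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
 *   _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);  // flush denormal results
 *
 * Exact denormal/flush-to-zero behaviour depends on the FPCR of the target
 * core, so treat this as a sketch rather than a guarantee.
 */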
355
356/* indicate immediate constant argument in a given range */
357#define __constrange(a, b) const
358
359/* A few intrinsics accept traditional data types like ints or floats, but
360 * most operate on data types that are specific to SSE.
361 * If a vector type ends in d, it contains doubles, and if it does not have
362 * a suffix, it contains floats. An integer vector type can contain any type
363 * of integer, from chars to shorts to unsigned long longs.
364 */
365typedef int64x1_t __m64;
366typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */
367// On the 32-bit Arm architecture, float64x2_t is not supported.
368// The data type __m128d therefore has to be represented differently for the
369// related intrinsic conversions.
370#if defined(__aarch64__) || defined(_M_ARM64)
371typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */
372#else
373typedef float32x4_t __m128d;
374#endif
375typedef int64x2_t __m128i; /* 128-bit vector containing integers */
376
377// __int64 is defined in the Intel Intrinsics Guide and maps to a different
378// data type depending on the data model.
379#if !(defined(_WIN32) || defined(_WIN64) || defined(__int64))
380#if (defined(__x86_64__) || defined(__i386__))
381#define __int64 long long
382#else
383#define __int64 int64_t
384#endif
385#endif
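/* Illustrative sketch (not part of the upstream header): because __m128 is a
 * typedef of float32x4_t (and __m128i of int64x2_t), translated SSE code and
 * hand-written NEON code can share values, e.g.:
 *
 *   __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
 *   float32x4_t doubled = vaddq_f32(a, a);       // plain NEON on an __m128
 *   __m128 b = vreinterpretq_m128_f32(doubled);  // back to the SSE-style type
 *
 * The reinterpret helpers defined below exist so that such conversions stay
 * explicit and type-safe.
 */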
386
387/* type-safe casting between types */
388
389#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x)
390#define vreinterpretq_m128_f32(x) (x)
391#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x)
392
393#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x)
394#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x)
395#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x)
396#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x)
397
398#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x)
399#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x)
400#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x)
401#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x)
402
403#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x)
404#define vreinterpretq_f32_m128(x) (x)
405#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x)
406
407#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x)
408#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x)
409#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x)
410#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x)
411
412#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x)
413#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x)
414#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x)
415#define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x)
416
417#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x)
418#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x)
419#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
420#define vreinterpretq_m128i_s64(x) (x)
421
422#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)
423#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x)
424#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)
425#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x)
426
427#define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x)
428#define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x)
429
430#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x)
431#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x)
432#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x)
433#define vreinterpretq_s64_m128i(x) (x)
434
435#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x)
436#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x)
437#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x)
438#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x)
439
440#define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x)
441#define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x)
442#define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x)
443#define vreinterpret_m64_s64(x) (x)
444
445#define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x)
446#define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x)
447#define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x)
448#define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x)
449
450#define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x)
451#define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x)
452#define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x)
453
454#define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x)
455#define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x)
456#define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x)
457#define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x)
458
459#define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x)
460#define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x)
461#define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x)
462#define vreinterpret_s64_m64(x) (x)
463
464#define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x)
465
466#if defined(__aarch64__) || defined(_M_ARM64)
467#define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x)
468#define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x)
469
470#define vreinterpretq_m128d_u64(x) vreinterpretq_f64_u64(x)
471
472#define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x)
473#define vreinterpretq_m128d_f64(x) (x)
474
475#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x)
476
477#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f64(x)
478#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x)
479
480#define vreinterpretq_f64_m128d(x) (x)
481#define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x)
482#else
483#define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x)
484#define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x)
485
486#define vreinterpretq_m128d_u32(x) vreinterpretq_f32_u32(x)
487#define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x)
488
489#define vreinterpretq_m128d_f32(x) (x)
490
491#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x)
492
493#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f32(x)
494#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x)
495
496#define vreinterpretq_f32_m128d(x) (x)
497#endif
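/* Illustrative sketch (not part of the upstream header): the reinterpret
 * macros above let each emulated intrinsic pick the NEON element type it
 * needs and then return the generic __m128/__m128i type. A hypothetical
 * 32-bit integer add written in the same style would look like:
 *
 *   FORCE_INLINE __m128i my_add_epi32(__m128i a, __m128i b)
 *   {
 *       return vreinterpretq_m128i_s32(vaddq_s32(vreinterpretq_s32_m128i(a),
 *                                                vreinterpretq_s32_m128i(b)));
 *   }
 *
 * (my_add_epi32 is a made-up name for illustration; the real _mm_add_epi32
 * provided later by this header follows the same pattern.)
 */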
498
499// A struct is defined in this header file called 'SIMDVec' which can be used
500// by applications which attempt to access the contents of an __m128 struct
501// directly. It is important to note that accessing the __m128 struct directly
502// is considered bad coding practice by Microsoft: @see:
503// https://learn.microsoft.com/en-us/cpp/cpp/m128
504//
505// However, some legacy source code may try to access the contents of an __m128
506// struct directly so the developer can use the SIMDVec as an alias for it. Any
507// casting must be done manually by the developer, as you cannot cast or
508// otherwise alias the base NEON data type for intrinsic operations.
509//
510// The union is intended to allow direct access to an __m128 variable using the names
511// that the MSVC compiler provides. This union should really only be used when
512// trying to access the members of the vector as integer values. GCC/clang
513// allow native access to the float members through a simple array access
514// operator (in C since 4.6, in C++ since 4.8).
515//
516// Ideally, direct accesses to SIMD vectors should be avoided since they can cause
517// a performance hit. If it really is needed however, the original __m128
518// variable can be aliased with a pointer to this union and used to access
519// individual components. The use of this union should be hidden behind a macro
520// that is used throughout the codebase to access the members instead of always
521// declaring this type of variable.
522typedef union ALIGN_STRUCT(16) SIMDVec {
523 float m128_f32[4]; // as floats - DON'T USE. Added for convenience.
524 int8_t m128_i8[16]; // as signed 8-bit integers.
525 int16_t m128_i16[8]; // as signed 16-bit integers.
526 int32_t m128_i32[4]; // as signed 32-bit integers.
527 int64_t m128_i64[2]; // as signed 64-bit integers.
528 uint8_t m128_u8[16]; // as unsigned 8-bit integers.
529 uint16_t m128_u16[8]; // as unsigned 16-bit integers.
530 uint32_t m128_u32[4]; // as unsigned 32-bit integers.
531 uint64_t m128_u64[2]; // as unsigned 64-bit integers.
532} SIMDVec;
533
534// casting using SIMDVec
535#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n])
536#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n])
537#define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n])
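/* Illustrative usage sketch (not part of the upstream header): reading one
 * lane of an __m128i through the SIMDVec aliasing macros above:
 *
 *   __m128i v = _mm_set_epi32(4, 3, 2, 1);
 *   uint32_t lane0 = vreinterpretq_nth_u32_m128i(v, 0);   // == 1
 *
 * Prefer the proper extract intrinsics where possible; this union access is
 * provided mainly for legacy code that pokes at MSVC's __m128 members.
 */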
538
539/* SSE macros */
540#define _MM_GET_FLUSH_ZERO_MODE _sse2neon_mm_get_flush_zero_mode
541#define _MM_SET_FLUSH_ZERO_MODE _sse2neon_mm_set_flush_zero_mode
542#define _MM_GET_DENORMALS_ZERO_MODE _sse2neon_mm_get_denormals_zero_mode
543#define _MM_SET_DENORMALS_ZERO_MODE _sse2neon_mm_set_denormals_zero_mode
544
545// Function declaration
546// SSE
547FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void);
548FORCE_INLINE __m128 _mm_move_ss(__m128, __m128);
549FORCE_INLINE __m128 _mm_or_ps(__m128, __m128);
550FORCE_INLINE __m128 _mm_set_ps1(float);
551FORCE_INLINE __m128 _mm_setzero_ps(void);
552// SSE2
553FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i);
554FORCE_INLINE __m128i _mm_castps_si128(__m128);
556FORCE_INLINE __m128i _mm_cvtps_epi32(__m128);
557FORCE_INLINE __m128d _mm_move_sd(__m128d, __m128d);
558FORCE_INLINE __m128i _mm_or_si128(__m128i, __m128i);
559FORCE_INLINE __m128i _mm_set_epi32(int, int, int, int);
560FORCE_INLINE __m128i _mm_set_epi64x(int64_t, int64_t);
561FORCE_INLINE __m128d _mm_set_pd(double, double);
562FORCE_INLINE __m128i _mm_set1_epi32(int);
563FORCE_INLINE __m128i _mm_setzero_si128(void);
564// SSE4.1
565FORCE_INLINE __m128d _mm_ceil_pd(__m128d);
566FORCE_INLINE __m128 _mm_ceil_ps(__m128);
567FORCE_INLINE __m128d _mm_floor_pd(__m128d);
568FORCE_INLINE __m128 _mm_floor_ps(__m128);
569FORCE_INLINE __m128d _mm_round_pd(__m128d, int);
570FORCE_INLINE __m128 _mm_round_ps(__m128, int);
571// SSE4.2
572FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t);
573
574/* Backwards compatibility for compilers lacking support for specific types */
575
576// Older GCC does not provide the vld1q_u8_x4 intrinsic
577#if defined(__GNUC__) && !defined(__clang__) && \
578 ((__GNUC__ <= 13 && defined(__arm__)) || \
579 (__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__)) || \
580 (__GNUC__ <= 9 && defined(__aarch64__)))
581FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
582{
583 uint8x16x4_t ret;
584 ret.val[0] = vld1q_u8(p + 0);
585 ret.val[1] = vld1q_u8(p + 16);
586 ret.val[2] = vld1q_u8(p + 32);
587 ret.val[3] = vld1q_u8(p + 48);
588 return ret;
589}
590#else
591// Wraps vld1q_u8_x4
592FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
593{
594 return vld1q_u8_x4(p);
595}
596#endif
597
598#if !defined(__aarch64__) && !defined(_M_ARM64)
599/* emulate vaddv u8 variant */
600FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
601{
602 const uint64x1_t v1 = vpaddl_u32(vpaddl_u16(vpaddl_u8(v8)));
603 return vget_lane_u8(vreinterpret_u8_u64(v1), 0);
604}
605#else
606// Wraps vaddv_u8
607FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
608{
609 return vaddv_u8(v8);
610}
611#endif
612
613#if !defined(__aarch64__) && !defined(_M_ARM64)
614/* emulate vaddvq u8 variant */
615FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)
616{
617 uint8x8_t tmp = vpadd_u8(vget_low_u8(a), vget_high_u8(a));
618 uint8_t res = 0;
619 for (int i = 0; i < 8; ++i)
620 res += tmp[i];
621 return res;
622}
623#else
624// Wraps vaddvq_u8
625FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)
626{
627 return vaddvq_u8(a);
628}
629#endif
630
631#if !defined(__aarch64__) && !defined(_M_ARM64)
632/* emulate vaddvq u16 variant */
633FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
634{
635 uint32x4_t m = vpaddlq_u16(a);
636 uint64x2_t n = vpaddlq_u32(m);
637 uint64x1_t o = vget_low_u64(n) + vget_high_u64(n);
638
639 return vget_lane_u32((uint32x2_t) o, 0);
640}
641#else
642// Wraps vaddvq_u16
643FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
644{
645 return vaddvq_u16(a);
646}
647#endif
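/* Illustrative sketch (not part of the upstream header): the _sse2neon_vaddv*
 * helpers give a uniform horizontal add on both ARMv7 (emulated with pairwise
 * adds) and AArch64 (native vaddv). For example, summing 16 bytes:
 *
 *   uint8x16_t bytes = vdupq_n_u8(1);
 *   uint8_t sum = _sse2neon_vaddvq_u8(bytes);   // 16
 *
 * Note that the result is a uint8_t, so sums above 255 wrap around.
 */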
648
649/* Function Naming Conventions
650 * The naming convention of SSE intrinsics is straightforward. A generic SSE
651 * intrinsic function is given as follows:
652 * _mm_<name>_<data_type>
653 *
654 * The parts of this format are given as follows:
655 * 1. <name> describes the operation performed by the intrinsic
656 * 2. <data_type> identifies the data type of the function's primary arguments
657 *
658 * This last part, <data_type>, is a little complicated. It identifies the
659 * content of the input values, and can be set to any of the following values:
660 * + ps - vectors contain floats (ps stands for packed single-precision)
661 * + pd - vectors contain doubles (pd stands for packed double-precision)
662 * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit
663 * signed integers
664 * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit
665 * unsigned integers
666 * + si128 - unspecified 128-bit vector or 256-bit vector
667 * + m128/m128i/m128d - identifies input vector types when they are different
668 * than the type of the returned vector
669 *
670 * For example, _mm_setzero_ps. The _mm implies that the function returns
671 * a 128-bit vector. The _ps at the end implies that the argument vectors
672 * contain floats.
673 *
674 * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8)
675 * // Set packed 16-bit integers. 128 bits, 8 short, per 16 bits
676 * __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
677 * // Set packed 8-bit integers
678 * // 128 bits, 16 chars, per 8 bits
679 * __m128i v_perm = _mm_setr_epi8(1, 0, 2, 3, 8, 9, 10, 11,
680 * 4, 5, 12, 13, 6, 7, 14, 15);
681 * // Shuffle packed 8-bit integers
682 * __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb
683 */
684
685/* Constants for use with _mm_prefetch. */
686enum _mm_hint {
687 _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */
688 _MM_HINT_T0 = 1, /* load data to L1 and L2 cache */
689 _MM_HINT_T1 = 2, /* load data to L2 cache only */
690 _MM_HINT_T2 = 3, /* load data to L2 cache only, mark it as NTA */
691};
692
693// The bit field mapping to the FPCR (floating-point control register)
694typedef struct {
695 uint16_t res0;
696 uint8_t res1 : 6;
697 uint8_t bit22 : 1;
698 uint8_t bit23 : 1;
699 uint8_t bit24 : 1;
700 uint8_t res2 : 7;
701#if defined(__aarch64__) || defined(_M_ARM64)
702 uint32_t res3;
703#endif
704} fpcr_bitfield;
705
706// Takes the upper 64 bits of a and places it in the low end of the result
707// Takes the lower 64 bits of b and places it into the high end of the result.
709{
710 float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
711 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
712 return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
713}
714
715// takes the lower two 32-bit values from a and swaps them and places in high
716// end of result takes the higher two 32 bit values from b and swaps them and
717// places in low end of result.
719{
720 float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
721 float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b)));
722 return vreinterpretq_m128_f32(vcombine_f32(a01, b23));
723}
724
726{
727 float32x2_t a21 = vget_high_f32(
729 float32x2_t b03 = vget_low_f32(
731 return vreinterpretq_m128_f32(vcombine_f32(a21, b03));
732}
733
735{
736 float32x2_t a03 = vget_low_f32(
738 float32x2_t b21 = vget_high_f32(
740 return vreinterpretq_m128_f32(vcombine_f32(a03, b21));
741}
742
744{
745 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
746 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
747 return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
748}
749
751{
752 float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
753 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
754 return vreinterpretq_m128_f32(vcombine_f32(a01, b10));
755}
756
758{
759 float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
760 float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b)));
761 return vreinterpretq_m128_f32(vcombine_f32(a01, b01));
762}
763
764// keeps the low 64 bits of b in the low and puts the high 64 bits of a in the
765// high
767{
768 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
769 float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
770 return vreinterpretq_m128_f32(vcombine_f32(a10, b32));
771}
772
774{
775 float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1);
776 float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
777 return vreinterpretq_m128_f32(vcombine_f32(a11, b00));
778}
779
781{
782 float32x2_t a22 =
783 vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
784 float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
785 return vreinterpretq_m128_f32(vcombine_f32(a22, b00));
786}
787
789{
790 float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0);
791 float32x2_t b22 =
792 vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0);
793 return vreinterpretq_m128_f32(vcombine_f32(a00, b22));
794}
795
797{
798 float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
799 float32x2_t a22 =
800 vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
801 float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/
802 float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
803 return vreinterpretq_m128_f32(vcombine_f32(a02, b32));
804}
805
807{
808 float32x2_t a33 =
809 vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1);
810 float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1);
811 return vreinterpretq_m128_f32(vcombine_f32(a33, b11));
812}
813
815{
816 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
817 float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
818 float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
819 float32x2_t b20 = vset_lane_f32(b2, b00, 1);
820 return vreinterpretq_m128_f32(vcombine_f32(a10, b20));
821}
822
824{
825 float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
826 float32_t b2 = vgetq_lane_f32(b, 2);
827 float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
828 float32x2_t b20 = vset_lane_f32(b2, b00, 1);
829 return vreinterpretq_m128_f32(vcombine_f32(a01, b20));
830}
831
833{
834 float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
835 float32_t b2 = vgetq_lane_f32(b, 2);
836 float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
837 float32x2_t b20 = vset_lane_f32(b2, b00, 1);
838 return vreinterpretq_m128_f32(vcombine_f32(a32, b20));
839}
840
841// For MSVC, we check only if it is ARM64, as every single ARM64 processor
842// supported by WoA has crypto extensions. If this changes in the future,
843// this can be verified via the runtime-only method of:
844// IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE)
845#if (defined(_M_ARM64) && !defined(__clang__)) || \
846 (defined(__ARM_FEATURE_CRYPTO) && \
847 (defined(__aarch64__) || __has_builtin(__builtin_arm_crypto_vmullp64)))
848// Wraps vmull_p64
849FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
850{
851 poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
852 poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
853#if defined(_MSC_VER)
854 __n64 a1 = {a}, b1 = {b};
855 return vreinterpretq_u64_p128(vmull_p64(a1, b1));
856#else
857 return vreinterpretq_u64_p128(vmull_p64(a, b));
858#endif
859}
860#else // ARMv7 polyfill
861// ARMv7 and some AArch64 configurations lack vmull_p64, but they do have vmull_p8.
862//
863// vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a
864// 64-bit->128-bit polynomial multiply.
865//
866// It needs some work and is somewhat slow, but it is still faster than all
867// known scalar methods.
868//
869// Algorithm adapted to C from
870// https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted
871// from "Fast Software Polynomial Multiplication on ARM Processors Using the
872// NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab
873// (https://hal.inria.fr/hal-01506572)
874static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
875{
876 poly8x8_t a = vreinterpret_p8_u64(_a);
877 poly8x8_t b = vreinterpret_p8_u64(_b);
878
879 // Masks
880 uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
881 vcreate_u8(0x00000000ffffffff));
882 uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
883 vcreate_u8(0x0000000000000000));
884
885 // Do the multiplies, rotating with vext to get all combinations
886 uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b)); // D = A0 * B0
887 uint8x16_t e =
888 vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1))); // E = A0 * B1
889 uint8x16_t f =
890 vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b)); // F = A1 * B0
891 uint8x16_t g =
892 vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2))); // G = A0 * B2
893 uint8x16_t h =
894 vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b)); // H = A2 * B0
895 uint8x16_t i =
896 vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3))); // I = A0 * B3
897 uint8x16_t j =
898 vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b)); // J = A3 * B0
899 uint8x16_t k =
900 vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4))); // L = A0 * B4
901
902 // Add cross products
903 uint8x16_t l = veorq_u8(e, f); // L = E + F
904 uint8x16_t m = veorq_u8(g, h); // M = G + H
905 uint8x16_t n = veorq_u8(i, j); // N = I + J
906
907 // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL
908 // instructions.
909#if defined(__aarch64__)
910 uint8x16_t lm_p0 = vreinterpretq_u8_u64(
911 vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
912 uint8x16_t lm_p1 = vreinterpretq_u8_u64(
913 vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
914 uint8x16_t nk_p0 = vreinterpretq_u8_u64(
915 vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
916 uint8x16_t nk_p1 = vreinterpretq_u8_u64(
917 vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
918#else
919 uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
920 uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
921 uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
922 uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
923#endif
924 // t0 = (L) (P0 + P1) << 8
925 // t1 = (M) (P2 + P3) << 16
926 uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
927 uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
928 uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);
929
930 // t2 = (N) (P4 + P5) << 24
931 // t3 = (K) (P6 + P7) << 32
932 uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
933 uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
934 uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);
935
936 // De-interleave
937#if defined(__aarch64__)
938 uint8x16_t t0 = vreinterpretq_u8_u64(
939 vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
940 uint8x16_t t1 = vreinterpretq_u8_u64(
941 vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
942 uint8x16_t t2 = vreinterpretq_u8_u64(
943 vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
944 uint8x16_t t3 = vreinterpretq_u8_u64(
945 vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
946#else
947 uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
948 uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
949 uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
950 uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
951#endif
952 // Shift the cross products
953 uint8x16_t t0_shift = vextq_u8(t0, t0, 15); // t0 << 8
954 uint8x16_t t1_shift = vextq_u8(t1, t1, 14); // t1 << 16
955 uint8x16_t t2_shift = vextq_u8(t2, t2, 13); // t2 << 24
956 uint8x16_t t3_shift = vextq_u8(t3, t3, 12); // t3 << 32
957
958 // Accumulate the products
959 uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
960 uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
961 uint8x16_t mix = veorq_u8(d, cross1);
962 uint8x16_t r = veorq_u8(mix, cross2);
963 return vreinterpretq_u64_u8(r);
964}
965#endif // ARMv7 polyfill
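/* Illustrative usage sketch (not part of the upstream header): either branch
 * above yields the same 64x64 -> 128-bit carry-less (polynomial) multiply:
 *
 *   uint64x1_t x = vcreate_u64(0x8000000000000001ULL);
 *   uint64x1_t y = vcreate_u64(0x3ULL);
 *   uint64x2_t p = _sse2neon_vmull_p64(x, y);   // polynomial product of x, y
 *
 * This internal helper is what the header's PCLMULQDQ-style carry-less
 * multiply support builds on.
 */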
966
967// C equivalent:
968// __m128i _mm_shuffle_epi32_default(__m128i a,
969// __constrange(0, 255) int imm) {
970// __m128i ret;
971// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3];
972// ret[2] = a[(imm >> 4) & 0x03]; ret[3] = a[(imm >> 6) & 0x03];
973// return ret;
974// }
975#define _mm_shuffle_epi32_default(a, imm) \
976 vreinterpretq_m128i_s32(vsetq_lane_s32( \
977 vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \
978 vsetq_lane_s32( \
979 vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \
980 vsetq_lane_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), \
981 ((imm) >> 2) & 0x3), \
982 vmovq_n_s32(vgetq_lane_s32( \
983 vreinterpretq_s32_m128i(a), (imm) & (0x3))), \
984 1), \
985 2), \
986 3))
987
988// Takes the upper 64 bits of a and places it in the low end of the result
989// Takes the lower 64 bits of a and places it into the high end of the result.
991{
992 int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
993 int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
994 return vreinterpretq_m128i_s32(vcombine_s32(a32, a10));
995}
996
997// takes the lower two 32-bit values from a and swaps them and places in low end
998// of result takes the higher two 32 bit values from a and swaps them and places
999// in high end of result.
1001{
1002 int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
1003 int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a)));
1004 return vreinterpretq_m128i_s32(vcombine_s32(a01, a23));
1005}
1006
1007// rotates the least significant 32 bits into the most significant 32 bits, and
1008// shifts the rest down
1010{
1013}
1014
1015// rotates the most significant 32 bits into the least significant 32 bits, and
1016// shifts the rest up
1018{
1021}
1022
1023// gets the lower 64 bits of a, and places it in the upper 64 bits
1024// gets the lower 64 bits of a and places it in the lower 64 bits
1026{
1027 int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
1028 return vreinterpretq_m128i_s32(vcombine_s32(a10, a10));
1029}
1030
1031// gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the
1032// lower 64 bits gets the lower 64 bits of a, and places it in the upper 64 bits
1034{
1035 int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
1036 int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
1037 return vreinterpretq_m128i_s32(vcombine_s32(a01, a10));
1038}
1039
1040// gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the
1041// upper 64 bits gets the lower 64 bits of a, swaps the 0 and 1 elements, and
1042// places it in the lower 64 bits
1044{
1045 int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
1046 return vreinterpretq_m128i_s32(vcombine_s32(a01, a01));
1047}
1048
1050{
1051 int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1);
1052 int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
1053 return vreinterpretq_m128i_s32(vcombine_s32(a11, a22));
1054}
1055
1057{
1058 int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
1059 int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
1060 return vreinterpretq_m128i_s32(vcombine_s32(a22, a01));
1061}
1062
1064{
1065 int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
1066 int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1);
1067 return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
1068}
1069
1070#if defined(__aarch64__) || defined(_M_ARM64)
1071#define _mm_shuffle_epi32_splat(a, imm) \
1072 vreinterpretq_m128i_s32(vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm)))
1073#else
1074#define _mm_shuffle_epi32_splat(a, imm) \
1075 vreinterpretq_m128i_s32( \
1076 vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))))
1077#endif
1078
1079// NEON does not support a general purpose permute intrinsic.
1080// Shuffle single-precision (32-bit) floating-point elements in a using the
1081// control in imm8, and store the results in dst.
1082//
1083// C equivalent:
1084// __m128 _mm_shuffle_ps_default(__m128 a, __m128 b,
1085// __constrange(0, 255) int imm) {
1086// __m128 ret;
1087// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3];
1088// ret[2] = b[(imm >> 4) & 0x03]; ret[3] = b[(imm >> 6) & 0x03];
1089// return ret;
1090// }
1091//
1092// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_ps
1093#define _mm_shuffle_ps_default(a, b, imm) \
1094 vreinterpretq_m128_f32(vsetq_lane_f32( \
1095 vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \
1096 vsetq_lane_f32( \
1097 vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \
1098 vsetq_lane_f32( \
1099 vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \
1100 vmovq_n_f32( \
1101 vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3))), \
1102 1), \
1103 2), \
1104 3))
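/* Illustrative usage sketch (not part of the upstream header): the public
 * _mm_shuffle_ps provided later in this header expands, for a generic
 * immediate, into the lane-by-lane construction above, e.g.:
 *
 *   __m128 a = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);   // a = {0, 1, 2, 3}
 *   __m128 b = _mm_set_ps(7.0f, 6.0f, 5.0f, 4.0f);   // b = {4, 5, 6, 7}
 *   __m128 r = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 2, 1, 0));
 *   // r = {a[0], a[1], b[2], b[3]} = {0, 1, 6, 7}
 */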
1105
1106// Shuffle 16-bit integers in the low 64 bits of a using the control in imm8.
1107// Store the results in the low 64 bits of dst, with the high 64 bits being
1108// copied from a to dst.
1109// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflelo_epi16
1110#define _mm_shufflelo_epi16_function(a, imm) \
1111 _sse2neon_define1( \
1112 __m128i, a, int16x8_t ret = vreinterpretq_s16_m128i(_a); \
1113 int16x4_t lowBits = vget_low_s16(ret); \
1114 ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0); \
1115 ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \
1116 1); \
1117 ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \
1118 2); \
1119 ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \
1120 3); \
1121 _sse2neon_return(vreinterpretq_m128i_s16(ret));)
1122
1123// Shuffle 16-bit integers in the high 64 bits of a using the control in imm8.
1124// Store the results in the high 64 bits of dst, with the low 64 bits being
1125// copied from a to dst.
1126// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflehi_epi16
1127#define _mm_shufflehi_epi16_function(a, imm) \
1128 _sse2neon_define1( \
1129 __m128i, a, int16x8_t ret = vreinterpretq_s16_m128i(_a); \
1130 int16x4_t highBits = vget_high_s16(ret); \
1131 ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4); \
1132 ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \
1133 5); \
1134 ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \
1135 6); \
1136 ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \
1137 7); \
1138 _sse2neon_return(vreinterpretq_m128i_s16(ret));)
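/* Illustrative usage sketch (not part of the upstream header): the
 * _mm_shufflelo_epi16 / _mm_shufflehi_epi16 wrappers defined later in this
 * header reorder only one 64-bit half and copy the other half through, e.g.:
 *
 *   __m128i v = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
 *   __m128i r = _mm_shufflelo_epi16(v, _MM_SHUFFLE(0, 1, 2, 3));
 *   // r = {3, 2, 1, 0, 4, 5, 6, 7}  (low four lanes reversed, high copied)
 */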
1139
1140/* MMX */
1141
1142// _mm_empty is a no-op on Arm
1143FORCE_INLINE void _mm_empty(void) {}
1144
1145/* SSE */
1146
1147// Add packed single-precision (32-bit) floating-point elements in a and b, and
1148// store the results in dst.
1149// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ps
1150FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
1151{
1152 return vreinterpretq_m128_f32(
1153 vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1154}
1155
1156// Add the lower single-precision (32-bit) floating-point element in a and b,
1157// store the result in the lower element of dst, and copy the upper 3 packed
1158// elements from a to the upper elements of dst.
1159// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ss
1160FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
1161{
1162 float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
1163 float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
1164 // the upper values in the result must be the remnants of <a>.
1165 return vreinterpretq_m128_f32(vaddq_f32(a, value));
1166}
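/* Illustrative sketch (not part of the upstream header): _mm_add_ss only adds
 * the lowest lanes and passes a's upper lanes through, unlike _mm_add_ps:
 *
 *   __m128 a = _mm_set_ps(40.0f, 30.0f, 20.0f, 10.0f);
 *   __m128 b = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
 *   __m128 s = _mm_add_ss(a, b);   // s = {11, 20, 30, 40} (lane order 0..3)
 */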
1167
1168// Compute the bitwise AND of packed single-precision (32-bit) floating-point
1169// elements in a and b, and store the results in dst.
1170// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_ps
1171FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
1172{
1173 return vreinterpretq_m128_s32(
1174 vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
1175}
1176
1177// Compute the bitwise NOT of packed single-precision (32-bit) floating-point
1178// elements in a and then AND with b, and store the results in dst.
1179// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_ps
1180FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
1181{
1182 return vreinterpretq_m128_s32(
1183 vbicq_s32(vreinterpretq_s32_m128(b),
1184 vreinterpretq_s32_m128(a))); // *NOTE* argument swap
1185}
1186
1187// Average packed unsigned 16-bit integers in a and b, and store the results in
1188// dst.
1189// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu16
1190FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b)
1191{
1192 return vreinterpret_m64_u16(
1193 vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)));
1194}
1195
1196// Average packed unsigned 8-bit integers in a and b, and store the results in
1197// dst.
1198// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu8
1199FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b)
1200{
1201 return vreinterpret_m64_u8(
1202 vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
1203}
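/* Illustrative sketch (not part of the upstream header): like the x86
 * instruction, vrhadd_u8 averages with rounding, i.e. (a + b + 1) >> 1:
 *
 *   __m64 x = _mm_set1_pi8(1);
 *   __m64 y = _mm_set1_pi8(2);
 *   __m64 avg = _mm_avg_pu8(x, y);   // every byte is (1 + 2 + 1) >> 1 == 2
 */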
1204
1205// Compare packed single-precision (32-bit) floating-point elements in a and b
1206// for equality, and store the results in dst.
1207// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ps
1209{
1212}
1213
1214// Compare the lower single-precision (32-bit) floating-point elements in a and
1215// b for equality, store the result in the lower element of dst, and copy the
1216// upper 3 packed elements from a to the upper elements of dst.
1217// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ss
1219{
1220 return _mm_move_ss(a, _mm_cmpeq_ps(a, b));
1221}
1222
1223// Compare packed single-precision (32-bit) floating-point elements in a and b
1224// for greater-than-or-equal, and store the results in dst.
1225// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ps
1227{
1230}
1231
1232// Compare the lower single-precision (32-bit) floating-point elements in a and
1233// b for greater-than-or-equal, store the result in the lower element of dst,
1234// and copy the upper 3 packed elements from a to the upper elements of dst.
1235// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ss
1237{
1238 return _mm_move_ss(a, _mm_cmpge_ps(a, b));
1239}
1240
1241// Compare packed single-precision (32-bit) floating-point elements in a and b
1242// for greater-than, and store the results in dst.
1243// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ps
1245{
1248}
1249
1250// Compare the lower single-precision (32-bit) floating-point elements in a and
1251// b for greater-than, store the result in the lower element of dst, and copy
1252// the upper 3 packed elements from a to the upper elements of dst.
1253// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ss
1255{
1256 return _mm_move_ss(a, _mm_cmpgt_ps(a, b));
1257}
1258
1259// Compare packed single-precision (32-bit) floating-point elements in a and b
1260// for less-than-or-equal, and store the results in dst.
1261// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ps
1263{
1266}
1267
1268// Compare the lower single-precision (32-bit) floating-point elements in a and
1269// b for less-than-or-equal, store the result in the lower element of dst, and
1270// copy the upper 3 packed elements from a to the upper elements of dst.
1271// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ss
1273{
1274 return _mm_move_ss(a, _mm_cmple_ps(a, b));
1275}
1276
1277// Compare packed single-precision (32-bit) floating-point elements in a and b
1278// for less-than, and store the results in dst.
1279// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ps
1281{
1284}
1285
1286// Compare the lower single-precision (32-bit) floating-point elements in a and
1287// b for less-than, store the result in the lower element of dst, and copy the
1288// upper 3 packed elements from a to the upper elements of dst.
1289// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ss
1291{
1292 return _mm_move_ss(a, _mm_cmplt_ps(a, b));
1293}
1294
1295// Compare packed single-precision (32-bit) floating-point elements in a and b
1296// for not-equal, and store the results in dst.
1297// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ps
1299{
1300 return vreinterpretq_m128_u32(vmvnq_u32(
1302}
1303
1304// Compare the lower single-precision (32-bit) floating-point elements in a and
1305// b for not-equal, store the result in the lower element of dst, and copy the
1306// upper 3 packed elements from a to the upper elements of dst.
1307// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ss
1309{
1310 return _mm_move_ss(a, _mm_cmpneq_ps(a, b));
1311}
1312
1313// Compare packed single-precision (32-bit) floating-point elements in a and b
1314// for not-greater-than-or-equal, and store the results in dst.
1315// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ps
1317{
1318 return vreinterpretq_m128_u32(vmvnq_u32(
1320}
1321
1322// Compare the lower single-precision (32-bit) floating-point elements in a and
1323// b for not-greater-than-or-equal, store the result in the lower element of
1324// dst, and copy the upper 3 packed elements from a to the upper elements of
1325// dst.
1326// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ss
1328{
1329 return _mm_move_ss(a, _mm_cmpnge_ps(a, b));
1330}
1331
1332// Compare packed single-precision (32-bit) floating-point elements in a and b
1333// for not-greater-than, and store the results in dst.
1334// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ps
1336{
1337 return vreinterpretq_m128_u32(vmvnq_u32(
1339}
1340
1341// Compare the lower single-precision (32-bit) floating-point elements in a and
1342// b for not-greater-than, store the result in the lower element of dst, and
1343// copy the upper 3 packed elements from a to the upper elements of dst.
1344// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ss
1346{
1347 return _mm_move_ss(a, _mm_cmpngt_ps(a, b));
1348}
1349
1350// Compare packed single-precision (32-bit) floating-point elements in a and b
1351// for not-less-than-or-equal, and store the results in dst.
1352// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ps
1354{
1355 return vreinterpretq_m128_u32(vmvnq_u32(
1357}
1358
1359// Compare the lower single-precision (32-bit) floating-point elements in a and
1360// b for not-less-than-or-equal, store the result in the lower element of dst,
1361// and copy the upper 3 packed elements from a to the upper elements of dst.
1362// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ss
1364{
1365 return _mm_move_ss(a, _mm_cmpnle_ps(a, b));
1366}
1367
1368// Compare packed single-precision (32-bit) floating-point elements in a and b
1369// for not-less-than, and store the results in dst.
1370// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ps
1372{
1373 return vreinterpretq_m128_u32(vmvnq_u32(
1375}
1376
1377// Compare the lower single-precision (32-bit) floating-point elements in a and
1378// b for not-less-than, store the result in the lower element of dst, and copy
1379// the upper 3 packed elements from a to the upper elements of dst.
1380// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ss
1382{
1383 return _mm_move_ss(a, _mm_cmpnlt_ps(a, b));
1384}
1385
1386// Compare packed single-precision (32-bit) floating-point elements in a and b
1387// to see if neither is NaN, and store the results in dst.
1388// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ps
1389//
1390// See also:
1391// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
1392// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
1394{
1395 // Note: NEON does not have an ordered-compare builtin.
1396 // Compare a == a and b == b to check for NaN,
1397 // then AND the results to get the final mask.
1398 uint32x4_t ceqaa =
1400 uint32x4_t ceqbb =
1402 return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb));
1403}
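/* Illustrative usage sketch (not part of the upstream header): _mm_cmpord_ps
 * can be used to build a mask that zeroes out NaN lanes:
 *
 *   __m128 mask = _mm_cmpord_ps(x, x);    // all-ones where x is not NaN
 *   __m128 clean = _mm_and_ps(x, mask);   // NaN lanes become +0.0f
 *
 * Here x is any __m128 value; lanes where x is NaN compare unordered and
 * yield a zero mask, so the AND clears them.
 */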
1404
1405// Compare the lower single-precision (32-bit) floating-point elements in a and
1406// b to see if neither is NaN, store the result in the lower element of dst, and
1407// copy the upper 3 packed elements from a to the upper elements of dst.
1408// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ss
1410{
1411 return _mm_move_ss(a, _mm_cmpord_ps(a, b));
1412}
1413
1414// Compare packed single-precision (32-bit) floating-point elements in a and b
1415// to see if either is NaN, and store the results in dst.
1416// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ps
1418{
1419 uint32x4_t f32a =
1421 uint32x4_t f32b =
1423 return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b)));
1424}
1425
1426// Compare the lower single-precision (32-bit) floating-point elements in a and
1427// b to see if either is NaN, store the result in the lower element of dst, and
1428// copy the upper 3 packed elements from a to the upper elements of dst.
1429// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ss
1431{
1432 return _mm_move_ss(a, _mm_cmpunord_ps(a, b));
1433}
1434
1435// Compare the lower single-precision (32-bit) floating-point element in a and b
1436// for equality, and return the boolean result (0 or 1).
1437// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_ss
1438FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
1439{
1440 uint32x4_t a_eq_b =
1442 return vgetq_lane_u32(a_eq_b, 0) & 0x1;
1443}
1444
1445// Compare the lower single-precision (32-bit) floating-point element in a and b
1446// for greater-than-or-equal, and return the boolean result (0 or 1).
1447// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_ss
1448FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
1449{
1450 uint32x4_t a_ge_b =
1452 return vgetq_lane_u32(a_ge_b, 0) & 0x1;
1453}
1454
1455// Compare the lower single-precision (32-bit) floating-point element in a and b
1456// for greater-than, and return the boolean result (0 or 1).
1457// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_ss
1458FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
1459{
1460 uint32x4_t a_gt_b =
1462 return vgetq_lane_u32(a_gt_b, 0) & 0x1;
1463}
1464
1465// Compare the lower single-precision (32-bit) floating-point element in a and b
1466// for less-than-or-equal, and return the boolean result (0 or 1).
1467// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_ss
1468FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
1469{
1470 uint32x4_t a_le_b =
1472 return vgetq_lane_u32(a_le_b, 0) & 0x1;
1473}
1474
1475// Compare the lower single-precision (32-bit) floating-point element in a and b
1476// for less-than, and return the boolean result (0 or 1).
1477// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_ss
1478FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
1479{
1480 uint32x4_t a_lt_b =
1482 return vgetq_lane_u32(a_lt_b, 0) & 0x1;
1483}
1484
1485// Compare the lower single-precision (32-bit) floating-point element in a and b
1486// for not-equal, and return the boolean result (0 or 1).
1487// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_ss
1488FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
1489{
1490 return !_mm_comieq_ss(a, b);
1491}
1492
1493// Convert packed signed 32-bit integers in b to packed single-precision
1494// (32-bit) floating-point elements, store the results in the lower 2 elements
1495// of dst, and copy the upper 2 packed elements from a to the upper elements of
1496// dst.
1497// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_pi2ps
1498FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)
1499{
1500 return vreinterpretq_m128_f32(
1501     vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
1502                  vget_high_f32(vreinterpretq_f32_m128(a))));
1503}
1504
1505// Convert packed single-precision (32-bit) floating-point elements in a to
1506// packed 32-bit integers, and store the results in dst.
1507// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ps2pi
1508FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a)
1509{
1510#if (defined(__aarch64__) || defined(_M_ARM64)) || \
1511 defined(__ARM_FEATURE_DIRECTED_ROUNDING)
1512 return vreinterpret_m64_s32(
1513 vget_low_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a)))));
1514#else
1515 return vreinterpret_m64_s32(vcvt_s32_f32(vget_low_f32(
1516     vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)))));
1517#endif
1518}
1519
1520// Convert the signed 32-bit integer b to a single-precision (32-bit)
1521// floating-point element, store the result in the lower element of dst, and
1522// copy the upper 3 packed elements from a to the upper elements of dst.
1523// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_si2ss
1524FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
1525{
1526 return vreinterpretq_m128_f32(
1527     vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
1528}
1529
1530// Convert the lower single-precision (32-bit) floating-point element in a to a
1531// 32-bit integer, and store the result in dst.
1532// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ss2si
1533FORCE_INLINE int _mm_cvt_ss2si(__m128 a)
1534{
1535#if (defined(__aarch64__) || defined(_M_ARM64)) || \
1536 defined(__ARM_FEATURE_DIRECTED_ROUNDING)
1537 return vgetq_lane_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))),
1538 0);
1539#else
1540 float32_t data = vgetq_lane_f32(
1541     vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);
1542 return (int32_t) data;
1543#endif
1544}
1545
1546// Convert packed 16-bit integers in a to packed single-precision (32-bit)
1547// floating-point elements, and store the results in dst.
1548// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi16_ps
1549FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a)
1550{
1551 return vreinterpretq_m128_f32(
1552     vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a))));
1553}
1554
1555// Convert packed 32-bit integers in b to packed single-precision (32-bit)
1556// floating-point elements, store the results in the lower 2 elements of dst,
1557// and copy the upper 2 packed elements from a to the upper elements of dst.
1558// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_ps
1559FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b)
1560{
1561 return vreinterpretq_m128_f32(
1562     vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
1563                  vget_high_f32(vreinterpretq_f32_m128(a))));
1564}
1565
1566// Convert packed signed 32-bit integers in a to packed single-precision
1567// (32-bit) floating-point elements, store the results in the lower 2 elements
1568// of dst, then convert the packed signed 32-bit integers in b to
1569// single-precision (32-bit) floating-point element, and store the results in
1570// the upper 2 elements of dst.
1571// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32x2_ps
1572FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)
1573{
1574 return vreinterpretq_m128_f32(vcvtq_f32_s32(
1575 vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))));
1576}
1577
1578// Convert the lower packed 8-bit integers in a to packed single-precision
1579// (32-bit) floating-point elements, and store the results in dst.
1580// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi8_ps
1581FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a)
1582{
1583 return vreinterpretq_m128_f32(vcvtq_f32_s32(
1584 vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a))))));
1585}
1586
1587// Convert packed single-precision (32-bit) floating-point elements in a to
1588// packed 16-bit integers, and store the results in dst. Note: this intrinsic
1589// will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and
1590// 0x7FFFFFFF.
1591// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi16
1592FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a)
1593{
1594 return vreinterpret_m64_s16(
1595     vqmovn_s32(vreinterpretq_s32_m128i(_mm_cvtps_epi32(a))));
1596}
1597
1598// Convert packed single-precision (32-bit) floating-point elements in a to
1599// packed 32-bit integers, and store the results in dst.
1600// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi32
1601#define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a)
1602
1603// Convert packed single-precision (32-bit) floating-point elements in a to
1604// packed 8-bit integers, and store the results in lower 4 elements of dst.
1605// Note: this intrinsic will generate 0x7F, rather than 0x80, for input values
1606// between 0x7F and 0x7FFFFFFF.
1607// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi8
1608FORCE_INLINE __m64 _mm_cvtps_pi8(__m128 a)
1609{
1610 return vreinterpret_m64_s8(vqmovn_s16(
1611 vcombine_s16(vreinterpret_s16_m64(_mm_cvtps_pi16(a)), vdup_n_s16(0))));
1612}
1613
1614// Convert packed unsigned 16-bit integers in a to packed single-precision
1615// (32-bit) floating-point elements, and store the results in dst.
1616// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu16_ps
1617FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a)
1618{
1619 return vreinterpretq_m128_f32(
1620     vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a))));
1621}
1622
1623// Convert the lower packed unsigned 8-bit integers in a to packed
1624// single-precision (32-bit) floating-point elements, and store the results in
1625// dst.
1626// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu8_ps
1627FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a)
1628{
1629 return vreinterpretq_m128_f32(vcvtq_f32_u32(
1630 vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a))))));
1631}
1632
1633// Convert the signed 32-bit integer b to a single-precision (32-bit)
1634// floating-point element, store the result in the lower element of dst, and
1635// copy the upper 3 packed elements from a to the upper elements of dst.
1636// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_ss
1637#define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b)
1638
1639// Convert the signed 64-bit integer b to a single-precision (32-bit)
1640// floating-point element, store the result in the lower element of dst, and
1641// copy the upper 3 packed elements from a to the upper elements of dst.
1642// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_ss
1643FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b)
1644{
1645 return vreinterpretq_m128_f32(
1646     vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
1647}
1648
1649// Copy the lower single-precision (32-bit) floating-point element of a to dst.
1650// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_f32
1651FORCE_INLINE float _mm_cvtss_f32(__m128 a)
1652{
1653 return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
1654}
1655
1656// Convert the lower single-precision (32-bit) floating-point element in a to a
1657// 32-bit integer, and store the result in dst.
1658// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si32
1659#define _mm_cvtss_si32(a) _mm_cvt_ss2si(a)
1660
1661// Convert the lower single-precision (32-bit) floating-point element in a to a
1662// 64-bit integer, and store the result in dst.
1663// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si64
1664FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a)
1665{
1666#if (defined(__aarch64__) || defined(_M_ARM64)) || \
1667 defined(__ARM_FEATURE_DIRECTED_ROUNDING)
1668 return (int64_t) vgetq_lane_f32(vrndiq_f32(vreinterpretq_f32_m128(a)), 0);
1669#else
1670 float32_t data = vgetq_lane_f32(
1671     vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);
1672 return (int64_t) data;
1673#endif
1674}
1675
1676// Convert packed single-precision (32-bit) floating-point elements in a to
1677// packed 32-bit integers with truncation, and store the results in dst.
1678// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ps2pi
1679FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a)
1680{
1681 return vreinterpret_m64_s32(
1682 vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))));
1683}
1684
1685// Convert the lower single-precision (32-bit) floating-point element in a to a
1686// 32-bit integer with truncation, and store the result in dst.
1687// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ss2si
1688FORCE_INLINE int _mm_cvtt_ss2si(__m128 a)
1689{
1690 return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0);
1691}
1692
1693// Convert packed single-precision (32-bit) floating-point elements in a to
1694// packed 32-bit integers with truncation, and store the results in dst.
1695// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_pi32
1696#define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a)
1697
1698// Convert the lower single-precision (32-bit) floating-point element in a to a
1699// 32-bit integer with truncation, and store the result in dst.
1700// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si32
1701#define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a)
1702
1703// Convert the lower single-precision (32-bit) floating-point element in a to a
1704// 64-bit integer with truncation, and store the result in dst.
1705// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si64
1706FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a)
1707{
1708 return (int64_t) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
1709}
1710
1711// Divide packed single-precision (32-bit) floating-point elements in a by
1712// packed elements in b, and store the results in dst.
1713// Due to ARMv7-A NEON's lack of a precise division intrinsic, we implement
1714// division by refining an estimate of b's reciprocal with the Newton-Raphson
1715// method and then multiplying a by it.
1716// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ps
1717FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
1718{
1719#if defined(__aarch64__) || defined(_M_ARM64)
1720 return vreinterpretq_m128_f32(
1721     vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1722#else
1723 float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b));
1724 recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
1725 // Additional Newton-Raphson iteration for accuracy
1726 recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
1727 return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip));
1728#endif
1729}
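// Sketch of the ARMv7-A fallback above (illustrative note, not upstream code):
// vrecpeq_f32(b) gives a rough estimate x of 1/b, and vrecpsq_f32(x, b)
// evaluates 2 - x*b, so each "recip = vmulq_f32(recip, vrecpsq_f32(recip, b))"
// line is one Newton-Raphson step that roughly doubles the number of correct
// bits; the final vmulq_f32(a, recip) then approximates a / b.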
1730
1731// Divide the lower single-precision (32-bit) floating-point element in a by the
1732// lower single-precision (32-bit) floating-point element in b, store the result
1733// in the lower element of dst, and copy the upper 3 packed elements from a to
1734// the upper elements of dst.
1735// Warning: on ARMv7-A this does not produce the same result as Intel and is
1736// not IEEE-compliant.
1737// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ss
1738FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
1739{
1740 float32_t value =
1741 vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);
1742 return vreinterpretq_m128_f32(
1743     vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
1744}
1745
1746// Extract a 16-bit integer from a, selected with imm8, and store the result in
1747// the lower element of dst.
1748// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_pi16
1749#define _mm_extract_pi16(a, imm) \
1750 (int32_t) vget_lane_u16(vreinterpret_u16_m64(a), (imm))
1751
1752// Free aligned memory that was allocated with _mm_malloc.
1753// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_free
1754#if !defined(SSE2NEON_ALLOC_DEFINED)
1755FORCE_INLINE void _mm_free(void *addr)
1756{
1757 free(addr);
1758}
1759#endif
1760
1761FORCE_INLINE uint64_t _sse2neon_get_fpcr(void)
1762{
1763 uint64_t value;
1764#if defined(_MSC_VER)
1765 value = _ReadStatusReg(ARM64_FPCR);
1766#else
1767 __asm__ __volatile__("mrs %0, FPCR" : "=r"(value)); /* read */
1768#endif
1769 return value;
1770}
1771
1772FORCE_INLINE void _sse2neon_set_fpcr(uint64_t value)
1773{
1774#if defined(_MSC_VER)
1775 _WriteStatusReg(ARM64_FPCR, value);
1776#else
1777 __asm__ __volatile__("msr FPCR, %0" ::"r"(value)); /* write */
1778#endif
1779}
1780
1781// Macro: Get the flush zero bits from the MXCSR control and status register.
1782// The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or
1783// _MM_FLUSH_ZERO_OFF
1784// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_FLUSH_ZERO_MODE
1785FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode(void)
1786{
1787 union {
1788 fpcr_bitfield field;
1789#if defined(__aarch64__) || defined(_M_ARM64)
1790 uint64_t value;
1791#else
1792 uint32_t value;
1793#endif
1794 } r;
1795
1796#if defined(__aarch64__) || defined(_M_ARM64)
1797 r.value = _sse2neon_get_fpcr();
1798#else
1799 __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
1800#endif
1801
1802 return r.field.bit24 ? _MM_FLUSH_ZERO_ON : _MM_FLUSH_ZERO_OFF;
1803}
1804
1805// Macro: Get the rounding mode bits from the MXCSR control and status register.
1806// The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST,
1807// _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
1808// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE
1809FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void)
1810{
1811 union {
1812 fpcr_bitfield field;
1813#if defined(__aarch64__) || defined(_M_ARM64)
1814 uint64_t value;
1815#else
1816 uint32_t value;
1817#endif
1818 } r;
1819
1820#if defined(__aarch64__) || defined(_M_ARM64)
1821 r.value = _sse2neon_get_fpcr();
1822#else
1823 __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
1824#endif
1825
1826 if (r.field.bit22) {
1827 return r.field.bit23 ? _MM_ROUND_TOWARD_ZERO : _MM_ROUND_UP;
1828 } else {
1829 return r.field.bit23 ? _MM_ROUND_DOWN : _MM_ROUND_NEAREST;
1830 }
1831}
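// For reference, the rounding-mode bit pattern implied by the logic above and
// used again in _MM_SET_ROUNDING_MODE below:
//   bit23 = 0, bit22 = 0 : _MM_ROUND_NEAREST
//   bit23 = 0, bit22 = 1 : _MM_ROUND_UP
//   bit23 = 1, bit22 = 0 : _MM_ROUND_DOWN
//   bit23 = 1, bit22 = 1 : _MM_ROUND_TOWARD_ZERO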
1832
1833// Copy a to dst, and insert the 16-bit integer i into dst at the location
1834// specified by imm8.
1835// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_pi16
1836#define _mm_insert_pi16(a, b, imm) \
1837 vreinterpret_m64_s16(vset_lane_s16((b), vreinterpret_s16_m64(a), (imm)))
1838
1839// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point
1840// elements) from memory into dst. mem_addr must be aligned on a 16-byte
1841// boundary or a general-protection exception may be generated.
1842// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps
1843FORCE_INLINE __m128 _mm_load_ps(const float *p)
1844{
1845 return vreinterpretq_m128_f32(vld1q_f32(p));
1846}
1847
1848// Load a single-precision (32-bit) floating-point element from memory into all
1849// elements of dst.
1850//
1851// dst[31:0] := MEM[mem_addr+31:mem_addr]
1852// dst[63:32] := MEM[mem_addr+31:mem_addr]
1853// dst[95:64] := MEM[mem_addr+31:mem_addr]
1854// dst[127:96] := MEM[mem_addr+31:mem_addr]
1855//
1856// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps1
1857#define _mm_load_ps1 _mm_load1_ps
1858
1859// Load a single-precision (32-bit) floating-point element from memory into the
1860// lower of dst, and zero the upper 3 elements. mem_addr does not need to be
1861// aligned on any particular boundary.
1862// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ss
1863FORCE_INLINE __m128 _mm_load_ss(const float *p)
1864{
1865 return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));
1866}
1867
1868// Load a single-precision (32-bit) floating-point element from memory into all
1869// elements of dst.
1870// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_ps
1871FORCE_INLINE __m128 _mm_load1_ps(const float *p)
1872{
1873 return vreinterpretq_m128_f32(vld1q_dup_f32(p));
1874}
1875
1876// Load 2 single-precision (32-bit) floating-point elements from memory into the
1877// upper 2 elements of dst, and copy the lower 2 elements from a to dst.
1878// mem_addr does not need to be aligned on any particular boundary.
1879// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pi
1880FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p)
1881{
1882 return vreinterpretq_m128_f32(
1883     vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p)));
1884}
1885
1886// Load 2 single-precision (32-bit) floating-point elements from memory into the
1887// lower 2 elements of dst, and copy the upper 2 elements from a to dst.
1888// mem_addr does not need to be aligned on any particular boundary.
1889// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pi
1890FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p)
1891{
1892 return vreinterpretq_m128_f32(
1893     vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a)));
1894}
1895
1896// Load 4 single-precision (32-bit) floating-point elements from memory into dst
1897// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
1898// general-protection exception may be generated.
1899// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_ps
1900FORCE_INLINE __m128 _mm_loadr_ps(const float *p)
1901{
1902 float32x4_t v = vrev64q_f32(vld1q_f32(p));
1903 return vreinterpretq_m128_f32(vextq_f32(v, v, 2));
1904}
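// How the reversal above works (illustrative): vrev64q_f32 swaps the elements
// within each 64-bit half, (p0,p1,p2,p3) -> (p1,p0,p3,p2), and vextq_f32(v, v, 2)
// rotates the vector by two lanes, yielding the fully reversed (p3,p2,p1,p0).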
1905
1906// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point
1907// elements) from memory into dst. mem_addr does not need to be aligned on any
1908// particular boundary.
1909// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_ps
1910FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
1911{
1912 // For NEON, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps
1913 // are equivalent.
1914 return vreinterpretq_m128_f32(vld1q_f32(p));
1915}
1916
1917// Load unaligned 16-bit integer from memory into the first element of dst.
1918// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si16
1919FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
1920{
1921 return vreinterpretq_m128i_s16(
1922     vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0));
1923}
1924
1925// Load unaligned 64-bit integer from memory into the first element of dst.
1926// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si64
1927FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
1928{
1929 return vreinterpretq_m128i_s64(
1930     vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0)));
1931}
1932
1933// Allocate size bytes of memory, aligned to the alignment specified in align,
1934// and return a pointer to the allocated memory. _mm_free should be used to free
1935// memory that is allocated with _mm_malloc.
1936// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_malloc
1937#if !defined(SSE2NEON_ALLOC_DEFINED)
1938FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
1939{
1940 void *ptr;
1941 if (align == 1)
1942 return malloc(size);
1943 if (align == 2 || (sizeof(void *) == 8 && align == 4))
1944 align = sizeof(void *);
1945 if (!posix_memalign(&ptr, align, size))
1946 return ptr;
1947 return NULL;
1948}
1949#endif
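// Example usage (illustrative): a 16-byte-aligned buffer suitable for the
// aligned load/store intrinsics; posix_memalign additionally requires the
// alignment to be a power of two and a multiple of sizeof(void *).
//   float *buf = (float *) _mm_malloc(64 * sizeof(float), 16);
//   /* ... _mm_load_ps(buf), _mm_store_ps(buf, v) ... */
//   _mm_free(buf);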
1950
1951// Conditionally store 8-bit integer elements from a into memory using mask
1952// (elements are not stored when the highest bit is not set in the corresponding
1953// element) and a non-temporal memory hint.
1954// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmove_si64
1955FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr)
1956{
1957 int8x8_t shr_mask = vshr_n_s8(vreinterpret_s8_m64(mask), 7);
1958 __m128 b = _mm_load_ps((const float *) mem_addr);
1959 int8x8_t masked =
1960 vbsl_s8(vreinterpret_u8_s8(shr_mask), vreinterpret_s8_m64(a),
1961 vreinterpret_s8_u64(vget_low_u64(vreinterpretq_u64_m128(b))));
1962 vst1_s8((int8_t *) mem_addr, masked);
1963}
1964
1965// Conditionally store 8-bit integer elements from a into memory using mask
1966// (elements are not stored when the highest bit is not set in the corresponding
1967// element) and a non-temporal memory hint.
1968// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_maskmovq
1969#define _m_maskmovq(a, mask, mem_addr) _mm_maskmove_si64(a, mask, mem_addr)
1970
1971// Compare packed signed 16-bit integers in a and b, and store packed maximum
1972// values in dst.
1973// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pi16
1974FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b)
1975{
1976 return vreinterpret_m64_s16(
1977     vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
1978}
1979
1980// Compare packed single-precision (32-bit) floating-point elements in a and b,
1981// and store packed maximum values in dst. dst does not follow the IEEE Standard
1982// for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or
1983// signed-zero values.
1984// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ps
1985FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
1986{
1987#if SSE2NEON_PRECISE_MINMAX
1988 float32x4_t _a = vreinterpretq_f32_m128(a);
1989 float32x4_t _b = vreinterpretq_f32_m128(b);
1990 return vreinterpretq_m128_f32(vbslq_f32(vcgtq_f32(_a, _b), _a, _b));
1991#else
1992 return vreinterpretq_m128_f32(
1993     vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1994#endif
1995}
1996
1997// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
1998// values in dst.
1999// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pu8
2000FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b)
2001{
2002 return vreinterpret_m64_u8(
2003     vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
2004}
2005
2006// Compare the lower single-precision (32-bit) floating-point elements in a and
2007// b, store the maximum value in the lower element of dst, and copy the upper 3
2008// packed elements from a to the upper element of dst. dst does not follow the
2009// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when
2010// inputs are NaN or signed-zero values.
2011// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ss
2012FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
2013{
2014 float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0);
2015 return vreinterpretq_m128_f32(
2016     vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
2017}
2018
2019// Compare packed signed 16-bit integers in a and b, and store packed minimum
2020// values in dst.
2021// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pi16
2022FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b)
2023{
2024 return vreinterpret_m64_s16(
2025     vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
2026}
2027
2028// Compare packed single-precision (32-bit) floating-point elements in a and b,
2029// and store packed minimum values in dst. dst does not follow the IEEE Standard
2030// for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or
2031// signed-zero values.
2032// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ps
2033FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
2034{
2035#if SSE2NEON_PRECISE_MINMAX
2036 float32x4_t _a = vreinterpretq_f32_m128(a);
2037 float32x4_t _b = vreinterpretq_f32_m128(b);
2038 return vreinterpretq_m128_f32(vbslq_f32(vcltq_f32(_a, _b), _a, _b));
2039#else
2040 return vreinterpretq_m128_f32(
2041     vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2042#endif
2043}
2044
2045// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
2046// values in dst.
2047// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pu8
2048FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b)
2049{
2050 return vreinterpret_m64_u8(
2051     vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
2052}
2053
2054// Compare the lower single-precision (32-bit) floating-point elements in a and
2055// b, store the minimum value in the lower element of dst, and copy the upper 3
2056// packed elements from a to the upper element of dst. dst does not follow the
2057// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when
2058// inputs are NaN or signed-zero values.
2059// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ss
2060FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
2061{
2062 float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0);
2063 return vreinterpretq_m128_f32(
2064     vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
2065}
2066
2067// Move the lower single-precision (32-bit) floating-point element from b to the
2068// lower element of dst, and copy the upper 3 packed elements from a to the
2069// upper elements of dst.
2070// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_ss
2071FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b)
2072{
2073 return vreinterpretq_m128_f32(
2074 vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0),
2075 vreinterpretq_f32_m128(a), 0));
2076}
2077
2078// Move the upper 2 single-precision (32-bit) floating-point elements from b to
2079// the lower 2 elements of dst, and copy the upper 2 elements from a to the
2080// upper 2 elements of dst.
2081// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehl_ps
2082FORCE_INLINE __m128 _mm_movehl_ps(__m128 a, __m128 b)
2083{
2084#if defined(__aarch64__) || defined(_M_ARM64)
2085 return vreinterpretq_m128_u64(
2086     vzip2q_u64(vreinterpretq_u64_m128(b), vreinterpretq_u64_m128(a)));
2087#else
2088 float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
2089 float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
2090 return vreinterpretq_m128_f32(vcombine_f32(b32, a32));
2091#endif
2092}
2093
2094// Move the lower 2 single-precision (32-bit) floating-point elements from b to
2095// the upper 2 elements of dst, and copy the lower 2 elements from a to the
2096// lower 2 elements of dst.
2097// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movelh_ps
2098FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
2099{
2100 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A));
2101 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B));
2102 return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
2103}
2104
2105// Create mask from the most significant bit of each 8-bit element in a, and
2106// store the result in dst.
2107// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pi8
2108FORCE_INLINE int _mm_movemask_pi8(__m64 a)
2109{
2110 uint8x8_t input = vreinterpret_u8_m64(a);
2111#if defined(__aarch64__) || defined(_M_ARM64)
2112 static const int8_t shift[8] = {0, 1, 2, 3, 4, 5, 6, 7};
2113 uint8x8_t tmp = vshr_n_u8(input, 7);
2114 return vaddv_u8(vshl_u8(tmp, vld1_s8(shift)));
2115#else
2116 // Refer the implementation of `_mm_movemask_epi8`
2117 uint16x4_t high_bits = vreinterpret_u16_u8(vshr_n_u8(input, 7));
2118 uint32x2_t paired16 =
2119 vreinterpret_u32_u16(vsra_n_u16(high_bits, high_bits, 7));
2120 uint8x8_t paired32 =
2121 vreinterpret_u8_u32(vsra_n_u32(paired16, paired16, 14));
2122 return vget_lane_u8(paired32, 0) | ((int) vget_lane_u8(paired32, 4) << 4);
2123#endif
2124}
2125
2126// Set each bit of mask dst based on the most significant bit of the
2127// corresponding packed single-precision (32-bit) floating-point element in a.
2128// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_ps
2129FORCE_INLINE int _mm_movemask_ps(__m128 a)
2130{
2131 uint32x4_t input = vreinterpretq_u32_m128(a);
2132#if defined(__aarch64__) || defined(_M_ARM64)
2133 static const int32_t shift[4] = {0, 1, 2, 3};
2134 uint32x4_t tmp = vshrq_n_u32(input, 31);
2135 return vaddvq_u32(vshlq_u32(tmp, vld1q_s32(shift)));
2136#else
2137 // Uses the exact same method as _mm_movemask_epi8, see that for details.
2138 // Shift out everything but the sign bits with a 32-bit unsigned shift
2139 // right.
2140 uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
2141 // Merge the two pairs together with a 64-bit unsigned shift right + add.
2142 uint8x16_t paired =
2143 vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
2144 // Extract the result.
2145 return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
2146#endif
2147}
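// Worked example (illustrative): for a = (-1.0f, 2.0f, -3.0f, 4.0f) the sign
// bits of elements 0..3 are 1,0,1,0, so _mm_movemask_ps(a) returns 0b0101 = 5
// (bit i of the result is the sign bit of element i).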
2148
2149// Multiply packed single-precision (32-bit) floating-point elements in a and b,
2150// and store the results in dst.
2151// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ps
2152FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
2153{
2154 return vreinterpretq_m128_f32(
2155     vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2156}
2157
2158// Multiply the lower single-precision (32-bit) floating-point element in a and
2159// b, store the result in the lower element of dst, and copy the upper 3 packed
2160// elements from a to the upper elements of dst.
2161// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ss
2162FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)
2163{
2164 return _mm_move_ss(a, _mm_mul_ps(a, b));
2165}
2166
2167// Multiply the packed unsigned 16-bit integers in a and b, producing
2168// intermediate 32-bit integers, and store the high 16 bits of the intermediate
2169// integers in dst.
2170// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_pu16
2171FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b)
2172{
2173 return vreinterpret_m64_u16(vshrn_n_u32(
2174 vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16));
2175}
2176
2177// Compute the bitwise OR of packed single-precision (32-bit) floating-point
2178// elements in a and b, and store the results in dst.
2179// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_ps
2180FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
2181{
2182 return vreinterpretq_m128_s32(
2183     vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
2184}
2185
2186// Average packed unsigned 8-bit integers in a and b, and store the results in
2187// dst.
2188// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgb
2189#define _m_pavgb(a, b) _mm_avg_pu8(a, b)
2190
2191// Average packed unsigned 16-bit integers in a and b, and store the results in
2192// dst.
2193// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgw
2194#define _m_pavgw(a, b) _mm_avg_pu16(a, b)
2195
2196// Extract a 16-bit integer from a, selected with imm8, and store the result in
2197// the lower element of dst.
2198// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pextrw
2199#define _m_pextrw(a, imm) _mm_extract_pi16(a, imm)
2200
2201// Copy a to dst, and insert the 16-bit integer i into dst at the location
2202// specified by imm8.
2203// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pinsrw
2204#define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm)
2205
2206// Compare packed signed 16-bit integers in a and b, and store packed maximum
2207// values in dst.
2208// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxsw
2209#define _m_pmaxsw(a, b) _mm_max_pi16(a, b)
2210
2211// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
2212// values in dst.
2213// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxub
2214#define _m_pmaxub(a, b) _mm_max_pu8(a, b)
2215
2216// Compare packed signed 16-bit integers in a and b, and store packed minimum
2217// values in dst.
2218// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminsw
2219#define _m_pminsw(a, b) _mm_min_pi16(a, b)
2220
2221// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
2222// values in dst.
2223// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminub
2224#define _m_pminub(a, b) _mm_min_pu8(a, b)
2225
2226// Create mask from the most significant bit of each 8-bit element in a, and
2227// store the result in dst.
2228// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmovmskb
2229#define _m_pmovmskb(a) _mm_movemask_pi8(a)
2230
2231// Multiply the packed unsigned 16-bit integers in a and b, producing
2232// intermediate 32-bit integers, and store the high 16 bits of the intermediate
2233// integers in dst.
2234// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmulhuw
2235#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b)
2236
2237// Fetch the line of data from memory that contains address p to a location in
2238// the cache hierarchy specified by the locality hint i.
2239// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_prefetch
2240FORCE_INLINE void _mm_prefetch(char const *p, int i)
2241{
2242 (void) i;
2243#if defined(_MSC_VER)
2244 switch (i) {
2245 case _MM_HINT_NTA:
2246 __prefetch2(p, 1);
2247 break;
2248 case _MM_HINT_T0:
2249 __prefetch2(p, 0);
2250 break;
2251 case _MM_HINT_T1:
2252 __prefetch2(p, 2);
2253 break;
2254 case _MM_HINT_T2:
2255 __prefetch2(p, 4);
2256 break;
2257 }
2258#else
2259 switch (i) {
2260 case _MM_HINT_NTA:
2261 __builtin_prefetch(p, 0, 0);
2262 break;
2263 case _MM_HINT_T0:
2264 __builtin_prefetch(p, 0, 3);
2265 break;
2266 case _MM_HINT_T1:
2267 __builtin_prefetch(p, 0, 2);
2268 break;
2269 case _MM_HINT_T2:
2270 __builtin_prefetch(p, 0, 1);
2271 break;
2272 }
2273#endif
2274}
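// Note on the mapping above (illustrative): the stronger the temporal-locality
// hint, the higher the locality argument passed to __builtin_prefetch, i.e.
// _MM_HINT_T0/T1/T2 become locality 3/2/1 and _MM_HINT_NTA becomes locality 0
// (no temporal locality).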
2275
2276// Compute the absolute differences of packed unsigned 8-bit integers in a and
2277// b, then horizontally sum each consecutive 8 differences to produce four
2278// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
2279// 16 bits of dst.
2280// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_psadbw
2281#define _m_psadbw(a, b) _mm_sad_pu8(a, b)
2282
2283// Shuffle 16-bit integers in a using the control in imm8, and store the results
2284// in dst.
2285// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pshufw
2286#define _m_pshufw(a, imm) _mm_shuffle_pi16(a, imm)
2287
2288// Compute the approximate reciprocal of packed single-precision (32-bit)
2289// floating-point elements in a, and store the results in dst. The maximum
2290// relative error for this approximation is less than 1.5*2^-12.
2291// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ps
2292FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
2293{
2294 float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
2295 recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
2296 return vreinterpretq_m128_f32(recip);
2297}
2298
2299// Compute the approximate reciprocal of the lower single-precision (32-bit)
2300// floating-point element in a, store the result in the lower element of dst,
2301// and copy the upper 3 packed elements from a to the upper elements of dst. The
2302// maximum relative error for this approximation is less than 1.5*2^-12.
2303// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ss
2304FORCE_INLINE __m128 _mm_rcp_ss(__m128 a)
2305{
2306 return _mm_move_ss(a, _mm_rcp_ps(a));
2307}
2308
2309// Compute the approximate reciprocal square root of packed single-precision
2310// (32-bit) floating-point elements in a, and store the results in dst. The
2311// maximum relative error for this approximation is less than 1.5*2^-12.
2312// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ps
2313FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
2314{
2315 float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in));
2316
2317 // Generate masks for detecting whether input has any 0.0f/-0.0f
2318 // (which becomes positive/negative infinity by IEEE-754 arithmetic rules).
2319 const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);
2320 const uint32x4_t neg_inf = vdupq_n_u32(0xFF800000);
2321 const uint32x4_t has_pos_zero =
2322 vceqq_u32(pos_inf, vreinterpretq_u32_f32(out));
2323 const uint32x4_t has_neg_zero =
2324 vceqq_u32(neg_inf, vreinterpretq_u32_f32(out));
2325
2326 out = vmulq_f32(
2327 out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
2328
2329 // Set output vector element to infinity/negative-infinity if
2330 // the corresponding input vector element is 0.0f/-0.0f.
2331 out = vbslq_f32(has_pos_zero, (float32x4_t) pos_inf, out);
2332 out = vbslq_f32(has_neg_zero, (float32x4_t) neg_inf, out);
2333
2334 return vreinterpretq_m128_f32(out);
2335}
2336
2337// Compute the approximate reciprocal square root of the lower single-precision
2338// (32-bit) floating-point element in a, store the result in the lower element
2339// of dst, and copy the upper 3 packed elements from a to the upper elements of
2340// dst.
2341// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ss
2342FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
2343{
2344 return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0);
2345}
2346
2347// Compute the absolute differences of packed unsigned 8-bit integers in a and
2348// b, then horizontally sum each consecutive 8 differences to produce four
2349// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
2350// 16 bits of dst.
2351// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_pu8
2352FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
2353{
2354 uint64x1_t t = vpaddl_u32(vpaddl_u16(
2355 vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)))));
2356 return vreinterpret_m64_u16(
2357 vset_lane_u16((int) vget_lane_u64(t, 0), vdup_n_u16(0), 0));
2358}
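// Worked example (illustrative): with a = {1,2,3,4,5,6,7,8} and
// b = {8,7,6,5,4,3,2,1}, the absolute differences are {7,5,3,1,1,3,5,7}, so
// _mm_sad_pu8(a, b) returns 32 in the low 16 bits and zero elsewhere.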
2359
2360// Macro: Set the flush zero bits of the MXCSR control and status register to
2361// the value in unsigned 32-bit integer a. The flush zero may contain any of the
2362// following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF
2363// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_FLUSH_ZERO_MODE
2364FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag)
2365{
2366 // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
2367 // regardless of the value of the FZ bit.
2368 union {
2369 fpcr_bitfield field;
2370#if defined(__aarch64__) || defined(_M_ARM64)
2371 uint64_t value;
2372#else
2373 uint32_t value;
2374#endif
2375 } r;
2376
2377#if defined(__aarch64__) || defined(_M_ARM64)
2378 r.value = _sse2neon_get_fpcr();
2379#else
2380 __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
2381#endif
2382
2383 r.field.bit24 = (flag & _MM_FLUSH_ZERO_MASK) == _MM_FLUSH_ZERO_ON;
2384
2385#if defined(__aarch64__) || defined(_M_ARM64)
2386 _sse2neon_set_fpcr(r.value);
2387#else
2388 __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
2389#endif
2390}
2391
2392// Set packed single-precision (32-bit) floating-point elements in dst with the
2393// supplied values.
2394// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps
2395FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
2396{
2397 float ALIGN_STRUCT(16) data[4] = {x, y, z, w};
2398 return vreinterpretq_m128_f32(vld1q_f32(data));
2399}
2400
2401// Broadcast single-precision (32-bit) floating-point value a to all elements of
2402// dst.
2403// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps1
2404FORCE_INLINE __m128 _mm_set_ps1(float _w)
2405{
2406 return vreinterpretq_m128_f32(vdupq_n_f32(_w));
2407}
2408
2409// Macro: Set the rounding mode bits of the MXCSR control and status register to
2410// the value in unsigned 32-bit integer a. The rounding mode may contain any of
2411// the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP,
2412// _MM_ROUND_TOWARD_ZERO
2413// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE
2414FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
2415{
2416 union {
2417 fpcr_bitfield field;
2418#if defined(__aarch64__) || defined(_M_ARM64)
2419 uint64_t value;
2420#else
2421 uint32_t value;
2422#endif
2423 } r;
2424
2425#if defined(__aarch64__) || defined(_M_ARM64)
2426 r.value = _sse2neon_get_fpcr();
2427#else
2428 __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
2429#endif
2430
2431 switch (rounding) {
2432 case _MM_ROUND_TOWARD_ZERO:
2433     r.field.bit22 = 1;
2434 r.field.bit23 = 1;
2435 break;
2436 case _MM_ROUND_DOWN:
2437 r.field.bit22 = 0;
2438 r.field.bit23 = 1;
2439 break;
2440 case _MM_ROUND_UP:
2441 r.field.bit22 = 1;
2442 r.field.bit23 = 0;
2443 break;
2444 default: //_MM_ROUND_NEAREST
2445 r.field.bit22 = 0;
2446 r.field.bit23 = 0;
2447 }
2448
2449#if defined(__aarch64__) || defined(_M_ARM64)
2450 _sse2neon_set_fpcr(r.value);
2451#else
2452 __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
2453#endif
2454}
2455
2456// Copy single-precision (32-bit) floating-point element a to the lower element
2457// of dst, and zero the upper 3 elements.
2458// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ss
2459FORCE_INLINE __m128 _mm_set_ss(float a)
2460{
2461 return vreinterpretq_m128_f32(vsetq_lane_f32(a, vdupq_n_f32(0), 0));
2462}
2463
2464// Broadcast single-precision (32-bit) floating-point value a to all elements of
2465// dst.
2466// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_ps
2467FORCE_INLINE __m128 _mm_set1_ps(float _w)
2468{
2469 return vreinterpretq_m128_f32(vdupq_n_f32(_w));
2470}
2471
2472// Set the MXCSR control and status register with the value in unsigned 32-bit
2473// integer a.
2474// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setcsr
2475// FIXME: _mm_setcsr() implementation supports changing the rounding mode only.
2476FORCE_INLINE void _mm_setcsr(unsigned int a)
2477{
2478 _MM_SET_ROUNDING_MODE(a);
2479}
2480
2481// Get the unsigned 32-bit value of the MXCSR control and status register.
2482// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getcsr
2483// FIXME: _mm_getcsr() implementation supports reading the rounding mode only.
2484FORCE_INLINE unsigned int _mm_getcsr(void)
2485{
2486 return _MM_GET_ROUNDING_MODE();
2487}
2488
2489// Set packed single-precision (32-bit) floating-point elements in dst with the
2490// supplied values in reverse order.
2491// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_ps
2492FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x)
2493{
2494 float ALIGN_STRUCT(16) data[4] = {w, z, y, x};
2495 return vreinterpretq_m128_f32(vld1q_f32(data));
2496}
2497
2498// Return vector of type __m128 with all elements set to zero.
2499// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_ps
2500FORCE_INLINE __m128 _mm_setzero_ps(void)
2501{
2502 return vreinterpretq_m128_f32(vdupq_n_f32(0));
2503}
2504
2505// Shuffle 16-bit integers in a using the control in imm8, and store the results
2506// in dst.
2507// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi16
2508#ifdef _sse2neon_shuffle
2509#define _mm_shuffle_pi16(a, imm) \
2510 vreinterpret_m64_s16(vshuffle_s16( \
2511 vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), (imm & 0x3), \
2512 ((imm >> 2) & 0x3), ((imm >> 4) & 0x3), ((imm >> 6) & 0x3)))
2513#else
2514#define _mm_shuffle_pi16(a, imm) \
2515 _sse2neon_define1( \
2516 __m64, a, int16x4_t ret; \
2517 ret = vmov_n_s16( \
2518 vget_lane_s16(vreinterpret_s16_m64(_a), (imm) & (0x3))); \
2519 ret = vset_lane_s16( \
2520 vget_lane_s16(vreinterpret_s16_m64(_a), ((imm) >> 2) & 0x3), ret, \
2521 1); \
2522 ret = vset_lane_s16( \
2523 vget_lane_s16(vreinterpret_s16_m64(_a), ((imm) >> 4) & 0x3), ret, \
2524 2); \
2525 ret = vset_lane_s16( \
2526 vget_lane_s16(vreinterpret_s16_m64(_a), ((imm) >> 6) & 0x3), ret, \
2527 3); \
2528 _sse2neon_return(vreinterpret_m64_s16(ret));)
2529#endif
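// Example (illustrative): imm8 is consumed two bits at a time, least-significant
// bits first, to pick the source lane for each destination lane, so
// _mm_shuffle_pi16(a, _MM_SHUFFLE(0, 1, 2, 3)) reverses the four 16-bit lanes.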
2530
2531// Perform a serializing operation on all store-to-memory instructions that were
2532// issued prior to this instruction. Guarantees that every store instruction
2533// that precedes, in program order, is globally visible before any store
2534// instruction which follows the fence in program order.
2535// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sfence
2536FORCE_INLINE void _mm_sfence(void)
2537{
2538 _sse2neon_smp_mb();
2539}
2540
2541// Perform a serializing operation on all load-from-memory and store-to-memory
2542// instructions that were issued prior to this instruction. Guarantees that
2543// every memory access that precedes, in program order, the memory fence
2544// instruction is globally visible before any memory instruction which follows
2545// the fence in program order.
2546// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mfence
2547FORCE_INLINE void _mm_mfence(void)
2548{
2549 _sse2neon_smp_mb();
2550}
2551
2552// Perform a serializing operation on all load-from-memory instructions that
2553// were issued prior to this instruction. Guarantees that every load instruction
2554// that precedes, in program order, is globally visible before any load
2555// instruction which follows the fence in program order.
2556// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lfence
2557FORCE_INLINE void _mm_lfence(void)
2558{
2559 _sse2neon_smp_mb();
2560}
2561
2562// FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255)
2563// int imm)
2564#ifdef _sse2neon_shuffle
2565#define _mm_shuffle_ps(a, b, imm) \
2566 __extension__({ \
2567 float32x4_t _input1 = vreinterpretq_f32_m128(a); \
2568 float32x4_t _input2 = vreinterpretq_f32_m128(b); \
2569 float32x4_t _shuf = \
2570 vshuffleq_s32(_input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \
2571 (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \
2572 vreinterpretq_m128_f32(_shuf); \
2573 })
2574#else // generic
2575#define _mm_shuffle_ps(a, b, imm) \
2576 _sse2neon_define2( \
2577 __m128, a, b, __m128 ret; switch (imm) { \
2578 case _MM_SHUFFLE(1, 0, 3, 2): \
2579 ret = _mm_shuffle_ps_1032(_a, _b); \
2580 break; \
2581 case _MM_SHUFFLE(2, 3, 0, 1): \
2582 ret = _mm_shuffle_ps_2301(_a, _b); \
2583 break; \
2584 case _MM_SHUFFLE(0, 3, 2, 1): \
2585 ret = _mm_shuffle_ps_0321(_a, _b); \
2586 break; \
2587 case _MM_SHUFFLE(2, 1, 0, 3): \
2588 ret = _mm_shuffle_ps_2103(_a, _b); \
2589 break; \
2590 case _MM_SHUFFLE(1, 0, 1, 0): \
2591 ret = _mm_movelh_ps(_a, _b); \
2592 break; \
2593 case _MM_SHUFFLE(1, 0, 0, 1): \
2594 ret = _mm_shuffle_ps_1001(_a, _b); \
2595 break; \
2596 case _MM_SHUFFLE(0, 1, 0, 1): \
2597 ret = _mm_shuffle_ps_0101(_a, _b); \
2598 break; \
2599 case _MM_SHUFFLE(3, 2, 1, 0): \
2600 ret = _mm_shuffle_ps_3210(_a, _b); \
2601 break; \
2602 case _MM_SHUFFLE(0, 0, 1, 1): \
2603 ret = _mm_shuffle_ps_0011(_a, _b); \
2604 break; \
2605 case _MM_SHUFFLE(0, 0, 2, 2): \
2606 ret = _mm_shuffle_ps_0022(_a, _b); \
2607 break; \
2608 case _MM_SHUFFLE(2, 2, 0, 0): \
2609 ret = _mm_shuffle_ps_2200(_a, _b); \
2610 break; \
2611 case _MM_SHUFFLE(3, 2, 0, 2): \
2612 ret = _mm_shuffle_ps_3202(_a, _b); \
2613 break; \
2614 case _MM_SHUFFLE(3, 2, 3, 2): \
2615 ret = _mm_movehl_ps(_b, _a); \
2616 break; \
2617 case _MM_SHUFFLE(1, 1, 3, 3): \
2618 ret = _mm_shuffle_ps_1133(_a, _b); \
2619 break; \
2620 case _MM_SHUFFLE(2, 0, 1, 0): \
2621 ret = _mm_shuffle_ps_2010(_a, _b); \
2622 break; \
2623 case _MM_SHUFFLE(2, 0, 0, 1): \
2624 ret = _mm_shuffle_ps_2001(_a, _b); \
2625 break; \
2626 case _MM_SHUFFLE(2, 0, 3, 2): \
2627 ret = _mm_shuffle_ps_2032(_a, _b); \
2628 break; \
2629 default: \
2630 ret = _mm_shuffle_ps_default(_a, _b, (imm)); \
2631 break; \
2632 } _sse2neon_return(ret);)
2633#endif
2634
2635// Compute the square root of packed single-precision (32-bit) floating-point
2636// elements in a, and store the results in dst.
2637// Due to ARMv7-A NEON's lack of a precise square root intrinsic, we implement
2638// the square root by refining an estimate of the reciprocal square root with
2639// the Newton-Raphson method and then multiplying it by the input.
2640// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ps
2641FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
2642{
2643#if defined(__aarch64__) || defined(_M_ARM64)
2644 return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
2645#else
2646 float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in));
2647
2648 // Test for vrsqrteq_f32(0) -> positive infinity case.
2649 // Change to zero, so that s * 1/sqrt(s) result is zero too.
2650 const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);
2651 const uint32x4_t div_by_zero =
2652 vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip));
2653 recip = vreinterpretq_f32_u32(
2654 vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip)));
2655
2656 recip = vmulq_f32(
2657 vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
2658 recip);
2659 // Additional Newton-Raphson iteration for accuracy
2660 recip = vmulq_f32(
2661 vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
2662 recip);
2663
2664 // sqrt(s) = s * 1/sqrt(s)
2665 return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(in), recip));
2666#endif
2667}
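// Sketch of the ARMv7-A fallback above (illustrative note): vrsqrteq_f32 gives
// an estimate x of 1/sqrt(s), vrsqrtsq_f32(x*x, s) evaluates (3 - s*x*x) / 2,
// so each refinement line is one Newton-Raphson step for the reciprocal square
// root, and the result is recovered at the end as sqrt(s) = s * (1/sqrt(s)).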
2668
2669// Compute the square root of the lower single-precision (32-bit) floating-point
2670// element in a, store the result in the lower element of dst, and copy the
2671// upper 3 packed elements from a to the upper elements of dst.
2672// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ss
2673FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
2674{
2675 float32_t value =
2676 vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);
2677 return vreinterpretq_m128_f32(
2678     vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
2679}
2680
2681// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point
2682// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
2683// or a general-protection exception may be generated.
2684// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps
2685FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
2686{
2687 vst1q_f32(p, vreinterpretq_f32_m128(a));
2688}
2689
2690// Store the lower single-precision (32-bit) floating-point element from a into
2691// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
2692// boundary or a general-protection exception may be generated.
2693// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps1
2694FORCE_INLINE void _mm_store_ps1(float *p, __m128 a)
2695{
2696 float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
2697 vst1q_f32(p, vdupq_n_f32(a0));
2698}
2699
2700// Store the lower single-precision (32-bit) floating-point element from a into
2701// memory. mem_addr does not need to be aligned on any particular boundary.
2702// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ss
2703FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
2704{
2705 vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
2706}
2707
2708// Store the lower single-precision (32-bit) floating-point element from a into
2709// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
2710// boundary or a general-protection exception may be generated.
2711// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_ps
2712#define _mm_store1_ps _mm_store_ps1
2713
2714// Store the upper 2 single-precision (32-bit) floating-point elements from a
2715// into memory.
2716// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pi
2717FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)
2718{
2719 *p = vreinterpret_m64_f32(vget_high_f32(a));
2720}
2721
2722// Store the lower 2 single-precision (32-bit) floating-point elements from a
2723// into memory.
2724// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pi
2725FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
2726{
2727 *p = vreinterpret_m64_f32(vget_low_f32(a));
2728}
2729
2730// Store 4 single-precision (32-bit) floating-point elements from a into memory
2731// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
2732// general-protection exception may be generated.
2733// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_ps
2734FORCE_INLINE void _mm_storer_ps(float *p, __m128 a)
2735{
2736 float32x4_t tmp = vrev64q_f32(vreinterpretq_f32_m128(a));
2737 float32x4_t rev = vextq_f32(tmp, tmp, 2);
2738 vst1q_f32(p, rev);
2739}
2740
2741// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point
2742// elements) from a into memory. mem_addr does not need to be aligned on any
2743// particular boundary.
2744// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_ps
2745FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
2746{
2747 vst1q_f32(p, vreinterpretq_f32_m128(a));
2748}
2749
2750// Stores 16-bits of integer data a at the address p.
2751// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si16
2752FORCE_INLINE void _mm_storeu_si16(void *p, __m128i a)
2753{
2754 vst1q_lane_s16((int16_t *) p, vreinterpretq_s16_m128i(a), 0);
2755}
2756
2757// Stores 64-bits of integer data a at the address p.
2758// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si64
2759FORCE_INLINE void _mm_storeu_si64(void *p, __m128i a)
2760{
2761 vst1q_lane_s64((int64_t *) p, vreinterpretq_s64_m128i(a), 0);
2762}
2763
2764// Store 64-bits of integer data from a into memory using a non-temporal memory
2765// hint.
2766// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pi
2767FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a)
2768{
2769 vst1_s64((int64_t *) p, vreinterpret_s64_m64(a));
2770}
2771
2772// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-
2773// point elements) from a into memory using a non-temporal memory hint.
2774// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_ps
2775FORCE_INLINE void _mm_stream_ps(float *p, __m128 a)
2776{
2777#if __has_builtin(__builtin_nontemporal_store)
2778 __builtin_nontemporal_store(a, (float32x4_t *) p);
2779#else
2780 vst1q_f32(p, vreinterpretq_f32_m128(a));
2781#endif
2782}
2783
2784// Subtract packed single-precision (32-bit) floating-point elements in b from
2785// packed single-precision (32-bit) floating-point elements in a, and store the
2786// results in dst.
2787// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ps
2788FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
2789{
2790 return vreinterpretq_m128_f32(
2791     vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2792}
2793
2794// Subtract the lower single-precision (32-bit) floating-point element in b from
2795// the lower single-precision (32-bit) floating-point element in a, store the
2796// result in the lower element of dst, and copy the upper 3 packed elements from
2797// a to the upper elements of dst.
2798// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ss
2799FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
2800{
2801 return _mm_move_ss(a, _mm_sub_ps(a, b));
2802}
2803
2804// Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision
2805// (32-bit) floating-point elements in row0, row1, row2, and row3, and store the
2806// transposed matrix in these vectors (row0 now contains column 0, etc.).
2807// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_TRANSPOSE4_PS
2808#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
2809 do { \
2810 float32x4x2_t ROW01 = vtrnq_f32(row0, row1); \
2811 float32x4x2_t ROW23 = vtrnq_f32(row2, row3); \
2812 row0 = vcombine_f32(vget_low_f32(ROW01.val[0]), \
2813 vget_low_f32(ROW23.val[0])); \
2814 row1 = vcombine_f32(vget_low_f32(ROW01.val[1]), \
2815 vget_low_f32(ROW23.val[1])); \
2816 row2 = vcombine_f32(vget_high_f32(ROW01.val[0]), \
2817 vget_high_f32(ROW23.val[0])); \
2818 row3 = vcombine_f32(vget_high_f32(ROW01.val[1]), \
2819 vget_high_f32(ROW23.val[1])); \
2820 } while (0)
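// Example usage (illustrative):
//   __m128 r0 = _mm_setr_ps(1, 2, 3, 4);
//   __m128 r1 = _mm_setr_ps(5, 6, 7, 8);
//   __m128 r2 = _mm_setr_ps(9, 10, 11, 12);
//   __m128 r3 = _mm_setr_ps(13, 14, 15, 16);
//   _MM_TRANSPOSE4_PS(r0, r1, r2, r3);  /* r0 is now (1, 5, 9, 13), etc. */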
2821
2822// according to the documentation, these intrinsics behave the same as the
2823// non-'u' versions. We'll just alias them here.
2824#define _mm_ucomieq_ss _mm_comieq_ss
2825#define _mm_ucomige_ss _mm_comige_ss
2826#define _mm_ucomigt_ss _mm_comigt_ss
2827#define _mm_ucomile_ss _mm_comile_ss
2828#define _mm_ucomilt_ss _mm_comilt_ss
2829#define _mm_ucomineq_ss _mm_comineq_ss
2830
2831// Return vector of type __m128i with undefined elements.
2832// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_si128
2833FORCE_INLINE __m128i _mm_undefined_si128(void)
2834{
2835#if defined(__GNUC__) || defined(__clang__)
2836#pragma GCC diagnostic push
2837#pragma GCC diagnostic ignored "-Wuninitialized"
2838#endif
2839 __m128i a;
2840#if defined(_MSC_VER)
2841 a = _mm_setzero_si128();
2842#endif
2843 return a;
2844#if defined(__GNUC__) || defined(__clang__)
2845#pragma GCC diagnostic pop
2846#endif
2847}
2848
2849// Return vector of type __m128 with undefined elements.
2850// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_ps
2851FORCE_INLINE __m128 _mm_undefined_ps(void)
2852{
2853#if defined(__GNUC__) || defined(__clang__)
2854#pragma GCC diagnostic push
2855#pragma GCC diagnostic ignored "-Wuninitialized"
2856#endif
2857 __m128 a;
2858#if defined(_MSC_VER)
2859 a = _mm_setzero_ps();
2860#endif
2861 return a;
2862#if defined(__GNUC__) || defined(__clang__)
2863#pragma GCC diagnostic pop
2864#endif
2865}
2866
2867// Unpack and interleave single-precision (32-bit) floating-point elements from
2868// the high half of a and b, and store the results in dst.
2869// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_ps
2870FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
2871{
2872#if defined(__aarch64__) || defined(_M_ARM64)
2873 return vreinterpretq_m128_f32(
2874 vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2875#else
2876 float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a));
2877 float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b));
2878 float32x2x2_t result = vzip_f32(a1, b1);
2879 return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
2880#endif
2881}
2882
2883// Unpack and interleave single-precision (32-bit) floating-point elements from
2884// the low half of a and b, and store the results in dst.
2885// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_ps
2886FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
2887{
2888#if defined(__aarch64__) || defined(_M_ARM64)
2889 return vreinterpretq_m128_f32(
2890 vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2891#else
2892 float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a));
2893 float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b));
2894 float32x2x2_t result = vzip_f32(a1, b1);
2895 return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
2896#endif
2897}
2898
2899// Compute the bitwise XOR of packed single-precision (32-bit) floating-point
2900// elements in a and b, and store the results in dst.
2901// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_ps
2902FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
2903{
2904 return vreinterpretq_m128_s32(
2905 veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
2906}
2907
2908/* SSE2 */
2909
2910// Add packed 16-bit integers in a and b, and store the results in dst.
2911// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi16
2912FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
2913{
2914 return vreinterpretq_m128i_s16(
2915 vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
2916}
2917
2918// Add packed 32-bit integers in a and b, and store the results in dst.
2919// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi32
2920FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
2921{
2922 return vreinterpretq_m128i_s32(
2923 vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
2924}
2925
2926// Add packed 64-bit integers in a and b, and store the results in dst.
2927// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi64
2928FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b)
2929{
2930 return vreinterpretq_m128i_s64(
2931 vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
2932}
2933
2934// Add packed 8-bit integers in a and b, and store the results in dst.
2935// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi8
2936FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
2937{
2938 return vreinterpretq_m128i_s8(
2939 vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
2940}
2941
2942// Add packed double-precision (64-bit) floating-point elements in a and b, and
2943// store the results in dst.
2944// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_pd
2945FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
2946{
2947#if defined(__aarch64__) || defined(_M_ARM64)
2948 return vreinterpretq_m128d_f64(
2949 vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
2950#else
2951 double *da = (double *) &a;
2952 double *db = (double *) &b;
2953 double c[2];
2954 c[0] = da[0] + db[0];
2955 c[1] = da[1] + db[1];
2956 return vld1q_f32((float32_t *) c);
2957#endif
2958}
2959
2960// Add the lower double-precision (64-bit) floating-point element in a and b,
2961// store the result in the lower element of dst, and copy the upper element from
2962// a to the upper element of dst.
2963// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_sd
2964FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b)
2965{
2966#if defined(__aarch64__) || defined(_M_ARM64)
2967 return _mm_move_sd(a, _mm_add_pd(a, b));
2968#else
2969 double *da = (double *) &a;
2970 double *db = (double *) &b;
2971 double c[2];
2972 c[0] = da[0] + db[0];
2973 c[1] = da[1];
2974 return vld1q_f32((float32_t *) c);
2975#endif
2976}
2977
2978// Add 64-bit integers a and b, and store the result in dst.
2979// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_si64
2980FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b)
2981{
2982 return vreinterpret_m64_s64(
2983 vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
2984}
2985
2986// Add packed signed 16-bit integers in a and b using saturation, and store the
2987// results in dst.
2988// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi16
2989FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
2990{
2991 return vreinterpretq_m128i_s16(
2992 vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
2993}
2994
2995// Add packed signed 8-bit integers in a and b using saturation, and store the
2996// results in dst.
2997// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi8
2998FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b)
2999{
3000 return vreinterpretq_m128i_s8(
3001 vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3002}
3003
3004// Add packed unsigned 16-bit integers in a and b using saturation, and store
3005// the results in dst.
3006// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu16
3007FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b)
3008{
3009 return vreinterpretq_m128i_u16(
3010 vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
3011}
3012
3013// Add packed unsigned 8-bit integers in a and b using saturation, and store the
3014// results in dst.
3015// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu8
3016FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
3017{
3018 return vreinterpretq_m128i_u8(
3019 vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
3020}
3021
3022// Compute the bitwise AND of packed double-precision (64-bit) floating-point
3023// elements in a and b, and store the results in dst.
3024// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_pd
3025FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b)
3026{
3027 return vreinterpretq_m128d_s64(
3028 vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
3029}
3030
3031// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
3032// and store the result in dst.
3033// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_si128
3034FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
3035{
3036 return vreinterpretq_m128i_s32(
3037 vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3038}
3039
3040// Compute the bitwise NOT of packed double-precision (64-bit) floating-point
3041// elements in a and then AND with b, and store the results in dst.
3042// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_pd
3043FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b)
3044{
3045 // *NOTE* argument swap
3046 return vreinterpretq_m128d_s64(
3047 vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a)));
3048}
3049
3050// Compute the bitwise NOT of 128 bits (representing integer data) in a and then
3051// AND with b, and store the result in dst.
3052// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_si128
3053FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
3054{
3055 return vreinterpretq_m128i_s32(
3056 vbicq_s32(vreinterpretq_s32_m128i(b),
3057 vreinterpretq_s32_m128i(a))); // *NOTE* argument swap
3058}
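// Illustrative note: like x86 PANDN, the "andnot" intrinsics compute (~a) & b,
// which is why the operands of the NEON bit-clear (vbic) above are swapped:
//
//     __m128i m = _mm_set1_epi32(0x0000FFFF);
//     __m128i v = _mm_set1_epi32(0x12345678);
//     __m128i r = _mm_andnot_si128(m, v);  // (~m) & v == 0x12340000 per lane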
3059
3060// Average packed unsigned 16-bit integers in a and b, and store the results in
3061// dst.
3062// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu16
3063FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b)
3064{
3065 return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a),
3066 vreinterpretq_u16_m128i(b));
3067}
3068
3069// Average packed unsigned 8-bit integers in a and b, and store the results in
3070// dst.
3071// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu8
3072FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
3073{
3074 return vreinterpretq_m128i_u8(
3075 vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
3076}
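// Illustrative note: NEON vrhaddq_u8/vrhaddq_u16 compute the same rounding
// average as x86 PAVGB/PAVGW, i.e. (a + b + 1) >> 1 in a widened intermediate:
//
//     __m128i a = _mm_set1_epi8(1);
//     __m128i b = _mm_set1_epi8(2);
//     __m128i r = _mm_avg_epu8(a, b);  // each lane is (1 + 2 + 1) >> 1 == 2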
3077
3078// Shift a left by imm8 bytes while shifting in zeros, and store the results in
3079// dst.
3080// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bslli_si128
3081#define _mm_bslli_si128(a, imm) _mm_slli_si128(a, imm)
3082
3083// Shift a right by imm8 bytes while shifting in zeros, and store the results in
3084// dst.
3085// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bsrli_si128
3086#define _mm_bsrli_si128(a, imm) _mm_srli_si128(a, imm)
3087
3088// Cast vector of type __m128d to type __m128. This intrinsic is only used for
3089// compilation and does not generate any instructions, thus it has zero latency.
3090// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_ps
3091FORCE_INLINE __m128 _mm_castpd_ps(__m128d a)
3092{
3093 return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a));
3094}
3095
3096// Cast vector of type __m128d to type __m128i. This intrinsic is only used for
3097// compilation and does not generate any instructions, thus it has zero latency.
3098// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_si128
3099FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)
3100{
3101 return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a));
3102}
3103
3104// Cast vector of type __m128 to type __m128d. This intrinsic is only used for
3105// compilation and does not generate any instructions, thus it has zero latency.
3106// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_pd
3107FORCE_INLINE __m128d _mm_castps_pd(__m128 a)
3108{
3109 return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a));
3110}
3111
3112// Cast vector of type __m128 to type __m128i. This intrinsic is only used for
3113// compilation and does not generate any instructions, thus it has zero latency.
3114// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_si128
3115FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
3116{
3117 return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a));
3118}
3119
3120// Cast vector of type __m128i to type __m128d. This intrinsic is only used for
3121// compilation and does not generate any instructions, thus it has zero latency.
3122// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_pd
3123FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a)
3124{
3125#if defined(__aarch64__) || defined(_M_ARM64)
3126 return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a));
3127#else
3128 return vreinterpretq_m128d_f32(vreinterpretq_f32_m128i(a));
3129#endif
3130}
3131
3132// Cast vector of type __m128i to type __m128. This intrinsic is only used for
3133// compilation and does not generate any instructions, thus it has zero latency.
3134// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_ps
3135FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
3136{
3137 return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
3138}
3139
3140// Invalidate and flush the cache line that contains p from all levels of the
3141// cache hierarchy.
3142// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clflush
3143#if defined(__APPLE__)
3144#include <libkern/OSCacheControl.h>
3145#endif
3146FORCE_INLINE void _mm_clflush(void const *p)
3147{
3148 (void) p;
3149
3150 /* sys_icache_invalidate is supported since macOS 10.5.
3151 * However, it does not work on non-jailbroken iOS devices, even though
3152 * it compiles successfully.
3153 */
3154#if defined(__APPLE__)
3155 sys_icache_invalidate((void *) (uintptr_t) p, SSE2NEON_CACHELINE_SIZE);
3156#elif defined(__GNUC__) || defined(__clang__)
3157 uintptr_t ptr = (uintptr_t) p;
3158 __builtin___clear_cache((char *) ptr,
3159 (char *) ptr + SSE2NEON_CACHELINE_SIZE);
3160#elif (_MSC_VER) && SSE2NEON_INCLUDE_WINDOWS_H
3161 FlushInstructionCache(GetCurrentProcess(), p, SSE2NEON_CACHELINE_SIZE);
3162#endif
3163}
3164
3165// Compare packed 16-bit integers in a and b for equality, and store the results
3166// in dst.
3167// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi16
3168FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
3169{
3170 return vreinterpretq_m128i_u16(
3171 vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3172}
3173
3174// Compare packed 32-bit integers in a and b for equality, and store the results
3175// in dst.
3176// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi32
3177FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
3178{
3179 return vreinterpretq_m128i_u32(
3180 vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3181}
3182
3183// Compare packed 8-bit integers in a and b for equality, and store the results
3184// in dst.
3185// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi8
3186FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
3187{
3188 return vreinterpretq_m128i_u8(
3189 vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3190}
3191
3192// Compare packed double-precision (64-bit) floating-point elements in a and b
3193// for equality, and store the results in dst.
3194// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_pd
3195FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b)
3196{
3197#if defined(__aarch64__) || defined(_M_ARM64)
3198 return vreinterpretq_m128d_u64(
3199 vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3200#else
3201 // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
3202 uint32x4_t cmp =
3203 vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
3204 uint32x4_t swapped = vrev64q_u32(cmp);
3205 return vreinterpretq_m128d_u32(vandq_u32(cmp, swapped));
3206#endif
3207}
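// Note on the ARMv7-A fallback above (illustrative): with no 64-bit float
// compare available, each double is compared as two 32-bit words. A 64-bit
// lane is "equal" only if both of its 32-bit halves matched, so the compare
// result is ANDed with a vrev64q_u32-swapped copy of itself, yielding
// all-ones or all-zeros per 64-bit lane. Being a bit-pattern compare, this
// fallback does not follow IEEE-754 semantics for -0.0 == +0.0 or for NaNs.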
3208
3209// Compare the lower double-precision (64-bit) floating-point elements in a and
3210// b for equality, store the result in the lower element of dst, and copy the
3211// upper element from a to the upper element of dst.
3212// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_sd
3213FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b)
3214{
3215 return _mm_move_sd(a, _mm_cmpeq_pd(a, b));
3216}
3217
3218// Compare packed double-precision (64-bit) floating-point elements in a and b
3219// for greater-than-or-equal, and store the results in dst.
3220// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd
3221FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b)
3222{
3223#if defined(__aarch64__) || defined(_M_ARM64)
3224 return vreinterpretq_m128d_u64(
3225 vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3226#else
3227 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3228 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3229 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3230 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3231 uint64_t d[2];
3232 d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3233 d[1] = (*(double *) &a1) >= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
3234
3235 return vreinterpretq_m128d_u64(vld1q_u64(d));
3236#endif
3237}
3238
3239// Compare the lower double-precision (64-bit) floating-point elements in a and
3240// b for greater-than-or-equal, store the result in the lower element of dst,
3241// and copy the upper element from a to the upper element of dst.
3242// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_sd
3243FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b)
3244{
3245#if defined(__aarch64__) || defined(_M_ARM64)
3246 return _mm_move_sd(a, _mm_cmpge_pd(a, b));
3247#else
3248 // expand "_mm_cmpge_pd()" to reduce unnecessary operations
3249 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3250 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3251 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3252 uint64_t d[2];
3253 d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3254 d[1] = a1;
3255
3256 return vreinterpretq_m128d_u64(vld1q_u64(d));
3257#endif
3258}
3259
3260// Compare packed signed 16-bit integers in a and b for greater-than, and store
3261// the results in dst.
3262// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi16
3263FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
3264{
3265 return vreinterpretq_m128i_u16(
3266 vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3267}
3268
3269// Compare packed signed 32-bit integers in a and b for greater-than, and store
3270// the results in dst.
3271// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi32
3272FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
3273{
3274 return vreinterpretq_m128i_u32(
3275 vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3276}
3277
3278// Compare packed signed 8-bit integers in a and b for greater-than, and store
3279// the results in dst.
3280// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi8
3281FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
3282{
3283 return vreinterpretq_m128i_u8(
3284 vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3285}
3286
3287// Compare packed double-precision (64-bit) floating-point elements in a and b
3288// for greater-than, and store the results in dst.
3289// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd
3290FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
3291{
3292#if defined(__aarch64__) || defined(_M_ARM64)
3293 return vreinterpretq_m128d_u64(
3294 vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3295#else
3296 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3297 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3298 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3299 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3300 uint64_t d[2];
3301 d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3302 d[1] = (*(double *) &a1) > (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
3303
3304 return vreinterpretq_m128d_u64(vld1q_u64(d));
3305#endif
3306}
3307
3308// Compare the lower double-precision (64-bit) floating-point elements in a and
3309// b for greater-than, store the result in the lower element of dst, and copy
3310// the upper element from a to the upper element of dst.
3311// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_sd
3312FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
3313{
3314#if defined(__aarch64__) || defined(_M_ARM64)
3315 return _mm_move_sd(a, _mm_cmpgt_pd(a, b));
3316#else
3317 // expand "_mm_cmpgt_pd()" to reduce unnecessary operations
3318 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3319 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3320 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3321 uint64_t d[2];
3322 d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3323 d[1] = a1;
3324
3325 return vreinterpretq_m128d_u64(vld1q_u64(d));
3326#endif
3327}
3328
3329// Compare packed double-precision (64-bit) floating-point elements in a and b
3330// for less-than-or-equal, and store the results in dst.
3331// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_pd
3332FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
3333{
3334#if defined(__aarch64__) || defined(_M_ARM64)
3335 return vreinterpretq_m128d_u64(
3336 vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3337#else
3338 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3339 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3340 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3341 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3342 uint64_t d[2];
3343 d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3344 d[1] = (*(double *) &a1) <= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
3345
3346 return vreinterpretq_m128d_u64(vld1q_u64(d));
3347#endif
3348}
3349
3350// Compare the lower double-precision (64-bit) floating-point elements in a and
3351// b for less-than-or-equal, store the result in the lower element of dst, and
3352// copy the upper element from a to the upper element of dst.
3353// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_sd
3354FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
3355{
3356#if defined(__aarch64__) || defined(_M_ARM64)
3357 return _mm_move_sd(a, _mm_cmple_pd(a, b));
3358#else
3359 // expand "_mm_cmple_pd()" to reduce unnecessary operations
3360 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3361 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3362 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3363 uint64_t d[2];
3364 d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3365 d[1] = a1;
3366
3367 return vreinterpretq_m128d_u64(vld1q_u64(d));
3368#endif
3369}
3370
3371// Compare packed signed 16-bit integers in a and b for less-than, and store the
3372// results in dst. Note: This intrinsic emits the pcmpgtw instruction with the
3373// order of the operands switched.
3374// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi16
3375FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
3376{
3377 return vreinterpretq_m128i_u16(
3378 vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3379}
3380
3381// Compare packed signed 32-bit integers in a and b for less-than, and store the
3382// results in dst. Note: This intrinsic emits the pcmpgtd instruction with the
3383// order of the operands switched.
3384// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi32
3385FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
3386{
3387 return vreinterpretq_m128i_u32(
3388 vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3389}
3390
3391// Compare packed signed 8-bit integers in a and b for less-than, and store the
3392// results in dst. Note: This intrinsic emits the pcmpgtb instruction with the
3393// order of the operands switched.
3394// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi8
3395FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
3396{
3397 return vreinterpretq_m128i_u8(
3398 vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3399}
3400
3401// Compare packed double-precision (64-bit) floating-point elements in a and b
3402// for less-than, and store the results in dst.
3403// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd
3404FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b)
3405{
3406#if defined(__aarch64__) || defined(_M_ARM64)
3407 return vreinterpretq_m128d_u64(
3408 vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3409#else
3410 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3411 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3412 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3413 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3414 uint64_t d[2];
3415 d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3416 d[1] = (*(double *) &a1) < (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
3417
3418 return vreinterpretq_m128d_u64(vld1q_u64(d));
3419#endif
3420}
3421
3422// Compare the lower double-precision (64-bit) floating-point elements in a and
3423// b for less-than, store the result in the lower element of dst, and copy the
3424// upper element from a to the upper element of dst.
3425// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_sd
3426FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
3427{
3428#if defined(__aarch64__) || defined(_M_ARM64)
3429 return _mm_move_sd(a, _mm_cmplt_pd(a, b));
3430#else
3431 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3432 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3433 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3434 uint64_t d[2];
3435 d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3436 d[1] = a1;
3437
3438 return vreinterpretq_m128d_u64(vld1q_u64(d));
3439#endif
3440}
3441
3442// Compare packed double-precision (64-bit) floating-point elements in a and b
3443// for not-equal, and store the results in dst.
3444// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_pd
3445FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b)
3446{
3447#if defined(__aarch64__) || defined(_M_ARM64)
3448 return vreinterpretq_m128d_s32(vmvnq_s32(vreinterpretq_s32_u64(
3449 vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)))));
3450#else
3451 // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
3452 uint32x4_t cmp =
3453 vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
3454 uint32x4_t swapped = vrev64q_u32(cmp);
3455 return vreinterpretq_m128d_u32(vmvnq_u32(vandq_u32(cmp, swapped)));
3456#endif
3457}
3458
3459// Compare the lower double-precision (64-bit) floating-point elements in a and
3460// b for not-equal, store the result in the lower element of dst, and copy the
3461// upper element from a to the upper element of dst.
3462// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_sd
3463FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
3464{
3465 return _mm_move_sd(a, _mm_cmpneq_pd(a, b));
3466}
3467
3468// Compare packed double-precision (64-bit) floating-point elements in a and b
3469// for not-greater-than-or-equal, and store the results in dst.
3470// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd
3471FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b)
3472{
3473#if defined(__aarch64__) || defined(_M_ARM64)
3474 return vreinterpretq_m128d_u64(veorq_u64(
3475 vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
3476 vdupq_n_u64(UINT64_MAX)));
3477#else
3478 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3479 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3480 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3481 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3482 uint64_t d[2];
3483 d[0] =
3484 !((*(double *) &a0) >= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
3485 d[1] =
3486 !((*(double *) &a1) >= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
3487
3488 return vreinterpretq_m128d_u64(vld1q_u64(d));
3489#endif
3490}
3491
3492// Compare the lower double-precision (64-bit) floating-point elements in a and
3493// b for not-greater-than-or-equal, store the result in the lower element of
3494// dst, and copy the upper element from a to the upper element of dst.
3495// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_sd
3496FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b)
3497{
3498 return _mm_move_sd(a, _mm_cmpnge_pd(a, b));
3499}
3500
3501// Compare packed double-precision (64-bit) floating-point elements in a and b
3502// for not-greater-than, and store the results in dst.
3503// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_pd
3504FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b)
3505{
3506#if defined(__aarch64__) || defined(_M_ARM64)
3507 return vreinterpretq_m128d_u64(veorq_u64(
3508 vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
3509 vdupq_n_u64(UINT64_MAX)));
3510#else
3511 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3512 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3513 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3514 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3515 uint64_t d[2];
3516 d[0] =
3517 !((*(double *) &a0) > (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
3518 d[1] =
3519 !((*(double *) &a1) > (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
3520
3521 return vreinterpretq_m128d_u64(vld1q_u64(d));
3522#endif
3523}
3524
3525// Compare the lower double-precision (64-bit) floating-point elements in a and
3526// b for not-greater-than, store the result in the lower element of dst, and
3527// copy the upper element from a to the upper element of dst.
3528// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_sd
3530{
3531 return _mm_move_sd(a, _mm_cmpngt_pd(a, b));
3532}
3533
3534// Compare packed double-precision (64-bit) floating-point elements in a and b
3535// for not-less-than-or-equal, and store the results in dst.
3536// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_pd
3538{
3539#if defined(__aarch64__) || defined(_M_ARM64)
3540 return vreinterpretq_m128d_u64(veorq_u64(
3541 vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
3542 vdupq_n_u64(UINT64_MAX)));
3543#else
3544 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3545 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3546 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3547 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3548 uint64_t d[2];
3549 d[0] =
3550 !((*(double *) &a0) <= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
3551 d[1] =
3552 !((*(double *) &a1) <= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
3553
3554 return vreinterpretq_m128d_u64(vld1q_u64(d));
3555#endif
3556}
3557
3558// Compare the lower double-precision (64-bit) floating-point elements in a and
3559// b for not-less-than-or-equal, store the result in the lower element of dst,
3560// and copy the upper element from a to the upper element of dst.
3561// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_sd
3563{
3564 return _mm_move_sd(a, _mm_cmpnle_pd(a, b));
3565}
3566
3567// Compare packed double-precision (64-bit) floating-point elements in a and b
3568// for not-less-than, and store the results in dst.
3569// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_pd
3571{
3572#if defined(__aarch64__) || defined(_M_ARM64)
3573 return vreinterpretq_m128d_u64(veorq_u64(
3574 vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
3575 vdupq_n_u64(UINT64_MAX)));
3576#else
3577 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3578 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3579 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3580 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3581 uint64_t d[2];
3582 d[0] =
3583 !((*(double *) &a0) < (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
3584 d[1] =
3585 !((*(double *) &a1) < (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
3586
3587 return vreinterpretq_m128d_u64(vld1q_u64(d));
3588#endif
3589}
3590
3591// Compare the lower double-precision (64-bit) floating-point elements in a and
3592// b for not-less-than, store the result in the lower element of dst, and copy
3593// the upper element from a to the upper element of dst.
3594// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_sd
3596{
3597 return _mm_move_sd(a, _mm_cmpnlt_pd(a, b));
3598}
3599
3600// Compare packed double-precision (64-bit) floating-point elements in a and b
3601// to see if neither is NaN, and store the results in dst.
3602// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_pd
3604{
3605#if defined(__aarch64__) || defined(_M_ARM64)
3606 // Excluding NaNs, any two floating point numbers can be compared.
3607 uint64x2_t not_nan_a =
3608 vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
3609 uint64x2_t not_nan_b =
3610 vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
3611 return vreinterpretq_m128d_u64(vandq_u64(not_nan_a, not_nan_b));
3612#else
3613 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3614 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3615 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3616 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3617 uint64_t d[2];
3618 d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
3619 (*(double *) &b0) == (*(double *) &b0))
3620 ? ~UINT64_C(0)
3621 : UINT64_C(0);
3622 d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
3623 (*(double *) &b1) == (*(double *) &b1))
3624 ? ~UINT64_C(0)
3625 : UINT64_C(0);
3626
3627 return vreinterpretq_m128d_u64(vld1q_u64(d));
3628#endif
3629}
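// Illustrative note: the "ordered" predicate relies on the IEEE-754 rule that
// a value compares unequal to itself only when it is NaN, so x == x acts as a
// "not NaN" test:
//
//     __m128d x = _mm_set_pd(NAN, 1.0);   // NAN from <math.h>
//     __m128d r = _mm_cmpord_pd(x, x);
//     // low lane (1.0): all-ones mask; high lane (NAN): all-zeros mask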
3630
3631// Compare the lower double-precision (64-bit) floating-point elements in a and
3632// b to see if neither is NaN, store the result in the lower element of dst, and
3633// copy the upper element from a to the upper element of dst.
3634// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_sd
3636{
3637#if defined(__aarch64__) || defined(_M_ARM64)
3638 return _mm_move_sd(a, _mm_cmpord_pd(a, b));
3639#else
3640 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3641 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3642 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3643 uint64_t d[2];
3644 d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
3645 (*(double *) &b0) == (*(double *) &b0))
3646 ? ~UINT64_C(0)
3647 : UINT64_C(0);
3648 d[1] = a1;
3649
3650 return vreinterpretq_m128d_u64(vld1q_u64(d));
3651#endif
3652}
3653
3654// Compare packed double-precision (64-bit) floating-point elements in a and b
3655// to see if either is NaN, and store the results in dst.
3656// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_pd
3658{
3659#if defined(__aarch64__) || defined(_M_ARM64)
3660 // Two NaNs are not equal in comparison operation.
3661 uint64x2_t not_nan_a =
3662 vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
3663 uint64x2_t not_nan_b =
3664 vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
3665 return vreinterpretq_m128d_s32(
3666 vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b))));
3667#else
3668 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3669 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3670 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3671 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3672 uint64_t d[2];
3673 d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
3674 (*(double *) &b0) == (*(double *) &b0))
3675 ? UINT64_C(0)
3676 : ~UINT64_C(0);
3677 d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
3678 (*(double *) &b1) == (*(double *) &b1))
3679 ? UINT64_C(0)
3680 : ~UINT64_C(0);
3681
3682 return vreinterpretq_m128d_u64(vld1q_u64(d));
3683#endif
3684}
3685
3686// Compare the lower double-precision (64-bit) floating-point elements in a and
3687// b to see if either is NaN, store the result in the lower element of dst, and
3688// copy the upper element from a to the upper element of dst.
3689// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_sd
3690FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
3691{
3692#if defined(__aarch64__) || defined(_M_ARM64)
3693 return _mm_move_sd(a, _mm_cmpunord_pd(a, b));
3694#else
3695 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3696 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3697 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3698 uint64_t d[2];
3699 d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
3700 (*(double *) &b0) == (*(double *) &b0))
3701 ? UINT64_C(0)
3702 : ~UINT64_C(0);
3703 d[1] = a1;
3704
3705 return vreinterpretq_m128d_u64(vld1q_u64(d));
3706#endif
3707}
3708
3709// Compare the lower double-precision (64-bit) floating-point element in a and b
3710// for greater-than-or-equal, and return the boolean result (0 or 1).
3711// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_sd
3712FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b)
3713{
3714#if defined(__aarch64__) || defined(_M_ARM64)
3715 return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1;
3716#else
3717 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3718 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3719
3720 return (*(double *) &a0 >= *(double *) &b0);
3721#endif
3722}
3723
3724// Compare the lower double-precision (64-bit) floating-point element in a and b
3725// for greater-than, and return the boolean result (0 or 1).
3726// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_sd
3727FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b)
3728{
3729#if defined(__aarch64__) || defined(_M_ARM64)
3730 return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1;
3731#else
3732 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3733 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3734
3735 return (*(double *) &a0 > *(double *) &b0);
3736#endif
3737}
3738
3739// Compare the lower double-precision (64-bit) floating-point element in a and b
3740// for less-than-or-equal, and return the boolean result (0 or 1).
3741// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_sd
3742FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b)
3743{
3744#if defined(__aarch64__) || defined(_M_ARM64)
3745 return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1;
3746#else
3747 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3748 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3749
3750 return (*(double *) &a0 <= *(double *) &b0);
3751#endif
3752}
3753
3754// Compare the lower double-precision (64-bit) floating-point element in a and b
3755// for less-than, and return the boolean result (0 or 1).
3756// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_sd
3757FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)
3758{
3759#if defined(__aarch64__) || defined(_M_ARM64)
3760 return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1;
3761#else
3762 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3763 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3764
3765 return (*(double *) &a0 < *(double *) &b0);
3766#endif
3767}
3768
3769// Compare the lower double-precision (64-bit) floating-point element in a and b
3770// for equality, and return the boolean result (0 or 1).
3771// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_sd
3772FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b)
3773{
3774#if defined(__aarch64__) || defined(_M_ARM64)
3775 return vgetq_lane_u64(vceqq_f64(a, b), 0) & 0x1;
3776#else
3777 uint32x4_t a_not_nan =
3778 vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(a));
3779 uint32x4_t b_not_nan =
3780 vceqq_u32(vreinterpretq_u32_m128d(b), vreinterpretq_u32_m128d(b));
3781 uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
3782 uint32x4_t a_eq_b =
3783 vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
3784 uint64x2_t and_results = vandq_u64(vreinterpretq_u64_u32(a_and_b_not_nan),
3785 vreinterpretq_u64_u32(a_eq_b));
3786 return vgetq_lane_u64(and_results, 0) & 0x1;
3787#endif
3788}
3789
3790// Compare the lower double-precision (64-bit) floating-point element in a and b
3791// for not-equal, and return the boolean result (0 or 1).
3792// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_sd
3793FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b)
3794{
3795 return !_mm_comieq_sd(a, b);
3796}
3797
3798// Convert packed signed 32-bit integers in a to packed double-precision
3799// (64-bit) floating-point elements, and store the results in dst.
3800// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_pd
3801FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a)
3802{
3803#if defined(__aarch64__) || defined(_M_ARM64)
3804 return vreinterpretq_m128d_f64(
3805 vcvtq_f64_s64(vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a)))));
3806#else
3807 double a0 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
3808 double a1 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 1);
3809 return _mm_set_pd(a1, a0);
3810#endif
3811}
3812
3813// Convert packed signed 32-bit integers in a to packed single-precision
3814// (32-bit) floating-point elements, and store the results in dst.
3815// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_ps
3816FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
3817{
3818 return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a)));
3819}
3820
3821// Convert packed double-precision (64-bit) floating-point elements in a to
3822// packed 32-bit integers, and store the results in dst.
3823// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi32
3824FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a)
3825{
3826// vrnd32xq_f64 not supported on clang
3827#if defined(__ARM_FEATURE_FRINT) && !defined(__clang__)
3828 float64x2_t rounded = vrnd32xq_f64(vreinterpretq_f64_m128d(a));
3829 int64x2_t integers = vcvtq_s64_f64(rounded);
3830 return vreinterpretq_m128i_s32(
3831 vcombine_s32(vmovn_s64(integers), vdup_n_s32(0)));
3832#else
3833 __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
3834 double d0 = ((double *) &rnd)[0];
3835 double d1 = ((double *) &rnd)[1];
3836 return _mm_set_epi32(0, 0, (int32_t) d1, (int32_t) d0);
3837#endif
3838}
3839
3840// Convert packed double-precision (64-bit) floating-point elements in a to
3841// packed 32-bit integers, and store the results in dst.
3842// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_pi32
3843FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a)
3844{
3845 __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
3846 double d0 = ((double *) &rnd)[0];
3847 double d1 = ((double *) &rnd)[1];
3848 int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0, (int32_t) d1};
3849 return vreinterpret_m64_s32(vld1_s32(data));
3850}
3851
3852// Convert packed double-precision (64-bit) floating-point elements in a to
3853// packed single-precision (32-bit) floating-point elements, and store the
3854// results in dst.
3855// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_ps
3856FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
3857{
3858#if defined(__aarch64__) || defined(_M_ARM64)
3859 float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a));
3860 return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0)));
3861#else
3862 float a0 = (float) ((double *) &a)[0];
3863 float a1 = (float) ((double *) &a)[1];
3864 return _mm_set_ps(0, 0, a1, a0);
3865#endif
3866}
3867
3868// Convert packed signed 32-bit integers in a to packed double-precision
3869// (64-bit) floating-point elements, and store the results in dst.
3870// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_pd
3871FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a)
3872{
3873#if defined(__aarch64__) || defined(_M_ARM64)
3874 return vreinterpretq_m128d_f64(
3875 vcvtq_f64_s64(vmovl_s32(vreinterpret_s32_m64(a))));
3876#else
3877 double a0 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 0);
3878 double a1 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 1);
3879 return _mm_set_pd(a1, a0);
3880#endif
3881}
3882
3883// Convert packed single-precision (32-bit) floating-point elements in a to
3884// packed 32-bit integers, and store the results in dst.
3885// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epi32
3886// *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A
3887// does not support! It is supported on ARMv8-A however.
3888FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
3889{
3890#if defined(__ARM_FEATURE_FRINT)
3891 return vreinterpretq_m128i_s32(vcvtq_s32_f32(vrnd32xq_f32(a)));
3892#elif (defined(__aarch64__) || defined(_M_ARM64)) || \
3893 defined(__ARM_FEATURE_DIRECTED_ROUNDING)
3894 switch (_MM_GET_ROUNDING_MODE()) {
3895 case _MM_ROUND_NEAREST:
3896 return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a));
3897 case _MM_ROUND_DOWN:
3898 return vreinterpretq_m128i_s32(vcvtmq_s32_f32(a));
3899 case _MM_ROUND_UP:
3900 return vreinterpretq_m128i_s32(vcvtpq_s32_f32(a));
3901 default: // _MM_ROUND_TOWARD_ZERO
3902 return vreinterpretq_m128i_s32(vcvtq_s32_f32(a));
3903 }
3904#else
3905 float *f = (float *) &a;
3906 switch (_MM_GET_ROUNDING_MODE()) {
3907 case _MM_ROUND_NEAREST: {
3908 uint32x4_t signmask = vdupq_n_u32(0x80000000);
3909 float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
3910 vdupq_n_f32(0.5f)); /* +/- 0.5 */
3911 int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
3912 vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
3913 int32x4_t r_trunc = vcvtq_s32_f32(
3914 vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
3915 int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
3916 vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
3917 int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
3918 vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
3919 float32x4_t delta = vsubq_f32(
3920 vreinterpretq_f32_m128(a),
3921 vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
3922 uint32x4_t is_delta_half =
3923 vceqq_f32(delta, half); /* delta == +/- 0.5 */
3924 return vreinterpretq_m128i_s32(
3925 vbslq_s32(is_delta_half, r_even, r_normal));
3926 }
3927 case _MM_ROUND_DOWN:
3928 return _mm_set_epi32(floorf(f[3]), floorf(f[2]), floorf(f[1]),
3929 floorf(f[0]));
3930 case _MM_ROUND_UP:
3931 return _mm_set_epi32(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]),
3932 ceilf(f[0]));
3933 default: // _MM_ROUND_TOWARD_ZERO
3934 return _mm_set_epi32((int32_t) f[3], (int32_t) f[2], (int32_t) f[1],
3935 (int32_t) f[0]);
3936 }
3937#endif
3938}
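// Worked example of the round-to-nearest-even emulation above (illustrative):
// for an input lane of 2.5, r_normal = 3 (truncation of 2.5 + 0.5), r_trunc =
// 2, r_even = 2 and delta == 0.5, so the even result 2 is selected; for 2.3
// the delta is not 0.5 and the normal result 2 is kept. This matches SSE's
// default rounding, where 2.5 -> 2 but 3.5 -> 4.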
3939
3940// Convert packed single-precision (32-bit) floating-point elements in a to
3941// packed double-precision (64-bit) floating-point elements, and store the
3942// results in dst.
3943// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pd
3944FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
3945{
3946#if defined(__aarch64__) || defined(_M_ARM64)
3947 return vreinterpretq_m128d_f64(
3948 vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a))));
3949#else
3950 double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
3951 double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
3952 return _mm_set_pd(a1, a0);
3953#endif
3954}
3955
3956// Copy the lower double-precision (64-bit) floating-point element of a to dst.
3957// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_f64
3958FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
3959{
3960#if defined(__aarch64__) || defined(_M_ARM64)
3961 return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);
3962#else
3963 return ((double *) &a)[0];
3964#endif
3965}
3966
3967// Convert the lower double-precision (64-bit) floating-point element in a to a
3968// 32-bit integer, and store the result in dst.
3969// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si32
3970FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a)
3971{
3972#if defined(__aarch64__) || defined(_M_ARM64)
3973 return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
3974#else
3975 __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
3976 double ret = ((double *) &rnd)[0];
3977 return (int32_t) ret;
3978#endif
3979}
3980
3981// Convert the lower double-precision (64-bit) floating-point element in a to a
3982// 64-bit integer, and store the result in dst.
3983// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64
3984FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a)
3985{
3986#if defined(__aarch64__) || defined(_M_ARM64)
3987 return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
3988#else
3989 __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
3990 double ret = ((double *) &rnd)[0];
3991 return (int64_t) ret;
3992#endif
3993}
3994
3995// Convert the lower double-precision (64-bit) floating-point element in a to a
3996// 64-bit integer, and store the result in dst.
3997// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64x
3998#define _mm_cvtsd_si64x _mm_cvtsd_si64
3999
4000// Convert the lower double-precision (64-bit) floating-point element in b to a
4001// single-precision (32-bit) floating-point element, store the result in the
4002// lower element of dst, and copy the upper 3 packed elements from a to the
4003// upper elements of dst.
4004// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_ss
4005FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b)
4006{
4007#if defined(__aarch64__) || defined(_M_ARM64)
4008 return vreinterpretq_m128_f32(vsetq_lane_f32(
4009 vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(b)), 0),
4010 vreinterpretq_f32_m128(a), 0));
4011#else
4012 return vreinterpretq_m128_f32(vsetq_lane_f32((float) ((double *) &b)[0],
4013 vreinterpretq_f32_m128(a), 0));
4014#endif
4015}
4016
4017// Copy the lower 32-bit integer in a to dst.
4018// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si32
4019FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
4020{
4021 return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
4022}
4023
4024// Copy the lower 64-bit integer in a to dst.
4025// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64
4026FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a)
4027{
4028 return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0);
4029}
4030
4031// Copy the lower 64-bit integer in a to dst.
4032// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x
4033#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
4034
4035// Convert the signed 32-bit integer b to a double-precision (64-bit)
4036// floating-point element, store the result in the lower element of dst, and
4037// copy the upper element from a to the upper element of dst.
4038// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_sd
4039FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b)
4040{
4041#if defined(__aarch64__) || defined(_M_ARM64)
4042 return vreinterpretq_m128d_f64(
4043 vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
4044#else
4045 double bf = (double) b;
4046 return vreinterpretq_m128d_s64(
4047 vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
4048#endif
4049}
4050
4051// Copy the lower 64-bit integer in a to dst.
4052// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x
4053#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
4054
4055// Copy 32-bit integer a to the lower elements of dst, and zero the upper
4056// elements of dst.
4057// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_si128
4058FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
4059{
4060 return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0));
4061}
4062
4063// Convert the signed 64-bit integer b to a double-precision (64-bit)
4064// floating-point element, store the result in the lower element of dst, and
4065// copy the upper element from a to the upper element of dst.
4066// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_sd
4067FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b)
4068{
4069#if defined(__aarch64__) || defined(_M_ARM64)
4070 return vreinterpretq_m128d_f64(
4071 vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
4072#else
4073 double bf = (double) b;
4074 return vreinterpretq_m128d_s64(
4075 vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
4076#endif
4077}
4078
4079// Copy 64-bit integer a to the lower element of dst, and zero the upper
4080// element.
4081// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_si128
4082FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
4083{
4084 return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0));
4085}
4086
4087// Copy 64-bit integer a to the lower element of dst, and zero the upper
4088// element.
4089// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_si128
4090#define _mm_cvtsi64x_si128(a) _mm_cvtsi64_si128(a)
4091
4092// Convert the signed 64-bit integer b to a double-precision (64-bit)
4093// floating-point element, store the result in the lower element of dst, and
4094// copy the upper element from a to the upper element of dst.
4095// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_sd
4096#define _mm_cvtsi64x_sd(a, b) _mm_cvtsi64_sd(a, b)
4097
4098// Convert the lower single-precision (32-bit) floating-point element in b to a
4099// double-precision (64-bit) floating-point element, store the result in the
4100// lower element of dst, and copy the upper element from a to the upper element
4101// of dst.
4102// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_sd
4103FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
4104{
4105 double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
4106#if defined(__aarch64__) || defined(_M_ARM64)
4107 return vreinterpretq_m128d_f64(
4108 vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0));
4109#else
4110 return vreinterpretq_m128d_s64(
4111 vsetq_lane_s64(*(int64_t *) &d, vreinterpretq_s64_m128d(a), 0));
4112#endif
4113}
4114
4115// Convert packed double-precision (64-bit) floating-point elements in a to
4116// packed 32-bit integers with truncation, and store the results in dst.
4117// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi32
4118FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a)
4119{
4120 double a0 = ((double *) &a)[0];
4121 double a1 = ((double *) &a)[1];
4122 return _mm_set_epi32(0, 0, (int32_t) a1, (int32_t) a0);
4123}
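// Illustrative contrast with _mm_cvtpd_epi32 above: the extra 't' means
// truncation toward zero instead of rounding in the current mode, e.g.
//
//     __m128d v = _mm_set_pd(-1.7, 2.7);
//     _mm_cvttpd_epi32(v);  // {2, -1, 0, 0}
//     _mm_cvtpd_epi32(v);   // {3, -2, 0, 0} under round-to-nearest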
4124
4125// Convert packed double-precision (64-bit) floating-point elements in a to
4126// packed 32-bit integers with truncation, and store the results in dst.
4127// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_pi32
4128FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a)
4129{
4130 double a0 = ((double *) &a)[0];
4131 double a1 = ((double *) &a)[1];
4132 int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0, (int32_t) a1};
4133 return vreinterpret_m64_s32(vld1_s32(data));
4134}
4135
4136// Convert packed single-precision (32-bit) floating-point elements in a to
4137// packed 32-bit integers with truncation, and store the results in dst.
4138// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epi32
4139FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
4140{
4141 return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)));
4142}
4143
4144// Convert the lower double-precision (64-bit) floating-point element in a to a
4145// 32-bit integer with truncation, and store the result in dst.
4146// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si32
4147FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a)
4148{
4149 double ret = *((double *) &a);
4150 return (int32_t) ret;
4151}
4152
4153// Convert the lower double-precision (64-bit) floating-point element in a to a
4154// 64-bit integer with truncation, and store the result in dst.
4155// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64
4156FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
4157{
4158#if defined(__aarch64__) || defined(_M_ARM64)
4159 return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0);
4160#else
4161 double ret = *((double *) &a);
4162 return (int64_t) ret;
4163#endif
4164}
4165
4166// Convert the lower double-precision (64-bit) floating-point element in a to a
4167// 64-bit integer with truncation, and store the result in dst.
4168// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64x
4169#define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a)
4170
4171// Divide packed double-precision (64-bit) floating-point elements in a by
4172// packed elements in b, and store the results in dst.
4173// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_pd
4174FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)
4175{
4176#if defined(__aarch64__) || defined(_M_ARM64)
4177 return vreinterpretq_m128d_f64(
4178 vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4179#else
4180 double *da = (double *) &a;
4181 double *db = (double *) &b;
4182 double c[2];
4183 c[0] = da[0] / db[0];
4184 c[1] = da[1] / db[1];
4185 return vld1q_f32((float32_t *) c);
4186#endif
4187}
4188
4189// Divide the lower double-precision (64-bit) floating-point element in a by the
4190// lower double-precision (64-bit) floating-point element in b, store the result
4191// in the lower element of dst, and copy the upper element from a to the upper
4192// element of dst.
4193// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_sd
4194FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b)
4195{
4196#if defined(__aarch64__) || defined(_M_ARM64)
4197 float64x2_t tmp =
4198 vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b));
4199 return vreinterpretq_m128d_f64(
4200 vsetq_lane_f64(vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1), tmp, 1));
4201#else
4202 return _mm_move_sd(a, _mm_div_pd(a, b));
4203#endif
4204}
4205
4206// Extract a 16-bit integer from a, selected with imm8, and store the result in
4207// the lower element of dst.
4208// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi16
4209// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm)
4210#define _mm_extract_epi16(a, imm) \
4211 vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm))
4212
4213// Copy a to dst, and insert the 16-bit integer i into dst at the location
4214// specified by imm8.
4215// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi16
4216// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b,
4217// __constrange(0,8) int imm)
4218#define _mm_insert_epi16(a, b, imm) \
4219 vreinterpretq_m128i_s16( \
4220 vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm)))
4221
4222// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point
4223// elements) from memory into dst. mem_addr must be aligned on a 16-byte
4224// boundary or a general-protection exception may be generated.
4225// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd
4226FORCE_INLINE __m128d _mm_load_pd(const double *p)
4227{
4228#if defined(__aarch64__) || defined(_M_ARM64)
4229 return vreinterpretq_m128d_f64(vld1q_f64(p));
4230#else
4231 const float *fp = (const float *) p;
4232 float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]};
4233 return vreinterpretq_m128d_f32(vld1q_f32(data));
4234#endif
4235}
4236
4237// Load a double-precision (64-bit) floating-point element from memory into both
4238// elements of dst.
4239// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd1
4240#define _mm_load_pd1 _mm_load1_pd
4241
4242// Load a double-precision (64-bit) floating-point element from memory into the
4243// lower of dst, and zero the upper element. mem_addr does not need to be
4244// aligned on any particular boundary.
4245// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_sd
4246FORCE_INLINE __m128d _mm_load_sd(const double *p)
4247{
4248#if defined(__aarch64__) || defined(_M_ARM64)
4249 return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));
4250#else
4251 const float *fp = (const float *) p;
4252 float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0};
4253 return vreinterpretq_m128d_f32(vld1q_f32(data));
4254#endif
4255}
4256
4257// Load 128-bits of integer data from memory into dst. mem_addr must be aligned
4258// on a 16-byte boundary or a general-protection exception may be generated.
4259// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_si128
4260FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
4261{
4262 return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
4263}
4264
4265// Load a double-precision (64-bit) floating-point element from memory into both
4266// elements of dst.
4267// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_pd
4268FORCE_INLINE __m128d _mm_load1_pd(const double *p)
4269{
4270#if defined(__aarch64__) || defined(_M_ARM64)
4271 return vreinterpretq_m128d_f64(vld1q_dup_f64(p));
4272#else
4273 return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p));
4274#endif
4275}
4276
4277// Load a double-precision (64-bit) floating-point element from memory into the
4278// upper element of dst, and copy the lower element from a to dst. mem_addr does
4279// not need to be aligned on any particular boundary.
4280// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pd
4281FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
4282{
4283#if defined(__aarch64__) || defined(_M_ARM64)
4284 return vreinterpretq_m128d_f64(
4285 vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p)));
4286#else
4287 return vreinterpretq_m128d_f32(vcombine_f32(
4288 vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p)));
4289#endif
4290}
4291
4292// Load 64-bit integer from memory into the first element of dst.
4293// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_epi64
4294FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
4295{
4296 /* Load the lower 64 bits of the value pointed to by p into the
4297 * lower 64 bits of the result, zeroing the upper 64 bits of the result.
4298 */
4299 return vreinterpretq_m128i_s32(
4300 vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0)));
4301}
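// Usage sketch (illustrative): _mm_loadl_epi64 reads only 8 bytes, so it is a
// convenient way to zero-extend a 64-bit value into a 128-bit register:
//
//     uint64_t x = 0x1122334455667788ULL;
//     __m128i v = _mm_loadl_epi64((const __m128i *) &x);
//     // low 64 bits of v hold x, high 64 bits are zero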
4302
4303// Load a double-precision (64-bit) floating-point element from memory into the
4304// lower element of dst, and copy the upper element from a to dst. mem_addr does
4305// not need to be aligned on any particular boundary.
4306// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pd
4307FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
4308{
4309#if defined(__aarch64__) || defined(_M_ARM64)
4310 return vreinterpretq_m128d_f64(
4311 vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a))));
4312#else
4313 return vreinterpretq_m128d_f32(
4314 vcombine_f32(vld1_f32((const float *) p),
4315 vget_high_f32(vreinterpretq_f32_m128d(a))));
4316#endif
4317}
4318
4319// Load 2 double-precision (64-bit) floating-point elements from memory into dst
4320// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
4321// general-protection exception may be generated.
4322// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_pd
4323FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
4324{
4325#if defined(__aarch64__) || defined(_M_ARM64)
4326 float64x2_t v = vld1q_f64(p);
4327 return vreinterpretq_m128d_f64(vextq_f64(v, v, 1));
4328#else
4329 int64x2_t v = vld1q_s64((const int64_t *) p);
4330 return vreinterpretq_m128d_s64(vextq_s64(v, v, 1));
4331#endif
4332}
4333
4334// Load 2 double-precision (64-bit) floating-point elements from unaligned memory into dst.
4335// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_pd
4336FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
4337{
4338 return _mm_load_pd(p);
4339}
4340
4341// Load 128-bits of integer data from memory into dst. mem_addr does not need to
4342// be aligned on any particular boundary.
4343// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si128
4344FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
4345{
4346 return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
4347}
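//
// Usage note (editorial sketch, not upstream code): on NEON, vld1q has no
// 16-byte alignment requirement, so _mm_load_si128 and _mm_loadu_si128 map to
// the same load here; keep the variant that matches the x86 contract so the
// code stays correct when compiled back for SSE. For example:
//
//   int32_t buf[4] = {1, 2, 3, 4};                         // not necessarily 16-byte aligned
//   __m128i v = _mm_loadu_si128((const __m128i *) buf);    // unaligned-safe on both ISAs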
4348
4349// Load unaligned 32-bit integer from memory into the first element of dst.
4350// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si32
4351FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
4352{
4353 return vreinterpretq_m128i_s32(
4354 vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0));
4355}
4356
4357// Multiply packed signed 16-bit integers in a and b, producing intermediate
4358// signed 32-bit integers. Horizontally add adjacent pairs of intermediate
4359// 32-bit integers, and pack the results in dst.
4360// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd_epi16
4361FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
4362{
4363 int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
4364 vget_low_s16(vreinterpretq_s16_m128i(b)));
4365#if defined(__aarch64__) || defined(_M_ARM64)
4366 int32x4_t high =
4367 vmull_high_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b));
4368
4369 return vreinterpretq_m128i_s32(vpaddq_s32(low, high));
4370#else
4371 int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
4372 vget_high_s16(vreinterpretq_s16_m128i(b)));
4373
4374 int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));
4375 int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));
4376
4377 return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum));
4378#endif
4379}
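//
// Worked example (editorial sketch): _mm_madd_epi16 forms eight 32-bit
// products and adds adjacent pairs, so with hand-picked inputs
//
//   __m128i a = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
//   __m128i b = _mm_set_epi16(80, 70, 60, 50, 40, 30, 20, 10);
//   __m128i r = _mm_madd_epi16(a, b);
//
// the 32-bit lanes of r are {50, 250, 610, 1130} from low to high
// (1*10 + 2*20 = 50, 3*30 + 4*40 = 250, and so on).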
4380
4381// Conditionally store 8-bit integer elements from a into memory using mask
4382// (elements are not stored when the highest bit is not set in the corresponding
4383// element) and a non-temporal memory hint. mem_addr does not need to be aligned
4384// on any particular boundary.
4385// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmoveu_si128
4386FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr)
4387{
4388 int8x16_t shr_mask = vshrq_n_s8(vreinterpretq_s8_m128i(mask), 7);
4389 __m128 b = _mm_load_ps((const float *) mem_addr);
4390 int8x16_t masked =
4391 vbslq_s8(vreinterpretq_u8_s8(shr_mask), vreinterpretq_s8_m128i(a),
4392 vreinterpretq_s8_m128(b));
4393 vst1q_s8((int8_t *) mem_addr, masked);
4394}
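//
// Illustrative sketch (editorial, values chosen by hand): only bytes whose
// mask element has its most significant bit set are written back.
//
//   char buf[16] = {0};
//   __m128i data = _mm_set1_epi8(0x7f);
//   __m128i mask = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
//                               0, 0, 0, 0, -1, -1, 0, 0);
//   _mm_maskmoveu_si128(data, mask, buf);   // only buf[2] and buf[3] become 0x7f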
4395
4396// Compare packed signed 16-bit integers in a and b, and store packed maximum
4397// values in dst.
4398// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi16
4399FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
4400{
4401 return vreinterpretq_m128i_s16(
4402 vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4403}
4404
4405// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
4406// values in dst.
4407// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu8
4408FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
4409{
4410 return vreinterpretq_m128i_u8(
4411 vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
4412}
4413
4414// Compare packed double-precision (64-bit) floating-point elements in a and b,
4415// and store packed maximum values in dst.
4416// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd
4417FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)
4418{
4419#if defined(__aarch64__) || defined(_M_ARM64)
4420#if SSE2NEON_PRECISE_MINMAX
4421 float64x2_t _a = vreinterpretq_f64_m128d(a);
4422 float64x2_t _b = vreinterpretq_f64_m128d(b);
4423 return vreinterpretq_m128d_f64(vbslq_f64(vcgtq_f64(_a, _b), _a, _b));
4424#else
4425 return vreinterpretq_m128d_f64(
4426 vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4427#endif
4428#else
4429 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
4430 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
4431 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
4432 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
4433 uint64_t d[2];
4434 d[0] = (*(double *) &a0) > (*(double *) &b0) ? a0 : b0;
4435 d[1] = (*(double *) &a1) > (*(double *) &b1) ? a1 : b1;
4436
4437 return vreinterpretq_m128d_u64(vld1q_u64(d));
4438#endif
4439}
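//
// NaN-handling note (editorial, hedged): x86 MAXPD returns its second operand
// when either input is NaN, while the default vmaxq_f64 path above returns a
// NaN. Building with SSE2NEON_PRECISE_MINMAX=1 takes the bit-select path,
// which reproduces the x86 choice. For example (NAN from <math.h>):
//
//   __m128d a = _mm_set_pd(1.0, NAN);
//   __m128d b = _mm_set_pd(2.0, 5.0);
//   __m128d r = _mm_max_pd(a, b);   // low lane is 5.0 when
//                                   // SSE2NEON_PRECISE_MINMAX is enabled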
4440
4441// Compare the lower double-precision (64-bit) floating-point elements in a and
4442// b, store the maximum value in the lower element of dst, and copy the upper
4443// element from a to the upper element of dst.
4444// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_sd
4445FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b)
4446{
4447#if defined(__aarch64__) || defined(_M_ARM64)
4448 return _mm_move_sd(a, _mm_max_pd(a, b));
4449#else
4450 double *da = (double *) &a;
4451 double *db = (double *) &b;
4452 double c[2] = {da[0] > db[0] ? da[0] : db[0], da[1]};
4453 return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
4454#endif
4455}
4456
4457// Compare packed signed 16-bit integers in a and b, and store packed minimum
4458// values in dst.
4459// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi16
4460FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)
4461{
4462 return vreinterpretq_m128i_s16(
4463 vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4464}
4465
4466// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
4467// values in dst.
4468// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu8
4469FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
4470{
4471 return vreinterpretq_m128i_u8(
4472 vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
4473}
4474
4475// Compare packed double-precision (64-bit) floating-point elements in a and b,
4476// and store packed minimum values in dst.
4477// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pd
4478FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
4479{
4480#if defined(__aarch64__) || defined(_M_ARM64)
4481#if SSE2NEON_PRECISE_MINMAX
4482 float64x2_t _a = vreinterpretq_f64_m128d(a);
4483 float64x2_t _b = vreinterpretq_f64_m128d(b);
4484 return vreinterpretq_m128d_f64(vbslq_f64(vcltq_f64(_a, _b), _a, _b));
4485#else
4486 return vreinterpretq_m128d_f64(
4487 vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4488#endif
4489#else
4490 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
4491 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
4492 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
4493 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
4494 uint64_t d[2];
4495 d[0] = (*(double *) &a0) < (*(double *) &b0) ? a0 : b0;
4496 d[1] = (*(double *) &a1) < (*(double *) &b1) ? a1 : b1;
4497 return vreinterpretq_m128d_u64(vld1q_u64(d));
4498#endif
4499}
4500
4501// Compare the lower double-precision (64-bit) floating-point elements in a and
4502// b, store the minimum value in the lower element of dst, and copy the upper
4503// element from a to the upper element of dst.
4504// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_sd
4505FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b)
4506{
4507#if defined(__aarch64__) || defined(_M_ARM64)
4508 return _mm_move_sd(a, _mm_min_pd(a, b));
4509#else
4510 double *da = (double *) &a;
4511 double *db = (double *) &b;
4512 double c[2] = {da[0] < db[0] ? da[0] : db[0], da[1]};
4513 return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
4514#endif
4515}
4516
4517// Copy the lower 64-bit integer in a to the lower element of dst, and zero the
4518// upper element.
4519// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_epi64
4520FORCE_INLINE __m128i _mm_move_epi64(__m128i a)
4521{
4522 return vreinterpretq_m128i_s64(
4523 vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1));
4524}
4525
4526// Move the lower double-precision (64-bit) floating-point element from b to the
4527// lower element of dst, and copy the upper element from a to the upper element
4528// of dst.
4529// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_sd
4530FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b)
4531{
4532 return vreinterpretq_m128d_f32(
4533 vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(b)),
4534 vget_high_f32(vreinterpretq_f32_m128d(a))));
4535}
4536
4537// Create mask from the most significant bit of each 8-bit element in a, and
4538// store the result in dst.
4539// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_epi8
4540FORCE_INLINE int _mm_movemask_epi8(__m128i a)
4541{
4542 // Use increasingly wide shifts+adds to collect the sign bits
4543 // together.
4544 // Since the widening shifts would be rather confusing to follow in little
4545 // endian, everything will be illustrated in big endian order instead. This
4546 // has a different result - the bits would actually be reversed on a big
4547 // endian machine.
4548
4549 // Starting input (only half the elements are shown):
4550 // 89 ff 1d c0 00 10 99 33
4551 uint8x16_t input = vreinterpretq_u8_m128i(a);
4552
4553 // Shift out everything but the sign bits with an unsigned shift right.
4554 //
4555 // Bytes of the vector:
4556 // 89 ff 1d c0 00 10 99 33
4557 // \ \ \ \ \ \ \ \ high_bits = (uint16x4_t)(input >> 7)
4558 // | | | | | | | |
4559 // 01 01 00 01 00 00 01 00
4560 //
4561 // Bits of first important lane(s):
4562 // 10001001 (89)
4563 // \______
4564 // |
4565 // 00000001 (01)
4566 uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
4567
4568 // Merge the even lanes together with a 16-bit unsigned shift right + add.
4569 // 'xx' represents garbage data which will be ignored in the final result.
4570 // In the important bytes, the add functions like a binary OR.
4571 //
4572 // 01 01 00 01 00 00 01 00
4573 // \_ | \_ | \_ | \_ | paired16 = (uint32x4_t)(input + (input >> 7))
4574 // \| \| \| \|
4575 // xx 03 xx 01 xx 00 xx 02
4576 //
4577 // 00000001 00000001 (01 01)
4578 // \_______ |
4579 // \|
4580 // xxxxxxxx xxxxxx11 (xx 03)
4581 uint32x4_t paired16 =
4582 vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
4583
4584 // Repeat with a wider 32-bit shift + add.
4585 // xx 03 xx 01 xx 00 xx 02
4586 // \____ | \____ | paired32 = (uint64x1_t)(paired16 + (paired16 >>
4587 // 14))
4588 // \| \|
4589 // xx xx xx 0d xx xx xx 02
4590 //
4591 // 00000011 00000001 (03 01)
4592 // \\_____ ||
4593 // '----.\||
4594 // xxxxxxxx xxxx1101 (xx 0d)
4595 uint64x2_t paired32 =
4596 vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
4597
4598 // Last, an even wider 64-bit shift + add to get our result in the low 8 bit
4599 // lanes. xx xx xx 0d xx xx xx 02
4600 // \_________ | paired64 = (uint8x8_t)(paired32 + (paired32 >>
4601 // 28))
4602 // \|
4603 // xx xx xx xx xx xx xx d2
4604 //
4605 // 00001101 00000010 (0d 02)
4606 // \ \___ | |
4607 // '---. \| |
4608 // xxxxxxxx 11010010 (xx d2)
4609 uint8x16_t paired64 =
4610 vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
4611
4612 // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
4613 // xx xx xx xx xx xx xx d2
4614 // || return paired64[0]
4615 // d2
4616 // Note: Little endian would return the correct value 4b (01001011) instead.
4617 return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);
4618}
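//
// Quick check (editorial sketch): bit i of the result mirrors the sign bit of
// byte i, so
//
//   __m128i v = _mm_set_epi8(-1, 0, 0, 0, 0, 0, 0, 0,
//                            0, 0, 0, 0, 0, 0, 0, -1);
//   int m = _mm_movemask_epi8(v);   // m == 0x8001 (bytes 0 and 15 are negative)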
4619
4620// Set each bit of mask dst based on the most significant bit of the
4621// corresponding packed double-precision (64-bit) floating-point element in a.
4622// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pd
4623FORCE_INLINE int _mm_movemask_pd(__m128d a)
4624{
4625 uint64x2_t input = vreinterpretq_u64_m128d(a);
4626 uint64x2_t high_bits = vshrq_n_u64(input, 63);
4627 return (int) (vgetq_lane_u64(high_bits, 0) |
4628 (vgetq_lane_u64(high_bits, 1) << 1));
4629}
4630
4631// Copy the lower 64-bit integer in a to dst.
4632// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movepi64_pi64
4633FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)
4634{
4635 return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a)));
4636}
4637
4638// Copy the 64-bit integer a to the lower element of dst, and zero the upper
4639// element.
4640// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movpi64_epi64
4641FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a)
4642{
4643 return vreinterpretq_m128i_s64(
4644 vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0)));
4645}
4646
4647// Multiply the low unsigned 32-bit integers from each packed 64-bit element in
4648// a and b, and store the unsigned 64-bit results in dst.
4649// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epu32
4650FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
4651{
4652 // vmull_u32 upcasts instead of masking, so we downcast.
4653 uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a));
4654 uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b));
4655 return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo));
4656}
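//
// Worked example (editorial sketch): only the even 32-bit lanes (0 and 2)
// take part, and the products are kept at full 64-bit width.
//
//   __m128i a = _mm_set_epi32(0, 3, 0, 100000);   // 32-bit lanes {100000, 0, 3, 0}
//   __m128i b = _mm_set_epi32(0, 7, 0, 100000);
//   __m128i r = _mm_mul_epu32(a, b);              // 64-bit lanes {10000000000, 21}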
4657
4658// Multiply packed double-precision (64-bit) floating-point elements in a and b,
4659// and store the results in dst.
4660// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_pd
4661FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)
4662{
4663#if defined(__aarch64__) || defined(_M_ARM64)
4664 return vreinterpretq_m128d_f64(
4665 vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4666#else
4667 double *da = (double *) &a;
4668 double *db = (double *) &b;
4669 double c[2];
4670 c[0] = da[0] * db[0];
4671 c[1] = da[1] * db[1];
4672 return vld1q_f32((float32_t *) c);
4673#endif
4674}
4675
4676// Multiply the lower double-precision (64-bit) floating-point element in a and
4677// b, store the result in the lower element of dst, and copy the upper element
4678// from a to the upper element of dst.
4679// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mul_sd
4680FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b)
4681{
4682 return _mm_move_sd(a, _mm_mul_pd(a, b));
4683}
4684
4685// Multiply the low unsigned 32-bit integers from a and b, and store the
4686// unsigned 64-bit result in dst.
4687// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_su32
4688FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b)
4689{
4690 return vreinterpret_m64_u64(vget_low_u64(
4691 vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b))));
4692}
4693
4694// Multiply the packed signed 16-bit integers in a and b, producing intermediate
4695// 32-bit integers, and store the high 16 bits of the intermediate integers in
4696// dst.
4697// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epi16
4698FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
4699{
4700 /* FIXME: issue with large values because of result saturation */
4701 // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a),
4702 // vreinterpretq_s16_m128i(b)); /* =2*a*b */ return
4703 // vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1));
4704 int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a));
4705 int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b));
4706 int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
4707 int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a));
4708 int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b));
4709 int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
4710 uint16x8x2_t r =
4711 vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
4712 return vreinterpretq_m128i_u16(r.val[1]);
4713}
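//
// Worked example (editorial sketch): pairing _mm_mulhi_epi16 with
// _mm_mullo_epi16 (defined further below) recovers the full 32-bit product.
//
//   __m128i a = _mm_set1_epi16(300);
//   __m128i b = _mm_set1_epi16(400);
//   __m128i hi = _mm_mulhi_epi16(a, b);   // every lane: 120000 >> 16 == 1
//   __m128i lo = _mm_mullo_epi16(a, b);   // every lane: 120000 & 0xffff == 0xd4c0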
4714
4715// Multiply the packed unsigned 16-bit integers in a and b, producing
4716// intermediate 32-bit integers, and store the high 16 bits of the intermediate
4717// integers in dst.
4718// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epu16
4719FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
4720{
4721 uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a));
4722 uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b));
4723 uint32x4_t ab3210 = vmull_u16(a3210, b3210);
4724#if defined(__aarch64__) || defined(_M_ARM64)
4725 uint32x4_t ab7654 =
4726 vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
4727 uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210),
4728 vreinterpretq_u16_u32(ab7654));
4729 return vreinterpretq_m128i_u16(r);
4730#else
4731 uint16x4_t a7654 = vget_high_u16(vreinterpretq_u16_m128i(a));
4732 uint16x4_t b7654 = vget_high_u16(vreinterpretq_u16_m128i(b));
4733 uint32x4_t ab7654 = vmull_u16(a7654, b7654);
4734 uint16x8x2_t r =
4735 vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));
4736 return vreinterpretq_m128i_u16(r.val[1]);
4737#endif
4738}
4739
4740// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit
4741// integers, and store the low 16 bits of the intermediate integers in dst.
4742// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi16
4743FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
4744{
4745 return vreinterpretq_m128i_s16(
4746 vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4747}
4748
4749// Compute the bitwise OR of packed double-precision (64-bit) floating-point
4750// elements in a and b, and store the results in dst.
4751// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_or_pd
4752FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b)
4753{
4754 return vreinterpretq_m128d_s64(
4755 vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
4756}
4757
4758// Compute the bitwise OR of 128 bits (representing integer data) in a and b,
4759// and store the result in dst.
4760// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_si128
4761FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b)
4762{
4763 return vreinterpretq_m128i_s32(
4764 vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
4765}
4766
4767// Convert packed signed 16-bit integers from a and b to packed 8-bit integers
4768// using signed saturation, and store the results in dst.
4769// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16
4770FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
4771{
4772 return vreinterpretq_m128i_s8(
4773 vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)),
4774 vqmovn_s16(vreinterpretq_s16_m128i(b))));
4775}
4776
4777// Convert packed signed 32-bit integers from a and b to packed 16-bit integers
4778// using signed saturation, and store the results in dst.
4779// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32
4780FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
4781{
4782 return vreinterpretq_m128i_s16(
4783 vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)),
4784 vqmovn_s32(vreinterpretq_s32_m128i(b))));
4785}
4786
4787// Convert packed signed 16-bit integers from a and b to packed 8-bit integers
4788// using unsigned saturation, and store the results in dst.
4789// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16
4790FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
4791{
4792 return vreinterpretq_m128i_u8(
4793 vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)),
4794 vqmovun_s16(vreinterpretq_s16_m128i(b))));
4795}
4796
4797// Pause the processor. This is typically used in spin-wait loops; depending
4798// on the x86 processor, the delay is typically in the 40-100 cycle range. The
4799// 'yield' instruction isn't a good fit because it's effectively a nop on most
4800// Arm cores. Experience with several databases has shown that an 'isb' is
4801// a reasonable approximation.
4802// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_pause
4803FORCE_INLINE void _mm_pause(void)
4804{
4805#if defined(_MSC_VER)
4806 __isb(_ARM64_BARRIER_SY);
4807#else
4808 __asm__ __volatile__("isb\n");
4809#endif
4810}
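//
// Typical spin-wait shape (editorial sketch; `flag` is a hypothetical C11
// atomic_bool supplied by the caller, <stdatomic.h> assumed):
//
//   while (!atomic_load_explicit(&flag, memory_order_acquire))
//       _mm_pause();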
4811
4812// Compute the absolute differences of packed unsigned 8-bit integers in a and
4813// b, then horizontally sum each consecutive 8 differences to produce two
4814// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
4815// 16 bits of 64-bit elements in dst.
4816// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8
4817FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
4818{
4819 uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
4820 return vreinterpretq_m128i_u64(vpaddlq_u32(vpaddlq_u16(t)));
4821}
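//
// Worked example (editorial sketch): with every byte differing by 3, each
// 8-byte half sums to 8 * 3 = 24.
//
//   __m128i a = _mm_set1_epi8(10);
//   __m128i b = _mm_set1_epi8(13);
//   __m128i r = _mm_sad_epu8(a, b);   // both 64-bit lanes hold 24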
4822
4823// Set packed 16-bit integers in dst with the supplied values.
4824// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi16
4825FORCE_INLINE __m128i _mm_set_epi16(short i7,
4826 short i6,
4827 short i5,
4828 short i4,
4829 short i3,
4830 short i2,
4831 short i1,
4832 short i0)
4833{
4834 int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
4835 return vreinterpretq_m128i_s16(vld1q_s16(data));
4836}
4837
4838// Set packed 32-bit integers in dst with the supplied values.
4839// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi32
4840FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
4841{
4842 int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3};
4843 return vreinterpretq_m128i_s32(vld1q_s32(data));
4844}
4845
4846// Set packed 64-bit integers in dst with the supplied values.
4847// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64
4848FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2)
4849{
4850 return _mm_set_epi64x(vget_lane_s64(i1, 0), vget_lane_s64(i2, 0));
4851}
4852
4853// Set packed 64-bit integers in dst with the supplied values.
4854// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64x
4855FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2)
4856{
4857 return vreinterpretq_m128i_s64(
4858 vcombine_s64(vcreate_s64(i2), vcreate_s64(i1)));
4859}
4860
4861// Set packed 8-bit integers in dst with the supplied values.
4862// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi8
4863FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
4864 signed char b14,
4865 signed char b13,
4866 signed char b12,
4867 signed char b11,
4868 signed char b10,
4869 signed char b9,
4870 signed char b8,
4871 signed char b7,
4872 signed char b6,
4873 signed char b5,
4874 signed char b4,
4875 signed char b3,
4876 signed char b2,
4877 signed char b1,
4878 signed char b0)
4879{
4880 int8_t ALIGN_STRUCT(16)
4881 data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
4882 (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
4883 (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
4884 (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
4885 return (__m128i) vld1q_s8(data);
4886}
4887
4888// Set packed double-precision (64-bit) floating-point elements in dst with the
4889// supplied values.
4890// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd
4891FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
4892{
4893 double ALIGN_STRUCT(16) data[2] = {e0, e1};
4894#if defined(__aarch64__) || defined(_M_ARM64)
4895 return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data));
4896#else
4897 return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data));
4898#endif
4899}
4900
4901// Broadcast double-precision (64-bit) floating-point value a to all elements of
4902// dst.
4903// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd1
4904#define _mm_set_pd1 _mm_set1_pd
4905
4906// Copy double-precision (64-bit) floating-point element a to the lower element
4907// of dst, and zero the upper element.
4908// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_sd
4909FORCE_INLINE __m128d _mm_set_sd(double a)
4910{
4911#if defined(__aarch64__) || defined(_M_ARM64)
4912 return vreinterpretq_m128d_f64(vsetq_lane_f64(a, vdupq_n_f64(0), 0));
4913#else
4914 return _mm_set_pd(0, a);
4915#endif
4916}
4917
4918// Broadcast 16-bit integer a to all elements of dst.
4919// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi16
4920FORCE_INLINE __m128i _mm_set1_epi16(short w)
4921{
4922 return vreinterpretq_m128i_s16(vdupq_n_s16(w));
4923}
4924
4925// Broadcast 32-bit integer a to all elements of dst.
4926// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi32
4927FORCE_INLINE __m128i _mm_set1_epi32(int _i)
4928{
4929 return vreinterpretq_m128i_s32(vdupq_n_s32(_i));
4930}
4931
4932// Broadcast 64-bit integer a to all elements of dst.
4933// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64
4934FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i)
4935{
4936 return vreinterpretq_m128i_s64(vdupq_lane_s64(_i, 0));
4937}
4938
4939// Broadcast 64-bit integer a to all elements of dst.
4940// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64x
4941FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
4942{
4943 return vreinterpretq_m128i_s64(vdupq_n_s64(_i));
4944}
4945
4946// Broadcast 8-bit integer a to all elements of dst.
4947// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi8
4948FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
4949{
4950 return vreinterpretq_m128i_s8(vdupq_n_s8(w));
4951}
4952
4953// Broadcast double-precision (64-bit) floating-point value a to all elements of
4954// dst.
4955// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_pd
4956FORCE_INLINE __m128d _mm_set1_pd(double d)
4957{
4958#if defined(__aarch64__) || defined(_M_ARM64)
4959 return vreinterpretq_m128d_f64(vdupq_n_f64(d));
4960#else
4961 return vreinterpretq_m128d_s64(vdupq_n_s64(*(int64_t *) &d));
4962#endif
4963}
4964
4965// Set packed 16-bit integers in dst with the supplied values in reverse order.
4966// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi16
4967FORCE_INLINE __m128i _mm_setr_epi16(short w0,
4968 short w1,
4969 short w2,
4970 short w3,
4971 short w4,
4972 short w5,
4973 short w6,
4974 short w7)
4975{
4976 int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7};
4977 return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data));
4978}
4979
4980// Set packed 32-bit integers in dst with the supplied values in reverse order.
4981// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi32
4982FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
4983{
4984 int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0};
4985 return vreinterpretq_m128i_s32(vld1q_s32(data));
4986}
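//
// Argument-order reminder (editorial sketch): _mm_set_* lists elements from
// the highest lane down, while _mm_setr_* lists them in memory order, so these
// two produce the same vector:
//
//   __m128i x = _mm_set_epi32(3, 2, 1, 0);    // element 0 == 0, element 3 == 3
//   __m128i y = _mm_setr_epi32(0, 1, 2, 3);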
4987
4988// Set packed 64-bit integers in dst with the supplied values in reverse order.
4989// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi64
4990FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0)
4991{
4992 return vreinterpretq_m128i_s64(vcombine_s64(e1, e0));
4993}
4994
4995// Set packed 8-bit integers in dst with the supplied values in reverse order.
4996// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi8
4997FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,
4998 signed char b1,
4999 signed char b2,
5000 signed char b3,
5001 signed char b4,
5002 signed char b5,
5003 signed char b6,
5004 signed char b7,
5005 signed char b8,
5006 signed char b9,
5007 signed char b10,
5008 signed char b11,
5009 signed char b12,
5010 signed char b13,
5011 signed char b14,
5012 signed char b15)
5013{
5014 int8_t ALIGN_STRUCT(16)
5015 data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
5016 (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
5017 (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
5018 (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
5019 return (__m128i) vld1q_s8(data);
5020}
5021
5022// Set packed double-precision (64-bit) floating-point elements in dst with the
5023// supplied values in reverse order.
5024// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_pd
5025FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0)
5026{
5027 return _mm_set_pd(e0, e1);
5028}
5029
5030// Return vector of type __m128d with all elements set to zero.
5031// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_pd
5032FORCE_INLINE __m128d _mm_setzero_pd(void)
5033{
5034#if defined(__aarch64__) || defined(_M_ARM64)
5035 return vreinterpretq_m128d_f64(vdupq_n_f64(0));
5036#else
5037 return vreinterpretq_m128d_f32(vdupq_n_f32(0));
5038#endif
5039}
5040
5041// Return vector of type __m128i with all elements set to zero.
5042// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_si128
5043FORCE_INLINE __m128i _mm_setzero_si128(void)
5044{
5045 return vreinterpretq_m128i_s32(vdupq_n_s32(0));
5046}
5047
5048// Shuffle 32-bit integers in a using the control in imm8, and store the results
5049// in dst.
5050// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi32
5051// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a,
5052// __constrange(0,255) int imm)
5053#if defined(_sse2neon_shuffle)
5054#define _mm_shuffle_epi32(a, imm) \
5055 __extension__({ \
5056 int32x4_t _input = vreinterpretq_s32_m128i(a); \
5057 int32x4_t _shuf = \
5058 vshuffleq_s32(_input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \
5059 ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); \
5060 vreinterpretq_m128i_s32(_shuf); \
5061 })
5062#else // generic
5063#define _mm_shuffle_epi32(a, imm) \
5064 _sse2neon_define1( \
5065 __m128i, a, __m128i ret; switch (imm) { \
5066 case _MM_SHUFFLE(1, 0, 3, 2): \
5067 ret = _mm_shuffle_epi_1032(_a); \
5068 break; \
5069 case _MM_SHUFFLE(2, 3, 0, 1): \
5070 ret = _mm_shuffle_epi_2301(_a); \
5071 break; \
5072 case _MM_SHUFFLE(0, 3, 2, 1): \
5073 ret = _mm_shuffle_epi_0321(_a); \
5074 break; \
5075 case _MM_SHUFFLE(2, 1, 0, 3): \
5076 ret = _mm_shuffle_epi_2103(_a); \
5077 break; \
5078 case _MM_SHUFFLE(1, 0, 1, 0): \
5079 ret = _mm_shuffle_epi_1010(_a); \
5080 break; \
5081 case _MM_SHUFFLE(1, 0, 0, 1): \
5082 ret = _mm_shuffle_epi_1001(_a); \
5083 break; \
5084 case _MM_SHUFFLE(0, 1, 0, 1): \
5085 ret = _mm_shuffle_epi_0101(_a); \
5086 break; \
5087 case _MM_SHUFFLE(2, 2, 1, 1): \
5088 ret = _mm_shuffle_epi_2211(_a); \
5089 break; \
5090 case _MM_SHUFFLE(0, 1, 2, 2): \
5091 ret = _mm_shuffle_epi_0122(_a); \
5092 break; \
5093 case _MM_SHUFFLE(3, 3, 3, 2): \
5094 ret = _mm_shuffle_epi_3332(_a); \
5095 break; \
5096 case _MM_SHUFFLE(0, 0, 0, 0): \
5097 ret = _mm_shuffle_epi32_splat(_a, 0); \
5098 break; \
5099 case _MM_SHUFFLE(1, 1, 1, 1): \
5100 ret = _mm_shuffle_epi32_splat(_a, 1); \
5101 break; \
5102 case _MM_SHUFFLE(2, 2, 2, 2): \
5103 ret = _mm_shuffle_epi32_splat(_a, 2); \
5104 break; \
5105 case _MM_SHUFFLE(3, 3, 3, 3): \
5106 ret = _mm_shuffle_epi32_splat(_a, 3); \
5107 break; \
5108 default: \
5109 ret = _mm_shuffle_epi32_default(_a, (imm)); \
5110 break; \
5111 } _sse2neon_return(ret);)
5112#endif
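//
// Worked example (editorial sketch): _MM_SHUFFLE(d, c, b, a) selects source
// element a for lane 0, b for lane 1, c for lane 2 and d for lane 3.
//
//   __m128i v = _mm_set_epi32(33, 22, 11, 0);                  // lanes {0, 11, 22, 33}
//   __m128i r = _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 1, 2, 3)); // lanes {33, 22, 11, 0}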
5113
5114// Shuffle double-precision (64-bit) floating-point elements using the control
5115// in imm8, and store the results in dst.
5116// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pd
5117#ifdef _sse2neon_shuffle
5118#define _mm_shuffle_pd(a, b, imm8) \
5119 vreinterpretq_m128d_s64( \
5120 vshuffleq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), \
5121 imm8 & 0x1, ((imm8 & 0x2) >> 1) + 2))
5122#else
5123#define _mm_shuffle_pd(a, b, imm8) \
5124 _mm_castsi128_pd(_mm_set_epi64x( \
5125 vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \
5126 vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1)))
5127#endif
5128
5129// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a,
5130// __constrange(0,255) int imm)
5131#if defined(_sse2neon_shuffle)
5132#define _mm_shufflehi_epi16(a, imm) \
5133 __extension__({ \
5134 int16x8_t _input = vreinterpretq_s16_m128i(a); \
5135 int16x8_t _shuf = \
5136 vshuffleq_s16(_input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4, \
5137 (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \
5138 (((imm) >> 6) & 0x3) + 4); \
5139 vreinterpretq_m128i_s16(_shuf); \
5140 })
5141#else // generic
5142#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm))
5143#endif
5144
5145// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a,
5146// __constrange(0,255) int imm)
5147#if defined(_sse2neon_shuffle)
5148#define _mm_shufflelo_epi16(a, imm) \
5149 __extension__({ \
5150 int16x8_t _input = vreinterpretq_s16_m128i(a); \
5151 int16x8_t _shuf = vshuffleq_s16( \
5152 _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3), \
5153 (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \
5154 vreinterpretq_m128i_s16(_shuf); \
5155 })
5156#else // generic
5157#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm))
5158#endif
5159
5160// Shift packed 16-bit integers in a left by count while shifting in zeros, and
5161// store the results in dst.
5162// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi16
5163FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
5164{
5165 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5166 if (_sse2neon_unlikely(c & ~15))
5167 return _mm_setzero_si128();
5168
5169 int16x8_t vc = vdupq_n_s16((int16_t) c);
5170 return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc));
5171}
5172
5173// Shift packed 32-bit integers in a left by count while shifting in zeros, and
5174// store the results in dst.
5175// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi32
5176FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)
5177{
5178 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5179 if (_sse2neon_unlikely(c & ~31))
5180 return _mm_setzero_si128();
5181
5182 int32x4_t vc = vdupq_n_s32((int32_t) c);
5183 return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc));
5184}
5185
5186// Shift packed 64-bit integers in a left by count while shifting in zeros, and
5187// store the results in dst.
5188// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi64
5189FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)
5190{
5191 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5192 if (_sse2neon_unlikely(c & ~63))
5193 return _mm_setzero_si128();
5194
5195 int64x2_t vc = vdupq_n_s64((int64_t) c);
5196 return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc));
5197}
5198
5199// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and
5200// store the results in dst.
5201// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi16
5202FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm)
5203{
5204 if (_sse2neon_unlikely(imm & ~15))
5205 return _mm_setzero_si128();
5206 return vreinterpretq_m128i_s16(
5207 vshlq_s16(vreinterpretq_s16_m128i(a), vdupq_n_s16(imm)));
5208}
5209
5210// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and
5211// store the results in dst.
5212// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi32
5213FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
5214{
5215 if (_sse2neon_unlikely(imm & ~31))
5216 return _mm_setzero_si128();
5217 return vreinterpretq_m128i_s32(
5218 vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm)));
5219}
5220
5221// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and
5222// store the results in dst.
5223// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi64
5224FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
5225{
5226 if (_sse2neon_unlikely(imm & ~63))
5227 return _mm_setzero_si128();
5228 return vreinterpretq_m128i_s64(
5229 vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm)));
5230}
5231
5232// Shift a left by imm8 bytes while shifting in zeros, and store the results in
5233// dst.
5234// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si128
5235#define _mm_slli_si128(a, imm) \
5236 _sse2neon_define1( \
5237 __m128i, a, int8x16_t ret; \
5238 if (_sse2neon_unlikely(imm == 0)) ret = vreinterpretq_s8_m128i(_a); \
5239 else if (_sse2neon_unlikely((imm) & ~15)) ret = vdupq_n_s8(0); \
5240 else ret = vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_m128i(_a), \
5241 ((imm <= 0 || imm > 15) ? 0 : (16 - imm))); \
5242 _sse2neon_return(vreinterpretq_m128i_s8(ret));)
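//
// Worked example (editorial sketch): the shift is by whole bytes, with zeros
// entering at the low end.
//
//   __m128i v = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7,
//                             8, 9, 10, 11, 12, 13, 14, 15);
//   __m128i r = _mm_slli_si128(v, 3);   // bytes {0, 0, 0, 0, 1, 2, ..., 12}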
5243
5244// Compute the square root of packed double-precision (64-bit) floating-point
5245// elements in a, and store the results in dst.
5246// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_pd
5247FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a)
5248{
5249#if defined(__aarch64__) || defined(_M_ARM64)
5250 return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a)));
5251#else
5252 double a0 = sqrt(((double *) &a)[0]);
5253 double a1 = sqrt(((double *) &a)[1]);
5254 return _mm_set_pd(a1, a0);
5255#endif
5256}
5257
5258// Compute the square root of the lower double-precision (64-bit) floating-point
5259// element in b, store the result in the lower element of dst, and copy the
5260// upper element from a to the upper element of dst.
5261// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_sd
5262FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b)
5263{
5264#if defined(__aarch64__) || defined(_M_ARM64)
5265 return _mm_move_sd(a, _mm_sqrt_pd(b));
5266#else
5267 return _mm_set_pd(((double *) &a)[1], sqrt(((double *) &b)[0]));
5268#endif
5269}
5270
5271// Shift packed 16-bit integers in a right by count while shifting in sign bits,
5272// and store the results in dst.
5273// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi16
5274FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
5275{
5276 int64_t c = vgetq_lane_s64(count, 0);
5277 if (_sse2neon_unlikely(c & ~15))
5278 return _mm_cmplt_epi16(a, _mm_setzero_si128());
5279 return vreinterpretq_m128i_s16(
5280 vshlq_s16((int16x8_t) a, vdupq_n_s16((int) -c)));
5281}
5282
5283// Shift packed 32-bit integers in a right by count while shifting in sign bits,
5284// and store the results in dst.
5285// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi32
5286FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
5287{
5288 int64_t c = vgetq_lane_s64(count, 0);
5289 if (_sse2neon_unlikely(c & ~31))
5290 return _mm_cmplt_epi32(a, _mm_setzero_si128());
5291 return vreinterpretq_m128i_s32(
5292 vshlq_s32((int32x4_t) a, vdupq_n_s32((int) -c)));
5293}
5294
5295// Shift packed 16-bit integers in a right by imm8 while shifting in sign
5296// bits, and store the results in dst.
5297// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi16
5298FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
5299{
5300 const int count = (imm & ~15) ? 15 : imm;
5301 return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count));
5302}
5303
5304// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits,
5305// and store the results in dst.
5306// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi32
5307// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)
5308#define _mm_srai_epi32(a, imm) \
5309 _sse2neon_define0( \
5310 __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) == 0)) { \
5311 ret = _a; \
5312 } else if (_sse2neon_likely(0 < (imm) && (imm) < 32)) { \
5313 ret = vreinterpretq_m128i_s32( \
5314 vshlq_s32(vreinterpretq_s32_m128i(_a), vdupq_n_s32(-(imm)))); \
5315 } else { \
5316 ret = vreinterpretq_m128i_s32( \
5317 vshrq_n_s32(vreinterpretq_s32_m128i(_a), 31)); \
5318 } _sse2neon_return(ret);)
5319
5320// Shift packed 16-bit integers in a right by count while shifting in zeros, and
5321// store the results in dst.
5322// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi16
5323FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count)
5324{
5325 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5326 if (_sse2neon_unlikely(c & ~15))
5327 return _mm_setzero_si128();
5328
5329 int16x8_t vc = vdupq_n_s16(-(int16_t) c);
5330 return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc));
5331}
5332
5333// Shift packed 32-bit integers in a right by count while shifting in zeros, and
5334// store the results in dst.
5335// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi32
5336FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count)
5337{
5338 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5339 if (_sse2neon_unlikely(c & ~31))
5340 return _mm_setzero_si128();
5341
5342 int32x4_t vc = vdupq_n_s32(-(int32_t) c);
5343 return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc));
5344}
5345
5346// Shift packed 64-bit integers in a right by count while shifting in zeros, and
5347// store the results in dst.
5348// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi64
5349FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
5350{
5351 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5352 if (_sse2neon_unlikely(c & ~63))
5353 return _mm_setzero_si128();
5354
5355 int64x2_t vc = vdupq_n_s64(-(int64_t) c);
5356 return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc));
5357}
5358
5359// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and
5360// store the results in dst.
5361// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi16
5362#define _mm_srli_epi16(a, imm) \
5363 _sse2neon_define0( \
5364 __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~15)) { \
5365 ret = _mm_setzero_si128(); \
5366 } else { \
5367 ret = vreinterpretq_m128i_u16( \
5368 vshlq_u16(vreinterpretq_u16_m128i(_a), vdupq_n_s16(-(imm)))); \
5369 } _sse2neon_return(ret);)
5370
5371// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and
5372// store the results in dst.
5373// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi32
5374// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)
5375#define _mm_srli_epi32(a, imm) \
5376 _sse2neon_define0( \
5377 __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~31)) { \
5378 ret = _mm_setzero_si128(); \
5379 } else { \
5380 ret = vreinterpretq_m128i_u32( \
5381 vshlq_u32(vreinterpretq_u32_m128i(_a), vdupq_n_s32(-(imm)))); \
5382 } _sse2neon_return(ret);)
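//
// Worked example (editorial sketch): _mm_srai_* replicates the sign bit,
// _mm_srli_* shifts in zeros.
//
//   __m128i v = _mm_set1_epi32(-16);      // 0xfffffff0 per lane
//   __m128i s1 = _mm_srai_epi32(v, 2);    // 0xfffffffc per lane (-4)
//   __m128i s2 = _mm_srli_epi32(v, 2);    // 0x3ffffffc per lane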
5383
5384// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and
5385// store the results in dst.
5386// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi64
5387#define _mm_srli_epi64(a, imm) \
5388 _sse2neon_define0( \
5389 __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~63)) { \
5390 ret = _mm_setzero_si128(); \
5391 } else { \
5392 ret = vreinterpretq_m128i_u64( \
5393 vshlq_u64(vreinterpretq_u64_m128i(_a), vdupq_n_s64(-(imm)))); \
5394 } _sse2neon_return(ret);)
5395
5396// Shift a right by imm8 bytes while shifting in zeros, and store the results in
5397// dst.
5398// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_si128
5399#define _mm_srli_si128(a, imm) \
5400 _sse2neon_define1( \
5401 __m128i, a, int8x16_t ret; \
5402 if (_sse2neon_unlikely((imm) & ~15)) ret = vdupq_n_s8(0); \
5403 else ret = vextq_s8(vreinterpretq_s8_m128i(_a), vdupq_n_s8(0), \
5404 (imm > 15 ? 0 : imm)); \
5405 _sse2neon_return(vreinterpretq_m128i_s8(ret));)
5406
5407// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
5408// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
5409// or a general-protection exception may be generated.
5410// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd
5411FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
5412{
5413#if defined(__aarch64__) || defined(_M_ARM64)
5414 vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a));
5415#else
5416 vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a));
5417#endif
5418}
5419
5420// Store the lower double-precision (64-bit) floating-point element from a into
5421// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
5422// boundary or a general-protection exception may be generated.
5423// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd1
5424FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a)
5425{
5426#if defined(__aarch64__) || defined(_M_ARM64)
5427 float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a));
5428 vst1q_f64((float64_t *) mem_addr,
5429 vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low)));
5430#else
5431 float32x2_t a_low = vget_low_f32(vreinterpretq_f32_m128d(a));
5432 vst1q_f32((float32_t *) mem_addr,
5433 vreinterpretq_f32_m128d(vcombine_f32(a_low, a_low)));
5434#endif
5435}
5436
5437// Store the lower double-precision (64-bit) floating-point element from a into
5438// memory. mem_addr does not need to be aligned on any particular boundary.
5439// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_store_sd
5440FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a)
5441{
5442#if defined(__aarch64__) || defined(_M_ARM64)
5443 vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
5444#else
5445 vst1_u64((uint64_t *) mem_addr, vget_low_u64(vreinterpretq_u64_m128d(a)));
5446#endif
5447}
5448
5449// Store 128-bits of integer data from a into memory. mem_addr must be aligned
5450// on a 16-byte boundary or a general-protection exception may be generated.
5451// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_si128
5452FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
5453{
5454 vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
5455}
5456
5457// Store the lower double-precision (64-bit) floating-point element from a into
5458// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
5459// boundary or a general-protection exception may be generated.
5460// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#expand=9,526,5601&text=_mm_store1_pd
5461#define _mm_store1_pd _mm_store_pd1
5462
5463// Store the upper double-precision (64-bit) floating-point element from a into
5464// memory.
5465// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pd
5466FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a)
5467{
5468#if defined(__aarch64__) || defined(_M_ARM64)
5469 vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a)));
5470#else
5471 vst1_f32((float32_t *) mem_addr, vget_high_f32(vreinterpretq_f32_m128d(a)));
5472#endif
5473}
5474
5475// Store 64-bit integer from the first element of a into memory.
5476// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_epi64
5477FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
5478{
5479 vst1_u64((uint64_t *) a, vget_low_u64(vreinterpretq_u64_m128i(b)));
5480}
5481
5482// Store the lower double-precision (64-bit) floating-point element from a into
5483// memory.
5484// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pd
5485FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a)
5486{
5487#if defined(__aarch64__) || defined(_M_ARM64)
5488 vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
5489#else
5490 vst1_f32((float32_t *) mem_addr, vget_low_f32(vreinterpretq_f32_m128d(a)));
5491#endif
5492}
5493
5494// Store 2 double-precision (64-bit) floating-point elements from a into memory
5495// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
5496// general-protection exception may be generated.
5497// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_pd
5498FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a)
5499{
5500 float32x4_t f = vreinterpretq_f32_m128d(a);
5501 _mm_store_pd(mem_addr, vreinterpretq_m128d_f32(vextq_f32(f, f, 2)));
5502}
5503
5504// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
5505// elements) from a into memory. mem_addr does not need to be aligned on any
5506// particular boundary.
5507// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_pd
5508FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a)
5509{
5510 _mm_store_pd(mem_addr, a);
5511}
5512
5513// Store 128-bits of integer data from a into memory. mem_addr does not need to
5514// be aligned on any particular boundary.
5515// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si128
5516FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
5517{
5518 vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
5519}
5520
5521// Store 32-bit integer from the first element of a into memory. mem_addr does
5522// not need to be aligned on any particular boundary.
5523// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si32
5524FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a)
5525{
5526 vst1q_lane_s32((int32_t *) p, vreinterpretq_s32_m128i(a), 0);
5527}
5528
5529// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
5530// elements) from a into memory using a non-temporal memory hint. mem_addr must
5531// be aligned on a 16-byte boundary or a general-protection exception may be
5532// generated.
5533// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pd
5534FORCE_INLINE void _mm_stream_pd(double *p, __m128d a)
5535{
5536#if __has_builtin(__builtin_nontemporal_store)
5537 __builtin_nontemporal_store(a, (__m128d *) p);
5538#elif defined(__aarch64__) || defined(_M_ARM64)
5539 vst1q_f64(p, vreinterpretq_f64_m128d(a));
5540#else
5541 vst1q_s64((int64_t *) p, vreinterpretq_s64_m128d(a));
5542#endif
5543}
5544
5545// Store 128-bits of integer data from a into memory using a non-temporal memory
5546// hint. mem_addr must be aligned on a 16-byte boundary or a general-protection
5547// exception may be generated.
5548// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si128
5549FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
5550{
5551#if __has_builtin(__builtin_nontemporal_store)
5552 __builtin_nontemporal_store(a, p);
5553#else
5554 vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a));
5555#endif
5556}
5557
5558// Store 32-bit integer a into memory using a non-temporal hint to minimize
5559// cache pollution. If the cache line containing address mem_addr is already in
5560// the cache, the cache will be updated.
5561// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si32
5562FORCE_INLINE void _mm_stream_si32(int *p, int a)
5563{
5564 vst1q_lane_s32((int32_t *) p, vdupq_n_s32(a), 0);
5565}
5566
5567// Store 64-bit integer a into memory using a non-temporal hint to minimize
5568// cache pollution. If the cache line containing address mem_addr is already in
5569// the cache, the cache will be updated.
5570// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si64
5571FORCE_INLINE void _mm_stream_si64(__int64 *p, __int64 a)
5572{
5573 vst1_s64((int64_t *) p, vdup_n_s64((int64_t) a));
5574}
5575
5576// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and
5577// store the results in dst.
5578// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi16
5579FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
5580{
5581 return vreinterpretq_m128i_s16(
5582 vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
5583}
5584
5585// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and
5586// store the results in dst.
5587// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi32
5588FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
5589{
5590 return vreinterpretq_m128i_s32(
5591 vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
5592}
5593
5594// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and
5595// store the results in dst.
5596// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi64
5597FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b)
5598{
5599 return vreinterpretq_m128i_s64(
5600 vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
5601}
5602
5603// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and
5604// store the results in dst.
5605// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi8
5606FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
5607{
5608 return vreinterpretq_m128i_s8(
5609 vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
5610}
5611
5612// Subtract packed double-precision (64-bit) floating-point elements in b from
5613// packed double-precision (64-bit) floating-point elements in a, and store the
5614// results in dst.
5615// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_sub_pd
5616FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b)
5617{
5618#if defined(__aarch64__) || defined(_M_ARM64)
5619 return vreinterpretq_m128d_f64(
5620 vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
5621#else
5622 double *da = (double *) &a;
5623 double *db = (double *) &b;
5624 double c[2];
5625 c[0] = da[0] - db[0];
5626 c[1] = da[1] - db[1];
5627 return vld1q_f32((float32_t *) c);
5628#endif
5629}
5630
5631// Subtract the lower double-precision (64-bit) floating-point element in b from
5632// the lower double-precision (64-bit) floating-point element in a, store the
5633// result in the lower element of dst, and copy the upper element from a to the
5634// upper element of dst.
5635// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_sd
5636FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b)
5637{
5638 return _mm_move_sd(a, _mm_sub_pd(a, b));
5639}
5640
5641// Subtract 64-bit integer b from 64-bit integer a, and store the result in dst.
5642// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_si64
5643FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b)
5644{
5645 return vreinterpret_m64_s64(
5646 vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
5647}
5648
5649// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a
5650// using saturation, and store the results in dst.
5651// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi16
5652FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
5653{
5654 return vreinterpretq_m128i_s16(
5655 vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
5656}
5657
5658// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a
5659// using saturation, and store the results in dst.
5660// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi8
5661FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b)
5662{
5663 return vreinterpretq_m128i_s8(
5664 vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
5665}
5666
5667// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit
5668// integers in a using saturation, and store the results in dst.
5669// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu16
5670FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
5671{
5672 return vreinterpretq_m128i_u16(
5673 vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
5674}
5675
5676// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit
5677// integers in a using saturation, and store the results in dst.
5678// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu8
5679FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
5680{
5681 return vreinterpretq_m128i_u8(
5682 vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
5683}
5684
5685#define _mm_ucomieq_sd _mm_comieq_sd
5686#define _mm_ucomige_sd _mm_comige_sd
5687#define _mm_ucomigt_sd _mm_comigt_sd
5688#define _mm_ucomile_sd _mm_comile_sd
5689#define _mm_ucomilt_sd _mm_comilt_sd
5690#define _mm_ucomineq_sd _mm_comineq_sd
5691
5692// Return vector of type __m128d with undefined elements.
5693// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_pd
5694FORCE_INLINE __m128d _mm_undefined_pd(void)
5695{
5696#if defined(__GNUC__) || defined(__clang__)
5697#pragma GCC diagnostic push
5698#pragma GCC diagnostic ignored "-Wuninitialized"
5699#endif
5700 __m128d a;
5701#if defined(_MSC_VER)
5702 a = _mm_setzero_pd();
5703#endif
5704 return a;
5705#if defined(__GNUC__) || defined(__clang__)
5706#pragma GCC diagnostic pop
5707#endif
5708}
5709
5710// Unpack and interleave 16-bit integers from the high half of a and b, and
5711// store the results in dst.
5712// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi16
5713FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
5714{
5715#if defined(__aarch64__) || defined(_M_ARM64)
5716 return vreinterpretq_m128i_s16(
5717 vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
5718#else
5719 int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a));
5720 int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b));
5721 int16x4x2_t result = vzip_s16(a1, b1);
5722 return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
5723#endif
5724}
5725
5726// Unpack and interleave 32-bit integers from the high half of a and b, and
5727// store the results in dst.
5728// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi32
5729FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
5730{
5731#if defined(__aarch64__) || defined(_M_ARM64)
5732 return vreinterpretq_m128i_s32(
5733 vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
5734#else
5735 int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a));
5736 int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b));
5737 int32x2x2_t result = vzip_s32(a1, b1);
5738 return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
5739#endif
5740}
5741
5742// Unpack and interleave 64-bit integers from the high half of a and b, and
5743// store the results in dst.
5744// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi64
5745FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
5746{
5747#if defined(__aarch64__) || defined(_M_ARM64)
5748 return vreinterpretq_m128i_s64(
5749 vzip2q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
5750#else
5751 int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a));
5752 int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b));
5753 return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h));
5754#endif
5755}
5756
5757// Unpack and interleave 8-bit integers from the high half of a and b, and store
5758// the results in dst.
5759// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi8
5760FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
5761{
5762#if defined(__aarch64__) || defined(_M_ARM64)
5763 return vreinterpretq_m128i_s8(
5764 vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
5765#else
5766 int8x8_t a1 =
5767 vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a)));
5768 int8x8_t b1 =
5769 vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b)));
5770 int8x8x2_t result = vzip_s8(a1, b1);
5771 return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
5772#endif
5773}
5774
5775// Unpack and interleave double-precision (64-bit) floating-point elements from
5776// the high half of a and b, and store the results in dst.
5777// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_pd
5778FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
5779{
5780#if defined(__aarch64__) || defined(_M_ARM64)
5781 return vreinterpretq_m128d_f64(
5782 vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
5783#else
5784 return vreinterpretq_m128d_s64(
5785 vcombine_s64(vget_high_s64(vreinterpretq_s64_m128d(a)),
5786 vget_high_s64(vreinterpretq_s64_m128d(b))));
5787#endif
5788}
5789
5790// Unpack and interleave 16-bit integers from the low half of a and b, and store
5791// the results in dst.
5792// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi16
5793FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
5794{
5795#if defined(__aarch64__) || defined(_M_ARM64)
5796 return vreinterpretq_m128i_s16(
5797 vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
5798#else
5799 int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a));
5800 int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b));
5801 int16x4x2_t result = vzip_s16(a1, b1);
5802 return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
5803#endif
5804}
5805
5806// Unpack and interleave 32-bit integers from the low half of a and b, and store
5807// the results in dst.
5808// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi32
5809FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
5810{
5811#if defined(__aarch64__) || defined(_M_ARM64)
5812 return vreinterpretq_m128i_s32(
5813 vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
5814#else
5815 int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a));
5816 int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b));
5817 int32x2x2_t result = vzip_s32(a1, b1);
5818 return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
5819#endif
5820}
5821
5822// Unpack and interleave 64-bit integers from the low half of a and b, and store
5823// the results in dst.
5824// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi64
5825FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
5826{
5827#if defined(__aarch64__) || defined(_M_ARM64)
5828 return vreinterpretq_m128i_s64(
5829 vzip1q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
5830#else
5831 int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a));
5832 int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b));
5833 return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l));
5834#endif
5835}
5836
5837// Unpack and interleave 8-bit integers from the low half of a and b, and store
5838// the results in dst.
5839// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi8
5840FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
5841{
5842#if defined(__aarch64__) || defined(_M_ARM64)
5843 return vreinterpretq_m128i_s8(
5844 vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
5845#else
5846 int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a)));
5847 int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b)));
5848 int8x8x2_t result = vzip_s8(a1, b1);
5849 return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
5850#endif
5851}
5852
5853// Unpack and interleave double-precision (64-bit) floating-point elements from
5854// the low half of a and b, and store the results in dst.
5855// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_pd
5856FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b)
5857{
5858#if defined(__aarch64__) || defined(_M_ARM64)
5859 return vreinterpretq_m128d_f64(
5860 vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
5861#else
5862 return vreinterpretq_m128d_s64(
5863 vcombine_s64(vget_low_s64(vreinterpretq_s64_m128d(a)),
5864 vget_low_s64(vreinterpretq_s64_m128d(b))));
5865#endif
5866}
5867
5868// Compute the bitwise XOR of packed double-precision (64-bit) floating-point
5869// elements in a and b, and store the results in dst.
5870// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_pd
5871FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b)
5872{
5873 return vreinterpretq_m128d_s64(
5874 veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
5875}
5876
5877// Compute the bitwise XOR of 128 bits (representing integer data) in a and b,
5878// and store the result in dst.
5879// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_si128
5880FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
5881{
5882 return vreinterpretq_m128i_s32(
5883 veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
5884}
5885
5886/* SSE3 */
5887
5888// Alternatively add and subtract packed double-precision (64-bit)
5889// floating-point elements in a to/from packed elements in b, and store the
5890// results in dst.
5891// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_addsub_pd
5892FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b)
5893{
5894 _sse2neon_const __m128d mask = _mm_set_pd(1.0f, -1.0f);
5895#if defined(__aarch64__) || defined(_M_ARM64)
5896 return vreinterpretq_m128d_f64(vfmaq_f64(vreinterpretq_f64_m128d(a),
5897 vreinterpretq_f64_m128d(b),
5898 vreinterpretq_f64_m128d(mask)));
5899#else
5900 return _mm_add_pd(_mm_mul_pd(b, mask), a);
5901#endif
5902}
5903
5904// Alternatively add and subtract packed single-precision (32-bit)
5905// floating-point elements in a to/from packed elements in b, and store the
5906// results in dst.
5907// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=addsub_ps
5908FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
5909{
5910 _sse2neon_const __m128 mask = _mm_setr_ps(-1.0f, 1.0f, -1.0f, 1.0f);
5911#if (defined(__aarch64__) || defined(_M_ARM64)) || \
5912 defined(__ARM_FEATURE_FMA) /* VFPv4+ */
5913 return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(a),
5914 vreinterpretq_f32_m128(mask),
5915 vreinterpretq_f32_m128(b)));
5916#else
5917 return _mm_add_ps(_mm_mul_ps(b, mask), a);
5918#endif
5919}
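
// Usage sketch (illustrative, compiled out): even lanes subtract, odd lanes add.
#if 0
static void example_addsub_ps(void)
{
    __m128 a = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 b = _mm_setr_ps(10.0f, 20.0f, 30.0f, 40.0f);
    __m128 r = _mm_addsub_ps(a, b); // {-9.0f, 22.0f, -27.0f, 44.0f}
    (void) r;
}
#endif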
5920
5921// Horizontally add adjacent pairs of double-precision (64-bit) floating-point
5922// elements in a and b, and pack the results in dst.
5923// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pd
5924FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b)
5925{
5926#if defined(__aarch64__) || defined(_M_ARM64)
5927 return vreinterpretq_m128d_f64(
5928 vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
5929#else
5930 double *da = (double *) &a;
5931 double *db = (double *) &b;
5932 double c[] = {da[0] + da[1], db[0] + db[1]};
5933 return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
5934#endif
5935}
5936
5937// Horizontally add adjacent pairs of single-precision (32-bit) floating-point
5938// elements in a and b, and pack the results in dst.
5939// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_ps
5940FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
5941{
5942#if defined(__aarch64__) || defined(_M_ARM64)
5943 return vreinterpretq_m128_f32(
5944 vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
5945#else
5946 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
5947 float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
5948 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
5949 float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
5950 return vreinterpretq_m128_f32(
5951 vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
5952#endif
5953}
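
// Usage sketch (illustrative, compiled out): adjacent pairs of a fill the low
// half of the result, adjacent pairs of b fill the high half.
#if 0
static void example_hadd_ps(void)
{
    __m128 a = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 b = _mm_setr_ps(10.0f, 20.0f, 30.0f, 40.0f);
    __m128 r = _mm_hadd_ps(a, b); // {3.0f, 7.0f, 30.0f, 70.0f}
    (void) r;
}
#endif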
5954
5955// Horizontally subtract adjacent pairs of double-precision (64-bit)
5956// floating-point elements in a and b, and pack the results in dst.
5957// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pd
5958FORCE_INLINE __m128d _mm_hsub_pd(__m128d _a, __m128d _b)
5959{
5960#if defined(__aarch64__) || defined(_M_ARM64)
5961 float64x2_t a = vreinterpretq_f64_m128d(_a);
5962 float64x2_t b = vreinterpretq_f64_m128d(_b);
5963 return vreinterpretq_m128d_f64(
5964 vsubq_f64(vuzp1q_f64(a, b), vuzp2q_f64(a, b)));
5965#else
5966 double *da = (double *) &_a;
5967 double *db = (double *) &_b;
5968 double c[] = {da[0] - da[1], db[0] - db[1]};
5969 return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
5970#endif
5971}
5972
5973// Horizontally subtract adjacent pairs of single-precision (32-bit)
5974// floating-point elements in a and b, and pack the results in dst.
5975// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_ps
5976FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
5977{
5978 float32x4_t a = vreinterpretq_f32_m128(_a);
5979 float32x4_t b = vreinterpretq_f32_m128(_b);
5980#if defined(__aarch64__) || defined(_M_ARM64)
5981 return vreinterpretq_m128_f32(
5982 vsubq_f32(vuzp1q_f32(a, b), vuzp2q_f32(a, b)));
5983#else
5984 float32x4x2_t c = vuzpq_f32(a, b);
5985 return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1]));
5986#endif
5987}
5988
5989// Load 128-bits of integer data from unaligned memory into dst. This intrinsic
5990// may perform better than _mm_loadu_si128 when the data crosses a cache line
5991// boundary.
5992// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lddqu_si128
5993#define _mm_lddqu_si128 _mm_loadu_si128
5994
5995// Load a double-precision (64-bit) floating-point element from memory into both
5996// elements of dst.
5997// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loaddup_pd
5998#define _mm_loaddup_pd _mm_load1_pd
5999
6000// Duplicate the low double-precision (64-bit) floating-point element from a,
6001// and store the results in dst.
6002// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movedup_pd
6003FORCE_INLINE __m128d _mm_movedup_pd(__m128d a)
6004{
6005#if defined(__aarch64__) || defined(_M_ARM64)
6006 return vreinterpretq_m128d_f64(
6007 vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0));
6008#else
6009 return vreinterpretq_m128d_u64(
6010 vdupq_n_u64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)));
6011#endif
6012}
6013
6014// Duplicate odd-indexed single-precision (32-bit) floating-point elements
6015// from a, and store the results in dst.
6016// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehdup_ps
6017FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
6018{
6019#if defined(__aarch64__) || defined(_M_ARM64)
6020 return vreinterpretq_m128_f32(
6021 vtrn2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)));
6022#elif defined(_sse2neon_shuffle)
6023 return vreinterpretq_m128_f32(vshuffleq_s32(
6024 vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3));
6025#else
6026 float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
6027 float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3);
6028 float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3};
6029 return vreinterpretq_m128_f32(vld1q_f32(data));
6030#endif
6031}
6032
6033// Duplicate even-indexed single-precision (32-bit) floating-point elements
6034// from a, and store the results in dst.
6035// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_moveldup_ps
6036FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)
6037{
6038#if defined(__aarch64__) || defined(_M_ARM64)
6039 return vreinterpretq_m128_f32(
6040 vtrn1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)));
6041#elif defined(_sse2neon_shuffle)
6042 return vreinterpretq_m128_f32(vshuffleq_s32(
6043 vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2));
6044#else
6045 float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
6046 float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2);
6047 float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2};
6048 return vreinterpretq_m128_f32(vld1q_f32(data));
6049#endif
6050}
6051
6052/* SSSE3 */
6053
6054// Compute the absolute value of packed signed 16-bit integers in a, and store
6055// the unsigned results in dst.
6056// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi16
6057FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)
6058{
6059 return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a)));
6060}
6061
6062// Compute the absolute value of packed signed 32-bit integers in a, and store
6063// the unsigned results in dst.
6064// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi32
6065FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
6066{
6067 return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a)));
6068}
6069
6070// Compute the absolute value of packed signed 8-bit integers in a, and store
6071// the unsigned results in dst.
6072// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi8
6073FORCE_INLINE __m128i _mm_abs_epi8(__m128i a)
6074{
6075 return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a)));
6076}
6077
6078// Compute the absolute value of packed signed 16-bit integers in a, and store
6079// the unsigned results in dst.
6080// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi16
6081FORCE_INLINE __m64 _mm_abs_pi16(__m64 a)
6082{
6083 return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a)));
6084}
6085
6086// Compute the absolute value of packed signed 32-bit integers in a, and store
6087// the unsigned results in dst.
6088// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi32
6089FORCE_INLINE __m64 _mm_abs_pi32(__m64 a)
6090{
6091 return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a)));
6092}
6093
6094// Compute the absolute value of packed signed 8-bit integers in a, and store
6095// the unsigned results in dst.
6096// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi8
6097FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
6098{
6099 return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a)));
6100}
6101
6102// Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift
6103// the result right by imm8 bytes, and store the low 16 bytes in dst.
6104// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi8
6105#if defined(__GNUC__) && !defined(__clang__)
6106#define _mm_alignr_epi8(a, b, imm) \
6107 __extension__({ \
6108 uint8x16_t _a = vreinterpretq_u8_m128i(a); \
6109 uint8x16_t _b = vreinterpretq_u8_m128i(b); \
6110 __m128i ret; \
6111 if (_sse2neon_unlikely((imm) & ~31)) \
6112 ret = vreinterpretq_m128i_u8(vdupq_n_u8(0)); \
6113 else if (imm >= 16) \
6114 ret = _mm_srli_si128(a, imm >= 16 ? imm - 16 : 0); \
6115 else \
6116 ret = \
6117 vreinterpretq_m128i_u8(vextq_u8(_b, _a, imm < 16 ? imm : 0)); \
6118 ret; \
6119 })
6120
6121#else
6122#define _mm_alignr_epi8(a, b, imm) \
6123 _sse2neon_define2( \
6124 __m128i, a, b, uint8x16_t __a = vreinterpretq_u8_m128i(_a); \
6125 uint8x16_t __b = vreinterpretq_u8_m128i(_b); __m128i ret; \
6126 if (_sse2neon_unlikely((imm) & ~31)) ret = \
6127 vreinterpretq_m128i_u8(vdupq_n_u8(0)); \
6128 else if (imm >= 16) ret = \
6129 _mm_srli_si128(_a, imm >= 16 ? imm - 16 : 0); \
6130 else ret = \
6131 vreinterpretq_m128i_u8(vextq_u8(__b, __a, imm < 16 ? imm : 0)); \
6132 _sse2neon_return(ret);)
6133
6134#endif
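
// Usage sketch (illustrative, compiled out): with imm = 4 the result is the
// last 12 bytes of b followed by the first 4 bytes of a.
#if 0
static void example_alignr_epi8(void)
{
    __m128i b = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    __m128i a = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23,
                              24, 25, 26, 27, 28, 29, 30, 31);
    __m128i r = _mm_alignr_epi8(a, b, 4); // bytes {4, 5, ..., 15, 16, 17, 18, 19}
    (void) r;
}
#endif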
6135
6136// Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift
6137// the result right by imm8 bytes, and store the low 8 bytes in dst.
6138// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_pi8
6139#define _mm_alignr_pi8(a, b, imm) \
6140 _sse2neon_define2( \
6141 __m64, a, b, __m64 ret; if (_sse2neon_unlikely((imm) >= 16)) { \
6142 ret = vreinterpret_m64_s8(vdup_n_s8(0)); \
6143 } else { \
6144 uint8x8_t tmp_low; \
6145 uint8x8_t tmp_high; \
6146 if ((imm) >= 8) { \
6147 const int idx = (imm) -8; \
6148 tmp_low = vreinterpret_u8_m64(_a); \
6149 tmp_high = vdup_n_u8(0); \
6150 ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
6151 } else { \
6152 const int idx = (imm); \
6153 tmp_low = vreinterpret_u8_m64(_b); \
6154 tmp_high = vreinterpret_u8_m64(_a); \
6155 ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
6156 } \
6157 } _sse2neon_return(ret);)
6158
6159// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
6160// signed 16-bit results in dst.
6161// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi16
6162FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
6163{
6164 int16x8_t a = vreinterpretq_s16_m128i(_a);
6165 int16x8_t b = vreinterpretq_s16_m128i(_b);
6166#if defined(__aarch64__) || defined(_M_ARM64)
6167 return vreinterpretq_m128i_s16(vpaddq_s16(a, b));
6168#else
6169 return vreinterpretq_m128i_s16(
6170 vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
6171 vpadd_s16(vget_low_s16(b), vget_high_s16(b))));
6172#endif
6173}
6174
6175// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
6176// signed 32-bit results in dst.
6177// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi32
6178FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
6179{
6180 int32x4_t a = vreinterpretq_s32_m128i(_a);
6181 int32x4_t b = vreinterpretq_s32_m128i(_b);
6182#if defined(__aarch64__) || defined(_M_ARM64)
6183 return vreinterpretq_m128i_s32(vpaddq_s32(a, b));
6184#else
6185 return vreinterpretq_m128i_s32(
6186 vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
6187 vpadd_s32(vget_low_s32(b), vget_high_s32(b))));
6188#endif
6189}
6190
6191// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
6192// signed 16-bit results in dst.
6193// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi16
6194FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)
6195{
6196 return vreinterpret_m64_s16(
6197 vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
6198}
6199
6200// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
6201// signed 32-bit results in dst.
6202// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi32
6203FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
6204{
6205 return vreinterpret_m64_s32(
6206 vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)));
6207}
6208
6209// Horizontally add adjacent pairs of signed 16-bit integers in a and b using
6210// saturation, and pack the signed 16-bit results in dst.
6211// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_epi16
6212FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
6213{
6214#if defined(__aarch64__) || defined(_M_ARM64)
6215 int16x8_t a = vreinterpretq_s16_m128i(_a);
6216 int16x8_t b = vreinterpretq_s16_m128i(_b);
6217 return vreinterpretq_s64_s16(
6218 vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
6219#else
6220 int32x4_t a = vreinterpretq_s32_m128i(_a);
6221 int32x4_t b = vreinterpretq_s32_m128i(_b);
6222 // Interleave using vshrn/vmovn
6223 // [a0|a2|a4|a6|b0|b2|b4|b6]
6224 // [a1|a3|a5|a7|b1|b3|b5|b7]
6225 int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
6226 int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
6227 // Saturated add
6228 return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357));
6229#endif
6230}
6231
6232// Horizontally add adjacent pairs of signed 16-bit integers in a and b using
6233// saturation, and pack the signed 16-bit results in dst.
6234// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_pi16
6235FORCE_INLINE __m64 _mm_hadds_pi16(__m64 _a, __m64 _b)
6236{
6237 int16x4_t a = vreinterpret_s16_m64(_a);
6238 int16x4_t b = vreinterpret_s16_m64(_b);
6239#if defined(__aarch64__) || defined(_M_ARM64)
6240 return vreinterpret_s64_s16(vqadd_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
6241#else
6242 int16x4x2_t res = vuzp_s16(a, b);
6243 return vreinterpret_s64_s16(vqadd_s16(res.val[0], res.val[1]));
6244#endif
6245}
6246
6247// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack
6248// the signed 16-bit results in dst.
6249// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi16
6250FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
6251{
6252 int16x8_t a = vreinterpretq_s16_m128i(_a);
6253 int16x8_t b = vreinterpretq_s16_m128i(_b);
6254#if defined(__aarch64__) || defined(_M_ARM64)
6255 return vreinterpretq_m128i_s16(
6256 vsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
6257#else
6258 int16x8x2_t c = vuzpq_s16(a, b);
6259 return vreinterpretq_m128i_s16(vsubq_s16(c.val[0], c.val[1]));
6260#endif
6261}
6262
6263// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack
6264// the signed 32-bit results in dst.
6265// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi32
6266FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
6267{
6268 int32x4_t a = vreinterpretq_s32_m128i(_a);
6269 int32x4_t b = vreinterpretq_s32_m128i(_b);
6270#if defined(__aarch64__) || defined(_M_ARM64)
6271 return vreinterpretq_m128i_s32(
6272 vsubq_s32(vuzp1q_s32(a, b), vuzp2q_s32(a, b)));
6273#else
6274 int32x4x2_t c = vuzpq_s32(a, b);
6275 return vreinterpretq_m128i_s32(vsubq_s32(c.val[0], c.val[1]));
6276#endif
6277}
6278
6279// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack
6280// the signed 16-bit results in dst.
6281// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pi16
6282FORCE_INLINE __m64 _mm_hsub_pi16(__m64 _a, __m64 _b)
6283{
6284 int16x4_t a = vreinterpret_s16_m64(_a);
6285 int16x4_t b = vreinterpret_s16_m64(_b);
6286#if defined(__aarch64__) || defined(_M_ARM64)
6287 return vreinterpret_m64_s16(vsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
6288#else
6289 int16x4x2_t c = vuzp_s16(a, b);
6290 return vreinterpret_m64_s16(vsub_s16(c.val[0], c.val[1]));
6291#endif
6292}
6293
6294// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack
6295// the signed 32-bit results in dst.
6296// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_hsub_pi32
6297FORCE_INLINE __m64 _mm_hsub_pi32(__m64 _a, __m64 _b)
6298{
6299 int32x2_t a = vreinterpret_s32_m64(_a);
6300 int32x2_t b = vreinterpret_s32_m64(_b);
6301#if defined(__aarch64__) || defined(_M_ARM64)
6302 return vreinterpret_m64_s32(vsub_s32(vuzp1_s32(a, b), vuzp2_s32(a, b)));
6303#else
6304 int32x2x2_t c = vuzp_s32(a, b);
6305 return vreinterpret_m64_s32(vsub_s32(c.val[0], c.val[1]));
6306#endif
6307}
6308
6309// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b
6310// using saturation, and pack the signed 16-bit results in dst.
6311// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_epi16
6312FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
6313{
6314 int16x8_t a = vreinterpretq_s16_m128i(_a);
6315 int16x8_t b = vreinterpretq_s16_m128i(_b);
6316#if defined(__aarch64__) || defined(_M_ARM64)
6317 return vreinterpretq_m128i_s16(
6318 vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
6319#else
6320 int16x8x2_t c = vuzpq_s16(a, b);
6321 return vreinterpretq_m128i_s16(vqsubq_s16(c.val[0], c.val[1]));
6322#endif
6323}
6324
6325// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b
6326// using saturation, and pack the signed 16-bit results in dst.
6327// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_pi16
6328FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b)
6329{
6330 int16x4_t a = vreinterpret_s16_m64(_a);
6331 int16x4_t b = vreinterpret_s16_m64(_b);
6332#if defined(__aarch64__) || defined(_M_ARM64)
6333 return vreinterpret_m64_s16(vqsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
6334#else
6335 int16x4x2_t c = vuzp_s16(a, b);
6336 return vreinterpret_m64_s16(vqsub_s16(c.val[0], c.val[1]));
6337#endif
6338}
6339
6340// Vertically multiply each unsigned 8-bit integer from a with the corresponding
6341// signed 8-bit integer from b, producing intermediate signed 16-bit integers.
6342// Horizontally add adjacent pairs of intermediate signed 16-bit integers,
6343// and pack the saturated results in dst.
6344// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_epi16
6345FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
6346{
6347#if defined(__aarch64__) || defined(_M_ARM64)
6348 uint8x16_t a = vreinterpretq_u8_m128i(_a);
6349 int8x16_t b = vreinterpretq_s8_m128i(_b);
6350 int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
6351 vmovl_s8(vget_low_s8(b)));
6352 int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))),
6353 vmovl_s8(vget_high_s8(b)));
6354 return vreinterpretq_m128i_s16(
6355 vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th)));
6356#else
6357 // This would be much simpler if x86 would choose to zero extend OR sign
6358 // extend, not both. This could probably be optimized better.
6359 uint16x8_t a = vreinterpretq_u16_m128i(_a);
6360 int16x8_t b = vreinterpretq_s16_m128i(_b);
6361
6362 // Zero extend a
6363 int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8));
6364 int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00)));
6365
6366 // Sign extend by shifting left then shifting right.
6367 int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8);
6368 int16x8_t b_odd = vshrq_n_s16(b, 8);
6369
6370 // multiply
6371 int16x8_t prod1 = vmulq_s16(a_even, b_even);
6372 int16x8_t prod2 = vmulq_s16(a_odd, b_odd);
6373
6374 // saturated add
6375 return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2));
6376#endif
6377}
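
// Usage sketch (illustrative, compiled out): each 16-bit lane is the saturated
// sum of two unsigned(a) * signed(b) byte products.
#if 0
static void example_maddubs_epi16(void)
{
    __m128i a = _mm_set1_epi8(2);        // treated as unsigned 8-bit
    __m128i b = _mm_set1_epi8(-3);       // treated as signed 8-bit
    __m128i r = _mm_maddubs_epi16(a, b); // every 16-bit lane = 2*(-3) + 2*(-3) = -12
    (void) r;
}
#endif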
6378
6379// Vertically multiply each unsigned 8-bit integer from a with the corresponding
6380// signed 8-bit integer from b, producing intermediate signed 16-bit integers.
6381// Horizontally add adjacent pairs of intermediate signed 16-bit integers, and
6382// pack the saturated results in dst.
6383// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_pi16
6384FORCE_INLINE __m64 _mm_maddubs_pi16(__m64 _a, __m64 _b)
6385{
6386 uint16x4_t a = vreinterpret_u16_m64(_a);
6387 int16x4_t b = vreinterpret_s16_m64(_b);
6388
6389 // Zero extend a
6390 int16x4_t a_odd = vreinterpret_s16_u16(vshr_n_u16(a, 8));
6391 int16x4_t a_even = vreinterpret_s16_u16(vand_u16(a, vdup_n_u16(0xff)));
6392
6393 // Sign extend by shifting left then shifting right.
6394 int16x4_t b_even = vshr_n_s16(vshl_n_s16(b, 8), 8);
6395 int16x4_t b_odd = vshr_n_s16(b, 8);
6396
6397 // multiply
6398 int16x4_t prod1 = vmul_s16(a_even, b_even);
6399 int16x4_t prod2 = vmul_s16(a_odd, b_odd);
6400
6401 // saturated add
6402 return vreinterpret_m64_s16(vqadd_s16(prod1, prod2));
6403}
6404
6405// Multiply packed signed 16-bit integers in a and b, producing intermediate
6406// signed 32-bit integers. Shift right by 15 bits while rounding up, and store
6407// the packed 16-bit integers in dst.
6408// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_epi16
6409FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b)
6410{
6411 // Has issues due to saturation
6412 // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b));
6413
6414 // Multiply
6415 int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
6416 vget_low_s16(vreinterpretq_s16_m128i(b)));
6417 int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
6418 vget_high_s16(vreinterpretq_s16_m128i(b)));
6419
6420 // Rounding narrowing shift right
6421 // narrow = (int16_t)((mul + 16384) >> 15);
6422 int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
6423 int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
6424
6425 // Join together
6426 return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi));
6427}
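
// Usage sketch (illustrative, compiled out): each lane computes
// (int16_t)((a * b + 0x4000) >> 15), i.e. a rounded Q15 multiply.
#if 0
static void example_mulhrs_epi16(void)
{
    __m128i a = _mm_set1_epi16(16384);  // ~0.5 in Q15
    __m128i r = _mm_mulhrs_epi16(a, a); // every lane = 8192 (~0.25 in Q15)
    (void) r;
}
#endif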
6428
6429// Multiply packed signed 16-bit integers in a and b, producing intermediate
6430// signed 32-bit integers. Truncate each intermediate integer to the 18 most
6431// significant bits, round by adding 1, and store bits [16:1] to dst.
6432// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_pi16
6433FORCE_INLINE __m64 _mm_mulhrs_pi16(__m64 a, __m64 b)
6434{
6435 int32x4_t mul_extend =
6436 vmull_s16((vreinterpret_s16_m64(a)), (vreinterpret_s16_m64(b)));
6437
6438 // Rounding narrowing shift right
6439 return vreinterpret_m64_s16(vrshrn_n_s32(mul_extend, 15));
6440}
6441
6442// Shuffle packed 8-bit integers in a according to shuffle control mask in the
6443// corresponding 8-bit element of b, and store the results in dst.
6444// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi8
6445FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
6446{
6447 int8x16_t tbl = vreinterpretq_s8_m128i(a); // input a
6448 uint8x16_t idx = vreinterpretq_u8_m128i(b); // input b
6449 uint8x16_t idx_masked =
6450 vandq_u8(idx, vdupq_n_u8(0x8F)); // avoid using meaningless bits
6451#if defined(__aarch64__) || defined(_M_ARM64)
6452 return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked));
6453#elif defined(__GNUC__)
6454 int8x16_t ret;
6455 // %e and %f represent the even and odd D registers
6456 // respectively.
6457 __asm__ __volatile__(
6458 "vtbl.8 %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"
6459 "vtbl.8 %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"
6460 : [ret] "=&w"(ret)
6461 : [tbl] "w"(tbl), [idx] "w"(idx_masked));
6462 return vreinterpretq_m128i_s8(ret);
6463#else
6464 // use this line if testing on aarch64
6465 int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)};
6466 return vreinterpretq_m128i_s8(
6467 vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)),
6468 vtbl2_s8(a_split, vget_high_u8(idx_masked))));
6469#endif
6470}
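
// Usage sketch (illustrative, compiled out): indices select bytes of a; an
// index with its top bit set yields zero. Reversing the byte order:
#if 0
static void example_shuffle_epi8(void)
{
    __m128i a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    __m128i idx = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    __m128i r = _mm_shuffle_epi8(a, idx); // bytes of a in reverse order
    (void) r;
}
#endif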
6471
6472// Shuffle packed 8-bit integers in a according to shuffle control mask in the
6473// corresponding 8-bit element of b, and store the results in dst.
6474// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi8
6475FORCE_INLINE __m64 _mm_shuffle_pi8(__m64 a, __m64 b)
6476{
6477 const int8x8_t controlMask =
6478 vand_s8(vreinterpret_s8_m64(b), vdup_n_s8((int8_t) (0x1 << 7 | 0x07)));
6479 int8x8_t res = vtbl1_s8(vreinterpret_s8_m64(a), controlMask);
6480 return vreinterpret_m64_s8(res);
6481}
6482
6483// Negate packed 16-bit integers in a when the corresponding signed
6484// 16-bit integer in b is negative, and store the results in dst.
6485// Elements in dst are zeroed out when the corresponding element
6486// in b is zero.
6487// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi16
6488FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
6489{
6490 int16x8_t a = vreinterpretq_s16_m128i(_a);
6491 int16x8_t b = vreinterpretq_s16_m128i(_b);
6492
6493 // signed shift right: faster than vclt
6494 // (b < 0) ? 0xFFFF : 0
6495 uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
6496 // (b == 0) ? 0xFFFF : 0
6497#if defined(__aarch64__) || defined(_M_ARM64)
6498 int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b));
6499#else
6500 int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));
6501#endif
6502
6503 // bitwise select either a or negative 'a' (vnegq_s16(a) equals to negative
6504 // 'a') based on ltMask
6505 int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a);
6506 // res = masked & (~zeroMask)
6507 int16x8_t res = vbicq_s16(masked, zeroMask);
6508 return vreinterpretq_m128i_s16(res);
6509}
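
// Usage sketch (illustrative, compiled out): each lane of a is negated, kept,
// or zeroed according to the sign of the matching lane in b.
#if 0
static void example_sign_epi16(void)
{
    __m128i a = _mm_set1_epi16(5);
    __m128i b = _mm_setr_epi16(-1, 0, 1, -2, 0, 2, -3, 3);
    __m128i r = _mm_sign_epi16(a, b); // {-5, 0, 5, -5, 0, 5, -5, 5}
    (void) r;
}
#endif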
6510
6511// Negate packed 32-bit integers in a when the corresponding signed
6512// 32-bit integer in b is negative, and store the results in dst.
6513// Elements in dst are zeroed out when the corresponding element
6514// in b is zero.
6515// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi32
6516FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
6517{
6518 int32x4_t a = vreinterpretq_s32_m128i(_a);
6519 int32x4_t b = vreinterpretq_s32_m128i(_b);
6520
6521 // signed shift right: faster than vclt
6522 // (b < 0) ? 0xFFFFFFFF : 0
6523 uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
6524
6525 // (b == 0) ? 0xFFFFFFFF : 0
6526#if defined(__aarch64__) || defined(_M_ARM64)
6527 int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b));
6528#else
6529 int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0)));
6530#endif
6531
6532 // bitwise select either a or negative 'a' (vnegq_s32(a) equals to negative
6533 // 'a') based on ltMask
6534 int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a);
6535 // res = masked & (~zeroMask)
6536 int32x4_t res = vbicq_s32(masked, zeroMask);
6537 return vreinterpretq_m128i_s32(res);
6538}
6539
6540// Negate packed 8-bit integers in a when the corresponding signed
6541// 8-bit integer in b is negative, and store the results in dst.
6542// Elements in dst are zeroed out when the corresponding element
6543// in b is zero.
6544// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi8
6545FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
6546{
6547 int8x16_t a = vreinterpretq_s8_m128i(_a);
6548 int8x16_t b = vreinterpretq_s8_m128i(_b);
6549
6550 // signed shift right: faster than vclt
6551 // (b < 0) ? 0xFF : 0
6552 uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
6553
6554 // (b == 0) ? 0xFF : 0
6555#if defined(__aarch64__) || defined(_M_ARM64)
6556 int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b));
6557#else
6558 int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0)));
6559#endif
6560
6561 // bitwise select either a or negative 'a' (vnegq_s8(a) return negative 'a')
6562 // based on ltMask
6563 int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a);
6564 // res = masked & (~zeroMask)
6565 int8x16_t res = vbicq_s8(masked, zeroMask);
6566
6567 return vreinterpretq_m128i_s8(res);
6568}
6569
6570// Negate packed 16-bit integers in a when the corresponding signed 16-bit
6571// integer in b is negative, and store the results in dst. Elements in dst are
6572// zeroed out when the corresponding element in b is zero.
6573// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi16
6574FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
6575{
6576 int16x4_t a = vreinterpret_s16_m64(_a);
6577 int16x4_t b = vreinterpret_s16_m64(_b);
6578
6579 // signed shift right: faster than vclt
6580 // (b < 0) ? 0xFFFF : 0
6581 uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15));
6582
6583 // (b == 0) ? 0xFFFF : 0
6584#if defined(__aarch64__) || defined(_M_ARM64)
6585 int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b));
6586#else
6587 int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));
6588#endif
6589
6590 // bitwise select either a or negative 'a' (vneg_s16(a) return negative 'a')
6591 // based on ltMask
6592 int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a);
6593 // res = masked & (~zeroMask)
6594 int16x4_t res = vbic_s16(masked, zeroMask);
6595
6596 return vreinterpret_m64_s16(res);
6597}
6598
6599// Negate packed 32-bit integers in a when the corresponding signed 32-bit
6600// integer in b is negative, and store the results in dst. Elements in dst are
6601// zeroed out when the corresponding element in b is zero.
6602// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi32
6603FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
6604{
6605 int32x2_t a = vreinterpret_s32_m64(_a);
6606 int32x2_t b = vreinterpret_s32_m64(_b);
6607
6608 // signed shift right: faster than vclt
6609 // (b < 0) ? 0xFFFFFFFF : 0
6610 uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31));
6611
6612 // (b == 0) ? 0xFFFFFFFF : 0
6613#if defined(__aarch64__) || defined(_M_ARM64)
6614 int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b));
6615#else
6616 int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));
6617#endif
6618
6619 // bitwise select either a or negative 'a' (vneg_s32(a) return negative 'a')
6620 // based on ltMask
6621 int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a);
6622 // res = masked & (~zeroMask)
6623 int32x2_t res = vbic_s32(masked, zeroMask);
6624
6625 return vreinterpret_m64_s32(res);
6626}
6627
6628// Negate packed 8-bit integers in a when the corresponding signed 8-bit integer
6629// in b is negative, and store the results in dst. Elements in dst are zeroed out
6630// when the corresponding element in b is zero.
6631// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi8
6632FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
6633{
6634 int8x8_t a = vreinterpret_s8_m64(_a);
6635 int8x8_t b = vreinterpret_s8_m64(_b);
6636
6637 // signed shift right: faster than vclt
6638 // (b < 0) ? 0xFF : 0
6639 uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7));
6640
6641 // (b == 0) ? 0xFF : 0
6642#if defined(__aarch64__) || defined(_M_ARM64)
6643 int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b));
6644#else
6645 int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0)));
6646#endif
6647
6648 // bitwise select either a or negative 'a' (vneg_s8(a) return negative 'a')
6649 // based on ltMask
6650 int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a);
6651 // res = masked & (~zeroMask)
6652 int8x8_t res = vbic_s8(masked, zeroMask);
6653
6654 return vreinterpret_m64_s8(res);
6655}
6656
6657/* SSE4.1 */
6658
6659// Blend packed 16-bit integers from a and b using control mask imm8, and store
6660// the results in dst.
6661// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi16
6662// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b,
6663// __constrange(0,255) int imm)
6664#define _mm_blend_epi16(a, b, imm) \
6665 _sse2neon_define2( \
6666 __m128i, a, b, \
6667 const uint16_t _mask[8] = \
6668 _sse2neon_init(((imm) & (1 << 0)) ? (uint16_t) -1 : 0x0, \
6669 ((imm) & (1 << 1)) ? (uint16_t) -1 : 0x0, \
6670 ((imm) & (1 << 2)) ? (uint16_t) -1 : 0x0, \
6671 ((imm) & (1 << 3)) ? (uint16_t) -1 : 0x0, \
6672 ((imm) & (1 << 4)) ? (uint16_t) -1 : 0x0, \
6673 ((imm) & (1 << 5)) ? (uint16_t) -1 : 0x0, \
6674 ((imm) & (1 << 6)) ? (uint16_t) -1 : 0x0, \
6675 ((imm) & (1 << 7)) ? (uint16_t) -1 : 0x0); \
6676 uint16x8_t _mask_vec = vld1q_u16(_mask); \
6677 uint16x8_t __a = vreinterpretq_u16_m128i(_a); \
6678 uint16x8_t __b = vreinterpretq_u16_m128i(_b); _sse2neon_return( \
6679 vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, __b, __a)));)
6680
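// Usage sketch (illustrative, compiled out): bit i of imm8 selects lane i of b,
// a cleared bit keeps lane i of a.
#if 0
static void example_blend_epi16(void)
{
    __m128i a = _mm_set1_epi16(1);
    __m128i b = _mm_set1_epi16(2);
    __m128i r = _mm_blend_epi16(a, b, 0x0F); // {2, 2, 2, 2, 1, 1, 1, 1}
    (void) r;
}
#endif
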
6681// Blend packed double-precision (64-bit) floating-point elements from a and b
6682// using control mask imm8, and store the results in dst.
6683// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_pd
6684#define _mm_blend_pd(a, b, imm) \
6685 _sse2neon_define2( \
6686 __m128d, a, b, \
6687 const uint64_t _mask[2] = \
6688 _sse2neon_init(((imm) & (1 << 0)) ? ~UINT64_C(0) : UINT64_C(0), \
6689 ((imm) & (1 << 1)) ? ~UINT64_C(0) : UINT64_C(0)); \
6690 uint64x2_t _mask_vec = vld1q_u64(_mask); \
6691 uint64x2_t __a = vreinterpretq_u64_m128d(_a); \
6692 uint64x2_t __b = vreinterpretq_u64_m128d(_b); _sse2neon_return( \
6693 vreinterpretq_m128d_u64(vbslq_u64(_mask_vec, __b, __a)));)
6694
6695// Blend packed single-precision (32-bit) floating-point elements from a and b
6696// using mask, and store the results in dst.
6697// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_ps
6698FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8)
6699{
6700 const uint32_t ALIGN_STRUCT(16)
6701 data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,
6702 ((imm8) & (1 << 1)) ? UINT32_MAX : 0,
6703 ((imm8) & (1 << 2)) ? UINT32_MAX : 0,
6704 ((imm8) & (1 << 3)) ? UINT32_MAX : 0};
6705 uint32x4_t mask = vld1q_u32(data);
6706 float32x4_t a = vreinterpretq_f32_m128(_a);
6707 float32x4_t b = vreinterpretq_f32_m128(_b);
6708 return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
6709}
6710
6711// Blend packed 8-bit integers from a and b using mask, and store the results in
6712// dst.
6713// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_epi8
6714FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)
6715{
6716 // Use a signed shift right to create a mask with the sign bit
6717 uint8x16_t mask =
6718 vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7));
6719 uint8x16_t a = vreinterpretq_u8_m128i(_a);
6720 uint8x16_t b = vreinterpretq_u8_m128i(_b);
6721 return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a));
6722}
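
// Usage sketch (illustrative, compiled out): only the most significant bit of
// each mask byte matters; a set bit selects the byte from b.
#if 0
static void example_blendv_epi8(void)
{
    __m128i a = _mm_set1_epi8(1);
    __m128i b = _mm_set1_epi8(2);
    __m128i m = _mm_setr_epi8(-128, 0, -128, 0, -128, 0, -128, 0,
                              -128, 0, -128, 0, -128, 0, -128, 0);
    __m128i r = _mm_blendv_epi8(a, b, m); // {2, 1, 2, 1, ...}
    (void) r;
}
#endif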
6723
6724// Blend packed double-precision (64-bit) floating-point elements from a and b
6725// using mask, and store the results in dst.
6726// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_pd
6727FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask)
6728{
6729 uint64x2_t mask =
6730 vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_m128d(_mask), 63));
6731#if defined(__aarch64__) || defined(_M_ARM64)
6732 float64x2_t a = vreinterpretq_f64_m128d(_a);
6733 float64x2_t b = vreinterpretq_f64_m128d(_b);
6734 return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a));
6735#else
6736 uint64x2_t a = vreinterpretq_u64_m128d(_a);
6737 uint64x2_t b = vreinterpretq_u64_m128d(_b);
6738 return vreinterpretq_m128d_u64(vbslq_u64(mask, b, a));
6739#endif
6740}
6741
6742// Blend packed single-precision (32-bit) floating-point elements from a and b
6743// using mask, and store the results in dst.
6744// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_ps
6745FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)
6746{
6747 // Use a signed shift right to create a mask with the sign bit
6748 uint32x4_t mask =
6749 vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_m128(_mask), 31));
6750 float32x4_t a = vreinterpretq_f32_m128(_a);
6751 float32x4_t b = vreinterpretq_f32_m128(_b);
6752 return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
6753}
6754
6755// Round the packed double-precision (64-bit) floating-point elements in a up
6756// to an integer value, and store the results as packed double-precision
6757// floating-point elements in dst.
6758// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_pd
6759FORCE_INLINE __m128d _mm_ceil_pd(__m128d a)
6760{
6761#if defined(__aarch64__) || defined(_M_ARM64)
6762 return vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(a)));
6763#else
6764 double *f = (double *) &a;
6765 return _mm_set_pd(ceil(f[1]), ceil(f[0]));
6766#endif
6767}
6768
6769// Round the packed single-precision (32-bit) floating-point elements in a up to
6770// an integer value, and store the results as packed single-precision
6771// floating-point elements in dst.
6772// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ps
6773FORCE_INLINE __m128 _mm_ceil_ps(__m128 a)
6774{
6775#if (defined(__aarch64__) || defined(_M_ARM64)) || \
6776 defined(__ARM_FEATURE_DIRECTED_ROUNDING)
6777 return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a)));
6778#else
6779 float *f = (float *) &a;
6780 return _mm_set_ps(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]), ceilf(f[0]));
6781#endif
6782}
6783
6784// Round the lower double-precision (64-bit) floating-point element in b up to
6785// an integer value, store the result as a double-precision floating-point
6786// element in the lower element of dst, and copy the upper element from a to the
6787// upper element of dst.
6788// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_sd
6789FORCE_INLINE __m128d _mm_ceil_sd(__m128d a, __m128d b)
6790{
6791 return _mm_move_sd(a, _mm_ceil_pd(b));
6792}
6793
6794// Round the lower single-precision (32-bit) floating-point element in b up to
6795// an integer value, store the result as a single-precision floating-point
6796// element in the lower element of dst, and copy the upper 3 packed elements
6797// from a to the upper elements of dst.
6798// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ss
6799FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b)
6800{
6801 return _mm_move_ss(a, _mm_ceil_ps(b));
6802}
6803
6804// Compare packed 64-bit integers in a and b for equality, and store the results
6805// in dst
6806FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
6807{
6808#if defined(__aarch64__) || defined(_M_ARM64)
6809 return vreinterpretq_m128i_u64(
6810 vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b)));
6811#else
6812 // ARMv7 lacks vceqq_u64
6813 // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
6814 uint32x4_t cmp =
6815 vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b));
6816 uint32x4_t swapped = vrev64q_u32(cmp);
6817 return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped));
6818#endif
6819}
6820
6821// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store
6822// the results in dst.
6823// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi32
6824FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
6825{
6826 return vreinterpretq_m128i_s32(
6827 vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a))));
6828}
6829
6830// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store
6831// the results in dst.
6832// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi64
6833FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
6834{
6835 int16x8_t s16x8 = vreinterpretq_s16_m128i(a); /* xxxx xxxx xxxx 0B0A */
6836 int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
6837 int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
6838 return vreinterpretq_m128i_s64(s64x2);
6839}
6840
6841// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store
6842// the results in dst.
6843// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_epi64
6844FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a)
6845{
6846 return vreinterpretq_m128i_s64(
6847 vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))));
6848}
6849
6850// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store
6851// the results in dst.
6852// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi16
6853FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
6854{
6855 int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */
6856 int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
6857 return vreinterpretq_m128i_s16(s16x8);
6858}
6859
6860// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store
6861// the results in dst.
6862// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi32
6863FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
6864{
6865 int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */
6866 int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
6867 int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */
6868 return vreinterpretq_m128i_s32(s32x4);
6869}
6870
6871// Sign extend packed 8-bit integers in the low 8 bytes of a to packed 64-bit
6872// integers, and store the results in dst.
6873// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi64
6874FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
6875{
6876 int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx xxBA */
6877 int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0x0x 0B0A */
6878 int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
6879 int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
6880 return vreinterpretq_m128i_s64(s64x2);
6881}
6882
6883// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers,
6884// and store the results in dst.
6885// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi32
6886FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a)
6887{
6888 return vreinterpretq_m128i_u32(
6889 vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a))));
6890}
6891
6892// Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers,
6893// and store the results in dst.
6894// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi64
6895FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)
6896{
6897 uint16x8_t u16x8 = vreinterpretq_u16_m128i(a); /* xxxx xxxx xxxx 0B0A */
6898 uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
6899 uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
6900 return vreinterpretq_m128i_u64(u64x2);
6901}
6902
6903// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers,
6904// and store the results in dst.
6905// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu32_epi64
6906FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)
6907{
6908 return vreinterpretq_m128i_u64(
6909 vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a))));
6910}
6911
6912// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers,
6913// and store the results in dst.
6914// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi16
6915FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a)
6916{
6917 uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx HGFE DCBA */
6918 uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0H0G 0F0E 0D0C 0B0A */
6919 return vreinterpretq_m128i_u16(u16x8);
6920}
6921
6922// Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers,
6923// and store the results in dst.
6924// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi32
6925FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a)
6926{
6927 uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */
6928 uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */
6929 uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */
6930 return vreinterpretq_m128i_u32(u32x4);
6931}
6932
6933// Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed
6934// 64-bit integers, and store the results in dst.
6935// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi64
6936FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
6937{
6938 uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx xxBA */
6939 uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0x0x 0B0A */
6940 uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
6941 uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
6942 return vreinterpretq_m128i_u64(u64x2);
6943}
6944
6945// Conditionally multiply the packed double-precision (64-bit) floating-point
6946// elements in a and b using the high 4 bits in imm8, sum the two products, and
6947// conditionally store the sum in dst using the low 4 bits of imm8.
6948// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_pd
6949FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm)
6950{
6951 // Generate mask value from constant immediate bit value
6952 const int64_t bit0Mask = imm & 0x01 ? UINT64_MAX : 0;
6953 const int64_t bit1Mask = imm & 0x02 ? UINT64_MAX : 0;
6954#if !SSE2NEON_PRECISE_DP
6955 const int64_t bit4Mask = imm & 0x10 ? UINT64_MAX : 0;
6956 const int64_t bit5Mask = imm & 0x20 ? UINT64_MAX : 0;
6957#endif
6958 // Conditional multiplication
6959#if !SSE2NEON_PRECISE_DP
6960 __m128d mul = _mm_mul_pd(a, b);
6961 const __m128d mulMask =
6962 _mm_castsi128_pd(_mm_set_epi64x(bit5Mask, bit4Mask));
6963 __m128d tmp = _mm_and_pd(mul, mulMask);
6964#else
6965#if defined(__aarch64__) || defined(_M_ARM64)
6966 double d0 = (imm & 0x10) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0) *
6967 vgetq_lane_f64(vreinterpretq_f64_m128d(b), 0)
6968 : 0;
6969 double d1 = (imm & 0x20) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1) *
6970 vgetq_lane_f64(vreinterpretq_f64_m128d(b), 1)
6971 : 0;
6972#else
6973 double d0 = (imm & 0x10) ? ((double *) &a)[0] * ((double *) &b)[0] : 0;
6974 double d1 = (imm & 0x20) ? ((double *) &a)[1] * ((double *) &b)[1] : 0;
6975#endif
6976 __m128d tmp = _mm_set_pd(d1, d0);
6977#endif
6978 // Sum the products
6979#if defined(__aarch64__) || defined(_M_ARM64)
6980 double sum = vpaddd_f64(vreinterpretq_f64_m128d(tmp));
6981#else
6982 double sum = *((double *) &tmp) + *(((double *) &tmp) + 1);
6983#endif
6984 // Conditionally store the sum
6985 const __m128d sumMask =
6986 _mm_castsi128_pd(_mm_set_epi64x(bit1Mask, bit0Mask));
6987 __m128d res = _mm_and_pd(_mm_set_pd1(sum), sumMask);
6988 return res;
6989}
6990
6991// Conditionally multiply the packed single-precision (32-bit) floating-point
6992// elements in a and b using the high 4 bits in imm8, sum the four products,
6993// and conditionally store the sum in dst using the low 4 bits of imm.
6994// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_ps
6995FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
6996{
6997 float32x4_t elementwise_prod = _mm_mul_ps(a, b);
6998
6999#if defined(__aarch64__) || defined(_M_ARM64)
7000 /* shortcuts */
7001 if (imm == 0xFF) {
7002 return _mm_set1_ps(vaddvq_f32(elementwise_prod));
7003 }
7004
7005 if ((imm & 0x0F) == 0x0F) {
7006 if (!(imm & (1 << 4)))
7007 elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 0);
7008 if (!(imm & (1 << 5)))
7009 elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 1);
7010 if (!(imm & (1 << 6)))
7011 elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 2);
7012 if (!(imm & (1 << 7)))
7013 elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 3);
7014
7015 return _mm_set1_ps(vaddvq_f32(elementwise_prod));
7016 }
7017#endif
7018
7019 float s = 0.0f;
7020
7021 if (imm & (1 << 4))
7022 s += vgetq_lane_f32(elementwise_prod, 0);
7023 if (imm & (1 << 5))
7024 s += vgetq_lane_f32(elementwise_prod, 1);
7025 if (imm & (1 << 6))
7026 s += vgetq_lane_f32(elementwise_prod, 2);
7027 if (imm & (1 << 7))
7028 s += vgetq_lane_f32(elementwise_prod, 3);
7029
7030 const float32_t res[4] = {
7031 (imm & 0x1) ? s : 0.0f,
7032 (imm & 0x2) ? s : 0.0f,
7033 (imm & 0x4) ? s : 0.0f,
7034 (imm & 0x8) ? s : 0.0f,
7035 };
7036 return vreinterpretq_m128_f32(vld1q_f32(res));
7037}
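
// Usage sketch (illustrative, compiled out): the high nibble of imm selects
// which products are summed, the low nibble selects which lanes receive the sum.
#if 0
static float example_dp_ps(void)
{
    __m128 a = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 b = _mm_setr_ps(10.0f, 20.0f, 30.0f, 40.0f);
    __m128 r = _mm_dp_ps(a, b, 0xFF); // full dot product broadcast to every lane
    return _mm_cvtss_f32(r);          // 300.0f
}
#endif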
7038
7039// Extract a 32-bit integer from a, selected with imm8, and store the result in
7040// dst.
7041// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi32
7042// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm)
7043#define _mm_extract_epi32(a, imm) \
7044 vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))
7045
7046// Extract a 64-bit integer from a, selected with imm8, and store the result in
7047// dst.
7048// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi64
7049// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm)
7050#define _mm_extract_epi64(a, imm) \
7051 vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm))
7052
7053// Extract an 8-bit integer from a, selected with imm8, and store the result in
7054// the lower element of dst. FORCE_INLINE int _mm_extract_epi8(__m128i a,
7055// __constrange(0,16) int imm)
7056// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi8
7057#define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))
7058
7059// Extracts the selected single-precision (32-bit) floating-point from a.
7060// FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm)
7061#define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm))
7062
7063// Round the packed double-precision (64-bit) floating-point elements in a down
7064// to an integer value, and store the results as packed double-precision
7065// floating-point elements in dst.
7066// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_pd
7067FORCE_INLINE __m128d _mm_floor_pd(__m128d a)
7068{
7069#if defined(__aarch64__) || defined(_M_ARM64)
7070 return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a)));
7071#else
7072 double *f = (double *) &a;
7073 return _mm_set_pd(floor(f[1]), floor(f[0]));
7074#endif
7075}
7076
7077// Round the packed single-precision (32-bit) floating-point elements in a down
7078// to an integer value, and store the results as packed single-precision
7079// floating-point elements in dst.
7080// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ps
7081FORCE_INLINE __m128 _mm_floor_ps(__m128 a)
7082{
7083#if (defined(__aarch64__) || defined(_M_ARM64)) || \
7084 defined(__ARM_FEATURE_DIRECTED_ROUNDING)
7085 return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a)));
7086#else
7087 float *f = (float *) &a;
7088 return _mm_set_ps(floorf(f[3]), floorf(f[2]), floorf(f[1]), floorf(f[0]));
7089#endif
7090}
7091
7092// Round the lower double-precision (64-bit) floating-point element in b down to
7093// an integer value, store the result as a double-precision floating-point
7094// element in the lower element of dst, and copy the upper element from a to the
7095// upper element of dst.
7096// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_sd
7097FORCE_INLINE __m128d _mm_floor_sd(__m128d a, __m128d b)
7098{
7099 return _mm_move_sd(a, _mm_floor_pd(b));
7100}
7101
7102// Round the lower single-precision (32-bit) floating-point element in b down to
7103// an integer value, store the result as a single-precision floating-point
7104// element in the lower element of dst, and copy the upper 3 packed elements
7105// from a to the upper elements of dst.
7106// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ss
7107FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)
7108{
7109 return _mm_move_ss(a, _mm_floor_ps(b));
7110}
7111
7112// Copy a to dst, and insert the 32-bit integer i into dst at the location
7113// specified by imm8.
7114// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi32
7115// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b,
7116// __constrange(0,4) int imm)
7117#define _mm_insert_epi32(a, b, imm) \
7118 vreinterpretq_m128i_s32( \
7119 vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm)))
7120
7121// Copy a to dst, and insert the 64-bit integer i into dst at the location
7122// specified by imm8.
7123// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi64
7124// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b,
7125// __constrange(0,2) int imm)
7126#define _mm_insert_epi64(a, b, imm) \
7127 vreinterpretq_m128i_s64( \
7128 vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm)))
7129
7130// Copy a to dst, and insert the lower 8-bit integer from i into dst at the
7131// location specified by imm8.
7132// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi8
7133// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b,
7134// __constrange(0,16) int imm)
7135#define _mm_insert_epi8(a, b, imm) \
7136 vreinterpretq_m128i_s8(vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm)))
7137
7138// Copy a to tmp, then insert a single-precision (32-bit) floating-point
7139// element from b into tmp using the control in imm8. Store tmp to dst using
7140// the mask in imm8 (elements are zeroed out when the corresponding bit is set).
7141// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=insert_ps
7142#define _mm_insert_ps(a, b, imm8) \
7143 _sse2neon_define2( \
7144 __m128, a, b, \
7145 float32x4_t tmp1 = \
7146 vsetq_lane_f32(vgetq_lane_f32(_b, (imm8 >> 6) & 0x3), \
7147 vreinterpretq_f32_m128(_a), 0); \
7148 float32x4_t tmp2 = \
7149 vsetq_lane_f32(vgetq_lane_f32(tmp1, 0), \
7150 vreinterpretq_f32_m128(_a), ((imm8 >> 4) & 0x3)); \
7151 const uint32_t data[4] = \
7152 _sse2neon_init(((imm8) & (1 << 0)) ? UINT32_MAX : 0, \
7153 ((imm8) & (1 << 1)) ? UINT32_MAX : 0, \
7154 ((imm8) & (1 << 2)) ? UINT32_MAX : 0, \
7155 ((imm8) & (1 << 3)) ? UINT32_MAX : 0); \
7156 uint32x4_t mask = vld1q_u32(data); \
7157 float32x4_t all_zeros = vdupq_n_f32(0); \
7158 \
7159 _sse2neon_return(vreinterpretq_m128_f32( \
7160 vbslq_f32(mask, all_zeros, vreinterpretq_f32_m128(tmp2))));)
7161
7162// Compare packed signed 32-bit integers in a and b, and store packed maximum
7163// values in dst.
7164// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi32
7165FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
7166{
7167 return vreinterpretq_m128i_s32(
7168 vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
7169}
7170
7171// Compare packed signed 8-bit integers in a and b, and store packed maximum
7172// values in dst.
7173// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi8
7174FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b)
7175{
7176 return vreinterpretq_m128i_s8(
7177 vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
7178}
7179
7180// Compare packed unsigned 16-bit integers in a and b, and store packed maximum
7181// values in dst.
7182// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu16
7183FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b)
7184{
7185 return vreinterpretq_m128i_u16(
7186 vmaxq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
7187}
7188
7189// Compare packed unsigned 32-bit integers in a and b, and store packed maximum
7190// values in dst.
7191// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu32
7192FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b)
7193{
7194 return vreinterpretq_m128i_u32(
7195 vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
7196}
7197
7198// Compare packed signed 32-bit integers in a and b, and store packed minimum
7199// values in dst.
7200// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi32
7201FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
7202{
7203 return vreinterpretq_m128i_s32(
7204 vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
7205}
7206
7207// Compare packed signed 8-bit integers in a and b, and store packed minimum
7208// values in dst.
7209// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi8
7210FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b)
7211{
7212 return vreinterpretq_m128i_s8(
7213 vminq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
7214}
7215
7216// Compare packed unsigned 16-bit integers in a and b, and store packed minimum
7217// values in dst.
7218// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu16
7219FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b)
7220{
7221 return vreinterpretq_m128i_u16(
7222 vminq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
7223}
7224
7225// Compare packed unsigned 32-bit integers in a and b, and store packed minimum
7226// values in dst.
7227// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu32
7228FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
7229{
7230 return vreinterpretq_m128i_u32(
7231 vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
7232}
7233
7234// Horizontally compute the minimum amongst the packed unsigned 16-bit integers
7235// in a, store the minimum and index in dst, and zero the remaining bits in dst.
7236// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_minpos_epu16
7237FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
7238{
7239 __m128i dst;
7240 uint16_t min, idx = 0;
7241#if defined(__aarch64__) || defined(_M_ARM64)
7242 // Find the minimum value
7243 min = vminvq_u16(vreinterpretq_u16_m128i(a));
7244
7245 // Get the index of the minimum value
7246 static const uint16_t idxv[] = {0, 1, 2, 3, 4, 5, 6, 7};
7247 uint16x8_t minv = vdupq_n_u16(min);
7248 uint16x8_t cmeq = vceqq_u16(minv, vreinterpretq_u16_m128i(a));
7249 idx = vminvq_u16(vornq_u16(vld1q_u16(idxv), cmeq));
7250#else
7251 // Find the minimum value
7252 __m64 tmp;
7253 tmp = vreinterpret_m64_u16(
7254 vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)),
7255 vget_high_u16(vreinterpretq_u16_m128i(a))));
7256 tmp = vreinterpret_m64_u16(
7257 vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
7258 tmp = vreinterpret_m64_u16(
7259 vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
7260 min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0);
7261 // Get the index of the minimum value
7262 int i;
7263 for (i = 0; i < 8; i++) {
7264 if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) {
7265 idx = (uint16_t) i;
7266 break;
7267 }
7268 a = _mm_srli_si128(a, 2);
7269 }
7270#endif
7271 // Generate result
7272 dst = _mm_setzero_si128();
7273 dst = vreinterpretq_m128i_u16(
7274 vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0));
7275 dst = vreinterpretq_m128i_u16(
7276 vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1));
7277 return dst;
7278}
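// Usage sketch (illustrative only): the minimum ends up in bits [15:0] of the
// result and its lane index in bits [18:16], so both values sit in the first
// two 16-bit lanes and can be read back with _mm_extract_epi16.
//   __m128i v = _mm_setr_epi16(9, 4, 7, 4, 12, 30, 15, 8);
//   __m128i r = _mm_minpos_epu16(v);
//   // _mm_extract_epi16(r, 0) == 4 (the minimum value)
//   // _mm_extract_epi16(r, 1) == 1 (index of its first occurrence)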
7279
7280// Compute the sum of absolute differences (SADs) of quadruplets of unsigned
7281// 8-bit integers in a compared to those in b, and store the 16-bit results in
7282// dst. Eight SADs are performed using one quadruplet from b and eight
7283// quadruplets from a. One quadruplet is selected from b starting at the
7284// offset specified in imm8. Eight quadruplets are formed from sequential 8-bit
7285// integers selected from a starting at the offset specified in imm8.
7286// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mpsadbw_epu8
7287FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm)
7288{
7289 uint8x16_t _a, _b;
7290
7291 switch (imm & 0x4) {
7292 case 0:
7293 // do nothing
7294 _a = vreinterpretq_u8_m128i(a);
7295 break;
7296 case 4:
7297 _a = vreinterpretq_u8_u32(vextq_u32(vreinterpretq_u32_m128i(a),
7298 vreinterpretq_u32_m128i(a), 1));
7299 break;
7300 default:
7301#if defined(__GNUC__) || defined(__clang__)
7302 __builtin_unreachable();
7303#elif defined(_MSC_VER)
7304 __assume(0);
7305#endif
7306 break;
7307 }
7308
7309 switch (imm & 0x3) {
7310 case 0:
7311 _b = vreinterpretq_u8_u32(
7312 vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 0)));
7313 break;
7314 case 1:
7315 _b = vreinterpretq_u8_u32(
7316 vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 1)));
7317 break;
7318 case 2:
7319 _b = vreinterpretq_u8_u32(
7320 vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 2)));
7321 break;
7322 case 3:
7323 _b = vreinterpretq_u8_u32(
7324 vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 3)));
7325 break;
7326 default:
7327#if defined(__GNUC__) || defined(__clang__)
7328 __builtin_unreachable();
7329#elif defined(_MSC_VER)
7330 __assume(0);
7331#endif
7332 break;
7333 }
7334
7335 int16x8_t c04, c15, c26, c37;
7336 uint8x8_t low_b = vget_low_u8(_b);
7337 c04 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a), low_b));
7338 uint8x16_t _a_1 = vextq_u8(_a, _a, 1);
7339 c15 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_1), low_b));
7340 uint8x16_t _a_2 = vextq_u8(_a, _a, 2);
7341 c26 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_2), low_b));
7342 uint8x16_t _a_3 = vextq_u8(_a, _a, 3);
7343 c37 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_3), low_b));
7344#if defined(__aarch64__) || defined(_M_ARM64)
7345 // |0|4|2|6|
7346 c04 = vpaddq_s16(c04, c26);
7347 // |1|5|3|7|
7348 c15 = vpaddq_s16(c15, c37);
7349
7350 int32x4_t trn1_c =
7351 vtrn1q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
7352 int32x4_t trn2_c =
7353 vtrn2q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
7354 return vreinterpretq_m128i_s16(vpaddq_s16(vreinterpretq_s16_s32(trn1_c),
7355 vreinterpretq_s16_s32(trn2_c)));
7356#else
7357 int16x4_t c01, c23, c45, c67;
7358 c01 = vpadd_s16(vget_low_s16(c04), vget_low_s16(c15));
7359 c23 = vpadd_s16(vget_low_s16(c26), vget_low_s16(c37));
7360 c45 = vpadd_s16(vget_high_s16(c04), vget_high_s16(c15));
7361 c67 = vpadd_s16(vget_high_s16(c26), vget_high_s16(c37));
7362
7363 return vreinterpretq_m128i_s16(
7364 vcombine_s16(vpadd_s16(c01, c23), vpadd_s16(c45, c67)));
7365#endif
7366}
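// Usage sketch (illustrative only): imm8[2] selects the starting byte offset
// in a (0 or 4) and imm8[1:0] selects which aligned 4-byte group of b is used;
// result lane i is the SAD of that group against bytes i..i+3 of a.
//   __m128i a = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8,
//                             9, 10, 11, 12, 13, 14, 15, 16);
//   __m128i b = _mm_setr_epi8(1, 2, 3, 4, 0, 0, 0, 0,
//                             0, 0, 0, 0, 0, 0, 0, 0);
//   __m128i r = _mm_mpsadbw_epu8(a, b, 0);
//   // lane 0 of r = |1-1|+|2-2|+|3-3|+|4-4| = 0, lane 1 = 4, lane 2 = 8, ...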
7367
7368// Multiply the low signed 32-bit integers from each packed 64-bit element in
7369// a and b, and store the signed 64-bit results in dst.
7370// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epi32
7371FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b)
7372{
7373 // vmull_s32 upcasts instead of masking, so we downcast.
7374 int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a));
7375 int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b));
7376 return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo));
7377}
7378
7379// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit
7380// integers, and store the low 32 bits of the intermediate integers in dst.
7381// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi32
7382FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b)
7383{
7384 return vreinterpretq_m128i_s32(
7385 vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
7386}
7387
7388// Convert packed signed 32-bit integers from a and b to packed 16-bit integers
7389// using unsigned saturation, and store the results in dst.
7390// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32
7391FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
7392{
7393 return vreinterpretq_m128i_u16(
7394 vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)),
7395 vqmovun_s32(vreinterpretq_s32_m128i(b))));
7396}
7397
7398// Round the packed double-precision (64-bit) floating-point elements in a using
7399// the rounding parameter, and store the results as packed double-precision
7400// floating-point elements in dst.
7401// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd
7402FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding)
7403{
7404#if defined(__aarch64__) || defined(_M_ARM64)
7405 switch (rounding) {
7406 case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
7407 return vreinterpretq_m128d_f64(vrndnq_f64(vreinterpretq_f64_m128d(a)));
7408 case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
7409 return _mm_floor_pd(a);
7410 case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
7411 return _mm_ceil_pd(a);
7412 case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
7413 return vreinterpretq_m128d_f64(vrndq_f64(vreinterpretq_f64_m128d(a)));
7414 default: //_MM_FROUND_CUR_DIRECTION
7415 return vreinterpretq_m128d_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)));
7416 }
7417#else
7418 double *v_double = (double *) &a;
7419
7420 if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||
7421 (rounding == _MM_FROUND_CUR_DIRECTION &&
7422 _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) {
7423 double res[2], tmp;
7424 for (int i = 0; i < 2; i++) {
7425 tmp = (v_double[i] < 0) ? -v_double[i] : v_double[i];
7426 double roundDown = floor(tmp); // Round down value
7427 double roundUp = ceil(tmp); // Round up value
7428 double diffDown = tmp - roundDown;
7429 double diffUp = roundUp - tmp;
7430 if (diffDown < diffUp) {
7431 /* If it's closer to the round down value, then use it */
7432 res[i] = roundDown;
7433 } else if (diffDown > diffUp) {
7434 /* If it's closer to the round up value, then use it */
7435 res[i] = roundUp;
7436 } else {
7437 /* If it's equidistant between round up and round down value,
7438 * pick the one which is an even number */
7439 double half = roundDown / 2;
7440 if (half != floor(half)) {
7441 /* If the round down value is odd, return the round up value
7442 */
7443 res[i] = roundUp;
7444 } else {
7445 /* If the round up value is odd, return the round down value
7446 */
7447 res[i] = roundDown;
7448 }
7449 }
7450 res[i] = (v_double[i] < 0) ? -res[i] : res[i];
7451 }
7452 return _mm_set_pd(res[1], res[0]);
7453 } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||
7454 (rounding == _MM_FROUND_CUR_DIRECTION &&
7455 _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) {
7456 return _mm_floor_pd(a);
7457 } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) ||
7458 (rounding == _MM_FROUND_CUR_DIRECTION &&
7459 _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) {
7460 return _mm_ceil_pd(a);
7461 }
7462 return _mm_set_pd(v_double[1] > 0 ? floor(v_double[1]) : ceil(v_double[1]),
7463 v_double[0] > 0 ? floor(v_double[0]) : ceil(v_double[0]));
7464#endif
7465}
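// Worked example for the ties-to-even branch above (illustrative only): 2.5 is
// equidistant from 2 and 3, and floor(2.5) = 2 is even, so it rounds to 2.0,
// while 3.5 rounds up to 4.0.
//   __m128d v = _mm_set_pd(3.5, 2.5); // lane 0 = 2.5, lane 1 = 3.5
//   __m128d r = _mm_round_pd(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
//   // r lane 0 = 2.0, lane 1 = 4.0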
7466
7467// Round the packed single-precision (32-bit) floating-point elements in a using
7468// the rounding parameter, and store the results as packed single-precision
7469// floating-point elements in dst.
7470// software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps
7471FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
7472{
7473#if (defined(__aarch64__) || defined(_M_ARM64)) || \
7474 defined(__ARM_FEATURE_DIRECTED_ROUNDING)
7475 switch (rounding) {
7476 case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
7477 return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a)));
7478 case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
7479 return _mm_floor_ps(a);
7480 case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
7481 return _mm_ceil_ps(a);
7482 case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
7483 return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a)));
7484 default: //_MM_FROUND_CUR_DIRECTION
7485 return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a)));
7486 }
7487#else
7488 float *v_float = (float *) &a;
7489
7490 if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||
7491 (rounding == _MM_FROUND_CUR_DIRECTION &&
7492 _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) {
7493 uint32x4_t signmask = vdupq_n_u32(0x80000000);
7494 float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
7495 vdupq_n_f32(0.5f)); /* +/- 0.5 */
7496 int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
7497 vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
7498 int32x4_t r_trunc = vcvtq_s32_f32(
7499 vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
7500 int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
7501 vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
7502 int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
7503 vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
7504 float32x4_t delta = vsubq_f32(
7505 vreinterpretq_f32_m128(a),
7506 vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
7507 uint32x4_t is_delta_half =
7508 vceqq_f32(delta, half); /* delta == +/- 0.5 */
7509 return vreinterpretq_m128_f32(
7510 vcvtq_f32_s32(vbslq_s32(is_delta_half, r_even, r_normal)));
7511 } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||
7512 (rounding == _MM_FROUND_CUR_DIRECTION &&
7513 _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) {
7514 return _mm_floor_ps(a);
7515 } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) ||
7516 (rounding == _MM_FROUND_CUR_DIRECTION &&
7517 _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) {
7518 return _mm_ceil_ps(a);
7519 }
7520 return _mm_set_ps(v_float[3] > 0 ? floorf(v_float[3]) : ceilf(v_float[3]),
7521 v_float[2] > 0 ? floorf(v_float[2]) : ceilf(v_float[2]),
7522 v_float[1] > 0 ? floorf(v_float[1]) : ceilf(v_float[1]),
7523 v_float[0] > 0 ? floorf(v_float[0]) : ceilf(v_float[0]));
7524#endif
7525}
7526
7527// Round the lower double-precision (64-bit) floating-point element in b using
7528// the rounding parameter, store the result as a double-precision floating-point
7529// element in the lower element of dst, and copy the upper element from a to the
7530// upper element of dst.
7531// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_sd
7532FORCE_INLINE __m128d _mm_round_sd(__m128d a, __m128d b, int rounding)
7533{
7534 return _mm_move_sd(a, _mm_round_pd(b, rounding));
7535}
7536
7537// Round the lower single-precision (32-bit) floating-point element in b using
7538// the rounding parameter, store the result as a single-precision floating-point
7539// element in the lower element of dst, and copy the upper 3 packed elements
7540// from a to the upper elements of dst. Rounding is done according to the
7541// rounding[3:0] parameter, which can be one of:
7542// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and
7543// suppress exceptions
7544// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and
7545// suppress exceptions
7546// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress
7547// exceptions
7548// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress
7549// exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see
7550// _MM_SET_ROUNDING_MODE
7551// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ss
7552FORCE_INLINE __m128 _mm_round_ss(__m128 a, __m128 b, int rounding)
7553{
7554 return _mm_move_ss(a, _mm_round_ps(b, rounding));
7555}
7556
7557// Load 128-bits of integer data from memory into dst using a non-temporal
7558// memory hint. mem_addr must be aligned on a 16-byte boundary or a
7559// general-protection exception may be generated.
7560// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_load_si128
7561FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p)
7562{
7563#if __has_builtin(__builtin_nontemporal_store)
7564 return __builtin_nontemporal_load(p);
7565#else
7566 return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p));
7567#endif
7568}
7569
7570// Compute the bitwise NOT of a and then AND with a 128-bit vector containing
7571// all 1's, and return 1 if the result is zero, otherwise return 0.
7572// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_ones
7573FORCE_INLINE int _mm_test_all_ones(__m128i a)
7574{
7575 return (uint64_t) (vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) ==
7576 ~(uint64_t) 0;
7577}
7578
7579// Compute the bitwise AND of 128 bits (representing integer data) in a and
7580// mask, and return 1 if the result is zero, otherwise return 0.
7581// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_zeros
7582FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)
7583{
7584 int64x2_t a_and_mask =
7585 vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask));
7586 return !(vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1));
7587}
7588
7589// Compute the bitwise AND of 128 bits (representing integer data) in a and
7590// mask, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute
7591// the bitwise NOT of a and then AND with mask, and set CF to 1 if the result is
7592// zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
7593// otherwise return 0.
7594// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_test_mix_ones_zero
7595// Note: Argument names may be wrong in the Intel intrinsics guide.
7596FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask)
7597{
7598 uint64x2_t v = vreinterpretq_u64_m128i(a);
7599 uint64x2_t m = vreinterpretq_u64_m128i(mask);
7600
7601 // find ones (set-bits) and zeros (clear-bits) under clip mask
7602 uint64x2_t ones = vandq_u64(m, v);
7603 uint64x2_t zeros = vbicq_u64(m, v);
7604
7605 // If both 128-bit variables are populated (non-zero) then return 1.
7607 // For comparison purposes, first compact each var down to 32-bits.
7607 uint32x2_t reduced = vpmax_u32(vqmovn_u64(ones), vqmovn_u64(zeros));
7608
7609 // if the folded minimum is non-zero then both vars must be non-zero
7610 return (vget_lane_u32(vpmin_u32(reduced, reduced), 0) != 0);
7611}
7612
7613// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
7614// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
7615// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
7616// otherwise set CF to 0. Return the CF value.
7617// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_si128
7618FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
7619{
7620 int64x2_t s64 =
7621 vbicq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a));
7622 return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
7623}
7624
7625// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
7626// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
7627// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
7628// otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
7629// otherwise return 0.
7630// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_si128
7631#define _mm_testnzc_si128(a, b) _mm_test_mix_ones_zeros(a, b)
7632
7633// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
7634// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
7635// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
7636// otherwise set CF to 0. Return the ZF value.
7637// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_si128
7638FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
7639{
7640 int64x2_t s64 =
7641 vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b));
7642 return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
7643}
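// Relationship between the three predicates above (illustrative sketch):
// _mm_testz_si128(a, b) returns ZF = ((a & b) == 0), _mm_testc_si128(a, b)
// returns CF = ((~a & b) == 0), and _mm_testnzc_si128(a, b) is 1 only when
// both ZF and CF are 0.
//   __m128i mask = _mm_set1_epi32(0x0000ffff);
//   __m128i lo   = _mm_set1_epi32(0x000000ff);
//   // _mm_testz_si128(lo, mask) == 0   (they share bits)
//   // _mm_testc_si128(mask, lo) == 1   (~mask & lo == 0: lo is inside mask)
//   // _mm_testnzc_si128(mask, lo) == 0 (CF is set, so not "mixed")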
7644
7645/* SSE4.2 */
7646
7647static const uint16_t ALIGN_STRUCT(16) _sse2neon_cmpestr_mask16b[8] = {
7648 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
7649};
7650static const uint8_t ALIGN_STRUCT(16) _sse2neon_cmpestr_mask8b[16] = {
7651 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
7652 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
7653};
7654
7655/* specify the source data format */
7656#define _SIDD_UBYTE_OPS 0x00 /* unsigned 8-bit characters */
7657#define _SIDD_UWORD_OPS 0x01 /* unsigned 16-bit characters */
7658#define _SIDD_SBYTE_OPS 0x02 /* signed 8-bit characters */
7659#define _SIDD_SWORD_OPS 0x03 /* signed 16-bit characters */
7660
7661/* specify the comparison operation */
7662#define _SIDD_CMP_EQUAL_ANY 0x00 /* compare equal any: strchr */
7663#define _SIDD_CMP_RANGES 0x04 /* compare ranges */
7664#define _SIDD_CMP_EQUAL_EACH 0x08 /* compare equal each: strcmp */
7665#define _SIDD_CMP_EQUAL_ORDERED 0x0C /* compare equal ordered */
7666
7667/* specify the polarity */
7668#define _SIDD_POSITIVE_POLARITY 0x00
7669#define _SIDD_MASKED_POSITIVE_POLARITY 0x20
7670#define _SIDD_NEGATIVE_POLARITY 0x10 /* negate results */
7671#define _SIDD_MASKED_NEGATIVE_POLARITY \
7672 0x30 /* negate results only before end of string */
7673
7674/* specify the output selection in _mm_cmpXstri */
7675#define _SIDD_LEAST_SIGNIFICANT 0x00
7676#define _SIDD_MOST_SIGNIFICANT 0x40
7677
7678/* specify the output selection in _mm_cmpXstrm */
7679#define _SIDD_BIT_MASK 0x00
7680#define _SIDD_UNIT_MASK 0x40
7681
7682/* Pattern Matching for C macros.
7683 * https://github.com/pfultz2/Cloak/wiki/C-Preprocessor-tricks,-tips,-and-idioms
7684 */
7685
7686/* catenate */
7687#define SSE2NEON_PRIMITIVE_CAT(a, ...) a##__VA_ARGS__
7688#define SSE2NEON_CAT(a, b) SSE2NEON_PRIMITIVE_CAT(a, b)
7689
7690#define SSE2NEON_IIF(c) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_IIF_, c)
7691/* run the 2nd parameter */
7692#define SSE2NEON_IIF_0(t, ...) __VA_ARGS__
7693/* run the 1st parameter */
7694#define SSE2NEON_IIF_1(t, ...) t
7695
7696#define SSE2NEON_COMPL(b) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_COMPL_, b)
7697#define SSE2NEON_COMPL_0 1
7698#define SSE2NEON_COMPL_1 0
7699
7700#define SSE2NEON_DEC(x) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_DEC_, x)
7701#define SSE2NEON_DEC_1 0
7702#define SSE2NEON_DEC_2 1
7703#define SSE2NEON_DEC_3 2
7704#define SSE2NEON_DEC_4 3
7705#define SSE2NEON_DEC_5 4
7706#define SSE2NEON_DEC_6 5
7707#define SSE2NEON_DEC_7 6
7708#define SSE2NEON_DEC_8 7
7709#define SSE2NEON_DEC_9 8
7710#define SSE2NEON_DEC_10 9
7711#define SSE2NEON_DEC_11 10
7712#define SSE2NEON_DEC_12 11
7713#define SSE2NEON_DEC_13 12
7714#define SSE2NEON_DEC_14 13
7715#define SSE2NEON_DEC_15 14
7716#define SSE2NEON_DEC_16 15
7717
7718/* detection */
7719#define SSE2NEON_CHECK_N(x, n, ...) n
7720#define SSE2NEON_CHECK(...) SSE2NEON_CHECK_N(__VA_ARGS__, 0, )
7721#define SSE2NEON_PROBE(x) x, 1,
7722
7723#define SSE2NEON_NOT(x) SSE2NEON_CHECK(SSE2NEON_PRIMITIVE_CAT(SSE2NEON_NOT_, x))
7724#define SSE2NEON_NOT_0 SSE2NEON_PROBE(~)
7725
7726#define SSE2NEON_BOOL(x) SSE2NEON_COMPL(SSE2NEON_NOT(x))
7727#define SSE2NEON_IF(c) SSE2NEON_IIF(SSE2NEON_BOOL(c))
7728
7729#define SSE2NEON_EAT(...)
7730#define SSE2NEON_EXPAND(...) __VA_ARGS__
7731#define SSE2NEON_WHEN(c) SSE2NEON_IF(c)(SSE2NEON_EXPAND, SSE2NEON_EAT)
7732
7733/* recursion */
7734/* deferred expression */
7735#define SSE2NEON_EMPTY()
7736#define SSE2NEON_DEFER(id) id SSE2NEON_EMPTY()
7737#define SSE2NEON_OBSTRUCT(...) __VA_ARGS__ SSE2NEON_DEFER(SSE2NEON_EMPTY)()
7738#define SSE2NEON_EXPAND(...) __VA_ARGS__
7739
7740#define SSE2NEON_EVAL(...) \
7741 SSE2NEON_EVAL1(SSE2NEON_EVAL1(SSE2NEON_EVAL1(__VA_ARGS__)))
7742#define SSE2NEON_EVAL1(...) \
7743 SSE2NEON_EVAL2(SSE2NEON_EVAL2(SSE2NEON_EVAL2(__VA_ARGS__)))
7744#define SSE2NEON_EVAL2(...) \
7745 SSE2NEON_EVAL3(SSE2NEON_EVAL3(SSE2NEON_EVAL3(__VA_ARGS__)))
7746#define SSE2NEON_EVAL3(...) __VA_ARGS__
7747
7748#define SSE2NEON_REPEAT(count, macro, ...) \
7749 SSE2NEON_WHEN(count) \
7750 (SSE2NEON_OBSTRUCT(SSE2NEON_REPEAT_INDIRECT)()( \
7751 SSE2NEON_DEC(count), macro, \
7752 __VA_ARGS__) SSE2NEON_OBSTRUCT(macro)(SSE2NEON_DEC(count), \
7753 __VA_ARGS__))
7754#define SSE2NEON_REPEAT_INDIRECT() SSE2NEON_REPEAT
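// A minimal expansion sketch of the machinery above (illustrative only): with
//   #define M(i, t) t[i] = i;
// the expression SSE2NEON_EVAL(SSE2NEON_REPEAT(3, M, out)) expands to
//   out[0] = 0; out[1] = 1; out[2] = 2;
// which is how the PCMPSTR_* helpers below unroll their per-lane setup at
// preprocessing time instead of in a run-time loop.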
7755
7756#define SSE2NEON_SIZE_OF_byte 8
7757#define SSE2NEON_NUMBER_OF_LANES_byte 16
7758#define SSE2NEON_SIZE_OF_word 16
7759#define SSE2NEON_NUMBER_OF_LANES_word 8
7760
7761#define SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE(i, type) \
7762 mtx[i] = vreinterpretq_m128i_##type(vceqq_##type( \
7763 vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i)), \
7764 vreinterpretq_##type##_m128i(a)));
7765
7766#define SSE2NEON_FILL_LANE(i, type) \
7767 vec_b[i] = \
7768 vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i));
7769
7770#define PCMPSTR_RANGES(a, b, mtx, data_type_prefix, type_prefix, size, \
7771 number_of_lanes, byte_or_word) \
7772 do { \
7773 SSE2NEON_CAT( \
7774 data_type_prefix, \
7775 SSE2NEON_CAT(size, \
7776 SSE2NEON_CAT(x, SSE2NEON_CAT(number_of_lanes, _t)))) \
7777 vec_b[number_of_lanes]; \
7778 __m128i mask = SSE2NEON_IIF(byte_or_word)( \
7779 vreinterpretq_m128i_u16(vdupq_n_u16(0xff)), \
7780 vreinterpretq_m128i_u32(vdupq_n_u32(0xffff))); \
7781 SSE2NEON_EVAL(SSE2NEON_REPEAT(number_of_lanes, SSE2NEON_FILL_LANE, \
7782 SSE2NEON_CAT(type_prefix, size))) \
7783 for (int i = 0; i < number_of_lanes; i++) { \
7784 mtx[i] = SSE2NEON_CAT(vreinterpretq_m128i_u, \
7785 size)(SSE2NEON_CAT(vbslq_u, size)( \
7786 SSE2NEON_CAT(vreinterpretq_u, \
7787 SSE2NEON_CAT(size, _m128i))(mask), \
7788 SSE2NEON_CAT(vcgeq_, SSE2NEON_CAT(type_prefix, size))( \
7789 vec_b[i], \
7790 SSE2NEON_CAT( \
7791 vreinterpretq_, \
7792 SSE2NEON_CAT(type_prefix, \
7793 SSE2NEON_CAT(size, _m128i(a))))), \
7794 SSE2NEON_CAT(vcleq_, SSE2NEON_CAT(type_prefix, size))( \
7795 vec_b[i], \
7796 SSE2NEON_CAT( \
7797 vreinterpretq_, \
7798 SSE2NEON_CAT(type_prefix, \
7799 SSE2NEON_CAT(size, _m128i(a))))))); \
7800 } \
7801 } while (0)
7802
7803#define PCMPSTR_EQ(a, b, mtx, size, number_of_lanes) \
7804 do { \
7805 SSE2NEON_EVAL(SSE2NEON_REPEAT(number_of_lanes, \
7806 SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE, \
7807 SSE2NEON_CAT(u, size))) \
7808 } while (0)
7809
7810#define SSE2NEON_CMP_EQUAL_ANY_IMPL(type) \
7811 static int _sse2neon_cmp_##type##_equal_any(__m128i a, int la, __m128i b, \
7812 int lb) \
7813 { \
7814 __m128i mtx[16]; \
7815 PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \
7816 SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type)); \
7817 return SSE2NEON_CAT( \
7818 _sse2neon_aggregate_equal_any_, \
7819 SSE2NEON_CAT( \
7820 SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \
7821 SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, \
7822 type))))(la, lb, mtx); \
7823 }
7824
7825#define SSE2NEON_CMP_RANGES_IMPL(type, data_type, us, byte_or_word) \
7826 static int _sse2neon_cmp_##us##type##_ranges(__m128i a, int la, __m128i b, \
7827 int lb) \
7828 { \
7829 __m128i mtx[16]; \
7830 PCMPSTR_RANGES( \
7831 a, b, mtx, data_type, us, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \
7832 SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), byte_or_word); \
7833 return SSE2NEON_CAT( \
7834 _sse2neon_aggregate_ranges_, \
7835 SSE2NEON_CAT( \
7836 SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \
7837 SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, \
7838 type))))(la, lb, mtx); \
7839 }
7840
7841#define SSE2NEON_CMP_EQUAL_ORDERED_IMPL(type) \
7842 static int _sse2neon_cmp_##type##_equal_ordered(__m128i a, int la, \
7843 __m128i b, int lb) \
7844 { \
7845 __m128i mtx[16]; \
7846 PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \
7847 SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type)); \
7848 return SSE2NEON_CAT( \
7849 _sse2neon_aggregate_equal_ordered_, \
7850 SSE2NEON_CAT( \
7851 SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \
7852 SSE2NEON_CAT(x, \
7853 SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type))))( \
7854 SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), la, lb, mtx); \
7855 }
7856
7857static int _sse2neon_aggregate_equal_any_8x16(int la, int lb, __m128i mtx[16])
7858{
7859 int res = 0;
7860 int m = (1 << la) - 1;
7861 uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
7862 uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask);
7863 uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask);
7864 uint8x16_t vec = vcombine_u8(t_lo, t_hi);
7865 for (int j = 0; j < lb; j++) {
7866 mtx[j] = vreinterpretq_m128i_u8(
7867 vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j])));
7868 mtx[j] = vreinterpretq_m128i_u8(
7869 vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7));
7870 int tmp = _sse2neon_vaddvq_u8(vreinterpretq_u8_m128i(mtx[j])) ? 1 : 0;
7871 res |= (tmp << j);
7872 }
7873 return res;
7874}
7875
7876static int _sse2neon_aggregate_equal_any_16x8(int la, int lb, __m128i mtx[16])
7877{
7878 int res = 0;
7879 int m = (1 << la) - 1;
7880 uint16x8_t vec =
7881 vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b));
7882 for (int j = 0; j < lb; j++) {
7883 mtx[j] = vreinterpretq_m128i_u16(
7884 vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j])));
7885 mtx[j] = vreinterpretq_m128i_u16(
7886 vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15));
7887 int tmp = _sse2neon_vaddvq_u16(vreinterpretq_u16_m128i(mtx[j])) ? 1 : 0;
7888 res |= (tmp << j);
7889 }
7890 return res;
7891}
7892
7893/* clang-format off */
7894#define SSE2NEON_GENERATE_CMP_EQUAL_ANY(prefix) \
7895 prefix##IMPL(byte) \
7896 prefix##IMPL(word)
7897/* clang-format on */
7898
7899SSE2NEON_GENERATE_CMP_EQUAL_ANY(SSE2NEON_CMP_EQUAL_ANY_)
7900
7901static int _sse2neon_aggregate_ranges_16x8(int la, int lb, __m128i mtx[16])
7902{
7903 int res = 0;
7904 int m = (1 << la) - 1;
7905 uint16x8_t vec =
7906 vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b));
7907 for (int j = 0; j < lb; j++) {
7908 mtx[j] = vreinterpretq_m128i_u16(
7909 vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j])));
7910 mtx[j] = vreinterpretq_m128i_u16(
7911 vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15));
7912 __m128i tmp = vreinterpretq_m128i_u32(
7913 vshrq_n_u32(vreinterpretq_u32_m128i(mtx[j]), 16));
7914 uint32x4_t vec_res = vandq_u32(vreinterpretq_u32_m128i(mtx[j]),
7915 vreinterpretq_u32_m128i(tmp));
7916#if defined(__aarch64__) || defined(_M_ARM64)
7917 int t = vaddvq_u32(vec_res) ? 1 : 0;
7918#else
7919 uint64x2_t sumh = vpaddlq_u32(vec_res);
7920 int t = vgetq_lane_u64(sumh, 0) + vgetq_lane_u64(sumh, 1);
7921#endif
7922 res |= (t << j);
7923 }
7924 return res;
7925}
7926
7927static int _sse2neon_aggregate_ranges_8x16(int la, int lb, __m128i mtx[16])
7928{
7929 int res = 0;
7930 int m = (1 << la) - 1;
7931 uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
7932 uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask);
7933 uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask);
7934 uint8x16_t vec = vcombine_u8(t_lo, t_hi);
7935 for (int j = 0; j < lb; j++) {
7936 mtx[j] = vreinterpretq_m128i_u8(
7937 vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j])));
7938 mtx[j] = vreinterpretq_m128i_u8(
7939 vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7));
7940 __m128i tmp = vreinterpretq_m128i_u16(
7941 vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 8));
7942 uint16x8_t vec_res = vandq_u16(vreinterpretq_u16_m128i(mtx[j]),
7943 vreinterpretq_u16_m128i(tmp));
7944 int t = _sse2neon_vaddvq_u16(vec_res) ? 1 : 0;
7945 res |= (t << j);
7946 }
7947 return res;
7948}
7949
7950#define SSE2NEON_CMP_RANGES_IS_BYTE 1
7951#define SSE2NEON_CMP_RANGES_IS_WORD 0
7952
7953/* clang-format off */
7954#define SSE2NEON_GENERATE_CMP_RANGES(prefix) \
7955 prefix##IMPL(byte, uint, u, prefix##IS_BYTE) \
7956 prefix##IMPL(byte, int, s, prefix##IS_BYTE) \
7957 prefix##IMPL(word, uint, u, prefix##IS_WORD) \
7958 prefix##IMPL(word, int, s, prefix##IS_WORD)
7959/* clang-format on */
7960
7961SSE2NEON_GENERATE_CMP_RANGES(SSE2NEON_CMP_RANGES_)
7962
7963#undef SSE2NEON_CMP_RANGES_IS_BYTE
7964#undef SSE2NEON_CMP_RANGES_IS_WORD
7965
7966static int _sse2neon_cmp_byte_equal_each(__m128i a, int la, __m128i b, int lb)
7967{
7968 uint8x16_t mtx =
7969 vceqq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b));
7970 int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb));
7971 int m1 = 0x10000 - (1 << la);
7972 int tb = 0x10000 - (1 << lb);
7973 uint8x8_t vec_mask, vec0_lo, vec0_hi, vec1_lo, vec1_hi;
7974 uint8x8_t tmp_lo, tmp_hi, res_lo, res_hi;
7975 vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
7976 vec0_lo = vtst_u8(vdup_n_u8(m0), vec_mask);
7977 vec0_hi = vtst_u8(vdup_n_u8(m0 >> 8), vec_mask);
7978 vec1_lo = vtst_u8(vdup_n_u8(m1), vec_mask);
7979 vec1_hi = vtst_u8(vdup_n_u8(m1 >> 8), vec_mask);
7980 tmp_lo = vtst_u8(vdup_n_u8(tb), vec_mask);
7981 tmp_hi = vtst_u8(vdup_n_u8(tb >> 8), vec_mask);
7982
7983 res_lo = vbsl_u8(vec0_lo, vdup_n_u8(0), vget_low_u8(mtx));
7984 res_hi = vbsl_u8(vec0_hi, vdup_n_u8(0), vget_high_u8(mtx));
7985 res_lo = vbsl_u8(vec1_lo, tmp_lo, res_lo);
7986 res_hi = vbsl_u8(vec1_hi, tmp_hi, res_hi);
7987 res_lo = vand_u8(res_lo, vec_mask);
7988 res_hi = vand_u8(res_hi, vec_mask);
7989
7990 int res = _sse2neon_vaddv_u8(res_lo) + (_sse2neon_vaddv_u8(res_hi) << 8);
7991 return res;
7992}
7993
7994static int _sse2neon_cmp_word_equal_each(__m128i a, int la, __m128i b, int lb)
7995{
7996 uint16x8_t mtx =
7997 vceqq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
7998 int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb));
7999 int m1 = 0x100 - (1 << la);
8000 int tb = 0x100 - (1 << lb);
8001 uint16x8_t vec_mask = vld1q_u16(_sse2neon_cmpestr_mask16b);
8002 uint16x8_t vec0 = vtstq_u16(vdupq_n_u16(m0), vec_mask);
8003 uint16x8_t vec1 = vtstq_u16(vdupq_n_u16(m1), vec_mask);
8004 uint16x8_t tmp = vtstq_u16(vdupq_n_u16(tb), vec_mask);
8005 mtx = vbslq_u16(vec0, vdupq_n_u16(0), mtx);
8006 mtx = vbslq_u16(vec1, tmp, mtx);
8007 mtx = vandq_u16(mtx, vec_mask);
8008 return _sse2neon_vaddvq_u16(mtx);
8009}
8010
8011#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE 1
8012#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD 0
8013
8014#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IMPL(size, number_of_lanes, data_type) \
8015 static int _sse2neon_aggregate_equal_ordered_##size##x##number_of_lanes( \
8016 int bound, int la, int lb, __m128i mtx[16]) \
8017 { \
8018 int res = 0; \
8019 int m1 = SSE2NEON_IIF(data_type)(0x10000, 0x100) - (1 << la); \
8020 uint##size##x8_t vec_mask = SSE2NEON_IIF(data_type)( \
8021 vld1_u##size(_sse2neon_cmpestr_mask##size##b), \
8022 vld1q_u##size(_sse2neon_cmpestr_mask##size##b)); \
8023 uint##size##x##number_of_lanes##_t vec1 = SSE2NEON_IIF(data_type)( \
8024 vcombine_u##size(vtst_u##size(vdup_n_u##size(m1), vec_mask), \
8025 vtst_u##size(vdup_n_u##size(m1 >> 8), vec_mask)), \
8026 vtstq_u##size(vdupq_n_u##size(m1), vec_mask)); \
8027 uint##size##x##number_of_lanes##_t vec_minusone = vdupq_n_u##size(-1); \
8028 uint##size##x##number_of_lanes##_t vec_zero = vdupq_n_u##size(0); \
8029 for (int j = 0; j < lb; j++) { \
8030 mtx[j] = vreinterpretq_m128i_u##size(vbslq_u##size( \
8031 vec1, vec_minusone, vreinterpretq_u##size##_m128i(mtx[j]))); \
8032 } \
8033 for (int j = lb; j < bound; j++) { \
8034 mtx[j] = vreinterpretq_m128i_u##size( \
8035 vbslq_u##size(vec1, vec_minusone, vec_zero)); \
8036 } \
8037 unsigned SSE2NEON_IIF(data_type)(char, short) *ptr = \
8038 (unsigned SSE2NEON_IIF(data_type)(char, short) *) mtx; \
8039 for (int i = 0; i < bound; i++) { \
8040 int val = 1; \
8041 for (int j = 0, k = i; j < bound - i && k < bound; j++, k++) \
8042 val &= ptr[k * bound + j]; \
8043 res += val << i; \
8044 } \
8045 return res; \
8046 }
8047
8048/* clang-format off */
8049#define SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(prefix) \
8050 prefix##IMPL(8, 16, prefix##IS_UBYTE) \
8051 prefix##IMPL(16, 8, prefix##IS_UWORD)
8052/* clang-format on */
8053
8054SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(SSE2NEON_AGGREGATE_EQUAL_ORDER_)
8055
8056#undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE
8057#undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD
8058
8059/* clang-format off */
8060#define SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(prefix) \
8061 prefix##IMPL(byte) \
8062 prefix##IMPL(word)
8063/* clang-format on */
8064
8065SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(SSE2NEON_CMP_EQUAL_ORDERED_)
8066
8067#define SSE2NEON_CMPESTR_LIST \
8068 _(CMP_UBYTE_EQUAL_ANY, cmp_byte_equal_any) \
8069 _(CMP_UWORD_EQUAL_ANY, cmp_word_equal_any) \
8070 _(CMP_SBYTE_EQUAL_ANY, cmp_byte_equal_any) \
8071 _(CMP_SWORD_EQUAL_ANY, cmp_word_equal_any) \
8072 _(CMP_UBYTE_RANGES, cmp_ubyte_ranges) \
8073 _(CMP_UWORD_RANGES, cmp_uword_ranges) \
8074 _(CMP_SBYTE_RANGES, cmp_sbyte_ranges) \
8075 _(CMP_SWORD_RANGES, cmp_sword_ranges) \
8076 _(CMP_UBYTE_EQUAL_EACH, cmp_byte_equal_each) \
8077 _(CMP_UWORD_EQUAL_EACH, cmp_word_equal_each) \
8078 _(CMP_SBYTE_EQUAL_EACH, cmp_byte_equal_each) \
8079 _(CMP_SWORD_EQUAL_EACH, cmp_word_equal_each) \
8080 _(CMP_UBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \
8081 _(CMP_UWORD_EQUAL_ORDERED, cmp_word_equal_ordered) \
8082 _(CMP_SBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \
8083 _(CMP_SWORD_EQUAL_ORDERED, cmp_word_equal_ordered)
8084
8085enum {
8086#define _(name, func_suffix) name,
8087 SSE2NEON_CMPESTR_LIST
8088#undef _
8089};
8090typedef int (*cmpestr_func_t)(__m128i a, int la, __m128i b, int lb);
8091static cmpestr_func_t _sse2neon_cmpfunc_table[] = {
8092#define _(name, func_suffix) _sse2neon_##func_suffix,
8093 SSE2NEON_CMPESTR_LIST
8094#undef _
8095};
8096
8097FORCE_INLINE int _sse2neon_sido_negative(int res, int lb, int imm8, int bound)
8098{
8099 switch (imm8 & 0x30) {
8100 case _SIDD_NEGATIVE_POLARITY:
8101 res ^= 0xffffffff;
8102 break;
8103 case _SIDD_MASKED_NEGATIVE_POLARITY:
8104 res ^= (1 << lb) - 1;
8105 break;
8106 default:
8107 break;
8108 }
8109
8110 return res & ((bound == 8) ? 0xFF : 0xFFFF);
8111}
8112
8113FORCE_INLINE int _sse2neon_clz(unsigned int x)
8114{
8115#ifdef _MSC_VER
8116 unsigned long cnt = 0;
8117 if (_BitScanReverse(&cnt, x))
8118 return 31 - cnt;
8119 return 32;
8120#else
8121 return x != 0 ? __builtin_clz(x) : 32;
8122#endif
8123}
8124
8125FORCE_INLINE int _sse2neon_ctz(unsigned int x)
8126{
8127#ifdef _MSC_VER
8128 unsigned long cnt = 0;
8129 if (_BitScanForward(&cnt, x))
8130 return cnt;
8131 return 32;
8132#else
8133 return x != 0 ? __builtin_ctz(x) : 32;
8134#endif
8135}
8136
8137FORCE_INLINE int _sse2neon_ctzll(unsigned long long x)
8138{
8139#ifdef _MSC_VER
8140 unsigned long cnt;
8141#if defined(SSE2NEON_HAS_BITSCAN64)
8142 if (_BitScanForward64(&cnt, x))
8143 return (int) (cnt);
8144#else
8145 if (_BitScanForward(&cnt, (unsigned long) (x)))
8146 return (int) cnt;
8147 if (_BitScanForward(&cnt, (unsigned long) (x >> 32)))
8148 return (int) (cnt + 32);
8149#endif /* SSE2NEON_HAS_BITSCAN64 */
8150 return 64;
8151#else /* assume GNU compatible compilers */
8152 return x != 0 ? __builtin_ctzll(x) : 64;
8153#endif
8154}
8155
8156#define SSE2NEON_MIN(x, y) (x) < (y) ? (x) : (y)
8157
8158#define SSE2NEON_CMPSTR_SET_UPPER(var, imm) \
8159 const int var = (imm & 0x01) ? 8 : 16
8160
8161#define SSE2NEON_CMPESTRX_LEN_PAIR(a, b, la, lb) \
8162 int tmp1 = la ^ (la >> 31); \
8163 la = tmp1 - (la >> 31); \
8164 int tmp2 = lb ^ (lb >> 31); \
8165 lb = tmp2 - (lb >> 31); \
8166 la = SSE2NEON_MIN(la, bound); \
8167 lb = SSE2NEON_MIN(lb, bound)
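// An illustrative note, not part of the original header: the xor/subtract
// pair above is a branchless absolute value. For la = -3, la >> 31 is -1, and
// ((-3) ^ (-1)) - (-1) = 2 + 1 = 3, so a negative explicit length is treated
// as its magnitude before being clamped to the 8- or 16-character bound.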
8168
8169// Compare all pairs of characters in strings a and b,
8170// then aggregate the result.
8171// As the only difference between PCMPESTR* and PCMPISTR* is how the string
8172// lengths are calculated, we use SSE2NEON_CMP{I,E}STRX_LEN_PAIR to get the
8173// lengths of strings a and b.
8174#define SSE2NEON_COMP_AGG(a, b, la, lb, imm8, IE) \
8175 SSE2NEON_CMPSTR_SET_UPPER(bound, imm8); \
8176 SSE2NEON_##IE##_LEN_PAIR(a, b, la, lb); \
8177 int r2 = (_sse2neon_cmpfunc_table[imm8 & 0x0f])(a, la, b, lb); \
8178 r2 = _sse2neon_sido_negative(r2, lb, imm8, bound)
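// Putting the pieces together (illustrative only): for
// imm8 = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY,
// SSE2NEON_COMP_AGG sets bound = 16, clamps both lengths, dispatches through
// _sse2neon_cmpfunc_table[imm8 & 0x0f] to _sse2neon_cmp_byte_equal_each, and
// finally lets _sse2neon_sido_negative() invert the 16-bit result mask.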
8179
8180#define SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8) \
8181 return (r2 == 0) ? bound \
8182 : ((imm8 & 0x40) ? (31 - _sse2neon_clz(r2)) \
8183 : _sse2neon_ctz(r2))
8184
8185#define SSE2NEON_CMPSTR_GENERATE_MASK(dst) \
8186 __m128i dst = vreinterpretq_m128i_u8(vdupq_n_u8(0)); \
8187 if (imm8 & 0x40) { \
8188 if (bound == 8) { \
8189 uint16x8_t tmp = vtstq_u16(vdupq_n_u16(r2), \
8190 vld1q_u16(_sse2neon_cmpestr_mask16b)); \
8191 dst = vreinterpretq_m128i_u16(vbslq_u16( \
8192 tmp, vdupq_n_u16(-1), vreinterpretq_u16_m128i(dst))); \
8193 } else { \
8194 uint8x16_t vec_r2 = \
8195 vcombine_u8(vdup_n_u8(r2), vdup_n_u8(r2 >> 8)); \
8196 uint8x16_t tmp = \
8197 vtstq_u8(vec_r2, vld1q_u8(_sse2neon_cmpestr_mask8b)); \
8198 dst = vreinterpretq_m128i_u8( \
8199 vbslq_u8(tmp, vdupq_n_u8(-1), vreinterpretq_u8_m128i(dst))); \
8200 } \
8201 } else { \
8202 if (bound == 16) { \
8203 dst = vreinterpretq_m128i_u16( \
8204 vsetq_lane_u16(r2 & 0xffff, vreinterpretq_u16_m128i(dst), 0)); \
8205 } else { \
8206 dst = vreinterpretq_m128i_u8( \
8207 vsetq_lane_u8(r2 & 0xff, vreinterpretq_u8_m128i(dst), 0)); \
8208 } \
8209 } \
8210 return dst
8211
8212// Compare packed strings in a and b with lengths la and lb using the control
8213// in imm8, and returns 1 if b did not contain a null character and the
8214// resulting mask was zero, and 0 otherwise.
8215// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestra
8216FORCE_INLINE int _mm_cmpestra(__m128i a,
8217 int la,
8218 __m128i b,
8219 int lb,
8220 const int imm8)
8221{
8222 int lb_cpy = lb;
8223 SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
8224 return !r2 & (lb_cpy > bound);
8225}
8226
8227// Compare packed strings in a and b with lengths la and lb using the control in
8228// imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise.
8229// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrc
8230FORCE_INLINE int _mm_cmpestrc(__m128i a,
8231 int la,
8232 __m128i b,
8233 int lb,
8234 const int imm8)
8235{
8236 SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
8237 return r2 != 0;
8238}
8239
8240// Compare packed strings in a and b with lengths la and lb using the control
8241// in imm8, and store the generated index in dst.
8242// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestri
8243FORCE_INLINE int _mm_cmpestri(__m128i a,
8244 int la,
8245 __m128i b,
8246 int lb,
8247 const int imm8)
8248{
8249 SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
8250 SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8);
8251}
8252
8253// Compare packed strings in a and b with lengths la and lb using the control
8254// in imm8, and store the generated mask in dst.
8255// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrm
8256FORCE_INLINE __m128i
8257_mm_cmpestrm(__m128i a, int la, __m128i b, int lb, const int imm8)
8258{
8259 SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
8260 SSE2NEON_CMPSTR_GENERATE_MASK(dst);
8261}
8262
8263// Compare packed strings in a and b with lengths la and lb using the control in
8264// imm8, and returns bit 0 of the resulting bit mask.
8265// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestro
8266FORCE_INLINE int _mm_cmpestro(__m128i a,
8267 int la,
8268 __m128i b,
8269 int lb,
8270 const int imm8)
8271{
8272 SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
8273 return r2 & 1;
8274}
8275
8276// Compare packed strings in a and b with lengths la and lb using the control in
8277// imm8, and returns 1 if any character in a was null, and 0 otherwise.
8278// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrs
8279FORCE_INLINE int _mm_cmpestrs(__m128i a,
8280 int la,
8281 __m128i b,
8282 int lb,
8283 const int imm8)
8284{
8285 (void) a;
8286 (void) b;
8287 (void) lb;
8288 SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
8289 return la <= (bound - 1);
8290}
8291
8292// Compare packed strings in a and b with lengths la and lb using the control in
8293// imm8, and returns 1 if any character in b was null, and 0 otherwise.
8294// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrz
8295FORCE_INLINE int _mm_cmpestrz(__m128i a,
8296 int la,
8297 __m128i b,
8298 int lb,
8299 const int imm8)
8300{
8301 (void) a;
8302 (void) b;
8303 (void) la;
8304 SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
8305 return lb <= (bound - 1);
8306}
8307
8308#define SSE2NEON_CMPISTRX_LENGTH(str, len, imm8) \
8309 do { \
8310 if (imm8 & 0x01) { \
8311 uint16x8_t equal_mask_##str = \
8312 vceqq_u16(vreinterpretq_u16_m128i(str), vdupq_n_u16(0)); \
8313 uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4); \
8314 uint64_t matches_##str = \
8315 vget_lane_u64(vreinterpret_u64_u8(res_##str), 0); \
8316 len = _sse2neon_ctzll(matches_##str) >> 3; \
8317 } else { \
8318 uint16x8_t equal_mask_##str = vreinterpretq_u16_u8( \
8319 vceqq_u8(vreinterpretq_u8_m128i(str), vdupq_n_u8(0))); \
8320 uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4); \
8321 uint64_t matches_##str = \
8322 vget_lane_u64(vreinterpret_u64_u8(res_##str), 0); \
8323 len = _sse2neon_ctzll(matches_##str) >> 2; \
8324 } \
8325 } while (0)
8326
8327#define SSE2NEON_CMPISTRX_LEN_PAIR(a, b, la, lb) \
8328 int la, lb; \
8329 do { \
8330 SSE2NEON_CMPISTRX_LENGTH(a, la, imm8); \
8331 SSE2NEON_CMPISTRX_LENGTH(b, lb, imm8); \
8332 } while (0)
8333
8334// Compare packed strings with implicit lengths in a and b using the control in
8335// imm8, and returns 1 if b did not contain a null character and the resulting
8336// mask was zero, and 0 otherwise.
8337// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistra
8338FORCE_INLINE int _mm_cmpistra(__m128i a, __m128i b, const int imm8)
8339{
8340 SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
8341 return !r2 & (lb >= bound);
8342}
8343
8344// Compare packed strings with implicit lengths in a and b using the control in
8345// imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise.
8346// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrc
8347FORCE_INLINE int _mm_cmpistrc(__m128i a, __m128i b, const int imm8)
8348{
8349 SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
8350 return r2 != 0;
8351}
8352
8353// Compare packed strings with implicit lengths in a and b using the control in
8354// imm8, and store the generated index in dst.
8355// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistri
8356FORCE_INLINE int _mm_cmpistri(__m128i a, __m128i b, const int imm8)
8357{
8358 SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
8359 SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8);
8360}
8361
8362// Compare packed strings with implicit lengths in a and b using the control in
8363// imm8, and store the generated mask in dst.
8364// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrm
8365FORCE_INLINE __m128i _mm_cmpistrm(__m128i a, __m128i b, const int imm8)
8366{
8367 SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
8368 SSE2NEON_CMPSTR_GENERATE_MASK(dst);
8369}
8370
8371// Compare packed strings with implicit lengths in a and b using the control in
8372// imm8, and returns bit 0 of the resulting bit mask.
8373// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistro
8374FORCE_INLINE int _mm_cmpistro(__m128i a, __m128i b, const int imm8)
8375{
8376 SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
8377 return r2 & 1;
8378}
8379
8380// Compare packed strings with implicit lengths in a and b using the control in
8381// imm8, and returns 1 if any character in a was null, and 0 otherwise.
8382// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrs
8383FORCE_INLINE int _mm_cmpistrs(__m128i a, __m128i b, const int imm8)
8384{
8385 (void) b;
8386 SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
8387 int la;
8388 SSE2NEON_CMPISTRX_LENGTH(a, la, imm8);
8389 return la <= (bound - 1);
8390}
8391
8392// Compare packed strings with implicit lengths in a and b using the control in
8393// imm8, and returns 1 if any character in b was null, and 0 otherwise.
8394// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrz
8395FORCE_INLINE int _mm_cmpistrz(__m128i a, __m128i b, const int imm8)
8396{
8397 (void) a;
8398 SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
8399 int lb;
8400 SSE2NEON_CMPISTRX_LENGTH(b, lb, imm8);
8401 return lb <= (bound - 1);
8402}
8403
8404// Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers
8405// in b for greater than.
8406FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
8407{
8408#if defined(__aarch64__) || defined(_M_ARM64)
8409 return vreinterpretq_m128i_u64(
8410 vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
8411#else
8412 return vreinterpretq_m128i_s64(vshrq_n_s64(
8413 vqsubq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a)),
8414 63));
8415#endif
8416}
8417
8418// Starting with the initial value in crc, accumulates a CRC32 value for
8419// unsigned 16-bit integer v, and stores the result in dst.
8420// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u16
8421FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
8422{
8423#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
8424 __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t"
8425 : [c] "+r"(crc)
8426 : [v] "r"(v));
8427#elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \
8428 (defined(_M_ARM64) && !defined(__clang__))
8429 crc = __crc32ch(crc, v);
8430#else
8431 crc = _mm_crc32_u8(crc, v & 0xff);
8432 crc = _mm_crc32_u8(crc, (v >> 8) & 0xff);
8433#endif
8434 return crc;
8435}
8436
8437// Starting with the initial value in crc, accumulates a CRC32 value for
8438// unsigned 32-bit integer v, and stores the result in dst.
8439// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u32
8440FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
8441{
8442#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
8443 __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t"
8444 : [c] "+r"(crc)
8445 : [v] "r"(v));
8446#elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \
8447 (defined(_M_ARM64) && !defined(__clang__))
8448 crc = __crc32cw(crc, v);
8449#else
8450 crc = _mm_crc32_u16(crc, v & 0xffff);
8451 crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff);
8452#endif
8453 return crc;
8454}
8455
8456// Starting with the initial value in crc, accumulates a CRC32 value for
8457// unsigned 64-bit integer v, and stores the result in dst.
8458// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u64
8459FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
8460{
8461#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
8462 __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
8463 : [c] "+r"(crc)
8464 : [v] "r"(v));
8465#elif (defined(_M_ARM64) && !defined(__clang__))
8466 crc = __crc32cd((uint32_t) crc, v);
8467#else
8468 crc = _mm_crc32_u32((uint32_t) (crc), v & 0xffffffff);
8469 crc = _mm_crc32_u32((uint32_t) (crc), (v >> 32) & 0xffffffff);
8470#endif
8471 return crc;
8472}
8473
8474// Starting with the initial value in crc, accumulates a CRC32 value for
8475// unsigned 8-bit integer v, and stores the result in dst.
8476// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u8
8477FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
8478{
8479#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
8480 __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t"
8481 : [c] "+r"(crc)
8482 : [v] "r"(v));
8483#elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \
8484 (defined(_M_ARM64) && !defined(__clang__))
8485 crc = __crc32cb(crc, v);
8486#else
8487 crc ^= v;
8488 for (int bit = 0; bit < 8; bit++) {
8489 if (crc & 1)
8490 crc = (crc >> 1) ^ UINT32_C(0x82f63b78);
8491 else
8492 crc = (crc >> 1);
8493 }
8494#endif
8495 return crc;
8496}
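// Usage sketch (an illustrative helper, not part of the original header):
// accumulating a CRC32-C checksum over a byte buffer with the intrinsics
// above, using the conventional all-ones initial value and final inversion.
//   static uint32_t crc32c_bytes(const uint8_t *p, size_t n)
//   {
//       uint32_t crc = 0xffffffff;
//       for (size_t i = 0; i < n; i++)
//           crc = _mm_crc32_u8(crc, p[i]);
//       return crc ^ 0xffffffff;
//   }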
8497
8498/* AES */
8499
8500#if !defined(__ARM_FEATURE_CRYPTO) && (!defined(_M_ARM64) || defined(__clang__))
8501/* clang-format off */
8502#define SSE2NEON_AES_SBOX(w) \
8503 { \
8504 w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \
8505 w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \
8506 w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \
8507 w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \
8508 w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \
8509 w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \
8510 w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \
8511 w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \
8512 w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \
8513 w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \
8514 w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \
8515 w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \
8516 w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \
8517 w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \
8518 w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \
8519 w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \
8520 w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \
8521 w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \
8522 w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \
8523 w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \
8524 w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \
8525 w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \
8526 w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \
8527 w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \
8528 w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \
8529 w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \
8530 w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \
8531 w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \
8532 w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \
8533 w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \
8534 w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \
8535 w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \
8536 w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \
8537 w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \
8538 w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \
8539 w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \
8540 w(0xb0), w(0x54), w(0xbb), w(0x16) \
8541 }
8542#define SSE2NEON_AES_RSBOX(w) \
8543 { \
8544 w(0x52), w(0x09), w(0x6a), w(0xd5), w(0x30), w(0x36), w(0xa5), \
8545 w(0x38), w(0xbf), w(0x40), w(0xa3), w(0x9e), w(0x81), w(0xf3), \
8546 w(0xd7), w(0xfb), w(0x7c), w(0xe3), w(0x39), w(0x82), w(0x9b), \
8547 w(0x2f), w(0xff), w(0x87), w(0x34), w(0x8e), w(0x43), w(0x44), \
8548 w(0xc4), w(0xde), w(0xe9), w(0xcb), w(0x54), w(0x7b), w(0x94), \
8549 w(0x32), w(0xa6), w(0xc2), w(0x23), w(0x3d), w(0xee), w(0x4c), \
8550 w(0x95), w(0x0b), w(0x42), w(0xfa), w(0xc3), w(0x4e), w(0x08), \
8551 w(0x2e), w(0xa1), w(0x66), w(0x28), w(0xd9), w(0x24), w(0xb2), \
8552 w(0x76), w(0x5b), w(0xa2), w(0x49), w(0x6d), w(0x8b), w(0xd1), \
8553 w(0x25), w(0x72), w(0xf8), w(0xf6), w(0x64), w(0x86), w(0x68), \
8554 w(0x98), w(0x16), w(0xd4), w(0xa4), w(0x5c), w(0xcc), w(0x5d), \
8555 w(0x65), w(0xb6), w(0x92), w(0x6c), w(0x70), w(0x48), w(0x50), \
8556 w(0xfd), w(0xed), w(0xb9), w(0xda), w(0x5e), w(0x15), w(0x46), \
8557 w(0x57), w(0xa7), w(0x8d), w(0x9d), w(0x84), w(0x90), w(0xd8), \
8558 w(0xab), w(0x00), w(0x8c), w(0xbc), w(0xd3), w(0x0a), w(0xf7), \
8559 w(0xe4), w(0x58), w(0x05), w(0xb8), w(0xb3), w(0x45), w(0x06), \
8560 w(0xd0), w(0x2c), w(0x1e), w(0x8f), w(0xca), w(0x3f), w(0x0f), \
8561 w(0x02), w(0xc1), w(0xaf), w(0xbd), w(0x03), w(0x01), w(0x13), \
8562 w(0x8a), w(0x6b), w(0x3a), w(0x91), w(0x11), w(0x41), w(0x4f), \
8563 w(0x67), w(0xdc), w(0xea), w(0x97), w(0xf2), w(0xcf), w(0xce), \
8564 w(0xf0), w(0xb4), w(0xe6), w(0x73), w(0x96), w(0xac), w(0x74), \
8565 w(0x22), w(0xe7), w(0xad), w(0x35), w(0x85), w(0xe2), w(0xf9), \
8566 w(0x37), w(0xe8), w(0x1c), w(0x75), w(0xdf), w(0x6e), w(0x47), \
8567 w(0xf1), w(0x1a), w(0x71), w(0x1d), w(0x29), w(0xc5), w(0x89), \
8568 w(0x6f), w(0xb7), w(0x62), w(0x0e), w(0xaa), w(0x18), w(0xbe), \
8569 w(0x1b), w(0xfc), w(0x56), w(0x3e), w(0x4b), w(0xc6), w(0xd2), \
8570 w(0x79), w(0x20), w(0x9a), w(0xdb), w(0xc0), w(0xfe), w(0x78), \
8571 w(0xcd), w(0x5a), w(0xf4), w(0x1f), w(0xdd), w(0xa8), w(0x33), \
8572 w(0x88), w(0x07), w(0xc7), w(0x31), w(0xb1), w(0x12), w(0x10), \
8573 w(0x59), w(0x27), w(0x80), w(0xec), w(0x5f), w(0x60), w(0x51), \
8574 w(0x7f), w(0xa9), w(0x19), w(0xb5), w(0x4a), w(0x0d), w(0x2d), \
8575 w(0xe5), w(0x7a), w(0x9f), w(0x93), w(0xc9), w(0x9c), w(0xef), \
8576 w(0xa0), w(0xe0), w(0x3b), w(0x4d), w(0xae), w(0x2a), w(0xf5), \
8577 w(0xb0), w(0xc8), w(0xeb), w(0xbb), w(0x3c), w(0x83), w(0x53), \
8578 w(0x99), w(0x61), w(0x17), w(0x2b), w(0x04), w(0x7e), w(0xba), \
8579 w(0x77), w(0xd6), w(0x26), w(0xe1), w(0x69), w(0x14), w(0x63), \
8580 w(0x55), w(0x21), w(0x0c), w(0x7d) \
8581 }
8582/* clang-format on */
8583
8584/* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */
8585#define SSE2NEON_AES_H0(x) (x)
8586static const uint8_t _sse2neon_sbox[256] = SSE2NEON_AES_SBOX(SSE2NEON_AES_H0);
8587static const uint8_t _sse2neon_rsbox[256] = SSE2NEON_AES_RSBOX(SSE2NEON_AES_H0);
8588#undef SSE2NEON_AES_H0
8589
8590/* x_time function and matrix multiply function */
8591#if !defined(__aarch64__) && !defined(_M_ARM64)
8592#define SSE2NEON_XT(x) (((x) << 1) ^ ((((x) >> 7) & 1) * 0x1b))
8593#define SSE2NEON_MULTIPLY(x, y) \
8594 (((y & 1) * x) ^ ((y >> 1 & 1) * SSE2NEON_XT(x)) ^ \
8595 ((y >> 2 & 1) * SSE2NEON_XT(SSE2NEON_XT(x))) ^ \
8596 ((y >> 3 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x)))) ^ \
8597 ((y >> 4 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x))))))
8598#endif
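// Worked example (illustrative only): SSE2NEON_XT doubles a field element in
// GF(2^8) modulo the AES polynomial, so (uint8_t) SSE2NEON_XT(0x80) == 0x1b,
// and SSE2NEON_MULTIPLY(0x57, 0x02) == 0xae, matching the classic FIPS-197
// MixColumns arithmetic used by the ARMv7 AES fallback below.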
8599
8600// In the absence of crypto extensions, implement aesenc using regular NEON
8601// intrinsics instead. See:
8602// https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
8603// https://www.workofard.com/2017/07/ghash-for-low-end-cores/
8604// for more information.
8605FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey)
8606{
8607#if defined(__aarch64__) || defined(_M_ARM64)
8608 static const uint8_t shift_rows[] = {
8609 0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
8610 0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
8611 };
8612 static const uint8_t ror32by8[] = {
8613 0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
8614 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
8615 };
8616
8617 uint8x16_t v;
8618 uint8x16_t w = vreinterpretq_u8_m128i(a);
8619
8620 /* shift rows */
8621 w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
8622
8623 /* sub bytes */
8624 // Here, we split the whole 256-byte table into four 64-byte tables and
8625 // look them up one after another. Each lookup loads the next 64-byte
8626 // chunk of the table, so the indices passed to `vqtbx4q_u8()` have to be
8627 // reduced by the same 64-byte offset as the chunk that was loaded
8628 // (e.g. `w - 0x40` for the second chunk).
8629 v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w);
8630 // 'w - 0x40' is equivalent to 'vsubq_u8(w, vdupq_n_u8(0x40))'
8631 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40);
8632 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80);
8633 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0);
8634
8635 /* mix columns */
8636 w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
8637 w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
8638 w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
8639
8640 /* add round key */
8641 return vreinterpretq_m128i_u8(w) ^ RoundKey;
8642
8643#else /* ARMv7-A implementation for a table-based AES */
8644#define SSE2NEON_AES_B2W(b0, b1, b2, b3) \
8645 (((uint32_t) (b3) << 24) | ((uint32_t) (b2) << 16) | \
8646 ((uint32_t) (b1) << 8) | (uint32_t) (b0))
8647// multiplying 'x' by 2 in GF(2^8)
8648#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */))
8649// multiplying 'x' by 3 in GF(2^8)
8650#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
8651#define SSE2NEON_AES_U0(p) \
8652 SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
8653#define SSE2NEON_AES_U1(p) \
8654 SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p)
8655#define SSE2NEON_AES_U2(p) \
8656 SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
8657#define SSE2NEON_AES_U3(p) \
8658 SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))
8659
8660 // This table combines sub_bytes() and mix_columns() for every possible
8661 // byte value; shift_rows() is realized by the byte indexing further below.
8662 static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = {
8663 SSE2NEON_AES_SBOX(SSE2NEON_AES_U0),
8664 SSE2NEON_AES_SBOX(SSE2NEON_AES_U1),
8665 SSE2NEON_AES_SBOX(SSE2NEON_AES_U2),
8666 SSE2NEON_AES_SBOX(SSE2NEON_AES_U3),
8667 };
8668#undef SSE2NEON_AES_B2W
8669#undef SSE2NEON_AES_F2
8670#undef SSE2NEON_AES_F3
8671#undef SSE2NEON_AES_U0
8672#undef SSE2NEON_AES_U1
8673#undef SSE2NEON_AES_U2
8674#undef SSE2NEON_AES_U3
8675
8676 uint32_t x0 = _mm_cvtsi128_si32(a); // get a[31:0]
8677 uint32_t x1 =
8678 _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55)); // get a[63:32]
8679 uint32_t x2 =
8680 _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xAA)); // get a[95:64]
8681 uint32_t x3 =
8682 _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF)); // get a[127:96]
8683
8684 // finish the modulo addition step in mix_columns()
8685 __m128i out = _mm_set_epi32(
8686 (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^
8687 aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]),
8688 (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^
8689 aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]),
8690 (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^
8691 aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]),
8692 (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^
8693 aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24]));
8694
8695 return _mm_xor_si128(out, RoundKey);
8696#endif
8697}
8698
8699// Perform one round of an AES decryption flow on data (state) in a using the
8700// round key in RoundKey, and store the result in dst.
8701// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128
8702FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
8703{
8704#if defined(__aarch64__) || defined(_M_ARM64)
8705 static const uint8_t inv_shift_rows[] = {
8706 0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb,
8707 0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3,
8708 };
8709 static const uint8_t ror32by8[] = {
8710 0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
8711 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
8712 };
8713
8714 uint8x16_t v;
8715 uint8x16_t w = vreinterpretq_u8_m128i(a);
8716
8717 // inverse shift rows
8718 w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows));
8719
8720 // inverse sub bytes
8721 v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w);
8722 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40);
8723 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80);
8724 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0);
8725
8726 // inverse mix columns
8727 // multiplying 'v' by 4 in GF(2^8)
8728 w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
8729 w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b);
8730 v ^= w;
8731 v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w);
8732
8733 w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) &
8734 0x1b); // multiplying 'v' by 2 in GF(2^8)
8735 w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
8736 w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
8737
8738 // add round key
8739 return vreinterpretq_m128i_u8(w) ^ RoundKey;
8740
8741#else /* ARMv7-A NEON implementation */
8742 /* FIXME: optimize for NEON */
8743 uint8_t i, e, f, g, h, v[4][4];
8744 uint8_t *_a = (uint8_t *) &a;
8745 for (i = 0; i < 16; ++i) {
8746 v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]];
8747 }
8748
8749 // inverse mix columns
8750 for (i = 0; i < 4; ++i) {
8751 e = v[i][0];
8752 f = v[i][1];
8753 g = v[i][2];
8754 h = v[i][3];
8755
8756 v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^
8757 SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09);
8758 v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^
8759 SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d);
8760 v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 0x09) ^
8761 SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b);
8762 v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^
8763 SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e);
8764 }
8765
8766 return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey;
8767#endif
8768}
8769
8770// Perform the last round of an AES encryption flow on data (state) in a using
8771// the round key in RoundKey, and store the result in dst.
8772// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128
8773FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
8774{
8775#if defined(__aarch64__) || defined(_M_ARM64)
8776 static const uint8_t shift_rows[] = {
8777 0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
8778 0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
8779 };
8780
8781 uint8x16_t v;
8782 uint8x16_t w = vreinterpretq_u8_m128i(a);
8783
8784 // shift rows
8785 w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
8786
8787 // sub bytes
8788 v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w);
8789 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40);
8790 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80);
8791 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0);
8792
8793 // add round key
8794 return vreinterpretq_m128i_u8(v) ^ RoundKey;
8795
8796#else /* ARMv7-A implementation */
8797 uint8_t v[16] = {
8798 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 0)],
8799 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 5)],
8800 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 10)],
8801 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 15)],
8802 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 4)],
8803 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 9)],
8804 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 14)],
8805 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 3)],
8806 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 8)],
8807 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 13)],
8808 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 2)],
8809 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 7)],
8810 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 12)],
8811 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 1)],
8812 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 6)],
8813 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 11)],
8814 };
8815
8816 return vreinterpretq_m128i_u8(vld1q_u8(v)) ^ RoundKey;
8817#endif
8818}
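// A minimal usage sketch (the helper name and the rk[] layout are
// illustrative, not part of this header), assuming eleven pre-expanded
// AES-128 round keys rk[0..10]: the intrinsics above compose into one block
// encryption in the usual AES-NI fashion.
FORCE_INLINE __m128i example_aes128_encrypt_block(__m128i block,
                                                  const __m128i rk[11])
{
    block = _mm_xor_si128(block, rk[0]);        // initial AddRoundKey
    for (int i = 1; i < 10; i++)
        block = _mm_aesenc_si128(block, rk[i]); // rounds 1..9
    return _mm_aesenclast_si128(block, rk[10]); // final round, no MixColumns
}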
8819
8820// Perform the last round of an AES decryption flow on data (state) in a using
8821// the round key in RoundKey, and store the result in dst.
8822// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128
8823FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey)
8824{
8825#if defined(__aarch64__) || defined(_M_ARM64)
8826 static const uint8_t inv_shift_rows[] = {
8827 0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb,
8828 0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3,
8829 };
8830
8831 uint8x16_t v;
8832 uint8x16_t w = vreinterpretq_u8_m128i(a);
8833
8834 // inverse shift rows
8835 w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows));
8836
8837 // inverse sub bytes
8838 v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w);
8839 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40);
8840 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80);
8841 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0);
8842
8843 // add round key
8844 return vreinterpretq_m128i_u8(v) ^ RoundKey;
8845
8846#else /* ARMv7-A NEON implementation */
8847 /* FIXME: optimize for NEON */
8848 uint8_t v[4][4];
8849 uint8_t *_a = (uint8_t *) &a;
8850 for (int i = 0; i < 16; ++i) {
8851 v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]];
8852 }
8853
8854 return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey;
8855#endif
8856}
8857
8858// Perform the InvMixColumns transformation on a and store the result in dst.
8859// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128
8860FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a)
8861{
8862#if defined(__aarch64__) || defined(_M_ARM64)
8863 static const uint8_t ror32by8[] = {
8864 0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
8865 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
8866 };
8867 uint8x16_t v = vreinterpretq_u8_m128i(a);
8868 uint8x16_t w;
8869
8870 // multiplying 'v' by 4 in GF(2^8)
8871 w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
8872 w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b);
8873 v ^= w;
8874 v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w);
8875
8876 // multiplying 'v' by 2 in GF(2^8)
8877 w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
8878 w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
8879 w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
8880 return vreinterpretq_m128i_u8(w);
8881
8882#else /* ARMv7-A NEON implementation */
8883 uint8_t i, e, f, g, h, v[4][4];
8884 vst1q_u8((uint8_t *) v, vreinterpretq_u8_m128i(a));
8885 for (i = 0; i < 4; ++i) {
8886 e = v[i][0];
8887 f = v[i][1];
8888 g = v[i][2];
8889 h = v[i][3];
8890
8891 v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^
8892 SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09);
8893 v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^
8894 SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d);
8895 v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 0x09) ^
8896 SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b);
8897 v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^
8898 SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e);
8899 }
8900
8901 return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v));
8902#endif
8903}
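// The matching decryption sketch (again with an illustrative helper name):
// as with x86 AES-NI, the "equivalent inverse cipher" is used, so the caller
// is assumed to pass decryption round keys drk[0..10] with drk[0] = rk[10],
// drk[10] = rk[0], and drk[i] = _mm_aesimc_si128(rk[10 - i]) for i = 1..9.
FORCE_INLINE __m128i example_aes128_decrypt_block(__m128i block,
                                                  const __m128i drk[11])
{
    block = _mm_xor_si128(block, drk[0]);        // initial AddRoundKey
    for (int i = 1; i < 10; i++)
        block = _mm_aesdec_si128(block, drk[i]); // inverse rounds 1..9
    return _mm_aesdeclast_si128(block, drk[10]); // final round, no InvMixColumns
}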
8904
8905// Assist in expanding the AES cipher key by computing steps towards generating
8906// a round key for encryption cipher using data from a and an 8-bit round
8907// constant specified in imm8, and store the result in dst.
8908// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128
8909//
8910// Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist.
8911// This instruction generates a round key for AES encryption. See
8912// https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/
8913// for details.
8914FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
8915{
8916#if defined(__aarch64__) || defined(_M_ARM64)
8917 uint8x16_t _a = vreinterpretq_u8_m128i(a);
8918 uint8x16_t v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), _a);
8919 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), _a - 0x40);
8920 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), _a - 0x80);
8921 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), _a - 0xc0);
8922
8923 uint32x4_t v_u32 = vreinterpretq_u32_u8(v);
8924 uint32x4_t ror_v = vorrq_u32(vshrq_n_u32(v_u32, 8), vshlq_n_u32(v_u32, 24));
8925 uint32x4_t ror_xor_v = veorq_u32(ror_v, vdupq_n_u32(rcon));
8926
8927 return vreinterpretq_m128i_u32(vtrn2q_u32(v_u32, ror_xor_v));
8928
8929#else /* ARMv7-A NEON implementation */
8930 uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55));
8931 uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF));
8932 for (int i = 0; i < 4; ++i) {
8933 ((uint8_t *) &X1)[i] = _sse2neon_sbox[((uint8_t *) &X1)[i]];
8934 ((uint8_t *) &X3)[i] = _sse2neon_sbox[((uint8_t *) &X3)[i]];
8935 }
8936 return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3,
8937 ((X1 >> 8) | (X1 << 24)) ^ rcon, X1);
8938#endif
8939}
8940#undef SSE2NEON_AES_SBOX
8941#undef SSE2NEON_AES_RSBOX
8942
8943#if !defined(__aarch64__) && !defined(_M_ARM64)
8944#undef SSE2NEON_XT
8945#undef SSE2NEON_MULTIPLY
8946#endif
8947
8948#else /* __ARM_FEATURE_CRYPTO */
8949// Implements equivalent of 'aesenc' by combining AESE (with an empty key) and
8950// AESMC and then manually applying the real key as an xor operation. This
8951// unfortunately means an additional xor op; the compiler should be able to
8952// optimize this away for repeated calls however. See
8953// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a
8954// for more details.
8955FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b)
8956{
8957 return vreinterpretq_m128i_u8(veorq_u8(
8958 vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
8959 vreinterpretq_u8_m128i(b)));
8960}
8961
8962// Perform one round of an AES decryption flow on data (state) in a using the
8963// round key in RoundKey, and store the result in dst.
8964// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128
8965FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
8966{
8967 return vreinterpretq_m128i_u8(veorq_u8(
8968 vaesimcq_u8(vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
8969 vreinterpretq_u8_m128i(RoundKey)));
8970}
8971
8972// Perform the last round of an AES encryption flow on data (state) in a using
8973// the round key in RoundKey, and store the result in dst.
8974// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128
8975FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
8976{
8977 return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8(
8978 vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
8979 RoundKey);
8980}
8981
8982// Perform the last round of an AES decryption flow on data (state) in a using
8983// the round key in RoundKey, and store the result in dst.
8984// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128
8985FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey)
8986{
8987 return vreinterpretq_m128i_u8(
8988 veorq_u8(vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0)),
8989 vreinterpretq_u8_m128i(RoundKey)));
8990}
8991
8992// Perform the InvMixColumns transformation on a and store the result in dst.
8993// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128
8994FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a)
8995{
8996 return vreinterpretq_m128i_u8(vaesimcq_u8(vreinterpretq_u8_m128i(a)));
8997}
8998
8999// Assist in expanding the AES cipher key by computing steps towards generating
9000// a round key for encryption cipher using data from a and an 8-bit round
9001// constant specified in imm8, and store the result in dst.
9002// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128
9003FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
9004{
9005 // AESE does ShiftRows and SubBytes on A
9006 uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0));
9007
9008#ifndef _MSC_VER
9009 uint8x16_t dest = {
9010 // Undo ShiftRows step from AESE and extract X1 and X3
9011 u8[0x4], u8[0x1], u8[0xE], u8[0xB], // SubBytes(X1)
9012 u8[0x1], u8[0xE], u8[0xB], u8[0x4], // ROT(SubBytes(X1))
9013 u8[0xC], u8[0x9], u8[0x6], u8[0x3], // SubBytes(X3)
9014 u8[0x9], u8[0x6], u8[0x3], u8[0xC], // ROT(SubBytes(X3))
9015 };
9016 uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon};
9017 return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r);
9018#else
9019 // We have to do this hack because MSVC is strictly adhering to the CPP
9020 // standard, in particular C++03 8.5.1 sub-section 15, which states that
9021 // unions must be initialized by their first member type.
9022
9023 // As per the Windows ARM64 ABI, it is always little endian, so this works
9024 __n128 dest{
9025 ((uint64_t) u8.n128_u8[0x4] << 0) | ((uint64_t) u8.n128_u8[0x1] << 8) |
9026 ((uint64_t) u8.n128_u8[0xE] << 16) |
9027 ((uint64_t) u8.n128_u8[0xB] << 24) |
9028 ((uint64_t) u8.n128_u8[0x1] << 32) |
9029 ((uint64_t) u8.n128_u8[0xE] << 40) |
9030 ((uint64_t) u8.n128_u8[0xB] << 48) |
9031 ((uint64_t) u8.n128_u8[0x4] << 56),
9032 ((uint64_t) u8.n128_u8[0xC] << 0) | ((uint64_t) u8.n128_u8[0x9] << 8) |
9033 ((uint64_t) u8.n128_u8[0x6] << 16) |
9034 ((uint64_t) u8.n128_u8[0x3] << 24) |
9035 ((uint64_t) u8.n128_u8[0x9] << 32) |
9036 ((uint64_t) u8.n128_u8[0x6] << 40) |
9037 ((uint64_t) u8.n128_u8[0x3] << 48) |
9038 ((uint64_t) u8.n128_u8[0xC] << 56)};
9039
9040 dest.n128_u32[1] = dest.n128_u32[1] ^ rcon;
9041 dest.n128_u32[3] = dest.n128_u32[3] ^ rcon;
9042
9043 return dest;
9044#endif
9045}
9046#endif
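// A sketch of how _mm_aeskeygenassist_si128() is typically consumed (the
// helper name is illustrative): one AES-128 key-expansion step in the classic
// AES-NI style, built from _mm_shuffle_epi32, _mm_slli_si128 and
// _mm_xor_si128 defined earlier in this file.
FORCE_INLINE __m128i example_aes128_expand_step(__m128i key, __m128i assist)
{
    assist = _mm_shuffle_epi32(assist, 0xff);  // broadcast the rot/sub'd word
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
    return _mm_xor_si128(key, assist);
}
// e.g. rk[1] = example_aes128_expand_step(
//                  rk[0], _mm_aeskeygenassist_si128(rk[0], 0x01));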
9047
9048/* Others */
9049
9050// Perform a carry-less multiplication of two 64-bit integers, selected from a
9051// and b according to imm8, and store the results in dst.
9052// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128
9053FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
9054{
9055 uint64x2_t a = vreinterpretq_u64_m128i(_a);
9056 uint64x2_t b = vreinterpretq_u64_m128i(_b);
9057 switch (imm & 0x11) {
9058 case 0x00:
9059 return vreinterpretq_m128i_u64(
9060 _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b)));
9061 case 0x01:
9062 return vreinterpretq_m128i_u64(
9063 _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b)));
9064 case 0x10:
9065 return vreinterpretq_m128i_u64(
9066 _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b)));
9067 case 0x11:
9068 return vreinterpretq_m128i_u64(
9069 _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b)));
9070 default:
9071 abort();
9072 }
9073}
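// The selector bits behave as in the x86 original: imm[0] picks the half of
// the first operand, imm[4] the half of the second. For example, the four
// 64 x 64 partial products of a full 128-by-128-bit carry-less multiplication
// (variable names are illustrative):
//
//   __m128i lo  = _mm_clmulepi64_si128(x, y, 0x00);                // x[63:0]   * y[63:0]
//   __m128i mid = _mm_xor_si128(_mm_clmulepi64_si128(x, y, 0x01),  // x[127:64] * y[63:0]
//                               _mm_clmulepi64_si128(x, y, 0x10)); // x[63:0]   * y[127:64]
//   __m128i hi  = _mm_clmulepi64_si128(x, y, 0x11);                // x[127:64] * y[127:64]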
9074
9075FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode(void)
9076{
9077 union {
9078 fpcr_bitfield field;
9079#if defined(__aarch64__) || defined(_M_ARM64)
9080 uint64_t value;
9081#else
9082 uint32_t value;
9083#endif
9084 } r;
9085
9086#if defined(__aarch64__) || defined(_M_ARM64)
9087 r.value = _sse2neon_get_fpcr();
9088#else
9089 __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
9090#endif
9091
9092 return r.field.bit24 ? _MM_DENORMALS_ZERO_ON : _MM_DENORMALS_ZERO_OFF;
9093}
9094
9095// Count the number of bits set to 1 in unsigned 32-bit integer a, and
9096// return that count in dst.
9097// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u32
9098FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
9099{
9100#if defined(__aarch64__) || defined(_M_ARM64)
9101#if __has_builtin(__builtin_popcount)
9102 return __builtin_popcount(a);
9103#elif defined(_MSC_VER)
9104 return _CountOneBits(a);
9105#else
9106 return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a)));
9107#endif
9108#else
9109 uint32_t count = 0;
9110 uint8x8_t input_val, count8x8_val;
9111 uint16x4_t count16x4_val;
9112 uint32x2_t count32x2_val;
9113
9114 input_val = vld1_u8((uint8_t *) &a);
9115 count8x8_val = vcnt_u8(input_val);
9116 count16x4_val = vpaddl_u8(count8x8_val);
9117 count32x2_val = vpaddl_u16(count16x4_val);
9118
9119 vst1_u32(&count, count32x2_val);
9120 return count;
9121#endif
9122}
9123
9124// Count the number of bits set to 1 in unsigned 64-bit integer a, and
9125// return that count in dst.
9126// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u64
9127FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
9128{
9129#if defined(__aarch64__) || defined(_M_ARM64)
9130#if __has_builtin(__builtin_popcountll)
9131 return __builtin_popcountll(a);
9132#elif defined(_MSC_VER)
9133 return _CountOneBits64(a);
9134#else
9135 return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a)));
9136#endif
9137#else
9138 uint64_t count = 0;
9139 uint8x8_t input_val, count8x8_val;
9140 uint16x4_t count16x4_val;
9141 uint32x2_t count32x2_val;
9142 uint64x1_t count64x1_val;
9143
9144 input_val = vld1_u8((uint8_t *) &a);
9145 count8x8_val = vcnt_u8(input_val);
9146 count16x4_val = vpaddl_u8(count8x8_val);
9147 count32x2_val = vpaddl_u16(count16x4_val);
9148 count64x1_val = vpaddl_u32(count32x2_val);
9149 vst1_u64(&count, count64x1_val);
9150 return count;
9151#endif
9152}
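// Quick check that holds on every code path above: 0xF0F00001 has nine set
// bits, so _mm_popcnt_u32(0xF0F00001) == 9 and
// _mm_popcnt_u64(0xF0F00001F0F00001ULL) == 18.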
9153
9154FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
9155{
9156 // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
9157 // regardless of the value of the FZ bit.
9158 union {
9159 fpcr_bitfield field;
9160#if defined(__aarch64__) || defined(_M_ARM64)
9161 uint64_t value;
9162#else
9163 uint32_t value;
9164#endif
9165 } r;
9166
9167#if defined(__aarch64__) || defined(_M_ARM64)
9168 r.value = _sse2neon_get_fpcr();
9169#else
9170 __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
9171#endif
9172
9173 r.field.bit24 = (flag & _MM_DENORMALS_ZERO_MASK) == _MM_DENORMALS_ZERO_ON;
9174
9175#if defined(__aarch64__) || defined(_M_ARM64)
9176 _sse2neon_set_fpcr(r.value);
9177#else
9178 __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
9179#endif
9180}
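// A short usage sketch for the two denormals-zero helpers (the code in
// between is a placeholder): flush denormals to zero while running
// speed-sensitive work, then restore the previous mode.
//
//   unsigned int prev = _sse2neon_mm_get_denormals_zero_mode();
//   _sse2neon_mm_set_denormals_zero_mode(_MM_DENORMALS_ZERO_ON);
//   /* ... denormal-heavy processing ... */
//   _sse2neon_mm_set_denormals_zero_mode(prev);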
9181
9182// Return the current 64-bit value of the processor's time-stamp counter.
9183// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=rdtsc
9184FORCE_INLINE uint64_t _rdtsc(void)
9185{
9186#if defined(__aarch64__) || defined(_M_ARM64)
9187 uint64_t val;
9188
9189 /* According to ARM DDI 0487F.c, from Armv8.0 to Armv8.5 inclusive, the
9190 * system counter is at least 56 bits wide; from Armv8.6, the counter
9191 * must be 64 bits wide. So the system counter could be less than 64
9192 * bits wide, in which case it is reported with the flag
9193 * 'cap_user_time_short' set.
9194 */
9195#if defined(_MSC_VER)
9196 val = _ReadStatusReg(ARM64_SYSREG(3, 3, 14, 0, 2));
9197#else
9198 __asm__ __volatile__("mrs %0, cntvct_el0" : "=r"(val));
9199#endif
9200
9201 return val;
9202#else
9203 uint32_t pmccntr, pmuseren, pmcntenset;
9204 // Read the user mode Performance Monitoring Unit (PMU)
9205 // User Enable Register (PMUSERENR) access permissions.
9206 __asm__ __volatile__("mrc p15, 0, %0, c9, c14, 0" : "=r"(pmuseren));
9207 if (pmuseren & 1) { // Allows reading PMUSERENR for user mode code.
9208 __asm__ __volatile__("mrc p15, 0, %0, c9, c12, 1" : "=r"(pmcntenset));
9209 if (pmcntenset & 0x80000000UL) { // Is it counting?
9210 __asm__ __volatile__("mrc p15, 0, %0, c9, c13, 0" : "=r"(pmccntr));
9211 // The counter is set up to count every 64th cycle
9212 return (uint64_t) (pmccntr) << 6;
9213 }
9214 }
9215
9216 // Fallback to syscall as we can't enable PMUSERENR in user mode.
9217 struct timeval tv;
9218 gettimeofday(&tv, NULL);
9219 return (uint64_t) (tv.tv_sec) * 1000000 + tv.tv_usec;
9220#endif
9221}
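// Note that on AArch64 this reads CNTVCT_EL0, which advances at the fixed
// generic-timer frequency (CNTFRQ_EL0) rather than at the CPU clock, so the
// usual interval pattern measures timer ticks, not cycles:
//
//   uint64_t t0 = _rdtsc();
//   /* ... code under measurement ... */
//   uint64_t elapsed_ticks = _rdtsc() - t0;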
9222
9223#if defined(__GNUC__) || defined(__clang__)
9224#pragma pop_macro("ALIGN_STRUCT")
9225#pragma pop_macro("FORCE_INLINE")
9226#endif
9227
9228#if defined(__GNUC__) && !defined(__clang__)
9229#pragma GCC pop_options
9230#endif
9231
9232#endif
FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b)
Definition: sse2neon.h:3712
FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2902
FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1371
FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag)
Definition: sse2neon.h:2364
FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
Definition: sse2neon.h:1438
FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1335
FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
Definition: sse2neon.h:6017
FORCE_INLINE __m128i _mm_abs_epi8(__m128i a)
Definition: sse2neon.h:6073
#define vreinterpret_m64_f32(x)
Definition: sse2neon.h:451
FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
Definition: sse2neon.h:6806
#define SSE2NEON_CMPSTR_GENERATE_MASK(dst)
Definition: sse2neon.h:8185
FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
Definition: sse2neon.h:633
#define _MM_FROUND_TO_POS_INF
Definition: sse2neon.h:332
#define vreinterpretq_u32_m128d(x)
Definition: sse2neon.h:493
#define vreinterpret_m64_s32(x)
Definition: sse2neon.h:442
FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
Definition: sse2neon.h:5713
FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0)
Definition: sse2neon.h:4990
#define SSE2NEON_CACHELINE_SIZE
Definition: sse2neon.h:257
FORCE_INLINE int _mm_cvt_ss2si(__m128 a)
Definition: sse2neon.h:1533
FORCE_INLINE unsigned int _mm_getcsr(void)
Definition: sse2neon.h:2484
FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
Definition: sse2neon.h:4780
#define vreinterpretq_m128_s32(x)
Definition: sse2neon.h:400
FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b)
Definition: sse2neon.h:7174
FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b)
Definition: sse2neon.h:4067
FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
Definition: sse2neon.h:3281
FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
Definition: sse2neon.h:5213
FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b)
Definition: sse2neon.h:1190
FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)
Definition: sse2neon.h:4460
FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b)
Definition: sse2neon.h:2000
FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b)
Definition: sse2neon.h:3195
FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2788
FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b)
Definition: sse2neon.h:4194
FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b)
Definition: sse2neon.h:3025
FORCE_INLINE __m128d _mm_setzero_pd(void)
Definition: sse2neon.h:5032
#define SSE2NEON_AES_U2(p)
FORCE_INLINE __m128i _mm_set1_epi16(short w)
Definition: sse2neon.h:4920
FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
Definition: sse2neon.h:5452
FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a)
Definition: sse2neon.h:1581
#define vreinterpretq_m128_f32(x)
Definition: sse2neon.h:390
float32x4_t __m128
Definition: sse2neon.h:366
FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
Definition: sse2neon.h:3958
FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b)
Definition: sse2neon.h:2998
FORCE_INLINE int _mm_cmpestrs(__m128i a, int la, __m128i b, int lb, const int imm8)
Definition: sse2neon.h:8279
FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
Definition: sse2neon.h:1919
FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b)
Definition: sse2neon.h:2928
static int _sse2neon_cmp_word_equal_each(__m128i a, int la, __m128i b, int lb)
Definition: sse2neon.h:7994
FORCE_INLINE void _mm_sfence(void)
Definition: sse2neon.h:2536
FORCE_INLINE __m128 _mm_load_ss(const float *p)
Definition: sse2neon.h:1863
FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b)
Definition: sse2neon.h:3213
FORCE_INLINE void _mm_stream_ps(float *p, __m128 a)
Definition: sse2neon.h:2775
FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
Definition: sse2neon.h:3053
#define SSE2NEON_MULTIPLY(x, y)
Definition: sse2neon.h:8593
#define SSE2NEON_GENERATE_CMP_RANGES(prefix)
Definition: sse2neon.h:7954
FORCE_INLINE __m128i _mm_set_epi8(signed char b15, signed char b14, signed char b13, signed char b12, signed char b11, signed char b10, signed char b9, signed char b8, signed char b7, signed char b6, signed char b5, signed char b4, signed char b3, signed char b2, signed char b1, signed char b0)
Definition: sse2neon.h:4863
FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
Definition: sse2neon.h:5908
FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b)
Definition: sse2neon.h:7371
#define vreinterpretq_m128i_s8(x)
Definition: sse2neon.h:417
FORCE_INLINE void _mm_storeu_si16(void *p, __m128i a)
Definition: sse2neon.h:2752
FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
Definition: sse2neon.h:8773
FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
Definition: sse2neon.h:6312
FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
Definition: sse2neon.h:5840
FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a)
Definition: sse2neon.h:1000
FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
Definition: sse2neon.h:3016
#define vreinterpret_m64_s8(x)
Definition: sse2neon.h:440
FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b)
Definition: sse2neon.h:4039
FORCE_INLINE int _mm_cmpestri(__m128i a, int la, __m128i b, int lb, const int imm8)
Definition: sse2neon.h:8243
FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
Definition: sse2neon.h:6162
#define _SIDD_MASKED_NEGATIVE_POLARITY
Definition: sse2neon.h:7671
#define vreinterpret_u16_m64(x)
Definition: sse2neon.h:455
FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
Definition: sse2neon.h:5940
#define vreinterpretq_m128i_u64(x)
Definition: sse2neon.h:425
FORCE_INLINE int _mm_cmpestrz(__m128i a, int la, __m128i b, int lb, const int imm8)
Definition: sse2neon.h:8295
FORCE_INLINE __m128i _mm_set_epi32(int, int, int, int)
Definition: sse2neon.h:4840
FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b)
Definition: sse2neon.h:3635
FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
Definition: sse2neon.h:592
FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
Definition: sse2neon.h:5809
FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
Definition: sse2neon.h:3944
FORCE_INLINE __m64 _mm_hsub_pi32(__m64 _a, __m64 _b)
Definition: sse2neon.h:6297
FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b)
Definition: sse2neon.h:708
#define SSE2NEON_CMPSTR_SET_UPPER(var, imm)
Definition: sse2neon.h:8158
FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
Definition: sse2neon.h:4478
#define vreinterpret_m64_s64(x)
Definition: sse2neon.h:443
FORCE_INLINE __m128i _mm_cmpistrm(__m128i a, __m128i b, const int imm8)
Definition: sse2neon.h:8365
#define vreinterpretq_u8_m128i(x)
Definition: sse2neon.h:435
FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count)
Definition: sse2neon.h:5323
FORCE_INLINE __m128d _mm_ceil_pd(__m128d)
Definition: sse2neon.h:6759
FORCE_INLINE void _mm_store_ps1(float *p, __m128 a)
Definition: sse2neon.h:2694
#define vreinterpretq_s16_m128i(x)
Definition: sse2neon.h:431
#define _MM_FLUSH_ZERO_MASK
Definition: sse2neon.h:348
FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
Definition: sse2neon.h:4698
#define vreinterpretq_u64_m128d(x)
Definition: sse2neon.h:494
FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1316
FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b)
Definition: sse2neon.h:6409
#define vreinterpretq_s8_m128(x)
Definition: sse2neon.h:412
FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a)
Definition: sse2neon.h:4641
FORCE_INLINE void _sse2neon_smp_mb(void)
Definition: sse2neon.h:196
FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b)
Definition: sse2neon.h:1199
FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
Definition: sse2neon.h:2920
FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
Definition: sse2neon.h:7391
FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x)
Definition: sse2neon.h:2492
FORCE_INLINE __m128d _mm_load_pd(const double *p)
Definition: sse2neon.h:4226
#define vreinterpretq_m128_u64(x)
Definition: sse2neon.h:396
FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
Definition: sse2neon.h:3426
FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
Definition: sse2neon.h:3272
FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
Definition: sse2neon.h:5729
#define vreinterpret_m64_u8(x)
Definition: sse2neon.h:445
#define vreinterpretq_m128d_s32(x)
Definition: sse2neon.h:483
FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr)
Definition: sse2neon.h:1955
FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b)
Definition: sse2neon.h:4680
FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)
Definition: sse2neon.h:2162
FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
Definition: sse2neon.h:5274
FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1717
FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
Definition: sse2neon.h:1448
FORCE_INLINE void _mm_storer_ps(float *p, __m128 a)
Definition: sse2neon.h:2734
FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
Definition: sse2neon.h:6545
FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
Definition: sse2neon.h:3312
FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
Definition: sse2neon.h:4323
FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
Definition: sse2neon.h:8440
FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
Definition: sse2neon.h:6603
FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
Definition: sse2neon.h:4307
FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
Definition: sse2neon.h:4336
FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b)
Definition: sse2neon.h:725
static cmpestr_func_t _sse2neon_cmpfunc_table[]
Definition: sse2neon.h:8091
FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
Definition: sse2neon.h:3168
FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
Definition: sse2neon.h:8421
FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
Definition: sse2neon.h:5670
FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0)
Definition: sse2neon.h:5025
FORCE_INLINE int _mm_cmpistrz(__m128i a, __m128i b, const int imm8)
Definition: sse2neon.h:8395
#define vreinterpretq_f32_m128i(x)
Definition: sse2neon.h:427
FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a)
Definition: sse2neon.h:1009
FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)
Definition: sse2neon.h:615
FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
Definition: sse2neon.h:5745
FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b)
Definition: sse2neon.h:5597
FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
Definition: sse2neon.h:4344
FORCE_INLINE __m128i _mm_cvtps_epi32(__m128)
Definition: sse2neon.h:3888
FORCE_INLINE int _mm_movemask_epi8(__m128i a)
Definition: sse2neon.h:4540
FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count)
Definition: sse2neon.h:5336
FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a)
Definition: sse2neon.h:1508
#define vreinterpret_m64_u16(x)
Definition: sse2neon.h:446
FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)
Definition: sse2neon.h:1498
FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a)
Definition: sse2neon.h:1706
FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
Definition: sse2neon.h:2352
FORCE_INLINE int _mm_test_all_ones(__m128i a)
Definition: sse2neon.h:7573
FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i, __m128i)
Definition: sse2neon.h:3177
#define _MM_DENORMALS_ZERO_OFF
Definition: sse2neon.h:354
FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b)
Definition: sse2neon.h:3243
FORCE_INLINE __m128i _mm_setzero_si128(void)
Definition: sse2neon.h:5043
FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b)
Definition: sse2neon.h:3793
FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
Definition: sse2neon.h:6212
FORCE_INLINE __m64 _mm_abs_pi16(__m64 a)
Definition: sse2neon.h:6081
FORCE_INLINE void _mm_storeu_si64(void *p, __m128i a)
Definition: sse2neon.h:2759
FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)
Definition: sse2neon.h:6714
FORCE_INLINE __m128d _mm_round_sd(__m128d a, __m128d b, int rounding)
Definition: sse2neon.h:7532
FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b)
Definition: sse2neon.h:2964
#define vreinterpretq_m128_u32(x)
Definition: sse2neon.h:395
FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a)
Definition: sse2neon.h:5466
FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
Definition: sse2neon.h:2395
FORCE_INLINE __m128d _mm_move_sd(__m128d, __m128d)
Definition: sse2neon.h:4530
FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b)
Definition: sse2neon.h:3772
FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b)
Definition: sse2neon.h:1218
FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a)
Definition: sse2neon.h:3824
FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1226
#define SSE2NEON_AES_U0(p)
#define _sse2neon_const
Definition: sse2neon.h:111
#define vreinterpretq_s32_m128i(x)
Definition: sse2neon.h:432
#define vreinterpret_s64_m64(x)
Definition: sse2neon.h:462
FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
Definition: sse2neon.h:7638
FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
Definition: sse2neon.h:6065
#define vreinterpretq_m128i_s16(x)
Definition: sse2neon.h:418
FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b)
Definition: sse2neon.h:4688
#define vreinterpretq_f32_m128d(x)
Definition: sse2neon.h:496
FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i)
Definition: sse2neon.h:3034
FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b)
Definition: sse2neon.h:3007
FORCE_INLINE __m64 _mm_cvtps_pi8(__m128 a)
Definition: sse2neon.h:1608
FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
Definition: sse2neon.h:6995
FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
Definition: sse2neon.h:2673
#define vreinterpretq_m128d_s64(x)
Definition: sse2neon.h:484
FORCE_INLINE __m128i _mm_set1_epi32(int)
Definition: sse2neon.h:4927
FORCE_INLINE void _mm_setcsr(unsigned int a)
Definition: sse2neon.h:2476
FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a)
Definition: sse2neon.h:990
float32x4_t __m128d
Definition: sse2neon.h:373
FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2870
#define SSE2NEON_AES_RSBOX(w)
Definition: sse2neon.h:8542
FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a)
Definition: sse2neon.h:1056
FORCE_INLINE int _mm_cmpestrc(__m128i a, int la, __m128i b, int lb, const int imm8)
Definition: sse2neon.h:8230
FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
Definition: sse2neon.h:2292
#define vreinterpretq_s8_m128i(x)
Definition: sse2neon.h:430
FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
Definition: sse2neon.h:4408
FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
Definition: sse2neon.h:4351
FORCE_INLINE void _mm_stream_si64(__int64 *p, __int64 a)
Definition: sse2neon.h:5571
FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b)
Definition: sse2neon.h:718
FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b)
Definition: sse2neon.h:3496
FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b)
Definition: sse2neon.h:750
FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a)
Definition: sse2neon.h:3871
FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
Definition: sse2neon.h:4361
#define SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(prefix)
Definition: sse2neon.h:8060
FORCE_INLINE __m128d _mm_floor_pd(__m128d)
Definition: sse2neon.h:7067
FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1353
FORCE_INLINE __m128d _mm_ceil_sd(__m128d a, __m128d b)
Definition: sse2neon.h:6789
FORCE_INLINE uint64_t _rdtsc(void)
Definition: sse2neon.h:9184
FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
Definition: sse2neon.h:2912
FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b)
Definition: sse2neon.h:3404
FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a)
Definition: sse2neon.h:4147
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
Definition: sse2neon.h:2745
FORCE_INLINE __m64 _mm_hsub_pi16(__m64 _a, __m64 _b)
Definition: sse2neon.h:6282
FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b)
Definition: sse2neon.h:3603
#define vreinterpretq_s64_m128d(x)
Definition: sse2neon.h:491
FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
Definition: sse2neon.h:4982
FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a)
Definition: sse2neon.h:6915
FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)
Definition: sse2neon.h:6036
FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a)
Definition: sse2neon.h:1025
FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b)
Definition: sse2neon.h:4752
FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a)
Definition: sse2neon.h:6925
FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
Definition: sse2neon.h:6632
FORCE_INLINE __m128d _mm_load1_pd(const double *p)
Definition: sse2neon.h:4268
#define _MM_DENORMALS_ZERO_ON
Definition: sse2neon.h:353
FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
Definition: sse2neon.h:2936
FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a)
Definition: sse2neon.h:1679
FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b)
Definition: sse2neon.h:766
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2152
FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
Definition: sse2neon.h:4948
#define vreinterpretq_m128i_u8(x)
Definition: sse2neon.h:422
FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
Definition: sse2neon.h:1468
FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
Definition: sse2neon.h:4139
FORCE_INLINE __m128d _mm_round_pd(__m128d, int)
Definition: sse2neon.h:7402
FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a)
Definition: sse2neon.h:1549
#define _MM_ROUND_NEAREST
Definition: sse2neon.h:343
FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p)
Definition: sse2neon.h:1890
FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b)
Definition: sse2neon.h:743
FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)
Definition: sse2neon.h:7582
FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
Definition: sse2neon.h:9053
#define vreinterpret_u32_m64(x)
Definition: sse2neon.h:456
FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)
Definition: sse2neon.h:4633
#define vreinterpretq_nth_u64_m128i(x, n)
Definition: sse2neon.h:535
FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1417
FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b)
Definition: sse2neon.h:3445
FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b)
Definition: sse2neon.h:7210
FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
Definition: sse2neon.h:7228
FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b)
Definition: sse2neon.h:1559
#define vreinterpretq_m128d_f32(x)
Definition: sse2neon.h:489
FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b)
Definition: sse2neon.h:3570
FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a)
Definition: sse2neon.h:3984
#define vreinterpretq_f64_m128i(x)
Definition: sse2neon.h:428
FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1262
FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
Definition: sse2neon.h:5778
FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a)
Definition: sse2neon.h:4118
union ALIGN_STRUCT(16) SIMDVec
Definition: sse2neon.h:522
#define vreinterpret_m64_s16(x)
Definition: sse2neon.h:441
FORCE_INLINE int _mm_movemask_pd(__m128d a)
Definition: sse2neon.h:4623
FORCE_INLINE __m128 _mm_set_ps1(float)
Definition: sse2neon.h:2404
FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode(void)
Definition: sse2neon.h:1785
#define SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8)
Definition: sse2neon.h:8180
FORCE_INLINE __m128d _mm_cmpnle_sd(__m128d a, __m128d b)
Definition: sse2neon.h:3562
FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b)
Definition: sse2neon.h:7192
#define _SIDD_NEGATIVE_POLARITY
Definition: sse2neon.h:7670
#define _MM_ROUND_DOWN
Definition: sse2neon.h:344
FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
Definition: sse2neon.h:7201
FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b)
Definition: sse2neon.h:814
FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i)
Definition: sse2neon.h:4934
FORCE_INLINE __m128 _mm_undefined_ps(void)
Definition: sse2neon.h:2851
FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
Definition: sse2neon.h:3690
FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b)
Definition: sse2neon.h:5924
FORCE_INLINE __m128 _mm_set1_ps(float _w)
Definition: sse2neon.h:2467
FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
Definition: sse2neon.h:5224
FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
Definition: sse2neon.h:2799
FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
Definition: sse2neon.h:4790
FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a)
Definition: sse2neon.h:6844
FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
Definition: sse2neon.h:5298
FORCE_INLINE __m128d _mm_set_sd(double a)
Definition: sse2neon.h:4909
FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a)
Definition: sse2neon.h:3843
FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b)
Definition: sse2neon.h:6799
FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
Definition: sse2neon.h:6445
FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
Definition: sse2neon.h:6097
FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b)
Definition: sse2neon.h:5892
FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b)
Definition: sse2neon.h:1345
FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)
Definition: sse2neon.h:3757
FORCE_INLINE __m128i _mm_undefined_si128(void)
Definition: sse2neon.h:2833
FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b)
Definition: sse2neon.h:3742
FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)
Definition: sse2neon.h:4417
FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b)
Definition: sse2neon.h:773
FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b)
Definition: sse2neon.h:2048
FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a)
Definition: sse2neon.h:3123
FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)
Definition: sse2neon.h:4661
#define _MM_FROUND_TO_NEG_INF
Definition: sse2neon.h:331
FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm)
Definition: sse2neon.h:6949
FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask)
Definition: sse2neon.h:7596
FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a)
Definition: sse2neon.h:5485
FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
Definition: sse2neon.h:2725
FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
Definition: sse2neon.h:9098
FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1244
static int _sse2neon_aggregate_equal_any_16x8(int la, int lb, __m128i mtx[16])
Definition: sse2neon.h:7876
FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a)
Definition: sse2neon.h:1627
FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
Definition: sse2neon.h:2012
FORCE_INLINE void _mm_empty(void)
Definition: sse2neon.h:1143
FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
Definition: sse2neon.h:5588
#define vreinterpretq_u64_m128(x)
Definition: sse2neon.h:410
FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a)
Definition: sse2neon.h:5524
FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
Definition: sse2neon.h:5760
FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
Definition: sse2neon.h:5549
FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t)
Definition: sse2neon.h:8477
FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b)
Definition: sse2neon.h:1308
FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
Definition: sse2neon.h:6863
FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b)
Definition: sse2neon.h:3504
FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a)
Definition: sse2neon.h:5508
FORCE_INLINE int _sse2neon_ctzll(unsigned long long x)
Definition: sse2neon.h:8137
FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b)
Definition: sse2neon.h:1381
FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a)
Definition: sse2neon.h:1664
FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b)
Definition: sse2neon.h:1430
FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
Definition: sse2neon.h:4770
FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b)
Definition: sse2neon.h:2980
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1910
FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
Definition: sse2neon.h:6178
FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
Definition: sse2neon.h:1478
FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a)
Definition: sse2neon.h:4026
FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
Definition: sse2neon.h:5880
FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
Definition: sse2neon.h:6345
FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b)
Definition: sse2neon.h:780
FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
Definition: sse2neon.h:3186
#define vreinterpretq_s32_m128(x)
Definition: sse2neon.h:414
FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
Definition: sse2neon.h:2703
FORCE_INLINE __m64 _mm_mulhrs_pi16(__m64 a, __m64 b)
Definition: sse2neon.h:6433
FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b)
Definition: sse2neon.h:1363
FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b)
Definition: sse2neon.h:6328
FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)
Definition: sse2neon.h:3099
FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b)
Definition: sse2neon.h:3537
FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)
Definition: sse2neon.h:5176
FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
Definition: sse2neon.h:6516
FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
Definition: sse2neon.h:600
FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
Definition: sse2neon.h:6824
#define vreinterpretq_u32_m128i(x)
Definition: sse2neon.h:437
FORCE_INLINE void * _mm_malloc(size_t size, size_t align)
Definition: sse2neon.h:1938
FORCE_INLINE __m64 _mm_hadds_pi16(__m64 _a, __m64 _b)
Definition: sse2neon.h:6235
static int _sse2neon_aggregate_ranges_16x8(int la, int lb, __m128i mtx[16])
Definition: sse2neon.h:7901
FORCE_INLINE __m128i _mm_setr_epi8(signed char b0, signed char b1, signed char b2, signed char b3, signed char b4, signed char b5, signed char b6, signed char b7, signed char b8, signed char b9, signed char b10, signed char b11, signed char b12, signed char b13, signed char b14, signed char b15)
Definition: sse2neon.h:4997
FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
Definition: sse2neon.h:4260
FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1180
FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
Definition: sse2neon.h:2098
FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
Definition: sse2neon.h:3657
FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
Definition: sse2neon.h:5579
FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
Definition: sse2neon.h:6874
FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
Definition: sse2neon.h:1063
FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
Definition: sse2neon.h:2342
#define vreinterpretq_m128i_s64(x)
Definition: sse2neon.h:420
FORCE_INLINE __m128 _mm_setzero_ps(void)
Definition: sse2neon.h:2500
FORCE_INLINE int _sse2neon_clz(unsigned int x)
Definition: sse2neon.h:8113
static int _sse2neon_aggregate_equal_any_8x16(int la, int lb, __m128i mtx[16])
Definition: sse2neon.h:7857
#define vreinterpretq_m128_s64(x)
Definition: sse2neon.h:401
FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
Definition: sse2neon.h:5516
FORCE_INLINE void _mm_free(void *addr)
Definition: sse2neon.h:1755
#define vreinterpretq_u32_m128(x)
Definition: sse2neon.h:409
#define SSE2NEON_COMP_AGG(a, b, la, lb, imm8, IE)
Definition: sse2neon.h:8174
FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)
Definition: sse2neon.h:4174
FORCE_INLINE __m128 _mm_loadr_ps(const float *p)
Definition: sse2neon.h:1900
FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
Definition: sse2neon.h:4817
FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b)
Definition: sse2neon.h:3727
FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
Definition: sse2neon.h:6936
FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1171
FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
Definition: sse2neon.h:8914
FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
Definition: sse2neon.h:3385
FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
Definition: sse2neon.h:3332
FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
Definition: sse2neon.h:6250
FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
Definition: sse2neon.h:1488
FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
Definition: sse2neon.h:4941
FORCE_INLINE __m128i _mm_move_epi64(__m128i a)
Definition: sse2neon.h:4520
#define _MM_ROUND_UP
Definition: sse2neon.h:345
FORCE_INLINE __m128d _mm_hsub_pd(__m128d _a, __m128d _b)
Definition: sse2neon.h:5958
FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm)
Definition: sse2neon.h:5202
FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
Definition: sse2neon.h:1738
FORCE_INLINE int _mm_cmpistrc(__m128i a, __m128i b, const int imm8)
Definition: sse2neon.h:8347
int64x1_t __m64
Definition: sse2neon.h:365
FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b)
Definition: sse2neon.h:7219
FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
Definition: sse2neon.h:4469
FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
Definition: sse2neon.h:5286
FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
Definition: sse2neon.h:3135
FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
Definition: sse2neon.h:3395
#define vreinterpret_s32_m64(x)
Definition: sse2neon.h:461
FORCE_INLINE int _mm_cmpistrs(__m128i a, __m128i b, const int imm8)
Definition: sse2neon.h:8383
FORCE_INLINE __m64 _mm_shuffle_pi8(__m64 a, __m64 b)
Definition: sse2neon.h:6475
#define vreinterpretq_m128d_u32(x)
Definition: sse2neon.h:486
FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b)
Definition: sse2neon.h:3043
FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
Definition: sse2neon.h:4103
#define vreinterpret_m64_u64(x)
Definition: sse2neon.h:448
FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b)
Definition: sse2neon.h:1974
#define __int64
Definition: sse2neon.h:383
FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b)
Definition: sse2neon.h:796
FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
Definition: sse2neon.h:3263
FORCE_INLINE __m128 _mm_floor_ps(__m128)
Definition: sse2neon.h:7081
static int _sse2neon_aggregate_ranges_8x16(int la, int lb, __m128i mtx[16])
Definition: sse2neon.h:7927
FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
Definition: sse2neon.h:3290
#define vreinterpretq_u64_m128i(x)
Definition: sse2neon.h:438
FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
Definition: sse2neon.h:4082
FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a)
Definition: sse2neon.h:6886
FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
Definition: sse2neon.h:4294
FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
Definition: sse2neon.h:9154
FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)
Definition: sse2neon.h:6745
FORCE_INLINE __m128i _mm_set_epi64x(int64_t, int64_t)
Definition: sse2neon.h:4855
#define _MM_ROUND_TOWARD_ZERO
Definition: sse2neon.h:346
FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b)
Definition: sse2neon.h:757
FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1208
FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a)
Definition: sse2neon.h:8860
FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void)
Definition: sse2neon.h:1809
FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b)
Definition: sse2neon.h:5643
FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr)
Definition: sse2neon.h:4386
FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b)
Definition: sse2neon.h:1290
FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
Definition: sse2neon.h:7165
FORCE_INLINE int _mm_cmpestra(__m128i a, int la, __m128i b, int lb, const int imm8)
Definition: sse2neon.h:8216
#define SSE2NEON_CMPESTR_LIST
Definition: sse2neon.h:8067
FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b)
Definition: sse2neon.h:5661
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1150
FORCE_INLINE void _sse2neon_set_fpcr(uint64_t value)
Definition: sse2neon.h:1772
FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b)
Definition: sse2neon.h:1409
FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1280
FORCE_INLINE int _mm_movemask_ps(__m128 a)
Definition: sse2neon.h:2129
FORCE_INLINE __m128d _mm_floor_sd(__m128d a, __m128d b)
Definition: sse2neon.h:7097
FORCE_INLINE __m128i _mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7)
Definition: sse2neon.h:4967
FORCE_INLINE __m128i _mm_or_si128(__m128i, __m128i)
Definition: sse2neon.h:4761
FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
Definition: sse2neon.h:3463
FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode(void)
Definition: sse2neon.h:9075
FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey)
Definition: sse2neon.h:8823
FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
Definition: sse2neon.h:6853
FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)
Definition: sse2neon.h:6906
#define _mm_set_pd1
Definition: sse2neon.h:4904
FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
Definition: sse2neon.h:4399
FORCE_INLINE void _mm_stream_si32(int *p, int a)
Definition: sse2neon.h:5562
FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b)
Definition: sse2neon.h:7183
FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
Definition: sse2neon.h:6266
FORCE_INLINE __m128d _mm_cmpnlt_sd(__m128d a, __m128d b)
Definition: sse2neon.h:3595
FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
Definition: sse2neon.h:8702
FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
Definition: sse2neon.h:2989
FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a)
Definition: sse2neon.h:3970
FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
Definition: sse2neon.h:5477
FORCE_INLINE __m128d _mm_set1_pd(double d)
Definition: sse2neon.h:4956
#define _MM_FROUND_TO_NEAREST_INT
Definition: sse2neon.h:330
FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2886
FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b)
Definition: sse2neon.h:1643
#define vreinterpretq_m128d_u64(x)
Definition: sse2neon.h:487
FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
Definition: sse2neon.h:4058
static const uint8_t _sse2neon_sbox[256]
Definition: sse2neon.h:8586
FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b)
Definition: sse2neon.h:1272
FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b)
Definition: sse2neon.h:5616
FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
Definition: sse2neon.h:8459
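An illustrative CRC-32C sketch built on _mm_crc32_u64, folding the input eight bytes at a time. The buffer length is assumed to be a multiple of 8, and the seed/final-XOR follow the usual CRC-32C convention; a byte-wise tail loop would be needed for arbitrary lengths.

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include "sse2neon.h"

    static uint32_t crc32c_u64_blocks(const void *buf, size_t len)
    {
        const uint8_t *p = (const uint8_t *) buf;
        uint64_t crc = 0xFFFFFFFFu;               /* conventional seed */
        for (size_t i = 0; i + 8 <= len; i += 8) {
            uint64_t chunk;
            memcpy(&chunk, p + i, sizeof(chunk)); /* safe unaligned read */
            crc = _mm_crc32_u64(crc, chunk);
        }
        return (uint32_t) crc ^ 0xFFFFFFFFu;      /* conventional final XOR */
    }

    int main(void)
    {
        static const char msg[] = "0123456789abcdef"; /* 16 bytes of payload */
        printf("crc32c = 0x%08x\n", crc32c_u64_blocks(msg, sizeof(msg) - 1));
        return 0;
    }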
#define SSE2NEON_AES_U3(p)
#define vreinterpret_s16_m64(x)
Definition: sse2neon.h:460
FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)
Definition: sse2neon.h:6057
FORCE_INLINE __m128i _mm_set_epi16(short i7, short i6, short i5, short i4, short i3, short i2, short i1, short i0)
Definition: sse2neon.h:4825
FORCE_INLINE __m128d _mm_load_sd(const double *p)
Definition: sse2neon.h:4246
FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b)
Definition: sse2neon.h:1327
FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
Definition: sse2neon.h:3856
#define vreinterpretq_m128i_u32(x)
Definition: sse2neon.h:424
FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
Definition: sse2neon.h:5679
FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b)
Definition: sse2neon.h:2022
FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a)
Definition: sse2neon.h:1043
FORCE_INLINE int _mm_cmpistri(__m128i a, __m128i b, const int imm8)
Definition: sse2neon.h:8356
FORCE_INLINE __m128 _mm_movehl_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2082
#define SSE2NEON_AES_U1(p)
FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b)
Definition: sse2neon.h:5636
FORCE_INLINE __m128i _mm_cmpestrm(__m128i a, int la, __m128i b, int lb, const int imm8)
Definition: sse2neon.h:8257
FORCE_INLINE int _sse2neon_sido_negative(int res, int lb, int imm8, int bound)
Definition: sse2neon.h:8097
FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b)
Definition: sse2neon.h:3063
FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a)
Definition: sse2neon.h:1017
FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
Definition: sse2neon.h:1160
FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a)
Definition: sse2neon.h:2767
FORCE_INLINE __m128 _mm_ceil_ps(__m128)
Definition: sse2neon.h:6773
FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm)
Definition: sse2neon.h:7287
#define vreinterpretq_m128i_s32(x)
Definition: sse2neon.h:419
FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b)
Definition: sse2neon.h:7382
FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8)
Definition: sse2neon.h:6698
FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey)
Definition: sse2neon.h:8605
FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b)
Definition: sse2neon.h:734
FORCE_INLINE __m128 _mm_set_ss(float a)
Definition: sse2neon.h:2459
FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
Definition: sse2neon.h:832
FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
Definition: sse2neon.h:4743
FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b)
Definition: sse2neon.h:823
FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
Definition: sse2neon.h:9127
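A one-line sketch (illustrative only) for _mm_popcnt_u64, which counts the set bits of a 64-bit value.

    #include <stdint.h>
    #include <stdio.h>
    #include "sse2neon.h"

    int main(void)
    {
        uint64_t x = 0xF0F0F0F0F0F0F0F0ULL;
        printf("popcount = %lld\n", (long long) _mm_popcnt_u64(x)); /* expect 32 */
        return 0;
    }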
FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
Definition: sse2neon.h:5652
FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
Definition: sse2neon.h:7237
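A sketch (not from the header) for _mm_minpos_epu16: the result packs the minimum unsigned 16-bit value into lane 0 and its index into lane 1, so both can be read back through _mm_cvtsi128_si32; _mm_set_epi16 is assumed as in standard SSE2.

    #include <stdio.h>
    #include "sse2neon.h"

    int main(void)
    {
        /* lanes 0..7: 200 300 400 500 600 7 800 900 (lane 5 holds the minimum) */
        __m128i v = _mm_set_epi16(900, 800, 7, 600, 500, 400, 300, 200);
        __m128i r = _mm_minpos_epu16(v);

        int packed = _mm_cvtsi128_si32(r);                /* lane 0 | (lane 1 << 16) */
        unsigned min_val = packed & 0xFFFF;
        unsigned min_idx = ((unsigned) packed >> 16) & 0xFFFF;
        printf("min %u at lane %u\n", min_val, min_idx);  /* expect: min 7 at lane 5 */
        return 0;
    }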
#define SSE2NEON_GENERATE_CMP_EQUAL_ANY(prefix)
Definition: sse2neon.h:7894
FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a)
Definition: sse2neon.h:3801
FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b)
Definition: sse2neon.h:5262
FORCE_INLINE int _mm_cmpistro(__m128i a, __m128i b, const int imm8)
Definition: sse2neon.h:8374
#define SSE2NEON_BARRIER()
Definition: sse2neon.h:180
FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p)
Definition: sse2neon.h:1880
FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b)
Definition: sse2neon.h:1254
FORCE_INLINE __m128 _mm_rcp_ss(__m128 a)
Definition: sse2neon.h:2304
FORCE_INLINE __m128 _mm_castpd_ps(__m128d a)
Definition: sse2neon.h:3091
FORCE_INLINE __m128 _mm_move_ss(__m128, __m128)
Definition: sse2neon.h:2071
FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a)
Definition: sse2neon.h:4128
FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
Definition: sse2neon.h:6488
FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
Definition: sse2neon.h:3375
#define vreinterpretq_f32_m128(x)
Definition: sse2neon.h:404
FORCE_INLINE __m128 _mm_round_ps(__m128, int)
Definition: sse2neon.h:7471
FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b)
Definition: sse2neon.h:788
FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
Definition: sse2neon.h:4650
FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b)
Definition: sse2neon.h:3471
FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b)
Definition: sse2neon.h:2171
FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)
Definition: sse2neon.h:7107
FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2)
Definition: sse2neon.h:4848
FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
Definition: sse2neon.h:5411
FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
Definition: sse2neon.h:4719
FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1298
FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a)
Definition: sse2neon.h:5498
FORCE_INLINE int _mm_cmpistra(__m128i a, __m128i b, const int imm8)
Definition: sse2neon.h:8338
FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a)
Definition: sse2neon.h:1033
FORCE_INLINE __m128d _mm_undefined_pd(void)
Definition: sse2neon.h:5694
#define _MM_FROUND_TO_ZERO
Definition: sse2neon.h:333
FORCE_INLINE __m128d _mm_set_pd(double, double)
Definition: sse2neon.h:4891
#define SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(prefix)
Definition: sse2neon.h:8049
FORCE_INLINE __m128d _mm_castps_pd(__m128 a)
Definition: sse2neon.h:3107
FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b)
Definition: sse2neon.h:806
FORCE_INLINE __m128 _mm_load1_ps(const float *p)
Definition: sse2neon.h:1871
FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)
Definition: sse2neon.h:6895
FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
Definition: sse2neon.h:6833
FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
Definition: sse2neon.h:5976
FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)
Definition: sse2neon.h:1572
#define _MM_FLUSH_ZERO_ON
Definition: sse2neon.h:349
FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
Definition: sse2neon.h:8406
FORCE_INLINE void _mm_prefetch(char const *p, int i)
Definition: sse2neon.h:2240
#define _MM_DENORMALS_ZERO_MASK
Definition: sse2neon.h:352
FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a)
Definition: sse2neon.h:1617
FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
Definition: sse2neon.h:7618
FORCE_INLINE __m128i _mm_castps_si128(__m128)
Definition: sse2neon.h:3115
FORCE_INLINE void _mm_lfence(void)
Definition: sse2neon.h:2557
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1843
FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b)
Definition: sse2neon.h:4505
#define vreinterpretq_m128i_u16(x)
Definition: sse2neon.h:423
FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a)
Definition: sse2neon.h:1592
_mm_hint
Definition: sse2neon.h:686
@ _MM_HINT_NTA
Definition: sse2neon.h:687
@ _MM_HINT_T0
Definition: sse2neon.h:688
@ _MM_HINT_T1
Definition: sse2neon.h:689
@ _MM_HINT_T2
Definition: sse2neon.h:690
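A hedged sketch (not from the header) showing _mm_prefetch with one of the locality hints above; PF_DIST is an arbitrary illustrative distance, not a tuned value.

    #include "sse2neon.h"

    #define PF_DIST 16 /* illustrative prefetch distance, in elements */

    /* Sum an array while prefetching a little ahead of the element in use. */
    static float sum_with_prefetch(const float *data, int n)
    {
        float acc = 0.0f;
        for (int i = 0; i < n; i++) {
            if (i + PF_DIST < n)
                _mm_prefetch((const char *) &data[i + PF_DIST], _MM_HINT_T0);
            acc += data[i];
        }
        return acc;
    }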
#define _MM_FROUND_CUR_DIRECTION
Definition: sse2neon.h:334
int(* cmpestr_func_t)(__m128i a, int la, __m128i b, int lb)
Definition: sse2neon.h:8090
int64x2_t __m128i
Definition: sse2neon.h:375
FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
Definition: sse2neon.h:3072
FORCE_INLINE __m128 _mm_round_ss(__m128 a, __m128 b, int rounding)
Definition: sse2neon.h:7552
#define SSE2NEON_AES_SBOX(w)
Definition: sse2neon.h:8502
FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
Definition: sse2neon.h:2313
FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a)
Definition: sse2neon.h:5440
#define vreinterpretq_u16_m128i(x)
Definition: sse2neon.h:436
FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b)
Definition: sse2neon.h:4005
FORCE_INLINE int _mm_movemask_pi8(__m64 a)
Definition: sse2neon.h:2108
FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a)
Definition: sse2neon.h:1049
FORCE_INLINE int _mm_cvtt_ss2si(__m128 a)
Definition: sse2neon.h:1688
FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b)
Definition: sse2neon.h:3221
FORCE_INLINE void _mm_stream_pd(double *p, __m128d a)
Definition: sse2neon.h:5534
FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
Definition: sse2neon.h:2945
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2685
FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
Definition: sse2neon.h:4156
FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
Definition: sse2neon.h:5825
FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a)
Definition: sse2neon.h:5247
FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)
Definition: sse2neon.h:6194
FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
Definition: sse2neon.h:2414
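A sketch (illustrative only) of saving, changing, and restoring the rounding mode around a conversion that honours it; _mm_set_sd and the companion _MM_ROUND_NEAREST macro are assumed to be available, as in standard SSE2.

    #include <stdio.h>
    #include "sse2neon.h"

    int main(void)
    {
        unsigned int saved = _MM_GET_ROUNDING_MODE();
        __m128d v = _mm_set_sd(2.7);

        _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
        printf("toward zero: %d\n", _mm_cvtsd_si32(v)); /* expect 2 */

        _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
        printf("to nearest:  %d\n", _mm_cvtsd_si32(v)); /* expect 3 */

        _MM_SET_ROUNDING_MODE((int) saved); /* restore the previous mode */
        return 0;
    }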
FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
Definition: sse2neon.h:4281
FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
Definition: sse2neon.h:5606
FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)
Definition: sse2neon.h:5189
#define SSE2NEON_CMPISTRX_LENGTH(str, len, imm8)
Definition: sse2neon.h:8308
FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b)
Definition: sse2neon.h:4445
FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
Definition: sse2neon.h:1524
#define vreinterpret_s8_m64(x)
Definition: sse2neon.h:459
FORCE_INLINE uint64_t _sse2neon_get_fpcr(void)
Definition: sse2neon.h:1761
#define _mm_srli_si128(a, imm)
Definition: sse2neon.h:5399
static const uint8_t _sse2neon_rsbox[256]
Definition: sse2neon.h:8587
#define _mm_shuffle_epi32(a, imm)
Definition: sse2neon.h:5063
FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
Definition: sse2neon.h:2641
FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
Definition: sse2neon.h:5163
#define vreinterpret_u8_m64(x)
Definition: sse2neon.h:454
FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)
Definition: sse2neon.h:2717
FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
Definition: sse2neon.h:5793
FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
Definition: sse2neon.h:1458
FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
Definition: sse2neon.h:6574
FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b)
Definition: sse2neon.h:1236
FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2033
static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
Definition: sse2neon.h:874
FORCE_INLINE __m128d _mm_movedup_pd(__m128d a)
Definition: sse2neon.h:6003
FORCE_INLINE void _mm_pause(void)
Definition: sse2neon.h:4803
FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
Definition: sse2neon.h:1927
FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b)
Definition: sse2neon.h:5871
FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p)
Definition: sse2neon.h:7561
FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a)
Definition: sse2neon.h:5424
FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
Definition: sse2neon.h:3816
FORCE_INLINE __m128 _mm_or_ps(__m128, __m128)
Definition: sse2neon.h:2180
FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask)
Definition: sse2neon.h:6727
FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b)
Definition: sse2neon.h:5856
FORCE_INLINE int _sse2neon_ctz(unsigned int x)
Definition: sse2neon.h:8125
FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
Definition: sse2neon.h:2060
static int _sse2neon_cmp_byte_equal_each(__m128i a, int la, __m128i b, int lb)
Definition: sse2neon.h:7966
FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
Definition: sse2neon.h:3354
FORCE_INLINE __m128d _mm_cmpngt_sd(__m128d a, __m128d b)
Definition: sse2neon.h:3529
#define SSE2NEON_AES_H0(x)
Definition: sse2neon.h:8585
FORCE_INLINE __m64 _mm_abs_pi32(__m64 a)
Definition: sse2neon.h:6089
FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
Definition: sse2neon.h:5349
FORCE_INLINE int _mm_cmpestro(__m128i a, int la, __m128i b, int lb, const int imm8)
Definition: sse2neon.h:8266
FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1985
FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
Definition: sse2neon.h:6203
FORCE_INLINE void _mm_mfence(void)
Definition: sse2neon.h:2547
FORCE_INLINE float _mm_cvtss_f32(__m128 a)
Definition: sse2neon.h:1651
FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1393
#define _MM_FROUND_NO_EXC
Definition: sse2neon.h:335
#define vreinterpretq_s64_m128i(x)
Definition: sse2neon.h:433
FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
Definition: sse2neon.h:4019
#define _MM_FLUSH_ZERO_OFF
Definition: sse2neon.h:350
FORCE_INLINE __m64 _mm_maddubs_pi16(__m64 _a, __m64 _b)
Definition: sse2neon.h:6384
SIMDVec
Definition: sse2neon.h:532
FORCE_INLINE void _mm_clflush(void const *p)
Definition: sse2neon.h:3146
Definition: sse2neon.h:694
uint16_t res0
Definition: sse2neon.h:695
uint8_t res1
Definition: sse2neon.h:696
uint8_t bit22
Definition: sse2neon.h:697
uint8_t bit23
Definition: sse2neon.h:698
uint8_t bit24
Definition: sse2neon.h:699
uint8_t res2
Definition: sse2neon.h:700