1 /* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2  * SPDX-License-Identifier: Apache-2.0
3  *
4  * Written by Nir Drucker, Shay Gueron and Dusan Kostic,
5  * AWS Cryptographic Algorithms Group.
6  */
7 
8 // This file contains definitions of macros for SIMD intrinsic functions, used
9 // throughout the code package. Where necessary, we add a suffix to a macro,
10 // and denote the type of the elements (operands). For example,
11 //   - I16 denotes 16-bit wide integers,
12 //   - U64 denotes 64-bit wide unsigned integers.
13 
14 #pragma once
15 
16 #if defined(X86_64)
17 #  include <immintrin.h>
18 #endif
19 
20 // For functions in gf2x_mul.c we use exactly the same code for
21 // PORTABLE, AVX2, AVX512 implementations. Based on the implementation,
22 // we define macros for the different data types (uint64_t, __m256i, __m512i),
23 // and all the required operations (LOAD, STORE, >>, <<) on these types.
24 #if defined(AVX2_INTERNAL)
25 // AVX2 build: REG_T is the 256-bit __m256i vector type.
26 #  define REG_T __m256i
27 // LOAD/STORE cast `mem` to (const) void * before handing it to the intrinsic.
28 #  define LOAD(mem)       _mm256_loadu_si256((const void *)(mem))
29 #  define STORE(mem, reg) _mm256_storeu_si256((void *)(mem), (reg))
30 // Shifts of 64-bit elements by `imm`, forwarded unchanged to the intrinsics.
31 #  define SLLI_I64(a, imm) _mm256_slli_epi64(a, imm)
32 #  define SRLI_I64(a, imm) _mm256_srli_epi64(a, imm)
33 
34 #elif defined(AVX512_INTERNAL)
35 // AVX512 build: REG_T is the 512-bit __m512i vector type.
36 #  define REG_T __m512i
37 // Unlike the AVX2 branch, `mem` is passed to the intrinsics without a cast.
38 #  define LOAD(mem)       _mm512_loadu_si512((mem))
39 #  define STORE(mem, reg) _mm512_storeu_si512((mem), (reg))
40 // Same shift macros as the AVX2 branch, mapped to the _mm512_ intrinsics.
41 #  define SLLI_I64(a, imm) _mm512_slli_epi64(a, imm)
42 #  define SRLI_I64(a, imm) _mm512_srli_epi64(a, imm)
43 
44 #elif defined(PORTABLE_INTERNAL)
45 // Portable build: a "register" is one uint64_t and the macros are plain C.
46 #  define REG_T uint64_t
47 // LOAD reads and STORE assigns element 0 of `mem`; STORE is a bare assignment.
48 #  define LOAD(mem)       (mem)[0]
49 #  define STORE(mem, val) (mem)[0] = val
50 // Shifts map directly onto the C << and >> operators.
51 #  define SLLI_I64(a, imm) ((a) << (imm))
52 #  define SRLI_I64(a, imm) ((a) >> (imm))
53 
54 #endif // No #else: without one of the *_INTERNAL macros none of the above is defined.
55 
56 // REG_QWORDS / REG_DWORDS: how many uint64_t / uint32_t values fit in one REG_T.
57 // NOLINT suppresses the sizeof(T)/sizeof(T) warning raised when REG_T is uint64_t.
58 #define REG_QWORDS (sizeof(REG_T) / sizeof(uint64_t)) // NOLINT
59 #define REG_DWORDS (sizeof(REG_T) / sizeof(uint32_t)) // NOLINT
60 
61 // The rest of the SIMD macros that are
62 // required for the AVX2 and AVX512 implementations.
63 #if defined(AVX2_INTERNAL)
64 // AVX2 (_mm256_*) set helpers. SET_* forward their variadic arguments, SET1_* take one value, SET_ZERO is object-like (no parentheses).
65 #  define SET_I8(...)  _mm256_set_epi8(__VA_ARGS__)
66 #  define SET_I32(...) _mm256_set_epi32(__VA_ARGS__)
67 #  define SET_I64(...) _mm256_set_epi64x(__VA_ARGS__)
68 #  define SET1_I8(a)   _mm256_set1_epi8(a)
69 #  define SET1_I16(a)  _mm256_set1_epi16(a)
70 #  define SET1_I32(a)  _mm256_set1_epi32(a)
71 #  define SET1_I64(a)  _mm256_set1_epi64x(a)
72 #  define SET_ZERO     _mm256_setzero_si256()
73 // Add, subtract and shift wrappers; the I8/I16/I32/I64 suffix is the element width.
74 #  define ADD_I8(a, b)     _mm256_add_epi8(a, b)
75 #  define SUB_I8(a, b)     _mm256_sub_epi8(a, b)
76 #  define ADD_I16(a, b)    _mm256_add_epi16(a, b)
77 #  define SUB_I16(a, b)    _mm256_sub_epi16(a, b)
78 #  define ADD_I64(a, b)    _mm256_add_epi64(a, b)
79 #  define SRLI_I16(a, imm) _mm256_srli_epi16(a, imm)
80 #  define SLLI_I32(a, imm) _mm256_slli_epi32(a, imm)
81 #  define SLLV_I32(a, b)   _mm256_sllv_epi32(a, b)
82 // Comparison wrappers (greater-than and equality).
83 #  define CMPGT_I16(a, b) _mm256_cmpgt_epi16(a, b)
84 #  define CMPEQ_I16(a, b) _mm256_cmpeq_epi16(a, b)
85 #  define CMPEQ_I32(a, b) _mm256_cmpeq_epi32(a, b)
86 #  define CMPEQ_I64(a, b) _mm256_cmpeq_epi64(a, b)
87 // Shuffle, blend and permute wrappers.
88 #  define SHUF_I8(a, b)         _mm256_shuffle_epi8(a, b)
89 #  define BLENDV_I8(a, b, mask) _mm256_blendv_epi8(a, b, mask)
90 #  define PERMVAR_I32(a, idx)   _mm256_permutevar8x32_epi32(a, idx)
91 #  define PERM_I64(a, imm)      _mm256_permute4x64_epi64(a, imm)
92 // MOVEMASK wraps _mm256_movemask_epi8; it has no counterpart in the AVX512 branch.
93 #  define MOVEMASK(a) _mm256_movemask_epi8(a)
94 
95 #elif defined(AVX512_INTERNAL)
96 // AVX512 (_mm512_*) variants. Not a mirror of the AVX2 set: e.g. SUB_I8 and CMPGT_I16 exist only above, the masked (M*) forms only here.
97 #  define MSTORE(mem, mask, reg) _mm512_mask_store_epi64((mem), (mask), (reg))
98 // Set helpers; SET1MZ_I8 additionally takes a mask as its first argument.
99 #  define SET1_I8(a)         _mm512_set1_epi8(a)
100 #  define SET1_I32(a)        _mm512_set1_epi32(a)
101 #  define SET1_I64(a)        _mm512_set1_epi64(a)
102 #  define SET1MZ_I8(mask, a) _mm512_maskz_set1_epi8(mask, a)
103 #  define SET1_I16(a)        _mm512_set1_epi16(a)
104 #  define SET_I64(...)       _mm512_set_epi64(__VA_ARGS__)
105 #  define SET_ZERO           _mm512_setzero_si512()
106 // Arithmetic, shifts and masked OR/XOR; the M-prefixed macros take src and mask arguments first.
107 #  define ADD_I16(a, b)             _mm512_add_epi16(a, b)
108 #  define ADD_I64(a, b)             _mm512_add_epi64(a, b)
109 #  define MSUB_I16(src, k, a, b)    _mm512_mask_sub_epi16(src, k, a, b)
110 #  define SRLI_I16(a, imm)          _mm512_srli_epi16(a, imm)
111 #  define SRLV_I64(a, cnt)          _mm512_srlv_epi64(a, cnt)
112 #  define SLLV_I64(a, cnt)          _mm512_sllv_epi64(a, cnt)
113 #  define MOR_I64(src, mask, a, b)  _mm512_mask_or_epi64(src, mask, a, b)
114 #  define MXOR_I64(src, mask, a, b) _mm512_mask_xor_epi64(src, mask, a, b)
115 #  define VALIGN(a, b, count)       _mm512_alignr_epi64(a, b, count)
116 // Wrappers for the *_mask compare intrinsics. CMPM_U8/U16 take the operator from the caller; the *MEQ* macros hard-code _MM_CMPINT_EQ.
117 #  define CMPM_U8(a, b, cmp_op)  _mm512_cmp_epu8_mask(a, b, cmp_op)
118 #  define CMPM_U16(a, b, cmp_op) _mm512_cmp_epu16_mask(a, b, cmp_op)
119 #  define CMPMEQ_I64(a, b)       _mm512_cmp_epi64_mask(a, b, _MM_CMPINT_EQ)
120 #  define MCMPMEQ_I32(mask, a, b) \
121     _mm512_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_EQ)
122 // 64-bit permutes. Argument order differs between them: PERMX_I64(a, imm) vs PERMXVAR_I64(idx, a).
123 #  define PERMX_I64(a, imm)        _mm512_permutex_epi64(a, imm)
124 #  define PERMX2VAR_I64(a, idx, b) _mm512_permutex2var_epi64(a, idx, b)
125 #  define PERMXVAR_I64(idx, a)     _mm512_permutexvar_epi64(idx, a)
126 
127 #endif
128