1 /* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 * SPDX-License-Identifier: Apache-2.0
3 *
4 * Written by Nir Drucker, Shay Gueron and Dusan Kostic,
5 * AWS Cryptographic Algorithms Group.
6 */
7
8 // This file contains definitions of macros for SIMD intrinsic functions, used
9 // throughout the code package. Where necessary, we add a suffix to a macro,
// and denote the type of the elements (operands). For example,
11 // - I16 denotes 16-bit wide integers,
12 // - U64 denotes 64-bit wide unsigned integers.
13
14 #pragma once
15
16 #if defined(X86_64)
17 # include <immintrin.h>
#endif
19
20 // For functions in gf2x_mul.c we use exactly the same code for
21 // PORTABLE, AVX2, AVX512 implementations. Based on the implementation,
22 // we define macros for the different data types (uint64_t, __m256i, __m512i),
23 // and all the required operations (LOAD, STORE, >>, <<) on these types.
#if defined(AVX2_INTERNAL)

// One "register" is a 256-bit AVX2 vector; loads/stores are unaligned.
# define REG_T __m256i

# define LOAD(mem)       _mm256_loadu_si256((const void *)(mem))
# define STORE(mem, reg) _mm256_storeu_si256((void *)(mem), (reg))

// All macro parameters are parenthesized at every use site so that callers
// may pass compound expressions without operator-precedence surprises
// (CERT PRE01-C).
# define SLLI_I64(a, imm) _mm256_slli_epi64((a), (imm))
# define SRLI_I64(a, imm) _mm256_srli_epi64((a), (imm))

#elif defined(AVX512_INTERNAL)

// One "register" is a 512-bit AVX-512 vector; loads/stores are unaligned.
# define REG_T __m512i

# define LOAD(mem)       _mm512_loadu_si512((mem))
# define STORE(mem, reg) _mm512_storeu_si512((mem), (reg))

# define SLLI_I64(a, imm) _mm512_slli_epi64((a), (imm))
# define SRLI_I64(a, imm) _mm512_srli_epi64((a), (imm))

#elif defined(PORTABLE_INTERNAL)

// Portable fallback: one "register" is a single 64-bit word.
# define REG_T uint64_t

# define LOAD(mem)       (mem)[0]
// Fix: `val` is now parenthesized so a compound expression argument is
// assigned as a whole, not split by a lower-precedence operator.
# define STORE(mem, val) (mem)[0] = (val)

# define SLLI_I64(a, imm) ((a) << (imm))
# define SRLI_I64(a, imm) ((a) >> (imm))

#endif
55
// Number of 64-bit (QWORDS) and 32-bit (DWORDS) words in one REG_T of the
// selected implementation: 1/2 for uint64_t, 4/8 for __m256i, 8/16 for
// __m512i.
// NOLINT is used to avoid the sizeof(T)/sizeof(T) warning when REG_T is defined
// to be uint64_t
#define REG_QWORDS (sizeof(REG_T) / sizeof(uint64_t)) // NOLINT
#define REG_DWORDS (sizeof(REG_T) / sizeof(uint32_t)) // NOLINT
60
61 // The rest of the SIMD macros that are
62 // required for AVX2 and AVX512 implementation.
#if defined(AVX2_INTERNAL)

// Vector constructors: set individual lanes, broadcast a scalar, or zero.
// Every macro parameter is parenthesized at its use site (CERT PRE01-C);
// SET_I* forward __VA_ARGS__ verbatim because the arguments are a plain
// comma-separated lane list.
# define SET_I8(...)  _mm256_set_epi8(__VA_ARGS__)
# define SET_I32(...) _mm256_set_epi32(__VA_ARGS__)
# define SET_I64(...) _mm256_set_epi64x(__VA_ARGS__)
# define SET1_I8(a)   _mm256_set1_epi8((a))
# define SET1_I16(a)  _mm256_set1_epi16((a))
# define SET1_I32(a)  _mm256_set1_epi32((a))
# define SET1_I64(a)  _mm256_set1_epi64x((a))
# define SET_ZERO     _mm256_setzero_si256()

// Lane-wise arithmetic and shifts.
# define ADD_I8(a, b)     _mm256_add_epi8((a), (b))
# define SUB_I8(a, b)     _mm256_sub_epi8((a), (b))
# define ADD_I16(a, b)    _mm256_add_epi16((a), (b))
# define SUB_I16(a, b)    _mm256_sub_epi16((a), (b))
# define ADD_I64(a, b)    _mm256_add_epi64((a), (b))
# define SRLI_I16(a, imm) _mm256_srli_epi16((a), (imm))
# define SLLI_I32(a, imm) _mm256_slli_epi32((a), (imm))
# define SLLV_I32(a, b)   _mm256_sllv_epi32((a), (b))

// Lane-wise comparisons; result lanes are all-ones (true) or all-zeros.
# define CMPGT_I16(a, b) _mm256_cmpgt_epi16((a), (b))
# define CMPEQ_I16(a, b) _mm256_cmpeq_epi16((a), (b))
# define CMPEQ_I32(a, b) _mm256_cmpeq_epi32((a), (b))
# define CMPEQ_I64(a, b) _mm256_cmpeq_epi64((a), (b))

// Byte shuffles, blends and lane permutations.
# define SHUF_I8(a, b)         _mm256_shuffle_epi8((a), (b))
# define BLENDV_I8(a, b, mask) _mm256_blendv_epi8((a), (b), (mask))
# define PERMVAR_I32(a, idx)   _mm256_permutevar8x32_epi32((a), (idx))
# define PERM_I64(a, imm)      _mm256_permute4x64_epi64((a), (imm))

// Collect the MSB of every byte into a 32-bit scalar mask.
# define MOVEMASK(a) _mm256_movemask_epi8((a))

#elif defined(AVX512_INTERNAL)

// Masked store: only 64-bit lanes selected by `mask` are written.
# define MSTORE(mem, mask, reg) _mm512_mask_store_epi64((mem), (mask), (reg))

// Vector constructors (broadcast, masked-zero broadcast, lane list, zero).
# define SET1_I8(a)         _mm512_set1_epi8((a))
# define SET1_I32(a)        _mm512_set1_epi32((a))
# define SET1_I64(a)        _mm512_set1_epi64((a))
# define SET1MZ_I8(mask, a) _mm512_maskz_set1_epi8((mask), (a))
# define SET1_I16(a)        _mm512_set1_epi16((a))
# define SET_I64(...)       _mm512_set_epi64(__VA_ARGS__)
# define SET_ZERO           _mm512_setzero_si512()

// Lane-wise arithmetic, variable shifts, masked bitwise ops and alignment.
# define ADD_I16(a, b)             _mm512_add_epi16((a), (b))
# define ADD_I64(a, b)             _mm512_add_epi64((a), (b))
# define MSUB_I16(src, k, a, b)    _mm512_mask_sub_epi16((src), (k), (a), (b))
# define SRLI_I16(a, imm)          _mm512_srli_epi16((a), (imm))
# define SRLV_I64(a, cnt)          _mm512_srlv_epi64((a), (cnt))
# define SLLV_I64(a, cnt)          _mm512_sllv_epi64((a), (cnt))
# define MOR_I64(src, mask, a, b)  _mm512_mask_or_epi64((src), (mask), (a), (b))
# define MXOR_I64(src, mask, a, b) _mm512_mask_xor_epi64((src), (mask), (a), (b))
# define VALIGN(a, b, count)       _mm512_alignr_epi64((a), (b), (count))

// Comparisons producing compact k-register bitmasks (one bit per lane).
# define CMPM_U8(a, b, cmp_op)  _mm512_cmp_epu8_mask((a), (b), (cmp_op))
# define CMPM_U16(a, b, cmp_op) _mm512_cmp_epu16_mask((a), (b), (cmp_op))
# define CMPMEQ_I64(a, b)       _mm512_cmp_epi64_mask((a), (b), _MM_CMPINT_EQ)
# define MCMPMEQ_I32(mask, a, b) \
    _mm512_mask_cmp_epi32_mask((mask), (a), (b), _MM_CMPINT_EQ)

// 64-bit lane permutations (immediate-, index-vector- and two-source forms).
# define PERMX_I64(a, imm)        _mm512_permutex_epi64((a), (imm))
# define PERMX2VAR_I64(a, idx, b) _mm512_permutex2var_epi64((a), (idx), (b))
# define PERMXVAR_I64(idx, a)     _mm512_permutexvar_epi64((idx), (a))

#endif
128