1 /*
2 * (C) 2018 Jack Lloyd
3 *
4 * Botan is released under the Simplified BSD License (see license.txt)
5 */
6 
7 #ifndef BOTAN_SIMD_AVX2_H_
8 #define BOTAN_SIMD_AVX2_H_
9 
10 #include <botan/types.h>
11 #include <immintrin.h>
12 
13 namespace Botan {
14 
15 class SIMD_8x32 final
16    {
17    public:
18 
19       SIMD_8x32& operator=(const SIMD_8x32& other) = default;
20       SIMD_8x32(const SIMD_8x32& other) = default;
21 
22       SIMD_8x32& operator=(SIMD_8x32&& other) = default;
23       SIMD_8x32(SIMD_8x32&& other) = default;
24 
25       BOTAN_FUNC_ISA("avx2")
SIMD_8x32()26       BOTAN_FORCE_INLINE SIMD_8x32()
27          {
28          m_avx2 = _mm256_setzero_si256();
29          }
30 
31       BOTAN_FUNC_ISA("avx2")
SIMD_8x32(const uint32_t B[8])32       explicit SIMD_8x32(const uint32_t B[8])
33          {
34          m_avx2 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(B));
35          }
36 
37       BOTAN_FUNC_ISA("avx2")
SIMD_8x32(uint32_t B0,uint32_t B1,uint32_t B2,uint32_t B3,uint32_t B4,uint32_t B5,uint32_t B6,uint32_t B7)38       explicit SIMD_8x32(uint32_t B0, uint32_t B1, uint32_t B2, uint32_t B3,
39                          uint32_t B4, uint32_t B5, uint32_t B6, uint32_t B7)
40          {
41          m_avx2 = _mm256_set_epi32(B7, B6, B5, B4, B3, B2, B1, B0);
42          }
43 
44       BOTAN_FUNC_ISA("avx2")
splat(uint32_t B)45       static SIMD_8x32 splat(uint32_t B)
46          {
47          return SIMD_8x32(_mm256_set1_epi32(B));
48          }
49 
50       BOTAN_FUNC_ISA("avx2")
load_le(const uint8_t * in)51       static SIMD_8x32 load_le(const uint8_t* in)
52          {
53          return SIMD_8x32(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(in)));
54          }
55 
56       BOTAN_FUNC_ISA("avx2")
load_be(const uint8_t * in)57       static SIMD_8x32 load_be(const uint8_t* in)
58          {
59          return load_le(in).bswap();
60          }
61 
62       BOTAN_FUNC_ISA("avx2")
store_le(uint8_t out[])63       void store_le(uint8_t out[]) const
64          {
65          _mm256_storeu_si256(reinterpret_cast<__m256i*>(out), m_avx2);
66          }
67 
68       BOTAN_FUNC_ISA("avx2")
store_be(uint8_t out[])69       void store_be(uint8_t out[]) const
70          {
71          bswap().store_le(out);
72          }
73 
74       template<size_t ROT>
75       BOTAN_FUNC_ISA("avx2")
rotl()76       SIMD_8x32 rotl() const
77          {
78          static_assert(ROT > 0 && ROT < 32, "Invalid rotation constant");
79 
80 #if defined(__AVX512VL__)
81          return SIMD_8x32(_mm256_rol_epi32(m_avx2, ROT));
82 #else
83          BOTAN_IF_CONSTEXPR(ROT == 8)
84             {
85             const __m256i shuf_rotl_8 = _mm256_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3,
86                                                         14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3);
87 
88             return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_8));
89             }
90          else BOTAN_IF_CONSTEXPR(ROT == 16)
91             {
92             const __m256i shuf_rotl_16 = _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2,
93                                                          13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
94 
95             return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_16));
96             }
97          else
98             {
99             return SIMD_8x32(_mm256_or_si256(_mm256_slli_epi32(m_avx2, static_cast<int>(ROT)),
100                                              _mm256_srli_epi32(m_avx2, static_cast<int>(32-ROT))));
101             }
102 #endif
103          }
104 
105       template<size_t ROT>
106       BOTAN_FUNC_ISA("avx2")
rotr()107       SIMD_8x32 rotr() const
108          {
109          return this->rotl<32-ROT>();
110          }
111 
112       template<size_t ROT1, size_t ROT2, size_t ROT3>
rho()113       SIMD_8x32 BOTAN_FUNC_ISA("avx2") rho() const
114          {
115          SIMD_8x32 res;
116 
117          const SIMD_8x32 rot1 = this->rotr<ROT1>();
118          const SIMD_8x32 rot2 = this->rotr<ROT2>();
119          const SIMD_8x32 rot3 = this->rotr<ROT3>();
120 
121          return rot1 ^ rot2 ^ rot3;
122          }
123 
124       BOTAN_FUNC_ISA("avx2")
125       SIMD_8x32 operator+(const SIMD_8x32& other) const
126          {
127          SIMD_8x32 retval(*this);
128          retval += other;
129          return retval;
130          }
131 
132       BOTAN_FUNC_ISA("avx2")
133       SIMD_8x32 operator-(const SIMD_8x32& other) const
134          {
135          SIMD_8x32 retval(*this);
136          retval -= other;
137          return retval;
138          }
139 
140       BOTAN_FUNC_ISA("avx2")
141       SIMD_8x32 operator^(const SIMD_8x32& other) const
142          {
143          SIMD_8x32 retval(*this);
144          retval ^= other;
145          return retval;
146          }
147 
148       BOTAN_FUNC_ISA("avx2")
149       SIMD_8x32 operator|(const SIMD_8x32& other) const
150          {
151          SIMD_8x32 retval(*this);
152          retval |= other;
153          return retval;
154          }
155 
156       BOTAN_FUNC_ISA("avx2")
157       SIMD_8x32 operator&(const SIMD_8x32& other) const
158          {
159          SIMD_8x32 retval(*this);
160          retval &= other;
161          return retval;
162          }
163 
164       BOTAN_FUNC_ISA("avx2")
165       void operator+=(const SIMD_8x32& other)
166          {
167          m_avx2 = _mm256_add_epi32(m_avx2, other.m_avx2);
168          }
169 
170       BOTAN_FUNC_ISA("avx2")
171       void operator-=(const SIMD_8x32& other)
172          {
173          m_avx2 = _mm256_sub_epi32(m_avx2, other.m_avx2);
174          }
175 
176       BOTAN_FUNC_ISA("avx2")
177       void operator^=(const SIMD_8x32& other)
178          {
179          m_avx2 = _mm256_xor_si256(m_avx2, other.m_avx2);
180          }
181 
182       BOTAN_FUNC_ISA("avx2")
183       void operator|=(const SIMD_8x32& other)
184          {
185          m_avx2 = _mm256_or_si256(m_avx2, other.m_avx2);
186          }
187 
188       BOTAN_FUNC_ISA("avx2")
189       void operator&=(const SIMD_8x32& other)
190          {
191          m_avx2 = _mm256_and_si256(m_avx2, other.m_avx2);
192          }
193 
shl()194       template<int SHIFT> BOTAN_FUNC_ISA("avx2") SIMD_8x32 shl() const
195          {
196          return SIMD_8x32(_mm256_slli_epi32(m_avx2, SHIFT));
197          }
198 
shr()199       template<int SHIFT> BOTAN_FUNC_ISA("avx2") SIMD_8x32 shr() const
200          {
201          return SIMD_8x32(_mm256_srli_epi32(m_avx2, SHIFT));
202          }
203 
204       BOTAN_FUNC_ISA("avx2")
205       SIMD_8x32 operator~() const
206          {
207          return SIMD_8x32(_mm256_xor_si256(m_avx2, _mm256_set1_epi32(0xFFFFFFFF)));
208          }
209 
210       // (~reg) & other
211       BOTAN_FUNC_ISA("avx2")
andc(const SIMD_8x32 & other)212       SIMD_8x32 andc(const SIMD_8x32& other) const
213          {
214          return SIMD_8x32(_mm256_andnot_si256(m_avx2, other.m_avx2));
215          }
216 
217       BOTAN_FUNC_ISA("avx2")
bswap()218       SIMD_8x32 bswap() const
219          {
220          const uint8_t BSWAP_MASK[32] = { 3, 2, 1, 0,
221                                           7, 6, 5, 4,
222                                           11, 10, 9, 8,
223                                           15, 14, 13, 12,
224                                           19, 18, 17, 16,
225                                           23, 22, 21, 20,
226                                           27, 26, 25, 24,
227                                           31, 30, 29, 28 };
228 
229          const __m256i bswap = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(BSWAP_MASK));
230 
231          const __m256i output = _mm256_shuffle_epi8(m_avx2, bswap);
232 
233          return SIMD_8x32(output);
234          }
235 
236       BOTAN_FUNC_ISA("avx2")
transpose(SIMD_8x32 & B0,SIMD_8x32 & B1,SIMD_8x32 & B2,SIMD_8x32 & B3)237       static void transpose(SIMD_8x32& B0, SIMD_8x32& B1,
238                             SIMD_8x32& B2, SIMD_8x32& B3)
239          {
240          const __m256i T0 = _mm256_unpacklo_epi32(B0.m_avx2, B1.m_avx2);
241          const __m256i T1 = _mm256_unpacklo_epi32(B2.m_avx2, B3.m_avx2);
242          const __m256i T2 = _mm256_unpackhi_epi32(B0.m_avx2, B1.m_avx2);
243          const __m256i T3 = _mm256_unpackhi_epi32(B2.m_avx2, B3.m_avx2);
244 
245          B0.m_avx2 = _mm256_unpacklo_epi64(T0, T1);
246          B1.m_avx2 = _mm256_unpackhi_epi64(T0, T1);
247          B2.m_avx2 = _mm256_unpacklo_epi64(T2, T3);
248          B3.m_avx2 = _mm256_unpackhi_epi64(T2, T3);
249          }
250 
251       BOTAN_FUNC_ISA("avx2")
transpose(SIMD_8x32 & B0,SIMD_8x32 & B1,SIMD_8x32 & B2,SIMD_8x32 & B3,SIMD_8x32 & B4,SIMD_8x32 & B5,SIMD_8x32 & B6,SIMD_8x32 & B7)252       static void transpose(SIMD_8x32& B0, SIMD_8x32& B1,
253                             SIMD_8x32& B2, SIMD_8x32& B3,
254                             SIMD_8x32& B4, SIMD_8x32& B5,
255                             SIMD_8x32& B6, SIMD_8x32& B7)
256          {
257          transpose(B0, B1, B2, B3);
258          transpose(B4, B5, B6, B7);
259 
260          swap_tops(B0, B4);
261          swap_tops(B1, B5);
262          swap_tops(B2, B6);
263          swap_tops(B3, B7);
264          }
265 
266       BOTAN_FUNC_ISA("avx2")
reset_registers()267       static void reset_registers()
268          {
269          _mm256_zeroupper();
270          }
271 
272       BOTAN_FUNC_ISA("avx2")
zero_registers()273       static void zero_registers()
274          {
275          _mm256_zeroall();
276          }
277 
handle()278       __m256i BOTAN_FUNC_ISA("avx2") handle() const { return m_avx2; }
279 
280       BOTAN_FUNC_ISA("avx2")
SIMD_8x32(__m256i x)281       SIMD_8x32(__m256i x) : m_avx2(x) {}
282 
283    private:
284 
285       BOTAN_FUNC_ISA("avx2")
swap_tops(SIMD_8x32 & A,SIMD_8x32 & B)286       static void swap_tops(SIMD_8x32& A, SIMD_8x32& B)
287          {
288          SIMD_8x32 T0 = _mm256_permute2x128_si256(A.handle(), B.handle(), 0 + (2 << 4));
289          SIMD_8x32 T1 = _mm256_permute2x128_si256(A.handle(), B.handle(), 1 + (3 << 4));
290          A = T0;
291          B = T1;
292          }
293 
294       __m256i m_avx2;
295    };
296 
297 }
298 
299 #endif
300