1 /* 2 * (C) 2018 Jack Lloyd 3 * 4 * Botan is released under the Simplified BSD License (see license.txt) 5 */ 6 7 #ifndef BOTAN_SIMD_AVX2_H_ 8 #define BOTAN_SIMD_AVX2_H_ 9 10 #include <botan/types.h> 11 #include <immintrin.h> 12 13 namespace Botan { 14 15 class SIMD_8x32 final 16 { 17 public: 18 19 SIMD_8x32& operator=(const SIMD_8x32& other) = default; 20 SIMD_8x32(const SIMD_8x32& other) = default; 21 22 SIMD_8x32& operator=(SIMD_8x32&& other) = default; 23 SIMD_8x32(SIMD_8x32&& other) = default; 24 25 BOTAN_FUNC_ISA("avx2") SIMD_8x32()26 BOTAN_FORCE_INLINE SIMD_8x32() 27 { 28 m_avx2 = _mm256_setzero_si256(); 29 } 30 31 BOTAN_FUNC_ISA("avx2") SIMD_8x32(const uint32_t B[8])32 explicit SIMD_8x32(const uint32_t B[8]) 33 { 34 m_avx2 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(B)); 35 } 36 37 BOTAN_FUNC_ISA("avx2") SIMD_8x32(uint32_t B0,uint32_t B1,uint32_t B2,uint32_t B3,uint32_t B4,uint32_t B5,uint32_t B6,uint32_t B7)38 explicit SIMD_8x32(uint32_t B0, uint32_t B1, uint32_t B2, uint32_t B3, 39 uint32_t B4, uint32_t B5, uint32_t B6, uint32_t B7) 40 { 41 m_avx2 = _mm256_set_epi32(B7, B6, B5, B4, B3, B2, B1, B0); 42 } 43 44 BOTAN_FUNC_ISA("avx2") splat(uint32_t B)45 static SIMD_8x32 splat(uint32_t B) 46 { 47 return SIMD_8x32(_mm256_set1_epi32(B)); 48 } 49 50 BOTAN_FUNC_ISA("avx2") load_le(const uint8_t * in)51 static SIMD_8x32 load_le(const uint8_t* in) 52 { 53 return SIMD_8x32(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(in))); 54 } 55 56 BOTAN_FUNC_ISA("avx2") load_be(const uint8_t * in)57 static SIMD_8x32 load_be(const uint8_t* in) 58 { 59 return load_le(in).bswap(); 60 } 61 62 BOTAN_FUNC_ISA("avx2") store_le(uint8_t out[])63 void store_le(uint8_t out[]) const 64 { 65 _mm256_storeu_si256(reinterpret_cast<__m256i*>(out), m_avx2); 66 } 67 68 BOTAN_FUNC_ISA("avx2") store_be(uint8_t out[])69 void store_be(uint8_t out[]) const 70 { 71 bswap().store_le(out); 72 } 73 74 template<size_t ROT> 75 BOTAN_FUNC_ISA("avx2") rotl()76 SIMD_8x32 rotl() const 77 { 78 static_assert(ROT > 0 && ROT < 32, "Invalid rotation constant"); 79 80 #if defined(__AVX512VL__) 81 return SIMD_8x32(_mm256_rol_epi32(m_avx2, ROT)); 82 #else 83 BOTAN_IF_CONSTEXPR(ROT == 8) 84 { 85 const __m256i shuf_rotl_8 = _mm256_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3, 86 14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3); 87 88 return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_8)); 89 } 90 else BOTAN_IF_CONSTEXPR(ROT == 16) 91 { 92 const __m256i shuf_rotl_16 = _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, 93 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); 94 95 return SIMD_8x32(_mm256_shuffle_epi8(m_avx2, shuf_rotl_16)); 96 } 97 else 98 { 99 return SIMD_8x32(_mm256_or_si256(_mm256_slli_epi32(m_avx2, static_cast<int>(ROT)), 100 _mm256_srli_epi32(m_avx2, static_cast<int>(32-ROT)))); 101 } 102 #endif 103 } 104 105 template<size_t ROT> 106 BOTAN_FUNC_ISA("avx2") rotr()107 SIMD_8x32 rotr() const 108 { 109 return this->rotl<32-ROT>(); 110 } 111 112 template<size_t ROT1, size_t ROT2, size_t ROT3> rho()113 SIMD_8x32 BOTAN_FUNC_ISA("avx2") rho() const 114 { 115 SIMD_8x32 res; 116 117 const SIMD_8x32 rot1 = this->rotr<ROT1>(); 118 const SIMD_8x32 rot2 = this->rotr<ROT2>(); 119 const SIMD_8x32 rot3 = this->rotr<ROT3>(); 120 121 return rot1 ^ rot2 ^ rot3; 122 } 123 124 BOTAN_FUNC_ISA("avx2") 125 SIMD_8x32 operator+(const SIMD_8x32& other) const 126 { 127 SIMD_8x32 retval(*this); 128 retval += other; 129 return retval; 130 } 131 132 BOTAN_FUNC_ISA("avx2") 133 SIMD_8x32 operator-(const SIMD_8x32& other) const 134 { 135 SIMD_8x32 retval(*this); 136 retval -= other; 137 return retval; 138 } 139 140 BOTAN_FUNC_ISA("avx2") 141 SIMD_8x32 operator^(const SIMD_8x32& other) const 142 { 143 SIMD_8x32 retval(*this); 144 retval ^= other; 145 return retval; 146 } 147 148 BOTAN_FUNC_ISA("avx2") 149 SIMD_8x32 operator|(const SIMD_8x32& other) const 150 { 151 SIMD_8x32 retval(*this); 152 retval |= other; 153 return retval; 154 } 155 156 BOTAN_FUNC_ISA("avx2") 157 SIMD_8x32 operator&(const SIMD_8x32& other) const 158 { 159 SIMD_8x32 retval(*this); 160 retval &= other; 161 return retval; 162 } 163 164 BOTAN_FUNC_ISA("avx2") 165 void operator+=(const SIMD_8x32& other) 166 { 167 m_avx2 = _mm256_add_epi32(m_avx2, other.m_avx2); 168 } 169 170 BOTAN_FUNC_ISA("avx2") 171 void operator-=(const SIMD_8x32& other) 172 { 173 m_avx2 = _mm256_sub_epi32(m_avx2, other.m_avx2); 174 } 175 176 BOTAN_FUNC_ISA("avx2") 177 void operator^=(const SIMD_8x32& other) 178 { 179 m_avx2 = _mm256_xor_si256(m_avx2, other.m_avx2); 180 } 181 182 BOTAN_FUNC_ISA("avx2") 183 void operator|=(const SIMD_8x32& other) 184 { 185 m_avx2 = _mm256_or_si256(m_avx2, other.m_avx2); 186 } 187 188 BOTAN_FUNC_ISA("avx2") 189 void operator&=(const SIMD_8x32& other) 190 { 191 m_avx2 = _mm256_and_si256(m_avx2, other.m_avx2); 192 } 193 shl()194 template<int SHIFT> BOTAN_FUNC_ISA("avx2") SIMD_8x32 shl() const 195 { 196 return SIMD_8x32(_mm256_slli_epi32(m_avx2, SHIFT)); 197 } 198 shr()199 template<int SHIFT> BOTAN_FUNC_ISA("avx2") SIMD_8x32 shr() const 200 { 201 return SIMD_8x32(_mm256_srli_epi32(m_avx2, SHIFT)); 202 } 203 204 BOTAN_FUNC_ISA("avx2") 205 SIMD_8x32 operator~() const 206 { 207 return SIMD_8x32(_mm256_xor_si256(m_avx2, _mm256_set1_epi32(0xFFFFFFFF))); 208 } 209 210 // (~reg) & other 211 BOTAN_FUNC_ISA("avx2") andc(const SIMD_8x32 & other)212 SIMD_8x32 andc(const SIMD_8x32& other) const 213 { 214 return SIMD_8x32(_mm256_andnot_si256(m_avx2, other.m_avx2)); 215 } 216 217 BOTAN_FUNC_ISA("avx2") bswap()218 SIMD_8x32 bswap() const 219 { 220 const uint8_t BSWAP_MASK[32] = { 3, 2, 1, 0, 221 7, 6, 5, 4, 222 11, 10, 9, 8, 223 15, 14, 13, 12, 224 19, 18, 17, 16, 225 23, 22, 21, 20, 226 27, 26, 25, 24, 227 31, 30, 29, 28 }; 228 229 const __m256i bswap = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(BSWAP_MASK)); 230 231 const __m256i output = _mm256_shuffle_epi8(m_avx2, bswap); 232 233 return SIMD_8x32(output); 234 } 235 236 BOTAN_FUNC_ISA("avx2") transpose(SIMD_8x32 & B0,SIMD_8x32 & B1,SIMD_8x32 & B2,SIMD_8x32 & B3)237 static void transpose(SIMD_8x32& B0, SIMD_8x32& B1, 238 SIMD_8x32& B2, SIMD_8x32& B3) 239 { 240 const __m256i T0 = _mm256_unpacklo_epi32(B0.m_avx2, B1.m_avx2); 241 const __m256i T1 = _mm256_unpacklo_epi32(B2.m_avx2, B3.m_avx2); 242 const __m256i T2 = _mm256_unpackhi_epi32(B0.m_avx2, B1.m_avx2); 243 const __m256i T3 = _mm256_unpackhi_epi32(B2.m_avx2, B3.m_avx2); 244 245 B0.m_avx2 = _mm256_unpacklo_epi64(T0, T1); 246 B1.m_avx2 = _mm256_unpackhi_epi64(T0, T1); 247 B2.m_avx2 = _mm256_unpacklo_epi64(T2, T3); 248 B3.m_avx2 = _mm256_unpackhi_epi64(T2, T3); 249 } 250 251 BOTAN_FUNC_ISA("avx2") transpose(SIMD_8x32 & B0,SIMD_8x32 & B1,SIMD_8x32 & B2,SIMD_8x32 & B3,SIMD_8x32 & B4,SIMD_8x32 & B5,SIMD_8x32 & B6,SIMD_8x32 & B7)252 static void transpose(SIMD_8x32& B0, SIMD_8x32& B1, 253 SIMD_8x32& B2, SIMD_8x32& B3, 254 SIMD_8x32& B4, SIMD_8x32& B5, 255 SIMD_8x32& B6, SIMD_8x32& B7) 256 { 257 transpose(B0, B1, B2, B3); 258 transpose(B4, B5, B6, B7); 259 260 swap_tops(B0, B4); 261 swap_tops(B1, B5); 262 swap_tops(B2, B6); 263 swap_tops(B3, B7); 264 } 265 266 BOTAN_FUNC_ISA("avx2") reset_registers()267 static void reset_registers() 268 { 269 _mm256_zeroupper(); 270 } 271 272 BOTAN_FUNC_ISA("avx2") zero_registers()273 static void zero_registers() 274 { 275 _mm256_zeroall(); 276 } 277 handle()278 __m256i BOTAN_FUNC_ISA("avx2") handle() const { return m_avx2; } 279 280 BOTAN_FUNC_ISA("avx2") SIMD_8x32(__m256i x)281 SIMD_8x32(__m256i x) : m_avx2(x) {} 282 283 private: 284 285 BOTAN_FUNC_ISA("avx2") swap_tops(SIMD_8x32 & A,SIMD_8x32 & B)286 static void swap_tops(SIMD_8x32& A, SIMD_8x32& B) 287 { 288 SIMD_8x32 T0 = _mm256_permute2x128_si256(A.handle(), B.handle(), 0 + (2 << 4)); 289 SIMD_8x32 T1 = _mm256_permute2x128_si256(A.handle(), B.handle(), 1 + (3 << 4)); 290 A = T0; 291 B = T1; 292 } 293 294 __m256i m_avx2; 295 }; 296 297 } 298 299 #endif 300