#define VEC8_ROT(A, IMM) \
    _mm256_or_si256(_mm256_slli_epi32(A, IMM), _mm256_srli_epi32(A, (32 - IMM)))

/* implements a vector quarter round by-the-book (naive!) */
#define VEC8_QUARTERROUND_NAIVE(A, B, C, D) \
    x_##A = _mm256_add_epi32(x_##A, x_##B); \
    t_##A = _mm256_xor_si256(x_##D, x_##A); \
    x_##D = VEC8_ROT(t_##A, 16); \
    x_##C = _mm256_add_epi32(x_##C, x_##D); \
    t_##C = _mm256_xor_si256(x_##B, x_##C); \
    x_##B = VEC8_ROT(t_##C, 12); \
    x_##A = _mm256_add_epi32(x_##A, x_##B); \
    t_##A = _mm256_xor_si256(x_##D, x_##A); \
    x_##D = VEC8_ROT(t_##A, 8); \
    x_##C = _mm256_add_epi32(x_##C, x_##D); \
    t_##C = _mm256_xor_si256(x_##B, x_##C); \
    x_##B = VEC8_ROT(t_##C, 7)

/* same, but the two multiple-of-8 shift/shift/or "rotations" (by 16 and by 8)
 * are replaced by byte shuffles (better) */
#define VEC8_QUARTERROUND_SHUFFLE(A, B, C, D) \
    x_##A = _mm256_add_epi32(x_##A, x_##B); \
    t_##A = _mm256_xor_si256(x_##D, x_##A); \
    x_##D = _mm256_shuffle_epi8(t_##A, rot16); \
    x_##C = _mm256_add_epi32(x_##C, x_##D); \
    t_##C = _mm256_xor_si256(x_##B, x_##C); \
    x_##B = VEC8_ROT(t_##C, 12); \
    x_##A = _mm256_add_epi32(x_##A, x_##B); \
    t_##A = _mm256_xor_si256(x_##D, x_##A); \
    x_##D = _mm256_shuffle_epi8(t_##A, rot8); \
    x_##C = _mm256_add_epi32(x_##C, x_##D); \
    t_##C = _mm256_xor_si256(x_##B, x_##C); \
    x_##B = VEC8_ROT(t_##C, 7)

/* same, but the rotation by 16 uses word shuffles and the rotation by 8 uses
 * a byte shuffle (not as good as the previous variant) */
#define VEC8_QUARTERROUND_SHUFFLE2(A, B, C, D) \
    x_##A = _mm256_add_epi32(x_##A, x_##B); \
    t_##A = _mm256_xor_si256(x_##D, x_##A); \
    x_##D = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(t_##A, 0xb1), 0xb1); \
    x_##C = _mm256_add_epi32(x_##C, x_##D); \
    t_##C = _mm256_xor_si256(x_##B, x_##C); \
    x_##B = VEC8_ROT(t_##C, 12); \
    x_##A = _mm256_add_epi32(x_##A, x_##B); \
    t_##A = _mm256_xor_si256(x_##D, x_##A); \
    x_##D = _mm256_shuffle_epi8(t_##A, rot8); \
    x_##C = _mm256_add_epi32(x_##C, x_##D); \
    t_##C = _mm256_xor_si256(x_##B, x_##C); \
    x_##B = VEC8_ROT(t_##C, 7)

#define VEC8_QUARTERROUND(A, B, C, D) VEC8_QUARTERROUND_SHUFFLE(A, B, C, D)
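
/* Why the shuffles work: rotating a 32-bit lane by a multiple of 8 only moves
 * whole bytes, so a single _mm256_shuffle_epi8 through the rot16/rot8 masks
 * defined below replaces the shift/shift/or sequence of VEC8_ROT. For one
 * little-endian lane with bytes (b0, b1, b2, b3):
 *   rotate left 16: (b0, b1, b2, b3) -> (b2, b3, b0, b1)
 *   rotate left  8: (b0, b1, b2, b3) -> (b3, b0, b1, b2)
 * The rotations by 12 and by 7 cross byte boundaries and keep VEC8_ROT. */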

#define VEC8_LINE1(A, B, C, D) \
    x_##A = _mm256_add_epi32(x_##A, x_##B); \
    x_##D = _mm256_shuffle_epi8(_mm256_xor_si256(x_##D, x_##A), rot16)
#define VEC8_LINE2(A, B, C, D) \
    x_##C = _mm256_add_epi32(x_##C, x_##D); \
    x_##B = VEC8_ROT(_mm256_xor_si256(x_##B, x_##C), 12)
#define VEC8_LINE3(A, B, C, D) \
    x_##A = _mm256_add_epi32(x_##A, x_##B); \
    x_##D = _mm256_shuffle_epi8(_mm256_xor_si256(x_##D, x_##A), rot8)
#define VEC8_LINE4(A, B, C, D) \
    x_##C = _mm256_add_epi32(x_##C, x_##D); \
    x_##B = VEC8_ROT(_mm256_xor_si256(x_##B, x_##C), 7)

#define VEC8_ROUND_SEQ(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, A4, B4, \
                       C4, D4) \
    VEC8_LINE1(A1, B1, C1, D1); \
    VEC8_LINE1(A2, B2, C2, D2); \
    VEC8_LINE1(A3, B3, C3, D3); \
    VEC8_LINE1(A4, B4, C4, D4); \
    VEC8_LINE2(A1, B1, C1, D1); \
    VEC8_LINE2(A2, B2, C2, D2); \
    VEC8_LINE2(A3, B3, C3, D3); \
    VEC8_LINE2(A4, B4, C4, D4); \
    VEC8_LINE3(A1, B1, C1, D1); \
    VEC8_LINE3(A2, B2, C2, D2); \
    VEC8_LINE3(A3, B3, C3, D3); \
    VEC8_LINE3(A4, B4, C4, D4); \
    VEC8_LINE4(A1, B1, C1, D1); \
    VEC8_LINE4(A2, B2, C2, D2); \
    VEC8_LINE4(A3, B3, C3, D3); \
    VEC8_LINE4(A4, B4, C4, D4)

#define VEC8_ROUND_HALF(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, A4, \
                        B4, C4, D4) \
    VEC8_LINE1(A1, B1, C1, D1); \
    VEC8_LINE1(A2, B2, C2, D2); \
    VEC8_LINE2(A1, B1, C1, D1); \
    VEC8_LINE2(A2, B2, C2, D2); \
    VEC8_LINE3(A1, B1, C1, D1); \
    VEC8_LINE3(A2, B2, C2, D2); \
    VEC8_LINE4(A1, B1, C1, D1); \
    VEC8_LINE4(A2, B2, C2, D2); \
    VEC8_LINE1(A3, B3, C3, D3); \
    VEC8_LINE1(A4, B4, C4, D4); \
    VEC8_LINE2(A3, B3, C3, D3); \
    VEC8_LINE2(A4, B4, C4, D4); \
    VEC8_LINE3(A3, B3, C3, D3); \
    VEC8_LINE3(A4, B4, C4, D4); \
    VEC8_LINE4(A3, B3, C3, D3); \
    VEC8_LINE4(A4, B4, C4, D4)

#define VEC8_ROUND_HALFANDHALF(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, \
                               A4, B4, C4, D4) \
    VEC8_LINE1(A1, B1, C1, D1); \
    VEC8_LINE1(A2, B2, C2, D2); \
    VEC8_LINE2(A1, B1, C1, D1); \
    VEC8_LINE2(A2, B2, C2, D2); \
    VEC8_LINE1(A3, B3, C3, D3); \
    VEC8_LINE1(A4, B4, C4, D4); \
    VEC8_LINE2(A3, B3, C3, D3); \
    VEC8_LINE2(A4, B4, C4, D4); \
    VEC8_LINE3(A1, B1, C1, D1); \
    VEC8_LINE3(A2, B2, C2, D2); \
    VEC8_LINE4(A1, B1, C1, D1); \
    VEC8_LINE4(A2, B2, C2, D2); \
    VEC8_LINE3(A3, B3, C3, D3); \
    VEC8_LINE3(A4, B4, C4, D4); \
    VEC8_LINE4(A3, B3, C3, D3); \
    VEC8_LINE4(A4, B4, C4, D4)

#define VEC8_ROUND(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, A4, B4, C4, \
                   D4) \
    VEC8_ROUND_SEQ(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, A4, B4, C4, \
                   D4)
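
/* Scheduling note: the four quarter-rounds of a ChaCha round touch disjoint
 * state words, so VEC8_ROUND_SEQ issues step 1 of all four columns before any
 * step 2, and so on, hiding the latency of each quarter-round's
 * add -> xor -> rotate dependency chain behind independent work.
 * VEC8_ROUND_HALF and VEC8_ROUND_HALFANDHALF are alternative schedules that
 * interleave only two columns at a time; which ordering is fastest presumably
 * depends on the microarchitecture. SEQ is the variant selected here. */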

if (bytes >= 512) {
    /* constants for the byte shuffles that replace the multiple-of-8 rotates */
    __m256i rot16 =
        _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2,
                        13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
    __m256i rot8 =
        _mm256_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3,
                        14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3);
    uint32_t in12, in13;

    /* broadcasting the state words one at a time (the "naive" way) seems as
     * fast as, if not slightly faster than, a vectorized load-and-shuffle */
    __m256i x_0 = _mm256_set1_epi32(x[0]);
    __m256i x_1 = _mm256_set1_epi32(x[1]);
    __m256i x_2 = _mm256_set1_epi32(x[2]);
    __m256i x_3 = _mm256_set1_epi32(x[3]);
    __m256i x_4 = _mm256_set1_epi32(x[4]);
    __m256i x_5 = _mm256_set1_epi32(x[5]);
    __m256i x_6 = _mm256_set1_epi32(x[6]);
    __m256i x_7 = _mm256_set1_epi32(x[7]);
    __m256i x_8 = _mm256_set1_epi32(x[8]);
    __m256i x_9 = _mm256_set1_epi32(x[9]);
    __m256i x_10 = _mm256_set1_epi32(x[10]);
    __m256i x_11 = _mm256_set1_epi32(x[11]);
    __m256i x_12; /* per-block counters, filled in below */
    __m256i x_13;
    __m256i x_14 = _mm256_set1_epi32(x[14]);
    __m256i x_15 = _mm256_set1_epi32(x[15]);

    __m256i orig0 = x_0;
    __m256i orig1 = x_1;
    __m256i orig2 = x_2;
    __m256i orig3 = x_3;
    __m256i orig4 = x_4;
    __m256i orig5 = x_5;
    __m256i orig6 = x_6;
    __m256i orig7 = x_7;
    __m256i orig8 = x_8;
    __m256i orig9 = x_9;
    __m256i orig10 = x_10;
    __m256i orig11 = x_11;
    __m256i orig12;
    __m256i orig13;
    __m256i orig14 = x_14;
    __m256i orig15 = x_15;
    __m256i t_0, t_1, t_2, t_3, t_4, t_5, t_6, t_7, t_8, t_9, t_10, t_11, t_12,
        t_13, t_14, t_15;

    while (bytes >= 512) {
        const __m256i addv12  = _mm256_set_epi64x(3, 2, 1, 0);
        const __m256i addv13  = _mm256_set_epi64x(7, 6, 5, 4);
        const __m256i permute = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
        __m256i t12, t13;

        uint64_t in1213;
        int      i;

        x_0  = orig0;
        x_1  = orig1;
        x_2  = orig2;
        x_3  = orig3;
        x_4  = orig4;
        x_5  = orig5;
        x_6  = orig6;
        x_7  = orig7;
        x_8  = orig8;
        x_9  = orig9;
        x_10 = orig10;
        x_11 = orig11;
        x_14 = orig14;
        x_15 = orig15;

        /* build the eight 64-bit block counters n+0..n+7: broadcast the
         * current counter, add 0..3 and 4..7 as qwords, then transpose so
         * that lane j of x_12/x_13 holds the low/high 32 bits of counter
         * n+j */
        in12 = x[12];
        in13 = x[13];
        in1213 = ((uint64_t) in12) | (((uint64_t) in13) << 32);
        x_12 = x_13 = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(in1213));

        t12 = _mm256_add_epi64(addv12, x_12);
        t13 = _mm256_add_epi64(addv13, x_13);

        x_12 = _mm256_unpacklo_epi32(t12, t13);
        x_13 = _mm256_unpackhi_epi32(t12, t13);

        t12 = _mm256_unpacklo_epi32(x_12, x_13);
        t13 = _mm256_unpackhi_epi32(x_12, x_13);

        /* required because unpack* are intra-lane */
        x_12 = _mm256_permutevar8x32_epi32(t12, permute);
        x_13 = _mm256_permutevar8x32_epi32(t13, permute);

        orig12 = x_12;
        orig13 = x_13;

        in1213 += 8;

        x[12] = in1213 & 0xFFFFFFFF;
        x[13] = (in1213 >> 32) & 0xFFFFFFFF;

        for (i = 0; i < ROUNDS; i += 2) {
            VEC8_ROUND(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
            VEC8_ROUND(0, 5, 10, 15, 1, 6, 11, 12, 2, 7, 8, 13, 3, 4, 9, 14);
        }
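
/* After the rounds, lane j of x_i holds word i of block n+j: the eight blocks
 * live transposed across the sixteen registers. The macros below add the
 * original state back in, transpose so that each output block becomes
 * contiguous, then XOR with the message and store. */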

/* adds the input back and transposes one quad of words, then XOR/stores
 * 16 bytes per block through 128-bit halves (not used below: ONEOCTO handles
 * two quads at once with full 256-bit stores) */
#define ONEQUAD_TRANSPOSE(A, B, C, D) \
    { \
        __m128i t0, t1, t2, t3; \
        x_##A = _mm256_add_epi32(x_##A, orig##A); \
        x_##B = _mm256_add_epi32(x_##B, orig##B); \
        x_##C = _mm256_add_epi32(x_##C, orig##C); \
        x_##D = _mm256_add_epi32(x_##D, orig##D); \
        t_##A = _mm256_unpacklo_epi32(x_##A, x_##B); \
        t_##B = _mm256_unpacklo_epi32(x_##C, x_##D); \
        t_##C = _mm256_unpackhi_epi32(x_##A, x_##B); \
        t_##D = _mm256_unpackhi_epi32(x_##C, x_##D); \
        x_##A = _mm256_unpacklo_epi64(t_##A, t_##B); \
        x_##B = _mm256_unpackhi_epi64(t_##A, t_##B); \
        x_##C = _mm256_unpacklo_epi64(t_##C, t_##D); \
        x_##D = _mm256_unpackhi_epi64(t_##C, t_##D); \
        t0 = _mm_xor_si128(_mm256_extracti128_si256(x_##A, 0), \
                           _mm_loadu_si128((__m128i*) (m + 0))); \
        _mm_storeu_si128((__m128i*) (c + 0), t0); \
        t1 = _mm_xor_si128(_mm256_extracti128_si256(x_##B, 0), \
                           _mm_loadu_si128((__m128i*) (m + 64))); \
        _mm_storeu_si128((__m128i*) (c + 64), t1); \
        t2 = _mm_xor_si128(_mm256_extracti128_si256(x_##C, 0), \
                           _mm_loadu_si128((__m128i*) (m + 128))); \
        _mm_storeu_si128((__m128i*) (c + 128), t2); \
        t3 = _mm_xor_si128(_mm256_extracti128_si256(x_##D, 0), \
                           _mm_loadu_si128((__m128i*) (m + 192))); \
        _mm_storeu_si128((__m128i*) (c + 192), t3); \
        t0 = _mm_xor_si128(_mm256_extracti128_si256(x_##A, 1), \
                           _mm_loadu_si128((__m128i*) (m + 256))); \
        _mm_storeu_si128((__m128i*) (c + 256), t0); \
        t1 = _mm_xor_si128(_mm256_extracti128_si256(x_##B, 1), \
                           _mm_loadu_si128((__m128i*) (m + 320))); \
        _mm_storeu_si128((__m128i*) (c + 320), t1); \
        t2 = _mm_xor_si128(_mm256_extracti128_si256(x_##C, 1), \
                           _mm_loadu_si128((__m128i*) (m + 384))); \
        _mm_storeu_si128((__m128i*) (c + 384), t2); \
        t3 = _mm_xor_si128(_mm256_extracti128_si256(x_##D, 1), \
                           _mm_loadu_si128((__m128i*) (m + 448))); \
        _mm_storeu_si128((__m128i*) (c + 448), t3); \
    }

#define ONEQUAD(A, B, C, D) ONEQUAD_TRANSPOSE(A, B, C, D)

/* add-back and intra-lane transpose only; stores are left to the caller */
#define ONEQUAD_UNPCK(A, B, C, D) \
    { \
        x_##A = _mm256_add_epi32(x_##A, orig##A); \
        x_##B = _mm256_add_epi32(x_##B, orig##B); \
        x_##C = _mm256_add_epi32(x_##C, orig##C); \
        x_##D = _mm256_add_epi32(x_##D, orig##D); \
        t_##A = _mm256_unpacklo_epi32(x_##A, x_##B); \
        t_##B = _mm256_unpacklo_epi32(x_##C, x_##D); \
        t_##C = _mm256_unpackhi_epi32(x_##A, x_##B); \
        t_##D = _mm256_unpackhi_epi32(x_##C, x_##D); \
        x_##A = _mm256_unpacklo_epi64(t_##A, t_##B); \
        x_##B = _mm256_unpackhi_epi64(t_##A, t_##B); \
        x_##C = _mm256_unpacklo_epi64(t_##C, t_##D); \
        x_##D = _mm256_unpackhi_epi64(t_##C, t_##D); \
    }

/* transposes two quads, merges their 128-bit lanes with permute2x128 so that
 * 32 contiguous output bytes per block land in one register, then XORs with
 * the message and stores full 256-bit vectors */
#define ONEOCTO(A, B, C, D, A2, B2, C2, D2) \
    { \
        ONEQUAD_UNPCK(A, B, C, D); \
        ONEQUAD_UNPCK(A2, B2, C2, D2); \
        t_##A  = _mm256_permute2x128_si256(x_##A, x_##A2, 0x20); \
        t_##A2 = _mm256_permute2x128_si256(x_##A, x_##A2, 0x31); \
        t_##B  = _mm256_permute2x128_si256(x_##B, x_##B2, 0x20); \
        t_##B2 = _mm256_permute2x128_si256(x_##B, x_##B2, 0x31); \
        t_##C  = _mm256_permute2x128_si256(x_##C, x_##C2, 0x20); \
        t_##C2 = _mm256_permute2x128_si256(x_##C, x_##C2, 0x31); \
        t_##D  = _mm256_permute2x128_si256(x_##D, x_##D2, 0x20); \
        t_##D2 = _mm256_permute2x128_si256(x_##D, x_##D2, 0x31); \
        t_##A  = _mm256_xor_si256(t_##A, _mm256_loadu_si256((__m256i*) (m + 0))); \
        t_##B  = _mm256_xor_si256(t_##B, _mm256_loadu_si256((__m256i*) (m + 64))); \
        t_##C  = _mm256_xor_si256(t_##C, _mm256_loadu_si256((__m256i*) (m + 128))); \
        t_##D  = _mm256_xor_si256(t_##D, _mm256_loadu_si256((__m256i*) (m + 192))); \
        t_##A2 = _mm256_xor_si256(t_##A2, _mm256_loadu_si256((__m256i*) (m + 256))); \
        t_##B2 = _mm256_xor_si256(t_##B2, _mm256_loadu_si256((__m256i*) (m + 320))); \
        t_##C2 = _mm256_xor_si256(t_##C2, _mm256_loadu_si256((__m256i*) (m + 384))); \
        t_##D2 = _mm256_xor_si256(t_##D2, _mm256_loadu_si256((__m256i*) (m + 448))); \
        _mm256_storeu_si256((__m256i*) (c + 0), t_##A); \
        _mm256_storeu_si256((__m256i*) (c + 64), t_##B); \
        _mm256_storeu_si256((__m256i*) (c + 128), t_##C); \
        _mm256_storeu_si256((__m256i*) (c + 192), t_##D); \
        _mm256_storeu_si256((__m256i*) (c + 256), t_##A2); \
        _mm256_storeu_si256((__m256i*) (c + 320), t_##B2); \
        _mm256_storeu_si256((__m256i*) (c + 384), t_##C2); \
        _mm256_storeu_si256((__m256i*) (c + 448), t_##D2); \
    }

        /* words 0..7 form the first 32 bytes of each 64-byte block; the
         * second call, with m and c advanced by 32, fills in words 8..15 */
        ONEOCTO(0, 1, 2, 3, 4, 5, 6, 7);
        m += 32;
        c += 32;
        ONEOCTO(8, 9, 10, 11, 12, 13, 14, 15);
        m -= 32;
        c -= 32;

#undef ONEQUAD
#undef ONEQUAD_TRANSPOSE
#undef ONEQUAD_UNPCK
#undef ONEOCTO

        bytes -= 512;
        c += 512;
        m += 512;
    }
}
#undef VEC8_ROT
#undef VEC8_QUARTERROUND
#undef VEC8_QUARTERROUND_NAIVE
#undef VEC8_QUARTERROUND_SHUFFLE
#undef VEC8_QUARTERROUND_SHUFFLE2
#undef VEC8_LINE1
#undef VEC8_LINE2
#undef VEC8_LINE3
#undef VEC8_LINE4
#undef VEC8_ROUND
#undef VEC8_ROUND_SEQ
#undef VEC8_ROUND_HALF
#undef VEC8_ROUND_HALFANDHALF
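
/* Fewer than 512 bytes remain at this point; this fragment only handles full
 * 8-block (512-byte) chunks, so any remainder is presumably processed by the
 * narrower code paths that follow in the including file. */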