if (bytes >= 512) {
    __m256i y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14,
        y15;

    /* the naive way seems as fast (if not a bit faster) than the vector way */
    __m256i z0  = _mm256_set1_epi32(x[0]);
    __m256i z5  = _mm256_set1_epi32(x[1]);
    __m256i z10 = _mm256_set1_epi32(x[2]);
    __m256i z15 = _mm256_set1_epi32(x[3]);
    __m256i z12 = _mm256_set1_epi32(x[4]);
    __m256i z1  = _mm256_set1_epi32(x[5]);
    __m256i z6  = _mm256_set1_epi32(x[6]);
    __m256i z11 = _mm256_set1_epi32(x[7]);
    __m256i z8; /* useless */
    __m256i z13 = _mm256_set1_epi32(x[9]);
    __m256i z2  = _mm256_set1_epi32(x[10]);
    __m256i z7  = _mm256_set1_epi32(x[11]);
    __m256i z4  = _mm256_set1_epi32(x[12]);
    __m256i z9; /* useless */
    __m256i z14 = _mm256_set1_epi32(x[14]);
    __m256i z3  = _mm256_set1_epi32(x[15]);

    __m256i orig0  = z0;
    __m256i orig1  = z1;
    __m256i orig2  = z2;
    __m256i orig3  = z3;
    __m256i orig4  = z4;
    __m256i orig5  = z5;
    __m256i orig6  = z6;
    __m256i orig7  = z7;
    __m256i orig8;
    __m256i orig9;
    __m256i orig10 = z10;
    __m256i orig11 = z11;
    __m256i orig12 = z12;
    __m256i orig13 = z13;
    __m256i orig14 = z14;
    __m256i orig15 = z15;

    uint32_t in8;
    uint32_t in9;
    int      i;

    while (bytes >= 512) {
        /* vector implementation for z8 and z9 */
        /* faster than the naive version for 8 blocks */
        const __m256i addv8   = _mm256_set_epi64x(3, 2, 1, 0);
        const __m256i addv9   = _mm256_set_epi64x(7, 6, 5, 4);
        const __m256i permute = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

        __m256i  t8, t9;
        uint64_t in89;

        in8  = x[8];
        in9  = x[13]; /* see arrays above for the address translation */
        in89 = ((uint64_t) in8) | (((uint64_t) in9) << 32);

        z8 = z9 = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(in89));

        t8 = _mm256_add_epi64(addv8, z8);
        t9 = _mm256_add_epi64(addv9, z9);

        z8 = _mm256_unpacklo_epi32(t8, t9);
        z9 = _mm256_unpackhi_epi32(t8, t9);

        t8 = _mm256_unpacklo_epi32(z8, z9);
        t9 = _mm256_unpackhi_epi32(z8, z9);

        /* required because unpack* are intra-lane */
        z8 = _mm256_permutevar8x32_epi32(t8, permute);
        z9 = _mm256_permutevar8x32_epi32(t9, permute);

        orig8 = z8;
        orig9 = z9;

        in89 += 8;

        x[8]  = in89 & 0xFFFFFFFF;
        x[13] = (in89 >> 32) & 0xFFFFFFFF;

        z5  = orig5;
        z10 = orig10;
        z15 = orig15;
        z14 = orig14;
        z3  = orig3;
        z6  = orig6;
        z11 = orig11;
        z1  = orig1;

        z7  = orig7;
        z13 = orig13;
        z2  = orig2;
        z9  = orig9;
        z0  = orig0;
        z12 = orig12;
        z4  = orig4;
        z8  = orig8;

        for (i = 0; i < ROUNDS; i += 2) {
            /* the inner loop is a direct translation (regexp search/replace)
             * from the amd64-xmm6 ASM */
            __m256i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12,
                r13, r14, r15;

            y4 = z12;
            y4 = _mm256_add_epi32(y4, z0);
            r4 = y4;
            y4 = _mm256_slli_epi32(y4, 7);
            z4 = _mm256_xor_si256(z4, y4);
            r4 = _mm256_srli_epi32(r4, 25);
            z4 = _mm256_xor_si256(z4, r4);

            y9 = z1;
            y9 = _mm256_add_epi32(y9, z5);
            r9 = y9;
            y9 = _mm256_slli_epi32(y9, 7);
            z9 = _mm256_xor_si256(z9, y9);
            r9 = _mm256_srli_epi32(r9, 25);
            z9 = _mm256_xor_si256(z9, r9);

            y8 = z0;
            y8 = _mm256_add_epi32(y8, z4);
            r8 = y8;
            y8 = _mm256_slli_epi32(y8, 9);
            z8 = _mm256_xor_si256(z8, y8);
            r8 = _mm256_srli_epi32(r8, 23);
            z8 = _mm256_xor_si256(z8, r8);

            y13 = z5;
            y13 = _mm256_add_epi32(y13, z9);
            r13 = y13;
            y13 = _mm256_slli_epi32(y13, 9);
            z13 = _mm256_xor_si256(z13, y13);
            r13 = _mm256_srli_epi32(r13, 23);
            z13 = _mm256_xor_si256(z13, r13);
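
            /* Each group of seven intrinsics in this loop computes one Salsa20
             * operation of the form  t ^= ROTL32(a + b, k)  for eight blocks
             * at once: AVX2 has no 32-bit vector rotate, so the rotate is
             * built from a left shift by k, a right shift by (32 - k) and two
             * XORs (the two shift counts in every group add up to 32). */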

            y12 = z4;
            y12 = _mm256_add_epi32(y12, z8);
            r12 = y12;
            y12 = _mm256_slli_epi32(y12, 13);
            z12 = _mm256_xor_si256(z12, y12);
            r12 = _mm256_srli_epi32(r12, 19);
            z12 = _mm256_xor_si256(z12, r12);

            y1 = z9;
            y1 = _mm256_add_epi32(y1, z13);
            r1 = y1;
            y1 = _mm256_slli_epi32(y1, 13);
            z1 = _mm256_xor_si256(z1, y1);
            r1 = _mm256_srli_epi32(r1, 19);
            z1 = _mm256_xor_si256(z1, r1);

            y0 = z8;
            y0 = _mm256_add_epi32(y0, z12);
            r0 = y0;
            y0 = _mm256_slli_epi32(y0, 18);
            z0 = _mm256_xor_si256(z0, y0);
            r0 = _mm256_srli_epi32(r0, 14);
            z0 = _mm256_xor_si256(z0, r0);

            y5 = z13;
            y5 = _mm256_add_epi32(y5, z1);
            r5 = y5;
            y5 = _mm256_slli_epi32(y5, 18);
            z5 = _mm256_xor_si256(z5, y5);
            r5 = _mm256_srli_epi32(r5, 14);
            z5 = _mm256_xor_si256(z5, r5);

            y14 = z6;
            y14 = _mm256_add_epi32(y14, z10);
            r14 = y14;
            y14 = _mm256_slli_epi32(y14, 7);
            z14 = _mm256_xor_si256(z14, y14);
            r14 = _mm256_srli_epi32(r14, 25);
            z14 = _mm256_xor_si256(z14, r14);

            y3 = z11;
            y3 = _mm256_add_epi32(y3, z15);
            r3 = y3;
            y3 = _mm256_slli_epi32(y3, 7);
            z3 = _mm256_xor_si256(z3, y3);
            r3 = _mm256_srli_epi32(r3, 25);
            z3 = _mm256_xor_si256(z3, r3);

            y2 = z10;
            y2 = _mm256_add_epi32(y2, z14);
            r2 = y2;
            y2 = _mm256_slli_epi32(y2, 9);
            z2 = _mm256_xor_si256(z2, y2);
            r2 = _mm256_srli_epi32(r2, 23);
            z2 = _mm256_xor_si256(z2, r2);

            y7 = z15;
            y7 = _mm256_add_epi32(y7, z3);
            r7 = y7;
            y7 = _mm256_slli_epi32(y7, 9);
            z7 = _mm256_xor_si256(z7, y7);
            r7 = _mm256_srli_epi32(r7, 23);
            z7 = _mm256_xor_si256(z7, r7);

            y6 = z14;
            y6 = _mm256_add_epi32(y6, z2);
            r6 = y6;
            y6 = _mm256_slli_epi32(y6, 13);
            z6 = _mm256_xor_si256(z6, y6);
            r6 = _mm256_srli_epi32(r6, 19);
            z6 = _mm256_xor_si256(z6, r6);

            y11 = z3;
            y11 = _mm256_add_epi32(y11, z7);
            r11 = y11;
            y11 = _mm256_slli_epi32(y11, 13);
            z11 = _mm256_xor_si256(z11, y11);
            r11 = _mm256_srli_epi32(r11, 19);
            z11 = _mm256_xor_si256(z11, r11);

            y10 = z2;
            y10 = _mm256_add_epi32(y10, z6);
            r10 = y10;
            y10 = _mm256_slli_epi32(y10, 18);
            z10 = _mm256_xor_si256(z10, y10);
            r10 = _mm256_srli_epi32(r10, 14);
            z10 = _mm256_xor_si256(z10, r10);

            y1 = z3;
            y1 = _mm256_add_epi32(y1, z0);
            r1 = y1;
            y1 = _mm256_slli_epi32(y1, 7);
            z1 = _mm256_xor_si256(z1, y1);
            r1 = _mm256_srli_epi32(r1, 25);
            z1 = _mm256_xor_si256(z1, r1);

            y15 = z7;
            y15 = _mm256_add_epi32(y15, z11);
            r15 = y15;
            y15 = _mm256_slli_epi32(y15, 18);
            z15 = _mm256_xor_si256(z15, y15);
            r15 = _mm256_srli_epi32(r15, 14);
            z15 = _mm256_xor_si256(z15, r15);

            y6 = z4;
            y6 = _mm256_add_epi32(y6, z5);
            r6 = y6;
            y6 = _mm256_slli_epi32(y6, 7);
            z6 = _mm256_xor_si256(z6, y6);
            r6 = _mm256_srli_epi32(r6, 25);
            z6 = _mm256_xor_si256(z6, r6);

            y2 = z0;
            y2 = _mm256_add_epi32(y2, z1);
            r2 = y2;
            y2 = _mm256_slli_epi32(y2, 9);
            z2 = _mm256_xor_si256(z2, y2);
            r2 = _mm256_srli_epi32(r2, 23);
            z2 = _mm256_xor_si256(z2, r2);

            y7 = z5;
            y7 = _mm256_add_epi32(y7, z6);
            r7 = y7;
            y7 = _mm256_slli_epi32(y7, 9);
            z7 = _mm256_xor_si256(z7, y7);
            r7 = _mm256_srli_epi32(r7, 23);
            z7 = _mm256_xor_si256(z7, r7);

            y3 = z1;
            y3 = _mm256_add_epi32(y3, z2);
            r3 = y3;
            y3 = _mm256_slli_epi32(y3, 13);
            z3 = _mm256_xor_si256(z3, y3);
            r3 = _mm256_srli_epi32(r3, 19);
            z3 = _mm256_xor_si256(z3, r3);

            y4 = z6;
            y4 = _mm256_add_epi32(y4, z7);
            r4 = y4;
            y4 = _mm256_slli_epi32(y4, 13);
            z4 = _mm256_xor_si256(z4, y4);
            r4 = _mm256_srli_epi32(r4, 19);
            z4 = _mm256_xor_si256(z4, r4);

            y0 = z2;
            y0 = _mm256_add_epi32(y0, z3);
            r0 = y0;
            y0 = _mm256_slli_epi32(y0, 18);
            z0 = _mm256_xor_si256(z0, y0);
            r0 = _mm256_srli_epi32(r0, 14);
            z0 = _mm256_xor_si256(z0, r0);

            y5 = z7;
            y5 = _mm256_add_epi32(y5, z4);
            r5 = y5;
            y5 = _mm256_slli_epi32(y5, 18);
            z5 = _mm256_xor_si256(z5, y5);
            r5 = _mm256_srli_epi32(r5, 14);
            z5 = _mm256_xor_si256(z5, r5);

            y11 = z9;
            y11 = _mm256_add_epi32(y11, z10);
            r11 = y11;
            y11 = _mm256_slli_epi32(y11, 7);
            z11 = _mm256_xor_si256(z11, y11);
            r11 = _mm256_srli_epi32(r11, 25);
            z11 = _mm256_xor_si256(z11, r11);

            y12 = z14;
            y12 = _mm256_add_epi32(y12, z15);
            r12 = y12;
            y12 = _mm256_slli_epi32(y12, 7);
            z12 = _mm256_xor_si256(z12, y12);
            r12 = _mm256_srli_epi32(r12, 25);
            z12 = _mm256_xor_si256(z12, r12);

            y8 = z10;
            y8 = _mm256_add_epi32(y8, z11);
            r8 = y8;
            y8 = _mm256_slli_epi32(y8, 9);
            z8 = _mm256_xor_si256(z8, y8);
            r8 = _mm256_srli_epi32(r8, 23);
            z8 = _mm256_xor_si256(z8, r8);

            y13 = z15;
            y13 = _mm256_add_epi32(y13, z12);
            r13 = y13;
            y13 = _mm256_slli_epi32(y13, 9);
            z13 = _mm256_xor_si256(z13, y13);
            r13 = _mm256_srli_epi32(r13, 23);
            z13 = _mm256_xor_si256(z13, r13);

            y9 = z11;
            y9 = _mm256_add_epi32(y9, z8);
            r9 = y9;
            y9 = _mm256_slli_epi32(y9, 13);
            z9 = _mm256_xor_si256(z9, y9);
            r9 = _mm256_srli_epi32(r9, 19);
            z9 = _mm256_xor_si256(z9, r9);

            y14 = z12;
            y14 = _mm256_add_epi32(y14, z13);
            r14 = y14;
            y14 = _mm256_slli_epi32(y14, 13);
            z14 = _mm256_xor_si256(z14, y14);
            r14 = _mm256_srli_epi32(r14, 19);
            z14 = _mm256_xor_si256(z14, r14);

            y10 = z8;
            y10 = _mm256_add_epi32(y10, z9);
            r10 = y10;
            y10 = _mm256_slli_epi32(y10, 18);
            z10 = _mm256_xor_si256(z10, y10);
            r10 = _mm256_srli_epi32(r10, 14);
            z10 = _mm256_xor_si256(z10, r10);

            y15 = z13;
            y15 = _mm256_add_epi32(y15, z14);
            r15 = y15;
            y15 = _mm256_slli_epi32(y15, 18);
            z15 = _mm256_xor_si256(z15, y15);
            r15 = _mm256_srli_epi32(r15, 14);
            z15 = _mm256_xor_si256(z15, r15);
        }
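
        /* At this point each of z0..z15 holds the post-rounds value of one
         * 32-bit state word for eight consecutive blocks, one block per lane.
         * The macros below add the saved orig* words back in, transpose the
         * registers into per-block order and XOR the result with the
         * message. */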

/* store data; this macro first transposes the data in-registers, and then
 * stores it in memory. much faster with icc.
 */
#define ONEQUAD_TRANSPOSE(A, B, C, D) \
    { \
        __m128i t0, t1, t2, t3; \
        z##A = _mm256_add_epi32(z##A, orig##A); \
        z##B = _mm256_add_epi32(z##B, orig##B); \
        z##C = _mm256_add_epi32(z##C, orig##C); \
        z##D = _mm256_add_epi32(z##D, orig##D); \
        y##A = _mm256_unpacklo_epi32(z##A, z##B); \
        y##B = _mm256_unpacklo_epi32(z##C, z##D); \
        y##C = _mm256_unpackhi_epi32(z##A, z##B); \
        y##D = _mm256_unpackhi_epi32(z##C, z##D); \
        z##A = _mm256_unpacklo_epi64(y##A, y##B); \
        z##B = _mm256_unpackhi_epi64(y##A, y##B); \
        z##C = _mm256_unpacklo_epi64(y##C, y##D); \
        z##D = _mm256_unpackhi_epi64(y##C, y##D); \
        t0 = _mm_xor_si128(_mm256_extracti128_si256(z##A, 0), \
                           _mm_loadu_si128((__m128i*) (m + 0))); \
        _mm_storeu_si128((__m128i*) (c + 0), t0); \
        t1 = _mm_xor_si128(_mm256_extracti128_si256(z##B, 0), \
                           _mm_loadu_si128((__m128i*) (m + 64))); \
        _mm_storeu_si128((__m128i*) (c + 64), t1); \
        t2 = _mm_xor_si128(_mm256_extracti128_si256(z##C, 0), \
                           _mm_loadu_si128((__m128i*) (m + 128))); \
        _mm_storeu_si128((__m128i*) (c + 128), t2); \
        t3 = _mm_xor_si128(_mm256_extracti128_si256(z##D, 0), \
                           _mm_loadu_si128((__m128i*) (m + 192))); \
        _mm_storeu_si128((__m128i*) (c + 192), t3); \
        t0 = _mm_xor_si128(_mm256_extracti128_si256(z##A, 1), \
                           _mm_loadu_si128((__m128i*) (m + 256))); \
        _mm_storeu_si128((__m128i*) (c + 256), t0); \
        t1 = _mm_xor_si128(_mm256_extracti128_si256(z##B, 1), \
                           _mm_loadu_si128((__m128i*) (m + 320))); \
        _mm_storeu_si128((__m128i*) (c + 320), t1); \
        t2 = _mm_xor_si128(_mm256_extracti128_si256(z##C, 1), \
                           _mm_loadu_si128((__m128i*) (m + 384))); \
        _mm_storeu_si128((__m128i*) (c + 384), t2); \
        t3 = _mm_xor_si128(_mm256_extracti128_si256(z##D, 1), \
                           _mm_loadu_si128((__m128i*) (m + 448))); \
        _mm_storeu_si128((__m128i*) (c + 448), t3); \
    }

#define ONEQUAD(A, B, C, D) ONEQUAD_TRANSPOSE(A, B, C, D)

#define ONEQUAD_UNPCK(A, B, C, D) \
    { \
        z##A = _mm256_add_epi32(z##A, orig##A); \
        z##B = _mm256_add_epi32(z##B, orig##B); \
        z##C = _mm256_add_epi32(z##C, orig##C); \
        z##D = _mm256_add_epi32(z##D, orig##D); \
        y##A = _mm256_unpacklo_epi32(z##A, z##B); \
        y##B = _mm256_unpacklo_epi32(z##C, z##D); \
        y##C = _mm256_unpackhi_epi32(z##A, z##B); \
        y##D = _mm256_unpackhi_epi32(z##C, z##D); \
        z##A = _mm256_unpacklo_epi64(y##A, y##B); \
        z##B = _mm256_unpackhi_epi64(y##A, y##B); \
        z##C = _mm256_unpacklo_epi64(y##C, y##D); \
        z##D = _mm256_unpackhi_epi64(y##C, y##D); \
    }

#define ONEOCTO(A, B, C, D, A2, B2, C2, D2) \
    { \
        ONEQUAD_UNPCK(A, B, C, D); \
        ONEQUAD_UNPCK(A2, B2, C2, D2); \
        y##A  = _mm256_permute2x128_si256(z##A, z##A2, 0x20); \
        y##A2 = _mm256_permute2x128_si256(z##A, z##A2, 0x31); \
        y##B  = _mm256_permute2x128_si256(z##B, z##B2, 0x20); \
        y##B2 = _mm256_permute2x128_si256(z##B, z##B2, 0x31); \
        y##C  = _mm256_permute2x128_si256(z##C, z##C2, 0x20); \
        y##C2 = _mm256_permute2x128_si256(z##C, z##C2, 0x31); \
        y##D  = _mm256_permute2x128_si256(z##D, z##D2, 0x20); \
        y##D2 = _mm256_permute2x128_si256(z##D, z##D2, 0x31); \
        y##A  = _mm256_xor_si256(y##A, _mm256_loadu_si256((__m256i*) (m + 0))); \
        y##B  = _mm256_xor_si256(y##B, _mm256_loadu_si256((__m256i*) (m + 64))); \
        y##C  = _mm256_xor_si256(y##C, _mm256_loadu_si256((__m256i*) (m + 128))); \
        y##D  = _mm256_xor_si256(y##D, _mm256_loadu_si256((__m256i*) (m + 192))); \
        y##A2 = _mm256_xor_si256(y##A2, _mm256_loadu_si256((__m256i*) (m + 256))); \
        y##B2 = _mm256_xor_si256(y##B2, _mm256_loadu_si256((__m256i*) (m + 320))); \
        y##C2 = _mm256_xor_si256(y##C2, _mm256_loadu_si256((__m256i*) (m + 384))); \
        y##D2 = _mm256_xor_si256(y##D2, _mm256_loadu_si256((__m256i*) (m + 448))); \
        _mm256_storeu_si256((__m256i*) (c + 0), y##A); \
        _mm256_storeu_si256((__m256i*) (c + 64), y##B); \
        _mm256_storeu_si256((__m256i*) (c + 128), y##C); \
        _mm256_storeu_si256((__m256i*) (c + 192), y##D); \
        _mm256_storeu_si256((__m256i*) (c + 256), y##A2); \
        _mm256_storeu_si256((__m256i*) (c + 320), y##B2); \
        _mm256_storeu_si256((__m256i*) (c + 384), y##C2); \
        _mm256_storeu_si256((__m256i*) (c + 448), y##D2); \
    }
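
        /* Each ONEOCTO call below produces 32 bytes of keystream for each of
         * the eight blocks: the first call covers bytes 0..31 of every
         * 64-byte block, the second call (with m and c advanced by 32)
         * covers bytes 32..63. */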
        ONEOCTO(0, 1, 2, 3, 4, 5, 6, 7);
        m += 32;
        c += 32;
        ONEOCTO(8, 9, 10, 11, 12, 13, 14, 15);
        m -= 32;
        c -= 32;

#undef ONEQUAD
#undef ONEQUAD_TRANSPOSE
#undef ONEQUAD_UNPCK
#undef ONEOCTO

        bytes -= 512;
        c += 512;
        m += 512;
    }
}
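
/* For reference when auditing the vector code above, a minimal scalar sketch
 * of the Salsa20 quarter-round that the seven-intrinsic groups implement line
 * by line (rotl32 and quarterround are hypothetical helpers, not part of this
 * file):
 *
 *     static uint32_t rotl32(uint32_t v, int c)
 *     {
 *         return (v << c) | (v >> (32 - c));
 *     }
 *
 *     static void quarterround(uint32_t y[4])
 *     {
 *         y[1] ^= rotl32(y[0] + y[3], 7);
 *         y[2] ^= rotl32(y[1] + y[0], 9);
 *         y[3] ^= rotl32(y[2] + y[1], 13);
 *         y[0] ^= rotl32(y[3] + y[2], 18);
 *     }
 *
 * One Salsa20 double round applies this to the four columns and then the four
 * rows of the 4x4 state; the inner loop above runs ROUNDS / 2 such double
 * rounds on eight blocks in parallel. */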