if (bytes >= 256) {
    __m128i y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14,
        y15;
    __m128i z0, z1, z2, z3, z4, z5, z6, z7, z8, z9, z10, z11, z12, z13, z14,
        z15;
    __m128i orig0, orig1, orig2, orig3, orig4, orig5, orig6, orig7, orig8,
        orig9, orig10, orig11, orig12, orig13, orig14, orig15;

    uint32_t in8;
    uint32_t in9;
    int      i;

    /* element broadcast immediates for _mm_shuffle_epi32 are, in order:
       0x00, 0x55, 0xaa, 0xff */
    z0  = _mm_loadu_si128((__m128i *) (x + 0));
    z5  = _mm_shuffle_epi32(z0, 0x55);
    z10 = _mm_shuffle_epi32(z0, 0xaa);
    z15 = _mm_shuffle_epi32(z0, 0xff);
    z0  = _mm_shuffle_epi32(z0, 0x00);
    z1  = _mm_loadu_si128((__m128i *) (x + 4));
    z6  = _mm_shuffle_epi32(z1, 0xaa);
    z11 = _mm_shuffle_epi32(z1, 0xff);
    z12 = _mm_shuffle_epi32(z1, 0x00);
    z1  = _mm_shuffle_epi32(z1, 0x55);
    z2  = _mm_loadu_si128((__m128i *) (x + 8));
    z7  = _mm_shuffle_epi32(z2, 0xff);
    z13 = _mm_shuffle_epi32(z2, 0x55);
    z2  = _mm_shuffle_epi32(z2, 0xaa);
    /* no z8 -> first half of the block counter, filled in the loop below */
    z3  = _mm_loadu_si128((__m128i *) (x + 12));
    z4  = _mm_shuffle_epi32(z3, 0x00);
    z14 = _mm_shuffle_epi32(z3, 0xaa);
    z3  = _mm_shuffle_epi32(z3, 0xff);
    /* no z9 -> second half of the block counter, filled in the loop below */
    orig0  = z0;
    orig1  = z1;
    orig2  = z2;
    orig3  = z3;
    orig4  = z4;
    orig5  = z5;
    orig6  = z6;
    orig7  = z7;
    orig10 = z10;
    orig11 = z11;
    orig12 = z12;
    orig13 = z13;
    orig14 = z14;
    orig15 = z15;

    while (bytes >= 256) {
        /* vector implementation for z8 and z9 */
        /* not sure if it helps for only 4 blocks */
        const __m128i addv8 = _mm_set_epi64x(1, 0);
        const __m128i addv9 = _mm_set_epi64x(3, 2);
        __m128i       t8, t9;
        uint64_t      in89;

        /* x[8] and x[13] are the two counter slots skipped by the
           broadcasts above */
        in8  = x[8];
        in9  = x[13];
        in89 = ((uint64_t) in8) | (((uint64_t) in9) << 32);
        t8   = _mm_set1_epi64x(in89);
        t9   = _mm_set1_epi64x(in89);

        z8 = _mm_add_epi64(addv8, t8);
        z9 = _mm_add_epi64(addv9, t9);

        t8 = _mm_unpacklo_epi32(z8, z9);
        t9 = _mm_unpackhi_epi32(z8, z9);

        z8 = _mm_unpacklo_epi32(t8, t9);
        z9 = _mm_unpackhi_epi32(t8, t9);

        orig8 = z8;
        orig9 = z9;

        in89 += 4;

        x[8]  = in89 & 0xFFFFFFFF;
        x[13] = (in89 >> 32) & 0xFFFFFFFF;

        z5  = orig5;
        z10 = orig10;
        z15 = orig15;
        z14 = orig14;
        z3  = orig3;
        z6  = orig6;
        z11 = orig11;
        z1  = orig1;

        z7  = orig7;
        z13 = orig13;
        z2  = orig2;
        z9  = orig9;
        z0  = orig0;
        z12 = orig12;
        z4  = orig4;
        z8  = orig8;
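
        /* SSE2 has no vector rotate instruction, so each 7-instruction
         * group in the round loop below computes one Salsa20 step
         * z ^= ROTL32(a + b, k) with two shifts folded into two XORs; the
         * rotation counts 7, 9, 13 and 18 match the scalar reference.
         * Illustrative scalar equivalent of the first group (an explanatory
         * sketch, not part of the original code):
         *
         *   uint32_t t = z12 + z0;
         *   z4 ^= (t << 7) | (t >> 25);
         */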
        for (i = 0; i < ROUNDS; i += 2) {
            /* the inner loop is a direct translation (regexp search/replace)
             * from the amd64-xmm6 ASM */
            __m128i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12,
                r13, r14, r15;

            y4 = z12;
            y4 = _mm_add_epi32(y4, z0);
            r4 = y4;
            y4 = _mm_slli_epi32(y4, 7);
            z4 = _mm_xor_si128(z4, y4);
            r4 = _mm_srli_epi32(r4, 25);
            z4 = _mm_xor_si128(z4, r4);

            y9 = z1;
            y9 = _mm_add_epi32(y9, z5);
            r9 = y9;
            y9 = _mm_slli_epi32(y9, 7);
            z9 = _mm_xor_si128(z9, y9);
            r9 = _mm_srli_epi32(r9, 25);
            z9 = _mm_xor_si128(z9, r9);

            y8 = z0;
            y8 = _mm_add_epi32(y8, z4);
            r8 = y8;
            y8 = _mm_slli_epi32(y8, 9);
            z8 = _mm_xor_si128(z8, y8);
            r8 = _mm_srli_epi32(r8, 23);
            z8 = _mm_xor_si128(z8, r8);

            y13 = z5;
            y13 = _mm_add_epi32(y13, z9);
            r13 = y13;
            y13 = _mm_slli_epi32(y13, 9);
            z13 = _mm_xor_si128(z13, y13);
            r13 = _mm_srli_epi32(r13, 23);
            z13 = _mm_xor_si128(z13, r13);

            y12 = z4;
            y12 = _mm_add_epi32(y12, z8);
            r12 = y12;
            y12 = _mm_slli_epi32(y12, 13);
            z12 = _mm_xor_si128(z12, y12);
            r12 = _mm_srli_epi32(r12, 19);
            z12 = _mm_xor_si128(z12, r12);

            y1 = z9;
            y1 = _mm_add_epi32(y1, z13);
            r1 = y1;
            y1 = _mm_slli_epi32(y1, 13);
            z1 = _mm_xor_si128(z1, y1);
            r1 = _mm_srli_epi32(r1, 19);
            z1 = _mm_xor_si128(z1, r1);

            y0 = z8;
            y0 = _mm_add_epi32(y0, z12);
            r0 = y0;
            y0 = _mm_slli_epi32(y0, 18);
            z0 = _mm_xor_si128(z0, y0);
            r0 = _mm_srli_epi32(r0, 14);
            z0 = _mm_xor_si128(z0, r0);

            y5 = z13;
            y5 = _mm_add_epi32(y5, z1);
            r5 = y5;
            y5 = _mm_slli_epi32(y5, 18);
            z5 = _mm_xor_si128(z5, y5);
            r5 = _mm_srli_epi32(r5, 14);
            z5 = _mm_xor_si128(z5, r5);

            y14 = z6;
            y14 = _mm_add_epi32(y14, z10);
            r14 = y14;
            y14 = _mm_slli_epi32(y14, 7);
            z14 = _mm_xor_si128(z14, y14);
            r14 = _mm_srli_epi32(r14, 25);
            z14 = _mm_xor_si128(z14, r14);

            y3 = z11;
            y3 = _mm_add_epi32(y3, z15);
            r3 = y3;
            y3 = _mm_slli_epi32(y3, 7);
            z3 = _mm_xor_si128(z3, y3);
            r3 = _mm_srli_epi32(r3, 25);
            z3 = _mm_xor_si128(z3, r3);

            y2 = z10;
            y2 = _mm_add_epi32(y2, z14);
            r2 = y2;
            y2 = _mm_slli_epi32(y2, 9);
            z2 = _mm_xor_si128(z2, y2);
            r2 = _mm_srli_epi32(r2, 23);
            z2 = _mm_xor_si128(z2, r2);

            y7 = z15;
            y7 = _mm_add_epi32(y7, z3);
            r7 = y7;
            y7 = _mm_slli_epi32(y7, 9);
            z7 = _mm_xor_si128(z7, y7);
            r7 = _mm_srli_epi32(r7, 23);
            z7 = _mm_xor_si128(z7, r7);

            y6 = z14;
            y6 = _mm_add_epi32(y6, z2);
            r6 = y6;
            y6 = _mm_slli_epi32(y6, 13);
            z6 = _mm_xor_si128(z6, y6);
            r6 = _mm_srli_epi32(r6, 19);
            z6 = _mm_xor_si128(z6, r6);

            y11 = z3;
            y11 = _mm_add_epi32(y11, z7);
            r11 = y11;
            y11 = _mm_slli_epi32(y11, 13);
            z11 = _mm_xor_si128(z11, y11);
            r11 = _mm_srli_epi32(r11, 19);
            z11 = _mm_xor_si128(z11, r11);

            y10 = z2;
            y10 = _mm_add_epi32(y10, z6);
            r10 = y10;
            y10 = _mm_slli_epi32(y10, 18);
            z10 = _mm_xor_si128(z10, y10);
            r10 = _mm_srli_epi32(r10, 14);
            z10 = _mm_xor_si128(z10, r10);

            y1 = z3;
            y1 = _mm_add_epi32(y1, z0);
            r1 = y1;
            y1 = _mm_slli_epi32(y1, 7);
            z1 = _mm_xor_si128(z1, y1);
            r1 = _mm_srli_epi32(r1, 25);
            z1 = _mm_xor_si128(z1, r1);

            y15 = z7;
            y15 = _mm_add_epi32(y15, z11);
            r15 = y15;
            y15 = _mm_slli_epi32(y15, 18);
            z15 = _mm_xor_si128(z15, y15);
            r15 = _mm_srli_epi32(r15, 14);
            z15 = _mm_xor_si128(z15, r15);

            y6 = z4;
            y6 = _mm_add_epi32(y6, z5);
            r6 = y6;
            y6 = _mm_slli_epi32(y6, 7);
            z6 = _mm_xor_si128(z6, y6);
            r6 = _mm_srli_epi32(r6, 25);
            z6 = _mm_xor_si128(z6, r6);

            y2 = z0;
            y2 = _mm_add_epi32(y2, z1);
            r2 = y2;
            y2 = _mm_slli_epi32(y2, 9);
            z2 = _mm_xor_si128(z2, y2);
            r2 = _mm_srli_epi32(r2, 23);
            z2 = _mm_xor_si128(z2, r2);

            y7 = z5;
            y7 = _mm_add_epi32(y7, z6);
            r7 = y7;
            y7 = _mm_slli_epi32(y7, 9);
            z7 = _mm_xor_si128(z7, y7);
            r7 = _mm_srli_epi32(r7, 23);
            z7 = _mm_xor_si128(z7, r7);

            y3 = z1;
            y3 = _mm_add_epi32(y3, z2);
            r3 = y3;
            y3 = _mm_slli_epi32(y3, 13);
            z3 = _mm_xor_si128(z3, y3);
            r3 = _mm_srli_epi32(r3, 19);
            z3 = _mm_xor_si128(z3, r3);

            y4 = z6;
            y4 = _mm_add_epi32(y4, z7);
            r4 = y4;
            y4 = _mm_slli_epi32(y4, 13);
            z4 = _mm_xor_si128(z4, y4);
            r4 = _mm_srli_epi32(r4, 19);
            z4 = _mm_xor_si128(z4, r4);

            y0 = z2;
            y0 = _mm_add_epi32(y0, z3);
            r0 = y0;
            y0 = _mm_slli_epi32(y0, 18);
            z0 = _mm_xor_si128(z0, y0);
            r0 = _mm_srli_epi32(r0, 14);
            z0 = _mm_xor_si128(z0, r0);

            y5 = z7;
            y5 = _mm_add_epi32(y5, z4);
            r5 = y5;
            y5 = _mm_slli_epi32(y5, 18);
            z5 = _mm_xor_si128(z5, y5);
            r5 = _mm_srli_epi32(r5, 14);
            z5 = _mm_xor_si128(z5, r5);

            y11 = z9;
            y11 = _mm_add_epi32(y11, z10);
            r11 = y11;
            y11 = _mm_slli_epi32(y11, 7);
            z11 = _mm_xor_si128(z11, y11);
            r11 = _mm_srli_epi32(r11, 25);
            z11 = _mm_xor_si128(z11, r11);

            y12 = z14;
            y12 = _mm_add_epi32(y12, z15);
            r12 = y12;
            y12 = _mm_slli_epi32(y12, 7);
            z12 = _mm_xor_si128(z12, y12);
            r12 = _mm_srli_epi32(r12, 25);
            z12 = _mm_xor_si128(z12, r12);

            y8 = z10;
            y8 = _mm_add_epi32(y8, z11);
            r8 = y8;
            y8 = _mm_slli_epi32(y8, 9);
            z8 = _mm_xor_si128(z8, y8);
            r8 = _mm_srli_epi32(r8, 23);
            z8 = _mm_xor_si128(z8, r8);

            y13 = z15;
            y13 = _mm_add_epi32(y13, z12);
            r13 = y13;
            y13 = _mm_slli_epi32(y13, 9);
            z13 = _mm_xor_si128(z13, y13);
            r13 = _mm_srli_epi32(r13, 23);
            z13 = _mm_xor_si128(z13, r13);

            y9 = z11;
            y9 = _mm_add_epi32(y9, z8);
            r9 = y9;
            y9 = _mm_slli_epi32(y9, 13);
            z9 = _mm_xor_si128(z9, y9);
            r9 = _mm_srli_epi32(r9, 19);
            z9 = _mm_xor_si128(z9, r9);

            y14 = z12;
            y14 = _mm_add_epi32(y14, z13);
            r14 = y14;
            y14 = _mm_slli_epi32(y14, 13);
            z14 = _mm_xor_si128(z14, y14);
            r14 = _mm_srli_epi32(r14, 19);
            z14 = _mm_xor_si128(z14, r14);

            y10 = z8;
            y10 = _mm_add_epi32(y10, z9);
            r10 = y10;
            y10 = _mm_slli_epi32(y10, 18);
            z10 = _mm_xor_si128(z10, y10);
            r10 = _mm_srli_epi32(r10, 14);
            z10 = _mm_xor_si128(z10, r10);

            y15 = z13;
            y15 = _mm_add_epi32(y15, z14);
            r15 = y15;
            y15 = _mm_slli_epi32(y15, 18);
            z15 = _mm_xor_si128(z15, y15);
            r15 = _mm_srli_epi32(r15, 14);
            z15 = _mm_xor_si128(z15, r15);
        }
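
        /* After ROUNDS/2 doublerounds, each zN register holds state word N
         * for four consecutive blocks, one block per 32-bit lane. The store
         * macros below add back the input state (the Salsa20 feedforward),
         * XOR the keystream with the message and write the ciphertext; word
         * N of block k lives at byte offset 4 * N + 64 * k, which is why
         * each macro touches m and c at offsets +0, +64, +128 and +192. */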
/* store data; this macro replicates the original amd64-xmm6 code */
#define ONEQUAD_SHUFFLE(A, B, C, D)         \
    z##A = _mm_add_epi32(z##A, orig##A);    \
    z##B = _mm_add_epi32(z##B, orig##B);    \
    z##C = _mm_add_epi32(z##C, orig##C);    \
    z##D = _mm_add_epi32(z##D, orig##D);    \
    in##A = _mm_cvtsi128_si32(z##A);        \
    in##B = _mm_cvtsi128_si32(z##B);        \
    in##C = _mm_cvtsi128_si32(z##C);        \
    in##D = _mm_cvtsi128_si32(z##D);        \
    z##A = _mm_shuffle_epi32(z##A, 0x39);   \
    z##B = _mm_shuffle_epi32(z##B, 0x39);   \
    z##C = _mm_shuffle_epi32(z##C, 0x39);   \
    z##D = _mm_shuffle_epi32(z##D, 0x39);   \
                                            \
    in##A ^= *(uint32_t *) (m + 0);         \
    in##B ^= *(uint32_t *) (m + 4);         \
    in##C ^= *(uint32_t *) (m + 8);         \
    in##D ^= *(uint32_t *) (m + 12);        \
                                            \
    *(uint32_t *) (c + 0) = in##A;          \
    *(uint32_t *) (c + 4) = in##B;          \
    *(uint32_t *) (c + 8) = in##C;          \
    *(uint32_t *) (c + 12) = in##D;         \
                                            \
    in##A = _mm_cvtsi128_si32(z##A);        \
    in##B = _mm_cvtsi128_si32(z##B);        \
    in##C = _mm_cvtsi128_si32(z##C);        \
    in##D = _mm_cvtsi128_si32(z##D);        \
    z##A = _mm_shuffle_epi32(z##A, 0x39);   \
    z##B = _mm_shuffle_epi32(z##B, 0x39);   \
    z##C = _mm_shuffle_epi32(z##C, 0x39);   \
    z##D = _mm_shuffle_epi32(z##D, 0x39);   \
                                            \
    in##A ^= *(uint32_t *) (m + 64);        \
    in##B ^= *(uint32_t *) (m + 68);        \
    in##C ^= *(uint32_t *) (m + 72);        \
    in##D ^= *(uint32_t *) (m + 76);        \
    *(uint32_t *) (c + 64) = in##A;         \
    *(uint32_t *) (c + 68) = in##B;         \
    *(uint32_t *) (c + 72) = in##C;         \
    *(uint32_t *) (c + 76) = in##D;         \
                                            \
    in##A = _mm_cvtsi128_si32(z##A);        \
    in##B = _mm_cvtsi128_si32(z##B);        \
    in##C = _mm_cvtsi128_si32(z##C);        \
    in##D = _mm_cvtsi128_si32(z##D);        \
    z##A = _mm_shuffle_epi32(z##A, 0x39);   \
    z##B = _mm_shuffle_epi32(z##B, 0x39);   \
    z##C = _mm_shuffle_epi32(z##C, 0x39);   \
    z##D = _mm_shuffle_epi32(z##D, 0x39);   \
                                            \
    in##A ^= *(uint32_t *) (m + 128);       \
    in##B ^= *(uint32_t *) (m + 132);       \
    in##C ^= *(uint32_t *) (m + 136);       \
    in##D ^= *(uint32_t *) (m + 140);       \
    *(uint32_t *) (c + 128) = in##A;        \
    *(uint32_t *) (c + 132) = in##B;        \
    *(uint32_t *) (c + 136) = in##C;        \
    *(uint32_t *) (c + 140) = in##D;        \
                                            \
    in##A = _mm_cvtsi128_si32(z##A);        \
    in##B = _mm_cvtsi128_si32(z##B);        \
    in##C = _mm_cvtsi128_si32(z##C);        \
    in##D = _mm_cvtsi128_si32(z##D);        \
                                            \
    in##A ^= *(uint32_t *) (m + 192);       \
    in##B ^= *(uint32_t *) (m + 196);       \
    in##C ^= *(uint32_t *) (m + 200);       \
    in##D ^= *(uint32_t *) (m + 204);       \
    *(uint32_t *) (c + 192) = in##A;        \
    *(uint32_t *) (c + 196) = in##B;        \
    *(uint32_t *) (c + 200) = in##C;        \
    *(uint32_t *) (c + 204) = in##D
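
/* The ONEQUAD_SHUFFLE macro above and the ONEQUAD_EXTRACT macro below both
 * expand to scalar temporaries in0..in15 via in##A, but only in8 and in9
 * are declared above, so selecting either of them as ONEQUAD also requires
 * declaring the remaining in* variables. ONEQUAD_EXTRACT additionally uses
 * _mm_extract_epi32, an SSE4.1 intrinsic, whereas everything else in this
 * block needs only SSE2. */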
/* store data; this macro replaces shuffle+mov with a direct extract; not
 * much difference */
#define ONEQUAD_EXTRACT(A, B, C, D)         \
    z##A = _mm_add_epi32(z##A, orig##A);    \
    z##B = _mm_add_epi32(z##B, orig##B);    \
    z##C = _mm_add_epi32(z##C, orig##C);    \
    z##D = _mm_add_epi32(z##D, orig##D);    \
    in##A = _mm_cvtsi128_si32(z##A);        \
    in##B = _mm_cvtsi128_si32(z##B);        \
    in##C = _mm_cvtsi128_si32(z##C);        \
    in##D = _mm_cvtsi128_si32(z##D);        \
    in##A ^= *(uint32_t *) (m + 0);         \
    in##B ^= *(uint32_t *) (m + 4);         \
    in##C ^= *(uint32_t *) (m + 8);         \
    in##D ^= *(uint32_t *) (m + 12);        \
    *(uint32_t *) (c + 0) = in##A;          \
    *(uint32_t *) (c + 4) = in##B;          \
    *(uint32_t *) (c + 8) = in##C;          \
    *(uint32_t *) (c + 12) = in##D;         \
                                            \
    in##A = _mm_extract_epi32(z##A, 1);     \
    in##B = _mm_extract_epi32(z##B, 1);     \
    in##C = _mm_extract_epi32(z##C, 1);     \
    in##D = _mm_extract_epi32(z##D, 1);     \
                                            \
    in##A ^= *(uint32_t *) (m + 64);        \
    in##B ^= *(uint32_t *) (m + 68);        \
    in##C ^= *(uint32_t *) (m + 72);        \
    in##D ^= *(uint32_t *) (m + 76);        \
    *(uint32_t *) (c + 64) = in##A;         \
    *(uint32_t *) (c + 68) = in##B;         \
    *(uint32_t *) (c + 72) = in##C;         \
    *(uint32_t *) (c + 76) = in##D;         \
                                            \
    in##A = _mm_extract_epi32(z##A, 2);     \
    in##B = _mm_extract_epi32(z##B, 2);     \
    in##C = _mm_extract_epi32(z##C, 2);     \
    in##D = _mm_extract_epi32(z##D, 2);     \
                                            \
    in##A ^= *(uint32_t *) (m + 128);       \
    in##B ^= *(uint32_t *) (m + 132);       \
    in##C ^= *(uint32_t *) (m + 136);       \
    in##D ^= *(uint32_t *) (m + 140);       \
    *(uint32_t *) (c + 128) = in##A;        \
    *(uint32_t *) (c + 132) = in##B;        \
    *(uint32_t *) (c + 136) = in##C;        \
    *(uint32_t *) (c + 140) = in##D;        \
                                            \
    in##A = _mm_extract_epi32(z##A, 3);     \
    in##B = _mm_extract_epi32(z##B, 3);     \
    in##C = _mm_extract_epi32(z##C, 3);     \
    in##D = _mm_extract_epi32(z##D, 3);     \
                                            \
    in##A ^= *(uint32_t *) (m + 192);       \
    in##B ^= *(uint32_t *) (m + 196);       \
    in##C ^= *(uint32_t *) (m + 200);       \
    in##D ^= *(uint32_t *) (m + 204);       \
    *(uint32_t *) (c + 192) = in##A;        \
    *(uint32_t *) (c + 196) = in##B;        \
    *(uint32_t *) (c + 200) = in##C;        \
    *(uint32_t *) (c + 204) = in##D
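
/* The transpose variant below turns four registers that each hold one state
 * word across four blocks into four registers that each hold 16 contiguous
 * bytes of one block, so the XOR+store can use full 128-bit loads and
 * stores instead of four 32-bit accesses. The unpack pairs implement a
 * standard 4x4 32-bit matrix transpose; with z##A = (a0, a1, a2, a3) and
 * so on (an explanatory sketch, not part of the original code):
 *
 *   _mm_unpacklo_epi32(z##A, z##B) -> (a0, b0, a1, b1)
 *   _mm_unpacklo_epi64(y##A, y##B) -> (a0, b0, c0, d0)
 */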
/* store data; this macro first transposes the data in registers, and then
 * stores it to memory; much faster with icc */
#define ONEQUAD_TRANSPOSE(A, B, C, D)                                   \
    z##A = _mm_add_epi32(z##A, orig##A);                                \
    z##B = _mm_add_epi32(z##B, orig##B);                                \
    z##C = _mm_add_epi32(z##C, orig##C);                                \
    z##D = _mm_add_epi32(z##D, orig##D);                                \
    y##A = _mm_unpacklo_epi32(z##A, z##B);                              \
    y##B = _mm_unpacklo_epi32(z##C, z##D);                              \
    y##C = _mm_unpackhi_epi32(z##A, z##B);                              \
    y##D = _mm_unpackhi_epi32(z##C, z##D);                              \
    z##A = _mm_unpacklo_epi64(y##A, y##B);                              \
    z##B = _mm_unpackhi_epi64(y##A, y##B);                              \
    z##C = _mm_unpacklo_epi64(y##C, y##D);                              \
    z##D = _mm_unpackhi_epi64(y##C, y##D);                              \
    y##A = _mm_xor_si128(z##A, _mm_loadu_si128((__m128i *) (m + 0)));   \
    _mm_storeu_si128((__m128i *) (c + 0), y##A);                        \
    y##B = _mm_xor_si128(z##B, _mm_loadu_si128((__m128i *) (m + 64)));  \
    _mm_storeu_si128((__m128i *) (c + 64), y##B);                       \
    y##C = _mm_xor_si128(z##C, _mm_loadu_si128((__m128i *) (m + 128))); \
    _mm_storeu_si128((__m128i *) (c + 128), y##C);                      \
    y##D = _mm_xor_si128(z##D, _mm_loadu_si128((__m128i *) (m + 192))); \
    _mm_storeu_si128((__m128i *) (c + 192), y##D)

#define ONEQUAD(A, B, C, D) ONEQUAD_TRANSPOSE(A, B, C, D)

        ONEQUAD(0, 1, 2, 3);
        m += 16;
        c += 16;
        ONEQUAD(4, 5, 6, 7);
        m += 16;
        c += 16;
        ONEQUAD(8, 9, 10, 11);
        m += 16;
        c += 16;
        ONEQUAD(12, 13, 14, 15);
        m -= 48;
        c -= 48;

#undef ONEQUAD
#undef ONEQUAD_TRANSPOSE
#undef ONEQUAD_EXTRACT
#undef ONEQUAD_SHUFFLE

        bytes -= 256;
        c += 256;
        m += 256;
    }
}
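
/* A tail of fewer than 256 bytes is presumably handled by the narrower
 * (single-block) code paths that the enclosing stream function falls
 * through to after this 4-block loop. */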