10ac341f1SConrad Meyer if (bytes >= 512) {
20ac341f1SConrad Meyer     __m256i y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14,
30ac341f1SConrad Meyer         y15;
40ac341f1SConrad Meyer 
50ac341f1SConrad Meyer     /* the naive way seems as fast (if not a bit faster) than the vector way */
60ac341f1SConrad Meyer     __m256i z0  = _mm256_set1_epi32(x[0]);
70ac341f1SConrad Meyer     __m256i z5  = _mm256_set1_epi32(x[1]);
80ac341f1SConrad Meyer     __m256i z10 = _mm256_set1_epi32(x[2]);
90ac341f1SConrad Meyer     __m256i z15 = _mm256_set1_epi32(x[3]);
100ac341f1SConrad Meyer     __m256i z12 = _mm256_set1_epi32(x[4]);
110ac341f1SConrad Meyer     __m256i z1  = _mm256_set1_epi32(x[5]);
120ac341f1SConrad Meyer     __m256i z6  = _mm256_set1_epi32(x[6]);
130ac341f1SConrad Meyer     __m256i z11 = _mm256_set1_epi32(x[7]);
140ac341f1SConrad Meyer     __m256i z8; /* useless */
150ac341f1SConrad Meyer     __m256i z13 = _mm256_set1_epi32(x[9]);
160ac341f1SConrad Meyer     __m256i z2  = _mm256_set1_epi32(x[10]);
170ac341f1SConrad Meyer     __m256i z7  = _mm256_set1_epi32(x[11]);
180ac341f1SConrad Meyer     __m256i z4  = _mm256_set1_epi32(x[12]);
190ac341f1SConrad Meyer     __m256i z9; /* useless */
200ac341f1SConrad Meyer     __m256i z14 = _mm256_set1_epi32(x[14]);
210ac341f1SConrad Meyer     __m256i z3  = _mm256_set1_epi32(x[15]);
220ac341f1SConrad Meyer 
230ac341f1SConrad Meyer     __m256i orig0 = z0;
240ac341f1SConrad Meyer     __m256i orig1 = z1;
250ac341f1SConrad Meyer     __m256i orig2 = z2;
260ac341f1SConrad Meyer     __m256i orig3 = z3;
270ac341f1SConrad Meyer     __m256i orig4 = z4;
280ac341f1SConrad Meyer     __m256i orig5 = z5;
290ac341f1SConrad Meyer     __m256i orig6 = z6;
300ac341f1SConrad Meyer     __m256i orig7 = z7;
310ac341f1SConrad Meyer     __m256i orig8;
320ac341f1SConrad Meyer     __m256i orig9;
330ac341f1SConrad Meyer     __m256i orig10 = z10;
340ac341f1SConrad Meyer     __m256i orig11 = z11;
350ac341f1SConrad Meyer     __m256i orig12 = z12;
360ac341f1SConrad Meyer     __m256i orig13 = z13;
370ac341f1SConrad Meyer     __m256i orig14 = z14;
380ac341f1SConrad Meyer     __m256i orig15 = z15;
390ac341f1SConrad Meyer 
400ac341f1SConrad Meyer     uint32_t in8;
410ac341f1SConrad Meyer     uint32_t in9;
420ac341f1SConrad Meyer     int      i;
430ac341f1SConrad Meyer 
440ac341f1SConrad Meyer     while (bytes >= 512) {
450ac341f1SConrad Meyer         /* vector implementation for z8 and z9 */
460ac341f1SConrad Meyer         /* faster than the naive version for 8 blocks */
470ac341f1SConrad Meyer         const __m256i addv8   = _mm256_set_epi64x(3, 2, 1, 0);
480ac341f1SConrad Meyer         const __m256i addv9   = _mm256_set_epi64x(7, 6, 5, 4);
490ac341f1SConrad Meyer         const __m256i permute = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
500ac341f1SConrad Meyer 
510ac341f1SConrad Meyer         __m256i  t8, t9;
520ac341f1SConrad Meyer         uint64_t in89;
530ac341f1SConrad Meyer 
540ac341f1SConrad Meyer         in8  = x[8];
550ac341f1SConrad Meyer         in9  = x[13]; /* see arrays above for the address translation */
560ac341f1SConrad Meyer         in89 = ((uint64_t) in8) | (((uint64_t) in9) << 32);
570ac341f1SConrad Meyer 
580ac341f1SConrad Meyer         z8 = z9 = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(in89));
590ac341f1SConrad Meyer 
600ac341f1SConrad Meyer         t8 = _mm256_add_epi64(addv8, z8);
610ac341f1SConrad Meyer         t9 = _mm256_add_epi64(addv9, z9);
620ac341f1SConrad Meyer 
630ac341f1SConrad Meyer         z8 = _mm256_unpacklo_epi32(t8, t9);
640ac341f1SConrad Meyer         z9 = _mm256_unpackhi_epi32(t8, t9);
650ac341f1SConrad Meyer 
660ac341f1SConrad Meyer         t8 = _mm256_unpacklo_epi32(z8, z9);
670ac341f1SConrad Meyer         t9 = _mm256_unpackhi_epi32(z8, z9);
680ac341f1SConrad Meyer 
690ac341f1SConrad Meyer         /* required because unpack* are intra-lane */
700ac341f1SConrad Meyer         z8 = _mm256_permutevar8x32_epi32(t8, permute);
710ac341f1SConrad Meyer         z9 = _mm256_permutevar8x32_epi32(t9, permute);
720ac341f1SConrad Meyer 
730ac341f1SConrad Meyer         orig8 = z8;
740ac341f1SConrad Meyer         orig9 = z9;
750ac341f1SConrad Meyer 
760ac341f1SConrad Meyer         in89 += 8;
770ac341f1SConrad Meyer 
780ac341f1SConrad Meyer         x[8]  = in89 & 0xFFFFFFFF;
790ac341f1SConrad Meyer         x[13] = (in89 >> 32) & 0xFFFFFFFF;
800ac341f1SConrad Meyer 
810ac341f1SConrad Meyer         z5  = orig5;
820ac341f1SConrad Meyer         z10 = orig10;
830ac341f1SConrad Meyer         z15 = orig15;
840ac341f1SConrad Meyer         z14 = orig14;
850ac341f1SConrad Meyer         z3  = orig3;
860ac341f1SConrad Meyer         z6  = orig6;
870ac341f1SConrad Meyer         z11 = orig11;
880ac341f1SConrad Meyer         z1  = orig1;
890ac341f1SConrad Meyer 
900ac341f1SConrad Meyer         z7  = orig7;
910ac341f1SConrad Meyer         z13 = orig13;
920ac341f1SConrad Meyer         z2  = orig2;
930ac341f1SConrad Meyer         z9  = orig9;
940ac341f1SConrad Meyer         z0  = orig0;
950ac341f1SConrad Meyer         z12 = orig12;
960ac341f1SConrad Meyer         z4  = orig4;
970ac341f1SConrad Meyer         z8  = orig8;
980ac341f1SConrad Meyer 
990ac341f1SConrad Meyer         for (i = 0; i < ROUNDS; i += 2) {
1000ac341f1SConrad Meyer             /* the inner loop is a direct translation (regexp search/replace)
1010ac341f1SConrad Meyer              * from the amd64-xmm6 ASM */
1020ac341f1SConrad Meyer             __m256i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13,
1030ac341f1SConrad Meyer                 r14, r15;
1040ac341f1SConrad Meyer 
1050ac341f1SConrad Meyer             y4 = z12;
1060ac341f1SConrad Meyer             y4 = _mm256_add_epi32(y4, z0);
1070ac341f1SConrad Meyer             r4 = y4;
1080ac341f1SConrad Meyer             y4 = _mm256_slli_epi32(y4, 7);
1090ac341f1SConrad Meyer             z4 = _mm256_xor_si256(z4, y4);
1100ac341f1SConrad Meyer             r4 = _mm256_srli_epi32(r4, 25);
1110ac341f1SConrad Meyer             z4 = _mm256_xor_si256(z4, r4);
1120ac341f1SConrad Meyer 
1130ac341f1SConrad Meyer             y9 = z1;
1140ac341f1SConrad Meyer             y9 = _mm256_add_epi32(y9, z5);
1150ac341f1SConrad Meyer             r9 = y9;
1160ac341f1SConrad Meyer             y9 = _mm256_slli_epi32(y9, 7);
1170ac341f1SConrad Meyer             z9 = _mm256_xor_si256(z9, y9);
1180ac341f1SConrad Meyer             r9 = _mm256_srli_epi32(r9, 25);
1190ac341f1SConrad Meyer             z9 = _mm256_xor_si256(z9, r9);
1200ac341f1SConrad Meyer 
1210ac341f1SConrad Meyer             y8 = z0;
1220ac341f1SConrad Meyer             y8 = _mm256_add_epi32(y8, z4);
1230ac341f1SConrad Meyer             r8 = y8;
1240ac341f1SConrad Meyer             y8 = _mm256_slli_epi32(y8, 9);
1250ac341f1SConrad Meyer             z8 = _mm256_xor_si256(z8, y8);
1260ac341f1SConrad Meyer             r8 = _mm256_srli_epi32(r8, 23);
1270ac341f1SConrad Meyer             z8 = _mm256_xor_si256(z8, r8);
1280ac341f1SConrad Meyer 
1290ac341f1SConrad Meyer             y13 = z5;
1300ac341f1SConrad Meyer             y13 = _mm256_add_epi32(y13, z9);
1310ac341f1SConrad Meyer             r13 = y13;
1320ac341f1SConrad Meyer             y13 = _mm256_slli_epi32(y13, 9);
1330ac341f1SConrad Meyer             z13 = _mm256_xor_si256(z13, y13);
1340ac341f1SConrad Meyer             r13 = _mm256_srli_epi32(r13, 23);
1350ac341f1SConrad Meyer             z13 = _mm256_xor_si256(z13, r13);
1360ac341f1SConrad Meyer 
1370ac341f1SConrad Meyer             y12 = z4;
1380ac341f1SConrad Meyer             y12 = _mm256_add_epi32(y12, z8);
1390ac341f1SConrad Meyer             r12 = y12;
1400ac341f1SConrad Meyer             y12 = _mm256_slli_epi32(y12, 13);
1410ac341f1SConrad Meyer             z12 = _mm256_xor_si256(z12, y12);
1420ac341f1SConrad Meyer             r12 = _mm256_srli_epi32(r12, 19);
1430ac341f1SConrad Meyer             z12 = _mm256_xor_si256(z12, r12);
1440ac341f1SConrad Meyer 
1450ac341f1SConrad Meyer             y1 = z9;
1460ac341f1SConrad Meyer             y1 = _mm256_add_epi32(y1, z13);
1470ac341f1SConrad Meyer             r1 = y1;
1480ac341f1SConrad Meyer             y1 = _mm256_slli_epi32(y1, 13);
1490ac341f1SConrad Meyer             z1 = _mm256_xor_si256(z1, y1);
1500ac341f1SConrad Meyer             r1 = _mm256_srli_epi32(r1, 19);
1510ac341f1SConrad Meyer             z1 = _mm256_xor_si256(z1, r1);
1520ac341f1SConrad Meyer 
1530ac341f1SConrad Meyer             y0 = z8;
1540ac341f1SConrad Meyer             y0 = _mm256_add_epi32(y0, z12);
1550ac341f1SConrad Meyer             r0 = y0;
1560ac341f1SConrad Meyer             y0 = _mm256_slli_epi32(y0, 18);
1570ac341f1SConrad Meyer             z0 = _mm256_xor_si256(z0, y0);
1580ac341f1SConrad Meyer             r0 = _mm256_srli_epi32(r0, 14);
1590ac341f1SConrad Meyer             z0 = _mm256_xor_si256(z0, r0);
1600ac341f1SConrad Meyer 
1610ac341f1SConrad Meyer             y5 = z13;
1620ac341f1SConrad Meyer             y5 = _mm256_add_epi32(y5, z1);
1630ac341f1SConrad Meyer             r5 = y5;
1640ac341f1SConrad Meyer             y5 = _mm256_slli_epi32(y5, 18);
1650ac341f1SConrad Meyer             z5 = _mm256_xor_si256(z5, y5);
1660ac341f1SConrad Meyer             r5 = _mm256_srli_epi32(r5, 14);
1670ac341f1SConrad Meyer             z5 = _mm256_xor_si256(z5, r5);
1680ac341f1SConrad Meyer 
1690ac341f1SConrad Meyer             y14 = z6;
1700ac341f1SConrad Meyer             y14 = _mm256_add_epi32(y14, z10);
1710ac341f1SConrad Meyer             r14 = y14;
1720ac341f1SConrad Meyer             y14 = _mm256_slli_epi32(y14, 7);
1730ac341f1SConrad Meyer             z14 = _mm256_xor_si256(z14, y14);
1740ac341f1SConrad Meyer             r14 = _mm256_srli_epi32(r14, 25);
1750ac341f1SConrad Meyer             z14 = _mm256_xor_si256(z14, r14);
1760ac341f1SConrad Meyer 
1770ac341f1SConrad Meyer             y3 = z11;
1780ac341f1SConrad Meyer             y3 = _mm256_add_epi32(y3, z15);
1790ac341f1SConrad Meyer             r3 = y3;
1800ac341f1SConrad Meyer             y3 = _mm256_slli_epi32(y3, 7);
1810ac341f1SConrad Meyer             z3 = _mm256_xor_si256(z3, y3);
1820ac341f1SConrad Meyer             r3 = _mm256_srli_epi32(r3, 25);
1830ac341f1SConrad Meyer             z3 = _mm256_xor_si256(z3, r3);
1840ac341f1SConrad Meyer 
1850ac341f1SConrad Meyer             y2 = z10;
1860ac341f1SConrad Meyer             y2 = _mm256_add_epi32(y2, z14);
1870ac341f1SConrad Meyer             r2 = y2;
1880ac341f1SConrad Meyer             y2 = _mm256_slli_epi32(y2, 9);
1890ac341f1SConrad Meyer             z2 = _mm256_xor_si256(z2, y2);
1900ac341f1SConrad Meyer             r2 = _mm256_srli_epi32(r2, 23);
1910ac341f1SConrad Meyer             z2 = _mm256_xor_si256(z2, r2);
1920ac341f1SConrad Meyer 
1930ac341f1SConrad Meyer             y7 = z15;
1940ac341f1SConrad Meyer             y7 = _mm256_add_epi32(y7, z3);
1950ac341f1SConrad Meyer             r7 = y7;
1960ac341f1SConrad Meyer             y7 = _mm256_slli_epi32(y7, 9);
1970ac341f1SConrad Meyer             z7 = _mm256_xor_si256(z7, y7);
1980ac341f1SConrad Meyer             r7 = _mm256_srli_epi32(r7, 23);
1990ac341f1SConrad Meyer             z7 = _mm256_xor_si256(z7, r7);
2000ac341f1SConrad Meyer 
2010ac341f1SConrad Meyer             y6 = z14;
2020ac341f1SConrad Meyer             y6 = _mm256_add_epi32(y6, z2);
2030ac341f1SConrad Meyer             r6 = y6;
2040ac341f1SConrad Meyer             y6 = _mm256_slli_epi32(y6, 13);
2050ac341f1SConrad Meyer             z6 = _mm256_xor_si256(z6, y6);
2060ac341f1SConrad Meyer             r6 = _mm256_srli_epi32(r6, 19);
2070ac341f1SConrad Meyer             z6 = _mm256_xor_si256(z6, r6);
2080ac341f1SConrad Meyer 
2090ac341f1SConrad Meyer             y11 = z3;
2100ac341f1SConrad Meyer             y11 = _mm256_add_epi32(y11, z7);
2110ac341f1SConrad Meyer             r11 = y11;
2120ac341f1SConrad Meyer             y11 = _mm256_slli_epi32(y11, 13);
2130ac341f1SConrad Meyer             z11 = _mm256_xor_si256(z11, y11);
2140ac341f1SConrad Meyer             r11 = _mm256_srli_epi32(r11, 19);
2150ac341f1SConrad Meyer             z11 = _mm256_xor_si256(z11, r11);
2160ac341f1SConrad Meyer 
2170ac341f1SConrad Meyer             y10 = z2;
2180ac341f1SConrad Meyer             y10 = _mm256_add_epi32(y10, z6);
2190ac341f1SConrad Meyer             r10 = y10;
2200ac341f1SConrad Meyer             y10 = _mm256_slli_epi32(y10, 18);
2210ac341f1SConrad Meyer             z10 = _mm256_xor_si256(z10, y10);
2220ac341f1SConrad Meyer             r10 = _mm256_srli_epi32(r10, 14);
2230ac341f1SConrad Meyer             z10 = _mm256_xor_si256(z10, r10);
2240ac341f1SConrad Meyer 
2250ac341f1SConrad Meyer             y1 = z3;
2260ac341f1SConrad Meyer             y1 = _mm256_add_epi32(y1, z0);
2270ac341f1SConrad Meyer             r1 = y1;
2280ac341f1SConrad Meyer             y1 = _mm256_slli_epi32(y1, 7);
2290ac341f1SConrad Meyer             z1 = _mm256_xor_si256(z1, y1);
2300ac341f1SConrad Meyer             r1 = _mm256_srli_epi32(r1, 25);
2310ac341f1SConrad Meyer             z1 = _mm256_xor_si256(z1, r1);
2320ac341f1SConrad Meyer 
2330ac341f1SConrad Meyer             y15 = z7;
2340ac341f1SConrad Meyer             y15 = _mm256_add_epi32(y15, z11);
2350ac341f1SConrad Meyer             r15 = y15;
2360ac341f1SConrad Meyer             y15 = _mm256_slli_epi32(y15, 18);
2370ac341f1SConrad Meyer             z15 = _mm256_xor_si256(z15, y15);
2380ac341f1SConrad Meyer             r15 = _mm256_srli_epi32(r15, 14);
2390ac341f1SConrad Meyer             z15 = _mm256_xor_si256(z15, r15);
2400ac341f1SConrad Meyer 
2410ac341f1SConrad Meyer             y6 = z4;
2420ac341f1SConrad Meyer             y6 = _mm256_add_epi32(y6, z5);
2430ac341f1SConrad Meyer             r6 = y6;
2440ac341f1SConrad Meyer             y6 = _mm256_slli_epi32(y6, 7);
2450ac341f1SConrad Meyer             z6 = _mm256_xor_si256(z6, y6);
2460ac341f1SConrad Meyer             r6 = _mm256_srli_epi32(r6, 25);
2470ac341f1SConrad Meyer             z6 = _mm256_xor_si256(z6, r6);
2480ac341f1SConrad Meyer 
2490ac341f1SConrad Meyer             y2 = z0;
2500ac341f1SConrad Meyer             y2 = _mm256_add_epi32(y2, z1);
2510ac341f1SConrad Meyer             r2 = y2;
2520ac341f1SConrad Meyer             y2 = _mm256_slli_epi32(y2, 9);
2530ac341f1SConrad Meyer             z2 = _mm256_xor_si256(z2, y2);
2540ac341f1SConrad Meyer             r2 = _mm256_srli_epi32(r2, 23);
2550ac341f1SConrad Meyer             z2 = _mm256_xor_si256(z2, r2);
2560ac341f1SConrad Meyer 
2570ac341f1SConrad Meyer             y7 = z5;
2580ac341f1SConrad Meyer             y7 = _mm256_add_epi32(y7, z6);
2590ac341f1SConrad Meyer             r7 = y7;
2600ac341f1SConrad Meyer             y7 = _mm256_slli_epi32(y7, 9);
2610ac341f1SConrad Meyer             z7 = _mm256_xor_si256(z7, y7);
2620ac341f1SConrad Meyer             r7 = _mm256_srli_epi32(r7, 23);
2630ac341f1SConrad Meyer             z7 = _mm256_xor_si256(z7, r7);
2640ac341f1SConrad Meyer 
2650ac341f1SConrad Meyer             y3 = z1;
2660ac341f1SConrad Meyer             y3 = _mm256_add_epi32(y3, z2);
2670ac341f1SConrad Meyer             r3 = y3;
2680ac341f1SConrad Meyer             y3 = _mm256_slli_epi32(y3, 13);
2690ac341f1SConrad Meyer             z3 = _mm256_xor_si256(z3, y3);
2700ac341f1SConrad Meyer             r3 = _mm256_srli_epi32(r3, 19);
2710ac341f1SConrad Meyer             z3 = _mm256_xor_si256(z3, r3);
2720ac341f1SConrad Meyer 
2730ac341f1SConrad Meyer             y4 = z6;
2740ac341f1SConrad Meyer             y4 = _mm256_add_epi32(y4, z7);
2750ac341f1SConrad Meyer             r4 = y4;
2760ac341f1SConrad Meyer             y4 = _mm256_slli_epi32(y4, 13);
2770ac341f1SConrad Meyer             z4 = _mm256_xor_si256(z4, y4);
2780ac341f1SConrad Meyer             r4 = _mm256_srli_epi32(r4, 19);
2790ac341f1SConrad Meyer             z4 = _mm256_xor_si256(z4, r4);
2800ac341f1SConrad Meyer 
2810ac341f1SConrad Meyer             y0 = z2;
2820ac341f1SConrad Meyer             y0 = _mm256_add_epi32(y0, z3);
2830ac341f1SConrad Meyer             r0 = y0;
2840ac341f1SConrad Meyer             y0 = _mm256_slli_epi32(y0, 18);
2850ac341f1SConrad Meyer             z0 = _mm256_xor_si256(z0, y0);
2860ac341f1SConrad Meyer             r0 = _mm256_srli_epi32(r0, 14);
2870ac341f1SConrad Meyer             z0 = _mm256_xor_si256(z0, r0);
2880ac341f1SConrad Meyer 
2890ac341f1SConrad Meyer             y5 = z7;
2900ac341f1SConrad Meyer             y5 = _mm256_add_epi32(y5, z4);
2910ac341f1SConrad Meyer             r5 = y5;
2920ac341f1SConrad Meyer             y5 = _mm256_slli_epi32(y5, 18);
2930ac341f1SConrad Meyer             z5 = _mm256_xor_si256(z5, y5);
2940ac341f1SConrad Meyer             r5 = _mm256_srli_epi32(r5, 14);
2950ac341f1SConrad Meyer             z5 = _mm256_xor_si256(z5, r5);
2960ac341f1SConrad Meyer 
2970ac341f1SConrad Meyer             y11 = z9;
2980ac341f1SConrad Meyer             y11 = _mm256_add_epi32(y11, z10);
2990ac341f1SConrad Meyer             r11 = y11;
3000ac341f1SConrad Meyer             y11 = _mm256_slli_epi32(y11, 7);
3010ac341f1SConrad Meyer             z11 = _mm256_xor_si256(z11, y11);
3020ac341f1SConrad Meyer             r11 = _mm256_srli_epi32(r11, 25);
3030ac341f1SConrad Meyer             z11 = _mm256_xor_si256(z11, r11);
3040ac341f1SConrad Meyer 
3050ac341f1SConrad Meyer             y12 = z14;
3060ac341f1SConrad Meyer             y12 = _mm256_add_epi32(y12, z15);
3070ac341f1SConrad Meyer             r12 = y12;
3080ac341f1SConrad Meyer             y12 = _mm256_slli_epi32(y12, 7);
3090ac341f1SConrad Meyer             z12 = _mm256_xor_si256(z12, y12);
3100ac341f1SConrad Meyer             r12 = _mm256_srli_epi32(r12, 25);
3110ac341f1SConrad Meyer             z12 = _mm256_xor_si256(z12, r12);
3120ac341f1SConrad Meyer 
3130ac341f1SConrad Meyer             y8 = z10;
3140ac341f1SConrad Meyer             y8 = _mm256_add_epi32(y8, z11);
3150ac341f1SConrad Meyer             r8 = y8;
3160ac341f1SConrad Meyer             y8 = _mm256_slli_epi32(y8, 9);
3170ac341f1SConrad Meyer             z8 = _mm256_xor_si256(z8, y8);
3180ac341f1SConrad Meyer             r8 = _mm256_srli_epi32(r8, 23);
3190ac341f1SConrad Meyer             z8 = _mm256_xor_si256(z8, r8);
3200ac341f1SConrad Meyer 
3210ac341f1SConrad Meyer             y13 = z15;
3220ac341f1SConrad Meyer             y13 = _mm256_add_epi32(y13, z12);
3230ac341f1SConrad Meyer             r13 = y13;
3240ac341f1SConrad Meyer             y13 = _mm256_slli_epi32(y13, 9);
3250ac341f1SConrad Meyer             z13 = _mm256_xor_si256(z13, y13);
3260ac341f1SConrad Meyer             r13 = _mm256_srli_epi32(r13, 23);
3270ac341f1SConrad Meyer             z13 = _mm256_xor_si256(z13, r13);
3280ac341f1SConrad Meyer 
3290ac341f1SConrad Meyer             y9 = z11;
3300ac341f1SConrad Meyer             y9 = _mm256_add_epi32(y9, z8);
3310ac341f1SConrad Meyer             r9 = y9;
3320ac341f1SConrad Meyer             y9 = _mm256_slli_epi32(y9, 13);
3330ac341f1SConrad Meyer             z9 = _mm256_xor_si256(z9, y9);
3340ac341f1SConrad Meyer             r9 = _mm256_srli_epi32(r9, 19);
3350ac341f1SConrad Meyer             z9 = _mm256_xor_si256(z9, r9);
3360ac341f1SConrad Meyer 
3370ac341f1SConrad Meyer             y14 = z12;
3380ac341f1SConrad Meyer             y14 = _mm256_add_epi32(y14, z13);
3390ac341f1SConrad Meyer             r14 = y14;
3400ac341f1SConrad Meyer             y14 = _mm256_slli_epi32(y14, 13);
3410ac341f1SConrad Meyer             z14 = _mm256_xor_si256(z14, y14);
3420ac341f1SConrad Meyer             r14 = _mm256_srli_epi32(r14, 19);
3430ac341f1SConrad Meyer             z14 = _mm256_xor_si256(z14, r14);
3440ac341f1SConrad Meyer 
3450ac341f1SConrad Meyer             y10 = z8;
3460ac341f1SConrad Meyer             y10 = _mm256_add_epi32(y10, z9);
3470ac341f1SConrad Meyer             r10 = y10;
3480ac341f1SConrad Meyer             y10 = _mm256_slli_epi32(y10, 18);
3490ac341f1SConrad Meyer             z10 = _mm256_xor_si256(z10, y10);
3500ac341f1SConrad Meyer             r10 = _mm256_srli_epi32(r10, 14);
3510ac341f1SConrad Meyer             z10 = _mm256_xor_si256(z10, r10);
3520ac341f1SConrad Meyer 
3530ac341f1SConrad Meyer             y15 = z13;
3540ac341f1SConrad Meyer             y15 = _mm256_add_epi32(y15, z14);
3550ac341f1SConrad Meyer             r15 = y15;
3560ac341f1SConrad Meyer             y15 = _mm256_slli_epi32(y15, 18);
3570ac341f1SConrad Meyer             z15 = _mm256_xor_si256(z15, y15);
3580ac341f1SConrad Meyer             r15 = _mm256_srli_epi32(r15, 14);
3590ac341f1SConrad Meyer             z15 = _mm256_xor_si256(z15, r15);
3600ac341f1SConrad Meyer         }
3610ac341f1SConrad Meyer 
3620ac341f1SConrad Meyer /* store data ; this macro first transpose data in-registers, and then store
3630ac341f1SConrad Meyer  * them in memory. much faster with icc. */
3640ac341f1SConrad Meyer #define ONEQUAD_TRANSPOSE(A, B, C, D)                              \
3650ac341f1SConrad Meyer     {                                                              \
3660ac341f1SConrad Meyer         __m128i t0, t1, t2, t3;                                    \
3670ac341f1SConrad Meyer         z##A = _mm256_add_epi32(z##A, orig##A);                    \
3680ac341f1SConrad Meyer         z##B = _mm256_add_epi32(z##B, orig##B);                    \
3690ac341f1SConrad Meyer         z##C = _mm256_add_epi32(z##C, orig##C);                    \
3700ac341f1SConrad Meyer         z##D = _mm256_add_epi32(z##D, orig##D);                    \
3710ac341f1SConrad Meyer         y##A = _mm256_unpacklo_epi32(z##A, z##B);                  \
3720ac341f1SConrad Meyer         y##B = _mm256_unpacklo_epi32(z##C, z##D);                  \
3730ac341f1SConrad Meyer         y##C = _mm256_unpackhi_epi32(z##A, z##B);                  \
3740ac341f1SConrad Meyer         y##D = _mm256_unpackhi_epi32(z##C, z##D);                  \
3750ac341f1SConrad Meyer         z##A = _mm256_unpacklo_epi64(y##A, y##B);                  \
3760ac341f1SConrad Meyer         z##B = _mm256_unpackhi_epi64(y##A, y##B);                  \
3770ac341f1SConrad Meyer         z##C = _mm256_unpacklo_epi64(y##C, y##D);                  \
3780ac341f1SConrad Meyer         z##D = _mm256_unpackhi_epi64(y##C, y##D);                  \
3790ac341f1SConrad Meyer         t0   = _mm_xor_si128(_mm256_extracti128_si256(z##A, 0),    \
3800ac341f1SConrad Meyer                            _mm_loadu_si128((__m128i*) (m + 0)));   \
3810ac341f1SConrad Meyer         _mm_storeu_si128((__m128i*) (c + 0), t0);                  \
3820ac341f1SConrad Meyer         t1 = _mm_xor_si128(_mm256_extracti128_si256(z##B, 0),      \
3830ac341f1SConrad Meyer                            _mm_loadu_si128((__m128i*) (m + 64)));  \
3840ac341f1SConrad Meyer         _mm_storeu_si128((__m128i*) (c + 64), t1);                 \
3850ac341f1SConrad Meyer         t2 = _mm_xor_si128(_mm256_extracti128_si256(z##C, 0),      \
3860ac341f1SConrad Meyer                            _mm_loadu_si128((__m128i*) (m + 128))); \
3870ac341f1SConrad Meyer         _mm_storeu_si128((__m128i*) (c + 128), t2);                \
3880ac341f1SConrad Meyer         t3 = _mm_xor_si128(_mm256_extracti128_si256(z##D, 0),      \
3890ac341f1SConrad Meyer                            _mm_loadu_si128((__m128i*) (m + 192))); \
3900ac341f1SConrad Meyer         _mm_storeu_si128((__m128i*) (c + 192), t3);                \
3910ac341f1SConrad Meyer         t0 = _mm_xor_si128(_mm256_extracti128_si256(z##A, 1),      \
3920ac341f1SConrad Meyer                            _mm_loadu_si128((__m128i*) (m + 256))); \
3930ac341f1SConrad Meyer         _mm_storeu_si128((__m128i*) (c + 256), t0);                \
3940ac341f1SConrad Meyer         t1 = _mm_xor_si128(_mm256_extracti128_si256(z##B, 1),      \
3950ac341f1SConrad Meyer                            _mm_loadu_si128((__m128i*) (m + 320))); \
3960ac341f1SConrad Meyer         _mm_storeu_si128((__m128i*) (c + 320), t1);                \
3970ac341f1SConrad Meyer         t2 = _mm_xor_si128(_mm256_extracti128_si256(z##C, 1),      \
3980ac341f1SConrad Meyer                            _mm_loadu_si128((__m128i*) (m + 384))); \
3990ac341f1SConrad Meyer         _mm_storeu_si128((__m128i*) (c + 384), t2);                \
4000ac341f1SConrad Meyer         t3 = _mm_xor_si128(_mm256_extracti128_si256(z##D, 1),      \
4010ac341f1SConrad Meyer                            _mm_loadu_si128((__m128i*) (m + 448))); \
4020ac341f1SConrad Meyer         _mm_storeu_si128((__m128i*) (c + 448), t3);                \
4030ac341f1SConrad Meyer     }
4040ac341f1SConrad Meyer 
4050ac341f1SConrad Meyer #define ONEQUAD(A, B, C, D) ONEQUAD_TRANSPOSE(A, B, C, D)
4060ac341f1SConrad Meyer 
4070ac341f1SConrad Meyer #define ONEQUAD_UNPCK(A, B, C, D)                 \
4080ac341f1SConrad Meyer     {                                             \
4090ac341f1SConrad Meyer         z##A = _mm256_add_epi32(z##A, orig##A);   \
4100ac341f1SConrad Meyer         z##B = _mm256_add_epi32(z##B, orig##B);   \
4110ac341f1SConrad Meyer         z##C = _mm256_add_epi32(z##C, orig##C);   \
4120ac341f1SConrad Meyer         z##D = _mm256_add_epi32(z##D, orig##D);   \
4130ac341f1SConrad Meyer         y##A = _mm256_unpacklo_epi32(z##A, z##B); \
4140ac341f1SConrad Meyer         y##B = _mm256_unpacklo_epi32(z##C, z##D); \
4150ac341f1SConrad Meyer         y##C = _mm256_unpackhi_epi32(z##A, z##B); \
4160ac341f1SConrad Meyer         y##D = _mm256_unpackhi_epi32(z##C, z##D); \
4170ac341f1SConrad Meyer         z##A = _mm256_unpacklo_epi64(y##A, y##B); \
4180ac341f1SConrad Meyer         z##B = _mm256_unpackhi_epi64(y##A, y##B); \
4190ac341f1SConrad Meyer         z##C = _mm256_unpacklo_epi64(y##C, y##D); \
4200ac341f1SConrad Meyer         z##D = _mm256_unpackhi_epi64(y##C, y##D); \
4210ac341f1SConrad Meyer     }
4220ac341f1SConrad Meyer 
4230ac341f1SConrad Meyer #define ONEOCTO(A, B, C, D, A2, B2, C2, D2)                                     \
4240ac341f1SConrad Meyer     {                                                                           \
4250ac341f1SConrad Meyer         ONEQUAD_UNPCK(A, B, C, D);                                              \
4260ac341f1SConrad Meyer         ONEQUAD_UNPCK(A2, B2, C2, D2);                                          \
4270ac341f1SConrad Meyer         y##A  = _mm256_permute2x128_si256(z##A, z##A2, 0x20);                   \
4280ac341f1SConrad Meyer         y##A2 = _mm256_permute2x128_si256(z##A, z##A2, 0x31);                   \
4290ac341f1SConrad Meyer         y##B  = _mm256_permute2x128_si256(z##B, z##B2, 0x20);                   \
4300ac341f1SConrad Meyer         y##B2 = _mm256_permute2x128_si256(z##B, z##B2, 0x31);                   \
4310ac341f1SConrad Meyer         y##C  = _mm256_permute2x128_si256(z##C, z##C2, 0x20);                   \
4320ac341f1SConrad Meyer         y##C2 = _mm256_permute2x128_si256(z##C, z##C2, 0x31);                   \
4330ac341f1SConrad Meyer         y##D  = _mm256_permute2x128_si256(z##D, z##D2, 0x20);                   \
4340ac341f1SConrad Meyer         y##D2 = _mm256_permute2x128_si256(z##D, z##D2, 0x31);                   \
4350ac341f1SConrad Meyer         y##A  = _mm256_xor_si256(y##A, _mm256_loadu_si256((__m256i*) (m + 0))); \
4360ac341f1SConrad Meyer         y##B =                                                                  \
4370ac341f1SConrad Meyer             _mm256_xor_si256(y##B, _mm256_loadu_si256((__m256i*) (m + 64)));    \
4380ac341f1SConrad Meyer         y##C =                                                                  \
4390ac341f1SConrad Meyer             _mm256_xor_si256(y##C, _mm256_loadu_si256((__m256i*) (m + 128)));   \
4400ac341f1SConrad Meyer         y##D =                                                                  \
4410ac341f1SConrad Meyer             _mm256_xor_si256(y##D, _mm256_loadu_si256((__m256i*) (m + 192)));   \
4420ac341f1SConrad Meyer         y##A2 =                                                                 \
4430ac341f1SConrad Meyer             _mm256_xor_si256(y##A2, _mm256_loadu_si256((__m256i*) (m + 256)));  \
4440ac341f1SConrad Meyer         y##B2 =                                                                 \
4450ac341f1SConrad Meyer             _mm256_xor_si256(y##B2, _mm256_loadu_si256((__m256i*) (m + 320)));  \
4460ac341f1SConrad Meyer         y##C2 =                                                                 \
4470ac341f1SConrad Meyer             _mm256_xor_si256(y##C2, _mm256_loadu_si256((__m256i*) (m + 384)));  \
4480ac341f1SConrad Meyer         y##D2 =                                                                 \
4490ac341f1SConrad Meyer             _mm256_xor_si256(y##D2, _mm256_loadu_si256((__m256i*) (m + 448)));  \
4500ac341f1SConrad Meyer         _mm256_storeu_si256((__m256i*) (c + 0), y##A);                          \
4510ac341f1SConrad Meyer         _mm256_storeu_si256((__m256i*) (c + 64), y##B);                         \
4520ac341f1SConrad Meyer         _mm256_storeu_si256((__m256i*) (c + 128), y##C);                        \
4530ac341f1SConrad Meyer         _mm256_storeu_si256((__m256i*) (c + 192), y##D);                        \
4540ac341f1SConrad Meyer         _mm256_storeu_si256((__m256i*) (c + 256), y##A2);                       \
4550ac341f1SConrad Meyer         _mm256_storeu_si256((__m256i*) (c + 320), y##B2);                       \
4560ac341f1SConrad Meyer         _mm256_storeu_si256((__m256i*) (c + 384), y##C2);                       \
4570ac341f1SConrad Meyer         _mm256_storeu_si256((__m256i*) (c + 448), y##D2);                       \
4580ac341f1SConrad Meyer     }
4590ac341f1SConrad Meyer 
4600ac341f1SConrad Meyer         ONEOCTO(0, 1, 2, 3, 4, 5, 6, 7);
4610ac341f1SConrad Meyer         m += 32;
4620ac341f1SConrad Meyer         c += 32;
4630ac341f1SConrad Meyer         ONEOCTO(8, 9, 10, 11, 12, 13, 14, 15);
4640ac341f1SConrad Meyer         m -= 32;
4650ac341f1SConrad Meyer         c -= 32;
4660ac341f1SConrad Meyer 
4670ac341f1SConrad Meyer #undef ONEQUAD
4680ac341f1SConrad Meyer #undef ONEQUAD_TRANSPOSE
4690ac341f1SConrad Meyer #undef ONEQUAD_UNPCK
4700ac341f1SConrad Meyer #undef ONEOCTO
4710ac341f1SConrad Meyer 
4720ac341f1SConrad Meyer         bytes -= 512;
4730ac341f1SConrad Meyer         c += 512;
4740ac341f1SConrad Meyer         m += 512;
4750ac341f1SConrad Meyer     }
4760ac341f1SConrad Meyer }