/*
 * AVX2 bulk path: while at least 512 bytes (8 x 64-byte blocks) remain,
 * generate eight Salsa20 keystream blocks in parallel, one block per
 * 32-bit lane of the ymm registers, and XOR them into the message.
 *
 * NOTE(review): x[] holds the 16-word state in a permuted order -- the
 * z<N> register names below give the logical Salsa20 word index for each
 * x[] slot (x[0] -> word 0, x[1] -> word 5, x[2] -> word 10, ...).
 * x[8] and x[13] are the low/high halves of the 64-bit block counter
 * (words 8 and 9) and are handled separately inside the loop.
 * The layout itself is defined by the caller, outside this view.
 */
if (bytes >= 512) {
    __m256i y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14,
        y15;

    /* the naive way seems as fast (if not a bit faster) than the vector way */
    /* Broadcast each (non-counter) state word into all 8 lanes. */
    __m256i z0 = _mm256_set1_epi32(x[0]);
    __m256i z5 = _mm256_set1_epi32(x[1]);
    __m256i z10 = _mm256_set1_epi32(x[2]);
    __m256i z15 = _mm256_set1_epi32(x[3]);
    __m256i z12 = _mm256_set1_epi32(x[4]);
    __m256i z1 = _mm256_set1_epi32(x[5]);
    __m256i z6 = _mm256_set1_epi32(x[6]);
    __m256i z11 = _mm256_set1_epi32(x[7]);
    __m256i z8;  /* useless: counter word, recomputed every iteration below */
    __m256i z13 = _mm256_set1_epi32(x[9]);
    __m256i z2 = _mm256_set1_epi32(x[10]);
    __m256i z7 = _mm256_set1_epi32(x[11]);
    __m256i z4 = _mm256_set1_epi32(x[12]);
    __m256i z9;  /* useless: counter word, recomputed every iteration below */
    __m256i z14 = _mm256_set1_epi32(x[14]);
    __m256i z3 = _mm256_set1_epi32(x[15]);

    /* Saved pre-round state; added back after the rounds (feed-forward). */
    __m256i orig0 = z0;
    __m256i orig1 = z1;
    __m256i orig2 = z2;
    __m256i orig3 = z3;
    __m256i orig4 = z4;
    __m256i orig5 = z5;
    __m256i orig6 = z6;
    __m256i orig7 = z7;
    __m256i orig8;  /* set per iteration: per-lane counter low words */
    __m256i orig9;  /* set per iteration: per-lane counter high words */
    __m256i orig10 = z10;
    __m256i orig11 = z11;
    __m256i orig12 = z12;
    __m256i orig13 = z13;
    __m256i orig14 = z14;
    __m256i orig15 = z15;

    uint32_t in8;
    uint32_t in9;
    int i;

    while (bytes >= 512) {
        /* vector implementation for z8 and z9 */
        /* faster than the naive version for 8 blocks */
        /* Per-lane counter increments: blocks 0-3 in t8, blocks 4-7 in t9. */
        const __m256i addv8 = _mm256_set_epi64x(3, 2, 1, 0);
        const __m256i addv9 = _mm256_set_epi64x(7, 6, 5, 4);
        const __m256i permute = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

        __m256i t8, t9;
        uint64_t in89;

        in8 = x[8];
        in9 = x[13]; /* see arrays above for the address translation */
        /* Assemble the 64-bit block counter: low word | high word << 32. */
        in89 = ((uint64_t) in8) | (((uint64_t) in9) << 32);

        /* Broadcast counter to all four 64-bit lanes of both registers. */
        z8 = z9 = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(in89));

        /* counter+0..3 and counter+4..7, as 64-bit adds (carry preserved). */
        t8 = _mm256_add_epi64(addv8, z8);
        t9 = _mm256_add_epi64(addv9, z9);

        /* Split the eight 64-bit counters into a vector of low 32-bit
         * halves (z8, state word 8) and high halves (z9, state word 9). */
        z8 = _mm256_unpacklo_epi32(t8, t9);
        z9 = _mm256_unpackhi_epi32(t8, t9);

        t8 = _mm256_unpacklo_epi32(z8, z9);
        t9 = _mm256_unpackhi_epi32(z8, z9);

        /* required because unpack* are intra-lane */
        z8 = _mm256_permutevar8x32_epi32(t8, permute);
        z9 = _mm256_permutevar8x32_epi32(t9, permute);

        orig8 = z8;
        orig9 = z9;

        /* Advance the stored counter past the 8 blocks consumed here. */
        in89 += 8;

        x[8] = in89 & 0xFFFFFFFF;
        x[13] = (in89 >> 32) & 0xFFFFFFFF;

        /* Reload working state from the saved inputs for this batch. */
        z5 = orig5;
        z10 = orig10;
        z15 = orig15;
        z14 = orig14;
        z3 = orig3;
        z6 = orig6;
        z11 = orig11;
        z1 = orig1;

        z7 = orig7;
        z13 = orig13;
        z2 = orig2;
        z9 = orig9;
        z0 = orig0;
        z12 = orig12;
        z4 = orig4;
        z8 = orig8;

        /* ROUNDS/2 double rounds.  Each 7-instruction group below computes
         * z[out] ^= ROTL32(z[a] + z[b], k), the rotate synthesized as
         * (v << k) ^ (v >> (32 - k)); shift pairs are 7/25, 9/23, 13/19,
         * 18/14.  Statement order follows the original ASM scheduling --
         * do not reorder. */
        for (i = 0; i < ROUNDS; i += 2) {
            /* the inner loop is a direct translation (regexp search/replace)
             * from the amd64-xmm6 ASM */
            __m256i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13,
                r14, r15;

            y4 = z12;
            y4 = _mm256_add_epi32(y4, z0);
            r4 = y4;
            y4 = _mm256_slli_epi32(y4, 7);
            z4 = _mm256_xor_si256(z4, y4);
            r4 = _mm256_srli_epi32(r4, 25);
            z4 = _mm256_xor_si256(z4, r4);

            y9 = z1;
            y9 = _mm256_add_epi32(y9, z5);
            r9 = y9;
            y9 = _mm256_slli_epi32(y9, 7);
            z9 = _mm256_xor_si256(z9, y9);
            r9 = _mm256_srli_epi32(r9, 25);
            z9 = _mm256_xor_si256(z9, r9);

            y8 = z0;
            y8 = _mm256_add_epi32(y8, z4);
            r8 = y8;
            y8 = _mm256_slli_epi32(y8, 9);
            z8 = _mm256_xor_si256(z8, y8);
            r8 = _mm256_srli_epi32(r8, 23);
            z8 = _mm256_xor_si256(z8, r8);

            y13 = z5;
            y13 = _mm256_add_epi32(y13, z9);
            r13 = y13;
            y13 = _mm256_slli_epi32(y13, 9);
            z13 = _mm256_xor_si256(z13, y13);
            r13 = _mm256_srli_epi32(r13, 23);
            z13 = _mm256_xor_si256(z13, r13);

            y12 = z4;
            y12 = _mm256_add_epi32(y12, z8);
            r12 = y12;
            y12 = _mm256_slli_epi32(y12, 13);
            z12 = _mm256_xor_si256(z12, y12);
            r12 = _mm256_srli_epi32(r12, 19);
            z12 = _mm256_xor_si256(z12, r12);

            y1 = z9;
            y1 = _mm256_add_epi32(y1, z13);
            r1 = y1;
            y1 = _mm256_slli_epi32(y1, 13);
            z1 = _mm256_xor_si256(z1, y1);
            r1 = _mm256_srli_epi32(r1, 19);
            z1 = _mm256_xor_si256(z1, r1);

            y0 = z8;
            y0 = _mm256_add_epi32(y0, z12);
            r0 = y0;
            y0 = _mm256_slli_epi32(y0, 18);
            z0 = _mm256_xor_si256(z0, y0);
            r0 = _mm256_srli_epi32(r0, 14);
            z0 = _mm256_xor_si256(z0, r0);

            y5 = z13;
            y5 = _mm256_add_epi32(y5, z1);
            r5 = y5;
            y5 = _mm256_slli_epi32(y5, 18);
            z5 = _mm256_xor_si256(z5, y5);
            r5 = _mm256_srli_epi32(r5, 14);
            z5 = _mm256_xor_si256(z5, r5);

            y14 = z6;
            y14 = _mm256_add_epi32(y14, z10);
            r14 = y14;
            y14 = _mm256_slli_epi32(y14, 7);
            z14 = _mm256_xor_si256(z14, y14);
            r14 = _mm256_srli_epi32(r14, 25);
            z14 = _mm256_xor_si256(z14, r14);

            y3 = z11;
            y3 = _mm256_add_epi32(y3, z15);
            r3 = y3;
            y3 = _mm256_slli_epi32(y3, 7);
            z3 = _mm256_xor_si256(z3, y3);
            r3 = _mm256_srli_epi32(r3, 25);
            z3 = _mm256_xor_si256(z3, r3);

            y2 = z10;
            y2 = _mm256_add_epi32(y2, z14);
            r2 = y2;
            y2 = _mm256_slli_epi32(y2, 9);
            z2 = _mm256_xor_si256(z2, y2);
            r2 = _mm256_srli_epi32(r2, 23);
            z2 = _mm256_xor_si256(z2, r2);

            y7 = z15;
            y7 = _mm256_add_epi32(y7, z3);
            r7 = y7;
            y7 = _mm256_slli_epi32(y7, 9);
            z7 = _mm256_xor_si256(z7, y7);
            r7 = _mm256_srli_epi32(r7, 23);
            z7 = _mm256_xor_si256(z7, r7);

            y6 = z14;
            y6 = _mm256_add_epi32(y6, z2);
            r6 = y6;
            y6 = _mm256_slli_epi32(y6, 13);
            z6 = _mm256_xor_si256(z6, y6);
            r6 = _mm256_srli_epi32(r6, 19);
            z6 = _mm256_xor_si256(z6, r6);

            y11 = z3;
            y11 = _mm256_add_epi32(y11, z7);
            r11 = y11;
            y11 = _mm256_slli_epi32(y11, 13);
            z11 = _mm256_xor_si256(z11, y11);
            r11 = _mm256_srli_epi32(r11, 19);
            z11 = _mm256_xor_si256(z11, r11);

            y10 = z2;
            y10 = _mm256_add_epi32(y10, z6);
            r10 = y10;
            y10 = _mm256_slli_epi32(y10, 18);
            z10 = _mm256_xor_si256(z10, y10);
            r10 = _mm256_srli_epi32(r10, 14);
            z10 = _mm256_xor_si256(z10, r10);

            /* Second half of the double round starts here; the tail of the
             * first half (y15 below) stays interleaved per the ASM schedule. */
            y1 = z3;
            y1 = _mm256_add_epi32(y1, z0);
            r1 = y1;
            y1 = _mm256_slli_epi32(y1, 7);
            z1 = _mm256_xor_si256(z1, y1);
            r1 = _mm256_srli_epi32(r1, 25);
            z1 = _mm256_xor_si256(z1, r1);

            y15 = z7;
            y15 = _mm256_add_epi32(y15, z11);
            r15 = y15;
            y15 = _mm256_slli_epi32(y15, 18);
            z15 = _mm256_xor_si256(z15, y15);
            r15 = _mm256_srli_epi32(r15, 14);
            z15 = _mm256_xor_si256(z15, r15);

            y6 = z4;
            y6 = _mm256_add_epi32(y6, z5);
            r6 = y6;
            y6 = _mm256_slli_epi32(y6, 7);
            z6 = _mm256_xor_si256(z6, y6);
            r6 = _mm256_srli_epi32(r6, 25);
            z6 = _mm256_xor_si256(z6, r6);

            y2 = z0;
            y2 = _mm256_add_epi32(y2, z1);
            r2 = y2;
            y2 = _mm256_slli_epi32(y2, 9);
            z2 = _mm256_xor_si256(z2, y2);
            r2 = _mm256_srli_epi32(r2, 23);
            z2 = _mm256_xor_si256(z2, r2);

            y7 = z5;
            y7 = _mm256_add_epi32(y7, z6);
            r7 = y7;
            y7 = _mm256_slli_epi32(y7, 9);
            z7 = _mm256_xor_si256(z7, y7);
            r7 = _mm256_srli_epi32(r7, 23);
            z7 = _mm256_xor_si256(z7, r7);

            y3 = z1;
            y3 = _mm256_add_epi32(y3, z2);
            r3 = y3;
            y3 = _mm256_slli_epi32(y3, 13);
            z3 = _mm256_xor_si256(z3, y3);
            r3 = _mm256_srli_epi32(r3, 19);
            z3 = _mm256_xor_si256(z3, r3);

            y4 = z6;
            y4 = _mm256_add_epi32(y4, z7);
            r4 = y4;
            y4 = _mm256_slli_epi32(y4, 13);
            z4 = _mm256_xor_si256(z4, y4);
            r4 = _mm256_srli_epi32(r4, 19);
            z4 = _mm256_xor_si256(z4, r4);

            y0 = z2;
            y0 = _mm256_add_epi32(y0, z3);
            r0 = y0;
            y0 = _mm256_slli_epi32(y0, 18);
            z0 = _mm256_xor_si256(z0, y0);
            r0 = _mm256_srli_epi32(r0, 14);
            z0 = _mm256_xor_si256(z0, r0);

            y5 = z7;
            y5 = _mm256_add_epi32(y5, z4);
            r5 = y5;
            y5 = _mm256_slli_epi32(y5, 18);
            z5 = _mm256_xor_si256(z5, y5);
            r5 = _mm256_srli_epi32(r5, 14);
            z5 = _mm256_xor_si256(z5, r5);

            y11 = z9;
            y11 = _mm256_add_epi32(y11, z10);
            r11 = y11;
            y11 = _mm256_slli_epi32(y11, 7);
            z11 = _mm256_xor_si256(z11, y11);
            r11 = _mm256_srli_epi32(r11, 25);
            z11 = _mm256_xor_si256(z11, r11);

            y12 = z14;
            y12 = _mm256_add_epi32(y12, z15);
            r12 = y12;
            y12 = _mm256_slli_epi32(y12, 7);
            z12 = _mm256_xor_si256(z12, y12);
            r12 = _mm256_srli_epi32(r12, 25);
            z12 = _mm256_xor_si256(z12, r12);

            y8 = z10;
            y8 = _mm256_add_epi32(y8, z11);
            r8 = y8;
            y8 = _mm256_slli_epi32(y8, 9);
            z8 = _mm256_xor_si256(z8, y8);
            r8 = _mm256_srli_epi32(r8, 23);
            z8 = _mm256_xor_si256(z8, r8);

            y13 = z15;
            y13 = _mm256_add_epi32(y13, z12);
            r13 = y13;
            y13 = _mm256_slli_epi32(y13, 9);
            z13 = _mm256_xor_si256(z13, y13);
            r13 = _mm256_srli_epi32(r13, 23);
            z13 = _mm256_xor_si256(z13, r13);

            y9 = z11;
            y9 = _mm256_add_epi32(y9, z8);
            r9 = y9;
            y9 = _mm256_slli_epi32(y9, 13);
            z9 = _mm256_xor_si256(z9, y9);
            r9 = _mm256_srli_epi32(r9, 19);
            z9 = _mm256_xor_si256(z9, r9);

            y14 = z12;
            y14 = _mm256_add_epi32(y14, z13);
            r14 = y14;
            y14 = _mm256_slli_epi32(y14, 13);
            z14 = _mm256_xor_si256(z14, y14);
            r14 = _mm256_srli_epi32(r14, 19);
            z14 = _mm256_xor_si256(z14, r14);

            y10 = z8;
            y10 = _mm256_add_epi32(y10, z9);
            r10 = y10;
            y10 = _mm256_slli_epi32(y10, 18);
            z10 = _mm256_xor_si256(z10, y10);
            r10 = _mm256_srli_epi32(r10, 14);
            z10 = _mm256_xor_si256(z10, r10);

            y15 = z13;
            y15 = _mm256_add_epi32(y15, z14);
            r15 = y15;
            y15 = _mm256_slli_epi32(y15, 18);
            z15 = _mm256_xor_si256(z15, y15);
            r15 = _mm256_srli_epi32(r15, 14);
            z15 = _mm256_xor_si256(z15, r15);
        }

        /* store data ; this macro first transpose data in-registers, and then store
         * them in memory. much faster with icc. */
        /* Adds the feed-forward (orig*), transposes four state rows across
         * the 8 lanes, then XOR-stores 16 bytes per block: 128-bit lane 0
         * covers blocks 0-3 (m/c + 0,64,128,192), lane 1 blocks 4-7
         * (m/c + 256..448).  Block stride is 64 bytes. */
#define ONEQUAD_TRANSPOSE(A, B, C, D)                                 \
    {                                                                 \
        __m128i t0, t1, t2, t3;                                       \
        z##A = _mm256_add_epi32(z##A, orig##A);                       \
        z##B = _mm256_add_epi32(z##B, orig##B);                       \
        z##C = _mm256_add_epi32(z##C, orig##C);                       \
        z##D = _mm256_add_epi32(z##D, orig##D);                       \
        y##A = _mm256_unpacklo_epi32(z##A, z##B);                     \
        y##B = _mm256_unpacklo_epi32(z##C, z##D);                     \
        y##C = _mm256_unpackhi_epi32(z##A, z##B);                     \
        y##D = _mm256_unpackhi_epi32(z##C, z##D);                     \
        z##A = _mm256_unpacklo_epi64(y##A, y##B);                     \
        z##B = _mm256_unpackhi_epi64(y##A, y##B);                     \
        z##C = _mm256_unpacklo_epi64(y##C, y##D);                     \
        z##D = _mm256_unpackhi_epi64(y##C, y##D);                     \
        t0 = _mm_xor_si128(_mm256_extracti128_si256(z##A, 0),         \
                           _mm_loadu_si128((__m128i*) (m + 0)));      \
        _mm_storeu_si128((__m128i*) (c + 0), t0);                     \
        t1 = _mm_xor_si128(_mm256_extracti128_si256(z##B, 0),         \
                           _mm_loadu_si128((__m128i*) (m + 64)));     \
        _mm_storeu_si128((__m128i*) (c + 64), t1);                    \
        t2 = _mm_xor_si128(_mm256_extracti128_si256(z##C, 0),         \
                           _mm_loadu_si128((__m128i*) (m + 128)));    \
        _mm_storeu_si128((__m128i*) (c + 128), t2);                   \
        t3 = _mm_xor_si128(_mm256_extracti128_si256(z##D, 0),         \
                           _mm_loadu_si128((__m128i*) (m + 192)));    \
        _mm_storeu_si128((__m128i*) (c + 192), t3);                   \
        t0 = _mm_xor_si128(_mm256_extracti128_si256(z##A, 1),         \
                           _mm_loadu_si128((__m128i*) (m + 256)));    \
        _mm_storeu_si128((__m128i*) (c + 256), t0);                   \
        t1 = _mm_xor_si128(_mm256_extracti128_si256(z##B, 1),         \
                           _mm_loadu_si128((__m128i*) (m + 320)));    \
        _mm_storeu_si128((__m128i*) (c + 320), t1);                   \
        t2 = _mm_xor_si128(_mm256_extracti128_si256(z##C, 1),         \
                           _mm_loadu_si128((__m128i*) (m + 384)));    \
        _mm_storeu_si128((__m128i*) (c + 384), t2);                   \
        t3 = _mm_xor_si128(_mm256_extracti128_si256(z##D, 1),         \
                           _mm_loadu_si128((__m128i*) (m + 448)));    \
        _mm_storeu_si128((__m128i*) (c + 448), t3);                   \
    }

#define ONEQUAD(A, B, C, D) ONEQUAD_TRANSPOSE(A, B, C, D)

        /* Feed-forward + transpose only; no memory traffic (used by
         * ONEOCTO, which does full 256-bit stores instead). */
#define ONEQUAD_UNPCK(A, B, C, D)                 \
    {                                             \
        z##A = _mm256_add_epi32(z##A, orig##A);   \
        z##B = _mm256_add_epi32(z##B, orig##B);   \
        z##C = _mm256_add_epi32(z##C, orig##C);   \
        z##D = _mm256_add_epi32(z##D, orig##D);   \
        y##A = _mm256_unpacklo_epi32(z##A, z##B); \
        y##B = _mm256_unpacklo_epi32(z##C, z##D); \
        y##C = _mm256_unpackhi_epi32(z##A, z##B); \
        y##D = _mm256_unpackhi_epi32(z##C, z##D); \
        z##A = _mm256_unpacklo_epi64(y##A, y##B); \
        z##B = _mm256_unpackhi_epi64(y##A, y##B); \
        z##C = _mm256_unpacklo_epi64(y##C, y##D); \
        z##D = _mm256_unpackhi_epi64(y##C, y##D); \
    }

        /* Transposes eight state rows across all 8 blocks and XOR-stores
         * 32 bytes per block with full-width loads/stores; the 2x128
         * permutes pair lane-0 halves (blocks 0-3) with lane-1 halves
         * (blocks 4-7). */
#define ONEOCTO(A, B, C, D, A2, B2, C2, D2)                                   \
    {                                                                         \
        ONEQUAD_UNPCK(A, B, C, D);                                            \
        ONEQUAD_UNPCK(A2, B2, C2, D2);                                        \
        y##A = _mm256_permute2x128_si256(z##A, z##A2, 0x20);                  \
        y##A2 = _mm256_permute2x128_si256(z##A, z##A2, 0x31);                 \
        y##B = _mm256_permute2x128_si256(z##B, z##B2, 0x20);                  \
        y##B2 = _mm256_permute2x128_si256(z##B, z##B2, 0x31);                 \
        y##C = _mm256_permute2x128_si256(z##C, z##C2, 0x20);                  \
        y##C2 = _mm256_permute2x128_si256(z##C, z##C2, 0x31);                 \
        y##D = _mm256_permute2x128_si256(z##D, z##D2, 0x20);                  \
        y##D2 = _mm256_permute2x128_si256(z##D, z##D2, 0x31);                 \
        y##A = _mm256_xor_si256(y##A, _mm256_loadu_si256((__m256i*) (m + 0))); \
        y##B =                                                                \
            _mm256_xor_si256(y##B, _mm256_loadu_si256((__m256i*) (m + 64)));  \
        y##C =                                                                \
            _mm256_xor_si256(y##C, _mm256_loadu_si256((__m256i*) (m + 128))); \
        y##D =                                                                \
            _mm256_xor_si256(y##D, _mm256_loadu_si256((__m256i*) (m + 192))); \
        y##A2 =                                                               \
            _mm256_xor_si256(y##A2, _mm256_loadu_si256((__m256i*) (m + 256))); \
        y##B2 =                                                               \
            _mm256_xor_si256(y##B2, _mm256_loadu_si256((__m256i*) (m + 320))); \
        y##C2 =                                                               \
            _mm256_xor_si256(y##C2, _mm256_loadu_si256((__m256i*) (m + 384))); \
        y##D2 =                                                               \
            _mm256_xor_si256(y##D2, _mm256_loadu_si256((__m256i*) (m + 448))); \
        _mm256_storeu_si256((__m256i*) (c + 0), y##A);                        \
        _mm256_storeu_si256((__m256i*) (c + 64), y##B);                       \
        _mm256_storeu_si256((__m256i*) (c + 128), y##C);                      \
        _mm256_storeu_si256((__m256i*) (c + 192), y##D);                      \
        _mm256_storeu_si256((__m256i*) (c + 256), y##A2);                     \
        _mm256_storeu_si256((__m256i*) (c + 320), y##B2);                     \
        _mm256_storeu_si256((__m256i*) (c + 384), y##C2);                     \
        _mm256_storeu_si256((__m256i*) (c + 448), y##D2);                     \
    }

        /* Rows 0-7 at block offset 0, rows 8-15 at block offset 32. */
        ONEOCTO(0, 1, 2, 3, 4, 5, 6, 7);
        m += 32;
        c += 32;
        ONEOCTO(8, 9, 10, 11, 12, 13, 14, 15);
        m -= 32;
        c -= 32;

#undef ONEQUAD
#undef ONEQUAD_TRANSPOSE
#undef ONEQUAD_UNPCK
#undef ONEOCTO

        bytes -= 512;
        c += 512;
        m += 512;
    }
}