10ac341f1SConrad Meyer while (bytes >= 64) {
20ac341f1SConrad Meyer     __m128i diag0 = _mm_loadu_si128((__m128i *) (x + 0));
30ac341f1SConrad Meyer     __m128i diag1 = _mm_loadu_si128((__m128i *) (x + 4));
40ac341f1SConrad Meyer     __m128i diag2 = _mm_loadu_si128((__m128i *) (x + 8));
50ac341f1SConrad Meyer     __m128i diag3 = _mm_loadu_si128((__m128i *) (x + 12));
60ac341f1SConrad Meyer     __m128i a0, a1, a2, a3, a4, a5, a6, a7;
70ac341f1SConrad Meyer     __m128i b0, b1, b2, b3, b4, b5, b6, b7;
80ac341f1SConrad Meyer 
90ac341f1SConrad Meyer     uint32_t in8;
100ac341f1SConrad Meyer     uint32_t in9;
110ac341f1SConrad Meyer     int      i;
120ac341f1SConrad Meyer 
130ac341f1SConrad Meyer     a0 = diag1;
140ac341f1SConrad Meyer     for (i = 0; i < ROUNDS; i += 4) {
150ac341f1SConrad Meyer         a0    = _mm_add_epi32(a0, diag0);
160ac341f1SConrad Meyer         a1    = diag0;
170ac341f1SConrad Meyer         b0    = a0;
180ac341f1SConrad Meyer         a0    = _mm_slli_epi32(a0, 7);
190ac341f1SConrad Meyer         b0    = _mm_srli_epi32(b0, 25);
200ac341f1SConrad Meyer         diag3 = _mm_xor_si128(diag3, a0);
210ac341f1SConrad Meyer 
220ac341f1SConrad Meyer         diag3 = _mm_xor_si128(diag3, b0);
230ac341f1SConrad Meyer 
240ac341f1SConrad Meyer         a1    = _mm_add_epi32(a1, diag3);
250ac341f1SConrad Meyer         a2    = diag3;
260ac341f1SConrad Meyer         b1    = a1;
270ac341f1SConrad Meyer         a1    = _mm_slli_epi32(a1, 9);
280ac341f1SConrad Meyer         b1    = _mm_srli_epi32(b1, 23);
290ac341f1SConrad Meyer         diag2 = _mm_xor_si128(diag2, a1);
300ac341f1SConrad Meyer         diag3 = _mm_shuffle_epi32(diag3, 0x93);
310ac341f1SConrad Meyer         diag2 = _mm_xor_si128(diag2, b1);
320ac341f1SConrad Meyer 
330ac341f1SConrad Meyer         a2    = _mm_add_epi32(a2, diag2);
340ac341f1SConrad Meyer         a3    = diag2;
350ac341f1SConrad Meyer         b2    = a2;
360ac341f1SConrad Meyer         a2    = _mm_slli_epi32(a2, 13);
370ac341f1SConrad Meyer         b2    = _mm_srli_epi32(b2, 19);
380ac341f1SConrad Meyer         diag1 = _mm_xor_si128(diag1, a2);
390ac341f1SConrad Meyer         diag2 = _mm_shuffle_epi32(diag2, 0x4e);
400ac341f1SConrad Meyer         diag1 = _mm_xor_si128(diag1, b2);
410ac341f1SConrad Meyer 
420ac341f1SConrad Meyer         a3    = _mm_add_epi32(a3, diag1);
430ac341f1SConrad Meyer         a4    = diag3;
440ac341f1SConrad Meyer         b3    = a3;
450ac341f1SConrad Meyer         a3    = _mm_slli_epi32(a3, 18);
460ac341f1SConrad Meyer         b3    = _mm_srli_epi32(b3, 14);
470ac341f1SConrad Meyer         diag0 = _mm_xor_si128(diag0, a3);
480ac341f1SConrad Meyer         diag1 = _mm_shuffle_epi32(diag1, 0x39);
490ac341f1SConrad Meyer         diag0 = _mm_xor_si128(diag0, b3);
500ac341f1SConrad Meyer 
510ac341f1SConrad Meyer         a4    = _mm_add_epi32(a4, diag0);
520ac341f1SConrad Meyer         a5    = diag0;
530ac341f1SConrad Meyer         b4    = a4;
540ac341f1SConrad Meyer         a4    = _mm_slli_epi32(a4, 7);
550ac341f1SConrad Meyer         b4    = _mm_srli_epi32(b4, 25);
560ac341f1SConrad Meyer         diag1 = _mm_xor_si128(diag1, a4);
570ac341f1SConrad Meyer 
580ac341f1SConrad Meyer         diag1 = _mm_xor_si128(diag1, b4);
590ac341f1SConrad Meyer 
600ac341f1SConrad Meyer         a5    = _mm_add_epi32(a5, diag1);
610ac341f1SConrad Meyer         a6    = diag1;
620ac341f1SConrad Meyer         b5    = a5;
630ac341f1SConrad Meyer         a5    = _mm_slli_epi32(a5, 9);
640ac341f1SConrad Meyer         b5    = _mm_srli_epi32(b5, 23);
650ac341f1SConrad Meyer         diag2 = _mm_xor_si128(diag2, a5);
660ac341f1SConrad Meyer         diag1 = _mm_shuffle_epi32(diag1, 0x93);
670ac341f1SConrad Meyer         diag2 = _mm_xor_si128(diag2, b5);
680ac341f1SConrad Meyer 
690ac341f1SConrad Meyer         a6    = _mm_add_epi32(a6, diag2);
700ac341f1SConrad Meyer         a7    = diag2;
710ac341f1SConrad Meyer         b6    = a6;
720ac341f1SConrad Meyer         a6    = _mm_slli_epi32(a6, 13);
730ac341f1SConrad Meyer         b6    = _mm_srli_epi32(b6, 19);
740ac341f1SConrad Meyer         diag3 = _mm_xor_si128(diag3, a6);
750ac341f1SConrad Meyer         diag2 = _mm_shuffle_epi32(diag2, 0x4e);
760ac341f1SConrad Meyer         diag3 = _mm_xor_si128(diag3, b6);
770ac341f1SConrad Meyer 
780ac341f1SConrad Meyer         a7    = _mm_add_epi32(a7, diag3);
790ac341f1SConrad Meyer         a0    = diag1;
800ac341f1SConrad Meyer         b7    = a7;
810ac341f1SConrad Meyer         a7    = _mm_slli_epi32(a7, 18);
820ac341f1SConrad Meyer         b7    = _mm_srli_epi32(b7, 14);
830ac341f1SConrad Meyer         diag0 = _mm_xor_si128(diag0, a7);
840ac341f1SConrad Meyer         diag3 = _mm_shuffle_epi32(diag3, 0x39);
850ac341f1SConrad Meyer         diag0 = _mm_xor_si128(diag0, b7);
860ac341f1SConrad Meyer 
870ac341f1SConrad Meyer         a0    = _mm_add_epi32(a0, diag0);
880ac341f1SConrad Meyer         a1    = diag0;
890ac341f1SConrad Meyer         b0    = a0;
900ac341f1SConrad Meyer         a0    = _mm_slli_epi32(a0, 7);
910ac341f1SConrad Meyer         b0    = _mm_srli_epi32(b0, 25);
920ac341f1SConrad Meyer         diag3 = _mm_xor_si128(diag3, a0);
930ac341f1SConrad Meyer 
940ac341f1SConrad Meyer         diag3 = _mm_xor_si128(diag3, b0);
950ac341f1SConrad Meyer 
960ac341f1SConrad Meyer         a1    = _mm_add_epi32(a1, diag3);
970ac341f1SConrad Meyer         a2    = diag3;
980ac341f1SConrad Meyer         b1    = a1;
990ac341f1SConrad Meyer         a1    = _mm_slli_epi32(a1, 9);
1000ac341f1SConrad Meyer         b1    = _mm_srli_epi32(b1, 23);
1010ac341f1SConrad Meyer         diag2 = _mm_xor_si128(diag2, a1);
1020ac341f1SConrad Meyer         diag3 = _mm_shuffle_epi32(diag3, 0x93);
1030ac341f1SConrad Meyer         diag2 = _mm_xor_si128(diag2, b1);
1040ac341f1SConrad Meyer 
1050ac341f1SConrad Meyer         a2    = _mm_add_epi32(a2, diag2);
1060ac341f1SConrad Meyer         a3    = diag2;
1070ac341f1SConrad Meyer         b2    = a2;
1080ac341f1SConrad Meyer         a2    = _mm_slli_epi32(a2, 13);
1090ac341f1SConrad Meyer         b2    = _mm_srli_epi32(b2, 19);
1100ac341f1SConrad Meyer         diag1 = _mm_xor_si128(diag1, a2);
1110ac341f1SConrad Meyer         diag2 = _mm_shuffle_epi32(diag2, 0x4e);
1120ac341f1SConrad Meyer         diag1 = _mm_xor_si128(diag1, b2);
1130ac341f1SConrad Meyer 
1140ac341f1SConrad Meyer         a3    = _mm_add_epi32(a3, diag1);
1150ac341f1SConrad Meyer         a4    = diag3;
1160ac341f1SConrad Meyer         b3    = a3;
1170ac341f1SConrad Meyer         a3    = _mm_slli_epi32(a3, 18);
1180ac341f1SConrad Meyer         b3    = _mm_srli_epi32(b3, 14);
1190ac341f1SConrad Meyer         diag0 = _mm_xor_si128(diag0, a3);
1200ac341f1SConrad Meyer         diag1 = _mm_shuffle_epi32(diag1, 0x39);
1210ac341f1SConrad Meyer         diag0 = _mm_xor_si128(diag0, b3);
1220ac341f1SConrad Meyer 
1230ac341f1SConrad Meyer         a4    = _mm_add_epi32(a4, diag0);
1240ac341f1SConrad Meyer         a5    = diag0;
1250ac341f1SConrad Meyer         b4    = a4;
1260ac341f1SConrad Meyer         a4    = _mm_slli_epi32(a4, 7);
1270ac341f1SConrad Meyer         b4    = _mm_srli_epi32(b4, 25);
1280ac341f1SConrad Meyer         diag1 = _mm_xor_si128(diag1, a4);
1290ac341f1SConrad Meyer 
1300ac341f1SConrad Meyer         diag1 = _mm_xor_si128(diag1, b4);
1310ac341f1SConrad Meyer 
1320ac341f1SConrad Meyer         a5    = _mm_add_epi32(a5, diag1);
1330ac341f1SConrad Meyer         a6    = diag1;
1340ac341f1SConrad Meyer         b5    = a5;
1350ac341f1SConrad Meyer         a5    = _mm_slli_epi32(a5, 9);
1360ac341f1SConrad Meyer         b5    = _mm_srli_epi32(b5, 23);
1370ac341f1SConrad Meyer         diag2 = _mm_xor_si128(diag2, a5);
1380ac341f1SConrad Meyer         diag1 = _mm_shuffle_epi32(diag1, 0x93);
1390ac341f1SConrad Meyer         diag2 = _mm_xor_si128(diag2, b5);
1400ac341f1SConrad Meyer 
1410ac341f1SConrad Meyer         a6    = _mm_add_epi32(a6, diag2);
1420ac341f1SConrad Meyer         a7    = diag2;
1430ac341f1SConrad Meyer         b6    = a6;
1440ac341f1SConrad Meyer         a6    = _mm_slli_epi32(a6, 13);
1450ac341f1SConrad Meyer         b6    = _mm_srli_epi32(b6, 19);
1460ac341f1SConrad Meyer         diag3 = _mm_xor_si128(diag3, a6);
1470ac341f1SConrad Meyer         diag2 = _mm_shuffle_epi32(diag2, 0x4e);
1480ac341f1SConrad Meyer         diag3 = _mm_xor_si128(diag3, b6);
1490ac341f1SConrad Meyer 
1500ac341f1SConrad Meyer         a7    = _mm_add_epi32(a7, diag3);
1510ac341f1SConrad Meyer         a0    = diag1;
1520ac341f1SConrad Meyer         b7    = a7;
1530ac341f1SConrad Meyer         a7    = _mm_slli_epi32(a7, 18);
1540ac341f1SConrad Meyer         b7    = _mm_srli_epi32(b7, 14);
1550ac341f1SConrad Meyer         diag0 = _mm_xor_si128(diag0, a7);
1560ac341f1SConrad Meyer         diag3 = _mm_shuffle_epi32(diag3, 0x39);
1570ac341f1SConrad Meyer         diag0 = _mm_xor_si128(diag0, b7);
1580ac341f1SConrad Meyer     }
1590ac341f1SConrad Meyer 
1600ac341f1SConrad Meyer     diag0 = _mm_add_epi32(diag0, _mm_loadu_si128((__m128i *) (x + 0)));
1610ac341f1SConrad Meyer     diag1 = _mm_add_epi32(diag1, _mm_loadu_si128((__m128i *) (x + 4)));
1620ac341f1SConrad Meyer     diag2 = _mm_add_epi32(diag2, _mm_loadu_si128((__m128i *) (x + 8)));
1630ac341f1SConrad Meyer     diag3 = _mm_add_epi32(diag3, _mm_loadu_si128((__m128i *) (x + 12)));
1640ac341f1SConrad Meyer 
1650ac341f1SConrad Meyer #define ONEQUAD_SHUFFLE(A, B, C, D)                      \
1660ac341f1SConrad Meyer     do {                                                 \
1670ac341f1SConrad Meyer         uint32_t in##A = _mm_cvtsi128_si32(diag0);       \
1680ac341f1SConrad Meyer         uint32_t in##B = _mm_cvtsi128_si32(diag1);       \
1690ac341f1SConrad Meyer         uint32_t in##C = _mm_cvtsi128_si32(diag2);       \
1700ac341f1SConrad Meyer         uint32_t in##D = _mm_cvtsi128_si32(diag3);       \
1710ac341f1SConrad Meyer         diag0          = _mm_shuffle_epi32(diag0, 0x39); \
1720ac341f1SConrad Meyer         diag1          = _mm_shuffle_epi32(diag1, 0x39); \
1730ac341f1SConrad Meyer         diag2          = _mm_shuffle_epi32(diag2, 0x39); \
1740ac341f1SConrad Meyer         diag3          = _mm_shuffle_epi32(diag3, 0x39); \
1750ac341f1SConrad Meyer         in##A ^= *(uint32_t *) (m + (A * 4));            \
1760ac341f1SConrad Meyer         in##B ^= *(uint32_t *) (m + (B * 4));            \
1770ac341f1SConrad Meyer         in##C ^= *(uint32_t *) (m + (C * 4));            \
1780ac341f1SConrad Meyer         in##D ^= *(uint32_t *) (m + (D * 4));            \
1790ac341f1SConrad Meyer         *(uint32_t *) (c + (A * 4)) = in##A;             \
1800ac341f1SConrad Meyer         *(uint32_t *) (c + (B * 4)) = in##B;             \
1810ac341f1SConrad Meyer         *(uint32_t *) (c + (C * 4)) = in##C;             \
1820ac341f1SConrad Meyer         *(uint32_t *) (c + (D * 4)) = in##D;             \
1830ac341f1SConrad Meyer     } while (0)
1840ac341f1SConrad Meyer 
1850ac341f1SConrad Meyer #define ONEQUAD(A, B, C, D) ONEQUAD_SHUFFLE(A, B, C, D)
1860ac341f1SConrad Meyer 
1870ac341f1SConrad Meyer     ONEQUAD(0, 12, 8, 4);
1880ac341f1SConrad Meyer     ONEQUAD(5, 1, 13, 9);
1890ac341f1SConrad Meyer     ONEQUAD(10, 6, 2, 14);
1900ac341f1SConrad Meyer     ONEQUAD(15, 11, 7, 3);
1910ac341f1SConrad Meyer 
1920ac341f1SConrad Meyer #undef ONEQUAD
1930ac341f1SConrad Meyer #undef ONEQUAD_SHUFFLE
1940ac341f1SConrad Meyer 
1950ac341f1SConrad Meyer     in8 = x[8];
1960ac341f1SConrad Meyer     in9 = x[13];
1970ac341f1SConrad Meyer     in8++;
1980ac341f1SConrad Meyer     if (in8 == 0) {
1990ac341f1SConrad Meyer         in9++;
2000ac341f1SConrad Meyer     }
2010ac341f1SConrad Meyer     x[8]  = in8;
2020ac341f1SConrad Meyer     x[13] = in9;
2030ac341f1SConrad Meyer 
2040ac341f1SConrad Meyer     c += 64;
2050ac341f1SConrad Meyer     m += 64;
2060ac341f1SConrad Meyer     bytes -= 64;
2070ac341f1SConrad Meyer }
208