10ac341f1SConrad Meyer while (bytes >= 64) { 20ac341f1SConrad Meyer __m128i diag0 = _mm_loadu_si128((__m128i *) (x + 0)); 30ac341f1SConrad Meyer __m128i diag1 = _mm_loadu_si128((__m128i *) (x + 4)); 40ac341f1SConrad Meyer __m128i diag2 = _mm_loadu_si128((__m128i *) (x + 8)); 50ac341f1SConrad Meyer __m128i diag3 = _mm_loadu_si128((__m128i *) (x + 12)); 60ac341f1SConrad Meyer __m128i a0, a1, a2, a3, a4, a5, a6, a7; 70ac341f1SConrad Meyer __m128i b0, b1, b2, b3, b4, b5, b6, b7; 80ac341f1SConrad Meyer 90ac341f1SConrad Meyer uint32_t in8; 100ac341f1SConrad Meyer uint32_t in9; 110ac341f1SConrad Meyer int i; 120ac341f1SConrad Meyer 130ac341f1SConrad Meyer a0 = diag1; 140ac341f1SConrad Meyer for (i = 0; i < ROUNDS; i += 4) { 150ac341f1SConrad Meyer a0 = _mm_add_epi32(a0, diag0); 160ac341f1SConrad Meyer a1 = diag0; 170ac341f1SConrad Meyer b0 = a0; 180ac341f1SConrad Meyer a0 = _mm_slli_epi32(a0, 7); 190ac341f1SConrad Meyer b0 = _mm_srli_epi32(b0, 25); 200ac341f1SConrad Meyer diag3 = _mm_xor_si128(diag3, a0); 210ac341f1SConrad Meyer 220ac341f1SConrad Meyer diag3 = _mm_xor_si128(diag3, b0); 230ac341f1SConrad Meyer 240ac341f1SConrad Meyer a1 = _mm_add_epi32(a1, diag3); 250ac341f1SConrad Meyer a2 = diag3; 260ac341f1SConrad Meyer b1 = a1; 270ac341f1SConrad Meyer a1 = _mm_slli_epi32(a1, 9); 280ac341f1SConrad Meyer b1 = _mm_srli_epi32(b1, 23); 290ac341f1SConrad Meyer diag2 = _mm_xor_si128(diag2, a1); 300ac341f1SConrad Meyer diag3 = _mm_shuffle_epi32(diag3, 0x93); 310ac341f1SConrad Meyer diag2 = _mm_xor_si128(diag2, b1); 320ac341f1SConrad Meyer 330ac341f1SConrad Meyer a2 = _mm_add_epi32(a2, diag2); 340ac341f1SConrad Meyer a3 = diag2; 350ac341f1SConrad Meyer b2 = a2; 360ac341f1SConrad Meyer a2 = _mm_slli_epi32(a2, 13); 370ac341f1SConrad Meyer b2 = _mm_srli_epi32(b2, 19); 380ac341f1SConrad Meyer diag1 = _mm_xor_si128(diag1, a2); 390ac341f1SConrad Meyer diag2 = _mm_shuffle_epi32(diag2, 0x4e); 400ac341f1SConrad Meyer diag1 = _mm_xor_si128(diag1, b2); 410ac341f1SConrad Meyer 420ac341f1SConrad Meyer a3 = _mm_add_epi32(a3, diag1); 430ac341f1SConrad Meyer a4 = diag3; 440ac341f1SConrad Meyer b3 = a3; 450ac341f1SConrad Meyer a3 = _mm_slli_epi32(a3, 18); 460ac341f1SConrad Meyer b3 = _mm_srli_epi32(b3, 14); 470ac341f1SConrad Meyer diag0 = _mm_xor_si128(diag0, a3); 480ac341f1SConrad Meyer diag1 = _mm_shuffle_epi32(diag1, 0x39); 490ac341f1SConrad Meyer diag0 = _mm_xor_si128(diag0, b3); 500ac341f1SConrad Meyer 510ac341f1SConrad Meyer a4 = _mm_add_epi32(a4, diag0); 520ac341f1SConrad Meyer a5 = diag0; 530ac341f1SConrad Meyer b4 = a4; 540ac341f1SConrad Meyer a4 = _mm_slli_epi32(a4, 7); 550ac341f1SConrad Meyer b4 = _mm_srli_epi32(b4, 25); 560ac341f1SConrad Meyer diag1 = _mm_xor_si128(diag1, a4); 570ac341f1SConrad Meyer 580ac341f1SConrad Meyer diag1 = _mm_xor_si128(diag1, b4); 590ac341f1SConrad Meyer 600ac341f1SConrad Meyer a5 = _mm_add_epi32(a5, diag1); 610ac341f1SConrad Meyer a6 = diag1; 620ac341f1SConrad Meyer b5 = a5; 630ac341f1SConrad Meyer a5 = _mm_slli_epi32(a5, 9); 640ac341f1SConrad Meyer b5 = _mm_srli_epi32(b5, 23); 650ac341f1SConrad Meyer diag2 = _mm_xor_si128(diag2, a5); 660ac341f1SConrad Meyer diag1 = _mm_shuffle_epi32(diag1, 0x93); 670ac341f1SConrad Meyer diag2 = _mm_xor_si128(diag2, b5); 680ac341f1SConrad Meyer 690ac341f1SConrad Meyer a6 = _mm_add_epi32(a6, diag2); 700ac341f1SConrad Meyer a7 = diag2; 710ac341f1SConrad Meyer b6 = a6; 720ac341f1SConrad Meyer a6 = _mm_slli_epi32(a6, 13); 730ac341f1SConrad Meyer b6 = _mm_srli_epi32(b6, 19); 740ac341f1SConrad Meyer diag3 = _mm_xor_si128(diag3, a6); 750ac341f1SConrad Meyer diag2 = _mm_shuffle_epi32(diag2, 0x4e); 760ac341f1SConrad Meyer diag3 = _mm_xor_si128(diag3, b6); 770ac341f1SConrad Meyer 780ac341f1SConrad Meyer a7 = _mm_add_epi32(a7, diag3); 790ac341f1SConrad Meyer a0 = diag1; 800ac341f1SConrad Meyer b7 = a7; 810ac341f1SConrad Meyer a7 = _mm_slli_epi32(a7, 18); 820ac341f1SConrad Meyer b7 = _mm_srli_epi32(b7, 14); 830ac341f1SConrad Meyer diag0 = _mm_xor_si128(diag0, a7); 840ac341f1SConrad Meyer diag3 = _mm_shuffle_epi32(diag3, 0x39); 850ac341f1SConrad Meyer diag0 = _mm_xor_si128(diag0, b7); 860ac341f1SConrad Meyer 870ac341f1SConrad Meyer a0 = _mm_add_epi32(a0, diag0); 880ac341f1SConrad Meyer a1 = diag0; 890ac341f1SConrad Meyer b0 = a0; 900ac341f1SConrad Meyer a0 = _mm_slli_epi32(a0, 7); 910ac341f1SConrad Meyer b0 = _mm_srli_epi32(b0, 25); 920ac341f1SConrad Meyer diag3 = _mm_xor_si128(diag3, a0); 930ac341f1SConrad Meyer 940ac341f1SConrad Meyer diag3 = _mm_xor_si128(diag3, b0); 950ac341f1SConrad Meyer 960ac341f1SConrad Meyer a1 = _mm_add_epi32(a1, diag3); 970ac341f1SConrad Meyer a2 = diag3; 980ac341f1SConrad Meyer b1 = a1; 990ac341f1SConrad Meyer a1 = _mm_slli_epi32(a1, 9); 1000ac341f1SConrad Meyer b1 = _mm_srli_epi32(b1, 23); 1010ac341f1SConrad Meyer diag2 = _mm_xor_si128(diag2, a1); 1020ac341f1SConrad Meyer diag3 = _mm_shuffle_epi32(diag3, 0x93); 1030ac341f1SConrad Meyer diag2 = _mm_xor_si128(diag2, b1); 1040ac341f1SConrad Meyer 1050ac341f1SConrad Meyer a2 = _mm_add_epi32(a2, diag2); 1060ac341f1SConrad Meyer a3 = diag2; 1070ac341f1SConrad Meyer b2 = a2; 1080ac341f1SConrad Meyer a2 = _mm_slli_epi32(a2, 13); 1090ac341f1SConrad Meyer b2 = _mm_srli_epi32(b2, 19); 1100ac341f1SConrad Meyer diag1 = _mm_xor_si128(diag1, a2); 1110ac341f1SConrad Meyer diag2 = _mm_shuffle_epi32(diag2, 0x4e); 1120ac341f1SConrad Meyer diag1 = _mm_xor_si128(diag1, b2); 1130ac341f1SConrad Meyer 1140ac341f1SConrad Meyer a3 = _mm_add_epi32(a3, diag1); 1150ac341f1SConrad Meyer a4 = diag3; 1160ac341f1SConrad Meyer b3 = a3; 1170ac341f1SConrad Meyer a3 = _mm_slli_epi32(a3, 18); 1180ac341f1SConrad Meyer b3 = _mm_srli_epi32(b3, 14); 1190ac341f1SConrad Meyer diag0 = _mm_xor_si128(diag0, a3); 1200ac341f1SConrad Meyer diag1 = _mm_shuffle_epi32(diag1, 0x39); 1210ac341f1SConrad Meyer diag0 = _mm_xor_si128(diag0, b3); 1220ac341f1SConrad Meyer 1230ac341f1SConrad Meyer a4 = _mm_add_epi32(a4, diag0); 1240ac341f1SConrad Meyer a5 = diag0; 1250ac341f1SConrad Meyer b4 = a4; 1260ac341f1SConrad Meyer a4 = _mm_slli_epi32(a4, 7); 1270ac341f1SConrad Meyer b4 = _mm_srli_epi32(b4, 25); 1280ac341f1SConrad Meyer diag1 = _mm_xor_si128(diag1, a4); 1290ac341f1SConrad Meyer 1300ac341f1SConrad Meyer diag1 = _mm_xor_si128(diag1, b4); 1310ac341f1SConrad Meyer 1320ac341f1SConrad Meyer a5 = _mm_add_epi32(a5, diag1); 1330ac341f1SConrad Meyer a6 = diag1; 1340ac341f1SConrad Meyer b5 = a5; 1350ac341f1SConrad Meyer a5 = _mm_slli_epi32(a5, 9); 1360ac341f1SConrad Meyer b5 = _mm_srli_epi32(b5, 23); 1370ac341f1SConrad Meyer diag2 = _mm_xor_si128(diag2, a5); 1380ac341f1SConrad Meyer diag1 = _mm_shuffle_epi32(diag1, 0x93); 1390ac341f1SConrad Meyer diag2 = _mm_xor_si128(diag2, b5); 1400ac341f1SConrad Meyer 1410ac341f1SConrad Meyer a6 = _mm_add_epi32(a6, diag2); 1420ac341f1SConrad Meyer a7 = diag2; 1430ac341f1SConrad Meyer b6 = a6; 1440ac341f1SConrad Meyer a6 = _mm_slli_epi32(a6, 13); 1450ac341f1SConrad Meyer b6 = _mm_srli_epi32(b6, 19); 1460ac341f1SConrad Meyer diag3 = _mm_xor_si128(diag3, a6); 1470ac341f1SConrad Meyer diag2 = _mm_shuffle_epi32(diag2, 0x4e); 1480ac341f1SConrad Meyer diag3 = _mm_xor_si128(diag3, b6); 1490ac341f1SConrad Meyer 1500ac341f1SConrad Meyer a7 = _mm_add_epi32(a7, diag3); 1510ac341f1SConrad Meyer a0 = diag1; 1520ac341f1SConrad Meyer b7 = a7; 1530ac341f1SConrad Meyer a7 = _mm_slli_epi32(a7, 18); 1540ac341f1SConrad Meyer b7 = _mm_srli_epi32(b7, 14); 1550ac341f1SConrad Meyer diag0 = _mm_xor_si128(diag0, a7); 1560ac341f1SConrad Meyer diag3 = _mm_shuffle_epi32(diag3, 0x39); 1570ac341f1SConrad Meyer diag0 = _mm_xor_si128(diag0, b7); 1580ac341f1SConrad Meyer } 1590ac341f1SConrad Meyer 1600ac341f1SConrad Meyer diag0 = _mm_add_epi32(diag0, _mm_loadu_si128((__m128i *) (x + 0))); 1610ac341f1SConrad Meyer diag1 = _mm_add_epi32(diag1, _mm_loadu_si128((__m128i *) (x + 4))); 1620ac341f1SConrad Meyer diag2 = _mm_add_epi32(diag2, _mm_loadu_si128((__m128i *) (x + 8))); 1630ac341f1SConrad Meyer diag3 = _mm_add_epi32(diag3, _mm_loadu_si128((__m128i *) (x + 12))); 1640ac341f1SConrad Meyer 1650ac341f1SConrad Meyer #define ONEQUAD_SHUFFLE(A, B, C, D) \ 1660ac341f1SConrad Meyer do { \ 1670ac341f1SConrad Meyer uint32_t in##A = _mm_cvtsi128_si32(diag0); \ 1680ac341f1SConrad Meyer uint32_t in##B = _mm_cvtsi128_si32(diag1); \ 1690ac341f1SConrad Meyer uint32_t in##C = _mm_cvtsi128_si32(diag2); \ 1700ac341f1SConrad Meyer uint32_t in##D = _mm_cvtsi128_si32(diag3); \ 1710ac341f1SConrad Meyer diag0 = _mm_shuffle_epi32(diag0, 0x39); \ 1720ac341f1SConrad Meyer diag1 = _mm_shuffle_epi32(diag1, 0x39); \ 1730ac341f1SConrad Meyer diag2 = _mm_shuffle_epi32(diag2, 0x39); \ 1740ac341f1SConrad Meyer diag3 = _mm_shuffle_epi32(diag3, 0x39); \ 1750ac341f1SConrad Meyer in##A ^= *(uint32_t *) (m + (A * 4)); \ 1760ac341f1SConrad Meyer in##B ^= *(uint32_t *) (m + (B * 4)); \ 1770ac341f1SConrad Meyer in##C ^= *(uint32_t *) (m + (C * 4)); \ 1780ac341f1SConrad Meyer in##D ^= *(uint32_t *) (m + (D * 4)); \ 1790ac341f1SConrad Meyer *(uint32_t *) (c + (A * 4)) = in##A; \ 1800ac341f1SConrad Meyer *(uint32_t *) (c + (B * 4)) = in##B; \ 1810ac341f1SConrad Meyer *(uint32_t *) (c + (C * 4)) = in##C; \ 1820ac341f1SConrad Meyer *(uint32_t *) (c + (D * 4)) = in##D; \ 1830ac341f1SConrad Meyer } while (0) 1840ac341f1SConrad Meyer 1850ac341f1SConrad Meyer #define ONEQUAD(A, B, C, D) ONEQUAD_SHUFFLE(A, B, C, D) 1860ac341f1SConrad Meyer 1870ac341f1SConrad Meyer ONEQUAD(0, 12, 8, 4); 1880ac341f1SConrad Meyer ONEQUAD(5, 1, 13, 9); 1890ac341f1SConrad Meyer ONEQUAD(10, 6, 2, 14); 1900ac341f1SConrad Meyer ONEQUAD(15, 11, 7, 3); 1910ac341f1SConrad Meyer 1920ac341f1SConrad Meyer #undef ONEQUAD 1930ac341f1SConrad Meyer #undef ONEQUAD_SHUFFLE 1940ac341f1SConrad Meyer 1950ac341f1SConrad Meyer in8 = x[8]; 1960ac341f1SConrad Meyer in9 = x[13]; 1970ac341f1SConrad Meyer in8++; 1980ac341f1SConrad Meyer if (in8 == 0) { 1990ac341f1SConrad Meyer in9++; 2000ac341f1SConrad Meyer } 2010ac341f1SConrad Meyer x[8] = in8; 2020ac341f1SConrad Meyer x[13] = in9; 2030ac341f1SConrad Meyer 2040ac341f1SConrad Meyer c += 64; 2050ac341f1SConrad Meyer m += 64; 2060ac341f1SConrad Meyer bytes -= 64; 2070ac341f1SConrad Meyer } 208