1 while (bytes >= 64) { 2 __m128i x_0, x_1, x_2, x_3; 3 __m128i t_1; 4 const __m128i rot16 = 5 _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); 6 const __m128i rot8 = 7 _mm_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3); 8 9 uint32_t in12; 10 uint32_t in13; 11 int i; 12 13 x_0 = _mm_loadu_si128((__m128i*) (x + 0)); 14 x_1 = _mm_loadu_si128((__m128i*) (x + 4)); 15 x_2 = _mm_loadu_si128((__m128i*) (x + 8)); 16 x_3 = _mm_loadu_si128((__m128i*) (x + 12)); 17 18 for (i = 0; i < ROUNDS; i += 2) { 19 x_0 = _mm_add_epi32(x_0, x_1); 20 x_3 = _mm_xor_si128(x_3, x_0); 21 x_3 = _mm_shuffle_epi8(x_3, rot16); 22 23 x_2 = _mm_add_epi32(x_2, x_3); 24 x_1 = _mm_xor_si128(x_1, x_2); 25 26 t_1 = x_1; 27 x_1 = _mm_slli_epi32(x_1, 12); 28 t_1 = _mm_srli_epi32(t_1, 20); 29 x_1 = _mm_xor_si128(x_1, t_1); 30 31 x_0 = _mm_add_epi32(x_0, x_1); 32 x_3 = _mm_xor_si128(x_3, x_0); 33 x_0 = _mm_shuffle_epi32(x_0, 0x93); 34 x_3 = _mm_shuffle_epi8(x_3, rot8); 35 36 x_2 = _mm_add_epi32(x_2, x_3); 37 x_3 = _mm_shuffle_epi32(x_3, 0x4e); 38 x_1 = _mm_xor_si128(x_1, x_2); 39 x_2 = _mm_shuffle_epi32(x_2, 0x39); 40 41 t_1 = x_1; 42 x_1 = _mm_slli_epi32(x_1, 7); 43 t_1 = _mm_srli_epi32(t_1, 25); 44 x_1 = _mm_xor_si128(x_1, t_1); 45 46 x_0 = _mm_add_epi32(x_0, x_1); 47 x_3 = _mm_xor_si128(x_3, x_0); 48 x_3 = _mm_shuffle_epi8(x_3, rot16); 49 50 x_2 = _mm_add_epi32(x_2, x_3); 51 x_1 = _mm_xor_si128(x_1, x_2); 52 53 t_1 = x_1; 54 x_1 = _mm_slli_epi32(x_1, 12); 55 t_1 = _mm_srli_epi32(t_1, 20); 56 x_1 = _mm_xor_si128(x_1, t_1); 57 58 x_0 = _mm_add_epi32(x_0, x_1); 59 x_3 = _mm_xor_si128(x_3, x_0); 60 x_0 = _mm_shuffle_epi32(x_0, 0x39); 61 x_3 = _mm_shuffle_epi8(x_3, rot8); 62 63 x_2 = _mm_add_epi32(x_2, x_3); 64 x_3 = _mm_shuffle_epi32(x_3, 0x4e); 65 x_1 = _mm_xor_si128(x_1, x_2); 66 x_2 = _mm_shuffle_epi32(x_2, 0x93); 67 68 t_1 = x_1; 69 x_1 = _mm_slli_epi32(x_1, 7); 70 t_1 = _mm_srli_epi32(t_1, 25); 71 x_1 = _mm_xor_si128(x_1, t_1); 72 } 73 x_0 = _mm_add_epi32(x_0, _mm_loadu_si128((__m128i*) (x + 0))); 74 x_1 = _mm_add_epi32(x_1, _mm_loadu_si128((__m128i*) (x + 4))); 75 x_2 = _mm_add_epi32(x_2, _mm_loadu_si128((__m128i*) (x + 8))); 76 x_3 = _mm_add_epi32(x_3, _mm_loadu_si128((__m128i*) (x + 12))); 77 x_0 = _mm_xor_si128(x_0, _mm_loadu_si128((__m128i*) (m + 0))); 78 x_1 = _mm_xor_si128(x_1, _mm_loadu_si128((__m128i*) (m + 16))); 79 x_2 = _mm_xor_si128(x_2, _mm_loadu_si128((__m128i*) (m + 32))); 80 x_3 = _mm_xor_si128(x_3, _mm_loadu_si128((__m128i*) (m + 48))); 81 _mm_storeu_si128((__m128i*) (c + 0), x_0); 82 _mm_storeu_si128((__m128i*) (c + 16), x_1); 83 _mm_storeu_si128((__m128i*) (c + 32), x_2); 84 _mm_storeu_si128((__m128i*) (c + 48), x_3); 85 86 in12 = x[12]; 87 in13 = x[13]; 88 in12++; 89 if (in12 == 0) { 90 in13++; 91 } 92 x[12] = in12; 93 x[13] = in13; 94 95 bytes -= 64; 96 c += 64; 97 m += 64; 98 } 99