/*
 * Generate the final partial keystream block (1..64 bytes) of the ChaCha
 * stream cipher with SSE2/SSSE3 intrinsics, then XOR it into the message.
 * NOTE(review): `bytes`, `x`, `c`, `m` and ROUNDS are declared outside this
 * fragment — presumably x is the uint32_t[16] ChaCha state, m/c are the
 * message/ciphertext cursors, and ROUNDS is 20 for ChaCha20; confirm against
 * the enclosing function.
 */
if (bytes > 0) {
    __m128i x_0, x_1, x_2, x_3;
    __m128i t_1;
    /* pshufb masks: within each 32-bit lane, permute bytes [3,2,1,0] ->
     * [1,0,3,2] (rotate left by 16) and -> [2,1,0,3] (rotate left by 8).
     * These replace the shift-pair rotations for the 16- and 8-bit cases. */
    const __m128i rot16 =
        _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
    const __m128i rot8 =
        _mm_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3);
    uint8_t partialblock[64]; /* one full keystream block; only `bytes` used */

    unsigned int i;

    /* Load the 16-word state as four rows of four 32-bit words. */
    x_0 = _mm_loadu_si128((__m128i*) (x + 0));
    x_1 = _mm_loadu_si128((__m128i*) (x + 4));
    x_2 = _mm_loadu_si128((__m128i*) (x + 8));
    x_3 = _mm_loadu_si128((__m128i*) (x + 12));

    /* Each iteration is one ChaCha double round: a column round followed by
     * a diagonal round.  The four quarter-rounds of each half run in
     * parallel across the four SSE lanes; the 0x93/0x4e/0x39 lane rotations
     * below diagonalize the state between the two halves and undo it after. */
    for (i = 0; i < ROUNDS; i += 2) {
        /* Column round, step 1: a += b; d ^= a; d = rotl(d, 16). */
        x_0 = _mm_add_epi32(x_0, x_1);
        x_3 = _mm_xor_si128(x_3, x_0);
        x_3 = _mm_shuffle_epi8(x_3, rot16);

        /* c += d; b ^= c. */
        x_2 = _mm_add_epi32(x_2, x_3);
        x_1 = _mm_xor_si128(x_1, x_2);

        /* b = rotl(b, 12) via shift pair: (b << 12) ^ (b >> 20). */
        t_1 = x_1;
        x_1 = _mm_slli_epi32(x_1, 12);
        t_1 = _mm_srli_epi32(t_1, 20);
        x_1 = _mm_xor_si128(x_1, t_1);

        /* a += b; d ^= a; d = rotl(d, 8); rotate row a lanes left by 3
         * (0x93) to begin diagonalizing for the diagonal round. */
        x_0 = _mm_add_epi32(x_0, x_1);
        x_3 = _mm_xor_si128(x_3, x_0);
        x_0 = _mm_shuffle_epi32(x_0, 0x93);
        x_3 = _mm_shuffle_epi8(x_3, rot8);

        /* c += d; b ^= c; finish diagonalization (d lanes by 2 = 0x4e,
         * c lanes by 1 = 0x39). */
        x_2 = _mm_add_epi32(x_2, x_3);
        x_3 = _mm_shuffle_epi32(x_3, 0x4e);
        x_1 = _mm_xor_si128(x_1, x_2);
        x_2 = _mm_shuffle_epi32(x_2, 0x39);

        /* b = rotl(b, 7): (b << 7) ^ (b >> 25). */
        t_1 = x_1;
        x_1 = _mm_slli_epi32(x_1, 7);
        t_1 = _mm_srli_epi32(t_1, 25);
        x_1 = _mm_xor_si128(x_1, t_1);

        /* Diagonal round, same quarter-round sequence on the rotated rows. */
        x_0 = _mm_add_epi32(x_0, x_1);
        x_3 = _mm_xor_si128(x_3, x_0);
        x_3 = _mm_shuffle_epi8(x_3, rot16);

        x_2 = _mm_add_epi32(x_2, x_3);
        x_1 = _mm_xor_si128(x_1, x_2);

        t_1 = x_1;
        x_1 = _mm_slli_epi32(x_1, 12);
        t_1 = _mm_srli_epi32(t_1, 20);
        x_1 = _mm_xor_si128(x_1, t_1);

        /* Undo the diagonalization with the inverse lane rotations
         * (a by 1 = 0x39, d by 2 = 0x4e, c by 3 = 0x93 below). */
        x_0 = _mm_add_epi32(x_0, x_1);
        x_3 = _mm_xor_si128(x_3, x_0);
        x_0 = _mm_shuffle_epi32(x_0, 0x39);
        x_3 = _mm_shuffle_epi8(x_3, rot8);

        x_2 = _mm_add_epi32(x_2, x_3);
        x_3 = _mm_shuffle_epi32(x_3, 0x4e);
        x_1 = _mm_xor_si128(x_1, x_2);
        x_2 = _mm_shuffle_epi32(x_2, 0x93);

        t_1 = x_1;
        x_1 = _mm_slli_epi32(x_1, 7);
        t_1 = _mm_srli_epi32(t_1, 25);
        x_1 = _mm_xor_si128(x_1, t_1);
    }
    /* Feed-forward: add the original input state to the permuted state. */
    x_0 = _mm_add_epi32(x_0, _mm_loadu_si128((__m128i*) (x + 0)));
    x_1 = _mm_add_epi32(x_1, _mm_loadu_si128((__m128i*) (x + 4)));
    x_2 = _mm_add_epi32(x_2, _mm_loadu_si128((__m128i*) (x + 8)));
    x_3 = _mm_add_epi32(x_3, _mm_loadu_si128((__m128i*) (x + 12)));
    /* Spill the 64-byte keystream block to a stack buffer so the trailing
     * (bytes < 64) XOR can be done bytewise. */
    _mm_storeu_si128((__m128i*) (partialblock + 0), x_0);
    _mm_storeu_si128((__m128i*) (partialblock + 16), x_1);
    _mm_storeu_si128((__m128i*) (partialblock + 32), x_2);
    _mm_storeu_si128((__m128i*) (partialblock + 48), x_3);

    /* Encrypt/decrypt the remaining bytes: c = m XOR keystream. */
    for (i = 0; i < bytes; i++) {
        c[i] = m[i] ^ partialblock[i];
    }

    /* Wipe the keystream from the stack; sodium_memzero resists being
     * optimized away (unlike plain memset). */
    sodium_memzero(partialblock, sizeof partialblock);
}