1 #ifndef blamka_round_ssse3_H 2 #define blamka_round_ssse3_H 3 4 #include "private/common.h" 5 #include "private/sse2_64_32.h" 6 7 #define r16 \ 8 (_mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9)) 9 #define r24 \ 10 (_mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10)) 11 #define _mm_roti_epi64(x, c) \ 12 (-(c) == 32) \ 13 ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1)) \ 14 : (-(c) == 24) \ 15 ? _mm_shuffle_epi8((x), r24) \ 16 : (-(c) == 16) \ 17 ? _mm_shuffle_epi8((x), r16) \ 18 : (-(c) == 63) \ 19 ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), \ 20 _mm_add_epi64((x), (x))) \ 21 : _mm_xor_si128(_mm_srli_epi64((x), -(c)), \ 22 _mm_slli_epi64((x), 64 - (-(c)))) 23 24 static inline __m128i 25 fBlaMka(__m128i x, __m128i y) 26 { 27 const __m128i z = _mm_mul_epu32(x, y); 28 return _mm_add_epi64(_mm_add_epi64(x, y), _mm_add_epi64(z, z)); 29 } 30 31 #define G1(A0, B0, C0, D0, A1, B1, C1, D1) \ 32 do { \ 33 A0 = fBlaMka(A0, B0); \ 34 A1 = fBlaMka(A1, B1); \ 35 \ 36 D0 = _mm_xor_si128(D0, A0); \ 37 D1 = _mm_xor_si128(D1, A1); \ 38 \ 39 D0 = _mm_roti_epi64(D0, -32); \ 40 D1 = _mm_roti_epi64(D1, -32); \ 41 \ 42 C0 = fBlaMka(C0, D0); \ 43 C1 = fBlaMka(C1, D1); \ 44 \ 45 B0 = _mm_xor_si128(B0, C0); \ 46 B1 = _mm_xor_si128(B1, C1); \ 47 \ 48 B0 = _mm_roti_epi64(B0, -24); \ 49 B1 = _mm_roti_epi64(B1, -24); \ 50 } while ((void) 0, 0) 51 52 #define G2(A0, B0, C0, D0, A1, B1, C1, D1) \ 53 do { \ 54 A0 = fBlaMka(A0, B0); \ 55 A1 = fBlaMka(A1, B1); \ 56 \ 57 D0 = _mm_xor_si128(D0, A0); \ 58 D1 = _mm_xor_si128(D1, A1); \ 59 \ 60 D0 = _mm_roti_epi64(D0, -16); \ 61 D1 = _mm_roti_epi64(D1, -16); \ 62 \ 63 C0 = fBlaMka(C0, D0); \ 64 C1 = fBlaMka(C1, D1); \ 65 \ 66 B0 = _mm_xor_si128(B0, C0); \ 67 B1 = _mm_xor_si128(B1, C1); \ 68 \ 69 B0 = _mm_roti_epi64(B0, -63); \ 70 B1 = _mm_roti_epi64(B1, -63); \ 71 } while ((void) 0, 0) 72 73 #define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \ 74 do { \ 75 __m128i t0 = _mm_alignr_epi8(B1, B0, 8); \ 76 __m128i t1 = _mm_alignr_epi8(B0, B1, 8); \ 77 B0 = t0; \ 78 B1 = t1; \ 79 \ 80 t0 = C0; \ 81 C0 = C1; \ 82 C1 = t0; \ 83 \ 84 t0 = _mm_alignr_epi8(D1, D0, 8); \ 85 t1 = _mm_alignr_epi8(D0, D1, 8); \ 86 D0 = t1; \ 87 D1 = t0; \ 88 } while ((void) 0, 0) 89 90 #define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \ 91 do { \ 92 __m128i t0 = _mm_alignr_epi8(B0, B1, 8); \ 93 __m128i t1 = _mm_alignr_epi8(B1, B0, 8); \ 94 B0 = t0; \ 95 B1 = t1; \ 96 \ 97 t0 = C0; \ 98 C0 = C1; \ 99 C1 = t0; \ 100 \ 101 t0 = _mm_alignr_epi8(D0, D1, 8); \ 102 t1 = _mm_alignr_epi8(D1, D0, 8); \ 103 D0 = t1; \ 104 D1 = t0; \ 105 } while ((void) 0, 0) 106 107 #define BLAKE2_ROUND(A0, A1, B0, B1, C0, C1, D0, D1) \ 108 do { \ 109 G1(A0, B0, C0, D0, A1, B1, C1, D1); \ 110 G2(A0, B0, C0, D0, A1, B1, C1, D1); \ 111 \ 112 DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \ 113 \ 114 G1(A0, B0, C0, D0, A1, B1, C1, D1); \ 115 G2(A0, B0, C0, D0, A1, B1, C1, D1); \ 116 \ 117 UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \ 118 } while ((void) 0, 0) 119 120 #endif 121