1 /*
2 * Argon2 reference source code package - reference C implementations
3 *
4 * Copyright 2015
5 * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
6 *
7 * You may use this work under the terms of a Creative Commons CC0 1.0
8 * License/Waiver or the Apache Public License 2.0, at your option. The terms of
9 * these licenses can be found at:
10 *
11 * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
12 * - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0
13 *
14 * You should have received a copy of both of these licenses along with this
15 * software. If not, they may be obtained at the above URLs.
16 */
17
18 #ifndef BLAKE_ROUND_MKA_OPT_H
19 #define BLAKE_ROUND_MKA_OPT_H
20
21 #include "blake2-impl.h"
22
23 #include <emmintrin.h>
24 #if defined(__SSSE3__)
25 #include <tmmintrin.h> /* for _mm_shuffle_epi8 and _mm_alignr_epi8 */
26 #endif
27
28 #if defined(__XOP__) && (defined(__GNUC__) || defined(__clang__))
29 #include <x86intrin.h>
30 #endif
31
#if !defined(__XOP__)
#if defined(__SSSE3__)
/* Byte-shuffle masks for rotating each 64-bit lane right by 16 and 24 bits;
 * a single pshufb is cheaper than the shift/shift/xor sequence. */
#define r16 \
    (_mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9))
#define r24 \
    (_mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
/* Rotate each 64-bit lane of x right by -c bits (c is passed negative to
 * match the XOP _mm_roti_epi64 intrinsic's convention).  The rotation
 * counts used by BLAKE2 (32, 24, 16, 63) get cheaper special-case
 * sequences; any other count falls back to shift/shift/xor.  The whole
 * expansion is parenthesized so the macro is safe inside larger
 * expressions. */
#define _mm_roti_epi64(x, c) \
    ((-(c) == 32) \
         ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1)) \
         : (-(c) == 24) \
               ? _mm_shuffle_epi8((x), r24) \
               : (-(c) == 16) \
                     ? _mm_shuffle_epi8((x), r16) \
                     : (-(c) == 63) \
                           ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
                                           _mm_add_epi64((x), (x))) \
                           : _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
                                           _mm_slli_epi64((x), 64 - (-(c)))))
#else /* defined(__SSE2__) */
/* Generic SSE2 rotate-right of each 64-bit lane by -c bits. */
#define _mm_roti_epi64(r, c) \
    _mm_xor_si128(_mm_srli_epi64((r), -(c)), _mm_slli_epi64((r), 64 - (-(c))))
#endif
#else
/* __XOP__ builds: the compiler supplies _mm_roti_epi64 as an intrinsic. */
#endif
56
fBlaMka(__m128i x,__m128i y)57 static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
58 const __m128i z = _mm_mul_epu32(x, y);
59 return _mm_add_epi64(_mm_add_epi64(x, y), _mm_add_epi64(z, z));
60 }
61
/*
 * First half of the Argon2 variant of the BLAKE2b G function, applied to
 * two independent register sets in parallel (A0..D0 and A1..D1, each an
 * __m128i holding two 64-bit state words).  BLAKE2b's "a = a + b" step is
 * replaced by the multiplicative fBlaMka mix; this half performs the
 * 32-bit and 24-bit right rotations on D and B respectively.
 */
#define G1(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        A0 = fBlaMka(A0, B0); \
        A1 = fBlaMka(A1, B1); \
\
        D0 = _mm_xor_si128(D0, A0); \
        D1 = _mm_xor_si128(D1, A1); \
\
        D0 = _mm_roti_epi64(D0, -32); \
        D1 = _mm_roti_epi64(D1, -32); \
\
        C0 = fBlaMka(C0, D0); \
        C1 = fBlaMka(C1, D1); \
\
        B0 = _mm_xor_si128(B0, C0); \
        B1 = _mm_xor_si128(B1, C1); \
\
        B0 = _mm_roti_epi64(B0, -24); \
        B1 = _mm_roti_epi64(B1, -24); \
    } while ((void)0, 0)
82
/*
 * Second half of the Argon2 G function (same structure as G1 but with the
 * 16-bit and 63-bit right rotations on D and B).  Together G1 + G2 make up
 * one full G application to the two register sets.
 */
#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        A0 = fBlaMka(A0, B0); \
        A1 = fBlaMka(A1, B1); \
\
        D0 = _mm_xor_si128(D0, A0); \
        D1 = _mm_xor_si128(D1, A1); \
\
        D0 = _mm_roti_epi64(D0, -16); \
        D1 = _mm_roti_epi64(D1, -16); \
\
        C0 = fBlaMka(C0, D0); \
        C1 = fBlaMka(C1, D1); \
\
        B0 = _mm_xor_si128(B0, C0); \
        B1 = _mm_xor_si128(B1, C1); \
\
        B0 = _mm_roti_epi64(B0, -63); \
        B1 = _mm_roti_epi64(B1, -63); \
    } while ((void)0, 0)
103
#if defined(__SSSE3__)
/*
 * Rotate the B, C, and D rows of the 4x4 state so the following G1/G2 pass
 * operates on the diagonals instead of the columns.  Each logical row spans
 * the register pair (X0, X1), two 64-bit words per register;
 * _mm_alignr_epi8(hi, lo, 8) concatenates a pair and shifts by 8 bytes to
 * move one 64-bit word across the register boundary.  The A row is left
 * untouched (its rotation is by 0).
 */
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        __m128i t0 = _mm_alignr_epi8(B1, B0, 8); \
        __m128i t1 = _mm_alignr_epi8(B0, B1, 8); \
        B0 = t0; \
        B1 = t1; \
\
        t0 = C0; \
        C0 = C1; \
        C1 = t0; \
\
        t0 = _mm_alignr_epi8(D1, D0, 8); \
        t1 = _mm_alignr_epi8(D0, D1, 8); \
        D0 = t1; \
        D1 = t0; \
    } while ((void)0, 0)

/* Inverse of DIAGONALIZE: the alignr/swap operands are mirrored to restore
 * column order after the diagonal G pass. */
#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        __m128i t0 = _mm_alignr_epi8(B0, B1, 8); \
        __m128i t1 = _mm_alignr_epi8(B1, B0, 8); \
        B0 = t0; \
        B1 = t1; \
\
        t0 = C0; \
        C0 = C1; \
        C1 = t0; \
\
        t0 = _mm_alignr_epi8(D0, D1, 8); \
        t1 = _mm_alignr_epi8(D1, D0, 8); \
        D0 = t1; \
        D1 = t0; \
    } while ((void)0, 0)
#else /* SSE2 */
/* SSE2 fallback: palignr is unavailable, so the same word rotations are
 * built from unpacklo/unpackhi 64-bit shuffles.  Statement order matters:
 * t0/t1 snapshot registers that are overwritten before their old values
 * are consumed. */
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        __m128i t0 = D0; \
        __m128i t1 = B0; \
        D0 = C0; \
        C0 = C1; \
        C1 = D0; \
        D0 = _mm_unpackhi_epi64(D1, _mm_unpacklo_epi64(t0, t0)); \
        D1 = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(D1, D1)); \
        B0 = _mm_unpackhi_epi64(B0, _mm_unpacklo_epi64(B1, B1)); \
        B1 = _mm_unpackhi_epi64(B1, _mm_unpacklo_epi64(t1, t1)); \
    } while ((void)0, 0)

/* SSE2 inverse of DIAGONALIZE; restores column order. */
#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        __m128i t0, t1; \
        t0 = C0; \
        C0 = C1; \
        C1 = t0; \
        t0 = B0; \
        t1 = D0; \
        B0 = _mm_unpackhi_epi64(B1, _mm_unpacklo_epi64(B0, B0)); \
        B1 = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(B1, B1)); \
        D0 = _mm_unpackhi_epi64(D0, _mm_unpacklo_epi64(D1, D1)); \
        D1 = _mm_unpackhi_epi64(D1, _mm_unpacklo_epi64(t1, t1)); \
    } while ((void)0, 0)
#endif
166
/*
 * One full round of the Argon2 permutation over a 4x4 state of 64-bit
 * words held in eight __m128i registers: a column step (G1 + G2), a
 * diagonalization of the state, a diagonal step (G1 + G2), then the
 * inverse shuffle to restore column order.
 */
#define BLAKE2_ROUND(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
        G1(A0, B0, C0, D0, A1, B1, C1, D1); \
        G2(A0, B0, C0, D0, A1, B1, C1, D1); \
\
        DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
\
        G1(A0, B0, C0, D0, A1, B1, C1, D1); \
        G2(A0, B0, C0, D0, A1, B1, C1, D1); \
\
        UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
    } while ((void)0, 0)
179
180 #endif
181