1 /*
2  * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining
5  * a copy of this software and associated documentation files (the
6  * "Software"), to deal in the Software without restriction, including
7  * without limitation the rights to use, copy, modify, merge, publish,
8  * distribute, sublicense, and/or sell copies of the Software, and to
9  * permit persons to whom the Software is furnished to do so, subject to
10  * the following conditions:
11  *
12  * The above copyright notice and this permission notice shall be
13  * included in all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 
25 #define BR_ENABLE_INTRINSICS   1
26 #include "inner.h"
27 
28 /*
29  * This is the GHASH implementation that leverages the pclmulqdq opcode
30  * (from the AES-NI instructions).
31  */
32 
33 #if BR_AES_X86NI
34 
/*
 * Runtime test for CPU support of the PCLMULQDQ opcode.
 *
 * The answer is whatever br_cpuid() returns when asked for a single
 * feature bit in ECX; all other feature masks passed to it are zero.
 */
static inline int
pclmul_supported(void)
{
	enum {
		/* Feature bit 1 of ECX: PCLMULQDQ support. */
		ECX_PCLMULQDQ = 0x00000002
	};

	return br_cpuid(0, 0, ECX_PCLMULQDQ, 0);
}
47 
48 /* see bearssl_hash.h */
49 br_ghash
50 br_ghash_pclmul_get(void)
51 {
52 	return pclmul_supported() ? &br_ghash_pclmul : 0;
53 }
54 
55 BR_TARGETS_X86_UP
56 
57 /*
58  * GHASH is defined over elements of GF(2^128) with "full little-endian"
59  * representation: leftmost byte is least significant, and, within each
60  * byte, leftmost _bit_ is least significant. The natural ordering in
61  * x86 is "mixed little-endian": bytes are ordered from least to most
62  * significant, but bits within a byte are in most-to-least significant
63  * order. Going to full little-endian representation would require
64  * reversing bits within each byte, which is doable but expensive.
65  *
 * Instead, we go to full big-endian representation, by swapping bytes
 * around, which is done with a single _mm_shuffle_epi8() call (the
 * pshufb opcode, which comes with SSSE3; all CPUs that offer pclmulqdq
 * also have SSSE3). We can use a full big-endian representation because
 * in a carryless multiplication, we have a nice bit reversal property:
 *
 *    rev_128(x) * rev_128(y) = rev_255(x * y)
 *
 * So by using full big-endian, we still get the right result, except
 * that it is right-shifted by 1 bit. The left-shift is relatively
 * inexpensive, and it can be shared: when several products are XORed
 * together, a single shift of the sum is enough.
77  *
78  *
 * Since SSE2 opcodes do not have facilities for shifting full 128-bit
 * values with bit precision, we have to break down values into 64-bit
 * chunks. We number chunks from 0 to 3 in left to right order.
82  */
83 
/*
 * Byte-swap a complete 128-bit value. Three macros cooperate:
 *
 *    BYTESWAP_DECL   declares the local state needed by the swap (if any)
 *    BYTESWAP_PREP   initialises that state; to be used as a statement
 *    BYTESWAP(x)     reverses, in place, the 16 bytes of the __m128i x
 *
 * The normal implementation is a single _mm_shuffle_epi8(), which gets
 * translated to pshufb (an SSSE3 opcode), driven by a permutation index
 * kept in a local variable. However, this crashes old Clang versions,
 * so, for Clang before 3.8, an alternate (and less efficient) sequence
 * is used: the two bytes of each 16-bit lane are exchanged, then the
 * four 16-bit lanes of each 64-bit half are reversed (selector 0x1B),
 * and finally the two 64-bit halves are exchanged (selector 0x4E).
 */
#if !BR_CLANG || BR_CLANG_3_8
#define BYTESWAP_DECL   __m128i byteswap_perm;
#define BYTESWAP_PREP   ((void)(byteswap_perm = _mm_set_epi8( \
		0, 1, 2, 3, 4, 5, 6, 7, \
		8, 9, 10, 11, 12, 13, 14, 15)))
#define BYTESWAP(x)   do { \
		(x) = _mm_shuffle_epi8((x), byteswap_perm); \
	} while (0)
#else
#define BYTESWAP_DECL
#define BYTESWAP_PREP   (void)0
#define BYTESWAP(x)   do { \
		__m128i bswap_val, bswap_hi; \
		bswap_val = (x); \
		bswap_hi = _mm_srli_epi16(bswap_val, 8); \
		bswap_val = _mm_or_si128( \
			_mm_slli_epi16(bswap_val, 8), bswap_hi); \
		bswap_val = _mm_shufflehi_epi16( \
			_mm_shufflelo_epi16(bswap_val, 0x1B), 0x1B); \
		(x) = _mm_shuffle_epi32(bswap_val, 0x4E); \
	} while (0)
#endif
113 
/*
 * Wrappers for the pclmulqdq opcode: pclmulqdq00(x, y) uses the
 * immediate selector 0x00 and pclmulqdq11(x, y) uses 0x11 (per the
 * opcode definition, these pick respectively the low and the high
 * 64-bit halves of both operands).
 *
 * Compilers other than Clang get the _mm_clmulepi64_si128() intrinsic.
 * Clang appears to have trouble with the intrinsic, so, for that
 * compiler, inline assembly is used instead. Inline assembly is
 * potentially a bit slower because the compiler does not understand
 * what the opcode does, and thus cannot optimize instruction
 * scheduling.
 *
 * The assembly wrappers use a target of "sse2" only, so that Clang may
 * still handle the '__m128i' type and allocate SSE2 registers.
 */
#if !BR_CLANG
#define pclmulqdq00(x, y)   _mm_clmulepi64_si128((x), (y), 0x00)
#define pclmulqdq11(x, y)   _mm_clmulepi64_si128((x), (y), 0x11)
#else
BR_TARGET("sse2")
static inline __m128i
pclmulqdq00(__m128i x, __m128i y)
{
	__m128i r;

	/* The opcode overwrites its destination register (operand %0). */
	r = x;
	__asm__ ("pclmulqdq $0x00, %1, %0" : "+x" (r) : "x" (y));
	return r;
}
BR_TARGET("sse2")
static inline __m128i
pclmulqdq11(__m128i x, __m128i y)
{
	__m128i r;

	/* The opcode overwrites its destination register (operand %0). */
	r = x;
	__asm__ ("pclmulqdq $0x11, %1, %0" : "+x" (r) : "x" (y));
	return r;
}
#endif
143 
/*
 * BK(kw, kx): set kx so that its right (low) 64-bit half is the XOR of
 * the two 64-bit halves of the 128-bit value kw. The shuffle selector
 * 0x0E brings the left half of kw down to the right half before the
 * XOR; the left half of kx is unspecified. kw is not modified.
 */
#define BK(kw, kx)   do { \
		(kx) = _mm_xor_si128( \
			_mm_shuffle_epi32((kw), 0x0E), (kw)); \
	} while (0)
151 
/*
 * PBK(k0, k1, kw, kx): from two 64-bit values held in the right (low)
 * halves of k0 and k1, build:
 *    kw   the 128-bit value k0:k1 (k0 in the left half, k1 in the right)
 *    kx   the XOR of k0 and k1
 * kw is written first, then kx.
 */
#define PBK(k0, k1, kw, kx)   do { \
		(kw) = _mm_unpacklo_epi64((k1), (k0)); \
		(kx) = _mm_xor_si128((k1), (k0)); \
	} while (0)
160 
/*
 * One step of the 256-bit left shift: the 64-bit lanes of 'hi' shifted
 * left by one bit, with the top bit of the matching lanes of 'lo'
 * brought in at the bottom.
 */
#define SL_256_CARRY(hi, lo)   _mm_or_si128( \
		_mm_slli_epi64((hi), 1), \
		_mm_srli_epi64((lo), 63))

/*
 * Left-shift by 1 bit a 256-bit value held in four 64-bit words
 * (x0 is the most significant word, x3 the least significant one).
 * The words are updated in place, from x0 down to x3, so that each
 * word still sees the original value of its lower neighbour.
 */
#define SL_256(x0, x1, x2, x3)   do { \
		(x0) = SL_256_CARRY(x0, x1); \
		(x1) = SL_256_CARRY(x1, x2); \
		(x2) = SL_256_CARRY(x2, x3); \
		(x3) = _mm_slli_epi64((x3), 1); \
	} while (0)
176 
/*
 * Contribution of a folded word v to the word located two positions
 * to its left: v itself, XORed with v right-shifted by 1, 2 and 7 bits.
 */
#define REDUCE_F128_MAIN(v)   _mm_xor_si128( \
		_mm_xor_si128((v), _mm_srli_epi64((v), 1)), \
		_mm_xor_si128( \
			_mm_srli_epi64((v), 2), \
			_mm_srli_epi64((v), 7)))

/*
 * Contribution of a folded word v to the word located one position to
 * its left: the bits dropped by the right shifts by 1, 2 and 7 bits,
 * i.e. v left-shifted by 63, 62 and 57 bits.
 */
#define REDUCE_F128_SPILL(v)   _mm_xor_si128( \
		_mm_slli_epi64((v), 63), \
		_mm_xor_si128( \
			_mm_slli_epi64((v), 62), \
			_mm_slli_epi64((v), 57)))

/*
 * Perform reduction in GF(2^128). The 256-bit value is in x0..x3;
 * result is written in x0..x1.
 *
 * Two folding rounds are applied: x3 is first folded into x1 and x2,
 * then the updated x2 is folded into x0 and x1. x2 is modified along
 * the way; x3 is only read.
 */
#define REDUCE_F128(x0, x1, x2, x3)   do { \
		(x1) = _mm_xor_si128((x1), REDUCE_F128_MAIN(x3)); \
		(x2) = _mm_xor_si128((x2), REDUCE_F128_SPILL(x3)); \
		(x0) = _mm_xor_si128((x0), REDUCE_F128_MAIN(x2)); \
		(x1) = _mm_xor_si128((x1), REDUCE_F128_SPILL(x2)); \
	} while (0)
215 
/*
 * Square value kw into (dw,dx): dw receives the 128-bit square and dx
 * the XOR of the two 64-bit halves of that square (see PBK).
 *
 * Only two carryless products are used (left half by left half, right
 * half by right half); each 128-bit product is split into two 64-bit
 * words, then the 256-bit value is left-shifted by one bit and reduced.
 */
#define SQUARE_F128(kw, dw, dx)   do { \
		__m128i sq0, sq1, sq2, sq3; \
		sq1 = pclmulqdq11(kw, kw); \
		sq0 = _mm_shuffle_epi32(sq1, 0x0E); \
		sq3 = pclmulqdq00(kw, kw); \
		sq2 = _mm_shuffle_epi32(sq3, 0x0E); \
		SL_256(sq0, sq1, sq2, sq3); \
		REDUCE_F128(sq0, sq1, sq2, sq3); \
		PBK(sq0, sq1, dw, dx); \
	} while (0)
229 
230 /* see bearssl_hash.h */
231 BR_TARGET("ssse3,pclmul")
232 void
233 br_ghash_pclmul(void *y, const void *h, const void *data, size_t len)
234 {
235 	const unsigned char *buf1, *buf2;
236 	unsigned char tmp[64];
237 	size_t num4, num1;
238 	__m128i yw, h1w, h1x;
239 	BYTESWAP_DECL
240 
241 	/*
242 	 * We split data into two chunks. First chunk starts at buf1
243 	 * and contains num4 blocks of 64-byte values. Second chunk
244 	 * starts at buf2 and contains num1 blocks of 16-byte values.
245 	 * We want the first chunk to be as large as possible.
246 	 */
247 	buf1 = data;
248 	num4 = len >> 6;
249 	len &= 63;
250 	buf2 = buf1 + (num4 << 6);
251 	num1 = (len + 15) >> 4;
252 	if ((len & 15) != 0) {
253 		memcpy(tmp, buf2, len);
254 		memset(tmp + len, 0, (num1 << 4) - len);
255 		buf2 = tmp;
256 	}
257 
258 	/*
259 	 * Preparatory step for endian conversions.
260 	 */
261 	BYTESWAP_PREP;
262 
263 	/*
264 	 * Load y and h.
265 	 */
266 	yw = _mm_loadu_si128(y);
267 	h1w = _mm_loadu_si128(h);
268 	BYTESWAP(yw);
269 	BYTESWAP(h1w);
270 	BK(h1w, h1x);
271 
272 	if (num4 > 0) {
273 		__m128i h2w, h2x, h3w, h3x, h4w, h4x;
274 		__m128i t0, t1, t2, t3;
275 
276 		/*
277 		 * Compute h2 = h^2.
278 		 */
279 		SQUARE_F128(h1w, h2w, h2x);
280 
281 		/*
282 		 * Compute h3 = h^3 = h*(h^2).
283 		 */
284 		t1 = pclmulqdq11(h1w, h2w);
285 		t3 = pclmulqdq00(h1w, h2w);
286 		t2 = _mm_xor_si128(pclmulqdq00(h1x, h2x),
287 			_mm_xor_si128(t1, t3));
288 		t0 = _mm_shuffle_epi32(t1, 0x0E);
289 		t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
290 		t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
291 		SL_256(t0, t1, t2, t3);
292 		REDUCE_F128(t0, t1, t2, t3);
293 		PBK(t0, t1, h3w, h3x);
294 
295 		/*
296 		 * Compute h4 = h^4 = (h^2)^2.
297 		 */
298 		SQUARE_F128(h2w, h4w, h4x);
299 
300 		while (num4 -- > 0) {
301 			__m128i aw0, aw1, aw2, aw3;
302 			__m128i ax0, ax1, ax2, ax3;
303 
304 			aw0 = _mm_loadu_si128((void *)(buf1 +  0));
305 			aw1 = _mm_loadu_si128((void *)(buf1 + 16));
306 			aw2 = _mm_loadu_si128((void *)(buf1 + 32));
307 			aw3 = _mm_loadu_si128((void *)(buf1 + 48));
308 			BYTESWAP(aw0);
309 			BYTESWAP(aw1);
310 			BYTESWAP(aw2);
311 			BYTESWAP(aw3);
312 			buf1 += 64;
313 
314 			aw0 = _mm_xor_si128(aw0, yw);
315 			BK(aw1, ax1);
316 			BK(aw2, ax2);
317 			BK(aw3, ax3);
318 			BK(aw0, ax0);
319 
320 			t1 = _mm_xor_si128(
321 				_mm_xor_si128(
322 					pclmulqdq11(aw0, h4w),
323 					pclmulqdq11(aw1, h3w)),
324 				_mm_xor_si128(
325 					pclmulqdq11(aw2, h2w),
326 					pclmulqdq11(aw3, h1w)));
327 			t3 = _mm_xor_si128(
328 				_mm_xor_si128(
329 					pclmulqdq00(aw0, h4w),
330 					pclmulqdq00(aw1, h3w)),
331 				_mm_xor_si128(
332 					pclmulqdq00(aw2, h2w),
333 					pclmulqdq00(aw3, h1w)));
334 			t2 = _mm_xor_si128(
335 				_mm_xor_si128(
336 					pclmulqdq00(ax0, h4x),
337 					pclmulqdq00(ax1, h3x)),
338 				_mm_xor_si128(
339 					pclmulqdq00(ax2, h2x),
340 					pclmulqdq00(ax3, h1x)));
341 			t2 = _mm_xor_si128(t2, _mm_xor_si128(t1, t3));
342 			t0 = _mm_shuffle_epi32(t1, 0x0E);
343 			t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
344 			t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
345 			SL_256(t0, t1, t2, t3);
346 			REDUCE_F128(t0, t1, t2, t3);
347 			yw = _mm_unpacklo_epi64(t1, t0);
348 		}
349 	}
350 
351 	while (num1 -- > 0) {
352 		__m128i aw, ax;
353 		__m128i t0, t1, t2, t3;
354 
355 		aw = _mm_loadu_si128((void *)buf2);
356 		BYTESWAP(aw);
357 		buf2 += 16;
358 
359 		aw = _mm_xor_si128(aw, yw);
360 		BK(aw, ax);
361 
362 		t1 = pclmulqdq11(aw, h1w);
363 		t3 = pclmulqdq00(aw, h1w);
364 		t2 = pclmulqdq00(ax, h1x);
365 		t2 = _mm_xor_si128(t2, _mm_xor_si128(t1, t3));
366 		t0 = _mm_shuffle_epi32(t1, 0x0E);
367 		t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
368 		t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
369 		SL_256(t0, t1, t2, t3);
370 		REDUCE_F128(t0, t1, t2, t3);
371 		yw = _mm_unpacklo_epi64(t1, t0);
372 	}
373 
374 	BYTESWAP(yw);
375 	_mm_storeu_si128(y, yw);
376 }
377 
378 BR_TARGETS_X86_DOWN
379 
380 #else
381 
382 /* see bearssl_hash.h */
383 br_ghash
384 br_ghash_pclmul_get(void)
385 {
386 	return 0;
387 }
388 
389 #endif
390