xref: /freebsd/contrib/bearssl/src/hash/ghash_pwr8.c (revision 81ad6265)
1 /*
2  * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining
5  * a copy of this software and associated documentation files (the
6  * "Software"), to deal in the Software without restriction, including
7  * without limitation the rights to use, copy, modify, merge, publish,
8  * distribute, sublicense, and/or sell copies of the Software, and to
9  * permit persons to whom the Software is furnished to do so, subject to
10  * the following conditions:
11  *
12  * The above copyright notice and this permission notice shall be
13  * included in all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 
25 #define BR_POWER_ASM_MACROS   1
26 #include "inner.h"
27 
28 /*
29  * This is the GHASH implementation that leverages the POWER8 opcodes.
30  */
31 
32 #if BR_POWER8
33 
/*
 * Some symbolic names for registers. Each value is the number of a
 * vector register (v0 to v8), as used for operands of the opcode macros.
 * In the HBn names, 'n' is the byte value held by the register, not the
 * register number (e.g. HB6 is v3 and HB7 is v4).
 *   HB0 = 16 bytes of value 0
 *   HB1 = 16 bytes of value 1
 *   HB2 = 16 bytes of value 2
 *   HB6 = 16 bytes of value 6
 *   HB7 = 16 bytes of value 7
 *   TT0, TT1 and TT2 are temporaries
 *
 * BSW holds the pattern for byteswapping 32-bit words; this is set only
 * on little-endian systems. XBSW is the same register with the +32 offset
 * for access with the VSX opcodes.
 */
#define HB0     0
#define HB1     1
#define HB2     2
#define HB6     3
#define HB7     4
#define TT0     5
#define TT1     6
#define TT2     7

#define BSW     8
#define XBSW   40
58 
/*
 * Macro to initialise the constants. It must be expanded at the start
 * of the assembly block, before any use of SL_256, REDUCE_F128 or
 * FIX_ENDIAN:
 *   - HB0 is cleared by XORing it with itself;
 *   - HB1, HB2, HB6 and HB7 receive their value in each of their 16 bytes
 *     (vspltisb);
 *   - INIT_BSW loads the byteswap pattern (little-endian builds only; it
 *     expands to nothing otherwise).
 */
#define INIT \
		vxor(HB0, HB0, HB0) \
		vspltisb(HB1, 1) \
		vspltisb(HB2, 2) \
		vspltisb(HB6, 6) \
		vspltisb(HB7, 7) \
		INIT_BSW
69 
/*
 * Fix endianness of a value after reading it or before writing it, if
 * necessary.
 *
 * On little-endian builds (BR_POWER8_LE), INIT_BSW loads the permutation
 * pattern into BSW (through its VSX name XBSW) from the array designated
 * by the asm operand %[idx2be], which the enclosing asm statement must
 * therefore provide; FIX_ENDIAN(xx) then permutes register xx in place
 * with that pattern. On other builds, both macros expand to nothing.
 */
#if BR_POWER8_LE
#define INIT_BSW         lxvw4x(XBSW, 0, %[idx2be])
#define FIX_ENDIAN(xx)   vperm(xx, xx, xx, BSW)
#else
#define INIT_BSW
#define FIX_ENDIAN(xx)
#endif
81 
/*
 * Shift the 256-bit value x0:x1 left by one bit (x0 is the upper half).
 * This is a corrective action needed because GHASH is defined in full
 * little-endian specification, while the opcodes use full big-endian
 * convention, so the 255-bit product ends up one bit to the right.
 *
 * The bit that leaves the top of x1 is carried into the bottom of x0:
 * TT0 first receives the top byte of x1 in its lowest byte (vsldoi with
 * HB0), then is shifted right by 7 bits so that only that top bit
 * remains, and is finally XORed into the shifted x0.
 *
 * x0 and x1 are modified in place; TT0 is clobbered. HB0, HB1 and HB7
 * must have been set (see INIT).
 */
#define SL_256(x0, x1) \
		vsldoi(TT0, HB0, x1, 1) \
		vsl(x0, x0, HB1) \
		vsr(TT0, TT0, HB7) \
		vsl(x1, x1, HB1) \
		vxor(x0, x0, TT0)
94 
/*
 * Reduce x0:x1 in GF(2^128), result in xd (register xd may be the same as
 * x0 or x1, or a different register). x0 and x1 are modified.
 *
 * The sequence runs in two passes of the same shape: x1, and its copies
 * shifted right by 1, 2 and 7 bits, are XORed into x0. Between the two
 * passes, x1 is replaced with a value rebuilt from its lowest byte
 * (moved to the top by vsldoi, then shifted left by 6 and by 1 bit, the
 * two results being XORed together).
 * NOTE(review): the 1/2/7 shift counts presumably match the x, x^2 and
 * x^7 terms of the GHASH modulus -- confirm against the specification.
 *
 * xd is written only by the last instruction, after the last read of
 * x1, which is why it may alias either input. TT0, TT1 and TT2 are
 * clobbered. HB0, HB1, HB2, HB6 and HB7 must have been set (see INIT).
 */
#define REDUCE_F128(xd, x0, x1) \
		vxor(x0, x0, x1) \
		vsr(TT0, x1, HB1) \
		vsr(TT1, x1, HB2) \
		vsr(TT2, x1, HB7) \
		vxor(x0, x0, TT0) \
		vxor(TT1, TT1, TT2) \
		vxor(x0, x0, TT1) \
		vsldoi(x1, x1, HB0, 15) \
		vsl(TT1, x1, HB6) \
		vsl(TT2, x1, HB1) \
		vxor(x1, TT1, TT2) \
		vsr(TT0, x1, HB1) \
		vsr(TT1, x1, HB2) \
		vsr(TT2, x1, HB7) \
		vxor(x0, x0, x1) \
		vxor(x0, x0, TT0) \
		vxor(TT1, TT1, TT2) \
		vxor(xd, x0, TT1)
118 
119 /* see bearssl_hash.h */
120 void
121 br_ghash_pwr8(void *y, const void *h, const void *data, size_t len)
122 {
123 	const unsigned char *buf1, *buf2;
124 	size_t num4, num1;
125 	unsigned char tmp[64];
126 	long cc0, cc1, cc2, cc3;
127 
128 #if BR_POWER8_LE
129 	static const uint32_t idx2be[] = {
130 		0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
131 	};
132 #endif
133 
134 	buf1 = data;
135 
136 	/*
137 	 * Assembly code requires data into two chunks; first chunk
138 	 * must contain a number of blocks which is a multiple of 4.
139 	 * Since the processing for the first chunk is faster, we want
140 	 * to make it as big as possible.
141 	 *
142 	 * For the remainder, there are two possibilities:
143 	 *  -- if the remainder size is a multiple of 16, then use it
144 	 *     in place;
145 	 *  -- otherwise, copy it to the tmp[] array and pad it with
146 	 *     zeros.
147 	 */
148 	num4 = len >> 6;
149 	buf2 = buf1 + (num4 << 6);
150 	len &= 63;
151 	num1 = (len + 15) >> 4;
152 	if ((len & 15) != 0) {
153 		memcpy(tmp, buf2, len);
154 		memset(tmp + len, 0, (num1 << 4) - len);
155 		buf2 = tmp;
156 	}
157 
158 	cc0 =  0;
159 	cc1 = 16;
160 	cc2 = 32;
161 	cc3 = 48;
162 	asm volatile (
163 		INIT
164 
165 		/*
166 		 * Load current h (denoted hereafter h1) in v9.
167 		 */
168 		lxvw4x(41, 0, %[h])
169 		FIX_ENDIAN(9)
170 
171 		/*
172 		 * Load current y into v28.
173 		 */
174 		lxvw4x(60, 0, %[y])
175 		FIX_ENDIAN(28)
176 
177 		/*
178 		 * Split h1 into three registers:
179 		 *   v17 = h1_1:h1_0
180 		 *   v18 =    0:h1_0
181 		 *   v19 = h1_1:0
182 		 */
183 		xxpermdi(49, 41, 41, 2)
184 		vsldoi(18, HB0, 9, 8)
185 		vsldoi(19, 9, HB0, 8)
186 
187 		/*
188 		 * If num4 is 0, skip directly to the second chunk.
189 		 */
190 		cmpldi(%[num4], 0)
191 		beq(chunk1)
192 
193 		/*
194 		 * Compute h2 = h*h in v10.
195 		 */
196 		vpmsumd(10, 18, 18)
197 		vpmsumd(11, 19, 19)
198 		SL_256(10, 11)
199 		REDUCE_F128(10, 10, 11)
200 
201 		/*
202 		 * Compute h3 = h*h*h in v11.
203 		 * We first split h2 into:
204 		 *   v10 = h2_0:h2_1
205 		 *   v11 =    0:h2_0
206 		 *   v12 = h2_1:0
207 		 * Then we do the product with h1, and reduce into v11.
208 		 */
209 		vsldoi(11, HB0, 10, 8)
210 		vsldoi(12, 10, HB0, 8)
211 		vpmsumd(13, 10, 17)
212 		vpmsumd(11, 11, 18)
213 		vpmsumd(12, 12, 19)
214 		vsldoi(14, HB0, 13, 8)
215 		vsldoi(15, 13, HB0, 8)
216 		vxor(11, 11, 14)
217 		vxor(12, 12, 15)
218 		SL_256(11, 12)
219 		REDUCE_F128(11, 11, 12)
220 
221 		/*
222 		 * Compute h4 = h*h*h*h in v12. This is done by squaring h2.
223 		 */
224 		vsldoi(12, HB0, 10, 8)
225 		vsldoi(13, 10, HB0, 8)
226 		vpmsumd(12, 12, 12)
227 		vpmsumd(13, 13, 13)
228 		SL_256(12, 13)
229 		REDUCE_F128(12, 12, 13)
230 
231 		/*
232 		 * Repack h1, h2, h3 and h4:
233 		 *   v13 = h4_0:h3_0
234 		 *   v14 = h4_1:h3_1
235 		 *   v15 = h2_0:h1_0
236 		 *   v16 = h2_1:h1_1
237 		 */
238 		xxpermdi(45, 44, 43, 0)
239 		xxpermdi(46, 44, 43, 3)
240 		xxpermdi(47, 42, 41, 0)
241 		xxpermdi(48, 42, 41, 3)
242 
243 		/*
244 		 * Loop for each group of four blocks.
245 		 */
246 		mtctr(%[num4])
247 	label(loop4)
248 		/*
249 		 * Read the four next blocks.
250 		 *   v20 = y + a0 = b0
251 		 *   v21 = a1     = b1
252 		 *   v22 = a2     = b2
253 		 *   v23 = a3     = b3
254 		 */
255 		lxvw4x(52, %[cc0], %[buf1])
256 		lxvw4x(53, %[cc1], %[buf1])
257 		lxvw4x(54, %[cc2], %[buf1])
258 		lxvw4x(55, %[cc3], %[buf1])
259 		FIX_ENDIAN(20)
260 		FIX_ENDIAN(21)
261 		FIX_ENDIAN(22)
262 		FIX_ENDIAN(23)
263 		addi(%[buf1], %[buf1], 64)
264 		vxor(20, 20, 28)
265 
266 		/*
267 		 * Repack the blocks into v9, v10, v11 and v12.
268 		 *   v9  = b0_0:b1_0
269 		 *   v10 = b0_1:b1_1
270 		 *   v11 = b2_0:b3_0
271 		 *   v12 = b2_1:b3_1
272 		 */
273 		xxpermdi(41, 52, 53, 0)
274 		xxpermdi(42, 52, 53, 3)
275 		xxpermdi(43, 54, 55, 0)
276 		xxpermdi(44, 54, 55, 3)
277 
278 		/*
279 		 * Compute the products.
280 		 *   v20 = b0_0*h4_0 + b1_0*h3_0
281 		 *   v21 = b0_1*h4_0 + b1_1*h3_0
282 		 *   v22 = b0_0*h4_1 + b1_0*h3_1
283 		 *   v23 = b0_1*h4_1 + b1_1*h3_1
284 		 *   v24 = b2_0*h2_0 + b3_0*h1_0
285 		 *   v25 = b2_1*h2_0 + b3_1*h1_0
286 		 *   v26 = b2_0*h2_1 + b3_0*h1_1
287 		 *   v27 = b2_1*h2_1 + b3_1*h1_1
288 		 */
289 		vpmsumd(20, 13,  9)
290 		vpmsumd(21, 13, 10)
291 		vpmsumd(22, 14,  9)
292 		vpmsumd(23, 14, 10)
293 		vpmsumd(24, 15, 11)
294 		vpmsumd(25, 15, 12)
295 		vpmsumd(26, 16, 11)
296 		vpmsumd(27, 16, 12)
297 
298 		/*
299 		 * Sum products into a single 256-bit result in v11:v12.
300 		 */
301 		vxor(11, 20, 24)
302 		vxor(12, 23, 27)
303 		vxor( 9, 21, 22)
304 		vxor(10, 25, 26)
305 		vxor(20,  9, 10)
306 		vsldoi( 9, HB0, 20, 8)
307 		vsldoi(10, 20, HB0, 8)
308 		vxor(11, 11, 9)
309 		vxor(12, 12, 10)
310 
311 		/*
312 		 * Fix and reduce in GF(2^128); this is the new y (in v28).
313 		 */
314 		SL_256(11, 12)
315 		REDUCE_F128(28, 11, 12)
316 
317 		/*
318 		 * Loop for next group of four blocks.
319 		 */
320 		bdnz(loop4)
321 
322 		/*
323 		 * Process second chunk, one block at a time.
324 		 */
325 	label(chunk1)
326 		cmpldi(%[num1], 0)
327 		beq(done)
328 
329 		mtctr(%[num1])
330 	label(loop1)
331 		/*
332 		 * Load next data block and XOR it into y.
333 		 */
334 		lxvw4x(41, 0, %[buf2])
335 #if BR_POWER8_LE
336 		FIX_ENDIAN(9)
337 #endif
338 		addi(%[buf2], %[buf2], 16)
339 		vxor(9, 28, 9)
340 
341 		/*
342 		 * Split y into doublewords:
343 		 *   v9  = y_0:y_1
344 		 *   v10 =   0:y_0
345 		 *   v11 = y_1:0
346 		 */
347 		vsldoi(10, HB0, 9, 8)
348 		vsldoi(11, 9, HB0, 8)
349 
350 		/*
351 		 * Compute products with h:
352 		 *   v12 = y_0 * h_0
353 		 *   v13 = y_1 * h_1
354 		 *   v14 = y_1 * h_0 + y_0 * h_1
355 		 */
356 		vpmsumd(14,  9, 17)
357 		vpmsumd(12, 10, 18)
358 		vpmsumd(13, 11, 19)
359 
360 		/*
361 		 * Propagate v14 into v12:v13 to finalise product.
362 		 */
363 		vsldoi(10, HB0, 14, 8)
364 		vsldoi(11, 14, HB0, 8)
365 		vxor(12, 12, 10)
366 		vxor(13, 13, 11)
367 
368 		/*
369 		 * Fix result and reduce into v28 (next value for y).
370 		 */
371 		SL_256(12, 13)
372 		REDUCE_F128(28, 12, 13)
373 		bdnz(loop1)
374 
375 	label(done)
376 		/*
377 		 * Write back the new y.
378 		 */
379 		FIX_ENDIAN(28)
380 		stxvw4x(60, 0, %[y])
381 
382 : [buf1] "+b" (buf1), [buf2] "+b" (buf2)
383 : [y] "b" (y), [h] "b" (h), [num4] "b" (num4), [num1] "b" (num1),
384   [cc0] "b" (cc0), [cc1] "b" (cc1), [cc2] "b" (cc2), [cc3] "b" (cc3)
385 #if BR_POWER8_LE
386 	, [idx2be] "b" (idx2be)
387 #endif
388 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
389   "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
390   "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
391   "ctr", "memory"
392 	);
393 }
394 
395 /* see bearssl_hash.h */
396 br_ghash
397 br_ghash_pwr8_get(void)
398 {
399 	return &br_ghash_pwr8;
400 }
401 
402 #else
403 
404 /* see bearssl_hash.h */
405 br_ghash
406 br_ghash_pwr8_get(void)
407 {
408 	return 0;
409 }
410 
411 #endif
412