1#! /usr/bin/env perl
2# Copyright 2014-2018 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# GHASH for PowerISA v2.07.
18#
19# July 2014
20#
21# Accurate performance measurements are problematic, because it's
22# always virtualized setup with possibly throttled processor.
23# Relative comparison is therefore more informative. This initial
24# version is ~2.1x slower than hardware-assisted AES-128-CTR, ~12x
25# faster than "4-bit" integer-only compiler-generated 64-bit code.
26# "Initial version" means that there is room for further improvement.
27
28# May 2016
29#
30# 2x aggregated reduction improves performance by 50% (resulting
31# performance on POWER8 is 1 cycle per processed byte), and 4x
32# aggregated reduction - by 170% or 2.7x (resulting in 0.55 cpb).
33# POWER9 delivers 0.51 cpb.
34
# Command line: <flavour> [<output>].  Both values are later handed on to
# ppc-xlate.pl unchanged.
$flavour = shift;
$output  = shift;

# Per-ABI parameters, in the order
#   SIZE_T, LRSAVE, store-with-update, load, store, unsigned compare,
#   shift-right-immediate.
my %abi = (
	64 => [ 8, 2*8, "stdu", "ld",  "std", "cmpld", "srdi" ],
	32 => [ 4, 4,   "stwu", "lwz", "stw", "cmplw", "srwi" ],
);

# A flavour mentioning "64" wins over one mentioning "32"; anything else
# is rejected.
my ($bits) = grep { $flavour =~ /$_/ } (64, 32);
die "nonsense $flavour" unless defined $bits;

($SIZE_T,$LRSAVE,$STU,$POP,$PUSH,$UCMP,$SHRI) = @{ $abi{$bits} };

$sp="r1";				# stack pointer register
# Frame for the 4x code path: 6 pointer-sized slots plus 13*16 bytes for
# offloading v20-v31 (12 registers; the 13th slot presumably absorbs the
# 16-byte address rounding done by stvx/lvx -- confirm against the ISA).
$FRAME=6*$SIZE_T+13*16;
58
# Locate the ppc-xlate.pl translator: first next to this script, then in
# ../../perlasm/ relative to it.  $dir is the directory part of $0,
# including the trailing separator.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Everything printed from here on is piped through the translator.
# The low-precedence "or" is required: with "||" the die would bind to the
# (always true) command string and a failed open would go unreported.
open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";
65
# Register names used by the assembly templates in this file.
#   r3..r6   - argument registers: Xi pointer, H-table pointer, input
#              pointer and length;
#   v0..v19  - working vector registers;
#   r12      - holds the value read from SPR 256 on entry, written back
#              to SPR 256 before each routine returns.
66my ($Xip,$Htbl,$inp,$len)=map("r$_",(3..6));	# argument block
67
68my ($Xl,$Xm,$Xh,$IN)=map("v$_",(0..3));
69my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask)=map("v$_",(4..12));
70my ($Xl1,$Xm1,$Xh1,$IN1,$H2,$H2h,$H2l)=map("v$_",(13..19));
71my $vrsave="r12";
72
# First half of gcm_init_p8.  H is loaded from the address in r4 and the
# pre-computed table is stored relative to r3:
#   0x00       0xc2... reduction constant
#   0x10-0x30  Hl, H, Hh     ("twisted" H and its split halves)
#   0x40-0x60  H2l, H2, H2h  (H^2, same layout)
# The template deliberately ends in mid-routine; the block that follows
# appends the H^3/H^4 computation and the routine's epilogue.
73$code=<<___;
74.machine	"any"
75
76.text
77
78.globl	.gcm_init_p8
79.align	5
80.gcm_init_p8:
81	li		r0,-4096
82	li		r8,0x10
83	mfspr		$vrsave,256
84	li		r9,0x20
85	mtspr		256,r0
86	li		r10,0x30
87	lvx_u		$H,0,r4			# load H
88
89	vspltisb	$xC2,-16		# 0xf0
90	vspltisb	$t0,1			# one
91	vaddubm		$xC2,$xC2,$xC2		# 0xe0
92	vxor		$zero,$zero,$zero
93	vor		$xC2,$xC2,$t0		# 0xe1
94	vsldoi		$xC2,$xC2,$zero,15	# 0xe1...
95	vsldoi		$t1,$zero,$t0,1		# ...1
96	vaddubm		$xC2,$xC2,$xC2		# 0xc2...
97	vspltisb	$t2,7
98	vor		$xC2,$xC2,$t1		# 0xc2....01
99	vspltb		$t1,$H,0		# most significant byte
100	vsl		$H,$H,$t0		# H<<=1
101	vsrab		$t1,$t1,$t2		# broadcast carry bit
102	vand		$t1,$t1,$xC2
103	vxor		$IN,$H,$t1		# twisted H
104
105	vsldoi		$H,$IN,$IN,8		# twist even more ...
106	vsldoi		$xC2,$zero,$xC2,8	# 0xc2.0
107	vsldoi		$Hl,$zero,$H,8		# ... and split
108	vsldoi		$Hh,$H,$zero,8
109
110	stvx_u		$xC2,0,r3		# save pre-computed table
111	stvx_u		$Hl,r8,r3
112	li		r8,0x40
113	stvx_u		$H, r9,r3
114	li		r9,0x50
115	stvx_u		$Hh,r10,r3
116	li		r10,0x60
117
118	vpmsumd		$Xl,$IN,$Hl		# H.lo·H.lo
119	vpmsumd		$Xm,$IN,$H		# H.hi·H.lo+H.lo·H.hi
120	vpmsumd		$Xh,$IN,$Hh		# H.hi·H.hi
121
122	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase
123
124	vsldoi		$t0,$Xm,$zero,8
125	vsldoi		$t1,$zero,$Xm,8
126	vxor		$Xl,$Xl,$t0
127	vxor		$Xh,$Xh,$t1
128
129	vsldoi		$Xl,$Xl,$Xl,8
130	vxor		$Xl,$Xl,$t2
131
132	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
133	vpmsumd		$Xl,$Xl,$xC2
134	vxor		$t1,$t1,$Xh
135	vxor		$IN1,$Xl,$t1
136
137	vsldoi		$H2,$IN1,$IN1,8
138	vsldoi		$H2l,$zero,$H2,8
139	vsldoi		$H2h,$H2,$zero,8
140
141	stvx_u		$H2l,r8,r3		# save H^2
142	li		r8,0x70
143	stvx_u		$H2,r9,r3
144	li		r9,0x80
145	stvx_u		$H2h,r10,r3
146	li		r10,0x90
147___
# Second half of gcm_init_p8: computes H^3 and H^4 with two interleaved
# multiply-and-reduce streams (the instructions indented by one extra
# space belong to the second stream) and stores them at table offsets
# 0x70-0x90 (H^3) and 0xa0-0xc0 (H^4), then restores SPR 256 and returns.
# $t4..$t6 alias the registers of $Hl/$H/$Hh; those registers are
# overwritten with the H^3 result further down.
148{
149my ($t4,$t5,$t6) = ($Hl,$H,$Hh);
150$code.=<<___;
151	vpmsumd		$Xl,$IN,$H2l		# H.lo·H^2.lo
152	 vpmsumd	$Xl1,$IN1,$H2l		# H^2.lo·H^2.lo
153	vpmsumd		$Xm,$IN,$H2		# H.hi·H^2.lo+H.lo·H^2.hi
154	 vpmsumd	$Xm1,$IN1,$H2		# H^2.hi·H^2.lo+H^2.lo·H^2.hi
155	vpmsumd		$Xh,$IN,$H2h		# H.hi·H^2.hi
156	 vpmsumd	$Xh1,$IN1,$H2h		# H^2.hi·H^2.hi
157
158	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase
159	 vpmsumd	$t6,$Xl1,$xC2		# 1st reduction phase
160
161	vsldoi		$t0,$Xm,$zero,8
162	vsldoi		$t1,$zero,$Xm,8
163	 vsldoi		$t4,$Xm1,$zero,8
164	 vsldoi		$t5,$zero,$Xm1,8
165	vxor		$Xl,$Xl,$t0
166	vxor		$Xh,$Xh,$t1
167	 vxor		$Xl1,$Xl1,$t4
168	 vxor		$Xh1,$Xh1,$t5
169
170	vsldoi		$Xl,$Xl,$Xl,8
171	 vsldoi		$Xl1,$Xl1,$Xl1,8
172	vxor		$Xl,$Xl,$t2
173	 vxor		$Xl1,$Xl1,$t6
174
175	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
176	 vsldoi		$t5,$Xl1,$Xl1,8		# 2nd reduction phase
177	vpmsumd		$Xl,$Xl,$xC2
178	 vpmsumd	$Xl1,$Xl1,$xC2
179	vxor		$t1,$t1,$Xh
180	 vxor		$t5,$t5,$Xh1
181	vxor		$Xl,$Xl,$t1
182	 vxor		$Xl1,$Xl1,$t5
183
184	vsldoi		$H,$Xl,$Xl,8
185	 vsldoi		$H2,$Xl1,$Xl1,8
186	vsldoi		$Hl,$zero,$H,8
187	vsldoi		$Hh,$H,$zero,8
188	 vsldoi		$H2l,$zero,$H2,8
189	 vsldoi		$H2h,$H2,$zero,8
190
191	stvx_u		$Hl,r8,r3		# save H^3
192	li		r8,0xa0
193	stvx_u		$H,r9,r3
194	li		r9,0xb0
195	stvx_u		$Hh,r10,r3
196	li		r10,0xc0
197	 stvx_u		$H2l,r8,r3		# save H^4
198	 stvx_u		$H2,r9,r3
199	 stvx_u		$H2h,r10,r3
200
201	mtspr		256,$vrsave
202	blr
203	.long		0
204	.byte		0,12,0x14,0,0,0,2,0
205	.long		0
206.size	.gcm_init_p8,.-.gcm_init_p8
207___
208}
# gcm_gmult_p8: loads Xi from $Xip and the table entries at offsets
# 0x00-0x30 from $Htbl, performs one multiplication with the two-phase
# reduction and writes the result back to $Xip.
#
# gcm_ghash_p8: folds $len bytes at $inp into Xi.  Lengths of 64 bytes or
# more branch to Lgcm_ghash_p8_4x, which is emitted by the block that
# follows this one.  Shorter inputs use Loop_2x, which consumes two
# 16-byte blocks per iteration with H and H^2; Lshort multiplies a lone
# remaining block by H.  No .size directive is emitted here because the
# routine continues in the 4x code.
#
# Lines prefixed with "le?" or "be?" are endian-specific; the prefixes are
# resolved by the post-processing loop at the end of this file.
209$code.=<<___;
210.globl	.gcm_gmult_p8
211.align	5
212.gcm_gmult_p8:
213	lis		r0,0xfff8
214	li		r8,0x10
215	mfspr		$vrsave,256
216	li		r9,0x20
217	mtspr		256,r0
218	li		r10,0x30
219	lvx_u		$IN,0,$Xip		# load Xi
220
221	lvx_u		$Hl,r8,$Htbl		# load pre-computed table
222	 le?lvsl	$lemask,r0,r0
223	lvx_u		$H, r9,$Htbl
224	 le?vspltisb	$t0,0x07
225	lvx_u		$Hh,r10,$Htbl
226	 le?vxor	$lemask,$lemask,$t0
227	lvx_u		$xC2,0,$Htbl
228	 le?vperm	$IN,$IN,$IN,$lemask
229	vxor		$zero,$zero,$zero
230
231	vpmsumd		$Xl,$IN,$Hl		# H.lo·Xi.lo
232	vpmsumd		$Xm,$IN,$H		# H.hi·Xi.lo+H.lo·Xi.hi
233	vpmsumd		$Xh,$IN,$Hh		# H.hi·Xi.hi
234
235	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase
236
237	vsldoi		$t0,$Xm,$zero,8
238	vsldoi		$t1,$zero,$Xm,8
239	vxor		$Xl,$Xl,$t0
240	vxor		$Xh,$Xh,$t1
241
242	vsldoi		$Xl,$Xl,$Xl,8
243	vxor		$Xl,$Xl,$t2
244
245	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
246	vpmsumd		$Xl,$Xl,$xC2
247	vxor		$t1,$t1,$Xh
248	vxor		$Xl,$Xl,$t1
249
250	le?vperm	$Xl,$Xl,$Xl,$lemask
251	stvx_u		$Xl,0,$Xip		# write out Xi
252
253	mtspr		256,$vrsave
254	blr
255	.long		0
256	.byte		0,12,0x14,0,0,0,2,0
257	.long		0
258.size	.gcm_gmult_p8,.-.gcm_gmult_p8
259
260.globl	.gcm_ghash_p8
261.align	5
262.gcm_ghash_p8:
263	li		r0,-4096
264	li		r8,0x10
265	mfspr		$vrsave,256
266	li		r9,0x20
267	mtspr		256,r0
268	li		r10,0x30
269	lvx_u		$Xl,0,$Xip		# load Xi
270
271	lvx_u		$Hl,r8,$Htbl		# load pre-computed table
272	li		r8,0x40
273	 le?lvsl	$lemask,r0,r0
274	lvx_u		$H, r9,$Htbl
275	li		r9,0x50
276	 le?vspltisb	$t0,0x07
277	lvx_u		$Hh,r10,$Htbl
278	li		r10,0x60
279	 le?vxor	$lemask,$lemask,$t0
280	lvx_u		$xC2,0,$Htbl
281	 le?vperm	$Xl,$Xl,$Xl,$lemask
282	vxor		$zero,$zero,$zero
283
284	${UCMP}i	$len,64
285	bge		Lgcm_ghash_p8_4x
286
287	lvx_u		$IN,0,$inp
288	addi		$inp,$inp,16
289	subic.		$len,$len,16
290	 le?vperm	$IN,$IN,$IN,$lemask
291	vxor		$IN,$IN,$Xl
292	beq		Lshort
293
294	lvx_u		$H2l,r8,$Htbl		# load H^2
295	li		r8,16
296	lvx_u		$H2, r9,$Htbl
297	add		r9,$inp,$len		# end of input
298	lvx_u		$H2h,r10,$Htbl
299	be?b		Loop_2x
300
301.align	5
302Loop_2x:
303	lvx_u		$IN1,0,$inp
304	le?vperm	$IN1,$IN1,$IN1,$lemask
305
306	 subic		$len,$len,32
307	vpmsumd		$Xl,$IN,$H2l		# H^2.lo·Xi.lo
308	 vpmsumd	$Xl1,$IN1,$Hl		# H.lo·Xi+1.lo
309	 subfe		r0,r0,r0		# borrow?-1:0
310	vpmsumd		$Xm,$IN,$H2		# H^2.hi·Xi.lo+H^2.lo·Xi.hi
311	 vpmsumd	$Xm1,$IN1,$H		# H.hi·Xi+1.lo+H.lo·Xi+1.hi
312	 and		r0,r0,$len
313	vpmsumd		$Xh,$IN,$H2h		# H^2.hi·Xi.hi
314	 vpmsumd	$Xh1,$IN1,$Hh		# H.hi·Xi+1.hi
315	 add		$inp,$inp,r0
316
317	vxor		$Xl,$Xl,$Xl1
318	vxor		$Xm,$Xm,$Xm1
319
320	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase
321
322	vsldoi		$t0,$Xm,$zero,8
323	vsldoi		$t1,$zero,$Xm,8
324	 vxor		$Xh,$Xh,$Xh1
325	vxor		$Xl,$Xl,$t0
326	vxor		$Xh,$Xh,$t1
327
328	vsldoi		$Xl,$Xl,$Xl,8
329	vxor		$Xl,$Xl,$t2
330	 lvx_u		$IN,r8,$inp
331	 addi		$inp,$inp,32
332
333	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
334	vpmsumd		$Xl,$Xl,$xC2
335	 le?vperm	$IN,$IN,$IN,$lemask
336	vxor		$t1,$t1,$Xh
337	vxor		$IN,$IN,$t1
338	vxor		$IN,$IN,$Xl
339	$UCMP		r9,$inp
340	bgt		Loop_2x			# done yet?
341
342	cmplwi		$len,0
343	bne		Leven
344
345Lshort:
346	vpmsumd		$Xl,$IN,$Hl		# H.lo·Xi.lo
347	vpmsumd		$Xm,$IN,$H		# H.hi·Xi.lo+H.lo·Xi.hi
348	vpmsumd		$Xh,$IN,$Hh		# H.hi·Xi.hi
349
350	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase
351
352	vsldoi		$t0,$Xm,$zero,8
353	vsldoi		$t1,$zero,$Xm,8
354	vxor		$Xl,$Xl,$t0
355	vxor		$Xh,$Xh,$t1
356
357	vsldoi		$Xl,$Xl,$Xl,8
358	vxor		$Xl,$Xl,$t2
359
360	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
361	vpmsumd		$Xl,$Xl,$xC2
362	vxor		$t1,$t1,$Xh
363
364Leven:
365	vxor		$Xl,$Xl,$t1
366	le?vperm	$Xl,$Xl,$Xl,$lemask
367	stvx_u		$Xl,0,$Xip		# write out Xi
368
369	mtspr		256,$vrsave
370	blr
371	.long		0
372	.byte		0,12,0x14,0,0,0,4,0
373	.long		0
374___
# 4x code path of gcm_ghash_p8, reached from the 64-byte length check in
# the preceding template.  v20-v31 get names here; they are saved into a
# $FRAME-byte stack frame on entry and reloaded at Ldone_4x.
#   Loop_4x   - consumes four 16-byte blocks per iteration using H, H^2,
#               H^3 and H^4;
#   Ltail_4x  - multiplies the accumulator by the power held in
#               $H4l/$H4/$H4h and reduces;
#   Lone/Ltwo/Lthree - handle 1-3 leftover blocks by placing H, H^2 or
#               H^3 into $H4l/$H4/$H4h and jumping back to Ltail_4x.
# $H21l/$H21h share registers with $Hl/$Hh and $loperm/$hiperm share
# registers with $H2l/$H2h -- presumably why the two H^2 lo/hi loads in
# the template are commented out.
375{
376my ($Xl3,$Xm2,$IN2,$H3l,$H3,$H3h,
377    $Xh3,$Xm3,$IN3,$H4l,$H4,$H4h) = map("v$_",(20..31));
378my $IN0=$IN;
379my ($H21l,$H21h,$loperm,$hiperm) = ($Hl,$Hh,$H2l,$H2h);
380
381$code.=<<___;
382.align	5
383.gcm_ghash_p8_4x:
384Lgcm_ghash_p8_4x:
385	$STU		$sp,-$FRAME($sp)
386	li		r10,`15+6*$SIZE_T`
387	li		r11,`31+6*$SIZE_T`
388	stvx		v20,r10,$sp
389	addi		r10,r10,32
390	stvx		v21,r11,$sp
391	addi		r11,r11,32
392	stvx		v22,r10,$sp
393	addi		r10,r10,32
394	stvx		v23,r11,$sp
395	addi		r11,r11,32
396	stvx		v24,r10,$sp
397	addi		r10,r10,32
398	stvx		v25,r11,$sp
399	addi		r11,r11,32
400	stvx		v26,r10,$sp
401	addi		r10,r10,32
402	stvx		v27,r11,$sp
403	addi		r11,r11,32
404	stvx		v28,r10,$sp
405	addi		r10,r10,32
406	stvx		v29,r11,$sp
407	addi		r11,r11,32
408	stvx		v30,r10,$sp
409	li		r10,0x60
410	stvx		v31,r11,$sp
411	li		r0,-1
412	stw		$vrsave,`$FRAME-4`($sp)	# save vrsave
413	mtspr		256,r0			# preserve all AltiVec registers
414
415	lvsl		$t0,0,r8		# 0x0001..0e0f
416	#lvx_u		$H2l,r8,$Htbl		# load H^2
417	li		r8,0x70
418	lvx_u		$H2, r9,$Htbl
419	li		r9,0x80
420	vspltisb	$t1,8			# 0x0808..0808
421	#lvx_u		$H2h,r10,$Htbl
422	li		r10,0x90
423	lvx_u		$H3l,r8,$Htbl		# load H^3
424	li		r8,0xa0
425	lvx_u		$H3, r9,$Htbl
426	li		r9,0xb0
427	lvx_u		$H3h,r10,$Htbl
428	li		r10,0xc0
429	lvx_u		$H4l,r8,$Htbl		# load H^4
430	li		r8,0x10
431	lvx_u		$H4, r9,$Htbl
432	li		r9,0x20
433	lvx_u		$H4h,r10,$Htbl
434	li		r10,0x30
435
436	vsldoi		$t2,$zero,$t1,8		# 0x0000..0808
437	vaddubm		$hiperm,$t0,$t2		# 0x0001..1617
438	vaddubm		$loperm,$t1,$hiperm	# 0x0809..1e1f
439
440	$SHRI		$len,$len,4		# this allows to use sign bit
441						# as carry
442	lvx_u		$IN0,0,$inp		# load input
443	lvx_u		$IN1,r8,$inp
444	subic.		$len,$len,8
445	lvx_u		$IN2,r9,$inp
446	lvx_u		$IN3,r10,$inp
447	addi		$inp,$inp,0x40
448	le?vperm	$IN0,$IN0,$IN0,$lemask
449	le?vperm	$IN1,$IN1,$IN1,$lemask
450	le?vperm	$IN2,$IN2,$IN2,$lemask
451	le?vperm	$IN3,$IN3,$IN3,$lemask
452
453	vxor		$Xh,$IN0,$Xl
454
455	 vpmsumd	$Xl1,$IN1,$H3l
456	 vpmsumd	$Xm1,$IN1,$H3
457	 vpmsumd	$Xh1,$IN1,$H3h
458
459	 vperm		$H21l,$H2,$H,$hiperm
460	 vperm		$t0,$IN2,$IN3,$loperm
461	 vperm		$H21h,$H2,$H,$loperm
462	 vperm		$t1,$IN2,$IN3,$hiperm
463	 vpmsumd	$Xm2,$IN2,$H2		# H^2.lo·Xi+2.hi+H^2.hi·Xi+2.lo
464	 vpmsumd	$Xl3,$t0,$H21l		# H^2.lo·Xi+2.lo+H.lo·Xi+3.lo
465	 vpmsumd	$Xm3,$IN3,$H		# H.hi·Xi+3.lo  +H.lo·Xi+3.hi
466	 vpmsumd	$Xh3,$t1,$H21h		# H^2.hi·Xi+2.hi+H.hi·Xi+3.hi
467
468	 vxor		$Xm2,$Xm2,$Xm1
469	 vxor		$Xl3,$Xl3,$Xl1
470	 vxor		$Xm3,$Xm3,$Xm2
471	 vxor		$Xh3,$Xh3,$Xh1
472
473	blt		Ltail_4x
474
475Loop_4x:
476	lvx_u		$IN0,0,$inp
477	lvx_u		$IN1,r8,$inp
478	subic.		$len,$len,4
479	lvx_u		$IN2,r9,$inp
480	lvx_u		$IN3,r10,$inp
481	addi		$inp,$inp,0x40
482	le?vperm	$IN1,$IN1,$IN1,$lemask
483	le?vperm	$IN2,$IN2,$IN2,$lemask
484	le?vperm	$IN3,$IN3,$IN3,$lemask
485	le?vperm	$IN0,$IN0,$IN0,$lemask
486
487	vpmsumd		$Xl,$Xh,$H4l		# H^4.lo·Xi.lo
488	vpmsumd		$Xm,$Xh,$H4		# H^4.hi·Xi.lo+H^4.lo·Xi.hi
489	vpmsumd		$Xh,$Xh,$H4h		# H^4.hi·Xi.hi
490	 vpmsumd	$Xl1,$IN1,$H3l
491	 vpmsumd	$Xm1,$IN1,$H3
492	 vpmsumd	$Xh1,$IN1,$H3h
493
494	vxor		$Xl,$Xl,$Xl3
495	vxor		$Xm,$Xm,$Xm3
496	vxor		$Xh,$Xh,$Xh3
497	 vperm		$t0,$IN2,$IN3,$loperm
498	 vperm		$t1,$IN2,$IN3,$hiperm
499
500	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase
501	 vpmsumd	$Xl3,$t0,$H21l		# H.lo·Xi+3.lo  +H^2.lo·Xi+2.lo
502	 vpmsumd	$Xh3,$t1,$H21h		# H.hi·Xi+3.hi  +H^2.hi·Xi+2.hi
503
504	vsldoi		$t0,$Xm,$zero,8
505	vsldoi		$t1,$zero,$Xm,8
506	vxor		$Xl,$Xl,$t0
507	vxor		$Xh,$Xh,$t1
508
509	vsldoi		$Xl,$Xl,$Xl,8
510	vxor		$Xl,$Xl,$t2
511
512	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
513	 vpmsumd	$Xm2,$IN2,$H2		# H^2.hi·Xi+2.lo+H^2.lo·Xi+2.hi
514	 vpmsumd	$Xm3,$IN3,$H		# H.hi·Xi+3.lo  +H.lo·Xi+3.hi
515	vpmsumd		$Xl,$Xl,$xC2
516
517	 vxor		$Xl3,$Xl3,$Xl1
518	 vxor		$Xh3,$Xh3,$Xh1
519	vxor		$Xh,$Xh,$IN0
520	 vxor		$Xm2,$Xm2,$Xm1
521	vxor		$Xh,$Xh,$t1
522	 vxor		$Xm3,$Xm3,$Xm2
523	vxor		$Xh,$Xh,$Xl
524	bge		Loop_4x
525
526Ltail_4x:
527	vpmsumd		$Xl,$Xh,$H4l		# H^4.lo·Xi.lo
528	vpmsumd		$Xm,$Xh,$H4		# H^4.hi·Xi.lo+H^4.lo·Xi.hi
529	vpmsumd		$Xh,$Xh,$H4h		# H^4.hi·Xi.hi
530
531	vxor		$Xl,$Xl,$Xl3
532	vxor		$Xm,$Xm,$Xm3
533
534	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase
535
536	vsldoi		$t0,$Xm,$zero,8
537	vsldoi		$t1,$zero,$Xm,8
538	 vxor		$Xh,$Xh,$Xh3
539	vxor		$Xl,$Xl,$t0
540	vxor		$Xh,$Xh,$t1
541
542	vsldoi		$Xl,$Xl,$Xl,8
543	vxor		$Xl,$Xl,$t2
544
545	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
546	vpmsumd		$Xl,$Xl,$xC2
547	vxor		$t1,$t1,$Xh
548	vxor		$Xl,$Xl,$t1
549
550	addic.		$len,$len,4
551	beq		Ldone_4x
552
553	lvx_u		$IN0,0,$inp
554	${UCMP}i	$len,2
555	li		$len,-4
556	blt		Lone
557	lvx_u		$IN1,r8,$inp
558	beq		Ltwo
559
560Lthree:
561	lvx_u		$IN2,r9,$inp
562	le?vperm	$IN0,$IN0,$IN0,$lemask
563	le?vperm	$IN1,$IN1,$IN1,$lemask
564	le?vperm	$IN2,$IN2,$IN2,$lemask
565
566	vxor		$Xh,$IN0,$Xl
567	vmr		$H4l,$H3l
568	vmr		$H4, $H3
569	vmr		$H4h,$H3h
570
571	vperm		$t0,$IN1,$IN2,$loperm
572	vperm		$t1,$IN1,$IN2,$hiperm
573	vpmsumd		$Xm2,$IN1,$H2		# H^2.lo·Xi+1.hi+H^2.hi·Xi+1.lo
574	vpmsumd		$Xm3,$IN2,$H		# H.hi·Xi+2.lo  +H.lo·Xi+2.hi
575	vpmsumd		$Xl3,$t0,$H21l		# H^2.lo·Xi+1.lo+H.lo·Xi+2.lo
576	vpmsumd		$Xh3,$t1,$H21h		# H^2.hi·Xi+1.hi+H.hi·Xi+2.hi
577
578	vxor		$Xm3,$Xm3,$Xm2
579	b		Ltail_4x
580
581.align	4
582Ltwo:
583	le?vperm	$IN0,$IN0,$IN0,$lemask
584	le?vperm	$IN1,$IN1,$IN1,$lemask
585
586	vxor		$Xh,$IN0,$Xl
587	vperm		$t0,$zero,$IN1,$loperm
588	vperm		$t1,$zero,$IN1,$hiperm
589
590	vsldoi		$H4l,$zero,$H2,8
591	vmr		$H4, $H2
592	vsldoi		$H4h,$H2,$zero,8
593
594	vpmsumd		$Xl3,$t0, $H21l		# H.lo·Xi+1.lo
595	vpmsumd		$Xm3,$IN1,$H		# H.hi·Xi+1.lo+H.lo·Xi+2.hi
596	vpmsumd		$Xh3,$t1, $H21h		# H.hi·Xi+1.hi
597
598	b		Ltail_4x
599
600.align	4
601Lone:
602	le?vperm	$IN0,$IN0,$IN0,$lemask
603
604	vsldoi		$H4l,$zero,$H,8
605	vmr		$H4, $H
606	vsldoi		$H4h,$H,$zero,8
607
608	vxor		$Xh,$IN0,$Xl
609	vxor		$Xl3,$Xl3,$Xl3
610	vxor		$Xm3,$Xm3,$Xm3
611	vxor		$Xh3,$Xh3,$Xh3
612
613	b		Ltail_4x
614
615Ldone_4x:
616	le?vperm	$Xl,$Xl,$Xl,$lemask
617	stvx_u		$Xl,0,$Xip		# write out Xi
618
619	li		r10,`15+6*$SIZE_T`
620	li		r11,`31+6*$SIZE_T`
621	mtspr		256,$vrsave
622	lvx		v20,r10,$sp
623	addi		r10,r10,32
624	lvx		v21,r11,$sp
625	addi		r11,r11,32
626	lvx		v22,r10,$sp
627	addi		r10,r10,32
628	lvx		v23,r11,$sp
629	addi		r11,r11,32
630	lvx		v24,r10,$sp
631	addi		r10,r10,32
632	lvx		v25,r11,$sp
633	addi		r11,r11,32
634	lvx		v26,r10,$sp
635	addi		r10,r10,32
636	lvx		v27,r11,$sp
637	addi		r11,r11,32
638	lvx		v28,r10,$sp
639	addi		r10,r10,32
640	lvx		v29,r11,$sp
641	addi		r11,r11,32
642	lvx		v30,r10,$sp
643	lvx		v31,r11,$sp
644	addi		$sp,$sp,$FRAME
645	blr
646	.long		0
647	.byte		0,12,0x04,0,0x80,0,4,0
648	.long		0
649___
650}
# Trailer: the .size directive closes gcm_ghash_p8 (whose body includes the
# 4x code emitted just before), followed by the identification string.
# Nothing is interpolated here, so a literal heredoc is used and "@" needs
# no escaping.
$code .= <<'EOT';
.size	.gcm_ghash_p8,.-.gcm_ghash_p8

.asciz  "GHASH for PowerISA 2.07, CRYPTOGAMS by <appro@openssl.org>"
.align  2
EOT
657
# Post-process the accumulated template line by line and print it to
# STDOUT (at this point a pipe into ppc-xlate.pl):
#   * `expr` spans are replaced by the value of evaluating expr as Perl;
#   * for flavours ending in "le" the "le?" marker is removed so the line
#     becomes live, and "be?" is turned into the comment tag "#be#";
#     for every other flavour the roles are swapped.
# The flavour test does not depend on the line, so it is done once.
my $little_endian = ($flavour =~ /le$/);

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	if ($little_endian) {
	    s/le\?//		or
	    s/be\?/#be#/;
	} else {
	    s/le\?/#le#/	or
	    s/be\?//;
	}
	print $_,"\n";
}

# Closing a piped handle waits for the child; a false return means either
# a write/flush error ($! set) or a non-zero exit status of the
# translator ($! is 0, status in $?).  Report both instead of silently
# leaving a truncated or missing output file behind.
close STDOUT
    or die($! ? "error closing STDOUT: $!"
	      : "translator pipe exited with status $?");
672