#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for AES instructions as per PowerISA
# specification version 2.07, first implemented by the POWER8
# processor. The module is endian-agnostic in the sense that it
# supports both big- and little-endian cases. Data alignment in
# parallelizable modes is handled with VSX loads and stores, which
# implies that the MSR.VSX flag is set. It should also be noted that
# the ISA specification doesn't prohibit alignment exceptions for
# these instructions on page boundaries. Initially alignment was
# handled in a pure AltiVec/VMX way [with data aligned
# programmatically, which in turn guarantees exception-free
# execution], but that turned out to hamper performance when vcipher
# instructions are interleaved. It's reckoned that eventual
# misalignment penalties at page boundaries are on average lower
# than the additional overhead of the pure AltiVec approach.
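#
# The module generates the following entry points; the prefix is
# aes_p8. The C prototypes below are a sketch of the expected
# bindings, based on how OpenSSL declares analogous modules; consult
# the calling code for the authoritative signatures:
#
#	int  aes_p8_set_encrypt_key(const unsigned char *userKey,
#				    const int bits, AES_KEY *key);
#	int  aes_p8_set_decrypt_key(const unsigned char *userKey,
#				    const int bits, AES_KEY *key);
#	void aes_p8_encrypt(const unsigned char *in, unsigned char *out,
#			    const AES_KEY *key);
#	void aes_p8_decrypt(const unsigned char *in, unsigned char *out,
#			    const AES_KEY *key);
#	void aes_p8_cbc_encrypt(const unsigned char *in, unsigned char *out,
#				size_t length, const AES_KEY *key,
#				unsigned char *ivec, const int enc);
#	void aes_p8_ctr32_encrypt_blocks(const unsigned char *in,
#				unsigned char *out, size_t blocks,
#				const AES_KEY *key,
#				const unsigned char ivec[16]);
#
# The set_*_key routines return 0 on success, -1 on null arguments
# and -2 on an invalid bit length, matching the Lenc_key_abort paths
# below.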

$flavour = shift;

if ($flavour =~ /64/) {
	$SIZE_T	=8;
	$LRSAVE	=2*$SIZE_T;
	$STU	="stdu";
	$POP	="ld";
	$PUSH	="std";
	$UCMP	="cmpld";
	$SHL	="sldi";
} elsif ($flavour =~ /32/) {
	$SIZE_T	=4;
	$LRSAVE	=$SIZE_T;
	$STU	="stwu";
	$POP	="lwz";
	$PUSH	="stw";
	$UCMP	="cmplw";
	$SHL	="slwi";
} else { die "nonsense $flavour"; }

$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";

$FRAME=8*$SIZE_T;
$prefix="aes_p8";

$sp="r1";
$vrsave="r12";

#########################################################################
{{{	# Key setup procedures						#
my ($inp,$bits,$out,$ptr,$cnt,$rounds)=map("r$_",(3..8));
my ($zero,$in0,$in1,$key,$rcon,$mask,$tmp)=map("v$_",(0..6));
my ($stage,$outperm,$outmask,$outhead,$outtail)=map("v$_",(7..11));

$code.=<<___;
.machine	"any"

.text

.align	7
rcon:
.long	0x01000000, 0x01000000, 0x01000000, 0x01000000	?rev
.long	0x1b000000, 0x1b000000, 0x1b000000, 0x1b000000	?rev
.long	0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c	?rev
.long	0,0,0,0						?asis
Lconsts:
	mflr	r0
	bcl	20,31,\$+4
	mflr	$ptr	 	# $ptr = ., the bcl return address
	addi	$ptr,$ptr,-0x48	# rcon sits 0x48 bytes back
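	# Why 0x48: the table is 4x16 = 0x40 bytes, plus the two 4-byte
	# instructions between rcon and the bcl return point. This
	# computes the address of rcon position-independently, without
	# relying on relocations.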
	mtlr	r0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
.asciz	"AES for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"

.globl	.${prefix}_set_encrypt_key
.align	5
.${prefix}_set_encrypt_key:
Lset_encrypt_key:
	mflr		r11
	$PUSH		r11,$LRSAVE($sp)

	li		$ptr,-1
	${UCMP}i	$inp,0
	beq-		Lenc_key_abort		# if ($inp==0) return -1;
	${UCMP}i	$out,0
	beq-		Lenc_key_abort		# if ($out==0) return -1;
	li		$ptr,-2
	cmpwi		$bits,128
	blt-		Lenc_key_abort
	cmpwi		$bits,256
	bgt-		Lenc_key_abort
	andi.		r0,$bits,0x3f
	bne-		Lenc_key_abort

	lis		r0,0xfff0
	mfspr		$vrsave,256
	mtspr		256,r0

	bl		Lconsts
	mtlr		r11

	neg		r9,$inp
	lvx		$in0,0,$inp
	addi		$inp,$inp,15		# 15 is not typo
	lvsr		$key,0,r9		# borrow $key
	li		r8,0x20
	cmpwi		$bits,192
	lvx		$in1,0,$inp
	le?vspltisb	$mask,0x0f		# borrow $mask
	lvx		$rcon,0,$ptr
	le?vxor		$key,$key,$mask		# adjust for byte swap
	lvx		$mask,r8,$ptr
	addi		$ptr,$ptr,0x10
	vperm		$in0,$in0,$in1,$key	# align [and byte swap in LE]
	li		$cnt,8
	vxor		$zero,$zero,$zero
	mtctr		$cnt

	?lvsr		$outperm,0,$out
	vspltisb	$outmask,-1
	lvx		$outhead,0,$out
	?vperm		$outmask,$zero,$outmask,$outperm

	blt		Loop128
	addi		$inp,$inp,8
	beq		L192
	addi		$inp,$inp,8
	b		L256

.align	4
Loop128:
	vperm		$key,$in0,$in0,$mask	# rotate-n-splat
	vsldoi		$tmp,$zero,$in0,12	# >>32
	 vperm		$outtail,$in0,$in0,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	vcipherlast	$key,$key,$rcon
	 stvx		$stage,0,$out
	 addi		$out,$out,16

	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	 vadduwm	$rcon,$rcon,$rcon
	vxor		$in0,$in0,$key
	bdnz		Loop128
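	# Each Loop128 iteration above computes one round key: vperm
	# rotates and splats the last 32-bit word ($mask also undoes the
	# ShiftRows that vcipherlast applies), vcipherlast performs
	# SubBytes and xors in $rcon, and the three vsldoi/vxor steps
	# accumulate the running xor of the previous four words, as in
	# the FIPS-197 key expansion for 128-bit keys.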

	lvx		$rcon,0,$ptr		# last two round keys

	vperm		$key,$in0,$in0,$mask	# rotate-n-splat
	vsldoi		$tmp,$zero,$in0,12	# >>32
	 vperm		$outtail,$in0,$in0,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	vcipherlast	$key,$key,$rcon
	 stvx		$stage,0,$out
	 addi		$out,$out,16

	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	 vadduwm	$rcon,$rcon,$rcon
	vxor		$in0,$in0,$key

	vperm		$key,$in0,$in0,$mask	# rotate-n-splat
	vsldoi		$tmp,$zero,$in0,12	# >>32
	 vperm		$outtail,$in0,$in0,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	vcipherlast	$key,$key,$rcon
	 stvx		$stage,0,$out
	 addi		$out,$out,16

	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	vxor		$in0,$in0,$key
	 vperm		$outtail,$in0,$in0,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	 stvx		$stage,0,$out

	addi		$inp,$out,15		# 15 is not typo
	addi		$out,$out,0x50

	li		$rounds,10
	b		Ldone

.align	4
L192:
	lvx		$tmp,0,$inp
	li		$cnt,4
	 vperm		$outtail,$in0,$in0,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	 stvx		$stage,0,$out
	 addi		$out,$out,16
	vperm		$in1,$in1,$tmp,$key	# align [and byte swap in LE]
	vspltisb	$key,8			# borrow $key
	mtctr		$cnt
	vsububm		$mask,$mask,$key	# adjust the mask

Loop192:
	vperm		$key,$in1,$in1,$mask	# rotate-n-splat
	vsldoi		$tmp,$zero,$in0,12	# >>32
	vcipherlast	$key,$key,$rcon

	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp

	 vsldoi		$stage,$zero,$in1,8
	vspltw		$tmp,$in0,3
	vxor		$tmp,$tmp,$in1
	vsldoi		$in1,$zero,$in1,12	# >>32
	 vadduwm	$rcon,$rcon,$rcon
	vxor		$in1,$in1,$tmp
	vxor		$in0,$in0,$key
	vxor		$in1,$in1,$key
	 vsldoi		$stage,$stage,$in0,8

	vperm		$key,$in1,$in1,$mask	# rotate-n-splat
	vsldoi		$tmp,$zero,$in0,12	# >>32
	 vperm		$outtail,$stage,$stage,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	vcipherlast	$key,$key,$rcon
	 stvx		$stage,0,$out
	 addi		$out,$out,16

	 vsldoi		$stage,$in0,$in1,8
	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	 vperm		$outtail,$stage,$stage,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	 stvx		$stage,0,$out
	 addi		$out,$out,16

	vspltw		$tmp,$in0,3
	vxor		$tmp,$tmp,$in1
	vsldoi		$in1,$zero,$in1,12	# >>32
	 vadduwm	$rcon,$rcon,$rcon
	vxor		$in1,$in1,$tmp
	vxor		$in0,$in0,$key
	vxor		$in1,$in1,$key
	 vperm		$outtail,$in0,$in0,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	 stvx		$stage,0,$out
	 addi		$inp,$out,15		# 15 is not typo
	 addi		$out,$out,16
	bdnz		Loop192

	li		$rounds,12
	addi		$out,$out,0x20
	b		Ldone

.align	4
L256:
	lvx		$tmp,0,$inp
	li		$cnt,7
	li		$rounds,14
	 vperm		$outtail,$in0,$in0,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	 stvx		$stage,0,$out
	 addi		$out,$out,16
	vperm		$in1,$in1,$tmp,$key	# align [and byte swap in LE]
	mtctr		$cnt

Loop256:
	vperm		$key,$in1,$in1,$mask	# rotate-n-splat
	vsldoi		$tmp,$zero,$in0,12	# >>32
	 vperm		$outtail,$in1,$in1,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	vcipherlast	$key,$key,$rcon
	 stvx		$stage,0,$out
	 addi		$out,$out,16

	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	 vadduwm	$rcon,$rcon,$rcon
	vxor		$in0,$in0,$key
	 vperm		$outtail,$in0,$in0,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	 stvx		$stage,0,$out
	 addi		$inp,$out,15		# 15 is not typo
	 addi		$out,$out,16
	bdz		Ldone

	vspltw		$key,$in0,3		# just splat
	vsldoi		$tmp,$zero,$in1,12	# >>32
	vsbox		$key,$key

	vxor		$in1,$in1,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in1,$in1,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in1,$in1,$tmp

	vxor		$in1,$in1,$key
	b		Loop256

.align	4
Ldone:
	lvx		$in1,0,$inp		# redundant in aligned case
	vsel		$in1,$outhead,$in1,$outmask
	stvx		$in1,0,$inp
	li		$ptr,0
	mtspr		256,$vrsave
	stw		$rounds,0($out)

Lenc_key_abort:
	mr		r3,$ptr
	blr
	.long		0
	.byte		0,12,0x14,1,0,0,3,0
	.long		0
.size	.${prefix}_set_encrypt_key,.-.${prefix}_set_encrypt_key

.globl	.${prefix}_set_decrypt_key
.align	5
.${prefix}_set_decrypt_key:
	$STU		$sp,-$FRAME($sp)
	mflr		r10
	$PUSH		r10,$FRAME+$LRSAVE($sp)
	bl		Lset_encrypt_key
	mtlr		r10

	cmpwi		r3,0
	bne-		Ldec_key_abort

	slwi		$cnt,$rounds,4
	subi		$inp,$out,240		# first round key
	srwi		$rounds,$rounds,1
	add		$out,$inp,$cnt		# last round key
	mtctr		$rounds

Ldeckey:
	lwz		r0, 0($inp)
	lwz		r6, 4($inp)
	lwz		r7, 8($inp)
	lwz		r8, 12($inp)
	addi		$inp,$inp,16
	lwz		r9, 0($out)
	lwz		r10,4($out)
	lwz		r11,8($out)
	lwz		r12,12($out)
	stw		r0, 0($out)
	stw		r6, 4($out)
	stw		r7, 8($out)
	stw		r8, 12($out)
	subi		$out,$out,16
	stw		r9, -16($inp)
	stw		r10,-12($inp)
	stw		r11,-8($inp)
	stw		r12,-4($inp)
	bdnz		Ldeckey

	xor		r3,r3,r3		# return value
Ldec_key_abort:
	addi		$sp,$sp,$FRAME
	blr
	.long		0
	.byte		0,12,4,1,0x80,0,3,0
	.long		0
.size	.${prefix}_set_decrypt_key,.-.${prefix}_set_decrypt_key
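# The decrypt key schedule is the encrypt schedule with the round keys
# swapped end for end: Ldeckey walks one pointer forward from the first
# round key and one backward from the last, exchanging 16-byte entries
# word by word. A C sketch of the same permutation, assuming rk points
# at the first round key:
#
#	for (i = 0, j = 4*rounds; i < j; i += 4, j -= 4)
#		for (k = 0; k < 4; k++) {
#			u32 t = rk[i+k]; rk[i+k] = rk[j+k]; rk[j+k] = t;
#		}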
___
}}}
#########################################################################
{{{	# Single block en- and decrypt procedures			#
sub gen_block () {
my $dir = shift;
my $n   = $dir eq "de" ? "n" : "";
my ($inp,$out,$key,$rounds,$idx)=map("r$_",(3..7));

$code.=<<___;
.globl	.${prefix}_${dir}crypt
.align	5
.${prefix}_${dir}crypt:
	lwz		$rounds,240($key)
	lis		r0,0xfc00
	mfspr		$vrsave,256
	li		$idx,15			# 15 is not typo
	mtspr		256,r0

	lvx		v0,0,$inp
	neg		r11,$out
	lvx		v1,$idx,$inp
	lvsl		v2,0,$inp		# inpperm
	le?vspltisb	v4,0x0f
	?lvsl		v3,0,r11		# outperm
	le?vxor		v2,v2,v4
	li		$idx,16
	vperm		v0,v0,v1,v2		# align [and byte swap in LE]
	lvx		v1,0,$key
	?lvsl		v5,0,$key		# keyperm
	srwi		$rounds,$rounds,1
	lvx		v2,$idx,$key
	addi		$idx,$idx,16
	subi		$rounds,$rounds,1
	?vperm		v1,v1,v2,v5		# align round key

	vxor		v0,v0,v1
	lvx		v1,$idx,$key
	addi		$idx,$idx,16
	mtctr		$rounds

Loop_${dir}c:
	?vperm		v2,v2,v1,v5
	v${n}cipher	v0,v0,v2
	lvx		v2,$idx,$key
	addi		$idx,$idx,16
	?vperm		v1,v1,v2,v5
	v${n}cipher	v0,v0,v1
	lvx		v1,$idx,$key
	addi		$idx,$idx,16
	bdnz		Loop_${dir}c
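	# The count register was set to rounds/2-1 and each iteration
	# applies two rounds, so the loop covers all but the last two;
	# those follow below, with v${n}cipherlast supplying the final
	# AddRoundKey.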

	?vperm		v2,v2,v1,v5
	v${n}cipher	v0,v0,v2
	lvx		v2,$idx,$key
	?vperm		v1,v1,v2,v5
	v${n}cipherlast	v0,v0,v1

	vspltisb	v2,-1
	vxor		v1,v1,v1
	li		$idx,15			# 15 is not typo
	?vperm		v2,v1,v2,v3		# outmask
	le?vxor		v3,v3,v4
	lvx		v1,0,$out		# outhead
	vperm		v0,v0,v0,v3		# rotate [and byte swap in LE]
	vsel		v1,v1,v0,v2
	lvx		v4,$idx,$out
	stvx		v1,0,$out
	vsel		v0,v0,v4,v2
	stvx		v0,$idx,$out

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,3,0
	.long		0
.size	.${prefix}_${dir}crypt,.-.${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}
#########################################################################
{{{	# CBC en- and decrypt procedures				#
my ($inp,$out,$len,$key,$ivp,$enc,$rounds,$idx)=map("r$_",(3..10));
my ($rndkey0,$rndkey1,$inout,$tmp)=		map("v$_",(0..3));
my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm)=
						map("v$_",(4..10));
$code.=<<___;
.globl	.${prefix}_cbc_encrypt
.align	5
.${prefix}_cbc_encrypt:
	${UCMP}i	$len,16
	bltlr-

	cmpwi		$enc,0			# test direction
	lis		r0,0xffe0
	mfspr		$vrsave,256
	mtspr		256,r0

	li		$idx,15
	vxor		$rndkey0,$rndkey0,$rndkey0
	le?vspltisb	$tmp,0x0f

	lvx		$ivec,0,$ivp		# load [unaligned] iv
	lvsl		$inpperm,0,$ivp
	lvx		$inptail,$idx,$ivp
	le?vxor		$inpperm,$inpperm,$tmp
	vperm		$ivec,$ivec,$inptail,$inpperm

	neg		r11,$inp
	?lvsl		$keyperm,0,$key		# prepare for unaligned key
	lwz		$rounds,240($key)

	lvsr		$inpperm,0,r11		# prepare for unaligned load
	lvx		$inptail,0,$inp
	addi		$inp,$inp,15		# 15 is not typo
	le?vxor		$inpperm,$inpperm,$tmp

	?lvsr		$outperm,0,$out		# prepare for unaligned store
	vspltisb	$outmask,-1
	lvx		$outhead,0,$out
	?vperm		$outmask,$rndkey0,$outmask,$outperm
	le?vxor		$outperm,$outperm,$tmp

	srwi		$rounds,$rounds,1
	li		$idx,16
	subi		$rounds,$rounds,1
	beq		Lcbc_dec

Lcbc_enc:
	vmr		$inout,$inptail
	lvx		$inptail,0,$inp
	addi		$inp,$inp,16
	mtctr		$rounds
	subi		$len,$len,16		# len-=16

	lvx		$rndkey0,0,$key
	 vperm		$inout,$inout,$inptail,$inpperm
	lvx		$rndkey1,$idx,$key
	addi		$idx,$idx,16
	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vxor		$inout,$inout,$rndkey0
	lvx		$rndkey0,$idx,$key
	addi		$idx,$idx,16
	vxor		$inout,$inout,$ivec

Loop_cbc_enc:
	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
	vcipher		$inout,$inout,$rndkey1
	lvx		$rndkey1,$idx,$key
	addi		$idx,$idx,16
	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vcipher		$inout,$inout,$rndkey0
	lvx		$rndkey0,$idx,$key
	addi		$idx,$idx,16
	bdnz		Loop_cbc_enc

	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
	vcipher		$inout,$inout,$rndkey1
	lvx		$rndkey1,$idx,$key
	li		$idx,16
	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vcipherlast	$ivec,$inout,$rndkey0
	${UCMP}i	$len,16

	vperm		$tmp,$ivec,$ivec,$outperm
	vsel		$inout,$outhead,$tmp,$outmask
	vmr		$outhead,$tmp
	stvx		$inout,0,$out
	addi		$out,$out,16
	bge		Lcbc_enc

	b		Lcbc_done

.align	4
Lcbc_dec:
	${UCMP}i	$len,128
	bge		_aesp8_cbc_decrypt8x
	vmr		$tmp,$inptail
	lvx		$inptail,0,$inp
	addi		$inp,$inp,16
	mtctr		$rounds
	subi		$len,$len,16		# len-=16

	lvx		$rndkey0,0,$key
	 vperm		$tmp,$tmp,$inptail,$inpperm
	lvx		$rndkey1,$idx,$key
	addi		$idx,$idx,16
	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vxor		$inout,$tmp,$rndkey0
	lvx		$rndkey0,$idx,$key
	addi		$idx,$idx,16

Loop_cbc_dec:
	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
	vncipher	$inout,$inout,$rndkey1
	lvx		$rndkey1,$idx,$key
	addi		$idx,$idx,16
	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vncipher	$inout,$inout,$rndkey0
	lvx		$rndkey0,$idx,$key
	addi		$idx,$idx,16
	bdnz		Loop_cbc_dec

	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
	vncipher	$inout,$inout,$rndkey1
	lvx		$rndkey1,$idx,$key
	li		$idx,16
	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vncipherlast	$inout,$inout,$rndkey0
	${UCMP}i	$len,16

	vxor		$inout,$inout,$ivec
	vmr		$ivec,$tmp
	vperm		$tmp,$inout,$inout,$outperm
	vsel		$inout,$outhead,$tmp,$outmask
	vmr		$outhead,$tmp
	stvx		$inout,0,$out
	addi		$out,$out,16
	bge		Lcbc_dec

Lcbc_done:
	addi		$out,$out,-1
	lvx		$inout,0,$out		# redundant in aligned case
	vsel		$inout,$outhead,$inout,$outmask
	stvx		$inout,0,$out

	neg		$enc,$ivp		# write [unaligned] iv
	li		$idx,15			# 15 is not typo
	vxor		$rndkey0,$rndkey0,$rndkey0
	vspltisb	$outmask,-1
	le?vspltisb	$tmp,0x0f
	?lvsl		$outperm,0,$enc
	?vperm		$outmask,$rndkey0,$outmask,$outperm
	le?vxor		$outperm,$outperm,$tmp
	lvx		$outhead,0,$ivp
	vperm		$ivec,$ivec,$ivec,$outperm
	vsel		$inout,$outhead,$ivec,$outmask
	lvx		$inptail,$idx,$ivp
	stvx		$inout,0,$ivp
	vsel		$inout,$ivec,$inptail,$outmask
	stvx		$inout,$idx,$ivp

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,6,0
	.long		0
___
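# CBC decryption is parallelizable, because each plaintext block
# depends only on two ciphertext blocks: P[i] = D(K, C[i]) xor C[i-1],
# with C[0] standing for the IV. The procedure below exploits this by
# keeping eight blocks in flight and interleaving eight independent
# vncipher chains, which hides instruction latency. CBC encryption, in
# contrast, is inherently serial and stays on the one-block path above.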
#########################################################################
{{	# Optimized CBC decrypt procedure				#
my $key_="r11";
my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
    $x00=0 if ($flavour =~ /osx/);
my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10..13));
my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(14..21));
my $rndkey0="v23";	# v24-v25 rotating buffer for first round keys
			# v26-v31 last 6 round keys
my ($tmp,$keyperm)=($in3,$in4);	# aliases with "caller", redundant assignment

$code.=<<___;
.align	5
_aesp8_cbc_decrypt8x:
	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
	li		r10,`$FRAME+8*16+15`
	li		r11,`$FRAME+8*16+31`
	stvx		v20,r10,$sp		# ABI says so
	addi		r10,r10,32
	stvx		v21,r11,$sp
	addi		r11,r11,32
	stvx		v22,r10,$sp
	addi		r10,r10,32
	stvx		v23,r11,$sp
	addi		r11,r11,32
	stvx		v24,r10,$sp
	addi		r10,r10,32
	stvx		v25,r11,$sp
	addi		r11,r11,32
	stvx		v26,r10,$sp
	addi		r10,r10,32
	stvx		v27,r11,$sp
	addi		r11,r11,32
	stvx		v28,r10,$sp
	addi		r10,r10,32
	stvx		v29,r11,$sp
	addi		r11,r11,32
	stvx		v30,r10,$sp
	stvx		v31,r11,$sp
	li		r0,-1
	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
	li		$x10,0x10
	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
	li		$x20,0x20
	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
	li		$x30,0x30
	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
	li		$x40,0x40
	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
	li		$x50,0x50
	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
	li		$x60,0x60
	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
	li		$x70,0x70
	mtspr		256,r0

	subi		$rounds,$rounds,3	# -4 in total
	subi		$len,$len,128		# bias

	lvx		$rndkey0,$x00,$key	# load key schedule
	lvx		v30,$x10,$key
	addi		$key,$key,0x20
	lvx		v31,$x00,$key
	?vperm		$rndkey0,$rndkey0,v30,$keyperm
	addi		$key_,$sp,$FRAME+15
	mtctr		$rounds

Load_cbc_dec_key:
	?vperm		v24,v30,v31,$keyperm
	lvx		v30,$x10,$key
	addi		$key,$key,0x20
	stvx		v24,$x00,$key_		# off-load round[1]
	?vperm		v25,v31,v30,$keyperm
	lvx		v31,$x00,$key
	stvx		v25,$x10,$key_		# off-load round[2]
	addi		$key_,$key_,0x20
	bdnz		Load_cbc_dec_key

	lvx		v26,$x10,$key
	?vperm		v24,v30,v31,$keyperm
	lvx		v27,$x20,$key
	stvx		v24,$x00,$key_		# off-load round[3]
	?vperm		v25,v31,v26,$keyperm
	lvx		v28,$x30,$key
	stvx		v25,$x10,$key_		# off-load round[4]
	addi		$key_,$sp,$FRAME+15	# rewind $key_
	?vperm		v26,v26,v27,$keyperm
	lvx		v29,$x40,$key
	?vperm		v27,v27,v28,$keyperm
	lvx		v30,$x50,$key
	?vperm		v28,v28,v29,$keyperm
	lvx		v31,$x60,$key
	?vperm		v29,v29,v30,$keyperm
	lvx		$out0,$x70,$key		# borrow $out0
	?vperm		v30,v30,v31,$keyperm
	lvx		v24,$x00,$key_		# pre-load round[1]
	?vperm		v31,v31,$out0,$keyperm
	lvx		v25,$x10,$key_		# pre-load round[2]
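	# The loop above off-loaded an aligned copy of the key schedule
	# to the stack, so the inner loop can fetch round keys with
	# plain lvx. v24/v25 serve as a two-entry rotating buffer for
	# keys read back from that copy, while v26-v31 keep the last
	# six round keys resident for the tail of every pass.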

	#lvx		$inptail,0,$inp		# "caller" already did this
	#addi		$inp,$inp,15		# 15 is not typo
	subi		$inp,$inp,15		# undo "caller"

	 le?li		$idx,8
	lvx_u		$in0,$x00,$inp		# load first 8 "words"
	 le?lvsl	$inpperm,0,$idx
	 le?vspltisb	$tmp,0x0f
	lvx_u		$in1,$x10,$inp
	 le?vxor	$inpperm,$inpperm,$tmp	# transform for lvx_u/stvx_u
	lvx_u		$in2,$x20,$inp
	 le?vperm	$in0,$in0,$in0,$inpperm
	lvx_u		$in3,$x30,$inp
	 le?vperm	$in1,$in1,$in1,$inpperm
	lvx_u		$in4,$x40,$inp
	 le?vperm	$in2,$in2,$in2,$inpperm
	vxor		$out0,$in0,$rndkey0
	lvx_u		$in5,$x50,$inp
	 le?vperm	$in3,$in3,$in3,$inpperm
	vxor		$out1,$in1,$rndkey0
	lvx_u		$in6,$x60,$inp
	 le?vperm	$in4,$in4,$in4,$inpperm
	vxor		$out2,$in2,$rndkey0
	lvx_u		$in7,$x70,$inp
	addi		$inp,$inp,0x80
	 le?vperm	$in5,$in5,$in5,$inpperm
	vxor		$out3,$in3,$rndkey0
	 le?vperm	$in6,$in6,$in6,$inpperm
	vxor		$out4,$in4,$rndkey0
	 le?vperm	$in7,$in7,$in7,$inpperm
	vxor		$out5,$in5,$rndkey0
	vxor		$out6,$in6,$rndkey0
	vxor		$out7,$in7,$rndkey0

	mtctr		$rounds
	b		Loop_cbc_dec8x
.align	5
Loop_cbc_dec8x:
	vncipher	$out0,$out0,v24
	vncipher	$out1,$out1,v24
	vncipher	$out2,$out2,v24
	vncipher	$out3,$out3,v24
	vncipher	$out4,$out4,v24
	vncipher	$out5,$out5,v24
	vncipher	$out6,$out6,v24
	vncipher	$out7,$out7,v24
	lvx		v24,$x20,$key_		# round[3]
	addi		$key_,$key_,0x20

	vncipher	$out0,$out0,v25
	vncipher	$out1,$out1,v25
	vncipher	$out2,$out2,v25
	vncipher	$out3,$out3,v25
	vncipher	$out4,$out4,v25
	vncipher	$out5,$out5,v25
	vncipher	$out6,$out6,v25
	vncipher	$out7,$out7,v25
	lvx		v25,$x10,$key_		# round[4]
	bdnz		Loop_cbc_dec8x

	subic		$len,$len,128		# $len-=128
	vncipher	$out0,$out0,v24
	vncipher	$out1,$out1,v24
	vncipher	$out2,$out2,v24
	vncipher	$out3,$out3,v24
	vncipher	$out4,$out4,v24
	vncipher	$out5,$out5,v24
	vncipher	$out6,$out6,v24
	vncipher	$out7,$out7,v24

	subfe.		r0,r0,r0		# borrow?-1:0
	vncipher	$out0,$out0,v25
	vncipher	$out1,$out1,v25
	vncipher	$out2,$out2,v25
	vncipher	$out3,$out3,v25
	vncipher	$out4,$out4,v25
	vncipher	$out5,$out5,v25
	vncipher	$out6,$out6,v25
	vncipher	$out7,$out7,v25

	and		r0,r0,$len
	vncipher	$out0,$out0,v26
	vncipher	$out1,$out1,v26
	vncipher	$out2,$out2,v26
	vncipher	$out3,$out3,v26
	vncipher	$out4,$out4,v26
	vncipher	$out5,$out5,v26
	vncipher	$out6,$out6,v26
	vncipher	$out7,$out7,v26

	add		$inp,$inp,r0		# $inp is adjusted in such
						# way that at exit from the
						# loop inX-in7 are loaded
						# with last "words"
	vncipher	$out0,$out0,v27
	vncipher	$out1,$out1,v27
	vncipher	$out2,$out2,v27
	vncipher	$out3,$out3,v27
	vncipher	$out4,$out4,v27
	vncipher	$out5,$out5,v27
	vncipher	$out6,$out6,v27
	vncipher	$out7,$out7,v27

	addi		$key_,$sp,$FRAME+15	# rewind $key_
	vncipher	$out0,$out0,v28
	vncipher	$out1,$out1,v28
	vncipher	$out2,$out2,v28
	vncipher	$out3,$out3,v28
	vncipher	$out4,$out4,v28
	vncipher	$out5,$out5,v28
	vncipher	$out6,$out6,v28
	vncipher	$out7,$out7,v28
	lvx		v24,$x00,$key_		# re-pre-load round[1]

	vncipher	$out0,$out0,v29
	vncipher	$out1,$out1,v29
	vncipher	$out2,$out2,v29
	vncipher	$out3,$out3,v29
	vncipher	$out4,$out4,v29
	vncipher	$out5,$out5,v29
	vncipher	$out6,$out6,v29
	vncipher	$out7,$out7,v29
	lvx		v25,$x10,$key_		# re-pre-load round[2]

	vncipher	$out0,$out0,v30
	 vxor		$ivec,$ivec,v31		# xor with last round key
	vncipher	$out1,$out1,v30
	 vxor		$in0,$in0,v31
	vncipher	$out2,$out2,v30
	 vxor		$in1,$in1,v31
	vncipher	$out3,$out3,v30
	 vxor		$in2,$in2,v31
	vncipher	$out4,$out4,v30
	 vxor		$in3,$in3,v31
	vncipher	$out5,$out5,v30
	 vxor		$in4,$in4,v31
	vncipher	$out6,$out6,v30
	 vxor		$in5,$in5,v31
	vncipher	$out7,$out7,v30
	 vxor		$in6,$in6,v31

	vncipherlast	$out0,$out0,$ivec
	vncipherlast	$out1,$out1,$in0
	 lvx_u		$in0,$x00,$inp		# load next input block
	vncipherlast	$out2,$out2,$in1
	 lvx_u		$in1,$x10,$inp
	vncipherlast	$out3,$out3,$in2
	 le?vperm	$in0,$in0,$in0,$inpperm
	 lvx_u		$in2,$x20,$inp
	vncipherlast	$out4,$out4,$in3
	 le?vperm	$in1,$in1,$in1,$inpperm
	 lvx_u		$in3,$x30,$inp
	vncipherlast	$out5,$out5,$in4
	 le?vperm	$in2,$in2,$in2,$inpperm
	 lvx_u		$in4,$x40,$inp
	vncipherlast	$out6,$out6,$in5
	 le?vperm	$in3,$in3,$in3,$inpperm
	 lvx_u		$in5,$x50,$inp
	vncipherlast	$out7,$out7,$in6
	 le?vperm	$in4,$in4,$in4,$inpperm
	 lvx_u		$in6,$x60,$inp
	vmr		$ivec,$in7
	 le?vperm	$in5,$in5,$in5,$inpperm
	 lvx_u		$in7,$x70,$inp
	 addi		$inp,$inp,0x80

	le?vperm	$out0,$out0,$out0,$inpperm
	le?vperm	$out1,$out1,$out1,$inpperm
	stvx_u		$out0,$x00,$out
	 le?vperm	$in6,$in6,$in6,$inpperm
	 vxor		$out0,$in0,$rndkey0
	le?vperm	$out2,$out2,$out2,$inpperm
	stvx_u		$out1,$x10,$out
	 le?vperm	$in7,$in7,$in7,$inpperm
	 vxor		$out1,$in1,$rndkey0
	le?vperm	$out3,$out3,$out3,$inpperm
	stvx_u		$out2,$x20,$out
	 vxor		$out2,$in2,$rndkey0
	le?vperm	$out4,$out4,$out4,$inpperm
	stvx_u		$out3,$x30,$out
	 vxor		$out3,$in3,$rndkey0
	le?vperm	$out5,$out5,$out5,$inpperm
	stvx_u		$out4,$x40,$out
	 vxor		$out4,$in4,$rndkey0
	le?vperm	$out6,$out6,$out6,$inpperm
	stvx_u		$out5,$x50,$out
	 vxor		$out5,$in5,$rndkey0
	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u		$out6,$x60,$out
	 vxor		$out6,$in6,$rndkey0
	stvx_u		$out7,$x70,$out
	addi		$out,$out,0x80
	 vxor		$out7,$in7,$rndkey0

	mtctr		$rounds
	beq		Loop_cbc_dec8x		# did $len-=128 borrow?

	addic.		$len,$len,128
	beq		Lcbc_dec8x_done
	nop
	nop

Loop_cbc_dec8x_tail:				# up to 7 "words" tail...
	vncipher	$out1,$out1,v24
	vncipher	$out2,$out2,v24
	vncipher	$out3,$out3,v24
	vncipher	$out4,$out4,v24
	vncipher	$out5,$out5,v24
	vncipher	$out6,$out6,v24
	vncipher	$out7,$out7,v24
	lvx		v24,$x20,$key_		# round[3]
	addi		$key_,$key_,0x20

	vncipher	$out1,$out1,v25
	vncipher	$out2,$out2,v25
	vncipher	$out3,$out3,v25
	vncipher	$out4,$out4,v25
	vncipher	$out5,$out5,v25
	vncipher	$out6,$out6,v25
	vncipher	$out7,$out7,v25
	lvx		v25,$x10,$key_		# round[4]
	bdnz		Loop_cbc_dec8x_tail

	vncipher	$out1,$out1,v24
	vncipher	$out2,$out2,v24
	vncipher	$out3,$out3,v24
	vncipher	$out4,$out4,v24
	vncipher	$out5,$out5,v24
	vncipher	$out6,$out6,v24
	vncipher	$out7,$out7,v24

	vncipher	$out1,$out1,v25
	vncipher	$out2,$out2,v25
	vncipher	$out3,$out3,v25
	vncipher	$out4,$out4,v25
	vncipher	$out5,$out5,v25
	vncipher	$out6,$out6,v25
	vncipher	$out7,$out7,v25

	vncipher	$out1,$out1,v26
	vncipher	$out2,$out2,v26
	vncipher	$out3,$out3,v26
	vncipher	$out4,$out4,v26
	vncipher	$out5,$out5,v26
	vncipher	$out6,$out6,v26
	vncipher	$out7,$out7,v26

	vncipher	$out1,$out1,v27
	vncipher	$out2,$out2,v27
	vncipher	$out3,$out3,v27
	vncipher	$out4,$out4,v27
	vncipher	$out5,$out5,v27
	vncipher	$out6,$out6,v27
	vncipher	$out7,$out7,v27

	vncipher	$out1,$out1,v28
	vncipher	$out2,$out2,v28
	vncipher	$out3,$out3,v28
	vncipher	$out4,$out4,v28
	vncipher	$out5,$out5,v28
	vncipher	$out6,$out6,v28
	vncipher	$out7,$out7,v28

	vncipher	$out1,$out1,v29
	vncipher	$out2,$out2,v29
	vncipher	$out3,$out3,v29
	vncipher	$out4,$out4,v29
	vncipher	$out5,$out5,v29
	vncipher	$out6,$out6,v29
	vncipher	$out7,$out7,v29

	vncipher	$out1,$out1,v30
	 vxor		$ivec,$ivec,v31		# last round key
	vncipher	$out2,$out2,v30
	 vxor		$in1,$in1,v31
	vncipher	$out3,$out3,v30
	 vxor		$in2,$in2,v31
	vncipher	$out4,$out4,v30
	 vxor		$in3,$in3,v31
	vncipher	$out5,$out5,v30
	 vxor		$in4,$in4,v31
	vncipher	$out6,$out6,v30
	 vxor		$in5,$in5,v31
	vncipher	$out7,$out7,v30
	 vxor		$in6,$in6,v31

	cmplwi		$len,32			# switch($len)
	blt		Lcbc_dec8x_one
	nop
	beq		Lcbc_dec8x_two
	cmplwi		$len,64
	blt		Lcbc_dec8x_three
	nop
	beq		Lcbc_dec8x_four
	cmplwi		$len,96
	blt		Lcbc_dec8x_five
	nop
	beq		Lcbc_dec8x_six

Lcbc_dec8x_seven:
	vncipherlast	$out1,$out1,$ivec
	vncipherlast	$out2,$out2,$in1
	vncipherlast	$out3,$out3,$in2
	vncipherlast	$out4,$out4,$in3
	vncipherlast	$out5,$out5,$in4
	vncipherlast	$out6,$out6,$in5
	vncipherlast	$out7,$out7,$in6
	vmr		$ivec,$in7

	le?vperm	$out1,$out1,$out1,$inpperm
	le?vperm	$out2,$out2,$out2,$inpperm
	stvx_u		$out1,$x00,$out
	le?vperm	$out3,$out3,$out3,$inpperm
	stvx_u		$out2,$x10,$out
	le?vperm	$out4,$out4,$out4,$inpperm
	stvx_u		$out3,$x20,$out
	le?vperm	$out5,$out5,$out5,$inpperm
	stvx_u		$out4,$x30,$out
	le?vperm	$out6,$out6,$out6,$inpperm
	stvx_u		$out5,$x40,$out
	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u		$out6,$x50,$out
	stvx_u		$out7,$x60,$out
	addi		$out,$out,0x70
	b		Lcbc_dec8x_done

.align	5
Lcbc_dec8x_six:
	vncipherlast	$out2,$out2,$ivec
	vncipherlast	$out3,$out3,$in2
	vncipherlast	$out4,$out4,$in3
	vncipherlast	$out5,$out5,$in4
	vncipherlast	$out6,$out6,$in5
	vncipherlast	$out7,$out7,$in6
	vmr		$ivec,$in7

	le?vperm	$out2,$out2,$out2,$inpperm
	le?vperm	$out3,$out3,$out3,$inpperm
	stvx_u		$out2,$x00,$out
	le?vperm	$out4,$out4,$out4,$inpperm
	stvx_u		$out3,$x10,$out
	le?vperm	$out5,$out5,$out5,$inpperm
	stvx_u		$out4,$x20,$out
	le?vperm	$out6,$out6,$out6,$inpperm
	stvx_u		$out5,$x30,$out
	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u		$out6,$x40,$out
	stvx_u		$out7,$x50,$out
	addi		$out,$out,0x60
	b		Lcbc_dec8x_done

.align	5
Lcbc_dec8x_five:
	vncipherlast	$out3,$out3,$ivec
	vncipherlast	$out4,$out4,$in3
	vncipherlast	$out5,$out5,$in4
	vncipherlast	$out6,$out6,$in5
	vncipherlast	$out7,$out7,$in6
	vmr		$ivec,$in7

	le?vperm	$out3,$out3,$out3,$inpperm
	le?vperm	$out4,$out4,$out4,$inpperm
	stvx_u		$out3,$x00,$out
	le?vperm	$out5,$out5,$out5,$inpperm
	stvx_u		$out4,$x10,$out
	le?vperm	$out6,$out6,$out6,$inpperm
	stvx_u		$out5,$x20,$out
	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u		$out6,$x30,$out
	stvx_u		$out7,$x40,$out
	addi		$out,$out,0x50
	b		Lcbc_dec8x_done

.align	5
Lcbc_dec8x_four:
	vncipherlast	$out4,$out4,$ivec
	vncipherlast	$out5,$out5,$in4
	vncipherlast	$out6,$out6,$in5
	vncipherlast	$out7,$out7,$in6
	vmr		$ivec,$in7

	le?vperm	$out4,$out4,$out4,$inpperm
	le?vperm	$out5,$out5,$out5,$inpperm
	stvx_u		$out4,$x00,$out
	le?vperm	$out6,$out6,$out6,$inpperm
	stvx_u		$out5,$x10,$out
	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u		$out6,$x20,$out
	stvx_u		$out7,$x30,$out
	addi		$out,$out,0x40
	b		Lcbc_dec8x_done

.align	5
Lcbc_dec8x_three:
	vncipherlast	$out5,$out5,$ivec
	vncipherlast	$out6,$out6,$in5
	vncipherlast	$out7,$out7,$in6
	vmr		$ivec,$in7

	le?vperm	$out5,$out5,$out5,$inpperm
	le?vperm	$out6,$out6,$out6,$inpperm
	stvx_u		$out5,$x00,$out
	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u		$out6,$x10,$out
	stvx_u		$out7,$x20,$out
	addi		$out,$out,0x30
	b		Lcbc_dec8x_done

.align	5
Lcbc_dec8x_two:
	vncipherlast	$out6,$out6,$ivec
	vncipherlast	$out7,$out7,$in6
	vmr		$ivec,$in7

	le?vperm	$out6,$out6,$out6,$inpperm
	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u		$out6,$x00,$out
	stvx_u		$out7,$x10,$out
	addi		$out,$out,0x20
	b		Lcbc_dec8x_done

.align	5
Lcbc_dec8x_one:
	vncipherlast	$out7,$out7,$ivec
	vmr		$ivec,$in7

	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u		$out7,0,$out
	addi		$out,$out,0x10

Lcbc_dec8x_done:
	le?vperm	$ivec,$ivec,$ivec,$inpperm
	stvx_u		$ivec,0,$ivp		# write [unaligned] iv

	li		r10,`$FRAME+15`
	li		r11,`$FRAME+31`
	stvx		$inpperm,r10,$sp	# wipe copies of round keys
	addi		r10,r10,32
	stvx		$inpperm,r11,$sp
	addi		r11,r11,32
	stvx		$inpperm,r10,$sp
	addi		r10,r10,32
	stvx		$inpperm,r11,$sp
	addi		r11,r11,32
	stvx		$inpperm,r10,$sp
	addi		r10,r10,32
	stvx		$inpperm,r11,$sp
	addi		r11,r11,32
	stvx		$inpperm,r10,$sp
	addi		r10,r10,32
	stvx		$inpperm,r11,$sp
	addi		r11,r11,32

	mtspr		256,$vrsave
	lvx		v20,r10,$sp		# ABI says so
	addi		r10,r10,32
	lvx		v21,r11,$sp
	addi		r11,r11,32
	lvx		v22,r10,$sp
	addi		r10,r10,32
	lvx		v23,r11,$sp
	addi		r11,r11,32
	lvx		v24,r10,$sp
	addi		r10,r10,32
	lvx		v25,r11,$sp
	addi		r11,r11,32
	lvx		v26,r10,$sp
	addi		r10,r10,32
	lvx		v27,r11,$sp
	addi		r11,r11,32
	lvx		v28,r10,$sp
	addi		r10,r10,32
	lvx		v29,r11,$sp
	addi		r11,r11,32
	lvx		v30,r10,$sp
	lvx		v31,r11,$sp
	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
	blr
	.long		0
	.byte		0,12,0x04,0,0x80,6,6,0
	.long		0
.size	.${prefix}_cbc_encrypt,.-.${prefix}_cbc_encrypt
___
}}	}}}

#########################################################################
{{{	# CTR procedure[s]						#
my ($inp,$out,$len,$key,$ivp,$x10,$rounds,$idx)=map("r$_",(3..10));
my ($rndkey0,$rndkey1,$inout,$tmp)=		map("v$_",(0..3));
my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm,$one)=
						map("v$_",(4..11));
my $dat=$tmp;

$code.=<<___;
.globl	.${prefix}_ctr32_encrypt_blocks
.align	5
.${prefix}_ctr32_encrypt_blocks:
	${UCMP}i	$len,1
	bltlr-

	lis		r0,0xfff0
	mfspr		$vrsave,256
	mtspr		256,r0

	li		$idx,15
	vxor		$rndkey0,$rndkey0,$rndkey0
	le?vspltisb	$tmp,0x0f

	lvx		$ivec,0,$ivp		# load [unaligned] iv
	lvsl		$inpperm,0,$ivp
	lvx		$inptail,$idx,$ivp
	 vspltisb	$one,1
	le?vxor		$inpperm,$inpperm,$tmp
	vperm		$ivec,$ivec,$inptail,$inpperm
	 vsldoi		$one,$rndkey0,$one,1
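	# $one = {0,0,0,1}: vspltisb set every byte to 1 and the vsldoi
	# above shifted in 15 zero bytes from $rndkey0, leaving a single
	# 1 in the last byte, so vadduwm below increments only the
	# rightmost 32-bit word of the counter.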

	neg		r11,$inp
	?lvsl		$keyperm,0,$key		# prepare for unaligned key
	lwz		$rounds,240($key)

	lvsr		$inpperm,0,r11		# prepare for unaligned load
	lvx		$inptail,0,$inp
	addi		$inp,$inp,15		# 15 is not typo
	le?vxor		$inpperm,$inpperm,$tmp

	srwi		$rounds,$rounds,1
	li		$idx,16
	subi		$rounds,$rounds,1

	${UCMP}i	$len,8
	bge		_aesp8_ctr32_encrypt8x

	?lvsr		$outperm,0,$out		# prepare for unaligned store
	vspltisb	$outmask,-1
	lvx		$outhead,0,$out
	?vperm		$outmask,$rndkey0,$outmask,$outperm
	le?vxor		$outperm,$outperm,$tmp

	lvx		$rndkey0,0,$key
	mtctr		$rounds
	lvx		$rndkey1,$idx,$key
	addi		$idx,$idx,16
	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vxor		$inout,$ivec,$rndkey0
	lvx		$rndkey0,$idx,$key
	addi		$idx,$idx,16
	b		Loop_ctr32_enc

.align	5
Loop_ctr32_enc:
	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
	vcipher		$inout,$inout,$rndkey1
	lvx		$rndkey1,$idx,$key
	addi		$idx,$idx,16
	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vcipher		$inout,$inout,$rndkey0
	lvx		$rndkey0,$idx,$key
	addi		$idx,$idx,16
	bdnz		Loop_ctr32_enc

	vadduwm		$ivec,$ivec,$one
	 vmr		$dat,$inptail
	 lvx		$inptail,0,$inp
	 addi		$inp,$inp,16
	 subic.		$len,$len,1		# blocks--

	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
	vcipher		$inout,$inout,$rndkey1
	lvx		$rndkey1,$idx,$key
	 vperm		$dat,$dat,$inptail,$inpperm
	 li		$idx,16
	?vperm		$rndkey1,$rndkey0,$rndkey1,$keyperm
	 lvx		$rndkey0,0,$key
	vxor		$dat,$dat,$rndkey1	# last round key
	vcipherlast	$inout,$inout,$dat

	 lvx		$rndkey1,$idx,$key
	 addi		$idx,$idx,16
	vperm		$inout,$inout,$inout,$outperm
	vsel		$dat,$outhead,$inout,$outmask
	 mtctr		$rounds
	 ?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vmr		$outhead,$inout
	 vxor		$inout,$ivec,$rndkey0
	 lvx		$rndkey0,$idx,$key
	 addi		$idx,$idx,16
	stvx		$dat,0,$out
	addi		$out,$out,16
	bne		Loop_ctr32_enc

	addi		$out,$out,-1
	lvx		$inout,0,$out		# redundant in aligned case
	vsel		$inout,$outhead,$inout,$outmask
	stvx		$inout,0,$out

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,6,0
	.long		0
___
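# CTR keystream blocks are independent, E(K, counter+i), so unlike CBC
# encryption the mode parallelizes in both directions. The procedure
# below keeps eight counter values in flight, stepping them by the
# vector increments $one and $two, and interleaves eight vcipher
# chains the same way the 8x CBC decrypt above does.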
#########################################################################
{{	# Optimized CTR procedure					#
my $key_="r11";
my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
    $x00=0 if ($flavour =~ /osx/);
my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10,12..14));
my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(15..22));
my $rndkey0="v23";	# v24-v25 rotating buffer for first round keys
			# v26-v31 last 6 round keys
my ($tmp,$keyperm)=($in3,$in4);	# aliases with "caller", redundant assignment
my ($two,$three,$four)=($outhead,$outperm,$outmask);

$code.=<<___;
.align	5
_aesp8_ctr32_encrypt8x:
	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
	li		r10,`$FRAME+8*16+15`
	li		r11,`$FRAME+8*16+31`
	stvx		v20,r10,$sp		# ABI says so
	addi		r10,r10,32
	stvx		v21,r11,$sp
	addi		r11,r11,32
	stvx		v22,r10,$sp
	addi		r10,r10,32
	stvx		v23,r11,$sp
	addi		r11,r11,32
	stvx		v24,r10,$sp
	addi		r10,r10,32
	stvx		v25,r11,$sp
	addi		r11,r11,32
	stvx		v26,r10,$sp
	addi		r10,r10,32
	stvx		v27,r11,$sp
	addi		r11,r11,32
	stvx		v28,r10,$sp
	addi		r10,r10,32
	stvx		v29,r11,$sp
	addi		r11,r11,32
	stvx		v30,r10,$sp
	stvx		v31,r11,$sp
	li		r0,-1
	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
	li		$x10,0x10
	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
	li		$x20,0x20
	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
	li		$x30,0x30
	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
	li		$x40,0x40
	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
	li		$x50,0x50
	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
	li		$x60,0x60
	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
	li		$x70,0x70
	mtspr		256,r0

	subi		$rounds,$rounds,3	# -4 in total

	lvx		$rndkey0,$x00,$key	# load key schedule
	lvx		v30,$x10,$key
	addi		$key,$key,0x20
	lvx		v31,$x00,$key
	?vperm		$rndkey0,$rndkey0,v30,$keyperm
	addi		$key_,$sp,$FRAME+15
	mtctr		$rounds

Load_ctr32_enc_key:
	?vperm		v24,v30,v31,$keyperm
	lvx		v30,$x10,$key
	addi		$key,$key,0x20
	stvx		v24,$x00,$key_		# off-load round[1]
	?vperm		v25,v31,v30,$keyperm
	lvx		v31,$x00,$key
	stvx		v25,$x10,$key_		# off-load round[2]
	addi		$key_,$key_,0x20
	bdnz		Load_ctr32_enc_key

	lvx		v26,$x10,$key
	?vperm		v24,v30,v31,$keyperm
	lvx		v27,$x20,$key
	stvx		v24,$x00,$key_		# off-load round[3]
	?vperm		v25,v31,v26,$keyperm
	lvx		v28,$x30,$key
	stvx		v25,$x10,$key_		# off-load round[4]
	addi		$key_,$sp,$FRAME+15	# rewind $key_
	?vperm		v26,v26,v27,$keyperm
	lvx		v29,$x40,$key
	?vperm		v27,v27,v28,$keyperm
	lvx		v30,$x50,$key
	?vperm		v28,v28,v29,$keyperm
	lvx		v31,$x60,$key
	?vperm		v29,v29,v30,$keyperm
	lvx		$out0,$x70,$key		# borrow $out0
	?vperm		v30,v30,v31,$keyperm
	lvx		v24,$x00,$key_		# pre-load round[1]
	?vperm		v31,v31,$out0,$keyperm
	lvx		v25,$x10,$key_		# pre-load round[2]

	vadduwm		$two,$one,$one
	subi		$inp,$inp,15		# undo "caller"
	$SHL		$len,$len,4

	vadduwm		$out1,$ivec,$one	# counter values ...
	vadduwm		$out2,$ivec,$two
	vxor		$out0,$ivec,$rndkey0	# ... xored with rndkey[0]
	 le?li		$idx,8
	vadduwm		$out3,$out1,$two
	vxor		$out1,$out1,$rndkey0
	 le?lvsl	$inpperm,0,$idx
	vadduwm		$out4,$out2,$two
	vxor		$out2,$out2,$rndkey0
	 le?vspltisb	$tmp,0x0f
	vadduwm		$out5,$out3,$two
	vxor		$out3,$out3,$rndkey0
	 le?vxor	$inpperm,$inpperm,$tmp	# transform for lvx_u/stvx_u
	vadduwm		$out6,$out4,$two
	vxor		$out4,$out4,$rndkey0
	vadduwm		$out7,$out5,$two
	vxor		$out5,$out5,$rndkey0
	vadduwm		$ivec,$out6,$two	# next counter value
	vxor		$out6,$out6,$rndkey0
	vxor		$out7,$out7,$rndkey0

	mtctr		$rounds
	b		Loop_ctr32_enc8x
.align	5
Loop_ctr32_enc8x:
	vcipher		$out0,$out0,v24
	vcipher		$out1,$out1,v24
	vcipher		$out2,$out2,v24
	vcipher		$out3,$out3,v24
	vcipher		$out4,$out4,v24
	vcipher		$out5,$out5,v24
	vcipher		$out6,$out6,v24
	vcipher		$out7,$out7,v24
Loop_ctr32_enc8x_middle:
	lvx		v24,$x20,$key_		# round[3]
	addi		$key_,$key_,0x20

	vcipher		$out0,$out0,v25
	vcipher		$out1,$out1,v25
	vcipher		$out2,$out2,v25
	vcipher		$out3,$out3,v25
	vcipher		$out4,$out4,v25
	vcipher		$out5,$out5,v25
	vcipher		$out6,$out6,v25
	vcipher		$out7,$out7,v25
	lvx		v25,$x10,$key_		# round[4]
	bdnz		Loop_ctr32_enc8x

	subic		r11,$len,256		# $len-256, borrow $key_
	vcipher		$out0,$out0,v24
	vcipher		$out1,$out1,v24
	vcipher		$out2,$out2,v24
	vcipher		$out3,$out3,v24
	vcipher		$out4,$out4,v24
	vcipher		$out5,$out5,v24
	vcipher		$out6,$out6,v24
	vcipher		$out7,$out7,v24

	subfe		r0,r0,r0		# borrow?-1:0
	vcipher		$out0,$out0,v25
	vcipher		$out1,$out1,v25
	vcipher		$out2,$out2,v25
	vcipher		$out3,$out3,v25
	vcipher		$out4,$out4,v25
	vcipher		$out5,$out5,v25
	vcipher		$out6,$out6,v25
	vcipher		$out7,$out7,v25

	and		r0,r0,r11
	addi		$key_,$sp,$FRAME+15	# rewind $key_
	vcipher		$out0,$out0,v26
	vcipher		$out1,$out1,v26
	vcipher		$out2,$out2,v26
	vcipher		$out3,$out3,v26
	vcipher		$out4,$out4,v26
	vcipher		$out5,$out5,v26
	vcipher		$out6,$out6,v26
	vcipher		$out7,$out7,v26
	lvx		v24,$x00,$key_		# re-pre-load round[1]

	subic		$len,$len,129		# $len-=129
	vcipher		$out0,$out0,v27
	addi		$len,$len,1		# $len-=128 really
	vcipher		$out1,$out1,v27
	vcipher		$out2,$out2,v27
	vcipher		$out3,$out3,v27
	vcipher		$out4,$out4,v27
	vcipher		$out5,$out5,v27
	vcipher		$out6,$out6,v27
	vcipher		$out7,$out7,v27
	lvx		v25,$x10,$key_		# re-pre-load round[2]

	vcipher		$out0,$out0,v28
	 lvx_u		$in0,$x00,$inp		# load input
	vcipher		$out1,$out1,v28
	 lvx_u		$in1,$x10,$inp
	vcipher		$out2,$out2,v28
	 lvx_u		$in2,$x20,$inp
	vcipher		$out3,$out3,v28
	 lvx_u		$in3,$x30,$inp
	vcipher		$out4,$out4,v28
	 lvx_u		$in4,$x40,$inp
	vcipher		$out5,$out5,v28
	 lvx_u		$in5,$x50,$inp
	vcipher		$out6,$out6,v28
	 lvx_u		$in6,$x60,$inp
	vcipher		$out7,$out7,v28
	 lvx_u		$in7,$x70,$inp
	 addi		$inp,$inp,0x80

	vcipher		$out0,$out0,v29
	 le?vperm	$in0,$in0,$in0,$inpperm
	vcipher		$out1,$out1,v29
	 le?vperm	$in1,$in1,$in1,$inpperm
	vcipher		$out2,$out2,v29
	 le?vperm	$in2,$in2,$in2,$inpperm
	vcipher		$out3,$out3,v29
	 le?vperm	$in3,$in3,$in3,$inpperm
	vcipher		$out4,$out4,v29
	 le?vperm	$in4,$in4,$in4,$inpperm
	vcipher		$out5,$out5,v29
	 le?vperm	$in5,$in5,$in5,$inpperm
	vcipher		$out6,$out6,v29
	 le?vperm	$in6,$in6,$in6,$inpperm
	vcipher		$out7,$out7,v29
	 le?vperm	$in7,$in7,$in7,$inpperm

	add		$inp,$inp,r0		# $inp is adjusted in such
						# way that at exit from the
						# loop inX-in7 are loaded
						# with last "words"
	subfe.		r0,r0,r0		# borrow?-1:0
	vcipher		$out0,$out0,v30
	 vxor		$in0,$in0,v31		# xor with last round key
	vcipher		$out1,$out1,v30
	 vxor		$in1,$in1,v31
	vcipher		$out2,$out2,v30
	 vxor		$in2,$in2,v31
	vcipher		$out3,$out3,v30
	 vxor		$in3,$in3,v31
	vcipher		$out4,$out4,v30
	 vxor		$in4,$in4,v31
	vcipher		$out5,$out5,v30
	 vxor		$in5,$in5,v31
	vcipher		$out6,$out6,v30
	 vxor		$in6,$in6,v31
	vcipher		$out7,$out7,v30
	 vxor		$in7,$in7,v31

	bne		Lctr32_enc8x_break	# did $len-129 borrow?

	vcipherlast	$in0,$out0,$in0
	vcipherlast	$in1,$out1,$in1
	 vadduwm	$out1,$ivec,$one	# counter values ...
	vcipherlast	$in2,$out2,$in2
	 vadduwm	$out2,$ivec,$two
	 vxor		$out0,$ivec,$rndkey0	# ... xored with rndkey[0]
	vcipherlast	$in3,$out3,$in3
	 vadduwm	$out3,$out1,$two
	 vxor		$out1,$out1,$rndkey0
	vcipherlast	$in4,$out4,$in4
	 vadduwm	$out4,$out2,$two
	 vxor		$out2,$out2,$rndkey0
	vcipherlast	$in5,$out5,$in5
	 vadduwm	$out5,$out3,$two
	 vxor		$out3,$out3,$rndkey0
	vcipherlast	$in6,$out6,$in6
	 vadduwm	$out6,$out4,$two
	 vxor		$out4,$out4,$rndkey0
	vcipherlast	$in7,$out7,$in7
	 vadduwm	$out7,$out5,$two
	 vxor		$out5,$out5,$rndkey0
	le?vperm	$in0,$in0,$in0,$inpperm
	 vadduwm	$ivec,$out6,$two	# next counter value
	 vxor		$out6,$out6,$rndkey0
	le?vperm	$in1,$in1,$in1,$inpperm
	 vxor		$out7,$out7,$rndkey0
	mtctr		$rounds

	 vcipher	$out0,$out0,v24
	stvx_u		$in0,$x00,$out
	le?vperm	$in2,$in2,$in2,$inpperm
	 vcipher	$out1,$out1,v24
	stvx_u		$in1,$x10,$out
	le?vperm	$in3,$in3,$in3,$inpperm
	 vcipher	$out2,$out2,v24
	stvx_u		$in2,$x20,$out
	le?vperm	$in4,$in4,$in4,$inpperm
	 vcipher	$out3,$out3,v24
	stvx_u		$in3,$x30,$out
	le?vperm	$in5,$in5,$in5,$inpperm
	 vcipher	$out4,$out4,v24
	stvx_u		$in4,$x40,$out
	le?vperm	$in6,$in6,$in6,$inpperm
	 vcipher	$out5,$out5,v24
	stvx_u		$in5,$x50,$out
	le?vperm	$in7,$in7,$in7,$inpperm
	 vcipher	$out6,$out6,v24
	stvx_u		$in6,$x60,$out
	 vcipher	$out7,$out7,v24
	stvx_u		$in7,$x70,$out
	addi		$out,$out,0x80

	b		Loop_ctr32_enc8x_middle

.align	5
Lctr32_enc8x_break:
	cmpwi		$len,-0x60
	blt		Lctr32_enc8x_one
	nop
	beq		Lctr32_enc8x_two
	cmpwi		$len,-0x40
	blt		Lctr32_enc8x_three
	nop
	beq		Lctr32_enc8x_four
	cmpwi		$len,-0x20
	blt		Lctr32_enc8x_five
	nop
	beq		Lctr32_enc8x_six
	cmpwi		$len,0x00
	blt		Lctr32_enc8x_seven

Lctr32_enc8x_eight:
	vcipherlast	$out0,$out0,$in0
	vcipherlast	$out1,$out1,$in1
	vcipherlast	$out2,$out2,$in2
	vcipherlast	$out3,$out3,$in3
	vcipherlast	$out4,$out4,$in4
	vcipherlast	$out5,$out5,$in5
	vcipherlast	$out6,$out6,$in6
	vcipherlast	$out7,$out7,$in7

	le?vperm	$out0,$out0,$out0,$inpperm
	le?vperm	$out1,$out1,$out1,$inpperm
	stvx_u		$out0,$x00,$out
	le?vperm	$out2,$out2,$out2,$inpperm
	stvx_u		$out1,$x10,$out
	le?vperm	$out3,$out3,$out3,$inpperm
	stvx_u		$out2,$x20,$out
	le?vperm	$out4,$out4,$out4,$inpperm
	stvx_u		$out3,$x30,$out
	le?vperm	$out5,$out5,$out5,$inpperm
	stvx_u		$out4,$x40,$out
	le?vperm	$out6,$out6,$out6,$inpperm
	stvx_u		$out5,$x50,$out
	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u		$out6,$x60,$out
	stvx_u		$out7,$x70,$out
	addi		$out,$out,0x80
	b		Lctr32_enc8x_done

.align	5
Lctr32_enc8x_seven:
	vcipherlast	$out0,$out0,$in1
	vcipherlast	$out1,$out1,$in2
	vcipherlast	$out2,$out2,$in3
	vcipherlast	$out3,$out3,$in4
	vcipherlast	$out4,$out4,$in5
	vcipherlast	$out5,$out5,$in6
	vcipherlast	$out6,$out6,$in7

	le?vperm	$out0,$out0,$out0,$inpperm
	le?vperm	$out1,$out1,$out1,$inpperm
	stvx_u		$out0,$x00,$out
	le?vperm	$out2,$out2,$out2,$inpperm
	stvx_u		$out1,$x10,$out
	le?vperm	$out3,$out3,$out3,$inpperm
	stvx_u		$out2,$x20,$out
	le?vperm	$out4,$out4,$out4,$inpperm
	stvx_u		$out3,$x30,$out
	le?vperm	$out5,$out5,$out5,$inpperm
	stvx_u		$out4,$x40,$out
	le?vperm	$out6,$out6,$out6,$inpperm
	stvx_u		$out5,$x50,$out
	stvx_u		$out6,$x60,$out
	addi		$out,$out,0x70
	b		Lctr32_enc8x_done

.align	5
Lctr32_enc8x_six:
	vcipherlast	$out0,$out0,$in2
	vcipherlast	$out1,$out1,$in3
	vcipherlast	$out2,$out2,$in4
	vcipherlast	$out3,$out3,$in5
	vcipherlast	$out4,$out4,$in6
	vcipherlast	$out5,$out5,$in7

	le?vperm	$out0,$out0,$out0,$inpperm
	le?vperm	$out1,$out1,$out1,$inpperm
	stvx_u		$out0,$x00,$out
	le?vperm	$out2,$out2,$out2,$inpperm
	stvx_u		$out1,$x10,$out
	le?vperm	$out3,$out3,$out3,$inpperm
	stvx_u		$out2,$x20,$out
	le?vperm	$out4,$out4,$out4,$inpperm
	stvx_u		$out3,$x30,$out
	le?vperm	$out5,$out5,$out5,$inpperm
	stvx_u		$out4,$x40,$out
	stvx_u		$out5,$x50,$out
	addi		$out,$out,0x60
	b		Lctr32_enc8x_done

.align	5
Lctr32_enc8x_five:
	vcipherlast	$out0,$out0,$in3
	vcipherlast	$out1,$out1,$in4
	vcipherlast	$out2,$out2,$in5
	vcipherlast	$out3,$out3,$in6
	vcipherlast	$out4,$out4,$in7

	le?vperm	$out0,$out0,$out0,$inpperm
	le?vperm	$out1,$out1,$out1,$inpperm
	stvx_u		$out0,$x00,$out
	le?vperm	$out2,$out2,$out2,$inpperm
	stvx_u		$out1,$x10,$out
	le?vperm	$out3,$out3,$out3,$inpperm
	stvx_u		$out2,$x20,$out
	le?vperm	$out4,$out4,$out4,$inpperm
	stvx_u		$out3,$x30,$out
	stvx_u		$out4,$x40,$out
	addi		$out,$out,0x50
	b		Lctr32_enc8x_done

.align	5
Lctr32_enc8x_four:
	vcipherlast	$out0,$out0,$in4
	vcipherlast	$out1,$out1,$in5
	vcipherlast	$out2,$out2,$in6
	vcipherlast	$out3,$out3,$in7

	le?vperm	$out0,$out0,$out0,$inpperm
	le?vperm	$out1,$out1,$out1,$inpperm
	stvx_u		$out0,$x00,$out
	le?vperm	$out2,$out2,$out2,$inpperm
	stvx_u		$out1,$x10,$out
	le?vperm	$out3,$out3,$out3,$inpperm
	stvx_u		$out2,$x20,$out
	stvx_u		$out3,$x30,$out
	addi		$out,$out,0x40
	b		Lctr32_enc8x_done

.align	5
Lctr32_enc8x_three:
	vcipherlast	$out0,$out0,$in5
	vcipherlast	$out1,$out1,$in6
	vcipherlast	$out2,$out2,$in7

	le?vperm	$out0,$out0,$out0,$inpperm
	le?vperm	$out1,$out1,$out1,$inpperm
	stvx_u		$out0,$x00,$out
	le?vperm	$out2,$out2,$out2,$inpperm
	stvx_u		$out1,$x10,$out
	stvx_u		$out2,$x20,$out
	addi		$out,$out,0x30
	b		Lctr32_enc8x_done

.align	5
Lctr32_enc8x_two:
	vcipherlast	$out0,$out0,$in6
	vcipherlast	$out1,$out1,$in7

	le?vperm	$out0,$out0,$out0,$inpperm
	le?vperm	$out1,$out1,$out1,$inpperm
	stvx_u		$out0,$x00,$out
	stvx_u		$out1,$x10,$out
	addi		$out,$out,0x20
	b		Lctr32_enc8x_done

.align	5
Lctr32_enc8x_one:
	vcipherlast	$out0,$out0,$in7

	le?vperm	$out0,$out0,$out0,$inpperm
	stvx_u		$out0,0,$out
	addi		$out,$out,0x10

Lctr32_enc8x_done:
	li		r10,`$FRAME+15`
	li		r11,`$FRAME+31`
	stvx		$inpperm,r10,$sp	# wipe copies of round keys
	addi		r10,r10,32
	stvx		$inpperm,r11,$sp
	addi		r11,r11,32
	stvx		$inpperm,r10,$sp
	addi		r10,r10,32
	stvx		$inpperm,r11,$sp
	addi		r11,r11,32
	stvx		$inpperm,r10,$sp
	addi		r10,r10,32
	stvx		$inpperm,r11,$sp
	addi		r11,r11,32
	stvx		$inpperm,r10,$sp
	addi		r10,r10,32
	stvx		$inpperm,r11,$sp
	addi		r11,r11,32

	mtspr		256,$vrsave
	lvx		v20,r10,$sp		# ABI says so
	addi		r10,r10,32
	lvx		v21,r11,$sp
	addi		r11,r11,32
	lvx		v22,r10,$sp
	addi		r10,r10,32
	lvx		v23,r11,$sp
	addi		r11,r11,32
	lvx		v24,r10,$sp
	addi		r10,r10,32
	lvx		v25,r11,$sp
	addi		r11,r11,32
	lvx		v26,r10,$sp
	addi		r10,r10,32
	lvx		v27,r11,$sp
	addi		r11,r11,32
	lvx		v28,r10,$sp
	addi		r10,r10,32
	lvx		v29,r11,$sp
	addi		r11,r11,32
	lvx		v30,r10,$sp
	lvx		v31,r11,$sp
	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
	blr
	.long		0
	.byte		0,12,0x04,0,0x80,6,6,0
	.long		0
.size	.${prefix}_ctr32_encrypt_blocks,.-.${prefix}_ctr32_encrypt_blocks
___
}}	}}}

my $consts=1;
foreach(split("\n",$code)) {
        s/\`([^\`]*)\`/eval($1)/geo;

	# constants table endian-specific conversion
	if ($consts && m/\.(long|byte)\s+(.+)\s+(\?[a-z]*)$/o) {
	    my $conv=$3;
	    my @bytes=();

	    # convert to endian-agnostic format
	    if ($1 eq "long") {
	      foreach (split(/,\s*/,$2)) {
		my $l = /^0/?oct:int;
		push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff;
	      }
	    } else {
		@bytes = map(/^0/?oct:int,split(/,\s*/,$2));
	    }

	    # little-endian conversion
	    if ($flavour =~ /le$/o) {
		SWITCH: for($conv)  {
		    /\?inv/ && do   { @bytes=map($_^0xf,@bytes); last; };
		    /\?rev/ && do   { @bytes=reverse(@bytes);    last; };
		}
	    }

	    #emit
	    print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n";
	    next;
	}
	$consts=0 if (m/Lconsts:/o);	# end of table

	# instructions prefixed with '?' are endian-specific and need
	# to be adjusted accordingly...
	if ($flavour =~ /le$/o) {	# little-endian
	    s/le\?//o		or
	    s/be\?/#be#/o	or
	    s/\?lvsr/lvsl/o	or
	    s/\?lvsl/lvsr/o	or
	    s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or
	    s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or
	    s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o;
	} else {			# big-endian
	    s/le\?/#le#/o	or
	    s/be\?//o		or
	    s/\?([a-z]+)/$1/o;
	}
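	# For example, on little-endian flavours the endian-neutral
	#	?vperm	v1,v1,v2,v5
	# comes out as "vperm v1,v2,v1,v5" (source operands swapped),
	# ?lvsl assembles as lvsr and vice versa, "le?"-prefixed lines
	# are kept, and "be?"-prefixed ones are commented out.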

        print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";