1#! /usr/bin/env perl
2# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
# Specific mode implementations for the SPARC Architecture 2011. There
# is a T4 dependency, though: an ASI value that is not specified in the
# Architecture Manual. But as the SPARC universe is rather monocultural,
# we assume that any processor capable of executing the crypto
# instructions can handle the ASI in question as well. This means that
# we ought to keep our eyes open when new processors emerge...
#
# As for the above-mentioned ASI: it is the so-called "block
# initializing store", which cancels the "read" in "read-update-write"
# on cache lines. This is a "cooperative" optimization, as it reduces
# overall pressure on the memory interface. The benefit can't be
# observed/quantified with the usual benchmarks; on the contrary, you
# may notice that single-thread performance for parallelizable modes is
# ~1.5% worse for the largest block sizes [though a few percent better
# for shorter ones]. All this is based on suggestions from David Miller.
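#
# For a concrete picture of what this amounts to, the bulk-store loops
# below emit their 8-byte stores through that ASI, e.g.:
#
#	stda		%f0, [$out]0xe2		! ASI_BLK_INIT, T4-specific
#	add		$out, 8, $out
#	stda		%f2, [$out]0xe2		! ASI_BLK_INIT, T4-specific
#
# i.e. plain std's are replaced with stda's through ASI 0xe2 on the
# paths where the code knows it is going to overwrite whole cache lines.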
25
26$::bias="STACK_BIAS";
27$::frame="STACK_FRAME";
28$::size_t_cc="SIZE_T_CC";
29
30sub asm_init {		# to be called with @ARGV as argument
31    for (@_)		{ $::abibits=64 if (/\-m64/ || /\-xarch\=v9/); }
32    if ($::abibits==64)	{ $::bias=2047; $::frame=192; $::size_t_cc="%xcc"; }
33    else		{ $::bias=0;    $::frame=112; $::size_t_cc="%icc"; }
34}
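
# As a small illustration (the flag string below is just an example),
# something like
#
#	&asm_init("-m64");
#
# switches the globals above to the 64-bit ABI (2047-byte stack bias,
# 192-byte minimal frame, %xcc), while any other flags leave the 32-bit
# v8+ defaults (no bias, 112-byte frame, %icc) in place.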
35
36# unified interface
37my ($inp,$out,$len,$key,$ivec)=map("%i$_",(0..5));
38# local variables
39my ($ileft,$iright,$ooff,$omask,$ivoff,$blk_init)=map("%l$_",(0..7));
40
41sub alg_cbc_encrypt_implement {
42my ($alg,$bits) = @_;
43
44$::code.=<<___;
45.globl	${alg}${bits}_t4_cbc_encrypt
46.align	32
47${alg}${bits}_t4_cbc_encrypt:
48	save		%sp, -$::frame, %sp
49	cmp		$len, 0
50	be,pn		$::size_t_cc, .L${bits}_cbc_enc_abort
51	srln		$len, 0, $len		! needed on v8+, "nop" on v9
52	sub		$inp, $out, $blk_init	! $inp!=$out
53___
54$::code.=<<___ if (!$::evp);
55	andcc		$ivec, 7, $ivoff
56	alignaddr	$ivec, %g0, $ivec
57
58	ldd		[$ivec + 0], %f0	! load ivec
59	bz,pt		%icc, 1f
60	ldd		[$ivec + 8], %f2
61	ldd		[$ivec + 16], %f4
62	faligndata	%f0, %f2, %f0
63	faligndata	%f2, %f4, %f2
641:
65___
66$::code.=<<___ if ($::evp);
67	ld		[$ivec + 0], %f0
68	ld		[$ivec + 4], %f1
69	ld		[$ivec + 8], %f2
70	ld		[$ivec + 12], %f3
71___
72$::code.=<<___;
73	prefetch	[$inp], 20
74	prefetch	[$inp + 63], 20
75	call		_${alg}${bits}_load_enckey
76	and		$inp, 7, $ileft
77	andn		$inp, 7, $inp
78	sll		$ileft, 3, $ileft
79	mov		64, $iright
80	mov		0xff, $omask
81	sub		$iright, $ileft, $iright
82	and		$out, 7, $ooff
83	cmp		$len, 127
84	movrnz		$ooff, 0, $blk_init		! if (	$out&7 ||
85	movleu		$::size_t_cc, 0, $blk_init	!	$len<128 ||
86	brnz,pn		$blk_init, .L${bits}cbc_enc_blk	!	$inp==$out)
87	srl		$omask, $ooff, $omask
88
89	alignaddrl	$out, %g0, $out
90	srlx		$len, 4, $len
91	prefetch	[$out], 22
92
93.L${bits}_cbc_enc_loop:
94	ldx		[$inp + 0], %o0
95	brz,pt		$ileft, 4f
96	ldx		[$inp + 8], %o1
97
98	ldx		[$inp + 16], %o2
99	sllx		%o0, $ileft, %o0
100	srlx		%o1, $iright, %g1
101	sllx		%o1, $ileft, %o1
102	or		%g1, %o0, %o0
103	srlx		%o2, $iright, %o2
104	or		%o2, %o1, %o1
1054:
106	xor		%g4, %o0, %o0		! ^= rk[0]
107	xor		%g5, %o1, %o1
108	movxtod		%o0, %f12
109	movxtod		%o1, %f14
110
111	fxor		%f12, %f0, %f0		! ^= ivec
112	fxor		%f14, %f2, %f2
113	prefetch	[$out + 63], 22
114	prefetch	[$inp + 16+63], 20
115	call		_${alg}${bits}_encrypt_1x
116	add		$inp, 16, $inp
117
118	brnz,pn		$ooff, 2f
119	sub		$len, 1, $len
120
121	std		%f0, [$out + 0]
122	std		%f2, [$out + 8]
123	brnz,pt		$len, .L${bits}_cbc_enc_loop
124	add		$out, 16, $out
125___
126$::code.=<<___ if ($::evp);
127	st		%f0, [$ivec + 0]
128	st		%f1, [$ivec + 4]
129	st		%f2, [$ivec + 8]
130	st		%f3, [$ivec + 12]
131___
132$::code.=<<___ if (!$::evp);
133	brnz,pn		$ivoff, 3f
134	nop
135
136	std		%f0, [$ivec + 0]	! write out ivec
137	std		%f2, [$ivec + 8]
138___
139$::code.=<<___;
140.L${bits}_cbc_enc_abort:
141	ret
142	restore
143
144.align	16
1452:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
146						! and ~3x deterioration
147						! in inp==out case
148	faligndata	%f0, %f0, %f4		! handle unaligned output
149	faligndata	%f0, %f2, %f6
150	faligndata	%f2, %f2, %f8
151
152	stda		%f4, [$out + $omask]0xc0	! partial store
153	std		%f6, [$out + 8]
154	add		$out, 16, $out
155	orn		%g0, $omask, $omask
156	stda		%f8, [$out + $omask]0xc0	! partial store
157
158	brnz,pt		$len, .L${bits}_cbc_enc_loop+4
159	orn		%g0, $omask, $omask
160___
161$::code.=<<___ if ($::evp);
162	st		%f0, [$ivec + 0]
163	st		%f1, [$ivec + 4]
164	st		%f2, [$ivec + 8]
165	st		%f3, [$ivec + 12]
166___
167$::code.=<<___ if (!$::evp);
168	brnz,pn		$ivoff, 3f
169	nop
170
171	std		%f0, [$ivec + 0]	! write out ivec
172	std		%f2, [$ivec + 8]
173	ret
174	restore
175
176.align	16
1773:	alignaddrl	$ivec, $ivoff, %g0	! handle unaligned ivec
178	mov		0xff, $omask
179	srl		$omask, $ivoff, $omask
180	faligndata	%f0, %f0, %f4
181	faligndata	%f0, %f2, %f6
182	faligndata	%f2, %f2, %f8
183	stda		%f4, [$ivec + $omask]0xc0
184	std		%f6, [$ivec + 8]
185	add		$ivec, 16, $ivec
186	orn		%g0, $omask, $omask
187	stda		%f8, [$ivec + $omask]0xc0
188___
189$::code.=<<___;
190	ret
191	restore
192
193!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
194.align	32
195.L${bits}cbc_enc_blk:
196	add	$out, $len, $blk_init
197	and	$blk_init, 63, $blk_init	! tail
198	sub	$len, $blk_init, $len
199	add	$blk_init, 15, $blk_init	! round up to 16n
200	srlx	$len, 4, $len
201	srl	$blk_init, 4, $blk_init
202
203.L${bits}_cbc_enc_blk_loop:
204	ldx		[$inp + 0], %o0
205	brz,pt		$ileft, 5f
206	ldx		[$inp + 8], %o1
207
208	ldx		[$inp + 16], %o2
209	sllx		%o0, $ileft, %o0
210	srlx		%o1, $iright, %g1
211	sllx		%o1, $ileft, %o1
212	or		%g1, %o0, %o0
213	srlx		%o2, $iright, %o2
214	or		%o2, %o1, %o1
2155:
216	xor		%g4, %o0, %o0		! ^= rk[0]
217	xor		%g5, %o1, %o1
218	movxtod		%o0, %f12
219	movxtod		%o1, %f14
220
221	fxor		%f12, %f0, %f0		! ^= ivec
222	fxor		%f14, %f2, %f2
223	prefetch	[$inp + 16+63], 20
224	call		_${alg}${bits}_encrypt_1x
225	add		$inp, 16, $inp
226	sub		$len, 1, $len
227
228	stda		%f0, [$out]0xe2		! ASI_BLK_INIT, T4-specific
229	add		$out, 8, $out
230	stda		%f2, [$out]0xe2		! ASI_BLK_INIT, T4-specific
231	brnz,pt		$len, .L${bits}_cbc_enc_blk_loop
232	add		$out, 8, $out
233
234	membar		#StoreLoad|#StoreStore
235	brnz,pt		$blk_init, .L${bits}_cbc_enc_loop
236	mov		$blk_init, $len
237___
238$::code.=<<___ if ($::evp);
239	st		%f0, [$ivec + 0]
240	st		%f1, [$ivec + 4]
241	st		%f2, [$ivec + 8]
242	st		%f3, [$ivec + 12]
243___
244$::code.=<<___ if (!$::evp);
245	brnz,pn		$ivoff, 3b
246	nop
247
248	std		%f0, [$ivec + 0]	! write out ivec
249	std		%f2, [$ivec + 8]
250___
251$::code.=<<___;
252	ret
253	restore
254.type	${alg}${bits}_t4_cbc_encrypt,#function
255.size	${alg}${bits}_t4_cbc_encrypt,.-${alg}${bits}_t4_cbc_encrypt
256___
257}
258
259sub alg_cbc_decrypt_implement {
260my ($alg,$bits) = @_;
261
262$::code.=<<___;
263.globl	${alg}${bits}_t4_cbc_decrypt
264.align	32
265${alg}${bits}_t4_cbc_decrypt:
266	save		%sp, -$::frame, %sp
267	cmp		$len, 0
268	be,pn		$::size_t_cc, .L${bits}_cbc_dec_abort
269	srln		$len, 0, $len		! needed on v8+, "nop" on v9
270	sub		$inp, $out, $blk_init	! $inp!=$out
271___
272$::code.=<<___ if (!$::evp);
273	andcc		$ivec, 7, $ivoff
274	alignaddr	$ivec, %g0, $ivec
275
276	ldd		[$ivec + 0], %f12	! load ivec
277	bz,pt		%icc, 1f
278	ldd		[$ivec + 8], %f14
279	ldd		[$ivec + 16], %f0
280	faligndata	%f12, %f14, %f12
281	faligndata	%f14, %f0, %f14
2821:
283___
284$::code.=<<___ if ($::evp);
285	ld		[$ivec + 0], %f12	! load ivec
286	ld		[$ivec + 4], %f13
287	ld		[$ivec + 8], %f14
288	ld		[$ivec + 12], %f15
289___
290$::code.=<<___;
291	prefetch	[$inp], 20
292	prefetch	[$inp + 63], 20
293	call		_${alg}${bits}_load_deckey
294	and		$inp, 7, $ileft
295	andn		$inp, 7, $inp
296	sll		$ileft, 3, $ileft
297	mov		64, $iright
298	mov		0xff, $omask
299	sub		$iright, $ileft, $iright
300	and		$out, 7, $ooff
301	cmp		$len, 255
302	movrnz		$ooff, 0, $blk_init		! if (	$out&7 ||
303	movleu		$::size_t_cc, 0, $blk_init	!	$len<256 ||
304	brnz,pn		$blk_init, .L${bits}cbc_dec_blk	!	$inp==$out)
305	srl		$omask, $ooff, $omask
306
307	andcc		$len, 16, %g0		! is number of blocks even?
308	srlx		$len, 4, $len
309	alignaddrl	$out, %g0, $out
310	bz		%icc, .L${bits}_cbc_dec_loop2x
311	prefetch	[$out], 22
312.L${bits}_cbc_dec_loop:
313	ldx		[$inp + 0], %o0
314	brz,pt		$ileft, 4f
315	ldx		[$inp + 8], %o1
316
317	ldx		[$inp + 16], %o2
318	sllx		%o0, $ileft, %o0
319	srlx		%o1, $iright, %g1
320	sllx		%o1, $ileft, %o1
321	or		%g1, %o0, %o0
322	srlx		%o2, $iright, %o2
323	or		%o2, %o1, %o1
3244:
325	xor		%g4, %o0, %o2		! ^= rk[0]
326	xor		%g5, %o1, %o3
327	movxtod		%o2, %f0
328	movxtod		%o3, %f2
329
330	prefetch	[$out + 63], 22
331	prefetch	[$inp + 16+63], 20
332	call		_${alg}${bits}_decrypt_1x
333	add		$inp, 16, $inp
334
335	fxor		%f12, %f0, %f0		! ^= ivec
336	fxor		%f14, %f2, %f2
337	movxtod		%o0, %f12
338	movxtod		%o1, %f14
339
340	brnz,pn		$ooff, 2f
341	sub		$len, 1, $len
342
343	std		%f0, [$out + 0]
344	std		%f2, [$out + 8]
345	brnz,pt		$len, .L${bits}_cbc_dec_loop2x
346	add		$out, 16, $out
347___
348$::code.=<<___ if ($::evp);
349	st		%f12, [$ivec + 0]
350	st		%f13, [$ivec + 4]
351	st		%f14, [$ivec + 8]
352	st		%f15, [$ivec + 12]
353___
354$::code.=<<___ if (!$::evp);
355	brnz,pn		$ivoff, .L${bits}_cbc_dec_unaligned_ivec
356	nop
357
358	std		%f12, [$ivec + 0]	! write out ivec
359	std		%f14, [$ivec + 8]
360___
361$::code.=<<___;
362.L${bits}_cbc_dec_abort:
363	ret
364	restore
365
366.align	16
3672:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
368						! and ~3x deterioration
369						! in inp==out case
370	faligndata	%f0, %f0, %f4		! handle unaligned output
371	faligndata	%f0, %f2, %f6
372	faligndata	%f2, %f2, %f8
373
374	stda		%f4, [$out + $omask]0xc0	! partial store
375	std		%f6, [$out + 8]
376	add		$out, 16, $out
377	orn		%g0, $omask, $omask
378	stda		%f8, [$out + $omask]0xc0	! partial store
379
380	brnz,pt		$len, .L${bits}_cbc_dec_loop2x+4
381	orn		%g0, $omask, $omask
382___
383$::code.=<<___ if ($::evp);
384	st		%f12, [$ivec + 0]
385	st		%f13, [$ivec + 4]
386	st		%f14, [$ivec + 8]
387	st		%f15, [$ivec + 12]
388___
389$::code.=<<___ if (!$::evp);
390	brnz,pn		$ivoff, .L${bits}_cbc_dec_unaligned_ivec
391	nop
392
393	std		%f12, [$ivec + 0]	! write out ivec
394	std		%f14, [$ivec + 8]
395___
396$::code.=<<___;
397	ret
398	restore
399
400!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
401.align	32
402.L${bits}_cbc_dec_loop2x:
403	ldx		[$inp + 0], %o0
404	ldx		[$inp + 8], %o1
405	ldx		[$inp + 16], %o2
406	brz,pt		$ileft, 4f
407	ldx		[$inp + 24], %o3
408
409	ldx		[$inp + 32], %o4
410	sllx		%o0, $ileft, %o0
411	srlx		%o1, $iright, %g1
412	or		%g1, %o0, %o0
413	sllx		%o1, $ileft, %o1
414	srlx		%o2, $iright, %g1
415	or		%g1, %o1, %o1
416	sllx		%o2, $ileft, %o2
417	srlx		%o3, $iright, %g1
418	or		%g1, %o2, %o2
419	sllx		%o3, $ileft, %o3
420	srlx		%o4, $iright, %o4
421	or		%o4, %o3, %o3
4224:
423	xor		%g4, %o0, %o4		! ^= rk[0]
424	xor		%g5, %o1, %o5
425	movxtod		%o4, %f0
426	movxtod		%o5, %f2
427	xor		%g4, %o2, %o4
428	xor		%g5, %o3, %o5
429	movxtod		%o4, %f4
430	movxtod		%o5, %f6
431
432	prefetch	[$out + 63], 22
433	prefetch	[$inp + 32+63], 20
434	call		_${alg}${bits}_decrypt_2x
435	add		$inp, 32, $inp
436
437	movxtod		%o0, %f8
438	movxtod		%o1, %f10
439	fxor		%f12, %f0, %f0		! ^= ivec
440	fxor		%f14, %f2, %f2
441	movxtod		%o2, %f12
442	movxtod		%o3, %f14
443	fxor		%f8, %f4, %f4
444	fxor		%f10, %f6, %f6
445
446	brnz,pn		$ooff, 2f
447	sub		$len, 2, $len
448
449	std		%f0, [$out + 0]
450	std		%f2, [$out + 8]
451	std		%f4, [$out + 16]
452	std		%f6, [$out + 24]
453	brnz,pt		$len, .L${bits}_cbc_dec_loop2x
454	add		$out, 32, $out
455___
456$::code.=<<___ if ($::evp);
457	st		%f12, [$ivec + 0]
458	st		%f13, [$ivec + 4]
459	st		%f14, [$ivec + 8]
460	st		%f15, [$ivec + 12]
461___
462$::code.=<<___ if (!$::evp);
463	brnz,pn		$ivoff, .L${bits}_cbc_dec_unaligned_ivec
464	nop
465
466	std		%f12, [$ivec + 0]	! write out ivec
467	std		%f14, [$ivec + 8]
468___
469$::code.=<<___;
470	ret
471	restore
472
473.align	16
4742:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
475						! and ~3x deterioration
476						! in inp==out case
477	faligndata	%f0, %f0, %f8		! handle unaligned output
478	faligndata	%f0, %f2, %f0
479	faligndata	%f2, %f4, %f2
480	faligndata	%f4, %f6, %f4
481	faligndata	%f6, %f6, %f6
482	stda		%f8, [$out + $omask]0xc0	! partial store
483	std		%f0, [$out + 8]
484	std		%f2, [$out + 16]
485	std		%f4, [$out + 24]
486	add		$out, 32, $out
487	orn		%g0, $omask, $omask
488	stda		%f6, [$out + $omask]0xc0	! partial store
489
490	brnz,pt		$len, .L${bits}_cbc_dec_loop2x+4
491	orn		%g0, $omask, $omask
492___
493$::code.=<<___ if ($::evp);
494	st		%f12, [$ivec + 0]
495	st		%f13, [$ivec + 4]
496	st		%f14, [$ivec + 8]
497	st		%f15, [$ivec + 12]
498___
499$::code.=<<___ if (!$::evp);
500	brnz,pn		$ivoff, .L${bits}_cbc_dec_unaligned_ivec
501	nop
502
503	std		%f12, [$ivec + 0]	! write out ivec
504	std		%f14, [$ivec + 8]
505	ret
506	restore
507
508.align	16
509.L${bits}_cbc_dec_unaligned_ivec:
510	alignaddrl	$ivec, $ivoff, %g0	! handle unaligned ivec
511	mov		0xff, $omask
512	srl		$omask, $ivoff, $omask
513	faligndata	%f12, %f12, %f0
514	faligndata	%f12, %f14, %f2
515	faligndata	%f14, %f14, %f4
516	stda		%f0, [$ivec + $omask]0xc0
517	std		%f2, [$ivec + 8]
518	add		$ivec, 16, $ivec
519	orn		%g0, $omask, $omask
520	stda		%f4, [$ivec + $omask]0xc0
521___
522$::code.=<<___;
523	ret
524	restore
525
526!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
527.align	32
528.L${bits}cbc_dec_blk:
529	add	$out, $len, $blk_init
530	and	$blk_init, 63, $blk_init	! tail
531	sub	$len, $blk_init, $len
532	add	$blk_init, 15, $blk_init	! round up to 16n
533	srlx	$len, 4, $len
534	srl	$blk_init, 4, $blk_init
535	sub	$len, 1, $len
536	add	$blk_init, 1, $blk_init
537
538.L${bits}_cbc_dec_blk_loop2x:
539	ldx		[$inp + 0], %o0
540	ldx		[$inp + 8], %o1
541	ldx		[$inp + 16], %o2
542	brz,pt		$ileft, 5f
543	ldx		[$inp + 24], %o3
544
545	ldx		[$inp + 32], %o4
546	sllx		%o0, $ileft, %o0
547	srlx		%o1, $iright, %g1
548	or		%g1, %o0, %o0
549	sllx		%o1, $ileft, %o1
550	srlx		%o2, $iright, %g1
551	or		%g1, %o1, %o1
552	sllx		%o2, $ileft, %o2
553	srlx		%o3, $iright, %g1
554	or		%g1, %o2, %o2
555	sllx		%o3, $ileft, %o3
556	srlx		%o4, $iright, %o4
557	or		%o4, %o3, %o3
5585:
559	xor		%g4, %o0, %o4		! ^= rk[0]
560	xor		%g5, %o1, %o5
561	movxtod		%o4, %f0
562	movxtod		%o5, %f2
563	xor		%g4, %o2, %o4
564	xor		%g5, %o3, %o5
565	movxtod		%o4, %f4
566	movxtod		%o5, %f6
567
568	prefetch	[$inp + 32+63], 20
569	call		_${alg}${bits}_decrypt_2x
570	add		$inp, 32, $inp
571	subcc		$len, 2, $len
572
573	movxtod		%o0, %f8
574	movxtod		%o1, %f10
575	fxor		%f12, %f0, %f0		! ^= ivec
576	fxor		%f14, %f2, %f2
577	movxtod		%o2, %f12
578	movxtod		%o3, %f14
579	fxor		%f8, %f4, %f4
580	fxor		%f10, %f6, %f6
581
582	stda		%f0, [$out]0xe2		! ASI_BLK_INIT, T4-specific
583	add		$out, 8, $out
584	stda		%f2, [$out]0xe2		! ASI_BLK_INIT, T4-specific
585	add		$out, 8, $out
586	stda		%f4, [$out]0xe2		! ASI_BLK_INIT, T4-specific
587	add		$out, 8, $out
588	stda		%f6, [$out]0xe2		! ASI_BLK_INIT, T4-specific
589	bgu,pt		$::size_t_cc, .L${bits}_cbc_dec_blk_loop2x
590	add		$out, 8, $out
591
592	add		$blk_init, $len, $len
593	andcc		$len, 1, %g0		! is number of blocks even?
594	membar		#StoreLoad|#StoreStore
595	bnz,pt		%icc, .L${bits}_cbc_dec_loop
596	srl		$len, 0, $len
597	brnz,pn		$len, .L${bits}_cbc_dec_loop2x
598	nop
599___
600$::code.=<<___ if ($::evp);
601	st		%f12, [$ivec + 0]	! write out ivec
602	st		%f13, [$ivec + 4]
603	st		%f14, [$ivec + 8]
604	st		%f15, [$ivec + 12]
605___
606$::code.=<<___ if (!$::evp);
607	brnz,pn		$ivoff, 3b
608	nop
609
610	std		%f12, [$ivec + 0]	! write out ivec
611	std		%f14, [$ivec + 8]
612___
613$::code.=<<___;
614	ret
615	restore
616.type	${alg}${bits}_t4_cbc_decrypt,#function
617.size	${alg}${bits}_t4_cbc_decrypt,.-${alg}${bits}_t4_cbc_decrypt
618___
619}
620
621sub alg_ctr32_implement {
622my ($alg,$bits) = @_;
623
624$::code.=<<___;
625.globl	${alg}${bits}_t4_ctr32_encrypt
626.align	32
627${alg}${bits}_t4_ctr32_encrypt:
628	save		%sp, -$::frame, %sp
629	srln		$len, 0, $len		! needed on v8+, "nop" on v9
630
631	prefetch	[$inp], 20
632	prefetch	[$inp + 63], 20
633	call		_${alg}${bits}_load_enckey
634	sllx		$len, 4, $len
635
636	ld		[$ivec + 0], %l4	! counter
637	ld		[$ivec + 4], %l5
638	ld		[$ivec + 8], %l6
639	ld		[$ivec + 12], %l7
640
641	sllx		%l4, 32, %o5
642	or		%l5, %o5, %o5
643	sllx		%l6, 32, %g1
644	xor		%o5, %g4, %g4		! ^= rk[0]
645	xor		%g1, %g5, %g5
646	movxtod		%g4, %f14		! most significant 64 bits
647
648	sub		$inp, $out, $blk_init	! $inp!=$out
649	and		$inp, 7, $ileft
650	andn		$inp, 7, $inp
651	sll		$ileft, 3, $ileft
652	mov		64, $iright
653	mov		0xff, $omask
654	sub		$iright, $ileft, $iright
655	and		$out, 7, $ooff
656	cmp		$len, 255
657	movrnz		$ooff, 0, $blk_init		! if (	$out&7 ||
658	movleu		$::size_t_cc, 0, $blk_init	!	$len<256 ||
659	brnz,pn		$blk_init, .L${bits}_ctr32_blk	!	$inp==$out)
660	srl		$omask, $ooff, $omask
661
662	andcc		$len, 16, %g0		! is number of blocks even?
663	alignaddrl	$out, %g0, $out
664	bz		%icc, .L${bits}_ctr32_loop2x
665	srlx		$len, 4, $len
666.L${bits}_ctr32_loop:
667	ldx		[$inp + 0], %o0
668	brz,pt		$ileft, 4f
669	ldx		[$inp + 8], %o1
670
671	ldx		[$inp + 16], %o2
672	sllx		%o0, $ileft, %o0
673	srlx		%o1, $iright, %g1
674	sllx		%o1, $ileft, %o1
675	or		%g1, %o0, %o0
676	srlx		%o2, $iright, %o2
677	or		%o2, %o1, %o1
6784:
679	xor		%g5, %l7, %g1		! ^= rk[0]
680	add		%l7, 1, %l7
681	movxtod		%g1, %f2
682	srl		%l7, 0, %l7		! clruw
683	prefetch	[$out + 63], 22
684	prefetch	[$inp + 16+63], 20
685___
686$::code.=<<___ if ($alg eq "aes");
687	aes_eround01	%f16, %f14, %f2, %f4
688	aes_eround23	%f18, %f14, %f2, %f2
689___
690$::code.=<<___ if ($alg eq "cmll");
691	camellia_f	%f16, %f2, %f14, %f2
692	camellia_f	%f18, %f14, %f2, %f0
693___
694$::code.=<<___;
695	call		_${alg}${bits}_encrypt_1x+8
696	add		$inp, 16, $inp
697
698	movxtod		%o0, %f10
699	movxtod		%o1, %f12
700	fxor		%f10, %f0, %f0		! ^= inp
701	fxor		%f12, %f2, %f2
702
703	brnz,pn		$ooff, 2f
704	sub		$len, 1, $len
705
706	std		%f0, [$out + 0]
707	std		%f2, [$out + 8]
708	brnz,pt		$len, .L${bits}_ctr32_loop2x
709	add		$out, 16, $out
710
711	ret
712	restore
713
714.align	16
7152:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
716						! and ~3x deterioration
717						! in inp==out case
718	faligndata	%f0, %f0, %f4		! handle unaligned output
719	faligndata	%f0, %f2, %f6
720	faligndata	%f2, %f2, %f8
721	stda		%f4, [$out + $omask]0xc0	! partial store
722	std		%f6, [$out + 8]
723	add		$out, 16, $out
724	orn		%g0, $omask, $omask
725	stda		%f8, [$out + $omask]0xc0	! partial store
726
727	brnz,pt		$len, .L${bits}_ctr32_loop2x+4
728	orn		%g0, $omask, $omask
729
730	ret
731	restore
732
733!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
734.align	32
735.L${bits}_ctr32_loop2x:
736	ldx		[$inp + 0], %o0
737	ldx		[$inp + 8], %o1
738	ldx		[$inp + 16], %o2
739	brz,pt		$ileft, 4f
740	ldx		[$inp + 24], %o3
741
742	ldx		[$inp + 32], %o4
743	sllx		%o0, $ileft, %o0
744	srlx		%o1, $iright, %g1
745	or		%g1, %o0, %o0
746	sllx		%o1, $ileft, %o1
747	srlx		%o2, $iright, %g1
748	or		%g1, %o1, %o1
749	sllx		%o2, $ileft, %o2
750	srlx		%o3, $iright, %g1
751	or		%g1, %o2, %o2
752	sllx		%o3, $ileft, %o3
753	srlx		%o4, $iright, %o4
754	or		%o4, %o3, %o3
7554:
756	xor		%g5, %l7, %g1		! ^= rk[0]
757	add		%l7, 1, %l7
758	movxtod		%g1, %f2
759	srl		%l7, 0, %l7		! clruw
760	xor		%g5, %l7, %g1
761	add		%l7, 1, %l7
762	movxtod		%g1, %f6
763	srl		%l7, 0, %l7		! clruw
764	prefetch	[$out + 63], 22
765	prefetch	[$inp + 32+63], 20
766___
767$::code.=<<___ if ($alg eq "aes");
768	aes_eround01	%f16, %f14, %f2, %f8
769	aes_eround23	%f18, %f14, %f2, %f2
770	aes_eround01	%f16, %f14, %f6, %f10
771	aes_eround23	%f18, %f14, %f6, %f6
772___
773$::code.=<<___ if ($alg eq "cmll");
774	camellia_f	%f16, %f2, %f14, %f2
775	camellia_f	%f16, %f6, %f14, %f6
776	camellia_f	%f18, %f14, %f2, %f0
777	camellia_f	%f18, %f14, %f6, %f4
778___
779$::code.=<<___;
780	call		_${alg}${bits}_encrypt_2x+16
781	add		$inp, 32, $inp
782
783	movxtod		%o0, %f8
784	movxtod		%o1, %f10
785	movxtod		%o2, %f12
786	fxor		%f8, %f0, %f0		! ^= inp
787	movxtod		%o3, %f8
788	fxor		%f10, %f2, %f2
789	fxor		%f12, %f4, %f4
790	fxor		%f8, %f6, %f6
791
792	brnz,pn		$ooff, 2f
793	sub		$len, 2, $len
794
795	std		%f0, [$out + 0]
796	std		%f2, [$out + 8]
797	std		%f4, [$out + 16]
798	std		%f6, [$out + 24]
799	brnz,pt		$len, .L${bits}_ctr32_loop2x
800	add		$out, 32, $out
801
802	ret
803	restore
804
805.align	16
8062:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
807						! and ~3x deterioration
808						! in inp==out case
809	faligndata	%f0, %f0, %f8		! handle unaligned output
810	faligndata	%f0, %f2, %f0
811	faligndata	%f2, %f4, %f2
812	faligndata	%f4, %f6, %f4
813	faligndata	%f6, %f6, %f6
814
815	stda		%f8, [$out + $omask]0xc0	! partial store
816	std		%f0, [$out + 8]
817	std		%f2, [$out + 16]
818	std		%f4, [$out + 24]
819	add		$out, 32, $out
820	orn		%g0, $omask, $omask
821	stda		%f6, [$out + $omask]0xc0	! partial store
822
823	brnz,pt		$len, .L${bits}_ctr32_loop2x+4
824	orn		%g0, $omask, $omask
825
826	ret
827	restore
828
829!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
830.align	32
831.L${bits}_ctr32_blk:
832	add	$out, $len, $blk_init
833	and	$blk_init, 63, $blk_init	! tail
834	sub	$len, $blk_init, $len
835	add	$blk_init, 15, $blk_init	! round up to 16n
836	srlx	$len, 4, $len
837	srl	$blk_init, 4, $blk_init
838	sub	$len, 1, $len
839	add	$blk_init, 1, $blk_init
840
841.L${bits}_ctr32_blk_loop2x:
842	ldx		[$inp + 0], %o0
843	ldx		[$inp + 8], %o1
844	ldx		[$inp + 16], %o2
845	brz,pt		$ileft, 5f
846	ldx		[$inp + 24], %o3
847
848	ldx		[$inp + 32], %o4
849	sllx		%o0, $ileft, %o0
850	srlx		%o1, $iright, %g1
851	or		%g1, %o0, %o0
852	sllx		%o1, $ileft, %o1
853	srlx		%o2, $iright, %g1
854	or		%g1, %o1, %o1
855	sllx		%o2, $ileft, %o2
856	srlx		%o3, $iright, %g1
857	or		%g1, %o2, %o2
858	sllx		%o3, $ileft, %o3
859	srlx		%o4, $iright, %o4
860	or		%o4, %o3, %o3
8615:
862	xor		%g5, %l7, %g1		! ^= rk[0]
863	add		%l7, 1, %l7
864	movxtod		%g1, %f2
865	srl		%l7, 0, %l7		! clruw
866	xor		%g5, %l7, %g1
867	add		%l7, 1, %l7
868	movxtod		%g1, %f6
869	srl		%l7, 0, %l7		! clruw
870	prefetch	[$inp + 32+63], 20
871___
872$::code.=<<___ if ($alg eq "aes");
873	aes_eround01	%f16, %f14, %f2, %f8
874	aes_eround23	%f18, %f14, %f2, %f2
875	aes_eround01	%f16, %f14, %f6, %f10
876	aes_eround23	%f18, %f14, %f6, %f6
877___
878$::code.=<<___ if ($alg eq "cmll");
879	camellia_f	%f16, %f2, %f14, %f2
880	camellia_f	%f16, %f6, %f14, %f6
881	camellia_f	%f18, %f14, %f2, %f0
882	camellia_f	%f18, %f14, %f6, %f4
883___
884$::code.=<<___;
885	call		_${alg}${bits}_encrypt_2x+16
886	add		$inp, 32, $inp
887	subcc		$len, 2, $len
888
889	movxtod		%o0, %f8
890	movxtod		%o1, %f10
891	movxtod		%o2, %f12
892	fxor		%f8, %f0, %f0		! ^= inp
893	movxtod		%o3, %f8
894	fxor		%f10, %f2, %f2
895	fxor		%f12, %f4, %f4
896	fxor		%f8, %f6, %f6
897
898	stda		%f0, [$out]0xe2		! ASI_BLK_INIT, T4-specific
899	add		$out, 8, $out
900	stda		%f2, [$out]0xe2		! ASI_BLK_INIT, T4-specific
901	add		$out, 8, $out
902	stda		%f4, [$out]0xe2		! ASI_BLK_INIT, T4-specific
903	add		$out, 8, $out
904	stda		%f6, [$out]0xe2		! ASI_BLK_INIT, T4-specific
905	bgu,pt		$::size_t_cc, .L${bits}_ctr32_blk_loop2x
906	add		$out, 8, $out
907
908	add		$blk_init, $len, $len
909	andcc		$len, 1, %g0		! is number of blocks even?
910	membar		#StoreLoad|#StoreStore
911	bnz,pt		%icc, .L${bits}_ctr32_loop
912	srl		$len, 0, $len
913	brnz,pn		$len, .L${bits}_ctr32_loop2x
914	nop
915
916	ret
917	restore
918.type	${alg}${bits}_t4_ctr32_encrypt,#function
919.size	${alg}${bits}_t4_ctr32_encrypt,.-${alg}${bits}_t4_ctr32_encrypt
920___
921}
922
923sub alg_xts_implement {
924my ($alg,$bits,$dir) = @_;
925my ($inp,$out,$len,$key1,$key2,$ivec)=map("%i$_",(0..5));
926my $rem=$ivec;
927
928$::code.=<<___;
929.globl	${alg}${bits}_t4_xts_${dir}crypt
930.align	32
931${alg}${bits}_t4_xts_${dir}crypt:
932	save		%sp, -$::frame-16, %sp
933	srln		$len, 0, $len		! needed on v8+, "nop" on v9
934
935	mov		$ivec, %o0
936	add		%fp, $::bias-16, %o1
937	call		${alg}_t4_encrypt
938	mov		$key2, %o2
939
940	add		%fp, $::bias-16, %l7
941	ldxa		[%l7]0x88, %g2
942	add		%fp, $::bias-8, %l7
943	ldxa		[%l7]0x88, %g3		! %g3:%g2 is tweak
944
945	sethi		%hi(0x76543210), %l7
946	or		%l7, %lo(0x76543210), %l7
947	bmask		%l7, %g0, %g0		! byte swap mask
948
949	prefetch	[$inp], 20
950	prefetch	[$inp + 63], 20
951	call		_${alg}${bits}_load_${dir}ckey
952	and		$len, 15,  $rem
953	and		$len, -16, $len
954___
$::code.=<<___ if ($dir eq "de");
956	mov		0, %l7
957	movrnz		$rem, 16,  %l7
958	sub		$len, %l7, $len
959___
$::code.=<<___;
961
962	sub		$inp, $out, $blk_init	! $inp!=$out
963	and		$inp, 7, $ileft
964	andn		$inp, 7, $inp
965	sll		$ileft, 3, $ileft
966	mov		64, $iright
967	mov		0xff, $omask
968	sub		$iright, $ileft, $iright
969	and		$out, 7, $ooff
970	cmp		$len, 255
971	movrnz		$ooff, 0, $blk_init		! if (	$out&7 ||
972	movleu		$::size_t_cc, 0, $blk_init	!	$len<256 ||
973	brnz,pn		$blk_init, .L${bits}_xts_${dir}blk !	$inp==$out)
974	srl		$omask, $ooff, $omask
975
976	andcc		$len, 16, %g0		! is number of blocks even?
977___
$::code.=<<___ if ($dir eq "de");
979	brz,pn		$len, .L${bits}_xts_${dir}steal
980___
$::code.=<<___;
982	alignaddrl	$out, %g0, $out
983	bz		%icc, .L${bits}_xts_${dir}loop2x
984	srlx		$len, 4, $len
985.L${bits}_xts_${dir}loop:
986	ldx		[$inp + 0], %o0
987	brz,pt		$ileft, 4f
988	ldx		[$inp + 8], %o1
989
990	ldx		[$inp + 16], %o2
991	sllx		%o0, $ileft, %o0
992	srlx		%o1, $iright, %g1
993	sllx		%o1, $ileft, %o1
994	or		%g1, %o0, %o0
995	srlx		%o2, $iright, %o2
996	or		%o2, %o1, %o1
9974:
998	movxtod		%g2, %f12
999	movxtod		%g3, %f14
1000	bshuffle	%f12, %f12, %f12
1001	bshuffle	%f14, %f14, %f14
1002
1003	xor		%g4, %o0, %o0		! ^= rk[0]
1004	xor		%g5, %o1, %o1
1005	movxtod		%o0, %f0
1006	movxtod		%o1, %f2
1007
1008	fxor		%f12, %f0, %f0		! ^= tweak[0]
1009	fxor		%f14, %f2, %f2
1010
1011	prefetch	[$out + 63], 22
1012	prefetch	[$inp + 16+63], 20
1013	call		_${alg}${bits}_${dir}crypt_1x
1014	add		$inp, 16, $inp
1015
1016	fxor		%f12, %f0, %f0		! ^= tweak[0]
1017	fxor		%f14, %f2, %f2
1018
1019	srax		%g3, 63, %l7		! next tweak value
1020	addcc		%g2, %g2, %g2
1021	and		%l7, 0x87, %l7
1022	addxc		%g3, %g3, %g3
1023	xor		%l7, %g2, %g2
1024
1025	brnz,pn		$ooff, 2f
1026	sub		$len, 1, $len
1027
1028	std		%f0, [$out + 0]
1029	std		%f2, [$out + 8]
1030	brnz,pt		$len, .L${bits}_xts_${dir}loop2x
1031	add		$out, 16, $out
1032
1033	brnz,pn		$rem, .L${bits}_xts_${dir}steal
1034	nop
1035
1036	ret
1037	restore
1038
1039.align	16
10402:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
1041						! and ~3x deterioration
1042						! in inp==out case
1043	faligndata	%f0, %f0, %f4		! handle unaligned output
1044	faligndata	%f0, %f2, %f6
1045	faligndata	%f2, %f2, %f8
1046	stda		%f4, [$out + $omask]0xc0	! partial store
1047	std		%f6, [$out + 8]
1048	add		$out, 16, $out
1049	orn		%g0, $omask, $omask
1050	stda		%f8, [$out + $omask]0xc0	! partial store
1051
1052	brnz,pt		$len, .L${bits}_xts_${dir}loop2x+4
1053	orn		%g0, $omask, $omask
1054
1055	brnz,pn		$rem, .L${bits}_xts_${dir}steal
1056	nop
1057
1058	ret
1059	restore
1060
1061!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
1062.align	32
1063.L${bits}_xts_${dir}loop2x:
1064	ldx		[$inp + 0], %o0
1065	ldx		[$inp + 8], %o1
1066	ldx		[$inp + 16], %o2
1067	brz,pt		$ileft, 4f
1068	ldx		[$inp + 24], %o3
1069
1070	ldx		[$inp + 32], %o4
1071	sllx		%o0, $ileft, %o0
1072	srlx		%o1, $iright, %g1
1073	or		%g1, %o0, %o0
1074	sllx		%o1, $ileft, %o1
1075	srlx		%o2, $iright, %g1
1076	or		%g1, %o1, %o1
1077	sllx		%o2, $ileft, %o2
1078	srlx		%o3, $iright, %g1
1079	or		%g1, %o2, %o2
1080	sllx		%o3, $ileft, %o3
1081	srlx		%o4, $iright, %o4
1082	or		%o4, %o3, %o3
10834:
1084	movxtod		%g2, %f12
1085	movxtod		%g3, %f14
1086	bshuffle	%f12, %f12, %f12
1087	bshuffle	%f14, %f14, %f14
1088
1089	srax		%g3, 63, %l7		! next tweak value
1090	addcc		%g2, %g2, %g2
1091	and		%l7, 0x87, %l7
1092	addxc		%g3, %g3, %g3
1093	xor		%l7, %g2, %g2
1094
1095	movxtod		%g2, %f8
1096	movxtod		%g3, %f10
1097	bshuffle	%f8,  %f8,  %f8
1098	bshuffle	%f10, %f10, %f10
1099
1100	xor		%g4, %o0, %o0		! ^= rk[0]
1101	xor		%g5, %o1, %o1
1102	xor		%g4, %o2, %o2		! ^= rk[0]
1103	xor		%g5, %o3, %o3
1104	movxtod		%o0, %f0
1105	movxtod		%o1, %f2
1106	movxtod		%o2, %f4
1107	movxtod		%o3, %f6
1108
1109	fxor		%f12, %f0, %f0		! ^= tweak[0]
1110	fxor		%f14, %f2, %f2
1111	fxor		%f8,  %f4, %f4		! ^= tweak[0]
1112	fxor		%f10, %f6, %f6
1113
1114	prefetch	[$out + 63], 22
1115	prefetch	[$inp + 32+63], 20
1116	call		_${alg}${bits}_${dir}crypt_2x
1117	add		$inp, 32, $inp
1118
1119	movxtod		%g2, %f8
1120	movxtod		%g3, %f10
1121
1122	srax		%g3, 63, %l7		! next tweak value
1123	addcc		%g2, %g2, %g2
1124	and		%l7, 0x87, %l7
1125	addxc		%g3, %g3, %g3
1126	xor		%l7, %g2, %g2
1127
1128	bshuffle	%f8,  %f8,  %f8
1129	bshuffle	%f10, %f10, %f10
1130
1131	fxor		%f12, %f0, %f0		! ^= tweak[0]
1132	fxor		%f14, %f2, %f2
1133	fxor		%f8,  %f4, %f4
1134	fxor		%f10, %f6, %f6
1135
1136	brnz,pn		$ooff, 2f
1137	sub		$len, 2, $len
1138
1139	std		%f0, [$out + 0]
1140	std		%f2, [$out + 8]
1141	std		%f4, [$out + 16]
1142	std		%f6, [$out + 24]
1143	brnz,pt		$len, .L${bits}_xts_${dir}loop2x
1144	add		$out, 32, $out
1145
1146	fsrc2		%f4, %f0
1147	fsrc2		%f6, %f2
1148	brnz,pn		$rem, .L${bits}_xts_${dir}steal
1149	nop
1150
1151	ret
1152	restore
1153
1154.align	16
11552:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
1156						! and ~3x deterioration
1157						! in inp==out case
1158	faligndata	%f0, %f0, %f8		! handle unaligned output
1159	faligndata	%f0, %f2, %f10
1160	faligndata	%f2, %f4, %f12
1161	faligndata	%f4, %f6, %f14
1162	faligndata	%f6, %f6, %f0
1163
1164	stda		%f8, [$out + $omask]0xc0	! partial store
1165	std		%f10, [$out + 8]
1166	std		%f12, [$out + 16]
1167	std		%f14, [$out + 24]
1168	add		$out, 32, $out
1169	orn		%g0, $omask, $omask
1170	stda		%f0, [$out + $omask]0xc0	! partial store
1171
1172	brnz,pt		$len, .L${bits}_xts_${dir}loop2x+4
1173	orn		%g0, $omask, $omask
1174
1175	fsrc2		%f4, %f0
1176	fsrc2		%f6, %f2
1177	brnz,pn		$rem, .L${bits}_xts_${dir}steal
1178	nop
1179
1180	ret
1181	restore
1182
1183!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
1184.align	32
1185.L${bits}_xts_${dir}blk:
1186	add	$out, $len, $blk_init
1187	and	$blk_init, 63, $blk_init	! tail
1188	sub	$len, $blk_init, $len
1189	add	$blk_init, 15, $blk_init	! round up to 16n
1190	srlx	$len, 4, $len
1191	srl	$blk_init, 4, $blk_init
1192	sub	$len, 1, $len
1193	add	$blk_init, 1, $blk_init
1194
1195.L${bits}_xts_${dir}blk2x:
1196	ldx		[$inp + 0], %o0
1197	ldx		[$inp + 8], %o1
1198	ldx		[$inp + 16], %o2
1199	brz,pt		$ileft, 5f
1200	ldx		[$inp + 24], %o3
1201
1202	ldx		[$inp + 32], %o4
1203	sllx		%o0, $ileft, %o0
1204	srlx		%o1, $iright, %g1
1205	or		%g1, %o0, %o0
1206	sllx		%o1, $ileft, %o1
1207	srlx		%o2, $iright, %g1
1208	or		%g1, %o1, %o1
1209	sllx		%o2, $ileft, %o2
1210	srlx		%o3, $iright, %g1
1211	or		%g1, %o2, %o2
1212	sllx		%o3, $ileft, %o3
1213	srlx		%o4, $iright, %o4
1214	or		%o4, %o3, %o3
12155:
1216	movxtod		%g2, %f12
1217	movxtod		%g3, %f14
1218	bshuffle	%f12, %f12, %f12
1219	bshuffle	%f14, %f14, %f14
1220
1221	srax		%g3, 63, %l7		! next tweak value
1222	addcc		%g2, %g2, %g2
1223	and		%l7, 0x87, %l7
1224	addxc		%g3, %g3, %g3
1225	xor		%l7, %g2, %g2
1226
1227	movxtod		%g2, %f8
1228	movxtod		%g3, %f10
1229	bshuffle	%f8,  %f8,  %f8
1230	bshuffle	%f10, %f10, %f10
1231
1232	xor		%g4, %o0, %o0		! ^= rk[0]
1233	xor		%g5, %o1, %o1
1234	xor		%g4, %o2, %o2		! ^= rk[0]
1235	xor		%g5, %o3, %o3
1236	movxtod		%o0, %f0
1237	movxtod		%o1, %f2
1238	movxtod		%o2, %f4
1239	movxtod		%o3, %f6
1240
1241	fxor		%f12, %f0, %f0		! ^= tweak[0]
1242	fxor		%f14, %f2, %f2
1243	fxor		%f8,  %f4, %f4		! ^= tweak[0]
1244	fxor		%f10, %f6, %f6
1245
1246	prefetch	[$inp + 32+63], 20
1247	call		_${alg}${bits}_${dir}crypt_2x
1248	add		$inp, 32, $inp
1249
1250	movxtod		%g2, %f8
1251	movxtod		%g3, %f10
1252
1253	srax		%g3, 63, %l7		! next tweak value
1254	addcc		%g2, %g2, %g2
1255	and		%l7, 0x87, %l7
1256	addxc		%g3, %g3, %g3
1257	xor		%l7, %g2, %g2
1258
1259	bshuffle	%f8,  %f8,  %f8
1260	bshuffle	%f10, %f10, %f10
1261
1262	fxor		%f12, %f0, %f0		! ^= tweak[0]
1263	fxor		%f14, %f2, %f2
1264	fxor		%f8,  %f4, %f4
1265	fxor		%f10, %f6, %f6
1266
1267	subcc		$len, 2, $len
1268	stda		%f0, [$out]0xe2		! ASI_BLK_INIT, T4-specific
1269	add		$out, 8, $out
1270	stda		%f2, [$out]0xe2		! ASI_BLK_INIT, T4-specific
1271	add		$out, 8, $out
1272	stda		%f4, [$out]0xe2		! ASI_BLK_INIT, T4-specific
1273	add		$out, 8, $out
1274	stda		%f6, [$out]0xe2		! ASI_BLK_INIT, T4-specific
1275	bgu,pt		$::size_t_cc, .L${bits}_xts_${dir}blk2x
1276	add		$out, 8, $out
1277
1278	add		$blk_init, $len, $len
1279	andcc		$len, 1, %g0		! is number of blocks even?
1280	membar		#StoreLoad|#StoreStore
1281	bnz,pt		%icc, .L${bits}_xts_${dir}loop
1282	srl		$len, 0, $len
1283	brnz,pn		$len, .L${bits}_xts_${dir}loop2x
1284	nop
1285
1286	fsrc2		%f4, %f0
1287	fsrc2		%f6, %f2
1288	brnz,pn		$rem, .L${bits}_xts_${dir}steal
1289	nop
1290
1291	ret
1292	restore
1293!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
1294___
$::code.=<<___ if ($dir eq "en");
1296.align	32
1297.L${bits}_xts_${dir}steal:
1298	std		%f0, [%fp + $::bias-16]	! copy of output
1299	std		%f2, [%fp + $::bias-8]
1300
1301	srl		$ileft, 3, $ileft
1302	add		%fp, $::bias-16, %l7
	add		$inp, $ileft, $inp	! original $inp+$len&-16
	add		$out, $ooff, $out	! original $out+$len&-16
1305	mov		0, $ileft
1306	nop					! align
1307
1308.L${bits}_xts_${dir}stealing:
1309	ldub		[$inp + $ileft], %o0
1310	ldub		[%l7  + $ileft], %o1
1311	dec		$rem
1312	stb		%o0, [%l7  + $ileft]
1313	stb		%o1, [$out + $ileft]
1314	brnz		$rem, .L${bits}_xts_${dir}stealing
1315	inc		$ileft
1316
1317	mov		%l7, $inp
1318	sub		$out, 16, $out
1319	mov		0, $ileft
1320	sub		$out, $ooff, $out
1321	ba		.L${bits}_xts_${dir}loop	! one more time
1322	mov		1, $len				! $rem is 0
1323___
$::code.=<<___ if ($dir eq "de");
1325.align	32
1326.L${bits}_xts_${dir}steal:
1327	ldx		[$inp + 0], %o0
1328	brz,pt		$ileft, 8f
1329	ldx		[$inp + 8], %o1
1330
1331	ldx		[$inp + 16], %o2
1332	sllx		%o0, $ileft, %o0
1333	srlx		%o1, $iright, %g1
1334	sllx		%o1, $ileft, %o1
1335	or		%g1, %o0, %o0
1336	srlx		%o2, $iright, %o2
1337	or		%o2, %o1, %o1
13388:
1339	srax		%g3, 63, %l7		! next tweak value
1340	addcc		%g2, %g2, %o2
1341	and		%l7, 0x87, %l7
1342	addxc		%g3, %g3, %o3
1343	xor		%l7, %o2, %o2
1344
1345	movxtod		%o2, %f12
1346	movxtod		%o3, %f14
1347	bshuffle	%f12, %f12, %f12
1348	bshuffle	%f14, %f14, %f14
1349
1350	xor		%g4, %o0, %o0		! ^= rk[0]
1351	xor		%g5, %o1, %o1
1352	movxtod		%o0, %f0
1353	movxtod		%o1, %f2
1354
1355	fxor		%f12, %f0, %f0		! ^= tweak[0]
1356	fxor		%f14, %f2, %f2
1357
1358	call		_${alg}${bits}_${dir}crypt_1x
1359	add		$inp, 16, $inp
1360
1361	fxor		%f12, %f0, %f0		! ^= tweak[0]
1362	fxor		%f14, %f2, %f2
1363
1364	std		%f0, [%fp + $::bias-16]
1365	std		%f2, [%fp + $::bias-8]
1366
1367	srl		$ileft, 3, $ileft
1368	add		%fp, $::bias-16, %l7
	add		$inp, $ileft, $inp	! original $inp+$len&-16
	add		$out, $ooff, $out	! original $out+$len&-16
1371	mov		0, $ileft
1372	add		$out, 16, $out
1373	nop					! align
1374
1375.L${bits}_xts_${dir}stealing:
1376	ldub		[$inp + $ileft], %o0
1377	ldub		[%l7  + $ileft], %o1
1378	dec		$rem
1379	stb		%o0, [%l7  + $ileft]
1380	stb		%o1, [$out + $ileft]
1381	brnz		$rem, .L${bits}_xts_${dir}stealing
1382	inc		$ileft
1383
1384	mov		%l7, $inp
1385	sub		$out, 16, $out
1386	mov		0, $ileft
1387	sub		$out, $ooff, $out
1388	ba		.L${bits}_xts_${dir}loop	! one more time
1389	mov		1, $len				! $rem is 0
1390___
$::code.=<<___;
1392	ret
1393	restore
1394.type	${alg}${bits}_t4_xts_${dir}crypt,#function
1395.size	${alg}${bits}_t4_xts_${dir}crypt,.-${alg}${bits}_t4_xts_${dir}crypt
1396___
1397}
1398
# The purpose of these subroutines is to explicitly encode VIS
# instructions, so that one can compile the module without having to
# specify VIS extensions on the compiler command line, e.g. -xarch=v9
# vs. -xarch=v9a. The idea is to preserve the option of producing a
# "universal" binary and let the programmer detect at run-time whether
# the current CPU is VIS-capable.
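#
# As an example of the effect (derived from the encoding formula in
# unvis() below, so treat the exact value as illustrative), a VIS1
# instruction such as "faligndata %f0,%f2,%f4" (opf 0x048) is emitted as
#
#	.word	0x89b00902 !faligndata	%f0,%f2,%f4
#
# which assembles on any toolchain, VIS-aware or not.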
1404sub unvis {
1405my ($mnemonic,$rs1,$rs2,$rd)=@_;
1406my ($ref,$opf);
1407my %visopf = (	"faligndata"	=> 0x048,
1408		"bshuffle"	=> 0x04c,
1409		"fnot2"		=> 0x066,
1410		"fxor"		=> 0x06c,
1411		"fsrc2"		=> 0x078	);
1412
1413    $ref = "$mnemonic\t$rs1,$rs2,$rd";
1414
1415    if ($opf=$visopf{$mnemonic}) {
1416	foreach ($rs1,$rs2,$rd) {
1417	    return $ref if (!/%f([0-9]{1,2})/);
1418	    $_=$1;
1419	    if ($1>=32) {
1420		return $ref if ($1&1);
1421		# re-encode for upper double register addressing
1422		$_=($1|$1>>5)&31;
1423	    }
1424	}
1425
1426	return	sprintf ".word\t0x%08x !%s",
1427			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
1428			$ref;
1429    } else {
1430	return $ref;
1431    }
1432}
1433
1434sub unvis3 {
1435my ($mnemonic,$rs1,$rs2,$rd)=@_;
1436my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
1437my ($ref,$opf);
1438my %visopf = (	"addxc"		=> 0x011,
1439		"addxccc"	=> 0x013,
1440		"umulxhi"	=> 0x016,
1441		"alignaddr"	=> 0x018,
1442		"bmask"		=> 0x019,
1443		"alignaddrl"	=> 0x01a	);
1444
1445    $ref = "$mnemonic\t$rs1,$rs2,$rd";
1446
1447    if ($opf=$visopf{$mnemonic}) {
1448	foreach ($rs1,$rs2,$rd) {
1449	    return $ref if (!/%([goli])([0-9])/);
1450	    $_=$bias{$1}+$2;
1451	}
1452
1453	return	sprintf ".word\t0x%08x !%s",
1454			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
1455			$ref;
1456    } else {
1457	return $ref;
1458    }
1459}
1460
1461sub unaes_round {	# 4-argument instructions
1462my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
1463my ($ref,$opf);
1464my %aesopf = (	"aes_eround01"	=> 0,
1465		"aes_eround23"	=> 1,
1466		"aes_dround01"	=> 2,
1467		"aes_dround23"	=> 3,
1468		"aes_eround01_l"=> 4,
1469		"aes_eround23_l"=> 5,
1470		"aes_dround01_l"=> 6,
1471		"aes_dround23_l"=> 7,
1472		"aes_kexpand1"	=> 8	);
1473
1474    $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";
1475
1476    if (defined($opf=$aesopf{$mnemonic})) {
1477	$rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3;
1478	foreach ($rs1,$rs2,$rd) {
1479	    return $ref if (!/%f([0-9]{1,2})/);
1480	    $_=$1;
1481	    if ($1>=32) {
1482		return $ref if ($1&1);
1483		# re-encode for upper double register addressing
1484		$_=($1|$1>>5)&31;
1485	    }
1486	}
1487
1488	return	sprintf ".word\t0x%08x !%s",
1489			2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|$opf<<5|$rs2,
1490			$ref;
1491    } else {
1492	return $ref;
1493    }
1494}
1495
1496sub unaes_kexpand {	# 3-argument instructions
1497my ($mnemonic,$rs1,$rs2,$rd)=@_;
1498my ($ref,$opf);
1499my %aesopf = (	"aes_kexpand0"	=> 0x130,
1500		"aes_kexpand2"	=> 0x131	);
1501
1502    $ref = "$mnemonic\t$rs1,$rs2,$rd";
1503
1504    if (defined($opf=$aesopf{$mnemonic})) {
1505	foreach ($rs1,$rs2,$rd) {
1506	    return $ref if (!/%f([0-9]{1,2})/);
1507	    $_=$1;
1508	    if ($1>=32) {
1509		return $ref if ($1&1);
1510		# re-encode for upper double register addressing
1511		$_=($1|$1>>5)&31;
1512	    }
1513	}
1514
1515	return	sprintf ".word\t0x%08x !%s",
1516			2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
1517			$ref;
1518    } else {
1519	return $ref;
1520    }
1521}
1522
1523sub uncamellia_f {	# 4-argument instructions
1524my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
1525my ($ref,$opf);
1526
1527    $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";
1528
1529    if (1) {
1530	$rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3;
1531	foreach ($rs1,$rs2,$rd) {
1532	    return $ref if (!/%f([0-9]{1,2})/);
1533	    $_=$1;
1534	    if ($1>=32) {
1535		return $ref if ($1&1);
1536		# re-encode for upper double register addressing
1537		$_=($1|$1>>5)&31;
1538	    }
1539	}
1540
1541	return	sprintf ".word\t0x%08x !%s",
1542			2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|0xc<<5|$rs2,
1543			$ref;
1544    } else {
1545	return $ref;
1546    }
1547}
1548
1549sub uncamellia3 {	# 3-argument instructions
1550my ($mnemonic,$rs1,$rs2,$rd)=@_;
1551my ($ref,$opf);
1552my %cmllopf = (	"camellia_fl"	=> 0x13c,
1553		"camellia_fli"	=> 0x13d	);
1554
1555    $ref = "$mnemonic\t$rs1,$rs2,$rd";
1556
1557    if (defined($opf=$cmllopf{$mnemonic})) {
1558	foreach ($rs1,$rs2,$rd) {
1559	    return $ref if (!/%f([0-9]{1,2})/);
1560	    $_=$1;
1561	    if ($1>=32) {
1562		return $ref if ($1&1);
1563		# re-encode for upper double register addressing
1564		$_=($1|$1>>5)&31;
1565	    }
1566	}
1567
1568	return	sprintf ".word\t0x%08x !%s",
1569			2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
1570			$ref;
1571    } else {
1572	return $ref;
1573    }
1574}
1575
1576sub unmovxtox {		# 2-argument instructions
1577my ($mnemonic,$rs,$rd)=@_;
1578my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24, "f" => 0 );
1579my ($ref,$opf);
1580my %movxopf = (	"movdtox"	=> 0x110,
1581		"movstouw"	=> 0x111,
1582		"movstosw"	=> 0x113,
1583		"movxtod"	=> 0x118,
1584		"movwtos"	=> 0x119	);
1585
1586    $ref = "$mnemonic\t$rs,$rd";
1587
1588    if (defined($opf=$movxopf{$mnemonic})) {
1589	foreach ($rs,$rd) {
1590	    return $ref if (!/%([fgoli])([0-9]{1,2})/);
1591	    $_=$bias{$1}+$2;
1592	    if ($2>=32) {
1593		return $ref if ($2&1);
1594		# re-encode for upper double register addressing
1595		$_=($2|$2>>5)&31;
1596	    }
1597	}
1598
1599	return	sprintf ".word\t0x%08x !%s",
1600			2<<30|$rd<<25|0x36<<19|$opf<<5|$rs,
1601			$ref;
1602    } else {
1603	return $ref;
1604    }
1605}
1606
1607sub undes {
1608my ($mnemonic)=shift;
1609my @args=@_;
1610my ($ref,$opf);
1611my %desopf = (	"des_round"	=> 0b1001,
1612		"des_ip"	=> 0b100110100,
1613		"des_iip"	=> 0b100110101,
1614		"des_kexpand"	=> 0b100110110	);
1615
1616    $ref = "$mnemonic\t".join(",",@_);
1617
1618    if (defined($opf=$desopf{$mnemonic})) {	# 4-arg
1619	if ($mnemonic eq "des_round") {
1620	    foreach (@args[0..3]) {
1621		return $ref if (!/%f([0-9]{1,2})/);
1622		$_=$1;
1623		if ($1>=32) {
1624		    return $ref if ($1&1);
1625		    # re-encode for upper double register addressing
1626		    $_=($1|$1>>5)&31;
1627		}
1628	    }
1629	    return  sprintf ".word\t0x%08x !%s",
1630			    2<<30|0b011001<<19|$opf<<5|$args[0]<<14|$args[1]|$args[2]<<9|$args[3]<<25,
1631			    $ref;
1632	} elsif ($mnemonic eq "des_kexpand") {	# 3-arg
1633	    foreach (@args[0..2]) {
1634		return $ref if (!/(%f)?([0-9]{1,2})/);
1635		$_=$2;
1636		if ($2>=32) {
1637		    return $ref if ($2&1);
1638		    # re-encode for upper double register addressing
1639		    $_=($2|$2>>5)&31;
1640		}
1641	    }
1642	    return  sprintf ".word\t0x%08x !%s",
1643			    2<<30|0b110110<<19|$opf<<5|$args[0]<<14|$args[1]|$args[2]<<25,
1644			    $ref;
1645	} else {				# 2-arg
1646	    foreach (@args[0..1]) {
1647		return $ref if (!/%f([0-9]{1,2})/);
1648		$_=$1;
1649		if ($1>=32) {
		    return $ref if ($1&1);
1651		    # re-encode for upper double register addressing
1652		    $_=($1|$1>>5)&31;
1653		}
1654	    }
1655	    return  sprintf ".word\t0x%08x !%s",
1656			    2<<30|0b110110<<19|$opf<<5|$args[0]<<14|$args[1]<<25,
1657			    $ref;
1658	}
1659    } else {
1660	return $ref;
1661    }
1662}
1663
1664sub emit_assembler {
1665    foreach (split("\n",$::code)) {
1666	s/\`([^\`]*)\`/eval $1/ge;
1667
1668	s/\b(f[a-z]+2[sd]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})\s*$/$1\t%f0,$2,$3/go;
1669
1670	s/\b(aes_[edk][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
1671		&unaes_round($1,$2,$3,$4,$5)
1672	 /geo or
1673	s/\b(aes_kexpand[02])\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
1674		&unaes_kexpand($1,$2,$3,$4)
1675	 /geo or
1676	s/\b(camellia_f)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
1677		&uncamellia_f($1,$2,$3,$4,$5)
1678	 /geo or
1679	s/\b(camellia_[^s]+)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
1680		&uncamellia3($1,$2,$3,$4)
1681	 /geo or
1682	s/\b(des_\w+)\s+(%f[0-9]{1,2}),\s*([%fx0-9]+)(?:,\s*(%f[0-9]{1,2})(?:,\s*(%f[0-9]{1,2}))?)?/
1683		&undes($1,$2,$3,$4,$5)
1684	 /geo or
1685	s/\b(mov[ds]to\w+)\s+(%f[0-9]{1,2}),\s*(%[goli][0-7])/
1686		&unmovxtox($1,$2,$3)
1687	 /geo or
1688	s/\b(mov[xw]to[ds])\s+(%[goli][0-7]),\s*(%f[0-9]{1,2})/
1689		&unmovxtox($1,$2,$3)
1690	 /geo or
1691	s/\b([fb][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
1692		&unvis($1,$2,$3,$4)
1693	 /geo or
1694	s/\b(umulxhi|bmask|addxc[c]{0,2}|alignaddr[l]*)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
1695		&unvis3($1,$2,$3,$4)
1696	 /geo;
1697
1698	print $_,"\n";
1699    }
1700}
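
# A minimal sketch of how a consumer script is expected to drive this
# module (the wrapper lines below are illustrative only; real callers
# must also provide the _${alg}${bits}_load_*key and
# _${alg}${bits}_*crypt_[12]x helpers that the generated code calls):
#
#	$::evp  = 1;				# EVP-style IV handling
#	$::code = "";
#	require "sparcv9_modes.pl";		# this file (path/name illustrative)
#	&asm_init(@ARGV);			# pick ABI from -m64/-xarch flags
#	&alg_cbc_encrypt_implement("aes", 128);
#	&alg_cbc_decrypt_implement("aes", 128);
#	&alg_ctr32_implement("aes", 128);
#	&emit_assembler();			# encode VIS/crypto opcodes, print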
1701
17021;
1703