#!/usr/bin/env perl

# Implementations of specific modes of operation for the SPARC
# Architecture 2011. There is a T4 dependency though: an ASI value
# that is not specified in the Architecture Manual. But as the SPARC
# universe is rather monocultural, we assume that a processor capable
# of executing the crypto instructions can handle the ASI in question
# as well. This means that we ought to keep our eyes open when new
# processors emerge...
#
# As for the above-mentioned ASI: it's the so-called "block
# initializing store", which cancels the "read" in "read-update-write"
# on cache lines. This is a "cooperative" optimization, as it reduces
# overall pressure on the memory interface. The benefit can't be
# observed/quantified with the usual benchmarks; on the contrary, you
# can notice that single-thread performance for parallelizable modes
# is ~1.5% worse for the largest block sizes [though a few percent
# better for shorter ones]. All this is based on suggestions from
# David Miller.

sub asm_init {		# to be called with @ARGV as argument
    for (@_)		{ $::abibits=64 if (/\-m64/ || /\-xarch\=v9/); }
    if ($::abibits==64)	{ $::bias=2047; $::frame=192; $::size_t_cc="%xcc"; }
    else		{ $::bias=0;    $::frame=112; $::size_t_cc="%icc"; }
}
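
# A minimal usage sketch (hypothetical driver script, not part of this
# module): the consumer is expected to require() this file, call
# asm_init() with its command-line arguments, set $::evp and seed
# $::code with its own boilerplate, then invoke the generators, e.g.
#
#	require "sparcv9_modes.pl";	# this file; name assumed
#	&asm_init(@ARGV);
#	&alg_cbc_encrypt_implement("aes",128);
#	&emit_assembler();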

# unified interface
my ($inp,$out,$len,$key,$ivec)=map("%i$_",(0..5));
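# (these correspond to the C-level arguments; as an assumed illustration,
#  something like: void ${alg}${bits}_t4_cbc_encrypt(const u8 *inp,
#  u8 *out, size_t len, const KEY *key, u8 *ivec), where "KEY" stands in
#  for the per-algorithm key schedule type)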
# local variables
my ($ileft,$iright,$ooff,$omask,$ivoff,$blk_init)=map("%l$_",(0..7));

sub alg_cbc_encrypt_implement {
my ($alg,$bits) = @_;

$::code.=<<___;
.globl	${alg}${bits}_t4_cbc_encrypt
.align	32
${alg}${bits}_t4_cbc_encrypt:
	save		%sp, -$::frame, %sp
	cmp		$len, 0
	be,pn		$::size_t_cc, .L${bits}_cbc_enc_abort
	sub		$inp, $out, $blk_init	! $inp!=$out
___
$::code.=<<___ if (!$::evp);
	andcc		$ivec, 7, $ivoff
	alignaddr	$ivec, %g0, $ivec

	ldd		[$ivec + 0], %f0	! load ivec
	bz,pt		%icc, 1f
	ldd		[$ivec + 8], %f2
	ldd		[$ivec + 16], %f4
	faligndata	%f0, %f2, %f0
	faligndata	%f2, %f4, %f2
1:
___
$::code.=<<___ if ($::evp);
	ld		[$ivec + 0], %f0
	ld		[$ivec + 4], %f1
	ld		[$ivec + 8], %f2
	ld		[$ivec + 12], %f3
___
$::code.=<<___;
	prefetch	[$inp], 20
	prefetch	[$inp + 63], 20
	call		_${alg}${bits}_load_enckey
	and		$inp, 7, $ileft
	andn		$inp, 7, $inp
	sll		$ileft, 3, $ileft
	mov		64, $iright
	mov		0xff, $omask
	sub		$iright, $ileft, $iright
	and		$out, 7, $ooff
	cmp		$len, 127
	movrnz		$ooff, 0, $blk_init		! if (	$out&7 ||
	movleu		$::size_t_cc, 0, $blk_init	!	$len<128 ||
	brnz,pn		$blk_init, .L${bits}cbc_enc_blk	!	$inp==$out)
	srl		$omask, $ooff, $omask

	alignaddrl	$out, %g0, $out
	srlx		$len, 4, $len
	prefetch	[$out], 22

.L${bits}_cbc_enc_loop:
	ldx		[$inp + 0], %o0
	brz,pt		$ileft, 4f
	ldx		[$inp + 8], %o1

	ldx		[$inp + 16], %o2
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	or		%g1, %o0, %o0
	srlx		%o2, $iright, %o2
	or		%o2, %o1, %o1
4:
	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	movxtod		%o0, %f12
	movxtod		%o1, %f14

	fxor		%f12, %f0, %f0		! ^= ivec
	fxor		%f14, %f2, %f2
	prefetch	[$out + 63], 22
	prefetch	[$inp + 16+63], 20
	call		_${alg}${bits}_encrypt_1x
	add		$inp, 16, $inp

	brnz,pn		$ooff, 2f
	sub		$len, 1, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	brnz,pt		$len, .L${bits}_cbc_enc_loop
	add		$out, 16, $out
___
$::code.=<<___ if ($::evp);
	st		%f0, [$ivec + 0]
	st		%f1, [$ivec + 4]
	st		%f2, [$ivec + 8]
	st		%f3, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, 3f
	nop

	std		%f0, [$ivec + 0]	! write out ivec
	std		%f2, [$ivec + 8]
___
$::code.=<<___;
.L${bits}_cbc_enc_abort:
	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f4		! handle unaligned output
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8

	stda		%f4, [$out + $omask]0xc0	! partial store
	std		%f6, [$out + 8]
	add		$out, 16, $out
	orn		%g0, $omask, $omask
	stda		%f8, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_cbc_enc_loop+4
	orn		%g0, $omask, $omask
___
$::code.=<<___ if ($::evp);
	st		%f0, [$ivec + 0]
	st		%f1, [$ivec + 4]
	st		%f2, [$ivec + 8]
	st		%f3, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, 3f
	nop

	std		%f0, [$ivec + 0]	! write out ivec
	std		%f2, [$ivec + 8]
	ret
	restore

.align	16
3:	alignaddrl	$ivec, $ivoff, %g0	! handle unaligned ivec
	mov		0xff, $omask
	srl		$omask, $ivoff, $omask
	faligndata	%f0, %f0, %f4
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8
	stda		%f4, [$ivec + $omask]0xc0
	std		%f6, [$ivec + 8]
	add		$ivec, 16, $ivec
	orn		%g0, $omask, $omask
	stda		%f8, [$ivec + $omask]0xc0
___
$::code.=<<___;
	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}cbc_enc_blk:
	add	$out, $len, $blk_init
	and	$blk_init, 63, $blk_init	! tail
	sub	$len, $blk_init, $len
	add	$blk_init, 15, $blk_init	! round up to 16n
	srlx	$len, 4, $len
	srl	$blk_init, 4, $blk_init

.L${bits}_cbc_enc_blk_loop:
	ldx		[$inp + 0], %o0
	brz,pt		$ileft, 5f
	ldx		[$inp + 8], %o1

	ldx		[$inp + 16], %o2
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	or		%g1, %o0, %o0
	srlx		%o2, $iright, %o2
	or		%o2, %o1, %o1
5:
	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	movxtod		%o0, %f12
	movxtod		%o1, %f14

	fxor		%f12, %f0, %f0		! ^= ivec
	fxor		%f14, %f2, %f2
	prefetch	[$inp + 16+63], 20
	call		_${alg}${bits}_encrypt_1x
	add		$inp, 16, $inp
	sub		$len, 1, $len

	stda		%f0, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f2, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	brnz,pt		$len, .L${bits}_cbc_enc_blk_loop
	add		$out, 8, $out

	membar		#StoreLoad|#StoreStore
	brnz,pt		$blk_init, .L${bits}_cbc_enc_loop
	mov		$blk_init, $len
___
$::code.=<<___ if ($::evp);
	st		%f0, [$ivec + 0]
	st		%f1, [$ivec + 4]
	st		%f2, [$ivec + 8]
	st		%f3, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, 3b
	nop

	std		%f0, [$ivec + 0]	! write out ivec
	std		%f2, [$ivec + 8]
___
$::code.=<<___;
	ret
	restore
.type	${alg}${bits}_t4_cbc_encrypt,#function
.size	${alg}${bits}_t4_cbc_encrypt,.-${alg}${bits}_t4_cbc_encrypt
___
}
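
# Note the two ivec conventions above: with $::evp set, the IV buffer is
# assumed to be at least word-aligned and is moved with plain ld/st;
# otherwise arbitrary alignment is tolerated via alignaddr/faligndata
# and partial stores.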

sub alg_cbc_decrypt_implement {
my ($alg,$bits) = @_;

$::code.=<<___;
.globl	${alg}${bits}_t4_cbc_decrypt
.align	32
${alg}${bits}_t4_cbc_decrypt:
	save		%sp, -$::frame, %sp
	cmp		$len, 0
	be,pn		$::size_t_cc, .L${bits}_cbc_dec_abort
	sub		$inp, $out, $blk_init	! $inp!=$out
___
$::code.=<<___ if (!$::evp);
	andcc		$ivec, 7, $ivoff
	alignaddr	$ivec, %g0, $ivec

	ldd		[$ivec + 0], %f12	! load ivec
	bz,pt		%icc, 1f
	ldd		[$ivec + 8], %f14
	ldd		[$ivec + 16], %f0
	faligndata	%f12, %f14, %f12
	faligndata	%f14, %f0, %f14
1:
___
$::code.=<<___ if ($::evp);
	ld		[$ivec + 0], %f12	! load ivec
	ld		[$ivec + 4], %f13
	ld		[$ivec + 8], %f14
	ld		[$ivec + 12], %f15
___
$::code.=<<___;
	prefetch	[$inp], 20
	prefetch	[$inp + 63], 20
	call		_${alg}${bits}_load_deckey
	and		$inp, 7, $ileft
	andn		$inp, 7, $inp
	sll		$ileft, 3, $ileft
	mov		64, $iright
	mov		0xff, $omask
	sub		$iright, $ileft, $iright
	and		$out, 7, $ooff
	cmp		$len, 255
	movrnz		$ooff, 0, $blk_init		! if (	$out&7 ||
	movleu		$::size_t_cc, 0, $blk_init	!	$len<256 ||
	brnz,pn		$blk_init, .L${bits}cbc_dec_blk	!	$inp==$out)
	srl		$omask, $ooff, $omask

	andcc		$len, 16, %g0		! is number of blocks even?
	srlx		$len, 4, $len
	alignaddrl	$out, %g0, $out
	bz		%icc, .L${bits}_cbc_dec_loop2x
	prefetch	[$out], 22
.L${bits}_cbc_dec_loop:
	ldx		[$inp + 0], %o0
	brz,pt		$ileft, 4f
	ldx		[$inp + 8], %o1

	ldx		[$inp + 16], %o2
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	or		%g1, %o0, %o0
	srlx		%o2, $iright, %o2
	or		%o2, %o1, %o1
4:
	xor		%g4, %o0, %o2		! ^= rk[0]
	xor		%g5, %o1, %o3
	movxtod		%o2, %f0
	movxtod		%o3, %f2

	prefetch	[$out + 63], 22
	prefetch	[$inp + 16+63], 20
	call		_${alg}${bits}_decrypt_1x
	add		$inp, 16, $inp

	fxor		%f12, %f0, %f0		! ^= ivec
	fxor		%f14, %f2, %f2
	movxtod		%o0, %f12
	movxtod		%o1, %f14

	brnz,pn		$ooff, 2f
	sub		$len, 1, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	brnz,pt		$len, .L${bits}_cbc_dec_loop2x
	add		$out, 16, $out
___
$::code.=<<___ if ($::evp);
	st		%f12, [$ivec + 0]
	st		%f13, [$ivec + 4]
	st		%f14, [$ivec + 8]
	st		%f15, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, .L${bits}_cbc_dec_unaligned_ivec
	nop

	std		%f12, [$ivec + 0]	! write out ivec
	std		%f14, [$ivec + 8]
___
$::code.=<<___;
.L${bits}_cbc_dec_abort:
	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f4		! handle unaligned output
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8

	stda		%f4, [$out + $omask]0xc0	! partial store
	std		%f6, [$out + 8]
	add		$out, 16, $out
	orn		%g0, $omask, $omask
	stda		%f8, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_cbc_dec_loop2x+4
	orn		%g0, $omask, $omask
___
$::code.=<<___ if ($::evp);
	st		%f12, [$ivec + 0]
	st		%f13, [$ivec + 4]
	st		%f14, [$ivec + 8]
	st		%f15, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, .L${bits}_cbc_dec_unaligned_ivec
	nop

	std		%f12, [$ivec + 0]	! write out ivec
	std		%f14, [$ivec + 8]
___
$::code.=<<___;
	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}_cbc_dec_loop2x:
	ldx		[$inp + 0], %o0
	ldx		[$inp + 8], %o1
	ldx		[$inp + 16], %o2
	brz,pt		$ileft, 4f
	ldx		[$inp + 24], %o3

	ldx		[$inp + 32], %o4
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	or		%g1, %o0, %o0
	sllx		%o1, $ileft, %o1
	srlx		%o2, $iright, %g1
	or		%g1, %o1, %o1
	sllx		%o2, $ileft, %o2
	srlx		%o3, $iright, %g1
	or		%g1, %o2, %o2
	sllx		%o3, $ileft, %o3
	srlx		%o4, $iright, %o4
	or		%o4, %o3, %o3
4:
	xor		%g4, %o0, %o4		! ^= rk[0]
	xor		%g5, %o1, %o5
	movxtod		%o4, %f0
	movxtod		%o5, %f2
	xor		%g4, %o2, %o4
	xor		%g5, %o3, %o5
	movxtod		%o4, %f4
	movxtod		%o5, %f6

	prefetch	[$out + 63], 22
	prefetch	[$inp + 32+63], 20
	call		_${alg}${bits}_decrypt_2x
	add		$inp, 32, $inp

	movxtod		%o0, %f8
	movxtod		%o1, %f10
	fxor		%f12, %f0, %f0		! ^= ivec
	fxor		%f14, %f2, %f2
	movxtod		%o2, %f12
	movxtod		%o3, %f14
	fxor		%f8, %f4, %f4
	fxor		%f10, %f6, %f6

	brnz,pn		$ooff, 2f
	sub		$len, 2, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	std		%f4, [$out + 16]
	std		%f6, [$out + 24]
	brnz,pt		$len, .L${bits}_cbc_dec_loop2x
	add		$out, 32, $out
___
$::code.=<<___ if ($::evp);
	st		%f12, [$ivec + 0]
	st		%f13, [$ivec + 4]
	st		%f14, [$ivec + 8]
	st		%f15, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, .L${bits}_cbc_dec_unaligned_ivec
	nop

	std		%f12, [$ivec + 0]	! write out ivec
	std		%f14, [$ivec + 8]
___
$::code.=<<___;
	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f8		! handle unaligned output
	faligndata	%f0, %f2, %f0
	faligndata	%f2, %f4, %f2
	faligndata	%f4, %f6, %f4
	faligndata	%f6, %f6, %f6
	stda		%f8, [$out + $omask]0xc0	! partial store
	std		%f0, [$out + 8]
	std		%f2, [$out + 16]
	std		%f4, [$out + 24]
	add		$out, 32, $out
	orn		%g0, $omask, $omask
	stda		%f6, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_cbc_dec_loop2x+4
	orn		%g0, $omask, $omask
___
$::code.=<<___ if ($::evp);
	st		%f12, [$ivec + 0]
	st		%f13, [$ivec + 4]
	st		%f14, [$ivec + 8]
	st		%f15, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, .L${bits}_cbc_dec_unaligned_ivec
	nop

	std		%f12, [$ivec + 0]	! write out ivec
	std		%f14, [$ivec + 8]
	ret
	restore

.align	16
.L${bits}_cbc_dec_unaligned_ivec:
	alignaddrl	$ivec, $ivoff, %g0	! handle unaligned ivec
	mov		0xff, $omask
	srl		$omask, $ivoff, $omask
	faligndata	%f12, %f12, %f0
	faligndata	%f12, %f14, %f2
	faligndata	%f14, %f14, %f4
	stda		%f0, [$ivec + $omask]0xc0
	std		%f2, [$ivec + 8]
	add		$ivec, 16, $ivec
	orn		%g0, $omask, $omask
	stda		%f4, [$ivec + $omask]0xc0
___
$::code.=<<___;
	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}cbc_dec_blk:
	add	$out, $len, $blk_init
	and	$blk_init, 63, $blk_init	! tail
	sub	$len, $blk_init, $len
	add	$blk_init, 15, $blk_init	! round up to 16n
	srlx	$len, 4, $len
	srl	$blk_init, 4, $blk_init
	sub	$len, 1, $len
	add	$blk_init, 1, $blk_init

.L${bits}_cbc_dec_blk_loop2x:
	ldx		[$inp + 0], %o0
	ldx		[$inp + 8], %o1
	ldx		[$inp + 16], %o2
	brz,pt		$ileft, 5f
	ldx		[$inp + 24], %o3

	ldx		[$inp + 32], %o4
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	or		%g1, %o0, %o0
	sllx		%o1, $ileft, %o1
	srlx		%o2, $iright, %g1
	or		%g1, %o1, %o1
	sllx		%o2, $ileft, %o2
	srlx		%o3, $iright, %g1
	or		%g1, %o2, %o2
	sllx		%o3, $ileft, %o3
	srlx		%o4, $iright, %o4
	or		%o4, %o3, %o3
5:
	xor		%g4, %o0, %o4		! ^= rk[0]
	xor		%g5, %o1, %o5
	movxtod		%o4, %f0
	movxtod		%o5, %f2
	xor		%g4, %o2, %o4
	xor		%g5, %o3, %o5
	movxtod		%o4, %f4
	movxtod		%o5, %f6

	prefetch	[$inp + 32+63], 20
	call		_${alg}${bits}_decrypt_2x
	add		$inp, 32, $inp
	subcc		$len, 2, $len

	movxtod		%o0, %f8
	movxtod		%o1, %f10
	fxor		%f12, %f0, %f0		! ^= ivec
	fxor		%f14, %f2, %f2
	movxtod		%o2, %f12
	movxtod		%o3, %f14
	fxor		%f8, %f4, %f4
	fxor		%f10, %f6, %f6

	stda		%f0, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f2, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f4, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f6, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	bgu,pt		$::size_t_cc, .L${bits}_cbc_dec_blk_loop2x
	add		$out, 8, $out

	add		$blk_init, $len, $len
	andcc		$len, 1, %g0		! is number of blocks even?
	membar		#StoreLoad|#StoreStore
	bnz,pt		%icc, .L${bits}_cbc_dec_loop
	srl		$len, 0, $len
	brnz,pn		$len, .L${bits}_cbc_dec_loop2x
	nop
___
$::code.=<<___ if ($::evp);
	st		%f12, [$ivec + 0]	! write out ivec
	st		%f13, [$ivec + 4]
	st		%f14, [$ivec + 8]
	st		%f15, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, .L${bits}_cbc_dec_unaligned_ivec
	nop

	std		%f12, [$ivec + 0]	! write out ivec
	std		%f14, [$ivec + 8]
___
$::code.=<<___;
	ret
	restore
.type	${alg}${bits}_t4_cbc_decrypt,#function
.size	${alg}${bits}_t4_cbc_decrypt,.-${alg}${bits}_t4_cbc_decrypt
___
}
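
# CBC decryption is parallelizable (each plaintext block depends only on
# two adjacent ciphertext blocks), hence the 2x-interleaved loop above;
# CBC encryption is inherently serial, which is why the encrypt path
# processes one block per iteration.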

sub alg_ctr32_implement {
my ($alg,$bits) = @_;

$::code.=<<___;
.globl	${alg}${bits}_t4_ctr32_encrypt
.align	32
${alg}${bits}_t4_ctr32_encrypt:
	save		%sp, -$::frame, %sp

	prefetch	[$inp], 20
	prefetch	[$inp + 63], 20
	call		_${alg}${bits}_load_enckey
	sllx		$len, 4, $len

	ld		[$ivec + 0], %l4	! counter
	ld		[$ivec + 4], %l5
	ld		[$ivec + 8], %l6
	ld		[$ivec + 12], %l7

	sllx		%l4, 32, %o5
	or		%l5, %o5, %o5
	sllx		%l6, 32, %g1
	xor		%o5, %g4, %g4		! ^= rk[0]
	xor		%g1, %g5, %g5
	movxtod		%g4, %f14		! most significant 64 bits

	sub		$inp, $out, $blk_init	! $inp!=$out
	and		$inp, 7, $ileft
	andn		$inp, 7, $inp
	sll		$ileft, 3, $ileft
	mov		64, $iright
	mov		0xff, $omask
	sub		$iright, $ileft, $iright
	and		$out, 7, $ooff
	cmp		$len, 255
	movrnz		$ooff, 0, $blk_init		! if (	$out&7 ||
	movleu		$::size_t_cc, 0, $blk_init	!	$len<256 ||
	brnz,pn		$blk_init, .L${bits}_ctr32_blk	!	$inp==$out)
	srl		$omask, $ooff, $omask

	andcc		$len, 16, %g0		! is number of blocks even?
	alignaddrl	$out, %g0, $out
	bz		%icc, .L${bits}_ctr32_loop2x
	srlx		$len, 4, $len
.L${bits}_ctr32_loop:
	ldx		[$inp + 0], %o0
	brz,pt		$ileft, 4f
	ldx		[$inp + 8], %o1

	ldx		[$inp + 16], %o2
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	or		%g1, %o0, %o0
	srlx		%o2, $iright, %o2
	or		%o2, %o1, %o1
4:
	xor		%g5, %l7, %g1		! ^= rk[0]
	add		%l7, 1, %l7
	movxtod		%g1, %f2
	srl		%l7, 0, %l7		! clruw
	prefetch	[$out + 63], 22
	prefetch	[$inp + 16+63], 20
___
$::code.=<<___ if ($alg eq "aes");
	aes_eround01	%f16, %f14, %f2, %f4
	aes_eround23	%f18, %f14, %f2, %f2
___
$::code.=<<___ if ($alg eq "cmll");
	camellia_f	%f16, %f2, %f14, %f2
	camellia_f	%f18, %f14, %f2, %f0
___
$::code.=<<___;
	call		_${alg}${bits}_encrypt_1x+8
	add		$inp, 16, $inp

	movxtod		%o0, %f10
	movxtod		%o1, %f12
	fxor		%f10, %f0, %f0		! ^= inp
	fxor		%f12, %f2, %f2

	brnz,pn		$ooff, 2f
	sub		$len, 1, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	brnz,pt		$len, .L${bits}_ctr32_loop2x
	add		$out, 16, $out

	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f4		! handle unaligned output
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8
	stda		%f4, [$out + $omask]0xc0	! partial store
	std		%f6, [$out + 8]
	add		$out, 16, $out
	orn		%g0, $omask, $omask
	stda		%f8, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_ctr32_loop2x+4
	orn		%g0, $omask, $omask

	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}_ctr32_loop2x:
	ldx		[$inp + 0], %o0
	ldx		[$inp + 8], %o1
	ldx		[$inp + 16], %o2
	brz,pt		$ileft, 4f
	ldx		[$inp + 24], %o3

	ldx		[$inp + 32], %o4
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	or		%g1, %o0, %o0
	sllx		%o1, $ileft, %o1
	srlx		%o2, $iright, %g1
	or		%g1, %o1, %o1
	sllx		%o2, $ileft, %o2
	srlx		%o3, $iright, %g1
	or		%g1, %o2, %o2
	sllx		%o3, $ileft, %o3
	srlx		%o4, $iright, %o4
	or		%o4, %o3, %o3
4:
	xor		%g5, %l7, %g1		! ^= rk[0]
	add		%l7, 1, %l7
	movxtod		%g1, %f2
	srl		%l7, 0, %l7		! clruw
	xor		%g5, %l7, %g1
	add		%l7, 1, %l7
	movxtod		%g1, %f6
	srl		%l7, 0, %l7		! clruw
	prefetch	[$out + 63], 22
	prefetch	[$inp + 32+63], 20
___
$::code.=<<___ if ($alg eq "aes");
	aes_eround01	%f16, %f14, %f2, %f8
	aes_eround23	%f18, %f14, %f2, %f2
	aes_eround01	%f16, %f14, %f6, %f10
	aes_eround23	%f18, %f14, %f6, %f6
___
$::code.=<<___ if ($alg eq "cmll");
	camellia_f	%f16, %f2, %f14, %f2
	camellia_f	%f16, %f6, %f14, %f6
	camellia_f	%f18, %f14, %f2, %f0
	camellia_f	%f18, %f14, %f6, %f4
___
$::code.=<<___;
	call		_${alg}${bits}_encrypt_2x+16
	add		$inp, 32, $inp

	movxtod		%o0, %f8
	movxtod		%o1, %f10
	movxtod		%o2, %f12
	fxor		%f8, %f0, %f0		! ^= inp
	movxtod		%o3, %f8
	fxor		%f10, %f2, %f2
	fxor		%f12, %f4, %f4
	fxor		%f8, %f6, %f6

	brnz,pn		$ooff, 2f
	sub		$len, 2, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	std		%f4, [$out + 16]
	std		%f6, [$out + 24]
	brnz,pt		$len, .L${bits}_ctr32_loop2x
	add		$out, 32, $out

	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f8		! handle unaligned output
	faligndata	%f0, %f2, %f0
	faligndata	%f2, %f4, %f2
	faligndata	%f4, %f6, %f4
	faligndata	%f6, %f6, %f6

	stda		%f8, [$out + $omask]0xc0	! partial store
	std		%f0, [$out + 8]
	std		%f2, [$out + 16]
	std		%f4, [$out + 24]
	add		$out, 32, $out
	orn		%g0, $omask, $omask
	stda		%f6, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_ctr32_loop2x+4
	orn		%g0, $omask, $omask

	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}_ctr32_blk:
	add	$out, $len, $blk_init
	and	$blk_init, 63, $blk_init	! tail
	sub	$len, $blk_init, $len
	add	$blk_init, 15, $blk_init	! round up to 16n
	srlx	$len, 4, $len
	srl	$blk_init, 4, $blk_init
	sub	$len, 1, $len
	add	$blk_init, 1, $blk_init

.L${bits}_ctr32_blk_loop2x:
	ldx		[$inp + 0], %o0
	ldx		[$inp + 8], %o1
	ldx		[$inp + 16], %o2
	brz,pt		$ileft, 5f
	ldx		[$inp + 24], %o3

	ldx		[$inp + 32], %o4
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	or		%g1, %o0, %o0
	sllx		%o1, $ileft, %o1
	srlx		%o2, $iright, %g1
	or		%g1, %o1, %o1
	sllx		%o2, $ileft, %o2
	srlx		%o3, $iright, %g1
	or		%g1, %o2, %o2
	sllx		%o3, $ileft, %o3
	srlx		%o4, $iright, %o4
	or		%o4, %o3, %o3
5:
	xor		%g5, %l7, %g1		! ^= rk[0]
	add		%l7, 1, %l7
	movxtod		%g1, %f2
	srl		%l7, 0, %l7		! clruw
	xor		%g5, %l7, %g1
	add		%l7, 1, %l7
	movxtod		%g1, %f6
	srl		%l7, 0, %l7		! clruw
	prefetch	[$inp + 32+63], 20
___
$::code.=<<___ if ($alg eq "aes");
	aes_eround01	%f16, %f14, %f2, %f8
	aes_eround23	%f18, %f14, %f2, %f2
	aes_eround01	%f16, %f14, %f6, %f10
	aes_eround23	%f18, %f14, %f6, %f6
___
$::code.=<<___ if ($alg eq "cmll");
	camellia_f	%f16, %f2, %f14, %f2
	camellia_f	%f16, %f6, %f14, %f6
	camellia_f	%f18, %f14, %f2, %f0
	camellia_f	%f18, %f14, %f6, %f4
___
$::code.=<<___;
	call		_${alg}${bits}_encrypt_2x+16
	add		$inp, 32, $inp
	subcc		$len, 2, $len

	movxtod		%o0, %f8
	movxtod		%o1, %f10
	movxtod		%o2, %f12
	fxor		%f8, %f0, %f0		! ^= inp
	movxtod		%o3, %f8
	fxor		%f10, %f2, %f2
	fxor		%f12, %f4, %f4
	fxor		%f8, %f6, %f6

	stda		%f0, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f2, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f4, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f6, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	bgu,pt		$::size_t_cc, .L${bits}_ctr32_blk_loop2x
	add		$out, 8, $out

	add		$blk_init, $len, $len
	andcc		$len, 1, %g0		! is number of blocks even?
	membar		#StoreLoad|#StoreStore
	bnz,pt		%icc, .L${bits}_ctr32_loop
	srl		$len, 0, $len
	brnz,pn		$len, .L${bits}_ctr32_loop2x
	nop

	ret
	restore
.type	${alg}${bits}_t4_ctr32_encrypt,#function
.size	${alg}${bits}_t4_ctr32_encrypt,.-${alg}${bits}_t4_ctr32_encrypt
___
}
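
# As the _ctr32_ name suggests, only the least significant 32 bits of the
# counter block (kept in %l7) are incremented, modulo 2^32; the upper 96
# bits are folded into the first round key (%g4:%g5) once at entry.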

sub alg_xts_implement {
my ($alg,$bits,$dir) = @_;
my ($inp,$out,$len,$key1,$key2,$ivec)=map("%i$_",(0..5));
my $rem=$ivec;

$::code.=<<___;
.globl	${alg}${bits}_t4_xts_${dir}crypt
.align	32
${alg}${bits}_t4_xts_${dir}crypt:
	save		%sp, -$::frame-16, %sp

	mov		$ivec, %o0
	add		%fp, $::bias-16, %o1
	call		${alg}_t4_encrypt
	mov		$key2, %o2

	add		%fp, $::bias-16, %l7
	ldxa		[%l7]0x88, %g2
	add		%fp, $::bias-8, %l7
	ldxa		[%l7]0x88, %g3		! %g3:%g2 is tweak

	sethi		%hi(0x76543210), %l7
	or		%l7, %lo(0x76543210), %l7
	bmask		%l7, %g0, %g0		! byte swap mask

	prefetch	[$inp], 20
	prefetch	[$inp + 63], 20
	call		_${alg}${bits}_load_${dir}ckey
	and		$len, 15,  $rem
	and		$len, -16, $len
___
$::code.=<<___ if ($dir eq "de");
	mov		0, %l7
	movrnz		$rem, 16,  %l7
	sub		$len, %l7, $len
___
$::code.=<<___;

	sub		$inp, $out, $blk_init	! $inp!=$out
	and		$inp, 7, $ileft
	andn		$inp, 7, $inp
	sll		$ileft, 3, $ileft
	mov		64, $iright
	mov		0xff, $omask
	sub		$iright, $ileft, $iright
	and		$out, 7, $ooff
	cmp		$len, 255
	movrnz		$ooff, 0, $blk_init		! if (	$out&7 ||
	movleu		$::size_t_cc, 0, $blk_init	!	$len<256 ||
	brnz,pn		$blk_init, .L${bits}_xts_${dir}blk !	$inp==$out)
	srl		$omask, $ooff, $omask

	andcc		$len, 16, %g0		! is number of blocks even?
___
$::code.=<<___ if ($dir eq "de");
	brz,pn		$len, .L${bits}_xts_${dir}steal
___
$::code.=<<___;
	alignaddrl	$out, %g0, $out
	bz		%icc, .L${bits}_xts_${dir}loop2x
	srlx		$len, 4, $len
.L${bits}_xts_${dir}loop:
	ldx		[$inp + 0], %o0
	brz,pt		$ileft, 4f
	ldx		[$inp + 8], %o1

	ldx		[$inp + 16], %o2
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	or		%g1, %o0, %o0
	srlx		%o2, $iright, %o2
	or		%o2, %o1, %o1
4:
	movxtod		%g2, %f12
	movxtod		%g3, %f14
	bshuffle	%f12, %f12, %f12
	bshuffle	%f14, %f14, %f14

	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	movxtod		%o0, %f0
	movxtod		%o1, %f2

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2

	prefetch	[$out + 63], 22
	prefetch	[$inp + 16+63], 20
	call		_${alg}${bits}_${dir}crypt_1x
	add		$inp, 16, $inp

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2

	srax		%g3, 63, %l7		! next tweak value: multiply
	addcc		%g2, %g2, %g2		! tweak by x in GF(2^128),
	and		%l7, 0x87, %l7		! i.e. modulo the polynomial
	addxc		%g3, %g3, %g3		! x^128+x^7+x^2+x+1
	xor		%l7, %g2, %g2

	brnz,pn		$ooff, 2f
	sub		$len, 1, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	brnz,pt		$len, .L${bits}_xts_${dir}loop2x
	add		$out, 16, $out

	brnz,pn		$rem, .L${bits}_xts_${dir}steal
	nop

	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f4		! handle unaligned output
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8
	stda		%f4, [$out + $omask]0xc0	! partial store
	std		%f6, [$out + 8]
	add		$out, 16, $out
	orn		%g0, $omask, $omask
	stda		%f8, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_xts_${dir}loop2x+4
	orn		%g0, $omask, $omask

	brnz,pn		$rem, .L${bits}_xts_${dir}steal
	nop

	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}_xts_${dir}loop2x:
	ldx		[$inp + 0], %o0
	ldx		[$inp + 8], %o1
	ldx		[$inp + 16], %o2
	brz,pt		$ileft, 4f
	ldx		[$inp + 24], %o3

	ldx		[$inp + 32], %o4
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	or		%g1, %o0, %o0
	sllx		%o1, $ileft, %o1
	srlx		%o2, $iright, %g1
	or		%g1, %o1, %o1
	sllx		%o2, $ileft, %o2
	srlx		%o3, $iright, %g1
	or		%g1, %o2, %o2
	sllx		%o3, $ileft, %o3
	srlx		%o4, $iright, %o4
	or		%o4, %o3, %o3
4:
	movxtod		%g2, %f12
	movxtod		%g3, %f14
	bshuffle	%f12, %f12, %f12
	bshuffle	%f14, %f14, %f14

	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %g2
	and		%l7, 0x87, %l7
	addxc		%g3, %g3, %g3
	xor		%l7, %g2, %g2

	movxtod		%g2, %f8
	movxtod		%g3, %f10
	bshuffle	%f8,  %f8,  %f8
	bshuffle	%f10, %f10, %f10

	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	xor		%g4, %o2, %o2		! ^= rk[0]
	xor		%g5, %o3, %o3
	movxtod		%o0, %f0
	movxtod		%o1, %f2
	movxtod		%o2, %f4
	movxtod		%o3, %f6

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2
	fxor		%f8,  %f4, %f4		! ^= tweak[0]
	fxor		%f10, %f6, %f6

	prefetch	[$out + 63], 22
	prefetch	[$inp + 32+63], 20
	call		_${alg}${bits}_${dir}crypt_2x
	add		$inp, 32, $inp

	movxtod		%g2, %f8
	movxtod		%g3, %f10

	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %g2
	and		%l7, 0x87, %l7
	addxc		%g3, %g3, %g3
	xor		%l7, %g2, %g2

	bshuffle	%f8,  %f8,  %f8
	bshuffle	%f10, %f10, %f10

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2
	fxor		%f8,  %f4, %f4
	fxor		%f10, %f6, %f6

	brnz,pn		$ooff, 2f
	sub		$len, 2, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	std		%f4, [$out + 16]
	std		%f6, [$out + 24]
	brnz,pt		$len, .L${bits}_xts_${dir}loop2x
	add		$out, 32, $out

	fsrc2		%f4, %f0
	fsrc2		%f6, %f2
	brnz,pn		$rem, .L${bits}_xts_${dir}steal
	nop

	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f8		! handle unaligned output
	faligndata	%f0, %f2, %f10
	faligndata	%f2, %f4, %f12
	faligndata	%f4, %f6, %f14
	faligndata	%f6, %f6, %f0

	stda		%f8, [$out + $omask]0xc0	! partial store
	std		%f10, [$out + 8]
	std		%f12, [$out + 16]
	std		%f14, [$out + 24]
	add		$out, 32, $out
	orn		%g0, $omask, $omask
	stda		%f0, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_xts_${dir}loop2x+4
	orn		%g0, $omask, $omask

	fsrc2		%f4, %f0
	fsrc2		%f6, %f2
	brnz,pn		$rem, .L${bits}_xts_${dir}steal
	nop

	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}_xts_${dir}blk:
	add	$out, $len, $blk_init
	and	$blk_init, 63, $blk_init	! tail
	sub	$len, $blk_init, $len
	add	$blk_init, 15, $blk_init	! round up to 16n
	srlx	$len, 4, $len
	srl	$blk_init, 4, $blk_init
	sub	$len, 1, $len
	add	$blk_init, 1, $blk_init

.L${bits}_xts_${dir}blk2x:
	ldx		[$inp + 0], %o0
	ldx		[$inp + 8], %o1
	ldx		[$inp + 16], %o2
	brz,pt		$ileft, 5f
	ldx		[$inp + 24], %o3

	ldx		[$inp + 32], %o4
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	or		%g1, %o0, %o0
	sllx		%o1, $ileft, %o1
	srlx		%o2, $iright, %g1
	or		%g1, %o1, %o1
	sllx		%o2, $ileft, %o2
	srlx		%o3, $iright, %g1
	or		%g1, %o2, %o2
	sllx		%o3, $ileft, %o3
	srlx		%o4, $iright, %o4
	or		%o4, %o3, %o3
5:
	movxtod		%g2, %f12
	movxtod		%g3, %f14
	bshuffle	%f12, %f12, %f12
	bshuffle	%f14, %f14, %f14

	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %g2
	and		%l7, 0x87, %l7
	addxc		%g3, %g3, %g3
	xor		%l7, %g2, %g2

	movxtod		%g2, %f8
	movxtod		%g3, %f10
	bshuffle	%f8,  %f8,  %f8
	bshuffle	%f10, %f10, %f10

	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	xor		%g4, %o2, %o2		! ^= rk[0]
	xor		%g5, %o3, %o3
	movxtod		%o0, %f0
	movxtod		%o1, %f2
	movxtod		%o2, %f4
	movxtod		%o3, %f6

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2
	fxor		%f8,  %f4, %f4		! ^= tweak[0]
	fxor		%f10, %f6, %f6

	prefetch	[$inp + 32+63], 20
	call		_${alg}${bits}_${dir}crypt_2x
	add		$inp, 32, $inp

	movxtod		%g2, %f8
	movxtod		%g3, %f10

	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %g2
	and		%l7, 0x87, %l7
	addxc		%g3, %g3, %g3
	xor		%l7, %g2, %g2

	bshuffle	%f8,  %f8,  %f8
	bshuffle	%f10, %f10, %f10

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2
	fxor		%f8,  %f4, %f4
	fxor		%f10, %f6, %f6

	subcc		$len, 2, $len
	stda		%f0, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f2, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f4, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f6, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	bgu,pt		$::size_t_cc, .L${bits}_xts_${dir}blk2x
	add		$out, 8, $out

	add		$blk_init, $len, $len
	andcc		$len, 1, %g0		! is number of blocks even?
	membar		#StoreLoad|#StoreStore
	bnz,pt		%icc, .L${bits}_xts_${dir}loop
	srl		$len, 0, $len
	brnz,pn		$len, .L${bits}_xts_${dir}loop2x
	nop

	fsrc2		%f4, %f0
	fsrc2		%f6, %f2
	brnz,pn		$rem, .L${bits}_xts_${dir}steal
	nop

	ret
	restore
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
___
$::code.=<<___ if ($dir eq "en");
.align	32
.L${bits}_xts_${dir}steal:
	std		%f0, [%fp + $::bias-16]	! copy of output
	std		%f2, [%fp + $::bias-8]

	srl		$ileft, 3, $ileft
	add		%fp, $::bias-16, %l7
	add		$inp, $ileft, $inp	! original $inp+$len&-15
	add		$out, $ooff, $out	! original $out+$len&-15
	mov		0, $ileft
	nop					! align

.L${bits}_xts_${dir}stealing:
	ldub		[$inp + $ileft], %o0
	ldub		[%l7  + $ileft], %o1
	dec		$rem
	stb		%o0, [%l7  + $ileft]
	stb		%o1, [$out + $ileft]
	brnz		$rem, .L${bits}_xts_${dir}stealing
	inc		$ileft

	mov		%l7, $inp
	sub		$out, 16, $out
	mov		0, $ileft
	sub		$out, $ooff, $out
	ba		.L${bits}_xts_${dir}loop	! one more time
	mov		1, $len				! $rem is 0
___
$::code.=<<___ if ($dir eq "de");
.align	32
.L${bits}_xts_${dir}steal:
	ldx		[$inp + 0], %o0
	brz,pt		$ileft, 8f
	ldx		[$inp + 8], %o1

	ldx		[$inp + 16], %o2
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	or		%g1, %o0, %o0
	srlx		%o2, $iright, %o2
	or		%o2, %o1, %o1
8:
	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %o2
	and		%l7, 0x87, %l7
	addxc		%g3, %g3, %o3
	xor		%l7, %o2, %o2

	movxtod		%o2, %f12
	movxtod		%o3, %f14
	bshuffle	%f12, %f12, %f12
	bshuffle	%f14, %f14, %f14

	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	movxtod		%o0, %f0
	movxtod		%o1, %f2

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2

	call		_${alg}${bits}_${dir}crypt_1x
	add		$inp, 16, $inp

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2

	std		%f0, [%fp + $::bias-16]
	std		%f2, [%fp + $::bias-8]

	srl		$ileft, 3, $ileft
	add		%fp, $::bias-16, %l7
	add		$inp, $ileft, $inp	! original $inp+$len&-15
	add		$out, $ooff, $out	! original $out+$len&-15
	mov		0, $ileft
	add		$out, 16, $out
	nop					! align

.L${bits}_xts_${dir}stealing:
	ldub		[$inp + $ileft], %o0
	ldub		[%l7  + $ileft], %o1
	dec		$rem
	stb		%o0, [%l7  + $ileft]
	stb		%o1, [$out + $ileft]
	brnz		$rem, .L${bits}_xts_${dir}stealing
	inc		$ileft

	mov		%l7, $inp
	sub		$out, 16, $out
	mov		0, $ileft
	sub		$out, $ooff, $out
	ba		.L${bits}_xts_${dir}loop	! one more time
	mov		1, $len				! $rem is 0
___
$::code.=<<___;
	ret
	restore
.type	${alg}${bits}_t4_xts_${dir}crypt,#function
.size	${alg}${bits}_t4_xts_${dir}crypt,.-${alg}${bits}_t4_xts_${dir}crypt
___
}
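
# The "steal" paths above implement XTS ciphertext stealing: the final
# partial block exchanges its bytes with the tail of the preceding
# 16-byte block through the on-stack scratch area at %fp+BIAS-16, after
# which the combined block makes one more trip through the main loop.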

# The purpose of these subroutines is to explicitly encode VIS
# instructions, so that one can compile the module without having to
# specify VIS extensions on the compiler command line, e.g. -xarch=v9
# vs. -xarch=v9a. The idea is to reserve the option to produce a
# "universal" binary and let the programmer detect whether the current
# CPU is VIS-capable at run-time.
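#
# For instance, a hypothetical standalone check (not executed by the
# module itself):
#
#	print unvis("faligndata","%f0","%f2","%f4"), "\n";
#
# would print the hand-encoded form, roughly
# ".word 0x89b00902 !faligndata %f0,%f2,%f4", while an unrecognized
# mnemonic is passed through verbatim.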
sub unvis {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %visopf = (	"faligndata"	=> 0x048,
		"bshuffle"	=> 0x04c,
		"fnot2"		=> 0x066,
		"fxor"		=> 0x06c,
		"fsrc2"		=> 0x078	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

sub unvis3 {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my ($ref,$opf);
my %visopf = (	"addxc"		=> 0x011,
		"addxccc"	=> 0x013,
		"umulxhi"	=> 0x016,
		"alignaddr"	=> 0x018,
		"bmask"		=> 0x019,
		"alignaddrl"	=> 0x01a	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%([goli])([0-9])/);
	    $_=$bias{$1}+$2;
	}

	return	sprintf ".word\t0x%08x !%s",
			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

sub unaes_round {	# 4-argument instructions
my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
my ($ref,$opf);
my %aesopf = (	"aes_eround01"	=> 0,
		"aes_eround23"	=> 1,
		"aes_dround01"	=> 2,
		"aes_dround23"	=> 3,
		"aes_eround01_l"=> 4,
		"aes_eround23_l"=> 5,
		"aes_dround01_l"=> 6,
		"aes_dround23_l"=> 7,
		"aes_kexpand1"	=> 8	);

    $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";

    if (defined($opf=$aesopf{$mnemonic})) {
	$rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3;
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

sub unaes_kexpand {	# 3-argument instructions
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %aesopf = (	"aes_kexpand0"	=> 0x130,
		"aes_kexpand2"	=> 0x131	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if (defined($opf=$aesopf{$mnemonic})) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

sub uncamellia_f {	# 4-argument instructions
my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
my ($ref,$opf);

    $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";

    if (1) {
	$rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3;
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|0xc<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

sub uncamellia3 {	# 3-argument instructions
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %cmllopf = (	"camellia_fl"	=> 0x13c,
		"camellia_fli"	=> 0x13d	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if (defined($opf=$cmllopf{$mnemonic})) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

sub unmovxtox {		# 2-argument instructions
my ($mnemonic,$rs,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24, "f" => 0 );
my ($ref,$opf);
my %movxopf = (	"movdtox"	=> 0x110,
		"movstouw"	=> 0x111,
		"movstosw"	=> 0x113,
		"movxtod"	=> 0x118,
		"movwtos"	=> 0x119	);

    $ref = "$mnemonic\t$rs,$rd";

    if (defined($opf=$movxopf{$mnemonic})) {
	foreach ($rs,$rd) {
	    return $ref if (!/%([fgoli])([0-9]{1,2})/);
	    $_=$bias{$1}+$2;
	    if ($2>=32) {
		return $ref if ($2&1);
		# re-encode for upper double register addressing
		$_=($2|$2>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			2<<30|$rd<<25|0x36<<19|$opf<<5|$rs,
			$ref;
    } else {
	return $ref;
    }
}

sub undes {
my ($mnemonic)=shift;
my @args=@_;
my ($ref,$opf);
my %desopf = (	"des_round"	=> 0b1001,
		"des_ip"	=> 0b100110100,
		"des_iip"	=> 0b100110101,
		"des_kexpand"	=> 0b100110110	);

    $ref = "$mnemonic\t".join(",",@_);

    if (defined($opf=$desopf{$mnemonic})) {	# 4-arg
	if ($mnemonic eq "des_round") {
	    foreach (@args[0..3]) {
		return $ref if (!/%f([0-9]{1,2})/);
		$_=$1;
		if ($1>=32) {
		    return $ref if ($1&1);
		    # re-encode for upper double register addressing
		    $_=($1|$1>>5)&31;
		}
	    }
	    return  sprintf ".word\t0x%08x !%s",
			    2<<30|0b011001<<19|$opf<<5|$args[0]<<14|$args[1]|$args[2]<<9|$args[3]<<25,
			    $ref;
	} elsif ($mnemonic eq "des_kexpand") {	# 3-arg
	    foreach (@args[0..2]) {
		return $ref if (!/(%f)?([0-9]{1,2})/);
		$_=$2;
		if ($2>=32) {
		    return $ref if ($2&1);
		    # re-encode for upper double register addressing
		    $_=($2|$2>>5)&31;
		}
	    }
	    return  sprintf ".word\t0x%08x !%s",
			    2<<30|0b110110<<19|$opf<<5|$args[0]<<14|$args[1]|$args[2]<<25,
			    $ref;
	} else {				# 2-arg
	    foreach (@args[0..1]) {
		return $ref if (!/%f([0-9]{1,2})/);
		$_=$1;
		if ($1>=32) {
		    return $ref if ($1&1);
		    # re-encode for upper double register addressing
		    $_=($1|$1>>5)&31;
		}
	    }
	    return  sprintf ".word\t0x%08x !%s",
			    2<<30|0b110110<<19|$opf<<5|$args[0]<<14|$args[1]<<25,
			    $ref;
	}
    } else {
	return $ref;
    }
}

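# emit_assembler() is the output pass: it walks $::code line by line,
# expands `...` constructs with eval, and rewrites the crypto and VIS
# mnemonics through the encoders above, so that the result assembles
# even with tools that do not recognize these instructions.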
sub emit_assembler {
    foreach (split("\n",$::code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	s/\b(f[a-z]+2[sd]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})\s*$/$1\t%f0,$2,$3/go;

	s/\b(aes_[edk][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
		&unaes_round($1,$2,$3,$4,$5)
	 /geo or
	s/\b(aes_kexpand[02])\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
		&unaes_kexpand($1,$2,$3,$4)
	 /geo or
	s/\b(camellia_f)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
		&uncamellia_f($1,$2,$3,$4,$5)
	 /geo or
	s/\b(camellia_[^s]+)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
		&uncamellia3($1,$2,$3,$4)
	 /geo or
	s/\b(des_\w+)\s+(%f[0-9]{1,2}),\s*([%fx0-9]+)(?:,\s*(%f[0-9]{1,2})(?:,\s*(%f[0-9]{1,2}))?)?/
		&undes($1,$2,$3,$4,$5)
	 /geo or
	s/\b(mov[ds]to\w+)\s+(%f[0-9]{1,2}),\s*(%[goli][0-7])/
		&unmovxtox($1,$2,$3)
	 /geo or
	s/\b(mov[xw]to[ds])\s+(%[goli][0-7]),\s*(%f[0-9]{1,2})/
		&unmovxtox($1,$2,$3)
	 /geo or
	s/\b([fb][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
		&unvis($1,$2,$3,$4)
	 /geo or
	s/\b(umulxhi|bmask|addxc[c]{0,2}|alignaddr[l]*)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
		&unvis3($1,$2,$3,$4)
	 /geo;

	print $_,"\n";
    }
}

1;
