#! /usr/bin/env perl
# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by David S. Miller and Andy Polyakov.
# The module is licensed under 2-clause BSD
# license. March 2013. All rights reserved.
# ====================================================================

######################################################################
# DES for SPARC T4.
#
# As with other hardware-assisted ciphers, CBC encrypt results [for
# aligned data] are virtually identical to critical path lengths:
#
#		DES		Triple-DES
# CBC encrypt	4.14/4.15(*)	11.7/11.7
# CBC decrypt	1.77/4.11(**)	6.42/7.47
#
#			 (*)	numbers after the slash are for
#				misaligned data;
#			 (**)	this is the result for the largest
#				block size; unlike all other
#				cases, smaller blocks give
#				better results[?];

# Work out the directory this script was started from, so the shared
# perlasm helper can be found either next to the script or in
# ../../perlasm relative to it.  If $0 has no directory component, $dir
# is left empty and the relative path alone is used.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
push(@INC,"${dir}","${dir}../../perlasm");
require "sparcv9_modes.pl";

# The last command-line argument names the output file.  Everything the
# script produces is printed to STDOUT, so STDOUT is redirected there.
# Without an argument the output stays on the inherited STDOUT; a
# failure to open the file is fatal instead of silently losing output.
$output=pop;
if (defined $output) {
    open STDOUT, '>', $output or die "can't open $output: $!";
}

# Assembler preamble: include the architecture macros, declare %g2 and
# %g3 as scratch registers when __arch64__ is defined, and switch to the
# .text section.
$code.=<<___;
#include "sparc_arch.h"

#ifdef	__arch64__
.register       %g2,#scratch
.register       %g3,#scratch
#endif

.text
___

# des_t4_key_expand: emit the key-schedule routine.
#
#   $inp (%o0)	address of the key material (8 bytes are consumed);
#   $out (%o1)	address that receives the expanded schedule.
#
# The emitted code tests the low three bits of $inp.  For an aligned
# pointer a single ldd is enough; otherwise a second doubleword is
# loaded and faligndata extracts the 8 key bytes.  A chain of sixteen
# des_kexpand instructions then derives the schedule, and sixteen std
# instructions store it at $out+0x00 .. $out+0x78 (128 bytes).  Stores
# are interleaved with the expansions, so they are not issued in
# ascending address order.
{ my ($inp,$out)=("%o0","%o1");

$code.=<<___;
.align	32
.globl	des_t4_key_expand
.type	des_t4_key_expand,#function
des_t4_key_expand:
	andcc		$inp, 0x7, %g0
	alignaddr	$inp, %g0, $inp
	bz,pt		%icc, 1f
	ldd		[$inp + 0x00], %f0
	ldd		[$inp + 0x08], %f2
	faligndata	%f0, %f2, %f0
1:	des_kexpand	%f0, 0, %f0
	des_kexpand	%f0, 1, %f2
	std		%f0, [$out + 0x00]
	des_kexpand	%f2, 3, %f6
	std		%f2, [$out + 0x08]
	des_kexpand	%f2, 2, %f4
	des_kexpand	%f6, 3, %f10
	std		%f6, [$out + 0x18]
	des_kexpand	%f6, 2, %f8
	std		%f4, [$out + 0x10]
	des_kexpand	%f10, 3, %f14
	std		%f10, [$out + 0x28]
	des_kexpand	%f10, 2, %f12
	std		%f8, [$out + 0x20]
	des_kexpand	%f14, 1, %f16
	std		%f14, [$out + 0x38]
	des_kexpand	%f16, 3, %f20
	std		%f12, [$out + 0x30]
	des_kexpand	%f16, 2, %f18
	std		%f16, [$out + 0x40]
	des_kexpand	%f20, 3, %f24
	std		%f20, [$out + 0x50]
	des_kexpand	%f20, 2, %f22
	std		%f18, [$out + 0x48]
	des_kexpand	%f24, 3, %f28
	std		%f24, [$out + 0x60]
	des_kexpand	%f24, 2, %f26
	std		%f22, [$out + 0x58]
	des_kexpand	%f28, 1, %f30
	std		%f28, [$out + 0x70]
	std		%f26, [$out + 0x68]
	retl
	std		%f30, [$out + 0x78]
.size	des_t4_key_expand,.-des_t4_key_expand
___
}
# CBC entry points: des_t4_cbc_encrypt, des_t4_cbc_decrypt,
# des_t4_ede3_cbc_encrypt and des_t4_ede3_cbc_decrypt.
#
# All four routines take their arguments in %o0..%o4:
#
#   $inp   (%o0)	input pointer, any alignment;
#   $out   (%o1)	output pointer, any alignment;
#   $len   (%o2)	length in bytes; it is shifted right by 3, so only
#			whole 8-byte blocks are processed, and a length of
#			zero returns at once through .Lcbc_abort without
#			touching $ivec;
#   $key   (%o3)	key schedule, read with ldd;
#   $ivec  (%o4)	8-byte chaining value, loaded on entry and written
#			back before returning.
#
# Scratch registers:
#
#   $ileft  (%g1)	($inp & 7) * 8, left-shift count for misaligned input;
#   $iright (%g2)	-$ileft, right-shift count for the following doubleword;
#   $omask  (%g3)	0xff >> ($out & 7), or 0 when $out is 8-byte aligned.
#
# Input handling: $inp is rounded down to an 8-byte boundary.  When
# $ileft is non-zero, each block is assembled from two ldx loads that
# are shifted and or-ed together.
#
# Output handling: when $omask is zero the block is written with one
# std.  Otherwise the result is rotated with faligndata and written by
# two partial stores (stda ... 0xc0), the second one using the
# complemented mask.  That path pre-loads the next input doubleword
# with ldxa and re-enters the loop at "+4", which skips the ldx at the
# top of the loop.
{ my ($inp,$out,$len,$key,$ivec) = map("%o$_",(0..4));
  my ($ileft,$iright,$omask) = map("%g$_",(1..3));

# Single DES.  The encrypt routine keeps the chaining value in %f0/%f1
# and loads the sixteen schedule doublewords from $key+0x00 upwards into
# %f4..%f34.  The decrypt routine keeps the chaining value in %f2/%f3,
# loads the same sixteen doublewords in the opposite order ($key+0x78
# downwards), and after each block copies the input block still held in
# %g4 into %f2 as the next chaining value.
$code.=<<___;
.globl	des_t4_cbc_encrypt
.align	32
des_t4_cbc_encrypt:
	cmp		$len, 0
	be,pn		$::size_t_cc, .Lcbc_abort
	srln		$len, 0, $len		! needed on v8+, "nop" on v9
	ld		[$ivec + 0], %f0	! load ivec
	ld		[$ivec + 4], %f1

	and		$inp, 7, $ileft
	andn		$inp, 7, $inp
	sll		$ileft, 3, $ileft
	mov		0xff, $omask
	prefetch	[$inp], 20
	prefetch	[$inp + 63], 20
	sub		%g0, $ileft, $iright
	and		$out, 7, %g4
	alignaddrl	$out, %g0, $out
	srl		$omask, %g4, $omask
	srlx		$len, 3, $len
	movrz		%g4, 0, $omask
	prefetch	[$out], 22

	ldd		[$key + 0x00], %f4	! load key schedule
	ldd		[$key + 0x08], %f6
	ldd		[$key + 0x10], %f8
	ldd		[$key + 0x18], %f10
	ldd		[$key + 0x20], %f12
	ldd		[$key + 0x28], %f14
	ldd		[$key + 0x30], %f16
	ldd		[$key + 0x38], %f18
	ldd		[$key + 0x40], %f20
	ldd		[$key + 0x48], %f22
	ldd		[$key + 0x50], %f24
	ldd		[$key + 0x58], %f26
	ldd		[$key + 0x60], %f28
	ldd		[$key + 0x68], %f30
	ldd		[$key + 0x70], %f32
	ldd		[$key + 0x78], %f34

.Ldes_cbc_enc_loop:
	ldx		[$inp + 0], %g4
	brz,pt		$ileft, 4f
	nop

	ldx		[$inp + 8], %g5
	sllx		%g4, $ileft, %g4
	srlx		%g5, $iright, %g5
	or		%g5, %g4, %g4
4:
	movxtod		%g4, %f2
	prefetch	[$inp + 8+63], 20
	add		$inp, 8, $inp
	fxor		%f2, %f0, %f0		! ^= ivec
	prefetch	[$out + 63], 22

	des_ip		%f0, %f0
	des_round	%f4, %f6, %f0, %f0
	des_round	%f8, %f10, %f0, %f0
	des_round	%f12, %f14, %f0, %f0
	des_round	%f16, %f18, %f0, %f0
	des_round	%f20, %f22, %f0, %f0
	des_round	%f24, %f26, %f0, %f0
	des_round	%f28, %f30, %f0, %f0
	des_round	%f32, %f34, %f0, %f0
	des_iip		%f0, %f0

	brnz,pn		$omask, 2f
	sub		$len, 1, $len

	std		%f0, [$out + 0]
	brnz,pt		$len, .Ldes_cbc_enc_loop
	add		$out, 8, $out

	st		%f0, [$ivec + 0]	! write out ivec
	retl
	st		%f1, [$ivec + 4]
.Lcbc_abort:
	retl
	nop

.align	16
2:	ldxa		[$inp]0x82, %g4		! avoid read-after-write hazard
						! and ~4x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f2		! handle unaligned output

	stda		%f2, [$out + $omask]0xc0	! partial store
	add		$out, 8, $out
	orn		%g0, $omask, $omask
	stda		%f2, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .Ldes_cbc_enc_loop+4
	orn		%g0, $omask, $omask

	st		%f0, [$ivec + 0]	! write out ivec
	retl
	st		%f1, [$ivec + 4]
.type	des_t4_cbc_encrypt,#function
.size	des_t4_cbc_encrypt,.-des_t4_cbc_encrypt

.globl	des_t4_cbc_decrypt
.align	32
des_t4_cbc_decrypt:
	cmp		$len, 0
	be,pn		$::size_t_cc, .Lcbc_abort
	srln		$len, 0, $len		! needed on v8+, "nop" on v9
	ld		[$ivec + 0], %f2	! load ivec
	ld		[$ivec + 4], %f3

	and		$inp, 7, $ileft
	andn		$inp, 7, $inp
	sll		$ileft, 3, $ileft
	mov		0xff, $omask
	prefetch	[$inp], 20
	prefetch	[$inp + 63], 20
	sub		%g0, $ileft, $iright
	and		$out, 7, %g4
	alignaddrl	$out, %g0, $out
	srl		$omask, %g4, $omask
	srlx		$len, 3, $len
	movrz		%g4, 0, $omask
	prefetch	[$out], 22

	ldd		[$key + 0x78], %f4	! load key schedule
	ldd		[$key + 0x70], %f6
	ldd		[$key + 0x68], %f8
	ldd		[$key + 0x60], %f10
	ldd		[$key + 0x58], %f12
	ldd		[$key + 0x50], %f14
	ldd		[$key + 0x48], %f16
	ldd		[$key + 0x40], %f18
	ldd		[$key + 0x38], %f20
	ldd		[$key + 0x30], %f22
	ldd		[$key + 0x28], %f24
	ldd		[$key + 0x20], %f26
	ldd		[$key + 0x18], %f28
	ldd		[$key + 0x10], %f30
	ldd		[$key + 0x08], %f32
	ldd		[$key + 0x00], %f34

.Ldes_cbc_dec_loop:
	ldx		[$inp + 0], %g4
	brz,pt		$ileft, 4f
	nop

	ldx		[$inp + 8], %g5
	sllx		%g4, $ileft, %g4
	srlx		%g5, $iright, %g5
	or		%g5, %g4, %g4
4:
	movxtod		%g4, %f0
	prefetch	[$inp + 8+63], 20
	add		$inp, 8, $inp
	prefetch	[$out + 63], 22

	des_ip		%f0, %f0
	des_round	%f4, %f6, %f0, %f0
	des_round	%f8, %f10, %f0, %f0
	des_round	%f12, %f14, %f0, %f0
	des_round	%f16, %f18, %f0, %f0
	des_round	%f20, %f22, %f0, %f0
	des_round	%f24, %f26, %f0, %f0
	des_round	%f28, %f30, %f0, %f0
	des_round	%f32, %f34, %f0, %f0
	des_iip		%f0, %f0

	fxor		%f2, %f0, %f0		! ^= ivec
	movxtod		%g4, %f2

	brnz,pn		$omask, 2f
	sub		$len, 1, $len

	std		%f0, [$out + 0]
	brnz,pt		$len, .Ldes_cbc_dec_loop
	add		$out, 8, $out

	st		%f2, [$ivec + 0]	! write out ivec
	retl
	st		%f3, [$ivec + 4]

.align	16
2:	ldxa		[$inp]0x82, %g4		! avoid read-after-write hazard
						! and ~4x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f0		! handle unaligned output

	stda		%f0, [$out + $omask]0xc0	! partial store
	add		$out, 8, $out
	orn		%g0, $omask, $omask
	stda		%f0, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .Ldes_cbc_dec_loop+4
	orn		%g0, $omask, $omask

	st		%f2, [$ivec + 0]	! write out ivec
	retl
	st		%f3, [$ivec + 4]
.type	des_t4_cbc_decrypt,#function
.size	des_t4_cbc_decrypt,.-des_t4_cbc_decrypt
___

# Triple DES (EDE3).  The key schedule is read as three consecutive
# 0x80-byte schedules at $key+0x000, $key+0x080 and $key+0x100.
#
#   encrypt:	pass 1 reads $key+0x000.. upwards (pre-loaded into
#		%f4..%f34 before the loop), pass 2 reads $key+0x0f8
#		downwards to $key+0x080, pass 3 reads $key+0x100..
#		upwards;
#   decrypt:	pass 1 reads $key+0x178 downwards to $key+0x100
#		(pre-loaded before the loop), pass 2 reads $key+0x080..
#		upwards, pass 3 reads $key+0x078 downwards to $key+0x000.
#
# Only the first pass's schedule stays resident in registers.  The
# other two are re-loaded into %f36..%f62 for every block, with the ldd
# instructions interleaved between the des_round instructions.
#
# One might wonder why there are back-to-back des_iip/des_ip pairs
# between EDE passes. Indeed, aren't they inverses of each other?
# They almost are: the outcome of the pair is that the 32-bit words in
# the target register are swapped. Consider the des_iip/des_ip pair as
# a way to perform the due swap; it is actually the fastest way in this
# case.

$code.=<<___;
.globl	des_t4_ede3_cbc_encrypt
.align	32
des_t4_ede3_cbc_encrypt:
	cmp		$len, 0
	be,pn		$::size_t_cc, .Lcbc_abort
	srln		$len, 0, $len		! needed on v8+, "nop" on v9
	ld		[$ivec + 0], %f0	! load ivec
	ld		[$ivec + 4], %f1

	and		$inp, 7, $ileft
	andn		$inp, 7, $inp
	sll		$ileft, 3, $ileft
	mov		0xff, $omask
	prefetch	[$inp], 20
	prefetch	[$inp + 63], 20
	sub		%g0, $ileft, $iright
	and		$out, 7, %g4
	alignaddrl	$out, %g0, $out
	srl		$omask, %g4, $omask
	srlx		$len, 3, $len
	movrz		%g4, 0, $omask
	prefetch	[$out], 22

	ldd		[$key + 0x00], %f4	! load key schedule
	ldd		[$key + 0x08], %f6
	ldd		[$key + 0x10], %f8
	ldd		[$key + 0x18], %f10
	ldd		[$key + 0x20], %f12
	ldd		[$key + 0x28], %f14
	ldd		[$key + 0x30], %f16
	ldd		[$key + 0x38], %f18
	ldd		[$key + 0x40], %f20
	ldd		[$key + 0x48], %f22
	ldd		[$key + 0x50], %f24
	ldd		[$key + 0x58], %f26
	ldd		[$key + 0x60], %f28
	ldd		[$key + 0x68], %f30
	ldd		[$key + 0x70], %f32
	ldd		[$key + 0x78], %f34

.Ldes_ede3_cbc_enc_loop:
	ldx		[$inp + 0], %g4
	brz,pt		$ileft, 4f
	nop

	ldx		[$inp + 8], %g5
	sllx		%g4, $ileft, %g4
	srlx		%g5, $iright, %g5
	or		%g5, %g4, %g4
4:
	movxtod		%g4, %f2
	prefetch	[$inp + 8+63], 20
	add		$inp, 8, $inp
	fxor		%f2, %f0, %f0		! ^= ivec
	prefetch	[$out + 63], 22

	des_ip		%f0, %f0
	des_round	%f4, %f6, %f0, %f0
	des_round	%f8, %f10, %f0, %f0
	des_round	%f12, %f14, %f0, %f0
	des_round	%f16, %f18, %f0, %f0
	ldd		[$key + 0x100-0x08], %f36
	ldd		[$key + 0x100-0x10], %f38
	des_round	%f20, %f22, %f0, %f0
	ldd		[$key + 0x100-0x18], %f40
	ldd		[$key + 0x100-0x20], %f42
	des_round	%f24, %f26, %f0, %f0
	ldd		[$key + 0x100-0x28], %f44
	ldd		[$key + 0x100-0x30], %f46
	des_round	%f28, %f30, %f0, %f0
	ldd		[$key + 0x100-0x38], %f48
	ldd		[$key + 0x100-0x40], %f50
	des_round	%f32, %f34, %f0, %f0
	ldd		[$key + 0x100-0x48], %f52
	ldd		[$key + 0x100-0x50], %f54
	des_iip		%f0, %f0

	ldd		[$key + 0x100-0x58], %f56
	ldd		[$key + 0x100-0x60], %f58
	des_ip		%f0, %f0
	ldd		[$key + 0x100-0x68], %f60
	ldd		[$key + 0x100-0x70], %f62
	des_round	%f36, %f38, %f0, %f0
	ldd		[$key + 0x100-0x78], %f36
	ldd		[$key + 0x100-0x80], %f38
	des_round	%f40, %f42, %f0, %f0
	des_round	%f44, %f46, %f0, %f0
	des_round	%f48, %f50, %f0, %f0
	ldd		[$key + 0x100+0x00], %f40
	ldd		[$key + 0x100+0x08], %f42
	des_round	%f52, %f54, %f0, %f0
	ldd		[$key + 0x100+0x10], %f44
	ldd		[$key + 0x100+0x18], %f46
	des_round	%f56, %f58, %f0, %f0
	ldd		[$key + 0x100+0x20], %f48
	ldd		[$key + 0x100+0x28], %f50
	des_round	%f60, %f62, %f0, %f0
	ldd		[$key + 0x100+0x30], %f52
	ldd		[$key + 0x100+0x38], %f54
	des_round	%f36, %f38, %f0, %f0
	ldd		[$key + 0x100+0x40], %f56
	ldd		[$key + 0x100+0x48], %f58
	des_iip		%f0, %f0

	ldd		[$key + 0x100+0x50], %f60
	ldd		[$key + 0x100+0x58], %f62
	des_ip		%f0, %f0
	ldd		[$key + 0x100+0x60], %f36
	ldd		[$key + 0x100+0x68], %f38
	des_round	%f40, %f42, %f0, %f0
	ldd		[$key + 0x100+0x70], %f40
	ldd		[$key + 0x100+0x78], %f42
	des_round	%f44, %f46, %f0, %f0
	des_round	%f48, %f50, %f0, %f0
	des_round	%f52, %f54, %f0, %f0
	des_round	%f56, %f58, %f0, %f0
	des_round	%f60, %f62, %f0, %f0
	des_round	%f36, %f38, %f0, %f0
	des_round	%f40, %f42, %f0, %f0
	des_iip		%f0, %f0

	brnz,pn		$omask, 2f
	sub		$len, 1, $len

	std		%f0, [$out + 0]
	brnz,pt		$len, .Ldes_ede3_cbc_enc_loop
	add		$out, 8, $out

	st		%f0, [$ivec + 0]	! write out ivec
	retl
	st		%f1, [$ivec + 4]

.align	16
2:	ldxa		[$inp]0x82, %g4		! avoid read-after-write hazard
						! and ~2x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f2		! handle unaligned output

	stda		%f2, [$out + $omask]0xc0	! partial store
	add		$out, 8, $out
	orn		%g0, $omask, $omask
	stda		%f2, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .Ldes_ede3_cbc_enc_loop+4
	orn		%g0, $omask, $omask

	st		%f0, [$ivec + 0]	! write out ivec
	retl
	st		%f1, [$ivec + 4]
.type	des_t4_ede3_cbc_encrypt,#function
.size	des_t4_ede3_cbc_encrypt,.-des_t4_ede3_cbc_encrypt

.globl	des_t4_ede3_cbc_decrypt
.align	32
des_t4_ede3_cbc_decrypt:
	cmp		$len, 0
	be,pn		$::size_t_cc, .Lcbc_abort
	srln		$len, 0, $len		! needed on v8+, "nop" on v9
	ld		[$ivec + 0], %f2	! load ivec
	ld		[$ivec + 4], %f3

	and		$inp, 7, $ileft
	andn		$inp, 7, $inp
	sll		$ileft, 3, $ileft
	mov		0xff, $omask
	prefetch	[$inp], 20
	prefetch	[$inp + 63], 20
	sub		%g0, $ileft, $iright
	and		$out, 7, %g4
	alignaddrl	$out, %g0, $out
	srl		$omask, %g4, $omask
	srlx		$len, 3, $len
	movrz		%g4, 0, $omask
	prefetch	[$out], 22

	ldd		[$key + 0x100+0x78], %f4	! load key schedule
	ldd		[$key + 0x100+0x70], %f6
	ldd		[$key + 0x100+0x68], %f8
	ldd		[$key + 0x100+0x60], %f10
	ldd		[$key + 0x100+0x58], %f12
	ldd		[$key + 0x100+0x50], %f14
	ldd		[$key + 0x100+0x48], %f16
	ldd		[$key + 0x100+0x40], %f18
	ldd		[$key + 0x100+0x38], %f20
	ldd		[$key + 0x100+0x30], %f22
	ldd		[$key + 0x100+0x28], %f24
	ldd		[$key + 0x100+0x20], %f26
	ldd		[$key + 0x100+0x18], %f28
	ldd		[$key + 0x100+0x10], %f30
	ldd		[$key + 0x100+0x08], %f32
	ldd		[$key + 0x100+0x00], %f34

.Ldes_ede3_cbc_dec_loop:
	ldx		[$inp + 0], %g4
	brz,pt		$ileft, 4f
	nop

	ldx		[$inp + 8], %g5
	sllx		%g4, $ileft, %g4
	srlx		%g5, $iright, %g5
	or		%g5, %g4, %g4
4:
	movxtod		%g4, %f0
	prefetch	[$inp + 8+63], 20
	add		$inp, 8, $inp
	prefetch	[$out + 63], 22

	des_ip		%f0, %f0
	des_round	%f4, %f6, %f0, %f0
	des_round	%f8, %f10, %f0, %f0
	des_round	%f12, %f14, %f0, %f0
	des_round	%f16, %f18, %f0, %f0
	ldd		[$key + 0x80+0x00], %f36
	ldd		[$key + 0x80+0x08], %f38
	des_round	%f20, %f22, %f0, %f0
	ldd		[$key + 0x80+0x10], %f40
	ldd		[$key + 0x80+0x18], %f42
	des_round	%f24, %f26, %f0, %f0
	ldd		[$key + 0x80+0x20], %f44
	ldd		[$key + 0x80+0x28], %f46
	des_round	%f28, %f30, %f0, %f0
	ldd		[$key + 0x80+0x30], %f48
	ldd		[$key + 0x80+0x38], %f50
	des_round	%f32, %f34, %f0, %f0
	ldd		[$key + 0x80+0x40], %f52
	ldd		[$key + 0x80+0x48], %f54
	des_iip		%f0, %f0

	ldd		[$key + 0x80+0x50], %f56
	ldd		[$key + 0x80+0x58], %f58
	des_ip		%f0, %f0
	ldd		[$key + 0x80+0x60], %f60
	ldd		[$key + 0x80+0x68], %f62
	des_round	%f36, %f38, %f0, %f0
	ldd		[$key + 0x80+0x70], %f36
	ldd		[$key + 0x80+0x78], %f38
	des_round	%f40, %f42, %f0, %f0
	des_round	%f44, %f46, %f0, %f0
	des_round	%f48, %f50, %f0, %f0
	ldd		[$key + 0x80-0x08], %f40
	ldd		[$key + 0x80-0x10], %f42
	des_round	%f52, %f54, %f0, %f0
	ldd		[$key + 0x80-0x18], %f44
	ldd		[$key + 0x80-0x20], %f46
	des_round	%f56, %f58, %f0, %f0
	ldd		[$key + 0x80-0x28], %f48
	ldd		[$key + 0x80-0x30], %f50
	des_round	%f60, %f62, %f0, %f0
	ldd		[$key + 0x80-0x38], %f52
	ldd		[$key + 0x80-0x40], %f54
	des_round	%f36, %f38, %f0, %f0
	ldd		[$key + 0x80-0x48], %f56
	ldd		[$key + 0x80-0x50], %f58
	des_iip		%f0, %f0

	ldd		[$key + 0x80-0x58], %f60
	ldd		[$key + 0x80-0x60], %f62
	des_ip		%f0, %f0
	ldd		[$key + 0x80-0x68], %f36
	ldd		[$key + 0x80-0x70], %f38
	des_round	%f40, %f42, %f0, %f0
	ldd		[$key + 0x80-0x78], %f40
	ldd		[$key + 0x80-0x80], %f42
	des_round	%f44, %f46, %f0, %f0
	des_round	%f48, %f50, %f0, %f0
	des_round	%f52, %f54, %f0, %f0
	des_round	%f56, %f58, %f0, %f0
	des_round	%f60, %f62, %f0, %f0
	des_round	%f36, %f38, %f0, %f0
	des_round	%f40, %f42, %f0, %f0
	des_iip		%f0, %f0

	fxor		%f2, %f0, %f0		! ^= ivec
	movxtod		%g4, %f2

	brnz,pn		$omask, 2f
	sub		$len, 1, $len

	std		%f0, [$out + 0]
	brnz,pt		$len, .Ldes_ede3_cbc_dec_loop
	add		$out, 8, $out

	st		%f2, [$ivec + 0]	! write out ivec
	retl
	st		%f3, [$ivec + 4]

.align	16
2:	ldxa		[$inp]0x82, %g4		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f0		! handle unaligned output

	stda		%f0, [$out + $omask]0xc0	! partial store
	add		$out, 8, $out
	orn		%g0, $omask, $omask
	stda		%f0, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .Ldes_ede3_cbc_dec_loop+4
	orn		%g0, $omask, $omask

	st		%f2, [$ivec + 0]	! write out ivec
	retl
	st		%f3, [$ivec + 4]
.type	des_t4_ede3_cbc_decrypt,#function
.size	des_t4_ede3_cbc_decrypt,.-des_t4_ede3_cbc_decrypt
___
}
# Identification string embedded in the generated object, followed by
# re-alignment to a 4-byte boundary.
$code.=<<___;
.asciz  "DES for SPARC T4, David S. Miller, Andy Polyakov"
.align  4
___

# emit_assembler() is supplied by sparcv9_modes.pl, loaded with require
# at the top of the script.  It is called with no arguments here,
# after $code has been fully assembled.
emit_assembler();

# Buffered write errors (for example a full disk) only surface when the
# handle is closed, so a failed close must not go unnoticed.
close STDOUT or die "error closing STDOUT: $!";
628