1#! /usr/bin/env perl
2# Copyright 2008-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
12#
13# This module may be used under the terms of either the GNU General
14# Public License version 2 or later, the GNU Lesser General Public
15# License version 2.1 or later, the Mozilla Public License version
16# 1.1 or the BSD License. The exact terms of either license are
17# distributed along with this module. For further details see
18# http://www.openssl.org/~appro/camellia/.
19# ====================================================================
20
21# Performance in cycles per processed byte (less is better) in
22# 'openssl speed ...' benchmark:
23#
24#			AMD64	Core2	EM64T
25# -evp camellia-128-ecb	16.7	21.0	22.7
26# + over gcc 3.4.6	+25%	+5%	0%
27#
28# camellia-128-cbc	15.7	20.4	21.1
29#
30# 128-bit key setup	128	216	205	cycles/key
31# + over gcc 3.4.6	+54%	+39%	+15%
32#
33# Numbers in "+" rows represent performance improvement over compiler
34# generated code. Key setup timings are impressive on AMD and Core2
35# thanks to 64-bit operations being covertly deployed. Improvement on
36# EM64T, pre-Core2 Intel x86_64 CPU, is not as impressive, because it
37# apparently emulates some of 64-bit operations in [32-bit] microcode.
38
# Command line: [flavour] output.  If the first argument contains a dot it
# is taken to be the output file name and no flavour is assumed.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 conventions are selected by a nasm/masm/mingw64 flavour or by an
# output file ending in .asm.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Look for the translator next to this script first, then in ../../perlasm.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT from here on is piped through the translator.
# A failed fork/exec must be fatal, otherwise the generated code would be
# written to a dead handle and silently lost.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
52
# hi(REG): helper called from `...` expressions embedded in the generated
# text.  Rewrites %eax/%rax, %ebx/%rbx, %ecx/%rcx or %edx/%rdx to the
# corresponding %ah..%dh name and returns the result; an operand that does
# not match is returned unchanged.
# The sub takes one argument, so it carries no prototype (the former empty
# prototype only worked because callers use the &hi(...) form), and the
# replacement uses ${1} rather than the sed-style \1.
sub hi { my $r=shift; $r =~ s/%[er]([a-d])x/%${1}h/;    $r; }
# lo(REG): helper called from `...` expressions embedded in the generated
# text.  Returns the low-byte name of a register operand:
#   %eax/%rax .. %edx/%rdx  ->  %al .. %dl
#   %esi/%rsi, %edi/%rdi    ->  %sil, %dil
#   %rN / %rNd              ->  %rNb
# The three substitutions are tried in that order; an operand matching none
# of them is returned unchanged.
# The sub takes one argument, so it carries no prototype (the former empty
# prototype only worked because callers use the &lo(...) form), and the
# replacements use ${1} rather than the sed-style \1.
sub lo { my $r=shift; $r =~ s/%[er]([a-d])x/%${1}l/;
                      $r =~ s/%[er]([sd]i)/%${1}l/;
                      $r =~ s/%(r[0-9]+)[d]?/%${1}b/;   $r; }
57
# Register assignment shared by all generators below.
($t0,$t1,$t2,$t3)=("%eax","%ebx","%ecx","%edx");	# temporaries
@S=map { "%r${_}d" } (8,9,10,11);			# four 32-bit state words
($i0,$i1)=("%esi","%edi");				# table index registers
$Tbl="%rbp";	# size optimization
($inp,$out)=("%r12","%r13");
($key,$keyend)=("%r14","%r15");
# 32-bit name of the first integer argument register under the target ABI
$arg0d=($win64 ? "%ecx" : "%edi");
68
# const unsigned int Camellia_SBOX[4][256];
# Byte offsets of the four logical tables inside .LCamellia_SBOX.  The data
# is not stored table after table: entries of [0] and [1] alternate (one
# 8-byte pair per index), and the pairs of [2] and [3] follow after 256*8
# bytes.  This is done to minimize code size.
($SBOX1_1110,$SBOX4_4404)=(0,4);	# Camellia_SBOX[0], Camellia_SBOX[1]
$SBOX2_0222=256*8;			# Camellia_SBOX[2]
$SBOX3_3033=$SBOX2_0222+4;		# Camellia_SBOX[3]
76
# Camellia_Feistel(ROUND[,SEED]) appends the code for one Feistel round to
# $code.
#
#   ROUND  round number.  Its parity selects which half of @S feeds the
#          round: even rounds read @S[0],@S[1] and update @S[2],@S[3], odd
#          rounds read @S[2],@S[3] and update @S[0],@S[1].
#   SEED   byte offset, relative to $key, that the subkey offsets are
#          counted from; 0 when omitted.  A negative SEED makes the offsets
#          step by -8 per round instead of +8.
#
# The generated code expects $t0/$t1 to hold this round's subkey words on
# entry and leaves the prefetched words for round ROUND+1 in them on exit;
# $t2, $t3, $i0 and $i1 are used as scratch.  The `...` expressions are
# kept verbatim in the text and evaluated by the final pass over $code.
sub Camellia_Feistel {
my ($i,$seed)=@_;
$seed=0 unless defined($seed);
my $scale=$seed<0?-8:8;
my $j=($i&1)*2;
my ($s0,$s1,$s2,$s3)=@S[map { ($j+$_)%4 } (0..3)];

$code.=<<___;
	xor	$s0,$t0				# t0^=key[0]
	xor	$s1,$t1				# t1^=key[1]
	movz	`&hi("$t0")`,$i0		# (t0>>8)&0xff
	movz	`&lo("$t1")`,$i1		# (t1>>0)&0xff
	mov	$SBOX3_3033($Tbl,$i0,8),$t3	# t3=SBOX3_3033[0]
	mov	$SBOX1_1110($Tbl,$i1,8),$t2	# t2=SBOX1_1110[1]
	movz	`&lo("$t0")`,$i0		# (t0>>0)&0xff
	shr	\$16,$t0
	movz	`&hi("$t1")`,$i1		# (t1>>8)&0xff
	xor	$SBOX4_4404($Tbl,$i0,8),$t3	# t3^=SBOX4_4404[0]
	shr	\$16,$t1
	xor	$SBOX4_4404($Tbl,$i1,8),$t2	# t2^=SBOX4_4404[1]
	movz	`&hi("$t0")`,$i0		# (t0>>24)&0xff
	movz	`&lo("$t1")`,$i1		# (t1>>16)&0xff
	xor	$SBOX1_1110($Tbl,$i0,8),$t3	# t3^=SBOX1_1110[0]
	xor	$SBOX3_3033($Tbl,$i1,8),$t2	# t2^=SBOX3_3033[1]
	movz	`&lo("$t0")`,$i0		# (t0>>16)&0xff
	movz	`&hi("$t1")`,$i1		# (t1>>24)&0xff
	xor	$SBOX2_0222($Tbl,$i0,8),$t3	# t3^=SBOX2_0222[0]
	xor	$SBOX2_0222($Tbl,$i1,8),$t2	# t2^=SBOX2_0222[1]
	mov	`$seed+($i+1)*$scale`($key),$t1	# prefetch key[i+1]
	mov	`$seed+($i+1)*$scale+4`($key),$t0
	xor	$t3,$t2				# t2^=t3
	ror	\$8,$t3				# t3=RightRotate(t3,8)
	xor	$t2,$s2
	xor	$t2,$s3
	xor	$t3,$s3
___
}
114
# Generator for the block-encryption entry points and the head of the shared
# encryption core.  The text is stored in $code and post-processed at the
# end of the script.
#  - Camellia_EncryptBlock (V1.x API) converts keyBitLength into grandRounds
#    (3, plus the carry out of 128-keyBitLength, i.e. 4 for longer keys) and
#    jumps to .Lenc_rounds.
#  - Camellia_EncryptBlock_Rounds (V2) saves %rbx,%rbp,%r13-%r15, points
#    $keyend at keyTable+grandRounds*64, loads and byte-swaps the 16-byte
#    input into @S[0..3], calls the core and stores the byte-swapped result.
#  - _x86_64_Camellia_encrypt xors the first four key words into the state
#    and opens the .Leloop round loop.
115# void Camellia_EncryptBlock_Rounds(
116#		int grandRounds,
117#		const Byte plaintext[],
118#		const KEY_TABLE_TYPE keyTable,
119#		Byte ciphertext[])
120$code=<<___;
121.text
122
123# V1.x API
124.globl	Camellia_EncryptBlock
125.type	Camellia_EncryptBlock,\@abi-omnipotent
126.align	16
127Camellia_EncryptBlock:
128.cfi_startproc
129	movl	\$128,%eax
130	subl	$arg0d,%eax
131	movl	\$3,$arg0d
132	adcl	\$0,$arg0d	# keyBitLength==128?3:4
133	jmp	.Lenc_rounds
134.cfi_endproc
135.size	Camellia_EncryptBlock,.-Camellia_EncryptBlock
136# V2
137.globl	Camellia_EncryptBlock_Rounds
138.type	Camellia_EncryptBlock_Rounds,\@function,4
139.align	16
140.Lenc_rounds:
141Camellia_EncryptBlock_Rounds:
142.cfi_startproc
143	push	%rbx
144.cfi_push	%rbx
145	push	%rbp
146.cfi_push	%rbp
147	push	%r13
148.cfi_push	%r13
149	push	%r14
150.cfi_push	%r14
151	push	%r15
152.cfi_push	%r15
153.Lenc_prologue:
154
155	#mov	%rsi,$inp		# put away arguments
156	mov	%rcx,$out
157	mov	%rdx,$key
158
159	shl	\$6,%edi		# process grandRounds
160	lea	.LCamellia_SBOX(%rip),$Tbl
161	lea	($key,%rdi),$keyend
162
163	mov	0(%rsi),@S[0]		# load plaintext
164	mov	4(%rsi),@S[1]
165	mov	8(%rsi),@S[2]
166	bswap	@S[0]
167	mov	12(%rsi),@S[3]
168	bswap	@S[1]
169	bswap	@S[2]
170	bswap	@S[3]
171
172	call	_x86_64_Camellia_encrypt
173
174	bswap	@S[0]
175	bswap	@S[1]
176	bswap	@S[2]
177	mov	@S[0],0($out)
178	bswap	@S[3]
179	mov	@S[1],4($out)
180	mov	@S[2],8($out)
181	mov	@S[3],12($out)
182
183	mov	0(%rsp),%r15
184.cfi_restore	%r15
185	mov	8(%rsp),%r14
186.cfi_restore	%r14
187	mov	16(%rsp),%r13
188.cfi_restore	%r13
189	mov	24(%rsp),%rbp
190.cfi_restore	%rbp
191	mov	32(%rsp),%rbx
192.cfi_restore	%rbx
193	lea	40(%rsp),%rsp
194.cfi_adjust_cfa_offset	-40
195.Lenc_epilogue:
196	ret
197.cfi_endproc
198.size	Camellia_EncryptBlock_Rounds,.-Camellia_EncryptBlock_Rounds
199
200.type	_x86_64_Camellia_encrypt,\@abi-omnipotent
201.align	16
202_x86_64_Camellia_encrypt:
203.cfi_startproc
204	xor	0($key),@S[1]
205	xor	4($key),@S[0]		# ^=key[0-3]
206	xor	8($key),@S[3]
207	xor	12($key),@S[2]
208.align	16
209.Leloop:
210	mov	16($key),$t1		# prefetch key[4-5]
211	mov	20($key),$t0
212
213___
# Body of .Leloop: six unrolled Feistel rounds.  With a seed of 16 each round
# prefetches the next subkey from an ascending offset 16+8*(i+1) off $key.
214	for ($i=0;$i<6;$i++) { Camellia_Feistel($i,16); }
# Tail of the encryption core followed by the decryption entry points.
#  - After the six rounds $key advances by 64 bytes.  When it reaches $keyend
#    the code at .Ledone xors in the final key words and swaps the state
#    halves; otherwise an and/rol/or mixing step is applied to the state
#    (presumably Camellia's FL/FL^-1 layer -- confirm against the cipher
#    specification) and the loop repeats.
#  - Camellia_DecryptBlock / Camellia_DecryptBlock_Rounds mirror the
#    encryption entry points, except that $keyend is keyTable itself and
#    $key starts at keyTable+grandRounds*64, so the schedule is walked
#    backwards.
#  - _x86_64_Camellia_decrypt xors in four key words and opens .Ldloop.
215$code.=<<___;
216	lea	16*4($key),$key
217	cmp	$keyend,$key
218	mov	8($key),$t3		# prefetch key[2-3]
219	mov	12($key),$t2
220	je	.Ledone
221
222	and	@S[0],$t0
223	or	@S[3],$t3
224	rol	\$1,$t0
225	xor	$t3,@S[2]		# s2^=s3|key[3];
226	xor	$t0,@S[1]		# s1^=LeftRotate(s0&key[0],1);
227	and	@S[2],$t2
228	or	@S[1],$t1
229	rol	\$1,$t2
230	xor	$t1,@S[0]		# s0^=s1|key[1];
231	xor	$t2,@S[3]		# s3^=LeftRotate(s2&key[2],1);
232	jmp	.Leloop
233
234.align	16
235.Ledone:
236	xor	@S[2],$t0		# SwapHalf
237	xor	@S[3],$t1
238	xor	@S[0],$t2
239	xor	@S[1],$t3
240
241	mov	$t0,@S[0]
242	mov	$t1,@S[1]
243	mov	$t2,@S[2]
244	mov	$t3,@S[3]
245
246	.byte	0xf3,0xc3		# rep ret
247.cfi_endproc
248.size	_x86_64_Camellia_encrypt,.-_x86_64_Camellia_encrypt
249
250# V1.x API
251.globl	Camellia_DecryptBlock
252.type	Camellia_DecryptBlock,\@abi-omnipotent
253.align	16
254Camellia_DecryptBlock:
255.cfi_startproc
256	movl	\$128,%eax
257	subl	$arg0d,%eax
258	movl	\$3,$arg0d
259	adcl	\$0,$arg0d	# keyBitLength==128?3:4
260	jmp	.Ldec_rounds
261.cfi_endproc
262.size	Camellia_DecryptBlock,.-Camellia_DecryptBlock
263# V2
264.globl	Camellia_DecryptBlock_Rounds
265.type	Camellia_DecryptBlock_Rounds,\@function,4
266.align	16
267.Ldec_rounds:
268Camellia_DecryptBlock_Rounds:
269.cfi_startproc
270	push	%rbx
271.cfi_push	%rbx
272	push	%rbp
273.cfi_push	%rbp
274	push	%r13
275.cfi_push	%r13
276	push	%r14
277.cfi_push	%r14
278	push	%r15
279.cfi_push	%r15
280.Ldec_prologue:
281
282	#mov	%rsi,$inp		# put away arguments
283	mov	%rcx,$out
284	mov	%rdx,$keyend
285
286	shl	\$6,%edi		# process grandRounds
287	lea	.LCamellia_SBOX(%rip),$Tbl
288	lea	($keyend,%rdi),$key
289
290	mov	0(%rsi),@S[0]		# load plaintext
291	mov	4(%rsi),@S[1]
292	mov	8(%rsi),@S[2]
293	bswap	@S[0]
294	mov	12(%rsi),@S[3]
295	bswap	@S[1]
296	bswap	@S[2]
297	bswap	@S[3]
298
299	call	_x86_64_Camellia_decrypt
300
301	bswap	@S[0]
302	bswap	@S[1]
303	bswap	@S[2]
304	mov	@S[0],0($out)
305	bswap	@S[3]
306	mov	@S[1],4($out)
307	mov	@S[2],8($out)
308	mov	@S[3],12($out)
309
310	mov	0(%rsp),%r15
311.cfi_restore	%r15
312	mov	8(%rsp),%r14
313.cfi_restore	%r14
314	mov	16(%rsp),%r13
315.cfi_restore	%r13
316	mov	24(%rsp),%rbp
317.cfi_restore	%rbp
318	mov	32(%rsp),%rbx
319.cfi_restore	%rbx
320	lea	40(%rsp),%rsp
321.cfi_adjust_cfa_offset	-40
322.Ldec_epilogue:
323	ret
324.cfi_endproc
325.size	Camellia_DecryptBlock_Rounds,.-Camellia_DecryptBlock_Rounds
326
327.type	_x86_64_Camellia_decrypt,\@abi-omnipotent
328.align	16
329_x86_64_Camellia_decrypt:
330.cfi_startproc
331	xor	0($key),@S[1]
332	xor	4($key),@S[0]		# ^=key[0-3]
333	xor	8($key),@S[3]
334	xor	12($key),@S[2]
335.align	16
336.Ldloop:
337	mov	-8($key),$t1		# prefetch key[4-5]
338	mov	-4($key),$t0
339
340___
# Body of .Ldloop: six unrolled Feistel rounds.  The negative seed makes
# Camellia_Feistel prefetch subkeys from descending offsets -8-8*(i+1).
341	for ($i=0;$i<6;$i++) { Camellia_Feistel($i,-8); }
# Tail of the decryption core: $key steps back by 64 bytes after each group
# of six rounds.  When it reaches $keyend the code at .Lddone xors in the
# final key words and swaps the state halves; otherwise the same and/rol/or
# mixing step as in the encryption core is applied before looping.
342$code.=<<___;
343	lea	-16*4($key),$key
344	cmp	$keyend,$key
345	mov	0($key),$t3		# prefetch key[2-3]
346	mov	4($key),$t2
347	je	.Lddone
348
349	and	@S[0],$t0
350	or	@S[3],$t3
351	rol	\$1,$t0
352	xor	$t3,@S[2]		# s2^=s3|key[3];
353	xor	$t0,@S[1]		# s1^=LeftRotate(s0&key[0],1);
354	and	@S[2],$t2
355	or	@S[1],$t1
356	rol	\$1,$t2
357	xor	$t1,@S[0]		# s0^=s1|key[1];
358	xor	$t2,@S[3]		# s3^=LeftRotate(s2&key[2],1);
359
360	jmp	.Ldloop
361
362.align	16
363.Lddone:
364	xor	@S[2],$t2
365	xor	@S[3],$t3
366	xor	@S[0],$t0
367	xor	@S[1],$t1
368
369	mov	$t2,@S[0]		# SwapHalf
370	mov	$t3,@S[1]
371	mov	$t0,@S[2]
372	mov	$t1,@S[3]
373
374	.byte	0xf3,0xc3		# rep ret
375.cfi_endproc
376.size	_x86_64_Camellia_decrypt,.-_x86_64_Camellia_decrypt
377___
378
# _saveround(RND,BASE,[BIAS,]REGS...) appends stores of REGS to the key
# table slot RND, i.e. to byte offset BIAS+RND*8 from BASE.  BIAS is
# recognised as a leading numeric (non-zero) argument and defaults to 0.
#  - exactly four REGS (32-bit words): stored at +0,+4,+8,+12 with the words
#    of each pair exchanged (REGS[1],REGS[0],REGS[3],REGS[2]);
#  - otherwise: REGS[0] goes to +0 and, if present, REGS[1] to +8.
# The offset arithmetic is emitted as a `...` expression for the final pass.
sub _saveround {
my $rnd=shift;
my $key=shift;
my @regs=@_;
my $bias=0;

    $bias=shift(@regs) if (int($regs[0]));

    # [register index, byte offset] for every store, in emission order
    my @stores;
    if (scalar(@regs)==4) {
	@stores=([1,0],[0,4],[3,8],[2,12]);
    } else {
	@stores=([0,0]);
	push(@stores,[1,8]) if (scalar(@regs)>=2);
    }

    foreach my $st (@stores) {
	my ($idx,$off)=@$st;
	$code.="\tmov\t$regs[$idx],`$bias+$rnd*8+$off`($key)\n";
    }
}
395
# _loadround(RND,BASE,[BIAS,]REGS...) appends loads from key table slot RND:
# REGS[0] is read from byte offset BIAS+RND*8 off BASE and, if a second
# register is given, REGS[1] from eight bytes further.  BIAS is recognised
# as a leading numeric (non-zero) argument and defaults to 0.
sub _loadround {
my $rnd=shift;
my $key=shift;
my @regs=@_;
my $bias=0;

    $bias=shift(@regs) if (int($regs[0]));

    my $last=(scalar(@regs)>=2)?1:0;
    foreach my $n (0..$last) {
	my $off=8*$n;
	$code.="\tmov\t`$bias+$rnd*8+$off`($key),$regs[$n]\n";
    }
}
403
# shld-based 128-bit left rotate of the register pair (first argument is
# the upper half), using %r11 as scratch.  Nothing is emitted for a zero
# rotate count.
# shld is very slow on the Intel EM64T family, and even on AMD it limits the
# instruction decode rate [because it's VectorPath] and consequently
# performance...
sub __rotl128 {
my ($hi,$lo,$rot)=@_;

    return unless ($rot);

    $code.="\t$_\n" foreach (
	"mov\t$hi,%r11",
	"shld\t\$$rot,$lo,$hi",
	"shld\t\$$rot,%r11,$lo",
    );
}
418
# ... Implementing the 128-bit rotate without shld gives 80% better
# performance on EM64T, +15% on AMD64 and only ~7% degradation on Core2.
# This is therefore preferred.
#
# _rotl128(HI,LO,ROT) appends a left rotate by ROT bits of the 128-bit value
# held in the 64-bit register pair HI:LO, built from shl/shr/or.  %r9 and
# %r11 are used as scratch.  Nothing is emitted for a zero rotate count.
# The complementary count is emitted as a `64-ROT` expression for the final
# pass.
sub _rotl128 {
my ($hi,$lo,$rot)=@_;

    return unless ($rot);

    my $inv="`64-$rot`";
    $code.="\t$_\n" foreach (
	"mov\t$hi,%r11",
	"shl\t\$$rot,$hi",
	"mov\t$lo,%r9",
	"shr\t\$$inv,%r9",
	"shr\t\$$inv,%r11",
	"or\t%r9,$hi",
	"shl\t\$$rot,$lo",
	"or\t%r11,$lo",
    );
}
438
# Generator for Camellia_Ekeygen(keyBitLength, rawKey, keyTable), the key
# schedule routine.  The generated function returns 3 in %eax on the
# 128-bit path and 4 otherwise.
# $step numbers the Camellia_Feistel calls consecutively, so each round
# prefetches the next 8-byte constant from .LCamellia_SIGMA and the state
# halves keep alternating across the separate calls below.
439{ my $step=0;
440
441$code.=<<___;
442.globl	Camellia_Ekeygen
443.type	Camellia_Ekeygen,\@function,3
444.align	16
445Camellia_Ekeygen:
446.cfi_startproc
447	push	%rbx
448.cfi_push	%rbx
449	push	%rbp
450.cfi_push	%rbp
451	push	%r13
452.cfi_push	%r13
453	push	%r14
454.cfi_push	%r14
455	push	%r15
456.cfi_push	%r15
457.Lkey_prologue:
458
459	mov	%edi,${keyend}d		# put away arguments, keyBitLength
460	mov	%rdx,$out		# keyTable
461
462	mov	0(%rsi),@S[0]		# load 0-127 bits
463	mov	4(%rsi),@S[1]
464	mov	8(%rsi),@S[2]
465	mov	12(%rsi),@S[3]
466
467	bswap	@S[0]
468	bswap	@S[1]
469	bswap	@S[2]
470	bswap	@S[3]
471___
472	&_saveround	(0,$out,@S);	# KL<<<0
473$code.=<<___;
474	cmp	\$128,$keyend		# check keyBitLength
475	je	.L1st128
476
477	mov	16(%rsi),@S[0]		# load 128-191 bits
478	mov	20(%rsi),@S[1]
479	cmp	\$192,$keyend
480	je	.L1st192
481	mov	24(%rsi),@S[2]		# load 192-255 bits
482	mov	28(%rsi),@S[3]
483	jmp	.L1st256
484.L1st192:
485	mov	@S[0],@S[2]
486	mov	@S[1],@S[3]
487	not	@S[2]
488	not	@S[3]
489.L1st256:
490	bswap	@S[0]
491	bswap	@S[1]
492	bswap	@S[2]
493	bswap	@S[3]
494___
495	&_saveround	(4,$out,@S);	# temp storage for KR!
496$code.=<<___;
497	xor	0($out),@S[1]		# KR^KL
498	xor	4($out),@S[0]
499	xor	8($out),@S[3]
500	xor	12($out),@S[2]
501
502.L1st128:
503	lea	.LCamellia_SIGMA(%rip),$key
504	lea	.LCamellia_SBOX(%rip),$Tbl
505
506	mov	0($key),$t1
507	mov	4($key),$t0
508___
	# first two rounds, keyed from .LCamellia_SIGMA instead of a key table
509	&Camellia_Feistel($step++);
510	&Camellia_Feistel($step++);
511$code.=<<___;
512	xor	0($out),@S[1]		# ^KL
513	xor	4($out),@S[0]
514	xor	8($out),@S[3]
515	xor	12($out),@S[2]
516___
	# two more rounds after mixing KL back in
517	&Camellia_Feistel($step++);
518	&Camellia_Feistel($step++);
519$code.=<<___;
520	cmp	\$128,$keyend
521	jne	.L2nd256
522
523	lea	128($out),$out		# size optimization
524	shl	\$32,%r8		# @S[0]||
525	shl	\$32,%r10		# @S[2]||
526	or	%r9,%r8			# ||@S[1]
527	or	%r11,%r10		# ||@S[3]
528___
	# 128-bit path: KL is reloaded as two 64-bit halves into %rax/%rbx and
	# the state just computed (KA) sits packed in %r8/%r10, which leaves
	# %r9 and %r11 free as _rotl128 scratch.  Each pair is rotated in
	# place, so the rotate counts below are increments and the trailing
	# comments track the running total.  $out was advanced by 128 above,
	# hence the -128 bias in every slot address.
529	&_loadround	(0,$out,-128,"%rax","%rbx");	# KL
530	&_saveround	(2,$out,-128,"%r8","%r10");	# KA<<<0
531	&_rotl128	("%rax","%rbx",15);
532	&_saveround	(4,$out,-128,"%rax","%rbx");	# KL<<<15
533	&_rotl128	("%r8","%r10",15);
534	&_saveround	(6,$out,-128,"%r8","%r10");	# KA<<<15
535	&_rotl128	("%r8","%r10",15);		# 15+15=30
536	&_saveround	(8,$out,-128,"%r8","%r10");	# KA<<<30
537	&_rotl128	("%rax","%rbx",30);		# 15+30=45
538	&_saveround	(10,$out,-128,"%rax","%rbx");	# KL<<<45
539	&_rotl128	("%r8","%r10",15);		# 30+15=45
540	&_saveround	(12,$out,-128,"%r8");		# KA<<<45
541	&_rotl128	("%rax","%rbx",15);		# 45+15=60
542	&_saveround	(13,$out,-128,"%rbx");		# KL<<<60
543	&_rotl128	("%r8","%r10",15);		# 45+15=60
544	&_saveround	(14,$out,-128,"%r8","%r10");	# KA<<<60
545	&_rotl128	("%rax","%rbx",17);		# 60+17=77
546	&_saveround	(16,$out,-128,"%rax","%rbx");	# KL<<<77
547	&_rotl128	("%rax","%rbx",17);		# 77+17=94
548	&_saveround	(18,$out,-128,"%rax","%rbx");	# KL<<<94
549	&_rotl128	("%r8","%r10",34);		# 60+34=94
550	&_saveround	(20,$out,-128,"%r8","%r10");	# KA<<<94
551	&_rotl128	("%rax","%rbx",17);		# 94+17=111
552	&_saveround	(22,$out,-128,"%rax","%rbx");	# KL<<<111
553	&_rotl128	("%r8","%r10",17);		# 94+17=111
554	&_saveround	(24,$out,-128,"%r8","%r10");	# KA<<<111
555$code.=<<___;
556	mov	\$3,%eax
557	jmp	.Ldone
558.align	16
559.L2nd256:
560___
	# longer keys: park KA in slots 6-7 and mix in KR, which was stored in
	# slots 4-5 above
561	&_saveround	(6,$out,@S);	# temp storage for KA!
562$code.=<<___;
563	xor	`4*8+0`($out),@S[1]	# KA^KR
564	xor	`4*8+4`($out),@S[0]
565	xor	`5*8+0`($out),@S[3]
566	xor	`5*8+4`($out),@S[2]
567___
	# two further rounds produce KB in @S
568	&Camellia_Feistel($step++);
569	&Camellia_Feistel($step++);
570
	# reload the intermediate values as 64-bit halves; %r14/%r15 ($key and
	# $keyend) are reused for KA from here on
571	&_loadround	(0,$out,"%rax","%rbx");	# KL
572	&_loadround	(4,$out,"%rcx","%rdx");	# KR
573	&_loadround	(6,$out,"%r14","%r15");	# KA
574$code.=<<___;
575	lea	128($out),$out		# size optimization
576	shl	\$32,%r8		# @S[0]||
577	shl	\$32,%r10		# @S[2]||
578	or	%r9,%r8			# ||@S[1]
579	or	%r11,%r10		# ||@S[3]
580___
	# as on the 128-bit path, rotate counts are increments applied in place
	# and the trailing comments track the running total per value
581	&_saveround	(2,$out,-128,"%r8","%r10");	# KB<<<0
582	&_rotl128	("%rcx","%rdx",15);
583	&_saveround	(4,$out,-128,"%rcx","%rdx");	# KR<<<15
584	&_rotl128	("%r14","%r15",15);
585	&_saveround	(6,$out,-128,"%r14","%r15");	# KA<<<15
586	&_rotl128	("%rcx","%rdx",15);		# 15+15=30
587	&_saveround	(8,$out,-128,"%rcx","%rdx");	# KR<<<30
588	&_rotl128	("%r8","%r10",30);
589	&_saveround	(10,$out,-128,"%r8","%r10");	# KB<<<30
590	&_rotl128	("%rax","%rbx",45);
591	&_saveround	(12,$out,-128,"%rax","%rbx");	# KL<<<45
592	&_rotl128	("%r14","%r15",30);		# 15+30=45
593	&_saveround	(14,$out,-128,"%r14","%r15");	# KA<<<45
594	&_rotl128	("%rax","%rbx",15);		# 45+15=60
595	&_saveround	(16,$out,-128,"%rax","%rbx");	# KL<<<60
596	&_rotl128	("%rcx","%rdx",30);		# 30+30=60
597	&_saveround	(18,$out,-128,"%rcx","%rdx");	# KR<<<60
598	&_rotl128	("%r8","%r10",30);		# 30+30=60
599	&_saveround	(20,$out,-128,"%r8","%r10");	# KB<<<60
600	&_rotl128	("%rax","%rbx",17);		# 60+17=77
601	&_saveround	(22,$out,-128,"%rax","%rbx");	# KL<<<77
602	&_rotl128	("%r14","%r15",32);		# 45+32=77
603	&_saveround	(24,$out,-128,"%r14","%r15");	# KA<<<77
604	&_rotl128	("%rcx","%rdx",34);		# 60+34=94
605	&_saveround	(26,$out,-128,"%rcx","%rdx");	# KR<<<94
606	&_rotl128	("%r14","%r15",17);		# 77+17=94
607	&_saveround	(28,$out,-128,"%r14","%r15");	# KA<<<94
608	&_rotl128	("%rax","%rbx",34);		# 77+34=111
609	&_saveround	(30,$out,-128,"%rax","%rbx");	# KL<<<111
610	&_rotl128	("%r8","%r10",51);		# 60+51=111
611	&_saveround	(32,$out,-128,"%r8","%r10");	# KB<<<111
612$code.=<<___;
613	mov	\$4,%eax
614.Ldone:
615	mov	0(%rsp),%r15
616.cfi_restore	%r15
617	mov	8(%rsp),%r14
618.cfi_restore	%r14
619	mov	16(%rsp),%r13
620.cfi_restore	%r13
621	mov	24(%rsp),%rbp
622.cfi_restore	%rbp
623	mov	32(%rsp),%rbx
624.cfi_restore	%rbx
625	lea	40(%rsp),%rsp
626.cfi_adjust_cfa_offset	-40
627.Lkey_epilogue:
628	ret
629.cfi_endproc
630.size	Camellia_Ekeygen,.-Camellia_Ekeygen
631___
632}
633
# 256-entry byte substitution table.  S1110/S4404/S0222/S3033 below derive
# the four 32-bit lookup tables emitted at .LCamellia_SBOX from it.
# NOTE(review): presumably Camellia's SBOX1 as given in the cipher
# specification -- verify the values against the reference before editing.
634@SBOX=(
635112,130, 44,236,179, 39,192,229,228,133, 87, 53,234, 12,174, 65,
636 35,239,107,147, 69, 25,165, 33,237, 14, 79, 78, 29,101,146,189,
637134,184,175,143,124,235, 31,206, 62, 48,220, 95, 94,197, 11, 26,
638166,225, 57,202,213, 71, 93, 61,217,  1, 90,214, 81, 86,108, 77,
639139, 13,154,102,251,204,176, 45,116, 18, 43, 32,240,177,132,153,
640223, 76,203,194, 52,126,118,  5,109,183,169, 49,209, 23,  4,215,
641 20, 88, 58, 97,222, 27, 17, 28, 50, 15,156, 22, 83, 24,242, 34,
642254, 68,207,178,195,181,122,145, 36,  8,232,168, 96,252,105, 80,
643170,208,160,125,161,137, 98,151, 84, 91, 30,149,224,255,100,210,
644 16,196,  0, 72,163,247,117,219,138,  3,230,218,  9, 63,221,148,
645135, 92,131,  2,205, 74,144, 51,115,103,246,243,157,127,191,226,
646 82,155,216, 38,200, 55,198, 59,129,150,111, 75, 19,190, 99, 46,
647233,121,167,140,159,110,188,142, 41,245,249,182, 47,253,180, 89,
648120,152,  6,106,231, 70,113,186,212, 37,171, 66,136,162,141,250,
649114,  7,185, 85,248,238,172, 10, 54, 73, 42,104, 60, 56,241,164,
650 64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158);
651
# S1110(i): table word for index i -- the @SBOX byte replicated into bits
# 31-24, 23-16 and 15-8 (low byte zero), formatted as "0x%08x".
sub S1110 {
    my ($idx)=@_;
    my $v=$SBOX[$idx];
    return sprintf("0x%08x",($v<<24)|($v<<16)|($v<<8));
}
# S4404(i): table word for index i -- @SBOX is looked up at i rotated left
# by one bit (8-bit rotate) and the byte is replicated into bits 31-24,
# 23-16 and 7-0 (bits 15-8 zero), formatted as "0x%08x".
sub S4404 {
    my ($idx)=@_;
    my $rot=(($idx<<1)|($idx>>7))&0xff;
    my $v=$SBOX[$rot];
    return sprintf("0x%08x",($v<<24)|($v<<16)|$v);
}
# S0222(i): table word for index i -- the @SBOX byte rotated left by one bit
# (8-bit rotate), replicated into bits 23-16, 15-8 and 7-0 (top byte zero),
# formatted as "0x%08x".
sub S0222 {
    my ($idx)=@_;
    my $v=$SBOX[$idx];
    $v=(($v<<1)|($v>>7))&0xff;
    return sprintf("0x%08x",($v<<16)|($v<<8)|$v);
}
# S3033(i): table word for index i -- the @SBOX byte rotated right by one
# bit (8-bit rotate), replicated into bits 31-24, 15-8 and 7-0 (bits 23-16
# zero), formatted as "0x%08x".
sub S3033 {
    my ($idx)=@_;
    my $v=$SBOX[$idx];
    $v=(($v>>1)|($v<<7))&0xff;
    return sprintf("0x%08x",($v<<24)|($v<<8)|$v);
}
656
# Constant data: the .LCamellia_SIGMA words read by Camellia_Ekeygen (three
# rows of constants and one row of zeros), immediately followed by the
# .LCamellia_SBOX label; the table contents themselves are appended by the
# data_word loops that follow.
# NOTE(review): each 64-bit constant appears to be stored low word first,
# matching the "mov 0(key),t1 / mov 4(key),t0" load order -- confirm.
657$code.=<<___;
658.align	64
659.LCamellia_SIGMA:
660.long	0x3bcc908b, 0xa09e667f, 0x4caa73b2, 0xb67ae858
661.long	0xe94f82be, 0xc6ef372f, 0xf1d36f1c, 0x54ff53a5
662.long	0xde682d1d, 0x10e527fa, 0xb3e6c1fd, 0xb05688c2
663.long	0,          0,          0,          0
664.LCamellia_SBOX:
665___
666# tables are interleaved, remember?
# data_word(WORDS...): append one ".long" directive listing WORDS, comma
# separated, to $code.
sub data_word {
    my $operands=join(',',@_);
    $code.=".long\t$operands\n";
}
# Emit the lookup tables after .LCamellia_SBOX: 256 (S1110,S4404) pairs
# followed by 256 (S0222,S3033) pairs, one ".long a,b" line per index.
foreach my $pair ([\&S1110,\&S4404],[\&S0222,\&S3033]) {
    my ($first,$second)=@$pair;
    for ($i=0;$i<256;$i++) { data_word($first->($i),$second->($i)); }
}
670
# Generator for Camellia_cbc_encrypt.  The generated function returns at
# once for a zero length, otherwise builds a 64-byte-aligned frame, touches
# the whole lookup table once (.Lcbc_prefetch_sbox) and then runs either
# the encrypt loop (.Lcbc_eloop) or the decrypt loop (.Lcbc_dloop), calling
# the single-block cores emitted earlier.  A trailing partial block is
# handled through the $ivec scratch buffer with "rep movsb".
671# void Camellia_cbc_encrypt (const unsigned char *inp, unsigned char *out,
672#			size_t length, const CAMELLIA_KEY *key,
673#			unsigned char *ivp,const int enc);
674{
# Frame slots, addressed relative to the realigned %rsp.  Inside the
# pushfq/popfq windows %rsp is 8 lower, which is why the tail-copy code
# uses 8+$ivec.
675$_key="0(%rsp)";	# key pointer reloaded after every block ("rewind")
676$_end="8(%rsp)";	# inp+len&~15
677$_res="16(%rsp)";	# len&15
678$ivec="24(%rsp)";	# 16-byte scratch block
679$_ivp="40(%rsp)";	# caller's ivp argument
680$_rsp="48(%rsp)";	# %rsp as it was right after the register pushes
681
682$code.=<<___;
683.globl	Camellia_cbc_encrypt
684.type	Camellia_cbc_encrypt,\@function,6
685.align	16
686Camellia_cbc_encrypt:
687.cfi_startproc
688	cmp	\$0,%rdx
689	je	.Lcbc_abort
690	push	%rbx
691.cfi_push	%rbx
692	push	%rbp
693.cfi_push	%rbp
694	push	%r12
695.cfi_push	%r12
696	push	%r13
697.cfi_push	%r13
698	push	%r14
699.cfi_push	%r14
700	push	%r15
701.cfi_push	%r15
702.Lcbc_prologue:
703
704	mov	%rsp,%rbp
705.cfi_def_cfa_register	%rbp
706	sub	\$64,%rsp
707	and	\$-64,%rsp
708
709	# place stack frame just "above mod 1024" the key schedule,
710	# this ensures that cache associativity suffices
711	lea	-64-63(%rcx),%r10
712	sub	%rsp,%r10
713	neg	%r10
714	and	\$0x3C0,%r10
715	sub	%r10,%rsp
716	#add	\$8,%rsp		# 8 is reserved for callee's ra
717
718	mov	%rdi,$inp		# inp argument
719	mov	%rsi,$out		# out argument
720	mov	%r8,%rbx		# ivp argument
721	mov	%rcx,$key		# key argument
722	mov	272(%rcx),${keyend}d	# grandRounds
723
724	mov	%r8,$_ivp
725	mov	%rbp,$_rsp
726.cfi_cfa_expression	$_rsp,deref,+56
727
728.Lcbc_body:
729	lea	.LCamellia_SBOX(%rip),$Tbl
730
731	mov	\$32,%ecx
732.align	4
733.Lcbc_prefetch_sbox:
734	mov	0($Tbl),%rax
735	mov	32($Tbl),%rsi
736	mov	64($Tbl),%rdi
737	mov	96($Tbl),%r11
738	lea	128($Tbl),$Tbl
739	loop	.Lcbc_prefetch_sbox
740	sub	\$4096,$Tbl
741	shl	\$6,$keyend
742	mov	%rdx,%rcx		# len argument
743	lea	($key,$keyend),$keyend
744
745	cmp	\$0,%r9d		# enc argument
746	je	.LCBC_DECRYPT
747
748	and	\$-16,%rdx
749	and	\$15,%rcx		# length residue
750	lea	($inp,%rdx),%rdx
751	mov	$key,$_key
752	mov	%rdx,$_end
753	mov	%rcx,$_res
754
755	cmp	$inp,%rdx
756	mov	0(%rbx),@S[0]		# load IV
757	mov	4(%rbx),@S[1]
758	mov	8(%rbx),@S[2]
759	mov	12(%rbx),@S[3]
760	je	.Lcbc_enc_tail
761	jmp	.Lcbc_eloop
762
763.align	16
764.Lcbc_eloop:
765	xor	0($inp),@S[0]
766	xor	4($inp),@S[1]
767	xor	8($inp),@S[2]
768	bswap	@S[0]
769	xor	12($inp),@S[3]
770	bswap	@S[1]
771	bswap	@S[2]
772	bswap	@S[3]
773
774	call	_x86_64_Camellia_encrypt
775
776	mov	$_key,$key		# "rewind" the key
777	bswap	@S[0]
778	mov	$_end,%rdx
779	bswap	@S[1]
780	mov	$_res,%rcx
781	bswap	@S[2]
782	mov	@S[0],0($out)
783	bswap	@S[3]
784	mov	@S[1],4($out)
785	mov	@S[2],8($out)
786	lea	16($inp),$inp
787	mov	@S[3],12($out)
788	cmp	%rdx,$inp
789	lea	16($out),$out
790	jne	.Lcbc_eloop
791
792	cmp	\$0,%rcx
793	jne	.Lcbc_enc_tail
794
795	mov	$_ivp,$out
796	mov	@S[0],0($out)		# write out IV residue
797	mov	@S[1],4($out)
798	mov	@S[2],8($out)
799	mov	@S[3],12($out)
800	jmp	.Lcbc_done
801
802.align	16
803.Lcbc_enc_tail:
804	xor	%rax,%rax
805	mov	%rax,0+$ivec
806	mov	%rax,8+$ivec
807	mov	%rax,$_res
808
809.Lcbc_enc_pushf:
810	pushfq
811	cld
812	mov	$inp,%rsi
813	lea	8+$ivec,%rdi
814	.long	0x9066A4F3		# rep movsb
815	popfq
816.Lcbc_enc_popf:
817
818	lea	$ivec,$inp
819	lea	16+$ivec,%rax
820	mov	%rax,$_end
821	jmp	.Lcbc_eloop		# one more time
822
823.align	16
824.LCBC_DECRYPT:
825	xchg	$key,$keyend
826	add	\$15,%rdx
827	and	\$15,%rcx		# length residue
828	and	\$-16,%rdx
829	mov	$key,$_key
830	lea	($inp,%rdx),%rdx
831	mov	%rdx,$_end
832	mov	%rcx,$_res
833
834	mov	(%rbx),%rax		# load IV
835	mov	8(%rbx),%rbx
836	jmp	.Lcbc_dloop
837.align	16
838.Lcbc_dloop:
839	mov	0($inp),@S[0]
840	mov	4($inp),@S[1]
841	mov	8($inp),@S[2]
842	bswap	@S[0]
843	mov	12($inp),@S[3]
844	bswap	@S[1]
845	mov	%rax,0+$ivec		# save IV to temporary storage
846	bswap	@S[2]
847	mov	%rbx,8+$ivec
848	bswap	@S[3]
849
850	call	_x86_64_Camellia_decrypt
851
852	mov	$_key,$key		# "rewind" the key
853	mov	$_end,%rdx
854	mov	$_res,%rcx
855
856	bswap	@S[0]
857	mov	($inp),%rax		# load IV for next iteration
858	bswap	@S[1]
859	mov	8($inp),%rbx
860	bswap	@S[2]
861	xor	0+$ivec,@S[0]
862	bswap	@S[3]
863	xor	4+$ivec,@S[1]
864	xor	8+$ivec,@S[2]
865	lea	16($inp),$inp
866	xor	12+$ivec,@S[3]
867	cmp	%rdx,$inp
868	je	.Lcbc_ddone
869
870	mov	@S[0],0($out)
871	mov	@S[1],4($out)
872	mov	@S[2],8($out)
873	mov	@S[3],12($out)
874
875	lea	16($out),$out
876	jmp	.Lcbc_dloop
877
878.align	16
879.Lcbc_ddone:
880	mov	$_ivp,%rdx
881	cmp	\$0,%rcx
882	jne	.Lcbc_dec_tail
883
884	mov	@S[0],0($out)
885	mov	@S[1],4($out)
886	mov	@S[2],8($out)
887	mov	@S[3],12($out)
888
889	mov	%rax,(%rdx)		# write out IV residue
890	mov	%rbx,8(%rdx)
891	jmp	.Lcbc_done
892.align	16
893.Lcbc_dec_tail:
894	mov	@S[0],0+$ivec
895	mov	@S[1],4+$ivec
896	mov	@S[2],8+$ivec
897	mov	@S[3],12+$ivec
898
899.Lcbc_dec_pushf:
900	pushfq
901	cld
902	lea	8+$ivec,%rsi
903	lea	($out),%rdi
904	.long	0x9066A4F3		# rep movsb
905	popfq
906.Lcbc_dec_popf:
907
908	mov	%rax,(%rdx)		# write out IV residue
909	mov	%rbx,8(%rdx)
910	jmp	.Lcbc_done
911
912.align	16
913.Lcbc_done:
914	mov	$_rsp,%rcx
915.cfi_def_cfa	%rcx,56
916	mov	0(%rcx),%r15
917.cfi_restore	%r15
918	mov	8(%rcx),%r14
919.cfi_restore	%r14
920	mov	16(%rcx),%r13
921.cfi_restore	%r13
922	mov	24(%rcx),%r12
923.cfi_restore	%r12
924	mov	32(%rcx),%rbp
925.cfi_restore	%rbp
926	mov	40(%rcx),%rbx
927.cfi_restore	%rbx
928	lea	48(%rcx),%rsp
929.cfi_def_cfa	%rsp,8
930.Lcbc_abort:
931	ret
932.cfi_endproc
933.size	Camellia_cbc_encrypt,.-Camellia_cbc_encrypt
934
935.asciz	"Camellia for x86_64 by <appro\@openssl.org>"
936___
937}
938
# Win64-only structured exception handling support, emitted when $win64 is
# set.
#  - common_se_handler serves the three functions whose prologue pushes
#    %rbx,%rbp,%r13,%r14,%r15.  HandlerData[] supplies the function's
#    prologue and epilogue labels; when the faulting Rip lies between them
#    the five saved registers are copied from the stack into the CONTEXT.
#  - cbc_se_handler does the same for Camellia_cbc_encrypt, additionally
#    compensating for the 8 bytes pushed inside the pushfq/popfq windows
#    and recovering the original stack pointer from the $_rsp frame slot.
#  - Both handlers finish in .Lcommon_seh_exit, which copies the CONTEXT to
#    disp->ContextRecord, calls RtlVirtualUnwind and returns
#    ExceptionContinueSearch.
#  - The .pdata/.xdata sections bind the handlers to the four functions.
939# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
940#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
941if ($win64) {
942$rec="%rcx";
943$frame="%rdx";
944$context="%r8";
945$disp="%r9";
946
947$code.=<<___;
948.extern	__imp_RtlVirtualUnwind
949.type	common_se_handler,\@abi-omnipotent
950.align	16
951common_se_handler:
952	push	%rsi
953	push	%rdi
954	push	%rbx
955	push	%rbp
956	push	%r12
957	push	%r13
958	push	%r14
959	push	%r15
960	pushfq
961	lea	-64(%rsp),%rsp
962
963	mov	120($context),%rax	# pull context->Rax
964	mov	248($context),%rbx	# pull context->Rip
965
966	mov	8($disp),%rsi		# disp->ImageBase
967	mov	56($disp),%r11		# disp->HandlerData
968
969	mov	0(%r11),%r10d		# HandlerData[0]
970	lea	(%rsi,%r10),%r10	# prologue label
971	cmp	%r10,%rbx		# context->Rip<prologue label
972	jb	.Lin_prologue
973
974	mov	152($context),%rax	# pull context->Rsp
975
976	mov	4(%r11),%r10d		# HandlerData[1]
977	lea	(%rsi,%r10),%r10	# epilogue label
978	cmp	%r10,%rbx		# context->Rip>=epilogue label
979	jae	.Lin_prologue
980
981	lea	40(%rax),%rax
982	mov	-8(%rax),%rbx
983	mov	-16(%rax),%rbp
984	mov	-24(%rax),%r13
985	mov	-32(%rax),%r14
986	mov	-40(%rax),%r15
987	mov	%rbx,144($context)	# restore context->Rbx
988	mov	%rbp,160($context)	# restore context->Rbp
989	mov	%r13,224($context)	# restore context->R13
990	mov	%r14,232($context)	# restore context->R14
991	mov	%r15,240($context)	# restore context->R15
992
993.Lin_prologue:
994	mov	8(%rax),%rdi
995	mov	16(%rax),%rsi
996	mov	%rax,152($context)	# restore context->Rsp
997	mov	%rsi,168($context)	# restore context->Rsi
998	mov	%rdi,176($context)	# restore context->Rdi
999
1000	jmp	.Lcommon_seh_exit
1001.size	common_se_handler,.-common_se_handler
1002
1003.type	cbc_se_handler,\@abi-omnipotent
1004.align	16
1005cbc_se_handler:
1006	push	%rsi
1007	push	%rdi
1008	push	%rbx
1009	push	%rbp
1010	push	%r12
1011	push	%r13
1012	push	%r14
1013	push	%r15
1014	pushfq
1015	lea	-64(%rsp),%rsp
1016
1017	mov	120($context),%rax	# pull context->Rax
1018	mov	248($context),%rbx	# pull context->Rip
1019
1020	lea	.Lcbc_prologue(%rip),%r10
1021	cmp	%r10,%rbx		# context->Rip<.Lcbc_prologue
1022	jb	.Lin_cbc_prologue
1023
1024	lea	.Lcbc_body(%rip),%r10
1025	cmp	%r10,%rbx		# context->Rip<.Lcbc_body
1026	jb	.Lin_cbc_frame_setup
1027
1028	mov	152($context),%rax	# pull context->Rsp
1029
1030	lea	.Lcbc_abort(%rip),%r10
1031	cmp	%r10,%rbx		# context->Rip>=.Lcbc_abort
1032	jae	.Lin_cbc_prologue
1033
1034	# handle pushf/popf in Camellia_cbc_encrypt
1035	lea	.Lcbc_enc_pushf(%rip),%r10
1036	cmp	%r10,%rbx		# context->Rip<=.Lcbc_enc_pushf
1037	jbe	.Lin_cbc_no_flag
1038	lea	8(%rax),%rax
1039	lea	.Lcbc_enc_popf(%rip),%r10
1040	cmp	%r10,%rbx		# context->Rip<.Lcbc_enc_popf
1041	jb	.Lin_cbc_no_flag
1042	lea	-8(%rax),%rax
1043	lea	.Lcbc_dec_pushf(%rip),%r10
1044	cmp	%r10,%rbx		# context->Rip<=.Lcbc_dec_pushf
1045	jbe	.Lin_cbc_no_flag
1046	lea	8(%rax),%rax
1047	lea	.Lcbc_dec_popf(%rip),%r10
1048	cmp	%r10,%rbx		# context->Rip<.Lcbc_dec_popf
1049	jb	.Lin_cbc_no_flag
1050	lea	-8(%rax),%rax
1051
1052.Lin_cbc_no_flag:
1053	mov	48(%rax),%rax		# $_rsp
1054	lea	48(%rax),%rax
1055
1056.Lin_cbc_frame_setup:
1057	mov	-8(%rax),%rbx
1058	mov	-16(%rax),%rbp
1059	mov	-24(%rax),%r12
1060	mov	-32(%rax),%r13
1061	mov	-40(%rax),%r14
1062	mov	-48(%rax),%r15
1063	mov	%rbx,144($context)	# restore context->Rbx
1064	mov	%rbp,160($context)	# restore context->Rbp
1065	mov	%r12,216($context)	# restore context->R12
1066	mov	%r13,224($context)	# restore context->R13
1067	mov	%r14,232($context)	# restore context->R14
1068	mov	%r15,240($context)	# restore context->R15
1069
1070.Lin_cbc_prologue:
1071	mov	8(%rax),%rdi
1072	mov	16(%rax),%rsi
1073	mov	%rax,152($context)	# restore context->Rsp
1074	mov	%rsi,168($context)	# restore context->Rsi
1075	mov	%rdi,176($context)	# restore context->Rdi
1076
1077.align	4
1078.Lcommon_seh_exit:
1079
1080	mov	40($disp),%rdi		# disp->ContextRecord
1081	mov	$context,%rsi		# context
1082	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
1083	.long	0xa548f3fc		# cld; rep movsq
1084
1085	mov	$disp,%rsi
1086	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
1087	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
1088	mov	0(%rsi),%r8		# arg3, disp->ControlPc
1089	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
1090	mov	40(%rsi),%r10		# disp->ContextRecord
1091	lea	56(%rsi),%r11		# &disp->HandlerData
1092	lea	24(%rsi),%r12		# &disp->EstablisherFrame
1093	mov	%r10,32(%rsp)		# arg5
1094	mov	%r11,40(%rsp)		# arg6
1095	mov	%r12,48(%rsp)		# arg7
1096	mov	%rcx,56(%rsp)		# arg8, (NULL)
1097	call	*__imp_RtlVirtualUnwind(%rip)
1098
1099	mov	\$1,%eax		# ExceptionContinueSearch
1100	lea	64(%rsp),%rsp
1101	popfq
1102	pop	%r15
1103	pop	%r14
1104	pop	%r13
1105	pop	%r12
1106	pop	%rbp
1107	pop	%rbx
1108	pop	%rdi
1109	pop	%rsi
1110	ret
1111.size	cbc_se_handler,.-cbc_se_handler
1112
1113.section	.pdata
1114.align	4
1115	.rva	.LSEH_begin_Camellia_EncryptBlock_Rounds
1116	.rva	.LSEH_end_Camellia_EncryptBlock_Rounds
1117	.rva	.LSEH_info_Camellia_EncryptBlock_Rounds
1118
1119	.rva	.LSEH_begin_Camellia_DecryptBlock_Rounds
1120	.rva	.LSEH_end_Camellia_DecryptBlock_Rounds
1121	.rva	.LSEH_info_Camellia_DecryptBlock_Rounds
1122
1123	.rva	.LSEH_begin_Camellia_Ekeygen
1124	.rva	.LSEH_end_Camellia_Ekeygen
1125	.rva	.LSEH_info_Camellia_Ekeygen
1126
1127	.rva	.LSEH_begin_Camellia_cbc_encrypt
1128	.rva	.LSEH_end_Camellia_cbc_encrypt
1129	.rva	.LSEH_info_Camellia_cbc_encrypt
1130
1131.section	.xdata
1132.align	8
1133.LSEH_info_Camellia_EncryptBlock_Rounds:
1134	.byte	9,0,0,0
1135	.rva	common_se_handler
1136	.rva	.Lenc_prologue,.Lenc_epilogue	# HandlerData[]
1137.LSEH_info_Camellia_DecryptBlock_Rounds:
1138	.byte	9,0,0,0
1139	.rva	common_se_handler
1140	.rva	.Ldec_prologue,.Ldec_epilogue	# HandlerData[]
1141.LSEH_info_Camellia_Ekeygen:
1142	.byte	9,0,0,0
1143	.rva	common_se_handler
1144	.rva	.Lkey_prologue,.Lkey_epilogue	# HandlerData[]
1145.LSEH_info_Camellia_cbc_encrypt:
1146	.byte	9,0,0,0
1147	.rva	cbc_se_handler
1148___
1149}
1150
# Final pass: replace every `...` expression left in the generated text by
# the result of evaluating it as Perl (offset arithmetic, &hi()/&lo()
# register renames), then write the text to STDOUT, which was redirected
# into the translator pipe above.  Closing the pipe is checked so that a
# failure of the translator is not lost.
$code =~ s{`([^`]*)`}{eval $1}gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";
1154