1#! /usr/bin/env perl
2# Copyright 2008-2016 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
12#
13# This module may be used under the terms of either the GNU General
14# Public License version 2 or later, the GNU Lesser General Public
15# License version 2.1 or later, the Mozilla Public License version
16# 1.1 or the BSD License. The exact terms of either license are
17# distributed along with this module. For further details see
18# http://www.openssl.org/~appro/camellia/.
19# ====================================================================
20
21# Performance in cycles per processed byte (less is better) in
22# 'openssl speed ...' benchmark:
23#
24#			AMD64	Core2	EM64T
25# -evp camellia-128-ecb	16.7	21.0	22.7
26# + over gcc 3.4.6	+25%	+5%	0%
27#
28# camellia-128-cbc	15.7	20.4	21.1
29#
30# 128-bit key setup	128	216	205	cycles/key
31# + over gcc 3.4.6	+54%	+39%	+15%
32#
33# Numbers in "+" rows represent performance improvement over compiler
34# generated code. Key setup timings are impressive on AMD and Core2
35# thanks to 64-bit operations being covertly deployed. Improvement on
36# EM64T, pre-Core2 Intel x86_64 CPU, is not as impressive, because it
37# apparently emulates some of 64-bit operations in [32-bit] microcode.
38
39$flavour = shift;
40$output  = shift;
41if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
42
43$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
44
45$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
46( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
47( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
48die "can't locate x86_64-xlate.pl";
49
50open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
51*STDOUT=*OUT;
52
53sub hi() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1h/;    $r; }
54sub lo() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/;
55                        $r =~ s/%[er]([sd]i)/%\1l/;
56                        $r =~ s/%(r[0-9]+)[d]?/%\1b/;   $r; }
57
58$t0="%eax";$t1="%ebx";$t2="%ecx";$t3="%edx";
59@S=("%r8d","%r9d","%r10d","%r11d");
60$i0="%esi";
61$i1="%edi";
62$Tbl="%rbp";	# size optimization
63$inp="%r12";
64$out="%r13";
65$key="%r14";
66$keyend="%r15";
67$arg0d=$win64?"%ecx":"%edi";
68
69# const unsigned int Camellia_SBOX[4][256];
70# Well, sort of... Camellia_SBOX[0][] is interleaved with [1][],
71# and [2][] - with [3][]. This is done to minimize code size.
72$SBOX1_1110=0;		# Camellia_SBOX[0]
73$SBOX4_4404=4;		# Camellia_SBOX[1]
74$SBOX2_0222=2048;	# Camellia_SBOX[2]
75$SBOX3_3033=2052;	# Camellia_SBOX[3]
76
77sub Camellia_Feistel {
78my $i=@_[0];
79my $seed=defined(@_[1])?@_[1]:0;
80my $scale=$seed<0?-8:8;
81my $j=($i&1)*2;
82my ($s0,$s1,$s2,$s3)=(@S[($j)%4],@S[($j+1)%4],@S[($j+2)%4],@S[($j+3)%4]);
83
84$code.=<<___;
85	xor	$s0,$t0				# t0^=key[0]
86	xor	$s1,$t1				# t1^=key[1]
87	movz	`&hi("$t0")`,$i0		# (t0>>8)&0xff
88	movz	`&lo("$t1")`,$i1		# (t1>>0)&0xff
89	mov	$SBOX3_3033($Tbl,$i0,8),$t3	# t3=SBOX3_3033[0]
90	mov	$SBOX1_1110($Tbl,$i1,8),$t2	# t2=SBOX1_1110[1]
91	movz	`&lo("$t0")`,$i0		# (t0>>0)&0xff
92	shr	\$16,$t0
93	movz	`&hi("$t1")`,$i1		# (t1>>8)&0xff
94	xor	$SBOX4_4404($Tbl,$i0,8),$t3	# t3^=SBOX4_4404[0]
95	shr	\$16,$t1
96	xor	$SBOX4_4404($Tbl,$i1,8),$t2	# t2^=SBOX4_4404[1]
97	movz	`&hi("$t0")`,$i0		# (t0>>24)&0xff
98	movz	`&lo("$t1")`,$i1		# (t1>>16)&0xff
99	xor	$SBOX1_1110($Tbl,$i0,8),$t3	# t3^=SBOX1_1110[0]
100	xor	$SBOX3_3033($Tbl,$i1,8),$t2	# t2^=SBOX3_3033[1]
101	movz	`&lo("$t0")`,$i0		# (t0>>16)&0xff
102	movz	`&hi("$t1")`,$i1		# (t1>>24)&0xff
103	xor	$SBOX2_0222($Tbl,$i0,8),$t3	# t3^=SBOX2_0222[0]
104	xor	$SBOX2_0222($Tbl,$i1,8),$t2	# t2^=SBOX2_0222[1]
105	mov	`$seed+($i+1)*$scale`($key),$t1	# prefetch key[i+1]
106	mov	`$seed+($i+1)*$scale+4`($key),$t0
107	xor	$t3,$t2				# t2^=t3
108	ror	\$8,$t3				# t3=RightRotate(t3,8)
109	xor	$t2,$s2
110	xor	$t2,$s3
111	xor	$t3,$s3
112___
113}
114
115# void Camellia_EncryptBlock_Rounds(
116#		int grandRounds,
117#		const Byte plaintext[],
118#		const KEY_TABLE_TYPE keyTable,
119#		Byte ciphertext[])
120$code=<<___;
121.text
122
123# V1.x API
124.globl	Camellia_EncryptBlock
125.type	Camellia_EncryptBlock,\@abi-omnipotent
126.align	16
127Camellia_EncryptBlock:
128	movl	\$128,%eax
129	subl	$arg0d,%eax
130	movl	\$3,$arg0d
131	adcl	\$0,$arg0d	# keyBitLength==128?3:4
132	jmp	.Lenc_rounds
133.size	Camellia_EncryptBlock,.-Camellia_EncryptBlock
134# V2
135.globl	Camellia_EncryptBlock_Rounds
136.type	Camellia_EncryptBlock_Rounds,\@function,4
137.align	16
138.Lenc_rounds:
139Camellia_EncryptBlock_Rounds:
140.cfi_startproc
141	push	%rbx
142.cfi_push	%rbx
143	push	%rbp
144.cfi_push	%rbp
145	push	%r13
146.cfi_push	%r13
147	push	%r14
148.cfi_push	%r14
149	push	%r15
150.cfi_push	%r15
151.Lenc_prologue:
152
153	#mov	%rsi,$inp		# put away arguments
154	mov	%rcx,$out
155	mov	%rdx,$key
156
157	shl	\$6,%edi		# process grandRounds
158	lea	.LCamellia_SBOX(%rip),$Tbl
159	lea	($key,%rdi),$keyend
160
161	mov	0(%rsi),@S[0]		# load plaintext
162	mov	4(%rsi),@S[1]
163	mov	8(%rsi),@S[2]
164	bswap	@S[0]
165	mov	12(%rsi),@S[3]
166	bswap	@S[1]
167	bswap	@S[2]
168	bswap	@S[3]
169
170	call	_x86_64_Camellia_encrypt
171
172	bswap	@S[0]
173	bswap	@S[1]
174	bswap	@S[2]
175	mov	@S[0],0($out)
176	bswap	@S[3]
177	mov	@S[1],4($out)
178	mov	@S[2],8($out)
179	mov	@S[3],12($out)
180
181	mov	0(%rsp),%r15
182.cfi_restore	%r15
183	mov	8(%rsp),%r14
184.cfi_restore	%r14
185	mov	16(%rsp),%r13
186.cfi_restore	%r13
187	mov	24(%rsp),%rbp
188.cfi_restore	%rbp
189	mov	32(%rsp),%rbx
190.cfi_restore	%rbx
191	lea	40(%rsp),%rsp
192.cfi_adjust_cfa_offset	-40
193.Lenc_epilogue:
194	ret
195.cfi_endproc
196.size	Camellia_EncryptBlock_Rounds,.-Camellia_EncryptBlock_Rounds
197
198.type	_x86_64_Camellia_encrypt,\@abi-omnipotent
199.align	16
200_x86_64_Camellia_encrypt:
201	xor	0($key),@S[1]
202	xor	4($key),@S[0]		# ^=key[0-3]
203	xor	8($key),@S[3]
204	xor	12($key),@S[2]
205.align	16
206.Leloop:
207	mov	16($key),$t1		# prefetch key[4-5]
208	mov	20($key),$t0
209
210___
211	for ($i=0;$i<6;$i++) { Camellia_Feistel($i,16); }
212$code.=<<___;
213	lea	16*4($key),$key
214	cmp	$keyend,$key
215	mov	8($key),$t3		# prefetch key[2-3]
216	mov	12($key),$t2
217	je	.Ledone
218
219	and	@S[0],$t0
220	or	@S[3],$t3
221	rol	\$1,$t0
222	xor	$t3,@S[2]		# s2^=s3|key[3];
223	xor	$t0,@S[1]		# s1^=LeftRotate(s0&key[0],1);
224	and	@S[2],$t2
225	or	@S[1],$t1
226	rol	\$1,$t2
227	xor	$t1,@S[0]		# s0^=s1|key[1];
228	xor	$t2,@S[3]		# s3^=LeftRotate(s2&key[2],1);
229	jmp	.Leloop
230
231.align	16
232.Ledone:
233	xor	@S[2],$t0		# SwapHalf
234	xor	@S[3],$t1
235	xor	@S[0],$t2
236	xor	@S[1],$t3
237
238	mov	$t0,@S[0]
239	mov	$t1,@S[1]
240	mov	$t2,@S[2]
241	mov	$t3,@S[3]
242
243	.byte	0xf3,0xc3		# rep ret
244.size	_x86_64_Camellia_encrypt,.-_x86_64_Camellia_encrypt
245
246# V1.x API
247.globl	Camellia_DecryptBlock
248.type	Camellia_DecryptBlock,\@abi-omnipotent
249.align	16
250Camellia_DecryptBlock:
251	movl	\$128,%eax
252	subl	$arg0d,%eax
253	movl	\$3,$arg0d
254	adcl	\$0,$arg0d	# keyBitLength==128?3:4
255	jmp	.Ldec_rounds
256.size	Camellia_DecryptBlock,.-Camellia_DecryptBlock
257# V2
258.globl	Camellia_DecryptBlock_Rounds
259.type	Camellia_DecryptBlock_Rounds,\@function,4
260.align	16
261.Ldec_rounds:
262Camellia_DecryptBlock_Rounds:
263.cfi_startproc
264	push	%rbx
265.cfi_push	%rbx
266	push	%rbp
267.cfi_push	%rbp
268	push	%r13
269.cfi_push	%r13
270	push	%r14
271.cfi_push	%r14
272	push	%r15
273.cfi_push	%r15
274.Ldec_prologue:
275
276	#mov	%rsi,$inp		# put away arguments
277	mov	%rcx,$out
278	mov	%rdx,$keyend
279
280	shl	\$6,%edi		# process grandRounds
281	lea	.LCamellia_SBOX(%rip),$Tbl
282	lea	($keyend,%rdi),$key
283
284	mov	0(%rsi),@S[0]		# load plaintext
285	mov	4(%rsi),@S[1]
286	mov	8(%rsi),@S[2]
287	bswap	@S[0]
288	mov	12(%rsi),@S[3]
289	bswap	@S[1]
290	bswap	@S[2]
291	bswap	@S[3]
292
293	call	_x86_64_Camellia_decrypt
294
295	bswap	@S[0]
296	bswap	@S[1]
297	bswap	@S[2]
298	mov	@S[0],0($out)
299	bswap	@S[3]
300	mov	@S[1],4($out)
301	mov	@S[2],8($out)
302	mov	@S[3],12($out)
303
304	mov	0(%rsp),%r15
305.cfi_restore	%r15
306	mov	8(%rsp),%r14
307.cfi_restore	%r14
308	mov	16(%rsp),%r13
309.cfi_restore	%r13
310	mov	24(%rsp),%rbp
311.cfi_restore	%rbp
312	mov	32(%rsp),%rbx
313.cfi_restore	%rbx
314	lea	40(%rsp),%rsp
315.cfi_adjust_cfa_offset	-40
316.Ldec_epilogue:
317	ret
318.cfi_endproc
319.size	Camellia_DecryptBlock_Rounds,.-Camellia_DecryptBlock_Rounds
320
321.type	_x86_64_Camellia_decrypt,\@abi-omnipotent
322.align	16
323_x86_64_Camellia_decrypt:
324	xor	0($key),@S[1]
325	xor	4($key),@S[0]		# ^=key[0-3]
326	xor	8($key),@S[3]
327	xor	12($key),@S[2]
328.align	16
329.Ldloop:
330	mov	-8($key),$t1		# prefetch key[4-5]
331	mov	-4($key),$t0
332
333___
334	for ($i=0;$i<6;$i++) { Camellia_Feistel($i,-8); }
335$code.=<<___;
336	lea	-16*4($key),$key
337	cmp	$keyend,$key
338	mov	0($key),$t3		# prefetch key[2-3]
339	mov	4($key),$t2
340	je	.Lddone
341
342	and	@S[0],$t0
343	or	@S[3],$t3
344	rol	\$1,$t0
345	xor	$t3,@S[2]		# s2^=s3|key[3];
346	xor	$t0,@S[1]		# s1^=LeftRotate(s0&key[0],1);
347	and	@S[2],$t2
348	or	@S[1],$t1
349	rol	\$1,$t2
350	xor	$t1,@S[0]		# s0^=s1|key[1];
351	xor	$t2,@S[3]		# s3^=LeftRotate(s2&key[2],1);
352
353	jmp	.Ldloop
354
355.align	16
356.Lddone:
357	xor	@S[2],$t2
358	xor	@S[3],$t3
359	xor	@S[0],$t0
360	xor	@S[1],$t1
361
362	mov	$t2,@S[0]		# SwapHalf
363	mov	$t3,@S[1]
364	mov	$t0,@S[2]
365	mov	$t1,@S[3]
366
367	.byte	0xf3,0xc3		# rep ret
368.size	_x86_64_Camellia_decrypt,.-_x86_64_Camellia_decrypt
369___
370
371sub _saveround {
372my ($rnd,$key,@T)=@_;
373my $bias=int(@T[0])?shift(@T):0;
374
375    if ($#T==3) {
376	$code.=<<___;
377	mov	@T[1],`$bias+$rnd*8+0`($key)
378	mov	@T[0],`$bias+$rnd*8+4`($key)
379	mov	@T[3],`$bias+$rnd*8+8`($key)
380	mov	@T[2],`$bias+$rnd*8+12`($key)
381___
382    } else {
383	$code.="	mov	@T[0],`$bias+$rnd*8+0`($key)\n";
384	$code.="	mov	@T[1],`$bias+$rnd*8+8`($key)\n"	if ($#T>=1);
385    }
386}
387
388sub _loadround {
389my ($rnd,$key,@T)=@_;
390my $bias=int(@T[0])?shift(@T):0;
391
392$code.="	mov	`$bias+$rnd*8+0`($key),@T[0]\n";
393$code.="	mov	`$bias+$rnd*8+8`($key),@T[1]\n"	if ($#T>=1);
394}
395
396# shld is very slow on Intel EM64T family. Even on AMD it limits
397# instruction decode rate [because it's VectorPath] and consequently
398# performance...
399sub __rotl128 {
400my ($i0,$i1,$rot)=@_;
401
402    if ($rot) {
403	$code.=<<___;
404	mov	$i0,%r11
405	shld	\$$rot,$i1,$i0
406	shld	\$$rot,%r11,$i1
407___
408    }
409}
410
411# ... Implementing 128-bit rotate without shld gives 80% better
412# performance EM64T, +15% on AMD64 and only ~7% degradation on
413# Core2. This is therefore preferred.
414sub _rotl128 {
415my ($i0,$i1,$rot)=@_;
416
417    if ($rot) {
418	$code.=<<___;
419	mov	$i0,%r11
420	shl	\$$rot,$i0
421	mov	$i1,%r9
422	shr	\$`64-$rot`,%r9
423	shr	\$`64-$rot`,%r11
424	or	%r9,$i0
425	shl	\$$rot,$i1
426	or	%r11,$i1
427___
428    }
429}
430
431{ my $step=0;
432
433$code.=<<___;
434.globl	Camellia_Ekeygen
435.type	Camellia_Ekeygen,\@function,3
436.align	16
437Camellia_Ekeygen:
438.cfi_startproc
439	push	%rbx
440.cfi_push	%rbx
441	push	%rbp
442.cfi_push	%rbp
443	push	%r13
444.cfi_push	%r13
445	push	%r14
446.cfi_push	%r14
447	push	%r15
448.cfi_push	%r15
449.Lkey_prologue:
450
451	mov	%edi,${keyend}d		# put away arguments, keyBitLength
452	mov	%rdx,$out		# keyTable
453
454	mov	0(%rsi),@S[0]		# load 0-127 bits
455	mov	4(%rsi),@S[1]
456	mov	8(%rsi),@S[2]
457	mov	12(%rsi),@S[3]
458
459	bswap	@S[0]
460	bswap	@S[1]
461	bswap	@S[2]
462	bswap	@S[3]
463___
464	&_saveround	(0,$out,@S);	# KL<<<0
465$code.=<<___;
466	cmp	\$128,$keyend		# check keyBitLength
467	je	.L1st128
468
469	mov	16(%rsi),@S[0]		# load 128-191 bits
470	mov	20(%rsi),@S[1]
471	cmp	\$192,$keyend
472	je	.L1st192
473	mov	24(%rsi),@S[2]		# load 192-255 bits
474	mov	28(%rsi),@S[3]
475	jmp	.L1st256
476.L1st192:
477	mov	@S[0],@S[2]
478	mov	@S[1],@S[3]
479	not	@S[2]
480	not	@S[3]
481.L1st256:
482	bswap	@S[0]
483	bswap	@S[1]
484	bswap	@S[2]
485	bswap	@S[3]
486___
487	&_saveround	(4,$out,@S);	# temp storage for KR!
488$code.=<<___;
489	xor	0($out),@S[1]		# KR^KL
490	xor	4($out),@S[0]
491	xor	8($out),@S[3]
492	xor	12($out),@S[2]
493
494.L1st128:
495	lea	.LCamellia_SIGMA(%rip),$key
496	lea	.LCamellia_SBOX(%rip),$Tbl
497
498	mov	0($key),$t1
499	mov	4($key),$t0
500___
501	&Camellia_Feistel($step++);
502	&Camellia_Feistel($step++);
503$code.=<<___;
504	xor	0($out),@S[1]		# ^KL
505	xor	4($out),@S[0]
506	xor	8($out),@S[3]
507	xor	12($out),@S[2]
508___
509	&Camellia_Feistel($step++);
510	&Camellia_Feistel($step++);
511$code.=<<___;
512	cmp	\$128,$keyend
513	jne	.L2nd256
514
515	lea	128($out),$out		# size optimization
516	shl	\$32,%r8		# @S[0]||
517	shl	\$32,%r10		# @S[2]||
518	or	%r9,%r8			# ||@S[1]
519	or	%r11,%r10		# ||@S[3]
520___
521	&_loadround	(0,$out,-128,"%rax","%rbx");	# KL
522	&_saveround	(2,$out,-128,"%r8","%r10");	# KA<<<0
523	&_rotl128	("%rax","%rbx",15);
524	&_saveround	(4,$out,-128,"%rax","%rbx");	# KL<<<15
525	&_rotl128	("%r8","%r10",15);
526	&_saveround	(6,$out,-128,"%r8","%r10");	# KA<<<15
527	&_rotl128	("%r8","%r10",15);		# 15+15=30
528	&_saveround	(8,$out,-128,"%r8","%r10");	# KA<<<30
529	&_rotl128	("%rax","%rbx",30);		# 15+30=45
530	&_saveround	(10,$out,-128,"%rax","%rbx");	# KL<<<45
531	&_rotl128	("%r8","%r10",15);		# 30+15=45
532	&_saveround	(12,$out,-128,"%r8");		# KA<<<45
533	&_rotl128	("%rax","%rbx",15);		# 45+15=60
534	&_saveround	(13,$out,-128,"%rbx");		# KL<<<60
535	&_rotl128	("%r8","%r10",15);		# 45+15=60
536	&_saveround	(14,$out,-128,"%r8","%r10");	# KA<<<60
537	&_rotl128	("%rax","%rbx",17);		# 60+17=77
538	&_saveround	(16,$out,-128,"%rax","%rbx");	# KL<<<77
539	&_rotl128	("%rax","%rbx",17);		# 77+17=94
540	&_saveround	(18,$out,-128,"%rax","%rbx");	# KL<<<94
541	&_rotl128	("%r8","%r10",34);		# 60+34=94
542	&_saveround	(20,$out,-128,"%r8","%r10");	# KA<<<94
543	&_rotl128	("%rax","%rbx",17);		# 94+17=111
544	&_saveround	(22,$out,-128,"%rax","%rbx");	# KL<<<111
545	&_rotl128	("%r8","%r10",17);		# 94+17=111
546	&_saveround	(24,$out,-128,"%r8","%r10");	# KA<<<111
547$code.=<<___;
548	mov	\$3,%eax
549	jmp	.Ldone
550.align	16
551.L2nd256:
552___
553	&_saveround	(6,$out,@S);	# temp storage for KA!
554$code.=<<___;
555	xor	`4*8+0`($out),@S[1]	# KA^KR
556	xor	`4*8+4`($out),@S[0]
557	xor	`5*8+0`($out),@S[3]
558	xor	`5*8+4`($out),@S[2]
559___
560	&Camellia_Feistel($step++);
561	&Camellia_Feistel($step++);
562
563	&_loadround	(0,$out,"%rax","%rbx");	# KL
564	&_loadround	(4,$out,"%rcx","%rdx");	# KR
565	&_loadround	(6,$out,"%r14","%r15");	# KA
566$code.=<<___;
567	lea	128($out),$out		# size optimization
568	shl	\$32,%r8		# @S[0]||
569	shl	\$32,%r10		# @S[2]||
570	or	%r9,%r8			# ||@S[1]
571	or	%r11,%r10		# ||@S[3]
572___
573	&_saveround	(2,$out,-128,"%r8","%r10");	# KB<<<0
574	&_rotl128	("%rcx","%rdx",15);
575	&_saveround	(4,$out,-128,"%rcx","%rdx");	# KR<<<15
576	&_rotl128	("%r14","%r15",15);
577	&_saveround	(6,$out,-128,"%r14","%r15");	# KA<<<15
578	&_rotl128	("%rcx","%rdx",15);		# 15+15=30
579	&_saveround	(8,$out,-128,"%rcx","%rdx");	# KR<<<30
580	&_rotl128	("%r8","%r10",30);
581	&_saveround	(10,$out,-128,"%r8","%r10");	# KB<<<30
582	&_rotl128	("%rax","%rbx",45);
583	&_saveround	(12,$out,-128,"%rax","%rbx");	# KL<<<45
584	&_rotl128	("%r14","%r15",30);		# 15+30=45
585	&_saveround	(14,$out,-128,"%r14","%r15");	# KA<<<45
586	&_rotl128	("%rax","%rbx",15);		# 45+15=60
587	&_saveround	(16,$out,-128,"%rax","%rbx");	# KL<<<60
588	&_rotl128	("%rcx","%rdx",30);		# 30+30=60
589	&_saveround	(18,$out,-128,"%rcx","%rdx");	# KR<<<60
590	&_rotl128	("%r8","%r10",30);		# 30+30=60
591	&_saveround	(20,$out,-128,"%r8","%r10");	# KB<<<60
592	&_rotl128	("%rax","%rbx",17);		# 60+17=77
593	&_saveround	(22,$out,-128,"%rax","%rbx");	# KL<<<77
594	&_rotl128	("%r14","%r15",32);		# 45+32=77
595	&_saveround	(24,$out,-128,"%r14","%r15");	# KA<<<77
596	&_rotl128	("%rcx","%rdx",34);		# 60+34=94
597	&_saveround	(26,$out,-128,"%rcx","%rdx");	# KR<<<94
598	&_rotl128	("%r14","%r15",17);		# 77+17=94
599	&_saveround	(28,$out,-128,"%r14","%r15");	# KA<<<77
600	&_rotl128	("%rax","%rbx",34);		# 77+34=111
601	&_saveround	(30,$out,-128,"%rax","%rbx");	# KL<<<111
602	&_rotl128	("%r8","%r10",51);		# 60+51=111
603	&_saveround	(32,$out,-128,"%r8","%r10");	# KB<<<111
604$code.=<<___;
605	mov	\$4,%eax
606.Ldone:
607	mov	0(%rsp),%r15
608.cfi_restore	%r15
609	mov	8(%rsp),%r14
610.cfi_restore	%r14
611	mov	16(%rsp),%r13
612.cfi_restore	%r13
613	mov	24(%rsp),%rbp
614.cfi_restore	%rbp
615	mov	32(%rsp),%rbx
616.cfi_restore	%rbx
617	lea	40(%rsp),%rsp
618.cfi_adjust_cfa_offset	-40
619.Lkey_epilogue:
620	ret
621.cfi_endproc
622.size	Camellia_Ekeygen,.-Camellia_Ekeygen
623___
624}
625
626@SBOX=(
627112,130, 44,236,179, 39,192,229,228,133, 87, 53,234, 12,174, 65,
628 35,239,107,147, 69, 25,165, 33,237, 14, 79, 78, 29,101,146,189,
629134,184,175,143,124,235, 31,206, 62, 48,220, 95, 94,197, 11, 26,
630166,225, 57,202,213, 71, 93, 61,217,  1, 90,214, 81, 86,108, 77,
631139, 13,154,102,251,204,176, 45,116, 18, 43, 32,240,177,132,153,
632223, 76,203,194, 52,126,118,  5,109,183,169, 49,209, 23,  4,215,
633 20, 88, 58, 97,222, 27, 17, 28, 50, 15,156, 22, 83, 24,242, 34,
634254, 68,207,178,195,181,122,145, 36,  8,232,168, 96,252,105, 80,
635170,208,160,125,161,137, 98,151, 84, 91, 30,149,224,255,100,210,
636 16,196,  0, 72,163,247,117,219,138,  3,230,218,  9, 63,221,148,
637135, 92,131,  2,205, 74,144, 51,115,103,246,243,157,127,191,226,
638 82,155,216, 38,200, 55,198, 59,129,150,111, 75, 19,190, 99, 46,
639233,121,167,140,159,110,188,142, 41,245,249,182, 47,253,180, 89,
640120,152,  6,106,231, 70,113,186,212, 37,171, 66,136,162,141,250,
641114,  7,185, 85,248,238,172, 10, 54, 73, 42,104, 60, 56,241,164,
642 64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158);
643
644sub S1110 { my $i=shift; $i=@SBOX[$i]; $i=$i<<24|$i<<16|$i<<8; sprintf("0x%08x",$i); }
645sub S4404 { my $i=shift; $i=($i<<1|$i>>7)&0xff; $i=@SBOX[$i]; $i=$i<<24|$i<<16|$i; sprintf("0x%08x",$i); }
646sub S0222 { my $i=shift; $i=@SBOX[$i]; $i=($i<<1|$i>>7)&0xff; $i=$i<<16|$i<<8|$i; sprintf("0x%08x",$i); }
647sub S3033 { my $i=shift; $i=@SBOX[$i]; $i=($i>>1|$i<<7)&0xff; $i=$i<<24|$i<<8|$i; sprintf("0x%08x",$i); }
648
649$code.=<<___;
650.align	64
651.LCamellia_SIGMA:
652.long	0x3bcc908b, 0xa09e667f, 0x4caa73b2, 0xb67ae858
653.long	0xe94f82be, 0xc6ef372f, 0xf1d36f1c, 0x54ff53a5
654.long	0xde682d1d, 0x10e527fa, 0xb3e6c1fd, 0xb05688c2
655.long	0,          0,          0,          0
656.LCamellia_SBOX:
657___
658# tables are interleaved, remember?
659sub data_word { $code.=".long\t".join(',',@_)."\n"; }
660for ($i=0;$i<256;$i++) { &data_word(&S1110($i),&S4404($i)); }
661for ($i=0;$i<256;$i++) { &data_word(&S0222($i),&S3033($i)); }
662
663# void Camellia_cbc_encrypt (const void char *inp, unsigned char *out,
664#			size_t length, const CAMELLIA_KEY *key,
665#			unsigned char *ivp,const int enc);
666{
667$_key="0(%rsp)";
668$_end="8(%rsp)";	# inp+len&~15
669$_res="16(%rsp)";	# len&15
670$ivec="24(%rsp)";
671$_ivp="40(%rsp)";
672$_rsp="48(%rsp)";
673
674$code.=<<___;
675.globl	Camellia_cbc_encrypt
676.type	Camellia_cbc_encrypt,\@function,6
677.align	16
678Camellia_cbc_encrypt:
679.cfi_startproc
680	cmp	\$0,%rdx
681	je	.Lcbc_abort
682	push	%rbx
683.cfi_push	%rbx
684	push	%rbp
685.cfi_push	%rbp
686	push	%r12
687.cfi_push	%r12
688	push	%r13
689.cfi_push	%r13
690	push	%r14
691.cfi_push	%r14
692	push	%r15
693.cfi_push	%r15
694.Lcbc_prologue:
695
696	mov	%rsp,%rbp
697.cfi_def_cfa_register	%rbp
698	sub	\$64,%rsp
699	and	\$-64,%rsp
700
701	# place stack frame just "above mod 1024" the key schedule,
702	# this ensures that cache associativity suffices
703	lea	-64-63(%rcx),%r10
704	sub	%rsp,%r10
705	neg	%r10
706	and	\$0x3C0,%r10
707	sub	%r10,%rsp
708	#add	\$8,%rsp		# 8 is reserved for callee's ra
709
710	mov	%rdi,$inp		# inp argument
711	mov	%rsi,$out		# out argument
712	mov	%r8,%rbx		# ivp argument
713	mov	%rcx,$key		# key argument
714	mov	272(%rcx),${keyend}d	# grandRounds
715
716	mov	%r8,$_ivp
717	mov	%rbp,$_rsp
718.cfi_cfa_expression	$_rsp,deref,+56
719
720.Lcbc_body:
721	lea	.LCamellia_SBOX(%rip),$Tbl
722
723	mov	\$32,%ecx
724.align	4
725.Lcbc_prefetch_sbox:
726	mov	0($Tbl),%rax
727	mov	32($Tbl),%rsi
728	mov	64($Tbl),%rdi
729	mov	96($Tbl),%r11
730	lea	128($Tbl),$Tbl
731	loop	.Lcbc_prefetch_sbox
732	sub	\$4096,$Tbl
733	shl	\$6,$keyend
734	mov	%rdx,%rcx		# len argument
735	lea	($key,$keyend),$keyend
736
737	cmp	\$0,%r9d		# enc argument
738	je	.LCBC_DECRYPT
739
740	and	\$-16,%rdx
741	and	\$15,%rcx		# length residue
742	lea	($inp,%rdx),%rdx
743	mov	$key,$_key
744	mov	%rdx,$_end
745	mov	%rcx,$_res
746
747	cmp	$inp,%rdx
748	mov	0(%rbx),@S[0]		# load IV
749	mov	4(%rbx),@S[1]
750	mov	8(%rbx),@S[2]
751	mov	12(%rbx),@S[3]
752	je	.Lcbc_enc_tail
753	jmp	.Lcbc_eloop
754
755.align	16
756.Lcbc_eloop:
757	xor	0($inp),@S[0]
758	xor	4($inp),@S[1]
759	xor	8($inp),@S[2]
760	bswap	@S[0]
761	xor	12($inp),@S[3]
762	bswap	@S[1]
763	bswap	@S[2]
764	bswap	@S[3]
765
766	call	_x86_64_Camellia_encrypt
767
768	mov	$_key,$key		# "rewind" the key
769	bswap	@S[0]
770	mov	$_end,%rdx
771	bswap	@S[1]
772	mov	$_res,%rcx
773	bswap	@S[2]
774	mov	@S[0],0($out)
775	bswap	@S[3]
776	mov	@S[1],4($out)
777	mov	@S[2],8($out)
778	lea	16($inp),$inp
779	mov	@S[3],12($out)
780	cmp	%rdx,$inp
781	lea	16($out),$out
782	jne	.Lcbc_eloop
783
784	cmp	\$0,%rcx
785	jne	.Lcbc_enc_tail
786
787	mov	$_ivp,$out
788	mov	@S[0],0($out)		# write out IV residue
789	mov	@S[1],4($out)
790	mov	@S[2],8($out)
791	mov	@S[3],12($out)
792	jmp	.Lcbc_done
793
794.align	16
795.Lcbc_enc_tail:
796	xor	%rax,%rax
797	mov	%rax,0+$ivec
798	mov	%rax,8+$ivec
799	mov	%rax,$_res
800
801.Lcbc_enc_pushf:
802	pushfq
803	cld
804	mov	$inp,%rsi
805	lea	8+$ivec,%rdi
806	.long	0x9066A4F3		# rep movsb
807	popfq
808.Lcbc_enc_popf:
809
810	lea	$ivec,$inp
811	lea	16+$ivec,%rax
812	mov	%rax,$_end
813	jmp	.Lcbc_eloop		# one more time
814
815.align	16
816.LCBC_DECRYPT:
817	xchg	$key,$keyend
818	add	\$15,%rdx
819	and	\$15,%rcx		# length residue
820	and	\$-16,%rdx
821	mov	$key,$_key
822	lea	($inp,%rdx),%rdx
823	mov	%rdx,$_end
824	mov	%rcx,$_res
825
826	mov	(%rbx),%rax		# load IV
827	mov	8(%rbx),%rbx
828	jmp	.Lcbc_dloop
829.align	16
830.Lcbc_dloop:
831	mov	0($inp),@S[0]
832	mov	4($inp),@S[1]
833	mov	8($inp),@S[2]
834	bswap	@S[0]
835	mov	12($inp),@S[3]
836	bswap	@S[1]
837	mov	%rax,0+$ivec		# save IV to temporary storage
838	bswap	@S[2]
839	mov	%rbx,8+$ivec
840	bswap	@S[3]
841
842	call	_x86_64_Camellia_decrypt
843
844	mov	$_key,$key		# "rewind" the key
845	mov	$_end,%rdx
846	mov	$_res,%rcx
847
848	bswap	@S[0]
849	mov	($inp),%rax		# load IV for next iteration
850	bswap	@S[1]
851	mov	8($inp),%rbx
852	bswap	@S[2]
853	xor	0+$ivec,@S[0]
854	bswap	@S[3]
855	xor	4+$ivec,@S[1]
856	xor	8+$ivec,@S[2]
857	lea	16($inp),$inp
858	xor	12+$ivec,@S[3]
859	cmp	%rdx,$inp
860	je	.Lcbc_ddone
861
862	mov	@S[0],0($out)
863	mov	@S[1],4($out)
864	mov	@S[2],8($out)
865	mov	@S[3],12($out)
866
867	lea	16($out),$out
868	jmp	.Lcbc_dloop
869
870.align	16
871.Lcbc_ddone:
872	mov	$_ivp,%rdx
873	cmp	\$0,%rcx
874	jne	.Lcbc_dec_tail
875
876	mov	@S[0],0($out)
877	mov	@S[1],4($out)
878	mov	@S[2],8($out)
879	mov	@S[3],12($out)
880
881	mov	%rax,(%rdx)		# write out IV residue
882	mov	%rbx,8(%rdx)
883	jmp	.Lcbc_done
884.align	16
885.Lcbc_dec_tail:
886	mov	@S[0],0+$ivec
887	mov	@S[1],4+$ivec
888	mov	@S[2],8+$ivec
889	mov	@S[3],12+$ivec
890
891.Lcbc_dec_pushf:
892	pushfq
893	cld
894	lea	8+$ivec,%rsi
895	lea	($out),%rdi
896	.long	0x9066A4F3		# rep movsb
897	popfq
898.Lcbc_dec_popf:
899
900	mov	%rax,(%rdx)		# write out IV residue
901	mov	%rbx,8(%rdx)
902	jmp	.Lcbc_done
903
904.align	16
905.Lcbc_done:
906	mov	$_rsp,%rcx
907.cfi_def_cfa	%rcx,56
908	mov	0(%rcx),%r15
909.cfi_restore	%r15
910	mov	8(%rcx),%r14
911.cfi_restore	%r14
912	mov	16(%rcx),%r13
913.cfi_restore	%r13
914	mov	24(%rcx),%r12
915.cfi_restore	%r12
916	mov	32(%rcx),%rbp
917.cfi_restore	%rbp
918	mov	40(%rcx),%rbx
919.cfi_restore	%rbx
920	lea	48(%rcx),%rsp
921.cfi_def_cfa	%rsp,8
922.Lcbc_abort:
923	ret
924.cfi_endproc
925.size	Camellia_cbc_encrypt,.-Camellia_cbc_encrypt
926
927.asciz	"Camellia for x86_64 by <appro\@openssl.org>"
928___
929}
930
931# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
932#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
933if ($win64) {
934$rec="%rcx";
935$frame="%rdx";
936$context="%r8";
937$disp="%r9";
938
939$code.=<<___;
940.extern	__imp_RtlVirtualUnwind
941.type	common_se_handler,\@abi-omnipotent
942.align	16
943common_se_handler:
944	push	%rsi
945	push	%rdi
946	push	%rbx
947	push	%rbp
948	push	%r12
949	push	%r13
950	push	%r14
951	push	%r15
952	pushfq
953	lea	-64(%rsp),%rsp
954
955	mov	120($context),%rax	# pull context->Rax
956	mov	248($context),%rbx	# pull context->Rip
957
958	mov	8($disp),%rsi		# disp->ImageBase
959	mov	56($disp),%r11		# disp->HandlerData
960
961	mov	0(%r11),%r10d		# HandlerData[0]
962	lea	(%rsi,%r10),%r10	# prologue label
963	cmp	%r10,%rbx		# context->Rip<prologue label
964	jb	.Lin_prologue
965
966	mov	152($context),%rax	# pull context->Rsp
967
968	mov	4(%r11),%r10d		# HandlerData[1]
969	lea	(%rsi,%r10),%r10	# epilogue label
970	cmp	%r10,%rbx		# context->Rip>=epilogue label
971	jae	.Lin_prologue
972
973	lea	40(%rax),%rax
974	mov	-8(%rax),%rbx
975	mov	-16(%rax),%rbp
976	mov	-24(%rax),%r13
977	mov	-32(%rax),%r14
978	mov	-40(%rax),%r15
979	mov	%rbx,144($context)	# restore context->Rbx
980	mov	%rbp,160($context)	# restore context->Rbp
981	mov	%r13,224($context)	# restore context->R13
982	mov	%r14,232($context)	# restore context->R14
983	mov	%r15,240($context)	# restore context->R15
984
985.Lin_prologue:
986	mov	8(%rax),%rdi
987	mov	16(%rax),%rsi
988	mov	%rax,152($context)	# restore context->Rsp
989	mov	%rsi,168($context)	# restore context->Rsi
990	mov	%rdi,176($context)	# restore context->Rdi
991
992	jmp	.Lcommon_seh_exit
993.size	common_se_handler,.-common_se_handler
994
995.type	cbc_se_handler,\@abi-omnipotent
996.align	16
997cbc_se_handler:
998	push	%rsi
999	push	%rdi
1000	push	%rbx
1001	push	%rbp
1002	push	%r12
1003	push	%r13
1004	push	%r14
1005	push	%r15
1006	pushfq
1007	lea	-64(%rsp),%rsp
1008
1009	mov	120($context),%rax	# pull context->Rax
1010	mov	248($context),%rbx	# pull context->Rip
1011
1012	lea	.Lcbc_prologue(%rip),%r10
1013	cmp	%r10,%rbx		# context->Rip<.Lcbc_prologue
1014	jb	.Lin_cbc_prologue
1015
1016	lea	.Lcbc_body(%rip),%r10
1017	cmp	%r10,%rbx		# context->Rip<.Lcbc_body
1018	jb	.Lin_cbc_frame_setup
1019
1020	mov	152($context),%rax	# pull context->Rsp
1021
1022	lea	.Lcbc_abort(%rip),%r10
1023	cmp	%r10,%rbx		# context->Rip>=.Lcbc_abort
1024	jae	.Lin_cbc_prologue
1025
1026	# handle pushf/popf in Camellia_cbc_encrypt
1027	lea	.Lcbc_enc_pushf(%rip),%r10
1028	cmp	%r10,%rbx		# context->Rip<=.Lcbc_enc_pushf
1029	jbe	.Lin_cbc_no_flag
1030	lea	8(%rax),%rax
1031	lea	.Lcbc_enc_popf(%rip),%r10
1032	cmp	%r10,%rbx		# context->Rip<.Lcbc_enc_popf
1033	jb	.Lin_cbc_no_flag
1034	lea	-8(%rax),%rax
1035	lea	.Lcbc_dec_pushf(%rip),%r10
1036	cmp	%r10,%rbx		# context->Rip<=.Lcbc_dec_pushf
1037	jbe	.Lin_cbc_no_flag
1038	lea	8(%rax),%rax
1039	lea	.Lcbc_dec_popf(%rip),%r10
1040	cmp	%r10,%rbx		# context->Rip<.Lcbc_dec_popf
1041	jb	.Lin_cbc_no_flag
1042	lea	-8(%rax),%rax
1043
1044.Lin_cbc_no_flag:
1045	mov	48(%rax),%rax		# $_rsp
1046	lea	48(%rax),%rax
1047
1048.Lin_cbc_frame_setup:
1049	mov	-8(%rax),%rbx
1050	mov	-16(%rax),%rbp
1051	mov	-24(%rax),%r12
1052	mov	-32(%rax),%r13
1053	mov	-40(%rax),%r14
1054	mov	-48(%rax),%r15
1055	mov	%rbx,144($context)	# restore context->Rbx
1056	mov	%rbp,160($context)	# restore context->Rbp
1057	mov	%r12,216($context)	# restore context->R12
1058	mov	%r13,224($context)	# restore context->R13
1059	mov	%r14,232($context)	# restore context->R14
1060	mov	%r15,240($context)	# restore context->R15
1061
1062.Lin_cbc_prologue:
1063	mov	8(%rax),%rdi
1064	mov	16(%rax),%rsi
1065	mov	%rax,152($context)	# restore context->Rsp
1066	mov	%rsi,168($context)	# restore context->Rsi
1067	mov	%rdi,176($context)	# restore context->Rdi
1068
1069.align	4
1070.Lcommon_seh_exit:
1071
1072	mov	40($disp),%rdi		# disp->ContextRecord
1073	mov	$context,%rsi		# context
1074	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
1075	.long	0xa548f3fc		# cld; rep movsq
1076
1077	mov	$disp,%rsi
1078	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
1079	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
1080	mov	0(%rsi),%r8		# arg3, disp->ControlPc
1081	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
1082	mov	40(%rsi),%r10		# disp->ContextRecord
1083	lea	56(%rsi),%r11		# &disp->HandlerData
1084	lea	24(%rsi),%r12		# &disp->EstablisherFrame
1085	mov	%r10,32(%rsp)		# arg5
1086	mov	%r11,40(%rsp)		# arg6
1087	mov	%r12,48(%rsp)		# arg7
1088	mov	%rcx,56(%rsp)		# arg8, (NULL)
1089	call	*__imp_RtlVirtualUnwind(%rip)
1090
1091	mov	\$1,%eax		# ExceptionContinueSearch
1092	lea	64(%rsp),%rsp
1093	popfq
1094	pop	%r15
1095	pop	%r14
1096	pop	%r13
1097	pop	%r12
1098	pop	%rbp
1099	pop	%rbx
1100	pop	%rdi
1101	pop	%rsi
1102	ret
1103.size	cbc_se_handler,.-cbc_se_handler
1104
1105.section	.pdata
1106.align	4
1107	.rva	.LSEH_begin_Camellia_EncryptBlock_Rounds
1108	.rva	.LSEH_end_Camellia_EncryptBlock_Rounds
1109	.rva	.LSEH_info_Camellia_EncryptBlock_Rounds
1110
1111	.rva	.LSEH_begin_Camellia_DecryptBlock_Rounds
1112	.rva	.LSEH_end_Camellia_DecryptBlock_Rounds
1113	.rva	.LSEH_info_Camellia_DecryptBlock_Rounds
1114
1115	.rva	.LSEH_begin_Camellia_Ekeygen
1116	.rva	.LSEH_end_Camellia_Ekeygen
1117	.rva	.LSEH_info_Camellia_Ekeygen
1118
1119	.rva	.LSEH_begin_Camellia_cbc_encrypt
1120	.rva	.LSEH_end_Camellia_cbc_encrypt
1121	.rva	.LSEH_info_Camellia_cbc_encrypt
1122
1123.section	.xdata
1124.align	8
1125.LSEH_info_Camellia_EncryptBlock_Rounds:
1126	.byte	9,0,0,0
1127	.rva	common_se_handler
1128	.rva	.Lenc_prologue,.Lenc_epilogue	# HandlerData[]
1129.LSEH_info_Camellia_DecryptBlock_Rounds:
1130	.byte	9,0,0,0
1131	.rva	common_se_handler
1132	.rva	.Ldec_prologue,.Ldec_epilogue	# HandlerData[]
1133.LSEH_info_Camellia_Ekeygen:
1134	.byte	9,0,0,0
1135	.rva	common_se_handler
1136	.rva	.Lkey_prologue,.Lkey_epilogue	# HandlerData[]
1137.LSEH_info_Camellia_cbc_encrypt:
1138	.byte	9,0,0,0
1139	.rva	cbc_se_handler
1140___
1141}
1142
1143$code =~ s/\`([^\`]*)\`/eval $1/gem;
1144print $code;
1145close STDOUT;
1146