#! /usr/bin/env perl
# Copyright 2008-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
#
# This module may be used under the terms of either the GNU General
# Public License version 2 or later, the GNU Lesser General Public
# License version 2.1 or later, the Mozilla Public License version
# 1.1 or the BSD License. The exact terms of either license are
# distributed along with this module. For further details see
# http://www.openssl.org/~appro/camellia/.
# ====================================================================

# Performance in cycles per processed byte (less is better) in
# 'openssl speed ...' benchmark:
#
#                         AMD64   Core2   EM64T
# -evp camellia-128-ecb   16.7    21.0    22.7
# + over gcc 3.4.6        +25%    +5%     0%
#
# camellia-128-cbc        15.7    20.4    21.1
#
# 128-bit key setup       128     216     205     cycles/key
# + over gcc 3.4.6        +54%    +39%    +15%
#
# Numbers in "+" rows represent performance improvement over compiler
# generated code. Key setup timings are impressive on AMD and Core2
# thanks to 64-bit operations being covertly deployed. Improvement on
# EM64T, pre-Core2 Intel x86_64 CPU, is not as impressive, because it
# apparently emulates some of 64-bit operations in [32-bit] microcode.

# Command line: [flavour] output.  A first argument containing a dot is
# taken to be the output file name, in which case no flavour is passed on.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 output is selected by a nasm/masm/mingw64 flavour or a .asm file.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Look for the translator next to this script, then under ../../perlasm.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT from here on is piped through the
# translator.  Fail loudly if the pipe cannot be set up rather than
# silently producing no output.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# hi(REG): name of the second-lowest byte of a legacy register,
# e.g. %eax -> %ah.  Only %[er][a-d]x have such a sub-register.
sub hi { my $r=shift; $r =~ s/%[er]([a-d])x/%${1}h/;	$r; }

# lo(REG): name of the lowest byte of a register,
# e.g. %eax -> %al, %esi -> %sil, %r8d -> %r8b.
sub lo { my $r=shift; $r =~ s/%[er]([a-d])x/%${1}l/;
			$r =~ s/%[er]([sd]i)/%${1}l/;
			$r =~ s/%(r[0-9]+)[d]?/%${1}b/;	$r; }

# Register allocation shared by all code emitted below.
$t0="%eax";$t1="%ebx";$t2="%ecx";$t3="%edx";	# temporaries
@S=("%r8d","%r9d","%r10d","%r11d");		# 128-bit block state
$i0="%esi";					# table indices
$i1="%edi";
$Tbl="%rbp";	# size optimization
$inp="%r12";
$out="%r13";
$key="%r14";
$keyend="%r15";
# 32-bit name of the register the V1.x entry points read keyBitLength from.
$arg0d=$win64?"%ecx":"%edi";

# const unsigned int Camellia_SBOX[4][256];
# Well, sort of... Camellia_SBOX[0][] is interleaved with [1][],
# and [2][] - with [3][]. This is done to minimize code size.
# Byte offsets of the four logical tables inside .LCamellia_SBOX.  Because
# the tables are interleaved pairwise, every lookup below scales the index
# by 8 and adds one of these displacements.
$SBOX1_1110=0;		# Camellia_SBOX[0]
$SBOX4_4404=4;		# Camellia_SBOX[1]
$SBOX2_0222=2048;	# Camellia_SBOX[2]
$SBOX3_3033=2052;	# Camellia_SBOX[3]

# Camellia_Feistel(i [, seed])
#
# Appends the code for one Feistel round to $code.
#   i    - round index; its parity decides which half of @S is the input
#          (even: @S[0,1] in, @S[2,3] updated; odd: the other way round).
#   seed - byte displacement of the round keys relative to $key; defaults
#          to 0.  A negative seed makes the key prefetch walk downwards in
#          8-byte steps instead of upwards (used for decryption).
# On entry $t0/$t1 hold the round key, on exit they hold the prefetched
# key for round i+1.
sub Camellia_Feistel {
my $i=$_[0];
my $seed=defined($_[1])?$_[1]:0;
my $scale=$seed<0?-8:8;
my $j=($i&1)*2;
my ($s0,$s1,$s2,$s3)=($S[($j)%4],$S[($j+1)%4],$S[($j+2)%4],$S[($j+3)%4]);

$code.=<<___;
	xor	$s0,$t0				# t0^=key[0]
	xor	$s1,$t1				# t1^=key[1]
	movz	`&hi("$t0")`,$i0		# (t0>>8)&0xff
	movz	`&lo("$t1")`,$i1		# (t1>>0)&0xff
	mov	$SBOX3_3033($Tbl,$i0,8),$t3	# t3=SBOX3_3033[0]
	mov	$SBOX1_1110($Tbl,$i1,8),$t2	# t2=SBOX1_1110[1]
	movz	`&lo("$t0")`,$i0		# (t0>>0)&0xff
	shr	\$16,$t0
	movz	`&hi("$t1")`,$i1		# (t1>>8)&0xff
	xor	$SBOX4_4404($Tbl,$i0,8),$t3	# t3^=SBOX4_4404[0]
	shr	\$16,$t1
	xor	$SBOX4_4404($Tbl,$i1,8),$t2	# t2^=SBOX4_4404[1]
	movz	`&hi("$t0")`,$i0		# (t0>>24)&0xff
	movz	`&lo("$t1")`,$i1		# (t1>>16)&0xff
	xor	$SBOX1_1110($Tbl,$i0,8),$t3	# t3^=SBOX1_1110[0]
	xor	$SBOX3_3033($Tbl,$i1,8),$t2	# t2^=SBOX3_3033[1]
	movz	`&lo("$t0")`,$i0		# (t0>>16)&0xff
	movz	`&hi("$t1")`,$i1		# (t1>>24)&0xff
	xor	$SBOX2_0222($Tbl,$i0,8),$t3	# t3^=SBOX2_0222[0]
	xor	$SBOX2_0222($Tbl,$i1,8),$t2	# t2^=SBOX2_0222[1]
	mov	`$seed+($i+1)*$scale`($key),$t1	# prefetch key[i+1]
	mov	`$seed+($i+1)*$scale+4`($key),$t0
	xor	$t3,$t2				# t2^=t3
	ror	\$8,$t3				# t3=RightRotate(t3,8)
	xor	$t2,$s2
	xor	$t2,$s3
	xor	$t3,$s3
___
}

# void Camellia_EncryptBlock_Rounds(
#		int grandRounds,
#		const Byte plaintext[],
#		const KEY_TABLE_TYPE keyTable,
#		Byte ciphertext[])
#
# Note that this assignment (re)initialises $code; everything emitted
# afterwards is appended to it.
$code=<<___;
.text

# V1.x API
.globl	Camellia_EncryptBlock
.type	Camellia_EncryptBlock,\@abi-omnipotent
.align	16
Camellia_EncryptBlock:
	movl	\$128,%eax
	subl	$arg0d,%eax
	movl	\$3,$arg0d
	adcl	\$0,$arg0d	# keyBitLength==128?3:4
	jmp	.Lenc_rounds
.size	Camellia_EncryptBlock,.-Camellia_EncryptBlock
# V2
.globl	Camellia_EncryptBlock_Rounds
.type	Camellia_EncryptBlock_Rounds,\@function,4
.align	16
.Lenc_rounds:
Camellia_EncryptBlock_Rounds:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Lenc_prologue:

	#mov	%rsi,$inp		# put away arguments
	mov	%rcx,$out
	mov	%rdx,$key

	shl	\$6,%edi		# process grandRounds
	lea	.LCamellia_SBOX(%rip),$Tbl
	lea	($key,%rdi),$keyend

	mov	0(%rsi),@S[0]		# load plaintext
	mov	4(%rsi),@S[1]
	mov	8(%rsi),@S[2]
	bswap	@S[0]
	mov	12(%rsi),@S[3]
	bswap	@S[1]
	bswap	@S[2]
	bswap	@S[3]

	call	_x86_64_Camellia_encrypt

	bswap	@S[0]
	bswap	@S[1]
	bswap	@S[2]
	mov	@S[0],0($out)
	bswap	@S[3]
	mov	@S[1],4($out)
	mov	@S[2],8($out)
	mov	@S[3],12($out)

	mov	0(%rsp),%r15
.cfi_restore	%r15
	mov	8(%rsp),%r14
.cfi_restore	%r14
	mov	16(%rsp),%r13
.cfi_restore	%r13
	mov	24(%rsp),%rbp
.cfi_restore	%rbp
	mov	32(%rsp),%rbx
.cfi_restore	%rbx
	lea	40(%rsp),%rsp
.cfi_adjust_cfa_offset	-40
.Lenc_epilogue:
	ret
.cfi_endproc
.size	Camellia_EncryptBlock_Rounds,.-Camellia_EncryptBlock_Rounds

.type	_x86_64_Camellia_encrypt,\@abi-omnipotent
.align	16
_x86_64_Camellia_encrypt:
	xor	0($key),@S[1]
	xor	4($key),@S[0]		# ^=key[0-3]
	xor	8($key),@S[3]
	xor	12($key),@S[2]
.align	16
.Leloop:
	mov	16($key),$t1		# prefetch key[4-5]
	mov	20($key),$t0

___
	# Six Feistel rounds per loop iteration, keys starting at 16($key).
	for ($i=0;$i<6;$i++) { Camellia_Feistel($i,16); }
$code.=<<___;
	lea	16*4($key),$key
	cmp	$keyend,$key
	mov	8($key),$t3		# prefetch key[2-3]
	mov	12($key),$t2
	je	.Ledone

	and	@S[0],$t0
	or	@S[3],$t3
	rol	\$1,$t0
	xor	$t3,@S[2]		# s2^=s3|key[3];
	xor	$t0,@S[1]		# s1^=LeftRotate(s0&key[0],1);
	and	@S[2],$t2
	or	@S[1],$t1
	rol	\$1,$t2
	xor	$t1,@S[0]		# s0^=s1|key[1];
	xor	$t2,@S[3]		# s3^=LeftRotate(s2&key[2],1);
	jmp	.Leloop

.align	16
.Ledone:
	xor	@S[2],$t0		# SwapHalf
	xor	@S[3],$t1
	xor	@S[0],$t2
	xor	@S[1],$t3

	mov	$t0,@S[0]
	mov	$t1,@S[1]
	mov	$t2,@S[2]
	mov	$t3,@S[3]

	.byte	0xf3,0xc3		# rep ret
.size	_x86_64_Camellia_encrypt,.-_x86_64_Camellia_encrypt

# V1.x API
.globl	Camellia_DecryptBlock
.type	Camellia_DecryptBlock,\@abi-omnipotent
.align	16
Camellia_DecryptBlock:
	movl	\$128,%eax
	subl	$arg0d,%eax
	movl	\$3,$arg0d
	adcl	\$0,$arg0d	# keyBitLength==128?3:4
	jmp	.Ldec_rounds
.size	Camellia_DecryptBlock,.-Camellia_DecryptBlock
# V2
.globl	Camellia_DecryptBlock_Rounds
.type	Camellia_DecryptBlock_Rounds,\@function,4
.align	16
.Ldec_rounds:
Camellia_DecryptBlock_Rounds:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Ldec_prologue:

	#mov	%rsi,$inp		# put away arguments
	mov	%rcx,$out
	mov	%rdx,$keyend

	shl	\$6,%edi		# process grandRounds
	lea	.LCamellia_SBOX(%rip),$Tbl
	lea	($keyend,%rdi),$key

	mov	0(%rsi),@S[0]		# load plaintext
	mov	4(%rsi),@S[1]
	mov	8(%rsi),@S[2]
	bswap	@S[0]
	mov	12(%rsi),@S[3]
	bswap	@S[1]
	bswap	@S[2]
	bswap	@S[3]

	call	_x86_64_Camellia_decrypt

	bswap	@S[0]
	bswap	@S[1]
	bswap	@S[2]
	mov	@S[0],0($out)
	bswap	@S[3]
	mov	@S[1],4($out)
	mov	@S[2],8($out)
	mov	@S[3],12($out)

	mov	0(%rsp),%r15
.cfi_restore	%r15
	mov	8(%rsp),%r14
.cfi_restore	%r14
	mov	16(%rsp),%r13
.cfi_restore	%r13
	mov	24(%rsp),%rbp
.cfi_restore	%rbp
	mov	32(%rsp),%rbx
.cfi_restore	%rbx
	lea	40(%rsp),%rsp
.cfi_adjust_cfa_offset	-40
.Ldec_epilogue:
	ret
.cfi_endproc
.size	Camellia_DecryptBlock_Rounds,.-Camellia_DecryptBlock_Rounds

.type	_x86_64_Camellia_decrypt,\@abi-omnipotent
.align	16
_x86_64_Camellia_decrypt:
	xor	0($key),@S[1]
	xor	4($key),@S[0]		# ^=key[0-3]
	xor	8($key),@S[3]
	xor	12($key),@S[2]
.align	16
.Ldloop:
	mov	-8($key),$t1		# prefetch key[4-5]
	mov	-4($key),$t0

___
	# Same six rounds, but with a negative seed so that the round keys
	# are consumed from high addresses towards low ones.
	for ($i=0;$i<6;$i++) { Camellia_Feistel($i,-8); }
$code.=<<___;
	lea	-16*4($key),$key
	cmp	$keyend,$key
	mov	0($key),$t3		# prefetch key[2-3]
	mov	4($key),$t2
	je	.Lddone

	and	@S[0],$t0
	or	@S[3],$t3
	rol	\$1,$t0
	xor	$t3,@S[2]		# s2^=s3|key[3];
	xor	$t0,@S[1]		# s1^=LeftRotate(s0&key[0],1);
	and	@S[2],$t2
	or	@S[1],$t1
	rol	\$1,$t2
	xor	$t1,@S[0]		# s0^=s1|key[1];
	xor	$t2,@S[3]		# s3^=LeftRotate(s2&key[2],1);

	jmp	.Ldloop

.align	16
.Lddone:
	xor	@S[2],$t2
	xor	@S[3],$t3
	xor	@S[0],$t0
	xor	@S[1],$t1

	mov	$t2,@S[0]		# SwapHalf
	mov	$t3,@S[1]
	mov	$t0,@S[2]
	mov	$t1,@S[3]

	.byte	0xf3,0xc3		# rep ret
.size	_x86_64_Camellia_decrypt,.-_x86_64_Camellia_decrypt
___

# _saveround(rnd, key, [bias,] regs...)
#
# Appends stores of registers into the key table at key+bias+rnd*8.
# An optional numeric bias may precede the register list; it is detected
# by int() of the first element being non-zero (register names evaluate
# to 0), so a bias of 0 is simply omitted.
#   - four registers: 32-bit words, stored pairwise swapped
#     (regs[1],regs[0],regs[3],regs[2]) at +0,+4,+8,+12;
#   - one or two registers: stored at +0 and, if present, +8.
sub _saveround {
my ($rnd,$key,@T)=@_;
my $bias=int($T[0])?shift(@T):0;

    if ($#T==3) {
	$code.=<<___;
	mov	$T[1],`$bias+$rnd*8+0`($key)
	mov	$T[0],`$bias+$rnd*8+4`($key)
	mov	$T[3],`$bias+$rnd*8+8`($key)
	mov	$T[2],`$bias+$rnd*8+12`($key)
___
    } else {
	$code.="	mov	$T[0],`$bias+$rnd*8+0`($key)\n";
	$code.="	mov	$T[1],`$bias+$rnd*8+8`($key)\n"	if ($#T>=1);
    }
}

# _loadround(rnd, key, [bias,] regs...)
#
# Counterpart of _saveround for one or two registers: appends loads from
# key+bias+rnd*8 (+0 and, if a second register is given, +8).
sub _loadround {
my ($rnd,$key,@T)=@_;
my $bias=int($T[0])?shift(@T):0;

$code.="	mov	`$bias+$rnd*8+0`($key),$T[0]\n";
$code.="	mov	`$bias+$rnd*8+8`($key),$T[1]\n"	if ($#T>=1);
}

# shld is very slow on Intel EM64T family. Even on AMD it limits
# instruction decode rate [because it's VectorPath] and consequently
# performance...
#
# __rotl128(i0, i1, rot): shld-based 128-bit left rotate of i0:i1 by rot
# bits, clobbering %r11.  No caller is visible in this file section.
sub __rotl128 {
my ($i0,$i1,$rot)=@_;

    if ($rot) {
	$code.=<<___;
	mov	$i0,%r11
	shld	\$$rot,$i1,$i0
	shld	\$$rot,%r11,$i1
___
    }
}

# ...
# Implementing 128-bit rotate without shld gives 80% better
# performance on EM64T, +15% on AMD64 and only ~7% degradation on
# Core2. This is therefore preferred.
#
# _rotl128(i0, i1, rot): appends a 128-bit left rotate of the register
# pair i0:i1 by rot bits (nothing is emitted for rot==0).  The emitted
# code uses %r9 and %r11 as scratch registers.
sub _rotl128 {
my ($i0,$i1,$rot)=@_;

    if ($rot) {
	$code.=<<___;
	mov	$i0,%r11
	shl	\$$rot,$i0
	mov	$i1,%r9
	shr	\$`64-$rot`,%r9
	shr	\$`64-$rot`,%r11
	or	%r9,$i0
	shl	\$$rot,$i1
	or	%r11,$i1
___
    }
}

# Key schedule.  $step counts the Feistel rounds emitted so far, which
# selects both the half of @S being updated and the constant prefetched
# from .LCamellia_SIGMA.
{ my $step=0;

$code.=<<___;
.globl	Camellia_Ekeygen
.type	Camellia_Ekeygen,\@function,3
.align	16
Camellia_Ekeygen:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Lkey_prologue:

	mov	%edi,${keyend}d		# put away arguments, keyBitLength
	mov	%rdx,$out		# keyTable

	mov	0(%rsi),@S[0]		# load 0-127 bits
	mov	4(%rsi),@S[1]
	mov	8(%rsi),@S[2]
	mov	12(%rsi),@S[3]

	bswap	@S[0]
	bswap	@S[1]
	bswap	@S[2]
	bswap	@S[3]
___
	&_saveround	(0,$out,@S);	# KL<<<0
$code.=<<___;
	cmp	\$128,$keyend		# check keyBitLength
	je	.L1st128

	mov	16(%rsi),@S[0]		# load 128-191 bits
	mov	20(%rsi),@S[1]
	cmp	\$192,$keyend
	je	.L1st192
	mov	24(%rsi),@S[2]		# load 192-255 bits
	mov	28(%rsi),@S[3]
	jmp	.L1st256
.L1st192:
	mov	@S[0],@S[2]
	mov	@S[1],@S[3]
	not	@S[2]
	not	@S[3]
.L1st256:
	bswap	@S[0]
	bswap	@S[1]
	bswap	@S[2]
	bswap	@S[3]
___
	&_saveround	(4,$out,@S);	# temp storage for KR!
$code.=<<___;
	xor	0($out),@S[1]		# KR^KL
	xor	4($out),@S[0]
	xor	8($out),@S[3]
	xor	12($out),@S[2]

.L1st128:
	lea	.LCamellia_SIGMA(%rip),$key
	lea	.LCamellia_SBOX(%rip),$Tbl

	mov	0($key),$t1
	mov	4($key),$t0
___
	&Camellia_Feistel($step++);
	&Camellia_Feistel($step++);
$code.=<<___;
	xor	0($out),@S[1]		# ^KL
	xor	4($out),@S[0]
	xor	8($out),@S[3]
	xor	12($out),@S[2]
___
	&Camellia_Feistel($step++);
	&Camellia_Feistel($step++);
$code.=<<___;
	cmp	\$128,$keyend
	jne	.L2nd256

	lea	128($out),$out		# size optimization
	shl	\$32,%r8		# @S[0]||
	shl	\$32,%r10		# @S[2]||
	or	%r9,%r8			# ||@S[1]
	or	%r11,%r10		# ||@S[3]
___
	# 128-bit keys: subkeys are rotations of KL (%rax:%rbx) and
	# KA (%r8:%r10).  The trailing arithmetic tracks the accumulated
	# rotation of each pair.
	&_loadround	(0,$out,-128,"%rax","%rbx");	# KL
	&_saveround	(2,$out,-128,"%r8","%r10");	# KA<<<0
	&_rotl128	("%rax","%rbx",15);
	&_saveround	(4,$out,-128,"%rax","%rbx");	# KL<<<15
	&_rotl128	("%r8","%r10",15);
	&_saveround	(6,$out,-128,"%r8","%r10");	# KA<<<15
	&_rotl128	("%r8","%r10",15);		# 15+15=30
	&_saveround	(8,$out,-128,"%r8","%r10");	# KA<<<30
	&_rotl128	("%rax","%rbx",30);		# 15+30=45
	&_saveround	(10,$out,-128,"%rax","%rbx");	# KL<<<45
	&_rotl128	("%r8","%r10",15);		# 30+15=45
	&_saveround	(12,$out,-128,"%r8");		# KA<<<45
	&_rotl128	("%rax","%rbx",15);		# 45+15=60
	&_saveround	(13,$out,-128,"%rbx");		# KL<<<60
	&_rotl128	("%r8","%r10",15);		# 45+15=60
	&_saveround	(14,$out,-128,"%r8","%r10");	# KA<<<60
	&_rotl128	("%rax","%rbx",17);		# 60+17=77
	&_saveround	(16,$out,-128,"%rax","%rbx");	# KL<<<77
	&_rotl128	("%rax","%rbx",17);		# 77+17=94
	&_saveround	(18,$out,-128,"%rax","%rbx");	# KL<<<94
	&_rotl128	("%r8","%r10",34);		# 60+34=94
	&_saveround	(20,$out,-128,"%r8","%r10");	# KA<<<94
	&_rotl128	("%rax","%rbx",17);		# 94+17=111
	&_saveround	(22,$out,-128,"%rax","%rbx");	# KL<<<111
	&_rotl128	("%r8","%r10",17);		# 94+17=111
	&_saveround	(24,$out,-128,"%r8","%r10");	# KA<<<111
$code.=<<___;
	mov	\$3,%eax
	jmp	.Ldone
.align	16
.L2nd256:
___
	&_saveround	(6,$out,@S);	# temp storage for KA!
$code.=<<___;
	xor	`4*8+0`($out),@S[1]	# KA^KR
	xor	`4*8+4`($out),@S[0]
	xor	`5*8+0`($out),@S[3]
	xor	`5*8+4`($out),@S[2]
___
	&Camellia_Feistel($step++);
	&Camellia_Feistel($step++);

	# 192/256-bit keys: KL, KR, KA are reloaded from their temporary
	# slots; KB is what the two rounds above left in @S.
	&_loadround	(0,$out,"%rax","%rbx");	# KL
	&_loadround	(4,$out,"%rcx","%rdx");	# KR
	&_loadround	(6,$out,"%r14","%r15");	# KA
$code.=<<___;
	lea	128($out),$out		# size optimization
	shl	\$32,%r8		# @S[0]||
	shl	\$32,%r10		# @S[2]||
	or	%r9,%r8			# ||@S[1]
	or	%r11,%r10		# ||@S[3]
___
	&_saveround	(2,$out,-128,"%r8","%r10");	# KB<<<0
	&_rotl128	("%rcx","%rdx",15);
	&_saveround	(4,$out,-128,"%rcx","%rdx");	# KR<<<15
	&_rotl128	("%r14","%r15",15);
	&_saveround	(6,$out,-128,"%r14","%r15");	# KA<<<15
	&_rotl128	("%rcx","%rdx",15);		# 15+15=30
	&_saveround	(8,$out,-128,"%rcx","%rdx");	# KR<<<30
	&_rotl128	("%r8","%r10",30);
	&_saveround	(10,$out,-128,"%r8","%r10");	# KB<<<30
	&_rotl128	("%rax","%rbx",45);
	&_saveround	(12,$out,-128,"%rax","%rbx");	# KL<<<45
	&_rotl128	("%r14","%r15",30);		# 15+30=45
	&_saveround	(14,$out,-128,"%r14","%r15");	# KA<<<45
	&_rotl128	("%rax","%rbx",15);		# 45+15=60
	&_saveround	(16,$out,-128,"%rax","%rbx");	# KL<<<60
	&_rotl128	("%rcx","%rdx",30);		# 30+30=60
	&_saveround	(18,$out,-128,"%rcx","%rdx");	# KR<<<60
	&_rotl128	("%r8","%r10",30);		# 30+30=60
	&_saveround	(20,$out,-128,"%r8","%r10");	# KB<<<60
	&_rotl128	("%rax","%rbx",17);		# 60+17=77
	&_saveround	(22,$out,-128,"%rax","%rbx");	# KL<<<77
	&_rotl128	("%r14","%r15",32);		# 45+32=77
	&_saveround	(24,$out,-128,"%r14","%r15");	# KA<<<77
	&_rotl128	("%rcx","%rdx",34);		# 60+34=94
	&_saveround	(26,$out,-128,"%rcx","%rdx");	# KR<<<94
	&_rotl128	("%r14","%r15",17);		# 77+17=94
	&_saveround	(28,$out,-128,"%r14","%r15");	# KA<<<94
	&_rotl128	("%rax","%rbx",34);		# 77+34=111
	&_saveround	(30,$out,-128,"%rax","%rbx");	# KL<<<111
	&_rotl128	("%r8","%r10",51);		# 60+51=111
	&_saveround	(32,$out,-128,"%r8","%r10");	# KB<<<111
$code.=<<___;
	mov	\$4,%eax
.Ldone:
	mov	0(%rsp),%r15
.cfi_restore	%r15
	mov	8(%rsp),%r14
.cfi_restore	%r14
	mov	16(%rsp),%r13
.cfi_restore	%r13
	mov	24(%rsp),%rbp
.cfi_restore	%rbp
	mov	32(%rsp),%rbx
.cfi_restore	%rbx
	lea	40(%rsp),%rsp
.cfi_adjust_cfa_offset	-40
.Lkey_epilogue:
	ret
.cfi_endproc
.size	Camellia_Ekeygen,.-Camellia_Ekeygen
___
}

# Base 8-bit substitution table from which the four 32-bit lookup
# tables are derived by the S* helpers below.
@SBOX=(
112,130, 44,236,179, 39,192,229,228,133, 87, 53,234, 12,174, 65,
 35,239,107,147, 69, 25,165, 33,237, 14, 79, 78, 29,101,146,189,
134,184,175,143,124,235, 31,206, 62, 48,220, 95, 94,197, 11, 26,
166,225, 57,202,213, 71, 93, 61,217,  1, 90,214, 81, 86,108, 77,
139, 13,154,102,251,204,176, 45,116, 18, 43, 32,240,177,132,153,
223, 76,203,194, 52,126,118,  5,109,183,169, 49,209, 23,  4,215,
 20, 88, 58, 97,222, 27, 17, 28, 50, 15,156, 22, 83, 24,242, 34,
254, 68,207,178,195,181,122,145, 36,  8,232,168, 96,252,105, 80,
170,208,160,125,161,137, 98,151, 84, 91, 30,149,224,255,100,210,
 16,196,  0, 72,163,247,117,219,138,  3,230,218,  9, 63,221,148,
135, 92,131,  2,205, 74,144, 51,115,103,246,243,157,127,191,226,
 82,155,216, 38,200, 55,198, 59,129,150,111, 75, 19,190, 99, 46,
233,121,167,140,159,110,188,142, 41,245,249,182, 47,253,180, 89,
120,152,  6,106,231, 70,113,186,212, 37,171, 66,136,162,141,250,
114,  7,185, 85,248,238,172, 10, 54, 73, 42,104, 60, 56,241,164,
 64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158);

# Each helper returns table entry $i as a "0x%08x" string.  The digits in
# the name give the byte layout of the 32-bit word (0 = zero byte):
#   S1110 - SBOX[i] replicated into the three upper bytes;
#   S4404 - SBOX[rotl8(i,1)] in bytes 3, 2 and 0;
#   S0222 - rotl8(SBOX[i],1) in the three lower bytes;
#   S3033 - rotr8(SBOX[i],1) in bytes 3, 1 and 0.
sub S1110 { my $i=shift; $i=$SBOX[$i]; $i=$i<<24|$i<<16|$i<<8; sprintf("0x%08x",$i); }
sub S4404 { my $i=shift; $i=($i<<1|$i>>7)&0xff; $i=$SBOX[$i]; $i=$i<<24|$i<<16|$i; sprintf("0x%08x",$i); }
sub S0222 { my $i=shift; $i=$SBOX[$i]; $i=($i<<1|$i>>7)&0xff; $i=$i<<16|$i<<8|$i; sprintf("0x%08x",$i); }
sub S3033 { my $i=shift; $i=$SBOX[$i]; $i=($i>>1|$i<<7)&0xff; $i=$i<<24|$i<<8|$i; sprintf("0x%08x",$i); }

$code.=<<___;
.align	64
.LCamellia_SIGMA:
.long	0x3bcc908b, 0xa09e667f, 0x4caa73b2, 0xb67ae858
.long	0xe94f82be, 0xc6ef372f, 0xf1d36f1c, 0x54ff53a5
.long	0xde682d1d, 0x10e527fa, 0xb3e6c1fd, 0xb05688c2
.long	0,          0,          0,          0
.LCamellia_SBOX:
___
# tables are interleaved, remember?
# data_word(LIST): appends one ".long" directive holding LIST.
sub data_word { $code.=".long\t".join(',',@_)."\n"; }
for ($i=0;$i<256;$i++) { &data_word(&S1110($i),&S4404($i)); }
for ($i=0;$i<256;$i++) { &data_word(&S0222($i),&S3033($i)); }

# void Camellia_cbc_encrypt (const unsigned char *inp, unsigned char *out,
#			size_t length, const CAMELLIA_KEY *key,
#			unsigned char *ivp,const int enc);
{
# Stack frame slots, addressed relative to the realigned %rsp.
$_key="0(%rsp)";
$_end="8(%rsp)";	# inp+len&~15
$_res="16(%rsp)";	# len&15
$ivec="24(%rsp)";
$_ivp="40(%rsp)";
$_rsp="48(%rsp)";

$code.=<<___;
.globl	Camellia_cbc_encrypt
.type	Camellia_cbc_encrypt,\@function,6
.align	16
Camellia_cbc_encrypt:
.cfi_startproc
	cmp	\$0,%rdx
	je	.Lcbc_abort
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Lcbc_prologue:

	mov	%rsp,%rbp
.cfi_def_cfa_register	%rbp
	sub	\$64,%rsp
	and	\$-64,%rsp

	# place stack frame just "above mod 1024" the key schedule,
	# this ensures that cache associativity suffices
	lea	-64-63(%rcx),%r10
	sub	%rsp,%r10
	neg	%r10
	and	\$0x3C0,%r10
	sub	%r10,%rsp
	#add	\$8,%rsp		# 8 is reserved for callee's ra

	mov	%rdi,$inp		# inp argument
	mov	%rsi,$out		# out argument
	mov	%r8,%rbx		# ivp argument
	mov	%rcx,$key		# key argument
	mov	272(%rcx),${keyend}d	# grandRounds

	mov	%r8,$_ivp
	mov	%rbp,$_rsp
.cfi_cfa_expression	$_rsp,deref,+56

.Lcbc_body:
	lea	.LCamellia_SBOX(%rip),$Tbl

	mov	\$32,%ecx
.align	4
.Lcbc_prefetch_sbox:
	mov	0($Tbl),%rax
	mov	32($Tbl),%rsi
	mov	64($Tbl),%rdi
	mov	96($Tbl),%r11
	lea	128($Tbl),$Tbl
	loop	.Lcbc_prefetch_sbox
	sub	\$4096,$Tbl
	shl	\$6,$keyend
	mov	%rdx,%rcx		# len argument
	lea	($key,$keyend),$keyend

	cmp	\$0,%r9d		# enc argument
	je	.LCBC_DECRYPT

	and	\$-16,%rdx
	and	\$15,%rcx		# length residue
	lea	($inp,%rdx),%rdx
	mov	$key,$_key
	mov	%rdx,$_end
	mov	%rcx,$_res

	cmp	$inp,%rdx
	mov	0(%rbx),@S[0]		# load IV
	mov	4(%rbx),@S[1]
	mov	8(%rbx),@S[2]
	mov	12(%rbx),@S[3]
	je	.Lcbc_enc_tail
	jmp	.Lcbc_eloop

.align	16
.Lcbc_eloop:
	xor	0($inp),@S[0]
	xor	4($inp),@S[1]
	xor	8($inp),@S[2]
	bswap	@S[0]
	xor	12($inp),@S[3]
	bswap	@S[1]
	bswap	@S[2]
	bswap	@S[3]

	call	_x86_64_Camellia_encrypt

	mov	$_key,$key		# "rewind" the key
	bswap	@S[0]
	mov	$_end,%rdx
	bswap	@S[1]
	mov	$_res,%rcx
	bswap	@S[2]
	mov	@S[0],0($out)
	bswap	@S[3]
	mov	@S[1],4($out)
	mov	@S[2],8($out)
	lea	16($inp),$inp
	mov	@S[3],12($out)
	cmp	%rdx,$inp
	lea	16($out),$out
	jne	.Lcbc_eloop

	cmp	\$0,%rcx
	jne	.Lcbc_enc_tail

	mov	$_ivp,$out
	mov	@S[0],0($out)		# write out IV residue
	mov	@S[1],4($out)
	mov	@S[2],8($out)
	mov	@S[3],12($out)
	jmp	.Lcbc_done

.align	16
.Lcbc_enc_tail:
	xor	%rax,%rax
	mov	%rax,0+$ivec
	mov	%rax,8+$ivec
	mov	%rax,$_res

.Lcbc_enc_pushf:
	pushfq
	cld
	mov	$inp,%rsi
	lea	8+$ivec,%rdi
	.long	0x9066A4F3		# rep movsb
	popfq
.Lcbc_enc_popf:

	lea	$ivec,$inp
	lea	16+$ivec,%rax
	mov	%rax,$_end
	jmp	.Lcbc_eloop		# one more time

.align	16
.LCBC_DECRYPT:
	xchg	$key,$keyend
	add	\$15,%rdx
	and	\$15,%rcx		# length residue
	and	\$-16,%rdx
	mov	$key,$_key
	lea	($inp,%rdx),%rdx
	mov	%rdx,$_end
	mov	%rcx,$_res

	mov	(%rbx),%rax		# load IV
	mov	8(%rbx),%rbx
	jmp	.Lcbc_dloop
.align	16
.Lcbc_dloop:
	mov	0($inp),@S[0]
	mov	4($inp),@S[1]
	mov	8($inp),@S[2]
	bswap	@S[0]
	mov	12($inp),@S[3]
	bswap	@S[1]
	mov	%rax,0+$ivec		# save IV to temporary storage
	bswap	@S[2]
	mov	%rbx,8+$ivec
	bswap	@S[3]

	call	_x86_64_Camellia_decrypt

	mov	$_key,$key		# "rewind" the key
	mov	$_end,%rdx
	mov	$_res,%rcx

	bswap	@S[0]
	mov	($inp),%rax		# load IV for next iteration
	bswap	@S[1]
	mov	8($inp),%rbx
	bswap	@S[2]
	xor	0+$ivec,@S[0]
	bswap	@S[3]
	xor	4+$ivec,@S[1]
	xor	8+$ivec,@S[2]
	lea	16($inp),$inp
	xor	12+$ivec,@S[3]
	cmp	%rdx,$inp
	je	.Lcbc_ddone

	mov	@S[0],0($out)
	mov	@S[1],4($out)
	mov	@S[2],8($out)
	mov	@S[3],12($out)

	lea	16($out),$out
	jmp	.Lcbc_dloop

.align	16
.Lcbc_ddone:
	mov	$_ivp,%rdx
	cmp	\$0,%rcx
	jne	.Lcbc_dec_tail

	mov	@S[0],0($out)
	mov	@S[1],4($out)
	mov	@S[2],8($out)
	mov	@S[3],12($out)

	mov	%rax,(%rdx)		# write out IV residue
	mov	%rbx,8(%rdx)
	jmp	.Lcbc_done
.align	16
.Lcbc_dec_tail:
	mov	@S[0],0+$ivec
	mov	@S[1],4+$ivec
	mov	@S[2],8+$ivec
	mov	@S[3],12+$ivec

.Lcbc_dec_pushf:
	pushfq
	cld
	lea	8+$ivec,%rsi
	lea	($out),%rdi
	.long	0x9066A4F3		# rep movsb
	popfq
.Lcbc_dec_popf:

	mov	%rax,(%rdx)		# write out IV residue
	mov	%rbx,8(%rdx)
	jmp	.Lcbc_done

.align	16
.Lcbc_done:
	mov	$_rsp,%rcx
.cfi_def_cfa	%rcx,56
	mov	0(%rcx),%r15
.cfi_restore	%r15
	mov	8(%rcx),%r14
.cfi_restore	%r14
	mov	16(%rcx),%r13
.cfi_restore	%r13
	mov	24(%rcx),%r12
.cfi_restore	%r12
	mov	32(%rcx),%rbp
.cfi_restore	%rbp
	mov	40(%rcx),%rbx
.cfi_restore	%rbx
	lea	48(%rcx),%rsp
.cfi_def_cfa	%rsp,8
.Lcbc_abort:
	ret
.cfi_endproc
.size	Camellia_cbc_encrypt,.-Camellia_cbc_encrypt

.asciz	"Camellia for x86_64 by <appro\@openssl.org>"
___
}

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# Emitted for Win64 targets only: two structured-exception handlers plus
# the .pdata/.xdata records that attach them to the public entry points.
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	common_se_handler,\@abi-omnipotent
.align	16
common_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	lea	-64(%rsp),%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	lea	40(%rax),%rax
	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r13
	mov	-32(%rax),%r14
	mov	-40(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	jmp	.Lcommon_seh_exit
.size	common_se_handler,.-common_se_handler

.type	cbc_se_handler,\@abi-omnipotent
.align	16
cbc_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	lea	-64(%rsp),%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lcbc_prologue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lcbc_prologue
	jb	.Lin_cbc_prologue

	lea	.Lcbc_body(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lcbc_body
	jb	.Lin_cbc_frame_setup

	mov	152($context),%rax	# pull context->Rsp

	lea	.Lcbc_abort(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lcbc_abort
	jae	.Lin_cbc_prologue

	# handle pushf/popf in Camellia_cbc_encrypt
	lea	.Lcbc_enc_pushf(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<=.Lcbc_enc_pushf
	jbe	.Lin_cbc_no_flag
	lea	8(%rax),%rax
	lea	.Lcbc_enc_popf(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lcbc_enc_popf
	jb	.Lin_cbc_no_flag
	lea	-8(%rax),%rax
	lea	.Lcbc_dec_pushf(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<=.Lcbc_dec_pushf
	jbe	.Lin_cbc_no_flag
	lea	8(%rax),%rax
	lea	.Lcbc_dec_popf(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lcbc_dec_popf
	jb	.Lin_cbc_no_flag
	lea	-8(%rax),%rax

.Lin_cbc_no_flag:
	mov	48(%rax),%rax		# $_rsp
	lea	48(%rax),%rax

.Lin_cbc_frame_setup:
	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lin_cbc_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

.align	4
.Lcommon_seh_exit:

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	lea	64(%rsp),%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	cbc_se_handler,.-cbc_se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_Camellia_EncryptBlock_Rounds
	.rva	.LSEH_end_Camellia_EncryptBlock_Rounds
	.rva	.LSEH_info_Camellia_EncryptBlock_Rounds

	.rva	.LSEH_begin_Camellia_DecryptBlock_Rounds
	.rva	.LSEH_end_Camellia_DecryptBlock_Rounds
	.rva	.LSEH_info_Camellia_DecryptBlock_Rounds

	.rva	.LSEH_begin_Camellia_Ekeygen
	.rva	.LSEH_end_Camellia_Ekeygen
	.rva	.LSEH_info_Camellia_Ekeygen

	.rva	.LSEH_begin_Camellia_cbc_encrypt
	.rva	.LSEH_end_Camellia_cbc_encrypt
	.rva	.LSEH_info_Camellia_cbc_encrypt

.section	.xdata
.align	8
.LSEH_info_Camellia_EncryptBlock_Rounds:
	.byte	9,0,0,0
	.rva	common_se_handler
	.rva	.Lenc_prologue,.Lenc_epilogue	# HandlerData[]
.LSEH_info_Camellia_DecryptBlock_Rounds:
	.byte	9,0,0,0
	.rva	common_se_handler
	.rva	.Ldec_prologue,.Ldec_epilogue	# HandlerData[]
.LSEH_info_Camellia_Ekeygen:
	.byte	9,0,0,0
	.rva	common_se_handler
	.rva	.Lkey_prologue,.Lkey_epilogue	# HandlerData[]
.LSEH_info_Camellia_cbc_encrypt:
	.byte	9,0,0,0
	.rva	cbc_se_handler
___
}


# Replace every `...` span in the generated text with the result of
# evaluating it as Perl (arithmetic on offsets, &hi()/&lo() register
# renaming).  An expression that fails to evaluate would otherwise be
# silently replaced by an empty string, so treat that as fatal.
$code =~ s{\`([^\`]*)\`}{
	my $expr = $1;			# copy: nested matches clobber $1
	my $value = eval $expr;
	die "can't evaluate `$expr`: $@" if $@;
	$value;
}gem;

print $code;

# STDOUT is a pipe to the translator; buffered write errors and a failed
# translator only surface when the pipe is closed.
close STDOUT or die "error closing STDOUT: $!";