#! /usr/bin/env perl
# Copyright 2008-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
#
# This module may be used under the terms of either the GNU General
# Public License version 2 or later, the GNU Lesser General Public
# License version 2.1 or later, the Mozilla Public License version
# 1.1 or the BSD License. The exact terms of either license are
# distributed along with this module. For further details see
# http://www.openssl.org/~appro/camellia/.
# ====================================================================

# Performance in cycles per processed byte (less is better) in
# 'openssl speed ...' benchmark:
#
#                         AMD64   Core2   EM64T
# -evp camellia-128-ecb   16.7    21.0    22.7
# + over gcc 3.4.6        +25%    +5%     0%
#
# camellia-128-cbc        15.7    20.4    21.1
#
# 128-bit key setup       128     216     205     cycles/key
# + over gcc 3.4.6        +54%    +39%    +15%
#
# Numbers in "+" rows represent performance improvement over compiler
# generated code. Key setup timings are impressive on AMD and Core2
# thanks to 64-bit operations being covertly deployed. Improvement on
# EM64T, pre-Core2 Intel x86_64 CPU, is not as impressive, because it
# apparently emulates some of 64-bit operations in [32-bit] microcode.

# Command line: [flavour] output
#   flavour - dialect name handed to x86_64-xlate.pl; nasm, masm and
#             mingw64 additionally select the Win64 code paths below
#   output  - file name handed to x86_64-xlate.pl; a name ending in
#             ".asm" also selects the Win64 code paths
$flavour = shift;
$output  = shift;
# A first argument containing a dot is the output file name, not a flavour.
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Look for the translator next to this script, then in ../../perlasm/.
# Take the directory from the match itself so that a stale $1 is never
# used when $0 carries no directory component.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT from here on is piped through the
# translator; fail loudly if the pipe cannot be started.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# hi("%eax") -> "%ah": high-byte name of an a/b/c/d register.
# Any other register name is returned unchanged.
sub hi { my $r=shift; $r =~ s/%[er]([a-d])x/%${1}h/;	$r; }

# lo("%eax") -> "%al", lo("%esi") -> "%sil", lo("%r8d") -> "%r8b":
# low-byte name of a 32- or 64-bit register.
sub lo { my $r=shift; $r =~ s/%[er]([a-d])x/%${1}l/;
                      $r =~ s/%[er]([sd]i)/%${1}l/;
                      $r =~ s/%(r[0-9]+)[d]?/%${1}b/;	$r; }

# Register allocation shared by all generated routines.
$t0="%eax";$t1="%ebx";$t2="%ecx";$t3="%edx";
@S=("%r8d","%r9d","%r10d","%r11d");
$i0="%esi";
$i1="%edi";
$Tbl="%rbp";	# size optimization
$inp="%r12";
$out="%r13";
$key="%r14";
$keyend="%r15";
$arg0d=$win64?"%ecx":"%edi";

# const unsigned int Camellia_SBOX[4][256];
# Well, sort of... Camellia_SBOX[0][] is interleaved with [1][],
# and [2][] - with [3][]. This is done to minimize code size.
# Byte offsets of the four logical S-box tables inside .LCamellia_SBOX.
# Tables 0/1 and 2/3 are interleaved, hence the 4-byte offsets and the
# scale factor of 8 used when indexing them.
$SBOX1_1110=0;		# Camellia_SBOX[0]
$SBOX4_4404=4;		# Camellia_SBOX[1]
$SBOX2_0222=2048;	# Camellia_SBOX[2]
$SBOX3_3033=2052;	# Camellia_SBOX[3]

# Camellia_Feistel($i[,$seed])
#
# Appends the code for one Feistel round to $code.
#   $i    - round index; even rounds read @S[0],@S[1] and update
#           @S[2],@S[3], odd rounds use the halves the other way round
#   $seed - optional byte offset (default 0) added to the address of the
#           next round's subkey; a negative seed flips the step to -8 so
#           the key schedule is walked backwards
# On entry $t0/$t1 hold the current subkey; on exit they hold the subkey
# prefetched for round $i+1.
sub Camellia_Feistel {
my $i=@_[0];
my $seed=defined(@_[1])?@_[1]:0;
my $scale=$seed<0?-8:8;
my $j=($i&1)*2;
my ($s0,$s1,$s2,$s3)=(@S[($j)%4],@S[($j+1)%4],@S[($j+2)%4],@S[($j+3)%4]);

$code.=<<___;
	xor	$s0,$t0				# t0^=key[0]
	xor	$s1,$t1				# t1^=key[1]
	movz	`&hi("$t0")`,$i0		# (t0>>8)&0xff
	movz	`&lo("$t1")`,$i1		# (t1>>0)&0xff
	mov	$SBOX3_3033($Tbl,$i0,8),$t3	# t3=SBOX3_3033[0]
	mov	$SBOX1_1110($Tbl,$i1,8),$t2	# t2=SBOX1_1110[1]
	movz	`&lo("$t0")`,$i0		# (t0>>0)&0xff
	shr	\$16,$t0
	movz	`&hi("$t1")`,$i1		# (t1>>8)&0xff
	xor	$SBOX4_4404($Tbl,$i0,8),$t3	# t3^=SBOX4_4404[0]
	shr	\$16,$t1
	xor	$SBOX4_4404($Tbl,$i1,8),$t2	# t2^=SBOX4_4404[1]
	movz	`&hi("$t0")`,$i0		# (t0>>24)&0xff
	movz	`&lo("$t1")`,$i1		# (t1>>16)&0xff
	xor	$SBOX1_1110($Tbl,$i0,8),$t3	# t3^=SBOX1_1110[0]
	xor	$SBOX3_3033($Tbl,$i1,8),$t2	# t2^=SBOX3_3033[1]
	movz	`&lo("$t0")`,$i0		# (t0>>16)&0xff
	movz	`&hi("$t1")`,$i1		# (t1>>24)&0xff
	xor	$SBOX2_0222($Tbl,$i0,8),$t3	# t3^=SBOX2_0222[0]
	xor	$SBOX2_0222($Tbl,$i1,8),$t2	# t2^=SBOX2_0222[1]
	mov	`$seed+($i+1)*$scale`($key),$t1	# prefetch key[i+1]
	mov	`$seed+($i+1)*$scale+4`($key),$t0
	xor	$t3,$t2				# t2^=t3
	ror	\$8,$t3				# t3=RightRotate(t3,8)
	xor	$t2,$s2
	xor	$t2,$s3
	xor	$t3,$s3
___
}

# void Camellia_EncryptBlock_Rounds(
#		int grandRounds,
#		const Byte plaintext[],
#		const KEY_TABLE_TYPE keyTable,
#		Byte ciphertext[])
$code=<<___;
.text

# V1.x API
.globl	Camellia_EncryptBlock
.type	Camellia_EncryptBlock,\@abi-omnipotent
.align	16
Camellia_EncryptBlock:
.cfi_startproc
	movl	\$128,%eax
	subl	$arg0d,%eax
	movl	\$3,$arg0d
	adcl	\$0,$arg0d	# keyBitLength==128?3:4
	jmp	.Lenc_rounds
.cfi_endproc
.size	Camellia_EncryptBlock,.-Camellia_EncryptBlock
# V2
.globl	Camellia_EncryptBlock_Rounds
.type	Camellia_EncryptBlock_Rounds,\@function,4
.align	16
.Lenc_rounds:
Camellia_EncryptBlock_Rounds:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Lenc_prologue:

	#mov	%rsi,$inp		# put away arguments
	mov	%rcx,$out
	mov	%rdx,$key

	shl	\$6,%edi		# process grandRounds
	lea	.LCamellia_SBOX(%rip),$Tbl
	lea	($key,%rdi),$keyend

	mov	0(%rsi),@S[0]		# load plaintext
	mov	4(%rsi),@S[1]
	mov	8(%rsi),@S[2]
	bswap	@S[0]
	mov	12(%rsi),@S[3]
	bswap	@S[1]
	bswap	@S[2]
	bswap	@S[3]

	call	_x86_64_Camellia_encrypt

	bswap	@S[0]
	bswap	@S[1]
	bswap	@S[2]
	mov	@S[0],0($out)
	bswap	@S[3]
	mov	@S[1],4($out)
	mov	@S[2],8($out)
	mov	@S[3],12($out)

	mov	0(%rsp),%r15
.cfi_restore	%r15
	mov	8(%rsp),%r14
.cfi_restore	%r14
	mov	16(%rsp),%r13
.cfi_restore	%r13
	mov	24(%rsp),%rbp
.cfi_restore	%rbp
	mov	32(%rsp),%rbx
.cfi_restore	%rbx
	lea	40(%rsp),%rsp
.cfi_adjust_cfa_offset	-40
.Lenc_epilogue:
	ret
.cfi_endproc
.size	Camellia_EncryptBlock_Rounds,.-Camellia_EncryptBlock_Rounds

.type	_x86_64_Camellia_encrypt,\@abi-omnipotent
.align	16
_x86_64_Camellia_encrypt:
.cfi_startproc
	xor	0($key),@S[1]
	xor	4($key),@S[0]		# ^=key[0-3]
	xor	8($key),@S[3]
	xor	12($key),@S[2]
.align	16
.Leloop:
	mov	16($key),$t1		# prefetch key[4-5]
	mov	20($key),$t0

___
	# Six Feistel rounds per loop iteration; subkeys start 16 bytes into
	# the schedule and are walked forwards.
	for ($i=0;$i<6;$i++) { Camellia_Feistel($i,16); }
$code.=<<___;
	lea	16*4($key),$key
	cmp	$keyend,$key
	mov	8($key),$t3		# prefetch key[2-3]
	mov	12($key),$t2
	je	.Ledone

	and	@S[0],$t0
	or	@S[3],$t3
	rol	\$1,$t0
	xor	$t3,@S[2]		# s2^=s3|key[3];
	xor	$t0,@S[1]		# s1^=LeftRotate(s0&key[0],1);
	and	@S[2],$t2
	or	@S[1],$t1
	rol	\$1,$t2
	xor	$t1,@S[0]		# s0^=s1|key[1];
	xor	$t2,@S[3]		# s3^=LeftRotate(s2&key[2],1);
	jmp	.Leloop

.align	16
.Ledone:
	xor	@S[2],$t0		# SwapHalf
	xor	@S[3],$t1
	xor	@S[0],$t2
	xor	@S[1],$t3

	mov	$t0,@S[0]
	mov	$t1,@S[1]
	mov	$t2,@S[2]
	mov	$t3,@S[3]

	.byte	0xf3,0xc3		# rep ret
.cfi_endproc
.size	_x86_64_Camellia_encrypt,.-_x86_64_Camellia_encrypt

# V1.x API
.globl	Camellia_DecryptBlock
.type	Camellia_DecryptBlock,\@abi-omnipotent
.align	16
Camellia_DecryptBlock:
.cfi_startproc
	movl	\$128,%eax
	subl	$arg0d,%eax
	movl	\$3,$arg0d
	adcl	\$0,$arg0d	# keyBitLength==128?3:4
	jmp	.Ldec_rounds
.cfi_endproc
.size	Camellia_DecryptBlock,.-Camellia_DecryptBlock
# V2
.globl	Camellia_DecryptBlock_Rounds
.type	Camellia_DecryptBlock_Rounds,\@function,4
.align	16
.Ldec_rounds:
Camellia_DecryptBlock_Rounds:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Ldec_prologue:

	#mov	%rsi,$inp		# put away arguments
	mov	%rcx,$out
	mov	%rdx,$keyend

	shl	\$6,%edi		# process grandRounds
	lea	.LCamellia_SBOX(%rip),$Tbl
	lea	($keyend,%rdi),$key

	mov	0(%rsi),@S[0]		# load plaintext
	mov	4(%rsi),@S[1]
	mov	8(%rsi),@S[2]
	bswap	@S[0]
	mov	12(%rsi),@S[3]
	bswap	@S[1]
	bswap	@S[2]
	bswap	@S[3]

	call	_x86_64_Camellia_decrypt

	bswap	@S[0]
	bswap	@S[1]
	bswap	@S[2]
	mov	@S[0],0($out)
	bswap	@S[3]
	mov	@S[1],4($out)
	mov	@S[2],8($out)
	mov	@S[3],12($out)

	mov	0(%rsp),%r15
.cfi_restore	%r15
	mov	8(%rsp),%r14
.cfi_restore	%r14
	mov	16(%rsp),%r13
.cfi_restore	%r13
	mov	24(%rsp),%rbp
.cfi_restore	%rbp
	mov	32(%rsp),%rbx
.cfi_restore	%rbx
	lea	40(%rsp),%rsp
.cfi_adjust_cfa_offset	-40
.Ldec_epilogue:
	ret
.cfi_endproc
.size	Camellia_DecryptBlock_Rounds,.-Camellia_DecryptBlock_Rounds

.type	_x86_64_Camellia_decrypt,\@abi-omnipotent
.align	16
_x86_64_Camellia_decrypt:
.cfi_startproc
	xor	0($key),@S[1]
	xor	4($key),@S[0]		# ^=key[0-3]
	xor	8($key),@S[3]
	xor	12($key),@S[2]
.align	16
.Ldloop:
	mov	-8($key),$t1		# prefetch key[4-5]
	mov	-4($key),$t0

___
	# Same six rounds as encryption, but the negative seed makes
	# Camellia_Feistel step through the subkeys backwards.
	for ($i=0;$i<6;$i++) { Camellia_Feistel($i,-8); }
$code.=<<___;
	lea	-16*4($key),$key
	cmp	$keyend,$key
	mov	0($key),$t3		# prefetch key[2-3]
	mov	4($key),$t2
	je	.Lddone

	and	@S[0],$t0
	or	@S[3],$t3
	rol	\$1,$t0
	xor	$t3,@S[2]		# s2^=s3|key[3];
	xor	$t0,@S[1]		# s1^=LeftRotate(s0&key[0],1);
	and	@S[2],$t2
	or	@S[1],$t1
	rol	\$1,$t2
	xor	$t1,@S[0]		# s0^=s1|key[1];
	xor	$t2,@S[3]		# s3^=LeftRotate(s2&key[2],1);

	jmp	.Ldloop

.align	16
.Lddone:
	xor	@S[2],$t2
	xor	@S[3],$t3
	xor	@S[0],$t0
	xor	@S[1],$t1

	mov	$t2,@S[0]		# SwapHalf
	mov	$t3,@S[1]
	mov	$t0,@S[2]
	mov	$t1,@S[3]

	.byte	0xf3,0xc3		# rep ret
.cfi_endproc
.size	_x86_64_Camellia_decrypt,.-_x86_64_Camellia_decrypt
___

# _saveround($rnd,$key,[$bias,]@regs)
#
# Appends stores of @regs into the key schedule at $bias+$rnd*8($key).
# A leading element of @regs that is a non-zero number is taken as the
# bias; register names evaluate to 0 under int() and are left in place.
# With four registers they are stored pairwise swapped (1,0,3,2) at
# offsets +0,+4,+8,+12; otherwise the first register goes to +0 and the
# second, when present, to +8.
sub _saveround {
my ($rnd,$key,@T)=@_;
my $bias=int(@T[0])?shift(@T):0;

    if ($#T==3) {
	$code.=<<___;
	mov	@T[1],`$bias+$rnd*8+0`($key)
	mov	@T[0],`$bias+$rnd*8+4`($key)
	mov	@T[3],`$bias+$rnd*8+8`($key)
	mov	@T[2],`$bias+$rnd*8+12`($key)
___
    } else {
	$code.="	mov	@T[0],`$bias+$rnd*8+0`($key)\n";
	$code.="	mov	@T[1],`$bias+$rnd*8+8`($key)\n"	if ($#T>=1);
    }
}

# _loadround($rnd,$key,[$bias,]@regs)
#
# Counterpart of _saveround for one or two registers: appends loads
# from $bias+$rnd*8($key) into the first register and, when a second one
# is given, from 8 bytes further on. The optional bias is detected the
# same way as in _saveround.
sub _loadround {
my ($rnd,$key,@T)=@_;
my $bias=int(@T[0])?shift(@T):0;

$code.="	mov	`$bias+$rnd*8+0`($key),@T[0]\n";
$code.="	mov	`$bias+$rnd*8+8`($key),@T[1]\n"	if ($#T>=1);
}

# shld is very slow on Intel EM64T family. Even on AMD it limits
# instruction decode rate [because it's VectorPath] and consequently
# performance...
# __rotl128($i0,$i1,$rot)
#
# shld-based 128-bit left rotate of the register pair $i0:$i1 by $rot
# bits, using %r11 as scratch. Emits nothing when $rot is 0. All callers
# visible in this file use _rotl128 instead.
sub __rotl128 {
my ($i0,$i1,$rot)=@_;

    if ($rot) {
	$code.=<<___;
	mov	$i0,%r11
	shld	\$$rot,$i1,$i0
	shld	\$$rot,%r11,$i1
___
    }
}

# ... Implementing 128-bit rotate without shld gives 80% better
# performance on EM64T, +15% on AMD64 and only ~7% degradation on
# Core2. This is therefore preferred.
#
# _rotl128($i0,$i1,$rot)
#
# Rotates the 128-bit value held in $i0 (upper half) and $i1 (lower
# half) left by $rot bits using plain shifts. Clobbers %r9 and %r11.
# Emits nothing when $rot is 0; the shift counts $rot and 64-$rot are
# emitted as immediates.
sub _rotl128 {
my ($i0,$i1,$rot)=@_;

    if ($rot) {
	$code.=<<___;
	mov	$i0,%r11
	shl	\$$rot,$i0
	mov	$i1,%r9
	shr	\$`64-$rot`,%r9
	shr	\$`64-$rot`,%r11
	or	%r9,$i0
	shl	\$$rot,$i1
	or	%r11,$i1
___
    }
}

# Key schedule generator. $step counts the Feistel rounds emitted so far,
# which selects both the register roles and the SIGMA constant offset.
{ my $step=0;

$code.=<<___;
.globl	Camellia_Ekeygen
.type	Camellia_Ekeygen,\@function,3
.align	16
Camellia_Ekeygen:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Lkey_prologue:

	mov	%edi,${keyend}d		# put away arguments, keyBitLength
	mov	%rdx,$out		# keyTable

	mov	0(%rsi),@S[0]		# load 0-127 bits
	mov	4(%rsi),@S[1]
	mov	8(%rsi),@S[2]
	mov	12(%rsi),@S[3]

	bswap	@S[0]
	bswap	@S[1]
	bswap	@S[2]
	bswap	@S[3]
___
	&_saveround	(0,$out,@S);	# KL<<<0
$code.=<<___;
	cmp	\$128,$keyend		# check keyBitLength
	je	.L1st128

	mov	16(%rsi),@S[0]		# load 128-191 bits
	mov	20(%rsi),@S[1]
	cmp	\$192,$keyend
	je	.L1st192
	mov	24(%rsi),@S[2]		# load 192-255 bits
	mov	28(%rsi),@S[3]
	jmp	.L1st256
.L1st192:
	mov	@S[0],@S[2]
	mov	@S[1],@S[3]
	not	@S[2]
	not	@S[3]
.L1st256:
	bswap	@S[0]
	bswap	@S[1]
	bswap	@S[2]
	bswap	@S[3]
___
	&_saveround	(4,$out,@S);	# temp storage for KR!
$code.=<<___;
	xor	0($out),@S[1]		# KR^KL
	xor	4($out),@S[0]
	xor	8($out),@S[3]
	xor	12($out),@S[2]

.L1st128:
	lea	.LCamellia_SIGMA(%rip),$key
	lea	.LCamellia_SBOX(%rip),$Tbl

	mov	0($key),$t1
	mov	4($key),$t0
___
	&Camellia_Feistel($step++);
	&Camellia_Feistel($step++);
$code.=<<___;
	xor	0($out),@S[1]		# ^KL
	xor	4($out),@S[0]
	xor	8($out),@S[3]
	xor	12($out),@S[2]
___
	&Camellia_Feistel($step++);
	&Camellia_Feistel($step++);
$code.=<<___;
	cmp	\$128,$keyend
	jne	.L2nd256

	lea	128($out),$out		# size optimization
	shl	\$32,%r8		# @S[0]||
	shl	\$32,%r10		# @S[2]||
	or	%r9,%r8			# ||@S[1]
	or	%r11,%r10		# ||@S[3]
___
	# 128-bit key: emit the rotated KL/KA subkeys. $out was advanced by
	# 128 above, so every access carries a -128 bias.
	&_loadround	(0,$out,-128,"%rax","%rbx");	# KL
	&_saveround	(2,$out,-128,"%r8","%r10");	# KA<<<0
	&_rotl128	("%rax","%rbx",15);
	&_saveround	(4,$out,-128,"%rax","%rbx");	# KL<<<15
	&_rotl128	("%r8","%r10",15);
	&_saveround	(6,$out,-128,"%r8","%r10");	# KA<<<15
	&_rotl128	("%r8","%r10",15);		# 15+15=30
	&_saveround	(8,$out,-128,"%r8","%r10");	# KA<<<30
	&_rotl128	("%rax","%rbx",30);		# 15+30=45
	&_saveround	(10,$out,-128,"%rax","%rbx");	# KL<<<45
	&_rotl128	("%r8","%r10",15);		# 30+15=45
	&_saveround	(12,$out,-128,"%r8");		# KA<<<45
	&_rotl128	("%rax","%rbx",15);		# 45+15=60
	&_saveround	(13,$out,-128,"%rbx");		# KL<<<60
	&_rotl128	("%r8","%r10",15);		# 45+15=60
	&_saveround	(14,$out,-128,"%r8","%r10");	# KA<<<60
	&_rotl128	("%rax","%rbx",17);		# 60+17=77
	&_saveround	(16,$out,-128,"%rax","%rbx");	# KL<<<77
	&_rotl128	("%rax","%rbx",17);		# 77+17=94
	&_saveround	(18,$out,-128,"%rax","%rbx");	# KL<<<94
	&_rotl128	("%r8","%r10",34);		# 60+34=94
	&_saveround	(20,$out,-128,"%r8","%r10");	# KA<<<94
	&_rotl128	("%rax","%rbx",17);		# 94+17=111
	&_saveround	(22,$out,-128,"%rax","%rbx");	# KL<<<111
	&_rotl128	("%r8","%r10",17);		# 94+17=111
	&_saveround	(24,$out,-128,"%r8","%r10");	# KA<<<111
$code.=<<___;
	mov	\$3,%eax
	jmp	.Ldone
.align	16
.L2nd256:
___
	&_saveround	(6,$out,@S);	# temp storage for KA!
$code.=<<___;
	xor	`4*8+0`($out),@S[1]	# KA^KR
	xor	`4*8+4`($out),@S[0]
	xor	`5*8+0`($out),@S[3]
	xor	`5*8+4`($out),@S[2]
___
	&Camellia_Feistel($step++);
	&Camellia_Feistel($step++);

	&_loadround	(0,$out,"%rax","%rbx");	# KL
	&_loadround	(4,$out,"%rcx","%rdx");	# KR
	&_loadround	(6,$out,"%r14","%r15");	# KA
$code.=<<___;
	lea	128($out),$out		# size optimization
	shl	\$32,%r8		# @S[0]||
	shl	\$32,%r10		# @S[2]||
	or	%r9,%r8			# ||@S[1]
	or	%r11,%r10		# ||@S[3]
___
	# 192/256-bit key: emit the rotated KL/KR/KA/KB subkeys, again with
	# the -128 bias that compensates for the advanced $out.
	&_saveround	(2,$out,-128,"%r8","%r10");	# KB<<<0
	&_rotl128	("%rcx","%rdx",15);
	&_saveround	(4,$out,-128,"%rcx","%rdx");	# KR<<<15
	&_rotl128	("%r14","%r15",15);
	&_saveround	(6,$out,-128,"%r14","%r15");	# KA<<<15
	&_rotl128	("%rcx","%rdx",15);		# 15+15=30
	&_saveround	(8,$out,-128,"%rcx","%rdx");	# KR<<<30
	&_rotl128	("%r8","%r10",30);
	&_saveround	(10,$out,-128,"%r8","%r10");	# KB<<<30
	&_rotl128	("%rax","%rbx",45);
	&_saveround	(12,$out,-128,"%rax","%rbx");	# KL<<<45
	&_rotl128	("%r14","%r15",30);		# 15+30=45
	&_saveround	(14,$out,-128,"%r14","%r15");	# KA<<<45
	&_rotl128	("%rax","%rbx",15);		# 45+15=60
	&_saveround	(16,$out,-128,"%rax","%rbx");	# KL<<<60
	&_rotl128	("%rcx","%rdx",30);		# 30+30=60
	&_saveround	(18,$out,-128,"%rcx","%rdx");	# KR<<<60
	&_rotl128	("%r8","%r10",30);		# 30+30=60
	&_saveround	(20,$out,-128,"%r8","%r10");	# KB<<<60
	&_rotl128	("%rax","%rbx",17);		# 60+17=77
	&_saveround	(22,$out,-128,"%rax","%rbx");	# KL<<<77
	&_rotl128	("%r14","%r15",32);		# 45+32=77
	&_saveround	(24,$out,-128,"%r14","%r15");	# KA<<<77
	&_rotl128	("%rcx","%rdx",34);		# 60+34=94
	&_saveround	(26,$out,-128,"%rcx","%rdx");	# KR<<<94
	&_rotl128	("%r14","%r15",17);		# 77+17=94
	&_saveround	(28,$out,-128,"%r14","%r15");	# KA<<<94
	&_rotl128	("%rax","%rbx",34);		# 77+34=111
	&_saveround	(30,$out,-128,"%rax","%rbx");	# KL<<<111
	&_rotl128	("%r8","%r10",51);		# 60+51=111
	&_saveround	(32,$out,-128,"%r8","%r10");	# KB<<<111
$code.=<<___;
	mov	\$4,%eax
.Ldone:
	mov	0(%rsp),%r15
.cfi_restore	%r15
	mov	8(%rsp),%r14
.cfi_restore	%r14
	mov	16(%rsp),%r13
.cfi_restore	%r13
	mov	24(%rsp),%rbp
.cfi_restore	%rbp
	mov	32(%rsp),%rbx
.cfi_restore	%rbx
	lea	40(%rsp),%rsp
.cfi_adjust_cfa_offset	-40
.Lkey_epilogue:
	ret
.cfi_endproc
.size	Camellia_Ekeygen,.-Camellia_Ekeygen
___
}

# Base 8-bit S-box from which the four 32-bit lookup tables are derived.
@SBOX=(
112,130, 44,236,179, 39,192,229,228,133, 87, 53,234, 12,174, 65,
 35,239,107,147, 69, 25,165, 33,237, 14, 79, 78, 29,101,146,189,
134,184,175,143,124,235, 31,206, 62, 48,220, 95, 94,197, 11, 26,
166,225, 57,202,213, 71, 93, 61,217,  1, 90,214, 81, 86,108, 77,
139, 13,154,102,251,204,176, 45,116, 18, 43, 32,240,177,132,153,
223, 76,203,194, 52,126,118,  5,109,183,169, 49,209, 23,  4,215,
 20, 88, 58, 97,222, 27, 17, 28, 50, 15,156, 22, 83, 24,242, 34,
254, 68,207,178,195,181,122,145, 36,  8,232,168, 96,252,105, 80,
170,208,160,125,161,137, 98,151, 84, 91, 30,149,224,255,100,210,
 16,196,  0, 72,163,247,117,219,138,  3,230,218,  9, 63,221,148,
135, 92,131,  2,205, 74,144, 51,115,103,246,243,157,127,191,226,
 82,155,216, 38,200, 55,198, 59,129,150,111, 75, 19,190, 99, 46,
233,121,167,140,159,110,188,142, 41,245,249,182, 47,253,180, 89,
120,152,  6,106,231, 70,113,186,212, 37,171, 66,136,162,141,250,
114,  7,185, 85,248,238,172, 10, 54, 73, 42,104, 60, 56,241,164,
 64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158);

# Each helper returns one 32-bit table entry as a "0x%08x" string.
# S1110: SBOX[i] placed in the three upper bytes.
# S4404: SBOX[i rotated left by 1] placed in bytes 3, 2 and 0.
# S0222: SBOX[i] rotated left by 1, placed in the three lower bytes.
# S3033: SBOX[i] rotated right by 1, placed in bytes 3, 1 and 0.
sub S1110 { my $i=shift; $i=@SBOX[$i]; $i=$i<<24|$i<<16|$i<<8; sprintf("0x%08x",$i); }
sub S4404 { my $i=shift; $i=($i<<1|$i>>7)&0xff; $i=@SBOX[$i]; $i=$i<<24|$i<<16|$i; sprintf("0x%08x",$i); }
sub S0222 { my $i=shift; $i=@SBOX[$i]; $i=($i<<1|$i>>7)&0xff; $i=$i<<16|$i<<8|$i; sprintf("0x%08x",$i); }
sub S3033 { my $i=shift; $i=@SBOX[$i]; $i=($i>>1|$i<<7)&0xff; $i=$i<<24|$i<<8|$i; sprintf("0x%08x",$i); }

$code.=<<___;
.align	64
.LCamellia_SIGMA:
.long	0x3bcc908b, 0xa09e667f, 0x4caa73b2, 0xb67ae858
.long	0xe94f82be, 0xc6ef372f, 0xf1d36f1c, 0x54ff53a5
.long	0xde682d1d, 0x10e527fa, 0xb3e6c1fd, 0xb05688c2
.long	0, 0, 0, 0
.LCamellia_SBOX:
___
# tables are interleaved, remember?
# data_word(@values) appends one ".long" directive listing @values.
sub data_word { $code.=".long\t".join(',',@_)."\n"; }
for ($i=0;$i<256;$i++) { &data_word(&S1110($i),&S4404($i)); }
for ($i=0;$i<256;$i++) { &data_word(&S0222($i),&S3033($i)); }

# void Camellia_cbc_encrypt (const unsigned char *inp, unsigned char *out,
#			size_t length, const CAMELLIA_KEY *key,
#			unsigned char *ivp,const int enc);
{
# Stack frame slots, all relative to the realigned %rsp.
$_key="0(%rsp)";
$_end="8(%rsp)";	# inp+len&~15
$_res="16(%rsp)";	# len&15
$ivec="24(%rsp)";
$_ivp="40(%rsp)";
$_rsp="48(%rsp)";

$code.=<<___;
.globl	Camellia_cbc_encrypt
.type	Camellia_cbc_encrypt,\@function,6
.align	16
Camellia_cbc_encrypt:
.cfi_startproc
	cmp	\$0,%rdx
	je	.Lcbc_abort
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Lcbc_prologue:

	mov	%rsp,%rbp
.cfi_def_cfa_register	%rbp
	sub	\$64,%rsp
	and	\$-64,%rsp

	# place stack frame just "above mod 1024" the key schedule,
	# this ensures that cache associativity suffices
	lea	-64-63(%rcx),%r10
	sub	%rsp,%r10
	neg	%r10
	and	\$0x3C0,%r10
	sub	%r10,%rsp
	#add	\$8,%rsp		# 8 is reserved for callee's ra

	mov	%rdi,$inp		# inp argument
	mov	%rsi,$out		# out argument
	mov	%r8,%rbx		# ivp argument
	mov	%rcx,$key		# key argument
	mov	272(%rcx),${keyend}d	# grandRounds

	mov	%r8,$_ivp
	mov	%rbp,$_rsp
.cfi_cfa_expression	$_rsp,deref,+56

.Lcbc_body:
	lea	.LCamellia_SBOX(%rip),$Tbl

	mov	\$32,%ecx
.align	4
.Lcbc_prefetch_sbox:
	mov	0($Tbl),%rax
	mov	32($Tbl),%rsi
	mov	64($Tbl),%rdi
	mov	96($Tbl),%r11
	lea	128($Tbl),$Tbl
	loop	.Lcbc_prefetch_sbox
	sub	\$4096,$Tbl
	shl	\$6,$keyend
	mov	%rdx,%rcx		# len argument
	lea	($key,$keyend),$keyend

	cmp	\$0,%r9d		# enc argument
	je	.LCBC_DECRYPT

	and	\$-16,%rdx
	and	\$15,%rcx		# length residue
	lea	($inp,%rdx),%rdx
	mov	$key,$_key
	mov	%rdx,$_end
	mov	%rcx,$_res

	cmp	$inp,%rdx
	mov	0(%rbx),@S[0]		# load IV
	mov	4(%rbx),@S[1]
	mov	8(%rbx),@S[2]
	mov	12(%rbx),@S[3]
	je	.Lcbc_enc_tail
	jmp	.Lcbc_eloop

.align	16
.Lcbc_eloop:
	xor	0($inp),@S[0]
	xor	4($inp),@S[1]
	xor	8($inp),@S[2]
	bswap	@S[0]
	xor	12($inp),@S[3]
	bswap	@S[1]
	bswap	@S[2]
	bswap	@S[3]

	call	_x86_64_Camellia_encrypt

	mov	$_key,$key		# "rewind" the key
	bswap	@S[0]
	mov	$_end,%rdx
	bswap	@S[1]
	mov	$_res,%rcx
	bswap	@S[2]
	mov	@S[0],0($out)
	bswap	@S[3]
	mov	@S[1],4($out)
	mov	@S[2],8($out)
	lea	16($inp),$inp
	mov	@S[3],12($out)
	cmp	%rdx,$inp
	lea	16($out),$out
	jne	.Lcbc_eloop

	cmp	\$0,%rcx
	jne	.Lcbc_enc_tail

	mov	$_ivp,$out
	mov	@S[0],0($out)		# write out IV residue
	mov	@S[1],4($out)
	mov	@S[2],8($out)
	mov	@S[3],12($out)
	jmp	.Lcbc_done

.align	16
.Lcbc_enc_tail:
	xor	%rax,%rax
	mov	%rax,0+$ivec
	mov	%rax,8+$ivec
	mov	%rax,$_res

.Lcbc_enc_pushf:
	pushfq
	cld
	mov	$inp,%rsi
	lea	8+$ivec,%rdi
	.long	0x9066A4F3		# rep movsb
	popfq
.Lcbc_enc_popf:

	lea	$ivec,$inp
	lea	16+$ivec,%rax
	mov	%rax,$_end
	jmp	.Lcbc_eloop		# one more time

.align	16
.LCBC_DECRYPT:
	xchg	$key,$keyend
	add	\$15,%rdx
	and	\$15,%rcx		# length residue
	and	\$-16,%rdx
	mov	$key,$_key
	lea	($inp,%rdx),%rdx
	mov	%rdx,$_end
	mov	%rcx,$_res

	mov	(%rbx),%rax		# load IV
	mov	8(%rbx),%rbx
	jmp	.Lcbc_dloop
.align	16
.Lcbc_dloop:
	mov	0($inp),@S[0]
	mov	4($inp),@S[1]
	mov	8($inp),@S[2]
	bswap	@S[0]
	mov	12($inp),@S[3]
	bswap	@S[1]
	mov	%rax,0+$ivec		# save IV to temporary storage
	bswap	@S[2]
	mov	%rbx,8+$ivec
	bswap	@S[3]

	call	_x86_64_Camellia_decrypt

	mov	$_key,$key		# "rewind" the key
	mov	$_end,%rdx
	mov	$_res,%rcx

	bswap	@S[0]
	mov	($inp),%rax		# load IV for next iteration
	bswap	@S[1]
	mov	8($inp),%rbx
	bswap	@S[2]
	xor	0+$ivec,@S[0]
	bswap	@S[3]
	xor	4+$ivec,@S[1]
	xor	8+$ivec,@S[2]
	lea	16($inp),$inp
	xor	12+$ivec,@S[3]
	cmp	%rdx,$inp
	je	.Lcbc_ddone

	mov	@S[0],0($out)
	mov	@S[1],4($out)
	mov	@S[2],8($out)
	mov	@S[3],12($out)

	lea	16($out),$out
	jmp	.Lcbc_dloop

.align	16
.Lcbc_ddone:
	mov	$_ivp,%rdx
	cmp	\$0,%rcx
	jne	.Lcbc_dec_tail

	mov	@S[0],0($out)
	mov	@S[1],4($out)
	mov	@S[2],8($out)
	mov	@S[3],12($out)

	mov	%rax,(%rdx)		# write out IV residue
	mov	%rbx,8(%rdx)
	jmp	.Lcbc_done
.align	16
.Lcbc_dec_tail:
	mov	@S[0],0+$ivec
	mov	@S[1],4+$ivec
	mov	@S[2],8+$ivec
	mov	@S[3],12+$ivec

.Lcbc_dec_pushf:
	pushfq
	cld
	lea	8+$ivec,%rsi
	lea	($out),%rdi
	.long	0x9066A4F3		# rep movsb
	popfq
.Lcbc_dec_popf:

	mov	%rax,(%rdx)		# write out IV residue
	mov	%rbx,8(%rdx)
	jmp	.Lcbc_done

.align	16
.Lcbc_done:
	mov	$_rsp,%rcx
.cfi_def_cfa	%rcx,56
	mov	0(%rcx),%r15
.cfi_restore	%r15
	mov	8(%rcx),%r14
.cfi_restore	%r14
	mov	16(%rcx),%r13
.cfi_restore	%r13
	mov	24(%rcx),%r12
.cfi_restore	%r12
	mov	32(%rcx),%rbp
.cfi_restore	%rbp
	mov	40(%rcx),%rbx
.cfi_restore	%rbx
	lea	48(%rcx),%rsp
.cfi_def_cfa	%rsp,8
.Lcbc_abort:
	ret
.cfi_endproc
.size	Camellia_cbc_encrypt,.-Camellia_cbc_encrypt

.asciz	"Camellia for x86_64 by <appro\@openssl.org>"
___
}

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
# The handlers and unwind tables below are emitted only when $win64 is set.
if ($win64) {
# Registers carrying the four handler arguments listed above.
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	common_se_handler,\@abi-omnipotent
.align	16
common_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	lea	-64(%rsp),%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	lea	40(%rax),%rax
	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r13
	mov	-32(%rax),%r14
	mov	-40(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	jmp	.Lcommon_seh_exit
.size	common_se_handler,.-common_se_handler

.type	cbc_se_handler,\@abi-omnipotent
.align	16
cbc_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	lea	-64(%rsp),%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lcbc_prologue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lcbc_prologue
	jb	.Lin_cbc_prologue

	lea	.Lcbc_body(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lcbc_body
	jb	.Lin_cbc_frame_setup

	mov	152($context),%rax	# pull context->Rsp

	lea	.Lcbc_abort(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lcbc_abort
	jae	.Lin_cbc_prologue

	# handle pushf/popf in Camellia_cbc_encrypt
	lea	.Lcbc_enc_pushf(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<=.Lcbc_enc_pushf
	jbe	.Lin_cbc_no_flag
	lea	8(%rax),%rax
	lea	.Lcbc_enc_popf(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lcbc_enc_popf
	jb	.Lin_cbc_no_flag
	lea	-8(%rax),%rax
	lea	.Lcbc_dec_pushf(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<=.Lcbc_dec_pushf
	jbe	.Lin_cbc_no_flag
	lea	8(%rax),%rax
	lea	.Lcbc_dec_popf(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lcbc_dec_popf
	jb	.Lin_cbc_no_flag
	lea	-8(%rax),%rax

.Lin_cbc_no_flag:
	mov	48(%rax),%rax		# $_rsp
	lea	48(%rax),%rax

.Lin_cbc_frame_setup:
	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lin_cbc_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

.align	4
.Lcommon_seh_exit:

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	lea	64(%rsp),%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	cbc_se_handler,.-cbc_se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_Camellia_EncryptBlock_Rounds
	.rva	.LSEH_end_Camellia_EncryptBlock_Rounds
	.rva	.LSEH_info_Camellia_EncryptBlock_Rounds

	.rva	.LSEH_begin_Camellia_DecryptBlock_Rounds
	.rva	.LSEH_end_Camellia_DecryptBlock_Rounds
	.rva	.LSEH_info_Camellia_DecryptBlock_Rounds

	.rva	.LSEH_begin_Camellia_Ekeygen
	.rva	.LSEH_end_Camellia_Ekeygen
	.rva	.LSEH_info_Camellia_Ekeygen

	.rva	.LSEH_begin_Camellia_cbc_encrypt
	.rva	.LSEH_end_Camellia_cbc_encrypt
	.rva	.LSEH_info_Camellia_cbc_encrypt

.section	.xdata
.align	8
.LSEH_info_Camellia_EncryptBlock_Rounds:
	.byte	9,0,0,0
	.rva	common_se_handler
	.rva	.Lenc_prologue,.Lenc_epilogue	# HandlerData[]
.LSEH_info_Camellia_DecryptBlock_Rounds:
	.byte	9,0,0,0
	.rva	common_se_handler
	.rva	.Ldec_prologue,.Ldec_epilogue	# HandlerData[]
.LSEH_info_Camellia_Ekeygen:
	.byte	9,0,0,0
	.rva	common_se_handler
	.rva	.Lkey_prologue,.Lkey_epilogue	# HandlerData[]
.LSEH_info_Camellia_cbc_encrypt:
	.byte	9,0,0,0
	.rva	cbc_se_handler
___
}

# Replace every `...` span in the generated text with the value of the
# Perl expression it contains, then send the result to the translator
# pipe and make sure the pipe closed without error.
$code =~ s{\`([^\`]*)\`}{eval($1)}gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";