#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# sha1_block procedure for x86_64.
#
# It was brought to my attention that on EM64T compiler-generated code
# was far behind 32-bit assembler implementation. This is unlike on
# Opteron where compiler-generated code was only 15% behind 32-bit
# assembler, which originally made it hard to motivate the effort.
# There was suggestion to mechanically translate 32-bit code, but I
# dismissed it, reasoning that x86_64 offers enough register bank
# capacity to fully utilize SHA-1 parallelism. Therefore this fresh
# implementation:-) However! While 64-bit code does perform better
# on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
# x86_64 does offer larger *addressable* bank, but out-of-order core
# reaches for even more registers through dynamic aliasing, and EM64T
# core must have managed to run-time optimize even 32-bit code just as
# good as 64-bit one. Performance improvement is summarized in the
# following table:
#
#		gcc 3.4		32-bit asm	cycles/byte
# Opteron	+45%		+20%		6.8
# Xeon P4	+65%		+0%		9.9
# Core2		+60%		+10%		7.0

# August 2009.
#
# The code was revised to minimize code size and to maximize
# "distance" between instructions producing input to 'lea'
# instruction and the 'lea' instruction itself, which is essential
# for Intel Atom core.

use strict;
use warnings;

# Usage: <script> [flavour] [output]
# If the first argument contains a dot it is taken to be the output file
# name and no flavour is passed on.
my $flavour = shift;
my $output  = shift;
if (defined $flavour && $flavour =~ /\./) { $output = $flavour; undef $flavour; }

# The Win64 SEH tables are emitted for MASM/NASM/mingw64 flavours or when
# the output file has an .asm extension.
my $win64 = 0;
$win64 = 1 if ((defined $flavour && $flavour =~ /[nm]asm|mingw64/) ||
               (defined $output  && $output  =~ /\.asm$/));

# Locate x86_64-xlate.pl next to this script or in ../../perlasm/.
my $dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : '';
my $xlate;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT from here on is piped through the translator.
# The interpreter, translator and output paths are quoted so that paths
# containing spaces survive the shell; absent arguments are simply omitted.
my $xlate_cmd = qq{"$^X" "$xlate"};
$xlate_cmd .= " $flavour"    if (defined $flavour && length $flavour);
$xlate_cmd .= qq{ "$output"} if (defined $output  && length $output);
open(STDOUT, '|-', $xlate_cmd) or die "can't call $xlate: $!";

# The function receives its arguments in %rdi (1st, ctx), %rsi (2nd, inp)
# and %rdx (3rd, num); the prologue copies them into the registers below
# in order to produce more compact code.
my $ctx="%r8";
my $inp="%r9";
my $num="%r10";

# Scratch registers.
my $t0="%eax";
my $t1="%ebx";
my $t2="%ecx";
# Pair of registers alternating between "current" ($xi[0]) and "next"
# ($xi[1]) message-schedule word; rotated after every generated round.
my @xi=("%edx","%ebp");
# Working variables of the hash state.
my $A="%esi";
my $B="%edi";
my $C="%r11d";
my $D="%r12d";
my $E="%r13d";

# Register assignment for (a,b,c,d,e); rotated after every generated round.
my @V=($A,$B,$C,$D,$E);

# Accumulates the generated assembly text.
my $code="";

# BODY_00_19($i,$a,$b,$c,$d,$e)
#
# Appends the instructions for round $i (0..19) to $code.  $a..$e are the
# register names currently holding the working variables.  The round adds
# rol(a,5) + (((c^d)&b)^d) + 0x5a827999 + $xi[0] to e and rotates b left
# by 30.  For $i<15 the next input word is loaded from $inp, byte-swapped
# and stored on the stack; for $i>=15 the next word is computed by xoring
# four stack slots and rotating the result left by 1.  Round 0 additionally
# loads, byte-swaps and stores the very first input word.
sub BODY_00_19 {
    my ($i,$a,$b,$c,$d,$e)=@_;
    my $j=$i+1;
    $code.=<<___ if ($i==0);
	mov	`4*$i`($inp),$xi[0]
	bswap	$xi[0]
	mov	$xi[0],`4*$i`(%rsp)
___
    $code.=<<___ if ($i<15);
	mov	$c,$t0
	mov	`4*$j`($inp),$xi[1]
	mov	$a,$t2
	xor	$d,$t0
	bswap	$xi[1]
	rol	\$5,$t2
	lea	0x5a827999($xi[0],$e),$e
	and	$b,$t0
	mov	$xi[1],`4*$j`(%rsp)
	add	$t2,$e
	xor	$d,$t0
	rol	\$30,$b
	add	$t0,$e
___
    $code.=<<___ if ($i>=15);
	mov	`4*($j%16)`(%rsp),$xi[1]
	mov	$c,$t0
	mov	$a,$t2
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	xor	$d,$t0
	rol	\$5,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	and	$b,$t0
	lea	0x5a827999($xi[0],$e),$e
	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
	xor	$d,$t0
	rol	\$1,$xi[1]
	add	$t2,$e
	rol	\$30,$b
	mov	$xi[1],`4*($j%16)`(%rsp)
	add	$t0,$e
___
    # Swap the roles of the two schedule registers for the next round.
    unshift(@xi,pop(@xi));
    return;
}

# BODY_20_39($i,$a,$b,$c,$d,$e)
#
# Appends the instructions for round $i to $code; it is used for rounds
# 20..39 (constant 0x6ed9eba1) and 60..79 (constant 0xca62c1d6).  The round
# adds rol(a,5) + (b^c^d) + K + $xi[0] to e and rotates b left by 30.
# For $i<79 the next schedule word is computed as well, but it is written
# back to the stack only for $i<76.  Round 79 emits the round function
# alone, with no schedule update.
sub BODY_20_39 {
    my ($i,$a,$b,$c,$d,$e)=@_;
    my $j=$i+1;
    my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
    $code.=<<___ if ($i<79);
	mov	`4*($j%16)`(%rsp),$xi[1]
	mov	$c,$t0
	mov	$a,$t2
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	xor	$b,$t0
	rol	\$5,$t2
	lea	$K($xi[0],$e),$e
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	xor	$d,$t0
	add	$t2,$e
	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
	rol	\$30,$b
	add	$t0,$e
	rol	\$1,$xi[1]
___
    $code.=<<___ if ($i<76);
	mov	$xi[1],`4*($j%16)`(%rsp)
___
    $code.=<<___ if ($i==79);
	mov	$c,$t0
	mov	$a,$t2
	xor	$b,$t0
	lea	$K($xi[0],$e),$e
	rol	\$5,$t2
	xor	$d,$t0
	add	$t2,$e
	rol	\$30,$b
	add	$t0,$e
___
    # Swap the roles of the two schedule registers for the next round.
    unshift(@xi,pop(@xi));
    return;
}

# BODY_40_59($i,$a,$b,$c,$d,$e)
#
# Appends the instructions for round $i (40..59) to $code.  The round adds
# rol(a,5) + (c&d) + ((c^d)&b) + 0x8f1bbcdc + $xi[0] to e, rotates b left
# by 30, and computes and stores the next schedule word on the stack.
sub BODY_40_59 {
    my ($i,$a,$b,$c,$d,$e)=@_;
    my $j=$i+1;
    $code.=<<___;
	mov	`4*($j%16)`(%rsp),$xi[1]
	mov	$c,$t0
	mov	$c,$t1
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	and	$d,$t0
	mov	$a,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	xor	$d,$t1
	lea	0x8f1bbcdc($xi[0],$e),$e
	rol	\$5,$t2
	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
	add	$t0,$e
	and	$b,$t1
	rol	\$1,$xi[1]
	add	$t1,$e
	rol	\$30,$b
	mov	$xi[1],`4*($j%16)`(%rsp)
	add	$t2,$e
___
    # Swap the roles of the two schedule registers for the next round.
    unshift(@xi,pop(@xi));
    return;
}

# eval_const($expr)
#
# Evaluates one back-quoted constant expression from the generated text
# (e.g. "4*((21+2)%16)") and returns its value.  Dies with the offending
# expression if it does not evaluate, rather than silently emitting an
# empty operand.
sub eval_const {
    my ($expr)=@_;
    my $value=eval $expr;
    die "can't evaluate `$expr`: $@" if ($@);
    return $value;
}

# Function prologue: save callee-saved registers, carve out a 64-byte
# aligned frame holding the 16-word schedule, and remember the original
# stack pointer just above it.
$code.=<<___;
.text

.globl	sha1_block_data_order
.type	sha1_block_data_order,\@function,3
.align	16
sha1_block_data_order:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	mov	%rsp,%r11
	mov	%rdi,$ctx	# reassigned argument
	sub	\$`8+16*4`,%rsp
	mov	%rsi,$inp	# reassigned argument
	and	\$-64,%rsp
	mov	%rdx,$num	# reassigned argument
	mov	%r11,`16*4`(%rsp)
.Lprologue:

	mov	0($ctx),$A
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	16($ctx),$E

.align	4
.Lloop:
___

# Emit the 80 rounds.  After each round the register assignment is rotated
# so that the next round sees the updated (a,b,c,d,e) mapping.
for my $i (0 .. 79) {
    if    ($i<20) { BODY_00_19($i,@V); }
    elsif ($i<40) { BODY_20_39($i,@V); }
    elsif ($i<60) { BODY_40_59($i,@V); }
    else          { BODY_20_39($i,@V); }
    unshift(@V,pop(@V));
}

# Fold the working variables back into the context, advance to the next
# 64-byte block, and on exit restore the saved registers and stack pointer.
$code.=<<___;
	add	0($ctx),$A
	add	4($ctx),$B
	add	8($ctx),$C
	add	12($ctx),$D
	add	16($ctx),$E
	mov	$A,0($ctx)
	mov	$B,4($ctx)
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)

	sub	\$1,$num
	lea	`16*4`($inp),$inp
	jnz	.Lloop

	mov	`16*4`(%rsp),%rsi
	mov	(%rsi),%r13
	mov	8(%rsi),%r12
	mov	16(%rsi),%rbp
	mov	24(%rsi),%rbx
	lea	32(%rsi),%rsp
.Lepilogue:
	ret
.size	sha1_block_data_order,.-sha1_block_data_order

.asciz	"SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
    # Registers carrying the handler's four arguments.  Only $context and
    # $disp are referenced by the template below; $rec and $frame are
    # listed to document the full argument mapping.
    my $rec="%rcx";
    my $frame="%rdx";
    my $context="%r8";
    my $disp="%r9";

    $code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lprologue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	lea	.Lepilogue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lin_prologue

	mov	`16*4`(%rax),%rax	# pull saved stack pointer
	lea	32(%rax),%rax

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_sha1_block_data_order
	.rva	.LSEH_end_sha1_block_data_order
	.rva	.LSEH_info_sha1_block_data_order

.section	.xdata
.align	8
.LSEH_info_sha1_block_data_order:
	.byte	9,0,0,0
	.rva	se_handler
___
}

####################################################################

# Replace every back-quoted expression with its computed value, then hand
# the finished text to the translator.
$code =~ s/\`([^\`]*)\`/eval_const($1)/gem;
print $code;
# Closing the pipe waits for the translator; a write error or a non-zero
# exit status must fail the build instead of leaving a truncated output.
close(STDOUT) or die "error closing pipe to $xlate: $! (exit status $?)";