1#! /usr/bin/env perl 2# Copyright 2011-2020 The OpenSSL Project Authors. All Rights Reserved. 3# 4# Licensed under the OpenSSL license (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8 9# 10# ==================================================================== 11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 12# project. The module is, however, dual licensed under OpenSSL and 13# CRYPTOGAMS licenses depending on where you obtain it. For further 14# details see http://www.openssl.org/~appro/cryptogams/. 15# ==================================================================== 16# 17# May 2011 18# 19# The module implements bn_GF2m_mul_2x2 polynomial multiplication used 20# in bn_gf2m.c. It's kind of low-hanging mechanical port from C for 21# the time being... Except that it has two code paths: code suitable 22# for any x86_64 CPU and PCLMULQDQ one suitable for Westmere and 23# later. Improvement varies from one benchmark and µ-arch to another. 24# Vanilla code path is at most 20% faster than compiler-generated code 25# [not very impressive], while PCLMULQDQ - whole 85%-160% better on 26# 163- and 571-bit ECDH benchmarks on Intel CPUs. Keep in mind that 27# these coefficients are not ones for bn_GF2m_mul_2x2 itself, as not 28# all CPU time is burnt in it... 
# Command line: [flavour] [output-file]. A single argument containing a
# dot is treated as the output file name (no flavour).
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64 perlasm translator next to this script or in the
# shared perlasm directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# All generated code is piped through the translator. Check the open:
# an unchecked piped open would otherwise fail later with an obscure
# broken-pipe error instead of a clear diagnostic.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Register allocation for _mul_1x1 (see calling convention below).
($lo,$hi)=("%rax","%rdx");	$a=$lo;		# input $a aliases $lo
($i0,$i1)=("%rsi","%rdi");			# nibble indices into tab[]
($t0,$t1)=("%rbx","%rcx");			# scratch
($b,$mask)=("%rbp","%r8");			# second input and 0xf mask
($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(9..15));# small multiples of $a
($R,$Tx)=("%xmm0","%xmm1");			# SSE2 accumulator and scratch

# _mul_1x1 multiplies two 64-bit GF(2) polynomials. Inputs: $a (%rax),
# $b (%rbp), with $mask (%r8) pre-set to 0xf by the caller; the 128-bit
# product is returned in $hi:$lo (%rdx:%rax). It builds a 16-entry
# stack table tab[i] = i x (a & 0x1fffffffffffffff), consumes $b four
# bits at a time, and corrects for the three top bits of $a separately
# via the sign-broadcast sequences below. Half of the nibble lookups
# (the even ones) are accumulated in the SSE2 register $R, the odd ones
# in the $lo:$hi GPR pair.
$code.=<<___;
.text

.type	_mul_1x1,\@abi-omnipotent
.align	16
_mul_1x1:
.cfi_startproc
	sub	\$128+8,%rsp
.cfi_adjust_cfa_offset	128+8
	mov	\$-1,$a1
	lea	($a,$a),$i0
	shr	\$3,$a1
	lea	(,$a,4),$i1
	and	$a,$a1			# a1=a&0x1fffffffffffffff
	lea	(,$a,8),$a8
	sar	\$63,$a			# broadcast 63rd bit
	lea	($a1,$a1),$a2
	sar	\$63,$i0		# broadcast 62nd bit
	lea	(,$a1,4),$a4
	and	$b,$a
	sar	\$63,$i1		# broadcast 61st bit
	mov	$a,$hi			# $a is $lo
	shl	\$63,$lo
	and	$b,$i0
	shr	\$1,$hi
	mov	$i0,$t1
	shl	\$62,$i0
	and	$b,$i1
	shr	\$2,$t1
	xor	$i0,$lo
	mov	$i1,$t0
	shl	\$61,$i1
	xor	$t1,$hi
	shr	\$3,$t0
	xor	$i1,$lo
	xor	$t0,$hi

	mov	$a1,$a12
	movq	\$0,0(%rsp)		# tab[0]=0
	xor	$a2,$a12		# a1^a2
	mov	$a1,8(%rsp)		# tab[1]=a1
	 mov	$a4,$a48
	mov	$a2,16(%rsp)		# tab[2]=a2
	 xor	$a8,$a48		# a4^a8
	mov	$a12,24(%rsp)		# tab[3]=a1^a2

	xor	$a4,$a1
	mov	$a4,32(%rsp)		# tab[4]=a4
	xor	$a4,$a2
	mov	$a1,40(%rsp)		# tab[5]=a1^a4
	xor	$a4,$a12
	mov	$a2,48(%rsp)		# tab[6]=a2^a4
	 xor	$a48,$a1		# a1^a4^a4^a8=a1^a8
	mov	$a12,56(%rsp)		# tab[7]=a1^a2^a4
	 xor	$a48,$a2		# a2^a4^a4^a8=a2^a8

	mov	$a8,64(%rsp)		# tab[8]=a8
	xor	$a48,$a12		# a1^a2^a4^a4^a8=a1^a2^a8
	mov	$a1,72(%rsp)		# tab[9]=a1^a8
	 xor	$a4,$a1			# a1^a8^a4
	mov	$a2,80(%rsp)		# tab[10]=a2^a8
	 xor	$a4,$a2			# a2^a8^a4
	mov	$a12,88(%rsp)		# tab[11]=a1^a2^a8

	xor	$a4,$a12		# a1^a2^a8^a4
	mov	$a48,96(%rsp)		# tab[12]=a4^a8
	 mov	$mask,$i0
	mov	$a1,104(%rsp)		# tab[13]=a1^a4^a8
	 and	$b,$i0
	mov	$a2,112(%rsp)		# tab[14]=a2^a4^a8
	 shr	\$4,$b
	mov	$a12,120(%rsp)		# tab[15]=a1^a2^a4^a8
	 mov	$mask,$i1
	 and	$b,$i1
	 shr	\$4,$b

	movq	(%rsp,$i0,8),$R		# half of calculations is done in SSE2
	mov	$mask,$i0
	and	$b,$i0
	shr	\$4,$b
___
# Seven more radix-16 steps: the odd nibble (index $i1) is looked up
# and shifted into the $lo:$hi pair with shl/shr, while the even
# nibble (index $i0) goes through $Tx and is byte-shifted into the
# SSE2 accumulator $R with pslldq.
for ($n=1;$n<8;$n++) {
$code.=<<___;
	mov	(%rsp,$i1,8),$t1
	mov	$mask,$i1
	mov	$t1,$t0
	shl	\$`8*$n-4`,$t1
	and	$b,$i1
	 movq	(%rsp,$i0,8),$Tx
	shr	\$`64-(8*$n-4)`,$t0
	xor	$t1,$lo
	 pslldq	\$$n,$Tx
	 mov	$mask,$i0
	shr	\$4,$b
	xor	$t0,$hi
	 and	$b,$i0
	 shr	\$4,$b
	 pxor	$Tx,$R
___
}
# Final (15th) nibble, then fold the SSE2 accumulator into $lo:$hi.
$code.=<<___;
	mov	(%rsp,$i1,8),$t1
	mov	$t1,$t0
	shl	\$`8*$n-4`,$t1
	movq	$R,$i0
	shr	\$`64-(8*$n-4)`,$t0
	xor	$t1,$lo
	psrldq	\$8,$R
	xor	$t0,$hi
	movq	$R,$i1
	xor	$i0,$lo
	xor	$i1,$hi

	add	\$128+8,%rsp
.cfi_adjust_cfa_offset	-128-8
	ret
.Lend_mul_1x1:
.cfi_endproc
.size	_mul_1x1,.-_mul_1x1
___

# bn_GF2m_mul_2x2 argument registers per platform ABI.
($rp,$a1,$a0,$b1,$b0) = $win64?	("%rcx","%rdx","%r8", "%r9","%r10") :	# Win64 order
				("%rdi","%rsi","%rdx","%rcx","%r8");	# Unix order

$code.=<<___;
.extern	OPENSSL_ia32cap_P
.globl	bn_GF2m_mul_2x2
.type	bn_GF2m_mul_2x2,\@abi-omnipotent
.align	16
bn_GF2m_mul_2x2:
.cfi_startproc
	mov	%rsp,%rax
	mov	OPENSSL_ia32cap_P(%rip),%r10
	bt	\$33,%r10		# PCLMULQDQ available?
	jnc	.Lvanilla_mul_2x2

	movq	$a1,%xmm0
	movq	$b1,%xmm1
	movq	$a0,%xmm2
___
$code.=<<___ if ($win64);
	movq	40(%rsp),%xmm3		# fifth argument is on the stack on Win64
___
$code.=<<___ if (!$win64);
	movq	$b0,%xmm3
___
# Karatsuba with carry-less multiplication: three pclmulqdq, then
# recombine the middle term.
$code.=<<___;
	movdqa	%xmm0,%xmm4
	movdqa	%xmm1,%xmm5
	pclmulqdq	\$0,%xmm1,%xmm0	# a1·b1
	pxor	%xmm2,%xmm4
	pxor	%xmm3,%xmm5
	pclmulqdq	\$0,%xmm3,%xmm2	# a0·b0
	pclmulqdq	\$0,%xmm5,%xmm4	# (a0+a1)·(b0+b1)
	xorps	%xmm0,%xmm4
	xorps	%xmm2,%xmm4		# (a0+a1)·(b0+b1)-a0·b0-a1·b1
	movdqa	%xmm4,%xmm5
	pslldq	\$8,%xmm4
	psrldq	\$8,%xmm5
	pxor	%xmm4,%xmm2
	pxor	%xmm5,%xmm0
	movdqu	%xmm2,0($rp)
	movdqu	%xmm0,16($rp)
	ret

.align	16
.Lvanilla_mul_2x2:
	lea	-8*17(%rsp),%rsp
.cfi_adjust_cfa_offset	8*17
___
$code.=<<___ if ($win64);
	mov	`8*17+40`(%rsp),$b0	# fifth argument from caller's stack
	mov	%rdi,8*15(%rsp)
	mov	%rsi,8*16(%rsp)
___
$code.=<<___;
	mov	%r14,8*10(%rsp)
.cfi_rel_offset	%r14,8*10
	mov	%r13,8*11(%rsp)
.cfi_rel_offset	%r13,8*11
	mov	%r12,8*12(%rsp)
.cfi_rel_offset	%r12,8*12
	mov	%rbp,8*13(%rsp)
.cfi_rel_offset	%rbp,8*13
	mov	%rbx,8*14(%rsp)
.cfi_rel_offset	%rbx,8*14
.Lbody_mul_2x2:
	mov	$rp,32(%rsp)		# save the arguments
	mov	$a1,40(%rsp)
	mov	$a0,48(%rsp)
	mov	$b1,56(%rsp)
	mov	$b0,64(%rsp)

	mov	\$0xf,$mask
	mov	$a1,$a
	mov	$b1,$b
	call	_mul_1x1		# a1·b1
	mov	$lo,16(%rsp)
	mov	$hi,24(%rsp)

	mov	48(%rsp),$a
	mov	64(%rsp),$b
	call	_mul_1x1		# a0·b0
	mov	$lo,0(%rsp)
	mov	$hi,8(%rsp)

	mov	40(%rsp),$a
	mov	56(%rsp),$b
	xor	48(%rsp),$a
	xor	64(%rsp),$b
	call	_mul_1x1		# (a0+a1)·(b0+b1)
___
	@r=("%rbx","%rcx","%rdi","%rsi");
$code.=<<___;
	mov	0(%rsp),@r[0]
	mov	8(%rsp),@r[1]
	mov	16(%rsp),@r[2]
	mov	24(%rsp),@r[3]
	mov	32(%rsp),%rbp

	xor	$hi,$lo
	xor	@r[1],$hi
	xor	@r[0],$lo
	mov	@r[0],0(%rbp)
	xor	@r[2],$hi
	mov	@r[3],24(%rbp)
	xor	@r[3],$lo
	xor	@r[3],$hi
	xor	$hi,$lo
	mov	$hi,16(%rbp)
	mov	$lo,8(%rbp)

	mov	8*10(%rsp),%r14
.cfi_restore	%r14
	mov	8*11(%rsp),%r13
.cfi_restore	%r13
	mov	8*12(%rsp),%r12
.cfi_restore	%r12
	mov	8*13(%rsp),%rbp
.cfi_restore	%rbp
	mov	8*14(%rsp),%rbx
.cfi_restore	%rbx
___
$code.=<<___ if ($win64);
	mov	8*15(%rsp),%rdi
	mov	8*16(%rsp),%rsi
___
$code.=<<___;
	lea	8*17(%rsp),%rsp
.cfi_adjust_cfa_offset	-8*17
.Lepilogue_mul_2x2:
	ret
.Lend_mul_2x2:
.cfi_endproc
.size	bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
.asciz	"GF(2^m) Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind

.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lbody_mul_2x2(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<"prologue" label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	lea	.Lepilogue_mul_2x2(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>="epilogue" label
	jae	.Lin_prologue

	mov	8*10(%rax),%r14		# mimic epilogue
	mov	8*11(%rax),%r13
	mov	8*12(%rax),%r12
	mov	8*13(%rax),%rbp
	mov	8*14(%rax),%rbx
	mov	8*15(%rax),%rdi
	mov	8*16(%rax),%rsi

	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14

	lea	8*17(%rax),%rax

.Lin_prologue:
	mov	%rax,152($context)	# restore context->Rsp

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
	.rva	_mul_1x1
	.rva	.Lend_mul_1x1
	.rva	.LSEH_info_1x1

	.rva	.Lvanilla_mul_2x2
	.rva	.Lend_mul_2x2
	.rva	.LSEH_info_2x2
.section	.xdata
.align	8
.LSEH_info_1x1:
	.byte	0x01,0x07,0x02,0x00
	.byte	0x07,0x01,0x11,0x00	# sub rsp,128+8
.LSEH_info_2x2:
	.byte	9,0,0,0
	.rva	se_handler
___
}

# Expand the `...` arithmetic embedded in the assembly text, then emit
# everything through the xlate pipe opened above.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";