#! /usr/bin/env perl
# Copyright 2005-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# October 2005
#
# This is a "teaser" code, as it can be improved in several ways...
# First of all non-SSE2 path should be implemented (yes, for now it
# performs Montgomery multiplication/convolution only on SSE2-capable
# CPUs such as P4, others fall down to original code). Then inner loop
# can be unrolled and modulo-scheduled to improve ILP and possibly
# moved to 128-bit XMM register bank (though it would require input
# rearrangement and/or increase bus bandwidth utilization). Dedicated
# squaring procedure should give further performance improvement...
# Yet, for being draft, the code improves rsa512 *sign* benchmark by
# 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)

# December 2006
#
# Modulo-scheduling SSE2 loops results in further 15-20% improvement.
# Integer-only code [being equipped with dedicated squaring procedure]
# gives ~40% on rsa512 sign benchmark...

# Overview
#
# This script emits the x86 routine bn_mul_mont through the perlasm
# helpers loaded from x86asm.pl. The routine's six arguments, in the
# order they are read from the argument block below, are rp, ap, bp,
# np (BN_ULONG vectors), n0 (pointer, only n0[0] is used) and num (int).
# The generated code returns 0 in eax without touching rp when num < 4
# and returns 1 in eax once the result has been stored to rp.
#
# Command line: the last argument names the output file, the first
# remaining argument is handed to asm_init, and the presence of
# -DOPENSSL_IA32_SSE2 among the arguments additionally emits the
# MMX/SSE2 code path, which is selected at run time by testing bit 26
# of OPENSSL_ia32cap_P.

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# Redirect STDOUT to the output file named by the last argument.
# Three-argument open keeps the file name from being interpreted as a
# mode string, and a failed open is reported immediately instead of
# surfacing only when STDOUT is closed at the end of the script.
$output = pop and (open(STDOUT, ">", $output)
		   or die "can't open $output: $!");

&asm_init($ARGV[0]);

$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }

&external_label("OPENSSL_ia32cap_P") if ($sse2);

&function_begin("bn_mul_mont");

$i="edx";
$j="ecx";
$ap="esi";	$tp="esi";		# overlapping variables!!!
$rp="edi";	$bp="edi";		# overlapping variables!!!
$np="ebp";
$num="ebx";

$_num=&DWP(4*0,"esp");			# stack top layout
$_rp=&DWP(4*1,"esp");
$_ap=&DWP(4*2,"esp");
$_bp=&DWP(4*3,"esp");
$_np=&DWP(4*4,"esp");
$_n0=&DWP(4*5,"esp");	$_n0q=&QWP(4*5,"esp");
$_sp=&DWP(4*6,"esp");
$_bpend=&DWP(4*7,"esp");
$frame=32;				# size of above frame rounded up to 16n

	# eax is the return value: it stays 0 if we bail out for num < 4.
	&xor	("eax","eax");
	&mov	("edi",&wparam(5));	# int num
	&cmp	("edi",4);
	&jl	(&label("just_leave"));

	&lea	("esi",&wparam(0));	# put aside pointer to argument block
	&lea	("edx",&wparam(1));	# load ap
	&add	("edi",2);		# extra two words on top of tp
	&neg	("edi");
	&lea	("ebp",&DWP(-$frame,"esp","edi",4));	# future alloca($frame+4*(num+2))
	&neg	("edi");

	# minimize cache contention by arranging 2K window between stack
	# pointer and ap argument [np is also position sensitive vector,
	# but it's assumed to be near ap, as it's allocated at ~same
	# time].
	&mov	("eax","ebp");
	&sub	("eax","edx");
	&and	("eax",2047);
	&sub	("ebp","eax");		# this aligns sp and ap modulo 2048

	&xor	("edx","ebp");
	&and	("edx",2048);
	&xor	("edx",2048);
	&sub	("ebp","edx");		# this splits them apart modulo 4096

	&and	("ebp",-64);		# align to cache line

	# An OS-agnostic version of __chkstk.
	#
	# Some OSes (Windows) insist on stack being "wired" to
	# physical memory in strictly sequential manner, i.e. if stack
	# allocation spans two pages, then reference to farmost one can
	# be punishable by SEGV. But page walking can do good even on
	# other OSes, because it guarantees that villain thread hits
	# the guard page before it can make damage to innocent one...
	&mov	("eax","esp");
	&sub	("eax","ebp");
	&and	("eax",-4096);
	&mov	("edx","esp");		# saved stack pointer!
	&lea	("esp",&DWP(0,"ebp","eax"));
	&mov	("eax",&DWP(0,"esp"));
	&cmp	("esp","ebp");
	&ja	(&label("page_walk"));
	&jmp	(&label("page_walk_done"));

&set_label("page_walk",16);
	# step esp down 4096 bytes at a time, reading one word at each step,
	# until it reaches the target held in ebp
	&lea	("esp",&DWP(-4096,"esp"));
	&mov	("eax",&DWP(0,"esp"));
	&cmp	("esp","ebp");
	&ja	(&label("page_walk"));
&set_label("page_walk_done");

	################################# load argument block...
	&mov	("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
	&mov	("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
	&mov	("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
	&mov	("ebp",&DWP(3*4,"esi"));# const BN_ULONG *np
	&mov	("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
	#&mov	("edi",&DWP(5*4,"esi"));# int num

	&mov	("esi",&DWP(0,"esi"));	# pull n0[0]
	&mov	($_rp,"eax");		# ... save a copy of argument block
	&mov	($_ap,"ebx");
	&mov	($_bp,"ecx");
	&mov	($_np,"ebp");
	&mov	($_n0,"esi");
	&lea	($num,&DWP(-3,"edi"));	# num=num-1 to assist modulo-scheduling
	#&mov	($_num,$num);		# redundant as $num is not reused
	&mov	($_sp,"edx");		# saved stack pointer!

if($sse2) {
$acc0="mm0";	# mmx register bank layout
$acc1="mm1";
$car0="mm2";
$car1="mm3";
$mul0="mm4";
$mul1="mm5";
$temp="mm6";
$mask="mm7";

	# run-time dispatch: fall through to the integer code when
	# bit 26 of OPENSSL_ia32cap_P is clear
	&picmeup("eax","OPENSSL_ia32cap_P");
	&bt	(&DWP(0,"eax"),26);
	&jnc	(&label("non_sse2"));

	&mov	("eax",-1);
	&movd	($mask,"eax");		# mask 32 lower bits

	&mov	($ap,$_ap);		# load input pointers
	&mov	($bp,$_bp);
	&mov	($np,$_np);

	&xor	($i,$i);		# i=0
	&xor	($j,$j);		# j=0

	&movd	($mul0,&DWP(0,$bp));		# bp[0]
	&movd	($mul1,&DWP(0,$ap));		# ap[0]
	&movd	($car1,&DWP(0,$np));		# np[0]

	&pmuludq($mul1,$mul0);			# ap[0]*bp[0]
	&movq	($car0,$mul1);
	&movq	($acc0,$mul1);			# I wish movd worked for
	&pand	($acc0,$mask);			# inter-register transfers

	&pmuludq($mul1,$_n0q);			# *=n0

	&pmuludq($car1,$mul1);			# "t[0]"*np[0]*n0
	&paddq	($car1,$acc0);

	&movd	($acc1,&DWP(4,$np));		# np[1]
	&movd	($acc0,&DWP(4,$ap));		# ap[1]

	&psrlq	($car0,32);
	&psrlq	($car1,32);

	&inc	($j);				# j++
&set_label("1st",16);
	&pmuludq($acc0,$mul0);			# ap[j]*bp[0]
	&pmuludq($acc1,$mul1);			# np[j]*m1
	&paddq	($car0,$acc0);			# +=c0
	&paddq	($car1,$acc1);			# +=c1

	&movq	($acc0,$car0);
	&pand	($acc0,$mask);
	&movd	($acc1,&DWP(4,$np,$j,4));	# np[j+1]
	&paddq	($car1,$acc0);			# +=ap[j]*bp[0];
	&movd	($acc0,&DWP(4,$ap,$j,4));	# ap[j+1]
	&psrlq	($car0,32);
	&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[j-1]=
	&psrlq	($car1,32);

	&lea	($j,&DWP(1,$j));
	&cmp	($j,$num);
	&jl	(&label("1st"));

	&pmuludq($acc0,$mul0);			# ap[num-1]*bp[0]
	&pmuludq($acc1,$mul1);			# np[num-1]*m1
	&paddq	($car0,$acc0);			# +=c0
	&paddq	($car1,$acc1);			# +=c1

	&movq	($acc0,$car0);
	&pand	($acc0,$mask);
	&paddq	($car1,$acc0);			# +=ap[num-1]*bp[0];
	&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[num-2]=

	&psrlq	($car0,32);
	&psrlq	($car1,32);

	&paddq	($car1,$car0);
	&movq	(&QWP($frame,"esp",$num,4),$car1);	# tp[num].tp[num-1]

	&inc	($i);				# i++
&set_label("outer");
	&xor	($j,$j);			# j=0

	&movd	($mul0,&DWP(0,$bp,$i,4));	# bp[i]
	&movd	($mul1,&DWP(0,$ap));		# ap[0]
	&movd	($temp,&DWP($frame,"esp"));	# tp[0]
	&movd	($car1,&DWP(0,$np));		# np[0]
	&pmuludq($mul1,$mul0);			# ap[0]*bp[i]

	&paddq	($mul1,$temp);			# +=tp[0]
	&movq	($acc0,$mul1);
	&movq	($car0,$mul1);
	&pand	($acc0,$mask);

	&pmuludq($mul1,$_n0q);			# *=n0

	&pmuludq($car1,$mul1);
	&paddq	($car1,$acc0);

	&movd	($temp,&DWP($frame+4,"esp"));	# tp[1]
	&movd	($acc1,&DWP(4,$np));		# np[1]
	&movd	($acc0,&DWP(4,$ap));		# ap[1]

	&psrlq	($car0,32);
	&psrlq	($car1,32);
	&paddq	($car0,$temp);			# +=tp[1]

	&inc	($j);				# j++
	&dec	($num);
&set_label("inner");
	&pmuludq($acc0,$mul0);			# ap[j]*bp[i]
	&pmuludq($acc1,$mul1);			# np[j]*m1
	&paddq	($car0,$acc0);			# +=c0
	&paddq	($car1,$acc1);			# +=c1

	&movq	($acc0,$car0);
	&movd	($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
	&pand	($acc0,$mask);
	&movd	($acc1,&DWP(4,$np,$j,4));	# np[j+1]
	&paddq	($car1,$acc0);			# +=ap[j]*bp[i]+tp[j]
	&movd	($acc0,&DWP(4,$ap,$j,4));	# ap[j+1]
	&psrlq	($car0,32);
	&movd	(&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
	&psrlq	($car1,32);
	&paddq	($car0,$temp);			# +=tp[j+1]

	# $num counts down here; lea advances j without touching the
	# flags set by dec, which jnz then tests
	&dec	($num);
	&lea	($j,&DWP(1,$j));		# j++
	&jnz	(&label("inner"));

	&mov	($num,$j);			# restore $num from j
	&pmuludq($acc0,$mul0);			# ap[num-1]*bp[i]
	&pmuludq($acc1,$mul1);			# np[num-1]*m1
	&paddq	($car0,$acc0);			# +=c0
	&paddq	($car1,$acc1);			# +=c1

	&movq	($acc0,$car0);
	&pand	($acc0,$mask);
	&paddq	($car1,$acc0);			# +=ap[num-1]*bp[i]+tp[num-1]
	&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[num-2]=
	&psrlq	($car0,32);
	&psrlq	($car1,32);

	&movd	($temp,&DWP($frame+4,"esp",$num,4));	# += tp[num]
	&paddq	($car1,$car0);
	&paddq	($car1,$temp);
	&movq	(&QWP($frame,"esp",$num,4),$car1);	# tp[num].tp[num-1]

	&lea	($i,&DWP(1,$i));		# i++
	&cmp	($i,$num);
	&jle	(&label("outer"));

	&emms	();				# done with mmx bank
	&jmp	(&label("common_tail"));

&set_label("non_sse2",16);
}

if (0) {
	&mov	("esp",$_sp);
	&xor	("eax","eax");	# signal "not fast enough [yet]"
	&jmp	(&label("just_leave"));
	# While the below code provides competitive performance for
	# all key lengths on modern Intel cores, it's still more
	# than 10% slower for 4096-bit key elsewhere:-( "Competitive"
	# means compared to the original integer-only assembler.
	# 512-bit RSA sign is better by ~40%, but that's about all
	# one can say about all CPUs...
} else {
$inp="esi";	# integer path uses these registers differently
$word="edi";
$carry="ebp";

	&mov	($inp,$_ap);
	&lea	($carry,&DWP(1,$num));
	&mov	($word,$_bp);
	&xor	($j,$j);				# j=0
	&mov	("edx",$inp);
	&and	($carry,1);				# see if num is even
	&sub	("edx",$word);				# see if ap==bp
	&lea	("eax",&DWP(4,$word,$num,4));		# &bp[num]
	&or	($carry,"edx");
	&mov	($word,&DWP(0,$word));			# bp[0]
	# ZF comes from the or above (mov/lea leave flags alone): the
	# squaring code is used only if num is even and ap==bp
	&jz	(&label("bn_sqr_mont"));
	&mov	($_bpend,"eax");
	&mov	("eax",&DWP(0,$inp));
	&xor	("edx","edx");

&set_label("mull",16);
	&mov	($carry,"edx");
	&mul	($word);				# ap[j]*bp[0]
	&add	($carry,"eax");
	&lea	($j,&DWP(1,$j));
	&adc	("edx",0);
	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j+1]
	&cmp	($j,$num);
	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
	&jl	(&label("mull"));

	&mov	($carry,"edx");
	&mul	($word);				# ap[num-1]*bp[0]
	&mov	($word,$_n0);
	&add	("eax",$carry);
	&mov	($inp,$_np);
	&adc	("edx",0);
	&imul	($word,&DWP($frame,"esp"));		# n0*tp[0]

	&mov	(&DWP($frame,"esp",$num,4),"eax");	# tp[num-1]=
	&xor	($j,$j);
	&mov	(&DWP($frame+4,"esp",$num,4),"edx");	# tp[num]=
	&mov	(&DWP($frame+8,"esp",$num,4),$j);	# tp[num+1]=

	&mov	("eax",&DWP(0,$inp));			# np[0]
	&mul	($word);				# np[0]*m
	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
	&mov	("eax",&DWP(4,$inp));			# np[1]
	&adc	("edx",0);
	&inc	($j);

	&jmp	(&label("2ndmadd"));

&set_label("1stmadd",16);
	&mov	($carry,"edx");
	&mul	($word);				# ap[j]*bp[i]
	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
	&lea	($j,&DWP(1,$j));
	&adc	("edx",0);
	&add	($carry,"eax");
	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j+1]
	&adc	("edx",0);
	&cmp	($j,$num);
	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
	&jl	(&label("1stmadd"));

	&mov	($carry,"edx");
	&mul	($word);				# ap[num-1]*bp[i]
	&add	("eax",&DWP($frame,"esp",$num,4));	# +=tp[num-1]
	&mov	($word,$_n0);
	&adc	("edx",0);
	&mov	($inp,$_np);
	&add	($carry,"eax");
	&adc	("edx",0);
	&imul	($word,&DWP($frame,"esp"));		# n0*tp[0]

	&xor	($j,$j);
	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
	&mov	(&DWP($frame,"esp",$num,4),$carry);	# tp[num-1]=
	&adc	($j,0);
	&mov	("eax",&DWP(0,$inp));			# np[0]
	&mov	(&DWP($frame+4,"esp",$num,4),"edx");	# tp[num]=
	&mov	(&DWP($frame+8,"esp",$num,4),$j);	# tp[num+1]=

	&mul	($word);				# np[0]*m
	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
	&mov	("eax",&DWP(4,$inp));			# np[1]
	&adc	("edx",0);
	&mov	($j,1);

&set_label("2ndmadd",16);
	&mov	($carry,"edx");
	&mul	($word);				# np[j]*m
	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
	&lea	($j,&DWP(1,$j));
	&adc	("edx",0);
	&add	($carry,"eax");
	&mov	("eax",&DWP(0,$inp,$j,4));		# np[j+1]
	&adc	("edx",0);
	&cmp	($j,$num);
	&mov	(&DWP($frame-8,"esp",$j,4),$carry);	# tp[j-1]=
	&jl	(&label("2ndmadd"));

	&mov	($carry,"edx");
	&mul	($word);				# np[j]*m
	&add	($carry,&DWP($frame,"esp",$num,4));	# +=tp[num-1]
	&adc	("edx",0);
	&add	($carry,"eax");
	&adc	("edx",0);
	&mov	(&DWP($frame-4,"esp",$num,4),$carry);	# tp[num-2]=

	&xor	("eax","eax");
	&mov	($j,$_bp);				# &bp[i]
	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
	&adc	("eax",&DWP($frame+8,"esp",$num,4));	# +=tp[num+1]
	&lea	($j,&DWP(4,$j));
	&mov	(&DWP($frame,"esp",$num,4),"edx");	# tp[num-1]=
	&cmp	($j,$_bpend);
	&mov	(&DWP($frame+4,"esp",$num,4),"eax");	# tp[num]=
	&je	(&label("common_tail"));

	&mov	($word,&DWP(0,$j));			# bp[i+1]
	&mov	($inp,$_ap);
	&mov	($_bp,$j);				# &bp[++i]
	&xor	($j,$j);
	&xor	("edx","edx");
	&mov	("eax",&DWP(0,$inp));
	&jmp	(&label("1stmadd"));

&set_label("bn_sqr_mont",16);
$sbit=$num;
	&mov	($_num,$num);
	&mov	($_bp,$j);				# i=0

	&mov	("eax",$word);				# ap[0]
	&mul	($word);				# ap[0]*ap[0]
	&mov	(&DWP($frame,"esp"),"eax");		# tp[0]=
	&mov	($sbit,"edx");
	&shr	("edx",1);
	&and	($sbit,1);
	&inc	($j);
&set_label("sqr",16);
	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j]
	&mov	($carry,"edx");
	&mul	($word);				# ap[j]*ap[0]
	&add	("eax",$carry);
	&lea	($j,&DWP(1,$j));
	&adc	("edx",0);
	&lea	($carry,&DWP(0,$sbit,"eax",2));
	&shr	("eax",31);
	&cmp	($j,$_num);
	&mov	($sbit,"eax");
	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
	&jl	(&label("sqr"));

	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[num-1]
	&mov	($carry,"edx");
	&mul	($word);				# ap[num-1]*ap[0]
	&add	("eax",$carry);
	&mov	($word,$_n0);
	&adc	("edx",0);
	&mov	($inp,$_np);
	&lea	($carry,&DWP(0,$sbit,"eax",2));
	&imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
	&shr	("eax",31);
	&mov	(&DWP($frame,"esp",$j,4),$carry);	# tp[num-1]=

	&lea	($carry,&DWP(0,"eax","edx",2));
	&mov	("eax",&DWP(0,$inp));			# np[0]
	&shr	("edx",31);
	&mov	(&DWP($frame+4,"esp",$j,4),$carry);	# tp[num]=
	&mov	(&DWP($frame+8,"esp",$j,4),"edx");	# tp[num+1]=

	&mul	($word);				# np[0]*m
	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
	&mov	($num,$j);
	&adc	("edx",0);
	&mov	("eax",&DWP(4,$inp));			# np[1]
	&mov	($j,1);

&set_label("3rdmadd",16);
	# two words are processed per iteration (j advances by 2)
	&mov	($carry,"edx");
	&mul	($word);				# np[j]*m
	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
	&adc	("edx",0);
	&add	($carry,"eax");
	&mov	("eax",&DWP(4,$inp,$j,4));		# np[j+1]
	&adc	("edx",0);
	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j-1]=

	&mov	($carry,"edx");
	&mul	($word);				# np[j+1]*m
	&add	($carry,&DWP($frame+4,"esp",$j,4));	# +=tp[j+1]
	&lea	($j,&DWP(2,$j));
	&adc	("edx",0);
	&add	($carry,"eax");
	&mov	("eax",&DWP(0,$inp,$j,4));		# np[j+2]
	&adc	("edx",0);
	&cmp	($j,$num);
	&mov	(&DWP($frame-8,"esp",$j,4),$carry);	# tp[j]=
	&jl	(&label("3rdmadd"));

	&mov	($carry,"edx");
	&mul	($word);				# np[j]*m
	&add	($carry,&DWP($frame,"esp",$num,4));	# +=tp[num-1]
	&adc	("edx",0);
	&add	($carry,"eax");
	&adc	("edx",0);
	&mov	(&DWP($frame-4,"esp",$num,4),$carry);	# tp[num-2]=

	&mov	($j,$_bp);				# i
	&xor	("eax","eax");
	&mov	($inp,$_ap);
	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
	&adc	("eax",&DWP($frame+8,"esp",$num,4));	# +=tp[num+1]
	&mov	(&DWP($frame,"esp",$num,4),"edx");	# tp[num-1]=
	&cmp	($j,$num);
	&mov	(&DWP($frame+4,"esp",$num,4),"eax");	# tp[num]=
	&je	(&label("common_tail"));

	&mov	($word,&DWP(4,$inp,$j,4));		# ap[i]
	&lea	($j,&DWP(1,$j));
	&mov	("eax",$word);
	&mov	($_bp,$j);				# ++i
	&mul	($word);				# ap[i]*ap[i]
	&add	("eax",&DWP($frame,"esp",$j,4));	# +=tp[i]
	&adc	("edx",0);
	&mov	(&DWP($frame,"esp",$j,4),"eax");	# tp[i]=
	&xor	($carry,$carry);
	&cmp	($j,$num);
	&lea	($j,&DWP(1,$j));
	&je	(&label("sqrlast"));

	&mov	($sbit,"edx");				# zaps $num
	&shr	("edx",1);
	&and	($sbit,1);
&set_label("sqradd",16);
	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j]
	&mov	($carry,"edx");
	&mul	($word);				# ap[j]*ap[i]
	&add	("eax",$carry);
	&lea	($carry,&DWP(0,"eax","eax"));
	&adc	("edx",0);
	&shr	("eax",31);
	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
	&lea	($j,&DWP(1,$j));
	&adc	("eax",0);
	&add	($carry,$sbit);
	&adc	("eax",0);
	&cmp	($j,$_num);
	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
	&mov	($sbit,"eax");
	&jle	(&label("sqradd"));

	&mov	($carry,"edx");
	&add	("edx","edx");
	&shr	($carry,31);
	&add	("edx",$sbit);
	&adc	($carry,0);
&set_label("sqrlast");
	&mov	($word,$_n0);
	&mov	($inp,$_np);
	&imul	($word,&DWP($frame,"esp"));		# n0*tp[0]

	&add	("edx",&DWP($frame,"esp",$j,4));	# +=tp[num]
	&mov	("eax",&DWP(0,$inp));			# np[0]
	&adc	($carry,0);
	&mov	(&DWP($frame,"esp",$j,4),"edx");	# tp[num]=
	&mov	(&DWP($frame+4,"esp",$j,4),$carry);	# tp[num+1]=

	&mul	($word);				# np[0]*m
	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
	&lea	($num,&DWP(-1,$j));
	&adc	("edx",0);
	&mov	($j,1);
	&mov	("eax",&DWP(4,$inp));			# np[1]

	&jmp	(&label("3rdmadd"));
}

&set_label("common_tail",16);
	&mov	($np,$_np);			# load modulus pointer
	&mov	($rp,$_rp);			# load result pointer
	&lea	($tp,&DWP($frame,"esp"));	# [$ap and $bp are zapped]

	&mov	("eax",&DWP(0,$tp));		# tp[0]
	&mov	($j,$num);			# j=num-1
	&xor	($i,$i);			# i=0 and clear CF!

&set_label("sub",16);
	&sbb	("eax",&DWP(0,$np,$i,4));
	&mov	(&DWP(0,$rp,$i,4),"eax");	# rp[i]=tp[i]-np[i]
	&dec	($j);				# doesn't affect CF!
	&mov	("eax",&DWP(4,$tp,$i,4));	# tp[i+1]
	&lea	($i,&DWP(1,$i));		# i++
	&jge	(&label("sub"));

	&sbb	("eax",0);			# handle upmost overflow bit
	&mov	("edx",-1);
	&xor	("edx","eax");			# edx = ~eax, the complementary mask
	&jmp	(&label("copy"));

&set_label("copy",16);				# conditional copy
	# branch-free select: rp[num] = (tp[num] & eax) | (rp[num] & edx)
	&mov	($tp,&DWP($frame,"esp",$num,4));
	&mov	($np,&DWP(0,$rp,$num,4));
	&mov	(&DWP($frame,"esp",$num,4),$j);	# zap temporary vector
	&and	($tp,"eax");
	&and	($np,"edx");
	&or	($np,$tp);
	&mov	(&DWP(0,$rp,$num,4),$np);
	&dec	($num);
	&jge	(&label("copy"));

	&mov	("esp",$_sp);		# pull saved stack pointer
	&mov	("eax",1);		# return 1: result has been written to rp
&set_label("just_leave");
&function_end("bn_mul_mont");

&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");

&asm_finish();

close STDOUT or die "error closing STDOUT: $!";