#!/usr/local/bin/perl
#
# perlasm generator for the word-level bignum primitives on 32-bit x86.
#
# Usage (as driven by this script): the first command-line argument is handed
# to asm_init() together with this script's path; passing
# -DOPENSSL_IA32_SSE2 anywhere on the command line additionally emits
# MMX/SSE2 code paths that are selected at run time.
#
# Emitted functions (cdecl, arguments read with wparam(n)):
#   bn_mul_add_words(r, a, num, w)  r[i] += a[i]*w, returns the carry word
#   bn_mul_words(r, a, num, w)      r[i]  = a[i]*w, returns the carry word
#   bn_sqr_words(r, a, num)         r[2i],r[2i+1] = a[i]*a[i]
#   bn_div_words(h, l, d)           eax = (h:l) / d
#   bn_add_words(r, a, b, num)      r[i] = a[i]+b[i], returns the carry
#   bn_sub_words(r, a, b, num)      r[i] = a[i]-b[i], returns the borrow
#   bn_sub_part_words(r, a, b, num, dl)
#                                   as bn_sub_words, then |dl| more words
#                                   taken from b only (dl<0) or a only (dl>0)
#
# Conventions used below:
#   * Everything called with a leading '&' (mov, add, sub, and, push, ...)
#     is a perlasm emitter provided by x86asm.pl (not part of this file);
#     many of those names collide with Perl builtins, hence the '&'.
#   * Subs whose name starts with '_' are private helpers of this script.
#   * The order of emitter calls *is* the generated program.  Do not
#     reorder them: several conditional jumps consume flags that were set
#     a few instructions earlier (see the individual notes).

use strict;
use warnings;

# Directory part of $0 (including the trailing separator), or "" when the
# script was started without any directory component.
my ($dir) = $0 =~ m{(.*[/\\])[^/\\]+$};
$dir = "" unless defined $dir;
push(@INC, $dir, "${dir}../../perlasm");
require "x86asm.pl";

&asm_init($ARGV[0], $0);

# Set when the build asks for the run-time selectable SSE2 code paths.
my $sse2 = 0;
foreach my $arg (@ARGV) {
    $sse2 = 1 if $arg =~ /-DOPENSSL_IA32_SSE2/;
}

&external_label("OPENSSL_ia32cap_P") if $sse2;

bn_mul_add_words("bn_mul_add_words");
bn_mul_words("bn_mul_words");
bn_sqr_words("bn_sqr_words");
bn_div_words("bn_div_words");
bn_add_words("bn_add_words");
bn_sub_words("bn_sub_words");
bn_sub_part_words("bn_sub_part_words");

&asm_finish();

# ----------------------------------------------------------------------
# Private helpers
# ----------------------------------------------------------------------

# Second argument for function_begin_B(): an EXTRN declaration for the
# capability word when the SSE2 paths are generated, otherwise "".
sub _ia32cap_extern {
    return $sse2 ? "EXTRN\t_OPENSSL_ia32cap_P:DWORD" : "";
}

# Emit the run-time dispatch used by the SSE2-capable functions: load the
# address of OPENSSL_ia32cap_P into eax, test bit 26 of its first dword and
# branch to $fallback_label when the bit is clear.
# NOTE(review): bit 26 is presumably the CPUID SSE2 feature flag - confirm
# against the code that fills OPENSSL_ia32cap_P.
sub _branch_unless_sse2 {
    my ($fallback_label) = @_;

    &picmeup("eax", "OPENSSL_ia32cap_P");
    &bt(&DWP(0, "eax"), 26);
    &jnc(&label($fallback_label));
    return;
}

# Emit the four register pushes by hand.  Needed by the functions that are
# opened with function_begin_B() so that their SSE2 path can run (and
# return) before anything is pushed; they are still closed with
# function_end().
sub _push_callee_saved {
    foreach my $reg (qw(ebp ebx esi edi)) {
        &push($reg);
    }
    return;
}

# Register assignment shared by bn_add_words, bn_sub_words and
# bn_sub_part_words, returned as (r, a, b, carry, tmp1, tmp2, num).
sub _addsub_regs {
    return ("ebx", "esi", "edi", "eax", "ecx", "edx", "ebp");
}

# Thin wrappers so the add/sub emitters can be passed around as code refs.
sub _op_add { &add(@_); return; }
sub _op_sub { &sub(@_); return; }

# Emit one step of a carry/borrow chain:
#   tmp1 = $lhs OP carry OP $rhs
# and leave the resulting carry/borrow (0, 1 or - transiently - the sum of
# both partial carries, which cannot exceed 1) in the carry register.
#   $op  - \&_op_add or \&_op_sub
#   $lhs - operand loaded into tmp1 (a memory operand, or the constant 0)
#   $rhs - operand loaded into tmp2
# The "mov carry,0" between the two arithmetic instructions is deliberate:
# mov leaves the flags alone, so the following adc captures CF of the first
# OP (an xor would destroy it).
sub _carry_step {
    my ($op, $lhs, $rhs) = @_;
    my (undef, undef, undef, $carry, $tmp1, $tmp2) = _addsub_regs();

    &mov($tmp1, $lhs);
    &mov($tmp2, $rhs);
    $op->($tmp1, $carry);
    &mov($carry, 0);
    &adc($carry, $carry);
    $op->($tmp1, $tmp2);
    &adc($carry, 0);
    return;
}

# Emit the part that bn_add_words, bn_sub_words and bn_sub_part_words have
# in common: prologue, argument loads, the 8-way unrolled main loop and the
# computation of the leftover count.  On exit from the emitted code path the
# pointers have been advanced past all full groups of eight words, num holds
# (num & 7) and a "jz aw_end" has already been emitted for the case that
# nothing is left; the caller emits the tail rounds and the aw_end label.
sub _addsub_bulk {
    my ($name, $op) = @_;
    my ($rp, $ap, $bp, $carry, $tmp1, undef, $num) = _addsub_regs();

    &function_begin($name, "");

    &comment("");
    &mov($rp,  &wparam(0));                 # r
    &mov($ap,  &wparam(1));                 # a
    &mov($bp,  &wparam(2));                 # b
    &mov($num, &wparam(3));                 # num
    &xor($carry, $carry);                   # carry = 0
    &and($num, 0xfffffff8);                 # num rounded down to a multiple of 8

    &jz(&label("aw_finish"));

    &set_label("aw_loop", 0);
    foreach my $k (0 .. 7) {
        &comment("Round $k");

        _carry_step($op, &DWP($k * 4, $ap, "", 0), &DWP($k * 4, $bp, "", 0));
        &mov(&DWP($k * 4, $rp, "", 0), $tmp1);
    }

    &comment("");
    &add($ap, 32);
    &add($bp, 32);
    &add($rp, 32);
    &sub($num, 8);
    &jnz(&label("aw_loop"));

    &set_label("aw_finish", 0);
    &mov($num, &wparam(3));                 # reload num
    &and($num, 7);                          # words left over (0..7)
    &jz(&label("aw_end"));
    return;
}

# Emit a complete r[i] = a[i] OP b[i] function (shared body of bn_add_words
# and bn_sub_words).  The carry/borrow is returned in eax, which is the
# carry register itself, so no final move is needed.
sub _emit_addsub_words {
    my ($name, $op) = @_;
    my ($rp, $ap, $bp, undef, $tmp1, undef, $num) = _addsub_regs();

    _addsub_bulk($name, $op);

    # Up to seven leftover words; the count is decremented *before* the
    # store so that the jz after the (flag-neutral) store can use it.
    foreach my $k (0 .. 6) {
        my $more = ($k != 6);

        &comment("Tail Round $k");
        _carry_step($op, &DWP($k * 4, $ap, "", 0), &DWP($k * 4, $bp, "", 0));
        &dec($num) if $more;
        &mov(&DWP($k * 4, $rp, "", 0), $tmp1);
        &jz(&label("aw_end")) if $more;
    }
    &set_label("aw_end", 0);

    &function_end($name);
    return;
}

# ----------------------------------------------------------------------
# Function generators
# ----------------------------------------------------------------------

# bn_mul_add_words(r, a, num, w): r[i] += a[i]*w for num words, carry word
# returned in eax.
sub bn_mul_add_words {
    my ($name) = @_;

    &function_begin_B($name, _ia32cap_extern());

    if ($sse2) {
        my ($rp, $ap, $cnt) = ("eax", "edx", "ecx");

        _branch_unless_sse2("maw_non_sse2");

        &mov($rp,  &wparam(0));
        &mov($ap,  &wparam(1));
        &mov($cnt, &wparam(2));
        &movd("mm0", &wparam(3));           # mm0 = w
        &pxor("mm1", "mm1");                # mm1 = carry_in
        &jmp(&label("maw_sse2_entry"));

        # Eight words per iteration; loads and multiplies for later words
        # are interleaved with the carry propagation of earlier ones.
        &set_label("maw_sse2_unrolled", 16);
        &movd("mm3", &DWP(0, $rp, "", 0));  # mm3 = r[0]
        &paddq("mm1", "mm3");               # mm1 = carry_in + r[0]
        &movd("mm2", &DWP(0, $ap, "", 0));  # mm2 = a[0]
        &pmuludq("mm2", "mm0");             # mm2 = w*a[0]
        &movd("mm4", &DWP(4, $ap, "", 0));  # mm4 = a[1]
        &pmuludq("mm4", "mm0");             # mm4 = w*a[1]
        &movd("mm6", &DWP(8, $ap, "", 0));  # mm6 = a[2]
        &pmuludq("mm6", "mm0");             # mm6 = w*a[2]
        &movd("mm7", &DWP(12, $ap, "", 0)); # mm7 = a[3]
        &pmuludq("mm7", "mm0");             # mm7 = w*a[3]
        &paddq("mm1", "mm2");               # mm1 = carry_in + r[0] + w*a[0]
        &movd("mm3", &DWP(4, $rp, "", 0));  # mm3 = r[1]
        &paddq("mm3", "mm4");               # mm3 = r[1] + w*a[1]
        &movd("mm5", &DWP(8, $rp, "", 0));  # mm5 = r[2]
        &paddq("mm5", "mm6");               # mm5 = r[2] + w*a[2]
        &movd("mm4", &DWP(12, $rp, "", 0)); # mm4 = r[3]
        &paddq("mm7", "mm4");               # mm7 = r[3] + w*a[3]
        &movd(&DWP(0, $rp, "", 0), "mm1");  # r[0] = low word
        &movd("mm2", &DWP(16, $ap, "", 0)); # mm2 = a[4]
        &pmuludq("mm2", "mm0");             # mm2 = w*a[4]
        &psrlq("mm1", 32);                  # mm1 = carry0
        &movd("mm4", &DWP(20, $ap, "", 0)); # mm4 = a[5]
        &pmuludq("mm4", "mm0");             # mm4 = w*a[5]
        &paddq("mm1", "mm3");               # mm1 = carry0 + r[1] + w*a[1]
        &movd("mm6", &DWP(24, $ap, "", 0)); # mm6 = a[6]
        &pmuludq("mm6", "mm0");             # mm6 = w*a[6]
        &movd(&DWP(4, $rp, "", 0), "mm1");  # r[1] = low word
        &psrlq("mm1", 32);                  # mm1 = carry1
        &movd("mm3", &DWP(28, $ap, "", 0)); # mm3 = a[7]
        &add($ap, 32);                      # a += 8 words
        &pmuludq("mm3", "mm0");             # mm3 = w*a[7]
        &paddq("mm1", "mm5");               # mm1 = carry1 + r[2] + w*a[2]
        &movd("mm5", &DWP(16, $rp, "", 0)); # mm5 = r[4]
        &paddq("mm2", "mm5");               # mm2 = r[4] + w*a[4]
        &movd(&DWP(8, $rp, "", 0), "mm1");  # r[2] = low word
        &psrlq("mm1", 32);                  # mm1 = carry2
        &paddq("mm1", "mm7");               # mm1 = carry2 + r[3] + w*a[3]
        &movd("mm5", &DWP(20, $rp, "", 0)); # mm5 = r[5]
        &paddq("mm4", "mm5");               # mm4 = r[5] + w*a[5]
        &movd(&DWP(12, $rp, "", 0), "mm1"); # r[3] = low word
        &psrlq("mm1", 32);                  # mm1 = carry3
        &paddq("mm1", "mm2");               # mm1 = carry3 + r[4] + w*a[4]
        &movd("mm5", &DWP(24, $rp, "", 0)); # mm5 = r[6]
        &paddq("mm6", "mm5");               # mm6 = r[6] + w*a[6]
        &movd(&DWP(16, $rp, "", 0), "mm1"); # r[4] = low word
        &psrlq("mm1", 32);                  # mm1 = carry4
        &paddq("mm1", "mm4");               # mm1 = carry4 + r[5] + w*a[5]
        &movd("mm5", &DWP(28, $rp, "", 0)); # mm5 = r[7]
        &paddq("mm3", "mm5");               # mm3 = r[7] + w*a[7]
        &movd(&DWP(20, $rp, "", 0), "mm1"); # r[5] = low word
        &psrlq("mm1", 32);                  # mm1 = carry5
        &paddq("mm1", "mm6");               # mm1 = carry5 + r[6] + w*a[6]
        &movd(&DWP(24, $rp, "", 0), "mm1"); # r[6] = low word
        &psrlq("mm1", 32);                  # mm1 = carry6
        &paddq("mm1", "mm3");               # mm1 = carry6 + r[7] + w*a[7]
        &movd(&DWP(28, $rp, "", 0), "mm1"); # r[7] = low word
        &lea($rp, &DWP(32, $rp));           # r += 8 words
        &psrlq("mm1", 32);                  # mm1 = carry_out

        &sub($cnt, 8);
        &jz(&label("maw_sse2_exit"));
        &set_label("maw_sse2_entry");
        &test($cnt, 0xfffffff8);            # at least 8 words left?
        &jnz(&label("maw_sse2_unrolled"));

        # One word per iteration for the remaining 1..7 words.
        # NOTE(review): this loop body runs before the count is tested, so a
        # call with num == 0 would not terminate after zero iterations the
        # way the integer path below does.  Callers presumably never pass
        # num == 0 - confirm before relying on it.
        &set_label("maw_sse2_loop", 4);
        &movd("mm2", &DWP(0, $ap));         # mm2 = a[i]
        &movd("mm3", &DWP(0, $rp));         # mm3 = r[i]
        &pmuludq("mm2", "mm0");             # a[i] *= w
        &lea($ap, &DWP(4, $ap));
        &paddq("mm1", "mm3");               # carry += r[i]
        &paddq("mm1", "mm2");               # carry += a[i]*w
        &movd(&DWP(0, $rp), "mm1");         # r[i] = carry_low
        &sub($cnt, 1);
        &psrlq("mm1", 32);                  # carry = carry_high
        &lea($rp, &DWP(4, $rp));            # lea: keeps the flags of the sub
        &jnz(&label("maw_sse2_loop"));
        &set_label("maw_sse2_exit");
        &movd("eax", "mm1");                # return carry_out
        &emms();
        &ret();

        &set_label("maw_non_sse2", 16);
    }

    # Integer path.  The prologue is emitted by hand because the function
    # was opened with function_begin_B().
    _push_callee_saved();

    &comment("");
    my ($ap, $w, $rp, $carry) = ("ebx", "ebp", "edi", "esi");

    &xor($carry, $carry);                   # carry = 0
    &mov($rp, &wparam(0));

    &mov("ecx", &wparam(2));                # ecx = num
    &mov($ap, &wparam(1));

    &and("ecx", 0xfffffff8);                # num rounded down to a multiple of 8
    &mov($w, &wparam(3));

    &push("ecx");                           # reserve a temporary stack slot

    &jz(&label("maw_finish"));              # flags still come from the 'and'

    &set_label("maw_loop", 16);

    foreach my $off (map { 4 * $_ } 0 .. 7) {
        &comment("Round $off");

        &mov("eax", &DWP($off, $ap));       # eax = a[i]
        &mul($w);                           # edx:eax = a[i] * w
        &add("eax", $carry);                # low  += carry
        &adc("edx", 0);                     # high += CF
        &add("eax", &DWP($off, $rp));       # low  += r[i]
        &adc("edx", 0);                     # high += CF
        &mov(&DWP($off, $rp), "eax");       # r[i]  = low
        &mov($carry, "edx");                # carry = high
    }

    &comment("");
    &sub("ecx", 8);
    &lea($ap, &DWP(32, $ap));               # lea: keeps the flags of the sub
    &lea($rp, &DWP(32, $rp));
    &jnz(&label("maw_loop"));

    &set_label("maw_finish", 0);
    &mov("ecx", &wparam(2));                # reload num
    &and("ecx", 7);                         # words left over (0..7)
    &jnz(&label("maw_finish2"));            # helps branch prediction
    &jmp(&label("maw_end"));

    &set_label("maw_finish2", 1);
    foreach my $k (0 .. 6) {
        my $disp = $k * 4;
        my $more = ($k != 6);

        &comment("Tail Round $k");
        &mov("eax", &DWP($disp, $ap));      # eax = a[i]
        &mul($w);                           # edx:eax = a[i] * w
        &add("eax", $carry);                # low  += carry
        &adc("edx", 0);                     # high += CF
        &add("eax", &DWP($disp, $rp));      # low  += r[i]
        &adc("edx", 0);                     # high += CF
        &dec("ecx") if $more;               # ZF consumed by the jz below
        &mov(&DWP($disp, $rp), "eax");      # r[i]  = low
        &mov($carry, "edx");                # carry = high
        &jz(&label("maw_end")) if $more;
    }
    &set_label("maw_end", 0);
    &mov("eax", $carry);                    # return value

    &pop("ecx");                            # drop the temporary stack slot

    &function_end($name);
    return;
}

# bn_mul_words(r, a, num, w): r[i] = a[i]*w for num words, carry word
# returned in eax.
sub bn_mul_words {
    my ($name) = @_;

    &function_begin_B($name, _ia32cap_extern());

    if ($sse2) {
        my ($rp, $ap, $cnt) = ("eax", "edx", "ecx");

        _branch_unless_sse2("mw_non_sse2");

        &mov($rp,  &wparam(0));
        &mov($ap,  &wparam(1));
        &mov($cnt, &wparam(2));
        &movd("mm0", &wparam(3));           # mm0 = w
        &pxor("mm1", "mm1");                # mm1 = carry = 0

        # NOTE(review): the count is only tested after the first iteration,
        # so num == 0 is not handled here (the integer path does handle
        # it).  Presumably callers never pass 0 - confirm.
        &set_label("mw_sse2_loop", 16);
        &movd("mm2", &DWP(0, $ap));         # mm2 = a[i]
        &pmuludq("mm2", "mm0");             # a[i] *= w
        &lea($ap, &DWP(4, $ap));
        &paddq("mm1", "mm2");               # carry += a[i]*w
        &movd(&DWP(0, $rp), "mm1");         # r[i] = carry_low
        &sub($cnt, 1);
        &psrlq("mm1", 32);                  # carry = carry_high
        &lea($rp, &DWP(4, $rp));            # lea: keeps the flags of the sub
        &jnz(&label("mw_sse2_loop"));

        &movd("eax", "mm1");                # return carry
        &emms();
        &ret();
        &set_label("mw_non_sse2", 16);
    }

    # Integer path; prologue by hand (see bn_mul_add_words).
    _push_callee_saved();

    &comment("");
    my ($ap, $w, $rp, $carry, $num) = ("ebx", "ecx", "edi", "esi", "ebp");

    &xor($carry, $carry);                   # carry = 0
    &mov($rp,  &wparam(0));
    &mov($ap,  &wparam(1));
    &mov($num, &wparam(2));
    &mov($w,   &wparam(3));

    &and($num, 0xfffffff8);                 # num rounded down to a multiple of 8
    &jz(&label("mw_finish"));

    &set_label("mw_loop", 0);
    foreach my $off (map { 4 * $_ } 0 .. 7) {
        &comment("Round $off");

        &mov("eax", &DWP($off, $ap, "", 0));    # eax = a[i]
        &mul($w);                               # edx:eax = a[i] * w
        &add("eax", $carry);                    # low  += carry
        &adc("edx", 0);                         # high += CF
        &mov(&DWP($off, $rp, "", 0), "eax");    # r[i]  = low
        &mov($carry, "edx");                    # carry = high
    }

    &comment("");
    &add($ap, 32);
    &add($rp, 32);
    &sub($num, 8);
    &jz(&label("mw_finish"));
    &jmp(&label("mw_loop"));

    &set_label("mw_finish", 0);
    &mov($num, &wparam(2));                 # reload num
    &and($num, 7);                          # words left over (0..7)
    &jnz(&label("mw_finish2"));
    &jmp(&label("mw_end"));

    &set_label("mw_finish2", 1);
    foreach my $k (0 .. 6) {
        my $disp = $k * 4;
        my $more = ($k != 6);

        &comment("Tail Round $k");
        &mov("eax", &DWP($disp, $ap, "", 0));   # eax = a[i]
        &mul($w);                               # edx:eax = a[i] * w
        &add("eax", $carry);                    # low  += carry
        &adc("edx", 0);                         # high += CF
        &mov(&DWP($disp, $rp, "", 0), "eax");   # r[i]  = low
        &mov($carry, "edx");                    # carry = high
        &dec($num) if $more;
        &jz(&label("mw_end")) if $more;
    }
    &set_label("mw_end", 0);
    &mov("eax", $carry);                    # return value

    &function_end($name);
    return;
}

# bn_sqr_words(r, a, num): for each of the num input words store the 64-bit
# square a[i]*a[i] into r[2i] (low) and r[2i+1] (high).
sub bn_sqr_words {
    my ($name) = @_;

    &function_begin_B($name, _ia32cap_extern());

    if ($sse2) {
        my ($rp, $ap, $cnt) = ("eax", "edx", "ecx");

        _branch_unless_sse2("sqr_non_sse2");

        &mov($rp,  &wparam(0));
        &mov($ap,  &wparam(1));
        &mov($cnt, &wparam(2));

        # NOTE(review): as in the other SSE2 loops, num == 0 is not tested
        # before the first iteration - presumably never passed; confirm.
        &set_label("sqr_sse2_loop", 16);
        &movd("mm0", &DWP(0, $ap));         # mm0 = a[i]
        &pmuludq("mm0", "mm0");             # mm0 = a[i]*a[i]
        &lea($ap, &DWP(4, $ap));            # a++
        &movq(&QWP(0, $rp), "mm0");         # r[2i],r[2i+1] = a[i]*a[i]
        &sub($cnt, 1);
        &lea($rp, &DWP(8, $rp));            # r += 2 (lea keeps the flags)
        &jnz(&label("sqr_sse2_loop"));

        &emms();
        &ret();
        &set_label("sqr_non_sse2", 16);
    }

    # Integer path; prologue by hand (see bn_mul_add_words).
    _push_callee_saved();

    &comment("");
    my ($rp, $ap, $num) = ("esi", "edi", "ebx");

    &mov($rp,  &wparam(0));
    &mov($ap,  &wparam(1));
    &mov($num, &wparam(2));

    &and($num, 0xfffffff8);                 # num rounded down to a multiple of 8
    &jz(&label("sw_finish"));

    &set_label("sw_loop", 0);
    foreach my $off (map { 4 * $_ } 0 .. 7) {
        &comment("Round $off");
        &mov("eax", &DWP($off, $ap, "", 0));            # eax = a[i]
        &mul("eax");                                    # edx:eax = a[i]*a[i]
        &mov(&DWP($off * 2, $rp, "", 0), "eax");        # r[2i]   = low
        &mov(&DWP($off * 2 + 4, $rp, "", 0), "edx");    # r[2i+1] = high
    }

    &comment("");
    &add($ap, 32);
    &add($rp, 64);
    &sub($num, 8);
    &jnz(&label("sw_loop"));

    &set_label("sw_finish", 0);
    &mov($num, &wparam(2));                 # reload num
    &and($num, 7);                          # words left over (0..7)
    &jz(&label("sw_end"));

    foreach my $k (0 .. 6) {
        my $more = ($k != 6);

        &comment("Tail Round $k");
        &mov("eax", &DWP($k * 4, $ap, "", 0));          # eax = a[i]
        &mul("eax");                                    # edx:eax = a[i]*a[i]
        &mov(&DWP($k * 8, $rp, "", 0), "eax");          # r[2i]   = low
        &dec($num) if $more;                            # ZF used by jz below
        &mov(&DWP($k * 8 + 4, $rp, "", 0), "edx");      # r[2i+1] = high
        &jz(&label("sw_end")) if $more;
    }
    &set_label("sw_end", 0);

    &function_end($name);
    return;
}

# bn_div_words(h, l, d): loads edx:eax = h:l and divides by d with a single
# 'div'; the quotient is left in eax as the return value.  No registers are
# saved, so the function is bracketed with function_begin_B/function_end_B.
sub bn_div_words {
    my ($name) = @_;

    &function_begin_B($name, "");
    &mov("edx", &wparam(0));                # high word of the dividend
    &mov("eax", &wparam(1));                # low word of the dividend
    &mov("ecx", &wparam(2));                # divisor
    &div("ecx");
    &ret();
    &function_end_B($name);
    return;
}

# bn_add_words(r, a, b, num): r[i] = a[i] + b[i] with carry propagation;
# the final carry is returned in eax.
sub bn_add_words {
    my ($name) = @_;

    _emit_addsub_words($name, \&_op_add);
    return;
}

# bn_sub_words(r, a, b, num): r[i] = a[i] - b[i] with borrow propagation;
# the final borrow is returned in eax.
sub bn_sub_words {
    my ($name) = @_;

    _emit_addsub_words($name, \&_op_sub);
    return;
}

# bn_sub_part_words(r, a, b, num, dl): subtract the first num words exactly
# like bn_sub_words, then process |dl| further words:
#   dl < 0:  r[i] = 0 - b[i] - borrow
#   dl > 0:  r[i] = a[i] - borrow; as soon as the borrow is gone the code
#            switches to a plain copy of the remaining words of a
# The final borrow is returned in eax (the carry register itself).
sub bn_sub_part_words {
    my ($name) = @_;
    my ($rp, $ap, $bp, $carry, $tmp1, $tmp2, $num) = _addsub_regs();
    my $sub = \&_op_sub;

    _addsub_bulk($name, $sub);

    # Leftover words of the common part.  Unlike bn_sub_words the pointers
    # are advanced after every word, because the dl sections below continue
    # from wherever this part stopped.
    foreach my $k (0 .. 6) {
        my $more = ($k != 6);

        &comment("Tail Round $k");
        _carry_step($sub, &DWP(0, $ap, "", 0), &DWP(0, $bp, "", 0));
        &mov(&DWP(0, $rp, "", 0), $tmp1);
        &add($ap, 4);
        &add($bp, 4);
        &add($rp, 4);
        &dec($num) if $more;
        &jz(&label("aw_end")) if $more;
    }
    &set_label("aw_end", 0);

    &cmp(&wparam(4), 0);
    &je(&label("pw_end"));

    &mov($num, &wparam(4));                 # num = dl
    &cmp($num, 0);
    &je(&label("pw_end"));
    &jge(&label("pw_pos"));

    # ---- dl < 0: only b has further words ----------------------------
    &comment("pw_neg");
    &mov($tmp2, 0);
    &sub($tmp2, $num);
    &mov($num, $tmp2);                      # num = -dl
    &and($num, 0xfffffff8);                 # rounded down to a multiple of 8
    &jz(&label("pw_neg_finish"));

    &set_label("pw_neg_loop", 0);
    foreach my $k (0 .. 7) {
        &comment("dl<0 Round $k");

        _carry_step($sub, 0, &DWP($k * 4, $bp, "", 0));
        &mov(&DWP($k * 4, $rp, "", 0), $tmp1);
    }

    &comment("");
    &add($bp, 32);
    &add($rp, 32);
    &sub($num, 8);
    &jnz(&label("pw_neg_loop"));

    &set_label("pw_neg_finish", 0);
    &mov($tmp2, &wparam(4));                # reload dl
    &mov($num, 0);
    &sub($num, $tmp2);                      # num = -dl
    &and($num, 7);                          # words left over (0..7)
    &jz(&label("pw_end"));

    foreach my $k (0 .. 6) {
        my $more = ($k != 6);

        &comment("dl<0 Tail Round $k");
        _carry_step($sub, 0, &DWP($k * 4, $bp, "", 0));
        &dec($num) if $more;
        &mov(&DWP($k * 4, $rp, "", 0), $tmp1);
        &jz(&label("pw_end")) if $more;
    }

    &jmp(&label("pw_end"));

    # ---- dl > 0: only a has further words ----------------------------
    &set_label("pw_pos", 0);

    &and($num, 0xfffffff8);                 # dl rounded down to a multiple of 8
    &jz(&label("pw_pos_finish"));

    &set_label("pw_pos_loop", 0);

    # While a borrow is pending: r[i] = a[i] - borrow.  When a subtraction
    # produces no borrow, jump into the matching position of the plain
    # copy loop further down (label pw_nc<i>).
    foreach my $k (0 .. 7) {
        &comment("dl>0 Round $k");

        &mov($tmp1, &DWP($k * 4, $ap, "", 0));
        &sub($tmp1, $carry);
        &mov(&DWP($k * 4, $rp, "", 0), $tmp1);
        &jnc(&label("pw_nc" . $k));
    }

    &comment("");
    &add($ap, 32);
    &add($rp, 32);
    &sub($num, 8);
    &jnz(&label("pw_pos_loop"));

    &set_label("pw_pos_finish", 0);
    &mov($num, &wparam(4));                 # reload dl
    &and($num, 7);                          # words left over (0..7)
    &jz(&label("pw_end"));

    foreach my $k (0 .. 6) {
        my $more = ($k != 6);

        &comment("dl>0 Tail Round $k");
        &mov($tmp1, &DWP($k * 4, $ap, "", 0));
        &sub($tmp1, $carry);
        &mov(&DWP($k * 4, $rp, "", 0), $tmp1);
        &jnc(&label("pw_tail_nc" . $k));
        &dec($num) if $more;
        &jz(&label("pw_end")) if $more;
    }
    &mov($carry, 1);                        # borrow survived every word
    &jmp(&label("pw_end"));

    # Plain copy r[i] = a[i], entered through the pw_nc<i> labels once the
    # borrow has been absorbed.  Each label sits *after* the copy of word i
    # because that word was already stored by the borrow loop.
    &set_label("pw_nc_loop", 0);
    foreach my $k (0 .. 7) {
        &mov($tmp1, &DWP($k * 4, $ap, "", 0));
        &mov(&DWP($k * 4, $rp, "", 0), $tmp1);
        &set_label("pw_nc" . $k, 0);
    }

    &comment("");
    &add($ap, 32);
    &add($rp, 32);
    &sub($num, 8);
    &jnz(&label("pw_nc_loop"));

    &mov($num, &wparam(4));                 # reload dl
    &and($num, 7);                          # words left over (0..7)
    &jz(&label("pw_nc_end"));

    foreach my $k (0 .. 6) {
        my $more = ($k != 6);

        &mov($tmp1, &DWP($k * 4, $ap, "", 0));
        &mov(&DWP($k * 4, $rp, "", 0), $tmp1);
        &set_label("pw_tail_nc" . $k, 0);
        &dec($num) if $more;
        &jz(&label("pw_nc_end")) if $more;
    }

    &set_label("pw_nc_end", 0);
    &mov($carry, 0);                        # no borrow to report

    &set_label("pw_end", 0);

    # The return value is already in place: the carry register is eax.

    &function_end($name);
    return;
}