#!/usr/local/bin/perl
#
# Generator for x86 assembly implementations of the BIGNUM word primitives
#
#	bn_mul_add_words  bn_mul_words  bn_sqr_words  bn_div_words
#	bn_add_words      bn_sub_words  bn_sub_part_words
#
# All instruction mnemonics and helpers used below (&mov, &DWP, &wparam,
# &label, &function_begin, ...) are provided by x86asm.pl, which is loaded
# from this script's directory or from ../../perlasm relative to it.
#
# $ARGV[0] is handed to &asm_init() together with the script name.  If any
# argument contains -DOPENSSL_IA32_SSE2, the multiply/square routines get
# an additional SSE2 code path that is selected at run time by testing
# \$IA32CAP_BIT0_SSE2 in OPENSSL_ia32cap_P.

# Directory part of $0 including the trailing separator; empty when the
# script was started without a directory component (guards against using
# $1 from a failed match).
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

&asm_init($ARGV[0],$0);

$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }

&external_label("OPENSSL_ia32cap_P") if ($sse2);

&bn_mul_add_words("bn_mul_add_words");
&bn_mul_words("bn_mul_words");
&bn_sqr_words("bn_sqr_words");
&bn_div_words("bn_div_words");
&bn_add_words("bn_add_words");
&bn_sub_words("bn_sub_words");
&bn_sub_part_words("bn_sub_part_words");

&asm_finish();

# bn_mul_add_words($name)
#
# Emits function $name.  Stack arguments, in order: r, a, num, w.
# For each of the num words: r[i] += a[i]*w + carry; the final carry is
# left in eax.
#
# With $sse2 set, an MMX/SSE2 path (8-way unrolled plus a one-word loop)
# is emitted first; it is skipped at run time when the SSE2 capability bit
# is clear.  NOTE(review): the one-word SSE2 loop decrements before it
# tests, so it does not special-case num == 0 -- confirm callers never
# pass 0.
sub bn_mul_add_words
	{
	local($name)=@_;

	&function_begin_B($name,"");

	$r="eax";
	$a="edx";
	$c="ecx";

	if ($sse2) {
		&picsetup("eax");
		&picsymbol("eax", "OPENSSL_ia32cap_P", "eax");
		&bt(&DWP(0,"eax"),"\$IA32CAP_BIT0_SSE2");
		&jnc(&label("maw_non_sse2"));

		&mov($r,&wparam(0));
		&mov($a,&wparam(1));
		&mov($c,&wparam(2));
		&movd("mm0",&wparam(3));	# mm0 = w
		&pxor("mm1","mm1");		# mm1 = carry_in
		&jmp(&label("maw_sse2_entry"));

	&set_label("maw_sse2_unrolled",16);
		&movd("mm3",&DWP(0,$r,"",0));	# mm3 = r[0]
		&paddq("mm1","mm3");		# mm1 = carry_in + r[0]
		&movd("mm2",&DWP(0,$a,"",0));	# mm2 = a[0]
		&pmuludq("mm2","mm0");		# mm2 = w*a[0]
		&movd("mm4",&DWP(4,$a,"",0));	# mm4 = a[1]
		&pmuludq("mm4","mm0");		# mm4 = w*a[1]
		&movd("mm6",&DWP(8,$a,"",0));	# mm6 = a[2]
		&pmuludq("mm6","mm0");		# mm6 = w*a[2]
		&movd("mm7",&DWP(12,$a,"",0));	# mm7 = a[3]
		&pmuludq("mm7","mm0");		# mm7 = w*a[3]
		&paddq("mm1","mm2");		# mm1 = carry_in + r[0] + w*a[0]
		&movd("mm3",&DWP(4,$r,"",0));	# mm3 = r[1]
		&paddq("mm3","mm4");		# mm3 = r[1] + w*a[1]
		&movd("mm5",&DWP(8,$r,"",0));	# mm5 = r[2]
		&paddq("mm5","mm6");		# mm5 = r[2] + w*a[2]
		&movd("mm4",&DWP(12,$r,"",0));	# mm4 = r[3]
		&paddq("mm7","mm4");		# mm7 = r[3] + w*a[3]
		&movd(&DWP(0,$r,"",0),"mm1");
		&movd("mm2",&DWP(16,$a,"",0));	# mm2 = a[4]
		&pmuludq("mm2","mm0");		# mm2 = w*a[4]
		&psrlq("mm1",32);		# mm1 = carry0
		&movd("mm4",&DWP(20,$a,"",0));	# mm4 = a[5]
		&pmuludq("mm4","mm0");		# mm4 = w*a[5]
		&paddq("mm1","mm3");		# mm1 = carry0 + r[1] + w*a[1]
		&movd("mm6",&DWP(24,$a,"",0));	# mm6 = a[6]
		&pmuludq("mm6","mm0");		# mm6 = w*a[6]
		&movd(&DWP(4,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry1
		&movd("mm3",&DWP(28,$a,"",0));	# mm3 = a[7]
		&add($a,32);
		&pmuludq("mm3","mm0");		# mm3 = w*a[7]
		&paddq("mm1","mm5");		# mm1 = carry1 + r[2] + w*a[2]
		&movd("mm5",&DWP(16,$r,"",0));	# mm5 = r[4]
		&paddq("mm2","mm5");		# mm2 = r[4] + w*a[4]
		&movd(&DWP(8,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry2
		&paddq("mm1","mm7");		# mm1 = carry2 + r[3] + w*a[3]
		&movd("mm5",&DWP(20,$r,"",0));	# mm5 = r[5]
		&paddq("mm4","mm5");		# mm4 = r[5] + w*a[5]
		&movd(&DWP(12,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry3
		&paddq("mm1","mm2");		# mm1 = carry3 + r[4] + w*a[4]
		&movd("mm5",&DWP(24,$r,"",0));	# mm5 = r[6]
		&paddq("mm6","mm5");		# mm6 = r[6] + w*a[6]
		&movd(&DWP(16,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry4
		&paddq("mm1","mm4");		# mm1 = carry4 + r[5] + w*a[5]
		&movd("mm5",&DWP(28,$r,"",0));	# mm5 = r[7]
		&paddq("mm3","mm5");		# mm3 = r[7] + w*a[7]
		&movd(&DWP(20,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry5
		&paddq("mm1","mm6");		# mm1 = carry5 + r[6] + w*a[6]
		&movd(&DWP(24,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry6
		&paddq("mm1","mm3");		# mm1 = carry6 + r[7] + w*a[7]
		&movd(&DWP(28,$r,"",0),"mm1");
		&lea($r,&DWP(32,$r));
		&psrlq("mm1",32);		# mm1 = carry_out

		&sub($c,8);
		&jz(&label("maw_sse2_exit"));
	&set_label("maw_sse2_entry");
		&test($c,0xfffffff8);		# at least 8 words left?
		&jnz(&label("maw_sse2_unrolled"));

	&set_label("maw_sse2_loop",4);
		&movd("mm2",&DWP(0,$a));	# mm2 = a[i]
		&movd("mm3",&DWP(0,$r));	# mm3 = r[i]
		&pmuludq("mm2","mm0");		# a[i] *= w
		&lea($a,&DWP(4,$a));
		&paddq("mm1","mm3");		# carry += r[i]
		&paddq("mm1","mm2");		# carry += a[i]*w
		&movd(&DWP(0,$r),"mm1");	# r[i] = carry_low
		&sub($c,1);
		&psrlq("mm1",32);		# carry = carry_high
		&lea($r,&DWP(4,$r));
		&jnz(&label("maw_sse2_loop"));
	&set_label("maw_sse2_exit");
		&movd("eax","mm1");		# c = carry_out
		&emms();
		&ret();

	&set_label("maw_non_sse2",16);
	}

	# function_begin prologue
	&push("ebp");
	&push("ebx");
	&push("esi");
	&push("edi");

	&comment("");
	$Low="eax";
	$High="edx";
	$a="ebx";
	$w="ebp";
	$r="edi";
	$c="esi";

	&xor($c,$c);			# clear carry
	&mov($r,&wparam(0));		#

	&mov("ecx",&wparam(2));		#
	&mov($a,&wparam(1));		#

	&and("ecx",0xfffffff8);		# num / 8
	&mov($w,&wparam(3));		#

	&push("ecx");			# Up the stack for a tmp variable

	# Branches on the flags left by the "and" above; the mov and push in
	# between do not modify them.
	&jz(&label("maw_finish"));

	&set_label("maw_loop",16);

	for ($i=0; $i<32; $i+=4)
		{
		&comment("Round $i");

		&mov("eax",&DWP($i,$a));	# *a
		&mul($w);			# *a * w
		&add("eax",$c);			# L(t)+= c
		&adc("edx",0);			# H(t)+=carry
		&add("eax",&DWP($i,$r));	# L(t)+= *r
		&adc("edx",0);			# H(t)+=carry
		&mov(&DWP($i,$r),"eax");	# *r= L(t);
		&mov($c,"edx");			# c= H(t);
		}

	&comment("");
	&sub("ecx",8);
	&lea($a,&DWP(32,$a));
	&lea($r,&DWP(32,$r));
	&jnz(&label("maw_loop"));

	&set_label("maw_finish",0);
	&mov("ecx",&wparam(2));		# get num
	&and("ecx",7);
	&jnz(&label("maw_finish2"));	# helps branch prediction
	&jmp(&label("maw_end"));

	&set_label("maw_finish2",1);
	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		&mov("eax",&DWP($i*4,$a));	# *a
		&mul($w);			# *a * w
		&add("eax",$c);			# L(t)+=c
		&adc("edx",0);			# H(t)+=carry
		&add("eax",&DWP($i*4,$r));	# L(t)+= *r
		&adc("edx",0);			# H(t)+=carry
		&dec("ecx") if ($i != 7-1);
		&mov(&DWP($i*4,$r),"eax");	# *r= L(t);
		&mov($c,"edx");			# c= H(t);
		&jz(&label("maw_end")) if ($i != 7-1);
		}
	&set_label("maw_end",0);
	&mov("eax",$c);

	&pop("ecx");			# drop the tmp variable pushed above

	&function_end($name);
	}

# bn_mul_words($name)
#
# Emits function $name.  Stack arguments, in order: r, a, num, w.
# For each of the num words: r[i] = low word of (a[i]*w + carry); the
# final carry is left in eax.  An optional SSE2 path is emitted when
# $sse2 is set.  NOTE(review): the SSE2 loop decrements before testing
# and so does not special-case num == 0.
sub bn_mul_words
	{
	local($name)=@_;

	&function_begin_B($name,"");

	$r="eax";
	$a="edx";
	$c="ecx";

	if ($sse2) {
		&picsetup("eax");
		&picsymbol("eax", "OPENSSL_ia32cap_P", "eax");
		&bt(&DWP(0,"eax"),"\$IA32CAP_BIT0_SSE2");
		&jnc(&label("mw_non_sse2"));

		&mov($r,&wparam(0));
		&mov($a,&wparam(1));
		&mov($c,&wparam(2));
		&movd("mm0",&wparam(3));	# mm0 = w
		&pxor("mm1","mm1");		# mm1 = carry = 0

	&set_label("mw_sse2_loop",16);
		&movd("mm2",&DWP(0,$a));	# mm2 = a[i]
		&pmuludq("mm2","mm0");		# a[i] *= w
		&lea($a,&DWP(4,$a));
		&paddq("mm1","mm2");		# carry += a[i]*w
		&movd(&DWP(0,$r),"mm1");	# r[i] = carry_low
		&sub($c,1);
		&psrlq("mm1",32);		# carry = carry_high
		&lea($r,&DWP(4,$r));
		&jnz(&label("mw_sse2_loop"));

		&movd("eax","mm1");		# return carry
		&emms();
		&ret();
	&set_label("mw_non_sse2",16);
	}

	# function_begin prologue
	&push("ebp");
	&push("ebx");
	&push("esi");
	&push("edi");

	&comment("");
	$Low="eax";
	$High="edx";
	$a="ebx";
	$w="ecx";
	$r="edi";
	$c="esi";
	$num="ebp";

	&xor($c,$c);			# clear carry
	&mov($r,&wparam(0));		#
	&mov($a,&wparam(1));		#
	&mov($num,&wparam(2));		#
	&mov($w,&wparam(3));		#

	&and($num,0xfffffff8);		# num / 8
	&jz(&label("mw_finish"));

	&set_label("mw_loop",0);
	for ($i=0; $i<32; $i+=4)
		{
		&comment("Round $i");

		&mov("eax",&DWP($i,$a,"",0));	# *a
		&mul($w);			# *a * w
		&add("eax",$c);			# L(t)+=c
		# XXX

		&adc("edx",0);			# H(t)+=carry
		&mov(&DWP($i,$r,"",0),"eax");	# *r= L(t);

		&mov($c,"edx");			# c= H(t);
		}

	&comment("");
	&add($a,32);
	&add($r,32);
	&sub($num,8);
	&jz(&label("mw_finish"));
	&jmp(&label("mw_loop"));

	&set_label("mw_finish",0);
	&mov($num,&wparam(2));		# get num
	&and($num,7);
	&jnz(&label("mw_finish2"));
	&jmp(&label("mw_end"));

	&set_label("mw_finish2",1);
	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		&mov("eax",&DWP($i*4,$a,"",0));	# *a
		&mul($w);			# *a * w
		&add("eax",$c);			# L(t)+=c
		# XXX
		&adc("edx",0);			# H(t)+=carry
		&mov(&DWP($i*4,$r,"",0),"eax");	# *r= L(t);
		&mov($c,"edx");			# c= H(t);
		&dec($num) if ($i != 7-1);
		&jz(&label("mw_end")) if ($i != 7-1);
		}
	&set_label("mw_end",0);
	&mov("eax",$c);

	&function_end($name);
	}

# bn_sqr_words($name)
#
# Emits function $name.  Stack arguments, in order: r, a, num.
# For each of the num input words the 64-bit square a[i]*a[i] is stored
# as two words at r[2*i] (low) and r[2*i+1] (high).  An optional SSE2
# path is emitted when $sse2 is set.  NOTE(review): the SSE2 loop
# decrements before testing and so does not special-case num == 0.
sub bn_sqr_words
	{
	local($name)=@_;

	&function_begin_B($name,"");

	$r="eax";
	$a="edx";
	$c="ecx";

	if ($sse2) {
		&picsetup("eax");
		&picsymbol("eax", "OPENSSL_ia32cap_P", "eax");
		&bt(&DWP(0,"eax"),"\$IA32CAP_BIT0_SSE2");
		&jnc(&label("sqr_non_sse2"));

		&mov($r,&wparam(0));
		&mov($a,&wparam(1));
		&mov($c,&wparam(2));

	&set_label("sqr_sse2_loop",16);
		&movd("mm0",&DWP(0,$a));	# mm0 = a[i]
		&pmuludq("mm0","mm0");		# a[i] *= a[i]
		&lea($a,&DWP(4,$a));		# a++
		&movq(&QWP(0,$r),"mm0");	# r[i] = a[i]*a[i]
		&sub($c,1);
		&lea($r,&DWP(8,$r));		# r += 2
		&jnz(&label("sqr_sse2_loop"));

		&emms();
		&ret();
	&set_label("sqr_non_sse2",16);
	}

	# function_begin prologue
	&push("ebp");
	&push("ebx");
	&push("esi");
	&push("edi");

	&comment("");
	$r="esi";
	$a="edi";
	$num="ebx";

	&mov($r,&wparam(0));		#
	&mov($a,&wparam(1));		#
	&mov($num,&wparam(2));		#

	&and($num,0xfffffff8);		# num / 8
	&jz(&label("sw_finish"));

	&set_label("sw_loop",0);
	for ($i=0; $i<32; $i+=4)
		{
		&comment("Round $i");
		&mov("eax",&DWP($i,$a,"",0));		# *a
		# XXX
		&mul("eax");				# *a * *a
		&mov(&DWP($i*2,$r,"",0),"eax");		#
		&mov(&DWP($i*2+4,$r,"",0),"edx");	#
		}

	&comment("");
	&add($a,32);
	&add($r,64);
	&sub($num,8);
	&jnz(&label("sw_loop"));

	&set_label("sw_finish",0);
	&mov($num,&wparam(2));		# get num
	&and($num,7);
	&jz(&label("sw_end"));

	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		&mov("eax",&DWP($i*4,$a,"",0));		# *a
		# XXX
		&mul("eax");				# *a * *a
		&mov(&DWP($i*8,$r,"",0),"eax");		#
		&dec($num) if ($i != 7-1);
		&mov(&DWP($i*8+4,$r,"",0),"edx");
		&jz(&label("sw_end")) if ($i != 7-1);
		}
	&set_label("sw_end",0);

	&function_end($name);
	}

# bn_div_words($name)
#
# Emits function $name: loads the first stack argument into edx, the
# second into eax and the third into ecx, then executes "div ecx" and
# returns (32-bit div leaves the quotient in eax).
sub bn_div_words
	{
	local($name)=@_;

	&function_begin_B($name,"");
	&mov("edx",&wparam(0));		#
	&mov("eax",&wparam(1));		#
	&mov("ecx",&wparam(2));		#
	&div("ecx");
	&ret();
	&function_end_B($name);
	}

# bn_add_words($name)
#
# Emits function $name.  Stack arguments, in order: r, a, b, num.
# For each of the num words: r[i] = a[i] + b[i] + carry.  The running
# carry is kept in eax, so it is also what remains in eax at the end.
sub bn_add_words
	{
	local($name)=@_;

	&function_begin($name,"");

	&comment("");
	$a="esi";
	$b="edi";
	$c="eax";
	$r="ebx";
	$tmp1="ecx";
	$tmp2="edx";
	$num="ebp";

	&mov($r,&wparam(0));	# get r
	&mov($a,&wparam(1));	# get a
	&mov($b,&wparam(2));	# get b
	&mov($num,&wparam(3));	# get num
	&xor($c,$c);		# clear carry
	&and($num,0xfffffff8);	# num / 8

	&jz(&label("aw_finish"));

	&set_label("aw_loop",0);
	for ($i=0; $i<8; $i++)
		{
		&comment("Round $i");

		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
		&mov($tmp2,&DWP($i*4,$b,"",0));	# *b
		&add($tmp1,$c);
		&mov($c,0);
		&adc($c,$c);			# c = carry out of (*a + c)
		&add($tmp1,$tmp2);
		&adc($c,0);			# c += carry out of (+ *b)
		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
		}

	&comment("");
	&add($a,32);
	&add($b,32);
	&add($r,32);
	&sub($num,8);
	&jnz(&label("aw_loop"));

	&set_label("aw_finish",0);
	&mov($num,&wparam(3));	# get num
	&and($num,7);
	&jz(&label("aw_end"));

	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
		&mov($tmp2,&DWP($i*4,$b,"",0));	# *b
		&add($tmp1,$c);
		&mov($c,0);
		&adc($c,$c);
		&add($tmp1,$tmp2);
		&adc($c,0);
		&dec($num) if ($i != 6);
		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
		&jz(&label("aw_end")) if ($i != 6);
		}
	&set_label("aw_end",0);

#	&mov("eax",$c);		# $c is "eax"

	&function_end($name);
	}

# bn_sub_words($name)
#
# Emits function $name.  Stack arguments, in order: r, a, b, num.
# For each of the num words: r[i] = a[i] - b[i] - borrow.  The running
# borrow is kept in eax, so it is also what remains in eax at the end.
sub bn_sub_words
	{
	local($name)=@_;

	&function_begin($name,"");

	&comment("");
	$a="esi";
	$b="edi";
	$c="eax";
	$r="ebx";
	$tmp1="ecx";
	$tmp2="edx";
	$num="ebp";

	&mov($r,&wparam(0));	# get r
	&mov($a,&wparam(1));	# get a
	&mov($b,&wparam(2));	# get b
	&mov($num,&wparam(3));	# get num
	&xor($c,$c);		# clear carry
	&and($num,0xfffffff8);	# num / 8

	&jz(&label("aw_finish"));

	&set_label("aw_loop",0);
	for ($i=0; $i<8; $i++)
		{
		&comment("Round $i");

		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
		&mov($tmp2,&DWP($i*4,$b,"",0));	# *b
		&sub($tmp1,$c);
		&mov($c,0);
		&adc($c,$c);			# c = borrow out of (*a - c)
		&sub($tmp1,$tmp2);
		&adc($c,0);			# c += borrow out of (- *b)
		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
		}

	&comment("");
	&add($a,32);
	&add($b,32);
	&add($r,32);
	&sub($num,8);
	&jnz(&label("aw_loop"));

	&set_label("aw_finish",0);
	&mov($num,&wparam(3));	# get num
	&and($num,7);
	&jz(&label("aw_end"));

	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
		&mov($tmp2,&DWP($i*4,$b,"",0));	# *b
		&sub($tmp1,$c);
		&mov($c,0);
		&adc($c,$c);
		&sub($tmp1,$tmp2);
		&adc($c,0);
		&dec($num) if ($i != 6);
		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
		&jz(&label("aw_end")) if ($i != 6);
		}
	&set_label("aw_end",0);

#	&mov("eax",$c);		# $c is "eax"

	&function_end($name);
	}

# bn_sub_part_words($name)
#
# Emits function $name.  Stack arguments, in order: r, a, b, num, dl.
#
# Phase 1 subtracts b from a over num words exactly like bn_sub_words,
# except that the tail rounds advance the a/b/r pointers so that they
# point past the processed words afterwards.
#
# Phase 2 handles |dl| further words, depending on the sign of dl:
#   dl == 0: nothing more to do.
#   dl <  0: r[i] = 0 - b[i] - borrow	(only b is read)
#   dl >  0: r[i] = a[i] - borrow	(only a is read); as soon as a round
#	     produces no borrow, control jumps into the pw_nc copy loop at
#	     the matching position and the remaining words are copied.
# The running borrow lives in eax throughout.
sub bn_sub_part_words
	{
	local($name)=@_;

	&function_begin($name,"");

	&comment("");
	$a="esi";
	$b="edi";
	$c="eax";
	$r="ebx";
	$tmp1="ecx";
	$tmp2="edx";
	$num="ebp";

	&mov($r,&wparam(0));	# get r
	&mov($a,&wparam(1));	# get a
	&mov($b,&wparam(2));	# get b
	&mov($num,&wparam(3));	# get num
	&xor($c,$c);		# clear carry
	&and($num,0xfffffff8);	# num / 8

	&jz(&label("aw_finish"));

	&set_label("aw_loop",0);
	for ($i=0; $i<8; $i++)
		{
		&comment("Round $i");

		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
		&mov($tmp2,&DWP($i*4,$b,"",0));	# *b
		&sub($tmp1,$c);
		&mov($c,0);
		&adc($c,$c);
		&sub($tmp1,$tmp2);
		&adc($c,0);
		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
		}

	&comment("");
	&add($a,32);
	&add($b,32);
	&add($r,32);
	&sub($num,8);
	&jnz(&label("aw_loop"));

	&set_label("aw_finish",0);
	&mov($num,&wparam(3));	# get num
	&and($num,7);
	&jz(&label("aw_end"));

	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		&mov($tmp1,&DWP(0,$a,"",0));	# *a
		&mov($tmp2,&DWP(0,$b,"",0));	# *b
		&sub($tmp1,$c);
		&mov($c,0);
		&adc($c,$c);
		&sub($tmp1,$tmp2);
		&adc($c,0);
		&mov(&DWP(0,$r,"",0),$tmp1);	# *r
		&add($a, 4);
		&add($b, 4);
		&add($r, 4);
		&dec($num) if ($i != 6);
		&jz(&label("aw_end")) if ($i != 6);
		}
	&set_label("aw_end",0);

	&cmp(&wparam(4),0);
	&je(&label("pw_end"));

	&mov($num,&wparam(4));	# get dl
	&cmp($num,0);
	&je(&label("pw_end"));
	&jge(&label("pw_pos"));

	&comment("pw_neg");
	&mov($tmp2,0);
	&sub($tmp2,$num);
	&mov($num,$tmp2);	# num = -dl
	&and($num,0xfffffff8);	# num / 8
	&jz(&label("pw_neg_finish"));

	&set_label("pw_neg_loop",0);
	for ($i=0; $i<8; $i++)
		{
		&comment("dl<0 Round $i");

		&mov($tmp1,0);
		&mov($tmp2,&DWP($i*4,$b,"",0));	# *b
		&sub($tmp1,$c);
		&mov($c,0);
		&adc($c,$c);
		&sub($tmp1,$tmp2);
		&adc($c,0);
		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
		}

	&comment("");
	&add($b,32);
	&add($r,32);
	&sub($num,8);
	&jnz(&label("pw_neg_loop"));

	&set_label("pw_neg_finish",0);
	&mov($tmp2,&wparam(4));	# get dl
	&mov($num,0);
	&sub($num,$tmp2);
	&and($num,7);
	&jz(&label("pw_end"));

	for ($i=0; $i<7; $i++)
		{
		&comment("dl<0 Tail Round $i");
		&mov($tmp1,0);
		&mov($tmp2,&DWP($i*4,$b,"",0));	# *b
		&sub($tmp1,$c);
		&mov($c,0);
		&adc($c,$c);
		&sub($tmp1,$tmp2);
		&adc($c,0);
		&dec($num) if ($i != 6);
		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
		&jz(&label("pw_end")) if ($i != 6);
		}

	&jmp(&label("pw_end"));

	&set_label("pw_pos",0);

	&and($num,0xfffffff8);	# num / 8
	&jz(&label("pw_pos_finish"));

	&set_label("pw_pos_loop",0);

	for ($i=0; $i<8; $i++)
		{
		&comment("dl>0 Round $i");

		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
		&sub($tmp1,$c);
		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
		&jnc(&label("pw_nc".$i));	# no borrow: switch to plain copy
		}

	&comment("");
	&add($a,32);
	&add($r,32);
	&sub($num,8);
	&jnz(&label("pw_pos_loop"));

	&set_label("pw_pos_finish",0);
	&mov($num,&wparam(4));	# get dl
	&and($num,7);
	&jz(&label("pw_end"));

	for ($i=0; $i<7; $i++)
		{
		&comment("dl>0 Tail Round $i");
		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
		&sub($tmp1,$c);
		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
		&jnc(&label("pw_tail_nc".$i));
		&dec($num) if ($i != 6);
		&jz(&label("pw_end")) if ($i != 6);
		}
	&mov($c,1);
	&jmp(&label("pw_end"));

	# Copy loop entered at pw_nc<i> once the borrow has been absorbed.
	&set_label("pw_nc_loop",0);
	for ($i=0; $i<8; $i++)
		{
		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
		&set_label("pw_nc".$i,0);
		}

	&comment("");
	&add($a,32);
	&add($r,32);
	&sub($num,8);
	&jnz(&label("pw_nc_loop"));

	&mov($num,&wparam(4));	# get dl
	&and($num,7);
	&jz(&label("pw_nc_end"));

	for ($i=0; $i<7; $i++)
		{
		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
		&set_label("pw_tail_nc".$i,0);
		&dec($num) if ($i != 6);
		&jz(&label("pw_nc_end")) if ($i != 6);
		}

	&set_label("pw_nc_end",0);
	&mov($c,0);

	&set_label("pw_end",0);

#	&mov("eax",$c);		# $c is "eax"

	&function_end($name);
	}