#!/usr/local/bin/perl
#
# perlasm generator for the 32-bit x86 bignum word primitives:
#   bn_mul_add_words, bn_mul_words, bn_sqr_words,
#   bn_div_words, bn_add_words, bn_sub_words
#
# Usage: the first argument is handed to asm_init() from x86asm.pl; passing
# -DOPENSSL_IA32_SSE2 anywhere on the command line additionally emits
# MMX/SSE2 fast paths that are selected at run time through
# OPENSSL_ia32cap_P.
#
# NOTE: instruction emitters are invoked as &name(...) on purpose.  Many
# mnemonics (and, xor, sub, push, pop, ...) collide with Perl builtins, so
# the leading '&' is required for them to be parsed as sub calls.

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

&asm_init($ARGV[0],$0);

$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }

&external_label("OPENSSL_ia32cap_P") if ($sse2);

&bn_mul_add_words("bn_mul_add_words");
&bn_mul_words("bn_mul_words");
&bn_sqr_words("bn_sqr_words");
&bn_div_words("bn_div_words");
&bn_add_words("bn_add_words");
&bn_sub_words("bn_sub_words");

&asm_finish();

# bn_mul_add_words(r, a, num, w):  r[i] += a[i] * w for i in 0..num-1,
# propagating the carry word; the final carry is returned in eax.
#
# With $sse2 set, an MMX/SSE2 path (8x unrolled plus a one-word tail loop)
# is emitted first and taken when the SSE2 capability bit is set.  The
# generic path processes 8 words per loop iteration and then up to 7 tail
# words.
sub bn_mul_add_words
{
    my ($name) = @_;

    &function_begin_B($name,"");

    $r="eax";
    $a="edx";
    $c="ecx";

    if ($sse2) {
        &picsetup("eax");
        &picsymbol("eax", "OPENSSL_ia32cap_P", "eax");
        &bt(&DWP(0,"eax"),"\$IA32CAP_BIT0_SSE2");
        &jnc(&label("maw_non_sse2"));

        &mov($r,&wparam(0));
        &mov($a,&wparam(1));
        &mov($c,&wparam(2));
        &movd("mm0",&wparam(3));            # mm0 = w
        &pxor("mm1","mm1");                 # mm1 = carry_in

        # num == 0: leave with carry 0, exactly like the generic path.
        # Without this check the tail loop's sub/jnz countdown would wrap
        # around and run far past the end of r[] and a[].
        &test($c,$c);
        &jz(&label("maw_sse2_exit"));
        &jmp(&label("maw_sse2_entry"));

    &set_label("maw_sse2_unrolled",16);
        &movd("mm3",&DWP(0,$r,"",0));       # mm3 = r[0]
        &paddq("mm1","mm3");                # mm1 = carry_in + r[0]
        &movd("mm2",&DWP(0,$a,"",0));       # mm2 = a[0]
        &pmuludq("mm2","mm0");              # mm2 = w*a[0]
        &movd("mm4",&DWP(4,$a,"",0));       # mm4 = a[1]
        &pmuludq("mm4","mm0");              # mm4 = w*a[1]
        &movd("mm6",&DWP(8,$a,"",0));       # mm6 = a[2]
        &pmuludq("mm6","mm0");              # mm6 = w*a[2]
        &movd("mm7",&DWP(12,$a,"",0));      # mm7 = a[3]
        &pmuludq("mm7","mm0");              # mm7 = w*a[3]
        &paddq("mm1","mm2");                # mm1 = carry_in + r[0] + w*a[0]
        &movd("mm3",&DWP(4,$r,"",0));       # mm3 = r[1]
        &paddq("mm3","mm4");                # mm3 = r[1] + w*a[1]
        &movd("mm5",&DWP(8,$r,"",0));       # mm5 = r[2]
        &paddq("mm5","mm6");                # mm5 = r[2] + w*a[2]
        &movd("mm4",&DWP(12,$r,"",0));      # mm4 = r[3]
        &paddq("mm7","mm4");                # mm7 = r[3] + w*a[3]
        &movd(&DWP(0,$r,"",0),"mm1");
        &movd("mm2",&DWP(16,$a,"",0));      # mm2 = a[4]
        &pmuludq("mm2","mm0");              # mm2 = w*a[4]
        &psrlq("mm1",32);                   # mm1 = carry0
        &movd("mm4",&DWP(20,$a,"",0));      # mm4 = a[5]
        &pmuludq("mm4","mm0");              # mm4 = w*a[5]
        &paddq("mm1","mm3");                # mm1 = carry0 + r[1] + w*a[1]
        &movd("mm6",&DWP(24,$a,"",0));      # mm6 = a[6]
        &pmuludq("mm6","mm0");              # mm6 = w*a[6]
        &movd(&DWP(4,$r,"",0),"mm1");
        &psrlq("mm1",32);                   # mm1 = carry1
        &movd("mm3",&DWP(28,$a,"",0));      # mm3 = a[7]
        &add($a,32);
        &pmuludq("mm3","mm0");              # mm3 = w*a[7]
        &paddq("mm1","mm5");                # mm1 = carry1 + r[2] + w*a[2]
        &movd("mm5",&DWP(16,$r,"",0));      # mm5 = r[4]
        &paddq("mm2","mm5");                # mm2 = r[4] + w*a[4]
        &movd(&DWP(8,$r,"",0),"mm1");
        &psrlq("mm1",32);                   # mm1 = carry2
        &paddq("mm1","mm7");                # mm1 = carry2 + r[3] + w*a[3]
        &movd("mm5",&DWP(20,$r,"",0));      # mm5 = r[5]
        &paddq("mm4","mm5");                # mm4 = r[5] + w*a[5]
        &movd(&DWP(12,$r,"",0),"mm1");
        &psrlq("mm1",32);                   # mm1 = carry3
        &paddq("mm1","mm2");                # mm1 = carry3 + r[4] + w*a[4]
        &movd("mm5",&DWP(24,$r,"",0));      # mm5 = r[6]
        &paddq("mm6","mm5");                # mm6 = r[6] + w*a[6]
        &movd(&DWP(16,$r,"",0),"mm1");
        &psrlq("mm1",32);                   # mm1 = carry4
        &paddq("mm1","mm4");                # mm1 = carry4 + r[5] + w*a[5]
        &movd("mm5",&DWP(28,$r,"",0));      # mm5 = r[7]
        &paddq("mm3","mm5");                # mm3 = r[7] + w*a[7]
        &movd(&DWP(20,$r,"",0),"mm1");
        &psrlq("mm1",32);                   # mm1 = carry5
        &paddq("mm1","mm6");                # mm1 = carry5 + r[6] + w*a[6]
        &movd(&DWP(24,$r,"",0),"mm1");
        &psrlq("mm1",32);                   # mm1 = carry6
        &paddq("mm1","mm3");                # mm1 = carry6 + r[7] + w*a[7]
        &movd(&DWP(28,$r,"",0),"mm1");
        &lea($r,&DWP(32,$r));
        &psrlq("mm1",32);                   # mm1 = carry_out

        &sub($c,8);
        &jz(&label("maw_sse2_exit"));
    &set_label("maw_sse2_entry");
        &test($c,0xfffffff8);               # at least 8 words left?
        &jnz(&label("maw_sse2_unrolled"));

    &set_label("maw_sse2_loop",4);
        &movd("mm2",&DWP(0,$a));            # mm2 = a[i]
        &movd("mm3",&DWP(0,$r));            # mm3 = r[i]
        &pmuludq("mm2","mm0");              # a[i] *= w
        &lea($a,&DWP(4,$a));
        &paddq("mm1","mm3");                # carry += r[i]
        &paddq("mm1","mm2");                # carry += a[i]*w
        &movd(&DWP(0,$r),"mm1");            # r[i] = carry_low
        &sub($c,1);
        &psrlq("mm1",32);                   # carry = carry_high
        &lea($r,&DWP(4,$r));                # lea keeps the flags from sub
        &jnz(&label("maw_sse2_loop"));
    &set_label("maw_sse2_exit");
        &movd("eax","mm1");                 # c = carry_out
        &emms();
        &ret();

    &set_label("maw_non_sse2",16);
    }

    # function_begin prologue (done by hand because function_begin_B was
    # used above so the SSE2 path could run without saving registers)
    &push("ebp");
    &push("ebx");
    &push("esi");
    &push("edi");

    &comment("");
    $Low="eax";
    $High="edx";
    $a="ebx";
    $w="ebp";
    $r="edi";
    $c="esi";

    &xor($c,$c);                            # clear carry
    &mov($r,&wparam(0));

    &mov("ecx",&wparam(2));
    &mov($a,&wparam(1));

    &and("ecx",0xfffffff8);                 # num rounded down to a multiple of 8
    &mov($w,&wparam(3));

    &push("ecx");                           # Up the stack for a tmp variable

    &jz(&label("maw_finish"));              # flags still come from the and above

    &set_label("maw_loop",16);

    for (my $i=0; $i<32; $i+=4) {
        &comment("Round $i");

        &mov("eax",&DWP($i,$a));            # *a
        &mul($w);                           # *a * w
        &add("eax",$c);                     # L(t)+= c
        &adc("edx",0);                      # H(t)+=carry
        &add("eax",&DWP($i,$r));            # L(t)+= *r
        &adc("edx",0);                      # H(t)+=carry
        &mov(&DWP($i,$r),"eax");            # *r= L(t);
        &mov($c,"edx");                     # c= H(t);
    }

    &comment("");
    &sub("ecx",8);
    &lea($a,&DWP(32,$a));
    &lea($r,&DWP(32,$r));
    &jnz(&label("maw_loop"));

    &set_label("maw_finish",0);
    &mov("ecx",&wparam(2));                 # get num
    &and("ecx",7);
    &jnz(&label("maw_finish2"));            # helps branch prediction
    &jmp(&label("maw_end"));

    &set_label("maw_finish2",1);
    for (my $i=0; $i<7; $i++) {
        &comment("Tail Round $i");
        &mov("eax",&DWP($i*4,$a));          # *a
        &mul($w);                           # *a * w
        &add("eax",$c);                     # L(t)+=c
        &adc("edx",0);                      # H(t)+=carry
        &add("eax",&DWP($i*4,$r));          # L(t)+= *r
        &adc("edx",0);                      # H(t)+=carry
        &dec("ecx") if ($i != 7-1);
        &mov(&DWP($i*4,$r),"eax");          # *r= L(t);
        &mov($c,"edx");                     # c= H(t);
        &jz(&label("maw_end")) if ($i != 7-1);
    }
    &set_label("maw_end",0);
    &mov("eax",$c);

    &pop("ecx");                            # drop the tmp variable pushed above

    &function_end($name);
}

# bn_mul_words(r, a, num, w):  r[i] = low word of (a[i] * w + carry) for
# i in 0..num-1; the final carry word is returned in eax.
#
# With $sse2 set, a one-word-per-iteration MMX/SSE2 loop is emitted first
# and taken when the SSE2 capability bit is set.
sub bn_mul_words
{
    my ($name) = @_;

    &function_begin_B($name,"");

    $r="eax";
    $a="edx";
    $c="ecx";

    if ($sse2) {
        &picsetup("eax");
        &picsymbol("eax", "OPENSSL_ia32cap_P", "eax");
        &bt(&DWP(0,"eax"),"\$IA32CAP_BIT0_SSE2");
        &jnc(&label("mw_non_sse2"));

        &mov($r,&wparam(0));
        &mov($a,&wparam(1));
        &mov($c,&wparam(2));
        &movd("mm0",&wparam(3));            # mm0 = w
        &pxor("mm1","mm1");                 # mm1 = carry = 0

        # num == 0: return carry 0 without touching memory, matching the
        # generic path; otherwise the sub/jnz countdown below would wrap.
        &test($c,$c);
        &jz(&label("mw_sse2_exit"));

    &set_label("mw_sse2_loop",16);
        &movd("mm2",&DWP(0,$a));            # mm2 = a[i]
        &pmuludq("mm2","mm0");              # a[i] *= w
        &lea($a,&DWP(4,$a));
        &paddq("mm1","mm2");                # carry += a[i]*w
        &movd(&DWP(0,$r),"mm1");            # r[i] = carry_low
        &sub($c,1);
        &psrlq("mm1",32);                   # carry = carry_high
        &lea($r,&DWP(4,$r));
        &jnz(&label("mw_sse2_loop"));

    &set_label("mw_sse2_exit");
        &movd("eax","mm1");                 # return carry
        &emms();
        &ret();
    &set_label("mw_non_sse2",16);
    }

    # function_begin prologue
    &push("ebp");
    &push("ebx");
    &push("esi");
    &push("edi");

    &comment("");
    $Low="eax";
    $High="edx";
    $a="ebx";
    $w="ecx";
    $r="edi";
    $c="esi";
    $num="ebp";

    &xor($c,$c);                            # clear carry
    &mov($r,&wparam(0));
    &mov($a,&wparam(1));
    &mov($num,&wparam(2));
    &mov($w,&wparam(3));

    &and($num,0xfffffff8);                  # num rounded down to a multiple of 8
    &jz(&label("mw_finish"));

    &set_label("mw_loop",0);
    for (my $i=0; $i<32; $i+=4) {
        &comment("Round $i");

        &mov("eax",&DWP($i,$a,"",0));       # *a
        &mul($w);                           # *a * w
        &add("eax",$c);                     # L(t)+=c
        &adc("edx",0);                      # H(t)+=carry
        &mov(&DWP($i,$r,"",0),"eax");       # *r= L(t);
        &mov($c,"edx");                     # c= H(t);
    }

    &comment("");
    &add($a,32);
    &add($r,32);
    &sub($num,8);
    &jz(&label("mw_finish"));
    &jmp(&label("mw_loop"));

    &set_label("mw_finish",0);
    &mov($num,&wparam(2));                  # get num
    &and($num,7);
    &jnz(&label("mw_finish2"));
    &jmp(&label("mw_end"));

    &set_label("mw_finish2",1);
    for (my $i=0; $i<7; $i++) {
        &comment("Tail Round $i");
        &mov("eax",&DWP($i*4,$a,"",0));     # *a
        &mul($w);                           # *a * w
        &add("eax",$c);                     # L(t)+=c
        &adc("edx",0);                      # H(t)+=carry
        &mov(&DWP($i*4,$r,"",0),"eax");     # *r= L(t);
        &mov($c,"edx");                     # c= H(t);
        &dec($num) if ($i != 7-1);
        &jz(&label("mw_end")) if ($i != 7-1);
    }
    &set_label("mw_end",0);
    &mov("eax",$c);

    &function_end($name);
}

# bn_sqr_words(r, a, num):  stores the double-word square of a[i] into
# r[2*i] (low) and r[2*i+1] (high) for i in 0..num-1.  No return value is
# set up.
#
# With $sse2 set, a one-word-per-iteration MMX/SSE2 loop is emitted first
# and taken when the SSE2 capability bit is set.
sub bn_sqr_words
{
    my ($name) = @_;

    &function_begin_B($name,"");

    $r="eax";
    $a="edx";
    $c="ecx";

    if ($sse2) {
        &picsetup("eax");
        &picsymbol("eax", "OPENSSL_ia32cap_P", "eax");
        &bt(&DWP(0,"eax"),"\$IA32CAP_BIT0_SSE2");
        &jnc(&label("sqr_non_sse2"));

        &mov($r,&wparam(0));
        &mov($a,&wparam(1));
        &mov($c,&wparam(2));

        # num == 0: nothing to square, matching the generic path;
        # otherwise the sub/jnz countdown below would wrap.
        &test($c,$c);
        &jz(&label("sqr_sse2_exit"));

    &set_label("sqr_sse2_loop",16);
        &movd("mm0",&DWP(0,$a));            # mm0 = a[i]
        &pmuludq("mm0","mm0");              # a[i] *= a[i]
        &lea($a,&DWP(4,$a));                # a++
        &movq(&QWP(0,$r),"mm0");            # r[i] = a[i]*a[i]
        &sub($c,1);
        &lea($r,&DWP(8,$r));                # r += 2
        &jnz(&label("sqr_sse2_loop"));

    &set_label("sqr_sse2_exit");
        &emms();
        &ret();
    &set_label("sqr_non_sse2",16);
    }

    # function_begin prologue
    &push("ebp");
    &push("ebx");
    &push("esi");
    &push("edi");

    &comment("");
    $r="esi";
    $a="edi";
    $num="ebx";

    &mov($r,&wparam(0));
    &mov($a,&wparam(1));
    &mov($num,&wparam(2));

    &and($num,0xfffffff8);                  # num rounded down to a multiple of 8
    &jz(&label("sw_finish"));

    &set_label("sw_loop",0);
    for (my $i=0; $i<32; $i+=4) {
        &comment("Round $i");
        &mov("eax",&DWP($i,$a,"",0));       # *a
        &mul("eax");                        # *a * *a
        &mov(&DWP($i*2,$r,"",0),"eax");     # low word
        &mov(&DWP($i*2+4,$r,"",0),"edx");   # high word
    }

    &comment("");
    &add($a,32);
    &add($r,64);
    &sub($num,8);
    &jnz(&label("sw_loop"));

    &set_label("sw_finish",0);
    &mov($num,&wparam(2));                  # get num
    &and($num,7);
    &jz(&label("sw_end"));

    for (my $i=0; $i<7; $i++) {
        &comment("Tail Round $i");
        &mov("eax",&DWP($i*4,$a,"",0));     # *a
        &mul("eax");                        # *a * *a
        &mov(&DWP($i*8,$r,"",0),"eax");     # low word
        &dec($num) if ($i != 7-1);
        &mov(&DWP($i*8+4,$r,"",0),"edx");   # high word (mov keeps dec's flags)
        &jz(&label("sw_end")) if ($i != 7-1);
    }
    &set_label("sw_end",0);

    &function_end($name);
}

# bn_div_words(h, l, d):  loads h into edx, l into eax and d into ecx,
# then issues a single div by ecx; the quotient is left in eax.
sub bn_div_words
{
    my ($name) = @_;

    &function_begin_B($name,"");
    &mov("edx",&wparam(0));
    &mov("eax",&wparam(1));
    &mov("ecx",&wparam(2));
    &div("ecx");
    &ret();
    &function_end_B($name);
}

# bn_add_words(r, a, b, num):  r[i] = a[i] + b[i] + carry for
# i in 0..num-1; the final carry (0 or 1) is returned in eax.
sub bn_add_words
{
    my ($name) = @_;

    &function_begin($name,"");

    &comment("");
    $a="esi";
    $b="edi";
    $c="eax";
    $r="ebx";
    $tmp1="ecx";
    $tmp2="edx";
    $num="ebp";

    &mov($r,&wparam(0));                    # get r
    &mov($a,&wparam(1));                    # get a
    &mov($b,&wparam(2));                    # get b
    &mov($num,&wparam(3));                  # get num
    &xor($c,$c);                            # clear carry
    &and($num,0xfffffff8);                  # num rounded down to a multiple of 8

    &jz(&label("aw_finish"));

    &set_label("aw_loop",0);
    for (my $i=0; $i<8; $i++) {
        &comment("Round $i");

        &mov($tmp1,&DWP($i*4,$a,"",0));     # *a
        &mov($tmp2,&DWP($i*4,$b,"",0));     # *b
        &add($tmp1,$c);                     # add incoming carry
        &mov($c,0);                         # mov leaves CF untouched
        &adc($c,$c);                        # c = carry of first add
        &add($tmp1,$tmp2);
        &adc($c,0);                         # c += carry of second add
        &mov(&DWP($i*4,$r,"",0),$tmp1);     # *r
    }

    &comment("");
    &add($a,32);
    &add($b,32);
    &add($r,32);
    &sub($num,8);
    &jnz(&label("aw_loop"));

    &set_label("aw_finish",0);
    &mov($num,&wparam(3));                  # get num
    &and($num,7);
    &jz(&label("aw_end"));

    for (my $i=0; $i<7; $i++) {
        &comment("Tail Round $i");
        &mov($tmp1,&DWP($i*4,$a,"",0));     # *a
        &mov($tmp2,&DWP($i*4,$b,"",0));     # *b
        &add($tmp1,$c);
        &mov($c,0);
        &adc($c,$c);
        &add($tmp1,$tmp2);
        &adc($c,0);
        &dec($num) if ($i != 6);
        &mov(&DWP($i*4,$r,"",0),$tmp1);     # *r
        &jz(&label("aw_end")) if ($i != 6);
    }
    &set_label("aw_end",0);

    # No explicit move of the result is needed: $c is already "eax".

    &function_end($name);
}

# bn_sub_words(r, a, b, num):  r[i] = a[i] - b[i] - borrow for
# i in 0..num-1; the final borrow (0 or 1) is returned in eax.
#
# The label names deliberately mirror bn_add_words ("aw_*"); they are kept
# unchanged so the emitted label text stays the same.
sub bn_sub_words
{
    my ($name) = @_;

    &function_begin($name,"");

    &comment("");
    $a="esi";
    $b="edi";
    $c="eax";
    $r="ebx";
    $tmp1="ecx";
    $tmp2="edx";
    $num="ebp";

    &mov($r,&wparam(0));                    # get r
    &mov($a,&wparam(1));                    # get a
    &mov($b,&wparam(2));                    # get b
    &mov($num,&wparam(3));                  # get num
    &xor($c,$c);                            # clear borrow
    &and($num,0xfffffff8);                  # num rounded down to a multiple of 8

    &jz(&label("aw_finish"));

    &set_label("aw_loop",0);
    for (my $i=0; $i<8; $i++) {
        &comment("Round $i");

        &mov($tmp1,&DWP($i*4,$a,"",0));     # *a
        &mov($tmp2,&DWP($i*4,$b,"",0));     # *b
        &sub($tmp1,$c);                     # subtract incoming borrow
        &mov($c,0);                         # mov leaves CF untouched
        &adc($c,$c);                        # c = borrow of first sub
        &sub($tmp1,$tmp2);
        &adc($c,0);                         # c += borrow of second sub
        &mov(&DWP($i*4,$r,"",0),$tmp1);     # *r
    }

    &comment("");
    &add($a,32);
    &add($b,32);
    &add($r,32);
    &sub($num,8);
    &jnz(&label("aw_loop"));

    &set_label("aw_finish",0);
    &mov($num,&wparam(3));                  # get num
    &and($num,7);
    &jz(&label("aw_end"));

    for (my $i=0; $i<7; $i++) {
        &comment("Tail Round $i");
        &mov($tmp1,&DWP($i*4,$a,"",0));     # *a
        &mov($tmp2,&DWP($i*4,$b,"",0));     # *b
        &sub($tmp1,$c);
        &mov($c,0);
        &adc($c,$c);
        &sub($tmp1,$tmp2);
        &adc($c,0);
        &dec($num) if ($i != 6);
        &mov(&DWP($i*4,$r,"",0),$tmp1);     # *r
        &jz(&label("aw_end")) if ($i != 6);
    }
    &set_label("aw_end",0);

    # No explicit move of the result is needed: $c is already "eax".

    &function_end($name);
}