#!/usr/local/bin/perl
#
# Generator for the 32-bit x86 BIGNUM word primitives.  Each sub below emits
# one assembler function through the perlasm emitters loaded from x86asm.pl
# (&mov, &add, &label, &wparam, ...).  Passing -DOPENSSL_IA32_SSE2 on the
# command line additionally emits an SSE2 path in bn_mul_add_words.
#
# NOTE(review): &wparam(n) is assumed to address the n-th stack argument of
# the generated function and &swtmp(n) the n-th pushed temporary; both are
# defined in x86asm.pl, not here.

use strict;
use warnings;

push(@INC, "perlasm", "../../perlasm");
require "x86asm.pl";

&asm_init($ARGV[0], $0);

# Kept as a package variable (not a lexical) so that it stays visible to any
# code outside this file that may consult it.
our $sse2 = 0;
for my $arg (@ARGV) {
    $sse2 = 1 if ($arg =~ /-DOPENSSL_IA32_SSE2/);
}

&external_label("OPENSSL_ia32cap_P") if ($sse2);

&bn_mul_add_words("bn_mul_add_words");
&bn_mul_words("bn_mul_words");
&bn_sqr_words("bn_sqr_words");
&bn_div_words("bn_div_words");
&bn_add_words("bn_add_words");
&bn_sub_words("bn_sub_words");
&bn_sub_part_words("bn_sub_part_words");

&asm_finish();

# Emit "t1 = t1 + c + t2" and leave the carry-out (0 or 1) in register c.
sub _bn_emit_add_carry {
    my ($t1, $t2, $c) = @_;

    &add($t1, $c);
    &mov($c, 0);        # mov does not touch CF
    &adc($c, $c);       # c = carry out of adding the old carry
    &add($t1, $t2);
    &adc($c, 0);        # fold in the carry out of adding t2
    return;
}

# Emit "t1 = t1 - c - t2" and leave the borrow-out (0 or 1) in register c.
sub _bn_emit_sub_borrow {
    my ($t1, $t2, $c) = @_;

    &sub($t1, $c);
    &mov($c, 0);        # mov does not touch CF
    &adc($c, $c);       # c = borrow out of subtracting the old borrow
    &sub($t1, $t2);
    &adc($c, 0);        # fold in the borrow out of subtracting t2
    return;
}

# Emit bn_mul_add_words(r, a, num, w): r[i] += a[i] * w for num words,
# propagating the carry between words; the final carry is returned in eax.
# Words are processed eight at a time, then a tail of up to seven.
sub bn_mul_add_words {
    my ($name) = @_;

    &function_begin($name, $sse2 ? "EXTRN\t_OPENSSL_ia32cap_P:DWORD" : "");

    &comment("");
    # eax/edx receive the low/high halves of each product.
    my $ap    = "ebx";
    my $w     = "ebp";
    my $rp    = "edi";
    my $carry = "esi";

    &xor($carry, $carry);           # clear carry
    &mov($rp, &wparam(0));

    &mov("ecx", &wparam(2));
    &mov($ap, &wparam(1));

    &and("ecx", 0xfffffff8);        # num rounded down to a multiple of 8
    &mov($w, &wparam(3));

    &push("ecx");                   # reserve a stack slot for a tmp variable

    # mov/push leave the flags alone, so this still tests the "and" above.
    &jz(&label("maw_finish"));

    if ($sse2) {
        # Use the MMX/SSE2 loop only when bit 26 of the capability word is
        # set at run time; otherwise fall back to the integer loop.
        &picmeup("eax", "OPENSSL_ia32cap_P");
        &bt(&DWP(0, "eax"), 26);
        &jnc(&label("maw_loop"));

        &movd("mm0", $w);                       # mm0 = w
        &pxor("mm1", "mm1");                    # mm1 = carry_in

        &set_label("maw_sse2_loop", 0);
        &movd("mm3", &DWP(0, $rp, "", 0));      # mm3 = r[0]
        &paddq("mm1", "mm3");                   # mm1 = carry_in + r[0]
        &movd("mm2", &DWP(0, $ap, "", 0));      # mm2 = a[0]
        &pmuludq("mm2", "mm0");                 # mm2 = w*a[0]
        &movd("mm4", &DWP(4, $ap, "", 0));      # mm4 = a[1]
        &pmuludq("mm4", "mm0");                 # mm4 = w*a[1]
        &movd("mm6", &DWP(8, $ap, "", 0));      # mm6 = a[2]
        &pmuludq("mm6", "mm0");                 # mm6 = w*a[2]
        &movd("mm7", &DWP(12, $ap, "", 0));     # mm7 = a[3]
        &pmuludq("mm7", "mm0");                 # mm7 = w*a[3]
        &paddq("mm1", "mm2");                   # mm1 = carry_in + r[0] + w*a[0]
        &movd("mm3", &DWP(4, $rp, "", 0));      # mm3 = r[1]
        &paddq("mm3", "mm4");                   # mm3 = r[1] + w*a[1]
        &movd("mm5", &DWP(8, $rp, "", 0));      # mm5 = r[2]
        &paddq("mm5", "mm6");                   # mm5 = r[2] + w*a[2]
        &movd("mm4", &DWP(12, $rp, "", 0));     # mm4 = r[3]
        &paddq("mm7", "mm4");                   # mm7 = r[3] + w*a[3]
        &movd(&DWP(0, $rp, "", 0), "mm1");
        &movd("mm2", &DWP(16, $ap, "", 0));     # mm2 = a[4]
        &pmuludq("mm2", "mm0");                 # mm2 = w*a[4]
        &psrlq("mm1", 32);                      # mm1 = carry0
        &movd("mm4", &DWP(20, $ap, "", 0));     # mm4 = a[5]
        &pmuludq("mm4", "mm0");                 # mm4 = w*a[5]
        &paddq("mm1", "mm3");                   # mm1 = carry0 + r[1] + w*a[1]
        &movd("mm6", &DWP(24, $ap, "", 0));     # mm6 = a[6]
        &pmuludq("mm6", "mm0");                 # mm6 = w*a[6]
        &movd(&DWP(4, $rp, "", 0), "mm1");
        &psrlq("mm1", 32);                      # mm1 = carry1
        &movd("mm3", &DWP(28, $ap, "", 0));     # mm3 = a[7]
        &add($ap, 32);
        &pmuludq("mm3", "mm0");                 # mm3 = w*a[7]
        &paddq("mm1", "mm5");                   # mm1 = carry1 + r[2] + w*a[2]
        &movd("mm5", &DWP(16, $rp, "", 0));     # mm5 = r[4]
        &paddq("mm2", "mm5");                   # mm2 = r[4] + w*a[4]
        &movd(&DWP(8, $rp, "", 0), "mm1");
        &psrlq("mm1", 32);                      # mm1 = carry2
        &paddq("mm1", "mm7");                   # mm1 = carry2 + r[3] + w*a[3]
        &movd("mm5", &DWP(20, $rp, "", 0));     # mm5 = r[5]
        &paddq("mm4", "mm5");                   # mm4 = r[5] + w*a[5]
        &movd(&DWP(12, $rp, "", 0), "mm1");
        &psrlq("mm1", 32);                      # mm1 = carry3
        &paddq("mm1", "mm2");                   # mm1 = carry3 + r[4] + w*a[4]
        &movd("mm5", &DWP(24, $rp, "", 0));     # mm5 = r[6]
        &paddq("mm6", "mm5");                   # mm6 = r[6] + w*a[6]
        &movd(&DWP(16, $rp, "", 0), "mm1");
        &psrlq("mm1", 32);                      # mm1 = carry4
        &paddq("mm1", "mm4");                   # mm1 = carry4 + r[5] + w*a[5]
        &movd("mm5", &DWP(28, $rp, "", 0));     # mm5 = r[7]
        &paddq("mm3", "mm5");                   # mm3 = r[7] + w*a[7]
        &movd(&DWP(20, $rp, "", 0), "mm1");
        &psrlq("mm1", 32);                      # mm1 = carry5
        &paddq("mm1", "mm6");                   # mm1 = carry5 + r[6] + w*a[6]
        &movd(&DWP(24, $rp, "", 0), "mm1");
        &psrlq("mm1", 32);                      # mm1 = carry6
        &paddq("mm1", "mm3");                   # mm1 = carry6 + r[7] + w*a[7]
        &movd(&DWP(28, $rp, "", 0), "mm1");
        &add($rp, 32);
        &psrlq("mm1", 32);                      # mm1 = carry_out

        &sub("ecx", 8);
        &jnz(&label("maw_sse2_loop"));

        &movd($carry, "mm1");                   # c = carry_out
        &emms();

        &jmp(&label("maw_finish"));
    }

    &set_label("maw_loop", 0);

    &mov(&swtmp(0), "ecx");         # park the counter in the stack slot

    # $i is the byte offset of the word within the current group of eight.
    for (my $i = 0; $i < 32; $i += 4) {
        &comment("Round $i");

        &mov("eax", &DWP($i, $ap, "", 0));      # eax = *a
        &mul($w);                               # edx:eax = *a * w
        &add("eax", $carry);                    # L(t) += c
        &mov($carry, &DWP($i, $rp, "", 0));     # c is reused as scratch: c = *r
        &adc("edx", 0);                         # H(t) += carry
        &add("eax", $carry);                    # L(t) += *r
        &adc("edx", 0);                         # H(t) += carry
        &mov(&DWP($i, $rp, "", 0), "eax");      # *r = L(t)
        &mov($carry, "edx");                    # c = H(t)
    }

    &comment("");
    &mov("ecx", &swtmp(0));         # reload the counter
    &add($ap, 32);
    &add($rp, 32);
    &sub("ecx", 8);
    &jnz(&label("maw_loop"));

    &set_label("maw_finish", 0);
    &mov("ecx", &wparam(2));        # get num
    &and("ecx", 7);
    &jnz(&label("maw_finish2"));    # helps branch prediction
    &jmp(&label("maw_end"));

    &set_label("maw_finish2", 1);
    for my $i (0 .. 6) {
        my $more = ($i != 6);       # the 7th tail word is always the last

        &comment("Tail Round $i");
        &mov("eax", &DWP($i * 4, $ap, "", 0));  # eax = *a
        &mul($w);                               # edx:eax = *a * w
        &add("eax", $carry);                    # L(t) += c
        &mov($carry, &DWP($i * 4, $rp, "", 0)); # c is reused as scratch: c = *r
        &adc("edx", 0);                         # H(t) += carry
        &add("eax", $carry);                    # L(t) += *r
        &adc("edx", 0);                         # H(t) += carry
        &dec("ecx") if ($more);
        &mov(&DWP($i * 4, $rp, "", 0), "eax");  # *r = L(t)
        &mov($carry, "edx");                    # c = H(t)
        &jz(&label("maw_end")) if ($more);      # flags still from the dec
    }
    &set_label("maw_end", 0);
    &mov("eax", $carry);            # return value

    &pop("ecx");                    # release the tmp variable slot

    &function_end($name);
    return;
}

# Emit bn_mul_words(r, a, num, w): r[i] = low word of (a[i] * w + carry) for
# num words; the final carry is returned in eax.
sub bn_mul_words {
    my ($name) = @_;

    &function_begin($name, "");

    &comment("");
    # eax/edx receive the low/high halves of each product.
    my $ap    = "ebx";
    my $w     = "ecx";
    my $rp    = "edi";
    my $carry = "esi";
    my $num   = "ebp";

    &xor($carry, $carry);           # clear carry
    &mov($rp, &wparam(0));
    &mov($ap, &wparam(1));
    &mov($num, &wparam(2));
    &mov($w, &wparam(3));

    &and($num, 0xfffffff8);         # num rounded down to a multiple of 8
    &jz(&label("mw_finish"));

    &set_label("mw_loop", 0);
    # $i is the byte offset of the word within the current group of eight.
    for (my $i = 0; $i < 32; $i += 4) {
        &comment("Round $i");

        &mov("eax", &DWP($i, $ap, "", 0));      # eax = *a
        &mul($w);                               # edx:eax = *a * w
        &add("eax", $carry);                    # L(t) += c
        &adc("edx", 0);                         # H(t) += carry
        &mov(&DWP($i, $rp, "", 0), "eax");      # *r = L(t)
        &mov($carry, "edx");                    # c = H(t)
    }

    &comment("");
    &add($ap, 32);
    &add($rp, 32);
    &sub($num, 8);
    &jz(&label("mw_finish"));
    &jmp(&label("mw_loop"));

    &set_label("mw_finish", 0);
    &mov($num, &wparam(2));         # get num
    &and($num, 7);
    &jnz(&label("mw_finish2"));
    &jmp(&label("mw_end"));

    &set_label("mw_finish2", 1);
    for my $i (0 .. 6) {
        my $more = ($i != 6);       # the 7th tail word is always the last

        &comment("Tail Round $i");
        &mov("eax", &DWP($i * 4, $ap, "", 0));  # eax = *a
        &mul($w);                               # edx:eax = *a * w
        &add("eax", $carry);                    # L(t) += c
        &adc("edx", 0);                         # H(t) += carry
        &mov(&DWP($i * 4, $rp, "", 0), "eax");  # *r = L(t)
        &mov($carry, "edx");                    # c = H(t)
        &dec($num) if ($more);
        &jz(&label("mw_end")) if ($more);
    }
    &set_label("mw_end", 0);
    &mov("eax", $carry);            # return value

    &function_end($name);
    return;
}

# Emit bn_sqr_words(r, a, num): for each of num input words, store the
# double-word square a[i]*a[i] into r[2*i] (low) and r[2*i+1] (high).
sub bn_sqr_words {
    my ($name) = @_;

    &function_begin($name, "");

    &comment("");
    my $rp  = "esi";
    my $ap  = "edi";
    my $num = "ebx";

    &mov($rp, &wparam(0));
    &mov($ap, &wparam(1));
    &mov($num, &wparam(2));

    &and($num, 0xfffffff8);         # num rounded down to a multiple of 8
    &jz(&label("sw_finish"));

    &set_label("sw_loop", 0);
    # $i is the byte offset into a; the result offset is twice that.
    for (my $i = 0; $i < 32; $i += 4) {
        &comment("Round $i");
        &mov("eax", &DWP($i, $ap, "", 0));          # eax = *a
        &mul("eax");                                # edx:eax = *a * *a
        &mov(&DWP($i * 2, $rp, "", 0), "eax");      # low word
        &mov(&DWP($i * 2 + 4, $rp, "", 0), "edx");  # high word
    }

    &comment("");
    &add($ap, 32);
    &add($rp, 64);
    &sub($num, 8);
    &jnz(&label("sw_loop"));

    &set_label("sw_finish", 0);
    &mov($num, &wparam(2));         # get num
    &and($num, 7);
    &jz(&label("sw_end"));

    for my $i (0 .. 6) {
        my $more = ($i != 6);       # the 7th tail word is always the last

        &comment("Tail Round $i");
        &mov("eax", &DWP($i * 4, $ap, "", 0));      # eax = *a
        &mul("eax");                                # edx:eax = *a * *a
        &mov(&DWP($i * 8, $rp, "", 0), "eax");      # low word
        &dec($num) if ($more);
        &mov(&DWP($i * 8 + 4, $rp, "", 0), "edx");  # high word
        &jz(&label("sw_end")) if ($more);           # flags still from the dec
    }
    &set_label("sw_end", 0);

    &function_end($name);
    return;
}

# Emit bn_div_words(h, l, d): load edx:eax with h:l and divide by d using a
# single "div"; the quotient is left in eax.
sub bn_div_words {
    my ($name) = @_;

    &function_begin($name, "");
    &mov("edx", &wparam(0));
    &mov("eax", &wparam(1));
    &mov("ebx", &wparam(2));
    &div("ebx");
    &function_end($name);
    return;
}

# Emit bn_add_words(r, a, b, num): r[i] = a[i] + b[i] + carry for num words;
# the final carry is returned in eax (the carry register itself is eax).
sub bn_add_words {
    my ($name) = @_;

    &function_begin($name, "");

    &comment("");
    my $ap    = "esi";
    my $bp    = "edi";
    my $carry = "eax";
    my $rp    = "ebx";
    my $t1    = "ecx";
    my $t2    = "edx";
    my $num   = "ebp";

    &mov($rp, &wparam(0));          # get r
    &mov($ap, &wparam(1));          # get a
    &mov($bp, &wparam(2));          # get b
    &mov($num, &wparam(3));         # get num
    &xor($carry, $carry);           # clear carry
    &and($num, 0xfffffff8);         # num rounded down to a multiple of 8

    &jz(&label("aw_finish"));

    &set_label("aw_loop", 0);
    for my $i (0 .. 7) {
        &comment("Round $i");

        &mov($t1, &DWP($i * 4, $ap, "", 0));    # *a
        &mov($t2, &DWP($i * 4, $bp, "", 0));    # *b
        &_bn_emit_add_carry($t1, $t2, $carry);
        &mov(&DWP($i * 4, $rp, "", 0), $t1);    # *r
    }

    &comment("");
    &add($ap, 32);
    &add($bp, 32);
    &add($rp, 32);
    &sub($num, 8);
    &jnz(&label("aw_loop"));

    &set_label("aw_finish", 0);
    &mov($num, &wparam(3));         # get num
    &and($num, 7);
    &jz(&label("aw_end"));

    for my $i (0 .. 6) {
        my $more = ($i != 6);       # the 7th tail word is always the last

        &comment("Tail Round $i");
        &mov($t1, &DWP($i * 4, $ap, "", 0));    # *a
        &mov($t2, &DWP($i * 4, $bp, "", 0));    # *b
        &_bn_emit_add_carry($t1, $t2, $carry);
        &dec($num) if ($more);
        &mov(&DWP($i * 4, $rp, "", 0), $t1);    # *r
        &jz(&label("aw_end")) if ($more);       # flags still from the dec
    }
    &set_label("aw_end", 0);

    # No explicit move of the result: the carry register is already eax.

    &function_end($name);
    return;
}

# Emit bn_sub_words(r, a, b, num): r[i] = a[i] - b[i] - borrow for num words;
# the final borrow is returned in eax (the borrow register itself is eax).
# The aw_* label names are shared with bn_add_words, as in the original.
sub bn_sub_words {
    my ($name) = @_;

    &function_begin($name, "");

    &comment("");
    my $ap     = "esi";
    my $bp     = "edi";
    my $borrow = "eax";
    my $rp     = "ebx";
    my $t1     = "ecx";
    my $t2     = "edx";
    my $num    = "ebp";

    &mov($rp, &wparam(0));          # get r
    &mov($ap, &wparam(1));          # get a
    &mov($bp, &wparam(2));          # get b
    &mov($num, &wparam(3));         # get num
    &xor($borrow, $borrow);         # clear borrow
    &and($num, 0xfffffff8);         # num rounded down to a multiple of 8

    &jz(&label("aw_finish"));

    &set_label("aw_loop", 0);
    for my $i (0 .. 7) {
        &comment("Round $i");

        &mov($t1, &DWP($i * 4, $ap, "", 0));    # *a
        &mov($t2, &DWP($i * 4, $bp, "", 0));    # *b
        &_bn_emit_sub_borrow($t1, $t2, $borrow);
        &mov(&DWP($i * 4, $rp, "", 0), $t1);    # *r
    }

    &comment("");
    &add($ap, 32);
    &add($bp, 32);
    &add($rp, 32);
    &sub($num, 8);
    &jnz(&label("aw_loop"));

    &set_label("aw_finish", 0);
    &mov($num, &wparam(3));         # get num
    &and($num, 7);
    &jz(&label("aw_end"));

    for my $i (0 .. 6) {
        my $more = ($i != 6);       # the 7th tail word is always the last

        &comment("Tail Round $i");
        &mov($t1, &DWP($i * 4, $ap, "", 0));    # *a
        &mov($t2, &DWP($i * 4, $bp, "", 0));    # *b
        &_bn_emit_sub_borrow($t1, $t2, $borrow);
        &dec($num) if ($more);
        &mov(&DWP($i * 4, $rp, "", 0), $t1);    # *r
        &jz(&label("aw_end")) if ($more);       # flags still from the dec
    }
    &set_label("aw_end", 0);

    # No explicit move of the result: the borrow register is already eax.

    &function_end($name);
    return;
}

# Emit bn_sub_part_words(r, a, b, cl, dl).
#
# First r[i] = a[i] - b[i] - borrow over the cl words both inputs share
# (wparam(3)).  Then, depending on the sign of dl (wparam(4)):
#   dl == 0: nothing more to do;
#   dl <  0: -dl further words computed as r[i] = 0 - b[i] - borrow;
#   dl >  0: dl further words computed as r[i] = a[i] - borrow; as soon as a
#            word produces no borrow the remaining words are plain copies.
# The final borrow is returned in eax (the borrow register itself is eax).
sub bn_sub_part_words {
    my ($name) = @_;

    &function_begin($name, "");

    &comment("");
    my $ap     = "esi";
    my $bp     = "edi";
    my $borrow = "eax";
    my $rp     = "ebx";
    my $t1     = "ecx";
    my $t2     = "edx";
    my $num    = "ebp";

    &mov($rp, &wparam(0));          # get r
    &mov($ap, &wparam(1));          # get a
    &mov($bp, &wparam(2));          # get b
    &mov($num, &wparam(3));         # get num
    &xor($borrow, $borrow);         # clear borrow
    &and($num, 0xfffffff8);         # num rounded down to a multiple of 8

    &jz(&label("aw_finish"));

    &set_label("aw_loop", 0);
    for my $i (0 .. 7) {
        &comment("Round $i");

        &mov($t1, &DWP($i * 4, $ap, "", 0));    # *a
        &mov($t2, &DWP($i * 4, $bp, "", 0));    # *b
        &_bn_emit_sub_borrow($t1, $t2, $borrow);
        &mov(&DWP($i * 4, $rp, "", 0), $t1);    # *r
    }

    &comment("");
    &add($ap, 32);
    &add($bp, 32);
    &add($rp, 32);
    &sub($num, 8);
    &jnz(&label("aw_loop"));

    &set_label("aw_finish", 0);
    &mov($num, &wparam(3));         # get num
    &and($num, 7);
    &jz(&label("aw_end"));

    # Unlike the other tails, this one advances the pointers word by word so
    # that a/b/r point just past the common part for the dl handling below.
    for my $i (0 .. 6) {
        my $more = ($i != 6);       # the 7th tail word is always the last

        &comment("Tail Round $i");
        &mov($t1, &DWP(0, $ap, "", 0));         # *a
        &mov($t2, &DWP(0, $bp, "", 0));         # *b
        &_bn_emit_sub_borrow($t1, $t2, $borrow);
        &mov(&DWP(0, $rp, "", 0), $t1);         # *r
        &add($ap, 4);
        &add($bp, 4);
        &add($rp, 4);
        &dec($num) if ($more);
        &jz(&label("aw_end")) if ($more);
    }
    &set_label("aw_end", 0);

    &cmp(&wparam(4), 0);
    &je(&label("pw_end"));

    &mov($num, &wparam(4));         # get dl
    &cmp($num, 0);
    &je(&label("pw_end"));
    &jge(&label("pw_pos"));

    &comment("pw_neg");
    &mov($t2, 0);
    &sub($t2, $num);
    &mov($num, $t2);                # num = -dl
    &and($num, 0xfffffff8);         # rounded down to a multiple of 8
    &jz(&label("pw_neg_finish"));

    &set_label("pw_neg_loop", 0);
    for my $i (0 .. 7) {
        &comment("dl<0 Round $i");

        &mov($t1, 0);
        &mov($t2, &DWP($i * 4, $bp, "", 0));    # *b
        &_bn_emit_sub_borrow($t1, $t2, $borrow);
        &mov(&DWP($i * 4, $rp, "", 0), $t1);    # *r
    }

    &comment("");
    &add($bp, 32);
    &add($rp, 32);
    &sub($num, 8);
    &jnz(&label("pw_neg_loop"));

    &set_label("pw_neg_finish", 0);
    &mov($t2, &wparam(4));          # get dl
    &mov($num, 0);
    &sub($num, $t2);                # num = -dl
    &and($num, 7);
    &jz(&label("pw_end"));

    for my $i (0 .. 6) {
        my $more = ($i != 6);       # the 7th tail word is always the last

        &comment("dl<0 Tail Round $i");
        &mov($t1, 0);
        &mov($t2, &DWP($i * 4, $bp, "", 0));    # *b
        &_bn_emit_sub_borrow($t1, $t2, $borrow);
        &dec($num) if ($more);
        &mov(&DWP($i * 4, $rp, "", 0), $t1);    # *r
        &jz(&label("pw_end")) if ($more);       # flags still from the dec
    }

    &jmp(&label("pw_end"));

    &set_label("pw_pos", 0);

    &and($num, 0xfffffff8);         # dl rounded down to a multiple of 8
    &jz(&label("pw_pos_finish"));

    &set_label("pw_pos_loop", 0);

    for my $i (0 .. 7) {
        &comment("dl>0 Round $i");

        &mov($t1, &DWP($i * 4, $ap, "", 0));    # *a
        &sub($t1, $borrow);
        &mov(&DWP($i * 4, $rp, "", 0), $t1);    # *r
        # No borrow out: continue in the plain-copy loop after word $i.
        &jnc(&label("pw_nc" . $i));
    }

    &comment("");
    &add($ap, 32);
    &add($rp, 32);
    &sub($num, 8);
    &jnz(&label("pw_pos_loop"));

    &set_label("pw_pos_finish", 0);
    &mov($num, &wparam(4));         # get dl
    &and($num, 7);
    &jz(&label("pw_end"));

    for my $i (0 .. 6) {
        my $more = ($i != 6);       # the 7th tail word is always the last

        &comment("dl>0 Tail Round $i");
        &mov($t1, &DWP($i * 4, $ap, "", 0));    # *a
        &sub($t1, $borrow);
        &mov(&DWP($i * 4, $rp, "", 0), $t1);    # *r
        # No borrow out: continue in the plain-copy tail after word $i.
        &jnc(&label("pw_tail_nc" . $i));
        &dec($num) if ($more);
        &jz(&label("pw_end")) if ($more);
    }
    &mov($borrow, 1);
    &jmp(&label("pw_end"));

    # Plain-copy continuation of the dl>0 loops; entered through the
    # pw_nc<i> labels once the borrow has been absorbed.
    &set_label("pw_nc_loop", 0);
    for my $i (0 .. 7) {
        &mov($t1, &DWP($i * 4, $ap, "", 0));    # *a
        &mov(&DWP($i * 4, $rp, "", 0), $t1);    # *r
        &set_label("pw_nc" . $i, 0);
    }

    &comment("");
    &add($ap, 32);
    &add($rp, 32);
    &sub($num, 8);
    &jnz(&label("pw_nc_loop"));

    &mov($num, &wparam(4));         # get dl
    &and($num, 7);
    &jz(&label("pw_nc_end"));

    for my $i (0 .. 6) {
        my $more = ($i != 6);       # the 7th tail word is always the last

        &mov($t1, &DWP($i * 4, $ap, "", 0));    # *a
        &mov(&DWP($i * 4, $rp, "", 0), $t1);    # *r
        &set_label("pw_tail_nc" . $i, 0);
        &dec($num) if ($more);
        &jz(&label("pw_nc_end")) if ($more);
    }

    &set_label("pw_nc_end", 0);
    &mov($borrow, 0);

    &set_label("pw_end", 0);

    # No explicit move of the result: the borrow register is already eax.

    &function_end($name);
    return;
}