#!/usr/local/bin/perl
#
# Generator for the 32-bit x86 BIGNUM word routines, written against the
# perlasm helper API that is loaded from x86asm.pl.
# Origin: /openbsd/lib/libcrypto/bn/asm/bn-586.pl (revision 0bd8ca4a)
#
# As used below: the first command-line argument is handed to asm_init(),
# and -DOPENSSL_IA32_SSE2 anywhere on the command line enables the SSE2
# code paths in the generators.
#
# NOTE: throughout this file helper calls keep the explicit "&" sigil on
# purpose -- several helper names (and, xor, sub, push, pop, ...) collide
# with Perl built-ins or keywords.

# Make the directory holding this script, and its ../../perlasm sibling,
# searchable so that x86asm.pl can be located.  $dir is whatever the match
# captured (it is not checked for success, exactly as before).
$0 =~ m{(.*[/\\])[^/\\]+$};
$dir = $1;
push(@INC, "${dir}", "${dir}../../perlasm");
require "x86asm.pl";

&asm_init($ARGV[0], $0);

# $sse2 is 1 when any argument mentions -DOPENSSL_IA32_SSE2, 0 otherwise.
# It stays a package global because every generator below consults it.
$sse2 = (grep { /-DOPENSSL_IA32_SSE2/ } @ARGV) ? 1 : 0;

if ($sse2) {
	&external_label("OPENSSL_ia32cap_P");
}

# Each generator is called with its own name, which becomes the name of the
# emitted routine.  The order here is the order of emission.
foreach my $routine (qw(
	bn_mul_add_words
	bn_mul_words
	bn_sqr_words
	bn_div_words
	bn_add_words
	bn_sub_words
	bn_sub_part_words
)) {
	&{\&{$routine}}($routine);
}

&asm_finish();

# bn_mul_add_words(NAME)
#
# Emits routine NAME.  The emitted code reads four stack arguments:
# wparam(0) = r, wparam(1) = a, wparam(2) = num, wparam(3) = w.  For each of
# the num words it computes r[i] += a[i] * w with carry propagation and
# leaves the final carry word in eax.
#
# With $sse2 set, an MMX/SSE2 variant is emitted first, guarded at run time
# by a test of the SSE2 bit reached through OPENSSL_ia32cap_P; the integer
# variant follows as the fallback.  Note that the SSE2 single-word loop is
# bottom-tested (one word is processed before the counter is checked), so
# that path relies on num being non-zero.
sub bn_mul_add_words {
	my ($name) = @_;

	&function_begin_B($name,"");

	if ($sse2) {
		# The SSE2 variant uses only eax/edx/ecx and MMX registers in the
		# code visible here, and leaves through its own ret.
		my ($rp, $ap, $cnt) = ("eax", "edx", "ecx");

		# NOTE(review): picsetup/picsymbol are defined outside this file;
		# presumably they leave the address of OPENSSL_ia32cap_P in eax,
		# which is then dereferenced by bt -- confirm.
		&picsetup("eax");
		&picsymbol("eax", "OPENSSL_ia32cap_P", "eax");
		&bt(&DWP(0,"eax"),"\$IA32CAP_BIT0_SSE2");
		&jnc(&label("maw_non_sse2"));

		&mov($rp,&wparam(0));
		&mov($ap,&wparam(1));
		&mov($cnt,&wparam(2));
		&movd("mm0",&wparam(3));		# mm0 = w
		&pxor("mm1","mm1");			# mm1 = running carry, starts at 0
		&jmp(&label("maw_sse2_entry"));

		# Eight words per pass.  The instruction order below is the
		# hand-scheduled order and must not be rearranged.
		&set_label("maw_sse2_unrolled",16);
		&movd("mm3",&DWP(0,$rp,"",0));		# mm3 = r[0]
		&paddq("mm1","mm3");			# mm1 = carry_in + r[0]
		&movd("mm2",&DWP(0,$ap,"",0));		# mm2 = a[0]
		&pmuludq("mm2","mm0");			# mm2 = w*a[0]
		&movd("mm4",&DWP(4,$ap,"",0));		# mm4 = a[1]
		&pmuludq("mm4","mm0");			# mm4 = w*a[1]
		&movd("mm6",&DWP(8,$ap,"",0));		# mm6 = a[2]
		&pmuludq("mm6","mm0");			# mm6 = w*a[2]
		&movd("mm7",&DWP(12,$ap,"",0));		# mm7 = a[3]
		&pmuludq("mm7","mm0");			# mm7 = w*a[3]
		&paddq("mm1","mm2");			# mm1 = carry_in + r[0] + w*a[0]
		&movd("mm3",&DWP(4,$rp,"",0));		# mm3 = r[1]
		&paddq("mm3","mm4");			# mm3 = r[1] + w*a[1]
		&movd("mm5",&DWP(8,$rp,"",0));		# mm5 = r[2]
		&paddq("mm5","mm6");			# mm5 = r[2] + w*a[2]
		&movd("mm4",&DWP(12,$rp,"",0));		# mm4 = r[3]
		&paddq("mm7","mm4");			# mm7 = r[3] + w*a[3]
		&movd(&DWP(0,$rp,"",0),"mm1");		# store r[0]
		&movd("mm2",&DWP(16,$ap,"",0));		# mm2 = a[4]
		&pmuludq("mm2","mm0");			# mm2 = w*a[4]
		&psrlq("mm1",32);			# mm1 = carry0
		&movd("mm4",&DWP(20,$ap,"",0));		# mm4 = a[5]
		&pmuludq("mm4","mm0");			# mm4 = w*a[5]
		&paddq("mm1","mm3");			# mm1 = carry0 + r[1] + w*a[1]
		&movd("mm6",&DWP(24,$ap,"",0));		# mm6 = a[6]
		&pmuludq("mm6","mm0");			# mm6 = w*a[6]
		&movd(&DWP(4,$rp,"",0),"mm1");		# store r[1]
		&psrlq("mm1",32);			# mm1 = carry1
		&movd("mm3",&DWP(28,$ap,"",0));		# mm3 = a[7]
		&add($ap,32);				# a += 8 words
		&pmuludq("mm3","mm0");			# mm3 = w*a[7]
		&paddq("mm1","mm5");			# mm1 = carry1 + r[2] + w*a[2]
		&movd("mm5",&DWP(16,$rp,"",0));		# mm5 = r[4]
		&paddq("mm2","mm5");			# mm2 = r[4] + w*a[4]
		&movd(&DWP(8,$rp,"",0),"mm1");		# store r[2]
		&psrlq("mm1",32);			# mm1 = carry2
		&paddq("mm1","mm7");			# mm1 = carry2 + r[3] + w*a[3]
		&movd("mm5",&DWP(20,$rp,"",0));		# mm5 = r[5]
		&paddq("mm4","mm5");			# mm4 = r[5] + w*a[5]
		&movd(&DWP(12,$rp,"",0),"mm1");		# store r[3]
		&psrlq("mm1",32);			# mm1 = carry3
		&paddq("mm1","mm2");			# mm1 = carry3 + r[4] + w*a[4]
		&movd("mm5",&DWP(24,$rp,"",0));		# mm5 = r[6]
		&paddq("mm6","mm5");			# mm6 = r[6] + w*a[6]
		&movd(&DWP(16,$rp,"",0),"mm1");		# store r[4]
		&psrlq("mm1",32);			# mm1 = carry4
		&paddq("mm1","mm4");			# mm1 = carry4 + r[5] + w*a[5]
		&movd("mm5",&DWP(28,$rp,"",0));		# mm5 = r[7]
		&paddq("mm3","mm5");			# mm3 = r[7] + w*a[7]
		&movd(&DWP(20,$rp,"",0),"mm1");		# store r[5]
		&psrlq("mm1",32);			# mm1 = carry5
		&paddq("mm1","mm6");			# mm1 = carry5 + r[6] + w*a[6]
		&movd(&DWP(24,$rp,"",0),"mm1");		# store r[6]
		&psrlq("mm1",32);			# mm1 = carry6
		&paddq("mm1","mm3");			# mm1 = carry6 + r[7] + w*a[7]
		&movd(&DWP(28,$rp,"",0),"mm1");		# store r[7]
		&lea($rp,&DWP(32,$rp));			# r += 8 words (flags untouched)
		&psrlq("mm1",32);			# mm1 = carry_out

		&sub($cnt,8);
		&jz(&label("maw_sse2_exit"));

		# Entry point: take the unrolled pass while at least 8 words remain.
		&set_label("maw_sse2_entry");
		&test($cnt,0xfffffff8);
		&jnz(&label("maw_sse2_unrolled"));

		# One word per pass for the remainder.
		&set_label("maw_sse2_loop",4);
		&movd("mm2",&DWP(0,$ap));		# mm2 = a[i]
		&movd("mm3",&DWP(0,$rp));		# mm3 = r[i]
		&pmuludq("mm2","mm0");			# mm2 = a[i]*w
		&lea($ap,&DWP(4,$ap));
		&paddq("mm1","mm3");			# carry += r[i]
		&paddq("mm1","mm2");			# carry += a[i]*w
		&movd(&DWP(0,$rp),"mm1");		# r[i] = low half
		&sub($cnt,1);
		&psrlq("mm1",32);			# carry = high half
		&lea($rp,&DWP(4,$rp));
		&jnz(&label("maw_sse2_loop"));

		&set_label("maw_sse2_exit");
		&movd("eax","mm1");			# return value = carry_out
		&emms();
		&ret();

		&set_label("maw_non_sse2",16);
	}

	# Integer variant.  The prologue is spelled out by hand (same four
	# pushes that function_begin would emit) because function_begin_B was
	# used above.
	for my $saved (qw(ebp ebx esi edi)) {
		&push($saved);
	}

	&comment("");
	my ($ap, $w, $rp, $carry) = ("ebx", "ebp", "edi", "esi");

	&xor($carry,$carry);			# carry = 0
	&mov($rp,&wparam(0));

	&mov("ecx",&wparam(2));
	&mov($ap,&wparam(1));

	&and("ecx",0xfffffff8);			# num rounded down to a multiple of 8
	&mov($w,&wparam(3));

	# Scratch stack slot; the value is never read back, only popped at the
	# end.  push leaves the flags from the "and" intact for the jz below.
	&push("ecx");

	&jz(&label("maw_finish"));

	&set_label("maw_loop",16);

	for (my $off = 0; $off < 32; $off += 4) {
		&comment("Round $off");

		&mov("eax",&DWP($off,$ap));		# a[i]
		&mul($w);				# edx:eax = a[i] * w
		&add("eax",$carry);			# low  += carry
		&adc("edx",0);				# high += CF
		&add("eax",&DWP($off,$rp));		# low  += r[i]
		&adc("edx",0);				# high += CF
		&mov(&DWP($off,$rp),"eax");		# r[i]  = low
		&mov($carry,"edx");			# carry = high
	}

	&comment("");
	&sub("ecx",8);
	&lea($ap,&DWP(32,$ap));			# lea keeps the flags from sub
	&lea($rp,&DWP(32,$rp));
	&jnz(&label("maw_loop"));

	&set_label("maw_finish",0);
	&mov("ecx",&wparam(2));			# reload num
	&and("ecx",7);				# words left over (0..7)
	&jnz(&label("maw_finish2"));		# helps branch prediction
	&jmp(&label("maw_end"));

	&set_label("maw_finish2",1);
	for my $k (0 .. 6) {
		my $off      = $k*4;
		my $not_last = ($k != 7-1);

		&comment("Tail Round $k");
		&mov("eax",&DWP($off,$ap));		# a[i]
		&mul($w);				# edx:eax = a[i] * w
		&add("eax",$carry);			# low  += carry
		&adc("edx",0);				# high += CF
		&add("eax",&DWP($off,$rp));		# low  += r[i]
		&adc("edx",0);				# high += CF
		&dec("ecx") if $not_last;		# sets ZF for the jz below
		&mov(&DWP($off,$rp),"eax");		# r[i]  = low  (mov keeps flags)
		&mov($carry,"edx");			# carry = high
		&jz(&label("maw_end")) if $not_last;
	}
	&set_label("maw_end",0);
	&mov("eax",$carry);			# return value = carry

	&pop("ecx");				# drop the scratch slot

	&function_end($name);
}

# bn_mul_words(NAME)
#
# Emits routine NAME.  The emitted code reads four stack arguments:
# wparam(0) = r, wparam(1) = a, wparam(2) = num, wparam(3) = w.  For each of
# the num words it stores the low half of a[i] * w + carry into r[i], keeps
# the high half as the next carry, and leaves the final carry in eax.
#
# With $sse2 set, an MMX/SSE2 loop is emitted first behind a run-time test of
# the SSE2 capability bit.  That loop is bottom-tested (one word is handled
# before the counter is checked), so it relies on num being non-zero.
sub bn_mul_words {
	my ($name) = @_;

	&function_begin_B($name,"");

	if ($sse2) {
		my ($rp, $ap, $cnt) = ("eax", "edx", "ecx");

		# NOTE(review): picsetup/picsymbol are defined outside this file;
		# presumably they leave the address of OPENSSL_ia32cap_P in eax,
		# which bt then dereferences -- confirm.
		&picsetup("eax");
		&picsymbol("eax", "OPENSSL_ia32cap_P", "eax");
		&bt(&DWP(0,"eax"),"\$IA32CAP_BIT0_SSE2");
		&jnc(&label("mw_non_sse2"));

		&mov($rp,&wparam(0));
		&mov($ap,&wparam(1));
		&mov($cnt,&wparam(2));
		&movd("mm0",&wparam(3));		# mm0 = w
		&pxor("mm1","mm1");			# mm1 = carry = 0

		&set_label("mw_sse2_loop",16);
		&movd("mm2",&DWP(0,$ap));		# mm2 = a[i]
		&pmuludq("mm2","mm0");			# mm2 = a[i]*w
		&lea($ap,&DWP(4,$ap));
		&paddq("mm1","mm2");			# carry += a[i]*w
		&movd(&DWP(0,$rp),"mm1");		# r[i] = low half
		&sub($cnt,1);
		&psrlq("mm1",32);			# carry = high half
		&lea($rp,&DWP(4,$rp));
		&jnz(&label("mw_sse2_loop"));

		&movd("eax","mm1");			# return value = carry
		&emms();
		&ret();
		&set_label("mw_non_sse2",16);
	}

	# Integer variant.  The prologue is spelled out by hand (same four
	# pushes that function_begin would emit) because function_begin_B was
	# used above.
	for my $saved (qw(ebp ebx esi edi)) {
		&push($saved);
	}

	&comment("");
	my ($ap, $w, $rp, $carry, $cnt) = ("ebx", "ecx", "edi", "esi", "ebp");

	&xor($carry,$carry);			# carry = 0
	&mov($rp,&wparam(0));
	&mov($ap,&wparam(1));
	&mov($cnt,&wparam(2));
	&mov($w,&wparam(3));

	&and($cnt,0xfffffff8);			# num rounded down to a multiple of 8
	&jz(&label("mw_finish"));

	&set_label("mw_loop",0);
	for (my $off = 0; $off < 32; $off += 4) {
		&comment("Round $off");

		&mov("eax",&DWP($off,$ap,"",0));	# a[i]
		&mul($w);				# edx:eax = a[i] * w
		&add("eax",$carry);			# low  += carry
		&adc("edx",0);				# high += CF
		&mov(&DWP($off,$rp,"",0),"eax");	# r[i]  = low
		&mov($carry,"edx");			# carry = high
	}

	&comment("");
	&add($ap,32);
	&add($rp,32);
	&sub($cnt,8);
	&jz(&label("mw_finish"));
	&jmp(&label("mw_loop"));

	&set_label("mw_finish",0);
	&mov($cnt,&wparam(2));			# reload num
	&and($cnt,7);				# words left over (0..7)
	&jnz(&label("mw_finish2"));
	&jmp(&label("mw_end"));

	&set_label("mw_finish2",1);
	for my $k (0 .. 6) {
		my $off      = $k*4;
		my $not_last = ($k != 7-1);

		&comment("Tail Round $k");
		&mov("eax",&DWP($off,$ap,"",0));	# a[i]
		&mul($w);				# edx:eax = a[i] * w
		&add("eax",$carry);			# low  += carry
		&adc("edx",0);				# high += CF
		&mov(&DWP($off,$rp,"",0),"eax");	# r[i]  = low
		&mov($carry,"edx");			# carry = high
		&dec($cnt) if $not_last;
		&jz(&label("mw_end")) if $not_last;
	}
	&set_label("mw_end",0);
	&mov("eax",$carry);			# return value = carry

	&function_end($name);
}

# bn_sqr_words(NAME)
#
# Emits routine NAME.  The emitted code reads three stack arguments:
# wparam(0) = r, wparam(1) = a, wparam(2) = num.  For each of the num input
# words it writes the 64-bit square a[i]*a[i] to r[2*i] (low half) and
# r[2*i+1] (high half).  Nothing is loaded into eax as a result.
#
# With $sse2 set, an MMX/SSE2 loop is emitted first behind a run-time test of
# the SSE2 capability bit.  That loop is bottom-tested (one word is handled
# before the counter is checked), so it relies on num being non-zero.
sub bn_sqr_words {
	my ($name) = @_;

	&function_begin_B($name,"");

	if ($sse2) {
		my ($rp, $ap, $cnt) = ("eax", "edx", "ecx");

		# NOTE(review): picsetup/picsymbol are defined outside this file;
		# presumably they leave the address of OPENSSL_ia32cap_P in eax,
		# which bt then dereferences -- confirm.
		&picsetup("eax");
		&picsymbol("eax", "OPENSSL_ia32cap_P", "eax");
		&bt(&DWP(0,"eax"),"\$IA32CAP_BIT0_SSE2");
		&jnc(&label("sqr_non_sse2"));

		&mov($rp,&wparam(0));
		&mov($ap,&wparam(1));
		&mov($cnt,&wparam(2));

		&set_label("sqr_sse2_loop",16);
		&movd("mm0",&DWP(0,$ap));		# mm0 = a[i]
		&pmuludq("mm0","mm0");			# mm0 = a[i]*a[i]
		&lea($ap,&DWP(4,$ap));			# a += 1 word
		&movq(&QWP(0,$rp),"mm0");		# r[2i..2i+1] = square
		&sub($cnt,1);
		&lea($rp,&DWP(8,$rp));			# r += 2 words (flags untouched)
		&jnz(&label("sqr_sse2_loop"));

		&emms();
		&ret();
		&set_label("sqr_non_sse2",16);
	}

	# Integer variant.  The prologue is spelled out by hand (same four
	# pushes that function_begin would emit) because function_begin_B was
	# used above.
	for my $saved (qw(ebp ebx esi edi)) {
		&push($saved);
	}

	&comment("");
	my ($rp, $ap, $cnt) = ("esi", "edi", "ebx");

	&mov($rp,&wparam(0));
	&mov($ap,&wparam(1));
	&mov($cnt,&wparam(2));

	&and($cnt,0xfffffff8);			# num rounded down to a multiple of 8
	&jz(&label("sw_finish"));

	&set_label("sw_loop",0);
	for my $k (0 .. 7) {
		my $src = 4*$k;			# byte offset into a
		my $dst = 2*$src;		# byte offset into r (two words per input)

		&comment("Round $src");
		&mov("eax",&DWP($src,$ap,"",0));	# a[i]
		&mul("eax");				# edx:eax = a[i] * a[i]
		&mov(&DWP($dst,$rp,"",0),"eax");	# low half
		&mov(&DWP($dst+4,$rp,"",0),"edx");	# high half
	}

	&comment("");
	&add($ap,32);
	&add($rp,64);
	&sub($cnt,8);
	&jnz(&label("sw_loop"));

	&set_label("sw_finish",0);
	&mov($cnt,&wparam(2));			# reload num
	&and($cnt,7);				# words left over (0..7)
	&jz(&label("sw_end"));

	for my $k (0 .. 6) {
		my $not_last = ($k != 7-1);

		&comment("Tail Round $k");
		&mov("eax",&DWP($k*4,$ap,"",0));	# a[i]
		&mul("eax");				# edx:eax = a[i] * a[i]
		&mov(&DWP($k*8,$rp,"",0),"eax");	# low half
		&dec($cnt) if $not_last;		# sets ZF for the jz below
		&mov(&DWP($k*8+4,$rp,"",0),"edx");	# high half (mov keeps flags)
		&jz(&label("sw_end")) if $not_last;
	}
	&set_label("sw_end",0);

	&function_end($name);
}

# bn_div_words(NAME)
#
# Emits routine NAME: a frameless routine that loads wparam(0) into edx,
# wparam(1) into eax and wparam(2) into ecx, executes "div ecx" and returns.
# On x86, div divides edx:eax by its operand and leaves the quotient in eax,
# so the routine returns (wparam(0):wparam(1)) / wparam(2).  No check on the
# operands is emitted.
sub bn_div_words {
	my ($name) = @_;

	# (register, argument index) pairs, in emission order.
	my @loads = (
		[ "edx", 0 ],
		[ "eax", 1 ],
		[ "ecx", 2 ],
	);

	&function_begin_B($name,"");
	for my $load (@loads) {
		my ($reg, $argno) = @{$load};
		&mov($reg,&wparam($argno));
	}
	&div("ecx");
	&ret();
	&function_end_B($name);
}

# bn_add_words(NAME)
#
# Emits routine NAME.  The emitted code reads four stack arguments:
# wparam(0) = r, wparam(1) = a, wparam(2) = b, wparam(3) = num.  For each of
# the num words it stores a[i] + b[i] + carry into r[i].  The carry lives in
# eax throughout, so it is also the value left in eax on return.
sub bn_add_words {
	my ($name) = @_;

	&function_begin($name,"");

	&comment("");
	my ($rp, $ap, $bp, $cnt) = ("ebx", "esi", "edi", "ebp");
	my ($carry, $t1, $t2)    = ("eax", "ecx", "edx");

	# Emits the arithmetic for one word at byte offset $off: afterwards $t1
	# holds a + b + carry_in and $carry holds the carry out (0 or 1).  The
	# carry is kept in a register rather than in CF, so it is collected
	# after each of the two additions.  The store to r is left to the
	# caller because the tail rounds slot a "dec" in front of it.
	my $add_word = sub {
		my ($off) = @_;
		&mov($t1,&DWP($off,$ap,"",0));		# a[i]
		&mov($t2,&DWP($off,$bp,"",0));		# b[i]
		&add($t1,$carry);			# a[i] + carry_in
		&mov($carry,0);				# mov keeps CF
		&adc($carry,$carry);			# carry = CF of first add
		&add($t1,$t2);				# ... + b[i]
		&adc($carry,0);				# carry += CF of second add
	};

	&mov($rp,&wparam(0));
	&mov($ap,&wparam(1));
	&mov($bp,&wparam(2));
	&mov($cnt,&wparam(3));
	&xor($carry,$carry);			# carry = 0
	&and($cnt,0xfffffff8);			# num rounded down to a multiple of 8

	&jz(&label("aw_finish"));

	&set_label("aw_loop",0);
	for my $k (0 .. 7) {
		&comment("Round $k");

		$add_word->($k*4);
		&mov(&DWP($k*4,$rp,"",0),$t1);		# r[i]
	}

	&comment("");
	&add($ap,32);
	&add($bp,32);
	&add($rp,32);
	&sub($cnt,8);
	&jnz(&label("aw_loop"));

	&set_label("aw_finish",0);
	&mov($cnt,&wparam(3));			# reload num
	&and($cnt,7);				# words left over (0..7)
	&jz(&label("aw_end"));

	for my $k (0 .. 6) {
		my $not_last = ($k != 6);

		&comment("Tail Round $k");
		$add_word->($k*4);
		&dec($cnt) if $not_last;		# sets ZF for the jz below
		&mov(&DWP($k*4,$rp,"",0),$t1);		# r[i] (mov keeps flags)
		&jz(&label("aw_end")) if $not_last;
	}
	&set_label("aw_end",0);

	# No explicit move of the carry into eax: the carry register is eax.

	&function_end($name);
}

# bn_sub_words(NAME)
#
# Emits routine NAME.  The emitted code reads four stack arguments:
# wparam(0) = r, wparam(1) = a, wparam(2) = b, wparam(3) = num.  For each of
# the num words it stores a[i] - b[i] - borrow into r[i].  The borrow lives
# in eax throughout, so it is also the value left in eax on return.
#
# The label names (aw_loop, aw_finish, aw_end) are the same strings that
# bn_add_words uses; they are kept unchanged here.
sub bn_sub_words {
	my ($name) = @_;

	&function_begin($name,"");

	&comment("");
	my ($rp, $ap, $bp, $cnt) = ("ebx", "esi", "edi", "ebp");
	my ($borrow, $t1, $t2)   = ("eax", "ecx", "edx");

	# Emits the arithmetic for one word at byte offset $off: afterwards $t1
	# holds a - b - borrow_in and $borrow holds the borrow out (0 or 1).
	# The borrow is kept in a register rather than in CF, so it is collected
	# after each of the two subtractions.  The store to r is left to the
	# caller because the tail rounds slot a "dec" in front of it.
	my $sub_word = sub {
		my ($off) = @_;
		&mov($t1,&DWP($off,$ap,"",0));		# a[i]
		&mov($t2,&DWP($off,$bp,"",0));		# b[i]
		&sub($t1,$borrow);			# a[i] - borrow_in
		&mov($borrow,0);			# mov keeps CF
		&adc($borrow,$borrow);			# borrow = CF of first sub
		&sub($t1,$t2);				# ... - b[i]
		&adc($borrow,0);			# borrow += CF of second sub
	};

	&mov($rp,&wparam(0));
	&mov($ap,&wparam(1));
	&mov($bp,&wparam(2));
	&mov($cnt,&wparam(3));
	&xor($borrow,$borrow);			# borrow = 0
	&and($cnt,0xfffffff8);			# num rounded down to a multiple of 8

	&jz(&label("aw_finish"));

	&set_label("aw_loop",0);
	for my $k (0 .. 7) {
		&comment("Round $k");

		$sub_word->($k*4);
		&mov(&DWP($k*4,$rp,"",0),$t1);		# r[i]
	}

	&comment("");
	&add($ap,32);
	&add($bp,32);
	&add($rp,32);
	&sub($cnt,8);
	&jnz(&label("aw_loop"));

	&set_label("aw_finish",0);
	&mov($cnt,&wparam(3));			# reload num
	&and($cnt,7);				# words left over (0..7)
	&jz(&label("aw_end"));

	for my $k (0 .. 6) {
		my $not_last = ($k != 6);

		&comment("Tail Round $k");
		$sub_word->($k*4);
		&dec($cnt) if $not_last;		# sets ZF for the jz below
		&mov(&DWP($k*4,$rp,"",0),$t1);		# r[i] (mov keeps flags)
		&jz(&label("aw_end")) if $not_last;
	}
	&set_label("aw_end",0);

	# No explicit move of the borrow into eax: the borrow register is eax.

	&function_end($name);
}

# bn_sub_part_words(NAME)
#
# Emits routine NAME.  The emitted code reads five stack arguments:
# wparam(0) = r, wparam(1) = a, wparam(2) = b, wparam(3) = num and
# wparam(4) = dl (signed).  It works in two phases:
#
#   1. r[i] = a[i] - b[i] - borrow for the first num words, advancing the
#      r/a/b pointers past those words (the tail rounds bump the pointers
#      word by word for that reason).
#   2. Depending on the sign of dl:
#        dl == 0  nothing more is done;
#        dl <  0  r[i] = 0 - b[i] - borrow for a further -dl words;
#        dl >  0  r[i] = a[i] - borrow for a further dl words; as soon as a
#                 word produces no borrow, control moves to a plain copy
#                 of the remaining words of a, which ends with borrow = 0.
#
# The borrow lives in eax throughout, so it is also the value left in eax on
# return.  All label strings and the instruction order are significant and
# are reproduced exactly.
sub bn_sub_part_words {
	my ($name) = @_;

	&function_begin($name,"");

	&comment("");
	my ($rp, $ap, $bp, $cnt) = ("ebx", "esi", "edi", "ebp");
	my ($borrow, $t1, $t2)   = ("eax", "ecx", "edx");

	# $t1 = a[off] - b[off] - borrow_in; $borrow = borrow out (0 or 1).
	# The borrow is collected from CF after each of the two subtractions.
	my $sub_a_b = sub {
		my ($off) = @_;
		&mov($t1,&DWP($off,$ap,"",0));		# a[i]
		&mov($t2,&DWP($off,$bp,"",0));		# b[i]
		&sub($t1,$borrow);
		&mov($borrow,0);			# mov keeps CF
		&adc($borrow,$borrow);
		&sub($t1,$t2);
		&adc($borrow,0);
	};

	# $t1 = 0 - b[off] - borrow_in; $borrow = borrow out (0 or 1).
	my $sub_0_b = sub {
		my ($off) = @_;
		&mov($t1,0);
		&mov($t2,&DWP($off,$bp,"",0));		# b[i]
		&sub($t1,$borrow);
		&mov($borrow,0);			# mov keeps CF
		&adc($borrow,$borrow);
		&sub($t1,$t2);
		&adc($borrow,0);
	};

	# r[off] = a[off], no arithmetic.
	my $copy_a = sub {
		my ($off) = @_;
		&mov($t1,&DWP($off,$ap,"",0));		# a[i]
		&mov(&DWP($off,$rp,"",0),$t1);		# r[i]
	};

	#
	# Phase 1: the common num words.
	#
	&mov($rp,&wparam(0));
	&mov($ap,&wparam(1));
	&mov($bp,&wparam(2));
	&mov($cnt,&wparam(3));
	&xor($borrow,$borrow);			# borrow = 0
	&and($cnt,0xfffffff8);			# num rounded down to a multiple of 8

	&jz(&label("aw_finish"));

	&set_label("aw_loop",0);
	for my $k (0 .. 7) {
		&comment("Round $k");

		$sub_a_b->($k*4);
		&mov(&DWP($k*4,$rp,"",0),$t1);		# r[i]
	}

	&comment("");
	&add($ap,32);
	&add($bp,32);
	&add($rp,32);
	&sub($cnt,8);
	&jnz(&label("aw_loop"));

	&set_label("aw_finish",0);
	&mov($cnt,&wparam(3));			# reload num
	&and($cnt,7);				# words left over (0..7)
	&jz(&label("aw_end"));

	# Unlike the unrolled loop, each tail round works at offset 0 and then
	# advances all three pointers, so they end up just past the last common
	# word for phase 2.
	for my $k (0 .. 6) {
		my $not_last = ($k != 6);

		&comment("Tail Round $k");
		$sub_a_b->(0);
		&mov(&DWP(0,$rp,"",0),$t1);		# r[i]
		&add($ap, 4);
		&add($bp, 4);
		&add($rp, 4);
		&dec($cnt) if $not_last;
		&jz(&label("aw_end")) if $not_last;
	}
	&set_label("aw_end",0);

	#
	# Phase 2: dispatch on dl.  (dl is tested for zero twice, once in
	# memory and once after loading it; both tests are kept.)
	#
	&cmp(&wparam(4),0);
	&je(&label("pw_end"));

	&mov($cnt,&wparam(4));			# dl
	&cmp($cnt,0);
	&je(&label("pw_end"));
	&jge(&label("pw_pos"));

	# dl < 0: subtract the extra words of b from zero.
	&comment("pw_neg");
	&mov($t2,0);
	&sub($t2,$cnt);				# -dl
	&mov($cnt,$t2);
	&and($cnt,0xfffffff8);			# -dl rounded down to a multiple of 8
	&jz(&label("pw_neg_finish"));

	&set_label("pw_neg_loop",0);
	for my $k (0 .. 7) {
		&comment("dl<0 Round $k");

		$sub_0_b->($k*4);
		&mov(&DWP($k*4,$rp,"",0),$t1);		# r[i]
	}

	&comment("");
	&add($bp,32);
	&add($rp,32);
	&sub($cnt,8);
	&jnz(&label("pw_neg_loop"));

	&set_label("pw_neg_finish",0);
	&mov($t2,&wparam(4));			# dl
	&mov($cnt,0);
	&sub($cnt,$t2);				# -dl
	&and($cnt,7);				# words left over (0..7)
	&jz(&label("pw_end"));

	for my $k (0 .. 6) {
		my $not_last = ($k != 6);

		&comment("dl<0 Tail Round $k");
		$sub_0_b->($k*4);
		&dec($cnt) if $not_last;		# sets ZF for the jz below
		&mov(&DWP($k*4,$rp,"",0),$t1);		# r[i] (mov keeps flags)
		&jz(&label("pw_end")) if $not_last;
	}

	&jmp(&label("pw_end"));

	# dl > 0: propagate the borrow through the extra words of a.
	&set_label("pw_pos",0);

	&and($cnt,0xfffffff8);			# dl rounded down to a multiple of 8
	&jz(&label("pw_pos_finish"));

	&set_label("pw_pos_loop",0);

	for my $k (0 .. 7) {
		&comment("dl>0 Round $k");

		&mov($t1,&DWP($k*4,$ap,"",0));		# a[i]
		&sub($t1,$borrow);
		&mov(&DWP($k*4,$rp,"",0),$t1);		# r[i] (mov keeps CF)
		# No borrow out: continue in the copy loop right after word $k.
		&jnc(&label("pw_nc".$k));
	}

	&comment("");
	&add($ap,32);
	&add($rp,32);
	&sub($cnt,8);
	&jnz(&label("pw_pos_loop"));

	&set_label("pw_pos_finish",0);
	&mov($cnt,&wparam(4));			# dl
	&and($cnt,7);				# words left over (0..7)
	&jz(&label("pw_end"));

	for my $k (0 .. 6) {
		my $not_last = ($k != 6);

		&comment("dl>0 Tail Round $k");
		&mov($t1,&DWP($k*4,$ap,"",0));		# a[i]
		&sub($t1,$borrow);
		&mov(&DWP($k*4,$rp,"",0),$t1);		# r[i] (mov keeps CF)
		# No borrow out: continue in the copy tail right after word $k.
		&jnc(&label("pw_tail_nc".$k));
		&dec($cnt) if $not_last;
		&jz(&label("pw_end")) if $not_last;
	}
	&mov($borrow,1);
	&jmp(&label("pw_end"));

	# Borrow-free continuation: plain copy of a into r.  The pw_nc<k>
	# labels sit after the copy of word k, which the borrow loop has
	# already stored before jumping here.
	&set_label("pw_nc_loop",0);
	for my $k (0 .. 7) {
		$copy_a->($k*4);
		&set_label("pw_nc".$k,0);
	}

	&comment("");
	&add($ap,32);
	&add($rp,32);
	&sub($cnt,8);
	&jnz(&label("pw_nc_loop"));

	&mov($cnt,&wparam(4));			# dl
	&and($cnt,7);				# words left over (0..7)
	&jz(&label("pw_nc_end"));

	for my $k (0 .. 6) {
		my $not_last = ($k != 6);

		$copy_a->($k*4);
		&set_label("pw_tail_nc".$k,0);
		&dec($cnt) if $not_last;
		&jz(&label("pw_nc_end")) if $not_last;
	}

	&set_label("pw_nc_end",0);
	&mov($borrow,0);

	&set_label("pw_end",0);

	# No explicit move of the borrow into eax: the borrow register is eax.

	&function_end($name);
}
