xref: /openbsd/lib/libcrypto/bn/asm/bn-586.pl (revision a70818d0)
#!/usr/local/bin/perl

# Driver for the 32-bit x86 BIGNUM word routines.
#
# Usage (as far as this script itself shows): the first command line
# argument is handed to asm_init() together with $0; if any argument
# contains -DOPENSSL_IA32_SSE2 the generated routines also get an
# SSE2 code path selected at run time through OPENSSL_ia32cap_P.
#
# All instruction emitters used below (mov, add, DWP, wparam, label, ...)
# come from x86asm.pl, which is looked up next to this script and in
# ../../perlasm relative to it.

# Directory part of $0, including the trailing separator.  If $0 holds no
# separator the match fails and $dir is simply whatever $1 was (unset).
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

&asm_init($ARGV[0],$0);

# $sse2 is a file-wide switch read by the bn_mul_add_words, bn_mul_words
# and bn_sqr_words generators below.
$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }

&external_label("OPENSSL_ia32cap_P") if ($sse2);

# Emit each routine; the argument is the symbol name to generate.
&bn_mul_add_words("bn_mul_add_words");
&bn_mul_words("bn_mul_words");
&bn_sqr_words("bn_sqr_words");
&bn_div_words("bn_div_words");
&bn_add_words("bn_add_words");
&bn_sub_words("bn_sub_words");

&asm_finish();
225b37fcf3Sryker
# bn_mul_add_words($name)
#
# Emits the routine $name.  The generated code reads four stack parameters,
# wparam(0)=r, wparam(1)=a, wparam(2)=num, wparam(3)=w, and for each of the
# num words computes r[i] = low32(a[i]*w + r[i] + carry), carrying the high
# half into the next word.  The final carry is returned in eax.
#
# When $sse2 is set, the routine first tests the IA32CAP_BIT0_SSE2 bit of
# OPENSSL_ia32cap_P and, if it is set, runs an MMX/SSE2 version that returns
# with a bare ret (no callee-saved registers are touched there).  Otherwise
# it falls through to the integer version, which pushes ebp/ebx/esi/edi by
# hand because function_begin_B was used instead of function_begin.
#
# NOTE(review): the SSE2 path has no guard for num == 0 -- the single-word
# loop is entered and "sub ecx,1" wraps.  The integer path does handle
# num == 0.  Presumably callers never pass 0; confirm before relying on it.
#
# The leading & on the emitter calls is required: several mnemonics
# (sub, and, xor) are Perl keywords/operators and would not parse otherwise.
#
# $r, $a, $c, $w, $Low, $High and $i are package globals, as in the rest of
# this file.
sub bn_mul_add_words
	{
	my($name)=@_;

	&function_begin_B($name,"");

	# Register roles for the SSE2 path.
	$r="eax";
	$a="edx";
	$c="ecx";

	if ($sse2) {
		# picsetup/picsymbol are x86asm.pl helpers; presumably they leave
		# the address of OPENSSL_ia32cap_P in eax -- TODO confirm.
		&picsetup("eax");
		&picsymbol("eax", "OPENSSL_ia32cap_P", "eax");
		&bt(&DWP(0,"eax"),"\$IA32CAP_BIT0_SSE2");
		&jnc(&label("maw_non_sse2"));

		&mov($r,&wparam(0));
		&mov($a,&wparam(1));
		&mov($c,&wparam(2));
		&movd("mm0",&wparam(3));	# mm0 = w
		&pxor("mm1","mm1");		# mm1 = carry_in
		&jmp(&label("maw_sse2_entry"));

	# Eight words per pass; loads, multiplies and carry chain are interleaved.
	&set_label("maw_sse2_unrolled",16);
		&movd("mm3",&DWP(0,$r,"",0));	# mm3 = r[0]
		&paddq("mm1","mm3");		# mm1 = carry_in + r[0]
		&movd("mm2",&DWP(0,$a,"",0));	# mm2 = a[0]
		&pmuludq("mm2","mm0");		# mm2 = w*a[0]
		&movd("mm4",&DWP(4,$a,"",0));	# mm4 = a[1]
		&pmuludq("mm4","mm0");		# mm4 = w*a[1]
		&movd("mm6",&DWP(8,$a,"",0));	# mm6 = a[2]
		&pmuludq("mm6","mm0");		# mm6 = w*a[2]
		&movd("mm7",&DWP(12,$a,"",0));	# mm7 = a[3]
		&pmuludq("mm7","mm0");		# mm7 = w*a[3]
		&paddq("mm1","mm2");		# mm1 = carry_in + r[0] + w*a[0]
		&movd("mm3",&DWP(4,$r,"",0));	# mm3 = r[1]
		&paddq("mm3","mm4");		# mm3 = r[1] + w*a[1]
		&movd("mm5",&DWP(8,$r,"",0));	# mm5 = r[2]
		&paddq("mm5","mm6");		# mm5 = r[2] + w*a[2]
		&movd("mm4",&DWP(12,$r,"",0));	# mm4 = r[3]
		&paddq("mm7","mm4");		# mm7 = r[3] + w*a[3]
		&movd(&DWP(0,$r,"",0),"mm1");
		&movd("mm2",&DWP(16,$a,"",0));	# mm2 = a[4]
		&pmuludq("mm2","mm0");		# mm2 = w*a[4]
		&psrlq("mm1",32);		# mm1 = carry0
		&movd("mm4",&DWP(20,$a,"",0));	# mm4 = a[5]
		&pmuludq("mm4","mm0");		# mm4 = w*a[5]
		&paddq("mm1","mm3");		# mm1 = carry0 + r[1] + w*a[1]
		&movd("mm6",&DWP(24,$a,"",0));	# mm6 = a[6]
		&pmuludq("mm6","mm0");		# mm6 = w*a[6]
		&movd(&DWP(4,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry1
		&movd("mm3",&DWP(28,$a,"",0));	# mm3 = a[7]
		&add($a,32);
		&pmuludq("mm3","mm0");		# mm3 = w*a[7]
		&paddq("mm1","mm5");		# mm1 = carry1 + r[2] + w*a[2]
		&movd("mm5",&DWP(16,$r,"",0));	# mm5 = r[4]
		&paddq("mm2","mm5");		# mm2 = r[4] + w*a[4]
		&movd(&DWP(8,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry2
		&paddq("mm1","mm7");		# mm1 = carry2 + r[3] + w*a[3]
		&movd("mm5",&DWP(20,$r,"",0));	# mm5 = r[5]
		&paddq("mm4","mm5");		# mm4 = r[5] + w*a[5]
		&movd(&DWP(12,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry3
		&paddq("mm1","mm2");		# mm1 = carry3 + r[4] + w*a[4]
		&movd("mm5",&DWP(24,$r,"",0));	# mm5 = r[6]
		&paddq("mm6","mm5");		# mm6 = r[6] + w*a[6]
		&movd(&DWP(16,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry4
		&paddq("mm1","mm4");		# mm1 = carry4 + r[5] + w*a[5]
		&movd("mm5",&DWP(28,$r,"",0));	# mm5 = r[7]
		&paddq("mm3","mm5");		# mm3 = r[7] + w*a[7]
		&movd(&DWP(20,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry5
		&paddq("mm1","mm6");		# mm1 = carry5 + r[6] + w*a[6]
		&movd(&DWP(24,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry6
		&paddq("mm1","mm3");		# mm1 = carry6 + r[7] + w*a[7]
		&movd(&DWP(28,$r,"",0),"mm1");
		&lea($r,&DWP(32,$r));
		&psrlq("mm1",32);		# mm1 = carry_out

		&sub($c,8);
		&jz(&label("maw_sse2_exit"));
	# Take the unrolled loop while at least 8 words remain; otherwise
	# fall through to the one-word-at-a-time loop.
	&set_label("maw_sse2_entry");
		&test($c,0xfffffff8);
		&jnz(&label("maw_sse2_unrolled"));

	&set_label("maw_sse2_loop",4);
		&movd("mm2",&DWP(0,$a));	# mm2 = a[i]
		&movd("mm3",&DWP(0,$r));	# mm3 = r[i]
		&pmuludq("mm2","mm0");		# a[i] *= w
		&lea($a,&DWP(4,$a));
		&paddq("mm1","mm3");		# carry += r[i]
		&paddq("mm1","mm2");		# carry += a[i]*w
		&movd(&DWP(0,$r),"mm1");	# r[i] = carry_low
		&sub($c,1);			# sets ZF for the jnz below
		&psrlq("mm1",32);		# carry = carry_high
		&lea($r,&DWP(4,$r));
		&jnz(&label("maw_sse2_loop"));
	&set_label("maw_sse2_exit");
		&movd("eax","mm1");		# c = carry_out
		&emms();
		&ret();

	&set_label("maw_non_sse2",16);
	}

	# function_begin prologue
	&push("ebp");
	&push("ebx");
	&push("esi");
	&push("edi");

	&comment("");
	# Register roles for the integer path.  $Low/$High only name the
	# halves of the mul result; the code below spells eax/edx literally.
	$Low="eax";
	$High="edx";
	$a="ebx";
	$w="ebp";
	$r="edi";
	$c="esi";

	&xor($c,$c);		# clear carry
	&mov($r,&wparam(0));	#

	&mov("ecx",&wparam(2));	#
	&mov($a,&wparam(1));	#

	&and("ecx",0xfffffff8);	# num rounded down to a multiple of 8
	&mov($w,&wparam(3));	#

	# Reserve one stack slot.  Nothing in this routine reads it back; it is
	# dropped again by the pop before function_end.
	&push("ecx");		# Up the stack for a tmp variable

	# Still testing the ZF produced by the and above: mov and push leave
	# the flags alone.
	&jz(&label("maw_finish"));

	&set_label("maw_loop",16);

	for ($i=0; $i<32; $i+=4)
		{
		&comment("Round $i");

		 &mov("eax",&DWP($i,$a)); 	# *a
		&mul($w);			# *a * w
		&add("eax",$c);			# L(t)+= c
		&adc("edx",0);			# H(t)+=carry
		 &add("eax",&DWP($i,$r));	# L(t)+= *r
		&adc("edx",0);			# H(t)+=carry
		 &mov(&DWP($i,$r),"eax");	# *r= L(t);
		&mov($c,"edx");			# c=  H(t);
		}

	&comment("");
	&sub("ecx",8);
	# lea advances the pointers without disturbing the ZF from the sub.
	&lea($a,&DWP(32,$a));
	&lea($r,&DWP(32,$r));
	&jnz(&label("maw_loop"));

	&set_label("maw_finish",0);
	&mov("ecx",&wparam(2));	# get num
	&and("ecx",7);		# words left over after the unrolled loop
	&jnz(&label("maw_finish2"));	# helps branch prediction
	&jmp(&label("maw_end"));

	# Up to 7 leftover words; the last round needs no counter test.
	&set_label("maw_finish2",1);
	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		 &mov("eax",&DWP($i*4,$a));	# *a
		&mul($w);			# *a * w
		&add("eax",$c);			# L(t)+=c
		&adc("edx",0);			# H(t)+=carry
		 &add("eax",&DWP($i*4,$r));	# L(t)+= *r
		&adc("edx",0);			# H(t)+=carry
		 &dec("ecx") if ($i != 7-1);
		&mov(&DWP($i*4,$r),"eax");	# *r= L(t);
		 &mov($c,"edx");		# c=  H(t);
		&jz(&label("maw_end")) if ($i != 7-1);
		}
	&set_label("maw_end",0);
	&mov("eax",$c);		# return the final carry

	&pop("ecx");	# drop the temporary slot pushed above

	&function_end($name);
	}
2095b37fcf3Sryker
# bn_mul_words($name)
#
# Emits the routine $name.  The generated code reads wparam(0)=r,
# wparam(1)=a, wparam(2)=num, wparam(3)=w and for each of the num words
# stores r[i] = low32(a[i]*w + carry), carrying the high half forward.
# The final carry is returned in eax.
#
# With $sse2 set, a run-time test of the IA32CAP_BIT0_SSE2 bit in
# OPENSSL_ia32cap_P selects between an MMX/SSE2 loop (one word per
# iteration, returns with a bare ret) and the integer version, which pushes
# ebp/ebx/esi/edi by hand because function_begin_B was used.
#
# NOTE(review): the SSE2 loop is entered unconditionally, so num == 0 would
# wrap the counter; the integer path handles num == 0.  Presumably callers
# never pass 0 -- confirm.
#
# The leading & on emitter calls is required because sub, and and xor are
# Perl keywords/operators.
sub bn_mul_words
	{
	my($name)=@_;

	&function_begin_B($name,"");

	# Register roles for the SSE2 path.
	$r="eax";
	$a="edx";
	$c="ecx";

	if ($sse2) {
		# picsetup/picsymbol are x86asm.pl helpers; presumably they leave
		# the address of OPENSSL_ia32cap_P in eax -- TODO confirm.
		&picsetup("eax");
		&picsymbol("eax", "OPENSSL_ia32cap_P", "eax");
		&bt(&DWP(0,"eax"),"\$IA32CAP_BIT0_SSE2");
		&jnc(&label("mw_non_sse2"));

		&mov($r,&wparam(0));
		&mov($a,&wparam(1));
		&mov($c,&wparam(2));
		&movd("mm0",&wparam(3));	# mm0 = w
		&pxor("mm1","mm1");		# mm1 = carry = 0

	&set_label("mw_sse2_loop",16);
		&movd("mm2",&DWP(0,$a));	# mm2 = a[i]
		&pmuludq("mm2","mm0");		# a[i] *= w
		&lea($a,&DWP(4,$a));
		&paddq("mm1","mm2");		# carry += a[i]*w
		&movd(&DWP(0,$r),"mm1");	# r[i] = carry_low
		&sub($c,1);			# sets ZF for the jnz below
		&psrlq("mm1",32);		# carry = carry_high
		&lea($r,&DWP(4,$r));
		&jnz(&label("mw_sse2_loop"));

		&movd("eax","mm1");		# return carry
		&emms();
		&ret();
	&set_label("mw_non_sse2",16);
	}

	# function_begin prologue
	&push("ebp");
	&push("ebx");
	&push("esi");
	&push("edi");

	&comment("");
	# Register roles for the integer path.  $Low/$High only name the
	# halves of the mul result; the code below spells eax/edx literally.
	$Low="eax";
	$High="edx";
	$a="ebx";
	$w="ecx";
	$r="edi";
	$c="esi";
	$num="ebp";

	&xor($c,$c);		# clear carry
	&mov($r,&wparam(0));	#
	&mov($a,&wparam(1));	#
	&mov($num,&wparam(2));	#
	&mov($w,&wparam(3));	#

	&and($num,0xfffffff8);	# num rounded down to a multiple of 8
	&jz(&label("mw_finish"));

	# Eight words per pass.
	&set_label("mw_loop",0);
	for ($i=0; $i<32; $i+=4)
		{
		&comment("Round $i");

		 &mov("eax",&DWP($i,$a,"",0)); 	# *a
		&mul($w);			# *a * w
		&add("eax",$c);			# L(t)+=c
		 # XXX

		&adc("edx",0);			# H(t)+=carry
		 &mov(&DWP($i,$r,"",0),"eax");	# *r= L(t);

		&mov($c,"edx");			# c=  H(t);
		}

	&comment("");
	&add($a,32);
	&add($r,32);
	&sub($num,8);
	&jz(&label("mw_finish"));
	&jmp(&label("mw_loop"));

	&set_label("mw_finish",0);
	&mov($num,&wparam(2));	# get num
	&and($num,7);		# words left over after the unrolled loop
	&jnz(&label("mw_finish2"));
	&jmp(&label("mw_end"));

	# Up to 7 leftover words; the last round needs no counter test.
	&set_label("mw_finish2",1);
	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		 &mov("eax",&DWP($i*4,$a,"",0));# *a
		&mul($w);			# *a * w
		&add("eax",$c);			# L(t)+=c
		 # XXX
		&adc("edx",0);			# H(t)+=carry
		 &mov(&DWP($i*4,$r,"",0),"eax");# *r= L(t);
		&mov($c,"edx");			# c=  H(t);
		 &dec($num) if ($i != 7-1);
		&jz(&label("mw_end")) if ($i != 7-1);
		}
	&set_label("mw_end",0);
	&mov("eax",$c);		# return the final carry

	&function_end($name);
	}
3215b37fcf3Sryker
# bn_sqr_words($name)
#
# Emits the routine $name.  The generated code reads wparam(0)=r,
# wparam(1)=a, wparam(2)=num and for each of the num input words stores the
# 64-bit square a[i]*a[i] as two consecutive 32-bit words of r (low word
# first), so r advances twice as fast as a.  No carry is propagated and no
# result register is set up before returning.
#
# With $sse2 set, a run-time test of the IA32CAP_BIT0_SSE2 bit in
# OPENSSL_ia32cap_P selects between an MMX/SSE2 loop (returns with a bare
# ret) and the integer version, which pushes ebp/ebx/esi/edi by hand because
# function_begin_B was used.
#
# NOTE(review): the SSE2 loop is entered unconditionally, so num == 0 would
# wrap the counter; the integer path handles num == 0.  Presumably callers
# never pass 0 -- confirm.
#
# The leading & on emitter calls is required because sub and and are Perl
# keywords/operators.
sub bn_sqr_words
	{
	my($name)=@_;

	&function_begin_B($name,"");

	# Register roles for the SSE2 path.
	$r="eax";
	$a="edx";
	$c="ecx";

	if ($sse2) {
		# picsetup/picsymbol are x86asm.pl helpers; presumably they leave
		# the address of OPENSSL_ia32cap_P in eax -- TODO confirm.
		&picsetup("eax");
		&picsymbol("eax", "OPENSSL_ia32cap_P", "eax");
		&bt(&DWP(0,"eax"),"\$IA32CAP_BIT0_SSE2");
		&jnc(&label("sqr_non_sse2"));

		&mov($r,&wparam(0));
		&mov($a,&wparam(1));
		&mov($c,&wparam(2));

	&set_label("sqr_sse2_loop",16);
		&movd("mm0",&DWP(0,$a));	# mm0 = a[i]
		&pmuludq("mm0","mm0");		# a[i] *= a[i]
		&lea($a,&DWP(4,$a));		# a++
		&movq(&QWP(0,$r),"mm0");	# r[i] = a[i]*a[i]
		&sub($c,1);			# sets ZF for the jnz below
		&lea($r,&DWP(8,$r));		# r += 2
		&jnz(&label("sqr_sse2_loop"));

		&emms();
		&ret();
	&set_label("sqr_non_sse2",16);
	}

	# function_begin prologue
	&push("ebp");
	&push("ebx");
	&push("esi");
	&push("edi");

	&comment("");
	# Register roles for the integer path.
	$r="esi";
	$a="edi";
	$num="ebx";

	&mov($r,&wparam(0));	#
	&mov($a,&wparam(1));	#
	&mov($num,&wparam(2));	#

	&and($num,0xfffffff8);	# num rounded down to a multiple of 8
	&jz(&label("sw_finish"));

	# Eight input words (sixteen output words) per pass.
	&set_label("sw_loop",0);
	for ($i=0; $i<32; $i+=4)
		{
		&comment("Round $i");
		&mov("eax",&DWP($i,$a,"",0)); 	# *a
		 # XXX
		&mul("eax");			# *a * *a
		&mov(&DWP($i*2,$r,"",0),"eax");	# low word of the square
		 &mov(&DWP($i*2+4,$r,"",0),"edx");# high word of the square
		}

	&comment("");
	&add($a,32);
	&add($r,64);
	&sub($num,8);
	&jnz(&label("sw_loop"));

	&set_label("sw_finish",0);
	&mov($num,&wparam(2));	# get num
	&and($num,7);		# words left over after the unrolled loop
	&jz(&label("sw_end"));

	# Up to 7 leftover words; the last round needs no counter test.  The
	# dec sits between the two stores; mov does not alter the flags, so the
	# jz still sees the result of the dec.
	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		&mov("eax",&DWP($i*4,$a,"",0));	# *a
		 # XXX
		&mul("eax");			# *a * *a
		&mov(&DWP($i*8,$r,"",0),"eax");	# low word of the square
		 &dec($num) if ($i != 7-1);
		&mov(&DWP($i*8+4,$r,"",0),"edx");
		 &jz(&label("sw_end")) if ($i != 7-1);
		}
	&set_label("sw_end",0);

	&function_end($name);
	}
4115b37fcf3Sryker
# bn_div_words($name)
#
# Emits the routine $name: a three-instruction wrapper around the x86 div
# instruction.  wparam(0) is loaded into edx, wparam(1) into eax and
# wparam(2) into ecx; "div ecx" then divides the 64-bit value edx:eax by
# ecx, leaving the quotient in eax (and the remainder in edx) for the
# caller.
#
# No registers are saved (function_begin_B/function_end_B), and nothing is
# checked before dividing: a zero divisor, or a quotient that does not fit
# in 32 bits, raises the CPU divide-error exception.
sub bn_div_words
	{
	my($name)=@_;

	&function_begin_B($name,"");
	&mov("edx",&wparam(0));	# high half of the dividend
	&mov("eax",&wparam(1));	# low half of the dividend
	&mov("ecx",&wparam(2));	# divisor
	&div("ecx");
	&ret();
	&function_end_B($name);
	}
4245b37fcf3Sryker
# bn_add_words($name)
#
# Emits the routine $name.  The generated code reads wparam(0)=r,
# wparam(1)=a, wparam(2)=b, wparam(3)=num and for each of the num words
# stores r[i] = a[i] + b[i] + carry.  The carry lives in eax throughout, so
# the final carry is already the return value when the routine ends.
#
# The carry is rebuilt in a register on every word instead of being kept in
# the CPU carry flag:
#	add tmp1,c ; mov c,0 ; adc c,c	-> c = carry out of (a[i] + c)
#	add tmp1,tmp2 ; adc c,0		-> c += carry out of (+ b[i])
# (mov does not modify the flags, so adc c,c with c == 0 just captures CF.)
#
# The leading & on emitter calls is required because sub, and and xor are
# Perl keywords/operators.
sub bn_add_words
	{
	my($name)=@_;

	&function_begin($name,"");

	&comment("");
	# Register roles.
	$a="esi";
	$b="edi";
	$c="eax";
	$r="ebx";
	$tmp1="ecx";
	$tmp2="edx";
	$num="ebp";

	&mov($r,&wparam(0));	# get r
	 &mov($a,&wparam(1));	# get a
	&mov($b,&wparam(2));	# get b
	 &mov($num,&wparam(3));	# get num
	&xor($c,$c);		# clear carry
	 &and($num,0xfffffff8);	# num rounded down to a multiple of 8

	&jz(&label("aw_finish"));

	# Eight words per pass.
	&set_label("aw_loop",0);
	for ($i=0; $i<8; $i++)
		{
		&comment("Round $i");

		&mov($tmp1,&DWP($i*4,$a,"",0)); 	# *a
		 &mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
		&add($tmp1,$c);
		 &mov($c,0);
		&adc($c,$c);
		 &add($tmp1,$tmp2);
		&adc($c,0);
		 &mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
		}

	&comment("");
	&add($a,32);
	 &add($b,32);
	&add($r,32);
	 &sub($num,8);
	&jnz(&label("aw_loop"));

	&set_label("aw_finish",0);
	&mov($num,&wparam(3));	# get num
	&and($num,7);		# words left over after the unrolled loop
	 &jz(&label("aw_end"));

	# Up to 7 leftover words; the last round needs no counter test.  The
	# store after the dec is a mov, so the jz still sees the dec's ZF.
	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
		 &mov($tmp2,&DWP($i*4,$b,"",0));# *b
		&add($tmp1,$c);
		 &mov($c,0);
		&adc($c,$c);
		 &add($tmp1,$tmp2);
		&adc($c,0);
		 &dec($num) if ($i != 6);
		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
		 &jz(&label("aw_end")) if ($i != 6);
		}
	&set_label("aw_end",0);

	# No move into eax is needed for the return value:
#	&mov("eax",$c);		# $c is "eax"

	&function_end($name);
	}
496913ec974Sbeck
# bn_sub_words($name)
#
# Emits the routine $name.  The generated code reads wparam(0)=r,
# wparam(1)=a, wparam(2)=b, wparam(3)=num and for each of the num words
# stores r[i] = a[i] - b[i] - borrow.  The borrow lives in eax throughout,
# so the final borrow is already the return value when the routine ends.
#
# The borrow is rebuilt in a register on every word instead of being kept
# in the CPU carry flag:
#	sub tmp1,c ; mov c,0 ; adc c,c	-> c = borrow out of (a[i] - c)
#	sub tmp1,tmp2 ; adc c,0		-> c += borrow out of (- b[i])
# (mov does not modify the flags, so adc c,c with c == 0 just captures CF.)
#
# The label names ("aw_loop", "aw_finish", "aw_end") are the same strings
# bn_add_words uses.  NOTE(review): this relies on x86asm.pl scoping labels
# per function -- confirm before renaming or reordering.
#
# The leading & on emitter calls is required because sub, and and xor are
# Perl keywords/operators.
sub bn_sub_words
	{
	my($name)=@_;

	&function_begin($name,"");

	&comment("");
	# Register roles.
	$a="esi";
	$b="edi";
	$c="eax";
	$r="ebx";
	$tmp1="ecx";
	$tmp2="edx";
	$num="ebp";

	&mov($r,&wparam(0));	# get r
	 &mov($a,&wparam(1));	# get a
	&mov($b,&wparam(2));	# get b
	 &mov($num,&wparam(3));	# get num
	&xor($c,$c);		# clear borrow
	 &and($num,0xfffffff8);	# num rounded down to a multiple of 8

	&jz(&label("aw_finish"));

	# Eight words per pass.
	&set_label("aw_loop",0);
	for ($i=0; $i<8; $i++)
		{
		&comment("Round $i");

		&mov($tmp1,&DWP($i*4,$a,"",0)); 	# *a
		 &mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
		&sub($tmp1,$c);
		 &mov($c,0);
		&adc($c,$c);
		 &sub($tmp1,$tmp2);
		&adc($c,0);
		 &mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
		}

	&comment("");
	&add($a,32);
	 &add($b,32);
	&add($r,32);
	 &sub($num,8);
	&jnz(&label("aw_loop"));

	&set_label("aw_finish",0);
	&mov($num,&wparam(3));	# get num
	&and($num,7);		# words left over after the unrolled loop
	 &jz(&label("aw_end"));

	# Up to 7 leftover words; the last round needs no counter test.  The
	# store after the dec is a mov, so the jz still sees the dec's ZF.
	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
		 &mov($tmp2,&DWP($i*4,$b,"",0));# *b
		&sub($tmp1,$c);
		 &mov($c,0);
		&adc($c,$c);
		 &sub($tmp1,$tmp2);
		&adc($c,0);
		 &dec($num) if ($i != 6);
		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
		 &jz(&label("aw_end")) if ($i != 6);
		}
	&set_label("aw_end",0);

	# No move into eax is needed for the return value:
#	&mov("eax",$c);		# $c is "eax"

	&function_end($name);
	}
568