#!/usr/local/bin/perl
#
# bn-586.pl - perlasm driver that emits the x86 BIGNUM word primitives
# defined in this file.
#
# Origin: /openbsd/lib/libcrypto/bn/asm/bn-586.pl (revision 0a5d6ede)
#
# The script's own directory and <dir>/../../perlasm are appended to @INC so
# that x86asm.pl can be found.  $ARGV[0] is handed to asm_init() untouched
# (presumably the assembler flavour -- see x86asm.pl).  If any argument
# contains -DOPENSSL_IA32_SSE2, $sse2 is set and the generators below also
# emit their MMX/SSE2 code paths, which reference OPENSSL_ia32cap_P.

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

&asm_init($ARGV[0],$0);

$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }

# The SSE2 paths read OPENSSL_ia32cap_P at run time, so declare it.
&external_label("OPENSSL_ia32cap_P") if ($sse2);

# Emit one assembler function per primitive, in this order.
&bn_mul_add_words("bn_mul_add_words");
&bn_mul_words("bn_mul_words");
&bn_sqr_words("bn_sqr_words");
&bn_div_words("bn_div_words");
&bn_add_words("bn_add_words");
&bn_sub_words("bn_sub_words");
&bn_sub_part_words("bn_sub_part_words");

&asm_finish();

# bn_mul_add_words($name)
#
# Emits an assembler function called $name that reads four stack arguments:
# wparam(0)=r, wparam(1)=a, wparam(2)=num, wparam(3)=w.  For each of the num
# words it computes r[i] = low32(r[i] + a[i]*w + carry) and carries the high
# half into the next word; the final carry is returned in eax.
#
# When the generator was run with $sse2 set, an MMX/SSE2 path is emitted
# first.  It is taken at run time when bit 26 of OPENSSL_ia32cap_P is set
# (the SSE2 capability bit, judging by the label names); otherwise control
# falls to the integer path at maw_non_sse2.
sub bn_mul_add_words
	{
	local($name)=@_;

	&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");

	# Register assignment for the SSE2 path only (rebound below).
	$r="eax";
	$a="edx";
	$c="ecx";

	if ($sse2) {
		&picmeup("eax","OPENSSL_ia32cap_P");
		&bt(&DWP(0,"eax"),26);
		&jnc(&label("maw_non_sse2"));

		&mov($r,&wparam(0));
		&mov($a,&wparam(1));
		&mov($c,&wparam(2));
		&movd("mm0",&wparam(3));	# mm0 = w
		&pxor("mm1","mm1");		# mm1 = carry_in
		&jmp(&label("maw_sse2_entry"));

	# Eight words per iteration; mm1 carries the running 64-bit sum.
	&set_label("maw_sse2_unrolled",16);
		&movd("mm3",&DWP(0,$r,"",0));	# mm3 = r[0]
		&paddq("mm1","mm3");		# mm1 = carry_in + r[0]
		&movd("mm2",&DWP(0,$a,"",0));	# mm2 = a[0]
		&pmuludq("mm2","mm0");		# mm2 = w*a[0]
		&movd("mm4",&DWP(4,$a,"",0));	# mm4 = a[1]
		&pmuludq("mm4","mm0");		# mm4 = w*a[1]
		&movd("mm6",&DWP(8,$a,"",0));	# mm6 = a[2]
		&pmuludq("mm6","mm0");		# mm6 = w*a[2]
		&movd("mm7",&DWP(12,$a,"",0));	# mm7 = a[3]
		&pmuludq("mm7","mm0");		# mm7 = w*a[3]
		&paddq("mm1","mm2");		# mm1 = carry_in + r[0] + w*a[0]
		&movd("mm3",&DWP(4,$r,"",0));	# mm3 = r[1]
		&paddq("mm3","mm4");		# mm3 = r[1] + w*a[1]
		&movd("mm5",&DWP(8,$r,"",0));	# mm5 = r[2]
		&paddq("mm5","mm6");		# mm5 = r[2] + w*a[2]
		&movd("mm4",&DWP(12,$r,"",0));	# mm4 = r[3]
		&paddq("mm7","mm4");		# mm7 = r[3] + w*a[3]
		&movd(&DWP(0,$r,"",0),"mm1");
		&movd("mm2",&DWP(16,$a,"",0));	# mm2 = a[4]
		&pmuludq("mm2","mm0");		# mm2 = w*a[4]
		&psrlq("mm1",32);		# mm1 = carry0
		&movd("mm4",&DWP(20,$a,"",0));	# mm4 = a[5]
		&pmuludq("mm4","mm0");		# mm4 = w*a[5]
		&paddq("mm1","mm3");		# mm1 = carry0 + r[1] + w*a[1]
		&movd("mm6",&DWP(24,$a,"",0));	# mm6 = a[6]
		&pmuludq("mm6","mm0");		# mm6 = w*a[6]
		&movd(&DWP(4,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry1
		&movd("mm3",&DWP(28,$a,"",0));	# mm3 = a[7]
		&add($a,32);
		&pmuludq("mm3","mm0");		# mm3 = w*a[7]
		&paddq("mm1","mm5");		# mm1 = carry1 + r[2] + w*a[2]
		&movd("mm5",&DWP(16,$r,"",0));	# mm5 = r[4]
		&paddq("mm2","mm5");		# mm2 = r[4] + w*a[4]
		&movd(&DWP(8,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry2
		&paddq("mm1","mm7");		# mm1 = carry2 + r[3] + w*a[3]
		&movd("mm5",&DWP(20,$r,"",0));	# mm5 = r[5]
		&paddq("mm4","mm5");		# mm4 = r[5] + w*a[5]
		&movd(&DWP(12,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry3
		&paddq("mm1","mm2");		# mm1 = carry3 + r[4] + w*a[4]
		&movd("mm5",&DWP(24,$r,"",0));	# mm5 = r[6]
		&paddq("mm6","mm5");		# mm6 = r[6] + w*a[6]
		&movd(&DWP(16,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry4
		&paddq("mm1","mm4");		# mm1 = carry4 + r[5] + w*a[5]
		&movd("mm5",&DWP(28,$r,"",0));	# mm5 = r[7]
		&paddq("mm3","mm5");		# mm3 = r[7] + w*a[7]
		&movd(&DWP(20,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry5
		&paddq("mm1","mm6");		# mm1 = carry5 + r[6] + w*a[6]
		&movd(&DWP(24,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry6
		&paddq("mm1","mm3");		# mm1 = carry6 + r[7] + w*a[7]
		&movd(&DWP(28,$r,"",0),"mm1");
		&lea($r,&DWP(32,$r));
		&psrlq("mm1",32);		# mm1 = carry_out

		&sub($c,8);
		&jz(&label("maw_sse2_exit"));
	&set_label("maw_sse2_entry");
		&test($c,0xfffffff8);		# at least 8 words left?
		&jnz(&label("maw_sse2_unrolled"));

	# One word per iteration for the remaining (fewer than 8) words.
	&set_label("maw_sse2_loop",4);
		&movd("mm2",&DWP(0,$a));	# mm2 = a[i]
		&movd("mm3",&DWP(0,$r));	# mm3 = r[i]
		&pmuludq("mm2","mm0");		# a[i] *= w
		&lea($a,&DWP(4,$a));
		&paddq("mm1","mm3");		# carry += r[i]
		&paddq("mm1","mm2");		# carry += a[i]*w
		&movd(&DWP(0,$r),"mm1");	# r[i] = carry_low
		&sub($c,1);
		&psrlq("mm1",32);		# carry = carry_high
		&lea($r,&DWP(4,$r));		# lea keeps the flags from sub
		&jnz(&label("maw_sse2_loop"));
	&set_label("maw_sse2_exit");
		&movd("eax","mm1");		# c = carry_out
		&emms();
		&ret();

	&set_label("maw_non_sse2",16);
	}

	# function_begin prologue
	&push("ebp");
	&push("ebx");
	&push("esi");
	&push("edi");

	&comment("");
	# Register assignment for the integer path.
	$Low="eax";
	$High="edx";
	$a="ebx";
	$w="ebp";
	$r="edi";
	$c="esi";

	&xor($c,$c);		# clear carry
	&mov($r,&wparam(0));	#

	&mov("ecx",&wparam(2));	#
	&mov($a,&wparam(1));	#

	&and("ecx",0xfffffff8);	# num & ~7: words for the unrolled loop
	&mov($w,&wparam(3));	#

	&push("ecx");		# Up the stack for a tmp variable

	# mov/push leave the flags alone, so this still tests the and above.
	&jz(&label("maw_finish"));

	&set_label("maw_loop",16);

	for ($i=0; $i<32; $i+=4)
		{
		&comment("Round $i");

		 &mov("eax",&DWP($i,$a)); 	# *a
		&mul($w);			# *a * w
		&add("eax",$c);			# L(t)+= c
		&adc("edx",0);			# H(t)+=carry
		 &add("eax",&DWP($i,$r));	# L(t)+= *r
		&adc("edx",0);			# H(t)+=carry
		 &mov(&DWP($i,$r),"eax");	# *r= L(t);
		&mov($c,"edx");			# c=  H(t);
		}

	&comment("");
	&sub("ecx",8);
	&lea($a,&DWP(32,$a));	# lea keeps the flags from sub for jnz
	&lea($r,&DWP(32,$r));
	&jnz(&label("maw_loop"));

	&set_label("maw_finish",0);
	&mov("ecx",&wparam(2));	# get num
	&and("ecx",7);
	&jnz(&label("maw_finish2"));	# helps branch prediction
	&jmp(&label("maw_end"));

	# Up to seven leftover words; dec/jz leaves as soon as ecx hits zero.
	&set_label("maw_finish2",1);
	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		 &mov("eax",&DWP($i*4,$a));	# *a
		&mul($w);			# *a * w
		&add("eax",$c);			# L(t)+=c
		&adc("edx",0);			# H(t)+=carry
		 &add("eax",&DWP($i*4,$r));	# L(t)+= *r
		&adc("edx",0);			# H(t)+=carry
		 &dec("ecx") if ($i != 7-1);
		&mov(&DWP($i*4,$r),"eax");	# *r= L(t);
		 &mov($c,"edx");		# c=  H(t);
		&jz(&label("maw_end")) if ($i != 7-1);
		}
	&set_label("maw_end",0);
	&mov("eax",$c);		# return the final carry

	&pop("ecx");	# release the tmp slot pushed above

	&function_end($name);
	}

# bn_mul_words($name)
#
# Emits an assembler function called $name that reads four stack arguments:
# wparam(0)=r, wparam(1)=a, wparam(2)=num, wparam(3)=w.  For each of the num
# words it stores r[i] = low32(a[i]*w + carry) and carries the high half
# forward; the final carry is returned in eax.
#
# When $sse2 is set a one-word-per-iteration MMX/SSE2 loop is emitted first
# and selected at run time by bit 26 of OPENSSL_ia32cap_P; otherwise the
# integer path (8x unrolled loop plus a tail of up to 7 words) runs.
sub bn_mul_words
	{
	local($name)=@_;

	&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");

	# Register assignment for the SSE2 path only (rebound below).
	$r="eax";
	$a="edx";
	$c="ecx";

	if ($sse2) {
		&picmeup("eax","OPENSSL_ia32cap_P");
		&bt(&DWP(0,"eax"),26);
		&jnc(&label("mw_non_sse2"));

		&mov($r,&wparam(0));
		&mov($a,&wparam(1));
		&mov($c,&wparam(2));
		&movd("mm0",&wparam(3));	# mm0 = w
		&pxor("mm1","mm1");		# mm1 = carry = 0

	&set_label("mw_sse2_loop",16);
		&movd("mm2",&DWP(0,$a));	# mm2 = a[i]
		&pmuludq("mm2","mm0");		# a[i] *= w
		&lea($a,&DWP(4,$a));
		&paddq("mm1","mm2");		# carry += a[i]*w
		&movd(&DWP(0,$r),"mm1");	# r[i] = carry_low
		&sub($c,1);
		&psrlq("mm1",32);		# carry = carry_high
		&lea($r,&DWP(4,$r));		# lea keeps the flags from sub
		&jnz(&label("mw_sse2_loop"));

		&movd("eax","mm1");		# return carry
		&emms();
		&ret();
	&set_label("mw_non_sse2",16);
	}

	# function_begin prologue
	&push("ebp");
	&push("ebx");
	&push("esi");
	&push("edi");

	&comment("");
	# Register assignment for the integer path.
	$Low="eax";
	$High="edx";
	$a="ebx";
	$w="ecx";
	$r="edi";
	$c="esi";
	$num="ebp";

	&xor($c,$c);		# clear carry
	&mov($r,&wparam(0));	#
	&mov($a,&wparam(1));	#
	&mov($num,&wparam(2));	#
	&mov($w,&wparam(3));	#

	&and($num,0xfffffff8);	# num & ~7: words for the unrolled loop
	&jz(&label("mw_finish"));

	&set_label("mw_loop",0);
	for ($i=0; $i<32; $i+=4)
		{
		&comment("Round $i");

		 &mov("eax",&DWP($i,$a,"",0)); 	# *a
		&mul($w);			# *a * w
		&add("eax",$c);			# L(t)+=c
		 # XXX

		&adc("edx",0);			# H(t)+=carry
		 &mov(&DWP($i,$r,"",0),"eax");	# *r= L(t);

		&mov($c,"edx");			# c=  H(t);
		}

	&comment("");
	&add($a,32);
	&add($r,32);
	&sub($num,8);
	&jz(&label("mw_finish"));
	&jmp(&label("mw_loop"));

	&set_label("mw_finish",0);
	&mov($num,&wparam(2));	# get num
	&and($num,7);
	&jnz(&label("mw_finish2"));
	&jmp(&label("mw_end"));

	# Up to seven leftover words; dec/jz leaves as soon as $num hits zero.
	&set_label("mw_finish2",1);
	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		 &mov("eax",&DWP($i*4,$a,"",0));# *a
		&mul($w);			# *a * w
		&add("eax",$c);			# L(t)+=c
		 # XXX
		&adc("edx",0);			# H(t)+=carry
		 &mov(&DWP($i*4,$r,"",0),"eax");# *r= L(t);
		&mov($c,"edx");			# c=  H(t);
		 &dec($num) if ($i != 7-1);
		&jz(&label("mw_end")) if ($i != 7-1);
		}
	&set_label("mw_end",0);
	&mov("eax",$c);		# return the final carry

	&function_end($name);
	}

# bn_sqr_words($name)
#
# Emits an assembler function called $name that reads three stack arguments:
# wparam(0)=r, wparam(1)=a, wparam(2)=num.  For each of the num input words
# it stores the 64-bit square a[i]*a[i] into two consecutive words of r
# (low half first), so r advances twice as fast as a.
#
# When $sse2 is set an MMX/SSE2 loop is emitted first and selected at run
# time by bit 26 of OPENSSL_ia32cap_P; otherwise the integer path (8x
# unrolled loop plus a tail of up to 7 words) runs.
sub bn_sqr_words
	{
	local($name)=@_;

	&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");

	# Register assignment for the SSE2 path only (rebound below).
	$r="eax";
	$a="edx";
	$c="ecx";

	if ($sse2) {
		&picmeup("eax","OPENSSL_ia32cap_P");
		&bt(&DWP(0,"eax"),26);
		&jnc(&label("sqr_non_sse2"));

		&mov($r,&wparam(0));
		&mov($a,&wparam(1));
		&mov($c,&wparam(2));

	&set_label("sqr_sse2_loop",16);
		&movd("mm0",&DWP(0,$a));	# mm0 = a[i]
		&pmuludq("mm0","mm0");		# a[i] *= a[i]
		&lea($a,&DWP(4,$a));		# a++
		&movq(&QWP(0,$r),"mm0");	# r[i] = a[i]*a[i]
		&sub($c,1);
		&lea($r,&DWP(8,$r));		# r += 2
		&jnz(&label("sqr_sse2_loop"));

		&emms();
		&ret();
	&set_label("sqr_non_sse2",16);
	}

	# function_begin prologue
	&push("ebp");
	&push("ebx");
	&push("esi");
	&push("edi");

	&comment("");
	# Register assignment for the integer path.
	$r="esi";
	$a="edi";
	$num="ebx";

	&mov($r,&wparam(0));	#
	&mov($a,&wparam(1));	#
	&mov($num,&wparam(2));	#

	&and($num,0xfffffff8);	# num & ~7: words for the unrolled loop
	&jz(&label("sw_finish"));

	&set_label("sw_loop",0);
	for ($i=0; $i<32; $i+=4)
		{
		&comment("Round $i");
		&mov("eax",&DWP($i,$a,"",0)); 	# *a
		 # XXX
		&mul("eax");			# *a * *a
		&mov(&DWP($i*2,$r,"",0),"eax");	#
		 &mov(&DWP($i*2+4,$r,"",0),"edx");#
		}

	&comment("");
	&add($a,32);
	&add($r,64);
	&sub($num,8);
	&jnz(&label("sw_loop"));

	&set_label("sw_finish",0);
	&mov($num,&wparam(2));	# get num
	&and($num,7);
	&jz(&label("sw_end"));

	# Up to seven leftover words; dec/jz leaves as soon as $num hits zero.
	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		&mov("eax",&DWP($i*4,$a,"",0));	# *a
		 # XXX
		&mul("eax");			# *a * *a
		&mov(&DWP($i*8,$r,"",0),"eax");	#
		 &dec($num) if ($i != 7-1);
		&mov(&DWP($i*8+4,$r,"",0),"edx");
		 &jz(&label("sw_end")) if ($i != 7-1);
		}
	&set_label("sw_end",0);

	&function_end($name);
	}

# bn_div_words($name)
#
# Emits a frameless assembler function called $name that loads wparam(0)
# into edx, wparam(1) into eax and wparam(2) into ecx, then executes
# "div ecx".  By x86 DIV semantics that divides the 64-bit value edx:eax by
# ecx and leaves the quotient in eax, which is what the function returns.
sub bn_div_words
	{
	local($name)=@_;

	&function_begin_B($name,"");
	&mov("edx",&wparam(0));	# high word of the dividend
	&mov("eax",&wparam(1));	# low word of the dividend
	&mov("ecx",&wparam(2));	# divisor
	&div("ecx");
	&ret();
	&function_end_B($name);
	}

# bn_add_words($name)
#
# Emits an assembler function called $name that reads four stack arguments:
# wparam(0)=r, wparam(1)=a, wparam(2)=b, wparam(3)=num.  For each of the num
# words it stores r[i] = a[i] + b[i] + carry.  The carry lives in eax ($c)
# rather than in the CPU carry flag, so eax holds the final carry on return.
#
# Per word: "add tmp1,c" adds the incoming carry; "mov c,0" (which does not
# touch the flags) followed by "adc c,c" captures the carry of that add;
# "add tmp1,tmp2" / "adc c,0" then folds in the carry of the second add.
sub bn_add_words
	{
	local($name)=@_;

	&function_begin($name,"");

	&comment("");
	$a="esi";
	$b="edi";
	$c="eax";
	$r="ebx";
	$tmp1="ecx";
	$tmp2="edx";
	$num="ebp";

	&mov($r,&wparam(0));	# get r
	 &mov($a,&wparam(1));	# get a
	&mov($b,&wparam(2));	# get b
	 &mov($num,&wparam(3));	# get num
	&xor($c,$c);		# clear carry
	 &and($num,0xfffffff8);	# num & ~7: words for the unrolled loop

	&jz(&label("aw_finish"));

	&set_label("aw_loop",0);
	for ($i=0; $i<8; $i++)
		{
		&comment("Round $i");

		&mov($tmp1,&DWP($i*4,$a,"",0)); 	# *a
		 &mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
		&add($tmp1,$c);
		 &mov($c,0);
		&adc($c,$c);
		 &add($tmp1,$tmp2);
		&adc($c,0);
		 &mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
		}

	&comment("");
	&add($a,32);
	 &add($b,32);
	&add($r,32);
	 &sub($num,8);
	&jnz(&label("aw_loop"));

	&set_label("aw_finish",0);
	&mov($num,&wparam(3));	# get num
	&and($num,7);
	 &jz(&label("aw_end"));

	# Up to seven leftover words; dec/jz leaves as soon as $num hits zero.
	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
		 &mov($tmp2,&DWP($i*4,$b,"",0));# *b
		&add($tmp1,$c);
		 &mov($c,0);
		&adc($c,$c);
		 &add($tmp1,$tmp2);
		&adc($c,0);
		 &dec($num) if ($i != 6);
		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
		 &jz(&label("aw_end")) if ($i != 6);
		}
	&set_label("aw_end",0);

#	&mov("eax",$c);		# $c is "eax"

	&function_end($name);
	}

# bn_sub_words($name)
#
# Emits an assembler function called $name that reads four stack arguments:
# wparam(0)=r, wparam(1)=a, wparam(2)=b, wparam(3)=num.  For each of the num
# words it stores r[i] = a[i] - b[i] - borrow.  The borrow lives in eax ($c)
# rather than in the CPU carry flag, so eax holds the final borrow on return.
#
# Per word: "sub tmp1,c" removes the incoming borrow; "mov c,0" (which does
# not touch the flags) followed by "adc c,c" captures the borrow of that
# sub; "sub tmp1,tmp2" / "adc c,0" then folds in the borrow of the second.
#
# The aw_* label names are shared with bn_add_words.
sub bn_sub_words
	{
	local($name)=@_;

	&function_begin($name,"");

	&comment("");
	$a="esi";
	$b="edi";
	$c="eax";
	$r="ebx";
	$tmp1="ecx";
	$tmp2="edx";
	$num="ebp";

	&mov($r,&wparam(0));	# get r
	 &mov($a,&wparam(1));	# get a
	&mov($b,&wparam(2));	# get b
	 &mov($num,&wparam(3));	# get num
	&xor($c,$c);		# clear carry
	 &and($num,0xfffffff8);	# num & ~7: words for the unrolled loop

	&jz(&label("aw_finish"));

	&set_label("aw_loop",0);
	for ($i=0; $i<8; $i++)
		{
		&comment("Round $i");

		&mov($tmp1,&DWP($i*4,$a,"",0)); 	# *a
		 &mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
		&sub($tmp1,$c);
		 &mov($c,0);
		&adc($c,$c);
		 &sub($tmp1,$tmp2);
		&adc($c,0);
		 &mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
		}

	&comment("");
	&add($a,32);
	 &add($b,32);
	&add($r,32);
	 &sub($num,8);
	&jnz(&label("aw_loop"));

	&set_label("aw_finish",0);
	&mov($num,&wparam(3));	# get num
	&and($num,7);
	 &jz(&label("aw_end"));

	# Up to seven leftover words; dec/jz leaves as soon as $num hits zero.
	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
		 &mov($tmp2,&DWP($i*4,$b,"",0));# *b
		&sub($tmp1,$c);
		 &mov($c,0);
		&adc($c,$c);
		 &sub($tmp1,$tmp2);
		&adc($c,0);
		 &dec($num) if ($i != 6);
		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
		 &jz(&label("aw_end")) if ($i != 6);
		}
	&set_label("aw_end",0);

#	&mov("eax",$c);		# $c is "eax"

	&function_end($name);
	}

# bn_sub_part_words($name)
#
# Emits an assembler function called $name that reads five stack arguments:
# wparam(0)=r, wparam(1)=a, wparam(2)=b, wparam(3)=num, wparam(4)=dl.
# The borrow is kept in eax ($c) throughout and is the return value.
#
# Phase 1 (aw_* labels): r[i] = a[i] - b[i] - borrow for num words, exactly
#   like bn_sub_words, except that the tail advances a, b and r by one word
#   per round so all three pointers end up just past the common part.
# Phase 2 depends on dl:
#   dl == 0  nothing more to do.
#   dl <  0  (pw_neg_*): for -dl further words, r[i] = 0 - b[i] - borrow.
#   dl >  0  (pw_pos_*): for dl further words, r[i] = a[i] - borrow.  As
#            soon as a subtraction produces no borrow (jnc), control jumps
#            to the matching position in the pw_nc_* copy code, which just
#            copies the remaining words of a into r and returns borrow 0.
sub bn_sub_part_words
	{
	local($name)=@_;

	&function_begin($name,"");

	&comment("");
	$a="esi";
	$b="edi";
	$c="eax";
	$r="ebx";
	$tmp1="ecx";
	$tmp2="edx";
	$num="ebp";

	&mov($r,&wparam(0));	# get r
	 &mov($a,&wparam(1));	# get a
	&mov($b,&wparam(2));	# get b
	 &mov($num,&wparam(3));	# get num
	&xor($c,$c);		# clear carry
	 &and($num,0xfffffff8);	# num & ~7: words for the unrolled loop

	&jz(&label("aw_finish"));

	&set_label("aw_loop",0);
	for ($i=0; $i<8; $i++)
		{
		&comment("Round $i");

		&mov($tmp1,&DWP($i*4,$a,"",0)); 	# *a
		 &mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
		&sub($tmp1,$c);
		 &mov($c,0);
		&adc($c,$c);
		 &sub($tmp1,$tmp2);
		&adc($c,0);
		 &mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
		}

	&comment("");
	&add($a,32);
	 &add($b,32);
	&add($r,32);
	 &sub($num,8);
	&jnz(&label("aw_loop"));

	&set_label("aw_finish",0);
	&mov($num,&wparam(3));	# get num
	&and($num,7);
	 &jz(&label("aw_end"));

	# Tail of the common part.  The pointers are bumped every round (the
	# borrow is already safe in $c, so clobbering the flags is harmless).
	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		&mov($tmp1,&DWP(0,$a,"",0));	# *a
		 &mov($tmp2,&DWP(0,$b,"",0));# *b
		&sub($tmp1,$c);
		 &mov($c,0);
		&adc($c,$c);
		 &sub($tmp1,$tmp2);
		&adc($c,0);
		&mov(&DWP(0,$r,"",0),$tmp1);	# *r
		&add($a, 4);
		&add($b, 4);
		&add($r, 4);
		 &dec($num) if ($i != 6);
		 &jz(&label("aw_end")) if ($i != 6);
		}
	&set_label("aw_end",0);

	&cmp(&wparam(4),0);
	&je(&label("pw_end"));

	&mov($num,&wparam(4));	# get dl
	&cmp($num,0);
	&je(&label("pw_end"));
	&jge(&label("pw_pos"));

	&comment("pw_neg");
	&mov($tmp2,0);
	&sub($tmp2,$num);
	&mov($num,$tmp2);	# num = -dl
	&and($num,0xfffffff8);	# (-dl) & ~7: words for the unrolled loop
	&jz(&label("pw_neg_finish"));

	&set_label("pw_neg_loop",0);
	for ($i=0; $i<8; $i++)
	{
	    &comment("dl<0 Round $i");

	    &mov($tmp1,0);
	    &mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
	    &sub($tmp1,$c);
	    &mov($c,0);
	    &adc($c,$c);
	    &sub($tmp1,$tmp2);
	    &adc($c,0);
	    &mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
	}

	&comment("");
	&add($b,32);
	&add($r,32);
	&sub($num,8);
	&jnz(&label("pw_neg_loop"));

	&set_label("pw_neg_finish",0);
	&mov($tmp2,&wparam(4));	# get dl
	&mov($num,0);
	&sub($num,$tmp2);
	&and($num,7);		# (-dl) & 7 leftover words
	&jz(&label("pw_end"));

	for ($i=0; $i<7; $i++)
	{
	    &comment("dl<0 Tail Round $i");
	    &mov($tmp1,0);
	    &mov($tmp2,&DWP($i*4,$b,"",0));# *b
	    &sub($tmp1,$c);
	    &mov($c,0);
	    &adc($c,$c);
	    &sub($tmp1,$tmp2);
	    &adc($c,0);
	    &dec($num) if ($i != 6);
	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
	    &jz(&label("pw_end")) if ($i != 6);
	}

	&jmp(&label("pw_end"));

	&set_label("pw_pos",0);

	&and($num,0xfffffff8);	# dl & ~7: words for the unrolled loop
	&jz(&label("pw_pos_finish"));

	&set_label("pw_pos_loop",0);

	for ($i=0; $i<8; $i++)
	{
	    &comment("dl>0 Round $i");

	    &mov($tmp1,&DWP($i*4,$a,"",0));	# *a
	    &sub($tmp1,$c);
	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
	    &jnc(&label("pw_nc".$i));		# borrow gone: switch to plain copy
	}

	&comment("");
	&add($a,32);
	&add($r,32);
	&sub($num,8);
	&jnz(&label("pw_pos_loop"));

	&set_label("pw_pos_finish",0);
	&mov($num,&wparam(4));	# get dl
	&and($num,7);
	&jz(&label("pw_end"));

	for ($i=0; $i<7; $i++)
	{
	    &comment("dl>0 Tail Round $i");
	    &mov($tmp1,&DWP($i*4,$a,"",0));	# *a
	    &sub($tmp1,$c);
	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
	    &jnc(&label("pw_tail_nc".$i));	# borrow gone: switch to plain copy
	    &dec($num) if ($i != 6);
	    &jz(&label("pw_end")) if ($i != 6);
	}
	&mov($c,1);		# every round borrowed: report borrow
	&jmp(&label("pw_end"));

	# Copy-only continuation of the dl>0 case.  Each pw_nc$i label sits
	# right after the copy of word $i, so a jnc from round $i above resumes
	# with word $i+1.
	&set_label("pw_nc_loop",0);
	for ($i=0; $i<8; $i++)
	{
	    &mov($tmp1,&DWP($i*4,$a,"",0));	# *a
	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
	    &set_label("pw_nc".$i,0);
	}

	&comment("");
	&add($a,32);
	&add($r,32);
	&sub($num,8);
	&jnz(&label("pw_nc_loop"));

	&mov($num,&wparam(4));	# get dl
	&and($num,7);
	&jz(&label("pw_nc_end"));

	for ($i=0; $i<7; $i++)
	{
	    &mov($tmp1,&DWP($i*4,$a,"",0));	# *a
	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
	    &set_label("pw_tail_nc".$i,0);
	    &dec($num) if ($i != 6);
	    &jz(&label("pw_nc_end")) if ($i != 6);
	}

	&set_label("pw_nc_end",0);
	&mov($c,0);		# copy path always ends without borrow

	&set_label("pw_end",0);

#	&mov("eax",$c);		# $c is "eax"

	&function_end($name);
	}
