#!/usr/local/bin/perl
#
# Generator for the 32-bit x86 assembler versions of the BIGNUM word
# primitives.
# Origin: OpenBSD lib/libcrypto/bn/asm/bn-586.pl (revision 4fcf65c5).
#
# The instruction-emitting helpers called throughout this file (&mov,
# &mul, &label, &wparam, ...) are not defined here; they are expected to
# be provided by x86asm.pl, which is searched for in "perlasm" and
# "../../perlasm".

push(@INC,"perlasm","../../perlasm");
require "x86asm.pl";

# The first command-line argument and this script's own name are handed
# to the perlasm back end (presumably to select the output flavour --
# confirm against x86asm.pl).
&asm_init($ARGV[0],$0);

# SSE2 code paths are generated only when -DOPENSSL_IA32_SSE2 appears
# somewhere on the command line.
$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }

&external_label("OPENSSL_ia32cap_P") if ($sse2);

# One call per routine; the argument is the name the routine is emitted
# under.
&bn_mul_add_words("bn_mul_add_words");
&bn_mul_words("bn_mul_words");
&bn_sqr_words("bn_sqr_words");
&bn_div_words("bn_div_words");
&bn_add_words("bn_add_words");
&bn_sub_words("bn_sub_words");
&bn_sub_part_words("bn_sub_part_words");

&asm_finish();

# Emit the routine $name(r, a, num, w).
#
# Generated code: for each of num words, r[i] = low32(a[i]*w + r[i] + c)
# and c = high32 of that sum; c starts at 0 and its final value is left
# in eax.
#
# Parameters are read with &wparam(): 0 = r, 1 = a, 2 = num, 3 = w.
# Words are processed eight at a time (num & ~7), then the remaining
# num & 7 words go through an unrolled tail of up to seven rounds.
#
# When $sse2 is set, the emitted code tests bit 26 of OPENSSL_ia32cap_P
# at run time; if the bit is set the eight-word blocks are processed by a
# loop built on movd/pmuludq/paddq in the mm registers, otherwise by the
# integer mul loop.
sub bn_mul_add_words
	{
	my($name)=@_;

	&function_begin($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");

	&comment("");
	$Low="eax";
	$High="edx";
	$a="ebx";
	$w="ebp";
	$r="edi";
	$c="esi";

	&xor($c,$c);		# clear carry
	&mov($r,&wparam(0));	# r

	&mov("ecx",&wparam(2));	# num
	&mov($a,&wparam(1));	# a

	&and("ecx",0xfffffff8);	# num rounded down to a multiple of 8
	&mov($w,&wparam(3));	# w

	&push("ecx");		# Up the stack for a tmp variable

	# The zero flag tested here is still the one set by the "and" above;
	# mov and push do not modify the flags.
	&jz(&label("maw_finish"));

	if ($sse2) {
		&picmeup("eax","OPENSSL_ia32cap_P");
		&bt(&DWP(0,"eax"),26);
		&jnc(&label("maw_loop"));

		&movd("mm0",$w);		# mm0 = w
		&pxor("mm1","mm1");		# mm1 = carry_in

		&set_label("maw_sse2_loop",0);
		&movd("mm3",&DWP(0,$r,"",0));	# mm3 = r[0]
		&paddq("mm1","mm3");		# mm1 = carry_in + r[0]
		&movd("mm2",&DWP(0,$a,"",0));	# mm2 = a[0]
		&pmuludq("mm2","mm0");		# mm2 = w*a[0]
		&movd("mm4",&DWP(4,$a,"",0));	# mm4 = a[1]
		&pmuludq("mm4","mm0");		# mm4 = w*a[1]
		&movd("mm6",&DWP(8,$a,"",0));	# mm6 = a[2]
		&pmuludq("mm6","mm0");		# mm6 = w*a[2]
		&movd("mm7",&DWP(12,$a,"",0));	# mm7 = a[3]
		&pmuludq("mm7","mm0");		# mm7 = w*a[3]
		&paddq("mm1","mm2");		# mm1 = carry_in + r[0] + w*a[0]
		&movd("mm3",&DWP(4,$r,"",0));	# mm3 = r[1]
		&paddq("mm3","mm4");		# mm3 = r[1] + w*a[1]
		&movd("mm5",&DWP(8,$r,"",0));	# mm5 = r[2]
		&paddq("mm5","mm6");		# mm5 = r[2] + w*a[2]
		&movd("mm4",&DWP(12,$r,"",0));	# mm4 = r[3]
		&paddq("mm7","mm4");		# mm7 = r[3] + w*a[3]
		&movd(&DWP(0,$r,"",0),"mm1");
		&movd("mm2",&DWP(16,$a,"",0));	# mm2 = a[4]
		&pmuludq("mm2","mm0");		# mm2 = w*a[4]
		&psrlq("mm1",32);		# mm1 = carry0
		&movd("mm4",&DWP(20,$a,"",0));	# mm4 = a[5]
		&pmuludq("mm4","mm0");		# mm4 = w*a[5]
		&paddq("mm1","mm3");		# mm1 = carry0 + r[1] + w*a[1]
		&movd("mm6",&DWP(24,$a,"",0));	# mm6 = a[6]
		&pmuludq("mm6","mm0");		# mm6 = w*a[6]
		&movd(&DWP(4,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry1
		&movd("mm3",&DWP(28,$a,"",0));	# mm3 = a[7]
		&add($a,32);
		&pmuludq("mm3","mm0");		# mm3 = w*a[7]
		&paddq("mm1","mm5");		# mm1 = carry1 + r[2] + w*a[2]
		&movd("mm5",&DWP(16,$r,"",0));	# mm5 = r[4]
		&paddq("mm2","mm5");		# mm2 = r[4] + w*a[4]
		&movd(&DWP(8,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry2
		&paddq("mm1","mm7");		# mm1 = carry2 + r[3] + w*a[3]
		&movd("mm5",&DWP(20,$r,"",0));	# mm5 = r[5]
		&paddq("mm4","mm5");		# mm4 = r[5] + w*a[5]
		&movd(&DWP(12,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry3
		&paddq("mm1","mm2");		# mm1 = carry3 + r[4] + w*a[4]
		&movd("mm5",&DWP(24,$r,"",0));	# mm5 = r[6]
		&paddq("mm6","mm5");		# mm6 = r[6] + w*a[6]
		&movd(&DWP(16,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry4
		&paddq("mm1","mm4");		# mm1 = carry4 + r[5] + w*a[5]
		&movd("mm5",&DWP(28,$r,"",0));	# mm5 = r[7]
		&paddq("mm3","mm5");		# mm3 = r[7] + w*a[7]
		&movd(&DWP(20,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry5
		&paddq("mm1","mm6");		# mm1 = carry5 + r[6] + w*a[6]
		&movd(&DWP(24,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry6
		&paddq("mm1","mm3");		# mm1 = carry6 + r[7] + w*a[7]
		&movd(&DWP(28,$r,"",0),"mm1");
		&add($r,32);
		&psrlq("mm1",32);		# mm1 = carry_out

		&sub("ecx",8);
		&jnz(&label("maw_sse2_loop"));

		&movd($c,"mm1");		# c = carry_out
		&emms();

		&jmp(&label("maw_finish"));
	}

	&set_label("maw_loop",0);

	&mov(&swtmp(0),"ecx");	# park the block counter in the stack temporary

	for ($i=0; $i<32; $i+=4)
		{
		&comment("Round $i");

		 &mov("eax",&DWP($i,$a,"",0)); 	# *a
		&mul($w);			# *a * w
		&add("eax",$c);			# L(t)+=c
		 &mov($c,&DWP($i,$r,"",0));	# fetch *r
		&adc("edx",0);			# H(t)+=carry
		 &add("eax",$c);		# L(t)+= *r
		&adc("edx",0);			# H(t)+=carry
		 &mov(&DWP($i,$r,"",0),"eax");	# *r= L(t);
		&mov($c,"edx");			# c=  H(t);
		}

	&comment("");
	&mov("ecx",&swtmp(0));	# reload the block counter
	&add($a,32);
	&add($r,32);
	&sub("ecx",8);
	&jnz(&label("maw_loop"));

	&set_label("maw_finish",0);
	&mov("ecx",&wparam(2));	# get num
	&and("ecx",7);
	&jnz(&label("maw_finish2"));	# helps branch prediction
	&jmp(&label("maw_end"));

	&set_label("maw_finish2",1);
	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		 &mov("eax",&DWP($i*4,$a,"",0));# *a
		&mul($w);			# *a * w
		&add("eax",$c);			# L(t)+=c
		 &mov($c,&DWP($i*4,$r,"",0));	# fetch *r
		&adc("edx",0);			# H(t)+=carry
		 &add("eax",$c);		# L(t)+= *r
		&adc("edx",0);			# H(t)+=carry
		 &dec("ecx") if ($i != 7-1);
		&mov(&DWP($i*4,$r,"",0),"eax");	# *r= L(t);
		 &mov($c,"edx");			# c=  H(t);
		# jz tests the flag left by the dec above; mov leaves flags alone.
		&jz(&label("maw_end")) if ($i != 7-1);
		}
	&set_label("maw_end",0);
	&mov("eax",$c);		# final carry is the result

	&pop("ecx");	# drop the temporary pushed at the top

	&function_end($name);
	}

# Emit the routine $name(r, a, num, w).
#
# Generated code: for each of num words, r[i] = low32(a[i]*w + c) and
# c = high32 of that sum; c starts at 0 and its final value is left in
# eax.
#
# Parameters are read with &wparam(): 0 = r, 1 = a, 2 = num, 3 = w.
# Eight words are handled per loop iteration, followed by an unrolled
# tail for the remaining num & 7 words.
sub bn_mul_words
	{
	my($name)=@_;

	&function_begin($name,"");

	&comment("");
	$Low="eax";
	$High="edx";
	$a="ebx";
	$w="ecx";
	$r="edi";
	$c="esi";
	$num="ebp";

	&xor($c,$c);		# clear carry
	&mov($r,&wparam(0));	# r
	&mov($a,&wparam(1));	# a
	&mov($num,&wparam(2));	# num
	&mov($w,&wparam(3));	# w

	&and($num,0xfffffff8);	# num rounded down to a multiple of 8
	&jz(&label("mw_finish"));

	&set_label("mw_loop",0);
	for ($i=0; $i<32; $i+=4)
		{
		&comment("Round $i");

		 &mov("eax",&DWP($i,$a,"",0)); 	# *a
		&mul($w);			# *a * w
		&add("eax",$c);			# L(t)+=c
		 # XXX

		&adc("edx",0);			# H(t)+=carry
		 &mov(&DWP($i,$r,"",0),"eax");	# *r= L(t);

		&mov($c,"edx");			# c=  H(t);
		}

	&comment("");
	&add($a,32);
	&add($r,32);
	&sub($num,8);
	&jz(&label("mw_finish"));
	&jmp(&label("mw_loop"));

	&set_label("mw_finish",0);
	&mov($num,&wparam(2));	# get num
	&and($num,7);
	&jnz(&label("mw_finish2"));
	&jmp(&label("mw_end"));

	&set_label("mw_finish2",1);
	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		 &mov("eax",&DWP($i*4,$a,"",0));# *a
		&mul($w);			# *a * w
		&add("eax",$c);			# L(t)+=c
		 # XXX
		&adc("edx",0);			# H(t)+=carry
		 &mov(&DWP($i*4,$r,"",0),"eax");# *r= L(t);
		&mov($c,"edx");			# c=  H(t);
		 &dec($num) if ($i != 7-1);
		&jz(&label("mw_end")) if ($i != 7-1);
		}
	&set_label("mw_end",0);
	&mov("eax",$c);		# final carry is the result

	&function_end($name);
	}

# Emit the routine $name(r, a, num).
#
# Generated code: for each of num words, a[i] is multiplied by itself and
# the 64-bit product is stored as two words, low half at r[2*i] and high
# half at r[2*i+1].
#
# Parameters are read with &wparam(): 0 = r, 1 = a, 2 = num.
# Eight input words are handled per loop iteration (a advances 32 bytes,
# r advances 64), followed by an unrolled tail for the remaining num & 7
# words.
sub bn_sqr_words
	{
	my($name)=@_;

	&function_begin($name,"");

	&comment("");
	$r="esi";
	$a="edi";
	$num="ebx";

	&mov($r,&wparam(0));	# r
	&mov($a,&wparam(1));	# a
	&mov($num,&wparam(2));	# num

	&and($num,0xfffffff8);	# num rounded down to a multiple of 8
	&jz(&label("sw_finish"));

	&set_label("sw_loop",0);
	for ($i=0; $i<32; $i+=4)
		{
		&comment("Round $i");
		&mov("eax",&DWP($i,$a,"",0)); 	# *a
		 # XXX
		&mul("eax");			# *a * *a
		&mov(&DWP($i*2,$r,"",0),"eax");	# low word of the square
		 &mov(&DWP($i*2+4,$r,"",0),"edx");# high word of the square
		}

	&comment("");
	&add($a,32);
	&add($r,64);
	&sub($num,8);
	&jnz(&label("sw_loop"));

	&set_label("sw_finish",0);
	&mov($num,&wparam(2));	# get num
	&and($num,7);
	&jz(&label("sw_end"));

	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		&mov("eax",&DWP($i*4,$a,"",0));	# *a
		 # XXX
		&mul("eax");			# *a * *a
		&mov(&DWP($i*8,$r,"",0),"eax");	# low word of the square
		 &dec($num) if ($i != 7-1);
		&mov(&DWP($i*8+4,$r,"",0),"edx");	# high word; mov keeps dec's flags
		 &jz(&label("sw_end")) if ($i != 7-1);
		}
	&set_label("sw_end",0);

	&function_end($name);
	}

# Emit the routine $name(h, l, d).
#
# Generated code loads parameter 0 into edx, parameter 1 into eax and
# parameter 2 into ebx, then issues "div ebx": the 64-bit value edx:eax
# is divided by ebx, leaving the quotient in eax and the remainder in
# edx.  Nothing here guards against a zero divisor or a quotient that
# does not fit in 32 bits.
sub bn_div_words
	{
	my($name)=@_;

	&function_begin($name,"");
	&mov("edx",&wparam(0));	# high word of the dividend
	&mov("eax",&wparam(1));	# low word of the dividend
	&mov("ebx",&wparam(2));	# divisor
	&div("ebx");
	&function_end($name);
	}

# Emit the routine $name(r, a, b, num).
#
# Generated code: for each of num words, r[i] = a[i] + b[i] + c, where c
# is the carry out of the previous word (0 for the first).  The final
# carry is left in eax, which is the register used for c throughout.
#
# Parameters are read with &wparam(): 0 = r, 1 = a, 2 = b, 3 = num.
# Eight words are handled per loop iteration, followed by an unrolled
# tail for the remaining num & 7 words.
sub bn_add_words
	{
	my($name)=@_;

	&function_begin($name,"");

	&comment("");
	$a="esi";
	$b="edi";
	$c="eax";
	$r="ebx";
	$tmp1="ecx";
	$tmp2="edx";
	$num="ebp";

	&mov($r,&wparam(0));	# get r
	 &mov($a,&wparam(1));	# get a
	&mov($b,&wparam(2));	# get b
	 &mov($num,&wparam(3));	# get num
	&xor($c,$c);		# clear carry
	 &and($num,0xfffffff8);	# num rounded down to a multiple of 8

	&jz(&label("aw_finish"));

	&set_label("aw_loop",0);
	for ($i=0; $i<8; $i++)
		{
		&comment("Round $i");

		&mov($tmp1,&DWP($i*4,$a,"",0)); 	# *a
		 &mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
		&add($tmp1,$c);				# *a + carry in
		 &mov($c,0);				# mov keeps the flags
		&adc($c,$c);				# c = carry of first add
		 &add($tmp1,$tmp2);			# + *b
		&adc($c,0);				# c += carry of second add
		 &mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
		}

	&comment("");
	&add($a,32);
	 &add($b,32);
	&add($r,32);
	 &sub($num,8);
	&jnz(&label("aw_loop"));

	&set_label("aw_finish",0);
	&mov($num,&wparam(3));	# get num
	&and($num,7);
	 &jz(&label("aw_end"));

	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
		 &mov($tmp2,&DWP($i*4,$b,"",0));# *b
		&add($tmp1,$c);
		 &mov($c,0);
		&adc($c,$c);
		 &add($tmp1,$tmp2);
		&adc($c,0);
		 &dec($num) if ($i != 6);
		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
		 &jz(&label("aw_end")) if ($i != 6);
		}
	&set_label("aw_end",0);

#	&mov("eax",$c);		# not needed: $c is "eax"

	&function_end($name);
	}

# Emit the routine $name(r, a, b, num).
#
# Generated code: for each of num words, r[i] = a[i] - b[i] - c, where c
# is the borrow out of the previous word (0 for the first).  The final
# borrow is left in eax, which is the register used for c throughout.
#
# Parameters are read with &wparam(): 0 = r, 1 = a, 2 = b, 3 = num.
# Eight words are handled per loop iteration, followed by an unrolled
# tail for the remaining num & 7 words.  The label names are the same
# "aw_*" names used by bn_add_words.
sub bn_sub_words
	{
	my($name)=@_;

	&function_begin($name,"");

	&comment("");
	$a="esi";
	$b="edi";
	$c="eax";
	$r="ebx";
	$tmp1="ecx";
	$tmp2="edx";
	$num="ebp";

	&mov($r,&wparam(0));	# get r
	 &mov($a,&wparam(1));	# get a
	&mov($b,&wparam(2));	# get b
	 &mov($num,&wparam(3));	# get num
	&xor($c,$c);		# clear borrow
	 &and($num,0xfffffff8);	# num rounded down to a multiple of 8

	&jz(&label("aw_finish"));

	&set_label("aw_loop",0);
	for ($i=0; $i<8; $i++)
		{
		&comment("Round $i");

		&mov($tmp1,&DWP($i*4,$a,"",0)); 	# *a
		 &mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
		&sub($tmp1,$c);				# *a - borrow in
		 &mov($c,0);				# mov keeps the flags
		&adc($c,$c);				# c = borrow of first sub
		 &sub($tmp1,$tmp2);			# - *b
		&adc($c,0);				# c += borrow of second sub
		 &mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
		}

	&comment("");
	&add($a,32);
	 &add($b,32);
	&add($r,32);
	 &sub($num,8);
	&jnz(&label("aw_loop"));

	&set_label("aw_finish",0);
	&mov($num,&wparam(3));	# get num
	&and($num,7);
	 &jz(&label("aw_end"));

	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
		 &mov($tmp2,&DWP($i*4,$b,"",0));# *b
		&sub($tmp1,$c);
		 &mov($c,0);
		&adc($c,$c);
		 &sub($tmp1,$tmp2);
		&adc($c,0);
		 &dec($num) if ($i != 6);
		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
		 &jz(&label("aw_end")) if ($i != 6);
		}
	&set_label("aw_end",0);

#	&mov("eax",$c);		# not needed: $c is "eax"

	&function_end($name);
	}

# Emit the routine $name(r, a, b, num, dl).
#
# Parameters are read with &wparam(): 0 = r, 1 = a, 2 = b, 3 = num,
# 4 = dl.
#
# Generated code, in two phases:
#
#   1. For the first num words, r[i] = a[i] - b[i] - c with c the running
#      borrow (as in bn_sub_words).  Unlike bn_sub_words, the tail rounds
#      advance a, b and r by one word each, so that all three pointers
#      sit just past the common part when phase 2 starts.
#
#   2. Depending on the sign of dl:
#        dl == 0: nothing more is done.
#        dl <  0: for the next -dl words, r[i] = 0 - b[i] - c.
#        dl >  0: for the next dl words, r[i] = a[i] - c.  As soon as one
#                 of these subtractions produces no borrow, control jumps
#                 into a parallel "pw_nc" sequence that simply copies the
#                 remaining words from a to r and finishes with c = 0.
#
# The borrow c lives in eax and is therefore what the routine leaves in
# eax at the end.
sub bn_sub_part_words
	{
	my($name)=@_;

	&function_begin($name,"");

	&comment("");
	$a="esi";
	$b="edi";
	$c="eax";
	$r="ebx";
	$tmp1="ecx";
	$tmp2="edx";
	$num="ebp";

	&mov($r,&wparam(0));	# get r
	 &mov($a,&wparam(1));	# get a
	&mov($b,&wparam(2));	# get b
	 &mov($num,&wparam(3));	# get num
	&xor($c,$c);		# clear borrow
	 &and($num,0xfffffff8);	# num rounded down to a multiple of 8

	&jz(&label("aw_finish"));

	&set_label("aw_loop",0);
	for ($i=0; $i<8; $i++)
		{
		&comment("Round $i");

		&mov($tmp1,&DWP($i*4,$a,"",0)); 	# *a
		 &mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
		&sub($tmp1,$c);
		 &mov($c,0);
		&adc($c,$c);
		 &sub($tmp1,$tmp2);
		&adc($c,0);
		 &mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
		}

	&comment("");
	&add($a,32);
	 &add($b,32);
	&add($r,32);
	 &sub($num,8);
	&jnz(&label("aw_loop"));

	&set_label("aw_finish",0);
	&mov($num,&wparam(3));	# get num
	&and($num,7);
	 &jz(&label("aw_end"));

	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		&mov($tmp1,&DWP(0,$a,"",0));	# *a
		 &mov($tmp2,&DWP(0,$b,"",0));# *b
		&sub($tmp1,$c);
		 &mov($c,0);
		&adc($c,$c);
		 &sub($tmp1,$tmp2);
		&adc($c,0);
		&mov(&DWP(0,$r,"",0),$tmp1);	# *r
		# Step the pointers one word at a time so they are correct for
		# the dl phase below; the borrow is already safe in $c.
		&add($a, 4);
		&add($b, 4);
		&add($r, 4);
		 &dec($num) if ($i != 6);
		 &jz(&label("aw_end")) if ($i != 6);
		}
	&set_label("aw_end",0);

	# dl == 0 is tested twice (here on the stack slot, below on the
	# register copy); the second test also feeds the sign branch.
	&cmp(&wparam(4),0);
	&je(&label("pw_end"));

	&mov($num,&wparam(4));	# get dl
	&cmp($num,0);
	&je(&label("pw_end"));
	&jge(&label("pw_pos"));

	&comment("pw_neg");
	&mov($tmp2,0);
	&sub($tmp2,$num);
	&mov($num,$tmp2);	# num = -dl
	&and($num,0xfffffff8);	# -dl rounded down to a multiple of 8
	&jz(&label("pw_neg_finish"));

	&set_label("pw_neg_loop",0);
	for ($i=0; $i<8; $i++)
	{
	    &comment("dl<0 Round $i");

	    &mov($tmp1,0);
	    &mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
	    &sub($tmp1,$c);
	    &mov($c,0);
	    &adc($c,$c);
	    &sub($tmp1,$tmp2);
	    &adc($c,0);
	    &mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
	}

	&comment("");
	&add($b,32);
	&add($r,32);
	&sub($num,8);
	&jnz(&label("pw_neg_loop"));

	&set_label("pw_neg_finish",0);
	&mov($tmp2,&wparam(4));	# get dl
	&mov($num,0);
	&sub($num,$tmp2);	# num = -dl
	&and($num,7);
	&jz(&label("pw_end"));

	for ($i=0; $i<7; $i++)
	{
	    &comment("dl<0 Tail Round $i");
	    &mov($tmp1,0);
	    &mov($tmp2,&DWP($i*4,$b,"",0));# *b
	    &sub($tmp1,$c);
	    &mov($c,0);
	    &adc($c,$c);
	    &sub($tmp1,$tmp2);
	    &adc($c,0);
	    &dec($num) if ($i != 6);
	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
	    &jz(&label("pw_end")) if ($i != 6);
	}

	&jmp(&label("pw_end"));

	&set_label("pw_pos",0);

	&and($num,0xfffffff8);	# dl rounded down to a multiple of 8
	&jz(&label("pw_pos_finish"));

	&set_label("pw_pos_loop",0);

	for ($i=0; $i<8; $i++)
	{
	    &comment("dl>0 Round $i");

	    &mov($tmp1,&DWP($i*4,$a,"",0));	# *a
	    &sub($tmp1,$c);
	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
	    # No borrow: the rest is a plain copy, continue in pw_nc_loop
	    # right after the matching word.
	    &jnc(&label("pw_nc".$i));
	}

	&comment("");
	&add($a,32);
	&add($r,32);
	&sub($num,8);
	&jnz(&label("pw_pos_loop"));

	&set_label("pw_pos_finish",0);
	&mov($num,&wparam(4));	# get dl
	&and($num,7);
	&jz(&label("pw_end"));

	for ($i=0; $i<7; $i++)
	{
	    &comment("dl>0 Tail Round $i");
	    &mov($tmp1,&DWP($i*4,$a,"",0));	# *a
	    &sub($tmp1,$c);
	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
	    &jnc(&label("pw_tail_nc".$i));
	    &dec($num) if ($i != 6);
	    &jz(&label("pw_end")) if ($i != 6);
	}
	&mov($c,1);		# every tail word borrowed
	&jmp(&label("pw_end"));

	# Copy-only twin of the dl>0 code above, entered through the
	# pw_nc<i> / pw_tail_nc<i> labels once the borrow has cleared.
	&set_label("pw_nc_loop",0);
	for ($i=0; $i<8; $i++)
	{
	    &mov($tmp1,&DWP($i*4,$a,"",0));	# *a
	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
	    &set_label("pw_nc".$i,0);
	}

	&comment("");
	&add($a,32);
	&add($r,32);
	&sub($num,8);
	&jnz(&label("pw_nc_loop"));

	&mov($num,&wparam(4));	# get dl
	&and($num,7);
	&jz(&label("pw_nc_end"));

	for ($i=0; $i<7; $i++)
	{
	    &mov($tmp1,&DWP($i*4,$a,"",0));	# *a
	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
	    &set_label("pw_tail_nc".$i,0);
	    &dec($num) if ($i != 6);
	    &jz(&label("pw_nc_end")) if ($i != 6);
	}

	&set_label("pw_nc_end",0);
	&mov($c,0);		# copy path always ends without a borrow

	&set_label("pw_end",0);

#	&mov("eax",$c);		# not needed: $c is "eax"

	&function_end($name);
	}