1#! /usr/bin/env perl
2# Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
# Driver: locate the perlasm support code, redirect STDOUT to the requested
# output file and emit every routine in turn.

# Directory part of this script's path (including the trailing separator),
# so that x86asm.pl is found no matter what the working directory is.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../../perlasm");
require "x86asm.pl";

# The last command-line argument names the output file.  Everything the
# generator emits is printed to STDOUT, so STDOUT is re-opened onto it.
# Three-argument open keeps the file name from being parsed as part of the
# mode, and a failure is fatal: otherwise the assembly would silently go to
# whatever STDOUT the caller provided and the script would still exit 0.
# When no name was given the inherited STDOUT is left in place.
$output = pop;
if (defined($output) && $output ne "")
	{
	open STDOUT,">",$output or die "can't open $output: $!";
	}

# The first remaining argument is handed to the perlasm back end.
&asm_init($ARGV[0]);

# SSE2 code paths are generated only when the build passes
# -DOPENSSL_IA32_SSE2 among the remaining arguments.
$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }

&external_label("OPENSSL_ia32cap_P") if ($sse2);

&bn_mul_add_words("bn_mul_add_words");
&bn_mul_words("bn_mul_words");
&bn_sqr_words("bn_sqr_words");
&bn_div_words("bn_div_words");
&bn_add_words("bn_add_words");
&bn_sub_words("bn_sub_words");

&asm_finish();

# Buffered write errors only surface here, so the result must be checked.
close STDOUT or die "error closing STDOUT";
34
# Emit bn_mul_add_words(r, a, num, w).
#
# Stack arguments, in order: r (word 0), a (word 1), num (word 2), w (word 3).
# The generated routine computes r[i] += a[i]*w for every i in [0, num),
# carrying the high half of each step into the next word, and returns the
# final carry in eax.
#
# When $sse2 is set, an MMX/SSE2 variant is emitted first and selected at
# run time by testing bit 26 of the first OPENSSL_ia32cap_P word (the SSE2
# feature bit, judging by the label names); otherwise, or when the bit is
# clear, the plain integer variant that follows is used.
#
# $name is the symbol to emit.
sub bn_mul_add_words
	{
	my ($name)=@_;

	&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");

	$r="eax";
	$a="edx";
	$c="ecx";

	if ($sse2) {
		&picmeup("eax","OPENSSL_ia32cap_P");
		&bt(&DWP(0,"eax"),26);
		&jnc(&label("maw_non_sse2"));

		&mov($r,&wparam(0));
		&mov($a,&wparam(1));
		&mov($c,&wparam(2));
		&movd("mm0",&wparam(3));	# mm0 = w
		&pxor("mm1","mm1");		# mm1 = carry_in
		# The single-word loop below is a do-while; without this guard
		# num == 0 would touch r[0]/a[0] and wrap the counter, whereas
		# the integer path simply returns a zero carry.
		&test($c,$c);
		&jz(&label("maw_sse2_exit"));	# num == 0: return carry 0
		&jmp(&label("maw_sse2_entry"));

	# Eight words per iteration; multiplies are issued early and the
	# carry chain through mm1 is threaded between them.
	&set_label("maw_sse2_unrolled",16);
		&movd("mm3",&DWP(0,$r,"",0));	# mm3 = r[0]
		&paddq("mm1","mm3");		# mm1 = carry_in + r[0]
		&movd("mm2",&DWP(0,$a,"",0));	# mm2 = a[0]
		&pmuludq("mm2","mm0");		# mm2 = w*a[0]
		&movd("mm4",&DWP(4,$a,"",0));	# mm4 = a[1]
		&pmuludq("mm4","mm0");		# mm4 = w*a[1]
		&movd("mm6",&DWP(8,$a,"",0));	# mm6 = a[2]
		&pmuludq("mm6","mm0");		# mm6 = w*a[2]
		&movd("mm7",&DWP(12,$a,"",0));	# mm7 = a[3]
		&pmuludq("mm7","mm0");		# mm7 = w*a[3]
		&paddq("mm1","mm2");		# mm1 = carry_in + r[0] + w*a[0]
		&movd("mm3",&DWP(4,$r,"",0));	# mm3 = r[1]
		&paddq("mm3","mm4");		# mm3 = r[1] + w*a[1]
		&movd("mm5",&DWP(8,$r,"",0));	# mm5 = r[2]
		&paddq("mm5","mm6");		# mm5 = r[2] + w*a[2]
		&movd("mm4",&DWP(12,$r,"",0));	# mm4 = r[3]
		&paddq("mm7","mm4");		# mm7 = r[3] + w*a[3]
		&movd(&DWP(0,$r,"",0),"mm1");
		&movd("mm2",&DWP(16,$a,"",0));	# mm2 = a[4]
		&pmuludq("mm2","mm0");		# mm2 = w*a[4]
		&psrlq("mm1",32);		# mm1 = carry0
		&movd("mm4",&DWP(20,$a,"",0));	# mm4 = a[5]
		&pmuludq("mm4","mm0");		# mm4 = w*a[5]
		&paddq("mm1","mm3");		# mm1 = carry0 + r[1] + w*a[1]
		&movd("mm6",&DWP(24,$a,"",0));	# mm6 = a[6]
		&pmuludq("mm6","mm0");		# mm6 = w*a[6]
		&movd(&DWP(4,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry1
		&movd("mm3",&DWP(28,$a,"",0));	# mm3 = a[7]
		&add($a,32);
		&pmuludq("mm3","mm0");		# mm3 = w*a[7]
		&paddq("mm1","mm5");		# mm1 = carry1 + r[2] + w*a[2]
		&movd("mm5",&DWP(16,$r,"",0));	# mm5 = r[4]
		&paddq("mm2","mm5");		# mm2 = r[4] + w*a[4]
		&movd(&DWP(8,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry2
		&paddq("mm1","mm7");		# mm1 = carry2 + r[3] + w*a[3]
		&movd("mm5",&DWP(20,$r,"",0));	# mm5 = r[5]
		&paddq("mm4","mm5");		# mm4 = r[5] + w*a[5]
		&movd(&DWP(12,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry3
		&paddq("mm1","mm2");		# mm1 = carry3 + r[4] + w*a[4]
		&movd("mm5",&DWP(24,$r,"",0));	# mm5 = r[6]
		&paddq("mm6","mm5");		# mm6 = r[6] + w*a[6]
		&movd(&DWP(16,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry4
		&paddq("mm1","mm4");		# mm1 = carry4 + r[5] + w*a[5]
		&movd("mm5",&DWP(28,$r,"",0));	# mm5 = r[7]
		&paddq("mm3","mm5");		# mm3 = r[7] + w*a[7]
		&movd(&DWP(20,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry5
		&paddq("mm1","mm6");		# mm1 = carry5 + r[6] + w*a[6]
		&movd(&DWP(24,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry6
		&paddq("mm1","mm3");		# mm1 = carry6 + r[7] + w*a[7]
		&movd(&DWP(28,$r,"",0),"mm1");
		&lea($r,&DWP(32,$r));		# lea leaves the flags alone
		&psrlq("mm1",32);		# mm1 = carry_out

		&sub($c,8);
		&jz(&label("maw_sse2_exit"));
	&set_label("maw_sse2_entry");
		&test($c,0xfffffff8);		# at least 8 words left?
		&jnz(&label("maw_sse2_unrolled"));

	# One word per iteration for the remaining 1..7 words.
	&set_label("maw_sse2_loop",4);
		&movd("mm2",&DWP(0,$a));	# mm2 = a[i]
		&movd("mm3",&DWP(0,$r));	# mm3 = r[i]
		&pmuludq("mm2","mm0");		# a[i] *= w
		&lea($a,&DWP(4,$a));
		&paddq("mm1","mm3");		# carry += r[i]
		&paddq("mm1","mm2");		# carry += a[i]*w
		&movd(&DWP(0,$r),"mm1");	# r[i] = carry_low
		&sub($c,1);
		&psrlq("mm1",32);		# carry = carry_high
		&lea($r,&DWP(4,$r));
		&jnz(&label("maw_sse2_loop"));	# flags still from the sub above
	&set_label("maw_sse2_exit");
		&movd("eax","mm1");		# c = carry_out
		&emms();
		&ret();

	&set_label("maw_non_sse2",16);
	}

	# function_begin prologue
	&push("ebp");
	&push("ebx");
	&push("esi");
	&push("edi");

	&comment("");
	$Low="eax";
	$High="edx";
	$a="ebx";
	$w="ebp";
	$r="edi";
	$c="esi";

	&xor($c,$c);		# clear carry
	&mov($r,&wparam(0));	# r

	&mov("ecx",&wparam(2));	# num
	&mov($a,&wparam(1));	# a

	&and("ecx",0xfffffff8);	# ecx = num rounded down to a multiple of 8
	&mov($w,&wparam(3));	# w

	&push("ecx");		# extra stack slot; its value is not read back

	# mov and push leave the flags alone, so this still tests the "and".
	&jz(&label("maw_finish"));

	&set_label("maw_loop",16);

	# Eight words per iteration, offsets 0,4,...,28.
	for ($i=0; $i<32; $i+=4)
		{
		&comment("Round $i");

		 &mov("eax",&DWP($i,$a)); 	# *a
		&mul($w);			# *a * w
		&add("eax",$c);			# L(t)+= c
		&adc("edx",0);			# H(t)+=carry
		 &add("eax",&DWP($i,$r));	# L(t)+= *r
		&adc("edx",0);			# H(t)+=carry
		 &mov(&DWP($i,$r),"eax");	# *r= L(t);
		&mov($c,"edx");			# c=  H(t);
		}

	&comment("");
	&sub("ecx",8);
	&lea($a,&DWP(32,$a));	# lea does not disturb the flags set by sub
	&lea($r,&DWP(32,$r));
	&jnz(&label("maw_loop"));

	&set_label("maw_finish",0);
	&mov("ecx",&wparam(2));	# get num
	&and("ecx",7);		# words left over after the blocks of 8
	&jnz(&label("maw_finish2"));	# helps branch prediction
	&jmp(&label("maw_end"));

	# Up to seven tail words, fully unrolled; every round but the last
	# decrements the remaining count and leaves as soon as it hits zero.
	&set_label("maw_finish2",1);
	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		 &mov("eax",&DWP($i*4,$a));	# *a
		&mul($w);			# *a * w
		&add("eax",$c);			# L(t)+=c
		&adc("edx",0);			# H(t)+=carry
		 &add("eax",&DWP($i*4,$r));	# L(t)+= *r
		&adc("edx",0);			# H(t)+=carry
		 &dec("ecx") if ($i != 7-1);
		&mov(&DWP($i*4,$r),"eax");	# *r= L(t);
		 &mov($c,"edx");		# c=  H(t);
		&jz(&label("maw_end")) if ($i != 7-1);	# flags from the dec
		}
	&set_label("maw_end",0);
	&mov("eax",$c);		# return the carry

	&pop("ecx");	# drop the extra stack slot pushed above

	&function_end($name);
	}
220
# Emit bn_mul_words(r, a, num, w).
#
# Stack arguments, in order: r (word 0), a (word 1), num (word 2), w (word 3).
# The generated routine stores the low word of a[i]*w + carry into r[i] for
# every i in [0, num), carrying the high word into the next step, and
# returns the final carry in eax.
#
# When $sse2 is set, an MMX/SSE2 variant is emitted first and selected at
# run time by testing bit 26 of the first OPENSSL_ia32cap_P word (the SSE2
# feature bit, judging by the label names); otherwise, or when the bit is
# clear, the plain integer variant that follows is used.
#
# $name is the symbol to emit.
sub bn_mul_words
	{
	my ($name)=@_;

	&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");

	$r="eax";
	$a="edx";
	$c="ecx";

	if ($sse2) {
		&picmeup("eax","OPENSSL_ia32cap_P");
		&bt(&DWP(0,"eax"),26);
		&jnc(&label("mw_non_sse2"));

		&mov($r,&wparam(0));
		&mov($a,&wparam(1));
		&mov($c,&wparam(2));
		&movd("mm0",&wparam(3));	# mm0 = w
		&pxor("mm1","mm1");		# mm1 = carry = 0
		# The loop below is a do-while; without this guard num == 0
		# would touch r[0]/a[0] and wrap the counter, whereas the
		# integer path simply returns a zero carry.
		&test($c,$c);
		&jz(&label("mw_sse2_exit"));	# num == 0: return carry 0

	&set_label("mw_sse2_loop",16);
		&movd("mm2",&DWP(0,$a));	# mm2 = a[i]
		&pmuludq("mm2","mm0");		# a[i] *= w
		&lea($a,&DWP(4,$a));
		&paddq("mm1","mm2");		# carry += a[i]*w
		&movd(&DWP(0,$r),"mm1");	# r[i] = carry_low
		&sub($c,1);
		&psrlq("mm1",32);		# carry = carry_high
		&lea($r,&DWP(4,$r));
		&jnz(&label("mw_sse2_loop"));	# flags still from the sub above

	&set_label("mw_sse2_exit");
		&movd("eax","mm1");		# return carry
		&emms();
		&ret();
	&set_label("mw_non_sse2",16);
	}

	# function_begin prologue
	&push("ebp");
	&push("ebx");
	&push("esi");
	&push("edi");

	&comment("");
	$Low="eax";
	$High="edx";
	$a="ebx";
	$w="ecx";
	$r="edi";
	$c="esi";
	$num="ebp";

	&xor($c,$c);		# clear carry
	&mov($r,&wparam(0));	# r
	&mov($a,&wparam(1));	# a
	&mov($num,&wparam(2));	# num
	&mov($w,&wparam(3));	# w

	&and($num,0xfffffff8);	# num rounded down to a multiple of 8
	&jz(&label("mw_finish"));

	# Eight words per iteration, offsets 0,4,...,28.
	&set_label("mw_loop",0);
	for ($i=0; $i<32; $i+=4)
		{
		&comment("Round $i");

		 &mov("eax",&DWP($i,$a,"",0)); 	# *a
		&mul($w);			# *a * w
		&add("eax",$c);			# L(t)+=c
		 # XXX

		&adc("edx",0);			# H(t)+=carry
		 &mov(&DWP($i,$r,"",0),"eax");	# *r= L(t);

		&mov($c,"edx");			# c=  H(t);
		}

	&comment("");
	&add($a,32);
	&add($r,32);
	&sub($num,8);
	&jz(&label("mw_finish"));
	&jmp(&label("mw_loop"));

	&set_label("mw_finish",0);
	&mov($num,&wparam(2));	# get num
	&and($num,7);		# words left over after the blocks of 8
	&jnz(&label("mw_finish2"));
	&jmp(&label("mw_end"));

	# Up to seven tail words, fully unrolled; every round but the last
	# decrements the remaining count and leaves as soon as it hits zero.
	&set_label("mw_finish2",1);
	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		 &mov("eax",&DWP($i*4,$a,"",0));# *a
		&mul($w);			# *a * w
		&add("eax",$c);			# L(t)+=c
		 # XXX
		&adc("edx",0);			# H(t)+=carry
		 &mov(&DWP($i*4,$r,"",0),"eax");# *r= L(t);
		&mov($c,"edx");			# c=  H(t);
		 &dec($num) if ($i != 7-1);
		&jz(&label("mw_end")) if ($i != 7-1);
		}
	&set_label("mw_end",0);
	&mov("eax",$c);		# return the carry

	&function_end($name);
	}
331
# Emit bn_sqr_words(r, a, num).
#
# Stack arguments, in order: r (word 0), a (word 1), num (word 2).
# For every i in [0, num) the generated routine stores the 64-bit square of
# a[i] as two 32-bit words, low half at r[2*i] and high half at r[2*i+1].
# Nothing is carried between words.
#
# When $sse2 is set, an MMX/SSE2 variant is emitted first and selected at
# run time by testing bit 26 of the first OPENSSL_ia32cap_P word (the SSE2
# feature bit, judging by the label names); otherwise, or when the bit is
# clear, the plain integer variant that follows is used.
#
# $name is the symbol to emit.
sub bn_sqr_words
	{
	my ($name)=@_;

	&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");

	$r="eax";
	$a="edx";
	$c="ecx";

	if ($sse2) {
		&picmeup("eax","OPENSSL_ia32cap_P");
		&bt(&DWP(0,"eax"),26);
		&jnc(&label("sqr_non_sse2"));

		&mov($r,&wparam(0));
		&mov($a,&wparam(1));
		&mov($c,&wparam(2));
		# The loop below is a do-while; without this guard num == 0
		# would write r[0..1] and wrap the counter, whereas the
		# integer path stores nothing.
		&test($c,$c);
		&jz(&label("sqr_sse2_exit"));	# num == 0: nothing to do

	&set_label("sqr_sse2_loop",16);
		&movd("mm0",&DWP(0,$a));	# mm0 = a[i]
		&pmuludq("mm0","mm0");		# a[i] *= a[i]
		&lea($a,&DWP(4,$a));		# a++
		&movq(&QWP(0,$r),"mm0");	# r[i] = a[i]*a[i]
		&sub($c,1);
		&lea($r,&DWP(8,$r));		# r += 2
		&jnz(&label("sqr_sse2_loop"));	# flags still from the sub above

	&set_label("sqr_sse2_exit");
		&emms();
		&ret();
	&set_label("sqr_non_sse2",16);
	}

	# function_begin prologue
	&push("ebp");
	&push("ebx");
	&push("esi");
	&push("edi");

	&comment("");
	$r="esi";
	$a="edi";
	$num="ebx";

	&mov($r,&wparam(0));	# r
	&mov($a,&wparam(1));	# a
	&mov($num,&wparam(2));	# num

	&and($num,0xfffffff8);	# num rounded down to a multiple of 8
	&jz(&label("sw_finish"));

	# Eight input words per iteration; each produces two output words,
	# hence the doubled displacement on the r side.
	&set_label("sw_loop",0);
	for ($i=0; $i<32; $i+=4)
		{
		&comment("Round $i");
		&mov("eax",&DWP($i,$a,"",0)); 	# *a
		 # XXX
		&mul("eax");			# *a * *a
		&mov(&DWP($i*2,$r,"",0),"eax");	# low half
		 &mov(&DWP($i*2+4,$r,"",0),"edx");# high half
		}

	&comment("");
	&add($a,32);
	&add($r,64);
	&sub($num,8);
	&jnz(&label("sw_loop"));

	&set_label("sw_finish",0);
	&mov($num,&wparam(2));	# get num
	&and($num,7);		# words left over after the blocks of 8
	&jz(&label("sw_end"));

	# Up to seven tail words, fully unrolled; every round but the last
	# decrements the remaining count and leaves as soon as it hits zero.
	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		&mov("eax",&DWP($i*4,$a,"",0));	# *a
		 # XXX
		&mul("eax");			# *a * *a
		&mov(&DWP($i*8,$r,"",0),"eax");	# low half
		 &dec($num) if ($i != 7-1);
		&mov(&DWP($i*8+4,$r,"",0),"edx");	# high half; mov keeps dec's flags
		 &jz(&label("sw_end")) if ($i != 7-1);
		}
	&set_label("sw_end",0);

	&function_end($name);
	}
420
# Emit bn_div_words: a bare wrapper around the x86 "div" instruction.
#
# Stack arguments, in order: the high dividend word (loaded into edx), the
# low dividend word (loaded into eax) and the divisor (loaded into ecx).
# "div ecx" divides edx:eax by ecx and leaves the quotient in eax, which is
# what the routine returns.  No registers are saved and no checks are made.
#
# $name is the symbol to emit.
sub bn_div_words
	{
	my ($name)=@_;
	my ($dividend_hi,$dividend_lo,$divisor)=("edx","eax","ecx");

	&function_begin_B($name,"");

	# Load the three stack arguments in declaration order.
	&mov($dividend_hi,&wparam(0));
	&mov($dividend_lo,&wparam(1));
	&mov($divisor,&wparam(2));

	&div($divisor);
	&ret();

	&function_end_B($name);
	}
433
# Emit bn_add_words(r, a, b, num).
#
# Stack arguments, in order: r (word 0), a (word 1), b (word 2), num (word 3).
# The generated routine stores a[i] + b[i] + carry into r[i] for every i in
# [0, num) and returns the final carry (0 or 1) in eax.
#
# The carry is kept in a register rather than in the CPU flag: each word is
# added in two steps (carry first, then b[i]) and the carry-out of both
# steps is collected back into the carry register.
#
# $name is the symbol to emit.
sub bn_add_words
	{
	my ($name)=@_;

	&function_begin($name,"");

	&comment("");
	# Register allocation.
	($r,$a,$b,$num)=("ebx","esi","edi","ebp");
	($c,$tmp1,$tmp2)=("eax","ecx","edx");

	&mov($r,&wparam(0));	# get r
	&mov($a,&wparam(1));	# get a
	&mov($b,&wparam(2));	# get b
	&mov($num,&wparam(3));	# get num
	&xor($c,$c);		# clear carry
	&and($num,0xfffffff8);	# num rounded down to a multiple of 8

	&jz(&label("aw_finish"));	# fewer than 8 words: tail only

	# Main loop: eight words per iteration.
	&set_label("aw_loop",0);
	foreach my $word (0..7)
		{
		my $disp=$word*4;

		&comment("Round $word");

		&mov($tmp1,&DWP($disp,$a,"",0));	# *a
		&mov($tmp2,&DWP($disp,$b,"",0));	# *b
		&add($tmp1,$c);				# *a + carry in
		&mov($c,0);				# mov keeps the carry flag
		&adc($c,$c);				# c = carry of first add
		&add($tmp1,$tmp2);			# ... + *b
		&adc($c,0);				# c += carry of second add
		&mov(&DWP($disp,$r,"",0),$tmp1);	# *r
		}

	&comment("");
	&add($a,32);
	&add($b,32);
	&add($r,32);
	&sub($num,8);
	&jnz(&label("aw_loop"));

	&set_label("aw_finish",0);
	&mov($num,&wparam(3));	# get num
	&and($num,7);		# words left over after the blocks of 8
	&jz(&label("aw_end"));

	# Up to seven tail words, fully unrolled.  Every round except the
	# last counts the remainder down and exits once it reaches zero.
	foreach my $word (0..6)
		{
		my $disp=$word*4;
		my $more=($word != 6);

		&comment("Tail Round $word");
		&mov($tmp1,&DWP($disp,$a,"",0));	# *a
		&mov($tmp2,&DWP($disp,$b,"",0));	# *b
		&add($tmp1,$c);
		&mov($c,0);
		&adc($c,$c);
		&add($tmp1,$tmp2);
		&adc($c,0);
		&dec($num) if ($more);
		&mov(&DWP($disp,$r,"",0),$tmp1);	# *r; mov keeps dec's flags
		&jz(&label("aw_end")) if ($more);
		}
	&set_label("aw_end",0);

	# The carry is already in eax ($c), so no final move is needed.

	&function_end($name);
	}
505
# Emit bn_sub_words(r, a, b, num).
#
# Stack arguments, in order: r (word 0), a (word 1), b (word 2), num (word 3).
# The generated routine stores a[i] - b[i] - borrow into r[i] for every i in
# [0, num) and returns the final borrow (0 or 1) in eax.
#
# The borrow is kept in a register rather than in the CPU flag: each word is
# subtracted in two steps (borrow first, then b[i]) and the borrow-out of
# both steps is collected back into the borrow register.
#
# The label names carry the "aw_" prefix also used by bn_add_words; they are
# kept as they are so the generated output does not change.
#
# $name is the symbol to emit.
sub bn_sub_words
	{
	my ($name)=@_;

	&function_begin($name,"");

	&comment("");
	# Register allocation.
	($r,$a,$b,$num)=("ebx","esi","edi","ebp");
	($c,$tmp1,$tmp2)=("eax","ecx","edx");

	&mov($r,&wparam(0));	# get r
	&mov($a,&wparam(1));	# get a
	&mov($b,&wparam(2));	# get b
	&mov($num,&wparam(3));	# get num
	&xor($c,$c);		# clear borrow
	&and($num,0xfffffff8);	# num rounded down to a multiple of 8

	&jz(&label("aw_finish"));	# fewer than 8 words: tail only

	# Main loop: eight words per iteration.
	&set_label("aw_loop",0);
	foreach my $word (0..7)
		{
		my $disp=$word*4;

		&comment("Round $word");

		&mov($tmp1,&DWP($disp,$a,"",0));	# *a
		&mov($tmp2,&DWP($disp,$b,"",0));	# *b
		&sub($tmp1,$c);				# *a - borrow in
		&mov($c,0);				# mov keeps the carry flag
		&adc($c,$c);				# c = borrow of first sub
		&sub($tmp1,$tmp2);			# ... - *b
		&adc($c,0);				# c += borrow of second sub
		&mov(&DWP($disp,$r,"",0),$tmp1);	# *r
		}

	&comment("");
	&add($a,32);
	&add($b,32);
	&add($r,32);
	&sub($num,8);
	&jnz(&label("aw_loop"));

	&set_label("aw_finish",0);
	&mov($num,&wparam(3));	# get num
	&and($num,7);		# words left over after the blocks of 8
	&jz(&label("aw_end"));

	# Up to seven tail words, fully unrolled.  Every round except the
	# last counts the remainder down and exits once it reaches zero.
	foreach my $word (0..6)
		{
		my $disp=$word*4;
		my $more=($word != 6);

		&comment("Tail Round $word");
		&mov($tmp1,&DWP($disp,$a,"",0));	# *a
		&mov($tmp2,&DWP($disp,$b,"",0));	# *b
		&sub($tmp1,$c);
		&mov($c,0);
		&adc($c,$c);
		&sub($tmp1,$tmp2);
		&adc($c,0);
		&dec($num) if ($more);
		&mov(&DWP($disp,$r,"",0),$tmp1);	# *r; mov keeps dec's flags
		&jz(&label("aw_end")) if ($more);
		}
	&set_label("aw_end",0);

	# The borrow is already in eax ($c), so no final move is needed.

	&function_end($name);
	}
577