#! /usr/bin/env perl
# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in
#    the documentation and/or other materials provided with the
#    distribution.
#
# 3. All advertising materials mentioning features or use of this
#    software must display the following acknowledgment:
#    "This product includes software developed by the OpenSSL Project
#    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
#
# 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
#    endorse or promote products derived from this software without
#    prior written permission. For written permission, please contact
#    openssl-core@openssl.org.
#
# 5. Products derived from this software may not be called "OpenSSL"
#    nor may "OpenSSL" appear in their names without prior written
#    permission of the OpenSSL Project.
#
# 6. Redistributions of any form whatsoever must retain the following
#    acknowledgment:
#    "This product includes software developed by the OpenSSL Project
#    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
#
# THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
# EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
# OF THE POSSIBILITY OF SUCH DAMAGE.
# ====================================================================
#
# This product includes cryptographic software written by Eric Young
# (eay@cryptsoft.com).  This product includes software written by Tim
# Hudson (tjh@cryptsoft.com).


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# ECP_NISTZ256 module for x86/SSE2.
#
# October 2014.
#
# The original ECP_NISTZ256 submission targeting x86_64 is detailed
# in http://eprint.iacr.org/2013/816. In the process of adaptation
# the original .c module was made 32-bit savvy in order to make this
# implementation possible.
#
#		with/without -DECP_NISTZ256_ASM
# Pentium	+66-163%
# PIII		+72-172%
# P4		+65-132%
# Core2		+90-215%
# Sandy Bridge	+105-265% (contemporary i[57]-* are all close to this)
# Atom		+65-155%
# Opteron	+54-110%
# Bulldozer	+99-240%
# VIA Nano	+93-290%
#
# Ranges denote minimum and maximum improvement coefficients depending
# on benchmark. Lower coefficients are for ECDSA sign, a server-side
# operation. Keep in mind that +200% means 3x improvement.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../../perlasm");
require "x86asm.pl";

$output=pop;
open STDOUT,">$output";

&asm_init($ARGV[0],"ecp_nistz256-x86.pl",$ARGV[$#ARGV] eq "386");

$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }

&external_label("GFp_ia32cap_P") if ($sse2);


########################################################################
# Keep in mind that constants are stored least to most significant word
&static_label("ONE_mont");
&set_label("ONE_mont");
&data_word(1,0,0,-1,-1,-1,-2,0);
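#
# Sanity sketch (illustrative comment only, not part of the generated
# code): for p = 2^256-2^224+2^192+2^96-1, the Montgomery form of 1 is
# R mod p = 2^256 mod p = 2^224-2^192-2^96+1, whose little-endian
# 32-bit words are exactly (1,0,0,-1,-1,-1,-2,0) above. One way to
# double-check the constant from Perl, using core Math::BigInt:
#
#	use Math::BigInt;
#	my $p = Math::BigInt->new(2)->bpow(256)
#	      - Math::BigInt->new(2)->bpow(224)
#	      + Math::BigInt->new(2)->bpow(192)
#	      + Math::BigInt->new(2)->bpow(96) - 1;
#	print Math::BigInt->new(2)->bpow(256)->bmod($p)->as_hex(), "\n";
#	# 0xfffffffeffffffffffffffffffffffff000000000000000000000001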


&function_begin_B("_ecp_nistz256_div_by_2");
	# tmp = a is odd ? a+mod : a
	#
	# note that because mod has special form, i.e. consists of
	# 0xffffffff, 1 and 0s, we can conditionally synthesize it by
	# assigning least significant bit of input to one register,
	# %ebp, and its negative to another, %edx.
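	#
	# Illustrative word-level sketch of what follows: with
	# lsb = a[0]&1 (%ebp) and neg = 0-lsb (%edx, i.e. 0 or
	# 0xffffffff), the addend synthesized below is
	#	{ neg, neg, neg, 0, 0, 0, lsb, neg }
	# which is exactly lsb ? mod : 0 for the little-endian word
	# form of mod = 2^256-2^224+2^192+2^96-1.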

	&mov	("ebp",&DWP(0,"esi"));
	&xor	("edx","edx");
	&mov	("ebx",&DWP(4,"esi"));
	&mov	("eax","ebp");
	&and	("ebp",1);
	&mov	("ecx",&DWP(8,"esi"));
	&sub	("edx","ebp");

	&add	("eax","edx");
	&adc	("ebx","edx");
	&mov	(&DWP(0,"edi"),"eax");
	&adc	("ecx","edx");
	&mov	(&DWP(4,"edi"),"ebx");
	&mov	(&DWP(8,"edi"),"ecx");

	&mov	("eax",&DWP(12,"esi"));
	&mov	("ebx",&DWP(16,"esi"));
	&adc	("eax",0);
	&mov	("ecx",&DWP(20,"esi"));
	&adc	("ebx",0);
	&mov	(&DWP(12,"edi"),"eax");
	&adc	("ecx",0);
	&mov	(&DWP(16,"edi"),"ebx");
	&mov	(&DWP(20,"edi"),"ecx");

	&mov	("eax",&DWP(24,"esi"));
	&mov	("ebx",&DWP(28,"esi"));
	&adc	("eax","ebp");
	&adc	("ebx","edx");
	&mov	(&DWP(24,"edi"),"eax");
	&sbb	("esi","esi");			# broadcast carry bit
	&mov	(&DWP(28,"edi"),"ebx");

	# ret = tmp >> 1

	&mov	("eax",&DWP(0,"edi"));
	&mov	("ebx",&DWP(4,"edi"));
	&mov	("ecx",&DWP(8,"edi"));
	&mov	("edx",&DWP(12,"edi"));

	&shr	("eax",1);
	&mov	("ebp","ebx");
	&shl	("ebx",31);
	&or	("eax","ebx");

	&shr	("ebp",1);
	&mov	("ebx","ecx");
	&shl	("ecx",31);
	&mov	(&DWP(0,"edi"),"eax");
	&or	("ebp","ecx");
	&mov	("eax",&DWP(16,"edi"));

	&shr	("ebx",1);
	&mov	("ecx","edx");
	&shl	("edx",31);
	&mov	(&DWP(4,"edi"),"ebp");
	&or	("ebx","edx");
	&mov	("ebp",&DWP(20,"edi"));

	&shr	("ecx",1);
	&mov	("edx","eax");
	&shl	("eax",31);
	&mov	(&DWP(8,"edi"),"ebx");
	&or	("ecx","eax");
	&mov	("ebx",&DWP(24,"edi"));

	&shr	("edx",1);
	&mov	("eax","ebp");
	&shl	("ebp",31);
	&mov	(&DWP(12,"edi"),"ecx");
	&or	("edx","ebp");
	&mov	("ecx",&DWP(28,"edi"));

	&shr	("eax",1);
	&mov	("ebp","ebx");
	&shl	("ebx",31);
	&mov	(&DWP(16,"edi"),"edx");
	&or	("eax","ebx");

	&shr	("ebp",1);
	&mov	("ebx","ecx");
	&shl	("ecx",31);
	&mov	(&DWP(20,"edi"),"eax");
	&or	("ebp","ecx");

	&shr	("ebx",1);
	&shl	("esi",31);
	&mov	(&DWP(24,"edi"),"ebp");
	&or	("ebx","esi");			# handle top-most carry bit
	&mov	(&DWP(28,"edi"),"ebx");

	&ret	();
&function_end_B("_ecp_nistz256_div_by_2");

########################################################################
# void GFp_nistz256_add(BN_ULONG edi[8],const BN_ULONG esi[8],
#					const BN_ULONG ebp[8]);
&function_begin("GFp_nistz256_add");
	&mov	("esi",&wparam(1));
	&mov	("ebp",&wparam(2));
	&mov	("edi",&wparam(0));
	&call	("_ecp_nistz256_add");
&function_end("GFp_nistz256_add");

&function_begin_B("_ecp_nistz256_add");
	&mov	("eax",&DWP(0,"esi"));
	&mov	("ebx",&DWP(4,"esi"));
	&mov	("ecx",&DWP(8,"esi"));
	&add	("eax",&DWP(0,"ebp"));
	&mov	("edx",&DWP(12,"esi"));
	&adc	("ebx",&DWP(4,"ebp"));
	&mov	(&DWP(0,"edi"),"eax");
	&adc	("ecx",&DWP(8,"ebp"));
	&mov	(&DWP(4,"edi"),"ebx");
	&adc	("edx",&DWP(12,"ebp"));
	&mov	(&DWP(8,"edi"),"ecx");
	&mov	(&DWP(12,"edi"),"edx");

	&mov	("eax",&DWP(16,"esi"));
	&mov	("ebx",&DWP(20,"esi"));
	&mov	("ecx",&DWP(24,"esi"));
	&adc	("eax",&DWP(16,"ebp"));
	&mov	("edx",&DWP(28,"esi"));
	&adc	("ebx",&DWP(20,"ebp"));
	&mov	(&DWP(16,"edi"),"eax");
	&adc	("ecx",&DWP(24,"ebp"));
	&mov	(&DWP(20,"edi"),"ebx");
	&mov	("esi",0);
	&adc	("edx",&DWP(28,"ebp"));
	&mov	(&DWP(24,"edi"),"ecx");
	&adc	("esi",0);
	&mov	(&DWP(28,"edi"),"edx");

	# if a+b >= modulus, subtract modulus.
	#
	# But since comparison implies subtraction, we subtract modulus
	# to see if it borrows, and then subtract it for real if
	# subtraction didn't borrow.

	&mov	("eax",&DWP(0,"edi"));
	&mov	("ebx",&DWP(4,"edi"));
	&mov	("ecx",&DWP(8,"edi"));
	&sub	("eax",-1);
	&mov	("edx",&DWP(12,"edi"));
	&sbb	("ebx",-1);
	&mov	("eax",&DWP(16,"edi"));
	&sbb	("ecx",-1);
	&mov	("ebx",&DWP(20,"edi"));
	&sbb	("edx",0);
	&mov	("ecx",&DWP(24,"edi"));
	&sbb	("eax",0);
	&mov	("edx",&DWP(28,"edi"));
	&sbb	("ebx",0);
	&sbb	("ecx",1);
	&sbb	("edx",-1);
	&sbb	("esi",0);

	# Note that because mod has special form, i.e. consists of
	# 0xffffffff, 1 and 0s, we can conditionally synthesize it
	# by using the borrow.
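	#
	# Illustrative sketch: after the dry-run subtraction, &not()
	# below turns %esi into 0xffffffff when the sum was >= modulus
	# and 0 otherwise, %ebp gets its top bit, and the words
	# subtracted are
	#	{ %esi, %esi, %esi, 0, 0, 0, %ebp, %esi }
	# i.e. either the modulus or zero.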

	&not	("esi");
	&mov	("eax",&DWP(0,"edi"));
	&mov	("ebp","esi");
	&mov	("ebx",&DWP(4,"edi"));
	&shr	("ebp",31);
	&mov	("ecx",&DWP(8,"edi"));
	&sub	("eax","esi");
	&mov	("edx",&DWP(12,"edi"));
	&sbb	("ebx","esi");
	&mov	(&DWP(0,"edi"),"eax");
	&sbb	("ecx","esi");
	&mov	(&DWP(4,"edi"),"ebx");
	&sbb	("edx",0);
	&mov	(&DWP(8,"edi"),"ecx");
	&mov	(&DWP(12,"edi"),"edx");

	&mov	("eax",&DWP(16,"edi"));
	&mov	("ebx",&DWP(20,"edi"));
	&mov	("ecx",&DWP(24,"edi"));
	&sbb	("eax",0);
	&mov	("edx",&DWP(28,"edi"));
	&sbb	("ebx",0);
	&mov	(&DWP(16,"edi"),"eax");
	&sbb	("ecx","ebp");
	&mov	(&DWP(20,"edi"),"ebx");
	&sbb	("edx","esi");
	&mov	(&DWP(24,"edi"),"ecx");
	&mov	(&DWP(28,"edi"),"edx");

	&ret	();
&function_end_B("_ecp_nistz256_add");

&function_begin_B("_ecp_nistz256_sub");
	&mov	("eax",&DWP(0,"esi"));
	&mov	("ebx",&DWP(4,"esi"));
	&mov	("ecx",&DWP(8,"esi"));
	&sub	("eax",&DWP(0,"ebp"));
	&mov	("edx",&DWP(12,"esi"));
	&sbb	("ebx",&DWP(4,"ebp"));
	&mov	(&DWP(0,"edi"),"eax");
	&sbb	("ecx",&DWP(8,"ebp"));
	&mov	(&DWP(4,"edi"),"ebx");
	&sbb	("edx",&DWP(12,"ebp"));
	&mov	(&DWP(8,"edi"),"ecx");
	&mov	(&DWP(12,"edi"),"edx");

	&mov	("eax",&DWP(16,"esi"));
	&mov	("ebx",&DWP(20,"esi"));
	&mov	("ecx",&DWP(24,"esi"));
	&sbb	("eax",&DWP(16,"ebp"));
	&mov	("edx",&DWP(28,"esi"));
	&sbb	("ebx",&DWP(20,"ebp"));
	&sbb	("ecx",&DWP(24,"ebp"));
	&mov	(&DWP(16,"edi"),"eax");
	&sbb	("edx",&DWP(28,"ebp"));
	&mov	(&DWP(20,"edi"),"ebx");
	&sbb	("esi","esi");			# broadcast borrow bit
	&mov	(&DWP(24,"edi"),"ecx");
	&mov	(&DWP(28,"edi"),"edx");

	# if a-b borrows, add modulus.
	#
	# Note that because mod has special form, i.e. consists of
	# 0xffffffff, 1 and 0s, we can conditionally synthesize it by
	# assigning borrow bit to one register, %ebp, and its negative
	# to another, %esi. But we started by calculating %esi...
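	#
	# Concretely, the addend below is
	#	{ %esi, %esi, %esi, 0, 0, 0, %ebp, %esi }
	# i.e. the modulus if the subtraction borrowed and zero
	# otherwise.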

	&mov	("eax",&DWP(0,"edi"));
	&mov	("ebp","esi");
	&mov	("ebx",&DWP(4,"edi"));
	&shr	("ebp",31);
	&mov	("ecx",&DWP(8,"edi"));
	&add	("eax","esi");
	&mov	("edx",&DWP(12,"edi"));
	&adc	("ebx","esi");
	&mov	(&DWP(0,"edi"),"eax");
	&adc	("ecx","esi");
	&mov	(&DWP(4,"edi"),"ebx");
	&adc	("edx",0);
	&mov	(&DWP(8,"edi"),"ecx");
	&mov	(&DWP(12,"edi"),"edx");

	&mov	("eax",&DWP(16,"edi"));
	&mov	("ebx",&DWP(20,"edi"));
	&mov	("ecx",&DWP(24,"edi"));
	&adc	("eax",0);
	&mov	("edx",&DWP(28,"edi"));
	&adc	("ebx",0);
	&mov	(&DWP(16,"edi"),"eax");
	&adc	("ecx","ebp");
	&mov	(&DWP(20,"edi"),"ebx");
	&adc	("edx","esi");
	&mov	(&DWP(24,"edi"),"ecx");
	&mov	(&DWP(28,"edi"),"edx");

	&ret	();
&function_end_B("_ecp_nistz256_sub");

########################################################################
# void GFp_nistz256_neg(BN_ULONG edi[8],const BN_ULONG esi[8]);
&function_begin("GFp_nistz256_neg");
	&mov	("ebp",&wparam(1));
	&mov	("edi",&wparam(0));

	&xor	("eax","eax");
	&stack_push(8);
	&mov	(&DWP(0,"esp"),"eax");
	&mov	("esi","esp");
	&mov	(&DWP(4,"esp"),"eax");
	&mov	(&DWP(8,"esp"),"eax");
	&mov	(&DWP(12,"esp"),"eax");
	&mov	(&DWP(16,"esp"),"eax");
	&mov	(&DWP(20,"esp"),"eax");
	&mov	(&DWP(24,"esp"),"eax");
	&mov	(&DWP(28,"esp"),"eax");

	&call	("_ecp_nistz256_sub");

	&stack_pop(8);
&function_end("GFp_nistz256_neg");
&function_begin_B("_picup_eax");
	&mov	("eax",&DWP(0,"esp"));		# pick up return address as PIC base
	&ret	();
&function_end_B("_picup_eax");

########################################################################
# void GFp_nistz256_mul_mont(BN_ULONG edi[8],const BN_ULONG esi[8],
#					     const BN_ULONG ebp[8]);
&function_begin("GFp_nistz256_mul_mont");
	&mov	("esi",&wparam(1));
	&mov	("ebp",&wparam(2));
						if ($sse2) {
	&call	("_picup_eax");
    &set_label("pic");
	&picmeup("eax","GFp_ia32cap_P","eax",&label("pic"));
	&mov	("eax",&DWP(0,"eax"));		}
	&mov	("edi",&wparam(0));
	&call	("_ecp_nistz256_mul_mont");
&function_end("GFp_nistz256_mul_mont");

&function_begin_B("_ecp_nistz256_mul_mont");
						if ($sse2) {
	# We always use SSE2

	########################################
	# SSE2 code path featuring 32x16-bit
	# multiplications is ~2x faster than
	# IALU counterpart (except on Atom)...
	########################################
	# stack layout:
	# +------------------------------------+< %esp
	# | 7 16-byte temporary XMM words,     |
	# | "sliding" toward lower address     |
	# .                                    .
	# +------------------------------------+
	# | unused XMM word                    |
	# +------------------------------------+< +128,%ebx
	# | 8 16-byte XMM words holding copies |
	# | of a[i]<<64|a[i]                   |
	# .                                    .
	# .                                    .
	# +------------------------------------+< +256
	&mov	("edx","esp");			# preserve caller's %esp
	&sub	("esp",0x100);

	&movd	("xmm7",&DWP(0,"ebp"));		# b[0] -> 0000.00xy
	&lea	("ebp",&DWP(4,"ebp"));
	&pcmpeqd("xmm6","xmm6");
	&psrlq	("xmm6",48);			# compose 0xffff<<64|0xffff

	&pshuflw("xmm7","xmm7",0b11011100);	# 0000.00xy -> 0000.0x0y
	&and	("esp",-64);			# align stack to 64 bytes
	&pshufd	("xmm7","xmm7",0b11011100);	# 0000.0x0y -> 000x.000y
	&lea	("ebx",&DWP(0x80,"esp"));

	&movd	("xmm0",&DWP(4*0,"esi"));	# a[0] -> 0000.00xy
	&pshufd	("xmm0","xmm0",0b11001100);	# 0000.00xy -> 00xy.00xy
	&movd	("xmm1",&DWP(4*1,"esi"));	# a[1] -> ...
	&movdqa	(&QWP(0x00,"ebx"),"xmm0");	# offload converted a[0]
	&pmuludq("xmm0","xmm7");		# a[0]*b[0]

	&movd	("xmm2",&DWP(4*2,"esi"));
	&pshufd	("xmm1","xmm1",0b11001100);
	&movdqa	(&QWP(0x10,"ebx"),"xmm1");
	&pmuludq("xmm1","xmm7");		# a[1]*b[0]

	 &movq	("xmm4","xmm0");		# clear upper 64 bits
	 &pslldq("xmm4",6);
	 &paddq	("xmm4","xmm0");
	 &movdqa("xmm5","xmm4");
	 &psrldq("xmm4",10);			# upper 32 bits of a[0]*b[0]
	 &pand	("xmm5","xmm6");		# lower 32 bits of a[0]*b[0]

	# The upper half of a[0]*b[i] is carried into the next
	# multiplication iteration, while the lower one "participates"
	# in the actual reduction. Normally the latter is done by
	# accumulating the result of multiplying the modulus by the
	# "magic" digit, but thanks to the special form of the modulus
	# and of the "magic" digit it can be performed with additions
	# and subtractions alone (see the sketch below). Note that we
	# are not bothered with carry bits: they are accumulated in the
	# "flatten" phase after all multiplications and reductions.
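	#
	# Illustrative sketch of one reduction step: p is congruent to
	# -1 mod 2^32, so n0 = -p^-1 mod 2^32 = 1 and the "magic" digit
	# is simply m = lw(t[0]). Accumulating m*p then amounts to
	#	t[3] += m;			# 2^96 term of p
	#	t[6] += m;			# 2^192 term of p
	#	t[7] += m*0xffffffff;		# 2^224 and 2^256 terms
	# while t[0]-m vanishes by construction, allowing the one-word
	# right shift.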

	&movd	("xmm3",&DWP(4*3,"esi"));
	&pshufd	("xmm2","xmm2",0b11001100);
	&movdqa	(&QWP(0x20,"ebx"),"xmm2");
	&pmuludq("xmm2","xmm7");		# a[2]*b[0]
	 &paddq	("xmm1","xmm4");		# a[1]*b[0]+hw(a[0]*b[0]), carry
	&movdqa	(&QWP(0x00,"esp"),"xmm1");	# t[0]

	&movd	("xmm0",&DWP(4*4,"esi"));
	&pshufd	("xmm3","xmm3",0b11001100);
	&movdqa	(&QWP(0x30,"ebx"),"xmm3");
	&pmuludq("xmm3","xmm7");		# a[3]*b[0]
	&movdqa	(&QWP(0x10,"esp"),"xmm2");

	&movd	("xmm1",&DWP(4*5,"esi"));
	&pshufd	("xmm0","xmm0",0b11001100);
	&movdqa	(&QWP(0x40,"ebx"),"xmm0");
	&pmuludq("xmm0","xmm7");		# a[4]*b[0]
	 &paddq	("xmm3","xmm5");		# a[3]*b[0]+lw(a[0]*b[0]), reduction step
	&movdqa	(&QWP(0x20,"esp"),"xmm3");

	&movd	("xmm2",&DWP(4*6,"esi"));
	&pshufd	("xmm1","xmm1",0b11001100);
	&movdqa	(&QWP(0x50,"ebx"),"xmm1");
	&pmuludq("xmm1","xmm7");		# a[5]*b[0]
	&movdqa	(&QWP(0x30,"esp"),"xmm0");
	 &pshufd("xmm4","xmm5",0b10110001);	# xmm4 = xmm5<<32, reduction step

	&movd	("xmm3",&DWP(4*7,"esi"));
	&pshufd	("xmm2","xmm2",0b11001100);
	&movdqa	(&QWP(0x60,"ebx"),"xmm2");
	&pmuludq("xmm2","xmm7");		# a[6]*b[0]
	&movdqa	(&QWP(0x40,"esp"),"xmm1");
	 &psubq	("xmm4","xmm5");		# xmm4 = xmm5*0xffffffff, reduction step

	&movd	("xmm0",&DWP(0,"ebp"));		# b[1] -> 0000.00xy
	&pshufd	("xmm3","xmm3",0b11001100);
	&movdqa	(&QWP(0x70,"ebx"),"xmm3");
	&pmuludq("xmm3","xmm7");		# a[7]*b[0]

	&pshuflw("xmm7","xmm0",0b11011100);	# 0000.00xy -> 0000.0x0y
	&movdqa	("xmm0",&QWP(0x00,"ebx"));	# pre-load converted a[0]
	&pshufd	("xmm7","xmm7",0b11011100);	# 0000.0x0y -> 000x.000y

	&mov	("ecx",6);
	&lea	("ebp",&DWP(4,"ebp"));
	&jmp	(&label("madd_sse2"));

&set_label("madd_sse2",16);
	 &paddq	("xmm2","xmm5");		# a[6]*b[i-1]+lw(a[0]*b[i-1]), reduction step [modulo-scheduled]
	 &paddq	("xmm3","xmm4");		# a[7]*b[i-1]+lw(a[0]*b[i-1])*0xffffffff, reduction step [modulo-scheduled]
	&movdqa	("xmm1",&QWP(0x10,"ebx"));
	&pmuludq("xmm0","xmm7");		# a[0]*b[i]
	 &movdqa(&QWP(0x50,"esp"),"xmm2");

	&movdqa	("xmm2",&QWP(0x20,"ebx"));
	&pmuludq("xmm1","xmm7");		# a[1]*b[i]
	 &movdqa(&QWP(0x60,"esp"),"xmm3");
	&paddq	("xmm0",&QWP(0x00,"esp"));

	&movdqa	("xmm3",&QWP(0x30,"ebx"));
	&pmuludq("xmm2","xmm7");		# a[2]*b[i]
	 &movq	("xmm4","xmm0");		# clear upper 64 bits
	 &pslldq("xmm4",6);
	&paddq	("xmm1",&QWP(0x10,"esp"));
	 &paddq	("xmm4","xmm0");
	 &movdqa("xmm5","xmm4");
	 &psrldq("xmm4",10);			# upper 33 bits of a[0]*b[i]+t[0]

	&movdqa	("xmm0",&QWP(0x40,"ebx"));
	&pmuludq("xmm3","xmm7");		# a[3]*b[i]
	 &paddq	("xmm1","xmm4");		# a[1]*b[i]+hw(a[0]*b[i]), carry
	&paddq	("xmm2",&QWP(0x20,"esp"));
	&movdqa	(&QWP(0x00,"esp"),"xmm1");

	&movdqa	("xmm1",&QWP(0x50,"ebx"));
	&pmuludq("xmm0","xmm7");		# a[4]*b[i]
	&paddq	("xmm3",&QWP(0x30,"esp"));
	&movdqa	(&QWP(0x10,"esp"),"xmm2");
	 &pand	("xmm5","xmm6");		# lower 32 bits of a[0]*b[i]

	&movdqa	("xmm2",&QWP(0x60,"ebx"));
	&pmuludq("xmm1","xmm7");		# a[5]*b[i]
	 &paddq	("xmm3","xmm5");		# a[3]*b[i]+lw(a[0]*b[i]), reduction step
	&paddq	("xmm0",&QWP(0x40,"esp"));
	&movdqa	(&QWP(0x20,"esp"),"xmm3");
	 &pshufd("xmm4","xmm5",0b10110001);	# xmm4 = xmm5<<32, reduction step

	&movdqa	("xmm3","xmm7");
	&pmuludq("xmm2","xmm7");		# a[6]*b[i]
	 &movd	("xmm7",&DWP(0,"ebp"));		# b[i++] -> 0000.00xy
	 &lea	("ebp",&DWP(4,"ebp"));
	&paddq	("xmm1",&QWP(0x50,"esp"));
	 &psubq	("xmm4","xmm5");		# xmm4 = xmm5*0xffffffff, reduction step
	&movdqa	(&QWP(0x30,"esp"),"xmm0");
	 &pshuflw("xmm7","xmm7",0b11011100);	# 0000.00xy -> 0000.0x0y

	&pmuludq("xmm3",&QWP(0x70,"ebx"));	# a[7]*b[i]
	 &pshufd("xmm7","xmm7",0b11011100);	# 0000.0x0y -> 000x.000y
	 &movdqa("xmm0",&QWP(0x00,"ebx"));	# pre-load converted a[0]
	&movdqa	(&QWP(0x40,"esp"),"xmm1");
	&paddq	("xmm2",&QWP(0x60,"esp"));

	&dec	("ecx");
	&jnz	(&label("madd_sse2"));

	 &paddq	("xmm2","xmm5");		# a[6]*b[6]+lw(a[0]*b[6]), reduction step [modulo-scheduled]
	 &paddq	("xmm3","xmm4");		# a[7]*b[6]+lw(a[0]*b[6])*0xffffffff, reduction step [modulo-scheduled]
	&movdqa	("xmm1",&QWP(0x10,"ebx"));
	&pmuludq("xmm0","xmm7");		# a[0]*b[7]
	 &movdqa(&QWP(0x50,"esp"),"xmm2");

	&movdqa	("xmm2",&QWP(0x20,"ebx"));
	&pmuludq("xmm1","xmm7");		# a[1]*b[7]
	 &movdqa(&QWP(0x60,"esp"),"xmm3");
	&paddq	("xmm0",&QWP(0x00,"esp"));

	&movdqa	("xmm3",&QWP(0x30,"ebx"));
	&pmuludq("xmm2","xmm7");		# a[2]*b[7]
	 &movq	("xmm4","xmm0");		# clear upper 64 bits
	 &pslldq("xmm4",6);
	&paddq	("xmm1",&QWP(0x10,"esp"));
	 &paddq	("xmm4","xmm0");
	 &movdqa("xmm5","xmm4");
	 &psrldq("xmm4",10);			# upper 33 bits of a[0]*b[7]+t[0]

	&movdqa	("xmm0",&QWP(0x40,"ebx"));
	&pmuludq("xmm3","xmm7");		# a[3]*b[7]
	 &paddq	("xmm1","xmm4");		# a[1]*b[7]+hw(a[0]*b[7]), carry
	&paddq	("xmm2",&QWP(0x20,"esp"));
	&movdqa	(&QWP(0x00,"esp"),"xmm1");

	&movdqa	("xmm1",&QWP(0x50,"ebx"));
	&pmuludq("xmm0","xmm7");		# a[4]*b[7]
	&paddq	("xmm3",&QWP(0x30,"esp"));
	&movdqa	(&QWP(0x10,"esp"),"xmm2");
	 &pand	("xmm5","xmm6");		# lower 32 bits of a[0]*b[7]

	&movdqa	("xmm2",&QWP(0x60,"ebx"));
	&pmuludq("xmm1","xmm7");		# a[5]*b[7]
	 &paddq	("xmm3","xmm5");		# reduction step
	&paddq	("xmm0",&QWP(0x40,"esp"));
	&movdqa	(&QWP(0x20,"esp"),"xmm3");
	 &pshufd("xmm4","xmm5",0b10110001);	# xmm4 = xmm5<<32, reduction step

	&movdqa	("xmm3",&QWP(0x70,"ebx"));
	&pmuludq("xmm2","xmm7");		# a[6]*b[7]
	&paddq	("xmm1",&QWP(0x50,"esp"));
	 &psubq	("xmm4","xmm5");		# xmm4 = xmm5*0xffffffff, reduction step
	&movdqa	(&QWP(0x30,"esp"),"xmm0");

	&pmuludq("xmm3","xmm7");		# a[7]*b[7]
	&pcmpeqd("xmm7","xmm7");
	&movdqa	("xmm0",&QWP(0x00,"esp"));
	&pslldq	("xmm7",8);
	&movdqa	(&QWP(0x40,"esp"),"xmm1");
	&paddq	("xmm2",&QWP(0x60,"esp"));

	 &paddq	("xmm2","xmm5");		# a[6]*b[7]+lw(a[0]*b[7]), reduction step
	 &paddq	("xmm3","xmm4");		# a[7]*b[7]+lw(a[0]*b[7])*0xffffffff, reduction step
	 &movdqa(&QWP(0x50,"esp"),"xmm2");
	 &movdqa(&QWP(0x60,"esp"),"xmm3");

	&movdqa	("xmm1",&QWP(0x10,"esp"));
	&movdqa	("xmm2",&QWP(0x20,"esp"));
	&movdqa	("xmm3",&QWP(0x30,"esp"));

	&movq	("xmm4","xmm0");		# "flatten"
	&pand	("xmm0","xmm7");
	&xor	("ebp","ebp");
	&pslldq	("xmm4",6);
	 &movq	("xmm5","xmm1");
	&paddq	("xmm0","xmm4");
	 &pand	("xmm1","xmm7");
	&psrldq	("xmm0",6);
	&movd	("eax","xmm0");
	&psrldq	("xmm0",4);

	&paddq	("xmm5","xmm0");
	&movdqa	("xmm0",&QWP(0x40,"esp"));
	&sub	("eax",-1);			# start subtracting modulus,
						# this is used to determine
						# if result is larger/smaller
						# than modulus (see below)
	&pslldq	("xmm5",6);
	 &movq	("xmm4","xmm2");
	&paddq	("xmm1","xmm5");
	 &pand	("xmm2","xmm7");
	&psrldq	("xmm1",6);
	&mov	(&DWP(4*0,"edi"),"eax");
	&movd	("eax","xmm1");
	&psrldq	("xmm1",4);

	&paddq	("xmm4","xmm1");
	&movdqa	("xmm1",&QWP(0x50,"esp"));
	&sbb	("eax",-1);
	&pslldq	("xmm4",6);
	 &movq	("xmm5","xmm3");
	&paddq	("xmm2","xmm4");
	 &pand	("xmm3","xmm7");
	&psrldq	("xmm2",6);
	&mov	(&DWP(4*1,"edi"),"eax");
	&movd	("eax","xmm2");
	&psrldq	("xmm2",4);

	&paddq	("xmm5","xmm2");
	&movdqa	("xmm2",&QWP(0x60,"esp"));
	&sbb	("eax",-1);
	&pslldq	("xmm5",6);
	 &movq	("xmm4","xmm0");
	&paddq	("xmm3","xmm5");
	 &pand	("xmm0","xmm7");
	&psrldq	("xmm3",6);
	&mov	(&DWP(4*2,"edi"),"eax");
	&movd	("eax","xmm3");
	&psrldq	("xmm3",4);

	&paddq	("xmm4","xmm3");
	&sbb	("eax",0);
	&pslldq	("xmm4",6);
	 &movq	("xmm5","xmm1");
	&paddq	("xmm0","xmm4");
	 &pand	("xmm1","xmm7");
	&psrldq	("xmm0",6);
	&mov	(&DWP(4*3,"edi"),"eax");
	&movd	("eax","xmm0");
	&psrldq	("xmm0",4);

	&paddq	("xmm5","xmm0");
	&sbb	("eax",0);
	&pslldq	("xmm5",6);
	 &movq	("xmm4","xmm2");
	&paddq	("xmm1","xmm5");
	 &pand	("xmm2","xmm7");
	&psrldq	("xmm1",6);
	&movd	("ebx","xmm1");
	&psrldq	("xmm1",4);
	&mov	("esp","edx");			# restore caller's %esp

	&paddq	("xmm4","xmm1");
	&pslldq	("xmm4",6);
	&paddq	("xmm2","xmm4");
	&psrldq	("xmm2",6);
	&movd	("ecx","xmm2");
	&psrldq	("xmm2",4);
	&sbb	("ebx",0);
	&movd	("edx","xmm2");
	&pextrw	("esi","xmm2",2);		# top-most overflow bit
	&sbb	("ecx",1);
	&sbb	("edx",-1);
	&sbb	("esi",0);			# borrow from subtraction

	# Final step is "if result >= mod, subtract mod", and at this
	# point we have result - mod written to output buffer, as well
	# as borrow bit from this subtraction, and if borrow bit is
	# set, we add modulus back.
	#
	# Note that because mod has special form, i.e. consists of
	# 0xffffffff, 1 and 0s, we can conditionally synthesize it by
	# assigning borrow bit to one register, %ebp, and its negative
	# to another, %esi. But we started by calculating %esi...
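	#
	# Concretely: %esi below is 0xffffffff if the subtraction
	# borrowed and 0 otherwise, %ebp = 0-%esi, and the addend
	#	{ %esi, %esi, %esi, 0, 0, 0, %ebp, %esi }
	# is once more the modulus or zero.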

	&sub	("ebp","esi");
	&add	(&DWP(4*0,"edi"),"esi");	# add modulus or zero
	&adc	(&DWP(4*1,"edi"),"esi");
	&adc	(&DWP(4*2,"edi"),"esi");
	&adc	(&DWP(4*3,"edi"),0);
	&adc	("eax",0);
	&adc	("ebx",0);
	&mov	(&DWP(4*4,"edi"),"eax");
	&adc	("ecx","ebp");
	&mov	(&DWP(4*5,"edi"),"ebx");
	&adc	("edx","esi");
	&mov	(&DWP(4*6,"edi"),"ecx");
	&mov	(&DWP(4*7,"edi"),"edx");

	&ret	();

}	# Non-SSE2 code removed.

&function_end_B("_ecp_nistz256_mul_mont");

########################################################################
# The following subroutines are "literal" implementations of those
# found in ecp_nistz256.c
#
########################################################################
# void GFp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
#
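# For reference, a sketch of the doubling formulas computed below
# (Jacobian coordinates, a = -3):
#
#	S  = 4*X1*Y1^2
#	M  = 3*(X1 + Z1^2)*(X1 - Z1^2)
#	X3 = M^2 - 2*S
#	Y3 = M*(S - X3) - 8*Y1^4
#	Z3 = 2*Y1*Z1
#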
&static_label("point_double_shortcut");
&function_begin("GFp_nistz256_point_double");
{   my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));

	&mov	("esi",&wparam(1));

	# above map() describes stack layout with 5 temporary
	# 256-bit vectors on top, then we take extra word for
	# GFp_ia32cap_P copy.
	&stack_push(8*5+1);
						if ($sse2) {
	&call	("_picup_eax");
    &set_label("pic");
	&picmeup("edx","GFp_ia32cap_P","eax",&label("pic"));
	&mov	("ebp",&DWP(0,"edx"));		}

&set_label("point_double_shortcut");
	&mov	("eax",&DWP(0,"esi"));		# copy in_x
	&mov	("ebx",&DWP(4,"esi"));
	&mov	("ecx",&DWP(8,"esi"));
	&mov	("edx",&DWP(12,"esi"));
	&mov	(&DWP($in_x+0,"esp"),"eax");
	&mov	(&DWP($in_x+4,"esp"),"ebx");
	&mov	(&DWP($in_x+8,"esp"),"ecx");
	&mov	(&DWP($in_x+12,"esp"),"edx");
	&mov	("eax",&DWP(16,"esi"));
	&mov	("ebx",&DWP(20,"esi"));
	&mov	("ecx",&DWP(24,"esi"));
	&mov	("edx",&DWP(28,"esi"));
	&mov	(&DWP($in_x+16,"esp"),"eax");
	&mov	(&DWP($in_x+20,"esp"),"ebx");
	&mov	(&DWP($in_x+24,"esp"),"ecx");
	&mov	(&DWP($in_x+28,"esp"),"edx");
	&mov	(&DWP(32*5,"esp"),"ebp");	# GFp_ia32cap_P copy

	&lea	("ebp",&DWP(32,"esi"));
	&lea	("esi",&DWP(32,"esi"));
	&lea	("edi",&DWP($S,"esp"));
	&call	("_ecp_nistz256_add");		# p256_mul_by_2(S, in_y);

	&mov	("eax",&DWP(32*5,"esp"));	# GFp_ia32cap_P copy
	&mov	("esi",64);
	&add	("esi",&wparam(1));
	&lea	("edi",&DWP($Zsqr,"esp"));
	&mov	("ebp","esi");
	&call	("_ecp_nistz256_mul_mont");	# p256_sqr_mont(Zsqr, in_z);

	&mov	("eax",&DWP(32*5,"esp"));	# GFp_ia32cap_P copy
	&lea	("esi",&DWP($S,"esp"));
	&lea	("ebp",&DWP($S,"esp"));
	&lea	("edi",&DWP($S,"esp"));
	&call	("_ecp_nistz256_mul_mont");	# p256_sqr_mont(S, S);

	&mov	("eax",&DWP(32*5,"esp"));	# GFp_ia32cap_P copy
	&mov	("ebp",&wparam(1));
	&lea	("esi",&DWP(32,"ebp"));
	&lea	("ebp",&DWP(64,"ebp"));
	&lea	("edi",&DWP($tmp0,"esp"));
	&call	("_ecp_nistz256_mul_mont");	# p256_mul_mont(tmp0, in_z, in_y);

	&lea	("esi",&DWP($in_x,"esp"));
	&lea	("ebp",&DWP($Zsqr,"esp"));
	&lea	("edi",&DWP($M,"esp"));
	&call	("_ecp_nistz256_add");		# p256_add(M, in_x, Zsqr);

	&mov	("edi",64);
	&lea	("esi",&DWP($tmp0,"esp"));
	&lea	("ebp",&DWP($tmp0,"esp"));
	&add	("edi",&wparam(0));
	&call	("_ecp_nistz256_add");		# p256_mul_by_2(res_z, tmp0);

	&lea	("esi",&DWP($in_x,"esp"));
	&lea	("ebp",&DWP($Zsqr,"esp"));
	&lea	("edi",&DWP($Zsqr,"esp"));
	&call	("_ecp_nistz256_sub");		# p256_sub(Zsqr, in_x, Zsqr);

	&mov	("eax",&DWP(32*5,"esp"));	# GFp_ia32cap_P copy
	&lea	("esi",&DWP($S,"esp"));
	&lea	("ebp",&DWP($S,"esp"));
	&lea	("edi",&DWP($tmp0,"esp"));
	&call	("_ecp_nistz256_mul_mont");	# p256_sqr_mont(tmp0, S);

	&mov	("eax",&DWP(32*5,"esp"));	# GFp_ia32cap_P copy
	&lea	("esi",&DWP($M,"esp"));
	&lea	("ebp",&DWP($Zsqr,"esp"));
	&lea	("edi",&DWP($M,"esp"));
	&call	("_ecp_nistz256_mul_mont");	# p256_mul_mont(M, M, Zsqr);

	&mov	("edi",32);
	&lea	("esi",&DWP($tmp0,"esp"));
	&add	("edi",&wparam(0));
	&call	("_ecp_nistz256_div_by_2");	# p256_div_by_2(res_y, tmp0);

	&lea	("esi",&DWP($M,"esp"));
	&lea	("ebp",&DWP($M,"esp"));
	&lea	("edi",&DWP($tmp0,"esp"));
	&call	("_ecp_nistz256_add");		# 1/2 p256_mul_by_3(M, M);

	&mov	("eax",&DWP(32*5,"esp"));	# GFp_ia32cap_P copy
	&lea	("esi",&DWP($in_x,"esp"));
	&lea	("ebp",&DWP($S,"esp"));
	&lea	("edi",&DWP($S,"esp"));
	&call	("_ecp_nistz256_mul_mont");	# p256_mul_mont(S, S, in_x);

	&lea	("esi",&DWP($tmp0,"esp"));
	&lea	("ebp",&DWP($M,"esp"));
	&lea	("edi",&DWP($M,"esp"));
	&call	("_ecp_nistz256_add");		# 2/2 p256_mul_by_3(M, M);

	&lea	("esi",&DWP($S,"esp"));
	&lea	("ebp",&DWP($S,"esp"));
	&lea	("edi",&DWP($tmp0,"esp"));
	&call	("_ecp_nistz256_add");		# p256_mul_by_2(tmp0, S);

	&mov	("eax",&DWP(32*5,"esp"));	# GFp_ia32cap_P copy
	&lea	("esi",&DWP($M,"esp"));
	&lea	("ebp",&DWP($M,"esp"));
	&mov	("edi",&wparam(0));
	&call	("_ecp_nistz256_mul_mont");	# p256_sqr_mont(res_x, M);

	&mov	("esi","edi");			# %edi is still res_x here
	&lea	("ebp",&DWP($tmp0,"esp"));
	&call	("_ecp_nistz256_sub");		# p256_sub(res_x, res_x, tmp0);

	&lea	("esi",&DWP($S,"esp"));
	&mov	("ebp","edi");			# %edi is still res_x
	&lea	("edi",&DWP($S,"esp"));
	&call	("_ecp_nistz256_sub");		# p256_sub(S, S, res_x);

	&mov	("eax",&DWP(32*5,"esp"));	# GFp_ia32cap_P copy
	&mov	("esi","edi");			# %edi is still &S
	&lea	("ebp",&DWP($M,"esp"));
	&call	("_ecp_nistz256_mul_mont");	# p256_mul_mont(S, S, M);

	&mov	("ebp",32);
	&lea	("esi",&DWP($S,"esp"));
	&add	("ebp",&wparam(0));
	&mov	("edi","ebp");
	&call	("_ecp_nistz256_sub");		# p256_sub(res_y, S, res_y);

	&stack_pop(8*5+1);
} &function_end("GFp_nistz256_point_double");

########################################################################
# void GFp_nistz256_point_add_affine(P256_POINT *out,
#				     const P256_POINT *in1,
#				     const P256_POINT_AFFINE *in2);
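#
# For reference, a sketch of the mixed-addition formulas computed
# below (in2 is affine, i.e. Z2 = 1):
#
#	U2 = X2*Z1^2,	S2 = Y2*Z1^3
#	H  = U2 - X1,	R  = S2 - Y1
#	X3 = R^2 - H^3 - 2*X1*H^2
#	Y3 = R*(X1*H^2 - X3) - Y1*H^3
#	Z3 = H*Z1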
&function_begin("GFp_nistz256_point_add_affine");
{
    my ($res_x,$res_y,$res_z,
	$in1_x,$in1_y,$in1_z,
	$in2_x,$in2_y,
	$U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..14));
    my $Z1sqr = $S2;
    my @ONE_mont=(1,0,0,-1,-1,-1,-2,0);

	&mov	("esi",&wparam(1));

	# above map() describes stack layout with 15 temporary
	# 256-bit vectors on top, then we take extra words for
	# !in1infty, !in2infty, and GFp_ia32cap_P copy.
	&stack_push(8*15+3);
						if ($sse2) {
	&call	("_picup_eax");
    &set_label("pic");
	&picmeup("edx","GFp_ia32cap_P","eax",&label("pic"));
	&mov	("ebp",&DWP(0,"edx"));		}

	&lea	("edi",&DWP($in1_x,"esp"));
    for($i=0;$i<96;$i+=16) {
	&mov	("eax",&DWP($i+0,"esi"));	# copy in1
	&mov	("ebx",&DWP($i+4,"esi"));
	&mov	("ecx",&DWP($i+8,"esi"));
	&mov	("edx",&DWP($i+12,"esi"));
	&mov	(&DWP($i+0,"edi"),"eax");
	&mov	(&DWP(32*15+8,"esp"),"ebp")	if ($i==0);
	&mov	("ebp","eax")			if ($i==64);
	&or	("ebp","eax")			if ($i>64);
	&mov	(&DWP($i+4,"edi"),"ebx");
	&or	("ebp","ebx")			if ($i>=64);
	&mov	(&DWP($i+8,"edi"),"ecx");
	&or	("ebp","ecx")			if ($i>=64);
	&mov	(&DWP($i+12,"edi"),"edx");
	&or	("ebp","edx")			if ($i>=64);
    }
	&xor	("eax","eax");
	&mov	("esi",&wparam(2));
	&sub	("eax","ebp");
	&or	("ebp","eax");
	&sar	("ebp",31);
	&mov	(&DWP(32*15+0,"esp"),"ebp");	# !in1infty
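	# (x | 0-x) has its sign bit set iff x != 0, so the arithmetic
	# shift above turns "in1_z is non-zero" into an all-ones or
	# all-zero mask.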

	&lea	("edi",&DWP($in2_x,"esp"));
    for($i=0;$i<64;$i+=16) {
	&mov	("eax",&DWP($i+0,"esi"));	# copy in2
	&mov	("ebx",&DWP($i+4,"esi"));
	&mov	("ecx",&DWP($i+8,"esi"));
	&mov	("edx",&DWP($i+12,"esi"));
	&mov	(&DWP($i+0,"edi"),"eax");
	&mov	("ebp","eax")			if ($i==0);
	&or	("ebp","eax")			if ($i!=0);
	&mov	(&DWP($i+4,"edi"),"ebx");
	&or	("ebp","ebx");
	&mov	(&DWP($i+8,"edi"),"ecx");
	&or	("ebp","ecx");
	&mov	(&DWP($i+12,"edi"),"edx");
	&or	("ebp","edx");
    }
	&xor	("ebx","ebx");
	&mov	("eax",&DWP(32*15+8,"esp"));	# GFp_ia32cap_P copy
	&sub	("ebx","ebp");
	 &lea	("esi",&DWP($in1_z,"esp"));
	&or	("ebx","ebp");
	 &lea	("ebp",&DWP($in1_z,"esp"));
	&sar	("ebx",31);
	 &lea	("edi",&DWP($Z1sqr,"esp"));
	&mov	(&DWP(32*15+4,"esp"),"ebx");	# !in2infty

	&call	("_ecp_nistz256_mul_mont");	# p256_sqr_mont(Z1sqr, in1_z);

	&mov	("eax",&DWP(32*15+8,"esp"));	# GFp_ia32cap_P copy
	&lea	("esi",&DWP($in2_x,"esp"));
	&mov	("ebp","edi");			# %edi is still &Z1sqr
	&lea	("edi",&DWP($U2,"esp"));
	&call	("_ecp_nistz256_mul_mont");	# p256_mul_mont(U2, Z1sqr, in2_x);

	&mov	("eax",&DWP(32*15+8,"esp"));	# GFp_ia32cap_P copy
	&lea	("esi",&DWP($in1_z,"esp"));
	&lea	("ebp",&DWP($Z1sqr,"esp"));
	&lea	("edi",&DWP($S2,"esp"));
	&call	("_ecp_nistz256_mul_mont");	# p256_mul_mont(S2, Z1sqr, in1_z);

	&lea	("esi",&DWP($U2,"esp"));
	&lea	("ebp",&DWP($in1_x,"esp"));
	&lea	("edi",&DWP($H,"esp"));
	&call	("_ecp_nistz256_sub");		# p256_sub(H, U2, in1_x);

	&mov	("eax",&DWP(32*15+8,"esp"));	# GFp_ia32cap_P copy
	&lea	("esi",&DWP($in2_y,"esp"));
	&lea	("ebp",&DWP($S2,"esp"));
	&lea	("edi",&DWP($S2,"esp"));
	&call	("_ecp_nistz256_mul_mont");	# p256_mul_mont(S2, S2, in2_y);

	&mov	("eax",&DWP(32*15+8,"esp"));	# GFp_ia32cap_P copy
	&lea	("esi",&DWP($in1_z,"esp"));
	&lea	("ebp",&DWP($H,"esp"));
	&lea	("edi",&DWP($res_z,"esp"));
	&call	("_ecp_nistz256_mul_mont");	# p256_mul_mont(res_z, H, in1_z);

	&lea	("esi",&DWP($S2,"esp"));
	&lea	("ebp",&DWP($in1_y,"esp"));
	&lea	("edi",&DWP($R,"esp"));
	&call	("_ecp_nistz256_sub");		# p256_sub(R, S2, in1_y);

	&mov	("eax",&DWP(32*15+8,"esp"));	# GFp_ia32cap_P copy
	&lea	("esi",&DWP($H,"esp"));
	&lea	("ebp",&DWP($H,"esp"));
	&lea	("edi",&DWP($Hsqr,"esp"));
	&call	("_ecp_nistz256_mul_mont");	# p256_sqr_mont(Hsqr, H);

	&mov	("eax",&DWP(32*15+8,"esp"));	# GFp_ia32cap_P copy
	&lea	("esi",&DWP($R,"esp"));
	&lea	("ebp",&DWP($R,"esp"));
	&lea	("edi",&DWP($Rsqr,"esp"));
	&call	("_ecp_nistz256_mul_mont");	# p256_sqr_mont(Rsqr, R);

	&mov	("eax",&DWP(32*15+8,"esp"));	# GFp_ia32cap_P copy
	&lea	("esi",&DWP($in1_x,"esp"));
	&lea	("ebp",&DWP($Hsqr,"esp"));
	&lea	("edi",&DWP($U2,"esp"));
	&call	("_ecp_nistz256_mul_mont");	# p256_mul_mont(U2, in1_x, Hsqr);

	&mov	("eax",&DWP(32*15+8,"esp"));	# GFp_ia32cap_P copy
	&lea	("esi",&DWP($H,"esp"));
	&lea	("ebp",&DWP($Hsqr,"esp"));
	&lea	("edi",&DWP($Hcub,"esp"));
	&call	("_ecp_nistz256_mul_mont");	# p256_mul_mont(Hcub, Hsqr, H);

	&lea	("esi",&DWP($U2,"esp"));
	&lea	("ebp",&DWP($U2,"esp"));
	&lea	("edi",&DWP($Hsqr,"esp"));
	&call	("_ecp_nistz256_add");		# p256_mul_by_2(Hsqr, U2);

	&lea	("esi",&DWP($Rsqr,"esp"));
	&lea	("ebp",&DWP($Hsqr,"esp"));
	&lea	("edi",&DWP($res_x,"esp"));
	&call	("_ecp_nistz256_sub");		# p256_sub(res_x, Rsqr, Hsqr);

	&lea	("esi",&DWP($res_x,"esp"));
	&lea	("ebp",&DWP($Hcub,"esp"));
	&lea	("edi",&DWP($res_x,"esp"));
	&call	("_ecp_nistz256_sub");		# p256_sub(res_x, res_x, Hcub);

	&lea	("esi",&DWP($U2,"esp"));
	&lea	("ebp",&DWP($res_x,"esp"));
	&lea	("edi",&DWP($res_y,"esp"));
	&call	("_ecp_nistz256_sub");		# p256_sub(res_y, U2, res_x);

	&mov	("eax",&DWP(32*15+8,"esp"));	# GFp_ia32cap_P copy
	&lea	("esi",&DWP($Hcub,"esp"));
	&lea	("ebp",&DWP($in1_y,"esp"));
	&lea	("edi",&DWP($S2,"esp"));
	&call	("_ecp_nistz256_mul_mont");	# p256_mul_mont(S2, Hcub, in1_y);

	&mov	("eax",&DWP(32*15+8,"esp"));	# GFp_ia32cap_P copy
	&lea	("esi",&DWP($R,"esp"));
	&lea	("ebp",&DWP($res_y,"esp"));
	&lea	("edi",&DWP($res_y,"esp"));
	&call	("_ecp_nistz256_mul_mont");	# p256_mul_mont(res_y, res_y, R);

	&lea	("esi",&DWP($res_y,"esp"));
	&lea	("ebp",&DWP($S2,"esp"));
	&lea	("edi",&DWP($res_y,"esp"));
	&call	("_ecp_nistz256_sub");		# p256_sub(res_y, res_y, S2);

	&mov	("ebp",&DWP(32*15+0,"esp"));	# !in1infty
	&mov	("esi",&DWP(32*15+4,"esp"));	# !in2infty
	&mov	("edi",&wparam(0));
	&mov	("edx","ebp");
	&not	("ebp");
	&and	("edx","esi");
	&and	("ebp","esi");
	&not	("esi");

	########################################
	# conditional moves
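	# With M1 = !in1infty and M2 = !in2infty (all-ones/all-zero
	# masks), %edx = M1&M2 selects the computed sum, %ebp = ~M1&M2
	# selects in2 (whose Z coordinate is ONE_mont), and %esi = ~M2
	# selects in1.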
    for($i=64;$i<96;$i+=4) {
	my $one=@ONE_mont[($i-64)/4];

	&mov	("eax","edx");
	&and	("eax",&DWP($res_x+$i,"esp"));
	&mov	("ebx","ebp")			if ($one && $one!=-1);
	&and	("ebx",$one)			if ($one && $one!=-1);
	&mov	("ecx","esi");
	&and	("ecx",&DWP($in1_x+$i,"esp"));
	&or	("eax",$one==-1?"ebp":"ebx")	if ($one);
	&or	("eax","ecx");
	&mov	(&DWP($i,"edi"),"eax");
    }
    for($i=0;$i<64;$i+=4) {
	&mov	("eax","edx");
	&and	("eax",&DWP($res_x+$i,"esp"));
	&mov	("ebx","ebp");
	&and	("ebx",&DWP($in2_x+$i,"esp"));
	&mov	("ecx","esi");
	&and	("ecx",&DWP($in1_x+$i,"esp"));
	&or	("eax","ebx");
	&or	("eax","ecx");
	&mov	(&DWP($i,"edi"),"eax");
    }
	&stack_pop(8*15+3);
} &function_end("GFp_nistz256_point_add_affine");

&asm_finish();

close STDOUT or die "error closing STDOUT";