// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#include <GFp/arm_arch.h>

.text
.align	5
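// Constants, each stored as four little-endian 64-bit limbs:
// Lpoly is the P-256 prime p = 2^256 - 2^224 + 2^192 + 2^96 - 1,
// Lone_mont is 1 in Montgomery form (2^256 mod p), and Lone is the
// plain integer 1.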
Lpoly:
.quad	0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001
Lone_mont:
.quad	0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe
Lone:
.quad	1,0,0,0
.byte	69,67,80,95,78,73,83,84,90,50,53,54,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2

// void	GFp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
//					     const BN_ULONG x2[4]);
.globl	_GFp_nistz256_mul_mont
.private_extern	_GFp_nistz256_mul_mont

.align	4
_GFp_nistz256_mul_mont:
	stp	x29,x30,[sp,#-32]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]

	ldr	x3,[x2]		// bp[0]
	ldp	x4,x5,[x1]
	ldp	x6,x7,[x1,#16]
	ldr	x12,Lpoly+8
	ldr	x13,Lpoly+24

	bl	__ecp_nistz256_mul_mont

	ldp	x19,x20,[sp,#16]
	ldp	x29,x30,[sp],#32
	ret


// void	GFp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	_GFp_nistz256_sqr_mont
.private_extern	_GFp_nistz256_sqr_mont

.align	4
_GFp_nistz256_sqr_mont:
	stp	x29,x30,[sp,#-32]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]

	ldp	x4,x5,[x1]
	ldp	x6,x7,[x1,#16]
	ldr	x12,Lpoly+8
	ldr	x13,Lpoly+24

	bl	__ecp_nistz256_sqr_mont

	ldp	x19,x20,[sp,#16]
	ldp	x29,x30,[sp],#32
	ret


// void	GFp_nistz256_add(BN_ULONG x0[4],const BN_ULONG x1[4],
//					const BN_ULONG x2[4]);
.globl	_GFp_nistz256_add
.private_extern	_GFp_nistz256_add

.align	4
_GFp_nistz256_add:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ldp	x14,x15,[x1]
	ldp	x8,x9,[x2]
	ldp	x16,x17,[x1,#16]
	ldp	x10,x11,[x2,#16]
	ldr	x12,Lpoly+8
	ldr	x13,Lpoly+24

	bl	__ecp_nistz256_add

	ldp	x29,x30,[sp],#16
	ret


// void	GFp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	_GFp_nistz256_neg
.private_extern	_GFp_nistz256_neg

.align	4
_GFp_nistz256_neg:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	mov	x2,x1
	mov	x14,xzr		// a = 0
	mov	x15,xzr
	mov	x16,xzr
	mov	x17,xzr
	ldr	x12,Lpoly+8
	ldr	x13,Lpoly+24

	bl	__ecp_nistz256_sub_from

	ldp	x29,x30,[sp],#16
	ret


// note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded
// to x4-x7, and b[0] in x3
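// The Montgomery reduction below relies on p[0] = 2^64-1, so that
// -p^-1 mod 2^64 = 1 and the per-limb reduction factor is simply
// acc[0].  Adding acc[0]*p clears the low limb; since acc[0]*(p+1) =
// acc[0]*2^96 + acc[0]*0xffffffff00000001*2^192, the code folds in
// acc[0]<<96, adds the 128-bit product acc[0]*0xffffffff00000001 to
// the top limbs, and drops the now-zero acc[0].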

.align	4
__ecp_nistz256_mul_mont:
	mul	x14,x4,x3		// a[0]*b[0]
	umulh	x8,x4,x3

	mul	x15,x5,x3		// a[1]*b[0]
	umulh	x9,x5,x3

	mul	x16,x6,x3		// a[2]*b[0]
	umulh	x10,x6,x3

	mul	x17,x7,x3		// a[3]*b[0]
	umulh	x11,x7,x3
	ldr	x3,[x2,#8]		// b[1]

	adds	x15,x15,x8		// accumulate high parts of multiplication
	lsl	x8,x14,#32
	adcs	x16,x16,x9
	lsr	x9,x14,#32
	adcs	x17,x17,x10
	adc	x19,xzr,x11
	mov	x20,xzr
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	mul	x8,x4,x3		// lo(a[0]*b[i])
	adcs	x15,x16,x9
	mul	x9,x5,x3		// lo(a[1]*b[i])
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	mul	x10,x6,x3		// lo(a[2]*b[i])
	adcs	x17,x19,x11
	mul	x11,x7,x3		// lo(a[3]*b[i])
	adc	x19,x20,xzr

	adds	x14,x14,x8		// accumulate low parts of multiplication
	umulh	x8,x4,x3		// hi(a[0]*b[i])
	adcs	x15,x15,x9
	umulh	x9,x5,x3		// hi(a[1]*b[i])
	adcs	x16,x16,x10
	umulh	x10,x6,x3		// hi(a[2]*b[i])
	adcs	x17,x17,x11
	umulh	x11,x7,x3		// hi(a[3]*b[i])
	adc	x19,x19,xzr
	ldr	x3,[x2,#8*(1+1)]	// b[1+1]
	adds	x15,x15,x8		// accumulate high parts of multiplication
	lsl	x8,x14,#32
	adcs	x16,x16,x9
	lsr	x9,x14,#32
	adcs	x17,x17,x10
	adcs	x19,x19,x11
	adc	x20,xzr,xzr
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	mul	x8,x4,x3		// lo(a[0]*b[i])
	adcs	x15,x16,x9
	mul	x9,x5,x3		// lo(a[1]*b[i])
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	mul	x10,x6,x3		// lo(a[2]*b[i])
	adcs	x17,x19,x11
	mul	x11,x7,x3		// lo(a[3]*b[i])
	adc	x19,x20,xzr

	adds	x14,x14,x8		// accumulate low parts of multiplication
	umulh	x8,x4,x3		// hi(a[0]*b[i])
	adcs	x15,x15,x9
	umulh	x9,x5,x3		// hi(a[1]*b[i])
	adcs	x16,x16,x10
	umulh	x10,x6,x3		// hi(a[2]*b[i])
	adcs	x17,x17,x11
	umulh	x11,x7,x3		// hi(a[3]*b[i])
	adc	x19,x19,xzr
	ldr	x3,[x2,#8*(2+1)]	// b[2+1]
	adds	x15,x15,x8		// accumulate high parts of multiplication
	lsl	x8,x14,#32
	adcs	x16,x16,x9
	lsr	x9,x14,#32
	adcs	x17,x17,x10
	adcs	x19,x19,x11
	adc	x20,xzr,xzr
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	mul	x8,x4,x3		// lo(a[0]*b[i])
	adcs	x15,x16,x9
	mul	x9,x5,x3		// lo(a[1]*b[i])
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	mul	x10,x6,x3		// lo(a[2]*b[i])
	adcs	x17,x19,x11
	mul	x11,x7,x3		// lo(a[3]*b[i])
	adc	x19,x20,xzr

	adds	x14,x14,x8		// accumulate low parts of multiplication
	umulh	x8,x4,x3		// hi(a[0]*b[i])
	adcs	x15,x15,x9
	umulh	x9,x5,x3		// hi(a[1]*b[i])
	adcs	x16,x16,x10
	umulh	x10,x6,x3		// hi(a[2]*b[i])
	adcs	x17,x17,x11
	umulh	x11,x7,x3		// hi(a[3]*b[i])
	adc	x19,x19,xzr
	adds	x15,x15,x8		// accumulate high parts of multiplication
	lsl	x8,x14,#32
	adcs	x16,x16,x9
	lsr	x9,x14,#32
	adcs	x17,x17,x10
	adcs	x19,x19,x11
	adc	x20,xzr,xzr
	// last reduction
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	adcs	x15,x16,x9
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	adcs	x17,x19,x11
	adc	x19,x20,xzr

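	// The reduced value is < 2*p, so one conditional subtraction of p
	// suffices.  Subtracting p[0] = 2^64-1 is expressed as "adds #1",
	// which yields the same result and the same carry, and the csel
	// chain keeps the unsubtracted value only if the trial subtraction
	// borrowed.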
	adds	x8,x14,#1		// subs	x8,x14,#-1 // tmp = ret-modulus
	sbcs	x9,x15,x12
	sbcs	x10,x16,xzr
	sbcs	x11,x17,x13
	sbcs	xzr,x19,xzr		// did it borrow?

	csel	x14,x14,x8,lo	// ret = borrow ? ret : ret-modulus
	csel	x15,x15,x9,lo
	csel	x16,x16,x10,lo
	stp	x14,x15,[x0]
	csel	x17,x17,x11,lo
	stp	x16,x17,[x0,#16]

	ret


// note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded
// to x4-x7
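// Unlike the multiplication above, the squaring computes the full
// 512-bit square first (doubling the cross products, then adding the
// a[i]*a[i] terms), performs the four Montgomery reduction steps back
// to back on the low half, folds in the upper half, and finishes with
// the final conditional subtraction.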

.align	4
__ecp_nistz256_sqr_mont:
	//  |  |  |  |  |  |a1*a0|  |
	//  |  |  |  |  |a2*a0|  |  |
	//  |  |a3*a2|a3*a0|  |  |  |
	//  |  |  |  |a2*a1|  |  |  |
	//  |  |  |a3*a1|  |  |  |  |
	// *|  |  |  |  |  |  |  | 2|
	// +|a3*a3|a2*a2|a1*a1|a0*a0|
	//  |--+--+--+--+--+--+--+--|
	//  |A7|A6|A5|A4|A3|A2|A1|A0|, where each Ax is a 64-bit word
	//
	//  "can't overflow" below marks a carry into the high part of a
	//  multiplication result, which can't overflow because the high
	//  part can never be all ones.

	mul	x15,x5,x4		// a[1]*a[0]
	umulh	x9,x5,x4
	mul	x16,x6,x4		// a[2]*a[0]
	umulh	x10,x6,x4
	mul	x17,x7,x4		// a[3]*a[0]
	umulh	x19,x7,x4

	adds	x16,x16,x9		// accumulate high parts of multiplication
	mul	x8,x6,x5		// a[2]*a[1]
	umulh	x9,x6,x5
	adcs	x17,x17,x10
	mul	x10,x7,x5		// a[3]*a[1]
	umulh	x11,x7,x5
	adc	x19,x19,xzr		// can't overflow

	mul	x20,x7,x6		// a[3]*a[2]
	umulh	x1,x7,x6

	adds	x9,x9,x10		// accumulate high parts of multiplication
	mul	x14,x4,x4		// a[0]*a[0]
	adc	x10,x11,xzr		// can't overflow

	adds	x17,x17,x8		// accumulate low parts of multiplication
	umulh	x4,x4,x4
	adcs	x19,x19,x9
	mul	x9,x5,x5		// a[1]*a[1]
	adcs	x20,x20,x10
	umulh	x5,x5,x5
	adc	x1,x1,xzr		// can't overflow

	adds	x15,x15,x15	// acc[1-6]*=2
	mul	x10,x6,x6		// a[2]*a[2]
	adcs	x16,x16,x16
	umulh	x6,x6,x6
	adcs	x17,x17,x17
	mul	x11,x7,x7		// a[3]*a[3]
	adcs	x19,x19,x19
	umulh	x7,x7,x7
	adcs	x20,x20,x20
	adcs	x1,x1,x1
	adc	x2,xzr,xzr

	adds	x15,x15,x4		// +a[i]*a[i]
	adcs	x16,x16,x9
	adcs	x17,x17,x5
	adcs	x19,x19,x10
	adcs	x20,x20,x6
	lsl	x8,x14,#32
	adcs	x1,x1,x11
	lsr	x9,x14,#32
	adc	x2,x2,x7
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	adcs	x15,x16,x9
	lsl	x8,x14,#32
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	lsr	x9,x14,#32
	adc	x17,x11,xzr		// can't overflow
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	adcs	x15,x16,x9
	lsl	x8,x14,#32
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	lsr	x9,x14,#32
	adc	x17,x11,xzr		// can't overflow
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	adcs	x15,x16,x9
	lsl	x8,x14,#32
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	lsr	x9,x14,#32
	adc	x17,x11,xzr		// can't overflow
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	adcs	x15,x16,x9
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	adc	x17,x11,xzr		// can't overflow

	adds	x14,x14,x19	// accumulate upper half
	adcs	x15,x15,x20
	adcs	x16,x16,x1
	adcs	x17,x17,x2
	adc	x19,xzr,xzr

	adds	x8,x14,#1		// subs	x8,x14,#-1 // tmp = ret-modulus
	sbcs	x9,x15,x12
	sbcs	x10,x16,xzr
	sbcs	x11,x17,x13
	sbcs	xzr,x19,xzr		// did it borrow?

	csel	x14,x14,x8,lo	// ret = borrow ? ret : ret-modulus
	csel	x15,x15,x9,lo
	csel	x16,x16,x10,lo
	stp	x14,x15,[x0]
	csel	x17,x17,x11,lo
	stp	x16,x17,[x0,#16]

	ret


// Note that __ecp_nistz256_add expects both input vectors pre-loaded,
// to x14-x17 and x8-x11, because it is used in multiple contexts,
// e.g. in multiplication by 2 and 3.
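// The sum can exceed 2^256, so the carry is kept in x1 and taken into
// account by the trial subtraction of the modulus below.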

.align	4
__ecp_nistz256_add:
	adds	x14,x14,x8		// ret = a+b
	adcs	x15,x15,x9
	adcs	x16,x16,x10
	adcs	x17,x17,x11
	adc	x1,xzr,xzr		// zap x1

	adds	x8,x14,#1		// subs	x8,x14,#-1 // tmp = ret-modulus
	sbcs	x9,x15,x12
	sbcs	x10,x16,xzr
	sbcs	x11,x17,x13
	sbcs	xzr,x1,xzr		// did subtraction borrow?

	csel	x14,x14,x8,lo	// ret = borrow ? ret : ret-modulus
	csel	x15,x15,x9,lo
	csel	x16,x16,x10,lo
	stp	x14,x15,[x0]
	csel	x17,x17,x11,lo
	stp	x16,x17,[x0,#16]

	ret



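// __ecp_nistz256_sub_from computes ret = a-b mod p, with a pre-loaded
// in x14-x17 and the pointer to b in x2; if the subtraction borrows,
// the modulus is added back.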
.align	4
__ecp_nistz256_sub_from:
	ldp	x8,x9,[x2]
	ldp	x10,x11,[x2,#16]
	subs	x14,x14,x8		// ret = a-b
	sbcs	x15,x15,x9
	sbcs	x16,x16,x10
	sbcs	x17,x17,x11
	sbc	x1,xzr,xzr		// zap x1

	subs	x8,x14,#1		// adds	x8,x14,#-1 // tmp = ret+modulus
	adcs	x9,x15,x12
	adcs	x10,x16,xzr
	adc	x11,x17,x13
	cmp	x1,xzr			// did subtraction borrow?

	csel	x14,x14,x8,eq	// ret = borrow ? ret+modulus : ret
	csel	x15,x15,x9,eq
	csel	x16,x16,x10,eq
	stp	x14,x15,[x0]
	csel	x17,x17,x11,eq
	stp	x16,x17,[x0,#16]

	ret



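// __ecp_nistz256_sub_morf is the operand-swapped variant: ret = b-a
// mod p, with a pre-loaded in x14-x17 and the pointer to b in x2.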
.align	4
__ecp_nistz256_sub_morf:
	ldp	x8,x9,[x2]
	ldp	x10,x11,[x2,#16]
	subs	x14,x8,x14		// ret = b-a
	sbcs	x15,x9,x15
	sbcs	x16,x10,x16
	sbcs	x17,x11,x17
	sbc	x1,xzr,xzr		// zap x1

	subs	x8,x14,#1		// adds	x8,x14,#-1 // tmp = ret+modulus
	adcs	x9,x15,x12
	adcs	x10,x16,xzr
	adc	x11,x17,x13
	cmp	x1,xzr			// did subtraction borrow?

	csel	x14,x14,x8,eq	// ret = borrow ? ret+modulus : ret
	csel	x15,x15,x9,eq
	csel	x16,x16,x10,eq
	stp	x14,x15,[x0]
	csel	x17,x17,x11,eq
	stp	x16,x17,[x0,#16]

	ret



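// __ecp_nistz256_div_by_2 halves a mod p: if a is odd, p (which is
// odd) is added first so the sum is even, then the full 257-bit value
// is shifted right by one bit.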
.align	4
__ecp_nistz256_div_by_2:
	subs	x8,x14,#1		// adds	x8,x14,#-1 // tmp = a+modulus
	adcs	x9,x15,x12
	adcs	x10,x16,xzr
	adcs	x11,x17,x13
	adc	x1,xzr,xzr		// zap x1
	tst	x14,#1		// is a even?

	csel	x14,x14,x8,eq	// ret = even ? a : a+modulus
	csel	x15,x15,x9,eq
	csel	x16,x16,x10,eq
	csel	x17,x17,x11,eq
	csel	x1,xzr,x1,eq

	lsr	x14,x14,#1		// ret >>= 1
	orr	x14,x14,x15,lsl#63
	lsr	x15,x15,#1
	orr	x15,x15,x16,lsl#63
	lsr	x16,x16,#1
	orr	x16,x16,x17,lsl#63
	lsr	x17,x17,#1
	stp	x14,x15,[x0]
	orr	x17,x17,x1,lsl#63
	stp	x16,x17,[x0,#16]

	ret

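// GFp_nistz256_point_double doubles a point in Jacobian (x,y,z) form,
// each coordinate being four 64-bit limbs in Montgomery form.  x21
// holds the result pointer, x22 the input pointer, and the 128-byte
// stack frame holds the temporaries S (sp+0), M (sp+32), Zsqr (sp+64)
// and tmp0 (sp+96).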
.globl	_GFp_nistz256_point_double
.private_extern	_GFp_nistz256_point_double

.align	5
_GFp_nistz256_point_double:
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	sub	sp,sp,#32*4

Ldouble_shortcut:
	ldp	x14,x15,[x1,#32]
	mov	x21,x0
	ldp	x16,x17,[x1,#48]
	mov	x22,x1
	ldr	x12,Lpoly+8
	mov	x8,x14
	ldr	x13,Lpoly+24
	mov	x9,x15
	ldp	x4,x5,[x22,#64]	// forward load for p256_sqr_mont
	mov	x10,x16
	mov	x11,x17
	ldp	x6,x7,[x22,#64+16]
	add	x0,sp,#0
	bl	__ecp_nistz256_add	// p256_mul_by_2(S, in_y);

	add	x0,sp,#64
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Zsqr, in_z);

	ldp	x8,x9,[x22]
	ldp	x10,x11,[x22,#16]
	mov	x4,x14		// put Zsqr aside for p256_sub
	mov	x5,x15
	mov	x6,x16
	mov	x7,x17
	add	x0,sp,#32
	bl	__ecp_nistz256_add	// p256_add(M, Zsqr, in_x);

	add	x2,x22,#0
	mov	x14,x4		// restore Zsqr
	mov	x15,x5
	ldp	x4,x5,[sp,#0]	// forward load for p256_sqr_mont
	mov	x16,x6
	mov	x17,x7
	ldp	x6,x7,[sp,#0+16]
	add	x0,sp,#64
	bl	__ecp_nistz256_sub_morf	// p256_sub(Zsqr, in_x, Zsqr);

	add	x0,sp,#0
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(S, S);

	ldr	x3,[x22,#32]
	ldp	x4,x5,[x22,#64]
	ldp	x6,x7,[x22,#64+16]
	add	x2,x22,#32
	add	x0,sp,#96
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(tmp0, in_z, in_y);

	mov	x8,x14
	mov	x9,x15
	ldp	x4,x5,[sp,#0]	// forward load for p256_sqr_mont
	mov	x10,x16
	mov	x11,x17
	ldp	x6,x7,[sp,#0+16]
	add	x0,x21,#64
	bl	__ecp_nistz256_add	// p256_mul_by_2(res_z, tmp0);

	add	x0,sp,#96
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(tmp0, S);

	ldr	x3,[sp,#64]		// forward load for p256_mul_mont
	ldp	x4,x5,[sp,#32]
	ldp	x6,x7,[sp,#32+16]
	add	x0,x21,#32
	bl	__ecp_nistz256_div_by_2	// p256_div_by_2(res_y, tmp0);

	add	x2,sp,#64
	add	x0,sp,#32
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(M, M, Zsqr);

	mov	x8,x14		// duplicate M
	mov	x9,x15
	mov	x10,x16
	mov	x11,x17
	mov	x4,x14		// put M aside
	mov	x5,x15
	mov	x6,x16
	mov	x7,x17
	add	x0,sp,#32
	bl	__ecp_nistz256_add
	mov	x8,x4			// restore M
	mov	x9,x5
	ldr	x3,[x22]		// forward load for p256_mul_mont
	mov	x10,x6
	ldp	x4,x5,[sp,#0]
	mov	x11,x7
	ldp	x6,x7,[sp,#0+16]
	bl	__ecp_nistz256_add	// p256_mul_by_3(M, M);

	add	x2,x22,#0
	add	x0,sp,#0
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S, S, in_x);

	mov	x8,x14
	mov	x9,x15
	ldp	x4,x5,[sp,#32]	// forward load for p256_sqr_mont
	mov	x10,x16
	mov	x11,x17
	ldp	x6,x7,[sp,#32+16]
	add	x0,sp,#96
	bl	__ecp_nistz256_add	// p256_mul_by_2(tmp0, S);

	add	x0,x21,#0
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(res_x, M);

	add	x2,sp,#96
	bl	__ecp_nistz256_sub_from	// p256_sub(res_x, res_x, tmp0);

	add	x2,sp,#0
	add	x0,sp,#0
	bl	__ecp_nistz256_sub_morf	// p256_sub(S, S, res_x);

	ldr	x3,[sp,#32]
	mov	x4,x14		// copy S
	mov	x5,x15
	mov	x6,x16
	mov	x7,x17
	add	x2,sp,#32
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S, S, M);

	add	x2,x21,#32
	add	x0,x21,#32
	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, S, res_y);

	add	sp,x29,#0		// destroy frame
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x29,x30,[sp],#80
	ret

.globl	_GFp_nistz256_point_add_affine
.private_extern	_GFp_nistz256_point_add_affine

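// GFp_nistz256_point_add_affine adds a Jacobian point (pointer in x22)
// and an affine point (pointer in x23), writing the Jacobian result to
// x21.  x24 and x25 hold all-ones masks for !in1infty and !in2infty
// respectively.  The 320-byte stack frame holds res_x/res_y/res_z
// (sp+0/32/64), U2 (sp+96), Z1sqr and later S2 (sp+128), H (sp+160),
// R (sp+192), Hsqr (sp+224), Hcub (sp+256) and Rsqr (sp+288).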
.align	5
_GFp_nistz256_point_add_affine:
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	sub	sp,sp,#32*10

	mov	x21,x0
	mov	x22,x1
	mov	x23,x2
	ldr	x12,Lpoly+8
	ldr	x13,Lpoly+24

	ldp	x4,x5,[x1,#64]	// in1_z
	ldp	x6,x7,[x1,#64+16]
	orr	x8,x4,x5
	orr	x10,x6,x7
	orr	x24,x8,x10
	cmp	x24,#0
	csetm	x24,ne		// !in1infty

	ldp	x14,x15,[x2]	// in2_x
	ldp	x16,x17,[x2,#16]
	ldp	x8,x9,[x2,#32]	// in2_y
	ldp	x10,x11,[x2,#48]
	orr	x14,x14,x15
	orr	x16,x16,x17
	orr	x8,x8,x9
	orr	x10,x10,x11
	orr	x14,x14,x16
	orr	x8,x8,x10
	orr	x25,x14,x8
	cmp	x25,#0
	csetm	x25,ne		// !in2infty

	add	x0,sp,#128
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z1sqr, in1_z);

	mov	x4,x14
	mov	x5,x15
	mov	x6,x16
	mov	x7,x17
	ldr	x3,[x23]
	add	x2,x23,#0
	add	x0,sp,#96
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, Z1sqr, in2_x);

	add	x2,x22,#0
	ldr	x3,[x22,#64]	// forward load for p256_mul_mont
	ldp	x4,x5,[sp,#128]
	ldp	x6,x7,[sp,#128+16]
	add	x0,sp,#160
	bl	__ecp_nistz256_sub_from	// p256_sub(H, U2, in1_x);

	add	x2,x22,#64
	add	x0,sp,#128
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, Z1sqr, in1_z);

	ldr	x3,[x22,#64]
	ldp	x4,x5,[sp,#160]
	ldp	x6,x7,[sp,#160+16]
	add	x2,x22,#64
	add	x0,sp,#64
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, H, in1_z);

	ldr	x3,[x23,#32]
	ldp	x4,x5,[sp,#128]
	ldp	x6,x7,[sp,#128+16]
	add	x2,x23,#32
	add	x0,sp,#128
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S2, in2_y);

	add	x2,x22,#32
	ldp	x4,x5,[sp,#160]	// forward load for p256_sqr_mont
	ldp	x6,x7,[sp,#160+16]
	add	x0,sp,#192
	bl	__ecp_nistz256_sub_from	// p256_sub(R, S2, in1_y);

	add	x0,sp,#224
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Hsqr, H);

	ldp	x4,x5,[sp,#192]
	ldp	x6,x7,[sp,#192+16]
	add	x0,sp,#288
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Rsqr, R);

	ldr	x3,[sp,#160]
	ldp	x4,x5,[sp,#224]
	ldp	x6,x7,[sp,#224+16]
	add	x2,sp,#160
	add	x0,sp,#256
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(Hcub, Hsqr, H);

	ldr	x3,[x22]
	ldp	x4,x5,[sp,#224]
	ldp	x6,x7,[sp,#224+16]
	add	x2,x22,#0
	add	x0,sp,#96
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, in1_x, Hsqr);

	mov	x8,x14
	mov	x9,x15
	mov	x10,x16
	mov	x11,x17
	add	x0,sp,#224
	bl	__ecp_nistz256_add	// p256_mul_by_2(Hsqr, U2);

	add	x2,sp,#288
	add	x0,sp,#0
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_x, Rsqr, Hsqr);

	add	x2,sp,#256
	bl	__ecp_nistz256_sub_from	//  p256_sub(res_x, res_x, Hcub);

	add	x2,sp,#96
	ldr	x3,[x22,#32]	// forward load for p256_mul_mont
	ldp	x4,x5,[sp,#256]
	ldp	x6,x7,[sp,#256+16]
	add	x0,sp,#32
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_y, U2, res_x);

	add	x2,x22,#32
	add	x0,sp,#128
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, in1_y, Hcub);

	ldr	x3,[sp,#192]
	ldp	x4,x5,[sp,#32]
	ldp	x6,x7,[sp,#32+16]
	add	x2,sp,#192
	add	x0,sp,#32
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_y, res_y, R);

	add	x2,sp,#128
	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, res_y, S2);

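	// Select the result to return: the computed sum if neither input
	// is the point at infinity, in2 if in1 is infinity (with Lone_mont
	// as in2's implicit z coordinate, hence the biased x23 below), and
	// in1 if in2 is infinity.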
	ldp	x4,x5,[sp,#0]		// res
	ldp	x6,x7,[sp,#0+16]
	ldp	x8,x9,[x23]		// in2
	ldp	x10,x11,[x23,#16]
	ldp	x14,x15,[x22,#0]	// in1
	cmp	x24,#0			// !in1infty, remember?
	ldp	x16,x17,[x22,#0+16]
	csel	x8,x4,x8,ne
	csel	x9,x5,x9,ne
	ldp	x4,x5,[sp,#0+0+32]	// res
	csel	x10,x6,x10,ne
	csel	x11,x7,x11,ne
	cmp	x25,#0			// !in2infty, remember?
	ldp	x6,x7,[sp,#0+0+48]
	csel	x14,x8,x14,ne
	csel	x15,x9,x15,ne
	ldp	x8,x9,[x23,#0+32]	// in2
	csel	x16,x10,x16,ne
	csel	x17,x11,x17,ne
	ldp	x10,x11,[x23,#0+48]
	stp	x14,x15,[x21,#0]
	stp	x16,x17,[x21,#0+16]
	adr	x23,Lone_mont-64
	ldp	x14,x15,[x22,#32]	// in1
	cmp	x24,#0			// !in1infty, remember?
	ldp	x16,x17,[x22,#32+16]
	csel	x8,x4,x8,ne
	csel	x9,x5,x9,ne
	ldp	x4,x5,[sp,#0+32+32]	// res
	csel	x10,x6,x10,ne
	csel	x11,x7,x11,ne
	cmp	x25,#0			// !in2infty, remember?
	ldp	x6,x7,[sp,#0+32+48]
	csel	x14,x8,x14,ne
	csel	x15,x9,x15,ne
	ldp	x8,x9,[x23,#32+32]	// in2
	csel	x16,x10,x16,ne
	csel	x17,x11,x17,ne
	ldp	x10,x11,[x23,#32+48]
	stp	x14,x15,[x21,#32]
	stp	x16,x17,[x21,#32+16]
	ldp	x14,x15,[x22,#64]	// in1
	cmp	x24,#0			// !in1infty, remember?
	ldp	x16,x17,[x22,#64+16]
	csel	x8,x4,x8,ne
	csel	x9,x5,x9,ne
	csel	x10,x6,x10,ne
	csel	x11,x7,x11,ne
	cmp	x25,#0			// !in2infty, remember?
	csel	x14,x8,x14,ne
	csel	x15,x9,x15,ne
	csel	x16,x10,x16,ne
	csel	x17,x11,x17,ne
	stp	x14,x15,[x21,#64]
	stp	x16,x17,[x21,#64+16]

	add	sp,x29,#0		// destroy frame
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x29,x30,[sp],#80
	ret

#endif  // !OPENSSL_NO_ASM
