/* Do not modify. This file is auto-generated from armv8-mont.pl. */
.text

// -----------------------------------------------------------------------
// bn_mul_mont - Montgomery multiplication: dispatcher plus the generic
// one-word-at-a-time implementation (.Lmul_mont).
//
// Registers on entry, as consumed by the code below:
//   x0 = rp    result vector, written only in the final step
//   x1 = ap    first operand
//   x2 = bp    second operand
//   x3 = np    modulus
//   x4 = &n0   pointer; only the first 64-bit word is read (*n0)
//   x5 = num   number of 64-bit words in each vector
// Returns:  x0 = 1
//
// Dispatch:
//   num % 8 == 0  -> __bn_sqr8x_mont (which forwards to __bn_mul4x_mont
//                    itself when ap != bp)
//   num % 4 == 0  -> __bn_mul4x_mont
//   otherwise     -> generic code below
//
// Generic path stack layout: a 64-byte frame (x29,x30,x19-x24) and,
// below it, a num-word scratch vector tp[] whose base is rounded down
// to 16 bytes. tp[] is zeroed again before returning.
//
// ap[0..1] and np[0..1] are loaded unconditionally, so num must be at
// least 2 on the generic path.
//
// NOTE(review): num is used as a full 64-bit value (x5). If the C
// prototype declares it as a 32-bit int, AAPCS64 leaves the upper half
// of x5 unspecified - confirm against the callers.
// -----------------------------------------------------------------------
.globl	bn_mul_mont
.type	bn_mul_mont,%function
.align	5
bn_mul_mont:
	tst	x5,#7
	b.eq	__bn_sqr8x_mont
	tst	x5,#3
	b.eq	__bn_mul4x_mont
.Lmul_mont:
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	ldr	x9,[x2],#8		// bp[0]
	sub	x22,sp,x5,lsl#3
	ldp	x7,x8,[x1],#16	// ap[0..1]
	lsl	x5,x5,#3		// num is a byte count from here on
	ldr	x4,[x4]		// *n0
	and	x22,x22,#-16		// ABI says so
	ldp	x13,x14,[x3],#16	// np[0..1]

	mul	x6,x7,x9		// ap[0]*bp[0]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	mul	x10,x8,x9		// ap[1]*bp[0]
	umulh	x11,x8,x9

	mul	x15,x6,x4		// "tp[0]"*n0
	mov	sp,x22			// alloca

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6	// discarded
	// (*)	As for removal of first multiplication and addition
	//	instructions. The outcome of first addition is
	//	guaranteed to be zero, which leaves two computationally
	//	significant outcomes: it either carries or not. Then
	//	question is when does it carry? Is there alternative
	//	way to deduce it? If you follow operations, you can
	//	observe that condition for carry is quite simple:
	//	x6 being non-zero. So that carry can be calculated
	//	by adding -1 to x6. That's what next instruction does.
	subs	xzr,x6,#1		// (*)
	umulh	x17,x14,x15
	adc	x13,x13,xzr
	cbz	x21,.L1st_skip

	// First pass (bp[0]): accumulate ap[j]*bp[0] + np[j]*m1 and store
	// the running sum one word down, at tp[j-1].
.L1st:
	ldr	x8,[x1],#8
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	ldr	x14,[x3],#8
	adds	x12,x16,x13
	mul	x10,x8,x9		// ap[j]*bp[0]
	adc	x13,x17,xzr
	umulh	x11,x8,x9

	adds	x12,x12,x6
	mul	x16,x14,x15		// np[j]*m1
	adc	x13,x13,xzr
	umulh	x17,x14,x15
	str	x12,[x22],#8		// tp[j-1]
	cbnz	x21,.L1st

.L1st_skip:
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adc	x13,x17,xzr

	adds	x12,x12,x6
	sub	x20,x5,#8		// i=num-1
	adcs	x13,x13,x7

	adc	x19,xzr,xzr		// upmost overflow bit
	stp	x12,x13,[x22]

	// Remaining passes: one iteration per further word bp[i], adding
	// ap[]*bp[i] + np[]*m1 on top of the tp[] left by the previous pass.
.Louter:
	ldr	x9,[x2],#8		// bp[i]
	ldp	x7,x8,[x1],#16
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8

	mul	x6,x7,x9		// ap[0]*bp[i]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	ldp	x13,x14,[x3],#16
	mul	x10,x8,x9		// ap[1]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x15,x6,x4
	sub	x20,x20,#8		// i--

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6
	subs	xzr,x6,#1		// (*)
	umulh	x17,x14,x15
	cbz	x21,.Linner_skip

.Linner:
	ldr	x8,[x1],#8
	adc	x13,x13,xzr		// consumes the carry left by the previous iteration (or by the subs above)
	ldr	x23,[x22],#8		// tp[j]
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	adds	x12,x16,x13
	ldr	x14,[x3],#8
	adc	x13,x17,xzr

	mul	x10,x8,x9		// ap[j]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x16,x14,x15		// np[j]*m1
	adds	x12,x12,x6
	umulh	x17,x14,x15
	str	x12,[x22,#-16]		// tp[j-1]
	cbnz	x21,.Linner

.Linner_skip:
	ldr	x23,[x22],#8		// tp[j]
	adc	x13,x13,xzr
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adcs	x13,x17,x19
	adc	x19,xzr,xzr

	adds	x6,x6,x23
	adc	x7,x7,xzr

	adds	x12,x12,x6
	adcs	x13,x13,x7
	adc	x19,x19,xzr		// upmost overflow bit
	stp	x12,x13,[x22,#-16]

	cbnz	x20,.Louter

	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x14,[x3],#8		// np[0]
	subs	x21,x5,#8		// j=num-1 and clear borrow
	mov	x1,x0
.Lsub:
	sbcs	x8,x23,x14		// tp[j]-np[j]
	ldr	x23,[x22],#8
	sub	x21,x21,#8		// j--
	ldr	x14,[x3],#8
	str	x8,[x1],#8		// rp[j]=tp[j]-np[j]
	cbnz	x21,.Lsub

	sbcs	x8,x23,x14
	sbcs	x19,x19,xzr		// did it borrow?
	str	x8,[x1],#8		// rp[num-1]

	// Select tp[] or rp[] word by word with csel on the borrow flag
	// left by the sbcs above; nothing below modifies the flags.
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x8,[x0],#8		// rp[0]
	sub	x5,x5,#8		// num--
	nop
.Lcond_copy:
	sub	x5,x5,#8		// num--
	csel	x14,x23,x8,lo		// did it borrow?
	ldr	x23,[x22],#8
	ldr	x8,[x0],#8
	str	xzr,[x22,#-16]		// wipe tp
	str	x14,[x0,#-16]
	cbnz	x5,.Lcond_copy

	csel	x14,x23,x8,lo
	str	xzr,[x22,#-8]		// wipe tp
	str	x14,[x0,#-8]

	// Epilogue. x30 was never modified on this path, so only x29 is
	// reloaded from the frame.
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldr	x29,[sp],#64
	ret
.size	bn_mul_mont,.-bn_mul_mont
// -----------------------------------------------------------------------
// __bn_sqr8x_mont - Montgomery squaring, processed eight words at a time.
//
// Reached by branch from bn_mul_mont when num is a multiple of 8, with
// that function's register arguments still in place:
//   x0 = rp, x1 = ap, x2 = bp, x3 = np, x4 = &n0, x5 = num (words)
// If ap != bp the operation is not a squaring and control is forwarded
// to __bn_mul4x_mont. Returns x0 = 1.
//
// Stack layout: a 128-byte frame
//   [x29,#0]   x29,x30          [x29,#16..#95]  x19-x28
//   [x29,#96]  rp               [x29,#104]      np
//   [x29,#112] n0 (the value loaded through x4)
// and, below it, a 2*num-word scratch vector t[], which is wiped before
// returning.
//
// x30 is reused as the "top-most carry" during reduction and reloaded
// from the frame before returning; the routine is bracketed by
// paciasp/autiasp, emitted as raw .inst words.
//
// Phases: (1) sum of all cross products a[i]*a[j], i != j, into t[];
// (2) double that and add the squares a[i]*a[i]; (3) reduce 512 bits per
// iteration; (4) subtract the modulus and conditionally keep the
// unsubtracted value.
// -----------------------------------------------------------------------
.type	__bn_sqr8x_mont,%function
.align	5
__bn_sqr8x_mont:
	cmp	x1,x2
	b.ne	__bn_mul4x_mont
.Lsqr8x_mont:
.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	stp	x0,x3,[sp,#96]	// offload rp and np

	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	ldp	x10,x11,[x1,#8*4]
	ldp	x12,x13,[x1,#8*6]

	sub	x2,sp,x5,lsl#4
	lsl	x5,x5,#3		// num is a byte count from here on
	ldr	x4,[x4]		// *n0
	mov	sp,x2			// alloca
	sub	x27,x5,#8*8
	b	.Lsqr8x_zero_start

	// Zero t[] except its first eight words; those are stored by the
	// first pass of .Lsqr8x_outer_loop.
.Lsqr8x_zero:
	sub	x27,x27,#8*8
	stp	xzr,xzr,[x2,#8*0]
	stp	xzr,xzr,[x2,#8*2]
	stp	xzr,xzr,[x2,#8*4]
	stp	xzr,xzr,[x2,#8*6]
.Lsqr8x_zero_start:
	stp	xzr,xzr,[x2,#8*8]
	stp	xzr,xzr,[x2,#8*10]
	stp	xzr,xzr,[x2,#8*12]
	stp	xzr,xzr,[x2,#8*14]
	add	x2,x2,#8*16
	cbnz	x27,.Lsqr8x_zero

	add	x3,x1,x5		// x3 = end of a[]
	add	x1,x1,#8*8
	mov	x19,xzr
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	mov	x23,xzr
	mov	x24,xzr
	mov	x25,xzr
	mov	x26,xzr
	mov	x2,sp
	str	x4,[x29,#112]		// offload n0

	// Multiply everything but a[i]*a[i]
.align	4
.Lsqr8x_outer_loop:
        //                                                 a[1]a[0]	(i)
        //                                             a[2]a[0]
        //                                         a[3]a[0]
        //                                     a[4]a[0]
        //                                 a[5]a[0]
        //                             a[6]a[0]
        //                         a[7]a[0]
        //                                         a[2]a[1]		(ii)
        //                                     a[3]a[1]
        //                                 a[4]a[1]
        //                             a[5]a[1]
        //                         a[6]a[1]
        //                     a[7]a[1]
        //                                 a[3]a[2]			(iii)
        //                             a[4]a[2]
        //                         a[5]a[2]
        //                     a[6]a[2]
        //                 a[7]a[2]
        //                         a[4]a[3]				(iv)
        //                     a[5]a[3]
        //                 a[6]a[3]
        //             a[7]a[3]
        //                 a[5]a[4]					(v)
        //             a[6]a[4]
        //         a[7]a[4]
        //         a[6]a[5]						(vi)
        //     a[7]a[5]
        // a[7]a[6]							(vii)

	mul	x14,x7,x6		// lo(a[1..7]*a[0])		(i)
	mul	x15,x8,x6
	mul	x16,x9,x6
	mul	x17,x10,x6
	adds	x20,x20,x14		// t[1]+lo(a[1]*a[0])
	mul	x14,x11,x6
	adcs	x21,x21,x15
	mul	x15,x12,x6
	adcs	x22,x22,x16
	mul	x16,x13,x6
	adcs	x23,x23,x17
	umulh	x17,x7,x6		// hi(a[1..7]*a[0])
	adcs	x24,x24,x14
	umulh	x14,x8,x6
	adcs	x25,x25,x15
	umulh	x15,x9,x6
	adcs	x26,x26,x16
	umulh	x16,x10,x6
	stp	x19,x20,[x2],#8*2	// t[0..1]
	adc	x19,xzr,xzr		// t[8]
	adds	x21,x21,x17		// t[2]+lo(a[1]*a[0])
	umulh	x17,x11,x6
	adcs	x22,x22,x14
	umulh	x14,x12,x6
	adcs	x23,x23,x15
	umulh	x15,x13,x6
	adcs	x24,x24,x16
	mul	x16,x8,x7		// lo(a[2..7]*a[1])		(ii)
	adcs	x25,x25,x17
	mul	x17,x9,x7
	adcs	x26,x26,x14
	mul	x14,x10,x7
	adc	x19,x19,x15

	mul	x15,x11,x7
	adds	x22,x22,x16
	mul	x16,x12,x7
	adcs	x23,x23,x17
	mul	x17,x13,x7
	adcs	x24,x24,x14
	umulh	x14,x8,x7		// hi(a[2..7]*a[1])
	adcs	x25,x25,x15
	umulh	x15,x9,x7
	adcs	x26,x26,x16
	umulh	x16,x10,x7
	adcs	x19,x19,x17
	umulh	x17,x11,x7
	stp	x21,x22,[x2],#8*2	// t[2..3]
	adc	x20,xzr,xzr		// t[9]
	adds	x23,x23,x14
	umulh	x14,x12,x7
	adcs	x24,x24,x15
	umulh	x15,x13,x7
	adcs	x25,x25,x16
	mul	x16,x9,x8		// lo(a[3..7]*a[2])		(iii)
	adcs	x26,x26,x17
	mul	x17,x10,x8
	adcs	x19,x19,x14
	mul	x14,x11,x8
	adc	x20,x20,x15

	mul	x15,x12,x8
	adds	x24,x24,x16
	mul	x16,x13,x8
	adcs	x25,x25,x17
	umulh	x17,x9,x8		// hi(a[3..7]*a[2])
	adcs	x26,x26,x14
	umulh	x14,x10,x8
	adcs	x19,x19,x15
	umulh	x15,x11,x8
	adcs	x20,x20,x16
	umulh	x16,x12,x8
	stp	x23,x24,[x2],#8*2	// t[4..5]
	adc	x21,xzr,xzr		// t[10]
	adds	x25,x25,x17
	umulh	x17,x13,x8
	adcs	x26,x26,x14
	mul	x14,x10,x9		// lo(a[4..7]*a[3])		(iv)
	adcs	x19,x19,x15
	mul	x15,x11,x9
	adcs	x20,x20,x16
	mul	x16,x12,x9
	adc	x21,x21,x17

	mul	x17,x13,x9
	adds	x26,x26,x14
	umulh	x14,x10,x9		// hi(a[4..7]*a[3])
	adcs	x19,x19,x15
	umulh	x15,x11,x9
	adcs	x20,x20,x16
	umulh	x16,x12,x9
	adcs	x21,x21,x17
	umulh	x17,x13,x9
	stp	x25,x26,[x2],#8*2	// t[6..7]
	adc	x22,xzr,xzr		// t[11]
	adds	x19,x19,x14
	mul	x14,x11,x10		// lo(a[5..7]*a[4])		(v)
	adcs	x20,x20,x15
	mul	x15,x12,x10
	adcs	x21,x21,x16
	mul	x16,x13,x10
	adc	x22,x22,x17

	umulh	x17,x11,x10		// hi(a[5..7]*a[4])
	adds	x20,x20,x14
	umulh	x14,x12,x10
	adcs	x21,x21,x15
	umulh	x15,x13,x10
	adcs	x22,x22,x16
	mul	x16,x12,x11		// lo(a[6..7]*a[5])		(vi)
	adc	x23,xzr,xzr		// t[12]
	adds	x21,x21,x17
	mul	x17,x13,x11
	adcs	x22,x22,x14
	umulh	x14,x12,x11		// hi(a[6..7]*a[5])
	adc	x23,x23,x15

	umulh	x15,x13,x11
	adds	x22,x22,x16
	mul	x16,x13,x12		// lo(a[7]*a[6])		(vii)
	adcs	x23,x23,x17
	umulh	x17,x13,x12		// hi(a[7]*a[6])
	adc	x24,xzr,xzr		// t[13]
	adds	x23,x23,x14
	sub	x27,x3,x1	// done yet?
	adc	x24,x24,x15

	adds	x24,x24,x16
	sub	x14,x3,x5	// rewinded ap
	adc	x25,xzr,xzr		// t[14]
	add	x25,x25,x17

	cbz	x27,.Lsqr8x_outer_break

	// More of a[] remains: add the stored t[] words into the
	// accumulators and load the next eight a[] words for .Lsqr8x_mul.
	mov	x4,x6
	ldp	x6,x7,[x2,#8*0]
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	adds	x19,x19,x6
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x0,x1
	adcs	x26,xzr,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved below
	mov	x27,#-8*8

	//                                                         a[8]a[0]
	//                                                     a[9]a[0]
	//                                                 a[a]a[0]
	//                                             a[b]a[0]
	//                                         a[c]a[0]
	//                                     a[d]a[0]
	//                                 a[e]a[0]
	//                             a[f]a[0]
	//                                                     a[8]a[1]
	//                         a[f]a[1]........................
	//                                                 a[8]a[2]
	//                     a[f]a[2]........................
	//                                             a[8]a[3]
	//                 a[f]a[3]........................
	//                                         a[8]a[4]
	//             a[f]a[4]........................
	//                                     a[8]a[5]
	//         a[f]a[5]........................
	//                                 a[8]a[6]
	//     a[f]a[6]........................
	//                             a[8]a[7]
	// a[f]a[7]........................
.Lsqr8x_mul:
	mul	x14,x6,x4
	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
	mul	x15,x7,x4
	add	x27,x27,#8
	mul	x16,x8,x4
	mul	x17,x9,x4
	adds	x19,x19,x14
	mul	x14,x10,x4
	adcs	x20,x20,x15
	mul	x15,x11,x4
	adcs	x21,x21,x16
	mul	x16,x12,x4
	adcs	x22,x22,x17
	mul	x17,x13,x4
	adcs	x23,x23,x14
	umulh	x14,x6,x4
	adcs	x24,x24,x15
	umulh	x15,x7,x4
	adcs	x25,x25,x16
	umulh	x16,x8,x4
	adcs	x26,x26,x17
	umulh	x17,x9,x4
	adc	x28,x28,xzr
	str	x19,[x2],#8
	adds	x19,x20,x14
	umulh	x14,x10,x4
	adcs	x20,x21,x15
	umulh	x15,x11,x4
	adcs	x21,x22,x16
	umulh	x16,x12,x4
	adcs	x22,x23,x17
	umulh	x17,x13,x4
	ldr	x4,[x0,x27]
	adcs	x23,x24,x14
	adcs	x24,x25,x15
	adcs	x25,x26,x16
	adcs	x26,x28,x17
	//adc	x28,xzr,xzr		// moved above
	cbnz	x27,.Lsqr8x_mul
					// note that carry flag is guaranteed
					// to be zero at this point
	cmp	x1,x3		// done yet?
	b.eq	.Lsqr8x_break

	ldp	x6,x7,[x2,#8*0]
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	adds	x19,x19,x6
	ldr	x4,[x0,#-8*8]
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x27,#-8*8
	adcs	x26,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved above
	b	.Lsqr8x_mul

.align	4
.Lsqr8x_break:
	ldp	x6,x7,[x0,#8*0]
	add	x1,x0,#8*8
	ldp	x8,x9,[x0,#8*2]
	sub	x14,x3,x1		// is it last iteration?
	ldp	x10,x11,[x0,#8*4]
	sub	x15,x2,x14
	ldp	x12,x13,[x0,#8*6]
	cbz	x14,.Lsqr8x_outer_loop

	stp	x19,x20,[x2,#8*0]
	ldp	x19,x20,[x15,#8*0]
	stp	x21,x22,[x2,#8*2]
	ldp	x21,x22,[x15,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[x15,#8*4]
	stp	x25,x26,[x2,#8*6]
	mov	x2,x15
	ldp	x25,x26,[x15,#8*6]
	b	.Lsqr8x_outer_loop

.align	4
.Lsqr8x_outer_break:
	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
	ldp	x7,x9,[x14,#8*0]	// recall that x14 is &a[0]
	ldp	x15,x16,[sp,#8*1]
	ldp	x11,x13,[x14,#8*2]
	add	x1,x14,#8*4
	ldp	x17,x14,[sp,#8*3]

	stp	x19,x20,[x2,#8*0]
	mul	x19,x7,x7
	stp	x21,x22,[x2,#8*2]
	umulh	x7,x7,x7
	stp	x23,x24,[x2,#8*4]
	mul	x8,x9,x9
	stp	x25,x26,[x2,#8*6]
	mov	x2,sp
	umulh	x9,x9,x9
	adds	x20,x7,x15,lsl#1
	extr	x15,x16,x15,#63
	sub	x27,x5,#8*4

	// Each extr ...,#63 yields a t[] word shifted left by one bit with
	// the top bit of the word below shifted in, i.e. the doubling.
.Lsqr4x_shift_n_add:
	adcs	x21,x8,x15
	extr	x16,x17,x16,#63
	sub	x27,x27,#8*4
	adcs	x22,x9,x16
	ldp	x15,x16,[x2,#8*5]
	mul	x10,x11,x11
	ldp	x7,x9,[x1],#8*2
	umulh	x11,x11,x11
	mul	x12,x13,x13
	umulh	x13,x13,x13
	extr	x17,x14,x17,#63
	stp	x19,x20,[x2,#8*0]
	adcs	x23,x10,x17
	extr	x14,x15,x14,#63
	stp	x21,x22,[x2,#8*2]
	adcs	x24,x11,x14
	ldp	x17,x14,[x2,#8*7]
	extr	x15,x16,x15,#63
	adcs	x25,x12,x15
	extr	x16,x17,x16,#63
	adcs	x26,x13,x16
	ldp	x15,x16,[x2,#8*9]
	mul	x6,x7,x7
	ldp	x11,x13,[x1],#8*2
	umulh	x7,x7,x7
	mul	x8,x9,x9
	umulh	x9,x9,x9
	stp	x23,x24,[x2,#8*4]
	extr	x17,x14,x17,#63
	stp	x25,x26,[x2,#8*6]
	add	x2,x2,#8*8
	adcs	x19,x6,x17
	extr	x14,x15,x14,#63
	adcs	x20,x7,x14
	ldp	x17,x14,[x2,#8*3]
	extr	x15,x16,x15,#63
	cbnz	x27,.Lsqr4x_shift_n_add
	ldp	x1,x4,[x29,#104]	// pull np and n0

	adcs	x21,x8,x15
	extr	x16,x17,x16,#63
	adcs	x22,x9,x16
	ldp	x15,x16,[x2,#8*5]
	mul	x10,x11,x11
	umulh	x11,x11,x11
	stp	x19,x20,[x2,#8*0]
	mul	x12,x13,x13
	umulh	x13,x13,x13
	stp	x21,x22,[x2,#8*2]
	extr	x17,x14,x17,#63
	adcs	x23,x10,x17
	extr	x14,x15,x14,#63
	ldp	x19,x20,[sp,#8*0]
	adcs	x24,x11,x14
	extr	x15,x16,x15,#63
	ldp	x6,x7,[x1,#8*0]
	adcs	x25,x12,x15
	extr	x16,xzr,x16,#63
	ldp	x8,x9,[x1,#8*2]
	adc	x26,x13,x16
	ldp	x10,x11,[x1,#8*4]

	// Reduce by 512 bits per iteration
	mul	x28,x4,x19		// t[0]*n0
	ldp	x12,x13,[x1,#8*6]
	add	x3,x1,x5
	ldp	x21,x22,[sp,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[sp,#8*4]
	stp	x25,x26,[x2,#8*6]
	ldp	x25,x26,[sp,#8*6]
	add	x1,x1,#8*8
	mov	x30,xzr		// initial top-most carry
	mov	x2,sp
	mov	x27,#8

.Lsqr8x_reduction:
	// (*)	mul	x14,x6,x28	// lo(n[0-7])*lo(t[0]*n0)
	mul	x15,x7,x28
	sub	x27,x27,#1
	mul	x16,x8,x28
	str	x28,[x2],#8		// put aside t[0]*n0 for tail processing
	mul	x17,x9,x28
	// (*)	adds	xzr,x19,x14
	subs	xzr,x19,#1		// (*)
	mul	x14,x10,x28
	adcs	x19,x20,x15
	mul	x15,x11,x28
	adcs	x20,x21,x16
	mul	x16,x12,x28
	adcs	x21,x22,x17
	mul	x17,x13,x28
	adcs	x22,x23,x14
	umulh	x14,x6,x28		// hi(n[0-7])*lo(t[0]*n0)
	adcs	x23,x24,x15
	umulh	x15,x7,x28
	adcs	x24,x25,x16
	umulh	x16,x8,x28
	adcs	x25,x26,x17
	umulh	x17,x9,x28
	adc	x26,xzr,xzr
	adds	x19,x19,x14
	umulh	x14,x10,x28
	adcs	x20,x20,x15
	umulh	x15,x11,x28
	adcs	x21,x21,x16
	umulh	x16,x12,x28
	adcs	x22,x22,x17
	umulh	x17,x13,x28
	mul	x28,x4,x19		// next t[0]*n0
	adcs	x23,x23,x14
	adcs	x24,x24,x15
	adcs	x25,x25,x16
	adc	x26,x26,x17
	cbnz	x27,.Lsqr8x_reduction

	ldp	x14,x15,[x2,#8*0]
	ldp	x16,x17,[x2,#8*2]
	mov	x0,x2
	sub	x27,x3,x1	// done yet?
	adds	x19,x19,x14
	adcs	x20,x20,x15
	ldp	x14,x15,[x2,#8*4]
	adcs	x21,x21,x16
	adcs	x22,x22,x17
	ldp	x16,x17,[x2,#8*6]
	adcs	x23,x23,x14
	adcs	x24,x24,x15
	adcs	x25,x25,x16
	adcs	x26,x26,x17
	//adc	x28,xzr,xzr		// moved below
	cbz	x27,.Lsqr8x8_post_condition

	ldr	x4,[x2,#-8*8]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	ldp	x10,x11,[x1,#8*4]
	mov	x27,#-8*8
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8

	// Tail: multiply the remaining modulus words by the eight
	// t[0]*n0 factors put aside by .Lsqr8x_reduction.
.Lsqr8x_tail:
	mul	x14,x6,x4
	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
	mul	x15,x7,x4
	add	x27,x27,#8
	mul	x16,x8,x4
	mul	x17,x9,x4
	adds	x19,x19,x14
	mul	x14,x10,x4
	adcs	x20,x20,x15
	mul	x15,x11,x4
	adcs	x21,x21,x16
	mul	x16,x12,x4
	adcs	x22,x22,x17
	mul	x17,x13,x4
	adcs	x23,x23,x14
	umulh	x14,x6,x4
	adcs	x24,x24,x15
	umulh	x15,x7,x4
	adcs	x25,x25,x16
	umulh	x16,x8,x4
	adcs	x26,x26,x17
	umulh	x17,x9,x4
	adc	x28,x28,xzr
	str	x19,[x2],#8
	adds	x19,x20,x14
	umulh	x14,x10,x4
	adcs	x20,x21,x15
	umulh	x15,x11,x4
	adcs	x21,x22,x16
	umulh	x16,x12,x4
	adcs	x22,x23,x17
	umulh	x17,x13,x4
	ldr	x4,[x0,x27]
	adcs	x23,x24,x14
	adcs	x24,x25,x15
	adcs	x25,x26,x16
	adcs	x26,x28,x17
	//adc	x28,xzr,xzr		// moved above
	cbnz	x27,.Lsqr8x_tail
					// note that carry flag is guaranteed
					// to be zero at this point
	ldp	x6,x7,[x2,#8*0]
	sub	x27,x3,x1	// done yet?
	sub	x16,x3,x5	// rewinded np
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	cbz	x27,.Lsqr8x_tail_break

	ldr	x4,[x0,#-8*8]
	adds	x19,x19,x6
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x27,#-8*8
	adcs	x26,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved above
	b	.Lsqr8x_tail

.align	4
.Lsqr8x_tail_break:
	ldr	x4,[x29,#112]		// pull n0
	add	x27,x2,#8*8		// end of current t[num] window

	subs	xzr,x30,#1		// "move" top-most carry to carry bit
	adcs	x14,x19,x6
	adcs	x15,x20,x7
	ldp	x19,x20,[x0,#8*0]
	adcs	x21,x21,x8
	ldp	x6,x7,[x16,#8*0]	// recall that x16 is &n[0]
	adcs	x22,x22,x9
	ldp	x8,x9,[x16,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x16,#8*4]
	adcs	x25,x25,x12
	adcs	x26,x26,x13
	ldp	x12,x13,[x16,#8*6]
	add	x1,x16,#8*8
	adc	x30,xzr,xzr	// top-most carry
	mul	x28,x4,x19
	stp	x14,x15,[x2,#8*0]
	stp	x21,x22,[x2,#8*2]
	ldp	x21,x22,[x0,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[x0,#8*4]
	cmp	x27,x29		// did we hit the bottom?
	stp	x25,x26,[x2,#8*6]
	mov	x2,x0			// slide the window
	ldp	x25,x26,[x0,#8*6]
	mov	x27,#8
	b.ne	.Lsqr8x_reduction

	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	ldr	x0,[x29,#96]		// pull rp
	add	x2,x2,#8*8
	subs	x14,x19,x6
	sbcs	x15,x20,x7
	sub	x27,x5,#8*8
	mov	x3,x0		// x0 copy

.Lsqr8x_sub:
	sbcs	x16,x21,x8
	ldp	x6,x7,[x1,#8*0]
	sbcs	x17,x22,x9
	stp	x14,x15,[x0,#8*0]
	sbcs	x14,x23,x10
	ldp	x8,x9,[x1,#8*2]
	sbcs	x15,x24,x11
	stp	x16,x17,[x0,#8*2]
	sbcs	x16,x25,x12
	ldp	x10,x11,[x1,#8*4]
	sbcs	x17,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	ldp	x19,x20,[x2,#8*0]
	sub	x27,x27,#8*8
	ldp	x21,x22,[x2,#8*2]
	ldp	x23,x24,[x2,#8*4]
	ldp	x25,x26,[x2,#8*6]
	add	x2,x2,#8*8
	stp	x14,x15,[x0,#8*4]
	sbcs	x14,x19,x6
	stp	x16,x17,[x0,#8*6]
	add	x0,x0,#8*8
	sbcs	x15,x20,x7
	cbnz	x27,.Lsqr8x_sub

	sbcs	x16,x21,x8
	mov	x2,sp
	add	x1,sp,x5
	ldp	x6,x7,[x3,#8*0]
	sbcs	x17,x22,x9
	stp	x14,x15,[x0,#8*0]
	sbcs	x14,x23,x10
	ldp	x8,x9,[x3,#8*2]
	sbcs	x15,x24,x11
	stp	x16,x17,[x0,#8*2]
	sbcs	x16,x25,x12
	ldp	x19,x20,[x1,#8*0]
	sbcs	x17,x26,x13
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address
	stp	x14,x15,[x0,#8*4]
	stp	x16,x17,[x0,#8*6]

	// Select, four words per iteration, between the subtracted value
	// already in rp[] and the unsubtracted value in t[]; both halves
	// of the scratch area are zeroed along the way. The flags set by
	// the last sbcs above are not modified below.
	sub	x27,x5,#8*4
.Lsqr4x_cond_copy:
	sub	x27,x27,#8*4
	csel	x14,x19,x6,lo
	stp	xzr,xzr,[x2,#8*0]
	csel	x15,x20,x7,lo
	ldp	x6,x7,[x3,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x16,x21,x8,lo
	stp	xzr,xzr,[x2,#8*2]
	add	x2,x2,#8*4
	csel	x17,x22,x9,lo
	ldp	x8,x9,[x3,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x14,x15,[x3,#8*0]
	stp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	stp	xzr,xzr,[x1,#8*0]
	stp	xzr,xzr,[x1,#8*2]
	cbnz	x27,.Lsqr4x_cond_copy

	csel	x14,x19,x6,lo
	stp	xzr,xzr,[x2,#8*0]
	csel	x15,x20,x7,lo
	stp	xzr,xzr,[x2,#8*2]
	csel	x16,x21,x8,lo
	csel	x17,x22,x9,lo
	stp	x14,x15,[x3,#8*0]
	stp	x16,x17,[x3,#8*2]

	b	.Lsqr8x_done

	// Reached when the first reduction pass already covered the whole
	// modulus (num == 8): the result is still in registers.
.align	4
.Lsqr8x8_post_condition:
	adc	x28,xzr,xzr
	ldr	x30,[x29,#8]		// pull return address
	// x19-7,x28 hold result, x6-7 hold modulus
	subs	x6,x19,x6
	ldr	x1,[x29,#96]		// pull rp
	sbcs	x7,x20,x7
	stp	xzr,xzr,[sp,#8*0]
	sbcs	x8,x21,x8
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x9
	stp	xzr,xzr,[sp,#8*4]
	sbcs	x10,x23,x10
	stp	xzr,xzr,[sp,#8*6]
	sbcs	x11,x24,x11
	stp	xzr,xzr,[sp,#8*8]
	sbcs	x12,x25,x12
	stp	xzr,xzr,[sp,#8*10]
	sbcs	x13,x26,x13
	stp	xzr,xzr,[sp,#8*12]
	sbcs	x28,x28,xzr	// did it borrow?
	stp	xzr,xzr,[sp,#8*14]

	// x6-7 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	csel	x10,x23,x10,lo
	csel	x11,x24,x11,lo
	stp	x8,x9,[x1,#8*2]
	csel	x12,x25,x12,lo
	csel	x13,x26,x13,lo
	stp	x10,x11,[x1,#8*4]
	stp	x12,x13,[x1,#8*6]

	// Common epilogue: restore callee-saved registers, release the
	// frame and scratch area, return 1.
.Lsqr8x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
.inst	0xd50323bf		// autiasp
	ret
.size	__bn_sqr8x_mont,.-__bn_sqr8x_mont
// -----------------------------------------------------------------------
// __bn_mul4x_mont - Montgomery multiplication, four words at a time.
//
// Reached by branch from bn_mul_mont (num a multiple of 4 but not of 8)
// or from __bn_sqr8x_mont (num a multiple of 8 and ap != bp), with the
// original register arguments still in place:
//   x0 = rp, x1 = ap, x2 = bp, x3 = np, x4 = &n0, x5 = num (words)
// Returns x0 = 1.
//
// Stack layout: a 128-byte frame
//   [x29,#0]   x29,x30          [x29,#16..#95]  x19-x28
//   [x29,#96]  rp               [x29,#104]      &b[num]
// and, below it, num+4 words of scratch addressed from sp, which are
// zeroed before returning.
//
// x30 is reused as a carry word between outer iterations and reloaded
// from the frame before returning; the routine is bracketed by
// paciasp/autiasp, emitted as raw .inst words.
//
// x28 is a byte index that cycles 8,16,24,0 (add #8, and #31); it
// selects the next b[] word and the next saved t[0]*n0 factor and
// terminates each four-step inner loop when it wraps to 0.
// -----------------------------------------------------------------------
.type	__bn_mul4x_mont,%function
.align	5
__bn_mul4x_mont:
.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

	sub	x26,sp,x5,lsl#3
	lsl	x5,x5,#3		// num is a byte count from here on
	ldr	x4,[x4]		// *n0
	sub	sp,x26,#8*4		// alloca

	add	x10,x2,x5
	add	x27,x1,x5		// x27 = end of a[]
	stp	x0,x10,[x29,#96]	// offload rp and &b[num]

	ldr	x24,[x2,#8*0]		// b[0]
	ldp	x6,x7,[x1,#8*0]	// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	mov	x19,xzr
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x28,#0
	mov	x26,sp

.Loop_mul4x_1st_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[0])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[0])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25	// lo(n[0..3]*t[0]*n0)
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*)
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	sub	x10,x27,x1
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_1st_reduction

	cbz	x10,.Lmul4x4_post_condition	// x10 == 0: a[] exhausted, i.e. num == 4

	ldp	x6,x7,[x1,#8*0]	// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldr	x25,[sp]		// a[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

.Loop_mul4x_1st_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[i])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[i])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*a[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*a[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	adcs	x23,x23,x0
	umulh	x13,x17,x25
	adc	x0,xzr,xzr
	ldr	x25,[sp,x28]		// next t[0]*n0
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_1st_tail

	sub	x11,x27,x5	// rewinded x1
	cbz	x10,.Lmul4x_proceed

	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	.Loop_mul4x_1st_tail

	// First four b[] words done: advance bp by four words, save the
	// top carry in x30 and restart from a[0..3] / n[0..3].
.align	5
.Lmul4x_proceed:
	ldr	x24,[x2,#8*4]!		// *++b
	adc	x30,x0,xzr
	ldp	x6,x7,[x11,#8*0]	// a[0..3]
	sub	x3,x3,x5		// rewind np
	ldp	x8,x9,[x11,#8*2]
	add	x1,x11,#8*4

	stp	x19,x20,[x26,#8*0]	// result!!!
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	stp	x21,x22,[x26,#8*2]	// result!!!
	ldp	x21,x22,[sp,#8*6]

	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	mov	x26,sp
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr

.align	4
.Loop_mul4x_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[4])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[4])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25		// lo(n[0..3]*t[0]*n0
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*)
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_reduction

	adc	x0,x0,xzr
	ldp	x10,x11,[x26,#8*4]	// t[4..7]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]	// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr

	ldr	x25,[sp]		// t[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

.align	4
.Loop_mul4x_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[4])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[4])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	umulh	x13,x17,x25
	adcs	x23,x23,x0
	ldr	x25,[sp,x28]		// next a[0]*n0
	adc	x0,xzr,xzr
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_tail

	sub	x11,x3,x5		// rewinded np?
	adc	x0,x0,xzr
	cbz	x10,.Loop_mul4x_break

	ldp	x10,x11,[x26,#8*4]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	.Loop_mul4x_tail

	// End of one group of four b[] words: fold in the carry saved in
	// x30, store the top four result words, and either start the next
	// group or go to the final subtraction.
.align	4
.Loop_mul4x_break:
	ldp	x12,x13,[x29,#96]	// pull rp and &b[num]
	adds	x19,x19,x30
	add	x2,x2,#8*4		// bp++
	adcs	x20,x20,xzr
	sub	x1,x1,x5		// rewind ap
	adcs	x21,x21,xzr
	stp	x19,x20,[x26,#8*0]	// result!!!
	adcs	x22,x22,xzr
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	adc	x30,x0,xzr
	stp	x21,x22,[x26,#8*2]	// result!!!
	cmp	x2,x13			// done yet?
	ldp	x21,x22,[sp,#8*6]
	ldp	x14,x15,[x11,#8*0]	// n[0..3]
	ldp	x16,x17,[x11,#8*2]
	add	x3,x11,#8*4
	b.eq	.Lmul4x_post

	ldr	x24,[x2]
	ldp	x6,x7,[x1,#8*0]	// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	adds	x1,x1,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x26,sp
	b	.Loop_mul4x_reduction

.align	4
.Lmul4x_post:
	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	mov	x0,x12
	mov	x27,x12		// x0 copy
	subs	x10,x19,x14
	add	x26,sp,#8*8
	sbcs	x11,x20,x15
	sub	x28,x5,#8*4

.Lmul4x_sub:
	sbcs	x12,x21,x16
	ldp	x14,x15,[x3,#8*0]
	sub	x28,x28,#8*4
	ldp	x19,x20,[x26,#8*0]
	sbcs	x13,x22,x17
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	ldp	x21,x22,[x26,#8*2]
	add	x26,x26,#8*4
	stp	x10,x11,[x0,#8*0]
	sbcs	x10,x19,x14
	stp	x12,x13,[x0,#8*2]
	add	x0,x0,#8*4
	sbcs	x11,x20,x15
	cbnz	x28,.Lmul4x_sub

	sbcs	x12,x21,x16
	mov	x26,sp
	add	x1,sp,#8*4
	ldp	x6,x7,[x27,#8*0]
	sbcs	x13,x22,x17
	stp	x10,x11,[x0,#8*0]
	ldp	x8,x9,[x27,#8*2]
	stp	x12,x13,[x0,#8*2]
	ldp	x19,x20,[x1,#8*0]
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address

	// Select, four words per iteration, between the subtracted value
	// already in rp[] and the unsubtracted value in the scratch area,
	// zeroing the scratch area along the way. The flags set by the
	// sbcs above are not modified below.
	sub	x28,x5,#8*4
.Lmul4x_cond_copy:
	sub	x28,x28,#8*4
	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	ldp	x6,x7,[x27,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*2]
	add	x26,x26,#8*4
	csel	x13,x22,x9,lo
	ldp	x8,x9,[x27,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]
	add	x27,x27,#8*4
	cbnz	x28,.Lmul4x_cond_copy

	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	stp	xzr,xzr,[x26,#8*2]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*3]	// overlaps the previous store by one word
	csel	x13,x22,x9,lo
	stp	xzr,xzr,[x26,#8*4]
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]

	b	.Lmul4x_done

	// Reached when a[] was exhausted after the first reduction loop
	// (num == 4): the result is still in registers.
.align	4
.Lmul4x4_post_condition:
	adc	x0,x0,xzr
	ldr	x1,[x29,#96]		// pull rp
	// x19-3,x0 hold result, x14-7 hold modulus
	subs	x6,x19,x14
	ldr	x30,[x29,#8]		// pull return address
	sbcs	x7,x20,x15
	stp	xzr,xzr,[sp,#8*0]
	sbcs	x8,x21,x16
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x17
	stp	xzr,xzr,[sp,#8*4]
	sbcs	xzr,x0,xzr		// did it borrow?
	stp	xzr,xzr,[sp,#8*6]

	// x6-3 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	stp	x8,x9,[x1,#8*2]

	// Common epilogue: restore callee-saved registers, release the
	// frame and scratch area, return 1.
.Lmul4x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
.inst	0xd50323bf		// autiasp
	ret
.size	__bn_mul4x_mont,.-__bn_mul4x_mont
// NUL-terminated ASCII identification string:
// "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	4
1410