1 /* armv8-curve25519
2  *
3  * Copyright (C) 2006-2021 wolfSSL Inc.
4  *
5  * This file is part of wolfSSL.
6  *
7  * wolfSSL is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 2 of the License, or
10  * (at your option) any later version.
11  *
12  * wolfSSL is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
20  */
21 
22 #ifdef HAVE_CONFIG_H
23     #include <config.h>
24 #endif /* HAVE_CONFIG_H */
25 #include <wolfssl/wolfcrypt/settings.h>
26 
27 /* Generated using (from wolfssl):
28  *   cd ../scripts
29  *   ruby ./x25519/x25519.rb arm64 ../wolfssl/wolfcrypt/src/port/arm/armv8-curve25519.c
30  */
31 #ifdef WOLFSSL_ARMASM
32 #ifdef __aarch64__
33 #ifdef HAVE_CURVE25519
34 #include <wolfssl/wolfcrypt/fe_operations.h>
35 
/* Initialise the field element implementation.
 *
 * Nothing has to be set up here: the asm statement emits no instructions
 * and, through its "memory" clobber, only acts as a compiler-level barrier.
 */
void fe_init(void)
{
    __asm__ __volatile__ (
        "\n\t"
        :
        :
        : "memory"
    );
}
45 
/* Load a field element from a 32-byte buffer.
 *
 * out  Field element to fill; four 64-bit words are written.
 * in   Source buffer; 32 bytes are read as four 64-bit words.
 *
 * The most significant bit of the fourth word is cleared, so the stored
 * value always fits in 255 bits.
 */
void fe_frombytes(fe out, const unsigned char* in)
{
    __asm__ __volatile__ (
        "ldp	x2, x3, [%x[in]]\n\t"
        "ldp	x4, x5, [%x[in], #16]\n\t"
        /* Drop bit 63 of the top word */
        "and	x5, x5, #0x7fffffffffffffff\n\t"
        "stp	x2, x3, [%x[out]]\n\t"
        "stp	x4, x5, [%x[out], #16]\n\t"
        : [out] "+r" (out), [in] "+r" (in)
        :
        /* Only x2-x5 are used as scratch registers */
        : "memory", "x2", "x3", "x4", "x5"
    );
}
59 
fe_tobytes(unsigned char * out,const fe n)60 void fe_tobytes(unsigned char* out, const fe n)
61 {
62     __asm__ __volatile__ (
63         "mov	x7, #19\n\t"
64         "ldp	x2, x3, [%x[n]]\n\t"
65         "ldp	x4, x5, [%x[n], #16]\n\t"
66         "adds	x6, x2, x7\n\t"
67         "adcs	x6, x3, xzr\n\t"
68         "adcs	x6, x4, xzr\n\t"
69         "adc	x6, x5, xzr\n\t"
70         "and	x6, x7, x6, asr 63\n\t"
71         "adds	x2, x2, x6\n\t"
72         "adcs	x3, x3, xzr\n\t"
73         "adcs	x4, x4, xzr\n\t"
74         "adc	x5, x5, xzr\n\t"
75         "and	x5, x5, #0x7fffffffffffffff\n\t"
76         "stp	x2, x3, [%x[out]]\n\t"
77         "stp	x4, x5, [%x[out], #16]\n\t"
78         : [out] "+r" (out), [n] "+r" (n)
79         :
80         : "memory", "x2", "x3", "x4", "x5", "x6", "x7"
81     );
82 }
83 
/* Set a field element to the value one.
 *
 * n  Field element to overwrite: its first 64-bit word becomes 1 and the
 *    remaining three words become 0.
 */
void fe_1(fe n)
{
    __asm__ __volatile__ (
        /* Clear the two upper words, then write 1 into the lowest word */
        "stp	xzr, xzr, [%x[n], #16]\n\t"
        "mov	x2, #1\n\t"
        "stp	x2, xzr, [%x[n]]\n\t"
        : [n] "+r" (n)
        :
        : "memory", "x2"
    );
}
96 
/* Set a field element to the value zero.
 *
 * n  Field element to overwrite; all four 64-bit words are cleared.
 */
void fe_0(fe n)
{
    __asm__ __volatile__ (
        /* Upper pair of words first, then the lower pair */
        "stp	xzr, xzr, [%x[n], #16]\n\t"
        "stp	xzr, xzr, [%x[n]]\n\t"
        : [n] "+r" (n)
        :
        : "memory"
    );
}
108 
/* Copy one field element into another.
 *
 * r  Destination; four 64-bit words are written.
 * a  Source; four 64-bit words are read.
 *
 * All four words are loaded before any is stored.
 */
void fe_copy(fe r, const fe a)
{
    __asm__ __volatile__ (
        /* Read the whole source */
        "ldp	x8, x9, [%x[a], #16]\n\t"
        "ldp	x6, x7, [%x[a]]\n\t"
        /* Write the whole destination */
        "stp	x8, x9, [%x[r], #16]\n\t"
        "stp	x6, x7, [%x[r]]\n\t"
        : [r] "+r" (r), [a] "+r" (a)
        :
        : "memory", "x6", "x7", "x8", "x9"
    );
}
122 
/* Subtract two field elements: r = a - b.
 *
 * r  Result; four 64-bit words are written.
 * a  Minuend.
 * b  Subtrahend.
 *
 * The 256-bit subtraction is done first. If it borrows, 2^255 - 19 is added
 * back; the addition is masked rather than branched on.
 */
void fe_sub(fe r, const fe a, const fe b)
{
    __asm__ __volatile__ (
        /* Sub */
        "ldp	x3, x4, [%x[a]]\n\t"
        "ldp	x5, x6, [%x[a], #16]\n\t"
        "ldp	x7, x8, [%x[b]]\n\t"
        "ldp	x9, x10, [%x[b], #16]\n\t"
        "subs	x3, x3, x7\n\t"
        "sbcs	x4, x4, x8\n\t"
        "sbcs	x5, x5, x9\n\t"
        "sbcs	x6, x6, x10\n\t"
        "mov	x12, #-19\n\t"
        /* x11 = all ones when the subtraction borrowed, otherwise 0 */
        "csetm	x11, cc\n\t"
        /*   Mask the modulus */
        "and	x12, x11, x12\n\t"
        "and	x13, x11, #0x7fffffffffffffff\n\t"
        /*   Add modulus (if underflow) */
        "adds	x3, x3, x12\n\t"
        "adcs	x4, x4, x11\n\t"
        "adcs	x5, x5, x11\n\t"
        "adc	x6, x6, x13\n\t"
        "stp	x3, x4, [%x[r]]\n\t"
        "stp	x5, x6, [%x[r], #16]\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        /* "cc": subs/sbcs/adds/adcs overwrite the condition flags */
        : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "cc"
    );
}
152 
/* Add two field elements: r = a + b.
 *
 * r  Result; four 64-bit words are written.
 * a  First addend.
 * b  Second addend.
 *
 * The 256-bit sum is formed first. If bit 255 of the sum is set,
 * 2^255 - 19 is subtracted; the subtraction is masked rather than branched
 * on.
 */
void fe_add(fe r, const fe a, const fe b)
{
    __asm__ __volatile__ (
        /* Add */
        "ldp	x3, x4, [%x[a]]\n\t"
        "ldp	x5, x6, [%x[a], #16]\n\t"
        "ldp	x7, x8, [%x[b]]\n\t"
        "ldp	x9, x10, [%x[b], #16]\n\t"
        "adds	x3, x3, x7\n\t"
        "adcs	x4, x4, x8\n\t"
        "adcs	x5, x5, x9\n\t"
        "adc	x6, x6, x10\n\t"
        "mov	x12, #-19\n\t"
        /* x11 = all ones when bit 63 of the top word is set, otherwise 0 */
        "asr	x11, x6, #63\n\t"
        /*   Mask the modulus */
        "and	x12, x11, x12\n\t"
        "and	x13, x11, #0x7fffffffffffffff\n\t"
        /*   Sub modulus (if overflow) */
        "subs	x3, x3, x12\n\t"
        "sbcs	x4, x4, x11\n\t"
        "sbcs	x5, x5, x11\n\t"
        "sbc	x6, x6, x13\n\t"
        "stp	x3, x4, [%x[r]]\n\t"
        "stp	x5, x6, [%x[r], #16]\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        /* "cc": adds/adcs/subs/sbcs overwrite the condition flags */
        : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "cc"
    );
}
182 
/* Negate a field element: r = (2^255 - 19) - a.
 *
 * r  Result; four 64-bit words are written.
 * a  Value to negate.
 *
 * The modulus is built in x6-x9 (words -19, -1, -1, 0x7fff...ffff) and a is
 * subtracted from it; no further reduction is applied.
 */
void fe_neg(fe r, const fe a)
{
    __asm__ __volatile__ (
        "ldp	x2, x3, [%x[a]]\n\t"
        "ldp	x4, x5, [%x[a], #16]\n\t"
        /* x6..x9 = 2^255 - 19 */
        "mov	x6, #-19\n\t"
        "mov	x7, #-1\n\t"
        "mov	x8, #-1\n\t"
        "mov	x9, #0x7fffffffffffffff\n\t"
        "subs	x6, x6, x2\n\t"
        "sbcs	x7, x7, x3\n\t"
        "sbcs	x8, x8, x4\n\t"
        "sbc	x9, x9, x5\n\t"
        "stp	x6, x7, [%x[r]]\n\t"
        "stp	x8, x9, [%x[r], #16]\n\t"
        : [r] "+r" (r), [a] "+r" (a)
        :
        /* "cc": subs/sbcs overwrite the condition flags */
        : "memory", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc"
    );
}
203 
fe_isnonzero(const fe a)204 int fe_isnonzero(const fe a)
205 {
206     __asm__ __volatile__ (
207         "mov	x6, #19\n\t"
208         "ldp	x1, x2, [%x[a]]\n\t"
209         "ldp	x3, x4, [%x[a], #16]\n\t"
210         "adds	x5, x1, x6\n\t"
211         "adcs	x5, x2, xzr\n\t"
212         "adcs	x5, x3, xzr\n\t"
213         "adc	x5, x4, xzr\n\t"
214         "and	x5, x6, x5, asr 63\n\t"
215         "adds	x1, x1, x5\n\t"
216         "adcs	x2, x2, xzr\n\t"
217         "adcs	x3, x3, xzr\n\t"
218         "adc	x4, x4, xzr\n\t"
219         "and	x4, x4, #0x7fffffffffffffff\n\t"
220         "orr	%x[a], x1, x2\n\t"
221         "orr	x3, x3, x4\n\t"
222         "orr	%x[a], %x[a], x3\n\t"
223         : [a] "+r" (a)
224         :
225         : "memory", "x1", "x2", "x3", "x4", "x5", "x6"
226     );
227     return (uint32_t)(size_t)a;
228 }
229 
fe_isnegative(const fe a)230 int fe_isnegative(const fe a)
231 {
232     __asm__ __volatile__ (
233         "mov	x6, #19\n\t"
234         "ldp	x1, x2, [%x[a]]\n\t"
235         "ldp	x3, x4, [%x[a], #16]\n\t"
236         "adds	x5, x1, x6\n\t"
237         "adcs	x5, x2, xzr\n\t"
238         "adcs	x5, x3, xzr\n\t"
239         "adc	x5, x4, xzr\n\t"
240         "and	%x[a], x1, #1\n\t"
241         "eor	%x[a], %x[a], x5, lsr 63\n\t"
242         : [a] "+r" (a)
243         :
244         : "memory", "x1", "x2", "x3", "x4", "x5", "x6"
245     );
246     return (uint32_t)(size_t)a;
247 }
248 
/* Look up a table entry selected by a signed index, without branching on it.
 *
 * r     Receives three consecutive field elements (96 bytes are written).
 * base  Table of eight 96-byte entries (768 bytes are read).
 * b     Signed selector. A magnitude of 1..8 picks entry |b| - 1; any other
 *       magnitude leaves the default triple (1, 1, 0).
 *
 * Every table entry is loaded and merged with csel, so the loads performed
 * do not depend on b. When b is negative the first two field elements are
 * swapped and the third is replaced by (2^255 - 19) minus itself.
 */
void fe_cmov_table(fe* r, fe* base, signed char b)
{
    __asm__ __volatile__ (
        /* Small stack frame: slot [x29, #16] keeps the destination pointer */
        "stp	x29, x30, [sp, #-32]!\n\t"
        "add	x29, sp, #0\n\t"
        "str	%x[r], [x29, #16]\n\t"
        /* The r register is reused to hold |b| */
        "sxtb	%x[b], %w[b]\n\t"
        "sbfx	x3, %x[b], #7, #1\n\t"
        "eor	%x[r], %x[b], x3\n\t"
        "sub	%x[r], %x[r], x3\n\t"
        /* Default result: x4-x7 = 1, x8-x11 = 1, x12-x15 = 0 */
        "mov	x4, #1\n\t"
        "mov	x5, xzr\n\t"
        "mov	x6, xzr\n\t"
        "mov	x7, xzr\n\t"
        "mov	x8, #1\n\t"
        "mov	x9, xzr\n\t"
        "mov	x10, xzr\n\t"
        "mov	x11, xzr\n\t"
        "mov	x12, xzr\n\t"
        "mov	x13, xzr\n\t"
        "mov	x14, xzr\n\t"
        "mov	x15, xzr\n\t"
        /* |b| == 1: take the entry at byte offset 0 */
        "cmp	%x[r], #1\n\t"
        "ldp	x16, x17, [%x[base]]\n\t"
        "ldp	x19, x20, [%x[base], #16]\n\t"
        "ldp	x21, x22, [%x[base], #32]\n\t"
        "ldp	x23, x24, [%x[base], #48]\n\t"
        "ldp	x25, x26, [%x[base], #64]\n\t"
        "ldp	x27, x28, [%x[base], #80]\n\t"
        "csel	x4, x16, x4, eq\n\t"
        "csel	x5, x17, x5, eq\n\t"
        "csel	x6, x19, x6, eq\n\t"
        "csel	x7, x20, x7, eq\n\t"
        "csel	x8, x21, x8, eq\n\t"
        "csel	x9, x22, x9, eq\n\t"
        "csel	x10, x23, x10, eq\n\t"
        "csel	x11, x24, x11, eq\n\t"
        "csel	x12, x25, x12, eq\n\t"
        "csel	x13, x26, x13, eq\n\t"
        "csel	x14, x27, x14, eq\n\t"
        "csel	x15, x28, x15, eq\n\t"
        /* |b| == 2: take the entry at byte offset 96 */
        "cmp	%x[r], #2\n\t"
        "ldp	x16, x17, [%x[base], #96]\n\t"
        "ldp	x19, x20, [%x[base], #112]\n\t"
        "ldp	x21, x22, [%x[base], #128]\n\t"
        "ldp	x23, x24, [%x[base], #144]\n\t"
        "ldp	x25, x26, [%x[base], #160]\n\t"
        "ldp	x27, x28, [%x[base], #176]\n\t"
        "csel	x4, x16, x4, eq\n\t"
        "csel	x5, x17, x5, eq\n\t"
        "csel	x6, x19, x6, eq\n\t"
        "csel	x7, x20, x7, eq\n\t"
        "csel	x8, x21, x8, eq\n\t"
        "csel	x9, x22, x9, eq\n\t"
        "csel	x10, x23, x10, eq\n\t"
        "csel	x11, x24, x11, eq\n\t"
        "csel	x12, x25, x12, eq\n\t"
        "csel	x13, x26, x13, eq\n\t"
        "csel	x14, x27, x14, eq\n\t"
        "csel	x15, x28, x15, eq\n\t"
        /* |b| == 3: take the entry at byte offset 192 */
        "cmp	%x[r], #3\n\t"
        "ldp	x16, x17, [%x[base], #192]\n\t"
        "ldp	x19, x20, [%x[base], #208]\n\t"
        "ldp	x21, x22, [%x[base], #224]\n\t"
        "ldp	x23, x24, [%x[base], #240]\n\t"
        "ldp	x25, x26, [%x[base], #256]\n\t"
        "ldp	x27, x28, [%x[base], #272]\n\t"
        "csel	x4, x16, x4, eq\n\t"
        "csel	x5, x17, x5, eq\n\t"
        "csel	x6, x19, x6, eq\n\t"
        "csel	x7, x20, x7, eq\n\t"
        "csel	x8, x21, x8, eq\n\t"
        "csel	x9, x22, x9, eq\n\t"
        "csel	x10, x23, x10, eq\n\t"
        "csel	x11, x24, x11, eq\n\t"
        "csel	x12, x25, x12, eq\n\t"
        "csel	x13, x26, x13, eq\n\t"
        "csel	x14, x27, x14, eq\n\t"
        "csel	x15, x28, x15, eq\n\t"
        /* |b| == 4: take the entry at byte offset 288 */
        "cmp	%x[r], #4\n\t"
        "ldp	x16, x17, [%x[base], #288]\n\t"
        "ldp	x19, x20, [%x[base], #304]\n\t"
        "ldp	x21, x22, [%x[base], #320]\n\t"
        "ldp	x23, x24, [%x[base], #336]\n\t"
        "ldp	x25, x26, [%x[base], #352]\n\t"
        "ldp	x27, x28, [%x[base], #368]\n\t"
        "csel	x4, x16, x4, eq\n\t"
        "csel	x5, x17, x5, eq\n\t"
        "csel	x6, x19, x6, eq\n\t"
        "csel	x7, x20, x7, eq\n\t"
        "csel	x8, x21, x8, eq\n\t"
        "csel	x9, x22, x9, eq\n\t"
        "csel	x10, x23, x10, eq\n\t"
        "csel	x11, x24, x11, eq\n\t"
        "csel	x12, x25, x12, eq\n\t"
        "csel	x13, x26, x13, eq\n\t"
        "csel	x14, x27, x14, eq\n\t"
        "csel	x15, x28, x15, eq\n\t"
        /* Advance past the first four entries (4 * 96 = 0x180 bytes) */
        "add	%x[base], %x[base], #0x180\n\t"
        /* |b| == 5: take the entry at byte offset 0 of the second half */
        "cmp	%x[r], #5\n\t"
        "ldp	x16, x17, [%x[base]]\n\t"
        "ldp	x19, x20, [%x[base], #16]\n\t"
        "ldp	x21, x22, [%x[base], #32]\n\t"
        "ldp	x23, x24, [%x[base], #48]\n\t"
        "ldp	x25, x26, [%x[base], #64]\n\t"
        "ldp	x27, x28, [%x[base], #80]\n\t"
        "csel	x4, x16, x4, eq\n\t"
        "csel	x5, x17, x5, eq\n\t"
        "csel	x6, x19, x6, eq\n\t"
        "csel	x7, x20, x7, eq\n\t"
        "csel	x8, x21, x8, eq\n\t"
        "csel	x9, x22, x9, eq\n\t"
        "csel	x10, x23, x10, eq\n\t"
        "csel	x11, x24, x11, eq\n\t"
        "csel	x12, x25, x12, eq\n\t"
        "csel	x13, x26, x13, eq\n\t"
        "csel	x14, x27, x14, eq\n\t"
        "csel	x15, x28, x15, eq\n\t"
        /* |b| == 6: take the entry at byte offset 96 of the second half */
        "cmp	%x[r], #6\n\t"
        "ldp	x16, x17, [%x[base], #96]\n\t"
        "ldp	x19, x20, [%x[base], #112]\n\t"
        "ldp	x21, x22, [%x[base], #128]\n\t"
        "ldp	x23, x24, [%x[base], #144]\n\t"
        "ldp	x25, x26, [%x[base], #160]\n\t"
        "ldp	x27, x28, [%x[base], #176]\n\t"
        "csel	x4, x16, x4, eq\n\t"
        "csel	x5, x17, x5, eq\n\t"
        "csel	x6, x19, x6, eq\n\t"
        "csel	x7, x20, x7, eq\n\t"
        "csel	x8, x21, x8, eq\n\t"
        "csel	x9, x22, x9, eq\n\t"
        "csel	x10, x23, x10, eq\n\t"
        "csel	x11, x24, x11, eq\n\t"
        "csel	x12, x25, x12, eq\n\t"
        "csel	x13, x26, x13, eq\n\t"
        "csel	x14, x27, x14, eq\n\t"
        "csel	x15, x28, x15, eq\n\t"
        /* |b| == 7: take the entry at byte offset 192 of the second half */
        "cmp	%x[r], #7\n\t"
        "ldp	x16, x17, [%x[base], #192]\n\t"
        "ldp	x19, x20, [%x[base], #208]\n\t"
        "ldp	x21, x22, [%x[base], #224]\n\t"
        "ldp	x23, x24, [%x[base], #240]\n\t"
        "ldp	x25, x26, [%x[base], #256]\n\t"
        "ldp	x27, x28, [%x[base], #272]\n\t"
        "csel	x4, x16, x4, eq\n\t"
        "csel	x5, x17, x5, eq\n\t"
        "csel	x6, x19, x6, eq\n\t"
        "csel	x7, x20, x7, eq\n\t"
        "csel	x8, x21, x8, eq\n\t"
        "csel	x9, x22, x9, eq\n\t"
        "csel	x10, x23, x10, eq\n\t"
        "csel	x11, x24, x11, eq\n\t"
        "csel	x12, x25, x12, eq\n\t"
        "csel	x13, x26, x13, eq\n\t"
        "csel	x14, x27, x14, eq\n\t"
        "csel	x15, x28, x15, eq\n\t"
        /* |b| == 8: take the entry at byte offset 288 of the second half */
        "cmp	%x[r], #8\n\t"
        "ldp	x16, x17, [%x[base], #288]\n\t"
        "ldp	x19, x20, [%x[base], #304]\n\t"
        "ldp	x21, x22, [%x[base], #320]\n\t"
        "ldp	x23, x24, [%x[base], #336]\n\t"
        "ldp	x25, x26, [%x[base], #352]\n\t"
        "ldp	x27, x28, [%x[base], #368]\n\t"
        "csel	x4, x16, x4, eq\n\t"
        "csel	x5, x17, x5, eq\n\t"
        "csel	x6, x19, x6, eq\n\t"
        "csel	x7, x20, x7, eq\n\t"
        "csel	x8, x21, x8, eq\n\t"
        "csel	x9, x22, x9, eq\n\t"
        "csel	x10, x23, x10, eq\n\t"
        "csel	x11, x24, x11, eq\n\t"
        "csel	x12, x25, x12, eq\n\t"
        "csel	x13, x26, x13, eq\n\t"
        "csel	x14, x27, x14, eq\n\t"
        "csel	x15, x28, x15, eq\n\t"
        /* x16,x17,x19,x20 = (2^255 - 19) - third field element */
        "mov	x16, #-19\n\t"
        "mov	x17, #-1\n\t"
        "mov	x19, #-1\n\t"
        "mov	x20, #0x7fffffffffffffff\n\t"
        "subs	x16, x16, x12\n\t"
        "sbcs	x17, x17, x13\n\t"
        "sbcs	x19, x19, x14\n\t"
        "sbc	x20, x20, x15\n\t"
        /* b < 0: swap the first two elements, use the negated third */
        "cmp	%x[b], #0\n\t"
        "mov	x3, x4\n\t"
        "csel	x4, x8, x4, lt\n\t"
        "csel	x8, x3, x8, lt\n\t"
        "mov	x3, x5\n\t"
        "csel	x5, x9, x5, lt\n\t"
        "csel	x9, x3, x9, lt\n\t"
        "mov	x3, x6\n\t"
        "csel	x6, x10, x6, lt\n\t"
        "csel	x10, x3, x10, lt\n\t"
        "mov	x3, x7\n\t"
        "csel	x7, x11, x7, lt\n\t"
        "csel	x11, x3, x11, lt\n\t"
        "csel	x12, x16, x12, lt\n\t"
        "csel	x13, x17, x13, lt\n\t"
        "csel	x14, x19, x14, lt\n\t"
        "csel	x15, x20, x15, lt\n\t"
        /* Reload the destination pointer and store the 96-byte result */
        "ldr	%x[r], [x29, #16]\n\t"
        "stp	x4, x5, [%x[r]]\n\t"
        "stp	x6, x7, [%x[r], #16]\n\t"
        "stp	x8, x9, [%x[r], #32]\n\t"
        "stp	x10, x11, [%x[r], #48]\n\t"
        "stp	x12, x13, [%x[r], #64]\n\t"
        "stp	x14, x15, [%x[r], #80]\n\t"
        "ldp	x29, x30, [sp], #32\n\t"
        : [r] "+r" (r), [base] "+r" (base), [b] "+r" (b)
        :
        /* "cc": cmp/subs/sbcs overwrite the condition flags */
        : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "cc"
    );
}
462 
/* Multiply two field elements: r = a * b, reduced modulo 2^255 - 19.
 *
 * r  Result; four 64-bit words are written.
 * a  First factor, read as four 64-bit words.
 * b  Second factor, read as four 64-bit words.
 *
 * The 512-bit product is accumulated in x6-x13 from the 16 word-by-word
 * partial products. Everything from bit 255 upwards is then multiplied by 19
 * and added to the low 255 bits, followed by two small correction passes.
 * Both inputs are fully loaded before anything is stored to r.
 */
void fe_mul(fe r, const fe a, const fe b)
{
    __asm__ __volatile__ (
        /* Multiply */
        "ldp	x14, x15, [%x[a]]\n\t"
        "ldp	x16, x17, [%x[a], #16]\n\t"
        "ldp	x19, x20, [%x[b]]\n\t"
        "ldp	x21, x22, [%x[b], #16]\n\t"
        /*  A[0] * B[0] */
        "mul	x6, x14, x19\n\t"
        "umulh	x7, x14, x19\n\t"
        /*  A[0] * B[1] */
        "mul	x3, x14, x20\n\t"
        "umulh	x8, x14, x20\n\t"
        "adds	x7, x7, x3\n\t"
        "adc	x8, x8, xzr\n\t"
        /*  A[1] * B[0] */
        "mul	x3, x15, x19\n\t"
        "umulh	x4, x15, x19\n\t"
        "adds	x7, x7, x3\n\t"
        "adcs	x8, x8, x4\n\t"
        "adc	x9, xzr, xzr\n\t"
        /*  A[0] * B[2] */
        "mul	x3, x14, x21\n\t"
        "umulh	x4, x14, x21\n\t"
        "adds	x8, x8, x3\n\t"
        "adc	x9, x9, x4\n\t"
        /*  A[1] * B[1] */
        "mul	x3, x15, x20\n\t"
        "umulh	x4, x15, x20\n\t"
        "adds	x8, x8, x3\n\t"
        "adcs	x9, x9, x4\n\t"
        "adc	x10, xzr, xzr\n\t"
        /*  A[2] * B[0] */
        "mul	x3, x16, x19\n\t"
        "umulh	x4, x16, x19\n\t"
        "adds	x8, x8, x3\n\t"
        "adcs	x9, x9, x4\n\t"
        "adc	x10, x10, xzr\n\t"
        /*  A[0] * B[3] */
        "mul	x3, x14, x22\n\t"
        "umulh	x4, x14, x22\n\t"
        "adds	x9, x9, x3\n\t"
        "adcs	x10, x10, x4\n\t"
        "adc	x11, xzr, xzr\n\t"
        /*  A[1] * B[2] */
        "mul	x3, x15, x21\n\t"
        "umulh	x4, x15, x21\n\t"
        "adds	x9, x9, x3\n\t"
        "adcs	x10, x10, x4\n\t"
        "adc	x11, x11, xzr\n\t"
        /*  A[2] * B[1] */
        "mul	x3, x16, x20\n\t"
        "umulh	x4, x16, x20\n\t"
        "adds	x9, x9, x3\n\t"
        "adcs	x10, x10, x4\n\t"
        "adc	x11, x11, xzr\n\t"
        /*  A[3] * B[0] */
        "mul	x3, x17, x19\n\t"
        "umulh	x4, x17, x19\n\t"
        "adds	x9, x9, x3\n\t"
        "adcs	x10, x10, x4\n\t"
        "adc	x11, x11, xzr\n\t"
        /*  A[1] * B[3] */
        "mul	x3, x15, x22\n\t"
        "umulh	x4, x15, x22\n\t"
        "adds	x10, x10, x3\n\t"
        "adcs	x11, x11, x4\n\t"
        "adc	x12, xzr, xzr\n\t"
        /*  A[2] * B[2] */
        "mul	x3, x16, x21\n\t"
        "umulh	x4, x16, x21\n\t"
        "adds	x10, x10, x3\n\t"
        "adcs	x11, x11, x4\n\t"
        "adc	x12, x12, xzr\n\t"
        /*  A[3] * B[1] */
        "mul	x3, x17, x20\n\t"
        "umulh	x4, x17, x20\n\t"
        "adds	x10, x10, x3\n\t"
        "adcs	x11, x11, x4\n\t"
        "adc	x12, x12, xzr\n\t"
        /*  A[2] * B[3] */
        "mul	x3, x16, x22\n\t"
        "umulh	x4, x16, x22\n\t"
        "adds	x11, x11, x3\n\t"
        "adcs	x12, x12, x4\n\t"
        "adc	x13, xzr, xzr\n\t"
        /*  A[3] * B[2] */
        "mul	x3, x17, x21\n\t"
        "umulh	x4, x17, x21\n\t"
        "adds	x11, x11, x3\n\t"
        "adcs	x12, x12, x4\n\t"
        "adc	x13, x13, xzr\n\t"
        /*  A[3] * B[3] */
        "mul	x3, x17, x22\n\t"
        "umulh	x4, x17, x22\n\t"
        "adds	x12, x12, x3\n\t"
        "adc	x13, x13, x4\n\t"
        /* Reduce */
        /*  Move top half into t4-t7 and remove top bit from t3 */
        "extr	x13, x13, x12, #63\n\t"
        "extr	x12, x12, x11, #63\n\t"
        "extr	x11, x11, x10, #63\n\t"
        "extr	x10, x10, x9, #63\n\t"
        "and	x9, x9, #0x7fffffffffffffff\n\t"
        /*  Multiply top half by 19 */
        "mov	x3, #19\n\t"
        "mul	x4, x3, x10\n\t"
        "umulh	x10, x3, x10\n\t"
        "adds	x6, x6, x4\n\t"
        "mul	x4, x3, x11\n\t"
        "umulh	x11, x3, x11\n\t"
        "adcs	x7, x7, x4\n\t"
        "mul	x4, x3, x12\n\t"
        "umulh	x12, x3, x12\n\t"
        "adcs	x8, x8, x4\n\t"
        "mul	x4, x3, x13\n\t"
        "umulh	x5, x3, x13\n\t"
        "adcs	x9, x9, x4\n\t"
        "adc	x5, x5, xzr\n\t"
        /*  Add remaining product results in */
        "adds	x7, x7, x10\n\t"
        "adcs	x8, x8, x11\n\t"
        "adcs	x9, x9, x12\n\t"
        "adc	x5, x5, xzr\n\t"
        /*  Overflow */
        "extr	x5, x5, x9, #63\n\t"
        "mul	x5, x5, x3\n\t"
        "and	x9, x9, #0x7fffffffffffffff\n\t"
        "adds	x6, x6, x5\n\t"
        "adcs	x7, x7, xzr\n\t"
        "adcs	x8, x8, xzr\n\t"
        "adc	x9, x9, xzr\n\t"
        /* Reduce if top bit set */
        "and	x5, x3, x9, asr 63\n\t"
        "and	x9, x9, #0x7fffffffffffffff\n\t"
        "adds	x6, x6, x5\n\t"
        "adcs	x7, x7, xzr\n\t"
        "adcs	x8, x8, xzr\n\t"
        "adc	x9, x9, xzr\n\t"
        /* Store */
        "stp	x6, x7, [%x[r]]\n\t"
        "stp	x8, x9, [%x[r], #16]\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        /* "cc": adds/adcs overwrite the condition flags */
        : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "cc"
    );
}
611 
/* Square a field element: r = a * a, reduced modulo 2^255 - 19.
 *
 * r  Result; four 64-bit words are written.
 * a  Value to square, read as four 64-bit words.
 *
 * The six cross products A[i] * A[j] (i < j) are summed and doubled, then
 * the four squares A[i] * A[i] are added to form the 512-bit result in
 * x5-x12. Reduction is the same as for multiplication: everything from
 * bit 255 upwards is multiplied by 19 and added to the low 255 bits,
 * followed by two small correction passes. The input is fully loaded before
 * anything is stored to r.
 */
void fe_sq(fe r, const fe a)
{
    __asm__ __volatile__ (
        /* Square */
        "ldp	x13, x14, [%x[a]]\n\t"
        "ldp	x15, x16, [%x[a], #16]\n\t"
        /*  A[0] * A[1] */
        "mul	x6, x13, x14\n\t"
        "umulh	x7, x13, x14\n\t"
        /*  A[0] * A[2] */
        "mul	x2, x13, x15\n\t"
        "umulh	x8, x13, x15\n\t"
        "adds	x7, x7, x2\n\t"
        "adc	x8, x8, xzr\n\t"
        /*  A[0] * A[3] */
        "mul	x2, x13, x16\n\t"
        "umulh	x9, x13, x16\n\t"
        "adds	x8, x8, x2\n\t"
        "adc	x9, x9, xzr\n\t"
        /*  A[1] * A[2] */
        "mul	x2, x14, x15\n\t"
        "umulh	x3, x14, x15\n\t"
        "adds	x8, x8, x2\n\t"
        "adcs	x9, x9, x3\n\t"
        "adc	x10, xzr, xzr\n\t"
        /*  A[1] * A[3] */
        "mul	x2, x14, x16\n\t"
        "umulh	x3, x14, x16\n\t"
        "adds	x9, x9, x2\n\t"
        "adc	x10, x10, x3\n\t"
        /*  A[2] * A[3] */
        "mul	x2, x15, x16\n\t"
        "umulh	x11, x15, x16\n\t"
        "adds	x10, x10, x2\n\t"
        "adc	x11, x11, xzr\n\t"
        /* Double */
        "adds	x6, x6, x6\n\t"
        "adcs	x7, x7, x7\n\t"
        "adcs	x8, x8, x8\n\t"
        "adcs	x9, x9, x9\n\t"
        "adcs	x10, x10, x10\n\t"
        "adcs	x11, x11, x11\n\t"
        "adc	x12, xzr, xzr\n\t"
        /*  A[0] * A[0] */
        "mul	x5, x13, x13\n\t"
        "umulh	x4, x13, x13\n\t"
        /*  A[1] * A[1] */
        "mul	x2, x14, x14\n\t"
        "umulh	x3, x14, x14\n\t"
        "adds	x6, x6, x4\n\t"
        "adcs	x7, x7, x2\n\t"
        "adc	x4, x3, xzr\n\t"
        /*  A[2] * A[2] */
        "mul	x2, x15, x15\n\t"
        "umulh	x3, x15, x15\n\t"
        "adds	x8, x8, x4\n\t"
        "adcs	x9, x9, x2\n\t"
        "adc	x4, x3, xzr\n\t"
        /*  A[3] * A[3] */
        "mul	x2, x16, x16\n\t"
        "umulh	x3, x16, x16\n\t"
        "adds	x10, x10, x4\n\t"
        "adcs	x11, x11, x2\n\t"
        "adc	x12, x12, x3\n\t"
        /* Reduce */
        /*  Move top half into t4-t7 and remove top bit from t3 */
        "extr	x12, x12, x11, #63\n\t"
        "extr	x11, x11, x10, #63\n\t"
        "extr	x10, x10, x9, #63\n\t"
        "extr	x9, x9, x8, #63\n\t"
        "and	x8, x8, #0x7fffffffffffffff\n\t"
        /*  Multiply top half by 19 */
        "mov	x2, #19\n\t"
        "mul	x3, x2, x9\n\t"
        "umulh	x9, x2, x9\n\t"
        "adds	x5, x5, x3\n\t"
        "mul	x3, x2, x10\n\t"
        "umulh	x10, x2, x10\n\t"
        "adcs	x6, x6, x3\n\t"
        "mul	x3, x2, x11\n\t"
        "umulh	x11, x2, x11\n\t"
        "adcs	x7, x7, x3\n\t"
        "mul	x3, x2, x12\n\t"
        "umulh	x4, x2, x12\n\t"
        "adcs	x8, x8, x3\n\t"
        "adc	x4, x4, xzr\n\t"
        /*  Add remaining product results in */
        "adds	x6, x6, x9\n\t"
        "adcs	x7, x7, x10\n\t"
        "adcs	x8, x8, x11\n\t"
        "adc	x4, x4, xzr\n\t"
        /*  Overflow */
        "extr	x4, x4, x8, #63\n\t"
        "mul	x4, x4, x2\n\t"
        "and	x8, x8, #0x7fffffffffffffff\n\t"
        "adds	x5, x5, x4\n\t"
        "adcs	x6, x6, xzr\n\t"
        "adcs	x7, x7, xzr\n\t"
        "adc	x8, x8, xzr\n\t"
        /* Reduce if top bit set */
        "and	x4, x2, x8, asr 63\n\t"
        "and	x8, x8, #0x7fffffffffffffff\n\t"
        "adds	x5, x5, x4\n\t"
        "adcs	x6, x6, xzr\n\t"
        "adcs	x7, x7, xzr\n\t"
        "adc	x8, x8, xzr\n\t"
        /* Store */
        "stp	x5, x6, [%x[r]]\n\t"
        "stp	x7, x8, [%x[r], #16]\n\t"
        : [r] "+r" (r), [a] "+r" (a)
        :
        /* "cc": adds/adcs overwrite the condition flags */
        : "memory", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "cc"
    );
}
726 
fe_invert(fe r,const fe a)727 void fe_invert(fe r, const fe a)
728 {
729     __asm__ __volatile__ (
730         "stp	x29, x30, [sp, #-160]!\n\t"
731         "add	x29, sp, #0\n\t"
732         /* Invert */
733         "str	%x[r], [x29, #144]\n\t"
734         "str	%x[a], [x29, #152]\n\t"
735         "add	x0, x29, #16\n\t"
736 #ifndef NDEBUG
737         "ldr	x1, [x29, #152]\n\t"
738 #endif /* !NDEBUG */
739 #ifndef __APPLE__
740         "bl	fe_sq\n\t"
741 #else
742         "bl	_fe_sq\n\t"
743 #endif /* __APPLE__ */
744         "add	x0, x29, #48\n\t"
745         "add	x1, x29, #16\n\t"
746 #ifndef __APPLE__
747         "bl	fe_sq\n\t"
748 #else
749         "bl	_fe_sq\n\t"
750 #endif /* __APPLE__ */
751 #ifndef NDEBUG
752         "add	x0, x29, #48\n\t"
753 #endif /* !NDEBUG */
754         "add	x1, x29, #48\n\t"
755 #ifndef __APPLE__
756         "bl	fe_sq\n\t"
757 #else
758         "bl	_fe_sq\n\t"
759 #endif /* __APPLE__ */
760 #ifndef NDEBUG
761         "add	x0, x29, #48\n\t"
762 #endif /* !NDEBUG */
763         "ldr	x1, [x29, #152]\n\t"
764         "add	x2, x29, #48\n\t"
765 #ifndef __APPLE__
766         "bl	fe_mul\n\t"
767 #else
768         "bl	_fe_mul\n\t"
769 #endif /* __APPLE__ */
770         "add	x0, x29, #16\n\t"
771         "add	x1, x29, #16\n\t"
772         "add	x2, x29, #48\n\t"
773 #ifndef __APPLE__
774         "bl	fe_mul\n\t"
775 #else
776         "bl	_fe_mul\n\t"
777 #endif /* __APPLE__ */
778         "add	x0, x29, #0x50\n\t"
779 #ifndef NDEBUG
780         "add	x1, x29, #16\n\t"
781 #endif /* !NDEBUG */
782 #ifndef __APPLE__
783         "bl	fe_sq\n\t"
784 #else
785         "bl	_fe_sq\n\t"
786 #endif /* __APPLE__ */
787         "add	x0, x29, #48\n\t"
788         "add	x1, x29, #48\n\t"
789         "add	x2, x29, #0x50\n\t"
790 #ifndef __APPLE__
791         "bl	fe_mul\n\t"
792 #else
793         "bl	_fe_mul\n\t"
794 #endif /* __APPLE__ */
795         "add	x0, x29, #0x50\n\t"
796 #ifndef NDEBUG
797         "add	x1, x29, #48\n\t"
798 #endif /* !NDEBUG */
799 #ifndef __APPLE__
800         "bl	fe_sq\n\t"
801 #else
802         "bl	_fe_sq\n\t"
803 #endif /* __APPLE__ */
804         "mov	x20, #3\n\t"
805 #ifndef NDEBUG
806         "add	x0, x29, #0x50\n\t"
807 #endif /* !NDEBUG */
808         "add	x1, x29, #0x50\n\t"
809         "\n"
810     "L_fe_invert1_%=: \n\t"
811 #ifndef __APPLE__
812         "bl	fe_sq\n\t"
813 #else
814         "bl	_fe_sq\n\t"
815 #endif /* __APPLE__ */
816         "subs	x20, x20, #1\n\t"
817         "bcs	L_fe_invert1_%=\n\t"
818         "add	x0, x29, #48\n\t"
819 #ifndef NDEBUG
820         "add	x1, x29, #0x50\n\t"
821 #endif /* !NDEBUG */
822         "add	x2, x29, #48\n\t"
823 #ifndef __APPLE__
824         "bl	fe_mul\n\t"
825 #else
826         "bl	_fe_mul\n\t"
827 #endif /* __APPLE__ */
828         "add	x0, x29, #0x50\n\t"
829         "add	x1, x29, #48\n\t"
830 #ifndef __APPLE__
831         "bl	fe_sq\n\t"
832 #else
833         "bl	_fe_sq\n\t"
834 #endif /* __APPLE__ */
835         "mov	x20, #8\n\t"
836 #ifndef NDEBUG
837         "add	x0, x29, #0x50\n\t"
838 #endif /* !NDEBUG */
839         "add	x1, x29, #0x50\n\t"
840         "\n"
841     "L_fe_invert2_%=: \n\t"
842 #ifndef __APPLE__
843         "bl	fe_sq\n\t"
844 #else
845         "bl	_fe_sq\n\t"
846 #endif /* __APPLE__ */
847         "subs	x20, x20, #1\n\t"
848         "bcs	L_fe_invert2_%=\n\t"
849 #ifndef NDEBUG
850         "add	x0, x29, #0x50\n\t"
851 #endif /* !NDEBUG */
852 #ifndef NDEBUG
853         "add	x1, x29, #0x50\n\t"
854 #endif /* !NDEBUG */
855         "add	x2, x29, #48\n\t"
856 #ifndef __APPLE__
857         "bl	fe_mul\n\t"
858 #else
859         "bl	_fe_mul\n\t"
860 #endif /* __APPLE__ */
861         "add	x0, x29, #0x70\n\t"
862 #ifndef NDEBUG
863         "add	x1, x29, #0x50\n\t"
864 #endif /* !NDEBUG */
865 #ifndef __APPLE__
866         "bl	fe_sq\n\t"
867 #else
868         "bl	_fe_sq\n\t"
869 #endif /* __APPLE__ */
870         "mov	x20, #18\n\t"
871 #ifndef NDEBUG
872         "add	x0, x29, #0x70\n\t"
873 #endif /* !NDEBUG */
874         "add	x1, x29, #0x70\n\t"
875         "\n"
876     "L_fe_invert3_%=: \n\t"
877 #ifndef __APPLE__
878         "bl	fe_sq\n\t"
879 #else
880         "bl	_fe_sq\n\t"
881 #endif /* __APPLE__ */
882         "subs	x20, x20, #1\n\t"
883         "bcs	L_fe_invert3_%=\n\t"
884         "add	x0, x29, #0x50\n\t"
885 #ifndef NDEBUG
886         "add	x1, x29, #0x70\n\t"
887 #endif /* !NDEBUG */
888         "add	x2, x29, #0x50\n\t"
889 #ifndef __APPLE__
890         "bl	fe_mul\n\t"
891 #else
892         "bl	_fe_mul\n\t"
893 #endif /* __APPLE__ */
894         "mov	x20, #9\n\t"
895 #ifndef NDEBUG
896         "add	x0, x29, #0x50\n\t"
897 #endif /* !NDEBUG */
898         "add	x1, x29, #0x50\n\t"
899         "\n"
900     "L_fe_invert4_%=: \n\t"
901 #ifndef __APPLE__
902         "bl	fe_sq\n\t"
903 #else
904         "bl	_fe_sq\n\t"
905 #endif /* __APPLE__ */
906         "subs	x20, x20, #1\n\t"
907         "bcs	L_fe_invert4_%=\n\t"
908         "add	x0, x29, #48\n\t"
909 #ifndef NDEBUG
910         "add	x1, x29, #0x50\n\t"
911 #endif /* !NDEBUG */
912         "add	x2, x29, #48\n\t"
913 #ifndef __APPLE__
914         "bl	fe_mul\n\t"
915 #else
916         "bl	_fe_mul\n\t"
917 #endif /* __APPLE__ */
918         "add	x0, x29, #0x50\n\t"
919         "add	x1, x29, #48\n\t"
920 #ifndef __APPLE__
921         "bl	fe_sq\n\t"
922 #else
923         "bl	_fe_sq\n\t"
924 #endif /* __APPLE__ */
925         "mov	x20, #48\n\t"
926 #ifndef NDEBUG
927         "add	x0, x29, #0x50\n\t"
928 #endif /* !NDEBUG */
929         "add	x1, x29, #0x50\n\t"
930         "\n"
931     "L_fe_invert5_%=: \n\t"
932 #ifndef __APPLE__
933         "bl	fe_sq\n\t"
934 #else
935         "bl	_fe_sq\n\t"
936 #endif /* __APPLE__ */
937         "subs	x20, x20, #1\n\t"
938         "bcs	L_fe_invert5_%=\n\t"
939 #ifndef NDEBUG
940         "add	x0, x29, #0x50\n\t"
941 #endif /* !NDEBUG */
942 #ifndef NDEBUG
943         "add	x1, x29, #0x50\n\t"
944 #endif /* !NDEBUG */
945         "add	x2, x29, #48\n\t"
946 #ifndef __APPLE__
947         "bl	fe_mul\n\t"
948 #else
949         "bl	_fe_mul\n\t"
950 #endif /* __APPLE__ */
951         "add	x0, x29, #0x70\n\t"
952 #ifndef NDEBUG
953         "add	x1, x29, #0x50\n\t"
954 #endif /* !NDEBUG */
955 #ifndef __APPLE__
956         "bl	fe_sq\n\t"
957 #else
958         "bl	_fe_sq\n\t"
959 #endif /* __APPLE__ */
960         "mov	x20, #0x62\n\t"
961 #ifndef NDEBUG
962         "add	x0, x29, #0x70\n\t"
963 #endif /* !NDEBUG */
964         "add	x1, x29, #0x70\n\t"
965         "\n"
966     "L_fe_invert6_%=: \n\t"
967 #ifndef __APPLE__
968         "bl	fe_sq\n\t"
969 #else
970         "bl	_fe_sq\n\t"
971 #endif /* __APPLE__ */
972         "subs	x20, x20, #1\n\t"
973         "bcs	L_fe_invert6_%=\n\t"
974         "add	x0, x29, #0x50\n\t"
975 #ifndef NDEBUG
976         "add	x1, x29, #0x70\n\t"
977 #endif /* !NDEBUG */
978         "add	x2, x29, #0x50\n\t"
979 #ifndef __APPLE__
980         "bl	fe_mul\n\t"
981 #else
982         "bl	_fe_mul\n\t"
983 #endif /* __APPLE__ */
984         "mov	x20, #49\n\t"
985 #ifndef NDEBUG
986         "add	x0, x29, #0x50\n\t"
987 #endif /* !NDEBUG */
988         "add	x1, x29, #0x50\n\t"
989         "\n"
990     "L_fe_invert7_%=: \n\t"
991 #ifndef __APPLE__
992         "bl	fe_sq\n\t"
993 #else
994         "bl	_fe_sq\n\t"
995 #endif /* __APPLE__ */
996         "subs	x20, x20, #1\n\t"
997         "bcs	L_fe_invert7_%=\n\t"
998         "add	x0, x29, #48\n\t"
999 #ifndef NDEBUG
1000         "add	x1, x29, #0x50\n\t"
1001 #endif /* !NDEBUG */
1002         "add	x2, x29, #48\n\t"
1003 #ifndef __APPLE__
1004         "bl	fe_mul\n\t"
1005 #else
1006         "bl	_fe_mul\n\t"
1007 #endif /* __APPLE__ */
1008         "mov	x20, #4\n\t"
1009 #ifndef NDEBUG
1010         "add	x0, x29, #48\n\t"
1011 #endif /* !NDEBUG */
1012         "add	x1, x29, #48\n\t"
1013         "\n"
1014     "L_fe_invert8_%=: \n\t"
1015 #ifndef __APPLE__
1016         "bl	fe_sq\n\t"
1017 #else
1018         "bl	_fe_sq\n\t"
1019 #endif /* __APPLE__ */
1020         "subs	x20, x20, #1\n\t"
1021         "bcs	L_fe_invert8_%=\n\t"
1022         "ldr	x0, [x29, #144]\n\t"
1023 #ifndef NDEBUG
1024         "add	x1, x29, #48\n\t"
1025 #endif /* !NDEBUG */
1026         "add	x2, x29, #16\n\t"
1027 #ifndef __APPLE__
1028         "bl	fe_mul\n\t"
1029 #else
1030         "bl	_fe_mul\n\t"
1031 #endif /* __APPLE__ */
1032         "ldp	x29, x30, [sp], #0xa0\n\t"
1033         : [r] "+r" (r), [a] "+r" (a)
1034         :
1035         : "memory", "x2", "x20"
1036     );
1037 }
1038 
curve25519(byte * r,const byte * n,const byte * a)1039 int curve25519(byte* r, const byte* n, const byte* a)
1040 {
1041     __asm__ __volatile__ (
1042         "stp	x29, x30, [sp, #-192]!\n\t"
1043         "add	x29, sp, #0\n\t"
1044         "mov	x23, xzr\n\t"
1045         "str	%x[r], [x29, #176]\n\t"
1046         "str	%x[a], [x29, #184]\n\t"
1047         /* Copy */
1048         "ldp	x6, x7, [%x[a]]\n\t"
1049         "ldp	x8, x9, [%x[a], #16]\n\t"
1050         "stp	x6, x7, [x29, #80]\n\t"
1051         "stp	x8, x9, [x29, #96]\n\t"
1052         /* Set one */
1053         "mov	%x[a], #1\n\t"
1054         "stp	%x[a], xzr, [%x[r]]\n\t"
1055         "stp	xzr, xzr, [%x[r], #16]\n\t"
1056         /* Set zero */
1057         "stp	xzr, xzr, [x29, #16]\n\t"
1058         "stp	xzr, xzr, [x29, #32]\n\t"
1059         /* Set one */
1060         "mov	%x[a], #1\n\t"
1061         "stp	%x[a], xzr, [x29, #48]\n\t"
1062         "stp	xzr, xzr, [x29, #64]\n\t"
1063         "mov	x25, #62\n\t"
1064         "mov	x24, #24\n\t"
1065         "\n"
1066     "L_curve25519_words_%=: \n\t"
1067         "\n"
1068     "L_curve25519_bits_%=: \n\t"
1069         "ldr	%x[a], [%x[n], x24]\n\t"
1070         "lsr	%x[a], %x[a], x25\n\t"
1071         "and	%x[a], %x[a], #1\n\t"
1072         "eor	x23, x23, %x[a]\n\t"
1073         /* Conditional Swap */
1074         "cmp	x23, #1\n\t"
1075         "ldp	x10, x11, [%x[r]]\n\t"
1076         "ldp	x12, x13, [%x[r], #16]\n\t"
1077         "ldp	x6, x7, [x29, #80]\n\t"
1078         "ldp	x8, x9, [x29, #96]\n\t"
1079         "csel	x14, x10, x6, eq\n\t"
1080         "csel	x10, x6, x10, eq\n\t"
1081         "csel	x15, x11, x7, eq\n\t"
1082         "csel	x11, x7, x11, eq\n\t"
1083         "csel	x16, x12, x8, eq\n\t"
1084         "csel	x12, x8, x12, eq\n\t"
1085         "csel	x17, x13, x9, eq\n\t"
1086         "csel	x13, x9, x13, eq\n\t"
1087         /* Conditional Swap */
1088         "cmp	x23, #1\n\t"
1089         "ldp	x19, x20, [x29, #16]\n\t"
1090         "ldp	x21, x22, [x29, #32]\n\t"
1091         "ldp	x6, x7, [x29, #48]\n\t"
1092         "ldp	x8, x9, [x29, #64]\n\t"
1093         "csel	x5, x19, x6, eq\n\t"
1094         "csel	x19, x6, x19, eq\n\t"
1095         "csel	x26, x20, x7, eq\n\t"
1096         "csel	x20, x7, x20, eq\n\t"
1097         "csel	x27, x21, x8, eq\n\t"
1098         "csel	x21, x8, x21, eq\n\t"
1099         "csel	x28, x22, x9, eq\n\t"
1100         "csel	x22, x9, x22, eq\n\t"
1101         "mov	x23, %x[a]\n\t"
1102         /* Add */
1103         "adds	x6, x10, x19\n\t"
1104         "adcs	x7, x11, x20\n\t"
1105         "adcs	x8, x12, x21\n\t"
1106         "adc	x9, x13, x22\n\t"
1107         "mov	x3, #-19\n\t"
1108         "asr	%x[a], x9, #63\n\t"
1109         /*   Mask the modulus */
1110         "and	x3, %x[a], x3\n\t"
1111         "and	x4, %x[a], #0x7fffffffffffffff\n\t"
1112         /*   Sub modulus (if overflow) */
1113         "subs	x6, x6, x3\n\t"
1114         "sbcs	x7, x7, %x[a]\n\t"
1115         "sbcs	x8, x8, %x[a]\n\t"
1116         "sbc	x9, x9, x4\n\t"
1117         /* Sub */
1118         "subs	x19, x10, x19\n\t"
1119         "sbcs	x20, x11, x20\n\t"
1120         "sbcs	x21, x12, x21\n\t"
1121         "sbcs	x22, x13, x22\n\t"
1122         "mov	x3, #-19\n\t"
1123         "csetm	%x[a], cc\n\t"
1124         /*   Mask the modulus */
1125         "and	x3, %x[a], x3\n\t"
1126         "and	x4, %x[a], #0x7fffffffffffffff\n\t"
1127         /*   Add modulus (if underflow) */
1128         "adds	x19, x19, x3\n\t"
1129         "adcs	x20, x20, %x[a]\n\t"
1130         "adcs	x21, x21, %x[a]\n\t"
1131         "adc	x22, x22, x4\n\t"
1132         "stp	x19, x20, [x29, #144]\n\t"
1133         "stp	x21, x22, [x29, #160]\n\t"
1134         /* Add */
1135         "adds	x10, x14, x5\n\t"
1136         "adcs	x11, x15, x26\n\t"
1137         "adcs	x12, x16, x27\n\t"
1138         "adc	x13, x17, x28\n\t"
1139         "mov	x3, #-19\n\t"
1140         "asr	%x[a], x13, #63\n\t"
1141         /*   Mask the modulus */
1142         "and	x3, %x[a], x3\n\t"
1143         "and	x4, %x[a], #0x7fffffffffffffff\n\t"
1144         /*   Sub modulus (if overflow) */
1145         "subs	x10, x10, x3\n\t"
1146         "sbcs	x11, x11, %x[a]\n\t"
1147         "sbcs	x12, x12, %x[a]\n\t"
1148         "sbc	x13, x13, x4\n\t"
1149         /* Sub */
1150         "subs	x14, x14, x5\n\t"
1151         "sbcs	x15, x15, x26\n\t"
1152         "sbcs	x16, x16, x27\n\t"
1153         "sbcs	x17, x17, x28\n\t"
1154         "mov	x3, #-19\n\t"
1155         "csetm	%x[a], cc\n\t"
1156         /*   Mask the modulus */
1157         "and	x3, %x[a], x3\n\t"
1158         "and	x4, %x[a], #0x7fffffffffffffff\n\t"
1159         /*   Add modulus (if underflow) */
1160         "adds	x14, x14, x3\n\t"
1161         "adcs	x15, x15, %x[a]\n\t"
1162         "adcs	x16, x16, %x[a]\n\t"
1163         "adc	x17, x17, x4\n\t"
1164         /* Multiply */
1165         /*  A[0] * B[0] */
1166         "mul	x19, x14, x6\n\t"
1167         "umulh	x20, x14, x6\n\t"
1168         /*  A[0] * B[1] */
1169         "mul	x3, x14, x7\n\t"
1170         "umulh	x21, x14, x7\n\t"
1171         "adds	x20, x20, x3\n\t"
1172         "adc	x21, x21, xzr\n\t"
1173         /*  A[1] * B[0] */
1174         "mul	x3, x15, x6\n\t"
1175         "umulh	x4, x15, x6\n\t"
1176         "adds	x20, x20, x3\n\t"
1177         "adcs	x21, x21, x4\n\t"
1178         "adc	x22, xzr, xzr\n\t"
1179         /*  A[0] * B[2] */
1180         "mul	x3, x14, x8\n\t"
1181         "umulh	x4, x14, x8\n\t"
1182         "adds	x21, x21, x3\n\t"
1183         "adc	x22, x22, x4\n\t"
1184         /*  A[1] * B[1] */
1185         "mul	x3, x15, x7\n\t"
1186         "umulh	x4, x15, x7\n\t"
1187         "adds	x21, x21, x3\n\t"
1188         "adcs	x22, x22, x4\n\t"
1189         "adc	%x[a], xzr, xzr\n\t"
1190         /*  A[2] * B[0] */
1191         "mul	x3, x16, x6\n\t"
1192         "umulh	x4, x16, x6\n\t"
1193         "adds	x21, x21, x3\n\t"
1194         "adcs	x22, x22, x4\n\t"
1195         "adc	%x[a], %x[a], xzr\n\t"
1196         /*  A[0] * B[3] */
1197         "mul	x3, x14, x9\n\t"
1198         "umulh	x4, x14, x9\n\t"
1199         "adds	x22, x22, x3\n\t"
1200         "adcs	%x[a], %x[a], x4\n\t"
1201         "adc	x26, xzr, xzr\n\t"
1202         /*  A[1] * B[2] */
1203         "mul	x3, x15, x8\n\t"
1204         "umulh	x4, x15, x8\n\t"
1205         "adds	x22, x22, x3\n\t"
1206         "adcs	%x[a], %x[a], x4\n\t"
1207         "adc	x26, x26, xzr\n\t"
1208         /*  A[2] * B[1] */
1209         "mul	x3, x16, x7\n\t"
1210         "umulh	x4, x16, x7\n\t"
1211         "adds	x22, x22, x3\n\t"
1212         "adcs	%x[a], %x[a], x4\n\t"
1213         "adc	x26, x26, xzr\n\t"
1214         /*  A[3] * B[0] */
1215         "mul	x3, x17, x6\n\t"
1216         "umulh	x4, x17, x6\n\t"
1217         "adds	x22, x22, x3\n\t"
1218         "adcs	%x[a], %x[a], x4\n\t"
1219         "adc	x26, x26, xzr\n\t"
1220         /*  A[1] * B[3] */
1221         "mul	x3, x15, x9\n\t"
1222         "umulh	x4, x15, x9\n\t"
1223         "adds	%x[a], %x[a], x3\n\t"
1224         "adcs	x26, x26, x4\n\t"
1225         "adc	x27, xzr, xzr\n\t"
1226         /*  A[2] * B[2] */
1227         "mul	x3, x16, x8\n\t"
1228         "umulh	x4, x16, x8\n\t"
1229         "adds	%x[a], %x[a], x3\n\t"
1230         "adcs	x26, x26, x4\n\t"
1231         "adc	x27, x27, xzr\n\t"
1232         /*  A[3] * B[1] */
1233         "mul	x3, x17, x7\n\t"
1234         "umulh	x4, x17, x7\n\t"
1235         "adds	%x[a], %x[a], x3\n\t"
1236         "adcs	x26, x26, x4\n\t"
1237         "adc	x27, x27, xzr\n\t"
1238         /*  A[2] * B[3] */
1239         "mul	x3, x16, x9\n\t"
1240         "umulh	x4, x16, x9\n\t"
1241         "adds	x26, x26, x3\n\t"
1242         "adcs	x27, x27, x4\n\t"
1243         "adc	x28, xzr, xzr\n\t"
1244         /*  A[3] * B[2] */
1245         "mul	x3, x17, x8\n\t"
1246         "umulh	x4, x17, x8\n\t"
1247         "adds	x26, x26, x3\n\t"
1248         "adcs	x27, x27, x4\n\t"
1249         "adc	x28, x28, xzr\n\t"
1250         /*  A[3] * B[3] */
1251         "mul	x3, x17, x9\n\t"
1252         "umulh	x4, x17, x9\n\t"
1253         "adds	x27, x27, x3\n\t"
1254         "adc	x28, x28, x4\n\t"
1255         /* Reduce */
1256         /*  Move top half into t4-t7 and remove top bit from t3 */
1257         "extr	x28, x28, x27, #63\n\t"
1258         "extr	x27, x27, x26, #63\n\t"
1259         "extr	x26, x26, %x[a], #63\n\t"
1260         "extr	%x[a], %x[a], x22, #63\n\t"
1261         "and	x22, x22, #0x7fffffffffffffff\n\t"
1262         /*  Multiply top half by 19 */
1263         "mov	x3, #19\n\t"
1264         "mul	x4, x3, %x[a]\n\t"
1265         "umulh	%x[a], x3, %x[a]\n\t"
1266         "adds	x19, x19, x4\n\t"
1267         "mul	x4, x3, x26\n\t"
1268         "umulh	x26, x3, x26\n\t"
1269         "adcs	x20, x20, x4\n\t"
1270         "mul	x4, x3, x27\n\t"
1271         "umulh	x27, x3, x27\n\t"
1272         "adcs	x21, x21, x4\n\t"
1273         "mul	x4, x3, x28\n\t"
1274         "umulh	x5, x3, x28\n\t"
1275         "adcs	x22, x22, x4\n\t"
1276         "adc	x5, x5, xzr\n\t"
1277         /*  Add remaining product results in */
1278         "adds	x20, x20, %x[a]\n\t"
1279         "adcs	x21, x21, x26\n\t"
1280         "adcs	x22, x22, x27\n\t"
1281         "adc	x5, x5, xzr\n\t"
1282         /*  Overflow */
1283         "extr	x5, x5, x22, #63\n\t"
1284         "mul	x5, x5, x3\n\t"
1285         "and	x22, x22, #0x7fffffffffffffff\n\t"
1286         "adds	x19, x19, x5\n\t"
1287         "adcs	x20, x20, xzr\n\t"
1288         "adcs	x21, x21, xzr\n\t"
1289         "adc	x22, x22, xzr\n\t"
1290         /* Reduce if top bit set */
1291         "and	x5, x3, x22, asr 63\n\t"
1292         "and	x22, x22, #0x7fffffffffffffff\n\t"
1293         "adds	x19, x19, x5\n\t"
1294         "adcs	x20, x20, xzr\n\t"
1295         "adcs	x21, x21, xzr\n\t"
1296         "adc	x22, x22, xzr\n\t"
1297         /* Store */
1298         "stp	x19, x20, [x29, #112]\n\t"
1299         "stp	x21, x22, [x29, #128]\n\t"
1300         /* Multiply */
1301         "ldp	%x[a], x26, [x29, #144]\n\t"
1302         "ldp	x27, x28, [x29, #160]\n\t"
1303         /*  A[0] * B[0] */
1304         "mul	x19, x10, %x[a]\n\t"
1305         "umulh	x20, x10, %x[a]\n\t"
1306         /*  A[0] * B[1] */
1307         "mul	x3, x10, x26\n\t"
1308         "umulh	x21, x10, x26\n\t"
1309         "adds	x20, x20, x3\n\t"
1310         "adc	x21, x21, xzr\n\t"
1311         /*  A[1] * B[0] */
1312         "mul	x3, x11, %x[a]\n\t"
1313         "umulh	x4, x11, %x[a]\n\t"
1314         "adds	x20, x20, x3\n\t"
1315         "adcs	x21, x21, x4\n\t"
1316         "adc	x22, xzr, xzr\n\t"
1317         /*  A[0] * B[2] */
1318         "mul	x3, x10, x27\n\t"
1319         "umulh	x4, x10, x27\n\t"
1320         "adds	x21, x21, x3\n\t"
1321         "adc	x22, x22, x4\n\t"
1322         /*  A[1] * B[1] */
1323         "mul	x3, x11, x26\n\t"
1324         "umulh	x4, x11, x26\n\t"
1325         "adds	x21, x21, x3\n\t"
1326         "adcs	x22, x22, x4\n\t"
1327         "adc	x14, xzr, xzr\n\t"
1328         /*  A[2] * B[0] */
1329         "mul	x3, x12, %x[a]\n\t"
1330         "umulh	x4, x12, %x[a]\n\t"
1331         "adds	x21, x21, x3\n\t"
1332         "adcs	x22, x22, x4\n\t"
1333         "adc	x14, x14, xzr\n\t"
1334         /*  A[0] * B[3] */
1335         "mul	x3, x10, x28\n\t"
1336         "umulh	x4, x10, x28\n\t"
1337         "adds	x22, x22, x3\n\t"
1338         "adcs	x14, x14, x4\n\t"
1339         "adc	x15, xzr, xzr\n\t"
1340         /*  A[1] * B[2] */
1341         "mul	x3, x11, x27\n\t"
1342         "umulh	x4, x11, x27\n\t"
1343         "adds	x22, x22, x3\n\t"
1344         "adcs	x14, x14, x4\n\t"
1345         "adc	x15, x15, xzr\n\t"
1346         /*  A[2] * B[1] */
1347         "mul	x3, x12, x26\n\t"
1348         "umulh	x4, x12, x26\n\t"
1349         "adds	x22, x22, x3\n\t"
1350         "adcs	x14, x14, x4\n\t"
1351         "adc	x15, x15, xzr\n\t"
1352         /*  A[3] * B[0] */
1353         "mul	x3, x13, %x[a]\n\t"
1354         "umulh	x4, x13, %x[a]\n\t"
1355         "adds	x22, x22, x3\n\t"
1356         "adcs	x14, x14, x4\n\t"
1357         "adc	x15, x15, xzr\n\t"
1358         /*  A[1] * B[3] */
1359         "mul	x3, x11, x28\n\t"
1360         "umulh	x4, x11, x28\n\t"
1361         "adds	x14, x14, x3\n\t"
1362         "adcs	x15, x15, x4\n\t"
1363         "adc	x16, xzr, xzr\n\t"
1364         /*  A[2] * B[2] */
1365         "mul	x3, x12, x27\n\t"
1366         "umulh	x4, x12, x27\n\t"
1367         "adds	x14, x14, x3\n\t"
1368         "adcs	x15, x15, x4\n\t"
1369         "adc	x16, x16, xzr\n\t"
1370         /*  A[3] * B[1] */
1371         "mul	x3, x13, x26\n\t"
1372         "umulh	x4, x13, x26\n\t"
1373         "adds	x14, x14, x3\n\t"
1374         "adcs	x15, x15, x4\n\t"
1375         "adc	x16, x16, xzr\n\t"
1376         /*  A[2] * B[3] */
1377         "mul	x3, x12, x28\n\t"
1378         "umulh	x4, x12, x28\n\t"
1379         "adds	x15, x15, x3\n\t"
1380         "adcs	x16, x16, x4\n\t"
1381         "adc	x17, xzr, xzr\n\t"
1382         /*  A[3] * B[2] */
1383         "mul	x3, x13, x27\n\t"
1384         "umulh	x4, x13, x27\n\t"
1385         "adds	x15, x15, x3\n\t"
1386         "adcs	x16, x16, x4\n\t"
1387         "adc	x17, x17, xzr\n\t"
1388         /*  A[3] * B[3] */
1389         "mul	x3, x13, x28\n\t"
1390         "umulh	x4, x13, x28\n\t"
1391         "adds	x16, x16, x3\n\t"
1392         "adc	x17, x17, x4\n\t"
1393         /* Reduce */
1394         /*  Move top half into t4-t7 and remove top bit from t3 */
1395         "extr	x17, x17, x16, #63\n\t"
1396         "extr	x16, x16, x15, #63\n\t"
1397         "extr	x15, x15, x14, #63\n\t"
1398         "extr	x14, x14, x22, #63\n\t"
1399         "and	x22, x22, #0x7fffffffffffffff\n\t"
1400         /*  Multiply top half by 19 */
1401         "mov	x3, #19\n\t"
1402         "mul	x4, x3, x14\n\t"
1403         "umulh	x14, x3, x14\n\t"
1404         "adds	x19, x19, x4\n\t"
1405         "mul	x4, x3, x15\n\t"
1406         "umulh	x15, x3, x15\n\t"
1407         "adcs	x20, x20, x4\n\t"
1408         "mul	x4, x3, x16\n\t"
1409         "umulh	x16, x3, x16\n\t"
1410         "adcs	x21, x21, x4\n\t"
1411         "mul	x4, x3, x17\n\t"
1412         "umulh	x5, x3, x17\n\t"
1413         "adcs	x22, x22, x4\n\t"
1414         "adc	x5, x5, xzr\n\t"
1415         /*  Add remaining product results in */
1416         "adds	x20, x20, x14\n\t"
1417         "adcs	x21, x21, x15\n\t"
1418         "adcs	x22, x22, x16\n\t"
1419         "adc	x5, x5, xzr\n\t"
1420         /*  Overflow */
1421         "extr	x5, x5, x22, #63\n\t"
1422         "mul	x5, x5, x3\n\t"
1423         "and	x22, x22, #0x7fffffffffffffff\n\t"
1424         "adds	x19, x19, x5\n\t"
1425         "adcs	x20, x20, xzr\n\t"
1426         "adcs	x21, x21, xzr\n\t"
1427         "adc	x22, x22, xzr\n\t"
1428         /* Reduce if top bit set */
1429         "and	x5, x3, x22, asr 63\n\t"
1430         "and	x22, x22, #0x7fffffffffffffff\n\t"
1431         "adds	x19, x19, x5\n\t"
1432         "adcs	x20, x20, xzr\n\t"
1433         "adcs	x21, x21, xzr\n\t"
1434         "adc	x22, x22, xzr\n\t"
1435         /* Store */
1436         /* Square */
1437         /*  A[0] * A[1] */
1438         "mul	x11, %x[a], x26\n\t"
1439         "umulh	x12, %x[a], x26\n\t"
1440         /*  A[0] * A[2] */
1441         "mul	x3, %x[a], x27\n\t"
1442         "umulh	x13, %x[a], x27\n\t"
1443         "adds	x12, x12, x3\n\t"
1444         "adc	x13, x13, xzr\n\t"
1445         /*  A[0] * A[3] */
1446         "mul	x3, %x[a], x28\n\t"
1447         "umulh	x14, %x[a], x28\n\t"
1448         "adds	x13, x13, x3\n\t"
1449         "adc	x14, x14, xzr\n\t"
1450         /*  A[1] * A[2] */
1451         "mul	x3, x26, x27\n\t"
1452         "umulh	x4, x26, x27\n\t"
1453         "adds	x13, x13, x3\n\t"
1454         "adcs	x14, x14, x4\n\t"
1455         "adc	x15, xzr, xzr\n\t"
1456         /*  A[1] * A[3] */
1457         "mul	x3, x26, x28\n\t"
1458         "umulh	x4, x26, x28\n\t"
1459         "adds	x14, x14, x3\n\t"
1460         "adc	x15, x15, x4\n\t"
1461         /*  A[2] * A[3] */
1462         "mul	x3, x27, x28\n\t"
1463         "umulh	x16, x27, x28\n\t"
1464         "adds	x15, x15, x3\n\t"
1465         "adc	x16, x16, xzr\n\t"
1466         /* Double */
1467         "adds	x11, x11, x11\n\t"
1468         "adcs	x12, x12, x12\n\t"
1469         "adcs	x13, x13, x13\n\t"
1470         "adcs	x14, x14, x14\n\t"
1471         "adcs	x15, x15, x15\n\t"
1472         "adcs	x16, x16, x16\n\t"
1473         "adc	x17, xzr, xzr\n\t"
1474         /*  A[0] * A[0] */
1475         "mul	x10, %x[a], %x[a]\n\t"
1476         "umulh	x5, %x[a], %x[a]\n\t"
1477         /*  A[1] * A[1] */
1478         "mul	x3, x26, x26\n\t"
1479         "umulh	x4, x26, x26\n\t"
1480         "adds	x11, x11, x5\n\t"
1481         "adcs	x12, x12, x3\n\t"
1482         "adc	x5, x4, xzr\n\t"
1483         /*  A[2] * A[2] */
1484         "mul	x3, x27, x27\n\t"
1485         "umulh	x4, x27, x27\n\t"
1486         "adds	x13, x13, x5\n\t"
1487         "adcs	x14, x14, x3\n\t"
1488         "adc	x5, x4, xzr\n\t"
1489         /*  A[3] * A[3] */
1490         "mul	x3, x28, x28\n\t"
1491         "umulh	x4, x28, x28\n\t"
1492         "adds	x15, x15, x5\n\t"
1493         "adcs	x16, x16, x3\n\t"
1494         "adc	x17, x17, x4\n\t"
1495         /* Reduce */
1496         /*  Move top half into t4-t7 and remove top bit from t3 */
1497         "extr	x17, x17, x16, #63\n\t"
1498         "extr	x16, x16, x15, #63\n\t"
1499         "extr	x15, x15, x14, #63\n\t"
1500         "extr	x14, x14, x13, #63\n\t"
1501         "and	x13, x13, #0x7fffffffffffffff\n\t"
1502         /*  Multiply top half by 19 */
1503         "mov	x3, #19\n\t"
1504         "mul	x4, x3, x14\n\t"
1505         "umulh	x14, x3, x14\n\t"
1506         "adds	x10, x10, x4\n\t"
1507         "mul	x4, x3, x15\n\t"
1508         "umulh	x15, x3, x15\n\t"
1509         "adcs	x11, x11, x4\n\t"
1510         "mul	x4, x3, x16\n\t"
1511         "umulh	x16, x3, x16\n\t"
1512         "adcs	x12, x12, x4\n\t"
1513         "mul	x4, x3, x17\n\t"
1514         "umulh	x5, x3, x17\n\t"
1515         "adcs	x13, x13, x4\n\t"
1516         "adc	x5, x5, xzr\n\t"
1517         /*  Add remaining product results in */
1518         "adds	x11, x11, x14\n\t"
1519         "adcs	x12, x12, x15\n\t"
1520         "adcs	x13, x13, x16\n\t"
1521         "adc	x5, x5, xzr\n\t"
1522         /*  Overflow */
1523         "extr	x5, x5, x13, #63\n\t"
1524         "mul	x5, x5, x3\n\t"
1525         "and	x13, x13, #0x7fffffffffffffff\n\t"
1526         "adds	x10, x10, x5\n\t"
1527         "adcs	x11, x11, xzr\n\t"
1528         "adcs	x12, x12, xzr\n\t"
1529         "adc	x13, x13, xzr\n\t"
1530         /* Reduce if top bit set */
1531         "and	x5, x3, x13, asr 63\n\t"
1532         "and	x13, x13, #0x7fffffffffffffff\n\t"
1533         "adds	x10, x10, x5\n\t"
1534         "adcs	x11, x11, xzr\n\t"
1535         "adcs	x12, x12, xzr\n\t"
1536         "adc	x13, x13, xzr\n\t"
1537         /* Store */
1538         /* Square */
1539         /*  A[0] * A[1] */
1540         "mul	x15, x6, x7\n\t"
1541         "umulh	x16, x6, x7\n\t"
1542         /*  A[0] * A[2] */
1543         "mul	x3, x6, x8\n\t"
1544         "umulh	x17, x6, x8\n\t"
1545         "adds	x16, x16, x3\n\t"
1546         "adc	x17, x17, xzr\n\t"
1547         /*  A[0] * A[3] */
1548         "mul	x3, x6, x9\n\t"
1549         "umulh	%x[a], x6, x9\n\t"
1550         "adds	x17, x17, x3\n\t"
1551         "adc	%x[a], %x[a], xzr\n\t"
1552         /*  A[1] * A[2] */
1553         "mul	x3, x7, x8\n\t"
1554         "umulh	x4, x7, x8\n\t"
1555         "adds	x17, x17, x3\n\t"
1556         "adcs	%x[a], %x[a], x4\n\t"
1557         "adc	x26, xzr, xzr\n\t"
1558         /*  A[1] * A[3] */
1559         "mul	x3, x7, x9\n\t"
1560         "umulh	x4, x7, x9\n\t"
1561         "adds	%x[a], %x[a], x3\n\t"
1562         "adc	x26, x26, x4\n\t"
1563         /*  A[2] * A[3] */
1564         "mul	x3, x8, x9\n\t"
1565         "umulh	x27, x8, x9\n\t"
1566         "adds	x26, x26, x3\n\t"
1567         "adc	x27, x27, xzr\n\t"
1568         /* Double */
1569         "adds	x15, x15, x15\n\t"
1570         "adcs	x16, x16, x16\n\t"
1571         "adcs	x17, x17, x17\n\t"
1572         "adcs	%x[a], %x[a], %x[a]\n\t"
1573         "adcs	x26, x26, x26\n\t"
1574         "adcs	x27, x27, x27\n\t"
1575         "adc	x28, xzr, xzr\n\t"
1576         /*  A[0] * A[0] */
1577         "mul	x14, x6, x6\n\t"
1578         "umulh	x5, x6, x6\n\t"
1579         /*  A[1] * A[1] */
1580         "mul	x3, x7, x7\n\t"
1581         "umulh	x4, x7, x7\n\t"
1582         "adds	x15, x15, x5\n\t"
1583         "adcs	x16, x16, x3\n\t"
1584         "adc	x5, x4, xzr\n\t"
1585         /*  A[2] * A[2] */
1586         "mul	x3, x8, x8\n\t"
1587         "umulh	x4, x8, x8\n\t"
1588         "adds	x17, x17, x5\n\t"
1589         "adcs	%x[a], %x[a], x3\n\t"
1590         "adc	x5, x4, xzr\n\t"
1591         /*  A[3] * A[3] */
1592         "mul	x3, x9, x9\n\t"
1593         "umulh	x4, x9, x9\n\t"
1594         "adds	x26, x26, x5\n\t"
1595         "adcs	x27, x27, x3\n\t"
1596         "adc	x28, x28, x4\n\t"
1597         /* Reduce */
1598         /*  Move top half into t4-t7 and remove top bit from t3 */
1599         "extr	x28, x28, x27, #63\n\t"
1600         "extr	x27, x27, x26, #63\n\t"
1601         "extr	x26, x26, %x[a], #63\n\t"
1602         "extr	%x[a], %x[a], x17, #63\n\t"
1603         "and	x17, x17, #0x7fffffffffffffff\n\t"
1604         /*  Multiply top half by 19 */
1605         "mov	x3, #19\n\t"
1606         "mul	x4, x3, %x[a]\n\t"
1607         "umulh	%x[a], x3, %x[a]\n\t"
1608         "adds	x14, x14, x4\n\t"
1609         "mul	x4, x3, x26\n\t"
1610         "umulh	x26, x3, x26\n\t"
1611         "adcs	x15, x15, x4\n\t"
1612         "mul	x4, x3, x27\n\t"
1613         "umulh	x27, x3, x27\n\t"
1614         "adcs	x16, x16, x4\n\t"
1615         "mul	x4, x3, x28\n\t"
1616         "umulh	x5, x3, x28\n\t"
1617         "adcs	x17, x17, x4\n\t"
1618         "adc	x5, x5, xzr\n\t"
1619         /*  Add remaining product results in */
1620         "adds	x15, x15, %x[a]\n\t"
1621         "adcs	x16, x16, x26\n\t"
1622         "adcs	x17, x17, x27\n\t"
1623         "adc	x5, x5, xzr\n\t"
1624         /*  Overflow */
1625         "extr	x5, x5, x17, #63\n\t"
1626         "mul	x5, x5, x3\n\t"
1627         "and	x17, x17, #0x7fffffffffffffff\n\t"
1628         "adds	x14, x14, x5\n\t"
1629         "adcs	x15, x15, xzr\n\t"
1630         "adcs	x16, x16, xzr\n\t"
1631         "adc	x17, x17, xzr\n\t"
1632         /* Reduce if top bit set */
1633         "and	x5, x3, x17, asr 63\n\t"
1634         "and	x17, x17, #0x7fffffffffffffff\n\t"
1635         "adds	x14, x14, x5\n\t"
1636         "adcs	x15, x15, xzr\n\t"
1637         "adcs	x16, x16, xzr\n\t"
1638         "adc	x17, x17, xzr\n\t"
1639         /* Store */
1640         /* Multiply */
1641         /*  A[0] * B[0] */
1642         "mul	x6, x14, x10\n\t"
1643         "umulh	x7, x14, x10\n\t"
1644         /*  A[0] * B[1] */
1645         "mul	x3, x14, x11\n\t"
1646         "umulh	x8, x14, x11\n\t"
1647         "adds	x7, x7, x3\n\t"
1648         "adc	x8, x8, xzr\n\t"
1649         /*  A[1] * B[0] */
1650         "mul	x3, x15, x10\n\t"
1651         "umulh	x4, x15, x10\n\t"
1652         "adds	x7, x7, x3\n\t"
1653         "adcs	x8, x8, x4\n\t"
1654         "adc	x9, xzr, xzr\n\t"
1655         /*  A[0] * B[2] */
1656         "mul	x3, x14, x12\n\t"
1657         "umulh	x4, x14, x12\n\t"
1658         "adds	x8, x8, x3\n\t"
1659         "adc	x9, x9, x4\n\t"
1660         /*  A[1] * B[1] */
1661         "mul	x3, x15, x11\n\t"
1662         "umulh	x4, x15, x11\n\t"
1663         "adds	x8, x8, x3\n\t"
1664         "adcs	x9, x9, x4\n\t"
1665         "adc	%x[a], xzr, xzr\n\t"
1666         /*  A[2] * B[0] */
1667         "mul	x3, x16, x10\n\t"
1668         "umulh	x4, x16, x10\n\t"
1669         "adds	x8, x8, x3\n\t"
1670         "adcs	x9, x9, x4\n\t"
1671         "adc	%x[a], %x[a], xzr\n\t"
1672         /*  A[0] * B[3] */
1673         "mul	x3, x14, x13\n\t"
1674         "umulh	x4, x14, x13\n\t"
1675         "adds	x9, x9, x3\n\t"
1676         "adcs	%x[a], %x[a], x4\n\t"
1677         "adc	x26, xzr, xzr\n\t"
1678         /*  A[1] * B[2] */
1679         "mul	x3, x15, x12\n\t"
1680         "umulh	x4, x15, x12\n\t"
1681         "adds	x9, x9, x3\n\t"
1682         "adcs	%x[a], %x[a], x4\n\t"
1683         "adc	x26, x26, xzr\n\t"
1684         /*  A[2] * B[1] */
1685         "mul	x3, x16, x11\n\t"
1686         "umulh	x4, x16, x11\n\t"
1687         "adds	x9, x9, x3\n\t"
1688         "adcs	%x[a], %x[a], x4\n\t"
1689         "adc	x26, x26, xzr\n\t"
1690         /*  A[3] * B[0] */
1691         "mul	x3, x17, x10\n\t"
1692         "umulh	x4, x17, x10\n\t"
1693         "adds	x9, x9, x3\n\t"
1694         "adcs	%x[a], %x[a], x4\n\t"
1695         "adc	x26, x26, xzr\n\t"
1696         /*  A[1] * B[3] */
1697         "mul	x3, x15, x13\n\t"
1698         "umulh	x4, x15, x13\n\t"
1699         "adds	%x[a], %x[a], x3\n\t"
1700         "adcs	x26, x26, x4\n\t"
1701         "adc	x27, xzr, xzr\n\t"
1702         /*  A[2] * B[2] */
1703         "mul	x3, x16, x12\n\t"
1704         "umulh	x4, x16, x12\n\t"
1705         "adds	%x[a], %x[a], x3\n\t"
1706         "adcs	x26, x26, x4\n\t"
1707         "adc	x27, x27, xzr\n\t"
1708         /*  A[3] * B[1] */
1709         "mul	x3, x17, x11\n\t"
1710         "umulh	x4, x17, x11\n\t"
1711         "adds	%x[a], %x[a], x3\n\t"
1712         "adcs	x26, x26, x4\n\t"
1713         "adc	x27, x27, xzr\n\t"
1714         /*  A[2] * B[3] */
1715         "mul	x3, x16, x13\n\t"
1716         "umulh	x4, x16, x13\n\t"
1717         "adds	x26, x26, x3\n\t"
1718         "adcs	x27, x27, x4\n\t"
1719         "adc	x28, xzr, xzr\n\t"
1720         /*  A[3] * B[2] */
1721         "mul	x3, x17, x12\n\t"
1722         "umulh	x4, x17, x12\n\t"
1723         "adds	x26, x26, x3\n\t"
1724         "adcs	x27, x27, x4\n\t"
1725         "adc	x28, x28, xzr\n\t"
1726         /*  A[3] * B[3] */
1727         "mul	x3, x17, x13\n\t"
1728         "umulh	x4, x17, x13\n\t"
1729         "adds	x27, x27, x3\n\t"
1730         "adc	x28, x28, x4\n\t"
1731         /* Reduce */
1732         /*  Move top half into t4-t7 and remove top bit from t3 */
1733         "extr	x28, x28, x27, #63\n\t"
1734         "extr	x27, x27, x26, #63\n\t"
1735         "extr	x26, x26, %x[a], #63\n\t"
1736         "extr	%x[a], %x[a], x9, #63\n\t"
1737         "and	x9, x9, #0x7fffffffffffffff\n\t"
1738         /*  Multiply top half by 19 */
1739         "mov	x3, #19\n\t"
1740         "mul	x4, x3, %x[a]\n\t"
1741         "umulh	%x[a], x3, %x[a]\n\t"
1742         "adds	x6, x6, x4\n\t"
1743         "mul	x4, x3, x26\n\t"
1744         "umulh	x26, x3, x26\n\t"
1745         "adcs	x7, x7, x4\n\t"
1746         "mul	x4, x3, x27\n\t"
1747         "umulh	x27, x3, x27\n\t"
1748         "adcs	x8, x8, x4\n\t"
1749         "mul	x4, x3, x28\n\t"
1750         "umulh	x5, x3, x28\n\t"
1751         "adcs	x9, x9, x4\n\t"
1752         "adc	x5, x5, xzr\n\t"
1753         /*  Add remaining product results in */
1754         "adds	x7, x7, %x[a]\n\t"
1755         "adcs	x8, x8, x26\n\t"
1756         "adcs	x9, x9, x27\n\t"
1757         "adc	x5, x5, xzr\n\t"
1758         /*  Overflow */
1759         "extr	x5, x5, x9, #63\n\t"
1760         "mul	x5, x5, x3\n\t"
1761         "and	x9, x9, #0x7fffffffffffffff\n\t"
1762         "adds	x6, x6, x5\n\t"
1763         "adcs	x7, x7, xzr\n\t"
1764         "adcs	x8, x8, xzr\n\t"
1765         "adc	x9, x9, xzr\n\t"
1766         /* Reduce if top bit set */
1767         "and	x5, x3, x9, asr 63\n\t"
1768         "and	x9, x9, #0x7fffffffffffffff\n\t"
1769         "adds	x6, x6, x5\n\t"
1770         "adcs	x7, x7, xzr\n\t"
1771         "adcs	x8, x8, xzr\n\t"
1772         "adc	x9, x9, xzr\n\t"
1773         /* Store */
1774         "stp	x6, x7, [%x[r]]\n\t"
1775         "stp	x8, x9, [%x[r], #16]\n\t"
1776         /* Sub */
1777         "subs	x14, x14, x10\n\t"
1778         "sbcs	x15, x15, x11\n\t"
1779         "sbcs	x16, x16, x12\n\t"
1780         "sbcs	x17, x17, x13\n\t"
1781         "mov	x3, #-19\n\t"
1782         "csetm	%x[a], cc\n\t"
1783         /*   Mask the modulus */
1784         "and	x3, %x[a], x3\n\t"
1785         "and	x4, %x[a], #0x7fffffffffffffff\n\t"
1786         /*   Add modulus (if underflow) */
1787         "adds	x14, x14, x3\n\t"
1788         "adcs	x15, x15, %x[a]\n\t"
1789         "adcs	x16, x16, %x[a]\n\t"
1790         "adc	x17, x17, x4\n\t"
1791         /* Multiply by 121666 */
1792         "mov	x5, #0xdb42\n\t"
1793         "movk	x5, #1, lsl 16\n\t"
1794         "mul	x6, x14, x5\n\t"
1795         "umulh	x7, x14, x5\n\t"
1796         "mul	x3, x15, x5\n\t"
1797         "umulh	x4, x15, x5\n\t"
1798         "adds	x7, x7, x3\n\t"
1799         "adc	x8, xzr, x4\n\t"
1800         "mul	x3, x16, x5\n\t"
1801         "umulh	x4, x16, x5\n\t"
1802         "adds	x8, x8, x3\n\t"
1803         "adc	x9, xzr, x4\n\t"
1804         "mul	x3, x17, x5\n\t"
1805         "umulh	x4, x17, x5\n\t"
1806         "adds	x9, x9, x3\n\t"
1807         "adc	x4, xzr, x4\n\t"
1808         "mov	x5, #19\n\t"
1809         "extr	x4, x4, x9, #63\n\t"
1810         "mul	x4, x4, x5\n\t"
1811         "and	x9, x9, #0x7fffffffffffffff\n\t"
1812         "adds	x6, x6, x4\n\t"
1813         "adcs	x7, x7, xzr\n\t"
1814         "adcs	x8, x8, xzr\n\t"
1815         "adc	x9, x9, xzr\n\t"
1816         /* Add */
1817         "adds	x10, x10, x6\n\t"
1818         "adcs	x11, x11, x7\n\t"
1819         "adcs	x12, x12, x8\n\t"
1820         "adc	x13, x13, x9\n\t"
1821         "mov	x3, #-19\n\t"
1822         "asr	%x[a], x13, #63\n\t"
1823         /*   Mask the modulus */
1824         "and	x3, %x[a], x3\n\t"
1825         "and	x4, %x[a], #0x7fffffffffffffff\n\t"
1826         /*   Sub modulus (if overflow) */
1827         "subs	x10, x10, x3\n\t"
1828         "sbcs	x11, x11, %x[a]\n\t"
1829         "sbcs	x12, x12, %x[a]\n\t"
1830         "sbc	x13, x13, x4\n\t"
1831         /* Multiply */
1832         /*  A[0] * B[0] */
1833         "mul	x6, x14, x10\n\t"
1834         "umulh	x7, x14, x10\n\t"
1835         /*  A[0] * B[1] */
1836         "mul	x3, x14, x11\n\t"
1837         "umulh	x8, x14, x11\n\t"
1838         "adds	x7, x7, x3\n\t"
1839         "adc	x8, x8, xzr\n\t"
1840         /*  A[1] * B[0] */
1841         "mul	x3, x15, x10\n\t"
1842         "umulh	x4, x15, x10\n\t"
1843         "adds	x7, x7, x3\n\t"
1844         "adcs	x8, x8, x4\n\t"
1845         "adc	x9, xzr, xzr\n\t"
1846         /*  A[0] * B[2] */
1847         "mul	x3, x14, x12\n\t"
1848         "umulh	x4, x14, x12\n\t"
1849         "adds	x8, x8, x3\n\t"
1850         "adc	x9, x9, x4\n\t"
1851         /*  A[1] * B[1] */
1852         "mul	x3, x15, x11\n\t"
1853         "umulh	x4, x15, x11\n\t"
1854         "adds	x8, x8, x3\n\t"
1855         "adcs	x9, x9, x4\n\t"
1856         "adc	%x[a], xzr, xzr\n\t"
1857         /*  A[2] * B[0] */
1858         "mul	x3, x16, x10\n\t"
1859         "umulh	x4, x16, x10\n\t"
1860         "adds	x8, x8, x3\n\t"
1861         "adcs	x9, x9, x4\n\t"
1862         "adc	%x[a], %x[a], xzr\n\t"
1863         /*  A[0] * B[3] */
1864         "mul	x3, x14, x13\n\t"
1865         "umulh	x4, x14, x13\n\t"
1866         "adds	x9, x9, x3\n\t"
1867         "adcs	%x[a], %x[a], x4\n\t"
1868         "adc	x26, xzr, xzr\n\t"
1869         /*  A[1] * B[2] */
1870         "mul	x3, x15, x12\n\t"
1871         "umulh	x4, x15, x12\n\t"
1872         "adds	x9, x9, x3\n\t"
1873         "adcs	%x[a], %x[a], x4\n\t"
1874         "adc	x26, x26, xzr\n\t"
1875         /*  A[2] * B[1] */
1876         "mul	x3, x16, x11\n\t"
1877         "umulh	x4, x16, x11\n\t"
1878         "adds	x9, x9, x3\n\t"
1879         "adcs	%x[a], %x[a], x4\n\t"
1880         "adc	x26, x26, xzr\n\t"
1881         /*  A[3] * B[0] */
1882         "mul	x3, x17, x10\n\t"
1883         "umulh	x4, x17, x10\n\t"
1884         "adds	x9, x9, x3\n\t"
1885         "adcs	%x[a], %x[a], x4\n\t"
1886         "adc	x26, x26, xzr\n\t"
1887         /*  A[1] * B[3] */
1888         "mul	x3, x15, x13\n\t"
1889         "umulh	x4, x15, x13\n\t"
1890         "adds	%x[a], %x[a], x3\n\t"
1891         "adcs	x26, x26, x4\n\t"
1892         "adc	x27, xzr, xzr\n\t"
1893         /*  A[2] * B[2] */
1894         "mul	x3, x16, x12\n\t"
1895         "umulh	x4, x16, x12\n\t"
1896         "adds	%x[a], %x[a], x3\n\t"
1897         "adcs	x26, x26, x4\n\t"
1898         "adc	x27, x27, xzr\n\t"
1899         /*  A[3] * B[1] */
1900         "mul	x3, x17, x11\n\t"
1901         "umulh	x4, x17, x11\n\t"
1902         "adds	%x[a], %x[a], x3\n\t"
1903         "adcs	x26, x26, x4\n\t"
1904         "adc	x27, x27, xzr\n\t"
1905         /*  A[2] * B[3] */
1906         "mul	x3, x16, x13\n\t"
1907         "umulh	x4, x16, x13\n\t"
1908         "adds	x26, x26, x3\n\t"
1909         "adcs	x27, x27, x4\n\t"
1910         "adc	x28, xzr, xzr\n\t"
1911         /*  A[3] * B[2] */
1912         "mul	x3, x17, x12\n\t"
1913         "umulh	x4, x17, x12\n\t"
1914         "adds	x26, x26, x3\n\t"
1915         "adcs	x27, x27, x4\n\t"
1916         "adc	x28, x28, xzr\n\t"
1917         /*  A[3] * B[3] */
1918         "mul	x3, x17, x13\n\t"
1919         "umulh	x4, x17, x13\n\t"
1920         "adds	x27, x27, x3\n\t"
1921         "adc	x28, x28, x4\n\t"
1922         /* Reduce */
1923         /*  Move top half into t4-t7 and remove top bit from t3 */
1924         "extr	x28, x28, x27, #63\n\t"
1925         "extr	x27, x27, x26, #63\n\t"
1926         "extr	x26, x26, %x[a], #63\n\t"
1927         "extr	%x[a], %x[a], x9, #63\n\t"
1928         "and	x9, x9, #0x7fffffffffffffff\n\t"
1929         /*  Multiply top half by 19 */
1930         "mov	x3, #19\n\t"
1931         "mul	x4, x3, %x[a]\n\t"
1932         "umulh	%x[a], x3, %x[a]\n\t"
1933         "adds	x6, x6, x4\n\t"
1934         "mul	x4, x3, x26\n\t"
1935         "umulh	x26, x3, x26\n\t"
1936         "adcs	x7, x7, x4\n\t"
1937         "mul	x4, x3, x27\n\t"
1938         "umulh	x27, x3, x27\n\t"
1939         "adcs	x8, x8, x4\n\t"
1940         "mul	x4, x3, x28\n\t"
1941         "umulh	x5, x3, x28\n\t"
1942         "adcs	x9, x9, x4\n\t"
1943         "adc	x5, x5, xzr\n\t"
1944         /*  Add remaining product results in */
1945         "adds	x7, x7, %x[a]\n\t"
1946         "adcs	x8, x8, x26\n\t"
1947         "adcs	x9, x9, x27\n\t"
1948         "adc	x5, x5, xzr\n\t"
1949         /*  Overflow */
1950         "extr	x5, x5, x9, #63\n\t"
1951         "mul	x5, x5, x3\n\t"
1952         "and	x9, x9, #0x7fffffffffffffff\n\t"
1953         "adds	x6, x6, x5\n\t"
1954         "adcs	x7, x7, xzr\n\t"
1955         "adcs	x8, x8, xzr\n\t"
1956         "adc	x9, x9, xzr\n\t"
1957         /* Reduce if top bit set */
1958         "and	x5, x3, x9, asr 63\n\t"
1959         "and	x9, x9, #0x7fffffffffffffff\n\t"
1960         "adds	x6, x6, x5\n\t"
1961         "adcs	x7, x7, xzr\n\t"
1962         "adcs	x8, x8, xzr\n\t"
1963         "adc	x9, x9, xzr\n\t"
1964         /* Store */
1965         "stp	x6, x7, [x29, #16]\n\t"
1966         "stp	x8, x9, [x29, #32]\n\t"
1967         /* Add */
1968         "ldp	x6, x7, [x29, #112]\n\t"
1969         "ldp	x8, x9, [x29, #128]\n\t"
1970         "adds	x10, x6, x19\n\t"
1971         "adcs	x11, x7, x20\n\t"
1972         "adcs	x12, x8, x21\n\t"
1973         "adc	x13, x9, x22\n\t"
1974         "mov	x3, #-19\n\t"
1975         "asr	%x[a], x13, #63\n\t"
1976         /*   Mask the modulus */
1977         "and	x3, %x[a], x3\n\t"
1978         "and	x4, %x[a], #0x7fffffffffffffff\n\t"
1979         /*   Sub modulus (if overflow) */
1980         "subs	x10, x10, x3\n\t"
1981         "sbcs	x11, x11, %x[a]\n\t"
1982         "sbcs	x12, x12, %x[a]\n\t"
1983         "sbc	x13, x13, x4\n\t"
1984         /* Sub */
1985         "subs	x19, x6, x19\n\t"
1986         "sbcs	x20, x7, x20\n\t"
1987         "sbcs	x21, x8, x21\n\t"
1988         "sbcs	x22, x9, x22\n\t"
1989         "mov	x3, #-19\n\t"
1990         "csetm	%x[a], cc\n\t"
1991         /*   Mask the modulus */
1992         "and	x3, %x[a], x3\n\t"
1993         "and	x4, %x[a], #0x7fffffffffffffff\n\t"
1994         /*   Add modulus (if underflow) */
1995         "adds	x19, x19, x3\n\t"
1996         "adcs	x20, x20, %x[a]\n\t"
1997         "adcs	x21, x21, %x[a]\n\t"
1998         "adc	x22, x22, x4\n\t"
1999         /* Square */
2000         /*  A[0] * A[1] */
2001         "mul	x7, x10, x11\n\t"
2002         "umulh	x8, x10, x11\n\t"
2003         /*  A[0] * A[2] */
2004         "mul	x3, x10, x12\n\t"
2005         "umulh	x9, x10, x12\n\t"
2006         "adds	x8, x8, x3\n\t"
2007         "adc	x9, x9, xzr\n\t"
2008         /*  A[0] * A[3] */
2009         "mul	x3, x10, x13\n\t"
2010         "umulh	%x[a], x10, x13\n\t"
2011         "adds	x9, x9, x3\n\t"
2012         "adc	%x[a], %x[a], xzr\n\t"
2013         /*  A[1] * A[2] */
2014         "mul	x3, x11, x12\n\t"
2015         "umulh	x4, x11, x12\n\t"
2016         "adds	x9, x9, x3\n\t"
2017         "adcs	%x[a], %x[a], x4\n\t"
2018         "adc	x26, xzr, xzr\n\t"
2019         /*  A[1] * A[3] */
2020         "mul	x3, x11, x13\n\t"
2021         "umulh	x4, x11, x13\n\t"
2022         "adds	%x[a], %x[a], x3\n\t"
2023         "adc	x26, x26, x4\n\t"
2024         /*  A[2] * A[3] */
2025         "mul	x3, x12, x13\n\t"
2026         "umulh	x27, x12, x13\n\t"
2027         "adds	x26, x26, x3\n\t"
2028         "adc	x27, x27, xzr\n\t"
2029         /* Double */
2030         "adds	x7, x7, x7\n\t"
2031         "adcs	x8, x8, x8\n\t"
2032         "adcs	x9, x9, x9\n\t"
2033         "adcs	%x[a], %x[a], %x[a]\n\t"
2034         "adcs	x26, x26, x26\n\t"
2035         "adcs	x27, x27, x27\n\t"
2036         "adc	x28, xzr, xzr\n\t"
2037         /*  A[0] * A[0] */
2038         "mul	x6, x10, x10\n\t"
2039         "umulh	x5, x10, x10\n\t"
2040         /*  A[1] * A[1] */
2041         "mul	x3, x11, x11\n\t"
2042         "umulh	x4, x11, x11\n\t"
2043         "adds	x7, x7, x5\n\t"
2044         "adcs	x8, x8, x3\n\t"
2045         "adc	x5, x4, xzr\n\t"
2046         /*  A[2] * A[2] */
2047         "mul	x3, x12, x12\n\t"
2048         "umulh	x4, x12, x12\n\t"
2049         "adds	x9, x9, x5\n\t"
2050         "adcs	%x[a], %x[a], x3\n\t"
2051         "adc	x5, x4, xzr\n\t"
2052         /*  A[3] * A[3] */
2053         "mul	x3, x13, x13\n\t"
2054         "umulh	x4, x13, x13\n\t"
2055         "adds	x26, x26, x5\n\t"
2056         "adcs	x27, x27, x3\n\t"
2057         "adc	x28, x28, x4\n\t"
2058         /* Reduce */
2059         /*  Move top half into t4-t7 and remove top bit from t3 */
2060         "extr	x28, x28, x27, #63\n\t"
2061         "extr	x27, x27, x26, #63\n\t"
2062         "extr	x26, x26, %x[a], #63\n\t"
2063         "extr	%x[a], %x[a], x9, #63\n\t"
2064         "and	x9, x9, #0x7fffffffffffffff\n\t"
2065         /*  Multiply top half by 19 */
2066         "mov	x3, #19\n\t"
2067         "mul	x4, x3, %x[a]\n\t"
2068         "umulh	%x[a], x3, %x[a]\n\t"
2069         "adds	x6, x6, x4\n\t"
2070         "mul	x4, x3, x26\n\t"
2071         "umulh	x26, x3, x26\n\t"
2072         "adcs	x7, x7, x4\n\t"
2073         "mul	x4, x3, x27\n\t"
2074         "umulh	x27, x3, x27\n\t"
2075         "adcs	x8, x8, x4\n\t"
2076         "mul	x4, x3, x28\n\t"
2077         "umulh	x5, x3, x28\n\t"
2078         "adcs	x9, x9, x4\n\t"
2079         "adc	x5, x5, xzr\n\t"
2080         /*  Add remaining product results in */
2081         "adds	x7, x7, %x[a]\n\t"
2082         "adcs	x8, x8, x26\n\t"
2083         "adcs	x9, x9, x27\n\t"
2084         "adc	x5, x5, xzr\n\t"
2085         /*  Overflow */
2086         "extr	x5, x5, x9, #63\n\t"
2087         "mul	x5, x5, x3\n\t"
2088         "and	x9, x9, #0x7fffffffffffffff\n\t"
2089         "adds	x6, x6, x5\n\t"
2090         "adcs	x7, x7, xzr\n\t"
2091         "adcs	x8, x8, xzr\n\t"
2092         "adc	x9, x9, xzr\n\t"
2093         /* Reduce if top bit set */
2094         "and	x5, x3, x9, asr 63\n\t"
2095         "and	x9, x9, #0x7fffffffffffffff\n\t"
2096         "adds	x6, x6, x5\n\t"
2097         "adcs	x7, x7, xzr\n\t"
2098         "adcs	x8, x8, xzr\n\t"
2099         "adc	x9, x9, xzr\n\t"
2100         /* Store */
2101         "stp	x6, x7, [x29, #80]\n\t"
2102         "stp	x8, x9, [x29, #96]\n\t"
2103         /* Square */
2104         /*  A[0] * A[1] */
2105         "mul	x7, x19, x20\n\t"
2106         "umulh	x8, x19, x20\n\t"
2107         /*  A[0] * A[2] */
2108         "mul	x3, x19, x21\n\t"
2109         "umulh	x9, x19, x21\n\t"
2110         "adds	x8, x8, x3\n\t"
2111         "adc	x9, x9, xzr\n\t"
2112         /*  A[0] * A[3] */
2113         "mul	x3, x19, x22\n\t"
2114         "umulh	%x[a], x19, x22\n\t"
2115         "adds	x9, x9, x3\n\t"
2116         "adc	%x[a], %x[a], xzr\n\t"
2117         /*  A[1] * A[2] */
2118         "mul	x3, x20, x21\n\t"
2119         "umulh	x4, x20, x21\n\t"
2120         "adds	x9, x9, x3\n\t"
2121         "adcs	%x[a], %x[a], x4\n\t"
2122         "adc	x26, xzr, xzr\n\t"
2123         /*  A[1] * A[3] */
2124         "mul	x3, x20, x22\n\t"
2125         "umulh	x4, x20, x22\n\t"
2126         "adds	%x[a], %x[a], x3\n\t"
2127         "adc	x26, x26, x4\n\t"
2128         /*  A[2] * A[3] */
2129         "mul	x3, x21, x22\n\t"
2130         "umulh	x27, x21, x22\n\t"
2131         "adds	x26, x26, x3\n\t"
2132         "adc	x27, x27, xzr\n\t"
2133         /* Double */
2134         "adds	x7, x7, x7\n\t"
2135         "adcs	x8, x8, x8\n\t"
2136         "adcs	x9, x9, x9\n\t"
2137         "adcs	%x[a], %x[a], %x[a]\n\t"
2138         "adcs	x26, x26, x26\n\t"
2139         "adcs	x27, x27, x27\n\t"
2140         "adc	x28, xzr, xzr\n\t"
2141         /*  A[0] * A[0] */
2142         "mul	x6, x19, x19\n\t"
2143         "umulh	x5, x19, x19\n\t"
2144         /*  A[1] * A[1] */
2145         "mul	x3, x20, x20\n\t"
2146         "umulh	x4, x20, x20\n\t"
2147         "adds	x7, x7, x5\n\t"
2148         "adcs	x8, x8, x3\n\t"
2149         "adc	x5, x4, xzr\n\t"
2150         /*  A[2] * A[2] */
2151         "mul	x3, x21, x21\n\t"
2152         "umulh	x4, x21, x21\n\t"
2153         "adds	x9, x9, x5\n\t"
2154         "adcs	%x[a], %x[a], x3\n\t"
2155         "adc	x5, x4, xzr\n\t"
2156         /*  A[3] * A[3] */
2157         "mul	x3, x22, x22\n\t"
2158         "umulh	x4, x22, x22\n\t"
2159         "adds	x26, x26, x5\n\t"
2160         "adcs	x27, x27, x3\n\t"
2161         "adc	x28, x28, x4\n\t"
2162         /* Reduce */
2163         /*  Move top half into t4-t7 and remove top bit from t3 */
2164         "extr	x28, x28, x27, #63\n\t"
2165         "extr	x27, x27, x26, #63\n\t"
2166         "extr	x26, x26, %x[a], #63\n\t"
2167         "extr	%x[a], %x[a], x9, #63\n\t"
2168         "and	x9, x9, #0x7fffffffffffffff\n\t"
2169         /*  Multiply top half by 19 */
2170         "mov	x3, #19\n\t"
2171         "mul	x4, x3, %x[a]\n\t"
2172         "umulh	%x[a], x3, %x[a]\n\t"
2173         "adds	x6, x6, x4\n\t"
2174         "mul	x4, x3, x26\n\t"
2175         "umulh	x26, x3, x26\n\t"
2176         "adcs	x7, x7, x4\n\t"
2177         "mul	x4, x3, x27\n\t"
2178         "umulh	x27, x3, x27\n\t"
2179         "adcs	x8, x8, x4\n\t"
2180         "mul	x4, x3, x28\n\t"
2181         "umulh	x5, x3, x28\n\t"
2182         "adcs	x9, x9, x4\n\t"
2183         "adc	x5, x5, xzr\n\t"
2184         /*  Add remaining product results in */
2185         "adds	x7, x7, %x[a]\n\t"
2186         "adcs	x8, x8, x26\n\t"
2187         "adcs	x9, x9, x27\n\t"
2188         "adc	x5, x5, xzr\n\t"
2189         /*  Overflow */
2190         "extr	x5, x5, x9, #63\n\t"
2191         "mul	x5, x5, x3\n\t"
2192         "and	x9, x9, #0x7fffffffffffffff\n\t"
2193         "adds	x6, x6, x5\n\t"
2194         "adcs	x7, x7, xzr\n\t"
2195         "adcs	x8, x8, xzr\n\t"
2196         "adc	x9, x9, xzr\n\t"
2197         /* Reduce if top bit set */
2198         "and	x5, x3, x9, asr 63\n\t"
2199         "and	x9, x9, #0x7fffffffffffffff\n\t"
2200         "adds	x6, x6, x5\n\t"
2201         "adcs	x7, x7, xzr\n\t"
2202         "adcs	x8, x8, xzr\n\t"
2203         "adc	x9, x9, xzr\n\t"
2204         /* Store */
2205         "ldr	%x[a], [x29, #184]\n\t"
2206         /* Multiply */
2207         "ldp	x14, x15, [%x[a]]\n\t"
2208         "ldp	x16, x17, [%x[a], #16]\n\t"
2209         /*  A[0] * B[0] */
2210         "mul	x10, x14, x6\n\t"
2211         "umulh	x11, x14, x6\n\t"
2212         /*  A[0] * B[1] */
2213         "mul	x3, x14, x7\n\t"
2214         "umulh	x12, x14, x7\n\t"
2215         "adds	x11, x11, x3\n\t"
2216         "adc	x12, x12, xzr\n\t"
2217         /*  A[1] * B[0] */
2218         "mul	x3, x15, x6\n\t"
2219         "umulh	x4, x15, x6\n\t"
2220         "adds	x11, x11, x3\n\t"
2221         "adcs	x12, x12, x4\n\t"
2222         "adc	x13, xzr, xzr\n\t"
2223         /*  A[0] * B[2] */
2224         "mul	x3, x14, x8\n\t"
2225         "umulh	x4, x14, x8\n\t"
2226         "adds	x12, x12, x3\n\t"
2227         "adc	x13, x13, x4\n\t"
2228         /*  A[1] * B[1] */
2229         "mul	x3, x15, x7\n\t"
2230         "umulh	x4, x15, x7\n\t"
2231         "adds	x12, x12, x3\n\t"
2232         "adcs	x13, x13, x4\n\t"
2233         "adc	%x[a], xzr, xzr\n\t"
2234         /*  A[2] * B[0] */
2235         "mul	x3, x16, x6\n\t"
2236         "umulh	x4, x16, x6\n\t"
2237         "adds	x12, x12, x3\n\t"
2238         "adcs	x13, x13, x4\n\t"
2239         "adc	%x[a], %x[a], xzr\n\t"
2240         /*  A[0] * B[3] */
2241         "mul	x3, x14, x9\n\t"
2242         "umulh	x4, x14, x9\n\t"
2243         "adds	x13, x13, x3\n\t"
2244         "adcs	%x[a], %x[a], x4\n\t"
2245         "adc	x26, xzr, xzr\n\t"
2246         /*  A[1] * B[2] */
2247         "mul	x3, x15, x8\n\t"
2248         "umulh	x4, x15, x8\n\t"
2249         "adds	x13, x13, x3\n\t"
2250         "adcs	%x[a], %x[a], x4\n\t"
2251         "adc	x26, x26, xzr\n\t"
2252         /*  A[2] * B[1] */
2253         "mul	x3, x16, x7\n\t"
2254         "umulh	x4, x16, x7\n\t"
2255         "adds	x13, x13, x3\n\t"
2256         "adcs	%x[a], %x[a], x4\n\t"
2257         "adc	x26, x26, xzr\n\t"
2258         /*  A[3] * B[0] */
2259         "mul	x3, x17, x6\n\t"
2260         "umulh	x4, x17, x6\n\t"
2261         "adds	x13, x13, x3\n\t"
2262         "adcs	%x[a], %x[a], x4\n\t"
2263         "adc	x26, x26, xzr\n\t"
2264         /*  A[1] * B[3] */
2265         "mul	x3, x15, x9\n\t"
2266         "umulh	x4, x15, x9\n\t"
2267         "adds	%x[a], %x[a], x3\n\t"
2268         "adcs	x26, x26, x4\n\t"
2269         "adc	x27, xzr, xzr\n\t"
2270         /*  A[2] * B[2] */
2271         "mul	x3, x16, x8\n\t"
2272         "umulh	x4, x16, x8\n\t"
2273         "adds	%x[a], %x[a], x3\n\t"
2274         "adcs	x26, x26, x4\n\t"
2275         "adc	x27, x27, xzr\n\t"
2276         /*  A[3] * B[1] */
2277         "mul	x3, x17, x7\n\t"
2278         "umulh	x4, x17, x7\n\t"
2279         "adds	%x[a], %x[a], x3\n\t"
2280         "adcs	x26, x26, x4\n\t"
2281         "adc	x27, x27, xzr\n\t"
2282         /*  A[2] * B[3] */
2283         "mul	x3, x16, x9\n\t"
2284         "umulh	x4, x16, x9\n\t"
2285         "adds	x26, x26, x3\n\t"
2286         "adcs	x27, x27, x4\n\t"
2287         "adc	x28, xzr, xzr\n\t"
2288         /*  A[3] * B[2] */
2289         "mul	x3, x17, x8\n\t"
2290         "umulh	x4, x17, x8\n\t"
2291         "adds	x26, x26, x3\n\t"
2292         "adcs	x27, x27, x4\n\t"
2293         "adc	x28, x28, xzr\n\t"
2294         /*  A[3] * B[3] */
2295         "mul	x3, x17, x9\n\t"
2296         "umulh	x4, x17, x9\n\t"
2297         "adds	x27, x27, x3\n\t"
2298         "adc	x28, x28, x4\n\t"
2299         /* Reduce */
2300         /*  Move top half into t4-t7 and remove top bit from t3 */
2301         "extr	x28, x28, x27, #63\n\t"
2302         "extr	x27, x27, x26, #63\n\t"
2303         "extr	x26, x26, %x[a], #63\n\t"
2304         "extr	%x[a], %x[a], x13, #63\n\t"
2305         "and	x13, x13, #0x7fffffffffffffff\n\t"
2306         /*  Multiply top half by 19 */
2307         "mov	x3, #19\n\t"
2308         "mul	x4, x3, %x[a]\n\t"
2309         "umulh	%x[a], x3, %x[a]\n\t"
2310         "adds	x10, x10, x4\n\t"
2311         "mul	x4, x3, x26\n\t"
2312         "umulh	x26, x3, x26\n\t"
2313         "adcs	x11, x11, x4\n\t"
2314         "mul	x4, x3, x27\n\t"
2315         "umulh	x27, x3, x27\n\t"
2316         "adcs	x12, x12, x4\n\t"
2317         "mul	x4, x3, x28\n\t"
2318         "umulh	x5, x3, x28\n\t"
2319         "adcs	x13, x13, x4\n\t"
2320         "adc	x5, x5, xzr\n\t"
2321         /*  Add remaining product results in */
2322         "adds	x11, x11, %x[a]\n\t"
2323         "adcs	x12, x12, x26\n\t"
2324         "adcs	x13, x13, x27\n\t"
2325         "adc	x5, x5, xzr\n\t"
2326         /*  Overflow */
2327         "extr	x5, x5, x13, #63\n\t"
2328         "mul	x5, x5, x3\n\t"
2329         "and	x13, x13, #0x7fffffffffffffff\n\t"
2330         "adds	x10, x10, x5\n\t"
2331         "adcs	x11, x11, xzr\n\t"
2332         "adcs	x12, x12, xzr\n\t"
2333         "adc	x13, x13, xzr\n\t"
2334         /* Reduce if top bit set */
2335         "and	x5, x3, x13, asr 63\n\t"
2336         "and	x13, x13, #0x7fffffffffffffff\n\t"
2337         "adds	x10, x10, x5\n\t"
2338         "adcs	x11, x11, xzr\n\t"
2339         "adcs	x12, x12, xzr\n\t"
2340         "adc	x13, x13, xzr\n\t"
2341         /* Store */
2342         "stp	x10, x11, [x29, #48]\n\t"
2343         "stp	x12, x13, [x29, #64]\n\t"
2344         "sub	x25, x25, #1\n\t"
2345         "cmp	x25, #0\n\t"
2346         "bge	L_curve25519_bits_%=\n\t"
2347         "mov	x25, #63\n\t"
2348         "sub	x24, x24, #8\n\t"
2349         "cmp	x24, #0\n\t"
2350         "bge	L_curve25519_words_%=\n\t"
2351         /* Invert */
2352         "add	x0, x29, #48\n\t"
2353         "add	x1, x29, #16\n\t"
2354 #ifndef __APPLE__
2355         "bl	fe_sq\n\t"
2356 #else
2357         "bl	_fe_sq\n\t"
2358 #endif /* __APPLE__ */
2359         "add	x0, x29, #0x50\n\t"
2360         "add	x1, x29, #48\n\t"
2361 #ifndef __APPLE__
2362         "bl	fe_sq\n\t"
2363 #else
2364         "bl	_fe_sq\n\t"
2365 #endif /* __APPLE__ */
2366 #ifndef NDEBUG
2367         "add	x0, x29, #0x50\n\t"
2368 #endif /* !NDEBUG */
2369         "add	x1, x29, #0x50\n\t"
2370 #ifndef __APPLE__
2371         "bl	fe_sq\n\t"
2372 #else
2373         "bl	_fe_sq\n\t"
2374 #endif /* __APPLE__ */
2375 #ifndef NDEBUG
2376         "add	x0, x29, #0x50\n\t"
2377 #endif /* !NDEBUG */
2378         "add	x1, x29, #16\n\t"
2379         "add	x2, x29, #0x50\n\t"
2380 #ifndef __APPLE__
2381         "bl	fe_mul\n\t"
2382 #else
2383         "bl	_fe_mul\n\t"
2384 #endif /* __APPLE__ */
2385         "add	x0, x29, #48\n\t"
2386         "add	x1, x29, #48\n\t"
2387         "add	x2, x29, #0x50\n\t"
2388 #ifndef __APPLE__
2389         "bl	fe_mul\n\t"
2390 #else
2391         "bl	_fe_mul\n\t"
2392 #endif /* __APPLE__ */
2393         "add	x0, x29, #0x70\n\t"
2394 #ifndef NDEBUG
2395         "add	x1, x29, #48\n\t"
2396 #endif /* !NDEBUG */
2397 #ifndef __APPLE__
2398         "bl	fe_sq\n\t"
2399 #else
2400         "bl	_fe_sq\n\t"
2401 #endif /* __APPLE__ */
2402         "add	x0, x29, #0x50\n\t"
2403         "add	x1, x29, #0x50\n\t"
2404         "add	x2, x29, #0x70\n\t"
2405 #ifndef __APPLE__
2406         "bl	fe_mul\n\t"
2407 #else
2408         "bl	_fe_mul\n\t"
2409 #endif /* __APPLE__ */
2410         "add	x0, x29, #0x70\n\t"
2411 #ifndef NDEBUG
2412         "add	x1, x29, #0x50\n\t"
2413 #endif /* !NDEBUG */
2414 #ifndef __APPLE__
2415         "bl	fe_sq\n\t"
2416 #else
2417         "bl	_fe_sq\n\t"
2418 #endif /* __APPLE__ */
2419         "mov	x24, #3\n\t"
2420 #ifndef NDEBUG
2421         "add	x0, x29, #0x70\n\t"
2422 #endif /* !NDEBUG */
2423         "add	x1, x29, #0x70\n\t"
2424         "\n"
2425     "L_curve25519_inv_1_%=: \n\t"
2426 #ifndef __APPLE__
2427         "bl	fe_sq\n\t"
2428 #else
2429         "bl	_fe_sq\n\t"
2430 #endif /* __APPLE__ */
2431         "subs	x24, x24, #1\n\t"
2432         "bcs	L_curve25519_inv_1_%=\n\t"
2433         "add	x0, x29, #0x50\n\t"
2434 #ifndef NDEBUG
2435         "add	x1, x29, #0x70\n\t"
2436 #endif /* !NDEBUG */
2437         "add	x2, x29, #0x50\n\t"
2438 #ifndef __APPLE__
2439         "bl	fe_mul\n\t"
2440 #else
2441         "bl	_fe_mul\n\t"
2442 #endif /* __APPLE__ */
2443         "add	x0, x29, #0x70\n\t"
2444         "add	x1, x29, #0x50\n\t"
2445 #ifndef __APPLE__
2446         "bl	fe_sq\n\t"
2447 #else
2448         "bl	_fe_sq\n\t"
2449 #endif /* __APPLE__ */
2450         "mov	x24, #8\n\t"
2451 #ifndef NDEBUG
2452         "add	x0, x29, #0x70\n\t"
2453 #endif /* !NDEBUG */
2454         "add	x1, x29, #0x70\n\t"
2455         "\n"
2456     "L_curve25519_inv_2_%=: \n\t"
2457 #ifndef __APPLE__
2458         "bl	fe_sq\n\t"
2459 #else
2460         "bl	_fe_sq\n\t"
2461 #endif /* __APPLE__ */
2462         "subs	x24, x24, #1\n\t"
2463         "bcs	L_curve25519_inv_2_%=\n\t"
2464 #ifndef NDEBUG
2465         "add	x0, x29, #0x70\n\t"
2466 #endif /* !NDEBUG */
2467 #ifndef NDEBUG
2468         "add	x1, x29, #0x70\n\t"
2469 #endif /* !NDEBUG */
2470         "add	x2, x29, #0x50\n\t"
2471 #ifndef __APPLE__
2472         "bl	fe_mul\n\t"
2473 #else
2474         "bl	_fe_mul\n\t"
2475 #endif /* __APPLE__ */
2476         "add	x0, x29, #0x90\n\t"
2477 #ifndef NDEBUG
2478         "add	x1, x29, #0x70\n\t"
2479 #endif /* !NDEBUG */
2480 #ifndef __APPLE__
2481         "bl	fe_sq\n\t"
2482 #else
2483         "bl	_fe_sq\n\t"
2484 #endif /* __APPLE__ */
2485         "mov	x24, #18\n\t"
2486 #ifndef NDEBUG
2487         "add	x0, x29, #0x90\n\t"
2488 #endif /* !NDEBUG */
2489         "add	x1, x29, #0x90\n\t"
2490         "\n"
2491     "L_curve25519_inv_3_%=: \n\t"
2492 #ifndef __APPLE__
2493         "bl	fe_sq\n\t"
2494 #else
2495         "bl	_fe_sq\n\t"
2496 #endif /* __APPLE__ */
2497         "subs	x24, x24, #1\n\t"
2498         "bcs	L_curve25519_inv_3_%=\n\t"
2499         "add	x0, x29, #0x70\n\t"
2500 #ifndef NDEBUG
2501         "add	x1, x29, #0x90\n\t"
2502 #endif /* !NDEBUG */
2503         "add	x2, x29, #0x70\n\t"
2504 #ifndef __APPLE__
2505         "bl	fe_mul\n\t"
2506 #else
2507         "bl	_fe_mul\n\t"
2508 #endif /* __APPLE__ */
2509         "mov	x24, #9\n\t"
2510 #ifndef NDEBUG
2511         "add	x0, x29, #0x70\n\t"
2512 #endif /* !NDEBUG */
2513         "add	x1, x29, #0x70\n\t"
2514         "\n"
2515     "L_curve25519_inv_4_%=: \n\t"
2516 #ifndef __APPLE__
2517         "bl	fe_sq\n\t"
2518 #else
2519         "bl	_fe_sq\n\t"
2520 #endif /* __APPLE__ */
2521         "subs	x24, x24, #1\n\t"
2522         "bcs	L_curve25519_inv_4_%=\n\t"
2523         "add	x0, x29, #0x50\n\t"
2524 #ifndef NDEBUG
2525         "add	x1, x29, #0x70\n\t"
2526 #endif /* !NDEBUG */
2527         "add	x2, x29, #0x50\n\t"
2528 #ifndef __APPLE__
2529         "bl	fe_mul\n\t"
2530 #else
2531         "bl	_fe_mul\n\t"
2532 #endif /* __APPLE__ */
2533         "add	x0, x29, #0x70\n\t"
2534         "add	x1, x29, #0x50\n\t"
2535 #ifndef __APPLE__
2536         "bl	fe_sq\n\t"
2537 #else
2538         "bl	_fe_sq\n\t"
2539 #endif /* __APPLE__ */
2540         "mov	x24, #48\n\t"
2541 #ifndef NDEBUG
2542         "add	x0, x29, #0x70\n\t"
2543 #endif /* !NDEBUG */
2544         "add	x1, x29, #0x70\n\t"
2545         "\n"
2546     "L_curve25519_inv_5_%=: \n\t"
2547 #ifndef __APPLE__
2548         "bl	fe_sq\n\t"
2549 #else
2550         "bl	_fe_sq\n\t"
2551 #endif /* __APPLE__ */
2552         "subs	x24, x24, #1\n\t"
2553         "bcs	L_curve25519_inv_5_%=\n\t"
2554 #ifndef NDEBUG
2555         "add	x0, x29, #0x70\n\t"
2556 #endif /* !NDEBUG */
2557 #ifndef NDEBUG
2558         "add	x1, x29, #0x70\n\t"
2559 #endif /* !NDEBUG */
2560         "add	x2, x29, #0x50\n\t"
2561 #ifndef __APPLE__
2562         "bl	fe_mul\n\t"
2563 #else
2564         "bl	_fe_mul\n\t"
2565 #endif /* __APPLE__ */
2566         "add	x0, x29, #0x90\n\t"
2567 #ifndef NDEBUG
2568         "add	x1, x29, #0x70\n\t"
2569 #endif /* !NDEBUG */
2570 #ifndef __APPLE__
2571         "bl	fe_sq\n\t"
2572 #else
2573         "bl	_fe_sq\n\t"
2574 #endif /* __APPLE__ */
2575         "mov	x24, #0x62\n\t"
2576 #ifndef NDEBUG
2577         "add	x0, x29, #0x90\n\t"
2578 #endif /* !NDEBUG */
2579         "add	x1, x29, #0x90\n\t"
2580         "\n"
2581     "L_curve25519_inv_6_%=: \n\t"
2582 #ifndef __APPLE__
2583         "bl	fe_sq\n\t"
2584 #else
2585         "bl	_fe_sq\n\t"
2586 #endif /* __APPLE__ */
2587         "subs	x24, x24, #1\n\t"
2588         "bcs	L_curve25519_inv_6_%=\n\t"
2589         "add	x0, x29, #0x70\n\t"
2590 #ifndef NDEBUG
2591         "add	x1, x29, #0x90\n\t"
2592 #endif /* !NDEBUG */
2593         "add	x2, x29, #0x70\n\t"
2594 #ifndef __APPLE__
2595         "bl	fe_mul\n\t"
2596 #else
2597         "bl	_fe_mul\n\t"
2598 #endif /* __APPLE__ */
2599         "mov	x24, #49\n\t"
2600 #ifndef NDEBUG
2601         "add	x0, x29, #0x70\n\t"
2602 #endif /* !NDEBUG */
2603         "add	x1, x29, #0x70\n\t"
2604         "\n"
2605     "L_curve25519_inv_7_%=: \n\t"
2606 #ifndef __APPLE__
2607         "bl	fe_sq\n\t"
2608 #else
2609         "bl	_fe_sq\n\t"
2610 #endif /* __APPLE__ */
2611         "subs	x24, x24, #1\n\t"
2612         "bcs	L_curve25519_inv_7_%=\n\t"
2613         "add	x0, x29, #0x50\n\t"
2614 #ifndef NDEBUG
2615         "add	x1, x29, #0x70\n\t"
2616 #endif /* !NDEBUG */
2617         "add	x2, x29, #0x50\n\t"
2618 #ifndef __APPLE__
2619         "bl	fe_mul\n\t"
2620 #else
2621         "bl	_fe_mul\n\t"
2622 #endif /* __APPLE__ */
2623         "mov	x24, #4\n\t"
2624 #ifndef NDEBUG
2625         "add	x0, x29, #0x50\n\t"
2626 #endif /* !NDEBUG */
2627         "add	x1, x29, #0x50\n\t"
2628         "\n"
2629     "L_curve25519_inv_8_%=: \n\t"
2630 #ifndef __APPLE__
2631         "bl	fe_sq\n\t"
2632 #else
2633         "bl	_fe_sq\n\t"
2634 #endif /* __APPLE__ */
2635         "subs	x24, x24, #1\n\t"
2636         "bcs	L_curve25519_inv_8_%=\n\t"
2637         "add	x0, x29, #16\n\t"
2638 #ifndef NDEBUG
2639         "add	x1, x29, #0x50\n\t"
2640 #endif /* !NDEBUG */
2641         "add	x2, x29, #48\n\t"
2642 #ifndef __APPLE__
2643         "bl	fe_mul\n\t"
2644 #else
2645         "bl	_fe_mul\n\t"
2646 #endif /* __APPLE__ */
2647         "ldr	%x[r], [x29, #176]\n\t"
2648         /* Multiply */
2649         "ldp	x6, x7, [%x[r]]\n\t"
2650         "ldp	x8, x9, [%x[r], #16]\n\t"
2651         "ldp	x10, x11, [x29, #16]\n\t"
2652         "ldp	x12, x13, [x29, #32]\n\t"
2653         /*  A[0] * B[0] */
2654         "mul	x14, x6, x10\n\t"
2655         "umulh	x15, x6, x10\n\t"
2656         /*  A[0] * B[1] */
2657         "mul	x3, x6, x11\n\t"
2658         "umulh	x16, x6, x11\n\t"
2659         "adds	x15, x15, x3\n\t"
2660         "adc	x16, x16, xzr\n\t"
2661         /*  A[1] * B[0] */
2662         "mul	x3, x7, x10\n\t"
2663         "umulh	x4, x7, x10\n\t"
2664         "adds	x15, x15, x3\n\t"
2665         "adcs	x16, x16, x4\n\t"
2666         "adc	x17, xzr, xzr\n\t"
2667         /*  A[0] * B[2] */
2668         "mul	x3, x6, x12\n\t"
2669         "umulh	x4, x6, x12\n\t"
2670         "adds	x16, x16, x3\n\t"
2671         "adc	x17, x17, x4\n\t"
2672         /*  A[1] * B[1] */
2673         "mul	x3, x7, x11\n\t"
2674         "umulh	x4, x7, x11\n\t"
2675         "adds	x16, x16, x3\n\t"
2676         "adcs	x17, x17, x4\n\t"
2677         "adc	x19, xzr, xzr\n\t"
2678         /*  A[2] * B[0] */
2679         "mul	x3, x8, x10\n\t"
2680         "umulh	x4, x8, x10\n\t"
2681         "adds	x16, x16, x3\n\t"
2682         "adcs	x17, x17, x4\n\t"
2683         "adc	x19, x19, xzr\n\t"
2684         /*  A[0] * B[3] */
2685         "mul	x3, x6, x13\n\t"
2686         "umulh	x4, x6, x13\n\t"
2687         "adds	x17, x17, x3\n\t"
2688         "adcs	x19, x19, x4\n\t"
2689         "adc	x20, xzr, xzr\n\t"
2690         /*  A[1] * B[2] */
2691         "mul	x3, x7, x12\n\t"
2692         "umulh	x4, x7, x12\n\t"
2693         "adds	x17, x17, x3\n\t"
2694         "adcs	x19, x19, x4\n\t"
2695         "adc	x20, x20, xzr\n\t"
2696         /*  A[2] * B[1] */
2697         "mul	x3, x8, x11\n\t"
2698         "umulh	x4, x8, x11\n\t"
2699         "adds	x17, x17, x3\n\t"
2700         "adcs	x19, x19, x4\n\t"
2701         "adc	x20, x20, xzr\n\t"
2702         /*  A[3] * B[0] */
2703         "mul	x3, x9, x10\n\t"
2704         "umulh	x4, x9, x10\n\t"
2705         "adds	x17, x17, x3\n\t"
2706         "adcs	x19, x19, x4\n\t"
2707         "adc	x20, x20, xzr\n\t"
2708         /*  A[1] * B[3] */
2709         "mul	x3, x7, x13\n\t"
2710         "umulh	x4, x7, x13\n\t"
2711         "adds	x19, x19, x3\n\t"
2712         "adcs	x20, x20, x4\n\t"
2713         "adc	x21, xzr, xzr\n\t"
2714         /*  A[2] * B[2] */
2715         "mul	x3, x8, x12\n\t"
2716         "umulh	x4, x8, x12\n\t"
2717         "adds	x19, x19, x3\n\t"
2718         "adcs	x20, x20, x4\n\t"
2719         "adc	x21, x21, xzr\n\t"
2720         /*  A[3] * B[1] */
2721         "mul	x3, x9, x11\n\t"
2722         "umulh	x4, x9, x11\n\t"
2723         "adds	x19, x19, x3\n\t"
2724         "adcs	x20, x20, x4\n\t"
2725         "adc	x21, x21, xzr\n\t"
2726         /*  A[2] * B[3] */
2727         "mul	x3, x8, x13\n\t"
2728         "umulh	x4, x8, x13\n\t"
2729         "adds	x20, x20, x3\n\t"
2730         "adcs	x21, x21, x4\n\t"
2731         "adc	x22, xzr, xzr\n\t"
2732         /*  A[3] * B[2] */
2733         "mul	x3, x9, x12\n\t"
2734         "umulh	x4, x9, x12\n\t"
2735         "adds	x20, x20, x3\n\t"
2736         "adcs	x21, x21, x4\n\t"
2737         "adc	x22, x22, xzr\n\t"
2738         /*  A[3] * B[3] */
2739         "mul	x3, x9, x13\n\t"
2740         "umulh	x4, x9, x13\n\t"
2741         "adds	x21, x21, x3\n\t"
2742         "adc	x22, x22, x4\n\t"
2743         /* Reduce */
2744         /*  Move top half into t4-t7 and remove top bit from t3 */
2745         "extr	x22, x22, x21, #63\n\t"
2746         "extr	x21, x21, x20, #63\n\t"
2747         "extr	x20, x20, x19, #63\n\t"
2748         "extr	x19, x19, x17, #63\n\t"
2749         "and	x17, x17, #0x7fffffffffffffff\n\t"
2750         /*  Multiply top half by 19 */
2751         "mov	x3, #19\n\t"
2752         "mul	x4, x3, x19\n\t"
2753         "umulh	x19, x3, x19\n\t"
2754         "adds	x14, x14, x4\n\t"
2755         "mul	x4, x3, x20\n\t"
2756         "umulh	x20, x3, x20\n\t"
2757         "adcs	x15, x15, x4\n\t"
2758         "mul	x4, x3, x21\n\t"
2759         "umulh	x21, x3, x21\n\t"
2760         "adcs	x16, x16, x4\n\t"
2761         "mul	x4, x3, x22\n\t"
2762         "umulh	x5, x3, x22\n\t"
2763         "adcs	x17, x17, x4\n\t"
2764         "adc	x5, x5, xzr\n\t"
2765         /*  Add remaining product results in */
2766         "adds	x15, x15, x19\n\t"
2767         "adcs	x16, x16, x20\n\t"
2768         "adcs	x17, x17, x21\n\t"
2769         "adc	x5, x5, xzr\n\t"
2770         /*  Overflow */
2771         "extr	x5, x5, x17, #63\n\t"
2772         "mul	x5, x5, x3\n\t"
2773         "and	x17, x17, #0x7fffffffffffffff\n\t"
2774         "adds	x14, x14, x5\n\t"
2775         "adcs	x15, x15, xzr\n\t"
2776         "adcs	x16, x16, xzr\n\t"
2777         "adc	x17, x17, xzr\n\t"
2778         /* Reduce if top bit set */
2779         "and	x5, x3, x17, asr 63\n\t"
2780         "and	x17, x17, #0x7fffffffffffffff\n\t"
2781         "adds	x14, x14, x5\n\t"
2782         "adcs	x15, x15, xzr\n\t"
2783         "adcs	x16, x16, xzr\n\t"
2784         "adc	x17, x17, xzr\n\t"
2785         /* Store */
2786         "stp	x14, x15, [%x[r]]\n\t"
2787         "stp	x16, x17, [%x[r], #16]\n\t"
2788         "mov	x0, xzr\n\t"
2789         "ldp	x29, x30, [sp], #0xc0\n\t"
2790         : [r] "+r" (r), [n] "+r" (n), [a] "+r" (a)
2791         :
2792         : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
2793     );
2794     return (uint32_t)(size_t)r;
2795 }
2796 
/* Compute r = a^(2^252 - 3) with a fixed chain of fe_sq()/fe_mul() calls
 * (presuming fe_sq(out, in) squares and fe_mul(out, x, y) multiplies field
 * elements - both are defined elsewhere and are not visible from here).
 *
 * Stack frame (128 bytes, addressed through x29):
 *   #0     saved x29/x30
 *   #16    t0 (32-byte temporary)
 *   #48    t1 (32-byte temporary)
 *   #0x50  t2 (32-byte temporary)
 *   #112   saved r pointer
 *   #120   saved a pointer
 *
 * Every counted loop has the shape "mov x23, #N ... subs x23, x23, #1; bcs",
 * so its body runs N + 1 times: the branch only falls through on the borrow
 * produced by 0 - 1.
 *
 * The calls are made with "bl" from inside the asm statement, which the
 * compiler cannot see. The clobber list therefore names the caller-saved
 * registers x2-x17 that a callee may change under AAPCS64, and "cc" for the
 * condition flags (changed by "subs" here and possibly by the callees).
 *
 * NOTE(review): x0/x1 are written directly by the template and, when NDEBUG
 * is defined, the conditional reloads below are compiled out, so x0/x1 are
 * then expected to survive each call and x1 is expected to hold a on entry.
 * They are not listed as clobbers because the r/a operands are expected to
 * live in them - confirm operand allocation if this is ever inlined.
 *
 * r  Receives the result.
 * a  Field element to raise to the power.
 */
void fe_pow22523(fe r, const fe a)
{
    __asm__ __volatile__ (
        /* Open the frame and keep the return address safe across the calls. */
        "stp	x29, x30, [sp, #-128]!\n\t"
        "add	x29, sp, #0\n\t"
        /* pow22523 */
        "str	%x[r], [x29, #112]\n\t"
        "str	%x[a], [x29, #120]\n\t"
        /* fe_sq(t0, a) */
        "add	x0, x29, #16\n\t"
#ifndef NDEBUG
        "ldr	x1, [x29, #120]\n\t"
#endif /* !NDEBUG */
#ifndef __APPLE__
        "bl	fe_sq\n\t"
#else
        "bl	_fe_sq\n\t"
#endif /* __APPLE__ */
        /* fe_sq(t1, t0) */
        "add	x0, x29, #48\n\t"
        "add	x1, x29, #16\n\t"
#ifndef __APPLE__
        "bl	fe_sq\n\t"
#else
        "bl	_fe_sq\n\t"
#endif /* __APPLE__ */
        /* fe_sq(t1, t1) */
#ifndef NDEBUG
        "add	x0, x29, #48\n\t"
#endif /* !NDEBUG */
        "add	x1, x29, #48\n\t"
#ifndef __APPLE__
        "bl	fe_sq\n\t"
#else
        "bl	_fe_sq\n\t"
#endif /* __APPLE__ */
        /* fe_mul(t1, a, t1) */
#ifndef NDEBUG
        "add	x0, x29, #48\n\t"
#endif /* !NDEBUG */
        "ldr	x1, [x29, #120]\n\t"
        "add	x2, x29, #48\n\t"
#ifndef __APPLE__
        "bl	fe_mul\n\t"
#else
        "bl	_fe_mul\n\t"
#endif /* __APPLE__ */
        /* fe_mul(t0, t0, t1) */
        "add	x0, x29, #16\n\t"
        "add	x1, x29, #16\n\t"
        "add	x2, x29, #48\n\t"
#ifndef __APPLE__
        "bl	fe_mul\n\t"
#else
        "bl	_fe_mul\n\t"
#endif /* __APPLE__ */
        /* fe_sq(t0, t0) */
#ifndef NDEBUG
        "add	x0, x29, #16\n\t"
#endif /* !NDEBUG */
#ifndef NDEBUG
        "add	x1, x29, #16\n\t"
#endif /* !NDEBUG */
#ifndef __APPLE__
        "bl	fe_sq\n\t"
#else
        "bl	_fe_sq\n\t"
#endif /* __APPLE__ */
        /* fe_mul(t0, t1, t0) */
#ifndef NDEBUG
        "add	x0, x29, #16\n\t"
#endif /* !NDEBUG */
        "add	x1, x29, #48\n\t"
        "add	x2, x29, #16\n\t"
#ifndef __APPLE__
        "bl	fe_mul\n\t"
#else
        "bl	_fe_mul\n\t"
#endif /* __APPLE__ */
        /* fe_sq(t1, t0) */
        "add	x0, x29, #48\n\t"
        "add	x1, x29, #16\n\t"
#ifndef __APPLE__
        "bl	fe_sq\n\t"
#else
        "bl	_fe_sq\n\t"
#endif /* __APPLE__ */
        /* fe_sq(t1, t1), 4 times */
        "mov	x23, #3\n\t"
#ifndef NDEBUG
        "add	x0, x29, #48\n\t"
#endif /* !NDEBUG */
        "add	x1, x29, #48\n\t"
        "\n"
    "L_fe_pow22523_1_%=: \n\t"
#ifndef __APPLE__
        "bl	fe_sq\n\t"
#else
        "bl	_fe_sq\n\t"
#endif /* __APPLE__ */
        "subs	x23, x23, #1\n\t"
        "bcs	L_fe_pow22523_1_%=\n\t"
        /* fe_mul(t0, t1, t0) */
        "add	x0, x29, #16\n\t"
#ifndef NDEBUG
        "add	x1, x29, #48\n\t"
#endif /* !NDEBUG */
        "add	x2, x29, #16\n\t"
#ifndef __APPLE__
        "bl	fe_mul\n\t"
#else
        "bl	_fe_mul\n\t"
#endif /* __APPLE__ */
        /* fe_sq(t1, t0) */
        "add	x0, x29, #48\n\t"
        "add	x1, x29, #16\n\t"
#ifndef __APPLE__
        "bl	fe_sq\n\t"
#else
        "bl	_fe_sq\n\t"
#endif /* __APPLE__ */
        /* fe_sq(t1, t1), 9 times */
        "mov	x23, #8\n\t"
#ifndef NDEBUG
        "add	x0, x29, #48\n\t"
#endif /* !NDEBUG */
        "add	x1, x29, #48\n\t"
        "\n"
    "L_fe_pow22523_2_%=: \n\t"
#ifndef __APPLE__
        "bl	fe_sq\n\t"
#else
        "bl	_fe_sq\n\t"
#endif /* __APPLE__ */
        "subs	x23, x23, #1\n\t"
        "bcs	L_fe_pow22523_2_%=\n\t"
        /* fe_mul(t1, t1, t0) */
#ifndef NDEBUG
        "add	x0, x29, #48\n\t"
#endif /* !NDEBUG */
#ifndef NDEBUG
        "add	x1, x29, #48\n\t"
#endif /* !NDEBUG */
        "add	x2, x29, #16\n\t"
#ifndef __APPLE__
        "bl	fe_mul\n\t"
#else
        "bl	_fe_mul\n\t"
#endif /* __APPLE__ */
        /* fe_sq(t2, t1) */
        "add	x0, x29, #0x50\n\t"
#ifndef NDEBUG
        "add	x1, x29, #48\n\t"
#endif /* !NDEBUG */
#ifndef __APPLE__
        "bl	fe_sq\n\t"
#else
        "bl	_fe_sq\n\t"
#endif /* __APPLE__ */
        /* fe_sq(t2, t2), 19 times */
        "mov	x23, #18\n\t"
#ifndef NDEBUG
        "add	x0, x29, #0x50\n\t"
#endif /* !NDEBUG */
        "add	x1, x29, #0x50\n\t"
        "\n"
    "L_fe_pow22523_3_%=: \n\t"
#ifndef __APPLE__
        "bl	fe_sq\n\t"
#else
        "bl	_fe_sq\n\t"
#endif /* __APPLE__ */
        "subs	x23, x23, #1\n\t"
        "bcs	L_fe_pow22523_3_%=\n\t"
        /* fe_mul(t1, t2, t1) */
        "add	x0, x29, #48\n\t"
#ifndef NDEBUG
        "add	x1, x29, #0x50\n\t"
#endif /* !NDEBUG */
        "add	x2, x29, #48\n\t"
#ifndef __APPLE__
        "bl	fe_mul\n\t"
#else
        "bl	_fe_mul\n\t"
#endif /* __APPLE__ */
        /* fe_sq(t1, t1), 10 times */
        "mov	x23, #9\n\t"
#ifndef NDEBUG
        "add	x0, x29, #48\n\t"
#endif /* !NDEBUG */
        "add	x1, x29, #48\n\t"
        "\n"
    "L_fe_pow22523_4_%=: \n\t"
#ifndef __APPLE__
        "bl	fe_sq\n\t"
#else
        "bl	_fe_sq\n\t"
#endif /* __APPLE__ */
        "subs	x23, x23, #1\n\t"
        "bcs	L_fe_pow22523_4_%=\n\t"
        /* fe_mul(t0, t1, t0) */
        "add	x0, x29, #16\n\t"
#ifndef NDEBUG
        "add	x1, x29, #48\n\t"
#endif /* !NDEBUG */
        "add	x2, x29, #16\n\t"
#ifndef __APPLE__
        "bl	fe_mul\n\t"
#else
        "bl	_fe_mul\n\t"
#endif /* __APPLE__ */
        /* fe_sq(t1, t0) */
        "add	x0, x29, #48\n\t"
        "add	x1, x29, #16\n\t"
#ifndef __APPLE__
        "bl	fe_sq\n\t"
#else
        "bl	_fe_sq\n\t"
#endif /* __APPLE__ */
        /* fe_sq(t1, t1), 49 times */
        "mov	x23, #48\n\t"
#ifndef NDEBUG
        "add	x0, x29, #48\n\t"
#endif /* !NDEBUG */
        "add	x1, x29, #48\n\t"
        "\n"
    "L_fe_pow22523_5_%=: \n\t"
#ifndef __APPLE__
        "bl	fe_sq\n\t"
#else
        "bl	_fe_sq\n\t"
#endif /* __APPLE__ */
        "subs	x23, x23, #1\n\t"
        "bcs	L_fe_pow22523_5_%=\n\t"
        /* fe_mul(t1, t1, t0) */
#ifndef NDEBUG
        "add	x0, x29, #48\n\t"
#endif /* !NDEBUG */
#ifndef NDEBUG
        "add	x1, x29, #48\n\t"
#endif /* !NDEBUG */
        "add	x2, x29, #16\n\t"
#ifndef __APPLE__
        "bl	fe_mul\n\t"
#else
        "bl	_fe_mul\n\t"
#endif /* __APPLE__ */
        /* fe_sq(t2, t1) */
        "add	x0, x29, #0x50\n\t"
#ifndef NDEBUG
        "add	x1, x29, #48\n\t"
#endif /* !NDEBUG */
#ifndef __APPLE__
        "bl	fe_sq\n\t"
#else
        "bl	_fe_sq\n\t"
#endif /* __APPLE__ */
        /* fe_sq(t2, t2), 99 times */
        "mov	x23, #0x62\n\t"
#ifndef NDEBUG
        "add	x0, x29, #0x50\n\t"
#endif /* !NDEBUG */
        "add	x1, x29, #0x50\n\t"
        "\n"
    "L_fe_pow22523_6_%=: \n\t"
#ifndef __APPLE__
        "bl	fe_sq\n\t"
#else
        "bl	_fe_sq\n\t"
#endif /* __APPLE__ */
        "subs	x23, x23, #1\n\t"
        "bcs	L_fe_pow22523_6_%=\n\t"
        /* fe_mul(t1, t2, t1) */
        "add	x0, x29, #48\n\t"
#ifndef NDEBUG
        "add	x1, x29, #0x50\n\t"
#endif /* !NDEBUG */
        "add	x2, x29, #48\n\t"
#ifndef __APPLE__
        "bl	fe_mul\n\t"
#else
        "bl	_fe_mul\n\t"
#endif /* __APPLE__ */
        /* fe_sq(t1, t1), 50 times */
        "mov	x23, #49\n\t"
#ifndef NDEBUG
        "add	x0, x29, #48\n\t"
#endif /* !NDEBUG */
        "add	x1, x29, #48\n\t"
        "\n"
    "L_fe_pow22523_7_%=: \n\t"
#ifndef __APPLE__
        "bl	fe_sq\n\t"
#else
        "bl	_fe_sq\n\t"
#endif /* __APPLE__ */
        "subs	x23, x23, #1\n\t"
        "bcs	L_fe_pow22523_7_%=\n\t"
        /* fe_mul(t0, t1, t0) */
        "add	x0, x29, #16\n\t"
#ifndef NDEBUG
        "add	x1, x29, #48\n\t"
#endif /* !NDEBUG */
        "add	x2, x29, #16\n\t"
#ifndef __APPLE__
        "bl	fe_mul\n\t"
#else
        "bl	_fe_mul\n\t"
#endif /* __APPLE__ */
        /* fe_sq(t0, t0), 2 times */
        "mov	x23, #1\n\t"
#ifndef NDEBUG
        "add	x0, x29, #16\n\t"
#endif /* !NDEBUG */
        "add	x1, x29, #16\n\t"
        "\n"
    "L_fe_pow22523_8_%=: \n\t"
#ifndef __APPLE__
        "bl	fe_sq\n\t"
#else
        "bl	_fe_sq\n\t"
#endif /* __APPLE__ */
        "subs	x23, x23, #1\n\t"
        "bcs	L_fe_pow22523_8_%=\n\t"
        /* fe_mul(r, t0, a) - final result goes to the caller's buffer */
        "ldr	x0, [x29, #112]\n\t"
#ifndef NDEBUG
        "add	x1, x29, #16\n\t"
#endif /* !NDEBUG */
        "ldr	x2, [x29, #120]\n\t"
#ifndef __APPLE__
        "bl	fe_mul\n\t"
#else
        "bl	_fe_mul\n\t"
#endif /* __APPLE__ */
        /* Restore frame pointer and return address, release the frame. */
        "ldp	x29, x30, [sp], #0x80\n\t"
        : [r] "+r" (r), [a] "+r" (a)
        :
        : "memory", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x23", "cc"
    );
}
3110 
/* Compute three field products in one asm statement:
 *   rx = px * pt
 *   ry = py * pz
 *   rz = pz * pt
 *
 * Each product is a 4x4 limb (64-bit limbs) schoolbook multiplication giving
 * a 512-bit value, which is then folded back to 255 bits by multiplying
 * everything above bit 254 by 19 and adding it to the low part. The stored
 * result has bit 255 clear.
 *
 * Stack frame (64 bytes, addressed through x29):
 *   #0 saved x29/x30, #16 ry, #24 rz, #32 px, #40 py, #48 pz, #56 pt
 *
 * "cc" is listed as a clobber because the template changes the condition
 * flags with adds/adcs.
 *
 * NOTE(review): the template stores the first product through x0 without
 * referencing %x[rx], and uses x1-x6 as scratch, so it relies on the seven
 * operands occupying x0-x6 in argument order - confirm if this function is
 * ever inlined.
 *
 * rx  Receives px * pt.
 * ry  Receives py * pz.
 * rz  Receives pz * pt.
 * px  First factor of rx.
 * py  First factor of ry.
 * pz  Second factor of ry, first factor of rz.
 * pt  Second factor of rx and rz.
 */
void fe_ge_to_p2(fe rx, fe ry, fe rz, const fe px, const fe py, const fe pz, const fe pt)
{
    __asm__ __volatile__ (
        "stp	x29, x30, [sp, #-64]!\n\t"
        "add	x29, sp, #0\n\t"
        /* Spill the pointers: x1-x6 are reused as scratch below. */
        "str	%x[ry], [x29, #16]\n\t"
        "str	%x[rz], [x29, #24]\n\t"
        "str	%x[px], [x29, #32]\n\t"
        "str	%x[py], [x29, #40]\n\t"
        "str	%x[pz], [x29, #48]\n\t"
        "str	%x[pt], [x29, #56]\n\t"
        /* First product: A = px, B = pt, stored through x0. */
        "ldr	x1, [x29, #32]\n\t"
        "ldr	x2, [x29, #56]\n\t"
        /* Multiply */
        "ldp	x11, x12, [x1]\n\t"
        "ldp	x13, x14, [x1, #16]\n\t"
        "ldp	x15, x16, [x2]\n\t"
        "ldp	x17, x19, [x2, #16]\n\t"
        /*  A[0] * B[0] */
        "mul	x3, x11, x15\n\t"
        "umulh	x4, x11, x15\n\t"
        /*  A[0] * B[1] */
        "mul	x20, x11, x16\n\t"
        "umulh	x5, x11, x16\n\t"
        "adds	x4, x4, x20\n\t"
        "adc	x5, x5, xzr\n\t"
        /*  A[1] * B[0] */
        "mul	x20, x12, x15\n\t"
        "umulh	x21, x12, x15\n\t"
        "adds	x4, x4, x20\n\t"
        "adcs	x5, x5, x21\n\t"
        "adc	x6, xzr, xzr\n\t"
        /*  A[0] * B[2] */
        "mul	x20, x11, x17\n\t"
        "umulh	x21, x11, x17\n\t"
        "adds	x5, x5, x20\n\t"
        "adc	x6, x6, x21\n\t"
        /*  A[1] * B[1] */
        "mul	x20, x12, x16\n\t"
        "umulh	x21, x12, x16\n\t"
        "adds	x5, x5, x20\n\t"
        "adcs	x6, x6, x21\n\t"
        "adc	x7, xzr, xzr\n\t"
        /*  A[2] * B[0] */
        "mul	x20, x13, x15\n\t"
        "umulh	x21, x13, x15\n\t"
        "adds	x5, x5, x20\n\t"
        "adcs	x6, x6, x21\n\t"
        "adc	x7, x7, xzr\n\t"
        /*  A[0] * B[3] */
        "mul	x20, x11, x19\n\t"
        "umulh	x21, x11, x19\n\t"
        "adds	x6, x6, x20\n\t"
        "adcs	x7, x7, x21\n\t"
        "adc	x8, xzr, xzr\n\t"
        /*  A[1] * B[2] */
        "mul	x20, x12, x17\n\t"
        "umulh	x21, x12, x17\n\t"
        "adds	x6, x6, x20\n\t"
        "adcs	x7, x7, x21\n\t"
        "adc	x8, x8, xzr\n\t"
        /*  A[2] * B[1] */
        "mul	x20, x13, x16\n\t"
        "umulh	x21, x13, x16\n\t"
        "adds	x6, x6, x20\n\t"
        "adcs	x7, x7, x21\n\t"
        "adc	x8, x8, xzr\n\t"
        /*  A[3] * B[0] */
        "mul	x20, x14, x15\n\t"
        "umulh	x21, x14, x15\n\t"
        "adds	x6, x6, x20\n\t"
        "adcs	x7, x7, x21\n\t"
        "adc	x8, x8, xzr\n\t"
        /*  A[1] * B[3] */
        "mul	x20, x12, x19\n\t"
        "umulh	x21, x12, x19\n\t"
        "adds	x7, x7, x20\n\t"
        "adcs	x8, x8, x21\n\t"
        "adc	x9, xzr, xzr\n\t"
        /*  A[2] * B[2] */
        "mul	x20, x13, x17\n\t"
        "umulh	x21, x13, x17\n\t"
        "adds	x7, x7, x20\n\t"
        "adcs	x8, x8, x21\n\t"
        "adc	x9, x9, xzr\n\t"
        /*  A[3] * B[1] */
        "mul	x20, x14, x16\n\t"
        "umulh	x21, x14, x16\n\t"
        "adds	x7, x7, x20\n\t"
        "adcs	x8, x8, x21\n\t"
        "adc	x9, x9, xzr\n\t"
        /*  A[2] * B[3] */
        "mul	x20, x13, x19\n\t"
        "umulh	x21, x13, x19\n\t"
        "adds	x8, x8, x20\n\t"
        "adcs	x9, x9, x21\n\t"
        "adc	x10, xzr, xzr\n\t"
        /*  A[3] * B[2] */
        "mul	x20, x14, x17\n\t"
        "umulh	x21, x14, x17\n\t"
        "adds	x8, x8, x20\n\t"
        "adcs	x9, x9, x21\n\t"
        "adc	x10, x10, xzr\n\t"
        /*  A[3] * B[3] */
        "mul	x20, x14, x19\n\t"
        "umulh	x21, x14, x19\n\t"
        "adds	x9, x9, x20\n\t"
        "adc	x10, x10, x21\n\t"
        /* Reduce */
        /*  Move top half into t4-t7 and remove top bit from t3 */
        "extr	x10, x10, x9, #63\n\t"
        "extr	x9, x9, x8, #63\n\t"
        "extr	x8, x8, x7, #63\n\t"
        "extr	x7, x7, x6, #63\n\t"
        "and	x6, x6, #0x7fffffffffffffff\n\t"
        /*  Multiply top half by 19 */
        "mov	x20, #19\n\t"
        "mul	x21, x20, x7\n\t"
        "umulh	x7, x20, x7\n\t"
        "adds	x3, x3, x21\n\t"
        "mul	x21, x20, x8\n\t"
        "umulh	x8, x20, x8\n\t"
        "adcs	x4, x4, x21\n\t"
        "mul	x21, x20, x9\n\t"
        "umulh	x9, x20, x9\n\t"
        "adcs	x5, x5, x21\n\t"
        "mul	x21, x20, x10\n\t"
        "umulh	x22, x20, x10\n\t"
        "adcs	x6, x6, x21\n\t"
        "adc	x22, x22, xzr\n\t"
        /*  Add remaining product results in */
        "adds	x4, x4, x7\n\t"
        "adcs	x5, x5, x8\n\t"
        "adcs	x6, x6, x9\n\t"
        "adc	x22, x22, xzr\n\t"
        /*  Overflow */
        "extr	x22, x22, x6, #63\n\t"
        "mul	x22, x22, x20\n\t"
        "and	x6, x6, #0x7fffffffffffffff\n\t"
        "adds	x3, x3, x22\n\t"
        "adcs	x4, x4, xzr\n\t"
        "adcs	x5, x5, xzr\n\t"
        "adc	x6, x6, xzr\n\t"
        /* Reduce if top bit set */
        "and	x22, x20, x6, asr 63\n\t"
        "and	x6, x6, #0x7fffffffffffffff\n\t"
        "adds	x3, x3, x22\n\t"
        "adcs	x4, x4, xzr\n\t"
        "adcs	x5, x5, xzr\n\t"
        "adc	x6, x6, xzr\n\t"
        /* Store */
        "stp	x3, x4, [x0]\n\t"
        "stp	x5, x6, [x0, #16]\n\t"
        /* Second product: A = py, B = pz, stored to ry. */
        "ldr	x0, [x29, #16]\n\t"
        "ldr	x1, [x29, #40]\n\t"
        "ldr	x2, [x29, #48]\n\t"
        /* Multiply */
        "ldp	x11, x12, [x1]\n\t"
        "ldp	x13, x14, [x1, #16]\n\t"
        "ldp	x15, x16, [x2]\n\t"
        "ldp	x17, x19, [x2, #16]\n\t"
        /*  A[0] * B[0] */
        "mul	x3, x11, x15\n\t"
        "umulh	x4, x11, x15\n\t"
        /*  A[0] * B[1] */
        "mul	x20, x11, x16\n\t"
        "umulh	x5, x11, x16\n\t"
        "adds	x4, x4, x20\n\t"
        "adc	x5, x5, xzr\n\t"
        /*  A[1] * B[0] */
        "mul	x20, x12, x15\n\t"
        "umulh	x21, x12, x15\n\t"
        "adds	x4, x4, x20\n\t"
        "adcs	x5, x5, x21\n\t"
        "adc	x6, xzr, xzr\n\t"
        /*  A[0] * B[2] */
        "mul	x20, x11, x17\n\t"
        "umulh	x21, x11, x17\n\t"
        "adds	x5, x5, x20\n\t"
        "adc	x6, x6, x21\n\t"
        /*  A[1] * B[1] */
        "mul	x20, x12, x16\n\t"
        "umulh	x21, x12, x16\n\t"
        "adds	x5, x5, x20\n\t"
        "adcs	x6, x6, x21\n\t"
        "adc	x7, xzr, xzr\n\t"
        /*  A[2] * B[0] */
        "mul	x20, x13, x15\n\t"
        "umulh	x21, x13, x15\n\t"
        "adds	x5, x5, x20\n\t"
        "adcs	x6, x6, x21\n\t"
        "adc	x7, x7, xzr\n\t"
        /*  A[0] * B[3] */
        "mul	x20, x11, x19\n\t"
        "umulh	x21, x11, x19\n\t"
        "adds	x6, x6, x20\n\t"
        "adcs	x7, x7, x21\n\t"
        "adc	x8, xzr, xzr\n\t"
        /*  A[1] * B[2] */
        "mul	x20, x12, x17\n\t"
        "umulh	x21, x12, x17\n\t"
        "adds	x6, x6, x20\n\t"
        "adcs	x7, x7, x21\n\t"
        "adc	x8, x8, xzr\n\t"
        /*  A[2] * B[1] */
        "mul	x20, x13, x16\n\t"
        "umulh	x21, x13, x16\n\t"
        "adds	x6, x6, x20\n\t"
        "adcs	x7, x7, x21\n\t"
        "adc	x8, x8, xzr\n\t"
        /*  A[3] * B[0] */
        "mul	x20, x14, x15\n\t"
        "umulh	x21, x14, x15\n\t"
        "adds	x6, x6, x20\n\t"
        "adcs	x7, x7, x21\n\t"
        "adc	x8, x8, xzr\n\t"
        /*  A[1] * B[3] */
        "mul	x20, x12, x19\n\t"
        "umulh	x21, x12, x19\n\t"
        "adds	x7, x7, x20\n\t"
        "adcs	x8, x8, x21\n\t"
        "adc	x9, xzr, xzr\n\t"
        /*  A[2] * B[2] */
        "mul	x20, x13, x17\n\t"
        "umulh	x21, x13, x17\n\t"
        "adds	x7, x7, x20\n\t"
        "adcs	x8, x8, x21\n\t"
        "adc	x9, x9, xzr\n\t"
        /*  A[3] * B[1] */
        "mul	x20, x14, x16\n\t"
        "umulh	x21, x14, x16\n\t"
        "adds	x7, x7, x20\n\t"
        "adcs	x8, x8, x21\n\t"
        "adc	x9, x9, xzr\n\t"
        /*  A[2] * B[3] */
        "mul	x20, x13, x19\n\t"
        "umulh	x21, x13, x19\n\t"
        "adds	x8, x8, x20\n\t"
        "adcs	x9, x9, x21\n\t"
        "adc	x10, xzr, xzr\n\t"
        /*  A[3] * B[2] */
        "mul	x20, x14, x17\n\t"
        "umulh	x21, x14, x17\n\t"
        "adds	x8, x8, x20\n\t"
        "adcs	x9, x9, x21\n\t"
        "adc	x10, x10, xzr\n\t"
        /*  A[3] * B[3] */
        "mul	x20, x14, x19\n\t"
        "umulh	x21, x14, x19\n\t"
        "adds	x9, x9, x20\n\t"
        "adc	x10, x10, x21\n\t"
        /* Reduce */
        /*  Move top half into t4-t7 and remove top bit from t3 */
        "extr	x10, x10, x9, #63\n\t"
        "extr	x9, x9, x8, #63\n\t"
        "extr	x8, x8, x7, #63\n\t"
        "extr	x7, x7, x6, #63\n\t"
        "and	x6, x6, #0x7fffffffffffffff\n\t"
        /*  Multiply top half by 19 */
        "mov	x20, #19\n\t"
        "mul	x21, x20, x7\n\t"
        "umulh	x7, x20, x7\n\t"
        "adds	x3, x3, x21\n\t"
        "mul	x21, x20, x8\n\t"
        "umulh	x8, x20, x8\n\t"
        "adcs	x4, x4, x21\n\t"
        "mul	x21, x20, x9\n\t"
        "umulh	x9, x20, x9\n\t"
        "adcs	x5, x5, x21\n\t"
        "mul	x21, x20, x10\n\t"
        "umulh	x22, x20, x10\n\t"
        "adcs	x6, x6, x21\n\t"
        "adc	x22, x22, xzr\n\t"
        /*  Add remaining product results in */
        "adds	x4, x4, x7\n\t"
        "adcs	x5, x5, x8\n\t"
        "adcs	x6, x6, x9\n\t"
        "adc	x22, x22, xzr\n\t"
        /*  Overflow */
        "extr	x22, x22, x6, #63\n\t"
        "mul	x22, x22, x20\n\t"
        "and	x6, x6, #0x7fffffffffffffff\n\t"
        "adds	x3, x3, x22\n\t"
        "adcs	x4, x4, xzr\n\t"
        "adcs	x5, x5, xzr\n\t"
        "adc	x6, x6, xzr\n\t"
        /* Reduce if top bit set */
        "and	x22, x20, x6, asr 63\n\t"
        "and	x6, x6, #0x7fffffffffffffff\n\t"
        "adds	x3, x3, x22\n\t"
        "adcs	x4, x4, xzr\n\t"
        "adcs	x5, x5, xzr\n\t"
        "adc	x6, x6, xzr\n\t"
        /* Store */
        "stp	x3, x4, [x0]\n\t"
        "stp	x5, x6, [x0, #16]\n\t"
        /* Third product: A = pz (limbs still held in x15-x17, x19 from the
         * previous multiply), B = pt, stored to rz. */
        "ldr	x0, [x29, #24]\n\t"
        "ldr	x2, [x29, #56]\n\t"
        /* Multiply */
        "ldp	x11, x12, [x2]\n\t"
        "ldp	x13, x14, [x2, #16]\n\t"
        /*  A[0] * B[0] */
        "mul	x3, x15, x11\n\t"
        "umulh	x4, x15, x11\n\t"
        /*  A[0] * B[1] */
        "mul	x20, x15, x12\n\t"
        "umulh	x5, x15, x12\n\t"
        "adds	x4, x4, x20\n\t"
        "adc	x5, x5, xzr\n\t"
        /*  A[1] * B[0] */
        "mul	x20, x16, x11\n\t"
        "umulh	x21, x16, x11\n\t"
        "adds	x4, x4, x20\n\t"
        "adcs	x5, x5, x21\n\t"
        "adc	x6, xzr, xzr\n\t"
        /*  A[0] * B[2] */
        "mul	x20, x15, x13\n\t"
        "umulh	x21, x15, x13\n\t"
        "adds	x5, x5, x20\n\t"
        "adc	x6, x6, x21\n\t"
        /*  A[1] * B[1] */
        "mul	x20, x16, x12\n\t"
        "umulh	x21, x16, x12\n\t"
        "adds	x5, x5, x20\n\t"
        "adcs	x6, x6, x21\n\t"
        "adc	x7, xzr, xzr\n\t"
        /*  A[2] * B[0] */
        "mul	x20, x17, x11\n\t"
        "umulh	x21, x17, x11\n\t"
        "adds	x5, x5, x20\n\t"
        "adcs	x6, x6, x21\n\t"
        "adc	x7, x7, xzr\n\t"
        /*  A[0] * B[3] */
        "mul	x20, x15, x14\n\t"
        "umulh	x21, x15, x14\n\t"
        "adds	x6, x6, x20\n\t"
        "adcs	x7, x7, x21\n\t"
        "adc	x8, xzr, xzr\n\t"
        /*  A[1] * B[2] */
        "mul	x20, x16, x13\n\t"
        "umulh	x21, x16, x13\n\t"
        "adds	x6, x6, x20\n\t"
        "adcs	x7, x7, x21\n\t"
        "adc	x8, x8, xzr\n\t"
        /*  A[2] * B[1] */
        "mul	x20, x17, x12\n\t"
        "umulh	x21, x17, x12\n\t"
        "adds	x6, x6, x20\n\t"
        "adcs	x7, x7, x21\n\t"
        "adc	x8, x8, xzr\n\t"
        /*  A[3] * B[0] */
        "mul	x20, x19, x11\n\t"
        "umulh	x21, x19, x11\n\t"
        "adds	x6, x6, x20\n\t"
        "adcs	x7, x7, x21\n\t"
        "adc	x8, x8, xzr\n\t"
        /*  A[1] * B[3] */
        "mul	x20, x16, x14\n\t"
        "umulh	x21, x16, x14\n\t"
        "adds	x7, x7, x20\n\t"
        "adcs	x8, x8, x21\n\t"
        "adc	x9, xzr, xzr\n\t"
        /*  A[2] * B[2] */
        "mul	x20, x17, x13\n\t"
        "umulh	x21, x17, x13\n\t"
        "adds	x7, x7, x20\n\t"
        "adcs	x8, x8, x21\n\t"
        "adc	x9, x9, xzr\n\t"
        /*  A[3] * B[1] */
        "mul	x20, x19, x12\n\t"
        "umulh	x21, x19, x12\n\t"
        "adds	x7, x7, x20\n\t"
        "adcs	x8, x8, x21\n\t"
        "adc	x9, x9, xzr\n\t"
        /*  A[2] * B[3] */
        "mul	x20, x17, x14\n\t"
        "umulh	x21, x17, x14\n\t"
        "adds	x8, x8, x20\n\t"
        "adcs	x9, x9, x21\n\t"
        "adc	x10, xzr, xzr\n\t"
        /*  A[3] * B[2] */
        "mul	x20, x19, x13\n\t"
        "umulh	x21, x19, x13\n\t"
        "adds	x8, x8, x20\n\t"
        "adcs	x9, x9, x21\n\t"
        "adc	x10, x10, xzr\n\t"
        /*  A[3] * B[3] */
        "mul	x20, x19, x14\n\t"
        "umulh	x21, x19, x14\n\t"
        "adds	x9, x9, x20\n\t"
        "adc	x10, x10, x21\n\t"
        /* Reduce */
        /*  Move top half into t4-t7 and remove top bit from t3 */
        "extr	x10, x10, x9, #63\n\t"
        "extr	x9, x9, x8, #63\n\t"
        "extr	x8, x8, x7, #63\n\t"
        "extr	x7, x7, x6, #63\n\t"
        "and	x6, x6, #0x7fffffffffffffff\n\t"
        /*  Multiply top half by 19 */
        "mov	x20, #19\n\t"
        "mul	x21, x20, x7\n\t"
        "umulh	x7, x20, x7\n\t"
        "adds	x3, x3, x21\n\t"
        "mul	x21, x20, x8\n\t"
        "umulh	x8, x20, x8\n\t"
        "adcs	x4, x4, x21\n\t"
        "mul	x21, x20, x9\n\t"
        "umulh	x9, x20, x9\n\t"
        "adcs	x5, x5, x21\n\t"
        "mul	x21, x20, x10\n\t"
        "umulh	x22, x20, x10\n\t"
        "adcs	x6, x6, x21\n\t"
        "adc	x22, x22, xzr\n\t"
        /*  Add remaining product results in */
        "adds	x4, x4, x7\n\t"
        "adcs	x5, x5, x8\n\t"
        "adcs	x6, x6, x9\n\t"
        "adc	x22, x22, xzr\n\t"
        /*  Overflow */
        "extr	x22, x22, x6, #63\n\t"
        "mul	x22, x22, x20\n\t"
        "and	x6, x6, #0x7fffffffffffffff\n\t"
        "adds	x3, x3, x22\n\t"
        "adcs	x4, x4, xzr\n\t"
        "adcs	x5, x5, xzr\n\t"
        "adc	x6, x6, xzr\n\t"
        /* Reduce if top bit set */
        "and	x22, x20, x6, asr 63\n\t"
        "and	x6, x6, #0x7fffffffffffffff\n\t"
        "adds	x3, x3, x22\n\t"
        "adcs	x4, x4, xzr\n\t"
        "adcs	x5, x5, xzr\n\t"
        "adc	x6, x6, xzr\n\t"
        /* Store */
        "stp	x3, x4, [x0]\n\t"
        "stp	x5, x6, [x0, #16]\n\t"
        "ldp	x29, x30, [sp], #0x40\n\t"
        : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt)
        :
        : "memory", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "cc"
    );
}
3553 
fe_ge_to_p3(fe rx,fe ry,fe rz,fe rt,const fe px,const fe py,const fe pz,const fe pt)3554 void fe_ge_to_p3(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt)
3555 {
3556     __asm__ __volatile__ (
3557         "stp	x29, x30, [sp, #-96]!\n\t"
3558         "add	x29, sp, #0\n\t"
3559         "str	%x[ry], [x29, #16]\n\t"
3560         "str	%x[rz], [x29, #24]\n\t"
3561         "str	%x[rt], [x29, #32]\n\t"
3562         "str	%x[px], [x29, #40]\n\t"
3563         "str	%x[py], [x29, #48]\n\t"
3564         "str	%x[pz], [x29, #56]\n\t"
3565         "str	%x[pt], [x29, #64]\n\t"
3566         "ldr	x1, [x29, #40]\n\t"
3567         "ldr	x2, [x29, #64]\n\t"
3568         /* Multiply */
3569         "ldp	x11, x12, [x1]\n\t"
3570         "ldp	x13, x14, [x1, #16]\n\t"
3571         "ldp	x15, x16, [x2]\n\t"
3572         "ldp	x17, x19, [x2, #16]\n\t"
3573         /*  A[0] * B[0] */
3574         "mul	x3, x11, x15\n\t"
3575         "umulh	x4, x11, x15\n\t"
3576         /*  A[0] * B[1] */
3577         "mul	x24, x11, x16\n\t"
3578         "umulh	x5, x11, x16\n\t"
3579         "adds	x4, x4, x24\n\t"
3580         "adc	x5, x5, xzr\n\t"
3581         /*  A[1] * B[0] */
3582         "mul	x24, x12, x15\n\t"
3583         "umulh	x25, x12, x15\n\t"
3584         "adds	x4, x4, x24\n\t"
3585         "adcs	x5, x5, x25\n\t"
3586         "adc	x6, xzr, xzr\n\t"
3587         /*  A[0] * B[2] */
3588         "mul	x24, x11, x17\n\t"
3589         "umulh	x25, x11, x17\n\t"
3590         "adds	x5, x5, x24\n\t"
3591         "adc	x6, x6, x25\n\t"
3592         /*  A[1] * B[1] */
3593         "mul	x24, x12, x16\n\t"
3594         "umulh	x25, x12, x16\n\t"
3595         "adds	x5, x5, x24\n\t"
3596         "adcs	x6, x6, x25\n\t"
3597         "adc	x7, xzr, xzr\n\t"
3598         /*  A[2] * B[0] */
3599         "mul	x24, x13, x15\n\t"
3600         "umulh	x25, x13, x15\n\t"
3601         "adds	x5, x5, x24\n\t"
3602         "adcs	x6, x6, x25\n\t"
3603         "adc	x7, x7, xzr\n\t"
3604         /*  A[0] * B[3] */
3605         "mul	x24, x11, x19\n\t"
3606         "umulh	x25, x11, x19\n\t"
3607         "adds	x6, x6, x24\n\t"
3608         "adcs	x7, x7, x25\n\t"
3609         "adc	x8, xzr, xzr\n\t"
3610         /*  A[1] * B[2] */
3611         "mul	x24, x12, x17\n\t"
3612         "umulh	x25, x12, x17\n\t"
3613         "adds	x6, x6, x24\n\t"
3614         "adcs	x7, x7, x25\n\t"
3615         "adc	x8, x8, xzr\n\t"
3616         /*  A[2] * B[1] */
3617         "mul	x24, x13, x16\n\t"
3618         "umulh	x25, x13, x16\n\t"
3619         "adds	x6, x6, x24\n\t"
3620         "adcs	x7, x7, x25\n\t"
3621         "adc	x8, x8, xzr\n\t"
3622         /*  A[3] * B[0] */
3623         "mul	x24, x14, x15\n\t"
3624         "umulh	x25, x14, x15\n\t"
3625         "adds	x6, x6, x24\n\t"
3626         "adcs	x7, x7, x25\n\t"
3627         "adc	x8, x8, xzr\n\t"
3628         /*  A[1] * B[3] */
3629         "mul	x24, x12, x19\n\t"
3630         "umulh	x25, x12, x19\n\t"
3631         "adds	x7, x7, x24\n\t"
3632         "adcs	x8, x8, x25\n\t"
3633         "adc	x9, xzr, xzr\n\t"
3634         /*  A[2] * B[2] */
3635         "mul	x24, x13, x17\n\t"
3636         "umulh	x25, x13, x17\n\t"
3637         "adds	x7, x7, x24\n\t"
3638         "adcs	x8, x8, x25\n\t"
3639         "adc	x9, x9, xzr\n\t"
3640         /*  A[3] * B[1] */
3641         "mul	x24, x14, x16\n\t"
3642         "umulh	x25, x14, x16\n\t"
3643         "adds	x7, x7, x24\n\t"
3644         "adcs	x8, x8, x25\n\t"
3645         "adc	x9, x9, xzr\n\t"
3646         /*  A[2] * B[3] */
3647         "mul	x24, x13, x19\n\t"
3648         "umulh	x25, x13, x19\n\t"
3649         "adds	x8, x8, x24\n\t"
3650         "adcs	x9, x9, x25\n\t"
3651         "adc	x10, xzr, xzr\n\t"
3652         /*  A[3] * B[2] */
3653         "mul	x24, x14, x17\n\t"
3654         "umulh	x25, x14, x17\n\t"
3655         "adds	x8, x8, x24\n\t"
3656         "adcs	x9, x9, x25\n\t"
3657         "adc	x10, x10, xzr\n\t"
3658         /*  A[3] * B[3] */
3659         "mul	x24, x14, x19\n\t"
3660         "umulh	x25, x14, x19\n\t"
3661         "adds	x9, x9, x24\n\t"
3662         "adc	x10, x10, x25\n\t"
3663         /* Reduce */
3664         /*  Move top half into t4-t7 and remove top bit from t3 */
3665         "extr	x10, x10, x9, #63\n\t"
3666         "extr	x9, x9, x8, #63\n\t"
3667         "extr	x8, x8, x7, #63\n\t"
3668         "extr	x7, x7, x6, #63\n\t"
3669         "and	x6, x6, #0x7fffffffffffffff\n\t"
3670         /*  Multiply top half by 19 */
3671         "mov	x24, #19\n\t"
3672         "mul	x25, x24, x7\n\t"
3673         "umulh	x7, x24, x7\n\t"
3674         "adds	x3, x3, x25\n\t"
3675         "mul	x25, x24, x8\n\t"
3676         "umulh	x8, x24, x8\n\t"
3677         "adcs	x4, x4, x25\n\t"
3678         "mul	x25, x24, x9\n\t"
3679         "umulh	x9, x24, x9\n\t"
3680         "adcs	x5, x5, x25\n\t"
3681         "mul	x25, x24, x10\n\t"
3682         "umulh	x26, x24, x10\n\t"
3683         "adcs	x6, x6, x25\n\t"
3684         "adc	x26, x26, xzr\n\t"
3685         /*  Add remaining product results in */
3686         "adds	x4, x4, x7\n\t"
3687         "adcs	x5, x5, x8\n\t"
3688         "adcs	x6, x6, x9\n\t"
3689         "adc	x26, x26, xzr\n\t"
3690         /*  Overflow */
3691         "extr	x26, x26, x6, #63\n\t"
3692         "mul	x26, x26, x24\n\t"
3693         "and	x6, x6, #0x7fffffffffffffff\n\t"
3694         "adds	x3, x3, x26\n\t"
3695         "adcs	x4, x4, xzr\n\t"
3696         "adcs	x5, x5, xzr\n\t"
3697         "adc	x6, x6, xzr\n\t"
3698         /* Reduce if top bit set */
3699         "and	x26, x24, x6, asr 63\n\t"
3700         "and	x6, x6, #0x7fffffffffffffff\n\t"
3701         "adds	x3, x3, x26\n\t"
3702         "adcs	x4, x4, xzr\n\t"
3703         "adcs	x5, x5, xzr\n\t"
3704         "adc	x6, x6, xzr\n\t"
3705         /* Store */
3706         "stp	x3, x4, [x0]\n\t"
3707         "stp	x5, x6, [x0, #16]\n\t"
3708         "ldr	x0, [x29, #32]\n\t"
3709         "ldr	x2, [x29, #48]\n\t"
3710         /* Multiply */
3711         "ldp	x20, x21, [x2]\n\t"
3712         "ldp	x22, x23, [x2, #16]\n\t"
3713         /*  A[0] * B[0] */
3714         "mul	x3, x11, x20\n\t"
3715         "umulh	x4, x11, x20\n\t"
3716         /*  A[0] * B[1] */
3717         "mul	x24, x11, x21\n\t"
3718         "umulh	x5, x11, x21\n\t"
3719         "adds	x4, x4, x24\n\t"
3720         "adc	x5, x5, xzr\n\t"
3721         /*  A[1] * B[0] */
3722         "mul	x24, x12, x20\n\t"
3723         "umulh	x25, x12, x20\n\t"
3724         "adds	x4, x4, x24\n\t"
3725         "adcs	x5, x5, x25\n\t"
3726         "adc	x6, xzr, xzr\n\t"
3727         /*  A[0] * B[2] */
3728         "mul	x24, x11, x22\n\t"
3729         "umulh	x25, x11, x22\n\t"
3730         "adds	x5, x5, x24\n\t"
3731         "adc	x6, x6, x25\n\t"
3732         /*  A[1] * B[1] */
3733         "mul	x24, x12, x21\n\t"
3734         "umulh	x25, x12, x21\n\t"
3735         "adds	x5, x5, x24\n\t"
3736         "adcs	x6, x6, x25\n\t"
3737         "adc	x7, xzr, xzr\n\t"
3738         /*  A[2] * B[0] */
3739         "mul	x24, x13, x20\n\t"
3740         "umulh	x25, x13, x20\n\t"
3741         "adds	x5, x5, x24\n\t"
3742         "adcs	x6, x6, x25\n\t"
3743         "adc	x7, x7, xzr\n\t"
3744         /*  A[0] * B[3] */
3745         "mul	x24, x11, x23\n\t"
3746         "umulh	x25, x11, x23\n\t"
3747         "adds	x6, x6, x24\n\t"
3748         "adcs	x7, x7, x25\n\t"
3749         "adc	x8, xzr, xzr\n\t"
3750         /*  A[1] * B[2] */
3751         "mul	x24, x12, x22\n\t"
3752         "umulh	x25, x12, x22\n\t"
3753         "adds	x6, x6, x24\n\t"
3754         "adcs	x7, x7, x25\n\t"
3755         "adc	x8, x8, xzr\n\t"
3756         /*  A[2] * B[1] */
3757         "mul	x24, x13, x21\n\t"
3758         "umulh	x25, x13, x21\n\t"
3759         "adds	x6, x6, x24\n\t"
3760         "adcs	x7, x7, x25\n\t"
3761         "adc	x8, x8, xzr\n\t"
3762         /*  A[3] * B[0] */
3763         "mul	x24, x14, x20\n\t"
3764         "umulh	x25, x14, x20\n\t"
3765         "adds	x6, x6, x24\n\t"
3766         "adcs	x7, x7, x25\n\t"
3767         "adc	x8, x8, xzr\n\t"
3768         /*  A[1] * B[3] */
3769         "mul	x24, x12, x23\n\t"
3770         "umulh	x25, x12, x23\n\t"
3771         "adds	x7, x7, x24\n\t"
3772         "adcs	x8, x8, x25\n\t"
3773         "adc	x9, xzr, xzr\n\t"
3774         /*  A[2] * B[2] */
3775         "mul	x24, x13, x22\n\t"
3776         "umulh	x25, x13, x22\n\t"
3777         "adds	x7, x7, x24\n\t"
3778         "adcs	x8, x8, x25\n\t"
3779         "adc	x9, x9, xzr\n\t"
3780         /*  A[3] * B[1] */
3781         "mul	x24, x14, x21\n\t"
3782         "umulh	x25, x14, x21\n\t"
3783         "adds	x7, x7, x24\n\t"
3784         "adcs	x8, x8, x25\n\t"
3785         "adc	x9, x9, xzr\n\t"
3786         /*  A[2] * B[3] */
3787         "mul	x24, x13, x23\n\t"
3788         "umulh	x25, x13, x23\n\t"
3789         "adds	x8, x8, x24\n\t"
3790         "adcs	x9, x9, x25\n\t"
3791         "adc	x10, xzr, xzr\n\t"
3792         /*  A[3] * B[2] */
3793         "mul	x24, x14, x22\n\t"
3794         "umulh	x25, x14, x22\n\t"
3795         "adds	x8, x8, x24\n\t"
3796         "adcs	x9, x9, x25\n\t"
3797         "adc	x10, x10, xzr\n\t"
3798         /*  A[3] * B[3] */
3799         "mul	x24, x14, x23\n\t"
3800         "umulh	x25, x14, x23\n\t"
3801         "adds	x9, x9, x24\n\t"
3802         "adc	x10, x10, x25\n\t"
3803         /* Reduce */
3804         /*  Move top half into t4-t7 and remove top bit from t3 */
3805         "extr	x10, x10, x9, #63\n\t"
3806         "extr	x9, x9, x8, #63\n\t"
3807         "extr	x8, x8, x7, #63\n\t"
3808         "extr	x7, x7, x6, #63\n\t"
3809         "and	x6, x6, #0x7fffffffffffffff\n\t"
3810         /*  Multiply top half by 19 */
3811         "mov	x24, #19\n\t"
3812         "mul	x25, x24, x7\n\t"
3813         "umulh	x7, x24, x7\n\t"
3814         "adds	x3, x3, x25\n\t"
3815         "mul	x25, x24, x8\n\t"
3816         "umulh	x8, x24, x8\n\t"
3817         "adcs	x4, x4, x25\n\t"
3818         "mul	x25, x24, x9\n\t"
3819         "umulh	x9, x24, x9\n\t"
3820         "adcs	x5, x5, x25\n\t"
3821         "mul	x25, x24, x10\n\t"
3822         "umulh	x26, x24, x10\n\t"
3823         "adcs	x6, x6, x25\n\t"
3824         "adc	x26, x26, xzr\n\t"
3825         /*  Add remaining product results in */
3826         "adds	x4, x4, x7\n\t"
3827         "adcs	x5, x5, x8\n\t"
3828         "adcs	x6, x6, x9\n\t"
3829         "adc	x26, x26, xzr\n\t"
3830         /*  Overflow */
3831         "extr	x26, x26, x6, #63\n\t"
3832         "mul	x26, x26, x24\n\t"
3833         "and	x6, x6, #0x7fffffffffffffff\n\t"
3834         "adds	x3, x3, x26\n\t"
3835         "adcs	x4, x4, xzr\n\t"
3836         "adcs	x5, x5, xzr\n\t"
3837         "adc	x6, x6, xzr\n\t"
3838         /* Reduce if top bit set */
3839         "and	x26, x24, x6, asr 63\n\t"
3840         "and	x6, x6, #0x7fffffffffffffff\n\t"
3841         "adds	x3, x3, x26\n\t"
3842         "adcs	x4, x4, xzr\n\t"
3843         "adcs	x5, x5, xzr\n\t"
3844         "adc	x6, x6, xzr\n\t"
3845         /* Store */
3846         "stp	x3, x4, [x0]\n\t"
3847         "stp	x5, x6, [x0, #16]\n\t"
3848         "ldr	x0, [x29, #16]\n\t"
3849         "ldr	x2, [x29, #56]\n\t"
3850         /* Multiply */
3851         "ldp	x11, x12, [x2]\n\t"
3852         "ldp	x13, x14, [x2, #16]\n\t"
3853         /*  A[0] * B[0] */
3854         "mul	x3, x20, x11\n\t"
3855         "umulh	x4, x20, x11\n\t"
3856         /*  A[0] * B[1] */
3857         "mul	x24, x20, x12\n\t"
3858         "umulh	x5, x20, x12\n\t"
3859         "adds	x4, x4, x24\n\t"
3860         "adc	x5, x5, xzr\n\t"
3861         /*  A[1] * B[0] */
3862         "mul	x24, x21, x11\n\t"
3863         "umulh	x25, x21, x11\n\t"
3864         "adds	x4, x4, x24\n\t"
3865         "adcs	x5, x5, x25\n\t"
3866         "adc	x6, xzr, xzr\n\t"
3867         /*  A[0] * B[2] */
3868         "mul	x24, x20, x13\n\t"
3869         "umulh	x25, x20, x13\n\t"
3870         "adds	x5, x5, x24\n\t"
3871         "adc	x6, x6, x25\n\t"
3872         /*  A[1] * B[1] */
3873         "mul	x24, x21, x12\n\t"
3874         "umulh	x25, x21, x12\n\t"
3875         "adds	x5, x5, x24\n\t"
3876         "adcs	x6, x6, x25\n\t"
3877         "adc	x7, xzr, xzr\n\t"
3878         /*  A[2] * B[0] */
3879         "mul	x24, x22, x11\n\t"
3880         "umulh	x25, x22, x11\n\t"
3881         "adds	x5, x5, x24\n\t"
3882         "adcs	x6, x6, x25\n\t"
3883         "adc	x7, x7, xzr\n\t"
3884         /*  A[0] * B[3] */
3885         "mul	x24, x20, x14\n\t"
3886         "umulh	x25, x20, x14\n\t"
3887         "adds	x6, x6, x24\n\t"
3888         "adcs	x7, x7, x25\n\t"
3889         "adc	x8, xzr, xzr\n\t"
3890         /*  A[1] * B[2] */
3891         "mul	x24, x21, x13\n\t"
3892         "umulh	x25, x21, x13\n\t"
3893         "adds	x6, x6, x24\n\t"
3894         "adcs	x7, x7, x25\n\t"
3895         "adc	x8, x8, xzr\n\t"
3896         /*  A[2] * B[1] */
3897         "mul	x24, x22, x12\n\t"
3898         "umulh	x25, x22, x12\n\t"
3899         "adds	x6, x6, x24\n\t"
3900         "adcs	x7, x7, x25\n\t"
3901         "adc	x8, x8, xzr\n\t"
3902         /*  A[3] * B[0] */
3903         "mul	x24, x23, x11\n\t"
3904         "umulh	x25, x23, x11\n\t"
3905         "adds	x6, x6, x24\n\t"
3906         "adcs	x7, x7, x25\n\t"
3907         "adc	x8, x8, xzr\n\t"
3908         /*  A[1] * B[3] */
3909         "mul	x24, x21, x14\n\t"
3910         "umulh	x25, x21, x14\n\t"
3911         "adds	x7, x7, x24\n\t"
3912         "adcs	x8, x8, x25\n\t"
3913         "adc	x9, xzr, xzr\n\t"
3914         /*  A[2] * B[2] */
3915         "mul	x24, x22, x13\n\t"
3916         "umulh	x25, x22, x13\n\t"
3917         "adds	x7, x7, x24\n\t"
3918         "adcs	x8, x8, x25\n\t"
3919         "adc	x9, x9, xzr\n\t"
3920         /*  A[3] * B[1] */
3921         "mul	x24, x23, x12\n\t"
3922         "umulh	x25, x23, x12\n\t"
3923         "adds	x7, x7, x24\n\t"
3924         "adcs	x8, x8, x25\n\t"
3925         "adc	x9, x9, xzr\n\t"
3926         /*  A[2] * B[3] */
3927         "mul	x24, x22, x14\n\t"
3928         "umulh	x25, x22, x14\n\t"
3929         "adds	x8, x8, x24\n\t"
3930         "adcs	x9, x9, x25\n\t"
3931         "adc	x10, xzr, xzr\n\t"
3932         /*  A[3] * B[2] */
3933         "mul	x24, x23, x13\n\t"
3934         "umulh	x25, x23, x13\n\t"
3935         "adds	x8, x8, x24\n\t"
3936         "adcs	x9, x9, x25\n\t"
3937         "adc	x10, x10, xzr\n\t"
3938         /*  A[3] * B[3] */
3939         "mul	x24, x23, x14\n\t"
3940         "umulh	x25, x23, x14\n\t"
3941         "adds	x9, x9, x24\n\t"
3942         "adc	x10, x10, x25\n\t"
3943         /* Reduce */
3944         /*  Move top half into t4-t7 and remove top bit from t3 */
3945         "extr	x10, x10, x9, #63\n\t"
3946         "extr	x9, x9, x8, #63\n\t"
3947         "extr	x8, x8, x7, #63\n\t"
3948         "extr	x7, x7, x6, #63\n\t"
3949         "and	x6, x6, #0x7fffffffffffffff\n\t"
3950         /*  Multiply top half by 19 */
3951         "mov	x24, #19\n\t"
3952         "mul	x25, x24, x7\n\t"
3953         "umulh	x7, x24, x7\n\t"
3954         "adds	x3, x3, x25\n\t"
3955         "mul	x25, x24, x8\n\t"
3956         "umulh	x8, x24, x8\n\t"
3957         "adcs	x4, x4, x25\n\t"
3958         "mul	x25, x24, x9\n\t"
3959         "umulh	x9, x24, x9\n\t"
3960         "adcs	x5, x5, x25\n\t"
3961         "mul	x25, x24, x10\n\t"
3962         "umulh	x26, x24, x10\n\t"
3963         "adcs	x6, x6, x25\n\t"
3964         "adc	x26, x26, xzr\n\t"
3965         /*  Add remaining product results in */
3966         "adds	x4, x4, x7\n\t"
3967         "adcs	x5, x5, x8\n\t"
3968         "adcs	x6, x6, x9\n\t"
3969         "adc	x26, x26, xzr\n\t"
3970         /*  Overflow */
3971         "extr	x26, x26, x6, #63\n\t"
3972         "mul	x26, x26, x24\n\t"
3973         "and	x6, x6, #0x7fffffffffffffff\n\t"
3974         "adds	x3, x3, x26\n\t"
3975         "adcs	x4, x4, xzr\n\t"
3976         "adcs	x5, x5, xzr\n\t"
3977         "adc	x6, x6, xzr\n\t"
3978         /* Reduce if top bit set */
3979         "and	x26, x24, x6, asr 63\n\t"
3980         "and	x6, x6, #0x7fffffffffffffff\n\t"
3981         "adds	x3, x3, x26\n\t"
3982         "adcs	x4, x4, xzr\n\t"
3983         "adcs	x5, x5, xzr\n\t"
3984         "adc	x6, x6, xzr\n\t"
3985         /* Store */
3986         "stp	x3, x4, [x0]\n\t"
3987         "stp	x5, x6, [x0, #16]\n\t"
3988         "ldr	x0, [x29, #24]\n\t"
3989         /* Multiply */
3990         /*  A[0] * B[0] */
3991         "mul	x3, x11, x15\n\t"
3992         "umulh	x4, x11, x15\n\t"
3993         /*  A[0] * B[1] */
3994         "mul	x24, x11, x16\n\t"
3995         "umulh	x5, x11, x16\n\t"
3996         "adds	x4, x4, x24\n\t"
3997         "adc	x5, x5, xzr\n\t"
3998         /*  A[1] * B[0] */
3999         "mul	x24, x12, x15\n\t"
4000         "umulh	x25, x12, x15\n\t"
4001         "adds	x4, x4, x24\n\t"
4002         "adcs	x5, x5, x25\n\t"
4003         "adc	x6, xzr, xzr\n\t"
4004         /*  A[0] * B[2] */
4005         "mul	x24, x11, x17\n\t"
4006         "umulh	x25, x11, x17\n\t"
4007         "adds	x5, x5, x24\n\t"
4008         "adc	x6, x6, x25\n\t"
4009         /*  A[1] * B[1] */
4010         "mul	x24, x12, x16\n\t"
4011         "umulh	x25, x12, x16\n\t"
4012         "adds	x5, x5, x24\n\t"
4013         "adcs	x6, x6, x25\n\t"
4014         "adc	x7, xzr, xzr\n\t"
4015         /*  A[2] * B[0] */
4016         "mul	x24, x13, x15\n\t"
4017         "umulh	x25, x13, x15\n\t"
4018         "adds	x5, x5, x24\n\t"
4019         "adcs	x6, x6, x25\n\t"
4020         "adc	x7, x7, xzr\n\t"
4021         /*  A[0] * B[3] */
4022         "mul	x24, x11, x19\n\t"
4023         "umulh	x25, x11, x19\n\t"
4024         "adds	x6, x6, x24\n\t"
4025         "adcs	x7, x7, x25\n\t"
4026         "adc	x8, xzr, xzr\n\t"
4027         /*  A[1] * B[2] */
4028         "mul	x24, x12, x17\n\t"
4029         "umulh	x25, x12, x17\n\t"
4030         "adds	x6, x6, x24\n\t"
4031         "adcs	x7, x7, x25\n\t"
4032         "adc	x8, x8, xzr\n\t"
4033         /*  A[2] * B[1] */
4034         "mul	x24, x13, x16\n\t"
4035         "umulh	x25, x13, x16\n\t"
4036         "adds	x6, x6, x24\n\t"
4037         "adcs	x7, x7, x25\n\t"
4038         "adc	x8, x8, xzr\n\t"
4039         /*  A[3] * B[0] */
4040         "mul	x24, x14, x15\n\t"
4041         "umulh	x25, x14, x15\n\t"
4042         "adds	x6, x6, x24\n\t"
4043         "adcs	x7, x7, x25\n\t"
4044         "adc	x8, x8, xzr\n\t"
4045         /*  A[1] * B[3] */
4046         "mul	x24, x12, x19\n\t"
4047         "umulh	x25, x12, x19\n\t"
4048         "adds	x7, x7, x24\n\t"
4049         "adcs	x8, x8, x25\n\t"
4050         "adc	x9, xzr, xzr\n\t"
4051         /*  A[2] * B[2] */
4052         "mul	x24, x13, x17\n\t"
4053         "umulh	x25, x13, x17\n\t"
4054         "adds	x7, x7, x24\n\t"
4055         "adcs	x8, x8, x25\n\t"
4056         "adc	x9, x9, xzr\n\t"
4057         /*  A[3] * B[1] */
4058         "mul	x24, x14, x16\n\t"
4059         "umulh	x25, x14, x16\n\t"
4060         "adds	x7, x7, x24\n\t"
4061         "adcs	x8, x8, x25\n\t"
4062         "adc	x9, x9, xzr\n\t"
4063         /*  A[2] * B[3] */
4064         "mul	x24, x13, x19\n\t"
4065         "umulh	x25, x13, x19\n\t"
4066         "adds	x8, x8, x24\n\t"
4067         "adcs	x9, x9, x25\n\t"
4068         "adc	x10, xzr, xzr\n\t"
4069         /*  A[3] * B[2] */
4070         "mul	x24, x14, x17\n\t"
4071         "umulh	x25, x14, x17\n\t"
4072         "adds	x8, x8, x24\n\t"
4073         "adcs	x9, x9, x25\n\t"
4074         "adc	x10, x10, xzr\n\t"
4075         /*  A[3] * B[3] */
4076         "mul	x24, x14, x19\n\t"
4077         "umulh	x25, x14, x19\n\t"
4078         "adds	x9, x9, x24\n\t"
4079         "adc	x10, x10, x25\n\t"
4080         /* Reduce */
4081         /*  Move top half into t4-t7 and remove top bit from t3 */
4082         "extr	x10, x10, x9, #63\n\t"
4083         "extr	x9, x9, x8, #63\n\t"
4084         "extr	x8, x8, x7, #63\n\t"
4085         "extr	x7, x7, x6, #63\n\t"
4086         "and	x6, x6, #0x7fffffffffffffff\n\t"
4087         /*  Multiply top half by 19 */
4088         "mov	x24, #19\n\t"
4089         "mul	x25, x24, x7\n\t"
4090         "umulh	x7, x24, x7\n\t"
4091         "adds	x3, x3, x25\n\t"
4092         "mul	x25, x24, x8\n\t"
4093         "umulh	x8, x24, x8\n\t"
4094         "adcs	x4, x4, x25\n\t"
4095         "mul	x25, x24, x9\n\t"
4096         "umulh	x9, x24, x9\n\t"
4097         "adcs	x5, x5, x25\n\t"
4098         "mul	x25, x24, x10\n\t"
4099         "umulh	x26, x24, x10\n\t"
4100         "adcs	x6, x6, x25\n\t"
4101         "adc	x26, x26, xzr\n\t"
4102         /*  Add remaining product results in */
4103         "adds	x4, x4, x7\n\t"
4104         "adcs	x5, x5, x8\n\t"
4105         "adcs	x6, x6, x9\n\t"
4106         "adc	x26, x26, xzr\n\t"
4107         /*  Overflow */
4108         "extr	x26, x26, x6, #63\n\t"
4109         "mul	x26, x26, x24\n\t"
4110         "and	x6, x6, #0x7fffffffffffffff\n\t"
4111         "adds	x3, x3, x26\n\t"
4112         "adcs	x4, x4, xzr\n\t"
4113         "adcs	x5, x5, xzr\n\t"
4114         "adc	x6, x6, xzr\n\t"
4115         /* Reduce if top bit set */
4116         "and	x26, x24, x6, asr 63\n\t"
4117         "and	x6, x6, #0x7fffffffffffffff\n\t"
4118         "adds	x3, x3, x26\n\t"
4119         "adcs	x4, x4, xzr\n\t"
4120         "adcs	x5, x5, xzr\n\t"
4121         "adc	x6, x6, xzr\n\t"
4122         /* Store */
4123         "stp	x3, x4, [x0]\n\t"
4124         "stp	x5, x6, [x0, #16]\n\t"
4125         "ldp	x29, x30, [sp], #0x60\n\t"
4126         : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt)
4127         :
4128         : "memory", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26"
4129     );
4130 }
4131 
fe_ge_dbl(fe rx,fe ry,fe rz,fe rt,const fe px,const fe py,const fe pz)4132 void fe_ge_dbl(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz)
4133 {
4134     __asm__ __volatile__ (
4135         "stp	x29, x30, [sp, #-80]!\n\t"
4136         "add	x29, sp, #0\n\t"
4137         "str	%x[rx], [x29, #16]\n\t"
4138         "str	%x[ry], [x29, #24]\n\t"
4139         "str	%x[rz], [x29, #32]\n\t"
4140         "str	%x[rt], [x29, #40]\n\t"
4141         "str	%x[px], [x29, #48]\n\t"
4142         "str	%x[py], [x29, #56]\n\t"
4143         "str	%x[pz], [x29, #64]\n\t"
4144         "ldr	x1, [x29, #48]\n\t"
4145         /* Square */
4146         "ldp	x12, x13, [x1]\n\t"
4147         "ldp	x14, x15, [x1, #16]\n\t"
4148         /*  A[0] * A[1] */
4149         "mul	x5, x12, x13\n\t"
4150         "umulh	x6, x12, x13\n\t"
4151         /*  A[0] * A[2] */
4152         "mul	x25, x12, x14\n\t"
4153         "umulh	x7, x12, x14\n\t"
4154         "adds	x6, x6, x25\n\t"
4155         "adc	x7, x7, xzr\n\t"
4156         /*  A[0] * A[3] */
4157         "mul	x25, x12, x15\n\t"
4158         "umulh	x8, x12, x15\n\t"
4159         "adds	x7, x7, x25\n\t"
4160         "adc	x8, x8, xzr\n\t"
4161         /*  A[1] * A[2] */
4162         "mul	x25, x13, x14\n\t"
4163         "umulh	x26, x13, x14\n\t"
4164         "adds	x7, x7, x25\n\t"
4165         "adcs	x8, x8, x26\n\t"
4166         "adc	x9, xzr, xzr\n\t"
4167         /*  A[1] * A[3] */
4168         "mul	x25, x13, x15\n\t"
4169         "umulh	x26, x13, x15\n\t"
4170         "adds	x8, x8, x25\n\t"
4171         "adc	x9, x9, x26\n\t"
4172         /*  A[2] * A[3] */
4173         "mul	x25, x14, x15\n\t"
4174         "umulh	x10, x14, x15\n\t"
4175         "adds	x9, x9, x25\n\t"
4176         "adc	x10, x10, xzr\n\t"
4177         /* Double */
4178         "adds	x5, x5, x5\n\t"
4179         "adcs	x6, x6, x6\n\t"
4180         "adcs	x7, x7, x7\n\t"
4181         "adcs	x8, x8, x8\n\t"
4182         "adcs	x9, x9, x9\n\t"
4183         "adcs	x10, x10, x10\n\t"
4184         "adc	x11, xzr, xzr\n\t"
4185         /*  A[0] * A[0] */
4186         "mul	x4, x12, x12\n\t"
4187         "umulh	x27, x12, x12\n\t"
4188         /*  A[1] * A[1] */
4189         "mul	x25, x13, x13\n\t"
4190         "umulh	x26, x13, x13\n\t"
4191         "adds	x5, x5, x27\n\t"
4192         "adcs	x6, x6, x25\n\t"
4193         "adc	x27, x26, xzr\n\t"
4194         /*  A[2] * A[2] */
4195         "mul	x25, x14, x14\n\t"
4196         "umulh	x26, x14, x14\n\t"
4197         "adds	x7, x7, x27\n\t"
4198         "adcs	x8, x8, x25\n\t"
4199         "adc	x27, x26, xzr\n\t"
4200         /*  A[3] * A[3] */
4201         "mul	x25, x15, x15\n\t"
4202         "umulh	x26, x15, x15\n\t"
4203         "adds	x9, x9, x27\n\t"
4204         "adcs	x10, x10, x25\n\t"
4205         "adc	x11, x11, x26\n\t"
4206         /* Reduce */
4207         /*  Move top half into t4-t7 and remove top bit from t3 */
4208         "extr	x11, x11, x10, #63\n\t"
4209         "extr	x10, x10, x9, #63\n\t"
4210         "extr	x9, x9, x8, #63\n\t"
4211         "extr	x8, x8, x7, #63\n\t"
4212         "and	x7, x7, #0x7fffffffffffffff\n\t"
4213         /*  Multiply top half by 19 */
4214         "mov	x25, #19\n\t"
4215         "mul	x26, x25, x8\n\t"
4216         "umulh	x8, x25, x8\n\t"
4217         "adds	x4, x4, x26\n\t"
4218         "mul	x26, x25, x9\n\t"
4219         "umulh	x9, x25, x9\n\t"
4220         "adcs	x5, x5, x26\n\t"
4221         "mul	x26, x25, x10\n\t"
4222         "umulh	x10, x25, x10\n\t"
4223         "adcs	x6, x6, x26\n\t"
4224         "mul	x26, x25, x11\n\t"
4225         "umulh	x27, x25, x11\n\t"
4226         "adcs	x7, x7, x26\n\t"
4227         "adc	x27, x27, xzr\n\t"
4228         /*  Add remaining product results in */
4229         "adds	x5, x5, x8\n\t"
4230         "adcs	x6, x6, x9\n\t"
4231         "adcs	x7, x7, x10\n\t"
4232         "adc	x27, x27, xzr\n\t"
4233         /*  Overflow */
4234         "extr	x27, x27, x7, #63\n\t"
4235         "mul	x27, x27, x25\n\t"
4236         "and	x7, x7, #0x7fffffffffffffff\n\t"
4237         "adds	x4, x4, x27\n\t"
4238         "adcs	x5, x5, xzr\n\t"
4239         "adcs	x6, x6, xzr\n\t"
4240         "adc	x7, x7, xzr\n\t"
4241         /* Reduce if top bit set */
4242         "and	x27, x25, x7, asr 63\n\t"
4243         "and	x7, x7, #0x7fffffffffffffff\n\t"
4244         "adds	x4, x4, x27\n\t"
4245         "adcs	x5, x5, xzr\n\t"
4246         "adcs	x6, x6, xzr\n\t"
4247         "adc	x7, x7, xzr\n\t"
4248         /* Store */
4249         "stp	x4, x5, [x0]\n\t"
4250         "stp	x6, x7, [x0, #16]\n\t"
4251         "ldr	x0, [x29, #32]\n\t"
4252         "ldr	x1, [x29, #56]\n\t"
4253         /* Square */
4254         "ldp	x21, x22, [x1]\n\t"
4255         "ldp	x23, x24, [x1, #16]\n\t"
4256         /*  A[0] * A[1] */
4257         "mul	x9, x21, x22\n\t"
4258         "umulh	x10, x21, x22\n\t"
4259         /*  A[0] * A[2] */
4260         "mul	x25, x21, x23\n\t"
4261         "umulh	x11, x21, x23\n\t"
4262         "adds	x10, x10, x25\n\t"
4263         "adc	x11, x11, xzr\n\t"
4264         /*  A[0] * A[3] */
4265         "mul	x25, x21, x24\n\t"
4266         "umulh	x16, x21, x24\n\t"
4267         "adds	x11, x11, x25\n\t"
4268         "adc	x16, x16, xzr\n\t"
4269         /*  A[1] * A[2] */
4270         "mul	x25, x22, x23\n\t"
4271         "umulh	x26, x22, x23\n\t"
4272         "adds	x11, x11, x25\n\t"
4273         "adcs	x16, x16, x26\n\t"
4274         "adc	x17, xzr, xzr\n\t"
4275         /*  A[1] * A[3] */
4276         "mul	x25, x22, x24\n\t"
4277         "umulh	x26, x22, x24\n\t"
4278         "adds	x16, x16, x25\n\t"
4279         "adc	x17, x17, x26\n\t"
4280         /*  A[2] * A[3] */
4281         "mul	x25, x23, x24\n\t"
4282         "umulh	x19, x23, x24\n\t"
4283         "adds	x17, x17, x25\n\t"
4284         "adc	x19, x19, xzr\n\t"
4285         /* Double */
4286         "adds	x9, x9, x9\n\t"
4287         "adcs	x10, x10, x10\n\t"
4288         "adcs	x11, x11, x11\n\t"
4289         "adcs	x16, x16, x16\n\t"
4290         "adcs	x17, x17, x17\n\t"
4291         "adcs	x19, x19, x19\n\t"
4292         "adc	x20, xzr, xzr\n\t"
4293         /*  A[0] * A[0] */
4294         "mul	x8, x21, x21\n\t"
4295         "umulh	x27, x21, x21\n\t"
4296         /*  A[1] * A[1] */
4297         "mul	x25, x22, x22\n\t"
4298         "umulh	x26, x22, x22\n\t"
4299         "adds	x9, x9, x27\n\t"
4300         "adcs	x10, x10, x25\n\t"
4301         "adc	x27, x26, xzr\n\t"
4302         /*  A[2] * A[2] */
4303         "mul	x25, x23, x23\n\t"
4304         "umulh	x26, x23, x23\n\t"
4305         "adds	x11, x11, x27\n\t"
4306         "adcs	x16, x16, x25\n\t"
4307         "adc	x27, x26, xzr\n\t"
4308         /*  A[3] * A[3] */
4309         "mul	x25, x24, x24\n\t"
4310         "umulh	x26, x24, x24\n\t"
4311         "adds	x17, x17, x27\n\t"
4312         "adcs	x19, x19, x25\n\t"
4313         "adc	x20, x20, x26\n\t"
4314         /* Reduce */
4315         /*  Move top half into t4-t7 and remove top bit from t3 */
4316         "extr	x20, x20, x19, #63\n\t"
4317         "extr	x19, x19, x17, #63\n\t"
4318         "extr	x17, x17, x16, #63\n\t"
4319         "extr	x16, x16, x11, #63\n\t"
4320         "and	x11, x11, #0x7fffffffffffffff\n\t"
4321         /*  Multiply top half by 19 */
4322         "mov	x25, #19\n\t"
4323         "mul	x26, x25, x16\n\t"
4324         "umulh	x16, x25, x16\n\t"
4325         "adds	x8, x8, x26\n\t"
4326         "mul	x26, x25, x17\n\t"
4327         "umulh	x17, x25, x17\n\t"
4328         "adcs	x9, x9, x26\n\t"
4329         "mul	x26, x25, x19\n\t"
4330         "umulh	x19, x25, x19\n\t"
4331         "adcs	x10, x10, x26\n\t"
4332         "mul	x26, x25, x20\n\t"
4333         "umulh	x27, x25, x20\n\t"
4334         "adcs	x11, x11, x26\n\t"
4335         "adc	x27, x27, xzr\n\t"
4336         /*  Add remaining product results in */
4337         "adds	x9, x9, x16\n\t"
4338         "adcs	x10, x10, x17\n\t"
4339         "adcs	x11, x11, x19\n\t"
4340         "adc	x27, x27, xzr\n\t"
4341         /*  Overflow */
4342         "extr	x27, x27, x11, #63\n\t"
4343         "mul	x27, x27, x25\n\t"
4344         "and	x11, x11, #0x7fffffffffffffff\n\t"
4345         "adds	x8, x8, x27\n\t"
4346         "adcs	x9, x9, xzr\n\t"
4347         "adcs	x10, x10, xzr\n\t"
4348         "adc	x11, x11, xzr\n\t"
4349         /* Reduce if top bit set */
4350         "and	x27, x25, x11, asr 63\n\t"
4351         "and	x11, x11, #0x7fffffffffffffff\n\t"
4352         "adds	x8, x8, x27\n\t"
4353         "adcs	x9, x9, xzr\n\t"
4354         "adcs	x10, x10, xzr\n\t"
4355         "adc	x11, x11, xzr\n\t"
4356         /* Store */
4357         "stp	x8, x9, [x0]\n\t"
4358         "stp	x10, x11, [x0, #16]\n\t"
4359         "ldr	x0, [x29, #24]\n\t"
4360         /* Add */
4361         "adds	x12, x12, x21\n\t"
4362         "adcs	x13, x13, x22\n\t"
4363         "adcs	x14, x14, x23\n\t"
4364         "adc	x15, x15, x24\n\t"
4365         "mov	x25, #-19\n\t"
4366         "asr	x28, x15, #63\n\t"
4367         /*   Mask the modulus */
4368         "and	x25, x28, x25\n\t"
4369         "and	x26, x28, #0x7fffffffffffffff\n\t"
4370         /*   Sub modulus (if overflow) */
4371         "subs	x12, x12, x25\n\t"
4372         "sbcs	x13, x13, x28\n\t"
4373         "sbcs	x14, x14, x28\n\t"
4374         "sbc	x15, x15, x26\n\t"
4375         "ldr	x0, [x29, #40]\n\t"
4376         /* Square */
4377         /*  A[0] * A[1] */
4378         "mul	x17, x12, x13\n\t"
4379         "umulh	x19, x12, x13\n\t"
4380         /*  A[0] * A[2] */
4381         "mul	x25, x12, x14\n\t"
4382         "umulh	x20, x12, x14\n\t"
4383         "adds	x19, x19, x25\n\t"
4384         "adc	x20, x20, xzr\n\t"
4385         /*  A[0] * A[3] */
4386         "mul	x25, x12, x15\n\t"
4387         "umulh	x21, x12, x15\n\t"
4388         "adds	x20, x20, x25\n\t"
4389         "adc	x21, x21, xzr\n\t"
4390         /*  A[1] * A[2] */
4391         "mul	x25, x13, x14\n\t"
4392         "umulh	x26, x13, x14\n\t"
4393         "adds	x20, x20, x25\n\t"
4394         "adcs	x21, x21, x26\n\t"
4395         "adc	x22, xzr, xzr\n\t"
4396         /*  A[1] * A[3] */
4397         "mul	x25, x13, x15\n\t"
4398         "umulh	x26, x13, x15\n\t"
4399         "adds	x21, x21, x25\n\t"
4400         "adc	x22, x22, x26\n\t"
4401         /*  A[2] * A[3] */
4402         "mul	x25, x14, x15\n\t"
4403         "umulh	x23, x14, x15\n\t"
4404         "adds	x22, x22, x25\n\t"
4405         "adc	x23, x23, xzr\n\t"
4406         /* Double */
4407         "adds	x17, x17, x17\n\t"
4408         "adcs	x19, x19, x19\n\t"
4409         "adcs	x20, x20, x20\n\t"
4410         "adcs	x21, x21, x21\n\t"
4411         "adcs	x22, x22, x22\n\t"
4412         "adcs	x23, x23, x23\n\t"
4413         "adc	x24, xzr, xzr\n\t"
4414         /*  A[0] * A[0] */
4415         "mul	x16, x12, x12\n\t"
4416         "umulh	x27, x12, x12\n\t"
4417         /*  A[1] * A[1] */
4418         "mul	x25, x13, x13\n\t"
4419         "umulh	x26, x13, x13\n\t"
4420         "adds	x17, x17, x27\n\t"
4421         "adcs	x19, x19, x25\n\t"
4422         "adc	x27, x26, xzr\n\t"
4423         /*  A[2] * A[2] */
4424         "mul	x25, x14, x14\n\t"
4425         "umulh	x26, x14, x14\n\t"
4426         "adds	x20, x20, x27\n\t"
4427         "adcs	x21, x21, x25\n\t"
4428         "adc	x27, x26, xzr\n\t"
4429         /*  A[3] * A[3] */
4430         "mul	x25, x15, x15\n\t"
4431         "umulh	x26, x15, x15\n\t"
4432         "adds	x22, x22, x27\n\t"
4433         "adcs	x23, x23, x25\n\t"
4434         "adc	x24, x24, x26\n\t"
4435         /* Reduce */
4436         /*  Move top half into t4-t7 and remove top bit from t3 */
4437         "extr	x24, x24, x23, #63\n\t"
4438         "extr	x23, x23, x22, #63\n\t"
4439         "extr	x22, x22, x21, #63\n\t"
4440         "extr	x21, x21, x20, #63\n\t"
4441         "and	x20, x20, #0x7fffffffffffffff\n\t"
4442         /*  Multiply top half by 19 */
4443         "mov	x25, #19\n\t"
4444         "mul	x26, x25, x21\n\t"
4445         "umulh	x21, x25, x21\n\t"
4446         "adds	x16, x16, x26\n\t"
4447         "mul	x26, x25, x22\n\t"
4448         "umulh	x22, x25, x22\n\t"
4449         "adcs	x17, x17, x26\n\t"
4450         "mul	x26, x25, x23\n\t"
4451         "umulh	x23, x25, x23\n\t"
4452         "adcs	x19, x19, x26\n\t"
4453         "mul	x26, x25, x24\n\t"
4454         "umulh	x27, x25, x24\n\t"
4455         "adcs	x20, x20, x26\n\t"
4456         "adc	x27, x27, xzr\n\t"
4457         /*  Add remaining product results in */
4458         "adds	x17, x17, x21\n\t"
4459         "adcs	x19, x19, x22\n\t"
4460         "adcs	x20, x20, x23\n\t"
4461         "adc	x27, x27, xzr\n\t"
4462         /*  Overflow */
4463         "extr	x27, x27, x20, #63\n\t"
4464         "mul	x27, x27, x25\n\t"
4465         "and	x20, x20, #0x7fffffffffffffff\n\t"
4466         "adds	x16, x16, x27\n\t"
4467         "adcs	x17, x17, xzr\n\t"
4468         "adcs	x19, x19, xzr\n\t"
4469         "adc	x20, x20, xzr\n\t"
4470         /* Reduce if top bit set */
4471         "and	x27, x25, x20, asr 63\n\t"
4472         "and	x20, x20, #0x7fffffffffffffff\n\t"
4473         "adds	x16, x16, x27\n\t"
4474         "adcs	x17, x17, xzr\n\t"
4475         "adcs	x19, x19, xzr\n\t"
4476         "adc	x20, x20, xzr\n\t"
4477         /* Store */
4478         "stp	x16, x17, [x0]\n\t"
4479         "stp	x19, x20, [x0, #16]\n\t"
4480         "ldr	x0, [x29, #24]\n\t"
4481         "ldr	x1, [x29, #32]\n\t"
4482         /* Add */
4483         "adds	x12, x8, x4\n\t"
4484         "adcs	x13, x9, x5\n\t"
4485         "adcs	x14, x10, x6\n\t"
4486         "adc	x15, x11, x7\n\t"
4487         "mov	x25, #-19\n\t"
4488         "asr	x28, x15, #63\n\t"
4489         /*   Mask the modulus */
4490         "and	x25, x28, x25\n\t"
4491         "and	x26, x28, #0x7fffffffffffffff\n\t"
4492         /*   Sub modulus (if overflow) */
4493         "subs	x12, x12, x25\n\t"
4494         "sbcs	x13, x13, x28\n\t"
4495         "sbcs	x14, x14, x28\n\t"
4496         "sbc	x15, x15, x26\n\t"
4497         /* Sub */
4498         "subs	x21, x8, x4\n\t"
4499         "sbcs	x22, x9, x5\n\t"
4500         "sbcs	x23, x10, x6\n\t"
4501         "sbcs	x24, x11, x7\n\t"
4502         "mov	x25, #-19\n\t"
4503         "csetm	x28, cc\n\t"
4504         /*   Mask the modulus */
4505         "and	x25, x28, x25\n\t"
4506         "and	x26, x28, #0x7fffffffffffffff\n\t"
4507         /*   Add modulus (if underflow) */
4508         "adds	x21, x21, x25\n\t"
4509         "adcs	x22, x22, x28\n\t"
4510         "adcs	x23, x23, x28\n\t"
4511         "adc	x24, x24, x26\n\t"
4512         "stp	x12, x13, [x0]\n\t"
4513         "stp	x14, x15, [x0, #16]\n\t"
4514         "stp	x21, x22, [x1]\n\t"
4515         "stp	x23, x24, [x1, #16]\n\t"
4516         "ldr	x0, [x29, #16]\n\t"
4517         /* Sub */
4518         "subs	x16, x16, x12\n\t"
4519         "sbcs	x17, x17, x13\n\t"
4520         "sbcs	x19, x19, x14\n\t"
4521         "sbcs	x20, x20, x15\n\t"
4522         "mov	x25, #-19\n\t"
4523         "csetm	x28, cc\n\t"
4524         /*   Mask the modulus */
4525         "and	x25, x28, x25\n\t"
4526         "and	x26, x28, #0x7fffffffffffffff\n\t"
4527         /*   Add modulus (if underflow) */
4528         "adds	x16, x16, x25\n\t"
4529         "adcs	x17, x17, x28\n\t"
4530         "adcs	x19, x19, x28\n\t"
4531         "adc	x20, x20, x26\n\t"
4532         "stp	x16, x17, [x0]\n\t"
4533         "stp	x19, x20, [x0, #16]\n\t"
4534         "ldr	x0, [x29, #40]\n\t"
4535         "ldr	x1, [x29, #64]\n\t"
4536         /* Square * 2 */
4537         "ldp	x12, x13, [x1]\n\t"
4538         "ldp	x14, x15, [x1, #16]\n\t"
4539         /*  A[0] * A[1] */
4540         "mul	x5, x12, x13\n\t"
4541         "umulh	x6, x12, x13\n\t"
4542         /*  A[0] * A[2] */
4543         "mul	x25, x12, x14\n\t"
4544         "umulh	x7, x12, x14\n\t"
4545         "adds	x6, x6, x25\n\t"
4546         "adc	x7, x7, xzr\n\t"
4547         /*  A[0] * A[3] */
4548         "mul	x25, x12, x15\n\t"
4549         "umulh	x8, x12, x15\n\t"
4550         "adds	x7, x7, x25\n\t"
4551         "adc	x8, x8, xzr\n\t"
4552         /*  A[1] * A[2] */
4553         "mul	x25, x13, x14\n\t"
4554         "umulh	x26, x13, x14\n\t"
4555         "adds	x7, x7, x25\n\t"
4556         "adcs	x8, x8, x26\n\t"
4557         "adc	x9, xzr, xzr\n\t"
4558         /*  A[1] * A[3] */
4559         "mul	x25, x13, x15\n\t"
4560         "umulh	x26, x13, x15\n\t"
4561         "adds	x8, x8, x25\n\t"
4562         "adc	x9, x9, x26\n\t"
4563         /*  A[2] * A[3] */
4564         "mul	x25, x14, x15\n\t"
4565         "umulh	x10, x14, x15\n\t"
4566         "adds	x9, x9, x25\n\t"
4567         "adc	x10, x10, xzr\n\t"
4568         /* Double */
4569         "adds	x5, x5, x5\n\t"
4570         "adcs	x6, x6, x6\n\t"
4571         "adcs	x7, x7, x7\n\t"
4572         "adcs	x8, x8, x8\n\t"
4573         "adcs	x9, x9, x9\n\t"
4574         "adcs	x10, x10, x10\n\t"
4575         "adc	x11, xzr, xzr\n\t"
4576         /*  A[0] * A[0] */
4577         "mul	x4, x12, x12\n\t"
4578         "umulh	x28, x12, x12\n\t"
4579         /*  A[1] * A[1] */
4580         "mul	x25, x13, x13\n\t"
4581         "umulh	x26, x13, x13\n\t"
4582         "adds	x5, x5, x28\n\t"
4583         "adcs	x6, x6, x25\n\t"
4584         "adc	x28, x26, xzr\n\t"
4585         /*  A[2] * A[2] */
4586         "mul	x25, x14, x14\n\t"
4587         "umulh	x26, x14, x14\n\t"
4588         "adds	x7, x7, x28\n\t"
4589         "adcs	x8, x8, x25\n\t"
4590         "adc	x28, x26, xzr\n\t"
4591         /*  A[3] * A[3] */
4592         "mul	x25, x15, x15\n\t"
4593         "umulh	x26, x15, x15\n\t"
4594         "adds	x9, x9, x28\n\t"
4595         "adcs	x10, x10, x25\n\t"
4596         "adc	x11, x11, x26\n\t"
4597         /* Double and Reduce */
4598         "mov	x25, #0x169\n\t"
4599         /*  Move top half into t4-t7 and remove top bit from t3 */
4600         "lsr	x28, x11, #61\n\t"
4601         "extr	x11, x11, x10, #62\n\t"
4602         "extr	x10, x10, x9, #62\n\t"
4603         "extr	x9, x9, x8, #62\n\t"
4604         "extr	x8, x8, x7, #62\n\t"
4605         "extr	x7, x7, x6, #63\n\t"
4606         "extr	x6, x6, x5, #63\n\t"
4607         "extr	x5, x5, x4, #63\n\t"
4608         "lsl	x4, x4, #1\n\t"
4609         "and	x7, x7, #0x7fffffffffffffff\n\t"
4610         /*  Two left, only one right */
4611         "and	x11, x11, #0x7fffffffffffffff\n\t"
4612         /*  Multiply top bits by 19*19 */
4613         "mul	x28, x28, x25\n\t"
4614         /*  Multiply top half by 19 */
4615         "mov	x25, #19\n\t"
4616         "mul	x26, x25, x8\n\t"
4617         "umulh	x8, x25, x8\n\t"
4618         "adds	x4, x4, x26\n\t"
4619         "mul	x26, x25, x9\n\t"
4620         "umulh	x9, x25, x9\n\t"
4621         "adcs	x5, x5, x26\n\t"
4622         "mul	x26, x25, x10\n\t"
4623         "umulh	x10, x25, x10\n\t"
4624         "adcs	x6, x6, x26\n\t"
4625         "mul	x26, x25, x11\n\t"
4626         "umulh	x27, x25, x11\n\t"
4627         "adcs	x7, x7, x26\n\t"
4628         "adc	x27, x27, xzr\n\t"
4629         /*  Add remaining product results in */
4630         "adds	x4, x4, x28\n\t"
4631         "adcs	x5, x5, x8\n\t"
4632         "adcs	x6, x6, x9\n\t"
4633         "adcs	x7, x7, x10\n\t"
4634         "adc	x27, x27, xzr\n\t"
4635         /*  Overflow */
4636         "extr	x27, x27, x7, #63\n\t"
4637         "mul	x27, x27, x25\n\t"
4638         "and	x7, x7, #0x7fffffffffffffff\n\t"
4639         "adds	x4, x4, x27\n\t"
4640         "adcs	x5, x5, xzr\n\t"
4641         "adcs	x6, x6, xzr\n\t"
4642         "adc	x7, x7, xzr\n\t"
4643         /* Reduce if top bit set */
4644         "and	x27, x25, x7, asr 63\n\t"
4645         "and	x7, x7, #0x7fffffffffffffff\n\t"
4646         "adds	x4, x4, x27\n\t"
4647         "adcs	x5, x5, xzr\n\t"
4648         "adcs	x6, x6, xzr\n\t"
4649         "adc	x7, x7, xzr\n\t"
4650         /* Store */
4651         "ldr	x0, [x29, #40]\n\t"
4652         /* Sub */
4653         "subs	x4, x4, x21\n\t"
4654         "sbcs	x5, x5, x22\n\t"
4655         "sbcs	x6, x6, x23\n\t"
4656         "sbcs	x7, x7, x24\n\t"
4657         "mov	x25, #-19\n\t"
4658         "csetm	x28, cc\n\t"
4659         /*   Mask the modulus */
4660         "and	x25, x28, x25\n\t"
4661         "and	x26, x28, #0x7fffffffffffffff\n\t"
4662         /*   Add modulus (if underflow) */
4663         "adds	x4, x4, x25\n\t"
4664         "adcs	x5, x5, x28\n\t"
4665         "adcs	x6, x6, x28\n\t"
4666         "adc	x7, x7, x26\n\t"
4667         "stp	x4, x5, [x0]\n\t"
4668         "stp	x6, x7, [x0, #16]\n\t"
4669         "ldp	x29, x30, [sp], #0x50\n\t"
4670         : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz)
4671         :
4672         : "memory", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
4673     );
4674 }
4675 
/* Combine the field elements px, py, pz, pt with qxy2d, qyplusx and qyminusx
 * into rx, ry, rz and rt, working modulo 2^255 - 19.
 * NOTE(review): presumably the Ed25519 mixed point addition step - confirm
 * against the C reference implementation.
 *
 * As implemented by the assembly below:
 *   rx = (py + px) * qyplusx - (py - px) * qyminusx
 *   ry = (py + px) * qyplusx + (py - px) * qyminusx
 *   rz = 2 * pz + qxy2d * pt
 *   rt = 2 * pz - qxy2d * pt
 *
 * Every field element is accessed as four 64-bit words (two ldp/stp pairs at
 * byte offsets 0 and 16).
 *
 * rx and ry are stored before pz, pt and qxy2d are loaded, so a caller that
 * aliases rx/ry with any of those inputs gets the overwritten values.
 *
 * rx        Output field element.
 * ry        Output field element.
 * rz        Output field element.
 * rt        Output field element.
 * px        Input field element.
 * py        Input field element.
 * pz        Input field element.
 * pt        Input field element.
 * qxy2d     Input field element multiplied by pt.
 * qyplusx   Input field element multiplied by (py + px).
 * qyminusx  Input field element multiplied by (py - px).
 */
void fe_ge_madd(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qxy2d, const fe qyplusx, const fe qyminusx)
{
    __asm__ __volatile__ (
        /* Reserve a 112 byte frame, save x29/x30 and spill every pointer
         * argument into it so that the hard-coded registers are free. */
        "stp	x29, x30, [sp, #-112]!\n\t"
        "add	x29, sp, #0\n\t"
        "str	%x[qyminusx], [sp, #104]\n\t"
        "str	%x[qyplusx], [sp, #96]\n\t"
        "str	%x[qxy2d], [sp, #88]\n\t"
        "str	%x[rx], [x29, #16]\n\t"
        "str	%x[ry], [x29, #24]\n\t"
        "str	%x[rz], [x29, #32]\n\t"
        "str	%x[rt], [x29, #40]\n\t"
        "str	%x[px], [x29, #48]\n\t"
        "str	%x[py], [x29, #56]\n\t"
        "str	%x[pz], [x29, #64]\n\t"
        "str	%x[pt], [x29, #72]\n\t"
        /* x2 = py, x3 = px */
        "ldr	x2, [x29, #56]\n\t"
        "ldr	x3, [x29, #48]\n\t"
        /* Add */
        /*   x4..x7 = py + px */
        "ldp	x12, x13, [x2]\n\t"
        "ldp	x14, x15, [x2, #16]\n\t"
        "ldp	x16, x17, [x3]\n\t"
        "ldp	x19, x20, [x3, #16]\n\t"
        "adds	x4, x12, x16\n\t"
        "adcs	x5, x13, x17\n\t"
        "adcs	x6, x14, x19\n\t"
        "adc	x7, x15, x20\n\t"
        "mov	x25, #-19\n\t"
        "asr	x28, x7, #63\n\t"
        /*   Mask the modulus */
        "and	x25, x28, x25\n\t"
        "and	x26, x28, #0x7fffffffffffffff\n\t"
        /*   Sub modulus (if overflow) */
        "subs	x4, x4, x25\n\t"
        "sbcs	x5, x5, x28\n\t"
        "sbcs	x6, x6, x28\n\t"
        "sbc	x7, x7, x26\n\t"
        /* Sub */
        /*   x8..x11 = py - px */
        "subs	x8, x12, x16\n\t"
        "sbcs	x9, x13, x17\n\t"
        "sbcs	x10, x14, x19\n\t"
        "sbcs	x11, x15, x20\n\t"
        "mov	x25, #-19\n\t"
        "csetm	x28, cc\n\t"
        /*   Mask the modulus */
        "and	x25, x28, x25\n\t"
        "and	x26, x28, #0x7fffffffffffffff\n\t"
        /*   Add modulus (if underflow) */
        "adds	x8, x8, x25\n\t"
        "adcs	x9, x9, x28\n\t"
        "adcs	x10, x10, x28\n\t"
        "adc	x11, x11, x26\n\t"
        /* x0 is loaded again before it is next used; x2 = qyplusx */
        "ldr	x0, [x29, #32]\n\t"
        "ldr	x2, [sp, #96]\n\t"
        /* Multiply */
        /*   x12..x15 = (py + px) * qyplusx */
        "ldp	x21, x22, [x2]\n\t"
        "ldp	x23, x24, [x2, #16]\n\t"
        /*  A[0] * B[0] */
        "mul	x12, x4, x21\n\t"
        "umulh	x13, x4, x21\n\t"
        /*  A[0] * B[1] */
        "mul	x25, x4, x22\n\t"
        "umulh	x14, x4, x22\n\t"
        "adds	x13, x13, x25\n\t"
        "adc	x14, x14, xzr\n\t"
        /*  A[1] * B[0] */
        "mul	x25, x5, x21\n\t"
        "umulh	x26, x5, x21\n\t"
        "adds	x13, x13, x25\n\t"
        "adcs	x14, x14, x26\n\t"
        "adc	x15, xzr, xzr\n\t"
        /*  A[0] * B[2] */
        "mul	x25, x4, x23\n\t"
        "umulh	x26, x4, x23\n\t"
        "adds	x14, x14, x25\n\t"
        "adc	x15, x15, x26\n\t"
        /*  A[1] * B[1] */
        "mul	x25, x5, x22\n\t"
        "umulh	x26, x5, x22\n\t"
        "adds	x14, x14, x25\n\t"
        "adcs	x15, x15, x26\n\t"
        "adc	x16, xzr, xzr\n\t"
        /*  A[2] * B[0] */
        "mul	x25, x6, x21\n\t"
        "umulh	x26, x6, x21\n\t"
        "adds	x14, x14, x25\n\t"
        "adcs	x15, x15, x26\n\t"
        "adc	x16, x16, xzr\n\t"
        /*  A[0] * B[3] */
        "mul	x25, x4, x24\n\t"
        "umulh	x26, x4, x24\n\t"
        "adds	x15, x15, x25\n\t"
        "adcs	x16, x16, x26\n\t"
        "adc	x17, xzr, xzr\n\t"
        /*  A[1] * B[2] */
        "mul	x25, x5, x23\n\t"
        "umulh	x26, x5, x23\n\t"
        "adds	x15, x15, x25\n\t"
        "adcs	x16, x16, x26\n\t"
        "adc	x17, x17, xzr\n\t"
        /*  A[2] * B[1] */
        "mul	x25, x6, x22\n\t"
        "umulh	x26, x6, x22\n\t"
        "adds	x15, x15, x25\n\t"
        "adcs	x16, x16, x26\n\t"
        "adc	x17, x17, xzr\n\t"
        /*  A[3] * B[0] */
        "mul	x25, x7, x21\n\t"
        "umulh	x26, x7, x21\n\t"
        "adds	x15, x15, x25\n\t"
        "adcs	x16, x16, x26\n\t"
        "adc	x17, x17, xzr\n\t"
        /*  A[1] * B[3] */
        "mul	x25, x5, x24\n\t"
        "umulh	x26, x5, x24\n\t"
        "adds	x16, x16, x25\n\t"
        "adcs	x17, x17, x26\n\t"
        "adc	x19, xzr, xzr\n\t"
        /*  A[2] * B[2] */
        "mul	x25, x6, x23\n\t"
        "umulh	x26, x6, x23\n\t"
        "adds	x16, x16, x25\n\t"
        "adcs	x17, x17, x26\n\t"
        "adc	x19, x19, xzr\n\t"
        /*  A[3] * B[1] */
        "mul	x25, x7, x22\n\t"
        "umulh	x26, x7, x22\n\t"
        "adds	x16, x16, x25\n\t"
        "adcs	x17, x17, x26\n\t"
        "adc	x19, x19, xzr\n\t"
        /*  A[2] * B[3] */
        "mul	x25, x6, x24\n\t"
        "umulh	x26, x6, x24\n\t"
        "adds	x17, x17, x25\n\t"
        "adcs	x19, x19, x26\n\t"
        "adc	x20, xzr, xzr\n\t"
        /*  A[3] * B[2] */
        "mul	x25, x7, x23\n\t"
        "umulh	x26, x7, x23\n\t"
        "adds	x17, x17, x25\n\t"
        "adcs	x19, x19, x26\n\t"
        "adc	x20, x20, xzr\n\t"
        /*  A[3] * B[3] */
        "mul	x25, x7, x24\n\t"
        "umulh	x26, x7, x24\n\t"
        "adds	x19, x19, x25\n\t"
        "adc	x20, x20, x26\n\t"
        /* Reduce */
        /*  Move top half into t4-t7 and remove top bit from t3 */
        "extr	x20, x20, x19, #63\n\t"
        "extr	x19, x19, x17, #63\n\t"
        "extr	x17, x17, x16, #63\n\t"
        "extr	x16, x16, x15, #63\n\t"
        "and	x15, x15, #0x7fffffffffffffff\n\t"
        /*  Multiply top half by 19 */
        "mov	x25, #19\n\t"
        "mul	x26, x25, x16\n\t"
        "umulh	x16, x25, x16\n\t"
        "adds	x12, x12, x26\n\t"
        "mul	x26, x25, x17\n\t"
        "umulh	x17, x25, x17\n\t"
        "adcs	x13, x13, x26\n\t"
        "mul	x26, x25, x19\n\t"
        "umulh	x19, x25, x19\n\t"
        "adcs	x14, x14, x26\n\t"
        "mul	x26, x25, x20\n\t"
        "umulh	x27, x25, x20\n\t"
        "adcs	x15, x15, x26\n\t"
        "adc	x27, x27, xzr\n\t"
        /*  Add remaining product results in */
        "adds	x13, x13, x16\n\t"
        "adcs	x14, x14, x17\n\t"
        "adcs	x15, x15, x19\n\t"
        "adc	x27, x27, xzr\n\t"
        /*  Overflow */
        "extr	x27, x27, x15, #63\n\t"
        "mul	x27, x27, x25\n\t"
        "and	x15, x15, #0x7fffffffffffffff\n\t"
        "adds	x12, x12, x27\n\t"
        "adcs	x13, x13, xzr\n\t"
        "adcs	x14, x14, xzr\n\t"
        "adc	x15, x15, xzr\n\t"
        /* Reduce if top bit set */
        "and	x27, x25, x15, asr 63\n\t"
        "and	x15, x15, #0x7fffffffffffffff\n\t"
        "adds	x12, x12, x27\n\t"
        "adcs	x13, x13, xzr\n\t"
        "adcs	x14, x14, xzr\n\t"
        "adc	x15, x15, xzr\n\t"
        /* Store */
        /*   Product stays in x12..x15; x0 = ry, x1 = qyminusx */
        "ldr	x0, [x29, #24]\n\t"
        "ldr	x1, [sp, #104]\n\t"
        /* Multiply */
        /*   x4..x7 = (py - px) * qyminusx */
        "ldp	x21, x22, [x1]\n\t"
        "ldp	x23, x24, [x1, #16]\n\t"
        /*  A[0] * B[0] */
        "mul	x4, x8, x21\n\t"
        "umulh	x5, x8, x21\n\t"
        /*  A[0] * B[1] */
        "mul	x25, x8, x22\n\t"
        "umulh	x6, x8, x22\n\t"
        "adds	x5, x5, x25\n\t"
        "adc	x6, x6, xzr\n\t"
        /*  A[1] * B[0] */
        "mul	x25, x9, x21\n\t"
        "umulh	x26, x9, x21\n\t"
        "adds	x5, x5, x25\n\t"
        "adcs	x6, x6, x26\n\t"
        "adc	x7, xzr, xzr\n\t"
        /*  A[0] * B[2] */
        "mul	x25, x8, x23\n\t"
        "umulh	x26, x8, x23\n\t"
        "adds	x6, x6, x25\n\t"
        "adc	x7, x7, x26\n\t"
        /*  A[1] * B[1] */
        "mul	x25, x9, x22\n\t"
        "umulh	x26, x9, x22\n\t"
        "adds	x6, x6, x25\n\t"
        "adcs	x7, x7, x26\n\t"
        "adc	x16, xzr, xzr\n\t"
        /*  A[2] * B[0] */
        "mul	x25, x10, x21\n\t"
        "umulh	x26, x10, x21\n\t"
        "adds	x6, x6, x25\n\t"
        "adcs	x7, x7, x26\n\t"
        "adc	x16, x16, xzr\n\t"
        /*  A[0] * B[3] */
        "mul	x25, x8, x24\n\t"
        "umulh	x26, x8, x24\n\t"
        "adds	x7, x7, x25\n\t"
        "adcs	x16, x16, x26\n\t"
        "adc	x17, xzr, xzr\n\t"
        /*  A[1] * B[2] */
        "mul	x25, x9, x23\n\t"
        "umulh	x26, x9, x23\n\t"
        "adds	x7, x7, x25\n\t"
        "adcs	x16, x16, x26\n\t"
        "adc	x17, x17, xzr\n\t"
        /*  A[2] * B[1] */
        "mul	x25, x10, x22\n\t"
        "umulh	x26, x10, x22\n\t"
        "adds	x7, x7, x25\n\t"
        "adcs	x16, x16, x26\n\t"
        "adc	x17, x17, xzr\n\t"
        /*  A[3] * B[0] */
        "mul	x25, x11, x21\n\t"
        "umulh	x26, x11, x21\n\t"
        "adds	x7, x7, x25\n\t"
        "adcs	x16, x16, x26\n\t"
        "adc	x17, x17, xzr\n\t"
        /*  A[1] * B[3] */
        "mul	x25, x9, x24\n\t"
        "umulh	x26, x9, x24\n\t"
        "adds	x16, x16, x25\n\t"
        "adcs	x17, x17, x26\n\t"
        "adc	x19, xzr, xzr\n\t"
        /*  A[2] * B[2] */
        "mul	x25, x10, x23\n\t"
        "umulh	x26, x10, x23\n\t"
        "adds	x16, x16, x25\n\t"
        "adcs	x17, x17, x26\n\t"
        "adc	x19, x19, xzr\n\t"
        /*  A[3] * B[1] */
        "mul	x25, x11, x22\n\t"
        "umulh	x26, x11, x22\n\t"
        "adds	x16, x16, x25\n\t"
        "adcs	x17, x17, x26\n\t"
        "adc	x19, x19, xzr\n\t"
        /*  A[2] * B[3] */
        "mul	x25, x10, x24\n\t"
        "umulh	x26, x10, x24\n\t"
        "adds	x17, x17, x25\n\t"
        "adcs	x19, x19, x26\n\t"
        "adc	x20, xzr, xzr\n\t"
        /*  A[3] * B[2] */
        "mul	x25, x11, x23\n\t"
        "umulh	x26, x11, x23\n\t"
        "adds	x17, x17, x25\n\t"
        "adcs	x19, x19, x26\n\t"
        "adc	x20, x20, xzr\n\t"
        /*  A[3] * B[3] */
        "mul	x25, x11, x24\n\t"
        "umulh	x26, x11, x24\n\t"
        "adds	x19, x19, x25\n\t"
        "adc	x20, x20, x26\n\t"
        /* Reduce */
        /*  Move top half into t4-t7 and remove top bit from t3 */
        "extr	x20, x20, x19, #63\n\t"
        "extr	x19, x19, x17, #63\n\t"
        "extr	x17, x17, x16, #63\n\t"
        "extr	x16, x16, x7, #63\n\t"
        "and	x7, x7, #0x7fffffffffffffff\n\t"
        /*  Multiply top half by 19 */
        "mov	x25, #19\n\t"
        "mul	x26, x25, x16\n\t"
        "umulh	x16, x25, x16\n\t"
        "adds	x4, x4, x26\n\t"
        "mul	x26, x25, x17\n\t"
        "umulh	x17, x25, x17\n\t"
        "adcs	x5, x5, x26\n\t"
        "mul	x26, x25, x19\n\t"
        "umulh	x19, x25, x19\n\t"
        "adcs	x6, x6, x26\n\t"
        "mul	x26, x25, x20\n\t"
        "umulh	x27, x25, x20\n\t"
        "adcs	x7, x7, x26\n\t"
        "adc	x27, x27, xzr\n\t"
        /*  Add remaining product results in */
        "adds	x5, x5, x16\n\t"
        "adcs	x6, x6, x17\n\t"
        "adcs	x7, x7, x19\n\t"
        "adc	x27, x27, xzr\n\t"
        /*  Overflow */
        "extr	x27, x27, x7, #63\n\t"
        "mul	x27, x27, x25\n\t"
        "and	x7, x7, #0x7fffffffffffffff\n\t"
        "adds	x4, x4, x27\n\t"
        "adcs	x5, x5, xzr\n\t"
        "adcs	x6, x6, xzr\n\t"
        "adc	x7, x7, xzr\n\t"
        /* Reduce if top bit set */
        "and	x27, x25, x7, asr 63\n\t"
        "and	x7, x7, #0x7fffffffffffffff\n\t"
        "adds	x4, x4, x27\n\t"
        "adcs	x5, x5, xzr\n\t"
        "adcs	x6, x6, xzr\n\t"
        "adc	x7, x7, xzr\n\t"
        /* Store */
        /*   x0 = ry, x1 = rx */
        "ldr	x0, [x29, #24]\n\t"
        "ldr	x1, [x29, #16]\n\t"
        /* Add */
        /*   x8..x11 = sum of the two products (stored to ry) */
        "adds	x8, x12, x4\n\t"
        "adcs	x9, x13, x5\n\t"
        "adcs	x10, x14, x6\n\t"
        "adc	x11, x15, x7\n\t"
        "mov	x25, #-19\n\t"
        "asr	x28, x11, #63\n\t"
        /*   Mask the modulus */
        "and	x25, x28, x25\n\t"
        "and	x26, x28, #0x7fffffffffffffff\n\t"
        /*   Sub modulus (if overflow) */
        "subs	x8, x8, x25\n\t"
        "sbcs	x9, x9, x28\n\t"
        "sbcs	x10, x10, x28\n\t"
        "sbc	x11, x11, x26\n\t"
        /* Sub */
        /*   x16..x20 = difference of the two products (stored to rx) */
        "subs	x16, x12, x4\n\t"
        "sbcs	x17, x13, x5\n\t"
        "sbcs	x19, x14, x6\n\t"
        "sbcs	x20, x15, x7\n\t"
        "mov	x25, #-19\n\t"
        "csetm	x28, cc\n\t"
        /*   Mask the modulus */
        "and	x25, x28, x25\n\t"
        "and	x26, x28, #0x7fffffffffffffff\n\t"
        /*   Add modulus (if underflow) */
        "adds	x16, x16, x25\n\t"
        "adcs	x17, x17, x28\n\t"
        "adcs	x19, x19, x28\n\t"
        "adc	x20, x20, x26\n\t"
        "stp	x8, x9, [x0]\n\t"
        "stp	x10, x11, [x0, #16]\n\t"
        "stp	x16, x17, [x1]\n\t"
        "stp	x19, x20, [x1, #16]\n\t"
        /* x0 = rt, x1 = qxy2d, x3 = pt */
        "ldr	x0, [x29, #40]\n\t"
        "ldr	x1, [sp, #88]\n\t"
        "ldr	x3, [x29, #72]\n\t"
        /* Multiply */
        /*   x4..x7 = qxy2d * pt */
        "ldp	x16, x17, [x1]\n\t"
        "ldp	x19, x20, [x1, #16]\n\t"
        "ldp	x21, x22, [x3]\n\t"
        "ldp	x23, x24, [x3, #16]\n\t"
        /*  A[0] * B[0] */
        "mul	x4, x16, x21\n\t"
        "umulh	x5, x16, x21\n\t"
        /*  A[0] * B[1] */
        "mul	x25, x16, x22\n\t"
        "umulh	x6, x16, x22\n\t"
        "adds	x5, x5, x25\n\t"
        "adc	x6, x6, xzr\n\t"
        /*  A[1] * B[0] */
        "mul	x25, x17, x21\n\t"
        "umulh	x26, x17, x21\n\t"
        "adds	x5, x5, x25\n\t"
        "adcs	x6, x6, x26\n\t"
        "adc	x7, xzr, xzr\n\t"
        /*  A[0] * B[2] */
        "mul	x25, x16, x23\n\t"
        "umulh	x26, x16, x23\n\t"
        "adds	x6, x6, x25\n\t"
        "adc	x7, x7, x26\n\t"
        /*  A[1] * B[1] */
        "mul	x25, x17, x22\n\t"
        "umulh	x26, x17, x22\n\t"
        "adds	x6, x6, x25\n\t"
        "adcs	x7, x7, x26\n\t"
        "adc	x8, xzr, xzr\n\t"
        /*  A[2] * B[0] */
        "mul	x25, x19, x21\n\t"
        "umulh	x26, x19, x21\n\t"
        "adds	x6, x6, x25\n\t"
        "adcs	x7, x7, x26\n\t"
        "adc	x8, x8, xzr\n\t"
        /*  A[0] * B[3] */
        "mul	x25, x16, x24\n\t"
        "umulh	x26, x16, x24\n\t"
        "adds	x7, x7, x25\n\t"
        "adcs	x8, x8, x26\n\t"
        "adc	x9, xzr, xzr\n\t"
        /*  A[1] * B[2] */
        "mul	x25, x17, x23\n\t"
        "umulh	x26, x17, x23\n\t"
        "adds	x7, x7, x25\n\t"
        "adcs	x8, x8, x26\n\t"
        "adc	x9, x9, xzr\n\t"
        /*  A[2] * B[1] */
        "mul	x25, x19, x22\n\t"
        "umulh	x26, x19, x22\n\t"
        "adds	x7, x7, x25\n\t"
        "adcs	x8, x8, x26\n\t"
        "adc	x9, x9, xzr\n\t"
        /*  A[3] * B[0] */
        "mul	x25, x20, x21\n\t"
        "umulh	x26, x20, x21\n\t"
        "adds	x7, x7, x25\n\t"
        "adcs	x8, x8, x26\n\t"
        "adc	x9, x9, xzr\n\t"
        /*  A[1] * B[3] */
        "mul	x25, x17, x24\n\t"
        "umulh	x26, x17, x24\n\t"
        "adds	x8, x8, x25\n\t"
        "adcs	x9, x9, x26\n\t"
        "adc	x10, xzr, xzr\n\t"
        /*  A[2] * B[2] */
        "mul	x25, x19, x23\n\t"
        "umulh	x26, x19, x23\n\t"
        "adds	x8, x8, x25\n\t"
        "adcs	x9, x9, x26\n\t"
        "adc	x10, x10, xzr\n\t"
        /*  A[3] * B[1] */
        "mul	x25, x20, x22\n\t"
        "umulh	x26, x20, x22\n\t"
        "adds	x8, x8, x25\n\t"
        "adcs	x9, x9, x26\n\t"
        "adc	x10, x10, xzr\n\t"
        /*  A[2] * B[3] */
        "mul	x25, x19, x24\n\t"
        "umulh	x26, x19, x24\n\t"
        "adds	x9, x9, x25\n\t"
        "adcs	x10, x10, x26\n\t"
        "adc	x11, xzr, xzr\n\t"
        /*  A[3] * B[2] */
        "mul	x25, x20, x23\n\t"
        "umulh	x26, x20, x23\n\t"
        "adds	x9, x9, x25\n\t"
        "adcs	x10, x10, x26\n\t"
        "adc	x11, x11, xzr\n\t"
        /*  A[3] * B[3] */
        "mul	x25, x20, x24\n\t"
        "umulh	x26, x20, x24\n\t"
        "adds	x10, x10, x25\n\t"
        "adc	x11, x11, x26\n\t"
        /* Reduce */
        /*  Move top half into t4-t7 and remove top bit from t3 */
        "extr	x11, x11, x10, #63\n\t"
        "extr	x10, x10, x9, #63\n\t"
        "extr	x9, x9, x8, #63\n\t"
        "extr	x8, x8, x7, #63\n\t"
        "and	x7, x7, #0x7fffffffffffffff\n\t"
        /*  Multiply top half by 19 */
        "mov	x25, #19\n\t"
        "mul	x26, x25, x8\n\t"
        "umulh	x8, x25, x8\n\t"
        "adds	x4, x4, x26\n\t"
        "mul	x26, x25, x9\n\t"
        "umulh	x9, x25, x9\n\t"
        "adcs	x5, x5, x26\n\t"
        "mul	x26, x25, x10\n\t"
        "umulh	x10, x25, x10\n\t"
        "adcs	x6, x6, x26\n\t"
        "mul	x26, x25, x11\n\t"
        "umulh	x27, x25, x11\n\t"
        "adcs	x7, x7, x26\n\t"
        "adc	x27, x27, xzr\n\t"
        /*  Add remaining product results in */
        "adds	x5, x5, x8\n\t"
        "adcs	x6, x6, x9\n\t"
        "adcs	x7, x7, x10\n\t"
        "adc	x27, x27, xzr\n\t"
        /*  Overflow */
        "extr	x27, x27, x7, #63\n\t"
        "mul	x27, x27, x25\n\t"
        "and	x7, x7, #0x7fffffffffffffff\n\t"
        "adds	x4, x4, x27\n\t"
        "adcs	x5, x5, xzr\n\t"
        "adcs	x6, x6, xzr\n\t"
        "adc	x7, x7, xzr\n\t"
        /* Reduce if top bit set */
        "and	x27, x25, x7, asr 63\n\t"
        "and	x7, x7, #0x7fffffffffffffff\n\t"
        "adds	x4, x4, x27\n\t"
        "adcs	x5, x5, xzr\n\t"
        "adcs	x6, x6, xzr\n\t"
        "adc	x7, x7, xzr\n\t"
        /* Store */
        /*   Product stays in x4..x7; x0 = rz, x1 = pz */
        "ldr	x0, [x29, #32]\n\t"
        "ldr	x1, [x29, #64]\n\t"
        /* Double */
        /*   x8..x11 = 2 * pz */
        "ldp	x8, x9, [x1]\n\t"
        "ldp	x10, x11, [x1, #16]\n\t"
        "adds	x8, x8, x8\n\t"
        "adcs	x9, x9, x9\n\t"
        "adcs	x10, x10, x10\n\t"
        "adc	x11, x11, x11\n\t"
        "mov	x25, #-19\n\t"
        "asr	x28, x11, #63\n\t"
        /*   Mask the modulus */
        "and	x25, x28, x25\n\t"
        "and	x26, x28, #0x7fffffffffffffff\n\t"
        /*   Sub modulus (if overflow) */
        "subs	x8, x8, x25\n\t"
        "sbcs	x9, x9, x28\n\t"
        "sbcs	x10, x10, x28\n\t"
        "sbc	x11, x11, x26\n\t"
        /* x1 = rt */
        "ldr	x1, [x29, #40]\n\t"
        /* Add */
        /*   x12..x15 = 2 * pz + qxy2d * pt (stored to rz) */
        "adds	x12, x8, x4\n\t"
        "adcs	x13, x9, x5\n\t"
        "adcs	x14, x10, x6\n\t"
        "adc	x15, x11, x7\n\t"
        "mov	x25, #-19\n\t"
        "asr	x28, x15, #63\n\t"
        /*   Mask the modulus */
        "and	x25, x28, x25\n\t"
        "and	x26, x28, #0x7fffffffffffffff\n\t"
        /*   Sub modulus (if overflow) */
        "subs	x12, x12, x25\n\t"
        "sbcs	x13, x13, x28\n\t"
        "sbcs	x14, x14, x28\n\t"
        "sbc	x15, x15, x26\n\t"
        /* Sub */
        /*   x16..x20 = 2 * pz - qxy2d * pt (stored to rt) */
        "subs	x16, x8, x4\n\t"
        "sbcs	x17, x9, x5\n\t"
        "sbcs	x19, x10, x6\n\t"
        "sbcs	x20, x11, x7\n\t"
        "mov	x25, #-19\n\t"
        "csetm	x28, cc\n\t"
        /*   Mask the modulus */
        "and	x25, x28, x25\n\t"
        "and	x26, x28, #0x7fffffffffffffff\n\t"
        /*   Add modulus (if underflow) */
        "adds	x16, x16, x25\n\t"
        "adcs	x17, x17, x28\n\t"
        "adcs	x19, x19, x28\n\t"
        "adc	x20, x20, x26\n\t"
        "stp	x12, x13, [x0]\n\t"
        "stp	x14, x15, [x0, #16]\n\t"
        "stp	x16, x17, [x1]\n\t"
        "stp	x19, x20, [x1, #16]\n\t"
        /* Restore x29/x30 and release the 112 byte frame */
        "ldp	x29, x30, [sp], #0x70\n\t"
        : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt), [qxy2d] "+r" (qxy2d), [qyplusx] "+r" (qyplusx), [qyminusx] "+r" (qyminusx)
        :
        /* "cc" is listed because the adds/adcs/subs/sbcs instructions above
         * modify the condition flags.
         * NOTE(review): x0-x10 are written as scratch but are not listed
         * here; presumably the eleven "+r" operands are expected to occupy
         * them - confirm with the generator. */
        : "memory", "cc", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
    );
}
5241 
fe_ge_msub(fe rx,fe ry,fe rz,fe rt,const fe px,const fe py,const fe pz,const fe pt,const fe qxy2d,const fe qyplusx,const fe qyminusx)5242 void fe_ge_msub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qxy2d, const fe qyplusx, const fe qyminusx)
5243 {
5244     __asm__ __volatile__ (
5245         "stp	x29, x30, [sp, #-112]!\n\t"
5246         "add	x29, sp, #0\n\t"
5247         "str	%x[qyminusx], [sp, #104]\n\t"
5248         "str	%x[qyplusx], [sp, #96]\n\t"
5249         "str	%x[qxy2d], [sp, #88]\n\t"
5250         "str	%x[rx], [x29, #16]\n\t"
5251         "str	%x[ry], [x29, #24]\n\t"
5252         "str	%x[rz], [x29, #32]\n\t"
5253         "str	%x[rt], [x29, #40]\n\t"
5254         "str	%x[px], [x29, #48]\n\t"
5255         "str	%x[py], [x29, #56]\n\t"
5256         "str	%x[pz], [x29, #64]\n\t"
5257         "str	%x[pt], [x29, #72]\n\t"
5258         "ldr	x2, [x29, #56]\n\t"
5259         "ldr	x3, [x29, #48]\n\t"
5260         /* Add */
5261         "ldp	x12, x13, [x2]\n\t"
5262         "ldp	x14, x15, [x2, #16]\n\t"
5263         "ldp	x16, x17, [x3]\n\t"
5264         "ldp	x19, x20, [x3, #16]\n\t"
5265         "adds	x4, x12, x16\n\t"
5266         "adcs	x5, x13, x17\n\t"
5267         "adcs	x6, x14, x19\n\t"
5268         "adc	x7, x15, x20\n\t"
5269         "mov	x25, #-19\n\t"
5270         "asr	x28, x7, #63\n\t"
5271         /*   Mask the modulus */
5272         "and	x25, x28, x25\n\t"
5273         "and	x26, x28, #0x7fffffffffffffff\n\t"
5274         /*   Sub modulus (if overflow) */
5275         "subs	x4, x4, x25\n\t"
5276         "sbcs	x5, x5, x28\n\t"
5277         "sbcs	x6, x6, x28\n\t"
5278         "sbc	x7, x7, x26\n\t"
5279         /* Sub */
5280         "subs	x8, x12, x16\n\t"
5281         "sbcs	x9, x13, x17\n\t"
5282         "sbcs	x10, x14, x19\n\t"
5283         "sbcs	x11, x15, x20\n\t"
5284         "mov	x25, #-19\n\t"
5285         "csetm	x28, cc\n\t"
5286         /*   Mask the modulus */
5287         "and	x25, x28, x25\n\t"
5288         "and	x26, x28, #0x7fffffffffffffff\n\t"
5289         /*   Add modulus (if underflow) */
5290         "adds	x8, x8, x25\n\t"
5291         "adcs	x9, x9, x28\n\t"
5292         "adcs	x10, x10, x28\n\t"
5293         "adc	x11, x11, x26\n\t"
5294         "ldr	x0, [x29, #32]\n\t"
5295         "ldr	x2, [sp, #104]\n\t"
5296         /* Multiply */
5297         "ldp	x21, x22, [x2]\n\t"
5298         "ldp	x23, x24, [x2, #16]\n\t"
5299         /*  A[0] * B[0] */
5300         "mul	x12, x4, x21\n\t"
5301         "umulh	x13, x4, x21\n\t"
5302         /*  A[0] * B[1] */
5303         "mul	x25, x4, x22\n\t"
5304         "umulh	x14, x4, x22\n\t"
5305         "adds	x13, x13, x25\n\t"
5306         "adc	x14, x14, xzr\n\t"
5307         /*  A[1] * B[0] */
5308         "mul	x25, x5, x21\n\t"
5309         "umulh	x26, x5, x21\n\t"
5310         "adds	x13, x13, x25\n\t"
5311         "adcs	x14, x14, x26\n\t"
5312         "adc	x15, xzr, xzr\n\t"
5313         /*  A[0] * B[2] */
5314         "mul	x25, x4, x23\n\t"
5315         "umulh	x26, x4, x23\n\t"
5316         "adds	x14, x14, x25\n\t"
5317         "adc	x15, x15, x26\n\t"
5318         /*  A[1] * B[1] */
5319         "mul	x25, x5, x22\n\t"
5320         "umulh	x26, x5, x22\n\t"
5321         "adds	x14, x14, x25\n\t"
5322         "adcs	x15, x15, x26\n\t"
5323         "adc	x16, xzr, xzr\n\t"
5324         /*  A[2] * B[0] */
5325         "mul	x25, x6, x21\n\t"
5326         "umulh	x26, x6, x21\n\t"
5327         "adds	x14, x14, x25\n\t"
5328         "adcs	x15, x15, x26\n\t"
5329         "adc	x16, x16, xzr\n\t"
5330         /*  A[0] * B[3] */
5331         "mul	x25, x4, x24\n\t"
5332         "umulh	x26, x4, x24\n\t"
5333         "adds	x15, x15, x25\n\t"
5334         "adcs	x16, x16, x26\n\t"
5335         "adc	x17, xzr, xzr\n\t"
5336         /*  A[1] * B[2] */
5337         "mul	x25, x5, x23\n\t"
5338         "umulh	x26, x5, x23\n\t"
5339         "adds	x15, x15, x25\n\t"
5340         "adcs	x16, x16, x26\n\t"
5341         "adc	x17, x17, xzr\n\t"
5342         /*  A[2] * B[1] */
5343         "mul	x25, x6, x22\n\t"
5344         "umulh	x26, x6, x22\n\t"
5345         "adds	x15, x15, x25\n\t"
5346         "adcs	x16, x16, x26\n\t"
5347         "adc	x17, x17, xzr\n\t"
5348         /*  A[3] * B[0] */
5349         "mul	x25, x7, x21\n\t"
5350         "umulh	x26, x7, x21\n\t"
5351         "adds	x15, x15, x25\n\t"
5352         "adcs	x16, x16, x26\n\t"
5353         "adc	x17, x17, xzr\n\t"
5354         /*  A[1] * B[3] */
5355         "mul	x25, x5, x24\n\t"
5356         "umulh	x26, x5, x24\n\t"
5357         "adds	x16, x16, x25\n\t"
5358         "adcs	x17, x17, x26\n\t"
5359         "adc	x19, xzr, xzr\n\t"
5360         /*  A[2] * B[2] */
5361         "mul	x25, x6, x23\n\t"
5362         "umulh	x26, x6, x23\n\t"
5363         "adds	x16, x16, x25\n\t"
5364         "adcs	x17, x17, x26\n\t"
5365         "adc	x19, x19, xzr\n\t"
5366         /*  A[3] * B[1] */
5367         "mul	x25, x7, x22\n\t"
5368         "umulh	x26, x7, x22\n\t"
5369         "adds	x16, x16, x25\n\t"
5370         "adcs	x17, x17, x26\n\t"
5371         "adc	x19, x19, xzr\n\t"
5372         /*  A[2] * B[3] */
5373         "mul	x25, x6, x24\n\t"
5374         "umulh	x26, x6, x24\n\t"
5375         "adds	x17, x17, x25\n\t"
5376         "adcs	x19, x19, x26\n\t"
5377         "adc	x20, xzr, xzr\n\t"
5378         /*  A[3] * B[2] */
5379         "mul	x25, x7, x23\n\t"
5380         "umulh	x26, x7, x23\n\t"
5381         "adds	x17, x17, x25\n\t"
5382         "adcs	x19, x19, x26\n\t"
5383         "adc	x20, x20, xzr\n\t"
5384         /*  A[3] * B[3] */
5385         "mul	x25, x7, x24\n\t"
5386         "umulh	x26, x7, x24\n\t"
5387         "adds	x19, x19, x25\n\t"
5388         "adc	x20, x20, x26\n\t"
5389         /* Reduce */
5390         /*  Move top half into t4-t7 and remove top bit from t3 */
5391         "extr	x20, x20, x19, #63\n\t"
5392         "extr	x19, x19, x17, #63\n\t"
5393         "extr	x17, x17, x16, #63\n\t"
5394         "extr	x16, x16, x15, #63\n\t"
5395         "and	x15, x15, #0x7fffffffffffffff\n\t"
5396         /*  Multiply top half by 19 */
5397         "mov	x25, #19\n\t"
5398         "mul	x26, x25, x16\n\t"
5399         "umulh	x16, x25, x16\n\t"
5400         "adds	x12, x12, x26\n\t"
5401         "mul	x26, x25, x17\n\t"
5402         "umulh	x17, x25, x17\n\t"
5403         "adcs	x13, x13, x26\n\t"
5404         "mul	x26, x25, x19\n\t"
5405         "umulh	x19, x25, x19\n\t"
5406         "adcs	x14, x14, x26\n\t"
5407         "mul	x26, x25, x20\n\t"
5408         "umulh	x27, x25, x20\n\t"
5409         "adcs	x15, x15, x26\n\t"
5410         "adc	x27, x27, xzr\n\t"
5411         /*  Add remaining product results in */
5412         "adds	x13, x13, x16\n\t"
5413         "adcs	x14, x14, x17\n\t"
5414         "adcs	x15, x15, x19\n\t"
5415         "adc	x27, x27, xzr\n\t"
5416         /*  Overflow */
5417         "extr	x27, x27, x15, #63\n\t"
5418         "mul	x27, x27, x25\n\t"
5419         "and	x15, x15, #0x7fffffffffffffff\n\t"
5420         "adds	x12, x12, x27\n\t"
5421         "adcs	x13, x13, xzr\n\t"
5422         "adcs	x14, x14, xzr\n\t"
5423         "adc	x15, x15, xzr\n\t"
5424         /* Reduce if top bit set */
5425         "and	x27, x25, x15, asr 63\n\t"
5426         "and	x15, x15, #0x7fffffffffffffff\n\t"
5427         "adds	x12, x12, x27\n\t"
5428         "adcs	x13, x13, xzr\n\t"
5429         "adcs	x14, x14, xzr\n\t"
5430         "adc	x15, x15, xzr\n\t"
5431         /* Store */
5432         "ldr	x0, [x29, #24]\n\t"
5433         "ldr	x1, [sp, #96]\n\t"
5434         /* Multiply */
5435         "ldp	x21, x22, [x1]\n\t"
5436         "ldp	x23, x24, [x1, #16]\n\t"
5437         /*  A[0] * B[0] */
5438         "mul	x4, x8, x21\n\t"
5439         "umulh	x5, x8, x21\n\t"
5440         /*  A[0] * B[1] */
5441         "mul	x25, x8, x22\n\t"
5442         "umulh	x6, x8, x22\n\t"
5443         "adds	x5, x5, x25\n\t"
5444         "adc	x6, x6, xzr\n\t"
5445         /*  A[1] * B[0] */
5446         "mul	x25, x9, x21\n\t"
5447         "umulh	x26, x9, x21\n\t"
5448         "adds	x5, x5, x25\n\t"
5449         "adcs	x6, x6, x26\n\t"
5450         "adc	x7, xzr, xzr\n\t"
5451         /*  A[0] * B[2] */
5452         "mul	x25, x8, x23\n\t"
5453         "umulh	x26, x8, x23\n\t"
5454         "adds	x6, x6, x25\n\t"
5455         "adc	x7, x7, x26\n\t"
5456         /*  A[1] * B[1] */
5457         "mul	x25, x9, x22\n\t"
5458         "umulh	x26, x9, x22\n\t"
5459         "adds	x6, x6, x25\n\t"
5460         "adcs	x7, x7, x26\n\t"
5461         "adc	x16, xzr, xzr\n\t"
5462         /*  A[2] * B[0] */
5463         "mul	x25, x10, x21\n\t"
5464         "umulh	x26, x10, x21\n\t"
5465         "adds	x6, x6, x25\n\t"
5466         "adcs	x7, x7, x26\n\t"
5467         "adc	x16, x16, xzr\n\t"
5468         /*  A[0] * B[3] */
5469         "mul	x25, x8, x24\n\t"
5470         "umulh	x26, x8, x24\n\t"
5471         "adds	x7, x7, x25\n\t"
5472         "adcs	x16, x16, x26\n\t"
5473         "adc	x17, xzr, xzr\n\t"
5474         /*  A[1] * B[2] */
5475         "mul	x25, x9, x23\n\t"
5476         "umulh	x26, x9, x23\n\t"
5477         "adds	x7, x7, x25\n\t"
5478         "adcs	x16, x16, x26\n\t"
5479         "adc	x17, x17, xzr\n\t"
5480         /*  A[2] * B[1] */
5481         "mul	x25, x10, x22\n\t"
5482         "umulh	x26, x10, x22\n\t"
5483         "adds	x7, x7, x25\n\t"
5484         "adcs	x16, x16, x26\n\t"
5485         "adc	x17, x17, xzr\n\t"
5486         /*  A[3] * B[0] */
5487         "mul	x25, x11, x21\n\t"
5488         "umulh	x26, x11, x21\n\t"
5489         "adds	x7, x7, x25\n\t"
5490         "adcs	x16, x16, x26\n\t"
5491         "adc	x17, x17, xzr\n\t"
5492         /*  A[1] * B[3] */
5493         "mul	x25, x9, x24\n\t"
5494         "umulh	x26, x9, x24\n\t"
5495         "adds	x16, x16, x25\n\t"
5496         "adcs	x17, x17, x26\n\t"
5497         "adc	x19, xzr, xzr\n\t"
5498         /*  A[2] * B[2] */
5499         "mul	x25, x10, x23\n\t"
5500         "umulh	x26, x10, x23\n\t"
5501         "adds	x16, x16, x25\n\t"
5502         "adcs	x17, x17, x26\n\t"
5503         "adc	x19, x19, xzr\n\t"
5504         /*  A[3] * B[1] */
5505         "mul	x25, x11, x22\n\t"
5506         "umulh	x26, x11, x22\n\t"
5507         "adds	x16, x16, x25\n\t"
5508         "adcs	x17, x17, x26\n\t"
5509         "adc	x19, x19, xzr\n\t"
5510         /*  A[2] * B[3] */
5511         "mul	x25, x10, x24\n\t"
5512         "umulh	x26, x10, x24\n\t"
5513         "adds	x17, x17, x25\n\t"
5514         "adcs	x19, x19, x26\n\t"
5515         "adc	x20, xzr, xzr\n\t"
5516         /*  A[3] * B[2] */
5517         "mul	x25, x11, x23\n\t"
5518         "umulh	x26, x11, x23\n\t"
5519         "adds	x17, x17, x25\n\t"
5520         "adcs	x19, x19, x26\n\t"
5521         "adc	x20, x20, xzr\n\t"
5522         /*  A[3] * B[3] */
5523         "mul	x25, x11, x24\n\t"
5524         "umulh	x26, x11, x24\n\t"
5525         "adds	x19, x19, x25\n\t"
5526         "adc	x20, x20, x26\n\t"
5527         /* Reduce */
5528         /*  Move top half into t4-t7 and remove top bit from t3 */
5529         "extr	x20, x20, x19, #63\n\t"
5530         "extr	x19, x19, x17, #63\n\t"
5531         "extr	x17, x17, x16, #63\n\t"
5532         "extr	x16, x16, x7, #63\n\t"
5533         "and	x7, x7, #0x7fffffffffffffff\n\t"
5534         /*  Multiply top half by 19 */
5535         "mov	x25, #19\n\t"
5536         "mul	x26, x25, x16\n\t"
5537         "umulh	x16, x25, x16\n\t"
5538         "adds	x4, x4, x26\n\t"
5539         "mul	x26, x25, x17\n\t"
5540         "umulh	x17, x25, x17\n\t"
5541         "adcs	x5, x5, x26\n\t"
5542         "mul	x26, x25, x19\n\t"
5543         "umulh	x19, x25, x19\n\t"
5544         "adcs	x6, x6, x26\n\t"
5545         "mul	x26, x25, x20\n\t"
5546         "umulh	x27, x25, x20\n\t"
5547         "adcs	x7, x7, x26\n\t"
5548         "adc	x27, x27, xzr\n\t"
5549         /*  Add remaining product results in */
5550         "adds	x5, x5, x16\n\t"
5551         "adcs	x6, x6, x17\n\t"
5552         "adcs	x7, x7, x19\n\t"
5553         "adc	x27, x27, xzr\n\t"
5554         /*  Overflow */
5555         "extr	x27, x27, x7, #63\n\t"
5556         "mul	x27, x27, x25\n\t"
5557         "and	x7, x7, #0x7fffffffffffffff\n\t"
5558         "adds	x4, x4, x27\n\t"
5559         "adcs	x5, x5, xzr\n\t"
5560         "adcs	x6, x6, xzr\n\t"
5561         "adc	x7, x7, xzr\n\t"
5562         /* Reduce if top bit set */
5563         "and	x27, x25, x7, asr 63\n\t"
5564         "and	x7, x7, #0x7fffffffffffffff\n\t"
5565         "adds	x4, x4, x27\n\t"
5566         "adcs	x5, x5, xzr\n\t"
5567         "adcs	x6, x6, xzr\n\t"
5568         "adc	x7, x7, xzr\n\t"
5569         /* Store */
5570         "ldr	x0, [x29, #24]\n\t"
5571         "ldr	x1, [x29, #16]\n\t"
5572         /* Add */
5573         "adds	x8, x12, x4\n\t"
5574         "adcs	x9, x13, x5\n\t"
5575         "adcs	x10, x14, x6\n\t"
5576         "adc	x11, x15, x7\n\t"
5577         "mov	x25, #-19\n\t"
5578         "asr	x28, x11, #63\n\t"
5579         /*   Mask the modulus */
5580         "and	x25, x28, x25\n\t"
5581         "and	x26, x28, #0x7fffffffffffffff\n\t"
5582         /*   Sub modulus (if overflow) */
5583         "subs	x8, x8, x25\n\t"
5584         "sbcs	x9, x9, x28\n\t"
5585         "sbcs	x10, x10, x28\n\t"
5586         "sbc	x11, x11, x26\n\t"
5587         /* Sub */
5588         "subs	x16, x12, x4\n\t"
5589         "sbcs	x17, x13, x5\n\t"
5590         "sbcs	x19, x14, x6\n\t"
5591         "sbcs	x20, x15, x7\n\t"
5592         "mov	x25, #-19\n\t"
5593         "csetm	x28, cc\n\t"
5594         /*   Mask the modulus */
5595         "and	x25, x28, x25\n\t"
5596         "and	x26, x28, #0x7fffffffffffffff\n\t"
5597         /*   Add modulus (if underflow) */
5598         "adds	x16, x16, x25\n\t"
5599         "adcs	x17, x17, x28\n\t"
5600         "adcs	x19, x19, x28\n\t"
5601         "adc	x20, x20, x26\n\t"
5602         "stp	x8, x9, [x0]\n\t"
5603         "stp	x10, x11, [x0, #16]\n\t"
5604         "stp	x16, x17, [x1]\n\t"
5605         "stp	x19, x20, [x1, #16]\n\t"
5606         "ldr	x0, [x29, #40]\n\t"
5607         "ldr	x1, [sp, #88]\n\t"
5608         "ldr	x3, [x29, #72]\n\t"
5609         /* Multiply */
5610         "ldp	x16, x17, [x1]\n\t"
5611         "ldp	x19, x20, [x1, #16]\n\t"
5612         "ldp	x21, x22, [x3]\n\t"
5613         "ldp	x23, x24, [x3, #16]\n\t"
5614         /*  A[0] * B[0] */
5615         "mul	x4, x16, x21\n\t"
5616         "umulh	x5, x16, x21\n\t"
5617         /*  A[0] * B[1] */
5618         "mul	x25, x16, x22\n\t"
5619         "umulh	x6, x16, x22\n\t"
5620         "adds	x5, x5, x25\n\t"
5621         "adc	x6, x6, xzr\n\t"
5622         /*  A[1] * B[0] */
5623         "mul	x25, x17, x21\n\t"
5624         "umulh	x26, x17, x21\n\t"
5625         "adds	x5, x5, x25\n\t"
5626         "adcs	x6, x6, x26\n\t"
5627         "adc	x7, xzr, xzr\n\t"
5628         /*  A[0] * B[2] */
5629         "mul	x25, x16, x23\n\t"
5630         "umulh	x26, x16, x23\n\t"
5631         "adds	x6, x6, x25\n\t"
5632         "adc	x7, x7, x26\n\t"
5633         /*  A[1] * B[1] */
5634         "mul	x25, x17, x22\n\t"
5635         "umulh	x26, x17, x22\n\t"
5636         "adds	x6, x6, x25\n\t"
5637         "adcs	x7, x7, x26\n\t"
5638         "adc	x8, xzr, xzr\n\t"
5639         /*  A[2] * B[0] */
5640         "mul	x25, x19, x21\n\t"
5641         "umulh	x26, x19, x21\n\t"
5642         "adds	x6, x6, x25\n\t"
5643         "adcs	x7, x7, x26\n\t"
5644         "adc	x8, x8, xzr\n\t"
5645         /*  A[0] * B[3] */
5646         "mul	x25, x16, x24\n\t"
5647         "umulh	x26, x16, x24\n\t"
5648         "adds	x7, x7, x25\n\t"
5649         "adcs	x8, x8, x26\n\t"
5650         "adc	x9, xzr, xzr\n\t"
5651         /*  A[1] * B[2] */
5652         "mul	x25, x17, x23\n\t"
5653         "umulh	x26, x17, x23\n\t"
5654         "adds	x7, x7, x25\n\t"
5655         "adcs	x8, x8, x26\n\t"
5656         "adc	x9, x9, xzr\n\t"
5657         /*  A[2] * B[1] */
5658         "mul	x25, x19, x22\n\t"
5659         "umulh	x26, x19, x22\n\t"
5660         "adds	x7, x7, x25\n\t"
5661         "adcs	x8, x8, x26\n\t"
5662         "adc	x9, x9, xzr\n\t"
5663         /*  A[3] * B[0] */
5664         "mul	x25, x20, x21\n\t"
5665         "umulh	x26, x20, x21\n\t"
5666         "adds	x7, x7, x25\n\t"
5667         "adcs	x8, x8, x26\n\t"
5668         "adc	x9, x9, xzr\n\t"
5669         /*  A[1] * B[3] */
5670         "mul	x25, x17, x24\n\t"
5671         "umulh	x26, x17, x24\n\t"
5672         "adds	x8, x8, x25\n\t"
5673         "adcs	x9, x9, x26\n\t"
5674         "adc	x10, xzr, xzr\n\t"
5675         /*  A[2] * B[2] */
5676         "mul	x25, x19, x23\n\t"
5677         "umulh	x26, x19, x23\n\t"
5678         "adds	x8, x8, x25\n\t"
5679         "adcs	x9, x9, x26\n\t"
5680         "adc	x10, x10, xzr\n\t"
5681         /*  A[3] * B[1] */
5682         "mul	x25, x20, x22\n\t"
5683         "umulh	x26, x20, x22\n\t"
5684         "adds	x8, x8, x25\n\t"
5685         "adcs	x9, x9, x26\n\t"
5686         "adc	x10, x10, xzr\n\t"
5687         /*  A[2] * B[3] */
5688         "mul	x25, x19, x24\n\t"
5689         "umulh	x26, x19, x24\n\t"
5690         "adds	x9, x9, x25\n\t"
5691         "adcs	x10, x10, x26\n\t"
5692         "adc	x11, xzr, xzr\n\t"
5693         /*  A[3] * B[2] */
5694         "mul	x25, x20, x23\n\t"
5695         "umulh	x26, x20, x23\n\t"
5696         "adds	x9, x9, x25\n\t"
5697         "adcs	x10, x10, x26\n\t"
5698         "adc	x11, x11, xzr\n\t"
5699         /*  A[3] * B[3] */
5700         "mul	x25, x20, x24\n\t"
5701         "umulh	x26, x20, x24\n\t"
5702         "adds	x10, x10, x25\n\t"
5703         "adc	x11, x11, x26\n\t"
5704         /* Reduce */
5705         /*  Move top half into t4-t7 and remove top bit from t3 */
5706         "extr	x11, x11, x10, #63\n\t"
5707         "extr	x10, x10, x9, #63\n\t"
5708         "extr	x9, x9, x8, #63\n\t"
5709         "extr	x8, x8, x7, #63\n\t"
5710         "and	x7, x7, #0x7fffffffffffffff\n\t"
5711         /*  Multiply top half by 19 */
5712         "mov	x25, #19\n\t"
5713         "mul	x26, x25, x8\n\t"
5714         "umulh	x8, x25, x8\n\t"
5715         "adds	x4, x4, x26\n\t"
5716         "mul	x26, x25, x9\n\t"
5717         "umulh	x9, x25, x9\n\t"
5718         "adcs	x5, x5, x26\n\t"
5719         "mul	x26, x25, x10\n\t"
5720         "umulh	x10, x25, x10\n\t"
5721         "adcs	x6, x6, x26\n\t"
5722         "mul	x26, x25, x11\n\t"
5723         "umulh	x27, x25, x11\n\t"
5724         "adcs	x7, x7, x26\n\t"
5725         "adc	x27, x27, xzr\n\t"
5726         /*  Add remaining product results in */
5727         "adds	x5, x5, x8\n\t"
5728         "adcs	x6, x6, x9\n\t"
5729         "adcs	x7, x7, x10\n\t"
5730         "adc	x27, x27, xzr\n\t"
5731         /*  Overflow */
5732         "extr	x27, x27, x7, #63\n\t"
5733         "mul	x27, x27, x25\n\t"
5734         "and	x7, x7, #0x7fffffffffffffff\n\t"
5735         "adds	x4, x4, x27\n\t"
5736         "adcs	x5, x5, xzr\n\t"
5737         "adcs	x6, x6, xzr\n\t"
5738         "adc	x7, x7, xzr\n\t"
5739         /* Reduce if top bit set */
5740         "and	x27, x25, x7, asr 63\n\t"
5741         "and	x7, x7, #0x7fffffffffffffff\n\t"
5742         "adds	x4, x4, x27\n\t"
5743         "adcs	x5, x5, xzr\n\t"
5744         "adcs	x6, x6, xzr\n\t"
5745         "adc	x7, x7, xzr\n\t"
5746         /* Store */
5747         "ldr	x0, [x29, #32]\n\t"
5748         "ldr	x1, [x29, #64]\n\t"
5749         /* Double */
5750         "ldp	x8, x9, [x1]\n\t"
5751         "ldp	x10, x11, [x1, #16]\n\t"
5752         "adds	x8, x8, x8\n\t"
5753         "adcs	x9, x9, x9\n\t"
5754         "adcs	x10, x10, x10\n\t"
5755         "adc	x11, x11, x11\n\t"
5756         "mov	x25, #-19\n\t"
5757         "asr	x28, x11, #63\n\t"
5758         /*   Mask the modulus */
5759         "and	x25, x28, x25\n\t"
5760         "and	x26, x28, #0x7fffffffffffffff\n\t"
5761         /*   Sub modulus (if overflow) */
5762         "subs	x8, x8, x25\n\t"
5763         "sbcs	x9, x9, x28\n\t"
5764         "sbcs	x10, x10, x28\n\t"
5765         "sbc	x11, x11, x26\n\t"
5766         "ldr	x1, [x29, #40]\n\t"
5767         /* Add */
5768         "adds	x12, x8, x4\n\t"
5769         "adcs	x13, x9, x5\n\t"
5770         "adcs	x14, x10, x6\n\t"
5771         "adc	x15, x11, x7\n\t"
5772         "mov	x25, #-19\n\t"
5773         "asr	x28, x15, #63\n\t"
5774         /*   Mask the modulus */
5775         "and	x25, x28, x25\n\t"
5776         "and	x26, x28, #0x7fffffffffffffff\n\t"
5777         /*   Sub modulus (if overflow) */
5778         "subs	x12, x12, x25\n\t"
5779         "sbcs	x13, x13, x28\n\t"
5780         "sbcs	x14, x14, x28\n\t"
5781         "sbc	x15, x15, x26\n\t"
5782         /* Sub */
5783         "subs	x16, x8, x4\n\t"
5784         "sbcs	x17, x9, x5\n\t"
5785         "sbcs	x19, x10, x6\n\t"
5786         "sbcs	x20, x11, x7\n\t"
5787         "mov	x25, #-19\n\t"
5788         "csetm	x28, cc\n\t"
5789         /*   Mask the modulus */
5790         "and	x25, x28, x25\n\t"
5791         "and	x26, x28, #0x7fffffffffffffff\n\t"
5792         /*   Add modulus (if underflow) */
5793         "adds	x16, x16, x25\n\t"
5794         "adcs	x17, x17, x28\n\t"
5795         "adcs	x19, x19, x28\n\t"
5796         "adc	x20, x20, x26\n\t"
5797         "stp	x12, x13, [x1]\n\t"
5798         "stp	x14, x15, [x1, #16]\n\t"
5799         "stp	x16, x17, [x0]\n\t"
5800         "stp	x19, x20, [x0, #16]\n\t"
5801         "ldp	x29, x30, [sp], #0x70\n\t"
5802         : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt), [qxy2d] "+r" (qxy2d), [qyplusx] "+r" (qyplusx), [qyminusx] "+r" (qyminusx)
5803         :
5804         : "memory", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
5805     );
5806 }
5807 
fe_ge_add(fe rx,fe ry,fe rz,fe rt,const fe px,const fe py,const fe pz,const fe pt,const fe qz,const fe qt2d,const fe qyplusx,const fe qyminusx)5808 void fe_ge_add(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qz, const fe qt2d, const fe qyplusx, const fe qyminusx)
5809 {
5810     __asm__ __volatile__ (
5811         "stp	x29, x30, [sp, #-128]!\n\t"
5812         "add	x29, sp, #0\n\t"
5813         "str	%x[qyminusx], [sp, #120]\n\t"
5814         "str	%x[qyplusx], [sp, #112]\n\t"
5815         "str	%x[qt2d], [sp, #104]\n\t"
5816         "str	%x[qz], [sp, #96]\n\t"
5817         "str	%x[rx], [x29, #16]\n\t"
5818         "str	%x[ry], [x29, #24]\n\t"
5819         "str	%x[rz], [x29, #32]\n\t"
5820         "str	%x[rt], [x29, #40]\n\t"
5821         "str	%x[px], [x29, #48]\n\t"
5822         "str	%x[py], [x29, #56]\n\t"
5823         "str	%x[pz], [x29, #64]\n\t"
5824         "str	%x[pt], [x29, #72]\n\t"
5825         "ldr	x2, [x29, #56]\n\t"
5826         "ldr	x3, [x29, #48]\n\t"
5827         /* Add */
5828         "ldp	x12, x13, [x2]\n\t"
5829         "ldp	x14, x15, [x2, #16]\n\t"
5830         "ldp	x16, x17, [x3]\n\t"
5831         "ldp	x19, x20, [x3, #16]\n\t"
5832         "adds	x4, x12, x16\n\t"
5833         "adcs	x5, x13, x17\n\t"
5834         "adcs	x6, x14, x19\n\t"
5835         "adc	x7, x15, x20\n\t"
5836         "mov	x25, #-19\n\t"
5837         "asr	x28, x7, #63\n\t"
5838         /*   Mask the modulus */
5839         "and	x25, x28, x25\n\t"
5840         "and	x26, x28, #0x7fffffffffffffff\n\t"
5841         /*   Sub modulus (if overflow) */
5842         "subs	x4, x4, x25\n\t"
5843         "sbcs	x5, x5, x28\n\t"
5844         "sbcs	x6, x6, x28\n\t"
5845         "sbc	x7, x7, x26\n\t"
5846         /* Sub */
5847         "subs	x8, x12, x16\n\t"
5848         "sbcs	x9, x13, x17\n\t"
5849         "sbcs	x10, x14, x19\n\t"
5850         "sbcs	x11, x15, x20\n\t"
5851         "mov	x25, #-19\n\t"
5852         "csetm	x28, cc\n\t"
5853         /*   Mask the modulus */
5854         "and	x25, x28, x25\n\t"
5855         "and	x26, x28, #0x7fffffffffffffff\n\t"
5856         /*   Add modulus (if underflow) */
5857         "adds	x8, x8, x25\n\t"
5858         "adcs	x9, x9, x28\n\t"
5859         "adcs	x10, x10, x28\n\t"
5860         "adc	x11, x11, x26\n\t"
5861         "ldr	x0, [x29, #32]\n\t"
5862         "ldr	x2, [sp, #112]\n\t"
5863         /* Multiply */
5864         "ldp	x21, x22, [x2]\n\t"
5865         "ldp	x23, x24, [x2, #16]\n\t"
5866         /*  A[0] * B[0] */
5867         "mul	x12, x4, x21\n\t"
5868         "umulh	x13, x4, x21\n\t"
5869         /*  A[0] * B[1] */
5870         "mul	x25, x4, x22\n\t"
5871         "umulh	x14, x4, x22\n\t"
5872         "adds	x13, x13, x25\n\t"
5873         "adc	x14, x14, xzr\n\t"
5874         /*  A[1] * B[0] */
5875         "mul	x25, x5, x21\n\t"
5876         "umulh	x26, x5, x21\n\t"
5877         "adds	x13, x13, x25\n\t"
5878         "adcs	x14, x14, x26\n\t"
5879         "adc	x15, xzr, xzr\n\t"
5880         /*  A[0] * B[2] */
5881         "mul	x25, x4, x23\n\t"
5882         "umulh	x26, x4, x23\n\t"
5883         "adds	x14, x14, x25\n\t"
5884         "adc	x15, x15, x26\n\t"
5885         /*  A[1] * B[1] */
5886         "mul	x25, x5, x22\n\t"
5887         "umulh	x26, x5, x22\n\t"
5888         "adds	x14, x14, x25\n\t"
5889         "adcs	x15, x15, x26\n\t"
5890         "adc	x16, xzr, xzr\n\t"
5891         /*  A[2] * B[0] */
5892         "mul	x25, x6, x21\n\t"
5893         "umulh	x26, x6, x21\n\t"
5894         "adds	x14, x14, x25\n\t"
5895         "adcs	x15, x15, x26\n\t"
5896         "adc	x16, x16, xzr\n\t"
5897         /*  A[0] * B[3] */
5898         "mul	x25, x4, x24\n\t"
5899         "umulh	x26, x4, x24\n\t"
5900         "adds	x15, x15, x25\n\t"
5901         "adcs	x16, x16, x26\n\t"
5902         "adc	x17, xzr, xzr\n\t"
5903         /*  A[1] * B[2] */
5904         "mul	x25, x5, x23\n\t"
5905         "umulh	x26, x5, x23\n\t"
5906         "adds	x15, x15, x25\n\t"
5907         "adcs	x16, x16, x26\n\t"
5908         "adc	x17, x17, xzr\n\t"
5909         /*  A[2] * B[1] */
5910         "mul	x25, x6, x22\n\t"
5911         "umulh	x26, x6, x22\n\t"
5912         "adds	x15, x15, x25\n\t"
5913         "adcs	x16, x16, x26\n\t"
5914         "adc	x17, x17, xzr\n\t"
5915         /*  A[3] * B[0] */
5916         "mul	x25, x7, x21\n\t"
5917         "umulh	x26, x7, x21\n\t"
5918         "adds	x15, x15, x25\n\t"
5919         "adcs	x16, x16, x26\n\t"
5920         "adc	x17, x17, xzr\n\t"
5921         /*  A[1] * B[3] */
5922         "mul	x25, x5, x24\n\t"
5923         "umulh	x26, x5, x24\n\t"
5924         "adds	x16, x16, x25\n\t"
5925         "adcs	x17, x17, x26\n\t"
5926         "adc	x19, xzr, xzr\n\t"
5927         /*  A[2] * B[2] */
5928         "mul	x25, x6, x23\n\t"
5929         "umulh	x26, x6, x23\n\t"
5930         "adds	x16, x16, x25\n\t"
5931         "adcs	x17, x17, x26\n\t"
5932         "adc	x19, x19, xzr\n\t"
5933         /*  A[3] * B[1] */
5934         "mul	x25, x7, x22\n\t"
5935         "umulh	x26, x7, x22\n\t"
5936         "adds	x16, x16, x25\n\t"
5937         "adcs	x17, x17, x26\n\t"
5938         "adc	x19, x19, xzr\n\t"
5939         /*  A[2] * B[3] */
5940         "mul	x25, x6, x24\n\t"
5941         "umulh	x26, x6, x24\n\t"
5942         "adds	x17, x17, x25\n\t"
5943         "adcs	x19, x19, x26\n\t"
5944         "adc	x20, xzr, xzr\n\t"
5945         /*  A[3] * B[2] */
5946         "mul	x25, x7, x23\n\t"
5947         "umulh	x26, x7, x23\n\t"
5948         "adds	x17, x17, x25\n\t"
5949         "adcs	x19, x19, x26\n\t"
5950         "adc	x20, x20, xzr\n\t"
5951         /*  A[3] * B[3] */
5952         "mul	x25, x7, x24\n\t"
5953         "umulh	x26, x7, x24\n\t"
5954         "adds	x19, x19, x25\n\t"
5955         "adc	x20, x20, x26\n\t"
5956         /* Reduce */
5957         /*  Move top half into t4-t7 and remove top bit from t3 */
5958         "extr	x20, x20, x19, #63\n\t"
5959         "extr	x19, x19, x17, #63\n\t"
5960         "extr	x17, x17, x16, #63\n\t"
5961         "extr	x16, x16, x15, #63\n\t"
5962         "and	x15, x15, #0x7fffffffffffffff\n\t"
5963         /*  Multiply top half by 19 */
5964         "mov	x25, #19\n\t"
5965         "mul	x26, x25, x16\n\t"
5966         "umulh	x16, x25, x16\n\t"
5967         "adds	x12, x12, x26\n\t"
5968         "mul	x26, x25, x17\n\t"
5969         "umulh	x17, x25, x17\n\t"
5970         "adcs	x13, x13, x26\n\t"
5971         "mul	x26, x25, x19\n\t"
5972         "umulh	x19, x25, x19\n\t"
5973         "adcs	x14, x14, x26\n\t"
5974         "mul	x26, x25, x20\n\t"
5975         "umulh	x27, x25, x20\n\t"
5976         "adcs	x15, x15, x26\n\t"
5977         "adc	x27, x27, xzr\n\t"
5978         /*  Add remaining product results in */
5979         "adds	x13, x13, x16\n\t"
5980         "adcs	x14, x14, x17\n\t"
5981         "adcs	x15, x15, x19\n\t"
5982         "adc	x27, x27, xzr\n\t"
5983         /*  Overflow */
5984         "extr	x27, x27, x15, #63\n\t"
5985         "mul	x27, x27, x25\n\t"
5986         "and	x15, x15, #0x7fffffffffffffff\n\t"
5987         "adds	x12, x12, x27\n\t"
5988         "adcs	x13, x13, xzr\n\t"
5989         "adcs	x14, x14, xzr\n\t"
5990         "adc	x15, x15, xzr\n\t"
5991         /* Reduce if top bit set */
5992         "and	x27, x25, x15, asr 63\n\t"
5993         "and	x15, x15, #0x7fffffffffffffff\n\t"
5994         "adds	x12, x12, x27\n\t"
5995         "adcs	x13, x13, xzr\n\t"
5996         "adcs	x14, x14, xzr\n\t"
5997         "adc	x15, x15, xzr\n\t"
5998         /* Store */
5999         "ldr	x0, [x29, #24]\n\t"
6000         "ldr	x1, [sp, #120]\n\t"
6001         /* Multiply */
6002         "ldp	x21, x22, [x1]\n\t"
6003         "ldp	x23, x24, [x1, #16]\n\t"
6004         /*  A[0] * B[0] */
6005         "mul	x4, x8, x21\n\t"
6006         "umulh	x5, x8, x21\n\t"
6007         /*  A[0] * B[1] */
6008         "mul	x25, x8, x22\n\t"
6009         "umulh	x6, x8, x22\n\t"
6010         "adds	x5, x5, x25\n\t"
6011         "adc	x6, x6, xzr\n\t"
6012         /*  A[1] * B[0] */
6013         "mul	x25, x9, x21\n\t"
6014         "umulh	x26, x9, x21\n\t"
6015         "adds	x5, x5, x25\n\t"
6016         "adcs	x6, x6, x26\n\t"
6017         "adc	x7, xzr, xzr\n\t"
6018         /*  A[0] * B[2] */
6019         "mul	x25, x8, x23\n\t"
6020         "umulh	x26, x8, x23\n\t"
6021         "adds	x6, x6, x25\n\t"
6022         "adc	x7, x7, x26\n\t"
6023         /*  A[1] * B[1] */
6024         "mul	x25, x9, x22\n\t"
6025         "umulh	x26, x9, x22\n\t"
6026         "adds	x6, x6, x25\n\t"
6027         "adcs	x7, x7, x26\n\t"
6028         "adc	x16, xzr, xzr\n\t"
6029         /*  A[2] * B[0] */
6030         "mul	x25, x10, x21\n\t"
6031         "umulh	x26, x10, x21\n\t"
6032         "adds	x6, x6, x25\n\t"
6033         "adcs	x7, x7, x26\n\t"
6034         "adc	x16, x16, xzr\n\t"
6035         /*  A[0] * B[3] */
6036         "mul	x25, x8, x24\n\t"
6037         "umulh	x26, x8, x24\n\t"
6038         "adds	x7, x7, x25\n\t"
6039         "adcs	x16, x16, x26\n\t"
6040         "adc	x17, xzr, xzr\n\t"
6041         /*  A[1] * B[2] */
6042         "mul	x25, x9, x23\n\t"
6043         "umulh	x26, x9, x23\n\t"
6044         "adds	x7, x7, x25\n\t"
6045         "adcs	x16, x16, x26\n\t"
6046         "adc	x17, x17, xzr\n\t"
6047         /*  A[2] * B[1] */
6048         "mul	x25, x10, x22\n\t"
6049         "umulh	x26, x10, x22\n\t"
6050         "adds	x7, x7, x25\n\t"
6051         "adcs	x16, x16, x26\n\t"
6052         "adc	x17, x17, xzr\n\t"
6053         /*  A[3] * B[0] */
6054         "mul	x25, x11, x21\n\t"
6055         "umulh	x26, x11, x21\n\t"
6056         "adds	x7, x7, x25\n\t"
6057         "adcs	x16, x16, x26\n\t"
6058         "adc	x17, x17, xzr\n\t"
6059         /*  A[1] * B[3] */
6060         "mul	x25, x9, x24\n\t"
6061         "umulh	x26, x9, x24\n\t"
6062         "adds	x16, x16, x25\n\t"
6063         "adcs	x17, x17, x26\n\t"
6064         "adc	x19, xzr, xzr\n\t"
6065         /*  A[2] * B[2] */
6066         "mul	x25, x10, x23\n\t"
6067         "umulh	x26, x10, x23\n\t"
6068         "adds	x16, x16, x25\n\t"
6069         "adcs	x17, x17, x26\n\t"
6070         "adc	x19, x19, xzr\n\t"
6071         /*  A[3] * B[1] */
6072         "mul	x25, x11, x22\n\t"
6073         "umulh	x26, x11, x22\n\t"
6074         "adds	x16, x16, x25\n\t"
6075         "adcs	x17, x17, x26\n\t"
6076         "adc	x19, x19, xzr\n\t"
6077         /*  A[2] * B[3] */
6078         "mul	x25, x10, x24\n\t"
6079         "umulh	x26, x10, x24\n\t"
6080         "adds	x17, x17, x25\n\t"
6081         "adcs	x19, x19, x26\n\t"
6082         "adc	x20, xzr, xzr\n\t"
6083         /*  A[3] * B[2] */
6084         "mul	x25, x11, x23\n\t"
6085         "umulh	x26, x11, x23\n\t"
6086         "adds	x17, x17, x25\n\t"
6087         "adcs	x19, x19, x26\n\t"
6088         "adc	x20, x20, xzr\n\t"
6089         /*  A[3] * B[3] */
6090         "mul	x25, x11, x24\n\t"
6091         "umulh	x26, x11, x24\n\t"
6092         "adds	x19, x19, x25\n\t"
6093         "adc	x20, x20, x26\n\t"
6094         /* Reduce */
6095         /*  Move top half into t4-t7 and remove top bit from t3 */
6096         "extr	x20, x20, x19, #63\n\t"
6097         "extr	x19, x19, x17, #63\n\t"
6098         "extr	x17, x17, x16, #63\n\t"
6099         "extr	x16, x16, x7, #63\n\t"
6100         "and	x7, x7, #0x7fffffffffffffff\n\t"
6101         /*  Multiply top half by 19 */
6102         "mov	x25, #19\n\t"
6103         "mul	x26, x25, x16\n\t"
6104         "umulh	x16, x25, x16\n\t"
6105         "adds	x4, x4, x26\n\t"
6106         "mul	x26, x25, x17\n\t"
6107         "umulh	x17, x25, x17\n\t"
6108         "adcs	x5, x5, x26\n\t"
6109         "mul	x26, x25, x19\n\t"
6110         "umulh	x19, x25, x19\n\t"
6111         "adcs	x6, x6, x26\n\t"
6112         "mul	x26, x25, x20\n\t"
6113         "umulh	x27, x25, x20\n\t"
6114         "adcs	x7, x7, x26\n\t"
6115         "adc	x27, x27, xzr\n\t"
6116         /*  Add remaining product results in */
6117         "adds	x5, x5, x16\n\t"
6118         "adcs	x6, x6, x17\n\t"
6119         "adcs	x7, x7, x19\n\t"
6120         "adc	x27, x27, xzr\n\t"
6121         /*  Overflow */
6122         "extr	x27, x27, x7, #63\n\t"
6123         "mul	x27, x27, x25\n\t"
6124         "and	x7, x7, #0x7fffffffffffffff\n\t"
6125         "adds	x4, x4, x27\n\t"
6126         "adcs	x5, x5, xzr\n\t"
6127         "adcs	x6, x6, xzr\n\t"
6128         "adc	x7, x7, xzr\n\t"
6129         /* Reduce if top bit set */
6130         "and	x27, x25, x7, asr 63\n\t"
6131         "and	x7, x7, #0x7fffffffffffffff\n\t"
6132         "adds	x4, x4, x27\n\t"
6133         "adcs	x5, x5, xzr\n\t"
6134         "adcs	x6, x6, xzr\n\t"
6135         "adc	x7, x7, xzr\n\t"
6136         /* Store */
6137         "ldr	x0, [x29, #24]\n\t"
6138         "ldr	x1, [x29, #16]\n\t"
6139         /* Add */
6140         "adds	x8, x12, x4\n\t"
6141         "adcs	x9, x13, x5\n\t"
6142         "adcs	x10, x14, x6\n\t"
6143         "adc	x11, x15, x7\n\t"
6144         "mov	x25, #-19\n\t"
6145         "asr	x28, x11, #63\n\t"
6146         /*   Mask the modulus */
6147         "and	x25, x28, x25\n\t"
6148         "and	x26, x28, #0x7fffffffffffffff\n\t"
6149         /*   Sub modulus (if overflow) */
6150         "subs	x8, x8, x25\n\t"
6151         "sbcs	x9, x9, x28\n\t"
6152         "sbcs	x10, x10, x28\n\t"
6153         "sbc	x11, x11, x26\n\t"
6154         /* Sub */
6155         "subs	x16, x12, x4\n\t"
6156         "sbcs	x17, x13, x5\n\t"
6157         "sbcs	x19, x14, x6\n\t"
6158         "sbcs	x20, x15, x7\n\t"
6159         "mov	x25, #-19\n\t"
6160         "csetm	x28, cc\n\t"
6161         /*   Mask the modulus */
6162         "and	x25, x28, x25\n\t"
6163         "and	x26, x28, #0x7fffffffffffffff\n\t"
6164         /*   Add modulus (if underflow) */
6165         "adds	x16, x16, x25\n\t"
6166         "adcs	x17, x17, x28\n\t"
6167         "adcs	x19, x19, x28\n\t"
6168         "adc	x20, x20, x26\n\t"
6169         "stp	x8, x9, [x0]\n\t"
6170         "stp	x10, x11, [x0, #16]\n\t"
6171         "stp	x16, x17, [x1]\n\t"
6172         "stp	x19, x20, [x1, #16]\n\t"
6173         "ldr	x0, [x29, #48]\n\t"
6174         "ldr	x1, [x29, #64]\n\t"
6175         "ldr	x2, [sp, #96]\n\t"
6176         /* Multiply */
6177         "ldp	x12, x13, [x1]\n\t"
6178         "ldp	x14, x15, [x1, #16]\n\t"
6179         "ldp	x16, x17, [x2]\n\t"
6180         "ldp	x19, x20, [x2, #16]\n\t"
6181         /*  A[0] * B[0] */
6182         "mul	x4, x12, x16\n\t"
6183         "umulh	x5, x12, x16\n\t"
6184         /*  A[0] * B[1] */
6185         "mul	x25, x12, x17\n\t"
6186         "umulh	x6, x12, x17\n\t"
6187         "adds	x5, x5, x25\n\t"
6188         "adc	x6, x6, xzr\n\t"
6189         /*  A[1] * B[0] */
6190         "mul	x25, x13, x16\n\t"
6191         "umulh	x26, x13, x16\n\t"
6192         "adds	x5, x5, x25\n\t"
6193         "adcs	x6, x6, x26\n\t"
6194         "adc	x7, xzr, xzr\n\t"
6195         /*  A[0] * B[2] */
6196         "mul	x25, x12, x19\n\t"
6197         "umulh	x26, x12, x19\n\t"
6198         "adds	x6, x6, x25\n\t"
6199         "adc	x7, x7, x26\n\t"
6200         /*  A[1] * B[1] */
6201         "mul	x25, x13, x17\n\t"
6202         "umulh	x26, x13, x17\n\t"
6203         "adds	x6, x6, x25\n\t"
6204         "adcs	x7, x7, x26\n\t"
6205         "adc	x8, xzr, xzr\n\t"
6206         /*  A[2] * B[0] */
6207         "mul	x25, x14, x16\n\t"
6208         "umulh	x26, x14, x16\n\t"
6209         "adds	x6, x6, x25\n\t"
6210         "adcs	x7, x7, x26\n\t"
6211         "adc	x8, x8, xzr\n\t"
6212         /*  A[0] * B[3] */
6213         "mul	x25, x12, x20\n\t"
6214         "umulh	x26, x12, x20\n\t"
6215         "adds	x7, x7, x25\n\t"
6216         "adcs	x8, x8, x26\n\t"
6217         "adc	x9, xzr, xzr\n\t"
6218         /*  A[1] * B[2] */
6219         "mul	x25, x13, x19\n\t"
6220         "umulh	x26, x13, x19\n\t"
6221         "adds	x7, x7, x25\n\t"
6222         "adcs	x8, x8, x26\n\t"
6223         "adc	x9, x9, xzr\n\t"
6224         /*  A[2] * B[1] */
6225         "mul	x25, x14, x17\n\t"
6226         "umulh	x26, x14, x17\n\t"
6227         "adds	x7, x7, x25\n\t"
6228         "adcs	x8, x8, x26\n\t"
6229         "adc	x9, x9, xzr\n\t"
6230         /*  A[3] * B[0] */
6231         "mul	x25, x15, x16\n\t"
6232         "umulh	x26, x15, x16\n\t"
6233         "adds	x7, x7, x25\n\t"
6234         "adcs	x8, x8, x26\n\t"
6235         "adc	x9, x9, xzr\n\t"
6236         /*  A[1] * B[3] */
6237         "mul	x25, x13, x20\n\t"
6238         "umulh	x26, x13, x20\n\t"
6239         "adds	x8, x8, x25\n\t"
6240         "adcs	x9, x9, x26\n\t"
6241         "adc	x10, xzr, xzr\n\t"
6242         /*  A[2] * B[2] */
6243         "mul	x25, x14, x19\n\t"
6244         "umulh	x26, x14, x19\n\t"
6245         "adds	x8, x8, x25\n\t"
6246         "adcs	x9, x9, x26\n\t"
6247         "adc	x10, x10, xzr\n\t"
6248         /*  A[3] * B[1] */
6249         "mul	x25, x15, x17\n\t"
6250         "umulh	x26, x15, x17\n\t"
6251         "adds	x8, x8, x25\n\t"
6252         "adcs	x9, x9, x26\n\t"
6253         "adc	x10, x10, xzr\n\t"
6254         /*  A[2] * B[3] */
6255         "mul	x25, x14, x20\n\t"
6256         "umulh	x26, x14, x20\n\t"
6257         "adds	x9, x9, x25\n\t"
6258         "adcs	x10, x10, x26\n\t"
6259         "adc	x11, xzr, xzr\n\t"
6260         /*  A[3] * B[2] */
6261         "mul	x25, x15, x19\n\t"
6262         "umulh	x26, x15, x19\n\t"
6263         "adds	x9, x9, x25\n\t"
6264         "adcs	x10, x10, x26\n\t"
6265         "adc	x11, x11, xzr\n\t"
6266         /*  A[3] * B[3] */
6267         "mul	x25, x15, x20\n\t"
6268         "umulh	x26, x15, x20\n\t"
6269         "adds	x10, x10, x25\n\t"
6270         "adc	x11, x11, x26\n\t"
6271         /* Reduce */
6272         /*  Move top half into t4-t7 and remove top bit from t3 */
6273         "extr	x11, x11, x10, #63\n\t"
6274         "extr	x10, x10, x9, #63\n\t"
6275         "extr	x9, x9, x8, #63\n\t"
6276         "extr	x8, x8, x7, #63\n\t"
6277         "and	x7, x7, #0x7fffffffffffffff\n\t"
6278         /*  Multiply top half by 19 */
6279         "mov	x25, #19\n\t"
6280         "mul	x26, x25, x8\n\t"
6281         "umulh	x8, x25, x8\n\t"
6282         "adds	x4, x4, x26\n\t"
6283         "mul	x26, x25, x9\n\t"
6284         "umulh	x9, x25, x9\n\t"
6285         "adcs	x5, x5, x26\n\t"
6286         "mul	x26, x25, x10\n\t"
6287         "umulh	x10, x25, x10\n\t"
6288         "adcs	x6, x6, x26\n\t"
6289         "mul	x26, x25, x11\n\t"
6290         "umulh	x27, x25, x11\n\t"
6291         "adcs	x7, x7, x26\n\t"
6292         "adc	x27, x27, xzr\n\t"
6293         /*  Add remaining product results in */
6294         "adds	x5, x5, x8\n\t"
6295         "adcs	x6, x6, x9\n\t"
6296         "adcs	x7, x7, x10\n\t"
6297         "adc	x27, x27, xzr\n\t"
6298         /*  Overflow */
6299         "extr	x27, x27, x7, #63\n\t"
6300         "mul	x27, x27, x25\n\t"
6301         "and	x7, x7, #0x7fffffffffffffff\n\t"
6302         "adds	x4, x4, x27\n\t"
6303         "adcs	x5, x5, xzr\n\t"
6304         "adcs	x6, x6, xzr\n\t"
6305         "adc	x7, x7, xzr\n\t"
6306         /* Reduce if top bit set */
6307         "and	x27, x25, x7, asr 63\n\t"
6308         "and	x7, x7, #0x7fffffffffffffff\n\t"
6309         "adds	x4, x4, x27\n\t"
6310         "adcs	x5, x5, xzr\n\t"
6311         "adcs	x6, x6, xzr\n\t"
6312         "adc	x7, x7, xzr\n\t"
6313         /* Store */
6314         "ldr	x0, [x29, #48]\n\t"
6315         /* Double */
6316         "adds	x4, x4, x4\n\t"
6317         "adcs	x5, x5, x5\n\t"
6318         "adcs	x6, x6, x6\n\t"
6319         "adc	x7, x7, x7\n\t"
6320         "mov	x25, #-19\n\t"
6321         "asr	x28, x7, #63\n\t"
6322         /*   Mask the modulus */
6323         "and	x25, x28, x25\n\t"
6324         "and	x26, x28, #0x7fffffffffffffff\n\t"
6325         /*   Sub modulus (if overflow) */
6326         "subs	x4, x4, x25\n\t"
6327         "sbcs	x5, x5, x28\n\t"
6328         "sbcs	x6, x6, x28\n\t"
6329         "sbc	x7, x7, x26\n\t"
6330         "ldr	x0, [x29, #40]\n\t"
6331         "ldr	x1, [sp, #104]\n\t"
6332         "ldr	x2, [x29, #72]\n\t"
6333         /* Multiply */
6334         "ldp	x16, x17, [x1]\n\t"
6335         "ldp	x19, x20, [x1, #16]\n\t"
6336         "ldp	x21, x22, [x2]\n\t"
6337         "ldp	x23, x24, [x2, #16]\n\t"
6338         /*  A[0] * B[0] */
6339         "mul	x8, x16, x21\n\t"
6340         "umulh	x9, x16, x21\n\t"
6341         /*  A[0] * B[1] */
6342         "mul	x25, x16, x22\n\t"
6343         "umulh	x10, x16, x22\n\t"
6344         "adds	x9, x9, x25\n\t"
6345         "adc	x10, x10, xzr\n\t"
6346         /*  A[1] * B[0] */
6347         "mul	x25, x17, x21\n\t"
6348         "umulh	x26, x17, x21\n\t"
6349         "adds	x9, x9, x25\n\t"
6350         "adcs	x10, x10, x26\n\t"
6351         "adc	x11, xzr, xzr\n\t"
6352         /*  A[0] * B[2] */
6353         "mul	x25, x16, x23\n\t"
6354         "umulh	x26, x16, x23\n\t"
6355         "adds	x10, x10, x25\n\t"
6356         "adc	x11, x11, x26\n\t"
6357         /*  A[1] * B[1] */
6358         "mul	x25, x17, x22\n\t"
6359         "umulh	x26, x17, x22\n\t"
6360         "adds	x10, x10, x25\n\t"
6361         "adcs	x11, x11, x26\n\t"
6362         "adc	x12, xzr, xzr\n\t"
6363         /*  A[2] * B[0] */
6364         "mul	x25, x19, x21\n\t"
6365         "umulh	x26, x19, x21\n\t"
6366         "adds	x10, x10, x25\n\t"
6367         "adcs	x11, x11, x26\n\t"
6368         "adc	x12, x12, xzr\n\t"
6369         /*  A[0] * B[3] */
6370         "mul	x25, x16, x24\n\t"
6371         "umulh	x26, x16, x24\n\t"
6372         "adds	x11, x11, x25\n\t"
6373         "adcs	x12, x12, x26\n\t"
6374         "adc	x13, xzr, xzr\n\t"
6375         /*  A[1] * B[2] */
6376         "mul	x25, x17, x23\n\t"
6377         "umulh	x26, x17, x23\n\t"
6378         "adds	x11, x11, x25\n\t"
6379         "adcs	x12, x12, x26\n\t"
6380         "adc	x13, x13, xzr\n\t"
6381         /*  A[2] * B[1] */
6382         "mul	x25, x19, x22\n\t"
6383         "umulh	x26, x19, x22\n\t"
6384         "adds	x11, x11, x25\n\t"
6385         "adcs	x12, x12, x26\n\t"
6386         "adc	x13, x13, xzr\n\t"
6387         /*  A[3] * B[0] */
6388         "mul	x25, x20, x21\n\t"
6389         "umulh	x26, x20, x21\n\t"
6390         "adds	x11, x11, x25\n\t"
6391         "adcs	x12, x12, x26\n\t"
6392         "adc	x13, x13, xzr\n\t"
6393         /*  A[1] * B[3] */
6394         "mul	x25, x17, x24\n\t"
6395         "umulh	x26, x17, x24\n\t"
6396         "adds	x12, x12, x25\n\t"
6397         "adcs	x13, x13, x26\n\t"
6398         "adc	x14, xzr, xzr\n\t"
6399         /*  A[2] * B[2] */
6400         "mul	x25, x19, x23\n\t"
6401         "umulh	x26, x19, x23\n\t"
6402         "adds	x12, x12, x25\n\t"
6403         "adcs	x13, x13, x26\n\t"
6404         "adc	x14, x14, xzr\n\t"
6405         /*  A[3] * B[1] */
6406         "mul	x25, x20, x22\n\t"
6407         "umulh	x26, x20, x22\n\t"
6408         "adds	x12, x12, x25\n\t"
6409         "adcs	x13, x13, x26\n\t"
6410         "adc	x14, x14, xzr\n\t"
6411         /*  A[2] * B[3] */
6412         "mul	x25, x19, x24\n\t"
6413         "umulh	x26, x19, x24\n\t"
6414         "adds	x13, x13, x25\n\t"
6415         "adcs	x14, x14, x26\n\t"
6416         "adc	x15, xzr, xzr\n\t"
6417         /*  A[3] * B[2] */
6418         "mul	x25, x20, x23\n\t"
6419         "umulh	x26, x20, x23\n\t"
6420         "adds	x13, x13, x25\n\t"
6421         "adcs	x14, x14, x26\n\t"
6422         "adc	x15, x15, xzr\n\t"
6423         /*  A[3] * B[3] */
6424         "mul	x25, x20, x24\n\t"
6425         "umulh	x26, x20, x24\n\t"
6426         "adds	x14, x14, x25\n\t"
6427         "adc	x15, x15, x26\n\t"
6428         /* Reduce */
6429         /*  Move top half into t4-t7 and remove top bit from t3 */
6430         "extr	x15, x15, x14, #63\n\t"
6431         "extr	x14, x14, x13, #63\n\t"
6432         "extr	x13, x13, x12, #63\n\t"
6433         "extr	x12, x12, x11, #63\n\t"
6434         "and	x11, x11, #0x7fffffffffffffff\n\t"
6435         /*  Multiply top half by 19 */
6436         "mov	x25, #19\n\t"
6437         "mul	x26, x25, x12\n\t"
6438         "umulh	x12, x25, x12\n\t"
6439         "adds	x8, x8, x26\n\t"
6440         "mul	x26, x25, x13\n\t"
6441         "umulh	x13, x25, x13\n\t"
6442         "adcs	x9, x9, x26\n\t"
6443         "mul	x26, x25, x14\n\t"
6444         "umulh	x14, x25, x14\n\t"
6445         "adcs	x10, x10, x26\n\t"
6446         "mul	x26, x25, x15\n\t"
6447         "umulh	x27, x25, x15\n\t"
6448         "adcs	x11, x11, x26\n\t"
6449         "adc	x27, x27, xzr\n\t"
6450         /*  Add remaining product results in */
6451         "adds	x9, x9, x12\n\t"
6452         "adcs	x10, x10, x13\n\t"
6453         "adcs	x11, x11, x14\n\t"
6454         "adc	x27, x27, xzr\n\t"
6455         /*  Overflow */
6456         "extr	x27, x27, x11, #63\n\t"
6457         "mul	x27, x27, x25\n\t"
6458         "and	x11, x11, #0x7fffffffffffffff\n\t"
6459         "adds	x8, x8, x27\n\t"
6460         "adcs	x9, x9, xzr\n\t"
6461         "adcs	x10, x10, xzr\n\t"
6462         "adc	x11, x11, xzr\n\t"
6463         /* Reduce if top bit set */
6464         "and	x27, x25, x11, asr 63\n\t"
6465         "and	x11, x11, #0x7fffffffffffffff\n\t"
6466         "adds	x8, x8, x27\n\t"
6467         "adcs	x9, x9, xzr\n\t"
6468         "adcs	x10, x10, xzr\n\t"
6469         "adc	x11, x11, xzr\n\t"
6470         /* Store */
6471         "ldr	x0, [x29, #32]\n\t"
6472         "ldr	x1, [x29, #40]\n\t"
6473         /* Add */
6474         "adds	x12, x4, x8\n\t"
6475         "adcs	x13, x5, x9\n\t"
6476         "adcs	x14, x6, x10\n\t"
6477         "adc	x15, x7, x11\n\t"
6478         "mov	x25, #-19\n\t"
6479         "asr	x28, x15, #63\n\t"
6480         /*   Mask the modulus */
6481         "and	x25, x28, x25\n\t"
6482         "and	x26, x28, #0x7fffffffffffffff\n\t"
6483         /*   Sub modulus (if overflow) */
6484         "subs	x12, x12, x25\n\t"
6485         "sbcs	x13, x13, x28\n\t"
6486         "sbcs	x14, x14, x28\n\t"
6487         "sbc	x15, x15, x26\n\t"
6488         /* Sub */
6489         "subs	x16, x4, x8\n\t"
6490         "sbcs	x17, x5, x9\n\t"
6491         "sbcs	x19, x6, x10\n\t"
6492         "sbcs	x20, x7, x11\n\t"
6493         "mov	x25, #-19\n\t"
6494         "csetm	x28, cc\n\t"
6495         /*   Mask the modulus */
6496         "and	x25, x28, x25\n\t"
6497         "and	x26, x28, #0x7fffffffffffffff\n\t"
6498         /*   Add modulus (if underflow) */
6499         "adds	x16, x16, x25\n\t"
6500         "adcs	x17, x17, x28\n\t"
6501         "adcs	x19, x19, x28\n\t"
6502         "adc	x20, x20, x26\n\t"
6503         "stp	x12, x13, [x0]\n\t"
6504         "stp	x14, x15, [x0, #16]\n\t"
6505         "stp	x16, x17, [x1]\n\t"
6506         "stp	x19, x20, [x1, #16]\n\t"
6507         "ldp	x29, x30, [sp], #0x80\n\t"
6508         : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt), [qz] "+r" (qz), [qt2d] "+r" (qt2d), [qyplusx] "+r" (qyplusx), [qyminusx] "+r" (qyminusx)
6509         :
6510         : "memory", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
6511     );
6512 }
6513 
fe_ge_sub(fe rx,fe ry,fe rz,fe rt,const fe px,const fe py,const fe pz,const fe pt,const fe qz,const fe qt2d,const fe qyplusx,const fe qyminusx)6514 void fe_ge_sub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qz, const fe qt2d, const fe qyplusx, const fe qyminusx)
6515 {
6516     __asm__ __volatile__ (
6517         "stp	x29, x30, [sp, #-128]!\n\t"
6518         "add	x29, sp, #0\n\t"
6519         "str	%x[qyminusx], [sp, #120]\n\t"
6520         "str	%x[qyplusx], [sp, #112]\n\t"
6521         "str	%x[qt2d], [sp, #104]\n\t"
6522         "str	%x[qz], [sp, #96]\n\t"
6523         "str	%x[rx], [x29, #16]\n\t"
6524         "str	%x[ry], [x29, #24]\n\t"
6525         "str	%x[rz], [x29, #32]\n\t"
6526         "str	%x[rt], [x29, #40]\n\t"
6527         "str	%x[px], [x29, #48]\n\t"
6528         "str	%x[py], [x29, #56]\n\t"
6529         "str	%x[pz], [x29, #64]\n\t"
6530         "str	%x[pt], [x29, #72]\n\t"
6531         "ldr	x2, [x29, #56]\n\t"
6532         "ldr	x3, [x29, #48]\n\t"
6533         /* Add */
6534         "ldp	x12, x13, [x2]\n\t"
6535         "ldp	x14, x15, [x2, #16]\n\t"
6536         "ldp	x16, x17, [x3]\n\t"
6537         "ldp	x19, x20, [x3, #16]\n\t"
6538         "adds	x4, x12, x16\n\t"
6539         "adcs	x5, x13, x17\n\t"
6540         "adcs	x6, x14, x19\n\t"
6541         "adc	x7, x15, x20\n\t"
6542         "mov	x25, #-19\n\t"
6543         "asr	x28, x7, #63\n\t"
6544         /*   Mask the modulus */
6545         "and	x25, x28, x25\n\t"
6546         "and	x26, x28, #0x7fffffffffffffff\n\t"
6547         /*   Sub modulus (if overflow) */
6548         "subs	x4, x4, x25\n\t"
6549         "sbcs	x5, x5, x28\n\t"
6550         "sbcs	x6, x6, x28\n\t"
6551         "sbc	x7, x7, x26\n\t"
6552         /* Sub */
6553         "subs	x8, x12, x16\n\t"
6554         "sbcs	x9, x13, x17\n\t"
6555         "sbcs	x10, x14, x19\n\t"
6556         "sbcs	x11, x15, x20\n\t"
6557         "mov	x25, #-19\n\t"
6558         "csetm	x28, cc\n\t"
6559         /*   Mask the modulus */
6560         "and	x25, x28, x25\n\t"
6561         "and	x26, x28, #0x7fffffffffffffff\n\t"
6562         /*   Add modulus (if underflow) */
6563         "adds	x8, x8, x25\n\t"
6564         "adcs	x9, x9, x28\n\t"
6565         "adcs	x10, x10, x28\n\t"
6566         "adc	x11, x11, x26\n\t"
6567         "ldr	x0, [x29, #32]\n\t"
6568         "ldr	x2, [sp, #120]\n\t"
6569         /* Multiply */
6570         "ldp	x21, x22, [x2]\n\t"
6571         "ldp	x23, x24, [x2, #16]\n\t"
6572         /*  A[0] * B[0] */
6573         "mul	x12, x4, x21\n\t"
6574         "umulh	x13, x4, x21\n\t"
6575         /*  A[0] * B[1] */
6576         "mul	x25, x4, x22\n\t"
6577         "umulh	x14, x4, x22\n\t"
6578         "adds	x13, x13, x25\n\t"
6579         "adc	x14, x14, xzr\n\t"
6580         /*  A[1] * B[0] */
6581         "mul	x25, x5, x21\n\t"
6582         "umulh	x26, x5, x21\n\t"
6583         "adds	x13, x13, x25\n\t"
6584         "adcs	x14, x14, x26\n\t"
6585         "adc	x15, xzr, xzr\n\t"
6586         /*  A[0] * B[2] */
6587         "mul	x25, x4, x23\n\t"
6588         "umulh	x26, x4, x23\n\t"
6589         "adds	x14, x14, x25\n\t"
6590         "adc	x15, x15, x26\n\t"
6591         /*  A[1] * B[1] */
6592         "mul	x25, x5, x22\n\t"
6593         "umulh	x26, x5, x22\n\t"
6594         "adds	x14, x14, x25\n\t"
6595         "adcs	x15, x15, x26\n\t"
6596         "adc	x16, xzr, xzr\n\t"
6597         /*  A[2] * B[0] */
6598         "mul	x25, x6, x21\n\t"
6599         "umulh	x26, x6, x21\n\t"
6600         "adds	x14, x14, x25\n\t"
6601         "adcs	x15, x15, x26\n\t"
6602         "adc	x16, x16, xzr\n\t"
6603         /*  A[0] * B[3] */
6604         "mul	x25, x4, x24\n\t"
6605         "umulh	x26, x4, x24\n\t"
6606         "adds	x15, x15, x25\n\t"
6607         "adcs	x16, x16, x26\n\t"
6608         "adc	x17, xzr, xzr\n\t"
6609         /*  A[1] * B[2] */
6610         "mul	x25, x5, x23\n\t"
6611         "umulh	x26, x5, x23\n\t"
6612         "adds	x15, x15, x25\n\t"
6613         "adcs	x16, x16, x26\n\t"
6614         "adc	x17, x17, xzr\n\t"
6615         /*  A[2] * B[1] */
6616         "mul	x25, x6, x22\n\t"
6617         "umulh	x26, x6, x22\n\t"
6618         "adds	x15, x15, x25\n\t"
6619         "adcs	x16, x16, x26\n\t"
6620         "adc	x17, x17, xzr\n\t"
6621         /*  A[3] * B[0] */
6622         "mul	x25, x7, x21\n\t"
6623         "umulh	x26, x7, x21\n\t"
6624         "adds	x15, x15, x25\n\t"
6625         "adcs	x16, x16, x26\n\t"
6626         "adc	x17, x17, xzr\n\t"
6627         /*  A[1] * B[3] */
6628         "mul	x25, x5, x24\n\t"
6629         "umulh	x26, x5, x24\n\t"
6630         "adds	x16, x16, x25\n\t"
6631         "adcs	x17, x17, x26\n\t"
6632         "adc	x19, xzr, xzr\n\t"
6633         /*  A[2] * B[2] */
6634         "mul	x25, x6, x23\n\t"
6635         "umulh	x26, x6, x23\n\t"
6636         "adds	x16, x16, x25\n\t"
6637         "adcs	x17, x17, x26\n\t"
6638         "adc	x19, x19, xzr\n\t"
6639         /*  A[3] * B[1] */
6640         "mul	x25, x7, x22\n\t"
6641         "umulh	x26, x7, x22\n\t"
6642         "adds	x16, x16, x25\n\t"
6643         "adcs	x17, x17, x26\n\t"
6644         "adc	x19, x19, xzr\n\t"
6645         /*  A[2] * B[3] */
6646         "mul	x25, x6, x24\n\t"
6647         "umulh	x26, x6, x24\n\t"
6648         "adds	x17, x17, x25\n\t"
6649         "adcs	x19, x19, x26\n\t"
6650         "adc	x20, xzr, xzr\n\t"
6651         /*  A[3] * B[2] */
6652         "mul	x25, x7, x23\n\t"
6653         "umulh	x26, x7, x23\n\t"
6654         "adds	x17, x17, x25\n\t"
6655         "adcs	x19, x19, x26\n\t"
6656         "adc	x20, x20, xzr\n\t"
6657         /*  A[3] * B[3] */
6658         "mul	x25, x7, x24\n\t"
6659         "umulh	x26, x7, x24\n\t"
6660         "adds	x19, x19, x25\n\t"
6661         "adc	x20, x20, x26\n\t"
6662         /* Reduce */
6663         /*  Move top half into t4-t7 and remove top bit from t3 */
6664         "extr	x20, x20, x19, #63\n\t"
6665         "extr	x19, x19, x17, #63\n\t"
6666         "extr	x17, x17, x16, #63\n\t"
6667         "extr	x16, x16, x15, #63\n\t"
6668         "and	x15, x15, #0x7fffffffffffffff\n\t"
6669         /*  Multiply top half by 19 */
6670         "mov	x25, #19\n\t"
6671         "mul	x26, x25, x16\n\t"
6672         "umulh	x16, x25, x16\n\t"
6673         "adds	x12, x12, x26\n\t"
6674         "mul	x26, x25, x17\n\t"
6675         "umulh	x17, x25, x17\n\t"
6676         "adcs	x13, x13, x26\n\t"
6677         "mul	x26, x25, x19\n\t"
6678         "umulh	x19, x25, x19\n\t"
6679         "adcs	x14, x14, x26\n\t"
6680         "mul	x26, x25, x20\n\t"
6681         "umulh	x27, x25, x20\n\t"
6682         "adcs	x15, x15, x26\n\t"
6683         "adc	x27, x27, xzr\n\t"
6684         /*  Add remaining product results in */
6685         "adds	x13, x13, x16\n\t"
6686         "adcs	x14, x14, x17\n\t"
6687         "adcs	x15, x15, x19\n\t"
6688         "adc	x27, x27, xzr\n\t"
6689         /*  Overflow */
6690         "extr	x27, x27, x15, #63\n\t"
6691         "mul	x27, x27, x25\n\t"
6692         "and	x15, x15, #0x7fffffffffffffff\n\t"
6693         "adds	x12, x12, x27\n\t"
6694         "adcs	x13, x13, xzr\n\t"
6695         "adcs	x14, x14, xzr\n\t"
6696         "adc	x15, x15, xzr\n\t"
6697         /* Reduce if top bit set */
6698         "and	x27, x25, x15, asr 63\n\t"
6699         "and	x15, x15, #0x7fffffffffffffff\n\t"
6700         "adds	x12, x12, x27\n\t"
6701         "adcs	x13, x13, xzr\n\t"
6702         "adcs	x14, x14, xzr\n\t"
6703         "adc	x15, x15, xzr\n\t"
6704         /* Store */
6705         "ldr	x0, [x29, #24]\n\t"
6706         "ldr	x1, [sp, #112]\n\t"
6707         /* Multiply */
6708         "ldp	x21, x22, [x1]\n\t"
6709         "ldp	x23, x24, [x1, #16]\n\t"
6710         /*  A[0] * B[0] */
6711         "mul	x4, x8, x21\n\t"
6712         "umulh	x5, x8, x21\n\t"
6713         /*  A[0] * B[1] */
6714         "mul	x25, x8, x22\n\t"
6715         "umulh	x6, x8, x22\n\t"
6716         "adds	x5, x5, x25\n\t"
6717         "adc	x6, x6, xzr\n\t"
6718         /*  A[1] * B[0] */
6719         "mul	x25, x9, x21\n\t"
6720         "umulh	x26, x9, x21\n\t"
6721         "adds	x5, x5, x25\n\t"
6722         "adcs	x6, x6, x26\n\t"
6723         "adc	x7, xzr, xzr\n\t"
6724         /*  A[0] * B[2] */
6725         "mul	x25, x8, x23\n\t"
6726         "umulh	x26, x8, x23\n\t"
6727         "adds	x6, x6, x25\n\t"
6728         "adc	x7, x7, x26\n\t"
6729         /*  A[1] * B[1] */
6730         "mul	x25, x9, x22\n\t"
6731         "umulh	x26, x9, x22\n\t"
6732         "adds	x6, x6, x25\n\t"
6733         "adcs	x7, x7, x26\n\t"
6734         "adc	x16, xzr, xzr\n\t"
6735         /*  A[2] * B[0] */
6736         "mul	x25, x10, x21\n\t"
6737         "umulh	x26, x10, x21\n\t"
6738         "adds	x6, x6, x25\n\t"
6739         "adcs	x7, x7, x26\n\t"
6740         "adc	x16, x16, xzr\n\t"
6741         /*  A[0] * B[3] */
6742         "mul	x25, x8, x24\n\t"
6743         "umulh	x26, x8, x24\n\t"
6744         "adds	x7, x7, x25\n\t"
6745         "adcs	x16, x16, x26\n\t"
6746         "adc	x17, xzr, xzr\n\t"
6747         /*  A[1] * B[2] */
6748         "mul	x25, x9, x23\n\t"
6749         "umulh	x26, x9, x23\n\t"
6750         "adds	x7, x7, x25\n\t"
6751         "adcs	x16, x16, x26\n\t"
6752         "adc	x17, x17, xzr\n\t"
6753         /*  A[2] * B[1] */
6754         "mul	x25, x10, x22\n\t"
6755         "umulh	x26, x10, x22\n\t"
6756         "adds	x7, x7, x25\n\t"
6757         "adcs	x16, x16, x26\n\t"
6758         "adc	x17, x17, xzr\n\t"
6759         /*  A[3] * B[0] */
6760         "mul	x25, x11, x21\n\t"
6761         "umulh	x26, x11, x21\n\t"
6762         "adds	x7, x7, x25\n\t"
6763         "adcs	x16, x16, x26\n\t"
6764         "adc	x17, x17, xzr\n\t"
6765         /*  A[1] * B[3] */
6766         "mul	x25, x9, x24\n\t"
6767         "umulh	x26, x9, x24\n\t"
6768         "adds	x16, x16, x25\n\t"
6769         "adcs	x17, x17, x26\n\t"
6770         "adc	x19, xzr, xzr\n\t"
6771         /*  A[2] * B[2] */
6772         "mul	x25, x10, x23\n\t"
6773         "umulh	x26, x10, x23\n\t"
6774         "adds	x16, x16, x25\n\t"
6775         "adcs	x17, x17, x26\n\t"
6776         "adc	x19, x19, xzr\n\t"
6777         /*  A[3] * B[1] */
6778         "mul	x25, x11, x22\n\t"
6779         "umulh	x26, x11, x22\n\t"
6780         "adds	x16, x16, x25\n\t"
6781         "adcs	x17, x17, x26\n\t"
6782         "adc	x19, x19, xzr\n\t"
6783         /*  A[2] * B[3] */
6784         "mul	x25, x10, x24\n\t"
6785         "umulh	x26, x10, x24\n\t"
6786         "adds	x17, x17, x25\n\t"
6787         "adcs	x19, x19, x26\n\t"
6788         "adc	x20, xzr, xzr\n\t"
6789         /*  A[3] * B[2] */
6790         "mul	x25, x11, x23\n\t"
6791         "umulh	x26, x11, x23\n\t"
6792         "adds	x17, x17, x25\n\t"
6793         "adcs	x19, x19, x26\n\t"
6794         "adc	x20, x20, xzr\n\t"
6795         /*  A[3] * B[3] */
6796         "mul	x25, x11, x24\n\t"
6797         "umulh	x26, x11, x24\n\t"
6798         "adds	x19, x19, x25\n\t"
6799         "adc	x20, x20, x26\n\t"
6800         /* Reduce */
6801         /*  Move top half into t4-t7 and remove top bit from t3 */
6802         "extr	x20, x20, x19, #63\n\t"
6803         "extr	x19, x19, x17, #63\n\t"
6804         "extr	x17, x17, x16, #63\n\t"
6805         "extr	x16, x16, x7, #63\n\t"
6806         "and	x7, x7, #0x7fffffffffffffff\n\t"
6807         /*  Multiply top half by 19 */
6808         "mov	x25, #19\n\t"
6809         "mul	x26, x25, x16\n\t"
6810         "umulh	x16, x25, x16\n\t"
6811         "adds	x4, x4, x26\n\t"
6812         "mul	x26, x25, x17\n\t"
6813         "umulh	x17, x25, x17\n\t"
6814         "adcs	x5, x5, x26\n\t"
6815         "mul	x26, x25, x19\n\t"
6816         "umulh	x19, x25, x19\n\t"
6817         "adcs	x6, x6, x26\n\t"
6818         "mul	x26, x25, x20\n\t"
6819         "umulh	x27, x25, x20\n\t"
6820         "adcs	x7, x7, x26\n\t"
6821         "adc	x27, x27, xzr\n\t"
6822         /*  Add remaining product results in */
6823         "adds	x5, x5, x16\n\t"
6824         "adcs	x6, x6, x17\n\t"
6825         "adcs	x7, x7, x19\n\t"
6826         "adc	x27, x27, xzr\n\t"
6827         /*  Overflow */
6828         "extr	x27, x27, x7, #63\n\t"
6829         "mul	x27, x27, x25\n\t"
6830         "and	x7, x7, #0x7fffffffffffffff\n\t"
6831         "adds	x4, x4, x27\n\t"
6832         "adcs	x5, x5, xzr\n\t"
6833         "adcs	x6, x6, xzr\n\t"
6834         "adc	x7, x7, xzr\n\t"
6835         /* Reduce if top bit set */
6836         "and	x27, x25, x7, asr 63\n\t"
6837         "and	x7, x7, #0x7fffffffffffffff\n\t"
6838         "adds	x4, x4, x27\n\t"
6839         "adcs	x5, x5, xzr\n\t"
6840         "adcs	x6, x6, xzr\n\t"
6841         "adc	x7, x7, xzr\n\t"
6842         /* Store */
6843         "ldr	x0, [x29, #24]\n\t"
6844         "ldr	x1, [x29, #16]\n\t"
6845         /* Add */
6846         "adds	x8, x12, x4\n\t"
6847         "adcs	x9, x13, x5\n\t"
6848         "adcs	x10, x14, x6\n\t"
6849         "adc	x11, x15, x7\n\t"
6850         "mov	x25, #-19\n\t"
6851         "asr	x28, x11, #63\n\t"
6852         /*   Mask the modulus */
6853         "and	x25, x28, x25\n\t"
6854         "and	x26, x28, #0x7fffffffffffffff\n\t"
6855         /*   Sub modulus (if overflow) */
6856         "subs	x8, x8, x25\n\t"
6857         "sbcs	x9, x9, x28\n\t"
6858         "sbcs	x10, x10, x28\n\t"
6859         "sbc	x11, x11, x26\n\t"
6860         /* Sub */
6861         "subs	x16, x12, x4\n\t"
6862         "sbcs	x17, x13, x5\n\t"
6863         "sbcs	x19, x14, x6\n\t"
6864         "sbcs	x20, x15, x7\n\t"
6865         "mov	x25, #-19\n\t"
6866         "csetm	x28, cc\n\t"
6867         /*   Mask the modulus */
6868         "and	x25, x28, x25\n\t"
6869         "and	x26, x28, #0x7fffffffffffffff\n\t"
6870         /*   Add modulus (if underflow) */
6871         "adds	x16, x16, x25\n\t"
6872         "adcs	x17, x17, x28\n\t"
6873         "adcs	x19, x19, x28\n\t"
6874         "adc	x20, x20, x26\n\t"
6875         "stp	x8, x9, [x0]\n\t"
6876         "stp	x10, x11, [x0, #16]\n\t"
6877         "stp	x16, x17, [x1]\n\t"
6878         "stp	x19, x20, [x1, #16]\n\t"
6879         "ldr	x0, [x29, #48]\n\t"
6880         "ldr	x1, [x29, #64]\n\t"
6881         "ldr	x2, [sp, #96]\n\t"
6882         /* Multiply */
6883         "ldp	x12, x13, [x1]\n\t"
6884         "ldp	x14, x15, [x1, #16]\n\t"
6885         "ldp	x16, x17, [x2]\n\t"
6886         "ldp	x19, x20, [x2, #16]\n\t"
6887         /*  A[0] * B[0] */
6888         "mul	x4, x12, x16\n\t"
6889         "umulh	x5, x12, x16\n\t"
6890         /*  A[0] * B[1] */
6891         "mul	x25, x12, x17\n\t"
6892         "umulh	x6, x12, x17\n\t"
6893         "adds	x5, x5, x25\n\t"
6894         "adc	x6, x6, xzr\n\t"
6895         /*  A[1] * B[0] */
6896         "mul	x25, x13, x16\n\t"
6897         "umulh	x26, x13, x16\n\t"
6898         "adds	x5, x5, x25\n\t"
6899         "adcs	x6, x6, x26\n\t"
6900         "adc	x7, xzr, xzr\n\t"
6901         /*  A[0] * B[2] */
6902         "mul	x25, x12, x19\n\t"
6903         "umulh	x26, x12, x19\n\t"
6904         "adds	x6, x6, x25\n\t"
6905         "adc	x7, x7, x26\n\t"
6906         /*  A[1] * B[1] */
6907         "mul	x25, x13, x17\n\t"
6908         "umulh	x26, x13, x17\n\t"
6909         "adds	x6, x6, x25\n\t"
6910         "adcs	x7, x7, x26\n\t"
6911         "adc	x8, xzr, xzr\n\t"
6912         /*  A[2] * B[0] */
6913         "mul	x25, x14, x16\n\t"
6914         "umulh	x26, x14, x16\n\t"
6915         "adds	x6, x6, x25\n\t"
6916         "adcs	x7, x7, x26\n\t"
6917         "adc	x8, x8, xzr\n\t"
6918         /*  A[0] * B[3] */
6919         "mul	x25, x12, x20\n\t"
6920         "umulh	x26, x12, x20\n\t"
6921         "adds	x7, x7, x25\n\t"
6922         "adcs	x8, x8, x26\n\t"
6923         "adc	x9, xzr, xzr\n\t"
6924         /*  A[1] * B[2] */
6925         "mul	x25, x13, x19\n\t"
6926         "umulh	x26, x13, x19\n\t"
6927         "adds	x7, x7, x25\n\t"
6928         "adcs	x8, x8, x26\n\t"
6929         "adc	x9, x9, xzr\n\t"
6930         /*  A[2] * B[1] */
6931         "mul	x25, x14, x17\n\t"
6932         "umulh	x26, x14, x17\n\t"
6933         "adds	x7, x7, x25\n\t"
6934         "adcs	x8, x8, x26\n\t"
6935         "adc	x9, x9, xzr\n\t"
6936         /*  A[3] * B[0] */
6937         "mul	x25, x15, x16\n\t"
6938         "umulh	x26, x15, x16\n\t"
6939         "adds	x7, x7, x25\n\t"
6940         "adcs	x8, x8, x26\n\t"
6941         "adc	x9, x9, xzr\n\t"
6942         /*  A[1] * B[3] */
6943         "mul	x25, x13, x20\n\t"
6944         "umulh	x26, x13, x20\n\t"
6945         "adds	x8, x8, x25\n\t"
6946         "adcs	x9, x9, x26\n\t"
6947         "adc	x10, xzr, xzr\n\t"
6948         /*  A[2] * B[2] */
6949         "mul	x25, x14, x19\n\t"
6950         "umulh	x26, x14, x19\n\t"
6951         "adds	x8, x8, x25\n\t"
6952         "adcs	x9, x9, x26\n\t"
6953         "adc	x10, x10, xzr\n\t"
6954         /*  A[3] * B[1] */
6955         "mul	x25, x15, x17\n\t"
6956         "umulh	x26, x15, x17\n\t"
6957         "adds	x8, x8, x25\n\t"
6958         "adcs	x9, x9, x26\n\t"
6959         "adc	x10, x10, xzr\n\t"
6960         /*  A[2] * B[3] */
6961         "mul	x25, x14, x20\n\t"
6962         "umulh	x26, x14, x20\n\t"
6963         "adds	x9, x9, x25\n\t"
6964         "adcs	x10, x10, x26\n\t"
6965         "adc	x11, xzr, xzr\n\t"
6966         /*  A[3] * B[2] */
6967         "mul	x25, x15, x19\n\t"
6968         "umulh	x26, x15, x19\n\t"
6969         "adds	x9, x9, x25\n\t"
6970         "adcs	x10, x10, x26\n\t"
6971         "adc	x11, x11, xzr\n\t"
6972         /*  A[3] * B[3] */
6973         "mul	x25, x15, x20\n\t"
6974         "umulh	x26, x15, x20\n\t"
6975         "adds	x10, x10, x25\n\t"
6976         "adc	x11, x11, x26\n\t"
6977         /* Reduce */
6978         /*  Move top half into t4-t7 and remove top bit from t3 */
6979         "extr	x11, x11, x10, #63\n\t"
6980         "extr	x10, x10, x9, #63\n\t"
6981         "extr	x9, x9, x8, #63\n\t"
6982         "extr	x8, x8, x7, #63\n\t"
6983         "and	x7, x7, #0x7fffffffffffffff\n\t"
6984         /*  Multiply top half by 19 */
6985         "mov	x25, #19\n\t"
6986         "mul	x26, x25, x8\n\t"
6987         "umulh	x8, x25, x8\n\t"
6988         "adds	x4, x4, x26\n\t"
6989         "mul	x26, x25, x9\n\t"
6990         "umulh	x9, x25, x9\n\t"
6991         "adcs	x5, x5, x26\n\t"
6992         "mul	x26, x25, x10\n\t"
6993         "umulh	x10, x25, x10\n\t"
6994         "adcs	x6, x6, x26\n\t"
6995         "mul	x26, x25, x11\n\t"
6996         "umulh	x27, x25, x11\n\t"
6997         "adcs	x7, x7, x26\n\t"
6998         "adc	x27, x27, xzr\n\t"
6999         /*  Add remaining product results in */
7000         "adds	x5, x5, x8\n\t"
7001         "adcs	x6, x6, x9\n\t"
7002         "adcs	x7, x7, x10\n\t"
7003         "adc	x27, x27, xzr\n\t"
7004         /*  Overflow */
7005         "extr	x27, x27, x7, #63\n\t"
7006         "mul	x27, x27, x25\n\t"
7007         "and	x7, x7, #0x7fffffffffffffff\n\t"
7008         "adds	x4, x4, x27\n\t"
7009         "adcs	x5, x5, xzr\n\t"
7010         "adcs	x6, x6, xzr\n\t"
7011         "adc	x7, x7, xzr\n\t"
7012         /* Reduce if top bit set */
7013         "and	x27, x25, x7, asr 63\n\t"
7014         "and	x7, x7, #0x7fffffffffffffff\n\t"
7015         "adds	x4, x4, x27\n\t"
7016         "adcs	x5, x5, xzr\n\t"
7017         "adcs	x6, x6, xzr\n\t"
7018         "adc	x7, x7, xzr\n\t"
7019         /* Store */
7020         "ldr	x0, [x29, #48]\n\t"
7021         /* Double */
7022         "adds	x4, x4, x4\n\t"
7023         "adcs	x5, x5, x5\n\t"
7024         "adcs	x6, x6, x6\n\t"
7025         "adc	x7, x7, x7\n\t"
7026         "mov	x25, #-19\n\t"
7027         "asr	x28, x7, #63\n\t"
7028         /*   Mask the modulus */
7029         "and	x25, x28, x25\n\t"
7030         "and	x26, x28, #0x7fffffffffffffff\n\t"
7031         /*   Sub modulus (if overflow) */
7032         "subs	x4, x4, x25\n\t"
7033         "sbcs	x5, x5, x28\n\t"
7034         "sbcs	x6, x6, x28\n\t"
7035         "sbc	x7, x7, x26\n\t"
7036         "ldr	x0, [x29, #40]\n\t"
7037         "ldr	x1, [sp, #104]\n\t"
7038         "ldr	x2, [x29, #72]\n\t"
7039         /* Multiply */
7040         "ldp	x16, x17, [x1]\n\t"
7041         "ldp	x19, x20, [x1, #16]\n\t"
7042         "ldp	x21, x22, [x2]\n\t"
7043         "ldp	x23, x24, [x2, #16]\n\t"
7044         /*  A[0] * B[0] */
7045         "mul	x8, x16, x21\n\t"
7046         "umulh	x9, x16, x21\n\t"
7047         /*  A[0] * B[1] */
7048         "mul	x25, x16, x22\n\t"
7049         "umulh	x10, x16, x22\n\t"
7050         "adds	x9, x9, x25\n\t"
7051         "adc	x10, x10, xzr\n\t"
7052         /*  A[1] * B[0] */
7053         "mul	x25, x17, x21\n\t"
7054         "umulh	x26, x17, x21\n\t"
7055         "adds	x9, x9, x25\n\t"
7056         "adcs	x10, x10, x26\n\t"
7057         "adc	x11, xzr, xzr\n\t"
7058         /*  A[0] * B[2] */
7059         "mul	x25, x16, x23\n\t"
7060         "umulh	x26, x16, x23\n\t"
7061         "adds	x10, x10, x25\n\t"
7062         "adc	x11, x11, x26\n\t"
7063         /*  A[1] * B[1] */
7064         "mul	x25, x17, x22\n\t"
7065         "umulh	x26, x17, x22\n\t"
7066         "adds	x10, x10, x25\n\t"
7067         "adcs	x11, x11, x26\n\t"
7068         "adc	x12, xzr, xzr\n\t"
7069         /*  A[2] * B[0] */
7070         "mul	x25, x19, x21\n\t"
7071         "umulh	x26, x19, x21\n\t"
7072         "adds	x10, x10, x25\n\t"
7073         "adcs	x11, x11, x26\n\t"
7074         "adc	x12, x12, xzr\n\t"
7075         /*  A[0] * B[3] */
7076         "mul	x25, x16, x24\n\t"
7077         "umulh	x26, x16, x24\n\t"
7078         "adds	x11, x11, x25\n\t"
7079         "adcs	x12, x12, x26\n\t"
7080         "adc	x13, xzr, xzr\n\t"
7081         /*  A[1] * B[2] */
7082         "mul	x25, x17, x23\n\t"
7083         "umulh	x26, x17, x23\n\t"
7084         "adds	x11, x11, x25\n\t"
7085         "adcs	x12, x12, x26\n\t"
7086         "adc	x13, x13, xzr\n\t"
7087         /*  A[2] * B[1] */
7088         "mul	x25, x19, x22\n\t"
7089         "umulh	x26, x19, x22\n\t"
7090         "adds	x11, x11, x25\n\t"
7091         "adcs	x12, x12, x26\n\t"
7092         "adc	x13, x13, xzr\n\t"
7093         /*  A[3] * B[0] */
7094         "mul	x25, x20, x21\n\t"
7095         "umulh	x26, x20, x21\n\t"
7096         "adds	x11, x11, x25\n\t"
7097         "adcs	x12, x12, x26\n\t"
7098         "adc	x13, x13, xzr\n\t"
7099         /*  A[1] * B[3] */
7100         "mul	x25, x17, x24\n\t"
7101         "umulh	x26, x17, x24\n\t"
7102         "adds	x12, x12, x25\n\t"
7103         "adcs	x13, x13, x26\n\t"
7104         "adc	x14, xzr, xzr\n\t"
7105         /*  A[2] * B[2] */
7106         "mul	x25, x19, x23\n\t"
7107         "umulh	x26, x19, x23\n\t"
7108         "adds	x12, x12, x25\n\t"
7109         "adcs	x13, x13, x26\n\t"
7110         "adc	x14, x14, xzr\n\t"
7111         /*  A[3] * B[1] */
7112         "mul	x25, x20, x22\n\t"
7113         "umulh	x26, x20, x22\n\t"
7114         "adds	x12, x12, x25\n\t"
7115         "adcs	x13, x13, x26\n\t"
7116         "adc	x14, x14, xzr\n\t"
7117         /*  A[2] * B[3] */
7118         "mul	x25, x19, x24\n\t"
7119         "umulh	x26, x19, x24\n\t"
7120         "adds	x13, x13, x25\n\t"
7121         "adcs	x14, x14, x26\n\t"
7122         "adc	x15, xzr, xzr\n\t"
7123         /*  A[3] * B[2] */
7124         "mul	x25, x20, x23\n\t"
7125         "umulh	x26, x20, x23\n\t"
7126         "adds	x13, x13, x25\n\t"
7127         "adcs	x14, x14, x26\n\t"
7128         "adc	x15, x15, xzr\n\t"
7129         /*  A[3] * B[3] */
7130         "mul	x25, x20, x24\n\t"
7131         "umulh	x26, x20, x24\n\t"
7132         "adds	x14, x14, x25\n\t"
7133         "adc	x15, x15, x26\n\t"
7134         /* Reduce */
7135         /*  Move top half into t4-t7 and remove top bit from t3 */
7136         "extr	x15, x15, x14, #63\n\t"
7137         "extr	x14, x14, x13, #63\n\t"
7138         "extr	x13, x13, x12, #63\n\t"
7139         "extr	x12, x12, x11, #63\n\t"
7140         "and	x11, x11, #0x7fffffffffffffff\n\t"
7141         /*  Multiply top half by 19 */
7142         "mov	x25, #19\n\t"
7143         "mul	x26, x25, x12\n\t"
7144         "umulh	x12, x25, x12\n\t"
7145         "adds	x8, x8, x26\n\t"
7146         "mul	x26, x25, x13\n\t"
7147         "umulh	x13, x25, x13\n\t"
7148         "adcs	x9, x9, x26\n\t"
7149         "mul	x26, x25, x14\n\t"
7150         "umulh	x14, x25, x14\n\t"
7151         "adcs	x10, x10, x26\n\t"
7152         "mul	x26, x25, x15\n\t"
7153         "umulh	x27, x25, x15\n\t"
7154         "adcs	x11, x11, x26\n\t"
7155         "adc	x27, x27, xzr\n\t"
7156         /*  Add remaining product results in */
7157         "adds	x9, x9, x12\n\t"
7158         "adcs	x10, x10, x13\n\t"
7159         "adcs	x11, x11, x14\n\t"
7160         "adc	x27, x27, xzr\n\t"
7161         /*  Overflow */
7162         "extr	x27, x27, x11, #63\n\t"
7163         "mul	x27, x27, x25\n\t"
7164         "and	x11, x11, #0x7fffffffffffffff\n\t"
7165         "adds	x8, x8, x27\n\t"
7166         "adcs	x9, x9, xzr\n\t"
7167         "adcs	x10, x10, xzr\n\t"
7168         "adc	x11, x11, xzr\n\t"
7169         /* Reduce if top bit set */
7170         "and	x27, x25, x11, asr 63\n\t"
7171         "and	x11, x11, #0x7fffffffffffffff\n\t"
7172         "adds	x8, x8, x27\n\t"
7173         "adcs	x9, x9, xzr\n\t"
7174         "adcs	x10, x10, xzr\n\t"
7175         "adc	x11, x11, xzr\n\t"
7176         /* Store */
7177         "ldr	x0, [x29, #40]\n\t"
7178         "ldr	x1, [x29, #32]\n\t"
7179         /* Add */
7180         "adds	x12, x4, x8\n\t"
7181         "adcs	x13, x5, x9\n\t"
7182         "adcs	x14, x6, x10\n\t"
7183         "adc	x15, x7, x11\n\t"
7184         "mov	x25, #-19\n\t"
7185         "asr	x28, x15, #63\n\t"
7186         /*   Mask the modulus */
7187         "and	x25, x28, x25\n\t"
7188         "and	x26, x28, #0x7fffffffffffffff\n\t"
7189         /*   Sub modulus (if overflow) */
7190         "subs	x12, x12, x25\n\t"
7191         "sbcs	x13, x13, x28\n\t"
7192         "sbcs	x14, x14, x28\n\t"
7193         "sbc	x15, x15, x26\n\t"
7194         /* Sub */
7195         "subs	x16, x4, x8\n\t"
7196         "sbcs	x17, x5, x9\n\t"
7197         "sbcs	x19, x6, x10\n\t"
7198         "sbcs	x20, x7, x11\n\t"
7199         "mov	x25, #-19\n\t"
7200         "csetm	x28, cc\n\t"
7201         /*   Mask the modulus */
7202         "and	x25, x28, x25\n\t"
7203         "and	x26, x28, #0x7fffffffffffffff\n\t"
7204         /*   Add modulus (if underflow) */
7205         "adds	x16, x16, x25\n\t"
7206         "adcs	x17, x17, x28\n\t"
7207         "adcs	x19, x19, x28\n\t"
7208         "adc	x20, x20, x26\n\t"
7209         "stp	x12, x13, [x0]\n\t"
7210         "stp	x14, x15, [x0, #16]\n\t"
7211         "stp	x16, x17, [x1]\n\t"
7212         "stp	x19, x20, [x1, #16]\n\t"
7213         "ldp	x29, x30, [sp], #0x80\n\t"
7214         : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt), [qz] "+r" (qz), [qt2d] "+r" (qt2d), [qyplusx] "+r" (qyplusx), [qyminusx] "+r" (qyminusx)
7215         :
7216         : "memory", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
7217     );
7218 }
7219 
7220 #endif /* HAVE_CURVE25519 */
7221 #endif /* __aarch64__ */
7222 #endif /* WOLFSSL_ARMASM */
7223