1 /* armv8-curve25519
2 *
3 * Copyright (C) 2006-2021 wolfSSL Inc.
4 *
5 * This file is part of wolfSSL.
6 *
7 * wolfSSL is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * wolfSSL is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
20 */
21
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif /* HAVE_CONFIG_H */
25 #include <wolfssl/wolfcrypt/settings.h>
26
27 /* Generated using (from wolfssl):
28 * cd ../scripts
29 * ruby ./x25519/x25519.rb arm64 ../wolfssl/wolfcrypt/src/port/arm/armv8-curve25519.c
30 */
31 #ifdef WOLFSSL_ARMASM
32 #ifdef __aarch64__
33 #ifdef HAVE_CURVE25519
34 #include <wolfssl/wolfcrypt/fe_operations.h>
35
/* Initialise the field element implementation.
 *
 * Nothing needs setting up here: the asm statement contains no instructions
 * and, through its "memory" clobber, only acts as a compiler-level barrier.
 *
 * Defined with a (void) prototype rather than an empty parameter list so the
 * definition is a proper prototype.
 */
void fe_init(void)
{
    __asm__ __volatile__ (
        "\n\t"
        :
        :
        : "memory"
    );
}
45
/* Copy 32 bytes into a field element, clearing the top bit.
 *
 * out  Receives four 64-bit words (32 bytes).
 * in   32 bytes of input, read as four 64-bit words.
 *
 * Bit 63 of the fourth word is masked off before storing. No other
 * reduction is performed.
 */
void fe_frombytes(fe out, const unsigned char* in)
{
    __asm__ __volatile__ (
        "ldp x2, x3, [%x[in]]\n\t"
        "ldp x4, x5, [%x[in], #16]\n\t"
        /* Drop bit 63 of the last word */
        "and x5, x5, #0x7fffffffffffffff\n\t"
        "stp x2, x3, [%x[out]]\n\t"
        "stp x4, x5, [%x[out], #16]\n\t"
        : [out] "+r" (out), [in] "+r" (in)
        :
        /* Only x2-x5 are used as scratch */
        : "memory", "x2", "x3", "x4", "x5"
    );
}
59
fe_tobytes(unsigned char * out,const fe n)60 void fe_tobytes(unsigned char* out, const fe n)
61 {
62 __asm__ __volatile__ (
63 "mov x7, #19\n\t"
64 "ldp x2, x3, [%x[n]]\n\t"
65 "ldp x4, x5, [%x[n], #16]\n\t"
66 "adds x6, x2, x7\n\t"
67 "adcs x6, x3, xzr\n\t"
68 "adcs x6, x4, xzr\n\t"
69 "adc x6, x5, xzr\n\t"
70 "and x6, x7, x6, asr 63\n\t"
71 "adds x2, x2, x6\n\t"
72 "adcs x3, x3, xzr\n\t"
73 "adcs x4, x4, xzr\n\t"
74 "adc x5, x5, xzr\n\t"
75 "and x5, x5, #0x7fffffffffffffff\n\t"
76 "stp x2, x3, [%x[out]]\n\t"
77 "stp x4, x5, [%x[out], #16]\n\t"
78 : [out] "+r" (out), [n] "+r" (n)
79 :
80 : "memory", "x2", "x3", "x4", "x5", "x6", "x7"
81 );
82 }
83
/* Set a field element to one.
 *
 * n  Field element to overwrite: the first 64-bit word becomes 1 and the
 *    remaining three words become 0.
 */
void fe_1(fe n)
{
    __asm__ __volatile__ (
        /* Upper two words are zero */
        "stp xzr, xzr, [%x[n], #16]\n\t"
        /* Lower pair is {1, 0} */
        "mov x1, #1\n\t"
        "stp x1, xzr, [%x[n]]\n\t"
        : [n] "+r" (n)
        :
        : "x1", "memory"
    );
}
96
/* Set a field element to zero.
 *
 * n  Field element to overwrite: all four 64-bit words are cleared.
 */
void fe_0(fe n)
{
    __asm__ __volatile__ (
        /* Clear words 2-3, then words 0-1, straight from the zero register */
        "stp xzr, xzr, [%x[n], #16]\n\t"
        "stp xzr, xzr, [%x[n]]\n\t"
        : [n] "+r" (n)
        :
        : "memory"
    );
}
108
/* Copy one field element to another.
 *
 * r  Destination: receives four 64-bit words.
 * a  Source: four 64-bit words, not modified.
 *
 * All four words are read before any word is written.
 */
void fe_copy(fe r, const fe a)
{
    __asm__ __volatile__ (
        /* Read the whole element into x2-x5 */
        "ldp x4, x5, [%x[a], #16]\n\t"
        "ldp x2, x3, [%x[a]]\n\t"
        /* Write it back out to the destination */
        "stp x4, x5, [%x[r], #16]\n\t"
        "stp x2, x3, [%x[r]]\n\t"
        : [r] "+r" (r), [a] "+r" (a)
        :
        : "x2", "x3", "x4", "x5", "memory"
    );
}
122
/* Subtract two field elements: r = a - b, adding the modulus back when the
 * 256-bit subtraction borrows.
 *
 * r  Result: four 64-bit words.
 * a  Minuend.
 * b  Subtrahend.
 *
 * The correction is applied with a mask rather than a branch: x11 is all
 * ones on borrow and zero otherwise, and is ANDed into the words of
 * 2^255 - 19 (-19, -1, -1, 0x7fffffffffffffff) before they are added.
 */
void fe_sub(fe r, const fe a, const fe b)
{
    __asm__ __volatile__ (
        /* Sub */
        "ldp x3, x4, [%x[a]]\n\t"
        "ldp x5, x6, [%x[a], #16]\n\t"
        "ldp x7, x8, [%x[b]]\n\t"
        "ldp x9, x10, [%x[b], #16]\n\t"
        "subs x3, x3, x7\n\t"
        "sbcs x4, x4, x8\n\t"
        "sbcs x5, x5, x9\n\t"
        "sbcs x6, x6, x10\n\t"
        "mov x12, #-19\n\t"
        /* x11 = -1 when the subtraction borrowed (carry clear), else 0 */
        "csetm x11, cc\n\t"
        /* Mask the modulus */
        "and x12, x11, x12\n\t"
        "and x13, x11, #0x7fffffffffffffff\n\t"
        /* Add modulus (if underflow) */
        "adds x3, x3, x12\n\t"
        "adcs x4, x4, x11\n\t"
        "adcs x5, x5, x11\n\t"
        "adc x6, x6, x13\n\t"
        "stp x3, x4, [%x[r]]\n\t"
        "stp x5, x6, [%x[r], #16]\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        /* "cc": subs/sbcs/adds/adcs overwrite the condition flags */
        : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "cc"
    );
}
152
/* Add two field elements: r = a + b, subtracting the modulus when bit 255
 * of the sum is set.
 *
 * r  Result: four 64-bit words.
 * a  First addend.
 * b  Second addend.
 *
 * The correction is applied with a mask rather than a branch: x11 is the
 * top bit of the sum replicated across the word, and is ANDed into the words
 * of 2^255 - 19 (-19, -1, -1, 0x7fffffffffffffff) before they are
 * subtracted.
 */
void fe_add(fe r, const fe a, const fe b)
{
    __asm__ __volatile__ (
        /* Add */
        "ldp x3, x4, [%x[a]]\n\t"
        "ldp x5, x6, [%x[a], #16]\n\t"
        "ldp x7, x8, [%x[b]]\n\t"
        "ldp x9, x10, [%x[b], #16]\n\t"
        "adds x3, x3, x7\n\t"
        "adcs x4, x4, x8\n\t"
        "adcs x5, x5, x9\n\t"
        "adc x6, x6, x10\n\t"
        "mov x12, #-19\n\t"
        /* x11 = -1 when the top bit of the sum is set, else 0 */
        "asr x11, x6, #63\n\t"
        /* Mask the modulus */
        "and x12, x11, x12\n\t"
        "and x13, x11, #0x7fffffffffffffff\n\t"
        /* Sub modulus (if overflow) */
        "subs x3, x3, x12\n\t"
        "sbcs x4, x4, x11\n\t"
        "sbcs x5, x5, x11\n\t"
        "sbc x6, x6, x13\n\t"
        "stp x3, x4, [%x[r]]\n\t"
        "stp x5, x6, [%x[r], #16]\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        /* "cc": adds/adcs/subs/sbcs overwrite the condition flags */
        : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "cc"
    );
}
182
/* Negate a field element: r = (2^255 - 19) - a.
 *
 * r  Result: four 64-bit words.
 * a  Element to negate; not modified.
 *
 * The modulus is built in x6-x9 as (-19, -1, -1, 0x7fffffffffffffff) and a
 * is subtracted from it with a borrow chain. No further reduction is applied
 * to the difference.
 */
void fe_neg(fe r, const fe a)
{
    __asm__ __volatile__ (
        "ldp x2, x3, [%x[a]]\n\t"
        "ldp x4, x5, [%x[a], #16]\n\t"
        /* x6..x9 = 2^255 - 19 */
        "mov x6, #-19\n\t"
        "mov x7, #-1\n\t"
        "mov x8, #-1\n\t"
        "mov x9, #0x7fffffffffffffff\n\t"
        "subs x6, x6, x2\n\t"
        "sbcs x7, x7, x3\n\t"
        "sbcs x8, x8, x4\n\t"
        "sbc x9, x9, x5\n\t"
        "stp x6, x7, [%x[r]]\n\t"
        "stp x8, x9, [%x[r], #16]\n\t"
        : [r] "+r" (r), [a] "+r" (a)
        :
        /* "cc": subs/sbcs overwrite the condition flags */
        : "memory", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc"
    );
}
203
fe_isnonzero(const fe a)204 int fe_isnonzero(const fe a)
205 {
206 __asm__ __volatile__ (
207 "mov x6, #19\n\t"
208 "ldp x1, x2, [%x[a]]\n\t"
209 "ldp x3, x4, [%x[a], #16]\n\t"
210 "adds x5, x1, x6\n\t"
211 "adcs x5, x2, xzr\n\t"
212 "adcs x5, x3, xzr\n\t"
213 "adc x5, x4, xzr\n\t"
214 "and x5, x6, x5, asr 63\n\t"
215 "adds x1, x1, x5\n\t"
216 "adcs x2, x2, xzr\n\t"
217 "adcs x3, x3, xzr\n\t"
218 "adc x4, x4, xzr\n\t"
219 "and x4, x4, #0x7fffffffffffffff\n\t"
220 "orr %x[a], x1, x2\n\t"
221 "orr x3, x3, x4\n\t"
222 "orr %x[a], %x[a], x3\n\t"
223 : [a] "+r" (a)
224 :
225 : "memory", "x1", "x2", "x3", "x4", "x5", "x6"
226 );
227 return (uint32_t)(size_t)a;
228 }
229
fe_isnegative(const fe a)230 int fe_isnegative(const fe a)
231 {
232 __asm__ __volatile__ (
233 "mov x6, #19\n\t"
234 "ldp x1, x2, [%x[a]]\n\t"
235 "ldp x3, x4, [%x[a], #16]\n\t"
236 "adds x5, x1, x6\n\t"
237 "adcs x5, x2, xzr\n\t"
238 "adcs x5, x3, xzr\n\t"
239 "adc x5, x4, xzr\n\t"
240 "and %x[a], x1, #1\n\t"
241 "eor %x[a], %x[a], x5, lsr 63\n\t"
242 : [a] "+r" (a)
243 :
244 : "memory", "x1", "x2", "x3", "x4", "x5", "x6"
245 );
246 return (uint32_t)(size_t)a;
247 }
248
/* Select one 96-byte table entry by |b| without branching, negating it when
 * b is negative.
 *
 * r     Receives the 96-byte result (three groups of four 64-bit words).
 * base  Table of eight consecutive 96-byte entries; every entry is read on
 *       every call, whatever the value of b.
 * b     Signed selector. |b| in 1..8 picks entry |b| - 1. For any other
 *       value no entry matches and the initial value is kept.
 *
 * The result starts as the three groups {1,0,0,0}, {1,0,0,0}, {0,0,0,0}.
 * Each entry is loaded and merged with csel under "|b| == index". When
 * b < 0 the first two groups are swapped and the third group is replaced by
 * (2^255 - 19) minus itself.
 *
 * NOTE(review): the three groups look like the fields of a precomputed
 * point, but the entry type is not visible here - confirm against the
 * caller.
 *
 * The asm pushes a 32-byte frame holding x29/x30 and the saved r pointer,
 * because the register holding r is reused for |b|. base is advanced by
 * 0x180 (four entries) half way through.
 */
void fe_cmov_table(fe* r, fe* base, signed char b)
{
    __asm__ __volatile__ (
        "stp x29, x30, [sp, #-32]!\n\t"
        "add x29, sp, #0\n\t"
        /* Save r: its register is reused to hold |b| */
        "str %x[r], [x29, #16]\n\t"
        "sxtb %x[b], %w[b]\n\t"
        /* x3 = -1 if b < 0 else 0; |b| = (b ^ x3) - x3 */
        "sbfx x3, %x[b], #7, #1\n\t"
        "eor %x[r], %x[b], x3\n\t"
        "sub %x[r], %x[r], x3\n\t"
        /* Initial value: {1,0,0,0}, {1,0,0,0}, {0,0,0,0} */
        "mov x4, #1\n\t"
        "mov x5, xzr\n\t"
        "mov x6, xzr\n\t"
        "mov x7, xzr\n\t"
        "mov x8, #1\n\t"
        "mov x9, xzr\n\t"
        "mov x10, xzr\n\t"
        "mov x11, xzr\n\t"
        "mov x12, xzr\n\t"
        "mov x13, xzr\n\t"
        "mov x14, xzr\n\t"
        "mov x15, xzr\n\t"
        /* Entry for |b| == 1 */
        "cmp %x[r], #1\n\t"
        "ldp x16, x17, [%x[base]]\n\t"
        "ldp x19, x20, [%x[base], #16]\n\t"
        "ldp x21, x22, [%x[base], #32]\n\t"
        "ldp x23, x24, [%x[base], #48]\n\t"
        "ldp x25, x26, [%x[base], #64]\n\t"
        "ldp x27, x28, [%x[base], #80]\n\t"
        "csel x4, x16, x4, eq\n\t"
        "csel x5, x17, x5, eq\n\t"
        "csel x6, x19, x6, eq\n\t"
        "csel x7, x20, x7, eq\n\t"
        "csel x8, x21, x8, eq\n\t"
        "csel x9, x22, x9, eq\n\t"
        "csel x10, x23, x10, eq\n\t"
        "csel x11, x24, x11, eq\n\t"
        "csel x12, x25, x12, eq\n\t"
        "csel x13, x26, x13, eq\n\t"
        "csel x14, x27, x14, eq\n\t"
        "csel x15, x28, x15, eq\n\t"
        /* Entry for |b| == 2 */
        "cmp %x[r], #2\n\t"
        "ldp x16, x17, [%x[base], #96]\n\t"
        "ldp x19, x20, [%x[base], #112]\n\t"
        "ldp x21, x22, [%x[base], #128]\n\t"
        "ldp x23, x24, [%x[base], #144]\n\t"
        "ldp x25, x26, [%x[base], #160]\n\t"
        "ldp x27, x28, [%x[base], #176]\n\t"
        "csel x4, x16, x4, eq\n\t"
        "csel x5, x17, x5, eq\n\t"
        "csel x6, x19, x6, eq\n\t"
        "csel x7, x20, x7, eq\n\t"
        "csel x8, x21, x8, eq\n\t"
        "csel x9, x22, x9, eq\n\t"
        "csel x10, x23, x10, eq\n\t"
        "csel x11, x24, x11, eq\n\t"
        "csel x12, x25, x12, eq\n\t"
        "csel x13, x26, x13, eq\n\t"
        "csel x14, x27, x14, eq\n\t"
        "csel x15, x28, x15, eq\n\t"
        /* Entry for |b| == 3 */
        "cmp %x[r], #3\n\t"
        "ldp x16, x17, [%x[base], #192]\n\t"
        "ldp x19, x20, [%x[base], #208]\n\t"
        "ldp x21, x22, [%x[base], #224]\n\t"
        "ldp x23, x24, [%x[base], #240]\n\t"
        "ldp x25, x26, [%x[base], #256]\n\t"
        "ldp x27, x28, [%x[base], #272]\n\t"
        "csel x4, x16, x4, eq\n\t"
        "csel x5, x17, x5, eq\n\t"
        "csel x6, x19, x6, eq\n\t"
        "csel x7, x20, x7, eq\n\t"
        "csel x8, x21, x8, eq\n\t"
        "csel x9, x22, x9, eq\n\t"
        "csel x10, x23, x10, eq\n\t"
        "csel x11, x24, x11, eq\n\t"
        "csel x12, x25, x12, eq\n\t"
        "csel x13, x26, x13, eq\n\t"
        "csel x14, x27, x14, eq\n\t"
        "csel x15, x28, x15, eq\n\t"
        /* Entry for |b| == 4 */
        "cmp %x[r], #4\n\t"
        "ldp x16, x17, [%x[base], #288]\n\t"
        "ldp x19, x20, [%x[base], #304]\n\t"
        "ldp x21, x22, [%x[base], #320]\n\t"
        "ldp x23, x24, [%x[base], #336]\n\t"
        "ldp x25, x26, [%x[base], #352]\n\t"
        "ldp x27, x28, [%x[base], #368]\n\t"
        "csel x4, x16, x4, eq\n\t"
        "csel x5, x17, x5, eq\n\t"
        "csel x6, x19, x6, eq\n\t"
        "csel x7, x20, x7, eq\n\t"
        "csel x8, x21, x8, eq\n\t"
        "csel x9, x22, x9, eq\n\t"
        "csel x10, x23, x10, eq\n\t"
        "csel x11, x24, x11, eq\n\t"
        "csel x12, x25, x12, eq\n\t"
        "csel x13, x26, x13, eq\n\t"
        "csel x14, x27, x14, eq\n\t"
        "csel x15, x28, x15, eq\n\t"
        /* Step over the first four entries (4 * 96 = 0x180 bytes) */
        "add %x[base], %x[base], #0x180\n\t"
        /* Entry for |b| == 5 */
        "cmp %x[r], #5\n\t"
        "ldp x16, x17, [%x[base]]\n\t"
        "ldp x19, x20, [%x[base], #16]\n\t"
        "ldp x21, x22, [%x[base], #32]\n\t"
        "ldp x23, x24, [%x[base], #48]\n\t"
        "ldp x25, x26, [%x[base], #64]\n\t"
        "ldp x27, x28, [%x[base], #80]\n\t"
        "csel x4, x16, x4, eq\n\t"
        "csel x5, x17, x5, eq\n\t"
        "csel x6, x19, x6, eq\n\t"
        "csel x7, x20, x7, eq\n\t"
        "csel x8, x21, x8, eq\n\t"
        "csel x9, x22, x9, eq\n\t"
        "csel x10, x23, x10, eq\n\t"
        "csel x11, x24, x11, eq\n\t"
        "csel x12, x25, x12, eq\n\t"
        "csel x13, x26, x13, eq\n\t"
        "csel x14, x27, x14, eq\n\t"
        "csel x15, x28, x15, eq\n\t"
        /* Entry for |b| == 6 */
        "cmp %x[r], #6\n\t"
        "ldp x16, x17, [%x[base], #96]\n\t"
        "ldp x19, x20, [%x[base], #112]\n\t"
        "ldp x21, x22, [%x[base], #128]\n\t"
        "ldp x23, x24, [%x[base], #144]\n\t"
        "ldp x25, x26, [%x[base], #160]\n\t"
        "ldp x27, x28, [%x[base], #176]\n\t"
        "csel x4, x16, x4, eq\n\t"
        "csel x5, x17, x5, eq\n\t"
        "csel x6, x19, x6, eq\n\t"
        "csel x7, x20, x7, eq\n\t"
        "csel x8, x21, x8, eq\n\t"
        "csel x9, x22, x9, eq\n\t"
        "csel x10, x23, x10, eq\n\t"
        "csel x11, x24, x11, eq\n\t"
        "csel x12, x25, x12, eq\n\t"
        "csel x13, x26, x13, eq\n\t"
        "csel x14, x27, x14, eq\n\t"
        "csel x15, x28, x15, eq\n\t"
        /* Entry for |b| == 7 */
        "cmp %x[r], #7\n\t"
        "ldp x16, x17, [%x[base], #192]\n\t"
        "ldp x19, x20, [%x[base], #208]\n\t"
        "ldp x21, x22, [%x[base], #224]\n\t"
        "ldp x23, x24, [%x[base], #240]\n\t"
        "ldp x25, x26, [%x[base], #256]\n\t"
        "ldp x27, x28, [%x[base], #272]\n\t"
        "csel x4, x16, x4, eq\n\t"
        "csel x5, x17, x5, eq\n\t"
        "csel x6, x19, x6, eq\n\t"
        "csel x7, x20, x7, eq\n\t"
        "csel x8, x21, x8, eq\n\t"
        "csel x9, x22, x9, eq\n\t"
        "csel x10, x23, x10, eq\n\t"
        "csel x11, x24, x11, eq\n\t"
        "csel x12, x25, x12, eq\n\t"
        "csel x13, x26, x13, eq\n\t"
        "csel x14, x27, x14, eq\n\t"
        "csel x15, x28, x15, eq\n\t"
        /* Entry for |b| == 8 */
        "cmp %x[r], #8\n\t"
        "ldp x16, x17, [%x[base], #288]\n\t"
        "ldp x19, x20, [%x[base], #304]\n\t"
        "ldp x21, x22, [%x[base], #320]\n\t"
        "ldp x23, x24, [%x[base], #336]\n\t"
        "ldp x25, x26, [%x[base], #352]\n\t"
        "ldp x27, x28, [%x[base], #368]\n\t"
        "csel x4, x16, x4, eq\n\t"
        "csel x5, x17, x5, eq\n\t"
        "csel x6, x19, x6, eq\n\t"
        "csel x7, x20, x7, eq\n\t"
        "csel x8, x21, x8, eq\n\t"
        "csel x9, x22, x9, eq\n\t"
        "csel x10, x23, x10, eq\n\t"
        "csel x11, x24, x11, eq\n\t"
        "csel x12, x25, x12, eq\n\t"
        "csel x13, x26, x13, eq\n\t"
        "csel x14, x27, x14, eq\n\t"
        "csel x15, x28, x15, eq\n\t"
        /* x16,x17,x19,x20 = (2^255 - 19) - third group */
        "mov x16, #-19\n\t"
        "mov x17, #-1\n\t"
        "mov x19, #-1\n\t"
        "mov x20, #0x7fffffffffffffff\n\t"
        "subs x16, x16, x12\n\t"
        "sbcs x17, x17, x13\n\t"
        "sbcs x19, x19, x14\n\t"
        "sbc x20, x20, x15\n\t"
        /* b < 0: swap the first two groups, take the negated third group */
        "cmp %x[b], #0\n\t"
        "mov x3, x4\n\t"
        "csel x4, x8, x4, lt\n\t"
        "csel x8, x3, x8, lt\n\t"
        "mov x3, x5\n\t"
        "csel x5, x9, x5, lt\n\t"
        "csel x9, x3, x9, lt\n\t"
        "mov x3, x6\n\t"
        "csel x6, x10, x6, lt\n\t"
        "csel x10, x3, x10, lt\n\t"
        "mov x3, x7\n\t"
        "csel x7, x11, x7, lt\n\t"
        "csel x11, x3, x11, lt\n\t"
        "csel x12, x16, x12, lt\n\t"
        "csel x13, x17, x13, lt\n\t"
        "csel x14, x19, x14, lt\n\t"
        "csel x15, x20, x15, lt\n\t"
        /* Recover the destination pointer and store the 96-byte result */
        "ldr %x[r], [x29, #16]\n\t"
        "stp x4, x5, [%x[r]]\n\t"
        "stp x6, x7, [%x[r], #16]\n\t"
        "stp x8, x9, [%x[r], #32]\n\t"
        "stp x10, x11, [%x[r], #48]\n\t"
        "stp x12, x13, [%x[r], #64]\n\t"
        "stp x14, x15, [%x[r], #80]\n\t"
        "ldp x29, x30, [sp], #32\n\t"
        : [r] "+r" (r), [base] "+r" (base), [b] "+r" (b)
        :
        /* "cc": cmp/subs/sbcs overwrite the condition flags */
        : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "cc"
    );
}
462
/* Multiply two field elements: r = a * b, reduced using 2^255 = 19
 * (mod 2^255 - 19).
 *
 * r  Result: four 64-bit words, stored with bit 255 clear.
 * a  First factor (four 64-bit words).
 * b  Second factor (four 64-bit words).
 *
 * Both inputs are loaded in full before r is written. The 512-bit schoolbook
 * product is accumulated in x6..x13, then everything from bit 255 upwards is
 * multiplied by 19 and folded back into the low 255 bits, followed by two
 * further small folds for the carries that produces.
 *
 * NOTE(review): the stored value has bit 255 clear but may still be at or
 * above 2^255 - 19; callers needing the canonical value appear to rely on
 * fe_tobytes - confirm.
 */
void fe_mul(fe r, const fe a, const fe b)
{
    __asm__ __volatile__ (
        /* Multiply */
        "ldp x14, x15, [%x[a]]\n\t"
        "ldp x16, x17, [%x[a], #16]\n\t"
        "ldp x19, x20, [%x[b]]\n\t"
        "ldp x21, x22, [%x[b], #16]\n\t"
        /* A[0] * B[0] */
        "mul x6, x14, x19\n\t"
        "umulh x7, x14, x19\n\t"
        /* A[0] * B[1] */
        "mul x3, x14, x20\n\t"
        "umulh x8, x14, x20\n\t"
        "adds x7, x7, x3\n\t"
        "adc x8, x8, xzr\n\t"
        /* A[1] * B[0] */
        "mul x3, x15, x19\n\t"
        "umulh x4, x15, x19\n\t"
        "adds x7, x7, x3\n\t"
        "adcs x8, x8, x4\n\t"
        "adc x9, xzr, xzr\n\t"
        /* A[0] * B[2] */
        "mul x3, x14, x21\n\t"
        "umulh x4, x14, x21\n\t"
        "adds x8, x8, x3\n\t"
        "adc x9, x9, x4\n\t"
        /* A[1] * B[1] */
        "mul x3, x15, x20\n\t"
        "umulh x4, x15, x20\n\t"
        "adds x8, x8, x3\n\t"
        "adcs x9, x9, x4\n\t"
        "adc x10, xzr, xzr\n\t"
        /* A[2] * B[0] */
        "mul x3, x16, x19\n\t"
        "umulh x4, x16, x19\n\t"
        "adds x8, x8, x3\n\t"
        "adcs x9, x9, x4\n\t"
        "adc x10, x10, xzr\n\t"
        /* A[0] * B[3] */
        "mul x3, x14, x22\n\t"
        "umulh x4, x14, x22\n\t"
        "adds x9, x9, x3\n\t"
        "adcs x10, x10, x4\n\t"
        "adc x11, xzr, xzr\n\t"
        /* A[1] * B[2] */
        "mul x3, x15, x21\n\t"
        "umulh x4, x15, x21\n\t"
        "adds x9, x9, x3\n\t"
        "adcs x10, x10, x4\n\t"
        "adc x11, x11, xzr\n\t"
        /* A[2] * B[1] */
        "mul x3, x16, x20\n\t"
        "umulh x4, x16, x20\n\t"
        "adds x9, x9, x3\n\t"
        "adcs x10, x10, x4\n\t"
        "adc x11, x11, xzr\n\t"
        /* A[3] * B[0] */
        "mul x3, x17, x19\n\t"
        "umulh x4, x17, x19\n\t"
        "adds x9, x9, x3\n\t"
        "adcs x10, x10, x4\n\t"
        "adc x11, x11, xzr\n\t"
        /* A[1] * B[3] */
        "mul x3, x15, x22\n\t"
        "umulh x4, x15, x22\n\t"
        "adds x10, x10, x3\n\t"
        "adcs x11, x11, x4\n\t"
        "adc x12, xzr, xzr\n\t"
        /* A[2] * B[2] */
        "mul x3, x16, x21\n\t"
        "umulh x4, x16, x21\n\t"
        "adds x10, x10, x3\n\t"
        "adcs x11, x11, x4\n\t"
        "adc x12, x12, xzr\n\t"
        /* A[3] * B[1] */
        "mul x3, x17, x20\n\t"
        "umulh x4, x17, x20\n\t"
        "adds x10, x10, x3\n\t"
        "adcs x11, x11, x4\n\t"
        "adc x12, x12, xzr\n\t"
        /* A[2] * B[3] */
        "mul x3, x16, x22\n\t"
        "umulh x4, x16, x22\n\t"
        "adds x11, x11, x3\n\t"
        "adcs x12, x12, x4\n\t"
        "adc x13, xzr, xzr\n\t"
        /* A[3] * B[2] */
        "mul x3, x17, x21\n\t"
        "umulh x4, x17, x21\n\t"
        "adds x11, x11, x3\n\t"
        "adcs x12, x12, x4\n\t"
        "adc x13, x13, xzr\n\t"
        /* A[3] * B[3] */
        "mul x3, x17, x22\n\t"
        "umulh x4, x17, x22\n\t"
        "adds x12, x12, x3\n\t"
        "adc x13, x13, x4\n\t"
        /* Reduce */
        /* Move top half into t4-t7 and remove top bit from t3 */
        "extr x13, x13, x12, #63\n\t"
        "extr x12, x12, x11, #63\n\t"
        "extr x11, x11, x10, #63\n\t"
        "extr x10, x10, x9, #63\n\t"
        "and x9, x9, #0x7fffffffffffffff\n\t"
        /* Multiply top half by 19 */
        "mov x3, #19\n\t"
        "mul x4, x3, x10\n\t"
        "umulh x10, x3, x10\n\t"
        "adds x6, x6, x4\n\t"
        "mul x4, x3, x11\n\t"
        "umulh x11, x3, x11\n\t"
        "adcs x7, x7, x4\n\t"
        "mul x4, x3, x12\n\t"
        "umulh x12, x3, x12\n\t"
        "adcs x8, x8, x4\n\t"
        "mul x4, x3, x13\n\t"
        "umulh x5, x3, x13\n\t"
        "adcs x9, x9, x4\n\t"
        "adc x5, x5, xzr\n\t"
        /* Add remaining product results in */
        "adds x7, x7, x10\n\t"
        "adcs x8, x8, x11\n\t"
        "adcs x9, x9, x12\n\t"
        "adc x5, x5, xzr\n\t"
        /* Overflow */
        "extr x5, x5, x9, #63\n\t"
        "mul x5, x5, x3\n\t"
        "and x9, x9, #0x7fffffffffffffff\n\t"
        "adds x6, x6, x5\n\t"
        "adcs x7, x7, xzr\n\t"
        "adcs x8, x8, xzr\n\t"
        "adc x9, x9, xzr\n\t"
        /* Reduce if top bit set */
        "and x5, x3, x9, asr 63\n\t"
        "and x9, x9, #0x7fffffffffffffff\n\t"
        "adds x6, x6, x5\n\t"
        "adcs x7, x7, xzr\n\t"
        "adcs x8, x8, xzr\n\t"
        "adc x9, x9, xzr\n\t"
        /* Store */
        "stp x6, x7, [%x[r]]\n\t"
        "stp x8, x9, [%x[r], #16]\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        /* "cc": adds/adcs overwrite the condition flags */
        : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "cc"
    );
}
611
/* Square a field element: r = a * a, reduced using 2^255 = 19
 * (mod 2^255 - 19).
 *
 * r  Result: four 64-bit words, stored with bit 255 clear.
 * a  Element to square (four 64-bit words).
 *
 * The input is loaded in full before r is written. The six cross products
 * A[i] * A[j] with i < j are summed once and doubled, then the four squares
 * A[i] * A[i] are added. The 512-bit result is reduced the same way as in
 * the multiply: bits from 255 upwards are multiplied by 19 and folded back
 * in, followed by two small folds for the resulting carries.
 *
 * NOTE(review): the stored value has bit 255 clear but may still be at or
 * above 2^255 - 19 - confirm callers do not need the canonical value.
 */
void fe_sq(fe r, const fe a)
{
    __asm__ __volatile__ (
        /* Square */
        "ldp x13, x14, [%x[a]]\n\t"
        "ldp x15, x16, [%x[a], #16]\n\t"
        /* A[0] * A[1] */
        "mul x6, x13, x14\n\t"
        "umulh x7, x13, x14\n\t"
        /* A[0] * A[2] */
        "mul x2, x13, x15\n\t"
        "umulh x8, x13, x15\n\t"
        "adds x7, x7, x2\n\t"
        "adc x8, x8, xzr\n\t"
        /* A[0] * A[3] */
        "mul x2, x13, x16\n\t"
        "umulh x9, x13, x16\n\t"
        "adds x8, x8, x2\n\t"
        "adc x9, x9, xzr\n\t"
        /* A[1] * A[2] */
        "mul x2, x14, x15\n\t"
        "umulh x3, x14, x15\n\t"
        "adds x8, x8, x2\n\t"
        "adcs x9, x9, x3\n\t"
        "adc x10, xzr, xzr\n\t"
        /* A[1] * A[3] */
        "mul x2, x14, x16\n\t"
        "umulh x3, x14, x16\n\t"
        "adds x9, x9, x2\n\t"
        "adc x10, x10, x3\n\t"
        /* A[2] * A[3] */
        "mul x2, x15, x16\n\t"
        "umulh x11, x15, x16\n\t"
        "adds x10, x10, x2\n\t"
        "adc x11, x11, xzr\n\t"
        /* Double */
        "adds x6, x6, x6\n\t"
        "adcs x7, x7, x7\n\t"
        "adcs x8, x8, x8\n\t"
        "adcs x9, x9, x9\n\t"
        "adcs x10, x10, x10\n\t"
        "adcs x11, x11, x11\n\t"
        "adc x12, xzr, xzr\n\t"
        /* A[0] * A[0] */
        "mul x5, x13, x13\n\t"
        "umulh x4, x13, x13\n\t"
        /* A[1] * A[1] */
        "mul x2, x14, x14\n\t"
        "umulh x3, x14, x14\n\t"
        "adds x6, x6, x4\n\t"
        "adcs x7, x7, x2\n\t"
        "adc x4, x3, xzr\n\t"
        /* A[2] * A[2] */
        "mul x2, x15, x15\n\t"
        "umulh x3, x15, x15\n\t"
        "adds x8, x8, x4\n\t"
        "adcs x9, x9, x2\n\t"
        "adc x4, x3, xzr\n\t"
        /* A[3] * A[3] */
        "mul x2, x16, x16\n\t"
        "umulh x3, x16, x16\n\t"
        "adds x10, x10, x4\n\t"
        "adcs x11, x11, x2\n\t"
        "adc x12, x12, x3\n\t"
        /* Reduce */
        /* Move top half into t4-t7 and remove top bit from t3 */
        "extr x12, x12, x11, #63\n\t"
        "extr x11, x11, x10, #63\n\t"
        "extr x10, x10, x9, #63\n\t"
        "extr x9, x9, x8, #63\n\t"
        "and x8, x8, #0x7fffffffffffffff\n\t"
        /* Multiply top half by 19 */
        "mov x2, #19\n\t"
        "mul x3, x2, x9\n\t"
        "umulh x9, x2, x9\n\t"
        "adds x5, x5, x3\n\t"
        "mul x3, x2, x10\n\t"
        "umulh x10, x2, x10\n\t"
        "adcs x6, x6, x3\n\t"
        "mul x3, x2, x11\n\t"
        "umulh x11, x2, x11\n\t"
        "adcs x7, x7, x3\n\t"
        "mul x3, x2, x12\n\t"
        "umulh x4, x2, x12\n\t"
        "adcs x8, x8, x3\n\t"
        "adc x4, x4, xzr\n\t"
        /* Add remaining product results in */
        "adds x6, x6, x9\n\t"
        "adcs x7, x7, x10\n\t"
        "adcs x8, x8, x11\n\t"
        "adc x4, x4, xzr\n\t"
        /* Overflow */
        "extr x4, x4, x8, #63\n\t"
        "mul x4, x4, x2\n\t"
        "and x8, x8, #0x7fffffffffffffff\n\t"
        "adds x5, x5, x4\n\t"
        "adcs x6, x6, xzr\n\t"
        "adcs x7, x7, xzr\n\t"
        "adc x8, x8, xzr\n\t"
        /* Reduce if top bit set */
        "and x4, x2, x8, asr 63\n\t"
        "and x8, x8, #0x7fffffffffffffff\n\t"
        "adds x5, x5, x4\n\t"
        "adcs x6, x6, xzr\n\t"
        "adcs x7, x7, xzr\n\t"
        "adc x8, x8, xzr\n\t"
        /* Store */
        "stp x5, x6, [%x[r]]\n\t"
        "stp x7, x8, [%x[r], #16]\n\t"
        : [r] "+r" (r), [a] "+r" (a)
        :
        /* "cc": adds/adcs overwrite the condition flags */
        : "memory", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "cc"
    );
}
726
/* Invert a field element by exponentiation: r = a^(2^255 - 21), built from
 * calls to fe_sq and fe_mul.
 *
 * r  Result: four 64-bit words, written by the final fe_mul.
 * a  Element to invert (four 64-bit words).
 *
 * Frame layout (160 bytes, addressed from x29):
 *   #0    saved x29/x30
 *   #16   t0
 *   #48   t1
 *   #0x50 t2
 *   #0x70 t3
 *   #144  saved r pointer
 *   #152  saved a pointer
 *
 * Each counted loop runs fe_sq (count + 1) times: x20 is decremented after
 * the call and the loop repeats while the subtraction does not borrow.
 *
 * The clobber list names x3-x17 and "cc" in addition to x2/x20. The called
 * fe_mul/fe_sq use x3-x17 as scratch, and subs changes the flags, so the
 * compiler must not keep live values there across this statement.
 *
 * NOTE(review): the bl calls pass arguments in x0/x1/x2 directly. With
 * NDEBUG defined several reloads of x0/x1 are omitted, so that build relies
 * on a arriving in x1 and on fe_sq/fe_mul leaving x0/x1 untouched. That
 * depends on compiler register allocation - confirm for each toolchain.
 */
void fe_invert(fe r, const fe a)
{
    __asm__ __volatile__ (
        "stp x29, x30, [sp, #-160]!\n\t"
        "add x29, sp, #0\n\t"
        /* Invert */
        "str %x[r], [x29, #144]\n\t"
        "str %x[a], [x29, #152]\n\t"
        /* t0 = a^2 */
        "add x0, x29, #16\n\t"
#ifndef NDEBUG
        "ldr x1, [x29, #152]\n\t"
#endif /* !NDEBUG */
#ifndef __APPLE__
        "bl fe_sq\n\t"
#else
        "bl _fe_sq\n\t"
#endif /* __APPLE__ */
        /* t1 = t0^2 */
        "add x0, x29, #48\n\t"
        "add x1, x29, #16\n\t"
#ifndef __APPLE__
        "bl fe_sq\n\t"
#else
        "bl _fe_sq\n\t"
#endif /* __APPLE__ */
        /* t1 = t1^2 */
#ifndef NDEBUG
        "add x0, x29, #48\n\t"
#endif /* !NDEBUG */
        "add x1, x29, #48\n\t"
#ifndef __APPLE__
        "bl fe_sq\n\t"
#else
        "bl _fe_sq\n\t"
#endif /* __APPLE__ */
        /* t1 = a * t1 */
#ifndef NDEBUG
        "add x0, x29, #48\n\t"
#endif /* !NDEBUG */
        "ldr x1, [x29, #152]\n\t"
        "add x2, x29, #48\n\t"
#ifndef __APPLE__
        "bl fe_mul\n\t"
#else
        "bl _fe_mul\n\t"
#endif /* __APPLE__ */
        /* t0 = t0 * t1 */
        "add x0, x29, #16\n\t"
        "add x1, x29, #16\n\t"
        "add x2, x29, #48\n\t"
#ifndef __APPLE__
        "bl fe_mul\n\t"
#else
        "bl _fe_mul\n\t"
#endif /* __APPLE__ */
        /* t2 = t0^2 */
        "add x0, x29, #0x50\n\t"
#ifndef NDEBUG
        "add x1, x29, #16\n\t"
#endif /* !NDEBUG */
#ifndef __APPLE__
        "bl fe_sq\n\t"
#else
        "bl _fe_sq\n\t"
#endif /* __APPLE__ */
        /* t1 = t1 * t2 */
        "add x0, x29, #48\n\t"
        "add x1, x29, #48\n\t"
        "add x2, x29, #0x50\n\t"
#ifndef __APPLE__
        "bl fe_mul\n\t"
#else
        "bl _fe_mul\n\t"
#endif /* __APPLE__ */
        /* t2 = t1^2 */
        "add x0, x29, #0x50\n\t"
#ifndef NDEBUG
        "add x1, x29, #48\n\t"
#endif /* !NDEBUG */
#ifndef __APPLE__
        "bl fe_sq\n\t"
#else
        "bl _fe_sq\n\t"
#endif /* __APPLE__ */
        /* t2 = t2^(2^4): four squarings */
        "mov x20, #3\n\t"
#ifndef NDEBUG
        "add x0, x29, #0x50\n\t"
#endif /* !NDEBUG */
        "add x1, x29, #0x50\n\t"
        "\n"
    "L_fe_invert1_%=: \n\t"
#ifndef __APPLE__
        "bl fe_sq\n\t"
#else
        "bl _fe_sq\n\t"
#endif /* __APPLE__ */
        "subs x20, x20, #1\n\t"
        "bcs L_fe_invert1_%=\n\t"
        /* t1 = t2 * t1 */
        "add x0, x29, #48\n\t"
#ifndef NDEBUG
        "add x1, x29, #0x50\n\t"
#endif /* !NDEBUG */
        "add x2, x29, #48\n\t"
#ifndef __APPLE__
        "bl fe_mul\n\t"
#else
        "bl _fe_mul\n\t"
#endif /* __APPLE__ */
        /* t2 = t1^2 */
        "add x0, x29, #0x50\n\t"
        "add x1, x29, #48\n\t"
#ifndef __APPLE__
        "bl fe_sq\n\t"
#else
        "bl _fe_sq\n\t"
#endif /* __APPLE__ */
        /* t2 = t2^(2^9): nine squarings */
        "mov x20, #8\n\t"
#ifndef NDEBUG
        "add x0, x29, #0x50\n\t"
#endif /* !NDEBUG */
        "add x1, x29, #0x50\n\t"
        "\n"
    "L_fe_invert2_%=: \n\t"
#ifndef __APPLE__
        "bl fe_sq\n\t"
#else
        "bl _fe_sq\n\t"
#endif /* __APPLE__ */
        "subs x20, x20, #1\n\t"
        "bcs L_fe_invert2_%=\n\t"
        /* t2 = t2 * t1 */
#ifndef NDEBUG
        "add x0, x29, #0x50\n\t"
#endif /* !NDEBUG */
#ifndef NDEBUG
        "add x1, x29, #0x50\n\t"
#endif /* !NDEBUG */
        "add x2, x29, #48\n\t"
#ifndef __APPLE__
        "bl fe_mul\n\t"
#else
        "bl _fe_mul\n\t"
#endif /* __APPLE__ */
        /* t3 = t2^2 */
        "add x0, x29, #0x70\n\t"
#ifndef NDEBUG
        "add x1, x29, #0x50\n\t"
#endif /* !NDEBUG */
#ifndef __APPLE__
        "bl fe_sq\n\t"
#else
        "bl _fe_sq\n\t"
#endif /* __APPLE__ */
        /* t3 = t3^(2^19): nineteen squarings */
        "mov x20, #18\n\t"
#ifndef NDEBUG
        "add x0, x29, #0x70\n\t"
#endif /* !NDEBUG */
        "add x1, x29, #0x70\n\t"
        "\n"
    "L_fe_invert3_%=: \n\t"
#ifndef __APPLE__
        "bl fe_sq\n\t"
#else
        "bl _fe_sq\n\t"
#endif /* __APPLE__ */
        "subs x20, x20, #1\n\t"
        "bcs L_fe_invert3_%=\n\t"
        /* t2 = t3 * t2 */
        "add x0, x29, #0x50\n\t"
#ifndef NDEBUG
        "add x1, x29, #0x70\n\t"
#endif /* !NDEBUG */
        "add x2, x29, #0x50\n\t"
#ifndef __APPLE__
        "bl fe_mul\n\t"
#else
        "bl _fe_mul\n\t"
#endif /* __APPLE__ */
        /* t2 = t2^(2^10): ten squarings */
        "mov x20, #9\n\t"
#ifndef NDEBUG
        "add x0, x29, #0x50\n\t"
#endif /* !NDEBUG */
        "add x1, x29, #0x50\n\t"
        "\n"
    "L_fe_invert4_%=: \n\t"
#ifndef __APPLE__
        "bl fe_sq\n\t"
#else
        "bl _fe_sq\n\t"
#endif /* __APPLE__ */
        "subs x20, x20, #1\n\t"
        "bcs L_fe_invert4_%=\n\t"
        /* t1 = t2 * t1 */
        "add x0, x29, #48\n\t"
#ifndef NDEBUG
        "add x1, x29, #0x50\n\t"
#endif /* !NDEBUG */
        "add x2, x29, #48\n\t"
#ifndef __APPLE__
        "bl fe_mul\n\t"
#else
        "bl _fe_mul\n\t"
#endif /* __APPLE__ */
        /* t2 = t1^2 */
        "add x0, x29, #0x50\n\t"
        "add x1, x29, #48\n\t"
#ifndef __APPLE__
        "bl fe_sq\n\t"
#else
        "bl _fe_sq\n\t"
#endif /* __APPLE__ */
        /* t2 = t2^(2^49): forty-nine squarings */
        "mov x20, #48\n\t"
#ifndef NDEBUG
        "add x0, x29, #0x50\n\t"
#endif /* !NDEBUG */
        "add x1, x29, #0x50\n\t"
        "\n"
    "L_fe_invert5_%=: \n\t"
#ifndef __APPLE__
        "bl fe_sq\n\t"
#else
        "bl _fe_sq\n\t"
#endif /* __APPLE__ */
        "subs x20, x20, #1\n\t"
        "bcs L_fe_invert5_%=\n\t"
        /* t2 = t2 * t1 */
#ifndef NDEBUG
        "add x0, x29, #0x50\n\t"
#endif /* !NDEBUG */
#ifndef NDEBUG
        "add x1, x29, #0x50\n\t"
#endif /* !NDEBUG */
        "add x2, x29, #48\n\t"
#ifndef __APPLE__
        "bl fe_mul\n\t"
#else
        "bl _fe_mul\n\t"
#endif /* __APPLE__ */
        /* t3 = t2^2 */
        "add x0, x29, #0x70\n\t"
#ifndef NDEBUG
        "add x1, x29, #0x50\n\t"
#endif /* !NDEBUG */
#ifndef __APPLE__
        "bl fe_sq\n\t"
#else
        "bl _fe_sq\n\t"
#endif /* __APPLE__ */
        /* t3 = t3^(2^99): ninety-nine squarings */
        "mov x20, #0x62\n\t"
#ifndef NDEBUG
        "add x0, x29, #0x70\n\t"
#endif /* !NDEBUG */
        "add x1, x29, #0x70\n\t"
        "\n"
    "L_fe_invert6_%=: \n\t"
#ifndef __APPLE__
        "bl fe_sq\n\t"
#else
        "bl _fe_sq\n\t"
#endif /* __APPLE__ */
        "subs x20, x20, #1\n\t"
        "bcs L_fe_invert6_%=\n\t"
        /* t2 = t3 * t2 */
        "add x0, x29, #0x50\n\t"
#ifndef NDEBUG
        "add x1, x29, #0x70\n\t"
#endif /* !NDEBUG */
        "add x2, x29, #0x50\n\t"
#ifndef __APPLE__
        "bl fe_mul\n\t"
#else
        "bl _fe_mul\n\t"
#endif /* __APPLE__ */
        /* t2 = t2^(2^50): fifty squarings */
        "mov x20, #49\n\t"
#ifndef NDEBUG
        "add x0, x29, #0x50\n\t"
#endif /* !NDEBUG */
        "add x1, x29, #0x50\n\t"
        "\n"
    "L_fe_invert7_%=: \n\t"
#ifndef __APPLE__
        "bl fe_sq\n\t"
#else
        "bl _fe_sq\n\t"
#endif /* __APPLE__ */
        "subs x20, x20, #1\n\t"
        "bcs L_fe_invert7_%=\n\t"
        /* t1 = t2 * t1 */
        "add x0, x29, #48\n\t"
#ifndef NDEBUG
        "add x1, x29, #0x50\n\t"
#endif /* !NDEBUG */
        "add x2, x29, #48\n\t"
#ifndef __APPLE__
        "bl fe_mul\n\t"
#else
        "bl _fe_mul\n\t"
#endif /* __APPLE__ */
        /* t1 = t1^(2^5): five squarings */
        "mov x20, #4\n\t"
#ifndef NDEBUG
        "add x0, x29, #48\n\t"
#endif /* !NDEBUG */
        "add x1, x29, #48\n\t"
        "\n"
    "L_fe_invert8_%=: \n\t"
#ifndef __APPLE__
        "bl fe_sq\n\t"
#else
        "bl _fe_sq\n\t"
#endif /* __APPLE__ */
        "subs x20, x20, #1\n\t"
        "bcs L_fe_invert8_%=\n\t"
        /* r = t1 * t0 */
        "ldr x0, [x29, #144]\n\t"
#ifndef NDEBUG
        "add x1, x29, #48\n\t"
#endif /* !NDEBUG */
        "add x2, x29, #16\n\t"
#ifndef __APPLE__
        "bl fe_mul\n\t"
#else
        "bl _fe_mul\n\t"
#endif /* __APPLE__ */
        "ldp x29, x30, [sp], #0xa0\n\t"
        : [r] "+r" (r), [a] "+r" (a)
        :
        /* x3-x17: scratch registers of the called fe_mul/fe_sq.
         * "cc": subs overwrites the condition flags. */
        : "memory", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "cc"
    );
}
1038
curve25519(byte * r,const byte * n,const byte * a)1039 int curve25519(byte* r, const byte* n, const byte* a)
1040 {
1041 __asm__ __volatile__ (
1042 "stp x29, x30, [sp, #-192]!\n\t"
1043 "add x29, sp, #0\n\t"
1044 "mov x23, xzr\n\t"
1045 "str %x[r], [x29, #176]\n\t"
1046 "str %x[a], [x29, #184]\n\t"
1047 /* Copy */
1048 "ldp x6, x7, [%x[a]]\n\t"
1049 "ldp x8, x9, [%x[a], #16]\n\t"
1050 "stp x6, x7, [x29, #80]\n\t"
1051 "stp x8, x9, [x29, #96]\n\t"
1052 /* Set one */
1053 "mov %x[a], #1\n\t"
1054 "stp %x[a], xzr, [%x[r]]\n\t"
1055 "stp xzr, xzr, [%x[r], #16]\n\t"
1056 /* Set zero */
1057 "stp xzr, xzr, [x29, #16]\n\t"
1058 "stp xzr, xzr, [x29, #32]\n\t"
1059 /* Set one */
1060 "mov %x[a], #1\n\t"
1061 "stp %x[a], xzr, [x29, #48]\n\t"
1062 "stp xzr, xzr, [x29, #64]\n\t"
1063 "mov x25, #62\n\t"
1064 "mov x24, #24\n\t"
1065 "\n"
1066 "L_curve25519_words_%=: \n\t"
1067 "\n"
1068 "L_curve25519_bits_%=: \n\t"
1069 "ldr %x[a], [%x[n], x24]\n\t"
1070 "lsr %x[a], %x[a], x25\n\t"
1071 "and %x[a], %x[a], #1\n\t"
1072 "eor x23, x23, %x[a]\n\t"
1073 /* Conditional Swap */
1074 "cmp x23, #1\n\t"
1075 "ldp x10, x11, [%x[r]]\n\t"
1076 "ldp x12, x13, [%x[r], #16]\n\t"
1077 "ldp x6, x7, [x29, #80]\n\t"
1078 "ldp x8, x9, [x29, #96]\n\t"
1079 "csel x14, x10, x6, eq\n\t"
1080 "csel x10, x6, x10, eq\n\t"
1081 "csel x15, x11, x7, eq\n\t"
1082 "csel x11, x7, x11, eq\n\t"
1083 "csel x16, x12, x8, eq\n\t"
1084 "csel x12, x8, x12, eq\n\t"
1085 "csel x17, x13, x9, eq\n\t"
1086 "csel x13, x9, x13, eq\n\t"
1087 /* Conditional Swap */
1088 "cmp x23, #1\n\t"
1089 "ldp x19, x20, [x29, #16]\n\t"
1090 "ldp x21, x22, [x29, #32]\n\t"
1091 "ldp x6, x7, [x29, #48]\n\t"
1092 "ldp x8, x9, [x29, #64]\n\t"
1093 "csel x5, x19, x6, eq\n\t"
1094 "csel x19, x6, x19, eq\n\t"
1095 "csel x26, x20, x7, eq\n\t"
1096 "csel x20, x7, x20, eq\n\t"
1097 "csel x27, x21, x8, eq\n\t"
1098 "csel x21, x8, x21, eq\n\t"
1099 "csel x28, x22, x9, eq\n\t"
1100 "csel x22, x9, x22, eq\n\t"
1101 "mov x23, %x[a]\n\t"
1102 /* Add */
1103 "adds x6, x10, x19\n\t"
1104 "adcs x7, x11, x20\n\t"
1105 "adcs x8, x12, x21\n\t"
1106 "adc x9, x13, x22\n\t"
1107 "mov x3, #-19\n\t"
1108 "asr %x[a], x9, #63\n\t"
1109 /* Mask the modulus */
1110 "and x3, %x[a], x3\n\t"
1111 "and x4, %x[a], #0x7fffffffffffffff\n\t"
1112 /* Sub modulus (if overflow) */
1113 "subs x6, x6, x3\n\t"
1114 "sbcs x7, x7, %x[a]\n\t"
1115 "sbcs x8, x8, %x[a]\n\t"
1116 "sbc x9, x9, x4\n\t"
1117 /* Sub */
1118 "subs x19, x10, x19\n\t"
1119 "sbcs x20, x11, x20\n\t"
1120 "sbcs x21, x12, x21\n\t"
1121 "sbcs x22, x13, x22\n\t"
1122 "mov x3, #-19\n\t"
1123 "csetm %x[a], cc\n\t"
1124 /* Mask the modulus */
1125 "and x3, %x[a], x3\n\t"
1126 "and x4, %x[a], #0x7fffffffffffffff\n\t"
1127 /* Add modulus (if underflow) */
1128 "adds x19, x19, x3\n\t"
1129 "adcs x20, x20, %x[a]\n\t"
1130 "adcs x21, x21, %x[a]\n\t"
1131 "adc x22, x22, x4\n\t"
1132 "stp x19, x20, [x29, #144]\n\t"
1133 "stp x21, x22, [x29, #160]\n\t"
1134 /* Add */
1135 "adds x10, x14, x5\n\t"
1136 "adcs x11, x15, x26\n\t"
1137 "adcs x12, x16, x27\n\t"
1138 "adc x13, x17, x28\n\t"
1139 "mov x3, #-19\n\t"
1140 "asr %x[a], x13, #63\n\t"
1141 /* Mask the modulus */
1142 "and x3, %x[a], x3\n\t"
1143 "and x4, %x[a], #0x7fffffffffffffff\n\t"
1144 /* Sub modulus (if overflow) */
1145 "subs x10, x10, x3\n\t"
1146 "sbcs x11, x11, %x[a]\n\t"
1147 "sbcs x12, x12, %x[a]\n\t"
1148 "sbc x13, x13, x4\n\t"
1149 /* Sub */
1150 "subs x14, x14, x5\n\t"
1151 "sbcs x15, x15, x26\n\t"
1152 "sbcs x16, x16, x27\n\t"
1153 "sbcs x17, x17, x28\n\t"
1154 "mov x3, #-19\n\t"
1155 "csetm %x[a], cc\n\t"
1156 /* Mask the modulus */
1157 "and x3, %x[a], x3\n\t"
1158 "and x4, %x[a], #0x7fffffffffffffff\n\t"
1159 /* Add modulus (if underflow) */
1160 "adds x14, x14, x3\n\t"
1161 "adcs x15, x15, %x[a]\n\t"
1162 "adcs x16, x16, %x[a]\n\t"
1163 "adc x17, x17, x4\n\t"
1164 /* Multiply */
1165 /* A[0] * B[0] */
1166 "mul x19, x14, x6\n\t"
1167 "umulh x20, x14, x6\n\t"
1168 /* A[0] * B[1] */
1169 "mul x3, x14, x7\n\t"
1170 "umulh x21, x14, x7\n\t"
1171 "adds x20, x20, x3\n\t"
1172 "adc x21, x21, xzr\n\t"
1173 /* A[1] * B[0] */
1174 "mul x3, x15, x6\n\t"
1175 "umulh x4, x15, x6\n\t"
1176 "adds x20, x20, x3\n\t"
1177 "adcs x21, x21, x4\n\t"
1178 "adc x22, xzr, xzr\n\t"
1179 /* A[0] * B[2] */
1180 "mul x3, x14, x8\n\t"
1181 "umulh x4, x14, x8\n\t"
1182 "adds x21, x21, x3\n\t"
1183 "adc x22, x22, x4\n\t"
1184 /* A[1] * B[1] */
1185 "mul x3, x15, x7\n\t"
1186 "umulh x4, x15, x7\n\t"
1187 "adds x21, x21, x3\n\t"
1188 "adcs x22, x22, x4\n\t"
1189 "adc %x[a], xzr, xzr\n\t"
1190 /* A[2] * B[0] */
1191 "mul x3, x16, x6\n\t"
1192 "umulh x4, x16, x6\n\t"
1193 "adds x21, x21, x3\n\t"
1194 "adcs x22, x22, x4\n\t"
1195 "adc %x[a], %x[a], xzr\n\t"
1196 /* A[0] * B[3] */
1197 "mul x3, x14, x9\n\t"
1198 "umulh x4, x14, x9\n\t"
1199 "adds x22, x22, x3\n\t"
1200 "adcs %x[a], %x[a], x4\n\t"
1201 "adc x26, xzr, xzr\n\t"
1202 /* A[1] * B[2] */
1203 "mul x3, x15, x8\n\t"
1204 "umulh x4, x15, x8\n\t"
1205 "adds x22, x22, x3\n\t"
1206 "adcs %x[a], %x[a], x4\n\t"
1207 "adc x26, x26, xzr\n\t"
1208 /* A[2] * B[1] */
1209 "mul x3, x16, x7\n\t"
1210 "umulh x4, x16, x7\n\t"
1211 "adds x22, x22, x3\n\t"
1212 "adcs %x[a], %x[a], x4\n\t"
1213 "adc x26, x26, xzr\n\t"
1214 /* A[3] * B[0] */
1215 "mul x3, x17, x6\n\t"
1216 "umulh x4, x17, x6\n\t"
1217 "adds x22, x22, x3\n\t"
1218 "adcs %x[a], %x[a], x4\n\t"
1219 "adc x26, x26, xzr\n\t"
1220 /* A[1] * B[3] */
1221 "mul x3, x15, x9\n\t"
1222 "umulh x4, x15, x9\n\t"
1223 "adds %x[a], %x[a], x3\n\t"
1224 "adcs x26, x26, x4\n\t"
1225 "adc x27, xzr, xzr\n\t"
1226 /* A[2] * B[2] */
1227 "mul x3, x16, x8\n\t"
1228 "umulh x4, x16, x8\n\t"
1229 "adds %x[a], %x[a], x3\n\t"
1230 "adcs x26, x26, x4\n\t"
1231 "adc x27, x27, xzr\n\t"
1232 /* A[3] * B[1] */
1233 "mul x3, x17, x7\n\t"
1234 "umulh x4, x17, x7\n\t"
1235 "adds %x[a], %x[a], x3\n\t"
1236 "adcs x26, x26, x4\n\t"
1237 "adc x27, x27, xzr\n\t"
1238 /* A[2] * B[3] */
1239 "mul x3, x16, x9\n\t"
1240 "umulh x4, x16, x9\n\t"
1241 "adds x26, x26, x3\n\t"
1242 "adcs x27, x27, x4\n\t"
1243 "adc x28, xzr, xzr\n\t"
1244 /* A[3] * B[2] */
1245 "mul x3, x17, x8\n\t"
1246 "umulh x4, x17, x8\n\t"
1247 "adds x26, x26, x3\n\t"
1248 "adcs x27, x27, x4\n\t"
1249 "adc x28, x28, xzr\n\t"
1250 /* A[3] * B[3] */
1251 "mul x3, x17, x9\n\t"
1252 "umulh x4, x17, x9\n\t"
1253 "adds x27, x27, x3\n\t"
1254 "adc x28, x28, x4\n\t"
1255 /* Reduce */
1256 /* Move top half into t4-t7 and remove top bit from t3 */
1257 "extr x28, x28, x27, #63\n\t"
1258 "extr x27, x27, x26, #63\n\t"
1259 "extr x26, x26, %x[a], #63\n\t"
1260 "extr %x[a], %x[a], x22, #63\n\t"
1261 "and x22, x22, #0x7fffffffffffffff\n\t"
1262 /* Multiply top half by 19 */
1263 "mov x3, #19\n\t"
1264 "mul x4, x3, %x[a]\n\t"
1265 "umulh %x[a], x3, %x[a]\n\t"
1266 "adds x19, x19, x4\n\t"
1267 "mul x4, x3, x26\n\t"
1268 "umulh x26, x3, x26\n\t"
1269 "adcs x20, x20, x4\n\t"
1270 "mul x4, x3, x27\n\t"
1271 "umulh x27, x3, x27\n\t"
1272 "adcs x21, x21, x4\n\t"
1273 "mul x4, x3, x28\n\t"
1274 "umulh x5, x3, x28\n\t"
1275 "adcs x22, x22, x4\n\t"
1276 "adc x5, x5, xzr\n\t"
1277 /* Add remaining product results in */
1278 "adds x20, x20, %x[a]\n\t"
1279 "adcs x21, x21, x26\n\t"
1280 "adcs x22, x22, x27\n\t"
1281 "adc x5, x5, xzr\n\t"
1282 /* Overflow */
1283 "extr x5, x5, x22, #63\n\t"
1284 "mul x5, x5, x3\n\t"
1285 "and x22, x22, #0x7fffffffffffffff\n\t"
1286 "adds x19, x19, x5\n\t"
1287 "adcs x20, x20, xzr\n\t"
1288 "adcs x21, x21, xzr\n\t"
1289 "adc x22, x22, xzr\n\t"
1290 /* Reduce if top bit set */
1291 "and x5, x3, x22, asr 63\n\t"
1292 "and x22, x22, #0x7fffffffffffffff\n\t"
1293 "adds x19, x19, x5\n\t"
1294 "adcs x20, x20, xzr\n\t"
1295 "adcs x21, x21, xzr\n\t"
1296 "adc x22, x22, xzr\n\t"
1297 /* Store */
1298 "stp x19, x20, [x29, #112]\n\t"
1299 "stp x21, x22, [x29, #128]\n\t"
1300 /* Multiply */
1301 "ldp %x[a], x26, [x29, #144]\n\t"
1302 "ldp x27, x28, [x29, #160]\n\t"
1303 /* A[0] * B[0] */
1304 "mul x19, x10, %x[a]\n\t"
1305 "umulh x20, x10, %x[a]\n\t"
1306 /* A[0] * B[1] */
1307 "mul x3, x10, x26\n\t"
1308 "umulh x21, x10, x26\n\t"
1309 "adds x20, x20, x3\n\t"
1310 "adc x21, x21, xzr\n\t"
1311 /* A[1] * B[0] */
1312 "mul x3, x11, %x[a]\n\t"
1313 "umulh x4, x11, %x[a]\n\t"
1314 "adds x20, x20, x3\n\t"
1315 "adcs x21, x21, x4\n\t"
1316 "adc x22, xzr, xzr\n\t"
1317 /* A[0] * B[2] */
1318 "mul x3, x10, x27\n\t"
1319 "umulh x4, x10, x27\n\t"
1320 "adds x21, x21, x3\n\t"
1321 "adc x22, x22, x4\n\t"
1322 /* A[1] * B[1] */
1323 "mul x3, x11, x26\n\t"
1324 "umulh x4, x11, x26\n\t"
1325 "adds x21, x21, x3\n\t"
1326 "adcs x22, x22, x4\n\t"
1327 "adc x14, xzr, xzr\n\t"
1328 /* A[2] * B[0] */
1329 "mul x3, x12, %x[a]\n\t"
1330 "umulh x4, x12, %x[a]\n\t"
1331 "adds x21, x21, x3\n\t"
1332 "adcs x22, x22, x4\n\t"
1333 "adc x14, x14, xzr\n\t"
1334 /* A[0] * B[3] */
1335 "mul x3, x10, x28\n\t"
1336 "umulh x4, x10, x28\n\t"
1337 "adds x22, x22, x3\n\t"
1338 "adcs x14, x14, x4\n\t"
1339 "adc x15, xzr, xzr\n\t"
1340 /* A[1] * B[2] */
1341 "mul x3, x11, x27\n\t"
1342 "umulh x4, x11, x27\n\t"
1343 "adds x22, x22, x3\n\t"
1344 "adcs x14, x14, x4\n\t"
1345 "adc x15, x15, xzr\n\t"
1346 /* A[2] * B[1] */
1347 "mul x3, x12, x26\n\t"
1348 "umulh x4, x12, x26\n\t"
1349 "adds x22, x22, x3\n\t"
1350 "adcs x14, x14, x4\n\t"
1351 "adc x15, x15, xzr\n\t"
1352 /* A[3] * B[0] */
1353 "mul x3, x13, %x[a]\n\t"
1354 "umulh x4, x13, %x[a]\n\t"
1355 "adds x22, x22, x3\n\t"
1356 "adcs x14, x14, x4\n\t"
1357 "adc x15, x15, xzr\n\t"
1358 /* A[1] * B[3] */
1359 "mul x3, x11, x28\n\t"
1360 "umulh x4, x11, x28\n\t"
1361 "adds x14, x14, x3\n\t"
1362 "adcs x15, x15, x4\n\t"
1363 "adc x16, xzr, xzr\n\t"
1364 /* A[2] * B[2] */
1365 "mul x3, x12, x27\n\t"
1366 "umulh x4, x12, x27\n\t"
1367 "adds x14, x14, x3\n\t"
1368 "adcs x15, x15, x4\n\t"
1369 "adc x16, x16, xzr\n\t"
1370 /* A[3] * B[1] */
1371 "mul x3, x13, x26\n\t"
1372 "umulh x4, x13, x26\n\t"
1373 "adds x14, x14, x3\n\t"
1374 "adcs x15, x15, x4\n\t"
1375 "adc x16, x16, xzr\n\t"
1376 /* A[2] * B[3] */
1377 "mul x3, x12, x28\n\t"
1378 "umulh x4, x12, x28\n\t"
1379 "adds x15, x15, x3\n\t"
1380 "adcs x16, x16, x4\n\t"
1381 "adc x17, xzr, xzr\n\t"
1382 /* A[3] * B[2] */
1383 "mul x3, x13, x27\n\t"
1384 "umulh x4, x13, x27\n\t"
1385 "adds x15, x15, x3\n\t"
1386 "adcs x16, x16, x4\n\t"
1387 "adc x17, x17, xzr\n\t"
1388 /* A[3] * B[3] */
1389 "mul x3, x13, x28\n\t"
1390 "umulh x4, x13, x28\n\t"
1391 "adds x16, x16, x3\n\t"
1392 "adc x17, x17, x4\n\t"
1393 /* Reduce */
1394 /* Move top half into t4-t7 and remove top bit from t3 */
1395 "extr x17, x17, x16, #63\n\t"
1396 "extr x16, x16, x15, #63\n\t"
1397 "extr x15, x15, x14, #63\n\t"
1398 "extr x14, x14, x22, #63\n\t"
1399 "and x22, x22, #0x7fffffffffffffff\n\t"
1400 /* Multiply top half by 19 */
1401 "mov x3, #19\n\t"
1402 "mul x4, x3, x14\n\t"
1403 "umulh x14, x3, x14\n\t"
1404 "adds x19, x19, x4\n\t"
1405 "mul x4, x3, x15\n\t"
1406 "umulh x15, x3, x15\n\t"
1407 "adcs x20, x20, x4\n\t"
1408 "mul x4, x3, x16\n\t"
1409 "umulh x16, x3, x16\n\t"
1410 "adcs x21, x21, x4\n\t"
1411 "mul x4, x3, x17\n\t"
1412 "umulh x5, x3, x17\n\t"
1413 "adcs x22, x22, x4\n\t"
1414 "adc x5, x5, xzr\n\t"
1415 /* Add remaining product results in */
1416 "adds x20, x20, x14\n\t"
1417 "adcs x21, x21, x15\n\t"
1418 "adcs x22, x22, x16\n\t"
1419 "adc x5, x5, xzr\n\t"
1420 /* Overflow */
1421 "extr x5, x5, x22, #63\n\t"
1422 "mul x5, x5, x3\n\t"
1423 "and x22, x22, #0x7fffffffffffffff\n\t"
1424 "adds x19, x19, x5\n\t"
1425 "adcs x20, x20, xzr\n\t"
1426 "adcs x21, x21, xzr\n\t"
1427 "adc x22, x22, xzr\n\t"
1428 /* Reduce if top bit set */
1429 "and x5, x3, x22, asr 63\n\t"
1430 "and x22, x22, #0x7fffffffffffffff\n\t"
1431 "adds x19, x19, x5\n\t"
1432 "adcs x20, x20, xzr\n\t"
1433 "adcs x21, x21, xzr\n\t"
1434 "adc x22, x22, xzr\n\t"
1435 /* Store */
1436 /* Square */
1437 /* A[0] * A[1] */
1438 "mul x11, %x[a], x26\n\t"
1439 "umulh x12, %x[a], x26\n\t"
1440 /* A[0] * A[2] */
1441 "mul x3, %x[a], x27\n\t"
1442 "umulh x13, %x[a], x27\n\t"
1443 "adds x12, x12, x3\n\t"
1444 "adc x13, x13, xzr\n\t"
1445 /* A[0] * A[3] */
1446 "mul x3, %x[a], x28\n\t"
1447 "umulh x14, %x[a], x28\n\t"
1448 "adds x13, x13, x3\n\t"
1449 "adc x14, x14, xzr\n\t"
1450 /* A[1] * A[2] */
1451 "mul x3, x26, x27\n\t"
1452 "umulh x4, x26, x27\n\t"
1453 "adds x13, x13, x3\n\t"
1454 "adcs x14, x14, x4\n\t"
1455 "adc x15, xzr, xzr\n\t"
1456 /* A[1] * A[3] */
1457 "mul x3, x26, x28\n\t"
1458 "umulh x4, x26, x28\n\t"
1459 "adds x14, x14, x3\n\t"
1460 "adc x15, x15, x4\n\t"
1461 /* A[2] * A[3] */
1462 "mul x3, x27, x28\n\t"
1463 "umulh x16, x27, x28\n\t"
1464 "adds x15, x15, x3\n\t"
1465 "adc x16, x16, xzr\n\t"
1466 /* Double */
1467 "adds x11, x11, x11\n\t"
1468 "adcs x12, x12, x12\n\t"
1469 "adcs x13, x13, x13\n\t"
1470 "adcs x14, x14, x14\n\t"
1471 "adcs x15, x15, x15\n\t"
1472 "adcs x16, x16, x16\n\t"
1473 "adc x17, xzr, xzr\n\t"
1474 /* A[0] * A[0] */
1475 "mul x10, %x[a], %x[a]\n\t"
1476 "umulh x5, %x[a], %x[a]\n\t"
1477 /* A[1] * A[1] */
1478 "mul x3, x26, x26\n\t"
1479 "umulh x4, x26, x26\n\t"
1480 "adds x11, x11, x5\n\t"
1481 "adcs x12, x12, x3\n\t"
1482 "adc x5, x4, xzr\n\t"
1483 /* A[2] * A[2] */
1484 "mul x3, x27, x27\n\t"
1485 "umulh x4, x27, x27\n\t"
1486 "adds x13, x13, x5\n\t"
1487 "adcs x14, x14, x3\n\t"
1488 "adc x5, x4, xzr\n\t"
1489 /* A[3] * A[3] */
1490 "mul x3, x28, x28\n\t"
1491 "umulh x4, x28, x28\n\t"
1492 "adds x15, x15, x5\n\t"
1493 "adcs x16, x16, x3\n\t"
1494 "adc x17, x17, x4\n\t"
1495 /* Reduce */
1496 /* Move top half into t4-t7 and remove top bit from t3 */
1497 "extr x17, x17, x16, #63\n\t"
1498 "extr x16, x16, x15, #63\n\t"
1499 "extr x15, x15, x14, #63\n\t"
1500 "extr x14, x14, x13, #63\n\t"
1501 "and x13, x13, #0x7fffffffffffffff\n\t"
1502 /* Multiply top half by 19 */
1503 "mov x3, #19\n\t"
1504 "mul x4, x3, x14\n\t"
1505 "umulh x14, x3, x14\n\t"
1506 "adds x10, x10, x4\n\t"
1507 "mul x4, x3, x15\n\t"
1508 "umulh x15, x3, x15\n\t"
1509 "adcs x11, x11, x4\n\t"
1510 "mul x4, x3, x16\n\t"
1511 "umulh x16, x3, x16\n\t"
1512 "adcs x12, x12, x4\n\t"
1513 "mul x4, x3, x17\n\t"
1514 "umulh x5, x3, x17\n\t"
1515 "adcs x13, x13, x4\n\t"
1516 "adc x5, x5, xzr\n\t"
1517 /* Add remaining product results in */
1518 "adds x11, x11, x14\n\t"
1519 "adcs x12, x12, x15\n\t"
1520 "adcs x13, x13, x16\n\t"
1521 "adc x5, x5, xzr\n\t"
1522 /* Overflow */
1523 "extr x5, x5, x13, #63\n\t"
1524 "mul x5, x5, x3\n\t"
1525 "and x13, x13, #0x7fffffffffffffff\n\t"
1526 "adds x10, x10, x5\n\t"
1527 "adcs x11, x11, xzr\n\t"
1528 "adcs x12, x12, xzr\n\t"
1529 "adc x13, x13, xzr\n\t"
1530 /* Reduce if top bit set */
1531 "and x5, x3, x13, asr 63\n\t"
1532 "and x13, x13, #0x7fffffffffffffff\n\t"
1533 "adds x10, x10, x5\n\t"
1534 "adcs x11, x11, xzr\n\t"
1535 "adcs x12, x12, xzr\n\t"
1536 "adc x13, x13, xzr\n\t"
1537 /* Store */
1538 /* Square */
1539 /* A[0] * A[1] */
1540 "mul x15, x6, x7\n\t"
1541 "umulh x16, x6, x7\n\t"
1542 /* A[0] * A[2] */
1543 "mul x3, x6, x8\n\t"
1544 "umulh x17, x6, x8\n\t"
1545 "adds x16, x16, x3\n\t"
1546 "adc x17, x17, xzr\n\t"
1547 /* A[0] * A[3] */
1548 "mul x3, x6, x9\n\t"
1549 "umulh %x[a], x6, x9\n\t"
1550 "adds x17, x17, x3\n\t"
1551 "adc %x[a], %x[a], xzr\n\t"
1552 /* A[1] * A[2] */
1553 "mul x3, x7, x8\n\t"
1554 "umulh x4, x7, x8\n\t"
1555 "adds x17, x17, x3\n\t"
1556 "adcs %x[a], %x[a], x4\n\t"
1557 "adc x26, xzr, xzr\n\t"
1558 /* A[1] * A[3] */
1559 "mul x3, x7, x9\n\t"
1560 "umulh x4, x7, x9\n\t"
1561 "adds %x[a], %x[a], x3\n\t"
1562 "adc x26, x26, x4\n\t"
1563 /* A[2] * A[3] */
1564 "mul x3, x8, x9\n\t"
1565 "umulh x27, x8, x9\n\t"
1566 "adds x26, x26, x3\n\t"
1567 "adc x27, x27, xzr\n\t"
1568 /* Double */
1569 "adds x15, x15, x15\n\t"
1570 "adcs x16, x16, x16\n\t"
1571 "adcs x17, x17, x17\n\t"
1572 "adcs %x[a], %x[a], %x[a]\n\t"
1573 "adcs x26, x26, x26\n\t"
1574 "adcs x27, x27, x27\n\t"
1575 "adc x28, xzr, xzr\n\t"
1576 /* A[0] * A[0] */
1577 "mul x14, x6, x6\n\t"
1578 "umulh x5, x6, x6\n\t"
1579 /* A[1] * A[1] */
1580 "mul x3, x7, x7\n\t"
1581 "umulh x4, x7, x7\n\t"
1582 "adds x15, x15, x5\n\t"
1583 "adcs x16, x16, x3\n\t"
1584 "adc x5, x4, xzr\n\t"
1585 /* A[2] * A[2] */
1586 "mul x3, x8, x8\n\t"
1587 "umulh x4, x8, x8\n\t"
1588 "adds x17, x17, x5\n\t"
1589 "adcs %x[a], %x[a], x3\n\t"
1590 "adc x5, x4, xzr\n\t"
1591 /* A[3] * A[3] */
1592 "mul x3, x9, x9\n\t"
1593 "umulh x4, x9, x9\n\t"
1594 "adds x26, x26, x5\n\t"
1595 "adcs x27, x27, x3\n\t"
1596 "adc x28, x28, x4\n\t"
1597 /* Reduce */
1598 /* Move top half into t4-t7 and remove top bit from t3 */
1599 "extr x28, x28, x27, #63\n\t"
1600 "extr x27, x27, x26, #63\n\t"
1601 "extr x26, x26, %x[a], #63\n\t"
1602 "extr %x[a], %x[a], x17, #63\n\t"
1603 "and x17, x17, #0x7fffffffffffffff\n\t"
1604 /* Multiply top half by 19 */
1605 "mov x3, #19\n\t"
1606 "mul x4, x3, %x[a]\n\t"
1607 "umulh %x[a], x3, %x[a]\n\t"
1608 "adds x14, x14, x4\n\t"
1609 "mul x4, x3, x26\n\t"
1610 "umulh x26, x3, x26\n\t"
1611 "adcs x15, x15, x4\n\t"
1612 "mul x4, x3, x27\n\t"
1613 "umulh x27, x3, x27\n\t"
1614 "adcs x16, x16, x4\n\t"
1615 "mul x4, x3, x28\n\t"
1616 "umulh x5, x3, x28\n\t"
1617 "adcs x17, x17, x4\n\t"
1618 "adc x5, x5, xzr\n\t"
1619 /* Add remaining product results in */
1620 "adds x15, x15, %x[a]\n\t"
1621 "adcs x16, x16, x26\n\t"
1622 "adcs x17, x17, x27\n\t"
1623 "adc x5, x5, xzr\n\t"
1624 /* Overflow */
1625 "extr x5, x5, x17, #63\n\t"
1626 "mul x5, x5, x3\n\t"
1627 "and x17, x17, #0x7fffffffffffffff\n\t"
1628 "adds x14, x14, x5\n\t"
1629 "adcs x15, x15, xzr\n\t"
1630 "adcs x16, x16, xzr\n\t"
1631 "adc x17, x17, xzr\n\t"
1632 /* Reduce if top bit set */
1633 "and x5, x3, x17, asr 63\n\t"
1634 "and x17, x17, #0x7fffffffffffffff\n\t"
1635 "adds x14, x14, x5\n\t"
1636 "adcs x15, x15, xzr\n\t"
1637 "adcs x16, x16, xzr\n\t"
1638 "adc x17, x17, xzr\n\t"
1639 /* Store */
1640 /* Multiply */
1641 /* A[0] * B[0] */
1642 "mul x6, x14, x10\n\t"
1643 "umulh x7, x14, x10\n\t"
1644 /* A[0] * B[1] */
1645 "mul x3, x14, x11\n\t"
1646 "umulh x8, x14, x11\n\t"
1647 "adds x7, x7, x3\n\t"
1648 "adc x8, x8, xzr\n\t"
1649 /* A[1] * B[0] */
1650 "mul x3, x15, x10\n\t"
1651 "umulh x4, x15, x10\n\t"
1652 "adds x7, x7, x3\n\t"
1653 "adcs x8, x8, x4\n\t"
1654 "adc x9, xzr, xzr\n\t"
1655 /* A[0] * B[2] */
1656 "mul x3, x14, x12\n\t"
1657 "umulh x4, x14, x12\n\t"
1658 "adds x8, x8, x3\n\t"
1659 "adc x9, x9, x4\n\t"
1660 /* A[1] * B[1] */
1661 "mul x3, x15, x11\n\t"
1662 "umulh x4, x15, x11\n\t"
1663 "adds x8, x8, x3\n\t"
1664 "adcs x9, x9, x4\n\t"
1665 "adc %x[a], xzr, xzr\n\t"
1666 /* A[2] * B[0] */
1667 "mul x3, x16, x10\n\t"
1668 "umulh x4, x16, x10\n\t"
1669 "adds x8, x8, x3\n\t"
1670 "adcs x9, x9, x4\n\t"
1671 "adc %x[a], %x[a], xzr\n\t"
1672 /* A[0] * B[3] */
1673 "mul x3, x14, x13\n\t"
1674 "umulh x4, x14, x13\n\t"
1675 "adds x9, x9, x3\n\t"
1676 "adcs %x[a], %x[a], x4\n\t"
1677 "adc x26, xzr, xzr\n\t"
1678 /* A[1] * B[2] */
1679 "mul x3, x15, x12\n\t"
1680 "umulh x4, x15, x12\n\t"
1681 "adds x9, x9, x3\n\t"
1682 "adcs %x[a], %x[a], x4\n\t"
1683 "adc x26, x26, xzr\n\t"
1684 /* A[2] * B[1] */
1685 "mul x3, x16, x11\n\t"
1686 "umulh x4, x16, x11\n\t"
1687 "adds x9, x9, x3\n\t"
1688 "adcs %x[a], %x[a], x4\n\t"
1689 "adc x26, x26, xzr\n\t"
1690 /* A[3] * B[0] */
1691 "mul x3, x17, x10\n\t"
1692 "umulh x4, x17, x10\n\t"
1693 "adds x9, x9, x3\n\t"
1694 "adcs %x[a], %x[a], x4\n\t"
1695 "adc x26, x26, xzr\n\t"
1696 /* A[1] * B[3] */
1697 "mul x3, x15, x13\n\t"
1698 "umulh x4, x15, x13\n\t"
1699 "adds %x[a], %x[a], x3\n\t"
1700 "adcs x26, x26, x4\n\t"
1701 "adc x27, xzr, xzr\n\t"
1702 /* A[2] * B[2] */
1703 "mul x3, x16, x12\n\t"
1704 "umulh x4, x16, x12\n\t"
1705 "adds %x[a], %x[a], x3\n\t"
1706 "adcs x26, x26, x4\n\t"
1707 "adc x27, x27, xzr\n\t"
1708 /* A[3] * B[1] */
1709 "mul x3, x17, x11\n\t"
1710 "umulh x4, x17, x11\n\t"
1711 "adds %x[a], %x[a], x3\n\t"
1712 "adcs x26, x26, x4\n\t"
1713 "adc x27, x27, xzr\n\t"
1714 /* A[2] * B[3] */
1715 "mul x3, x16, x13\n\t"
1716 "umulh x4, x16, x13\n\t"
1717 "adds x26, x26, x3\n\t"
1718 "adcs x27, x27, x4\n\t"
1719 "adc x28, xzr, xzr\n\t"
1720 /* A[3] * B[2] */
1721 "mul x3, x17, x12\n\t"
1722 "umulh x4, x17, x12\n\t"
1723 "adds x26, x26, x3\n\t"
1724 "adcs x27, x27, x4\n\t"
1725 "adc x28, x28, xzr\n\t"
1726 /* A[3] * B[3] */
1727 "mul x3, x17, x13\n\t"
1728 "umulh x4, x17, x13\n\t"
1729 "adds x27, x27, x3\n\t"
1730 "adc x28, x28, x4\n\t"
1731 /* Reduce */
1732 /* Move top half into t4-t7 and remove top bit from t3 */
1733 "extr x28, x28, x27, #63\n\t"
1734 "extr x27, x27, x26, #63\n\t"
1735 "extr x26, x26, %x[a], #63\n\t"
1736 "extr %x[a], %x[a], x9, #63\n\t"
1737 "and x9, x9, #0x7fffffffffffffff\n\t"
1738 /* Multiply top half by 19 */
1739 "mov x3, #19\n\t"
1740 "mul x4, x3, %x[a]\n\t"
1741 "umulh %x[a], x3, %x[a]\n\t"
1742 "adds x6, x6, x4\n\t"
1743 "mul x4, x3, x26\n\t"
1744 "umulh x26, x3, x26\n\t"
1745 "adcs x7, x7, x4\n\t"
1746 "mul x4, x3, x27\n\t"
1747 "umulh x27, x3, x27\n\t"
1748 "adcs x8, x8, x4\n\t"
1749 "mul x4, x3, x28\n\t"
1750 "umulh x5, x3, x28\n\t"
1751 "adcs x9, x9, x4\n\t"
1752 "adc x5, x5, xzr\n\t"
1753 /* Add remaining product results in */
1754 "adds x7, x7, %x[a]\n\t"
1755 "adcs x8, x8, x26\n\t"
1756 "adcs x9, x9, x27\n\t"
1757 "adc x5, x5, xzr\n\t"
1758 /* Overflow */
1759 "extr x5, x5, x9, #63\n\t"
1760 "mul x5, x5, x3\n\t"
1761 "and x9, x9, #0x7fffffffffffffff\n\t"
1762 "adds x6, x6, x5\n\t"
1763 "adcs x7, x7, xzr\n\t"
1764 "adcs x8, x8, xzr\n\t"
1765 "adc x9, x9, xzr\n\t"
1766 /* Reduce if top bit set */
1767 "and x5, x3, x9, asr 63\n\t"
1768 "and x9, x9, #0x7fffffffffffffff\n\t"
1769 "adds x6, x6, x5\n\t"
1770 "adcs x7, x7, xzr\n\t"
1771 "adcs x8, x8, xzr\n\t"
1772 "adc x9, x9, xzr\n\t"
1773 /* Store */
1774 "stp x6, x7, [%x[r]]\n\t"
1775 "stp x8, x9, [%x[r], #16]\n\t"
1776 /* Sub */
1777 "subs x14, x14, x10\n\t"
1778 "sbcs x15, x15, x11\n\t"
1779 "sbcs x16, x16, x12\n\t"
1780 "sbcs x17, x17, x13\n\t"
1781 "mov x3, #-19\n\t"
1782 "csetm %x[a], cc\n\t"
1783 /* Mask the modulus */
1784 "and x3, %x[a], x3\n\t"
1785 "and x4, %x[a], #0x7fffffffffffffff\n\t"
1786 /* Add modulus (if underflow) */
1787 "adds x14, x14, x3\n\t"
1788 "adcs x15, x15, %x[a]\n\t"
1789 "adcs x16, x16, %x[a]\n\t"
1790 "adc x17, x17, x4\n\t"
1791 /* Multiply by 121666 */
1792 "mov x5, #0xdb42\n\t"
1793 "movk x5, #1, lsl 16\n\t"
1794 "mul x6, x14, x5\n\t"
1795 "umulh x7, x14, x5\n\t"
1796 "mul x3, x15, x5\n\t"
1797 "umulh x4, x15, x5\n\t"
1798 "adds x7, x7, x3\n\t"
1799 "adc x8, xzr, x4\n\t"
1800 "mul x3, x16, x5\n\t"
1801 "umulh x4, x16, x5\n\t"
1802 "adds x8, x8, x3\n\t"
1803 "adc x9, xzr, x4\n\t"
1804 "mul x3, x17, x5\n\t"
1805 "umulh x4, x17, x5\n\t"
1806 "adds x9, x9, x3\n\t"
1807 "adc x4, xzr, x4\n\t"
1808 "mov x5, #19\n\t"
1809 "extr x4, x4, x9, #63\n\t"
1810 "mul x4, x4, x5\n\t"
1811 "and x9, x9, #0x7fffffffffffffff\n\t"
1812 "adds x6, x6, x4\n\t"
1813 "adcs x7, x7, xzr\n\t"
1814 "adcs x8, x8, xzr\n\t"
1815 "adc x9, x9, xzr\n\t"
1816 /* Add */
1817 "adds x10, x10, x6\n\t"
1818 "adcs x11, x11, x7\n\t"
1819 "adcs x12, x12, x8\n\t"
1820 "adc x13, x13, x9\n\t"
1821 "mov x3, #-19\n\t"
1822 "asr %x[a], x13, #63\n\t"
1823 /* Mask the modulus */
1824 "and x3, %x[a], x3\n\t"
1825 "and x4, %x[a], #0x7fffffffffffffff\n\t"
1826 /* Sub modulus (if overflow) */
1827 "subs x10, x10, x3\n\t"
1828 "sbcs x11, x11, %x[a]\n\t"
1829 "sbcs x12, x12, %x[a]\n\t"
1830 "sbc x13, x13, x4\n\t"
1831 /* Multiply */
1832 /* A[0] * B[0] */
1833 "mul x6, x14, x10\n\t"
1834 "umulh x7, x14, x10\n\t"
1835 /* A[0] * B[1] */
1836 "mul x3, x14, x11\n\t"
1837 "umulh x8, x14, x11\n\t"
1838 "adds x7, x7, x3\n\t"
1839 "adc x8, x8, xzr\n\t"
1840 /* A[1] * B[0] */
1841 "mul x3, x15, x10\n\t"
1842 "umulh x4, x15, x10\n\t"
1843 "adds x7, x7, x3\n\t"
1844 "adcs x8, x8, x4\n\t"
1845 "adc x9, xzr, xzr\n\t"
1846 /* A[0] * B[2] */
1847 "mul x3, x14, x12\n\t"
1848 "umulh x4, x14, x12\n\t"
1849 "adds x8, x8, x3\n\t"
1850 "adc x9, x9, x4\n\t"
1851 /* A[1] * B[1] */
1852 "mul x3, x15, x11\n\t"
1853 "umulh x4, x15, x11\n\t"
1854 "adds x8, x8, x3\n\t"
1855 "adcs x9, x9, x4\n\t"
1856 "adc %x[a], xzr, xzr\n\t"
1857 /* A[2] * B[0] */
1858 "mul x3, x16, x10\n\t"
1859 "umulh x4, x16, x10\n\t"
1860 "adds x8, x8, x3\n\t"
1861 "adcs x9, x9, x4\n\t"
1862 "adc %x[a], %x[a], xzr\n\t"
1863 /* A[0] * B[3] */
1864 "mul x3, x14, x13\n\t"
1865 "umulh x4, x14, x13\n\t"
1866 "adds x9, x9, x3\n\t"
1867 "adcs %x[a], %x[a], x4\n\t"
1868 "adc x26, xzr, xzr\n\t"
1869 /* A[1] * B[2] */
1870 "mul x3, x15, x12\n\t"
1871 "umulh x4, x15, x12\n\t"
1872 "adds x9, x9, x3\n\t"
1873 "adcs %x[a], %x[a], x4\n\t"
1874 "adc x26, x26, xzr\n\t"
1875 /* A[2] * B[1] */
1876 "mul x3, x16, x11\n\t"
1877 "umulh x4, x16, x11\n\t"
1878 "adds x9, x9, x3\n\t"
1879 "adcs %x[a], %x[a], x4\n\t"
1880 "adc x26, x26, xzr\n\t"
1881 /* A[3] * B[0] */
1882 "mul x3, x17, x10\n\t"
1883 "umulh x4, x17, x10\n\t"
1884 "adds x9, x9, x3\n\t"
1885 "adcs %x[a], %x[a], x4\n\t"
1886 "adc x26, x26, xzr\n\t"
1887 /* A[1] * B[3] */
1888 "mul x3, x15, x13\n\t"
1889 "umulh x4, x15, x13\n\t"
1890 "adds %x[a], %x[a], x3\n\t"
1891 "adcs x26, x26, x4\n\t"
1892 "adc x27, xzr, xzr\n\t"
1893 /* A[2] * B[2] */
1894 "mul x3, x16, x12\n\t"
1895 "umulh x4, x16, x12\n\t"
1896 "adds %x[a], %x[a], x3\n\t"
1897 "adcs x26, x26, x4\n\t"
1898 "adc x27, x27, xzr\n\t"
1899 /* A[3] * B[1] */
1900 "mul x3, x17, x11\n\t"
1901 "umulh x4, x17, x11\n\t"
1902 "adds %x[a], %x[a], x3\n\t"
1903 "adcs x26, x26, x4\n\t"
1904 "adc x27, x27, xzr\n\t"
1905 /* A[2] * B[3] */
1906 "mul x3, x16, x13\n\t"
1907 "umulh x4, x16, x13\n\t"
1908 "adds x26, x26, x3\n\t"
1909 "adcs x27, x27, x4\n\t"
1910 "adc x28, xzr, xzr\n\t"
1911 /* A[3] * B[2] */
1912 "mul x3, x17, x12\n\t"
1913 "umulh x4, x17, x12\n\t"
1914 "adds x26, x26, x3\n\t"
1915 "adcs x27, x27, x4\n\t"
1916 "adc x28, x28, xzr\n\t"
1917 /* A[3] * B[3] */
1918 "mul x3, x17, x13\n\t"
1919 "umulh x4, x17, x13\n\t"
1920 "adds x27, x27, x3\n\t"
1921 "adc x28, x28, x4\n\t"
1922 /* Reduce */
1923 /* Move top half into t4-t7 and remove top bit from t3 */
1924 "extr x28, x28, x27, #63\n\t"
1925 "extr x27, x27, x26, #63\n\t"
1926 "extr x26, x26, %x[a], #63\n\t"
1927 "extr %x[a], %x[a], x9, #63\n\t"
1928 "and x9, x9, #0x7fffffffffffffff\n\t"
1929 /* Multiply top half by 19 */
1930 "mov x3, #19\n\t"
1931 "mul x4, x3, %x[a]\n\t"
1932 "umulh %x[a], x3, %x[a]\n\t"
1933 "adds x6, x6, x4\n\t"
1934 "mul x4, x3, x26\n\t"
1935 "umulh x26, x3, x26\n\t"
1936 "adcs x7, x7, x4\n\t"
1937 "mul x4, x3, x27\n\t"
1938 "umulh x27, x3, x27\n\t"
1939 "adcs x8, x8, x4\n\t"
1940 "mul x4, x3, x28\n\t"
1941 "umulh x5, x3, x28\n\t"
1942 "adcs x9, x9, x4\n\t"
1943 "adc x5, x5, xzr\n\t"
1944 /* Add remaining product results in */
1945 "adds x7, x7, %x[a]\n\t"
1946 "adcs x8, x8, x26\n\t"
1947 "adcs x9, x9, x27\n\t"
1948 "adc x5, x5, xzr\n\t"
1949 /* Overflow */
1950 "extr x5, x5, x9, #63\n\t"
1951 "mul x5, x5, x3\n\t"
1952 "and x9, x9, #0x7fffffffffffffff\n\t"
1953 "adds x6, x6, x5\n\t"
1954 "adcs x7, x7, xzr\n\t"
1955 "adcs x8, x8, xzr\n\t"
1956 "adc x9, x9, xzr\n\t"
1957 /* Reduce if top bit set */
1958 "and x5, x3, x9, asr 63\n\t"
1959 "and x9, x9, #0x7fffffffffffffff\n\t"
1960 "adds x6, x6, x5\n\t"
1961 "adcs x7, x7, xzr\n\t"
1962 "adcs x8, x8, xzr\n\t"
1963 "adc x9, x9, xzr\n\t"
1964 /* Store */
1965 "stp x6, x7, [x29, #16]\n\t"
1966 "stp x8, x9, [x29, #32]\n\t"
1967 /* Add */
1968 "ldp x6, x7, [x29, #112]\n\t"
1969 "ldp x8, x9, [x29, #128]\n\t"
1970 "adds x10, x6, x19\n\t"
1971 "adcs x11, x7, x20\n\t"
1972 "adcs x12, x8, x21\n\t"
1973 "adc x13, x9, x22\n\t"
1974 "mov x3, #-19\n\t"
1975 "asr %x[a], x13, #63\n\t"
1976 /* Mask the modulus */
1977 "and x3, %x[a], x3\n\t"
1978 "and x4, %x[a], #0x7fffffffffffffff\n\t"
1979 /* Sub modulus (if overflow) */
1980 "subs x10, x10, x3\n\t"
1981 "sbcs x11, x11, %x[a]\n\t"
1982 "sbcs x12, x12, %x[a]\n\t"
1983 "sbc x13, x13, x4\n\t"
1984 /* Sub */
1985 "subs x19, x6, x19\n\t"
1986 "sbcs x20, x7, x20\n\t"
1987 "sbcs x21, x8, x21\n\t"
1988 "sbcs x22, x9, x22\n\t"
1989 "mov x3, #-19\n\t"
1990 "csetm %x[a], cc\n\t"
1991 /* Mask the modulus */
1992 "and x3, %x[a], x3\n\t"
1993 "and x4, %x[a], #0x7fffffffffffffff\n\t"
1994 /* Add modulus (if underflow) */
1995 "adds x19, x19, x3\n\t"
1996 "adcs x20, x20, %x[a]\n\t"
1997 "adcs x21, x21, %x[a]\n\t"
1998 "adc x22, x22, x4\n\t"
1999 /* Square */
2000 /* A[0] * A[1] */
2001 "mul x7, x10, x11\n\t"
2002 "umulh x8, x10, x11\n\t"
2003 /* A[0] * A[2] */
2004 "mul x3, x10, x12\n\t"
2005 "umulh x9, x10, x12\n\t"
2006 "adds x8, x8, x3\n\t"
2007 "adc x9, x9, xzr\n\t"
2008 /* A[0] * A[3] */
2009 "mul x3, x10, x13\n\t"
2010 "umulh %x[a], x10, x13\n\t"
2011 "adds x9, x9, x3\n\t"
2012 "adc %x[a], %x[a], xzr\n\t"
2013 /* A[1] * A[2] */
2014 "mul x3, x11, x12\n\t"
2015 "umulh x4, x11, x12\n\t"
2016 "adds x9, x9, x3\n\t"
2017 "adcs %x[a], %x[a], x4\n\t"
2018 "adc x26, xzr, xzr\n\t"
2019 /* A[1] * A[3] */
2020 "mul x3, x11, x13\n\t"
2021 "umulh x4, x11, x13\n\t"
2022 "adds %x[a], %x[a], x3\n\t"
2023 "adc x26, x26, x4\n\t"
2024 /* A[2] * A[3] */
2025 "mul x3, x12, x13\n\t"
2026 "umulh x27, x12, x13\n\t"
2027 "adds x26, x26, x3\n\t"
2028 "adc x27, x27, xzr\n\t"
2029 /* Double */
2030 "adds x7, x7, x7\n\t"
2031 "adcs x8, x8, x8\n\t"
2032 "adcs x9, x9, x9\n\t"
2033 "adcs %x[a], %x[a], %x[a]\n\t"
2034 "adcs x26, x26, x26\n\t"
2035 "adcs x27, x27, x27\n\t"
2036 "adc x28, xzr, xzr\n\t"
2037 /* A[0] * A[0] */
2038 "mul x6, x10, x10\n\t"
2039 "umulh x5, x10, x10\n\t"
2040 /* A[1] * A[1] */
2041 "mul x3, x11, x11\n\t"
2042 "umulh x4, x11, x11\n\t"
2043 "adds x7, x7, x5\n\t"
2044 "adcs x8, x8, x3\n\t"
2045 "adc x5, x4, xzr\n\t"
2046 /* A[2] * A[2] */
2047 "mul x3, x12, x12\n\t"
2048 "umulh x4, x12, x12\n\t"
2049 "adds x9, x9, x5\n\t"
2050 "adcs %x[a], %x[a], x3\n\t"
2051 "adc x5, x4, xzr\n\t"
2052 /* A[3] * A[3] */
2053 "mul x3, x13, x13\n\t"
2054 "umulh x4, x13, x13\n\t"
2055 "adds x26, x26, x5\n\t"
2056 "adcs x27, x27, x3\n\t"
2057 "adc x28, x28, x4\n\t"
2058 /* Reduce */
2059 /* Move top half into t4-t7 and remove top bit from t3 */
2060 "extr x28, x28, x27, #63\n\t"
2061 "extr x27, x27, x26, #63\n\t"
2062 "extr x26, x26, %x[a], #63\n\t"
2063 "extr %x[a], %x[a], x9, #63\n\t"
2064 "and x9, x9, #0x7fffffffffffffff\n\t"
2065 /* Multiply top half by 19 */
2066 "mov x3, #19\n\t"
2067 "mul x4, x3, %x[a]\n\t"
2068 "umulh %x[a], x3, %x[a]\n\t"
2069 "adds x6, x6, x4\n\t"
2070 "mul x4, x3, x26\n\t"
2071 "umulh x26, x3, x26\n\t"
2072 "adcs x7, x7, x4\n\t"
2073 "mul x4, x3, x27\n\t"
2074 "umulh x27, x3, x27\n\t"
2075 "adcs x8, x8, x4\n\t"
2076 "mul x4, x3, x28\n\t"
2077 "umulh x5, x3, x28\n\t"
2078 "adcs x9, x9, x4\n\t"
2079 "adc x5, x5, xzr\n\t"
2080 /* Add remaining product results in */
2081 "adds x7, x7, %x[a]\n\t"
2082 "adcs x8, x8, x26\n\t"
2083 "adcs x9, x9, x27\n\t"
2084 "adc x5, x5, xzr\n\t"
2085 /* Overflow */
2086 "extr x5, x5, x9, #63\n\t"
2087 "mul x5, x5, x3\n\t"
2088 "and x9, x9, #0x7fffffffffffffff\n\t"
2089 "adds x6, x6, x5\n\t"
2090 "adcs x7, x7, xzr\n\t"
2091 "adcs x8, x8, xzr\n\t"
2092 "adc x9, x9, xzr\n\t"
2093 /* Reduce if top bit set */
2094 "and x5, x3, x9, asr 63\n\t"
2095 "and x9, x9, #0x7fffffffffffffff\n\t"
2096 "adds x6, x6, x5\n\t"
2097 "adcs x7, x7, xzr\n\t"
2098 "adcs x8, x8, xzr\n\t"
2099 "adc x9, x9, xzr\n\t"
2100 /* Store */
2101 "stp x6, x7, [x29, #80]\n\t"
2102 "stp x8, x9, [x29, #96]\n\t"
2103 /* Square */
2104 /* A[0] * A[1] */
2105 "mul x7, x19, x20\n\t"
2106 "umulh x8, x19, x20\n\t"
2107 /* A[0] * A[2] */
2108 "mul x3, x19, x21\n\t"
2109 "umulh x9, x19, x21\n\t"
2110 "adds x8, x8, x3\n\t"
2111 "adc x9, x9, xzr\n\t"
2112 /* A[0] * A[3] */
2113 "mul x3, x19, x22\n\t"
2114 "umulh %x[a], x19, x22\n\t"
2115 "adds x9, x9, x3\n\t"
2116 "adc %x[a], %x[a], xzr\n\t"
2117 /* A[1] * A[2] */
2118 "mul x3, x20, x21\n\t"
2119 "umulh x4, x20, x21\n\t"
2120 "adds x9, x9, x3\n\t"
2121 "adcs %x[a], %x[a], x4\n\t"
2122 "adc x26, xzr, xzr\n\t"
2123 /* A[1] * A[3] */
2124 "mul x3, x20, x22\n\t"
2125 "umulh x4, x20, x22\n\t"
2126 "adds %x[a], %x[a], x3\n\t"
2127 "adc x26, x26, x4\n\t"
2128 /* A[2] * A[3] */
2129 "mul x3, x21, x22\n\t"
2130 "umulh x27, x21, x22\n\t"
2131 "adds x26, x26, x3\n\t"
2132 "adc x27, x27, xzr\n\t"
2133 /* Double */
2134 "adds x7, x7, x7\n\t"
2135 "adcs x8, x8, x8\n\t"
2136 "adcs x9, x9, x9\n\t"
2137 "adcs %x[a], %x[a], %x[a]\n\t"
2138 "adcs x26, x26, x26\n\t"
2139 "adcs x27, x27, x27\n\t"
2140 "adc x28, xzr, xzr\n\t"
2141 /* A[0] * A[0] */
2142 "mul x6, x19, x19\n\t"
2143 "umulh x5, x19, x19\n\t"
2144 /* A[1] * A[1] */
2145 "mul x3, x20, x20\n\t"
2146 "umulh x4, x20, x20\n\t"
2147 "adds x7, x7, x5\n\t"
2148 "adcs x8, x8, x3\n\t"
2149 "adc x5, x4, xzr\n\t"
2150 /* A[2] * A[2] */
2151 "mul x3, x21, x21\n\t"
2152 "umulh x4, x21, x21\n\t"
2153 "adds x9, x9, x5\n\t"
2154 "adcs %x[a], %x[a], x3\n\t"
2155 "adc x5, x4, xzr\n\t"
2156 /* A[3] * A[3] */
2157 "mul x3, x22, x22\n\t"
2158 "umulh x4, x22, x22\n\t"
2159 "adds x26, x26, x5\n\t"
2160 "adcs x27, x27, x3\n\t"
2161 "adc x28, x28, x4\n\t"
2162 /* Reduce */
2163 /* Move top half into t4-t7 and remove top bit from t3 */
2164 "extr x28, x28, x27, #63\n\t"
2165 "extr x27, x27, x26, #63\n\t"
2166 "extr x26, x26, %x[a], #63\n\t"
2167 "extr %x[a], %x[a], x9, #63\n\t"
2168 "and x9, x9, #0x7fffffffffffffff\n\t"
2169 /* Multiply top half by 19 */
2170 "mov x3, #19\n\t"
2171 "mul x4, x3, %x[a]\n\t"
2172 "umulh %x[a], x3, %x[a]\n\t"
2173 "adds x6, x6, x4\n\t"
2174 "mul x4, x3, x26\n\t"
2175 "umulh x26, x3, x26\n\t"
2176 "adcs x7, x7, x4\n\t"
2177 "mul x4, x3, x27\n\t"
2178 "umulh x27, x3, x27\n\t"
2179 "adcs x8, x8, x4\n\t"
2180 "mul x4, x3, x28\n\t"
2181 "umulh x5, x3, x28\n\t"
2182 "adcs x9, x9, x4\n\t"
2183 "adc x5, x5, xzr\n\t"
2184 /* Add remaining product results in */
2185 "adds x7, x7, %x[a]\n\t"
2186 "adcs x8, x8, x26\n\t"
2187 "adcs x9, x9, x27\n\t"
2188 "adc x5, x5, xzr\n\t"
2189 /* Overflow */
2190 "extr x5, x5, x9, #63\n\t"
2191 "mul x5, x5, x3\n\t"
2192 "and x9, x9, #0x7fffffffffffffff\n\t"
2193 "adds x6, x6, x5\n\t"
2194 "adcs x7, x7, xzr\n\t"
2195 "adcs x8, x8, xzr\n\t"
2196 "adc x9, x9, xzr\n\t"
2197 /* Reduce if top bit set */
2198 "and x5, x3, x9, asr 63\n\t"
2199 "and x9, x9, #0x7fffffffffffffff\n\t"
2200 "adds x6, x6, x5\n\t"
2201 "adcs x7, x7, xzr\n\t"
2202 "adcs x8, x8, xzr\n\t"
2203 "adc x9, x9, xzr\n\t"
2204 /* Store */
2205 "ldr %x[a], [x29, #184]\n\t"
2206 /* Multiply */
2207 "ldp x14, x15, [%x[a]]\n\t"
2208 "ldp x16, x17, [%x[a], #16]\n\t"
2209 /* A[0] * B[0] */
2210 "mul x10, x14, x6\n\t"
2211 "umulh x11, x14, x6\n\t"
2212 /* A[0] * B[1] */
2213 "mul x3, x14, x7\n\t"
2214 "umulh x12, x14, x7\n\t"
2215 "adds x11, x11, x3\n\t"
2216 "adc x12, x12, xzr\n\t"
2217 /* A[1] * B[0] */
2218 "mul x3, x15, x6\n\t"
2219 "umulh x4, x15, x6\n\t"
2220 "adds x11, x11, x3\n\t"
2221 "adcs x12, x12, x4\n\t"
2222 "adc x13, xzr, xzr\n\t"
2223 /* A[0] * B[2] */
2224 "mul x3, x14, x8\n\t"
2225 "umulh x4, x14, x8\n\t"
2226 "adds x12, x12, x3\n\t"
2227 "adc x13, x13, x4\n\t"
2228 /* A[1] * B[1] */
2229 "mul x3, x15, x7\n\t"
2230 "umulh x4, x15, x7\n\t"
2231 "adds x12, x12, x3\n\t"
2232 "adcs x13, x13, x4\n\t"
2233 "adc %x[a], xzr, xzr\n\t"
2234 /* A[2] * B[0] */
2235 "mul x3, x16, x6\n\t"
2236 "umulh x4, x16, x6\n\t"
2237 "adds x12, x12, x3\n\t"
2238 "adcs x13, x13, x4\n\t"
2239 "adc %x[a], %x[a], xzr\n\t"
2240 /* A[0] * B[3] */
2241 "mul x3, x14, x9\n\t"
2242 "umulh x4, x14, x9\n\t"
2243 "adds x13, x13, x3\n\t"
2244 "adcs %x[a], %x[a], x4\n\t"
2245 "adc x26, xzr, xzr\n\t"
2246 /* A[1] * B[2] */
2247 "mul x3, x15, x8\n\t"
2248 "umulh x4, x15, x8\n\t"
2249 "adds x13, x13, x3\n\t"
2250 "adcs %x[a], %x[a], x4\n\t"
2251 "adc x26, x26, xzr\n\t"
2252 /* A[2] * B[1] */
2253 "mul x3, x16, x7\n\t"
2254 "umulh x4, x16, x7\n\t"
2255 "adds x13, x13, x3\n\t"
2256 "adcs %x[a], %x[a], x4\n\t"
2257 "adc x26, x26, xzr\n\t"
2258 /* A[3] * B[0] */
2259 "mul x3, x17, x6\n\t"
2260 "umulh x4, x17, x6\n\t"
2261 "adds x13, x13, x3\n\t"
2262 "adcs %x[a], %x[a], x4\n\t"
2263 "adc x26, x26, xzr\n\t"
2264 /* A[1] * B[3] */
2265 "mul x3, x15, x9\n\t"
2266 "umulh x4, x15, x9\n\t"
2267 "adds %x[a], %x[a], x3\n\t"
2268 "adcs x26, x26, x4\n\t"
2269 "adc x27, xzr, xzr\n\t"
2270 /* A[2] * B[2] */
2271 "mul x3, x16, x8\n\t"
2272 "umulh x4, x16, x8\n\t"
2273 "adds %x[a], %x[a], x3\n\t"
2274 "adcs x26, x26, x4\n\t"
2275 "adc x27, x27, xzr\n\t"
2276 /* A[3] * B[1] */
2277 "mul x3, x17, x7\n\t"
2278 "umulh x4, x17, x7\n\t"
2279 "adds %x[a], %x[a], x3\n\t"
2280 "adcs x26, x26, x4\n\t"
2281 "adc x27, x27, xzr\n\t"
2282 /* A[2] * B[3] */
2283 "mul x3, x16, x9\n\t"
2284 "umulh x4, x16, x9\n\t"
2285 "adds x26, x26, x3\n\t"
2286 "adcs x27, x27, x4\n\t"
2287 "adc x28, xzr, xzr\n\t"
2288 /* A[3] * B[2] */
2289 "mul x3, x17, x8\n\t"
2290 "umulh x4, x17, x8\n\t"
2291 "adds x26, x26, x3\n\t"
2292 "adcs x27, x27, x4\n\t"
2293 "adc x28, x28, xzr\n\t"
2294 /* A[3] * B[3] */
2295 "mul x3, x17, x9\n\t"
2296 "umulh x4, x17, x9\n\t"
2297 "adds x27, x27, x3\n\t"
2298 "adc x28, x28, x4\n\t"
2299 /* Reduce */
2300 /* Move top half into t4-t7 and remove top bit from t3 */
2301 "extr x28, x28, x27, #63\n\t"
2302 "extr x27, x27, x26, #63\n\t"
2303 "extr x26, x26, %x[a], #63\n\t"
2304 "extr %x[a], %x[a], x13, #63\n\t"
2305 "and x13, x13, #0x7fffffffffffffff\n\t"
2306 /* Multiply top half by 19 */
2307 "mov x3, #19\n\t"
2308 "mul x4, x3, %x[a]\n\t"
2309 "umulh %x[a], x3, %x[a]\n\t"
2310 "adds x10, x10, x4\n\t"
2311 "mul x4, x3, x26\n\t"
2312 "umulh x26, x3, x26\n\t"
2313 "adcs x11, x11, x4\n\t"
2314 "mul x4, x3, x27\n\t"
2315 "umulh x27, x3, x27\n\t"
2316 "adcs x12, x12, x4\n\t"
2317 "mul x4, x3, x28\n\t"
2318 "umulh x5, x3, x28\n\t"
2319 "adcs x13, x13, x4\n\t"
2320 "adc x5, x5, xzr\n\t"
2321 /* Add remaining product results in */
2322 "adds x11, x11, %x[a]\n\t"
2323 "adcs x12, x12, x26\n\t"
2324 "adcs x13, x13, x27\n\t"
2325 "adc x5, x5, xzr\n\t"
2326 /* Overflow */
2327 "extr x5, x5, x13, #63\n\t"
2328 "mul x5, x5, x3\n\t"
2329 "and x13, x13, #0x7fffffffffffffff\n\t"
2330 "adds x10, x10, x5\n\t"
2331 "adcs x11, x11, xzr\n\t"
2332 "adcs x12, x12, xzr\n\t"
2333 "adc x13, x13, xzr\n\t"
2334 /* Reduce if top bit set */
2335 "and x5, x3, x13, asr 63\n\t"
2336 "and x13, x13, #0x7fffffffffffffff\n\t"
2337 "adds x10, x10, x5\n\t"
2338 "adcs x11, x11, xzr\n\t"
2339 "adcs x12, x12, xzr\n\t"
2340 "adc x13, x13, xzr\n\t"
2341 /* Store */
2342 "stp x10, x11, [x29, #48]\n\t"
2343 "stp x12, x13, [x29, #64]\n\t"
2344 "sub x25, x25, #1\n\t"
2345 "cmp x25, #0\n\t"
2346 "bge L_curve25519_bits_%=\n\t"
2347 "mov x25, #63\n\t"
2348 "sub x24, x24, #8\n\t"
2349 "cmp x24, #0\n\t"
2350 "bge L_curve25519_words_%=\n\t"
2351 /* Invert */
2352 "add x0, x29, #48\n\t"
2353 "add x1, x29, #16\n\t"
2354 #ifndef __APPLE__
2355 "bl fe_sq\n\t"
2356 #else
2357 "bl _fe_sq\n\t"
2358 #endif /* __APPLE__ */
2359 "add x0, x29, #0x50\n\t"
2360 "add x1, x29, #48\n\t"
2361 #ifndef __APPLE__
2362 "bl fe_sq\n\t"
2363 #else
2364 "bl _fe_sq\n\t"
2365 #endif /* __APPLE__ */
2366 #ifndef NDEBUG
2367 "add x0, x29, #0x50\n\t"
2368 #endif /* !NDEBUG */
2369 "add x1, x29, #0x50\n\t"
2370 #ifndef __APPLE__
2371 "bl fe_sq\n\t"
2372 #else
2373 "bl _fe_sq\n\t"
2374 #endif /* __APPLE__ */
2375 #ifndef NDEBUG
2376 "add x0, x29, #0x50\n\t"
2377 #endif /* !NDEBUG */
2378 "add x1, x29, #16\n\t"
2379 "add x2, x29, #0x50\n\t"
2380 #ifndef __APPLE__
2381 "bl fe_mul\n\t"
2382 #else
2383 "bl _fe_mul\n\t"
2384 #endif /* __APPLE__ */
2385 "add x0, x29, #48\n\t"
2386 "add x1, x29, #48\n\t"
2387 "add x2, x29, #0x50\n\t"
2388 #ifndef __APPLE__
2389 "bl fe_mul\n\t"
2390 #else
2391 "bl _fe_mul\n\t"
2392 #endif /* __APPLE__ */
2393 "add x0, x29, #0x70\n\t"
2394 #ifndef NDEBUG
2395 "add x1, x29, #48\n\t"
2396 #endif /* !NDEBUG */
2397 #ifndef __APPLE__
2398 "bl fe_sq\n\t"
2399 #else
2400 "bl _fe_sq\n\t"
2401 #endif /* __APPLE__ */
2402 "add x0, x29, #0x50\n\t"
2403 "add x1, x29, #0x50\n\t"
2404 "add x2, x29, #0x70\n\t"
2405 #ifndef __APPLE__
2406 "bl fe_mul\n\t"
2407 #else
2408 "bl _fe_mul\n\t"
2409 #endif /* __APPLE__ */
2410 "add x0, x29, #0x70\n\t"
2411 #ifndef NDEBUG
2412 "add x1, x29, #0x50\n\t"
2413 #endif /* !NDEBUG */
2414 #ifndef __APPLE__
2415 "bl fe_sq\n\t"
2416 #else
2417 "bl _fe_sq\n\t"
2418 #endif /* __APPLE__ */
2419 "mov x24, #3\n\t"
2420 #ifndef NDEBUG
2421 "add x0, x29, #0x70\n\t"
2422 #endif /* !NDEBUG */
2423 "add x1, x29, #0x70\n\t"
2424 "\n"
2425 "L_curve25519_inv_1_%=: \n\t"
2426 #ifndef __APPLE__
2427 "bl fe_sq\n\t"
2428 #else
2429 "bl _fe_sq\n\t"
2430 #endif /* __APPLE__ */
2431 "subs x24, x24, #1\n\t"
2432 "bcs L_curve25519_inv_1_%=\n\t"
2433 "add x0, x29, #0x50\n\t"
2434 #ifndef NDEBUG
2435 "add x1, x29, #0x70\n\t"
2436 #endif /* !NDEBUG */
2437 "add x2, x29, #0x50\n\t"
2438 #ifndef __APPLE__
2439 "bl fe_mul\n\t"
2440 #else
2441 "bl _fe_mul\n\t"
2442 #endif /* __APPLE__ */
2443 "add x0, x29, #0x70\n\t"
2444 "add x1, x29, #0x50\n\t"
2445 #ifndef __APPLE__
2446 "bl fe_sq\n\t"
2447 #else
2448 "bl _fe_sq\n\t"
2449 #endif /* __APPLE__ */
2450 "mov x24, #8\n\t"
2451 #ifndef NDEBUG
2452 "add x0, x29, #0x70\n\t"
2453 #endif /* !NDEBUG */
2454 "add x1, x29, #0x70\n\t"
2455 "\n"
2456 "L_curve25519_inv_2_%=: \n\t"
2457 #ifndef __APPLE__
2458 "bl fe_sq\n\t"
2459 #else
2460 "bl _fe_sq\n\t"
2461 #endif /* __APPLE__ */
2462 "subs x24, x24, #1\n\t"
2463 "bcs L_curve25519_inv_2_%=\n\t"
2464 #ifndef NDEBUG
2465 "add x0, x29, #0x70\n\t"
2466 #endif /* !NDEBUG */
2467 #ifndef NDEBUG
2468 "add x1, x29, #0x70\n\t"
2469 #endif /* !NDEBUG */
2470 "add x2, x29, #0x50\n\t"
2471 #ifndef __APPLE__
2472 "bl fe_mul\n\t"
2473 #else
2474 "bl _fe_mul\n\t"
2475 #endif /* __APPLE__ */
2476 "add x0, x29, #0x90\n\t"
2477 #ifndef NDEBUG
2478 "add x1, x29, #0x70\n\t"
2479 #endif /* !NDEBUG */
2480 #ifndef __APPLE__
2481 "bl fe_sq\n\t"
2482 #else
2483 "bl _fe_sq\n\t"
2484 #endif /* __APPLE__ */
2485 "mov x24, #18\n\t"
2486 #ifndef NDEBUG
2487 "add x0, x29, #0x90\n\t"
2488 #endif /* !NDEBUG */
2489 "add x1, x29, #0x90\n\t"
2490 "\n"
2491 "L_curve25519_inv_3_%=: \n\t"
2492 #ifndef __APPLE__
2493 "bl fe_sq\n\t"
2494 #else
2495 "bl _fe_sq\n\t"
2496 #endif /* __APPLE__ */
2497 "subs x24, x24, #1\n\t"
2498 "bcs L_curve25519_inv_3_%=\n\t"
2499 "add x0, x29, #0x70\n\t"
2500 #ifndef NDEBUG
2501 "add x1, x29, #0x90\n\t"
2502 #endif /* !NDEBUG */
2503 "add x2, x29, #0x70\n\t"
2504 #ifndef __APPLE__
2505 "bl fe_mul\n\t"
2506 #else
2507 "bl _fe_mul\n\t"
2508 #endif /* __APPLE__ */
2509 "mov x24, #9\n\t"
2510 #ifndef NDEBUG
2511 "add x0, x29, #0x70\n\t"
2512 #endif /* !NDEBUG */
2513 "add x1, x29, #0x70\n\t"
2514 "\n"
2515 "L_curve25519_inv_4_%=: \n\t"
2516 #ifndef __APPLE__
2517 "bl fe_sq\n\t"
2518 #else
2519 "bl _fe_sq\n\t"
2520 #endif /* __APPLE__ */
2521 "subs x24, x24, #1\n\t"
2522 "bcs L_curve25519_inv_4_%=\n\t"
2523 "add x0, x29, #0x50\n\t"
2524 #ifndef NDEBUG
2525 "add x1, x29, #0x70\n\t"
2526 #endif /* !NDEBUG */
2527 "add x2, x29, #0x50\n\t"
2528 #ifndef __APPLE__
2529 "bl fe_mul\n\t"
2530 #else
2531 "bl _fe_mul\n\t"
2532 #endif /* __APPLE__ */
2533 "add x0, x29, #0x70\n\t"
2534 "add x1, x29, #0x50\n\t"
2535 #ifndef __APPLE__
2536 "bl fe_sq\n\t"
2537 #else
2538 "bl _fe_sq\n\t"
2539 #endif /* __APPLE__ */
2540 "mov x24, #48\n\t"
2541 #ifndef NDEBUG
2542 "add x0, x29, #0x70\n\t"
2543 #endif /* !NDEBUG */
2544 "add x1, x29, #0x70\n\t"
2545 "\n"
2546 "L_curve25519_inv_5_%=: \n\t"
2547 #ifndef __APPLE__
2548 "bl fe_sq\n\t"
2549 #else
2550 "bl _fe_sq\n\t"
2551 #endif /* __APPLE__ */
2552 "subs x24, x24, #1\n\t"
2553 "bcs L_curve25519_inv_5_%=\n\t"
2554 #ifndef NDEBUG
2555 "add x0, x29, #0x70\n\t"
2556 #endif /* !NDEBUG */
2557 #ifndef NDEBUG
2558 "add x1, x29, #0x70\n\t"
2559 #endif /* !NDEBUG */
2560 "add x2, x29, #0x50\n\t"
2561 #ifndef __APPLE__
2562 "bl fe_mul\n\t"
2563 #else
2564 "bl _fe_mul\n\t"
2565 #endif /* __APPLE__ */
2566 "add x0, x29, #0x90\n\t"
2567 #ifndef NDEBUG
2568 "add x1, x29, #0x70\n\t"
2569 #endif /* !NDEBUG */
2570 #ifndef __APPLE__
2571 "bl fe_sq\n\t"
2572 #else
2573 "bl _fe_sq\n\t"
2574 #endif /* __APPLE__ */
2575 "mov x24, #0x62\n\t"
2576 #ifndef NDEBUG
2577 "add x0, x29, #0x90\n\t"
2578 #endif /* !NDEBUG */
2579 "add x1, x29, #0x90\n\t"
2580 "\n"
2581 "L_curve25519_inv_6_%=: \n\t"
2582 #ifndef __APPLE__
2583 "bl fe_sq\n\t"
2584 #else
2585 "bl _fe_sq\n\t"
2586 #endif /* __APPLE__ */
2587 "subs x24, x24, #1\n\t"
2588 "bcs L_curve25519_inv_6_%=\n\t"
2589 "add x0, x29, #0x70\n\t"
2590 #ifndef NDEBUG
2591 "add x1, x29, #0x90\n\t"
2592 #endif /* !NDEBUG */
2593 "add x2, x29, #0x70\n\t"
2594 #ifndef __APPLE__
2595 "bl fe_mul\n\t"
2596 #else
2597 "bl _fe_mul\n\t"
2598 #endif /* __APPLE__ */
2599 "mov x24, #49\n\t"
2600 #ifndef NDEBUG
2601 "add x0, x29, #0x70\n\t"
2602 #endif /* !NDEBUG */
2603 "add x1, x29, #0x70\n\t"
2604 "\n"
2605 "L_curve25519_inv_7_%=: \n\t"
2606 #ifndef __APPLE__
2607 "bl fe_sq\n\t"
2608 #else
2609 "bl _fe_sq\n\t"
2610 #endif /* __APPLE__ */
2611 "subs x24, x24, #1\n\t"
2612 "bcs L_curve25519_inv_7_%=\n\t"
2613 "add x0, x29, #0x50\n\t"
2614 #ifndef NDEBUG
2615 "add x1, x29, #0x70\n\t"
2616 #endif /* !NDEBUG */
2617 "add x2, x29, #0x50\n\t"
2618 #ifndef __APPLE__
2619 "bl fe_mul\n\t"
2620 #else
2621 "bl _fe_mul\n\t"
2622 #endif /* __APPLE__ */
2623 "mov x24, #4\n\t"
2624 #ifndef NDEBUG
2625 "add x0, x29, #0x50\n\t"
2626 #endif /* !NDEBUG */
2627 "add x1, x29, #0x50\n\t"
2628 "\n"
2629 "L_curve25519_inv_8_%=: \n\t"
2630 #ifndef __APPLE__
2631 "bl fe_sq\n\t"
2632 #else
2633 "bl _fe_sq\n\t"
2634 #endif /* __APPLE__ */
2635 "subs x24, x24, #1\n\t"
2636 "bcs L_curve25519_inv_8_%=\n\t"
2637 "add x0, x29, #16\n\t"
2638 #ifndef NDEBUG
2639 "add x1, x29, #0x50\n\t"
2640 #endif /* !NDEBUG */
2641 "add x2, x29, #48\n\t"
2642 #ifndef __APPLE__
2643 "bl fe_mul\n\t"
2644 #else
2645 "bl _fe_mul\n\t"
2646 #endif /* __APPLE__ */
2647 "ldr %x[r], [x29, #176]\n\t"
2648 /* Multiply */
2649 "ldp x6, x7, [%x[r]]\n\t"
2650 "ldp x8, x9, [%x[r], #16]\n\t"
2651 "ldp x10, x11, [x29, #16]\n\t"
2652 "ldp x12, x13, [x29, #32]\n\t"
2653 /* A[0] * B[0] */
2654 "mul x14, x6, x10\n\t"
2655 "umulh x15, x6, x10\n\t"
2656 /* A[0] * B[1] */
2657 "mul x3, x6, x11\n\t"
2658 "umulh x16, x6, x11\n\t"
2659 "adds x15, x15, x3\n\t"
2660 "adc x16, x16, xzr\n\t"
2661 /* A[1] * B[0] */
2662 "mul x3, x7, x10\n\t"
2663 "umulh x4, x7, x10\n\t"
2664 "adds x15, x15, x3\n\t"
2665 "adcs x16, x16, x4\n\t"
2666 "adc x17, xzr, xzr\n\t"
2667 /* A[0] * B[2] */
2668 "mul x3, x6, x12\n\t"
2669 "umulh x4, x6, x12\n\t"
2670 "adds x16, x16, x3\n\t"
2671 "adc x17, x17, x4\n\t"
2672 /* A[1] * B[1] */
2673 "mul x3, x7, x11\n\t"
2674 "umulh x4, x7, x11\n\t"
2675 "adds x16, x16, x3\n\t"
2676 "adcs x17, x17, x4\n\t"
2677 "adc x19, xzr, xzr\n\t"
2678 /* A[2] * B[0] */
2679 "mul x3, x8, x10\n\t"
2680 "umulh x4, x8, x10\n\t"
2681 "adds x16, x16, x3\n\t"
2682 "adcs x17, x17, x4\n\t"
2683 "adc x19, x19, xzr\n\t"
2684 /* A[0] * B[3] */
2685 "mul x3, x6, x13\n\t"
2686 "umulh x4, x6, x13\n\t"
2687 "adds x17, x17, x3\n\t"
2688 "adcs x19, x19, x4\n\t"
2689 "adc x20, xzr, xzr\n\t"
2690 /* A[1] * B[2] */
2691 "mul x3, x7, x12\n\t"
2692 "umulh x4, x7, x12\n\t"
2693 "adds x17, x17, x3\n\t"
2694 "adcs x19, x19, x4\n\t"
2695 "adc x20, x20, xzr\n\t"
2696 /* A[2] * B[1] */
2697 "mul x3, x8, x11\n\t"
2698 "umulh x4, x8, x11\n\t"
2699 "adds x17, x17, x3\n\t"
2700 "adcs x19, x19, x4\n\t"
2701 "adc x20, x20, xzr\n\t"
2702 /* A[3] * B[0] */
2703 "mul x3, x9, x10\n\t"
2704 "umulh x4, x9, x10\n\t"
2705 "adds x17, x17, x3\n\t"
2706 "adcs x19, x19, x4\n\t"
2707 "adc x20, x20, xzr\n\t"
2708 /* A[1] * B[3] */
2709 "mul x3, x7, x13\n\t"
2710 "umulh x4, x7, x13\n\t"
2711 "adds x19, x19, x3\n\t"
2712 "adcs x20, x20, x4\n\t"
2713 "adc x21, xzr, xzr\n\t"
2714 /* A[2] * B[2] */
2715 "mul x3, x8, x12\n\t"
2716 "umulh x4, x8, x12\n\t"
2717 "adds x19, x19, x3\n\t"
2718 "adcs x20, x20, x4\n\t"
2719 "adc x21, x21, xzr\n\t"
2720 /* A[3] * B[1] */
2721 "mul x3, x9, x11\n\t"
2722 "umulh x4, x9, x11\n\t"
2723 "adds x19, x19, x3\n\t"
2724 "adcs x20, x20, x4\n\t"
2725 "adc x21, x21, xzr\n\t"
2726 /* A[2] * B[3] */
2727 "mul x3, x8, x13\n\t"
2728 "umulh x4, x8, x13\n\t"
2729 "adds x20, x20, x3\n\t"
2730 "adcs x21, x21, x4\n\t"
2731 "adc x22, xzr, xzr\n\t"
2732 /* A[3] * B[2] */
2733 "mul x3, x9, x12\n\t"
2734 "umulh x4, x9, x12\n\t"
2735 "adds x20, x20, x3\n\t"
2736 "adcs x21, x21, x4\n\t"
2737 "adc x22, x22, xzr\n\t"
2738 /* A[3] * B[3] */
2739 "mul x3, x9, x13\n\t"
2740 "umulh x4, x9, x13\n\t"
2741 "adds x21, x21, x3\n\t"
2742 "adc x22, x22, x4\n\t"
2743 /* Reduce */
2744 /* Move top half into t4-t7 and remove top bit from t3 */
2745 "extr x22, x22, x21, #63\n\t"
2746 "extr x21, x21, x20, #63\n\t"
2747 "extr x20, x20, x19, #63\n\t"
2748 "extr x19, x19, x17, #63\n\t"
2749 "and x17, x17, #0x7fffffffffffffff\n\t"
2750 /* Multiply top half by 19 */
2751 "mov x3, #19\n\t"
2752 "mul x4, x3, x19\n\t"
2753 "umulh x19, x3, x19\n\t"
2754 "adds x14, x14, x4\n\t"
2755 "mul x4, x3, x20\n\t"
2756 "umulh x20, x3, x20\n\t"
2757 "adcs x15, x15, x4\n\t"
2758 "mul x4, x3, x21\n\t"
2759 "umulh x21, x3, x21\n\t"
2760 "adcs x16, x16, x4\n\t"
2761 "mul x4, x3, x22\n\t"
2762 "umulh x5, x3, x22\n\t"
2763 "adcs x17, x17, x4\n\t"
2764 "adc x5, x5, xzr\n\t"
2765 /* Add remaining product results in */
2766 "adds x15, x15, x19\n\t"
2767 "adcs x16, x16, x20\n\t"
2768 "adcs x17, x17, x21\n\t"
2769 "adc x5, x5, xzr\n\t"
2770 /* Overflow */
2771 "extr x5, x5, x17, #63\n\t"
2772 "mul x5, x5, x3\n\t"
2773 "and x17, x17, #0x7fffffffffffffff\n\t"
2774 "adds x14, x14, x5\n\t"
2775 "adcs x15, x15, xzr\n\t"
2776 "adcs x16, x16, xzr\n\t"
2777 "adc x17, x17, xzr\n\t"
2778 /* Reduce if top bit set */
2779 "and x5, x3, x17, asr 63\n\t"
2780 "and x17, x17, #0x7fffffffffffffff\n\t"
2781 "adds x14, x14, x5\n\t"
2782 "adcs x15, x15, xzr\n\t"
2783 "adcs x16, x16, xzr\n\t"
2784 "adc x17, x17, xzr\n\t"
2785 /* Store */
2786 "stp x14, x15, [%x[r]]\n\t"
2787 "stp x16, x17, [%x[r], #16]\n\t"
2788 "mov x0, xzr\n\t"
2789 "ldp x29, x30, [sp], #0xc0\n\t"
2790 : [r] "+r" (r), [n] "+r" (n), [a] "+r" (a)
2791 :
2792 : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
2793 );
2794 return (uint32_t)(size_t)r;
2795 }
2796
/* Compute r = a^(2^252 - 3) with a fixed chain of fe_sq / fe_mul calls.
 *
 * The exponent follows from the call sequence below, assuming fe_sq(out, in)
 * squares and fe_mul(out, in1, in2) multiplies field elements. For
 * p = 2^255 - 19 this exponent equals (p - 5) / 8.
 *
 * r  Destination field element (pointer saved at [x29, #112]).
 * a  Source field element      (pointer saved at [x29, #120]).
 *
 * Stack frame (128 bytes, x29 = sp on entry to the chain):
 *   [x29, #0]    saved x29 / x30
 *   [x29, #16]   t0 (32 bytes)
 *   [x29, #48]   t1 (32 bytes)
 *   [x29, #0x50] t2 (32 bytes)
 *   [x29, #112]  r
 *   [x29, #120]  a
 *
 * Every "subs x23, x23, #1 / bcs" loop executes its body (x23 + 1) times,
 * because the branch is taken while the subtraction does not borrow.
 *
 * NOTE(review): argument registers guarded by "#ifndef NDEBUG" are only
 * reloaded in debug builds; NDEBUG builds rely on fe_sq/fe_mul leaving x0/x1
 * unchanged, and on the compiler having placed r in x0 and a in x1 for the
 * first call - confirm against the fe_sq/fe_mul implementations.
 */
void fe_pow22523(fe r, const fe a)
{
    __asm__ __volatile__ (
        /* Create the frame and save the frame pointer and link register,
         * since the bl instructions below overwrite x30. */
        "stp x29, x30, [sp, #-128]!\n\t"
        "add x29, sp, #0\n\t"
        /* pow22523 */
        /* Keep the argument pointers on the stack across the calls. */
        "str %x[r], [x29, #112]\n\t"
        "str %x[a], [x29, #120]\n\t"
        /* t0 = a^2 */
        "add x0, x29, #16\n\t"
#ifndef NDEBUG
        "ldr x1, [x29, #120]\n\t"
#endif /* !NDEBUG */
#ifndef __APPLE__
        "bl fe_sq\n\t"
#else
        "bl _fe_sq\n\t"
#endif /* __APPLE__ */
        /* t1 = t0^2 = a^4 */
        "add x0, x29, #48\n\t"
        "add x1, x29, #16\n\t"
#ifndef __APPLE__
        "bl fe_sq\n\t"
#else
        "bl _fe_sq\n\t"
#endif /* __APPLE__ */
        /* t1 = t1^2 = a^8 */
#ifndef NDEBUG
        "add x0, x29, #48\n\t"
#endif /* !NDEBUG */
        "add x1, x29, #48\n\t"
#ifndef __APPLE__
        "bl fe_sq\n\t"
#else
        "bl _fe_sq\n\t"
#endif /* __APPLE__ */
        /* t1 = a * t1 = a^9 */
#ifndef NDEBUG
        "add x0, x29, #48\n\t"
#endif /* !NDEBUG */
        "ldr x1, [x29, #120]\n\t"
        "add x2, x29, #48\n\t"
#ifndef __APPLE__
        "bl fe_mul\n\t"
#else
        "bl _fe_mul\n\t"
#endif /* __APPLE__ */
        /* t0 = t0 * t1 = a^11 */
        "add x0, x29, #16\n\t"
        "add x1, x29, #16\n\t"
        "add x2, x29, #48\n\t"
#ifndef __APPLE__
        "bl fe_mul\n\t"
#else
        "bl _fe_mul\n\t"
#endif /* __APPLE__ */
        /* t0 = t0^2 = a^22 */
#ifndef NDEBUG
        "add x0, x29, #16\n\t"
#endif /* !NDEBUG */
#ifndef NDEBUG
        "add x1, x29, #16\n\t"
#endif /* !NDEBUG */
#ifndef __APPLE__
        "bl fe_sq\n\t"
#else
        "bl _fe_sq\n\t"
#endif /* __APPLE__ */
        /* t0 = t1 * t0 = a^31 = a^(2^5 - 1) */
#ifndef NDEBUG
        "add x0, x29, #16\n\t"
#endif /* !NDEBUG */
        "add x1, x29, #48\n\t"
        "add x2, x29, #16\n\t"
#ifndef __APPLE__
        "bl fe_mul\n\t"
#else
        "bl _fe_mul\n\t"
#endif /* __APPLE__ */
        /* t1 = t0^2, then 4 more squarings: t1 = a^(2^10 - 2^5) */
        "add x0, x29, #48\n\t"
        "add x1, x29, #16\n\t"
#ifndef __APPLE__
        "bl fe_sq\n\t"
#else
        "bl _fe_sq\n\t"
#endif /* __APPLE__ */
        "mov x23, #3\n\t"
#ifndef NDEBUG
        "add x0, x29, #48\n\t"
#endif /* !NDEBUG */
        "add x1, x29, #48\n\t"
        "\n"
    "L_fe_pow22523_1_%=: \n\t"
#ifndef __APPLE__
        "bl fe_sq\n\t"
#else
        "bl _fe_sq\n\t"
#endif /* __APPLE__ */
        "subs x23, x23, #1\n\t"
        "bcs L_fe_pow22523_1_%=\n\t"
        /* t0 = t1 * t0 = a^(2^10 - 1) */
        "add x0, x29, #16\n\t"
#ifndef NDEBUG
        "add x1, x29, #48\n\t"
#endif /* !NDEBUG */
        "add x2, x29, #16\n\t"
#ifndef __APPLE__
        "bl fe_mul\n\t"
#else
        "bl _fe_mul\n\t"
#endif /* __APPLE__ */
        /* t1 = t0^2, then 9 more squarings: t1 = a^(2^20 - 2^10) */
        "add x0, x29, #48\n\t"
        "add x1, x29, #16\n\t"
#ifndef __APPLE__
        "bl fe_sq\n\t"
#else
        "bl _fe_sq\n\t"
#endif /* __APPLE__ */
        "mov x23, #8\n\t"
#ifndef NDEBUG
        "add x0, x29, #48\n\t"
#endif /* !NDEBUG */
        "add x1, x29, #48\n\t"
        "\n"
    "L_fe_pow22523_2_%=: \n\t"
#ifndef __APPLE__
        "bl fe_sq\n\t"
#else
        "bl _fe_sq\n\t"
#endif /* __APPLE__ */
        "subs x23, x23, #1\n\t"
        "bcs L_fe_pow22523_2_%=\n\t"
        /* t1 = t1 * t0 = a^(2^20 - 1) */
#ifndef NDEBUG
        "add x0, x29, #48\n\t"
#endif /* !NDEBUG */
#ifndef NDEBUG
        "add x1, x29, #48\n\t"
#endif /* !NDEBUG */
        "add x2, x29, #16\n\t"
#ifndef __APPLE__
        "bl fe_mul\n\t"
#else
        "bl _fe_mul\n\t"
#endif /* __APPLE__ */
        /* t2 = t1^2, then 19 more squarings: t2 = a^(2^40 - 2^20) */
        "add x0, x29, #0x50\n\t"
#ifndef NDEBUG
        "add x1, x29, #48\n\t"
#endif /* !NDEBUG */
#ifndef __APPLE__
        "bl fe_sq\n\t"
#else
        "bl _fe_sq\n\t"
#endif /* __APPLE__ */
        "mov x23, #18\n\t"
#ifndef NDEBUG
        "add x0, x29, #0x50\n\t"
#endif /* !NDEBUG */
        "add x1, x29, #0x50\n\t"
        "\n"
    "L_fe_pow22523_3_%=: \n\t"
#ifndef __APPLE__
        "bl fe_sq\n\t"
#else
        "bl _fe_sq\n\t"
#endif /* __APPLE__ */
        "subs x23, x23, #1\n\t"
        "bcs L_fe_pow22523_3_%=\n\t"
        /* t1 = t2 * t1 = a^(2^40 - 1) */
        "add x0, x29, #48\n\t"
#ifndef NDEBUG
        "add x1, x29, #0x50\n\t"
#endif /* !NDEBUG */
        "add x2, x29, #48\n\t"
#ifndef __APPLE__
        "bl fe_mul\n\t"
#else
        "bl _fe_mul\n\t"
#endif /* __APPLE__ */
        /* 10 squarings: t1 = a^(2^50 - 2^10) */
        "mov x23, #9\n\t"
#ifndef NDEBUG
        "add x0, x29, #48\n\t"
#endif /* !NDEBUG */
        "add x1, x29, #48\n\t"
        "\n"
    "L_fe_pow22523_4_%=: \n\t"
#ifndef __APPLE__
        "bl fe_sq\n\t"
#else
        "bl _fe_sq\n\t"
#endif /* __APPLE__ */
        "subs x23, x23, #1\n\t"
        "bcs L_fe_pow22523_4_%=\n\t"
        /* t0 = t1 * t0 = a^(2^50 - 1) */
        "add x0, x29, #16\n\t"
#ifndef NDEBUG
        "add x1, x29, #48\n\t"
#endif /* !NDEBUG */
        "add x2, x29, #16\n\t"
#ifndef __APPLE__
        "bl fe_mul\n\t"
#else
        "bl _fe_mul\n\t"
#endif /* __APPLE__ */
        /* t1 = t0^2, then 49 more squarings: t1 = a^(2^100 - 2^50) */
        "add x0, x29, #48\n\t"
        "add x1, x29, #16\n\t"
#ifndef __APPLE__
        "bl fe_sq\n\t"
#else
        "bl _fe_sq\n\t"
#endif /* __APPLE__ */
        "mov x23, #48\n\t"
#ifndef NDEBUG
        "add x0, x29, #48\n\t"
#endif /* !NDEBUG */
        "add x1, x29, #48\n\t"
        "\n"
    "L_fe_pow22523_5_%=: \n\t"
#ifndef __APPLE__
        "bl fe_sq\n\t"
#else
        "bl _fe_sq\n\t"
#endif /* __APPLE__ */
        "subs x23, x23, #1\n\t"
        "bcs L_fe_pow22523_5_%=\n\t"
        /* t1 = t1 * t0 = a^(2^100 - 1) */
#ifndef NDEBUG
        "add x0, x29, #48\n\t"
#endif /* !NDEBUG */
#ifndef NDEBUG
        "add x1, x29, #48\n\t"
#endif /* !NDEBUG */
        "add x2, x29, #16\n\t"
#ifndef __APPLE__
        "bl fe_mul\n\t"
#else
        "bl _fe_mul\n\t"
#endif /* __APPLE__ */
        /* t2 = t1^2, then 99 more squarings: t2 = a^(2^200 - 2^100) */
        "add x0, x29, #0x50\n\t"
#ifndef NDEBUG
        "add x1, x29, #48\n\t"
#endif /* !NDEBUG */
#ifndef __APPLE__
        "bl fe_sq\n\t"
#else
        "bl _fe_sq\n\t"
#endif /* __APPLE__ */
        "mov x23, #0x62\n\t"
#ifndef NDEBUG
        "add x0, x29, #0x50\n\t"
#endif /* !NDEBUG */
        "add x1, x29, #0x50\n\t"
        "\n"
    "L_fe_pow22523_6_%=: \n\t"
#ifndef __APPLE__
        "bl fe_sq\n\t"
#else
        "bl _fe_sq\n\t"
#endif /* __APPLE__ */
        "subs x23, x23, #1\n\t"
        "bcs L_fe_pow22523_6_%=\n\t"
        /* t1 = t2 * t1 = a^(2^200 - 1) */
        "add x0, x29, #48\n\t"
#ifndef NDEBUG
        "add x1, x29, #0x50\n\t"
#endif /* !NDEBUG */
        "add x2, x29, #48\n\t"
#ifndef __APPLE__
        "bl fe_mul\n\t"
#else
        "bl _fe_mul\n\t"
#endif /* __APPLE__ */
        /* 50 squarings: t1 = a^(2^250 - 2^50) */
        "mov x23, #49\n\t"
#ifndef NDEBUG
        "add x0, x29, #48\n\t"
#endif /* !NDEBUG */
        "add x1, x29, #48\n\t"
        "\n"
    "L_fe_pow22523_7_%=: \n\t"
#ifndef __APPLE__
        "bl fe_sq\n\t"
#else
        "bl _fe_sq\n\t"
#endif /* __APPLE__ */
        "subs x23, x23, #1\n\t"
        "bcs L_fe_pow22523_7_%=\n\t"
        /* t0 = t1 * t0 = a^(2^250 - 1) */
        "add x0, x29, #16\n\t"
#ifndef NDEBUG
        "add x1, x29, #48\n\t"
#endif /* !NDEBUG */
        "add x2, x29, #16\n\t"
#ifndef __APPLE__
        "bl fe_mul\n\t"
#else
        "bl _fe_mul\n\t"
#endif /* __APPLE__ */
        /* 2 squarings: t0 = a^(2^252 - 4) */
        "mov x23, #1\n\t"
#ifndef NDEBUG
        "add x0, x29, #16\n\t"
#endif /* !NDEBUG */
        "add x1, x29, #16\n\t"
        "\n"
    "L_fe_pow22523_8_%=: \n\t"
#ifndef __APPLE__
        "bl fe_sq\n\t"
#else
        "bl _fe_sq\n\t"
#endif /* __APPLE__ */
        "subs x23, x23, #1\n\t"
        "bcs L_fe_pow22523_8_%=\n\t"
        /* r = t0 * a = a^(2^252 - 3) */
        "ldr x0, [x29, #112]\n\t"
#ifndef NDEBUG
        "add x1, x29, #16\n\t"
#endif /* !NDEBUG */
        "ldr x2, [x29, #120]\n\t"
#ifndef __APPLE__
        "bl fe_mul\n\t"
#else
        "bl _fe_mul\n\t"
#endif /* __APPLE__ */
        /* Restore frame pointer and link register and release the frame. */
        "ldp x29, x30, [sp], #0x80\n\t"
        : [r] "+r" (r), [a] "+r" (a)
        :
        /* The bl calls may overwrite any caller-saved general register, and
         * subs updates the condition flags, so x3-x17 and "cc" are declared
         * in addition to the registers written directly (x2, x23). x0/x1 are
         * left out because the operands are expected to live there (see the
         * NOTE in the function comment). */
        : "memory", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x23", "cc"
    );
}
3110
fe_ge_to_p2(fe rx,fe ry,fe rz,const fe px,const fe py,const fe pz,const fe pt)3111 void fe_ge_to_p2(fe rx, fe ry, fe rz, const fe px, const fe py, const fe pz, const fe pt)
3112 {
3113 __asm__ __volatile__ (
3114 "stp x29, x30, [sp, #-64]!\n\t"
3115 "add x29, sp, #0\n\t"
3116 "str %x[ry], [x29, #16]\n\t"
3117 "str %x[rz], [x29, #24]\n\t"
3118 "str %x[px], [x29, #32]\n\t"
3119 "str %x[py], [x29, #40]\n\t"
3120 "str %x[pz], [x29, #48]\n\t"
3121 "str %x[pt], [x29, #56]\n\t"
3122 "ldr x1, [x29, #32]\n\t"
3123 "ldr x2, [x29, #56]\n\t"
3124 /* Multiply */
3125 "ldp x11, x12, [x1]\n\t"
3126 "ldp x13, x14, [x1, #16]\n\t"
3127 "ldp x15, x16, [x2]\n\t"
3128 "ldp x17, x19, [x2, #16]\n\t"
3129 /* A[0] * B[0] */
3130 "mul x3, x11, x15\n\t"
3131 "umulh x4, x11, x15\n\t"
3132 /* A[0] * B[1] */
3133 "mul x20, x11, x16\n\t"
3134 "umulh x5, x11, x16\n\t"
3135 "adds x4, x4, x20\n\t"
3136 "adc x5, x5, xzr\n\t"
3137 /* A[1] * B[0] */
3138 "mul x20, x12, x15\n\t"
3139 "umulh x21, x12, x15\n\t"
3140 "adds x4, x4, x20\n\t"
3141 "adcs x5, x5, x21\n\t"
3142 "adc x6, xzr, xzr\n\t"
3143 /* A[0] * B[2] */
3144 "mul x20, x11, x17\n\t"
3145 "umulh x21, x11, x17\n\t"
3146 "adds x5, x5, x20\n\t"
3147 "adc x6, x6, x21\n\t"
3148 /* A[1] * B[1] */
3149 "mul x20, x12, x16\n\t"
3150 "umulh x21, x12, x16\n\t"
3151 "adds x5, x5, x20\n\t"
3152 "adcs x6, x6, x21\n\t"
3153 "adc x7, xzr, xzr\n\t"
3154 /* A[2] * B[0] */
3155 "mul x20, x13, x15\n\t"
3156 "umulh x21, x13, x15\n\t"
3157 "adds x5, x5, x20\n\t"
3158 "adcs x6, x6, x21\n\t"
3159 "adc x7, x7, xzr\n\t"
3160 /* A[0] * B[3] */
3161 "mul x20, x11, x19\n\t"
3162 "umulh x21, x11, x19\n\t"
3163 "adds x6, x6, x20\n\t"
3164 "adcs x7, x7, x21\n\t"
3165 "adc x8, xzr, xzr\n\t"
3166 /* A[1] * B[2] */
3167 "mul x20, x12, x17\n\t"
3168 "umulh x21, x12, x17\n\t"
3169 "adds x6, x6, x20\n\t"
3170 "adcs x7, x7, x21\n\t"
3171 "adc x8, x8, xzr\n\t"
3172 /* A[2] * B[1] */
3173 "mul x20, x13, x16\n\t"
3174 "umulh x21, x13, x16\n\t"
3175 "adds x6, x6, x20\n\t"
3176 "adcs x7, x7, x21\n\t"
3177 "adc x8, x8, xzr\n\t"
3178 /* A[3] * B[0] */
3179 "mul x20, x14, x15\n\t"
3180 "umulh x21, x14, x15\n\t"
3181 "adds x6, x6, x20\n\t"
3182 "adcs x7, x7, x21\n\t"
3183 "adc x8, x8, xzr\n\t"
3184 /* A[1] * B[3] */
3185 "mul x20, x12, x19\n\t"
3186 "umulh x21, x12, x19\n\t"
3187 "adds x7, x7, x20\n\t"
3188 "adcs x8, x8, x21\n\t"
3189 "adc x9, xzr, xzr\n\t"
3190 /* A[2] * B[2] */
3191 "mul x20, x13, x17\n\t"
3192 "umulh x21, x13, x17\n\t"
3193 "adds x7, x7, x20\n\t"
3194 "adcs x8, x8, x21\n\t"
3195 "adc x9, x9, xzr\n\t"
3196 /* A[3] * B[1] */
3197 "mul x20, x14, x16\n\t"
3198 "umulh x21, x14, x16\n\t"
3199 "adds x7, x7, x20\n\t"
3200 "adcs x8, x8, x21\n\t"
3201 "adc x9, x9, xzr\n\t"
3202 /* A[2] * B[3] */
3203 "mul x20, x13, x19\n\t"
3204 "umulh x21, x13, x19\n\t"
3205 "adds x8, x8, x20\n\t"
3206 "adcs x9, x9, x21\n\t"
3207 "adc x10, xzr, xzr\n\t"
3208 /* A[3] * B[2] */
3209 "mul x20, x14, x17\n\t"
3210 "umulh x21, x14, x17\n\t"
3211 "adds x8, x8, x20\n\t"
3212 "adcs x9, x9, x21\n\t"
3213 "adc x10, x10, xzr\n\t"
3214 /* A[3] * B[3] */
3215 "mul x20, x14, x19\n\t"
3216 "umulh x21, x14, x19\n\t"
3217 "adds x9, x9, x20\n\t"
3218 "adc x10, x10, x21\n\t"
3219 /* Reduce */
3220 /* Move top half into t4-t7 and remove top bit from t3 */
3221 "extr x10, x10, x9, #63\n\t"
3222 "extr x9, x9, x8, #63\n\t"
3223 "extr x8, x8, x7, #63\n\t"
3224 "extr x7, x7, x6, #63\n\t"
3225 "and x6, x6, #0x7fffffffffffffff\n\t"
3226 /* Multiply top half by 19 */
3227 "mov x20, #19\n\t"
3228 "mul x21, x20, x7\n\t"
3229 "umulh x7, x20, x7\n\t"
3230 "adds x3, x3, x21\n\t"
3231 "mul x21, x20, x8\n\t"
3232 "umulh x8, x20, x8\n\t"
3233 "adcs x4, x4, x21\n\t"
3234 "mul x21, x20, x9\n\t"
3235 "umulh x9, x20, x9\n\t"
3236 "adcs x5, x5, x21\n\t"
3237 "mul x21, x20, x10\n\t"
3238 "umulh x22, x20, x10\n\t"
3239 "adcs x6, x6, x21\n\t"
3240 "adc x22, x22, xzr\n\t"
3241 /* Add remaining product results in */
3242 "adds x4, x4, x7\n\t"
3243 "adcs x5, x5, x8\n\t"
3244 "adcs x6, x6, x9\n\t"
3245 "adc x22, x22, xzr\n\t"
3246 /* Overflow */
3247 "extr x22, x22, x6, #63\n\t"
3248 "mul x22, x22, x20\n\t"
3249 "and x6, x6, #0x7fffffffffffffff\n\t"
3250 "adds x3, x3, x22\n\t"
3251 "adcs x4, x4, xzr\n\t"
3252 "adcs x5, x5, xzr\n\t"
3253 "adc x6, x6, xzr\n\t"
3254 /* Reduce if top bit set */
3255 "and x22, x20, x6, asr 63\n\t"
3256 "and x6, x6, #0x7fffffffffffffff\n\t"
3257 "adds x3, x3, x22\n\t"
3258 "adcs x4, x4, xzr\n\t"
3259 "adcs x5, x5, xzr\n\t"
3260 "adc x6, x6, xzr\n\t"
3261 /* Store */
3262 "stp x3, x4, [x0]\n\t"
3263 "stp x5, x6, [x0, #16]\n\t"
3264 "ldr x0, [x29, #16]\n\t"
3265 "ldr x1, [x29, #40]\n\t"
3266 "ldr x2, [x29, #48]\n\t"
3267 /* Multiply */
3268 "ldp x11, x12, [x1]\n\t"
3269 "ldp x13, x14, [x1, #16]\n\t"
3270 "ldp x15, x16, [x2]\n\t"
3271 "ldp x17, x19, [x2, #16]\n\t"
3272 /* A[0] * B[0] */
3273 "mul x3, x11, x15\n\t"
3274 "umulh x4, x11, x15\n\t"
3275 /* A[0] * B[1] */
3276 "mul x20, x11, x16\n\t"
3277 "umulh x5, x11, x16\n\t"
3278 "adds x4, x4, x20\n\t"
3279 "adc x5, x5, xzr\n\t"
3280 /* A[1] * B[0] */
3281 "mul x20, x12, x15\n\t"
3282 "umulh x21, x12, x15\n\t"
3283 "adds x4, x4, x20\n\t"
3284 "adcs x5, x5, x21\n\t"
3285 "adc x6, xzr, xzr\n\t"
3286 /* A[0] * B[2] */
3287 "mul x20, x11, x17\n\t"
3288 "umulh x21, x11, x17\n\t"
3289 "adds x5, x5, x20\n\t"
3290 "adc x6, x6, x21\n\t"
3291 /* A[1] * B[1] */
3292 "mul x20, x12, x16\n\t"
3293 "umulh x21, x12, x16\n\t"
3294 "adds x5, x5, x20\n\t"
3295 "adcs x6, x6, x21\n\t"
3296 "adc x7, xzr, xzr\n\t"
3297 /* A[2] * B[0] */
3298 "mul x20, x13, x15\n\t"
3299 "umulh x21, x13, x15\n\t"
3300 "adds x5, x5, x20\n\t"
3301 "adcs x6, x6, x21\n\t"
3302 "adc x7, x7, xzr\n\t"
3303 /* A[0] * B[3] */
3304 "mul x20, x11, x19\n\t"
3305 "umulh x21, x11, x19\n\t"
3306 "adds x6, x6, x20\n\t"
3307 "adcs x7, x7, x21\n\t"
3308 "adc x8, xzr, xzr\n\t"
3309 /* A[1] * B[2] */
3310 "mul x20, x12, x17\n\t"
3311 "umulh x21, x12, x17\n\t"
3312 "adds x6, x6, x20\n\t"
3313 "adcs x7, x7, x21\n\t"
3314 "adc x8, x8, xzr\n\t"
3315 /* A[2] * B[1] */
3316 "mul x20, x13, x16\n\t"
3317 "umulh x21, x13, x16\n\t"
3318 "adds x6, x6, x20\n\t"
3319 "adcs x7, x7, x21\n\t"
3320 "adc x8, x8, xzr\n\t"
3321 /* A[3] * B[0] */
3322 "mul x20, x14, x15\n\t"
3323 "umulh x21, x14, x15\n\t"
3324 "adds x6, x6, x20\n\t"
3325 "adcs x7, x7, x21\n\t"
3326 "adc x8, x8, xzr\n\t"
3327 /* A[1] * B[3] */
3328 "mul x20, x12, x19\n\t"
3329 "umulh x21, x12, x19\n\t"
3330 "adds x7, x7, x20\n\t"
3331 "adcs x8, x8, x21\n\t"
3332 "adc x9, xzr, xzr\n\t"
3333 /* A[2] * B[2] */
3334 "mul x20, x13, x17\n\t"
3335 "umulh x21, x13, x17\n\t"
3336 "adds x7, x7, x20\n\t"
3337 "adcs x8, x8, x21\n\t"
3338 "adc x9, x9, xzr\n\t"
3339 /* A[3] * B[1] */
3340 "mul x20, x14, x16\n\t"
3341 "umulh x21, x14, x16\n\t"
3342 "adds x7, x7, x20\n\t"
3343 "adcs x8, x8, x21\n\t"
3344 "adc x9, x9, xzr\n\t"
3345 /* A[2] * B[3] */
3346 "mul x20, x13, x19\n\t"
3347 "umulh x21, x13, x19\n\t"
3348 "adds x8, x8, x20\n\t"
3349 "adcs x9, x9, x21\n\t"
3350 "adc x10, xzr, xzr\n\t"
3351 /* A[3] * B[2] */
3352 "mul x20, x14, x17\n\t"
3353 "umulh x21, x14, x17\n\t"
3354 "adds x8, x8, x20\n\t"
3355 "adcs x9, x9, x21\n\t"
3356 "adc x10, x10, xzr\n\t"
3357 /* A[3] * B[3] */
3358 "mul x20, x14, x19\n\t"
3359 "umulh x21, x14, x19\n\t"
3360 "adds x9, x9, x20\n\t"
3361 "adc x10, x10, x21\n\t"
3362 /* Reduce */
3363 /* Move top half into t4-t7 and remove top bit from t3 */
3364 "extr x10, x10, x9, #63\n\t"
3365 "extr x9, x9, x8, #63\n\t"
3366 "extr x8, x8, x7, #63\n\t"
3367 "extr x7, x7, x6, #63\n\t"
3368 "and x6, x6, #0x7fffffffffffffff\n\t"
3369 /* Multiply top half by 19 */
3370 "mov x20, #19\n\t"
3371 "mul x21, x20, x7\n\t"
3372 "umulh x7, x20, x7\n\t"
3373 "adds x3, x3, x21\n\t"
3374 "mul x21, x20, x8\n\t"
3375 "umulh x8, x20, x8\n\t"
3376 "adcs x4, x4, x21\n\t"
3377 "mul x21, x20, x9\n\t"
3378 "umulh x9, x20, x9\n\t"
3379 "adcs x5, x5, x21\n\t"
3380 "mul x21, x20, x10\n\t"
3381 "umulh x22, x20, x10\n\t"
3382 "adcs x6, x6, x21\n\t"
3383 "adc x22, x22, xzr\n\t"
3384 /* Add remaining product results in */
3385 "adds x4, x4, x7\n\t"
3386 "adcs x5, x5, x8\n\t"
3387 "adcs x6, x6, x9\n\t"
3388 "adc x22, x22, xzr\n\t"
3389 /* Overflow */
3390 "extr x22, x22, x6, #63\n\t"
3391 "mul x22, x22, x20\n\t"
3392 "and x6, x6, #0x7fffffffffffffff\n\t"
3393 "adds x3, x3, x22\n\t"
3394 "adcs x4, x4, xzr\n\t"
3395 "adcs x5, x5, xzr\n\t"
3396 "adc x6, x6, xzr\n\t"
3397 /* Reduce if top bit set */
3398 "and x22, x20, x6, asr 63\n\t"
3399 "and x6, x6, #0x7fffffffffffffff\n\t"
3400 "adds x3, x3, x22\n\t"
3401 "adcs x4, x4, xzr\n\t"
3402 "adcs x5, x5, xzr\n\t"
3403 "adc x6, x6, xzr\n\t"
3404 /* Store */
3405 "stp x3, x4, [x0]\n\t"
3406 "stp x5, x6, [x0, #16]\n\t"
3407 "ldr x0, [x29, #24]\n\t"
3408 "ldr x2, [x29, #56]\n\t"
3409 /* Multiply */
3410 "ldp x11, x12, [x2]\n\t"
3411 "ldp x13, x14, [x2, #16]\n\t"
3412 /* A[0] * B[0] */
3413 "mul x3, x15, x11\n\t"
3414 "umulh x4, x15, x11\n\t"
3415 /* A[0] * B[1] */
3416 "mul x20, x15, x12\n\t"
3417 "umulh x5, x15, x12\n\t"
3418 "adds x4, x4, x20\n\t"
3419 "adc x5, x5, xzr\n\t"
3420 /* A[1] * B[0] */
3421 "mul x20, x16, x11\n\t"
3422 "umulh x21, x16, x11\n\t"
3423 "adds x4, x4, x20\n\t"
3424 "adcs x5, x5, x21\n\t"
3425 "adc x6, xzr, xzr\n\t"
3426 /* A[0] * B[2] */
3427 "mul x20, x15, x13\n\t"
3428 "umulh x21, x15, x13\n\t"
3429 "adds x5, x5, x20\n\t"
3430 "adc x6, x6, x21\n\t"
3431 /* A[1] * B[1] */
3432 "mul x20, x16, x12\n\t"
3433 "umulh x21, x16, x12\n\t"
3434 "adds x5, x5, x20\n\t"
3435 "adcs x6, x6, x21\n\t"
3436 "adc x7, xzr, xzr\n\t"
3437 /* A[2] * B[0] */
3438 "mul x20, x17, x11\n\t"
3439 "umulh x21, x17, x11\n\t"
3440 "adds x5, x5, x20\n\t"
3441 "adcs x6, x6, x21\n\t"
3442 "adc x7, x7, xzr\n\t"
3443 /* A[0] * B[3] */
3444 "mul x20, x15, x14\n\t"
3445 "umulh x21, x15, x14\n\t"
3446 "adds x6, x6, x20\n\t"
3447 "adcs x7, x7, x21\n\t"
3448 "adc x8, xzr, xzr\n\t"
3449 /* A[1] * B[2] */
3450 "mul x20, x16, x13\n\t"
3451 "umulh x21, x16, x13\n\t"
3452 "adds x6, x6, x20\n\t"
3453 "adcs x7, x7, x21\n\t"
3454 "adc x8, x8, xzr\n\t"
3455 /* A[2] * B[1] */
3456 "mul x20, x17, x12\n\t"
3457 "umulh x21, x17, x12\n\t"
3458 "adds x6, x6, x20\n\t"
3459 "adcs x7, x7, x21\n\t"
3460 "adc x8, x8, xzr\n\t"
3461 /* A[3] * B[0] */
3462 "mul x20, x19, x11\n\t"
3463 "umulh x21, x19, x11\n\t"
3464 "adds x6, x6, x20\n\t"
3465 "adcs x7, x7, x21\n\t"
3466 "adc x8, x8, xzr\n\t"
3467 /* A[1] * B[3] */
3468 "mul x20, x16, x14\n\t"
3469 "umulh x21, x16, x14\n\t"
3470 "adds x7, x7, x20\n\t"
3471 "adcs x8, x8, x21\n\t"
3472 "adc x9, xzr, xzr\n\t"
3473 /* A[2] * B[2] */
3474 "mul x20, x17, x13\n\t"
3475 "umulh x21, x17, x13\n\t"
3476 "adds x7, x7, x20\n\t"
3477 "adcs x8, x8, x21\n\t"
3478 "adc x9, x9, xzr\n\t"
3479 /* A[3] * B[1] */
3480 "mul x20, x19, x12\n\t"
3481 "umulh x21, x19, x12\n\t"
3482 "adds x7, x7, x20\n\t"
3483 "adcs x8, x8, x21\n\t"
3484 "adc x9, x9, xzr\n\t"
3485 /* A[2] * B[3] */
3486 "mul x20, x17, x14\n\t"
3487 "umulh x21, x17, x14\n\t"
3488 "adds x8, x8, x20\n\t"
3489 "adcs x9, x9, x21\n\t"
3490 "adc x10, xzr, xzr\n\t"
3491 /* A[3] * B[2] */
3492 "mul x20, x19, x13\n\t"
3493 "umulh x21, x19, x13\n\t"
3494 "adds x8, x8, x20\n\t"
3495 "adcs x9, x9, x21\n\t"
3496 "adc x10, x10, xzr\n\t"
3497 /* A[3] * B[3] */
3498 "mul x20, x19, x14\n\t"
3499 "umulh x21, x19, x14\n\t"
3500 "adds x9, x9, x20\n\t"
3501 "adc x10, x10, x21\n\t"
3502 /* Reduce */
3503 /* Move top half into t4-t7 and remove top bit from t3 */
3504 "extr x10, x10, x9, #63\n\t"
3505 "extr x9, x9, x8, #63\n\t"
3506 "extr x8, x8, x7, #63\n\t"
3507 "extr x7, x7, x6, #63\n\t"
3508 "and x6, x6, #0x7fffffffffffffff\n\t"
3509 /* Multiply top half by 19 */
3510 "mov x20, #19\n\t"
3511 "mul x21, x20, x7\n\t"
3512 "umulh x7, x20, x7\n\t"
3513 "adds x3, x3, x21\n\t"
3514 "mul x21, x20, x8\n\t"
3515 "umulh x8, x20, x8\n\t"
3516 "adcs x4, x4, x21\n\t"
3517 "mul x21, x20, x9\n\t"
3518 "umulh x9, x20, x9\n\t"
3519 "adcs x5, x5, x21\n\t"
3520 "mul x21, x20, x10\n\t"
3521 "umulh x22, x20, x10\n\t"
3522 "adcs x6, x6, x21\n\t"
3523 "adc x22, x22, xzr\n\t"
3524 /* Add remaining product results in */
3525 "adds x4, x4, x7\n\t"
3526 "adcs x5, x5, x8\n\t"
3527 "adcs x6, x6, x9\n\t"
3528 "adc x22, x22, xzr\n\t"
3529 /* Overflow */
3530 "extr x22, x22, x6, #63\n\t"
3531 "mul x22, x22, x20\n\t"
3532 "and x6, x6, #0x7fffffffffffffff\n\t"
3533 "adds x3, x3, x22\n\t"
3534 "adcs x4, x4, xzr\n\t"
3535 "adcs x5, x5, xzr\n\t"
3536 "adc x6, x6, xzr\n\t"
3537 /* Reduce if top bit set */
3538 "and x22, x20, x6, asr 63\n\t"
3539 "and x6, x6, #0x7fffffffffffffff\n\t"
3540 "adds x3, x3, x22\n\t"
3541 "adcs x4, x4, xzr\n\t"
3542 "adcs x5, x5, xzr\n\t"
3543 "adc x6, x6, xzr\n\t"
3544 /* Store */
3545 "stp x3, x4, [x0]\n\t"
3546 "stp x5, x6, [x0, #16]\n\t"
3547 "ldp x29, x30, [sp], #0x40\n\t"
3548 : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt)
3549 :
3550 : "memory", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22"
3551 );
3552 }
3553
/* fe_ge_to_p3: four field multiplications written as one inline-asm block.
 *
 * Results computed below, in the order they are stored:
 *   rx = px * pt
 *   rt = px * py
 *   ry = py * pz
 *   rz = pz * pt
 *
 * Each operand is read and written as four 64-bit limbs (two ldp/stp pairs
 * covering 32 bytes). Every product is built as a 512-bit value in x3..x10
 * and then folded back to four limbs by multiplying the bits at and above
 * bit 255 by 19 and adding them to the low part, which is consistent with
 * arithmetic modulo 2^255 - 19.
 *
 * Load/store ordering: px and pt are loaded before anything is stored, py is
 * loaded after rx has been stored, and pz is loaded after rx and rt have been
 * stored. An output that shares storage with a later-loaded input would
 * therefore be read back in its modified form.
 *
 * NOTE(review): the assembly addresses x0, x1 and x2 directly and never reads
 * %x[rx]; it appears to rely on the eight arguments already being in x0..x7
 * in declaration order - confirm this holds for every supported compiler.
 * NOTE(review): sp is adjusted inside the asm statement (96 bytes pushed and
 * popped); presumably the compiler keeps nothing live relative to sp across
 * it - confirm.
 */
fe_ge_to_p3(fe rx,fe ry,fe rz,fe rt,const fe px,const fe py,const fe pz,const fe pt)3554 void fe_ge_to_p3(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt)
3555 {
3556 __asm__ __volatile__ (
/* Push x29/x30 and reserve a 96-byte frame; x29 becomes the frame base. */
3557 "stp x29, x30, [sp, #-96]!\n\t"
3558 "add x29, sp, #0\n\t"
/* Spill every pointer except rx to the frame so the registers that held them
 * can be reused as scratch; rx is not spilled and is used through x0. */
3559 "str %x[ry], [x29, #16]\n\t"
3560 "str %x[rz], [x29, #24]\n\t"
3561 "str %x[rt], [x29, #32]\n\t"
3562 "str %x[px], [x29, #40]\n\t"
3563 "str %x[py], [x29, #48]\n\t"
3564 "str %x[pz], [x29, #56]\n\t"
3565 "str %x[pt], [x29, #64]\n\t"
/* ---- Product 1: [x0] = px * pt (x0 is expected to still hold rx) ---- */
3566 "ldr x1, [x29, #40]\n\t"
3567 "ldr x2, [x29, #64]\n\t"
3568 /* Multiply */
/* A = px limbs in x11..x14, B = pt limbs in x15, x16, x17, x19. */
3569 "ldp x11, x12, [x1]\n\t"
3570 "ldp x13, x14, [x1, #16]\n\t"
3571 "ldp x15, x16, [x2]\n\t"
3572 "ldp x17, x19, [x2, #16]\n\t"
3573 /* A[0] * B[0] */
3574 "mul x3, x11, x15\n\t"
3575 "umulh x4, x11, x15\n\t"
3576 /* A[0] * B[1] */
3577 "mul x24, x11, x16\n\t"
3578 "umulh x5, x11, x16\n\t"
3579 "adds x4, x4, x24\n\t"
3580 "adc x5, x5, xzr\n\t"
3581 /* A[1] * B[0] */
3582 "mul x24, x12, x15\n\t"
3583 "umulh x25, x12, x15\n\t"
3584 "adds x4, x4, x24\n\t"
3585 "adcs x5, x5, x25\n\t"
3586 "adc x6, xzr, xzr\n\t"
3587 /* A[0] * B[2] */
3588 "mul x24, x11, x17\n\t"
3589 "umulh x25, x11, x17\n\t"
3590 "adds x5, x5, x24\n\t"
3591 "adc x6, x6, x25\n\t"
3592 /* A[1] * B[1] */
3593 "mul x24, x12, x16\n\t"
3594 "umulh x25, x12, x16\n\t"
3595 "adds x5, x5, x24\n\t"
3596 "adcs x6, x6, x25\n\t"
3597 "adc x7, xzr, xzr\n\t"
3598 /* A[2] * B[0] */
3599 "mul x24, x13, x15\n\t"
3600 "umulh x25, x13, x15\n\t"
3601 "adds x5, x5, x24\n\t"
3602 "adcs x6, x6, x25\n\t"
3603 "adc x7, x7, xzr\n\t"
3604 /* A[0] * B[3] */
3605 "mul x24, x11, x19\n\t"
3606 "umulh x25, x11, x19\n\t"
3607 "adds x6, x6, x24\n\t"
3608 "adcs x7, x7, x25\n\t"
3609 "adc x8, xzr, xzr\n\t"
3610 /* A[1] * B[2] */
3611 "mul x24, x12, x17\n\t"
3612 "umulh x25, x12, x17\n\t"
3613 "adds x6, x6, x24\n\t"
3614 "adcs x7, x7, x25\n\t"
3615 "adc x8, x8, xzr\n\t"
3616 /* A[2] * B[1] */
3617 "mul x24, x13, x16\n\t"
3618 "umulh x25, x13, x16\n\t"
3619 "adds x6, x6, x24\n\t"
3620 "adcs x7, x7, x25\n\t"
3621 "adc x8, x8, xzr\n\t"
3622 /* A[3] * B[0] */
3623 "mul x24, x14, x15\n\t"
3624 "umulh x25, x14, x15\n\t"
3625 "adds x6, x6, x24\n\t"
3626 "adcs x7, x7, x25\n\t"
3627 "adc x8, x8, xzr\n\t"
3628 /* A[1] * B[3] */
3629 "mul x24, x12, x19\n\t"
3630 "umulh x25, x12, x19\n\t"
3631 "adds x7, x7, x24\n\t"
3632 "adcs x8, x8, x25\n\t"
3633 "adc x9, xzr, xzr\n\t"
3634 /* A[2] * B[2] */
3635 "mul x24, x13, x17\n\t"
3636 "umulh x25, x13, x17\n\t"
3637 "adds x7, x7, x24\n\t"
3638 "adcs x8, x8, x25\n\t"
3639 "adc x9, x9, xzr\n\t"
3640 /* A[3] * B[1] */
3641 "mul x24, x14, x16\n\t"
3642 "umulh x25, x14, x16\n\t"
3643 "adds x7, x7, x24\n\t"
3644 "adcs x8, x8, x25\n\t"
3645 "adc x9, x9, xzr\n\t"
3646 /* A[2] * B[3] */
3647 "mul x24, x13, x19\n\t"
3648 "umulh x25, x13, x19\n\t"
3649 "adds x8, x8, x24\n\t"
3650 "adcs x9, x9, x25\n\t"
3651 "adc x10, xzr, xzr\n\t"
3652 /* A[3] * B[2] */
3653 "mul x24, x14, x17\n\t"
3654 "umulh x25, x14, x17\n\t"
3655 "adds x8, x8, x24\n\t"
3656 "adcs x9, x9, x25\n\t"
3657 "adc x10, x10, xzr\n\t"
3658 /* A[3] * B[3] */
3659 "mul x24, x14, x19\n\t"
3660 "umulh x25, x14, x19\n\t"
3661 "adds x9, x9, x24\n\t"
3662 "adc x10, x10, x25\n\t"
3663 /* Reduce */
/* The 512-bit product is in x3..x10 (x3 least significant). */
3664 /* Move top half into t4-t7 and remove top bit from t3 */
/* The extr chain shifts everything from bit 255 upward down into x7..x10. */
3665 "extr x10, x10, x9, #63\n\t"
3666 "extr x9, x9, x8, #63\n\t"
3667 "extr x8, x8, x7, #63\n\t"
3668 "extr x7, x7, x6, #63\n\t"
3669 "and x6, x6, #0x7fffffffffffffff\n\t"
3670 /* Multiply top half by 19 */
/* Low words of each 19*limb product are added first; the high words are kept
 * in x7, x8, x9 and x26 and added one limb higher in the next step. */
3671 "mov x24, #19\n\t"
3672 "mul x25, x24, x7\n\t"
3673 "umulh x7, x24, x7\n\t"
3674 "adds x3, x3, x25\n\t"
3675 "mul x25, x24, x8\n\t"
3676 "umulh x8, x24, x8\n\t"
3677 "adcs x4, x4, x25\n\t"
3678 "mul x25, x24, x9\n\t"
3679 "umulh x9, x24, x9\n\t"
3680 "adcs x5, x5, x25\n\t"
3681 "mul x25, x24, x10\n\t"
3682 "umulh x26, x24, x10\n\t"
3683 "adcs x6, x6, x25\n\t"
3684 "adc x26, x26, xzr\n\t"
3685 /* Add remaining product results in */
3686 "adds x4, x4, x7\n\t"
3687 "adcs x5, x5, x8\n\t"
3688 "adcs x6, x6, x9\n\t"
3689 "adc x26, x26, xzr\n\t"
3690 /* Overflow */
/* Bits at and above bit 255 (x26 plus the top bit of x6) are folded in once
 * more, again multiplied by 19. */
3691 "extr x26, x26, x6, #63\n\t"
3692 "mul x26, x26, x24\n\t"
3693 "and x6, x6, #0x7fffffffffffffff\n\t"
3694 "adds x3, x3, x26\n\t"
3695 "adcs x4, x4, xzr\n\t"
3696 "adcs x5, x5, xzr\n\t"
3697 "adc x6, x6, xzr\n\t"
3698 /* Reduce if top bit set */
/* x26 = 19 when bit 63 of x6 is set, otherwise 0 (asr produces the mask). */
3699 "and x26, x24, x6, asr 63\n\t"
3700 "and x6, x6, #0x7fffffffffffffff\n\t"
3701 "adds x3, x3, x26\n\t"
3702 "adcs x4, x4, xzr\n\t"
3703 "adcs x5, x5, xzr\n\t"
3704 "adc x6, x6, xzr\n\t"
3705 /* Store */
3706 "stp x3, x4, [x0]\n\t"
3707 "stp x5, x6, [x0, #16]\n\t"
/* ---- Product 2: rt = px * py ---- */
/* x0 = rt, x2 = py. A stays px in x11..x14; B = py limbs in x20..x23. */
3708 "ldr x0, [x29, #32]\n\t"
3709 "ldr x2, [x29, #48]\n\t"
3710 /* Multiply */
3711 "ldp x20, x21, [x2]\n\t"
3712 "ldp x22, x23, [x2, #16]\n\t"
3713 /* A[0] * B[0] */
3714 "mul x3, x11, x20\n\t"
3715 "umulh x4, x11, x20\n\t"
3716 /* A[0] * B[1] */
3717 "mul x24, x11, x21\n\t"
3718 "umulh x5, x11, x21\n\t"
3719 "adds x4, x4, x24\n\t"
3720 "adc x5, x5, xzr\n\t"
3721 /* A[1] * B[0] */
3722 "mul x24, x12, x20\n\t"
3723 "umulh x25, x12, x20\n\t"
3724 "adds x4, x4, x24\n\t"
3725 "adcs x5, x5, x25\n\t"
3726 "adc x6, xzr, xzr\n\t"
3727 /* A[0] * B[2] */
3728 "mul x24, x11, x22\n\t"
3729 "umulh x25, x11, x22\n\t"
3730 "adds x5, x5, x24\n\t"
3731 "adc x6, x6, x25\n\t"
3732 /* A[1] * B[1] */
3733 "mul x24, x12, x21\n\t"
3734 "umulh x25, x12, x21\n\t"
3735 "adds x5, x5, x24\n\t"
3736 "adcs x6, x6, x25\n\t"
3737 "adc x7, xzr, xzr\n\t"
3738 /* A[2] * B[0] */
3739 "mul x24, x13, x20\n\t"
3740 "umulh x25, x13, x20\n\t"
3741 "adds x5, x5, x24\n\t"
3742 "adcs x6, x6, x25\n\t"
3743 "adc x7, x7, xzr\n\t"
3744 /* A[0] * B[3] */
3745 "mul x24, x11, x23\n\t"
3746 "umulh x25, x11, x23\n\t"
3747 "adds x6, x6, x24\n\t"
3748 "adcs x7, x7, x25\n\t"
3749 "adc x8, xzr, xzr\n\t"
3750 /* A[1] * B[2] */
3751 "mul x24, x12, x22\n\t"
3752 "umulh x25, x12, x22\n\t"
3753 "adds x6, x6, x24\n\t"
3754 "adcs x7, x7, x25\n\t"
3755 "adc x8, x8, xzr\n\t"
3756 /* A[2] * B[1] */
3757 "mul x24, x13, x21\n\t"
3758 "umulh x25, x13, x21\n\t"
3759 "adds x6, x6, x24\n\t"
3760 "adcs x7, x7, x25\n\t"
3761 "adc x8, x8, xzr\n\t"
3762 /* A[3] * B[0] */
3763 "mul x24, x14, x20\n\t"
3764 "umulh x25, x14, x20\n\t"
3765 "adds x6, x6, x24\n\t"
3766 "adcs x7, x7, x25\n\t"
3767 "adc x8, x8, xzr\n\t"
3768 /* A[1] * B[3] */
3769 "mul x24, x12, x23\n\t"
3770 "umulh x25, x12, x23\n\t"
3771 "adds x7, x7, x24\n\t"
3772 "adcs x8, x8, x25\n\t"
3773 "adc x9, xzr, xzr\n\t"
3774 /* A[2] * B[2] */
3775 "mul x24, x13, x22\n\t"
3776 "umulh x25, x13, x22\n\t"
3777 "adds x7, x7, x24\n\t"
3778 "adcs x8, x8, x25\n\t"
3779 "adc x9, x9, xzr\n\t"
3780 /* A[3] * B[1] */
3781 "mul x24, x14, x21\n\t"
3782 "umulh x25, x14, x21\n\t"
3783 "adds x7, x7, x24\n\t"
3784 "adcs x8, x8, x25\n\t"
3785 "adc x9, x9, xzr\n\t"
3786 /* A[2] * B[3] */
3787 "mul x24, x13, x23\n\t"
3788 "umulh x25, x13, x23\n\t"
3789 "adds x8, x8, x24\n\t"
3790 "adcs x9, x9, x25\n\t"
3791 "adc x10, xzr, xzr\n\t"
3792 /* A[3] * B[2] */
3793 "mul x24, x14, x22\n\t"
3794 "umulh x25, x14, x22\n\t"
3795 "adds x8, x8, x24\n\t"
3796 "adcs x9, x9, x25\n\t"
3797 "adc x10, x10, xzr\n\t"
3798 /* A[3] * B[3] */
3799 "mul x24, x14, x23\n\t"
3800 "umulh x25, x14, x23\n\t"
3801 "adds x9, x9, x24\n\t"
3802 "adc x10, x10, x25\n\t"
3803 /* Reduce */
/* Same folding sequence as for product 1. */
3804 /* Move top half into t4-t7 and remove top bit from t3 */
3805 "extr x10, x10, x9, #63\n\t"
3806 "extr x9, x9, x8, #63\n\t"
3807 "extr x8, x8, x7, #63\n\t"
3808 "extr x7, x7, x6, #63\n\t"
3809 "and x6, x6, #0x7fffffffffffffff\n\t"
3810 /* Multiply top half by 19 */
3811 "mov x24, #19\n\t"
3812 "mul x25, x24, x7\n\t"
3813 "umulh x7, x24, x7\n\t"
3814 "adds x3, x3, x25\n\t"
3815 "mul x25, x24, x8\n\t"
3816 "umulh x8, x24, x8\n\t"
3817 "adcs x4, x4, x25\n\t"
3818 "mul x25, x24, x9\n\t"
3819 "umulh x9, x24, x9\n\t"
3820 "adcs x5, x5, x25\n\t"
3821 "mul x25, x24, x10\n\t"
3822 "umulh x26, x24, x10\n\t"
3823 "adcs x6, x6, x25\n\t"
3824 "adc x26, x26, xzr\n\t"
3825 /* Add remaining product results in */
3826 "adds x4, x4, x7\n\t"
3827 "adcs x5, x5, x8\n\t"
3828 "adcs x6, x6, x9\n\t"
3829 "adc x26, x26, xzr\n\t"
3830 /* Overflow */
3831 "extr x26, x26, x6, #63\n\t"
3832 "mul x26, x26, x24\n\t"
3833 "and x6, x6, #0x7fffffffffffffff\n\t"
3834 "adds x3, x3, x26\n\t"
3835 "adcs x4, x4, xzr\n\t"
3836 "adcs x5, x5, xzr\n\t"
3837 "adc x6, x6, xzr\n\t"
3838 /* Reduce if top bit set */
3839 "and x26, x24, x6, asr 63\n\t"
3840 "and x6, x6, #0x7fffffffffffffff\n\t"
3841 "adds x3, x3, x26\n\t"
3842 "adcs x4, x4, xzr\n\t"
3843 "adcs x5, x5, xzr\n\t"
3844 "adc x6, x6, xzr\n\t"
3845 /* Store */
3846 "stp x3, x4, [x0]\n\t"
3847 "stp x5, x6, [x0, #16]\n\t"
/* ---- Product 3: ry = py * pz ---- */
/* x0 = ry, x2 = pz. A = py limbs still in x20..x23; B = pz limbs, loaded
 * into x11..x14 (overwriting the px limbs, which are no longer needed). */
3848 "ldr x0, [x29, #16]\n\t"
3849 "ldr x2, [x29, #56]\n\t"
3850 /* Multiply */
3851 "ldp x11, x12, [x2]\n\t"
3852 "ldp x13, x14, [x2, #16]\n\t"
3853 /* A[0] * B[0] */
3854 "mul x3, x20, x11\n\t"
3855 "umulh x4, x20, x11\n\t"
3856 /* A[0] * B[1] */
3857 "mul x24, x20, x12\n\t"
3858 "umulh x5, x20, x12\n\t"
3859 "adds x4, x4, x24\n\t"
3860 "adc x5, x5, xzr\n\t"
3861 /* A[1] * B[0] */
3862 "mul x24, x21, x11\n\t"
3863 "umulh x25, x21, x11\n\t"
3864 "adds x4, x4, x24\n\t"
3865 "adcs x5, x5, x25\n\t"
3866 "adc x6, xzr, xzr\n\t"
3867 /* A[0] * B[2] */
3868 "mul x24, x20, x13\n\t"
3869 "umulh x25, x20, x13\n\t"
3870 "adds x5, x5, x24\n\t"
3871 "adc x6, x6, x25\n\t"
3872 /* A[1] * B[1] */
3873 "mul x24, x21, x12\n\t"
3874 "umulh x25, x21, x12\n\t"
3875 "adds x5, x5, x24\n\t"
3876 "adcs x6, x6, x25\n\t"
3877 "adc x7, xzr, xzr\n\t"
3878 /* A[2] * B[0] */
3879 "mul x24, x22, x11\n\t"
3880 "umulh x25, x22, x11\n\t"
3881 "adds x5, x5, x24\n\t"
3882 "adcs x6, x6, x25\n\t"
3883 "adc x7, x7, xzr\n\t"
3884 /* A[0] * B[3] */
3885 "mul x24, x20, x14\n\t"
3886 "umulh x25, x20, x14\n\t"
3887 "adds x6, x6, x24\n\t"
3888 "adcs x7, x7, x25\n\t"
3889 "adc x8, xzr, xzr\n\t"
3890 /* A[1] * B[2] */
3891 "mul x24, x21, x13\n\t"
3892 "umulh x25, x21, x13\n\t"
3893 "adds x6, x6, x24\n\t"
3894 "adcs x7, x7, x25\n\t"
3895 "adc x8, x8, xzr\n\t"
3896 /* A[2] * B[1] */
3897 "mul x24, x22, x12\n\t"
3898 "umulh x25, x22, x12\n\t"
3899 "adds x6, x6, x24\n\t"
3900 "adcs x7, x7, x25\n\t"
3901 "adc x8, x8, xzr\n\t"
3902 /* A[3] * B[0] */
3903 "mul x24, x23, x11\n\t"
3904 "umulh x25, x23, x11\n\t"
3905 "adds x6, x6, x24\n\t"
3906 "adcs x7, x7, x25\n\t"
3907 "adc x8, x8, xzr\n\t"
3908 /* A[1] * B[3] */
3909 "mul x24, x21, x14\n\t"
3910 "umulh x25, x21, x14\n\t"
3911 "adds x7, x7, x24\n\t"
3912 "adcs x8, x8, x25\n\t"
3913 "adc x9, xzr, xzr\n\t"
3914 /* A[2] * B[2] */
3915 "mul x24, x22, x13\n\t"
3916 "umulh x25, x22, x13\n\t"
3917 "adds x7, x7, x24\n\t"
3918 "adcs x8, x8, x25\n\t"
3919 "adc x9, x9, xzr\n\t"
3920 /* A[3] * B[1] */
3921 "mul x24, x23, x12\n\t"
3922 "umulh x25, x23, x12\n\t"
3923 "adds x7, x7, x24\n\t"
3924 "adcs x8, x8, x25\n\t"
3925 "adc x9, x9, xzr\n\t"
3926 /* A[2] * B[3] */
3927 "mul x24, x22, x14\n\t"
3928 "umulh x25, x22, x14\n\t"
3929 "adds x8, x8, x24\n\t"
3930 "adcs x9, x9, x25\n\t"
3931 "adc x10, xzr, xzr\n\t"
3932 /* A[3] * B[2] */
3933 "mul x24, x23, x13\n\t"
3934 "umulh x25, x23, x13\n\t"
3935 "adds x8, x8, x24\n\t"
3936 "adcs x9, x9, x25\n\t"
3937 "adc x10, x10, xzr\n\t"
3938 /* A[3] * B[3] */
3939 "mul x24, x23, x14\n\t"
3940 "umulh x25, x23, x14\n\t"
3941 "adds x9, x9, x24\n\t"
3942 "adc x10, x10, x25\n\t"
3943 /* Reduce */
/* Same folding sequence as for product 1. */
3944 /* Move top half into t4-t7 and remove top bit from t3 */
3945 "extr x10, x10, x9, #63\n\t"
3946 "extr x9, x9, x8, #63\n\t"
3947 "extr x8, x8, x7, #63\n\t"
3948 "extr x7, x7, x6, #63\n\t"
3949 "and x6, x6, #0x7fffffffffffffff\n\t"
3950 /* Multiply top half by 19 */
3951 "mov x24, #19\n\t"
3952 "mul x25, x24, x7\n\t"
3953 "umulh x7, x24, x7\n\t"
3954 "adds x3, x3, x25\n\t"
3955 "mul x25, x24, x8\n\t"
3956 "umulh x8, x24, x8\n\t"
3957 "adcs x4, x4, x25\n\t"
3958 "mul x25, x24, x9\n\t"
3959 "umulh x9, x24, x9\n\t"
3960 "adcs x5, x5, x25\n\t"
3961 "mul x25, x24, x10\n\t"
3962 "umulh x26, x24, x10\n\t"
3963 "adcs x6, x6, x25\n\t"
3964 "adc x26, x26, xzr\n\t"
3965 /* Add remaining product results in */
3966 "adds x4, x4, x7\n\t"
3967 "adcs x5, x5, x8\n\t"
3968 "adcs x6, x6, x9\n\t"
3969 "adc x26, x26, xzr\n\t"
3970 /* Overflow */
3971 "extr x26, x26, x6, #63\n\t"
3972 "mul x26, x26, x24\n\t"
3973 "and x6, x6, #0x7fffffffffffffff\n\t"
3974 "adds x3, x3, x26\n\t"
3975 "adcs x4, x4, xzr\n\t"
3976 "adcs x5, x5, xzr\n\t"
3977 "adc x6, x6, xzr\n\t"
3978 /* Reduce if top bit set */
3979 "and x26, x24, x6, asr 63\n\t"
3980 "and x6, x6, #0x7fffffffffffffff\n\t"
3981 "adds x3, x3, x26\n\t"
3982 "adcs x4, x4, xzr\n\t"
3983 "adcs x5, x5, xzr\n\t"
3984 "adc x6, x6, xzr\n\t"
3985 /* Store */
3986 "stp x3, x4, [x0]\n\t"
3987 "stp x5, x6, [x0, #16]\n\t"
/* ---- Product 4: rz = pz * pt ---- */
/* x0 = rz. Nothing is reloaded: A = pz limbs in x11..x14 (from product 3),
 * B = pt limbs in x15, x16, x17, x19 (from product 1). */
3988 "ldr x0, [x29, #24]\n\t"
3989 /* Multiply */
3990 /* A[0] * B[0] */
3991 "mul x3, x11, x15\n\t"
3992 "umulh x4, x11, x15\n\t"
3993 /* A[0] * B[1] */
3994 "mul x24, x11, x16\n\t"
3995 "umulh x5, x11, x16\n\t"
3996 "adds x4, x4, x24\n\t"
3997 "adc x5, x5, xzr\n\t"
3998 /* A[1] * B[0] */
3999 "mul x24, x12, x15\n\t"
4000 "umulh x25, x12, x15\n\t"
4001 "adds x4, x4, x24\n\t"
4002 "adcs x5, x5, x25\n\t"
4003 "adc x6, xzr, xzr\n\t"
4004 /* A[0] * B[2] */
4005 "mul x24, x11, x17\n\t"
4006 "umulh x25, x11, x17\n\t"
4007 "adds x5, x5, x24\n\t"
4008 "adc x6, x6, x25\n\t"
4009 /* A[1] * B[1] */
4010 "mul x24, x12, x16\n\t"
4011 "umulh x25, x12, x16\n\t"
4012 "adds x5, x5, x24\n\t"
4013 "adcs x6, x6, x25\n\t"
4014 "adc x7, xzr, xzr\n\t"
4015 /* A[2] * B[0] */
4016 "mul x24, x13, x15\n\t"
4017 "umulh x25, x13, x15\n\t"
4018 "adds x5, x5, x24\n\t"
4019 "adcs x6, x6, x25\n\t"
4020 "adc x7, x7, xzr\n\t"
4021 /* A[0] * B[3] */
4022 "mul x24, x11, x19\n\t"
4023 "umulh x25, x11, x19\n\t"
4024 "adds x6, x6, x24\n\t"
4025 "adcs x7, x7, x25\n\t"
4026 "adc x8, xzr, xzr\n\t"
4027 /* A[1] * B[2] */
4028 "mul x24, x12, x17\n\t"
4029 "umulh x25, x12, x17\n\t"
4030 "adds x6, x6, x24\n\t"
4031 "adcs x7, x7, x25\n\t"
4032 "adc x8, x8, xzr\n\t"
4033 /* A[2] * B[1] */
4034 "mul x24, x13, x16\n\t"
4035 "umulh x25, x13, x16\n\t"
4036 "adds x6, x6, x24\n\t"
4037 "adcs x7, x7, x25\n\t"
4038 "adc x8, x8, xzr\n\t"
4039 /* A[3] * B[0] */
4040 "mul x24, x14, x15\n\t"
4041 "umulh x25, x14, x15\n\t"
4042 "adds x6, x6, x24\n\t"
4043 "adcs x7, x7, x25\n\t"
4044 "adc x8, x8, xzr\n\t"
4045 /* A[1] * B[3] */
4046 "mul x24, x12, x19\n\t"
4047 "umulh x25, x12, x19\n\t"
4048 "adds x7, x7, x24\n\t"
4049 "adcs x8, x8, x25\n\t"
4050 "adc x9, xzr, xzr\n\t"
4051 /* A[2] * B[2] */
4052 "mul x24, x13, x17\n\t"
4053 "umulh x25, x13, x17\n\t"
4054 "adds x7, x7, x24\n\t"
4055 "adcs x8, x8, x25\n\t"
4056 "adc x9, x9, xzr\n\t"
4057 /* A[3] * B[1] */
4058 "mul x24, x14, x16\n\t"
4059 "umulh x25, x14, x16\n\t"
4060 "adds x7, x7, x24\n\t"
4061 "adcs x8, x8, x25\n\t"
4062 "adc x9, x9, xzr\n\t"
4063 /* A[2] * B[3] */
4064 "mul x24, x13, x19\n\t"
4065 "umulh x25, x13, x19\n\t"
4066 "adds x8, x8, x24\n\t"
4067 "adcs x9, x9, x25\n\t"
4068 "adc x10, xzr, xzr\n\t"
4069 /* A[3] * B[2] */
4070 "mul x24, x14, x17\n\t"
4071 "umulh x25, x14, x17\n\t"
4072 "adds x8, x8, x24\n\t"
4073 "adcs x9, x9, x25\n\t"
4074 "adc x10, x10, xzr\n\t"
4075 /* A[3] * B[3] */
4076 "mul x24, x14, x19\n\t"
4077 "umulh x25, x14, x19\n\t"
4078 "adds x9, x9, x24\n\t"
4079 "adc x10, x10, x25\n\t"
4080 /* Reduce */
/* Same folding sequence as for product 1. */
4081 /* Move top half into t4-t7 and remove top bit from t3 */
4082 "extr x10, x10, x9, #63\n\t"
4083 "extr x9, x9, x8, #63\n\t"
4084 "extr x8, x8, x7, #63\n\t"
4085 "extr x7, x7, x6, #63\n\t"
4086 "and x6, x6, #0x7fffffffffffffff\n\t"
4087 /* Multiply top half by 19 */
4088 "mov x24, #19\n\t"
4089 "mul x25, x24, x7\n\t"
4090 "umulh x7, x24, x7\n\t"
4091 "adds x3, x3, x25\n\t"
4092 "mul x25, x24, x8\n\t"
4093 "umulh x8, x24, x8\n\t"
4094 "adcs x4, x4, x25\n\t"
4095 "mul x25, x24, x9\n\t"
4096 "umulh x9, x24, x9\n\t"
4097 "adcs x5, x5, x25\n\t"
4098 "mul x25, x24, x10\n\t"
4099 "umulh x26, x24, x10\n\t"
4100 "adcs x6, x6, x25\n\t"
4101 "adc x26, x26, xzr\n\t"
4102 /* Add remaining product results in */
4103 "adds x4, x4, x7\n\t"
4104 "adcs x5, x5, x8\n\t"
4105 "adcs x6, x6, x9\n\t"
4106 "adc x26, x26, xzr\n\t"
4107 /* Overflow */
4108 "extr x26, x26, x6, #63\n\t"
4109 "mul x26, x26, x24\n\t"
4110 "and x6, x6, #0x7fffffffffffffff\n\t"
4111 "adds x3, x3, x26\n\t"
4112 "adcs x4, x4, xzr\n\t"
4113 "adcs x5, x5, xzr\n\t"
4114 "adc x6, x6, xzr\n\t"
4115 /* Reduce if top bit set */
4116 "and x26, x24, x6, asr 63\n\t"
4117 "and x6, x6, #0x7fffffffffffffff\n\t"
4118 "adds x3, x3, x26\n\t"
4119 "adcs x4, x4, xzr\n\t"
4120 "adcs x5, x5, xzr\n\t"
4121 "adc x6, x6, xzr\n\t"
4122 /* Store */
4123 "stp x3, x4, [x0]\n\t"
4124 "stp x5, x6, [x0, #16]\n\t"
/* Restore x29/x30 and release the 96-byte (0x60) frame. */
4125 "ldp x29, x30, [sp], #0x60\n\t"
/* All eight pointers are read-write register operands, so the compiler treats
 * the registers holding them as modified. */
4126 : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt)
4127 :
/* NOTE(review): x0..x7 are written as scratch above but are not listed as
 * clobbers; presumably they are covered by the "+r" operands when the
 * arguments occupy x0..x7 - confirm. */
4128 : "memory", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26"
4129 );
4130 }
4131
fe_ge_dbl(fe rx,fe ry,fe rz,fe rt,const fe px,const fe py,const fe pz)4132 void fe_ge_dbl(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz)
4133 {
4134 __asm__ __volatile__ (
4135 "stp x29, x30, [sp, #-80]!\n\t"
4136 "add x29, sp, #0\n\t"
4137 "str %x[rx], [x29, #16]\n\t"
4138 "str %x[ry], [x29, #24]\n\t"
4139 "str %x[rz], [x29, #32]\n\t"
4140 "str %x[rt], [x29, #40]\n\t"
4141 "str %x[px], [x29, #48]\n\t"
4142 "str %x[py], [x29, #56]\n\t"
4143 "str %x[pz], [x29, #64]\n\t"
4144 "ldr x1, [x29, #48]\n\t"
4145 /* Square */
4146 "ldp x12, x13, [x1]\n\t"
4147 "ldp x14, x15, [x1, #16]\n\t"
4148 /* A[0] * A[1] */
4149 "mul x5, x12, x13\n\t"
4150 "umulh x6, x12, x13\n\t"
4151 /* A[0] * A[2] */
4152 "mul x25, x12, x14\n\t"
4153 "umulh x7, x12, x14\n\t"
4154 "adds x6, x6, x25\n\t"
4155 "adc x7, x7, xzr\n\t"
4156 /* A[0] * A[3] */
4157 "mul x25, x12, x15\n\t"
4158 "umulh x8, x12, x15\n\t"
4159 "adds x7, x7, x25\n\t"
4160 "adc x8, x8, xzr\n\t"
4161 /* A[1] * A[2] */
4162 "mul x25, x13, x14\n\t"
4163 "umulh x26, x13, x14\n\t"
4164 "adds x7, x7, x25\n\t"
4165 "adcs x8, x8, x26\n\t"
4166 "adc x9, xzr, xzr\n\t"
4167 /* A[1] * A[3] */
4168 "mul x25, x13, x15\n\t"
4169 "umulh x26, x13, x15\n\t"
4170 "adds x8, x8, x25\n\t"
4171 "adc x9, x9, x26\n\t"
4172 /* A[2] * A[3] */
4173 "mul x25, x14, x15\n\t"
4174 "umulh x10, x14, x15\n\t"
4175 "adds x9, x9, x25\n\t"
4176 "adc x10, x10, xzr\n\t"
4177 /* Double */
4178 "adds x5, x5, x5\n\t"
4179 "adcs x6, x6, x6\n\t"
4180 "adcs x7, x7, x7\n\t"
4181 "adcs x8, x8, x8\n\t"
4182 "adcs x9, x9, x9\n\t"
4183 "adcs x10, x10, x10\n\t"
4184 "adc x11, xzr, xzr\n\t"
4185 /* A[0] * A[0] */
4186 "mul x4, x12, x12\n\t"
4187 "umulh x27, x12, x12\n\t"
4188 /* A[1] * A[1] */
4189 "mul x25, x13, x13\n\t"
4190 "umulh x26, x13, x13\n\t"
4191 "adds x5, x5, x27\n\t"
4192 "adcs x6, x6, x25\n\t"
4193 "adc x27, x26, xzr\n\t"
4194 /* A[2] * A[2] */
4195 "mul x25, x14, x14\n\t"
4196 "umulh x26, x14, x14\n\t"
4197 "adds x7, x7, x27\n\t"
4198 "adcs x8, x8, x25\n\t"
4199 "adc x27, x26, xzr\n\t"
4200 /* A[3] * A[3] */
4201 "mul x25, x15, x15\n\t"
4202 "umulh x26, x15, x15\n\t"
4203 "adds x9, x9, x27\n\t"
4204 "adcs x10, x10, x25\n\t"
4205 "adc x11, x11, x26\n\t"
4206 /* Reduce */
4207 /* Move top half into t4-t7 and remove top bit from t3 */
4208 "extr x11, x11, x10, #63\n\t"
4209 "extr x10, x10, x9, #63\n\t"
4210 "extr x9, x9, x8, #63\n\t"
4211 "extr x8, x8, x7, #63\n\t"
4212 "and x7, x7, #0x7fffffffffffffff\n\t"
4213 /* Multiply top half by 19 */
4214 "mov x25, #19\n\t"
4215 "mul x26, x25, x8\n\t"
4216 "umulh x8, x25, x8\n\t"
4217 "adds x4, x4, x26\n\t"
4218 "mul x26, x25, x9\n\t"
4219 "umulh x9, x25, x9\n\t"
4220 "adcs x5, x5, x26\n\t"
4221 "mul x26, x25, x10\n\t"
4222 "umulh x10, x25, x10\n\t"
4223 "adcs x6, x6, x26\n\t"
4224 "mul x26, x25, x11\n\t"
4225 "umulh x27, x25, x11\n\t"
4226 "adcs x7, x7, x26\n\t"
4227 "adc x27, x27, xzr\n\t"
4228 /* Add remaining product results in */
4229 "adds x5, x5, x8\n\t"
4230 "adcs x6, x6, x9\n\t"
4231 "adcs x7, x7, x10\n\t"
4232 "adc x27, x27, xzr\n\t"
4233 /* Overflow */
4234 "extr x27, x27, x7, #63\n\t"
4235 "mul x27, x27, x25\n\t"
4236 "and x7, x7, #0x7fffffffffffffff\n\t"
4237 "adds x4, x4, x27\n\t"
4238 "adcs x5, x5, xzr\n\t"
4239 "adcs x6, x6, xzr\n\t"
4240 "adc x7, x7, xzr\n\t"
4241 /* Reduce if top bit set */
4242 "and x27, x25, x7, asr 63\n\t"
4243 "and x7, x7, #0x7fffffffffffffff\n\t"
4244 "adds x4, x4, x27\n\t"
4245 "adcs x5, x5, xzr\n\t"
4246 "adcs x6, x6, xzr\n\t"
4247 "adc x7, x7, xzr\n\t"
4248 /* Store */
4249 "stp x4, x5, [x0]\n\t"
4250 "stp x6, x7, [x0, #16]\n\t"
4251 "ldr x0, [x29, #32]\n\t"
4252 "ldr x1, [x29, #56]\n\t"
4253 /* Square */
4254 "ldp x21, x22, [x1]\n\t"
4255 "ldp x23, x24, [x1, #16]\n\t"
4256 /* A[0] * A[1] */
4257 "mul x9, x21, x22\n\t"
4258 "umulh x10, x21, x22\n\t"
4259 /* A[0] * A[2] */
4260 "mul x25, x21, x23\n\t"
4261 "umulh x11, x21, x23\n\t"
4262 "adds x10, x10, x25\n\t"
4263 "adc x11, x11, xzr\n\t"
4264 /* A[0] * A[3] */
4265 "mul x25, x21, x24\n\t"
4266 "umulh x16, x21, x24\n\t"
4267 "adds x11, x11, x25\n\t"
4268 "adc x16, x16, xzr\n\t"
4269 /* A[1] * A[2] */
4270 "mul x25, x22, x23\n\t"
4271 "umulh x26, x22, x23\n\t"
4272 "adds x11, x11, x25\n\t"
4273 "adcs x16, x16, x26\n\t"
4274 "adc x17, xzr, xzr\n\t"
4275 /* A[1] * A[3] */
4276 "mul x25, x22, x24\n\t"
4277 "umulh x26, x22, x24\n\t"
4278 "adds x16, x16, x25\n\t"
4279 "adc x17, x17, x26\n\t"
4280 /* A[2] * A[3] */
4281 "mul x25, x23, x24\n\t"
4282 "umulh x19, x23, x24\n\t"
4283 "adds x17, x17, x25\n\t"
4284 "adc x19, x19, xzr\n\t"
4285 /* Double */
4286 "adds x9, x9, x9\n\t"
4287 "adcs x10, x10, x10\n\t"
4288 "adcs x11, x11, x11\n\t"
4289 "adcs x16, x16, x16\n\t"
4290 "adcs x17, x17, x17\n\t"
4291 "adcs x19, x19, x19\n\t"
4292 "adc x20, xzr, xzr\n\t"
4293 /* A[0] * A[0] */
4294 "mul x8, x21, x21\n\t"
4295 "umulh x27, x21, x21\n\t"
4296 /* A[1] * A[1] */
4297 "mul x25, x22, x22\n\t"
4298 "umulh x26, x22, x22\n\t"
4299 "adds x9, x9, x27\n\t"
4300 "adcs x10, x10, x25\n\t"
4301 "adc x27, x26, xzr\n\t"
4302 /* A[2] * A[2] */
4303 "mul x25, x23, x23\n\t"
4304 "umulh x26, x23, x23\n\t"
4305 "adds x11, x11, x27\n\t"
4306 "adcs x16, x16, x25\n\t"
4307 "adc x27, x26, xzr\n\t"
4308 /* A[3] * A[3] */
4309 "mul x25, x24, x24\n\t"
4310 "umulh x26, x24, x24\n\t"
4311 "adds x17, x17, x27\n\t"
4312 "adcs x19, x19, x25\n\t"
4313 "adc x20, x20, x26\n\t"
4314 /* Reduce */
4315 /* Move top half into t4-t7 and remove top bit from t3 */
4316 "extr x20, x20, x19, #63\n\t"
4317 "extr x19, x19, x17, #63\n\t"
4318 "extr x17, x17, x16, #63\n\t"
4319 "extr x16, x16, x11, #63\n\t"
4320 "and x11, x11, #0x7fffffffffffffff\n\t"
4321 /* Multiply top half by 19 */
4322 "mov x25, #19\n\t"
4323 "mul x26, x25, x16\n\t"
4324 "umulh x16, x25, x16\n\t"
4325 "adds x8, x8, x26\n\t"
4326 "mul x26, x25, x17\n\t"
4327 "umulh x17, x25, x17\n\t"
4328 "adcs x9, x9, x26\n\t"
4329 "mul x26, x25, x19\n\t"
4330 "umulh x19, x25, x19\n\t"
4331 "adcs x10, x10, x26\n\t"
4332 "mul x26, x25, x20\n\t"
4333 "umulh x27, x25, x20\n\t"
4334 "adcs x11, x11, x26\n\t"
4335 "adc x27, x27, xzr\n\t"
4336 /* Add remaining product results in */
4337 "adds x9, x9, x16\n\t"
4338 "adcs x10, x10, x17\n\t"
4339 "adcs x11, x11, x19\n\t"
4340 "adc x27, x27, xzr\n\t"
4341 /* Overflow */
4342 "extr x27, x27, x11, #63\n\t"
4343 "mul x27, x27, x25\n\t"
4344 "and x11, x11, #0x7fffffffffffffff\n\t"
4345 "adds x8, x8, x27\n\t"
4346 "adcs x9, x9, xzr\n\t"
4347 "adcs x10, x10, xzr\n\t"
4348 "adc x11, x11, xzr\n\t"
4349 /* Reduce if top bit set */
4350 "and x27, x25, x11, asr 63\n\t"
4351 "and x11, x11, #0x7fffffffffffffff\n\t"
4352 "adds x8, x8, x27\n\t"
4353 "adcs x9, x9, xzr\n\t"
4354 "adcs x10, x10, xzr\n\t"
4355 "adc x11, x11, xzr\n\t"
4356 /* Store */
4357 "stp x8, x9, [x0]\n\t"
4358 "stp x10, x11, [x0, #16]\n\t"
4359 "ldr x0, [x29, #24]\n\t"
4360 /* Add */
4361 "adds x12, x12, x21\n\t"
4362 "adcs x13, x13, x22\n\t"
4363 "adcs x14, x14, x23\n\t"
4364 "adc x15, x15, x24\n\t"
4365 "mov x25, #-19\n\t"
4366 "asr x28, x15, #63\n\t"
4367 /* Mask the modulus */
4368 "and x25, x28, x25\n\t"
4369 "and x26, x28, #0x7fffffffffffffff\n\t"
4370 /* Sub modulus (if overflow) */
4371 "subs x12, x12, x25\n\t"
4372 "sbcs x13, x13, x28\n\t"
4373 "sbcs x14, x14, x28\n\t"
4374 "sbc x15, x15, x26\n\t"
4375 "ldr x0, [x29, #40]\n\t"
4376 /* Square */
4377 /* A[0] * A[1] */
4378 "mul x17, x12, x13\n\t"
4379 "umulh x19, x12, x13\n\t"
4380 /* A[0] * A[2] */
4381 "mul x25, x12, x14\n\t"
4382 "umulh x20, x12, x14\n\t"
4383 "adds x19, x19, x25\n\t"
4384 "adc x20, x20, xzr\n\t"
4385 /* A[0] * A[3] */
4386 "mul x25, x12, x15\n\t"
4387 "umulh x21, x12, x15\n\t"
4388 "adds x20, x20, x25\n\t"
4389 "adc x21, x21, xzr\n\t"
4390 /* A[1] * A[2] */
4391 "mul x25, x13, x14\n\t"
4392 "umulh x26, x13, x14\n\t"
4393 "adds x20, x20, x25\n\t"
4394 "adcs x21, x21, x26\n\t"
4395 "adc x22, xzr, xzr\n\t"
4396 /* A[1] * A[3] */
4397 "mul x25, x13, x15\n\t"
4398 "umulh x26, x13, x15\n\t"
4399 "adds x21, x21, x25\n\t"
4400 "adc x22, x22, x26\n\t"
4401 /* A[2] * A[3] */
4402 "mul x25, x14, x15\n\t"
4403 "umulh x23, x14, x15\n\t"
4404 "adds x22, x22, x25\n\t"
4405 "adc x23, x23, xzr\n\t"
4406 /* Double */
4407 "adds x17, x17, x17\n\t"
4408 "adcs x19, x19, x19\n\t"
4409 "adcs x20, x20, x20\n\t"
4410 "adcs x21, x21, x21\n\t"
4411 "adcs x22, x22, x22\n\t"
4412 "adcs x23, x23, x23\n\t"
4413 "adc x24, xzr, xzr\n\t"
4414 /* A[0] * A[0] */
4415 "mul x16, x12, x12\n\t"
4416 "umulh x27, x12, x12\n\t"
4417 /* A[1] * A[1] */
4418 "mul x25, x13, x13\n\t"
4419 "umulh x26, x13, x13\n\t"
4420 "adds x17, x17, x27\n\t"
4421 "adcs x19, x19, x25\n\t"
4422 "adc x27, x26, xzr\n\t"
4423 /* A[2] * A[2] */
4424 "mul x25, x14, x14\n\t"
4425 "umulh x26, x14, x14\n\t"
4426 "adds x20, x20, x27\n\t"
4427 "adcs x21, x21, x25\n\t"
4428 "adc x27, x26, xzr\n\t"
4429 /* A[3] * A[3] */
4430 "mul x25, x15, x15\n\t"
4431 "umulh x26, x15, x15\n\t"
4432 "adds x22, x22, x27\n\t"
4433 "adcs x23, x23, x25\n\t"
4434 "adc x24, x24, x26\n\t"
4435 /* Reduce */
4436 /* Move top half into t4-t7 and remove top bit from t3 */
4437 "extr x24, x24, x23, #63\n\t"
4438 "extr x23, x23, x22, #63\n\t"
4439 "extr x22, x22, x21, #63\n\t"
4440 "extr x21, x21, x20, #63\n\t"
4441 "and x20, x20, #0x7fffffffffffffff\n\t"
4442 /* Multiply top half by 19 */
4443 "mov x25, #19\n\t"
4444 "mul x26, x25, x21\n\t"
4445 "umulh x21, x25, x21\n\t"
4446 "adds x16, x16, x26\n\t"
4447 "mul x26, x25, x22\n\t"
4448 "umulh x22, x25, x22\n\t"
4449 "adcs x17, x17, x26\n\t"
4450 "mul x26, x25, x23\n\t"
4451 "umulh x23, x25, x23\n\t"
4452 "adcs x19, x19, x26\n\t"
4453 "mul x26, x25, x24\n\t"
4454 "umulh x27, x25, x24\n\t"
4455 "adcs x20, x20, x26\n\t"
4456 "adc x27, x27, xzr\n\t"
4457 /* Add remaining product results in */
4458 "adds x17, x17, x21\n\t"
4459 "adcs x19, x19, x22\n\t"
4460 "adcs x20, x20, x23\n\t"
4461 "adc x27, x27, xzr\n\t"
4462 /* Overflow */
4463 "extr x27, x27, x20, #63\n\t"
4464 "mul x27, x27, x25\n\t"
4465 "and x20, x20, #0x7fffffffffffffff\n\t"
4466 "adds x16, x16, x27\n\t"
4467 "adcs x17, x17, xzr\n\t"
4468 "adcs x19, x19, xzr\n\t"
4469 "adc x20, x20, xzr\n\t"
4470 /* Reduce if top bit set */
4471 "and x27, x25, x20, asr 63\n\t"
4472 "and x20, x20, #0x7fffffffffffffff\n\t"
4473 "adds x16, x16, x27\n\t"
4474 "adcs x17, x17, xzr\n\t"
4475 "adcs x19, x19, xzr\n\t"
4476 "adc x20, x20, xzr\n\t"
4477 /* Store */
4478 "stp x16, x17, [x0]\n\t"
4479 "stp x19, x20, [x0, #16]\n\t"
4480 "ldr x0, [x29, #24]\n\t"
4481 "ldr x1, [x29, #32]\n\t"
4482 /* Add */
4483 "adds x12, x8, x4\n\t"
4484 "adcs x13, x9, x5\n\t"
4485 "adcs x14, x10, x6\n\t"
4486 "adc x15, x11, x7\n\t"
4487 "mov x25, #-19\n\t"
4488 "asr x28, x15, #63\n\t"
4489 /* Mask the modulus */
4490 "and x25, x28, x25\n\t"
4491 "and x26, x28, #0x7fffffffffffffff\n\t"
4492 /* Sub modulus (if overflow) */
4493 "subs x12, x12, x25\n\t"
4494 "sbcs x13, x13, x28\n\t"
4495 "sbcs x14, x14, x28\n\t"
4496 "sbc x15, x15, x26\n\t"
4497 /* Sub */
4498 "subs x21, x8, x4\n\t"
4499 "sbcs x22, x9, x5\n\t"
4500 "sbcs x23, x10, x6\n\t"
4501 "sbcs x24, x11, x7\n\t"
4502 "mov x25, #-19\n\t"
4503 "csetm x28, cc\n\t"
4504 /* Mask the modulus */
4505 "and x25, x28, x25\n\t"
4506 "and x26, x28, #0x7fffffffffffffff\n\t"
4507 /* Add modulus (if underflow) */
4508 "adds x21, x21, x25\n\t"
4509 "adcs x22, x22, x28\n\t"
4510 "adcs x23, x23, x28\n\t"
4511 "adc x24, x24, x26\n\t"
4512 "stp x12, x13, [x0]\n\t"
4513 "stp x14, x15, [x0, #16]\n\t"
4514 "stp x21, x22, [x1]\n\t"
4515 "stp x23, x24, [x1, #16]\n\t"
4516 "ldr x0, [x29, #16]\n\t"
4517 /* Sub */
4518 "subs x16, x16, x12\n\t"
4519 "sbcs x17, x17, x13\n\t"
4520 "sbcs x19, x19, x14\n\t"
4521 "sbcs x20, x20, x15\n\t"
4522 "mov x25, #-19\n\t"
4523 "csetm x28, cc\n\t"
4524 /* Mask the modulus */
4525 "and x25, x28, x25\n\t"
4526 "and x26, x28, #0x7fffffffffffffff\n\t"
4527 /* Add modulus (if underflow) */
4528 "adds x16, x16, x25\n\t"
4529 "adcs x17, x17, x28\n\t"
4530 "adcs x19, x19, x28\n\t"
4531 "adc x20, x20, x26\n\t"
4532 "stp x16, x17, [x0]\n\t"
4533 "stp x19, x20, [x0, #16]\n\t"
4534 "ldr x0, [x29, #40]\n\t"
4535 "ldr x1, [x29, #64]\n\t"
4536 /* Square * 2 */
4537 "ldp x12, x13, [x1]\n\t"
4538 "ldp x14, x15, [x1, #16]\n\t"
4539 /* A[0] * A[1] */
4540 "mul x5, x12, x13\n\t"
4541 "umulh x6, x12, x13\n\t"
4542 /* A[0] * A[2] */
4543 "mul x25, x12, x14\n\t"
4544 "umulh x7, x12, x14\n\t"
4545 "adds x6, x6, x25\n\t"
4546 "adc x7, x7, xzr\n\t"
4547 /* A[0] * A[3] */
4548 "mul x25, x12, x15\n\t"
4549 "umulh x8, x12, x15\n\t"
4550 "adds x7, x7, x25\n\t"
4551 "adc x8, x8, xzr\n\t"
4552 /* A[1] * A[2] */
4553 "mul x25, x13, x14\n\t"
4554 "umulh x26, x13, x14\n\t"
4555 "adds x7, x7, x25\n\t"
4556 "adcs x8, x8, x26\n\t"
4557 "adc x9, xzr, xzr\n\t"
4558 /* A[1] * A[3] */
4559 "mul x25, x13, x15\n\t"
4560 "umulh x26, x13, x15\n\t"
4561 "adds x8, x8, x25\n\t"
4562 "adc x9, x9, x26\n\t"
4563 /* A[2] * A[3] */
4564 "mul x25, x14, x15\n\t"
4565 "umulh x10, x14, x15\n\t"
4566 "adds x9, x9, x25\n\t"
4567 "adc x10, x10, xzr\n\t"
4568 /* Double */
4569 "adds x5, x5, x5\n\t"
4570 "adcs x6, x6, x6\n\t"
4571 "adcs x7, x7, x7\n\t"
4572 "adcs x8, x8, x8\n\t"
4573 "adcs x9, x9, x9\n\t"
4574 "adcs x10, x10, x10\n\t"
4575 "adc x11, xzr, xzr\n\t"
4576 /* A[0] * A[0] */
4577 "mul x4, x12, x12\n\t"
4578 "umulh x28, x12, x12\n\t"
4579 /* A[1] * A[1] */
4580 "mul x25, x13, x13\n\t"
4581 "umulh x26, x13, x13\n\t"
4582 "adds x5, x5, x28\n\t"
4583 "adcs x6, x6, x25\n\t"
4584 "adc x28, x26, xzr\n\t"
4585 /* A[2] * A[2] */
4586 "mul x25, x14, x14\n\t"
4587 "umulh x26, x14, x14\n\t"
4588 "adds x7, x7, x28\n\t"
4589 "adcs x8, x8, x25\n\t"
4590 "adc x28, x26, xzr\n\t"
4591 /* A[3] * A[3] */
4592 "mul x25, x15, x15\n\t"
4593 "umulh x26, x15, x15\n\t"
4594 "adds x9, x9, x28\n\t"
4595 "adcs x10, x10, x25\n\t"
4596 "adc x11, x11, x26\n\t"
4597 /* Double and Reduce */
4598 "mov x25, #0x169\n\t"
4599 /* Move top half into t4-t7 and remove top bit from t3 */
4600 "lsr x28, x11, #61\n\t"
4601 "extr x11, x11, x10, #62\n\t"
4602 "extr x10, x10, x9, #62\n\t"
4603 "extr x9, x9, x8, #62\n\t"
4604 "extr x8, x8, x7, #62\n\t"
4605 "extr x7, x7, x6, #63\n\t"
4606 "extr x6, x6, x5, #63\n\t"
4607 "extr x5, x5, x4, #63\n\t"
4608 "lsl x4, x4, #1\n\t"
4609 "and x7, x7, #0x7fffffffffffffff\n\t"
4610 /* Two left, only one right */
4611 "and x11, x11, #0x7fffffffffffffff\n\t"
4612 /* Multiply top bits by 19*19 */
4613 "mul x28, x28, x25\n\t"
4614 /* Multiply top half by 19 */
4615 "mov x25, #19\n\t"
4616 "mul x26, x25, x8\n\t"
4617 "umulh x8, x25, x8\n\t"
4618 "adds x4, x4, x26\n\t"
4619 "mul x26, x25, x9\n\t"
4620 "umulh x9, x25, x9\n\t"
4621 "adcs x5, x5, x26\n\t"
4622 "mul x26, x25, x10\n\t"
4623 "umulh x10, x25, x10\n\t"
4624 "adcs x6, x6, x26\n\t"
4625 "mul x26, x25, x11\n\t"
4626 "umulh x27, x25, x11\n\t"
4627 "adcs x7, x7, x26\n\t"
4628 "adc x27, x27, xzr\n\t"
4629 /* Add remaining product results in */
4630 "adds x4, x4, x28\n\t"
4631 "adcs x5, x5, x8\n\t"
4632 "adcs x6, x6, x9\n\t"
4633 "adcs x7, x7, x10\n\t"
4634 "adc x27, x27, xzr\n\t"
4635 /* Overflow */
4636 "extr x27, x27, x7, #63\n\t"
4637 "mul x27, x27, x25\n\t"
4638 "and x7, x7, #0x7fffffffffffffff\n\t"
4639 "adds x4, x4, x27\n\t"
4640 "adcs x5, x5, xzr\n\t"
4641 "adcs x6, x6, xzr\n\t"
4642 "adc x7, x7, xzr\n\t"
4643 /* Reduce if top bit set */
4644 "and x27, x25, x7, asr 63\n\t"
4645 "and x7, x7, #0x7fffffffffffffff\n\t"
4646 "adds x4, x4, x27\n\t"
4647 "adcs x5, x5, xzr\n\t"
4648 "adcs x6, x6, xzr\n\t"
4649 "adc x7, x7, xzr\n\t"
4650 /* Store */
4651 "ldr x0, [x29, #40]\n\t"
4652 /* Sub */
4653 "subs x4, x4, x21\n\t"
4654 "sbcs x5, x5, x22\n\t"
4655 "sbcs x6, x6, x23\n\t"
4656 "sbcs x7, x7, x24\n\t"
4657 "mov x25, #-19\n\t"
4658 "csetm x28, cc\n\t"
4659 /* Mask the modulus */
4660 "and x25, x28, x25\n\t"
4661 "and x26, x28, #0x7fffffffffffffff\n\t"
4662 /* Add modulus (if underflow) */
4663 "adds x4, x4, x25\n\t"
4664 "adcs x5, x5, x28\n\t"
4665 "adcs x6, x6, x28\n\t"
4666 "adc x7, x7, x26\n\t"
4667 "stp x4, x5, [x0]\n\t"
4668 "stp x6, x7, [x0, #16]\n\t"
4669 "ldp x29, x30, [sp], #0x50\n\t"
4670 : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz)
4671 :
4672 : "memory", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
4673 );
4674 }
4675
/* fe_ge_madd: mixed point addition step on field elements mod 2^255-19.
 *
 * Every operand is read/written as four 64-bit limbs (32 bytes). With
 *   A = (py + px) * qyplusx
 *   B = (py - px) * qyminusx
 *   C = qxy2d * pt
 *   D = 2 * pz
 * the routine stores:
 *   rx = A - B
 *   ry = A + B
 *   rz = D + C
 *   rt = D - C
 *
 * rx, ry, rz, rt  outputs (written with stp, 32 bytes each)
 * px, py, pz, pt  first operand's coordinates (read only)
 * qxy2d, qyplusx, qyminusx  second operand's precomputed values (read only)
 *
 * All loads of px/py happen before the first store, and pz/pt are loaded
 * before rz/rt are stored.
 *
 * NOTE(review): the asm uses x0-x10 as scratch without naming them as
 * clobbers. This appears to rely on the eleven "+r" operands being the only
 * things the compiler can place there (x11-x17, x19-x28 are clobbered) -
 * confirm for toolchains where x18/x30 are allocatable.
 */
fe_ge_madd(fe rx,fe ry,fe rz,fe rt,const fe px,const fe py,const fe pz,const fe pt,const fe qxy2d,const fe qyplusx,const fe qyminusx)4676 void fe_ge_madd(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qxy2d, const fe qyplusx, const fe qyminusx)
4677 {
4678 __asm__ __volatile__ (
4679 "stp x29, x30, [sp, #-112]!\n\t"
4680 "add x29, sp, #0\n\t"
/* Spill all eleven pointer arguments into the 112-byte frame (x29 == sp),
 * so that x0-x3 can be reused as working pointers below. */
4681 "str %x[qyminusx], [sp, #104]\n\t"
4682 "str %x[qyplusx], [sp, #96]\n\t"
4683 "str %x[qxy2d], [sp, #88]\n\t"
4684 "str %x[rx], [x29, #16]\n\t"
4685 "str %x[ry], [x29, #24]\n\t"
4686 "str %x[rz], [x29, #32]\n\t"
4687 "str %x[rt], [x29, #40]\n\t"
4688 "str %x[px], [x29, #48]\n\t"
4689 "str %x[py], [x29, #56]\n\t"
4690 "str %x[pz], [x29, #64]\n\t"
4691 "str %x[pt], [x29, #72]\n\t"
/* x2 = py, x3 = px; x4-x7 = py + px, x8-x11 = py - px */
4692 "ldr x2, [x29, #56]\n\t"
4693 "ldr x3, [x29, #48]\n\t"
4694 /* Add */
4695 "ldp x12, x13, [x2]\n\t"
4696 "ldp x14, x15, [x2, #16]\n\t"
4697 "ldp x16, x17, [x3]\n\t"
4698 "ldp x19, x20, [x3, #16]\n\t"
4699 "adds x4, x12, x16\n\t"
4700 "adcs x5, x13, x17\n\t"
4701 "adcs x6, x14, x19\n\t"
4702 "adc x7, x15, x20\n\t"
4703 "mov x25, #-19\n\t"
4704 "asr x28, x7, #63\n\t"
4705 /* Mask the modulus */
4706 "and x25, x28, x25\n\t"
4707 "and x26, x28, #0x7fffffffffffffff\n\t"
4708 /* Sub modulus (if overflow) */
4709 "subs x4, x4, x25\n\t"
4710 "sbcs x5, x5, x28\n\t"
4711 "sbcs x6, x6, x28\n\t"
4712 "sbc x7, x7, x26\n\t"
4713 /* Sub */
4714 "subs x8, x12, x16\n\t"
4715 "sbcs x9, x13, x17\n\t"
4716 "sbcs x10, x14, x19\n\t"
4717 "sbcs x11, x15, x20\n\t"
4718 "mov x25, #-19\n\t"
4719 "csetm x28, cc\n\t"
4720 /* Mask the modulus */
4721 "and x25, x28, x25\n\t"
4722 "and x26, x28, #0x7fffffffffffffff\n\t"
4723 /* Add modulus (if underflow) */
4724 "adds x8, x8, x25\n\t"
4725 "adcs x9, x9, x28\n\t"
4726 "adcs x10, x10, x28\n\t"
4727 "adc x11, x11, x26\n\t"
/* x12-x15 = (py + px) * qyplusx, reduced; result stays in registers */
4728 "ldr x0, [x29, #32]\n\t"
4729 "ldr x2, [sp, #96]\n\t"
4730 /* Multiply */
4731 "ldp x21, x22, [x2]\n\t"
4732 "ldp x23, x24, [x2, #16]\n\t"
4733 /* A[0] * B[0] */
4734 "mul x12, x4, x21\n\t"
4735 "umulh x13, x4, x21\n\t"
4736 /* A[0] * B[1] */
4737 "mul x25, x4, x22\n\t"
4738 "umulh x14, x4, x22\n\t"
4739 "adds x13, x13, x25\n\t"
4740 "adc x14, x14, xzr\n\t"
4741 /* A[1] * B[0] */
4742 "mul x25, x5, x21\n\t"
4743 "umulh x26, x5, x21\n\t"
4744 "adds x13, x13, x25\n\t"
4745 "adcs x14, x14, x26\n\t"
4746 "adc x15, xzr, xzr\n\t"
4747 /* A[0] * B[2] */
4748 "mul x25, x4, x23\n\t"
4749 "umulh x26, x4, x23\n\t"
4750 "adds x14, x14, x25\n\t"
4751 "adc x15, x15, x26\n\t"
4752 /* A[1] * B[1] */
4753 "mul x25, x5, x22\n\t"
4754 "umulh x26, x5, x22\n\t"
4755 "adds x14, x14, x25\n\t"
4756 "adcs x15, x15, x26\n\t"
4757 "adc x16, xzr, xzr\n\t"
4758 /* A[2] * B[0] */
4759 "mul x25, x6, x21\n\t"
4760 "umulh x26, x6, x21\n\t"
4761 "adds x14, x14, x25\n\t"
4762 "adcs x15, x15, x26\n\t"
4763 "adc x16, x16, xzr\n\t"
4764 /* A[0] * B[3] */
4765 "mul x25, x4, x24\n\t"
4766 "umulh x26, x4, x24\n\t"
4767 "adds x15, x15, x25\n\t"
4768 "adcs x16, x16, x26\n\t"
4769 "adc x17, xzr, xzr\n\t"
4770 /* A[1] * B[2] */
4771 "mul x25, x5, x23\n\t"
4772 "umulh x26, x5, x23\n\t"
4773 "adds x15, x15, x25\n\t"
4774 "adcs x16, x16, x26\n\t"
4775 "adc x17, x17, xzr\n\t"
4776 /* A[2] * B[1] */
4777 "mul x25, x6, x22\n\t"
4778 "umulh x26, x6, x22\n\t"
4779 "adds x15, x15, x25\n\t"
4780 "adcs x16, x16, x26\n\t"
4781 "adc x17, x17, xzr\n\t"
4782 /* A[3] * B[0] */
4783 "mul x25, x7, x21\n\t"
4784 "umulh x26, x7, x21\n\t"
4785 "adds x15, x15, x25\n\t"
4786 "adcs x16, x16, x26\n\t"
4787 "adc x17, x17, xzr\n\t"
4788 /* A[1] * B[3] */
4789 "mul x25, x5, x24\n\t"
4790 "umulh x26, x5, x24\n\t"
4791 "adds x16, x16, x25\n\t"
4792 "adcs x17, x17, x26\n\t"
4793 "adc x19, xzr, xzr\n\t"
4794 /* A[2] * B[2] */
4795 "mul x25, x6, x23\n\t"
4796 "umulh x26, x6, x23\n\t"
4797 "adds x16, x16, x25\n\t"
4798 "adcs x17, x17, x26\n\t"
4799 "adc x19, x19, xzr\n\t"
4800 /* A[3] * B[1] */
4801 "mul x25, x7, x22\n\t"
4802 "umulh x26, x7, x22\n\t"
4803 "adds x16, x16, x25\n\t"
4804 "adcs x17, x17, x26\n\t"
4805 "adc x19, x19, xzr\n\t"
4806 /* A[2] * B[3] */
4807 "mul x25, x6, x24\n\t"
4808 "umulh x26, x6, x24\n\t"
4809 "adds x17, x17, x25\n\t"
4810 "adcs x19, x19, x26\n\t"
4811 "adc x20, xzr, xzr\n\t"
4812 /* A[3] * B[2] */
4813 "mul x25, x7, x23\n\t"
4814 "umulh x26, x7, x23\n\t"
4815 "adds x17, x17, x25\n\t"
4816 "adcs x19, x19, x26\n\t"
4817 "adc x20, x20, xzr\n\t"
4818 /* A[3] * B[3] */
4819 "mul x25, x7, x24\n\t"
4820 "umulh x26, x7, x24\n\t"
4821 "adds x19, x19, x25\n\t"
4822 "adc x20, x20, x26\n\t"
4823 /* Reduce */
4824 /* Move top half into t4-t7 and remove top bit from t3 */
4825 "extr x20, x20, x19, #63\n\t"
4826 "extr x19, x19, x17, #63\n\t"
4827 "extr x17, x17, x16, #63\n\t"
4828 "extr x16, x16, x15, #63\n\t"
4829 "and x15, x15, #0x7fffffffffffffff\n\t"
4830 /* Multiply top half by 19 */
4831 "mov x25, #19\n\t"
4832 "mul x26, x25, x16\n\t"
4833 "umulh x16, x25, x16\n\t"
4834 "adds x12, x12, x26\n\t"
4835 "mul x26, x25, x17\n\t"
4836 "umulh x17, x25, x17\n\t"
4837 "adcs x13, x13, x26\n\t"
4838 "mul x26, x25, x19\n\t"
4839 "umulh x19, x25, x19\n\t"
4840 "adcs x14, x14, x26\n\t"
4841 "mul x26, x25, x20\n\t"
4842 "umulh x27, x25, x20\n\t"
4843 "adcs x15, x15, x26\n\t"
4844 "adc x27, x27, xzr\n\t"
4845 /* Add remaining product results in */
4846 "adds x13, x13, x16\n\t"
4847 "adcs x14, x14, x17\n\t"
4848 "adcs x15, x15, x19\n\t"
4849 "adc x27, x27, xzr\n\t"
4850 /* Overflow */
4851 "extr x27, x27, x15, #63\n\t"
4852 "mul x27, x27, x25\n\t"
4853 "and x15, x15, #0x7fffffffffffffff\n\t"
4854 "adds x12, x12, x27\n\t"
4855 "adcs x13, x13, xzr\n\t"
4856 "adcs x14, x14, xzr\n\t"
4857 "adc x15, x15, xzr\n\t"
4858 /* Reduce if top bit set */
4859 "and x27, x25, x15, asr 63\n\t"
4860 "and x15, x15, #0x7fffffffffffffff\n\t"
4861 "adds x12, x12, x27\n\t"
4862 "adcs x13, x13, xzr\n\t"
4863 "adcs x14, x14, xzr\n\t"
4864 "adc x15, x15, xzr\n\t"
4865 /* Store */
/* x4-x7 = (py - px) * qyminusx, reduced */
4866 "ldr x0, [x29, #24]\n\t"
4867 "ldr x1, [sp, #104]\n\t"
4868 /* Multiply */
4869 "ldp x21, x22, [x1]\n\t"
4870 "ldp x23, x24, [x1, #16]\n\t"
4871 /* A[0] * B[0] */
4872 "mul x4, x8, x21\n\t"
4873 "umulh x5, x8, x21\n\t"
4874 /* A[0] * B[1] */
4875 "mul x25, x8, x22\n\t"
4876 "umulh x6, x8, x22\n\t"
4877 "adds x5, x5, x25\n\t"
4878 "adc x6, x6, xzr\n\t"
4879 /* A[1] * B[0] */
4880 "mul x25, x9, x21\n\t"
4881 "umulh x26, x9, x21\n\t"
4882 "adds x5, x5, x25\n\t"
4883 "adcs x6, x6, x26\n\t"
4884 "adc x7, xzr, xzr\n\t"
4885 /* A[0] * B[2] */
4886 "mul x25, x8, x23\n\t"
4887 "umulh x26, x8, x23\n\t"
4888 "adds x6, x6, x25\n\t"
4889 "adc x7, x7, x26\n\t"
4890 /* A[1] * B[1] */
4891 "mul x25, x9, x22\n\t"
4892 "umulh x26, x9, x22\n\t"
4893 "adds x6, x6, x25\n\t"
4894 "adcs x7, x7, x26\n\t"
4895 "adc x16, xzr, xzr\n\t"
4896 /* A[2] * B[0] */
4897 "mul x25, x10, x21\n\t"
4898 "umulh x26, x10, x21\n\t"
4899 "adds x6, x6, x25\n\t"
4900 "adcs x7, x7, x26\n\t"
4901 "adc x16, x16, xzr\n\t"
4902 /* A[0] * B[3] */
4903 "mul x25, x8, x24\n\t"
4904 "umulh x26, x8, x24\n\t"
4905 "adds x7, x7, x25\n\t"
4906 "adcs x16, x16, x26\n\t"
4907 "adc x17, xzr, xzr\n\t"
4908 /* A[1] * B[2] */
4909 "mul x25, x9, x23\n\t"
4910 "umulh x26, x9, x23\n\t"
4911 "adds x7, x7, x25\n\t"
4912 "adcs x16, x16, x26\n\t"
4913 "adc x17, x17, xzr\n\t"
4914 /* A[2] * B[1] */
4915 "mul x25, x10, x22\n\t"
4916 "umulh x26, x10, x22\n\t"
4917 "adds x7, x7, x25\n\t"
4918 "adcs x16, x16, x26\n\t"
4919 "adc x17, x17, xzr\n\t"
4920 /* A[3] * B[0] */
4921 "mul x25, x11, x21\n\t"
4922 "umulh x26, x11, x21\n\t"
4923 "adds x7, x7, x25\n\t"
4924 "adcs x16, x16, x26\n\t"
4925 "adc x17, x17, xzr\n\t"
4926 /* A[1] * B[3] */
4927 "mul x25, x9, x24\n\t"
4928 "umulh x26, x9, x24\n\t"
4929 "adds x16, x16, x25\n\t"
4930 "adcs x17, x17, x26\n\t"
4931 "adc x19, xzr, xzr\n\t"
4932 /* A[2] * B[2] */
4933 "mul x25, x10, x23\n\t"
4934 "umulh x26, x10, x23\n\t"
4935 "adds x16, x16, x25\n\t"
4936 "adcs x17, x17, x26\n\t"
4937 "adc x19, x19, xzr\n\t"
4938 /* A[3] * B[1] */
4939 "mul x25, x11, x22\n\t"
4940 "umulh x26, x11, x22\n\t"
4941 "adds x16, x16, x25\n\t"
4942 "adcs x17, x17, x26\n\t"
4943 "adc x19, x19, xzr\n\t"
4944 /* A[2] * B[3] */
4945 "mul x25, x10, x24\n\t"
4946 "umulh x26, x10, x24\n\t"
4947 "adds x17, x17, x25\n\t"
4948 "adcs x19, x19, x26\n\t"
4949 "adc x20, xzr, xzr\n\t"
4950 /* A[3] * B[2] */
4951 "mul x25, x11, x23\n\t"
4952 "umulh x26, x11, x23\n\t"
4953 "adds x17, x17, x25\n\t"
4954 "adcs x19, x19, x26\n\t"
4955 "adc x20, x20, xzr\n\t"
4956 /* A[3] * B[3] */
4957 "mul x25, x11, x24\n\t"
4958 "umulh x26, x11, x24\n\t"
4959 "adds x19, x19, x25\n\t"
4960 "adc x20, x20, x26\n\t"
4961 /* Reduce */
4962 /* Move top half into t4-t7 and remove top bit from t3 */
4963 "extr x20, x20, x19, #63\n\t"
4964 "extr x19, x19, x17, #63\n\t"
4965 "extr x17, x17, x16, #63\n\t"
4966 "extr x16, x16, x7, #63\n\t"
4967 "and x7, x7, #0x7fffffffffffffff\n\t"
4968 /* Multiply top half by 19 */
4969 "mov x25, #19\n\t"
4970 "mul x26, x25, x16\n\t"
4971 "umulh x16, x25, x16\n\t"
4972 "adds x4, x4, x26\n\t"
4973 "mul x26, x25, x17\n\t"
4974 "umulh x17, x25, x17\n\t"
4975 "adcs x5, x5, x26\n\t"
4976 "mul x26, x25, x19\n\t"
4977 "umulh x19, x25, x19\n\t"
4978 "adcs x6, x6, x26\n\t"
4979 "mul x26, x25, x20\n\t"
4980 "umulh x27, x25, x20\n\t"
4981 "adcs x7, x7, x26\n\t"
4982 "adc x27, x27, xzr\n\t"
4983 /* Add remaining product results in */
4984 "adds x5, x5, x16\n\t"
4985 "adcs x6, x6, x17\n\t"
4986 "adcs x7, x7, x19\n\t"
4987 "adc x27, x27, xzr\n\t"
4988 /* Overflow */
4989 "extr x27, x27, x7, #63\n\t"
4990 "mul x27, x27, x25\n\t"
4991 "and x7, x7, #0x7fffffffffffffff\n\t"
4992 "adds x4, x4, x27\n\t"
4993 "adcs x5, x5, xzr\n\t"
4994 "adcs x6, x6, xzr\n\t"
4995 "adc x7, x7, xzr\n\t"
4996 /* Reduce if top bit set */
4997 "and x27, x25, x7, asr 63\n\t"
4998 "and x7, x7, #0x7fffffffffffffff\n\t"
4999 "adds x4, x4, x27\n\t"
5000 "adcs x5, x5, xzr\n\t"
5001 "adcs x6, x6, xzr\n\t"
5002 "adc x7, x7, xzr\n\t"
5003 /* Store */
/* x0 = ry, x1 = rx; ry = sum of the two products, rx = their difference */
5004 "ldr x0, [x29, #24]\n\t"
5005 "ldr x1, [x29, #16]\n\t"
5006 /* Add */
5007 "adds x8, x12, x4\n\t"
5008 "adcs x9, x13, x5\n\t"
5009 "adcs x10, x14, x6\n\t"
5010 "adc x11, x15, x7\n\t"
5011 "mov x25, #-19\n\t"
5012 "asr x28, x11, #63\n\t"
5013 /* Mask the modulus */
5014 "and x25, x28, x25\n\t"
5015 "and x26, x28, #0x7fffffffffffffff\n\t"
5016 /* Sub modulus (if overflow) */
5017 "subs x8, x8, x25\n\t"
5018 "sbcs x9, x9, x28\n\t"
5019 "sbcs x10, x10, x28\n\t"
5020 "sbc x11, x11, x26\n\t"
5021 /* Sub */
5022 "subs x16, x12, x4\n\t"
5023 "sbcs x17, x13, x5\n\t"
5024 "sbcs x19, x14, x6\n\t"
5025 "sbcs x20, x15, x7\n\t"
5026 "mov x25, #-19\n\t"
5027 "csetm x28, cc\n\t"
5028 /* Mask the modulus */
5029 "and x25, x28, x25\n\t"
5030 "and x26, x28, #0x7fffffffffffffff\n\t"
5031 /* Add modulus (if underflow) */
5032 "adds x16, x16, x25\n\t"
5033 "adcs x17, x17, x28\n\t"
5034 "adcs x19, x19, x28\n\t"
5035 "adc x20, x20, x26\n\t"
5036 "stp x8, x9, [x0]\n\t"
5037 "stp x10, x11, [x0, #16]\n\t"
5038 "stp x16, x17, [x1]\n\t"
5039 "stp x19, x20, [x1, #16]\n\t"
/* x1 = qxy2d, x3 = pt; x4-x7 = qxy2d * pt, reduced */
5040 "ldr x0, [x29, #40]\n\t"
5041 "ldr x1, [sp, #88]\n\t"
5042 "ldr x3, [x29, #72]\n\t"
5043 /* Multiply */
5044 "ldp x16, x17, [x1]\n\t"
5045 "ldp x19, x20, [x1, #16]\n\t"
5046 "ldp x21, x22, [x3]\n\t"
5047 "ldp x23, x24, [x3, #16]\n\t"
5048 /* A[0] * B[0] */
5049 "mul x4, x16, x21\n\t"
5050 "umulh x5, x16, x21\n\t"
5051 /* A[0] * B[1] */
5052 "mul x25, x16, x22\n\t"
5053 "umulh x6, x16, x22\n\t"
5054 "adds x5, x5, x25\n\t"
5055 "adc x6, x6, xzr\n\t"
5056 /* A[1] * B[0] */
5057 "mul x25, x17, x21\n\t"
5058 "umulh x26, x17, x21\n\t"
5059 "adds x5, x5, x25\n\t"
5060 "adcs x6, x6, x26\n\t"
5061 "adc x7, xzr, xzr\n\t"
5062 /* A[0] * B[2] */
5063 "mul x25, x16, x23\n\t"
5064 "umulh x26, x16, x23\n\t"
5065 "adds x6, x6, x25\n\t"
5066 "adc x7, x7, x26\n\t"
5067 /* A[1] * B[1] */
5068 "mul x25, x17, x22\n\t"
5069 "umulh x26, x17, x22\n\t"
5070 "adds x6, x6, x25\n\t"
5071 "adcs x7, x7, x26\n\t"
5072 "adc x8, xzr, xzr\n\t"
5073 /* A[2] * B[0] */
5074 "mul x25, x19, x21\n\t"
5075 "umulh x26, x19, x21\n\t"
5076 "adds x6, x6, x25\n\t"
5077 "adcs x7, x7, x26\n\t"
5078 "adc x8, x8, xzr\n\t"
5079 /* A[0] * B[3] */
5080 "mul x25, x16, x24\n\t"
5081 "umulh x26, x16, x24\n\t"
5082 "adds x7, x7, x25\n\t"
5083 "adcs x8, x8, x26\n\t"
5084 "adc x9, xzr, xzr\n\t"
5085 /* A[1] * B[2] */
5086 "mul x25, x17, x23\n\t"
5087 "umulh x26, x17, x23\n\t"
5088 "adds x7, x7, x25\n\t"
5089 "adcs x8, x8, x26\n\t"
5090 "adc x9, x9, xzr\n\t"
5091 /* A[2] * B[1] */
5092 "mul x25, x19, x22\n\t"
5093 "umulh x26, x19, x22\n\t"
5094 "adds x7, x7, x25\n\t"
5095 "adcs x8, x8, x26\n\t"
5096 "adc x9, x9, xzr\n\t"
5097 /* A[3] * B[0] */
5098 "mul x25, x20, x21\n\t"
5099 "umulh x26, x20, x21\n\t"
5100 "adds x7, x7, x25\n\t"
5101 "adcs x8, x8, x26\n\t"
5102 "adc x9, x9, xzr\n\t"
5103 /* A[1] * B[3] */
5104 "mul x25, x17, x24\n\t"
5105 "umulh x26, x17, x24\n\t"
5106 "adds x8, x8, x25\n\t"
5107 "adcs x9, x9, x26\n\t"
5108 "adc x10, xzr, xzr\n\t"
5109 /* A[2] * B[2] */
5110 "mul x25, x19, x23\n\t"
5111 "umulh x26, x19, x23\n\t"
5112 "adds x8, x8, x25\n\t"
5113 "adcs x9, x9, x26\n\t"
5114 "adc x10, x10, xzr\n\t"
5115 /* A[3] * B[1] */
5116 "mul x25, x20, x22\n\t"
5117 "umulh x26, x20, x22\n\t"
5118 "adds x8, x8, x25\n\t"
5119 "adcs x9, x9, x26\n\t"
5120 "adc x10, x10, xzr\n\t"
5121 /* A[2] * B[3] */
5122 "mul x25, x19, x24\n\t"
5123 "umulh x26, x19, x24\n\t"
5124 "adds x9, x9, x25\n\t"
5125 "adcs x10, x10, x26\n\t"
5126 "adc x11, xzr, xzr\n\t"
5127 /* A[3] * B[2] */
5128 "mul x25, x20, x23\n\t"
5129 "umulh x26, x20, x23\n\t"
5130 "adds x9, x9, x25\n\t"
5131 "adcs x10, x10, x26\n\t"
5132 "adc x11, x11, xzr\n\t"
5133 /* A[3] * B[3] */
5134 "mul x25, x20, x24\n\t"
5135 "umulh x26, x20, x24\n\t"
5136 "adds x10, x10, x25\n\t"
5137 "adc x11, x11, x26\n\t"
5138 /* Reduce */
5139 /* Move top half into t4-t7 and remove top bit from t3 */
5140 "extr x11, x11, x10, #63\n\t"
5141 "extr x10, x10, x9, #63\n\t"
5142 "extr x9, x9, x8, #63\n\t"
5143 "extr x8, x8, x7, #63\n\t"
5144 "and x7, x7, #0x7fffffffffffffff\n\t"
5145 /* Multiply top half by 19 */
5146 "mov x25, #19\n\t"
5147 "mul x26, x25, x8\n\t"
5148 "umulh x8, x25, x8\n\t"
5149 "adds x4, x4, x26\n\t"
5150 "mul x26, x25, x9\n\t"
5151 "umulh x9, x25, x9\n\t"
5152 "adcs x5, x5, x26\n\t"
5153 "mul x26, x25, x10\n\t"
5154 "umulh x10, x25, x10\n\t"
5155 "adcs x6, x6, x26\n\t"
5156 "mul x26, x25, x11\n\t"
5157 "umulh x27, x25, x11\n\t"
5158 "adcs x7, x7, x26\n\t"
5159 "adc x27, x27, xzr\n\t"
5160 /* Add remaining product results in */
5161 "adds x5, x5, x8\n\t"
5162 "adcs x6, x6, x9\n\t"
5163 "adcs x7, x7, x10\n\t"
5164 "adc x27, x27, xzr\n\t"
5165 /* Overflow */
5166 "extr x27, x27, x7, #63\n\t"
5167 "mul x27, x27, x25\n\t"
5168 "and x7, x7, #0x7fffffffffffffff\n\t"
5169 "adds x4, x4, x27\n\t"
5170 "adcs x5, x5, xzr\n\t"
5171 "adcs x6, x6, xzr\n\t"
5172 "adc x7, x7, xzr\n\t"
5173 /* Reduce if top bit set */
5174 "and x27, x25, x7, asr 63\n\t"
5175 "and x7, x7, #0x7fffffffffffffff\n\t"
5176 "adds x4, x4, x27\n\t"
5177 "adcs x5, x5, xzr\n\t"
5178 "adcs x6, x6, xzr\n\t"
5179 "adc x7, x7, xzr\n\t"
5180 /* Store */
/* x0 = rz, x1 = pz; x8-x11 = 2 * pz */
5181 "ldr x0, [x29, #32]\n\t"
5182 "ldr x1, [x29, #64]\n\t"
5183 /* Double */
5184 "ldp x8, x9, [x1]\n\t"
5185 "ldp x10, x11, [x1, #16]\n\t"
5186 "adds x8, x8, x8\n\t"
5187 "adcs x9, x9, x9\n\t"
5188 "adcs x10, x10, x10\n\t"
5189 "adc x11, x11, x11\n\t"
5190 "mov x25, #-19\n\t"
5191 "asr x28, x11, #63\n\t"
5192 /* Mask the modulus */
5193 "and x25, x28, x25\n\t"
5194 "and x26, x28, #0x7fffffffffffffff\n\t"
5195 /* Sub modulus (if overflow) */
5196 "subs x8, x8, x25\n\t"
5197 "sbcs x9, x9, x28\n\t"
5198 "sbcs x10, x10, x28\n\t"
5199 "sbc x11, x11, x26\n\t"
/* x1 = rt; rz = 2*pz + qxy2d*pt, rt = 2*pz - qxy2d*pt */
5200 "ldr x1, [x29, #40]\n\t"
5201 /* Add */
5202 "adds x12, x8, x4\n\t"
5203 "adcs x13, x9, x5\n\t"
5204 "adcs x14, x10, x6\n\t"
5205 "adc x15, x11, x7\n\t"
5206 "mov x25, #-19\n\t"
5207 "asr x28, x15, #63\n\t"
5208 /* Mask the modulus */
5209 "and x25, x28, x25\n\t"
5210 "and x26, x28, #0x7fffffffffffffff\n\t"
5211 /* Sub modulus (if overflow) */
5212 "subs x12, x12, x25\n\t"
5213 "sbcs x13, x13, x28\n\t"
5214 "sbcs x14, x14, x28\n\t"
5215 "sbc x15, x15, x26\n\t"
5216 /* Sub */
5217 "subs x16, x8, x4\n\t"
5218 "sbcs x17, x9, x5\n\t"
5219 "sbcs x19, x10, x6\n\t"
5220 "sbcs x20, x11, x7\n\t"
5221 "mov x25, #-19\n\t"
5222 "csetm x28, cc\n\t"
5223 /* Mask the modulus */
5224 "and x25, x28, x25\n\t"
5225 "and x26, x28, #0x7fffffffffffffff\n\t"
5226 /* Add modulus (if underflow) */
5227 "adds x16, x16, x25\n\t"
5228 "adcs x17, x17, x28\n\t"
5229 "adcs x19, x19, x28\n\t"
5230 "adc x20, x20, x26\n\t"
5231 "stp x12, x13, [x0]\n\t"
5232 "stp x14, x15, [x0, #16]\n\t"
5233 "stp x16, x17, [x1]\n\t"
5234 "stp x19, x20, [x1, #16]\n\t"
/* Restore x29/x30 and release the 112-byte frame */
5235 "ldp x29, x30, [sp], #0x70\n\t"
5236 : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt), [qxy2d] "+r" (qxy2d), [qyplusx] "+r" (qyplusx), [qyminusx] "+r" (qyminusx)
5237 :
/* "cc": the adds/adcs/subs/sbcs instructions above overwrite the condition flags */
5238 : "memory", "cc", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
5239 );
5240 }
5241
fe_ge_msub(fe rx,fe ry,fe rz,fe rt,const fe px,const fe py,const fe pz,const fe pt,const fe qxy2d,const fe qyplusx,const fe qyminusx)5242 void fe_ge_msub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qxy2d, const fe qyplusx, const fe qyminusx)
5243 {
5244 __asm__ __volatile__ (
5245 "stp x29, x30, [sp, #-112]!\n\t"
5246 "add x29, sp, #0\n\t"
5247 "str %x[qyminusx], [sp, #104]\n\t"
5248 "str %x[qyplusx], [sp, #96]\n\t"
5249 "str %x[qxy2d], [sp, #88]\n\t"
5250 "str %x[rx], [x29, #16]\n\t"
5251 "str %x[ry], [x29, #24]\n\t"
5252 "str %x[rz], [x29, #32]\n\t"
5253 "str %x[rt], [x29, #40]\n\t"
5254 "str %x[px], [x29, #48]\n\t"
5255 "str %x[py], [x29, #56]\n\t"
5256 "str %x[pz], [x29, #64]\n\t"
5257 "str %x[pt], [x29, #72]\n\t"
5258 "ldr x2, [x29, #56]\n\t"
5259 "ldr x3, [x29, #48]\n\t"
5260 /* Add */
5261 "ldp x12, x13, [x2]\n\t"
5262 "ldp x14, x15, [x2, #16]\n\t"
5263 "ldp x16, x17, [x3]\n\t"
5264 "ldp x19, x20, [x3, #16]\n\t"
5265 "adds x4, x12, x16\n\t"
5266 "adcs x5, x13, x17\n\t"
5267 "adcs x6, x14, x19\n\t"
5268 "adc x7, x15, x20\n\t"
5269 "mov x25, #-19\n\t"
5270 "asr x28, x7, #63\n\t"
5271 /* Mask the modulus */
5272 "and x25, x28, x25\n\t"
5273 "and x26, x28, #0x7fffffffffffffff\n\t"
5274 /* Sub modulus (if overflow) */
5275 "subs x4, x4, x25\n\t"
5276 "sbcs x5, x5, x28\n\t"
5277 "sbcs x6, x6, x28\n\t"
5278 "sbc x7, x7, x26\n\t"
5279 /* Sub */
5280 "subs x8, x12, x16\n\t"
5281 "sbcs x9, x13, x17\n\t"
5282 "sbcs x10, x14, x19\n\t"
5283 "sbcs x11, x15, x20\n\t"
5284 "mov x25, #-19\n\t"
5285 "csetm x28, cc\n\t"
5286 /* Mask the modulus */
5287 "and x25, x28, x25\n\t"
5288 "and x26, x28, #0x7fffffffffffffff\n\t"
5289 /* Add modulus (if underflow) */
5290 "adds x8, x8, x25\n\t"
5291 "adcs x9, x9, x28\n\t"
5292 "adcs x10, x10, x28\n\t"
5293 "adc x11, x11, x26\n\t"
5294 "ldr x0, [x29, #32]\n\t"
5295 "ldr x2, [sp, #104]\n\t"
5296 /* Multiply */
5297 "ldp x21, x22, [x2]\n\t"
5298 "ldp x23, x24, [x2, #16]\n\t"
5299 /* A[0] * B[0] */
5300 "mul x12, x4, x21\n\t"
5301 "umulh x13, x4, x21\n\t"
5302 /* A[0] * B[1] */
5303 "mul x25, x4, x22\n\t"
5304 "umulh x14, x4, x22\n\t"
5305 "adds x13, x13, x25\n\t"
5306 "adc x14, x14, xzr\n\t"
5307 /* A[1] * B[0] */
5308 "mul x25, x5, x21\n\t"
5309 "umulh x26, x5, x21\n\t"
5310 "adds x13, x13, x25\n\t"
5311 "adcs x14, x14, x26\n\t"
5312 "adc x15, xzr, xzr\n\t"
5313 /* A[0] * B[2] */
5314 "mul x25, x4, x23\n\t"
5315 "umulh x26, x4, x23\n\t"
5316 "adds x14, x14, x25\n\t"
5317 "adc x15, x15, x26\n\t"
5318 /* A[1] * B[1] */
5319 "mul x25, x5, x22\n\t"
5320 "umulh x26, x5, x22\n\t"
5321 "adds x14, x14, x25\n\t"
5322 "adcs x15, x15, x26\n\t"
5323 "adc x16, xzr, xzr\n\t"
5324 /* A[2] * B[0] */
5325 "mul x25, x6, x21\n\t"
5326 "umulh x26, x6, x21\n\t"
5327 "adds x14, x14, x25\n\t"
5328 "adcs x15, x15, x26\n\t"
5329 "adc x16, x16, xzr\n\t"
5330 /* A[0] * B[3] */
5331 "mul x25, x4, x24\n\t"
5332 "umulh x26, x4, x24\n\t"
5333 "adds x15, x15, x25\n\t"
5334 "adcs x16, x16, x26\n\t"
5335 "adc x17, xzr, xzr\n\t"
5336 /* A[1] * B[2] */
5337 "mul x25, x5, x23\n\t"
5338 "umulh x26, x5, x23\n\t"
5339 "adds x15, x15, x25\n\t"
5340 "adcs x16, x16, x26\n\t"
5341 "adc x17, x17, xzr\n\t"
5342 /* A[2] * B[1] */
5343 "mul x25, x6, x22\n\t"
5344 "umulh x26, x6, x22\n\t"
5345 "adds x15, x15, x25\n\t"
5346 "adcs x16, x16, x26\n\t"
5347 "adc x17, x17, xzr\n\t"
5348 /* A[3] * B[0] */
5349 "mul x25, x7, x21\n\t"
5350 "umulh x26, x7, x21\n\t"
5351 "adds x15, x15, x25\n\t"
5352 "adcs x16, x16, x26\n\t"
5353 "adc x17, x17, xzr\n\t"
5354 /* A[1] * B[3] */
5355 "mul x25, x5, x24\n\t"
5356 "umulh x26, x5, x24\n\t"
5357 "adds x16, x16, x25\n\t"
5358 "adcs x17, x17, x26\n\t"
5359 "adc x19, xzr, xzr\n\t"
5360 /* A[2] * B[2] */
5361 "mul x25, x6, x23\n\t"
5362 "umulh x26, x6, x23\n\t"
5363 "adds x16, x16, x25\n\t"
5364 "adcs x17, x17, x26\n\t"
5365 "adc x19, x19, xzr\n\t"
5366 /* A[3] * B[1] */
5367 "mul x25, x7, x22\n\t"
5368 "umulh x26, x7, x22\n\t"
5369 "adds x16, x16, x25\n\t"
5370 "adcs x17, x17, x26\n\t"
5371 "adc x19, x19, xzr\n\t"
5372 /* A[2] * B[3] */
5373 "mul x25, x6, x24\n\t"
5374 "umulh x26, x6, x24\n\t"
5375 "adds x17, x17, x25\n\t"
5376 "adcs x19, x19, x26\n\t"
5377 "adc x20, xzr, xzr\n\t"
5378 /* A[3] * B[2] */
5379 "mul x25, x7, x23\n\t"
5380 "umulh x26, x7, x23\n\t"
5381 "adds x17, x17, x25\n\t"
5382 "adcs x19, x19, x26\n\t"
5383 "adc x20, x20, xzr\n\t"
5384 /* A[3] * B[3] */
5385 "mul x25, x7, x24\n\t"
5386 "umulh x26, x7, x24\n\t"
5387 "adds x19, x19, x25\n\t"
5388 "adc x20, x20, x26\n\t"
5389 /* Reduce */
5390 /* Move top half into t4-t7 and remove top bit from t3 */
5391 "extr x20, x20, x19, #63\n\t"
5392 "extr x19, x19, x17, #63\n\t"
5393 "extr x17, x17, x16, #63\n\t"
5394 "extr x16, x16, x15, #63\n\t"
5395 "and x15, x15, #0x7fffffffffffffff\n\t"
5396 /* Multiply top half by 19 */
5397 "mov x25, #19\n\t"
5398 "mul x26, x25, x16\n\t"
5399 "umulh x16, x25, x16\n\t"
5400 "adds x12, x12, x26\n\t"
5401 "mul x26, x25, x17\n\t"
5402 "umulh x17, x25, x17\n\t"
5403 "adcs x13, x13, x26\n\t"
5404 "mul x26, x25, x19\n\t"
5405 "umulh x19, x25, x19\n\t"
5406 "adcs x14, x14, x26\n\t"
5407 "mul x26, x25, x20\n\t"
5408 "umulh x27, x25, x20\n\t"
5409 "adcs x15, x15, x26\n\t"
5410 "adc x27, x27, xzr\n\t"
5411 /* Add remaining product results in */
5412 "adds x13, x13, x16\n\t"
5413 "adcs x14, x14, x17\n\t"
5414 "adcs x15, x15, x19\n\t"
5415 "adc x27, x27, xzr\n\t"
5416 /* Overflow */
5417 "extr x27, x27, x15, #63\n\t"
5418 "mul x27, x27, x25\n\t"
5419 "and x15, x15, #0x7fffffffffffffff\n\t"
5420 "adds x12, x12, x27\n\t"
5421 "adcs x13, x13, xzr\n\t"
5422 "adcs x14, x14, xzr\n\t"
5423 "adc x15, x15, xzr\n\t"
5424 /* Reduce if top bit set */
5425 "and x27, x25, x15, asr 63\n\t"
5426 "and x15, x15, #0x7fffffffffffffff\n\t"
5427 "adds x12, x12, x27\n\t"
5428 "adcs x13, x13, xzr\n\t"
5429 "adcs x14, x14, xzr\n\t"
5430 "adc x15, x15, xzr\n\t"
5431 /* Store */
5432 "ldr x0, [x29, #24]\n\t"
5433 "ldr x1, [sp, #96]\n\t"
5434 /* Multiply */
5435 "ldp x21, x22, [x1]\n\t"
5436 "ldp x23, x24, [x1, #16]\n\t"
5437 /* A[0] * B[0] */
5438 "mul x4, x8, x21\n\t"
5439 "umulh x5, x8, x21\n\t"
5440 /* A[0] * B[1] */
5441 "mul x25, x8, x22\n\t"
5442 "umulh x6, x8, x22\n\t"
5443 "adds x5, x5, x25\n\t"
5444 "adc x6, x6, xzr\n\t"
5445 /* A[1] * B[0] */
5446 "mul x25, x9, x21\n\t"
5447 "umulh x26, x9, x21\n\t"
5448 "adds x5, x5, x25\n\t"
5449 "adcs x6, x6, x26\n\t"
5450 "adc x7, xzr, xzr\n\t"
5451 /* A[0] * B[2] */
5452 "mul x25, x8, x23\n\t"
5453 "umulh x26, x8, x23\n\t"
5454 "adds x6, x6, x25\n\t"
5455 "adc x7, x7, x26\n\t"
5456 /* A[1] * B[1] */
5457 "mul x25, x9, x22\n\t"
5458 "umulh x26, x9, x22\n\t"
5459 "adds x6, x6, x25\n\t"
5460 "adcs x7, x7, x26\n\t"
5461 "adc x16, xzr, xzr\n\t"
5462 /* A[2] * B[0] */
5463 "mul x25, x10, x21\n\t"
5464 "umulh x26, x10, x21\n\t"
5465 "adds x6, x6, x25\n\t"
5466 "adcs x7, x7, x26\n\t"
5467 "adc x16, x16, xzr\n\t"
5468 /* A[0] * B[3] */
5469 "mul x25, x8, x24\n\t"
5470 "umulh x26, x8, x24\n\t"
5471 "adds x7, x7, x25\n\t"
5472 "adcs x16, x16, x26\n\t"
5473 "adc x17, xzr, xzr\n\t"
5474 /* A[1] * B[2] */
5475 "mul x25, x9, x23\n\t"
5476 "umulh x26, x9, x23\n\t"
5477 "adds x7, x7, x25\n\t"
5478 "adcs x16, x16, x26\n\t"
5479 "adc x17, x17, xzr\n\t"
5480 /* A[2] * B[1] */
5481 "mul x25, x10, x22\n\t"
5482 "umulh x26, x10, x22\n\t"
5483 "adds x7, x7, x25\n\t"
5484 "adcs x16, x16, x26\n\t"
5485 "adc x17, x17, xzr\n\t"
5486 /* A[3] * B[0] */
5487 "mul x25, x11, x21\n\t"
5488 "umulh x26, x11, x21\n\t"
5489 "adds x7, x7, x25\n\t"
5490 "adcs x16, x16, x26\n\t"
5491 "adc x17, x17, xzr\n\t"
5492 /* A[1] * B[3] */
5493 "mul x25, x9, x24\n\t"
5494 "umulh x26, x9, x24\n\t"
5495 "adds x16, x16, x25\n\t"
5496 "adcs x17, x17, x26\n\t"
5497 "adc x19, xzr, xzr\n\t"
5498 /* A[2] * B[2] */
5499 "mul x25, x10, x23\n\t"
5500 "umulh x26, x10, x23\n\t"
5501 "adds x16, x16, x25\n\t"
5502 "adcs x17, x17, x26\n\t"
5503 "adc x19, x19, xzr\n\t"
5504 /* A[3] * B[1] */
5505 "mul x25, x11, x22\n\t"
5506 "umulh x26, x11, x22\n\t"
5507 "adds x16, x16, x25\n\t"
5508 "adcs x17, x17, x26\n\t"
5509 "adc x19, x19, xzr\n\t"
5510 /* A[2] * B[3] */
5511 "mul x25, x10, x24\n\t"
5512 "umulh x26, x10, x24\n\t"
5513 "adds x17, x17, x25\n\t"
5514 "adcs x19, x19, x26\n\t"
5515 "adc x20, xzr, xzr\n\t"
5516 /* A[3] * B[2] */
5517 "mul x25, x11, x23\n\t"
5518 "umulh x26, x11, x23\n\t"
5519 "adds x17, x17, x25\n\t"
5520 "adcs x19, x19, x26\n\t"
5521 "adc x20, x20, xzr\n\t"
5522 /* A[3] * B[3] */
5523 "mul x25, x11, x24\n\t"
5524 "umulh x26, x11, x24\n\t"
5525 "adds x19, x19, x25\n\t"
5526 "adc x20, x20, x26\n\t"
5527 /* Reduce */
5528 /* Move top half into t4-t7 and remove top bit from t3 */
5529 "extr x20, x20, x19, #63\n\t"
5530 "extr x19, x19, x17, #63\n\t"
5531 "extr x17, x17, x16, #63\n\t"
5532 "extr x16, x16, x7, #63\n\t"
5533 "and x7, x7, #0x7fffffffffffffff\n\t"
5534 /* Multiply top half by 19 */
5535 "mov x25, #19\n\t"
5536 "mul x26, x25, x16\n\t"
5537 "umulh x16, x25, x16\n\t"
5538 "adds x4, x4, x26\n\t"
5539 "mul x26, x25, x17\n\t"
5540 "umulh x17, x25, x17\n\t"
5541 "adcs x5, x5, x26\n\t"
5542 "mul x26, x25, x19\n\t"
5543 "umulh x19, x25, x19\n\t"
5544 "adcs x6, x6, x26\n\t"
5545 "mul x26, x25, x20\n\t"
5546 "umulh x27, x25, x20\n\t"
5547 "adcs x7, x7, x26\n\t"
5548 "adc x27, x27, xzr\n\t"
5549 /* Add remaining product results in */
5550 "adds x5, x5, x16\n\t"
5551 "adcs x6, x6, x17\n\t"
5552 "adcs x7, x7, x19\n\t"
5553 "adc x27, x27, xzr\n\t"
5554 /* Overflow */
5555 "extr x27, x27, x7, #63\n\t"
5556 "mul x27, x27, x25\n\t"
5557 "and x7, x7, #0x7fffffffffffffff\n\t"
5558 "adds x4, x4, x27\n\t"
5559 "adcs x5, x5, xzr\n\t"
5560 "adcs x6, x6, xzr\n\t"
5561 "adc x7, x7, xzr\n\t"
5562 /* Reduce if top bit set */
5563 "and x27, x25, x7, asr 63\n\t"
5564 "and x7, x7, #0x7fffffffffffffff\n\t"
5565 "adds x4, x4, x27\n\t"
5566 "adcs x5, x5, xzr\n\t"
5567 "adcs x6, x6, xzr\n\t"
5568 "adc x7, x7, xzr\n\t"
5569 /* Store */
5570 "ldr x0, [x29, #24]\n\t"
5571 "ldr x1, [x29, #16]\n\t"
5572 /* Add */
5573 "adds x8, x12, x4\n\t"
5574 "adcs x9, x13, x5\n\t"
5575 "adcs x10, x14, x6\n\t"
5576 "adc x11, x15, x7\n\t"
5577 "mov x25, #-19\n\t"
5578 "asr x28, x11, #63\n\t"
5579 /* Mask the modulus */
5580 "and x25, x28, x25\n\t"
5581 "and x26, x28, #0x7fffffffffffffff\n\t"
5582 /* Sub modulus (if overflow) */
5583 "subs x8, x8, x25\n\t"
5584 "sbcs x9, x9, x28\n\t"
5585 "sbcs x10, x10, x28\n\t"
5586 "sbc x11, x11, x26\n\t"
5587 /* Sub */
5588 "subs x16, x12, x4\n\t"
5589 "sbcs x17, x13, x5\n\t"
5590 "sbcs x19, x14, x6\n\t"
5591 "sbcs x20, x15, x7\n\t"
5592 "mov x25, #-19\n\t"
5593 "csetm x28, cc\n\t"
5594 /* Mask the modulus */
5595 "and x25, x28, x25\n\t"
5596 "and x26, x28, #0x7fffffffffffffff\n\t"
5597 /* Add modulus (if underflow) */
5598 "adds x16, x16, x25\n\t"
5599 "adcs x17, x17, x28\n\t"
5600 "adcs x19, x19, x28\n\t"
5601 "adc x20, x20, x26\n\t"
5602 "stp x8, x9, [x0]\n\t"
5603 "stp x10, x11, [x0, #16]\n\t"
5604 "stp x16, x17, [x1]\n\t"
5605 "stp x19, x20, [x1, #16]\n\t"
5606 "ldr x0, [x29, #40]\n\t"
5607 "ldr x1, [sp, #88]\n\t"
5608 "ldr x3, [x29, #72]\n\t"
5609 /* Multiply */
5610 "ldp x16, x17, [x1]\n\t"
5611 "ldp x19, x20, [x1, #16]\n\t"
5612 "ldp x21, x22, [x3]\n\t"
5613 "ldp x23, x24, [x3, #16]\n\t"
5614 /* A[0] * B[0] */
5615 "mul x4, x16, x21\n\t"
5616 "umulh x5, x16, x21\n\t"
5617 /* A[0] * B[1] */
5618 "mul x25, x16, x22\n\t"
5619 "umulh x6, x16, x22\n\t"
5620 "adds x5, x5, x25\n\t"
5621 "adc x6, x6, xzr\n\t"
5622 /* A[1] * B[0] */
5623 "mul x25, x17, x21\n\t"
5624 "umulh x26, x17, x21\n\t"
5625 "adds x5, x5, x25\n\t"
5626 "adcs x6, x6, x26\n\t"
5627 "adc x7, xzr, xzr\n\t"
5628 /* A[0] * B[2] */
5629 "mul x25, x16, x23\n\t"
5630 "umulh x26, x16, x23\n\t"
5631 "adds x6, x6, x25\n\t"
5632 "adc x7, x7, x26\n\t"
5633 /* A[1] * B[1] */
5634 "mul x25, x17, x22\n\t"
5635 "umulh x26, x17, x22\n\t"
5636 "adds x6, x6, x25\n\t"
5637 "adcs x7, x7, x26\n\t"
5638 "adc x8, xzr, xzr\n\t"
5639 /* A[2] * B[0] */
5640 "mul x25, x19, x21\n\t"
5641 "umulh x26, x19, x21\n\t"
5642 "adds x6, x6, x25\n\t"
5643 "adcs x7, x7, x26\n\t"
5644 "adc x8, x8, xzr\n\t"
5645 /* A[0] * B[3] */
5646 "mul x25, x16, x24\n\t"
5647 "umulh x26, x16, x24\n\t"
5648 "adds x7, x7, x25\n\t"
5649 "adcs x8, x8, x26\n\t"
5650 "adc x9, xzr, xzr\n\t"
5651 /* A[1] * B[2] */
5652 "mul x25, x17, x23\n\t"
5653 "umulh x26, x17, x23\n\t"
5654 "adds x7, x7, x25\n\t"
5655 "adcs x8, x8, x26\n\t"
5656 "adc x9, x9, xzr\n\t"
5657 /* A[2] * B[1] */
5658 "mul x25, x19, x22\n\t"
5659 "umulh x26, x19, x22\n\t"
5660 "adds x7, x7, x25\n\t"
5661 "adcs x8, x8, x26\n\t"
5662 "adc x9, x9, xzr\n\t"
5663 /* A[3] * B[0] */
5664 "mul x25, x20, x21\n\t"
5665 "umulh x26, x20, x21\n\t"
5666 "adds x7, x7, x25\n\t"
5667 "adcs x8, x8, x26\n\t"
5668 "adc x9, x9, xzr\n\t"
5669 /* A[1] * B[3] */
5670 "mul x25, x17, x24\n\t"
5671 "umulh x26, x17, x24\n\t"
5672 "adds x8, x8, x25\n\t"
5673 "adcs x9, x9, x26\n\t"
5674 "adc x10, xzr, xzr\n\t"
5675 /* A[2] * B[2] */
5676 "mul x25, x19, x23\n\t"
5677 "umulh x26, x19, x23\n\t"
5678 "adds x8, x8, x25\n\t"
5679 "adcs x9, x9, x26\n\t"
5680 "adc x10, x10, xzr\n\t"
5681 /* A[3] * B[1] */
5682 "mul x25, x20, x22\n\t"
5683 "umulh x26, x20, x22\n\t"
5684 "adds x8, x8, x25\n\t"
5685 "adcs x9, x9, x26\n\t"
5686 "adc x10, x10, xzr\n\t"
5687 /* A[2] * B[3] */
5688 "mul x25, x19, x24\n\t"
5689 "umulh x26, x19, x24\n\t"
5690 "adds x9, x9, x25\n\t"
5691 "adcs x10, x10, x26\n\t"
5692 "adc x11, xzr, xzr\n\t"
5693 /* A[3] * B[2] */
5694 "mul x25, x20, x23\n\t"
5695 "umulh x26, x20, x23\n\t"
5696 "adds x9, x9, x25\n\t"
5697 "adcs x10, x10, x26\n\t"
5698 "adc x11, x11, xzr\n\t"
5699 /* A[3] * B[3] */
5700 "mul x25, x20, x24\n\t"
5701 "umulh x26, x20, x24\n\t"
5702 "adds x10, x10, x25\n\t"
5703 "adc x11, x11, x26\n\t"
5704 /* Reduce */
5705 /* Move top half into t4-t7 and remove top bit from t3 */
5706 "extr x11, x11, x10, #63\n\t"
5707 "extr x10, x10, x9, #63\n\t"
5708 "extr x9, x9, x8, #63\n\t"
5709 "extr x8, x8, x7, #63\n\t"
5710 "and x7, x7, #0x7fffffffffffffff\n\t"
5711 /* Multiply top half by 19 */
5712 "mov x25, #19\n\t"
5713 "mul x26, x25, x8\n\t"
5714 "umulh x8, x25, x8\n\t"
5715 "adds x4, x4, x26\n\t"
5716 "mul x26, x25, x9\n\t"
5717 "umulh x9, x25, x9\n\t"
5718 "adcs x5, x5, x26\n\t"
5719 "mul x26, x25, x10\n\t"
5720 "umulh x10, x25, x10\n\t"
5721 "adcs x6, x6, x26\n\t"
5722 "mul x26, x25, x11\n\t"
5723 "umulh x27, x25, x11\n\t"
5724 "adcs x7, x7, x26\n\t"
5725 "adc x27, x27, xzr\n\t"
5726 /* Add remaining product results in */
5727 "adds x5, x5, x8\n\t"
5728 "adcs x6, x6, x9\n\t"
5729 "adcs x7, x7, x10\n\t"
5730 "adc x27, x27, xzr\n\t"
5731 /* Overflow */
5732 "extr x27, x27, x7, #63\n\t"
5733 "mul x27, x27, x25\n\t"
5734 "and x7, x7, #0x7fffffffffffffff\n\t"
5735 "adds x4, x4, x27\n\t"
5736 "adcs x5, x5, xzr\n\t"
5737 "adcs x6, x6, xzr\n\t"
5738 "adc x7, x7, xzr\n\t"
5739 /* Reduce if top bit set */
5740 "and x27, x25, x7, asr 63\n\t"
5741 "and x7, x7, #0x7fffffffffffffff\n\t"
5742 "adds x4, x4, x27\n\t"
5743 "adcs x5, x5, xzr\n\t"
5744 "adcs x6, x6, xzr\n\t"
5745 "adc x7, x7, xzr\n\t"
5746 /* Store */
5747 "ldr x0, [x29, #32]\n\t"
5748 "ldr x1, [x29, #64]\n\t"
5749 /* Double */
5750 "ldp x8, x9, [x1]\n\t"
5751 "ldp x10, x11, [x1, #16]\n\t"
5752 "adds x8, x8, x8\n\t"
5753 "adcs x9, x9, x9\n\t"
5754 "adcs x10, x10, x10\n\t"
5755 "adc x11, x11, x11\n\t"
5756 "mov x25, #-19\n\t"
5757 "asr x28, x11, #63\n\t"
5758 /* Mask the modulus */
5759 "and x25, x28, x25\n\t"
5760 "and x26, x28, #0x7fffffffffffffff\n\t"
5761 /* Sub modulus (if overflow) */
5762 "subs x8, x8, x25\n\t"
5763 "sbcs x9, x9, x28\n\t"
5764 "sbcs x10, x10, x28\n\t"
5765 "sbc x11, x11, x26\n\t"
5766 "ldr x1, [x29, #40]\n\t"
5767 /* Add */
5768 "adds x12, x8, x4\n\t"
5769 "adcs x13, x9, x5\n\t"
5770 "adcs x14, x10, x6\n\t"
5771 "adc x15, x11, x7\n\t"
5772 "mov x25, #-19\n\t"
5773 "asr x28, x15, #63\n\t"
5774 /* Mask the modulus */
5775 "and x25, x28, x25\n\t"
5776 "and x26, x28, #0x7fffffffffffffff\n\t"
5777 /* Sub modulus (if overflow) */
5778 "subs x12, x12, x25\n\t"
5779 "sbcs x13, x13, x28\n\t"
5780 "sbcs x14, x14, x28\n\t"
5781 "sbc x15, x15, x26\n\t"
5782 /* Sub */
5783 "subs x16, x8, x4\n\t"
5784 "sbcs x17, x9, x5\n\t"
5785 "sbcs x19, x10, x6\n\t"
5786 "sbcs x20, x11, x7\n\t"
5787 "mov x25, #-19\n\t"
5788 "csetm x28, cc\n\t"
5789 /* Mask the modulus */
5790 "and x25, x28, x25\n\t"
5791 "and x26, x28, #0x7fffffffffffffff\n\t"
5792 /* Add modulus (if underflow) */
5793 "adds x16, x16, x25\n\t"
5794 "adcs x17, x17, x28\n\t"
5795 "adcs x19, x19, x28\n\t"
5796 "adc x20, x20, x26\n\t"
5797 "stp x12, x13, [x1]\n\t"
5798 "stp x14, x15, [x1, #16]\n\t"
5799 "stp x16, x17, [x0]\n\t"
5800 "stp x19, x20, [x0, #16]\n\t"
5801 "ldp x29, x30, [sp], #0x70\n\t"
5802 : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt), [qxy2d] "+r" (qxy2d), [qyplusx] "+r" (qyplusx), [qyminusx] "+r" (qyminusx)
5803 :
5804 : "memory", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
5805 );
5806 }
5807
/* Combine the field elements of p and q into the four outputs rx, ry, rz, rt.
 *
 * Every fe is accessed as four 64-bit limbs, least significant first
 * (ldp at byte offsets 0 and 16). All arithmetic is modulo 2^255 - 19; the
 * modulus appears below as the limbs (-19, -1, -1, 0x7fffffffffffffff).
 *
 * With
 *   A = (py + px) * qyplusx
 *   B = (py - px) * qyminusx
 *   C = qt2d * pt
 *   D = 2 * (pz * qz)
 * the function stores
 *   rx = A - B,  ry = A + B,  rz = D + C,  rt = D - C
 * NOTE(review): this looks like the usual extended-coordinate point addition
 * step; confirm against the C reference implementation.
 *
 * rx    [out] A - B
 * ry    [out] A + B
 * rz    [out] D + C
 * rt    [out] D - C
 * px, py, pz, pt                 [in] field elements of the first operand
 * qz, qt2d, qyplusx, qyminusx    [in] field elements of the second operand
 *
 * Ordering: rx and ry are written before pz, qz, qt2d and pt are read.
 *
 * Stack frame (128 bytes, x29 == sp inside the asm):
 *   [x29+16]=rx  [x29+24]=ry  [x29+32]=rz  [x29+40]=rt
 *   [x29+48]=px  [x29+56]=py  [x29+64]=pz  [x29+72]=pt
 *   [sp+96]=qz   [sp+104]=qt2d  [sp+112]=qyplusx  [sp+120]=qyminusx
 *
 * Several "ldr x0, ..." instructions load a pointer that is overwritten
 * before it is used; they are kept as emitted.
 *
 * The asm sets and reads the condition flags, so "cc" is declared as a
 * clobber. x0-x11 are also overwritten but are not listed;
 * NOTE(review): this appears to rely on the twelve "+r" operands occupying
 * those registers - confirm before changing the operand list.
 */
void fe_ge_add(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qz, const fe qt2d, const fe qyplusx, const fe qyminusx)
{
    __asm__ __volatile__ (
        /* Open a 128-byte frame and spill every pointer argument into it */
        "stp x29, x30, [sp, #-128]!\n\t"
        "add x29, sp, #0\n\t"
        "str %x[qyminusx], [sp, #120]\n\t"
        "str %x[qyplusx], [sp, #112]\n\t"
        "str %x[qt2d], [sp, #104]\n\t"
        "str %x[qz], [sp, #96]\n\t"
        "str %x[rx], [x29, #16]\n\t"
        "str %x[ry], [x29, #24]\n\t"
        "str %x[rz], [x29, #32]\n\t"
        "str %x[rt], [x29, #40]\n\t"
        "str %x[px], [x29, #48]\n\t"
        "str %x[py], [x29, #56]\n\t"
        "str %x[pz], [x29, #64]\n\t"
        "str %x[pt], [x29, #72]\n\t"
        /* x2 = py, x3 = px */
        "ldr x2, [x29, #56]\n\t"
        "ldr x3, [x29, #48]\n\t"
        /* Add: x4..x7 = py + px */
        "ldp x12, x13, [x2]\n\t"
        "ldp x14, x15, [x2, #16]\n\t"
        "ldp x16, x17, [x3]\n\t"
        "ldp x19, x20, [x3, #16]\n\t"
        "adds x4, x12, x16\n\t"
        "adcs x5, x13, x17\n\t"
        "adcs x6, x14, x19\n\t"
        "adc x7, x15, x20\n\t"
        "mov x25, #-19\n\t"
        /* x28 = all ones when the top bit of the sum is set, else zero */
        "asr x28, x7, #63\n\t"
        /* Mask the modulus */
        "and x25, x28, x25\n\t"
        "and x26, x28, #0x7fffffffffffffff\n\t"
        /* Sub modulus (if overflow) */
        "subs x4, x4, x25\n\t"
        "sbcs x5, x5, x28\n\t"
        "sbcs x6, x6, x28\n\t"
        "sbc x7, x7, x26\n\t"
        /* Sub: x8..x11 = py - px */
        "subs x8, x12, x16\n\t"
        "sbcs x9, x13, x17\n\t"
        "sbcs x10, x14, x19\n\t"
        "sbcs x11, x15, x20\n\t"
        "mov x25, #-19\n\t"
        /* x28 = all ones when the subtraction borrowed, else zero */
        "csetm x28, cc\n\t"
        /* Mask the modulus */
        "and x25, x28, x25\n\t"
        "and x26, x28, #0x7fffffffffffffff\n\t"
        /* Add modulus (if underflow) */
        "adds x8, x8, x25\n\t"
        "adcs x9, x9, x28\n\t"
        "adcs x10, x10, x28\n\t"
        "adc x11, x11, x26\n\t"
        /* x0 = rz (overwritten before use), x2 = qyplusx */
        "ldr x0, [x29, #32]\n\t"
        "ldr x2, [sp, #112]\n\t"
        /* Multiply: x12..x15 = A = (py + px) * qyplusx */
        "ldp x21, x22, [x2]\n\t"
        "ldp x23, x24, [x2, #16]\n\t"
        /*  A[0] * B[0] */
        "mul x12, x4, x21\n\t"
        "umulh x13, x4, x21\n\t"
        /*  A[0] * B[1] */
        "mul x25, x4, x22\n\t"
        "umulh x14, x4, x22\n\t"
        "adds x13, x13, x25\n\t"
        "adc x14, x14, xzr\n\t"
        /*  A[1] * B[0] */
        "mul x25, x5, x21\n\t"
        "umulh x26, x5, x21\n\t"
        "adds x13, x13, x25\n\t"
        "adcs x14, x14, x26\n\t"
        "adc x15, xzr, xzr\n\t"
        /*  A[0] * B[2] */
        "mul x25, x4, x23\n\t"
        "umulh x26, x4, x23\n\t"
        "adds x14, x14, x25\n\t"
        "adc x15, x15, x26\n\t"
        /*  A[1] * B[1] */
        "mul x25, x5, x22\n\t"
        "umulh x26, x5, x22\n\t"
        "adds x14, x14, x25\n\t"
        "adcs x15, x15, x26\n\t"
        "adc x16, xzr, xzr\n\t"
        /*  A[2] * B[0] */
        "mul x25, x6, x21\n\t"
        "umulh x26, x6, x21\n\t"
        "adds x14, x14, x25\n\t"
        "adcs x15, x15, x26\n\t"
        "adc x16, x16, xzr\n\t"
        /*  A[0] * B[3] */
        "mul x25, x4, x24\n\t"
        "umulh x26, x4, x24\n\t"
        "adds x15, x15, x25\n\t"
        "adcs x16, x16, x26\n\t"
        "adc x17, xzr, xzr\n\t"
        /*  A[1] * B[2] */
        "mul x25, x5, x23\n\t"
        "umulh x26, x5, x23\n\t"
        "adds x15, x15, x25\n\t"
        "adcs x16, x16, x26\n\t"
        "adc x17, x17, xzr\n\t"
        /*  A[2] * B[1] */
        "mul x25, x6, x22\n\t"
        "umulh x26, x6, x22\n\t"
        "adds x15, x15, x25\n\t"
        "adcs x16, x16, x26\n\t"
        "adc x17, x17, xzr\n\t"
        /*  A[3] * B[0] */
        "mul x25, x7, x21\n\t"
        "umulh x26, x7, x21\n\t"
        "adds x15, x15, x25\n\t"
        "adcs x16, x16, x26\n\t"
        "adc x17, x17, xzr\n\t"
        /*  A[1] * B[3] */
        "mul x25, x5, x24\n\t"
        "umulh x26, x5, x24\n\t"
        "adds x16, x16, x25\n\t"
        "adcs x17, x17, x26\n\t"
        "adc x19, xzr, xzr\n\t"
        /*  A[2] * B[2] */
        "mul x25, x6, x23\n\t"
        "umulh x26, x6, x23\n\t"
        "adds x16, x16, x25\n\t"
        "adcs x17, x17, x26\n\t"
        "adc x19, x19, xzr\n\t"
        /*  A[3] * B[1] */
        "mul x25, x7, x22\n\t"
        "umulh x26, x7, x22\n\t"
        "adds x16, x16, x25\n\t"
        "adcs x17, x17, x26\n\t"
        "adc x19, x19, xzr\n\t"
        /*  A[2] * B[3] */
        "mul x25, x6, x24\n\t"
        "umulh x26, x6, x24\n\t"
        "adds x17, x17, x25\n\t"
        "adcs x19, x19, x26\n\t"
        "adc x20, xzr, xzr\n\t"
        /*  A[3] * B[2] */
        "mul x25, x7, x23\n\t"
        "umulh x26, x7, x23\n\t"
        "adds x17, x17, x25\n\t"
        "adcs x19, x19, x26\n\t"
        "adc x20, x20, xzr\n\t"
        /*  A[3] * B[3] */
        "mul x25, x7, x24\n\t"
        "umulh x26, x7, x24\n\t"
        "adds x19, x19, x25\n\t"
        "adc x20, x20, x26\n\t"
        /* Reduce */
        /*  Move top half into t4-t7 and remove top bit from t3 */
        "extr x20, x20, x19, #63\n\t"
        "extr x19, x19, x17, #63\n\t"
        "extr x17, x17, x16, #63\n\t"
        "extr x16, x16, x15, #63\n\t"
        "and x15, x15, #0x7fffffffffffffff\n\t"
        /*  Multiply top half by 19 */
        "mov x25, #19\n\t"
        "mul x26, x25, x16\n\t"
        "umulh x16, x25, x16\n\t"
        "adds x12, x12, x26\n\t"
        "mul x26, x25, x17\n\t"
        "umulh x17, x25, x17\n\t"
        "adcs x13, x13, x26\n\t"
        "mul x26, x25, x19\n\t"
        "umulh x19, x25, x19\n\t"
        "adcs x14, x14, x26\n\t"
        "mul x26, x25, x20\n\t"
        "umulh x27, x25, x20\n\t"
        "adcs x15, x15, x26\n\t"
        "adc x27, x27, xzr\n\t"
        /*  Add remaining product results in */
        "adds x13, x13, x16\n\t"
        "adcs x14, x14, x17\n\t"
        "adcs x15, x15, x19\n\t"
        "adc x27, x27, xzr\n\t"
        /*  Overflow */
        "extr x27, x27, x15, #63\n\t"
        "mul x27, x27, x25\n\t"
        "and x15, x15, #0x7fffffffffffffff\n\t"
        "adds x12, x12, x27\n\t"
        "adcs x13, x13, xzr\n\t"
        "adcs x14, x14, xzr\n\t"
        "adc x15, x15, xzr\n\t"
        /*  Reduce if top bit set */
        "and x27, x25, x15, asr 63\n\t"
        "and x15, x15, #0x7fffffffffffffff\n\t"
        "adds x12, x12, x27\n\t"
        "adcs x13, x13, xzr\n\t"
        "adcs x14, x14, xzr\n\t"
        "adc x15, x15, xzr\n\t"
        /* x0 = ry (reloaded before use), x1 = qyminusx */
        "ldr x0, [x29, #24]\n\t"
        "ldr x1, [sp, #120]\n\t"
        /* Multiply: x4..x7 = B = (py - px) * qyminusx */
        "ldp x21, x22, [x1]\n\t"
        "ldp x23, x24, [x1, #16]\n\t"
        /*  A[0] * B[0] */
        "mul x4, x8, x21\n\t"
        "umulh x5, x8, x21\n\t"
        /*  A[0] * B[1] */
        "mul x25, x8, x22\n\t"
        "umulh x6, x8, x22\n\t"
        "adds x5, x5, x25\n\t"
        "adc x6, x6, xzr\n\t"
        /*  A[1] * B[0] */
        "mul x25, x9, x21\n\t"
        "umulh x26, x9, x21\n\t"
        "adds x5, x5, x25\n\t"
        "adcs x6, x6, x26\n\t"
        "adc x7, xzr, xzr\n\t"
        /*  A[0] * B[2] */
        "mul x25, x8, x23\n\t"
        "umulh x26, x8, x23\n\t"
        "adds x6, x6, x25\n\t"
        "adc x7, x7, x26\n\t"
        /*  A[1] * B[1] */
        "mul x25, x9, x22\n\t"
        "umulh x26, x9, x22\n\t"
        "adds x6, x6, x25\n\t"
        "adcs x7, x7, x26\n\t"
        "adc x16, xzr, xzr\n\t"
        /*  A[2] * B[0] */
        "mul x25, x10, x21\n\t"
        "umulh x26, x10, x21\n\t"
        "adds x6, x6, x25\n\t"
        "adcs x7, x7, x26\n\t"
        "adc x16, x16, xzr\n\t"
        /*  A[0] * B[3] */
        "mul x25, x8, x24\n\t"
        "umulh x26, x8, x24\n\t"
        "adds x7, x7, x25\n\t"
        "adcs x16, x16, x26\n\t"
        "adc x17, xzr, xzr\n\t"
        /*  A[1] * B[2] */
        "mul x25, x9, x23\n\t"
        "umulh x26, x9, x23\n\t"
        "adds x7, x7, x25\n\t"
        "adcs x16, x16, x26\n\t"
        "adc x17, x17, xzr\n\t"
        /*  A[2] * B[1] */
        "mul x25, x10, x22\n\t"
        "umulh x26, x10, x22\n\t"
        "adds x7, x7, x25\n\t"
        "adcs x16, x16, x26\n\t"
        "adc x17, x17, xzr\n\t"
        /*  A[3] * B[0] */
        "mul x25, x11, x21\n\t"
        "umulh x26, x11, x21\n\t"
        "adds x7, x7, x25\n\t"
        "adcs x16, x16, x26\n\t"
        "adc x17, x17, xzr\n\t"
        /*  A[1] * B[3] */
        "mul x25, x9, x24\n\t"
        "umulh x26, x9, x24\n\t"
        "adds x16, x16, x25\n\t"
        "adcs x17, x17, x26\n\t"
        "adc x19, xzr, xzr\n\t"
        /*  A[2] * B[2] */
        "mul x25, x10, x23\n\t"
        "umulh x26, x10, x23\n\t"
        "adds x16, x16, x25\n\t"
        "adcs x17, x17, x26\n\t"
        "adc x19, x19, xzr\n\t"
        /*  A[3] * B[1] */
        "mul x25, x11, x22\n\t"
        "umulh x26, x11, x22\n\t"
        "adds x16, x16, x25\n\t"
        "adcs x17, x17, x26\n\t"
        "adc x19, x19, xzr\n\t"
        /*  A[2] * B[3] */
        "mul x25, x10, x24\n\t"
        "umulh x26, x10, x24\n\t"
        "adds x17, x17, x25\n\t"
        "adcs x19, x19, x26\n\t"
        "adc x20, xzr, xzr\n\t"
        /*  A[3] * B[2] */
        "mul x25, x11, x23\n\t"
        "umulh x26, x11, x23\n\t"
        "adds x17, x17, x25\n\t"
        "adcs x19, x19, x26\n\t"
        "adc x20, x20, xzr\n\t"
        /*  A[3] * B[3] */
        "mul x25, x11, x24\n\t"
        "umulh x26, x11, x24\n\t"
        "adds x19, x19, x25\n\t"
        "adc x20, x20, x26\n\t"
        /* Reduce */
        /*  Move top half into t4-t7 and remove top bit from t3 */
        "extr x20, x20, x19, #63\n\t"
        "extr x19, x19, x17, #63\n\t"
        "extr x17, x17, x16, #63\n\t"
        "extr x16, x16, x7, #63\n\t"
        "and x7, x7, #0x7fffffffffffffff\n\t"
        /*  Multiply top half by 19 */
        "mov x25, #19\n\t"
        "mul x26, x25, x16\n\t"
        "umulh x16, x25, x16\n\t"
        "adds x4, x4, x26\n\t"
        "mul x26, x25, x17\n\t"
        "umulh x17, x25, x17\n\t"
        "adcs x5, x5, x26\n\t"
        "mul x26, x25, x19\n\t"
        "umulh x19, x25, x19\n\t"
        "adcs x6, x6, x26\n\t"
        "mul x26, x25, x20\n\t"
        "umulh x27, x25, x20\n\t"
        "adcs x7, x7, x26\n\t"
        "adc x27, x27, xzr\n\t"
        /*  Add remaining product results in */
        "adds x5, x5, x16\n\t"
        "adcs x6, x6, x17\n\t"
        "adcs x7, x7, x19\n\t"
        "adc x27, x27, xzr\n\t"
        /*  Overflow */
        "extr x27, x27, x7, #63\n\t"
        "mul x27, x27, x25\n\t"
        "and x7, x7, #0x7fffffffffffffff\n\t"
        "adds x4, x4, x27\n\t"
        "adcs x5, x5, xzr\n\t"
        "adcs x6, x6, xzr\n\t"
        "adc x7, x7, xzr\n\t"
        /*  Reduce if top bit set */
        "and x27, x25, x7, asr 63\n\t"
        "and x7, x7, #0x7fffffffffffffff\n\t"
        "adds x4, x4, x27\n\t"
        "adcs x5, x5, xzr\n\t"
        "adcs x6, x6, xzr\n\t"
        "adc x7, x7, xzr\n\t"
        /* x0 = ry, x1 = rx */
        "ldr x0, [x29, #24]\n\t"
        "ldr x1, [x29, #16]\n\t"
        /* Add: x8..x11 = A + B (stored to ry below) */
        "adds x8, x12, x4\n\t"
        "adcs x9, x13, x5\n\t"
        "adcs x10, x14, x6\n\t"
        "adc x11, x15, x7\n\t"
        "mov x25, #-19\n\t"
        "asr x28, x11, #63\n\t"
        /* Mask the modulus */
        "and x25, x28, x25\n\t"
        "and x26, x28, #0x7fffffffffffffff\n\t"
        /* Sub modulus (if overflow) */
        "subs x8, x8, x25\n\t"
        "sbcs x9, x9, x28\n\t"
        "sbcs x10, x10, x28\n\t"
        "sbc x11, x11, x26\n\t"
        /* Sub: x16,x17,x19,x20 = A - B (stored to rx below) */
        "subs x16, x12, x4\n\t"
        "sbcs x17, x13, x5\n\t"
        "sbcs x19, x14, x6\n\t"
        "sbcs x20, x15, x7\n\t"
        "mov x25, #-19\n\t"
        "csetm x28, cc\n\t"
        /* Mask the modulus */
        "and x25, x28, x25\n\t"
        "and x26, x28, #0x7fffffffffffffff\n\t"
        /* Add modulus (if underflow) */
        "adds x16, x16, x25\n\t"
        "adcs x17, x17, x28\n\t"
        "adcs x19, x19, x28\n\t"
        "adc x20, x20, x26\n\t"
        /* Store ry = A + B and rx = A - B */
        "stp x8, x9, [x0]\n\t"
        "stp x10, x11, [x0, #16]\n\t"
        "stp x16, x17, [x1]\n\t"
        "stp x19, x20, [x1, #16]\n\t"
        /* x0 = px (overwritten before use), x1 = pz, x2 = qz */
        "ldr x0, [x29, #48]\n\t"
        "ldr x1, [x29, #64]\n\t"
        "ldr x2, [sp, #96]\n\t"
        /* Multiply: x4..x7 = pz * qz */
        "ldp x12, x13, [x1]\n\t"
        "ldp x14, x15, [x1, #16]\n\t"
        "ldp x16, x17, [x2]\n\t"
        "ldp x19, x20, [x2, #16]\n\t"
        /*  A[0] * B[0] */
        "mul x4, x12, x16\n\t"
        "umulh x5, x12, x16\n\t"
        /*  A[0] * B[1] */
        "mul x25, x12, x17\n\t"
        "umulh x6, x12, x17\n\t"
        "adds x5, x5, x25\n\t"
        "adc x6, x6, xzr\n\t"
        /*  A[1] * B[0] */
        "mul x25, x13, x16\n\t"
        "umulh x26, x13, x16\n\t"
        "adds x5, x5, x25\n\t"
        "adcs x6, x6, x26\n\t"
        "adc x7, xzr, xzr\n\t"
        /*  A[0] * B[2] */
        "mul x25, x12, x19\n\t"
        "umulh x26, x12, x19\n\t"
        "adds x6, x6, x25\n\t"
        "adc x7, x7, x26\n\t"
        /*  A[1] * B[1] */
        "mul x25, x13, x17\n\t"
        "umulh x26, x13, x17\n\t"
        "adds x6, x6, x25\n\t"
        "adcs x7, x7, x26\n\t"
        "adc x8, xzr, xzr\n\t"
        /*  A[2] * B[0] */
        "mul x25, x14, x16\n\t"
        "umulh x26, x14, x16\n\t"
        "adds x6, x6, x25\n\t"
        "adcs x7, x7, x26\n\t"
        "adc x8, x8, xzr\n\t"
        /*  A[0] * B[3] */
        "mul x25, x12, x20\n\t"
        "umulh x26, x12, x20\n\t"
        "adds x7, x7, x25\n\t"
        "adcs x8, x8, x26\n\t"
        "adc x9, xzr, xzr\n\t"
        /*  A[1] * B[2] */
        "mul x25, x13, x19\n\t"
        "umulh x26, x13, x19\n\t"
        "adds x7, x7, x25\n\t"
        "adcs x8, x8, x26\n\t"
        "adc x9, x9, xzr\n\t"
        /*  A[2] * B[1] */
        "mul x25, x14, x17\n\t"
        "umulh x26, x14, x17\n\t"
        "adds x7, x7, x25\n\t"
        "adcs x8, x8, x26\n\t"
        "adc x9, x9, xzr\n\t"
        /*  A[3] * B[0] */
        "mul x25, x15, x16\n\t"
        "umulh x26, x15, x16\n\t"
        "adds x7, x7, x25\n\t"
        "adcs x8, x8, x26\n\t"
        "adc x9, x9, xzr\n\t"
        /*  A[1] * B[3] */
        "mul x25, x13, x20\n\t"
        "umulh x26, x13, x20\n\t"
        "adds x8, x8, x25\n\t"
        "adcs x9, x9, x26\n\t"
        "adc x10, xzr, xzr\n\t"
        /*  A[2] * B[2] */
        "mul x25, x14, x19\n\t"
        "umulh x26, x14, x19\n\t"
        "adds x8, x8, x25\n\t"
        "adcs x9, x9, x26\n\t"
        "adc x10, x10, xzr\n\t"
        /*  A[3] * B[1] */
        "mul x25, x15, x17\n\t"
        "umulh x26, x15, x17\n\t"
        "adds x8, x8, x25\n\t"
        "adcs x9, x9, x26\n\t"
        "adc x10, x10, xzr\n\t"
        /*  A[2] * B[3] */
        "mul x25, x14, x20\n\t"
        "umulh x26, x14, x20\n\t"
        "adds x9, x9, x25\n\t"
        "adcs x10, x10, x26\n\t"
        "adc x11, xzr, xzr\n\t"
        /*  A[3] * B[2] */
        "mul x25, x15, x19\n\t"
        "umulh x26, x15, x19\n\t"
        "adds x9, x9, x25\n\t"
        "adcs x10, x10, x26\n\t"
        "adc x11, x11, xzr\n\t"
        /*  A[3] * B[3] */
        "mul x25, x15, x20\n\t"
        "umulh x26, x15, x20\n\t"
        "adds x10, x10, x25\n\t"
        "adc x11, x11, x26\n\t"
        /* Reduce */
        /*  Move top half into t4-t7 and remove top bit from t3 */
        "extr x11, x11, x10, #63\n\t"
        "extr x10, x10, x9, #63\n\t"
        "extr x9, x9, x8, #63\n\t"
        "extr x8, x8, x7, #63\n\t"
        "and x7, x7, #0x7fffffffffffffff\n\t"
        /*  Multiply top half by 19 */
        "mov x25, #19\n\t"
        "mul x26, x25, x8\n\t"
        "umulh x8, x25, x8\n\t"
        "adds x4, x4, x26\n\t"
        "mul x26, x25, x9\n\t"
        "umulh x9, x25, x9\n\t"
        "adcs x5, x5, x26\n\t"
        "mul x26, x25, x10\n\t"
        "umulh x10, x25, x10\n\t"
        "adcs x6, x6, x26\n\t"
        "mul x26, x25, x11\n\t"
        "umulh x27, x25, x11\n\t"
        "adcs x7, x7, x26\n\t"
        "adc x27, x27, xzr\n\t"
        /*  Add remaining product results in */
        "adds x5, x5, x8\n\t"
        "adcs x6, x6, x9\n\t"
        "adcs x7, x7, x10\n\t"
        "adc x27, x27, xzr\n\t"
        /*  Overflow */
        "extr x27, x27, x7, #63\n\t"
        "mul x27, x27, x25\n\t"
        "and x7, x7, #0x7fffffffffffffff\n\t"
        "adds x4, x4, x27\n\t"
        "adcs x5, x5, xzr\n\t"
        "adcs x6, x6, xzr\n\t"
        "adc x7, x7, xzr\n\t"
        /*  Reduce if top bit set */
        "and x27, x25, x7, asr 63\n\t"
        "and x7, x7, #0x7fffffffffffffff\n\t"
        "adds x4, x4, x27\n\t"
        "adcs x5, x5, xzr\n\t"
        "adcs x6, x6, xzr\n\t"
        "adc x7, x7, xzr\n\t"
        /* x0 = px (overwritten before use) */
        "ldr x0, [x29, #48]\n\t"
        /* Double: x4..x7 = D = 2 * (pz * qz) */
        "adds x4, x4, x4\n\t"
        "adcs x5, x5, x5\n\t"
        "adcs x6, x6, x6\n\t"
        "adc x7, x7, x7\n\t"
        "mov x25, #-19\n\t"
        "asr x28, x7, #63\n\t"
        /* Mask the modulus */
        "and x25, x28, x25\n\t"
        "and x26, x28, #0x7fffffffffffffff\n\t"
        /* Sub modulus (if overflow) */
        "subs x4, x4, x25\n\t"
        "sbcs x5, x5, x28\n\t"
        "sbcs x6, x6, x28\n\t"
        "sbc x7, x7, x26\n\t"
        /* x0 = rt (overwritten before use), x1 = qt2d, x2 = pt */
        "ldr x0, [x29, #40]\n\t"
        "ldr x1, [sp, #104]\n\t"
        "ldr x2, [x29, #72]\n\t"
        /* Multiply: x8..x11 = C = qt2d * pt */
        "ldp x16, x17, [x1]\n\t"
        "ldp x19, x20, [x1, #16]\n\t"
        "ldp x21, x22, [x2]\n\t"
        "ldp x23, x24, [x2, #16]\n\t"
        /*  A[0] * B[0] */
        "mul x8, x16, x21\n\t"
        "umulh x9, x16, x21\n\t"
        /*  A[0] * B[1] */
        "mul x25, x16, x22\n\t"
        "umulh x10, x16, x22\n\t"
        "adds x9, x9, x25\n\t"
        "adc x10, x10, xzr\n\t"
        /*  A[1] * B[0] */
        "mul x25, x17, x21\n\t"
        "umulh x26, x17, x21\n\t"
        "adds x9, x9, x25\n\t"
        "adcs x10, x10, x26\n\t"
        "adc x11, xzr, xzr\n\t"
        /*  A[0] * B[2] */
        "mul x25, x16, x23\n\t"
        "umulh x26, x16, x23\n\t"
        "adds x10, x10, x25\n\t"
        "adc x11, x11, x26\n\t"
        /*  A[1] * B[1] */
        "mul x25, x17, x22\n\t"
        "umulh x26, x17, x22\n\t"
        "adds x10, x10, x25\n\t"
        "adcs x11, x11, x26\n\t"
        "adc x12, xzr, xzr\n\t"
        /*  A[2] * B[0] */
        "mul x25, x19, x21\n\t"
        "umulh x26, x19, x21\n\t"
        "adds x10, x10, x25\n\t"
        "adcs x11, x11, x26\n\t"
        "adc x12, x12, xzr\n\t"
        /*  A[0] * B[3] */
        "mul x25, x16, x24\n\t"
        "umulh x26, x16, x24\n\t"
        "adds x11, x11, x25\n\t"
        "adcs x12, x12, x26\n\t"
        "adc x13, xzr, xzr\n\t"
        /*  A[1] * B[2] */
        "mul x25, x17, x23\n\t"
        "umulh x26, x17, x23\n\t"
        "adds x11, x11, x25\n\t"
        "adcs x12, x12, x26\n\t"
        "adc x13, x13, xzr\n\t"
        /*  A[2] * B[1] */
        "mul x25, x19, x22\n\t"
        "umulh x26, x19, x22\n\t"
        "adds x11, x11, x25\n\t"
        "adcs x12, x12, x26\n\t"
        "adc x13, x13, xzr\n\t"
        /*  A[3] * B[0] */
        "mul x25, x20, x21\n\t"
        "umulh x26, x20, x21\n\t"
        "adds x11, x11, x25\n\t"
        "adcs x12, x12, x26\n\t"
        "adc x13, x13, xzr\n\t"
        /*  A[1] * B[3] */
        "mul x25, x17, x24\n\t"
        "umulh x26, x17, x24\n\t"
        "adds x12, x12, x25\n\t"
        "adcs x13, x13, x26\n\t"
        "adc x14, xzr, xzr\n\t"
        /*  A[2] * B[2] */
        "mul x25, x19, x23\n\t"
        "umulh x26, x19, x23\n\t"
        "adds x12, x12, x25\n\t"
        "adcs x13, x13, x26\n\t"
        "adc x14, x14, xzr\n\t"
        /*  A[3] * B[1] */
        "mul x25, x20, x22\n\t"
        "umulh x26, x20, x22\n\t"
        "adds x12, x12, x25\n\t"
        "adcs x13, x13, x26\n\t"
        "adc x14, x14, xzr\n\t"
        /*  A[2] * B[3] */
        "mul x25, x19, x24\n\t"
        "umulh x26, x19, x24\n\t"
        "adds x13, x13, x25\n\t"
        "adcs x14, x14, x26\n\t"
        "adc x15, xzr, xzr\n\t"
        /*  A[3] * B[2] */
        "mul x25, x20, x23\n\t"
        "umulh x26, x20, x23\n\t"
        "adds x13, x13, x25\n\t"
        "adcs x14, x14, x26\n\t"
        "adc x15, x15, xzr\n\t"
        /*  A[3] * B[3] */
        "mul x25, x20, x24\n\t"
        "umulh x26, x20, x24\n\t"
        "adds x14, x14, x25\n\t"
        "adc x15, x15, x26\n\t"
        /* Reduce */
        /*  Move top half into t4-t7 and remove top bit from t3 */
        "extr x15, x15, x14, #63\n\t"
        "extr x14, x14, x13, #63\n\t"
        "extr x13, x13, x12, #63\n\t"
        "extr x12, x12, x11, #63\n\t"
        "and x11, x11, #0x7fffffffffffffff\n\t"
        /*  Multiply top half by 19 */
        "mov x25, #19\n\t"
        "mul x26, x25, x12\n\t"
        "umulh x12, x25, x12\n\t"
        "adds x8, x8, x26\n\t"
        "mul x26, x25, x13\n\t"
        "umulh x13, x25, x13\n\t"
        "adcs x9, x9, x26\n\t"
        "mul x26, x25, x14\n\t"
        "umulh x14, x25, x14\n\t"
        "adcs x10, x10, x26\n\t"
        "mul x26, x25, x15\n\t"
        "umulh x27, x25, x15\n\t"
        "adcs x11, x11, x26\n\t"
        "adc x27, x27, xzr\n\t"
        /*  Add remaining product results in */
        "adds x9, x9, x12\n\t"
        "adcs x10, x10, x13\n\t"
        "adcs x11, x11, x14\n\t"
        "adc x27, x27, xzr\n\t"
        /*  Overflow */
        "extr x27, x27, x11, #63\n\t"
        "mul x27, x27, x25\n\t"
        "and x11, x11, #0x7fffffffffffffff\n\t"
        "adds x8, x8, x27\n\t"
        "adcs x9, x9, xzr\n\t"
        "adcs x10, x10, xzr\n\t"
        "adc x11, x11, xzr\n\t"
        /*  Reduce if top bit set */
        "and x27, x25, x11, asr 63\n\t"
        "and x11, x11, #0x7fffffffffffffff\n\t"
        "adds x8, x8, x27\n\t"
        "adcs x9, x9, xzr\n\t"
        "adcs x10, x10, xzr\n\t"
        "adc x11, x11, xzr\n\t"
        /* x0 = rz, x1 = rt */
        "ldr x0, [x29, #32]\n\t"
        "ldr x1, [x29, #40]\n\t"
        /* Add: x12..x15 = D + C (stored to rz below) */
        "adds x12, x4, x8\n\t"
        "adcs x13, x5, x9\n\t"
        "adcs x14, x6, x10\n\t"
        "adc x15, x7, x11\n\t"
        "mov x25, #-19\n\t"
        "asr x28, x15, #63\n\t"
        /* Mask the modulus */
        "and x25, x28, x25\n\t"
        "and x26, x28, #0x7fffffffffffffff\n\t"
        /* Sub modulus (if overflow) */
        "subs x12, x12, x25\n\t"
        "sbcs x13, x13, x28\n\t"
        "sbcs x14, x14, x28\n\t"
        "sbc x15, x15, x26\n\t"
        /* Sub: x16,x17,x19,x20 = D - C (stored to rt below) */
        "subs x16, x4, x8\n\t"
        "sbcs x17, x5, x9\n\t"
        "sbcs x19, x6, x10\n\t"
        "sbcs x20, x7, x11\n\t"
        "mov x25, #-19\n\t"
        "csetm x28, cc\n\t"
        /* Mask the modulus */
        "and x25, x28, x25\n\t"
        "and x26, x28, #0x7fffffffffffffff\n\t"
        /* Add modulus (if underflow) */
        "adds x16, x16, x25\n\t"
        "adcs x17, x17, x28\n\t"
        "adcs x19, x19, x28\n\t"
        "adc x20, x20, x26\n\t"
        /* Store rz = D + C and rt = D - C */
        "stp x12, x13, [x0]\n\t"
        "stp x14, x15, [x0, #16]\n\t"
        "stp x16, x17, [x1]\n\t"
        "stp x19, x20, [x1, #16]\n\t"
        /* Restore x29/x30 and release the 128-byte frame */
        "ldp x29, x30, [sp], #0x80\n\t"
        : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt), [qz] "+r" (qz), [qt2d] "+r" (qt2d), [qyplusx] "+r" (qyplusx), [qyminusx] "+r" (qyminusx)
        :
        /* "cc": the adds/adcs/subs/sbcs instructions above modify the flags */
        : "memory", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "cc"
    );
}
6513
fe_ge_sub(fe rx,fe ry,fe rz,fe rt,const fe px,const fe py,const fe pz,const fe pt,const fe qz,const fe qt2d,const fe qyplusx,const fe qyminusx)6514 void fe_ge_sub(fe rx, fe ry, fe rz, fe rt, const fe px, const fe py, const fe pz, const fe pt, const fe qz, const fe qt2d, const fe qyplusx, const fe qyminusx)
6515 {
6516 __asm__ __volatile__ (
6517 "stp x29, x30, [sp, #-128]!\n\t"
6518 "add x29, sp, #0\n\t"
6519 "str %x[qyminusx], [sp, #120]\n\t"
6520 "str %x[qyplusx], [sp, #112]\n\t"
6521 "str %x[qt2d], [sp, #104]\n\t"
6522 "str %x[qz], [sp, #96]\n\t"
6523 "str %x[rx], [x29, #16]\n\t"
6524 "str %x[ry], [x29, #24]\n\t"
6525 "str %x[rz], [x29, #32]\n\t"
6526 "str %x[rt], [x29, #40]\n\t"
6527 "str %x[px], [x29, #48]\n\t"
6528 "str %x[py], [x29, #56]\n\t"
6529 "str %x[pz], [x29, #64]\n\t"
6530 "str %x[pt], [x29, #72]\n\t"
6531 "ldr x2, [x29, #56]\n\t"
6532 "ldr x3, [x29, #48]\n\t"
6533 /* Add */
6534 "ldp x12, x13, [x2]\n\t"
6535 "ldp x14, x15, [x2, #16]\n\t"
6536 "ldp x16, x17, [x3]\n\t"
6537 "ldp x19, x20, [x3, #16]\n\t"
6538 "adds x4, x12, x16\n\t"
6539 "adcs x5, x13, x17\n\t"
6540 "adcs x6, x14, x19\n\t"
6541 "adc x7, x15, x20\n\t"
6542 "mov x25, #-19\n\t"
6543 "asr x28, x7, #63\n\t"
6544 /* Mask the modulus */
6545 "and x25, x28, x25\n\t"
6546 "and x26, x28, #0x7fffffffffffffff\n\t"
6547 /* Sub modulus (if overflow) */
6548 "subs x4, x4, x25\n\t"
6549 "sbcs x5, x5, x28\n\t"
6550 "sbcs x6, x6, x28\n\t"
6551 "sbc x7, x7, x26\n\t"
6552 /* Sub */
6553 "subs x8, x12, x16\n\t"
6554 "sbcs x9, x13, x17\n\t"
6555 "sbcs x10, x14, x19\n\t"
6556 "sbcs x11, x15, x20\n\t"
6557 "mov x25, #-19\n\t"
6558 "csetm x28, cc\n\t"
6559 /* Mask the modulus */
6560 "and x25, x28, x25\n\t"
6561 "and x26, x28, #0x7fffffffffffffff\n\t"
6562 /* Add modulus (if underflow) */
6563 "adds x8, x8, x25\n\t"
6564 "adcs x9, x9, x28\n\t"
6565 "adcs x10, x10, x28\n\t"
6566 "adc x11, x11, x26\n\t"
6567 "ldr x0, [x29, #32]\n\t"
6568 "ldr x2, [sp, #120]\n\t"
6569 /* Multiply */
6570 "ldp x21, x22, [x2]\n\t"
6571 "ldp x23, x24, [x2, #16]\n\t"
6572 /* A[0] * B[0] */
6573 "mul x12, x4, x21\n\t"
6574 "umulh x13, x4, x21\n\t"
6575 /* A[0] * B[1] */
6576 "mul x25, x4, x22\n\t"
6577 "umulh x14, x4, x22\n\t"
6578 "adds x13, x13, x25\n\t"
6579 "adc x14, x14, xzr\n\t"
6580 /* A[1] * B[0] */
6581 "mul x25, x5, x21\n\t"
6582 "umulh x26, x5, x21\n\t"
6583 "adds x13, x13, x25\n\t"
6584 "adcs x14, x14, x26\n\t"
6585 "adc x15, xzr, xzr\n\t"
6586 /* A[0] * B[2] */
6587 "mul x25, x4, x23\n\t"
6588 "umulh x26, x4, x23\n\t"
6589 "adds x14, x14, x25\n\t"
6590 "adc x15, x15, x26\n\t"
6591 /* A[1] * B[1] */
6592 "mul x25, x5, x22\n\t"
6593 "umulh x26, x5, x22\n\t"
6594 "adds x14, x14, x25\n\t"
6595 "adcs x15, x15, x26\n\t"
6596 "adc x16, xzr, xzr\n\t"
6597 /* A[2] * B[0] */
6598 "mul x25, x6, x21\n\t"
6599 "umulh x26, x6, x21\n\t"
6600 "adds x14, x14, x25\n\t"
6601 "adcs x15, x15, x26\n\t"
6602 "adc x16, x16, xzr\n\t"
6603 /* A[0] * B[3] */
6604 "mul x25, x4, x24\n\t"
6605 "umulh x26, x4, x24\n\t"
6606 "adds x15, x15, x25\n\t"
6607 "adcs x16, x16, x26\n\t"
6608 "adc x17, xzr, xzr\n\t"
6609 /* A[1] * B[2] */
6610 "mul x25, x5, x23\n\t"
6611 "umulh x26, x5, x23\n\t"
6612 "adds x15, x15, x25\n\t"
6613 "adcs x16, x16, x26\n\t"
6614 "adc x17, x17, xzr\n\t"
6615 /* A[2] * B[1] */
6616 "mul x25, x6, x22\n\t"
6617 "umulh x26, x6, x22\n\t"
6618 "adds x15, x15, x25\n\t"
6619 "adcs x16, x16, x26\n\t"
6620 "adc x17, x17, xzr\n\t"
6621 /* A[3] * B[0] */
6622 "mul x25, x7, x21\n\t"
6623 "umulh x26, x7, x21\n\t"
6624 "adds x15, x15, x25\n\t"
6625 "adcs x16, x16, x26\n\t"
6626 "adc x17, x17, xzr\n\t"
6627 /* A[1] * B[3] */
6628 "mul x25, x5, x24\n\t"
6629 "umulh x26, x5, x24\n\t"
6630 "adds x16, x16, x25\n\t"
6631 "adcs x17, x17, x26\n\t"
6632 "adc x19, xzr, xzr\n\t"
6633 /* A[2] * B[2] */
6634 "mul x25, x6, x23\n\t"
6635 "umulh x26, x6, x23\n\t"
6636 "adds x16, x16, x25\n\t"
6637 "adcs x17, x17, x26\n\t"
6638 "adc x19, x19, xzr\n\t"
6639 /* A[3] * B[1] */
6640 "mul x25, x7, x22\n\t"
6641 "umulh x26, x7, x22\n\t"
6642 "adds x16, x16, x25\n\t"
6643 "adcs x17, x17, x26\n\t"
6644 "adc x19, x19, xzr\n\t"
6645 /* A[2] * B[3] */
6646 "mul x25, x6, x24\n\t"
6647 "umulh x26, x6, x24\n\t"
6648 "adds x17, x17, x25\n\t"
6649 "adcs x19, x19, x26\n\t"
6650 "adc x20, xzr, xzr\n\t"
6651 /* A[3] * B[2] */
6652 "mul x25, x7, x23\n\t"
6653 "umulh x26, x7, x23\n\t"
6654 "adds x17, x17, x25\n\t"
6655 "adcs x19, x19, x26\n\t"
6656 "adc x20, x20, xzr\n\t"
6657 /* A[3] * B[3] */
6658 "mul x25, x7, x24\n\t"
6659 "umulh x26, x7, x24\n\t"
6660 "adds x19, x19, x25\n\t"
6661 "adc x20, x20, x26\n\t"
6662 /* Reduce */
6663 /* Move top half into t4-t7 and remove top bit from t3 */
6664 "extr x20, x20, x19, #63\n\t"
6665 "extr x19, x19, x17, #63\n\t"
6666 "extr x17, x17, x16, #63\n\t"
6667 "extr x16, x16, x15, #63\n\t"
6668 "and x15, x15, #0x7fffffffffffffff\n\t"
6669 /* Multiply top half by 19 */
6670 "mov x25, #19\n\t"
6671 "mul x26, x25, x16\n\t"
6672 "umulh x16, x25, x16\n\t"
6673 "adds x12, x12, x26\n\t"
6674 "mul x26, x25, x17\n\t"
6675 "umulh x17, x25, x17\n\t"
6676 "adcs x13, x13, x26\n\t"
6677 "mul x26, x25, x19\n\t"
6678 "umulh x19, x25, x19\n\t"
6679 "adcs x14, x14, x26\n\t"
6680 "mul x26, x25, x20\n\t"
6681 "umulh x27, x25, x20\n\t"
6682 "adcs x15, x15, x26\n\t"
6683 "adc x27, x27, xzr\n\t"
6684 /* Add remaining product results in */
6685 "adds x13, x13, x16\n\t"
6686 "adcs x14, x14, x17\n\t"
6687 "adcs x15, x15, x19\n\t"
6688 "adc x27, x27, xzr\n\t"
6689 /* Overflow */
6690 "extr x27, x27, x15, #63\n\t"
6691 "mul x27, x27, x25\n\t"
6692 "and x15, x15, #0x7fffffffffffffff\n\t"
6693 "adds x12, x12, x27\n\t"
6694 "adcs x13, x13, xzr\n\t"
6695 "adcs x14, x14, xzr\n\t"
6696 "adc x15, x15, xzr\n\t"
6697 /* Reduce if top bit set */
6698 "and x27, x25, x15, asr 63\n\t"
6699 "and x15, x15, #0x7fffffffffffffff\n\t"
6700 "adds x12, x12, x27\n\t"
6701 "adcs x13, x13, xzr\n\t"
6702 "adcs x14, x14, xzr\n\t"
6703 "adc x15, x15, xzr\n\t"
6704 /* Store */
6705 "ldr x0, [x29, #24]\n\t"
6706 "ldr x1, [sp, #112]\n\t"
6707 /* Multiply */
6708 "ldp x21, x22, [x1]\n\t"
6709 "ldp x23, x24, [x1, #16]\n\t"
6710 /* A[0] * B[0] */
6711 "mul x4, x8, x21\n\t"
6712 "umulh x5, x8, x21\n\t"
6713 /* A[0] * B[1] */
6714 "mul x25, x8, x22\n\t"
6715 "umulh x6, x8, x22\n\t"
6716 "adds x5, x5, x25\n\t"
6717 "adc x6, x6, xzr\n\t"
6718 /* A[1] * B[0] */
6719 "mul x25, x9, x21\n\t"
6720 "umulh x26, x9, x21\n\t"
6721 "adds x5, x5, x25\n\t"
6722 "adcs x6, x6, x26\n\t"
6723 "adc x7, xzr, xzr\n\t"
6724 /* A[0] * B[2] */
6725 "mul x25, x8, x23\n\t"
6726 "umulh x26, x8, x23\n\t"
6727 "adds x6, x6, x25\n\t"
6728 "adc x7, x7, x26\n\t"
6729 /* A[1] * B[1] */
6730 "mul x25, x9, x22\n\t"
6731 "umulh x26, x9, x22\n\t"
6732 "adds x6, x6, x25\n\t"
6733 "adcs x7, x7, x26\n\t"
6734 "adc x16, xzr, xzr\n\t"
6735 /* A[2] * B[0] */
6736 "mul x25, x10, x21\n\t"
6737 "umulh x26, x10, x21\n\t"
6738 "adds x6, x6, x25\n\t"
6739 "adcs x7, x7, x26\n\t"
6740 "adc x16, x16, xzr\n\t"
6741 /* A[0] * B[3] */
6742 "mul x25, x8, x24\n\t"
6743 "umulh x26, x8, x24\n\t"
6744 "adds x7, x7, x25\n\t"
6745 "adcs x16, x16, x26\n\t"
6746 "adc x17, xzr, xzr\n\t"
6747 /* A[1] * B[2] */
6748 "mul x25, x9, x23\n\t"
6749 "umulh x26, x9, x23\n\t"
6750 "adds x7, x7, x25\n\t"
6751 "adcs x16, x16, x26\n\t"
6752 "adc x17, x17, xzr\n\t"
6753 /* A[2] * B[1] */
6754 "mul x25, x10, x22\n\t"
6755 "umulh x26, x10, x22\n\t"
6756 "adds x7, x7, x25\n\t"
6757 "adcs x16, x16, x26\n\t"
6758 "adc x17, x17, xzr\n\t"
6759 /* A[3] * B[0] */
6760 "mul x25, x11, x21\n\t"
6761 "umulh x26, x11, x21\n\t"
6762 "adds x7, x7, x25\n\t"
6763 "adcs x16, x16, x26\n\t"
6764 "adc x17, x17, xzr\n\t"
6765 /* A[1] * B[3] */
6766 "mul x25, x9, x24\n\t"
6767 "umulh x26, x9, x24\n\t"
6768 "adds x16, x16, x25\n\t"
6769 "adcs x17, x17, x26\n\t"
6770 "adc x19, xzr, xzr\n\t"
6771 /* A[2] * B[2] */
6772 "mul x25, x10, x23\n\t"
6773 "umulh x26, x10, x23\n\t"
6774 "adds x16, x16, x25\n\t"
6775 "adcs x17, x17, x26\n\t"
6776 "adc x19, x19, xzr\n\t"
6777 /* A[3] * B[1] */
6778 "mul x25, x11, x22\n\t"
6779 "umulh x26, x11, x22\n\t"
6780 "adds x16, x16, x25\n\t"
6781 "adcs x17, x17, x26\n\t"
6782 "adc x19, x19, xzr\n\t"
6783 /* A[2] * B[3] */
6784 "mul x25, x10, x24\n\t"
6785 "umulh x26, x10, x24\n\t"
6786 "adds x17, x17, x25\n\t"
6787 "adcs x19, x19, x26\n\t"
6788 "adc x20, xzr, xzr\n\t"
6789 /* A[3] * B[2] */
6790 "mul x25, x11, x23\n\t"
6791 "umulh x26, x11, x23\n\t"
6792 "adds x17, x17, x25\n\t"
6793 "adcs x19, x19, x26\n\t"
6794 "adc x20, x20, xzr\n\t"
6795 /* A[3] * B[3] */
6796 "mul x25, x11, x24\n\t"
6797 "umulh x26, x11, x24\n\t"
6798 "adds x19, x19, x25\n\t"
6799 "adc x20, x20, x26\n\t"
6800 /* Reduce */
6801 /* Move top half into t4-t7 and remove top bit from t3 */
6802 "extr x20, x20, x19, #63\n\t"
6803 "extr x19, x19, x17, #63\n\t"
6804 "extr x17, x17, x16, #63\n\t"
6805 "extr x16, x16, x7, #63\n\t"
6806 "and x7, x7, #0x7fffffffffffffff\n\t"
6807 /* Multiply top half by 19 */
6808 "mov x25, #19\n\t"
6809 "mul x26, x25, x16\n\t"
6810 "umulh x16, x25, x16\n\t"
6811 "adds x4, x4, x26\n\t"
6812 "mul x26, x25, x17\n\t"
6813 "umulh x17, x25, x17\n\t"
6814 "adcs x5, x5, x26\n\t"
6815 "mul x26, x25, x19\n\t"
6816 "umulh x19, x25, x19\n\t"
6817 "adcs x6, x6, x26\n\t"
6818 "mul x26, x25, x20\n\t"
6819 "umulh x27, x25, x20\n\t"
6820 "adcs x7, x7, x26\n\t"
6821 "adc x27, x27, xzr\n\t"
6822 /* Add remaining product results in */
6823 "adds x5, x5, x16\n\t"
6824 "adcs x6, x6, x17\n\t"
6825 "adcs x7, x7, x19\n\t"
6826 "adc x27, x27, xzr\n\t"
6827 /* Overflow */
6828 "extr x27, x27, x7, #63\n\t"
6829 "mul x27, x27, x25\n\t"
6830 "and x7, x7, #0x7fffffffffffffff\n\t"
6831 "adds x4, x4, x27\n\t"
6832 "adcs x5, x5, xzr\n\t"
6833 "adcs x6, x6, xzr\n\t"
6834 "adc x7, x7, xzr\n\t"
6835 /* Reduce if top bit set */
6836 "and x27, x25, x7, asr 63\n\t"
6837 "and x7, x7, #0x7fffffffffffffff\n\t"
6838 "adds x4, x4, x27\n\t"
6839 "adcs x5, x5, xzr\n\t"
6840 "adcs x6, x6, xzr\n\t"
6841 "adc x7, x7, xzr\n\t"
6842 /* Store */
6843 "ldr x0, [x29, #24]\n\t"
6844 "ldr x1, [x29, #16]\n\t"
6845 /* Add */
6846 "adds x8, x12, x4\n\t"
6847 "adcs x9, x13, x5\n\t"
6848 "adcs x10, x14, x6\n\t"
6849 "adc x11, x15, x7\n\t"
6850 "mov x25, #-19\n\t"
6851 "asr x28, x11, #63\n\t"
6852 /* Mask the modulus */
6853 "and x25, x28, x25\n\t"
6854 "and x26, x28, #0x7fffffffffffffff\n\t"
6855 /* Sub modulus (if overflow) */
6856 "subs x8, x8, x25\n\t"
6857 "sbcs x9, x9, x28\n\t"
6858 "sbcs x10, x10, x28\n\t"
6859 "sbc x11, x11, x26\n\t"
6860 /* Sub */
6861 "subs x16, x12, x4\n\t"
6862 "sbcs x17, x13, x5\n\t"
6863 "sbcs x19, x14, x6\n\t"
6864 "sbcs x20, x15, x7\n\t"
6865 "mov x25, #-19\n\t"
6866 "csetm x28, cc\n\t"
6867 /* Mask the modulus */
6868 "and x25, x28, x25\n\t"
6869 "and x26, x28, #0x7fffffffffffffff\n\t"
6870 /* Add modulus (if underflow) */
6871 "adds x16, x16, x25\n\t"
6872 "adcs x17, x17, x28\n\t"
6873 "adcs x19, x19, x28\n\t"
6874 "adc x20, x20, x26\n\t"
6875 "stp x8, x9, [x0]\n\t"
6876 "stp x10, x11, [x0, #16]\n\t"
6877 "stp x16, x17, [x1]\n\t"
6878 "stp x19, x20, [x1, #16]\n\t"
6879 "ldr x0, [x29, #48]\n\t"
6880 "ldr x1, [x29, #64]\n\t"
6881 "ldr x2, [sp, #96]\n\t"
6882 /* Multiply */
6883 "ldp x12, x13, [x1]\n\t"
6884 "ldp x14, x15, [x1, #16]\n\t"
6885 "ldp x16, x17, [x2]\n\t"
6886 "ldp x19, x20, [x2, #16]\n\t"
6887 /* A[0] * B[0] */
6888 "mul x4, x12, x16\n\t"
6889 "umulh x5, x12, x16\n\t"
6890 /* A[0] * B[1] */
6891 "mul x25, x12, x17\n\t"
6892 "umulh x6, x12, x17\n\t"
6893 "adds x5, x5, x25\n\t"
6894 "adc x6, x6, xzr\n\t"
6895 /* A[1] * B[0] */
6896 "mul x25, x13, x16\n\t"
6897 "umulh x26, x13, x16\n\t"
6898 "adds x5, x5, x25\n\t"
6899 "adcs x6, x6, x26\n\t"
6900 "adc x7, xzr, xzr\n\t"
6901 /* A[0] * B[2] */
6902 "mul x25, x12, x19\n\t"
6903 "umulh x26, x12, x19\n\t"
6904 "adds x6, x6, x25\n\t"
6905 "adc x7, x7, x26\n\t"
6906 /* A[1] * B[1] */
6907 "mul x25, x13, x17\n\t"
6908 "umulh x26, x13, x17\n\t"
6909 "adds x6, x6, x25\n\t"
6910 "adcs x7, x7, x26\n\t"
6911 "adc x8, xzr, xzr\n\t"
6912 /* A[2] * B[0] */
6913 "mul x25, x14, x16\n\t"
6914 "umulh x26, x14, x16\n\t"
6915 "adds x6, x6, x25\n\t"
6916 "adcs x7, x7, x26\n\t"
6917 "adc x8, x8, xzr\n\t"
6918 /* A[0] * B[3] */
6919 "mul x25, x12, x20\n\t"
6920 "umulh x26, x12, x20\n\t"
6921 "adds x7, x7, x25\n\t"
6922 "adcs x8, x8, x26\n\t"
6923 "adc x9, xzr, xzr\n\t"
6924 /* A[1] * B[2] */
6925 "mul x25, x13, x19\n\t"
6926 "umulh x26, x13, x19\n\t"
6927 "adds x7, x7, x25\n\t"
6928 "adcs x8, x8, x26\n\t"
6929 "adc x9, x9, xzr\n\t"
6930 /* A[2] * B[1] */
6931 "mul x25, x14, x17\n\t"
6932 "umulh x26, x14, x17\n\t"
6933 "adds x7, x7, x25\n\t"
6934 "adcs x8, x8, x26\n\t"
6935 "adc x9, x9, xzr\n\t"
6936 /* A[3] * B[0] */
6937 "mul x25, x15, x16\n\t"
6938 "umulh x26, x15, x16\n\t"
6939 "adds x7, x7, x25\n\t"
6940 "adcs x8, x8, x26\n\t"
6941 "adc x9, x9, xzr\n\t"
6942 /* A[1] * B[3] */
6943 "mul x25, x13, x20\n\t"
6944 "umulh x26, x13, x20\n\t"
6945 "adds x8, x8, x25\n\t"
6946 "adcs x9, x9, x26\n\t"
6947 "adc x10, xzr, xzr\n\t"
6948 /* A[2] * B[2] */
6949 "mul x25, x14, x19\n\t"
6950 "umulh x26, x14, x19\n\t"
6951 "adds x8, x8, x25\n\t"
6952 "adcs x9, x9, x26\n\t"
6953 "adc x10, x10, xzr\n\t"
6954 /* A[3] * B[1] */
6955 "mul x25, x15, x17\n\t"
6956 "umulh x26, x15, x17\n\t"
6957 "adds x8, x8, x25\n\t"
6958 "adcs x9, x9, x26\n\t"
6959 "adc x10, x10, xzr\n\t"
6960 /* A[2] * B[3] */
6961 "mul x25, x14, x20\n\t"
6962 "umulh x26, x14, x20\n\t"
6963 "adds x9, x9, x25\n\t"
6964 "adcs x10, x10, x26\n\t"
6965 "adc x11, xzr, xzr\n\t"
6966 /* A[3] * B[2] */
6967 "mul x25, x15, x19\n\t"
6968 "umulh x26, x15, x19\n\t"
6969 "adds x9, x9, x25\n\t"
6970 "adcs x10, x10, x26\n\t"
6971 "adc x11, x11, xzr\n\t"
6972 /* A[3] * B[3] */
6973 "mul x25, x15, x20\n\t"
6974 "umulh x26, x15, x20\n\t"
6975 "adds x10, x10, x25\n\t"
6976 "adc x11, x11, x26\n\t"
6977 /* Reduce */
6978 /* Move top half into t4-t7 and remove top bit from t3 */
6979 "extr x11, x11, x10, #63\n\t"
6980 "extr x10, x10, x9, #63\n\t"
6981 "extr x9, x9, x8, #63\n\t"
6982 "extr x8, x8, x7, #63\n\t"
6983 "and x7, x7, #0x7fffffffffffffff\n\t"
6984 /* Multiply top half by 19 */
6985 "mov x25, #19\n\t"
6986 "mul x26, x25, x8\n\t"
6987 "umulh x8, x25, x8\n\t"
6988 "adds x4, x4, x26\n\t"
6989 "mul x26, x25, x9\n\t"
6990 "umulh x9, x25, x9\n\t"
6991 "adcs x5, x5, x26\n\t"
6992 "mul x26, x25, x10\n\t"
6993 "umulh x10, x25, x10\n\t"
6994 "adcs x6, x6, x26\n\t"
6995 "mul x26, x25, x11\n\t"
6996 "umulh x27, x25, x11\n\t"
6997 "adcs x7, x7, x26\n\t"
6998 "adc x27, x27, xzr\n\t"
6999 /* Add remaining product results in */
7000 "adds x5, x5, x8\n\t"
7001 "adcs x6, x6, x9\n\t"
7002 "adcs x7, x7, x10\n\t"
7003 "adc x27, x27, xzr\n\t"
7004 /* Overflow */
7005 "extr x27, x27, x7, #63\n\t"
7006 "mul x27, x27, x25\n\t"
7007 "and x7, x7, #0x7fffffffffffffff\n\t"
7008 "adds x4, x4, x27\n\t"
7009 "adcs x5, x5, xzr\n\t"
7010 "adcs x6, x6, xzr\n\t"
7011 "adc x7, x7, xzr\n\t"
7012 /* Reduce if top bit set */
7013 "and x27, x25, x7, asr 63\n\t"
7014 "and x7, x7, #0x7fffffffffffffff\n\t"
7015 "adds x4, x4, x27\n\t"
7016 "adcs x5, x5, xzr\n\t"
7017 "adcs x6, x6, xzr\n\t"
7018 "adc x7, x7, xzr\n\t"
7019 /* Store */
7020 "ldr x0, [x29, #48]\n\t"
7021 /* Double */
7022 "adds x4, x4, x4\n\t"
7023 "adcs x5, x5, x5\n\t"
7024 "adcs x6, x6, x6\n\t"
7025 "adc x7, x7, x7\n\t"
7026 "mov x25, #-19\n\t"
7027 "asr x28, x7, #63\n\t"
7028 /* Mask the modulus */
7029 "and x25, x28, x25\n\t"
7030 "and x26, x28, #0x7fffffffffffffff\n\t"
7031 /* Sub modulus (if overflow) */
7032 "subs x4, x4, x25\n\t"
7033 "sbcs x5, x5, x28\n\t"
7034 "sbcs x6, x6, x28\n\t"
7035 "sbc x7, x7, x26\n\t"
7036 "ldr x0, [x29, #40]\n\t"
7037 "ldr x1, [sp, #104]\n\t"
7038 "ldr x2, [x29, #72]\n\t"
7039 /* Multiply */
7040 "ldp x16, x17, [x1]\n\t"
7041 "ldp x19, x20, [x1, #16]\n\t"
7042 "ldp x21, x22, [x2]\n\t"
7043 "ldp x23, x24, [x2, #16]\n\t"
7044 /* A[0] * B[0] */
7045 "mul x8, x16, x21\n\t"
7046 "umulh x9, x16, x21\n\t"
7047 /* A[0] * B[1] */
7048 "mul x25, x16, x22\n\t"
7049 "umulh x10, x16, x22\n\t"
7050 "adds x9, x9, x25\n\t"
7051 "adc x10, x10, xzr\n\t"
7052 /* A[1] * B[0] */
7053 "mul x25, x17, x21\n\t"
7054 "umulh x26, x17, x21\n\t"
7055 "adds x9, x9, x25\n\t"
7056 "adcs x10, x10, x26\n\t"
7057 "adc x11, xzr, xzr\n\t"
7058 /* A[0] * B[2] */
7059 "mul x25, x16, x23\n\t"
7060 "umulh x26, x16, x23\n\t"
7061 "adds x10, x10, x25\n\t"
7062 "adc x11, x11, x26\n\t"
7063 /* A[1] * B[1] */
7064 "mul x25, x17, x22\n\t"
7065 "umulh x26, x17, x22\n\t"
7066 "adds x10, x10, x25\n\t"
7067 "adcs x11, x11, x26\n\t"
7068 "adc x12, xzr, xzr\n\t"
7069 /* A[2] * B[0] */
7070 "mul x25, x19, x21\n\t"
7071 "umulh x26, x19, x21\n\t"
7072 "adds x10, x10, x25\n\t"
7073 "adcs x11, x11, x26\n\t"
7074 "adc x12, x12, xzr\n\t"
7075 /* A[0] * B[3] */
7076 "mul x25, x16, x24\n\t"
7077 "umulh x26, x16, x24\n\t"
7078 "adds x11, x11, x25\n\t"
7079 "adcs x12, x12, x26\n\t"
7080 "adc x13, xzr, xzr\n\t"
7081 /* A[1] * B[2] */
7082 "mul x25, x17, x23\n\t"
7083 "umulh x26, x17, x23\n\t"
7084 "adds x11, x11, x25\n\t"
7085 "adcs x12, x12, x26\n\t"
7086 "adc x13, x13, xzr\n\t"
7087 /* A[2] * B[1] */
7088 "mul x25, x19, x22\n\t"
7089 "umulh x26, x19, x22\n\t"
7090 "adds x11, x11, x25\n\t"
7091 "adcs x12, x12, x26\n\t"
7092 "adc x13, x13, xzr\n\t"
7093 /* A[3] * B[0] */
7094 "mul x25, x20, x21\n\t"
7095 "umulh x26, x20, x21\n\t"
7096 "adds x11, x11, x25\n\t"
7097 "adcs x12, x12, x26\n\t"
7098 "adc x13, x13, xzr\n\t"
7099 /* A[1] * B[3] */
7100 "mul x25, x17, x24\n\t"
7101 "umulh x26, x17, x24\n\t"
7102 "adds x12, x12, x25\n\t"
7103 "adcs x13, x13, x26\n\t"
7104 "adc x14, xzr, xzr\n\t"
7105 /* A[2] * B[2] */
7106 "mul x25, x19, x23\n\t"
7107 "umulh x26, x19, x23\n\t"
7108 "adds x12, x12, x25\n\t"
7109 "adcs x13, x13, x26\n\t"
7110 "adc x14, x14, xzr\n\t"
7111 /* A[3] * B[1] */
7112 "mul x25, x20, x22\n\t"
7113 "umulh x26, x20, x22\n\t"
7114 "adds x12, x12, x25\n\t"
7115 "adcs x13, x13, x26\n\t"
7116 "adc x14, x14, xzr\n\t"
7117 /* A[2] * B[3] */
7118 "mul x25, x19, x24\n\t"
7119 "umulh x26, x19, x24\n\t"
7120 "adds x13, x13, x25\n\t"
7121 "adcs x14, x14, x26\n\t"
7122 "adc x15, xzr, xzr\n\t"
7123 /* A[3] * B[2] */
7124 "mul x25, x20, x23\n\t"
7125 "umulh x26, x20, x23\n\t"
7126 "adds x13, x13, x25\n\t"
7127 "adcs x14, x14, x26\n\t"
7128 "adc x15, x15, xzr\n\t"
7129 /* A[3] * B[3] */
7130 "mul x25, x20, x24\n\t"
7131 "umulh x26, x20, x24\n\t"
7132 "adds x14, x14, x25\n\t"
7133 "adc x15, x15, x26\n\t"
7134 /* Reduce */
7135 /* Move top half into t4-t7 and remove top bit from t3 */
7136 "extr x15, x15, x14, #63\n\t"
7137 "extr x14, x14, x13, #63\n\t"
7138 "extr x13, x13, x12, #63\n\t"
7139 "extr x12, x12, x11, #63\n\t"
7140 "and x11, x11, #0x7fffffffffffffff\n\t"
7141 /* Multiply top half by 19 */
7142 "mov x25, #19\n\t"
7143 "mul x26, x25, x12\n\t"
7144 "umulh x12, x25, x12\n\t"
7145 "adds x8, x8, x26\n\t"
7146 "mul x26, x25, x13\n\t"
7147 "umulh x13, x25, x13\n\t"
7148 "adcs x9, x9, x26\n\t"
7149 "mul x26, x25, x14\n\t"
7150 "umulh x14, x25, x14\n\t"
7151 "adcs x10, x10, x26\n\t"
7152 "mul x26, x25, x15\n\t"
7153 "umulh x27, x25, x15\n\t"
7154 "adcs x11, x11, x26\n\t"
7155 "adc x27, x27, xzr\n\t"
7156 /* Add remaining product results in */
7157 "adds x9, x9, x12\n\t"
7158 "adcs x10, x10, x13\n\t"
7159 "adcs x11, x11, x14\n\t"
7160 "adc x27, x27, xzr\n\t"
7161 /* Overflow */
7162 "extr x27, x27, x11, #63\n\t"
7163 "mul x27, x27, x25\n\t"
7164 "and x11, x11, #0x7fffffffffffffff\n\t"
7165 "adds x8, x8, x27\n\t"
7166 "adcs x9, x9, xzr\n\t"
7167 "adcs x10, x10, xzr\n\t"
7168 "adc x11, x11, xzr\n\t"
7169 /* Reduce if top bit set */
7170 "and x27, x25, x11, asr 63\n\t"
7171 "and x11, x11, #0x7fffffffffffffff\n\t"
7172 "adds x8, x8, x27\n\t"
7173 "adcs x9, x9, xzr\n\t"
7174 "adcs x10, x10, xzr\n\t"
7175 "adc x11, x11, xzr\n\t"
7176 /* Store */
7177 "ldr x0, [x29, #40]\n\t"
7178 "ldr x1, [x29, #32]\n\t"
7179 /* Add */
7180 "adds x12, x4, x8\n\t"
7181 "adcs x13, x5, x9\n\t"
7182 "adcs x14, x6, x10\n\t"
7183 "adc x15, x7, x11\n\t"
7184 "mov x25, #-19\n\t"
7185 "asr x28, x15, #63\n\t"
7186 /* Mask the modulus */
7187 "and x25, x28, x25\n\t"
7188 "and x26, x28, #0x7fffffffffffffff\n\t"
7189 /* Sub modulus (if overflow) */
7190 "subs x12, x12, x25\n\t"
7191 "sbcs x13, x13, x28\n\t"
7192 "sbcs x14, x14, x28\n\t"
7193 "sbc x15, x15, x26\n\t"
7194 /* Sub */
7195 "subs x16, x4, x8\n\t"
7196 "sbcs x17, x5, x9\n\t"
7197 "sbcs x19, x6, x10\n\t"
7198 "sbcs x20, x7, x11\n\t"
7199 "mov x25, #-19\n\t"
7200 "csetm x28, cc\n\t"
7201 /* Mask the modulus */
7202 "and x25, x28, x25\n\t"
7203 "and x26, x28, #0x7fffffffffffffff\n\t"
7204 /* Add modulus (if underflow) */
7205 "adds x16, x16, x25\n\t"
7206 "adcs x17, x17, x28\n\t"
7207 "adcs x19, x19, x28\n\t"
7208 "adc x20, x20, x26\n\t"
7209 "stp x12, x13, [x0]\n\t"
7210 "stp x14, x15, [x0, #16]\n\t"
7211 "stp x16, x17, [x1]\n\t"
7212 "stp x19, x20, [x1, #16]\n\t"
7213 "ldp x29, x30, [sp], #0x80\n\t"
7214 : [rx] "+r" (rx), [ry] "+r" (ry), [rz] "+r" (rz), [rt] "+r" (rt), [px] "+r" (px), [py] "+r" (py), [pz] "+r" (pz), [pt] "+r" (pt), [qz] "+r" (qz), [qt2d] "+r" (qt2d), [qyplusx] "+r" (qyplusx), [qyminusx] "+r" (qyminusx)
7215 :
7216 : "memory", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
7217 );
7218 }
7219
7220 #endif /* HAVE_CURVE25519 */
7221 #endif /* __aarch64__ */
7222 #endif /* WOLFSSL_ARMASM */
7223