/* asm.c
 *
 * Copyright (C) 2006-2021 wolfSSL Inc.
 *
 * This file is part of wolfSSL.
 *
 * wolfSSL is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * wolfSSL is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
 */


#ifdef HAVE_CONFIG_H
    #include <config.h>
#endif

#include <wolfssl/wolfcrypt/settings.h>

/*
 * Based on public domain TomsFastMath 0.10 by Tom St Denis, tomstdenis@iahu.ca,
 * http://math.libtomcrypt.com
 */


/******************************************************************/
/* fp_montgomery_reduce.c asm or generic */


/* Each platform needs to query cpuid to see if the BMI2 (MULX) and ADX
 * (ADCX/ADOX) extensions are supported. Also, let's set up a macro for
 * proper linkage w/o ABI conflicts.
 */

#if defined(HAVE_INTEL_MULX)
#ifndef _MSC_VER
    #define cpuid(reg, leaf, sub)\
            __asm__ __volatile__ ("cpuid":\
             "=a" (reg[0]), "=b" (reg[1]), "=c" (reg[2]), "=d" (reg[3]) :\
             "a" (leaf), "c"(sub));

    #define XASM_LINK(f) asm(f)
#else

    #include <intrin.h>
    #define cpuid(a,b,c) __cpuidex((int*)a,b,c)

    #define XASM_LINK(f)

#endif /* _MSC_VER */

#define EAX 0
#define EBX 1
#define ECX 2
#define EDX 3

#define CPUID_AVX1   0x1
#define CPUID_AVX2   0x2
#define CPUID_RDRAND 0x4
#define CPUID_RDSEED 0x8
#define CPUID_BMI2   0x10   /* MULX, RORX */
#define CPUID_ADX    0x20   /* ADCX, ADOX */

#define IS_INTEL_AVX1       (cpuid_flags & CPUID_AVX1)
#define IS_INTEL_AVX2       (cpuid_flags & CPUID_AVX2)
#define IS_INTEL_BMI2       (cpuid_flags & CPUID_BMI2)
#define IS_INTEL_ADX        (cpuid_flags & CPUID_ADX)
#define IS_INTEL_RDRAND     (cpuid_flags & CPUID_RDRAND)
#define IS_INTEL_RDSEED     (cpuid_flags & CPUID_RDSEED)
#define SET_FLAGS

static word32 cpuid_check = 0;
static word32 cpuid_flags = 0;
static word32 cpuid_flag(word32 leaf, word32 sub, word32 num, word32 bit) {
    int got_intel_cpu = 0;
    int got_amd_cpu = 0;
    unsigned int reg[5];

    reg[4] = '\0';
    cpuid(reg, 0, 0);

    /* check for intel cpu */
    if (memcmp((char *)&(reg[EBX]), "Genu", 4) == 0 &&
        memcmp((char *)&(reg[EDX]), "ineI", 4) == 0 &&
        memcmp((char *)&(reg[ECX]), "ntel", 4) == 0) {
        got_intel_cpu = 1;
    }

    /* check for AMD cpu */
    if (memcmp((char *)&(reg[EBX]), "Auth", 4) == 0 &&
        memcmp((char *)&(reg[EDX]), "enti", 4) == 0 &&
        memcmp((char *)&(reg[ECX]), "cAMD", 4) == 0) {
        got_amd_cpu = 1;
    }
    if (got_intel_cpu || got_amd_cpu) {
        cpuid(reg, leaf, sub);
        return (reg[num] >> bit) & 0x1;
    }
    return 0;
}

WC_INLINE static int set_cpuid_flags(void) {
    if (cpuid_check == 0) {
        if (cpuid_flag(7, 0, EBX,  8)) { cpuid_flags |= CPUID_BMI2; }
        if (cpuid_flag(7, 0, EBX, 19)) { cpuid_flags |= CPUID_ADX;  }
        cpuid_check = 1;
        return 0;
    }
    return 1;
}

#define RETURN return
#define IF_HAVE_INTEL_MULX(func, ret)        \
    if (cpuid_check == 0) set_cpuid_flags(); \
    if (IS_INTEL_BMI2 && IS_INTEL_ADX) { func; ret; }

#else
    #define IF_HAVE_INTEL_MULX(func, ret)
#endif
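
/* Usage sketch (illustrative, not a definition from this file): a caller
 * such as fp_mul_comba() in tfm.c can dispatch to a MULX/ADX build of the
 * comba multiply and return early when the CPU supports it:
 *
 *     IF_HAVE_INTEL_MULX(fp_mul_comba_mulx(A, B, C), RETURN);
 *     ... otherwise fall through to the generic comba code ...
 *
 * The first call probes cpuid once via set_cpuid_flags() and caches the
 * result in cpuid_flags; later calls only test the cached bits.
 */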

#if defined(TFM_X86) && !defined(TFM_SSE2)
/* x86-32 code */

#define MONT_START
#define MONT_FINI
#define LOOP_END
#define LOOP_START \
   mu = c[x] * mp

#define INNERMUL                                          \
__asm__(                                                  \
   "movl %5,%%eax \n\t"                                   \
   "mull %4       \n\t"                                   \
   "addl %1,%%eax \n\t"                                   \
   "adcl $0,%%edx \n\t"                                   \
   "addl %%eax,%0 \n\t"                                   \
   "adcl $0,%%edx \n\t"                                   \
   "movl %%edx,%1 \n\t"                                   \
:"=g"(_c[LO]), "=r"(cy)                                   \
:"0"(_c[LO]), "1"(cy), "r"(mu), "r"(*tmpm++)              \
: "%eax", "%edx", "cc")

#define PROPCARRY                           \
__asm__(                                    \
   "addl   %1,%0    \n\t"                   \
   "setb   %%al     \n\t"                   \
   "movzbl %%al,%1  \n\t"                   \
:"=g"(_c[LO]), "=r"(cy)                     \
:"0"(_c[LO]), "1"(cy)                       \
: "%eax", "cc")

/******************************************************************/
#elif defined(TFM_X86_64)
/* x86-64 code */

#define MONT_START
#define MONT_FINI
#define LOOP_END
#define LOOP_START \
   mu = c[x] * mp

#define INNERMUL                                          \
__asm__(                                                  \
   "movq %5,%%rax \n\t"                                   \
   "mulq %4       \n\t"                                   \
   "addq %1,%%rax \n\t"                                   \
   "adcq $0,%%rdx \n\t"                                   \
   "addq %%rax,%0 \n\t"                                   \
   "adcq $0,%%rdx \n\t"                                   \
   "movq %%rdx,%1 \n\t"                                   \
:"=g"(_c[LO]), "=r"(cy)                                   \
:"0"(_c[LO]), "1"(cy), "r"(mu), "r"(*tmpm++)              \
: "%rax", "%rdx", "cc")

#if defined(HAVE_INTEL_MULX)
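/* MULX_INNERMUL8 accumulates y * x[0..7] into the eight 64-bit words at
 * c_mulx[0..7], carrying in and out through cy.  MULX takes its multiplier
 * implicitly from rdx, and ADCX/ADOX drive two independent carry chains
 * (CF and OF), so the partial products can be folded in without
 * serializing on a single flags register. */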
#define MULX_INNERMUL8(x,y,z,cy)                                       \
    __asm__ volatile (                                                 \
        "movq   %[yn], %%rdx\n\t"                                      \
        "xorq   %%rcx, %%rcx\n\t"                                      \
        "movq   0(%[c]), %%r8\n\t"                                     \
        "movq   8(%[c]), %%r9\n\t"                                     \
        "movq   16(%[c]), %%r10\n\t"                                   \
        "movq   24(%[c]), %%r11\n\t"                                   \
        "movq   32(%[c]), %%r12\n\t"                                   \
        "movq   40(%[c]), %%r13\n\t"                                   \
        "movq   48(%[c]), %%r14\n\t"                                   \
        "movq   56(%[c]), %%r15\n\t"                                   \
                                                                       \
        "mulx   0(%[xp]), %%rax, %%rcx\n\t"                            \
        "adcxq  %[cy], %%r8\n\t"                                       \
        "adoxq  %%rax, %%r8\n\t"                                       \
        "mulx   8(%[xp]), %%rax, %[cy]\n\t"                            \
        "adcxq  %%rcx, %%r9\n\t"                                       \
        "adoxq  %%rax, %%r9\n\t"                                       \
        "mulx   16(%[xp]), %%rax, %%rcx\n\t"                           \
        "adcxq  %[cy], %%r10\n\t"                                      \
        "adoxq  %%rax, %%r10\n\t"                                      \
        "mulx   24(%[xp]), %%rax, %[cy]\n\t"                           \
        "adcxq  %%rcx, %%r11\n\t"                                      \
        "adoxq  %%rax, %%r11\n\t"                                      \
        "mulx   32(%[xp]), %%rax, %%rcx\n\t"                           \
        "adcxq  %[cy], %%r12\n\t"                                      \
        "adoxq  %%rax, %%r12\n\t"                                      \
        "mulx   40(%[xp]), %%rax, %[cy]\n\t"                           \
        "adcxq  %%rcx, %%r13\n\t"                                      \
        "adoxq  %%rax, %%r13\n\t"                                      \
        "mulx   48(%[xp]), %%rax, %%rcx\n\t"                           \
        "adcxq  %[cy], %%r14\n\t"                                      \
        "adoxq  %%rax, %%r14\n\t"                                      \
        "adcxq  %%rcx, %%r15\n\t"                                      \
        "mulx   56(%[xp]), %%rax, %[cy]\n\t"                           \
        "movq   $0, %%rdx\n\t"                                         \
        "adoxq  %%rdx, %%rax\n\t"                                      \
        "adcxq  %%rdx, %[cy]\n\t"                                      \
        "adoxq  %%rdx, %[cy]\n\t"                                      \
        "addq   %%rax, %%r15\n\t"                                      \
        "adcq   $0, %[cy]\n\t"                                         \
                                                                       \
        "movq   %%r8,   0(%[c])\n\t"                                   \
        "movq   %%r9,   8(%[c])\n\t"                                   \
        "movq   %%r10, 16(%[c])\n\t"                                   \
        "movq   %%r11, 24(%[c])\n\t"                                   \
        "movq   %%r12, 32(%[c])\n\t"                                   \
        "movq   %%r13, 40(%[c])\n\t"                                   \
        "movq   %%r14, 48(%[c])\n\t"                                   \
        "movq   %%r15, 56(%[c])\n\t"                                   \
        : [cy] "+r" (cy)                                               \
        : [xp] "r" (x), [c] "r" (c_mulx), [yn] "rm" (y)                \
        :"%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", \
         "%rdx", "%rax", "%rcx"                                        \
    )

#define INNERMUL8_MULX \
{\
    MULX_INNERMUL8(tmpm, mu, _c, cy);\
}
#endif

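/* INNERMUL8: eight unrolled INNERMUL steps over _c[0..7].  Each stage keeps
 * the next digit of tmpm staged in r11 and the next word of _c in r10, so a
 * load is in flight while the current mulq/add chain retires. */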
#define INNERMUL8 \
 __asm__(                    \
 "movq 0(%5),%%rax    \n\t"  \
 "movq 0(%2),%%r10    \n\t"  \
 "movq 0x8(%5),%%r11  \n\t"  \
 "mulq %4             \n\t"  \
 "addq %%r10,%%rax    \n\t"  \
 "adcq $0,%%rdx       \n\t"  \
 "movq 0x8(%2),%%r10  \n\t"  \
 "addq %3,%%rax       \n\t"  \
 "adcq $0,%%rdx       \n\t"  \
 "movq %%rax,0(%0)    \n\t"  \
 "movq %%rdx,%1       \n\t"  \
 \
 "movq %%r11,%%rax    \n\t"  \
 "movq 0x10(%5),%%r11 \n\t"  \
 "mulq %4             \n\t"  \
 "addq %%r10,%%rax    \n\t"  \
 "adcq $0,%%rdx       \n\t"  \
 "movq 0x10(%2),%%r10 \n\t"  \
 "addq %3,%%rax       \n\t"  \
 "adcq $0,%%rdx       \n\t"  \
 "movq %%rax,0x8(%0)  \n\t"  \
 "movq %%rdx,%1       \n\t"  \
 \
 "movq %%r11,%%rax    \n\t"  \
 "movq 0x18(%5),%%r11 \n\t"  \
 "mulq %4             \n\t"  \
 "addq %%r10,%%rax    \n\t"  \
 "adcq $0,%%rdx       \n\t"  \
 "movq 0x18(%2),%%r10 \n\t"  \
 "addq %3,%%rax       \n\t"  \
 "adcq $0,%%rdx       \n\t"  \
 "movq %%rax,0x10(%0) \n\t"  \
 "movq %%rdx,%1       \n\t"  \
 \
 "movq %%r11,%%rax    \n\t"  \
 "movq 0x20(%5),%%r11 \n\t"  \
 "mulq %4             \n\t"  \
 "addq %%r10,%%rax    \n\t"  \
 "adcq $0,%%rdx       \n\t"  \
 "movq 0x20(%2),%%r10 \n\t"  \
 "addq %3,%%rax       \n\t"  \
 "adcq $0,%%rdx       \n\t"  \
 "movq %%rax,0x18(%0) \n\t"  \
 "movq %%rdx,%1       \n\t"  \
 \
 "movq %%r11,%%rax    \n\t"  \
 "movq 0x28(%5),%%r11 \n\t"  \
 "mulq %4             \n\t"  \
 "addq %%r10,%%rax    \n\t"  \
 "adcq $0,%%rdx       \n\t"  \
 "movq 0x28(%2),%%r10 \n\t"  \
 "addq %3,%%rax       \n\t"  \
 "adcq $0,%%rdx       \n\t"  \
 "movq %%rax,0x20(%0) \n\t"  \
 "movq %%rdx,%1       \n\t"  \
 \
 "movq %%r11,%%rax    \n\t"  \
 "movq 0x30(%5),%%r11 \n\t"  \
 "mulq %4             \n\t"  \
 "addq %%r10,%%rax    \n\t"  \
 "adcq $0,%%rdx       \n\t"  \
 "movq 0x30(%2),%%r10 \n\t"  \
 "addq %3,%%rax       \n\t"  \
 "adcq $0,%%rdx       \n\t"  \
 "movq %%rax,0x28(%0) \n\t"  \
 "movq %%rdx,%1       \n\t"  \
 \
 "movq %%r11,%%rax    \n\t"  \
 "movq 0x38(%5),%%r11 \n\t"  \
 "mulq %4             \n\t"  \
 "addq %%r10,%%rax    \n\t"  \
 "adcq $0,%%rdx       \n\t"  \
 "movq 0x38(%2),%%r10 \n\t"  \
 "addq %3,%%rax       \n\t"  \
 "adcq $0,%%rdx       \n\t"  \
 "movq %%rax,0x30(%0) \n\t"  \
 "movq %%rdx,%1       \n\t"  \
 \
 "movq %%r11,%%rax    \n\t"  \
 "mulq %4             \n\t"  \
 "addq %%r10,%%rax    \n\t"  \
 "adcq $0,%%rdx       \n\t"  \
 "addq %3,%%rax       \n\t"  \
 "adcq $0,%%rdx       \n\t"  \
 "movq %%rax,0x38(%0) \n\t"  \
 "movq %%rdx,%1       \n\t"  \
 \
:"=r"(_c), "=r"(cy)                    \
: "0"(_c),  "1"(cy), "g"(mu), "r"(tmpm)\
: "%rax", "%rdx", "%r10", "%r11", "cc")

#define PROPCARRY                           \
__asm__(                                    \
   "addq   %1,%0    \n\t"                   \
   "setb   %%al     \n\t"                   \
   "movzbq %%al,%1  \n\t"                   \
:"=g"(_c[LO]), "=r"(cy)                     \
:"0"(_c[LO]), "1"(cy)                       \
: "%rax", "cc")

/******************************************************************/
#elif defined(TFM_SSE2)
/* SSE2 code (assumes 32-bit fp_digits) */
/* MMX register assignments:
 * mm0  *tmpm++, then Mu * (*tmpm++)
 * mm1  c[x], then Mu
 * mm2  mp
 * mm3  cy
 * mm4  _c[LO]   (mm5-mm7 stage further digits in INNERMUL8)
 */

#define MONT_START \
   __asm__("movd %0,%%mm2"::"g"(mp))

#define MONT_FINI \
   __asm__("emms")

#define LOOP_START          \
__asm__(                    \
"movd %0,%%mm1        \n\t" \
"pxor %%mm3,%%mm3     \n\t" \
"pmuludq %%mm2,%%mm1  \n\t" \
:: "g"(c[x]))

/* pmuludq on mmx registers does a 32x32->64 multiply. */
#define INNERMUL               \
__asm__(                       \
   "movd %1,%%mm4        \n\t" \
   "movd %2,%%mm0        \n\t" \
   "paddq %%mm4,%%mm3    \n\t" \
   "pmuludq %%mm1,%%mm0  \n\t" \
   "paddq %%mm0,%%mm3    \n\t" \
   "movd %%mm3,%0        \n\t" \
   "psrlq $32, %%mm3     \n\t" \
:"=g"(_c[LO]) : "0"(_c[LO]), "g"(*tmpm++) );

#define INNERMUL8 \
__asm__(                       \
   "movd 0(%1),%%mm4     \n\t" \
   "movd 0(%2),%%mm0     \n\t" \
   "paddq %%mm4,%%mm3    \n\t" \
   "pmuludq %%mm1,%%mm0  \n\t" \
   "movd 4(%2),%%mm5     \n\t" \
   "paddq %%mm0,%%mm3    \n\t" \
   "movd 4(%1),%%mm6     \n\t" \
   "movd %%mm3,0(%0)     \n\t" \
   "psrlq $32, %%mm3     \n\t" \
\
   "paddq %%mm6,%%mm3    \n\t" \
   "pmuludq %%mm1,%%mm5  \n\t" \
   "movd 8(%2),%%mm6     \n\t" \
   "paddq %%mm5,%%mm3    \n\t" \
   "movd 8(%1),%%mm7     \n\t" \
   "movd %%mm3,4(%0)     \n\t" \
   "psrlq $32, %%mm3     \n\t" \
\
   "paddq %%mm7,%%mm3    \n\t" \
   "pmuludq %%mm1,%%mm6  \n\t" \
   "movd 12(%2),%%mm7    \n\t" \
   "paddq %%mm6,%%mm3    \n\t" \
   "movd 12(%1),%%mm5    \n\t" \
   "movd %%mm3,8(%0)     \n\t" \
   "psrlq $32, %%mm3     \n\t" \
\
   "paddq %%mm5,%%mm3    \n\t" \
   "pmuludq %%mm1,%%mm7  \n\t" \
   "movd 16(%2),%%mm5    \n\t" \
   "paddq %%mm7,%%mm3    \n\t" \
   "movd 16(%1),%%mm6    \n\t" \
   "movd %%mm3,12(%0)    \n\t" \
   "psrlq $32, %%mm3     \n\t" \
\
   "paddq %%mm6,%%mm3    \n\t" \
   "pmuludq %%mm1,%%mm5  \n\t" \
   "movd 20(%2),%%mm6    \n\t" \
   "paddq %%mm5,%%mm3    \n\t" \
   "movd 20(%1),%%mm7    \n\t" \
   "movd %%mm3,16(%0)    \n\t" \
   "psrlq $32, %%mm3     \n\t" \
\
   "paddq %%mm7,%%mm3    \n\t" \
   "pmuludq %%mm1,%%mm6  \n\t" \
   "movd 24(%2),%%mm7    \n\t" \
   "paddq %%mm6,%%mm3    \n\t" \
   "movd 24(%1),%%mm5    \n\t" \
   "movd %%mm3,20(%0)    \n\t" \
   "psrlq $32, %%mm3     \n\t" \
\
   "paddq %%mm5,%%mm3    \n\t" \
   "pmuludq %%mm1,%%mm7  \n\t" \
   "movd 28(%2),%%mm5    \n\t" \
   "paddq %%mm7,%%mm3    \n\t" \
   "movd 28(%1),%%mm6    \n\t" \
   "movd %%mm3,24(%0)    \n\t" \
   "psrlq $32, %%mm3     \n\t" \
\
   "paddq %%mm6,%%mm3    \n\t" \
   "pmuludq %%mm1,%%mm5  \n\t" \
   "paddq %%mm5,%%mm3    \n\t" \
   "movd %%mm3,28(%0)    \n\t" \
   "psrlq $32, %%mm3     \n\t" \
:"=r"(_c) : "0"(_c), "r"(tmpm) );

/* TAO switched tmpm from "g" to "r" after gcc tried to index the indexed stack
   pointer */

#define LOOP_END \
__asm__( "movd %%mm3,%0  \n" :"=r"(cy))

#define PROPCARRY                           \
__asm__(                                    \
   "addl   %1,%0    \n\t"                   \
   "setb   %%al     \n\t"                   \
   "movzbl %%al,%1  \n\t"                   \
:"=g"(_c[LO]), "=r"(cy)                     \
:"0"(_c[LO]), "1"(cy)                       \
: "%eax", "cc")

/******************************************************************/
#elif defined(TFM_ARM)
   /* ARMv4 code */

#define MONT_START
#define MONT_FINI
#define LOOP_END
#define LOOP_START \
   mu = c[x] * mp


#ifdef __thumb__

#define INNERMUL                    \
__asm__(                            \
    " LDR    r0,%1            \n\t" \
    " ADDS   r0,r0,%0         \n\t" \
    " ITE    CS               \n\t" \
    " MOVCS  %0,#1            \n\t" \
    " MOVCC  %0,#0            \n\t" \
    " UMLAL  r0,%0,%3,%4      \n\t" \
    " STR    r0,%1            \n\t" \
:"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(*tmpm++),"m"(_c[0]):"r0","cc");

#define PROPCARRY                  \
__asm__(                           \
    " LDR   r0,%1            \n\t" \
    " ADDS  r0,r0,%0         \n\t" \
    " STR   r0,%1            \n\t" \
    " ITE   CS               \n\t" \
    " MOVCS %0,#1            \n\t" \
    " MOVCC %0,#0            \n\t" \
:"=r"(cy),"=m"(_c[0]):"0"(cy),"m"(_c[0]):"r0","cc");


/* TAO thumb mode uses ite (if then else) to detect carry directly
 * fixed unmatched constraint warning by changing 1 to m  */

#else  /* __thumb__ */

#define INNERMUL                    \
__asm__(                            \
    " LDR    r0,%1            \n\t" \
    " ADDS   r0,r0,%0         \n\t" \
    " MOVCS  %0,#1            \n\t" \
    " MOVCC  %0,#0            \n\t" \
    " UMLAL  r0,%0,%3,%4      \n\t" \
    " STR    r0,%1            \n\t" \
:"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(*tmpm++),"1"(_c[0]):"r0","cc");

#define PROPCARRY                  \
__asm__(                           \
    " LDR   r0,%1            \n\t" \
    " ADDS  r0,r0,%0         \n\t" \
    " STR   r0,%1            \n\t" \
    " MOVCS %0,#1            \n\t" \
    " MOVCC %0,#0            \n\t" \
:"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"r0","cc");

#endif /* __thumb__ */

#elif defined(TFM_PPC32)

/* PPC32 */
#define MONT_START
#define MONT_FINI
#define LOOP_END
#define LOOP_START \
   mu = c[x] * mp

#define INNERMUL                     \
__asm__(                             \
   " mullw    16,%3,%4       \n\t"   \
   " mulhwu   17,%3,%4       \n\t"   \
   " addc     16,16,%2       \n\t"   \
   " addze    17,17          \n\t"   \
   " addc     %1,16,%5       \n\t"   \
   " addze    %0,17          \n\t"   \
:"=r"(cy),"=r"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"16", "17", "cc"); ++tmpm;

/* zero the carry register with xor, then pick up the carry bit with addze */
#define PROPCARRY                    \
__asm__(                             \
   " addc     %1,%3,%2      \n\t"    \
   " xor      %0,%2,%2      \n\t"    \
   " addze    %0,%0         \n\t"    \
:"=r"(cy),"=r"(_c[0]):"0"(cy),"1"(_c[0]):"cc");

#elif defined(TFM_PPC64)

/* PPC64 */
#define MONT_START
#define MONT_FINI
#define LOOP_END
#define LOOP_START \
   mu = c[x] * mp

#define INNERMUL                      \
__asm__(                              \
   " mulld    r16,%3,%4       \n\t"   \
   " mulhdu   r17,%3,%4       \n\t"   \
   " addc     r16,r16,%0      \n\t"   \
   " addze    r17,r17         \n\t"   \
   " ldx      r18,0,%1        \n\t"   \
   " addc     r16,r16,r18     \n\t"   \
   " addze    %0,r17          \n\t"   \
   " stdx     r16,0,%1        \n\t"   \
:"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"r16", "r17", "r18","cc"); ++tmpm;

#define PROPCARRY                     \
__asm__(                              \
   " ldx      r16,0,%1       \n\t"    \
   " addc     r16,r16,%0     \n\t"    \
   " stdx     r16,0,%1       \n\t"    \
   " xor      %0,%0,%0       \n\t"    \
   " addze    %0,%0          \n\t"    \
:"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"r16","cc");

/******************************************************************/

#elif defined(TFM_AVR32)

/* AVR32 */
#define MONT_START
#define MONT_FINI
#define LOOP_END
#define LOOP_START \
   mu = c[x] * mp

#define INNERMUL                    \
__asm__(                            \
    " ld.w   r2,%1            \n\t" \
    " add    r2,%0            \n\t" \
    " eor    r3,r3            \n\t" \
    " acr    r3               \n\t" \
    " macu.d r2,%3,%4         \n\t" \
    " st.w   %1,r2            \n\t" \
    " mov    %0,r3            \n\t" \
:"=r"(cy),"=r"(_c):"0"(cy),"r"(mu),"r"(*tmpm++),"1"(_c):"r2","r3");

#define PROPCARRY                    \
__asm__(                             \
   " ld.w     r2,%1         \n\t"    \
   " add      r2,%0         \n\t"    \
   " st.w     %1,r2         \n\t"    \
   " eor      %0,%0         \n\t"    \
   " acr      %0            \n\t"    \
:"=r"(cy),"=r"(_c[0]):"0"(cy),"1"(_c[0]):"r2","cc");

/******************************************************************/
#elif defined(TFM_MIPS)

/* MIPS */
#define MONT_START
#define MONT_FINI
#define LOOP_END
#define LOOP_START \
   mu = c[x] * mp

#define INNERMUL                     \
__asm__(                             \
   " multu    %3,%4          \n\t"   \
   " mflo     $12            \n\t"   \
   " mfhi     $13            \n\t"   \
   " addu     $12,$12,%0     \n\t"   \
   " sltu     $10,$12,%0     \n\t"   \
   " addu     $13,$13,$10    \n\t"   \
   " lw       $10,%1         \n\t"   \
   " addu     $12,$12,$10    \n\t"   \
   " sltu     $10,$12,$10    \n\t"   \
   " addu     %0,$13,$10     \n\t"   \
   " sw       $12,%1         \n\t"   \
:"+r"(cy),"+m"(_c[0]):""(cy),"r"(mu),"r"(tmpm[0]),""(_c[0]):"$10","$12","$13"); ++tmpm;

#define PROPCARRY                    \
__asm__(                             \
   " lw       $10,%1        \n\t"    \
   " addu     $10,$10,%0    \n\t"    \
   " sw       $10,%1        \n\t"    \
   " sltu     %0,$10,%0     \n\t"    \
:"+r"(cy),"+m"(_c[0]):""(cy),""(_c[0]):"$10");

/******************************************************************/
#else

/* ISO C code */
#define MONT_START
#define MONT_FINI
#define LOOP_END
#define LOOP_START \
   mu = c[x] * mp

#define INNERMUL                                      \
   do { fp_word t;                                    \
   t  = ((fp_word)_c[0] + (fp_word)cy) +              \
                (((fp_word)mu) * ((fp_word)*tmpm++)); \
   _c[0] = (fp_digit)t;                               \
   cy = (fp_digit)(t >> DIGIT_BIT);                   \
   } while (0)

#define PROPCARRY \
   do { fp_digit t = _c[0] += cy; cy = (t < cy); } while (0)

#endif
/******************************************************************/


#define LO  0
/* end fp_montgomery_reduce.c asm */
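
/* Shape of the reduction loop these macros plug into (a sketch of
 * fp_montgomery_reduce() in tfm.c; names match the macros above, loop
 * bounds are illustrative):
 *
 *   MONT_START;
 *   for (x = 0; x < pa; x++) {
 *       fp_digit cy = 0;
 *       LOOP_START;                    // mu = c[x] * mp
 *       _c   = c + x;
 *       tmpm = m->dp;
 *       for (y = 0; y < pa; y++) {     // add mu * m to c, digit by digit
 *           INNERMUL;
 *           ++_c;
 *       }
 *       LOOP_END;                      // SSE2 pulls cy back out of mm3
 *       while (cy) {                   // ripple the remaining carry up
 *           PROPCARRY;
 *           ++_c;
 *       }
 *   }
 *   MONT_FINI;
 */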


/* start fp_sqr_comba.c asm */
#if defined(TFM_X86)

/* x86-32 optimized */

#define COMBA_START

#define CLEAR_CARRY \
   c0 = c1 = c2 = 0;

#define COMBA_STORE(x) \
   x = c0;

#define COMBA_STORE2(x) \
   x = c1;

#define CARRY_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

#define COMBA_FINI

#define SQRADD(i, j)                                      \
__asm__(                                                  \
     "movl  %3,%%eax     \n\t"                            \
     "mull  %%eax        \n\t"                            \
     "addl  %%eax,%0     \n\t"                            \
     "adcl  %%edx,%1     \n\t"                            \
     "adcl  $0,%2        \n\t"                            \
     :"+rm"(c0), "+rm"(c1), "+rm"(c2)                     \
     : "m"(i)                                             \
     :"%eax","%edx","cc");

#define SQRADD2(i, j)                                     \
__asm__(                                                  \
     "movl  %3,%%eax     \n\t"                            \
     "mull  %4           \n\t"                            \
     "addl  %%eax,%0     \n\t"                            \
     "adcl  %%edx,%1     \n\t"                            \
     "adcl  $0,%2        \n\t"                            \
     "addl  %%eax,%0     \n\t"                            \
     "adcl  %%edx,%1     \n\t"                            \
     "adcl  $0,%2        \n\t"                            \
     :"+rm"(c0), "+rm"(c1), "+rm"(c2)                     \
     : "m"(i), "m"(j)                                     \
     :"%eax","%edx", "cc");

#define SQRADDSC(i, j)                                    \
__asm__(                                                  \
     "movl  %3,%%eax     \n\t"                            \
     "mull  %4           \n\t"                            \
     "movl  %%eax,%0     \n\t"                            \
     "movl  %%edx,%1     \n\t"                            \
     "xorl  %2,%2        \n\t"                            \
     :"=r"(sc0), "=r"(sc1), "=r"(sc2)                     \
     : "g"(i), "g"(j)                                     \
     :"%eax","%edx","cc");

#define SQRADDAC(i, j)                                    \
__asm__(                                                  \
     "movl  %6,%%eax     \n\t"                            \
     "mull  %7           \n\t"                            \
     "addl  %%eax,%0     \n\t"                            \
     "adcl  %%edx,%1     \n\t"                            \
     "adcl  $0,%2        \n\t"                            \
     :"=r"(sc0), "=r"(sc1), "=r"(sc2)                     \
     : "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j)       \
     :"%eax","%edx","cc");

#define SQRADDDB                                          \
__asm__(                                                  \
     "addl %6,%0         \n\t"                            \
     "adcl %7,%1         \n\t"                            \
     "adcl %8,%2         \n\t"                            \
     "addl %6,%0         \n\t"                            \
     "adcl %7,%1         \n\t"                            \
     "adcl %8,%2         \n\t"                            \
     :"=r"(c0), "=r"(c1), "=r"(c2)                        \
     : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1),     \
       "r"(sc2)                                           \
     : "cc");

#elif defined(TFM_X86_64)
/* x86-64 optimized */

#define COMBA_START

#define CLEAR_CARRY \
   c0 = c1 = c2 = 0;

#define COMBA_STORE(x) \
   x = c0;

#define COMBA_STORE2(x) \
   x = c1;

#define CARRY_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

#define COMBA_FINI

#define SQRADD(i, j)                                      \
__asm__(                                                  \
     "movq  %6,%%rax     \n\t"                            \
     "mulq  %%rax        \n\t"                            \
     "addq  %%rax,%0     \n\t"                            \
     "adcq  %%rdx,%1     \n\t"                            \
     "adcq  $0,%2        \n\t"                            \
     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "x"(i) :"%rax","%rdx","cc");

#define SQRADD2(i, j)                                     \
__asm__(                                                  \
     "movq  %6,%%rax     \n\t"                            \
     "mulq  %7           \n\t"                            \
     "addq  %%rax,%0     \n\t"                            \
     "adcq  %%rdx,%1     \n\t"                            \
     "adcq  $0,%2        \n\t"                            \
     "addq  %%rax,%0     \n\t"                            \
     "adcq  %%rdx,%1     \n\t"                            \
     "adcq  $0,%2        \n\t"                            \
     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j)  :"%rax","%rdx","cc");

#define SQRADDSC(i, j)                                    \
__asm__(                                                  \
     "movq  %3,%%rax     \n\t"                            \
     "mulq  %4           \n\t"                            \
     "movq  %%rax,%0     \n\t"                            \
     "movq  %%rdx,%1     \n\t"                            \
     "xorq  %2,%2        \n\t"                            \
     :"=r"(sc0), "=r"(sc1), "=r"(sc2): "g"(i), "g"(j) :"%rax","%rdx","cc");

#define SQRADDAC(i, j)                                    \
__asm__(                                                  \
     "movq  %6,%%rax     \n\t"                            \
     "mulq  %7           \n\t"                            \
     "addq  %%rax,%0     \n\t"                            \
     "adcq  %%rdx,%1     \n\t"                            \
     "adcq  $0,%2        \n\t"                            \
     :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","cc");

#define SQRADDDB                                          \
__asm__(                                                  \
     "addq %6,%0         \n\t"                            \
     "adcq %7,%1         \n\t"                            \
     "adcq %8,%2         \n\t"                            \
     "addq %6,%0         \n\t"                            \
     "adcq %7,%1         \n\t"                            \
     "adcq %8,%2         \n\t"                            \
     :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc");

#elif defined(TFM_SSE2)

/* SSE2 Optimized */
#define COMBA_START

#define CLEAR_CARRY \
   c0 = c1 = c2 = 0;

#define COMBA_STORE(x) \
   x = c0;

#define COMBA_STORE2(x) \
   x = c1;

#define CARRY_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

#define COMBA_FINI \
   __asm__("emms");

#define SQRADD(i, j)                                      \
__asm__(                                                  \
     "movd  %6,%%mm0     \n\t"                            \
     "pmuludq %%mm0,%%mm0\n\t"                            \
     "movd  %%mm0,%%eax  \n\t"                            \
     "psrlq $32,%%mm0    \n\t"                            \
     "addl  %%eax,%0     \n\t"                            \
     "movd  %%mm0,%%eax  \n\t"                            \
     "adcl  %%eax,%1     \n\t"                            \
     "adcl  $0,%2        \n\t"                            \
     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","cc");

#define SQRADD2(i, j)                                     \
__asm__(                                                  \
     "movd  %6,%%mm0     \n\t"                            \
     "movd  %7,%%mm1     \n\t"                            \
     "pmuludq %%mm1,%%mm0\n\t"                            \
     "movd  %%mm0,%%eax  \n\t"                            \
     "psrlq $32,%%mm0    \n\t"                            \
     "movd  %%mm0,%%edx  \n\t"                            \
     "addl  %%eax,%0     \n\t"                            \
     "adcl  %%edx,%1     \n\t"                            \
     "adcl  $0,%2        \n\t"                            \
     "addl  %%eax,%0     \n\t"                            \
     "adcl  %%edx,%1     \n\t"                            \
     "adcl  $0,%2        \n\t"                            \
     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j)  :"%eax","%edx","cc");

#define SQRADDSC(i, j)                                    \
__asm__(                                                  \
     "movd  %3,%%mm0     \n\t"                            \
     "movd  %4,%%mm1     \n\t"                            \
     "pmuludq %%mm1,%%mm0\n\t"                            \
     "movd  %%mm0,%0     \n\t"                            \
     "psrlq $32,%%mm0    \n\t"                            \
     "movd  %%mm0,%1     \n\t"                            \
     "xorl  %2,%2        \n\t"                            \
     :"=r"(sc0), "=r"(sc1), "=r"(sc2): "m"(i), "m"(j));

/* TAO removed sc0,1,2 as input to remove warning so %6,%7 become %3,%4 */

#define SQRADDAC(i, j)                                    \
__asm__(                                                  \
     "movd  %6,%%mm0     \n\t"                            \
     "movd  %7,%%mm1     \n\t"                            \
     "pmuludq %%mm1,%%mm0\n\t"                            \
     "movd  %%mm0,%%eax  \n\t"                            \
     "psrlq $32,%%mm0    \n\t"                            \
     "movd  %%mm0,%%edx  \n\t"                            \
     "addl  %%eax,%0     \n\t"                            \
     "adcl  %%edx,%1     \n\t"                            \
     "adcl  $0,%2        \n\t"                            \
     :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "m"(i), "m"(j)  :"%eax","%edx","cc");

#define SQRADDDB                                          \
__asm__(                                                  \
     "addl %6,%0         \n\t"                            \
     "adcl %7,%1         \n\t"                            \
     "adcl %8,%2         \n\t"                            \
     "addl %6,%0         \n\t"                            \
     "adcl %7,%1         \n\t"                            \
     "adcl %8,%2         \n\t"                            \
     :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc");

#elif defined(TFM_ARM)

/* ARM code */

#define COMBA_START

#define CLEAR_CARRY \
   c0 = c1 = c2 = 0;

#define COMBA_STORE(x) \
   x = c0;

#define COMBA_STORE2(x) \
   x = c1;

#define CARRY_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

#define COMBA_FINI

/* multiplies point i and j, updates carry "c1" and digit c2 */
#define SQRADD(i, j)                                             \
__asm__(                                                         \
"  UMULL  r0,r1,%6,%6              \n\t"                         \
"  ADDS   %0,%0,r0                 \n\t"                         \
"  ADCS   %1,%1,r1                 \n\t"                         \
"  ADC    %2,%2,#0                 \n\t"                         \
:"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i) : "r0", "r1", "cc");

/* for squaring some of the terms are doubled... */
#define SQRADD2(i, j)                                            \
__asm__(                                                         \
"  UMULL  r0,r1,%6,%7              \n\t"                         \
"  ADDS   %0,%0,r0                 \n\t"                         \
"  ADCS   %1,%1,r1                 \n\t"                         \
"  ADC    %2,%2,#0                 \n\t"                         \
"  ADDS   %0,%0,r0                 \n\t"                         \
"  ADCS   %1,%1,r1                 \n\t"                         \
"  ADC    %2,%2,#0                 \n\t"                         \
:"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "cc");

#define SQRADDSC(i, j)                                           \
__asm__(                                                         \
"  UMULL  %0,%1,%3,%4              \n\t"                         \
"  SUB    %2,%2,%2                 \n\t"                         \
:"=r"(sc0), "=r"(sc1), "=r"(sc2) : "r"(i), "r"(j) : "cc");

/* TAO removed sc0,1,2 as input to remove warning so %6,%7 become %3,%4 */

#define SQRADDAC(i, j)                                           \
__asm__(                                                         \
"  UMULL  r0,r1,%6,%7              \n\t"                         \
"  ADDS   %0,%0,r0                 \n\t"                         \
"  ADCS   %1,%1,r1                 \n\t"                         \
"  ADC    %2,%2,#0                 \n\t"                         \
:"=r"(sc0), "=r"(sc1), "=r"(sc2) : "0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j) : "r0", "r1", "cc");

#define SQRADDDB                                                 \
__asm__(                                                         \
"  ADDS  %0,%0,%3                     \n\t"                      \
"  ADCS  %1,%1,%4                     \n\t"                      \
"  ADC   %2,%2,%5                     \n\t"                      \
"  ADDS  %0,%0,%3                     \n\t"                      \
"  ADCS  %1,%1,%4                     \n\t"                      \
"  ADC   %2,%2,%5                     \n\t"                      \
:"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");

#elif defined(TFM_PPC32)

/* PPC32 */

#define COMBA_START

#define CLEAR_CARRY \
   c0 = c1 = c2 = 0;

#define COMBA_STORE(x) \
   x = c0;

#define COMBA_STORE2(x) \
   x = c1;

#define CARRY_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

#define COMBA_FINI

/* multiplies point i and j, updates carry "c1" and digit c2 */
#define SQRADD(i, j)             \
__asm__(                         \
   " mullw  16,%6,%6       \n\t" \
   " addc   %0,%0,16       \n\t" \
   " mulhwu 16,%6,%6       \n\t" \
   " adde   %1,%1,16       \n\t" \
   " addze  %2,%2          \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"16","cc");

/* for squaring some of the terms are doubled... */
#define SQRADD2(i, j)            \
__asm__(                         \
   " mullw  16,%6,%7       \n\t" \
   " mulhwu 17,%6,%7       \n\t" \
   " addc   %0,%0,16       \n\t" \
   " adde   %1,%1,17       \n\t" \
   " addze  %2,%2          \n\t" \
   " addc   %0,%0,16       \n\t" \
   " adde   %1,%1,17       \n\t" \
   " addze  %2,%2          \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16", "17","cc");

#define SQRADDSC(i, j)            \
__asm__(                          \
   " mullw  %0,%6,%7        \n\t" \
   " mulhwu %1,%6,%7        \n\t" \
   " xor    %2,%2,%2        \n\t" \
:"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "cc");

#define SQRADDAC(i, j)           \
__asm__(                         \
   " mullw  16,%6,%7       \n\t" \
   " addc   %0,%0,16       \n\t" \
   " mulhwu 16,%6,%7       \n\t" \
   " adde   %1,%1,16       \n\t" \
   " addze  %2,%2          \n\t" \
:"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"16", "cc");

#define SQRADDDB                  \
__asm__(                          \
   " addc   %0,%0,%3        \n\t" \
   " adde   %1,%1,%4        \n\t" \
   " adde   %2,%2,%5        \n\t" \
   " addc   %0,%0,%3        \n\t" \
   " adde   %1,%1,%4        \n\t" \
   " adde   %2,%2,%5        \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");

#elif defined(TFM_PPC64)
/* PPC64 */

#define COMBA_START

#define CLEAR_CARRY \
   c0 = c1 = c2 = 0;

#define COMBA_STORE(x) \
   x = c0;

#define COMBA_STORE2(x) \
   x = c1;

#define CARRY_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

#define COMBA_FINI

/* multiplies point i and j, updates carry "c1" and digit c2 */
#define SQRADD(i, j)              \
__asm__(                          \
   " mulld  r16,%6,%6       \n\t" \
   " addc   %0,%0,r16       \n\t" \
   " mulhdu r16,%6,%6       \n\t" \
   " adde   %1,%1,r16       \n\t" \
   " addze  %2,%2           \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"r16","cc");

/* for squaring some of the terms are doubled... */
#define SQRADD2(i, j)             \
__asm__(                          \
   " mulld  r16,%6,%7       \n\t" \
   " mulhdu r17,%6,%7       \n\t" \
   " addc   %0,%0,r16       \n\t" \
   " adde   %1,%1,r17       \n\t" \
   " addze  %2,%2           \n\t" \
   " addc   %0,%0,r16       \n\t" \
   " adde   %1,%1,r17       \n\t" \
   " addze  %2,%2           \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"r16", "r17","cc");

#define SQRADDSC(i, j)            \
__asm__(                          \
   " mulld  %0,%6,%7        \n\t" \
   " mulhdu %1,%6,%7        \n\t" \
   " xor    %2,%2,%2        \n\t" \
:"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "cc");

#define SQRADDAC(i, j)            \
__asm__(                          \
   " mulld  r16,%6,%7       \n\t" \
   " addc   %0,%0,r16       \n\t" \
   " mulhdu r16,%6,%7       \n\t" \
   " adde   %1,%1,r16       \n\t" \
   " addze  %2,%2           \n\t" \
:"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"r16", "cc");

#define SQRADDDB                  \
__asm__(                          \
   " addc   %0,%0,%3        \n\t" \
   " adde   %1,%1,%4        \n\t" \
   " adde   %2,%2,%5        \n\t" \
   " addc   %0,%0,%3        \n\t" \
   " adde   %1,%1,%4        \n\t" \
   " adde   %2,%2,%5        \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");


#elif defined(TFM_AVR32)

/* AVR32 */

#define COMBA_START

#define CLEAR_CARRY \
   c0 = c1 = c2 = 0;

#define COMBA_STORE(x) \
   x = c0;

#define COMBA_STORE2(x) \
   x = c1;

#define CARRY_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

#define COMBA_FINI

/* multiplies point i and j, updates carry "c1" and digit c2 */
#define SQRADD(i, j)             \
__asm__(                         \
   " mulu.d r2,%6,%6       \n\t" \
   " add    %0,%0,r2       \n\t" \
   " adc    %1,%1,r3       \n\t" \
   " acr    %2             \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"r2","r3");

/* for squaring some of the terms are doubled... */
#define SQRADD2(i, j)            \
__asm__(                         \
   " mulu.d r2,%6,%7       \n\t" \
   " add    %0,%0,r2       \n\t" \
   " adc    %1,%1,r3       \n\t" \
   " acr    %2             \n\t" \
   " add    %0,%0,r2       \n\t" \
   " adc    %1,%1,r3       \n\t" \
   " acr    %2             \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"r2", "r3");

#define SQRADDSC(i, j)            \
__asm__(                          \
   " mulu.d r2,%6,%7        \n\t" \
   " mov    %0,r2           \n\t" \
   " mov    %1,r3           \n\t" \
   " eor    %2,%2           \n\t" \
:"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "r2", "r3");

#define SQRADDAC(i, j)           \
__asm__(                         \
   " mulu.d r2,%6,%7       \n\t" \
   " add    %0,%0,r2       \n\t" \
   " adc    %1,%1,r3       \n\t" \
   " acr    %2             \n\t" \
:"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"r2", "r3");

#define SQRADDDB                  \
__asm__(                          \
   " add    %0,%0,%3        \n\t" \
   " adc    %1,%1,%4        \n\t" \
   " adc    %2,%2,%5        \n\t" \
   " add    %0,%0,%3        \n\t" \
   " adc    %1,%1,%4        \n\t" \
   " adc    %2,%2,%5        \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");

#elif defined(TFM_MIPS)

/* MIPS */
#define COMBA_START

#define CLEAR_CARRY \
   c0 = c1 = c2 = 0;

#define COMBA_STORE(x) \
   x = c0;

#define COMBA_STORE2(x) \
   x = c1;

#define CARRY_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

#define COMBA_FINI

/* multiplies point i and j, updates carry "c1" and digit c2 */
#define SQRADD(i, j)              \
__asm__(                          \
   " multu  %6,%6          \n\t"  \
   " mflo   $12            \n\t"  \
   " mfhi   $13            \n\t"  \
   " addu    %0,%0,$12     \n\t"  \
   " sltu   $12,%0,$12     \n\t"  \
   " addu    %1,%1,$13     \n\t"  \
   " sltu   $13,%1,$13     \n\t"  \
   " addu    %1,%1,$12     \n\t"  \
   " sltu   $12,%1,$12     \n\t"  \
   " addu    %2,%2,$13     \n\t"  \
   " addu    %2,%2,$12     \n\t"  \
:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"$12","$13");

/* for squaring some of the terms are doubled... */
#define SQRADD2(i, j)            \
__asm__(                         \
   " multu  %6,%7          \n\t" \
   " mflo   $12            \n\t" \
   " mfhi   $13            \n\t" \
                                 \
   " addu    %0,%0,$12     \n\t" \
   " sltu   $14,%0,$12     \n\t" \
   " addu    %1,%1,$13     \n\t" \
   " sltu   $15,%1,$13     \n\t" \
   " addu    %1,%1,$14     \n\t" \
   " sltu   $14,%1,$14     \n\t" \
   " addu    %2,%2,$15     \n\t" \
   " addu    %2,%2,$14     \n\t" \
                                 \
   " addu    %0,%0,$12     \n\t" \
   " sltu   $14,%0,$12     \n\t" \
   " addu    %1,%1,$13     \n\t" \
   " sltu   $15,%1,$13     \n\t" \
   " addu    %1,%1,$14     \n\t" \
   " sltu   $14,%1,$14     \n\t" \
   " addu    %2,%2,$15     \n\t" \
   " addu    %2,%2,$14     \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"$12", "$13", "$14", "$15");

#define SQRADDSC(i, j)            \
__asm__(                          \
   " multu  %6,%7          \n\t"  \
   " mflo   %0             \n\t"  \
   " mfhi   %1             \n\t"  \
   " xor    %2,%2,%2       \n\t"  \
:"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "cc");

#define SQRADDAC(i, j)           \
__asm__(                         \
   " multu  %6,%7          \n\t" \
   " mflo   $12            \n\t" \
   " mfhi   $13            \n\t" \
   " addu    %0,%0,$12     \n\t" \
   " sltu   $12,%0,$12     \n\t" \
   " addu    %1,%1,$13     \n\t" \
   " sltu   $13,%1,$13     \n\t" \
   " addu    %1,%1,$12     \n\t" \
   " sltu   $12,%1,$12     \n\t" \
   " addu    %2,%2,$13     \n\t" \
   " addu    %2,%2,$12     \n\t" \
:"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"$12", "$13", "$14");

#define SQRADDDB                  \
__asm__(                          \
   " addu    %0,%0,%3       \n\t" \
   " sltu   $10,%0,%3       \n\t" \
   " addu    %1,%1,$10      \n\t" \
   " sltu   $10,%1,$10      \n\t" \
   " addu    %1,%1,%4       \n\t" \
   " sltu   $11,%1,%4       \n\t" \
   " addu    %2,%2,$10      \n\t" \
   " addu    %2,%2,$11      \n\t" \
   " addu    %2,%2,%5       \n\t" \
                                  \
   " addu    %0,%0,%3       \n\t" \
   " sltu   $10,%0,%3       \n\t" \
   " addu    %1,%1,$10      \n\t" \
   " sltu   $10,%1,$10      \n\t" \
   " addu    %1,%1,%4       \n\t" \
   " sltu   $11,%1,%4       \n\t" \
   " addu    %2,%2,$10      \n\t" \
   " addu    %2,%2,$11      \n\t" \
   " addu    %2,%2,%5       \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "$10", "$11");

#else

#define TFM_ISO

/* ISO C portable code */

#define COMBA_START

#define CLEAR_CARRY \
   c0 = c1 = c2 = 0;

#define COMBA_STORE(x) \
   x = c0;

#define COMBA_STORE2(x) \
   x = c1;

#define CARRY_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

#define COMBA_FINI

/* multiplies point i and j, updates carry "c1" and digit c2 */
#define SQRADD(i, j)                                 \
   do { fp_word t;                                   \
   t = c0 + ((fp_word)i) * ((fp_word)j);  c0 = (fp_digit)t;    \
   t = c1 + (t >> DIGIT_BIT);             c1 = (fp_digit)t;    \
                                          c2 += (fp_digit)(t >> DIGIT_BIT); \
   } while (0);


/* for squaring some of the terms are doubled... */
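/* note: tt is not declared by this macro; the ISO C (TFM_ISO) squaring
 * routines declare an fp_word tt for these expansions */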
#define SQRADD2(i, j)                                                 \
   do { fp_word t;                                                    \
   t  = ((fp_word)i) * ((fp_word)j);                                  \
   tt = (fp_word)c0 + t;                 c0 = (fp_digit)tt;           \
   tt = (fp_word)c1 + (tt >> DIGIT_BIT); c1 = (fp_digit)tt;           \
                                         c2 += (fp_digit)(tt >> DIGIT_BIT);   \
   tt = (fp_word)c0 + t;                 c0 = (fp_digit)tt;           \
   tt = (fp_word)c1 + (tt >> DIGIT_BIT); c1 = (fp_digit)tt;           \
                                         c2 += (fp_digit)(tt >> DIGIT_BIT);   \
   } while (0);

#define SQRADDSC(i, j)                                                         \
   do { fp_word t;                                                             \
      t =  ((fp_word)i) * ((fp_word)j);                                        \
      sc0 = (fp_digit)t; sc1 = (t >> DIGIT_BIT); sc2 = 0;                      \
   } while (0);

#define SQRADDAC(i, j)                                                         \
   do { fp_word t;                                                             \
   t = sc0 + ((fp_word)i) * ((fp_word)j);  sc0 =  (fp_digit)t;                 \
   t = sc1 + (t >> DIGIT_BIT);             sc1 =  (fp_digit)t;                 \
                                           sc2 += (fp_digit)(t >> DIGIT_BIT);  \
   } while (0);

#define SQRADDDB                                                               \
   do { fp_word t;                                                             \
   t = ((fp_word)sc0) + ((fp_word)sc0) + c0; c0 = (fp_digit)t;                 \
   t = ((fp_word)sc1) + ((fp_word)sc1) + c1 + (t >> DIGIT_BIT);                \
                                             c1 = (fp_digit)t;                 \
   c2 = c2 + (fp_digit)(((fp_word)sc2) + ((fp_word)sc2) + (t >> DIGIT_BIT));   \
   } while (0);

#endif
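
/* How a column of the comba square is assembled from these macros (sketch;
 * the index bookkeeping is illustrative): column k of a*a sums a[i]*a[j]
 * over i + j == k, with the off-diagonal products doubled.
 *
 *   COMBA_START; CLEAR_CARRY;
 *   for each output column k:
 *       CARRY_FORWARD;
 *       for i < j with i + j == k:  SQRADD2(a[i], a[j]);     // doubled
 *       if k is even:               SQRADD(a[k/2], a[k/2]);  // diagonal
 *       COMBA_STORE(b[k]);
 *   COMBA_FINI;
 *
 * The unrolled fp_sqr_comba_N.i files instead gather a column's partial sum
 * with SQRADDSC/SQRADDAC and double it once at the end with SQRADDDB.
 */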

#ifdef TFM_SMALL_SET
    #include "fp_sqr_comba_small_set.i"
#endif

#if defined(TFM_SQR3) && FP_SIZE >= 6
    #include "fp_sqr_comba_3.i"
#endif
#if defined(TFM_SQR4) && FP_SIZE >= 8
    #include "fp_sqr_comba_4.i"
#endif
#if defined(TFM_SQR6) && FP_SIZE >= 12
    #include "fp_sqr_comba_6.i"
#endif
#if defined(TFM_SQR7) && FP_SIZE >= 14
    #include "fp_sqr_comba_7.i"
#endif
#if defined(TFM_SQR8) && FP_SIZE >= 16
    #include "fp_sqr_comba_8.i"
#endif
#if defined(TFM_SQR9) && FP_SIZE >= 18
    #include "fp_sqr_comba_9.i"
#endif
#if defined(TFM_SQR12) && FP_SIZE >= 24
    #include "fp_sqr_comba_12.i"
#endif
#if defined(TFM_SQR17) && FP_SIZE >= 34
    #include "fp_sqr_comba_17.i"
#endif
#if defined(TFM_SQR20) && FP_SIZE >= 40
    #include "fp_sqr_comba_20.i"
#endif
#if defined(TFM_SQR24) && FP_SIZE >= 48
    #include "fp_sqr_comba_24.i"
#endif
#if defined(TFM_SQR28) && FP_SIZE >= 56
    #include "fp_sqr_comba_28.i"
#endif
#if defined(TFM_SQR32) && FP_SIZE >= 64
    #include "fp_sqr_comba_32.i"
#endif
#if defined(TFM_SQR48) && FP_SIZE >= 96
    #include "fp_sqr_comba_48.i"
#endif
#if defined(TFM_SQR64) && FP_SIZE >= 128
    #include "fp_sqr_comba_64.i"
#endif
/* end fp_sqr_comba.c asm */
1403 
1404 /* start fp_mul_comba.c asm */
1405 /* these are the combas.  Worship them. */
1406 #if defined(TFM_X86)
1407 /* Generic x86 optimized code */
1408 
1409 /* anything you need at the start */
1410 #define COMBA_START
1411 
1412 /* clear the chaining variables */
1413 #define COMBA_CLEAR \
1414    c0 = c1 = c2 = 0;
1415 
1416 /* forward the carry to the next digit */
1417 #define COMBA_FORWARD \
1418    do { c0 = c1; c1 = c2; c2 = 0; } while (0);
1419 
1420 /* store the first sum */
1421 #define COMBA_STORE(x) \
1422    x = c0;
1423 
1424 /* store the second sum [carry] */
1425 #define COMBA_STORE2(x) \
1426    x = c1;
1427 
1428 /* anything you need at the end */
1429 #define COMBA_FINI
1430 
1431 /* this should multiply i and j  */
1432 #define MULADD(i, j)                                      \
1433 __asm__(                                                  \
1434      "movl  %6,%%eax     \n\t"                            \
1435      "mull  %7           \n\t"                            \
1436      "addl  %%eax,%0     \n\t"                            \
1437      "adcl  %%edx,%1     \n\t"                            \
1438      "adcl  $0,%2        \n\t"                            \
1439      :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j)  :"%eax","%edx","cc");
1440 
1441 #elif defined(TFM_X86_64)
1442 /* x86-64 optimized */
1443 
1444 /* anything you need at the start */
1445 #define COMBA_START
1446 
1447 /* clear the chaining variables */
1448 #define COMBA_CLEAR \
1449    c0 = c1 = c2 = 0;
1450 
1451 /* forward the carry to the next digit */
1452 #define COMBA_FORWARD \
1453    do { c0 = c1; c1 = c2; c2 = 0; } while (0);
1454 
1455 /* store the first sum */
1456 #define COMBA_STORE(x) \
1457    x = c0;
1458 
1459 /* store the second sum [carry] */
1460 #define COMBA_STORE2(x) \
1461    x = c1;
1462 
1463 /* anything you need at the end */
1464 #define COMBA_FINI
1465 
/* multiply i and j, accumulating the double-width product into c0:c1:c2 */
1467 #define MULADD(i, j)                                      \
1468 __asm__  (                                                \
1469      "movq  %6,%%rax     \n\t"                            \
1470      "mulq  %7           \n\t"                            \
1471      "addq  %%rax,%0     \n\t"                            \
1472      "adcq  %%rdx,%1     \n\t"                            \
1473      "adcq  $0,%2        \n\t"                            \
1474      :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j)  :"%rax","%rdx","cc");
1475 
1476 
1477 #if defined(HAVE_INTEL_MULX)
1478 #define MULADD_BODY(a,b,carry,c)                        \
1479     __asm__ volatile(                                   \
1480          "movq  %[a0],%%rdx\n\t"                        \
1481          "xorq  %%rcx, %%rcx\n\t"                       \
1482          "movq  0(%[cp]),%%r8\n\t"                      \
1483          "movq  8(%[cp]),%%r9\n\t"                      \
1484          "movq  16(%[cp]),%%r10\n\t"                    \
1485          "movq  24(%[cp]),%%r11\n\t"                    \
1486                                                         \
1487          "mulx  (%[bp]),%%rax, %%rbx\n\t"               \
1488          "adcxq  %[ca], %%r8\n\t"                       \
1489          "adoxq  %%rax, %%r8\n\t"                       \
1490          "mulx  8(%[bp]),%%rax, %%rcx\n\t"              \
1491          "adcxq  %%rbx, %%r9\n\t"                       \
1492          "adoxq  %%rax, %%r9\n\t"                       \
1493          "mulx  16(%[bp]),%%rax, %%rbx\n\t"             \
1494          "adcxq  %%rcx, %%r10\n\t"                      \
1495          "adoxq  %%rax, %%r10\n\t"                      \
1496          "mulx  24(%[bp]),%%rax, %%rcx\n\t"             \
1497          "adcxq  %%rbx, %%r11\n\t"                      \
1498          "mov $0, %[ca]\n\t"                            \
1499          "adoxq  %%rax, %%r11\n\t"                      \
1500          "adcxq  %%rcx, %[ca]\n\t"                      \
1501          "mov $0, %%rdx\n\t"                            \
1502          "adoxq  %%rdx, %[ca]\n\t"                      \
1503                                                         \
1504          "movq  %%r8, 0(%[cp])\n\t"                     \
1505          "movq  %%r9, 8(%[cp])\n\t"                     \
1506          "movq  %%r10, 16(%[cp])\n\t"                   \
1507          "movq  %%r11, 24(%[cp])\n\t"                   \
1508       : [ca] "+r" (carry)                               \
1509       : [a0] "r" (a->dp[ix]), [bp] "r" (&(b->dp[iy])),  \
1510         [cp] "r" (&(c->dp[iz]))                         \
1511       : "%r8", "%r9", "%r10", "%r11",                   \
1512         "%rdx", "%rax", "%rcx", "%rbx"                  \
1513     )
1514 
1515 #define TFM_INTEL_MUL_COMBA(a, b, ca, c)   \
1516     for (iz=0; iz<pa; iz++) c->dp[iz] = 0; \
1517     for (ix=0; ix<a->used; ix++) {         \
1518         ca = 0;                            \
1519         for (iy=0; iy<b->used; iy+=4) {    \
1520             iz = ix + iy;                  \
1521             MULADD_BODY(a, b, ca, c);      \
1522         }                                  \
1523         c->dp[ix + iy] = ca;               \
1524     }
1525 #endif
1526 
1527 #elif defined(TFM_SSE2)
1528 /* use SSE2 optimizations */
1529 
1530 /* anything you need at the start */
1531 #define COMBA_START
1532 
1533 /* clear the chaining variables */
1534 #define COMBA_CLEAR \
1535    c0 = c1 = c2 = 0;
1536 
1537 /* forward the carry to the next digit */
1538 #define COMBA_FORWARD \
1539    do { c0 = c1; c1 = c2; c2 = 0; } while (0);
1540 
1541 /* store the first sum */
1542 #define COMBA_STORE(x) \
1543    x = c0;
1544 
1545 /* store the second sum [carry] */
1546 #define COMBA_STORE2(x) \
1547    x = c1;
1548 
1549 /* anything you need at the end */
1550 #define COMBA_FINI \
1551    __asm__("emms");
1552 
/* multiply i and j, accumulating the double-width product into c0:c1:c2 */
1554 #define MULADD(i, j)                                     \
1555 __asm__(                                                 \
1556     "movd  %6,%%mm0     \n\t"                            \
1557     "movd  %7,%%mm1     \n\t"                            \
1558     "pmuludq %%mm1,%%mm0\n\t"                            \
1559     "movd  %%mm0,%%eax  \n\t"                            \
1560     "psrlq $32,%%mm0    \n\t"                            \
1561     "addl  %%eax,%0     \n\t"                            \
1562     "movd  %%mm0,%%eax  \n\t"                            \
1563     "adcl  %%eax,%1     \n\t"                            \
1564     "adcl  $0,%2        \n\t"                            \
1565     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j)  :"%eax","cc");
1566 
1567 #elif defined(TFM_ARM)
1568 /* ARM code */
1569 
1570 #define COMBA_START
1571 
1572 #define COMBA_CLEAR \
1573    c0 = c1 = c2 = 0;
1574 
1575 #define COMBA_FORWARD \
1576    do { c0 = c1; c1 = c2; c2 = 0; } while (0);
1577 
1578 #define COMBA_STORE(x) \
1579    x = c0;
1580 
1581 #define COMBA_STORE2(x) \
1582    x = c1;
1583 
1584 #define COMBA_FINI
1585 
1586 #define MULADD(i, j)                                          \
1587 __asm__(                                                      \
1588 "  UMULL  r0,r1,%6,%7           \n\t"                         \
1589 "  ADDS   %0,%0,r0              \n\t"                         \
1590 "  ADCS   %1,%1,r1              \n\t"                         \
1591 "  ADC    %2,%2,#0              \n\t"                         \
1592 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "cc");
1593 
1594 #elif defined(TFM_PPC32)
1595 /* For 32-bit PPC */
1596 
1597 #define COMBA_START
1598 
1599 #define COMBA_CLEAR \
1600    c0 = c1 = c2 = 0;
1601 
1602 #define COMBA_FORWARD \
1603    do { c0 = c1; c1 = c2; c2 = 0; } while (0);
1604 
1605 #define COMBA_STORE(x) \
1606    x = c0;
1607 
1608 #define COMBA_STORE2(x) \
1609    x = c1;
1610 
1611 #define COMBA_FINI
1612 
1613 /* untested: will mulhwu change the flags?  Docs say no */
1614 #define MULADD(i, j)             \
1615 __asm__(                         \
1616    " mullw  16,%6,%7       \n\t" \
1617    " addc   %0,%0,16       \n\t" \
1618    " mulhwu 16,%6,%7       \n\t" \
1619    " adde   %1,%1,16       \n\t" \
1620    " addze  %2,%2          \n\t" \
1621 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16");
1622 
1623 #elif defined(TFM_PPC64)
1624 /* For 64-bit PPC */
1625 
1626 #define COMBA_START
1627 
1628 #define COMBA_CLEAR \
1629    c0 = c1 = c2 = 0;
1630 
1631 #define COMBA_FORWARD \
1632    do { c0 = c1; c1 = c2; c2 = 0; } while (0);
1633 
1634 #define COMBA_STORE(x) \
1635    x = c0;
1636 
1637 #define COMBA_STORE2(x) \
1638    x = c1;
1639 
1640 #define COMBA_FINI
1641 
1642 /* untested: will mulhdu change the flags?  Docs say no */
1643 #define MULADD(i, j)              \
__asm__(                          \
1645    " mulld  r16,%6,%7       \n\t" \
1646    " addc   %0,%0,16        \n\t" \
1647    " mulhdu r16,%6,%7       \n\t" \
1648    " adde   %1,%1,16        \n\t" \
1649    " addze  %2,%2           \n\t" \
1650 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"r16");
1651 
1652 #elif defined(TFM_AVR32)
1653 
/* AVR32 */
1655 
1656 #define COMBA_START
1657 
1658 #define COMBA_CLEAR \
1659    c0 = c1 = c2 = 0;
1660 
1661 #define COMBA_FORWARD \
1662    do { c0 = c1; c1 = c2; c2 = 0; } while (0);
1663 
1664 #define COMBA_STORE(x) \
1665    x = c0;
1666 
1667 #define COMBA_STORE2(x) \
1668    x = c1;
1669 
1670 #define COMBA_FINI
1671 
1672 #define MULADD(i, j)             \
__asm__(                         \
1674    " mulu.d r2,%6,%7        \n\t"\
1675    " add    %0,r2           \n\t"\
1676    " adc    %1,%1,r3        \n\t"\
1677    " acr    %2              \n\t"\
1678 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"r2","r3");
1679 
1680 #elif defined(TFM_MIPS)
1681 
1682 /* MIPS */
1683 #define COMBA_START
1684 
1685 #define COMBA_CLEAR \
1686    c0 = c1 = c2 = 0;
1687 
1688 #define COMBA_FORWARD \
1689    do { c0 = c1; c1 = c2; c2 = 0; } while (0);
1690 
1691 #define COMBA_STORE(x) \
1692    x = c0;
1693 
1694 #define COMBA_STORE2(x) \
1695    x = c1;
1696 
1697 #define COMBA_FINI
1698 
1699 #define MULADD(i, j)              \
1700 __asm__(                          \
1701    " multu  %6,%7          \n\t"  \
1702    " mflo   $12            \n\t"  \
1703    " mfhi   $13            \n\t"  \
1704    " addu    %0,%0,$12     \n\t"  \
1705    " sltu   $12,%0,$12     \n\t"  \
1706    " addu    %1,%1,$13     \n\t"  \
1707    " sltu   $13,%1,$13     \n\t"  \
1708    " addu    %1,%1,$12     \n\t"  \
1709    " sltu   $12,%1,$12     \n\t"  \
1710    " addu    %2,%2,$13     \n\t"  \
1711    " addu    %2,%2,$12     \n\t"  \
1712 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"$12","$13");
1713 
1714 #else
1715 /* ISO C code */
1716 
1717 #define COMBA_START
1718 
1719 #define COMBA_CLEAR \
1720    c0 = c1 = c2 = 0;
1721 
1722 #define COMBA_FORWARD \
1723    do { c0 = c1; c1 = c2; c2 = 0; } while (0);
1724 
1725 #define COMBA_STORE(x) \
1726    x = c0;
1727 
1728 #define COMBA_STORE2(x) \
1729    x = c1;
1730 
1731 #define COMBA_FINI
1732 
#define MULADD(i, j)                                    \
1734    do { fp_word t;                                      \
1735    t = (fp_word)c0 + ((fp_word)i) * ((fp_word)j);       \
1736    c0 = (fp_digit)t;                                    \
1737    t = (fp_word)c1 + (t >> DIGIT_BIT);                  \
1738    c1 = (fp_digit)t;                                    \
1739    c2 += (fp_digit)(t >> DIGIT_BIT);                    \
1740    } while (0);
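/* The portable MULADD only requires that fp_word be at least twice as wide
 * as fp_digit.  A worked example with hypothetical 8-bit digits: for
 * c0 = 0xF0, i = j = 0xFF the product is 0xFE01, so t = 0xF0 + 0xFE01 =
 * 0xFEF1; c0 becomes 0xF1 and the high byte 0xFE is carried into the c1
 * column, with any carry out of c1 landing in c2 the same way. */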
1741 
1742 #endif
1743 
1744 
1745 #ifdef TFM_SMALL_SET
1746     #include "fp_mul_comba_small_set.i"
1747 #endif
1748 
1749 #if defined(TFM_MUL3) && FP_SIZE >= 6
1750     #include "fp_mul_comba_3.i"
1751 #endif
1752 #if defined(TFM_MUL4) && FP_SIZE >= 8
1753     #include "fp_mul_comba_4.i"
1754 #endif
1755 #if defined(TFM_MUL6) && FP_SIZE >= 12
1756     #include "fp_mul_comba_6.i"
1757 #endif
1758 #if defined(TFM_MUL7) && FP_SIZE >= 14
1759     #include "fp_mul_comba_7.i"
1760 #endif
1761 #if defined(TFM_MUL8) && FP_SIZE >= 16
1762     #include "fp_mul_comba_8.i"
1763 #endif
1764 #if defined(TFM_MUL9) && FP_SIZE >= 18
1765     #include "fp_mul_comba_9.i"
1766 #endif
1767 #if defined(TFM_MUL12) && FP_SIZE >= 24
1768     #include "fp_mul_comba_12.i"
1769 #endif
1770 #if defined(TFM_MUL17) && FP_SIZE >= 34
1771     #include "fp_mul_comba_17.i"
1772 #endif
1773 #if defined(TFM_MUL20) && FP_SIZE >= 40
1774     #include "fp_mul_comba_20.i"
1775 #endif
1776 #if defined(TFM_MUL24) && FP_SIZE >= 48
1777     #include "fp_mul_comba_24.i"
1778 #endif
1779 #if defined(TFM_MUL28) && FP_SIZE >= 56
1780     #include "fp_mul_comba_28.i"
1781 #endif
1782 #if defined(TFM_MUL32) && FP_SIZE >= 64
1783     #include "fp_mul_comba_32.i"
1784 #endif
1785 #if defined(TFM_MUL48) && FP_SIZE >= 96
1786     #include "fp_mul_comba_48.i"
1787 #endif
1788 #if defined(TFM_MUL64) && FP_SIZE >= 128
1789     #include "fp_mul_comba_64.i"
1790 #endif
1791 
1792 /* end fp_mul_comba.c asm */
1793 
1794