1 /* asm.c
2 *
3 * Copyright (C) 2006-2021 wolfSSL Inc.
4 *
5 * This file is part of wolfSSL.
6 *
7 * wolfSSL is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * wolfSSL is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
20 */
21
22
23 #ifdef HAVE_CONFIG_H
24 #include <config.h>
25 #endif
26
27 #include <wolfssl/wolfcrypt/settings.h>
28
29 /*
30 * Based on public domain TomsFastMath 0.10 by Tom St Denis, tomstdenis@iahu.ca,
31 * http://math.libtomcrypt.com
32 */
33
34
35 /******************************************************************/
36 /* fp_montgomery_reduce.c asm or generic */
37
38
39 /* Each platform needs to query CPUID to see if BMI2 (MULX/RORX) and ADX
40  * (ADCX/ADOX) are supported. Also, set up a macro for proper linkage w/o
41  * ABI conflicts. */
42
43 #if defined(HAVE_INTEL_MULX)
44 #ifndef _MSC_VER
45 #define cpuid(reg, leaf, sub)\
46 __asm__ __volatile__ ("cpuid":\
47 "=a" (reg[0]), "=b" (reg[1]), "=c" (reg[2]), "=d" (reg[3]) :\
48 "a" (leaf), "c"(sub));
49
50 #define XASM_LINK(f) asm(f)
51 #else
52
53 #include <intrin.h>
54 #define cpuid(a,b,c) __cpuidex((int*)a,b,c)
55
56 #define XASM_LINK(f)
57
58 #endif /* _MSC_VER */
59
60 #define EAX 0
61 #define EBX 1
62 #define ECX 2
63 #define EDX 3
64
65 #define CPUID_AVX1 0x1
66 #define CPUID_AVX2 0x2
67 #define CPUID_RDRAND 0x4
68 #define CPUID_RDSEED 0x8
69 #define CPUID_BMI2 0x10 /* MULX, RORX */
70 #define CPUID_ADX 0x20 /* ADCX, ADOX */
71
72 #define IS_INTEL_AVX1 (cpuid_flags&CPUID_AVX1)
73 #define IS_INTEL_AVX2 (cpuid_flags&CPUID_AVX2)
74 #define IS_INTEL_BMI2 (cpuid_flags&CPUID_BMI2)
75 #define IS_INTEL_ADX (cpuid_flags&CPUID_ADX)
76 #define IS_INTEL_RDRAND (cpuid_flags&CPUID_RDRAND)
77 #define IS_INTEL_RDSEED (cpuid_flags&CPUID_RDSEED)
78 #define SET_FLAGS
79
80 static word32 cpuid_check = 0 ;
81 static word32 cpuid_flags = 0 ;
82
83 static word32 cpuid_flag(word32 leaf, word32 sub, word32 num, word32 bit) {
84 int got_intel_cpu = 0;
85 int got_amd_cpu = 0;
86 unsigned int reg[5];
87
88 reg[4] = '\0' ;
89 cpuid(reg, 0, 0);
90
91 /* check for intel cpu */
92 if( memcmp((char *)&(reg[EBX]), "Genu", 4) == 0 &&
93 memcmp((char *)&(reg[EDX]), "ineI", 4) == 0 &&
94 memcmp((char *)&(reg[ECX]), "ntel", 4) == 0) {
95 got_intel_cpu = 1;
96 }
97
98 /* check for AMD cpu */
99 if( memcmp((char *)&(reg[EBX]), "Auth", 4) == 0 &&
100 memcmp((char *)&(reg[EDX]), "enti", 4) == 0 &&
101 memcmp((char *)&(reg[ECX]), "cAMD", 4) == 0) {
102 got_amd_cpu = 1;
103 }
104 if (got_intel_cpu || got_amd_cpu) {
105 cpuid(reg, leaf, sub);
106 return((reg[num]>>bit)&0x1) ;
107 }
108 return 0 ;
109 }
110
111 WC_INLINE static int set_cpuid_flags(void) {
112 if(cpuid_check == 0) {
113 if(cpuid_flag(7, 0, EBX, 8)){ cpuid_flags |= CPUID_BMI2 ; }
114 if(cpuid_flag(7, 0, EBX,19)){ cpuid_flags |= CPUID_ADX ; }
115 cpuid_check = 1 ;
116 return 0 ;
117 }
118 return 1 ;
119 }
120
121 #define RETURN return
122 #define IF_HAVE_INTEL_MULX(func, ret) \
123 if(cpuid_check==0)set_cpuid_flags() ; \
124 if(IS_INTEL_BMI2 && IS_INTEL_ADX){ func; ret ; }
125
126 #else
127 #define IF_HAVE_INTEL_MULX(func, ret)
128 #endif
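
/* Illustrative only: a hypothetical caller uses IF_HAVE_INTEL_MULX to take the
 * MULX/ADX code path at run time and fall through to the generic code
 * otherwise, e.g.
 *
 *     void fp_mul_comba(fp_int* A, fp_int* B, fp_int* C)
 *     {
 *         IF_HAVE_INTEL_MULX(fp_mul_comba_mulx(A, B, C), RETURN);
 *         ... generic comba multiply ...
 *     }
 *
 * fp_mul_comba_mulx is a placeholder name for the accelerated routine, not a
 * function defined in this file.
 */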
129
130 #if defined(TFM_X86) && !defined(TFM_SSE2)
131 /* x86-32 code */
132
133 #define MONT_START
134 #define MONT_FINI
135 #define LOOP_END
136 #define LOOP_START \
137 mu = c[x] * mp
138
139 #define INNERMUL \
140 __asm__( \
141 "movl %5,%%eax \n\t" \
142 "mull %4 \n\t" \
143 "addl %1,%%eax \n\t" \
144 "adcl $0,%%edx \n\t" \
145 "addl %%eax,%0 \n\t" \
146 "adcl $0,%%edx \n\t" \
147 "movl %%edx,%1 \n\t" \
148 :"=g"(_c[LO]), "=r"(cy) \
149 :"0"(_c[LO]), "1"(cy), "r"(mu), "r"(*tmpm++) \
150 : "%eax", "%edx", "cc")
151
152 #define PROPCARRY \
153 __asm__( \
154 "addl %1,%0 \n\t" \
155 "setb %%al \n\t" \
156 "movzbl %%al,%1 \n\t" \
157 :"=g"(_c[LO]), "=r"(cy) \
158 :"0"(_c[LO]), "1"(cy) \
159 : "%eax", "cc")
160
161 /******************************************************************/
162 #elif defined(TFM_X86_64)
163 /* x86-64 code */
164
165 #define MONT_START
166 #define MONT_FINI
167 #define LOOP_END
168 #define LOOP_START \
169 mu = c[x] * mp
170
171 #define INNERMUL \
172 __asm__( \
173 "movq %5,%%rax \n\t" \
174 "mulq %4 \n\t" \
175 "addq %1,%%rax \n\t" \
176 "adcq $0,%%rdx \n\t" \
177 "addq %%rax,%0 \n\t" \
178 "adcq $0,%%rdx \n\t" \
179 "movq %%rdx,%1 \n\t" \
180 :"=g"(_c[LO]), "=r"(cy) \
181 :"0"(_c[LO]), "1"(cy), "r"(mu), "r"(*tmpm++) \
182 : "%rax", "%rdx", "cc")
183
184 #if defined(HAVE_INTEL_MULX)
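/* MULX_INNERMUL8 multiplies the 64-bit digit y (mu at the call site below) by
 * eight digits of x (the modulus) and accumulates the products into the eight
 * digits at c_mulx, with the running carry in cy.  It keeps two independent
 * carry chains alive at once: ADCX reads/writes only CF and ADOX reads/writes
 * only OF, while MULX itself does not touch the flags, so the two chains can
 * be interleaved without clobbering each other. */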
185 #define MULX_INNERMUL8(x,y,z,cy) \
186 __asm__ volatile ( \
187 "movq %[yn], %%rdx\n\t" \
188 "xorq %%rcx, %%rcx\n\t" \
189 "movq 0(%[c]), %%r8\n\t" \
190 "movq 8(%[c]), %%r9\n\t" \
191 "movq 16(%[c]), %%r10\n\t" \
192 "movq 24(%[c]), %%r11\n\t" \
193 "movq 32(%[c]), %%r12\n\t" \
194 "movq 40(%[c]), %%r13\n\t" \
195 "movq 48(%[c]), %%r14\n\t" \
196 "movq 56(%[c]), %%r15\n\t" \
197 \
198 "mulx 0(%[xp]), %%rax, %%rcx\n\t" \
199 "adcxq %[cy], %%r8\n\t" \
200 "adoxq %%rax, %%r8\n\t" \
201 "mulx 8(%[xp]), %%rax, %[cy]\n\t" \
202 "adcxq %%rcx, %%r9\n\t" \
203 "adoxq %%rax, %%r9\n\t" \
204 "mulx 16(%[xp]), %%rax, %%rcx\n\t" \
205 "adcxq %[cy], %%r10\n\t" \
206 "adoxq %%rax, %%r10\n\t" \
207 "mulx 24(%[xp]), %%rax, %[cy]\n\t" \
208 "adcxq %%rcx, %%r11\n\t" \
209 "adoxq %%rax, %%r11\n\t" \
210 "mulx 32(%[xp]), %%rax, %%rcx\n\t" \
211 "adcxq %[cy], %%r12\n\t" \
212 "adoxq %%rax, %%r12\n\t" \
213 "mulx 40(%[xp]), %%rax, %[cy]\n\t" \
214 "adcxq %%rcx, %%r13\n\t" \
215 "adoxq %%rax, %%r13\n\t" \
216 "mulx 48(%[xp]), %%rax, %%rcx\n\t" \
217 "adcxq %[cy], %%r14\n\t" \
218 "adoxq %%rax, %%r14\n\t" \
219 "adcxq %%rcx, %%r15\n\t" \
220 "mulx 56(%[xp]), %%rax, %[cy]\n\t" \
221 "movq $0, %%rdx\n\t" \
222 "adoxq %%rdx, %%rax\n\t" \
223 "adcxq %%rdx, %[cy]\n\t" \
224 "adoxq %%rdx, %[cy]\n\t" \
225 "addq %%rax, %%r15\n\t" \
226 "adcq $0, %[cy]\n\t" \
227 \
228 "movq %%r8, 0(%[c])\n\t" \
229 "movq %%r9, 8(%[c])\n\t" \
230 "movq %%r10, 16(%[c])\n\t" \
231 "movq %%r11, 24(%[c])\n\t" \
232 "movq %%r12, 32(%[c])\n\t" \
233 "movq %%r13, 40(%[c])\n\t" \
234 "movq %%r14, 48(%[c])\n\t" \
235 "movq %%r15, 56(%[c])\n\t" \
236 : [cy] "+r" (cy) \
237 : [xp] "r" (x), [c] "r" (c_mulx), [yn] "rm" (y) \
238 :"%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", \
239 "%rdx", "%rax", "%rcx" \
240 )
241
242 #define INNERMUL8_MULX \
243 {\
244 MULX_INNERMUL8(tmpm, mu, _c, cy);\
245 }
246 #endif
247
248 #define INNERMUL8 \
249 __asm__( \
250 "movq 0(%5),%%rax \n\t" \
251 "movq 0(%2),%%r10 \n\t" \
252 "movq 0x8(%5),%%r11 \n\t" \
253 "mulq %4 \n\t" \
254 "addq %%r10,%%rax \n\t" \
255 "adcq $0,%%rdx \n\t" \
256 "movq 0x8(%2),%%r10 \n\t" \
257 "addq %3,%%rax \n\t" \
258 "adcq $0,%%rdx \n\t" \
259 "movq %%rax,0(%0) \n\t" \
260 "movq %%rdx,%1 \n\t" \
261 \
262 "movq %%r11,%%rax \n\t" \
263 "movq 0x10(%5),%%r11 \n\t" \
264 "mulq %4 \n\t" \
265 "addq %%r10,%%rax \n\t" \
266 "adcq $0,%%rdx \n\t" \
267 "movq 0x10(%2),%%r10 \n\t" \
268 "addq %3,%%rax \n\t" \
269 "adcq $0,%%rdx \n\t" \
270 "movq %%rax,0x8(%0) \n\t" \
271 "movq %%rdx,%1 \n\t" \
272 \
273 "movq %%r11,%%rax \n\t" \
274 "movq 0x18(%5),%%r11 \n\t" \
275 "mulq %4 \n\t" \
276 "addq %%r10,%%rax \n\t" \
277 "adcq $0,%%rdx \n\t" \
278 "movq 0x18(%2),%%r10 \n\t" \
279 "addq %3,%%rax \n\t" \
280 "adcq $0,%%rdx \n\t" \
281 "movq %%rax,0x10(%0) \n\t" \
282 "movq %%rdx,%1 \n\t" \
283 \
284 "movq %%r11,%%rax \n\t" \
285 "movq 0x20(%5),%%r11 \n\t" \
286 "mulq %4 \n\t" \
287 "addq %%r10,%%rax \n\t" \
288 "adcq $0,%%rdx \n\t" \
289 "movq 0x20(%2),%%r10 \n\t" \
290 "addq %3,%%rax \n\t" \
291 "adcq $0,%%rdx \n\t" \
292 "movq %%rax,0x18(%0) \n\t" \
293 "movq %%rdx,%1 \n\t" \
294 \
295 "movq %%r11,%%rax \n\t" \
296 "movq 0x28(%5),%%r11 \n\t" \
297 "mulq %4 \n\t" \
298 "addq %%r10,%%rax \n\t" \
299 "adcq $0,%%rdx \n\t" \
300 "movq 0x28(%2),%%r10 \n\t" \
301 "addq %3,%%rax \n\t" \
302 "adcq $0,%%rdx \n\t" \
303 "movq %%rax,0x20(%0) \n\t" \
304 "movq %%rdx,%1 \n\t" \
305 \
306 "movq %%r11,%%rax \n\t" \
307 "movq 0x30(%5),%%r11 \n\t" \
308 "mulq %4 \n\t" \
309 "addq %%r10,%%rax \n\t" \
310 "adcq $0,%%rdx \n\t" \
311 "movq 0x30(%2),%%r10 \n\t" \
312 "addq %3,%%rax \n\t" \
313 "adcq $0,%%rdx \n\t" \
314 "movq %%rax,0x28(%0) \n\t" \
315 "movq %%rdx,%1 \n\t" \
316 \
317 "movq %%r11,%%rax \n\t" \
318 "movq 0x38(%5),%%r11 \n\t" \
319 "mulq %4 \n\t" \
320 "addq %%r10,%%rax \n\t" \
321 "adcq $0,%%rdx \n\t" \
322 "movq 0x38(%2),%%r10 \n\t" \
323 "addq %3,%%rax \n\t" \
324 "adcq $0,%%rdx \n\t" \
325 "movq %%rax,0x30(%0) \n\t" \
326 "movq %%rdx,%1 \n\t" \
327 \
328 "movq %%r11,%%rax \n\t" \
329 "mulq %4 \n\t" \
330 "addq %%r10,%%rax \n\t" \
331 "adcq $0,%%rdx \n\t" \
332 "addq %3,%%rax \n\t" \
333 "adcq $0,%%rdx \n\t" \
334 "movq %%rax,0x38(%0) \n\t" \
335 "movq %%rdx,%1 \n\t" \
336 \
337 :"=r"(_c), "=r"(cy) \
338 : "0"(_c), "1"(cy), "g"(mu), "r"(tmpm)\
339 : "%rax", "%rdx", "%r10", "%r11", "cc")
340
341 #define PROPCARRY \
342 __asm__( \
343 "addq %1,%0 \n\t" \
344 "setb %%al \n\t" \
345 "movzbq %%al,%1 \n\t" \
346 :"=g"(_c[LO]), "=r"(cy) \
347 :"0"(_c[LO]), "1"(cy) \
348 : "%rax", "cc")
349
350 /******************************************************************/
351 #elif defined(TFM_SSE2)
352 /* SSE2 code (assumes 32-bit fp_digits) */
353 /* MMX register assignments (the SSE2 pmuludq instruction is used on the
354  * MMX registers here):
355  * mm0 *tmpm++, then Mu * (*tmpm++)
356  * mm1 c[x], then Mu
357  * mm2 mp
358  * mm3 cy
359  * mm4 _c[LO]
 */
360
361 #define MONT_START \
362 __asm__("movd %0,%%mm2"::"g"(mp))
363
364 #define MONT_FINI \
365 __asm__("emms")
366
367 #define LOOP_START \
368 __asm__( \
369 "movd %0,%%mm1 \n\t" \
370 "pxor %%mm3,%%mm3 \n\t" \
371 "pmuludq %%mm2,%%mm1 \n\t" \
372 :: "g"(c[x]))
373
374 /* pmuludq on mmx registers does a 32x32->64 multiply. */
375 #define INNERMUL \
376 __asm__( \
377 "movd %1,%%mm4 \n\t" \
378 "movd %2,%%mm0 \n\t" \
379 "paddq %%mm4,%%mm3 \n\t" \
380 "pmuludq %%mm1,%%mm0 \n\t" \
381 "paddq %%mm0,%%mm3 \n\t" \
382 "movd %%mm3,%0 \n\t" \
383 "psrlq $32, %%mm3 \n\t" \
384 :"=g"(_c[LO]) : "0"(_c[LO]), "g"(*tmpm++) );
385
386 #define INNERMUL8 \
387 __asm__( \
388 "movd 0(%1),%%mm4 \n\t" \
389 "movd 0(%2),%%mm0 \n\t" \
390 "paddq %%mm4,%%mm3 \n\t" \
391 "pmuludq %%mm1,%%mm0 \n\t" \
392 "movd 4(%2),%%mm5 \n\t" \
393 "paddq %%mm0,%%mm3 \n\t" \
394 "movd 4(%1),%%mm6 \n\t" \
395 "movd %%mm3,0(%0) \n\t" \
396 "psrlq $32, %%mm3 \n\t" \
397 \
398 "paddq %%mm6,%%mm3 \n\t" \
399 "pmuludq %%mm1,%%mm5 \n\t" \
400 "movd 8(%2),%%mm6 \n\t" \
401 "paddq %%mm5,%%mm3 \n\t" \
402 "movd 8(%1),%%mm7 \n\t" \
403 "movd %%mm3,4(%0) \n\t" \
404 "psrlq $32, %%mm3 \n\t" \
405 \
406 "paddq %%mm7,%%mm3 \n\t" \
407 "pmuludq %%mm1,%%mm6 \n\t" \
408 "movd 12(%2),%%mm7 \n\t" \
409 "paddq %%mm6,%%mm3 \n\t" \
410 "movd 12(%1),%%mm5 \n\t" \
411 "movd %%mm3,8(%0) \n\t" \
412 "psrlq $32, %%mm3 \n\t" \
413 \
414 "paddq %%mm5,%%mm3 \n\t" \
415 "pmuludq %%mm1,%%mm7 \n\t" \
416 "movd 16(%2),%%mm5 \n\t" \
417 "paddq %%mm7,%%mm3 \n\t" \
418 "movd 16(%1),%%mm6 \n\t" \
419 "movd %%mm3,12(%0) \n\t" \
420 "psrlq $32, %%mm3 \n\t" \
421 \
422 "paddq %%mm6,%%mm3 \n\t" \
423 "pmuludq %%mm1,%%mm5 \n\t" \
424 "movd 20(%2),%%mm6 \n\t" \
425 "paddq %%mm5,%%mm3 \n\t" \
426 "movd 20(%1),%%mm7 \n\t" \
427 "movd %%mm3,16(%0) \n\t" \
428 "psrlq $32, %%mm3 \n\t" \
429 \
430 "paddq %%mm7,%%mm3 \n\t" \
431 "pmuludq %%mm1,%%mm6 \n\t" \
432 "movd 24(%2),%%mm7 \n\t" \
433 "paddq %%mm6,%%mm3 \n\t" \
434 "movd 24(%1),%%mm5 \n\t" \
435 "movd %%mm3,20(%0) \n\t" \
436 "psrlq $32, %%mm3 \n\t" \
437 \
438 "paddq %%mm5,%%mm3 \n\t" \
439 "pmuludq %%mm1,%%mm7 \n\t" \
440 "movd 28(%2),%%mm5 \n\t" \
441 "paddq %%mm7,%%mm3 \n\t" \
442 "movd 28(%1),%%mm6 \n\t" \
443 "movd %%mm3,24(%0) \n\t" \
444 "psrlq $32, %%mm3 \n\t" \
445 \
446 "paddq %%mm6,%%mm3 \n\t" \
447 "pmuludq %%mm1,%%mm5 \n\t" \
448 "paddq %%mm5,%%mm3 \n\t" \
449 "movd %%mm3,28(%0) \n\t" \
450 "psrlq $32, %%mm3 \n\t" \
451 :"=r"(_c) : "0"(_c), "r"(tmpm) );
452
453 /* TAO switched tmpm from "g" to "r" after gcc tried to index the indexed stack
454 pointer */
455
456 #define LOOP_END \
457 __asm__( "movd %%mm3,%0 \n" :"=r"(cy))
458
459 #define PROPCARRY \
460 __asm__( \
461 "addl %1,%0 \n\t" \
462 "setb %%al \n\t" \
463 "movzbl %%al,%1 \n\t" \
464 :"=g"(_c[LO]), "=r"(cy) \
465 :"0"(_c[LO]), "1"(cy) \
466 : "%eax", "cc")
467
468 /******************************************************************/
469 #elif defined(TFM_ARM)
470 /* ARMv4 code */
471
472 #define MONT_START
473 #define MONT_FINI
474 #define LOOP_END
475 #define LOOP_START \
476 mu = c[x] * mp
477
478
479 #ifdef __thumb__
480
481 #define INNERMUL \
482 __asm__( \
483 " LDR r0,%1 \n\t" \
484 " ADDS r0,r0,%0 \n\t" \
485 " ITE CS \n\t" \
486 " MOVCS %0,#1 \n\t" \
487 " MOVCC %0,#0 \n\t" \
488 " UMLAL r0,%0,%3,%4 \n\t" \
489 " STR r0,%1 \n\t" \
490 :"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(*tmpm++),"m"(_c[0]):"r0","cc");
491
492 #define PROPCARRY \
493 __asm__( \
494 " LDR r0,%1 \n\t" \
495 " ADDS r0,r0,%0 \n\t" \
496 " STR r0,%1 \n\t" \
497 " ITE CS \n\t" \
498 " MOVCS %0,#1 \n\t" \
499 " MOVCC %0,#0 \n\t" \
500 :"=r"(cy),"=m"(_c[0]):"0"(cy),"m"(_c[0]):"r0","cc");
501
502
503 /* TAO thumb mode uses ite (if then else) to detect carry directly
504 * fixed unmatched constraint warning by changing 1 to m */
505
506 #else /* __thumb__ */
507
508 #define INNERMUL \
509 __asm__( \
510 " LDR r0,%1 \n\t" \
511 " ADDS r0,r0,%0 \n\t" \
512 " MOVCS %0,#1 \n\t" \
513 " MOVCC %0,#0 \n\t" \
514 " UMLAL r0,%0,%3,%4 \n\t" \
515 " STR r0,%1 \n\t" \
516 :"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(*tmpm++),"1"(_c[0]):"r0","cc");
517
518 #define PROPCARRY \
519 __asm__( \
520 " LDR r0,%1 \n\t" \
521 " ADDS r0,r0,%0 \n\t" \
522 " STR r0,%1 \n\t" \
523 " MOVCS %0,#1 \n\t" \
524 " MOVCC %0,#0 \n\t" \
525 :"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"r0","cc");
526
527 #endif /* __thumb__ */
528
529 #elif defined(TFM_PPC32)
530
531 /* PPC32 */
532 #define MONT_START
533 #define MONT_FINI
534 #define LOOP_END
535 #define LOOP_START \
536 mu = c[x] * mp
537
538 #define INNERMUL \
539 __asm__( \
540 " mullw 16,%3,%4 \n\t" \
541 " mulhwu 17,%3,%4 \n\t" \
542 " addc 16,16,%2 \n\t" \
543 " addze 17,17 \n\t" \
544 " addc %1,16,%5 \n\t" \
545 " addze %0,17 \n\t" \
546 :"=r"(cy),"=r"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"16", "17", "cc"); ++tmpm;
547
548 #define PROPCARRY \
549 __asm__( \
550 " addc %1,%3,%2 \n\t" \
551 " xor %0,%2,%2 \n\t" \
552 " addze %0,%2 \n\t" \
553 :"=r"(cy),"=r"(_c[0]):"0"(cy),"1"(_c[0]):"cc");
554
555 #elif defined(TFM_PPC64)
556
557 /* PPC64 */
558 #define MONT_START
559 #define MONT_FINI
560 #define LOOP_END
561 #define LOOP_START \
562 mu = c[x] * mp
563
564 #define INNERMUL \
565 __asm__( \
566 " mulld r16,%3,%4 \n\t" \
567 " mulhdu r17,%3,%4 \n\t" \
568 " addc r16,16,%0 \n\t" \
569 " addze r17,r17 \n\t" \
570 " ldx r18,0,%1 \n\t" \
571 " addc r16,r16,r18 \n\t" \
572 " addze %0,r17 \n\t" \
573 " sdx r16,0,%1 \n\t" \
574 :"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"r16", "r17", "r18","cc"); ++tmpm;
575
576 #define PROPCARRY \
577 __asm__( \
578 " ldx r16,0,%1 \n\t" \
579 " addc r16,r16,%0 \n\t" \
580 " sdx r16,0,%1 \n\t" \
581 " xor %0,%0,%0 \n\t" \
582 " addze %0,%0 \n\t" \
583 :"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"r16","cc");
584
585 /******************************************************************/
586
587 #elif defined(TFM_AVR32)
588
589 /* AVR32 */
590 #define MONT_START
591 #define MONT_FINI
592 #define LOOP_END
593 #define LOOP_START \
594 mu = c[x] * mp
595
596 #define INNERMUL \
597 __asm__( \
598 " ld.w r2,%1 \n\t" \
599 " add r2,%0 \n\t" \
600 " eor r3,r3 \n\t" \
601 " acr r3 \n\t" \
602 " macu.d r2,%3,%4 \n\t" \
603 " st.w %1,r2 \n\t" \
604 " mov %0,r3 \n\t" \
605 :"=r"(cy),"=r"(_c):"0"(cy),"r"(mu),"r"(*tmpm++),"1"(_c):"r2","r3");
606
607 #define PROPCARRY \
608 __asm__( \
609 " ld.w r2,%1 \n\t" \
610 " add r2,%0 \n\t" \
611 " st.w %1,r2 \n\t" \
612 " eor %0,%0 \n\t" \
613 " acr %0 \n\t" \
614 :"=r"(cy),"=r"(&_c[0]):"0"(cy),"1"(&_c[0]):"r2","cc");
615
616 /******************************************************************/
617 #elif defined(TFM_MIPS)
618
619 /* MIPS */
620 #define MONT_START
621 #define MONT_FINI
622 #define LOOP_END
623 #define LOOP_START \
624 mu = c[x] * mp
625
626 #define INNERMUL \
627 __asm__( \
628 " multu %3,%4 \n\t" \
629 " mflo $12 \n\t" \
630 " mfhi $13 \n\t" \
631 " addu $12,$12,%0 \n\t" \
632 " sltu $10,$12,%0 \n\t" \
633 " addu $13,$13,$10 \n\t" \
634 " lw $10,%1 \n\t" \
635 " addu $12,$12,$10 \n\t" \
636 " sltu $10,$12,$10 \n\t" \
637 " addu %0,$13,$10 \n\t" \
638 " sw $12,%1 \n\t" \
639 :"+r"(cy),"+m"(_c[0]):""(cy),"r"(mu),"r"(tmpm[0]),""(_c[0]):"$10","$12","$13"); ++tmpm;
640
641 #define PROPCARRY \
642 __asm__( \
643 " lw $10,%1 \n\t" \
644 " addu $10,$10,%0 \n\t" \
645 " sw $10,%1 \n\t" \
646 " sltu %0,$10,%0 \n\t" \
647 :"+r"(cy),"+m"(_c[0]):""(cy),""(_c[0]):"$10");
648
649 /******************************************************************/
650 #else
651
652 /* ISO C code */
653 #define MONT_START
654 #define MONT_FINI
655 #define LOOP_END
656 #define LOOP_START \
657 mu = c[x] * mp
658
659 #define INNERMUL \
660 do { fp_word t; \
661 t = ((fp_word)_c[0] + (fp_word)cy) + \
662 (((fp_word)mu) * ((fp_word)*tmpm++)); \
663 _c[0] = (fp_digit)t; \
664 cy = (fp_digit)(t >> DIGIT_BIT); \
665 } while (0)
666
667 #define PROPCARRY \
668 do { fp_digit t = _c[0] += cy; cy = (t < cy); } while (0)
669
670 #endif
671 /******************************************************************/
672
673
674 #define LO 0
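
/* Illustrative only: fp_montgomery_reduce() in tfm.c composes the macros above
 * roughly as follows (simplified; the real loop also uses INNERMUL8 to process
 * eight digits per iteration when it is defined):
 *
 *     MONT_START;
 *     for (x = 0; x < pa; x++) {
 *         fp_digit cy = 0;
 *         LOOP_START;                  mu = c[x] * mp
 *         _c   = c + x;
 *         tmpm = m->dp;
 *         for (y = 0; y < m->used; y++) {
 *             INNERMUL;                _c[0] += mu * *tmpm++, carry into cy
 *             ++_c;
 *         }
 *         LOOP_END;
 *         while (cy) {
 *             PROPCARRY;               push the remaining carry upward
 *             ++_c;
 *         }
 *     }
 *     MONT_FINI;
 */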
675 /* end fp_montgomery_reduce.c asm */
676
677
678 /* start fp_sqr_comba.c asm */
679 #if defined(TFM_X86)
680
681 /* x86-32 optimized */
682
683 #define COMBA_START
684
685 #define CLEAR_CARRY \
686 c0 = c1 = c2 = 0;
687
688 #define COMBA_STORE(x) \
689 x = c0;
690
691 #define COMBA_STORE2(x) \
692 x = c1;
693
694 #define CARRY_FORWARD \
695 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
696
697 #define COMBA_FINI
698
699 #define SQRADD(i, j) \
700 __asm__( \
701 "movl %3,%%eax \n\t" \
702 "mull %%eax \n\t" \
703 "addl %%eax,%0 \n\t" \
704 "adcl %%edx,%1 \n\t" \
705 "adcl $0,%2 \n\t" \
706 :"+rm"(c0), "+rm"(c1), "+rm"(c2) \
707 : "m"(i) \
708 :"%eax","%edx","cc");
709
710 #define SQRADD2(i, j) \
711 __asm__( \
712 "movl %3,%%eax \n\t" \
713 "mull %4 \n\t" \
714 "addl %%eax,%0 \n\t" \
715 "adcl %%edx,%1 \n\t" \
716 "adcl $0,%2 \n\t" \
717 "addl %%eax,%0 \n\t" \
718 "adcl %%edx,%1 \n\t" \
719 "adcl $0,%2 \n\t" \
720 :"+rm"(c0), "+rm"(c1), "+rm"(c2) \
721 : "m"(i), "m"(j) \
722 :"%eax","%edx", "cc");
723
724 #define SQRADDSC(i, j) \
725 __asm__( \
726 "movl %3,%%eax \n\t" \
727 "mull %4 \n\t" \
728 "movl %%eax,%0 \n\t" \
729 "movl %%edx,%1 \n\t" \
730 "xorl %2,%2 \n\t" \
731 :"=r"(sc0), "=r"(sc1), "=r"(sc2) \
732 : "g"(i), "g"(j) \
733 :"%eax","%edx","cc");
734
735 #define SQRADDAC(i, j) \
736 __asm__( \
737 "movl %6,%%eax \n\t" \
738 "mull %7 \n\t" \
739 "addl %%eax,%0 \n\t" \
740 "adcl %%edx,%1 \n\t" \
741 "adcl $0,%2 \n\t" \
742 :"=r"(sc0), "=r"(sc1), "=r"(sc2) \
743 : "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) \
744 :"%eax","%edx","cc");
745
746 #define SQRADDDB \
747 __asm__( \
748 "addl %6,%0 \n\t" \
749 "adcl %7,%1 \n\t" \
750 "adcl %8,%2 \n\t" \
751 "addl %6,%0 \n\t" \
752 "adcl %7,%1 \n\t" \
753 "adcl %8,%2 \n\t" \
754 :"=r"(c0), "=r"(c1), "=r"(c2) \
755 : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), \
756 "r"(sc2) \
757 : "cc");
758
759 #elif defined(TFM_X86_64)
760 /* x86-64 optimized */
761
762 #define COMBA_START
763
764 #define CLEAR_CARRY \
765 c0 = c1 = c2 = 0;
766
767 #define COMBA_STORE(x) \
768 x = c0;
769
770 #define COMBA_STORE2(x) \
771 x = c1;
772
773 #define CARRY_FORWARD \
774 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
775
776 #define COMBA_FINI
777
778 #define SQRADD(i, j) \
779 __asm__( \
780 "movq %6,%%rax \n\t" \
781 "mulq %%rax \n\t" \
782 "addq %%rax,%0 \n\t" \
783 "adcq %%rdx,%1 \n\t" \
784 "adcq $0,%2 \n\t" \
785 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "x"(i) :"%rax","%rdx","cc");
786
787 #define SQRADD2(i, j) \
788 __asm__( \
789 "movq %6,%%rax \n\t" \
790 "mulq %7 \n\t" \
791 "addq %%rax,%0 \n\t" \
792 "adcq %%rdx,%1 \n\t" \
793 "adcq $0,%2 \n\t" \
794 "addq %%rax,%0 \n\t" \
795 "adcq %%rdx,%1 \n\t" \
796 "adcq $0,%2 \n\t" \
797 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","cc");
798
799 #define SQRADDSC(i, j) \
800 __asm__( \
801 "movq %3,%%rax \n\t" \
802 "mulq %4 \n\t" \
803 "movq %%rax,%0 \n\t" \
804 "movq %%rdx,%1 \n\t" \
805 "xorq %2,%2 \n\t" \
806 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "g"(i), "g"(j) :"%rax","%rdx","cc");
807
808 #define SQRADDAC(i, j) \
809 __asm__( \
810 "movq %6,%%rax \n\t" \
811 "mulq %7 \n\t" \
812 "addq %%rax,%0 \n\t" \
813 "adcq %%rdx,%1 \n\t" \
814 "adcq $0,%2 \n\t" \
815 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","cc");
816
817 #define SQRADDDB \
818 __asm__( \
819 "addq %6,%0 \n\t" \
820 "adcq %7,%1 \n\t" \
821 "adcq %8,%2 \n\t" \
822 "addq %6,%0 \n\t" \
823 "adcq %7,%1 \n\t" \
824 "adcq %8,%2 \n\t" \
825 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc");
826
827 #elif defined(TFM_SSE2)
828
829 /* SSE2 Optimized */
830 #define COMBA_START
831
832 #define CLEAR_CARRY \
833 c0 = c1 = c2 = 0;
834
835 #define COMBA_STORE(x) \
836 x = c0;
837
838 #define COMBA_STORE2(x) \
839 x = c1;
840
841 #define CARRY_FORWARD \
842 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
843
844 #define COMBA_FINI \
845 __asm__("emms");
846
847 #define SQRADD(i, j) \
848 __asm__( \
849 "movd %6,%%mm0 \n\t" \
850 "pmuludq %%mm0,%%mm0\n\t" \
851 "movd %%mm0,%%eax \n\t" \
852 "psrlq $32,%%mm0 \n\t" \
853 "addl %%eax,%0 \n\t" \
854 "movd %%mm0,%%eax \n\t" \
855 "adcl %%eax,%1 \n\t" \
856 "adcl $0,%2 \n\t" \
857 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","cc");
858
859 #define SQRADD2(i, j) \
860 __asm__( \
861 "movd %6,%%mm0 \n\t" \
862 "movd %7,%%mm1 \n\t" \
863 "pmuludq %%mm1,%%mm0\n\t" \
864 "movd %%mm0,%%eax \n\t" \
865 "psrlq $32,%%mm0 \n\t" \
866 "movd %%mm0,%%edx \n\t" \
867 "addl %%eax,%0 \n\t" \
868 "adcl %%edx,%1 \n\t" \
869 "adcl $0,%2 \n\t" \
870 "addl %%eax,%0 \n\t" \
871 "adcl %%edx,%1 \n\t" \
872 "adcl $0,%2 \n\t" \
873 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx","cc");
874
875 #define SQRADDSC(i, j) \
876 __asm__( \
877 "movd %3,%%mm0 \n\t" \
878 "movd %4,%%mm1 \n\t" \
879 "pmuludq %%mm1,%%mm0\n\t" \
880 "movd %%mm0,%0 \n\t" \
881 "psrlq $32,%%mm0 \n\t" \
882 "movd %%mm0,%1 \n\t" \
883 "xorl %2,%2 \n\t" \
884 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "m"(i), "m"(j));
885
886 /* TAO removed sc0,1,2 as input to remove warning so %6,%7 become %3,%4 */
887
888 #define SQRADDAC(i, j) \
889 __asm__( \
890 "movd %6,%%mm0 \n\t" \
891 "movd %7,%%mm1 \n\t" \
892 "pmuludq %%mm1,%%mm0\n\t" \
893 "movd %%mm0,%%eax \n\t" \
894 "psrlq $32,%%mm0 \n\t" \
895 "movd %%mm0,%%edx \n\t" \
896 "addl %%eax,%0 \n\t" \
897 "adcl %%edx,%1 \n\t" \
898 "adcl $0,%2 \n\t" \
899 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "m"(i), "m"(j) :"%eax","%edx","cc");
900
901 #define SQRADDDB \
902 __asm__( \
903 "addl %6,%0 \n\t" \
904 "adcl %7,%1 \n\t" \
905 "adcl %8,%2 \n\t" \
906 "addl %6,%0 \n\t" \
907 "adcl %7,%1 \n\t" \
908 "adcl %8,%2 \n\t" \
909 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc");
910
911 #elif defined(TFM_ARM)
912
913 /* ARM code */
914
915 #define COMBA_START
916
917 #define CLEAR_CARRY \
918 c0 = c1 = c2 = 0;
919
920 #define COMBA_STORE(x) \
921 x = c0;
922
923 #define COMBA_STORE2(x) \
924 x = c1;
925
926 #define CARRY_FORWARD \
927 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
928
929 #define COMBA_FINI
930
931 /* multiplies digits i and j, accumulating the product into c0, c1 and c2 */
932 #define SQRADD(i, j) \
933 __asm__( \
934 " UMULL r0,r1,%6,%6 \n\t" \
935 " ADDS %0,%0,r0 \n\t" \
936 " ADCS %1,%1,r1 \n\t" \
937 " ADC %2,%2,#0 \n\t" \
938 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i) : "r0", "r1", "cc");
939
940 /* for squaring some of the terms are doubled... */
941 #define SQRADD2(i, j) \
942 __asm__( \
943 " UMULL r0,r1,%6,%7 \n\t" \
944 " ADDS %0,%0,r0 \n\t" \
945 " ADCS %1,%1,r1 \n\t" \
946 " ADC %2,%2,#0 \n\t" \
947 " ADDS %0,%0,r0 \n\t" \
948 " ADCS %1,%1,r1 \n\t" \
949 " ADC %2,%2,#0 \n\t" \
950 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "cc");
951
952 #define SQRADDSC(i, j) \
953 __asm__( \
954 " UMULL %0,%1,%3,%4 \n\t" \
955 " SUB %2,%2,%2 \n\t" \
956 :"=r"(sc0), "=r"(sc1), "=r"(sc2) : "r"(i), "r"(j) : "cc");
957
958 /* TAO removed sc0,1,2 as input to remove warning so %6,%7 become %3,%4 */
959
960 #define SQRADDAC(i, j) \
961 __asm__( \
962 " UMULL r0,r1,%6,%7 \n\t" \
963 " ADDS %0,%0,r0 \n\t" \
964 " ADCS %1,%1,r1 \n\t" \
965 " ADC %2,%2,#0 \n\t" \
966 :"=r"(sc0), "=r"(sc1), "=r"(sc2) : "0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j) : "r0", "r1", "cc");
967
968 #define SQRADDDB \
969 __asm__( \
970 " ADDS %0,%0,%3 \n\t" \
971 " ADCS %1,%1,%4 \n\t" \
972 " ADC %2,%2,%5 \n\t" \
973 " ADDS %0,%0,%3 \n\t" \
974 " ADCS %1,%1,%4 \n\t" \
975 " ADC %2,%2,%5 \n\t" \
976 :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");
977
978 #elif defined(TFM_PPC32)
979
980 /* PPC32 */
981
982 #define COMBA_START
983
984 #define CLEAR_CARRY \
985 c0 = c1 = c2 = 0;
986
987 #define COMBA_STORE(x) \
988 x = c0;
989
990 #define COMBA_STORE2(x) \
991 x = c1;
992
993 #define CARRY_FORWARD \
994 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
995
996 #define COMBA_FINI
997
998 /* multiplies digits i and j, accumulating the product into c0, c1 and c2 */
999 #define SQRADD(i, j) \
1000 __asm__( \
1001 " mullw 16,%6,%6 \n\t" \
1002 " addc %0,%0,16 \n\t" \
1003 " mulhwu 16,%6,%6 \n\t" \
1004 " adde %1,%1,16 \n\t" \
1005 " addze %2,%2 \n\t" \
1006 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"16","cc");
1007
1008 /* for squaring some of the terms are doubled... */
1009 #define SQRADD2(i, j) \
1010 __asm__( \
1011 " mullw 16,%6,%7 \n\t" \
1012 " mulhwu 17,%6,%7 \n\t" \
1013 " addc %0,%0,16 \n\t" \
1014 " adde %1,%1,17 \n\t" \
1015 " addze %2,%2 \n\t" \
1016 " addc %0,%0,16 \n\t" \
1017 " adde %1,%1,17 \n\t" \
1018 " addze %2,%2 \n\t" \
1019 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16", "17","cc");
1020
1021 #define SQRADDSC(i, j) \
1022 __asm__( \
1023 " mullw %0,%6,%7 \n\t" \
1024 " mulhwu %1,%6,%7 \n\t" \
1025 " xor %2,%2,%2 \n\t" \
1026 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "cc");
1027
1028 #define SQRADDAC(i, j) \
1029 __asm__( \
1030 " mullw 16,%6,%7 \n\t" \
1031 " addc %0,%0,16 \n\t" \
1032 " mulhwu 16,%6,%7 \n\t" \
1033 " adde %1,%1,16 \n\t" \
1034 " addze %2,%2 \n\t" \
1035 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"16", "cc");
1036
1037 #define SQRADDDB \
1038 __asm__( \
1039 " addc %0,%0,%3 \n\t" \
1040 " adde %1,%1,%4 \n\t" \
1041 " adde %2,%2,%5 \n\t" \
1042 " addc %0,%0,%3 \n\t" \
1043 " adde %1,%1,%4 \n\t" \
1044 " adde %2,%2,%5 \n\t" \
1045 :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");
1046
1047 #elif defined(TFM_PPC64)
1048 /* PPC64 */
1049
1050 #define COMBA_START
1051
1052 #define CLEAR_CARRY \
1053 c0 = c1 = c2 = 0;
1054
1055 #define COMBA_STORE(x) \
1056 x = c0;
1057
1058 #define COMBA_STORE2(x) \
1059 x = c1;
1060
1061 #define CARRY_FORWARD \
1062 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
1063
1064 #define COMBA_FINI
1065
1066 /* multiplies digits i and j, accumulating the product into c0, c1 and c2 */
1067 #define SQRADD(i, j) \
1068 __asm__( \
1069 " mulld r16,%6,%6 \n\t" \
1070 " addc %0,%0,r16 \n\t" \
1071 " mulhdu r16,%6,%6 \n\t" \
1072 " adde %1,%1,r16 \n\t" \
1073 " addze %2,%2 \n\t" \
1074 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"r16","cc");
1075
1076 /* for squaring some of the terms are doubled... */
1077 #define SQRADD2(i, j) \
1078 __asm__( \
1079 " mulld r16,%6,%7 \n\t" \
1080 " mulhdu r17,%6,%7 \n\t" \
1081 " addc %0,%0,r16 \n\t" \
1082 " adde %1,%1,r17 \n\t" \
1083 " addze %2,%2 \n\t" \
1084 " addc %0,%0,r16 \n\t" \
1085 " adde %1,%1,r17 \n\t" \
1086 " addze %2,%2 \n\t" \
1087 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"r16", "r17","cc");
1088
1089 #define SQRADDSC(i, j) \
1090 __asm__( \
1091 " mulld %0,%6,%7 \n\t" \
1092 " mulhdu %1,%6,%7 \n\t" \
1093 " xor %2,%2,%2 \n\t" \
1094 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "cc");
1095
1096 #define SQRADDAC(i, j) \
1097 __asm__( \
1098 " mulld r16,%6,%7 \n\t" \
1099 " addc %0,%0,r16 \n\t" \
1100 " mulhdu r16,%6,%7 \n\t" \
1101 " adde %1,%1,r16 \n\t" \
1102 " addze %2,%2 \n\t" \
1103 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"r16", "cc");
1104
1105 #define SQRADDDB \
1106 __asm__( \
1107 " addc %0,%0,%3 \n\t" \
1108 " adde %1,%1,%4 \n\t" \
1109 " adde %2,%2,%5 \n\t" \
1110 " addc %0,%0,%3 \n\t" \
1111 " adde %1,%1,%4 \n\t" \
1112 " adde %2,%2,%5 \n\t" \
1113 :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");
1114
1115
1116 #elif defined(TFM_AVR32)
1117
1118 /* AVR32 */
1119
1120 #define COMBA_START
1121
1122 #define CLEAR_CARRY \
1123 c0 = c1 = c2 = 0;
1124
1125 #define COMBA_STORE(x) \
1126 x = c0;
1127
1128 #define COMBA_STORE2(x) \
1129 x = c1;
1130
1131 #define CARRY_FORWARD \
1132 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
1133
1134 #define COMBA_FINI
1135
1136 /* multiplies digits i and j, accumulating the product into c0, c1 and c2 */
1137 #define SQRADD(i, j) \
1138 __asm__( \
1139 " mulu.d r2,%6,%6 \n\t" \
1140 " add %0,%0,r2 \n\t" \
1141 " adc %1,%1,r3 \n\t" \
1142 " acr %2 \n\t" \
1143 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"r2","r3");
1144
1145 /* for squaring some of the terms are doubled... */
1146 #define SQRADD2(i, j) \
1147 __asm__( \
1148 " mulu.d r2,%6,%7 \n\t" \
1149 " add %0,%0,r2 \n\t" \
1150 " adc %1,%1,r3 \n\t" \
1151 " acr %2, \n\t" \
1152 " add %0,%0,r2 \n\t" \
1153 " adc %1,%1,r3 \n\t" \
1154 " acr %2, \n\t" \
1155 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"r2", "r3");
1156
1157 #define SQRADDSC(i, j) \
1158 __asm__( \
1159 " mulu.d r2,%6,%7 \n\t" \
1160 " mov %0,r2 \n\t" \
1161 " mov %1,r3 \n\t" \
1162 " eor %2,%2 \n\t" \
1163 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "r2", "r3");
1164
1165 #define SQRADDAC(i, j) \
1166 __asm__( \
1167 " mulu.d r2,%6,%7 \n\t" \
1168 " add %0,%0,r2 \n\t" \
1169 " adc %1,%1,r3 \n\t" \
1170 " acr %2 \n\t" \
1171 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"r2", "r3");
1172
1173 #define SQRADDDB \
1174 __asm__( \
1175 " add %0,%0,%3 \n\t" \
1176 " adc %1,%1,%4 \n\t" \
1177 " adc %2,%2,%5 \n\t" \
1178 " add %0,%0,%3 \n\t" \
1179 " adc %1,%1,%4 \n\t" \
1180 " adc %2,%2,%5 \n\t" \
1181 :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");
1182
1183 #elif defined(TFM_MIPS)
1184
1185 /* MIPS */
1186 #define COMBA_START
1187
1188 #define CLEAR_CARRY \
1189 c0 = c1 = c2 = 0;
1190
1191 #define COMBA_STORE(x) \
1192 x = c0;
1193
1194 #define COMBA_STORE2(x) \
1195 x = c1;
1196
1197 #define CARRY_FORWARD \
1198 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
1199
1200 #define COMBA_FINI
1201
1202 /* multiplies digits i and j, accumulating the product into c0, c1 and c2 */
1203 #define SQRADD(i, j) \
1204 __asm__( \
1205 " multu %6,%6 \n\t" \
1206 " mflo $12 \n\t" \
1207 " mfhi $13 \n\t" \
1208 " addu %0,%0,$12 \n\t" \
1209 " sltu $12,%0,$12 \n\t" \
1210 " addu %1,%1,$13 \n\t" \
1211 " sltu $13,%1,$13 \n\t" \
1212 " addu %1,%1,$12 \n\t" \
1213 " sltu $12,%1,$12 \n\t" \
1214 " addu %2,%2,$13 \n\t" \
1215 " addu %2,%2,$12 \n\t" \
1216 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"$12","$13");
1217
1218 /* for squaring some of the terms are doubled... */
1219 #define SQRADD2(i, j) \
1220 __asm__( \
1221 " multu %6,%7 \n\t" \
1222 " mflo $12 \n\t" \
1223 " mfhi $13 \n\t" \
1224 \
1225 " addu %0,%0,$12 \n\t" \
1226 " sltu $14,%0,$12 \n\t" \
1227 " addu %1,%1,$13 \n\t" \
1228 " sltu $15,%1,$13 \n\t" \
1229 " addu %1,%1,$14 \n\t" \
1230 " sltu $14,%1,$14 \n\t" \
1231 " addu %2,%2,$15 \n\t" \
1232 " addu %2,%2,$14 \n\t" \
1233 \
1234 " addu %0,%0,$12 \n\t" \
1235 " sltu $14,%0,$12 \n\t" \
1236 " addu %1,%1,$13 \n\t" \
1237 " sltu $15,%1,$13 \n\t" \
1238 " addu %1,%1,$14 \n\t" \
1239 " sltu $14,%1,$14 \n\t" \
1240 " addu %2,%2,$15 \n\t" \
1241 " addu %2,%2,$14 \n\t" \
1242 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"$12", "$13", "$14", "$15");
1243
1244 #define SQRADDSC(i, j) \
1245 __asm__( \
1246 " multu %6,%7 \n\t" \
1247 " mflo %0 \n\t" \
1248 " mfhi %1 \n\t" \
1249 " xor %2,%2,%2 \n\t" \
1250 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "cc");
1251
1252 #define SQRADDAC(i, j) \
1253 __asm__( \
1254 " multu %6,%7 \n\t" \
1255 " mflo $12 \n\t" \
1256 " mfhi $13 \n\t" \
1257 " addu %0,%0,$12 \n\t" \
1258 " sltu $12,%0,$12 \n\t" \
1259 " addu %1,%1,$13 \n\t" \
1260 " sltu $13,%1,$13 \n\t" \
1261 " addu %1,%1,$12 \n\t" \
1262 " sltu $12,%1,$12 \n\t" \
1263 " addu %2,%2,$13 \n\t" \
1264 " addu %2,%2,$12 \n\t" \
1265 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"$12", "$13", "$14");
1266
1267 #define SQRADDDB \
1268 __asm__( \
1269 " addu %0,%0,%3 \n\t" \
1270 " sltu $10,%0,%3 \n\t" \
1271 " addu %1,%1,$10 \n\t" \
1272 " sltu $10,%1,$10 \n\t" \
1273 " addu %1,%1,%4 \n\t" \
1274 " sltu $11,%1,%4 \n\t" \
1275 " addu %2,%2,$10 \n\t" \
1276 " addu %2,%2,$11 \n\t" \
1277 " addu %2,%2,%5 \n\t" \
1278 \
1279 " addu %0,%0,%3 \n\t" \
1280 " sltu $10,%0,%3 \n\t" \
1281 " addu %1,%1,$10 \n\t" \
1282 " sltu $10,%1,$10 \n\t" \
1283 " addu %1,%1,%4 \n\t" \
1284 " sltu $11,%1,%4 \n\t" \
1285 " addu %2,%2,$10 \n\t" \
1286 " addu %2,%2,$11 \n\t" \
1287 " addu %2,%2,%5 \n\t" \
1288 :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "$10", "$11");
1289
1290 #else
1291
1292 #define TFM_ISO
1293
1294 /* ISO C portable code */
1295
1296 #define COMBA_START
1297
1298 #define CLEAR_CARRY \
1299 c0 = c1 = c2 = 0;
1300
1301 #define COMBA_STORE(x) \
1302 x = c0;
1303
1304 #define COMBA_STORE2(x) \
1305 x = c1;
1306
1307 #define CARRY_FORWARD \
1308 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
1309
1310 #define COMBA_FINI
1311
1312 /* multiplies digits i and j, accumulating the product into c0, c1 and c2 */
1313 #define SQRADD(i, j) \
1314 do { fp_word t; \
1315 t = c0 + ((fp_word)i) * ((fp_word)j); c0 = (fp_digit)t; \
1316 t = c1 + (t >> DIGIT_BIT); c1 = (fp_digit)t; \
1317 c2 +=(fp_digit) (t >> DIGIT_BIT); \
1318 } while (0);
1319
1320
1321 /* for squaring some of the terms are doubled... */
1322 #define SQRADD2(i, j) \
1323 do { fp_word t; \
1324 t = ((fp_word)i) * ((fp_word)j); \
1325 tt = (fp_word)c0 + t; c0 = (fp_digit)tt; \
1326 tt = (fp_word)c1 + (tt >> DIGIT_BIT); c1 = (fp_digit)tt; \
1327 c2 +=(fp_digit)(tt >> DIGIT_BIT); \
1328 tt = (fp_word)c0 + t; c0 = (fp_digit)tt; \
1329 tt = (fp_word)c1 + (tt >> DIGIT_BIT); c1 = (fp_digit)tt; \
1330 c2 +=(fp_digit)(tt >> DIGIT_BIT); \
1331 } while (0);
1332
1333 #define SQRADDSC(i, j) \
1334 do { fp_word t; \
1335 t = ((fp_word)i) * ((fp_word)j); \
1336 sc0 = (fp_digit)t; sc1 = (t >> DIGIT_BIT); sc2 = 0; \
1337 } while (0);
1338
1339 #define SQRADDAC(i, j) \
1340 do { fp_word t; \
1341 t = sc0 + ((fp_word)i) * ((fp_word)j); sc0 = (fp_digit)t; \
1342 t = sc1 + (t >> DIGIT_BIT); sc1 = (fp_digit)t; \
1343 sc2 += (fp_digit)(t >> DIGIT_BIT); \
1344 } while (0);
1345
1346 #define SQRADDDB \
1347 do { fp_word t; \
1348 t = ((fp_word)sc0) + ((fp_word)sc0) + c0; c0 = (fp_digit)t; \
1349 t = ((fp_word)sc1) + ((fp_word)sc1) + c1 + (t >> DIGIT_BIT); \
1350 c1 = (fp_digit)t; \
1351 c2 = c2 + (fp_digit)(((fp_word)sc2) + ((fp_word)sc2) + (t >> DIGIT_BIT)); \
1352 } while (0);
1353
1354 #endif
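
/* Illustrative only: the generated fp_sqr_comba_N.i files included below
 * compose the macros above column by column.  For a 2-digit input
 * a = a[1]*2^DIGIT_BIT + a[0] with result b[0..3], the pattern is roughly:
 *
 *     COMBA_START; CLEAR_CARRY;
 *     SQRADD(a[0], a[0]);                       COMBA_STORE(b[0]);
 *     CARRY_FORWARD;
 *     SQRADD2(a[0], a[1]);                      COMBA_STORE(b[1]);
 *     CARRY_FORWARD;
 *     SQRADD(a[1], a[1]);                       COMBA_STORE(b[2]);
 *     COMBA_STORE2(b[3]);
 *     COMBA_FINI;
 *
 * SQRADD2 handles the cross term a[0]*a[1], which appears twice in the square.
 */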
1355
1356 #ifdef TFM_SMALL_SET
1357 #include "fp_sqr_comba_small_set.i"
1358 #endif
1359
1360 #if defined(TFM_SQR3) && FP_SIZE >= 6
1361 #include "fp_sqr_comba_3.i"
1362 #endif
1363 #if defined(TFM_SQR4) && FP_SIZE >= 8
1364 #include "fp_sqr_comba_4.i"
1365 #endif
1366 #if defined(TFM_SQR6) && FP_SIZE >= 12
1367 #include "fp_sqr_comba_6.i"
1368 #endif
1369 #if defined(TFM_SQR7) && FP_SIZE >= 14
1370 #include "fp_sqr_comba_7.i"
1371 #endif
1372 #if defined(TFM_SQR8) && FP_SIZE >= 16
1373 #include "fp_sqr_comba_8.i"
1374 #endif
1375 #if defined(TFM_SQR9) && FP_SIZE >= 18
1376 #include "fp_sqr_comba_9.i"
1377 #endif
1378 #if defined(TFM_SQR12) && FP_SIZE >= 24
1379 #include "fp_sqr_comba_12.i"
1380 #endif
1381 #if defined(TFM_SQR17) && FP_SIZE >= 34
1382 #include "fp_sqr_comba_17.i"
1383 #endif
1384 #if defined(TFM_SQR20) && FP_SIZE >= 40
1385 #include "fp_sqr_comba_20.i"
1386 #endif
1387 #if defined(TFM_SQR24) && FP_SIZE >= 48
1388 #include "fp_sqr_comba_24.i"
1389 #endif
1390 #if defined(TFM_SQR28) && FP_SIZE >= 56
1391 #include "fp_sqr_comba_28.i"
1392 #endif
1393 #if defined(TFM_SQR32) && FP_SIZE >= 64
1394 #include "fp_sqr_comba_32.i"
1395 #endif
1396 #if defined(TFM_SQR48) && FP_SIZE >= 96
1397 #include "fp_sqr_comba_48.i"
1398 #endif
1399 #if defined(TFM_SQR64) && FP_SIZE >= 128
1400 #include "fp_sqr_comba_64.i"
1401 #endif
1402 /* end fp_sqr_comba.c asm */
1403
1404 /* start fp_mul_comba.c asm */
1405 /* these are the combas. Worship them. */
1406 #if defined(TFM_X86)
1407 /* Generic x86 optimized code */
1408
1409 /* anything you need at the start */
1410 #define COMBA_START
1411
1412 /* clear the chaining variables */
1413 #define COMBA_CLEAR \
1414 c0 = c1 = c2 = 0;
1415
1416 /* forward the carry to the next digit */
1417 #define COMBA_FORWARD \
1418 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
1419
1420 /* store the first sum */
1421 #define COMBA_STORE(x) \
1422 x = c0;
1423
1424 /* store the second sum [carry] */
1425 #define COMBA_STORE2(x) \
1426 x = c1;
1427
1428 /* anything you need at the end */
1429 #define COMBA_FINI
1430
1431 /* this should multiply i and j */
1432 #define MULADD(i, j) \
1433 __asm__( \
1434 "movl %6,%%eax \n\t" \
1435 "mull %7 \n\t" \
1436 "addl %%eax,%0 \n\t" \
1437 "adcl %%edx,%1 \n\t" \
1438 "adcl $0,%2 \n\t" \
1439 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx","cc");
1440
1441 #elif defined(TFM_X86_64)
1442 /* x86-64 optimized */
1443
1444 /* anything you need at the start */
1445 #define COMBA_START
1446
1447 /* clear the chaining variables */
1448 #define COMBA_CLEAR \
1449 c0 = c1 = c2 = 0;
1450
1451 /* forward the carry to the next digit */
1452 #define COMBA_FORWARD \
1453 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
1454
1455 /* store the first sum */
1456 #define COMBA_STORE(x) \
1457 x = c0;
1458
1459 /* store the second sum [carry] */
1460 #define COMBA_STORE2(x) \
1461 x = c1;
1462
1463 /* anything you need at the end */
1464 #define COMBA_FINI
1465
1466 /* this should multiply i and j */
1467 #define MULADD(i, j) \
1468 __asm__ ( \
1469 "movq %6,%%rax \n\t" \
1470 "mulq %7 \n\t" \
1471 "addq %%rax,%0 \n\t" \
1472 "adcq %%rdx,%1 \n\t" \
1473 "adcq $0,%2 \n\t" \
1474 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","cc");
1475
1476
1477 #if defined(HAVE_INTEL_MULX)
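/* MULADD_BODY multiplies the single digit a->dp[ix] by four digits of b
 * starting at b->dp[iy] and accumulates the products into c->dp[iz..iz+3],
 * carrying in and out through 'carry'.  As in MULX_INNERMUL8 above, MULX
 * leaves the flags untouched while ADCX (CF) and ADOX (OF) maintain two
 * separate carry chains. */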
1478 #define MULADD_BODY(a,b,carry,c) \
1479 __asm__ volatile( \
1480 "movq %[a0],%%rdx\n\t" \
1481 "xorq %%rcx, %%rcx\n\t" \
1482 "movq 0(%[cp]),%%r8\n\t" \
1483 "movq 8(%[cp]),%%r9\n\t" \
1484 "movq 16(%[cp]),%%r10\n\t" \
1485 "movq 24(%[cp]),%%r11\n\t" \
1486 \
1487 "mulx (%[bp]),%%rax, %%rbx\n\t" \
1488 "adcxq %[ca], %%r8\n\t" \
1489 "adoxq %%rax, %%r8\n\t" \
1490 "mulx 8(%[bp]),%%rax, %%rcx\n\t" \
1491 "adcxq %%rbx, %%r9\n\t" \
1492 "adoxq %%rax, %%r9\n\t" \
1493 "mulx 16(%[bp]),%%rax, %%rbx\n\t" \
1494 "adcxq %%rcx, %%r10\n\t" \
1495 "adoxq %%rax, %%r10\n\t" \
1496 "mulx 24(%[bp]),%%rax, %%rcx\n\t" \
1497 "adcxq %%rbx, %%r11\n\t" \
1498 "mov $0, %[ca]\n\t" \
1499 "adoxq %%rax, %%r11\n\t" \
1500 "adcxq %%rcx, %[ca]\n\t" \
1501 "mov $0, %%rdx\n\t" \
1502 "adoxq %%rdx, %[ca]\n\t" \
1503 \
1504 "movq %%r8, 0(%[cp])\n\t" \
1505 "movq %%r9, 8(%[cp])\n\t" \
1506 "movq %%r10, 16(%[cp])\n\t" \
1507 "movq %%r11, 24(%[cp])\n\t" \
1508 : [ca] "+r" (carry) \
1509 : [a0] "r" (a->dp[ix]), [bp] "r" (&(b->dp[iy])), \
1510 [cp] "r" (&(c->dp[iz])) \
1511 : "%r8", "%r9", "%r10", "%r11", \
1512 "%rdx", "%rax", "%rcx", "%rbx" \
1513 )
1514
1515 #define TFM_INTEL_MUL_COMBA(a, b, ca, c) \
1516 for (iz=0; iz<pa; iz++) c->dp[iz] = 0; \
1517 for (ix=0; ix<a->used; ix++) { \
1518 ca = 0; \
1519 for (iy=0; iy<b->used; iy+=4) { \
1520 iz = ix + iy; \
1521 MULADD_BODY(a, b, ca, c); \
1522 } \
1523 c->dp[ix + iy] = ca; \
1524 }
1525 #endif
1526
1527 #elif defined(TFM_SSE2)
1528 /* use SSE2 optimizations */
1529
1530 /* anything you need at the start */
1531 #define COMBA_START
1532
1533 /* clear the chaining variables */
1534 #define COMBA_CLEAR \
1535 c0 = c1 = c2 = 0;
1536
1537 /* forward the carry to the next digit */
1538 #define COMBA_FORWARD \
1539 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
1540
1541 /* store the first sum */
1542 #define COMBA_STORE(x) \
1543 x = c0;
1544
1545 /* store the second sum [carry] */
1546 #define COMBA_STORE2(x) \
1547 x = c1;
1548
1549 /* anything you need at the end */
1550 #define COMBA_FINI \
1551 __asm__("emms");
1552
1553 /* this should multiply i and j */
1554 #define MULADD(i, j) \
1555 __asm__( \
1556 "movd %6,%%mm0 \n\t" \
1557 "movd %7,%%mm1 \n\t" \
1558 "pmuludq %%mm1,%%mm0\n\t" \
1559 "movd %%mm0,%%eax \n\t" \
1560 "psrlq $32,%%mm0 \n\t" \
1561 "addl %%eax,%0 \n\t" \
1562 "movd %%mm0,%%eax \n\t" \
1563 "adcl %%eax,%1 \n\t" \
1564 "adcl $0,%2 \n\t" \
1565 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","cc");
1566
1567 #elif defined(TFM_ARM)
1568 /* ARM code */
1569
1570 #define COMBA_START
1571
1572 #define COMBA_CLEAR \
1573 c0 = c1 = c2 = 0;
1574
1575 #define COMBA_FORWARD \
1576 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
1577
1578 #define COMBA_STORE(x) \
1579 x = c0;
1580
1581 #define COMBA_STORE2(x) \
1582 x = c1;
1583
1584 #define COMBA_FINI
1585
1586 #define MULADD(i, j) \
1587 __asm__( \
1588 " UMULL r0,r1,%6,%7 \n\t" \
1589 " ADDS %0,%0,r0 \n\t" \
1590 " ADCS %1,%1,r1 \n\t" \
1591 " ADC %2,%2,#0 \n\t" \
1592 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "cc");
1593
1594 #elif defined(TFM_PPC32)
1595 /* For 32-bit PPC */
1596
1597 #define COMBA_START
1598
1599 #define COMBA_CLEAR \
1600 c0 = c1 = c2 = 0;
1601
1602 #define COMBA_FORWARD \
1603 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
1604
1605 #define COMBA_STORE(x) \
1606 x = c0;
1607
1608 #define COMBA_STORE2(x) \
1609 x = c1;
1610
1611 #define COMBA_FINI
1612
1613 /* untested: will mulhwu change the flags? Docs say no */
1614 #define MULADD(i, j) \
1615 __asm__( \
1616 " mullw 16,%6,%7 \n\t" \
1617 " addc %0,%0,16 \n\t" \
1618 " mulhwu 16,%6,%7 \n\t" \
1619 " adde %1,%1,16 \n\t" \
1620 " addze %2,%2 \n\t" \
1621 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16");
1622
1623 #elif defined(TFM_PPC64)
1624 /* For 64-bit PPC */
1625
1626 #define COMBA_START
1627
1628 #define COMBA_CLEAR \
1629 c0 = c1 = c2 = 0;
1630
1631 #define COMBA_FORWARD \
1632 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
1633
1634 #define COMBA_STORE(x) \
1635 x = c0;
1636
1637 #define COMBA_STORE2(x) \
1638 x = c1;
1639
1640 #define COMBA_FINI
1641
1642 /* untested: will mulhdu change the flags? Docs say no */
1643 #define MULADD(i, j) \
1644 __asm__( \
1645 " mulld r16,%6,%7 \n\t" \
1646 " addc %0,%0,16 \n\t" \
1647 " mulhdu r16,%6,%7 \n\t" \
1648 " adde %1,%1,16 \n\t" \
1649 " addze %2,%2 \n\t" \
1650 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"r16");
1651
1652 #elif defined(TFM_AVR32)
1653
1654 /* AVR32 */
1655
1656 #define COMBA_START
1657
1658 #define COMBA_CLEAR \
1659 c0 = c1 = c2 = 0;
1660
1661 #define COMBA_FORWARD \
1662 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
1663
1664 #define COMBA_STORE(x) \
1665 x = c0;
1666
1667 #define COMBA_STORE2(x) \
1668 x = c1;
1669
1670 #define COMBA_FINI
1671
1672 #define MULADD(i, j) \
1673 __asm__( \
1674 " mulu.d r2,%6,%7 \n\t"\
1675 " add %0,r2 \n\t"\
1676 " adc %1,%1,r3 \n\t"\
1677 " acr %2 \n\t"\
1678 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"r2","r3");
1679
1680 #elif defined(TFM_MIPS)
1681
1682 /* MIPS */
1683 #define COMBA_START
1684
1685 #define COMBA_CLEAR \
1686 c0 = c1 = c2 = 0;
1687
1688 #define COMBA_FORWARD \
1689 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
1690
1691 #define COMBA_STORE(x) \
1692 x = c0;
1693
1694 #define COMBA_STORE2(x) \
1695 x = c1;
1696
1697 #define COMBA_FINI
1698
1699 #define MULADD(i, j) \
1700 __asm__( \
1701 " multu %6,%7 \n\t" \
1702 " mflo $12 \n\t" \
1703 " mfhi $13 \n\t" \
1704 " addu %0,%0,$12 \n\t" \
1705 " sltu $12,%0,$12 \n\t" \
1706 " addu %1,%1,$13 \n\t" \
1707 " sltu $13,%1,$13 \n\t" \
1708 " addu %1,%1,$12 \n\t" \
1709 " sltu $12,%1,$12 \n\t" \
1710 " addu %2,%2,$13 \n\t" \
1711 " addu %2,%2,$12 \n\t" \
1712 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"$12","$13");
1713
1714 #else
1715 /* ISO C code */
1716
1717 #define COMBA_START
1718
1719 #define COMBA_CLEAR \
1720 c0 = c1 = c2 = 0;
1721
1722 #define COMBA_FORWARD \
1723 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
1724
1725 #define COMBA_STORE(x) \
1726 x = c0;
1727
1728 #define COMBA_STORE2(x) \
1729 x = c1;
1730
1731 #define COMBA_FINI
1732
1733 #define MULADD(i, j) \
1734 do { fp_word t; \
1735 t = (fp_word)c0 + ((fp_word)i) * ((fp_word)j); \
1736 c0 = (fp_digit)t; \
1737 t = (fp_word)c1 + (t >> DIGIT_BIT); \
1738 c1 = (fp_digit)t; \
1739 c2 += (fp_digit)(t >> DIGIT_BIT); \
1740 } while (0);
1741
1742 #endif
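
/* Illustrative only: the generated fp_mul_comba_N.i files included below
 * compose MULADD column by column.  For 2-digit inputs a and b with result
 * c[0..3], the pattern is roughly:
 *
 *     COMBA_START; COMBA_CLEAR;
 *     MULADD(a[0], b[0]);                       COMBA_STORE(c[0]);
 *     COMBA_FORWARD;
 *     MULADD(a[0], b[1]); MULADD(a[1], b[0]);   COMBA_STORE(c[1]);
 *     COMBA_FORWARD;
 *     MULADD(a[1], b[1]);                       COMBA_STORE(c[2]);
 *     COMBA_STORE2(c[3]);
 *     COMBA_FINI;
 */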
1743
1744
1745 #ifdef TFM_SMALL_SET
1746 #include "fp_mul_comba_small_set.i"
1747 #endif
1748
1749 #if defined(TFM_MUL3) && FP_SIZE >= 6
1750 #include "fp_mul_comba_3.i"
1751 #endif
1752 #if defined(TFM_MUL4) && FP_SIZE >= 8
1753 #include "fp_mul_comba_4.i"
1754 #endif
1755 #if defined(TFM_MUL6) && FP_SIZE >= 12
1756 #include "fp_mul_comba_6.i"
1757 #endif
1758 #if defined(TFM_MUL7) && FP_SIZE >= 14
1759 #include "fp_mul_comba_7.i"
1760 #endif
1761 #if defined(TFM_MUL8) && FP_SIZE >= 16
1762 #include "fp_mul_comba_8.i"
1763 #endif
1764 #if defined(TFM_MUL9) && FP_SIZE >= 18
1765 #include "fp_mul_comba_9.i"
1766 #endif
1767 #if defined(TFM_MUL12) && FP_SIZE >= 24
1768 #include "fp_mul_comba_12.i"
1769 #endif
1770 #if defined(TFM_MUL17) && FP_SIZE >= 34
1771 #include "fp_mul_comba_17.i"
1772 #endif
1773 #if defined(TFM_MUL20) && FP_SIZE >= 40
1774 #include "fp_mul_comba_20.i"
1775 #endif
1776 #if defined(TFM_MUL24) && FP_SIZE >= 48
1777 #include "fp_mul_comba_24.i"
1778 #endif
1779 #if defined(TFM_MUL28) && FP_SIZE >= 56
1780 #include "fp_mul_comba_28.i"
1781 #endif
1782 #if defined(TFM_MUL32) && FP_SIZE >= 64
1783 #include "fp_mul_comba_32.i"
1784 #endif
1785 #if defined(TFM_MUL48) && FP_SIZE >= 96
1786 #include "fp_mul_comba_48.i"
1787 #endif
1788 #if defined(TFM_MUL64) && FP_SIZE >= 128
1789 #include "fp_mul_comba_64.i"
1790 #endif
1791
1792 /* end fp_mul_comba.c asm */
1793
1794