1 /* TomsFastMath, a fast ISO C bignum library.
2 *
3 * This project is meant to fill in where LibTomMath
4 * falls short. That is speed ;-)
5 *
6 * This project is public domain and free for all purposes.
7 *
8 * Tom St Denis, tomstdenis@gmail.com
9 */
10
11 /* About this file...
12
13 */
14
15 #include "bignum_fast.h"
16
17 #if defined(TFM_PRESCOTT) && defined(TFM_SSE2)
18 #undef TFM_SSE2
19 #define TFM_X86
20 #endif
21
22 /* these are the combas. Worship them. */
23 #if defined(TFM_X86)
24 /* Generic x86 optimized code */
25
26 /* anything you need at the start */
27 #define COMBA_START
28
29 /* clear the chaining variables */
30 #define COMBA_CLEAR \
31 c0 = c1 = c2 = 0;
32
33 /* forward the carry to the next digit */
34 #define COMBA_FORWARD \
35 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
36
37 /* store the first sum */
38 #define COMBA_STORE(x) \
39 x = c0;
40
41 /* store the second sum [carry] */
42 #define COMBA_STORE2(x) \
43 x = c1;
44
45 /* anything you need at the end */
46 #define COMBA_FINI
47
48 /* this should multiply i and j */
49 #define MULADD(i, j) \
50 asm( \
51 "movl %6,%%eax \n\t" \
52 "mull %7 \n\t" \
53 "addl %%eax,%0 \n\t" \
54 "adcl %%edx,%1 \n\t" \
55 "adcl $0,%2 \n\t" \
56 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx","cc");
57
58 #elif defined(TFM_X86_64)
59 /* x86-64 optimized */
60
61 /* anything you need at the start */
62 #define COMBA_START
63
64 /* clear the chaining variables */
65 #define COMBA_CLEAR \
66 c0 = c1 = c2 = 0;
67
68 /* forward the carry to the next digit */
69 #define COMBA_FORWARD \
70 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
71
72 /* store the first sum */
73 #define COMBA_STORE(x) \
74 x = c0;
75
76 /* store the second sum [carry] */
77 #define COMBA_STORE2(x) \
78 x = c1;
79
80 /* anything you need at the end */
81 #define COMBA_FINI
82
83 /* this should multiply i and j */
84 #define MULADD(i, j) \
85 asm ( \
86 "movq %6,%%rax \n\t" \
87 "mulq %7 \n\t" \
88 "addq %%rax,%0 \n\t" \
89 "adcq %%rdx,%1 \n\t" \
90 "adcq $0,%2 \n\t" \
91 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","cc");
92
93 #elif defined(TFM_SSE2)
94 /* use SSE2 optimizations */
95
96 /* anything you need at the start */
97 #define COMBA_START
98
99 /* clear the chaining variables */
100 #define COMBA_CLEAR \
101 c0 = c1 = c2 = 0;
102
103 /* forward the carry to the next digit */
104 #define COMBA_FORWARD \
105 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
106
107 /* store the first sum */
108 #define COMBA_STORE(x) \
109 x = c0;
110
111 /* store the second sum [carry] */
112 #define COMBA_STORE2(x) \
113 x = c1;
114
115 /* anything you need at the end */
116 #define COMBA_FINI \
117 asm("emms");
118
119 /* this should multiply i and j */
120 #define MULADD(i, j) \
121 asm( \
122 "movd %6,%%mm0 \n\t" \
123 "movd %7,%%mm1 \n\t" \
124 "pmuludq %%mm1,%%mm0\n\t" \
125 "movd %%mm0,%%eax \n\t" \
126 "psrlq $32,%%mm0 \n\t" \
127 "addl %%eax,%0 \n\t" \
128 "movd %%mm0,%%eax \n\t" \
129 "adcl %%eax,%1 \n\t" \
130 "adcl $0,%2 \n\t" \
131 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","cc");
132
133 #elif defined(TFM_ARM)
134 /* ARM code */
135
136 #define COMBA_START
137
138 #define COMBA_CLEAR \
139 c0 = c1 = c2 = 0;
140
141 #define COMBA_FORWARD \
142 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
143
144 #define COMBA_STORE(x) \
145 x = c0;
146
147 #define COMBA_STORE2(x) \
148 x = c1;
149
150 #define COMBA_FINI
151
152 #define MULADD(i, j) \
153 asm( \
154 " UMULL r0,r1,%6,%7 \n\t" \
155 " ADDS %0,%0,r0 \n\t" \
156 " ADCS %1,%1,r1 \n\t" \
157 " ADC %2,%2,#0 \n\t" \
158 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "cc");
159
160 #elif defined(TFM_PPC32)
161 /* For 32-bit PPC */
162
163 #define COMBA_START
164
165 #define COMBA_CLEAR \
166 c0 = c1 = c2 = 0;
167
168 #define COMBA_FORWARD \
169 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
170
171 #define COMBA_STORE(x) \
172 x = c0;
173
174 #define COMBA_STORE2(x) \
175 x = c1;
176
177 #define COMBA_FINI
178
179 /* untested: will mulhwu change the flags? Docs say no */
180 #define MULADD(i, j) \
181 asm( \
182 " mullw 16,%6,%7 \n\t" \
183 " addc %0,%0,16 \n\t" \
184 " mulhwu 16,%6,%7 \n\t" \
185 " adde %1,%1,16 \n\t" \
186 " addze %2,%2 \n\t" \
187 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16");
188
189 #elif defined(TFM_PPC64)
190 /* For 64-bit PPC */
191
192 #define COMBA_START
193
194 #define COMBA_CLEAR \
195 c0 = c1 = c2 = 0;
196
197 #define COMBA_FORWARD \
198 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
199
200 #define COMBA_STORE(x) \
201 x = c0;
202
203 #define COMBA_STORE2(x) \
204 x = c1;
205
206 #define COMBA_FINI
207
208 /* untested: will mulhdu change the flags? Docs say no */
209 #define MULADD(i, j) \
210 asm( \
211 " mulld r16,%6,%7 \n\t" \
212 " addc %0,%0,16 \n\t" \
213 " mulhdu r16,%6,%7 \n\t" \
214 " adde %1,%1,16 \n\t" \
215 " addze %2,%2 \n\t" \
216 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"r16");
217
218 #elif defined(TFM_AVR32)
219
220 /* ISO C code */
221
222 #define COMBA_START
223
224 #define COMBA_CLEAR \
225 c0 = c1 = c2 = 0;
226
227 #define COMBA_FORWARD \
228 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
229
230 #define COMBA_STORE(x) \
231 x = c0;
232
233 #define COMBA_STORE2(x) \
234 x = c1;
235
236 #define COMBA_FINI
237
238 #define MULADD(i, j) \
239 asm( \
240 " mulu.d r2,%6,%7 \n\t"\
241 " add %0,r2 \n\t"\
242 " adc %1,%1,r3 \n\t"\
243 " acr %2 \n\t"\
244 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"r2","r3");
245
246 #elif defined(TFM_MIPS)
247
248 #define COMBA_START
249
250 #define COMBA_CLEAR \
251 c0 = c1 = c2 = 0;
252
253 #define COMBA_FORWARD \
254 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
255
256 #define COMBA_STORE(x) \
257 x = c0;
258
259 #define COMBA_STORE2(x) \
260 x = c1;
261
262 #define COMBA_FINI
263
264 #define MULADD(i, j) \
265 asm( \
266 " multu %6,%7 \n\t" \
267 " mflo $12 \n\t" \
268 " mfhi $13 \n\t" \
269 " addu %0,%0,$12 \n\t" \
270 " sltu $12,%0,$12 \n\t" \
271 " addu %1,%1,$13 \n\t" \
272 " sltu $13,%1,$13 \n\t" \
273 " addu %1,%1,$12 \n\t" \
274 " sltu $12,%1,$12 \n\t" \
275 " addu %2,%2,$13 \n\t" \
276 " addu %2,%2,$12 \n\t" \
277 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"$12","$13");
278
279 #else
280 /* ISO C code */
281
282 #define COMBA_START
283
284 #define COMBA_CLEAR \
285 c0 = c1 = c2 = 0;
286
287 #define COMBA_FORWARD \
288 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
289
290 #define COMBA_STORE(x) \
291 x = c0;
292
293 #define COMBA_STORE2(x) \
294 x = c1;
295
296 #define COMBA_FINI
297
298 #define MULADD(i, j) \
299 do { fp_word t; \
300 t = (fp_word)c0 + ((fp_word)i) * ((fp_word)j); c0 = t; \
301 t = (fp_word)c1 + (t >> DIGIT_BIT); c1 = t; c2 += t >> DIGIT_BIT; \
302 } while (0);
303
304 #endif
305
306 #ifndef TFM_DEFINES
307
308 /* generic PxQ multiplier */
fp_mul_comba(fp_int * A,fp_int * B,fp_int * C)309 void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C)
310 {
311 int ix, iy, iz, tx, ty, pa;
312 fp_digit c0, c1, c2, *tmpx, *tmpy;
313 fp_int tmp, *dst;
314
315 COMBA_START;
316 COMBA_CLEAR;
317
318 /* get size of output and trim */
319 pa = A->used + B->used;
320 if (pa >= FP_SIZE) {
321 pa = FP_SIZE-1;
322 }
323
324 if (A == C || B == C) {
325 fp_zero(&tmp);
326 dst = &tmp;
327 } else {
328 fp_zero(C);
329 dst = C;
330 }
331
332 for (ix = 0; ix < pa; ix++) {
333 /* get offsets into the two bignums */
334 ty = MIN(ix, B->used-1);
335 tx = ix - ty;
336
337 /* setup temp aliases */
338 tmpx = A->dp + tx;
339 tmpy = B->dp + ty;
340
341 /* this is the number of times the loop will iterrate, essentially its
342 while (tx++ < a->used && ty-- >= 0) { ... }
343 */
344 iy = MIN(A->used-tx, ty+1);
345
346 /* execute loop */
347 COMBA_FORWARD;
348 for (iz = 0; iz < iy; ++iz) {
349 MULADD(*tmpx++, *tmpy--);
350 }
351
352 /* store term */
353 COMBA_STORE(dst->dp[ix]);
354 }
355 COMBA_FINI;
356
357 dst->used = pa;
358 dst->sign = A->sign ^ B->sign;
359 fp_clamp(dst);
360 fp_copy(dst, C);
361 }
362
363 #endif
364
365 /* $Source: /cvs/libtom/tomsfastmath/src/mul/fp_mul_comba.c,v $ */
366 /* $Revision: 1.4 $ */
367 /* $Date: 2007/03/14 23:47:42 $ */
368
369