1 /**************************************************************************/
2 /*                                                                        */
3 /*                                 OCaml                                  */
4 /*                                                                        */
5 /*             Xavier Leroy, projet Cristal, INRIA Rocquencourt           */
6 /*                                                                        */
7 /*   Copyright 2003 Institut National de Recherche en Informatique et     */
8 /*     en Automatique.                                                    */
9 /*                                                                        */
10 /*   All rights reserved.  This file is distributed under the terms of    */
11 /*   the GNU Lesser General Public License version 2.1, with the          */
12 /*   special exception on linking described in the file LICENSE.          */
13 /*                                                                        */
14 /**************************************************************************/
15 
16 /* Code specific to the Intel IA32 (x86) architecture. */
17 
18 #define BngAdd2(res,carryout,arg1,arg2)                                     \
19   asm("xorl %1, %1 \n\t"                                                    \
20       "addl %3, %0 \n\t"                                                    \
21       "setc %b1"                                                            \
22       : "=r" (res), "=&q" (carryout)                                        \
23       : "0" (arg1), "rm" (arg2))
24 
25 #define BngSub2(res,carryout,arg1,arg2)                                     \
26   asm("xorl %1, %1 \n\t"                                                    \
27       "subl %3, %0 \n\t"                                                    \
28       "setc %b1"                                                            \
29       : "=r" (res), "=&q" (carryout)                                        \
30       : "0" (arg1), "rm" (arg2))
31 
32 #define BngMult(resh,resl,arg1,arg2)                                        \
33   asm("mull %3"                                                             \
34       : "=a" (resl), "=d" (resh)                                            \
35       : "a" (arg1), "r" (arg2))
36 
37 #define BngDiv(quo,rem,nh,nl,d)                                             \
38   asm("divl %4"                                                             \
39       : "=a" (quo), "=d" (rem)                                              \
40       : "a" (nl), "d" (nh), "r" (d))
41 
42 /* Reimplementation in asm of some of the bng operations. */
43 
bng_ia32_add(bng a,bngsize alen,bng b,bngsize blen,bngcarry carry)44 static bngcarry bng_ia32_add
45        (bng a/*[alen]*/, bngsize alen,
46         bng b/*[blen]*/, bngsize blen,
47         bngcarry carry)
48 {
49   bngdigit tmp;
50   alen -= blen;
51   if (blen > 0) {
52     asm("negb %b3 \n\t"
53         "1: \n\t"
54         "movl (%0), %4 \n\t"
55         "adcl (%1), %4 \n\t"
56         "movl %4, (%0) \n\t"
57         "leal 4(%0), %0 \n\t"
58         "leal 4(%1), %1 \n\t"
59         "decl %2 \n\t"
60         "jnz 1b \n\t"
61         "setc %b3"
62         : "+&r" (a), "+&r" (b), "+&r" (blen), "+&q" (carry), "=&r" (tmp));
63   }
64   if (carry == 0 || alen == 0) return carry;
65   do {
66     if (++(*a) != 0) return 0;
67     a++;
68   } while (--alen);
69   return 1;
70 }
71 
bng_ia32_sub(bng a,bngsize alen,bng b,bngsize blen,bngcarry carry)72 static bngcarry bng_ia32_sub
73        (bng a/*[alen]*/, bngsize alen,
74         bng b/*[blen]*/, bngsize blen,
75         bngcarry carry)
76 {
77   bngdigit tmp;
78   alen -= blen;
79   if (blen > 0) {
80     asm("negb %b3 \n\t"
81         "1: \n\t"
82         "movl (%0), %4 \n\t"
83         "sbbl (%1), %4 \n\t"
84         "movl %4, (%0) \n\t"
85         "leal 4(%0), %0 \n\t"
86         "leal 4(%1), %1 \n\t"
87         "decl %2 \n\t"
88         "jnz 1b \n\t"
89         "setc %b3"
90         : "+&r" (a), "+&r" (b), "+&r" (blen), "+&q" (carry), "=&r" (tmp));
91   }
92   if (carry == 0 || alen == 0) return carry;
93   do {
94     if ((*a)-- != 0) return 0;
95     a++;
96   } while (--alen);
97   return 1;
98 }
99 
bng_ia32_mult_add_digit(bng a,bngsize alen,bng b,bngsize blen,bngdigit d)100 static bngdigit bng_ia32_mult_add_digit
101      (bng a/*[alen]*/, bngsize alen,
102       bng b/*[blen]*/, bngsize blen,
103       bngdigit d)
104 {
105   bngdigit out;
106   bngcarry carry;
107 
108   alen -= blen;
109   out = 0;
110   if (blen > 0) {
111     asm("1: \n\t"
112         "movl (%1), %%eax \n\t"
113         "mull %4\n\t"           /* edx:eax = d * next digit of b */
114         "addl (%0), %%eax \n\t" /* add next digit of a to eax */
115         "adcl $0, %%edx \n\t"   /* accumulate carry in edx */
116         "addl %3, %%eax \n\t"   /* add out to eax */
117         "adcl $0, %%edx \n\t"   /* accumulate carry in edx */
118         "movl %%eax, (%0) \n\t" /* eax is next digit of result */
119         "movl %%edx, %3 \n\t"   /* edx is next out */
120         "leal 4(%0), %0 \n\t"
121         "leal 4(%1), %1 \n\t"
122         "decl %2 \n\t"
123         "jnz 1b"
124         : "+&r" (a), "+&r" (b), "+&r" (blen), "=m" (out)
125         : "m" (d)
126         : "eax", "edx");
127   }
128   if (alen == 0) return out;
129   /* current digit of a += out */
130   BngAdd2(*a, carry, *a, out);
131   a++;
132   alen--;
133   /* Propagate carry */
134   if (carry == 0 || alen == 0) return carry;
135   do {
136     if (++(*a) != 0) return 0;
137     a++;
138   } while (--alen);
139   return 1;
140 }
141 
bng_ia32_mult_sub_digit(bng a,bngsize alen,bng b,bngsize blen,bngdigit d)142 static bngdigit bng_ia32_mult_sub_digit
143      (bng a/*[alen]*/, bngsize alen,
144       bng b/*[blen]*/, bngsize blen,
145       bngdigit d)
146 {
147   bngdigit out, tmp;
148   bngcarry carry;
149 
150   alen -= blen;
151   out = 0;
152   if (blen > 0) {
153     asm("1: \n\t"
154         "movl (%1), %%eax \n\t"
155         "movl (%0), %4 \n\t"
156         "mull %5\n\t"           /* edx:eax = d * next digit of b */
157         "subl %%eax, %4 \n\t"   /* subtract eax from next digit of a */
158         "adcl $0, %%edx \n\t"   /* accumulate carry in edx */
159         "subl %3, %4 \n\t"      /* subtract out */
160         "adcl $0, %%edx \n\t"   /* accumulate carry in edx */
161         "movl %4, (%0) \n\t"    /* store next digit of result */
162         "movl %%edx, %3 \n\t"   /* edx is next out */
163         "leal 4(%0), %0 \n\t"
164         "leal 4(%1), %1 \n\t"
165         "decl %2 \n\t"
166         "jnz 1b"
167         : "+&r" (a), "+&r" (b), "=m" (blen), "=m" (out), "=&r" (tmp)
168         : "m" (d)
169         : "eax", "edx");
170   }
171   if (alen == 0) return out;
172   /* current digit of a -= out */
173   BngSub2(*a, carry, *a, out);
174   a++;
175   alen--;
176   /* Propagate carry */
177   if (carry == 0 || alen == 0) return carry;
178   do {
179     if ((*a)-- != 0) return 0;
180     a++;
181   } while (--alen);
182   return 1;
183 }
184 
185 /* This is another asm implementation of some of the bng operations,
186    using SSE2 operations to provide 64-bit arithmetic.
187    This is faster than the plain IA32 code above on the Pentium 4.
188    (Arithmetic operations with carry are slow on the Pentium 4). */
189 
190 #if BNG_ASM_LEVEL >= 2
191 
bng_ia32sse2_add(bng a,bngsize alen,bng b,bngsize blen,bngcarry carry)192 static bngcarry bng_ia32sse2_add
193        (bng a/*[alen]*/, bngsize alen,
194         bng b/*[blen]*/, bngsize blen,
195         bngcarry carry)
196 {
197   alen -= blen;
198   if (blen > 0) {
199     asm("movd %3, %%mm0 \n\t"       /* MM0 is carry */
200         "1: \n\t"
201         "movd (%0), %%mm1 \n\t"     /* MM1 is next digit of a */
202         "movd (%1), %%mm2 \n\t"     /* MM2 is next digit of b */
203         "paddq %%mm1, %%mm0 \n\t"   /* Add carry (64 bits) */
204         "paddq %%mm2, %%mm0 \n\t"   /* Add digits (64 bits) */
205         "movd %%mm0, (%0) \n\t"     /* Store low 32 bits of result */
206         "psrlq $32, %%mm0 \n\t"     /* Next carry is top 32 bits of results */
207         "addl $4, %0\n\t"
208         "addl $4, %1\n\t"
209         "subl $1, %2\n\t"
210         "jne 1b \n\t"
211         "movd %%mm0, %3 \n\t"
212         "emms"
213         : "+&r" (a), "+&r" (b), "+&r" (blen), "+&rm" (carry));
214   }
215   if (carry == 0 || alen == 0) return carry;
216   do {
217     if (++(*a) != 0) return 0;
218     a++;
219   } while (--alen);
220   return 1;
221 }
222 
bng_ia32sse2_sub(bng a,bngsize alen,bng b,bngsize blen,bngcarry carry)223 static bngcarry bng_ia32sse2_sub
224        (bng a/*[alen]*/, bngsize alen,
225         bng b/*[blen]*/, bngsize blen,
226         bngcarry carry)
227 {
228   alen -= blen;
229   if (blen > 0) {
230     asm("movd %3, %%mm0 \n\t"       /* MM0 is carry */
231         "1: \n\t"
232         "movd (%0), %%mm1 \n\t"     /* MM1 is next digit of a */
233         "movd (%1), %%mm2 \n\t"     /* MM2 is next digit of b */
234         "psubq %%mm0, %%mm1 \n\t"   /* Subtract carry (64 bits) */
235         "psubq %%mm2, %%mm1 \n\t"   /* Subtract digits (64 bits) */
236         "movd %%mm1, (%0) \n\t"     /* Store low 32 bits of result */
237         "psrlq $63, %%mm1 \n\t"     /* Next carry is sign bit of result */
238         "movq %%mm1, %%mm0 \n\t"
239         "addl $4, %0\n\t"
240         "addl $4, %1\n\t"
241         "subl $1, %2\n\t"
242         "jne 1b \n\t"
243         "movd %%mm0, %3 \n\t"
244         "emms"
245         : "+&r" (a), "+&r" (b), "+&r" (blen), "+&rm" (carry));
246   }
247   if (carry == 0 || alen == 0) return carry;
248   do {
249     if ((*a)-- != 0) return 0;
250     a++;
251   } while (--alen);
252   return 1;
253 }
254 
bng_ia32sse2_mult_add_digit(bng a,bngsize alen,bng b,bngsize blen,bngdigit d)255 static bngdigit bng_ia32sse2_mult_add_digit
256      (bng a/*[alen]*/, bngsize alen,
257       bng b/*[blen]*/, bngsize blen,
258       bngdigit d)
259 {
260   bngdigit out;
261   bngcarry carry;
262 
263   alen -= blen;
264   out = 0;
265   if (blen > 0) {
266     asm("pxor %%mm0, %%mm0 \n\t"      /* MM0 is carry */
267         "movd %4, %%mm7 \n\t"         /* MM7 is digit d */
268         "1: \n\t"
269         "movd (%0), %%mm1 \n\t"       /* MM1 is next digit of a */
270         "movd (%1), %%mm2 \n\t"       /* MM2 is next digit of b */
271         "pmuludq %%mm7, %%mm2 \n\t"   /* MM2 = d * digit of b */
272         "paddq %%mm1, %%mm0 \n\t"     /* Add product and carry ... */
273         "paddq %%mm2, %%mm0 \n\t"     /* ... and digit of a */
274         "movd %%mm0, (%0) \n\t"       /* Store low 32 bits of result */
275         "psrlq $32, %%mm0 \n\t"       /* Next carry is high 32 bits result */
276         "addl $4, %0\n\t"
277         "addl $4, %1\n\t"
278         "subl $1, %2\n\t"
279         "jne 1b \n\t"
280         "movd %%mm0, %3 \n\t"
281         "emms"
282         : "+&r" (a), "+&r" (b), "+&r" (blen), "=&rm" (out)
283         : "m" (d));
284   }
285   if (alen == 0) return out;
286   /* current digit of a += out */
287   BngAdd2(*a, carry, *a, out);
288   a++;
289   alen--;
290   /* Propagate carry */
291   if (carry == 0 || alen == 0) return carry;
292   do {
293     if (++(*a) != 0) return 0;
294     a++;
295   } while (--alen);
296   return 1;
297 }
298 
bng_ia32sse2_mult_sub_digit(bng a,bngsize alen,bng b,bngsize blen,bngdigit d)299 static bngdigit bng_ia32sse2_mult_sub_digit
300      (bng a/*[alen]*/, bngsize alen,
301       bng b/*[blen]*/, bngsize blen,
302       bngdigit d)
303 {
304   static unsigned long long bias1 = 0xFFFFFFFF00000000ULL - 0xFFFFFFFFULL;
305   static unsigned long bias2 = 0xFFFFFFFFUL;
306   bngdigit out;
307   bngcarry carry;
308 
309   alen -= blen;
310   out = 0;
311   if (blen > 0) {
312     /* Carry C is represented by ENC(C) = 0xFFFFFFFF - C (one's complement) */
313     asm("movd %6, %%mm0 \n\t"         /* MM0 is carry (initially 0xFFFFFFFF) */
314         "movq %5, %%mm6 \n\t"         /* MM6 is magic constant bias1 */
315         "movd %4, %%mm7 \n\t"         /* MM7 is digit d */
316         "1: \n\t"
317         "movd (%0), %%mm1 \n\t"       /* MM1 is next digit of a */
318         "movd (%1), %%mm2 \n\t"       /* MM2 is next digit of b */
319         "paddq %%mm6, %%mm1 \n\t"     /* bias digit of a */
320         "pmuludq %%mm7, %%mm2 \n\t"   /* MM2 = d * digit of b */
321         /* Compute
322            digit of a + ENC(carry) + 0xFFFFFFFF00000000 - 0xFFFFFFFF - product
323            = digit of a - carry + 0xFFFFFFFF00000000 - product
324            = digit of a - carry - productlow + (ENC(nextcarry) << 32) */
325         "psubq %%mm2, %%mm1 \n\t"
326         "paddq %%mm1, %%mm0 \n\t"
327         "movd %%mm0, (%0) \n\t"       /* Store low 32 bits of result */
328         "psrlq $32, %%mm0 \n\t"       /* Next carry is 32 high bits of result */
329         "addl $4, %0\n\t"
330         "addl $4, %1\n\t"
331         "subl $1, %2\n\t"
332         "jne 1b \n\t"
333         "movd %%mm0, %3 \n\t"
334         "emms"
335         : "+&r" (a), "+&r" (b), "+&r" (blen), "=&rm" (out)
336         : "m" (d), "m" (bias1), "m" (bias2));
337     out = ~out; /* Undo encoding on out digit */
338   }
339   if (alen == 0) return out;
340   /* current digit of a -= out */
341   BngSub2(*a, carry, *a, out);
342   a++;
343   alen--;
344   /* Propagate carry */
345   if (carry == 0 || alen == 0) return carry;
346   do {
347     if ((*a)-- != 0) return 0;
348     a++;
349   } while (--alen);
350   return 1;
351 }
352 
353 /* Detect whether SSE2 instructions are supported */
354 
bng_ia32_sse2_supported(void)355 static int bng_ia32_sse2_supported(void)
356 {
357   unsigned int flags, newflags, max_id, capabilities;
358 
359 #define EFLAG_CPUID 0x00200000
360 #define CPUID_IDENTIFY 0
361 #define CPUID_CAPABILITIES 1
362 #define SSE2_CAPABILITY 26
363 
364   /* Check if processor has CPUID instruction */
365   asm("pushfl \n\t"
366       "popl %0"
367       : "=r" (flags) : );
368   newflags = flags ^ EFLAG_CPUID;   /* CPUID detection flag */
369   asm("pushfl \n\t"
370       "pushl %1 \n\t"
371       "popfl \n\t"
372       "pushfl \n\t"
373       "popl %0 \n\t"
374       "popfl"
375       : "=r" (flags) : "r" (newflags));
376   /* If CPUID detection flag cannot be changed, CPUID instruction is not
377      available */
378   if ((flags & EFLAG_CPUID) != (newflags & EFLAG_CPUID)) return 0;
379   /* See if SSE2 extensions are supported */
380   asm("pushl %%ebx \n\t"        /* need to preserve %ebx for PIC */
381       "cpuid \n\t"
382       "popl %%ebx"
383       : "=a" (max_id) : "a" (CPUID_IDENTIFY): "ecx", "edx");
384   if (max_id < 1) return 0;
385   asm("pushl %%ebx \n\t"
386       "cpuid \n\t"
387       "popl %%ebx"
388       : "=d" (capabilities) : "a" (CPUID_CAPABILITIES) : "ecx");
389   return capabilities & (1 << SSE2_CAPABILITY);
390 }
391 
392 #endif
393 
bng_ia32_setup_ops(void)394 static void bng_ia32_setup_ops(void)
395 {
396 #if BNG_ASM_LEVEL >= 2
397   if (bng_ia32_sse2_supported()) {
398     bng_ops.add = bng_ia32sse2_add;
399     bng_ops.sub = bng_ia32sse2_sub;
400     bng_ops.mult_add_digit = bng_ia32sse2_mult_add_digit;
401     bng_ops.mult_sub_digit = bng_ia32sse2_mult_sub_digit;
402     return;
403   }
404 #endif
405   bng_ops.add = bng_ia32_add;
406   bng_ops.sub = bng_ia32_sub;
407   bng_ops.mult_add_digit = bng_ia32_mult_add_digit;
408   bng_ops.mult_sub_digit = bng_ia32_mult_sub_digit;
409 }
410 
411 #define BNG_SETUP_OPS bng_ia32_setup_ops()
412