1 /**************************************************************************/
2 /* */
3 /* OCaml */
4 /* */
5 /* Xavier Leroy, projet Cristal, INRIA Rocquencourt */
6 /* */
7 /* Copyright 2003 Institut National de Recherche en Informatique et */
8 /* en Automatique. */
9 /* */
10 /* All rights reserved. This file is distributed under the terms of */
11 /* the GNU Lesser General Public License version 2.1, with the */
12 /* special exception on linking described in the file LICENSE. */
13 /* */
14 /**************************************************************************/
15
16 /* Code specific to the Intel IA32 (x86) architecture. */
17
18 #define BngAdd2(res,carryout,arg1,arg2) \
19 asm("xorl %1, %1 \n\t" \
20 "addl %3, %0 \n\t" \
21 "setc %b1" \
22 : "=r" (res), "=&q" (carryout) \
23 : "0" (arg1), "rm" (arg2))
24
25 #define BngSub2(res,carryout,arg1,arg2) \
26 asm("xorl %1, %1 \n\t" \
27 "subl %3, %0 \n\t" \
28 "setc %b1" \
29 : "=r" (res), "=&q" (carryout) \
30 : "0" (arg1), "rm" (arg2))
31
32 #define BngMult(resh,resl,arg1,arg2) \
33 asm("mull %3" \
34 : "=a" (resl), "=d" (resh) \
35 : "a" (arg1), "r" (arg2))
36
37 #define BngDiv(quo,rem,nh,nl,d) \
38 asm("divl %4" \
39 : "=a" (quo), "=d" (rem) \
40 : "a" (nl), "d" (nh), "r" (d))
41
42 /* Reimplementation in asm of some of the bng operations. */
43
bng_ia32_add(bng a,bngsize alen,bng b,bngsize blen,bngcarry carry)44 static bngcarry bng_ia32_add
45 (bng a/*[alen]*/, bngsize alen,
46 bng b/*[blen]*/, bngsize blen,
47 bngcarry carry)
48 {
49 bngdigit tmp;
50 alen -= blen;
51 if (blen > 0) {
52 asm("negb %b3 \n\t"
53 "1: \n\t"
54 "movl (%0), %4 \n\t"
55 "adcl (%1), %4 \n\t"
56 "movl %4, (%0) \n\t"
57 "leal 4(%0), %0 \n\t"
58 "leal 4(%1), %1 \n\t"
59 "decl %2 \n\t"
60 "jnz 1b \n\t"
61 "setc %b3"
62 : "+&r" (a), "+&r" (b), "+&r" (blen), "+&q" (carry), "=&r" (tmp));
63 }
64 if (carry == 0 || alen == 0) return carry;
65 do {
66 if (++(*a) != 0) return 0;
67 a++;
68 } while (--alen);
69 return 1;
70 }
71
bng_ia32_sub(bng a,bngsize alen,bng b,bngsize blen,bngcarry carry)72 static bngcarry bng_ia32_sub
73 (bng a/*[alen]*/, bngsize alen,
74 bng b/*[blen]*/, bngsize blen,
75 bngcarry carry)
76 {
77 bngdigit tmp;
78 alen -= blen;
79 if (blen > 0) {
80 asm("negb %b3 \n\t"
81 "1: \n\t"
82 "movl (%0), %4 \n\t"
83 "sbbl (%1), %4 \n\t"
84 "movl %4, (%0) \n\t"
85 "leal 4(%0), %0 \n\t"
86 "leal 4(%1), %1 \n\t"
87 "decl %2 \n\t"
88 "jnz 1b \n\t"
89 "setc %b3"
90 : "+&r" (a), "+&r" (b), "+&r" (blen), "+&q" (carry), "=&r" (tmp));
91 }
92 if (carry == 0 || alen == 0) return carry;
93 do {
94 if ((*a)-- != 0) return 0;
95 a++;
96 } while (--alen);
97 return 1;
98 }
99
bng_ia32_mult_add_digit(bng a,bngsize alen,bng b,bngsize blen,bngdigit d)100 static bngdigit bng_ia32_mult_add_digit
101 (bng a/*[alen]*/, bngsize alen,
102 bng b/*[blen]*/, bngsize blen,
103 bngdigit d)
104 {
105 bngdigit out;
106 bngcarry carry;
107
108 alen -= blen;
109 out = 0;
110 if (blen > 0) {
111 asm("1: \n\t"
112 "movl (%1), %%eax \n\t"
113 "mull %4\n\t" /* edx:eax = d * next digit of b */
114 "addl (%0), %%eax \n\t" /* add next digit of a to eax */
115 "adcl $0, %%edx \n\t" /* accumulate carry in edx */
116 "addl %3, %%eax \n\t" /* add out to eax */
117 "adcl $0, %%edx \n\t" /* accumulate carry in edx */
118 "movl %%eax, (%0) \n\t" /* eax is next digit of result */
119 "movl %%edx, %3 \n\t" /* edx is next out */
120 "leal 4(%0), %0 \n\t"
121 "leal 4(%1), %1 \n\t"
122 "decl %2 \n\t"
123 "jnz 1b"
124 : "+&r" (a), "+&r" (b), "+&r" (blen), "=m" (out)
125 : "m" (d)
126 : "eax", "edx");
127 }
128 if (alen == 0) return out;
129 /* current digit of a += out */
130 BngAdd2(*a, carry, *a, out);
131 a++;
132 alen--;
133 /* Propagate carry */
134 if (carry == 0 || alen == 0) return carry;
135 do {
136 if (++(*a) != 0) return 0;
137 a++;
138 } while (--alen);
139 return 1;
140 }
141
bng_ia32_mult_sub_digit(bng a,bngsize alen,bng b,bngsize blen,bngdigit d)142 static bngdigit bng_ia32_mult_sub_digit
143 (bng a/*[alen]*/, bngsize alen,
144 bng b/*[blen]*/, bngsize blen,
145 bngdigit d)
146 {
147 bngdigit out, tmp;
148 bngcarry carry;
149
150 alen -= blen;
151 out = 0;
152 if (blen > 0) {
153 asm("1: \n\t"
154 "movl (%1), %%eax \n\t"
155 "movl (%0), %4 \n\t"
156 "mull %5\n\t" /* edx:eax = d * next digit of b */
157 "subl %%eax, %4 \n\t" /* subtract eax from next digit of a */
158 "adcl $0, %%edx \n\t" /* accumulate carry in edx */
159 "subl %3, %4 \n\t" /* subtract out */
160 "adcl $0, %%edx \n\t" /* accumulate carry in edx */
161 "movl %4, (%0) \n\t" /* store next digit of result */
162 "movl %%edx, %3 \n\t" /* edx is next out */
163 "leal 4(%0), %0 \n\t"
164 "leal 4(%1), %1 \n\t"
165 "decl %2 \n\t"
166 "jnz 1b"
167 : "+&r" (a), "+&r" (b), "=m" (blen), "=m" (out), "=&r" (tmp)
168 : "m" (d)
169 : "eax", "edx");
170 }
171 if (alen == 0) return out;
172 /* current digit of a -= out */
173 BngSub2(*a, carry, *a, out);
174 a++;
175 alen--;
176 /* Propagate carry */
177 if (carry == 0 || alen == 0) return carry;
178 do {
179 if ((*a)-- != 0) return 0;
180 a++;
181 } while (--alen);
182 return 1;
183 }
184
185 /* This is another asm implementation of some of the bng operations,
186 using SSE2 operations to provide 64-bit arithmetic.
187 This is faster than the plain IA32 code above on the Pentium 4.
188 (Arithmetic operations with carry are slow on the Pentium 4). */
189
190 #if BNG_ASM_LEVEL >= 2
191
bng_ia32sse2_add(bng a,bngsize alen,bng b,bngsize blen,bngcarry carry)192 static bngcarry bng_ia32sse2_add
193 (bng a/*[alen]*/, bngsize alen,
194 bng b/*[blen]*/, bngsize blen,
195 bngcarry carry)
196 {
197 alen -= blen;
198 if (blen > 0) {
199 asm("movd %3, %%mm0 \n\t" /* MM0 is carry */
200 "1: \n\t"
201 "movd (%0), %%mm1 \n\t" /* MM1 is next digit of a */
202 "movd (%1), %%mm2 \n\t" /* MM2 is next digit of b */
203 "paddq %%mm1, %%mm0 \n\t" /* Add carry (64 bits) */
204 "paddq %%mm2, %%mm0 \n\t" /* Add digits (64 bits) */
205 "movd %%mm0, (%0) \n\t" /* Store low 32 bits of result */
206 "psrlq $32, %%mm0 \n\t" /* Next carry is top 32 bits of results */
207 "addl $4, %0\n\t"
208 "addl $4, %1\n\t"
209 "subl $1, %2\n\t"
210 "jne 1b \n\t"
211 "movd %%mm0, %3 \n\t"
212 "emms"
213 : "+&r" (a), "+&r" (b), "+&r" (blen), "+&rm" (carry));
214 }
215 if (carry == 0 || alen == 0) return carry;
216 do {
217 if (++(*a) != 0) return 0;
218 a++;
219 } while (--alen);
220 return 1;
221 }
222
bng_ia32sse2_sub(bng a,bngsize alen,bng b,bngsize blen,bngcarry carry)223 static bngcarry bng_ia32sse2_sub
224 (bng a/*[alen]*/, bngsize alen,
225 bng b/*[blen]*/, bngsize blen,
226 bngcarry carry)
227 {
228 alen -= blen;
229 if (blen > 0) {
230 asm("movd %3, %%mm0 \n\t" /* MM0 is carry */
231 "1: \n\t"
232 "movd (%0), %%mm1 \n\t" /* MM1 is next digit of a */
233 "movd (%1), %%mm2 \n\t" /* MM2 is next digit of b */
234 "psubq %%mm0, %%mm1 \n\t" /* Subtract carry (64 bits) */
235 "psubq %%mm2, %%mm1 \n\t" /* Subtract digits (64 bits) */
236 "movd %%mm1, (%0) \n\t" /* Store low 32 bits of result */
237 "psrlq $63, %%mm1 \n\t" /* Next carry is sign bit of result */
238 "movq %%mm1, %%mm0 \n\t"
239 "addl $4, %0\n\t"
240 "addl $4, %1\n\t"
241 "subl $1, %2\n\t"
242 "jne 1b \n\t"
243 "movd %%mm0, %3 \n\t"
244 "emms"
245 : "+&r" (a), "+&r" (b), "+&r" (blen), "+&rm" (carry));
246 }
247 if (carry == 0 || alen == 0) return carry;
248 do {
249 if ((*a)-- != 0) return 0;
250 a++;
251 } while (--alen);
252 return 1;
253 }
254
bng_ia32sse2_mult_add_digit(bng a,bngsize alen,bng b,bngsize blen,bngdigit d)255 static bngdigit bng_ia32sse2_mult_add_digit
256 (bng a/*[alen]*/, bngsize alen,
257 bng b/*[blen]*/, bngsize blen,
258 bngdigit d)
259 {
260 bngdigit out;
261 bngcarry carry;
262
263 alen -= blen;
264 out = 0;
265 if (blen > 0) {
266 asm("pxor %%mm0, %%mm0 \n\t" /* MM0 is carry */
267 "movd %4, %%mm7 \n\t" /* MM7 is digit d */
268 "1: \n\t"
269 "movd (%0), %%mm1 \n\t" /* MM1 is next digit of a */
270 "movd (%1), %%mm2 \n\t" /* MM2 is next digit of b */
271 "pmuludq %%mm7, %%mm2 \n\t" /* MM2 = d * digit of b */
272 "paddq %%mm1, %%mm0 \n\t" /* Add product and carry ... */
273 "paddq %%mm2, %%mm0 \n\t" /* ... and digit of a */
274 "movd %%mm0, (%0) \n\t" /* Store low 32 bits of result */
275 "psrlq $32, %%mm0 \n\t" /* Next carry is high 32 bits result */
276 "addl $4, %0\n\t"
277 "addl $4, %1\n\t"
278 "subl $1, %2\n\t"
279 "jne 1b \n\t"
280 "movd %%mm0, %3 \n\t"
281 "emms"
282 : "+&r" (a), "+&r" (b), "+&r" (blen), "=&rm" (out)
283 : "m" (d));
284 }
285 if (alen == 0) return out;
286 /* current digit of a += out */
287 BngAdd2(*a, carry, *a, out);
288 a++;
289 alen--;
290 /* Propagate carry */
291 if (carry == 0 || alen == 0) return carry;
292 do {
293 if (++(*a) != 0) return 0;
294 a++;
295 } while (--alen);
296 return 1;
297 }
298
bng_ia32sse2_mult_sub_digit(bng a,bngsize alen,bng b,bngsize blen,bngdigit d)299 static bngdigit bng_ia32sse2_mult_sub_digit
300 (bng a/*[alen]*/, bngsize alen,
301 bng b/*[blen]*/, bngsize blen,
302 bngdigit d)
303 {
304 static unsigned long long bias1 = 0xFFFFFFFF00000000ULL - 0xFFFFFFFFULL;
305 static unsigned long bias2 = 0xFFFFFFFFUL;
306 bngdigit out;
307 bngcarry carry;
308
309 alen -= blen;
310 out = 0;
311 if (blen > 0) {
312 /* Carry C is represented by ENC(C) = 0xFFFFFFFF - C (one's complement) */
313 asm("movd %6, %%mm0 \n\t" /* MM0 is carry (initially 0xFFFFFFFF) */
314 "movq %5, %%mm6 \n\t" /* MM6 is magic constant bias1 */
315 "movd %4, %%mm7 \n\t" /* MM7 is digit d */
316 "1: \n\t"
317 "movd (%0), %%mm1 \n\t" /* MM1 is next digit of a */
318 "movd (%1), %%mm2 \n\t" /* MM2 is next digit of b */
319 "paddq %%mm6, %%mm1 \n\t" /* bias digit of a */
320 "pmuludq %%mm7, %%mm2 \n\t" /* MM2 = d * digit of b */
321 /* Compute
322 digit of a + ENC(carry) + 0xFFFFFFFF00000000 - 0xFFFFFFFF - product
323 = digit of a - carry + 0xFFFFFFFF00000000 - product
324 = digit of a - carry - productlow + (ENC(nextcarry) << 32) */
325 "psubq %%mm2, %%mm1 \n\t"
326 "paddq %%mm1, %%mm0 \n\t"
327 "movd %%mm0, (%0) \n\t" /* Store low 32 bits of result */
328 "psrlq $32, %%mm0 \n\t" /* Next carry is 32 high bits of result */
329 "addl $4, %0\n\t"
330 "addl $4, %1\n\t"
331 "subl $1, %2\n\t"
332 "jne 1b \n\t"
333 "movd %%mm0, %3 \n\t"
334 "emms"
335 : "+&r" (a), "+&r" (b), "+&r" (blen), "=&rm" (out)
336 : "m" (d), "m" (bias1), "m" (bias2));
337 out = ~out; /* Undo encoding on out digit */
338 }
339 if (alen == 0) return out;
340 /* current digit of a -= out */
341 BngSub2(*a, carry, *a, out);
342 a++;
343 alen--;
344 /* Propagate carry */
345 if (carry == 0 || alen == 0) return carry;
346 do {
347 if ((*a)-- != 0) return 0;
348 a++;
349 } while (--alen);
350 return 1;
351 }
352
353 /* Detect whether SSE2 instructions are supported */
354
bng_ia32_sse2_supported(void)355 static int bng_ia32_sse2_supported(void)
356 {
357 unsigned int flags, newflags, max_id, capabilities;
358
359 #define EFLAG_CPUID 0x00200000
360 #define CPUID_IDENTIFY 0
361 #define CPUID_CAPABILITIES 1
362 #define SSE2_CAPABILITY 26
363
364 /* Check if processor has CPUID instruction */
365 asm("pushfl \n\t"
366 "popl %0"
367 : "=r" (flags) : );
368 newflags = flags ^ EFLAG_CPUID; /* CPUID detection flag */
369 asm("pushfl \n\t"
370 "pushl %1 \n\t"
371 "popfl \n\t"
372 "pushfl \n\t"
373 "popl %0 \n\t"
374 "popfl"
375 : "=r" (flags) : "r" (newflags));
376 /* If CPUID detection flag cannot be changed, CPUID instruction is not
377 available */
378 if ((flags & EFLAG_CPUID) != (newflags & EFLAG_CPUID)) return 0;
379 /* See if SSE2 extensions are supported */
380 asm("pushl %%ebx \n\t" /* need to preserve %ebx for PIC */
381 "cpuid \n\t"
382 "popl %%ebx"
383 : "=a" (max_id) : "a" (CPUID_IDENTIFY): "ecx", "edx");
384 if (max_id < 1) return 0;
385 asm("pushl %%ebx \n\t"
386 "cpuid \n\t"
387 "popl %%ebx"
388 : "=d" (capabilities) : "a" (CPUID_CAPABILITIES) : "ecx");
389 return capabilities & (1 << SSE2_CAPABILITY);
390 }
391
392 #endif
393
bng_ia32_setup_ops(void)394 static void bng_ia32_setup_ops(void)
395 {
396 #if BNG_ASM_LEVEL >= 2
397 if (bng_ia32_sse2_supported()) {
398 bng_ops.add = bng_ia32sse2_add;
399 bng_ops.sub = bng_ia32sse2_sub;
400 bng_ops.mult_add_digit = bng_ia32sse2_mult_add_digit;
401 bng_ops.mult_sub_digit = bng_ia32sse2_mult_sub_digit;
402 return;
403 }
404 #endif
405 bng_ops.add = bng_ia32_add;
406 bng_ops.sub = bng_ia32_sub;
407 bng_ops.mult_add_digit = bng_ia32_mult_add_digit;
408 bng_ops.mult_sub_digit = bng_ia32_mult_sub_digit;
409 }
410
411 #define BNG_SETUP_OPS bng_ia32_setup_ops()
412