1 /*
2 Copyright (C) 2018 Fredrik Johansson
3
4 This file is part of Arb.
5
6 Arb is free software: you can redistribute it and/or modify it under
7 the terms of the GNU Lesser General Public License (LGPL) as published
8 by the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version. See <http://www.gnu.org/licenses/>.
10 */
11
12 #include "acb.h"
13
14 /* We need uint64_t instead of mp_limb_t on 32-bit systems for
15 safe summation of 30-bit error bounds. */
16 #include <stdint.h>
17
18 /* The following macros are found in FLINT's longlong.h, but
19 the release version is out of date. */
20
/* x86 : 64 bit */
#if (GMP_LIMB_BITS == 64 && defined (__amd64__))

/* Three-limb addition (sh,sm,sl) = (ah,am,al) + (bh,bm,bl).
   The carry propagates through the CPU flags across the
   addq/adcq sequence, so the three instructions must stay adjacent. */
#define add_sssaaaaaa2(sh, sm, sl, ah, am, al, bh, bm, bl)  \
  __asm__ ("addq %8,%q2\n\tadcq %6,%q1\n\tadcq %4,%q0"      \
       : "=r" (sh), "=&r" (sm), "=&r" (sl)                  \
       : "0" ((mp_limb_t)(ah)), "rme" ((mp_limb_t)(bh)),    \
         "1" ((mp_limb_t)(am)), "rme" ((mp_limb_t)(bm)),    \
         "2" ((mp_limb_t)(al)), "rme" ((mp_limb_t)(bl)))

/* Three-limb subtraction (dh,dm,dl) = (mh,mm,ml) - (sh,sm,sl),
   with the borrow carried in the flags (subq/sbbq). */
#define sub_dddmmmsss2(dh, dm, dl, mh, mm, ml, sh, sm, sl)  \
  __asm__ ("subq %8,%q2\n\tsbbq %6,%q1\n\tsbbq %4,%q0"      \
       : "=r" (dh), "=&r" (dm), "=&r" (dl)                  \
       : "0" ((mp_limb_t)(mh)), "rme" ((mp_limb_t)(sh)),    \
         "1" ((mp_limb_t)(mm)), "rme" ((mp_limb_t)(sm)),    \
         "2" ((mp_limb_t)(ml)), "rme" ((mp_limb_t)(sl)))

#endif /* x86_64 */
39
/* x86 : 32 bit */
#if (GMP_LIMB_BITS == 32 && (defined (__i386__) \
    || defined (__i486__) || defined(__amd64__)))

/* Three-limb addition, 32-bit limbs: same flag-carried addl/adcl
   chain as the 64-bit version above. */
#define add_sssaaaaaa2(sh, sm, sl, ah, am, al, bh, bm, bl)  \
  __asm__ ("addl %8,%k2\n\tadcl %6,%k1\n\tadcl %4,%k0"      \
       : "=r" (sh), "=r" (sm), "=&r" (sl)                   \
       : "0" ((mp_limb_t)(ah)), "g" ((mp_limb_t)(bh)),      \
         "1" ((mp_limb_t)(am)), "g" ((mp_limb_t)(bm)),      \
         "2" ((mp_limb_t)(al)), "g" ((mp_limb_t)(bl)))

/* Three-limb subtraction, 32-bit limbs (subl/sbbl borrow chain). */
#define sub_dddmmmsss2(dh, dm, dl, mh, mm, ml, sh, sm, sl)  \
  __asm__ ("subl %8,%k2\n\tsbbl %6,%k1\n\tsbbl %4,%k0"      \
       : "=r" (dh), "=r" (dm), "=&r" (dl)                   \
       : "0" ((mp_limb_t)(mh)), "g" ((mp_limb_t)(sh)),      \
         "1" ((mp_limb_t)(mm)), "g" ((mp_limb_t)(sm)),      \
         "2" ((mp_limb_t)(ml)), "g" ((mp_limb_t)(sl)))

#endif /* x86 */
59
60
#if !defined(add_sssaaaaaa2)

/* Portable fallback: simulate the three-limb carry chain with two
   two-limb add_ssaaaa steps. __t and __u receive the carries (0 or 1)
   out of the low and middle limbs; the last add_ssaaaa folds both
   carries into the middle and high limbs. */
#define add_sssaaaaaa2(sh, sm, sl, ah, am, al, bh, bm, bl)          \
    do {                                                            \
        mp_limb_t __t, __u;                                         \
        add_ssaaaa(__t, sl, (mp_limb_t) 0, al, (mp_limb_t) 0, bl);  \
        add_ssaaaa(__u, sm, (mp_limb_t) 0, am, (mp_limb_t) 0, bm);  \
        add_ssaaaa(sh, sm, ah + bh, sm, __u, __t);                  \
    } while (0)

/* Portable fallback for three-limb subtraction. __t and __u are the
   borrows out of the low and middle limbs (0 or -1 as a limb), so
   -__u and -__t are the nonnegative borrow amounts subtracted from
   the higher limbs. */
#define sub_dddmmmsss2(dh, dm, dl, mh, mm, ml, sh, sm, sl)          \
    do {                                                            \
        mp_limb_t __t, __u;                                         \
        sub_ddmmss(__t, dl, (mp_limb_t) 0, ml, (mp_limb_t) 0, sl);  \
        sub_ddmmss(__u, dm, (mp_limb_t) 0, mm, (mp_limb_t) 0, sm);  \
        sub_ddmmss(dh, dm, mh - sh, dm, -__u, -__t);                \
    } while (0)

#endif
80
/* Defined in the arb dot-product module: adds (negative = 0) or
   subtracts (negative = 1) the product x*y, shifted down by `shift`
   bits, to the sn-limb fixed-point accumulator `sum`, accumulating
   any truncated low-order contribution into *serr. `tmp` is scratch
   space large enough for the full product. */
void
_arb_dot_addmul_generic(mp_ptr sum, mp_ptr serr, mp_ptr tmp, mp_size_t sn,
    mp_srcptr xptr, mp_size_t xn, mp_srcptr yptr, mp_size_t yn,
    int negative, flint_bitcnt_t shift);

/* Same as above but adds/subtracts a single shifted value x instead
   of a product. */
void
_arb_dot_add_generic(mp_ptr sum, mp_ptr serr, mp_ptr tmp, mp_size_t sn,
    mp_srcptr xptr, mp_size_t xn,
    int negative, flint_bitcnt_t shift);
90
/* Converts the sn-limb two's-complement fixed-point accumulator `sum`
   into the rounded floating-point midpoint of res. `sum_exp` is the
   exponent corresponding to the top of the accumulator, `negative`
   flips the sign of the result, and `prec` is the rounding precision.
   The radius of res is left untouched (this is the approx variant). */
static void
_arb_dot_output(arb_t res, mp_ptr sum, mp_size_t sn, int negative,
    slong sum_exp, slong prec)
{
    slong exp_fix;

    /* Top bit set means the two's-complement value is negative:
       negate in place to get sign-magnitude form. */
    if (sum[sn - 1] >= LIMB_TOP)
    {
        mpn_neg(sum, sum, sn);
        negative ^= 1;
    }

    exp_fix = 0;

    if (sum[sn - 1] == 0)
    {
        /* Strip leading zero limbs, adjusting the exponent accordingly. */
        slong sum_exp2;
        mp_size_t sn2;

        sn2 = sn;
        sum_exp2 = sum_exp;

        while (sn2 > 0 && sum[sn2 - 1] == 0)
        {
            sum_exp2 -= FLINT_BITS;
            sn2--;
        }

        if (sn2 == 0)
        {
            /* The terms cancelled to exactly zero. */
            arf_zero(arb_midref(res));
        }
        else
        {
            /* _arf_set_round_mpn reports the normalization shift in
               exp_fix; combine it with the accumulator exponent. */
            _arf_set_round_mpn(arb_midref(res), &exp_fix, sum, sn2, negative, prec, ARF_RND_DOWN);
            _fmpz_set_si_small(ARF_EXPREF(arb_midref(res)), exp_fix + sum_exp2);
        }
    }
    else
    {
        if (sn == 2) /* unnecessary? */
            _arf_set_round_uiui(arb_midref(res), &exp_fix, sum[1], sum[0], negative, prec, ARF_RND_DOWN);
        else
            _arf_set_round_mpn(arb_midref(res), &exp_fix, sum, sn, negative, prec, ARF_RND_DOWN);

        _fmpz_set_si_small(ARF_EXPREF(arb_midref(res)), exp_fix + sum_exp);
    }
}
139
/* xxx: don't use surrounding variables */
/* Adds (s_subtract = 0) or subtracts (s_subtract = 1) the arf value xm,
   shifted to the accumulator scale s_sum_exp, into the s_sn-limb
   fixed-point accumulator s_sum, accumulating truncation error in
   s_serr. Values entirely below the accumulator are dropped.
   NOTE: deliberately reads/writes the caller's locals xexp, xn,
   xnegative, shift and tmp (see the author's comment above). */
#define ARB_DOT_ADD(s_sum, s_serr, s_sn, s_sum_exp, s_subtract, xm) \
    if (!arf_is_special(xm)) \
    { \
        mp_srcptr xptr; \
        xexp = ARF_EXP(xm); \
        xn = ARF_SIZE(xm); \
        xnegative = ARF_SGNBIT(xm); \
        shift = s_sum_exp - xexp; \
        if (shift >= s_sn * FLINT_BITS) \
        { \
        } \
        else \
        { \
            xptr = (xn <= ARF_NOPTR_LIMBS) ? ARF_NOPTR_D(xm) : ARF_PTR_D(xm); \
            _arb_dot_add_generic(s_sum, &s_serr, tmp, s_sn, xptr, xn, xnegative ^ s_subtract, shift); \
        } \
    }
158
/* Multiplies the complex numbers (a + b*i) and (c + d*i) exactly using
   Gauss's three-multiplication trick, writing the real part to e and
   the imaginary part to f:
       e = ac - bd
       f = (a+b)(c+d) - ac - bd  (= ad + bc).
   The operands are converted to exact integers scaled by a common
   power of two. All four inputs must be finite and nonzero (callers
   guarantee this via the size checks in acb_approx_dot). */
static void
_arf_complex_mul_gauss(arf_t e, arf_t f, const arf_t a, const arf_t b,
        const arf_t c, const arf_t d)
{
    mp_srcptr ap, bp, cp, dp;
    int asgn, bsgn, csgn, dsgn;
    mp_size_t an, bn, cn, dn;
    slong aexp, bexp, cexp, dexp;
    fmpz texp, uexp; /* scaling exponents, stored as small fmpz values */

    fmpz_t za, zb, zc, zd, t, u, v;
    slong abot, bbot, cbot, dbot;

    ARF_GET_MPN_READONLY(ap, an, a);
    asgn = ARF_SGNBIT(a);
    aexp = ARF_EXP(a);

    ARF_GET_MPN_READONLY(bp, bn, b);
    bsgn = ARF_SGNBIT(b);
    bexp = ARF_EXP(b);

    ARF_GET_MPN_READONLY(cp, cn, c);
    csgn = ARF_SGNBIT(c);
    cexp = ARF_EXP(c);

    ARF_GET_MPN_READONLY(dp, dn, d);
    dsgn = ARF_SGNBIT(d);
    dexp = ARF_EXP(d);

    /* Gauss multiplication
        e = ac - bd
        f = (a+b)(c+d) - ac - bd */

    /* Exponent of the bottom bit of each mantissa. */
    abot = aexp - an * FLINT_BITS;
    bbot = bexp - bn * FLINT_BITS;
    cbot = cexp - cn * FLINT_BITS;
    dbot = dexp - dn * FLINT_BITS;

    /* Common scaling exponent for (a,b) and for (c,d). */
    texp = FLINT_MIN(abot, bbot);
    uexp = FLINT_MIN(cbot, dbot);

    fmpz_init(za);
    fmpz_init(zb);
    fmpz_init(zc);
    fmpz_init(zd);
    fmpz_init(t);
    fmpz_init(u);
    fmpz_init(v);

    /* za = a / 2^texp, ..., zd = d / 2^uexp, as exact integers. */
    fmpz_lshift_mpn(za, ap, an, asgn, abot - texp);
    fmpz_lshift_mpn(zb, bp, bn, bsgn, bbot - texp);
    fmpz_lshift_mpn(zc, cp, cn, csgn, cbot - uexp);
    fmpz_lshift_mpn(zd, dp, dn, dsgn, dbot - uexp);

    fmpz_add(t, za, zb);    /* t = a + b */
    fmpz_add(v, zc, zd);    /* v = c + d */
    fmpz_mul(u, t, v);      /* u = (a+b)(c+d) */
    fmpz_mul(t, za, zc);    /* t = ac */
    fmpz_mul(v, zb, zd);    /* v = bd */
    fmpz_sub(u, u, t);
    fmpz_sub(u, u, v);      /* u = (a+b)(c+d) - ac - bd */
    fmpz_sub(t, t, v);      /* t = ac - bd */

    /* Both products carry the combined scale 2^(texp + uexp). */
    texp += uexp;
    arf_set_fmpz_2exp(e, t, &texp);
    arf_set_fmpz_2exp(f, u, &texp);

    fmpz_clear(za);
    fmpz_clear(zb);
    fmpz_clear(zc);
    fmpz_clear(zd);
    fmpz_clear(t);
    fmpz_clear(u);
    fmpz_clear(v);
}
234
/* Tunable limb-count threshold above which Gauss's 3-multiplication
   complex product is used; the variable lives in the acb dot module. */
ARB_DLL extern slong acb_dot_gauss_dot_cutoff;
#define GAUSS_CUTOFF acb_dot_gauss_dot_cutoff
237
238 void
acb_approx_dot_simple(acb_t res,const acb_t initial,int subtract,acb_srcptr x,slong xstep,acb_srcptr y,slong ystep,slong len,slong prec)239 acb_approx_dot_simple(acb_t res, const acb_t initial, int subtract,
240 acb_srcptr x, slong xstep, acb_srcptr y, slong ystep, slong len, slong prec)
241 {
242 slong i;
243
244 if (len <= 0)
245 {
246 if (initial == NULL)
247 {
248 arf_zero(arb_midref(acb_realref(res)));
249 arf_zero(arb_midref(acb_imagref(res)));
250 }
251 else
252 {
253 arf_set_round(arb_midref(acb_realref(res)), arb_midref(acb_realref(initial)), prec, ARB_RND);
254 arf_set_round(arb_midref(acb_imagref(res)), arb_midref(acb_imagref(initial)), prec, ARB_RND);
255 }
256 return;
257 }
258
259 if (initial == NULL && len == 1)
260 {
261 arf_complex_mul(arb_midref(acb_realref(res)),
262 arb_midref(acb_imagref(res)),
263 arb_midref(acb_realref(x)),
264 arb_midref(acb_imagref(x)),
265 arb_midref(acb_realref(y)),
266 arb_midref(acb_imagref(y)), prec, ARB_RND);
267 }
268 else
269 {
270 arf_t e, f;
271
272 arf_init(e);
273 arf_init(f);
274
275 if (initial != NULL)
276 {
277 if (subtract)
278 {
279 arf_neg(arb_midref(acb_realref(res)), arb_midref(acb_realref(initial)));
280 arf_neg(arb_midref(acb_imagref(res)), arb_midref(acb_imagref(initial)));
281 }
282 else
283 {
284 arf_set(arb_midref(acb_realref(res)), arb_midref(acb_realref(initial)));
285 arf_set(arb_midref(acb_imagref(res)), arb_midref(acb_imagref(initial)));
286 }
287 }
288
289 for (i = 0; i < len; i++)
290 {
291 arf_complex_mul(e, f,
292 arb_midref(acb_realref(x + i * xstep)),
293 arb_midref(acb_imagref(x + i * xstep)),
294 arb_midref(acb_realref(y + i * ystep)),
295 arb_midref(acb_imagref(y + i * ystep)), prec, ARB_RND);
296
297
298 if (i == 0 && initial == NULL)
299 {
300 arf_set(arb_midref(acb_realref(res)), e);
301 arf_set(arb_midref(acb_imagref(res)), f);
302 }
303 else
304 {
305 arf_add(arb_midref(acb_realref(res)), arb_midref(acb_realref(res)), e, prec, ARB_RND);
306 arf_add(arb_midref(acb_imagref(res)), arb_midref(acb_imagref(res)), f, prec, ARB_RND);
307 }
308 }
309
310 arf_clear(e);
311 arf_clear(f);
312 }
313
314 if (subtract)
315 {
316 arf_neg(arb_midref(acb_realref(res)), arb_midref(acb_realref(res)));
317 arf_neg(arb_midref(acb_imagref(res)), arb_midref(acb_imagref(res)));
318 }
319 }
320
321 void
acb_approx_dot(acb_t res,const acb_t initial,int subtract,acb_srcptr x,slong xstep,acb_srcptr y,slong ystep,slong len,slong prec)322 acb_approx_dot(acb_t res, const acb_t initial, int subtract, acb_srcptr x, slong xstep, acb_srcptr y, slong ystep, slong len, slong prec)
323 {
324 slong i, j, padding, extend;
325 slong xexp, yexp, exp;
326 slong re_nonzero, im_nonzero;
327 slong re_max_exp, re_min_exp, re_sum_exp;
328 slong im_max_exp, im_min_exp, im_sum_exp;
329 slong re_prec, im_prec;
330 int xnegative, ynegative;
331 mp_size_t xn, yn, re_sn, im_sn, alloc;
332 flint_bitcnt_t shift;
333 arb_srcptr xi, yi;
334 arf_srcptr xm, ym;
335 mp_limb_t re_serr, im_serr; /* Sum over arithmetic errors */
336 mp_ptr tmp, re_sum, im_sum; /* Workspace */
337 slong xoff, yoff;
338 char * use_gauss;
339 ARF_ADD_TMP_DECL;
340
341 /* todo: fast fma and fmma (len=2) code */
342 if (len <= 1)
343 {
344 acb_approx_dot_simple(res, initial, subtract, x, xstep, y, ystep, len, prec);
345 return;
346 }
347
348 /* Number of nonzero midpoint terms in sum. */
349 re_nonzero = 0;
350 im_nonzero = 0;
351
352 /* Terms are bounded by 2^max_exp (with WORD_MIN = -infty) */
353 re_max_exp = WORD_MIN;
354 im_max_exp = WORD_MIN;
355
356 /* Used to reduce the precision. */
357 re_min_exp = WORD_MAX;
358 im_min_exp = WORD_MAX;
359
360 /* Account for the initial term. */
361 if (initial != NULL)
362 {
363 if (!ARF_IS_LAGOM(arb_midref(acb_realref(initial))) || !ARF_IS_LAGOM(arb_midref(acb_imagref(initial))))
364 {
365 acb_approx_dot_simple(res, initial, subtract, x, xstep, y, ystep, len, prec);
366 return;
367 }
368
369 xm = arb_midref(acb_realref(initial));
370
371 if (!arf_is_special(xm))
372 {
373 re_max_exp = ARF_EXP(xm);
374 re_nonzero++;
375
376 if (prec > 2 * FLINT_BITS)
377 re_min_exp = ARF_EXP(xm) - ARF_SIZE(xm) * FLINT_BITS;
378 }
379
380 xm = arb_midref(acb_imagref(initial));
381
382 if (!arf_is_special(xm))
383 {
384 im_max_exp = ARF_EXP(xm);
385 im_nonzero++;
386
387 if (prec > 2 * FLINT_BITS)
388 im_min_exp = ARF_EXP(xm) - ARF_SIZE(xm) * FLINT_BITS;
389 }
390 }
391
392 for (xoff = 0; xoff < 2; xoff++)
393 {
394 for (yoff = 0; yoff < 2; yoff++)
395 {
396 slong nonzero, max_exp, min_exp;
397
398 if (xoff == yoff)
399 {
400 nonzero = re_nonzero;
401 max_exp = re_max_exp;
402 min_exp = re_min_exp;
403 }
404 else
405 {
406 nonzero = im_nonzero;
407 max_exp = im_max_exp;
408 min_exp = im_min_exp;
409 }
410
411 /* Determine maximum exponents for the main sum and the radius sum. */
412 for (i = 0; i < len; i++)
413 {
414 xi = ((arb_srcptr) x) + 2 * i * xstep + xoff;
415 yi = ((arb_srcptr) y) + 2 * i * ystep + yoff;
416
417 /* Fallback for huge exponents or non-finite values. */
418 if (!ARF_IS_LAGOM(arb_midref(xi)) || !ARF_IS_LAGOM(arb_midref(yi)))
419 {
420 acb_approx_dot_simple(res, initial, subtract, x, xstep, y, ystep, len, prec);
421 return;
422 }
423
424 xm = arb_midref(xi);
425 ym = arb_midref(yi);
426
427 /* (xm+xr)(ym+yr) = xm ym + [xr ym + xm yr + xr yr] */
428 if (!arf_is_special(xm))
429 {
430 xexp = ARF_EXP(xm);
431
432 if (!arf_is_special(ym))
433 {
434 yexp = ARF_EXP(ym);
435
436 max_exp = FLINT_MAX(max_exp, xexp + yexp);
437 nonzero++;
438
439 if (prec > 2 * FLINT_BITS)
440 {
441 slong bot;
442 bot = (xexp + yexp) - (ARF_SIZE(xm) + ARF_SIZE(ym)) * FLINT_BITS;
443 min_exp = FLINT_MIN(min_exp, bot);
444 }
445 }
446 }
447 }
448
449 if (xoff == yoff)
450 {
451 re_nonzero = nonzero;
452 re_max_exp = max_exp;
453 re_min_exp = min_exp;
454 }
455 else
456 {
457 im_nonzero = nonzero;
458 im_max_exp = max_exp;
459 im_min_exp = min_exp;
460 }
461 }
462 }
463
464 re_prec = prec;
465 im_prec = prec;
466
467 if (re_max_exp == WORD_MIN && im_max_exp == WORD_MIN)
468 {
469 arf_zero(arb_midref(acb_realref(res)));
470 arf_zero(arb_midref(acb_imagref(res)));
471 return;
472 }
473
474 /* The midpoint sum is zero. */
475 if (re_max_exp == WORD_MIN)
476 {
477 re_prec = 2;
478 }
479 else
480 {
481 if (re_min_exp != WORD_MAX)
482 re_prec = FLINT_MIN(re_prec, re_max_exp - re_min_exp + MAG_BITS);
483 re_prec = FLINT_MAX(re_prec, 2);
484 }
485
486 if (im_max_exp == WORD_MIN)
487 {
488 im_prec = 2;
489 }
490 else
491 {
492 if (re_min_exp != WORD_MAX)
493 im_prec = FLINT_MIN(im_prec, im_max_exp - im_min_exp + MAG_BITS);
494 im_prec = FLINT_MAX(im_prec, 2);
495 }
496
497 extend = FLINT_BIT_COUNT(re_nonzero) + 1;
498 padding = 4 + FLINT_BIT_COUNT(len);
499 re_sn = (re_prec + extend + padding + FLINT_BITS - 1) / FLINT_BITS;
500 re_sn = FLINT_MAX(re_sn, 2);
501 re_sum_exp = re_max_exp + extend;
502
503 extend = FLINT_BIT_COUNT(im_nonzero) + 1;
504 padding = 4 + FLINT_BIT_COUNT(len);
505 im_sn = (im_prec + extend + padding + FLINT_BITS - 1) / FLINT_BITS;
506 im_sn = FLINT_MAX(im_sn, 2);
507 im_sum_exp = im_max_exp + extend;
508
509 /* We need sn + 1 limb for the sum (sn limbs + 1 dummy limb
510 for carry or borrow that avoids an extra branch). We need
511 2 * (sn + 2) limbs to store the product of two numbers
512 with up to (sn + 2) limbs, plus 1 extra limb for shifting
513 the product. */
514 alloc = (re_sn + 1) + (im_sn + 1) + 2 * (FLINT_MAX(re_sn, im_sn) + 2) + 1;
515 ARF_ADD_TMP_ALLOC(re_sum, alloc)
516 im_sum = re_sum + (re_sn + 1);
517 tmp = im_sum + (im_sn + 1);
518
519 /* Set sum to 0 */
520 re_serr = 0;
521 for (j = 0; j < re_sn + 1; j++)
522 re_sum[j] = 0;
523 im_serr = 0;
524 for (j = 0; j < im_sn + 1; j++)
525 im_sum[j] = 0;
526
527 if (initial != NULL)
528 {
529 xm = arb_midref(acb_realref(initial));
530
531 ARB_DOT_ADD(re_sum, re_serr, re_sn, re_sum_exp, subtract, xm);
532
533 xm = arb_midref(acb_imagref(initial));
534
535 ARB_DOT_ADD(im_sum, im_serr, im_sn, im_sum_exp, subtract, xm);
536 }
537
538 use_gauss = NULL;
539
540 if (re_prec >= GAUSS_CUTOFF * FLINT_BITS &&
541 im_prec >= GAUSS_CUTOFF * FLINT_BITS)
542 {
543 arf_t e, f;
544
545 for (i = 0; i < len; i++)
546 {
547 arb_srcptr ai, bi, ci, di;
548 mp_size_t an, bn, cn, dn;
549 slong aexp, bexp, cexp, dexp;
550
551 ai = ((arb_srcptr) x) + 2 * i * xstep;
552 bi = ((arb_srcptr) x) + 2 * i * xstep + 1;
553 ci = ((arb_srcptr) y) + 2 * i * ystep;
554 di = ((arb_srcptr) y) + 2 * i * ystep + 1;
555
556 an = ARF_SIZE(arb_midref(ai));
557 bn = ARF_SIZE(arb_midref(bi));
558 cn = ARF_SIZE(arb_midref(ci));
559 dn = ARF_SIZE(arb_midref(di));
560
561 aexp = ARF_EXP(arb_midref(ai));
562 bexp = ARF_EXP(arb_midref(bi));
563 cexp = ARF_EXP(arb_midref(ci));
564 dexp = ARF_EXP(arb_midref(di));
565
566 if (an >= GAUSS_CUTOFF && bn >= GAUSS_CUTOFF &&
567 bn >= GAUSS_CUTOFF && cn >= GAUSS_CUTOFF &&
568 FLINT_ABS(an - bn) <= 2 &&
569 FLINT_ABS(cn - dn) <= 2 &&
570 FLINT_ABS(aexp - bexp) <= 64 &&
571 FLINT_ABS(cexp - dexp) <= 64 &&
572 re_sum_exp - (aexp + cexp) < 0.1 * re_prec &&
573 im_sum_exp - (aexp + dexp) < 0.1 * im_prec &&
574 an + cn < 2.2 * re_sn && an + dn < 2.2 * im_sn)
575 {
576 if (use_gauss == NULL)
577 {
578 use_gauss = flint_calloc(len, sizeof(char));
579 arf_init(e);
580 arf_init(f);
581 }
582
583 use_gauss[i] = 1;
584 _arf_complex_mul_gauss(e, f, arb_midref(ai), arb_midref(bi), arb_midref(ci), arb_midref(di));
585 ARB_DOT_ADD(re_sum, re_serr, re_sn, re_sum_exp, 0, e);
586 ARB_DOT_ADD(im_sum, im_serr, im_sn, im_sum_exp, 0, f);
587 }
588 }
589
590 if (use_gauss != NULL)
591 {
592 arf_clear(e);
593 arf_clear(f);
594 }
595 }
596
597 for (xoff = 0; xoff < 2; xoff++)
598 {
599 for (yoff = 0; yoff < 2; yoff++)
600 {
601 slong sum_exp;
602 mp_ptr sum;
603 mp_size_t sn;
604 mp_limb_t serr;
605 int flipsign;
606
607 if (xoff == yoff)
608 {
609 sum_exp = re_sum_exp;
610 sum = re_sum;
611 sn = re_sn;
612 if (re_max_exp == WORD_MIN)
613 continue;
614 }
615 else
616 {
617 sum_exp = im_sum_exp;
618 sum = im_sum;
619 sn = im_sn;
620 if (im_max_exp == WORD_MIN)
621 continue;
622 }
623
624 serr = 0;
625 flipsign = (xoff + yoff == 2);
626
627 for (i = 0; i < len; i++)
628 {
629 xi = ((arb_srcptr) x) + 2 * i * xstep + xoff;
630 yi = ((arb_srcptr) y) + 2 * i * ystep + yoff;
631
632 xm = arb_midref(xi);
633 ym = arb_midref(yi);
634
635 /* The midpoints of x[i] and y[i] are both nonzero. */
636 if (!arf_is_special(xm) && !arf_is_special(ym))
637 {
638 xexp = ARF_EXP(xm);
639 xn = ARF_SIZE(xm);
640 xnegative = ARF_SGNBIT(xm);
641
642 yexp = ARF_EXP(ym);
643 yn = ARF_SIZE(ym);
644 ynegative = ARF_SGNBIT(ym);
645
646 exp = xexp + yexp;
647 shift = sum_exp - exp;
648
649 if (shift >= sn * FLINT_BITS)
650 {
651 }
652 else if (xn <= 2 && yn <= 2 && sn <= 3)
653 {
654 mp_limb_t x1, x0, y1, y0;
655 mp_limb_t u3, u2, u1, u0;
656
657 if (xn == 1 && yn == 1)
658 {
659 x0 = ARF_NOPTR_D(xm)[0];
660 y0 = ARF_NOPTR_D(ym)[0];
661 umul_ppmm(u3, u2, x0, y0);
662 u1 = u0 = 0;
663 }
664 else if (xn == 2 && yn == 2)
665 {
666 x0 = ARF_NOPTR_D(xm)[0];
667 x1 = ARF_NOPTR_D(xm)[1];
668 y0 = ARF_NOPTR_D(ym)[0];
669 y1 = ARF_NOPTR_D(ym)[1];
670 nn_mul_2x2(u3, u2, u1, u0, x1, x0, y1, y0);
671 }
672 else if (xn == 1)
673 {
674 x0 = ARF_NOPTR_D(xm)[0];
675 y0 = ARF_NOPTR_D(ym)[0];
676 y1 = ARF_NOPTR_D(ym)[1];
677 nn_mul_2x1(u3, u2, u1, y1, y0, x0);
678 u0 = 0;
679 }
680 else
681 {
682 x0 = ARF_NOPTR_D(xm)[0];
683 x1 = ARF_NOPTR_D(xm)[1];
684 y0 = ARF_NOPTR_D(ym)[0];
685 nn_mul_2x1(u3, u2, u1, x1, x0, y0);
686 u0 = 0;
687 }
688
689 if (sn == 2)
690 {
691 if (shift < FLINT_BITS)
692 {
693 u2 = (u2 >> shift) | (u3 << (FLINT_BITS - shift));
694 u3 = (u3 >> shift);
695 }
696 else if (shift == FLINT_BITS)
697 {
698 u2 = u3;
699 u3 = 0;
700 }
701 else /* FLINT_BITS < shift < 2 * FLINT_BITS */
702 {
703 u2 = (u3 >> (shift - FLINT_BITS));
704 u3 = 0;
705 }
706
707 if (xnegative ^ ynegative ^ flipsign)
708 sub_ddmmss(sum[1], sum[0], sum[1], sum[0], u3, u2);
709 else
710 add_ssaaaa(sum[1], sum[0], sum[1], sum[0], u3, u2);
711 }
712 else if (sn == 3)
713 {
714 if (shift < FLINT_BITS)
715 {
716 u1 = (u1 >> shift) | (u2 << (FLINT_BITS - shift));
717 u2 = (u2 >> shift) | (u3 << (FLINT_BITS - shift));
718 u3 = (u3 >> shift);
719 }
720 else if (shift == FLINT_BITS)
721 {
722 u1 = u2;
723 u2 = u3;
724 u3 = 0;
725 }
726 else if (shift < 2 * FLINT_BITS)
727 {
728 u1 = (u3 << (2 * FLINT_BITS - shift)) | (u2 >> (shift - FLINT_BITS));
729 u2 = (u3 >> (shift - FLINT_BITS));
730 u3 = 0;
731 }
732 else if (shift == 2 * FLINT_BITS)
733 {
734 u1 = u3;
735 u2 = 0;
736 u3 = 0;
737 }
738 else /* 2 * FLINT_BITS < shift < 3 * FLINT_BITS */
739 {
740 u1 = (u3 >> (shift - 2 * FLINT_BITS));
741 u2 = 0;
742 u3 = 0;
743 }
744
745 if (xnegative ^ ynegative ^ flipsign)
746 sub_dddmmmsss2(sum[2], sum[1], sum[0], sum[2], sum[1], sum[0], u3, u2, u1);
747 else
748 add_sssaaaaaa2(sum[2], sum[1], sum[0], sum[2], sum[1], sum[0], u3, u2, u1);
749 }
750 }
751 else
752 {
753 mp_srcptr xptr, yptr;
754
755 xptr = (xn <= ARF_NOPTR_LIMBS) ? ARF_NOPTR_D(xm) : ARF_PTR_D(xm);
756 yptr = (yn <= ARF_NOPTR_LIMBS) ? ARF_NOPTR_D(ym) : ARF_PTR_D(ym);
757
758 if (use_gauss == NULL || use_gauss[i] == 0)
759 _arb_dot_addmul_generic(sum, &serr, tmp, sn, xptr, xn, yptr, yn, xnegative ^ ynegative ^ flipsign, shift);
760 }
761 }
762 }
763 }
764 }
765
766 _arb_dot_output(acb_realref(res), re_sum, re_sn, subtract, re_sum_exp, re_prec);
767 _arb_dot_output(acb_imagref(res), im_sum, im_sn, subtract, im_sum_exp, im_prec);
768
769 ARF_ADD_TMP_FREE(re_sum, alloc);
770 if (use_gauss != NULL)
771 flint_free(use_gauss);
772 }
773