/*
    Copyright (C) 2018 Fredrik Johansson

    This file is part of Arb.

    Arb is free software: you can redistribute it and/or modify it under
    the terms of the GNU Lesser General Public License (LGPL) as published
    by the Free Software Foundation; either version 2.1 of the License, or
    (at your option) any later version.  See <http://www.gnu.org/licenses/>.
*/

#include "acb.h"

/* We need uint64_t instead of mp_limb_t on 32-bit systems for
   safe summation of 30-bit error bounds. */
#include <stdint.h>

/* The following macros are found in FLINT's longlong.h, but
   the release version is out of date. */
20 
/* x86 : 64 bit */
#if (GMP_LIMB_BITS == 64 && defined (__amd64__))

/* (sh:sm:sl) = (ah:am:al) + (bh:bm:bl): three-limb addition with the
   carries propagated through the adc chain. The low and middle outputs
   must be early-clobber ("=&r"): they are written before the later
   b-operands are read, so they may not share a register with them.
   The high output is written last and may use a plain "=r". */
#define add_sssaaaaaa2(sh, sm, sl, ah, am, al, bh, bm, bl)  \
  __asm__ ("addq %8,%q2\n\tadcq %6,%q1\n\tadcq %4,%q0"     \
       : "=r" (sh), "=&r" (sm), "=&r" (sl)                  \
       : "0"  ((mp_limb_t)(ah)), "rme" ((mp_limb_t)(bh)),  \
         "1"  ((mp_limb_t)(am)), "rme" ((mp_limb_t)(bm)),  \
         "2"  ((mp_limb_t)(al)), "rme" ((mp_limb_t)(bl)))

/* (dh:dm:dl) = (mh:mm:ml) - (sh:sm:sl): three-limb subtraction with
   borrow propagation; same early-clobber requirements as above. */
#define sub_dddmmmsss2(dh, dm, dl, mh, mm, ml, sh, sm, sl)  \
  __asm__ ("subq %8,%q2\n\tsbbq %6,%q1\n\tsbbq %4,%q0"     \
       : "=r" (dh), "=&r" (dm), "=&r" (dl)                  \
       : "0"  ((mp_limb_t)(mh)), "rme" ((mp_limb_t)(sh)),  \
         "1"  ((mp_limb_t)(mm)), "rme" ((mp_limb_t)(sm)),  \
         "2"  ((mp_limb_t)(ml)), "rme" ((mp_limb_t)(sl)))

#endif /* x86_64 */

/* x86 : 32 bit */
#if (GMP_LIMB_BITS == 32 && (defined (__i386__) \
   || defined (__i486__) || defined(__amd64__)))

/* Same operations for 32-bit limbs. Bug fix: the middle outputs were
   declared "=r" without the early-clobber modifier; since the middle
   limb is written by the second adcl/sbbl before the high input
   operand is read, the register allocator could place the high input
   in the same register and the chain would read a clobbered value.
   FLINT's longlong.h uses "=&r" here. */
#define add_sssaaaaaa2(sh, sm, sl, ah, am, al, bh, bm, bl)  \
  __asm__ ("addl %8,%k2\n\tadcl %6,%k1\n\tadcl %4,%k0"     \
       : "=r" (sh), "=&r" (sm), "=&r" (sl)                 \
       : "0"  ((mp_limb_t)(ah)), "g" ((mp_limb_t)(bh)),    \
         "1"  ((mp_limb_t)(am)), "g" ((mp_limb_t)(bm)),    \
         "2"  ((mp_limb_t)(al)), "g" ((mp_limb_t)(bl)))

#define sub_dddmmmsss2(dh, dm, dl, mh, mm, ml, sh, sm, sl)  \
  __asm__ ("subl %8,%k2\n\tsbbl %6,%k1\n\tsbbl %4,%k0"     \
       : "=r" (dh), "=&r" (dm), "=&r" (dl)                 \
       : "0"  ((mp_limb_t)(mh)), "g" ((mp_limb_t)(sh)),    \
         "1"  ((mp_limb_t)(mm)), "g" ((mp_limb_t)(sm)),    \
         "2"  ((mp_limb_t)(ml)), "g" ((mp_limb_t)(sl)))

#endif /* x86 */
59 
60 
#if !defined(add_sssaaaaaa2)

/* Generic fallbacks built from FLINT's two-limb primitives
   (add_ssaaaa / sub_ddmmss), used when no inline-assembly version is
   available for this architecture. */

/* (sh:sm:sl) = (ah:am:al) + (bh:bm:bl): add the low and middle limbs
   separately, capturing each carry (__t, __u) as the high half of a
   two-limb add of zero-extended operands, then fold both carries into
   the upper limbs with a final two-limb add. */
#define add_sssaaaaaa2(sh, sm, sl, ah, am, al, bh, bm, bl)           \
  do {                                                              \
    mp_limb_t __t, __u;                                             \
    add_ssaaaa(__t, sl, (mp_limb_t) 0, al, (mp_limb_t) 0, bl);      \
    add_ssaaaa(__u, sm, (mp_limb_t) 0, am, (mp_limb_t) 0, bm);      \
    add_ssaaaa(sh, sm, ah + bh, sm, __u, __t);                      \
} while (0)

/* (dh:dm:dl) = (mh:mm:ml) - (sh:sm:sl): same scheme with borrows.
   __t and __u come out as 0 or all-ones (-1), hence the negations
   when the borrows are propagated into the upper limbs. */
#define sub_dddmmmsss2(dh, dm, dl, mh, mm, ml, sh, sm, sl)           \
  do {                                                              \
    mp_limb_t __t, __u;                                             \
    sub_ddmmss(__t, dl, (mp_limb_t) 0, ml, (mp_limb_t) 0, sl);      \
    sub_ddmmss(__u, dm, (mp_limb_t) 0, mm, (mp_limb_t) 0, sm);      \
    sub_ddmmss(dh, dm, mh - sh, dm, -__u, -__t);                    \
  } while (0)

#endif
80 
/* Accumulation kernels shared with arb_dot (defined elsewhere in the
   Arb sources — presumably arb/dot.c; confirm against the library).
   Judging by the call sites below, both add a term, right-shifted by
   `shift` bits (negated when `negative` is set), into the sn-limb
   fixed-point accumulator `sum`, recording truncation error in *serr
   and using `tmp` as scratch space. */

/* sum += (-1)^negative * ((x * y) >> shift) */
void
_arb_dot_addmul_generic(mp_ptr sum, mp_ptr serr, mp_ptr tmp, mp_size_t sn,
    mp_srcptr xptr, mp_size_t xn, mp_srcptr yptr, mp_size_t yn,
    int negative, flint_bitcnt_t shift);

/* sum += (-1)^negative * (x >> shift) */
void
_arb_dot_add_generic(mp_ptr sum, mp_ptr serr, mp_ptr tmp, mp_size_t sn,
    mp_srcptr xptr, mp_size_t xn,
    int negative, flint_bitcnt_t shift);
90 
/* Convert the sn-limb two's-complement fixed-point accumulator `sum`
   into the midpoint of res, rounded to prec bits (round-to-zero).
   sum_exp is the exponent of the accumulator's top; `negative` flips
   the sign of the output. sum may be modified in place (negated). */
static void
_arb_dot_output(arb_t res, mp_ptr sum, mp_size_t sn, int negative,
    slong sum_exp, slong prec)
{
    slong exp_fix;

    /* Top bit set: the value is negative in two's complement.
       Negate the limbs and flip the output sign instead. */
    if (sum[sn - 1] >= LIMB_TOP)
    {
        mpn_neg(sum, sum, sn);
        negative ^= 1;
    }

    exp_fix = 0;

    if (sum[sn - 1] == 0)
    {
        /* Leading zero limb(s): strip them, lowering the exponent by
           FLINT_BITS per stripped limb. */
        slong sum_exp2;
        mp_size_t sn2;

        sn2 = sn;
        sum_exp2 = sum_exp;

        while (sn2 > 0 && sum[sn2 - 1] == 0)
        {
            sum_exp2 -= FLINT_BITS;
            sn2--;
        }

        if (sn2 == 0)
        {
            /* All limbs zero: the dot product is exactly zero. */
            arf_zero(arb_midref(res));
        }
        else
        {
            /* _arf_set_round_mpn reports the exponent correction of the
               rounded mantissa in exp_fix; apply it to the running exponent. */
            _arf_set_round_mpn(arb_midref(res), &exp_fix, sum, sn2, negative, prec, ARF_RND_DOWN);
            _fmpz_set_si_small(ARF_EXPREF(arb_midref(res)), exp_fix + sum_exp2);
        }
    }
    else
    {
        if (sn == 2)  /* unnecessary? */
            _arf_set_round_uiui(arb_midref(res), &exp_fix, sum[1], sum[0], negative, prec, ARF_RND_DOWN);
        else
            _arf_set_round_mpn(arb_midref(res), &exp_fix, sum, sn, negative, prec, ARF_RND_DOWN);

        _fmpz_set_si_small(ARF_EXPREF(arb_midref(res)), exp_fix + sum_exp);
    }
}
139 
/* Add the single arf term xm (negated when s_subtract is set) into the
   s_sn-limb accumulator s_sum whose top has exponent s_sum_exp; terms
   falling entirely below the accumulator are silently dropped.
   xxx: don't use surrounding variables — this macro reads/writes
   xexp, xn, xnegative, shift and tmp from the enclosing scope. */
#define ARB_DOT_ADD(s_sum, s_serr, s_sn, s_sum_exp, s_subtract, xm) \
    if (!arf_is_special(xm)) \
    { \
        mp_srcptr xptr; \
        xexp = ARF_EXP(xm); \
        xn = ARF_SIZE(xm); \
        xnegative = ARF_SGNBIT(xm); \
        shift = s_sum_exp - xexp; \
        if (shift >= s_sn * FLINT_BITS) \
        { \
        } \
        else \
        { \
            xptr = (xn <= ARF_NOPTR_LIMBS) ? ARF_NOPTR_D(xm) : ARF_PTR_D(xm); \
            _arb_dot_add_generic(s_sum, &s_serr, tmp, s_sn, xptr, xn, xnegative ^ s_subtract, shift); \
        } \
    } \
159 static void
_arf_complex_mul_gauss(arf_t e,arf_t f,const arf_t a,const arf_t b,const arf_t c,const arf_t d)160 _arf_complex_mul_gauss(arf_t e, arf_t f, const arf_t a, const arf_t b,
161                                          const arf_t c, const arf_t d)
162 {
163     mp_srcptr ap, bp, cp, dp;
164     int asgn, bsgn, csgn, dsgn;
165     mp_size_t an, bn, cn, dn;
166     slong aexp, bexp, cexp, dexp;
167     fmpz texp, uexp;
168 
169     fmpz_t za, zb, zc, zd, t, u, v;
170     slong abot, bbot, cbot, dbot;
171 
172     ARF_GET_MPN_READONLY(ap, an, a);
173     asgn = ARF_SGNBIT(a);
174     aexp = ARF_EXP(a);
175 
176     ARF_GET_MPN_READONLY(bp, bn, b);
177     bsgn = ARF_SGNBIT(b);
178     bexp = ARF_EXP(b);
179 
180     ARF_GET_MPN_READONLY(cp, cn, c);
181     csgn = ARF_SGNBIT(c);
182     cexp = ARF_EXP(c);
183 
184     ARF_GET_MPN_READONLY(dp, dn, d);
185     dsgn = ARF_SGNBIT(d);
186     dexp = ARF_EXP(d);
187 
188     /* Gauss multiplication
189         e = ac - bd
190         f = (a+b)(c+d) - ac - bd */
191 
192     abot = aexp - an * FLINT_BITS;
193     bbot = bexp - bn * FLINT_BITS;
194     cbot = cexp - cn * FLINT_BITS;
195     dbot = dexp - dn * FLINT_BITS;
196 
197     texp = FLINT_MIN(abot, bbot);
198     uexp = FLINT_MIN(cbot, dbot);
199 
200     fmpz_init(za);
201     fmpz_init(zb);
202     fmpz_init(zc);
203     fmpz_init(zd);
204     fmpz_init(t);
205     fmpz_init(u);
206     fmpz_init(v);
207 
208     fmpz_lshift_mpn(za, ap, an, asgn, abot - texp);
209     fmpz_lshift_mpn(zb, bp, bn, bsgn, bbot - texp);
210     fmpz_lshift_mpn(zc, cp, cn, csgn, cbot - uexp);
211     fmpz_lshift_mpn(zd, dp, dn, dsgn, dbot - uexp);
212 
213     fmpz_add(t, za, zb);
214     fmpz_add(v, zc, zd);
215     fmpz_mul(u, t, v);
216     fmpz_mul(t, za, zc);
217     fmpz_mul(v, zb, zd);
218     fmpz_sub(u, u, t);
219     fmpz_sub(u, u, v);
220     fmpz_sub(t, t, v);
221 
222     texp += uexp;
223     arf_set_fmpz_2exp(e, t, &texp);
224     arf_set_fmpz_2exp(f, u, &texp);
225 
226     fmpz_clear(za);
227     fmpz_clear(zb);
228     fmpz_clear(zc);
229     fmpz_clear(zd);
230     fmpz_clear(t);
231     fmpz_clear(u);
232     fmpz_clear(v);
233 }
234 
/* Library-wide tuning parameter (defined elsewhere in Arb): the minimum
   operand size, in limbs, at which the Gauss 3M branch below is tried. */
ARB_DLL extern slong acb_dot_gauss_dot_cutoff;
#define GAUSS_CUTOFF acb_dot_gauss_dot_cutoff
237 
238 void
acb_approx_dot_simple(acb_t res,const acb_t initial,int subtract,acb_srcptr x,slong xstep,acb_srcptr y,slong ystep,slong len,slong prec)239 acb_approx_dot_simple(acb_t res, const acb_t initial, int subtract,
240     acb_srcptr x, slong xstep, acb_srcptr y, slong ystep, slong len, slong prec)
241 {
242     slong i;
243 
244     if (len <= 0)
245     {
246         if (initial == NULL)
247         {
248             arf_zero(arb_midref(acb_realref(res)));
249             arf_zero(arb_midref(acb_imagref(res)));
250         }
251         else
252         {
253             arf_set_round(arb_midref(acb_realref(res)), arb_midref(acb_realref(initial)), prec, ARB_RND);
254             arf_set_round(arb_midref(acb_imagref(res)), arb_midref(acb_imagref(initial)), prec, ARB_RND);
255         }
256         return;
257     }
258 
259     if (initial == NULL && len == 1)
260     {
261         arf_complex_mul(arb_midref(acb_realref(res)),
262                         arb_midref(acb_imagref(res)),
263                         arb_midref(acb_realref(x)),
264                         arb_midref(acb_imagref(x)),
265                         arb_midref(acb_realref(y)),
266                         arb_midref(acb_imagref(y)), prec, ARB_RND);
267     }
268     else
269     {
270         arf_t e, f;
271 
272         arf_init(e);
273         arf_init(f);
274 
275         if (initial != NULL)
276         {
277             if (subtract)
278             {
279                 arf_neg(arb_midref(acb_realref(res)), arb_midref(acb_realref(initial)));
280                 arf_neg(arb_midref(acb_imagref(res)), arb_midref(acb_imagref(initial)));
281             }
282             else
283             {
284                 arf_set(arb_midref(acb_realref(res)), arb_midref(acb_realref(initial)));
285                 arf_set(arb_midref(acb_imagref(res)), arb_midref(acb_imagref(initial)));
286             }
287         }
288 
289         for (i = 0; i < len; i++)
290         {
291             arf_complex_mul(e, f,
292                             arb_midref(acb_realref(x + i * xstep)),
293                             arb_midref(acb_imagref(x + i * xstep)),
294                             arb_midref(acb_realref(y + i * ystep)),
295                             arb_midref(acb_imagref(y + i * ystep)), prec, ARB_RND);
296 
297 
298             if (i == 0 && initial == NULL)
299             {
300                 arf_set(arb_midref(acb_realref(res)), e);
301                 arf_set(arb_midref(acb_imagref(res)), f);
302             }
303             else
304             {
305                 arf_add(arb_midref(acb_realref(res)), arb_midref(acb_realref(res)), e, prec, ARB_RND);
306                 arf_add(arb_midref(acb_imagref(res)), arb_midref(acb_imagref(res)), f, prec, ARB_RND);
307             }
308         }
309 
310         arf_clear(e);
311         arf_clear(f);
312     }
313 
314     if (subtract)
315     {
316         arf_neg(arb_midref(acb_realref(res)), arb_midref(acb_realref(res)));
317         arf_neg(arb_midref(acb_imagref(res)), arb_midref(acb_imagref(res)));
318     }
319 }
320 
321 void
acb_approx_dot(acb_t res,const acb_t initial,int subtract,acb_srcptr x,slong xstep,acb_srcptr y,slong ystep,slong len,slong prec)322 acb_approx_dot(acb_t res, const acb_t initial, int subtract, acb_srcptr x, slong xstep, acb_srcptr y, slong ystep, slong len, slong prec)
323 {
324     slong i, j, padding, extend;
325     slong xexp, yexp, exp;
326     slong re_nonzero, im_nonzero;
327     slong re_max_exp, re_min_exp, re_sum_exp;
328     slong im_max_exp, im_min_exp, im_sum_exp;
329     slong re_prec, im_prec;
330     int xnegative, ynegative;
331     mp_size_t xn, yn, re_sn, im_sn, alloc;
332     flint_bitcnt_t shift;
333     arb_srcptr xi, yi;
334     arf_srcptr xm, ym;
335     mp_limb_t re_serr, im_serr;   /* Sum over arithmetic errors */
336     mp_ptr tmp, re_sum, im_sum;   /* Workspace */
337     slong xoff, yoff;
338     char * use_gauss;
339     ARF_ADD_TMP_DECL;
340 
341     /* todo: fast fma and fmma (len=2) code */
342     if (len <= 1)
343     {
344         acb_approx_dot_simple(res, initial, subtract, x, xstep, y, ystep, len, prec);
345         return;
346     }
347 
348     /* Number of nonzero midpoint terms in sum. */
349     re_nonzero = 0;
350     im_nonzero = 0;
351 
352     /* Terms are bounded by 2^max_exp (with WORD_MIN = -infty) */
353     re_max_exp = WORD_MIN;
354     im_max_exp = WORD_MIN;
355 
356     /* Used to reduce the precision. */
357     re_min_exp = WORD_MAX;
358     im_min_exp = WORD_MAX;
359 
360     /* Account for the initial term. */
361     if (initial != NULL)
362     {
363         if (!ARF_IS_LAGOM(arb_midref(acb_realref(initial))) || !ARF_IS_LAGOM(arb_midref(acb_imagref(initial))))
364         {
365             acb_approx_dot_simple(res, initial, subtract, x, xstep, y, ystep, len, prec);
366             return;
367         }
368 
369         xm = arb_midref(acb_realref(initial));
370 
371         if (!arf_is_special(xm))
372         {
373             re_max_exp = ARF_EXP(xm);
374             re_nonzero++;
375 
376             if (prec > 2 * FLINT_BITS)
377                 re_min_exp = ARF_EXP(xm) - ARF_SIZE(xm) * FLINT_BITS;
378         }
379 
380         xm = arb_midref(acb_imagref(initial));
381 
382         if (!arf_is_special(xm))
383         {
384             im_max_exp = ARF_EXP(xm);
385             im_nonzero++;
386 
387             if (prec > 2 * FLINT_BITS)
388                 im_min_exp = ARF_EXP(xm) - ARF_SIZE(xm) * FLINT_BITS;
389         }
390     }
391 
392     for (xoff = 0; xoff < 2; xoff++)
393     {
394         for (yoff = 0; yoff < 2; yoff++)
395         {
396             slong nonzero, max_exp, min_exp;
397 
398             if (xoff == yoff)
399             {
400                 nonzero = re_nonzero;
401                 max_exp = re_max_exp;
402                 min_exp = re_min_exp;
403             }
404             else
405             {
406                 nonzero = im_nonzero;
407                 max_exp = im_max_exp;
408                 min_exp = im_min_exp;
409             }
410 
411             /* Determine maximum exponents for the main sum and the radius sum. */
412             for (i = 0; i < len; i++)
413             {
414                 xi = ((arb_srcptr) x) + 2 * i * xstep + xoff;
415                 yi = ((arb_srcptr) y) + 2 * i * ystep + yoff;
416 
417                 /* Fallback for huge exponents or non-finite values. */
418                 if (!ARF_IS_LAGOM(arb_midref(xi)) || !ARF_IS_LAGOM(arb_midref(yi)))
419                 {
420                     acb_approx_dot_simple(res, initial, subtract, x, xstep, y, ystep, len, prec);
421                     return;
422                 }
423 
424                 xm = arb_midref(xi);
425                 ym = arb_midref(yi);
426 
427                 /* (xm+xr)(ym+yr) = xm ym + [xr ym + xm yr + xr yr] */
428                 if (!arf_is_special(xm))
429                 {
430                     xexp = ARF_EXP(xm);
431 
432                     if (!arf_is_special(ym))
433                     {
434                         yexp = ARF_EXP(ym);
435 
436                         max_exp = FLINT_MAX(max_exp, xexp + yexp);
437                         nonzero++;
438 
439                         if (prec > 2 * FLINT_BITS)
440                         {
441                             slong bot;
442                             bot = (xexp + yexp) - (ARF_SIZE(xm) + ARF_SIZE(ym)) * FLINT_BITS;
443                             min_exp = FLINT_MIN(min_exp, bot);
444                         }
445                     }
446                 }
447             }
448 
449             if (xoff == yoff)
450             {
451                 re_nonzero = nonzero;
452                 re_max_exp = max_exp;
453                 re_min_exp = min_exp;
454             }
455             else
456             {
457                 im_nonzero = nonzero;
458                 im_max_exp = max_exp;
459                 im_min_exp = min_exp;
460             }
461         }
462     }
463 
464     re_prec = prec;
465     im_prec = prec;
466 
467     if (re_max_exp == WORD_MIN && im_max_exp == WORD_MIN)
468     {
469         arf_zero(arb_midref(acb_realref(res)));
470         arf_zero(arb_midref(acb_imagref(res)));
471         return;
472     }
473 
474     /* The midpoint sum is zero. */
475     if (re_max_exp == WORD_MIN)
476     {
477         re_prec = 2;
478     }
479     else
480     {
481         if (re_min_exp != WORD_MAX)
482             re_prec = FLINT_MIN(re_prec, re_max_exp - re_min_exp + MAG_BITS);
483         re_prec = FLINT_MAX(re_prec, 2);
484     }
485 
486     if (im_max_exp == WORD_MIN)
487     {
488         im_prec = 2;
489     }
490     else
491     {
492         if (re_min_exp != WORD_MAX)
493             im_prec = FLINT_MIN(im_prec, im_max_exp - im_min_exp + MAG_BITS);
494         im_prec = FLINT_MAX(im_prec, 2);
495     }
496 
497     extend = FLINT_BIT_COUNT(re_nonzero) + 1;
498     padding = 4 + FLINT_BIT_COUNT(len);
499     re_sn = (re_prec + extend + padding + FLINT_BITS - 1) / FLINT_BITS;
500     re_sn = FLINT_MAX(re_sn, 2);
501     re_sum_exp = re_max_exp + extend;
502 
503     extend = FLINT_BIT_COUNT(im_nonzero) + 1;
504     padding = 4 + FLINT_BIT_COUNT(len);
505     im_sn = (im_prec + extend + padding + FLINT_BITS - 1) / FLINT_BITS;
506     im_sn = FLINT_MAX(im_sn, 2);
507     im_sum_exp = im_max_exp + extend;
508 
509     /* We need sn + 1 limb for the sum (sn limbs + 1 dummy limb
510        for carry or borrow that avoids an extra branch). We need
511        2 * (sn + 2) limbs to store the product of two numbers
512        with up to (sn + 2) limbs, plus 1 extra limb for shifting
513        the product. */
514     alloc = (re_sn + 1) + (im_sn + 1) + 2 * (FLINT_MAX(re_sn, im_sn) + 2) + 1;
515     ARF_ADD_TMP_ALLOC(re_sum, alloc)
516     im_sum = re_sum + (re_sn + 1);
517     tmp = im_sum + (im_sn + 1);
518 
519     /* Set sum to 0 */
520     re_serr = 0;
521     for (j = 0; j < re_sn + 1; j++)
522         re_sum[j] = 0;
523     im_serr = 0;
524     for (j = 0; j < im_sn + 1; j++)
525         im_sum[j] = 0;
526 
527     if (initial != NULL)
528     {
529         xm = arb_midref(acb_realref(initial));
530 
531         ARB_DOT_ADD(re_sum, re_serr, re_sn, re_sum_exp, subtract, xm);
532 
533         xm = arb_midref(acb_imagref(initial));
534 
535         ARB_DOT_ADD(im_sum, im_serr, im_sn, im_sum_exp, subtract, xm);
536     }
537 
538     use_gauss = NULL;
539 
540     if (re_prec >= GAUSS_CUTOFF * FLINT_BITS &&
541         im_prec >= GAUSS_CUTOFF * FLINT_BITS)
542     {
543         arf_t e, f;
544 
545         for (i = 0; i < len; i++)
546         {
547             arb_srcptr ai, bi, ci, di;
548             mp_size_t an, bn, cn, dn;
549             slong aexp, bexp, cexp, dexp;
550 
551             ai = ((arb_srcptr) x) + 2 * i * xstep;
552             bi = ((arb_srcptr) x) + 2 * i * xstep + 1;
553             ci = ((arb_srcptr) y) + 2 * i * ystep;
554             di = ((arb_srcptr) y) + 2 * i * ystep + 1;
555 
556             an = ARF_SIZE(arb_midref(ai));
557             bn = ARF_SIZE(arb_midref(bi));
558             cn = ARF_SIZE(arb_midref(ci));
559             dn = ARF_SIZE(arb_midref(di));
560 
561             aexp = ARF_EXP(arb_midref(ai));
562             bexp = ARF_EXP(arb_midref(bi));
563             cexp = ARF_EXP(arb_midref(ci));
564             dexp = ARF_EXP(arb_midref(di));
565 
566             if (an >= GAUSS_CUTOFF && bn >= GAUSS_CUTOFF &&
567                 bn >= GAUSS_CUTOFF && cn >= GAUSS_CUTOFF &&
568                 FLINT_ABS(an - bn) <= 2 &&
569                 FLINT_ABS(cn - dn) <= 2 &&
570                 FLINT_ABS(aexp - bexp) <= 64 &&
571                 FLINT_ABS(cexp - dexp) <= 64 &&
572                 re_sum_exp - (aexp + cexp) < 0.1 * re_prec &&
573                 im_sum_exp - (aexp + dexp) < 0.1 * im_prec &&
574                 an + cn < 2.2 * re_sn && an + dn < 2.2 * im_sn)
575             {
576                 if (use_gauss == NULL)
577                 {
578                     use_gauss = flint_calloc(len, sizeof(char));
579                     arf_init(e);
580                     arf_init(f);
581                 }
582 
583                 use_gauss[i] = 1;
584                 _arf_complex_mul_gauss(e, f, arb_midref(ai), arb_midref(bi), arb_midref(ci), arb_midref(di));
585                 ARB_DOT_ADD(re_sum, re_serr, re_sn, re_sum_exp, 0, e);
586                 ARB_DOT_ADD(im_sum, im_serr, im_sn, im_sum_exp, 0, f);
587             }
588         }
589 
590         if (use_gauss != NULL)
591         {
592             arf_clear(e);
593             arf_clear(f);
594         }
595     }
596 
597     for (xoff = 0; xoff < 2; xoff++)
598     {
599         for (yoff = 0; yoff < 2; yoff++)
600         {
601             slong sum_exp;
602             mp_ptr sum;
603             mp_size_t sn;
604             mp_limb_t serr;
605             int flipsign;
606 
607             if (xoff == yoff)
608             {
609                 sum_exp = re_sum_exp;
610                 sum = re_sum;
611                 sn = re_sn;
612                 if (re_max_exp == WORD_MIN)
613                     continue;
614             }
615             else
616             {
617                 sum_exp = im_sum_exp;
618                 sum = im_sum;
619                 sn = im_sn;
620                 if (im_max_exp == WORD_MIN)
621                     continue;
622             }
623 
624             serr = 0;
625             flipsign = (xoff + yoff == 2);
626 
627             for (i = 0; i < len; i++)
628             {
629                 xi = ((arb_srcptr) x) + 2 * i * xstep + xoff;
630                 yi = ((arb_srcptr) y) + 2 * i * ystep + yoff;
631 
632                 xm = arb_midref(xi);
633                 ym = arb_midref(yi);
634 
635                 /* The midpoints of x[i] and y[i] are both nonzero. */
636                 if (!arf_is_special(xm) && !arf_is_special(ym))
637                 {
638                     xexp = ARF_EXP(xm);
639                     xn = ARF_SIZE(xm);
640                     xnegative = ARF_SGNBIT(xm);
641 
642                     yexp = ARF_EXP(ym);
643                     yn = ARF_SIZE(ym);
644                     ynegative = ARF_SGNBIT(ym);
645 
646                     exp = xexp + yexp;
647                     shift = sum_exp - exp;
648 
649                     if (shift >= sn * FLINT_BITS)
650                     {
651                     }
652                     else if (xn <= 2 && yn <= 2 && sn <= 3)
653                     {
654                         mp_limb_t x1, x0, y1, y0;
655                         mp_limb_t u3, u2, u1, u0;
656 
657                         if (xn == 1 && yn == 1)
658                         {
659                             x0 = ARF_NOPTR_D(xm)[0];
660                             y0 = ARF_NOPTR_D(ym)[0];
661                             umul_ppmm(u3, u2, x0, y0);
662                             u1 = u0 = 0;
663                         }
664                         else if (xn == 2 && yn == 2)
665                         {
666                             x0 = ARF_NOPTR_D(xm)[0];
667                             x1 = ARF_NOPTR_D(xm)[1];
668                             y0 = ARF_NOPTR_D(ym)[0];
669                             y1 = ARF_NOPTR_D(ym)[1];
670                             nn_mul_2x2(u3, u2, u1, u0, x1, x0, y1, y0);
671                         }
672                         else if (xn == 1)
673                         {
674                             x0 = ARF_NOPTR_D(xm)[0];
675                             y0 = ARF_NOPTR_D(ym)[0];
676                             y1 = ARF_NOPTR_D(ym)[1];
677                             nn_mul_2x1(u3, u2, u1, y1, y0, x0);
678                             u0 = 0;
679                         }
680                         else
681                         {
682                             x0 = ARF_NOPTR_D(xm)[0];
683                             x1 = ARF_NOPTR_D(xm)[1];
684                             y0 = ARF_NOPTR_D(ym)[0];
685                             nn_mul_2x1(u3, u2, u1, x1, x0, y0);
686                             u0 = 0;
687                         }
688 
689                         if (sn == 2)
690                         {
691                             if (shift < FLINT_BITS)
692                             {
693                                 u2 = (u2 >> shift) | (u3 << (FLINT_BITS - shift));
694                                 u3 = (u3 >> shift);
695                             }
696                             else if (shift == FLINT_BITS)
697                             {
698                                 u2 = u3;
699                                 u3 = 0;
700                             }
701                             else /* FLINT_BITS < shift < 2 * FLINT_BITS */
702                             {
703                                 u2 = (u3 >> (shift - FLINT_BITS));
704                                 u3 = 0;
705                             }
706 
707                             if (xnegative ^ ynegative ^ flipsign)
708                                 sub_ddmmss(sum[1], sum[0], sum[1], sum[0], u3, u2);
709                             else
710                                 add_ssaaaa(sum[1], sum[0], sum[1], sum[0], u3, u2);
711                         }
712                         else if (sn == 3)
713                         {
714                             if (shift < FLINT_BITS)
715                             {
716                                 u1 = (u1 >> shift) | (u2 << (FLINT_BITS - shift));
717                                 u2 = (u2 >> shift) | (u3 << (FLINT_BITS - shift));
718                                 u3 = (u3 >> shift);
719                             }
720                             else if (shift == FLINT_BITS)
721                             {
722                                 u1 = u2;
723                                 u2 = u3;
724                                 u3 = 0;
725                             }
726                             else if (shift < 2 * FLINT_BITS)
727                             {
728                                 u1 = (u3 << (2 * FLINT_BITS - shift)) | (u2 >> (shift - FLINT_BITS));
729                                 u2 = (u3 >> (shift - FLINT_BITS));
730                                 u3 = 0;
731                             }
732                             else if (shift == 2 * FLINT_BITS)
733                             {
734                                 u1 = u3;
735                                 u2 = 0;
736                                 u3 = 0;
737                             }
738                             else  /* 2 * FLINT_BITS < shift < 3 * FLINT_BITS */
739                             {
740                                 u1 = (u3 >> (shift - 2 * FLINT_BITS));
741                                 u2 = 0;
742                                 u3 = 0;
743                             }
744 
745                             if (xnegative ^ ynegative ^ flipsign)
746                                 sub_dddmmmsss2(sum[2], sum[1], sum[0], sum[2], sum[1], sum[0], u3, u2, u1);
747                             else
748                                 add_sssaaaaaa2(sum[2], sum[1], sum[0], sum[2], sum[1], sum[0], u3, u2, u1);
749                         }
750                     }
751                     else
752                     {
753                         mp_srcptr xptr, yptr;
754 
755                         xptr = (xn <= ARF_NOPTR_LIMBS) ? ARF_NOPTR_D(xm) : ARF_PTR_D(xm);
756                         yptr = (yn <= ARF_NOPTR_LIMBS) ? ARF_NOPTR_D(ym) : ARF_PTR_D(ym);
757 
758                         if (use_gauss == NULL || use_gauss[i] == 0)
759                             _arb_dot_addmul_generic(sum, &serr, tmp, sn, xptr, xn, yptr, yn, xnegative ^ ynegative ^ flipsign, shift);
760                     }
761                 }
762             }
763         }
764     }
765 
766     _arb_dot_output(acb_realref(res), re_sum, re_sn, subtract, re_sum_exp, re_prec);
767     _arb_dot_output(acb_imagref(res), im_sum, im_sn, subtract, im_sum_exp, im_prec);
768 
769     ARF_ADD_TMP_FREE(re_sum, alloc);
770     if (use_gauss != NULL)
771         flint_free(use_gauss);
772 }
773