/* mpn_div_qr_1u_pi2.

   THIS FILE CONTAINS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE.  IT IS
   ONLY SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2013, 2017 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

/* ISSUES:

   * Can we really use the high pi2 inverse limb for udiv_qrnnd_preinv?

   * Are there any problems with generating n quotient limbs in the q area?  It
     surely simplifies things.

   * Not yet adequately tested.
*/

#include "gmp-impl.h"
#include "longlong.h"

/* Define some longlong.h-style macros, but for wider operations.
   * add_sssaaaa is like longlong.h's add_ssaaaa but propagating carry-out into
     an additional sum operand.
*/
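/* That is, add_sssaaaa (s2, s1, s0, a1, a0, b1, b0) sets (s1, s0) to the
   two-limb sum (a1, a0) + (b1, b0) and adds the carry out of that sum into
   s2, which is both an input and an output; see the C fallback below.  */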
#if defined (__GNUC__)  && ! defined (__INTEL_COMPILER) && ! defined (NO_ASM)

#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
  __asm__ ("add\t%7, %k2\n\tadc\t%5, %k1\n\tadc\t$0, %k0"		\
	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
	   : "0"  ((USItype)(s2)),					\
	     "1"  ((USItype)(a1)), "g" ((USItype)(b1)),			\
	     "%2" ((USItype)(a0)), "g" ((USItype)(b0)))
#endif

#if defined (__amd64__) && W_TYPE_SIZE == 64
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
  __asm__ ("add\t%7, %q2\n\tadc\t%5, %q1\n\tadc\t$0, %q0"		\
	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
	   : "0"  ((UDItype)(s2)),					\
	     "1"  ((UDItype)(a1)), "rme" ((UDItype)(b1)),		\
	     "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
#endif

#if defined (__aarch64__) && W_TYPE_SIZE == 64
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
  __asm__ ("adds\t%2, %x6, %7\n\tadcs\t%1, %x4, %x5\n\tadc\t%0, %3, xzr"\
	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
	   : "rZ" (s2), "%rZ"  (a1), "rZ" (b1), "%rZ" (a0), "rI" (b0) __CLOBBER_CC)
#endif

#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB)
/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a
   processor running in 32-bit mode, since the carry flag then gets the 32-bit
   carry.  */
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
  __asm__ ("add%I7c\t%2,%6,%7\n\tadde\t%1,%4,%5\n\taddze\t%0,%3"	\
	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
	   : "r"  (s2), "r"  (a1), "r" (b1), "%r" (a0), "rI" (b0)	\
	     __CLOBBER_CC)
#endif

#endif /* __GNUC__ */

#ifndef add_sssaaaa
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
  do {									\
    UWtype __s0, __s1, __c0, __c1;					\
    __s0 = (a0) + (b0);							\
    __s1 = (a1) + (b1);							\
    __c0 = __s0 < (a0);							\
    __c1 = __s1 < (a1);							\
    (s0) = __s0;							\
    __s1 = __s1 + __c0;							\
    (s1) = __s1;							\
    (s2) += __c1 + (__s1 < __c0);					\
  } while (0)
#endif
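/* The generic C fallback above detects each carry with an unsigned
   comparison against an addend and folds both carries into s2.  */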

struct precomp_div_1_pi2
{
  mp_limb_t dip[2];
  mp_limb_t d;
  int norm_cnt;
};
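/* Here d is the (unnormalized) divisor, norm_cnt the shift count that
   normalizes it, and dip[] a two-limb inverse of the normalized divisor;
   dip[1] is also used as a plain udiv_qrnnd_preinv inverse (see the first
   ISSUES item above).  */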

mp_limb_t
mpn_div_qr_1u_pi2 (mp_ptr qp,
		   mp_srcptr up, mp_size_t un,
		   struct precomp_div_1_pi2 *pd)
{
  mp_size_t i;
  mp_limb_t r, u2, u1, u0;
  mp_limb_t d0, di1, di0;
  mp_limb_t q3a, q2a, q2b, q1b, q2c, q1c, q1d, q0d;
  mp_limb_t cnd;
  int cnt;

  ASSERT (un >= 2);
  ASSERT ((pd->d & GMP_NUMB_HIGHBIT) == 0);
  ASSERT (! MPN_OVERLAP_P (qp, un-2, up, un) || qp+2 >= up);
  ASSERT_MPN (up, un);

  /* q3:q2 are the two quotient limbs produced per step and q1 the low
     (fraction) limb used for the adjustment; they alias the partial-product
     variables that end up holding them.  */
#define q3 q3a
#define q2 q2b
#define q1 q1b

  up += un - 3;
  cnt = pd->norm_cnt;
  r = up[2] >> (GMP_NUMB_BITS - cnt);
  d0 = pd->d << cnt;

  qp += un - 2;

  di1 = pd->dip[1];
  di0 = pd->dip[0];
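
  /* Main loop: each iteration divides the three-limb normalized chunk
     {r, u1, u0} by d0, producing two quotient limbs and a new remainder
     in r.  up points at the third-highest unprocessed dividend limb, so
     up[2], up[1], up[0] supply the shift windows.  */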
  for (i = un - 3; i >= 0; i -= 2)
    {
      u2 = r;
      u1 = (up[2] << cnt) | (up[1] >> (GMP_NUMB_BITS - cnt));
      u0 = (up[1] << cnt) | (up[0] >> (GMP_NUMB_BITS - cnt));

      /* Dividend in {r,u1,u0} */

      /* Form a quotient approximation from the two high dividend limbs and
	 the two-limb inverse; carry-outs of the additions collect in r
	 (which still holds u2) and are folded into q3 below.  */
      umul_ppmm (q1d,q0d, u1, di0);
      umul_ppmm (q2b,q1b, u1, di1);
      q2b++;				/* cannot spill */
      add_sssaaaa (r,q2b,q1b, q2b,q1b, u1,u0);

      umul_ppmm (q2c,q1c, u2,  di0);
      add_sssaaaa (r,q2b,q1b, q2b,q1b, q2c,q1c);
      umul_ppmm (q3a,q2a, u2, di1);

      add_sssaaaa (r,q2b,q1b, q2b,q1b, q2a,q1d);

      q3 += r;

      /* Candidate remainder from the low dividend limb.  */
      r = u0 - q2 * d0;

      /* Adjust: a conditional decrement of {q3,q2} based on the fraction
	 limb q1, then an unlikely final increment if r still exceeds d0.  */
      cnd = (r >= q1);
      r += d0 & -cnd;
      sub_ddmmss (q3,q2,  q3,q2,  0,cnd);

      if (UNLIKELY (r >= d0))
	{
	  r -= d0;
	  add_ssaaaa (q3,q2,  q3,q2,  0,1);
	}

      qp[0] = q2;
      qp[1] = q3;

      up -= 2;
      qp -= 2;
    }
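
  /* Fewer than two fresh dividend limbs remain.  For odd un a single
     quotient limb is produced with udiv_qrnnd_preinv; for even un one more
     step like the loop body runs, with zero bits shifted into the low end
     of u0.  */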
  if ((un & 1) != 0)
    {
      u2 = r;
      u1 = (up[2] << cnt);

      udiv_qrnnd_preinv (q3, r, u2, u1, d0, di1);
      qp[1] = q3;
    }
  else
    {
      u2 = r;
      u1 = (up[2] << cnt) | (up[1] >> (GMP_NUMB_BITS - cnt));
      u0 = (up[1] << cnt);

      /* Dividend in {r,u1,u0} */

      umul_ppmm (q1d,q0d, u1, di0);
      umul_ppmm (q2b,q1b, u1, di1);
      q2b++;				/* cannot spill */
      add_sssaaaa (r,q2b,q1b, q2b,q1b, u1,u0);

      umul_ppmm (q2c,q1c, u2,  di0);
      add_sssaaaa (r,q2b,q1b, q2b,q1b, q2c,q1c);
      umul_ppmm (q3a,q2a, u2, di1);

      add_sssaaaa (r,q2b,q1b, q2b,q1b, q2a,q1d);

      q3 += r;

      r = u0 - q2 * d0;

      cnd = (r >= q1);
      r += d0 & -cnd;
      sub_ddmmss (q3,q2,  q3,q2,  0,cnd);

      if (UNLIKELY (r >= d0))
	{
	  r -= d0;
	  add_ssaaaa (q3,q2,  q3,q2,  0,1);
	}

      qp[0] = q2;
      qp[1] = q3;
    }

  /* The remainder was computed against the normalized divisor d0 = d << cnt,
     so its low cnt bits are zero here; shift back for the remainder mod d.  */
  return r >> cnt;

#undef q3
#undef q2
#undef q1
}