/* mpn_div_qr_1u_pi2.

   THIS FILE CONTAINS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE.  IT IS
   ONLY SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2013 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

/* ISSUES:

   * Can we really use the high pi2 inverse limb for udiv_qrnnd_preinv?

   * Are there any problems with generating n quotient limbs in the q area?  It
     surely simplifies things.

   * Not yet adequately tested.
*/

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"

/* Define some longlong.h-style macros, but for wider operations.
   * add_sssaaaa is like longlong.h's add_ssaaaa but propagating
     carry-out into an additional sum operand.
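     That is, {s2,s1,s0} <- {s2,a1,a0} + {0,b1,b0}, with the carry out of the
     low two limbs accumulated into s2.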
*/
#if defined (__GNUC__)  && ! defined (__INTEL_COMPILER)

#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
  __asm__ ("add\t%7, %k2\n\tadc\t%5, %k1\n\tadc\t$0, %k0"		\
	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
	   : "0"  ((USItype)(s2)),					\
	     "1"  ((USItype)(a1)), "g" ((USItype)(b1)),			\
	     "%2" ((USItype)(a0)), "g" ((USItype)(b0)))
#endif

#if defined (__amd64__) && W_TYPE_SIZE == 64
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
  __asm__ ("add\t%7, %q2\n\tadc\t%5, %q1\n\tadc\t$0, %q0"		\
	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
	   : "0"  ((UDItype)(s2)),					\
	     "1"  ((UDItype)(a1)), "rme" ((UDItype)(b1)),		\
	     "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
#endif

#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB)
/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a
   processor running in 32-bit mode, since the carry flag then gets the 32-bit
   carry.  */
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
  __asm__ ("add%I7c\t%2,%6,%7\n\tadde\t%1,%4,%5\n\taddze\t%0,%0"	\
	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
	   : "r"  (s2), "r"  (a1), "r" (b1), "%r" (a0), "rI" (b0))
#endif

#endif /* __GNUC__ */

#ifndef add_sssaaaa
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
  do {									\
    UWtype __s0, __s1, __c0, __c1;					\
    __s0 = (a0) + (b0);							\
    __s1 = (a1) + (b1);							\
    __c0 = __s0 < (a0);							\
    __c1 = __s1 < (a1);							\
    (s0) = __s0;							\
    __s1 = __s1 + __c0;							\
    (s1) = __s1;							\
    (s2) += __c1 + (__s1 < __c0);					\
  } while (0)
#endif

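/* Precomputed data for division by the single limb d, whose high bit is clear
   (the "unnormalized" case): dip[] holds a two-limb inverse of the normalized
   divisor d << norm_cnt, and norm_cnt is the normalizing shift count.  */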
struct precomp_div_1_pi2
{
  mp_limb_t dip[2];
  mp_limb_t d;
  int norm_cnt;
};

mp_limb_t
mpn_div_qr_1u_pi2 (mp_ptr qp,
		   mp_srcptr up, mp_size_t un,
		   struct precomp_div_1_pi2 *pd)
{
  mp_size_t i;
  mp_limb_t r, u2, u1, u0;
  mp_limb_t d0, di1, di0;
  mp_limb_t q3a, q2a, q2b, q1b, q2c, q1c, q1d, q0d;
  mp_limb_t cnd;
  int cnt;

  ASSERT (un >= 2);
  ASSERT ((pd->d & GMP_NUMB_HIGHBIT) == 0);
  ASSERT (! MPN_OVERLAP_P (qp, un-2, up, un) || qp+2 >= up);
  ASSERT_MPN (up, un);

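/* Aliases naming the variables that, once the partial products below have
   been accumulated, hold the two candidate quotient limbs (q3, q2) and the
   auxiliary low limb (q1) used in the adjustment test.  */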
#define q3 q3a
#define q2 q2b
#define q1 q1b

  up += un - 3;
  cnt = pd->norm_cnt;
  r = up[2] >> (GMP_NUMB_BITS - cnt);
  d0 = pd->d << cnt;

  qp += un - 2;

  di1 = pd->dip[1];
  di0 = pd->dip[0];

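  /* Main loop: each iteration consumes two dividend limbs and produces two
     quotient limbs, dividing the normalized (left-shifted) chunk {r,u1,u0}
     by d0 with the help of the two-limb inverse {di1,di0}.  */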
  for (i = un - 3; i >= 0; i -= 2)
    {
      u2 = r;
      u1 = (up[2] << cnt) | (up[1] >> (GMP_NUMB_BITS - cnt));
      u0 = (up[1] << cnt) | (up[0] >> (GMP_NUMB_BITS - cnt));

      /* Dividend in {r,u1,u0} */

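      /* Multiply the dividend by the two-limb inverse and accumulate the
	 partial products; r (entering with the value u2) collects the high
	 carries.  After the final q3 += r, {q3,q2} is the candidate quotient
	 and q1 is needed only for the adjustment test below.  */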
      umul_ppmm (q1d,q0d, u1, di0);
      umul_ppmm (q2b,q1b, u1, di1);
      q2b++;				/* cannot spill */
      add_sssaaaa (r,q2b,q1b, q2b,q1b, u1,u0);

      umul_ppmm (q2c,q1c, u2,  di0);
      add_sssaaaa (r,q2b,q1b, q2b,q1b, q2c,q1c);
      umul_ppmm (q3a,q2a, u2, di1);

      add_sssaaaa (r,q2b,q1b, q2b,q1b, q2a,q1d);

      q3 += r;

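      /* Candidate remainder from the low dividend limb and the low quotient
	 limb; the two conditional steps below bring r into [0, d0), adjusting
	 {q3,q2} by one in either direction as needed.  */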
      r = u0 - q2 * d0;

      cnd = (r >= q1);
      r += d0 & -cnd;
      sub_ddmmss (q3,q2,  q3,q2,  0,cnd);

      if (UNLIKELY (r >= d0))
	{
	  r -= d0;
	  add_ssaaaa (q3,q2,  q3,q2,  0,1);
	}

      qp[0] = q2;
      qp[1] = q3;

      up -= 2;
      qp -= 2;
    }

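  /* Low end of the dividend: one final normalized limb remains when un is
     odd, two when un is even.  */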
  if ((un & 1) != 0)
    {
      u2 = r;
      u1 = (up[2] << cnt);

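      /* A single quotient limb remains; di1 is used here as the preinverse of
	 the normalized divisor d0 (cf. the first ISSUES note above).  */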
      udiv_qrnnd_preinv (q3, r, u2, u1, d0, di1);
      qp[1] = q3;
    }
  else
    {
      u2 = r;
      u1 = (up[2] << cnt) | (up[1] >> (GMP_NUMB_BITS - cnt));
      u0 = (up[1] << cnt);

      /* Dividend in {r,u1,u0} */

      umul_ppmm (q1d,q0d, u1, di0);
      umul_ppmm (q2b,q1b, u1, di1);
      q2b++;				/* cannot spill */
      add_sssaaaa (r,q2b,q1b, q2b,q1b, u1,u0);

      umul_ppmm (q2c,q1c, u2,  di0);
      add_sssaaaa (r,q2b,q1b, q2b,q1b, q2c,q1c);
      umul_ppmm (q3a,q2a, u2, di1);

      add_sssaaaa (r,q2b,q1b, q2b,q1b, q2a,q1d);

      q3 += r;

      r = u0 - q2 * d0;

      cnd = (r >= q1);
      r += d0 & -cnd;
      sub_ddmmss (q3,q2,  q3,q2,  0,cnd);

      if (UNLIKELY (r >= d0))
	{
	  r -= d0;
	  add_ssaaaa (q3,q2,  q3,q2,  0,1);
	}

      qp[0] = q2;
      qp[1] = q3;
    }

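  /* r is the remainder of the shifted (normalized) dividend; undo the
     normalization.  */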
  return r >> cnt;

#undef q3
#undef q2
#undef q1
}
