/* mpn_div_qr_1n_pi2.

   THIS FILE CONTAINS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE.  IT IS
   ONLY SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2013 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

/* ISSUES:

   * Can we really use the high pi2 inverse limb for udiv_qrnnd_preinv?

   * Are there any problems with generating n quotient limbs in the q area?  It
     surely simplifies things.

   * Not yet adequately tested.
*/

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"

/* Define some longlong.h-style macros, but for wider operations.
   * add_sssaaaa is like longlong.h's add_ssaaaa but propagating
     carry-out into an additional sum operand.
*/
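/* Illustration, with 8-bit limbs for brevity: add_sssaaaa (s2, s1, s0,
   0xff, 0xff, 0x00, 0x01) computes 0xffff + 0x0001 = 0x10000, so it leaves
   {s1,s0} = {0x00,0x00} and adds the carry-out of 1 into s2.  */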
#if defined (__GNUC__)  && ! defined (__INTEL_COMPILER)

#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
  __asm__ ("add\t%7, %k2\n\tadc\t%5, %k1\n\tadc\t$0, %k0"		\
	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
	   : "0"  ((USItype)(s2)),					\
	     "1"  ((USItype)(a1)), "g" ((USItype)(b1)),			\
	     "%2" ((USItype)(a0)), "g" ((USItype)(b0)))
#endif

#if defined (__amd64__) && W_TYPE_SIZE == 64
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
  __asm__ ("add\t%7, %q2\n\tadc\t%5, %q1\n\tadc\t$0, %q0"		\
	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
	   : "0"  ((UDItype)(s2)),					\
	     "1"  ((UDItype)(a1)), "rme" ((UDItype)(b1)),		\
	     "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
#endif

#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB)
/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a
   processor running in 32-bit mode, since the carry flag then gets the 32-bit
   carry.  */
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
  __asm__ ("add%I7c\t%2,%6,%7\n\tadde\t%1,%4,%5\n\taddze\t%0,%0"	\
	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
	   : "r"  (s2), "r"  (a1), "r" (b1), "%r" (a0), "rI" (b0))
#endif

#endif /* __GNUC__ */

#ifndef add_sssaaaa
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
  do {									\
    UWtype __s0, __s1, __c0, __c1;					\
    __s0 = (a0) + (b0);							\
    __s1 = (a1) + (b1);							\
    __c0 = __s0 < (a0);							\
    __c1 = __s1 < (a1);							\
    (s0) = __s0;							\
    __s1 = __s1 + __c0;							\
    (s1) = __s1;							\
    (s2) += __c1 + (__s1 < __c0);					\
  } while (0)
#endif

struct precomp_div_1_pi2
{
  mp_limb_t dip[2];
  mp_limb_t d;
  int norm_cnt;
};
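/* How the fields are used below: d is the divisor, which must be normalized
   (high bit set), and {dip[1],dip[0]} is a two-limb pre-inverse of it;
   norm_cnt is not read by this function (it is meant for the unnormalized
   variant).  Assuming the usual pi2 convention, suggested by the estimation
   code in the loop, that {dip[1],dip[0]} = floor((B^3-1)/d) - B^2 with
   B = 2^GMP_NUMB_BITS, a caller with a normalized mp_limb_t d and a
   struct precomp_div_1_pi2 *pd could (ignoring nails) fill in the struct
   along these lines; this is only a sketch, not an interface GMP documents:

     mp_limb_t q1, q0, r1, r0;
     invert_limb (q1, d);		   // q1 = floor((B^2-1)/d) - B
     r1 = GMP_NUMB_MAX - q1 * d;	   // (B^2-1) mod d
     udiv_qrnnd (q0, r0, r1, GMP_NUMB_MAX, d);
     pd->dip[1] = q1;
     pd->dip[0] = q0;
     pd->d = d;
*/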

mp_limb_t
mpn_div_qr_1n_pi2 (mp_ptr qp,
		   mp_srcptr up, mp_size_t un,
		   struct precomp_div_1_pi2 *pd)
{
  mp_limb_t most_significant_q_limb;
  mp_size_t i;
  mp_limb_t r, u2, u1, u0;
  mp_limb_t d0, di1, di0;
  mp_limb_t q3a, q2a, q2b, q1b, q2c, q1c, q1d, q0d;
  mp_limb_t cnd;

  ASSERT (un >= 2);
  ASSERT ((pd->d & GMP_NUMB_HIGHBIT) != 0);
  ASSERT (! MPN_OVERLAP_P (qp, un-2, up, un) || qp+2 >= up);
  ASSERT_MPN (up, un);

#define q3 q3a
#define q2 q2b
#define q1 q1b

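  /* Reduce the most significant dividend limb modulo d.  Since d is
     normalized, that limb is smaller than 2*d, so the corresponding quotient
     limb is either 0 or 1 and is stored as the top quotient limb.  */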
  up += un - 3;
  r = up[2];
  d0 = pd->d;

  most_significant_q_limb = (r >= d0);
  r -= d0 & -most_significant_q_limb;

  qp += un - 3;
  qp[2] = most_significant_q_limb;

  di1 = pd->dip[1];
  di0 = pd->dip[0];

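  /* Main loop: each iteration divides the three-limb number {u2,u1,u0},
     where u2 = r < d, by d, producing the two quotient limbs {q3,q2} and a
     new remainder r.  The quotient is estimated from the dividend and its
     product with the two-limb inverse {di1,di0}, then fixed up by the small
     corrections at the end of the loop body.  */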
  for (i = un - 3; i >= 0; i -= 2)
    {
      u2 = r;
      u1 = up[1];
      u0 = up[0];

      /* Dividend in {r,u1,u0} */

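      /* The following computes the three-limb estimate
	 {q3,q2,q1} = {u2,u1,u0} + floor ({u2,u1} * {di1,di0} / B) + B
	 (B = 2^GMP_NUMB_BITS), from which the quotient {q3,q2} and the
	 remainder are obtained after the corrections below.  */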
      umul_ppmm (q1d,q0d, u1, di0);
      umul_ppmm (q2b,q1b, u1, di1);
      q2b++;				/* cannot spill */
      add_sssaaaa (r,q2b,q1b, q2b,q1b, u1,u0);

      umul_ppmm (q2c,q1c, u2,  di0);
      add_sssaaaa (r,q2b,q1b, q2b,q1b, q2c,q1c);
      umul_ppmm (q3a,q2a, u2, di1);

      add_sssaaaa (r,q2b,q1b, q2b,q1b, q2a,q1d);

      q3 += r;

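      /* Candidate remainder for the estimated quotient, followed by the
	 usual preinv-style corrections: conditionally add d0 back and
	 decrement {q3,q2}, and, in the unlikely remaining case, subtract d0
	 and increment.  */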
      r = u0 - q2 * d0;

      cnd = (r >= q1);
      r += d0 & -cnd;
      sub_ddmmss (q3,q2,  q3,q2,  0,cnd);

      if (UNLIKELY (r >= d0))
	{
	  r -= d0;
	  add_ssaaaa (q3,q2,  q3,q2,  0,1);
	}

      qp[0] = q2;
      qp[1] = q3;

      up -= 2;
      qp -= 2;
    }

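  /* For even un, one low limb remains after the two-limb loop; finish with a
     single 2/1 division, using the high inverse limb di1 as a plain pi1
     inverse (cf. the first ISSUES note above).  */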
  if ((un & 1) == 0)
    {
      u2 = r;
      u1 = up[1];

      udiv_qrnnd_preinv (q3, r, u2, u1, d0, di1);
      qp[1] = q3;
    }

  return r;

#undef q3
#undef q2
#undef q1
}