1 /* mpn_mulmid -- middle product
2 
3 Copyright (C) 2009, David Harvey
4 
5 All rights reserved.
6 
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions are met:
9 
10 1. Redistributions of source code must retain the above copyright notice, this
11    list of conditions and the following disclaimer.
12 
13 2. Redistributions in binary form must reproduce the above copyright notice,
14    this list of conditions and the following disclaimer in the documentation
15    and/or other materials provided with the distribution.
16 
17 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS''
18 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE
21 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
23 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
24 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
25 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 
28 */
29 
30 #include "mpir.h"
31 #include "gmp-impl.h"
32 
33 
34 #define CHUNK (200 + MULMID_TOOM42_THRESHOLD)
35 
36 /* Let a = sum_0^{m-1} a_i B^i and b = sum_0^{n-1} b_j B^j
37 
38    then MP(a, m, b, n) = sum_{0<=i<m, 0<=j<n, n-1<=i+j<=m-1} a_ib_j B^{i+j-n+1}
39 
40    Note there are m-n+1 different values of i+j and each product a_ib_j will be two limbs. Thus when added together, the sum must take up n-m+3 limbs of space.
41 
42    This function computes MP(ap,an,bp,bn), placing the result in {rp, an-bn+3}.
43 
44    It is required that bn << GMP_NUMBMAX.
45 */
46 
47 void
mpn_mulmid(mp_ptr rp,mp_srcptr ap,mp_size_t an,mp_srcptr bp,mp_size_t bn)48 mpn_mulmid (mp_ptr rp,
49             mp_srcptr ap, mp_size_t an,
50             mp_srcptr bp, mp_size_t bn)
51 {
52   mp_size_t rn, k;
53   mp_ptr scratch, temp;
54 
55   ASSERT (an >= bn);
56   ASSERT (bn >= 1);
57   ASSERT (! MPN_OVERLAP_P (rp, an - bn + 3, ap, an));
58   ASSERT (! MPN_OVERLAP_P (rp, an - bn + 3, bp, bn));
59 
60   if (bn < MULMID_TOOM42_THRESHOLD)
61     {
62       /* region not tall enough to make toom42 worthwhile for any portion */
63 
64       if (an < CHUNK)
65 	{
66 	  /* region not too wide either, just call basecase directly */
67 	  mpn_mulmid_basecase (rp, ap, an, bp, bn);
68 	  return;
69 	}
70 
71       /* Region quite wide. For better locality, use basecase on chunks:
72 
73 	 AAABBBCC..
74 	 .AAABBBCC.
75 	 ..AAABBBCC
76       */
77 
78       k = CHUNK - bn + 1;    /* number of diagonals per chunk */
79 
80       /* first chunk (marked A in the above diagram) */
81       mpn_mulmid_basecase (rp, ap, CHUNK, bp, bn);
82 
83       /* remaining chunks (B, C, etc) */
84       an -= k;
85 
86       while (an >= CHUNK)
87 	{
88 	  mp_limb_t t0, t1, cy;
89 	  ap += k, rp += k;
90 	  t0 = rp[0], t1 = rp[1];
91 	  mpn_mulmid_basecase (rp, ap, CHUNK, bp, bn);
92 	  ADDC_LIMB (cy, rp[0], rp[0], t0);    /* add back saved limbs */
93 	  MPN_INCR_U (rp + 1, k + 1, t1 + cy);
94 	  an -= k;
95 	}
96 
97       if (an >= bn)
98 	{
99 	  /* last remaining chunk */
100 	  mp_limb_t t0, t1, cy;
101 	  ap += k, rp += k;
102 	  t0 = rp[0], t1 = rp[1];
103 	  mpn_mulmid_basecase (rp, ap, an, bp, bn);
104 	  ADDC_LIMB (cy, rp[0], rp[0], t0);
105 	  MPN_INCR_U (rp + 1, an - bn + 2, t1 + cy);
106 	}
107 
108       return;
109     }
110 
111   /* region is tall enough for toom42 */
112 
113   rn = an - bn + 1;
114 
115   if (rn < MULMID_TOOM42_THRESHOLD)
116     {
117       /* region not wide enough to make toom42 worthwhile for any portion */
118 
119       if (bn < CHUNK)
120 	{
121 	  /* region not too tall either, just call basecase directly */
122 	  mpn_mulmid_basecase (rp, ap, an, bp, bn);
123 	  return;
124 	}
125 
126       /* Region quite tall. For better locality, use basecase on chunks:
127 
128 	 AAAAA....
129 	 .AAAAA...
130 	 ..BBBBB..
131 	 ...BBBBB.
132 	 ....CCCCC
133       */
134       {
135           TMP_DECL;
136           TMP_MARK;
137 
138           temp = TMP_ALLOC_LIMBS (rn + 2);
139 
140           /* first chunk (marked A in the above diagram) */
141           bp += bn - CHUNK, an -= bn - CHUNK;
142           mpn_mulmid_basecase (rp, ap, an, bp, CHUNK);
143 
144           /* remaining chunks (B, C, etc) */
145           bn -= CHUNK;
146 
147           while (bn >= CHUNK)
148 	    {
149 	      ap += CHUNK, bp -= CHUNK;
150 	      mpn_mulmid_basecase (temp, ap, an, bp, CHUNK);
151 	      mpn_add_n (rp, rp, temp, rn + 2);
152 	      bn -= CHUNK;
153 	    }
154 
155           if (bn)
156 	    {
157 	      /* last remaining chunk */
158 	      ap += CHUNK, bp -= bn;
159 	      mpn_mulmid_basecase (temp, ap, rn + bn - 1, bp, bn);
160 	      mpn_add_n (rp, rp, temp, rn + 2);
161 	    }
162 
163           TMP_FREE;
164       }
165       return;
166     }
167 
168   /* we're definitely going to use toom42 somewhere */
169 
170   if (bn > rn)
171     {
172       /* slice region into chunks, use toom42 on all chunks except possibly
173 	 the last:
174 
175          AA....
176          .AA...
177          ..BB..
178          ...BB.
179          ....CC
180       */
181 
182       TMP_DECL;
183       TMP_MARK;
184 
185       temp = TMP_ALLOC_LIMBS (rn + 2 + mpn_toom42_mulmid_itch (rn));
186       scratch = temp + rn + 2;
187 
188       /* first chunk (marked A in the above diagram) */
189       bp += bn - rn;
190       mpn_toom42_mulmid (rp, ap, bp, rn, scratch);
191 
192       /* remaining chunks (B, C, etc) */
193       bn -= rn;
194 
195       while (bn >= rn)
196         {
197           ap += rn, bp -= rn;
198 	  mpn_toom42_mulmid (temp, ap, bp, rn, scratch);
199           mpn_add_n (rp, rp, temp, rn + 2);
200           bn -= rn;
201         }
202 
203       if (bn)
204         {
205           /* last remaining chunk */
206           ap += rn, bp -= bn;
207 	  mpn_mulmid (temp, ap, rn + bn - 1, bp, bn);
208           mpn_add_n (rp, rp, temp, rn + 2);
209         }
210 
211       TMP_FREE;
212     }
213   else
214     {
215       /* slice region into chunks, use toom42 on all chunks except possibly
216 	 the last:
217 
218          AAABBBCC..
219          .AAABBBCC.
220          ..AAABBBCC
221       */
222 
223       TMP_DECL;
224       TMP_MARK;
225 
226       scratch = TMP_ALLOC_LIMBS (mpn_toom42_mulmid_itch (bn));
227 
228       /* first chunk (marked A in the above diagram) */
229       mpn_toom42_mulmid (rp, ap, bp, bn, scratch);
230 
231       /* remaining chunks (B, C, etc) */
232       rn -= bn;
233 
234       while (rn >= bn)
235         {
236 	  mp_limb_t t0, t1, cy;
237           ap += bn, rp += bn;
238           t0 = rp[0], t1 = rp[1];
239           mpn_toom42_mulmid (rp, ap, bp, bn, scratch);
240 	  ADDC_LIMB (cy, rp[0], rp[0], t0);     /* add back saved limbs */
241 	  MPN_INCR_U (rp + 1, bn + 1, t1 + cy);
242 	  rn -= bn;
243         }
244 
245       TMP_FREE;
246 
247       if (rn)
248         {
249           /* last remaining chunk */
250 	  mp_limb_t t0, t1, cy;
251           ap += bn, rp += bn;
252           t0 = rp[0], t1 = rp[1];
253           mpn_mulmid (rp, ap, rn + bn - 1, bp, bn);
254 	  ADDC_LIMB (cy, rp[0], rp[0], t0);
255 	  MPN_INCR_U (rp + 1, rn + 1, t1 + cy);
256         }
257     }
258 }
259