1 /*
2     Copyright (C) 2009, 2011 William Hart
3 
4     This file is part of FLINT.
5 
6     FLINT is free software: you can redistribute it and/or modify it under
7     the terms of the GNU Lesser General Public License (LGPL) as published
8     by the Free Software Foundation; either version 2.1 of the License, or
9     (at your option) any later version.  See <http://www.gnu.org/licenses/>.
10 */
11 
12 #include "gmp.h"
13 #include "flint.h"
14 #include "fft.h"
15 #include "longlong.h"
16 #include "ulong_extras.h"
17 #include "fft_tuning.h"
18 #include "mpn_extras.h"
19 
20 static mp_size_t mulmod_2expp1_table_n[FFT_N_NUM] = MULMOD_TAB;
21 
fft_naive_convolution_1(mp_limb_t * r,mp_limb_t * ii,mp_limb_t * jj,mp_size_t m)22 void fft_naive_convolution_1(mp_limb_t * r, mp_limb_t * ii, mp_limb_t * jj, mp_size_t m)
23 {
24    mp_size_t i, j;
25 
26    for (i = 0; i < m; i++)
27       r[i] = ii[0]*jj[i];
28 
29    for (i = 1; i < m; i++)
30    {
31       for (j = 0; j < m - i; j++)
32          r[i+j] += ii[i]*jj[j];
33 
34       for ( ; j < m; j++)
35          r[i+j-m] -=ii[i]*jj[j];
36    }
37 }
38 
_fft_mulmod_2expp1(mp_limb_t * r1,mp_limb_t * i1,mp_limb_t * i2,mp_size_t r_limbs,flint_bitcnt_t depth,flint_bitcnt_t w)39 void _fft_mulmod_2expp1(mp_limb_t * r1, mp_limb_t * i1, mp_limb_t * i2,
40                  mp_size_t r_limbs, flint_bitcnt_t depth, flint_bitcnt_t w)
41 {
42    mp_size_t n = (UWORD(1)<<depth);
43    flint_bitcnt_t bits1 = (r_limbs*FLINT_BITS)/(2*n);
44 
45    mp_size_t limb_add, limbs = (n*w)/FLINT_BITS;
46    mp_size_t size = limbs + 1;
47    mp_size_t i, j, ll;
48 
49    mp_limb_t * ptr;
50    mp_limb_t ** ii, ** jj, *tt, *t1, *t2, *s1, *r, *ii0, *jj0;
51    mp_limb_t c;
52 
53    ii = flint_malloc((2*(n + n*size) + 4*n + 5*size)*sizeof(mp_limb_t));
54    for (i = 0, ptr = (mp_limb_t *) ii + 2*n; i < 2*n; i++, ptr += size)
55    {
56       ii[i] = ptr;
57    }
58    ii0 = ptr;
59    t1 = ii0 + 2*n;
60    t2 = t1 + size;
61    s1 = t2 + size;
62    r = s1 + size;
63    tt = r + 2*n;
64 
65    if (i1 != i2)
66    {
67       jj = flint_malloc((2*(n + n*size) + 2*n)*sizeof(mp_limb_t));
68       for (i = 0, ptr = (mp_limb_t *) jj + 2*n; i < 2*n; i++, ptr += size)
69       {
70          jj[i] = ptr;
71       }
72       jj0 = ptr;
73    } else
74    {
75       jj = ii;
76       jj0 = ii0;
77    }
78 
79    j = fft_split_bits(ii, i1, r_limbs, bits1, limbs);
80    for ( ; j < 2*n; j++)
81       flint_mpn_zero(ii[j], limbs + 1);
82 
83    for (i = 0; i < 2*n; i++)
84       ii0[i] = ii[i][0];
85 
86    fft_negacyclic(ii, n, w, &t1, &t2, &s1);
87    for (j = 0; j < 2*n; j++)
88       mpn_normmod_2expp1(ii[j], limbs);
89 
90    if (i1 != i2)
91    {
92       j = fft_split_bits(jj, i2, r_limbs, bits1, limbs);
93       for ( ; j < 2*n; j++)
94          flint_mpn_zero(jj[j], limbs + 1);
95 
96       for (i = 0; i < 2*n; i++)
97          jj0[i] = jj[i][0];
98 
99       fft_negacyclic(jj, n, w, &t1, &t2, &s1);
100    }
101 
102    for (j = 0; j < 2*n; j++)
103    {
104       if (i1 != i2) mpn_normmod_2expp1(jj[j], limbs);
105       c = 2*ii[j][limbs] + jj[j][limbs];
106       ii[j][limbs] = flint_mpn_mulmod_2expp1_basecase(ii[j], ii[j], jj[j], c, n*w, tt);
107    }
108 
109    ifft_negacyclic(ii, n, w, &t1, &t2, &s1);
110 
111    fft_naive_convolution_1(r, ii0, jj0, 2*n);
112 
113    for (j = 0; j < 2*n; j++)
114    {
115       mp_limb_t t, cy2;
116 
117       mpn_div_2expmod_2expp1(ii[j], ii[j], limbs, depth + 1);
118       mpn_normmod_2expp1(ii[j], limbs);
119 
120       t = ii[j][limbs];
121       ii[j][limbs] = r[j] - ii[j][0];
122       cy2 = mpn_add_1(ii[j], ii[j], limbs + 1, ii[j][limbs]);
123       add_ssaaaa(r[j], ii[j][limbs], 0, ii[j][limbs], 0, t);
124       if (cy2) r[j]++;
125    }
126 
127    flint_mpn_zero(r1, r_limbs + 1);
128    fft_combine_bits(r1, ii, 2*n - 1, bits1, limbs + 1, r_limbs + 1);
129 
130    /*
131       as the negacyclic convolution has effectively done subtractions
132       some of the coefficients will be negative, so need to subtract p
133    */
134    ll = 0;
135    limb_add = bits1/FLINT_BITS;
136 
137    for (j = 0; j < 2*n - 2; j++)
138    {
139       if (r[j])
140          mpn_sub_1(r1 + ll + 1, r1 + ll + 1, r_limbs - ll, 1);
141       else if ((mp_limb_signed_t) ii[j][limbs] < 0) /* coefficient was -ve */
142       {
143          mpn_sub_1(r1 + ll + 1, r1 + ll + 1, r_limbs - ll, 1);
144          mpn_sub_1(r1 + ll + limbs + 1, r1 + ll + limbs + 1, r_limbs - limbs - ll, 1);
145       }
146 
147       ll += limb_add;
148    }
149    /* penultimate coefficient, top bit was already ignored */
150    if (r[j] || (mp_limb_signed_t) ii[j][limbs] < 0) /* coefficient was -ve */
151       mpn_sub_1(r1 + ll + 1, r1 + ll + 1, r_limbs - ll, 1);
152 
153    /* final coefficient wraps around */
154    if (limb_add)
155       r1[r_limbs] += mpn_add_n(r1 + r_limbs - limb_add, r1 + r_limbs - limb_add, ii[2*n - 1], limb_add);
156    c = mpn_sub_n(r1, r1, ii[2*n - 1] + limb_add, limbs + 1 - limb_add);
157    mpn_addmod_2expp1_1(r1 + limbs + 1 - limb_add, r_limbs - limbs - 1 + limb_add, -c);
158    mpn_normmod_2expp1(r1, r_limbs);
159 
160    flint_free(ii);
161    if (i1 != i2) flint_free(jj);
162 }
163 
fft_mulmod_2expp1(mp_limb_t * r,mp_limb_t * i1,mp_limb_t * i2,mp_size_t n,mp_size_t w,mp_limb_t * tt)164 void fft_mulmod_2expp1(mp_limb_t * r, mp_limb_t * i1, mp_limb_t * i2,
165                            mp_size_t n, mp_size_t w, mp_limb_t * tt)
166 {
167    mp_size_t bits = n*w;
168    mp_size_t limbs = bits/FLINT_BITS;
169    flint_bitcnt_t depth1, depth = 1;
170 
171    mp_size_t w1, off;
172 
173    mp_limb_t c = 2*i1[limbs] + i2[limbs];
174 
175    if (c & 1)
176    {
177       mpn_neg_n(r, i1, limbs + 1);
178       mpn_normmod_2expp1(r, limbs);
179       return;
180    } else if (c & 2)
181    {
182       mpn_neg_n(r, i2, limbs + 1);
183       mpn_normmod_2expp1(r, limbs);
184       return;
185    }
186 
187    if (limbs <= FFT_MULMOD_2EXPP1_CUTOFF)
188    {
189       r[limbs] = flint_mpn_mulmod_2expp1_basecase(r, i1, i2, c, bits, tt);
190       return;
191    }
192 
193    while ((UWORD(1)<<depth) < bits) depth++;
194 
195    if (depth < 12) off = mulmod_2expp1_table_n[0];
196    else off = mulmod_2expp1_table_n[FLINT_MIN(depth, FFT_N_NUM + 11) - 12];
197    depth1 = depth/2 - off;
198 
199    w1 = bits/(UWORD(1)<<(2*depth1));
200 
201    _fft_mulmod_2expp1(r, i1, i2, limbs, depth1, w1);
202 }
203 
fft_adjust_limbs(mp_size_t limbs)204 slong fft_adjust_limbs(mp_size_t limbs)
205 {
206    mp_size_t bits1 = limbs*FLINT_BITS, bits2;
207    mp_size_t depth = 1, limbs2, depth1 = 1, depth2 = 1, adj;
208    mp_size_t off1, off2;
209 
210    if (limbs <= FFT_MULMOD_2EXPP1_CUTOFF) return limbs;
211 
212    depth = FLINT_CLOG2(limbs);
213    limbs2 = (WORD(1)<<depth); /* within a factor of 2 of limbs */
214    bits2 = limbs2*FLINT_BITS;
215 
216    depth1 = FLINT_CLOG2(bits1);
217    if (depth1 < 12) off1 = mulmod_2expp1_table_n[0];
218    else off1 = mulmod_2expp1_table_n[FLINT_MIN(depth1, FFT_N_NUM + 11) - 12];
219    depth1 = depth1/2 - off1;
220 
221    depth2 = FLINT_CLOG2(bits2);
222    if (depth2 < 12) off2 = mulmod_2expp1_table_n[0];
223    else off2 = mulmod_2expp1_table_n[FLINT_MIN(depth2, FFT_N_NUM + 11) - 12];
224    depth2 = depth2/2 - off2;
225 
226    depth1 = FLINT_MAX(depth1, depth2);
227    adj = (WORD(1)<<(depth1 + 1));
228    limbs2 = adj*((limbs + adj - 1)/adj); /* round up number of limbs */
229    bits1 = limbs2*FLINT_BITS;
230    bits2 = (WORD(1)<<(depth1*2));
231    bits1 = bits2*((bits1 + bits2 - 1)/bits2); /* round up bits */
232    limbs = bits1/FLINT_BITS;
233 
234    return limbs;
235 }
236