/*
This file is part of pocketfft.

Copyright (C) 2010-2020 Max-Planck-Society
Copyright (C) 2019 Peter Bell

Authors: Martin Reinecke, Peter Bell

All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice, this
  list of conditions and the following disclaimer in the documentation and/or
  other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its contributors may
  be used to endorse or promote products derived from this software without
  specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/*
 *  Python interface.
 */

#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include <pybind11/stl.h>
#include <complex>
#include <cmath>
#include <limits>

#include "ducc0/fft/fft.h"
#include "ducc0/bindings/pybind_utils.h"

namespace ducc0 {

namespace detail_pymodule_fft {

namespace {

using shape_t = ducc0::fmav_info::shape_t;
using std::size_t;
using std::ptrdiff_t;

namespace py = pybind11;

// Only instantiate long double transforms if they offer more precision
using ldbl_t = typename std::conditional<
  sizeof(long double)==sizeof(double), double, long double>::type;

using c64 = std::complex<float>;
using c128 = std::complex<double>;
using clong = std::complex<ldbl_t>;
using f32 = float;
using f64 = double;
using flong = ldbl_t;
auto None = py::none();

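// Convert the Python-level "axes" argument into a list of axis indices.
// None selects all axes of "in"; negative entries are interpreted relative
// to the end, and empty or out-of-range axis lists are rejected.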
shape_t makeaxes(const py::array &in, const py::object &axes)
  {
  if (axes.is_none())
    {
    shape_t res(size_t(in.ndim()));
    for (size_t i=0; i<res.size(); ++i)
      res[i]=i;
    return res;
    }
  auto tmp=axes.cast<std::vector<ptrdiff_t>>();
  auto ndim = in.ndim();
  if ((tmp.size()>size_t(ndim)) || (tmp.size()==0))
    throw std::runtime_error("bad axes argument");
  for (auto& sz: tmp)
    {
    if (sz<0)
      sz += ndim;
    if ((sz>=ndim) || (sz<0))
      throw std::invalid_argument("axes exceeds dimensionality of output");
    }
  return shape_t(tmp.begin(), tmp.end());
  }

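// Dispatch on the numpy dtype of "arr": T1, T2 and T3 are the supported
// dtypes (double-, single- and long-double-based), which select the
// corresponding template instantiation of "func".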
#define DISPATCH(arr, T1, T2, T3, func, args) \
  { \
  if (py::isinstance<py::array_t<T1>>(arr)) return func<double> args; \
  if (py::isinstance<py::array_t<T2>>(arr)) return func<float> args;  \
  if (py::isinstance<py::array_t<T3>>(arr)) return func<ldbl_t> args; \
  throw std::runtime_error("unsupported data type"); \
  }

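// Normalization factor for a transform over N elements:
// 1 for inorm==0, 1/sqrt(N) for inorm==1, 1/N for inorm==2.
// The factor is computed in long double to limit rounding error.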
template<typename T> T norm_fct(int inorm, size_t N)
  {
  if (inorm==0) return T(1);
  if (inorm==2) return T(1/ldbl_t(N));
  if (inorm==1) return T(1/sqrt(ldbl_t(N)));
  throw std::invalid_argument("invalid value for inorm (must be 0, 1, or 2)");
  }

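// Same as above, with N computed from the transformed axes. "fct" and
// "delta" express the effective length of each axis as fct*(n+delta),
// which is needed for the DCT/DST normalizations below.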
template<typename T> T norm_fct(int inorm, const shape_t &shape,
  const shape_t &axes, size_t fct=1, int delta=0)
  {
  if (inorm==0) return T(1);
  size_t N(1);
  for (auto a: axes)
    N *= fct * size_t(int64_t(shape[a])+delta);
  return norm_fct<T>(inorm, N);
  }

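// All *_internal functions below release the GIL while the actual transform
// runs, so transforms can proceed in parallel with other Python threads.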
template<typename T> py::array c2c_internal(const py::array &in,
  const py::object &axes_, bool forward, int inorm, py::object &out_,
  size_t nthreads)
  {
  auto axes = makeaxes(in, axes_);
  auto ain = to_cfmav<std::complex<T>>(in);
  auto out = get_optional_Pyarr<std::complex<T>>(out_, ain.shape());
  auto aout = to_vfmav<std::complex<T>>(out);
  {
  py::gil_scoped_release release;
  T fct = norm_fct<T>(inorm, ain.shape(), axes);
  ducc0::c2c(ain, aout, axes, forward, fct, nthreads);
  }
  return std::move(out);
  }

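// c2c for strictly real input: run an r2c transform over the requested axes
// and then reconstruct the redundant half of the complex output from
// Hermitian symmetry, which is cheaper than a full complex transform.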
template<typename T> py::array c2c_sym_internal(const py::array &in,
  const py::object &axes_, bool forward, int inorm, py::object &out_,
  size_t nthreads)
  {
  auto axes = makeaxes(in, axes_);
  auto ain = to_cfmav<T>(in);
  auto out = get_optional_Pyarr<std::complex<T>>(out_, ain.shape());
  auto aout = to_vfmav<std::complex<T>>(out);
  {
  py::gil_scoped_release release;
  T fct = norm_fct<T>(inorm, ain.shape(), axes);
  ducc0::r2c(ain, aout, axes, forward, fct, nthreads);
  // now fill in second half
  using namespace ducc0::detail_fft;
  hermiteHelper(0, 0, 0, 0, aout, aout, axes, [](const std::complex<T> &c, std::complex<T> &, std::complex<T> &c1)
    {
    c1 = conj(c);
    }, nthreads);
  }
  return std::move(out);
  }

py::array c2c(const py::array &a, const py::object &axes_, bool forward,
  int inorm, py::object &out_, size_t nthreads)
  {
  if (a.dtype().kind() == 'c')
    DISPATCH(a, c128, c64, clong, c2c_internal, (a, axes_, forward,
             inorm, out_, nthreads))

  DISPATCH(a, f64, f32, flong, c2c_sym_internal, (a, axes_, forward,
           inorm, out_, nthreads))
  }

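// Real-to-complex transform: the output has the same shape as the input,
// except that the last transformed axis shrinks from n to n/2+1 elements.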
template<typename T> py::array r2c_internal(const py::array &in,
  const py::object &axes_, bool forward, int inorm, py::object &out_,
  size_t nthreads)
  {
  auto axes = makeaxes(in, axes_);
  auto ain = to_cfmav<T>(in);
  auto dims_out(ain.shape());
  dims_out[axes.back()] = (dims_out[axes.back()]>>1)+1;
  auto out = get_optional_Pyarr<std::complex<T>>(out_, dims_out);
  auto aout = to_vfmav<std::complex<T>>(out);
  {
  py::gil_scoped_release release;
  T fct = norm_fct<T>(inorm, ain.shape(), axes);
  ducc0::r2c(ain, aout, axes, forward, fct, nthreads);
  }
  return std::move(out);
  }

py::array r2c(const py::array &in, const py::object &axes_, bool forward,
  int inorm, py::object &out_, size_t nthreads)
  {
  DISPATCH(in, f64, f32, flong, r2c_internal, (in, axes_, forward, inorm, out_,
    nthreads))
  }

template<typename T> py::array r2r_fftpack_internal(const py::array &in,
  const py::object &axes_, bool real2hermitian, bool forward, int inorm,
  py::object &out_, size_t nthreads)
  {
  auto axes = makeaxes(in, axes_);
  auto ain = to_cfmav<T>(in);
  auto out = get_optional_Pyarr<T>(out_, ain.shape());
  auto aout = to_vfmav<T>(out);
  {
  py::gil_scoped_release release;
  T fct = norm_fct<T>(inorm, ain.shape(), axes);
  ducc0::r2r_fftpack(ain, aout, axes, real2hermitian, forward, fct, nthreads);
  }
  return std::move(out);
  }

py::array r2r_fftpack(const py::array &in, const py::object &axes_,
  bool real2hermitian, bool forward, int inorm, py::object &out_,
  size_t nthreads)
  {
  DISPATCH(in, f64, f32, flong, r2r_fftpack_internal, (in, axes_,
    real2hermitian, forward, inorm, out_, nthreads))
  }

template<typename T> py::array r2r_fftw_internal(const py::array &in,
  const py::object &axes_, bool forward, int inorm,
  py::object &out_, size_t nthreads)
  {
  auto axes = makeaxes(in, axes_);
  auto ain = to_cfmav<T>(in);
  auto out = get_optional_Pyarr<T>(out_, ain.shape());
  auto aout = to_vfmav<T>(out);
  {
  py::gil_scoped_release release;
  T fct = norm_fct<T>(inorm, ain.shape(), axes);
  ducc0::r2r_fftw(ain, aout, axes, forward, fct, nthreads);
  }
  return std::move(out);
  }

py::array r2r_fftw(const py::array &in, const py::object &axes_,
  bool forward, int inorm, py::object &out_, size_t nthreads)
  {
  DISPATCH(in, f64, f32, flong, r2r_fftw_internal, (in, axes_,
    forward, inorm, out_, nthreads))
  }

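// Discrete cosine transform. The logical transform length of an axis with
// n elements is 2*(n-1) for DCT type 1 and 2*n for types 2-4, which is
// reflected in the normalization factor; inorm==1 additionally selects the
// orthonormal variant of the transform.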
template<typename T> py::array dct_internal(const py::array &in,
  const py::object &axes_, int type, int inorm, py::object &out_,
  size_t nthreads)
  {
  auto axes = makeaxes(in, axes_);
  auto ain = to_cfmav<T>(in);
  auto out = get_optional_Pyarr<T>(out_, ain.shape());
  auto aout = to_vfmav<T>(out);
  {
  py::gil_scoped_release release;
  T fct = (type==1) ? norm_fct<T>(inorm, ain.shape(), axes, 2, -1)
                    : norm_fct<T>(inorm, ain.shape(), axes, 2);
  bool ortho = inorm == 1;
  ducc0::dct(ain, aout, axes, type, fct, ortho, nthreads);
  }
  return std::move(out);
  }

py::array dct(const py::array &in, int type, const py::object &axes_,
  int inorm, py::object &out_, size_t nthreads)
  {
  if ((type<1) || (type>4)) throw std::invalid_argument("invalid DCT type");
  DISPATCH(in, f64, f32, flong, dct_internal, (in, axes_, type, inorm, out_,
    nthreads))
  }

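// Discrete sine transform, analogous to dct_internal above; the logical
// transform length for DST type 1 is 2*(n+1), and 2*n for types 2-4.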
template<typename T> py::array dst_internal(const py::array &in,
  const py::object &axes_, int type, int inorm, py::object &out_,
  size_t nthreads)
  {
  auto axes = makeaxes(in, axes_);
  auto ain = to_cfmav<T>(in);
  auto out = get_optional_Pyarr<T>(out_, ain.shape());
  auto aout = to_vfmav<T>(out);
  {
  py::gil_scoped_release release;
  T fct = (type==1) ? norm_fct<T>(inorm, ain.shape(), axes, 2, 1)
                    : norm_fct<T>(inorm, ain.shape(), axes, 2);
  bool ortho = inorm == 1;
  ducc0::dst(ain, aout, axes, type, fct, ortho, nthreads);
  }
  return std::move(out);
  }

py::array dst(const py::array &in, int type, const py::object &axes_,
  int inorm, py::object &out_, size_t nthreads)
  {
  if ((type<1) || (type>4)) throw std::invalid_argument("invalid DST type");
  DISPATCH(in, f64, f32, flong, dst_internal, (in, axes_, type, inorm,
    out_, nthreads))
  }

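// Complex-to-real transform. "lastsize" is the desired length of the last
// transformed output axis; 0 selects the default 2*n-1, and any value must
// satisfy lastsize/2+1 == n, where n is the corresponding input axis length.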
template<typename T> py::array c2r_internal(const py::array &in,
  const py::object &axes_, size_t lastsize, bool forward, int inorm,
  py::object &out_, size_t nthreads)
  {
  auto axes = makeaxes(in, axes_);
  size_t axis = axes.back();
  auto ain = to_cfmav<std::complex<T>>(in);
  shape_t dims_out(ain.shape());
  if (lastsize==0) lastsize=2*ain.shape(axis)-1;
  if ((lastsize/2) + 1 != ain.shape(axis))
    throw std::invalid_argument("bad lastsize");
  dims_out[axis] = lastsize;
  auto out = get_optional_Pyarr<T>(out_, dims_out);
  auto aout = to_vfmav<T>(out);
  {
  py::gil_scoped_release release;
  T fct = norm_fct<T>(inorm, aout.shape(), axes);
  ducc0::c2r(ain, aout, axes, forward, fct, nthreads);
  }
  return std::move(out);
  }

py::array c2r(const py::array &in, const py::object &axes_, size_t lastsize,
  bool forward, int inorm, py::object &out_, size_t nthreads)
  {
  DISPATCH(in, c128, c64, clong, c2r_internal, (in, axes_, lastsize, forward,
    inorm, out_, nthreads))
  }

template<typename T> py::array separable_hartley_internal(const py::array &in,
  const py::object &axes_, int inorm, py::object &out_, size_t nthreads)
  {
  auto axes = makeaxes(in, axes_);
  auto ain = to_cfmav<T>(in);
  auto out = get_optional_Pyarr<T>(out_, ain.shape());
  auto aout = to_vfmav<T>(out);
  {
  py::gil_scoped_release release;
  T fct = norm_fct<T>(inorm, ain.shape(), axes);
  ducc0::r2r_separable_hartley(ain, aout, axes, fct, nthreads);
  }
  return std::move(out);
  }

py::array separable_hartley(const py::array &in, const py::object &axes_,
  int inorm, py::object &out_, size_t nthreads)
  {
  DISPATCH(in, f64, f32, flong, separable_hartley_internal, (in, axes_, inorm,
    out_, nthreads))
  }

template<typename T> py::array genuine_hartley_internal(const py::array &in,
  const py::object &axes_, int inorm, py::object &out_, size_t nthreads)
  {
  auto axes = makeaxes(in, axes_);
  auto ain = to_cfmav<T>(in);
  auto out = get_optional_Pyarr<T>(out_, ain.shape());
  auto aout = to_vfmav<T>(out);
  {
  py::gil_scoped_release release;
  T fct = norm_fct<T>(inorm, ain.shape(), axes);
  ducc0::r2r_genuine_hartley(ain, aout, axes, fct, nthreads);
  }
  return std::move(out);
  }

py::array genuine_hartley(const py::array &in, const py::object &axes_,
  int inorm, py::object &out_, size_t nthreads)
  {
  DISPATCH(in, f64, f32, flong, genuine_hartley_internal, (in, axes_, inorm,
    out_, nthreads))
  }

// Export good_size in raw C-API to reduce overhead (~4x faster)
PyObject * good_size(PyObject * /*self*/, PyObject * args)
  {
  Py_ssize_t n_ = -1;
  int real = false;
  if (!PyArg_ParseTuple(args, "n|p:good_size", &n_, &real))
    return nullptr;

  if (n_<0)
    {
    PyErr_SetString(PyExc_ValueError, "Target length must be positive");
    return nullptr;
    }
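  // Reject lengths so large that the padded size might not fit into size_t.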
  if ((n_-1) > static_cast<Py_ssize_t>(std::numeric_limits<size_t>::max() / 11))
    {
    PyErr_Format(PyExc_ValueError,
                 "Target length is too large to perform an FFT: %zi", n_);
    return nullptr;
    }
  const auto n = static_cast<size_t>(n_);
  using namespace ducc0::detail_fft;
  return PyLong_FromSize_t(
    real ? util1d::good_size_real(n) : util1d::good_size_cmplx(n));
  }

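// Circular convolution of "in_" with a 1D "kernel" along "axis", writing the
// result into "out_"; if the output axis length differs from the input one,
// the result is resampled to that length (see convolve_axis_DS below).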
template<typename T> py::array convolve_axis_internal(const py::array &in_,
  py::array &out_, size_t axis, const py::array &kernel_, size_t nthreads)
  {
  auto in = to_cfmav<T>(in_);
  auto out = to_vfmav<T>(out_);
  auto kernel = to_cmav<T,1>(kernel_);
  {
  py::gil_scoped_release release;
  ducc0::convolve_axis(in, out, axis, kernel, nthreads);
  }
  return std::move(out_);
  }

template<typename T> py::array convolve_axis_internal_c(const py::array &in_,
  py::array &out_, size_t axis, const py::array &kernel_, size_t nthreads)
  {
  return convolve_axis_internal<std::complex<T>>(in_, out_, axis, kernel_, nthreads);
  }

py::array convolve_axis(const py::array &in, py::array &out, size_t axis,
  const py::array &kernel, size_t nthreads)
  {
  if (in.dtype().kind() == 'c')
    DISPATCH(in, c128, c64, clong, convolve_axis_internal_c, (in, out, axis,
      kernel, nthreads))
  else
    DISPATCH(in, f64, f32, flong, convolve_axis_internal, (in, out, axis,
      kernel, nthreads))
  }

const char *fft_DS = R"""(Fast Fourier, sine/cosine, and Hartley transforms.

This module supports
 - single, double, and long double precision
 - complex and real-valued transforms
 - multi-dimensional transforms

For two- and higher-dimensional transforms the code will use SSE2 and AVX
vector instructions for faster execution if these are supported by the CPU and
were enabled during compilation.
)""";

const char *c2c_DS = R"""(Performs a complex FFT.

Parameters
----------
a : numpy.ndarray (any complex or real type)
    The input data. If its type is real, a more efficient real-to-complex
    transform will be used.
axes : list of integers
    The axes along which the FFT is carried out.
    If not set, all axes will be transformed.
forward : bool
    If `True`, a negative sign is used in the exponent, else a positive one.
inorm : int
    Normalization type
      | 0 : no normalization
      | 1 : divide by sqrt(N)
      | 2 : divide by N

    where N is the product of the lengths of the transformed axes.
out : numpy.ndarray (same shape as `a`, complex type with same accuracy as `a`)
    May be identical to `a`, but if it isn't, it must not overlap with `a`.
    If None, a new array is allocated to store the output.
nthreads : int
    Number of threads to use. If 0, use the system default (typically the number
    of hardware threads on the compute node).

Returns
-------
numpy.ndarray (same shape as `a`, complex type with same accuracy as `a`)
    The transformed data.
)""";

const char *r2c_DS = R"""(Performs an FFT whose input is strictly real.

Parameters
----------
a : numpy.ndarray (any real type)
    The input data
axes : list of integers
    The axes along which the FFT is carried out.
    If not set, all axes will be transformed in ascending order.
forward : bool
    If `True`, a negative sign is used in the exponent, else a positive one.
inorm : int
    Normalization type
      | 0 : no normalization
      | 1 : divide by sqrt(N)
      | 2 : divide by N

    where N is the product of the lengths of the transformed input axes.
out : numpy.ndarray (complex type with same accuracy as `a`)
    For the required shape, see the `Returns` section.
    Must not overlap with `a`.
    If None, a new array is allocated to store the output.
nthreads : int
    Number of threads to use. If 0, use the system default (typically the number
    of hardware threads on the compute node).

Returns
-------
numpy.ndarray (complex type with same accuracy as `a`)
    The transformed data. The shape is identical to that of the input array,
    except for the axis that was transformed last. If the length of that axis
    was n on input, it is n//2+1 on output.
)""";

const char *c2r_DS = R"""(Performs an FFT whose output is strictly real.

Parameters
----------
a : numpy.ndarray (any complex type)
    The input data
axes : list of integers
    The axes along which the FFT is carried out.
    If not set, all axes will be transformed in ascending order.
lastsize : int
    The output size of the last axis to be transformed.
    If the corresponding input axis has size n, this can be 2*n-2 or 2*n-1.
forward : bool
    If `True`, a negative sign is used in the exponent, else a positive one.
inorm : int
    Normalization type
      | 0 : no normalization
      | 1 : divide by sqrt(N)
      | 2 : divide by N

    where N is the product of the lengths of the transformed output axes.
out : numpy.ndarray (real type with same accuracy as `a`)
    For the required shape, see the `Returns` section.
    Must not overlap with `a`.
    If None, a new array is allocated to store the output.
nthreads : int
    Number of threads to use. If 0, use the system default (typically the number
    of hardware threads on the compute node).

Returns
-------
numpy.ndarray (real type with same accuracy as `a`)
    The transformed data. The shape is identical to that of the input array,
    except for the axis that was transformed last, which now has `lastsize`
    entries.
)""";

const char *r2r_fftpack_DS = R"""(Performs a real-valued FFT using FFTPACK's halfcomplex storage scheme.

Parameters
----------
a : numpy.ndarray (any real type)
    The input data
axes : list of integers
    The axes along which the FFT is carried out.
    If not set, all axes will be transformed.
real2hermitian : bool
    if True, the input is purely real and the output will have Hermitian
    symmetry and be stored in FFTPACK's halfcomplex ordering, otherwise the
    opposite.
forward : bool
    If `True`, a negative sign is used in the exponent, else a positive one.
inorm : int
    Normalization type
      | 0 : no normalization
      | 1 : divide by sqrt(N)
      | 2 : divide by N

    where N is the product of the lengths of the transformed axes.
out : numpy.ndarray (same shape and data type as `a`)
    May be identical to `a`, but if it isn't, it must not overlap with `a`.
    If None, a new array is allocated to store the output.
nthreads : int
    Number of threads to use. If 0, use the system default (typically the number
    of hardware threads on the compute node).

Returns
-------
numpy.ndarray (same shape and data type as `a`)
    The transformed data. The shape is identical to that of the input array.
)""";

const char *r2r_fftw_DS = R"""(Performs a real-valued FFT using FFTW's halfcomplex storage scheme.

Parameters
----------
a : numpy.ndarray (any real type)
    The input data
axes : list of integers
    The axes along which the FFT is carried out.
    If not set, all axes will be transformed.
forward : bool
    If `True`, a negative sign is used in the exponent, else a positive one.
inorm : int
    Normalization type
      | 0 : no normalization
      | 1 : divide by sqrt(N)
      | 2 : divide by N

    where N is the product of the lengths of the transformed axes.
out : numpy.ndarray (same shape and data type as `a`)
    May be identical to `a`, but if it isn't, it must not overlap with `a`.
    If None, a new array is allocated to store the output.
nthreads : int
    Number of threads to use. If 0, use the system default (typically the number
    of hardware threads on the compute node).

Returns
-------
numpy.ndarray (same shape and data type as `a`)
    The transformed data. The shape is identical to that of the input array.
)""";

const char *separable_hartley_DS = R"""(Performs a separable Hartley transform.
For every requested axis, a 1D forward Fourier transform is carried out, and
the real and imaginary parts of the result are added before the next axis is
processed.

Parameters
----------
a : numpy.ndarray (any real type)
    The input data
axes : list of integers
    The axes along which the transform is carried out.
    If not set, all axes will be transformed.
inorm : int
    Normalization type
      | 0 : no normalization
      | 1 : divide by sqrt(N)
      | 2 : divide by N

    where N is the product of the lengths of the transformed axes.
out : numpy.ndarray (same shape and data type as `a`)
    May be identical to `a`, but if it isn't, it must not overlap with `a`.
    If None, a new array is allocated to store the output.
nthreads : int
    Number of threads to use. If 0, use the system default (typically the number
    of hardware threads on the compute node).

Returns
-------
numpy.ndarray (same shape and data type as `a`)
    The transformed data
)""";

const char *genuine_hartley_DS = R"""(Performs a full Hartley transform.
A full Fourier transform is carried out over the requested axes, and the
sum of real and imaginary parts of the result is stored in the output
array. For a single transformed axis, this is identical to `separable_hartley`,
but when transforming multiple axes, the results are different.

Parameters
----------
a : numpy.ndarray (any real type)
    The input data
axes : list of integers
    The axes along which the transform is carried out.
    If not set, all axes will be transformed.
inorm : int
    Normalization type
      | 0 : no normalization
      | 1 : divide by sqrt(N)
      | 2 : divide by N

    where N is the product of the lengths of the transformed axes.
out : numpy.ndarray (same shape and data type as `a`)
    May be identical to `a`, but if it isn't, it must not overlap with `a`.
    If None, a new array is allocated to store the output.
nthreads : int
    Number of threads to use. If 0, use the system default (typically the number
    of hardware threads on the compute node).

Returns
-------
numpy.ndarray (same shape and data type as `a`)
    The transformed data
)""";

const char *dct_DS = R"""(Performs a discrete cosine transform.

Parameters
----------
a : numpy.ndarray (any real type)
    The input data
type : integer
    the type of DCT. Must be in [1; 4].
axes : list of integers
    The axes along which the transform is carried out.
    If not set, all axes will be transformed.
inorm : integer
    the normalization type
      | 0 : no normalization
      | 1 : make transform orthogonal and divide by sqrt(N)
      | 2 : divide by N

    where N is the product of n_i for every transformed axis i.
    n_i is 2*(<axis_length>-1) for type 1 and 2*<axis_length>
    for types 2, 3, 4.
    Making the transform orthogonal involves the following additional steps
    for every 1D sub-transform:

    Type 1
      multiply first and last input value by sqrt(2);
      divide first and last output value by sqrt(2)
    Type 2
      divide first output value by sqrt(2)
    Type 3
      multiply first input value by sqrt(2)
    Type 4
      nothing

out : numpy.ndarray (same shape and data type as `a`)
    May be identical to `a`, but if it isn't, it must not overlap with `a`.
    If None, a new array is allocated to store the output.
nthreads : int
    Number of threads to use. If 0, use the system default (typically the number
    of hardware threads on the compute node).

Returns
-------
numpy.ndarray (same shape and data type as `a`)
    The transformed data
)""";

const char *dst_DS = R"""(Performs a discrete sine transform.

Parameters
----------
a : numpy.ndarray (any real type)
    The input data
type : integer
    the type of DST. Must be in [1; 4].
axes : list of integers
    The axes along which the transform is carried out.
    If not set, all axes will be transformed.
inorm : int
    Normalization type
      | 0 : no normalization
      | 1 : make transform orthogonal and divide by sqrt(N)
      | 2 : divide by N

    where N is the product of n_i for every transformed axis i.
    n_i is 2*(<axis_length>+1) for type 1 and 2*<axis_length>
    for types 2, 3, 4.
    Making the transform orthogonal involves the following additional steps
    for every 1D sub-transform:

    Type 1
      nothing
    Type 2
      divide first output value by sqrt(2)
    Type 3
      multiply first input value by sqrt(2)
    Type 4
      nothing

out : numpy.ndarray (same shape and data type as `a`)
    May be identical to `a`, but if it isn't, it must not overlap with `a`.
    If None, a new array is allocated to store the output.
nthreads : int
    Number of threads to use. If 0, use the system default (typically the number
    of hardware threads on the compute node).

Returns
-------
numpy.ndarray (same shape and data type as `a`)
    The transformed data
)""";

const char *convolve_axis_DS = R"""(Performs a circular convolution along one axis.

The result is equivalent to

.. code-block:: Python

    import scipy.ndimage
    import scipy.signal
    import scipy.fft
    kernel = scipy.fft.fftshift(kernel)
    tmp = scipy.ndimage.convolve1d(in, kernel, axis, mode='wrap')
    out[()] = scipy.signal.resample(tmp, out.shape[axis], axis=axis)
    return out

Parameters
----------
in : numpy.ndarray (any real or complex type)
    The input data
out : numpy.ndarray (same type as `in`)
    The output data. Must have the same shape as `in` except for the axis
    to be convolved
axis : integer
    The axis along which the convolution is carried out.
kernel : one-dimensional numpy.ndarray (same type as `in`)
    The kernel to be used for convolution.
    The length of this array must be equal to in.shape[axis]
nthreads : int
    Number of threads to use. If 0, use the system default (typically the number
    of hardware threads on the compute node).

Returns
-------
numpy.ndarray (identical to `out`)
    The convolved input

Notes
-----
The main purpose of this routine is efficiency: the combination of the above
operations can be carried out more quickly than running the individual
operations in succession.

If `in.shape[axis]!=out.shape[axis]`, the appropriate amount of zero-padding or
truncation will be carried out after the convolution step.

`in` and `out` may overlap in memory. If they do, their first elements must
be at the same memory location, and all their strides must be equal.
)""";

const char *good_size_DS = R"""(Returns a good length to pad an FFT to.

Parameters
----------
n : int
    Minimum transform length
real : bool, optional
    True if either input or output of FFT should be fully real.

Returns
-------
out : int
    The smallest fast size >= n

)""";

} // unnamed namespace

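// Create the "fft" submodule on top of "msup" and register all
// Python-visible functions with their docstrings and default arguments.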
void add_fft(py::module_ &msup)
  {
  using namespace pybind11::literals;
  auto m = msup.def_submodule("fft");
  m.doc() = fft_DS;
  m.def("c2c", c2c, c2c_DS, "a"_a, "axes"_a=None, "forward"_a=true,
    "inorm"_a=0, "out"_a=None, "nthreads"_a=1);
  m.def("r2c", r2c, r2c_DS, "a"_a, "axes"_a=None, "forward"_a=true,
    "inorm"_a=0, "out"_a=None, "nthreads"_a=1);
  m.def("c2r", c2r, c2r_DS, "a"_a, "axes"_a=None, "lastsize"_a=0,
    "forward"_a=true, "inorm"_a=0, "out"_a=None, "nthreads"_a=1);
  m.def("r2r_fftpack", r2r_fftpack, r2r_fftpack_DS, "a"_a, "axes"_a,
    "real2hermitian"_a, "forward"_a, "inorm"_a=0, "out"_a=None, "nthreads"_a=1);
  m.def("r2r_fftw", r2r_fftw, r2r_fftw_DS, "a"_a, "axes"_a,
    "forward"_a, "inorm"_a=0, "out"_a=None, "nthreads"_a=1);
  m.def("separable_hartley", separable_hartley, separable_hartley_DS, "a"_a,
    "axes"_a=None, "inorm"_a=0, "out"_a=None, "nthreads"_a=1);
  m.def("genuine_hartley", genuine_hartley, genuine_hartley_DS, "a"_a,
    "axes"_a=None, "inorm"_a=0, "out"_a=None, "nthreads"_a=1);
  m.def("dct", dct, dct_DS, "a"_a, "type"_a, "axes"_a=None, "inorm"_a=0,
    "out"_a=None, "nthreads"_a=1);
  m.def("dst", dst, dst_DS, "a"_a, "type"_a, "axes"_a=None, "inorm"_a=0,
    "out"_a=None, "nthreads"_a=1);
  m.def("convolve_axis", convolve_axis, convolve_axis_DS, "in"_a, "out"_a,
    "axis"_a, "kernel"_a, "nthreads"_a=1);

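  // good_size is registered through the raw CPython API (see the comment
  // above its definition), bypassing pybind11's argument-handling overhead.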
  static PyMethodDef good_size_meth[] =
    {{"good_size", good_size, METH_VARARGS, good_size_DS}, {0, 0, 0, 0}};
  PyModule_AddFunctions(m.ptr(), good_size_meth);
  }

}

using detail_pymodule_fft::add_fft;

}