1 #ifndef VEXCL_FFT_PLAN_HPP
2 #define VEXCL_FFT_PLAN_HPP
3 
4 /*
5 The MIT License
6 
7 Copyright (c) 2012-2018 Denis Demidov <dennis.demidov@gmail.com>
8 
9 Permission is hereby granted, free of charge, to any person obtaining a copy
10 of this software and associated documentation files (the "Software"), to deal
11 in the Software without restriction, including without limitation the rights
12 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13 copies of the Software, and to permit persons to whom the Software is
14 furnished to do so, subject to the following conditions:
15 
16 The above copyright notice and this permission notice shall be included in
17 all copies or substantial portions of the Software.
18 
19 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
22 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25 THE SOFTWARE.
26 */
27 
28 /**
29  * \file   vexcl/fft/plan.hpp
30  * \author Pascal Germroth <pascal@ensieve.org>
31  * \brief  FFT plan, stores kernels and buffers for one configuration.
32  */
33 
34 #include <cmath>
35 #include <queue>
36 #include <numeric>
37 
38 #include <vexcl/profiler.hpp>
39 #include <vexcl/vector.hpp>
40 #include <vexcl/function.hpp>
41 #include <vexcl/fft/unrolled_dft.hpp>
42 #include <vexcl/fft/kernels.hpp>
43 
44 namespace vex {
45 
46 /// Fast Fourier Transform
47 namespace fft {
48 
49 /// FFT direction.
50 enum direction {
51     forward, ///< Forward transform.
52     inverse, ///< Inverse transform.
53     none     ///< Specifies dimension(s) to do batch transform.
54 };
55 
56 // Returns successive prime numbers on each call.
57 struct prime_generator {
58     typedef std::pair<size_t, size_t> P;
59 
60     std::priority_queue<P, std::vector<P>, std::greater<P>> cross;
61     size_t x;
prime_generatorvex::fft::prime_generator62     prime_generator() : x(2) {}
63 
operator ()vex::fft::prime_generator64     size_t operator()() {
65         for(;; x++) {
66             if(cross.empty() || cross.top().first != x) { // is a prime.
67                 cross.push(P(x * x, x));
68                 return x++;
69             } else { // has prime factor. cross out.
70                 while(cross.top().first == x) {
71                     size_t prime = cross.top().second;
72                     cross.pop();
73                     cross.push(P(x + prime, prime));
74                 }
75             }
76         }
77     }
78 };
79 
80 // Returns the prime factors of a number.
prime_factors(size_t n)81 inline std::vector<pow> prime_factors(size_t n) {
82     std::vector<pow> fs;
83     if(n != 0) {
84         prime_generator next;
85         while(n != 1) {
86             const auto prime = next();
87             size_t exp = 0;
88             while(n % prime == 0) {
89                 n /= prime;
90                 exp++;
91             }
92             if(exp != 0)
93                 fs.push_back(pow(prime, exp));
94         }
95     }
96     return fs;
97 }
98 
99 // for arbitrary x, return smallest y=a^b c^d e^f...>=x where a,c,e are iterated over (assumed to be prime)
100 template<class Iterator>
next_prime_power(Iterator begin,Iterator end,size_t target,size_t n=1,size_t best=-1)101 inline size_t next_prime_power(Iterator begin, Iterator end, size_t target, size_t n = 1, size_t best = -1) {
102     const size_t prime = *begin++;
103     for(; n < target ; n *= prime) {
104         if(begin != end)
105             best = next_prime_power(begin, end, target, n, best);
106     }
107     return std::min(n, best);
108 }
109 
110 
111 
112 struct planner {
113     const size_t max_size;
114     std::vector<size_t> primes;
115 
plannervex::fft::planner116     planner(size_t s = 25) : max_size(std::min(s, supported_kernel_sizes().back())) {
117         auto ps = supported_primes();
118         for(auto i = ps.begin() ; i != ps.end() ; i++)
119             if(*i <= s) primes.push_back(*i);
120     }
121 
122     // returns the size the data must be padded to.
best_sizevex::fft::planner123     size_t best_size(size_t n) const {
124         return next_prime_power(primes.begin(), primes.end(), n);
125     }
126 
127     // splits n into a list of powers 2^a 2^b 2^c 3^d 5^e...
128     // exponents are limited by available kernels,
129     // if no kernel for prime is available, exponent will be 0, interpret as 1.
factorvex::fft::planner130     std::vector<pow> factor(size_t n) const {
131         std::vector<pow> out, factors = prime_factors(n);
132         for(auto f = factors.begin() ; f != factors.end() ; f++) {
133             if(std::find(primes.begin(), primes.end(), f->base) != primes.end()) {
134                 // split exponent into reasonable parts.
135                 auto qs = stages(*f);
136                 // use smallest radix first
137                 std::copy(qs.rbegin(), qs.rend(), std::back_inserter(out));
138             } else {
139                 // unsupported prime.
140                 for(size_t i = 0 ; i != f->exponent ; i++)
141                     out.push_back(pow(f->base, 0));
142             }
143         }
144         return out;
145     }
146 
147   private:
stagesvex::fft::planner148     std::vector<pow> stages(pow p) const {
149         size_t t = static_cast<size_t>(std::log(max_size + 1.0) / std::log(static_cast<double>(p.base)));
150         std::vector<pow> fs;
151 #ifdef FFT_SIMPLE_PLANNER
152         // use largest radixes, i.e. 2^4 2^4 2^1
153         for(size_t e = p.exponent ; ; ) {
154             if(e > t) {
155                 fs.push_back(pow(p.base, t));
156                 e -= t;
157             } else if(e <= t) {
158                 fs.push_back(pow(p.base, e));
159                 break;
160             }
161         }
162 #else
163         // avoid very small radixes, i.e. 2^3 2^3 2^3
164         // number of parts
165         size_t m = (p.exponent + t - 1) / t;
166         // two levels.
167         size_t r = t * m - p.exponent;
168         size_t u = m * (r / m);
169         size_t v = t - (r / m);
170         for(size_t i = 0 ; i < m - r + u ; i++)
171             fs.push_back(pow(p.base, v));
172         for(size_t i = 0 ; i < r - u ; i++)
173             fs.push_back(pow(p.base, v - 1));
174 #endif
175         return fs;
176     }
177 };
178 
179 
180 template <class Tv, class Planner = planner>
181 struct plan {
182     typedef typename cl_scalar_of<Tv>::type Ts;
183     static_assert(
184             std::is_same<Ts, cl_float >::value ||
185             std::is_same<Ts, cl_double>::value,
186             "Only float and double data supported."
187             );
188 
189     typedef typename cl_vector_of<Ts, 2>::type T2;
190 
191     VEX_FUNCTION_S(T2, r2c, (Ts, v), type_name<T2>() + " r = {v, 0}; return r;");
192     VEX_FUNCTION_S(Ts, c2r, (T2, v), "return v.x;");
193     VEX_FUNCTION_S(T2, scl, (T2, v)(Ts, s), "v.x *= s; v.y *= s; return v;");
194 
195     const std::vector<backend::command_queue> &queues;
196     Planner planner;
197     Ts scale;
198     const std::vector<size_t> sizes;
199 
200     std::vector<kernel_call> kernels;
201 
202     size_t input, output;
203     std::vector< vex::vector<T2> > bufs;
204 
205     profiler<> *profile;
206 
207     // \param sizes
208     //  1D case: {n}.
209     //  2D case: {h, w} in row-major format: x + y * w. (like FFTw)
210     //  etc.
planvex::fft::plan211     plan(const std::vector<backend::command_queue> &_queues, const std::vector<size_t> sizes,
212         const std::vector<direction> dirs, const Planner &planner = Planner())
213         : queues(_queues), planner(planner), sizes(sizes), profile(NULL)
214     {
215         assert(sizes.size() >= 1);
216         assert(sizes.size() == dirs.size());
217 
218         precondition(
219                 queues.size() == 1,
220                 "FFT is only supported for single-device contexts."
221                 );
222 
223         auto queue   = queues[0];
224 
225         size_t total_n = std::accumulate(sizes.begin(), sizes.end(),
226             static_cast<size_t>(1), std::multiplies<size_t>());
227         size_t current = bufs.size(); bufs.push_back(vex::vector<T2>(queues, total_n));
228         size_t other   = bufs.size(); bufs.push_back(vex::vector<T2>(queues, total_n));
229 
230         size_t inv_n = 1;
231         for(size_t i = 0 ; i < sizes.size() ; i++)
232             if(dirs[i] == inverse)
233                 inv_n *= sizes[i];
234         scale = (Ts)1 / inv_n;
235 
236         // Build the list of kernels.
237         input = current;
238         for(size_t i = 1 ; i <= sizes.size() ; i++) {
239             const size_t j = sizes.size() - i;
240             const size_t w = sizes[j], h = total_n / w;
241             if(w > 1) {
242                 // 1D, each row.
243                 if(dirs[j] != none)
244                     plan_cooley_tukey(dirs[j] == inverse, w, h, current, other, false);
245 
246                 if(h > 1 && !(dirs.size() == 2 && dirs[0] == none)) {
247                     kernels.push_back(transpose_kernel<Ts>(queue, w, h, bufs[current](), bufs[other]()));
248                     std::swap(current, other);
249                 }
250             }
251         }
252         output = current;
253     }
254 
plan_cooley_tukeyvex::fft::plan255     void plan_cooley_tukey(bool inverse, size_t n, size_t batch, size_t &current, size_t &other, bool once) {
256         size_t p = 1;
257         auto rs = planner.factor(n);
258         for(auto r = rs.begin() ; r != rs.end() ; r++) {
259             if(r->exponent == 0) {
260                 plan_bluestein(n, batch, inverse, r->base, p, current, other);
261                 p *= r->base;
262             } else {
263                 kernels.push_back(radix_kernel<Ts>(once, queues[0], n, batch,
264                     inverse, *r, p, bufs[current](), bufs[other]()));
265                 std::swap(current, other);
266                 p *= r->value;
267             }
268         }
269     }
270 
plan_bluesteinvex::fft::plan271     void plan_bluestein(size_t width, size_t batch, bool inverse, size_t n, size_t p, size_t &current, size_t &other) {
272         size_t conv_n = planner.best_size(2 * n);
273         size_t threads = width / n;
274 
275         size_t b_twiddle = bufs.size(); bufs.push_back(vex::vector<T2>(queues, n));
276         size_t b_other   = bufs.size(); bufs.push_back(vex::vector<T2>(queues, conv_n));
277         size_t b_current = bufs.size(); bufs.push_back(vex::vector<T2>(queues, conv_n));
278         size_t a_current = bufs.size(); bufs.push_back(vex::vector<T2>(queues, conv_n * batch * threads));
279         size_t a_other   = bufs.size(); bufs.push_back(vex::vector<T2>(queues, conv_n * batch * threads));
280 
281         // calculate twiddle factors
282         kernels.push_back(bluestein_twiddle<Ts>(queues[0], n, inverse,
283             bufs[b_twiddle]())); // once
284 
285         // first part of the convolution
286         kernels.push_back(bluestein_pad_kernel<Ts>(queues[0], n, conv_n,
287             bufs[b_twiddle](), bufs[b_current]())); // once
288 
289         plan_cooley_tukey(false, conv_n, 1, b_current, b_other, /*once*/true);
290 
291         // other part of convolution
292         kernels.push_back(bluestein_mul_in<Ts>(queues[0], inverse, batch, n, p, threads, conv_n,
293             bufs[current](), bufs[b_twiddle](), bufs[a_current]()));
294 
295         plan_cooley_tukey(false, conv_n, threads * batch, a_current, a_other, false);
296 
297         // calculate convolution
298         kernels.push_back(bluestein_mul<Ts>(queues[0], conv_n, threads * batch,
299             bufs[a_current](), bufs[b_current](), bufs[a_other]()));
300         std::swap(a_current, a_other);
301 
302         plan_cooley_tukey(true, conv_n, threads * batch, a_current, a_other, false);
303 
304         // twiddle again
305         kernels.push_back(bluestein_mul_out<Ts>(queues[0], batch, p, n, threads, conv_n,
306             bufs[a_current](), bufs[b_twiddle](), bufs[other]()));
307         std::swap(current, other);
308     }
309 
310     // Execute the complete transformation.
311     // Converts real-valued input and output, supports multiply-adding to output.
312     template<class Expr>
transformvex::fft::plan313     void transform(const Expr &in) {
314         if(profile) {
315             std::ostringstream prof_name;
316             prof_name << "fft(n={";
317             for(size_t i = 0 ; i < sizes.size() ; i++) {
318                 if(i != 0) prof_name << ", ";
319                 prof_name << sizes[i];
320             }
321             prof_name << "}, scale=" << scale << ")";
322             profile->tic_cl(prof_name.str());
323             profile->tic_cl("in");
324         }
325         vector<T2> &in_c = bufs[input];
326         if(cl_vector_length<Tv>::value == 1) in_c = r2c(in);
327         else in_c = in;
328         if(profile) profile->toc("in");
329         for(auto run = kernels.begin(); run != kernels.end(); ++run) {
330             if(!run->once || run->count == 0) {
331 #ifdef FFT_DUMP_ARRAYS
332                 for(auto b = bufs.begin(); b != bufs.end(); ++b)
333                     std::cerr << "   " << std::setprecision(2) << (*b)()() << " = " << (*b) << std::endl;
334                 std::cerr << "run " << run->desc << std::endl;
335 #endif
336                 if(profile) {
337                     std::ostringstream s;
338                     s << "(k" << (run - kernels.begin()) << ")";
339                     if(run->once) s << " ONCE";
340                     s << " " << run->desc;
341                     profile->tic_cl(s.str());
342                 }
343                 run->kernel(queues[0]);
344                 run->count++;
345                 if(profile) profile->toc("");
346             }
347         }
348 #ifdef FFT_DUMP_ARRAYS
349         for(auto b = bufs.begin(); b != bufs.end(); ++b)
350             std::cerr << "   " << (*b)()() << " = " << (*b) << std::endl;
351 #endif
352         if (profile) profile->toc("");
353     }
354 
355     template <typename Tout, class Expr>
applyvex::fft::plan356     auto apply(const Expr &expr) ->
357         typename std::enable_if<
358             cl_vector_length<Tout>::value == 1,
359             decltype( scale * c2r(bufs[output]) )
360         >::type
361     {
362         transform(expr);
363         return scale * c2r(bufs[output]);
364     }
365 
366     template <typename Tout, class Expr>
applyvex::fft::plan367     auto apply(const Expr &expr) ->
368         typename std::enable_if<
369             cl_vector_length<Tout>::value == 2,
370             decltype( scl(bufs[output], scale) )
371         >::type
372     {
373         transform(expr);
374         return scl(bufs[output], scale);
375     }
376 
descvex::fft::plan377     std::string desc() const {
378         std::ostringstream o;
379         o << "FFT(";
380         // sizes
381         for(auto n = sizes.begin() ; n != sizes.end() ; n++) {
382             if(n != sizes.begin()) o << " x ";
383             o << *n;
384             auto fs = prime_factors(*n);
385             if(fs.size() > 1) {
386                 o << '=';
387                 for(auto f = fs.begin() ; f != fs.end() ; f++) {
388                     if(f != fs.begin()) o << '*';
389                     o << *f;
390                 }
391             }
392         }
393         o << ")";
394         return o.str();
395     }
396 };
397 
398 
399 template <class T, class P>
operator <<(std::ostream & o,const plan<T,P> & p)400 inline std::ostream &operator<<(std::ostream &o, const plan<T,P> &p) {
401     o << p.desc() << "{\n";
402     for(auto k = p.kernels.begin() ; k != p.kernels.end() ; k++) {
403         o << "  ";
404         if(k->once) o << "once: ";
405         o << k->desc << "\n";
406     }
407     return o << "}";
408 }
409 
410 } // namespace fft
411 } // namespace vex
412 #endif
413