1 #ifndef VEXCL_FFT_PLAN_HPP
2 #define VEXCL_FFT_PLAN_HPP
3
4 /*
5 The MIT License
6
7 Copyright (c) 2012-2018 Denis Demidov <dennis.demidov@gmail.com>
8
9 Permission is hereby granted, free of charge, to any person obtaining a copy
10 of this software and associated documentation files (the "Software"), to deal
11 in the Software without restriction, including without limitation the rights
12 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13 copies of the Software, and to permit persons to whom the Software is
14 furnished to do so, subject to the following conditions:
15
16 The above copyright notice and this permission notice shall be included in
17 all copies or substantial portions of the Software.
18
19 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25 THE SOFTWARE.
26 */
27
28 /**
29 * \file vexcl/fft/plan.hpp
30 * \author Pascal Germroth <pascal@ensieve.org>
31 * \brief FFT plan, stores kernels and buffers for one configuration.
32 */
33
34 #include <cmath>
35 #include <queue>
36 #include <numeric>
37
38 #include <vexcl/profiler.hpp>
39 #include <vexcl/vector.hpp>
40 #include <vexcl/function.hpp>
41 #include <vexcl/fft/unrolled_dft.hpp>
42 #include <vexcl/fft/kernels.hpp>
43
44 namespace vex {
45
46 /// Fast Fourier Transform
47 namespace fft {
48
49 /// FFT direction.
50 enum direction {
51 forward, ///< Forward transform.
52 inverse, ///< Inverse transform.
53 none ///< Specifies dimension(s) to do batch transform.
54 };
55
56 // Returns successive prime numbers on each call.
57 struct prime_generator {
58 typedef std::pair<size_t, size_t> P;
59
60 std::priority_queue<P, std::vector<P>, std::greater<P>> cross;
61 size_t x;
prime_generatorvex::fft::prime_generator62 prime_generator() : x(2) {}
63
operator ()vex::fft::prime_generator64 size_t operator()() {
65 for(;; x++) {
66 if(cross.empty() || cross.top().first != x) { // is a prime.
67 cross.push(P(x * x, x));
68 return x++;
69 } else { // has prime factor. cross out.
70 while(cross.top().first == x) {
71 size_t prime = cross.top().second;
72 cross.pop();
73 cross.push(P(x + prime, prime));
74 }
75 }
76 }
77 }
78 };
79
80 // Returns the prime factors of a number.
prime_factors(size_t n)81 inline std::vector<pow> prime_factors(size_t n) {
82 std::vector<pow> fs;
83 if(n != 0) {
84 prime_generator next;
85 while(n != 1) {
86 const auto prime = next();
87 size_t exp = 0;
88 while(n % prime == 0) {
89 n /= prime;
90 exp++;
91 }
92 if(exp != 0)
93 fs.push_back(pow(prime, exp));
94 }
95 }
96 return fs;
97 }
98
99 // for arbitrary x, return smallest y=a^b c^d e^f...>=x where a,c,e are iterated over (assumed to be prime)
100 template<class Iterator>
next_prime_power(Iterator begin,Iterator end,size_t target,size_t n=1,size_t best=-1)101 inline size_t next_prime_power(Iterator begin, Iterator end, size_t target, size_t n = 1, size_t best = -1) {
102 const size_t prime = *begin++;
103 for(; n < target ; n *= prime) {
104 if(begin != end)
105 best = next_prime_power(begin, end, target, n, best);
106 }
107 return std::min(n, best);
108 }
109
110
111
112 struct planner {
113 const size_t max_size;
114 std::vector<size_t> primes;
115
plannervex::fft::planner116 planner(size_t s = 25) : max_size(std::min(s, supported_kernel_sizes().back())) {
117 auto ps = supported_primes();
118 for(auto i = ps.begin() ; i != ps.end() ; i++)
119 if(*i <= s) primes.push_back(*i);
120 }
121
122 // returns the size the data must be padded to.
best_sizevex::fft::planner123 size_t best_size(size_t n) const {
124 return next_prime_power(primes.begin(), primes.end(), n);
125 }
126
127 // splits n into a list of powers 2^a 2^b 2^c 3^d 5^e...
128 // exponents are limited by available kernels,
129 // if no kernel for prime is available, exponent will be 0, interpret as 1.
factorvex::fft::planner130 std::vector<pow> factor(size_t n) const {
131 std::vector<pow> out, factors = prime_factors(n);
132 for(auto f = factors.begin() ; f != factors.end() ; f++) {
133 if(std::find(primes.begin(), primes.end(), f->base) != primes.end()) {
134 // split exponent into reasonable parts.
135 auto qs = stages(*f);
136 // use smallest radix first
137 std::copy(qs.rbegin(), qs.rend(), std::back_inserter(out));
138 } else {
139 // unsupported prime.
140 for(size_t i = 0 ; i != f->exponent ; i++)
141 out.push_back(pow(f->base, 0));
142 }
143 }
144 return out;
145 }
146
147 private:
stagesvex::fft::planner148 std::vector<pow> stages(pow p) const {
149 size_t t = static_cast<size_t>(std::log(max_size + 1.0) / std::log(static_cast<double>(p.base)));
150 std::vector<pow> fs;
151 #ifdef FFT_SIMPLE_PLANNER
152 // use largest radixes, i.e. 2^4 2^4 2^1
153 for(size_t e = p.exponent ; ; ) {
154 if(e > t) {
155 fs.push_back(pow(p.base, t));
156 e -= t;
157 } else if(e <= t) {
158 fs.push_back(pow(p.base, e));
159 break;
160 }
161 }
162 #else
163 // avoid very small radixes, i.e. 2^3 2^3 2^3
164 // number of parts
165 size_t m = (p.exponent + t - 1) / t;
166 // two levels.
167 size_t r = t * m - p.exponent;
168 size_t u = m * (r / m);
169 size_t v = t - (r / m);
170 for(size_t i = 0 ; i < m - r + u ; i++)
171 fs.push_back(pow(p.base, v));
172 for(size_t i = 0 ; i < r - u ; i++)
173 fs.push_back(pow(p.base, v - 1));
174 #endif
175 return fs;
176 }
177 };
178
179
180 template <class Tv, class Planner = planner>
181 struct plan {
182 typedef typename cl_scalar_of<Tv>::type Ts;
183 static_assert(
184 std::is_same<Ts, cl_float >::value ||
185 std::is_same<Ts, cl_double>::value,
186 "Only float and double data supported."
187 );
188
189 typedef typename cl_vector_of<Ts, 2>::type T2;
190
191 VEX_FUNCTION_S(T2, r2c, (Ts, v), type_name<T2>() + " r = {v, 0}; return r;");
192 VEX_FUNCTION_S(Ts, c2r, (T2, v), "return v.x;");
193 VEX_FUNCTION_S(T2, scl, (T2, v)(Ts, s), "v.x *= s; v.y *= s; return v;");
194
195 const std::vector<backend::command_queue> &queues;
196 Planner planner;
197 Ts scale;
198 const std::vector<size_t> sizes;
199
200 std::vector<kernel_call> kernels;
201
202 size_t input, output;
203 std::vector< vex::vector<T2> > bufs;
204
205 profiler<> *profile;
206
207 // \param sizes
208 // 1D case: {n}.
209 // 2D case: {h, w} in row-major format: x + y * w. (like FFTw)
210 // etc.
planvex::fft::plan211 plan(const std::vector<backend::command_queue> &_queues, const std::vector<size_t> sizes,
212 const std::vector<direction> dirs, const Planner &planner = Planner())
213 : queues(_queues), planner(planner), sizes(sizes), profile(NULL)
214 {
215 assert(sizes.size() >= 1);
216 assert(sizes.size() == dirs.size());
217
218 precondition(
219 queues.size() == 1,
220 "FFT is only supported for single-device contexts."
221 );
222
223 auto queue = queues[0];
224
225 size_t total_n = std::accumulate(sizes.begin(), sizes.end(),
226 static_cast<size_t>(1), std::multiplies<size_t>());
227 size_t current = bufs.size(); bufs.push_back(vex::vector<T2>(queues, total_n));
228 size_t other = bufs.size(); bufs.push_back(vex::vector<T2>(queues, total_n));
229
230 size_t inv_n = 1;
231 for(size_t i = 0 ; i < sizes.size() ; i++)
232 if(dirs[i] == inverse)
233 inv_n *= sizes[i];
234 scale = (Ts)1 / inv_n;
235
236 // Build the list of kernels.
237 input = current;
238 for(size_t i = 1 ; i <= sizes.size() ; i++) {
239 const size_t j = sizes.size() - i;
240 const size_t w = sizes[j], h = total_n / w;
241 if(w > 1) {
242 // 1D, each row.
243 if(dirs[j] != none)
244 plan_cooley_tukey(dirs[j] == inverse, w, h, current, other, false);
245
246 if(h > 1 && !(dirs.size() == 2 && dirs[0] == none)) {
247 kernels.push_back(transpose_kernel<Ts>(queue, w, h, bufs[current](), bufs[other]()));
248 std::swap(current, other);
249 }
250 }
251 }
252 output = current;
253 }
254
plan_cooley_tukeyvex::fft::plan255 void plan_cooley_tukey(bool inverse, size_t n, size_t batch, size_t ¤t, size_t &other, bool once) {
256 size_t p = 1;
257 auto rs = planner.factor(n);
258 for(auto r = rs.begin() ; r != rs.end() ; r++) {
259 if(r->exponent == 0) {
260 plan_bluestein(n, batch, inverse, r->base, p, current, other);
261 p *= r->base;
262 } else {
263 kernels.push_back(radix_kernel<Ts>(once, queues[0], n, batch,
264 inverse, *r, p, bufs[current](), bufs[other]()));
265 std::swap(current, other);
266 p *= r->value;
267 }
268 }
269 }
270
plan_bluesteinvex::fft::plan271 void plan_bluestein(size_t width, size_t batch, bool inverse, size_t n, size_t p, size_t ¤t, size_t &other) {
272 size_t conv_n = planner.best_size(2 * n);
273 size_t threads = width / n;
274
275 size_t b_twiddle = bufs.size(); bufs.push_back(vex::vector<T2>(queues, n));
276 size_t b_other = bufs.size(); bufs.push_back(vex::vector<T2>(queues, conv_n));
277 size_t b_current = bufs.size(); bufs.push_back(vex::vector<T2>(queues, conv_n));
278 size_t a_current = bufs.size(); bufs.push_back(vex::vector<T2>(queues, conv_n * batch * threads));
279 size_t a_other = bufs.size(); bufs.push_back(vex::vector<T2>(queues, conv_n * batch * threads));
280
281 // calculate twiddle factors
282 kernels.push_back(bluestein_twiddle<Ts>(queues[0], n, inverse,
283 bufs[b_twiddle]())); // once
284
285 // first part of the convolution
286 kernels.push_back(bluestein_pad_kernel<Ts>(queues[0], n, conv_n,
287 bufs[b_twiddle](), bufs[b_current]())); // once
288
289 plan_cooley_tukey(false, conv_n, 1, b_current, b_other, /*once*/true);
290
291 // other part of convolution
292 kernels.push_back(bluestein_mul_in<Ts>(queues[0], inverse, batch, n, p, threads, conv_n,
293 bufs[current](), bufs[b_twiddle](), bufs[a_current]()));
294
295 plan_cooley_tukey(false, conv_n, threads * batch, a_current, a_other, false);
296
297 // calculate convolution
298 kernels.push_back(bluestein_mul<Ts>(queues[0], conv_n, threads * batch,
299 bufs[a_current](), bufs[b_current](), bufs[a_other]()));
300 std::swap(a_current, a_other);
301
302 plan_cooley_tukey(true, conv_n, threads * batch, a_current, a_other, false);
303
304 // twiddle again
305 kernels.push_back(bluestein_mul_out<Ts>(queues[0], batch, p, n, threads, conv_n,
306 bufs[a_current](), bufs[b_twiddle](), bufs[other]()));
307 std::swap(current, other);
308 }
309
310 // Execute the complete transformation.
311 // Converts real-valued input and output, supports multiply-adding to output.
312 template<class Expr>
transformvex::fft::plan313 void transform(const Expr &in) {
314 if(profile) {
315 std::ostringstream prof_name;
316 prof_name << "fft(n={";
317 for(size_t i = 0 ; i < sizes.size() ; i++) {
318 if(i != 0) prof_name << ", ";
319 prof_name << sizes[i];
320 }
321 prof_name << "}, scale=" << scale << ")";
322 profile->tic_cl(prof_name.str());
323 profile->tic_cl("in");
324 }
325 vector<T2> &in_c = bufs[input];
326 if(cl_vector_length<Tv>::value == 1) in_c = r2c(in);
327 else in_c = in;
328 if(profile) profile->toc("in");
329 for(auto run = kernels.begin(); run != kernels.end(); ++run) {
330 if(!run->once || run->count == 0) {
331 #ifdef FFT_DUMP_ARRAYS
332 for(auto b = bufs.begin(); b != bufs.end(); ++b)
333 std::cerr << " " << std::setprecision(2) << (*b)()() << " = " << (*b) << std::endl;
334 std::cerr << "run " << run->desc << std::endl;
335 #endif
336 if(profile) {
337 std::ostringstream s;
338 s << "(k" << (run - kernels.begin()) << ")";
339 if(run->once) s << " ONCE";
340 s << " " << run->desc;
341 profile->tic_cl(s.str());
342 }
343 run->kernel(queues[0]);
344 run->count++;
345 if(profile) profile->toc("");
346 }
347 }
348 #ifdef FFT_DUMP_ARRAYS
349 for(auto b = bufs.begin(); b != bufs.end(); ++b)
350 std::cerr << " " << (*b)()() << " = " << (*b) << std::endl;
351 #endif
352 if (profile) profile->toc("");
353 }
354
355 template <typename Tout, class Expr>
applyvex::fft::plan356 auto apply(const Expr &expr) ->
357 typename std::enable_if<
358 cl_vector_length<Tout>::value == 1,
359 decltype( scale * c2r(bufs[output]) )
360 >::type
361 {
362 transform(expr);
363 return scale * c2r(bufs[output]);
364 }
365
366 template <typename Tout, class Expr>
applyvex::fft::plan367 auto apply(const Expr &expr) ->
368 typename std::enable_if<
369 cl_vector_length<Tout>::value == 2,
370 decltype( scl(bufs[output], scale) )
371 >::type
372 {
373 transform(expr);
374 return scl(bufs[output], scale);
375 }
376
descvex::fft::plan377 std::string desc() const {
378 std::ostringstream o;
379 o << "FFT(";
380 // sizes
381 for(auto n = sizes.begin() ; n != sizes.end() ; n++) {
382 if(n != sizes.begin()) o << " x ";
383 o << *n;
384 auto fs = prime_factors(*n);
385 if(fs.size() > 1) {
386 o << '=';
387 for(auto f = fs.begin() ; f != fs.end() ; f++) {
388 if(f != fs.begin()) o << '*';
389 o << *f;
390 }
391 }
392 }
393 o << ")";
394 return o.str();
395 }
396 };
397
398
399 template <class T, class P>
operator <<(std::ostream & o,const plan<T,P> & p)400 inline std::ostream &operator<<(std::ostream &o, const plan<T,P> &p) {
401 o << p.desc() << "{\n";
402 for(auto k = p.kernels.begin() ; k != p.kernels.end() ; k++) {
403 o << " ";
404 if(k->once) o << "once: ";
405 o << k->desc << "\n";
406 }
407 return o << "}";
408 }
409
410 } // namespace fft
411 } // namespace vex
412 #endif
413