1 #ifndef VIENNACL_LINALG_CUDA_FFT_OPERATIONS_HPP_
2 #define VIENNACL_LINALG_CUDA_FFT_OPERATIONS_HPP_
3
4 /* =========================================================================
5 Copyright (c) 2010-2016, Institute for Microelectronics,
6 Institute for Analysis and Scientific Computing,
7 TU Wien.
8 Portions of this software are copyright by UChicago Argonne, LLC.
9
10 -----------------
11 ViennaCL - The Vienna Computing Library
12 -----------------
13
14 Project Head: Karl Rupp rupp@iue.tuwien.ac.at
15
16 (A list of authors and contributors can be found in the manual)
17
18 License: MIT (X11), see file LICENSE in the base directory
19 ============================================================================= */
20
21 /** @file viennacl/linalg/cuda/fft_operations.hpp
22 @brief Implementations of Fast Furier Transformation using cuda
23 */
24 #include <cmath>
25 #include <viennacl/matrix.hpp>
26 #include <viennacl/vector.hpp>
27
28 #include "viennacl/forwards.h"
29 #include "viennacl/scalar.hpp"
30 #include "viennacl/tools/tools.hpp"
31 #include "viennacl/linalg/cuda/common.hpp"
32 #include "viennacl/linalg/host_based/vector_operations.hpp"
33 #include "viennacl/linalg/host_based/fft_operations.hpp"
34
35 namespace viennacl
36 {
37 namespace linalg
38 {
39 namespace cuda
40 {
41 namespace detail
42 {
43 namespace fft
44 {
45 const vcl_size_t MAX_LOCAL_POINTS_NUM = 512;
46
num_bits(vcl_size_t size)47 inline vcl_size_t num_bits(vcl_size_t size)
48 {
49 vcl_size_t bits_datasize = 0;
50 vcl_size_t ds = 1;
51
52 while (ds < size)
53 {
54 ds = ds << 1;
55 bits_datasize++;
56 }
57
58 return bits_datasize;
59 }
60
next_power_2(vcl_size_t n)61 inline vcl_size_t next_power_2(vcl_size_t n)
62 {
63 n = n - 1;
64
65 vcl_size_t power = 1;
66
67 while (power < sizeof(vcl_size_t) * 8)
68 {
69 n = n | (n >> power);
70 power *= 2;
71 }
72
73 return n + 1;
74 }
75
76 } //namespace fft
77 } //namespace detail
78
79 // addition
operator +(float2 a,float2 b)80 inline __host__ __device__ float2 operator+(float2 a, float2 b)
81 {
82 return make_float2(a.x + b.x, a.y + b.y);
83 }
84
85 // subtract
operator -(float2 a,float2 b)86 inline __host__ __device__ float2 operator-(float2 a, float2 b)
87 {
88 return make_float2(a.x - b.x, a.y - b.y);
89 }
90 // division
91 template<typename SCALARTYPE>
operator /(float2 a,SCALARTYPE b)92 inline __device__ float2 operator/(float2 a,SCALARTYPE b)
93 {
94 return make_float2(a.x/b, a.y/b);
95 }
96
97 //multiplication
operator *(float2 in1,float2 in2)98 inline __device__ float2 operator*(float2 in1, float2 in2)
99 {
100 return make_float2(in1.x * in2.x - in1.y * in2.y, in1.x * in2.y + in1.y * in2.x);
101 }
102
103 // addition
operator +(double2 a,double2 b)104 inline __host__ __device__ double2 operator+(double2 a, double2 b)
105 {
106 return make_double2(a.x + b.x, a.y + b.y);
107 }
108
109 // subtraction
operator -(double2 a,double2 b)110 inline __host__ __device__ double2 operator-(double2 a, double2 b)
111 {
112 return make_double2(a.x - b.x, a.y - b.y);
113 }
114
115 // division
116 template<typename SCALARTYPE>
operator /(double2 a,SCALARTYPE b)117 inline __host__ __device__ double2 operator/(double2 a,SCALARTYPE b)
118 {
119 return make_double2(a.x/b, a.y/b);
120 }
121
122 //multiplication
operator *(double2 in1,double2 in2)123 inline __host__ __device__ double2 operator*(double2 in1, double2 in2)
124 {
125 return make_double2(in1.x * in2.x - in1.y * in2.y, in1.x * in2.y + in1.y * in2.x);
126 }
127
get_reorder_num(unsigned int v,unsigned int bit_size)128 inline __device__ unsigned int get_reorder_num(unsigned int v, unsigned int bit_size)
129 {
130 v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);
131 v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);
132 v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);
133 v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);
134 v = (v >> 16) | (v << 16);
135 v = v >> (32 - bit_size);
136 return v;
137 }
138
139 template<typename Numeric2T, typename NumericT>
fft_direct(const Numeric2T * input,Numeric2T * output,unsigned int size,unsigned int stride,unsigned int batch_num,NumericT sign,bool is_row_major)140 __global__ void fft_direct(
141 const Numeric2T * input,
142 Numeric2T * output,
143 unsigned int size,
144 unsigned int stride,
145 unsigned int batch_num,
146 NumericT sign,
147 bool is_row_major)
148 {
149
150 const NumericT NUM_PI(3.14159265358979323846);
151
152 for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++)
153 {
154 for (unsigned int k = blockIdx.x * blockDim.x + threadIdx.x; k < size; k += gridDim.x * blockDim.x)
155 {
156 Numeric2T f;
157 f.x = 0;
158 f.y = 0;
159
160 for (unsigned int n = 0; n < size; n++)
161 {
162 Numeric2T in;
163 if (!is_row_major)
164 in = input[batch_id * stride + n]; //input index here
165 else
166 in = input[n * stride + batch_id];//input index here
167
168 NumericT sn,cs;
169 NumericT arg = sign * 2 * NUM_PI * k / size * n;
170 sn = sin(arg);
171 cs = cos(arg);
172
173 Numeric2T ex;
174 ex.x = cs;
175 ex.y = sn;
176 Numeric2T tmp;
177 tmp.x = in.x * ex.x - in.y * ex.y;
178 tmp.y = in.x * ex.y + in.y * ex.x;
179 f = f + tmp;
180 }
181
182 if (!is_row_major)
183 output[batch_id * stride + k] = f; // output index here
184 else
185 output[k * stride + batch_id] = f;// output index here
186 }
187 }
188 }
189
190 /**
191 * @brief Direct 1D algorithm for computing Fourier transformation.
192 *
193 * Works on any sizes of data.
194 * Serial implementation has o(n^2) complexity
195 */
196 template<typename NumericT, unsigned int AlignmentV>
direct(viennacl::vector<NumericT,AlignmentV> const & in,viennacl::vector<NumericT,AlignmentV> & out,vcl_size_t size,vcl_size_t stride,vcl_size_t batch_num,NumericT sign=NumericT (-1),viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order=viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)197 void direct(viennacl::vector<NumericT, AlignmentV> const & in,
198 viennacl::vector<NumericT, AlignmentV> & out,
199 vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num,
200 NumericT sign = NumericT(-1),
201 viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
202 {
203 typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;
204
205 fft_direct<<<128,128>>>(reinterpret_cast<const numeric2_type *>(viennacl::cuda_arg(in)),
206 reinterpret_cast< numeric2_type *>(viennacl::cuda_arg(out)),
207 static_cast<unsigned int>(size),
208 static_cast<unsigned int>(stride),
209 static_cast<unsigned int>(batch_num),
210 sign,
211 bool(data_order != viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR));
212 VIENNACL_CUDA_LAST_ERROR_CHECK("fft_direct");
213 }
214
215 /**
216 * @brief Direct 2D algorithm for computing Fourier transformation.
217 *
218 * Works on any sizes of data.
219 * Serial implementation has o(n^2) complexity
220 */
221 template<typename NumericT, unsigned int AlignmentV>
direct(viennacl::matrix<NumericT,viennacl::row_major,AlignmentV> const & in,viennacl::matrix<NumericT,viennacl::row_major,AlignmentV> & out,vcl_size_t size,vcl_size_t stride,vcl_size_t batch_num,NumericT sign=NumericT (-1),viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order=viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)222 void direct(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> const & in,
223 viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> & out,
224 vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num,
225 NumericT sign = NumericT(-1),
226 viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
227 {
228 typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;
229
230 fft_direct<<<128,128>>>(reinterpret_cast<const numeric2_type *>(viennacl::cuda_arg(in)),
231 reinterpret_cast< numeric2_type *>(viennacl::cuda_arg(out)),
232 static_cast<unsigned int>(size),
233 static_cast<unsigned int>(stride),
234 static_cast<unsigned int>(batch_num),
235 sign,
236 bool(data_order != viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR));
237 VIENNACL_CUDA_LAST_ERROR_CHECK("fft_direct");
238 }
239
240 template<typename NumericT>
fft_reorder(NumericT * input,unsigned int bit_size,unsigned int size,unsigned int stride,unsigned int batch_num,bool is_row_major)241 __global__ void fft_reorder(NumericT * input,
242 unsigned int bit_size,
243 unsigned int size,
244 unsigned int stride,
245 unsigned int batch_num,
246 bool is_row_major)
247 {
248
249 unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
250 unsigned int glb_sz = gridDim.x * blockDim.x;
251
252 for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++)
253 {
254 for (unsigned int i = glb_id; i < size; i += glb_sz)
255 {
256 unsigned int v = get_reorder_num(i, bit_size);
257
258 if (i < v)
259 {
260 if (!is_row_major)
261 {
262 NumericT tmp = input[batch_id * stride + i]; // index
263 input[batch_id * stride + i] = input[batch_id * stride + v];//index
264 input[batch_id * stride + v] = tmp;//index
265 }
266 else
267 {
268 NumericT tmp = input[i * stride + batch_id];
269 input[i * stride + batch_id] = input[v * stride + batch_id];
270 input[v * stride + batch_id] = tmp;
271 }
272 }
273 }
274 }
275 }
276
277 /***
278 * This function performs reorder of input data. Indexes are sorted in bit-reversal order.
279 * Such reordering should be done before in-place FFT.
280 */
281 template<typename NumericT, unsigned int AlignmentV>
reorder(viennacl::vector<NumericT,AlignmentV> & in,vcl_size_t size,vcl_size_t stride,vcl_size_t bits_datasize,vcl_size_t batch_num,viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order=viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)282 void reorder(viennacl::vector<NumericT, AlignmentV> & in,
283 vcl_size_t size, vcl_size_t stride, vcl_size_t bits_datasize, vcl_size_t batch_num,
284 viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
285 {
286 typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;
287
288 fft_reorder<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
289 static_cast<unsigned int>(bits_datasize),
290 static_cast<unsigned int>(size),
291 static_cast<unsigned int>(stride),
292 static_cast<unsigned int>(batch_num),
293 static_cast<bool>(data_order));
294 VIENNACL_CUDA_LAST_ERROR_CHECK("fft_reorder");
295 }
296
297 template<typename Numeric2T, typename NumericT>
fft_radix2_local(Numeric2T * input,unsigned int bit_size,unsigned int size,unsigned int stride,unsigned int batch_num,NumericT sign,bool is_row_major)298 __global__ void fft_radix2_local(Numeric2T * input,
299 unsigned int bit_size,
300 unsigned int size,
301 unsigned int stride,
302 unsigned int batch_num,
303 NumericT sign,
304 bool is_row_major)
305 {
306 __shared__ Numeric2T lcl_input[1024];
307 unsigned int grp_id = blockIdx.x;
308 unsigned int grp_num = gridDim.x;
309
310 unsigned int lcl_sz = blockDim.x;
311 unsigned int lcl_id = threadIdx.x;
312 const NumericT NUM_PI(3.14159265358979323846);
313
314 for (unsigned int batch_id = grp_id; batch_id < batch_num; batch_id += grp_num)
315 {
316 for (unsigned int p = lcl_id; p < size; p += lcl_sz)
317 {
318 unsigned int v = get_reorder_num(p, bit_size);
319 if (!is_row_major)
320 lcl_input[v] = input[batch_id * stride + p];
321 else
322 lcl_input[v] = input[p * stride + batch_id];
323 }
324
325 __syncthreads();
326
327 //performs Cooley-Tukey FFT on local arrayfft
328 for (unsigned int s = 0; s < bit_size; s++)
329 {
330 unsigned int ss = 1 << s;
331 NumericT cs, sn;
332 for (unsigned int tid = lcl_id; tid < size; tid += lcl_sz)
333 {
334 unsigned int group = (tid & (ss - 1));
335 unsigned int pos = ((tid >> s) << (s + 1)) + group;
336
337 Numeric2T in1 = lcl_input[pos];
338 Numeric2T in2 = lcl_input[pos + ss];
339
340 NumericT arg = group * sign * NUM_PI / ss;
341
342 sn = sin(arg);
343 cs = cos(arg);
344 Numeric2T ex;
345 ex.x = cs;
346 ex.y = sn;
347
348 Numeric2T tmp;
349 tmp.x = in2.x * ex.x - in2.y * ex.y;
350 tmp.y = in2.x * ex.y + in2.y * ex.x;
351
352 lcl_input[pos + ss] = in1 - tmp;
353 lcl_input[pos] = in1 + tmp;
354 }
355 __syncthreads();
356 }
357
358 //copy local array back to global memory
359 for (unsigned int p = lcl_id; p < size; p += lcl_sz)
360 {
361 if (!is_row_major)
362 input[batch_id * stride + p] = lcl_input[p]; //index
363 else
364 input[p * stride + batch_id] = lcl_input[p];
365 }
366
367 }
368 }
369
370 template<typename Numeric2T, typename NumericT>
fft_radix2(Numeric2T * input,unsigned int s,unsigned int bit_size,unsigned int size,unsigned int stride,unsigned int batch_num,NumericT sign,bool is_row_major)371 __global__ void fft_radix2(Numeric2T * input,
372 unsigned int s,
373 unsigned int bit_size,
374 unsigned int size,
375 unsigned int stride,
376 unsigned int batch_num,
377 NumericT sign,
378 bool is_row_major)
379 {
380
381 unsigned int ss = 1 << s;
382 unsigned int half_size = size >> 1;
383
384 NumericT cs, sn;
385 const NumericT NUM_PI(3.14159265358979323846);
386
387 unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
388 unsigned int glb_sz = gridDim.x * blockDim.x;
389
390 for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++)
391 {
392 for (unsigned int tid = glb_id; tid < half_size; tid += glb_sz)
393 {
394 unsigned int group = (tid & (ss - 1));
395 unsigned int pos = ((tid >> s) << (s + 1)) + group;
396 Numeric2T in1;
397 Numeric2T in2;
398 unsigned int offset;
399 if (!is_row_major)
400 {
401 offset = batch_id * stride + pos;
402 in1 = input[offset]; //index
403 in2 = input[offset + ss];//index
404 }
405 else
406 {
407 offset = pos * stride + batch_id;
408 in1 = input[offset]; //index
409 in2 = input[offset + ss * stride];//index
410 }
411
412 NumericT arg = group * sign * NUM_PI / ss;
413
414 sn = sin(arg);
415 cs = cos(arg);
416
417 Numeric2T ex;
418 ex.x = cs;
419 ex.y = sn;
420
421 Numeric2T tmp;
422 tmp.x = in2.x * ex.x - in2.y * ex.y;
423 tmp.y = in2.x * ex.y + in2.y * ex.x;
424
425 if (!is_row_major)
426 input[offset + ss] = in1 - tmp; //index
427 else
428 input[offset + ss * stride] = in1 - tmp; //index
429 input[offset] = in1 + tmp; //index
430 }
431 }
432 }
433
434 /**
435 * @brief Radix-2 1D algorithm for computing Fourier transformation.
436 *
437 * Works only on power-of-two sizes of data.
438 * Serial implementation has o(n * lg n) complexity.
439 * This is a Cooley-Tukey algorithm
440 */
441 template<typename NumericT, unsigned int AlignmentV>
radix2(viennacl::vector<NumericT,AlignmentV> & in,vcl_size_t size,vcl_size_t stride,vcl_size_t batch_num,NumericT sign=NumericT (-1),viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order=viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)442 void radix2(viennacl::vector<NumericT, AlignmentV> & in,
443 vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num, NumericT sign = NumericT(-1),
444 viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
445 {
446 typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;
447
448 unsigned int bit_size = viennacl::linalg::cuda::detail::fft::num_bits(size);
449
450 if (size <= viennacl::linalg::cuda::detail::fft::MAX_LOCAL_POINTS_NUM)
451 {
452 fft_radix2_local<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
453 static_cast<unsigned int>(bit_size),
454 static_cast<unsigned int>(size),
455 static_cast<unsigned int>(stride),
456 static_cast<unsigned int>(batch_num),
457 static_cast<NumericT>(sign),
458 static_cast<bool>(data_order));
459 VIENNACL_CUDA_LAST_ERROR_CHECK("fft_radix2_local");
460 }
461 else
462 {
463 fft_reorder<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
464 static_cast<unsigned int>(bit_size),
465 static_cast<unsigned int>(size),
466 static_cast<unsigned int>(stride),
467 static_cast<unsigned int>(batch_num),
468 static_cast<bool>(data_order));
469 VIENNACL_CUDA_LAST_ERROR_CHECK("fft_reorder");
470
471 for (vcl_size_t step = 0; step < bit_size; step++)
472 {
473 fft_radix2<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
474 static_cast<unsigned int>(step),
475 static_cast<unsigned int>(bit_size),
476 static_cast<unsigned int>(size),
477 static_cast<unsigned int>(stride),
478 static_cast<unsigned int>(batch_num),
479 sign,
480 static_cast<bool>(data_order));
481 VIENNACL_CUDA_LAST_ERROR_CHECK("fft_radix2");
482 }
483 }
484 }
485
486 /**
487 * @brief Radix-2 2D algorithm for computing Fourier transformation.
488 *
489 * Works only on power-of-two sizes of data.
490 * Serial implementation has o(n * lg n) complexity.
491 * This is a Cooley-Tukey algorithm
492 */
493 template<typename NumericT, unsigned int AlignmentV>
radix2(viennacl::matrix<NumericT,viennacl::row_major,AlignmentV> & in,vcl_size_t size,vcl_size_t stride,vcl_size_t batch_num,NumericT sign=NumericT (-1),viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order=viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)494 void radix2(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV>& in,
495 vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num, NumericT sign = NumericT(-1),
496 viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
497 {
498 typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;
499
500 unsigned int bit_size = viennacl::linalg::cuda::detail::fft::num_bits(size);
501
502 if (size <= viennacl::linalg::cuda::detail::fft::MAX_LOCAL_POINTS_NUM)
503 {
504 fft_radix2_local<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
505 static_cast<unsigned int>(bit_size),
506 static_cast<unsigned int>(size),
507 static_cast<unsigned int>(stride),
508 static_cast<unsigned int>(batch_num),
509 sign,
510 static_cast<bool>(data_order));
511 VIENNACL_CUDA_LAST_ERROR_CHECK("fft_radix2_local");
512 }
513 else
514 {
515 fft_reorder<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
516 static_cast<unsigned int>(bit_size),
517 static_cast<unsigned int>(size),
518 static_cast<unsigned int>(stride),
519 static_cast<unsigned int>(batch_num),
520 static_cast<bool>(data_order));
521 VIENNACL_CUDA_LAST_ERROR_CHECK("fft_reorder");
522 for (vcl_size_t step = 0; step < bit_size; step++)
523 {
524 fft_radix2<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
525 static_cast<unsigned int>(step),
526 static_cast<unsigned int>(bit_size),
527 static_cast<unsigned int>(size),
528 static_cast<unsigned int>(stride),
529 static_cast<unsigned int>(batch_num),
530 sign,
531 static_cast<bool>(data_order));
532 VIENNACL_CUDA_LAST_ERROR_CHECK("fft_radix2");
533 }
534 }
535 }
536
537 template<typename Numeric2T, typename NumericT>
bluestein_post(Numeric2T * Z,Numeric2T * out,unsigned int size,NumericT sign)538 __global__ void bluestein_post(Numeric2T * Z, Numeric2T * out, unsigned int size, NumericT sign)
539 {
540 unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
541 unsigned int glb_sz =gridDim.x * blockDim.x;
542
543 unsigned int double_size = size << 1;
544 NumericT sn_a, cs_a;
545 const NumericT NUM_PI(3.14159265358979323846);
546
547 for (unsigned int i = glb_id; i < size; i += glb_sz)
548 {
549 unsigned int rm = i * i % (double_size);
550 NumericT angle = (NumericT)rm / size * (-NUM_PI);
551
552 sn_a = sin(angle);
553 cs_a= cos(angle);
554
555 Numeric2T b_i;
556 b_i.x = cs_a;
557 b_i.y = sn_a;
558 out[i].x = Z[i].x * b_i.x - Z[i].y * b_i.y;
559 out[i].y = Z[i].x * b_i.y + Z[i].y * b_i.x;
560 }
561 }
562
563 template<typename Numeric2T, typename NumericT>
bluestein_pre(Numeric2T * input,Numeric2T * A,Numeric2T * B,unsigned int size,unsigned int ext_size,NumericT sign)564 __global__ void bluestein_pre(Numeric2T * input, Numeric2T * A, Numeric2T * B,
565 unsigned int size, unsigned int ext_size, NumericT sign)
566 {
567 unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
568 unsigned int glb_sz = gridDim.x * blockDim.x;
569
570 unsigned int double_size = size << 1;
571
572 NumericT sn_a, cs_a;
573 const NumericT NUM_PI(3.14159265358979323846);
574
575 for (unsigned int i = glb_id; i < size; i += glb_sz)
576 {
577 unsigned int rm = i * i % (double_size);
578 NumericT angle = (NumericT)rm / size * NUM_PI;
579
580 sn_a = sin(-angle);
581 cs_a= cos(-angle);
582
583 Numeric2T a_i;
584 a_i.x =cs_a;
585 a_i.y =sn_a;
586
587 Numeric2T b_i;
588 b_i.x =cs_a;
589 b_i.y =-sn_a;
590
591 A[i].x = input[i].x * a_i.x - input[i].y * a_i.y;
592 A[i].y = input[i].x * a_i.y + input[i].y * a_i.x;
593 B[i] = b_i;
594
595 // very bad instruction, to be fixed
596 if (i)
597 B[ext_size - i] = b_i;
598 }
599 }
600
601 template<typename NumericT>
zero2(NumericT * input1,NumericT * input2,unsigned int size)602 __global__ void zero2(NumericT * input1, NumericT * input2, unsigned int size)
603 {
604 for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
605 {
606 input1[i].x = 0;
607 input1[i].y = 0;
608
609 input2[i].x = 0;
610 input2[i].y = 0;
611 }
612 }
613
614 /**
615 * @brief Bluestein's algorithm for computing Fourier transformation.
616 *
617 * Currently, Works only for sizes of input data which less than 2^16.
618 * Uses a lot of additional memory, but should be fast for any size of data.
619 * Serial implementation has something about o(n * lg n) complexity
620 */
621 template<typename NumericT, unsigned int AlignmentV>
bluestein(viennacl::vector<NumericT,AlignmentV> & in,viennacl::vector<NumericT,AlignmentV> & out,vcl_size_t)622 void bluestein(viennacl::vector<NumericT, AlignmentV> & in,
623 viennacl::vector<NumericT, AlignmentV> & out, vcl_size_t /*batch_num*/)
624 {
625 typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;
626
627 vcl_size_t size = in.size() >> 1;
628 vcl_size_t ext_size = viennacl::linalg::cuda::detail::fft::next_power_2(2 * size - 1);
629
630 viennacl::vector<NumericT, AlignmentV> A(ext_size << 1);
631 viennacl::vector<NumericT, AlignmentV> B(ext_size << 1);
632 viennacl::vector<NumericT, AlignmentV> Z(ext_size << 1);
633
634 zero2<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(A)),
635 reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(B)),
636 static_cast<unsigned int>(ext_size));
637 VIENNACL_CUDA_LAST_ERROR_CHECK("zero2");
638
639 bluestein_pre<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
640 reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(A)),
641 reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(B)),
642 static_cast<unsigned int>(size),
643 static_cast<unsigned int>(ext_size),
644 NumericT(1));
645 VIENNACL_CUDA_LAST_ERROR_CHECK("bluestein_pre");
646
647 viennacl::linalg::convolve_i(A, B, Z);
648
649 bluestein_post<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(Z)),
650 reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(out)),
651 static_cast<unsigned int>(size),
652 NumericT(1));
653 VIENNACL_CUDA_LAST_ERROR_CHECK("bluestein_post");
654 }
655
656 template<typename NumericT>
fft_mult_vec(const NumericT * input1,const NumericT * input2,NumericT * output,unsigned int size)657 __global__ void fft_mult_vec(const NumericT * input1,
658 const NumericT * input2,
659 NumericT * output,
660 unsigned int size)
661 {
662 for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
663 {
664 NumericT in1 = input1[i];
665 NumericT in2 = input2[i];
666 output[i] = in1 * in2;
667 }
668 }
669
670 /**
671 * @brief Mutiply two complex vectors and store result in output
672 */
673 template<typename NumericT, unsigned int AlignmentV>
multiply_complex(viennacl::vector<NumericT,AlignmentV> const & input1,viennacl::vector<NumericT,AlignmentV> const & input2,viennacl::vector<NumericT,AlignmentV> & output)674 void multiply_complex(viennacl::vector<NumericT, AlignmentV> const & input1,
675 viennacl::vector<NumericT, AlignmentV> const & input2,
676 viennacl::vector<NumericT, AlignmentV> & output)
677 {
678 typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;
679
680 vcl_size_t size = input1.size() / 2;
681
682 fft_mult_vec<<<128,128>>>(reinterpret_cast<const numeric2_type *>(viennacl::cuda_arg(input1)),
683 reinterpret_cast<const numeric2_type *>(viennacl::cuda_arg(input2)),
684 reinterpret_cast< numeric2_type *>(viennacl::cuda_arg(output)),
685 static_cast<unsigned int>(size));
686 VIENNACL_CUDA_LAST_ERROR_CHECK("fft_mult_vec");
687 }
688
689 template<typename Numeric2T, typename NumericT>
fft_div_vec_scalar(Numeric2T * input1,unsigned int size,NumericT factor)690 __global__ void fft_div_vec_scalar(Numeric2T * input1, unsigned int size, NumericT factor)
691 {
692 for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x*blockDim.x)
693 input1[i] = input1[i]/factor;
694 }
695
696 /**
697 * @brief Normalize vector on with his own size
698 */
699 template<typename NumericT, unsigned int AlignmentV>
normalize(viennacl::vector<NumericT,AlignmentV> & input)700 void normalize(viennacl::vector<NumericT, AlignmentV> & input)
701 {
702 typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;
703
704 vcl_size_t size = input.size() >> 1;
705 NumericT norm_factor = static_cast<NumericT>(size);
706 fft_div_vec_scalar<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(input)),
707 static_cast<unsigned int>(size),
708 norm_factor);
709 VIENNACL_CUDA_LAST_ERROR_CHECK("fft_div_vec_scalar");
710 }
711
712 template<typename NumericT>
transpose(const NumericT * input,NumericT * output,unsigned int row_num,unsigned int col_num)713 __global__ void transpose(const NumericT * input,
714 NumericT * output,
715 unsigned int row_num,
716 unsigned int col_num)
717 {
718 unsigned int size = row_num * col_num;
719 for (unsigned int i =blockIdx.x * blockDim.x + threadIdx.x; i < size; i+= gridDim.x * blockDim.x)
720 {
721 unsigned int row = i / col_num;
722 unsigned int col = i - row*col_num;
723 unsigned int new_pos = col * row_num + row;
724 output[new_pos] = input[i];
725 }
726 }
727
728 /**
729 * @brief Transpose matrix
730 */
731 template<typename NumericT, unsigned int AlignmentV>
transpose(viennacl::matrix<NumericT,viennacl::row_major,AlignmentV> const & input,viennacl::matrix<NumericT,viennacl::row_major,AlignmentV> & output)732 void transpose(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> const & input,
733 viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> & output)
734 {
735 typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;
736
737 transpose<<<128,128>>>(reinterpret_cast<const numeric2_type *>(viennacl::cuda_arg(input)),
738 reinterpret_cast< numeric2_type *>(viennacl::cuda_arg(output)),
739 static_cast<unsigned int>(input.internal_size1()>>1),
740 static_cast<unsigned int>(input.internal_size2()>>1));
741 VIENNACL_CUDA_LAST_ERROR_CHECK("transpose");
742
743 }
744
745 template<typename NumericT>
transpose_inplace(NumericT * input,unsigned int row_num,unsigned int col_num)746 __global__ void transpose_inplace(
747 NumericT * input,
748 unsigned int row_num,
749 unsigned int col_num)
750 {
751 unsigned int size = row_num * col_num;
752 for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i+= gridDim.x * blockDim.x)
753 {
754 unsigned int row = i / col_num;
755 unsigned int col = i - row*col_num;
756 unsigned int new_pos = col * row_num + row;
757 if (i < new_pos)
758 {
759 NumericT val = input[i];
760 input[i] = input[new_pos];
761 input[new_pos] = val;
762 }
763 }
764 }
765
766 /**
767 * @brief Inplace_transpose matrix
768 */
769 template<typename NumericT, unsigned int AlignmentV>
transpose(viennacl::matrix<NumericT,viennacl::row_major,AlignmentV> & input)770 void transpose(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> & input)
771 {
772 typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;
773
774 transpose_inplace<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(input)),
775 static_cast<unsigned int>(input.internal_size1()>>1),
776 static_cast<unsigned int>(input.internal_size2() >> 1));
777 VIENNACL_CUDA_LAST_ERROR_CHECK("transpose_inplace");
778
779 }
780
781 template<typename RealT,typename ComplexT>
real_to_complex(const RealT * in,ComplexT * out,unsigned int size)782 __global__ void real_to_complex(const RealT * in, ComplexT * out, unsigned int size)
783 {
784 for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
785 {
786 ComplexT val;
787 val.x = in[i];
788 val.y = 0;
789 out[i] = val;
790 }
791 }
792
793 /**
794 * @brief Create complex vector from real vector (even elements(2*k) = real part, odd elements(2*k+1) = imaginary part)
795 */
796 template<typename NumericT>
real_to_complex(viennacl::vector_base<NumericT> const & in,viennacl::vector_base<NumericT> & out,vcl_size_t size)797 void real_to_complex(viennacl::vector_base<NumericT> const & in,
798 viennacl::vector_base<NumericT> & out, vcl_size_t size)
799 {
800 typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;
801
802 real_to_complex<<<128,128>>>(viennacl::cuda_arg(in),
803 reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(out)),
804 static_cast<unsigned int>(size));
805 VIENNACL_CUDA_LAST_ERROR_CHECK("real_to_complex");
806 }
807
808 template<typename ComplexT,typename RealT>
complex_to_real(const ComplexT * in,RealT * out,unsigned int size)809 __global__ void complex_to_real(const ComplexT * in, RealT * out, unsigned int size)
810 {
811 for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
812 out[i] = in[i].x;
813 }
814
815 /**
816 * @brief Create real vector from complex vector (even elements(2*k) = real part, odd elements(2*k+1) = imaginary part)
817 */
818 template<typename NumericT>
complex_to_real(viennacl::vector_base<NumericT> const & in,viennacl::vector_base<NumericT> & out,vcl_size_t size)819 void complex_to_real(viennacl::vector_base<NumericT> const & in,
820 viennacl::vector_base<NumericT>& out, vcl_size_t size)
821 {
822 typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;
823
824 complex_to_real<<<128,128>>>(reinterpret_cast<const numeric2_type *>(viennacl::cuda_arg(in)),
825 viennacl::cuda_arg(out),
826 static_cast<unsigned int>(size));
827 VIENNACL_CUDA_LAST_ERROR_CHECK("complex_to_real");
828
829 }
830
831 template<typename NumericT>
reverse_inplace(NumericT * vec,unsigned int size)832 __global__ void reverse_inplace(NumericT * vec, unsigned int size)
833 {
834 for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < (size >> 1); i+=gridDim.x * blockDim.x)
835 {
836 NumericT val1 = vec[i];
837 NumericT val2 = vec[size - i - 1];
838 vec[i] = val2;
839 vec[size - i - 1] = val1;
840 }
841 }
842
843 /**
844 * @brief Reverse vector to oposite order and save it in input vector
845 */
846 template<typename NumericT>
reverse(viennacl::vector_base<NumericT> & in)847 void reverse(viennacl::vector_base<NumericT>& in)
848 {
849 vcl_size_t size = in.size();
850 reverse_inplace<<<128,128>>>(viennacl::cuda_arg(in), static_cast<unsigned int>(size));
851 VIENNACL_CUDA_LAST_ERROR_CHECK("reverse_inplace");
852 }
853
854 } //namespace cuda
855 } //namespace linalg
856 } //namespace viennacl
857
858 #endif /* FFT_OPERATIONS_HPP_ */
859