#ifndef VIENNACL_LINALG_CUDA_FFT_OPERATIONS_HPP_
#define VIENNACL_LINALG_CUDA_FFT_OPERATIONS_HPP_

/* =========================================================================
   Copyright (c) 2010-2016, Institute for Microelectronics,
                            Institute for Analysis and Scientific Computing,
                            TU Wien.
   Portions of this software are copyright by UChicago Argonne, LLC.

                            -----------------
                  ViennaCL - The Vienna Computing Library
                            -----------------

   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at

   (A list of authors and contributors can be found in the manual)

   License:         MIT (X11), see file LICENSE in the base directory
============================================================================= */

/** @file viennacl/linalg/cuda/fft_operations.hpp
    @brief Implementations of the Fast Fourier Transform using CUDA
*/
#include <cmath>
#include <viennacl/matrix.hpp>
#include <viennacl/vector.hpp>

#include "viennacl/forwards.h"
#include "viennacl/scalar.hpp"
#include "viennacl/tools/tools.hpp"
#include "viennacl/linalg/cuda/common.hpp"
#include "viennacl/linalg/host_based/vector_operations.hpp"
#include "viennacl/linalg/host_based/fft_operations.hpp"

namespace viennacl
{
namespace linalg
{
namespace cuda
{
namespace detail
{
  namespace fft
  {
    const vcl_size_t MAX_LOCAL_POINTS_NUM = 512;

    inline vcl_size_t num_bits(vcl_size_t size)
    {
      vcl_size_t bits_datasize = 0;
      vcl_size_t ds = 1;

      while (ds < size)
      {
        ds = ds << 1;
        bits_datasize++;
      }

      return bits_datasize;
    }

    inline vcl_size_t next_power_2(vcl_size_t n)
    {
      n = n - 1;

      vcl_size_t power = 1;

      while (power < sizeof(vcl_size_t) * 8)
      {
        n = n | (n >> power);
        power *= 2;
      }

      return n + 1;
    }
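
    // Illustrative note (added for clarity, not part of the original sources):
    // num_bits() returns the number of radix-2 stages needed for a data set,
    // i.e. the bit width of its indices, and next_power_2() rounds up to the
    // next power of two. A few worked values:
    //
    //   num_bits(16)     == 4    // 16 = 2^4
    //   num_bits(17)     == 5    // rounded up to 32 = 2^5
    //   next_power_2(12) == 16
    //   next_power_2(16) == 16   // already a power of two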

  } //namespace fft
} //namespace detail

// addition
inline __host__ __device__ float2 operator+(float2 a, float2 b)
{
  return make_float2(a.x + b.x, a.y + b.y);
}

// subtraction
inline __host__ __device__ float2 operator-(float2 a, float2 b)
{
  return make_float2(a.x - b.x, a.y - b.y);
}

// division
template<typename SCALARTYPE>
inline __device__ float2 operator/(float2 a, SCALARTYPE b)
{
  return make_float2(a.x/b, a.y/b);
}

// multiplication
inline __device__ float2 operator*(float2 in1, float2 in2)
{
  return make_float2(in1.x * in2.x - in1.y * in2.y, in1.x * in2.y + in1.y * in2.x);
}

// addition
inline __host__ __device__ double2 operator+(double2 a, double2 b)
{
  return make_double2(a.x + b.x, a.y + b.y);
}

// subtraction
inline __host__ __device__ double2 operator-(double2 a, double2 b)
{
  return make_double2(a.x - b.x, a.y - b.y);
}

// division
template<typename SCALARTYPE>
inline __host__ __device__ double2 operator/(double2 a, SCALARTYPE b)
{
  return make_double2(a.x/b, a.y/b);
}

// multiplication
inline __host__ __device__ double2 operator*(double2 in1, double2 in2)
{
  return make_double2(in1.x * in2.x - in1.y * in2.y, in1.x * in2.y + in1.y * in2.x);
}
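
// Note added for clarity (not in the original sources): the operator* overloads
// above implement standard complex multiplication on interleaved pairs
// (x = real part, y = imaginary part):
//
//   (a + bi) * (c + di) = (a*c - b*d) + (a*d + b*c)i
//
// e.g. (1 + 2i) * (3 + 4i) = (3 - 8) + (4 + 6)i = -5 + 10i.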

inline __device__ unsigned int get_reorder_num(unsigned int v, unsigned int bit_size)
{
  v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);
  v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);
  v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);
  v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);
  v = (v >> 16) | (v << 16);
  v = v >> (32 - bit_size);
  return v;
}
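
// Illustrative note (added for clarity, not part of the original sources):
// get_reorder_num() reverses the lowest bit_size bits of v, which is the
// bit-reversal permutation used by the radix-2 kernels below. For bit_size == 3
// the indices 0..7 map to
//
//   0, 4, 2, 6, 1, 5, 3, 7
//
// e.g. get_reorder_num(3, 3) == 6, since 3 = 0b011 reversed is 0b110.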

template<typename Numeric2T, typename NumericT>
__global__ void fft_direct(
    const Numeric2T * input,
    Numeric2T * output,
    unsigned int size,
    unsigned int stride,
    unsigned int batch_num,
    NumericT sign,
    bool is_row_major)
{
  const NumericT NUM_PI(3.14159265358979323846);

  for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++)
  {
    for (unsigned int k = blockIdx.x * blockDim.x + threadIdx.x; k < size; k += gridDim.x * blockDim.x)
    {
      Numeric2T f;
      f.x = 0;
      f.y = 0;

      for (unsigned int n = 0; n < size; n++)
      {
        Numeric2T in;
        if (!is_row_major)
          in = input[batch_id * stride + n];   // input index
        else
          in = input[n * stride + batch_id];   // input index

        NumericT sn, cs;
        NumericT arg = sign * 2 * NUM_PI * k / size * n;
        sn = sin(arg);
        cs = cos(arg);

        Numeric2T ex;
        ex.x = cs;
        ex.y = sn;
        Numeric2T tmp;
        tmp.x = in.x * ex.x - in.y * ex.y;
        tmp.y = in.x * ex.y + in.y * ex.x;
        f = f + tmp;
      }

      if (!is_row_major)
        output[batch_id * stride + k] = f;   // output index
      else
        output[k * stride + batch_id] = f;   // output index
    }
  }
}

/**
 * @brief Direct 1D algorithm for computing the Fourier transform.
 *
 * Works for data of any size.
 * The serial implementation has O(n^2) complexity.
 */
template<typename NumericT, unsigned int AlignmentV>
void direct(viennacl::vector<NumericT, AlignmentV> const & in,
            viennacl::vector<NumericT, AlignmentV>       & out,
            vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num,
            NumericT sign = NumericT(-1),
            viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
{
  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;

  fft_direct<<<128,128>>>(reinterpret_cast<const numeric2_type *>(viennacl::cuda_arg(in)),
                          reinterpret_cast<      numeric2_type *>(viennacl::cuda_arg(out)),
                          static_cast<unsigned int>(size),
                          static_cast<unsigned int>(stride),
                          static_cast<unsigned int>(batch_num),
                          sign,
                          bool(data_order != viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR));
  VIENNACL_CUDA_LAST_ERROR_CHECK("fft_direct");
}
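
// Hedged usage sketch (added for illustration, not part of the library): the vector
// overload of direct() expects interleaved complex data, i.e. a vector of 2*size
// scalars holding (real, imaginary) pairs. Assuming such a vector has already been
// filled on the device, a single forward transform might look roughly like this:
//
//   // vcl_size_t N = 64;                                // hypothetical problem size
//   // viennacl::vector<float> in(2 * N), out(2 * N);    // interleaved complex data
//   // ... fill 'in' ...
//   // viennacl::linalg::cuda::direct(in, out, N, N, 1); // size, stride, one batch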

/**
 * @brief Direct 2D algorithm for computing the Fourier transform.
 *
 * Works for data of any size.
 * The serial implementation has O(n^2) complexity.
 */
template<typename NumericT, unsigned int AlignmentV>
void direct(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> const & in,
            viennacl::matrix<NumericT, viennacl::row_major, AlignmentV>       & out,
            vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num,
            NumericT sign = NumericT(-1),
            viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
{
  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;

  fft_direct<<<128,128>>>(reinterpret_cast<const numeric2_type *>(viennacl::cuda_arg(in)),
                          reinterpret_cast<      numeric2_type *>(viennacl::cuda_arg(out)),
                          static_cast<unsigned int>(size),
                          static_cast<unsigned int>(stride),
                          static_cast<unsigned int>(batch_num),
                          sign,
                          bool(data_order != viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR));
  VIENNACL_CUDA_LAST_ERROR_CHECK("fft_direct");
}

template<typename NumericT>
__global__ void fft_reorder(NumericT * input,
                            unsigned int bit_size,
                            unsigned int size,
                            unsigned int stride,
                            unsigned int batch_num,
                            bool is_row_major)
{
  unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

  for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++)
  {
    for (unsigned int i = glb_id; i < size; i += glb_sz)
    {
      unsigned int v = get_reorder_num(i, bit_size);

      if (i < v)
      {
        if (!is_row_major)
        {
          NumericT tmp = input[batch_id * stride + i];
          input[batch_id * stride + i] = input[batch_id * stride + v];
          input[batch_id * stride + v] = tmp;
        }
        else
        {
          NumericT tmp = input[i * stride + batch_id];
          input[i * stride + batch_id] = input[v * stride + batch_id];
          input[v * stride + batch_id] = tmp;
        }
      }
    }
  }
}

/**
 * @brief Reorders the input data into bit-reversal order.
 *
 * Such a reordering must be carried out before an in-place radix-2 FFT.
 */
template<typename NumericT, unsigned int AlignmentV>
void reorder(viennacl::vector<NumericT, AlignmentV> & in,
             vcl_size_t size, vcl_size_t stride, vcl_size_t bits_datasize, vcl_size_t batch_num,
             viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
{
  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;

  fft_reorder<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
                           static_cast<unsigned int>(bits_datasize),
                           static_cast<unsigned int>(size),
                           static_cast<unsigned int>(stride),
                           static_cast<unsigned int>(batch_num),
                           static_cast<bool>(data_order));
  VIENNACL_CUDA_LAST_ERROR_CHECK("fft_reorder");
}

template<typename Numeric2T, typename NumericT>
__global__ void fft_radix2_local(Numeric2T * input,
                                 unsigned int bit_size,
                                 unsigned int size,
                                 unsigned int stride,
                                 unsigned int batch_num,
                                 NumericT sign,
                                 bool is_row_major)
{
  __shared__ Numeric2T lcl_input[1024];
  unsigned int grp_id = blockIdx.x;
  unsigned int grp_num = gridDim.x;

  unsigned int lcl_sz = blockDim.x;
  unsigned int lcl_id = threadIdx.x;
  const NumericT NUM_PI(3.14159265358979323846);

  for (unsigned int batch_id = grp_id; batch_id < batch_num; batch_id += grp_num)
  {
    // copy the batch into shared memory in bit-reversal order
    for (unsigned int p = lcl_id; p < size; p += lcl_sz)
    {
      unsigned int v = get_reorder_num(p, bit_size);
      if (!is_row_major)
        lcl_input[v] = input[batch_id * stride + p];
      else
        lcl_input[v] = input[p * stride + batch_id];
    }

    __syncthreads();

    // perform the Cooley-Tukey FFT on the local array
    for (unsigned int s = 0; s < bit_size; s++)
    {
      unsigned int ss = 1 << s;
      NumericT cs, sn;
      for (unsigned int tid = lcl_id; tid < size; tid += lcl_sz)
      {
        unsigned int group = (tid & (ss - 1));
        unsigned int pos = ((tid >> s) << (s + 1)) + group;

        Numeric2T in1 = lcl_input[pos];
        Numeric2T in2 = lcl_input[pos + ss];

        NumericT arg = group * sign * NUM_PI / ss;

        sn = sin(arg);
        cs = cos(arg);
        Numeric2T ex;
        ex.x = cs;
        ex.y = sn;

        Numeric2T tmp;
        tmp.x = in2.x * ex.x - in2.y * ex.y;
        tmp.y = in2.x * ex.y + in2.y * ex.x;

        lcl_input[pos + ss] = in1 - tmp;
        lcl_input[pos]      = in1 + tmp;
      }
      __syncthreads();
    }

    // copy the local array back to global memory
    for (unsigned int p = lcl_id; p < size; p += lcl_sz)
    {
      if (!is_row_major)
        input[batch_id * stride + p] = lcl_input[p];
      else
        input[p * stride + batch_id] = lcl_input[p];
    }
  }
}

template<typename Numeric2T, typename NumericT>
__global__ void fft_radix2(Numeric2T * input,
                           unsigned int s,
                           unsigned int bit_size,
                           unsigned int size,
                           unsigned int stride,
                           unsigned int batch_num,
                           NumericT sign,
                           bool is_row_major)
{
  unsigned int ss = 1 << s;
  unsigned int half_size = size >> 1;

  NumericT cs, sn;
  const NumericT NUM_PI(3.14159265358979323846);

  unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

  for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++)
  {
    for (unsigned int tid = glb_id; tid < half_size; tid += glb_sz)
    {
      unsigned int group = (tid & (ss - 1));
      unsigned int pos = ((tid >> s) << (s + 1)) + group;
      Numeric2T in1;
      Numeric2T in2;
      unsigned int offset;
      if (!is_row_major)
      {
        offset = batch_id * stride + pos;
        in1 = input[offset];
        in2 = input[offset + ss];
      }
      else
      {
        offset = pos * stride + batch_id;
        in1 = input[offset];
        in2 = input[offset + ss * stride];
      }

      NumericT arg = group * sign * NUM_PI / ss;

      sn = sin(arg);
      cs = cos(arg);

      Numeric2T ex;
      ex.x = cs;
      ex.y = sn;

      Numeric2T tmp;
      tmp.x = in2.x * ex.x - in2.y * ex.y;
      tmp.y = in2.x * ex.y + in2.y * ex.x;

      if (!is_row_major)
        input[offset + ss] = in1 - tmp;
      else
        input[offset + ss * stride] = in1 - tmp;
      input[offset] = in1 + tmp;
    }
  }
}
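
// Note added for clarity (not in the original sources): each pass of fft_radix2,
// like the inner loop of fft_radix2_local, applies the standard radix-2 butterfly
// with twiddle factor w = exp(i * sign * pi * group / ss):
//
//   out[pos]      = in1 + w * in2
//   out[pos + ss] = in1 - w * in2
//
// After bit_size such passes over bit-reversed data, the array holds the transform.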

/**
 * @brief Radix-2 1D algorithm for computing the Fourier transform.
 *
 * Works only for power-of-two data sizes.
 * The serial implementation has O(n log n) complexity.
 * This is a Cooley-Tukey algorithm.
 */
template<typename NumericT, unsigned int AlignmentV>
void radix2(viennacl::vector<NumericT, AlignmentV> & in,
            vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num, NumericT sign = NumericT(-1),
            viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
{
  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;

  unsigned int bit_size = viennacl::linalg::cuda::detail::fft::num_bits(size);

  if (size <= viennacl::linalg::cuda::detail::fft::MAX_LOCAL_POINTS_NUM)
  {
    fft_radix2_local<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
                                  static_cast<unsigned int>(bit_size),
                                  static_cast<unsigned int>(size),
                                  static_cast<unsigned int>(stride),
                                  static_cast<unsigned int>(batch_num),
                                  static_cast<NumericT>(sign),
                                  static_cast<bool>(data_order));
    VIENNACL_CUDA_LAST_ERROR_CHECK("fft_radix2_local");
  }
  else
  {
    fft_reorder<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
                             static_cast<unsigned int>(bit_size),
                             static_cast<unsigned int>(size),
                             static_cast<unsigned int>(stride),
                             static_cast<unsigned int>(batch_num),
                             static_cast<bool>(data_order));
    VIENNACL_CUDA_LAST_ERROR_CHECK("fft_reorder");

    for (vcl_size_t step = 0; step < bit_size; step++)
    {
      fft_radix2<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
                              static_cast<unsigned int>(step),
                              static_cast<unsigned int>(bit_size),
                              static_cast<unsigned int>(size),
                              static_cast<unsigned int>(stride),
                              static_cast<unsigned int>(batch_num),
                              sign,
                              static_cast<bool>(data_order));
      VIENNACL_CUDA_LAST_ERROR_CHECK("fft_radix2");
    }
  }
}
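
// Hedged usage sketch (added for illustration, not part of the library): radix2()
// transforms interleaved complex data in place and requires a power-of-two size.
// Assuming a device vector that already holds 2*N interleaved scalars:
//
//   // vcl_size_t N = 256;                              // must be a power of two
//   // viennacl::vector<float> v(2 * N);                // interleaved complex data
//   // ... fill 'v' ...
//   // viennacl::linalg::cuda::radix2(v, N, N, 1);      // in-place forward FFT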

/**
 * @brief Radix-2 2D algorithm for computing the Fourier transform.
 *
 * Works only for power-of-two data sizes.
 * The serial implementation has O(n log n) complexity.
 * This is a Cooley-Tukey algorithm.
 */
template<typename NumericT, unsigned int AlignmentV>
void radix2(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV>& in,
            vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num, NumericT sign = NumericT(-1),
            viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
{
  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;

  unsigned int bit_size = viennacl::linalg::cuda::detail::fft::num_bits(size);

  if (size <= viennacl::linalg::cuda::detail::fft::MAX_LOCAL_POINTS_NUM)
  {
    fft_radix2_local<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
                                  static_cast<unsigned int>(bit_size),
                                  static_cast<unsigned int>(size),
                                  static_cast<unsigned int>(stride),
                                  static_cast<unsigned int>(batch_num),
                                  sign,
                                  static_cast<bool>(data_order));
    VIENNACL_CUDA_LAST_ERROR_CHECK("fft_radix2_local");
  }
  else
  {
    fft_reorder<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
                             static_cast<unsigned int>(bit_size),
                             static_cast<unsigned int>(size),
                             static_cast<unsigned int>(stride),
                             static_cast<unsigned int>(batch_num),
                             static_cast<bool>(data_order));
    VIENNACL_CUDA_LAST_ERROR_CHECK("fft_reorder");
    for (vcl_size_t step = 0; step < bit_size; step++)
    {
      fft_radix2<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
                              static_cast<unsigned int>(step),
                              static_cast<unsigned int>(bit_size),
                              static_cast<unsigned int>(size),
                              static_cast<unsigned int>(stride),
                              static_cast<unsigned int>(batch_num),
                              sign,
                              static_cast<bool>(data_order));
      VIENNACL_CUDA_LAST_ERROR_CHECK("fft_radix2");
    }
  }
}

template<typename Numeric2T, typename NumericT>
__global__ void bluestein_post(Numeric2T * Z, Numeric2T * out, unsigned int size, NumericT sign)
{
  unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

  unsigned int double_size = size << 1;
  NumericT sn_a, cs_a;
  const NumericT NUM_PI(3.14159265358979323846);

  for (unsigned int i = glb_id; i < size; i += glb_sz)
  {
    unsigned int rm = i * i % (double_size);
    NumericT angle = (NumericT)rm / size * (-NUM_PI);

    sn_a = sin(angle);
    cs_a = cos(angle);

    Numeric2T b_i;
    b_i.x = cs_a;
    b_i.y = sn_a;
    out[i].x = Z[i].x * b_i.x - Z[i].y * b_i.y;
    out[i].y = Z[i].x * b_i.y + Z[i].y * b_i.x;
  }
}

template<typename Numeric2T, typename NumericT>
__global__ void bluestein_pre(Numeric2T * input, Numeric2T * A, Numeric2T * B,
                              unsigned int size, unsigned int ext_size, NumericT sign)
{
  unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

  unsigned int double_size = size << 1;

  NumericT sn_a, cs_a;
  const NumericT NUM_PI(3.14159265358979323846);

  for (unsigned int i = glb_id; i < size; i += glb_sz)
  {
    unsigned int rm = i * i % (double_size);
    NumericT angle = (NumericT)rm / size * NUM_PI;

    sn_a = sin(-angle);
    cs_a = cos(-angle);

    Numeric2T a_i;
    a_i.x = cs_a;
    a_i.y = sn_a;

    Numeric2T b_i;
    b_i.x = cs_a;
    b_i.y = -sn_a;

    A[i].x = input[i].x * a_i.x - input[i].y * a_i.y;
    A[i].y = input[i].x * a_i.y + input[i].y * a_i.x;
    B[i] = b_i;

    // very bad instruction, to be fixed
    if (i)
      B[ext_size - i] = b_i;
  }
}

template<typename NumericT>
__global__ void zero2(NumericT * input1, NumericT * input2, unsigned int size)
{
  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
  {
    input1[i].x = 0;
    input1[i].y = 0;

    input2[i].x = 0;
    input2[i].y = 0;
  }
}

/**
 * @brief Bluestein's algorithm for computing the Fourier transform.
 *
 * Currently works only for input sizes of less than 2^16.
 * Uses a considerable amount of additional memory, but should be fast for data of any size.
 * The serial implementation has roughly O(n log n) complexity.
 */
template<typename NumericT, unsigned int AlignmentV>
void bluestein(viennacl::vector<NumericT, AlignmentV> & in,
               viennacl::vector<NumericT, AlignmentV> & out, vcl_size_t /*batch_num*/)
{
  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;

  vcl_size_t size = in.size() >> 1;
  vcl_size_t ext_size = viennacl::linalg::cuda::detail::fft::next_power_2(2 * size - 1);

  viennacl::vector<NumericT, AlignmentV> A(ext_size << 1);
  viennacl::vector<NumericT, AlignmentV> B(ext_size << 1);
  viennacl::vector<NumericT, AlignmentV> Z(ext_size << 1);

  zero2<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(A)),
                     reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(B)),
                     static_cast<unsigned int>(ext_size));
  VIENNACL_CUDA_LAST_ERROR_CHECK("zero2");

  bluestein_pre<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
                             reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(A)),
                             reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(B)),
                             static_cast<unsigned int>(size),
                             static_cast<unsigned int>(ext_size),
                             NumericT(1));
  VIENNACL_CUDA_LAST_ERROR_CHECK("bluestein_pre");

  viennacl::linalg::convolve_i(A, B, Z);

  bluestein_post<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(Z)),
                              reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(out)),
                              static_cast<unsigned int>(size),
                              NumericT(1));
  VIENNACL_CUDA_LAST_ERROR_CHECK("bluestein_post");
}
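
// Hedged usage sketch (added for illustration, not part of the library): unlike
// radix2(), bluestein() also handles sizes that are not powers of two. It reads
// interleaved complex input and writes the transform to a separate output vector:
//
//   // vcl_size_t N = 100;                                 // arbitrary size
//   // viennacl::vector<float> in(2 * N), out(2 * N);      // interleaved complex data
//   // ... fill 'in' ...
//   // viennacl::linalg::cuda::bluestein(in, out, 1);      // batch argument is unused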

template<typename NumericT>
__global__ void fft_mult_vec(const NumericT * input1,
                             const NumericT * input2,
                             NumericT * output,
                             unsigned int size)
{
  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
  {
    NumericT in1 = input1[i];
    NumericT in2 = input2[i];
    output[i] = in1 * in2;
  }
}

/**
 * @brief Multiply two complex vectors element-wise and store the result in output
 */
template<typename NumericT, unsigned int AlignmentV>
void multiply_complex(viennacl::vector<NumericT, AlignmentV> const & input1,
                      viennacl::vector<NumericT, AlignmentV> const & input2,
                      viennacl::vector<NumericT, AlignmentV> & output)
{
  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;

  vcl_size_t size = input1.size() / 2;

  fft_mult_vec<<<128,128>>>(reinterpret_cast<const numeric2_type *>(viennacl::cuda_arg(input1)),
                            reinterpret_cast<const numeric2_type *>(viennacl::cuda_arg(input2)),
                            reinterpret_cast<      numeric2_type *>(viennacl::cuda_arg(output)),
                            static_cast<unsigned int>(size));
  VIENNACL_CUDA_LAST_ERROR_CHECK("fft_mult_vec");
}
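
// Note added for clarity (not in the original sources): element-wise multiplication
// of two spectra corresponds to circular convolution of the original signals, which
// is the usual reason to call multiply_complex() between a forward and an inverse
// transform. A hedged sketch, assuming three interleaved complex device vectors
// 'fft_a', 'fft_b' and 'fft_c' (hypothetical names) of equal length that already
// hold transformed data:
//
//   // viennacl::linalg::cuda::multiply_complex(fft_a, fft_b, fft_c);
//   // ... an inverse transform of 'fft_c' then yields the circular convolution ...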

template<typename Numeric2T, typename NumericT>
__global__ void fft_div_vec_scalar(Numeric2T * input1, unsigned int size, NumericT factor)
{
  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
    input1[i] = input1[i] / factor;
}

/**
 * @brief Normalize a vector by dividing each complex entry by the number of complex elements
 */
template<typename NumericT, unsigned int AlignmentV>
void normalize(viennacl::vector<NumericT, AlignmentV> & input)
{
  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;

  vcl_size_t size = input.size() >> 1;
  NumericT norm_factor = static_cast<NumericT>(size);
  fft_div_vec_scalar<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(input)),
                                  static_cast<unsigned int>(size),
                                  norm_factor);
  VIENNACL_CUDA_LAST_ERROR_CHECK("fft_div_vec_scalar");
}
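
// Hedged note (added for illustration, not part of the library): the transform
// routines above take a 'sign' parameter, with -1 as the default for the forward
// transform. One way to obtain the scaled inverse transform from these building
// blocks is to run the transform with sign = +1 and then divide by the size,
// roughly:
//
//   // viennacl::linalg::cuda::radix2(v, N, N, 1, +1.0f); // backward transform
//   // viennacl::linalg::cuda::normalize(v);              // scale by 1/N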

template<typename NumericT>
__global__ void transpose(const NumericT * input,
                          NumericT * output,
                          unsigned int row_num,
                          unsigned int col_num)
{
  unsigned int size = row_num * col_num;
  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
  {
    unsigned int row = i / col_num;
    unsigned int col = i - row * col_num;
    unsigned int new_pos = col * row_num + row;
    output[new_pos] = input[i];
  }
}

/**
 * @brief Transpose matrix
 */
template<typename NumericT, unsigned int AlignmentV>
void transpose(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> const & input,
               viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> & output)
{
  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;

  transpose<<<128,128>>>(reinterpret_cast<const numeric2_type *>(viennacl::cuda_arg(input)),
                         reinterpret_cast<      numeric2_type *>(viennacl::cuda_arg(output)),
                         static_cast<unsigned int>(input.internal_size1() >> 1),
                         static_cast<unsigned int>(input.internal_size2() >> 1));
  VIENNACL_CUDA_LAST_ERROR_CHECK("transpose");
}

template<typename NumericT>
__global__ void transpose_inplace(
    NumericT * input,
    unsigned int row_num,
    unsigned int col_num)
{
  unsigned int size = row_num * col_num;
  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
  {
    unsigned int row = i / col_num;
    unsigned int col = i - row * col_num;
    unsigned int new_pos = col * row_num + row;
    if (i < new_pos)
    {
      NumericT val = input[i];
      input[i] = input[new_pos];
      input[new_pos] = val;
    }
  }
}

/**
 * @brief In-place transpose of a matrix
 */
template<typename NumericT, unsigned int AlignmentV>
void transpose(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> & input)
{
  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;

  transpose_inplace<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(input)),
                                 static_cast<unsigned int>(input.internal_size1() >> 1),
                                 static_cast<unsigned int>(input.internal_size2() >> 1));
  VIENNACL_CUDA_LAST_ERROR_CHECK("transpose_inplace");
}

template<typename RealT, typename ComplexT>
__global__ void real_to_complex(const RealT * in, ComplexT * out, unsigned int size)
{
  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
  {
    ComplexT val;
    val.x = in[i];
    val.y = 0;
    out[i] = val;
  }
}

/**
 * @brief Create a complex vector from a real vector (even elements (2k) hold the real part, odd elements (2k+1) the imaginary part)
 */
template<typename NumericT>
void real_to_complex(viennacl::vector_base<NumericT> const & in,
                     viennacl::vector_base<NumericT> & out, vcl_size_t size)
{
  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;

  real_to_complex<<<128,128>>>(viennacl::cuda_arg(in),
                               reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(out)),
                               static_cast<unsigned int>(size));
  VIENNACL_CUDA_LAST_ERROR_CHECK("real_to_complex");
}
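
// Hedged usage sketch (added for illustration, not part of the library): a typical
// real-input pipeline packs N real samples into interleaved complex form, runs the
// transform, and unpacks whatever representation is needed afterwards. Roughly:
//
//   // viennacl::vector<float> x(N), X(2 * N);             // real input, complex buffer
//   // viennacl::linalg::cuda::real_to_complex(x, X, N);   // (re, 0) pairs
//   // viennacl::linalg::cuda::radix2(X, N, N, 1);         // N must be a power of two
//   // viennacl::linalg::cuda::complex_to_real(X, x, N);   // keep only the real parts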

template<typename ComplexT, typename RealT>
__global__ void complex_to_real(const ComplexT * in, RealT * out, unsigned int size)
{
  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
    out[i] = in[i].x;
}

/**
 * @brief Create a real vector from a complex vector (even elements (2k) hold the real part, odd elements (2k+1) the imaginary part)
 */
template<typename NumericT>
void complex_to_real(viennacl::vector_base<NumericT> const & in,
                     viennacl::vector_base<NumericT> & out, vcl_size_t size)
{
  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;

  complex_to_real<<<128,128>>>(reinterpret_cast<const numeric2_type *>(viennacl::cuda_arg(in)),
                               viennacl::cuda_arg(out),
                               static_cast<unsigned int>(size));
  VIENNACL_CUDA_LAST_ERROR_CHECK("complex_to_real");
}

template<typename NumericT>
__global__ void reverse_inplace(NumericT * vec, unsigned int size)
{
  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < (size >> 1); i += gridDim.x * blockDim.x)
  {
    NumericT val1 = vec[i];
    NumericT val2 = vec[size - i - 1];
    vec[i] = val2;
    vec[size - i - 1] = val1;
  }
}

/**
 * @brief Reverse a vector to the opposite order, storing the result in the input vector
 */
template<typename NumericT>
void reverse(viennacl::vector_base<NumericT> & in)
{
  vcl_size_t size = in.size();
  reverse_inplace<<<128,128>>>(viennacl::cuda_arg(in), static_cast<unsigned int>(size));
  VIENNACL_CUDA_LAST_ERROR_CHECK("reverse_inplace");
}

}  //namespace cuda
}  //namespace linalg
}  //namespace viennacl

#endif /* VIENNACL_LINALG_CUDA_FFT_OPERATIONS_HPP_ */