1 #ifndef VIENNACL_LINALG_OPENCL_KERNELS_FFT_HPP
2 #define VIENNACL_LINALG_OPENCL_KERNELS_FFT_HPP
3 
4 /* =========================================================================
5    Copyright (c) 2010-2016, Institute for Microelectronics,
6                             Institute for Analysis and Scientific Computing,
7                             TU Wien.
8    Portions of this software are copyright by UChicago Argonne, LLC.
9 
10                             -----------------
11                   ViennaCL - The Vienna Computing Library
12                             -----------------
13 
14    Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
15 
16    (A list of authors and contributors can be found in the manual)
17 
18    License:         MIT (X11), see file LICENSE in the base directory
19 ============================================================================= */
20 
21 #include "viennacl/tools/tools.hpp"
22 #include "viennacl/ocl/kernel.hpp"
23 #include "viennacl/ocl/platform.hpp"
24 #include "viennacl/ocl/utils.hpp"
25 
26 /** @file viennacl/linalg/opencl/kernels/fft.hpp
27  *  @brief OpenCL kernel file for FFT operations */
28 namespace viennacl
29 {
30 namespace linalg
31 {
32 namespace opencl
33 {
34 namespace kernels
35 {
36 
37 //////////////////////////// Part 1: Kernel generation routines ////////////////////////////////////
38 
39 
40 // Postprocessing phase of Bluestein algorithm
41 template<typename StringT>
generate_fft_bluestein_post(StringT & source,std::string const & numeric_string)42 void generate_fft_bluestein_post(StringT & source, std::string const & numeric_string)
43 {
44   source.append("__kernel void bluestein_post(__global "); source.append(numeric_string); source.append("2 *Z, \n");
45   source.append("                             __global "); source.append(numeric_string); source.append("2 *out, \n");
46   source.append("                             unsigned int size) \n");
47   source.append("{ \n");
48   source.append("  unsigned int glb_id = get_global_id(0); \n");
49   source.append("  unsigned int glb_sz = get_global_size(0); \n");
50 
51   source.append("  unsigned int double_size = size << 1; \n");
52   source.append("  "); source.append(numeric_string); source.append(" sn_a, cs_a; \n");
53   source.append("  const "); source.append(numeric_string); source.append(" NUM_PI = 3.14159265358979323846; \n");
54 
55   source.append("  for (unsigned int i = glb_id; i < size; i += glb_sz) { \n");
56   source.append("    unsigned int rm = i * i % (double_size); \n");
57   source.append("    "); source.append(numeric_string); source.append(" angle = ("); source.append(numeric_string); source.append(")rm / size * (-NUM_PI); \n");
58 
59   source.append("    sn_a = sincos(angle, &cs_a); \n");
60 
61   source.append("    "); source.append(numeric_string); source.append("2 b_i = ("); source.append(numeric_string); source.append("2)(cs_a, sn_a); \n");
62   source.append("    out[i] = ("); source.append(numeric_string); source.append("2)(Z[i].x * b_i.x - Z[i].y * b_i.y, Z[i].x * b_i.y + Z[i].y * b_i.x); \n");
63   source.append("  } \n");
64   source.append("} \n");
65 }
66 
67 // Preprocessing phase of Bluestein algorithm
68 template<typename StringT>
generate_fft_bluestein_pre(StringT & source,std::string const & numeric_string)69 void generate_fft_bluestein_pre(StringT & source, std::string const & numeric_string)
70 {
71   source.append("__kernel void bluestein_pre(__global "); source.append(numeric_string); source.append("2 *input, \n");
72   source.append("  __global "); source.append(numeric_string); source.append("2 *A, \n");
73   source.append("  __global "); source.append(numeric_string); source.append("2 *B, \n");
74   source.append("  unsigned int size, \n");
75   source.append("  unsigned int ext_size \n");
76   source.append("  ) { \n");
77   source.append("  unsigned int glb_id = get_global_id(0); \n");
78   source.append("  unsigned int glb_sz = get_global_size(0); \n");
79 
80   source.append("  unsigned int double_size = size << 1; \n");
81 
82   source.append("  "); source.append(numeric_string); source.append(" sn_a, cs_a; \n");
83   source.append("  const "); source.append(numeric_string); source.append(" NUM_PI = 3.14159265358979323846; \n");
84 
85   source.append("  for (unsigned int i = glb_id; i < size; i += glb_sz) { \n");
86   source.append("    unsigned int rm = i * i % (double_size); \n");
87   source.append("    "); source.append(numeric_string); source.append(" angle = ("); source.append(numeric_string); source.append(")rm / size * NUM_PI; \n");
88 
89   source.append("    sn_a = sincos(-angle, &cs_a); \n");
90 
91   source.append("    "); source.append(numeric_string); source.append("2 a_i = ("); source.append(numeric_string); source.append("2)(cs_a, sn_a); \n");
92   source.append("    "); source.append(numeric_string); source.append("2 b_i = ("); source.append(numeric_string); source.append("2)(cs_a, -sn_a); \n");
93 
94   source.append("    A[i] = ("); source.append(numeric_string); source.append("2)(input[i].x * a_i.x - input[i].y * a_i.y, input[i].x * a_i.y + input[i].y * a_i.x); \n");
95   source.append("    B[i] = b_i; \n");
96 
97           // very bad instruction, to be fixed
98   source.append("    if (i) \n");
99   source.append("      B[ext_size - i] = b_i; \n");
100   source.append("  } \n");
101   source.append("} \n");
102 }
103 
104 /** @brief Extract real part of a complex number array */
105 template<typename StringT>
generate_fft_complex_to_real(StringT & source,std::string const & numeric_string)106 void generate_fft_complex_to_real(StringT & source, std::string const & numeric_string)
107 {
108   source.append("__kernel void complex_to_real(__global "); source.append(numeric_string); source.append("2 *in, \n");
109   source.append("  __global "); source.append(numeric_string); source.append("  *out, \n");
110   source.append("  unsigned int size) { \n");
111   source.append("  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))  \n");
112   source.append("    out[i] = in[i].x; \n");
113   source.append("} \n");
114 }
115 
116 /** @brief OpenCL kernel generation code for dividing a complex number by a real number */
117 template<typename StringT>
generate_fft_div_vec_scalar(StringT & source,std::string const & numeric_string)118 void generate_fft_div_vec_scalar(StringT & source, std::string const & numeric_string)
119 {
120   source.append("__kernel void fft_div_vec_scalar(__global "); source.append(numeric_string); source.append("2 *input1, \n");
121   source.append("  unsigned int size, \n");
122   source.append("  "); source.append(numeric_string); source.append(" factor) { \n");
123   source.append("  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))  \n");
124   source.append("    input1[i] /= factor; \n");
125   source.append("} \n");
126 }
127 
128 /** @brief Elementwise product of two complex vectors */
129 template<typename StringT>
generate_fft_mult_vec(StringT & source,std::string const & numeric_string)130 void generate_fft_mult_vec(StringT & source, std::string const & numeric_string)
131 {
132   source.append("__kernel void fft_mult_vec(__global const "); source.append(numeric_string); source.append("2 *input1, \n");
133   source.append("  __global const "); source.append(numeric_string); source.append("2 *input2, \n");
134   source.append("  __global "); source.append(numeric_string); source.append("2 *output, \n");
135   source.append("  unsigned int size) { \n");
136   source.append("  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
137   source.append("    "); source.append(numeric_string); source.append("2 in1 = input1[i]; \n");
138   source.append("    "); source.append(numeric_string); source.append("2 in2 = input2[i]; \n");
139 
140   source.append("    output[i] = ("); source.append(numeric_string); source.append("2)(in1.x * in2.x - in1.y * in2.y, in1.x * in2.y + in1.y * in2.x); \n");
141   source.append("  } \n");
142   source.append("} \n");
143 }
144 
145 /** @brief Embedds a real-valued vector into a complex one */
146 template<typename StringT>
generate_fft_real_to_complex(StringT & source,std::string const & numeric_string)147 void generate_fft_real_to_complex(StringT & source, std::string const & numeric_string)
148 {
149   source.append("__kernel void real_to_complex(__global "); source.append(numeric_string); source.append(" *in, \n");
150   source.append("  __global "); source.append(numeric_string); source.append("2 *out, \n");
151   source.append("  unsigned int size) { \n");
152   source.append("  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
153   source.append("    "); source.append(numeric_string); source.append("2 val = 0; \n");
154   source.append("    val.x = in[i]; \n");
155   source.append("    out[i] = val; \n");
156   source.append("  } \n");
157   source.append("} \n");
158 }
159 
160 /** @brief Reverses the entries in a vector */
161 template<typename StringT>
generate_fft_reverse_inplace(StringT & source,std::string const & numeric_string)162 void generate_fft_reverse_inplace(StringT & source, std::string const & numeric_string)
163 {
164   source.append("__kernel void reverse_inplace(__global "); source.append(numeric_string); source.append(" *vec, uint size) { \n");
165   source.append("  for (uint i = get_global_id(0); i < (size >> 1); i+=get_global_size(0)) { \n");
166   source.append("    "); source.append(numeric_string); source.append(" val1 = vec[i]; \n");
167   source.append("    "); source.append(numeric_string); source.append(" val2 = vec[size - i - 1]; \n");
168 
169   source.append("    vec[i] = val2; \n");
170   source.append("    vec[size - i - 1] = val1; \n");
171   source.append("  } \n");
172   source.append("} \n");
173 }
174 
175 /** @brief Simplistic matrix transpose function */
176 template<typename StringT>
generate_fft_transpose(StringT & source,std::string const & numeric_string)177 void generate_fft_transpose(StringT & source, std::string const & numeric_string)
178 {
179   source.append("__kernel void transpose(__global "); source.append(numeric_string); source.append("2 *input, \n");
180   source.append("  __global "); source.append(numeric_string); source.append("2 *output, \n");
181   source.append("  unsigned int row_num, \n");
182   source.append("  unsigned int col_num) { \n");
183   source.append("  unsigned int size = row_num * col_num; \n");
184   source.append("  for (unsigned int i = get_global_id(0); i < size; i+= get_global_size(0)) { \n");
185   source.append("    unsigned int row = i / col_num; \n");
186   source.append("    unsigned int col = i - row*col_num; \n");
187 
188   source.append("    unsigned int new_pos = col * row_num + row; \n");
189 
190   source.append("    output[new_pos] = input[i]; \n");
191   source.append("  } \n");
192   source.append("} \n");
193 }
194 
195 /** @brief Simplistic inplace matrix transpose function */
196 template<typename StringT>
generate_fft_transpose_inplace(StringT & source,std::string const & numeric_string)197 void generate_fft_transpose_inplace(StringT & source, std::string const & numeric_string)
198 {
199   source.append("__kernel void transpose_inplace(__global "); source.append(numeric_string); source.append("2* input, \n");
200   source.append("  unsigned int row_num, \n");
201   source.append("  unsigned int col_num) { \n");
202   source.append("  unsigned int size = row_num * col_num; \n");
203   source.append("  for (unsigned int i = get_global_id(0); i < size; i+= get_global_size(0)) { \n");
204   source.append("    unsigned int row = i / col_num; \n");
205   source.append("    unsigned int col = i - row*col_num; \n");
206 
207   source.append("    unsigned int new_pos = col * row_num + row; \n");
208 
209   source.append("    if (i < new_pos) { \n");
210   source.append("      "); source.append(numeric_string); source.append("2 val = input[i]; \n");
211   source.append("      input[i] = input[new_pos]; \n");
212   source.append("      input[new_pos] = val; \n");
213   source.append("    } \n");
214   source.append("  } \n");
215   source.append("} \n");
216 }
217 
218 /** @brief Computes the matrix vector product with a Vandermonde matrix */
219 template<typename StringT>
generate_fft_vandermonde_prod(StringT & source,std::string const & numeric_string)220 void generate_fft_vandermonde_prod(StringT & source, std::string const & numeric_string)
221 {
222   source.append("__kernel void vandermonde_prod(__global "); source.append(numeric_string); source.append(" *vander, \n");
223   source.append("  __global "); source.append(numeric_string); source.append(" *vector, \n");
224   source.append("  __global "); source.append(numeric_string); source.append(" *result, \n");
225   source.append("  uint size) { \n");
226   source.append("  for (uint i = get_global_id(0); i < size; i+= get_global_size(0)) { \n");
227   source.append("    "); source.append(numeric_string); source.append(" mul = vander[i]; \n");
228   source.append("    "); source.append(numeric_string); source.append(" pwr = 1; \n");
229   source.append("    "); source.append(numeric_string); source.append(" val = 0; \n");
230 
231   source.append("    for (uint j = 0; j < size; j++) { \n");
232   source.append("      val = val + pwr * vector[j]; \n");
233   source.append("      pwr *= mul; \n");
234   source.append("    } \n");
235 
236   source.append("    result[i] = val; \n");
237   source.append("  } \n");
238   source.append("} \n");
239 }
240 
241 /** @brief Zero two complex vectors (to avoid kernel launch overhead) */
242 template<typename StringT>
generate_fft_zero2(StringT & source,std::string const & numeric_string)243 void generate_fft_zero2(StringT & source, std::string const & numeric_string)
244 {
245   source.append("__kernel void zero2(__global "); source.append(numeric_string); source.append("2 *input1, \n");
246   source.append("  __global "); source.append(numeric_string); source.append("2 *input2, \n");
247   source.append("  unsigned int size) { \n");
248   source.append("  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
249   source.append("    input1[i] = 0; \n");
250   source.append("    input2[i] = 0; \n");
251   source.append("  } \n");
252   source.append("} \n");
253 }
254 
255 //////////////////////////// Part 2: Main kernel class ////////////////////////////////////
256 
257 // main kernel class
258 /** @brief Main kernel class for generating OpenCL kernels for the fast Fourier transform. */
259 template<typename NumericT>
260 struct fft
261 {
program_nameviennacl::linalg::opencl::kernels::fft262   static std::string program_name()
263   {
264     return viennacl::ocl::type_to_string<NumericT>::apply() + "_fft";
265   }
266 
initviennacl::linalg::opencl::kernels::fft267   static void init(viennacl::ocl::context & ctx)
268   {
269     static std::map<cl_context, bool> init_done;
270     if (!init_done[ctx.handle().get()])
271     {
272       viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
273       std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
274 
275       std::string source;
276       source.reserve(8192);
277 
278       viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
279 
280       // unary operations
281       if (numeric_string == "float" || numeric_string == "double")
282       {
283         generate_fft_bluestein_post(source, numeric_string);
284         generate_fft_bluestein_pre(source, numeric_string);
285         generate_fft_complex_to_real(source, numeric_string);
286         generate_fft_div_vec_scalar(source, numeric_string);
287         generate_fft_mult_vec(source, numeric_string);
288         generate_fft_real_to_complex(source, numeric_string);
289         generate_fft_reverse_inplace(source, numeric_string);
290         generate_fft_transpose(source, numeric_string);
291         generate_fft_transpose_inplace(source, numeric_string);
292         generate_fft_vandermonde_prod(source, numeric_string);
293         generate_fft_zero2(source, numeric_string);
294       }
295 
296       std::string prog_name = program_name();
297       #ifdef VIENNACL_BUILD_INFO
298       std::cout << "Creating program " << prog_name << std::endl;
299       #endif
300       ctx.add_program(source, prog_name);
301       init_done[ctx.handle().get()] = true;
302     } //if
303   } //init
304 };
305 
306 }  // namespace kernels
307 }  // namespace opencl
308 }  // namespace linalg
309 }  // namespace viennacl
310 #endif
311 
312