1 #ifndef VIENNACL_LINALG_OPENCL_KERNELS_FFT_HPP
2 #define VIENNACL_LINALG_OPENCL_KERNELS_FFT_HPP
3
4 /* =========================================================================
5 Copyright (c) 2010-2016, Institute for Microelectronics,
6 Institute for Analysis and Scientific Computing,
7 TU Wien.
8 Portions of this software are copyright by UChicago Argonne, LLC.
9
10 -----------------
11 ViennaCL - The Vienna Computing Library
12 -----------------
13
14 Project Head: Karl Rupp rupp@iue.tuwien.ac.at
15
16 (A list of authors and contributors can be found in the manual)
17
18 License: MIT (X11), see file LICENSE in the base directory
19 ============================================================================= */
20
21 #include "viennacl/tools/tools.hpp"
22 #include "viennacl/ocl/kernel.hpp"
23 #include "viennacl/ocl/platform.hpp"
24 #include "viennacl/ocl/utils.hpp"
25
26 /** @file viennacl/linalg/opencl/kernels/fft.hpp
27 * @brief OpenCL kernel file for FFT operations */
28 namespace viennacl
29 {
30 namespace linalg
31 {
32 namespace opencl
33 {
34 namespace kernels
35 {
36
37 //////////////////////////// Part 1: Kernel generation routines ////////////////////////////////////
38
39
40 // Postprocessing phase of Bluestein algorithm
41 template<typename StringT>
generate_fft_bluestein_post(StringT & source,std::string const & numeric_string)42 void generate_fft_bluestein_post(StringT & source, std::string const & numeric_string)
43 {
44 source.append("__kernel void bluestein_post(__global "); source.append(numeric_string); source.append("2 *Z, \n");
45 source.append(" __global "); source.append(numeric_string); source.append("2 *out, \n");
46 source.append(" unsigned int size) \n");
47 source.append("{ \n");
48 source.append(" unsigned int glb_id = get_global_id(0); \n");
49 source.append(" unsigned int glb_sz = get_global_size(0); \n");
50
51 source.append(" unsigned int double_size = size << 1; \n");
52 source.append(" "); source.append(numeric_string); source.append(" sn_a, cs_a; \n");
53 source.append(" const "); source.append(numeric_string); source.append(" NUM_PI = 3.14159265358979323846; \n");
54
55 source.append(" for (unsigned int i = glb_id; i < size; i += glb_sz) { \n");
56 source.append(" unsigned int rm = i * i % (double_size); \n");
57 source.append(" "); source.append(numeric_string); source.append(" angle = ("); source.append(numeric_string); source.append(")rm / size * (-NUM_PI); \n");
58
59 source.append(" sn_a = sincos(angle, &cs_a); \n");
60
61 source.append(" "); source.append(numeric_string); source.append("2 b_i = ("); source.append(numeric_string); source.append("2)(cs_a, sn_a); \n");
62 source.append(" out[i] = ("); source.append(numeric_string); source.append("2)(Z[i].x * b_i.x - Z[i].y * b_i.y, Z[i].x * b_i.y + Z[i].y * b_i.x); \n");
63 source.append(" } \n");
64 source.append("} \n");
65 }
66
67 // Preprocessing phase of Bluestein algorithm
68 template<typename StringT>
generate_fft_bluestein_pre(StringT & source,std::string const & numeric_string)69 void generate_fft_bluestein_pre(StringT & source, std::string const & numeric_string)
70 {
71 source.append("__kernel void bluestein_pre(__global "); source.append(numeric_string); source.append("2 *input, \n");
72 source.append(" __global "); source.append(numeric_string); source.append("2 *A, \n");
73 source.append(" __global "); source.append(numeric_string); source.append("2 *B, \n");
74 source.append(" unsigned int size, \n");
75 source.append(" unsigned int ext_size \n");
76 source.append(" ) { \n");
77 source.append(" unsigned int glb_id = get_global_id(0); \n");
78 source.append(" unsigned int glb_sz = get_global_size(0); \n");
79
80 source.append(" unsigned int double_size = size << 1; \n");
81
82 source.append(" "); source.append(numeric_string); source.append(" sn_a, cs_a; \n");
83 source.append(" const "); source.append(numeric_string); source.append(" NUM_PI = 3.14159265358979323846; \n");
84
85 source.append(" for (unsigned int i = glb_id; i < size; i += glb_sz) { \n");
86 source.append(" unsigned int rm = i * i % (double_size); \n");
87 source.append(" "); source.append(numeric_string); source.append(" angle = ("); source.append(numeric_string); source.append(")rm / size * NUM_PI; \n");
88
89 source.append(" sn_a = sincos(-angle, &cs_a); \n");
90
91 source.append(" "); source.append(numeric_string); source.append("2 a_i = ("); source.append(numeric_string); source.append("2)(cs_a, sn_a); \n");
92 source.append(" "); source.append(numeric_string); source.append("2 b_i = ("); source.append(numeric_string); source.append("2)(cs_a, -sn_a); \n");
93
94 source.append(" A[i] = ("); source.append(numeric_string); source.append("2)(input[i].x * a_i.x - input[i].y * a_i.y, input[i].x * a_i.y + input[i].y * a_i.x); \n");
95 source.append(" B[i] = b_i; \n");
96
97 // very bad instruction, to be fixed
98 source.append(" if (i) \n");
99 source.append(" B[ext_size - i] = b_i; \n");
100 source.append(" } \n");
101 source.append("} \n");
102 }
103
104 /** @brief Extract real part of a complex number array */
105 template<typename StringT>
generate_fft_complex_to_real(StringT & source,std::string const & numeric_string)106 void generate_fft_complex_to_real(StringT & source, std::string const & numeric_string)
107 {
108 source.append("__kernel void complex_to_real(__global "); source.append(numeric_string); source.append("2 *in, \n");
109 source.append(" __global "); source.append(numeric_string); source.append(" *out, \n");
110 source.append(" unsigned int size) { \n");
111 source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) \n");
112 source.append(" out[i] = in[i].x; \n");
113 source.append("} \n");
114 }
115
116 /** @brief OpenCL kernel generation code for dividing a complex number by a real number */
117 template<typename StringT>
generate_fft_div_vec_scalar(StringT & source,std::string const & numeric_string)118 void generate_fft_div_vec_scalar(StringT & source, std::string const & numeric_string)
119 {
120 source.append("__kernel void fft_div_vec_scalar(__global "); source.append(numeric_string); source.append("2 *input1, \n");
121 source.append(" unsigned int size, \n");
122 source.append(" "); source.append(numeric_string); source.append(" factor) { \n");
123 source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) \n");
124 source.append(" input1[i] /= factor; \n");
125 source.append("} \n");
126 }
127
128 /** @brief Elementwise product of two complex vectors */
129 template<typename StringT>
generate_fft_mult_vec(StringT & source,std::string const & numeric_string)130 void generate_fft_mult_vec(StringT & source, std::string const & numeric_string)
131 {
132 source.append("__kernel void fft_mult_vec(__global const "); source.append(numeric_string); source.append("2 *input1, \n");
133 source.append(" __global const "); source.append(numeric_string); source.append("2 *input2, \n");
134 source.append(" __global "); source.append(numeric_string); source.append("2 *output, \n");
135 source.append(" unsigned int size) { \n");
136 source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
137 source.append(" "); source.append(numeric_string); source.append("2 in1 = input1[i]; \n");
138 source.append(" "); source.append(numeric_string); source.append("2 in2 = input2[i]; \n");
139
140 source.append(" output[i] = ("); source.append(numeric_string); source.append("2)(in1.x * in2.x - in1.y * in2.y, in1.x * in2.y + in1.y * in2.x); \n");
141 source.append(" } \n");
142 source.append("} \n");
143 }
144
145 /** @brief Embedds a real-valued vector into a complex one */
146 template<typename StringT>
generate_fft_real_to_complex(StringT & source,std::string const & numeric_string)147 void generate_fft_real_to_complex(StringT & source, std::string const & numeric_string)
148 {
149 source.append("__kernel void real_to_complex(__global "); source.append(numeric_string); source.append(" *in, \n");
150 source.append(" __global "); source.append(numeric_string); source.append("2 *out, \n");
151 source.append(" unsigned int size) { \n");
152 source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
153 source.append(" "); source.append(numeric_string); source.append("2 val = 0; \n");
154 source.append(" val.x = in[i]; \n");
155 source.append(" out[i] = val; \n");
156 source.append(" } \n");
157 source.append("} \n");
158 }
159
160 /** @brief Reverses the entries in a vector */
161 template<typename StringT>
generate_fft_reverse_inplace(StringT & source,std::string const & numeric_string)162 void generate_fft_reverse_inplace(StringT & source, std::string const & numeric_string)
163 {
164 source.append("__kernel void reverse_inplace(__global "); source.append(numeric_string); source.append(" *vec, uint size) { \n");
165 source.append(" for (uint i = get_global_id(0); i < (size >> 1); i+=get_global_size(0)) { \n");
166 source.append(" "); source.append(numeric_string); source.append(" val1 = vec[i]; \n");
167 source.append(" "); source.append(numeric_string); source.append(" val2 = vec[size - i - 1]; \n");
168
169 source.append(" vec[i] = val2; \n");
170 source.append(" vec[size - i - 1] = val1; \n");
171 source.append(" } \n");
172 source.append("} \n");
173 }
174
175 /** @brief Simplistic matrix transpose function */
176 template<typename StringT>
generate_fft_transpose(StringT & source,std::string const & numeric_string)177 void generate_fft_transpose(StringT & source, std::string const & numeric_string)
178 {
179 source.append("__kernel void transpose(__global "); source.append(numeric_string); source.append("2 *input, \n");
180 source.append(" __global "); source.append(numeric_string); source.append("2 *output, \n");
181 source.append(" unsigned int row_num, \n");
182 source.append(" unsigned int col_num) { \n");
183 source.append(" unsigned int size = row_num * col_num; \n");
184 source.append(" for (unsigned int i = get_global_id(0); i < size; i+= get_global_size(0)) { \n");
185 source.append(" unsigned int row = i / col_num; \n");
186 source.append(" unsigned int col = i - row*col_num; \n");
187
188 source.append(" unsigned int new_pos = col * row_num + row; \n");
189
190 source.append(" output[new_pos] = input[i]; \n");
191 source.append(" } \n");
192 source.append("} \n");
193 }
194
195 /** @brief Simplistic inplace matrix transpose function */
196 template<typename StringT>
generate_fft_transpose_inplace(StringT & source,std::string const & numeric_string)197 void generate_fft_transpose_inplace(StringT & source, std::string const & numeric_string)
198 {
199 source.append("__kernel void transpose_inplace(__global "); source.append(numeric_string); source.append("2* input, \n");
200 source.append(" unsigned int row_num, \n");
201 source.append(" unsigned int col_num) { \n");
202 source.append(" unsigned int size = row_num * col_num; \n");
203 source.append(" for (unsigned int i = get_global_id(0); i < size; i+= get_global_size(0)) { \n");
204 source.append(" unsigned int row = i / col_num; \n");
205 source.append(" unsigned int col = i - row*col_num; \n");
206
207 source.append(" unsigned int new_pos = col * row_num + row; \n");
208
209 source.append(" if (i < new_pos) { \n");
210 source.append(" "); source.append(numeric_string); source.append("2 val = input[i]; \n");
211 source.append(" input[i] = input[new_pos]; \n");
212 source.append(" input[new_pos] = val; \n");
213 source.append(" } \n");
214 source.append(" } \n");
215 source.append("} \n");
216 }
217
218 /** @brief Computes the matrix vector product with a Vandermonde matrix */
219 template<typename StringT>
generate_fft_vandermonde_prod(StringT & source,std::string const & numeric_string)220 void generate_fft_vandermonde_prod(StringT & source, std::string const & numeric_string)
221 {
222 source.append("__kernel void vandermonde_prod(__global "); source.append(numeric_string); source.append(" *vander, \n");
223 source.append(" __global "); source.append(numeric_string); source.append(" *vector, \n");
224 source.append(" __global "); source.append(numeric_string); source.append(" *result, \n");
225 source.append(" uint size) { \n");
226 source.append(" for (uint i = get_global_id(0); i < size; i+= get_global_size(0)) { \n");
227 source.append(" "); source.append(numeric_string); source.append(" mul = vander[i]; \n");
228 source.append(" "); source.append(numeric_string); source.append(" pwr = 1; \n");
229 source.append(" "); source.append(numeric_string); source.append(" val = 0; \n");
230
231 source.append(" for (uint j = 0; j < size; j++) { \n");
232 source.append(" val = val + pwr * vector[j]; \n");
233 source.append(" pwr *= mul; \n");
234 source.append(" } \n");
235
236 source.append(" result[i] = val; \n");
237 source.append(" } \n");
238 source.append("} \n");
239 }
240
241 /** @brief Zero two complex vectors (to avoid kernel launch overhead) */
242 template<typename StringT>
generate_fft_zero2(StringT & source,std::string const & numeric_string)243 void generate_fft_zero2(StringT & source, std::string const & numeric_string)
244 {
245 source.append("__kernel void zero2(__global "); source.append(numeric_string); source.append("2 *input1, \n");
246 source.append(" __global "); source.append(numeric_string); source.append("2 *input2, \n");
247 source.append(" unsigned int size) { \n");
248 source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
249 source.append(" input1[i] = 0; \n");
250 source.append(" input2[i] = 0; \n");
251 source.append(" } \n");
252 source.append("} \n");
253 }
254
255 //////////////////////////// Part 2: Main kernel class ////////////////////////////////////
256
257 // main kernel class
258 /** @brief Main kernel class for generating OpenCL kernels for the fast Fourier transform. */
259 template<typename NumericT>
260 struct fft
261 {
program_nameviennacl::linalg::opencl::kernels::fft262 static std::string program_name()
263 {
264 return viennacl::ocl::type_to_string<NumericT>::apply() + "_fft";
265 }
266
initviennacl::linalg::opencl::kernels::fft267 static void init(viennacl::ocl::context & ctx)
268 {
269 static std::map<cl_context, bool> init_done;
270 if (!init_done[ctx.handle().get()])
271 {
272 viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
273 std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
274
275 std::string source;
276 source.reserve(8192);
277
278 viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
279
280 // unary operations
281 if (numeric_string == "float" || numeric_string == "double")
282 {
283 generate_fft_bluestein_post(source, numeric_string);
284 generate_fft_bluestein_pre(source, numeric_string);
285 generate_fft_complex_to_real(source, numeric_string);
286 generate_fft_div_vec_scalar(source, numeric_string);
287 generate_fft_mult_vec(source, numeric_string);
288 generate_fft_real_to_complex(source, numeric_string);
289 generate_fft_reverse_inplace(source, numeric_string);
290 generate_fft_transpose(source, numeric_string);
291 generate_fft_transpose_inplace(source, numeric_string);
292 generate_fft_vandermonde_prod(source, numeric_string);
293 generate_fft_zero2(source, numeric_string);
294 }
295
296 std::string prog_name = program_name();
297 #ifdef VIENNACL_BUILD_INFO
298 std::cout << "Creating program " << prog_name << std::endl;
299 #endif
300 ctx.add_program(source, prog_name);
301 init_done[ctx.handle().get()] = true;
302 } //if
303 } //init
304 };
305
306 } // namespace kernels
307 } // namespace opencl
308 } // namespace linalg
309 } // namespace viennacl
310 #endif
311
312