1 /* ************************************************************************
2 * Copyright 2016 Advanced Micro Devices, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 * ************************************************************************/
16
17
18 /*
19 This file contains the implementation of inplace transpose kernel string generation.
20 This includes both square and non square, twiddle and non twiddle, as well as the kernels
21 that swap lines following permutation algorithm.
22 */
#include <algorithm>
#include <vector>

#include "generator.transpose.h"
25
26 namespace clfft_transpose_generator
27 {
// generating string for calculating offset within square transpose kernels (genTransposeKernelBatched)
OffsetCalc(std::stringstream & transKernel,const FFTKernelGenKeyParams & params,bool input)29 void OffsetCalc(std::stringstream& transKernel, const FFTKernelGenKeyParams& params, bool input)
30 {
31 const size_t *stride = input ? params.fft_inStride : params.fft_outStride;
32 std::string offset = input ? "iOffset" : "oOffset";
33
34
35 clKernWrite(transKernel, 3) << "size_t " << offset << " = 0;" << std::endl;
36 clKernWrite(transKernel, 3) << "g_index = get_group_id(0);" << std::endl;
37
38 for (size_t i = params.fft_DataDim - 2; i > 0; i--)
39 {
40 clKernWrite(transKernel, 3) << offset << " += (g_index/numGroupsY_" << i << ")*" << stride[i + 1] << ";" << std::endl;
41 clKernWrite(transKernel, 3) << "g_index = g_index % numGroupsY_" << i << ";" << std::endl;
42 }
43
44 clKernWrite(transKernel, 3) << std::endl;
45 }
46
// generating string for calculating offset within square transpose kernels (genTransposeKernelLeadingDimensionBatched)
OffsetCalcLeadingDimensionBatched(std::stringstream & transKernel,const FFTKernelGenKeyParams & params)48 void OffsetCalcLeadingDimensionBatched(std::stringstream& transKernel, const FFTKernelGenKeyParams& params)
49 {
50 const size_t *stride = params.fft_inStride;
51 std::string offset = "iOffset";
52
53 clKernWrite(transKernel, 3) << "size_t " << offset << " = 0;" << std::endl;
54 clKernWrite(transKernel, 3) << "g_index = get_group_id(0);" << std::endl;
55
56 for (size_t i = params.fft_DataDim - 2; i > 0; i--)
57 {
58 clKernWrite(transKernel, 3) << offset << " += (g_index/numGroupsY_" << i << ")*" << stride[i + 1] << ";" << std::endl;
59 clKernWrite(transKernel, 3) << "g_index = g_index % numGroupsY_" << i << ";" << std::endl;
60 }
61
62 clKernWrite(transKernel, 3) << std::endl;
63 }
64
65 // generating string for calculating offset within swap kernels (genSwapKernel)
Swap_OffsetCalc(std::stringstream & transKernel,const FFTKernelGenKeyParams & params)66 void Swap_OffsetCalc(std::stringstream& transKernel, const FFTKernelGenKeyParams& params)
67 {
68 const size_t *stride = params.fft_inStride;
69 std::string offset = "iOffset";
70
71 clKernWrite(transKernel, 3) << "size_t " << offset << " = 0;" << std::endl;
72
73 for (size_t i = params.fft_DataDim - 2; i > 0; i--)
74 {
75 clKernWrite(transKernel, 3) << offset << " += (g_index/numGroupsY_" << i << ")*" << stride[i + 1] << ";" << std::endl;
76 clKernWrite(transKernel, 3) << "g_index = g_index % numGroupsY_" << i << ";" << std::endl;
77 }
78
79 clKernWrite(transKernel, 3) << std::endl;
80 }
81
// Small snippet of code that multiplies the twiddle factors into the butterflies. It is only emitted if the plan tells
// the generator that it wants the twiddle factors generated inside of the transpose
genTwiddleMath(const FFTKernelGenKeyParams & params,std::stringstream & transKernel,const std::string & dtComplex,bool fwd)84 clfftStatus genTwiddleMath(const FFTKernelGenKeyParams& params, std::stringstream& transKernel, const std::string& dtComplex, bool fwd)
85 {
86
87 clKernWrite(transKernel, 9) << std::endl;
88
89 clKernWrite(transKernel, 9) << dtComplex << " Wm = TW3step( (t_gx_p*32 + lidx) * (t_gy_p*32 + lidy + loop*8) );" << std::endl;
90 clKernWrite(transKernel, 9) << dtComplex << " Wt = TW3step( (t_gy_p*32 + lidx) * (t_gx_p*32 + lidy + loop*8) );" << std::endl;
91
92 clKernWrite(transKernel, 9) << dtComplex << " Tm, Tt;" << std::endl;
93
94 if (fwd)
95 {
96 clKernWrite(transKernel, 9) << "Tm.x = ( Wm.x * tmpm.x ) - ( Wm.y * tmpm.y );" << std::endl;
97 clKernWrite(transKernel, 9) << "Tm.y = ( Wm.y * tmpm.x ) + ( Wm.x * tmpm.y );" << std::endl;
98 clKernWrite(transKernel, 9) << "Tt.x = ( Wt.x * tmpt.x ) - ( Wt.y * tmpt.y );" << std::endl;
99 clKernWrite(transKernel, 9) << "Tt.y = ( Wt.y * tmpt.x ) + ( Wt.x * tmpt.y );" << std::endl;
100 }
101 else
102 {
103 clKernWrite(transKernel, 9) << "Tm.x = ( Wm.x * tmpm.x ) + ( Wm.y * tmpm.y );" << std::endl;
104 clKernWrite(transKernel, 9) << "Tm.y = -( Wm.y * tmpm.x ) + ( Wm.x * tmpm.y );" << std::endl;
105 clKernWrite(transKernel, 9) << "Tt.x = ( Wt.x * tmpt.x ) + ( Wt.y * tmpt.y );" << std::endl;
106 clKernWrite(transKernel, 9) << "Tt.y = -( Wt.y * tmpt.x ) + ( Wt.x * tmpt.y );" << std::endl;
107 }
108
109 clKernWrite(transKernel, 9) << "tmpm.x = Tm.x;" << std::endl;
110 clKernWrite(transKernel, 9) << "tmpm.y = Tm.y;" << std::endl;
111 clKernWrite(transKernel, 9) << "tmpt.x = Tt.x;" << std::endl;
112 clKernWrite(transKernel, 9) << "tmpt.y = Tt.y;" << std::endl;
113
114 clKernWrite(transKernel, 9) << std::endl;
115
116 return CLFFT_SUCCESS;
117 }
118
// Small snippet of code that multiplies the twiddle factors into the butterflies. It is only emitted if the plan tells
// the generator that it wants the twiddle factors generated inside of the transpose
genTwiddleMathLeadingDimensionBatched(const FFTKernelGenKeyParams & params,std::stringstream & transKernel,const std::string & dtComplex,bool fwd)121 clfftStatus genTwiddleMathLeadingDimensionBatched(const FFTKernelGenKeyParams& params, std::stringstream& transKernel, const std::string& dtComplex, bool fwd)
122 {
123
124 clKernWrite(transKernel, 9) << std::endl;
125 if (params.fft_N[0] > params.fft_N[1])
126 {
127 clKernWrite(transKernel, 9) << dtComplex << " Wm = TW3step( (" << params.fft_N[1] << " * square_matrix_index + t_gx_p*32 + lidx) * (t_gy_p*32 + lidy + loop*8) );" << std::endl;
128 clKernWrite(transKernel, 9) << dtComplex << " Wt = TW3step( (" << params.fft_N[1] << " * square_matrix_index + t_gy_p*32 + lidx) * (t_gx_p*32 + lidy + loop*8) );" << std::endl;
129 }
130 else
131 {
132 clKernWrite(transKernel, 9) << dtComplex << " Wm = TW3step( (t_gx_p*32 + lidx) * (" << params.fft_N[0] << " * square_matrix_index + t_gy_p*32 + lidy + loop*8) );" << std::endl;
133 clKernWrite(transKernel, 9) << dtComplex << " Wt = TW3step( (t_gy_p*32 + lidx) * (" << params.fft_N[0] << " * square_matrix_index + t_gx_p*32 + lidy + loop*8) );" << std::endl;
134 }
135 clKernWrite(transKernel, 9) << dtComplex << " Tm, Tt;" << std::endl;
136
137 if (fwd)
138 {
139 clKernWrite(transKernel, 9) << "Tm.x = ( Wm.x * tmpm.x ) - ( Wm.y * tmpm.y );" << std::endl;
140 clKernWrite(transKernel, 9) << "Tm.y = ( Wm.y * tmpm.x ) + ( Wm.x * tmpm.y );" << std::endl;
141 clKernWrite(transKernel, 9) << "Tt.x = ( Wt.x * tmpt.x ) - ( Wt.y * tmpt.y );" << std::endl;
142 clKernWrite(transKernel, 9) << "Tt.y = ( Wt.y * tmpt.x ) + ( Wt.x * tmpt.y );" << std::endl;
143 }
144 else
145 {
146 clKernWrite(transKernel, 9) << "Tm.x = ( Wm.x * tmpm.x ) + ( Wm.y * tmpm.y );" << std::endl;
147 clKernWrite(transKernel, 9) << "Tm.y = -( Wm.y * tmpm.x ) + ( Wm.x * tmpm.y );" << std::endl;
148 clKernWrite(transKernel, 9) << "Tt.x = ( Wt.x * tmpt.x ) + ( Wt.y * tmpt.y );" << std::endl;
149 clKernWrite(transKernel, 9) << "Tt.y = -( Wt.y * tmpt.x ) + ( Wt.x * tmpt.y );" << std::endl;
150 }
151
152 clKernWrite(transKernel, 9) << "tmpm.x = Tm.x;" << std::endl;
153 clKernWrite(transKernel, 9) << "tmpm.y = Tm.y;" << std::endl;
154 clKernWrite(transKernel, 9) << "tmpt.x = Tt.x;" << std::endl;
155 clKernWrite(transKernel, 9) << "tmpt.y = Tt.y;" << std::endl;
156
157 clKernWrite(transKernel, 9) << std::endl;
158
159 return CLFFT_SUCCESS;
160 }
161
// Emits the OpenCL kernel prototype (work-group attribute, kernel name, and
// full argument list) for the square transpose kernels, and reports back the
// OpenCL element types chosen for the buffers.
//   params      - plan signature; selects buffer layouts, placeness, callbacks
//   lwSize      - work-group size baked into reqd_work_group_size
//   dtPlanar    - fundamental scalar type ("float"/"double")
//   dtComplex   - fundamental complex type ("float2"/"double2")
//   funcName    - name of the kernel being generated
//   transKernel - stream receiving the generated source
//   dtInput     - out: element type of the input buffer(s)
//   dtOutput    - out: element type of the output buffer(s)
// Returns CLFFT_TRANSPOSED_NOTIMPLEMENTED for hermitian or unknown layouts.
clfftStatus genTransposePrototype(const FFTGeneratedTransposeSquareAction::Signature & params, const size_t& lwSize, const std::string& dtPlanar, const std::string& dtComplex,
    const std::string &funcName, std::stringstream& transKernel, std::string& dtInput, std::string& dtOutput)
{

    // Declare and define the function
    clKernWrite(transKernel, 0) << "__attribute__(( reqd_work_group_size( " << lwSize << ", 1, 1 ) ))" << std::endl;
    clKernWrite(transKernel, 0) << "kernel void" << std::endl;

    clKernWrite(transKernel, 0) << funcName << "( ";

    // Input buffer argument(s): interleaved and real use a single pointer,
    // planar uses separate real/imaginary pointers.
    switch (params.fft_inputLayout)
    {
    case CLFFT_COMPLEX_INTERLEAVED:
        dtInput = dtComplex;
        dtOutput = dtComplex;
        clKernWrite(transKernel, 0) << "global " << dtInput << "* restrict inputA";
        break;
    case CLFFT_COMPLEX_PLANAR:
        dtInput = dtPlanar;
        dtOutput = dtPlanar;
        clKernWrite(transKernel, 0) << "global " << dtInput << "* restrict inputA_R" << ", global " << dtInput << "* restrict inputA_I";
        break;
    case CLFFT_HERMITIAN_INTERLEAVED:
    case CLFFT_HERMITIAN_PLANAR:
        return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
    case CLFFT_REAL:
        dtInput = dtPlanar;
        dtOutput = dtPlanar;

        clKernWrite(transKernel, 0) << "global " << dtInput << "* restrict inputA";
        break;
    default:
        return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
    }

    // Output buffer argument(s) only exist for out-of-place transposes.
    // NOTE(review): dtInput is also reassigned in this switch; it maps to the
    // same type in every reachable case, so it looks harmless — confirm intent.
    if (params.fft_placeness == CLFFT_OUTOFPLACE)
        switch (params.fft_outputLayout)
        {
        case CLFFT_COMPLEX_INTERLEAVED:
            dtInput = dtComplex;
            dtOutput = dtComplex;
            clKernWrite(transKernel, 0) << ", global " << dtOutput << "* restrict outputA";
            break;
        case CLFFT_COMPLEX_PLANAR:
            dtInput = dtPlanar;
            dtOutput = dtPlanar;
            clKernWrite(transKernel, 0) << ", global " << dtOutput << "* restrict outputA_R" << ", global " << dtOutput << "* restrict outputA_I";
            break;
        case CLFFT_HERMITIAN_INTERLEAVED:
        case CLFFT_HERMITIAN_PLANAR:
            return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
        case CLFFT_REAL:
            dtInput = dtPlanar;
            dtOutput = dtPlanar;
            clKernWrite(transKernel, 0) << ", global " << dtOutput << "* restrict outputA";
            break;
        default:
            return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
        }

    // Pre- and post-callbacks are mutually exclusive; each adds a userdata
    // pointer and, when the callback requested LDS, a local-memory pointer.
    if (params.fft_hasPreCallback)
    {
        assert(!params.fft_hasPostCallback);

        if (params.fft_preCallback.localMemSize > 0)
        {
            clKernWrite(transKernel, 0) << ", __global void* pre_userdata, __local void* localmem";
        }
        else
        {
            clKernWrite(transKernel, 0) << ", __global void* pre_userdata";
        }
    }
    if (params.fft_hasPostCallback)
    {
        assert(!params.fft_hasPreCallback);

        if (params.fft_postCallback.localMemSize > 0)
        {
            clKernWrite(transKernel, 0) << ", __global void* post_userdata, __local void* localmem";
        }
        else
        {
            clKernWrite(transKernel, 0) << ", __global void* post_userdata";
        }
    }

    // Close the method signature
    clKernWrite(transKernel, 0) << " )\n{" << std::endl;
    return CLFFT_SUCCESS;
}
253
// Emits the OpenCL kernel prototype for the leading-dimension-batched
// (non-square) transpose kernels. Unlike genTransposePrototype there is no
// out-of-place output argument list: only the input buffer(s) plus optional
// callback arguments are emitted.
//   params      - plan signature; selects buffer layout and callbacks
//   lwSize      - work-group size baked into reqd_work_group_size
//   dtPlanar    - fundamental scalar type ("float"/"double")
//   dtComplex   - fundamental complex type ("float2"/"double2")
//   funcName    - name of the kernel being generated
//   transKernel - stream receiving the generated source
//   dtInput     - out: element type of the input buffer(s)
//   dtOutput    - out: element type of the output buffer(s)
// Returns CLFFT_TRANSPOSED_NOTIMPLEMENTED for hermitian or unknown layouts.
clfftStatus genTransposePrototypeLeadingDimensionBatched(const FFTGeneratedTransposeNonSquareAction::Signature & params, const size_t& lwSize,
    const std::string& dtPlanar, const std::string& dtComplex,
    const std::string &funcName, std::stringstream& transKernel,
    std::string& dtInput, std::string& dtOutput)
{

    // Declare and define the function
    clKernWrite(transKernel, 0) << "__attribute__(( reqd_work_group_size( " << lwSize << ", 1, 1 ) ))" << std::endl;
    clKernWrite(transKernel, 0) << "kernel void" << std::endl;

    clKernWrite(transKernel, 0) << funcName << "( ";

    // Input buffer argument(s): interleaved and real use a single pointer,
    // planar uses separate real/imaginary pointers.
    switch (params.fft_inputLayout)
    {
    case CLFFT_COMPLEX_INTERLEAVED:
        dtInput = dtComplex;
        dtOutput = dtComplex;
        clKernWrite(transKernel, 0) << "global " << dtInput << "* restrict inputA";
        break;
    case CLFFT_COMPLEX_PLANAR:
        dtInput = dtPlanar;
        dtOutput = dtPlanar;
        clKernWrite(transKernel, 0) << "global " << dtInput << "* restrict inputA_R" << ", global " << dtInput << "* restrict inputA_I";
        break;
    case CLFFT_HERMITIAN_INTERLEAVED:
    case CLFFT_HERMITIAN_PLANAR:
        return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
    case CLFFT_REAL:
        dtInput = dtPlanar;
        dtOutput = dtPlanar;

        clKernWrite(transKernel, 0) << "global " << dtInput << "* restrict inputA";
        break;
    default:
        return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
    }

    // Pre- and post-callbacks are mutually exclusive; each adds a userdata
    // pointer and, when the callback requested LDS, a local-memory pointer.
    if (params.fft_hasPreCallback)
    {
        assert(!params.fft_hasPostCallback);
        if (params.fft_preCallback.localMemSize > 0)
        {
            clKernWrite(transKernel, 0) << ", __global void* pre_userdata, __local void* localmem";
        }
        else
        {
            clKernWrite(transKernel, 0) << ", __global void* pre_userdata";
        }
    }
    if (params.fft_hasPostCallback)
    {
        assert(!params.fft_hasPreCallback);

        if (params.fft_postCallback.localMemSize > 0)
        {
            clKernWrite(transKernel, 0) << ", __global void* post_userdata, __local void* localmem";
        }
        else
        {
            clKernWrite(transKernel, 0) << ", __global void* post_userdata";
        }
    }


    // Close the method signature
    clKernWrite(transKernel, 0) << " )\n{" << std::endl;
    return CLFFT_SUCCESS;
}
322
323 /* -> get_cycles function gets the swapping logic required for given row x col matrix.
324 -> cycle_map[0] holds the total number of cycles required.
325 -> cycles start and end with the same index, hence we can identify individual cycles,
326 though we tend to store the cycle index contiguously*/
// Computes the swap cycles for an in-place transpose of a num_reduced_row x
// num_reduced_col block matrix, following p(k) = (row*k) mod (row*col - 1).
//   cycle_map[0]  - out: total number of cycles found
//   cycle_map[1..]- out: cycle indices stored contiguously; each cycle starts
//                   and ends with the same index so cycles can be delimited.
// The caller must provide at least 2 * num_reduced_row * num_reduced_col
// elements in cycle_map (the cycles cannot exceed that by design).
// Fixes vs. previous version: std::vector replaces raw new[]/delete[] (no leak
// on exceptions), and the loop index is size_t (no signed/unsigned mismatch).
void get_cycles(size_t *cycle_map, size_t num_reduced_row, size_t num_reduced_col)
{
    const size_t total = num_reduced_row * num_reduced_col;
    std::vector<int> is_swapped(total, 0);
    size_t map_index = 1;
    size_t num_cycles = 0;

    // Elements 0 and total-1 are fixed points of the permutation; mark them
    // visited so they never start a cycle.
    is_swapped[0] = 1;
    is_swapped[total - 1] = 1;

    for (size_t i = 1; i < total - 1; i++)
    {
        size_t swap_id = i;
        // Follow the permutation from i until the cycle closes back on i.
        while (!is_swapped[swap_id])
        {
            is_swapped[swap_id] = 1;
            cycle_map[map_index++] = swap_id;
            swap_id = (num_reduced_row * swap_id) % (total - 1);
            if (swap_id == i)
            {
                // Close the cycle by repeating its starting index.
                cycle_map[map_index++] = swap_id;
                num_cycles++;
            }
        }
    }
    cycle_map[0] = num_cycles;
}
358
/*
calculate the permutation cycles consumed in swap kernels.
each cycle is stored in a vector. Hopefully there are multiple independent cycles, so we use a vector of vectors.
*/
// Computes the in-place transpose permutation cycles for a row-major n x m
// matrix (see https://en.wikipedia.org/wiki/In-place_matrix_transposition):
//   p(k) = (k*n) mod (m*n - 1), for 0 < k < m*n - 1
// Elements 0 and m*n-1 never move and are excluded. Each discovered cycle is
// appended to permutationVec as its own vector of indices.
//   m, n           - matrix dimensions; no-op if either is 0
//   permutationVec - out: appended with one vector per independent cycle
// Fixes vs. previous version: std::vector replaces raw new[]/delete[] (no leak
// on exceptions) and the membership test uses qualified std::find.
void permutation_calculation(size_t m, size_t n, std::vector<std::vector<size_t> > &permutationVec)
{
    if (m < 1 || n < 1)
        return;

    size_t mn_minus_one = m*n - 1;
    // Visitation table so the "already in some cycle" check is O(1).
    std::vector<size_t> table(mn_minus_one + 1, 0);
    table[0] = 1; // index 0 is a fixed point

    for (size_t i = 1; i < mn_minus_one; i++)
    {
        // Skip indices already absorbed into a previously found cycle.
        if (table[i] >= 1)
            continue;

        std::vector<size_t> cycle;
        cycle.push_back(i);
        table[i] += 1;

        size_t temp = i;
        bool merged_into_existing = false;
        while (true)
        {
            temp = (temp * n) % mn_minus_one;
            if (std::find(cycle.begin(), cycle.end(), temp) != cycle.end())
            {
                // The cycle closed back on itself, as it must.
                break;
            }
            if (table[temp] >= 1)
            {
                // Landed in a previously recorded cycle; discard this walk.
                merged_into_existing = true;
                break;
            }
            cycle.push_back(temp);
            table[temp] += 1;
        }
        if (merged_into_existing)
            continue;
        permutationVec.push_back(cycle);
    }
}
//swap lines. These kernels are used in combination with square transpose kernels to perform a nonsquare transpose
//this function assumes a 1:2 ratio
genSwapKernel(const FFTGeneratedTransposeNonSquareAction::Signature & params,std::string & strKernel,std::string & KernelFuncName,const size_t & lwSize,const size_t reShapeFactor)423 clfftStatus genSwapKernel(const FFTGeneratedTransposeNonSquareAction::Signature & params, std::string& strKernel, std::string& KernelFuncName, const size_t& lwSize, const size_t reShapeFactor)
424 {
425 strKernel.reserve(4096);
426 std::stringstream transKernel(std::stringstream::out);
427
428 // These strings represent the various data types we read or write in the kernel, depending on how the plan
429 // is configured
430 std::string dtInput; // The type read as input into kernel
431 std::string dtOutput; // The type written as output from kernel
432 std::string dtPlanar; // Fundamental type for planar arrays
433 std::string tmpBuffType;
434 std::string dtComplex; // Fundamental type for complex arrays
435
436 // NOTE: Enable only for debug
437 // clKernWrite( transKernel, 0 ) << "#pragma OPENCL EXTENSION cl_amd_printf : enable\n" << std::endl;
438
439 //if (params.fft_inputLayout != params.fft_outputLayout)
440 // return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
441
442 switch (params.fft_precision)
443 {
444 case CLFFT_SINGLE:
445 case CLFFT_SINGLE_FAST:
446 dtPlanar = "float";
447 dtComplex = "float2";
448 break;
449 case CLFFT_DOUBLE:
450 case CLFFT_DOUBLE_FAST:
451 dtPlanar = "double";
452 dtComplex = "double2";
453
454 // Emit code that enables double precision in the kernel
455 clKernWrite(transKernel, 0) << "#ifdef cl_khr_fp64" << std::endl;
456 clKernWrite(transKernel, 3) << "#pragma OPENCL EXTENSION cl_khr_fp64 : enable" << std::endl;
457 clKernWrite(transKernel, 0) << "#else" << std::endl;
458 clKernWrite(transKernel, 3) << "#pragma OPENCL EXTENSION cl_amd_fp64 : enable" << std::endl;
459 clKernWrite(transKernel, 0) << "#endif\n" << std::endl;
460
461 break;
462 default:
463 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
464 break;
465 }
466
467 // This detects whether the input matrix is rectangle of ratio 1:2
468
469 if ((params.fft_N[0] != 2 * params.fft_N[1]) && (params.fft_N[1] != 2 * params.fft_N[0]))
470 {
471 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
472 }
473
474 if (params.fft_placeness == CLFFT_OUTOFPLACE)
475 {
476 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
477 }
478
479 size_t smaller_dim = (params.fft_N[0] < params.fft_N[1]) ? params.fft_N[0] : params.fft_N[1];
480
481 size_t input_elm_size_in_bytes;
482 switch (params.fft_precision)
483 {
484 case CLFFT_SINGLE:
485 case CLFFT_SINGLE_FAST:
486 input_elm_size_in_bytes = 4;
487 break;
488 case CLFFT_DOUBLE:
489 case CLFFT_DOUBLE_FAST:
490 input_elm_size_in_bytes = 8;
491 break;
492 default:
493 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
494 }
495
496 switch (params.fft_outputLayout)
497 {
498 case CLFFT_COMPLEX_INTERLEAVED:
499 case CLFFT_COMPLEX_PLANAR:
500 input_elm_size_in_bytes *= 2;
501 break;
502 case CLFFT_REAL:
503 break;
504 default:
505 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
506 }
507 size_t max_elements_loaded = AVAIL_MEM_SIZE / input_elm_size_in_bytes;
508 size_t num_elements_loaded;
509 size_t local_work_size_swap, num_grps_pro_row;
510
511 tmpBuffType = "__local";
512 if ((max_elements_loaded >> 1) > smaller_dim)
513 {
514 local_work_size_swap = (smaller_dim < 256) ? smaller_dim : 256;
515 num_elements_loaded = smaller_dim;
516 num_grps_pro_row = 1;
517 }
518 else
519 {
520 num_grps_pro_row = (smaller_dim << 1) / max_elements_loaded;
521 num_elements_loaded = max_elements_loaded >> 1;
522 local_work_size_swap = (num_elements_loaded < 256) ? num_elements_loaded : 256;
523 }
524
525 //If post-callback is set for the plan
526 if (params.fft_hasPostCallback)
527 {
528 //Requested local memory size by callback must not exceed the device LDS limits after factoring the LDS size required by swap kernel
529 if (params.fft_postCallback.localMemSize > 0)
530 {
531 bool validLDSSize = false;
532
533 validLDSSize = ((2 * input_elm_size_in_bytes * (num_elements_loaded * 2)) + params.fft_postCallback.localMemSize) < params.limit_LocalMemSize;
534
535 if (!validLDSSize)
536 {
537 fprintf(stderr, "Requested local memory size not available\n");
538 return CLFFT_INVALID_ARG_VALUE;
539 }
540 }
541
542 //Insert callback function code at the beginning
543 clKernWrite(transKernel, 0) << params.fft_postCallback.funcstring << std::endl;
544 clKernWrite(transKernel, 0) << std::endl;
545 }
546 //If pre-callback is set for the plan
547 if (params.fft_hasPreCallback)
548 {
549 //we have already checked available LDS for pre callback
550 //Insert callback function code at the beginning
551 clKernWrite(transKernel, 0) << params.fft_preCallback.funcstring << std::endl;
552 clKernWrite(transKernel, 0) << std::endl;
553 }
554
555 /*Generating the swapping logic*/
556 {
557 size_t num_reduced_row;
558 size_t num_reduced_col;
559
560 if (params.fft_N[1] == smaller_dim)
561 {
562 num_reduced_row = smaller_dim;
563 num_reduced_col = 2;
564 }
565 else
566 {
567 num_reduced_row = 2;
568 num_reduced_col = smaller_dim;
569 }
570
571 std::string funcName;
572
573 clKernWrite(transKernel, 0) << std::endl;
574
575 size_t *cycle_map = new size_t[num_reduced_row * num_reduced_col * 2];
576 /* The memory required by cycle_map cannot exceed 2 times row*col by design*/
577
578 get_cycles(cycle_map, num_reduced_row, num_reduced_col);
579
580 size_t *cycle_stat = new size_t[cycle_map[0] * 2], stat_idx = 0;
581 clKernWrite(transKernel, 0) << std::endl;
582
583 clKernWrite(transKernel, 0) << "__constant size_t swap_table[][3] = {" << std::endl;
584
585 size_t inx = 0, start_inx, swap_inx = 0, num_swaps = 0;
586 for (size_t i = 0; i < cycle_map[0]; i++)
587 {
588 start_inx = cycle_map[++inx];
589 clKernWrite(transKernel, 0) << "{ " << start_inx << ", " << cycle_map[inx + 1] << ", 0}," << std::endl;
590 cycle_stat[stat_idx++] = num_swaps;
591 num_swaps++;
592
593 while (start_inx != cycle_map[++inx])
594 {
595 size_t action_var = (cycle_map[inx + 1] == start_inx) ? 2 : 1;
596 clKernWrite(transKernel, 0) << "{ " << cycle_map[inx] << ", " << cycle_map[inx + 1] << ", " << action_var << "}," << std::endl;
597 if (action_var == 2)
598 cycle_stat[stat_idx++] = num_swaps;
599 num_swaps++;
600 }
601 }
602 /*Appending swap table for touching corner elements for post call back*/
603 size_t last_datablk_idx = num_reduced_row * num_reduced_col - 1;
604 clKernWrite(transKernel, 0) << "{ 0, 0, 0}," << std::endl;
605 clKernWrite(transKernel, 0) << "{ " << last_datablk_idx << ", " << last_datablk_idx << ", 0}," << std::endl;
606
607 clKernWrite(transKernel, 0) << "};" << std::endl;
608 /*cycle_map[0] + 2, + 2 is added for post callback table appending*/
609 size_t num_cycles_minus_1 = cycle_map[0] - 1;
610
611 clKernWrite(transKernel, 0) << "__constant size_t cycle_stat[" << cycle_map[0] << "][2] = {" << std::endl;
612 for (size_t i = 0; i < num_cycles_minus_1; i++)
613 {
614 clKernWrite(transKernel, 0) << "{ " << cycle_stat[i * 2] << ", " << cycle_stat[i * 2 + 1] << "}," << std::endl;
615 }
616 clKernWrite(transKernel, 0) << "{ " << cycle_stat[num_cycles_minus_1 * 2] << ", " << (cycle_stat[num_cycles_minus_1 * 2 + 1] + 2) << "}," << std::endl;
617
618 clKernWrite(transKernel, 0) << "};" << std::endl;
619
620 clKernWrite(transKernel, 0) << std::endl;
621
622 switch (params.fft_inputLayout)
623 {
624 case CLFFT_COMPLEX_INTERLEAVED:
625 clKernWrite(transKernel, 0) << "void swap(global " << dtComplex << "* inputA, " << tmpBuffType << " " << dtComplex << "* Ls, " << tmpBuffType << " " << dtComplex << " * Ld, size_t is, size_t id, size_t pos, size_t end_indx, size_t work_id";
626 break;
627 case CLFFT_COMPLEX_PLANAR:
628 clKernWrite(transKernel, 0) << "void swap(global " << dtPlanar << "* inputA_R, global " << dtPlanar << "* inputA_I, " << tmpBuffType << " " << dtComplex << "* Ls, " << tmpBuffType << " " << dtComplex << "* Ld, size_t is, size_t id, size_t pos, size_t end_indx, size_t work_id";
629 break;
630 case CLFFT_HERMITIAN_INTERLEAVED:
631 case CLFFT_HERMITIAN_PLANAR:
632 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
633 case CLFFT_REAL:
634 clKernWrite(transKernel, 0) << "void swap(global " << dtPlanar << "* inputA, " << tmpBuffType << " " << dtPlanar << "* Ls, " << tmpBuffType << " " << dtPlanar << "* Ld, size_t is, size_t id, size_t pos, size_t end_indx, size_t work_id";
635 break;
636 default:
637 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
638 }
639
640 if (params.fft_hasPostCallback)
641 {
642 clKernWrite(transKernel, 0) << ", size_t iOffset, __global void* post_userdata";
643 if (params.fft_postCallback.localMemSize > 0)
644 {
645 clKernWrite(transKernel, 0) << ", __local void* localmem";
646 }
647 }
648
649 if (params.fft_hasPreCallback)
650 {
651 clKernWrite(transKernel, 0) << ", size_t iOffset, __global void* pre_userdata";
652 if (params.fft_preCallback.localMemSize > 0)
653 {
654 clKernWrite(transKernel, 0) << ", __local void* localmem";
655 }
656 }
657
658 clKernWrite(transKernel, 0) << "){" << std::endl;
659
660 clKernWrite(transKernel, 3) << "for (size_t j = get_local_id(0); j < end_indx; j += " << local_work_size_swap << "){" << std::endl;
661
662 switch (params.fft_inputLayout)
663 {
664 case CLFFT_REAL:
665 case CLFFT_COMPLEX_INTERLEAVED:
666
667 if (params.fft_hasPreCallback)
668 {
669 clKernWrite(transKernel, 6) << "if (pos == 0){" << std::endl;
670
671 clKernWrite(transKernel, 9) << "Ls[j] = " << params.fft_preCallback.funcname << "(inputA, ( is *" << smaller_dim << " + " << num_elements_loaded << " * work_id + j + iOffset), pre_userdata";
672 //clKernWrite(transKernel, 9) << "Ls[j] = " << params.fft_preCallback.funcname << "(inputA + iOffset, ( is *" << smaller_dim << " + " << num_elements_loaded << " * work_id + j), pre_userdata";
673 if (params.fft_preCallback.localMemSize > 0)
674 {
675 clKernWrite(transKernel, 0) << ", localmem";
676 }
677 clKernWrite(transKernel, 0) << ");" << std::endl;
678
679 clKernWrite(transKernel, 9) << "Ld[j] = " << params.fft_preCallback.funcname << "(inputA, ( id *" << smaller_dim << " + " << num_elements_loaded << " * work_id + j + iOffset), pre_userdata";
680 //clKernWrite(transKernel, 9) << "Ld[j] = " << params.fft_preCallback.funcname << "(inputA + iOffset, ( id *" << smaller_dim << " + " << num_elements_loaded << " * work_id + j), pre_userdata";
681 if (params.fft_preCallback.localMemSize > 0)
682 {
683 clKernWrite(transKernel, 0) << ", localmem";
684 }
685 clKernWrite(transKernel, 0) << ");" << std::endl;
686 clKernWrite(transKernel, 6) << "}" << std::endl;
687
688 clKernWrite(transKernel, 6) << "else if (pos == 1){" << std::endl;
689 clKernWrite(transKernel, 9) << "Ld[j] = " << params.fft_preCallback.funcname << "(inputA, ( id *" << smaller_dim << " + " << num_elements_loaded << " * work_id + j + iOffset), pre_userdata";
690 //clKernWrite(transKernel, 9) << "Ld[j] = " << params.fft_preCallback.funcname << "(inputA + iOffset, ( id *" << smaller_dim << " + " << num_elements_loaded << " * work_id + j), pre_userdata";
691 if (params.fft_preCallback.localMemSize > 0)
692 {
693 clKernWrite(transKernel, 0) << ", localmem";
694 }
695 clKernWrite(transKernel, 0) << ");" << std::endl;
696 clKernWrite(transKernel, 6) << "}" << std::endl;
697 }
698 else
699 {
700 clKernWrite(transKernel, 6) << "if (pos == 0){" << std::endl;
701 clKernWrite(transKernel, 9) << "Ls[j] = inputA[is *" << smaller_dim << " + " << num_elements_loaded << " * work_id + j];" << std::endl;
702 clKernWrite(transKernel, 9) << "Ld[j] = inputA[id *" << smaller_dim << " + " << num_elements_loaded << " * work_id + j];" << std::endl;
703 clKernWrite(transKernel, 6) << "}" << std::endl;
704
705 clKernWrite(transKernel, 6) << "else if (pos == 1){" << std::endl;
706 clKernWrite(transKernel, 9) << "Ld[j] = inputA[id *" << smaller_dim << " + " << num_elements_loaded << " * work_id + j];" << std::endl;
707 clKernWrite(transKernel, 6) << "}" << std::endl;
708 }
709
710 if (params.fft_hasPostCallback)
711 {
712 clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(inputA, (iOffset + id*" << smaller_dim << " + " << num_elements_loaded << " * work_id + j), post_userdata, Ls[j]";
713 if (params.fft_postCallback.localMemSize > 0)
714 {
715 clKernWrite(transKernel, 0) << ", localmem";
716 }
717 clKernWrite(transKernel, 0) << ");" << std::endl;
718 }
719 else if (params.fft_hasPreCallback)
720 {
721 clKernWrite(transKernel, 6) << "inputA[id*" << smaller_dim << " + " << num_elements_loaded << " * work_id + j + iOffset] = Ls[j];" << std::endl;
722 }
723 else
724 {
725 clKernWrite(transKernel, 6) << "inputA[id*" << smaller_dim << " + " << num_elements_loaded << " * work_id + j] = Ls[j];" << std::endl;
726 }
727 break;
728 case CLFFT_HERMITIAN_INTERLEAVED:
729 case CLFFT_HERMITIAN_PLANAR:
730 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
731 case CLFFT_COMPLEX_PLANAR:
732 if (params.fft_hasPreCallback)
733 {
734 clKernWrite(transKernel, 6) << "if (pos == 0){" << std::endl;
735 clKernWrite(transKernel, 9) << "Ls[j] = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, (is * " << smaller_dim << " + " << num_elements_loaded << " * work_id + j + iOffset), pre_userdata";
736 if (params.fft_preCallback.localMemSize > 0)
737 {
738 clKernWrite(transKernel, 0) << ", localmem";
739 }
740 clKernWrite(transKernel, 0) << ");" << std::endl;
741
742 clKernWrite(transKernel, 9) << "Ld[j] = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, (id * " << smaller_dim << " + " << num_elements_loaded << " * work_id + j + iOffset), pre_userdata";
743 if (params.fft_preCallback.localMemSize > 0)
744 {
745 clKernWrite(transKernel, 0) << ", localmem";
746 }
747 clKernWrite(transKernel, 0) << ");" << std::endl;
748
749 clKernWrite(transKernel, 6) << "}" << std::endl;
750
751 clKernWrite(transKernel, 6) << "else if (pos == 1){" << std::endl;
752
753 clKernWrite(transKernel, 9) << "Ld[j] = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, (id * " << smaller_dim << " + " << num_elements_loaded << " * work_id + j + iOffset), pre_userdata";
754 if (params.fft_preCallback.localMemSize > 0)
755 {
756 clKernWrite(transKernel, 0) << ", localmem";
757 }
758 clKernWrite(transKernel, 0) << ");" << std::endl;
759
760 clKernWrite(transKernel, 6) << "}" << std::endl;
761 }
762 else
763 {
764 clKernWrite(transKernel, 6) << "if (pos == 0){" << std::endl;
765 clKernWrite(transKernel, 9) << "Ls[j].x = inputA_R[is*" << smaller_dim << " + " << num_elements_loaded << " * work_id + j];" << std::endl;
766 clKernWrite(transKernel, 9) << "Ls[j].y = inputA_I[is*" << smaller_dim << " + " << num_elements_loaded << " * work_id + j];" << std::endl;
767 clKernWrite(transKernel, 9) << "Ld[j].x = inputA_R[id*" << smaller_dim << " + " << num_elements_loaded << " * work_id + j];" << std::endl;
768 clKernWrite(transKernel, 9) << "Ld[j].y = inputA_I[id*" << smaller_dim << " + " << num_elements_loaded << " * work_id + j];" << std::endl;
769 clKernWrite(transKernel, 6) << "}" << std::endl;
770
771 clKernWrite(transKernel, 6) << "else if (pos == 1){" << std::endl;
772 clKernWrite(transKernel, 9) << "Ld[j].x = inputA_R[id*" << smaller_dim << " + " << num_elements_loaded << " * work_id + j];" << std::endl;
773 clKernWrite(transKernel, 9) << "Ld[j].y = inputA_I[id*" << smaller_dim << " + " << num_elements_loaded << " * work_id + j];" << std::endl;
774 clKernWrite(transKernel, 6) << "}" << std::endl;
775 }
776 if (params.fft_hasPostCallback)
777 {
778 clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(inputA_R, inputA_I, (iOffset + id*" << smaller_dim << " + " << num_elements_loaded << " * work_id + j), post_userdata, Ls[j].x, Ls[j].y";
779 if (params.fft_postCallback.localMemSize > 0)
780 {
781 clKernWrite(transKernel, 0) << ", localmem";
782 }
783 clKernWrite(transKernel, 0) << ");" << std::endl;
784 }
785 else
786 {
787 clKernWrite(transKernel, 6) << "inputA_R[id*" << smaller_dim << " + " << num_elements_loaded << " * work_id + j] = Ls[j].x;" << std::endl;
788 clKernWrite(transKernel, 6) << "inputA_I[id*" << smaller_dim << " + " << num_elements_loaded << " * work_id + j] = Ls[j].y;" << std::endl;
789 }
790 break;
791 default:
792 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
793 }
794 clKernWrite(transKernel, 3) << "}" << std::endl;
795
796 clKernWrite(transKernel, 0) << "}" << std::endl << std::endl;
797
798 funcName = "swap_nonsquare";
799 KernelFuncName = funcName;
800 // Generate kernel API
801
802 /*when swap can be performed in LDS itself then, same prototype of transpose can be used for swap function too*/
803 genTransposePrototypeLeadingDimensionBatched(params, local_work_size_swap, dtPlanar, dtComplex, funcName, transKernel, dtInput, dtOutput);
804
805 clKernWrite(transKernel, 3) << "size_t g_index = get_group_id(0);" << std::endl;
806
807 clKernWrite(transKernel, 3) << "const size_t numGroupsY_1 = " << cycle_map[0] * num_grps_pro_row << " ;" << std::endl;
808 for (size_t i = 2; i < params.fft_DataDim - 1; i++)
809 {
810 clKernWrite(transKernel, 3) << "const size_t numGroupsY_" << i << " = numGroupsY_" << i - 1 << " * " << params.fft_N[i] << ";" << std::endl;
811 }
812
813 delete[] cycle_map;
814 delete[] cycle_stat;
815
816 Swap_OffsetCalc(transKernel, params);
817
818 // Handle planar and interleaved right here
819 switch (params.fft_inputLayout)
820 {
821 case CLFFT_COMPLEX_INTERLEAVED:
822 case CLFFT_REAL:
823
824 clKernWrite(transKernel, 3) << "__local " << dtInput << " tmp_tot_mem[" << (num_elements_loaded * 2) << "];" << std::endl;
825 clKernWrite(transKernel, 3) << tmpBuffType << " " << dtInput << " *te = tmp_tot_mem;" << std::endl;
826
827 clKernWrite(transKernel, 3) << tmpBuffType << " " << dtInput << " *to = (tmp_tot_mem + " << num_elements_loaded << ");" << std::endl;
828
829 //Do not advance offset when postcallback is set as the starting address of global buffer is needed
830 if (!params.fft_hasPostCallback && !params.fft_hasPreCallback)
831 clKernWrite(transKernel, 3) << "inputA += iOffset;" << std::endl; // Set A ptr to the start of each slice
832 break;
833 case CLFFT_COMPLEX_PLANAR:
834
835 clKernWrite(transKernel, 3) << "__local " << dtComplex << " tmp_tot_mem[" << (num_elements_loaded * 2) << "];" << std::endl;
836 clKernWrite(transKernel, 3) << tmpBuffType << " " << dtComplex << " *te = tmp_tot_mem;" << std::endl;
837
838 clKernWrite(transKernel, 3) << tmpBuffType << " " << dtComplex << " *to = (tmp_tot_mem + " << num_elements_loaded << ");" << std::endl;
839
840 //Do not advance offset when postcallback is set as the starting address of global buffer is needed
841 if (!params.fft_hasPostCallback)
842 {
843 clKernWrite(transKernel, 3) << "inputA_R += iOffset;" << std::endl; // Set A ptr to the start of each slice
844 clKernWrite(transKernel, 3) << "inputA_I += iOffset;" << std::endl; // Set A ptr to the start of each slice
845 }
846 break;
847 case CLFFT_HERMITIAN_INTERLEAVED:
848 case CLFFT_HERMITIAN_PLANAR:
849 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
850
851 default:
852 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
853 }
854
855 switch (params.fft_inputLayout)
856 {
857 case CLFFT_COMPLEX_INTERLEAVED:
858 case CLFFT_COMPLEX_PLANAR:
859 clKernWrite(transKernel, 3) << tmpBuffType << " " << dtComplex << " *tmp_swap_ptr[2];" << std::endl;
860 break;
861 case CLFFT_REAL:
862 clKernWrite(transKernel, 3) << tmpBuffType << " " << dtPlanar << " *tmp_swap_ptr[2];" << std::endl;
863 }
864 clKernWrite(transKernel, 3) << "tmp_swap_ptr[0] = te;" << std::endl;
865 clKernWrite(transKernel, 3) << "tmp_swap_ptr[1] = to;" << std::endl;
866
867 clKernWrite(transKernel, 3) << "size_t swap_inx = 0;" << std::endl;
868
869 clKernWrite(transKernel, 3) << "size_t start = cycle_stat[g_index / " << num_grps_pro_row << "][0];" << std::endl;
870 clKernWrite(transKernel, 3) << "size_t end = cycle_stat[g_index / " << num_grps_pro_row << "][1];" << std::endl;
871
872 clKernWrite(transKernel, 3) << "size_t end_indx = " << num_elements_loaded << ";" << std::endl;
873 clKernWrite(transKernel, 3) << "size_t work_id = g_index % " << num_grps_pro_row << ";" << std::endl;
874
875 clKernWrite(transKernel, 3) << "if( work_id == " << (num_grps_pro_row - 1) << " ){" << std::endl;
876 clKernWrite(transKernel, 6) << "end_indx = " << smaller_dim - num_elements_loaded * (num_grps_pro_row - 1) << ";" << std::endl;
877 clKernWrite(transKernel, 3) << "}" << std::endl;
878
879 clKernWrite(transKernel, 3) << "for (size_t loop = start; loop <= end; loop ++){" << std::endl;
880 clKernWrite(transKernel, 6) << "swap_inx = 1 - swap_inx;" << std::endl;
881
882 switch (params.fft_inputLayout)
883 {
884 case CLFFT_COMPLEX_INTERLEAVED:
885 case CLFFT_REAL:
886 clKernWrite(transKernel, 6) << "swap(inputA, tmp_swap_ptr[swap_inx], tmp_swap_ptr[1 - swap_inx], swap_table[loop][0], swap_table[loop][1], swap_table[loop][2], end_indx, work_id";
887 break;
888 case CLFFT_COMPLEX_PLANAR:
889 clKernWrite(transKernel, 6) << "swap(inputA_R, inputA_I, tmp_swap_ptr[swap_inx], tmp_swap_ptr[1 - swap_inx], swap_table[loop][0], swap_table[loop][1], swap_table[loop][2], end_indx, work_id";
890 break;
891 }
892 if (params.fft_hasPostCallback)
893 {
894 clKernWrite(transKernel, 0) << ", iOffset, post_userdata";
895 if (params.fft_postCallback.localMemSize > 0)
896 {
897 clKernWrite(transKernel, 0) << ", localmem";
898 }
899 }
900 if (params.fft_hasPreCallback)
901 {
902 clKernWrite(transKernel, 0) << ", iOffset, pre_userdata";
903 if (params.fft_preCallback.localMemSize > 0)
904 {
905 clKernWrite(transKernel, 0) << ", localmem";
906 }
907 }
908 clKernWrite(transKernel, 0) << ");" << std::endl;
909
910 clKernWrite(transKernel, 3) << "}" << std::endl;
911
912 clKernWrite(transKernel, 0) << "}" << std::endl;
913 strKernel = transKernel.str();
914 }
915 return CLFFT_SUCCESS;
916 }
917
918 // Swap lines: a more general kernel generator.
919 // In theory this function accepts any dimension ratio, but in practice it is restricted to 1:2, 1:3, 1:5 and 1:10 ratios.
genSwapKernelGeneral(const FFTGeneratedTransposeNonSquareAction::Signature & params,std::string & strKernel,std::string & KernelFuncName,const size_t & lwSize,const size_t reShapeFactor)920 clfftStatus genSwapKernelGeneral(const FFTGeneratedTransposeNonSquareAction::Signature & params, std::string& strKernel, std::string& KernelFuncName, const size_t& lwSize, const size_t reShapeFactor)
921 {
922 if (params.fft_placeness == CLFFT_OUTOFPLACE)
923 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
924
925 size_t smaller_dim = (params.fft_N[0] < params.fft_N[1]) ? params.fft_N[0] : params.fft_N[1];
926 size_t bigger_dim = (params.fft_N[0] >= params.fft_N[1]) ? params.fft_N[0] : params.fft_N[1];
927 size_t dim_ratio = bigger_dim / smaller_dim;
928 /*
929 if ( (params.fft_N[0] != 2 * params.fft_N[1]) && (params.fft_N[1] != 2 * params.fft_N[0]) &&
930 (params.fft_N[0] != 3 * params.fft_N[1]) && (params.fft_N[1] != 3 * params.fft_N[0]) &&
931 (params.fft_N[0] != 5 * params.fft_N[1]) && (params.fft_N[1] != 5 * params.fft_N[0]) &&
932 (params.fft_N[0] != 10 * params.fft_N[1]) && (params.fft_N[1] != 10 * params.fft_N[0]) )
933 */
934 if(dim_ratio % 2 != 0 && dim_ratio % 3 != 0 && dim_ratio % 5 != 0 && dim_ratio % 10 != 0)
935 {
936 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
937 }
938
939 strKernel.reserve(4096);
940 std::stringstream transKernel(std::stringstream::out);
941
942 // These strings represent the various data types we read or write in the kernel, depending on how the plan
943 // is configured
944 std::string dtInput; // The type read as input into kernel
945 std::string dtOutput; // The type written as output from kernel
946 std::string dtPlanar; // Fundamental type for planar arrays
947 std::string tmpBuffType;
948 std::string dtComplex; // Fundamental type for complex arrays
949
950 // NOTE: Enable only for debug
951 // clKernWrite( transKernel, 0 ) << "#pragma OPENCL EXTENSION cl_amd_printf : enable\n" << std::endl;
952
953 //if (params.fft_inputLayout != params.fft_outputLayout)
954 // return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
955
956 switch (params.fft_precision)
957 {
958 case CLFFT_SINGLE:
959 case CLFFT_SINGLE_FAST:
960 dtPlanar = "float";
961 dtComplex = "float2";
962 break;
963 case CLFFT_DOUBLE:
964 case CLFFT_DOUBLE_FAST:
965 dtPlanar = "double";
966 dtComplex = "double2";
967
968 // Emit code that enables double precision in the kernel
969 clKernWrite(transKernel, 0) << "#ifdef cl_khr_fp64" << std::endl;
970 clKernWrite(transKernel, 3) << "#pragma OPENCL EXTENSION cl_khr_fp64 : enable" << std::endl;
971 clKernWrite(transKernel, 0) << "#else" << std::endl;
972 clKernWrite(transKernel, 3) << "#pragma OPENCL EXTENSION cl_amd_fp64 : enable" << std::endl;
973 clKernWrite(transKernel, 0) << "#endif\n" << std::endl;
974
975 break;
976 default:
977 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
978 break;
979 }
980
981
982 size_t LDS_per_WG = smaller_dim;
983 while (LDS_per_WG > 1024)//avoiding using too much lds memory. the biggest LDS memory we will allocate would be 1024*sizeof(float2/double2)*2
984 {
985 if (LDS_per_WG % 2 == 0)
986 {
987 LDS_per_WG /= 2;
988 continue;
989 }
990 if (LDS_per_WG % 3 == 0)
991 {
992 LDS_per_WG /= 3;
993 continue;
994 }
995 if (LDS_per_WG % 5 == 0)
996 {
997 LDS_per_WG /= 5;
998 continue;
999 }
1000 return CLFFT_NOTIMPLEMENTED;
1001 }
1002 size_t WG_per_line = smaller_dim / LDS_per_WG;
1003
1004 size_t input_elm_size_in_bytes;
1005 switch (params.fft_precision)
1006 {
1007 case CLFFT_SINGLE:
1008 case CLFFT_SINGLE_FAST:
1009 input_elm_size_in_bytes = 4;
1010 break;
1011 case CLFFT_DOUBLE:
1012 case CLFFT_DOUBLE_FAST:
1013 input_elm_size_in_bytes = 8;
1014 break;
1015 default:
1016 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
1017 }
1018
1019 switch (params.fft_outputLayout)
1020 {
1021 case CLFFT_COMPLEX_INTERLEAVED:
1022 case CLFFT_COMPLEX_PLANAR:
1023 input_elm_size_in_bytes *= 2;
1024 break;
1025 case CLFFT_REAL:
1026 break;
1027 default:
1028 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
1029 }
1030 /* not entirely clearly why do i need this yet
1031 size_t max_elements_loaded = AVAIL_MEM_SIZE / input_elm_size_in_bytes;
1032 size_t num_elements_loaded;
1033 size_t local_work_size_swap, num_grps_pro_row;
1034 */
1035
1036 //if pre-callback is set for the plan
1037 if (params.fft_hasPreCallback)
1038 {
1039 //we have already checked available LDS for pre callback
1040 //Insert callback function code at the beginning
1041 clKernWrite(transKernel, 0) << params.fft_preCallback.funcstring << std::endl;
1042 clKernWrite(transKernel, 0) << std::endl;
1043 }
1044 //if post-callback is set for the plan
1045 //rarely do we need post callback in swap kernel. But it is possible.
1046 if (params.fft_hasPostCallback)
1047 {
1048 clKernWrite(transKernel, 0) << params.fft_postCallback.funcstring << std::endl;
1049 clKernWrite(transKernel, 0) << std::endl;
1050 }
1051
1052 //twiddle in swap kernel (for now, swap with twiddle seems to always be the second kernel after transpose)
1053 bool twiddleSwapKernel = params.fft_3StepTwiddle && (dim_ratio > 1);
1054 //twiddle factors applied to the output of swap kernels if swap kernels are the last kernel in transpose order
1055 bool twiddleSwapKernelOut = twiddleSwapKernel && (params.nonSquareKernelOrder == TRANSPOSE_AND_SWAP || params.nonSquareKernelOrder == TRANSPOSE_LEADING_AND_SWAP);
1056 //twiddle factors applied to the input of swap kernels if swap kernels are the first kernel in transpose order
1057 bool twiddleSwapKernelIn = twiddleSwapKernel && (params.nonSquareKernelOrder == SWAP_AND_TRANSPOSE);
1058
1059
1060 //generate the swap_table
1061 std::vector<std::vector<size_t> > permutationTable;
1062 permutation_calculation(dim_ratio, smaller_dim, permutationTable);
1063
1064 clKernWrite(transKernel, 0) << "__constant size_t swap_table["<< permutationTable.size()+2 <<"][1] = {" << std::endl;
1065 clKernWrite(transKernel, 0) << "{0}," << std::endl;
1066 clKernWrite(transKernel, 0) << "{"<< smaller_dim * dim_ratio - 1 <<"}," << std::endl;// add the first and last row to the swap table. needed for twiddling
1067 for (std::vector<std::vector<size_t> >::iterator itor = permutationTable.begin(); itor != permutationTable.end(); itor++)
1068 {
1069 clKernWrite(transKernel, 0) << "{" << (*itor)[0] << "}";
1070 if (itor == (permutationTable.end() - 1))//last vector
1071 clKernWrite(transKernel, 0) << std::endl << "};" << std::endl;
1072 else
1073 clKernWrite(transKernel, 0) << "," << std::endl;
1074 }
1075
1076 //twiddle in swap kernel
1077 //twiddle in or out should be using the same twiddling table
1078 if (twiddleSwapKernel)
1079 {
1080 std::string str;
1081 StockhamGenerator::TwiddleTableLarge twLarge(smaller_dim * smaller_dim * dim_ratio);
1082 if ((params.fft_precision == CLFFT_SINGLE) || (params.fft_precision == CLFFT_SINGLE_FAST))
1083 twLarge.GenerateTwiddleTable<StockhamGenerator::P_SINGLE>(str);
1084 else
1085 twLarge.GenerateTwiddleTable<StockhamGenerator::P_DOUBLE>(str);
1086 clKernWrite(transKernel, 0) << str << std::endl;
1087 clKernWrite(transKernel, 0) << std::endl;
1088 }
1089
1090 //std::string funcName = "swap_nonsquare_" + std::to_string(smaller_dim) + "_" + std::to_string(dim_ratio);
1091 std::string funcName = "swap_nonsquare_";
1092 std::string smaller_dim_str = static_cast<const std::ostringstream&>(std::ostringstream() << smaller_dim).str();
1093 std::string dim_ratio_str = static_cast<const std::ostringstream&>(std::ostringstream() << dim_ratio).str();
1094 if(params.fft_N[0] > params.fft_N[1])
1095 funcName = funcName + smaller_dim_str + "_" + dim_ratio_str;
1096 else
1097 funcName = funcName + dim_ratio_str + "_" + smaller_dim_str;
1098
1099 KernelFuncName = funcName;
1100 size_t local_work_size_swap = 256;
1101
1102 for (size_t bothDir = 0; bothDir < 2; bothDir++)
1103 {
1104 bool fwd = bothDir ? false : true;
1105 // Generate kernel API
1106
1107 /*when swap can be performed in LDS itself then, same prototype of transpose can be used for swap function too*/
1108 std::string funcNameTW;
1109 if (twiddleSwapKernel)
1110 {
1111 if (fwd)
1112 funcNameTW = funcName + "_tw_fwd";
1113 else
1114 funcNameTW = funcName + "_tw_back";
1115 }
1116 else
1117 funcNameTW = funcName;
1118
1119 genTransposePrototypeLeadingDimensionBatched(params, local_work_size_swap, dtPlanar, dtComplex, funcNameTW, transKernel, dtInput, dtOutput);
1120
1121 clKernWrite(transKernel, 3) << "//each wg handles 1/"<< WG_per_line <<" row of " << LDS_per_WG << " in memory" << std::endl;
1122 clKernWrite(transKernel, 3) << "const size_t num_wg_per_batch = " << (permutationTable.size() + 2)*WG_per_line << ";" << std::endl; // number of wg per batch = number of independent cycles
1123 clKernWrite(transKernel, 3) << "size_t group_id = get_group_id(0);" << std::endl;
1124 clKernWrite(transKernel, 3) << "size_t idx = get_local_id(0);" << std::endl;
1125
1126 clKernWrite(transKernel, 3) << std::endl;
1127 clKernWrite(transKernel, 3) << "size_t batch_offset = group_id / num_wg_per_batch;" << std::endl;
1128 switch (params.fft_inputLayout)
1129 {
1130 case CLFFT_REAL:
1131 case CLFFT_COMPLEX_INTERLEAVED:
1132 clKernWrite(transKernel, 3) << "inputA += batch_offset*" << smaller_dim * bigger_dim << ";" << std::endl;
1133 break;
1134 case CLFFT_HERMITIAN_INTERLEAVED:
1135 case CLFFT_HERMITIAN_PLANAR:
1136 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
1137 case CLFFT_COMPLEX_PLANAR:
1138 {
1139 clKernWrite(transKernel, 3) << "inputA_R += batch_offset*" << smaller_dim * bigger_dim << ";" << std::endl;
1140 clKernWrite(transKernel, 3) << "inputA_I += batch_offset*" << smaller_dim * bigger_dim << ";" << std::endl;
1141 break;
1142 }
1143 default:
1144 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
1145 }
1146 clKernWrite(transKernel, 3) << "group_id -= batch_offset*" << (permutationTable.size() + 2)*WG_per_line << ";" << std::endl;
1147
1148 clKernWrite(transKernel, 3) << std::endl;
1149 if(WG_per_line == 1)
1150 clKernWrite(transKernel, 3) << "size_t prev = swap_table[group_id][0];" << std::endl;
1151 else
1152 clKernWrite(transKernel, 3) << "size_t prev = swap_table[group_id/" << WG_per_line <<"][0];" << std::endl;
1153 clKernWrite(transKernel, 3) << "size_t next = 0;" << std::endl;
1154
1155 clKernWrite(transKernel, 3) << std::endl;
1156 switch (params.fft_inputLayout)
1157 {
1158 case CLFFT_REAL:
1159 case CLFFT_COMPLEX_INTERLEAVED:
1160 {
1161 clKernWrite(transKernel, 3) << "__local " << dtInput << " prevValue[" << LDS_per_WG << "];" << std::endl;//lds within each wg should be able to store a row block (smaller_dim) of element
1162 clKernWrite(transKernel, 3) << "__local " << dtInput << " nextValue[" << LDS_per_WG << "];" << std::endl;
1163 break;
1164 }
1165 case CLFFT_COMPLEX_PLANAR:
1166 {
1167 clKernWrite(transKernel, 3) << "__local " << dtComplex << " prevValue[" << LDS_per_WG << "];" << std::endl;//lds within each wg should be able to store a row block (smaller_dim) of element
1168 clKernWrite(transKernel, 3) << "__local " << dtComplex << " nextValue[" << LDS_per_WG << "];" << std::endl;
1169 break;
1170 }
1171 case CLFFT_HERMITIAN_INTERLEAVED:
1172 case CLFFT_HERMITIAN_PLANAR:
1173 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
1174 default:
1175 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
1176 }
1177
1178 clKernWrite(transKernel, 3) << std::endl;
1179 if (params.fft_N[0] > params.fft_N[1])//decides whether we have a tall or wide rectangle
1180 {
1181 if (WG_per_line == 1)
1182 {
1183 //might look like: size_t group_offset = (prev/3)*729*3 + (prev%3)*729;
1184 clKernWrite(transKernel, 3) << "size_t group_offset = (prev/" << dim_ratio << ")*" << smaller_dim << "*" << dim_ratio
1185 << " + (prev%" << dim_ratio << ")*" << smaller_dim << ";" << std::endl;
1186 }
1187 else
1188 {
1189 //if smaller_dim is 2187 > 1024 this should look like size_t group_offset = (prev/3)*2187*3 + (prev%3)*2187 + (group_id % 3)*729;
1190 clKernWrite(transKernel, 3) << "size_t group_offset = (prev/" << dim_ratio << ")*" << smaller_dim << "*" << dim_ratio
1191 << " + (prev%" << dim_ratio << ")*" << smaller_dim << " + (group_id % " << WG_per_line << ")*" << LDS_per_WG << ";" << std::endl;
1192 }
1193 }
1194 else
1195 {
1196 if (WG_per_line == 1)//might look like: size_t group_offset = prev*729;
1197 clKernWrite(transKernel, 3) << "size_t group_offset = (prev*" << smaller_dim << ");" << std::endl;
1198 else//if smaller_dim is 2187 > 1024 this should look like size_t group_offset = prev*2187 + (group_id % 3)*729;
1199 clKernWrite(transKernel, 3) << "size_t group_offset = (prev*" << smaller_dim << ") + (group_id % " << WG_per_line << ")*" << LDS_per_WG << ";" << std::endl;
1200 }
1201
1202 clKernWrite(transKernel, 3) << std::endl;
1203 //move to that row block and load that row block to LDS
1204 if (twiddleSwapKernelIn)
1205 {
1206 clKernWrite(transKernel, 6) << "size_t p;" << std::endl;
1207 clKernWrite(transKernel, 6) << "size_t q;" << std::endl;
1208 clKernWrite(transKernel, 6) << dtComplex << " twiddle_factor;" << std::endl;
1209 }
1210 switch (params.fft_inputLayout)
1211 {
1212 case CLFFT_REAL:
1213 case CLFFT_COMPLEX_INTERLEAVED:
1214 {
1215 for (size_t i = 0; i < LDS_per_WG; i = i + 256)
1216 {
1217 if (i + 256 < LDS_per_WG)
1218 {
1219 if (params.fft_hasPreCallback)
1220 {
1221 clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "] = " << params.fft_preCallback.funcname
1222 << "(inputA-batch_offset*" << smaller_dim * bigger_dim << ", batch_offset*" << smaller_dim * bigger_dim << "+group_offset+idx+" << i << ", pre_userdata);" << std::endl;
1223 }
1224 else
1225 {
1226 if (twiddleSwapKernelIn)
1227 {
1228 if (params.fft_N[0] > params.fft_N[1])//decides whether we have a tall or wide rectangle
1229 {
1230 //input is wide; output is tall; read input index realted
1231 clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << bigger_dim << ";" << std::endl;
1232 clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << bigger_dim << ";" << std::endl;
1233 }
1234 else
1235 {
1236 //input is tall; output is wide; read input index realted
1237 clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << smaller_dim << ";" << std::endl;
1238 clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << smaller_dim << ";" << std::endl;
1239 }
1240 clKernWrite(transKernel, 6) << "twiddle_factor = TW3step(p*q);" << std::endl;
1241 if (fwd)
1242 {
1243 //forward
1244 clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].x = inputA[group_offset+idx+" << i << "].x * twiddle_factor.x - inputA[group_offset+idx+" << i << "].y * twiddle_factor.y;" << std::endl;
1245 clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].y = inputA[group_offset+idx+" << i << "].x * twiddle_factor.y + inputA[group_offset+idx+" << i << "].y * twiddle_factor.x;" << std::endl;
1246 }
1247 else
1248 {
1249 //backward
1250 clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].x = inputA[group_offset+idx+" << i << "].x * twiddle_factor.x + inputA[group_offset+idx+" << i << "].y * twiddle_factor.y;" << std::endl;
1251 clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].y = inputA[group_offset+idx+" << i << "].y * twiddle_factor.x - inputA[group_offset+idx+" << i << "].x * twiddle_factor.y;" << std::endl;
1252 }
1253 }
1254 else
1255 clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "] = inputA[group_offset+idx+" << i << "];" << std::endl;
1256 }
1257 }
1258 else
1259 {
1260 // need to handle boundary
1261 clKernWrite(transKernel, 3) << "if(idx+" << i << "<" << LDS_per_WG << "){" << std::endl;
1262 if (params.fft_hasPreCallback)
1263 {
1264 clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "] = " << params.fft_preCallback.funcname
1265 << "(inputA-batch_offset*" << smaller_dim * bigger_dim << ", batch_offset*" << smaller_dim * bigger_dim << "+group_offset+idx+" << i << ", pre_userdata);" << std::endl;
1266 }
1267 else
1268 {
1269 if (twiddleSwapKernelIn)
1270 {
1271 if (params.fft_N[0] > params.fft_N[1])//decides whether we have a tall or wide rectangle
1272 {
1273 //input is wide; output is tall; read input index realted
1274 clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << bigger_dim << ";" << std::endl;
1275 clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << bigger_dim << ";" << std::endl;
1276 }
1277 else
1278 {
1279 //input is tall; output is wide; read input index realted
1280 clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << smaller_dim << ";" << std::endl;
1281 clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << smaller_dim << ";" << std::endl;
1282 }
1283 clKernWrite(transKernel, 6) << "twiddle_factor = TW3step(p*q);" << std::endl;
1284 if (fwd)
1285 {
1286 //forward
1287 clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].x = inputA[group_offset+idx+" << i << "].x * twiddle_factor.x - inputA[group_offset+idx+" << i << "].y * twiddle_factor.y;" << std::endl;
1288 clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].y = inputA[group_offset+idx+" << i << "].x * twiddle_factor.y + inputA[group_offset+idx+" << i << "].y * twiddle_factor.x;" << std::endl;
1289 }
1290 else
1291 {
1292 //backward
1293 clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].x = inputA[group_offset+idx+" << i << "].x * twiddle_factor.x + inputA[group_offset+idx+" << i << "].y * twiddle_factor.y;" << std::endl;
1294 clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].y = inputA[group_offset+idx+" << i << "].y * twiddle_factor.x - inputA[group_offset+idx+" << i << "].x * twiddle_factor.y;" << std::endl;
1295 }
1296 }
1297 else
1298 clKernWrite(transKernel, 6) << "prevValue[idx+" << i << "] = inputA[group_offset+idx+" << i << "];" << std::endl;
1299 }
1300 clKernWrite(transKernel, 3) << "}" << std::endl;
1301 }
1302 }
1303 break;
1304 }
1305 case CLFFT_HERMITIAN_INTERLEAVED:
1306 case CLFFT_HERMITIAN_PLANAR:
1307 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
1308 case CLFFT_COMPLEX_PLANAR:
1309 {
1310 for (size_t i = 0; i < LDS_per_WG; i = i + 256)
1311 {
1312 if (i + 256 < LDS_per_WG)
1313 {
1314 if (params.fft_hasPreCallback)
1315 {
1316 clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "] = " << params.fft_preCallback.funcname <<
1317 "(inputA_R-batch_offset*"<< smaller_dim * bigger_dim <<", inputA_I-batch_offset*"<< smaller_dim * bigger_dim <<
1318 ", batch_offset*" << smaller_dim * bigger_dim << "+group_offset+idx+" << i << ", pre_userdata);" << std::endl;
1319 }
1320 else
1321 {
1322 if (twiddleSwapKernelIn)
1323 {
1324 if (params.fft_N[0] > params.fft_N[1])//decides whether we have a tall or wide rectangle
1325 {
1326 //input is wide; output is tall; read input index realted
1327 clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << bigger_dim << ";" << std::endl;
1328 clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << bigger_dim << ";" << std::endl;
1329 }
1330 else
1331 {
1332 //input is tall; output is wide; read input index realted
1333 clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << smaller_dim << ";" << std::endl;
1334 clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << smaller_dim << ";" << std::endl;
1335 }
1336 clKernWrite(transKernel, 6) << "twiddle_factor = TW3step(p*q);" << std::endl;
1337 if (fwd)
1338 {
1339 //forward
1340 clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].x = inputA_R[group_offset+idx+" << i << "] * twiddle_factor.x - inputA_I[group_offset+idx+" << i << "] * twiddle_factor.y;" << std::endl;
1341 clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].y = inputA_R[group_offset+idx+" << i << "] * twiddle_factor.y + inputA_I[group_offset+idx+" << i << "] * twiddle_factor.x;" << std::endl;
1342 }
1343 else
1344 {
1345 //backward
1346 clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].x = inputA_R[group_offset+idx+" << i << "] * twiddle_factor.x + inputA_I[group_offset+idx+" << i << "] * twiddle_factor.y;" << std::endl;
1347 clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].y = inputA_I[group_offset+idx+" << i << "] * twiddle_factor.x - inputA_R[group_offset+idx+" << i << "] * twiddle_factor.y;" << std::endl;
1348 }
1349 }
1350 else
1351 {
1352 clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].x = inputA_R[group_offset+idx+" << i << "];" << std::endl;
1353 clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].y = inputA_I[group_offset+idx+" << i << "];" << std::endl;
1354 }
1355 }
1356 }
1357 else
1358 {
1359 // need to handle boundary
1360 clKernWrite(transKernel, 3) << "if(idx+" << i << "<" << LDS_per_WG << "){" << std::endl;
1361 if (params.fft_hasPreCallback)
1362 {
1363 clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "] = " << params.fft_preCallback.funcname <<
1364 "(inputA_R-batch_offset*" << smaller_dim * bigger_dim << ", inputA_I-batch_offset*" << smaller_dim * bigger_dim <<
1365 ", batch_offset*" << smaller_dim * bigger_dim << "+group_offset+idx+" << i << ", pre_userdata);" << std::endl;
1366 }
1367 else
1368 {
1369 if (twiddleSwapKernelIn)
1370 {
1371 if (params.fft_N[0] > params.fft_N[1])//decides whether we have a tall or wide rectangle
1372 {
1373 //input is wide; output is tall; read input index realted
1374 clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << bigger_dim << ";" << std::endl;
1375 clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << bigger_dim << ";" << std::endl;
1376 }
1377 else
1378 {
1379 //input is tall; output is wide; read input index realted
1380 clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << smaller_dim << ";" << std::endl;
1381 clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << smaller_dim << ";" << std::endl;
1382 }
1383 clKernWrite(transKernel, 6) << "twiddle_factor = TW3step(p*q);" << std::endl;
1384 if (fwd)
1385 {
1386 //forward
1387 clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].x = inputA_R[group_offset+idx+" << i << "] * twiddle_factor.x - inputA_I[group_offset+idx+" << i << "] * twiddle_factor.y;" << std::endl;
1388 clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].y = inputA_R[group_offset+idx+" << i << "] * twiddle_factor.y + inputA_I[group_offset+idx+" << i << "] * twiddle_factor.x;" << std::endl;
1389 }
1390 else
1391 {
1392 //backward
1393 clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].x = inputA_R[group_offset+idx+" << i << "] * twiddle_factor.x + inputA_I[group_offset+idx+" << i << "] * twiddle_factor.y;" << std::endl;
1394 clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].y = inputA_I[group_offset+idx+" << i << "] * twiddle_factor.x - inputA_R[group_offset+idx+" << i << "] * twiddle_factor.y;" << std::endl;
1395 }
1396 }
1397 else
1398 {
1399 clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].x = inputA_R[group_offset+idx+" << i << "];" << std::endl;
1400 clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].y = inputA_I[group_offset+idx+" << i << "];" << std::endl;
1401 }
1402 }
1403 clKernWrite(transKernel, 3) << "}" << std::endl;
1404 }
1405 }
1406 break;
1407 }
1408 default:
1409 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
1410 }
1411 clKernWrite(transKernel, 3) << "barrier(CLK_LOCAL_MEM_FENCE);" << std::endl;
1412
1413 clKernWrite(transKernel, 3) << std::endl;
1414 clKernWrite(transKernel, 3) << "do{" << std::endl;//begining of do-while
1415 //calculate the next location p(k) = (k*n)mod(m*n-1), if 0 < k < m*n-1
1416 if (params.fft_N[0] > params.fft_N[1])//decides whether we have a tall or wide rectangle
1417 {
1418 clKernWrite(transKernel, 6) << "next = (prev*" << smaller_dim << ")%" << smaller_dim*dim_ratio - 1 << ";" << std::endl;
1419 //takes care the last row
1420 clKernWrite(transKernel, 6) << "if (prev == " << smaller_dim * dim_ratio - 1 << ")" << std::endl;
1421 clKernWrite(transKernel, 9) << "next = " << smaller_dim * dim_ratio - 1 << ";" << std::endl;
1422 if (WG_per_line == 1)
1423 {
1424 clKernWrite(transKernel, 6) << "group_offset = (next/" << dim_ratio << ")*" << smaller_dim << "*" << dim_ratio
1425 << " + (next%" << dim_ratio << ")*" << smaller_dim << ";" << std::endl; //might look like: group_offset = (next/3)*729*3 + (next%3)*729;
1426 }
1427 else
1428 {
1429 //if smaller_dim is 2187 > 1024 this should look like size_t group_offset = (next/3)*2187*3 + (next%3)*2187 + (group_id % 3)*729;
1430 clKernWrite(transKernel, 6) << "group_offset = (next/" << dim_ratio << ")*" << smaller_dim << "*" << dim_ratio
1431 << " + (next%" << dim_ratio << ")*" << smaller_dim << " + (group_id % " << WG_per_line << ")*" << LDS_per_WG << ";" << std::endl;
1432 }
1433 }
1434 else
1435 {
1436 clKernWrite(transKernel, 6) << "next = (prev*" << dim_ratio << ")%" << smaller_dim*dim_ratio - 1 << ";" << std::endl;
1437 //takes care the last row
1438 clKernWrite(transKernel, 6) << "if (prev == " << smaller_dim * dim_ratio - 1 << ")" << std::endl;
1439 clKernWrite(transKernel, 9) << "next = " << smaller_dim * dim_ratio - 1 << ";" << std::endl;
1440 if (WG_per_line == 1) //might look like: size_t group_offset = prev*729;
1441 clKernWrite(transKernel, 6) << "group_offset = (next*" << smaller_dim << ");" << std::endl;
1442 else//if smaller_dim is 2187 > 1024 this should look like size_t group_offset = next*2187 + (group_id % 3)*729;
1443 clKernWrite(transKernel, 6) << "group_offset = (next*" << smaller_dim << ") + (group_id % " << WG_per_line << ")*" << LDS_per_WG << ";" << std::endl;
1444 }
1445
1446
1447 clKernWrite(transKernel, 3) << std::endl;
1448 switch (params.fft_inputLayout)
1449 {
1450 case CLFFT_REAL:
1451 case CLFFT_COMPLEX_INTERLEAVED:
1452 {
1453 for (size_t i = 0; i < LDS_per_WG; i = i + 256)
1454 {
1455 if (i + 256 < LDS_per_WG)
1456 if (params.fft_hasPreCallback)
1457 {
1458 clKernWrite(transKernel, 6) << "nextValue[idx+" << i << "] = " << params.fft_preCallback.funcname
1459 << "(inputA-batch_offset*" << smaller_dim * bigger_dim << ", batch_offset*" << smaller_dim*bigger_dim << "+group_offset+idx+" << i << ", pre_userdata);" << std::endl;
1460 }
1461 else
1462 {
1463 if (twiddleSwapKernelIn)
1464 {
1465 if (params.fft_N[0] > params.fft_N[1])//decides whether we have a tall or wide rectangle
1466 {
1467 //input is wide; output is tall; read input index realted
1468 clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << bigger_dim << ";" << std::endl;
1469 clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << bigger_dim << ";" << std::endl;
1470 }
1471 else
1472 {
1473 //input is tall; output is wide; read input index realted
1474 clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << smaller_dim << ";" << std::endl;
1475 clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << smaller_dim << ";" << std::endl;
1476 }
1477 clKernWrite(transKernel, 6) << "twiddle_factor = TW3step(p*q);" << std::endl;
1478 if (fwd)
1479 {
1480 //forward
1481 clKernWrite(transKernel, 3) << "nextValue[idx+" << i << "].x = inputA[group_offset+idx+" << i << "].x * twiddle_factor.x - inputA[group_offset+idx+" << i << "].y * twiddle_factor.y;" << std::endl;
1482 clKernWrite(transKernel, 3) << "nextValue[idx+" << i << "].y = inputA[group_offset+idx+" << i << "].x * twiddle_factor.y + inputA[group_offset+idx+" << i << "].y * twiddle_factor.x;" << std::endl;
1483 }
1484 else
1485 {
1486 //backward
1487 clKernWrite(transKernel, 3) << "nextValue[idx+" << i << "].x = inputA[group_offset+idx+" << i << "].x * twiddle_factor.x + inputA[group_offset+idx+" << i << "].y * twiddle_factor.y;" << std::endl;
1488 clKernWrite(transKernel, 3) << "nextValue[idx+" << i << "].y = inputA[group_offset+idx+" << i << "].y * twiddle_factor.x - inputA[group_offset+idx+" << i << "].x * twiddle_factor.y;" << std::endl;
1489 }
1490 }
1491 else
1492 clKernWrite(transKernel, 6) << "nextValue[idx+" << i << "] = inputA[group_offset+idx+" << i << "];" << std::endl;
1493 }
1494 else
1495 {
1496 // need to handle boundary
1497 clKernWrite(transKernel, 6) << "if(idx+" << i << "<" << LDS_per_WG << "){" << std::endl;
1498 if (params.fft_hasPreCallback)
1499 {
1500 clKernWrite(transKernel, 6) << "nextValue[idx+" << i << "] = " << params.fft_preCallback.funcname
1501 << "(inputA-batch_offset*" << smaller_dim * bigger_dim << ", batch_offset*" << smaller_dim*bigger_dim << "+group_offset+idx+" << i << ", pre_userdata);" << std::endl;
1502 }
1503 else
1504 {
1505 if (twiddleSwapKernelIn)
1506 {
1507 if (params.fft_N[0] > params.fft_N[1])//decides whether we have a tall or wide rectangle
1508 {
1509 //input is wide; output is tall; read input index realted
1510 clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << bigger_dim << ";" << std::endl;
1511 clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << bigger_dim << ";" << std::endl;
1512 }
1513 else
1514 {
1515 //input is tall; output is wide; read input index realted
1516 clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << smaller_dim << ";" << std::endl;
1517 clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << smaller_dim << ";" << std::endl;
1518 }
1519 clKernWrite(transKernel, 6) << "twiddle_factor = TW3step(p*q);" << std::endl;
1520 if (fwd)
1521 {
1522 //forward
1523 clKernWrite(transKernel, 3) << "nextValue[idx+" << i << "].x = inputA[group_offset+idx+" << i << "].x * twiddle_factor.x - inputA[group_offset+idx+" << i << "].y * twiddle_factor.y;" << std::endl;
1524 clKernWrite(transKernel, 3) << "nextValue[idx+" << i << "].y = inputA[group_offset+idx+" << i << "].x * twiddle_factor.y + inputA[group_offset+idx+" << i << "].y * twiddle_factor.x;" << std::endl;
1525 }
1526 else
1527 {
1528 //backward
1529 clKernWrite(transKernel, 3) << "nextValue[idx+" << i << "].x = inputA[group_offset+idx+" << i << "].x * twiddle_factor.x + inputA[group_offset+idx+" << i << "].y * twiddle_factor.y;" << std::endl;
1530 clKernWrite(transKernel, 3) << "nextValue[idx+" << i << "].y = inputA[group_offset+idx+" << i << "].y * twiddle_factor.x - inputA[group_offset+idx+" << i << "].x * twiddle_factor.y;" << std::endl;
1531 }
1532 }
1533 else
1534 clKernWrite(transKernel, 9) << "nextValue[idx+" << i << "] = inputA[group_offset+idx+" << i << "];" << std::endl;
1535 }
1536 clKernWrite(transKernel, 6) << "}" << std::endl;
1537 }
1538 }
1539 break;
1540 }
1541 case CLFFT_HERMITIAN_INTERLEAVED:
1542 case CLFFT_HERMITIAN_PLANAR:
1543 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
1544 case CLFFT_COMPLEX_PLANAR:
1545 {
1546 for (size_t i = 0; i < LDS_per_WG; i = i + 256)
1547 {
1548 if (i + 256 < LDS_per_WG)
1549 {
1550 if (params.fft_hasPreCallback)
1551 {
1552 clKernWrite(transKernel, 6) << "nextValue[idx+" << i << "] = " << params.fft_preCallback.funcname <<
1553 "(inputA_R-batch_offset*" << smaller_dim * bigger_dim << ", inputA_I-batch_offset*" << smaller_dim * bigger_dim <<
1554 ", batch_offset*" << smaller_dim * bigger_dim << "+group_offset+idx+" << i << ", pre_userdata);" << std::endl;
1555 }
1556 else
1557 {
1558 if (twiddleSwapKernelIn)
1559 {
1560 if (params.fft_N[0] > params.fft_N[1])//decides whether we have a tall or wide rectangle
1561 {
1562 //input is wide; output is tall; read input index realted
1563 clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << bigger_dim << ";" << std::endl;
1564 clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << bigger_dim << ";" << std::endl;
1565 }
1566 else
1567 {
1568 //input is tall; output is wide; read input index realted
1569 clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << smaller_dim << ";" << std::endl;
1570 clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << smaller_dim << ";" << std::endl;
1571 }
1572 clKernWrite(transKernel, 6) << "twiddle_factor = TW3step(p*q);" << std::endl;
1573 if (fwd)
1574 {
1575 //forward
1576 clKernWrite(transKernel, 3) << "nextValue[idx+" << i << "].x = inputA_R[group_offset+idx+" << i << "] * twiddle_factor.x - inputA_I[group_offset+idx+" << i << "] * twiddle_factor.y;" << std::endl;
1577 clKernWrite(transKernel, 3) << "nextValue[idx+" << i << "].y = inputA_R[group_offset+idx+" << i << "] * twiddle_factor.y + inputA_I[group_offset+idx+" << i << "] * twiddle_factor.x;" << std::endl;
1578 }
1579 else
1580 {
1581 //backward
1582 clKernWrite(transKernel, 3) << "nextValue[idx+" << i << "].x = inputA_R[group_offset+idx+" << i << "] * twiddle_factor.x + inputA_I[group_offset+idx+" << i << "] * twiddle_factor.y;" << std::endl;
1583 clKernWrite(transKernel, 3) << "nextValue[idx+" << i << "].y = inputA_I[group_offset+idx+" << i << "] * twiddle_factor.x - inputA_R[group_offset+idx+" << i << "] * twiddle_factor.y;" << std::endl;
1584 }
1585 }
1586 else
1587 {
1588 clKernWrite(transKernel, 6) << "nextValue[idx+" << i << "].x = inputA_R[group_offset+idx+" << i << "];" << std::endl;
1589 clKernWrite(transKernel, 6) << "nextValue[idx+" << i << "].y = inputA_I[group_offset+idx+" << i << "];" << std::endl;
1590 }
1591 }
1592 }
1593 else
1594 {
1595 // need to handle boundary
1596 clKernWrite(transKernel, 6) << "if(idx+" << i << "<" << LDS_per_WG << "){" << std::endl;
1597 if (params.fft_hasPreCallback)
1598 {
1599 clKernWrite(transKernel, 6) << "nextValue[idx+" << i << "] = " << params.fft_preCallback.funcname <<
1600 "(inputA_R-batch_offset*" << smaller_dim * bigger_dim << ", inputA_I-batch_offset*" << smaller_dim * bigger_dim <<
1601 ", batch_offset*" << smaller_dim * bigger_dim << "+group_offset+idx+" << i << ", pre_userdata);" << std::endl;
1602 }
1603 else
1604 {
1605 if (twiddleSwapKernelIn)
1606 {
1607 if (params.fft_N[0] > params.fft_N[1])//decides whether we have a tall or wide rectangle
1608 {
1609 //input is wide; output is tall; read input index realted
1610 clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << bigger_dim << ";" << std::endl;
1611 clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << bigger_dim << ";" << std::endl;
1612 }
1613 else
1614 {
1615 //input is tall; output is wide; read input index realted
1616 clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << smaller_dim << ";" << std::endl;
1617 clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << smaller_dim << ";" << std::endl;
1618 }
1619 clKernWrite(transKernel, 6) << "twiddle_factor = TW3step(p*q);" << std::endl;
1620 if (fwd)
1621 {
1622 //forward
1623 clKernWrite(transKernel, 3) << "nextValue[idx+" << i << "].x = inputA_R[group_offset+idx+" << i << "] * twiddle_factor.x - inputA_I[group_offset+idx+" << i << "] * twiddle_factor.y;" << std::endl;
1624 clKernWrite(transKernel, 3) << "nextValue[idx+" << i << "].y = inputA_R[group_offset+idx+" << i << "] * twiddle_factor.y + inputA_I[group_offset+idx+" << i << "] * twiddle_factor.x;" << std::endl;
1625 }
1626 else
1627 {
1628 //backward
1629 clKernWrite(transKernel, 3) << "nextValue[idx+" << i << "].x = inputA_R[group_offset+idx+" << i << "] * twiddle_factor.x + inputA_I[group_offset+idx+" << i << "] * twiddle_factor.y;" << std::endl;
1630 clKernWrite(transKernel, 3) << "nextValue[idx+" << i << "].y = inputA_I[group_offset+idx+" << i << "] * twiddle_factor.x - inputA_R[group_offset+idx+" << i << "] * twiddle_factor.y;" << std::endl;
1631 }
1632 }
1633 else
1634 {
1635 clKernWrite(transKernel, 6) << "nextValue[idx+" << i << "].x = inputA_R[group_offset+idx+" << i << "];" << std::endl;
1636 clKernWrite(transKernel, 6) << "nextValue[idx+" << i << "].y = inputA_I[group_offset+idx+" << i << "];" << std::endl;
1637 }
1638 }
1639 clKernWrite(transKernel, 6) << "}" << std::endl;
1640 }
1641 }
1642 break;
1643 }
1644 default:
1645 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
1646 }
1647
1648 clKernWrite(transKernel, 6) << "barrier(CLK_LOCAL_MEM_FENCE);" << std::endl;
1649
1650 clKernWrite(transKernel, 3) << std::endl;
1651 switch (params.fft_inputLayout)
1652 {
1653 case CLFFT_REAL:// for real case this is different
1654 case CLFFT_COMPLEX_INTERLEAVED:
1655 {
1656 if (twiddleSwapKernelOut)
1657 {
1658 clKernWrite(transKernel, 6) << "size_t p;" << std::endl;
1659 clKernWrite(transKernel, 6) << "size_t q;" << std::endl;
1660 clKernWrite(transKernel, 6) << dtComplex << " twiddle_factor;" << std::endl;
1661
1662 for (size_t i = 0; i < LDS_per_WG; i = i + 256)
1663 {
1664 if (i + 256 < LDS_per_WG)
1665 {
1666 if (params.fft_N[0] > params.fft_N[1])//decides whether we have a tall or wide rectangle
1667 {
1668 //input is wide; output is tall
1669 clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << smaller_dim << ";" << std::endl;
1670 clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << smaller_dim << ";" << std::endl;
1671 }
1672 else
1673 {
1674 //input is tall; output is wide
1675 clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << bigger_dim << ";" << std::endl;
1676 clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << bigger_dim << ";" << std::endl;
1677 }
1678 clKernWrite(transKernel, 6) << "twiddle_factor = TW3step(p*q);" << std::endl;
1679 if (fwd)
1680 {
1681 //forward
1682 clKernWrite(transKernel, 6) << "inputA[group_offset+idx+" << i << "].x = prevValue[idx+" << i << "].x * twiddle_factor.x - prevValue[idx+" << i << "].y * twiddle_factor.y;" << std::endl;
1683 clKernWrite(transKernel, 6) << "inputA[group_offset+idx+" << i << "].y = prevValue[idx+" << i << "].x * twiddle_factor.y + prevValue[idx+" << i << "].y * twiddle_factor.x;" << std::endl;
1684 }
1685 else
1686 {
1687 //backward
1688 clKernWrite(transKernel, 6) << "inputA[group_offset+idx+" << i << "].x = prevValue[idx+" << i << "].x * twiddle_factor.x + prevValue[idx+" << i << "].y * twiddle_factor.y;" << std::endl;
1689 clKernWrite(transKernel, 6) << "inputA[group_offset+idx+" << i << "].y = prevValue[idx+" << i << "].y * twiddle_factor.x - prevValue[idx+" << i << "].x * twiddle_factor.y;" << std::endl;
1690 }
1691 //clKernWrite(transKernel, 6) << "inputA[group_offset+idx+" << i << "] = prevValue[idx+" << i << "];" << std::endl;
1692 }
1693 else
1694 {
1695 // need to handle boundary
1696 clKernWrite(transKernel, 6) << "if(idx+" << i << "<" << LDS_per_WG << "){" << std::endl;
1697 if (params.fft_N[0] > params.fft_N[1])//decides whether we have a tall or wide rectangle
1698 {
1699 //input is wide; output is tall
1700 clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << smaller_dim << ";" << std::endl;
1701 clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << smaller_dim << ";" << std::endl;
1702 }
1703 else
1704 {
1705 //input is tall; output is wide
1706 clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << bigger_dim << ";" << std::endl;
1707 clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << bigger_dim << ";" << std::endl;
1708 }
1709 clKernWrite(transKernel, 6) << "twiddle_factor = TW3step(p*q);" << std::endl;
1710 if (fwd)
1711 {
1712 //forward
1713 clKernWrite(transKernel, 6) << "inputA[group_offset+idx+" << i << "].x = prevValue[idx+" << i << "].x * twiddle_factor.x - prevValue[idx+" << i << "].y * twiddle_factor.y;" << std::endl;
1714 clKernWrite(transKernel, 6) << "inputA[group_offset+idx+" << i << "].y = prevValue[idx+" << i << "].x * twiddle_factor.y + prevValue[idx+" << i << "].y * twiddle_factor.x;" << std::endl;
1715 }
1716 else
1717 {
1718 //backward
1719 clKernWrite(transKernel, 6) << "inputA[group_offset+idx+" << i << "].x = prevValue[idx+" << i << "].x * twiddle_factor.x + prevValue[idx+" << i << "].y * twiddle_factor.y;" << std::endl;
1720 clKernWrite(transKernel, 6) << "inputA[group_offset+idx+" << i << "].y = prevValue[idx+" << i << "].y * twiddle_factor.x - prevValue[idx+" << i << "].x * twiddle_factor.y;" << std::endl;
1721 }
1722 //clKernWrite(transKernel, 9) << "inputA[group_offset+idx+" << i << "] = prevValue[idx+" << i << "];" << std::endl;
1723 clKernWrite(transKernel, 6) << "}" << std::endl;
1724 }
1725 }
1726 }
1727 else if(!twiddleSwapKernelOut)//could be twiddleSwapKernelIn
1728 {
1729 for (size_t i = 0; i < LDS_per_WG; i = i + 256)
1730 {
1731 //twiddling and callback do not coexist
1732 if (params.fft_hasPostCallback)
1733 {
1734 if (i + 256 < LDS_per_WG)
1735 {
1736 clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(inputA - batch_offset*" << smaller_dim * bigger_dim
1737 << ", batch_offset*" << smaller_dim * bigger_dim << "+group_offset+idx+" << i << ", post_userdata, prevValue[idx+" << i
1738 << "]);" << std::endl;
1739 }
1740 else
1741 {
1742 // need to handle boundary
1743 clKernWrite(transKernel, 6) << "if(idx+" << i << "<" << LDS_per_WG << "){" << std::endl;
1744 clKernWrite(transKernel, 9) << params.fft_postCallback.funcname << "(inputA - batch_offset*" << smaller_dim * bigger_dim
1745 << ", batch_offset*" << smaller_dim * bigger_dim << "+group_offset+idx+" << i << ", post_userdata, prevValue[idx+" << i
1746 << "]);" << std::endl;
1747 clKernWrite(transKernel, 6) << "}" << std::endl;
1748 }
1749 }
1750 else
1751 {
1752 if (i + 256 < LDS_per_WG)
1753 clKernWrite(transKernel, 6) << "inputA[group_offset+idx+" << i << "] = prevValue[idx+" << i << "];" << std::endl;
1754 else
1755 {
1756 // need to handle boundary
1757 clKernWrite(transKernel, 6) << "if(idx+" << i << "<" << LDS_per_WG << "){" << std::endl;
1758 clKernWrite(transKernel, 9) << "inputA[group_offset+idx+" << i << "] = prevValue[idx+" << i << "];" << std::endl;
1759 clKernWrite(transKernel, 6) << "}" << std::endl;
1760 }
1761 }
1762 }
1763 }
1764 break;
1765 }
1766 case CLFFT_HERMITIAN_INTERLEAVED:
1767 case CLFFT_HERMITIAN_PLANAR:
1768 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
1769 case CLFFT_COMPLEX_PLANAR:
1770 {
1771 if (twiddleSwapKernelOut)
1772 {
1773 clKernWrite(transKernel, 6) << "size_t p;" << std::endl;
1774 clKernWrite(transKernel, 6) << "size_t q;" << std::endl;
1775 clKernWrite(transKernel, 6) << dtComplex << " twiddle_factor;" << std::endl;
1776 for (size_t i = 0; i < LDS_per_WG; i = i + 256)
1777 {
1778 if (i + 256 < LDS_per_WG)
1779 {
1780 if (params.fft_N[0] > params.fft_N[1])//decides whether we have a tall or wide rectangle
1781 {
1782 //input is wide; output is tall
1783 clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << smaller_dim << ";" << std::endl;
1784 clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << smaller_dim << ";" << std::endl;
1785 }
1786 else
1787 {
1788 //input is tall; output is wide
1789 clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << bigger_dim << ";" << std::endl;
1790 clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << bigger_dim << ";" << std::endl;
1791 }
1792 clKernWrite(transKernel, 6) << "twiddle_factor = TW3step(p*q);" << std::endl;
1793 if (fwd)
1794 {
1795 //forward
1796 clKernWrite(transKernel, 6) << "inputA_R[group_offset+idx+" << i << "] = prevValue[idx+" << i << "].x * twiddle_factor.x - prevValue[idx+" << i << "].y * twiddle_factor.y;" << std::endl;
1797 clKernWrite(transKernel, 6) << "inputA_I[group_offset+idx+" << i << "] = prevValue[idx+" << i << "].x * twiddle_factor.y + prevValue[idx+" << i << "].y * twiddle_factor.x;" << std::endl;
1798 }
1799 else
1800 {
1801 //backward
1802 clKernWrite(transKernel, 6) << "inputA_R[group_offset+idx+" << i << "] = prevValue[idx+" << i << "].x * twiddle_factor.x + prevValue[idx+" << i << "].y * twiddle_factor.y;" << std::endl;
1803 clKernWrite(transKernel, 6) << "inputA_I[group_offset+idx+" << i << "] = prevValue[idx+" << i << "].y * twiddle_factor.x - prevValue[idx+" << i << "].x * twiddle_factor.y;" << std::endl;
1804 }
1805 }
1806 else
1807 {
1808 // need to handle boundary
1809 clKernWrite(transKernel, 6) << "if(idx+" << i << "<" << LDS_per_WG << "){" << std::endl;
1810 if (params.fft_N[0] > params.fft_N[1])//decides whether we have a tall or wide rectangle
1811 {
1812 //input is wide; output is tall
1813 clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << smaller_dim << ";" << std::endl;
1814 clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << smaller_dim << ";" << std::endl;
1815 }
1816 else
1817 {
1818 //input is tall; output is wide
1819 clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << bigger_dim << ";" << std::endl;
1820 clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << bigger_dim << ";" << std::endl;
1821 }
1822 clKernWrite(transKernel, 6) << "twiddle_factor = TW3step(p*q);" << std::endl;
1823 if (fwd)
1824 {
1825 //forward
1826 clKernWrite(transKernel, 6) << "inputA_R[group_offset+idx+" << i << "] = prevValue[idx+" << i << "].x * twiddle_factor.x - prevValue[idx+" << i << "].y * twiddle_factor.y;" << std::endl;
1827 clKernWrite(transKernel, 6) << "inputA_I[group_offset+idx+" << i << "] = prevValue[idx+" << i << "].x * twiddle_factor.y + prevValue[idx+" << i << "].y * twiddle_factor.x;" << std::endl;
1828 }
1829 else
1830 {
1831 //backward
1832 clKernWrite(transKernel, 6) << "inputA_R[group_offset+idx+" << i << "] = prevValue[idx+" << i << "].x * twiddle_factor.x + prevValue[idx+" << i << "].y * twiddle_factor.y;" << std::endl;
1833 clKernWrite(transKernel, 6) << "inputA_I[group_offset+idx+" << i << "] = prevValue[idx+" << i << "].y * twiddle_factor.x - prevValue[idx+" << i << "].x * twiddle_factor.y;" << std::endl;
1834 }
1835 clKernWrite(transKernel, 6) << "}" << std::endl;
1836 }
1837 clKernWrite(transKernel, 3) << std::endl;
1838 }
1839 }
1840 else if (!twiddleSwapKernelOut)//could be twiddleSwapKernelIn
1841 {
1842 for (size_t i = 0; i < LDS_per_WG; i = i + 256)
1843 {
1844 //twiddling and callback do not coexist
1845 if (params.fft_hasPostCallback)
1846 {
1847 if (i + 256 < LDS_per_WG)
1848 {
1849 //clKernWrite(transKernel, 6) << "inputA_R[group_offset+idx+" << i << "] = prevValue[idx+" << i << "].x;" << std::endl;
1850 //clKernWrite(transKernel, 6) << "inputA_I[group_offset+idx+" << i << "] = prevValue[idx+" << i << "].y;" << std::endl;
1851 clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(inputA_R - batch_offset*" << smaller_dim * bigger_dim
1852 << ", inputA_I - batch_offset*" << smaller_dim * bigger_dim << ", batch_offset*" << smaller_dim * bigger_dim
1853 << "+group_offset+idx+" << i << ", post_userdata, prevValue[idx+" << i << "].x, prevValue[idx+" << i << "].y);" << std::endl;
1854 }
1855 else
1856 {
1857 // need to handle boundary
1858 clKernWrite(transKernel, 6) << "if(idx+" << i << "<" << LDS_per_WG << "){" << std::endl;
1859 clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(inputA_R - batch_offset*" << smaller_dim * bigger_dim
1860 << ", inputA_I - batch_offset*" << smaller_dim * bigger_dim << ", batch_offset*" << smaller_dim * bigger_dim
1861 << "+group_offset+idx+" << i << ", post_userdata, prevValue[idx+" << i << "].x, prevValue[idx+" << i << "].y);" << std::endl;
1862 clKernWrite(transKernel, 6) << "}" << std::endl;
1863 }
1864 }
1865 else
1866 {
1867 if (i + 256 < LDS_per_WG)
1868 {
1869 clKernWrite(transKernel, 6) << "inputA_R[group_offset+idx+" << i << "] = prevValue[idx+" << i << "].x;" << std::endl;
1870 clKernWrite(transKernel, 6) << "inputA_I[group_offset+idx+" << i << "] = prevValue[idx+" << i << "].y;" << std::endl;
1871 }
1872 else
1873 {
1874 // need to handle boundary
1875 clKernWrite(transKernel, 6) << "if(idx+" << i << "<" << LDS_per_WG << "){" << std::endl;
1876 clKernWrite(transKernel, 6) << "inputA_R[group_offset+idx+" << i << "] = prevValue[idx+" << i << "].x;" << std::endl;
1877 clKernWrite(transKernel, 6) << "inputA_I[group_offset+idx+" << i << "] = prevValue[idx+" << i << "].y;" << std::endl;
1878 clKernWrite(transKernel, 6) << "}" << std::endl;
1879 }
1880 }
1881 }
1882 }
1883 break;
1884 }
1885 default:
1886 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
1887 }
1888 clKernWrite(transKernel, 6) << "barrier(CLK_LOCAL_MEM_FENCE);" << std::endl;
1889
1890 clKernWrite(transKernel, 3) << std::endl;
1891 switch (params.fft_inputLayout)
1892 {
1893 case CLFFT_REAL:
1894 case CLFFT_COMPLEX_INTERLEAVED:
1895 case CLFFT_COMPLEX_PLANAR:
1896 {
1897 for (size_t i = 0; i < LDS_per_WG; i = i + 256)
1898 {
1899 if (i + 256 < LDS_per_WG)
1900 clKernWrite(transKernel, 6) << "prevValue[idx+" << i << "] = nextValue[idx+" << i << "];" << std::endl;
1901 else
1902 {
1903 // need to handle boundary
1904 clKernWrite(transKernel, 6) << "if(idx+" << i << "<" << LDS_per_WG << "){" << std::endl;
1905 clKernWrite(transKernel, 9) << "prevValue[idx + " << i << "] = nextValue[idx + " << i << "]; " << std::endl;
1906 clKernWrite(transKernel, 6) << "}" << std::endl;
1907 }
1908 }
1909 break;
1910 }
1911 case CLFFT_HERMITIAN_INTERLEAVED:
1912 case CLFFT_HERMITIAN_PLANAR:
1913 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
1914 default:
1915 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
1916 }
1917
1918 clKernWrite(transKernel, 6) << "barrier(CLK_LOCAL_MEM_FENCE);" << std::endl;
1919
1920 clKernWrite(transKernel, 3) << std::endl;
1921 clKernWrite(transKernel, 3) << "prev = next;" << std::endl;
1922 if (WG_per_line == 1)
1923 clKernWrite(transKernel, 3) << "}while(next!=swap_table[group_id][0]);" << std::endl;//end of do-while
1924 else
1925 clKernWrite(transKernel, 3) << "}while(next!=swap_table[group_id/"<< WG_per_line <<"][0]);" << std::endl;//end of do-while
1926 clKernWrite(transKernel, 0) << "}" << std::endl;//end of kernel
1927
1928 if (!twiddleSwapKernel)
1929 break; // break for bothDir only need one kernel if twiddle is not done here
1930
1931 }//end of for (size_t bothDir = 0; bothDir < 2; bothDir++)
1932
1933
1934 //by now the kernel string is generated
1935 strKernel = transKernel.str();
1936 return CLFFT_SUCCESS;
1937 }
1938
1939 //generate transpose kernel with square 2d matrix of row major with arbitrary batch size
1940 /*
1941 Below is a matrix (row major) containing three square sub-matrices along the column.
1942 The transpose will be done within each sub-matrix.
1943 [M0
1944 M1
1945 M2]
1946 */
genTransposeKernelBatched(const FFTGeneratedTransposeSquareAction::Signature & params,std::string & strKernel,const size_t & lwSize,const size_t reShapeFactor)1947 clfftStatus genTransposeKernelBatched(const FFTGeneratedTransposeSquareAction::Signature & params, std::string& strKernel, const size_t& lwSize, const size_t reShapeFactor)
1948 {
1949 strKernel.reserve(4096);
1950 std::stringstream transKernel(std::stringstream::out);
1951
1952 // These strings represent the various data types we read or write in the kernel, depending on how the plan
1953 // is configured
1954 std::string dtInput; // The type read as input into kernel
1955 std::string dtOutput; // The type written as output from kernel
1956 std::string dtPlanar; // Fundamental type for planar arrays
1957 std::string dtComplex; // Fundamental type for complex arrays
1958
1959 // NOTE: Enable only for debug
1960 // clKernWrite( transKernel, 0 ) << "#pragma OPENCL EXTENSION cl_amd_printf : enable\n" << std::endl;
1961
1962 //if (params.fft_inputLayout != params.fft_outputLayout)
1963 // return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
1964
1965 switch (params.fft_precision)
1966 {
1967 case CLFFT_SINGLE:
1968 case CLFFT_SINGLE_FAST:
1969 dtPlanar = "float";
1970 dtComplex = "float2";
1971 break;
1972 case CLFFT_DOUBLE:
1973 case CLFFT_DOUBLE_FAST:
1974 dtPlanar = "double";
1975 dtComplex = "double2";
1976
1977 // Emit code that enables double precision in the kernel
1978 clKernWrite(transKernel, 0) << "#ifdef cl_khr_fp64" << std::endl;
1979 clKernWrite(transKernel, 3) << "#pragma OPENCL EXTENSION cl_khr_fp64 : enable" << std::endl;
1980 clKernWrite(transKernel, 0) << "#else" << std::endl;
1981 clKernWrite(transKernel, 3) << "#pragma OPENCL EXTENSION cl_amd_fp64 : enable" << std::endl;
1982 clKernWrite(transKernel, 0) << "#endif\n" << std::endl;
1983
1984 break;
1985 default:
1986 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
1987 break;
1988 }
1989
1990 // it is a better idea to do twiddle in swap kernel if we will have a swap kernel.
1991 // for pure square transpose, twiddle will be done in transpose kernel
1992 bool twiddleTransposeKernel = params.fft_3StepTwiddle && (params.transposeMiniBatchSize == 1);//when transposeMiniBatchSize == 1 it is guaranteed to be a sqaure matrix transpose
1993 // If twiddle computation has been requested, generate the lookup function
1994
1995 if (twiddleTransposeKernel)
1996 {
1997 std::string str;
1998 StockhamGenerator::TwiddleTableLarge twLarge(params.fft_N[0] * params.fft_N[1]);
1999 if ((params.fft_precision == CLFFT_SINGLE) || (params.fft_precision == CLFFT_SINGLE_FAST))
2000 twLarge.GenerateTwiddleTable<StockhamGenerator::P_SINGLE>(str);
2001 else
2002 twLarge.GenerateTwiddleTable<StockhamGenerator::P_DOUBLE>(str);
2003 clKernWrite(transKernel, 0) << str << std::endl;
2004 clKernWrite(transKernel, 0) << std::endl;
2005 }
2006
2007
2008
2009 // This detects whether the input matrix is square
2010 bool notSquare = (params.fft_N[0] == params.fft_N[1]) ? false : true;
2011
2012 if (notSquare && (params.fft_placeness == CLFFT_INPLACE))
2013 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
2014
2015 // This detects whether the input matrix is a multiple of 16*reshapefactor or not
2016
2017 bool mult_of_16 = (params.fft_N[0] % (reShapeFactor * 16) == 0) ? true : false;
2018
2019
2020
2021 for (size_t bothDir = 0; bothDir < 2; bothDir++)
2022 {
2023 bool fwd = bothDir ? false : true;
2024
2025 //If pre-callback is set for the plan
2026 if (params.fft_hasPreCallback)
2027 {
2028 //Insert callback function code at the beginning
2029 clKernWrite(transKernel, 0) << params.fft_preCallback.funcstring << std::endl;
2030 clKernWrite(transKernel, 0) << std::endl;
2031 }
2032 //If post-callback is set for the plan
2033 if (params.fft_hasPostCallback)
2034 {
2035 //Insert callback function code at the beginning
2036 clKernWrite(transKernel, 0) << params.fft_postCallback.funcstring << std::endl;
2037 clKernWrite(transKernel, 0) << std::endl;
2038 }
2039
2040 std::string funcName;
2041 if (twiddleTransposeKernel) // it makes more sense to do twiddling in swap kernel
2042 funcName = fwd ? "transpose_square_tw_fwd" : "transpose_square_tw_back";
2043 else
2044 funcName = "transpose_square";
2045
2046
2047 // Generate kernel API
2048 genTransposePrototype(params, lwSize, dtPlanar, dtComplex, funcName, transKernel, dtInput, dtOutput);
2049 size_t wgPerBatch;
2050 if (mult_of_16)
2051 wgPerBatch = (params.fft_N[0] / 16 / reShapeFactor)*(params.fft_N[0] / 16 / reShapeFactor + 1) / 2;
2052 else
2053 wgPerBatch = (params.fft_N[0] / (16 * reShapeFactor) + 1)*(params.fft_N[0] / (16 * reShapeFactor) + 1 + 1) / 2;
2054 clKernWrite(transKernel, 3) << "const size_t numGroupsY_1 = " << wgPerBatch << ";" << std::endl;
2055
2056 for (size_t i = 2; i < params.fft_DataDim - 1; i++)
2057 {
2058 clKernWrite(transKernel, 3) << "const size_t numGroupsY_" << i << " = numGroupsY_" << i - 1 << " * " << params.fft_N[i] << ";" << std::endl;
2059 }
2060
2061 clKernWrite(transKernel, 3) << "size_t g_index;" << std::endl;
2062 clKernWrite(transKernel, 3) << std::endl;
2063
2064 OffsetCalc(transKernel, params, true);
2065
2066
2067 if (params.fft_placeness == CLFFT_OUTOFPLACE)
2068 OffsetCalc(transKernel, params, false);
2069
2070
2071 // Handle planar and interleaved right here
2072 switch (params.fft_inputLayout)
2073 {
2074 case CLFFT_COMPLEX_INTERLEAVED:
2075 //Do not advance offset when precallback is set as the starting address of global buffer is needed
2076 if (!params.fft_hasPreCallback)
2077 {
2078 clKernWrite(transKernel, 3) << "inputA += iOffset;" << std::endl; // Set A ptr to the start of each slice
2079 }
2080 break;
2081 case CLFFT_COMPLEX_PLANAR:
2082 //Do not advance offset when precallback is set as the starting address of global buffer is needed
2083 if (!params.fft_hasPreCallback)
2084 {
2085 clKernWrite(transKernel, 3) << "inputA_R += iOffset;" << std::endl; // Set A ptr to the start of each slice
2086 clKernWrite(transKernel, 3) << "inputA_I += iOffset;" << std::endl; // Set A ptr to the start of each slice
2087 }
2088 break;
2089 case CLFFT_HERMITIAN_INTERLEAVED:
2090 case CLFFT_HERMITIAN_PLANAR:
2091 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
2092 case CLFFT_REAL:
2093 break;
2094 default:
2095 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
2096 }
2097
2098 if (params.fft_placeness == CLFFT_OUTOFPLACE)
2099 {
2100 switch (params.fft_outputLayout)
2101 {
2102 case CLFFT_COMPLEX_INTERLEAVED:
2103 clKernWrite(transKernel, 3) << "outputA += oOffset;" << std::endl; // Set A ptr to the start of each slice
2104
2105 break;
2106 case CLFFT_COMPLEX_PLANAR:
2107
2108 clKernWrite(transKernel, 3) << "outputA_R += oOffset;" << std::endl; // Set A ptr to the start of each slice
2109 clKernWrite(transKernel, 3) << "outputA_I += oOffset;" << std::endl; // Set A ptr to the start of each slice
2110 break;
2111 case CLFFT_HERMITIAN_INTERLEAVED:
2112 case CLFFT_HERMITIAN_PLANAR:
2113 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
2114 case CLFFT_REAL:
2115 break;
2116 default:
2117 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
2118 }
2119 }
2120 else
2121 {
2122 switch (params.fft_inputLayout)
2123 {
2124 case CLFFT_COMPLEX_INTERLEAVED:
2125 if (params.fft_hasPreCallback)
2126 clKernWrite(transKernel, 3) << "global " << dtInput << " *outputA = inputA + iOffset;" << std::endl;
2127 else
2128 clKernWrite(transKernel, 3) << "global " << dtInput << " *outputA = inputA;" << std::endl;
2129 break;
2130 case CLFFT_COMPLEX_PLANAR:
2131 if (params.fft_hasPreCallback)
2132 {
2133 clKernWrite(transKernel, 3) << "global " << dtInput << " *outputA_R = inputA_R + iOffset;" << std::endl;
2134 clKernWrite(transKernel, 3) << "global " << dtInput << " *outputA_I = inputA_I + iOffset;" << std::endl;
2135 }
2136 else
2137 {
2138 clKernWrite(transKernel, 3) << "global " << dtInput << " *outputA_R = inputA_R;" << std::endl;
2139 clKernWrite(transKernel, 3) << "global " << dtInput << " *outputA_I = inputA_I;" << std::endl;
2140 }
2141 break;
2142 case CLFFT_HERMITIAN_INTERLEAVED:
2143 case CLFFT_HERMITIAN_PLANAR:
2144 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
2145 case CLFFT_REAL:
2146 break;
2147 default:
2148 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
2149 }
2150 }
2151
2152
2153 clKernWrite(transKernel, 3) << std::endl;
2154
2155
2156 // Now compute the corresponding y,x coordinates
2157 // for a triangular indexing
2158 if (mult_of_16)
2159 clKernWrite(transKernel, 3) << "float row = (" << -2.0f*params.fft_N[0] / 16 / reShapeFactor - 1 << "+sqrt((" << 4.0f*params.fft_N[0] / 16 / reShapeFactor*(params.fft_N[0] / 16 / reShapeFactor + 1) << "-8.0f*g_index- 7)))/ (-2.0f);" << std::endl;
2160 else
2161 clKernWrite(transKernel, 3) << "float row = (" << -2.0f*(params.fft_N[0] / (16 * reShapeFactor) + 1) - 1 << "+sqrt((" << 4.0f*(params.fft_N[0] / (16 * reShapeFactor) + 1)*(params.fft_N[0] / (16 * reShapeFactor) + 1 + 1) << "-8.0f*g_index- 7)))/ (-2.0f);" << std::endl;
2162
2163
2164 clKernWrite(transKernel, 3) << "if (row == (float)(size_t)row) row -= 1; " << std::endl;
2165 clKernWrite(transKernel, 3) << "const size_t t_gy = (size_t)row;" << std::endl;
2166
2167 clKernWrite(transKernel, 3) << "" << std::endl;
2168
2169 if (mult_of_16)
2170 clKernWrite(transKernel, 3) << "const long t_gx_p = g_index - " << (params.fft_N[0] / 16 / reShapeFactor) << "*t_gy + t_gy*(t_gy + 1) / 2;" << std::endl;
2171 else
2172 clKernWrite(transKernel, 3) << "const long t_gx_p = g_index - " << (params.fft_N[0] / (16 * reShapeFactor) + 1) << "*t_gy + t_gy*(t_gy + 1) / 2;" << std::endl;
2173
2174 clKernWrite(transKernel, 3) << "const long t_gy_p = t_gx_p - t_gy;" << std::endl;
2175
2176
2177 clKernWrite(transKernel, 3) << "" << std::endl;
2178
2179 clKernWrite(transKernel, 3) << "const size_t d_lidx = get_local_id(0) % 16;" << std::endl;
2180 clKernWrite(transKernel, 3) << "const size_t d_lidy = get_local_id(0) / 16;" << std::endl;
2181
2182 clKernWrite(transKernel, 3) << "" << std::endl;
2183
2184 clKernWrite(transKernel, 3) << "const size_t lidy = (d_lidy * 16 + d_lidx) /" << (16 * reShapeFactor) << ";" << std::endl;
2185 clKernWrite(transKernel, 3) << "const size_t lidx = (d_lidy * 16 + d_lidx) %" << (16 * reShapeFactor) << ";" << std::endl;
2186
2187 clKernWrite(transKernel, 3) << "" << std::endl;
2188
2189 clKernWrite(transKernel, 3) << "const size_t idx = lidx + t_gx_p*" << 16 * reShapeFactor << ";" << std::endl;
2190 clKernWrite(transKernel, 3) << "const size_t idy = lidy + t_gy_p*" << 16 * reShapeFactor << ";" << std::endl;
2191
2192 clKernWrite(transKernel, 3) << "" << std::endl;
2193
2194 clKernWrite(transKernel, 3) << "const size_t starting_index_yx = t_gy_p*" << 16 * reShapeFactor << " + t_gx_p*" << 16 * reShapeFactor*params.fft_N[0] << ";" << std::endl;
2195
2196 clKernWrite(transKernel, 3) << "" << std::endl;
2197
2198 clKernWrite(transKernel, 3) << "__local " << dtComplex << " xy_s[" << 16 * reShapeFactor * 16 * reShapeFactor << "];" << std::endl;
2199 clKernWrite(transKernel, 3) << "__local " << dtComplex << " yx_s[" << 16 * reShapeFactor * 16 * reShapeFactor << "];" << std::endl;
2200
2201 clKernWrite(transKernel, 3) << dtComplex << " tmpm, tmpt;" << std::endl;
2202
2203 clKernWrite(transKernel, 3) << "" << std::endl;
2204
2205 // Step 1: Load both blocks into local memory
	// Here I load inputA for both blocks contiguously and write it contiguously into
	// the corresponding shared memories.
	// Afterwards I use non-contiguous access from local memory and write contiguously
	// back into the arrays
2210
2211 if (mult_of_16) {
2212 clKernWrite(transKernel, 3) << "size_t index;" << std::endl;
2213 clKernWrite(transKernel, 3) << "for (size_t loop = 0; loop<" << reShapeFactor*reShapeFactor << "; ++loop){" << std::endl;
2214 clKernWrite(transKernel, 6) << "index = lidy*" << 16 * reShapeFactor << " + lidx + loop*256;" << std::endl;
2215
2216 // Handle planar and interleaved right here
2217 switch (params.fft_inputLayout)
2218 {
2219 case CLFFT_COMPLEX_INTERLEAVED:
2220 {
2221 if (params.fft_hasPreCallback)
2222 {
2223 if (params.fft_preCallback.localMemSize > 0)
2224 {
2225 clKernWrite(transKernel, 6) << "tmpm = " << params.fft_preCallback.funcname << "(inputA, iOffset + (idy + loop * " << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata, localmem);" << std::endl;
2226 clKernWrite(transKernel, 6) << "tmpt = " << params.fft_preCallback.funcname << "(inputA, iOffset + (lidy + loop * " << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata, localmem);" << std::endl;
2227 }
2228 else
2229 {
2230 clKernWrite(transKernel, 6) << "tmpm = " << params.fft_preCallback.funcname << "(inputA, iOffset + (idy + loop * " << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata);" << std::endl;
2231 clKernWrite(transKernel, 6) << "tmpt = " << params.fft_preCallback.funcname << "(inputA, iOffset + (lidy + loop * " << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata);" << std::endl;
2232 }
2233 }
2234 else
2235 {
2236 clKernWrite(transKernel, 6) << "tmpm = inputA[(idy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx];" << std::endl;
2237 clKernWrite(transKernel, 6) << "tmpt = inputA[(lidy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx];" << std::endl;
2238 }
2239 }
2240 break;
2241 case CLFFT_COMPLEX_PLANAR:
2242 dtInput = dtPlanar;
2243 dtOutput = dtPlanar;
2244 if (params.fft_hasPreCallback)
2245 {
2246 if (params.fft_preCallback.localMemSize > 0)
2247 {
2248 clKernWrite(transKernel, 6) << "tmpm = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (idy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata, localmem);" << std::endl;
2249 clKernWrite(transKernel, 6) << "tmpt = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (lidy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata, localmem);" << std::endl;
2250 }
2251 else
2252 {
2253 clKernWrite(transKernel, 6) << "tmpm = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (idy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata);" << std::endl;
2254 clKernWrite(transKernel, 6) << "tmpt = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (lidy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata);" << std::endl;
2255 }
2256 }
2257 else
2258 {
2259 clKernWrite(transKernel, 6) << "tmpm.x = inputA_R[(idy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx];" << std::endl;
2260 clKernWrite(transKernel, 6) << "tmpm.y = inputA_I[(idy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx];" << std::endl;
2261
2262 clKernWrite(transKernel, 6) << "tmpt.x = inputA_R[(lidy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx];" << std::endl;
2263 clKernWrite(transKernel, 6) << "tmpt.y = inputA_I[(lidy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx];" << std::endl;
2264 }
2265 break;
2266 case CLFFT_HERMITIAN_INTERLEAVED:
2267 case CLFFT_HERMITIAN_PLANAR:
2268 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
2269 case CLFFT_REAL:
2270 break;
2271 default:
2272 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
2273 }
2274
2275 // it makes more sense to do twiddling in swap kernel
2276 // If requested, generate the Twiddle math to multiply constant values
2277 if (twiddleTransposeKernel)
2278 genTwiddleMath(params, transKernel, dtComplex, fwd);
2279
2280 clKernWrite(transKernel, 6) << "xy_s[index] = tmpm; " << std::endl;
2281 clKernWrite(transKernel, 6) << "yx_s[index] = tmpt; " << std::endl;
2282
2283 clKernWrite(transKernel, 3) << "}" << std::endl;
2284
2285 clKernWrite(transKernel, 3) << "" << std::endl;
2286
2287 clKernWrite(transKernel, 3) << "barrier(CLK_LOCAL_MEM_FENCE);" << std::endl;
2288
2289 clKernWrite(transKernel, 3) << "" << std::endl;
2290
2291
2292 // Step2: Write from shared to global
2293 clKernWrite(transKernel, 3) << "for (size_t loop = 0; loop<" << reShapeFactor*reShapeFactor << "; ++loop){" << std::endl;
2294 clKernWrite(transKernel, 6) << "index = lidx*" << 16 * reShapeFactor << " + lidy + " << 16 / reShapeFactor << "*loop;" << std::endl;
2295
2296
2297 // Handle planar and interleaved right here
2298 switch (params.fft_outputLayout)
2299 {
2300 case CLFFT_COMPLEX_INTERLEAVED:
2301 if (params.fft_hasPostCallback)
2302 {
2303 if (params.transposeMiniBatchSize < 2)//which means the matrix was not broken down into sub square matrics
2304 clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(outputA, ((idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx), post_userdata, yx_s[index]";
2305 else
2306 {
					//assume transpose is only two dimensional for now
2308 //size_t actualBatchSize = params.transposeBatchSize / params.transposeMiniBatchSize;
2309 size_t blockOffset = params.fft_inStride[2];
2310 clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(outputA-" << blockOffset <<"*((get_group_id(0)/numGroupsY_1)%"<< params.transposeMiniBatchSize <<"), ((idy + loop*" << 16 / reShapeFactor << ")*"
2311 << params.fft_N[0] << " + idx + "<< blockOffset <<"*( (get_group_id(0)/numGroupsY_1 )%" << params.transposeMiniBatchSize <<") " << "), post_userdata, yx_s[index]";
2312 }
2313 if (params.fft_postCallback.localMemSize > 0)
2314 {
2315 clKernWrite(transKernel, 0) << ", localmem";
2316 }
2317 clKernWrite(transKernel, 0) << ");" << std::endl;
2318
2319 if (params.transposeMiniBatchSize < 2)
2320 clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(outputA, ((lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx), post_userdata, xy_s[index]";
2321 else
2322 {
2323 size_t blockOffset = params.fft_inStride[2];
2324 //clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(outputA-iOffset, ((lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx +iOffset), post_userdata, xy_s[index]";
2325 //clKernWrite(transKernel, 0) << std::endl;
2326 clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(outputA-" << blockOffset << "*((get_group_id(0)/numGroupsY_1)%" << params.transposeMiniBatchSize << "), ((lidy + loop*" << 16 / reShapeFactor << ")*"
2327 << params.fft_N[0] << " + lidx + starting_index_yx + " << blockOffset << "*( (get_group_id(0)/numGroupsY_1 )%" << params.transposeMiniBatchSize << ") " << "), post_userdata, xy_s[index]";
2328 }
2329 if (params.fft_postCallback.localMemSize > 0)
2330 {
2331 clKernWrite(transKernel, 0) << ", localmem";
2332 }
2333 clKernWrite(transKernel, 0) << ");" << std::endl;
2334 }
2335 else
2336 {
2337 clKernWrite(transKernel, 6) << "outputA[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index];" << std::endl;
2338 clKernWrite(transKernel, 6) << "outputA[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx+ starting_index_yx] = xy_s[index];" << std::endl;
2339 }
2340 break;
2341 case CLFFT_COMPLEX_PLANAR:
2342 if (params.fft_hasPostCallback)
2343 {
2344 if (params.transposeMiniBatchSize < 2)//which means the matrix was not broken down into sub square matrics
2345 clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(outputA_R, outputA_I, ((idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx), post_userdata, yx_s[index].x, yx_s[index].y";
2346 else
2347 {
2348 size_t blockOffset = params.fft_inStride[2];
2349 clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(outputA_R - "<< blockOffset << "*((get_group_id(0)/numGroupsY_1)%" << params.transposeMiniBatchSize <<
2350 "), outputA_I -" << blockOffset << "*((get_group_id(0)/numGroupsY_1)%" << params.transposeMiniBatchSize <<
2351 "), ((idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx +"<< blockOffset << "*((get_group_id(0)/numGroupsY_1)%" << params.transposeMiniBatchSize <<
2352 ")), post_userdata, yx_s[index].x, yx_s[index].y";
2353 }
2354 if (params.fft_postCallback.localMemSize > 0)
2355 {
2356 clKernWrite(transKernel, 0) << ", localmem";
2357 }
2358 clKernWrite(transKernel, 0) << ");" << std::endl;
2359
2360 if (params.transposeMiniBatchSize < 2)//which means the matrix was not broken down into sub square matrics
2361 clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(outputA_R, outputA_I, ((lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx+ starting_index_yx), post_userdata, xy_s[index].x, xy_s[index].y";
2362 else
2363 {
2364 size_t blockOffset = params.fft_inStride[2];
2365 clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(outputA_R - " << blockOffset << "*((get_group_id(0)/numGroupsY_1)%" << params.transposeMiniBatchSize <<
2366 "), outputA_I -" << blockOffset << "*((get_group_id(0)/numGroupsY_1)%" << params.transposeMiniBatchSize <<
2367 "), ((lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx+ starting_index_yx +" << blockOffset << "*((get_group_id(0)/numGroupsY_1)%" << params.transposeMiniBatchSize <<
2368 ")), post_userdata, xy_s[index].x, xy_s[index].y";
2369 }
2370 if (params.fft_postCallback.localMemSize > 0)
2371 {
2372 clKernWrite(transKernel, 0) << ", localmem";
2373 }
2374 clKernWrite(transKernel, 0) << ");" << std::endl;
2375 }
2376 else
2377 {
2378 clKernWrite(transKernel, 6) << "outputA_R[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index].x;" << std::endl;
2379 clKernWrite(transKernel, 6) << "outputA_I[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index].y;" << std::endl;
2380
2381 clKernWrite(transKernel, 6) << "outputA_R[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx+ starting_index_yx] = xy_s[index].x;" << std::endl;
2382 clKernWrite(transKernel, 6) << "outputA_I[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx+ starting_index_yx] = xy_s[index].y;" << std::endl;
2383 }
2384 break;
2385 case CLFFT_HERMITIAN_INTERLEAVED:
2386 case CLFFT_HERMITIAN_PLANAR:
2387 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
2388 case CLFFT_REAL:
2389 break;
2390 default:
2391 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
2392 }
2393
2394
2395
2396 clKernWrite(transKernel, 3) << "}" << std::endl;
2397
2398 }
2399 else {//mult_of_16
2400
2401 clKernWrite(transKernel, 3) << "size_t index;" << std::endl;
2402 clKernWrite(transKernel, 3) << "if (" << params.fft_N[0] << " - (t_gx_p + 1) *" << 16 * reShapeFactor << ">0){" << std::endl;
2403 clKernWrite(transKernel, 6) << "for (size_t loop = 0; loop<" << reShapeFactor*reShapeFactor << "; ++loop){" << std::endl;
2404 clKernWrite(transKernel, 9) << "index = lidy*" << 16 * reShapeFactor << " + lidx + loop*256;" << std::endl;
2405
2406 // Handle planar and interleaved right here
2407 switch (params.fft_inputLayout)
2408 {
2409 case CLFFT_COMPLEX_INTERLEAVED:
2410 if (params.fft_hasPreCallback)
2411 {
2412 if (params.fft_preCallback.localMemSize > 0)
2413 {
2414 clKernWrite(transKernel, 9) << "tmpm = " << params.fft_preCallback.funcname << "(inputA, iOffset + (idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata, localmem);" << std::endl;
2415 clKernWrite(transKernel, 9) << "tmpt = " << params.fft_preCallback.funcname << "(inputA, iOffset + (lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata, localmem);" << std::endl;
2416 }
2417 else
2418 {
2419 clKernWrite(transKernel, 9) << "tmpm = " << params.fft_preCallback.funcname << "(inputA, iOffset + (idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata);" << std::endl;
2420 clKernWrite(transKernel, 9) << "tmpt = " << params.fft_preCallback.funcname << "(inputA, iOffset + (lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata);" << std::endl;
2421 }
2422 }
2423 else
2424 {
2425 clKernWrite(transKernel, 9) << "tmpm = inputA[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx];" << std::endl;
2426 clKernWrite(transKernel, 9) << "tmpt = inputA[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx];" << std::endl;
2427 }
2428 break;
2429 case CLFFT_COMPLEX_PLANAR:
2430 dtInput = dtPlanar;
2431 dtOutput = dtPlanar;
2432 if (params.fft_hasPreCallback)
2433 {
2434 if (params.fft_preCallback.localMemSize > 0)
2435 {
2436 clKernWrite(transKernel, 9) << "tmpm = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata, localmem);" << std::endl;
2437 clKernWrite(transKernel, 9) << "tmpt = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata, localmem);" << std::endl;
2438 }
2439 else
2440 {
2441 clKernWrite(transKernel, 9) << "tmpm = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata);" << std::endl;
2442 clKernWrite(transKernel, 9) << "tmpt = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata);" << std::endl;
2443 }
2444 }
2445 else
2446 {
2447 clKernWrite(transKernel, 9) << "tmpm.x = inputA_R[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx];" << std::endl;
2448 clKernWrite(transKernel, 9) << "tmpm.y = inputA_I[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx];" << std::endl;
2449
2450 clKernWrite(transKernel, 9) << "tmpt.x = inputA_R[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx];" << std::endl;
2451 clKernWrite(transKernel, 9) << "tmpt.y = inputA_I[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx];" << std::endl;
2452 }
2453 break;
2454 case CLFFT_HERMITIAN_INTERLEAVED:
2455 case CLFFT_HERMITIAN_PLANAR:
2456 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
2457 case CLFFT_REAL:
2458 break;
2459 default:
2460 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
2461 }
2462
2463 // it makes more sense to do twiddling in swap kernel
2464 // If requested, generate the Twiddle math to multiply constant values
2465 if (twiddleTransposeKernel)
2466 genTwiddleMath(params, transKernel, dtComplex, fwd);
2467
2468 clKernWrite(transKernel, 9) << "xy_s[index] = tmpm;" << std::endl;
2469 clKernWrite(transKernel, 9) << "yx_s[index] = tmpt;" << std::endl;
2470 clKernWrite(transKernel, 6) << "}" << std::endl;
2471 clKernWrite(transKernel, 3) << "}" << std::endl;
2472
2473 clKernWrite(transKernel, 3) << "else{" << std::endl;
2474 clKernWrite(transKernel, 6) << "for (size_t loop = 0; loop<" << reShapeFactor*reShapeFactor << "; ++loop){" << std::endl;
2475 clKernWrite(transKernel, 9) << "index = lidy*" << 16 * reShapeFactor << " + lidx + loop*256;" << std::endl;
2476
2477
2478 // Handle planar and interleaved right here
2479 switch (params.fft_inputLayout)
2480 {
2481 case CLFFT_COMPLEX_INTERLEAVED:
2482 clKernWrite(transKernel, 9) << "if ((idy + loop*" << 16 / reShapeFactor << ")<" << params.fft_N[0] << "&& idx<" << params.fft_N[0] << ")" << std::endl;
2483 if (params.fft_hasPreCallback)
2484 {
2485 if (params.fft_preCallback.localMemSize > 0)
2486 {
2487 clKernWrite(transKernel, 12) << "tmpm = " << params.fft_preCallback.funcname << "(inputA, iOffset + (idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata, localmem);" << std::endl;
2488 clKernWrite(transKernel, 9) << "if ((t_gy_p *" << 16 * reShapeFactor << " + lidx)<" << params.fft_N[0] << " && (t_gx_p * " << 16 * reShapeFactor << " + lidy + loop*" << 16 / reShapeFactor << ")<" << params.fft_N[0] << ") " << std::endl;
2489 clKernWrite(transKernel, 12) << "tmpt = " << params.fft_preCallback.funcname << "(inputA, iOffset + (lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata, localmem);" << std::endl;
2490 }
2491 else
2492 {
2493 clKernWrite(transKernel, 12) << "tmpm = " << params.fft_preCallback.funcname << "(inputA, iOffset + (idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata);" << std::endl;
2494 clKernWrite(transKernel, 9) << "if ((t_gy_p *" << 16 * reShapeFactor << " + lidx)<" << params.fft_N[0] << " && (t_gx_p * " << 16 * reShapeFactor << " + lidy + loop*" << 16 / reShapeFactor << ")<" << params.fft_N[0] << ") " << std::endl;
2495 clKernWrite(transKernel, 12) << "tmpt = " << params.fft_preCallback.funcname << "(inputA, iOffset + (lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata);" << std::endl;
2496 }
2497 }
2498 else
2499 {
2500 clKernWrite(transKernel, 12) << "tmpm = inputA[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx];" << std::endl;
2501 clKernWrite(transKernel, 9) << "if ((t_gy_p *" << 16 * reShapeFactor << " + lidx)<" << params.fft_N[0] << " && (t_gx_p * " << 16 * reShapeFactor << " + lidy + loop*" << 16 / reShapeFactor << ")<" << params.fft_N[0] << ") " << std::endl;
2502 clKernWrite(transKernel, 12) << "tmpt = inputA[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx];" << std::endl;
2503 }
2504 break;
2505 case CLFFT_COMPLEX_PLANAR:
2506 dtInput = dtPlanar;
2507 dtOutput = dtPlanar;
2508 clKernWrite(transKernel, 9) << "if ((idy + loop*" << 16 / reShapeFactor << ")<" << params.fft_N[0] << "&& idx<" << params.fft_N[0] << ") {" << std::endl;
2509 if (params.fft_hasPreCallback)
2510 {
2511 if (params.fft_preCallback.localMemSize > 0)
2512 {
2513 clKernWrite(transKernel, 12) << "tmpm = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata, localmem); }" << std::endl;
2514 clKernWrite(transKernel, 9) << "if ((t_gy_p *" << 16 * reShapeFactor << " + lidx)<" << params.fft_N[0] << " && (t_gx_p * " << 16 * reShapeFactor << " + lidy + loop*" << 16 / reShapeFactor << ")<" << params.fft_N[0] << ") {" << std::endl;
2515 clKernWrite(transKernel, 12) << "tmpt = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata, localmem); }" << std::endl;
2516 }
2517 else
2518 {
2519 clKernWrite(transKernel, 12) << "tmpm = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata); }" << std::endl;
2520 clKernWrite(transKernel, 9) << "if ((t_gy_p *" << 16 * reShapeFactor << " + lidx)<" << params.fft_N[0] << " && (t_gx_p * " << 16 * reShapeFactor << " + lidy + loop*" << 16 / reShapeFactor << ")<" << params.fft_N[0] << ") {" << std::endl;
2521 clKernWrite(transKernel, 12) << "tmpt = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata); }" << std::endl;
2522 }
2523 }
2524 else
2525 {
2526 clKernWrite(transKernel, 12) << "tmpm.x = inputA_R[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx];" << std::endl;
2527 clKernWrite(transKernel, 12) << "tmpm.y = inputA_I[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx]; }" << std::endl;
2528 clKernWrite(transKernel, 9) << "if ((t_gy_p *" << 16 * reShapeFactor << " + lidx)<" << params.fft_N[0] << " && (t_gx_p * " << 16 * reShapeFactor << " + lidy + loop*" << 16 / reShapeFactor << ")<" << params.fft_N[0] << ") {" << std::endl;
2529 clKernWrite(transKernel, 12) << "tmpt.x = inputA_R[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx];" << std::endl;
2530 clKernWrite(transKernel, 12) << "tmpt.y = inputA_I[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx]; }" << std::endl;
2531 }
2532 break;
2533 case CLFFT_HERMITIAN_INTERLEAVED:
2534 case CLFFT_HERMITIAN_PLANAR:
2535 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
2536 case CLFFT_REAL:
2537 break;
2538 default:
2539 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
2540 }
2541
2542
2543 // If requested, generate the Twiddle math to multiply constant values
2544 if (twiddleTransposeKernel)
2545 genTwiddleMath(params, transKernel, dtComplex, fwd);
2546
2547 clKernWrite(transKernel, 9) << "xy_s[index] = tmpm;" << std::endl;
2548 clKernWrite(transKernel, 9) << "yx_s[index] = tmpt;" << std::endl;
2549
2550 clKernWrite(transKernel, 9) << "}" << std::endl;
2551 clKernWrite(transKernel, 3) << "}" << std::endl;
2552
2553 clKernWrite(transKernel, 3) << "" << std::endl;
2554 clKernWrite(transKernel, 3) << "barrier(CLK_LOCAL_MEM_FENCE);" << std::endl;
2555 clKernWrite(transKernel, 3) << "" << std::endl;
2556
2557 // Step2: Write from shared to global
2558
2559 clKernWrite(transKernel, 3) << "if (" << params.fft_N[0] << " - (t_gx_p + 1) *" << 16 * reShapeFactor << ">0){" << std::endl;
2560 clKernWrite(transKernel, 6) << "for (size_t loop = 0; loop<" << reShapeFactor*reShapeFactor << "; ++loop){" << std::endl;
2561 clKernWrite(transKernel, 9) << "index = lidx*" << 16 * reShapeFactor << " + lidy + " << 16 / reShapeFactor << "*loop ;" << std::endl;
2562
2563 // Handle planar and interleaved right here
2564 switch (params.fft_outputLayout)
2565 {
2566 case CLFFT_COMPLEX_INTERLEAVED:
2567 if (params.fft_hasPostCallback)
2568 {
2569 if (params.transposeMiniBatchSize < 2)//which means the matrix was not broken down into sub square matrics
2570 clKernWrite(transKernel, 9) << params.fft_postCallback.funcname << "(outputA, ((idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx), post_userdata, yx_s[index]";
2571 else
2572 clKernWrite(transKernel, 9) << params.fft_postCallback.funcname << "(outputA - iOffset, iOffset + ((idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx), post_userdata, yx_s[index]";
2573 if (params.fft_postCallback.localMemSize > 0)
2574 {
2575 clKernWrite(transKernel, 0) << ", localmem";
2576 }
2577 clKernWrite(transKernel, 0) << ");" << std::endl;
2578 if (params.transposeMiniBatchSize < 2)//which means the matrix was not broken down into sub square matrics
2579 clKernWrite(transKernel, 9) << params.fft_postCallback.funcname << "(outputA, ((lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx), post_userdata, xy_s[index]";
2580 else
2581 clKernWrite(transKernel, 9) << params.fft_postCallback.funcname << "(outputA - iOffset, iOffset + ((lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx), post_userdata, xy_s[index]";
2582 if (params.fft_postCallback.localMemSize > 0)
2583 {
2584 clKernWrite(transKernel, 0) << ", localmem";
2585 }
2586 clKernWrite(transKernel, 0) << ");" << std::endl;
2587 }
2588 else
2589 {
2590 clKernWrite(transKernel, 9) << "outputA[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index];" << std::endl;
2591 clKernWrite(transKernel, 9) << "outputA[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx] = xy_s[index]; " << std::endl;
2592 }
2593 break;
2594 case CLFFT_COMPLEX_PLANAR:
2595 if (params.fft_hasPostCallback)
2596 {
2597 if (params.transposeMiniBatchSize < 2)//which means the matrix was not broken down into sub square matrics
2598 clKernWrite(transKernel, 9) << params.fft_postCallback.funcname << "(outputA_R, outputA_I, ((idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx), post_userdata, yx_s[index].x, yx_s[index].y";
2599 else
2600 clKernWrite(transKernel, 9) << params.fft_postCallback.funcname << "(outputA_R-iOffset, outputA_I-iOffset, iOffset+((idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx), post_userdata, yx_s[index].x, yx_s[index].y";
2601 if (params.fft_postCallback.localMemSize > 0)
2602 {
2603 clKernWrite(transKernel, 0) << ", localmem";
2604 }
2605 clKernWrite(transKernel, 0) << ");" << std::endl;
2606
2607 if (params.transposeMiniBatchSize < 2)//which means the matrix was not broken down into sub square matrics
2608 clKernWrite(transKernel, 9) << params.fft_postCallback.funcname << "(outputA_R, outputA_I, ((lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx), post_userdata, xy_s[index].x, xy_s[index].y";
2609 else
2610 clKernWrite(transKernel, 9) << params.fft_postCallback.funcname << "(outputA_R-iOffset, outputA_I-iOffset, iOffset+((lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx), post_userdata, xy_s[index].x, xy_s[index].y";
2611 if (params.fft_postCallback.localMemSize > 0)
2612 {
2613 clKernWrite(transKernel, 0) << ", localmem";
2614 }
2615 clKernWrite(transKernel, 0) << ");" << std::endl;
2616 }
2617 else
2618 {
2619 clKernWrite(transKernel, 9) << "outputA_R[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index].x;" << std::endl;
2620 clKernWrite(transKernel, 9) << "outputA_I[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index].y;" << std::endl;
2621 clKernWrite(transKernel, 9) << "outputA_R[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx] = xy_s[index].x; " << std::endl;
2622 clKernWrite(transKernel, 9) << "outputA_I[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx] = xy_s[index].y; " << std::endl;
2623 }
2624 break;
2625 case CLFFT_HERMITIAN_INTERLEAVED:
2626 case CLFFT_HERMITIAN_PLANAR:
2627 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
2628 case CLFFT_REAL:
2629 break;
2630 default:
2631 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
2632 }
2633
2634
2635 clKernWrite(transKernel, 6) << "}" << std::endl;
2636 clKernWrite(transKernel, 3) << "}" << std::endl;
2637
2638 clKernWrite(transKernel, 3) << "else{" << std::endl;
2639 clKernWrite(transKernel, 6) << "for (size_t loop = 0; loop<" << reShapeFactor*reShapeFactor << "; ++loop){" << std::endl;
2640
2641 clKernWrite(transKernel, 9) << "index = lidx*" << 16 * reShapeFactor << " + lidy + " << 16 / reShapeFactor << "*loop;" << std::endl;
2642
2643 // Handle planar and interleaved right here
2644 switch (params.fft_outputLayout)
2645 {
2646 case CLFFT_COMPLEX_INTERLEAVED:
2647 clKernWrite(transKernel, 9) << "if ((idy + loop*" << 16 / reShapeFactor << ")<" << params.fft_N[0] << " && idx<" << params.fft_N[0] << ")" << std::endl;
2648 if (params.fft_hasPostCallback)
2649 {
2650 if (params.transposeMiniBatchSize < 2)//which means the matrix was not broken down into sub square matrics
2651 clKernWrite(transKernel, 12) << params.fft_postCallback.funcname << "(outputA, ((idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx), post_userdata, yx_s[index]";
2652 else
2653 clKernWrite(transKernel, 12) << params.fft_postCallback.funcname << "(outputA - iOffset, iOffset + ((idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx), post_userdata, yx_s[index]";
2654 if (params.fft_postCallback.localMemSize > 0)
2655 {
2656 clKernWrite(transKernel, 0) << ", localmem";
2657 }
2658 clKernWrite(transKernel, 0) << ");" << std::endl;
2659
2660 clKernWrite(transKernel, 9) << "if ((t_gy_p * " << 16 * reShapeFactor << " + lidx)<" << params.fft_N[0] << " && (t_gx_p * " << 16 * reShapeFactor << " + lidy + loop*" << 16 / reShapeFactor << ")<" << params.fft_N[0] << ")" << std::endl;
2661 if (params.transposeMiniBatchSize < 2)//which means the matrix was not broken down into sub square matrics
2662 clKernWrite(transKernel, 12) << params.fft_postCallback.funcname << "(outputA, ((lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx), post_userdata, xy_s[index]";
2663 else
2664 clKernWrite(transKernel, 12) << params.fft_postCallback.funcname << "(outputA - iOffset, iOffset + ((lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx), post_userdata, xy_s[index]";
2665
2666 if (params.fft_postCallback.localMemSize > 0)
2667 {
2668 clKernWrite(transKernel, 0) << ", localmem";
2669 }
2670 clKernWrite(transKernel, 0) << ");" << std::endl;
2671 }
2672 else
2673 {
2674 clKernWrite(transKernel, 12) << "outputA[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index]; " << std::endl;
2675 clKernWrite(transKernel, 9) << "if ((t_gy_p * " << 16 * reShapeFactor << " + lidx)<" << params.fft_N[0] << " && (t_gx_p * " << 16 * reShapeFactor << " + lidy + loop*" << 16 / reShapeFactor << ")<" << params.fft_N[0] << ")" << std::endl;
2676 clKernWrite(transKernel, 12) << "outputA[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx] = xy_s[index];" << std::endl;
2677 }
2678 break;
2679 case CLFFT_COMPLEX_PLANAR:
2680 clKernWrite(transKernel, 9) << "if ((idy + loop*" << 16 / reShapeFactor << ")<" << params.fft_N[0] << " && idx<" << params.fft_N[0] << ") {" << std::endl;
2681
2682 if (params.fft_hasPostCallback)
2683 {
2684 if (params.transposeMiniBatchSize < 2)//which means the matrix was not broken down into sub square matrics
2685 {
2686 clKernWrite(transKernel, 12) << params.fft_postCallback.funcname << "(outputA_R, outputA_I, ((idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx), post_userdata, yx_s[index].x, yx_s[index].y";
2687 }
2688 else
2689 {
2690 clKernWrite(transKernel, 12) << params.fft_postCallback.funcname << "(outputA_R-iOffset, outputA_I-iOffset, iOffset+((idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx), post_userdata, yx_s[index].x, yx_s[index].y";
2691 }
2692 if (params.fft_postCallback.localMemSize > 0)
2693 {
2694 clKernWrite(transKernel, 0) << ", localmem";
2695 }
2696 clKernWrite(transKernel, 0) << "); }" << std::endl;
2697
2698 clKernWrite(transKernel, 9) << "if ((t_gy_p * " << 16 * reShapeFactor << " + lidx)<" << params.fft_N[0] << " && (t_gx_p * " << 16 * reShapeFactor << " + lidy + loop*" << 16 / reShapeFactor << ")<" << params.fft_N[0] << ") {" << std::endl;
2699 if (params.transposeMiniBatchSize < 2)//which means the matrix was not broken down into sub square matrics
2700 {
2701 clKernWrite(transKernel, 12) << params.fft_postCallback.funcname << "(outputA_R, outputA_I, ((lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx), post_userdata, xy_s[index].x, xy_s[index].y";
2702 }
2703 else
2704 {
2705 clKernWrite(transKernel, 12) << params.fft_postCallback.funcname << "(outputA_R-iOffset, outputA_I-iOffset, iOffset+((lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx), post_userdata, xy_s[index].x, xy_s[index].y";
2706 }
2707 if (params.fft_postCallback.localMemSize > 0)
2708 {
2709 clKernWrite(transKernel, 0) << ", localmem";
2710 }
2711 clKernWrite(transKernel, 0) << "); }" << std::endl;
2712 }
2713 else
2714 {
2715 clKernWrite(transKernel, 12) << "outputA_R[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index].x; " << std::endl;
2716 clKernWrite(transKernel, 12) << "outputA_I[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index].y; }" << std::endl;
2717 clKernWrite(transKernel, 9) << "if ((t_gy_p * " << 16 * reShapeFactor << " + lidx)<" << params.fft_N[0] << " && (t_gx_p * " << 16 * reShapeFactor << " + lidy + loop*" << 16 / reShapeFactor << ")<" << params.fft_N[0] << ") {" << std::endl;
2718 clKernWrite(transKernel, 12) << "outputA_R[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx] = xy_s[index].x;" << std::endl;
2719 clKernWrite(transKernel, 12) << "outputA_I[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx] = xy_s[index].y; }" << std::endl;
2720 }
2721
2722 break;
2723 case CLFFT_HERMITIAN_INTERLEAVED:
2724 case CLFFT_HERMITIAN_PLANAR:
2725 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
2726 case CLFFT_REAL:
2727 break;
2728 default:
2729 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
2730 }
2731
2732
2733 clKernWrite(transKernel, 6) << "}" << std::endl; // end for
2734 clKernWrite(transKernel, 3) << "}" << std::endl; // end else
2735
2736
2737 }
2738 clKernWrite(transKernel, 0) << "}" << std::endl;
2739
2740 strKernel = transKernel.str();
2741
2742
2743 if (!twiddleTransposeKernel)
2744 break; // break for bothDir
2745 }
2746
2747 return CLFFT_SUCCESS;
2748 }
2749
2750 //generate transpose kernel with square 2d matrix of row major with blocks along the leading dimension
2751 //aka leading dimension batched
2752 /*
2753 Below is a matrix (row major) containing three square sub-matrices along the row
2754 [M0 M1 M2]
2755 */
genTransposeKernelLeadingDimensionBatched(const FFTGeneratedTransposeNonSquareAction::Signature & params,std::string & strKernel,const size_t & lwSize,const size_t reShapeFactor)2756 clfftStatus genTransposeKernelLeadingDimensionBatched(const FFTGeneratedTransposeNonSquareAction::Signature & params, std::string& strKernel, const size_t& lwSize, const size_t reShapeFactor)
2757 {
2758 strKernel.reserve(4096);
2759 std::stringstream transKernel(std::stringstream::out);
2760
2761 // These strings represent the various data types we read or write in the kernel, depending on how the plan
2762 // is configured
2763 std::string dtInput; // The type read as input into kernel
2764 std::string dtOutput; // The type written as output from kernel
2765 std::string dtPlanar; // Fundamental type for planar arrays
2766 std::string dtComplex; // Fundamental type for complex arrays
2767
2768 // NOTE: Enable only for debug
2769 // clKernWrite( transKernel, 0 ) << "#pragma OPENCL EXTENSION cl_amd_printf : enable\n" << std::endl;
2770
2771 //if (params.fft_inputLayout != params.fft_outputLayout)
2772 // return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
2773
2774 switch (params.fft_precision)
2775 {
2776 case CLFFT_SINGLE:
2777 case CLFFT_SINGLE_FAST:
2778 dtPlanar = "float";
2779 dtComplex = "float2";
2780 break;
2781 case CLFFT_DOUBLE:
2782 case CLFFT_DOUBLE_FAST:
2783 dtPlanar = "double";
2784 dtComplex = "double2";
2785
2786 // Emit code that enables double precision in the kernel
2787 clKernWrite(transKernel, 0) << "#ifdef cl_khr_fp64" << std::endl;
2788 clKernWrite(transKernel, 3) << "#pragma OPENCL EXTENSION cl_khr_fp64 : enable" << std::endl;
2789 clKernWrite(transKernel, 0) << "#else" << std::endl;
2790 clKernWrite(transKernel, 3) << "#pragma OPENCL EXTENSION cl_amd_fp64 : enable" << std::endl;
2791 clKernWrite(transKernel, 0) << "#endif\n" << std::endl;
2792
2793 break;
2794 default:
2795 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
2796 break;
2797 }
2798
2799
2800 // If twiddle computation has been requested, generate the lookup function
2801 if (params.fft_3StepTwiddle)
2802 {
2803 std::string str;
2804 StockhamGenerator::TwiddleTableLarge twLarge(params.fft_N[0] * params.fft_N[1]);
2805 if ((params.fft_precision == CLFFT_SINGLE) || (params.fft_precision == CLFFT_SINGLE_FAST))
2806 twLarge.GenerateTwiddleTable<StockhamGenerator::P_SINGLE>(str);
2807 else
2808 twLarge.GenerateTwiddleTable<StockhamGenerator::P_DOUBLE>(str);
2809 clKernWrite(transKernel, 0) << str << std::endl;
2810 clKernWrite(transKernel, 0) << std::endl;
2811 }
2812
2813 size_t smaller_dim = (params.fft_N[0] < params.fft_N[1]) ? params.fft_N[0] : params.fft_N[1];
2814 size_t bigger_dim = (params.fft_N[0] >= params.fft_N[1]) ? params.fft_N[0] : params.fft_N[1];
2815 size_t dim_ratio = bigger_dim / smaller_dim;
2816
2817 // This detects whether the input matrix is rectangle of ratio 1:2
2818
2819 if ((params.fft_N[0] != 2 * params.fft_N[1]) && (params.fft_N[1] != 2 * params.fft_N[0]) &&
2820 (params.fft_N[0] != 3 * params.fft_N[1]) && (params.fft_N[1] != 3 * params.fft_N[0]) &&
2821 (params.fft_N[0] != 5 * params.fft_N[1]) && (params.fft_N[1] != 5 * params.fft_N[0]) &&
2822 (params.fft_N[0] != 10 * params.fft_N[1]) && (params.fft_N[1] != 10 * params.fft_N[0]))
2823 {
2824 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
2825 }
2826
2827 if (params.fft_placeness == CLFFT_OUTOFPLACE)
2828 {
2829 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
2830 }
2831
2832 // This detects whether the input matrix is a multiple of 16*reshapefactor or not
2833
2834 bool mult_of_16 = (smaller_dim % (reShapeFactor * 16) == 0) ? true : false;
2835
2836 for (size_t bothDir = 0; bothDir < 2; bothDir++)
2837 {
2838 bool fwd = bothDir ? false : true;
2839
2840 //If pre-callback is set for the plan
2841 if (params.fft_hasPreCallback)
2842 {
2843 //Insert callback function code at the beginning
2844 clKernWrite(transKernel, 0) << params.fft_preCallback.funcstring << std::endl;
2845 clKernWrite(transKernel, 0) << std::endl;
2846 }
2847 //If post-callback is set for the plan
2848 if (params.fft_hasPostCallback)
2849 {
2850 //Insert callback function code at the beginning
2851 clKernWrite(transKernel, 0) << params.fft_postCallback.funcstring << std::endl;
2852 clKernWrite(transKernel, 0) << std::endl;
2853 }
2854
2855 std::string funcName;
2856 if (params.fft_3StepTwiddle) // TODO
2857 funcName = fwd ? "transpose_nonsquare_tw_fwd" : "transpose_nonsquare_tw_back";
2858 else
2859 funcName = "transpose_nonsquare";
2860
2861
2862 // Generate kernel API
2863 genTransposePrototypeLeadingDimensionBatched(params, lwSize, dtPlanar, dtComplex, funcName, transKernel, dtInput, dtOutput);
2864
2865 if (mult_of_16)//number of WG per sub square block
2866 clKernWrite(transKernel, 3) << "const size_t numGroups_square_matrix_Y_1 = " << (smaller_dim / 16 / reShapeFactor)*(smaller_dim / 16 / reShapeFactor + 1) / 2 << ";" << std::endl;
2867 else
2868 clKernWrite(transKernel, 3) << "const size_t numGroups_square_matrix_Y_1 = " << (smaller_dim / (16 * reShapeFactor) + 1)*(smaller_dim / (16 * reShapeFactor) + 1 + 1) / 2 << ";" << std::endl;
2869
2870 clKernWrite(transKernel, 3) << "const size_t numGroupsY_1 = numGroups_square_matrix_Y_1 * "<< dim_ratio <<";" << std::endl;
2871
2872 for (size_t i = 2; i < params.fft_DataDim - 1; i++)
2873 {
2874 clKernWrite(transKernel, 3) << "const size_t numGroupsY_" << i << " = numGroupsY_" << i - 1 << " * " << params.fft_N[i] << ";" << std::endl;
2875 }
2876
2877 clKernWrite(transKernel, 3) << "size_t g_index;" << std::endl;
2878 clKernWrite(transKernel, 3) << "size_t square_matrix_index;" << std::endl;
2879 clKernWrite(transKernel, 3) << "size_t square_matrix_offset;" << std::endl;
2880 clKernWrite(transKernel, 3) << std::endl;
2881
2882 OffsetCalcLeadingDimensionBatched(transKernel, params);
2883
2884 clKernWrite(transKernel, 3) << "square_matrix_index = (g_index / numGroups_square_matrix_Y_1) ;" << std::endl;
2885 clKernWrite(transKernel, 3) << "g_index = g_index % numGroups_square_matrix_Y_1" << ";" << std::endl;
2886 clKernWrite(transKernel, 3) << std::endl;
2887
2888 if (smaller_dim == params.fft_N[1])
2889 {
2890 clKernWrite(transKernel, 3) << "square_matrix_offset = square_matrix_index * " << smaller_dim << ";" << std::endl;
2891 }
2892 else
2893 {
2894 clKernWrite(transKernel, 3) << "square_matrix_offset = square_matrix_index *" << smaller_dim * smaller_dim << ";" << std::endl;
2895 }
2896
2897 clKernWrite(transKernel, 3) << "iOffset += square_matrix_offset ;" << std::endl;
2898
2899 // Handle planar and interleaved right here
2900 switch (params.fft_inputLayout)
2901 {
2902 case CLFFT_COMPLEX_INTERLEAVED:
2903 case CLFFT_REAL:
2904 //Do not advance offset when precallback is set as the starting address of global buffer is needed
2905 if (!params.fft_hasPreCallback)
2906 {
2907 clKernWrite(transKernel, 3) << "inputA += iOffset;" << std::endl; // Set A ptr to the start of each slice
2908 }
2909 break;
2910 case CLFFT_COMPLEX_PLANAR:
2911 //Do not advance offset when precallback is set as the starting address of global buffer is needed
2912 if (!params.fft_hasPreCallback)
2913 {
2914 clKernWrite(transKernel, 3) << "inputA_R += iOffset;" << std::endl; // Set A ptr to the start of each slice
2915 clKernWrite(transKernel, 3) << "inputA_I += iOffset;" << std::endl; // Set A ptr to the start of each slice
2916 }
2917 break;
2918 case CLFFT_HERMITIAN_INTERLEAVED:
2919 case CLFFT_HERMITIAN_PLANAR:
2920 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
2921 default:
2922 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
2923 }
2924
2925 switch (params.fft_inputLayout)
2926 {
2927 case CLFFT_COMPLEX_INTERLEAVED:
2928 case CLFFT_REAL:
2929 if (params.fft_hasPreCallback)
2930 {
2931 clKernWrite(transKernel, 3) << "global " << dtInput << " *outputA = inputA + iOffset;" << std::endl;
2932 }
2933 else
2934 {
2935 clKernWrite(transKernel, 3) << "global " << dtInput << " *outputA = inputA;" << std::endl;
2936 }
2937 break;
2938 case CLFFT_COMPLEX_PLANAR:
2939 if (params.fft_hasPreCallback)
2940 {
2941 clKernWrite(transKernel, 3) << "global " << dtInput << " *outputA_R = inputA_R + iOffset;" << std::endl;
2942 clKernWrite(transKernel, 3) << "global " << dtInput << " *outputA_I = inputA_I + iOffset;" << std::endl;
2943 }
2944 else
2945 {
2946 clKernWrite(transKernel, 3) << "global " << dtInput << " *outputA_R = inputA_R;" << std::endl;
2947 clKernWrite(transKernel, 3) << "global " << dtInput << " *outputA_I = inputA_I;" << std::endl;
2948 }
2949 break;
2950 case CLFFT_HERMITIAN_INTERLEAVED:
2951 case CLFFT_HERMITIAN_PLANAR:
2952 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
2953 default:
2954 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
2955 }
2956
2957
2958 clKernWrite(transKernel, 3) << std::endl;
2959
2960 // Now compute the corresponding y,x coordinates
2961 // for a triangular indexing
2962 if (mult_of_16)
2963 clKernWrite(transKernel, 3) << "float row = (" << -2.0f*smaller_dim / 16 / reShapeFactor - 1 << "+sqrt((" << 4.0f*smaller_dim / 16 / reShapeFactor*(smaller_dim / 16 / reShapeFactor + 1) << "-8.0f*g_index- 7)))/ (-2.0f);" << std::endl;
2964 else
2965 clKernWrite(transKernel, 3) << "float row = (" << -2.0f*(smaller_dim / (16 * reShapeFactor) + 1) - 1 << "+sqrt((" << 4.0f*(smaller_dim / (16 * reShapeFactor) + 1)*(smaller_dim / (16 * reShapeFactor) + 1 + 1) << "-8.0f*g_index- 7)))/ (-2.0f);" << std::endl;
2966
2967
2968 clKernWrite(transKernel, 3) << "if (row == (float)(int)row) row -= 1; " << std::endl;
2969 clKernWrite(transKernel, 3) << "const size_t t_gy = (int)row;" << std::endl;
2970
2971 clKernWrite(transKernel, 3) << "" << std::endl;
2972
2973 if (mult_of_16)
2974 clKernWrite(transKernel, 3) << "const long t_gx_p = g_index - " << (smaller_dim / 16 / reShapeFactor) << "*t_gy + t_gy*(t_gy + 1) / 2;" << std::endl;
2975 else
2976 clKernWrite(transKernel, 3) << "const long t_gx_p = g_index - " << (smaller_dim / (16 * reShapeFactor) + 1) << "*t_gy + t_gy*(t_gy + 1) / 2;" << std::endl;
2977
2978 clKernWrite(transKernel, 3) << "const long t_gy_p = t_gx_p - t_gy;" << std::endl;
2979
2980
2981 clKernWrite(transKernel, 3) << "" << std::endl;
2982
2983 clKernWrite(transKernel, 3) << "const size_t d_lidx = get_local_id(0) % 16;" << std::endl;
2984 clKernWrite(transKernel, 3) << "const size_t d_lidy = get_local_id(0) / 16;" << std::endl;
2985
2986 clKernWrite(transKernel, 3) << "" << std::endl;
2987
2988 clKernWrite(transKernel, 3) << "const size_t lidy = (d_lidy * 16 + d_lidx) /" << (16 * reShapeFactor) << ";" << std::endl;
2989 clKernWrite(transKernel, 3) << "const size_t lidx = (d_lidy * 16 + d_lidx) %" << (16 * reShapeFactor) << ";" << std::endl;
2990
2991 clKernWrite(transKernel, 3) << "" << std::endl;
2992
2993 clKernWrite(transKernel, 3) << "const size_t idx = lidx + t_gx_p*" << 16 * reShapeFactor << ";" << std::endl;
2994 clKernWrite(transKernel, 3) << "const size_t idy = lidy + t_gy_p*" << 16 * reShapeFactor << ";" << std::endl;
2995
2996 clKernWrite(transKernel, 3) << "" << std::endl;
2997
2998 clKernWrite(transKernel, 3) << "const size_t starting_index_yx = t_gy_p*" << 16 * reShapeFactor << " + t_gx_p*" << 16 * reShapeFactor*params.fft_N[0] << ";" << std::endl;
2999
3000 clKernWrite(transKernel, 3) << "" << std::endl;
3001
3002 switch (params.fft_inputLayout)
3003 {
3004 case CLFFT_REAL:
3005 case CLFFT_COMPLEX_INTERLEAVED:
3006 clKernWrite(transKernel, 3) << "__local " << dtInput << " xy_s[" << 16 * reShapeFactor * 16 * reShapeFactor << "];" << std::endl;
3007 clKernWrite(transKernel, 3) << "__local " << dtInput << " yx_s[" << 16 * reShapeFactor * 16 * reShapeFactor << "];" << std::endl;
3008
3009 clKernWrite(transKernel, 3) << dtInput << " tmpm, tmpt;" << std::endl;
3010 break;
3011 case CLFFT_COMPLEX_PLANAR:
3012 clKernWrite(transKernel, 3) << "__local " << dtComplex << " xy_s[" << 16 * reShapeFactor * 16 * reShapeFactor << "];" << std::endl;
3013 clKernWrite(transKernel, 3) << "__local " << dtComplex << " yx_s[" << 16 * reShapeFactor * 16 * reShapeFactor << "];" << std::endl;
3014
3015 clKernWrite(transKernel, 3) << dtComplex << " tmpm, tmpt;" << std::endl;
3016 break;
3017 default:
3018 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
3019 }
3020 clKernWrite(transKernel, 3) << "" << std::endl;
3021
3022 // Step 1: Load both blocks into local memory
3023 // Here I load inputA for both blocks contiguously and write it contigously into
3024 // the corresponding shared memories.
3025 // Afterwards I use non-contiguous access from local memory and write contiguously
3026 // back into the arrays
3027
3028 if (mult_of_16) {
3029 clKernWrite(transKernel, 3) << "size_t index;" << std::endl;
3030 clKernWrite(transKernel, 3) << "for (size_t loop = 0; loop<" << reShapeFactor*reShapeFactor << "; ++loop){" << std::endl;
3031 clKernWrite(transKernel, 6) << "index = lidy*" << 16 * reShapeFactor << " + lidx + loop*256;" << std::endl;
3032
3033 // Handle planar and interleaved right here
3034 switch (params.fft_inputLayout)
3035 {
3036 case CLFFT_COMPLEX_INTERLEAVED:
3037 case CLFFT_REAL:
3038 {
3039 if (params.fft_hasPreCallback)
3040 {
3041 if (params.fft_preCallback.localMemSize > 0)
3042 {
3043 clKernWrite(transKernel, 6) << "tmpm = " << params.fft_preCallback.funcname << "(inputA, iOffset + (idy + loop * " << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata, localmem);" << std::endl;
3044 clKernWrite(transKernel, 6) << "tmpt = " << params.fft_preCallback.funcname << "(inputA, iOffset + (lidy + loop * " << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata, localmem);" << std::endl;
3045 }
3046 else
3047 {
3048 clKernWrite(transKernel, 6) << "tmpm = " << params.fft_preCallback.funcname << "(inputA, iOffset + (idy + loop * " << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata);" << std::endl;
3049 clKernWrite(transKernel, 6) << "tmpt = " << params.fft_preCallback.funcname << "(inputA, iOffset + (lidy + loop * " << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata);" << std::endl;
3050 }
3051 }
3052 else
3053 {
3054 clKernWrite(transKernel, 6) << "tmpm = inputA[(idy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx];" << std::endl;
3055 clKernWrite(transKernel, 6) << "tmpt = inputA[(lidy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx];" << std::endl;
3056 }
3057 }
3058 break;
3059 case CLFFT_COMPLEX_PLANAR:
3060 dtInput = dtPlanar;
3061 dtOutput = dtPlanar;
3062 if (params.fft_hasPreCallback)
3063 {
3064 if (params.fft_preCallback.localMemSize > 0)
3065 {
3066 clKernWrite(transKernel, 6) << "tmpm = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (idy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata, localmem);" << std::endl;
3067 clKernWrite(transKernel, 6) << "tmpt = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (lidy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata, localmem);" << std::endl;
3068 }
3069 else
3070 {
3071 clKernWrite(transKernel, 6) << "tmpm = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (idy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata);" << std::endl;
3072 clKernWrite(transKernel, 6) << "tmpt = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (lidy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata);" << std::endl;
3073 }
3074 }
3075 else
3076 {
3077 clKernWrite(transKernel, 6) << "tmpm.x = inputA_R[(idy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx];" << std::endl;
3078 clKernWrite(transKernel, 6) << "tmpm.y = inputA_I[(idy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx];" << std::endl;
3079
3080 clKernWrite(transKernel, 6) << "tmpt.x = inputA_R[(lidy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx];" << std::endl;
3081 clKernWrite(transKernel, 6) << "tmpt.y = inputA_I[(lidy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx];" << std::endl;
3082 }
3083 break;
3084 case CLFFT_HERMITIAN_INTERLEAVED:
3085 case CLFFT_HERMITIAN_PLANAR:
3086 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
3087 default:
3088 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
3089 }
3090
3091 // If requested, generate the Twiddle math to multiply constant values
3092 if (params.fft_3StepTwiddle)
3093 genTwiddleMathLeadingDimensionBatched(params, transKernel, dtComplex, fwd);
3094
3095 clKernWrite(transKernel, 6) << "xy_s[index] = tmpm; " << std::endl;
3096 clKernWrite(transKernel, 6) << "yx_s[index] = tmpt; " << std::endl;
3097
3098 clKernWrite(transKernel, 3) << "}" << std::endl;
3099
3100 clKernWrite(transKernel, 3) << "" << std::endl;
3101
3102 clKernWrite(transKernel, 3) << "barrier(CLK_LOCAL_MEM_FENCE);" << std::endl;
3103
3104 clKernWrite(transKernel, 3) << "" << std::endl;
3105
3106
3107 // Step2: Write from shared to global
3108 clKernWrite(transKernel, 3) << "for (size_t loop = 0; loop<" << reShapeFactor*reShapeFactor << "; ++loop){" << std::endl;
3109 clKernWrite(transKernel, 6) << "index = lidx*" << 16 * reShapeFactor << " + lidy + " << 16 / reShapeFactor << "*loop;" << std::endl;
3110
3111
3112 // Handle planar and interleaved right here
3113 switch (params.fft_outputLayout)
3114 {
3115 case CLFFT_COMPLEX_INTERLEAVED:
3116 if (params.fft_hasPostCallback)
3117 {
3118 clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(outputA, ((idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx), post_userdata, yx_s[index]";
3119 if (params.fft_postCallback.localMemSize > 0)
3120 {
3121 clKernWrite(transKernel, 0) << ", localmem";
3122 }
3123 clKernWrite(transKernel, 0) << ");" << std::endl;
3124
3125 clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(outputA, ((lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx+ starting_index_yx), post_userdata, xy_s[index]";
3126 if (params.fft_postCallback.localMemSize > 0)
3127 {
3128 clKernWrite(transKernel, 0) << ", localmem";
3129 }
3130 clKernWrite(transKernel, 0) << ");" << std::endl;
3131 }
3132 else
3133 {
3134 clKernWrite(transKernel, 6) << "outputA[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index];" << std::endl;
3135 clKernWrite(transKernel, 6) << "outputA[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx+ starting_index_yx] = xy_s[index];" << std::endl;
3136 }
3137
3138 break;
3139 case CLFFT_COMPLEX_PLANAR:
3140 if (params.fft_hasPostCallback)
3141 {
3142 clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(outputA_R, outputA_I, ((idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx), post_userdata, yx_s[index].x, yx_s[index].y";
3143 if (params.fft_postCallback.localMemSize > 0)
3144 {
3145 clKernWrite(transKernel, 0) << ", localmem";
3146 }
3147 clKernWrite(transKernel, 0) << ");" << std::endl;
3148
3149 clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(outputA_R, outputA_I, ((lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx+ starting_index_yx), post_userdata, xy_s[index].x, xy_s[index].y";
3150 if (params.fft_postCallback.localMemSize > 0)
3151 {
3152 clKernWrite(transKernel, 0) << ", localmem";
3153 }
3154 clKernWrite(transKernel, 0) << ");" << std::endl;
3155 }
3156 else
3157 {
3158 clKernWrite(transKernel, 6) << "outputA_R[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index].x;" << std::endl;
3159 clKernWrite(transKernel, 6) << "outputA_I[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index].y;" << std::endl;
3160
3161 clKernWrite(transKernel, 6) << "outputA_R[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx+ starting_index_yx] = xy_s[index].x;" << std::endl;
3162 clKernWrite(transKernel, 6) << "outputA_I[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx+ starting_index_yx] = xy_s[index].y;" << std::endl;
3163 }
3164 break;
3165 case CLFFT_HERMITIAN_INTERLEAVED:
3166 case CLFFT_HERMITIAN_PLANAR:
3167 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
3168 case CLFFT_REAL:
3169 break;
3170 default:
3171 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
3172 }
3173
3174
3175
3176 clKernWrite(transKernel, 3) << "}" << std::endl;
3177
3178 }
3179 else {
3180
3181 clKernWrite(transKernel, 3) << "size_t index;" << std::endl;
3182 clKernWrite(transKernel, 3) << "if (" << smaller_dim << " - (t_gx_p + 1) *" << 16 * reShapeFactor << ">0){" << std::endl;
3183 clKernWrite(transKernel, 6) << "for (size_t loop = 0; loop<" << reShapeFactor*reShapeFactor << "; ++loop){" << std::endl;
3184 clKernWrite(transKernel, 9) << "index = lidy*" << 16 * reShapeFactor << " + lidx + loop*256;" << std::endl;
3185
3186 // Handle planar and interleaved right here
3187 switch (params.fft_inputLayout)
3188 {
3189 case CLFFT_COMPLEX_INTERLEAVED:
3190 case CLFFT_REAL:
3191 if (params.fft_hasPreCallback)
3192 {
3193 if (params.fft_preCallback.localMemSize > 0)
3194 {
3195 clKernWrite(transKernel, 9) << "tmpm = " << params.fft_preCallback.funcname << "(inputA, iOffset + (idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata, localmem);" << std::endl;
3196 clKernWrite(transKernel, 9) << "tmpt = " << params.fft_preCallback.funcname << "(inputA, iOffset + (lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata, localmem);" << std::endl;
3197 }
3198 else
3199 {
3200 clKernWrite(transKernel, 9) << "tmpm = " << params.fft_preCallback.funcname << "(inputA, iOffset + (idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata);" << std::endl;
3201 clKernWrite(transKernel, 9) << "tmpt = " << params.fft_preCallback.funcname << "(inputA, iOffset + (lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata);" << std::endl;
3202 }
3203 }
3204 else
3205 {
3206 clKernWrite(transKernel, 9) << "tmpm = inputA[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx];" << std::endl;
3207 clKernWrite(transKernel, 9) << "tmpt = inputA[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx];" << std::endl;
3208 }
3209 break;
3210 case CLFFT_COMPLEX_PLANAR:
3211 dtInput = dtPlanar;
3212 dtOutput = dtPlanar;
3213 if (params.fft_hasPreCallback)
3214 {
3215 if (params.fft_preCallback.localMemSize > 0)
3216 {
3217 clKernWrite(transKernel, 9) << "tmpm = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata, localmem);" << std::endl;
3218 clKernWrite(transKernel, 9) << "tmpt = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata, localmem);" << std::endl;
3219 }
3220 else
3221 {
3222 clKernWrite(transKernel, 9) << "tmpm = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata);" << std::endl;
3223 clKernWrite(transKernel, 9) << "tmpt = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata);" << std::endl;
3224 }
3225 }
3226 else
3227 {
3228 clKernWrite(transKernel, 9) << "tmpm.x = inputA_R[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx];" << std::endl;
3229 clKernWrite(transKernel, 9) << "tmpm.y = inputA_I[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx];" << std::endl;
3230
3231 clKernWrite(transKernel, 9) << "tmpt.x = inputA_R[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx];" << std::endl;
3232 clKernWrite(transKernel, 9) << "tmpt.y = inputA_I[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx];" << std::endl;
3233 }
3234 break;
3235 case CLFFT_HERMITIAN_INTERLEAVED:
3236 case CLFFT_HERMITIAN_PLANAR:
3237 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
3238 default:
3239 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
3240 }
3241
3242 // If requested, generate the Twiddle math to multiply constant values
3243 if (params.fft_3StepTwiddle)
3244 genTwiddleMathLeadingDimensionBatched(params, transKernel, dtComplex, fwd);
3245
3246 clKernWrite(transKernel, 9) << "xy_s[index] = tmpm;" << std::endl;
3247 clKernWrite(transKernel, 9) << "yx_s[index] = tmpt;" << std::endl;
3248 clKernWrite(transKernel, 6) << "}" << std::endl;
3249 clKernWrite(transKernel, 3) << "}" << std::endl;
3250
3251 clKernWrite(transKernel, 3) << "else{" << std::endl;
3252 clKernWrite(transKernel, 6) << "for (size_t loop = 0; loop<" << reShapeFactor*reShapeFactor << "; ++loop){" << std::endl;
3253 clKernWrite(transKernel, 9) << "index = lidy*" << 16 * reShapeFactor << " + lidx + loop*256;" << std::endl;
3254
3255
3256 // Handle planar and interleaved right here
3257 switch (params.fft_inputLayout)
3258 {
3259 case CLFFT_COMPLEX_INTERLEAVED:
3260 case CLFFT_REAL:
3261 clKernWrite(transKernel, 9) << "if ((idy + loop*" << 16 / reShapeFactor << ")<" << smaller_dim << "&& idx<" << smaller_dim << ")" << std::endl;
3262 if (params.fft_hasPreCallback)
3263 {
3264 if (params.fft_preCallback.localMemSize > 0)
3265 {
3266 clKernWrite(transKernel, 12) << "tmpm = " << params.fft_preCallback.funcname << "(inputA, iOffset + (idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata, localmem);" << std::endl;
3267 clKernWrite(transKernel, 9) << "if ((t_gy_p *" << 16 * reShapeFactor << " + lidx)<" << smaller_dim << " && (t_gx_p * " << 16 * reShapeFactor << " + lidy + loop*" << 16 / reShapeFactor << ")<" << smaller_dim << ") " << std::endl;
3268 clKernWrite(transKernel, 12) << "tmpt = " << params.fft_preCallback.funcname << "(inputA, iOffset + (lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata, localmem);" << std::endl;
3269 }
3270 else
3271 {
3272 clKernWrite(transKernel, 12) << "tmpm = " << params.fft_preCallback.funcname << "(inputA, iOffset + (idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata);" << std::endl;
3273 clKernWrite(transKernel, 9) << "if ((t_gy_p *" << 16 * reShapeFactor << " + lidx)<" << smaller_dim << " && (t_gx_p * " << 16 * reShapeFactor << " + lidy + loop*" << 16 / reShapeFactor << ")<" << smaller_dim << ") " << std::endl;
3274 clKernWrite(transKernel, 12) << "tmpt = " << params.fft_preCallback.funcname << "(inputA, iOffset + (lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata);" << std::endl;
3275 }
3276 }
3277 else
3278 {
3279 clKernWrite(transKernel, 12) << "tmpm = inputA[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx];" << std::endl;
3280 clKernWrite(transKernel, 9) << "if ((t_gy_p *" << 16 * reShapeFactor << " + lidx)<" << smaller_dim << " && (t_gx_p * " << 16 * reShapeFactor << " + lidy + loop*" << 16 / reShapeFactor << ")<" << smaller_dim << ") " << std::endl;
3281 clKernWrite(transKernel, 12) << "tmpt = inputA[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx];" << std::endl;
3282 }
3283 break;
3284 case CLFFT_COMPLEX_PLANAR:
3285 dtInput = dtPlanar;
3286 dtOutput = dtPlanar;
3287 clKernWrite(transKernel, 9) << "if ((idy + loop*" << 16 / reShapeFactor << ")<" << smaller_dim << "&& idx<" << smaller_dim << ") {" << std::endl;
3288 if (params.fft_hasPreCallback)
3289 {
3290 if (params.fft_preCallback.localMemSize > 0)
3291 {
3292 clKernWrite(transKernel, 12) << "tmpm = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata, localmem); }" << std::endl;
3293 clKernWrite(transKernel, 9) << "if ((t_gy_p *" << 16 * reShapeFactor << " + lidx)<" << smaller_dim << " && (t_gx_p * " << 16 * reShapeFactor << " + lidy + loop*" << 16 / reShapeFactor << ")<" << smaller_dim << ") {" << std::endl;
3294 clKernWrite(transKernel, 12) << "tmpt = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata, localmem); }" << std::endl;
3295 }
3296 else
3297 {
3298 clKernWrite(transKernel, 12) << "tmpm = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata); }" << std::endl;
3299 clKernWrite(transKernel, 9) << "if ((t_gy_p *" << 16 * reShapeFactor << " + lidx)<" << smaller_dim << " && (t_gx_p * " << 16 * reShapeFactor << " + lidy + loop*" << 16 / reShapeFactor << ")<" << smaller_dim << ") {" << std::endl;
3300 clKernWrite(transKernel, 12) << "tmpt = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata); }" << std::endl;
3301 }
3302 }
3303 else
3304 {
3305 clKernWrite(transKernel, 12) << "tmpm.x = inputA_R[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx];" << std::endl;
3306 clKernWrite(transKernel, 12) << "tmpm.y = inputA_I[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx]; }" << std::endl;
3307 clKernWrite(transKernel, 9) << "if ((t_gy_p *" << 16 * reShapeFactor << " + lidx)<" << smaller_dim << " && (t_gx_p * " << 16 * reShapeFactor << " + lidy + loop*" << 16 / reShapeFactor << ")<" << smaller_dim << ") {" << std::endl;
3308 clKernWrite(transKernel, 12) << "tmpt.x = inputA_R[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx];" << std::endl;
3309 clKernWrite(transKernel, 12) << "tmpt.y = inputA_I[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx]; }" << std::endl;
3310 }
3311 break;
3312 case CLFFT_HERMITIAN_INTERLEAVED:
3313 case CLFFT_HERMITIAN_PLANAR:
3314 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
3315 default:
3316 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
3317 }
3318
3319
3320 // If requested, generate the Twiddle math to multiply constant values
3321 if (params.fft_3StepTwiddle)
3322 genTwiddleMathLeadingDimensionBatched(params, transKernel, dtComplex, fwd);
3323
3324 clKernWrite(transKernel, 9) << "xy_s[index] = tmpm;" << std::endl;
3325 clKernWrite(transKernel, 9) << "yx_s[index] = tmpt;" << std::endl;
3326
3327 clKernWrite(transKernel, 9) << "}" << std::endl;
3328 clKernWrite(transKernel, 3) << "}" << std::endl;
3329
3330 clKernWrite(transKernel, 3) << "" << std::endl;
3331 clKernWrite(transKernel, 3) << "barrier(CLK_LOCAL_MEM_FENCE);" << std::endl;
3332 clKernWrite(transKernel, 3) << "" << std::endl;
3333
    // Step 2: write the transposed tiles from local (shared) memory back to global memory
3335
3336 clKernWrite(transKernel, 3) << "if (" << smaller_dim << " - (t_gx_p + 1) *" << 16 * reShapeFactor << ">0){" << std::endl;
3337 clKernWrite(transKernel, 6) << "for (size_t loop = 0; loop<" << reShapeFactor*reShapeFactor << "; ++loop){" << std::endl;
3338 clKernWrite(transKernel, 9) << "index = lidx*" << 16 * reShapeFactor << " + lidy + " << 16 / reShapeFactor << "*loop ;" << std::endl;
3339
3340 // Handle planar and interleaved right here
3341 switch (params.fft_outputLayout)
3342 {
3343 case CLFFT_COMPLEX_INTERLEAVED:
3344 if (params.fft_hasPostCallback)
3345 {
3346 clKernWrite(transKernel, 9) << params.fft_postCallback.funcname << "(outputA, ((idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx), post_userdata, yx_s[index]";
3347 if (params.fft_postCallback.localMemSize > 0)
3348 {
3349 clKernWrite(transKernel, 0) << ", localmem";
3350 }
3351 clKernWrite(transKernel, 0) << ");" << std::endl;
3352
3353 clKernWrite(transKernel, 9) << params.fft_postCallback.funcname << "(outputA, ((lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx), post_userdata, xy_s[index]";
3354 if (params.fft_postCallback.localMemSize > 0)
3355 {
3356 clKernWrite(transKernel, 0) << ", localmem";
3357 }
3358 clKernWrite(transKernel, 0) << ");" << std::endl;
3359 }
3360 else
3361 {
3362 clKernWrite(transKernel, 9) << "outputA[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index];" << std::endl;
3363 clKernWrite(transKernel, 9) << "outputA[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx] = xy_s[index]; " << std::endl;
3364 }
3365
3366 break;
3367 case CLFFT_COMPLEX_PLANAR:
3368 if (params.fft_hasPostCallback)
3369 {
3370 clKernWrite(transKernel, 9) << params.fft_postCallback.funcname << "(outputA_R, outputA_I, ((idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx), post_userdata, yx_s[index].x, yx_s[index].y";
3371 if (params.fft_postCallback.localMemSize > 0)
3372 {
3373 clKernWrite(transKernel, 0) << ", localmem";
3374 }
3375 clKernWrite(transKernel, 0) << ");" << std::endl;
3376
3377 clKernWrite(transKernel, 9) << params.fft_postCallback.funcname << "(outputA_R, outputA_I, ((lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx), post_userdata, xy_s[index].x, xy_s[index].y";
3378 if (params.fft_postCallback.localMemSize > 0)
3379 {
3380 clKernWrite(transKernel, 0) << ", localmem";
3381 }
3382 clKernWrite(transKernel, 0) << ");" << std::endl;
3383 }
3384 else
3385 {
3386 clKernWrite(transKernel, 9) << "outputA_R[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index].x;" << std::endl;
3387 clKernWrite(transKernel, 9) << "outputA_I[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index].y;" << std::endl;
3388 clKernWrite(transKernel, 9) << "outputA_R[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx] = xy_s[index].x; " << std::endl;
3389 clKernWrite(transKernel, 9) << "outputA_I[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx] = xy_s[index].y; " << std::endl;
3390 }
3391 break;
3392 case CLFFT_HERMITIAN_INTERLEAVED:
3393 case CLFFT_HERMITIAN_PLANAR:
3394 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
3395 case CLFFT_REAL:
3396 break;
3397 default:
3398 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
3399 }
3400
3401
3402 clKernWrite(transKernel, 6) << "}" << std::endl;
3403 clKernWrite(transKernel, 3) << "}" << std::endl;
3404
3405 clKernWrite(transKernel, 3) << "else{" << std::endl;
3406 clKernWrite(transKernel, 6) << "for (size_t loop = 0; loop<" << reShapeFactor*reShapeFactor << "; ++loop){" << std::endl;
3407
3408 clKernWrite(transKernel, 9) << "index = lidx*" << 16 * reShapeFactor << " + lidy + " << 16 / reShapeFactor << "*loop;" << std::endl;
3409
3410 // Handle planar and interleaved right here
3411 switch (params.fft_outputLayout)
3412 {
3413 case CLFFT_COMPLEX_INTERLEAVED:
3414 clKernWrite(transKernel, 9) << "if ((idy + loop*" << 16 / reShapeFactor << ")<" << smaller_dim << " && idx<" << smaller_dim << ")" << std::endl;
3415 if (params.fft_hasPostCallback)
3416 {
3417 clKernWrite(transKernel, 12) << params.fft_postCallback.funcname << "(outputA, ((idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx), post_userdata, yx_s[index]";
3418 if (params.fft_postCallback.localMemSize > 0)
3419 {
3420 clKernWrite(transKernel, 0) << ", localmem";
3421 }
3422 clKernWrite(transKernel, 0) << ");" << std::endl;
3423
3424 clKernWrite(transKernel, 9) << "if ((t_gy_p * " << 16 * reShapeFactor << " + lidx)<" << smaller_dim << " && (t_gx_p * " << 16 * reShapeFactor << " + lidy + loop*" << 16 / reShapeFactor << ")<" << smaller_dim << ")" << std::endl;
3425
3426 clKernWrite(transKernel, 12) << params.fft_postCallback.funcname << "(outputA, ((lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx), post_userdata, xy_s[index]";
3427 if (params.fft_postCallback.localMemSize > 0)
3428 {
3429 clKernWrite(transKernel, 0) << ", localmem";
3430 }
3431 clKernWrite(transKernel, 0) << ");" << std::endl;
3432 }
3433 else
3434 {
3435 clKernWrite(transKernel, 12) << "outputA[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index]; " << std::endl;
3436 clKernWrite(transKernel, 9) << "if ((t_gy_p * " << 16 * reShapeFactor << " + lidx)<" << smaller_dim << " && (t_gx_p * " << 16 * reShapeFactor << " + lidy + loop*" << 16 / reShapeFactor << ")<" << smaller_dim << ")" << std::endl;
3437 clKernWrite(transKernel, 12) << "outputA[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx] = xy_s[index];" << std::endl;
3438 }
3439 break;
3440 case CLFFT_COMPLEX_PLANAR:
3441 clKernWrite(transKernel, 9) << "if ((idy + loop*" << 16 / reShapeFactor << ")<" << smaller_dim << " && idx<" << smaller_dim << ") {" << std::endl;
3442
3443 if (params.fft_hasPostCallback)
3444 {
3445 clKernWrite(transKernel, 12) << params.fft_postCallback.funcname << "(outputA_R, outputA_I, ((idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx), post_userdata, yx_s[index].x, yx_s[index].y";
3446 if (params.fft_postCallback.localMemSize > 0)
3447 {
3448 clKernWrite(transKernel, 0) << ", localmem";
3449 }
3450 clKernWrite(transKernel, 0) << "); }" << std::endl;
3451
3452 clKernWrite(transKernel, 9) << "if ((t_gy_p * " << 16 * reShapeFactor << " + lidx)<" << smaller_dim << " && (t_gx_p * " << 16 * reShapeFactor << " + lidy + loop*" << 16 / reShapeFactor << ")<" << smaller_dim << ") {" << std::endl;
3453
3454 clKernWrite(transKernel, 12) << params.fft_postCallback.funcname << "(outputA_R, outputA_I, ((lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx), post_userdata, xy_s[index].x, xy_s[index].y";
3455 if (params.fft_postCallback.localMemSize > 0)
3456 {
3457 clKernWrite(transKernel, 0) << ", localmem";
3458 }
3459 clKernWrite(transKernel, 0) << "); }" << std::endl;
3460 }
3461 else
3462 {
3463 clKernWrite(transKernel, 12) << "outputA_R[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index].x; " << std::endl;
3464 clKernWrite(transKernel, 12) << "outputA_I[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index].y; }" << std::endl;
3465 clKernWrite(transKernel, 9) << "if ((t_gy_p * " << 16 * reShapeFactor << " + lidx)<" << smaller_dim << " && (t_gx_p * " << 16 * reShapeFactor << " + lidy + loop*" << 16 / reShapeFactor << ")<" << smaller_dim << ") {" << std::endl;
3466 clKernWrite(transKernel, 12) << "outputA_R[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx] = xy_s[index].x;" << std::endl;
3467 clKernWrite(transKernel, 12) << "outputA_I[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx] = xy_s[index].y; }" << std::endl;
3468 }
3469
3470 break;
3471 case CLFFT_HERMITIAN_INTERLEAVED:
3472 case CLFFT_HERMITIAN_PLANAR:
3473 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
3474 case CLFFT_REAL:
3475 break;
3476 default:
3477 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
3478 }
3479
3480
3481 clKernWrite(transKernel, 6) << "}" << std::endl; // end for
3482 clKernWrite(transKernel, 3) << "}" << std::endl; // end else
3483
3484 }
3485 clKernWrite(transKernel, 0) << "}" << std::endl;
3486
3487 strKernel = transKernel.str();
3488
3489 if (!params.fft_3StepTwiddle)
3490 break;
3491 }
3492
3493 return CLFFT_SUCCESS;
3494 }
3495
3496 }// end of namespace clfft_transpose_generator
3497
3498