1 /* ************************************************************************
2 * Copyright 2016 Advanced Micro Devices, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 * ************************************************************************/
16 
17 
18 /*
19 This file contains the implementation of inplace transpose kernel string generation.
20 This includes both square and non square, twiddle and non twiddle, as well as the kernels
21 that swap lines following permutation algorithm.
22 */
23 #include <vector>
24 #include "generator.transpose.h"
25 
26 namespace clfft_transpose_generator
27 {
28 // generating string for calculating offset within sqaure transpose kernels (genTransposeKernelBatched)
OffsetCalc(std::stringstream & transKernel,const FFTKernelGenKeyParams & params,bool input)29 void OffsetCalc(std::stringstream& transKernel, const FFTKernelGenKeyParams& params, bool input)
30 {
31 	const size_t *stride = input ? params.fft_inStride : params.fft_outStride;
32 	std::string offset = input ? "iOffset" : "oOffset";
33 
34 
35 	clKernWrite(transKernel, 3) << "size_t " << offset << " = 0;" << std::endl;
36 	clKernWrite(transKernel, 3) << "g_index = get_group_id(0);" << std::endl;
37 
38 	for (size_t i = params.fft_DataDim - 2; i > 0; i--)
39 	{
40 		clKernWrite(transKernel, 3) << offset << " += (g_index/numGroupsY_" << i << ")*" << stride[i + 1] << ";" << std::endl;
41 		clKernWrite(transKernel, 3) << "g_index = g_index % numGroupsY_" << i << ";" << std::endl;
42 	}
43 
44 	clKernWrite(transKernel, 3) << std::endl;
45 }
46 
47 // generating string for calculating offset within sqaure transpose kernels (genTransposeKernelLeadingDimensionBatched)
OffsetCalcLeadingDimensionBatched(std::stringstream & transKernel,const FFTKernelGenKeyParams & params)48 void OffsetCalcLeadingDimensionBatched(std::stringstream& transKernel, const FFTKernelGenKeyParams& params)
49 {
50 	const size_t *stride = params.fft_inStride;
51 	std::string offset = "iOffset";
52 
53 	clKernWrite(transKernel, 3) << "size_t " << offset << " = 0;" << std::endl;
54 	clKernWrite(transKernel, 3) << "g_index = get_group_id(0);" << std::endl;
55 
56 	for (size_t i = params.fft_DataDim - 2; i > 0; i--)
57 	{
58 		clKernWrite(transKernel, 3) << offset << " += (g_index/numGroupsY_" << i << ")*" << stride[i + 1] << ";" << std::endl;
59 		clKernWrite(transKernel, 3) << "g_index = g_index % numGroupsY_" << i << ";" << std::endl;
60 	}
61 
62 	clKernWrite(transKernel, 3) << std::endl;
63 }
64 
65 // generating string for calculating offset within swap kernels (genSwapKernel)
Swap_OffsetCalc(std::stringstream & transKernel,const FFTKernelGenKeyParams & params)66 void Swap_OffsetCalc(std::stringstream& transKernel, const FFTKernelGenKeyParams& params)
67 {
68 	const size_t *stride = params.fft_inStride;
69 	std::string offset = "iOffset";
70 
71 	clKernWrite(transKernel, 3) << "size_t " << offset << " = 0;" << std::endl;
72 
73 	for (size_t i = params.fft_DataDim - 2; i > 0; i--)
74 	{
75 		clKernWrite(transKernel, 3) << offset << " += (g_index/numGroupsY_" << i << ")*" << stride[i + 1] << ";" << std::endl;
76 		clKernWrite(transKernel, 3) << "g_index = g_index % numGroupsY_" << i << ";" << std::endl;
77 	}
78 
79 	clKernWrite(transKernel, 3) << std::endl;
80 }
81 
82 // Small snippet of code that multiplies the twiddle factors into the butterfiles.  It is only emitted if the plan tells
83 // the generator that it wants the twiddle factors generated inside of the transpose
genTwiddleMath(const FFTKernelGenKeyParams & params,std::stringstream & transKernel,const std::string & dtComplex,bool fwd)84 clfftStatus genTwiddleMath(const FFTKernelGenKeyParams& params, std::stringstream& transKernel, const std::string& dtComplex, bool fwd)
85 {
86 
87 	clKernWrite(transKernel, 9) << std::endl;
88 
89 	clKernWrite(transKernel, 9) << dtComplex << " Wm = TW3step( (t_gx_p*32 + lidx) * (t_gy_p*32 + lidy + loop*8) );" << std::endl;
90 	clKernWrite(transKernel, 9) << dtComplex << " Wt = TW3step( (t_gy_p*32 + lidx) * (t_gx_p*32 + lidy + loop*8) );" << std::endl;
91 
92 	clKernWrite(transKernel, 9) << dtComplex << " Tm, Tt;" << std::endl;
93 
94 	if (fwd)
95 	{
96 		clKernWrite(transKernel, 9) << "Tm.x = ( Wm.x * tmpm.x ) - ( Wm.y * tmpm.y );" << std::endl;
97 		clKernWrite(transKernel, 9) << "Tm.y = ( Wm.y * tmpm.x ) + ( Wm.x * tmpm.y );" << std::endl;
98 		clKernWrite(transKernel, 9) << "Tt.x = ( Wt.x * tmpt.x ) - ( Wt.y * tmpt.y );" << std::endl;
99 		clKernWrite(transKernel, 9) << "Tt.y = ( Wt.y * tmpt.x ) + ( Wt.x * tmpt.y );" << std::endl;
100 	}
101 	else
102 	{
103 		clKernWrite(transKernel, 9) << "Tm.x =  ( Wm.x * tmpm.x ) + ( Wm.y * tmpm.y );" << std::endl;
104 		clKernWrite(transKernel, 9) << "Tm.y = -( Wm.y * tmpm.x ) + ( Wm.x * tmpm.y );" << std::endl;
105 		clKernWrite(transKernel, 9) << "Tt.x =  ( Wt.x * tmpt.x ) + ( Wt.y * tmpt.y );" << std::endl;
106 		clKernWrite(transKernel, 9) << "Tt.y = -( Wt.y * tmpt.x ) + ( Wt.x * tmpt.y );" << std::endl;
107 	}
108 
109 	clKernWrite(transKernel, 9) << "tmpm.x = Tm.x;" << std::endl;
110 	clKernWrite(transKernel, 9) << "tmpm.y = Tm.y;" << std::endl;
111 	clKernWrite(transKernel, 9) << "tmpt.x = Tt.x;" << std::endl;
112 	clKernWrite(transKernel, 9) << "tmpt.y = Tt.y;" << std::endl;
113 
114 	clKernWrite(transKernel, 9) << std::endl;
115 
116 	return CLFFT_SUCCESS;
117 }
118 
119 // Small snippet of code that multiplies the twiddle factors into the butterfiles.  It is only emitted if the plan tells
120 // the generator that it wants the twiddle factors generated inside of the transpose
genTwiddleMathLeadingDimensionBatched(const FFTKernelGenKeyParams & params,std::stringstream & transKernel,const std::string & dtComplex,bool fwd)121 clfftStatus genTwiddleMathLeadingDimensionBatched(const FFTKernelGenKeyParams& params, std::stringstream& transKernel, const std::string& dtComplex, bool fwd)
122 {
123 
124 	clKernWrite(transKernel, 9) << std::endl;
125 	if (params.fft_N[0] > params.fft_N[1])
126 	{
127 		clKernWrite(transKernel, 9) << dtComplex << " Wm = TW3step( (" << params.fft_N[1] << " * square_matrix_index + t_gx_p*32 + lidx) * (t_gy_p*32 + lidy + loop*8) );" << std::endl;
128 		clKernWrite(transKernel, 9) << dtComplex << " Wt = TW3step( (" << params.fft_N[1] << " * square_matrix_index + t_gy_p*32 + lidx) * (t_gx_p*32 + lidy + loop*8) );" << std::endl;
129 	}
130 	else
131 	{
132 		clKernWrite(transKernel, 9) << dtComplex << " Wm = TW3step( (t_gx_p*32 + lidx) * (" << params.fft_N[0] << " * square_matrix_index + t_gy_p*32 + lidy + loop*8) );" << std::endl;
133 		clKernWrite(transKernel, 9) << dtComplex << " Wt = TW3step( (t_gy_p*32 + lidx) * (" << params.fft_N[0] << " * square_matrix_index + t_gx_p*32 + lidy + loop*8) );" << std::endl;
134 	}
135 	clKernWrite(transKernel, 9) << dtComplex << " Tm, Tt;" << std::endl;
136 
137 	if (fwd)
138 	{
139 		clKernWrite(transKernel, 9) << "Tm.x = ( Wm.x * tmpm.x ) - ( Wm.y * tmpm.y );" << std::endl;
140 		clKernWrite(transKernel, 9) << "Tm.y = ( Wm.y * tmpm.x ) + ( Wm.x * tmpm.y );" << std::endl;
141 		clKernWrite(transKernel, 9) << "Tt.x = ( Wt.x * tmpt.x ) - ( Wt.y * tmpt.y );" << std::endl;
142 		clKernWrite(transKernel, 9) << "Tt.y = ( Wt.y * tmpt.x ) + ( Wt.x * tmpt.y );" << std::endl;
143 	}
144 	else
145 	{
146 		clKernWrite(transKernel, 9) << "Tm.x =  ( Wm.x * tmpm.x ) + ( Wm.y * tmpm.y );" << std::endl;
147 		clKernWrite(transKernel, 9) << "Tm.y = -( Wm.y * tmpm.x ) + ( Wm.x * tmpm.y );" << std::endl;
148 		clKernWrite(transKernel, 9) << "Tt.x =  ( Wt.x * tmpt.x ) + ( Wt.y * tmpt.y );" << std::endl;
149 		clKernWrite(transKernel, 9) << "Tt.y = -( Wt.y * tmpt.x ) + ( Wt.x * tmpt.y );" << std::endl;
150 	}
151 
152 	clKernWrite(transKernel, 9) << "tmpm.x = Tm.x;" << std::endl;
153 	clKernWrite(transKernel, 9) << "tmpm.y = Tm.y;" << std::endl;
154 	clKernWrite(transKernel, 9) << "tmpt.x = Tt.x;" << std::endl;
155 	clKernWrite(transKernel, 9) << "tmpt.y = Tt.y;" << std::endl;
156 
157 	clKernWrite(transKernel, 9) << std::endl;
158 
159 	return CLFFT_SUCCESS;
160 }
161 
genTransposePrototype(const FFTGeneratedTransposeSquareAction::Signature & params,const size_t & lwSize,const std::string & dtPlanar,const std::string & dtComplex,const std::string & funcName,std::stringstream & transKernel,std::string & dtInput,std::string & dtOutput)162 clfftStatus genTransposePrototype(const FFTGeneratedTransposeSquareAction::Signature & params, const size_t& lwSize, const std::string& dtPlanar, const std::string& dtComplex,
163 	const std::string &funcName, std::stringstream& transKernel, std::string& dtInput, std::string& dtOutput)
164 {
165 
166 	// Declare and define the function
167 	clKernWrite(transKernel, 0) << "__attribute__(( reqd_work_group_size( " << lwSize << ", 1, 1 ) ))" << std::endl;
168 	clKernWrite(transKernel, 0) << "kernel void" << std::endl;
169 
170 	clKernWrite(transKernel, 0) << funcName << "( ";
171 
172 	switch (params.fft_inputLayout)
173 	{
174 	case CLFFT_COMPLEX_INTERLEAVED:
175 		dtInput = dtComplex;
176 		dtOutput = dtComplex;
177 		clKernWrite(transKernel, 0) << "global " << dtInput << "* restrict inputA";
178 		break;
179 	case CLFFT_COMPLEX_PLANAR:
180 		dtInput = dtPlanar;
181 		dtOutput = dtPlanar;
182 		clKernWrite(transKernel, 0) << "global " << dtInput << "* restrict inputA_R" << ", global " << dtInput << "* restrict inputA_I";
183 		break;
184 	case CLFFT_HERMITIAN_INTERLEAVED:
185 	case CLFFT_HERMITIAN_PLANAR:
186 		return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
187 	case CLFFT_REAL:
188 		dtInput = dtPlanar;
189 		dtOutput = dtPlanar;
190 
191 		clKernWrite(transKernel, 0) << "global " << dtInput << "* restrict inputA";
192 		break;
193 	default:
194 		return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
195 	}
196 
197 	if (params.fft_placeness == CLFFT_OUTOFPLACE)
198 		switch (params.fft_outputLayout)
199 		{
200 		case CLFFT_COMPLEX_INTERLEAVED:
201 			dtInput = dtComplex;
202 			dtOutput = dtComplex;
203 			clKernWrite(transKernel, 0) << ", global " << dtOutput << "* restrict outputA";
204 			break;
205 		case CLFFT_COMPLEX_PLANAR:
206 			dtInput = dtPlanar;
207 			dtOutput = dtPlanar;
208 			clKernWrite(transKernel, 0) << ", global " << dtOutput << "* restrict outputA_R" << ", global " << dtOutput << "* restrict outputA_I";
209 			break;
210 		case CLFFT_HERMITIAN_INTERLEAVED:
211 		case CLFFT_HERMITIAN_PLANAR:
212 			return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
213 		case CLFFT_REAL:
214 			dtInput = dtPlanar;
215 			dtOutput = dtPlanar;
216 			clKernWrite(transKernel, 0) << ", global " << dtOutput << "* restrict outputA";
217 			break;
218 		default:
219 			return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
220 		}
221 
222 	if (params.fft_hasPreCallback)
223 	{
224 		assert(!params.fft_hasPostCallback);
225 
226 		if (params.fft_preCallback.localMemSize > 0)
227 		{
228 			clKernWrite(transKernel, 0) << ", __global void* pre_userdata, __local void* localmem";
229 		}
230 		else
231 		{
232 			clKernWrite(transKernel, 0) << ", __global void* pre_userdata";
233 		}
234 	}
235 	if (params.fft_hasPostCallback)
236 	{
237 		assert(!params.fft_hasPreCallback);
238 
239 		if (params.fft_postCallback.localMemSize > 0)
240 		{
241 			clKernWrite(transKernel, 0) << ", __global void* post_userdata, __local void* localmem";
242 		}
243 		else
244 		{
245 			clKernWrite(transKernel, 0) << ", __global void* post_userdata";
246 		}
247 	}
248 
249 	// Close the method signature
250 	clKernWrite(transKernel, 0) << " )\n{" << std::endl;
251 	return CLFFT_SUCCESS;
252 }
253 
genTransposePrototypeLeadingDimensionBatched(const FFTGeneratedTransposeNonSquareAction::Signature & params,const size_t & lwSize,const std::string & dtPlanar,const std::string & dtComplex,const std::string & funcName,std::stringstream & transKernel,std::string & dtInput,std::string & dtOutput)254 clfftStatus genTransposePrototypeLeadingDimensionBatched(const FFTGeneratedTransposeNonSquareAction::Signature & params, const size_t& lwSize,
255                                                          const std::string& dtPlanar, const std::string& dtComplex,
256                                                          const std::string &funcName, std::stringstream& transKernel,
257                                                          std::string& dtInput, std::string& dtOutput)
258 {
259 
260 	// Declare and define the function
261 	clKernWrite(transKernel, 0) << "__attribute__(( reqd_work_group_size( " << lwSize << ", 1, 1 ) ))" << std::endl;
262 	clKernWrite(transKernel, 0) << "kernel void" << std::endl;
263 
264 	clKernWrite(transKernel, 0) << funcName << "( ";
265 
266 	switch (params.fft_inputLayout)
267 	{
268 	case CLFFT_COMPLEX_INTERLEAVED:
269 		dtInput = dtComplex;
270 		dtOutput = dtComplex;
271 		clKernWrite(transKernel, 0) << "global " << dtInput << "* restrict inputA";
272 		break;
273 	case CLFFT_COMPLEX_PLANAR:
274 		dtInput = dtPlanar;
275 		dtOutput = dtPlanar;
276 		clKernWrite(transKernel, 0) << "global " << dtInput << "* restrict inputA_R" << ", global " << dtInput << "* restrict inputA_I";
277 		break;
278 	case CLFFT_HERMITIAN_INTERLEAVED:
279 	case CLFFT_HERMITIAN_PLANAR:
280 		return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
281 	case CLFFT_REAL:
282 		dtInput = dtPlanar;
283 		dtOutput = dtPlanar;
284 
285 		clKernWrite(transKernel, 0) << "global " << dtInput << "* restrict inputA";
286 		break;
287 	default:
288 		return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
289 	}
290 
291 	if (params.fft_hasPreCallback)
292 	{
293 		assert(!params.fft_hasPostCallback);
294 		if (params.fft_preCallback.localMemSize > 0)
295 		{
296 			clKernWrite(transKernel, 0) << ", __global void* pre_userdata, __local void* localmem";
297 		}
298 		else
299 		{
300 			clKernWrite(transKernel, 0) << ", __global void* pre_userdata";
301 		}
302 	}
303 	if (params.fft_hasPostCallback)
304 	{
305 		assert(!params.fft_hasPreCallback);
306 
307 		if (params.fft_postCallback.localMemSize > 0)
308 		{
309 			clKernWrite(transKernel, 0) << ", __global void* post_userdata, __local void* localmem";
310 		}
311 		else
312 		{
313 			clKernWrite(transKernel, 0) << ", __global void* post_userdata";
314 		}
315 	}
316 
317 
318 	// Close the method signature
319 	clKernWrite(transKernel, 0) << " )\n{" << std::endl;
320 	return CLFFT_SUCCESS;
321 }
322 
323 /* -> get_cycles function gets the swapping logic required for given row x col matrix.
324 -> cycle_map[0] holds the total number of cycles required.
325 -> cycles start and end with the same index, hence we can identify individual cycles,
326 though we tend to store the cycle index contiguously*/
get_cycles(size_t * cycle_map,size_t num_reduced_row,size_t num_reduced_col)327 void get_cycles(size_t *cycle_map, size_t num_reduced_row, size_t num_reduced_col)
328 {
329 	int *is_swapped = new int[num_reduced_row * num_reduced_col];
330 	int i, map_index = 1, num_cycles = 0;
331 	size_t swap_id;
332 	/*initialize swap map*/
333 	is_swapped[0] = 1;
334 	is_swapped[num_reduced_row * num_reduced_col - 1] = 1;
335 	for (i = 1; i < (num_reduced_row * num_reduced_col - 1); i++)
336 	{
337 		is_swapped[i] = 0;
338 	}
339 
340 	for (i = 1; i < (num_reduced_row * num_reduced_col - 1); i++)
341 	{
342 		swap_id = i;
343 		while (!is_swapped[swap_id])
344 		{
345 			is_swapped[swap_id] = 1;
346 			cycle_map[map_index++] = swap_id;
347 			swap_id = (num_reduced_row * swap_id) % (num_reduced_row * num_reduced_col - 1);
348 			if (swap_id == i)
349 			{
350 				cycle_map[map_index++] = swap_id;
351 				num_cycles++;
352 			}
353 		}
354 	}
355 	cycle_map[0] = num_cycles;
356 	delete[] is_swapped;
357 }
358 
359 /*
360 calculate the permutation cycles consumed in swap kernels.
361 each cycle is strored in a vecotor. hopfully there are mutliple independent vectors thus we use a vector of vecotor
362 */
permutation_calculation(size_t m,size_t n,std::vector<std::vector<size_t>> & permutationVec)363 void permutation_calculation(size_t m, size_t n, std::vector<std::vector<size_t> > &permutationVec)
364 {
365 	/*
366 	calculate inplace transpose permutation lists
367 	reference:
368 	https://en.wikipedia.org/wiki/In-place_matrix_transposition
369 	and
370 	http://www.netlib.org/utk/people/JackDongarra/CCDSC-2014/talk35.pdf
371 	row major matrix of size n x m
372 	p(k) = (k*n)mod(m*n-1), if 0 < k < m*n-1
373 	when k = 0 or m*n-1, it does not require movement
374 	*/
375 	if (m < 1 || n < 1)
376 		return;
377 
378 	size_t mn_minus_one = m*n - 1;
379 	//maintain a table so check is faster
380 	size_t *table = new size_t[mn_minus_one + 1]();//init to zeros
381 	table[0] = 1;
382 
383 	for (size_t i = 1; i < mn_minus_one; i++)
384 	{
385 		//first check if i is already stored in somewhere in vector of vectors
386 		bool already_checked = false;
387 		if (table[i] >= 1)
388 			already_checked = true;
389 		if (already_checked == true)
390 			continue;
391 
392 		//if not checked yet
393 		std::vector<size_t> vec;
394 		vec.push_back(i);
395 		table[i] += 1;
396 		size_t temp = i;
397 
398 		while (1)
399 		{
400 			temp = (temp*n);
401 			temp = temp % (mn_minus_one);
402 			if (find(vec.begin(), vec.end(), temp) != vec.end())
403 			{
404 				//what goes around comes around and it should
405 				break;
406 			}
407 			if (table[temp] >= 1)
408 			{
409 				already_checked = true;
410 				break;
411 			}
412 			vec.push_back(temp);
413 			table[temp] += 1;
414 		}
415 		if (already_checked == true)
416 			continue;
417 		permutationVec.push_back(vec);
418 	}
419 	delete[] table;
420 }
421 //swap lines. This kind of kernels are using with combination of square transpose kernels to perform nonsqaure transpose
422 //this function assumes a 1:2 ratio
genSwapKernel(const FFTGeneratedTransposeNonSquareAction::Signature & params,std::string & strKernel,std::string & KernelFuncName,const size_t & lwSize,const size_t reShapeFactor)423 clfftStatus genSwapKernel(const FFTGeneratedTransposeNonSquareAction::Signature & params, std::string& strKernel, std::string& KernelFuncName, const size_t& lwSize, const size_t reShapeFactor)
424 {
425 	strKernel.reserve(4096);
426 	std::stringstream transKernel(std::stringstream::out);
427 
428 	// These strings represent the various data types we read or write in the kernel, depending on how the plan
429 	// is configured
430 	std::string dtInput;        // The type read as input into kernel
431 	std::string dtOutput;       // The type written as output from kernel
432 	std::string dtPlanar;       // Fundamental type for planar arrays
433 	std::string tmpBuffType;
434 	std::string dtComplex;      // Fundamental type for complex arrays
435 
436 								// NOTE:  Enable only for debug
437 								// clKernWrite( transKernel, 0 ) << "#pragma OPENCL EXTENSION cl_amd_printf : enable\n" << std::endl;
438 
439 								//if (params.fft_inputLayout != params.fft_outputLayout)
440 								//	return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
441 
442 	switch (params.fft_precision)
443 	{
444 	case CLFFT_SINGLE:
445 	case CLFFT_SINGLE_FAST:
446 		dtPlanar = "float";
447 		dtComplex = "float2";
448 		break;
449 	case CLFFT_DOUBLE:
450 	case CLFFT_DOUBLE_FAST:
451 		dtPlanar = "double";
452 		dtComplex = "double2";
453 
454 		// Emit code that enables double precision in the kernel
455 		clKernWrite(transKernel, 0) << "#ifdef cl_khr_fp64" << std::endl;
456 		clKernWrite(transKernel, 3) << "#pragma OPENCL EXTENSION cl_khr_fp64 : enable" << std::endl;
457 		clKernWrite(transKernel, 0) << "#else" << std::endl;
458 		clKernWrite(transKernel, 3) << "#pragma OPENCL EXTENSION cl_amd_fp64 : enable" << std::endl;
459 		clKernWrite(transKernel, 0) << "#endif\n" << std::endl;
460 
461 		break;
462 	default:
463 		return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
464 		break;
465 	}
466 
467 	// This detects whether the input matrix is rectangle of ratio 1:2
468 
469 	if ((params.fft_N[0] != 2 * params.fft_N[1]) && (params.fft_N[1] != 2 * params.fft_N[0]))
470 	{
471 		return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
472 	}
473 
474 	if (params.fft_placeness == CLFFT_OUTOFPLACE)
475 	{
476 		return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
477 	}
478 
479 	size_t smaller_dim = (params.fft_N[0] < params.fft_N[1]) ? params.fft_N[0] : params.fft_N[1];
480 
481 	size_t input_elm_size_in_bytes;
482 	switch (params.fft_precision)
483 	{
484 	case CLFFT_SINGLE:
485 	case CLFFT_SINGLE_FAST:
486 		input_elm_size_in_bytes = 4;
487 		break;
488 	case CLFFT_DOUBLE:
489 	case CLFFT_DOUBLE_FAST:
490 		input_elm_size_in_bytes = 8;
491 		break;
492 	default:
493 		return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
494 	}
495 
496 	switch (params.fft_outputLayout)
497 	{
498 	case CLFFT_COMPLEX_INTERLEAVED:
499 	case CLFFT_COMPLEX_PLANAR:
500 		input_elm_size_in_bytes *= 2;
501 		break;
502 	case CLFFT_REAL:
503 		break;
504 	default:
505 		return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
506 	}
507 	size_t max_elements_loaded = AVAIL_MEM_SIZE / input_elm_size_in_bytes;
508 	size_t num_elements_loaded;
509 	size_t local_work_size_swap, num_grps_pro_row;
510 
511 	tmpBuffType = "__local";
512 	if ((max_elements_loaded >> 1) > smaller_dim)
513 	{
514 		local_work_size_swap = (smaller_dim < 256) ? smaller_dim : 256;
515 		num_elements_loaded = smaller_dim;
516 		num_grps_pro_row = 1;
517 	}
518 	else
519 	{
520 		num_grps_pro_row = (smaller_dim << 1) / max_elements_loaded;
521 		num_elements_loaded = max_elements_loaded >> 1;
522 		local_work_size_swap = (num_elements_loaded < 256) ? num_elements_loaded : 256;
523 	}
524 
525 	//If post-callback is set for the plan
526 	if (params.fft_hasPostCallback)
527 	{
528 		//Requested local memory size by callback must not exceed the device LDS limits after factoring the LDS size required by swap kernel
529 		if (params.fft_postCallback.localMemSize > 0)
530 		{
531 			bool validLDSSize = false;
532 
533 			validLDSSize = ((2 * input_elm_size_in_bytes * (num_elements_loaded * 2)) + params.fft_postCallback.localMemSize) < params.limit_LocalMemSize;
534 
535 			if (!validLDSSize)
536 			{
537 				fprintf(stderr, "Requested local memory size not available\n");
538 				return CLFFT_INVALID_ARG_VALUE;
539 			}
540 		}
541 
542 		//Insert callback function code at the beginning
543 		clKernWrite(transKernel, 0) << params.fft_postCallback.funcstring << std::endl;
544 		clKernWrite(transKernel, 0) << std::endl;
545 	}
546 	//If pre-callback is set for the plan
547 	if (params.fft_hasPreCallback)
548 	{
549 		//we have already checked available LDS for pre callback
550 		//Insert callback function code at the beginning
551 		clKernWrite(transKernel, 0) << params.fft_preCallback.funcstring << std::endl;
552 		clKernWrite(transKernel, 0) << std::endl;
553 	}
554 
555 	/*Generating the  swapping logic*/
556 	{
557 		size_t num_reduced_row;
558 		size_t num_reduced_col;
559 
560 		if (params.fft_N[1] == smaller_dim)
561 		{
562 			num_reduced_row = smaller_dim;
563 			num_reduced_col = 2;
564 		}
565 		else
566 		{
567 			num_reduced_row = 2;
568 			num_reduced_col = smaller_dim;
569 		}
570 
571 		std::string funcName;
572 
573 		clKernWrite(transKernel, 0) << std::endl;
574 
575 		size_t *cycle_map = new size_t[num_reduced_row * num_reduced_col * 2];
576 		/* The memory required by cycle_map cannot exceed 2 times row*col by design*/
577 
578 		get_cycles(cycle_map, num_reduced_row, num_reduced_col);
579 
580 		size_t *cycle_stat = new size_t[cycle_map[0] * 2], stat_idx = 0;
581 		clKernWrite(transKernel, 0) << std::endl;
582 
583 		clKernWrite(transKernel, 0) << "__constant size_t swap_table[][3] = {" << std::endl;
584 
585 		size_t inx = 0, start_inx, swap_inx = 0, num_swaps = 0;
586 		for (size_t i = 0; i < cycle_map[0]; i++)
587 		{
588 			start_inx = cycle_map[++inx];
589 			clKernWrite(transKernel, 0) << "{  " << start_inx << ",  " << cycle_map[inx + 1] << ",  0}," << std::endl;
590 			cycle_stat[stat_idx++] = num_swaps;
591 			num_swaps++;
592 
593 			while (start_inx != cycle_map[++inx])
594 			{
595 				size_t action_var = (cycle_map[inx + 1] == start_inx) ? 2 : 1;
596 				clKernWrite(transKernel, 0) << "{  " << cycle_map[inx] << ",  " << cycle_map[inx + 1] << ",  " << action_var << "}," << std::endl;
597 				if (action_var == 2)
598 					cycle_stat[stat_idx++] = num_swaps;
599 				num_swaps++;
600 			}
601 		}
602 		/*Appending swap table for touching corner elements for post call back*/
603 		size_t last_datablk_idx = num_reduced_row * num_reduced_col - 1;
604 		clKernWrite(transKernel, 0) << "{  0,  0,  0}," << std::endl;
605 		clKernWrite(transKernel, 0) << "{  " << last_datablk_idx << ",  " << last_datablk_idx << ",  0}," << std::endl;
606 
607 		clKernWrite(transKernel, 0) << "};" << std::endl;
608 		/*cycle_map[0] + 2, + 2 is added for post callback table appending*/
609 		size_t num_cycles_minus_1 = cycle_map[0] - 1;
610 
611 		clKernWrite(transKernel, 0) << "__constant size_t cycle_stat[" << cycle_map[0] << "][2] = {" << std::endl;
612 		for (size_t i = 0; i < num_cycles_minus_1; i++)
613 		{
614 			clKernWrite(transKernel, 0) << "{  " << cycle_stat[i * 2] << ",  " << cycle_stat[i * 2 + 1] << "}," << std::endl;
615 		}
616 		clKernWrite(transKernel, 0) << "{  " << cycle_stat[num_cycles_minus_1 * 2] << ",  " << (cycle_stat[num_cycles_minus_1 * 2 + 1] + 2) << "}," << std::endl;
617 
618 		clKernWrite(transKernel, 0) << "};" << std::endl;
619 
620 		clKernWrite(transKernel, 0) << std::endl;
621 
622 		switch (params.fft_inputLayout)
623 		{
624 		case CLFFT_COMPLEX_INTERLEAVED:
625 			clKernWrite(transKernel, 0) << "void swap(global " << dtComplex << "* inputA, " << tmpBuffType << " " << dtComplex << "* Ls, " << tmpBuffType << " " << dtComplex << " * Ld, size_t is, size_t id, size_t pos, size_t end_indx, size_t work_id";
626 			break;
627 		case CLFFT_COMPLEX_PLANAR:
628 			clKernWrite(transKernel, 0) << "void swap(global " << dtPlanar << "* inputA_R, global " << dtPlanar << "* inputA_I, " << tmpBuffType << " " << dtComplex << "* Ls, " << tmpBuffType << " " << dtComplex << "* Ld, size_t is, size_t id, size_t pos, size_t end_indx, size_t work_id";
629 			break;
630 		case CLFFT_HERMITIAN_INTERLEAVED:
631 		case CLFFT_HERMITIAN_PLANAR:
632 			return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
633 		case CLFFT_REAL:
634 			clKernWrite(transKernel, 0) << "void swap(global " << dtPlanar << "* inputA, " << tmpBuffType << " " << dtPlanar << "* Ls, " << tmpBuffType << " " << dtPlanar << "* Ld, size_t is, size_t id, size_t pos, size_t end_indx, size_t work_id";
635 			break;
636 		default:
637 			return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
638 		}
639 
640 		if (params.fft_hasPostCallback)
641 		{
642 			clKernWrite(transKernel, 0) << ", size_t iOffset, __global void* post_userdata";
643 			if (params.fft_postCallback.localMemSize > 0)
644 			{
645 				clKernWrite(transKernel, 0) << ", __local void* localmem";
646 			}
647 		}
648 
649 		if (params.fft_hasPreCallback)
650 		{
651 			clKernWrite(transKernel, 0) << ", size_t iOffset, __global void* pre_userdata";
652 			if (params.fft_preCallback.localMemSize > 0)
653 			{
654 				clKernWrite(transKernel, 0) << ", __local void* localmem";
655 			}
656 		}
657 
658 		clKernWrite(transKernel, 0) << "){" << std::endl;
659 
660 		clKernWrite(transKernel, 3) << "for (size_t j = get_local_id(0); j < end_indx; j += " << local_work_size_swap << "){" << std::endl;
661 
662 		switch (params.fft_inputLayout)
663 		{
664 		case CLFFT_REAL:
665 		case CLFFT_COMPLEX_INTERLEAVED:
666 
667 			if (params.fft_hasPreCallback)
668 			{
669 				clKernWrite(transKernel, 6) << "if (pos == 0){" << std::endl;
670 
671 				clKernWrite(transKernel, 9) << "Ls[j] = " << params.fft_preCallback.funcname << "(inputA, ( is *" << smaller_dim << " + " << num_elements_loaded << " * work_id + j + iOffset), pre_userdata";
672 				//clKernWrite(transKernel, 9) << "Ls[j] = " << params.fft_preCallback.funcname << "(inputA + iOffset, ( is *" << smaller_dim << " + " << num_elements_loaded << " * work_id + j), pre_userdata";
673 				if (params.fft_preCallback.localMemSize > 0)
674 				{
675 					clKernWrite(transKernel, 0) << ", localmem";
676 				}
677 				clKernWrite(transKernel, 0) << ");" << std::endl;
678 
679 				clKernWrite(transKernel, 9) << "Ld[j] = " << params.fft_preCallback.funcname << "(inputA, ( id *" << smaller_dim << " + " << num_elements_loaded << " * work_id + j + iOffset), pre_userdata";
680 				//clKernWrite(transKernel, 9) << "Ld[j] = " << params.fft_preCallback.funcname << "(inputA + iOffset, ( id *" << smaller_dim << " + " << num_elements_loaded << " * work_id + j), pre_userdata";
681 				if (params.fft_preCallback.localMemSize > 0)
682 				{
683 					clKernWrite(transKernel, 0) << ", localmem";
684 				}
685 				clKernWrite(transKernel, 0) << ");" << std::endl;
686 				clKernWrite(transKernel, 6) << "}" << std::endl;
687 
688 				clKernWrite(transKernel, 6) << "else if (pos == 1){" << std::endl;
689 				clKernWrite(transKernel, 9) << "Ld[j] = " << params.fft_preCallback.funcname << "(inputA, ( id *" << smaller_dim << " + " << num_elements_loaded << " * work_id + j + iOffset), pre_userdata";
690 				//clKernWrite(transKernel, 9) << "Ld[j] = " << params.fft_preCallback.funcname << "(inputA + iOffset, ( id *" << smaller_dim << " + " << num_elements_loaded << " * work_id + j), pre_userdata";
691 				if (params.fft_preCallback.localMemSize > 0)
692 				{
693 					clKernWrite(transKernel, 0) << ", localmem";
694 				}
695 				clKernWrite(transKernel, 0) << ");" << std::endl;
696 				clKernWrite(transKernel, 6) << "}" << std::endl;
697 			}
698 			else
699 			{
700 				clKernWrite(transKernel, 6) << "if (pos == 0){" << std::endl;
701 				clKernWrite(transKernel, 9) << "Ls[j] = inputA[is *" << smaller_dim << " + " << num_elements_loaded << " * work_id + j];" << std::endl;
702 				clKernWrite(transKernel, 9) << "Ld[j] = inputA[id *" << smaller_dim << " + " << num_elements_loaded << " * work_id + j];" << std::endl;
703 				clKernWrite(transKernel, 6) << "}" << std::endl;
704 
705 				clKernWrite(transKernel, 6) << "else if (pos == 1){" << std::endl;
706 				clKernWrite(transKernel, 9) << "Ld[j] = inputA[id *" << smaller_dim << " + " << num_elements_loaded << " * work_id + j];" << std::endl;
707 				clKernWrite(transKernel, 6) << "}" << std::endl;
708 			}
709 
710 			if (params.fft_hasPostCallback)
711 			{
712 				clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(inputA, (iOffset + id*" << smaller_dim << " + " << num_elements_loaded << " * work_id + j), post_userdata, Ls[j]";
713 				if (params.fft_postCallback.localMemSize > 0)
714 				{
715 					clKernWrite(transKernel, 0) << ", localmem";
716 				}
717 				clKernWrite(transKernel, 0) << ");" << std::endl;
718 			}
719 			else if (params.fft_hasPreCallback)
720 		    {
721 				clKernWrite(transKernel, 6) << "inputA[id*" << smaller_dim << " + " << num_elements_loaded << " * work_id + j + iOffset] = Ls[j];" << std::endl;
722 			}
723 			else
724 			{
725 				clKernWrite(transKernel, 6) << "inputA[id*" << smaller_dim << " + " << num_elements_loaded << " * work_id + j] = Ls[j];" << std::endl;
726 			}
727 			break;
728 		case CLFFT_HERMITIAN_INTERLEAVED:
729 		case CLFFT_HERMITIAN_PLANAR:
730 			return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
731 		case CLFFT_COMPLEX_PLANAR:
732 			if (params.fft_hasPreCallback)
733 			{
734 				clKernWrite(transKernel, 6) << "if (pos == 0){" << std::endl;
735 				clKernWrite(transKernel, 9) << "Ls[j] = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, (is * " << smaller_dim << " + " << num_elements_loaded << " * work_id + j + iOffset), pre_userdata";
736 				if (params.fft_preCallback.localMemSize > 0)
737 				{
738 					clKernWrite(transKernel, 0) << ", localmem";
739 				}
740 				clKernWrite(transKernel, 0) << ");" << std::endl;
741 
742 				clKernWrite(transKernel, 9) << "Ld[j] = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, (id * " << smaller_dim << " + " << num_elements_loaded << " * work_id + j + iOffset), pre_userdata";
743 				if (params.fft_preCallback.localMemSize > 0)
744 				{
745 					clKernWrite(transKernel, 0) << ", localmem";
746 				}
747 				clKernWrite(transKernel, 0) << ");" << std::endl;
748 
749 				clKernWrite(transKernel, 6) << "}" << std::endl;
750 
751 				clKernWrite(transKernel, 6) << "else if (pos == 1){" << std::endl;
752 
753 				clKernWrite(transKernel, 9) << "Ld[j] = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, (id * " << smaller_dim << " + " << num_elements_loaded << " * work_id + j + iOffset), pre_userdata";
754 				if (params.fft_preCallback.localMemSize > 0)
755 				{
756 					clKernWrite(transKernel, 0) << ", localmem";
757 				}
758 				clKernWrite(transKernel, 0) << ");" << std::endl;
759 
760 				clKernWrite(transKernel, 6) << "}" << std::endl;
761 			}
762 			else
763 			{
764 				clKernWrite(transKernel, 6) << "if (pos == 0){" << std::endl;
765 				clKernWrite(transKernel, 9) << "Ls[j].x = inputA_R[is*" << smaller_dim << " + " << num_elements_loaded << " * work_id + j];" << std::endl;
766 				clKernWrite(transKernel, 9) << "Ls[j].y = inputA_I[is*" << smaller_dim << " + " << num_elements_loaded << " * work_id + j];" << std::endl;
767 				clKernWrite(transKernel, 9) << "Ld[j].x = inputA_R[id*" << smaller_dim << " + " << num_elements_loaded << " * work_id + j];" << std::endl;
768 				clKernWrite(transKernel, 9) << "Ld[j].y = inputA_I[id*" << smaller_dim << " + " << num_elements_loaded << " * work_id + j];" << std::endl;
769 				clKernWrite(transKernel, 6) << "}" << std::endl;
770 
771 				clKernWrite(transKernel, 6) << "else if (pos == 1){" << std::endl;
772 				clKernWrite(transKernel, 9) << "Ld[j].x = inputA_R[id*" << smaller_dim << " + " << num_elements_loaded << " * work_id + j];" << std::endl;
773 				clKernWrite(transKernel, 9) << "Ld[j].y = inputA_I[id*" << smaller_dim << " + " << num_elements_loaded << " * work_id + j];" << std::endl;
774 				clKernWrite(transKernel, 6) << "}" << std::endl;
775 			}
776 			if (params.fft_hasPostCallback)
777 			{
778 				clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(inputA_R, inputA_I, (iOffset + id*" << smaller_dim << " + " << num_elements_loaded << " * work_id + j), post_userdata, Ls[j].x, Ls[j].y";
779 				if (params.fft_postCallback.localMemSize > 0)
780 				{
781 					clKernWrite(transKernel, 0) << ", localmem";
782 				}
783 				clKernWrite(transKernel, 0) << ");" << std::endl;
784 			}
785 			else
786 			{
787 				clKernWrite(transKernel, 6) << "inputA_R[id*" << smaller_dim << " + " << num_elements_loaded << " * work_id + j] = Ls[j].x;" << std::endl;
788 				clKernWrite(transKernel, 6) << "inputA_I[id*" << smaller_dim << " + " << num_elements_loaded << " * work_id + j] = Ls[j].y;" << std::endl;
789 			}
790 			break;
791 		default:
792 			return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
793 		}
794 		clKernWrite(transKernel, 3) << "}" << std::endl;
795 
796 		clKernWrite(transKernel, 0) << "}" << std::endl << std::endl;
797 
798 		funcName = "swap_nonsquare";
799 		KernelFuncName = funcName;
800 		// Generate kernel API
801 
802 		/*when swap can be performed in LDS itself then, same prototype of transpose can be used for swap function too*/
803 		genTransposePrototypeLeadingDimensionBatched(params, local_work_size_swap, dtPlanar, dtComplex, funcName, transKernel, dtInput, dtOutput);
804 
805 		clKernWrite(transKernel, 3) << "size_t g_index = get_group_id(0);" << std::endl;
806 
807 		clKernWrite(transKernel, 3) << "const size_t numGroupsY_1 = " << cycle_map[0] * num_grps_pro_row << " ;" << std::endl;
808 		for (size_t i = 2; i < params.fft_DataDim - 1; i++)
809 		{
810 			clKernWrite(transKernel, 3) << "const size_t numGroupsY_" << i << " = numGroupsY_" << i - 1 << " * " << params.fft_N[i] << ";" << std::endl;
811 		}
812 
813 		delete[] cycle_map;
814 		delete[] cycle_stat;
815 
816 		Swap_OffsetCalc(transKernel, params);
817 
818 		// Handle planar and interleaved right here
819 		switch (params.fft_inputLayout)
820 		{
821 		case CLFFT_COMPLEX_INTERLEAVED:
822 		case CLFFT_REAL:
823 
824 			clKernWrite(transKernel, 3) << "__local " << dtInput << " tmp_tot_mem[" << (num_elements_loaded * 2) << "];" << std::endl;
825 			clKernWrite(transKernel, 3) << tmpBuffType << " " << dtInput << " *te = tmp_tot_mem;" << std::endl;
826 
827 			clKernWrite(transKernel, 3) << tmpBuffType << " " << dtInput << " *to = (tmp_tot_mem + " << num_elements_loaded << ");" << std::endl;
828 
829 			//Do not advance offset when postcallback is set as the starting address of global buffer is needed
830 			if (!params.fft_hasPostCallback && !params.fft_hasPreCallback)
831 				clKernWrite(transKernel, 3) << "inputA += iOffset;" << std::endl;  // Set A ptr to the start of each slice
832 			break;
833 		case CLFFT_COMPLEX_PLANAR:
834 
835 			clKernWrite(transKernel, 3) << "__local " << dtComplex << " tmp_tot_mem[" << (num_elements_loaded * 2) << "];" << std::endl;
836 			clKernWrite(transKernel, 3) << tmpBuffType << " " << dtComplex << " *te = tmp_tot_mem;" << std::endl;
837 
838 			clKernWrite(transKernel, 3) << tmpBuffType << " " << dtComplex << " *to = (tmp_tot_mem + " << num_elements_loaded << ");" << std::endl;
839 
840 			//Do not advance offset when postcallback is set as the starting address of global buffer is needed
841 			if (!params.fft_hasPostCallback)
842 			{
843 				clKernWrite(transKernel, 3) << "inputA_R += iOffset;" << std::endl;  // Set A ptr to the start of each slice
844 				clKernWrite(transKernel, 3) << "inputA_I += iOffset;" << std::endl;  // Set A ptr to the start of each slice
845 			}
846 			break;
847 		case CLFFT_HERMITIAN_INTERLEAVED:
848 		case CLFFT_HERMITIAN_PLANAR:
849 			return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
850 
851 		default:
852 			return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
853 		}
854 
855 		switch (params.fft_inputLayout)
856 		{
857 		case CLFFT_COMPLEX_INTERLEAVED:
858 		case CLFFT_COMPLEX_PLANAR:
859 			clKernWrite(transKernel, 3) << tmpBuffType << " " << dtComplex << " *tmp_swap_ptr[2];" << std::endl;
860 			break;
861 		case CLFFT_REAL:
862 			clKernWrite(transKernel, 3) << tmpBuffType << " " << dtPlanar << " *tmp_swap_ptr[2];" << std::endl;
863 		}
864 		clKernWrite(transKernel, 3) << "tmp_swap_ptr[0] = te;" << std::endl;
865 		clKernWrite(transKernel, 3) << "tmp_swap_ptr[1] = to;" << std::endl;
866 
867 		clKernWrite(transKernel, 3) << "size_t swap_inx = 0;" << std::endl;
868 
869 		clKernWrite(transKernel, 3) << "size_t start = cycle_stat[g_index / " << num_grps_pro_row << "][0];" << std::endl;
870 		clKernWrite(transKernel, 3) << "size_t end = cycle_stat[g_index / " << num_grps_pro_row << "][1];" << std::endl;
871 
872 		clKernWrite(transKernel, 3) << "size_t end_indx = " << num_elements_loaded << ";" << std::endl;
873 		clKernWrite(transKernel, 3) << "size_t work_id = g_index % " << num_grps_pro_row << ";" << std::endl;
874 
875 		clKernWrite(transKernel, 3) << "if( work_id == " << (num_grps_pro_row - 1) << " ){" << std::endl;
876 		clKernWrite(transKernel, 6) << "end_indx = " << smaller_dim - num_elements_loaded * (num_grps_pro_row - 1) << ";" << std::endl;
877 		clKernWrite(transKernel, 3) << "}" << std::endl;
878 
879 		clKernWrite(transKernel, 3) << "for (size_t loop = start; loop <= end; loop ++){" << std::endl;
880 		clKernWrite(transKernel, 6) << "swap_inx = 1 - swap_inx;" << std::endl;
881 
882 		switch (params.fft_inputLayout)
883 		{
884 		case CLFFT_COMPLEX_INTERLEAVED:
885 		case CLFFT_REAL:
886 			clKernWrite(transKernel, 6) << "swap(inputA, tmp_swap_ptr[swap_inx], tmp_swap_ptr[1 - swap_inx], swap_table[loop][0], swap_table[loop][1], swap_table[loop][2], end_indx, work_id";
887 			break;
888 		case CLFFT_COMPLEX_PLANAR:
889 			clKernWrite(transKernel, 6) << "swap(inputA_R, inputA_I, tmp_swap_ptr[swap_inx], tmp_swap_ptr[1 - swap_inx], swap_table[loop][0], swap_table[loop][1], swap_table[loop][2], end_indx, work_id";
890 			break;
891 		}
892 		if (params.fft_hasPostCallback)
893 		{
894 			clKernWrite(transKernel, 0) << ", iOffset, post_userdata";
895 			if (params.fft_postCallback.localMemSize > 0)
896 			{
897 				clKernWrite(transKernel, 0) << ", localmem";
898 			}
899 		}
900 		if (params.fft_hasPreCallback)
901 		{
902 			clKernWrite(transKernel, 0) << ", iOffset, pre_userdata";
903 			if (params.fft_preCallback.localMemSize > 0)
904 			{
905 				clKernWrite(transKernel, 0) << ", localmem";
906 			}
907 		}
908 		clKernWrite(transKernel, 0) << ");" << std::endl;
909 
910 		clKernWrite(transKernel, 3) << "}" << std::endl;
911 
912 		clKernWrite(transKernel, 0) << "}" << std::endl;
913 		strKernel = transKernel.str();
914 	}
915 	return CLFFT_SUCCESS;
916 }
917 
918 //swap lines. a more general kernel generator.
919 //this function accepts any ratio in theory. But in practice we restrict it to 1:2, 1:3, 1:5 and 1:10 ration
genSwapKernelGeneral(const FFTGeneratedTransposeNonSquareAction::Signature & params,std::string & strKernel,std::string & KernelFuncName,const size_t & lwSize,const size_t reShapeFactor)920 clfftStatus genSwapKernelGeneral(const FFTGeneratedTransposeNonSquareAction::Signature & params, std::string& strKernel, std::string& KernelFuncName, const size_t& lwSize, const size_t reShapeFactor)
921 {
922 	if (params.fft_placeness == CLFFT_OUTOFPLACE)
923 		return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
924 
925 	size_t smaller_dim = (params.fft_N[0] < params.fft_N[1]) ? params.fft_N[0] : params.fft_N[1];
926 	size_t bigger_dim = (params.fft_N[0] >= params.fft_N[1]) ? params.fft_N[0] : params.fft_N[1];
927 	size_t dim_ratio = bigger_dim / smaller_dim;
928 	/*
929 	if ( (params.fft_N[0] != 2 * params.fft_N[1]) && (params.fft_N[1] != 2 * params.fft_N[0]) &&
930 		 (params.fft_N[0] != 3 * params.fft_N[1]) && (params.fft_N[1] != 3 * params.fft_N[0]) &&
931 		 (params.fft_N[0] != 5 * params.fft_N[1]) && (params.fft_N[1] != 5 * params.fft_N[0]) &&
932 		 (params.fft_N[0] != 10 * params.fft_N[1]) && (params.fft_N[1] != 10 * params.fft_N[0]) )
933     */
934 	if(dim_ratio % 2 != 0 && dim_ratio % 3 != 0 && dim_ratio % 5 != 0 && dim_ratio % 10 != 0)
935 	{
936 		return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
937 	}
938 
939 	strKernel.reserve(4096);
940 	std::stringstream transKernel(std::stringstream::out);
941 
942 	// These strings represent the various data types we read or write in the kernel, depending on how the plan
943 	// is configured
944 	std::string dtInput;        // The type read as input into kernel
945 	std::string dtOutput;       // The type written as output from kernel
946 	std::string dtPlanar;       // Fundamental type for planar arrays
947 	std::string tmpBuffType;
948 	std::string dtComplex;      // Fundamental type for complex arrays
949 
950 								// NOTE:  Enable only for debug
951 								// clKernWrite( transKernel, 0 ) << "#pragma OPENCL EXTENSION cl_amd_printf : enable\n" << std::endl;
952 
953 								//if (params.fft_inputLayout != params.fft_outputLayout)
954 								//	return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
955 
956 	switch (params.fft_precision)
957 	{
958 	case CLFFT_SINGLE:
959 	case CLFFT_SINGLE_FAST:
960 		dtPlanar = "float";
961 		dtComplex = "float2";
962 		break;
963 	case CLFFT_DOUBLE:
964 	case CLFFT_DOUBLE_FAST:
965 		dtPlanar = "double";
966 		dtComplex = "double2";
967 
968 		// Emit code that enables double precision in the kernel
969 		clKernWrite(transKernel, 0) << "#ifdef cl_khr_fp64" << std::endl;
970 		clKernWrite(transKernel, 3) << "#pragma OPENCL EXTENSION cl_khr_fp64 : enable" << std::endl;
971 		clKernWrite(transKernel, 0) << "#else" << std::endl;
972 		clKernWrite(transKernel, 3) << "#pragma OPENCL EXTENSION cl_amd_fp64 : enable" << std::endl;
973 		clKernWrite(transKernel, 0) << "#endif\n" << std::endl;
974 
975 		break;
976 	default:
977 		return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
978 		break;
979 	}
980 
981 
982 	size_t LDS_per_WG = smaller_dim;
983 	while (LDS_per_WG > 1024)//avoiding using too much lds memory. the biggest LDS memory we will allocate would be 1024*sizeof(float2/double2)*2
984 	{
985 		if (LDS_per_WG % 2 == 0)
986 		{
987 			LDS_per_WG /= 2;
988 			continue;
989 		}
990 		if (LDS_per_WG % 3 == 0)
991 		{
992 			LDS_per_WG /= 3;
993 			continue;
994 		}
995 		if (LDS_per_WG % 5 == 0)
996 		{
997 			LDS_per_WG /= 5;
998 			continue;
999 		}
1000 		return CLFFT_NOTIMPLEMENTED;
1001 	}
1002 	size_t WG_per_line = smaller_dim / LDS_per_WG;
1003 
1004 	size_t input_elm_size_in_bytes;
1005 	switch (params.fft_precision)
1006 	{
1007 	case CLFFT_SINGLE:
1008 	case CLFFT_SINGLE_FAST:
1009 		input_elm_size_in_bytes = 4;
1010 		break;
1011 	case CLFFT_DOUBLE:
1012 	case CLFFT_DOUBLE_FAST:
1013 		input_elm_size_in_bytes = 8;
1014 		break;
1015 	default:
1016 		return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
1017 	}
1018 
1019 	switch (params.fft_outputLayout)
1020 	{
1021 	case CLFFT_COMPLEX_INTERLEAVED:
1022 	case CLFFT_COMPLEX_PLANAR:
1023 		input_elm_size_in_bytes *= 2;
1024 		break;
1025 	case CLFFT_REAL:
1026 		break;
1027 	default:
1028 		return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
1029 	}
1030 	/* not entirely clearly why do i need this yet
1031 	size_t max_elements_loaded = AVAIL_MEM_SIZE / input_elm_size_in_bytes;
1032 	size_t num_elements_loaded;
1033 	size_t local_work_size_swap, num_grps_pro_row;
1034 	*/
1035 
1036 	//if pre-callback is set for the plan
1037 	if (params.fft_hasPreCallback)
1038 	{
1039 		//we have already checked available LDS for pre callback
1040 		//Insert callback function code at the beginning
1041 		clKernWrite(transKernel, 0) << params.fft_preCallback.funcstring << std::endl;
1042 		clKernWrite(transKernel, 0) << std::endl;
1043 	}
1044 	//if post-callback is set for the plan
1045 	//rarely do we need post callback in swap kernel. But it is possible.
1046 	if (params.fft_hasPostCallback)
1047 	{
1048 		clKernWrite(transKernel, 0) << params.fft_postCallback.funcstring << std::endl;
1049 		clKernWrite(transKernel, 0) << std::endl;
1050 	}
1051 
1052 	//twiddle in swap kernel (for now, swap with twiddle seems to always be the second kernel after transpose)
1053 	bool twiddleSwapKernel = params.fft_3StepTwiddle && (dim_ratio > 1);
1054 	//twiddle factors applied to the output of swap kernels if swap kernels are the last kernel in transpose order
1055 	bool twiddleSwapKernelOut = twiddleSwapKernel && (params.nonSquareKernelOrder == TRANSPOSE_AND_SWAP || params.nonSquareKernelOrder == TRANSPOSE_LEADING_AND_SWAP);
1056 	//twiddle factors applied to the input of swap kernels if swap kernels are the first kernel in transpose order
1057 	bool twiddleSwapKernelIn = twiddleSwapKernel && (params.nonSquareKernelOrder == SWAP_AND_TRANSPOSE);
1058 
1059 
1060 	//generate the swap_table
1061 	std::vector<std::vector<size_t> > permutationTable;
1062 	permutation_calculation(dim_ratio, smaller_dim, permutationTable);
1063 
1064 	clKernWrite(transKernel, 0) << "__constant size_t swap_table["<< permutationTable.size()+2 <<"][1] = {" << std::endl;
1065 	clKernWrite(transKernel, 0) << "{0}," << std::endl;
1066 	clKernWrite(transKernel, 0) << "{"<< smaller_dim * dim_ratio - 1 <<"}," << std::endl;// add the first and last row to the swap table. needed for twiddling
1067 	for (std::vector<std::vector<size_t> >::iterator itor = permutationTable.begin(); itor != permutationTable.end(); itor++)
1068 	{
1069 		clKernWrite(transKernel, 0) << "{" << (*itor)[0] << "}";
1070 		if (itor == (permutationTable.end() - 1))//last vector
1071 			clKernWrite(transKernel, 0) << std::endl << "};" << std::endl;
1072 		else
1073 			clKernWrite(transKernel, 0) << "," << std::endl;
1074 	}
1075 
1076 	//twiddle in swap kernel
1077 	//twiddle in or out should be using the same twiddling table
1078 	if (twiddleSwapKernel)
1079 	{
1080 		std::string str;
1081 		StockhamGenerator::TwiddleTableLarge twLarge(smaller_dim * smaller_dim * dim_ratio);
1082 		if ((params.fft_precision == CLFFT_SINGLE) || (params.fft_precision == CLFFT_SINGLE_FAST))
1083 			twLarge.GenerateTwiddleTable<StockhamGenerator::P_SINGLE>(str);
1084 		else
1085 			twLarge.GenerateTwiddleTable<StockhamGenerator::P_DOUBLE>(str);
1086 		clKernWrite(transKernel, 0) << str << std::endl;
1087 		clKernWrite(transKernel, 0) << std::endl;
1088 	}
1089 
1090 	//std::string funcName = "swap_nonsquare_" + std::to_string(smaller_dim) + "_" + std::to_string(dim_ratio);
1091 	std::string funcName = "swap_nonsquare_";
1092 	std::string smaller_dim_str = static_cast<const std::ostringstream&>(std::ostringstream() << smaller_dim).str();
1093 	std::string dim_ratio_str = static_cast<const std::ostringstream&>(std::ostringstream() << dim_ratio).str();
1094 	if(params.fft_N[0] > params.fft_N[1])
1095 		funcName = funcName + smaller_dim_str + "_" + dim_ratio_str;
1096 	else
1097 		funcName = funcName + dim_ratio_str + "_" + smaller_dim_str;
1098 
1099 	KernelFuncName = funcName;
1100 	size_t local_work_size_swap = 256;
1101 
1102     for (size_t bothDir = 0; bothDir < 2; bothDir++)
1103     {
1104         bool fwd = bothDir ? false : true;
1105         // Generate kernel API
1106 
1107         /*when swap can be performed in LDS itself then, same prototype of transpose can be used for swap function too*/
1108         std::string funcNameTW;
1109         if (twiddleSwapKernel)
1110         {
1111             if (fwd)
1112                 funcNameTW = funcName + "_tw_fwd";
1113             else
1114                 funcNameTW = funcName + "_tw_back";
1115         }
1116         else
1117             funcNameTW = funcName;
1118 
1119         genTransposePrototypeLeadingDimensionBatched(params, local_work_size_swap, dtPlanar, dtComplex, funcNameTW, transKernel, dtInput, dtOutput);
1120 
1121         clKernWrite(transKernel, 3) << "//each wg handles 1/"<< WG_per_line <<" row of " << LDS_per_WG << " in memory" << std::endl;
1122         clKernWrite(transKernel, 3) << "const size_t num_wg_per_batch = " << (permutationTable.size() + 2)*WG_per_line << ";" << std::endl; // number of wg per batch = number of independent cycles
1123         clKernWrite(transKernel, 3) << "size_t group_id = get_group_id(0);" << std::endl;
1124         clKernWrite(transKernel, 3) << "size_t idx = get_local_id(0);" << std::endl;
1125 
1126         clKernWrite(transKernel, 3) << std::endl;
1127         clKernWrite(transKernel, 3) << "size_t batch_offset = group_id / num_wg_per_batch;" << std::endl;
1128         switch (params.fft_inputLayout)
1129         {
1130         case CLFFT_REAL:
1131         case CLFFT_COMPLEX_INTERLEAVED:
1132             clKernWrite(transKernel, 3) << "inputA += batch_offset*" << smaller_dim * bigger_dim << ";" << std::endl;
1133             break;
1134         case CLFFT_HERMITIAN_INTERLEAVED:
1135         case CLFFT_HERMITIAN_PLANAR:
1136             return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
1137         case CLFFT_COMPLEX_PLANAR:
1138         {
1139             clKernWrite(transKernel, 3) << "inputA_R += batch_offset*" << smaller_dim * bigger_dim << ";" << std::endl;
1140             clKernWrite(transKernel, 3) << "inputA_I += batch_offset*" << smaller_dim * bigger_dim << ";" << std::endl;
1141             break;
1142         }
1143         default:
1144             return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
1145         }
1146         clKernWrite(transKernel, 3) << "group_id -= batch_offset*" << (permutationTable.size() + 2)*WG_per_line << ";" << std::endl;
1147 
1148         clKernWrite(transKernel, 3) << std::endl;
1149 		if(WG_per_line == 1)
1150 			clKernWrite(transKernel, 3) << "size_t prev = swap_table[group_id][0];" << std::endl;
1151 		else
1152 			clKernWrite(transKernel, 3) << "size_t prev = swap_table[group_id/" << WG_per_line <<"][0];" << std::endl;
1153         clKernWrite(transKernel, 3) << "size_t next = 0;" << std::endl;
1154 
1155         clKernWrite(transKernel, 3) << std::endl;
1156         switch (params.fft_inputLayout)
1157         {
1158         case CLFFT_REAL:
1159         case CLFFT_COMPLEX_INTERLEAVED:
1160         {
1161             clKernWrite(transKernel, 3) << "__local " << dtInput << " prevValue[" << LDS_per_WG << "];" << std::endl;//lds within each wg should be able to store a row block (smaller_dim) of element
1162             clKernWrite(transKernel, 3) << "__local " << dtInput << " nextValue[" << LDS_per_WG << "];" << std::endl;
1163             break;
1164         }
1165         case CLFFT_COMPLEX_PLANAR:
1166         {
1167             clKernWrite(transKernel, 3) << "__local " << dtComplex << " prevValue[" << LDS_per_WG << "];" << std::endl;//lds within each wg should be able to store a row block (smaller_dim) of element
1168             clKernWrite(transKernel, 3) << "__local " << dtComplex << " nextValue[" << LDS_per_WG << "];" << std::endl;
1169             break;
1170         }
1171         case CLFFT_HERMITIAN_INTERLEAVED:
1172         case CLFFT_HERMITIAN_PLANAR:
1173             return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
1174         default:
1175             return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
1176         }
1177 
1178         clKernWrite(transKernel, 3) << std::endl;
1179         if (params.fft_N[0] > params.fft_N[1])//decides whether we have a tall or wide rectangle
1180         {
1181 			if (WG_per_line == 1)
1182 			{
1183 				//might look like: size_t group_offset = (prev/3)*729*3 + (prev%3)*729;
1184 				clKernWrite(transKernel, 3) << "size_t group_offset = (prev/" << dim_ratio << ")*" << smaller_dim << "*" << dim_ratio
1185 					<< " + (prev%" << dim_ratio << ")*" << smaller_dim << ";" << std::endl;
1186 			}
1187 			else
1188 			{
1189 				//if smaller_dim is 2187 > 1024 this should look like size_t group_offset = (prev/3)*2187*3 + (prev%3)*2187 + (group_id % 3)*729;
1190 				clKernWrite(transKernel, 3) << "size_t group_offset = (prev/" << dim_ratio << ")*" << smaller_dim << "*" << dim_ratio
1191 					<< " + (prev%" << dim_ratio << ")*" << smaller_dim << " + (group_id % " << WG_per_line << ")*" << LDS_per_WG << ";" << std::endl;
1192 			}
1193         }
1194         else
1195         {
1196 			if (WG_per_line == 1)//might look like: size_t group_offset = prev*729;
1197 				clKernWrite(transKernel, 3) << "size_t group_offset = (prev*" << smaller_dim << ");" << std::endl;
1198 			else//if smaller_dim is 2187 > 1024 this should look like size_t group_offset = prev*2187 + (group_id % 3)*729;
1199 				clKernWrite(transKernel, 3) << "size_t group_offset = (prev*" << smaller_dim << ") + (group_id % " << WG_per_line << ")*" << LDS_per_WG << ";" << std::endl;
1200         }
1201 
1202         clKernWrite(transKernel, 3) << std::endl;
1203         //move to that row block and load that row block to LDS
1204 		if (twiddleSwapKernelIn)
1205 		{
1206 			clKernWrite(transKernel, 6) << "size_t p;" << std::endl;
1207 			clKernWrite(transKernel, 6) << "size_t q;" << std::endl;
1208 			clKernWrite(transKernel, 6) << dtComplex << " twiddle_factor;" << std::endl;
1209 		}
1210         switch (params.fft_inputLayout)
1211         {
1212         case CLFFT_REAL:
1213         case CLFFT_COMPLEX_INTERLEAVED:
1214         {
1215             for (size_t i = 0; i < LDS_per_WG; i = i + 256)
1216             {
1217 				if (i + 256 < LDS_per_WG)
1218 				{
1219 					if (params.fft_hasPreCallback)
1220 					{
1221 						clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "] = " << params.fft_preCallback.funcname
1222 							<< "(inputA-batch_offset*" << smaller_dim * bigger_dim << ", batch_offset*" << smaller_dim * bigger_dim << "+group_offset+idx+" << i << ", pre_userdata);" << std::endl;
1223 					}
1224 					else
1225 					{
1226 						if (twiddleSwapKernelIn)
1227 						{
1228 							if (params.fft_N[0] > params.fft_N[1])//decides whether we have a tall or wide rectangle
1229 							{
1230 								//input is wide; output is tall; read input index realted
1231 								clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << bigger_dim << ";" << std::endl;
1232 								clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << bigger_dim << ";" << std::endl;
1233 							}
1234 							else
1235 							{
1236 								//input is tall; output is wide; read input index realted
1237 								clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << smaller_dim << ";" << std::endl;
1238 								clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << smaller_dim << ";" << std::endl;
1239 							}
1240 							clKernWrite(transKernel, 6) << "twiddle_factor = TW3step(p*q);" << std::endl;
1241 							if (fwd)
1242 							{
1243 								//forward
1244 								clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].x = inputA[group_offset+idx+" << i << "].x * twiddle_factor.x - inputA[group_offset+idx+" << i << "].y * twiddle_factor.y;" << std::endl;
1245 								clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].y = inputA[group_offset+idx+" << i << "].x * twiddle_factor.y + inputA[group_offset+idx+" << i << "].y * twiddle_factor.x;" << std::endl;
1246 							}
1247 							else
1248 							{
1249 								//backward
1250 								clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].x = inputA[group_offset+idx+" << i << "].x * twiddle_factor.x + inputA[group_offset+idx+" << i << "].y * twiddle_factor.y;" << std::endl;
1251 								clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].y = inputA[group_offset+idx+" << i << "].y * twiddle_factor.x - inputA[group_offset+idx+" << i << "].x * twiddle_factor.y;" << std::endl;
1252 							}
1253 						}
1254 						else
1255 							clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "] = inputA[group_offset+idx+" << i << "];" << std::endl;
1256 					}
1257 				}
1258                 else
1259                 {
1260                     // need to handle boundary
1261                     clKernWrite(transKernel, 3) << "if(idx+" << i << "<" << LDS_per_WG << "){" << std::endl;
1262 					if (params.fft_hasPreCallback)
1263 					{
1264 						clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "] = " << params.fft_preCallback.funcname
1265 							<< "(inputA-batch_offset*" << smaller_dim * bigger_dim << ", batch_offset*" << smaller_dim * bigger_dim << "+group_offset+idx+" << i << ", pre_userdata);" << std::endl;
1266 					}
1267 					else
1268 					{
1269 						if (twiddleSwapKernelIn)
1270 						{
1271 							if (params.fft_N[0] > params.fft_N[1])//decides whether we have a tall or wide rectangle
1272 							{
1273 								//input is wide; output is tall; read input index realted
1274 								clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << bigger_dim << ";" << std::endl;
1275 								clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << bigger_dim << ";" << std::endl;
1276 							}
1277 							else
1278 							{
1279 								//input is tall; output is wide; read input index realted
1280 								clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << smaller_dim << ";" << std::endl;
1281 								clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << smaller_dim << ";" << std::endl;
1282 							}
1283 							clKernWrite(transKernel, 6) << "twiddle_factor = TW3step(p*q);" << std::endl;
1284 							if (fwd)
1285 							{
1286 								//forward
1287 								clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].x = inputA[group_offset+idx+" << i << "].x * twiddle_factor.x - inputA[group_offset+idx+" << i << "].y * twiddle_factor.y;" << std::endl;
1288 								clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].y = inputA[group_offset+idx+" << i << "].x * twiddle_factor.y + inputA[group_offset+idx+" << i << "].y * twiddle_factor.x;" << std::endl;
1289 							}
1290 							else
1291 							{
1292 								//backward
1293 								clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].x = inputA[group_offset+idx+" << i << "].x * twiddle_factor.x + inputA[group_offset+idx+" << i << "].y * twiddle_factor.y;" << std::endl;
1294 								clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].y = inputA[group_offset+idx+" << i << "].y * twiddle_factor.x - inputA[group_offset+idx+" << i << "].x * twiddle_factor.y;" << std::endl;
1295 							}
1296 						}
1297 						else
1298 							clKernWrite(transKernel, 6) << "prevValue[idx+" << i << "] = inputA[group_offset+idx+" << i << "];" << std::endl;
1299 					}
1300                     clKernWrite(transKernel, 3) << "}" << std::endl;
1301                 }
1302             }
1303             break;
1304         }
1305         case CLFFT_HERMITIAN_INTERLEAVED:
1306         case CLFFT_HERMITIAN_PLANAR:
1307             return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
1308         case CLFFT_COMPLEX_PLANAR:
1309         {
1310             for (size_t i = 0; i < LDS_per_WG; i = i + 256)
1311             {
1312                 if (i + 256 < LDS_per_WG)
1313                 {
1314 					if (params.fft_hasPreCallback)
1315 					{
1316 						clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "] = " << params.fft_preCallback.funcname <<
1317 							"(inputA_R-batch_offset*"<< smaller_dim * bigger_dim <<", inputA_I-batch_offset*"<< smaller_dim * bigger_dim <<
1318 							", batch_offset*" << smaller_dim * bigger_dim << "+group_offset+idx+" << i << ", pre_userdata);" << std::endl;
1319 					}
1320 					else
1321 					{
1322 						if (twiddleSwapKernelIn)
1323 						{
1324 							if (params.fft_N[0] > params.fft_N[1])//decides whether we have a tall or wide rectangle
1325 							{
1326 								//input is wide; output is tall; read input index realted
1327 								clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << bigger_dim << ";" << std::endl;
1328 								clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << bigger_dim << ";" << std::endl;
1329 							}
1330 							else
1331 							{
1332 								//input is tall; output is wide; read input index realted
1333 								clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << smaller_dim << ";" << std::endl;
1334 								clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << smaller_dim << ";" << std::endl;
1335 							}
1336 							clKernWrite(transKernel, 6) << "twiddle_factor = TW3step(p*q);" << std::endl;
1337 							if (fwd)
1338 							{
1339 								//forward
1340 								clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].x = inputA_R[group_offset+idx+" << i << "] * twiddle_factor.x - inputA_I[group_offset+idx+" << i << "] * twiddle_factor.y;" << std::endl;
1341 								clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].y = inputA_R[group_offset+idx+" << i << "] * twiddle_factor.y + inputA_I[group_offset+idx+" << i << "] * twiddle_factor.x;" << std::endl;
1342 							}
1343 							else
1344 							{
1345 								//backward
1346 								clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].x = inputA_R[group_offset+idx+" << i << "] * twiddle_factor.x + inputA_I[group_offset+idx+" << i << "] * twiddle_factor.y;" << std::endl;
1347 								clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].y = inputA_I[group_offset+idx+" << i << "] * twiddle_factor.x - inputA_R[group_offset+idx+" << i << "] * twiddle_factor.y;" << std::endl;
1348 							}
1349 						}
1350 						else
1351 						{
1352 							clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].x = inputA_R[group_offset+idx+" << i << "];" << std::endl;
1353 							clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].y = inputA_I[group_offset+idx+" << i << "];" << std::endl;
1354 						}
1355 					}
1356                 }
1357                 else
1358                 {
1359                     // need to handle boundary
1360                     clKernWrite(transKernel, 3) << "if(idx+" << i << "<" << LDS_per_WG << "){" << std::endl;
1361 					if (params.fft_hasPreCallback)
1362 					{
1363 						clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "] = " << params.fft_preCallback.funcname <<
1364 							"(inputA_R-batch_offset*" << smaller_dim * bigger_dim << ", inputA_I-batch_offset*" << smaller_dim * bigger_dim <<
1365 							", batch_offset*" << smaller_dim * bigger_dim << "+group_offset+idx+" << i << ", pre_userdata);" << std::endl;
1366 					}
1367 					else
1368 					{
1369 						if (twiddleSwapKernelIn)
1370 						{
1371 							if (params.fft_N[0] > params.fft_N[1])//decides whether we have a tall or wide rectangle
1372 							{
1373 								//input is wide; output is tall; read input index realted
1374 								clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << bigger_dim << ";" << std::endl;
1375 								clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << bigger_dim << ";" << std::endl;
1376 							}
1377 							else
1378 							{
1379 								//input is tall; output is wide; read input index realted
1380 								clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << smaller_dim << ";" << std::endl;
1381 								clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << smaller_dim << ";" << std::endl;
1382 							}
1383 							clKernWrite(transKernel, 6) << "twiddle_factor = TW3step(p*q);" << std::endl;
1384 							if (fwd)
1385 							{
1386 								//forward
1387 								clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].x = inputA_R[group_offset+idx+" << i << "] * twiddle_factor.x - inputA_I[group_offset+idx+" << i << "] * twiddle_factor.y;" << std::endl;
1388 								clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].y = inputA_R[group_offset+idx+" << i << "] * twiddle_factor.y + inputA_I[group_offset+idx+" << i << "] * twiddle_factor.x;" << std::endl;
1389 							}
1390 							else
1391 							{
1392 								//backward
1393 								clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].x = inputA_R[group_offset+idx+" << i << "] * twiddle_factor.x + inputA_I[group_offset+idx+" << i << "] * twiddle_factor.y;" << std::endl;
1394 								clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].y = inputA_I[group_offset+idx+" << i << "] * twiddle_factor.x - inputA_R[group_offset+idx+" << i << "] * twiddle_factor.y;" << std::endl;
1395 							}
1396 						}
1397 						else
1398 						{
1399 							clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].x = inputA_R[group_offset+idx+" << i << "];" << std::endl;
1400 							clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].y = inputA_I[group_offset+idx+" << i << "];" << std::endl;
1401 						}
1402 					}
1403                     clKernWrite(transKernel, 3) << "}" << std::endl;
1404                 }
1405             }
1406             break;
1407         }
1408         default:
1409             return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
1410         }
1411         clKernWrite(transKernel, 3) << "barrier(CLK_LOCAL_MEM_FENCE);" << std::endl;
1412 
1413         clKernWrite(transKernel, 3) << std::endl;
1414         clKernWrite(transKernel, 3) << "do{" << std::endl;//begining of do-while
1415         //calculate the next location p(k) = (k*n)mod(m*n-1), if 0 < k < m*n-1
1416         if (params.fft_N[0] > params.fft_N[1])//decides whether we have a tall or wide rectangle
1417         {
1418             clKernWrite(transKernel, 6) << "next = (prev*" << smaller_dim << ")%" << smaller_dim*dim_ratio - 1 << ";" << std::endl;
1419             //takes care the last row
1420             clKernWrite(transKernel, 6) << "if (prev == " << smaller_dim * dim_ratio - 1 << ")" << std::endl;
1421             clKernWrite(transKernel, 9) << "next = " << smaller_dim * dim_ratio - 1 << ";" << std::endl;
1422 			if (WG_per_line == 1)
1423 			{
1424 				clKernWrite(transKernel, 6) << "group_offset = (next/" << dim_ratio << ")*" << smaller_dim << "*" << dim_ratio
1425 					<< " + (next%" << dim_ratio << ")*" << smaller_dim << ";" << std::endl; //might look like: group_offset = (next/3)*729*3 + (next%3)*729;
1426 			}
1427 			else
1428 			{
1429 				//if smaller_dim is 2187 > 1024 this should look like size_t group_offset = (next/3)*2187*3 + (next%3)*2187 + (group_id % 3)*729;
1430 				clKernWrite(transKernel, 6) << "group_offset = (next/" << dim_ratio << ")*" << smaller_dim << "*" << dim_ratio
1431 					<< " + (next%" << dim_ratio << ")*" << smaller_dim << " + (group_id % " << WG_per_line << ")*" << LDS_per_WG << ";" << std::endl;
1432 			}
1433         }
1434         else
1435         {
1436             clKernWrite(transKernel, 6) << "next = (prev*" << dim_ratio << ")%" << smaller_dim*dim_ratio - 1 << ";" << std::endl;
1437             //takes care the last row
1438             clKernWrite(transKernel, 6) << "if (prev == " << smaller_dim * dim_ratio - 1 << ")" << std::endl;
1439             clKernWrite(transKernel, 9) << "next = " << smaller_dim * dim_ratio - 1 << ";" << std::endl;
1440 			if (WG_per_line == 1) //might look like: size_t group_offset = prev*729;
1441 				clKernWrite(transKernel, 6) << "group_offset = (next*" << smaller_dim << ");" << std::endl;
1442 			else//if smaller_dim is 2187 > 1024 this should look like size_t group_offset = next*2187 + (group_id % 3)*729;
1443 				clKernWrite(transKernel, 6) << "group_offset = (next*" << smaller_dim << ") + (group_id % " << WG_per_line << ")*" << LDS_per_WG << ";" << std::endl;
1444         }
1445 
1446 
1447         clKernWrite(transKernel, 3) << std::endl;
1448         switch (params.fft_inputLayout)
1449         {
1450         case CLFFT_REAL:
1451         case CLFFT_COMPLEX_INTERLEAVED:
1452         {
1453             for (size_t i = 0; i < LDS_per_WG; i = i + 256)
1454             {
1455                 if (i + 256 < LDS_per_WG)
1456 					if (params.fft_hasPreCallback)
1457 					{
1458 						clKernWrite(transKernel, 6) << "nextValue[idx+" << i << "] = " << params.fft_preCallback.funcname
1459 							<< "(inputA-batch_offset*" << smaller_dim * bigger_dim << ", batch_offset*" << smaller_dim*bigger_dim << "+group_offset+idx+" << i << ", pre_userdata);" << std::endl;
1460 					}
1461 					else
1462 					{
1463 						if (twiddleSwapKernelIn)
1464 						{
1465 							if (params.fft_N[0] > params.fft_N[1])//decides whether we have a tall or wide rectangle
1466 							{
1467 								//input is wide; output is tall; read input index realted
1468 								clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << bigger_dim << ";" << std::endl;
1469 								clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << bigger_dim << ";" << std::endl;
1470 							}
1471 							else
1472 							{
1473 								//input is tall; output is wide; read input index realted
1474 								clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << smaller_dim << ";" << std::endl;
1475 								clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << smaller_dim << ";" << std::endl;
1476 							}
1477 							clKernWrite(transKernel, 6) << "twiddle_factor = TW3step(p*q);" << std::endl;
1478 							if (fwd)
1479 							{
1480 								//forward
1481 								clKernWrite(transKernel, 3) << "nextValue[idx+" << i << "].x = inputA[group_offset+idx+" << i << "].x * twiddle_factor.x - inputA[group_offset+idx+" << i << "].y * twiddle_factor.y;" << std::endl;
1482 								clKernWrite(transKernel, 3) << "nextValue[idx+" << i << "].y = inputA[group_offset+idx+" << i << "].x * twiddle_factor.y + inputA[group_offset+idx+" << i << "].y * twiddle_factor.x;" << std::endl;
1483 							}
1484 							else
1485 							{
1486 								//backward
1487 								clKernWrite(transKernel, 3) << "nextValue[idx+" << i << "].x = inputA[group_offset+idx+" << i << "].x * twiddle_factor.x + inputA[group_offset+idx+" << i << "].y * twiddle_factor.y;" << std::endl;
1488 								clKernWrite(transKernel, 3) << "nextValue[idx+" << i << "].y = inputA[group_offset+idx+" << i << "].y * twiddle_factor.x - inputA[group_offset+idx+" << i << "].x * twiddle_factor.y;" << std::endl;
1489 							}
1490 						}
1491 						else
1492 							clKernWrite(transKernel, 6) << "nextValue[idx+" << i << "] = inputA[group_offset+idx+" << i << "];" << std::endl;
1493 					}
1494                 else
1495                 {
1496                     // need to handle boundary
1497                     clKernWrite(transKernel, 6) << "if(idx+" << i << "<" << LDS_per_WG << "){" << std::endl;
1498 					if (params.fft_hasPreCallback)
1499 					{
1500 						clKernWrite(transKernel, 6) << "nextValue[idx+" << i << "] = " << params.fft_preCallback.funcname
1501 							<< "(inputA-batch_offset*" << smaller_dim * bigger_dim << ", batch_offset*" << smaller_dim*bigger_dim << "+group_offset+idx+" << i << ", pre_userdata);" << std::endl;
1502 					}
1503 					else
1504 					{
1505 						if (twiddleSwapKernelIn)
1506 						{
1507 							if (params.fft_N[0] > params.fft_N[1])//decides whether we have a tall or wide rectangle
1508 							{
1509 								//input is wide; output is tall; read input index realted
1510 								clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << bigger_dim << ";" << std::endl;
1511 								clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << bigger_dim << ";" << std::endl;
1512 							}
1513 							else
1514 							{
1515 								//input is tall; output is wide; read input index realted
1516 								clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << smaller_dim << ";" << std::endl;
1517 								clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << smaller_dim << ";" << std::endl;
1518 							}
1519 							clKernWrite(transKernel, 6) << "twiddle_factor = TW3step(p*q);" << std::endl;
1520 							if (fwd)
1521 							{
1522 								//forward
1523 								clKernWrite(transKernel, 3) << "nextValue[idx+" << i << "].x = inputA[group_offset+idx+" << i << "].x * twiddle_factor.x - inputA[group_offset+idx+" << i << "].y * twiddle_factor.y;" << std::endl;
1524 								clKernWrite(transKernel, 3) << "nextValue[idx+" << i << "].y = inputA[group_offset+idx+" << i << "].x * twiddle_factor.y + inputA[group_offset+idx+" << i << "].y * twiddle_factor.x;" << std::endl;
1525 							}
1526 							else
1527 							{
1528 								//backward
1529 								clKernWrite(transKernel, 3) << "nextValue[idx+" << i << "].x = inputA[group_offset+idx+" << i << "].x * twiddle_factor.x + inputA[group_offset+idx+" << i << "].y * twiddle_factor.y;" << std::endl;
1530 								clKernWrite(transKernel, 3) << "nextValue[idx+" << i << "].y = inputA[group_offset+idx+" << i << "].y * twiddle_factor.x - inputA[group_offset+idx+" << i << "].x * twiddle_factor.y;" << std::endl;
1531 							}
1532 						}
1533 						else
1534 							clKernWrite(transKernel, 9) << "nextValue[idx+" << i << "] = inputA[group_offset+idx+" << i << "];" << std::endl;
1535 					}
1536                     clKernWrite(transKernel, 6) << "}" << std::endl;
1537                 }
1538             }
1539             break;
1540         }
1541         case CLFFT_HERMITIAN_INTERLEAVED:
1542         case CLFFT_HERMITIAN_PLANAR:
1543             return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
1544         case CLFFT_COMPLEX_PLANAR:
1545         {
1546             for (size_t i = 0; i < LDS_per_WG; i = i + 256)
1547             {
1548                 if (i + 256 < LDS_per_WG)
1549                 {
1550 					if (params.fft_hasPreCallback)
1551 					{
1552 						clKernWrite(transKernel, 6) << "nextValue[idx+" << i << "] = " << params.fft_preCallback.funcname <<
1553 							"(inputA_R-batch_offset*" << smaller_dim * bigger_dim << ", inputA_I-batch_offset*" << smaller_dim * bigger_dim <<
1554 							", batch_offset*" << smaller_dim * bigger_dim << "+group_offset+idx+" << i << ", pre_userdata);" << std::endl;
1555 					}
1556 					else
1557 					{
1558 						if (twiddleSwapKernelIn)
1559 						{
1560 							if (params.fft_N[0] > params.fft_N[1])//decides whether we have a tall or wide rectangle
1561 							{
1562 								//input is wide; output is tall; read input index realted
1563 								clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << bigger_dim << ";" << std::endl;
1564 								clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << bigger_dim << ";" << std::endl;
1565 							}
1566 							else
1567 							{
1568 								//input is tall; output is wide; read input index realted
1569 								clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << smaller_dim << ";" << std::endl;
1570 								clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << smaller_dim << ";" << std::endl;
1571 							}
1572 							clKernWrite(transKernel, 6) << "twiddle_factor = TW3step(p*q);" << std::endl;
1573 							if (fwd)
1574 							{
1575 								//forward
1576 								clKernWrite(transKernel, 3) << "nextValue[idx+" << i << "].x = inputA_R[group_offset+idx+" << i << "] * twiddle_factor.x - inputA_I[group_offset+idx+" << i << "] * twiddle_factor.y;" << std::endl;
1577 								clKernWrite(transKernel, 3) << "nextValue[idx+" << i << "].y = inputA_R[group_offset+idx+" << i << "] * twiddle_factor.y + inputA_I[group_offset+idx+" << i << "] * twiddle_factor.x;" << std::endl;
1578 							}
1579 							else
1580 							{
1581 								//backward
1582 								clKernWrite(transKernel, 3) << "nextValue[idx+" << i << "].x = inputA_R[group_offset+idx+" << i << "] * twiddle_factor.x + inputA_I[group_offset+idx+" << i << "] * twiddle_factor.y;" << std::endl;
1583 								clKernWrite(transKernel, 3) << "nextValue[idx+" << i << "].y = inputA_I[group_offset+idx+" << i << "] * twiddle_factor.x - inputA_R[group_offset+idx+" << i << "] * twiddle_factor.y;" << std::endl;
1584 							}
1585 						}
1586 						else
1587 						{
1588 							clKernWrite(transKernel, 6) << "nextValue[idx+" << i << "].x = inputA_R[group_offset+idx+" << i << "];" << std::endl;
1589 							clKernWrite(transKernel, 6) << "nextValue[idx+" << i << "].y = inputA_I[group_offset+idx+" << i << "];" << std::endl;
1590 						}
1591 					}
1592                 }
1593                 else
1594                 {
1595                     // need to handle boundary
1596                     clKernWrite(transKernel, 6) << "if(idx+" << i << "<" << LDS_per_WG << "){" << std::endl;
1597 					if (params.fft_hasPreCallback)
1598 					{
1599 						clKernWrite(transKernel, 6) << "nextValue[idx+" << i << "] = " << params.fft_preCallback.funcname <<
1600 							"(inputA_R-batch_offset*" << smaller_dim * bigger_dim << ", inputA_I-batch_offset*" << smaller_dim * bigger_dim <<
1601 							", batch_offset*" << smaller_dim * bigger_dim << "+group_offset+idx+" << i << ", pre_userdata);" << std::endl;
1602 					}
1603 					else
1604 					{
1605 						if (twiddleSwapKernelIn)
1606 						{
1607 							if (params.fft_N[0] > params.fft_N[1])//decides whether we have a tall or wide rectangle
1608 							{
1609 								//input is wide; output is tall; read input index realted
1610 								clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << bigger_dim << ";" << std::endl;
1611 								clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << bigger_dim << ";" << std::endl;
1612 							}
1613 							else
1614 							{
1615 								//input is tall; output is wide; read input index realted
1616 								clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << smaller_dim << ";" << std::endl;
1617 								clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << smaller_dim << ";" << std::endl;
1618 							}
1619 							clKernWrite(transKernel, 6) << "twiddle_factor = TW3step(p*q);" << std::endl;
1620 							if (fwd)
1621 							{
1622 								//forward
1623 								clKernWrite(transKernel, 3) << "nextValue[idx+" << i << "].x = inputA_R[group_offset+idx+" << i << "] * twiddle_factor.x - inputA_I[group_offset+idx+" << i << "] * twiddle_factor.y;" << std::endl;
1624 								clKernWrite(transKernel, 3) << "nextValue[idx+" << i << "].y = inputA_R[group_offset+idx+" << i << "] * twiddle_factor.y + inputA_I[group_offset+idx+" << i << "] * twiddle_factor.x;" << std::endl;
1625 							}
1626 							else
1627 							{
1628 								//backward
1629 								clKernWrite(transKernel, 3) << "nextValue[idx+" << i << "].x = inputA_R[group_offset+idx+" << i << "] * twiddle_factor.x + inputA_I[group_offset+idx+" << i << "] * twiddle_factor.y;" << std::endl;
1630 								clKernWrite(transKernel, 3) << "nextValue[idx+" << i << "].y = inputA_I[group_offset+idx+" << i << "] * twiddle_factor.x - inputA_R[group_offset+idx+" << i << "] * twiddle_factor.y;" << std::endl;
1631 							}
1632 						}
1633 						else
1634 						{
1635 							clKernWrite(transKernel, 6) << "nextValue[idx+" << i << "].x = inputA_R[group_offset+idx+" << i << "];" << std::endl;
1636 							clKernWrite(transKernel, 6) << "nextValue[idx+" << i << "].y = inputA_I[group_offset+idx+" << i << "];" << std::endl;
1637 						}
1638 					}
1639                     clKernWrite(transKernel, 6) << "}" << std::endl;
1640                 }
1641             }
1642             break;
1643         }
1644         default:
1645             return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
1646         }
1647 
1648         clKernWrite(transKernel, 6) << "barrier(CLK_LOCAL_MEM_FENCE);" << std::endl;
1649 
1650         clKernWrite(transKernel, 3) << std::endl;
1651         switch (params.fft_inputLayout)
1652         {
1653         case CLFFT_REAL:// for real case this is different
1654         case CLFFT_COMPLEX_INTERLEAVED:
1655         {
1656             if (twiddleSwapKernelOut)
1657             {
1658                 clKernWrite(transKernel, 6) << "size_t p;" << std::endl;
1659                 clKernWrite(transKernel, 6) << "size_t q;" << std::endl;
1660                 clKernWrite(transKernel, 6) << dtComplex << " twiddle_factor;" << std::endl;
1661 
1662                 for (size_t i = 0; i < LDS_per_WG; i = i + 256)
1663                 {
1664                     if (i + 256 < LDS_per_WG)
1665                     {
1666                         if (params.fft_N[0] > params.fft_N[1])//decides whether we have a tall or wide rectangle
1667                         {
1668                             //input is wide; output is tall
1669                             clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << smaller_dim << ";" << std::endl;
1670                             clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << smaller_dim << ";" << std::endl;
1671                         }
1672                         else
1673                         {
1674                             //input is tall; output is wide
1675                             clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << bigger_dim << ";" << std::endl;
1676                             clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << bigger_dim << ";" << std::endl;
1677                         }
1678                         clKernWrite(transKernel, 6) << "twiddle_factor = TW3step(p*q);" << std::endl;
1679                         if (fwd)
1680                         {
1681                             //forward
1682                             clKernWrite(transKernel, 6) << "inputA[group_offset+idx+" << i << "].x = prevValue[idx+" << i << "].x * twiddle_factor.x - prevValue[idx+" << i << "].y * twiddle_factor.y;" << std::endl;
1683                             clKernWrite(transKernel, 6) << "inputA[group_offset+idx+" << i << "].y = prevValue[idx+" << i << "].x * twiddle_factor.y + prevValue[idx+" << i << "].y * twiddle_factor.x;" << std::endl;
1684                         }
1685                         else
1686                         {
1687                             //backward
1688                             clKernWrite(transKernel, 6) << "inputA[group_offset+idx+" << i << "].x = prevValue[idx+" << i << "].x * twiddle_factor.x + prevValue[idx+" << i << "].y * twiddle_factor.y;" << std::endl;
1689                             clKernWrite(transKernel, 6) << "inputA[group_offset+idx+" << i << "].y = prevValue[idx+" << i << "].y * twiddle_factor.x - prevValue[idx+" << i << "].x * twiddle_factor.y;" << std::endl;
1690                         }
1691                         //clKernWrite(transKernel, 6) << "inputA[group_offset+idx+" << i << "] = prevValue[idx+" << i << "];" << std::endl;
1692                     }
1693                     else
1694                     {
1695                         // need to handle boundary
1696                         clKernWrite(transKernel, 6) << "if(idx+" << i << "<" << LDS_per_WG << "){" << std::endl;
1697                         if (params.fft_N[0] > params.fft_N[1])//decides whether we have a tall or wide rectangle
1698                         {
1699                             //input is wide; output is tall
1700                             clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << smaller_dim << ";" << std::endl;
1701                             clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << smaller_dim << ";" << std::endl;
1702                         }
1703                         else
1704                         {
1705                             //input is tall; output is wide
1706                             clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << bigger_dim << ";" << std::endl;
1707                             clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << bigger_dim << ";" << std::endl;
1708                         }
1709                         clKernWrite(transKernel, 6) << "twiddle_factor = TW3step(p*q);" << std::endl;
1710                         if (fwd)
1711                         {
1712                             //forward
1713                             clKernWrite(transKernel, 6) << "inputA[group_offset+idx+" << i << "].x = prevValue[idx+" << i << "].x * twiddle_factor.x - prevValue[idx+" << i << "].y * twiddle_factor.y;" << std::endl;
1714                             clKernWrite(transKernel, 6) << "inputA[group_offset+idx+" << i << "].y = prevValue[idx+" << i << "].x * twiddle_factor.y + prevValue[idx+" << i << "].y * twiddle_factor.x;" << std::endl;
1715                         }
1716                         else
1717                         {
1718                             //backward
1719                             clKernWrite(transKernel, 6) << "inputA[group_offset+idx+" << i << "].x = prevValue[idx+" << i << "].x * twiddle_factor.x + prevValue[idx+" << i << "].y * twiddle_factor.y;" << std::endl;
1720                             clKernWrite(transKernel, 6) << "inputA[group_offset+idx+" << i << "].y = prevValue[idx+" << i << "].y * twiddle_factor.x - prevValue[idx+" << i << "].x * twiddle_factor.y;" << std::endl;
1721                         }
1722                         //clKernWrite(transKernel, 9) << "inputA[group_offset+idx+" << i << "] = prevValue[idx+" << i << "];" << std::endl;
1723                         clKernWrite(transKernel, 6) << "}" << std::endl;
1724                     }
1725                 }
1726             }
1727             else if(!twiddleSwapKernelOut)//could be twiddleSwapKernelIn
1728             {
1729                 for (size_t i = 0; i < LDS_per_WG; i = i + 256)
1730                 {
1731 					//twiddling and callback do not coexist
1732 					if (params.fft_hasPostCallback)
1733 					{
1734 						if (i + 256 < LDS_per_WG)
1735 						{
1736 							clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(inputA - batch_offset*" << smaller_dim * bigger_dim
1737 								<< ", batch_offset*" << smaller_dim * bigger_dim << "+group_offset+idx+" << i << ", post_userdata, prevValue[idx+" << i
1738 								<< "]);" << std::endl;
1739 						}
1740 						else
1741 						{
1742 							// need to handle boundary
1743 							clKernWrite(transKernel, 6) << "if(idx+" << i << "<" << LDS_per_WG << "){" << std::endl;
1744 							clKernWrite(transKernel, 9) << params.fft_postCallback.funcname << "(inputA - batch_offset*" << smaller_dim * bigger_dim
1745 								<< ", batch_offset*" << smaller_dim * bigger_dim << "+group_offset+idx+" << i << ", post_userdata, prevValue[idx+" << i
1746 								<< "]);" << std::endl;
1747 							clKernWrite(transKernel, 6) << "}" << std::endl;
1748 						}
1749 					}
1750 					else
1751 					{
1752 						if (i + 256 < LDS_per_WG)
1753 							clKernWrite(transKernel, 6) << "inputA[group_offset+idx+" << i << "] = prevValue[idx+" << i << "];" << std::endl;
1754 						else
1755 						{
1756 							// need to handle boundary
1757 							clKernWrite(transKernel, 6) << "if(idx+" << i << "<" << LDS_per_WG << "){" << std::endl;
1758 							clKernWrite(transKernel, 9) << "inputA[group_offset+idx+" << i << "] = prevValue[idx+" << i << "];" << std::endl;
1759 							clKernWrite(transKernel, 6) << "}" << std::endl;
1760 						}
1761 					}
1762                 }
1763             }
1764             break;
1765         }
1766         case CLFFT_HERMITIAN_INTERLEAVED:
1767         case CLFFT_HERMITIAN_PLANAR:
1768             return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
1769         case CLFFT_COMPLEX_PLANAR:
1770         {
1771             if (twiddleSwapKernelOut)
1772             {
1773                 clKernWrite(transKernel, 6) << "size_t p;" << std::endl;
1774                 clKernWrite(transKernel, 6) << "size_t q;" << std::endl;
1775                 clKernWrite(transKernel, 6) << dtComplex << " twiddle_factor;" << std::endl;
1776                 for (size_t i = 0; i < LDS_per_WG; i = i + 256)
1777                 {
1778                     if (i + 256 < LDS_per_WG)
1779                     {
1780                         if (params.fft_N[0] > params.fft_N[1])//decides whether we have a tall or wide rectangle
1781                         {
1782                             //input is wide; output is tall
1783                             clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << smaller_dim << ";" << std::endl;
1784                             clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << smaller_dim << ";" << std::endl;
1785                         }
1786                         else
1787                         {
1788                             //input is tall; output is wide
1789                             clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << bigger_dim << ";" << std::endl;
1790                             clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << bigger_dim << ";" << std::endl;
1791                         }
1792                         clKernWrite(transKernel, 6) << "twiddle_factor = TW3step(p*q);" << std::endl;
1793                         if (fwd)
1794                         {
1795                             //forward
1796                             clKernWrite(transKernel, 6) << "inputA_R[group_offset+idx+" << i << "] = prevValue[idx+" << i << "].x * twiddle_factor.x - prevValue[idx+" << i << "].y * twiddle_factor.y;" << std::endl;
1797                             clKernWrite(transKernel, 6) << "inputA_I[group_offset+idx+" << i << "] = prevValue[idx+" << i << "].x * twiddle_factor.y + prevValue[idx+" << i << "].y * twiddle_factor.x;" << std::endl;
1798                         }
1799                         else
1800                         {
1801                             //backward
1802                             clKernWrite(transKernel, 6) << "inputA_R[group_offset+idx+" << i << "] = prevValue[idx+" << i << "].x * twiddle_factor.x + prevValue[idx+" << i << "].y * twiddle_factor.y;" << std::endl;
1803                             clKernWrite(transKernel, 6) << "inputA_I[group_offset+idx+" << i << "] = prevValue[idx+" << i << "].y * twiddle_factor.x - prevValue[idx+" << i << "].x * twiddle_factor.y;" << std::endl;
1804                         }
1805                     }
1806                     else
1807                     {
1808                         // need to handle boundary
1809                         clKernWrite(transKernel, 6) << "if(idx+" << i << "<" << LDS_per_WG << "){" << std::endl;
1810                         if (params.fft_N[0] > params.fft_N[1])//decides whether we have a tall or wide rectangle
1811                         {
1812                             //input is wide; output is tall
1813                             clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << smaller_dim << ";" << std::endl;
1814                             clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << smaller_dim << ";" << std::endl;
1815                         }
1816                         else
1817                         {
1818                             //input is tall; output is wide
1819                             clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << bigger_dim << ";" << std::endl;
1820                             clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << bigger_dim << ";" << std::endl;
1821                         }
1822                         clKernWrite(transKernel, 6) << "twiddle_factor = TW3step(p*q);" << std::endl;
1823                         if (fwd)
1824                         {
1825                             //forward
1826                             clKernWrite(transKernel, 6) << "inputA_R[group_offset+idx+" << i << "] = prevValue[idx+" << i << "].x * twiddle_factor.x - prevValue[idx+" << i << "].y * twiddle_factor.y;" << std::endl;
1827                             clKernWrite(transKernel, 6) << "inputA_I[group_offset+idx+" << i << "] = prevValue[idx+" << i << "].x * twiddle_factor.y + prevValue[idx+" << i << "].y * twiddle_factor.x;" << std::endl;
1828                         }
1829                         else
1830                         {
1831                             //backward
1832                             clKernWrite(transKernel, 6) << "inputA_R[group_offset+idx+" << i << "] = prevValue[idx+" << i << "].x * twiddle_factor.x + prevValue[idx+" << i << "].y * twiddle_factor.y;" << std::endl;
1833                             clKernWrite(transKernel, 6) << "inputA_I[group_offset+idx+" << i << "] = prevValue[idx+" << i << "].y * twiddle_factor.x - prevValue[idx+" << i << "].x * twiddle_factor.y;" << std::endl;
1834                         }
1835                         clKernWrite(transKernel, 6) << "}" << std::endl;
1836                     }
1837                     clKernWrite(transKernel, 3) << std::endl;
1838                 }
1839             }
1840 			else if (!twiddleSwapKernelOut)//could be twiddleSwapKernelIn
1841             {
1842                 for (size_t i = 0; i < LDS_per_WG; i = i + 256)
1843                 {
1844 					//twiddling and callback do not coexist
1845 					if (params.fft_hasPostCallback)
1846 					{
1847 						if (i + 256 < LDS_per_WG)
1848 						{
1849 							//clKernWrite(transKernel, 6) << "inputA_R[group_offset+idx+" << i << "] = prevValue[idx+" << i << "].x;" << std::endl;
1850 							//clKernWrite(transKernel, 6) << "inputA_I[group_offset+idx+" << i << "] = prevValue[idx+" << i << "].y;" << std::endl;
1851 							clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(inputA_R - batch_offset*" << smaller_dim * bigger_dim
1852 								<< ", inputA_I - batch_offset*" << smaller_dim * bigger_dim << ", batch_offset*" << smaller_dim * bigger_dim
1853 								<< "+group_offset+idx+" << i << ", post_userdata, prevValue[idx+" << i << "].x, prevValue[idx+" << i << "].y);" << std::endl;
1854 						}
1855 						else
1856 						{
1857 							// need to handle boundary
1858 							clKernWrite(transKernel, 6) << "if(idx+" << i << "<" << LDS_per_WG << "){" << std::endl;
1859 							clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(inputA_R - batch_offset*" << smaller_dim * bigger_dim
1860 								<< ", inputA_I - batch_offset*" << smaller_dim * bigger_dim << ", batch_offset*" << smaller_dim * bigger_dim
1861 								<< "+group_offset+idx+" << i << ", post_userdata, prevValue[idx+" << i << "].x, prevValue[idx+" << i << "].y);" << std::endl;
1862 							clKernWrite(transKernel, 6) << "}" << std::endl;
1863 						}
1864 					}
1865 					else
1866 					{
1867 						if (i + 256 < LDS_per_WG)
1868 						{
1869 							clKernWrite(transKernel, 6) << "inputA_R[group_offset+idx+" << i << "] = prevValue[idx+" << i << "].x;" << std::endl;
1870 							clKernWrite(transKernel, 6) << "inputA_I[group_offset+idx+" << i << "] = prevValue[idx+" << i << "].y;" << std::endl;
1871 						}
1872 						else
1873 						{
1874 							// need to handle boundary
1875 							clKernWrite(transKernel, 6) << "if(idx+" << i << "<" << LDS_per_WG << "){" << std::endl;
1876 							clKernWrite(transKernel, 6) << "inputA_R[group_offset+idx+" << i << "] = prevValue[idx+" << i << "].x;" << std::endl;
1877 							clKernWrite(transKernel, 6) << "inputA_I[group_offset+idx+" << i << "] = prevValue[idx+" << i << "].y;" << std::endl;
1878 							clKernWrite(transKernel, 6) << "}" << std::endl;
1879 						}
1880 					}
1881                 }
1882             }
1883             break;
1884         }
1885         default:
1886             return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
1887         }
1888         clKernWrite(transKernel, 6) << "barrier(CLK_LOCAL_MEM_FENCE);" << std::endl;
1889 
1890         clKernWrite(transKernel, 3) << std::endl;
1891         switch (params.fft_inputLayout)
1892         {
1893         case CLFFT_REAL:
1894         case CLFFT_COMPLEX_INTERLEAVED:
1895         case CLFFT_COMPLEX_PLANAR:
1896         {
1897             for (size_t i = 0; i < LDS_per_WG; i = i + 256)
1898             {
1899                 if (i + 256 < LDS_per_WG)
1900                     clKernWrite(transKernel, 6) << "prevValue[idx+" << i << "] = nextValue[idx+" << i << "];" << std::endl;
1901                 else
1902                 {
1903                     // need to handle boundary
1904                     clKernWrite(transKernel, 6) << "if(idx+" << i << "<" << LDS_per_WG << "){" << std::endl;
1905                     clKernWrite(transKernel, 9) << "prevValue[idx + " << i << "] = nextValue[idx + " << i << "]; " << std::endl;
1906                     clKernWrite(transKernel, 6) << "}" << std::endl;
1907                 }
1908             }
1909             break;
1910         }
1911         case CLFFT_HERMITIAN_INTERLEAVED:
1912         case CLFFT_HERMITIAN_PLANAR:
1913             return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
1914         default:
1915             return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
1916         }
1917 
1918         clKernWrite(transKernel, 6) << "barrier(CLK_LOCAL_MEM_FENCE);" << std::endl;
1919 
1920         clKernWrite(transKernel, 3) << std::endl;
1921         clKernWrite(transKernel, 3) << "prev = next;" << std::endl;
1922 		if (WG_per_line == 1)
1923 			clKernWrite(transKernel, 3) << "}while(next!=swap_table[group_id][0]);" << std::endl;//end of do-while
1924 		else
1925 			clKernWrite(transKernel, 3) << "}while(next!=swap_table[group_id/"<< WG_per_line <<"][0]);" << std::endl;//end of do-while
1926         clKernWrite(transKernel, 0) << "}" << std::endl;//end of kernel
1927 
1928         if (!twiddleSwapKernel)
1929             break; // break for bothDir only need one kernel if twiddle is not done here
1930 
1931     }//end of for (size_t bothDir = 0; bothDir < 2; bothDir++)
1932 
1933 
1934 	//by now the kernel string is generated
1935 	strKernel = transKernel.str();
1936 	return CLFFT_SUCCESS;
1937 }
1938 
1939 //generate transepose kernel with sqaure 2d matrix of row major with arbitrary batch size
1940 /*
1941 Below is a matrix(row major) containing three sqaure sub matrix along column
1942 The transpose will be done within each sub matrix.
1943 [M0
1944  M1
1945  M2]
1946 */
genTransposeKernelBatched(const FFTGeneratedTransposeSquareAction::Signature & params,std::string & strKernel,const size_t & lwSize,const size_t reShapeFactor)1947 clfftStatus genTransposeKernelBatched(const FFTGeneratedTransposeSquareAction::Signature & params, std::string& strKernel, const size_t& lwSize, const size_t reShapeFactor)
1948 {
1949 	strKernel.reserve(4096);
1950 	std::stringstream transKernel(std::stringstream::out);
1951 
1952 	// These strings represent the various data types we read or write in the kernel, depending on how the plan
1953 	// is configured
1954 	std::string dtInput;        // The type read as input into kernel
1955 	std::string dtOutput;       // The type written as output from kernel
1956 	std::string dtPlanar;       // Fundamental type for planar arrays
1957 	std::string dtComplex;      // Fundamental type for complex arrays
1958 
1959 								// NOTE:  Enable only for debug
1960 								// clKernWrite( transKernel, 0 ) << "#pragma OPENCL EXTENSION cl_amd_printf : enable\n" << std::endl;
1961 
1962 								//if (params.fft_inputLayout != params.fft_outputLayout)
1963 								//	return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
1964 
1965 	switch (params.fft_precision)
1966 	{
1967 	case CLFFT_SINGLE:
1968 	case CLFFT_SINGLE_FAST:
1969 		dtPlanar = "float";
1970 		dtComplex = "float2";
1971 		break;
1972 	case CLFFT_DOUBLE:
1973 	case CLFFT_DOUBLE_FAST:
1974 		dtPlanar = "double";
1975 		dtComplex = "double2";
1976 
1977 		// Emit code that enables double precision in the kernel
1978 		clKernWrite(transKernel, 0) << "#ifdef cl_khr_fp64" << std::endl;
1979 		clKernWrite(transKernel, 3) << "#pragma OPENCL EXTENSION cl_khr_fp64 : enable" << std::endl;
1980 		clKernWrite(transKernel, 0) << "#else" << std::endl;
1981 		clKernWrite(transKernel, 3) << "#pragma OPENCL EXTENSION cl_amd_fp64 : enable" << std::endl;
1982 		clKernWrite(transKernel, 0) << "#endif\n" << std::endl;
1983 
1984 		break;
1985 	default:
1986 		return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
1987 		break;
1988 	}
1989 
1990 	//  it is a better idea to do twiddle in swap kernel if we will have a swap kernel.
1991 	//  for pure square transpose, twiddle will be done in transpose kernel
1992 	bool twiddleTransposeKernel = params.fft_3StepTwiddle && (params.transposeMiniBatchSize == 1);//when transposeMiniBatchSize == 1 it is guaranteed to be a sqaure matrix transpose
1993 	//	If twiddle computation has been requested, generate the lookup function
1994 
1995 	if (twiddleTransposeKernel)
1996 	{
1997 		std::string str;
1998 		StockhamGenerator::TwiddleTableLarge twLarge(params.fft_N[0] * params.fft_N[1]);
1999 		if ((params.fft_precision == CLFFT_SINGLE) || (params.fft_precision == CLFFT_SINGLE_FAST))
2000 			twLarge.GenerateTwiddleTable<StockhamGenerator::P_SINGLE>(str);
2001 		else
2002 			twLarge.GenerateTwiddleTable<StockhamGenerator::P_DOUBLE>(str);
2003 		clKernWrite(transKernel, 0) << str << std::endl;
2004 		clKernWrite(transKernel, 0) << std::endl;
2005 	}
2006 
2007 
2008 
2009 	// This detects whether the input matrix is square
2010 	bool notSquare = (params.fft_N[0] == params.fft_N[1]) ? false : true;
2011 
2012 	if (notSquare && (params.fft_placeness == CLFFT_INPLACE))
2013 		return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
2014 
2015 	// This detects whether the input matrix is a multiple of 16*reshapefactor or not
2016 
2017 	bool mult_of_16 = (params.fft_N[0] % (reShapeFactor * 16) == 0) ? true : false;
2018 
2019 
2020 
2021 	for (size_t bothDir = 0; bothDir < 2; bothDir++)
2022 	{
2023 		bool fwd = bothDir ? false : true;
2024 
2025 		//If pre-callback is set for the plan
2026 		if (params.fft_hasPreCallback)
2027 		{
2028 			//Insert callback function code at the beginning
2029 			clKernWrite(transKernel, 0) << params.fft_preCallback.funcstring << std::endl;
2030 			clKernWrite(transKernel, 0) << std::endl;
2031 		}
2032 		//If post-callback is set for the plan
2033 		if (params.fft_hasPostCallback)
2034 		{
2035 			//Insert callback function code at the beginning
2036 			clKernWrite(transKernel, 0) << params.fft_postCallback.funcstring << std::endl;
2037 			clKernWrite(transKernel, 0) << std::endl;
2038 		}
2039 
2040 		std::string funcName;
2041 		if (twiddleTransposeKernel) // it makes more sense to do twiddling in swap kernel
2042 			funcName = fwd ? "transpose_square_tw_fwd" : "transpose_square_tw_back";
2043 		else
2044 			funcName = "transpose_square";
2045 
2046 
2047 		// Generate kernel API
2048 		genTransposePrototype(params, lwSize, dtPlanar, dtComplex, funcName, transKernel, dtInput, dtOutput);
2049 		size_t wgPerBatch;
2050 		if (mult_of_16)
2051 			wgPerBatch = (params.fft_N[0] / 16 / reShapeFactor)*(params.fft_N[0] / 16 / reShapeFactor + 1) / 2;
2052 		else
2053 			wgPerBatch = (params.fft_N[0] / (16 * reShapeFactor) + 1)*(params.fft_N[0] / (16 * reShapeFactor) + 1 + 1) / 2;
2054 		clKernWrite(transKernel, 3) << "const size_t numGroupsY_1 = " << wgPerBatch << ";" << std::endl;
2055 
2056 		for (size_t i = 2; i < params.fft_DataDim - 1; i++)
2057 		{
2058 			clKernWrite(transKernel, 3) << "const size_t numGroupsY_" << i << " = numGroupsY_" << i - 1 << " * " << params.fft_N[i] << ";" << std::endl;
2059 		}
2060 
2061 		clKernWrite(transKernel, 3) << "size_t g_index;" << std::endl;
2062 		clKernWrite(transKernel, 3) << std::endl;
2063 
2064 		OffsetCalc(transKernel, params, true);
2065 
2066 
2067 		if (params.fft_placeness == CLFFT_OUTOFPLACE)
2068 			OffsetCalc(transKernel, params, false);
2069 
2070 
2071 		// Handle planar and interleaved right here
2072 		switch (params.fft_inputLayout)
2073 		{
2074 		case CLFFT_COMPLEX_INTERLEAVED:
2075 			//Do not advance offset when precallback is set as the starting address of global buffer is needed
2076 			if (!params.fft_hasPreCallback)
2077 			{
2078 				clKernWrite(transKernel, 3) << "inputA += iOffset;" << std::endl;  // Set A ptr to the start of each slice
2079 			}
2080 			break;
2081 		case CLFFT_COMPLEX_PLANAR:
2082 			//Do not advance offset when precallback is set as the starting address of global buffer is needed
2083 			if (!params.fft_hasPreCallback)
2084 			{
2085 				clKernWrite(transKernel, 3) << "inputA_R += iOffset;" << std::endl;  // Set A ptr to the start of each slice
2086 				clKernWrite(transKernel, 3) << "inputA_I += iOffset;" << std::endl;  // Set A ptr to the start of each slice
2087 			}
2088 			break;
2089 		case CLFFT_HERMITIAN_INTERLEAVED:
2090 		case CLFFT_HERMITIAN_PLANAR:
2091 			return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
2092 		case CLFFT_REAL:
2093 			break;
2094 		default:
2095 			return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
2096 		}
2097 
2098 		if (params.fft_placeness == CLFFT_OUTOFPLACE)
2099 		{
2100 			switch (params.fft_outputLayout)
2101 			{
2102 			case CLFFT_COMPLEX_INTERLEAVED:
2103 				clKernWrite(transKernel, 3) << "outputA += oOffset;" << std::endl;  // Set A ptr to the start of each slice
2104 
2105 				break;
2106 			case CLFFT_COMPLEX_PLANAR:
2107 
2108 				clKernWrite(transKernel, 3) << "outputA_R += oOffset;" << std::endl;  // Set A ptr to the start of each slice
2109 				clKernWrite(transKernel, 3) << "outputA_I += oOffset;" << std::endl;  // Set A ptr to the start of each slice
2110 				break;
2111 			case CLFFT_HERMITIAN_INTERLEAVED:
2112 			case CLFFT_HERMITIAN_PLANAR:
2113 				return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
2114 			case CLFFT_REAL:
2115 				break;
2116 			default:
2117 				return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
2118 			}
2119 		}
2120 		else
2121 		{
2122 			switch (params.fft_inputLayout)
2123 			{
2124 			case CLFFT_COMPLEX_INTERLEAVED:
2125 				if (params.fft_hasPreCallback)
2126 					clKernWrite(transKernel, 3) << "global " << dtInput << " *outputA = inputA + iOffset;" << std::endl;
2127 				else
2128 					clKernWrite(transKernel, 3) << "global " << dtInput << " *outputA = inputA;" << std::endl;
2129 				break;
2130 			case CLFFT_COMPLEX_PLANAR:
2131 				if (params.fft_hasPreCallback)
2132 				{
2133 					clKernWrite(transKernel, 3) << "global " << dtInput << " *outputA_R = inputA_R + iOffset;" << std::endl;
2134 					clKernWrite(transKernel, 3) << "global " << dtInput << " *outputA_I = inputA_I + iOffset;" << std::endl;
2135 				}
2136 				else
2137 				{
2138 					clKernWrite(transKernel, 3) << "global " << dtInput << " *outputA_R = inputA_R;" << std::endl;
2139 					clKernWrite(transKernel, 3) << "global " << dtInput << " *outputA_I = inputA_I;" << std::endl;
2140 				}
2141 				break;
2142 			case CLFFT_HERMITIAN_INTERLEAVED:
2143 			case CLFFT_HERMITIAN_PLANAR:
2144 				return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
2145 			case CLFFT_REAL:
2146 				break;
2147 			default:
2148 				return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
2149 			}
2150 		}
2151 
2152 
2153 		clKernWrite(transKernel, 3) << std::endl;
2154 
2155 
2156 		// Now compute the corresponding y,x coordinates
2157 		// for a triangular indexing
2158 		if (mult_of_16)
2159 			clKernWrite(transKernel, 3) << "float row = (" << -2.0f*params.fft_N[0] / 16 / reShapeFactor - 1 << "+sqrt((" << 4.0f*params.fft_N[0] / 16 / reShapeFactor*(params.fft_N[0] / 16 / reShapeFactor + 1) << "-8.0f*g_index- 7)))/ (-2.0f);" << std::endl;
2160 		else
2161 			clKernWrite(transKernel, 3) << "float row = (" << -2.0f*(params.fft_N[0] / (16 * reShapeFactor) + 1) - 1 << "+sqrt((" << 4.0f*(params.fft_N[0] / (16 * reShapeFactor) + 1)*(params.fft_N[0] / (16 * reShapeFactor) + 1 + 1) << "-8.0f*g_index- 7)))/ (-2.0f);" << std::endl;
2162 
2163 
2164 		clKernWrite(transKernel, 3) << "if (row == (float)(size_t)row) row -= 1; " << std::endl;
2165 		clKernWrite(transKernel, 3) << "const size_t t_gy = (size_t)row;" << std::endl;
2166 
2167 		clKernWrite(transKernel, 3) << "" << std::endl;
2168 
2169 		if (mult_of_16)
2170 			clKernWrite(transKernel, 3) << "const long t_gx_p = g_index - " << (params.fft_N[0] / 16 / reShapeFactor) << "*t_gy + t_gy*(t_gy + 1) / 2;" << std::endl;
2171 		else
2172 			clKernWrite(transKernel, 3) << "const long t_gx_p = g_index - " << (params.fft_N[0] / (16 * reShapeFactor) + 1) << "*t_gy + t_gy*(t_gy + 1) / 2;" << std::endl;
2173 
2174 		clKernWrite(transKernel, 3) << "const long t_gy_p = t_gx_p - t_gy;" << std::endl;
2175 
2176 
2177 		clKernWrite(transKernel, 3) << "" << std::endl;
2178 
2179 		clKernWrite(transKernel, 3) << "const size_t d_lidx = get_local_id(0) % 16;" << std::endl;
2180 		clKernWrite(transKernel, 3) << "const size_t d_lidy = get_local_id(0) / 16;" << std::endl;
2181 
2182 		clKernWrite(transKernel, 3) << "" << std::endl;
2183 
2184 		clKernWrite(transKernel, 3) << "const size_t lidy = (d_lidy * 16 + d_lidx) /" << (16 * reShapeFactor) << ";" << std::endl;
2185 		clKernWrite(transKernel, 3) << "const size_t lidx = (d_lidy * 16 + d_lidx) %" << (16 * reShapeFactor) << ";" << std::endl;
2186 
2187 		clKernWrite(transKernel, 3) << "" << std::endl;
2188 
2189 		clKernWrite(transKernel, 3) << "const size_t idx = lidx + t_gx_p*" << 16 * reShapeFactor << ";" << std::endl;
2190 		clKernWrite(transKernel, 3) << "const size_t idy = lidy + t_gy_p*" << 16 * reShapeFactor << ";" << std::endl;
2191 
2192 		clKernWrite(transKernel, 3) << "" << std::endl;
2193 
2194 		clKernWrite(transKernel, 3) << "const size_t starting_index_yx = t_gy_p*" << 16 * reShapeFactor << " + t_gx_p*" << 16 * reShapeFactor*params.fft_N[0] << ";" << std::endl;
2195 
2196 		clKernWrite(transKernel, 3) << "" << std::endl;
2197 
2198 		clKernWrite(transKernel, 3) << "__local " << dtComplex << " xy_s[" << 16 * reShapeFactor * 16 * reShapeFactor << "];" << std::endl;
2199 		clKernWrite(transKernel, 3) << "__local " << dtComplex << " yx_s[" << 16 * reShapeFactor * 16 * reShapeFactor << "];" << std::endl;
2200 
2201 		clKernWrite(transKernel, 3) << dtComplex << " tmpm, tmpt;" << std::endl;
2202 
2203 		clKernWrite(transKernel, 3) << "" << std::endl;
2204 
2205 		// Step 1: Load both blocks into local memory
2206 		// Here I load inputA for both blocks contiguously and write it contigously into
2207 		// the corresponding shared memories.
2208 		// Afterwards I use non-contiguous access from local memory and write contiguously
2209 		// back into the arrays
2210 
2211 		if (mult_of_16) {
2212 			clKernWrite(transKernel, 3) << "size_t index;" << std::endl;
2213 			clKernWrite(transKernel, 3) << "for (size_t loop = 0; loop<" << reShapeFactor*reShapeFactor << "; ++loop){" << std::endl;
2214 			clKernWrite(transKernel, 6) << "index = lidy*" << 16 * reShapeFactor << " + lidx + loop*256;" << std::endl;
2215 
2216 			// Handle planar and interleaved right here
2217 			switch (params.fft_inputLayout)
2218 			{
2219 			case CLFFT_COMPLEX_INTERLEAVED:
2220 			{
2221 				if (params.fft_hasPreCallback)
2222 				{
2223 					if (params.fft_preCallback.localMemSize > 0)
2224 					{
2225 						clKernWrite(transKernel, 6) << "tmpm = " << params.fft_preCallback.funcname << "(inputA, iOffset + (idy + loop * " << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata, localmem);" << std::endl;
2226 						clKernWrite(transKernel, 6) << "tmpt = " << params.fft_preCallback.funcname << "(inputA, iOffset + (lidy + loop * " << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata, localmem);" << std::endl;
2227 					}
2228 					else
2229 					{
2230 						clKernWrite(transKernel, 6) << "tmpm = " << params.fft_preCallback.funcname << "(inputA, iOffset + (idy + loop * " << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata);" << std::endl;
2231 						clKernWrite(transKernel, 6) << "tmpt = " << params.fft_preCallback.funcname << "(inputA, iOffset + (lidy + loop * " << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata);" << std::endl;
2232 					}
2233 				}
2234 				else
2235 				{
2236 					clKernWrite(transKernel, 6) << "tmpm = inputA[(idy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx];" << std::endl;
2237 					clKernWrite(transKernel, 6) << "tmpt = inputA[(lidy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx];" << std::endl;
2238 				}
2239 			}
2240 			break;
2241 			case CLFFT_COMPLEX_PLANAR:
2242 				dtInput = dtPlanar;
2243 				dtOutput = dtPlanar;
2244 				if (params.fft_hasPreCallback)
2245 				{
2246 					if (params.fft_preCallback.localMemSize > 0)
2247 					{
2248 						clKernWrite(transKernel, 6) << "tmpm = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (idy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata, localmem);" << std::endl;
2249 						clKernWrite(transKernel, 6) << "tmpt = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (lidy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata, localmem);" << std::endl;
2250 					}
2251 					else
2252 					{
2253 						clKernWrite(transKernel, 6) << "tmpm = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (idy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata);" << std::endl;
2254 						clKernWrite(transKernel, 6) << "tmpt = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (lidy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata);" << std::endl;
2255 					}
2256 				}
2257 				else
2258 				{
2259 					clKernWrite(transKernel, 6) << "tmpm.x = inputA_R[(idy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx];" << std::endl;
2260 					clKernWrite(transKernel, 6) << "tmpm.y = inputA_I[(idy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx];" << std::endl;
2261 
2262 					clKernWrite(transKernel, 6) << "tmpt.x = inputA_R[(lidy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx];" << std::endl;
2263 					clKernWrite(transKernel, 6) << "tmpt.y = inputA_I[(lidy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx];" << std::endl;
2264 				}
2265 				break;
2266 			case CLFFT_HERMITIAN_INTERLEAVED:
2267 			case CLFFT_HERMITIAN_PLANAR:
2268 				return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
2269 			case CLFFT_REAL:
2270 				break;
2271 			default:
2272 				return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
2273 			}
2274 
2275 			// it makes more sense to do twiddling in swap kernel
2276 			// If requested, generate the Twiddle math to multiply constant values
2277 			if (twiddleTransposeKernel)
2278 				genTwiddleMath(params, transKernel, dtComplex, fwd);
2279 
2280 			clKernWrite(transKernel, 6) << "xy_s[index] = tmpm; " << std::endl;
2281 			clKernWrite(transKernel, 6) << "yx_s[index] = tmpt; " << std::endl;
2282 
2283 			clKernWrite(transKernel, 3) << "}" << std::endl;
2284 
2285 			clKernWrite(transKernel, 3) << "" << std::endl;
2286 
2287 			clKernWrite(transKernel, 3) << "barrier(CLK_LOCAL_MEM_FENCE);" << std::endl;
2288 
2289 			clKernWrite(transKernel, 3) << "" << std::endl;
2290 
2291 
2292 			// Step2: Write from shared to global
2293 			clKernWrite(transKernel, 3) << "for (size_t loop = 0; loop<" << reShapeFactor*reShapeFactor << "; ++loop){" << std::endl;
2294 			clKernWrite(transKernel, 6) << "index = lidx*" << 16 * reShapeFactor << " + lidy + " << 16 / reShapeFactor << "*loop;" << std::endl;
2295 
2296 
2297 			// Handle planar and interleaved right here
2298 			switch (params.fft_outputLayout)
2299 			{
2300 			case CLFFT_COMPLEX_INTERLEAVED:
2301 				if (params.fft_hasPostCallback)
2302 				{
2303 					if (params.transposeMiniBatchSize < 2)//which means the matrix was not broken down into sub square matrics
2304 						clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(outputA, ((idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx), post_userdata, yx_s[index]";
2305 					else
2306 					{
2307 						//assume tranpose is only two dimensional for now
2308 						//size_t actualBatchSize = params.transposeBatchSize / params.transposeMiniBatchSize;
2309 						size_t blockOffset = params.fft_inStride[2];
2310 						clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(outputA-" << blockOffset <<"*((get_group_id(0)/numGroupsY_1)%"<< params.transposeMiniBatchSize <<"), ((idy + loop*" << 16 / reShapeFactor << ")*"
2311 							<< params.fft_N[0] << " + idx + "<< blockOffset <<"*( (get_group_id(0)/numGroupsY_1 )%" << params.transposeMiniBatchSize <<") " << "), post_userdata, yx_s[index]";
2312 					}
2313 					if (params.fft_postCallback.localMemSize > 0)
2314 					{
2315 						clKernWrite(transKernel, 0) << ", localmem";
2316 					}
2317 					clKernWrite(transKernel, 0) << ");" << std::endl;
2318 
2319 					if (params.transposeMiniBatchSize < 2)
2320 						clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(outputA, ((lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx), post_userdata, xy_s[index]";
2321 					else
2322 					{
2323 						size_t blockOffset = params.fft_inStride[2];
2324 						//clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(outputA-iOffset, ((lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx +iOffset), post_userdata, xy_s[index]";
2325 						//clKernWrite(transKernel, 0) << std::endl;
2326 						clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(outputA-" << blockOffset << "*((get_group_id(0)/numGroupsY_1)%" << params.transposeMiniBatchSize << "), ((lidy + loop*" << 16 / reShapeFactor << ")*"
2327 							<< params.fft_N[0] << " + lidx + starting_index_yx + " << blockOffset << "*( (get_group_id(0)/numGroupsY_1 )%" << params.transposeMiniBatchSize << ") " << "), post_userdata, xy_s[index]";
2328 					}
2329 					if (params.fft_postCallback.localMemSize > 0)
2330 					{
2331 						clKernWrite(transKernel, 0) << ", localmem";
2332 					}
2333 					clKernWrite(transKernel, 0) << ");" << std::endl;
2334 				}
2335 				else
2336 				{
2337 					clKernWrite(transKernel, 6) << "outputA[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index];" << std::endl;
2338 					clKernWrite(transKernel, 6) << "outputA[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx+ starting_index_yx] = xy_s[index];" << std::endl;
2339 				}
2340 				break;
2341 			case CLFFT_COMPLEX_PLANAR:
2342 				if (params.fft_hasPostCallback)
2343 				{
2344 					if (params.transposeMiniBatchSize < 2)//which means the matrix was not broken down into sub square matrics
2345 						clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(outputA_R, outputA_I, ((idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx), post_userdata, yx_s[index].x, yx_s[index].y";
2346 					else
2347 					{
2348 						size_t blockOffset = params.fft_inStride[2];
2349 						clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(outputA_R - "<< blockOffset << "*((get_group_id(0)/numGroupsY_1)%" << params.transposeMiniBatchSize <<
2350 							"), outputA_I -" << blockOffset << "*((get_group_id(0)/numGroupsY_1)%" << params.transposeMiniBatchSize <<
2351 							"), ((idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx +"<< blockOffset << "*((get_group_id(0)/numGroupsY_1)%" << params.transposeMiniBatchSize <<
2352 							")), post_userdata, yx_s[index].x, yx_s[index].y";
2353 					}
2354 					if (params.fft_postCallback.localMemSize > 0)
2355 					{
2356 						clKernWrite(transKernel, 0) << ", localmem";
2357 					}
2358 					clKernWrite(transKernel, 0) << ");" << std::endl;
2359 
2360 					if (params.transposeMiniBatchSize < 2)//which means the matrix was not broken down into sub square matrics
2361 						clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(outputA_R, outputA_I, ((lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx+ starting_index_yx), post_userdata, xy_s[index].x, xy_s[index].y";
2362 					else
2363 					{
2364 						size_t blockOffset = params.fft_inStride[2];
2365 						clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(outputA_R - " << blockOffset << "*((get_group_id(0)/numGroupsY_1)%" << params.transposeMiniBatchSize <<
2366 							"), outputA_I -" << blockOffset << "*((get_group_id(0)/numGroupsY_1)%" << params.transposeMiniBatchSize <<
2367 							"), ((lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx+ starting_index_yx +" << blockOffset << "*((get_group_id(0)/numGroupsY_1)%" << params.transposeMiniBatchSize <<
2368 							")), post_userdata, xy_s[index].x, xy_s[index].y";
2369 					}
2370 					if (params.fft_postCallback.localMemSize > 0)
2371 					{
2372 						clKernWrite(transKernel, 0) << ", localmem";
2373 					}
2374 					clKernWrite(transKernel, 0) << ");" << std::endl;
2375 				}
2376 				else
2377 				{
2378 					clKernWrite(transKernel, 6) << "outputA_R[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index].x;" << std::endl;
2379 					clKernWrite(transKernel, 6) << "outputA_I[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index].y;" << std::endl;
2380 
2381 					clKernWrite(transKernel, 6) << "outputA_R[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx+ starting_index_yx] = xy_s[index].x;" << std::endl;
2382 					clKernWrite(transKernel, 6) << "outputA_I[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx+ starting_index_yx] = xy_s[index].y;" << std::endl;
2383 				}
2384 				break;
2385 			case CLFFT_HERMITIAN_INTERLEAVED:
2386 			case CLFFT_HERMITIAN_PLANAR:
2387 				return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
2388 			case CLFFT_REAL:
2389 				break;
2390 			default:
2391 				return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
2392 			}
2393 
2394 
2395 
2396 			clKernWrite(transKernel, 3) << "}" << std::endl;
2397 
2398 		}
2399 		else {//mult_of_16
2400 
2401 			clKernWrite(transKernel, 3) << "size_t index;" << std::endl;
2402 			clKernWrite(transKernel, 3) << "if (" << params.fft_N[0] << " - (t_gx_p + 1) *" << 16 * reShapeFactor << ">0){" << std::endl;
2403 			clKernWrite(transKernel, 6) << "for (size_t loop = 0; loop<" << reShapeFactor*reShapeFactor << "; ++loop){" << std::endl;
2404 			clKernWrite(transKernel, 9) << "index = lidy*" << 16 * reShapeFactor << " + lidx + loop*256;" << std::endl;
2405 
2406 			// Handle planar and interleaved right here
2407 			switch (params.fft_inputLayout)
2408 			{
2409 			case CLFFT_COMPLEX_INTERLEAVED:
2410 				if (params.fft_hasPreCallback)
2411 				{
2412 					if (params.fft_preCallback.localMemSize > 0)
2413 					{
2414 						clKernWrite(transKernel, 9) << "tmpm = " << params.fft_preCallback.funcname << "(inputA, iOffset + (idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata, localmem);" << std::endl;
2415 						clKernWrite(transKernel, 9) << "tmpt = " << params.fft_preCallback.funcname << "(inputA, iOffset + (lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata, localmem);" << std::endl;
2416 					}
2417 					else
2418 					{
2419 						clKernWrite(transKernel, 9) << "tmpm = " << params.fft_preCallback.funcname << "(inputA, iOffset + (idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata);" << std::endl;
2420 						clKernWrite(transKernel, 9) << "tmpt = " << params.fft_preCallback.funcname << "(inputA, iOffset + (lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata);" << std::endl;
2421 					}
2422 				}
2423 				else
2424 				{
2425 					clKernWrite(transKernel, 9) << "tmpm = inputA[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx];" << std::endl;
2426 					clKernWrite(transKernel, 9) << "tmpt = inputA[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx];" << std::endl;
2427 				}
2428 				break;
2429 			case CLFFT_COMPLEX_PLANAR:
2430 				dtInput = dtPlanar;
2431 				dtOutput = dtPlanar;
2432 				if (params.fft_hasPreCallback)
2433 				{
2434 					if (params.fft_preCallback.localMemSize > 0)
2435 					{
2436 						clKernWrite(transKernel, 9) << "tmpm = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata, localmem);" << std::endl;
2437 						clKernWrite(transKernel, 9) << "tmpt = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata, localmem);" << std::endl;
2438 					}
2439 					else
2440 					{
2441 						clKernWrite(transKernel, 9) << "tmpm = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata);" << std::endl;
2442 						clKernWrite(transKernel, 9) << "tmpt = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata);" << std::endl;
2443 					}
2444 				}
2445 				else
2446 				{
2447 					clKernWrite(transKernel, 9) << "tmpm.x = inputA_R[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx];" << std::endl;
2448 					clKernWrite(transKernel, 9) << "tmpm.y = inputA_I[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx];" << std::endl;
2449 
2450 					clKernWrite(transKernel, 9) << "tmpt.x = inputA_R[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx];" << std::endl;
2451 					clKernWrite(transKernel, 9) << "tmpt.y = inputA_I[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx];" << std::endl;
2452 				}
2453 				break;
2454 			case CLFFT_HERMITIAN_INTERLEAVED:
2455 			case CLFFT_HERMITIAN_PLANAR:
2456 				return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
2457 			case CLFFT_REAL:
2458 				break;
2459 			default:
2460 				return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
2461 			}
2462 
2463 			// it makes more sense to do twiddling in swap kernel
2464 			// If requested, generate the Twiddle math to multiply constant values
2465 			if (twiddleTransposeKernel)
2466 				genTwiddleMath(params, transKernel, dtComplex, fwd);
2467 
2468 			clKernWrite(transKernel, 9) << "xy_s[index] = tmpm;" << std::endl;
2469 			clKernWrite(transKernel, 9) << "yx_s[index] = tmpt;" << std::endl;
2470 			clKernWrite(transKernel, 6) << "}" << std::endl;
2471 			clKernWrite(transKernel, 3) << "}" << std::endl;
2472 
2473 			clKernWrite(transKernel, 3) << "else{" << std::endl;
2474 			clKernWrite(transKernel, 6) << "for (size_t loop = 0; loop<" << reShapeFactor*reShapeFactor << "; ++loop){" << std::endl;
2475 			clKernWrite(transKernel, 9) << "index = lidy*" << 16 * reShapeFactor << " + lidx + loop*256;" << std::endl;
2476 
2477 
2478 			// Handle planar and interleaved right here
2479 			switch (params.fft_inputLayout)
2480 			{
2481 			case CLFFT_COMPLEX_INTERLEAVED:
2482 				clKernWrite(transKernel, 9) << "if ((idy + loop*" << 16 / reShapeFactor << ")<" << params.fft_N[0] << "&& idx<" << params.fft_N[0] << ")" << std::endl;
2483 				if (params.fft_hasPreCallback)
2484 				{
2485 					if (params.fft_preCallback.localMemSize > 0)
2486 					{
2487 						clKernWrite(transKernel, 12) << "tmpm = " << params.fft_preCallback.funcname << "(inputA, iOffset + (idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata, localmem);" << std::endl;
2488 						clKernWrite(transKernel, 9) << "if ((t_gy_p *" << 16 * reShapeFactor << " + lidx)<" << params.fft_N[0] << " && (t_gx_p * " << 16 * reShapeFactor << " + lidy + loop*" << 16 / reShapeFactor << ")<" << params.fft_N[0] << ") " << std::endl;
2489 						clKernWrite(transKernel, 12) << "tmpt = " << params.fft_preCallback.funcname << "(inputA, iOffset + (lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata, localmem);" << std::endl;
2490 					}
2491 					else
2492 					{
2493 						clKernWrite(transKernel, 12) << "tmpm = " << params.fft_preCallback.funcname << "(inputA, iOffset + (idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata);" << std::endl;
2494 						clKernWrite(transKernel, 9) << "if ((t_gy_p *" << 16 * reShapeFactor << " + lidx)<" << params.fft_N[0] << " && (t_gx_p * " << 16 * reShapeFactor << " + lidy + loop*" << 16 / reShapeFactor << ")<" << params.fft_N[0] << ") " << std::endl;
2495 						clKernWrite(transKernel, 12) << "tmpt = " << params.fft_preCallback.funcname << "(inputA, iOffset + (lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata);" << std::endl;
2496 					}
2497 				}
2498 				else
2499 				{
2500 					clKernWrite(transKernel, 12) << "tmpm = inputA[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx];" << std::endl;
2501 					clKernWrite(transKernel, 9) << "if ((t_gy_p *" << 16 * reShapeFactor << " + lidx)<" << params.fft_N[0] << " && (t_gx_p * " << 16 * reShapeFactor << " + lidy + loop*" << 16 / reShapeFactor << ")<" << params.fft_N[0] << ") " << std::endl;
2502 					clKernWrite(transKernel, 12) << "tmpt = inputA[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx];" << std::endl;
2503 				}
2504 				break;
2505 			case CLFFT_COMPLEX_PLANAR:
2506 				dtInput = dtPlanar;
2507 				dtOutput = dtPlanar;
2508 				clKernWrite(transKernel, 9) << "if ((idy + loop*" << 16 / reShapeFactor << ")<" << params.fft_N[0] << "&& idx<" << params.fft_N[0] << ") {" << std::endl;
2509 				if (params.fft_hasPreCallback)
2510 				{
2511 					if (params.fft_preCallback.localMemSize > 0)
2512 					{
2513 						clKernWrite(transKernel, 12) << "tmpm = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata, localmem); }" << std::endl;
2514 						clKernWrite(transKernel, 9) << "if ((t_gy_p *" << 16 * reShapeFactor << " + lidx)<" << params.fft_N[0] << " && (t_gx_p * " << 16 * reShapeFactor << " + lidy + loop*" << 16 / reShapeFactor << ")<" << params.fft_N[0] << ") {" << std::endl;
2515 						clKernWrite(transKernel, 12) << "tmpt = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata, localmem); }" << std::endl;
2516 					}
2517 					else
2518 					{
2519 						clKernWrite(transKernel, 12) << "tmpm = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata); }" << std::endl;
2520 						clKernWrite(transKernel, 9) << "if ((t_gy_p *" << 16 * reShapeFactor << " + lidx)<" << params.fft_N[0] << " && (t_gx_p * " << 16 * reShapeFactor << " + lidy + loop*" << 16 / reShapeFactor << ")<" << params.fft_N[0] << ") {" << std::endl;
2521 						clKernWrite(transKernel, 12) << "tmpt = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata); }" << std::endl;
2522 					}
2523 				}
2524 				else
2525 				{
2526 					clKernWrite(transKernel, 12) << "tmpm.x = inputA_R[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx];" << std::endl;
2527 					clKernWrite(transKernel, 12) << "tmpm.y = inputA_I[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx]; }" << std::endl;
2528 					clKernWrite(transKernel, 9) << "if ((t_gy_p *" << 16 * reShapeFactor << " + lidx)<" << params.fft_N[0] << " && (t_gx_p * " << 16 * reShapeFactor << " + lidy + loop*" << 16 / reShapeFactor << ")<" << params.fft_N[0] << ") {" << std::endl;
2529 					clKernWrite(transKernel, 12) << "tmpt.x = inputA_R[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx];" << std::endl;
2530 					clKernWrite(transKernel, 12) << "tmpt.y = inputA_I[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx]; }" << std::endl;
2531 				}
2532 				break;
2533 			case CLFFT_HERMITIAN_INTERLEAVED:
2534 			case CLFFT_HERMITIAN_PLANAR:
2535 				return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
2536 			case CLFFT_REAL:
2537 				break;
2538 			default:
2539 				return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
2540 			}
2541 
2542 
2543 			// If requested, generate the Twiddle math to multiply constant values
2544 			if (twiddleTransposeKernel)
2545 				genTwiddleMath(params, transKernel, dtComplex, fwd);
2546 
2547 			clKernWrite(transKernel, 9) << "xy_s[index] = tmpm;" << std::endl;
2548 			clKernWrite(transKernel, 9) << "yx_s[index] = tmpt;" << std::endl;
2549 
2550 			clKernWrite(transKernel, 9) << "}" << std::endl;
2551 			clKernWrite(transKernel, 3) << "}" << std::endl;
2552 
2553 			clKernWrite(transKernel, 3) << "" << std::endl;
2554 			clKernWrite(transKernel, 3) << "barrier(CLK_LOCAL_MEM_FENCE);" << std::endl;
2555 			clKernWrite(transKernel, 3) << "" << std::endl;
2556 
2557 			// Step2: Write from shared to global
2558 
2559 			clKernWrite(transKernel, 3) << "if (" << params.fft_N[0] << " - (t_gx_p + 1) *" << 16 * reShapeFactor << ">0){" << std::endl;
2560 			clKernWrite(transKernel, 6) << "for (size_t loop = 0; loop<" << reShapeFactor*reShapeFactor << "; ++loop){" << std::endl;
2561 			clKernWrite(transKernel, 9) << "index = lidx*" << 16 * reShapeFactor << " + lidy + " << 16 / reShapeFactor << "*loop ;" << std::endl;
2562 
2563 			// Handle planar and interleaved right here
2564 			switch (params.fft_outputLayout)
2565 			{
2566 			case CLFFT_COMPLEX_INTERLEAVED:
2567 				if (params.fft_hasPostCallback)
2568 				{
2569 					if (params.transposeMiniBatchSize < 2)//which means the matrix was not broken down into sub square matrics
2570 						clKernWrite(transKernel, 9) << params.fft_postCallback.funcname << "(outputA, ((idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx), post_userdata, yx_s[index]";
2571 					else
2572 						clKernWrite(transKernel, 9) << params.fft_postCallback.funcname << "(outputA - iOffset, iOffset + ((idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx), post_userdata, yx_s[index]";
2573 					if (params.fft_postCallback.localMemSize > 0)
2574 					{
2575 						clKernWrite(transKernel, 0) << ", localmem";
2576 					}
2577 					clKernWrite(transKernel, 0) << ");" << std::endl;
2578 					if (params.transposeMiniBatchSize < 2)//which means the matrix was not broken down into sub square matrics
2579 						clKernWrite(transKernel, 9) << params.fft_postCallback.funcname << "(outputA, ((lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx), post_userdata, xy_s[index]";
2580 					else
2581 						clKernWrite(transKernel, 9) << params.fft_postCallback.funcname << "(outputA - iOffset, iOffset + ((lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx), post_userdata, xy_s[index]";
2582 					if (params.fft_postCallback.localMemSize > 0)
2583 					{
2584 						clKernWrite(transKernel, 0) << ", localmem";
2585 					}
2586 					clKernWrite(transKernel, 0) << ");" << std::endl;
2587 				}
2588 				else
2589 				{
2590 					clKernWrite(transKernel, 9) << "outputA[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index];" << std::endl;
2591 					clKernWrite(transKernel, 9) << "outputA[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx] = xy_s[index]; " << std::endl;
2592 				}
2593 				break;
2594 			case CLFFT_COMPLEX_PLANAR:
2595 				if (params.fft_hasPostCallback)
2596 				{
2597 					if (params.transposeMiniBatchSize < 2)//which means the matrix was not broken down into sub square matrics
2598 						clKernWrite(transKernel, 9) << params.fft_postCallback.funcname << "(outputA_R, outputA_I, ((idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx), post_userdata, yx_s[index].x, yx_s[index].y";
2599 					else
2600 						clKernWrite(transKernel, 9) << params.fft_postCallback.funcname << "(outputA_R-iOffset, outputA_I-iOffset, iOffset+((idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx), post_userdata, yx_s[index].x, yx_s[index].y";
2601 					if (params.fft_postCallback.localMemSize > 0)
2602 					{
2603 						clKernWrite(transKernel, 0) << ", localmem";
2604 					}
2605 					clKernWrite(transKernel, 0) << ");" << std::endl;
2606 
2607 					if (params.transposeMiniBatchSize < 2)//which means the matrix was not broken down into sub square matrics
2608 						clKernWrite(transKernel, 9) << params.fft_postCallback.funcname << "(outputA_R, outputA_I, ((lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx), post_userdata, xy_s[index].x, xy_s[index].y";
2609 					else
2610 						clKernWrite(transKernel, 9) << params.fft_postCallback.funcname << "(outputA_R-iOffset, outputA_I-iOffset, iOffset+((lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx), post_userdata, xy_s[index].x, xy_s[index].y";
2611 					if (params.fft_postCallback.localMemSize > 0)
2612 					{
2613 						clKernWrite(transKernel, 0) << ", localmem";
2614 					}
2615 					clKernWrite(transKernel, 0) << ");" << std::endl;
2616 				}
2617 				else
2618 				{
2619 					clKernWrite(transKernel, 9) << "outputA_R[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index].x;" << std::endl;
2620 					clKernWrite(transKernel, 9) << "outputA_I[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index].y;" << std::endl;
2621 					clKernWrite(transKernel, 9) << "outputA_R[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx] = xy_s[index].x; " << std::endl;
2622 					clKernWrite(transKernel, 9) << "outputA_I[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx] = xy_s[index].y; " << std::endl;
2623 				}
2624 				break;
2625 			case CLFFT_HERMITIAN_INTERLEAVED:
2626 			case CLFFT_HERMITIAN_PLANAR:
2627 				return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
2628 			case CLFFT_REAL:
2629 				break;
2630 			default:
2631 				return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
2632 			}
2633 
2634 
2635 			clKernWrite(transKernel, 6) << "}" << std::endl;
2636 			clKernWrite(transKernel, 3) << "}" << std::endl;
2637 
2638 			clKernWrite(transKernel, 3) << "else{" << std::endl;
2639 			clKernWrite(transKernel, 6) << "for (size_t loop = 0; loop<" << reShapeFactor*reShapeFactor << "; ++loop){" << std::endl;
2640 
2641 			clKernWrite(transKernel, 9) << "index = lidx*" << 16 * reShapeFactor << " + lidy + " << 16 / reShapeFactor << "*loop;" << std::endl;
2642 
2643 			// Handle planar and interleaved right here
2644 			switch (params.fft_outputLayout)
2645 			{
2646 			case CLFFT_COMPLEX_INTERLEAVED:
2647 				clKernWrite(transKernel, 9) << "if ((idy + loop*" << 16 / reShapeFactor << ")<" << params.fft_N[0] << " && idx<" << params.fft_N[0] << ")" << std::endl;
2648 				if (params.fft_hasPostCallback)
2649 				{
2650 					if (params.transposeMiniBatchSize < 2)//which means the matrix was not broken down into sub square matrics
2651 						clKernWrite(transKernel, 12) << params.fft_postCallback.funcname << "(outputA, ((idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx), post_userdata, yx_s[index]";
2652 					else
2653 						clKernWrite(transKernel, 12) << params.fft_postCallback.funcname << "(outputA - iOffset, iOffset + ((idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx), post_userdata, yx_s[index]";
2654 					if (params.fft_postCallback.localMemSize > 0)
2655 					{
2656 						clKernWrite(transKernel, 0) << ", localmem";
2657 					}
2658 					clKernWrite(transKernel, 0) << ");" << std::endl;
2659 
2660 					clKernWrite(transKernel, 9) << "if ((t_gy_p * " << 16 * reShapeFactor << " + lidx)<" << params.fft_N[0] << " && (t_gx_p * " << 16 * reShapeFactor << " + lidy + loop*" << 16 / reShapeFactor << ")<" << params.fft_N[0] << ")" << std::endl;
2661 					if (params.transposeMiniBatchSize < 2)//which means the matrix was not broken down into sub square matrics
2662 						clKernWrite(transKernel, 12) << params.fft_postCallback.funcname << "(outputA, ((lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx), post_userdata, xy_s[index]";
2663 					else
2664 						clKernWrite(transKernel, 12) << params.fft_postCallback.funcname << "(outputA - iOffset, iOffset + ((lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx), post_userdata, xy_s[index]";
2665 
2666 					if (params.fft_postCallback.localMemSize > 0)
2667 					{
2668 						clKernWrite(transKernel, 0) << ", localmem";
2669 					}
2670 					clKernWrite(transKernel, 0) << ");" << std::endl;
2671 				}
2672 				else
2673 				{
2674 					clKernWrite(transKernel, 12) << "outputA[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index]; " << std::endl;
2675 					clKernWrite(transKernel, 9) << "if ((t_gy_p * " << 16 * reShapeFactor << " + lidx)<" << params.fft_N[0] << " && (t_gx_p * " << 16 * reShapeFactor << " + lidy + loop*" << 16 / reShapeFactor << ")<" << params.fft_N[0] << ")" << std::endl;
2676 					clKernWrite(transKernel, 12) << "outputA[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx] = xy_s[index];" << std::endl;
2677 				}
2678 				break;
2679 			case CLFFT_COMPLEX_PLANAR:
2680 				clKernWrite(transKernel, 9) << "if ((idy + loop*" << 16 / reShapeFactor << ")<" << params.fft_N[0] << " && idx<" << params.fft_N[0] << ") {" << std::endl;
2681 
2682 				if (params.fft_hasPostCallback)
2683 				{
2684 					if (params.transposeMiniBatchSize < 2)//which means the matrix was not broken down into sub square matrics
2685 					{
2686 						clKernWrite(transKernel, 12) << params.fft_postCallback.funcname << "(outputA_R, outputA_I, ((idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx), post_userdata, yx_s[index].x, yx_s[index].y";
2687 					}
2688 					else
2689 					{
2690 						clKernWrite(transKernel, 12) << params.fft_postCallback.funcname << "(outputA_R-iOffset, outputA_I-iOffset, iOffset+((idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx), post_userdata, yx_s[index].x, yx_s[index].y";
2691 					}
2692 					if (params.fft_postCallback.localMemSize > 0)
2693 					{
2694 						clKernWrite(transKernel, 0) << ", localmem";
2695 					}
2696 					clKernWrite(transKernel, 0) << "); }" << std::endl;
2697 
2698 					clKernWrite(transKernel, 9) << "if ((t_gy_p * " << 16 * reShapeFactor << " + lidx)<" << params.fft_N[0] << " && (t_gx_p * " << 16 * reShapeFactor << " + lidy + loop*" << 16 / reShapeFactor << ")<" << params.fft_N[0] << ") {" << std::endl;
2699 					if (params.transposeMiniBatchSize < 2)//which means the matrix was not broken down into sub square matrics
2700 					{
2701 						clKernWrite(transKernel, 12) << params.fft_postCallback.funcname << "(outputA_R, outputA_I, ((lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx), post_userdata, xy_s[index].x, xy_s[index].y";
2702 					}
2703 					else
2704 					{
2705 						clKernWrite(transKernel, 12) << params.fft_postCallback.funcname << "(outputA_R-iOffset, outputA_I-iOffset, iOffset+((lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx), post_userdata, xy_s[index].x, xy_s[index].y";
2706 					}
2707 					if (params.fft_postCallback.localMemSize > 0)
2708 					{
2709 						clKernWrite(transKernel, 0) << ", localmem";
2710 					}
2711 					clKernWrite(transKernel, 0) << "); }" << std::endl;
2712 				}
2713 				else
2714 				{
2715 					clKernWrite(transKernel, 12) << "outputA_R[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index].x; " << std::endl;
2716 					clKernWrite(transKernel, 12) << "outputA_I[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index].y; }" << std::endl;
2717 					clKernWrite(transKernel, 9) << "if ((t_gy_p * " << 16 * reShapeFactor << " + lidx)<" << params.fft_N[0] << " && (t_gx_p * " << 16 * reShapeFactor << " + lidy + loop*" << 16 / reShapeFactor << ")<" << params.fft_N[0] << ") {" << std::endl;
2718 					clKernWrite(transKernel, 12) << "outputA_R[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx] = xy_s[index].x;" << std::endl;
2719 					clKernWrite(transKernel, 12) << "outputA_I[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx] = xy_s[index].y; }" << std::endl;
2720 				}
2721 
2722 				break;
2723 			case CLFFT_HERMITIAN_INTERLEAVED:
2724 			case CLFFT_HERMITIAN_PLANAR:
2725 				return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
2726 			case CLFFT_REAL:
2727 				break;
2728 			default:
2729 				return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
2730 			}
2731 
2732 
2733 			clKernWrite(transKernel, 6) << "}" << std::endl; // end for
2734 			clKernWrite(transKernel, 3) << "}" << std::endl; // end else
2735 
2736 
2737 		}
2738 		clKernWrite(transKernel, 0) << "}" << std::endl;
2739 
2740 		strKernel = transKernel.str();
2741 
2742 
2743 		if (!twiddleTransposeKernel)
2744 			break; // break for bothDir
2745 	}
2746 
2747 	return CLFFT_SUCCESS;
2748 }
2749 
2750 //generate transpose kernel with square 2d matrix of row major with blocks along the leading dimension
2751 //aka leading dimension batched
2752 /*
2753 Below is a matrix(row major) contaning three square sub matrix along row
2754 [M0 M2 M2]
2755 */
genTransposeKernelLeadingDimensionBatched(const FFTGeneratedTransposeNonSquareAction::Signature & params,std::string & strKernel,const size_t & lwSize,const size_t reShapeFactor)2756 clfftStatus genTransposeKernelLeadingDimensionBatched(const FFTGeneratedTransposeNonSquareAction::Signature & params, std::string& strKernel, const size_t& lwSize, const size_t reShapeFactor)
2757 {
2758 	strKernel.reserve(4096);
2759 	std::stringstream transKernel(std::stringstream::out);
2760 
2761 	// These strings represent the various data types we read or write in the kernel, depending on how the plan
2762 	// is configured
2763 	std::string dtInput;        // The type read as input into kernel
2764 	std::string dtOutput;       // The type written as output from kernel
2765 	std::string dtPlanar;       // Fundamental type for planar arrays
2766 	std::string dtComplex;      // Fundamental type for complex arrays
2767 
2768 								// NOTE:  Enable only for debug
2769 								// clKernWrite( transKernel, 0 ) << "#pragma OPENCL EXTENSION cl_amd_printf : enable\n" << std::endl;
2770 
2771 								//if (params.fft_inputLayout != params.fft_outputLayout)
2772 								//	return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
2773 
2774 	switch (params.fft_precision)
2775 	{
2776 	case CLFFT_SINGLE:
2777 	case CLFFT_SINGLE_FAST:
2778 		dtPlanar = "float";
2779 		dtComplex = "float2";
2780 		break;
2781 	case CLFFT_DOUBLE:
2782 	case CLFFT_DOUBLE_FAST:
2783 		dtPlanar = "double";
2784 		dtComplex = "double2";
2785 
2786 		// Emit code that enables double precision in the kernel
2787 		clKernWrite(transKernel, 0) << "#ifdef cl_khr_fp64" << std::endl;
2788 		clKernWrite(transKernel, 3) << "#pragma OPENCL EXTENSION cl_khr_fp64 : enable" << std::endl;
2789 		clKernWrite(transKernel, 0) << "#else" << std::endl;
2790 		clKernWrite(transKernel, 3) << "#pragma OPENCL EXTENSION cl_amd_fp64 : enable" << std::endl;
2791 		clKernWrite(transKernel, 0) << "#endif\n" << std::endl;
2792 
2793 		break;
2794 	default:
2795 		return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
2796 		break;
2797 	}
2798 
2799 
2800 	//	If twiddle computation has been requested, generate the lookup function
2801 	if (params.fft_3StepTwiddle)
2802 	{
2803 		std::string str;
2804 		StockhamGenerator::TwiddleTableLarge twLarge(params.fft_N[0] * params.fft_N[1]);
2805 		if ((params.fft_precision == CLFFT_SINGLE) || (params.fft_precision == CLFFT_SINGLE_FAST))
2806 			twLarge.GenerateTwiddleTable<StockhamGenerator::P_SINGLE>(str);
2807 		else
2808 			twLarge.GenerateTwiddleTable<StockhamGenerator::P_DOUBLE>(str);
2809 		clKernWrite(transKernel, 0) << str << std::endl;
2810 		clKernWrite(transKernel, 0) << std::endl;
2811 	}
2812 
2813     size_t smaller_dim = (params.fft_N[0] < params.fft_N[1]) ? params.fft_N[0] : params.fft_N[1];
2814     size_t bigger_dim = (params.fft_N[0] >= params.fft_N[1]) ? params.fft_N[0] : params.fft_N[1];
2815     size_t dim_ratio = bigger_dim / smaller_dim;
2816 
2817 	// This detects whether the input matrix is rectangle of ratio 1:2
2818 
2819     if ((params.fft_N[0] != 2 * params.fft_N[1]) && (params.fft_N[1] != 2 * params.fft_N[0]) &&
2820         (params.fft_N[0] != 3 * params.fft_N[1]) && (params.fft_N[1] != 3 * params.fft_N[0]) &&
2821         (params.fft_N[0] != 5 * params.fft_N[1]) && (params.fft_N[1] != 5 * params.fft_N[0]) &&
2822         (params.fft_N[0] != 10 * params.fft_N[1]) && (params.fft_N[1] != 10 * params.fft_N[0]))
2823     {
2824         return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
2825     }
2826 
2827 	if (params.fft_placeness == CLFFT_OUTOFPLACE)
2828 	{
2829 		return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
2830 	}
2831 
2832 	// This detects whether the input matrix is a multiple of 16*reshapefactor or not
2833 
2834 	bool mult_of_16 = (smaller_dim % (reShapeFactor * 16) == 0) ? true : false;
2835 
2836 	for (size_t bothDir = 0; bothDir < 2; bothDir++)
2837 	{
2838 		bool fwd = bothDir ? false : true;
2839 
2840 		//If pre-callback is set for the plan
2841 		if (params.fft_hasPreCallback)
2842 		{
2843 			//Insert callback function code at the beginning
2844 			clKernWrite(transKernel, 0) << params.fft_preCallback.funcstring << std::endl;
2845 			clKernWrite(transKernel, 0) << std::endl;
2846 		}
2847 		//If post-callback is set for the plan
2848 		if (params.fft_hasPostCallback)
2849 		{
2850 			//Insert callback function code at the beginning
2851 			clKernWrite(transKernel, 0) << params.fft_postCallback.funcstring << std::endl;
2852 			clKernWrite(transKernel, 0) << std::endl;
2853 		}
2854 
2855 		std::string funcName;
2856 		if (params.fft_3StepTwiddle) // TODO
2857 			funcName = fwd ? "transpose_nonsquare_tw_fwd" : "transpose_nonsquare_tw_back";
2858 		else
2859 			funcName = "transpose_nonsquare";
2860 
2861 
2862 		// Generate kernel API
2863 		genTransposePrototypeLeadingDimensionBatched(params, lwSize, dtPlanar, dtComplex, funcName, transKernel, dtInput, dtOutput);
2864 
2865 		if (mult_of_16)//number of WG per sub square block
2866 			clKernWrite(transKernel, 3) << "const size_t  numGroups_square_matrix_Y_1 = " << (smaller_dim / 16 / reShapeFactor)*(smaller_dim / 16 / reShapeFactor + 1) / 2 << ";" << std::endl;
2867 		else
2868 			clKernWrite(transKernel, 3) << "const size_t  numGroups_square_matrix_Y_1 = " << (smaller_dim / (16 * reShapeFactor) + 1)*(smaller_dim / (16 * reShapeFactor) + 1 + 1) / 2 << ";" << std::endl;
2869 
2870 		clKernWrite(transKernel, 3) << "const size_t  numGroupsY_1 =  numGroups_square_matrix_Y_1 * "<< dim_ratio <<";" << std::endl;
2871 
2872 		for (size_t  i = 2; i < params.fft_DataDim - 1; i++)
2873 		{
2874 			clKernWrite(transKernel, 3) << "const size_t numGroupsY_" << i << " = numGroupsY_" << i - 1 << " * " << params.fft_N[i] << ";" << std::endl;
2875 		}
2876 
2877 		clKernWrite(transKernel, 3) << "size_t g_index;" << std::endl;
2878 		clKernWrite(transKernel, 3) << "size_t square_matrix_index;" << std::endl;
2879 		clKernWrite(transKernel, 3) << "size_t square_matrix_offset;" << std::endl;
2880 		clKernWrite(transKernel, 3) << std::endl;
2881 
2882 		OffsetCalcLeadingDimensionBatched(transKernel, params);
2883 
2884 		clKernWrite(transKernel, 3) << "square_matrix_index = (g_index / numGroups_square_matrix_Y_1) ;" << std::endl;
2885 		clKernWrite(transKernel, 3) << "g_index = g_index % numGroups_square_matrix_Y_1" << ";" << std::endl;
2886 		clKernWrite(transKernel, 3) << std::endl;
2887 
2888 		if (smaller_dim == params.fft_N[1])
2889 		{
2890 			clKernWrite(transKernel, 3) << "square_matrix_offset = square_matrix_index * " << smaller_dim << ";" << std::endl;
2891 		}
2892 		else
2893 		{
2894 			clKernWrite(transKernel, 3) << "square_matrix_offset = square_matrix_index *" << smaller_dim * smaller_dim << ";" << std::endl;
2895 		}
2896 
2897 		clKernWrite(transKernel, 3) << "iOffset += square_matrix_offset ;" << std::endl;
2898 
2899 		// Handle planar and interleaved right here
2900 		switch (params.fft_inputLayout)
2901 		{
2902 		case CLFFT_COMPLEX_INTERLEAVED:
2903 		case CLFFT_REAL:
2904 			//Do not advance offset when precallback is set as the starting address of global buffer is needed
2905 			if (!params.fft_hasPreCallback)
2906 			{
2907 				clKernWrite(transKernel, 3) << "inputA += iOffset;" << std::endl;  // Set A ptr to the start of each slice
2908 			}
2909 			break;
2910 		case CLFFT_COMPLEX_PLANAR:
2911 			//Do not advance offset when precallback is set as the starting address of global buffer is needed
2912 			if (!params.fft_hasPreCallback)
2913 			{
2914 				clKernWrite(transKernel, 3) << "inputA_R += iOffset;" << std::endl;  // Set A ptr to the start of each slice
2915 				clKernWrite(transKernel, 3) << "inputA_I += iOffset;" << std::endl;  // Set A ptr to the start of each slice
2916 			}
2917 			break;
2918 		case CLFFT_HERMITIAN_INTERLEAVED:
2919 		case CLFFT_HERMITIAN_PLANAR:
2920 			return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
2921 		default:
2922 			return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
2923 		}
2924 
2925 		switch (params.fft_inputLayout)
2926 		{
2927 		case CLFFT_COMPLEX_INTERLEAVED:
2928 		case CLFFT_REAL:
2929 			if (params.fft_hasPreCallback)
2930 			{
2931 				clKernWrite(transKernel, 3) << "global " << dtInput << " *outputA = inputA + iOffset;" << std::endl;
2932 			}
2933 			else
2934 			{
2935 				clKernWrite(transKernel, 3) << "global " << dtInput << " *outputA = inputA;" << std::endl;
2936 			}
2937 			break;
2938 		case CLFFT_COMPLEX_PLANAR:
2939 			if (params.fft_hasPreCallback)
2940 			{
2941 				clKernWrite(transKernel, 3) << "global " << dtInput << " *outputA_R = inputA_R + iOffset;" << std::endl;
2942 				clKernWrite(transKernel, 3) << "global " << dtInput << " *outputA_I = inputA_I + iOffset;" << std::endl;
2943 			}
2944 			else
2945 			{
2946 				clKernWrite(transKernel, 3) << "global " << dtInput << " *outputA_R = inputA_R;" << std::endl;
2947 				clKernWrite(transKernel, 3) << "global " << dtInput << " *outputA_I = inputA_I;" << std::endl;
2948 			}
2949 			break;
2950 		case CLFFT_HERMITIAN_INTERLEAVED:
2951 		case CLFFT_HERMITIAN_PLANAR:
2952 			return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
2953 		default:
2954 			return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
2955 		}
2956 
2957 
2958 		clKernWrite(transKernel, 3) << std::endl;
2959 
2960 		// Now compute the corresponding y,x coordinates
2961 		// for a triangular indexing
2962 		if (mult_of_16)
2963 			clKernWrite(transKernel, 3) << "float row = (" << -2.0f*smaller_dim / 16 / reShapeFactor - 1 << "+sqrt((" << 4.0f*smaller_dim / 16 / reShapeFactor*(smaller_dim / 16 / reShapeFactor + 1) << "-8.0f*g_index- 7)))/ (-2.0f);" << std::endl;
2964 		else
2965 			clKernWrite(transKernel, 3) << "float row = (" << -2.0f*(smaller_dim / (16 * reShapeFactor) + 1) - 1 << "+sqrt((" << 4.0f*(smaller_dim / (16 * reShapeFactor) + 1)*(smaller_dim / (16 * reShapeFactor) + 1 + 1) << "-8.0f*g_index- 7)))/ (-2.0f);" << std::endl;
2966 
2967 
2968 		clKernWrite(transKernel, 3) << "if (row == (float)(int)row) row -= 1; " << std::endl;
2969 		clKernWrite(transKernel, 3) << "const size_t t_gy = (int)row;" << std::endl;
2970 
2971 		clKernWrite(transKernel, 3) << "" << std::endl;
2972 
2973 		if (mult_of_16)
2974 			clKernWrite(transKernel, 3) << "const long t_gx_p = g_index - " << (smaller_dim / 16 / reShapeFactor) << "*t_gy + t_gy*(t_gy + 1) / 2;" << std::endl;
2975 		else
2976 			clKernWrite(transKernel, 3) << "const long t_gx_p = g_index - " << (smaller_dim / (16 * reShapeFactor) + 1) << "*t_gy + t_gy*(t_gy + 1) / 2;" << std::endl;
2977 
2978 		clKernWrite(transKernel, 3) << "const long t_gy_p = t_gx_p - t_gy;" << std::endl;
2979 
2980 
2981 		clKernWrite(transKernel, 3) << "" << std::endl;
2982 
2983 		clKernWrite(transKernel, 3) << "const size_t d_lidx = get_local_id(0) % 16;" << std::endl;
2984 		clKernWrite(transKernel, 3) << "const size_t d_lidy = get_local_id(0) / 16;" << std::endl;
2985 
2986 		clKernWrite(transKernel, 3) << "" << std::endl;
2987 
2988 		clKernWrite(transKernel, 3) << "const size_t lidy = (d_lidy * 16 + d_lidx) /" << (16 * reShapeFactor) << ";" << std::endl;
2989 		clKernWrite(transKernel, 3) << "const size_t lidx = (d_lidy * 16 + d_lidx) %" << (16 * reShapeFactor) << ";" << std::endl;
2990 
2991 		clKernWrite(transKernel, 3) << "" << std::endl;
2992 
2993 		clKernWrite(transKernel, 3) << "const size_t idx = lidx + t_gx_p*" << 16 * reShapeFactor << ";" << std::endl;
2994 		clKernWrite(transKernel, 3) << "const size_t idy = lidy + t_gy_p*" << 16 * reShapeFactor << ";" << std::endl;
2995 
2996 		clKernWrite(transKernel, 3) << "" << std::endl;
2997 
2998 		clKernWrite(transKernel, 3) << "const size_t starting_index_yx = t_gy_p*" << 16 * reShapeFactor << " + t_gx_p*" << 16 * reShapeFactor*params.fft_N[0] << ";" << std::endl;
2999 
3000 		clKernWrite(transKernel, 3) << "" << std::endl;
3001 
3002 		switch (params.fft_inputLayout)
3003 		{
3004 		case CLFFT_REAL:
3005 		case CLFFT_COMPLEX_INTERLEAVED:
3006 			clKernWrite(transKernel, 3) << "__local " << dtInput << " xy_s[" << 16 * reShapeFactor * 16 * reShapeFactor << "];" << std::endl;
3007 			clKernWrite(transKernel, 3) << "__local " << dtInput << " yx_s[" << 16 * reShapeFactor * 16 * reShapeFactor << "];" << std::endl;
3008 
3009 			clKernWrite(transKernel, 3) << dtInput << " tmpm, tmpt;" << std::endl;
3010 			break;
3011 		case CLFFT_COMPLEX_PLANAR:
3012 			clKernWrite(transKernel, 3) << "__local " << dtComplex << " xy_s[" << 16 * reShapeFactor * 16 * reShapeFactor << "];" << std::endl;
3013 			clKernWrite(transKernel, 3) << "__local " << dtComplex << " yx_s[" << 16 * reShapeFactor * 16 * reShapeFactor << "];" << std::endl;
3014 
3015 			clKernWrite(transKernel, 3) << dtComplex << " tmpm, tmpt;" << std::endl;
3016 			break;
3017 		default:
3018 			return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
3019 		}
3020 		clKernWrite(transKernel, 3) << "" << std::endl;
3021 
3022 		// Step 1: Load both blocks into local memory
3023 		// Here I load inputA for both blocks contiguously and write it contigously into
3024 		// the corresponding shared memories.
3025 		// Afterwards I use non-contiguous access from local memory and write contiguously
3026 		// back into the arrays
3027 
3028 		if (mult_of_16) {
3029 			clKernWrite(transKernel, 3) << "size_t index;" << std::endl;
3030 			clKernWrite(transKernel, 3) << "for (size_t loop = 0; loop<" << reShapeFactor*reShapeFactor << "; ++loop){" << std::endl;
3031 			clKernWrite(transKernel, 6) << "index = lidy*" << 16 * reShapeFactor << " + lidx + loop*256;" << std::endl;
3032 
3033 			// Handle planar and interleaved right here
3034 			switch (params.fft_inputLayout)
3035 			{
3036 			case CLFFT_COMPLEX_INTERLEAVED:
3037 			case CLFFT_REAL:
3038 			{
3039 				if (params.fft_hasPreCallback)
3040 				{
3041 					if (params.fft_preCallback.localMemSize > 0)
3042 					{
3043 						clKernWrite(transKernel, 6) << "tmpm = " << params.fft_preCallback.funcname << "(inputA, iOffset + (idy + loop * " << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata, localmem);" << std::endl;
3044 						clKernWrite(transKernel, 6) << "tmpt = " << params.fft_preCallback.funcname << "(inputA, iOffset + (lidy + loop * " << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata, localmem);" << std::endl;
3045 					}
3046 					else
3047 					{
3048 						clKernWrite(transKernel, 6) << "tmpm = " << params.fft_preCallback.funcname << "(inputA, iOffset + (idy + loop * " << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata);" << std::endl;
3049 						clKernWrite(transKernel, 6) << "tmpt = " << params.fft_preCallback.funcname << "(inputA, iOffset + (lidy + loop * " << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata);" << std::endl;
3050 					}
3051 				}
3052 				else
3053 				{
3054 					clKernWrite(transKernel, 6) << "tmpm = inputA[(idy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx];" << std::endl;
3055 					clKernWrite(transKernel, 6) << "tmpt = inputA[(lidy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx];" << std::endl;
3056 				}
3057 			}
3058 			break;
3059 			case CLFFT_COMPLEX_PLANAR:
3060 				dtInput = dtPlanar;
3061 				dtOutput = dtPlanar;
3062 				if (params.fft_hasPreCallback)
3063 				{
3064 					if (params.fft_preCallback.localMemSize > 0)
3065 					{
3066 						clKernWrite(transKernel, 6) << "tmpm = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (idy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata, localmem);" << std::endl;
3067 						clKernWrite(transKernel, 6) << "tmpt = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (lidy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata, localmem);" << std::endl;
3068 					}
3069 					else
3070 					{
3071 						clKernWrite(transKernel, 6) << "tmpm = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (idy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata);" << std::endl;
3072 						clKernWrite(transKernel, 6) << "tmpt = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (lidy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata);" << std::endl;
3073 					}
3074 				}
3075 				else
3076 				{
3077 					clKernWrite(transKernel, 6) << "tmpm.x = inputA_R[(idy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx];" << std::endl;
3078 					clKernWrite(transKernel, 6) << "tmpm.y = inputA_I[(idy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx];" << std::endl;
3079 
3080 					clKernWrite(transKernel, 6) << "tmpt.x = inputA_R[(lidy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx];" << std::endl;
3081 					clKernWrite(transKernel, 6) << "tmpt.y = inputA_I[(lidy + loop *" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx];" << std::endl;
3082 				}
3083 				break;
3084 			case CLFFT_HERMITIAN_INTERLEAVED:
3085 			case CLFFT_HERMITIAN_PLANAR:
3086 				return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
3087 			default:
3088 				return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
3089 			}
3090 
3091 			// If requested, generate the Twiddle math to multiply constant values
3092 			if (params.fft_3StepTwiddle)
3093 				genTwiddleMathLeadingDimensionBatched(params, transKernel, dtComplex, fwd);
3094 
3095 			clKernWrite(transKernel, 6) << "xy_s[index] = tmpm; " << std::endl;
3096 			clKernWrite(transKernel, 6) << "yx_s[index] = tmpt; " << std::endl;
3097 
3098 			clKernWrite(transKernel, 3) << "}" << std::endl;
3099 
3100 			clKernWrite(transKernel, 3) << "" << std::endl;
3101 
3102 			clKernWrite(transKernel, 3) << "barrier(CLK_LOCAL_MEM_FENCE);" << std::endl;
3103 
3104 			clKernWrite(transKernel, 3) << "" << std::endl;
3105 
3106 
3107 			// Step2: Write from shared to global
3108 			clKernWrite(transKernel, 3) << "for (size_t loop = 0; loop<" << reShapeFactor*reShapeFactor << "; ++loop){" << std::endl;
3109 			clKernWrite(transKernel, 6) << "index = lidx*" << 16 * reShapeFactor << " + lidy + " << 16 / reShapeFactor << "*loop;" << std::endl;
3110 
3111 
3112 			// Handle planar and interleaved right here
3113 			switch (params.fft_outputLayout)
3114 			{
3115 			case CLFFT_COMPLEX_INTERLEAVED:
3116 				if (params.fft_hasPostCallback)
3117 				{
3118 					clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(outputA, ((idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx), post_userdata, yx_s[index]";
3119 					if (params.fft_postCallback.localMemSize > 0)
3120 					{
3121 						clKernWrite(transKernel, 0) << ", localmem";
3122 					}
3123 					clKernWrite(transKernel, 0) << ");" << std::endl;
3124 
3125 					clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(outputA, ((lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx+ starting_index_yx), post_userdata, xy_s[index]";
3126 					if (params.fft_postCallback.localMemSize > 0)
3127 					{
3128 						clKernWrite(transKernel, 0) << ", localmem";
3129 					}
3130 					clKernWrite(transKernel, 0) << ");" << std::endl;
3131 				}
3132 			    else
3133 			    {
3134 				    clKernWrite(transKernel, 6) << "outputA[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index];" << std::endl;
3135 				    clKernWrite(transKernel, 6) << "outputA[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx+ starting_index_yx] = xy_s[index];" << std::endl;
3136 			    }
3137 
3138 				break;
3139 			case CLFFT_COMPLEX_PLANAR:
3140 				if (params.fft_hasPostCallback)
3141 				{
3142 					clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(outputA_R, outputA_I, ((idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx), post_userdata, yx_s[index].x, yx_s[index].y";
3143 					if (params.fft_postCallback.localMemSize > 0)
3144 					{
3145 						clKernWrite(transKernel, 0) << ", localmem";
3146 					}
3147 					clKernWrite(transKernel, 0) << ");" << std::endl;
3148 
3149 					clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(outputA_R, outputA_I, ((lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx+ starting_index_yx), post_userdata, xy_s[index].x, xy_s[index].y";
3150 					if (params.fft_postCallback.localMemSize > 0)
3151 					{
3152 						clKernWrite(transKernel, 0) << ", localmem";
3153 					}
3154 					clKernWrite(transKernel, 0) << ");" << std::endl;
3155 				}
3156 				else
3157 				{
3158 					clKernWrite(transKernel, 6) << "outputA_R[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index].x;" << std::endl;
3159 					clKernWrite(transKernel, 6) << "outputA_I[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index].y;" << std::endl;
3160 
3161 					clKernWrite(transKernel, 6) << "outputA_R[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx+ starting_index_yx] = xy_s[index].x;" << std::endl;
3162 					clKernWrite(transKernel, 6) << "outputA_I[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx+ starting_index_yx] = xy_s[index].y;" << std::endl;
3163 				}
3164 				break;
3165 			case CLFFT_HERMITIAN_INTERLEAVED:
3166 			case CLFFT_HERMITIAN_PLANAR:
3167 				return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
3168 			case CLFFT_REAL:
3169 				break;
3170 			default:
3171 				return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
3172 			}
3173 
3174 
3175 
3176 			clKernWrite(transKernel, 3) << "}" << std::endl;
3177 
3178 		}
3179 		else {
3180 
3181 			clKernWrite(transKernel, 3) << "size_t index;" << std::endl;
3182 			clKernWrite(transKernel, 3) << "if (" << smaller_dim << " - (t_gx_p + 1) *" << 16 * reShapeFactor << ">0){" << std::endl;
3183 			clKernWrite(transKernel, 6) << "for (size_t loop = 0; loop<" << reShapeFactor*reShapeFactor << "; ++loop){" << std::endl;
3184 			clKernWrite(transKernel, 9) << "index = lidy*" << 16 * reShapeFactor << " + lidx + loop*256;" << std::endl;
3185 
3186 			// Handle planar and interleaved right here
3187 			switch (params.fft_inputLayout)
3188 			{
3189 			case CLFFT_COMPLEX_INTERLEAVED:
3190 			case CLFFT_REAL:
3191 				if (params.fft_hasPreCallback)
3192 				{
3193 					if (params.fft_preCallback.localMemSize > 0)
3194 					{
3195 						clKernWrite(transKernel, 9) << "tmpm = " << params.fft_preCallback.funcname << "(inputA, iOffset + (idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata, localmem);" << std::endl;
3196 						clKernWrite(transKernel, 9) << "tmpt = " << params.fft_preCallback.funcname << "(inputA, iOffset + (lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata, localmem);" << std::endl;
3197 					}
3198 					else
3199 					{
3200 						clKernWrite(transKernel, 9) << "tmpm = " << params.fft_preCallback.funcname << "(inputA, iOffset + (idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata);" << std::endl;
3201 						clKernWrite(transKernel, 9) << "tmpt = " << params.fft_preCallback.funcname << "(inputA, iOffset + (lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata);" << std::endl;
3202 					}
3203 				}
3204 				else
3205 				{
3206 					clKernWrite(transKernel, 9) << "tmpm = inputA[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx];" << std::endl;
3207 					clKernWrite(transKernel, 9) << "tmpt = inputA[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx];" << std::endl;
3208 				}
3209 				break;
3210 			case CLFFT_COMPLEX_PLANAR:
3211 				dtInput = dtPlanar;
3212 				dtOutput = dtPlanar;
3213 				if (params.fft_hasPreCallback)
3214 				{
3215 					if (params.fft_preCallback.localMemSize > 0)
3216 					{
3217 						clKernWrite(transKernel, 9) << "tmpm = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata, localmem);" << std::endl;
3218 						clKernWrite(transKernel, 9) << "tmpt = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata, localmem);" << std::endl;
3219 					}
3220 					else
3221 					{
3222 						clKernWrite(transKernel, 9) << "tmpm = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata);" << std::endl;
3223 						clKernWrite(transKernel, 9) << "tmpt = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata);" << std::endl;
3224 					}
3225 				}
3226 				else
3227 				{
3228 					clKernWrite(transKernel, 9) << "tmpm.x = inputA_R[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx];" << std::endl;
3229 					clKernWrite(transKernel, 9) << "tmpm.y = inputA_I[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx];" << std::endl;
3230 
3231 					clKernWrite(transKernel, 9) << "tmpt.x = inputA_R[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx];" << std::endl;
3232 					clKernWrite(transKernel, 9) << "tmpt.y = inputA_I[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx];" << std::endl;
3233 				}
3234 				break;
3235 			case CLFFT_HERMITIAN_INTERLEAVED:
3236 			case CLFFT_HERMITIAN_PLANAR:
3237 				return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
3238 			default:
3239 				return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
3240 			}
3241 
3242 			// If requested, generate the Twiddle math to multiply constant values
3243 			if (params.fft_3StepTwiddle)
3244 				genTwiddleMathLeadingDimensionBatched(params, transKernel, dtComplex, fwd);
3245 
3246 			clKernWrite(transKernel, 9) << "xy_s[index] = tmpm;" << std::endl;
3247 			clKernWrite(transKernel, 9) << "yx_s[index] = tmpt;" << std::endl;
3248 			clKernWrite(transKernel, 6) << "}" << std::endl;
3249 			clKernWrite(transKernel, 3) << "}" << std::endl;
3250 
3251 			clKernWrite(transKernel, 3) << "else{" << std::endl;
3252 			clKernWrite(transKernel, 6) << "for (size_t loop = 0; loop<" << reShapeFactor*reShapeFactor << "; ++loop){" << std::endl;
3253 			clKernWrite(transKernel, 9) << "index = lidy*" << 16 * reShapeFactor << " + lidx + loop*256;" << std::endl;
3254 
3255 
3256 			// Handle planar and interleaved right here
3257 			switch (params.fft_inputLayout)
3258 			{
3259 			case CLFFT_COMPLEX_INTERLEAVED:
3260 			case CLFFT_REAL:
3261 				clKernWrite(transKernel, 9) << "if ((idy + loop*" << 16 / reShapeFactor << ")<" << smaller_dim << "&& idx<" << smaller_dim << ")" << std::endl;
3262 				if (params.fft_hasPreCallback)
3263 				{
3264 					if (params.fft_preCallback.localMemSize > 0)
3265 					{
3266 						clKernWrite(transKernel, 12) << "tmpm = " << params.fft_preCallback.funcname << "(inputA, iOffset + (idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata, localmem);" << std::endl;
3267 						clKernWrite(transKernel, 9) << "if ((t_gy_p *" << 16 * reShapeFactor << " + lidx)<" << smaller_dim << " && (t_gx_p * " << 16 * reShapeFactor << " + lidy + loop*" << 16 / reShapeFactor << ")<" << smaller_dim << ") " << std::endl;
3268 						clKernWrite(transKernel, 12) << "tmpt = " << params.fft_preCallback.funcname << "(inputA, iOffset + (lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata, localmem);" << std::endl;
3269 					}
3270 					else
3271 					{
3272 						clKernWrite(transKernel, 12) << "tmpm = " << params.fft_preCallback.funcname << "(inputA, iOffset + (idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata);" << std::endl;
3273 						clKernWrite(transKernel, 9) << "if ((t_gy_p *" << 16 * reShapeFactor << " + lidx)<" << smaller_dim << " && (t_gx_p * " << 16 * reShapeFactor << " + lidy + loop*" << 16 / reShapeFactor << ")<" << smaller_dim << ") " << std::endl;
3274 						clKernWrite(transKernel, 12) << "tmpt = " << params.fft_preCallback.funcname << "(inputA, iOffset + (lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata);" << std::endl;
3275 					}
3276 				}
3277 				else
3278 				{
3279 					clKernWrite(transKernel, 12) << "tmpm = inputA[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx];" << std::endl;
3280 					clKernWrite(transKernel, 9) << "if ((t_gy_p *" << 16 * reShapeFactor << " + lidx)<" << smaller_dim << " && (t_gx_p * " << 16 * reShapeFactor << " + lidy + loop*" << 16 / reShapeFactor << ")<" << smaller_dim << ") " << std::endl;
3281 					clKernWrite(transKernel, 12) << "tmpt = inputA[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx];" << std::endl;
3282 				}
3283 				break;
3284 			case CLFFT_COMPLEX_PLANAR:
3285 				dtInput = dtPlanar;
3286 				dtOutput = dtPlanar;
3287 				clKernWrite(transKernel, 9) << "if ((idy + loop*" << 16 / reShapeFactor << ")<" << smaller_dim << "&& idx<" << smaller_dim << ") {" << std::endl;
3288 				if (params.fft_hasPreCallback)
3289 				{
3290 					if (params.fft_preCallback.localMemSize > 0)
3291 					{
3292 						clKernWrite(transKernel, 12) << "tmpm = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata, localmem); }" << std::endl;
3293 						clKernWrite(transKernel, 9) << "if ((t_gy_p *" << 16 * reShapeFactor << " + lidx)<" << smaller_dim << " && (t_gx_p * " << 16 * reShapeFactor << " + lidy + loop*" << 16 / reShapeFactor << ")<" << smaller_dim << ") {" << std::endl;
3294 						clKernWrite(transKernel, 12) << "tmpt = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata, localmem); }" << std::endl;
3295 					}
3296 					else
3297 					{
3298 						clKernWrite(transKernel, 12) << "tmpm = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx, pre_userdata); }" << std::endl;
3299 						clKernWrite(transKernel, 9) << "if ((t_gy_p *" << 16 * reShapeFactor << " + lidx)<" << smaller_dim << " && (t_gx_p * " << 16 * reShapeFactor << " + lidy + loop*" << 16 / reShapeFactor << ")<" << smaller_dim << ") {" << std::endl;
3300 						clKernWrite(transKernel, 12) << "tmpt = " << params.fft_preCallback.funcname << "(inputA_R, inputA_I, iOffset + (lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx, pre_userdata); }" << std::endl;
3301 					}
3302 				}
3303 				else
3304 				{
3305 					clKernWrite(transKernel, 12) << "tmpm.x = inputA_R[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx];" << std::endl;
3306 					clKernWrite(transKernel, 12) << "tmpm.y = inputA_I[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx]; }" << std::endl;
3307 					clKernWrite(transKernel, 9) << "if ((t_gy_p *" << 16 * reShapeFactor << " + lidx)<" << smaller_dim << " && (t_gx_p * " << 16 * reShapeFactor << " + lidy + loop*" << 16 / reShapeFactor << ")<" << smaller_dim << ") {" << std::endl;
3308 					clKernWrite(transKernel, 12) << "tmpt.x = inputA_R[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx];" << std::endl;
3309 					clKernWrite(transKernel, 12) << "tmpt.y = inputA_I[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx]; }" << std::endl;
3310 				}
3311 				break;
3312 			case CLFFT_HERMITIAN_INTERLEAVED:
3313 			case CLFFT_HERMITIAN_PLANAR:
3314 				return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
3315 			default:
3316 				return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
3317 			}
3318 
3319 
3320 			// If requested, generate the Twiddle math to multiply constant values
3321 			if (params.fft_3StepTwiddle)
3322 				genTwiddleMathLeadingDimensionBatched(params, transKernel, dtComplex, fwd);
3323 
3324 			clKernWrite(transKernel, 9) << "xy_s[index] = tmpm;" << std::endl;
3325 			clKernWrite(transKernel, 9) << "yx_s[index] = tmpt;" << std::endl;
3326 
3327 			clKernWrite(transKernel, 9) << "}" << std::endl;
3328 			clKernWrite(transKernel, 3) << "}" << std::endl;
3329 
3330 			clKernWrite(transKernel, 3) << "" << std::endl;
3331 			clKernWrite(transKernel, 3) << "barrier(CLK_LOCAL_MEM_FENCE);" << std::endl;
3332 			clKernWrite(transKernel, 3) << "" << std::endl;
3333 
3334 			// Step2: Write from shared to global
3335 
3336 			clKernWrite(transKernel, 3) << "if (" << smaller_dim << " - (t_gx_p + 1) *" << 16 * reShapeFactor << ">0){" << std::endl;
3337 			clKernWrite(transKernel, 6) << "for (size_t loop = 0; loop<" << reShapeFactor*reShapeFactor << "; ++loop){" << std::endl;
3338 			clKernWrite(transKernel, 9) << "index = lidx*" << 16 * reShapeFactor << " + lidy + " << 16 / reShapeFactor << "*loop ;" << std::endl;
3339 
3340 			// Handle planar and interleaved right here
3341 			switch (params.fft_outputLayout)
3342 			{
3343 			case CLFFT_COMPLEX_INTERLEAVED:
3344 				if (params.fft_hasPostCallback)
3345 				{
3346 					clKernWrite(transKernel, 9) << params.fft_postCallback.funcname << "(outputA, ((idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx), post_userdata, yx_s[index]";
3347 					if (params.fft_postCallback.localMemSize > 0)
3348 					{
3349 						clKernWrite(transKernel, 0) << ", localmem";
3350 					}
3351 					clKernWrite(transKernel, 0) << ");" << std::endl;
3352 
3353 					clKernWrite(transKernel, 9) << params.fft_postCallback.funcname << "(outputA, ((lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx), post_userdata, xy_s[index]";
3354 					if (params.fft_postCallback.localMemSize > 0)
3355 					{
3356 						clKernWrite(transKernel, 0) << ", localmem";
3357 					}
3358 					clKernWrite(transKernel, 0) << ");" << std::endl;
3359 				}
3360 				else
3361 				{
3362 					clKernWrite(transKernel, 9) << "outputA[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index];" << std::endl;
3363 					clKernWrite(transKernel, 9) << "outputA[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx] = xy_s[index]; " << std::endl;
3364 				}
3365 
3366 				break;
3367 			case CLFFT_COMPLEX_PLANAR:
3368 				if (params.fft_hasPostCallback)
3369 				{
3370 					clKernWrite(transKernel, 9) << params.fft_postCallback.funcname << "(outputA_R, outputA_I, ((idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx), post_userdata, yx_s[index].x, yx_s[index].y";
3371 					if (params.fft_postCallback.localMemSize > 0)
3372 					{
3373 						clKernWrite(transKernel, 0) << ", localmem";
3374 					}
3375 					clKernWrite(transKernel, 0) << ");" << std::endl;
3376 
3377 					clKernWrite(transKernel, 9) << params.fft_postCallback.funcname << "(outputA_R, outputA_I, ((lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx), post_userdata, xy_s[index].x, xy_s[index].y";
3378 					if (params.fft_postCallback.localMemSize > 0)
3379 					{
3380 						clKernWrite(transKernel, 0) << ", localmem";
3381 					}
3382 					clKernWrite(transKernel, 0) << ");" << std::endl;
3383 				}
3384 				else
3385 				{
3386 					clKernWrite(transKernel, 9) << "outputA_R[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index].x;" << std::endl;
3387 					clKernWrite(transKernel, 9) << "outputA_I[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index].y;" << std::endl;
3388 					clKernWrite(transKernel, 9) << "outputA_R[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx] = xy_s[index].x; " << std::endl;
3389 					clKernWrite(transKernel, 9) << "outputA_I[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx] = xy_s[index].y; " << std::endl;
3390 				}
3391 				break;
3392 			case CLFFT_HERMITIAN_INTERLEAVED:
3393 			case CLFFT_HERMITIAN_PLANAR:
3394 				return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
3395 			case CLFFT_REAL:
3396 				break;
3397 			default:
3398 				return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
3399 			}
3400 
3401 
3402 			clKernWrite(transKernel, 6) << "}" << std::endl;
3403 			clKernWrite(transKernel, 3) << "}" << std::endl;
3404 
3405 			clKernWrite(transKernel, 3) << "else{" << std::endl;
3406 			clKernWrite(transKernel, 6) << "for (size_t loop = 0; loop<" << reShapeFactor*reShapeFactor << "; ++loop){" << std::endl;
3407 
3408 			clKernWrite(transKernel, 9) << "index = lidx*" << 16 * reShapeFactor << " + lidy + " << 16 / reShapeFactor << "*loop;" << std::endl;
3409 
3410 			// Handle planar and interleaved right here
3411 			switch (params.fft_outputLayout)
3412 			{
3413 			case CLFFT_COMPLEX_INTERLEAVED:
3414 				clKernWrite(transKernel, 9) << "if ((idy + loop*" << 16 / reShapeFactor << ")<" << smaller_dim << " && idx<" << smaller_dim << ")" << std::endl;
3415 				if (params.fft_hasPostCallback)
3416 				{
3417 					clKernWrite(transKernel, 12) << params.fft_postCallback.funcname << "(outputA, ((idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx), post_userdata, yx_s[index]";
3418 					if (params.fft_postCallback.localMemSize > 0)
3419 					{
3420 						clKernWrite(transKernel, 0) << ", localmem";
3421 					}
3422 					clKernWrite(transKernel, 0) << ");" << std::endl;
3423 
3424 					clKernWrite(transKernel, 9) << "if ((t_gy_p * " << 16 * reShapeFactor << " + lidx)<" << smaller_dim << " && (t_gx_p * " << 16 * reShapeFactor << " + lidy + loop*" << 16 / reShapeFactor << ")<" << smaller_dim << ")" << std::endl;
3425 
3426 					clKernWrite(transKernel, 12) << params.fft_postCallback.funcname << "(outputA, ((lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx), post_userdata, xy_s[index]";
3427 					if (params.fft_postCallback.localMemSize > 0)
3428 					{
3429 						clKernWrite(transKernel, 0) << ", localmem";
3430 					}
3431 					clKernWrite(transKernel, 0) << ");" << std::endl;
3432 				}
3433 				else
3434 				{
3435 					clKernWrite(transKernel, 12) << "outputA[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index]; " << std::endl;
3436 					clKernWrite(transKernel, 9) << "if ((t_gy_p * " << 16 * reShapeFactor << " + lidx)<" << smaller_dim << " && (t_gx_p * " << 16 * reShapeFactor << " + lidy + loop*" << 16 / reShapeFactor << ")<" << smaller_dim << ")" << std::endl;
3437 					clKernWrite(transKernel, 12) << "outputA[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx] = xy_s[index];" << std::endl;
3438 				}
3439 				break;
3440 			case CLFFT_COMPLEX_PLANAR:
3441 				clKernWrite(transKernel, 9) << "if ((idy + loop*" << 16 / reShapeFactor << ")<" << smaller_dim << " && idx<" << smaller_dim << ") {" << std::endl;
3442 
3443 				if (params.fft_hasPostCallback)
3444 				{
3445 					clKernWrite(transKernel, 12) << params.fft_postCallback.funcname << "(outputA_R, outputA_I, ((idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx), post_userdata, yx_s[index].x, yx_s[index].y";
3446 					if (params.fft_postCallback.localMemSize > 0)
3447 					{
3448 						clKernWrite(transKernel, 0) << ", localmem";
3449 					}
3450 					clKernWrite(transKernel, 0) << "); }" << std::endl;
3451 
3452 					clKernWrite(transKernel, 9) << "if ((t_gy_p * " << 16 * reShapeFactor << " + lidx)<" << smaller_dim << " && (t_gx_p * " << 16 * reShapeFactor << " + lidy + loop*" << 16 / reShapeFactor << ")<" << smaller_dim << ") {" << std::endl;
3453 
3454 					clKernWrite(transKernel, 12) << params.fft_postCallback.funcname << "(outputA_R, outputA_I, ((lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx), post_userdata, xy_s[index].x, xy_s[index].y";
3455 					if (params.fft_postCallback.localMemSize > 0)
3456 					{
3457 						clKernWrite(transKernel, 0) << ", localmem";
3458 					}
3459 					clKernWrite(transKernel, 0) << "); }" << std::endl;
3460 				}
3461 				else
3462 				{
3463 					clKernWrite(transKernel, 12) << "outputA_R[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index].x; " << std::endl;
3464 					clKernWrite(transKernel, 12) << "outputA_I[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index].y; }" << std::endl;
3465 					clKernWrite(transKernel, 9) << "if ((t_gy_p * " << 16 * reShapeFactor << " + lidx)<" << smaller_dim << " && (t_gx_p * " << 16 * reShapeFactor << " + lidy + loop*" << 16 / reShapeFactor << ")<" << smaller_dim << ") {" << std::endl;
3466 					clKernWrite(transKernel, 12) << "outputA_R[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx] = xy_s[index].x;" << std::endl;
3467 					clKernWrite(transKernel, 12) << "outputA_I[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx] = xy_s[index].y; }" << std::endl;
3468 				}
3469 
3470 				break;
3471 			case CLFFT_HERMITIAN_INTERLEAVED:
3472 			case CLFFT_HERMITIAN_PLANAR:
3473 				return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
3474 			case CLFFT_REAL:
3475 				break;
3476 			default:
3477 				return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
3478 			}
3479 
3480 
3481 			clKernWrite(transKernel, 6) << "}" << std::endl; // end for
3482 			clKernWrite(transKernel, 3) << "}" << std::endl; // end else
3483 
3484 		}
3485 		clKernWrite(transKernel, 0) << "}" << std::endl;
3486 
3487 		strKernel = transKernel.str();
3488 
3489 		if (!params.fft_3StepTwiddle)
3490 			break;
3491 	}
3492 
3493 	return CLFFT_SUCCESS;
3494 }
3495 
3496 }// end of namespace clfft_transpose_generator
3497 
3498