1 /* ************************************************************************
2 * Copyright 2013 Advanced Micro Devices, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 * ************************************************************************/
16 
17 
18 // action.transpose.nonsquare.cpp provides the entry points of "baking"
19 // nonsquare inplace transpose kernels called in plan.cpp.
20 // the actual kernel string generation is provided by generator.transpose.cpp
21 
22 #include "stdafx.h"
23 
24 #include <math.h>
25 #include <iomanip>
26 #include "generator.transpose.h"
27 #include "action.transpose.h"
28 #include "generator.stockham.h"
29 
30 #include "action.h"
31 
FFTGeneratedTransposeNonSquareAction(clfftPlanHandle plHandle,FFTPlan * plan,cl_command_queue queue,clfftStatus & err)32 FFTGeneratedTransposeNonSquareAction::FFTGeneratedTransposeNonSquareAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err)
33     : FFTTransposeNonSquareAction(plHandle, plan, queue, err)
34 {
35     if (err != CLFFT_SUCCESS)
36     {
37         // FFTTransposeNonSquareAction() failed, exit
38         fprintf(stderr, "FFTTransposeNonSquareAction() failed!\n");
39         return;
40     }
41 
42     // Initialize the FFTAction::FFTKernelGenKeyParams member
43     err = this->initParams();
44 
45     if (err != CLFFT_SUCCESS)
46     {
47         fprintf(stderr, "FFTGeneratedTransposeNonSquareAction::initParams() failed!\n");
48         return;
49     }
50 
51     FFTRepo &fftRepo = FFTRepo::getInstance();
52 
53     err = this->generateKernel(fftRepo, queue);
54 
55     if (err != CLFFT_SUCCESS)
56     {
57         fprintf(stderr, "FFTGeneratedTransposeNonSquareAction::generateKernel failed\n");
58         return;
59     }
60 
61     err = compileKernels(queue, plHandle, plan);
62 
63     if (err != CLFFT_SUCCESS)
64     {
65         fprintf(stderr, "FFTGeneratedTransposeNonSquareAction::compileKernels failed\n");
66         return;
67     }
68 
69     err = CLFFT_SUCCESS;
70 }
71 
72 
buildForwardKernel()73 bool FFTGeneratedTransposeNonSquareAction::buildForwardKernel()
74 {
75     clfftLayout inputLayout = this->getSignatureData()->fft_inputLayout;
76     clfftLayout outputLayout = this->getSignatureData()->fft_outputLayout;
77 
78     bool r2c_transform = (inputLayout == CLFFT_REAL);
79     bool c2r_transform = (outputLayout == CLFFT_REAL);
80     bool real_transform = (r2c_transform || c2r_transform);
81 
82     return (!real_transform) || r2c_transform;
83 }
84 
buildBackwardKernel()85 bool FFTGeneratedTransposeNonSquareAction::buildBackwardKernel()
86 {
87     clfftLayout inputLayout = this->getSignatureData()->fft_inputLayout;
88     clfftLayout outputLayout = this->getSignatureData()->fft_outputLayout;
89 
90     bool r2c_transform = (inputLayout == CLFFT_REAL);
91     bool c2r_transform = (outputLayout == CLFFT_REAL);
92     bool real_transform = (r2c_transform || c2r_transform);
93 
94     return (!real_transform) || c2r_transform;
95 }
96 
// Names used for the strKernel parameters; each constant holds the
// identifier spelled exactly like the constant itself.
const std::string pmRealIn     = "pmRealIn";
const std::string pmImagIn     = "pmImagIn";
const std::string pmRealOut    = "pmRealOut";
const std::string pmImagOut    = "pmImagOut";
const std::string pmComplexIn  = "pmComplexIn";
const std::string pmComplexOut = "pmComplexOut";
104 
initParams()105 clfftStatus FFTGeneratedTransposeNonSquareAction::initParams()
106 {
107 
108     this->signature.fft_precision = this->plan->precision;
109     this->signature.fft_placeness = this->plan->placeness;
110     this->signature.fft_inputLayout = this->plan->inputLayout;
111     this->signature.fft_outputLayout = this->plan->outputLayout;
112     this->signature.fft_3StepTwiddle = false;
113     this->signature.nonSquareKernelType = this->plan->nonSquareKernelType;
114 
115     this->signature.fft_realSpecial = this->plan->realSpecial;
116 
117     this->signature.transOutHorizontal = this->plan->transOutHorizontal;	// using the twiddle front flag to specify horizontal write
118                                                                             // we do this so as to reuse flags in FFTKernelGenKeyParams
119                                                                             // and to avoid making a new one
120 
121     ARG_CHECK(this->plan->inStride.size() == this->plan->outStride.size());
122 
123     if (CLFFT_INPLACE == this->signature.fft_placeness)
124     {
125         //	If this is an in-place transform the
126         //	input and output layout
127         //	*MUST* be the same.
128         //
129         ARG_CHECK(this->signature.fft_inputLayout == this->signature.fft_outputLayout)
130 
131     /*        for (size_t u = this->plan->inStride.size(); u-- > 0; )
132             {
133                 ARG_CHECK(this->plan->inStride[u] == this->plan->outStride[u]);
134             }*/
135     }
136 
137     this->signature.fft_DataDim = this->plan->length.size() + 1;
138 
139     int i = 0;
140     for (i = 0; i < (this->signature.fft_DataDim - 1); i++)
141     {
142         this->signature.fft_N[i] = this->plan->length[i];
143         this->signature.fft_inStride[i] = this->plan->inStride[i];
144         this->signature.fft_outStride[i] = this->plan->outStride[i];
145 
146     }
147     this->signature.fft_inStride[i] = this->plan->iDist;
148     this->signature.fft_outStride[i] = this->plan->oDist;
149 
150     if (this->plan->large1D != 0) {
151         ARG_CHECK(this->signature.fft_N[0] != 0)
152             //ToDo:ENABLE ASSERT
153        //     ARG_CHECK((this->plan->large1D % this->signature.fft_N[0]) == 0)
154             this->signature.fft_3StepTwiddle = true;
155         //ToDo:ENABLE ASSERT
156        // ARG_CHECK(this->plan->large1D == (this->signature.fft_N[1] * this->signature.fft_N[0]));
157     }
158 
159     //	Query the devices in this context for their local memory sizes
160     //	How we generate a kernel depends on the *minimum* LDS size for all devices.
161     //
162     const FFTEnvelope * pEnvelope = NULL;
163     OPENCL_V(this->plan->GetEnvelope(&pEnvelope), _T("GetEnvelope failed"));
164     BUG_CHECK(NULL != pEnvelope);
165 
166     // TODO:  Since I am going with a 2D workgroup size now, I need a better check than this 1D use
167     // Check:  CL_DEVICE_MAX_WORK_GROUP_SIZE/CL_KERNEL_WORK_GROUP_SIZE
168     // CL_DEVICE_MAX_WORK_ITEM_SIZES
169     this->signature.fft_R = 1; // Dont think i'll use
170     this->signature.fft_SIMD = pEnvelope->limit_WorkGroupSize; // Use devices maximum workgroup size
171 
172                                                                //Set callback if specified
173     if (this->plan->hasPreCallback)
174     {
175         this->signature.fft_hasPreCallback = true;
176         this->signature.fft_preCallback = this->plan->preCallback;
177     }
178 	if (this->plan->hasPostCallback)
179 	{
180 		this->signature.fft_hasPostCallback = true;
181 		this->signature.fft_postCallback = this->plan->postCallbackParam;
182 	}
183 	this->signature.limit_LocalMemSize = this->plan->envelope.limit_LocalMemSize;
184 
185 	this->signature.transposeMiniBatchSize = this->plan->transposeMiniBatchSize;
186 	this->signature.nonSquareKernelOrder = this->plan->nonSquareKernelOrder;
187 	this->signature.transposeBatchSize = this->plan->batchsize;
188 
189     return CLFFT_SUCCESS;
190 }
191 
192 
// File-local tuning constants shared by the kernel generators and the work-size computations below.
namespace
{
    // Local work size pushed for the transpose kernels and handed to the generators.
    const size_t lwSize = 256;

    // Multiplier applied to the 16-wide tile edge (16 * reShapeFactor) in the
    // LDS-size checks and the work-group slice computations.
    const size_t reShapeFactor = 2;
}
195 
196 
197 //	OpenCL does not take unicode strings as input, so this routine returns only ASCII strings
198 //	Feed this generator the FFTPlan, and it returns the generated program as a string
generateKernel(FFTRepo & fftRepo,const cl_command_queue commQueueFFT)199 clfftStatus FFTGeneratedTransposeNonSquareAction::generateKernel(FFTRepo& fftRepo, const cl_command_queue commQueueFFT)
200 {
201 
202 
203     std::string programCode;
204 	std::string kernelFuncName;//applied to swap kernel for now
205     if (this->signature.nonSquareKernelType == NON_SQUARE_TRANS_TRANSPOSE_BATCHED_LEADING)
206     {
207 		//Requested local memory size by callback must not exceed the device LDS limits after factoring the LDS size required by transpose kernel
208 		if (this->signature.fft_hasPreCallback && this->signature.fft_preCallback.localMemSize > 0)
209 		{
210 			assert(!this->signature.fft_hasPostCallback);
211 
212 			bool validLDSSize = false;
213 			size_t requestedCallbackLDS = 0;
214 
215 			requestedCallbackLDS = this->signature.fft_preCallback.localMemSize;
216 
217 			validLDSSize = ((2 * this->plan->ElementSize() * 16 * reShapeFactor * 16 * reShapeFactor) + requestedCallbackLDS) < this->plan->envelope.limit_LocalMemSize;
218 
219 			if(!validLDSSize)
220 			{
221 				fprintf(stderr, "Requested local memory size not available\n");
222 				return CLFFT_INVALID_ARG_VALUE;
223 			}
224 		}
225         OPENCL_V(clfft_transpose_generator::genTransposeKernelLeadingDimensionBatched(this->signature, programCode, lwSize, reShapeFactor), _T("genTransposeKernel() failed!"));
226     }
227 	else if (this->signature.nonSquareKernelType == NON_SQUARE_TRANS_TRANSPOSE_BATCHED)
228 	{
229 		//pre call back check
230 		//Requested local memory size by callback must not exceed the device LDS limits after factoring the LDS size required by transpose kernel
231 		if (this->signature.fft_hasPreCallback && this->signature.fft_preCallback.localMemSize > 0)
232 		{
233 			assert(!this->signature.fft_hasPostCallback);
234 
235 			bool validLDSSize = false;
236 			size_t requestedCallbackLDS = 0;
237 
238 			requestedCallbackLDS = this->signature.fft_preCallback.localMemSize;
239 
240 			validLDSSize = ((2 * this->plan->ElementSize() * 16 * reShapeFactor * 16 * reShapeFactor) + requestedCallbackLDS) < this->plan->envelope.limit_LocalMemSize;
241 
242 			if (!validLDSSize)
243 			{
244 				fprintf(stderr, "Requested local memory size not available\n");
245 				return CLFFT_INVALID_ARG_VALUE;
246 			}
247 		}
248 		OPENCL_V(clfft_transpose_generator::genTransposeKernelBatched(this->signature, programCode, lwSize, reShapeFactor), _T("genTransposeKernel() failed!"));
249 	}
250     else
251     {
252 		//pre-callback is possible in swap kernel now
253 		if (this->signature.fft_hasPreCallback && this->signature.fft_preCallback.localMemSize > 0)
254 		{
255 			assert(!this->signature.fft_hasPostCallback);
256 
257 			bool validLDSSize = false;
258 			size_t requestedCallbackLDS = 0;
259 
260 			requestedCallbackLDS = this->signature.fft_preCallback.localMemSize;
261 			//LDS usage of swap lines is exactly 2 lines
262 			size_t lineSize = (this->signature.fft_N[0]) < (this->signature.fft_N[1]) ? this->signature.fft_N[0] : this->signature.fft_N[1];
263 			validLDSSize = ((2 * this->plan->ElementSize() * lineSize) + requestedCallbackLDS) < this->plan->envelope.limit_LocalMemSize;
264 
265 			if (!validLDSSize)
266 			{
267 				fprintf(stderr, "Requested local memory size not available\n");
268 				return CLFFT_INVALID_ARG_VALUE;
269 			}
270 		}
271 		//here we should decide generate what kind of swap kernel. 1:2 and 1:3 probably need different swap kernels
272 		/*
273 		if (this->signature.fft_N[0] == 2 * this->signature.fft_N[1] || 2 * this->signature.fft_N[0] == this->signature.fft_N[1])
274 		{
275 			OPENCL_V(clfft_transpose_generator::genSwapKernel(this->signature, programCode, kernelFuncName, lwSize, reShapeFactor), _T("genSwapKernel() failed!"));
276 		}
277 		else
278 		{
279 			OPENCL_V(clfft_transpose_generator::genSwapKernelGeneral(this->signature, programCode, kernelFuncName, lwSize, reShapeFactor), _T("genSwapKernel() failed!"));
280 		}
281 		*/
282 		//general swap kernel takes care of all ratio
283 		OPENCL_V(clfft_transpose_generator::genSwapKernelGeneral(this->signature, programCode, kernelFuncName, lwSize, reShapeFactor), _T("genSwapKernel() failed!"));
284     }
285 	//std::cout << programCode << std::endl;
286     cl_int status = CL_SUCCESS;
287     cl_device_id Device = NULL;
288     status = clGetCommandQueueInfo(commQueueFFT, CL_QUEUE_DEVICE, sizeof(cl_device_id), &Device, NULL);
289     OPENCL_V(status, _T("clGetCommandQueueInfo failed"));
290 
291     cl_context QueueContext = NULL;
292     status = clGetCommandQueueInfo(commQueueFFT, CL_QUEUE_CONTEXT, sizeof(cl_context), &QueueContext, NULL);
293     OPENCL_V(status, _T("clGetCommandQueueInfo failed"));
294 
295 
296     OPENCL_V(fftRepo.setProgramCode(Transpose_NONSQUARE, this->getSignatureData(), programCode, Device, QueueContext), _T("fftRepo.setclString() failed!"));
297     if (this->signature.nonSquareKernelType == NON_SQUARE_TRANS_TRANSPOSE_BATCHED_LEADING)
298     {
299         // Note:  See genFunctionPrototype( )
300         if (this->signature.fft_3StepTwiddle)
301         {
302             OPENCL_V(fftRepo.setProgramEntryPoints(Transpose_NONSQUARE, this->getSignatureData(), "transpose_nonsquare_tw_fwd", "transpose_nonsquare_tw_back", Device, QueueContext), _T("fftRepo.setProgramEntryPoint() failed!"));
303         }
304         else
305         {
306             OPENCL_V(fftRepo.setProgramEntryPoints(Transpose_NONSQUARE, this->getSignatureData(), "transpose_nonsquare", "transpose_nonsquare", Device, QueueContext), _T("fftRepo.setProgramEntryPoint() failed!"));
307         }
308     }
309 	else if(this->signature.nonSquareKernelType == NON_SQUARE_TRANS_TRANSPOSE_BATCHED)
310 	{
311         //for non square we do twiddling in swap kernel
312         /*
313 		if (this->signature.fft_3StepTwiddle && (this->signature.transposeMiniBatchSize == 1))
314 		{
315 			OPENCL_V(fftRepo.setProgramEntryPoints(Transpose_NONSQUARE, this->getSignatureData(), "transpose_square_tw_fwd", "transpose_square_tw_back", Device, QueueContext), _T("fftRepo.setProgramEntryPoint() failed!"));
316 		}
317 		else
318 		{
319 			OPENCL_V(fftRepo.setProgramEntryPoints(Transpose_NONSQUARE, this->getSignatureData(), "transpose_square", "transpose_square", Device, QueueContext), _T("fftRepo.setProgramEntryPoint() failed!"));
320 		}
321         */
322         OPENCL_V(fftRepo.setProgramEntryPoints(Transpose_NONSQUARE, this->getSignatureData(), "transpose_square", "transpose_square", Device, QueueContext), _T("fftRepo.setProgramEntryPoint() failed!"));
323 	}
324     else
325     {
326         if (this->signature.fft_3StepTwiddle)//if miniBatchSize > 1 twiddling is done in swap kernel
327         {
328             std::string kernelFwdFuncName = kernelFuncName + "_tw_fwd";
329             std::string kernelBwdFuncName = kernelFuncName + "_tw_back";
330             OPENCL_V(fftRepo.setProgramEntryPoints(Transpose_NONSQUARE, this->getSignatureData(), kernelFwdFuncName.c_str(), kernelBwdFuncName.c_str(), Device, QueueContext), _T("fftRepo.setProgramEntryPoint() failed!"));
331         }
332         else
333             OPENCL_V(fftRepo.setProgramEntryPoints(Transpose_NONSQUARE, this->getSignatureData(), kernelFuncName.c_str(), kernelFuncName.c_str(), Device, QueueContext), _T("fftRepo.setProgramEntryPoint() failed!"));
334     }
335     return CLFFT_SUCCESS;
336 }
337 
338 
getWorkSizes(std::vector<size_t> & globalWS,std::vector<size_t> & localWS)339 clfftStatus FFTGeneratedTransposeNonSquareAction::getWorkSizes(std::vector< size_t >& globalWS, std::vector< size_t >& localWS)
340 {
341 
342     size_t wg_slice;
343     size_t smaller_dim = (this->signature.fft_N[0] < this->signature.fft_N[1]) ? this->signature.fft_N[0] : this->signature.fft_N[1];
344 	size_t bigger_dim = (this->signature.fft_N[0] >= this->signature.fft_N[1]) ? this->signature.fft_N[0] : this->signature.fft_N[1];
345 	size_t dim_ratio = bigger_dim / smaller_dim;
346     size_t global_item_size;
347 
348     if (this->signature.nonSquareKernelType == NON_SQUARE_TRANS_TRANSPOSE_BATCHED_LEADING)
349     {
350         if (smaller_dim % (16 * reShapeFactor) == 0)
351             wg_slice = smaller_dim / 16 / reShapeFactor;
352         else
353             wg_slice = (smaller_dim / (16 * reShapeFactor)) + 1;
354 
355         global_item_size = wg_slice*(wg_slice + 1) / 2 * 16 * 16 * this->plan->batchsize;
356 
357         for (int i = 2; i < this->signature.fft_DataDim - 1; i++)
358         {
359             global_item_size *= this->signature.fft_N[i];
360         }
361 
362         /*Push the data required for the transpose kernels*/
363         globalWS.clear();
364 		if(this->signature.nonSquareKernelType == NON_SQUARE_TRANS_TRANSPOSE_BATCHED_LEADING)
365 			globalWS.push_back(global_item_size * dim_ratio);
366 		else if (this->signature.nonSquareKernelType == NON_SQUARE_TRANS_TRANSPOSE_BATCHED)
367 			globalWS.push_back(global_item_size);
368 
369 
370         localWS.clear();
371         localWS.push_back(lwSize);
372     }
373 	else if (this->signature.nonSquareKernelType == NON_SQUARE_TRANS_TRANSPOSE_BATCHED)
374 	{
375 		if (smaller_dim % (16 * reShapeFactor) == 0)
376 			wg_slice = smaller_dim / 16 / reShapeFactor;
377 		else
378 			wg_slice = (smaller_dim / (16 * reShapeFactor)) + 1;
379 
380 		global_item_size = wg_slice*(wg_slice + 1) / 2 * 16 * 16 * this->plan->batchsize;
381 
382 		for (int i = 2; i < this->plan->length.size(); i++)
383 		{
384 			global_item_size *= this->plan->length[i];
385 		}
386 
387 		/*Push the data required for the transpose kernels*/
388 		globalWS.clear();
389 		globalWS.push_back(global_item_size);
390 
391 
392 		localWS.clear();
393 		localWS.push_back(lwSize);
394 	}
395     else
396     {
397         /*Now calculate the data for the swap kernels */
398 		// general swap kernel takes care of all ratio. need clean up here
399 		if(dim_ratio == 2 && 0){
400 			//1:2 ratio
401 			size_t input_elm_size_in_bytes;
402 			switch (this->signature.fft_precision)
403 			{
404 			case CLFFT_SINGLE:
405 			case CLFFT_SINGLE_FAST:
406 				input_elm_size_in_bytes = 4;
407 				break;
408 			case CLFFT_DOUBLE:
409 			case CLFFT_DOUBLE_FAST:
410 				input_elm_size_in_bytes = 8;
411 				break;
412 			default:
413 				return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
414 			}
415 
416 			switch (this->signature.fft_outputLayout)
417 			{
418 			case CLFFT_COMPLEX_INTERLEAVED:
419 			case CLFFT_COMPLEX_PLANAR:
420 				input_elm_size_in_bytes *= 2;
421 				break;
422 			case CLFFT_REAL:
423 				break;
424 			default:
425 				return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
426 			}
427 			size_t max_elements_loaded = AVAIL_MEM_SIZE / input_elm_size_in_bytes;
428 			size_t num_elements_loaded;
429 			size_t local_work_size_swap, num_grps_pro_row;
430 
431 			if ((max_elements_loaded >> 1) > smaller_dim)
432 			{
433 				local_work_size_swap = (smaller_dim < 256) ? smaller_dim : 256;
434 				num_elements_loaded = smaller_dim;
435 				num_grps_pro_row = 1;
436 			}
437 			else
438 			{
439 				num_grps_pro_row = (smaller_dim << 1) / max_elements_loaded;
440 				num_elements_loaded = max_elements_loaded >> 1;
441 				local_work_size_swap = (num_elements_loaded < 256) ? num_elements_loaded : 256;
442 			}
443 			size_t num_reduced_row;
444 			size_t num_reduced_col;
445 
446 			if (this->signature.fft_N[1] == smaller_dim)
447 			{
448 				num_reduced_row = smaller_dim;
449 				num_reduced_col = 2;
450 			}
451 			else
452 			{
453 				num_reduced_row = 2;
454 				num_reduced_col = smaller_dim;
455 			}
456 
457 			size_t *cycle_map = new size_t[num_reduced_row * num_reduced_col * 2];
458 			/* The memory required by cycle_map cannot exceed 2 times row*col by design*/
459 			clfft_transpose_generator::get_cycles(cycle_map, num_reduced_row, num_reduced_col);
460 
461 			global_item_size = local_work_size_swap * num_grps_pro_row * cycle_map[0] * this->plan->batchsize;
462 
463 			for (int i = 2; i < this->signature.fft_DataDim - 1; i++)
464 			{
465 				global_item_size *= this->signature.fft_N[i];
466 			}
467 			delete[] cycle_map;
468 
469 			globalWS.push_back(global_item_size);
470 			localWS.push_back(local_work_size_swap);
471 		}
472 		else
473 		{
474 			//if (dim_ratio == 2 || dim_ratio == 3 || dim_ratio == 5 || dim_ratio == 10)
475 			if (dim_ratio % 2 == 0 || dim_ratio % 3 == 0 || dim_ratio % 5 == 0 || dim_ratio % 10 == 0)
476 			{
477 				size_t local_work_size_swap = 256;
478 				std::vector<std::vector<size_t> > permutationTable;
479 				clfft_transpose_generator::permutation_calculation(dim_ratio, smaller_dim, permutationTable);
480 				size_t global_item_size;
481 				if(this->plan->large1D && (dim_ratio > 1))
482 					global_item_size = (permutationTable.size() + 2) * local_work_size_swap * this->plan->batchsize;
483 				else
484 					global_item_size = (permutationTable.size() + 2) * local_work_size_swap * this->plan->batchsize;
485 				//for (int i = 2; i < this->plan->length.size(); i++)
486 				//	global_item_size *= this->plan->length[i];
487 				size_t LDS_per_WG = smaller_dim;
488 				while (LDS_per_WG > 1024)//avoiding using too much lds memory. the biggest LDS memory we will allocate would be 1024*sizeof(float2/double2)*2
489 				{
490 					if (LDS_per_WG % 2 == 0)
491 					{
492 						LDS_per_WG /= 2;
493 						continue;
494 					}
495 					if (LDS_per_WG % 3 == 0)
496 					{
497 						LDS_per_WG /= 3;
498 						continue;
499 					}
500 					if (LDS_per_WG % 5 == 0)
501 					{
502 						LDS_per_WG /= 5;
503 						continue;
504 					}
505 					return CLFFT_NOTIMPLEMENTED;
506 				}
507 
508 				size_t WG_per_line = smaller_dim / LDS_per_WG;
509 				global_item_size *= WG_per_line;
510 				globalWS.push_back(global_item_size);
511 				localWS.push_back(local_work_size_swap);
512 			}
513 			else
514 				return CLFFT_NOTIMPLEMENTED;
515 		}
516     }
517     return CLFFT_SUCCESS;
518 }
519 
FFTGeneratedTransposeSquareAction(clfftPlanHandle plHandle,FFTPlan * plan,cl_command_queue queue,clfftStatus & err)520 FFTGeneratedTransposeSquareAction::FFTGeneratedTransposeSquareAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err)
521 	: FFTTransposeSquareAction(plHandle, plan, queue, err)
522 {
523 	if (err != CLFFT_SUCCESS)
524 	{
525 		// FFTTransposeSquareAction() failed, exit
526 		fprintf(stderr, "FFTTransposeSquareAction() failed!\n");
527 		return;
528 	}
529 
530 	// Initialize the FFTAction::FFTKernelGenKeyParams member
531 	err = this->initParams();
532 
533 	if (err != CLFFT_SUCCESS)
534 	{
535 		fprintf(stderr, "FFTGeneratedTransposeSquareAction::initParams() failed!\n");
536 		return;
537 	}
538 
539 	FFTRepo &fftRepo = FFTRepo::getInstance();
540 
541 	err = this->generateKernel(fftRepo, queue);
542 
543 	if (err != CLFFT_SUCCESS)
544 	{
545 		fprintf(stderr, "FFTGeneratedTransposeSquareAction::generateKernel failed\n");
546 		return;
547 	}
548 
549 	err = compileKernels(queue, plHandle, plan);
550 
551 	if (err != CLFFT_SUCCESS)
552 	{
553 		fprintf(stderr, "FFTGeneratedTransposeSquareAction::compileKernels failed\n");
554 		return;
555 	}
556 
557 	err = CLFFT_SUCCESS;
558 }
559 
560 
buildForwardKernel()561 bool FFTGeneratedTransposeSquareAction::buildForwardKernel()
562 {
563 	clfftLayout inputLayout = this->getSignatureData()->fft_inputLayout;
564 	clfftLayout outputLayout = this->getSignatureData()->fft_outputLayout;
565 
566 	bool r2c_transform = (inputLayout == CLFFT_REAL);
567 	bool c2r_transform = (outputLayout == CLFFT_REAL);
568 	bool real_transform = (r2c_transform || c2r_transform);
569 
570 	return (!real_transform) || r2c_transform;
571 }
572 
buildBackwardKernel()573 bool FFTGeneratedTransposeSquareAction::buildBackwardKernel()
574 {
575 	clfftLayout inputLayout = this->getSignatureData()->fft_inputLayout;
576 	clfftLayout outputLayout = this->getSignatureData()->fft_outputLayout;
577 
578 	bool r2c_transform = (inputLayout == CLFFT_REAL);
579 	bool c2r_transform = (outputLayout == CLFFT_REAL);
580 	bool real_transform = (r2c_transform || c2r_transform);
581 
582 	return (!real_transform) || c2r_transform;
583 }
584 
/* square action */
initParams()586 clfftStatus FFTGeneratedTransposeSquareAction::initParams()
587 {
588 
589 	this->signature.fft_precision = this->plan->precision;
590 	this->signature.fft_placeness = this->plan->placeness;
591 	this->signature.fft_inputLayout = this->plan->inputLayout;
592 	this->signature.fft_outputLayout = this->plan->outputLayout;
593 	this->signature.fft_3StepTwiddle = false;
594 
595 	this->signature.fft_realSpecial = this->plan->realSpecial;
596 
597 	this->signature.transOutHorizontal = this->plan->transOutHorizontal;	// using the twiddle front flag to specify horizontal write
598 																			// we do this so as to reuse flags in FFTKernelGenKeyParams
599 																			// and to avoid making a new one
600 
601 	ARG_CHECK(this->plan->inStride.size() == this->plan->outStride.size());
602 
603 	if (CLFFT_INPLACE == this->signature.fft_placeness)
604 	{
605 		//	If this is an in-place transform the
606 		//	input and output layout, dimensions and strides
607 		//	*MUST* be the same.
608 		//
609 		ARG_CHECK(this->signature.fft_inputLayout == this->signature.fft_outputLayout)
610 
611 			for (size_t u = this->plan->inStride.size(); u-- > 0; )
612 			{
613 				ARG_CHECK(this->plan->inStride[u] == this->plan->outStride[u]);
614 			}
615 	}
616 
617 	this->signature.fft_DataDim = this->plan->length.size() + 1;
618 	int i = 0;
619 	for (i = 0; i < (this->signature.fft_DataDim - 1); i++)
620 	{
621 		this->signature.fft_N[i] = this->plan->length[i];
622 		this->signature.fft_inStride[i] = this->plan->inStride[i];
623 		this->signature.fft_outStride[i] = this->plan->outStride[i];
624 
625 	}
626 	this->signature.fft_inStride[i] = this->plan->iDist;
627 	this->signature.fft_outStride[i] = this->plan->oDist;
628 
629 	if (this->plan->large1D != 0) {
630 		ARG_CHECK(this->signature.fft_N[0] != 0)
631 			ARG_CHECK((this->plan->large1D % this->signature.fft_N[0]) == 0)
632 			this->signature.fft_3StepTwiddle = true;
633 		ARG_CHECK(this->plan->large1D == (this->signature.fft_N[1] * this->signature.fft_N[0]));
634 	}
635 
636 	//	Query the devices in this context for their local memory sizes
637 	//	How we generate a kernel depends on the *minimum* LDS size for all devices.
638 	//
639 	const FFTEnvelope * pEnvelope = NULL;
640 	OPENCL_V(this->plan->GetEnvelope(&pEnvelope), _T("GetEnvelope failed"));
641 	BUG_CHECK(NULL != pEnvelope);
642 
643 	// TODO:  Since I am going with a 2D workgroup size now, I need a better check than this 1D use
644 	// Check:  CL_DEVICE_MAX_WORK_GROUP_SIZE/CL_KERNEL_WORK_GROUP_SIZE
645 	// CL_DEVICE_MAX_WORK_ITEM_SIZES
646 	this->signature.fft_R = 1; // Dont think i'll use
647 	this->signature.fft_SIMD = pEnvelope->limit_WorkGroupSize; // Use devices maximum workgroup size
648 
649 															   //Set callback if specified
650 	if (this->plan->hasPreCallback)
651 	{
652 		this->signature.fft_hasPreCallback = true;
653 		this->signature.fft_preCallback = this->plan->preCallback;
654 	}
655 	if (this->plan->hasPostCallback)
656 	{
657 		this->signature.fft_hasPostCallback = true;
658 		this->signature.fft_postCallback = this->plan->postCallbackParam;
659 	}
660 	this->signature.limit_LocalMemSize = this->plan->envelope.limit_LocalMemSize;
661 
662 	this->signature.transposeMiniBatchSize = this->plan->transposeMiniBatchSize;
663 	this->signature.transposeBatchSize = this->plan->batchsize;
664 
665 	return CLFFT_SUCCESS;
666 }
667 
668 
669 //	OpenCL does not take unicode strings as input, so this routine returns only ASCII strings
670 //	Feed this generator the FFTPlan, and it returns the generated program as a string
generateKernel(FFTRepo & fftRepo,const cl_command_queue commQueueFFT)671 clfftStatus FFTGeneratedTransposeSquareAction::generateKernel(FFTRepo& fftRepo, const cl_command_queue commQueueFFT)
672 {
673 	//Requested local memory size by callback must not exceed the device LDS limits after factoring the LDS size required by main FFT kernel
674 	if ((this->signature.fft_hasPreCallback && this->signature.fft_preCallback.localMemSize > 0) ||
675 		(this->signature.fft_hasPostCallback && this->signature.fft_postCallback.localMemSize > 0))
676 	{
677 		assert(!(this->signature.fft_hasPreCallback && this->signature.fft_hasPostCallback));
678 
679 		bool validLDSSize = false;
680 		size_t requestedCallbackLDS = 0;
681 
682 		if (this->signature.fft_hasPreCallback && this->signature.fft_preCallback.localMemSize > 0)
683 			requestedCallbackLDS = this->signature.fft_preCallback.localMemSize;
684 		else if (this->signature.fft_hasPostCallback && this->signature.fft_postCallback.localMemSize > 0)
685 			requestedCallbackLDS = this->signature.fft_postCallback.localMemSize;
686 
687 		validLDSSize = ((2 * this->plan->ElementSize() * 16 * reShapeFactor * 16 * reShapeFactor) + requestedCallbackLDS) < this->plan->envelope.limit_LocalMemSize;
688 
689 		if (!validLDSSize)
690 		{
691 			fprintf(stderr, "Requested local memory size not available\n");
692 			return CLFFT_INVALID_ARG_VALUE;
693 		}
694 	}
695 
696 	std::string programCode;
697 	OPENCL_V(clfft_transpose_generator::genTransposeKernelBatched(this->signature, programCode, lwSize, reShapeFactor), _T("GenerateTransposeKernel() failed!"));
698 
699 	cl_int status = CL_SUCCESS;
700 	cl_device_id Device = NULL;
701 	status = clGetCommandQueueInfo(commQueueFFT, CL_QUEUE_DEVICE, sizeof(cl_device_id), &Device, NULL);
702 	OPENCL_V(status, _T("clGetCommandQueueInfo failed"));
703 
704 	cl_context QueueContext = NULL;
705 	status = clGetCommandQueueInfo(commQueueFFT, CL_QUEUE_CONTEXT, sizeof(cl_context), &QueueContext, NULL);
706 	OPENCL_V(status, _T("clGetCommandQueueInfo failed"));
707 
708 
709 	OPENCL_V(fftRepo.setProgramCode(Transpose_SQUARE, this->getSignatureData(), programCode, Device, QueueContext), _T("fftRepo.setclString() failed!"));
710 
711 	// Note:  See genFunctionPrototype( )
712 	if (this->signature.fft_3StepTwiddle)
713 	{
714 		OPENCL_V(fftRepo.setProgramEntryPoints(Transpose_SQUARE, this->getSignatureData(), "transpose_square_tw_fwd", "transpose_square_tw_back", Device, QueueContext), _T("fftRepo.setProgramEntryPoint() failed!"));
715 	}
716 	else
717 	{
718 		OPENCL_V(fftRepo.setProgramEntryPoints(Transpose_SQUARE, this->getSignatureData(), "transpose_square", "transpose_square", Device, QueueContext), _T("fftRepo.setProgramEntryPoint() failed!"));
719 	}
720 
721 	return CLFFT_SUCCESS;
722 }
723 
724 
getWorkSizes(std::vector<size_t> & globalWS,std::vector<size_t> & localWS)725 clfftStatus FFTGeneratedTransposeSquareAction::getWorkSizes(std::vector< size_t >& globalWS, std::vector< size_t >& localWS)
726 {
727 
728 	size_t wg_slice;
729 	if (this->signature.fft_N[0] % (16 * reShapeFactor) == 0)
730 		wg_slice = this->signature.fft_N[0] / 16 / reShapeFactor;
731 	else
732 		wg_slice = (this->signature.fft_N[0] / (16 * reShapeFactor)) + 1;
733 
734 	size_t global_item_size = wg_slice*(wg_slice + 1) / 2 * 16 * 16 * this->plan->batchsize;
735 
736 	for (int i = 2; i < this->signature.fft_DataDim - 1; i++)
737 	{
738 		global_item_size *= this->signature.fft_N[i];
739 	}
740 
741 	globalWS.clear();
742 	globalWS.push_back(global_item_size);
743 
744 	localWS.clear();
745 	localWS.push_back(lwSize);
746 
747 	return CLFFT_SUCCESS;
748 }
749