1 /* ************************************************************************
2 * Copyright 2013 Advanced Micro Devices, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 * ************************************************************************/
16
17
// action.transpose.nonsquare.cpp provides the entry points for "baking" the
// nonsquare in-place transpose kernels that are called from plan.cpp.
// The actual kernel string generation is provided by generator.transpose.cpp.
21
22 #include "stdafx.h"
23
24 #include <math.h>
25 #include <iomanip>
26 #include "generator.transpose.h"
27 #include "action.transpose.h"
28 #include "generator.stockham.h"
29
30 #include "action.h"
31
FFTGeneratedTransposeNonSquareAction(clfftPlanHandle plHandle,FFTPlan * plan,cl_command_queue queue,clfftStatus & err)32 FFTGeneratedTransposeNonSquareAction::FFTGeneratedTransposeNonSquareAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err)
33 : FFTTransposeNonSquareAction(plHandle, plan, queue, err)
34 {
35 if (err != CLFFT_SUCCESS)
36 {
37 // FFTTransposeNonSquareAction() failed, exit
38 fprintf(stderr, "FFTTransposeNonSquareAction() failed!\n");
39 return;
40 }
41
42 // Initialize the FFTAction::FFTKernelGenKeyParams member
43 err = this->initParams();
44
45 if (err != CLFFT_SUCCESS)
46 {
47 fprintf(stderr, "FFTGeneratedTransposeNonSquareAction::initParams() failed!\n");
48 return;
49 }
50
51 FFTRepo &fftRepo = FFTRepo::getInstance();
52
53 err = this->generateKernel(fftRepo, queue);
54
55 if (err != CLFFT_SUCCESS)
56 {
57 fprintf(stderr, "FFTGeneratedTransposeNonSquareAction::generateKernel failed\n");
58 return;
59 }
60
61 err = compileKernels(queue, plHandle, plan);
62
63 if (err != CLFFT_SUCCESS)
64 {
65 fprintf(stderr, "FFTGeneratedTransposeNonSquareAction::compileKernels failed\n");
66 return;
67 }
68
69 err = CLFFT_SUCCESS;
70 }
71
72
buildForwardKernel()73 bool FFTGeneratedTransposeNonSquareAction::buildForwardKernel()
74 {
75 clfftLayout inputLayout = this->getSignatureData()->fft_inputLayout;
76 clfftLayout outputLayout = this->getSignatureData()->fft_outputLayout;
77
78 bool r2c_transform = (inputLayout == CLFFT_REAL);
79 bool c2r_transform = (outputLayout == CLFFT_REAL);
80 bool real_transform = (r2c_transform || c2r_transform);
81
82 return (!real_transform) || r2c_transform;
83 }
84
buildBackwardKernel()85 bool FFTGeneratedTransposeNonSquareAction::buildBackwardKernel()
86 {
87 clfftLayout inputLayout = this->getSignatureData()->fft_inputLayout;
88 clfftLayout outputLayout = this->getSignatureData()->fft_outputLayout;
89
90 bool r2c_transform = (inputLayout == CLFFT_REAL);
91 bool c2r_transform = (outputLayout == CLFFT_REAL);
92 bool real_transform = (r2c_transform || c2r_transform);
93
94 return (!real_transform) || c2r_transform;
95 }
96
// Names that are used as strKernel parameters.
const std::string pmRealIn     = "pmRealIn";
const std::string pmImagIn     = "pmImagIn";
const std::string pmRealOut    = "pmRealOut";
const std::string pmImagOut    = "pmImagOut";
const std::string pmComplexIn  = "pmComplexIn";
const std::string pmComplexOut = "pmComplexOut";
104
initParams()105 clfftStatus FFTGeneratedTransposeNonSquareAction::initParams()
106 {
107
108 this->signature.fft_precision = this->plan->precision;
109 this->signature.fft_placeness = this->plan->placeness;
110 this->signature.fft_inputLayout = this->plan->inputLayout;
111 this->signature.fft_outputLayout = this->plan->outputLayout;
112 this->signature.fft_3StepTwiddle = false;
113 this->signature.nonSquareKernelType = this->plan->nonSquareKernelType;
114
115 this->signature.fft_realSpecial = this->plan->realSpecial;
116
117 this->signature.transOutHorizontal = this->plan->transOutHorizontal; // using the twiddle front flag to specify horizontal write
118 // we do this so as to reuse flags in FFTKernelGenKeyParams
119 // and to avoid making a new one
120
121 ARG_CHECK(this->plan->inStride.size() == this->plan->outStride.size());
122
123 if (CLFFT_INPLACE == this->signature.fft_placeness)
124 {
125 // If this is an in-place transform the
126 // input and output layout
127 // *MUST* be the same.
128 //
129 ARG_CHECK(this->signature.fft_inputLayout == this->signature.fft_outputLayout)
130
131 /* for (size_t u = this->plan->inStride.size(); u-- > 0; )
132 {
133 ARG_CHECK(this->plan->inStride[u] == this->plan->outStride[u]);
134 }*/
135 }
136
137 this->signature.fft_DataDim = this->plan->length.size() + 1;
138
139 int i = 0;
140 for (i = 0; i < (this->signature.fft_DataDim - 1); i++)
141 {
142 this->signature.fft_N[i] = this->plan->length[i];
143 this->signature.fft_inStride[i] = this->plan->inStride[i];
144 this->signature.fft_outStride[i] = this->plan->outStride[i];
145
146 }
147 this->signature.fft_inStride[i] = this->plan->iDist;
148 this->signature.fft_outStride[i] = this->plan->oDist;
149
150 if (this->plan->large1D != 0) {
151 ARG_CHECK(this->signature.fft_N[0] != 0)
152 //ToDo:ENABLE ASSERT
153 // ARG_CHECK((this->plan->large1D % this->signature.fft_N[0]) == 0)
154 this->signature.fft_3StepTwiddle = true;
155 //ToDo:ENABLE ASSERT
156 // ARG_CHECK(this->plan->large1D == (this->signature.fft_N[1] * this->signature.fft_N[0]));
157 }
158
159 // Query the devices in this context for their local memory sizes
160 // How we generate a kernel depends on the *minimum* LDS size for all devices.
161 //
162 const FFTEnvelope * pEnvelope = NULL;
163 OPENCL_V(this->plan->GetEnvelope(&pEnvelope), _T("GetEnvelope failed"));
164 BUG_CHECK(NULL != pEnvelope);
165
166 // TODO: Since I am going with a 2D workgroup size now, I need a better check than this 1D use
167 // Check: CL_DEVICE_MAX_WORK_GROUP_SIZE/CL_KERNEL_WORK_GROUP_SIZE
168 // CL_DEVICE_MAX_WORK_ITEM_SIZES
169 this->signature.fft_R = 1; // Dont think i'll use
170 this->signature.fft_SIMD = pEnvelope->limit_WorkGroupSize; // Use devices maximum workgroup size
171
172 //Set callback if specified
173 if (this->plan->hasPreCallback)
174 {
175 this->signature.fft_hasPreCallback = true;
176 this->signature.fft_preCallback = this->plan->preCallback;
177 }
178 if (this->plan->hasPostCallback)
179 {
180 this->signature.fft_hasPostCallback = true;
181 this->signature.fft_postCallback = this->plan->postCallbackParam;
182 }
183 this->signature.limit_LocalMemSize = this->plan->envelope.limit_LocalMemSize;
184
185 this->signature.transposeMiniBatchSize = this->plan->transposeMiniBatchSize;
186 this->signature.nonSquareKernelOrder = this->plan->nonSquareKernelOrder;
187 this->signature.transposeBatchSize = this->plan->batchsize;
188
189 return CLFFT_SUCCESS;
190 }
191
192
// Work-group size handed to the transpose/swap kernel generators and reported
// as the local work size of the transpose kernels.
static size_t const lwSize = 256;
// Reshape factor handed to the kernel generators; the LDS checks and the
// work-size computations in this file use (16 * reShapeFactor) as a tile edge.
static size_t const reShapeFactor = 2;
195
196
197 // OpenCL does not take unicode strings as input, so this routine returns only ASCII strings
198 // Feed this generator the FFTPlan, and it returns the generated program as a string
generateKernel(FFTRepo & fftRepo,const cl_command_queue commQueueFFT)199 clfftStatus FFTGeneratedTransposeNonSquareAction::generateKernel(FFTRepo& fftRepo, const cl_command_queue commQueueFFT)
200 {
201
202
203 std::string programCode;
204 std::string kernelFuncName;//applied to swap kernel for now
205 if (this->signature.nonSquareKernelType == NON_SQUARE_TRANS_TRANSPOSE_BATCHED_LEADING)
206 {
207 //Requested local memory size by callback must not exceed the device LDS limits after factoring the LDS size required by transpose kernel
208 if (this->signature.fft_hasPreCallback && this->signature.fft_preCallback.localMemSize > 0)
209 {
210 assert(!this->signature.fft_hasPostCallback);
211
212 bool validLDSSize = false;
213 size_t requestedCallbackLDS = 0;
214
215 requestedCallbackLDS = this->signature.fft_preCallback.localMemSize;
216
217 validLDSSize = ((2 * this->plan->ElementSize() * 16 * reShapeFactor * 16 * reShapeFactor) + requestedCallbackLDS) < this->plan->envelope.limit_LocalMemSize;
218
219 if(!validLDSSize)
220 {
221 fprintf(stderr, "Requested local memory size not available\n");
222 return CLFFT_INVALID_ARG_VALUE;
223 }
224 }
225 OPENCL_V(clfft_transpose_generator::genTransposeKernelLeadingDimensionBatched(this->signature, programCode, lwSize, reShapeFactor), _T("genTransposeKernel() failed!"));
226 }
227 else if (this->signature.nonSquareKernelType == NON_SQUARE_TRANS_TRANSPOSE_BATCHED)
228 {
229 //pre call back check
230 //Requested local memory size by callback must not exceed the device LDS limits after factoring the LDS size required by transpose kernel
231 if (this->signature.fft_hasPreCallback && this->signature.fft_preCallback.localMemSize > 0)
232 {
233 assert(!this->signature.fft_hasPostCallback);
234
235 bool validLDSSize = false;
236 size_t requestedCallbackLDS = 0;
237
238 requestedCallbackLDS = this->signature.fft_preCallback.localMemSize;
239
240 validLDSSize = ((2 * this->plan->ElementSize() * 16 * reShapeFactor * 16 * reShapeFactor) + requestedCallbackLDS) < this->plan->envelope.limit_LocalMemSize;
241
242 if (!validLDSSize)
243 {
244 fprintf(stderr, "Requested local memory size not available\n");
245 return CLFFT_INVALID_ARG_VALUE;
246 }
247 }
248 OPENCL_V(clfft_transpose_generator::genTransposeKernelBatched(this->signature, programCode, lwSize, reShapeFactor), _T("genTransposeKernel() failed!"));
249 }
250 else
251 {
252 //pre-callback is possible in swap kernel now
253 if (this->signature.fft_hasPreCallback && this->signature.fft_preCallback.localMemSize > 0)
254 {
255 assert(!this->signature.fft_hasPostCallback);
256
257 bool validLDSSize = false;
258 size_t requestedCallbackLDS = 0;
259
260 requestedCallbackLDS = this->signature.fft_preCallback.localMemSize;
261 //LDS usage of swap lines is exactly 2 lines
262 size_t lineSize = (this->signature.fft_N[0]) < (this->signature.fft_N[1]) ? this->signature.fft_N[0] : this->signature.fft_N[1];
263 validLDSSize = ((2 * this->plan->ElementSize() * lineSize) + requestedCallbackLDS) < this->plan->envelope.limit_LocalMemSize;
264
265 if (!validLDSSize)
266 {
267 fprintf(stderr, "Requested local memory size not available\n");
268 return CLFFT_INVALID_ARG_VALUE;
269 }
270 }
271 //here we should decide generate what kind of swap kernel. 1:2 and 1:3 probably need different swap kernels
272 /*
273 if (this->signature.fft_N[0] == 2 * this->signature.fft_N[1] || 2 * this->signature.fft_N[0] == this->signature.fft_N[1])
274 {
275 OPENCL_V(clfft_transpose_generator::genSwapKernel(this->signature, programCode, kernelFuncName, lwSize, reShapeFactor), _T("genSwapKernel() failed!"));
276 }
277 else
278 {
279 OPENCL_V(clfft_transpose_generator::genSwapKernelGeneral(this->signature, programCode, kernelFuncName, lwSize, reShapeFactor), _T("genSwapKernel() failed!"));
280 }
281 */
282 //general swap kernel takes care of all ratio
283 OPENCL_V(clfft_transpose_generator::genSwapKernelGeneral(this->signature, programCode, kernelFuncName, lwSize, reShapeFactor), _T("genSwapKernel() failed!"));
284 }
285 //std::cout << programCode << std::endl;
286 cl_int status = CL_SUCCESS;
287 cl_device_id Device = NULL;
288 status = clGetCommandQueueInfo(commQueueFFT, CL_QUEUE_DEVICE, sizeof(cl_device_id), &Device, NULL);
289 OPENCL_V(status, _T("clGetCommandQueueInfo failed"));
290
291 cl_context QueueContext = NULL;
292 status = clGetCommandQueueInfo(commQueueFFT, CL_QUEUE_CONTEXT, sizeof(cl_context), &QueueContext, NULL);
293 OPENCL_V(status, _T("clGetCommandQueueInfo failed"));
294
295
296 OPENCL_V(fftRepo.setProgramCode(Transpose_NONSQUARE, this->getSignatureData(), programCode, Device, QueueContext), _T("fftRepo.setclString() failed!"));
297 if (this->signature.nonSquareKernelType == NON_SQUARE_TRANS_TRANSPOSE_BATCHED_LEADING)
298 {
299 // Note: See genFunctionPrototype( )
300 if (this->signature.fft_3StepTwiddle)
301 {
302 OPENCL_V(fftRepo.setProgramEntryPoints(Transpose_NONSQUARE, this->getSignatureData(), "transpose_nonsquare_tw_fwd", "transpose_nonsquare_tw_back", Device, QueueContext), _T("fftRepo.setProgramEntryPoint() failed!"));
303 }
304 else
305 {
306 OPENCL_V(fftRepo.setProgramEntryPoints(Transpose_NONSQUARE, this->getSignatureData(), "transpose_nonsquare", "transpose_nonsquare", Device, QueueContext), _T("fftRepo.setProgramEntryPoint() failed!"));
307 }
308 }
309 else if(this->signature.nonSquareKernelType == NON_SQUARE_TRANS_TRANSPOSE_BATCHED)
310 {
311 //for non square we do twiddling in swap kernel
312 /*
313 if (this->signature.fft_3StepTwiddle && (this->signature.transposeMiniBatchSize == 1))
314 {
315 OPENCL_V(fftRepo.setProgramEntryPoints(Transpose_NONSQUARE, this->getSignatureData(), "transpose_square_tw_fwd", "transpose_square_tw_back", Device, QueueContext), _T("fftRepo.setProgramEntryPoint() failed!"));
316 }
317 else
318 {
319 OPENCL_V(fftRepo.setProgramEntryPoints(Transpose_NONSQUARE, this->getSignatureData(), "transpose_square", "transpose_square", Device, QueueContext), _T("fftRepo.setProgramEntryPoint() failed!"));
320 }
321 */
322 OPENCL_V(fftRepo.setProgramEntryPoints(Transpose_NONSQUARE, this->getSignatureData(), "transpose_square", "transpose_square", Device, QueueContext), _T("fftRepo.setProgramEntryPoint() failed!"));
323 }
324 else
325 {
326 if (this->signature.fft_3StepTwiddle)//if miniBatchSize > 1 twiddling is done in swap kernel
327 {
328 std::string kernelFwdFuncName = kernelFuncName + "_tw_fwd";
329 std::string kernelBwdFuncName = kernelFuncName + "_tw_back";
330 OPENCL_V(fftRepo.setProgramEntryPoints(Transpose_NONSQUARE, this->getSignatureData(), kernelFwdFuncName.c_str(), kernelBwdFuncName.c_str(), Device, QueueContext), _T("fftRepo.setProgramEntryPoint() failed!"));
331 }
332 else
333 OPENCL_V(fftRepo.setProgramEntryPoints(Transpose_NONSQUARE, this->getSignatureData(), kernelFuncName.c_str(), kernelFuncName.c_str(), Device, QueueContext), _T("fftRepo.setProgramEntryPoint() failed!"));
334 }
335 return CLFFT_SUCCESS;
336 }
337
338
getWorkSizes(std::vector<size_t> & globalWS,std::vector<size_t> & localWS)339 clfftStatus FFTGeneratedTransposeNonSquareAction::getWorkSizes(std::vector< size_t >& globalWS, std::vector< size_t >& localWS)
340 {
341
342 size_t wg_slice;
343 size_t smaller_dim = (this->signature.fft_N[0] < this->signature.fft_N[1]) ? this->signature.fft_N[0] : this->signature.fft_N[1];
344 size_t bigger_dim = (this->signature.fft_N[0] >= this->signature.fft_N[1]) ? this->signature.fft_N[0] : this->signature.fft_N[1];
345 size_t dim_ratio = bigger_dim / smaller_dim;
346 size_t global_item_size;
347
348 if (this->signature.nonSquareKernelType == NON_SQUARE_TRANS_TRANSPOSE_BATCHED_LEADING)
349 {
350 if (smaller_dim % (16 * reShapeFactor) == 0)
351 wg_slice = smaller_dim / 16 / reShapeFactor;
352 else
353 wg_slice = (smaller_dim / (16 * reShapeFactor)) + 1;
354
355 global_item_size = wg_slice*(wg_slice + 1) / 2 * 16 * 16 * this->plan->batchsize;
356
357 for (int i = 2; i < this->signature.fft_DataDim - 1; i++)
358 {
359 global_item_size *= this->signature.fft_N[i];
360 }
361
362 /*Push the data required for the transpose kernels*/
363 globalWS.clear();
364 if(this->signature.nonSquareKernelType == NON_SQUARE_TRANS_TRANSPOSE_BATCHED_LEADING)
365 globalWS.push_back(global_item_size * dim_ratio);
366 else if (this->signature.nonSquareKernelType == NON_SQUARE_TRANS_TRANSPOSE_BATCHED)
367 globalWS.push_back(global_item_size);
368
369
370 localWS.clear();
371 localWS.push_back(lwSize);
372 }
373 else if (this->signature.nonSquareKernelType == NON_SQUARE_TRANS_TRANSPOSE_BATCHED)
374 {
375 if (smaller_dim % (16 * reShapeFactor) == 0)
376 wg_slice = smaller_dim / 16 / reShapeFactor;
377 else
378 wg_slice = (smaller_dim / (16 * reShapeFactor)) + 1;
379
380 global_item_size = wg_slice*(wg_slice + 1) / 2 * 16 * 16 * this->plan->batchsize;
381
382 for (int i = 2; i < this->plan->length.size(); i++)
383 {
384 global_item_size *= this->plan->length[i];
385 }
386
387 /*Push the data required for the transpose kernels*/
388 globalWS.clear();
389 globalWS.push_back(global_item_size);
390
391
392 localWS.clear();
393 localWS.push_back(lwSize);
394 }
395 else
396 {
397 /*Now calculate the data for the swap kernels */
398 // general swap kernel takes care of all ratio. need clean up here
399 if(dim_ratio == 2 && 0){
400 //1:2 ratio
401 size_t input_elm_size_in_bytes;
402 switch (this->signature.fft_precision)
403 {
404 case CLFFT_SINGLE:
405 case CLFFT_SINGLE_FAST:
406 input_elm_size_in_bytes = 4;
407 break;
408 case CLFFT_DOUBLE:
409 case CLFFT_DOUBLE_FAST:
410 input_elm_size_in_bytes = 8;
411 break;
412 default:
413 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
414 }
415
416 switch (this->signature.fft_outputLayout)
417 {
418 case CLFFT_COMPLEX_INTERLEAVED:
419 case CLFFT_COMPLEX_PLANAR:
420 input_elm_size_in_bytes *= 2;
421 break;
422 case CLFFT_REAL:
423 break;
424 default:
425 return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
426 }
427 size_t max_elements_loaded = AVAIL_MEM_SIZE / input_elm_size_in_bytes;
428 size_t num_elements_loaded;
429 size_t local_work_size_swap, num_grps_pro_row;
430
431 if ((max_elements_loaded >> 1) > smaller_dim)
432 {
433 local_work_size_swap = (smaller_dim < 256) ? smaller_dim : 256;
434 num_elements_loaded = smaller_dim;
435 num_grps_pro_row = 1;
436 }
437 else
438 {
439 num_grps_pro_row = (smaller_dim << 1) / max_elements_loaded;
440 num_elements_loaded = max_elements_loaded >> 1;
441 local_work_size_swap = (num_elements_loaded < 256) ? num_elements_loaded : 256;
442 }
443 size_t num_reduced_row;
444 size_t num_reduced_col;
445
446 if (this->signature.fft_N[1] == smaller_dim)
447 {
448 num_reduced_row = smaller_dim;
449 num_reduced_col = 2;
450 }
451 else
452 {
453 num_reduced_row = 2;
454 num_reduced_col = smaller_dim;
455 }
456
457 size_t *cycle_map = new size_t[num_reduced_row * num_reduced_col * 2];
458 /* The memory required by cycle_map cannot exceed 2 times row*col by design*/
459 clfft_transpose_generator::get_cycles(cycle_map, num_reduced_row, num_reduced_col);
460
461 global_item_size = local_work_size_swap * num_grps_pro_row * cycle_map[0] * this->plan->batchsize;
462
463 for (int i = 2; i < this->signature.fft_DataDim - 1; i++)
464 {
465 global_item_size *= this->signature.fft_N[i];
466 }
467 delete[] cycle_map;
468
469 globalWS.push_back(global_item_size);
470 localWS.push_back(local_work_size_swap);
471 }
472 else
473 {
474 //if (dim_ratio == 2 || dim_ratio == 3 || dim_ratio == 5 || dim_ratio == 10)
475 if (dim_ratio % 2 == 0 || dim_ratio % 3 == 0 || dim_ratio % 5 == 0 || dim_ratio % 10 == 0)
476 {
477 size_t local_work_size_swap = 256;
478 std::vector<std::vector<size_t> > permutationTable;
479 clfft_transpose_generator::permutation_calculation(dim_ratio, smaller_dim, permutationTable);
480 size_t global_item_size;
481 if(this->plan->large1D && (dim_ratio > 1))
482 global_item_size = (permutationTable.size() + 2) * local_work_size_swap * this->plan->batchsize;
483 else
484 global_item_size = (permutationTable.size() + 2) * local_work_size_swap * this->plan->batchsize;
485 //for (int i = 2; i < this->plan->length.size(); i++)
486 // global_item_size *= this->plan->length[i];
487 size_t LDS_per_WG = smaller_dim;
488 while (LDS_per_WG > 1024)//avoiding using too much lds memory. the biggest LDS memory we will allocate would be 1024*sizeof(float2/double2)*2
489 {
490 if (LDS_per_WG % 2 == 0)
491 {
492 LDS_per_WG /= 2;
493 continue;
494 }
495 if (LDS_per_WG % 3 == 0)
496 {
497 LDS_per_WG /= 3;
498 continue;
499 }
500 if (LDS_per_WG % 5 == 0)
501 {
502 LDS_per_WG /= 5;
503 continue;
504 }
505 return CLFFT_NOTIMPLEMENTED;
506 }
507
508 size_t WG_per_line = smaller_dim / LDS_per_WG;
509 global_item_size *= WG_per_line;
510 globalWS.push_back(global_item_size);
511 localWS.push_back(local_work_size_swap);
512 }
513 else
514 return CLFFT_NOTIMPLEMENTED;
515 }
516 }
517 return CLFFT_SUCCESS;
518 }
519
FFTGeneratedTransposeSquareAction(clfftPlanHandle plHandle,FFTPlan * plan,cl_command_queue queue,clfftStatus & err)520 FFTGeneratedTransposeSquareAction::FFTGeneratedTransposeSquareAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err)
521 : FFTTransposeSquareAction(plHandle, plan, queue, err)
522 {
523 if (err != CLFFT_SUCCESS)
524 {
525 // FFTTransposeSquareAction() failed, exit
526 fprintf(stderr, "FFTTransposeSquareAction() failed!\n");
527 return;
528 }
529
530 // Initialize the FFTAction::FFTKernelGenKeyParams member
531 err = this->initParams();
532
533 if (err != CLFFT_SUCCESS)
534 {
535 fprintf(stderr, "FFTGeneratedTransposeSquareAction::initParams() failed!\n");
536 return;
537 }
538
539 FFTRepo &fftRepo = FFTRepo::getInstance();
540
541 err = this->generateKernel(fftRepo, queue);
542
543 if (err != CLFFT_SUCCESS)
544 {
545 fprintf(stderr, "FFTGeneratedTransposeSquareAction::generateKernel failed\n");
546 return;
547 }
548
549 err = compileKernels(queue, plHandle, plan);
550
551 if (err != CLFFT_SUCCESS)
552 {
553 fprintf(stderr, "FFTGeneratedTransposeSquareAction::compileKernels failed\n");
554 return;
555 }
556
557 err = CLFFT_SUCCESS;
558 }
559
560
buildForwardKernel()561 bool FFTGeneratedTransposeSquareAction::buildForwardKernel()
562 {
563 clfftLayout inputLayout = this->getSignatureData()->fft_inputLayout;
564 clfftLayout outputLayout = this->getSignatureData()->fft_outputLayout;
565
566 bool r2c_transform = (inputLayout == CLFFT_REAL);
567 bool c2r_transform = (outputLayout == CLFFT_REAL);
568 bool real_transform = (r2c_transform || c2r_transform);
569
570 return (!real_transform) || r2c_transform;
571 }
572
buildBackwardKernel()573 bool FFTGeneratedTransposeSquareAction::buildBackwardKernel()
574 {
575 clfftLayout inputLayout = this->getSignatureData()->fft_inputLayout;
576 clfftLayout outputLayout = this->getSignatureData()->fft_outputLayout;
577
578 bool r2c_transform = (inputLayout == CLFFT_REAL);
579 bool c2r_transform = (outputLayout == CLFFT_REAL);
580 bool real_transform = (r2c_transform || c2r_transform);
581
582 return (!real_transform) || c2r_transform;
583 }
584
/* square action */
initParams()586 clfftStatus FFTGeneratedTransposeSquareAction::initParams()
587 {
588
589 this->signature.fft_precision = this->plan->precision;
590 this->signature.fft_placeness = this->plan->placeness;
591 this->signature.fft_inputLayout = this->plan->inputLayout;
592 this->signature.fft_outputLayout = this->plan->outputLayout;
593 this->signature.fft_3StepTwiddle = false;
594
595 this->signature.fft_realSpecial = this->plan->realSpecial;
596
597 this->signature.transOutHorizontal = this->plan->transOutHorizontal; // using the twiddle front flag to specify horizontal write
598 // we do this so as to reuse flags in FFTKernelGenKeyParams
599 // and to avoid making a new one
600
601 ARG_CHECK(this->plan->inStride.size() == this->plan->outStride.size());
602
603 if (CLFFT_INPLACE == this->signature.fft_placeness)
604 {
605 // If this is an in-place transform the
606 // input and output layout, dimensions and strides
607 // *MUST* be the same.
608 //
609 ARG_CHECK(this->signature.fft_inputLayout == this->signature.fft_outputLayout)
610
611 for (size_t u = this->plan->inStride.size(); u-- > 0; )
612 {
613 ARG_CHECK(this->plan->inStride[u] == this->plan->outStride[u]);
614 }
615 }
616
617 this->signature.fft_DataDim = this->plan->length.size() + 1;
618 int i = 0;
619 for (i = 0; i < (this->signature.fft_DataDim - 1); i++)
620 {
621 this->signature.fft_N[i] = this->plan->length[i];
622 this->signature.fft_inStride[i] = this->plan->inStride[i];
623 this->signature.fft_outStride[i] = this->plan->outStride[i];
624
625 }
626 this->signature.fft_inStride[i] = this->plan->iDist;
627 this->signature.fft_outStride[i] = this->plan->oDist;
628
629 if (this->plan->large1D != 0) {
630 ARG_CHECK(this->signature.fft_N[0] != 0)
631 ARG_CHECK((this->plan->large1D % this->signature.fft_N[0]) == 0)
632 this->signature.fft_3StepTwiddle = true;
633 ARG_CHECK(this->plan->large1D == (this->signature.fft_N[1] * this->signature.fft_N[0]));
634 }
635
636 // Query the devices in this context for their local memory sizes
637 // How we generate a kernel depends on the *minimum* LDS size for all devices.
638 //
639 const FFTEnvelope * pEnvelope = NULL;
640 OPENCL_V(this->plan->GetEnvelope(&pEnvelope), _T("GetEnvelope failed"));
641 BUG_CHECK(NULL != pEnvelope);
642
643 // TODO: Since I am going with a 2D workgroup size now, I need a better check than this 1D use
644 // Check: CL_DEVICE_MAX_WORK_GROUP_SIZE/CL_KERNEL_WORK_GROUP_SIZE
645 // CL_DEVICE_MAX_WORK_ITEM_SIZES
646 this->signature.fft_R = 1; // Dont think i'll use
647 this->signature.fft_SIMD = pEnvelope->limit_WorkGroupSize; // Use devices maximum workgroup size
648
649 //Set callback if specified
650 if (this->plan->hasPreCallback)
651 {
652 this->signature.fft_hasPreCallback = true;
653 this->signature.fft_preCallback = this->plan->preCallback;
654 }
655 if (this->plan->hasPostCallback)
656 {
657 this->signature.fft_hasPostCallback = true;
658 this->signature.fft_postCallback = this->plan->postCallbackParam;
659 }
660 this->signature.limit_LocalMemSize = this->plan->envelope.limit_LocalMemSize;
661
662 this->signature.transposeMiniBatchSize = this->plan->transposeMiniBatchSize;
663 this->signature.transposeBatchSize = this->plan->batchsize;
664
665 return CLFFT_SUCCESS;
666 }
667
668
669 // OpenCL does not take unicode strings as input, so this routine returns only ASCII strings
670 // Feed this generator the FFTPlan, and it returns the generated program as a string
generateKernel(FFTRepo & fftRepo,const cl_command_queue commQueueFFT)671 clfftStatus FFTGeneratedTransposeSquareAction::generateKernel(FFTRepo& fftRepo, const cl_command_queue commQueueFFT)
672 {
673 //Requested local memory size by callback must not exceed the device LDS limits after factoring the LDS size required by main FFT kernel
674 if ((this->signature.fft_hasPreCallback && this->signature.fft_preCallback.localMemSize > 0) ||
675 (this->signature.fft_hasPostCallback && this->signature.fft_postCallback.localMemSize > 0))
676 {
677 assert(!(this->signature.fft_hasPreCallback && this->signature.fft_hasPostCallback));
678
679 bool validLDSSize = false;
680 size_t requestedCallbackLDS = 0;
681
682 if (this->signature.fft_hasPreCallback && this->signature.fft_preCallback.localMemSize > 0)
683 requestedCallbackLDS = this->signature.fft_preCallback.localMemSize;
684 else if (this->signature.fft_hasPostCallback && this->signature.fft_postCallback.localMemSize > 0)
685 requestedCallbackLDS = this->signature.fft_postCallback.localMemSize;
686
687 validLDSSize = ((2 * this->plan->ElementSize() * 16 * reShapeFactor * 16 * reShapeFactor) + requestedCallbackLDS) < this->plan->envelope.limit_LocalMemSize;
688
689 if (!validLDSSize)
690 {
691 fprintf(stderr, "Requested local memory size not available\n");
692 return CLFFT_INVALID_ARG_VALUE;
693 }
694 }
695
696 std::string programCode;
697 OPENCL_V(clfft_transpose_generator::genTransposeKernelBatched(this->signature, programCode, lwSize, reShapeFactor), _T("GenerateTransposeKernel() failed!"));
698
699 cl_int status = CL_SUCCESS;
700 cl_device_id Device = NULL;
701 status = clGetCommandQueueInfo(commQueueFFT, CL_QUEUE_DEVICE, sizeof(cl_device_id), &Device, NULL);
702 OPENCL_V(status, _T("clGetCommandQueueInfo failed"));
703
704 cl_context QueueContext = NULL;
705 status = clGetCommandQueueInfo(commQueueFFT, CL_QUEUE_CONTEXT, sizeof(cl_context), &QueueContext, NULL);
706 OPENCL_V(status, _T("clGetCommandQueueInfo failed"));
707
708
709 OPENCL_V(fftRepo.setProgramCode(Transpose_SQUARE, this->getSignatureData(), programCode, Device, QueueContext), _T("fftRepo.setclString() failed!"));
710
711 // Note: See genFunctionPrototype( )
712 if (this->signature.fft_3StepTwiddle)
713 {
714 OPENCL_V(fftRepo.setProgramEntryPoints(Transpose_SQUARE, this->getSignatureData(), "transpose_square_tw_fwd", "transpose_square_tw_back", Device, QueueContext), _T("fftRepo.setProgramEntryPoint() failed!"));
715 }
716 else
717 {
718 OPENCL_V(fftRepo.setProgramEntryPoints(Transpose_SQUARE, this->getSignatureData(), "transpose_square", "transpose_square", Device, QueueContext), _T("fftRepo.setProgramEntryPoint() failed!"));
719 }
720
721 return CLFFT_SUCCESS;
722 }
723
724
getWorkSizes(std::vector<size_t> & globalWS,std::vector<size_t> & localWS)725 clfftStatus FFTGeneratedTransposeSquareAction::getWorkSizes(std::vector< size_t >& globalWS, std::vector< size_t >& localWS)
726 {
727
728 size_t wg_slice;
729 if (this->signature.fft_N[0] % (16 * reShapeFactor) == 0)
730 wg_slice = this->signature.fft_N[0] / 16 / reShapeFactor;
731 else
732 wg_slice = (this->signature.fft_N[0] / (16 * reShapeFactor)) + 1;
733
734 size_t global_item_size = wg_slice*(wg_slice + 1) / 2 * 16 * 16 * this->plan->batchsize;
735
736 for (int i = 2; i < this->signature.fft_DataDim - 1; i++)
737 {
738 global_item_size *= this->signature.fft_N[i];
739 }
740
741 globalWS.clear();
742 globalWS.push_back(global_item_size);
743
744 localWS.clear();
745 localWS.push_back(lwSize);
746
747 return CLFFT_SUCCESS;
748 }
749