1 /* ************************************************************************
2  * Copyright 2013 Advanced Micro Devices, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  * ************************************************************************/
16 
17 
18 #pragma once
19 #if !defined( AMD_CLFFT_plan_H )
20 #define AMD_CLFFT_plan_H
21 #include <cstring>
22 #include "private.h"
23 #include "lock.h"
24 #include "generator.h"
25 
// Declaration only; the definition lives outside this header.
// NOTE(review): judging by the name, this builds the kernel name for generator `gen`,
// and `withPlHandle` presumably controls whether `plHandle` becomes part of that name --
// confirm against the definition.
std::string getKernelName(const clfftGenerators gen, const clfftPlanHandle plHandle, bool withPlHandle);
27 
namespace ARBITRARY {
	// TODO:  These tuning parameters are arbitrary and should be adjusted for the
	//	type of GPU being used.  The values below are probably OK for Radeon 58xx and 68xx.
	enum {
		//  clEnqueueNDRangeKernel accepts a multi-dimensional domain array.
		//  The # of dimensions is arbitrary, but the OpenCL implementation usually
		//  limits it to 3 (CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS).
		//  The kernel generator also assumes a limit on the # of dimensions.
		MAX_DIMS = 3,

		//  Workgroup size, i.e. the # of work items that share local data
		//  storage (LDS).  This # is best for Evergreen gpus, but might change
		//  in the future.
		SIMD_WIDTH = 64,

		//  On AMD hardware, the low-order bits of the local_id enumerate the
		//  work items that access LDS in parallel.  Ideally, LDS arrays are
		//  padded so that these work items hit different LDS banks.
		//  2 ** LDS_BANK_BITS is the number of LDS banks.
		LDS_BANK_BITS = 5,
		LDS_BANK_SIZE = (1 << LDS_BANK_BITS),

		//  If LDS_PADDING is non-zero, the kernel generator should pad the
		//  LDS arrays to reduce or eliminate bank conflicts.
		//  (Currently disabled; it used to be true.)
		LDS_PADDING = false,

		//  For best performance, each workgroup should use 1/IDEAL'th of the LDS
		//  reported by clGetDeviceInfo (.. CL_DEVICE_LOCAL_MEM_SIZE, ...).
		//  When necessary to perform the FFT in a single pass instead of multiple
		//  passes, up to 1/MAX'th of LDS may be used per workgroup.
		//  These are good values for Evergreen gpus, but might change in the future.
		LDS_FRACTION_IDEAL = 6,    // i.e., 1/6th
		LDS_FRACTION_MAX   = 4,    // i.e., 1/4

		//  Default value for FFTKernelGenKeyParams::fft_LdsComplex.
		//  It selects how many bytes of LDS the generated kernels need for each
		//  single precision complex number in the vector:
		//    If LDS_COMPLEX, an LDS array of complex numbers (8 bytes each) is
		//    declared and data is swapped between workitems with a single barrier.
		//    If ! LDS_COMPLEX, an LDS array of scalar numbers (4 bytes each) is
		//    declared and data is swapped in two phases, with extra barriers.
		//  The former approach uses fewer instructions and barriers;
		//  the latter uses half as much LDS space, so twice as many wavefronts
		//  can be run in parallel.
		LDS_COMPLEX = false,

		//  Number of bits per row of matrix.
		TWIDDLE_DEE = 8
	};

}
80 
81 
// Variants of a blocked FFT; enumerator values are spelled out explicitly.
enum BlockComputeType
{
	BCT_C2C = 0,	// Column to column
	BCT_C2R = 1,	// Column to row
	BCT_R2C = 2		// Row to column
};
88 
89 
// NonSquareKernelType: the kernel kinds involved in a non-square transpose.
enum NonSquareTransposeKernelType
{
    NON_SQUARE_TRANS_PARENT                    = 0,
    NON_SQUARE_TRANS_TRANSPOSE_BATCHED_LEADING = 1,
    NON_SQUARE_TRANS_TRANSPOSE_BATCHED         = 2,
    NON_SQUARE_TRANS_SWAP                      = 3
};
98 
/*
An inplace transpose with a 1:2 (or 2:1) dimension ratio can be conducted in three ways:
A. run the line swapping kernels over the whole non square matrix first,
   then a batched square transpose along the column dim (a 'real' batched transpose)
B. run a batched square transpose along the column dim first (a 'real' batched transpose),
   then the line swapping kernels over the whole non square matrix (for the 2:1 case)
C. run a batched square transpose along the leading dim (row dim) first,
   then the line swapping kernels over the whole non square matrix
Note that the twiddle computation has to go at the beginning of the first kernel
or at the end of the second kernel.

If the leading dimension is bigger, it is faster to swap lines first and then conduct
the batched square transpose; if the leading dimension is smaller, it is faster to
conduct the batched transpose first and then swap lines.
*/
enum NON_SQUARE_KERNEL_ORDER
{
	NOT_A_TRANSPOSE            = 0,
	SWAP_AND_TRANSPOSE         = 1, // A.
	TRANSPOSE_AND_SWAP         = 2, // B.
	TRANSPOSE_LEADING_AND_SWAP = 3  // C.
};
119 
// NOTE(review): CLFFT_CB_SIZE is not used anywhere in this header; presumably a
// callback-related size -- confirm at its use sites.
#define CLFFT_CB_SIZE 32
// Capacity of the per-dimension arrays (fft_N, fft_inStride, fft_outStride)
// in FFTKernelGenKeyParams.
#define CLFFT_MAX_INTERNAL_DIM 16
122 
/*! @brief Holds the callback function string and the accompanying metadata supplied by the client
*  @details The client registers a callback, together with its required parameters, through
*  clfftSetPlanCallback(). The library stores those values in this data structure.
*/
struct clfftCallbackParam_
{
	/*! optional local memory size if needed by callback */
	int localMemSize;
	/*! callback function name */
	const char* funcname;
	/*! callback function in string form */
	const char* funcstring;
};
typedef struct clfftCallbackParam_ clfftCallbackParam;
133 
134 struct FFTKernelGenKeyParams {
135 	/*
136 	 *	This structure distills a subset of the fftPlan data,
137 	 *	including all information that is used to generate the OpenCL kernel.
138 	 *	This structure can be used as a key to reusing kernels that have already
139 	 *	been compiled.
140 	 */
141 	size_t                   fft_DataDim;       // Dimensionality of the data
142 	size_t                   fft_N[CLFFT_MAX_INTERNAL_DIM];          // [0] is FFT size, e.g. 1024
143 	                                            // This must be <= size of LDS!
144 	size_t                   fft_inStride [CLFFT_MAX_INTERNAL_DIM];  // input strides
145 	size_t                   fft_outStride[CLFFT_MAX_INTERNAL_DIM];  // output strides
146 
147 	clfftResultLocation   fft_placeness;
148 	clfftLayout           fft_inputLayout;
149 	clfftLayout           fft_outputLayout;
150 	clfftPrecision        fft_precision;
151 	double                   fft_fwdScale;
152 	double                   fft_backScale;
153 
154 	size_t                   fft_SIMD;          // Assume this SIMD/workgroup size
155 	size_t                   fft_LDSsize;       // Limit the use of LDS to this many bytes.
156 	size_t                   fft_R;             // # of complex values to keep in working registers
157 	                                            // SIMD size * R must be <= size of LDS!
158 
159 	size_t					 fft_MaxWorkGroupSize; // Limit for work group size
160 
161 	bool                     fft_3StepTwiddle;  // This is one pass of the "3-step" algorithm;
162 	                                            // so extra twiddles are applied on output.
163 	bool					 fft_twiddleFront;	// do twiddle scaling at the beginning pass
164 
165 	bool					 fft_realSpecial;	// this is the flag to control the special case step (4th step)
166 	                                            // in the 5-step real 1D large breakdown
167 	size_t					 fft_realSpecial_Nr;
168 
169 	bool                     fft_RCsimple;
170 
171 	bool					 transOutHorizontal;	// tiles traverse the output buffer in horizontal direction
172 
173 	bool					 blockCompute;
174 	BlockComputeType		 blockComputeType;
175 	size_t					 blockSIMD;
176 	size_t					 blockLDS;
177 
178 	NonSquareTransposeKernelType      nonSquareKernelType;
179 	// sometimes non square matrix are broken down into a number of
180 	// square matrix during inplace transpose
181 	// let's call this number transposeMiniBatchSize
182 	// no user of the library should set its value
183 	size_t transposeMiniBatchSize;
184 	// transposeBatchSize is the number of batchs times transposeMiniBatchSzie
185 	// no user of the library should set its value
186 	size_t transposeBatchSize;
187 	// no user of the library should set its value
188 	NON_SQUARE_KERNEL_ORDER nonSquareKernelOrder;
189 
190 	bool fft_hasPreCallback;
191 	clfftCallbackParam fft_preCallback;
192 
193 	bool fft_hasPostCallback;
194 	clfftCallbackParam fft_postCallback;
195 
196 	cl_ulong   limit_LocalMemSize;
197 
198 	// Default constructor
FFTKernelGenKeyParamsFFTKernelGenKeyParams199 	FFTKernelGenKeyParams()
200 	{
201 		fft_DataDim = 0;
202 		for(int i=0; i<CLFFT_MAX_INTERNAL_DIM; i++)
203 		{
204 			fft_N[i] = 0;
205 			fft_inStride[i] = 0;
206 			fft_outStride[i] = 0;
207 		}
208 
209 		fft_placeness = CLFFT_OUTOFPLACE;
210 		fft_inputLayout = CLFFT_COMPLEX_INTERLEAVED;
211 		fft_outputLayout = CLFFT_COMPLEX_INTERLEAVED;
212 		fft_precision = CLFFT_SINGLE;
213 		fft_fwdScale = fft_backScale = 0.0;
214 		fft_SIMD = 0;
215 		fft_LDSsize = 0;
216 		fft_R = 0;
217 		fft_MaxWorkGroupSize = 0;
218 		fft_3StepTwiddle = false;
219 		fft_twiddleFront = false;
220 
221 		transOutHorizontal = false;
222 
223 		fft_realSpecial = false;
224 		fft_realSpecial_Nr = 0;
225 
226 		fft_RCsimple = false;
227 
228 		blockCompute = false;
229 		blockComputeType = BCT_C2C;
230 		blockSIMD = 0;
231 		blockLDS = 0;
232         nonSquareKernelType = NON_SQUARE_TRANS_PARENT;
233 		transposeMiniBatchSize = 1;
234 		transposeBatchSize = 1;
235 		fft_hasPreCallback = false;
236 		fft_hasPostCallback = false;
237 		limit_LocalMemSize = 0;
238 	}
239 };
240 
241 
//	Sorting operator for struct FFTKernelGenKeyParams, such that it can be used in a map
//	(declaration only; the ordering itself is implemented outside this header)
bool operator<( const FFTKernelGenKeyParams& lhs, const FFTKernelGenKeyParams& rhs);

//	Forward declarations: FFTPlan is defined further down in this header;
//	FFTRepo is only referenced here (FFTAction::generateKernel takes it by reference).
class	FFTPlan;
class   FFTRepo;
247 
// Action ID: identifies the implementation behind an action. It is stored in
// FFTKernelSignatureHeader::id as part of every kernel signature.
enum FFTActionImplID
{
    FFT_DEFAULT_STOCKHAM_ACTION  = 0,
    FFT_DEFAULT_TRANSPOSE_ACTION = 1,
    FFT_DEFAULT_COPY_ACTION      = 2,
    FFT_STATIC_STOCKHAM_ACTION   = 3
};
256 
257 
258 //
259 // FFTKernelSignatureHeader
260 //
261 // This structure is a wrapper for the FFTKernelSignature.
262 // It stores the signature size and the action ID. This ensure that every
263 // FFTKernelSignature (even with an empty DATA) is unique
264 //
265 // This class is used as the return type of FFTAction::getSignatureData()
266 //
267 struct FFTKernelSignatureHeader
268 {
269     int datasize;
270     FFTActionImplID id;
271 
272     //clfftLayout           fft_inputLayout;
273     //clfftLayout           fft_outputLayout;
274 
FFTKernelSignatureHeaderFFTKernelSignatureHeader275     FFTKernelSignatureHeader(int size_, FFTActionImplID id_)
276     {
277         // Set to 0 the whole signature structure
278         ::memset(this, 0, size_);
279         datasize = size_;
280         id = id_;
281     }
282 };
283 
284 //
285 // FFTKernelSignature
286 //
287 // This struct represents the signature of an action. An action signature
288 // stores (by inheritage):
289 //  - the action ID
290 //  - its signature data size
291 //  - a set of bytes caracterizes a FFT action
292 //
293 // This template class FFTKernelSignature provides a simple mechanism to
294 // build a proper signature (see also in src/library/repo.h) from any POD type.
295 //
296 // It is used as a key in the different cache mecanisms (binary cache and
297 // dynamic cache managed by FFTRepo)
298 //
299 template <typename DATA, FFTActionImplID ID>
300 struct FFTKernelSignature : public FFTKernelSignatureHeader, public DATA
301 {
FFTKernelSignatureFFTKernelSignature302     FFTKernelSignature()
303         : FFTKernelSignatureHeader(sizeof(FFTKernelSignature<DATA, ID>), ID)
304     {
305     }
306 };
307 
308 
309 
310 //
311 // FFTAction is the base class for all actions used to implement FFT computes
312 //
313 // An action basically implements some OpenCL related actions, for instance:
314 //  - Generating a kernel source code from kernel characteristics
315 //  - Computing the work-group local sizes according to a kernel
316 //  - Enqueuing arguments to the kernel call
317 //
318 // Kernels generated and compiled by an action are stored in the different
319 // cache mechanism (see repo.h for the dynamic cache and fft_binary_lookup.h
320 // for disk cache for more information). Each cache entry can be obtained from
321 // the action signature which is set of byte characterizing the action itself.
322 //
323 // At the moment, FFTAction only implements the enqueue method which is
324 // inherited by every action subclasses. But it may change in the time. There
325 // are no clear rules and the choices made until now are still subject to
326 // change.
327 //
328 class FFTAction
329 {
330 public:
331     FFTAction(FFTPlan * plan, clfftStatus & err);
332 
333     virtual clfftStatus enqueue(clfftPlanHandle plHandle,
334                                 clfftDirection dir,
335                                 cl_uint numQueuesAndEvents,
336                                 cl_command_queue* commQueues,
337                                 cl_uint numWaitEvents,
338                                 const cl_event* waitEvents,
339                                 cl_event* outEvents,
340                                 cl_mem* clInputBuffers,
341                                 cl_mem* clOutputBuffers);
342 
343 protected:
344 
345     virtual clfftGenerators                getGenerator() = 0;
346 
347     clfftStatus                            compileKernels  ( const cl_command_queue commQueueFFT, const clfftPlanHandle plHandle, FFTPlan* fftPlan);
348     clfftStatus                            writeKernel     ( const clfftPlanHandle plHandle, const clfftGenerators gen, const FFTKernelSignatureHeader* data, const cl_context& context, const cl_device_id &device);
349 
350     virtual clfftStatus                    generateKernel  ( FFTRepo & fftRepo, const cl_command_queue commQueueFFT) = 0;
351     virtual clfftStatus                    getWorkSizes    ( std::vector<size_t> & globalws, std::vector<size_t> & localws) = 0;
352 
353     virtual const FFTKernelSignatureHeader * getSignatureData() = 0;
354 
355     FFTPlan * plan;
356 
357 private:
358 
359     clfftStatus selectBufferArguments(FFTPlan * plan,
360                                       cl_mem* clInputBuffers,
361                                       cl_mem* clOutputBuffers,
362                                       std::vector< cl_mem > &inputBuff,
363                                       std::vector< cl_mem > &outputBuff);
364 
365     virtual bool buildForwardKernel() = 0;
366     virtual bool buildBackwardKernel() = 0;
367 };
368 
369 
370 //	The "envelope" is a set of limits imposed by the hardware
371 //	This will depend on the GPU(s) in the OpenCL context.
372 //	If there are multiple devices, this should be the least
373 //	common denominators.
374 //
375 struct FFTEnvelope {
376 	cl_ulong   limit_LocalMemSize;
377 	           //  this is the minimum of CL_DEVICE_LOCAL_MEM_SIZE
378 	size_t     limit_Dimensions;
379 	           //  this is the minimum of CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS
380 	size_t     limit_Size[8];
381 	           //  these are the minimima of CL_DEVICE_MAX_WORK_ITEM_SIZES[0..n]
382 	size_t     limit_WorkGroupSize;
383 	           //  this is the minimum of CL_DEVICE_MAX_WORK_GROUP_SIZE
384 
385 	// ??  CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE
386 
FFTEnvelopeFFTEnvelope387 	FFTEnvelope ()
388 	:	limit_LocalMemSize (0)
389 	,	limit_Dimensions (0)
390 	,	limit_WorkGroupSize (0)
391 	{
392 		::memset( &limit_Size, 0, sizeof( limit_Size ) );
393 	}
394 };
395 
396 
397 //	This class contains objects that are specific to a particular FFT transform, and the data herein is useful
398 //	for us to know ahead of transform time such that we can optimize for these settings
399 class	FFTPlan
400 {
401 
402 public:
403 
404 	bool baked;
405 
406 	//	Properties provided by the user.
407 	clfftDim             dim;
408 	clfftLayout          inputLayout;
409 	clfftLayout          outputLayout;
410 	clfftResultLocation  placeness;
411 	clfftResultTransposed transposed;
412 	clfftPrecision       precision;
413 	cl_context              context;
414 	double                  forwardScale, backwardScale;
415 	size_t                  iDist, oDist;
416 	size_t                  batchsize;
417 
418 	// Note the device passed to BakePlan, assuming we are baking for one device
419 	// TODO, change this logic for handling multiple GPUs/devices
420 	cl_device_id bakeDevice;
421 
422 	// Disabling devices member, plan has 1-on-1 mapping with single device as identified by bakeDevice
423 	//	Devices that the user specified in the context passed to the create function
424 	// std::vector< cl_device_id > devices;
425 
426 	//	Length of the FFT in each dimension
427 	std::vector< size_t >	length;
428 
429 	//	Stride of the FFT in each dimension
430 	std::vector< size_t >	inStride, outStride;
431 
432 	//	Hardware Limits
433 	FFTEnvelope                 envelope;
434 
435 
436 	// Reserved copy for large 1d, 2d, and 3d plan
437 	size_t tmpBufSize;
438 	cl_mem intBuffer;
439 	bool libCreatedIntBuffer;
440 
441 	// for RC copies
442 	size_t	tmpBufSizeRC;
443 	cl_mem	intBufferRC;
444 
445 	// for C-to-R transforms that are OUTOFPLACE
446 	// we need this because the user supplied output buffer is not big enough
447 	// to hold intermediate results for any problem other than normal 1D
448 	size_t  tmpBufSizeC2R;
449 	cl_mem  intBufferC2R;
450 
451 
452 	size_t  large1D;
453 	bool    large2D;
454 	bool	twiddleFront;
455 
456 	clfftPlanHandle planX;
457 	clfftPlanHandle planY;
458 	clfftPlanHandle planZ;
459 
460 	bool transflag;
461 	bool transOutHorizontal;
462 	clfftPlanHandle planTX;
463 	clfftPlanHandle planTY;
464 	clfftPlanHandle planTZ; //reserve for 3D transpose
465 
466 	clfftPlanHandle planRCcopy;
467 	clfftPlanHandle planCopy;
468 
469 	// Plan resources
470 	//
471 	cl_mem const_buffer;
472 
473 	// Generator type
474 	clfftGenerators gen;
475 
476 
477 	// Real-Complex simple flag
478 	// if this is set we do real to-and-from full complex using simple algorithm
479 	// where imaginary of input is set to zero in forward and imaginary not written in backward
480 	bool RCsimple;
481 
482 	// Real FFT special flag
483 	// if this is set it means we are doing the 4th step in the 5-step real FFT breakdown algorithm
484 	bool realSpecial;
485 
486 	size_t realSpecial_Nr; // this value stores the logical column height (N0) of matrix in the 4th step
487 	                       // length[1] should be 1 + N0/2
488 
489 	// User created plan
490 	bool userPlan;
491 
492 
493 	// Allocate no extra memory
494 	bool allOpsInplace;
495 
496 	// flag to indicate transpose placeness in 2D breakdown
497 	bool transpose_in_2d_inplace;
498 
499 
500 	// A flag to say that blocked FFTs are going to be performed
501 	// It can only be one of these: column to row, row to column or column to column
502 	// row to row is just the normal case where blocking is not needed
503 	bool blockCompute;
504 	BlockComputeType blockComputeType;
505 
506 	bool hasPreCallback;
507 	bool hasPostCallback;
508 
509 	clfftCallbackParam preCallback;
510 	clfftCallbackParam postCallbackParam;
511 
512 	cl_mem precallUserData;
513 	cl_mem postcallUserData;
514 
515     clfftPlanHandle plHandle;
516 
517     // The action
518     FFTAction * action;
519 
520     NonSquareTransposeKernelType nonSquareKernelType;
521 	// sometimes non square matrix are broken down into a number of
522 	// square matrix during inplace transpose
523 	// let's call this number transposeMiniBatchSize
524 	// no user of the library should set its value
525 	size_t transposeMiniBatchSize;
526 	NON_SQUARE_KERNEL_ORDER nonSquareKernelOrder;
527 
FFTPlan()528 	FFTPlan ()
529 	:	baked (false)
530 	,	dim (CLFFT_1D)
531 	,	inputLayout (CLFFT_COMPLEX_INTERLEAVED)
532 	,	outputLayout (CLFFT_COMPLEX_INTERLEAVED)
533 	,	placeness (CLFFT_INPLACE)
534 	,   transposed (CLFFT_NOTRANSPOSE)
535 	,	precision (CLFFT_SINGLE)
536 	,	context (NULL)
537 	,	forwardScale (1.0)
538 	,	backwardScale (1.0)
539 	,	iDist( 1 ), oDist( 1 )
540 	,	batchsize (1)
541 	,   tmpBufSize (0)
542 	,	intBuffer( NULL )
543 	,	libCreatedIntBuffer(false)
544 	,	tmpBufSizeRC (0)
545 	,	intBufferRC( NULL )
546 	,	tmpBufSizeC2R (0)
547 	,	intBufferC2R( NULL )
548 	,   large1D(0)
549 	,   large2D(false)
550 	,	twiddleFront(false)
551 	,   planX( 0 )
552 	,   planY( 0 )
553 	,   planZ( 0 )
554 	,   transflag(false)
555 	,	transOutHorizontal(false)
556 	,	RCsimple(false)
557 	,	realSpecial(false)
558 	,	realSpecial_Nr(0)
559 	,	userPlan(false)
560 	,	allOpsInplace(false)
561 	,	transpose_in_2d_inplace(false)
562 	,	blockCompute(false)
563 	,	blockComputeType(BCT_C2C)
564 	,   planTX( 0 )
565 	,   planTY( 0 )
566 	,   planTZ( 0 )
567 	,	planRCcopy(0)
568 	,	planCopy(0)
569 	,	const_buffer( NULL )
570 	,	gen(Stockham)
571     ,   action(0)
572     ,   nonSquareKernelType(NON_SQUARE_TRANS_PARENT)
573 	,   transposeMiniBatchSize(1)
574 	,   nonSquareKernelOrder(NOT_A_TRANSPOSE)
575     ,   plHandle(0)
576 	,   hasPreCallback(false)
577 	,   hasPostCallback(false)
578 	{
579 	};
580 
581 
582 	size_t ElementSize() const;
583 
584 	clfftStatus AllocateBuffers ();
585 	clfftStatus ReleaseBuffers ();
586 
587 	clfftStatus GetMax1DLength (size_t *longest ) const;
588 
589 	clfftStatus ConstructAndEnqueueConstantBuffers( cl_command_queue* commQueueFFT );
590 
591 	clfftStatus GetEnvelope (const FFTEnvelope **) const;
592 	clfftStatus SetEnvelope ();
593 
594 	clfftStatus GetMax1DLengthStockham (size_t *longest ) const;
595 
~FFTPlan()596 	~FFTPlan ()
597 	{
598 		ReleaseBuffers ();
599 
600 		if (action != NULL)
601 		{
602 			delete action;
603 			action = 0;
604 		}
605 	}
606 };
607 
// Tells whether an FFT of `length` points can be done in 1 kernel.
// Returns false when `length` exceeds `large1DThreshold` or when its factors
// form one of the radix combinations that cannot be done in 1 kernel.
static bool Is1DPossible(size_t length, size_t large1DThreshold)
{
	if (length > large1DThreshold)
		return false;

	const bool div3  = (length %  3) == 0;
	const bool div5  = (length %  5) == 0;
	const bool div7  = (length %  7) == 0;
	const bool div11 = (length % 11) == 0;
	const bool div13 = (length % 13) == 0;

	const bool anyOf357 = div3 || div5 || div7;

	// 3, 5 and 7 all present at the same time
	const bool all357 = div3 && div5 && div7;

	// radix 11 & 2 is ok, anything else we cannot do in 1 kernel
	const bool mixed11 = div11 && (div13 || anyOf357);

	// radix 13 & 2 is ok, anything else we cannot do in 1 kernel
	const bool mixed13 = div13 && (div11 || anyOf357);

	return !(all357 || mixed11 || mixed13);
}
626 
627 #endif // AMD_CLFFT_plan_H
628 
629