1 /* ************************************************************************
2 * Copyright 2013 Advanced Micro Devices, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 * ************************************************************************/
16
17
18 #pragma once
19 #if !defined( AMD_CLFFT_plan_H )
20 #define AMD_CLFFT_plan_H
21 #include <cstring>
22 #include "private.h"
23 #include "lock.h"
24 #include "generator.h"
25
26 std::string getKernelName(const clfftGenerators gen, const clfftPlanHandle plHandle, bool withPlHandle);
27
// Tuning constants shared by the plan code and the kernel generator.
// TODO: These arbitrary parameters should be tuned for the type of GPU
//       being used. These values are probably OK for Radeon 58xx and 68xx.
namespace ARBITRARY {

enum {
    // clEnqueueNDRangeKernel accepts a multi-dimensional domain array.
    // The number of dimensions is arbitrary, but the OpenCL implementation
    // usually limits it to 3 (CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS).
    // The kernel generator also assumes a limit on the number of dimensions.
    MAX_DIMS = 3,

    // Workgroup size: the number of work items that share local data
    // storage (LDS). This number is best for Evergreen GPUs, but might
    // change in the future.
    SIMD_WIDTH = 64,

    // On AMD hardware, the low-order bits of the local_id enumerate the
    // work items that access LDS in parallel. Ideally, LDS arrays are padded
    // so that these work items access different banks of the LDS.
    // 2 ** LDS_BANK_BITS is the number of LDS banks.
    // If LDS_PADDING is non-zero, the kernel generator should pad the LDS
    // arrays to reduce or eliminate bank conflicts.
    LDS_BANK_BITS = 5,
    LDS_BANK_SIZE = (1 << LDS_BANK_BITS),
    LDS_PADDING   = false,  // true,

    // For best performance, each workgroup should use 1/IDEAL'th of the LDS
    // amount reported by clGetDeviceInfo(.. CL_DEVICE_LOCAL_MEM_SIZE, ...).
    // However, up to 1/MAX'th of LDS may be used per workgroup when that is
    // necessary to perform the FFT in a single pass instead of multiple
    // passes. These values are good for Evergreen GPUs, but might change in
    // the future.
    LDS_FRACTION_IDEAL = 6,  // i.e., 1/6th
    LDS_FRACTION_MAX   = 4,  // i.e., 1/4

    // This is the default value for FFTKernelGenKeyParams::fft_LdsComplex.
    // The generated kernels require a certain number of bytes of LDS for
    // each single precision complex number in the vector.
    // If LDS_COMPLEX, an LDS array of complex numbers (8 bytes each) is
    // declared and data is swapped between work items with a single barrier.
    // If !LDS_COMPLEX, an LDS array of scalar numbers (4 bytes each) is
    // declared and data is swapped between work items in two phases, with
    // extra barriers.
    // The former approach uses fewer instructions and barriers; the latter
    // uses half as much LDS space, so twice as many wavefronts can run in
    // parallel.
    LDS_COMPLEX = false,

    // Number of bits per row of matrix.
    TWIDDLE_DEE = 8
};

}  // namespace ARBITRARY
80
81
// Variant of a blocked FFT computation.
enum BlockComputeType
{
    BCT_C2C = 0,  // column to column
    BCT_C2R = 1,  // column to row
    BCT_R2C = 2   // row to column
};
88
89
// NonSquareKernelType: tags the kind of kernel used for a non-square transpose.
// NOTE(review): the per-enumerator meaning is only visible through the names
// here; confirm against the transpose generators before relying on it.
enum NonSquareTransposeKernelType
{
    NON_SQUARE_TRANS_PARENT                    = 0,
    NON_SQUARE_TRANS_TRANSPOSE_BATCHED_LEADING = 1,
    NON_SQUARE_TRANS_TRANSPOSE_BATCHED         = 2,
    NON_SQUARE_TRANS_SWAP                      = 3
};
98
/*
An in-place transpose with a 1:2 (or 2:1) dimension ratio can be conducted in
three ways:
  A. first run the line swapping kernels over the whole non-square matrix,
     then run a batched square transpose along the column dim
     (a 'real' batched transpose)
  B. first run a batched square transpose along the column dim
     (a 'real' batched transpose),
     then run the line swapping kernels over the whole non-square matrix
     (for the 2:1 case)
  C. first run a batched square transpose along the leading dim (row dim),
     then run the line swapping kernels over the whole non-square matrix
Note that the twiddle computation has to go at the beginning of the first
kernel or at the end of the second kernel.

If the leading dimension is bigger, it makes more sense (it is faster) to swap
lines first and then conduct the batched square transpose.
If the leading dimension is smaller, it makes more sense (it is faster) to
conduct the batched transpose and then swap lines.
*/
enum NON_SQUARE_KERNEL_ORDER
{
    NOT_A_TRANSPOSE            = 0,
    SWAP_AND_TRANSPOSE         = 1,  // A.
    TRANSPOSE_AND_SWAP         = 2,  // B.
    TRANSPOSE_LEADING_AND_SWAP = 3   // C.
};
119
120 #define CLFFT_CB_SIZE 32
121 #define CLFFT_MAX_INTERNAL_DIM 16
122
/*! @brief Holds the callback function string and the other metadata passed by the client
 *  @details The client registers a callback function, together with the parameters it
 *  requires, through clfftSetPlanCallback(). The library stores those values in this
 *  data structure.
 */
struct clfftCallbackParam_
{
    /*!< optional local memory size if needed by callback */
    int localMemSize;

    /*!< callback function name */
    const char* funcname;

    /*!< callback function in string form */
    const char* funcstring;
};

typedef struct clfftCallbackParam_ clfftCallbackParam;
133
134 struct FFTKernelGenKeyParams {
135 /*
136 * This structure distills a subset of the fftPlan data,
137 * including all information that is used to generate the OpenCL kernel.
138 * This structure can be used as a key to reusing kernels that have already
139 * been compiled.
140 */
141 size_t fft_DataDim; // Dimensionality of the data
142 size_t fft_N[CLFFT_MAX_INTERNAL_DIM]; // [0] is FFT size, e.g. 1024
143 // This must be <= size of LDS!
144 size_t fft_inStride [CLFFT_MAX_INTERNAL_DIM]; // input strides
145 size_t fft_outStride[CLFFT_MAX_INTERNAL_DIM]; // output strides
146
147 clfftResultLocation fft_placeness;
148 clfftLayout fft_inputLayout;
149 clfftLayout fft_outputLayout;
150 clfftPrecision fft_precision;
151 double fft_fwdScale;
152 double fft_backScale;
153
154 size_t fft_SIMD; // Assume this SIMD/workgroup size
155 size_t fft_LDSsize; // Limit the use of LDS to this many bytes.
156 size_t fft_R; // # of complex values to keep in working registers
157 // SIMD size * R must be <= size of LDS!
158
159 size_t fft_MaxWorkGroupSize; // Limit for work group size
160
161 bool fft_3StepTwiddle; // This is one pass of the "3-step" algorithm;
162 // so extra twiddles are applied on output.
163 bool fft_twiddleFront; // do twiddle scaling at the beginning pass
164
165 bool fft_realSpecial; // this is the flag to control the special case step (4th step)
166 // in the 5-step real 1D large breakdown
167 size_t fft_realSpecial_Nr;
168
169 bool fft_RCsimple;
170
171 bool transOutHorizontal; // tiles traverse the output buffer in horizontal direction
172
173 bool blockCompute;
174 BlockComputeType blockComputeType;
175 size_t blockSIMD;
176 size_t blockLDS;
177
178 NonSquareTransposeKernelType nonSquareKernelType;
179 // sometimes non square matrix are broken down into a number of
180 // square matrix during inplace transpose
181 // let's call this number transposeMiniBatchSize
182 // no user of the library should set its value
183 size_t transposeMiniBatchSize;
184 // transposeBatchSize is the number of batchs times transposeMiniBatchSzie
185 // no user of the library should set its value
186 size_t transposeBatchSize;
187 // no user of the library should set its value
188 NON_SQUARE_KERNEL_ORDER nonSquareKernelOrder;
189
190 bool fft_hasPreCallback;
191 clfftCallbackParam fft_preCallback;
192
193 bool fft_hasPostCallback;
194 clfftCallbackParam fft_postCallback;
195
196 cl_ulong limit_LocalMemSize;
197
198 // Default constructor
FFTKernelGenKeyParamsFFTKernelGenKeyParams199 FFTKernelGenKeyParams()
200 {
201 fft_DataDim = 0;
202 for(int i=0; i<CLFFT_MAX_INTERNAL_DIM; i++)
203 {
204 fft_N[i] = 0;
205 fft_inStride[i] = 0;
206 fft_outStride[i] = 0;
207 }
208
209 fft_placeness = CLFFT_OUTOFPLACE;
210 fft_inputLayout = CLFFT_COMPLEX_INTERLEAVED;
211 fft_outputLayout = CLFFT_COMPLEX_INTERLEAVED;
212 fft_precision = CLFFT_SINGLE;
213 fft_fwdScale = fft_backScale = 0.0;
214 fft_SIMD = 0;
215 fft_LDSsize = 0;
216 fft_R = 0;
217 fft_MaxWorkGroupSize = 0;
218 fft_3StepTwiddle = false;
219 fft_twiddleFront = false;
220
221 transOutHorizontal = false;
222
223 fft_realSpecial = false;
224 fft_realSpecial_Nr = 0;
225
226 fft_RCsimple = false;
227
228 blockCompute = false;
229 blockComputeType = BCT_C2C;
230 blockSIMD = 0;
231 blockLDS = 0;
232 nonSquareKernelType = NON_SQUARE_TRANS_PARENT;
233 transposeMiniBatchSize = 1;
234 transposeBatchSize = 1;
235 fft_hasPreCallback = false;
236 fft_hasPostCallback = false;
237 limit_LocalMemSize = 0;
238 }
239 };
240
241
242 // Sorting operator for struct FFTKernelGenKeyParams, such that it can be used in a map
243 bool operator<( const FFTKernelGenKeyParams& lhs, const FFTKernelGenKeyParams& rhs);
244
245 class FFTPlan;
246 class FFTRepo;
247
// Action ID: identifies the implementation behind an FFT action. It is stored
// in FFTKernelSignatureHeader::id.
enum FFTActionImplID
{
    FFT_DEFAULT_STOCKHAM_ACTION  = 0,
    FFT_DEFAULT_TRANSPOSE_ACTION = 1,
    FFT_DEFAULT_COPY_ACTION      = 2,
    FFT_STATIC_STOCKHAM_ACTION   = 3
};
256
257
258 //
259 // FFTKernelSignatureHeader
260 //
261 // This structure is a wrapper for the FFTKernelSignature.
262 // It stores the signature size and the action ID. This ensure that every
263 // FFTKernelSignature (even with an empty DATA) is unique
264 //
265 // This class is used as the return type of FFTAction::getSignatureData()
266 //
267 struct FFTKernelSignatureHeader
268 {
269 int datasize;
270 FFTActionImplID id;
271
272 //clfftLayout fft_inputLayout;
273 //clfftLayout fft_outputLayout;
274
FFTKernelSignatureHeaderFFTKernelSignatureHeader275 FFTKernelSignatureHeader(int size_, FFTActionImplID id_)
276 {
277 // Set to 0 the whole signature structure
278 ::memset(this, 0, size_);
279 datasize = size_;
280 id = id_;
281 }
282 };
283
284 //
285 // FFTKernelSignature
286 //
287 // This struct represents the signature of an action. An action signature
288 // stores (by inheritage):
289 // - the action ID
290 // - its signature data size
291 // - a set of bytes caracterizes a FFT action
292 //
293 // This template class FFTKernelSignature provides a simple mechanism to
294 // build a proper signature (see also in src/library/repo.h) from any POD type.
295 //
296 // It is used as a key in the different cache mecanisms (binary cache and
297 // dynamic cache managed by FFTRepo)
298 //
299 template <typename DATA, FFTActionImplID ID>
300 struct FFTKernelSignature : public FFTKernelSignatureHeader, public DATA
301 {
FFTKernelSignatureFFTKernelSignature302 FFTKernelSignature()
303 : FFTKernelSignatureHeader(sizeof(FFTKernelSignature<DATA, ID>), ID)
304 {
305 }
306 };
307
308
309
310 //
311 // FFTAction is the base class for all actions used to implement FFT computes
312 //
313 // An action basically implements some OpenCL related actions, for instance:
314 // - Generating a kernel source code from kernel characteristics
315 // - Computing the work-group local sizes according to a kernel
316 // - Enqueuing arguments to the kernel call
317 //
318 // Kernels generated and compiled by an action are stored in the different
319 // cache mechanism (see repo.h for the dynamic cache and fft_binary_lookup.h
320 // for disk cache for more information). Each cache entry can be obtained from
321 // the action signature which is set of byte characterizing the action itself.
322 //
323 // At the moment, FFTAction only implements the enqueue method which is
324 // inherited by every action subclasses. But it may change in the time. There
325 // are no clear rules and the choices made until now are still subject to
326 // change.
327 //
328 class FFTAction
329 {
330 public:
331 FFTAction(FFTPlan * plan, clfftStatus & err);
332
333 virtual clfftStatus enqueue(clfftPlanHandle plHandle,
334 clfftDirection dir,
335 cl_uint numQueuesAndEvents,
336 cl_command_queue* commQueues,
337 cl_uint numWaitEvents,
338 const cl_event* waitEvents,
339 cl_event* outEvents,
340 cl_mem* clInputBuffers,
341 cl_mem* clOutputBuffers);
342
343 protected:
344
345 virtual clfftGenerators getGenerator() = 0;
346
347 clfftStatus compileKernels ( const cl_command_queue commQueueFFT, const clfftPlanHandle plHandle, FFTPlan* fftPlan);
348 clfftStatus writeKernel ( const clfftPlanHandle plHandle, const clfftGenerators gen, const FFTKernelSignatureHeader* data, const cl_context& context, const cl_device_id &device);
349
350 virtual clfftStatus generateKernel ( FFTRepo & fftRepo, const cl_command_queue commQueueFFT) = 0;
351 virtual clfftStatus getWorkSizes ( std::vector<size_t> & globalws, std::vector<size_t> & localws) = 0;
352
353 virtual const FFTKernelSignatureHeader * getSignatureData() = 0;
354
355 FFTPlan * plan;
356
357 private:
358
359 clfftStatus selectBufferArguments(FFTPlan * plan,
360 cl_mem* clInputBuffers,
361 cl_mem* clOutputBuffers,
362 std::vector< cl_mem > &inputBuff,
363 std::vector< cl_mem > &outputBuff);
364
365 virtual bool buildForwardKernel() = 0;
366 virtual bool buildBackwardKernel() = 0;
367 };
368
369
370 // The "envelope" is a set of limits imposed by the hardware
371 // This will depend on the GPU(s) in the OpenCL context.
372 // If there are multiple devices, this should be the least
373 // common denominators.
374 //
375 struct FFTEnvelope {
376 cl_ulong limit_LocalMemSize;
377 // this is the minimum of CL_DEVICE_LOCAL_MEM_SIZE
378 size_t limit_Dimensions;
379 // this is the minimum of CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS
380 size_t limit_Size[8];
381 // these are the minimima of CL_DEVICE_MAX_WORK_ITEM_SIZES[0..n]
382 size_t limit_WorkGroupSize;
383 // this is the minimum of CL_DEVICE_MAX_WORK_GROUP_SIZE
384
385 // ?? CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE
386
FFTEnvelopeFFTEnvelope387 FFTEnvelope ()
388 : limit_LocalMemSize (0)
389 , limit_Dimensions (0)
390 , limit_WorkGroupSize (0)
391 {
392 ::memset( &limit_Size, 0, sizeof( limit_Size ) );
393 }
394 };
395
396
397 // This class contains objects that are specific to a particular FFT transform, and the data herein is useful
398 // for us to know ahead of transform time such that we can optimize for these settings
399 class FFTPlan
400 {
401
402 public:
403
404 bool baked;
405
406 // Properties provided by the user.
407 clfftDim dim;
408 clfftLayout inputLayout;
409 clfftLayout outputLayout;
410 clfftResultLocation placeness;
411 clfftResultTransposed transposed;
412 clfftPrecision precision;
413 cl_context context;
414 double forwardScale, backwardScale;
415 size_t iDist, oDist;
416 size_t batchsize;
417
418 // Note the device passed to BakePlan, assuming we are baking for one device
419 // TODO, change this logic for handling multiple GPUs/devices
420 cl_device_id bakeDevice;
421
422 // Disabling devices member, plan has 1-on-1 mapping with single device as identified by bakeDevice
423 // Devices that the user specified in the context passed to the create function
424 // std::vector< cl_device_id > devices;
425
426 // Length of the FFT in each dimension
427 std::vector< size_t > length;
428
429 // Stride of the FFT in each dimension
430 std::vector< size_t > inStride, outStride;
431
432 // Hardware Limits
433 FFTEnvelope envelope;
434
435
436 // Reserved copy for large 1d, 2d, and 3d plan
437 size_t tmpBufSize;
438 cl_mem intBuffer;
439 bool libCreatedIntBuffer;
440
441 // for RC copies
442 size_t tmpBufSizeRC;
443 cl_mem intBufferRC;
444
445 // for C-to-R transforms that are OUTOFPLACE
446 // we need this because the user supplied output buffer is not big enough
447 // to hold intermediate results for any problem other than normal 1D
448 size_t tmpBufSizeC2R;
449 cl_mem intBufferC2R;
450
451
452 size_t large1D;
453 bool large2D;
454 bool twiddleFront;
455
456 clfftPlanHandle planX;
457 clfftPlanHandle planY;
458 clfftPlanHandle planZ;
459
460 bool transflag;
461 bool transOutHorizontal;
462 clfftPlanHandle planTX;
463 clfftPlanHandle planTY;
464 clfftPlanHandle planTZ; //reserve for 3D transpose
465
466 clfftPlanHandle planRCcopy;
467 clfftPlanHandle planCopy;
468
469 // Plan resources
470 //
471 cl_mem const_buffer;
472
473 // Generator type
474 clfftGenerators gen;
475
476
477 // Real-Complex simple flag
478 // if this is set we do real to-and-from full complex using simple algorithm
479 // where imaginary of input is set to zero in forward and imaginary not written in backward
480 bool RCsimple;
481
482 // Real FFT special flag
483 // if this is set it means we are doing the 4th step in the 5-step real FFT breakdown algorithm
484 bool realSpecial;
485
486 size_t realSpecial_Nr; // this value stores the logical column height (N0) of matrix in the 4th step
487 // length[1] should be 1 + N0/2
488
489 // User created plan
490 bool userPlan;
491
492
493 // Allocate no extra memory
494 bool allOpsInplace;
495
496 // flag to indicate transpose placeness in 2D breakdown
497 bool transpose_in_2d_inplace;
498
499
500 // A flag to say that blocked FFTs are going to be performed
501 // It can only be one of these: column to row, row to column or column to column
502 // row to row is just the normal case where blocking is not needed
503 bool blockCompute;
504 BlockComputeType blockComputeType;
505
506 bool hasPreCallback;
507 bool hasPostCallback;
508
509 clfftCallbackParam preCallback;
510 clfftCallbackParam postCallbackParam;
511
512 cl_mem precallUserData;
513 cl_mem postcallUserData;
514
515 clfftPlanHandle plHandle;
516
517 // The action
518 FFTAction * action;
519
520 NonSquareTransposeKernelType nonSquareKernelType;
521 // sometimes non square matrix are broken down into a number of
522 // square matrix during inplace transpose
523 // let's call this number transposeMiniBatchSize
524 // no user of the library should set its value
525 size_t transposeMiniBatchSize;
526 NON_SQUARE_KERNEL_ORDER nonSquareKernelOrder;
527
FFTPlan()528 FFTPlan ()
529 : baked (false)
530 , dim (CLFFT_1D)
531 , inputLayout (CLFFT_COMPLEX_INTERLEAVED)
532 , outputLayout (CLFFT_COMPLEX_INTERLEAVED)
533 , placeness (CLFFT_INPLACE)
534 , transposed (CLFFT_NOTRANSPOSE)
535 , precision (CLFFT_SINGLE)
536 , context (NULL)
537 , forwardScale (1.0)
538 , backwardScale (1.0)
539 , iDist( 1 ), oDist( 1 )
540 , batchsize (1)
541 , tmpBufSize (0)
542 , intBuffer( NULL )
543 , libCreatedIntBuffer(false)
544 , tmpBufSizeRC (0)
545 , intBufferRC( NULL )
546 , tmpBufSizeC2R (0)
547 , intBufferC2R( NULL )
548 , large1D(0)
549 , large2D(false)
550 , twiddleFront(false)
551 , planX( 0 )
552 , planY( 0 )
553 , planZ( 0 )
554 , transflag(false)
555 , transOutHorizontal(false)
556 , RCsimple(false)
557 , realSpecial(false)
558 , realSpecial_Nr(0)
559 , userPlan(false)
560 , allOpsInplace(false)
561 , transpose_in_2d_inplace(false)
562 , blockCompute(false)
563 , blockComputeType(BCT_C2C)
564 , planTX( 0 )
565 , planTY( 0 )
566 , planTZ( 0 )
567 , planRCcopy(0)
568 , planCopy(0)
569 , const_buffer( NULL )
570 , gen(Stockham)
571 , action(0)
572 , nonSquareKernelType(NON_SQUARE_TRANS_PARENT)
573 , transposeMiniBatchSize(1)
574 , nonSquareKernelOrder(NOT_A_TRANSPOSE)
575 , plHandle(0)
576 , hasPreCallback(false)
577 , hasPostCallback(false)
578 {
579 };
580
581
582 size_t ElementSize() const;
583
584 clfftStatus AllocateBuffers ();
585 clfftStatus ReleaseBuffers ();
586
587 clfftStatus GetMax1DLength (size_t *longest ) const;
588
589 clfftStatus ConstructAndEnqueueConstantBuffers( cl_command_queue* commQueueFFT );
590
591 clfftStatus GetEnvelope (const FFTEnvelope **) const;
592 clfftStatus SetEnvelope ();
593
594 clfftStatus GetMax1DLengthStockham (size_t *longest ) const;
595
~FFTPlan()596 ~FFTPlan ()
597 {
598 ReleaseBuffers ();
599
600 if (action != NULL)
601 {
602 delete action;
603 action = 0;
604 }
605 }
606 };
607
// Decides whether a transform of 'length' points is accepted as a single 1D
// kernel. Returns false when:
//  - length exceeds large1DThreshold, or
//  - length is divisible by 3, 5 and 7 at the same time, or
//  - length is divisible by 11 and also by any of 13, 7, 5 or 3
//    (radix 11 & 2 is ok, anything else cannot be done in 1 kernel), or
//  - length is divisible by 13 and also by any of 11, 7, 5 or 3
//    (radix 13 & 2 is ok, anything else cannot be done in 1 kernel).
// Returns true otherwise. A length of 0 is divisible by everything and is
// therefore rejected.
static bool Is1DPossible(size_t length, size_t large1DThreshold)
{
    if (length > large1DThreshold)
        return false;

    const bool div3  = (length % 3  == 0);
    const bool div5  = (length % 5  == 0);
    const bool div7  = (length % 7  == 0);
    const bool div11 = (length % 11 == 0);
    const bool div13 = (length % 13 == 0);

    const bool anyOf357 = div7 || div5 || div3;

    const bool all357       = div7 && div5 && div3;
    const bool radix11Mixed = div11 && (div13 || anyOf357);
    const bool radix13Mixed = div13 && (div11 || anyOf357);

    return !(all357 || radix11Mixed || radix13Mixed);
}
626
627 #endif // AMD_CLFFT_plan_H
628
629