1 /* ************************************************************************
2 * Copyright 2013 Advanced Micro Devices, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 * ************************************************************************/
16
17 #include "stdafx.h"
18 #include <math.h>
19 #include "private.h"
20 #include "repo.h"
21 #include "plan.h"
22 #include "generator.stockham.h"
23 #include "../include/convenienceFunctions.h"
24
25 #include "action.h"
26 #include "fft_binary_lookup.h"
27
28 #define FFT_CACHE_DEBUG 0
29
30
31
FFTCopyAction(clfftPlanHandle plHandle,FFTPlan * plan,cl_command_queue queue,clfftStatus & err)32 FFTCopyAction::FFTCopyAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err)
33 : FFTAction(plan, err)
34 {
35 if (err != CLFFT_SUCCESS)
36 {
37 // FFTAction() failed, exit constructor
38 return;
39 }
40
41 err = CLFFT_SUCCESS;
42 }
43
FFTTransposeGCNAction(clfftPlanHandle plHandle,FFTPlan * plan,cl_command_queue queue,clfftStatus & err)44 FFTTransposeGCNAction::FFTTransposeGCNAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err)
45 : FFTAction(plan, err)
46 {
47 if (err != CLFFT_SUCCESS)
48 {
49 // FFTAction() failed, exit constructor
50 return;
51 }
52
53 err = CLFFT_SUCCESS;
54 }
55
FFTTransposeSquareAction(clfftPlanHandle plHandle,FFTPlan * plan,cl_command_queue queue,clfftStatus & err)56 FFTTransposeSquareAction::FFTTransposeSquareAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err)
57 : FFTAction(plan, err)
58 {
59 if (err != CLFFT_SUCCESS)
60 {
61 // FFTAction() failed, exit constructor
62 return;
63 }
64
65 err = CLFFT_SUCCESS;
66 }
67
FFTTransposeNonSquareAction(clfftPlanHandle plHandle,FFTPlan * plan,cl_command_queue queue,clfftStatus & err)68 FFTTransposeNonSquareAction::FFTTransposeNonSquareAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err)
69 : FFTAction(plan, err)
70 {
71 if (err != CLFFT_SUCCESS)
72 {
73 // FFTAction() failed, exit constructor
74 return;
75 }
76
77 err = CLFFT_SUCCESS;
78 }
79
FFTStockhamAction(clfftPlanHandle plHandle,FFTPlan * plan,cl_command_queue queue,clfftStatus & err)80 FFTStockhamAction::FFTStockhamAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err)
81 : FFTAction(plan, err)
82 {
83 if (err != CLFFT_SUCCESS)
84 {
85 // FFTAction() failed, exit constructor
86 return;
87 }
88
89 err = CLFFT_SUCCESS;
90 }
91
92
93
FFTAction(FFTPlan * fftPlan,clfftStatus & err)94 FFTAction::FFTAction(FFTPlan * fftPlan, clfftStatus & err)
95 : plan(fftPlan)
96 {
97 err = CLFFT_SUCCESS;
98 }
99
selectBufferArguments(FFTPlan * fftPlan,cl_mem * clInputBuffers,cl_mem * clOutputBuffers,std::vector<cl_mem> & inputBuff,std::vector<cl_mem> & outputBuff)100 clfftStatus FFTAction::selectBufferArguments(FFTPlan * fftPlan,
101 cl_mem* clInputBuffers,
102 cl_mem* clOutputBuffers,
103 std::vector< cl_mem > &inputBuff,
104 std::vector< cl_mem > &outputBuff)
105 {
106
107 // 1d with normal length will fall into the below category
108 // add: 2d transpose kernel will fall into here too.
109 inputBuff.reserve( 2 );
110 outputBuff.reserve( 2 );
111
112 // Decode the relevant properties from the plan paramter to figure out how many input/output buffers we have
113 switch( fftPlan->inputLayout )
114 {
115 case CLFFT_COMPLEX_INTERLEAVED:
116 {
117 switch( fftPlan->outputLayout )
118 {
119 case CLFFT_COMPLEX_INTERLEAVED:
120 {
121 if( fftPlan->placeness == CLFFT_INPLACE )
122 {
123 inputBuff.push_back( clInputBuffers[ 0 ] );
124 }
125 else
126 {
127 inputBuff.push_back( clInputBuffers[ 0 ] );
128 outputBuff.push_back( clOutputBuffers[ 0 ] );
129 }
130
131 break;
132 }
133 case CLFFT_COMPLEX_PLANAR:
134 {
135 if( fftPlan->placeness == CLFFT_INPLACE )
136 {
137 // Invalid to be an inplace transform, and go from 1 to 2 buffers
138 return CLFFT_INVALID_ARG_VALUE;
139 }
140 else
141 {
142 inputBuff.push_back( clInputBuffers[ 0 ] );
143
144 outputBuff.push_back( clOutputBuffers[ 0 ] );
145 outputBuff.push_back( clOutputBuffers[ 1 ] );
146 }
147
148 break;
149 }
150 case CLFFT_HERMITIAN_INTERLEAVED:
151 {
152 if( fftPlan->placeness == CLFFT_INPLACE )
153 {
154 return CLFFT_INVALID_ARG_VALUE;
155 }
156 else
157 {
158 inputBuff.push_back( clInputBuffers[ 0 ] );
159 outputBuff.push_back( clOutputBuffers[ 0 ] );
160 }
161
162 break;
163 }
164 case CLFFT_HERMITIAN_PLANAR:
165 {
166 if( fftPlan->placeness == CLFFT_INPLACE )
167 {
168 return CLFFT_INVALID_ARG_VALUE;
169 }
170 else
171 {
172 inputBuff.push_back( clInputBuffers[ 0 ] );
173
174 outputBuff.push_back( clOutputBuffers[ 0 ] );
175 outputBuff.push_back( clOutputBuffers[ 1 ] );
176 }
177
178 break;
179 }
180 case CLFFT_REAL:
181 {
182 if( fftPlan->placeness == CLFFT_INPLACE )
183 {
184 inputBuff.push_back( clInputBuffers[ 0 ] );
185 }
186 else
187 {
188 inputBuff.push_back( clInputBuffers[ 0 ] );
189 outputBuff.push_back( clOutputBuffers[ 0 ] );
190 }
191
192 break;
193 }
194 default:
195 {
196 // Don't recognize output layout
197 return CLFFT_INVALID_ARG_VALUE;
198 }
199 }
200
201 break;
202 }
203 case CLFFT_COMPLEX_PLANAR:
204 {
205 switch( fftPlan->outputLayout )
206 {
207 case CLFFT_COMPLEX_INTERLEAVED:
208 {
209 if( fftPlan->placeness == CLFFT_INPLACE )
210 {
211 return CLFFT_INVALID_ARG_VALUE;
212 }
213 else
214 {
215 inputBuff.push_back( clInputBuffers[ 0 ] );
216 inputBuff.push_back( clInputBuffers[ 1 ] );
217
218 outputBuff.push_back( clOutputBuffers[ 0 ] );
219 }
220
221 break;
222 }
223 case CLFFT_COMPLEX_PLANAR:
224 {
225 if( fftPlan->placeness == CLFFT_INPLACE )
226 {
227 inputBuff.push_back( clInputBuffers[ 0 ] );
228 inputBuff.push_back( clInputBuffers[ 1 ] );
229 }
230 else
231 {
232 inputBuff.push_back( clInputBuffers[ 0 ] );
233 inputBuff.push_back( clInputBuffers[ 1 ] );
234
235 outputBuff.push_back( clOutputBuffers[ 0 ] );
236 outputBuff.push_back( clOutputBuffers[ 1 ] );
237 }
238
239 break;
240 }
241 case CLFFT_HERMITIAN_INTERLEAVED:
242 {
243 if( fftPlan->placeness == CLFFT_INPLACE )
244 {
245 return CLFFT_INVALID_ARG_VALUE;
246 }
247 else
248 {
249 inputBuff.push_back( clInputBuffers[ 0 ] );
250 inputBuff.push_back( clInputBuffers[ 1 ] );
251
252 outputBuff.push_back( clOutputBuffers[ 0 ] );
253 }
254
255 break;
256 }
257 case CLFFT_HERMITIAN_PLANAR:
258 {
259 if( fftPlan->placeness == CLFFT_INPLACE )
260 {
261 return CLFFT_INVALID_ARG_VALUE;
262 }
263 else
264 {
265 inputBuff.push_back( clInputBuffers[ 0 ] );
266 inputBuff.push_back( clInputBuffers[ 1 ] );
267
268 outputBuff.push_back( clOutputBuffers[ 0 ] );
269 outputBuff.push_back( clOutputBuffers[ 1 ] );
270 }
271
272 break;
273 }
274 case CLFFT_REAL:
275 {
276 if( fftPlan->placeness == CLFFT_INPLACE )
277 {
278 return CLFFT_INVALID_ARG_VALUE;
279 }
280 else
281 {
282 inputBuff.push_back( clInputBuffers[ 0 ] );
283 inputBuff.push_back( clInputBuffers[ 1 ] );
284
285 outputBuff.push_back( clOutputBuffers[ 0 ] );
286 }
287
288 break;
289 }
290 default:
291 {
292 // Don't recognize output layout
293 return CLFFT_INVALID_ARG_VALUE;
294 }
295 }
296
297 break;
298 }
299 case CLFFT_HERMITIAN_INTERLEAVED:
300 {
301 switch( fftPlan->outputLayout )
302 {
303 case CLFFT_COMPLEX_INTERLEAVED:
304 {
305 if( fftPlan->placeness == CLFFT_INPLACE )
306 {
307 return CLFFT_INVALID_ARG_VALUE;
308 }
309 else
310 {
311 inputBuff.push_back( clInputBuffers[ 0 ] );
312 outputBuff.push_back( clOutputBuffers[ 0 ] );
313 }
314
315 break;
316 }
317 case CLFFT_COMPLEX_PLANAR:
318 {
319 if( fftPlan->placeness == CLFFT_INPLACE )
320 {
321 return CLFFT_INVALID_ARG_VALUE;
322 }
323 else
324 {
325 inputBuff.push_back( clInputBuffers[ 0 ] );
326
327 outputBuff.push_back( clOutputBuffers[ 0 ] );
328 outputBuff.push_back( clOutputBuffers[ 1 ] );
329 }
330
331 break;
332 }
333 case CLFFT_HERMITIAN_INTERLEAVED:
334 {
335 return CLFFT_INVALID_ARG_VALUE;
336 }
337 case CLFFT_HERMITIAN_PLANAR:
338 {
339 return CLFFT_INVALID_ARG_VALUE;
340 }
341 case CLFFT_REAL:
342 {
343 if( fftPlan->placeness == CLFFT_INPLACE )
344 {
345 inputBuff.push_back( clInputBuffers[ 0 ] );
346 }
347 else
348 {
349 inputBuff.push_back( clInputBuffers[ 0 ] );
350 outputBuff.push_back( clOutputBuffers[ 0 ] );
351 }
352
353 break;
354 }
355 default:
356 {
357 // Don't recognize output layout
358 return CLFFT_INVALID_ARG_VALUE;
359 }
360 }
361
362 break;
363 }
364 case CLFFT_HERMITIAN_PLANAR:
365 {
366 switch( fftPlan->outputLayout )
367 {
368 case CLFFT_COMPLEX_INTERLEAVED:
369 {
370 if( fftPlan->placeness == CLFFT_INPLACE )
371 {
372 return CLFFT_INVALID_ARG_VALUE;
373 }
374 else
375 {
376 inputBuff.push_back( clInputBuffers[ 0 ] );
377 inputBuff.push_back( clInputBuffers[ 1 ] );
378
379 outputBuff.push_back( clOutputBuffers[ 0 ] );
380 }
381
382 break;
383 }
384 case CLFFT_COMPLEX_PLANAR:
385 {
386 if( fftPlan->placeness == CLFFT_INPLACE )
387 {
388 return CLFFT_INVALID_ARG_VALUE;
389 }
390 else
391 {
392 inputBuff.push_back( clInputBuffers[ 0 ] );
393 inputBuff.push_back( clInputBuffers[ 1 ] );
394
395 outputBuff.push_back( clOutputBuffers[ 0 ] );
396 outputBuff.push_back( clOutputBuffers[ 1 ] );
397 }
398
399 break;
400 }
401 case CLFFT_HERMITIAN_INTERLEAVED:
402 {
403 return CLFFT_INVALID_ARG_VALUE;
404 }
405 case CLFFT_HERMITIAN_PLANAR:
406 {
407 return CLFFT_INVALID_ARG_VALUE;
408 }
409 case CLFFT_REAL:
410 {
411 if( fftPlan->placeness == CLFFT_INPLACE )
412 {
413 return CLFFT_INVALID_ARG_VALUE;
414 }
415 else
416 {
417 inputBuff.push_back( clInputBuffers[ 0 ] );
418 inputBuff.push_back( clInputBuffers[ 1 ] );
419
420 outputBuff.push_back( clOutputBuffers[ 0 ] );
421 }
422
423 break;
424 }
425 default:
426 {
427 // Don't recognize output layout
428 return CLFFT_INVALID_ARG_VALUE;
429 }
430 }
431
432 break;
433 }
434 case CLFFT_REAL:
435 {
436 switch( fftPlan->outputLayout )
437 {
438 case CLFFT_COMPLEX_INTERLEAVED:
439 {
440 if( fftPlan->placeness == CLFFT_INPLACE )
441 {
442 inputBuff.push_back( clInputBuffers[ 0 ] );
443 }
444 else
445 {
446 inputBuff.push_back( clInputBuffers[ 0 ] );
447 outputBuff.push_back( clOutputBuffers[ 0 ] );
448 }
449
450 break;
451 }
452 case CLFFT_COMPLEX_PLANAR:
453 {
454 if( fftPlan->placeness == CLFFT_INPLACE )
455 {
456 return CLFFT_INVALID_ARG_VALUE;
457 }
458 else
459 {
460 inputBuff.push_back( clInputBuffers[ 0 ] );
461
462 outputBuff.push_back( clOutputBuffers[ 0 ] );
463 outputBuff.push_back( clOutputBuffers[ 1 ] );
464 }
465
466 break;
467 }
468 case CLFFT_HERMITIAN_INTERLEAVED:
469 {
470 if( fftPlan->placeness == CLFFT_INPLACE )
471 {
472 inputBuff.push_back( clInputBuffers[ 0 ] );
473 }
474 else
475 {
476 inputBuff.push_back( clInputBuffers[ 0 ] );
477 outputBuff.push_back( clOutputBuffers[ 0 ] );
478 }
479
480 break;
481 }
482 case CLFFT_HERMITIAN_PLANAR:
483 {
484 if( fftPlan->placeness == CLFFT_INPLACE )
485 {
486 return CLFFT_INVALID_ARG_VALUE;
487 }
488 else
489 {
490 inputBuff.push_back( clInputBuffers[ 0 ] );
491
492 outputBuff.push_back( clOutputBuffers[ 0 ] );
493 outputBuff.push_back( clOutputBuffers[ 1 ] );
494 }
495
496 break;
497 }
498 default:
499 {
500 if(fftPlan->transflag)
501 {
502 if( fftPlan->placeness == CLFFT_INPLACE )
503 {
504 return CLFFT_INVALID_ARG_VALUE;
505 }
506 else
507 {
508 inputBuff.push_back( clInputBuffers[ 0 ] );
509 outputBuff.push_back( clOutputBuffers[ 0 ] );
510 }
511 }
512 else
513 {
514 // Don't recognize output layout
515 return CLFFT_INVALID_ARG_VALUE;
516 }
517 }
518 }
519
520 break;
521 }
522 default:
523 {
524 // Don't recognize output layout
525 return CLFFT_INVALID_ARG_VALUE;
526 }
527 }
528
529 return CLFFT_SUCCESS;
530 }
531
532
enqueue(clfftPlanHandle plHandle,clfftDirection dir,cl_uint numQueuesAndEvents,cl_command_queue * commQueues,cl_uint numWaitEvents,const cl_event * waitEvents,cl_event * outEvents,cl_mem * clInputBuffers,cl_mem * clOutputBuffers)533 clfftStatus FFTAction::enqueue(clfftPlanHandle plHandle,
534 clfftDirection dir,
535 cl_uint numQueuesAndEvents,
536 cl_command_queue* commQueues,
537 cl_uint numWaitEvents,
538 const cl_event* waitEvents,
539 cl_event* outEvents,
540 cl_mem* clInputBuffers,
541 cl_mem* clOutputBuffers)
542 {
543 FFTRepo & fftRepo = FFTRepo::getInstance();
544
545 std::vector< cl_mem > inputBuff;
546 std::vector< cl_mem > outputBuff;
547
548
549 clfftStatus status = selectBufferArguments(this->plan,
550 clInputBuffers, clOutputBuffers,
551 inputBuff, outputBuff);
552
553 if (status != CLFFT_SUCCESS)
554 {
555 return status;
556 }
557
558 // TODO: In the case of length == 1, FFT is a trivial NOP, but we still need to apply the forward and backwards tranforms
559 // TODO: Are map lookups expensive to call here? We can cache a pointer to the cl_program/cl_kernel in the plan
560
561 // Translate the user plan into the structure that we use to map plans to clPrograms
562
563 cl_program prog;
564 cl_kernel kern;
565 lockRAII* kernelLock;
566 OPENCL_V( fftRepo.getclProgram( this->getGenerator(), this->getSignatureData(), prog, this->plan->bakeDevice, this->plan->context ), _T( "fftRepo.getclProgram failed" ) );
567 OPENCL_V( fftRepo.getclKernel( prog, dir, kern, kernelLock), _T( "fftRepo.getclKernels failed" ) );
568
569 scopedLock sLock(*kernelLock, _T("FFTAction::enqueue"));
570
571 cl_uint uarg = 0;
572 if (!this->plan->transflag && !(this->plan->gen == Copy))
573 {
574 // ::clSetKernelArg() is not thread safe, according to the openCL spec for the same cl_kernel object
575 // TODO: Need to verify that two different plans (which would get through our lock above) with exactly the same
576 // parameters would NOT share the same cl_kernel objects
577
578 /* constant buffer */
579 OPENCL_V( clSetKernelArg( kern, uarg++, sizeof( cl_mem ), (void*)&this->plan->const_buffer ), _T( "clSetKernelArg failed" ) );
580 }
581
582 // Input buffer(s)
583 // Input may be 1 buffer (CLFFT_COMPLEX_INTERLEAVED)
584 // or 2 buffers (CLFFT_COMPLEX_PLANAR)
585
586 for (size_t i = 0; i < inputBuff.size(); ++i)
587 {
588 OPENCL_V( clSetKernelArg( kern, uarg++, sizeof( cl_mem ), (void*)&inputBuff[i] ), _T( "clSetKernelArg failed" ) );
589 }
590 // Output buffer(s)
591 // Output may be 0 buffers (CLFFT_INPLACE)
592 // or 1 buffer (CLFFT_COMPLEX_INTERLEAVED)
593 // or 2 buffers (CLFFT_COMPLEX_PLANAR)
594 for (size_t o = 0; o < outputBuff.size(); ++o)
595 {
596 OPENCL_V( clSetKernelArg( kern, uarg++, sizeof( cl_mem ), (void*)&outputBuff[o] ), _T( "clSetKernelArg failed" ) );
597 }
598
599 //If callback function is set for the plan, pass the appropriate aruments
600 if (this->plan->hasPreCallback || this->plan->hasPostCallback)
601 {
602 if (this->plan->hasPreCallback)
603 {
604 OPENCL_V( clSetKernelArg( kern, uarg++, sizeof( cl_mem ), (void*)&this->plan->precallUserData ), _T( "clSetKernelArg failed" ) );
605 }
606
607 //If post-callback function is set for the plan, pass the appropriate aruments
608 if (this->plan->hasPostCallback)
609 {
610 OPENCL_V( clSetKernelArg( kern, uarg++, sizeof( cl_mem ), (void*)&this->plan->postcallUserData ), _T( "clSetKernelArg failed" ) );
611 }
612
613 //Pass LDS size arument if set
614 if ((this->plan->hasPreCallback && this->plan->preCallback.localMemSize > 0) ||
615 (this->plan->hasPostCallback && this->plan->postCallbackParam.localMemSize > 0))
616 {
617 int localmemSize = 0;
618 if (this->plan->hasPreCallback && this->plan->preCallback.localMemSize > 0)
619 localmemSize = this->plan->preCallback.localMemSize;
620 if (this->plan->hasPostCallback && this->plan->postCallbackParam.localMemSize > 0)
621 localmemSize += this->plan->postCallbackParam.localMemSize;
622
623 OPENCL_V( clSetKernelArg( kern, uarg++, localmemSize, NULL ), _T( "clSetKernelArg failed" ) );
624 }
625 }
626
627 std::vector< size_t > gWorkSize;
628 std::vector< size_t > lWorkSize;
629 clfftStatus result = this->getWorkSizes (gWorkSize, lWorkSize);
630 //std::cout << "work sizes are " << gWorkSize[0] << ", " << lWorkSize[0] << std::endl;
631 /*
632 std::cout << "work sizes are ";
633 for (auto itor = gWorkSize.begin(); itor != gWorkSize.end(); itor++)
634 std::cout << *itor << " ";
635 std::cout << ", ";
636 for (auto itor = lWorkSize.begin(); itor != lWorkSize.end(); itor++)
637 std::cout << *itor << " ";
638 std::cout << std::endl;
639 */
640 // TODO: if getWorkSizes returns CLFFT_INVALID_GLOBAL_WORK_SIZE, that means
641 // that this multidimensional input data array is too large to be transformed
642 // with a single call to clEnqueueNDRangeKernel. For now, we will just return
643 // the error code back up the call stack.
644 // The *correct* course of action would be to split the work into mutliple
645 // calls to clEnqueueNDRangeKernel.
646 if (CLFFT_INVALID_GLOBAL_WORK_SIZE == result)
647 {
648 OPENCL_V( result, _T("Work size too large for clEnqueNDRangeKernel()"));
649 }
650 else
651 {
652 OPENCL_V( result, _T("FFTAction::getWorkSizes failed"));
653 }
654 BUG_CHECK (gWorkSize.size() == lWorkSize.size());
655
656
657 cl_int call_status = clEnqueueNDRangeKernel( *commQueues, kern, static_cast< cl_uint >( gWorkSize.size( ) ),
658 NULL, &gWorkSize[ 0 ], &lWorkSize[ 0 ], numWaitEvents, waitEvents, outEvents );
659 OPENCL_V( call_status, _T( "clEnqueueNDRangeKernel failed" ) );
660
661 if( fftRepo.pStatTimer )
662 {
663 fftRepo.pStatTimer->AddSample( plHandle, this->plan, kern, numQueuesAndEvents, outEvents, gWorkSize, lWorkSize );
664 }
665
666 return CLFFT_SUCCESS;
667 }
668
669
670
671 // Read the kernels that this plan uses from file, and store into the plan
writeKernel(const clfftPlanHandle plHandle,const clfftGenerators gen,const FFTKernelSignatureHeader * data,const cl_context & context,const cl_device_id & device)672 clfftStatus FFTAction::writeKernel( const clfftPlanHandle plHandle, const clfftGenerators gen, const FFTKernelSignatureHeader* data, const cl_context& context, const cl_device_id &device )
673 {
674 FFTRepo& fftRepo = FFTRepo::getInstance( );
675
676 std::string kernelPath = getKernelName(gen, plHandle, true);
677
678 // Logic to write string contents out to file
679 tofstreamRAII< std::ofstream, std::string > kernelFile( kernelPath.c_str( ) );
680 if( !kernelFile.get( ) )
681 {
682 std::cerr << "Failed to open kernel file for writing: " << kernelPath.c_str( ) << std::endl;
683 return CLFFT_FILE_CREATE_FAILURE;
684 }
685
686 std::string kernel;
687 OPENCL_V( fftRepo.getProgramCode( gen, data, kernel, device, context ), _T( "fftRepo.getProgramCode failed." ) );
688
689 kernelFile.get( ) << kernel << std::endl;
690
691 return CLFFT_SUCCESS;
692 }
693
694
695 // **************** TODO TODO TODO ***********************
696 // Making compileKernels function take in command queue parameter so we can build for 1 particular device only;
697 // this may not be desirable for persistent plans, where we may have to compile for all devices in the context;
698 // make changes appropriately before enabling persistent plans and then remove this comment
699
700 // Compile the kernels that this plan uses, and store into the plan
compileKernels(const cl_command_queue commQueueFFT,const clfftPlanHandle plHandle,FFTPlan * fftPlan)701 clfftStatus FFTAction::compileKernels( const cl_command_queue commQueueFFT, const clfftPlanHandle plHandle, FFTPlan* fftPlan )
702 {
703 cl_int status = 0;
704 size_t deviceListSize = 0;
705
706 FFTRepo& fftRepo = FFTRepo::getInstance( );
707
708 // create a cl program executable for the device associated with command queue
709 // Get the device
710 cl_device_id &q_device = fftPlan->bakeDevice;
711
712 cl_program program;
713 if( fftRepo.getclProgram( this->getGenerator(), this->getSignatureData(), program, q_device, fftPlan->context ) == CLFFT_INVALID_PROGRAM )
714 {
715 FFTBinaryLookup lookup (this->getGenerator(), plHandle, fftPlan->context, q_device);
716
717 lookup.variantRaw(this->getSignatureData(), this->getSignatureData()->datasize);
718
719 if (lookup.found())
720 {
721 #if FFT_CACHE_DEBUG
722 // debug message in debug mode to ensure that the cache is used
723 fprintf(stderr, "Kernel loaded from cache\n");
724 #endif
725
726 program = lookup.getProgram();
727 }
728 else
729 {
730 #if FFT_CACHE_DEBUG
731 fprintf(stderr, "Kernel built from source\n");
732 #endif
733
734 // If the user wishes us to write the kernels out to disk, we do so
735 if( fftRepo.setupData.debugFlags & CLFFT_DUMP_PROGRAMS )
736 {
737 OPENCL_V( writeKernel( plHandle, this->getGenerator(), this->getSignatureData(), fftPlan->context, fftPlan->bakeDevice ), _T( "writeKernel failed." ) );
738 }
739
740 std::string programCode;
741 OPENCL_V( fftRepo.getProgramCode( this->getGenerator(), this->getSignatureData(), programCode, q_device, fftPlan->context ), _T( "fftRepo.getProgramCode failed." ) );
742
743 const char* source = programCode.c_str();
744 program = clCreateProgramWithSource( fftPlan->context, 1, &source, NULL, &status );
745 OPENCL_V( status, _T( "clCreateProgramWithSource failed." ) );
746
747 // create a cl program executable for the device associated with command queue
748
749 #if defined(DEBUGGING)
750 status = clBuildProgram( program, 1, &q_device, "-g -cl-opt-disable", NULL, NULL); // good for debugging kernels
751
752 // if you have trouble creating smbols that GDB can pick up to set a breakpoint after kernels are loaded into memory
753 // this can be used to stop execution to allow you to set a breakpoint in a kernel after kernel symbols are in memory.
754 #ifdef DEBUG_BREAK_GDB
755 __debugbreak();
756 #endif
757 #else
758 status = clBuildProgram( program, 1, &q_device, "", NULL, NULL);
759 #endif
760 if( status != CL_SUCCESS )
761 {
762 if( status == CL_BUILD_PROGRAM_FAILURE )
763 {
764 size_t buildLogSize = 0;
765 OPENCL_V( clGetProgramBuildInfo( program, q_device, CL_PROGRAM_BUILD_LOG, 0, NULL, &buildLogSize ),
766 _T( "clGetProgramBuildInfo failed" ) );
767
768 std::vector< char > buildLog( buildLogSize );
769 ::memset( &buildLog[ 0 ], 0x0, buildLogSize );
770
771 OPENCL_V( clGetProgramBuildInfo( program, q_device, CL_PROGRAM_BUILD_LOG, buildLogSize, &buildLog[ 0 ], NULL ),
772 _T( "clGetProgramBuildInfo failed" ) );
773
774 std::cerr << "\n\t\t\tBUILD LOG\n";
775 std::cerr << "************************************************\n";
776 std::cerr << &buildLog[ 0 ] << std::endl;
777 std::cerr << "************************************************\n";
778 }
779
780 OPENCL_V( status, _T( "clBuildProgram failed" ) );
781 }
782
783 lookup.setProgram(program, source);
784 lookup.populateCache();
785 }
786
787 fftRepo.setclProgram( this->getGenerator(), this->getSignatureData(), program, q_device, fftPlan->context );
788
789
790 // For real transforms we compile either forward or backward kernel
791 bool buildFwdKernel = buildForwardKernel();
792 bool buildBwdKernel = buildBackwardKernel();
793
794 // get a kernel object handle for a kernel with the given name
795 cl_kernel kernel;
796 if( buildFwdKernel )
797 {
798 lockRAII *kernelLock;
799 if( fftRepo.getclKernel( program, CLFFT_FORWARD, kernel, kernelLock) == CLFFT_INVALID_KERNEL )
800 {
801 std::string entryPoint;
802 OPENCL_V( fftRepo.getProgramEntryPoint( this->getGenerator(), this->getSignatureData(), CLFFT_FORWARD, entryPoint, q_device, fftPlan->context ), _T( "fftRepo.getProgramEntryPoint failed." ) );
803
804 kernel = clCreateKernel( program, entryPoint.c_str( ), &status );
805 OPENCL_V( status, _T( "clCreateKernel failed" ) );
806
807 fftRepo.setclKernel( program, CLFFT_FORWARD, kernel );
808 }
809 }
810
811 if( buildBwdKernel )
812 {
813 lockRAII *kernelLock;
814 if( fftRepo.getclKernel( program, CLFFT_BACKWARD, kernel, kernelLock ) == CLFFT_INVALID_KERNEL )
815 {
816 std::string entryPoint;
817 OPENCL_V( fftRepo.getProgramEntryPoint( this->getGenerator(), this->getSignatureData(), CLFFT_BACKWARD, entryPoint, q_device, fftPlan->context ), _T( "fftRepo.getProgramEntryPoint failed." ) );
818
819 kernel = clCreateKernel( program, entryPoint.c_str( ), &status );
820 OPENCL_V( status, _T( "clCreateKernel failed" ) );
821
822 fftRepo.setclKernel( program, CLFFT_BACKWARD, kernel );
823 }
824 }
825 }
826
827 return CLFFT_SUCCESS;
828 }
829
830
831