1 /* ************************************************************************
2  * Copyright 2013 Advanced Micro Devices, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  * ************************************************************************/
16 
17 #include "stdafx.h"
18 #include <math.h>
19 #include "private.h"
20 #include "repo.h"
21 #include "plan.h"
22 #include "generator.stockham.h"
23 #include "../include/convenienceFunctions.h"
24 
25 #include "action.h"
26 #include "fft_binary_lookup.h"
27 
28 #define FFT_CACHE_DEBUG 0
29 
30 
31 
FFTCopyAction(clfftPlanHandle plHandle,FFTPlan * plan,cl_command_queue queue,clfftStatus & err)32 FFTCopyAction::FFTCopyAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err)
33     : FFTAction(plan, err)
34 {
35     if (err != CLFFT_SUCCESS)
36     {
37         // FFTAction() failed, exit constructor
38         return;
39     }
40 
41     err = CLFFT_SUCCESS;
42 }
43 
FFTTransposeGCNAction(clfftPlanHandle plHandle,FFTPlan * plan,cl_command_queue queue,clfftStatus & err)44 FFTTransposeGCNAction::FFTTransposeGCNAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err)
45     : FFTAction(plan, err)
46 {
47     if (err != CLFFT_SUCCESS)
48     {
49         // FFTAction() failed, exit constructor
50         return;
51     }
52 
53     err = CLFFT_SUCCESS;
54 }
55 
FFTTransposeSquareAction(clfftPlanHandle plHandle,FFTPlan * plan,cl_command_queue queue,clfftStatus & err)56 FFTTransposeSquareAction::FFTTransposeSquareAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err)
57     : FFTAction(plan, err)
58 {
59     if (err != CLFFT_SUCCESS)
60     {
61         // FFTAction() failed, exit constructor
62         return;
63     }
64 
65     err = CLFFT_SUCCESS;
66 }
67 
FFTTransposeNonSquareAction(clfftPlanHandle plHandle,FFTPlan * plan,cl_command_queue queue,clfftStatus & err)68 FFTTransposeNonSquareAction::FFTTransposeNonSquareAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err)
69     : FFTAction(plan, err)
70 {
71     if (err != CLFFT_SUCCESS)
72     {
73         // FFTAction() failed, exit constructor
74         return;
75     }
76 
77     err = CLFFT_SUCCESS;
78 }
79 
FFTStockhamAction(clfftPlanHandle plHandle,FFTPlan * plan,cl_command_queue queue,clfftStatus & err)80 FFTStockhamAction::FFTStockhamAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err)
81     : FFTAction(plan, err)
82 {
83     if (err != CLFFT_SUCCESS)
84     {
85         // FFTAction() failed, exit constructor
86         return;
87     }
88 
89     err = CLFFT_SUCCESS;
90 }
91 
92 
93 
FFTAction(FFTPlan * fftPlan,clfftStatus & err)94 FFTAction::FFTAction(FFTPlan * fftPlan, clfftStatus & err)
95     : plan(fftPlan)
96 {
97     err = CLFFT_SUCCESS;
98 }
99 
selectBufferArguments(FFTPlan * fftPlan,cl_mem * clInputBuffers,cl_mem * clOutputBuffers,std::vector<cl_mem> & inputBuff,std::vector<cl_mem> & outputBuff)100 clfftStatus FFTAction::selectBufferArguments(FFTPlan * fftPlan,
101                                              cl_mem* clInputBuffers,
102                                              cl_mem* clOutputBuffers,
103                                              std::vector< cl_mem > &inputBuff,
104                                              std::vector< cl_mem > &outputBuff)
105 {
106 
107     // 1d with normal length will fall into the below category
108     // add: 2d transpose kernel will fall into here too.
109     inputBuff.reserve( 2 );
110     outputBuff.reserve( 2 );
111 
112     //	Decode the relevant properties from the plan paramter to figure out how many input/output buffers we have
113     switch( fftPlan->inputLayout )
114     {
115     case CLFFT_COMPLEX_INTERLEAVED:
116     {
117         switch( fftPlan->outputLayout )
118         {
119         case CLFFT_COMPLEX_INTERLEAVED:
120         {
121             if( fftPlan->placeness == CLFFT_INPLACE )
122             {
123                 inputBuff.push_back( clInputBuffers[ 0 ] );
124             }
125             else
126             {
127                 inputBuff.push_back( clInputBuffers[ 0 ] );
128                 outputBuff.push_back( clOutputBuffers[ 0 ] );
129             }
130 
131             break;
132         }
133         case CLFFT_COMPLEX_PLANAR:
134         {
135             if( fftPlan->placeness == CLFFT_INPLACE )
136             {
137                 //	Invalid to be an inplace transform, and go from 1 to 2 buffers
138                 return CLFFT_INVALID_ARG_VALUE;
139             }
140             else
141             {
142                 inputBuff.push_back( clInputBuffers[ 0 ] );
143 
144                 outputBuff.push_back( clOutputBuffers[ 0 ] );
145                 outputBuff.push_back( clOutputBuffers[ 1 ] );
146             }
147 
148             break;
149         }
150         case CLFFT_HERMITIAN_INTERLEAVED:
151         {
152             if( fftPlan->placeness == CLFFT_INPLACE )
153             {
154                 return CLFFT_INVALID_ARG_VALUE;
155             }
156             else
157             {
158                 inputBuff.push_back( clInputBuffers[ 0 ] );
159                 outputBuff.push_back( clOutputBuffers[ 0 ] );
160             }
161 
162             break;
163         }
164         case CLFFT_HERMITIAN_PLANAR:
165         {
166             if( fftPlan->placeness == CLFFT_INPLACE )
167             {
168                 return CLFFT_INVALID_ARG_VALUE;
169             }
170             else
171             {
172                 inputBuff.push_back( clInputBuffers[ 0 ] );
173 
174                 outputBuff.push_back( clOutputBuffers[ 0 ] );
175                 outputBuff.push_back( clOutputBuffers[ 1 ] );
176             }
177 
178             break;
179         }
180         case CLFFT_REAL:
181         {
182             if( fftPlan->placeness == CLFFT_INPLACE )
183             {
184                 inputBuff.push_back( clInputBuffers[ 0 ] );
185             }
186             else
187             {
188                 inputBuff.push_back( clInputBuffers[ 0 ] );
189                 outputBuff.push_back( clOutputBuffers[ 0 ] );
190             }
191 
192             break;
193         }
194         default:
195         {
196             //	Don't recognize output layout
197             return CLFFT_INVALID_ARG_VALUE;
198         }
199         }
200 
201         break;
202     }
203     case CLFFT_COMPLEX_PLANAR:
204     {
205         switch( fftPlan->outputLayout )
206         {
207         case CLFFT_COMPLEX_INTERLEAVED:
208         {
209             if( fftPlan->placeness == CLFFT_INPLACE )
210             {
211                 return CLFFT_INVALID_ARG_VALUE;
212             }
213             else
214             {
215                 inputBuff.push_back( clInputBuffers[ 0 ] );
216                 inputBuff.push_back( clInputBuffers[ 1 ] );
217 
218                 outputBuff.push_back( clOutputBuffers[ 0 ] );
219             }
220 
221             break;
222         }
223         case CLFFT_COMPLEX_PLANAR:
224         {
225             if( fftPlan->placeness == CLFFT_INPLACE )
226             {
227                 inputBuff.push_back( clInputBuffers[ 0 ] );
228                 inputBuff.push_back( clInputBuffers[ 1 ] );
229             }
230             else
231             {
232                 inputBuff.push_back( clInputBuffers[ 0 ] );
233                 inputBuff.push_back( clInputBuffers[ 1 ] );
234 
235                 outputBuff.push_back( clOutputBuffers[ 0 ] );
236                 outputBuff.push_back( clOutputBuffers[ 1 ] );
237             }
238 
239             break;
240         }
241         case CLFFT_HERMITIAN_INTERLEAVED:
242         {
243             if( fftPlan->placeness == CLFFT_INPLACE )
244             {
245                 return CLFFT_INVALID_ARG_VALUE;
246             }
247             else
248             {
249                 inputBuff.push_back( clInputBuffers[ 0 ] );
250                 inputBuff.push_back( clInputBuffers[ 1 ] );
251 
252                 outputBuff.push_back( clOutputBuffers[ 0 ] );
253             }
254 
255             break;
256         }
257         case CLFFT_HERMITIAN_PLANAR:
258         {
259             if( fftPlan->placeness == CLFFT_INPLACE )
260             {
261                 return CLFFT_INVALID_ARG_VALUE;
262             }
263             else
264             {
265                 inputBuff.push_back( clInputBuffers[ 0 ] );
266                 inputBuff.push_back( clInputBuffers[ 1 ] );
267 
268                 outputBuff.push_back( clOutputBuffers[ 0 ] );
269                 outputBuff.push_back( clOutputBuffers[ 1 ] );
270             }
271 
272             break;
273         }
274         case CLFFT_REAL:
275         {
276             if( fftPlan->placeness == CLFFT_INPLACE )
277             {
278                 return CLFFT_INVALID_ARG_VALUE;
279             }
280             else
281             {
282                 inputBuff.push_back( clInputBuffers[ 0 ] );
283                 inputBuff.push_back( clInputBuffers[ 1 ] );
284 
285                 outputBuff.push_back( clOutputBuffers[ 0 ] );
286             }
287 
288             break;
289         }
290         default:
291         {
292             //	Don't recognize output layout
293             return CLFFT_INVALID_ARG_VALUE;
294         }
295         }
296 
297         break;
298     }
299     case CLFFT_HERMITIAN_INTERLEAVED:
300     {
301         switch( fftPlan->outputLayout )
302         {
303         case CLFFT_COMPLEX_INTERLEAVED:
304         {
305             if( fftPlan->placeness == CLFFT_INPLACE )
306             {
307                 return CLFFT_INVALID_ARG_VALUE;
308             }
309             else
310             {
311                 inputBuff.push_back( clInputBuffers[ 0 ] );
312                 outputBuff.push_back( clOutputBuffers[ 0 ] );
313             }
314 
315             break;
316         }
317         case CLFFT_COMPLEX_PLANAR:
318         {
319             if( fftPlan->placeness == CLFFT_INPLACE )
320             {
321                 return CLFFT_INVALID_ARG_VALUE;
322             }
323             else
324             {
325                 inputBuff.push_back( clInputBuffers[ 0 ] );
326 
327                 outputBuff.push_back( clOutputBuffers[ 0 ] );
328                 outputBuff.push_back( clOutputBuffers[ 1 ] );
329             }
330 
331             break;
332         }
333         case CLFFT_HERMITIAN_INTERLEAVED:
334         {
335             return CLFFT_INVALID_ARG_VALUE;
336         }
337         case CLFFT_HERMITIAN_PLANAR:
338         {
339             return CLFFT_INVALID_ARG_VALUE;
340         }
341         case CLFFT_REAL:
342         {
343             if( fftPlan->placeness == CLFFT_INPLACE )
344             {
345                 inputBuff.push_back( clInputBuffers[ 0 ] );
346             }
347             else
348             {
349                 inputBuff.push_back( clInputBuffers[ 0 ] );
350                 outputBuff.push_back( clOutputBuffers[ 0 ] );
351             }
352 
353             break;
354         }
355         default:
356         {
357             //	Don't recognize output layout
358             return CLFFT_INVALID_ARG_VALUE;
359         }
360         }
361 
362         break;
363     }
364     case CLFFT_HERMITIAN_PLANAR:
365     {
366         switch( fftPlan->outputLayout )
367         {
368         case CLFFT_COMPLEX_INTERLEAVED:
369         {
370             if( fftPlan->placeness == CLFFT_INPLACE )
371             {
372                 return CLFFT_INVALID_ARG_VALUE;
373             }
374             else
375             {
376                 inputBuff.push_back( clInputBuffers[ 0 ] );
377                 inputBuff.push_back( clInputBuffers[ 1 ] );
378 
379                 outputBuff.push_back( clOutputBuffers[ 0 ] );
380             }
381 
382             break;
383         }
384         case CLFFT_COMPLEX_PLANAR:
385         {
386             if( fftPlan->placeness == CLFFT_INPLACE )
387             {
388                 return CLFFT_INVALID_ARG_VALUE;
389             }
390             else
391             {
392                 inputBuff.push_back( clInputBuffers[ 0 ] );
393                 inputBuff.push_back( clInputBuffers[ 1 ] );
394 
395                 outputBuff.push_back( clOutputBuffers[ 0 ] );
396                 outputBuff.push_back( clOutputBuffers[ 1 ] );
397             }
398 
399             break;
400         }
401         case CLFFT_HERMITIAN_INTERLEAVED:
402         {
403             return CLFFT_INVALID_ARG_VALUE;
404         }
405         case CLFFT_HERMITIAN_PLANAR:
406         {
407             return CLFFT_INVALID_ARG_VALUE;
408         }
409         case CLFFT_REAL:
410         {
411             if( fftPlan->placeness == CLFFT_INPLACE )
412             {
413                 return CLFFT_INVALID_ARG_VALUE;
414             }
415             else
416             {
417                 inputBuff.push_back( clInputBuffers[ 0 ] );
418                 inputBuff.push_back( clInputBuffers[ 1 ] );
419 
420                 outputBuff.push_back( clOutputBuffers[ 0 ] );
421             }
422 
423             break;
424         }
425         default:
426         {
427             //	Don't recognize output layout
428             return CLFFT_INVALID_ARG_VALUE;
429         }
430         }
431 
432         break;
433     }
434     case CLFFT_REAL:
435     {
436         switch( fftPlan->outputLayout )
437         {
438         case CLFFT_COMPLEX_INTERLEAVED:
439         {
440             if( fftPlan->placeness == CLFFT_INPLACE )
441             {
442                 inputBuff.push_back( clInputBuffers[ 0 ] );
443             }
444             else
445             {
446                 inputBuff.push_back( clInputBuffers[ 0 ] );
447                 outputBuff.push_back( clOutputBuffers[ 0 ] );
448             }
449 
450             break;
451         }
452         case CLFFT_COMPLEX_PLANAR:
453         {
454             if( fftPlan->placeness == CLFFT_INPLACE )
455             {
456                 return CLFFT_INVALID_ARG_VALUE;
457             }
458             else
459             {
460                 inputBuff.push_back( clInputBuffers[ 0 ] );
461 
462                 outputBuff.push_back( clOutputBuffers[ 0 ] );
463                 outputBuff.push_back( clOutputBuffers[ 1 ] );
464             }
465 
466             break;
467         }
468         case CLFFT_HERMITIAN_INTERLEAVED:
469         {
470             if( fftPlan->placeness == CLFFT_INPLACE )
471             {
472                 inputBuff.push_back( clInputBuffers[ 0 ] );
473             }
474             else
475             {
476                 inputBuff.push_back( clInputBuffers[ 0 ] );
477                 outputBuff.push_back( clOutputBuffers[ 0 ] );
478             }
479 
480             break;
481         }
482         case CLFFT_HERMITIAN_PLANAR:
483         {
484             if( fftPlan->placeness == CLFFT_INPLACE )
485             {
486                 return CLFFT_INVALID_ARG_VALUE;
487             }
488             else
489             {
490                 inputBuff.push_back( clInputBuffers[ 0 ] );
491 
492                 outputBuff.push_back( clOutputBuffers[ 0 ] );
493                 outputBuff.push_back( clOutputBuffers[ 1 ] );
494             }
495 
496             break;
497         }
498         default:
499         {
500 			if(fftPlan->transflag)
501 			{
502 				if( fftPlan->placeness == CLFFT_INPLACE )
503 				{
504 					return CLFFT_INVALID_ARG_VALUE;
505 				}
506 				else
507 				{
508 					inputBuff.push_back( clInputBuffers[ 0 ] );
509 					outputBuff.push_back( clOutputBuffers[ 0 ] );
510 				}
511 			}
512 			else
513 			{
514 				//	Don't recognize output layout
515 				return CLFFT_INVALID_ARG_VALUE;
516 			}
517         }
518         }
519 
520         break;
521     }
522     default:
523     {
524         //	Don't recognize output layout
525         return CLFFT_INVALID_ARG_VALUE;
526     }
527     }
528 
529     return CLFFT_SUCCESS;
530 }
531 
532 
enqueue(clfftPlanHandle plHandle,clfftDirection dir,cl_uint numQueuesAndEvents,cl_command_queue * commQueues,cl_uint numWaitEvents,const cl_event * waitEvents,cl_event * outEvents,cl_mem * clInputBuffers,cl_mem * clOutputBuffers)533 clfftStatus FFTAction::enqueue(clfftPlanHandle plHandle,
534                                clfftDirection dir,
535                                cl_uint numQueuesAndEvents,
536                                cl_command_queue* commQueues,
537                                cl_uint numWaitEvents,
538                                const cl_event* waitEvents,
539                                cl_event* outEvents,
540                                cl_mem* clInputBuffers,
541                                cl_mem* clOutputBuffers)
542 {
543     FFTRepo & fftRepo = FFTRepo::getInstance();
544 
545     std::vector< cl_mem > inputBuff;
546     std::vector< cl_mem > outputBuff;
547 
548 
549     clfftStatus status = selectBufferArguments(this->plan,
550                                                clInputBuffers, clOutputBuffers,
551                                                inputBuff, outputBuff);
552 
553     if (status != CLFFT_SUCCESS)
554     {
555         return status;
556     }
557 
558     //	TODO:  In the case of length == 1, FFT is a trivial NOP, but we still need to apply the forward and backwards tranforms
559     //	TODO:  Are map lookups expensive to call here?  We can cache a pointer to the cl_program/cl_kernel in the plan
560 
561     //	Translate the user plan into the structure that we use to map plans to clPrograms
562 
563     cl_program	prog;
564     cl_kernel	kern;
565 	lockRAII* kernelLock;
566     OPENCL_V( fftRepo.getclProgram( this->getGenerator(), this->getSignatureData(), prog, this->plan->bakeDevice, this->plan->context ), _T( "fftRepo.getclProgram failed" ) );
567     OPENCL_V( fftRepo.getclKernel( prog, dir, kern, kernelLock), _T( "fftRepo.getclKernels failed" ) );
568 
569 	scopedLock sLock(*kernelLock, _T("FFTAction::enqueue"));
570 
571     cl_uint uarg = 0;
572     if (!this->plan->transflag && !(this->plan->gen == Copy))
573     {
574         //	::clSetKernelArg() is not thread safe, according to the openCL spec for the same cl_kernel object
575         //	TODO:  Need to verify that two different plans (which would get through our lock above) with exactly the same
576         //	parameters would NOT share the same cl_kernel objects
577 
578         /* constant buffer */
579         OPENCL_V( clSetKernelArg( kern, uarg++, sizeof( cl_mem ), (void*)&this->plan->const_buffer ), _T( "clSetKernelArg failed" ) );
580     }
581 
582     //	Input buffer(s)
583     //	Input may be 1 buffer  (CLFFT_COMPLEX_INTERLEAVED)
584     //	          or 2 buffers (CLFFT_COMPLEX_PLANAR)
585 
586     for (size_t i = 0; i < inputBuff.size(); ++i)
587     {
588         OPENCL_V( clSetKernelArg( kern, uarg++, sizeof( cl_mem ), (void*)&inputBuff[i] ), _T( "clSetKernelArg failed" ) );
589     }
590     //	Output buffer(s)
591     //	Output may be 0 buffers (CLFFT_INPLACE)
592     //	           or 1 buffer  (CLFFT_COMPLEX_INTERLEAVED)
593     //	           or 2 buffers (CLFFT_COMPLEX_PLANAR)
594     for (size_t o = 0; o < outputBuff.size(); ++o)
595     {
596         OPENCL_V( clSetKernelArg( kern, uarg++, sizeof( cl_mem ), (void*)&outputBuff[o] ), _T( "clSetKernelArg failed" ) );
597     }
598 
599 	//If callback function is set for the plan, pass the appropriate aruments
600 	if (this->plan->hasPreCallback || this->plan->hasPostCallback)
601 	{
602 	if (this->plan->hasPreCallback)
603 	{
604 		OPENCL_V( clSetKernelArg( kern, uarg++, sizeof( cl_mem ), (void*)&this->plan->precallUserData ), _T( "clSetKernelArg failed" ) );
605 		}
606 
607 		//If post-callback function is set for the plan, pass the appropriate aruments
608 		if (this->plan->hasPostCallback)
609 		{
610 			OPENCL_V( clSetKernelArg( kern, uarg++, sizeof( cl_mem ), (void*)&this->plan->postcallUserData ), _T( "clSetKernelArg failed" ) );
611 		}
612 
613 		//Pass LDS size arument if set
614 		if ((this->plan->hasPreCallback && this->plan->preCallback.localMemSize > 0) ||
615 			(this->plan->hasPostCallback && this->plan->postCallbackParam.localMemSize > 0))
616 		{
617 			int localmemSize = 0;
618 			if (this->plan->hasPreCallback && this->plan->preCallback.localMemSize > 0)
619 				localmemSize = this->plan->preCallback.localMemSize;
620 			if (this->plan->hasPostCallback && this->plan->postCallbackParam.localMemSize > 0)
621 				localmemSize += this->plan->postCallbackParam.localMemSize;
622 
623 			OPENCL_V( clSetKernelArg( kern, uarg++, localmemSize, NULL ), _T( "clSetKernelArg failed" ) );
624 		}
625 	}
626 
627     std::vector< size_t > gWorkSize;
628     std::vector< size_t > lWorkSize;
629     clfftStatus result = this->getWorkSizes (gWorkSize, lWorkSize);
630 	//std::cout << "work sizes are " << gWorkSize[0] << ", " << lWorkSize[0] << std::endl;
631 	/*
632 	std::cout << "work sizes are ";
633 	for (auto itor = gWorkSize.begin(); itor != gWorkSize.end(); itor++)
634 		std::cout << *itor << " ";
635 	std::cout << ", ";
636 	for (auto itor = lWorkSize.begin(); itor != lWorkSize.end(); itor++)
637 		std::cout << *itor << " ";
638 	std::cout << std::endl;
639 	*/
640     // TODO:  if getWorkSizes returns CLFFT_INVALID_GLOBAL_WORK_SIZE, that means
641     // that this multidimensional input data array is too large to be transformed
642     // with a single call to clEnqueueNDRangeKernel.  For now, we will just return
643     // the error code back up the call stack.
644     // The *correct* course of action would be to split the work into mutliple
645     // calls to clEnqueueNDRangeKernel.
646     if (CLFFT_INVALID_GLOBAL_WORK_SIZE == result)
647     {
648         OPENCL_V( result, _T("Work size too large for clEnqueNDRangeKernel()"));
649     }
650     else
651     {
652         OPENCL_V( result, _T("FFTAction::getWorkSizes failed"));
653     }
654     BUG_CHECK (gWorkSize.size() == lWorkSize.size());
655 
656 
657     cl_int call_status = clEnqueueNDRangeKernel( *commQueues, kern, static_cast< cl_uint >( gWorkSize.size( ) ),
658                                             NULL, &gWorkSize[ 0 ],  &lWorkSize[ 0 ], numWaitEvents, waitEvents, outEvents );
659     OPENCL_V( call_status, _T( "clEnqueueNDRangeKernel failed" ) );
660 
661     if( fftRepo.pStatTimer )
662     {
663         fftRepo.pStatTimer->AddSample( plHandle, this->plan, kern, numQueuesAndEvents, outEvents, gWorkSize, lWorkSize );
664     }
665 
666     return CLFFT_SUCCESS;
667 }
668 
669 
670 
671 //	Read the kernels that this plan uses from file, and store into the plan
writeKernel(const clfftPlanHandle plHandle,const clfftGenerators gen,const FFTKernelSignatureHeader * data,const cl_context & context,const cl_device_id & device)672 clfftStatus FFTAction::writeKernel( const clfftPlanHandle plHandle, const clfftGenerators gen, const FFTKernelSignatureHeader* data, const cl_context& context, const cl_device_id &device )
673 {
674     FFTRepo& fftRepo	= FFTRepo::getInstance( );
675 
676     std::string kernelPath = getKernelName(gen, plHandle, true);
677 
678     //	Logic to write string contents out to file
679     tofstreamRAII< std::ofstream, std::string > kernelFile( kernelPath.c_str( ) );
680     if( !kernelFile.get( ) )
681     {
682         std::cerr << "Failed to open kernel file for writing: " << kernelPath.c_str( ) << std::endl;
683         return CLFFT_FILE_CREATE_FAILURE;
684     }
685 
686     std::string kernel;
687     OPENCL_V( fftRepo.getProgramCode( gen, data, kernel, device, context ), _T( "fftRepo.getProgramCode failed." ) );
688 
689     kernelFile.get( ) << kernel << std::endl;
690 
691     return	CLFFT_SUCCESS;
692 }
693 
694 
695 // **************** TODO TODO TODO ***********************
696 // Making compileKernels function take in command queue parameter so we can build for 1 particular device only;
697 // this may not be desirable for persistent plans, where we may have to compile for all devices in the context;
698 // make changes appropriately before enabling persistent plans and then remove this comment
699 
700 //	Compile the kernels that this plan uses, and store into the plan
compileKernels(const cl_command_queue commQueueFFT,const clfftPlanHandle plHandle,FFTPlan * fftPlan)701 clfftStatus FFTAction::compileKernels( const cl_command_queue commQueueFFT, const clfftPlanHandle plHandle, FFTPlan* fftPlan )
702 {
703     cl_int status = 0;
704     size_t deviceListSize = 0;
705 
706     FFTRepo& fftRepo	= FFTRepo::getInstance( );
707 
708     // create a cl program executable for the device associated with command queue
709     // Get the device
710     cl_device_id &q_device = fftPlan->bakeDevice;
711 
712     cl_program program;
713     if( fftRepo.getclProgram( this->getGenerator(), this->getSignatureData(), program, q_device, fftPlan->context ) == CLFFT_INVALID_PROGRAM )
714     {
715         FFTBinaryLookup lookup (this->getGenerator(), plHandle, fftPlan->context, q_device);
716 
717         lookup.variantRaw(this->getSignatureData(), this->getSignatureData()->datasize);
718 
719         if (lookup.found())
720         {
721 #if FFT_CACHE_DEBUG
722             // debug message in debug mode to ensure that the cache is used
723             fprintf(stderr, "Kernel loaded from cache\n");
724 #endif
725 
726             program = lookup.getProgram();
727         }
728         else
729         {
730 #if FFT_CACHE_DEBUG
731             fprintf(stderr, "Kernel built from source\n");
732 #endif
733 
734             //	If the user wishes us to write the kernels out to disk, we do so
735             if( fftRepo.setupData.debugFlags & CLFFT_DUMP_PROGRAMS )
736             {
737 				OPENCL_V( writeKernel( plHandle, this->getGenerator(), this->getSignatureData(), fftPlan->context, fftPlan->bakeDevice ), _T( "writeKernel failed." ) );
738             }
739 
740             std::string programCode;
741             OPENCL_V( fftRepo.getProgramCode( this->getGenerator(), this->getSignatureData(), programCode, q_device, fftPlan->context  ), _T( "fftRepo.getProgramCode failed." ) );
742 
743             const char* source = programCode.c_str();
744             program = clCreateProgramWithSource( fftPlan->context, 1, &source, NULL, &status );
745             OPENCL_V( status, _T( "clCreateProgramWithSource failed." ) );
746 
747             // create a cl program executable for the device associated with command queue
748 
749 #if defined(DEBUGGING)
750             status = clBuildProgram( program, 1, &q_device, "-g -cl-opt-disable", NULL, NULL); // good for debugging kernels
751 
752 // if you have trouble creating smbols that GDB can pick up to set a breakpoint after kernels are loaded into memory
753 // this can be used to stop execution to allow you to set a breakpoint in a kernel after kernel symbols are in memory.
754 #ifdef DEBUG_BREAK_GDB
755             __debugbreak();
756 #endif
757 #else
758             status = clBuildProgram( program, 1, &q_device, "", NULL, NULL);
759 #endif
760             if( status != CL_SUCCESS )
761             {
762                 if( status == CL_BUILD_PROGRAM_FAILURE )
763                 {
764                     size_t buildLogSize = 0;
765                     OPENCL_V( clGetProgramBuildInfo( program, q_device, CL_PROGRAM_BUILD_LOG, 0, NULL, &buildLogSize ),
766                               _T( "clGetProgramBuildInfo failed" ) );
767 
768                     std::vector< char > buildLog( buildLogSize );
769                     ::memset( &buildLog[ 0 ], 0x0, buildLogSize );
770 
771                     OPENCL_V( clGetProgramBuildInfo( program, q_device, CL_PROGRAM_BUILD_LOG, buildLogSize, &buildLog[ 0 ], NULL ),
772                               _T( "clGetProgramBuildInfo failed" ) );
773 
774                     std::cerr << "\n\t\t\tBUILD LOG\n";
775                     std::cerr << "************************************************\n";
776                     std::cerr << &buildLog[ 0 ] << std::endl;
777                     std::cerr << "************************************************\n";
778                 }
779 
780                 OPENCL_V( status, _T( "clBuildProgram failed" ) );
781             }
782 
783             lookup.setProgram(program, source);
784             lookup.populateCache();
785         }
786 
787         fftRepo.setclProgram( this->getGenerator(), this->getSignatureData(), program, q_device, fftPlan->context );
788 
789 
790         // For real transforms we compile either forward or backward kernel
791         bool buildFwdKernel = buildForwardKernel();
792         bool buildBwdKernel = buildBackwardKernel();
793 
794         // get a kernel object handle for a kernel with the given name
795         cl_kernel kernel;
796         if( buildFwdKernel )
797         {
798 			lockRAII *kernelLock;
799             if( fftRepo.getclKernel( program, CLFFT_FORWARD, kernel, kernelLock) == CLFFT_INVALID_KERNEL )
800             {
801                 std::string entryPoint;
802                 OPENCL_V( fftRepo.getProgramEntryPoint( this->getGenerator(), this->getSignatureData(), CLFFT_FORWARD, entryPoint, q_device, fftPlan->context ), _T( "fftRepo.getProgramEntryPoint failed." ) );
803 
804                 kernel = clCreateKernel( program, entryPoint.c_str( ), &status );
805                 OPENCL_V( status, _T( "clCreateKernel failed" ) );
806 
807                 fftRepo.setclKernel( program, CLFFT_FORWARD, kernel );
808             }
809         }
810 
811         if( buildBwdKernel )
812         {
813 			lockRAII *kernelLock;
814             if( fftRepo.getclKernel( program, CLFFT_BACKWARD, kernel, kernelLock ) == CLFFT_INVALID_KERNEL )
815             {
816                 std::string entryPoint;
817                 OPENCL_V( fftRepo.getProgramEntryPoint( this->getGenerator(), this->getSignatureData(), CLFFT_BACKWARD, entryPoint, q_device, fftPlan->context ), _T( "fftRepo.getProgramEntryPoint failed." ) );
818 
819                 kernel = clCreateKernel( program, entryPoint.c_str( ), &status );
820                 OPENCL_V( status, _T( "clCreateKernel failed" ) );
821 
822                 fftRepo.setclKernel( program, CLFFT_BACKWARD, kernel );
823             }
824         }
825     }
826 
827     return	CLFFT_SUCCESS;
828 }
829 
830 
831