1 /*=========================================================================
2 *
3 *  Copyright Insight Software Consortium
4 *
5 *  Licensed under the Apache License, Version 2.0 (the "License");
6 *  you may not use this file except in compliance with the License.
7 *  You may obtain a copy of the License at
8 *
9 *         http://www.apache.org/licenses/LICENSE-2.0.txt
10 *
11 *  Unless required by applicable law or agreed to in writing, software
12 *  distributed under the License is distributed on an "AS IS" BASIS,
13 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 *  See the License for the specific language governing permissions and
15 *  limitations under the License.
16 *
17 *=========================================================================*/
18 
19 #include "itkGPUKernelManager.h"
20 
21 namespace itk
22 {
GPUKernelManager()23 GPUKernelManager::GPUKernelManager()
24 {
25   m_Program = nullptr;
26   m_Manager = GPUContextManager::GetInstance();
27 
28   if(m_Manager->GetNumberOfCommandQueues() > 0) m_CommandQueueId = 0;   // default
29                                                                   // command
30                                                                   // queue
31 }
32 
~GPUKernelManager()33 GPUKernelManager::~GPUKernelManager()
34 {
35   cl_int errid;
36 
37   while(m_KernelContainer.size() > 0)
38     {
39     errid = clReleaseKernel(m_KernelContainer.back());
40     OpenCLCheckError(errid, __FILE__, __LINE__, ITK_LOCATION);
41     m_KernelContainer.pop_back();
42     }
43 
44   if(m_Program != nullptr)
45     {
46     errid = clReleaseProgram(m_Program);
47     OpenCLCheckError(errid, __FILE__, __LINE__, ITK_LOCATION);
48     }
49 }
50 
LoadProgramFromFile(const char * filename,const char * cPreamble)51 bool GPUKernelManager::LoadProgramFromFile(const char* filename, const char* cPreamble)
52 {
53   // locals
54   FILE*  pFileStream = nullptr;
55   size_t szSourceLength;
56   size_t szFinalLength;
57 
58   // open the OpenCL source code file
59 #ifdef _WIN32   // Windows version
60   if(fopen_s(&pFileStream, filename, "rb") != 0)
61     {
62     itkWarningMacro("Cannot open OpenCL source file");
63     return false;
64     }
65 #else           // Linux version
66   // printout OpenCL source Path
67   std::cout << "Loading source file: " << filename << std::endl;
68   pFileStream = fopen(filename, "rb");
69   if(pFileStream == nullptr)
70     {
71     itkWarningMacro("Cannot open OpenCL source file");
72     return false;
73     }
74 #endif
75 
76   size_t szPreambleLength = strlen(cPreamble);
77 
78   // get the length of the source code
79   fseek(pFileStream, 0, SEEK_END);
80   szSourceLength = ftell(pFileStream);
81   fseek(pFileStream, 0, SEEK_SET);
82 
83   // allocate a buffer for the source code string and read it in
84   char* cSourceString = (char *)malloc(szSourceLength + szPreambleLength + 1000);
85   if(szPreambleLength > 0) memcpy(cSourceString, cPreamble, szPreambleLength);
86   if (fread( (cSourceString) + szPreambleLength, szSourceLength, 1, pFileStream) != 1)
87     {
88     fclose(pFileStream);
89     free(cSourceString);
90     return false;
91     }
92 
93   // close the file and return the total length of the combined (preamble +
94   // source) string
95   fclose(pFileStream);
96 
97   szFinalLength = szSourceLength + szPreambleLength;
98 
99   cSourceString[szSourceLength + szPreambleLength] = '\0';
100 
101   //
102   // Create OpenCL program from source strings
103   //
104   cl_int errid;
105   m_Program = clCreateProgramWithSource(
106       m_Manager->GetCurrentContext(), 1, const_cast<const char **>(&cSourceString), &szFinalLength, &errid);
107   OpenCLCheckError(errid, __FILE__, __LINE__, ITK_LOCATION);
108   free(cSourceString);
109 
110   if(errid != CL_SUCCESS)
111     {
112     itkWarningMacro("Cannot create GPU program");
113     return false;
114     }
115 
116   // build program
117   errid = clBuildProgram(m_Program, 0, nullptr, nullptr, nullptr, nullptr);
118   if(errid != CL_SUCCESS)
119     {
120     //itkWarningMacro("OpenCL program build error");
121 
122     // print out build error
123     size_t paramValueSize = 0;
124 
125     // get error message size
126     clGetProgramBuildInfo(m_Program, m_Manager->GetDeviceId(0), CL_PROGRAM_BUILD_LOG, 0, nullptr, &paramValueSize);
127 
128     char *paramValue;
129     paramValue = (char*)malloc(paramValueSize);
130 
131     // get error message
132     clGetProgramBuildInfo(m_Program, m_Manager->GetDeviceId(0), CL_PROGRAM_BUILD_LOG, paramValueSize, paramValue, nullptr);
133 
134     /*
135     std::ostringstream itkmsg;
136     itkmsg << "ERROR: In " __FILE__ ", line " << __LINE__ << "\n"
137            << this->GetNameOfClass() << " (" << this << "): "
138            << "OpenCL program build error:" << paramValue
139            << "\n\n";
140     ::itk::OutputWindowDisplayErrorText( itkmsg.str().c_str() );
141     */
142 
143     std::cerr << paramValue << std::endl;
144 
145     free( paramValue );
146 
147     OpenCLCheckError(errid, __FILE__, __LINE__, ITK_LOCATION);
148 
149     return false;
150     }
151 
152   return true;
153 }
154 
LoadProgramFromString(const char * cSource,const char * cPreamble)155 bool GPUKernelManager::LoadProgramFromString(const char* cSource, const char* cPreamble)
156 {
157   size_t szSourceLength;
158   size_t szPreambleLength;
159   size_t szFinalLength;
160 
161   szSourceLength = strlen(cSource);
162   szPreambleLength = strlen(cPreamble);
163   szFinalLength = szSourceLength + szPreambleLength;
164 
165   // allocate a buffer for the source code string and read it in
166   char* cSourceString = (char *)malloc(szFinalLength + 1);
167   if(szPreambleLength > 0) memcpy(cSourceString, cPreamble, szPreambleLength);
168 
169   memcpy(cSourceString + szPreambleLength, cSource, szSourceLength);
170 
171 
172   cSourceString[szFinalLength] = '\0';
173 
174   //
175   // Create OpenCL program from source strings
176   //
177   cl_int errid;
178   m_Program = clCreateProgramWithSource(
179       m_Manager->GetCurrentContext(), 1, const_cast<const char **>(&cSourceString), &szFinalLength, &errid);
180   OpenCLCheckError(errid, __FILE__, __LINE__, ITK_LOCATION);
181   free(cSourceString);
182 
183   if(errid != CL_SUCCESS)
184     {
185     itkWarningMacro("Cannot create GPU program");
186     return false;
187     }
188 
189   // build program
190   errid = clBuildProgram(m_Program, 0, nullptr, nullptr, nullptr, nullptr);
191   if(errid != CL_SUCCESS)
192     {
193     //itkWarningMacro("OpenCL program build error");
194 
195     // print out build error
196     size_t paramValueSize = 0;
197 
198     // get error message size
199     clGetProgramBuildInfo(m_Program, m_Manager->GetDeviceId(0), CL_PROGRAM_BUILD_LOG, 0, nullptr, &paramValueSize);
200 
201     char *paramValue;
202     paramValue = (char*)malloc(paramValueSize);
203 
204     // get error message
205     clGetProgramBuildInfo(m_Program, m_Manager->GetDeviceId(0), CL_PROGRAM_BUILD_LOG, paramValueSize, paramValue, nullptr);
206 
207     /*
208     std::ostringstream itkmsg;
209     itkmsg << "ERROR: In " __FILE__ ", line " << __LINE__ << "\n"
210            << this->GetNameOfClass() << " (" << this << "): "
211            << "OpenCL program build error:" << paramValue
212            << "\n\n";
213     ::itk::OutputWindowDisplayErrorText( itkmsg.str().c_str() );
214     */
215 
216     std::cerr << paramValue << std::endl;
217 
218     free( paramValue );
219 
220     OpenCLCheckError(errid, __FILE__, __LINE__, ITK_LOCATION);
221 
222     return false;
223     }
224 
225   return true;
226 }
227 
CreateKernel(const char * kernelName)228 int GPUKernelManager::CreateKernel(const char* kernelName)
229 {
230   cl_int errid;
231 
232   // create kernel
233   cl_kernel newKernel = clCreateKernel(m_Program, kernelName, &errid);
234 
235   OpenCLCheckError(errid, __FILE__, __LINE__, ITK_LOCATION);
236 
237   if(errid != CL_SUCCESS)   // failed
238     {
239     itkWarningMacro("Fail to create GPU kernel");
240     return -1;
241     }
242 
243   m_KernelContainer.push_back( newKernel );
244 
245   // argument list
246   m_KernelArgumentReady.push_back( std::vector< KernelArgumentList >() );
247   cl_uint nArg;
248   errid = clGetKernelInfo( newKernel, CL_KERNEL_NUM_ARGS, sizeof(cl_uint), &nArg, nullptr);
249   (m_KernelArgumentReady.back() ).resize( nArg );
250 
251   ResetArguments( (int)m_KernelContainer.size()-1 );
252 
253   return (int)m_KernelContainer.size()-1;
254 }
255 
GetKernelWorkGroupInfo(int kernelIdx,cl_kernel_work_group_info paramName,void * value)256 cl_int GPUKernelManager::GetKernelWorkGroupInfo(int kernelIdx,
257                                                 cl_kernel_work_group_info paramName, void *value)
258 {
259   size_t valueSize, valueSizeRet;
260 
261   switch (paramName)
262     {
263     case CL_KERNEL_WORK_GROUP_SIZE:
264       valueSize = sizeof(size_t);
265       break;
266     case CL_KERNEL_COMPILE_WORK_GROUP_SIZE:
267       valueSize = 3 * sizeof(size_t);
268       break;
269     case CL_KERNEL_LOCAL_MEM_SIZE:
270       valueSize = sizeof(cl_ulong);
271       break;
272     default:
273       itkGenericExceptionMacro (<< "Unknown type of work goup information");
274     }
275 
276   cl_int errid = clGetKernelWorkGroupInfo(m_KernelContainer[kernelIdx], m_Manager->GetDeviceId(0),
277                                           paramName, valueSize, value, &valueSizeRet);
278 
279   OpenCLCheckError(errid, __FILE__, __LINE__, ITK_LOCATION);
280 
281   return errid;
282 }
283 
GetDeviceInfo(cl_kernel_work_group_info paramName,size_t argSize,void * argValue)284 cl_int GPUKernelManager::GetDeviceInfo(
285                      cl_kernel_work_group_info paramName,
286                      size_t argSize, void *argValue)
287 {
288   cl_int errid;
289 
290   switch (paramName)
291     {
292     case CL_DEVICE_MAX_WORK_ITEM_SIZES:
293       errid = clGetDeviceInfo(m_Manager->GetDeviceId(0),
294         CL_DEVICE_MAX_WORK_ITEM_SIZES, argSize, argValue, nullptr);
295       break;
296     default:
297       itkGenericExceptionMacro (<< "Unknown type of device info");
298     }
299   OpenCLCheckError(errid, __FILE__, __LINE__, ITK_LOCATION);
300 
301   return errid;
302 }
303 
SetKernelArg(int kernelIdx,cl_uint argIdx,size_t argSize,const void * argVal)304 bool GPUKernelManager::SetKernelArg(int kernelIdx, cl_uint argIdx, size_t argSize, const void* argVal)
305 {
306   if(kernelIdx < 0 || kernelIdx >= (int)m_KernelContainer.size() ) return false;
307 
308   cl_int errid;
309 
310   errid = clSetKernelArg(m_KernelContainer[kernelIdx], argIdx, argSize, argVal);
311   OpenCLCheckError(errid, __FILE__, __LINE__, ITK_LOCATION);
312 
313   m_KernelArgumentReady[kernelIdx][argIdx].m_IsReady = true;
314   m_KernelArgumentReady[kernelIdx][argIdx].m_GPUDataManager = (GPUDataManager::Pointer)nullptr;
315 
316   return true;
317 }
318 
SetKernelArgWithImage(int kernelIdx,cl_uint argIdx,GPUDataManager::Pointer manager)319 bool GPUKernelManager::SetKernelArgWithImage(int kernelIdx, cl_uint argIdx, GPUDataManager::Pointer manager)
320 {
321   if(kernelIdx < 0 || kernelIdx >= (int)m_KernelContainer.size() ) return false;
322 
323   cl_int errid;
324 
325   errid = clSetKernelArg(m_KernelContainer[kernelIdx], argIdx, sizeof(cl_mem), manager->GetGPUBufferPointer() );
326   OpenCLCheckError(errid, __FILE__, __LINE__, ITK_LOCATION);
327 
328   m_KernelArgumentReady[kernelIdx][argIdx].m_IsReady = true;
329   m_KernelArgumentReady[kernelIdx][argIdx].m_GPUDataManager = manager;
330 
331   return true;
332 }
333 
334 // this function must be called right before GPU kernel is launched
CheckArgumentReady(int kernelIdx)335 bool GPUKernelManager::CheckArgumentReady(int kernelIdx)
336 {
337   int nArg = m_KernelArgumentReady[kernelIdx].size();
338 
339   for(int i=0; i<nArg; i++)
340     {
341     if(!(m_KernelArgumentReady[kernelIdx][i].m_IsReady) ) return false;
342 
343     // automatic synchronization before kernel launch
344     if(m_KernelArgumentReady[kernelIdx][i].m_GPUDataManager != (GPUDataManager::Pointer)nullptr)
345       {
346       m_KernelArgumentReady[kernelIdx][i].m_GPUDataManager->SetCPUBufferDirty();
347       }
348     }
349   return true;
350 }
351 
ResetArguments(int kernelIdx)352 void GPUKernelManager::ResetArguments(int kernelIdx)
353 {
354   int nArg = m_KernelArgumentReady[kernelIdx].size();
355 
356   for(int i=0; i<nArg; i++)
357     {
358     m_KernelArgumentReady[kernelIdx][i].m_IsReady = false;
359     m_KernelArgumentReady[kernelIdx][i].m_GPUDataManager = (GPUDataManager::Pointer)nullptr;
360     }
361 }
362 
LaunchKernel1D(int kernelIdx,size_t globalWorkSize,size_t itkNotUsed (localWorkSize))363 bool GPUKernelManager::LaunchKernel1D(int kernelIdx, size_t globalWorkSize, size_t itkNotUsed(localWorkSize))
364 {
365   if(kernelIdx < 0 || kernelIdx >= (int)m_KernelContainer.size() ) return false;
366 
367   if(!CheckArgumentReady(kernelIdx) )
368     {
369     itkWarningMacro("GPU kernel arguments are not completely assigned");
370     return false;
371     }
372 
373   cl_int errid;
374   // TODO should we allow the user to determine localWorkSize?
375 //   errid = clEnqueueNDRangeKernel(m_Manager->GetCommandQueue(
376 //                                    m_CommandQueueId), m_KernelContainer[kernelIdx], 1, nullptr, &globalWorkSize,
377 //                                  &localWorkSize, 0, nullptr, nullptr);
378   errid = clEnqueueNDRangeKernel(m_Manager->GetCommandQueue(
379                                    m_CommandQueueId), m_KernelContainer[kernelIdx], 1, nullptr, &globalWorkSize,
380                                  nullptr, 0, nullptr, nullptr);
381   OpenCLCheckError(errid, __FILE__, __LINE__, ITK_LOCATION);
382 
383   if(errid != CL_SUCCESS)
384     {
385     itkWarningMacro("GPU kernel launch failed");
386     return false;
387     }
388 
389   return true;
390 }
391 
LaunchKernel2D(int kernelIdx,size_t globalWorkSizeX,size_t globalWorkSizeY,size_t itkNotUsed (localWorkSizeX),size_t itkNotUsed (localWorkSizeY))392 bool GPUKernelManager::LaunchKernel2D(int kernelIdx,
393                                       size_t globalWorkSizeX, size_t globalWorkSizeY,
394                                       size_t itkNotUsed(localWorkSizeX),  size_t itkNotUsed(localWorkSizeY) )
395 {
396   if(kernelIdx < 0 || kernelIdx >= (int)m_KernelContainer.size() ) return false;
397 
398   if(!CheckArgumentReady(kernelIdx) )
399     {
400     itkWarningMacro("GPU kernel arguments are not completely assigned");
401     return false;
402     }
403 
404   size_t gws[2];
405   gws[0] = globalWorkSizeX;
406   gws[1] = globalWorkSizeY;
407 
408 //  size_t lws[2];
409 //  lws[0] = localWorkSizeX;
410 //  lws[1] = localWorkSizeY;
411 
412   cl_int errid;
413   // TODO should we allow the user to determine localWorkSize?
414 //   errid = clEnqueueNDRangeKernel(m_Manager->GetCommandQueue(
415 //                                    m_CommandQueueId), m_KernelContainer[kernelIdx], 2, nullptr, gws,
416 //                                  lws, 0, nullptr, nullptr);
417   errid = clEnqueueNDRangeKernel(m_Manager->GetCommandQueue(
418                                    m_CommandQueueId), m_KernelContainer[kernelIdx], 2, nullptr, gws, nullptr, 0, nullptr, nullptr);
419   OpenCLCheckError(errid, __FILE__, __LINE__, ITK_LOCATION);
420 
421   if(errid != CL_SUCCESS)
422     {
423     itkWarningMacro("GPU kernel launch failed");
424     return false;
425     }
426 
427   return true;
428 }
429 
LaunchKernel3D(int kernelIdx,size_t globalWorkSizeX,size_t globalWorkSizeY,size_t globalWorkSizeZ,size_t itkNotUsed (localWorkSizeX),size_t itkNotUsed (localWorkSizeY),size_t itkNotUsed (localWorkSizeZ))430 bool GPUKernelManager::LaunchKernel3D(int kernelIdx,
431                                       size_t globalWorkSizeX, size_t globalWorkSizeY, size_t globalWorkSizeZ,
432                                       size_t itkNotUsed(localWorkSizeX),  size_t itkNotUsed(localWorkSizeY), size_t itkNotUsed(localWorkSizeZ) )
433 {
434   if(kernelIdx < 0 || kernelIdx >= (int)m_KernelContainer.size() ) return false;
435 
436   if(!CheckArgumentReady(kernelIdx) )
437     {
438     itkWarningMacro("GPU kernel arguments are not completely assigned");
439     return false;
440     }
441 
442   size_t gws[3];
443   gws[0] = globalWorkSizeX;
444   gws[1] = globalWorkSizeY;
445   gws[2] = globalWorkSizeZ;
446 
447 //  size_t lws[3];
448 //  lws[0] = localWorkSizeX;
449 //  lws[1] = localWorkSizeY;
450 //  lws[2] = localWorkSizeZ;
451 
452   cl_int errid;
453   // TODO should we allow the user to determine localWorkSize?
454 //   errid = clEnqueueNDRangeKernel(m_Manager->GetCommandQueue(
455 //                                    m_CommandQueueId), m_KernelContainer[kernelIdx], 3, nullptr, gws, lws, 0, nullptr, nullptr);
456   errid = clEnqueueNDRangeKernel(m_Manager->GetCommandQueue(
457                                    m_CommandQueueId), m_KernelContainer[kernelIdx], 3, nullptr, gws, nullptr, 0, nullptr, nullptr);
458   OpenCLCheckError(errid, __FILE__, __LINE__, ITK_LOCATION);
459 
460   if(errid != CL_SUCCESS)
461     {
462     itkWarningMacro("GPU kernel launch failed");
463     return false;
464     }
465 
466   return true;
467 }
468 
LaunchKernel(int kernelIdx,int dim,size_t * globalWorkSize,size_t * localWorkSize)469 bool GPUKernelManager::LaunchKernel(int kernelIdx, int dim, size_t *globalWorkSize, size_t *localWorkSize)
470 {
471   if(kernelIdx < 0 || kernelIdx >= (int)m_KernelContainer.size() ) return false;
472 
473   if(!CheckArgumentReady(kernelIdx) )
474     {
475     itkWarningMacro("GPU kernel arguments are not completely assigned");
476     return false;
477     }
478 
479 // debug
480 //std::cout << "Dim : " << dim << std::endl;
481 
482 // debug - if devicetype is CPU
483 //localWorkSize[0] = localWorkSize[1] = localWorkSize[2] = 1;
484 //
485 
486   cl_int errid;
487   errid = clEnqueueNDRangeKernel(m_Manager->GetCommandQueue(
488                                    m_CommandQueueId), m_KernelContainer[kernelIdx], (cl_uint)dim, nullptr, globalWorkSize,
489                                     localWorkSize, 0, nullptr, nullptr);
490   OpenCLCheckError(errid, __FILE__, __LINE__, ITK_LOCATION);
491 
492 /*
493 std::cout << "Check point 1" << std::endl;
494 
495 // debug -- synchronize
496 errid = clFlush(m_Manager->GetCommandQueue(m_CommandQueueId));
497 OpenCLCheckError(errid, __FILE__, __LINE__, ITK_LOCATION);
498 
499 std::cout << "Check point 2" << std::endl;
500 */
501 errid = clFinish(m_Manager->GetCommandQueue(m_CommandQueueId));
502 OpenCLCheckError(errid, __FILE__, __LINE__, ITK_LOCATION);
503 /*
504 std::cout << "Wait for kernel execution ends" << std::endl;
505 */
506 
507   if(errid != CL_SUCCESS)
508     {
509     itkWarningMacro("GPU kernel launch failed");
510     return false;
511     }
512 
513   return true;
514 }
515 
SetCurrentCommandQueue(int queueid)516 void GPUKernelManager::SetCurrentCommandQueue( int queueid )
517 {
518   if( queueid >= 0 && queueid < (int)m_Manager->GetNumberOfCommandQueues() )
519     {
520     // Assumption: different command queue is assigned to different device
521     m_CommandQueueId = queueid;
522     }
523   else
524     {
525     itkWarningMacro("Not a valid command queue id");
526     }
527 }
528 
GetCurrentCommandQueueID()529 int GPUKernelManager::GetCurrentCommandQueueID()
530 {
531   return m_CommandQueueId;
532 }
533 
534 }
535