1 // Licensed under the Apache License, Version 2.0 (the "License");
2 // you may not use this file except in compliance with the License.
3 // You may obtain a copy of the License at
4 // http://www.apache.org/licenses/LICENSE-2.0
5 // Unless required by applicable law or agreed to in writing, software
6 // distributed under the License is distributed on an "AS IS" BASIS,
7 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8 // See the License for the specific language governing permissions and
9 // limitations under the License.
10 
11 #ifdef USE_OPENCL
12 
13 #  ifdef _WIN32
14 #    include <io.h>
15 #  else
16 #    include <sys/types.h>
17 #    include <unistd.h>
18 #  endif
19 #  include <cfloat>
20 #  include <ctime> // for clock_gettime
21 
22 #  include "oclkernels.h"
23 #  include "openclwrapper.h"
24 
25 // for micro-benchmark
26 #  include "otsuthr.h"
27 #  include "thresholder.h"
28 
29 // platform preprocessor commands
30 #  if defined(WIN32) || defined(__WIN32__) || defined(_WIN32) || defined(__CYGWIN__) || \
31       defined(__MINGW32__)
32 #    define ON_WINDOWS 1
33 #    define ON_APPLE 0
34 #  elif defined(__linux__)
35 #    define ON_WINDOWS 0
36 #    define ON_APPLE 0
37 #  elif defined(__APPLE__)
38 #    define ON_WINDOWS 0
39 #    define ON_APPLE 1
40 #  else
41 #    define ON_WINDOWS 0
42 #    define ON_APPLE 0
43 #  endif
44 
45 #  if ON_APPLE
46 #    include <mach/mach_time.h>
47 #  endif
48 
49 #  include <cstdio>
50 #  include <cstdlib>
51 #  include <cstring> // for memset, strcpy, ...
52 #  include <vector>
53 
54 #  include "errcode.h" // for ASSERT_HOST
55 #  include "image.h"   // for Image
56 
57 namespace tesseract {
58 
59 GPUEnv OpenclDevice::gpuEnv;
60 
61 bool OpenclDevice::deviceIsSelected = false;
62 ds_device OpenclDevice::selectedDevice;
63 
64 int OpenclDevice::isInited = 0;
65 
66 static l_int32 MORPH_BC = ASYMMETRIC_MORPH_BC;
67 
68 static const l_uint32 lmask32[] = {
69     0x80000000, 0xc0000000, 0xe0000000, 0xf0000000, 0xf8000000, 0xfc000000, 0xfe000000, 0xff000000,
70     0xff800000, 0xffc00000, 0xffe00000, 0xfff00000, 0xfff80000, 0xfffc0000, 0xfffe0000, 0xffff0000,
71     0xffff8000, 0xffffc000, 0xffffe000, 0xfffff000, 0xfffff800, 0xfffffc00, 0xfffffe00, 0xffffff00,
72     0xffffff80, 0xffffffc0, 0xffffffe0, 0xfffffff0, 0xfffffff8, 0xfffffffc, 0xfffffffe, 0xffffffff};
73 
74 static const l_uint32 rmask32[] = {
75     0x00000001, 0x00000003, 0x00000007, 0x0000000f, 0x0000001f, 0x0000003f, 0x0000007f, 0x000000ff,
76     0x000001ff, 0x000003ff, 0x000007ff, 0x00000fff, 0x00001fff, 0x00003fff, 0x00007fff, 0x0000ffff,
77     0x0001ffff, 0x0003ffff, 0x0007ffff, 0x000fffff, 0x001fffff, 0x003fffff, 0x007fffff, 0x00ffffff,
78     0x01ffffff, 0x03ffffff, 0x07ffffff, 0x0fffffff, 0x1fffffff, 0x3fffffff, 0x7fffffff, 0xffffffff};
79 
80 static cl_mem pixsCLBuffer, pixdCLBuffer,
81     pixdCLIntermediate;    // Morph operations buffers
82 static cl_mem pixThBuffer; // output from thresholdtopix calculation
83 static cl_int clStatus;
84 static KernelEnv rEnv;
85 
86 #  define DS_TAG_VERSION "<version>"
87 #  define DS_TAG_VERSION_END "</version>"
88 #  define DS_TAG_DEVICE "<device>"
89 #  define DS_TAG_DEVICE_END "</device>"
90 #  define DS_TAG_SCORE "<score>"
91 #  define DS_TAG_SCORE_END "</score>"
92 #  define DS_TAG_DEVICE_TYPE "<type>"
93 #  define DS_TAG_DEVICE_TYPE_END "</type>"
94 #  define DS_TAG_DEVICE_NAME "<name>"
95 #  define DS_TAG_DEVICE_NAME_END "</name>"
96 #  define DS_TAG_DEVICE_DRIVER_VERSION "<driver>"
97 #  define DS_TAG_DEVICE_DRIVER_VERSION_END "</driver>"
98 
99 #  define DS_DEVICE_NATIVE_CPU_STRING "native_cpu"
100 
101 #  define DS_DEVICE_NAME_LENGTH 256
102 
103 enum ds_evaluation_type { DS_EVALUATE_ALL, DS_EVALUATE_NEW_ONLY };
104 
105 struct ds_profile {
106   std::vector<ds_device> devices;
107   unsigned int numDevices;
108   const char *version;
109 };
110 
111 enum ds_status {
112   DS_SUCCESS = 0,
113   DS_INVALID_PROFILE = 1000,
114   DS_MEMORY_ERROR,
115   DS_INVALID_PERF_EVALUATOR_TYPE,
116   DS_INVALID_PERF_EVALUATOR,
117   DS_PERF_EVALUATOR_ERROR,
118   DS_FILE_ERROR,
119   DS_UNKNOWN_DEVICE_TYPE,
120   DS_PROFILE_FILE_ERROR,
121   DS_SCORE_SERIALIZER_ERROR,
122   DS_SCORE_DESERIALIZER_ERROR
123 };
124 
125 // Pointer to a function that calculates the score of a device (ex:
126 // device->score) update the data size of score. The encoding and the format
127 // of the score data is implementation defined. The function should return
128 // DS_SUCCESS if there's no error to be reported.
129 typedef ds_status (*ds_perf_evaluator)(ds_device *device, void *data);
130 
131 // deallocate memory used by score
132 typedef ds_status (*ds_score_release)(TessDeviceScore *score);
133 
releaseDSProfile(ds_profile * profile,ds_score_release sr)134 static ds_status releaseDSProfile(ds_profile *profile, ds_score_release sr) {
135   ds_status status = DS_SUCCESS;
136   if (profile != nullptr) {
137     if (sr != nullptr) {
138       unsigned int i;
139       for (i = 0; i < profile->numDevices; i++) {
140         free(profile->devices[i].oclDeviceName);
141         free(profile->devices[i].oclDriverVersion);
142         status = sr(profile->devices[i].score);
143         if (status != DS_SUCCESS)
144           break;
145       }
146     }
147     delete profile;
148   }
149   return status;
150 }
151 
initDSProfile(ds_profile ** p,const char * version)152 static ds_status initDSProfile(ds_profile **p, const char *version) {
153   int numDevices;
154   cl_uint numPlatforms;
155   std::vector<cl_platform_id> platforms;
156   std::vector<cl_device_id> devices;
157   ds_status status = DS_SUCCESS;
158   unsigned int next;
159   unsigned int i;
160 
161   if (p == nullptr)
162     return DS_INVALID_PROFILE;
163 
164   ds_profile *profile = new ds_profile;
165 
166   memset(profile, 0, sizeof(ds_profile));
167 
168   clGetPlatformIDs(0, nullptr, &numPlatforms);
169 
170   if (numPlatforms > 0) {
171     platforms.reserve(numPlatforms);
172     clGetPlatformIDs(numPlatforms, &platforms[0], nullptr);
173   }
174 
175   numDevices = 0;
176   for (i = 0; i < numPlatforms; i++) {
177     cl_uint num;
178     clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, nullptr, &num);
179     numDevices += num;
180   }
181 
182   if (numDevices > 0) {
183     devices.reserve(numDevices);
184   }
185 
186   profile->numDevices = numDevices + 1; // +1 to numDevices to include the native CPU
187   profile->devices.reserve(profile->numDevices);
188   memset(&profile->devices[0], 0, profile->numDevices * sizeof(ds_device));
189 
190   next = 0;
191   for (i = 0; i < numPlatforms; i++) {
192     cl_uint num;
193     unsigned j;
194     clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, numDevices, &devices[0], &num);
195     for (j = 0; j < num; j++, next++) {
196       char buffer[DS_DEVICE_NAME_LENGTH];
197       size_t length;
198 
199       profile->devices[next].type = DS_DEVICE_OPENCL_DEVICE;
200       profile->devices[next].oclDeviceID = devices[j];
201 
202       clGetDeviceInfo(profile->devices[next].oclDeviceID, CL_DEVICE_NAME, DS_DEVICE_NAME_LENGTH,
203                       &buffer, nullptr);
204       length = strlen(buffer);
205       profile->devices[next].oclDeviceName = (char *)malloc(length + 1);
206       memcpy(profile->devices[next].oclDeviceName, buffer, length + 1);
207 
208       clGetDeviceInfo(profile->devices[next].oclDeviceID, CL_DRIVER_VERSION, DS_DEVICE_NAME_LENGTH,
209                       &buffer, nullptr);
210       length = strlen(buffer);
211       profile->devices[next].oclDriverVersion = (char *)malloc(length + 1);
212       memcpy(profile->devices[next].oclDriverVersion, buffer, length + 1);
213     }
214   }
215   profile->devices[next].type = DS_DEVICE_NATIVE_CPU;
216   profile->version = version;
217 
218   *p = profile;
219   return status;
220 }
221 
profileDevices(ds_profile * profile,const ds_evaluation_type type,ds_perf_evaluator evaluator,void * evaluatorData,unsigned int * numUpdates)222 static ds_status profileDevices(ds_profile *profile, const ds_evaluation_type type,
223                                 ds_perf_evaluator evaluator, void *evaluatorData,
224                                 unsigned int *numUpdates) {
225   ds_status status = DS_SUCCESS;
226   unsigned int i;
227   unsigned int updates = 0;
228 
229   if (profile == nullptr) {
230     return DS_INVALID_PROFILE;
231   }
232   if (evaluator == nullptr) {
233     return DS_INVALID_PERF_EVALUATOR;
234   }
235 
236   for (i = 0; i < profile->numDevices; i++) {
237     ds_status evaluatorStatus;
238 
239     switch (type) {
240       case DS_EVALUATE_NEW_ONLY:
241         if (profile->devices[i].score != nullptr)
242           break;
243       //  else fall through
244       case DS_EVALUATE_ALL:
245         evaluatorStatus = evaluator(&profile->devices[i], evaluatorData);
246         if (evaluatorStatus != DS_SUCCESS) {
247           status = evaluatorStatus;
248           return status;
249         }
250         updates++;
251         break;
252       default:
253         return DS_INVALID_PERF_EVALUATOR_TYPE;
254         break;
255     };
256   }
257   if (numUpdates)
258     *numUpdates = updates;
259   return status;
260 }
261 
findString(const char * contentStart,const char * contentEnd,const char * string)262 static const char *findString(const char *contentStart, const char *contentEnd,
263                               const char *string) {
264   size_t stringLength;
265   const char *currentPosition;
266   const char *found = nullptr;
267   stringLength = strlen(string);
268   currentPosition = contentStart;
269   for (currentPosition = contentStart; currentPosition < contentEnd; currentPosition++) {
270     if (*currentPosition == string[0]) {
271       if (currentPosition + stringLength < contentEnd) {
272         if (strncmp(currentPosition, string, stringLength) == 0) {
273           found = currentPosition;
274           break;
275         }
276       }
277     }
278   }
279   return found;
280 }
281 
readProFile(const char * fileName,char ** content,size_t * contentSize)282 static ds_status readProFile(const char *fileName, char **content, size_t *contentSize) {
283   *contentSize = 0;
284   *content = nullptr;
285   ds_status status = DS_SUCCESS;
286   FILE *input = fopen(fileName, "rb");
287   if (input == nullptr) {
288     status = DS_FILE_ERROR;
289   } else {
290     fseek(input, 0L, SEEK_END);
291     auto pos = std::ftell(input);
292     rewind(input);
293     if (pos > 0) {
294       size_t size = pos;
295       char *binary = new char[size];
296       if (fread(binary, sizeof(char), size, input) != size) {
297         status = DS_FILE_ERROR;
298         delete[] binary;
299       } else {
300         *contentSize = size;
301         *content = binary;
302       }
303     }
304     fclose(input);
305   }
306   return status;
307 }
308 
309 typedef ds_status (*ds_score_deserializer)(ds_device *device, const uint8_t *serializedScore,
310                                            unsigned int serializedScoreSize);
311 
readProfileFromFile(ds_profile * profile,ds_score_deserializer deserializer,const char * file)312 static ds_status readProfileFromFile(ds_profile *profile, ds_score_deserializer deserializer,
313                                      const char *file) {
314   ds_status status = DS_SUCCESS;
315   char *contentStart;
316   size_t contentSize;
317 
318   if (profile == nullptr)
319     return DS_INVALID_PROFILE;
320 
321   status = readProFile(file, &contentStart, &contentSize);
322   if (status == DS_SUCCESS) {
323     const char *currentPosition;
324     const char *dataStart;
325     const char *dataEnd;
326 
327     const char *contentEnd = contentStart + contentSize;
328     currentPosition = contentStart;
329 
330     // parse the version string
331     dataStart = findString(currentPosition, contentEnd, DS_TAG_VERSION);
332     if (dataStart == nullptr) {
333       status = DS_PROFILE_FILE_ERROR;
334       goto cleanup;
335     }
336     dataStart += strlen(DS_TAG_VERSION);
337 
338     dataEnd = findString(dataStart, contentEnd, DS_TAG_VERSION_END);
339     if (dataEnd == nullptr) {
340       status = DS_PROFILE_FILE_ERROR;
341       goto cleanup;
342     }
343 
344     size_t versionStringLength = strlen(profile->version);
345     if (versionStringLength + dataStart != dataEnd ||
346         strncmp(profile->version, dataStart, versionStringLength) != 0) {
347       // version mismatch
348       status = DS_PROFILE_FILE_ERROR;
349       goto cleanup;
350     }
351     currentPosition = dataEnd + strlen(DS_TAG_VERSION_END);
352 
353     // parse the device information
354     while (1) {
355       unsigned int i;
356 
357       const char *deviceTypeStart;
358       const char *deviceTypeEnd;
359       ds_device_type deviceType;
360 
361       const char *deviceNameStart;
362       const char *deviceNameEnd;
363 
364       const char *deviceScoreStart;
365       const char *deviceScoreEnd;
366 
367       const char *deviceDriverStart;
368       const char *deviceDriverEnd;
369 
370       dataStart = findString(currentPosition, contentEnd, DS_TAG_DEVICE);
371       if (dataStart == nullptr) {
372         // nothing useful remain, quit...
373         break;
374       }
375       dataStart += strlen(DS_TAG_DEVICE);
376       dataEnd = findString(dataStart, contentEnd, DS_TAG_DEVICE_END);
377       if (dataEnd == nullptr) {
378         status = DS_PROFILE_FILE_ERROR;
379         goto cleanup;
380       }
381 
382       // parse the device type
383       deviceTypeStart = findString(dataStart, contentEnd, DS_TAG_DEVICE_TYPE);
384       if (deviceTypeStart == nullptr) {
385         status = DS_PROFILE_FILE_ERROR;
386         goto cleanup;
387       }
388       deviceTypeStart += strlen(DS_TAG_DEVICE_TYPE);
389       deviceTypeEnd = findString(deviceTypeStart, contentEnd, DS_TAG_DEVICE_TYPE_END);
390       if (deviceTypeEnd == nullptr) {
391         status = DS_PROFILE_FILE_ERROR;
392         goto cleanup;
393       }
394       memcpy(&deviceType, deviceTypeStart, sizeof(ds_device_type));
395 
396       // parse the device name
397       if (deviceType == DS_DEVICE_OPENCL_DEVICE) {
398         deviceNameStart = findString(dataStart, contentEnd, DS_TAG_DEVICE_NAME);
399         if (deviceNameStart == nullptr) {
400           status = DS_PROFILE_FILE_ERROR;
401           goto cleanup;
402         }
403         deviceNameStart += strlen(DS_TAG_DEVICE_NAME);
404         deviceNameEnd = findString(deviceNameStart, contentEnd, DS_TAG_DEVICE_NAME_END);
405         if (deviceNameEnd == nullptr) {
406           status = DS_PROFILE_FILE_ERROR;
407           goto cleanup;
408         }
409 
410         deviceDriverStart = findString(dataStart, contentEnd, DS_TAG_DEVICE_DRIVER_VERSION);
411         if (deviceDriverStart == nullptr) {
412           status = DS_PROFILE_FILE_ERROR;
413           goto cleanup;
414         }
415         deviceDriverStart += strlen(DS_TAG_DEVICE_DRIVER_VERSION);
416         deviceDriverEnd =
417             findString(deviceDriverStart, contentEnd, DS_TAG_DEVICE_DRIVER_VERSION_END);
418         if (deviceDriverEnd == nullptr) {
419           status = DS_PROFILE_FILE_ERROR;
420           goto cleanup;
421         }
422 
423         // check if this device is on the system
424         for (i = 0; i < profile->numDevices; i++) {
425           if (profile->devices[i].type == DS_DEVICE_OPENCL_DEVICE) {
426             size_t actualDeviceNameLength;
427             size_t driverVersionLength;
428 
429             actualDeviceNameLength = strlen(profile->devices[i].oclDeviceName);
430             driverVersionLength = strlen(profile->devices[i].oclDriverVersion);
431             if (deviceNameStart + actualDeviceNameLength == deviceNameEnd &&
432                 deviceDriverStart + driverVersionLength == deviceDriverEnd &&
433                 strncmp(profile->devices[i].oclDeviceName, deviceNameStart,
434                         actualDeviceNameLength) == 0 &&
435                 strncmp(profile->devices[i].oclDriverVersion, deviceDriverStart,
436                         driverVersionLength) == 0) {
437               deviceScoreStart = findString(dataStart, contentEnd, DS_TAG_SCORE);
438               deviceScoreStart += strlen(DS_TAG_SCORE);
439               deviceScoreEnd = findString(deviceScoreStart, contentEnd, DS_TAG_SCORE_END);
440               status = deserializer(&profile->devices[i], (const unsigned char *)deviceScoreStart,
441                                     deviceScoreEnd - deviceScoreStart);
442               if (status != DS_SUCCESS) {
443                 goto cleanup;
444               }
445             }
446           }
447         }
448       } else if (deviceType == DS_DEVICE_NATIVE_CPU) {
449         for (i = 0; i < profile->numDevices; i++) {
450           if (profile->devices[i].type == DS_DEVICE_NATIVE_CPU) {
451             deviceScoreStart = findString(dataStart, contentEnd, DS_TAG_SCORE);
452             if (deviceScoreStart == nullptr) {
453               status = DS_PROFILE_FILE_ERROR;
454               goto cleanup;
455             }
456             deviceScoreStart += strlen(DS_TAG_SCORE);
457             deviceScoreEnd = findString(deviceScoreStart, contentEnd, DS_TAG_SCORE_END);
458             status = deserializer(&profile->devices[i], (const unsigned char *)deviceScoreStart,
459                                   deviceScoreEnd - deviceScoreStart);
460             if (status != DS_SUCCESS) {
461               goto cleanup;
462             }
463           }
464         }
465       }
466 
467       // skip over the current one to find the next device
468       currentPosition = dataEnd + strlen(DS_TAG_DEVICE_END);
469     }
470   }
471 cleanup:
472   delete[] contentStart;
473   return status;
474 }
475 
476 typedef ds_status (*ds_score_serializer)(ds_device *device, uint8_t **serializedScore,
477                                          unsigned int *serializedScoreSize);
writeProfileToFile(ds_profile * profile,ds_score_serializer serializer,const char * file)478 static ds_status writeProfileToFile(ds_profile *profile, ds_score_serializer serializer,
479                                     const char *file) {
480   ds_status status = DS_SUCCESS;
481 
482   if (profile == nullptr)
483     return DS_INVALID_PROFILE;
484 
485   FILE *profileFile = fopen(file, "wb");
486   if (profileFile == nullptr) {
487     status = DS_FILE_ERROR;
488   } else {
489     unsigned int i;
490 
491     // write version string
492     fwrite(DS_TAG_VERSION, sizeof(char), strlen(DS_TAG_VERSION), profileFile);
493     fwrite(profile->version, sizeof(char), strlen(profile->version), profileFile);
494     fwrite(DS_TAG_VERSION_END, sizeof(char), strlen(DS_TAG_VERSION_END), profileFile);
495     fwrite("\n", sizeof(char), 1, profileFile);
496 
497     for (i = 0; i < profile->numDevices && status == DS_SUCCESS; i++) {
498       uint8_t *serializedScore;
499       unsigned int serializedScoreSize;
500 
501       fwrite(DS_TAG_DEVICE, sizeof(char), strlen(DS_TAG_DEVICE), profileFile);
502 
503       fwrite(DS_TAG_DEVICE_TYPE, sizeof(char), strlen(DS_TAG_DEVICE_TYPE), profileFile);
504       fwrite(&profile->devices[i].type, sizeof(ds_device_type), 1, profileFile);
505       fwrite(DS_TAG_DEVICE_TYPE_END, sizeof(char), strlen(DS_TAG_DEVICE_TYPE_END), profileFile);
506 
507       switch (profile->devices[i].type) {
508         case DS_DEVICE_NATIVE_CPU: {
509           // There's no need to emit a device name for the native CPU device.
510           /*
511 fwrite(DS_TAG_DEVICE_NAME, sizeof(char), strlen(DS_TAG_DEVICE_NAME),
512        profileFile);
513 fwrite(DS_DEVICE_NATIVE_CPU_STRING,sizeof(char),
514        strlen(DS_DEVICE_NATIVE_CPU_STRING), profileFile);
515 fwrite(DS_TAG_DEVICE_NAME_END, sizeof(char),
516        strlen(DS_TAG_DEVICE_NAME_END), profileFile);
517 */
518         } break;
519         case DS_DEVICE_OPENCL_DEVICE: {
520           fwrite(DS_TAG_DEVICE_NAME, sizeof(char), strlen(DS_TAG_DEVICE_NAME), profileFile);
521           fwrite(profile->devices[i].oclDeviceName, sizeof(char),
522                  strlen(profile->devices[i].oclDeviceName), profileFile);
523           fwrite(DS_TAG_DEVICE_NAME_END, sizeof(char), strlen(DS_TAG_DEVICE_NAME_END), profileFile);
524 
525           fwrite(DS_TAG_DEVICE_DRIVER_VERSION, sizeof(char), strlen(DS_TAG_DEVICE_DRIVER_VERSION),
526                  profileFile);
527           fwrite(profile->devices[i].oclDriverVersion, sizeof(char),
528                  strlen(profile->devices[i].oclDriverVersion), profileFile);
529           fwrite(DS_TAG_DEVICE_DRIVER_VERSION_END, sizeof(char),
530                  strlen(DS_TAG_DEVICE_DRIVER_VERSION_END), profileFile);
531         } break;
532         default:
533           status = DS_UNKNOWN_DEVICE_TYPE;
534           continue;
535       };
536 
537       fwrite(DS_TAG_SCORE, sizeof(char), strlen(DS_TAG_SCORE), profileFile);
538       status = serializer(&profile->devices[i], &serializedScore, &serializedScoreSize);
539       if (status == DS_SUCCESS && serializedScore != nullptr && serializedScoreSize > 0) {
540         fwrite(serializedScore, sizeof(char), serializedScoreSize, profileFile);
541         delete[] serializedScore;
542       }
543       fwrite(DS_TAG_SCORE_END, sizeof(char), strlen(DS_TAG_SCORE_END), profileFile);
544       fwrite(DS_TAG_DEVICE_END, sizeof(char), strlen(DS_TAG_DEVICE_END), profileFile);
545       fwrite("\n", sizeof(char), 1, profileFile);
546     }
547     fclose(profileFile);
548   }
549   return status;
550 }
551 
552 // substitute invalid characters in device name with _
legalizeFileName(char * fileName)553 static void legalizeFileName(char *fileName) {
554   // tprintf("fileName: %s\n", fileName);
555   const char *invalidChars = "/\?:*\"><| "; // space is valid but can cause headaches
556   // for each invalid char
557   for (unsigned i = 0; i < strlen(invalidChars); i++) {
558     char invalidStr[4];
559     invalidStr[0] = invalidChars[i];
560     invalidStr[1] = '\0';
561     // tprintf("eliminating %s\n", invalidStr);
562     // char *pos = strstr(fileName, invalidStr);
563     // initial ./ is valid for present directory
564     // if (*pos == '.') pos++;
565     // if (*pos == '/') pos++;
566     for (char *pos = strstr(fileName, invalidStr); pos != nullptr;
567          pos = strstr(pos + 1, invalidStr)) {
568       // tprintf("\tfound: %s, ", pos);
569       pos[0] = '_';
570       // tprintf("fileName: %s\n", fileName);
571     }
572   }
573 }
574 
populateGPUEnvFromDevice(GPUEnv * gpuInfo,cl_device_id device)575 static void populateGPUEnvFromDevice(GPUEnv *gpuInfo, cl_device_id device) {
576   // tprintf("[DS] populateGPUEnvFromDevice\n");
577   size_t size;
578   gpuInfo->mnIsUserCreated = 1;
579   // device
580   gpuInfo->mpDevID = device;
581   gpuInfo->mpArryDevsID = new cl_device_id[1];
582   gpuInfo->mpArryDevsID[0] = gpuInfo->mpDevID;
583   clStatus = clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_TYPE, sizeof(cl_device_type),
584                              &gpuInfo->mDevType, &size);
585   CHECK_OPENCL(clStatus, "populateGPUEnv::getDeviceInfo(TYPE)");
586   // platform
587   clStatus = clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_PLATFORM, sizeof(cl_platform_id),
588                              &gpuInfo->mpPlatformID, &size);
589   CHECK_OPENCL(clStatus, "populateGPUEnv::getDeviceInfo(PLATFORM)");
590   // context
591   cl_context_properties props[3];
592   props[0] = CL_CONTEXT_PLATFORM;
593   props[1] = (cl_context_properties)gpuInfo->mpPlatformID;
594   props[2] = 0;
595   gpuInfo->mpContext = clCreateContext(props, 1, &gpuInfo->mpDevID, nullptr, nullptr, &clStatus);
596   CHECK_OPENCL(clStatus, "populateGPUEnv::createContext");
597   // queue
598   cl_command_queue_properties queueProperties = 0;
599   gpuInfo->mpCmdQueue =
600       clCreateCommandQueue(gpuInfo->mpContext, gpuInfo->mpDevID, queueProperties, &clStatus);
601   CHECK_OPENCL(clStatus, "populateGPUEnv::createCommandQueue");
602 }
603 
LoadOpencl()604 int OpenclDevice::LoadOpencl() {
605 #  ifdef WIN32
606   HINSTANCE HOpenclDll = nullptr;
607   void *OpenclDll = nullptr;
608   // fprintf(stderr, " LoadOpenclDllxx... \n");
609   OpenclDll = static_cast<HINSTANCE>(HOpenclDll);
610   OpenclDll = LoadLibrary("openCL.dll");
611   if (!static_cast<HINSTANCE>(OpenclDll)) {
612     fprintf(stderr, "[OD] Load opencl.dll failed!\n");
613     FreeLibrary(static_cast<HINSTANCE>(OpenclDll));
614     return 0;
615   }
616   fprintf(stderr, "[OD] Load opencl.dll successful!\n");
617 #  endif
618   return 1;
619 }
SetKernelEnv(KernelEnv * envInfo)620 int OpenclDevice::SetKernelEnv(KernelEnv *envInfo) {
621   envInfo->mpkContext = gpuEnv.mpContext;
622   envInfo->mpkCmdQueue = gpuEnv.mpCmdQueue;
623   envInfo->mpkProgram = gpuEnv.mpArryPrograms[0];
624 
625   return 1;
626 }
627 
allocateZeroCopyBuffer(const KernelEnv & rEnv,l_uint32 * hostbuffer,size_t nElements,cl_mem_flags flags,cl_int * pStatus)628 static cl_mem allocateZeroCopyBuffer(const KernelEnv &rEnv, l_uint32 *hostbuffer, size_t nElements,
629                                      cl_mem_flags flags, cl_int *pStatus) {
630   cl_mem membuffer = clCreateBuffer(rEnv.mpkContext, (cl_mem_flags)(flags),
631                                     nElements * sizeof(l_uint32), hostbuffer, pStatus);
632 
633   return membuffer;
634 }
635 
mapOutputCLBuffer(const KernelEnv & rEnv,cl_mem clbuffer,Image pixd,Image pixs,int elements,cl_mem_flags flags,bool memcopy=false,bool sync=true)636 static Image mapOutputCLBuffer(const KernelEnv &rEnv, cl_mem clbuffer, Image pixd, Image pixs,
637                               int elements, cl_mem_flags flags, bool memcopy = false,
638                               bool sync = true) {
639   if (!pixd) {
640     if (memcopy) {
641       if ((pixd = pixCreateTemplate(pixs)) == nullptr)
642         tprintf("pixd not made\n");
643     } else {
644       if ((pixd = pixCreateHeader(pixGetWidth(pixs), pixGetHeight(pixs), pixGetDepth(pixs))) ==
645           nullptr)
646         tprintf("pixd not made\n");
647     }
648   }
649   l_uint32 *pValues =
650       (l_uint32 *)clEnqueueMapBuffer(rEnv.mpkCmdQueue, clbuffer, CL_TRUE, flags, 0,
651                                      elements * sizeof(l_uint32), 0, nullptr, nullptr, nullptr);
652 
653   if (memcopy) {
654     memcpy(pixGetData(pixd), pValues, elements * sizeof(l_uint32));
655   } else {
656     pixSetData(pixd, pValues);
657   }
658 
659   clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, clbuffer, pValues, 0, nullptr, nullptr);
660 
661   if (sync) {
662     clFinish(rEnv.mpkCmdQueue);
663   }
664 
665   return pixd;
666 }
667 
releaseMorphCLBuffers()668 void OpenclDevice::releaseMorphCLBuffers() {
669   if (pixdCLIntermediate != nullptr)
670     clReleaseMemObject(pixdCLIntermediate);
671   if (pixsCLBuffer != nullptr)
672     clReleaseMemObject(pixsCLBuffer);
673   if (pixdCLBuffer != nullptr)
674     clReleaseMemObject(pixdCLBuffer);
675   if (pixThBuffer != nullptr)
676     clReleaseMemObject(pixThBuffer);
677   pixdCLIntermediate = pixsCLBuffer = pixdCLBuffer = pixThBuffer = nullptr;
678 }
679 
initMorphCLAllocations(l_int32 wpl,l_int32 h,Image pixs)680 int OpenclDevice::initMorphCLAllocations(l_int32 wpl, l_int32 h, Image pixs) {
681   SetKernelEnv(&rEnv);
682 
683   if (pixThBuffer != nullptr) {
684     pixsCLBuffer = allocateZeroCopyBuffer(rEnv, nullptr, wpl * h, CL_MEM_ALLOC_HOST_PTR, &clStatus);
685 
686     // Get the output from ThresholdToPix operation
687     clStatus = clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixThBuffer, pixsCLBuffer, 0, 0,
688                                    sizeof(l_uint32) * wpl * h, 0, nullptr, nullptr);
689   } else {
690     // Get data from the source image
691     l_uint32 *srcdata = reinterpret_cast<l_uint32 *>(malloc(wpl * h * sizeof(l_uint32)));
692     memcpy(srcdata, pixGetData(pixs), wpl * h * sizeof(l_uint32));
693 
694     pixsCLBuffer = allocateZeroCopyBuffer(rEnv, srcdata, wpl * h, CL_MEM_USE_HOST_PTR, &clStatus);
695   }
696 
697   pixdCLBuffer = allocateZeroCopyBuffer(rEnv, nullptr, wpl * h, CL_MEM_ALLOC_HOST_PTR, &clStatus);
698 
699   pixdCLIntermediate =
700       allocateZeroCopyBuffer(rEnv, nullptr, wpl * h, CL_MEM_ALLOC_HOST_PTR, &clStatus);
701 
702   return (int)clStatus;
703 }
704 
InitEnv()705 int OpenclDevice::InitEnv() {
706 //    tprintf("[OD] OpenclDevice::InitEnv()\n");
707 #  ifdef SAL_WIN32
708   while (1) {
709     if (1 == LoadOpencl())
710       break;
711   }
712 #  endif
713   // sets up environment, compiles programs
714 
715   InitOpenclRunEnv_DeviceSelection(0);
716   return 1;
717 }
718 
ReleaseOpenclRunEnv()719 int OpenclDevice::ReleaseOpenclRunEnv() {
720   ReleaseOpenclEnv(&gpuEnv);
721 #  ifdef SAL_WIN32
722   FreeOpenclDll();
723 #  endif
724   return 1;
725 }
726 
AddKernelConfig(int kCount,const char * kName)727 inline int OpenclDevice::AddKernelConfig(int kCount, const char *kName) {
728   ASSERT_HOST(kCount > 0);
729   ASSERT_HOST(strlen(kName) < sizeof(gpuEnv.mArrykernelNames[kCount - 1]));
730   strcpy(gpuEnv.mArrykernelNames[kCount - 1], kName);
731   gpuEnv.mnKernelCount++;
732   return 0;
733 }
734 
RegistOpenclKernel()735 int OpenclDevice::RegistOpenclKernel() {
736   if (!gpuEnv.mnIsUserCreated)
737     memset(&gpuEnv, 0, sizeof(gpuEnv));
738 
739   gpuEnv.mnFileCount = 0; // argc;
740   gpuEnv.mnKernelCount = 0UL;
741 
742   AddKernelConfig(1, "oclAverageSub1");
743   return 0;
744 }
745 
InitOpenclRunEnv_DeviceSelection(int argc)746 int OpenclDevice::InitOpenclRunEnv_DeviceSelection(int argc) {
747   if (!isInited) {
748     // after programs compiled, selects best device
749     ds_device bestDevice_DS = getDeviceSelection();
750     cl_device_id bestDevice = bestDevice_DS.oclDeviceID;
751     // overwrite global static GPUEnv with new device
752     if (selectedDeviceIsOpenCL()) {
753       // tprintf("[DS] InitOpenclRunEnv_DS::Calling populateGPUEnvFromDevice()
754       // for selected device\n");
755       populateGPUEnvFromDevice(&gpuEnv, bestDevice);
756       gpuEnv.mnFileCount = 0; // argc;
757       gpuEnv.mnKernelCount = 0UL;
758       CompileKernelFile(&gpuEnv, "");
759     } else {
760       // tprintf("[DS] InitOpenclRunEnv_DS::Skipping populateGPUEnvFromDevice()
761       // b/c native cpu selected\n");
762     }
763     isInited = 1;
764   }
765   return 0;
766 }
767 
OpenclDevice()768 OpenclDevice::OpenclDevice() {
769   // InitEnv();
770 }
771 
~OpenclDevice()772 OpenclDevice::~OpenclDevice() {
773   // ReleaseOpenclRunEnv();
774 }
775 
ReleaseOpenclEnv(GPUEnv * gpuInfo)776 int OpenclDevice::ReleaseOpenclEnv(GPUEnv *gpuInfo) {
777   int i = 0;
778   int clStatus = 0;
779 
780   if (!isInited) {
781     return 1;
782   }
783 
784   for (i = 0; i < gpuEnv.mnFileCount; i++) {
785     if (gpuEnv.mpArryPrograms[i]) {
786       clStatus = clReleaseProgram(gpuEnv.mpArryPrograms[i]);
787       CHECK_OPENCL(clStatus, "clReleaseProgram");
788       gpuEnv.mpArryPrograms[i] = nullptr;
789     }
790   }
791   if (gpuEnv.mpCmdQueue) {
792     clReleaseCommandQueue(gpuEnv.mpCmdQueue);
793     gpuEnv.mpCmdQueue = nullptr;
794   }
795   if (gpuEnv.mpContext) {
796     clReleaseContext(gpuEnv.mpContext);
797     gpuEnv.mpContext = nullptr;
798   }
799   isInited = 0;
800   gpuInfo->mnIsUserCreated = 0;
801   delete[] gpuInfo->mpArryDevsID;
802   return 1;
803 }
BinaryGenerated(const char * clFileName,FILE ** fhandle)804 int OpenclDevice::BinaryGenerated(const char *clFileName, FILE **fhandle) {
805   unsigned int i = 0;
806   cl_int clStatus;
807   int status = 0;
808   FILE *fd = nullptr;
809   char fileName[256] = {0}, cl_name[128] = {0};
810   char deviceName[1024];
811   clStatus = clGetDeviceInfo(gpuEnv.mpArryDevsID[i], CL_DEVICE_NAME, sizeof(deviceName), deviceName,
812                              nullptr);
813   CHECK_OPENCL(clStatus, "clGetDeviceInfo");
814   const char *str = strstr(clFileName, ".cl");
815   memcpy(cl_name, clFileName, str - clFileName);
816   cl_name[str - clFileName] = '\0';
817   sprintf(fileName, "%s-%s.bin", cl_name, deviceName);
818   legalizeFileName(fileName);
819   fd = fopen(fileName, "rb");
820   status = (fd != nullptr) ? 1 : 0;
821   if (fd != nullptr) {
822     *fhandle = fd;
823   }
824   return status;
825 }
CachedOfKernerPrg(const GPUEnv * gpuEnvCached,const char * clFileName)826 int OpenclDevice::CachedOfKernerPrg(const GPUEnv *gpuEnvCached, const char *clFileName) {
827   int i;
828   for (i = 0; i < gpuEnvCached->mnFileCount; i++) {
829     if (strcasecmp(gpuEnvCached->mArryKnelSrcFile[i], clFileName) == 0) {
830       if (gpuEnvCached->mpArryPrograms[i] != nullptr) {
831         return 1;
832       }
833     }
834   }
835 
836   return 0;
837 }
WriteBinaryToFile(const char * fileName,const char * birary,size_t numBytes)838 int OpenclDevice::WriteBinaryToFile(const char *fileName, const char *birary, size_t numBytes) {
839   FILE *output = nullptr;
840   output = fopen(fileName, "wb");
841   if (output == nullptr) {
842     return 0;
843   }
844 
845   fwrite(birary, sizeof(char), numBytes, output);
846   fclose(output);
847 
848   return 1;
849 }
850 
GeneratBinFromKernelSource(cl_program program,const char * clFileName)851 int OpenclDevice::GeneratBinFromKernelSource(cl_program program, const char *clFileName) {
852   unsigned int i = 0;
853   cl_int clStatus;
854   cl_uint numDevices;
855 
856   clStatus =
857       clGetProgramInfo(program, CL_PROGRAM_NUM_DEVICES, sizeof(numDevices), &numDevices, nullptr);
858   CHECK_OPENCL(clStatus, "clGetProgramInfo");
859 
860   std::vector<cl_device_id> mpArryDevsID(numDevices);
861 
862   /* grab the handles to all of the devices in the program. */
863   clStatus = clGetProgramInfo(program, CL_PROGRAM_DEVICES, sizeof(cl_device_id) * numDevices,
864                               &mpArryDevsID[0], nullptr);
865   CHECK_OPENCL(clStatus, "clGetProgramInfo");
866 
867   /* figure out the sizes of each of the binaries. */
868   std::vector<size_t> binarySizes(numDevices);
869 
870   clStatus = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t) * numDevices,
871                               &binarySizes[0], nullptr);
872   CHECK_OPENCL(clStatus, "clGetProgramInfo");
873 
874   /* copy over all of the generated binaries. */
875   std::vector<char *> binaries(numDevices);
876 
877   for (i = 0; i < numDevices; i++) {
878     if (binarySizes[i] != 0) {
879       binaries[i] = new char[binarySizes[i]];
880     } else {
881       binaries[i] = nullptr;
882     }
883   }
884 
885   clStatus = clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(char *) * numDevices,
886                               &binaries[0], nullptr);
887   CHECK_OPENCL(clStatus, "clGetProgramInfo");
888 
889   /* dump out each binary into its own separate file. */
890   for (i = 0; i < numDevices; i++) {
891     char fileName[256] = {0}, cl_name[128] = {0};
892 
893     if (binarySizes[i] != 0) {
894       char deviceName[1024];
895       clStatus =
896           clGetDeviceInfo(mpArryDevsID[i], CL_DEVICE_NAME, sizeof(deviceName), deviceName, nullptr);
897       CHECK_OPENCL(clStatus, "clGetDeviceInfo");
898 
899       const char *str = strstr(clFileName, ".cl");
900       memcpy(cl_name, clFileName, str - clFileName);
901       cl_name[str - clFileName] = '\0';
902       sprintf(fileName, "%s-%s.bin", cl_name, deviceName);
903       legalizeFileName(fileName);
904       if (!WriteBinaryToFile(fileName, binaries[i], binarySizes[i])) {
905         tprintf("[OD] write binary[%s] failed\n", fileName);
906         return 0;
907       } // else
908       tprintf("[OD] write binary[%s] successfully\n", fileName);
909     }
910   }
911 
912   // Release all resources and memory
913   for (i = 0; i < numDevices; i++) {
914     delete[] binaries[i];
915   }
916 
917   return 1;
918 }
919 
CompileKernelFile(GPUEnv * gpuInfo,const char * buildOption)920 int OpenclDevice::CompileKernelFile(GPUEnv *gpuInfo, const char *buildOption) {
921   cl_int clStatus = 0;
922   const char *source;
923   size_t source_size[1];
924   int binary_status, binaryExisted, idx;
925   cl_uint numDevices;
926   FILE *fd, *fd1;
927   const char *filename = "kernel.cl";
928   // fprintf(stderr, "[OD] CompileKernelFile ... \n");
929   if (CachedOfKernerPrg(gpuInfo, filename) == 1) {
930     return 1;
931   }
932 
933   idx = gpuInfo->mnFileCount;
934 
935   source = kernel_src;
936 
937   source_size[0] = strlen(source);
938   binaryExisted = 0;
939   binaryExisted = BinaryGenerated(filename, &fd); // don't check for binary during microbenchmark
940   if (binaryExisted == 1) {
941     clStatus = clGetContextInfo(gpuInfo->mpContext, CL_CONTEXT_NUM_DEVICES, sizeof(numDevices),
942                                 &numDevices, nullptr);
943     CHECK_OPENCL(clStatus, "clGetContextInfo");
944 
945     std::vector<cl_device_id> mpArryDevsID(numDevices);
946     bool b_error = fseek(fd, 0, SEEK_END) < 0;
947     auto pos = std::ftell(fd);
948     b_error |= (pos <= 0);
949     size_t length = pos;
950     b_error |= fseek(fd, 0, SEEK_SET) < 0;
951     if (b_error) {
952       fclose(fd);
953       return 0;
954     }
955 
956     std::vector<uint8_t> binary(length + 2);
957 
958     memset(&binary[0], 0, length + 2);
959     b_error |= fread(&binary[0], 1, length, fd) != length;
960 
961     fclose(fd);
962     fd = nullptr;
963     // grab the handles to all of the devices in the context.
964     clStatus = clGetContextInfo(gpuInfo->mpContext, CL_CONTEXT_DEVICES,
965                                 sizeof(cl_device_id) * numDevices, &mpArryDevsID[0], nullptr);
966     CHECK_OPENCL(clStatus, "clGetContextInfo");
967     // fprintf(stderr, "[OD] Create kernel from binary\n");
968     const uint8_t *c_binary = &binary[0];
969     gpuInfo->mpArryPrograms[idx] =
970         clCreateProgramWithBinary(gpuInfo->mpContext, numDevices, &mpArryDevsID[0], &length,
971                                   &c_binary, &binary_status, &clStatus);
972     CHECK_OPENCL(clStatus, "clCreateProgramWithBinary");
973   } else {
974     // create a CL program using the kernel source
975     // fprintf(stderr, "[OD] Create kernel from source\n");
976     gpuInfo->mpArryPrograms[idx] =
977         clCreateProgramWithSource(gpuInfo->mpContext, 1, &source, source_size, &clStatus);
978     CHECK_OPENCL(clStatus, "clCreateProgramWithSource");
979   }
980 
981   if (gpuInfo->mpArryPrograms[idx] == (cl_program) nullptr) {
982     return 0;
983   }
984 
985   // char options[512];
986   // create a cl program executable for all the devices specified
987   // tprintf("[OD] BuildProgram.\n");
988   if (!gpuInfo->mnIsUserCreated) {
989     clStatus = clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, gpuInfo->mpArryDevsID, buildOption,
990                               nullptr, nullptr);
991   } else {
992     clStatus = clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, &(gpuInfo->mpDevID), buildOption,
993                               nullptr, nullptr);
994   }
995   if (clStatus != CL_SUCCESS) {
996     tprintf("BuildProgram error!\n");
997     size_t length;
998     if (!gpuInfo->mnIsUserCreated) {
999       clStatus = clGetProgramBuildInfo(gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0],
1000                                        CL_PROGRAM_BUILD_LOG, 0, nullptr, &length);
1001     } else {
1002       clStatus = clGetProgramBuildInfo(gpuInfo->mpArryPrograms[idx], gpuInfo->mpDevID,
1003                                        CL_PROGRAM_BUILD_LOG, 0, nullptr, &length);
1004     }
1005     if (clStatus != CL_SUCCESS) {
1006       tprintf("opencl create build log fail\n");
1007       return 0;
1008     }
1009     std::vector<char> buildLog(length);
1010     if (!gpuInfo->mnIsUserCreated) {
1011       clStatus = clGetProgramBuildInfo(gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0],
1012                                        CL_PROGRAM_BUILD_LOG, length, &buildLog[0], &length);
1013     } else {
1014       clStatus = clGetProgramBuildInfo(gpuInfo->mpArryPrograms[idx], gpuInfo->mpDevID,
1015                                        CL_PROGRAM_BUILD_LOG, length, &buildLog[0], &length);
1016     }
1017     if (clStatus != CL_SUCCESS) {
1018       tprintf("opencl program build info fail\n");
1019       return 0;
1020     }
1021 
1022     fd1 = fopen("kernel-build.log", "w+");
1023     if (fd1 != nullptr) {
1024       fwrite(&buildLog[0], sizeof(char), length, fd1);
1025       fclose(fd1);
1026     }
1027 
1028     return 0;
1029   }
1030 
1031   strcpy(gpuInfo->mArryKnelSrcFile[idx], filename);
1032   if (binaryExisted == 0) {
1033     GeneratBinFromKernelSource(gpuInfo->mpArryPrograms[idx], filename);
1034   }
1035 
1036   gpuInfo->mnFileCount += 1;
1037   return 1;
1038 }
1039 
pixReadFromTiffKernel(l_uint32 * tiffdata,l_int32 w,l_int32 h,l_int32 wpl,l_uint32 * line)1040 l_uint32 *OpenclDevice::pixReadFromTiffKernel(l_uint32 *tiffdata, l_int32 w, l_int32 h, l_int32 wpl,
1041                                               l_uint32 *line) {
1042   cl_int clStatus;
1043   KernelEnv rEnv;
1044   size_t globalThreads[2];
1045   size_t localThreads[2];
1046   int gsize;
1047   cl_mem valuesCl;
1048   cl_mem outputCl;
1049 
1050   // global and local work dimensions for Horizontal pass
1051   gsize = (w + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1052   globalThreads[0] = gsize;
1053   gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1054   globalThreads[1] = gsize;
1055   localThreads[0] = GROUPSIZE_X;
1056   localThreads[1] = GROUPSIZE_Y;
1057 
1058   SetKernelEnv(&rEnv);
1059 
1060   l_uint32 *pResult = (l_uint32 *)malloc(w * h * sizeof(l_uint32));
1061   rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "composeRGBPixel", &clStatus);
1062   CHECK_OPENCL(clStatus, "clCreateKernel composeRGBPixel");
1063 
1064   // Allocate input and output OCL buffers
1065   valuesCl = allocateZeroCopyBuffer(rEnv, tiffdata, w * h, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1066                                     &clStatus);
1067   outputCl = allocateZeroCopyBuffer(rEnv, pResult, w * h, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
1068                                     &clStatus);
1069 
1070   // Kernel arguments
1071   clStatus = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &valuesCl);
1072   CHECK_OPENCL(clStatus, "clSetKernelArg");
1073   clStatus = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(w), &w);
1074   CHECK_OPENCL(clStatus, "clSetKernelArg");
1075   clStatus = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(h), &h);
1076   CHECK_OPENCL(clStatus, "clSetKernelArg");
1077   clStatus = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl);
1078   CHECK_OPENCL(clStatus, "clSetKernelArg");
1079   clStatus = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(cl_mem), &outputCl);
1080   CHECK_OPENCL(clStatus, "clSetKernelArg");
1081 
1082   // Kernel enqueue
1083   clStatus = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr, globalThreads,
1084                                     localThreads, 0, nullptr, nullptr);
1085   CHECK_OPENCL(clStatus, "clEnqueueNDRangeKernel");
1086 
1087   /* map results back from gpu */
1088   void *ptr = clEnqueueMapBuffer(rEnv.mpkCmdQueue, outputCl, CL_TRUE, CL_MAP_READ, 0,
1089                                  w * h * sizeof(l_uint32), 0, nullptr, nullptr, &clStatus);
1090   CHECK_OPENCL(clStatus, "clEnqueueMapBuffer outputCl");
1091   clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, outputCl, ptr, 0, nullptr, nullptr);
1092 
1093   // Sync
1094   clFinish(rEnv.mpkCmdQueue);
1095   return pResult;
1096 }
1097 
1098 // Morphology Dilate operation for 5x5 structuring element. Invokes the relevant
1099 // OpenCL kernels
pixDilateCL_55(l_int32 wpl,l_int32 h)1100 static cl_int pixDilateCL_55(l_int32 wpl, l_int32 h) {
1101   size_t globalThreads[2];
1102   cl_mem pixtemp;
1103   cl_int status;
1104   int gsize;
1105   size_t localThreads[2];
1106 
1107   // Horizontal pass
1108   gsize = (wpl * h + GROUPSIZE_HMORX - 1) / GROUPSIZE_HMORX * GROUPSIZE_HMORX;
1109   globalThreads[0] = gsize;
1110   globalThreads[1] = GROUPSIZE_HMORY;
1111   localThreads[0] = GROUPSIZE_HMORX;
1112   localThreads[1] = GROUPSIZE_HMORY;
1113 
1114   rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "morphoDilateHor_5x5", &status);
1115   CHECK_OPENCL(status, "clCreateKernel morphoDilateHor_5x5");
1116 
1117   status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1118   status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1119   status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl);
1120   status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h);
1121 
1122   status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr, globalThreads,
1123                                   localThreads, 0, nullptr, nullptr);
1124 
1125   // Swap source and dest buffers
1126   pixtemp = pixsCLBuffer;
1127   pixsCLBuffer = pixdCLBuffer;
1128   pixdCLBuffer = pixtemp;
1129 
1130   // Vertical
1131   gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1132   globalThreads[0] = gsize;
1133   gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1134   globalThreads[1] = gsize;
1135   localThreads[0] = GROUPSIZE_X;
1136   localThreads[1] = GROUPSIZE_Y;
1137 
1138   rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "morphoDilateVer_5x5", &status);
1139   CHECK_OPENCL(status, "clCreateKernel morphoDilateVer_5x5");
1140 
1141   status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1142   status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1143   status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl);
1144   status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h);
1145   status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr, globalThreads,
1146                                   localThreads, 0, nullptr, nullptr);
1147 
1148   return status;
1149 }
1150 
1151 // Morphology Erode operation for 5x5 structuring element. Invokes the relevant
1152 // OpenCL kernels
pixErodeCL_55(l_int32 wpl,l_int32 h)1153 static cl_int pixErodeCL_55(l_int32 wpl, l_int32 h) {
1154   size_t globalThreads[2];
1155   cl_mem pixtemp;
1156   cl_int status;
1157   int gsize;
1158   l_uint32 fwmask, lwmask;
1159   size_t localThreads[2];
1160 
1161   lwmask = lmask32[31 - 2];
1162   fwmask = rmask32[31 - 2];
1163 
1164   // Horizontal pass
1165   gsize = (wpl * h + GROUPSIZE_HMORX - 1) / GROUPSIZE_HMORX * GROUPSIZE_HMORX;
1166   globalThreads[0] = gsize;
1167   globalThreads[1] = GROUPSIZE_HMORY;
1168   localThreads[0] = GROUPSIZE_HMORX;
1169   localThreads[1] = GROUPSIZE_HMORY;
1170 
1171   rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "morphoErodeHor_5x5", &status);
1172   CHECK_OPENCL(status, "clCreateKernel morphoErodeHor_5x5");
1173 
1174   status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1175   status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1176   status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl);
1177   status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h);
1178 
1179   status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr, globalThreads,
1180                                   localThreads, 0, nullptr, nullptr);
1181 
1182   // Swap source and dest buffers
1183   pixtemp = pixsCLBuffer;
1184   pixsCLBuffer = pixdCLBuffer;
1185   pixdCLBuffer = pixtemp;
1186 
1187   // Vertical
1188   gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1189   globalThreads[0] = gsize;
1190   gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1191   globalThreads[1] = gsize;
1192   localThreads[0] = GROUPSIZE_X;
1193   localThreads[1] = GROUPSIZE_Y;
1194 
1195   rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "morphoErodeVer_5x5", &status);
1196   CHECK_OPENCL(status, "clCreateKernel morphoErodeVer_5x5");
1197 
1198   status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1199   status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1200   status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl);
1201   status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h);
1202   status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(fwmask), &fwmask);
1203   status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(lwmask), &lwmask);
1204   status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr, globalThreads,
1205                                   localThreads, 0, nullptr, nullptr);
1206 
1207   return status;
1208 }
1209 
1210 // Morphology Dilate operation. Invokes the relevant OpenCL kernels
pixDilateCL(l_int32 hsize,l_int32 vsize,l_int32 wpl,l_int32 h)1211 static cl_int pixDilateCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h) {
1212   l_int32 xp, yp, xn, yn;
1213   SEL *sel;
1214   size_t globalThreads[2];
1215   cl_mem pixtemp;
1216   cl_int status = 0;
1217   int gsize;
1218   size_t localThreads[2];
1219   char isEven;
1220 
1221   OpenclDevice::SetKernelEnv(&rEnv);
1222 
1223   if (hsize == 5 && vsize == 5) {
1224     // Specific case for 5x5
1225     status = pixDilateCL_55(wpl, h);
1226     return status;
1227   }
1228 
1229   sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT);
1230 
1231   selFindMaxTranslations(sel, &xp, &yp, &xn, &yn);
1232   selDestroy(&sel);
1233   // global and local work dimensions for Horizontal pass
1234   gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1235   globalThreads[0] = gsize;
1236   gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1237   globalThreads[1] = gsize;
1238   localThreads[0] = GROUPSIZE_X;
1239   localThreads[1] = GROUPSIZE_Y;
1240 
1241   if (xp > 31 || xn > 31) {
1242     // Generic case.
1243     rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "morphoDilateHor", &status);
1244     CHECK_OPENCL(status, "clCreateKernel morphoDilateHor");
1245 
1246     status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1247     status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1248     status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(xp), &xp);
1249     status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(xn), &xn);
1250     status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(wpl), &wpl);
1251     status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(h), &h);
1252     status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr, globalThreads,
1253                                     localThreads, 0, nullptr, nullptr);
1254 
1255     if (yp > 0 || yn > 0) {
1256       pixtemp = pixsCLBuffer;
1257       pixsCLBuffer = pixdCLBuffer;
1258       pixdCLBuffer = pixtemp;
1259     }
1260   } else if (xp > 0 || xn > 0) {
1261     // Specific Horizontal pass kernel for half width < 32
1262     rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "morphoDilateHor_32word", &status);
1263     CHECK_OPENCL(status, "clCreateKernel morphoDilateHor_32word");
1264     isEven = (xp != xn);
1265 
1266     status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1267     status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1268     status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(xp), &xp);
1269     status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl);
1270     status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(h), &h);
1271     status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(isEven), &isEven);
1272     status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr, globalThreads,
1273                                     localThreads, 0, nullptr, nullptr);
1274 
1275     if (yp > 0 || yn > 0) {
1276       pixtemp = pixsCLBuffer;
1277       pixsCLBuffer = pixdCLBuffer;
1278       pixdCLBuffer = pixtemp;
1279     }
1280   }
1281 
1282   if (yp > 0 || yn > 0) {
1283     rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "morphoDilateVer", &status);
1284     CHECK_OPENCL(status, "clCreateKernel morphoDilateVer");
1285 
1286     status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1287     status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1288     status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(yp), &yp);
1289     status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl);
1290     status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(h), &h);
1291     status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(yn), &yn);
1292     status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr, globalThreads,
1293                                     localThreads, 0, nullptr, nullptr);
1294   }
1295 
1296   return status;
1297 }
1298 
1299 // Morphology Erode operation. Invokes the relevant OpenCL kernels
pixErodeCL(l_int32 hsize,l_int32 vsize,l_uint32 wpl,l_uint32 h)1300 static cl_int pixErodeCL(l_int32 hsize, l_int32 vsize, l_uint32 wpl, l_uint32 h) {
1301   l_int32 xp, yp, xn, yn;
1302   SEL *sel;
1303   size_t globalThreads[2];
1304   size_t localThreads[2];
1305   cl_mem pixtemp;
1306   cl_int status = 0;
1307   int gsize;
1308   char isAsymmetric = (MORPH_BC == ASYMMETRIC_MORPH_BC);
1309   l_uint32 rwmask, lwmask;
1310   char isEven;
1311 
1312   sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT);
1313 
1314   selFindMaxTranslations(sel, &xp, &yp, &xn, &yn);
1315   selDestroy(&sel);
1316   OpenclDevice::SetKernelEnv(&rEnv);
1317 
1318   if (hsize == 5 && vsize == 5 && isAsymmetric) {
1319     // Specific kernel for 5x5
1320     status = pixErodeCL_55(wpl, h);
1321     return status;
1322   }
1323 
1324   lwmask = lmask32[31 - (xn & 31)];
1325   rwmask = rmask32[31 - (xp & 31)];
1326 
1327   // global and local work dimensions for Horizontal pass
1328   gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1329   globalThreads[0] = gsize;
1330   gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1331   globalThreads[1] = gsize;
1332   localThreads[0] = GROUPSIZE_X;
1333   localThreads[1] = GROUPSIZE_Y;
1334 
1335   // Horizontal Pass
1336   if (xp > 31 || xn > 31) {
1337     // Generic case.
1338     rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "morphoErodeHor", &status);
1339 
1340     status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1341     status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1342     status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(xp), &xp);
1343     status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(xn), &xn);
1344     status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(wpl), &wpl);
1345     status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(h), &h);
1346     status = clSetKernelArg(rEnv.mpkKernel, 6, sizeof(isAsymmetric), &isAsymmetric);
1347     status = clSetKernelArg(rEnv.mpkKernel, 7, sizeof(rwmask), &rwmask);
1348     status = clSetKernelArg(rEnv.mpkKernel, 8, sizeof(lwmask), &lwmask);
1349     status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr, globalThreads,
1350                                     localThreads, 0, nullptr, nullptr);
1351 
1352     if (yp > 0 || yn > 0) {
1353       pixtemp = pixsCLBuffer;
1354       pixsCLBuffer = pixdCLBuffer;
1355       pixdCLBuffer = pixtemp;
1356     }
1357   } else if (xp > 0 || xn > 0) {
1358     rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "morphoErodeHor_32word", &status);
1359     isEven = (xp != xn);
1360 
1361     status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1362     status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1363     status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(xp), &xp);
1364     status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl);
1365     status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(h), &h);
1366     status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(isAsymmetric), &isAsymmetric);
1367     status = clSetKernelArg(rEnv.mpkKernel, 6, sizeof(rwmask), &rwmask);
1368     status = clSetKernelArg(rEnv.mpkKernel, 7, sizeof(lwmask), &lwmask);
1369     status = clSetKernelArg(rEnv.mpkKernel, 8, sizeof(isEven), &isEven);
1370     status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr, globalThreads,
1371                                     localThreads, 0, nullptr, nullptr);
1372 
1373     if (yp > 0 || yn > 0) {
1374       pixtemp = pixsCLBuffer;
1375       pixsCLBuffer = pixdCLBuffer;
1376       pixdCLBuffer = pixtemp;
1377     }
1378   }
1379 
1380   // Vertical Pass
1381   if (yp > 0 || yn > 0) {
1382     rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "morphoErodeVer", &status);
1383     CHECK_OPENCL(status, "clCreateKernel morphoErodeVer");
1384 
1385     status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1386     status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1387     status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(yp), &yp);
1388     status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl);
1389     status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(h), &h);
1390     status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(isAsymmetric), &isAsymmetric);
1391     status = clSetKernelArg(rEnv.mpkKernel, 6, sizeof(yn), &yn);
1392     status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr, globalThreads,
1393                                     localThreads, 0, nullptr, nullptr);
1394   }
1395 
1396   return status;
1397 }
1398 
1399 // Morphology Open operation. Invokes the relevant OpenCL kernels
pixOpenCL(l_int32 hsize,l_int32 vsize,l_int32 wpl,l_int32 h)1400 static cl_int pixOpenCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h) {
1401   cl_int status;
1402   cl_mem pixtemp;
1403 
1404   // Erode followed by Dilate
1405   status = pixErodeCL(hsize, vsize, wpl, h);
1406 
1407   pixtemp = pixsCLBuffer;
1408   pixsCLBuffer = pixdCLBuffer;
1409   pixdCLBuffer = pixtemp;
1410 
1411   status = pixDilateCL(hsize, vsize, wpl, h);
1412 
1413   return status;
1414 }
1415 
1416 // Morphology Close operation. Invokes the relevant OpenCL kernels
pixCloseCL(l_int32 hsize,l_int32 vsize,l_int32 wpl,l_int32 h)1417 static cl_int pixCloseCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h) {
1418   cl_int status;
1419   cl_mem pixtemp;
1420 
1421   // Dilate followed by Erode
1422   status = pixDilateCL(hsize, vsize, wpl, h);
1423 
1424   pixtemp = pixsCLBuffer;
1425   pixsCLBuffer = pixdCLBuffer;
1426   pixdCLBuffer = pixtemp;
1427 
1428   status = pixErodeCL(hsize, vsize, wpl, h);
1429 
1430   return status;
1431 }
1432 
1433 // output = buffer1 & ~(buffer2)
pixSubtractCL_work(l_uint32 wpl,l_uint32 h,cl_mem buffer1,cl_mem buffer2)1434 static cl_int pixSubtractCL_work(l_uint32 wpl, l_uint32 h, cl_mem buffer1, cl_mem buffer2) {
1435   cl_int status;
1436   size_t globalThreads[2];
1437   int gsize;
1438   size_t localThreads[] = {GROUPSIZE_X, GROUPSIZE_Y};
1439 
1440   gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1441   globalThreads[0] = gsize;
1442   gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1443   globalThreads[1] = gsize;
1444 
1445   rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "pixSubtract_inplace", &status);
1446   CHECK_OPENCL(status, "clCreateKernel pixSubtract_inplace");
1447 
1448   // Enqueue a kernel run call.
1449   status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &buffer1);
1450   status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &buffer2);
1451   status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl);
1452   status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h);
1453   status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr, globalThreads,
1454                                   localThreads, 0, nullptr, nullptr);
1455 
1456   return status;
1457 }
1458 
1459 // OpenCL implementation of Get Lines from pix function
1460 // Note: Assumes the source and dest opencl buffer are initialized. No check
1461 // done
pixGetLinesCL(Image pixd,Image pixs,Image * pix_vline,Image * pix_hline,Image * pixClosed,bool getpixClosed,l_int32 close_hsize,l_int32 close_vsize,l_int32 open_hsize,l_int32 open_vsize,l_int32 line_hsize,l_int32 line_vsize)1462 void OpenclDevice::pixGetLinesCL(Image pixd, Image pixs, Image *pix_vline, Image *pix_hline,
1463                                  Image *pixClosed, bool getpixClosed, l_int32 close_hsize,
1464                                  l_int32 close_vsize, l_int32 open_hsize, l_int32 open_vsize,
1465                                  l_int32 line_hsize, l_int32 line_vsize) {
1466   l_uint32 wpl, h;
1467   cl_mem pixtemp;
1468 
1469   wpl = pixGetWpl(pixs);
1470   h = pixGetHeight(pixs);
1471 
1472   // First step : Close Morph operation: Dilate followed by Erode
1473   clStatus = pixCloseCL(close_hsize, close_vsize, wpl, h);
1474 
1475   // Copy the Close output to CPU buffer
1476   if (getpixClosed) {
1477     *pixClosed =
1478         mapOutputCLBuffer(rEnv, pixdCLBuffer, *pixClosed, pixs, wpl * h, CL_MAP_READ, true, false);
1479   }
1480 
1481   // Store the output of close operation in an intermediate buffer
1482   // this will be later used for pixsubtract
1483   clStatus = clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0, 0,
1484                                  sizeof(int) * wpl * h, 0, nullptr, nullptr);
1485 
1486   // Second step: Open Operation - Erode followed by Dilate
1487   pixtemp = pixsCLBuffer;
1488   pixsCLBuffer = pixdCLBuffer;
1489   pixdCLBuffer = pixtemp;
1490 
1491   clStatus = pixOpenCL(open_hsize, open_vsize, wpl, h);
1492 
1493   // Third step: Subtract : (Close - Open)
1494   pixtemp = pixsCLBuffer;
1495   pixsCLBuffer = pixdCLBuffer;
1496   pixdCLBuffer = pixdCLIntermediate;
1497   pixdCLIntermediate = pixtemp;
1498 
1499   clStatus = pixSubtractCL_work(wpl, h, pixdCLBuffer, pixsCLBuffer);
1500 
1501   // Store the output of Hollow operation in an intermediate buffer
1502   // this will be later used
1503   clStatus = clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0, 0,
1504                                  sizeof(int) * wpl * h, 0, nullptr, nullptr);
1505 
1506   pixtemp = pixsCLBuffer;
1507   pixsCLBuffer = pixdCLBuffer;
1508   pixdCLBuffer = pixtemp;
1509 
1510   // Fourth step: Get vertical line
1511   // pixOpenBrick(nullptr, pix_hollow, 1, min_line_length);
1512   clStatus = pixOpenCL(1, line_vsize, wpl, h);
1513 
1514   // Copy the vertical line output to CPU buffer
1515   *pix_vline =
1516       mapOutputCLBuffer(rEnv, pixdCLBuffer, *pix_vline, pixs, wpl * h, CL_MAP_READ, true, false);
1517 
1518   pixtemp = pixsCLBuffer;
1519   pixsCLBuffer = pixdCLIntermediate;
1520   pixdCLIntermediate = pixtemp;
1521 
1522   // Fifth step: Get horizontal line
1523   // pixOpenBrick(nullptr, pix_hollow, min_line_length, 1);
1524   clStatus = pixOpenCL(line_hsize, 1, wpl, h);
1525 
1526   // Copy the horizontal line output to CPU buffer
1527   *pix_hline =
1528       mapOutputCLBuffer(rEnv, pixdCLBuffer, *pix_hline, pixs, wpl * h, CL_MAP_READ, true, true);
1529 
1530   return;
1531 }
1532 
1533 /*************************************************************************
1534  *  HistogramRect
1535  *  Otsu Thresholding Operations
1536  *  histogramAllChannels is laid out as all channel 0, then all channel 1...
1537  *  only supports 1 or 4 channels (bytes_per_pixel)
1538  ************************************************************************/
HistogramRectOCL(void * imageData,int bytes_per_pixel,int bytes_per_line,int left,int top,int width,int height,int kHistogramSize,int * histogramAllChannels)1539 int OpenclDevice::HistogramRectOCL(void *imageData, int bytes_per_pixel, int bytes_per_line,
1540                                    int left, // always 0
1541                                    int top,  // always 0
1542                                    int width, int height, int kHistogramSize,
1543                                    int *histogramAllChannels) {
1544   cl_int clStatus;
1545   int retVal = 0;
1546   KernelEnv histKern;
1547   SetKernelEnv(&histKern);
1548   KernelEnv histRedKern;
1549   SetKernelEnv(&histRedKern);
1550   /* map imagedata to device as read only */
1551   // USE_HOST_PTR uses onion+ bus which is slowest option; also happens to be
1552   // coherent which we don't need.
1553   // faster option would be to allocate initial image buffer
1554   // using a garlic bus memory type
1555   cl_mem imageBuffer =
1556       clCreateBuffer(histKern.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1557                      width * height * bytes_per_pixel * sizeof(char), imageData, &clStatus);
1558   CHECK_OPENCL(clStatus, "clCreateBuffer imageBuffer");
1559 
1560   /* setup work group size parameters */
1561   int block_size = 256;
1562   cl_uint numCUs;
1563   clStatus = clGetDeviceInfo(gpuEnv.mpDevID, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(numCUs), &numCUs,
1564                              nullptr);
1565   CHECK_OPENCL(clStatus, "clCreateBuffer imageBuffer");
1566 
1567   int requestedOccupancy = 10;
1568   int numWorkGroups = numCUs * requestedOccupancy;
1569   int numThreads = block_size * numWorkGroups;
1570   size_t local_work_size[] = {static_cast<size_t>(block_size)};
1571   size_t global_work_size[] = {static_cast<size_t>(numThreads)};
1572   size_t red_global_work_size[] = {
1573       static_cast<size_t>(block_size * kHistogramSize * bytes_per_pixel)};
1574 
1575   /* map histogramAllChannels as write only */
1576 
1577   cl_mem histogramBuffer = clCreateBuffer(
1578       histKern.mpkContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
1579       kHistogramSize * bytes_per_pixel * sizeof(int), histogramAllChannels, &clStatus);
1580   CHECK_OPENCL(clStatus, "clCreateBuffer histogramBuffer");
1581 
1582   /* intermediate histogram buffer */
1583   int histRed = 256;
1584   int tmpHistogramBins = kHistogramSize * bytes_per_pixel * histRed;
1585 
1586   cl_mem tmpHistogramBuffer =
1587       clCreateBuffer(histKern.mpkContext, CL_MEM_READ_WRITE, tmpHistogramBins * sizeof(cl_uint),
1588                      nullptr, &clStatus);
1589   CHECK_OPENCL(clStatus, "clCreateBuffer tmpHistogramBuffer");
1590 
1591   /* atomic sync buffer */
1592   int *zeroBuffer = new int[1];
1593   zeroBuffer[0] = 0;
1594   cl_mem atomicSyncBuffer =
1595       clCreateBuffer(histKern.mpkContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(cl_int),
1596                      zeroBuffer, &clStatus);
1597   CHECK_OPENCL(clStatus, "clCreateBuffer atomicSyncBuffer");
1598   delete[] zeroBuffer;
1599   // Create kernel objects based on bytes_per_pixel
1600   if (bytes_per_pixel == 1) {
1601     histKern.mpkKernel =
1602         clCreateKernel(histKern.mpkProgram, "kernel_HistogramRectOneChannel", &clStatus);
1603     CHECK_OPENCL(clStatus, "clCreateKernel kernel_HistogramRectOneChannel");
1604 
1605     histRedKern.mpkKernel = clCreateKernel(histRedKern.mpkProgram,
1606                                            "kernel_HistogramRectOneChannelReduction", &clStatus);
1607     CHECK_OPENCL(clStatus, "clCreateKernel kernel_HistogramRectOneChannelReduction");
1608   } else {
1609     histKern.mpkKernel =
1610         clCreateKernel(histKern.mpkProgram, "kernel_HistogramRectAllChannels", &clStatus);
1611     CHECK_OPENCL(clStatus, "clCreateKernel kernel_HistogramRectAllChannels");
1612 
1613     histRedKern.mpkKernel = clCreateKernel(histRedKern.mpkProgram,
1614                                            "kernel_HistogramRectAllChannelsReduction", &clStatus);
1615     CHECK_OPENCL(clStatus, "clCreateKernel kernel_HistogramRectAllChannelsReduction");
1616   }
1617 
1618   void *ptr;
1619 
1620   // Initialize tmpHistogramBuffer buffer
1621   ptr = clEnqueueMapBuffer(histKern.mpkCmdQueue, tmpHistogramBuffer, CL_TRUE, CL_MAP_WRITE, 0,
1622                            tmpHistogramBins * sizeof(cl_uint), 0, nullptr, nullptr, &clStatus);
1623   CHECK_OPENCL(clStatus, "clEnqueueMapBuffer tmpHistogramBuffer");
1624 
1625   memset(ptr, 0, tmpHistogramBins * sizeof(cl_uint));
1626   clEnqueueUnmapMemObject(histKern.mpkCmdQueue, tmpHistogramBuffer, ptr, 0, nullptr, nullptr);
1627 
1628   /* set kernel 1 arguments */
1629   clStatus = clSetKernelArg(histKern.mpkKernel, 0, sizeof(cl_mem), &imageBuffer);
1630   CHECK_OPENCL(clStatus, "clSetKernelArg imageBuffer");
1631   cl_uint numPixels = width * height;
1632   clStatus = clSetKernelArg(histKern.mpkKernel, 1, sizeof(cl_uint), &numPixels);
1633   CHECK_OPENCL(clStatus, "clSetKernelArg numPixels");
1634   clStatus = clSetKernelArg(histKern.mpkKernel, 2, sizeof(cl_mem), &tmpHistogramBuffer);
1635   CHECK_OPENCL(clStatus, "clSetKernelArg tmpHistogramBuffer");
1636 
1637   /* set kernel 2 arguments */
1638   int n = numThreads / bytes_per_pixel;
1639   clStatus = clSetKernelArg(histRedKern.mpkKernel, 0, sizeof(cl_int), &n);
1640   CHECK_OPENCL(clStatus, "clSetKernelArg imageBuffer");
1641   clStatus = clSetKernelArg(histRedKern.mpkKernel, 1, sizeof(cl_mem), &tmpHistogramBuffer);
1642   CHECK_OPENCL(clStatus, "clSetKernelArg tmpHistogramBuffer");
1643   clStatus = clSetKernelArg(histRedKern.mpkKernel, 2, sizeof(cl_mem), &histogramBuffer);
1644   CHECK_OPENCL(clStatus, "clSetKernelArg histogramBuffer");
1645 
1646   /* launch histogram */
1647   clStatus = clEnqueueNDRangeKernel(histKern.mpkCmdQueue, histKern.mpkKernel, 1, nullptr,
1648                                     global_work_size, local_work_size, 0, nullptr, nullptr);
1649   CHECK_OPENCL(clStatus, "clEnqueueNDRangeKernel kernel_HistogramRectAllChannels");
1650   clFinish(histKern.mpkCmdQueue);
1651   if (clStatus != 0) {
1652     retVal = -1;
1653   }
1654   /* launch histogram */
1655   clStatus = clEnqueueNDRangeKernel(histRedKern.mpkCmdQueue, histRedKern.mpkKernel, 1, nullptr,
1656                                     red_global_work_size, local_work_size, 0, nullptr, nullptr);
1657   CHECK_OPENCL(clStatus, "clEnqueueNDRangeKernel kernel_HistogramRectAllChannelsReduction");
1658   clFinish(histRedKern.mpkCmdQueue);
1659   if (clStatus != 0) {
1660     retVal = -1;
1661   }
1662 
1663   /* map results back from gpu */
1664   ptr = clEnqueueMapBuffer(histRedKern.mpkCmdQueue, histogramBuffer, CL_TRUE, CL_MAP_READ, 0,
1665                            kHistogramSize * bytes_per_pixel * sizeof(int), 0, nullptr, nullptr,
1666                            &clStatus);
1667   CHECK_OPENCL(clStatus, "clEnqueueMapBuffer histogramBuffer");
1668   if (clStatus != 0) {
1669     retVal = -1;
1670   }
1671   clEnqueueUnmapMemObject(histRedKern.mpkCmdQueue, histogramBuffer, ptr, 0, nullptr, nullptr);
1672 
1673   clReleaseMemObject(histogramBuffer);
1674   clReleaseMemObject(imageBuffer);
1675   return retVal;
1676 }
1677 
1678 /*************************************************************************
1679  * Threshold the rectangle, taking everything except the image buffer pointer
1680  * from the class, using thresholds/hi_values to the output IMAGE.
1681  * only supports 1 or 4 channels
1682  ************************************************************************/
ThresholdRectToPixOCL(unsigned char * imageData,int bytes_per_pixel,int bytes_per_line,int * thresholds,int * hi_values,Image * pix,int height,int width,int top,int left)1683 int OpenclDevice::ThresholdRectToPixOCL(unsigned char *imageData, int bytes_per_pixel,
1684                                         int bytes_per_line, int *thresholds, int *hi_values,
1685                                         Image *pix, int height, int width, int top, int left) {
1686   int retVal = 0;
1687   /* create pix result buffer */
1688   *pix = pixCreate(width, height, 1);
1689   uint32_t *pixData = pixGetData(*pix);
1690   int wpl = pixGetWpl(*pix);
1691   int pixSize = wpl * height * sizeof(uint32_t); // number of pixels
1692 
1693   cl_int clStatus;
1694   KernelEnv rEnv;
1695   SetKernelEnv(&rEnv);
1696 
1697   /* setup work group size parameters */
1698   int block_size = 256;
1699   cl_uint numCUs = 6;
1700   clStatus = clGetDeviceInfo(gpuEnv.mpDevID, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(numCUs), &numCUs,
1701                              nullptr);
1702   CHECK_OPENCL(clStatus, "clCreateBuffer imageBuffer");
1703 
1704   int requestedOccupancy = 10;
1705   int numWorkGroups = numCUs * requestedOccupancy;
1706   int numThreads = block_size * numWorkGroups;
1707   size_t local_work_size[] = {(size_t)block_size};
1708   size_t global_work_size[] = {(size_t)numThreads};
1709 
1710   /* map imagedata to device as read only */
1711   // USE_HOST_PTR uses onion+ bus which is slowest option; also happens to be
1712   // coherent which we don't need.
1713   // faster option would be to allocate initial image buffer
1714   // using a garlic bus memory type
1715   cl_mem imageBuffer =
1716       clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1717                      width * height * bytes_per_pixel * sizeof(char), imageData, &clStatus);
1718   CHECK_OPENCL(clStatus, "clCreateBuffer imageBuffer");
1719 
1720   /* map pix as write only */
1721   pixThBuffer = clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, pixSize,
1722                                pixData, &clStatus);
1723   CHECK_OPENCL(clStatus, "clCreateBuffer pix");
1724 
1725   /* map thresholds and hi_values */
1726   cl_mem thresholdsBuffer = clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1727                                            bytes_per_pixel * sizeof(int), thresholds, &clStatus);
1728   CHECK_OPENCL(clStatus, "clCreateBuffer thresholdBuffer");
1729   cl_mem hiValuesBuffer = clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1730                                          bytes_per_pixel * sizeof(int), hi_values, &clStatus);
1731   CHECK_OPENCL(clStatus, "clCreateBuffer hiValuesBuffer");
1732 
1733   /* compile kernel */
1734   if (bytes_per_pixel == 4) {
1735     rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "kernel_ThresholdRectToPix", &clStatus);
1736     CHECK_OPENCL(clStatus, "clCreateKernel kernel_ThresholdRectToPix");
1737   } else {
1738     rEnv.mpkKernel =
1739         clCreateKernel(rEnv.mpkProgram, "kernel_ThresholdRectToPix_OneChan", &clStatus);
1740     CHECK_OPENCL(clStatus, "clCreateKernel kernel_ThresholdRectToPix_OneChan");
1741   }
1742 
1743   /* set kernel arguments */
1744   clStatus = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &imageBuffer);
1745   CHECK_OPENCL(clStatus, "clSetKernelArg imageBuffer");
1746   clStatus = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(int), &height);
1747   CHECK_OPENCL(clStatus, "clSetKernelArg height");
1748   clStatus = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(int), &width);
1749   CHECK_OPENCL(clStatus, "clSetKernelArg width");
1750   clStatus = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(int), &wpl);
1751   CHECK_OPENCL(clStatus, "clSetKernelArg wpl");
1752   clStatus = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(cl_mem), &thresholdsBuffer);
1753   CHECK_OPENCL(clStatus, "clSetKernelArg thresholdsBuffer");
1754   clStatus = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(cl_mem), &hiValuesBuffer);
1755   CHECK_OPENCL(clStatus, "clSetKernelArg hiValuesBuffer");
1756   clStatus = clSetKernelArg(rEnv.mpkKernel, 6, sizeof(cl_mem), &pixThBuffer);
1757   CHECK_OPENCL(clStatus, "clSetKernelArg pixThBuffer");
1758 
1759   /* launch kernel & wait */
1760   clStatus = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 1, nullptr, global_work_size,
1761                                     local_work_size, 0, nullptr, nullptr);
1762   CHECK_OPENCL(clStatus, "clEnqueueNDRangeKernel kernel_ThresholdRectToPix");
1763   clFinish(rEnv.mpkCmdQueue);
1764   if (clStatus != 0) {
1765     tprintf("Setting return value to -1\n");
1766     retVal = -1;
1767   }
1768   /* map results back from gpu */
1769   void *ptr = clEnqueueMapBuffer(rEnv.mpkCmdQueue, pixThBuffer, CL_TRUE, CL_MAP_READ, 0, pixSize, 0,
1770                                  nullptr, nullptr, &clStatus);
1771   CHECK_OPENCL(clStatus, "clEnqueueMapBuffer histogramBuffer");
1772   clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, pixThBuffer, ptr, 0, nullptr, nullptr);
1773 
1774   clReleaseMemObject(imageBuffer);
1775   clReleaseMemObject(thresholdsBuffer);
1776   clReleaseMemObject(hiValuesBuffer);
1777 
1778   return retVal;
1779 }
1780 
1781 /******************************************************************************
1782  * Data Types for Device Selection
1783  *****************************************************************************/
1784 
1785 struct TessScoreEvaluationInputData {
1786   int height;
1787   int width;
1788   int numChannels;
1789   unsigned char *imageData;
1790   Image pix;
1791 };
1792 
populateTessScoreEvaluationInputData(TessScoreEvaluationInputData * input)1793 static void populateTessScoreEvaluationInputData(TessScoreEvaluationInputData *input) {
1794   srand(1);
1795   // 8.5x11 inches @ 300dpi rounded to clean multiples
1796   int height = 3328; // %256
1797   int width = 2560;  // %512
1798   int numChannels = 4;
1799   input->height = height;
1800   input->width = width;
1801   input->numChannels = numChannels;
1802   unsigned char(*imageData4)[4] = (unsigned char(*)[4])malloc(
1803       height * width * numChannels * sizeof(unsigned char)); // new unsigned char[4][height*width];
1804   input->imageData = (unsigned char *)&imageData4[0];
1805 
1806   // zero out image
1807   unsigned char pixelWhite[4] = {0, 0, 0, 255};
1808   unsigned char pixelBlack[4] = {255, 255, 255, 255};
1809   for (int p = 0; p < height * width; p++) {
1810     // unsigned char tmp[4] = imageData4[0];
1811     imageData4[p][0] = pixelWhite[0];
1812     imageData4[p][1] = pixelWhite[1];
1813     imageData4[p][2] = pixelWhite[2];
1814     imageData4[p][3] = pixelWhite[3];
1815   }
1816   // random lines to be eliminated
1817   int maxLineWidth = 64; // pixels wide
1818   int numLines = 10;
1819   // vertical lines
1820   for (int i = 0; i < numLines; i++) {
1821     int lineWidth = rand() % maxLineWidth;
1822     int vertLinePos = lineWidth + rand() % (width - 2 * lineWidth);
1823     // tprintf("[PI] VerticalLine @ %i (w=%i)\n", vertLinePos, lineWidth);
1824     for (int row = vertLinePos - lineWidth / 2; row < vertLinePos + lineWidth / 2; row++) {
1825       for (int col = 0; col < height; col++) {
1826         // imageData4[row*width+col] = pixelBlack;
1827         imageData4[row * width + col][0] = pixelBlack[0];
1828         imageData4[row * width + col][1] = pixelBlack[1];
1829         imageData4[row * width + col][2] = pixelBlack[2];
1830         imageData4[row * width + col][3] = pixelBlack[3];
1831       }
1832     }
1833   }
1834   // horizontal lines
1835   for (int i = 0; i < numLines; i++) {
1836     int lineWidth = rand() % maxLineWidth;
1837     int horLinePos = lineWidth + rand() % (height - 2 * lineWidth);
1838     // tprintf("[PI] HorizontalLine @ %i (w=%i)\n", horLinePos, lineWidth);
1839     for (int row = 0; row < width; row++) {
1840       for (int col = horLinePos - lineWidth / 2; col < horLinePos + lineWidth / 2;
1841            col++) { // for (int row = vertLinePos-lineWidth/2; row <
1842                     // vertLinePos+lineWidth/2; row++) {
1843         // tprintf("[PI] HoizLine pix @ (%3i, %3i)\n", row, col);
1844         // imageData4[row*width+col] = pixelBlack;
1845         imageData4[row * width + col][0] = pixelBlack[0];
1846         imageData4[row * width + col][1] = pixelBlack[1];
1847         imageData4[row * width + col][2] = pixelBlack[2];
1848         imageData4[row * width + col][3] = pixelBlack[3];
1849       }
1850     }
1851   }
1852   // spots (noise, squares)
1853   float fractionBlack = 0.1; // how much of the image should be blackened
1854   int numSpots = (height * width) * fractionBlack / (maxLineWidth * maxLineWidth / 2 / 2);
1855   for (int i = 0; i < numSpots; i++) {
1856     int lineWidth = rand() % maxLineWidth;
1857     int col = lineWidth + rand() % (width - 2 * lineWidth);
1858     int row = lineWidth + rand() % (height - 2 * lineWidth);
1859     // tprintf("[PI] Spot[%i/%i] @ (%3i, %3i)\n", i, numSpots, row, col );
1860     for (int r = row - lineWidth / 2; r < row + lineWidth / 2; r++) {
1861       for (int c = col - lineWidth / 2; c < col + lineWidth / 2; c++) {
1862         // tprintf("[PI] \tSpot[%i/%i] @ (%3i, %3i)\n", i, numSpots, r, c );
1863         // imageData4[row*width+col] = pixelBlack;
1864         imageData4[r * width + c][0] = pixelBlack[0];
1865         imageData4[r * width + c][1] = pixelBlack[1];
1866         imageData4[r * width + c][2] = pixelBlack[2];
1867         imageData4[r * width + c][3] = pixelBlack[3];
1868       }
1869     }
1870   }
1871 
1872   input->pix = pixCreate(input->width, input->height, 8 * input->numChannels);
1873 }
1874 
1875 struct TessDeviceScore {
1876   float time;   // small time means faster device
1877   bool clError; // were there any opencl errors
1878   bool valid;   // was the correct response generated
1879 };
1880 
1881 /******************************************************************************
1882  * Micro Benchmarks for Device Selection
1883  *****************************************************************************/
1884 
composeRGBPixelMicroBench(GPUEnv * env,TessScoreEvaluationInputData input,ds_device_type type)1885 static double composeRGBPixelMicroBench(GPUEnv *env, TessScoreEvaluationInputData input,
1886                                         ds_device_type type) {
1887   double time = 0;
1888 #  if ON_WINDOWS
1889   LARGE_INTEGER freq, time_funct_start, time_funct_end;
1890   QueryPerformanceFrequency(&freq);
1891 #  elif ON_APPLE
1892   mach_timebase_info_data_t info = {0, 0};
1893   mach_timebase_info(&info);
1894   long long start, stop;
1895 #  else
1896   timespec time_funct_start, time_funct_end;
1897 #  endif
1898   // input data
1899   l_uint32 *tiffdata = (l_uint32 *)input.imageData; // same size and random data; data doesn't
1900                                                     // change workload
1901 
1902   // function call
1903   if (type == DS_DEVICE_OPENCL_DEVICE) {
1904 #  if ON_WINDOWS
1905     QueryPerformanceCounter(&time_funct_start);
1906 #  elif ON_APPLE
1907     start = mach_absolute_time();
1908 #  else
1909     clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
1910 #  endif
1911 
1912     OpenclDevice::gpuEnv = *env;
1913     int wpl = pixGetWpl(input.pix);
1914     OpenclDevice::pixReadFromTiffKernel(tiffdata, input.width, input.height, wpl, nullptr);
1915 #  if ON_WINDOWS
1916     QueryPerformanceCounter(&time_funct_end);
1917     time = (time_funct_end.QuadPart - time_funct_start.QuadPart) / (double)(freq.QuadPart);
1918 #  elif ON_APPLE
1919     stop = mach_absolute_time();
1920     time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
1921 #  else
1922     clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
1923     time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
1924            (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
1925 #  endif
1926 
1927   } else {
1928 #  if ON_WINDOWS
1929     QueryPerformanceCounter(&time_funct_start);
1930 #  elif ON_APPLE
1931     start = mach_absolute_time();
1932 #  else
1933     clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
1934 #  endif
1935     Image pix = pixCreate(input.width, input.height, 32);
1936     l_uint32 *pixData = pixGetData(pix);
1937     int i, j;
1938     int idx = 0;
1939     for (i = 0; i < input.height; i++) {
1940       for (j = 0; j < input.width; j++) {
1941         l_uint32 tiffword = tiffdata[i * input.width + j];
1942         l_int32 rval = ((tiffword)&0xff);
1943         l_int32 gval = (((tiffword) >> 8) & 0xff);
1944         l_int32 bval = (((tiffword) >> 16) & 0xff);
1945         l_uint32 value = (rval << 24) | (gval << 16) | (bval << 8);
1946         pixData[idx] = value;
1947         idx++;
1948       }
1949     }
1950 #  if ON_WINDOWS
1951     QueryPerformanceCounter(&time_funct_end);
1952     time = (time_funct_end.QuadPart - time_funct_start.QuadPart) / (double)(freq.QuadPart);
1953 #  elif ON_APPLE
1954     stop = mach_absolute_time();
1955     time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
1956 #  else
1957     clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
1958     time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
1959            (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
1960 #  endif
1961     pix.destroy();
1962   }
1963 
1964   return time;
1965 }
1966 
histogramRectMicroBench(GPUEnv * env,TessScoreEvaluationInputData input,ds_device_type type)1967 static double histogramRectMicroBench(GPUEnv *env, TessScoreEvaluationInputData input,
1968                                       ds_device_type type) {
1969   double time;
1970 #  if ON_WINDOWS
1971   LARGE_INTEGER freq, time_funct_start, time_funct_end;
1972   QueryPerformanceFrequency(&freq);
1973 #  elif ON_APPLE
1974   mach_timebase_info_data_t info = {0, 0};
1975   mach_timebase_info(&info);
1976   long long start, stop;
1977 #  else
1978   timespec time_funct_start, time_funct_end;
1979 #  endif
1980 
1981   const int left = 0;
1982   const int top = 0;
1983   int kHistogramSize = 256;
1984   int bytes_per_line = input.width * input.numChannels;
1985   int *histogramAllChannels = new int[kHistogramSize * input.numChannels];
1986   // function call
1987   if (type == DS_DEVICE_OPENCL_DEVICE) {
1988 #  if ON_WINDOWS
1989     QueryPerformanceCounter(&time_funct_start);
1990 #  elif ON_APPLE
1991     start = mach_absolute_time();
1992 #  else
1993     clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
1994 #  endif
1995 
1996     OpenclDevice::gpuEnv = *env;
1997     int retVal = OpenclDevice::HistogramRectOCL(input.imageData, input.numChannels, bytes_per_line,
1998                                                 left, top, input.width, input.height,
1999                                                 kHistogramSize, histogramAllChannels);
2000 
2001 #  if ON_WINDOWS
2002     QueryPerformanceCounter(&time_funct_end);
2003     time = (time_funct_end.QuadPart - time_funct_start.QuadPart) / (double)(freq.QuadPart);
2004 #  elif ON_APPLE
2005     stop = mach_absolute_time();
2006     if (retVal == 0) {
2007       time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2008     } else {
2009       time = FLT_MAX;
2010     }
2011 #  else
2012     clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2013     time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2014            (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2015 #  endif
2016   } else {
2017     int *histogram = new int[kHistogramSize];
2018 #  if ON_WINDOWS
2019     QueryPerformanceCounter(&time_funct_start);
2020 #  elif ON_APPLE
2021     start = mach_absolute_time();
2022 #  else
2023     clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2024 #  endif
2025     for (int ch = 0; ch < input.numChannels; ++ch) {
2026       tesseract::HistogramRect(input.pix, input.numChannels, left, top, input.width, input.height,
2027                                histogram);
2028     }
2029 #  if ON_WINDOWS
2030     QueryPerformanceCounter(&time_funct_end);
2031     time = (time_funct_end.QuadPart - time_funct_start.QuadPart) / (double)(freq.QuadPart);
2032 #  elif ON_APPLE
2033     stop = mach_absolute_time();
2034     time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2035 #  else
2036     clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2037     time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2038            (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2039 #  endif
2040     delete[] histogram;
2041   }
2042 
2043   // cleanup
2044   delete[] histogramAllChannels;
2045   return time;
2046 }
2047 
2048 // Reproducing the ThresholdRectToPix native version
ThresholdRectToPix_Native(const unsigned char * imagedata,int bytes_per_pixel,int bytes_per_line,const int * thresholds,const int * hi_values,Image * pix)2049 static void ThresholdRectToPix_Native(const unsigned char *imagedata, int bytes_per_pixel,
2050                                       int bytes_per_line, const int *thresholds,
2051                                       const int *hi_values, Image *pix) {
2052   int top = 0;
2053   int left = 0;
2054   int width = pixGetWidth(*pix);
2055   int height = pixGetHeight(*pix);
2056 
2057   *pix = pixCreate(width, height, 1);
2058   uint32_t *pixdata = pixGetData(*pix);
2059   int wpl = pixGetWpl(*pix);
2060   const unsigned char *srcdata = imagedata + top * bytes_per_line + left * bytes_per_pixel;
2061   for (int y = 0; y < height; ++y) {
2062     const uint8_t *linedata = srcdata;
2063     uint32_t *pixline = pixdata + y * wpl;
2064     for (int x = 0; x < width; ++x, linedata += bytes_per_pixel) {
2065       bool white_result = true;
2066       for (int ch = 0; ch < bytes_per_pixel; ++ch) {
2067         if (hi_values[ch] >= 0 && (linedata[ch] > thresholds[ch]) == (hi_values[ch] == 0)) {
2068           white_result = false;
2069           break;
2070         }
2071       }
2072       if (white_result)
2073         CLEAR_DATA_BIT(pixline, x);
2074       else
2075         SET_DATA_BIT(pixline, x);
2076     }
2077     srcdata += bytes_per_line;
2078   }
2079 }
2080 
thresholdRectToPixMicroBench(GPUEnv * env,TessScoreEvaluationInputData input,ds_device_type type)2081 static double thresholdRectToPixMicroBench(GPUEnv *env, TessScoreEvaluationInputData input,
2082                                            ds_device_type type) {
2083   double time;
2084 #  if ON_WINDOWS
2085   LARGE_INTEGER freq, time_funct_start, time_funct_end;
2086   QueryPerformanceFrequency(&freq);
2087 #  elif ON_APPLE
2088   mach_timebase_info_data_t info = {0, 0};
2089   mach_timebase_info(&info);
2090   long long start, stop;
2091 #  else
2092   timespec time_funct_start, time_funct_end;
2093 #  endif
2094 
2095   // input data
2096   unsigned char pixelHi = (unsigned char)255;
2097   int thresholds[4] = {pixelHi, pixelHi, pixelHi, pixelHi};
2098 
2099   // Pix* pix = pixCreate(width, height, 1);
2100   int top = 0;
2101   int left = 0;
2102   int bytes_per_line = input.width * input.numChannels;
2103 
2104   // function call
2105   if (type == DS_DEVICE_OPENCL_DEVICE) {
2106 #  if ON_WINDOWS
2107     QueryPerformanceCounter(&time_funct_start);
2108 #  elif ON_APPLE
2109     start = mach_absolute_time();
2110 #  else
2111     clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2112 #  endif
2113 
2114     OpenclDevice::gpuEnv = *env;
2115     int hi_values[4];
2116     int retVal = OpenclDevice::ThresholdRectToPixOCL(
2117         input.imageData, input.numChannels, bytes_per_line, thresholds, hi_values, &input.pix,
2118         input.height, input.width, top, left);
2119 
2120 #  if ON_WINDOWS
2121     QueryPerformanceCounter(&time_funct_end);
2122     time = (time_funct_end.QuadPart - time_funct_start.QuadPart) / (double)(freq.QuadPart);
2123 #  elif ON_APPLE
2124     stop = mach_absolute_time();
2125     if (retVal == 0) {
2126       time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2127     } else {
2128       time = FLT_MAX;
2129     }
2130 
2131 #  else
2132     clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2133     time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2134            (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2135 #  endif
2136   } else {
2137     tesseract::ImageThresholder thresholder;
2138     thresholder.SetImage(input.pix);
2139 #  if ON_WINDOWS
2140     QueryPerformanceCounter(&time_funct_start);
2141 #  elif ON_APPLE
2142     start = mach_absolute_time();
2143 #  else
2144     clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2145 #  endif
2146     int hi_values[4] = {};
2147     ThresholdRectToPix_Native(input.imageData, input.numChannels, bytes_per_line, thresholds,
2148                               hi_values, &input.pix);
2149 
2150 #  if ON_WINDOWS
2151     QueryPerformanceCounter(&time_funct_end);
2152     time = (time_funct_end.QuadPart - time_funct_start.QuadPart) / (double)(freq.QuadPart);
2153 #  elif ON_APPLE
2154     stop = mach_absolute_time();
2155     time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2156 #  else
2157     clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2158     time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2159            (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2160 #  endif
2161   }
2162 
2163   return time;
2164 }
2165 
getLineMasksMorphMicroBench(GPUEnv * env,TessScoreEvaluationInputData input,ds_device_type type)2166 static double getLineMasksMorphMicroBench(GPUEnv *env, TessScoreEvaluationInputData input,
2167                                           ds_device_type type) {
2168   double time = 0;
2169 #  if ON_WINDOWS
2170   LARGE_INTEGER freq, time_funct_start, time_funct_end;
2171   QueryPerformanceFrequency(&freq);
2172 #  elif ON_APPLE
2173   mach_timebase_info_data_t info = {0, 0};
2174   mach_timebase_info(&info);
2175   long long start, stop;
2176 #  else
2177   timespec time_funct_start, time_funct_end;
2178 #  endif
2179 
2180   // input data
2181   int resolution = 300;
2182   int wpl = pixGetWpl(input.pix);
2183   int kThinLineFraction = 20;     // tess constant
2184   int kMinLineLengthFraction = 4; // tess constant
2185   int max_line_width = resolution / kThinLineFraction;
2186   int min_line_length = resolution / kMinLineLengthFraction;
2187   int closing_brick = max_line_width / 3;
2188 
2189   // function call
2190   if (type == DS_DEVICE_OPENCL_DEVICE) {
2191 #  if ON_WINDOWS
2192     QueryPerformanceCounter(&time_funct_start);
2193 #  elif ON_APPLE
2194     start = mach_absolute_time();
2195 #  else
2196     clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2197 #  endif
2198     OpenclDevice::gpuEnv = *env;
2199     OpenclDevice::initMorphCLAllocations(wpl, input.height, input.pix);
2200     Image pix_vline = nullptr, pix_hline = nullptr, pix_closed = nullptr;
2201     OpenclDevice::pixGetLinesCL(nullptr, input.pix, &pix_vline, &pix_hline, &pix_closed, true,
2202                                 closing_brick, closing_brick, max_line_width, max_line_width,
2203                                 min_line_length, min_line_length);
2204 
2205     OpenclDevice::releaseMorphCLBuffers();
2206 
2207 #  if ON_WINDOWS
2208     QueryPerformanceCounter(&time_funct_end);
2209     time = (time_funct_end.QuadPart - time_funct_start.QuadPart) / (double)(freq.QuadPart);
2210 #  elif ON_APPLE
2211     stop = mach_absolute_time();
2212     time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2213 #  else
2214     clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2215     time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2216            (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2217 #  endif
2218   } else {
2219 #  if ON_WINDOWS
2220     QueryPerformanceCounter(&time_funct_start);
2221 #  elif ON_APPLE
2222     start = mach_absolute_time();
2223 #  else
2224     clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2225 #  endif
2226 
2227     // native serial code
2228     Image src_pix = input.pix;
2229     Image pix_closed = pixCloseBrick(nullptr, src_pix, closing_brick, closing_brick);
2230     Image pix_solid = pixOpenBrick(nullptr, pix_closed, max_line_width, max_line_width);
2231     Image pix_hollow = pixSubtract(nullptr, pix_closed, pix_solid);
2232     pix_solid.destroy();
2233     Image pix_vline = pixOpenBrick(nullptr, pix_hollow, 1, min_line_length);
2234     Image pix_hline = pixOpenBrick(nullptr, pix_hollow, min_line_length, 1);
2235     pix_hline.destroy();
2236     pix_vline.destroy();
2237     pix_hollow.destroy();
2238 
2239 #  if ON_WINDOWS
2240     QueryPerformanceCounter(&time_funct_end);
2241     time = (time_funct_end.QuadPart - time_funct_start.QuadPart) / (double)(freq.QuadPart);
2242 #  elif ON_APPLE
2243     stop = mach_absolute_time();
2244     time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2245 #  else
2246     clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2247     time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2248            (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2249 #  endif
2250   }
2251 
2252   return time;
2253 }
2254 
2255 /******************************************************************************
2256  * Device Selection
2257  *****************************************************************************/
2258 
2259 // encode score object as byte string
serializeScore(ds_device * device,uint8_t ** serializedScore,unsigned int * serializedScoreSize)2260 static ds_status serializeScore(ds_device *device, uint8_t **serializedScore,
2261                                 unsigned int *serializedScoreSize) {
2262   *serializedScoreSize = sizeof(TessDeviceScore);
2263   *serializedScore = new uint8_t[*serializedScoreSize];
2264   memcpy(*serializedScore, device->score, *serializedScoreSize);
2265   return DS_SUCCESS;
2266 }
2267 
2268 // parses byte string and stores in score object
deserializeScore(ds_device * device,const uint8_t * serializedScore,unsigned int serializedScoreSize)2269 static ds_status deserializeScore(ds_device *device, const uint8_t *serializedScore,
2270                                   unsigned int serializedScoreSize) {
2271   // check that serializedScoreSize == sizeof(TessDeviceScore);
2272   device->score = new TessDeviceScore;
2273   memcpy(device->score, serializedScore, serializedScoreSize);
2274   return DS_SUCCESS;
2275 }
2276 
releaseScore(TessDeviceScore * score)2277 static ds_status releaseScore(TessDeviceScore *score) {
2278   delete score;
2279   return DS_SUCCESS;
2280 }
2281 
2282 // evaluate devices
evaluateScoreForDevice(ds_device * device,void * inputData)2283 static ds_status evaluateScoreForDevice(ds_device *device, void *inputData) {
2284   // overwrite statuc gpuEnv w/ current device
2285   // so native opencl calls can be used; they use static gpuEnv
2286   tprintf("\n[DS] Device: \"%s\" (%s) evaluation...\n", device->oclDeviceName,
2287           device->type == DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native");
2288   GPUEnv *env = nullptr;
2289   if (device->type == DS_DEVICE_OPENCL_DEVICE) {
2290     env = &OpenclDevice::gpuEnv;
2291     memset(env, 0, sizeof(*env));
2292     // tprintf("[DS] populating tmp GPUEnv from device\n");
2293     populateGPUEnvFromDevice(env, device->oclDeviceID);
2294     env->mnFileCount = 0; // argc;
2295     env->mnKernelCount = 0UL;
2296     // tprintf("[DS] compiling kernels for tmp GPUEnv\n");
2297     OpenclDevice::CompileKernelFile(env, "");
2298   }
2299 
2300   TessScoreEvaluationInputData *input = static_cast<TessScoreEvaluationInputData *>(inputData);
2301 
2302   // pixReadTiff
2303   double composeRGBPixelTime = composeRGBPixelMicroBench(env, *input, device->type);
2304 
2305   // HistogramRect
2306   double histogramRectTime = histogramRectMicroBench(env, *input, device->type);
2307 
2308   // ThresholdRectToPix
2309   double thresholdRectToPixTime = thresholdRectToPixMicroBench(env, *input, device->type);
2310 
2311   // getLineMasks
2312   double getLineMasksMorphTime = getLineMasksMorphMicroBench(env, *input, device->type);
2313 
2314   // weigh times (% of cpu time)
2315   // these weights should be the % execution time that the native cpu code took
2316   float composeRGBPixelWeight = 1.2f;
2317   float histogramRectWeight = 2.4f;
2318   float thresholdRectToPixWeight = 4.5f;
2319   float getLineMasksMorphWeight = 5.0f;
2320 
2321   float weightedTime = composeRGBPixelWeight * composeRGBPixelTime +
2322                        histogramRectWeight * histogramRectTime +
2323                        thresholdRectToPixWeight * thresholdRectToPixTime +
2324                        getLineMasksMorphWeight * getLineMasksMorphTime;
2325   device->score = new TessDeviceScore;
2326   device->score->time = weightedTime;
2327 
2328   tprintf("[DS] Device: \"%s\" (%s) evaluated\n", device->oclDeviceName,
2329           device->type == DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native");
2330   tprintf("[DS]%25s: %f (w=%.1f)\n", "composeRGBPixel", composeRGBPixelTime, composeRGBPixelWeight);
2331   tprintf("[DS]%25s: %f (w=%.1f)\n", "HistogramRect", histogramRectTime, histogramRectWeight);
2332   tprintf("[DS]%25s: %f (w=%.1f)\n", "ThresholdRectToPix", thresholdRectToPixTime,
2333           thresholdRectToPixWeight);
2334   tprintf("[DS]%25s: %f (w=%.1f)\n", "getLineMasksMorph", getLineMasksMorphTime,
2335           getLineMasksMorphWeight);
2336   tprintf("[DS]%25s: %f\n", "Score", device->score->time);
2337   return DS_SUCCESS;
2338 }
2339 
2340 // initial call to select device
getDeviceSelection()2341 ds_device OpenclDevice::getDeviceSelection() {
2342   if (!deviceIsSelected) {
2343     // check if opencl is available at runtime
2344     if (1 == LoadOpencl()) {
2345       // opencl is available
2346       // setup devices
2347       ds_status status;
2348       ds_profile *profile;
2349       status = initDSProfile(&profile, "v0.1");
2350       // try reading scores from file
2351       const char *fileName = "tesseract_opencl_profile_devices.dat";
2352       status = readProfileFromFile(profile, deserializeScore, fileName);
2353       if (status != DS_SUCCESS) {
2354         // need to run evaluation
2355         tprintf("[DS] Profile file not available (%s); performing profiling.\n", fileName);
2356 
2357         // create input data
2358         TessScoreEvaluationInputData input;
2359         populateTessScoreEvaluationInputData(&input);
2360         // perform evaluations
2361         unsigned int numUpdates;
2362         status =
2363             profileDevices(profile, DS_EVALUATE_ALL, evaluateScoreForDevice, &input, &numUpdates);
2364         // write scores to file
2365         if (status == DS_SUCCESS) {
2366           status = writeProfileToFile(profile, serializeScore, fileName);
2367           if (status == DS_SUCCESS) {
2368             tprintf("[DS] Scores written to file (%s).\n", fileName);
2369           } else {
2370             tprintf(
2371                 "[DS] Error saving scores to file (%s); scores not written to "
2372                 "file.\n",
2373                 fileName);
2374           }
2375         } else {
2376           tprintf(
2377               "[DS] Unable to evaluate performance; scores not written to "
2378               "file.\n");
2379         }
2380       } else {
2381         tprintf("[DS] Profile read from file (%s).\n", fileName);
2382       }
2383 
2384       // we now have device scores either from file or evaluation
2385       // select fastest using custom Tesseract selection algorithm
2386       float bestTime = FLT_MAX; // begin search with worst possible time
2387       int bestDeviceIdx = -1;
2388       for (unsigned d = 0; d < profile->numDevices; d++) {
2389         ds_device device = profile->devices[d];
2390         if (device.score == nullptr)
2391           continue;
2392         TessDeviceScore score = *device.score;
2393 
2394         float time = score.time;
2395         tprintf("[DS] Device[%u] %i:%s score is %f\n", d + 1, device.type, device.oclDeviceName,
2396                 time);
2397         if (time < bestTime) {
2398           bestTime = time;
2399           bestDeviceIdx = d;
2400         }
2401       }
2402       if (bestDeviceIdx >= 0) {
2403         tprintf(
2404             "[DS] Selected Device[%i]: \"%s\" (%s)\n", bestDeviceIdx + 1,
2405             profile->devices[bestDeviceIdx].oclDeviceName,
2406             profile->devices[bestDeviceIdx].type == DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native");
2407       }
2408       // cleanup
2409       // TODO: call destructor for profile object?
2410 
2411       bool overridden = false;
2412       char *overrideDeviceStr = getenv("TESSERACT_OPENCL_DEVICE");
2413       if (overrideDeviceStr != nullptr) {
2414         int overrideDeviceIdx = atoi(overrideDeviceStr);
2415         if (overrideDeviceIdx > 0 && overrideDeviceIdx <= profile->numDevices) {
2416           tprintf(
2417               "[DS] Overriding Device Selection (TESSERACT_OPENCL_DEVICE=%s, "
2418               "%i)\n",
2419               overrideDeviceStr, overrideDeviceIdx);
2420           bestDeviceIdx = overrideDeviceIdx - 1;
2421           overridden = true;
2422         } else {
2423           tprintf(
2424               "[DS] Ignoring invalid TESSERACT_OPENCL_DEVICE=%s ([1,%i] are "
2425               "valid devices).\n",
2426               overrideDeviceStr, profile->numDevices);
2427         }
2428       }
2429 
2430       if (overridden) {
2431         tprintf(
2432             "[DS] Overridden Device[%i]: \"%s\" (%s)\n", bestDeviceIdx + 1,
2433             profile->devices[bestDeviceIdx].oclDeviceName,
2434             profile->devices[bestDeviceIdx].type == DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native");
2435       }
2436       selectedDevice = profile->devices[bestDeviceIdx];
2437       // cleanup
2438       releaseDSProfile(profile, releaseScore);
2439     } else {
2440       // opencl isn't available at runtime, select native cpu device
2441       tprintf("[DS] OpenCL runtime not available.\n");
2442       selectedDevice.type = DS_DEVICE_NATIVE_CPU;
2443       selectedDevice.oclDeviceName = "(null)";
2444       selectedDevice.score = nullptr;
2445       selectedDevice.oclDeviceID = nullptr;
2446       selectedDevice.oclDriverVersion = nullptr;
2447     }
2448     deviceIsSelected = true;
2449   }
2450   return selectedDevice;
2451 }
2452 
selectedDeviceIsOpenCL()2453 bool OpenclDevice::selectedDeviceIsOpenCL() {
2454   ds_device device = getDeviceSelection();
2455   return (device.type == DS_DEVICE_OPENCL_DEVICE);
2456 }
2457 
2458 } // namespace
2459 
2460 #endif
2461