1 // Licensed under the Apache License, Version 2.0 (the "License");
2 // you may not use this file except in compliance with the License.
3 // You may obtain a copy of the License at
4 // http://www.apache.org/licenses/LICENSE-2.0
5 // Unless required by applicable law or agreed to in writing, software
6 // distributed under the License is distributed on an "AS IS" BASIS,
7 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8 // See the License for the specific language governing permissions and
9 // limitations under the License.
10
11 #ifdef USE_OPENCL
12
13 # ifdef _WIN32
14 # include <io.h>
15 # else
16 # include <sys/types.h>
17 # include <unistd.h>
18 # endif
19 # include <cfloat>
20 # include <ctime> // for clock_gettime
21
22 # include "oclkernels.h"
23 # include "openclwrapper.h"
24
25 // for micro-benchmark
26 # include "otsuthr.h"
27 # include "thresholder.h"
28
29 // platform preprocessor commands
30 # if defined(WIN32) || defined(__WIN32__) || defined(_WIN32) || defined(__CYGWIN__) || \
31 defined(__MINGW32__)
32 # define ON_WINDOWS 1
33 # define ON_APPLE 0
34 # elif defined(__linux__)
35 # define ON_WINDOWS 0
36 # define ON_APPLE 0
37 # elif defined(__APPLE__)
38 # define ON_WINDOWS 0
39 # define ON_APPLE 1
40 # else
41 # define ON_WINDOWS 0
42 # define ON_APPLE 0
43 # endif
44
45 # if ON_APPLE
46 # include <mach/mach_time.h>
47 # endif
48
49 # include <cstdio>
50 # include <cstdlib>
51 # include <cstring> // for memset, strcpy, ...
52 # include <vector>
53
54 # include "errcode.h" // for ASSERT_HOST
55 # include "image.h" // for Image
56
57 namespace tesseract {
58
59 GPUEnv OpenclDevice::gpuEnv;
60
61 bool OpenclDevice::deviceIsSelected = false;
62 ds_device OpenclDevice::selectedDevice;
63
64 int OpenclDevice::isInited = 0;
65
66 static l_int32 MORPH_BC = ASYMMETRIC_MORPH_BC;
67
68 static const l_uint32 lmask32[] = {
69 0x80000000, 0xc0000000, 0xe0000000, 0xf0000000, 0xf8000000, 0xfc000000, 0xfe000000, 0xff000000,
70 0xff800000, 0xffc00000, 0xffe00000, 0xfff00000, 0xfff80000, 0xfffc0000, 0xfffe0000, 0xffff0000,
71 0xffff8000, 0xffffc000, 0xffffe000, 0xfffff000, 0xfffff800, 0xfffffc00, 0xfffffe00, 0xffffff00,
72 0xffffff80, 0xffffffc0, 0xffffffe0, 0xfffffff0, 0xfffffff8, 0xfffffffc, 0xfffffffe, 0xffffffff};
73
74 static const l_uint32 rmask32[] = {
75 0x00000001, 0x00000003, 0x00000007, 0x0000000f, 0x0000001f, 0x0000003f, 0x0000007f, 0x000000ff,
76 0x000001ff, 0x000003ff, 0x000007ff, 0x00000fff, 0x00001fff, 0x00003fff, 0x00007fff, 0x0000ffff,
77 0x0001ffff, 0x0003ffff, 0x0007ffff, 0x000fffff, 0x001fffff, 0x003fffff, 0x007fffff, 0x00ffffff,
78 0x01ffffff, 0x03ffffff, 0x07ffffff, 0x0fffffff, 0x1fffffff, 0x3fffffff, 0x7fffffff, 0xffffffff};
79
80 static cl_mem pixsCLBuffer, pixdCLBuffer,
81 pixdCLIntermediate; // Morph operations buffers
82 static cl_mem pixThBuffer; // output from thresholdtopix calculation
83 static cl_int clStatus;
84 static KernelEnv rEnv;
85
86 # define DS_TAG_VERSION "<version>"
87 # define DS_TAG_VERSION_END "</version>"
88 # define DS_TAG_DEVICE "<device>"
89 # define DS_TAG_DEVICE_END "</device>"
90 # define DS_TAG_SCORE "<score>"
91 # define DS_TAG_SCORE_END "</score>"
92 # define DS_TAG_DEVICE_TYPE "<type>"
93 # define DS_TAG_DEVICE_TYPE_END "</type>"
94 # define DS_TAG_DEVICE_NAME "<name>"
95 # define DS_TAG_DEVICE_NAME_END "</name>"
96 # define DS_TAG_DEVICE_DRIVER_VERSION "<driver>"
97 # define DS_TAG_DEVICE_DRIVER_VERSION_END "</driver>"
98
99 # define DS_DEVICE_NATIVE_CPU_STRING "native_cpu"
100
101 # define DS_DEVICE_NAME_LENGTH 256
102
103 enum ds_evaluation_type { DS_EVALUATE_ALL, DS_EVALUATE_NEW_ONLY };
104
105 struct ds_profile {
106 std::vector<ds_device> devices;
107 unsigned int numDevices;
108 const char *version;
109 };
110
111 enum ds_status {
112 DS_SUCCESS = 0,
113 DS_INVALID_PROFILE = 1000,
114 DS_MEMORY_ERROR,
115 DS_INVALID_PERF_EVALUATOR_TYPE,
116 DS_INVALID_PERF_EVALUATOR,
117 DS_PERF_EVALUATOR_ERROR,
118 DS_FILE_ERROR,
119 DS_UNKNOWN_DEVICE_TYPE,
120 DS_PROFILE_FILE_ERROR,
121 DS_SCORE_SERIALIZER_ERROR,
122 DS_SCORE_DESERIALIZER_ERROR
123 };
124
125 // Pointer to a function that calculates the score of a device (ex:
126 // device->score) update the data size of score. The encoding and the format
127 // of the score data is implementation defined. The function should return
128 // DS_SUCCESS if there's no error to be reported.
129 typedef ds_status (*ds_perf_evaluator)(ds_device *device, void *data);
130
131 // deallocate memory used by score
132 typedef ds_status (*ds_score_release)(TessDeviceScore *score);
133
releaseDSProfile(ds_profile * profile,ds_score_release sr)134 static ds_status releaseDSProfile(ds_profile *profile, ds_score_release sr) {
135 ds_status status = DS_SUCCESS;
136 if (profile != nullptr) {
137 if (sr != nullptr) {
138 unsigned int i;
139 for (i = 0; i < profile->numDevices; i++) {
140 free(profile->devices[i].oclDeviceName);
141 free(profile->devices[i].oclDriverVersion);
142 status = sr(profile->devices[i].score);
143 if (status != DS_SUCCESS)
144 break;
145 }
146 }
147 delete profile;
148 }
149 return status;
150 }
151
initDSProfile(ds_profile ** p,const char * version)152 static ds_status initDSProfile(ds_profile **p, const char *version) {
153 int numDevices;
154 cl_uint numPlatforms;
155 std::vector<cl_platform_id> platforms;
156 std::vector<cl_device_id> devices;
157 ds_status status = DS_SUCCESS;
158 unsigned int next;
159 unsigned int i;
160
161 if (p == nullptr)
162 return DS_INVALID_PROFILE;
163
164 ds_profile *profile = new ds_profile;
165
166 memset(profile, 0, sizeof(ds_profile));
167
168 clGetPlatformIDs(0, nullptr, &numPlatforms);
169
170 if (numPlatforms > 0) {
171 platforms.reserve(numPlatforms);
172 clGetPlatformIDs(numPlatforms, &platforms[0], nullptr);
173 }
174
175 numDevices = 0;
176 for (i = 0; i < numPlatforms; i++) {
177 cl_uint num;
178 clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, nullptr, &num);
179 numDevices += num;
180 }
181
182 if (numDevices > 0) {
183 devices.reserve(numDevices);
184 }
185
186 profile->numDevices = numDevices + 1; // +1 to numDevices to include the native CPU
187 profile->devices.reserve(profile->numDevices);
188 memset(&profile->devices[0], 0, profile->numDevices * sizeof(ds_device));
189
190 next = 0;
191 for (i = 0; i < numPlatforms; i++) {
192 cl_uint num;
193 unsigned j;
194 clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, numDevices, &devices[0], &num);
195 for (j = 0; j < num; j++, next++) {
196 char buffer[DS_DEVICE_NAME_LENGTH];
197 size_t length;
198
199 profile->devices[next].type = DS_DEVICE_OPENCL_DEVICE;
200 profile->devices[next].oclDeviceID = devices[j];
201
202 clGetDeviceInfo(profile->devices[next].oclDeviceID, CL_DEVICE_NAME, DS_DEVICE_NAME_LENGTH,
203 &buffer, nullptr);
204 length = strlen(buffer);
205 profile->devices[next].oclDeviceName = (char *)malloc(length + 1);
206 memcpy(profile->devices[next].oclDeviceName, buffer, length + 1);
207
208 clGetDeviceInfo(profile->devices[next].oclDeviceID, CL_DRIVER_VERSION, DS_DEVICE_NAME_LENGTH,
209 &buffer, nullptr);
210 length = strlen(buffer);
211 profile->devices[next].oclDriverVersion = (char *)malloc(length + 1);
212 memcpy(profile->devices[next].oclDriverVersion, buffer, length + 1);
213 }
214 }
215 profile->devices[next].type = DS_DEVICE_NATIVE_CPU;
216 profile->version = version;
217
218 *p = profile;
219 return status;
220 }
221
profileDevices(ds_profile * profile,const ds_evaluation_type type,ds_perf_evaluator evaluator,void * evaluatorData,unsigned int * numUpdates)222 static ds_status profileDevices(ds_profile *profile, const ds_evaluation_type type,
223 ds_perf_evaluator evaluator, void *evaluatorData,
224 unsigned int *numUpdates) {
225 ds_status status = DS_SUCCESS;
226 unsigned int i;
227 unsigned int updates = 0;
228
229 if (profile == nullptr) {
230 return DS_INVALID_PROFILE;
231 }
232 if (evaluator == nullptr) {
233 return DS_INVALID_PERF_EVALUATOR;
234 }
235
236 for (i = 0; i < profile->numDevices; i++) {
237 ds_status evaluatorStatus;
238
239 switch (type) {
240 case DS_EVALUATE_NEW_ONLY:
241 if (profile->devices[i].score != nullptr)
242 break;
243 // else fall through
244 case DS_EVALUATE_ALL:
245 evaluatorStatus = evaluator(&profile->devices[i], evaluatorData);
246 if (evaluatorStatus != DS_SUCCESS) {
247 status = evaluatorStatus;
248 return status;
249 }
250 updates++;
251 break;
252 default:
253 return DS_INVALID_PERF_EVALUATOR_TYPE;
254 break;
255 };
256 }
257 if (numUpdates)
258 *numUpdates = updates;
259 return status;
260 }
261
findString(const char * contentStart,const char * contentEnd,const char * string)262 static const char *findString(const char *contentStart, const char *contentEnd,
263 const char *string) {
264 size_t stringLength;
265 const char *currentPosition;
266 const char *found = nullptr;
267 stringLength = strlen(string);
268 currentPosition = contentStart;
269 for (currentPosition = contentStart; currentPosition < contentEnd; currentPosition++) {
270 if (*currentPosition == string[0]) {
271 if (currentPosition + stringLength < contentEnd) {
272 if (strncmp(currentPosition, string, stringLength) == 0) {
273 found = currentPosition;
274 break;
275 }
276 }
277 }
278 }
279 return found;
280 }
281
readProFile(const char * fileName,char ** content,size_t * contentSize)282 static ds_status readProFile(const char *fileName, char **content, size_t *contentSize) {
283 *contentSize = 0;
284 *content = nullptr;
285 ds_status status = DS_SUCCESS;
286 FILE *input = fopen(fileName, "rb");
287 if (input == nullptr) {
288 status = DS_FILE_ERROR;
289 } else {
290 fseek(input, 0L, SEEK_END);
291 auto pos = std::ftell(input);
292 rewind(input);
293 if (pos > 0) {
294 size_t size = pos;
295 char *binary = new char[size];
296 if (fread(binary, sizeof(char), size, input) != size) {
297 status = DS_FILE_ERROR;
298 delete[] binary;
299 } else {
300 *contentSize = size;
301 *content = binary;
302 }
303 }
304 fclose(input);
305 }
306 return status;
307 }
308
309 typedef ds_status (*ds_score_deserializer)(ds_device *device, const uint8_t *serializedScore,
310 unsigned int serializedScoreSize);
311
readProfileFromFile(ds_profile * profile,ds_score_deserializer deserializer,const char * file)312 static ds_status readProfileFromFile(ds_profile *profile, ds_score_deserializer deserializer,
313 const char *file) {
314 ds_status status = DS_SUCCESS;
315 char *contentStart;
316 size_t contentSize;
317
318 if (profile == nullptr)
319 return DS_INVALID_PROFILE;
320
321 status = readProFile(file, &contentStart, &contentSize);
322 if (status == DS_SUCCESS) {
323 const char *currentPosition;
324 const char *dataStart;
325 const char *dataEnd;
326
327 const char *contentEnd = contentStart + contentSize;
328 currentPosition = contentStart;
329
330 // parse the version string
331 dataStart = findString(currentPosition, contentEnd, DS_TAG_VERSION);
332 if (dataStart == nullptr) {
333 status = DS_PROFILE_FILE_ERROR;
334 goto cleanup;
335 }
336 dataStart += strlen(DS_TAG_VERSION);
337
338 dataEnd = findString(dataStart, contentEnd, DS_TAG_VERSION_END);
339 if (dataEnd == nullptr) {
340 status = DS_PROFILE_FILE_ERROR;
341 goto cleanup;
342 }
343
344 size_t versionStringLength = strlen(profile->version);
345 if (versionStringLength + dataStart != dataEnd ||
346 strncmp(profile->version, dataStart, versionStringLength) != 0) {
347 // version mismatch
348 status = DS_PROFILE_FILE_ERROR;
349 goto cleanup;
350 }
351 currentPosition = dataEnd + strlen(DS_TAG_VERSION_END);
352
353 // parse the device information
354 while (1) {
355 unsigned int i;
356
357 const char *deviceTypeStart;
358 const char *deviceTypeEnd;
359 ds_device_type deviceType;
360
361 const char *deviceNameStart;
362 const char *deviceNameEnd;
363
364 const char *deviceScoreStart;
365 const char *deviceScoreEnd;
366
367 const char *deviceDriverStart;
368 const char *deviceDriverEnd;
369
370 dataStart = findString(currentPosition, contentEnd, DS_TAG_DEVICE);
371 if (dataStart == nullptr) {
372 // nothing useful remain, quit...
373 break;
374 }
375 dataStart += strlen(DS_TAG_DEVICE);
376 dataEnd = findString(dataStart, contentEnd, DS_TAG_DEVICE_END);
377 if (dataEnd == nullptr) {
378 status = DS_PROFILE_FILE_ERROR;
379 goto cleanup;
380 }
381
382 // parse the device type
383 deviceTypeStart = findString(dataStart, contentEnd, DS_TAG_DEVICE_TYPE);
384 if (deviceTypeStart == nullptr) {
385 status = DS_PROFILE_FILE_ERROR;
386 goto cleanup;
387 }
388 deviceTypeStart += strlen(DS_TAG_DEVICE_TYPE);
389 deviceTypeEnd = findString(deviceTypeStart, contentEnd, DS_TAG_DEVICE_TYPE_END);
390 if (deviceTypeEnd == nullptr) {
391 status = DS_PROFILE_FILE_ERROR;
392 goto cleanup;
393 }
394 memcpy(&deviceType, deviceTypeStart, sizeof(ds_device_type));
395
396 // parse the device name
397 if (deviceType == DS_DEVICE_OPENCL_DEVICE) {
398 deviceNameStart = findString(dataStart, contentEnd, DS_TAG_DEVICE_NAME);
399 if (deviceNameStart == nullptr) {
400 status = DS_PROFILE_FILE_ERROR;
401 goto cleanup;
402 }
403 deviceNameStart += strlen(DS_TAG_DEVICE_NAME);
404 deviceNameEnd = findString(deviceNameStart, contentEnd, DS_TAG_DEVICE_NAME_END);
405 if (deviceNameEnd == nullptr) {
406 status = DS_PROFILE_FILE_ERROR;
407 goto cleanup;
408 }
409
410 deviceDriverStart = findString(dataStart, contentEnd, DS_TAG_DEVICE_DRIVER_VERSION);
411 if (deviceDriverStart == nullptr) {
412 status = DS_PROFILE_FILE_ERROR;
413 goto cleanup;
414 }
415 deviceDriverStart += strlen(DS_TAG_DEVICE_DRIVER_VERSION);
416 deviceDriverEnd =
417 findString(deviceDriverStart, contentEnd, DS_TAG_DEVICE_DRIVER_VERSION_END);
418 if (deviceDriverEnd == nullptr) {
419 status = DS_PROFILE_FILE_ERROR;
420 goto cleanup;
421 }
422
423 // check if this device is on the system
424 for (i = 0; i < profile->numDevices; i++) {
425 if (profile->devices[i].type == DS_DEVICE_OPENCL_DEVICE) {
426 size_t actualDeviceNameLength;
427 size_t driverVersionLength;
428
429 actualDeviceNameLength = strlen(profile->devices[i].oclDeviceName);
430 driverVersionLength = strlen(profile->devices[i].oclDriverVersion);
431 if (deviceNameStart + actualDeviceNameLength == deviceNameEnd &&
432 deviceDriverStart + driverVersionLength == deviceDriverEnd &&
433 strncmp(profile->devices[i].oclDeviceName, deviceNameStart,
434 actualDeviceNameLength) == 0 &&
435 strncmp(profile->devices[i].oclDriverVersion, deviceDriverStart,
436 driverVersionLength) == 0) {
437 deviceScoreStart = findString(dataStart, contentEnd, DS_TAG_SCORE);
438 deviceScoreStart += strlen(DS_TAG_SCORE);
439 deviceScoreEnd = findString(deviceScoreStart, contentEnd, DS_TAG_SCORE_END);
440 status = deserializer(&profile->devices[i], (const unsigned char *)deviceScoreStart,
441 deviceScoreEnd - deviceScoreStart);
442 if (status != DS_SUCCESS) {
443 goto cleanup;
444 }
445 }
446 }
447 }
448 } else if (deviceType == DS_DEVICE_NATIVE_CPU) {
449 for (i = 0; i < profile->numDevices; i++) {
450 if (profile->devices[i].type == DS_DEVICE_NATIVE_CPU) {
451 deviceScoreStart = findString(dataStart, contentEnd, DS_TAG_SCORE);
452 if (deviceScoreStart == nullptr) {
453 status = DS_PROFILE_FILE_ERROR;
454 goto cleanup;
455 }
456 deviceScoreStart += strlen(DS_TAG_SCORE);
457 deviceScoreEnd = findString(deviceScoreStart, contentEnd, DS_TAG_SCORE_END);
458 status = deserializer(&profile->devices[i], (const unsigned char *)deviceScoreStart,
459 deviceScoreEnd - deviceScoreStart);
460 if (status != DS_SUCCESS) {
461 goto cleanup;
462 }
463 }
464 }
465 }
466
467 // skip over the current one to find the next device
468 currentPosition = dataEnd + strlen(DS_TAG_DEVICE_END);
469 }
470 }
471 cleanup:
472 delete[] contentStart;
473 return status;
474 }
475
476 typedef ds_status (*ds_score_serializer)(ds_device *device, uint8_t **serializedScore,
477 unsigned int *serializedScoreSize);
writeProfileToFile(ds_profile * profile,ds_score_serializer serializer,const char * file)478 static ds_status writeProfileToFile(ds_profile *profile, ds_score_serializer serializer,
479 const char *file) {
480 ds_status status = DS_SUCCESS;
481
482 if (profile == nullptr)
483 return DS_INVALID_PROFILE;
484
485 FILE *profileFile = fopen(file, "wb");
486 if (profileFile == nullptr) {
487 status = DS_FILE_ERROR;
488 } else {
489 unsigned int i;
490
491 // write version string
492 fwrite(DS_TAG_VERSION, sizeof(char), strlen(DS_TAG_VERSION), profileFile);
493 fwrite(profile->version, sizeof(char), strlen(profile->version), profileFile);
494 fwrite(DS_TAG_VERSION_END, sizeof(char), strlen(DS_TAG_VERSION_END), profileFile);
495 fwrite("\n", sizeof(char), 1, profileFile);
496
497 for (i = 0; i < profile->numDevices && status == DS_SUCCESS; i++) {
498 uint8_t *serializedScore;
499 unsigned int serializedScoreSize;
500
501 fwrite(DS_TAG_DEVICE, sizeof(char), strlen(DS_TAG_DEVICE), profileFile);
502
503 fwrite(DS_TAG_DEVICE_TYPE, sizeof(char), strlen(DS_TAG_DEVICE_TYPE), profileFile);
504 fwrite(&profile->devices[i].type, sizeof(ds_device_type), 1, profileFile);
505 fwrite(DS_TAG_DEVICE_TYPE_END, sizeof(char), strlen(DS_TAG_DEVICE_TYPE_END), profileFile);
506
507 switch (profile->devices[i].type) {
508 case DS_DEVICE_NATIVE_CPU: {
509 // There's no need to emit a device name for the native CPU device.
510 /*
511 fwrite(DS_TAG_DEVICE_NAME, sizeof(char), strlen(DS_TAG_DEVICE_NAME),
512 profileFile);
513 fwrite(DS_DEVICE_NATIVE_CPU_STRING,sizeof(char),
514 strlen(DS_DEVICE_NATIVE_CPU_STRING), profileFile);
515 fwrite(DS_TAG_DEVICE_NAME_END, sizeof(char),
516 strlen(DS_TAG_DEVICE_NAME_END), profileFile);
517 */
518 } break;
519 case DS_DEVICE_OPENCL_DEVICE: {
520 fwrite(DS_TAG_DEVICE_NAME, sizeof(char), strlen(DS_TAG_DEVICE_NAME), profileFile);
521 fwrite(profile->devices[i].oclDeviceName, sizeof(char),
522 strlen(profile->devices[i].oclDeviceName), profileFile);
523 fwrite(DS_TAG_DEVICE_NAME_END, sizeof(char), strlen(DS_TAG_DEVICE_NAME_END), profileFile);
524
525 fwrite(DS_TAG_DEVICE_DRIVER_VERSION, sizeof(char), strlen(DS_TAG_DEVICE_DRIVER_VERSION),
526 profileFile);
527 fwrite(profile->devices[i].oclDriverVersion, sizeof(char),
528 strlen(profile->devices[i].oclDriverVersion), profileFile);
529 fwrite(DS_TAG_DEVICE_DRIVER_VERSION_END, sizeof(char),
530 strlen(DS_TAG_DEVICE_DRIVER_VERSION_END), profileFile);
531 } break;
532 default:
533 status = DS_UNKNOWN_DEVICE_TYPE;
534 continue;
535 };
536
537 fwrite(DS_TAG_SCORE, sizeof(char), strlen(DS_TAG_SCORE), profileFile);
538 status = serializer(&profile->devices[i], &serializedScore, &serializedScoreSize);
539 if (status == DS_SUCCESS && serializedScore != nullptr && serializedScoreSize > 0) {
540 fwrite(serializedScore, sizeof(char), serializedScoreSize, profileFile);
541 delete[] serializedScore;
542 }
543 fwrite(DS_TAG_SCORE_END, sizeof(char), strlen(DS_TAG_SCORE_END), profileFile);
544 fwrite(DS_TAG_DEVICE_END, sizeof(char), strlen(DS_TAG_DEVICE_END), profileFile);
545 fwrite("\n", sizeof(char), 1, profileFile);
546 }
547 fclose(profileFile);
548 }
549 return status;
550 }
551
552 // substitute invalid characters in device name with _
legalizeFileName(char * fileName)553 static void legalizeFileName(char *fileName) {
554 // tprintf("fileName: %s\n", fileName);
555 const char *invalidChars = "/\?:*\"><| "; // space is valid but can cause headaches
556 // for each invalid char
557 for (unsigned i = 0; i < strlen(invalidChars); i++) {
558 char invalidStr[4];
559 invalidStr[0] = invalidChars[i];
560 invalidStr[1] = '\0';
561 // tprintf("eliminating %s\n", invalidStr);
562 // char *pos = strstr(fileName, invalidStr);
563 // initial ./ is valid for present directory
564 // if (*pos == '.') pos++;
565 // if (*pos == '/') pos++;
566 for (char *pos = strstr(fileName, invalidStr); pos != nullptr;
567 pos = strstr(pos + 1, invalidStr)) {
568 // tprintf("\tfound: %s, ", pos);
569 pos[0] = '_';
570 // tprintf("fileName: %s\n", fileName);
571 }
572 }
573 }
574
populateGPUEnvFromDevice(GPUEnv * gpuInfo,cl_device_id device)575 static void populateGPUEnvFromDevice(GPUEnv *gpuInfo, cl_device_id device) {
576 // tprintf("[DS] populateGPUEnvFromDevice\n");
577 size_t size;
578 gpuInfo->mnIsUserCreated = 1;
579 // device
580 gpuInfo->mpDevID = device;
581 gpuInfo->mpArryDevsID = new cl_device_id[1];
582 gpuInfo->mpArryDevsID[0] = gpuInfo->mpDevID;
583 clStatus = clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_TYPE, sizeof(cl_device_type),
584 &gpuInfo->mDevType, &size);
585 CHECK_OPENCL(clStatus, "populateGPUEnv::getDeviceInfo(TYPE)");
586 // platform
587 clStatus = clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_PLATFORM, sizeof(cl_platform_id),
588 &gpuInfo->mpPlatformID, &size);
589 CHECK_OPENCL(clStatus, "populateGPUEnv::getDeviceInfo(PLATFORM)");
590 // context
591 cl_context_properties props[3];
592 props[0] = CL_CONTEXT_PLATFORM;
593 props[1] = (cl_context_properties)gpuInfo->mpPlatformID;
594 props[2] = 0;
595 gpuInfo->mpContext = clCreateContext(props, 1, &gpuInfo->mpDevID, nullptr, nullptr, &clStatus);
596 CHECK_OPENCL(clStatus, "populateGPUEnv::createContext");
597 // queue
598 cl_command_queue_properties queueProperties = 0;
599 gpuInfo->mpCmdQueue =
600 clCreateCommandQueue(gpuInfo->mpContext, gpuInfo->mpDevID, queueProperties, &clStatus);
601 CHECK_OPENCL(clStatus, "populateGPUEnv::createCommandQueue");
602 }
603
LoadOpencl()604 int OpenclDevice::LoadOpencl() {
605 # ifdef WIN32
606 HINSTANCE HOpenclDll = nullptr;
607 void *OpenclDll = nullptr;
608 // fprintf(stderr, " LoadOpenclDllxx... \n");
609 OpenclDll = static_cast<HINSTANCE>(HOpenclDll);
610 OpenclDll = LoadLibrary("openCL.dll");
611 if (!static_cast<HINSTANCE>(OpenclDll)) {
612 fprintf(stderr, "[OD] Load opencl.dll failed!\n");
613 FreeLibrary(static_cast<HINSTANCE>(OpenclDll));
614 return 0;
615 }
616 fprintf(stderr, "[OD] Load opencl.dll successful!\n");
617 # endif
618 return 1;
619 }
SetKernelEnv(KernelEnv * envInfo)620 int OpenclDevice::SetKernelEnv(KernelEnv *envInfo) {
621 envInfo->mpkContext = gpuEnv.mpContext;
622 envInfo->mpkCmdQueue = gpuEnv.mpCmdQueue;
623 envInfo->mpkProgram = gpuEnv.mpArryPrograms[0];
624
625 return 1;
626 }
627
allocateZeroCopyBuffer(const KernelEnv & rEnv,l_uint32 * hostbuffer,size_t nElements,cl_mem_flags flags,cl_int * pStatus)628 static cl_mem allocateZeroCopyBuffer(const KernelEnv &rEnv, l_uint32 *hostbuffer, size_t nElements,
629 cl_mem_flags flags, cl_int *pStatus) {
630 cl_mem membuffer = clCreateBuffer(rEnv.mpkContext, (cl_mem_flags)(flags),
631 nElements * sizeof(l_uint32), hostbuffer, pStatus);
632
633 return membuffer;
634 }
635
mapOutputCLBuffer(const KernelEnv & rEnv,cl_mem clbuffer,Image pixd,Image pixs,int elements,cl_mem_flags flags,bool memcopy=false,bool sync=true)636 static Image mapOutputCLBuffer(const KernelEnv &rEnv, cl_mem clbuffer, Image pixd, Image pixs,
637 int elements, cl_mem_flags flags, bool memcopy = false,
638 bool sync = true) {
639 if (!pixd) {
640 if (memcopy) {
641 if ((pixd = pixCreateTemplate(pixs)) == nullptr)
642 tprintf("pixd not made\n");
643 } else {
644 if ((pixd = pixCreateHeader(pixGetWidth(pixs), pixGetHeight(pixs), pixGetDepth(pixs))) ==
645 nullptr)
646 tprintf("pixd not made\n");
647 }
648 }
649 l_uint32 *pValues =
650 (l_uint32 *)clEnqueueMapBuffer(rEnv.mpkCmdQueue, clbuffer, CL_TRUE, flags, 0,
651 elements * sizeof(l_uint32), 0, nullptr, nullptr, nullptr);
652
653 if (memcopy) {
654 memcpy(pixGetData(pixd), pValues, elements * sizeof(l_uint32));
655 } else {
656 pixSetData(pixd, pValues);
657 }
658
659 clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, clbuffer, pValues, 0, nullptr, nullptr);
660
661 if (sync) {
662 clFinish(rEnv.mpkCmdQueue);
663 }
664
665 return pixd;
666 }
667
releaseMorphCLBuffers()668 void OpenclDevice::releaseMorphCLBuffers() {
669 if (pixdCLIntermediate != nullptr)
670 clReleaseMemObject(pixdCLIntermediate);
671 if (pixsCLBuffer != nullptr)
672 clReleaseMemObject(pixsCLBuffer);
673 if (pixdCLBuffer != nullptr)
674 clReleaseMemObject(pixdCLBuffer);
675 if (pixThBuffer != nullptr)
676 clReleaseMemObject(pixThBuffer);
677 pixdCLIntermediate = pixsCLBuffer = pixdCLBuffer = pixThBuffer = nullptr;
678 }
679
initMorphCLAllocations(l_int32 wpl,l_int32 h,Image pixs)680 int OpenclDevice::initMorphCLAllocations(l_int32 wpl, l_int32 h, Image pixs) {
681 SetKernelEnv(&rEnv);
682
683 if (pixThBuffer != nullptr) {
684 pixsCLBuffer = allocateZeroCopyBuffer(rEnv, nullptr, wpl * h, CL_MEM_ALLOC_HOST_PTR, &clStatus);
685
686 // Get the output from ThresholdToPix operation
687 clStatus = clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixThBuffer, pixsCLBuffer, 0, 0,
688 sizeof(l_uint32) * wpl * h, 0, nullptr, nullptr);
689 } else {
690 // Get data from the source image
691 l_uint32 *srcdata = reinterpret_cast<l_uint32 *>(malloc(wpl * h * sizeof(l_uint32)));
692 memcpy(srcdata, pixGetData(pixs), wpl * h * sizeof(l_uint32));
693
694 pixsCLBuffer = allocateZeroCopyBuffer(rEnv, srcdata, wpl * h, CL_MEM_USE_HOST_PTR, &clStatus);
695 }
696
697 pixdCLBuffer = allocateZeroCopyBuffer(rEnv, nullptr, wpl * h, CL_MEM_ALLOC_HOST_PTR, &clStatus);
698
699 pixdCLIntermediate =
700 allocateZeroCopyBuffer(rEnv, nullptr, wpl * h, CL_MEM_ALLOC_HOST_PTR, &clStatus);
701
702 return (int)clStatus;
703 }
704
InitEnv()705 int OpenclDevice::InitEnv() {
706 // tprintf("[OD] OpenclDevice::InitEnv()\n");
707 # ifdef SAL_WIN32
708 while (1) {
709 if (1 == LoadOpencl())
710 break;
711 }
712 # endif
713 // sets up environment, compiles programs
714
715 InitOpenclRunEnv_DeviceSelection(0);
716 return 1;
717 }
718
ReleaseOpenclRunEnv()719 int OpenclDevice::ReleaseOpenclRunEnv() {
720 ReleaseOpenclEnv(&gpuEnv);
721 # ifdef SAL_WIN32
722 FreeOpenclDll();
723 # endif
724 return 1;
725 }
726
AddKernelConfig(int kCount,const char * kName)727 inline int OpenclDevice::AddKernelConfig(int kCount, const char *kName) {
728 ASSERT_HOST(kCount > 0);
729 ASSERT_HOST(strlen(kName) < sizeof(gpuEnv.mArrykernelNames[kCount - 1]));
730 strcpy(gpuEnv.mArrykernelNames[kCount - 1], kName);
731 gpuEnv.mnKernelCount++;
732 return 0;
733 }
734
RegistOpenclKernel()735 int OpenclDevice::RegistOpenclKernel() {
736 if (!gpuEnv.mnIsUserCreated)
737 memset(&gpuEnv, 0, sizeof(gpuEnv));
738
739 gpuEnv.mnFileCount = 0; // argc;
740 gpuEnv.mnKernelCount = 0UL;
741
742 AddKernelConfig(1, "oclAverageSub1");
743 return 0;
744 }
745
InitOpenclRunEnv_DeviceSelection(int argc)746 int OpenclDevice::InitOpenclRunEnv_DeviceSelection(int argc) {
747 if (!isInited) {
748 // after programs compiled, selects best device
749 ds_device bestDevice_DS = getDeviceSelection();
750 cl_device_id bestDevice = bestDevice_DS.oclDeviceID;
751 // overwrite global static GPUEnv with new device
752 if (selectedDeviceIsOpenCL()) {
753 // tprintf("[DS] InitOpenclRunEnv_DS::Calling populateGPUEnvFromDevice()
754 // for selected device\n");
755 populateGPUEnvFromDevice(&gpuEnv, bestDevice);
756 gpuEnv.mnFileCount = 0; // argc;
757 gpuEnv.mnKernelCount = 0UL;
758 CompileKernelFile(&gpuEnv, "");
759 } else {
760 // tprintf("[DS] InitOpenclRunEnv_DS::Skipping populateGPUEnvFromDevice()
761 // b/c native cpu selected\n");
762 }
763 isInited = 1;
764 }
765 return 0;
766 }
767
OpenclDevice()768 OpenclDevice::OpenclDevice() {
769 // InitEnv();
770 }
771
~OpenclDevice()772 OpenclDevice::~OpenclDevice() {
773 // ReleaseOpenclRunEnv();
774 }
775
ReleaseOpenclEnv(GPUEnv * gpuInfo)776 int OpenclDevice::ReleaseOpenclEnv(GPUEnv *gpuInfo) {
777 int i = 0;
778 int clStatus = 0;
779
780 if (!isInited) {
781 return 1;
782 }
783
784 for (i = 0; i < gpuEnv.mnFileCount; i++) {
785 if (gpuEnv.mpArryPrograms[i]) {
786 clStatus = clReleaseProgram(gpuEnv.mpArryPrograms[i]);
787 CHECK_OPENCL(clStatus, "clReleaseProgram");
788 gpuEnv.mpArryPrograms[i] = nullptr;
789 }
790 }
791 if (gpuEnv.mpCmdQueue) {
792 clReleaseCommandQueue(gpuEnv.mpCmdQueue);
793 gpuEnv.mpCmdQueue = nullptr;
794 }
795 if (gpuEnv.mpContext) {
796 clReleaseContext(gpuEnv.mpContext);
797 gpuEnv.mpContext = nullptr;
798 }
799 isInited = 0;
800 gpuInfo->mnIsUserCreated = 0;
801 delete[] gpuInfo->mpArryDevsID;
802 return 1;
803 }
BinaryGenerated(const char * clFileName,FILE ** fhandle)804 int OpenclDevice::BinaryGenerated(const char *clFileName, FILE **fhandle) {
805 unsigned int i = 0;
806 cl_int clStatus;
807 int status = 0;
808 FILE *fd = nullptr;
809 char fileName[256] = {0}, cl_name[128] = {0};
810 char deviceName[1024];
811 clStatus = clGetDeviceInfo(gpuEnv.mpArryDevsID[i], CL_DEVICE_NAME, sizeof(deviceName), deviceName,
812 nullptr);
813 CHECK_OPENCL(clStatus, "clGetDeviceInfo");
814 const char *str = strstr(clFileName, ".cl");
815 memcpy(cl_name, clFileName, str - clFileName);
816 cl_name[str - clFileName] = '\0';
817 sprintf(fileName, "%s-%s.bin", cl_name, deviceName);
818 legalizeFileName(fileName);
819 fd = fopen(fileName, "rb");
820 status = (fd != nullptr) ? 1 : 0;
821 if (fd != nullptr) {
822 *fhandle = fd;
823 }
824 return status;
825 }
CachedOfKernerPrg(const GPUEnv * gpuEnvCached,const char * clFileName)826 int OpenclDevice::CachedOfKernerPrg(const GPUEnv *gpuEnvCached, const char *clFileName) {
827 int i;
828 for (i = 0; i < gpuEnvCached->mnFileCount; i++) {
829 if (strcasecmp(gpuEnvCached->mArryKnelSrcFile[i], clFileName) == 0) {
830 if (gpuEnvCached->mpArryPrograms[i] != nullptr) {
831 return 1;
832 }
833 }
834 }
835
836 return 0;
837 }
WriteBinaryToFile(const char * fileName,const char * birary,size_t numBytes)838 int OpenclDevice::WriteBinaryToFile(const char *fileName, const char *birary, size_t numBytes) {
839 FILE *output = nullptr;
840 output = fopen(fileName, "wb");
841 if (output == nullptr) {
842 return 0;
843 }
844
845 fwrite(birary, sizeof(char), numBytes, output);
846 fclose(output);
847
848 return 1;
849 }
850
GeneratBinFromKernelSource(cl_program program,const char * clFileName)851 int OpenclDevice::GeneratBinFromKernelSource(cl_program program, const char *clFileName) {
852 unsigned int i = 0;
853 cl_int clStatus;
854 cl_uint numDevices;
855
856 clStatus =
857 clGetProgramInfo(program, CL_PROGRAM_NUM_DEVICES, sizeof(numDevices), &numDevices, nullptr);
858 CHECK_OPENCL(clStatus, "clGetProgramInfo");
859
860 std::vector<cl_device_id> mpArryDevsID(numDevices);
861
862 /* grab the handles to all of the devices in the program. */
863 clStatus = clGetProgramInfo(program, CL_PROGRAM_DEVICES, sizeof(cl_device_id) * numDevices,
864 &mpArryDevsID[0], nullptr);
865 CHECK_OPENCL(clStatus, "clGetProgramInfo");
866
867 /* figure out the sizes of each of the binaries. */
868 std::vector<size_t> binarySizes(numDevices);
869
870 clStatus = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t) * numDevices,
871 &binarySizes[0], nullptr);
872 CHECK_OPENCL(clStatus, "clGetProgramInfo");
873
874 /* copy over all of the generated binaries. */
875 std::vector<char *> binaries(numDevices);
876
877 for (i = 0; i < numDevices; i++) {
878 if (binarySizes[i] != 0) {
879 binaries[i] = new char[binarySizes[i]];
880 } else {
881 binaries[i] = nullptr;
882 }
883 }
884
885 clStatus = clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(char *) * numDevices,
886 &binaries[0], nullptr);
887 CHECK_OPENCL(clStatus, "clGetProgramInfo");
888
889 /* dump out each binary into its own separate file. */
890 for (i = 0; i < numDevices; i++) {
891 char fileName[256] = {0}, cl_name[128] = {0};
892
893 if (binarySizes[i] != 0) {
894 char deviceName[1024];
895 clStatus =
896 clGetDeviceInfo(mpArryDevsID[i], CL_DEVICE_NAME, sizeof(deviceName), deviceName, nullptr);
897 CHECK_OPENCL(clStatus, "clGetDeviceInfo");
898
899 const char *str = strstr(clFileName, ".cl");
900 memcpy(cl_name, clFileName, str - clFileName);
901 cl_name[str - clFileName] = '\0';
902 sprintf(fileName, "%s-%s.bin", cl_name, deviceName);
903 legalizeFileName(fileName);
904 if (!WriteBinaryToFile(fileName, binaries[i], binarySizes[i])) {
905 tprintf("[OD] write binary[%s] failed\n", fileName);
906 return 0;
907 } // else
908 tprintf("[OD] write binary[%s] successfully\n", fileName);
909 }
910 }
911
912 // Release all resources and memory
913 for (i = 0; i < numDevices; i++) {
914 delete[] binaries[i];
915 }
916
917 return 1;
918 }
919
CompileKernelFile(GPUEnv * gpuInfo,const char * buildOption)920 int OpenclDevice::CompileKernelFile(GPUEnv *gpuInfo, const char *buildOption) {
921 cl_int clStatus = 0;
922 const char *source;
923 size_t source_size[1];
924 int binary_status, binaryExisted, idx;
925 cl_uint numDevices;
926 FILE *fd, *fd1;
927 const char *filename = "kernel.cl";
928 // fprintf(stderr, "[OD] CompileKernelFile ... \n");
929 if (CachedOfKernerPrg(gpuInfo, filename) == 1) {
930 return 1;
931 }
932
933 idx = gpuInfo->mnFileCount;
934
935 source = kernel_src;
936
937 source_size[0] = strlen(source);
938 binaryExisted = 0;
939 binaryExisted = BinaryGenerated(filename, &fd); // don't check for binary during microbenchmark
940 if (binaryExisted == 1) {
941 clStatus = clGetContextInfo(gpuInfo->mpContext, CL_CONTEXT_NUM_DEVICES, sizeof(numDevices),
942 &numDevices, nullptr);
943 CHECK_OPENCL(clStatus, "clGetContextInfo");
944
945 std::vector<cl_device_id> mpArryDevsID(numDevices);
946 bool b_error = fseek(fd, 0, SEEK_END) < 0;
947 auto pos = std::ftell(fd);
948 b_error |= (pos <= 0);
949 size_t length = pos;
950 b_error |= fseek(fd, 0, SEEK_SET) < 0;
951 if (b_error) {
952 fclose(fd);
953 return 0;
954 }
955
956 std::vector<uint8_t> binary(length + 2);
957
958 memset(&binary[0], 0, length + 2);
959 b_error |= fread(&binary[0], 1, length, fd) != length;
960
961 fclose(fd);
962 fd = nullptr;
963 // grab the handles to all of the devices in the context.
964 clStatus = clGetContextInfo(gpuInfo->mpContext, CL_CONTEXT_DEVICES,
965 sizeof(cl_device_id) * numDevices, &mpArryDevsID[0], nullptr);
966 CHECK_OPENCL(clStatus, "clGetContextInfo");
967 // fprintf(stderr, "[OD] Create kernel from binary\n");
968 const uint8_t *c_binary = &binary[0];
969 gpuInfo->mpArryPrograms[idx] =
970 clCreateProgramWithBinary(gpuInfo->mpContext, numDevices, &mpArryDevsID[0], &length,
971 &c_binary, &binary_status, &clStatus);
972 CHECK_OPENCL(clStatus, "clCreateProgramWithBinary");
973 } else {
974 // create a CL program using the kernel source
975 // fprintf(stderr, "[OD] Create kernel from source\n");
976 gpuInfo->mpArryPrograms[idx] =
977 clCreateProgramWithSource(gpuInfo->mpContext, 1, &source, source_size, &clStatus);
978 CHECK_OPENCL(clStatus, "clCreateProgramWithSource");
979 }
980
981 if (gpuInfo->mpArryPrograms[idx] == (cl_program) nullptr) {
982 return 0;
983 }
984
985 // char options[512];
986 // create a cl program executable for all the devices specified
987 // tprintf("[OD] BuildProgram.\n");
988 if (!gpuInfo->mnIsUserCreated) {
989 clStatus = clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, gpuInfo->mpArryDevsID, buildOption,
990 nullptr, nullptr);
991 } else {
992 clStatus = clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, &(gpuInfo->mpDevID), buildOption,
993 nullptr, nullptr);
994 }
995 if (clStatus != CL_SUCCESS) {
996 tprintf("BuildProgram error!\n");
997 size_t length;
998 if (!gpuInfo->mnIsUserCreated) {
999 clStatus = clGetProgramBuildInfo(gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0],
1000 CL_PROGRAM_BUILD_LOG, 0, nullptr, &length);
1001 } else {
1002 clStatus = clGetProgramBuildInfo(gpuInfo->mpArryPrograms[idx], gpuInfo->mpDevID,
1003 CL_PROGRAM_BUILD_LOG, 0, nullptr, &length);
1004 }
1005 if (clStatus != CL_SUCCESS) {
1006 tprintf("opencl create build log fail\n");
1007 return 0;
1008 }
1009 std::vector<char> buildLog(length);
1010 if (!gpuInfo->mnIsUserCreated) {
1011 clStatus = clGetProgramBuildInfo(gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0],
1012 CL_PROGRAM_BUILD_LOG, length, &buildLog[0], &length);
1013 } else {
1014 clStatus = clGetProgramBuildInfo(gpuInfo->mpArryPrograms[idx], gpuInfo->mpDevID,
1015 CL_PROGRAM_BUILD_LOG, length, &buildLog[0], &length);
1016 }
1017 if (clStatus != CL_SUCCESS) {
1018 tprintf("opencl program build info fail\n");
1019 return 0;
1020 }
1021
1022 fd1 = fopen("kernel-build.log", "w+");
1023 if (fd1 != nullptr) {
1024 fwrite(&buildLog[0], sizeof(char), length, fd1);
1025 fclose(fd1);
1026 }
1027
1028 return 0;
1029 }
1030
1031 strcpy(gpuInfo->mArryKnelSrcFile[idx], filename);
1032 if (binaryExisted == 0) {
1033 GeneratBinFromKernelSource(gpuInfo->mpArryPrograms[idx], filename);
1034 }
1035
1036 gpuInfo->mnFileCount += 1;
1037 return 1;
1038 }
1039
pixReadFromTiffKernel(l_uint32 * tiffdata,l_int32 w,l_int32 h,l_int32 wpl,l_uint32 * line)1040 l_uint32 *OpenclDevice::pixReadFromTiffKernel(l_uint32 *tiffdata, l_int32 w, l_int32 h, l_int32 wpl,
1041 l_uint32 *line) {
1042 cl_int clStatus;
1043 KernelEnv rEnv;
1044 size_t globalThreads[2];
1045 size_t localThreads[2];
1046 int gsize;
1047 cl_mem valuesCl;
1048 cl_mem outputCl;
1049
1050 // global and local work dimensions for Horizontal pass
1051 gsize = (w + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1052 globalThreads[0] = gsize;
1053 gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1054 globalThreads[1] = gsize;
1055 localThreads[0] = GROUPSIZE_X;
1056 localThreads[1] = GROUPSIZE_Y;
1057
1058 SetKernelEnv(&rEnv);
1059
1060 l_uint32 *pResult = (l_uint32 *)malloc(w * h * sizeof(l_uint32));
1061 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "composeRGBPixel", &clStatus);
1062 CHECK_OPENCL(clStatus, "clCreateKernel composeRGBPixel");
1063
1064 // Allocate input and output OCL buffers
1065 valuesCl = allocateZeroCopyBuffer(rEnv, tiffdata, w * h, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1066 &clStatus);
1067 outputCl = allocateZeroCopyBuffer(rEnv, pResult, w * h, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
1068 &clStatus);
1069
1070 // Kernel arguments
1071 clStatus = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &valuesCl);
1072 CHECK_OPENCL(clStatus, "clSetKernelArg");
1073 clStatus = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(w), &w);
1074 CHECK_OPENCL(clStatus, "clSetKernelArg");
1075 clStatus = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(h), &h);
1076 CHECK_OPENCL(clStatus, "clSetKernelArg");
1077 clStatus = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl);
1078 CHECK_OPENCL(clStatus, "clSetKernelArg");
1079 clStatus = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(cl_mem), &outputCl);
1080 CHECK_OPENCL(clStatus, "clSetKernelArg");
1081
1082 // Kernel enqueue
1083 clStatus = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr, globalThreads,
1084 localThreads, 0, nullptr, nullptr);
1085 CHECK_OPENCL(clStatus, "clEnqueueNDRangeKernel");
1086
1087 /* map results back from gpu */
1088 void *ptr = clEnqueueMapBuffer(rEnv.mpkCmdQueue, outputCl, CL_TRUE, CL_MAP_READ, 0,
1089 w * h * sizeof(l_uint32), 0, nullptr, nullptr, &clStatus);
1090 CHECK_OPENCL(clStatus, "clEnqueueMapBuffer outputCl");
1091 clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, outputCl, ptr, 0, nullptr, nullptr);
1092
1093 // Sync
1094 clFinish(rEnv.mpkCmdQueue);
1095 return pResult;
1096 }
1097
1098 // Morphology Dilate operation for 5x5 structuring element. Invokes the relevant
1099 // OpenCL kernels
pixDilateCL_55(l_int32 wpl,l_int32 h)1100 static cl_int pixDilateCL_55(l_int32 wpl, l_int32 h) {
1101 size_t globalThreads[2];
1102 cl_mem pixtemp;
1103 cl_int status;
1104 int gsize;
1105 size_t localThreads[2];
1106
1107 // Horizontal pass
1108 gsize = (wpl * h + GROUPSIZE_HMORX - 1) / GROUPSIZE_HMORX * GROUPSIZE_HMORX;
1109 globalThreads[0] = gsize;
1110 globalThreads[1] = GROUPSIZE_HMORY;
1111 localThreads[0] = GROUPSIZE_HMORX;
1112 localThreads[1] = GROUPSIZE_HMORY;
1113
1114 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "morphoDilateHor_5x5", &status);
1115 CHECK_OPENCL(status, "clCreateKernel morphoDilateHor_5x5");
1116
1117 status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1118 status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1119 status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl);
1120 status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h);
1121
1122 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr, globalThreads,
1123 localThreads, 0, nullptr, nullptr);
1124
1125 // Swap source and dest buffers
1126 pixtemp = pixsCLBuffer;
1127 pixsCLBuffer = pixdCLBuffer;
1128 pixdCLBuffer = pixtemp;
1129
1130 // Vertical
1131 gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1132 globalThreads[0] = gsize;
1133 gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1134 globalThreads[1] = gsize;
1135 localThreads[0] = GROUPSIZE_X;
1136 localThreads[1] = GROUPSIZE_Y;
1137
1138 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "morphoDilateVer_5x5", &status);
1139 CHECK_OPENCL(status, "clCreateKernel morphoDilateVer_5x5");
1140
1141 status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1142 status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1143 status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl);
1144 status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h);
1145 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr, globalThreads,
1146 localThreads, 0, nullptr, nullptr);
1147
1148 return status;
1149 }
1150
1151 // Morphology Erode operation for 5x5 structuring element. Invokes the relevant
1152 // OpenCL kernels
pixErodeCL_55(l_int32 wpl,l_int32 h)1153 static cl_int pixErodeCL_55(l_int32 wpl, l_int32 h) {
1154 size_t globalThreads[2];
1155 cl_mem pixtemp;
1156 cl_int status;
1157 int gsize;
1158 l_uint32 fwmask, lwmask;
1159 size_t localThreads[2];
1160
1161 lwmask = lmask32[31 - 2];
1162 fwmask = rmask32[31 - 2];
1163
1164 // Horizontal pass
1165 gsize = (wpl * h + GROUPSIZE_HMORX - 1) / GROUPSIZE_HMORX * GROUPSIZE_HMORX;
1166 globalThreads[0] = gsize;
1167 globalThreads[1] = GROUPSIZE_HMORY;
1168 localThreads[0] = GROUPSIZE_HMORX;
1169 localThreads[1] = GROUPSIZE_HMORY;
1170
1171 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "morphoErodeHor_5x5", &status);
1172 CHECK_OPENCL(status, "clCreateKernel morphoErodeHor_5x5");
1173
1174 status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1175 status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1176 status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl);
1177 status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h);
1178
1179 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr, globalThreads,
1180 localThreads, 0, nullptr, nullptr);
1181
1182 // Swap source and dest buffers
1183 pixtemp = pixsCLBuffer;
1184 pixsCLBuffer = pixdCLBuffer;
1185 pixdCLBuffer = pixtemp;
1186
1187 // Vertical
1188 gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1189 globalThreads[0] = gsize;
1190 gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1191 globalThreads[1] = gsize;
1192 localThreads[0] = GROUPSIZE_X;
1193 localThreads[1] = GROUPSIZE_Y;
1194
1195 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "morphoErodeVer_5x5", &status);
1196 CHECK_OPENCL(status, "clCreateKernel morphoErodeVer_5x5");
1197
1198 status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1199 status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1200 status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl);
1201 status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h);
1202 status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(fwmask), &fwmask);
1203 status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(lwmask), &lwmask);
1204 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr, globalThreads,
1205 localThreads, 0, nullptr, nullptr);
1206
1207 return status;
1208 }
1209
1210 // Morphology Dilate operation. Invokes the relevant OpenCL kernels
pixDilateCL(l_int32 hsize,l_int32 vsize,l_int32 wpl,l_int32 h)1211 static cl_int pixDilateCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h) {
1212 l_int32 xp, yp, xn, yn;
1213 SEL *sel;
1214 size_t globalThreads[2];
1215 cl_mem pixtemp;
1216 cl_int status = 0;
1217 int gsize;
1218 size_t localThreads[2];
1219 char isEven;
1220
1221 OpenclDevice::SetKernelEnv(&rEnv);
1222
1223 if (hsize == 5 && vsize == 5) {
1224 // Specific case for 5x5
1225 status = pixDilateCL_55(wpl, h);
1226 return status;
1227 }
1228
1229 sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT);
1230
1231 selFindMaxTranslations(sel, &xp, &yp, &xn, &yn);
1232 selDestroy(&sel);
1233 // global and local work dimensions for Horizontal pass
1234 gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1235 globalThreads[0] = gsize;
1236 gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1237 globalThreads[1] = gsize;
1238 localThreads[0] = GROUPSIZE_X;
1239 localThreads[1] = GROUPSIZE_Y;
1240
1241 if (xp > 31 || xn > 31) {
1242 // Generic case.
1243 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "morphoDilateHor", &status);
1244 CHECK_OPENCL(status, "clCreateKernel morphoDilateHor");
1245
1246 status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1247 status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1248 status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(xp), &xp);
1249 status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(xn), &xn);
1250 status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(wpl), &wpl);
1251 status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(h), &h);
1252 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr, globalThreads,
1253 localThreads, 0, nullptr, nullptr);
1254
1255 if (yp > 0 || yn > 0) {
1256 pixtemp = pixsCLBuffer;
1257 pixsCLBuffer = pixdCLBuffer;
1258 pixdCLBuffer = pixtemp;
1259 }
1260 } else if (xp > 0 || xn > 0) {
1261 // Specific Horizontal pass kernel for half width < 32
1262 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "morphoDilateHor_32word", &status);
1263 CHECK_OPENCL(status, "clCreateKernel morphoDilateHor_32word");
1264 isEven = (xp != xn);
1265
1266 status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1267 status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1268 status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(xp), &xp);
1269 status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl);
1270 status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(h), &h);
1271 status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(isEven), &isEven);
1272 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr, globalThreads,
1273 localThreads, 0, nullptr, nullptr);
1274
1275 if (yp > 0 || yn > 0) {
1276 pixtemp = pixsCLBuffer;
1277 pixsCLBuffer = pixdCLBuffer;
1278 pixdCLBuffer = pixtemp;
1279 }
1280 }
1281
1282 if (yp > 0 || yn > 0) {
1283 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "morphoDilateVer", &status);
1284 CHECK_OPENCL(status, "clCreateKernel morphoDilateVer");
1285
1286 status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1287 status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1288 status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(yp), &yp);
1289 status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl);
1290 status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(h), &h);
1291 status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(yn), &yn);
1292 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr, globalThreads,
1293 localThreads, 0, nullptr, nullptr);
1294 }
1295
1296 return status;
1297 }
1298
1299 // Morphology Erode operation. Invokes the relevant OpenCL kernels
pixErodeCL(l_int32 hsize,l_int32 vsize,l_uint32 wpl,l_uint32 h)1300 static cl_int pixErodeCL(l_int32 hsize, l_int32 vsize, l_uint32 wpl, l_uint32 h) {
1301 l_int32 xp, yp, xn, yn;
1302 SEL *sel;
1303 size_t globalThreads[2];
1304 size_t localThreads[2];
1305 cl_mem pixtemp;
1306 cl_int status = 0;
1307 int gsize;
1308 char isAsymmetric = (MORPH_BC == ASYMMETRIC_MORPH_BC);
1309 l_uint32 rwmask, lwmask;
1310 char isEven;
1311
1312 sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT);
1313
1314 selFindMaxTranslations(sel, &xp, &yp, &xn, &yn);
1315 selDestroy(&sel);
1316 OpenclDevice::SetKernelEnv(&rEnv);
1317
1318 if (hsize == 5 && vsize == 5 && isAsymmetric) {
1319 // Specific kernel for 5x5
1320 status = pixErodeCL_55(wpl, h);
1321 return status;
1322 }
1323
1324 lwmask = lmask32[31 - (xn & 31)];
1325 rwmask = rmask32[31 - (xp & 31)];
1326
1327 // global and local work dimensions for Horizontal pass
1328 gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1329 globalThreads[0] = gsize;
1330 gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1331 globalThreads[1] = gsize;
1332 localThreads[0] = GROUPSIZE_X;
1333 localThreads[1] = GROUPSIZE_Y;
1334
1335 // Horizontal Pass
1336 if (xp > 31 || xn > 31) {
1337 // Generic case.
1338 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "morphoErodeHor", &status);
1339
1340 status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1341 status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1342 status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(xp), &xp);
1343 status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(xn), &xn);
1344 status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(wpl), &wpl);
1345 status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(h), &h);
1346 status = clSetKernelArg(rEnv.mpkKernel, 6, sizeof(isAsymmetric), &isAsymmetric);
1347 status = clSetKernelArg(rEnv.mpkKernel, 7, sizeof(rwmask), &rwmask);
1348 status = clSetKernelArg(rEnv.mpkKernel, 8, sizeof(lwmask), &lwmask);
1349 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr, globalThreads,
1350 localThreads, 0, nullptr, nullptr);
1351
1352 if (yp > 0 || yn > 0) {
1353 pixtemp = pixsCLBuffer;
1354 pixsCLBuffer = pixdCLBuffer;
1355 pixdCLBuffer = pixtemp;
1356 }
1357 } else if (xp > 0 || xn > 0) {
1358 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "morphoErodeHor_32word", &status);
1359 isEven = (xp != xn);
1360
1361 status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1362 status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1363 status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(xp), &xp);
1364 status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl);
1365 status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(h), &h);
1366 status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(isAsymmetric), &isAsymmetric);
1367 status = clSetKernelArg(rEnv.mpkKernel, 6, sizeof(rwmask), &rwmask);
1368 status = clSetKernelArg(rEnv.mpkKernel, 7, sizeof(lwmask), &lwmask);
1369 status = clSetKernelArg(rEnv.mpkKernel, 8, sizeof(isEven), &isEven);
1370 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr, globalThreads,
1371 localThreads, 0, nullptr, nullptr);
1372
1373 if (yp > 0 || yn > 0) {
1374 pixtemp = pixsCLBuffer;
1375 pixsCLBuffer = pixdCLBuffer;
1376 pixdCLBuffer = pixtemp;
1377 }
1378 }
1379
1380 // Vertical Pass
1381 if (yp > 0 || yn > 0) {
1382 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "morphoErodeVer", &status);
1383 CHECK_OPENCL(status, "clCreateKernel morphoErodeVer");
1384
1385 status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1386 status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1387 status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(yp), &yp);
1388 status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl);
1389 status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(h), &h);
1390 status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(isAsymmetric), &isAsymmetric);
1391 status = clSetKernelArg(rEnv.mpkKernel, 6, sizeof(yn), &yn);
1392 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr, globalThreads,
1393 localThreads, 0, nullptr, nullptr);
1394 }
1395
1396 return status;
1397 }
1398
1399 // Morphology Open operation. Invokes the relevant OpenCL kernels
pixOpenCL(l_int32 hsize,l_int32 vsize,l_int32 wpl,l_int32 h)1400 static cl_int pixOpenCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h) {
1401 cl_int status;
1402 cl_mem pixtemp;
1403
1404 // Erode followed by Dilate
1405 status = pixErodeCL(hsize, vsize, wpl, h);
1406
1407 pixtemp = pixsCLBuffer;
1408 pixsCLBuffer = pixdCLBuffer;
1409 pixdCLBuffer = pixtemp;
1410
1411 status = pixDilateCL(hsize, vsize, wpl, h);
1412
1413 return status;
1414 }
1415
1416 // Morphology Close operation. Invokes the relevant OpenCL kernels
pixCloseCL(l_int32 hsize,l_int32 vsize,l_int32 wpl,l_int32 h)1417 static cl_int pixCloseCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h) {
1418 cl_int status;
1419 cl_mem pixtemp;
1420
1421 // Dilate followed by Erode
1422 status = pixDilateCL(hsize, vsize, wpl, h);
1423
1424 pixtemp = pixsCLBuffer;
1425 pixsCLBuffer = pixdCLBuffer;
1426 pixdCLBuffer = pixtemp;
1427
1428 status = pixErodeCL(hsize, vsize, wpl, h);
1429
1430 return status;
1431 }
1432
1433 // output = buffer1 & ~(buffer2)
pixSubtractCL_work(l_uint32 wpl,l_uint32 h,cl_mem buffer1,cl_mem buffer2)1434 static cl_int pixSubtractCL_work(l_uint32 wpl, l_uint32 h, cl_mem buffer1, cl_mem buffer2) {
1435 cl_int status;
1436 size_t globalThreads[2];
1437 int gsize;
1438 size_t localThreads[] = {GROUPSIZE_X, GROUPSIZE_Y};
1439
1440 gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1441 globalThreads[0] = gsize;
1442 gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1443 globalThreads[1] = gsize;
1444
1445 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "pixSubtract_inplace", &status);
1446 CHECK_OPENCL(status, "clCreateKernel pixSubtract_inplace");
1447
1448 // Enqueue a kernel run call.
1449 status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &buffer1);
1450 status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &buffer2);
1451 status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl);
1452 status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h);
1453 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr, globalThreads,
1454 localThreads, 0, nullptr, nullptr);
1455
1456 return status;
1457 }
1458
1459 // OpenCL implementation of Get Lines from pix function
1460 // Note: Assumes the source and dest opencl buffer are initialized. No check
1461 // done
pixGetLinesCL(Image pixd,Image pixs,Image * pix_vline,Image * pix_hline,Image * pixClosed,bool getpixClosed,l_int32 close_hsize,l_int32 close_vsize,l_int32 open_hsize,l_int32 open_vsize,l_int32 line_hsize,l_int32 line_vsize)1462 void OpenclDevice::pixGetLinesCL(Image pixd, Image pixs, Image *pix_vline, Image *pix_hline,
1463 Image *pixClosed, bool getpixClosed, l_int32 close_hsize,
1464 l_int32 close_vsize, l_int32 open_hsize, l_int32 open_vsize,
1465 l_int32 line_hsize, l_int32 line_vsize) {
1466 l_uint32 wpl, h;
1467 cl_mem pixtemp;
1468
1469 wpl = pixGetWpl(pixs);
1470 h = pixGetHeight(pixs);
1471
1472 // First step : Close Morph operation: Dilate followed by Erode
1473 clStatus = pixCloseCL(close_hsize, close_vsize, wpl, h);
1474
1475 // Copy the Close output to CPU buffer
1476 if (getpixClosed) {
1477 *pixClosed =
1478 mapOutputCLBuffer(rEnv, pixdCLBuffer, *pixClosed, pixs, wpl * h, CL_MAP_READ, true, false);
1479 }
1480
1481 // Store the output of close operation in an intermediate buffer
1482 // this will be later used for pixsubtract
1483 clStatus = clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0, 0,
1484 sizeof(int) * wpl * h, 0, nullptr, nullptr);
1485
1486 // Second step: Open Operation - Erode followed by Dilate
1487 pixtemp = pixsCLBuffer;
1488 pixsCLBuffer = pixdCLBuffer;
1489 pixdCLBuffer = pixtemp;
1490
1491 clStatus = pixOpenCL(open_hsize, open_vsize, wpl, h);
1492
1493 // Third step: Subtract : (Close - Open)
1494 pixtemp = pixsCLBuffer;
1495 pixsCLBuffer = pixdCLBuffer;
1496 pixdCLBuffer = pixdCLIntermediate;
1497 pixdCLIntermediate = pixtemp;
1498
1499 clStatus = pixSubtractCL_work(wpl, h, pixdCLBuffer, pixsCLBuffer);
1500
1501 // Store the output of Hollow operation in an intermediate buffer
1502 // this will be later used
1503 clStatus = clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0, 0,
1504 sizeof(int) * wpl * h, 0, nullptr, nullptr);
1505
1506 pixtemp = pixsCLBuffer;
1507 pixsCLBuffer = pixdCLBuffer;
1508 pixdCLBuffer = pixtemp;
1509
1510 // Fourth step: Get vertical line
1511 // pixOpenBrick(nullptr, pix_hollow, 1, min_line_length);
1512 clStatus = pixOpenCL(1, line_vsize, wpl, h);
1513
1514 // Copy the vertical line output to CPU buffer
1515 *pix_vline =
1516 mapOutputCLBuffer(rEnv, pixdCLBuffer, *pix_vline, pixs, wpl * h, CL_MAP_READ, true, false);
1517
1518 pixtemp = pixsCLBuffer;
1519 pixsCLBuffer = pixdCLIntermediate;
1520 pixdCLIntermediate = pixtemp;
1521
1522 // Fifth step: Get horizontal line
1523 // pixOpenBrick(nullptr, pix_hollow, min_line_length, 1);
1524 clStatus = pixOpenCL(line_hsize, 1, wpl, h);
1525
1526 // Copy the horizontal line output to CPU buffer
1527 *pix_hline =
1528 mapOutputCLBuffer(rEnv, pixdCLBuffer, *pix_hline, pixs, wpl * h, CL_MAP_READ, true, true);
1529
1530 return;
1531 }
1532
1533 /*************************************************************************
1534 * HistogramRect
1535 * Otsu Thresholding Operations
1536 * histogramAllChannels is laid out as all channel 0, then all channel 1...
1537 * only supports 1 or 4 channels (bytes_per_pixel)
1538 ************************************************************************/
HistogramRectOCL(void * imageData,int bytes_per_pixel,int bytes_per_line,int left,int top,int width,int height,int kHistogramSize,int * histogramAllChannels)1539 int OpenclDevice::HistogramRectOCL(void *imageData, int bytes_per_pixel, int bytes_per_line,
1540 int left, // always 0
1541 int top, // always 0
1542 int width, int height, int kHistogramSize,
1543 int *histogramAllChannels) {
1544 cl_int clStatus;
1545 int retVal = 0;
1546 KernelEnv histKern;
1547 SetKernelEnv(&histKern);
1548 KernelEnv histRedKern;
1549 SetKernelEnv(&histRedKern);
1550 /* map imagedata to device as read only */
1551 // USE_HOST_PTR uses onion+ bus which is slowest option; also happens to be
1552 // coherent which we don't need.
1553 // faster option would be to allocate initial image buffer
1554 // using a garlic bus memory type
1555 cl_mem imageBuffer =
1556 clCreateBuffer(histKern.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1557 width * height * bytes_per_pixel * sizeof(char), imageData, &clStatus);
1558 CHECK_OPENCL(clStatus, "clCreateBuffer imageBuffer");
1559
1560 /* setup work group size parameters */
1561 int block_size = 256;
1562 cl_uint numCUs;
1563 clStatus = clGetDeviceInfo(gpuEnv.mpDevID, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(numCUs), &numCUs,
1564 nullptr);
1565 CHECK_OPENCL(clStatus, "clCreateBuffer imageBuffer");
1566
1567 int requestedOccupancy = 10;
1568 int numWorkGroups = numCUs * requestedOccupancy;
1569 int numThreads = block_size * numWorkGroups;
1570 size_t local_work_size[] = {static_cast<size_t>(block_size)};
1571 size_t global_work_size[] = {static_cast<size_t>(numThreads)};
1572 size_t red_global_work_size[] = {
1573 static_cast<size_t>(block_size * kHistogramSize * bytes_per_pixel)};
1574
1575 /* map histogramAllChannels as write only */
1576
1577 cl_mem histogramBuffer = clCreateBuffer(
1578 histKern.mpkContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
1579 kHistogramSize * bytes_per_pixel * sizeof(int), histogramAllChannels, &clStatus);
1580 CHECK_OPENCL(clStatus, "clCreateBuffer histogramBuffer");
1581
1582 /* intermediate histogram buffer */
1583 int histRed = 256;
1584 int tmpHistogramBins = kHistogramSize * bytes_per_pixel * histRed;
1585
1586 cl_mem tmpHistogramBuffer =
1587 clCreateBuffer(histKern.mpkContext, CL_MEM_READ_WRITE, tmpHistogramBins * sizeof(cl_uint),
1588 nullptr, &clStatus);
1589 CHECK_OPENCL(clStatus, "clCreateBuffer tmpHistogramBuffer");
1590
1591 /* atomic sync buffer */
1592 int *zeroBuffer = new int[1];
1593 zeroBuffer[0] = 0;
1594 cl_mem atomicSyncBuffer =
1595 clCreateBuffer(histKern.mpkContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(cl_int),
1596 zeroBuffer, &clStatus);
1597 CHECK_OPENCL(clStatus, "clCreateBuffer atomicSyncBuffer");
1598 delete[] zeroBuffer;
1599 // Create kernel objects based on bytes_per_pixel
1600 if (bytes_per_pixel == 1) {
1601 histKern.mpkKernel =
1602 clCreateKernel(histKern.mpkProgram, "kernel_HistogramRectOneChannel", &clStatus);
1603 CHECK_OPENCL(clStatus, "clCreateKernel kernel_HistogramRectOneChannel");
1604
1605 histRedKern.mpkKernel = clCreateKernel(histRedKern.mpkProgram,
1606 "kernel_HistogramRectOneChannelReduction", &clStatus);
1607 CHECK_OPENCL(clStatus, "clCreateKernel kernel_HistogramRectOneChannelReduction");
1608 } else {
1609 histKern.mpkKernel =
1610 clCreateKernel(histKern.mpkProgram, "kernel_HistogramRectAllChannels", &clStatus);
1611 CHECK_OPENCL(clStatus, "clCreateKernel kernel_HistogramRectAllChannels");
1612
1613 histRedKern.mpkKernel = clCreateKernel(histRedKern.mpkProgram,
1614 "kernel_HistogramRectAllChannelsReduction", &clStatus);
1615 CHECK_OPENCL(clStatus, "clCreateKernel kernel_HistogramRectAllChannelsReduction");
1616 }
1617
1618 void *ptr;
1619
1620 // Initialize tmpHistogramBuffer buffer
1621 ptr = clEnqueueMapBuffer(histKern.mpkCmdQueue, tmpHistogramBuffer, CL_TRUE, CL_MAP_WRITE, 0,
1622 tmpHistogramBins * sizeof(cl_uint), 0, nullptr, nullptr, &clStatus);
1623 CHECK_OPENCL(clStatus, "clEnqueueMapBuffer tmpHistogramBuffer");
1624
1625 memset(ptr, 0, tmpHistogramBins * sizeof(cl_uint));
1626 clEnqueueUnmapMemObject(histKern.mpkCmdQueue, tmpHistogramBuffer, ptr, 0, nullptr, nullptr);
1627
1628 /* set kernel 1 arguments */
1629 clStatus = clSetKernelArg(histKern.mpkKernel, 0, sizeof(cl_mem), &imageBuffer);
1630 CHECK_OPENCL(clStatus, "clSetKernelArg imageBuffer");
1631 cl_uint numPixels = width * height;
1632 clStatus = clSetKernelArg(histKern.mpkKernel, 1, sizeof(cl_uint), &numPixels);
1633 CHECK_OPENCL(clStatus, "clSetKernelArg numPixels");
1634 clStatus = clSetKernelArg(histKern.mpkKernel, 2, sizeof(cl_mem), &tmpHistogramBuffer);
1635 CHECK_OPENCL(clStatus, "clSetKernelArg tmpHistogramBuffer");
1636
1637 /* set kernel 2 arguments */
1638 int n = numThreads / bytes_per_pixel;
1639 clStatus = clSetKernelArg(histRedKern.mpkKernel, 0, sizeof(cl_int), &n);
1640 CHECK_OPENCL(clStatus, "clSetKernelArg imageBuffer");
1641 clStatus = clSetKernelArg(histRedKern.mpkKernel, 1, sizeof(cl_mem), &tmpHistogramBuffer);
1642 CHECK_OPENCL(clStatus, "clSetKernelArg tmpHistogramBuffer");
1643 clStatus = clSetKernelArg(histRedKern.mpkKernel, 2, sizeof(cl_mem), &histogramBuffer);
1644 CHECK_OPENCL(clStatus, "clSetKernelArg histogramBuffer");
1645
1646 /* launch histogram */
1647 clStatus = clEnqueueNDRangeKernel(histKern.mpkCmdQueue, histKern.mpkKernel, 1, nullptr,
1648 global_work_size, local_work_size, 0, nullptr, nullptr);
1649 CHECK_OPENCL(clStatus, "clEnqueueNDRangeKernel kernel_HistogramRectAllChannels");
1650 clFinish(histKern.mpkCmdQueue);
1651 if (clStatus != 0) {
1652 retVal = -1;
1653 }
1654 /* launch histogram */
1655 clStatus = clEnqueueNDRangeKernel(histRedKern.mpkCmdQueue, histRedKern.mpkKernel, 1, nullptr,
1656 red_global_work_size, local_work_size, 0, nullptr, nullptr);
1657 CHECK_OPENCL(clStatus, "clEnqueueNDRangeKernel kernel_HistogramRectAllChannelsReduction");
1658 clFinish(histRedKern.mpkCmdQueue);
1659 if (clStatus != 0) {
1660 retVal = -1;
1661 }
1662
1663 /* map results back from gpu */
1664 ptr = clEnqueueMapBuffer(histRedKern.mpkCmdQueue, histogramBuffer, CL_TRUE, CL_MAP_READ, 0,
1665 kHistogramSize * bytes_per_pixel * sizeof(int), 0, nullptr, nullptr,
1666 &clStatus);
1667 CHECK_OPENCL(clStatus, "clEnqueueMapBuffer histogramBuffer");
1668 if (clStatus != 0) {
1669 retVal = -1;
1670 }
1671 clEnqueueUnmapMemObject(histRedKern.mpkCmdQueue, histogramBuffer, ptr, 0, nullptr, nullptr);
1672
1673 clReleaseMemObject(histogramBuffer);
1674 clReleaseMemObject(imageBuffer);
1675 return retVal;
1676 }
1677
1678 /*************************************************************************
1679 * Threshold the rectangle, taking everything except the image buffer pointer
1680 * from the class, using thresholds/hi_values to the output IMAGE.
1681 * only supports 1 or 4 channels
1682 ************************************************************************/
ThresholdRectToPixOCL(unsigned char * imageData,int bytes_per_pixel,int bytes_per_line,int * thresholds,int * hi_values,Image * pix,int height,int width,int top,int left)1683 int OpenclDevice::ThresholdRectToPixOCL(unsigned char *imageData, int bytes_per_pixel,
1684 int bytes_per_line, int *thresholds, int *hi_values,
1685 Image *pix, int height, int width, int top, int left) {
1686 int retVal = 0;
1687 /* create pix result buffer */
1688 *pix = pixCreate(width, height, 1);
1689 uint32_t *pixData = pixGetData(*pix);
1690 int wpl = pixGetWpl(*pix);
1691 int pixSize = wpl * height * sizeof(uint32_t); // number of pixels
1692
1693 cl_int clStatus;
1694 KernelEnv rEnv;
1695 SetKernelEnv(&rEnv);
1696
1697 /* setup work group size parameters */
1698 int block_size = 256;
1699 cl_uint numCUs = 6;
1700 clStatus = clGetDeviceInfo(gpuEnv.mpDevID, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(numCUs), &numCUs,
1701 nullptr);
1702 CHECK_OPENCL(clStatus, "clCreateBuffer imageBuffer");
1703
1704 int requestedOccupancy = 10;
1705 int numWorkGroups = numCUs * requestedOccupancy;
1706 int numThreads = block_size * numWorkGroups;
1707 size_t local_work_size[] = {(size_t)block_size};
1708 size_t global_work_size[] = {(size_t)numThreads};
1709
1710 /* map imagedata to device as read only */
1711 // USE_HOST_PTR uses onion+ bus which is slowest option; also happens to be
1712 // coherent which we don't need.
1713 // faster option would be to allocate initial image buffer
1714 // using a garlic bus memory type
1715 cl_mem imageBuffer =
1716 clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1717 width * height * bytes_per_pixel * sizeof(char), imageData, &clStatus);
1718 CHECK_OPENCL(clStatus, "clCreateBuffer imageBuffer");
1719
1720 /* map pix as write only */
1721 pixThBuffer = clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, pixSize,
1722 pixData, &clStatus);
1723 CHECK_OPENCL(clStatus, "clCreateBuffer pix");
1724
1725 /* map thresholds and hi_values */
1726 cl_mem thresholdsBuffer = clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1727 bytes_per_pixel * sizeof(int), thresholds, &clStatus);
1728 CHECK_OPENCL(clStatus, "clCreateBuffer thresholdBuffer");
1729 cl_mem hiValuesBuffer = clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1730 bytes_per_pixel * sizeof(int), hi_values, &clStatus);
1731 CHECK_OPENCL(clStatus, "clCreateBuffer hiValuesBuffer");
1732
1733 /* compile kernel */
1734 if (bytes_per_pixel == 4) {
1735 rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "kernel_ThresholdRectToPix", &clStatus);
1736 CHECK_OPENCL(clStatus, "clCreateKernel kernel_ThresholdRectToPix");
1737 } else {
1738 rEnv.mpkKernel =
1739 clCreateKernel(rEnv.mpkProgram, "kernel_ThresholdRectToPix_OneChan", &clStatus);
1740 CHECK_OPENCL(clStatus, "clCreateKernel kernel_ThresholdRectToPix_OneChan");
1741 }
1742
1743 /* set kernel arguments */
1744 clStatus = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &imageBuffer);
1745 CHECK_OPENCL(clStatus, "clSetKernelArg imageBuffer");
1746 clStatus = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(int), &height);
1747 CHECK_OPENCL(clStatus, "clSetKernelArg height");
1748 clStatus = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(int), &width);
1749 CHECK_OPENCL(clStatus, "clSetKernelArg width");
1750 clStatus = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(int), &wpl);
1751 CHECK_OPENCL(clStatus, "clSetKernelArg wpl");
1752 clStatus = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(cl_mem), &thresholdsBuffer);
1753 CHECK_OPENCL(clStatus, "clSetKernelArg thresholdsBuffer");
1754 clStatus = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(cl_mem), &hiValuesBuffer);
1755 CHECK_OPENCL(clStatus, "clSetKernelArg hiValuesBuffer");
1756 clStatus = clSetKernelArg(rEnv.mpkKernel, 6, sizeof(cl_mem), &pixThBuffer);
1757 CHECK_OPENCL(clStatus, "clSetKernelArg pixThBuffer");
1758
1759 /* launch kernel & wait */
1760 clStatus = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 1, nullptr, global_work_size,
1761 local_work_size, 0, nullptr, nullptr);
1762 CHECK_OPENCL(clStatus, "clEnqueueNDRangeKernel kernel_ThresholdRectToPix");
1763 clFinish(rEnv.mpkCmdQueue);
1764 if (clStatus != 0) {
1765 tprintf("Setting return value to -1\n");
1766 retVal = -1;
1767 }
1768 /* map results back from gpu */
1769 void *ptr = clEnqueueMapBuffer(rEnv.mpkCmdQueue, pixThBuffer, CL_TRUE, CL_MAP_READ, 0, pixSize, 0,
1770 nullptr, nullptr, &clStatus);
1771 CHECK_OPENCL(clStatus, "clEnqueueMapBuffer histogramBuffer");
1772 clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, pixThBuffer, ptr, 0, nullptr, nullptr);
1773
1774 clReleaseMemObject(imageBuffer);
1775 clReleaseMemObject(thresholdsBuffer);
1776 clReleaseMemObject(hiValuesBuffer);
1777
1778 return retVal;
1779 }
1780
1781 /******************************************************************************
1782 * Data Types for Device Selection
1783 *****************************************************************************/
1784
1785 struct TessScoreEvaluationInputData {
1786 int height;
1787 int width;
1788 int numChannels;
1789 unsigned char *imageData;
1790 Image pix;
1791 };
1792
populateTessScoreEvaluationInputData(TessScoreEvaluationInputData * input)1793 static void populateTessScoreEvaluationInputData(TessScoreEvaluationInputData *input) {
1794 srand(1);
1795 // 8.5x11 inches @ 300dpi rounded to clean multiples
1796 int height = 3328; // %256
1797 int width = 2560; // %512
1798 int numChannels = 4;
1799 input->height = height;
1800 input->width = width;
1801 input->numChannels = numChannels;
1802 unsigned char(*imageData4)[4] = (unsigned char(*)[4])malloc(
1803 height * width * numChannels * sizeof(unsigned char)); // new unsigned char[4][height*width];
1804 input->imageData = (unsigned char *)&imageData4[0];
1805
1806 // zero out image
1807 unsigned char pixelWhite[4] = {0, 0, 0, 255};
1808 unsigned char pixelBlack[4] = {255, 255, 255, 255};
1809 for (int p = 0; p < height * width; p++) {
1810 // unsigned char tmp[4] = imageData4[0];
1811 imageData4[p][0] = pixelWhite[0];
1812 imageData4[p][1] = pixelWhite[1];
1813 imageData4[p][2] = pixelWhite[2];
1814 imageData4[p][3] = pixelWhite[3];
1815 }
1816 // random lines to be eliminated
1817 int maxLineWidth = 64; // pixels wide
1818 int numLines = 10;
1819 // vertical lines
1820 for (int i = 0; i < numLines; i++) {
1821 int lineWidth = rand() % maxLineWidth;
1822 int vertLinePos = lineWidth + rand() % (width - 2 * lineWidth);
1823 // tprintf("[PI] VerticalLine @ %i (w=%i)\n", vertLinePos, lineWidth);
1824 for (int row = vertLinePos - lineWidth / 2; row < vertLinePos + lineWidth / 2; row++) {
1825 for (int col = 0; col < height; col++) {
1826 // imageData4[row*width+col] = pixelBlack;
1827 imageData4[row * width + col][0] = pixelBlack[0];
1828 imageData4[row * width + col][1] = pixelBlack[1];
1829 imageData4[row * width + col][2] = pixelBlack[2];
1830 imageData4[row * width + col][3] = pixelBlack[3];
1831 }
1832 }
1833 }
1834 // horizontal lines
1835 for (int i = 0; i < numLines; i++) {
1836 int lineWidth = rand() % maxLineWidth;
1837 int horLinePos = lineWidth + rand() % (height - 2 * lineWidth);
1838 // tprintf("[PI] HorizontalLine @ %i (w=%i)\n", horLinePos, lineWidth);
1839 for (int row = 0; row < width; row++) {
1840 for (int col = horLinePos - lineWidth / 2; col < horLinePos + lineWidth / 2;
1841 col++) { // for (int row = vertLinePos-lineWidth/2; row <
1842 // vertLinePos+lineWidth/2; row++) {
1843 // tprintf("[PI] HoizLine pix @ (%3i, %3i)\n", row, col);
1844 // imageData4[row*width+col] = pixelBlack;
1845 imageData4[row * width + col][0] = pixelBlack[0];
1846 imageData4[row * width + col][1] = pixelBlack[1];
1847 imageData4[row * width + col][2] = pixelBlack[2];
1848 imageData4[row * width + col][3] = pixelBlack[3];
1849 }
1850 }
1851 }
1852 // spots (noise, squares)
1853 float fractionBlack = 0.1; // how much of the image should be blackened
1854 int numSpots = (height * width) * fractionBlack / (maxLineWidth * maxLineWidth / 2 / 2);
1855 for (int i = 0; i < numSpots; i++) {
1856 int lineWidth = rand() % maxLineWidth;
1857 int col = lineWidth + rand() % (width - 2 * lineWidth);
1858 int row = lineWidth + rand() % (height - 2 * lineWidth);
1859 // tprintf("[PI] Spot[%i/%i] @ (%3i, %3i)\n", i, numSpots, row, col );
1860 for (int r = row - lineWidth / 2; r < row + lineWidth / 2; r++) {
1861 for (int c = col - lineWidth / 2; c < col + lineWidth / 2; c++) {
1862 // tprintf("[PI] \tSpot[%i/%i] @ (%3i, %3i)\n", i, numSpots, r, c );
1863 // imageData4[row*width+col] = pixelBlack;
1864 imageData4[r * width + c][0] = pixelBlack[0];
1865 imageData4[r * width + c][1] = pixelBlack[1];
1866 imageData4[r * width + c][2] = pixelBlack[2];
1867 imageData4[r * width + c][3] = pixelBlack[3];
1868 }
1869 }
1870 }
1871
1872 input->pix = pixCreate(input->width, input->height, 8 * input->numChannels);
1873 }
1874
1875 struct TessDeviceScore {
1876 float time; // small time means faster device
1877 bool clError; // were there any opencl errors
1878 bool valid; // was the correct response generated
1879 };
1880
1881 /******************************************************************************
1882 * Micro Benchmarks for Device Selection
1883 *****************************************************************************/
1884
composeRGBPixelMicroBench(GPUEnv * env,TessScoreEvaluationInputData input,ds_device_type type)1885 static double composeRGBPixelMicroBench(GPUEnv *env, TessScoreEvaluationInputData input,
1886 ds_device_type type) {
1887 double time = 0;
1888 # if ON_WINDOWS
1889 LARGE_INTEGER freq, time_funct_start, time_funct_end;
1890 QueryPerformanceFrequency(&freq);
1891 # elif ON_APPLE
1892 mach_timebase_info_data_t info = {0, 0};
1893 mach_timebase_info(&info);
1894 long long start, stop;
1895 # else
1896 timespec time_funct_start, time_funct_end;
1897 # endif
1898 // input data
1899 l_uint32 *tiffdata = (l_uint32 *)input.imageData; // same size and random data; data doesn't
1900 // change workload
1901
1902 // function call
1903 if (type == DS_DEVICE_OPENCL_DEVICE) {
1904 # if ON_WINDOWS
1905 QueryPerformanceCounter(&time_funct_start);
1906 # elif ON_APPLE
1907 start = mach_absolute_time();
1908 # else
1909 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
1910 # endif
1911
1912 OpenclDevice::gpuEnv = *env;
1913 int wpl = pixGetWpl(input.pix);
1914 OpenclDevice::pixReadFromTiffKernel(tiffdata, input.width, input.height, wpl, nullptr);
1915 # if ON_WINDOWS
1916 QueryPerformanceCounter(&time_funct_end);
1917 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) / (double)(freq.QuadPart);
1918 # elif ON_APPLE
1919 stop = mach_absolute_time();
1920 time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
1921 # else
1922 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
1923 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
1924 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
1925 # endif
1926
1927 } else {
1928 # if ON_WINDOWS
1929 QueryPerformanceCounter(&time_funct_start);
1930 # elif ON_APPLE
1931 start = mach_absolute_time();
1932 # else
1933 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
1934 # endif
1935 Image pix = pixCreate(input.width, input.height, 32);
1936 l_uint32 *pixData = pixGetData(pix);
1937 int i, j;
1938 int idx = 0;
1939 for (i = 0; i < input.height; i++) {
1940 for (j = 0; j < input.width; j++) {
1941 l_uint32 tiffword = tiffdata[i * input.width + j];
1942 l_int32 rval = ((tiffword)&0xff);
1943 l_int32 gval = (((tiffword) >> 8) & 0xff);
1944 l_int32 bval = (((tiffword) >> 16) & 0xff);
1945 l_uint32 value = (rval << 24) | (gval << 16) | (bval << 8);
1946 pixData[idx] = value;
1947 idx++;
1948 }
1949 }
1950 # if ON_WINDOWS
1951 QueryPerformanceCounter(&time_funct_end);
1952 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) / (double)(freq.QuadPart);
1953 # elif ON_APPLE
1954 stop = mach_absolute_time();
1955 time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
1956 # else
1957 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
1958 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
1959 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
1960 # endif
1961 pix.destroy();
1962 }
1963
1964 return time;
1965 }
1966
histogramRectMicroBench(GPUEnv * env,TessScoreEvaluationInputData input,ds_device_type type)1967 static double histogramRectMicroBench(GPUEnv *env, TessScoreEvaluationInputData input,
1968 ds_device_type type) {
1969 double time;
1970 # if ON_WINDOWS
1971 LARGE_INTEGER freq, time_funct_start, time_funct_end;
1972 QueryPerformanceFrequency(&freq);
1973 # elif ON_APPLE
1974 mach_timebase_info_data_t info = {0, 0};
1975 mach_timebase_info(&info);
1976 long long start, stop;
1977 # else
1978 timespec time_funct_start, time_funct_end;
1979 # endif
1980
1981 const int left = 0;
1982 const int top = 0;
1983 int kHistogramSize = 256;
1984 int bytes_per_line = input.width * input.numChannels;
1985 int *histogramAllChannels = new int[kHistogramSize * input.numChannels];
1986 // function call
1987 if (type == DS_DEVICE_OPENCL_DEVICE) {
1988 # if ON_WINDOWS
1989 QueryPerformanceCounter(&time_funct_start);
1990 # elif ON_APPLE
1991 start = mach_absolute_time();
1992 # else
1993 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
1994 # endif
1995
1996 OpenclDevice::gpuEnv = *env;
1997 int retVal = OpenclDevice::HistogramRectOCL(input.imageData, input.numChannels, bytes_per_line,
1998 left, top, input.width, input.height,
1999 kHistogramSize, histogramAllChannels);
2000
2001 # if ON_WINDOWS
2002 QueryPerformanceCounter(&time_funct_end);
2003 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) / (double)(freq.QuadPart);
2004 # elif ON_APPLE
2005 stop = mach_absolute_time();
2006 if (retVal == 0) {
2007 time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2008 } else {
2009 time = FLT_MAX;
2010 }
2011 # else
2012 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2013 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2014 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2015 # endif
2016 } else {
2017 int *histogram = new int[kHistogramSize];
2018 # if ON_WINDOWS
2019 QueryPerformanceCounter(&time_funct_start);
2020 # elif ON_APPLE
2021 start = mach_absolute_time();
2022 # else
2023 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2024 # endif
2025 for (int ch = 0; ch < input.numChannels; ++ch) {
2026 tesseract::HistogramRect(input.pix, input.numChannels, left, top, input.width, input.height,
2027 histogram);
2028 }
2029 # if ON_WINDOWS
2030 QueryPerformanceCounter(&time_funct_end);
2031 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) / (double)(freq.QuadPart);
2032 # elif ON_APPLE
2033 stop = mach_absolute_time();
2034 time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2035 # else
2036 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2037 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2038 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2039 # endif
2040 delete[] histogram;
2041 }
2042
2043 // cleanup
2044 delete[] histogramAllChannels;
2045 return time;
2046 }
2047
2048 // Reproducing the ThresholdRectToPix native version
ThresholdRectToPix_Native(const unsigned char * imagedata,int bytes_per_pixel,int bytes_per_line,const int * thresholds,const int * hi_values,Image * pix)2049 static void ThresholdRectToPix_Native(const unsigned char *imagedata, int bytes_per_pixel,
2050 int bytes_per_line, const int *thresholds,
2051 const int *hi_values, Image *pix) {
2052 int top = 0;
2053 int left = 0;
2054 int width = pixGetWidth(*pix);
2055 int height = pixGetHeight(*pix);
2056
2057 *pix = pixCreate(width, height, 1);
2058 uint32_t *pixdata = pixGetData(*pix);
2059 int wpl = pixGetWpl(*pix);
2060 const unsigned char *srcdata = imagedata + top * bytes_per_line + left * bytes_per_pixel;
2061 for (int y = 0; y < height; ++y) {
2062 const uint8_t *linedata = srcdata;
2063 uint32_t *pixline = pixdata + y * wpl;
2064 for (int x = 0; x < width; ++x, linedata += bytes_per_pixel) {
2065 bool white_result = true;
2066 for (int ch = 0; ch < bytes_per_pixel; ++ch) {
2067 if (hi_values[ch] >= 0 && (linedata[ch] > thresholds[ch]) == (hi_values[ch] == 0)) {
2068 white_result = false;
2069 break;
2070 }
2071 }
2072 if (white_result)
2073 CLEAR_DATA_BIT(pixline, x);
2074 else
2075 SET_DATA_BIT(pixline, x);
2076 }
2077 srcdata += bytes_per_line;
2078 }
2079 }
2080
thresholdRectToPixMicroBench(GPUEnv * env,TessScoreEvaluationInputData input,ds_device_type type)2081 static double thresholdRectToPixMicroBench(GPUEnv *env, TessScoreEvaluationInputData input,
2082 ds_device_type type) {
2083 double time;
2084 # if ON_WINDOWS
2085 LARGE_INTEGER freq, time_funct_start, time_funct_end;
2086 QueryPerformanceFrequency(&freq);
2087 # elif ON_APPLE
2088 mach_timebase_info_data_t info = {0, 0};
2089 mach_timebase_info(&info);
2090 long long start, stop;
2091 # else
2092 timespec time_funct_start, time_funct_end;
2093 # endif
2094
2095 // input data
2096 unsigned char pixelHi = (unsigned char)255;
2097 int thresholds[4] = {pixelHi, pixelHi, pixelHi, pixelHi};
2098
2099 // Pix* pix = pixCreate(width, height, 1);
2100 int top = 0;
2101 int left = 0;
2102 int bytes_per_line = input.width * input.numChannels;
2103
2104 // function call
2105 if (type == DS_DEVICE_OPENCL_DEVICE) {
2106 # if ON_WINDOWS
2107 QueryPerformanceCounter(&time_funct_start);
2108 # elif ON_APPLE
2109 start = mach_absolute_time();
2110 # else
2111 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2112 # endif
2113
2114 OpenclDevice::gpuEnv = *env;
2115 int hi_values[4];
2116 int retVal = OpenclDevice::ThresholdRectToPixOCL(
2117 input.imageData, input.numChannels, bytes_per_line, thresholds, hi_values, &input.pix,
2118 input.height, input.width, top, left);
2119
2120 # if ON_WINDOWS
2121 QueryPerformanceCounter(&time_funct_end);
2122 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) / (double)(freq.QuadPart);
2123 # elif ON_APPLE
2124 stop = mach_absolute_time();
2125 if (retVal == 0) {
2126 time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2127 } else {
2128 time = FLT_MAX;
2129 }
2130
2131 # else
2132 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2133 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2134 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2135 # endif
2136 } else {
2137 tesseract::ImageThresholder thresholder;
2138 thresholder.SetImage(input.pix);
2139 # if ON_WINDOWS
2140 QueryPerformanceCounter(&time_funct_start);
2141 # elif ON_APPLE
2142 start = mach_absolute_time();
2143 # else
2144 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2145 # endif
2146 int hi_values[4] = {};
2147 ThresholdRectToPix_Native(input.imageData, input.numChannels, bytes_per_line, thresholds,
2148 hi_values, &input.pix);
2149
2150 # if ON_WINDOWS
2151 QueryPerformanceCounter(&time_funct_end);
2152 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) / (double)(freq.QuadPart);
2153 # elif ON_APPLE
2154 stop = mach_absolute_time();
2155 time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2156 # else
2157 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2158 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2159 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2160 # endif
2161 }
2162
2163 return time;
2164 }
2165
getLineMasksMorphMicroBench(GPUEnv * env,TessScoreEvaluationInputData input,ds_device_type type)2166 static double getLineMasksMorphMicroBench(GPUEnv *env, TessScoreEvaluationInputData input,
2167 ds_device_type type) {
2168 double time = 0;
2169 # if ON_WINDOWS
2170 LARGE_INTEGER freq, time_funct_start, time_funct_end;
2171 QueryPerformanceFrequency(&freq);
2172 # elif ON_APPLE
2173 mach_timebase_info_data_t info = {0, 0};
2174 mach_timebase_info(&info);
2175 long long start, stop;
2176 # else
2177 timespec time_funct_start, time_funct_end;
2178 # endif
2179
2180 // input data
2181 int resolution = 300;
2182 int wpl = pixGetWpl(input.pix);
2183 int kThinLineFraction = 20; // tess constant
2184 int kMinLineLengthFraction = 4; // tess constant
2185 int max_line_width = resolution / kThinLineFraction;
2186 int min_line_length = resolution / kMinLineLengthFraction;
2187 int closing_brick = max_line_width / 3;
2188
2189 // function call
2190 if (type == DS_DEVICE_OPENCL_DEVICE) {
2191 # if ON_WINDOWS
2192 QueryPerformanceCounter(&time_funct_start);
2193 # elif ON_APPLE
2194 start = mach_absolute_time();
2195 # else
2196 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2197 # endif
2198 OpenclDevice::gpuEnv = *env;
2199 OpenclDevice::initMorphCLAllocations(wpl, input.height, input.pix);
2200 Image pix_vline = nullptr, pix_hline = nullptr, pix_closed = nullptr;
2201 OpenclDevice::pixGetLinesCL(nullptr, input.pix, &pix_vline, &pix_hline, &pix_closed, true,
2202 closing_brick, closing_brick, max_line_width, max_line_width,
2203 min_line_length, min_line_length);
2204
2205 OpenclDevice::releaseMorphCLBuffers();
2206
2207 # if ON_WINDOWS
2208 QueryPerformanceCounter(&time_funct_end);
2209 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) / (double)(freq.QuadPart);
2210 # elif ON_APPLE
2211 stop = mach_absolute_time();
2212 time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2213 # else
2214 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2215 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2216 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2217 # endif
2218 } else {
2219 # if ON_WINDOWS
2220 QueryPerformanceCounter(&time_funct_start);
2221 # elif ON_APPLE
2222 start = mach_absolute_time();
2223 # else
2224 clock_gettime(CLOCK_MONOTONIC, &time_funct_start);
2225 # endif
2226
2227 // native serial code
2228 Image src_pix = input.pix;
2229 Image pix_closed = pixCloseBrick(nullptr, src_pix, closing_brick, closing_brick);
2230 Image pix_solid = pixOpenBrick(nullptr, pix_closed, max_line_width, max_line_width);
2231 Image pix_hollow = pixSubtract(nullptr, pix_closed, pix_solid);
2232 pix_solid.destroy();
2233 Image pix_vline = pixOpenBrick(nullptr, pix_hollow, 1, min_line_length);
2234 Image pix_hline = pixOpenBrick(nullptr, pix_hollow, min_line_length, 1);
2235 pix_hline.destroy();
2236 pix_vline.destroy();
2237 pix_hollow.destroy();
2238
2239 # if ON_WINDOWS
2240 QueryPerformanceCounter(&time_funct_end);
2241 time = (time_funct_end.QuadPart - time_funct_start.QuadPart) / (double)(freq.QuadPart);
2242 # elif ON_APPLE
2243 stop = mach_absolute_time();
2244 time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2245 # else
2246 clock_gettime(CLOCK_MONOTONIC, &time_funct_end);
2247 time = (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 +
2248 (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0;
2249 # endif
2250 }
2251
2252 return time;
2253 }
2254
2255 /******************************************************************************
2256 * Device Selection
2257 *****************************************************************************/
2258
2259 // encode score object as byte string
serializeScore(ds_device * device,uint8_t ** serializedScore,unsigned int * serializedScoreSize)2260 static ds_status serializeScore(ds_device *device, uint8_t **serializedScore,
2261 unsigned int *serializedScoreSize) {
2262 *serializedScoreSize = sizeof(TessDeviceScore);
2263 *serializedScore = new uint8_t[*serializedScoreSize];
2264 memcpy(*serializedScore, device->score, *serializedScoreSize);
2265 return DS_SUCCESS;
2266 }
2267
2268 // parses byte string and stores in score object
deserializeScore(ds_device * device,const uint8_t * serializedScore,unsigned int serializedScoreSize)2269 static ds_status deserializeScore(ds_device *device, const uint8_t *serializedScore,
2270 unsigned int serializedScoreSize) {
2271 // check that serializedScoreSize == sizeof(TessDeviceScore);
2272 device->score = new TessDeviceScore;
2273 memcpy(device->score, serializedScore, serializedScoreSize);
2274 return DS_SUCCESS;
2275 }
2276
releaseScore(TessDeviceScore * score)2277 static ds_status releaseScore(TessDeviceScore *score) {
2278 delete score;
2279 return DS_SUCCESS;
2280 }
2281
2282 // evaluate devices
evaluateScoreForDevice(ds_device * device,void * inputData)2283 static ds_status evaluateScoreForDevice(ds_device *device, void *inputData) {
2284 // overwrite statuc gpuEnv w/ current device
2285 // so native opencl calls can be used; they use static gpuEnv
2286 tprintf("\n[DS] Device: \"%s\" (%s) evaluation...\n", device->oclDeviceName,
2287 device->type == DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native");
2288 GPUEnv *env = nullptr;
2289 if (device->type == DS_DEVICE_OPENCL_DEVICE) {
2290 env = &OpenclDevice::gpuEnv;
2291 memset(env, 0, sizeof(*env));
2292 // tprintf("[DS] populating tmp GPUEnv from device\n");
2293 populateGPUEnvFromDevice(env, device->oclDeviceID);
2294 env->mnFileCount = 0; // argc;
2295 env->mnKernelCount = 0UL;
2296 // tprintf("[DS] compiling kernels for tmp GPUEnv\n");
2297 OpenclDevice::CompileKernelFile(env, "");
2298 }
2299
2300 TessScoreEvaluationInputData *input = static_cast<TessScoreEvaluationInputData *>(inputData);
2301
2302 // pixReadTiff
2303 double composeRGBPixelTime = composeRGBPixelMicroBench(env, *input, device->type);
2304
2305 // HistogramRect
2306 double histogramRectTime = histogramRectMicroBench(env, *input, device->type);
2307
2308 // ThresholdRectToPix
2309 double thresholdRectToPixTime = thresholdRectToPixMicroBench(env, *input, device->type);
2310
2311 // getLineMasks
2312 double getLineMasksMorphTime = getLineMasksMorphMicroBench(env, *input, device->type);
2313
2314 // weigh times (% of cpu time)
2315 // these weights should be the % execution time that the native cpu code took
2316 float composeRGBPixelWeight = 1.2f;
2317 float histogramRectWeight = 2.4f;
2318 float thresholdRectToPixWeight = 4.5f;
2319 float getLineMasksMorphWeight = 5.0f;
2320
2321 float weightedTime = composeRGBPixelWeight * composeRGBPixelTime +
2322 histogramRectWeight * histogramRectTime +
2323 thresholdRectToPixWeight * thresholdRectToPixTime +
2324 getLineMasksMorphWeight * getLineMasksMorphTime;
2325 device->score = new TessDeviceScore;
2326 device->score->time = weightedTime;
2327
2328 tprintf("[DS] Device: \"%s\" (%s) evaluated\n", device->oclDeviceName,
2329 device->type == DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native");
2330 tprintf("[DS]%25s: %f (w=%.1f)\n", "composeRGBPixel", composeRGBPixelTime, composeRGBPixelWeight);
2331 tprintf("[DS]%25s: %f (w=%.1f)\n", "HistogramRect", histogramRectTime, histogramRectWeight);
2332 tprintf("[DS]%25s: %f (w=%.1f)\n", "ThresholdRectToPix", thresholdRectToPixTime,
2333 thresholdRectToPixWeight);
2334 tprintf("[DS]%25s: %f (w=%.1f)\n", "getLineMasksMorph", getLineMasksMorphTime,
2335 getLineMasksMorphWeight);
2336 tprintf("[DS]%25s: %f\n", "Score", device->score->time);
2337 return DS_SUCCESS;
2338 }
2339
2340 // initial call to select device
getDeviceSelection()2341 ds_device OpenclDevice::getDeviceSelection() {
2342 if (!deviceIsSelected) {
2343 // check if opencl is available at runtime
2344 if (1 == LoadOpencl()) {
2345 // opencl is available
2346 // setup devices
2347 ds_status status;
2348 ds_profile *profile;
2349 status = initDSProfile(&profile, "v0.1");
2350 // try reading scores from file
2351 const char *fileName = "tesseract_opencl_profile_devices.dat";
2352 status = readProfileFromFile(profile, deserializeScore, fileName);
2353 if (status != DS_SUCCESS) {
2354 // need to run evaluation
2355 tprintf("[DS] Profile file not available (%s); performing profiling.\n", fileName);
2356
2357 // create input data
2358 TessScoreEvaluationInputData input;
2359 populateTessScoreEvaluationInputData(&input);
2360 // perform evaluations
2361 unsigned int numUpdates;
2362 status =
2363 profileDevices(profile, DS_EVALUATE_ALL, evaluateScoreForDevice, &input, &numUpdates);
2364 // write scores to file
2365 if (status == DS_SUCCESS) {
2366 status = writeProfileToFile(profile, serializeScore, fileName);
2367 if (status == DS_SUCCESS) {
2368 tprintf("[DS] Scores written to file (%s).\n", fileName);
2369 } else {
2370 tprintf(
2371 "[DS] Error saving scores to file (%s); scores not written to "
2372 "file.\n",
2373 fileName);
2374 }
2375 } else {
2376 tprintf(
2377 "[DS] Unable to evaluate performance; scores not written to "
2378 "file.\n");
2379 }
2380 } else {
2381 tprintf("[DS] Profile read from file (%s).\n", fileName);
2382 }
2383
2384 // we now have device scores either from file or evaluation
2385 // select fastest using custom Tesseract selection algorithm
2386 float bestTime = FLT_MAX; // begin search with worst possible time
2387 int bestDeviceIdx = -1;
2388 for (unsigned d = 0; d < profile->numDevices; d++) {
2389 ds_device device = profile->devices[d];
2390 if (device.score == nullptr)
2391 continue;
2392 TessDeviceScore score = *device.score;
2393
2394 float time = score.time;
2395 tprintf("[DS] Device[%u] %i:%s score is %f\n", d + 1, device.type, device.oclDeviceName,
2396 time);
2397 if (time < bestTime) {
2398 bestTime = time;
2399 bestDeviceIdx = d;
2400 }
2401 }
2402 if (bestDeviceIdx >= 0) {
2403 tprintf(
2404 "[DS] Selected Device[%i]: \"%s\" (%s)\n", bestDeviceIdx + 1,
2405 profile->devices[bestDeviceIdx].oclDeviceName,
2406 profile->devices[bestDeviceIdx].type == DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native");
2407 }
2408 // cleanup
2409 // TODO: call destructor for profile object?
2410
2411 bool overridden = false;
2412 char *overrideDeviceStr = getenv("TESSERACT_OPENCL_DEVICE");
2413 if (overrideDeviceStr != nullptr) {
2414 int overrideDeviceIdx = atoi(overrideDeviceStr);
2415 if (overrideDeviceIdx > 0 && overrideDeviceIdx <= profile->numDevices) {
2416 tprintf(
2417 "[DS] Overriding Device Selection (TESSERACT_OPENCL_DEVICE=%s, "
2418 "%i)\n",
2419 overrideDeviceStr, overrideDeviceIdx);
2420 bestDeviceIdx = overrideDeviceIdx - 1;
2421 overridden = true;
2422 } else {
2423 tprintf(
2424 "[DS] Ignoring invalid TESSERACT_OPENCL_DEVICE=%s ([1,%i] are "
2425 "valid devices).\n",
2426 overrideDeviceStr, profile->numDevices);
2427 }
2428 }
2429
2430 if (overridden) {
2431 tprintf(
2432 "[DS] Overridden Device[%i]: \"%s\" (%s)\n", bestDeviceIdx + 1,
2433 profile->devices[bestDeviceIdx].oclDeviceName,
2434 profile->devices[bestDeviceIdx].type == DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native");
2435 }
2436 selectedDevice = profile->devices[bestDeviceIdx];
2437 // cleanup
2438 releaseDSProfile(profile, releaseScore);
2439 } else {
2440 // opencl isn't available at runtime, select native cpu device
2441 tprintf("[DS] OpenCL runtime not available.\n");
2442 selectedDevice.type = DS_DEVICE_NATIVE_CPU;
2443 selectedDevice.oclDeviceName = "(null)";
2444 selectedDevice.score = nullptr;
2445 selectedDevice.oclDeviceID = nullptr;
2446 selectedDevice.oclDriverVersion = nullptr;
2447 }
2448 deviceIsSelected = true;
2449 }
2450 return selectedDevice;
2451 }
2452
selectedDeviceIsOpenCL()2453 bool OpenclDevice::selectedDeviceIsOpenCL() {
2454 ds_device device = getDeviceSelection();
2455 return (device.type == DS_DEVICE_OPENCL_DEVICE);
2456 }
2457
2458 } // namespace
2459
2460 #endif
2461