1 /*
2 This file is part of darktable,
3 Copyright (C) 2010-2021 darktable developers.
4
5 darktable is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3 of the License, or
8 (at your option) any later version.
9
10 darktable is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with darktable. If not, see <http://www.gnu.org/licenses/>.
17 */
18
19 #ifdef HAVE_OPENCL
20
21 #include "common/opencl.h"
22 #include "common/bilateralcl.h"
23 #include "common/darktable.h"
24 #include "common/dlopencl.h"
25 #include "common/dwt.h"
26 #include "common/file_location.h"
27 #include "common/gaussian.h"
28 #include "common/guided_filter.h"
29 #include "common/heal.h"
30 #include "common/interpolation.h"
31 #include "common/locallaplaciancl.h"
32 #include "common/nvidia_gpus.h"
33 #include "common/opencl_drivers_blacklist.h"
34 #include "common/tea.h"
35 #include "control/conf.h"
36 #include "control/control.h"
37 #include "develop/blend.h"
38 #include "develop/pixelpipe.h"
39
40 #include <assert.h>
41 #include <locale.h>
42 #include <stdio.h>
43 #include <string.h>
44 #include <strings.h>
45
46 #include <ctype.h>
47 #include <errno.h>
48 #include <libgen.h>
49 #include <sys/stat.h>
50 #include <zlib.h>
51
52 static const char *dt_opencl_get_vendor_by_id(unsigned int id);
53 static float dt_opencl_benchmark_gpu(const int devid, const size_t width, const size_t height, const int count, const float sigma);
54 static float dt_opencl_benchmark_cpu(const size_t width, const size_t height, const int count, const float sigma);
55 static char *_ascii_str_canonical(const char *in, char *out, int maxlen);
56 /** parse a single token of priority string and store priorities in priority_list */
57 static void dt_opencl_priority_parse(dt_opencl_t *cl, char *configstr, int *priority_list, int *mandatory);
58 /** parse a complete priority string */
59 static void dt_opencl_priorities_parse(dt_opencl_t *cl, const char *configstr);
60 /** set device priorities according to config string */
61 static void dt_opencl_update_priorities(const char *configstr);
62 /** read scheduling profile for config variables */
63 static dt_opencl_scheduling_profile_t dt_opencl_get_scheduling_profile(void);
64 /** read config of when/if to sync to cache */
65 static dt_opencl_sync_cache_t dt_opencl_get_sync_cache(void);
66 /** adjust opencl subsystem according to scheduling profile */
67 static void dt_opencl_apply_scheduling_profile(dt_opencl_scheduling_profile_t profile);
68 /** set opencl specific synchronization timeout */
69 static void dt_opencl_set_synchronization_timeout(int value);
70
71
dt_opencl_get_device_info(dt_opencl_t * cl,cl_device_id device,cl_device_info param_name,void ** param_value,size_t * param_value_size)72 int dt_opencl_get_device_info(dt_opencl_t *cl, cl_device_id device, cl_device_info param_name, void **param_value,
73 size_t *param_value_size)
74 {
75 cl_int err;
76
77 *param_value_size = SIZE_MAX;
78
79 // 1. figure out how much memory is needed
80 err = (cl->dlocl->symbols->dt_clGetDeviceInfo)(device, param_name, 0, NULL, param_value_size);
81 if(err != CL_SUCCESS)
82 {
83 dt_print(DT_DEBUG_OPENCL,
84 "[dt_opencl_get_device_info] could not query the actual size in bytes of info %d: %d\n", param_name,
85 err);
86 goto error;
87 }
88
89 // 2. did we /actually/ get the size?
90 if(*param_value_size == SIZE_MAX || *param_value_size == 0)
91 {
92 // both of these sizes make no sense. either i failed to parse spec, or opencl implementation bug?
93 dt_print(DT_DEBUG_OPENCL,
94 "[dt_opencl_get_device_info] ERROR: no size returned, or zero size returned for data %d: %zu\n",
95 param_name, *param_value_size);
96 err = CL_INVALID_VALUE; // FIXME: anything better?
97 goto error;
98 }
99
100 // 3. make sure that *param_value points to big-enough memory block
101 {
102 void *ptr = realloc(*param_value, *param_value_size);
103 if(!ptr)
104 {
105 dt_print(DT_DEBUG_OPENCL,
106 "[dt_opencl_get_device_info] memory allocation failed! tried to allocate %zu bytes for data %d: %d",
107 *param_value_size, param_name, err);
108 err = CL_OUT_OF_HOST_MEMORY;
109 goto error;
110 }
111
112 // allocation succeeded, update pointer.
113 *param_value = ptr;
114 }
115
116 // 4. actually get the value
117 err = (cl->dlocl->symbols->dt_clGetDeviceInfo)(device, param_name, *param_value_size, *param_value, NULL);
118 if(err != CL_SUCCESS)
119 {
120 dt_print(DT_DEBUG_OPENCL, "[dt_opencl_get_device_info] could not query info %d: %d\n", param_name, err);
121 goto error;
122 }
123
124 return CL_SUCCESS;
125
126 error:
127 free(*param_value);
128 *param_value = NULL;
129 *param_value_size = 0;
130 return err;
131 }
132
133 // returns 0 if all ok
134 // returns 1 if we failed hard, and need to skip opencl initialization
135 // returns -1 if we failed to init this device
dt_opencl_device_init(dt_opencl_t * cl,const int dev,cl_device_id * devices,const int k,const int opencl_memory_requirement)136 static int dt_opencl_device_init(dt_opencl_t *cl, const int dev, cl_device_id *devices, const int k,
137 const int opencl_memory_requirement)
138 {
139 int res;
140 cl_int err;
141
142 memset(cl->dev[dev].program, 0x0, sizeof(cl_program) * DT_OPENCL_MAX_PROGRAMS);
143 memset(cl->dev[dev].program_used, 0x0, sizeof(int) * DT_OPENCL_MAX_PROGRAMS);
144 memset(cl->dev[dev].kernel, 0x0, sizeof(cl_kernel) * DT_OPENCL_MAX_KERNELS);
145 memset(cl->dev[dev].kernel_used, 0x0, sizeof(int) * DT_OPENCL_MAX_KERNELS);
146 cl->dev[dev].eventlist = NULL;
147 cl->dev[dev].eventtags = NULL;
148 cl->dev[dev].numevents = 0;
149 cl->dev[dev].eventsconsolidated = 0;
150 cl->dev[dev].maxevents = 0;
151 cl->dev[dev].lostevents = 0;
152 cl->dev[dev].totalevents = 0;
153 cl->dev[dev].totalsuccess = 0;
154 cl->dev[dev].totallost = 0;
155 cl->dev[dev].summary = CL_COMPLETE;
156 cl->dev[dev].used_global_mem = 0;
157 cl->dev[dev].nvidia_sm_20 = 0;
158 cl->dev[dev].vendor = NULL;
159 cl->dev[dev].name = NULL;
160 cl->dev[dev].cname = NULL;
161 cl->dev[dev].options = NULL;
162 cl->dev[dev].memory_in_use = 0;
163 cl->dev[dev].peak_memory = 0;
164 cl_device_id devid = cl->dev[dev].devid = devices[k];
165
166 char *infostr = NULL;
167 size_t infostr_size;
168
169 char *cname = NULL;
170 size_t cname_size;
171
172 char *options = NULL;
173
174 char *vendor = NULL;
175 size_t vendor_size;
176
177 char *driverversion = NULL;
178 size_t driverversion_size;
179
180 char *deviceversion = NULL;
181 size_t deviceversion_size;
182
183 size_t infoint;
184 size_t *infointtab = NULL;
185 cl_device_type type;
186 cl_bool image_support = 0;
187 cl_bool device_available = 0;
188 cl_uint vendor_id = 0;
189 cl_bool little_endian = 0;
190
191 char *dtcache = calloc(PATH_MAX, sizeof(char));
192 char *cachedir = calloc(PATH_MAX, sizeof(char));
193 char *devname = calloc(1024, sizeof(char));
194 char *drvversion = calloc(1024, sizeof(char));
195
196 char kerneldir[PATH_MAX] = { 0 };
197 char *filename = calloc(PATH_MAX, sizeof(char));
198 char *confentry = calloc(PATH_MAX, sizeof(char));
199 char *binname = calloc(PATH_MAX, sizeof(char));
200
201 // test GPU availability, vendor, memory, image support etc:
202 (cl->dlocl->symbols->dt_clGetDeviceInfo)(devid, CL_DEVICE_AVAILABLE, sizeof(cl_bool), &device_available, NULL);
203
204 err = dt_opencl_get_device_info(cl, devid, CL_DEVICE_VENDOR, (void **)&vendor, &vendor_size);
205 if(err != CL_SUCCESS)
206 {
207 dt_print(DT_DEBUG_OPENCL, "[opencl_init] could not get vendor name of device %d: %d\n", k, err);
208 res = -1;
209 goto end;
210 }
211
212 (cl->dlocl->symbols->dt_clGetDeviceInfo)(devid, CL_DEVICE_VENDOR_ID, sizeof(cl_uint), &vendor_id, NULL);
213
214 err = dt_opencl_get_device_info(cl, devid, CL_DEVICE_NAME, (void **)&infostr, &infostr_size);
215 if(err != CL_SUCCESS)
216 {
217 dt_print(DT_DEBUG_OPENCL, "[opencl_init] could not get device name of device %d: %d\n", k, err);
218 res = -1;
219 goto end;
220 }
221
222 err = dt_opencl_get_device_info(cl, devid, CL_DRIVER_VERSION, (void **)&driverversion, &driverversion_size);
223 if(err != CL_SUCCESS)
224 {
225 dt_print(DT_DEBUG_OPENCL, "[opencl_init] could not get driver version of device %d `%s': %d\n", k, infostr, err);
226 res = -1;
227 goto end;
228 }
229
230 err = dt_opencl_get_device_info(cl, devid, CL_DEVICE_VERSION, (void **)&deviceversion, &deviceversion_size);
231 if(err != CL_SUCCESS)
232 {
233 dt_print(DT_DEBUG_OPENCL, "[opencl_init] could not get device version of device %d `%s': %d\n", k, infostr, err);
234 res = -1;
235 goto end;
236 }
237
238 (cl->dlocl->symbols->dt_clGetDeviceInfo)(devid, CL_DEVICE_TYPE, sizeof(cl_device_type), &type, NULL);
239 (cl->dlocl->symbols->dt_clGetDeviceInfo)(devid, CL_DEVICE_IMAGE_SUPPORT, sizeof(cl_bool), &image_support, NULL);
240 (cl->dlocl->symbols->dt_clGetDeviceInfo)(devid, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(size_t),
241 &(cl->dev[dev].max_image_height), NULL);
242 (cl->dlocl->symbols->dt_clGetDeviceInfo)(devid, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(size_t),
243 &(cl->dev[dev].max_image_width), NULL);
244 (cl->dlocl->symbols->dt_clGetDeviceInfo)(devid, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong),
245 &(cl->dev[dev].max_mem_alloc), NULL);
246 (cl->dlocl->symbols->dt_clGetDeviceInfo)(devid, CL_DEVICE_ENDIAN_LITTLE, sizeof(cl_bool), &little_endian, NULL);
247
248
249 cname_size = infostr_size;
250 cname = malloc(cname_size);
251 _ascii_str_canonical(infostr, cname, sizeof(cname_size));
252
253 if(!strncasecmp(vendor, "NVIDIA", 6))
254 {
255 // very lame attempt to detect support for atomic float add in global memory.
256 // we need compute model sm_20, but let's try for all nvidia devices :(
257 cl->dev[dev].nvidia_sm_20 = dt_nvidia_gpu_supports_sm_20(infostr);
258 dt_print(DT_DEBUG_OPENCL, "[opencl_init] device %d `%s' %s sm_20 support.\n", k, infostr,
259 cl->dev[dev].nvidia_sm_20 ? "has" : "doesn't have");
260 }
261
262 if(((type & CL_DEVICE_TYPE_CPU) == CL_DEVICE_TYPE_CPU) && !dt_conf_get_bool("opencl_use_cpu_devices"))
263 {
264 dt_print(DT_DEBUG_OPENCL, "[opencl_init] discarding CPU device %d `%s'.\n", k, infostr);
265 res = -1;
266 goto end;
267 }
268
269 if(dt_opencl_check_driver_blacklist(deviceversion) && !dt_conf_get_bool("opencl_disable_drivers_blacklist"))
270 {
271 dt_print(DT_DEBUG_OPENCL, "[opencl_init] discarding device %d `%s' because the driver `%s' is blacklisted.\n",
272 k, infostr, deviceversion);
273 res = -1;
274 goto end;
275 }
276
277 if(!device_available)
278 {
279 dt_print(DT_DEBUG_OPENCL, "[opencl_init] discarding device %d `%s' as it is not available.\n", k, infostr);
280 res = -1;
281 goto end;
282 }
283
284 if(!image_support)
285 {
286 dt_print(DT_DEBUG_OPENCL,
287 "[opencl_init] discarding device %d `%s' - The OpenCL driver "
288 "doesn't provide image support. See also 'clinfo' output.\n",
289 k, infostr);
290 res = -1;
291 goto end;
292 }
293
294 if(!little_endian)
295 {
296 dt_print(DT_DEBUG_OPENCL, "[opencl_init] discarding device %d `%s' as it is not little endian.\n", k, infostr);
297 res = -1;
298 goto end;
299 }
300
301 (cl->dlocl->symbols->dt_clGetDeviceInfo)(devid, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong),
302 &(cl->dev[dev].max_global_mem), NULL);
303 if(cl->dev[dev].max_global_mem < (uint64_t)opencl_memory_requirement * 1024 * 1024)
304 {
305 dt_print(DT_DEBUG_OPENCL,
306 "[opencl_init] discarding device %d `%s' due to insufficient global memory (%" PRIu64 "MB).\n", k,
307 infostr, cl->dev[dev].max_global_mem / 1024 / 1024);
308 res = -1;
309 goto end;
310 }
311
312 cl->dev[dev].vendor = strdup(dt_opencl_get_vendor_by_id(vendor_id));
313 cl->dev[dev].name = strdup(infostr);
314 cl->dev[dev].cname = strdup(cname);
315
316 cl->crc = crc32(cl->crc, (const unsigned char *)infostr, strlen(infostr));
317
318 dt_print(DT_DEBUG_OPENCL, "[opencl_init] device %d `%s' supports image sizes of %zd x %zd\n", k, infostr,
319 cl->dev[dev].max_image_width, cl->dev[dev].max_image_height);
320 dt_print(DT_DEBUG_OPENCL, "[opencl_init] device %d `%s' allows GPU memory allocations of up to %" PRIu64 "MB\n",
321 k, infostr, cl->dev[dev].max_mem_alloc / 1024 / 1024);
322
323 if(darktable.unmuted & DT_DEBUG_OPENCL)
324 {
325 printf("[opencl_init] device %d: %s \n", k, infostr);
326 printf(" GLOBAL_MEM_SIZE: %.0fMB\n", (double)cl->dev[dev].max_global_mem / 1024.0 / 1024.0);
327 (cl->dlocl->symbols->dt_clGetDeviceInfo)(devid, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(infoint), &infoint, NULL);
328 printf(" MAX_WORK_GROUP_SIZE: %zu\n", infoint);
329 (cl->dlocl->symbols->dt_clGetDeviceInfo)(devid, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(infoint), &infoint,
330 NULL);
331 printf(" MAX_WORK_ITEM_DIMENSIONS: %zu\n", infoint);
332 printf(" MAX_WORK_ITEM_SIZES: [ ");
333
334 size_t infointtab_size;
335 err = dt_opencl_get_device_info(cl, devid, CL_DEVICE_MAX_WORK_ITEM_SIZES, (void **)&infointtab,
336 &infointtab_size);
337 if(err == CL_SUCCESS)
338 {
339 for(size_t i = 0; i < infoint; i++) printf("%zu ", infointtab[i]);
340 free(infointtab);
341 infointtab = NULL;
342 }
343 else
344 {
345 res = -1;
346 goto end;
347 }
348
349 printf("]\n");
350 printf(" DRIVER_VERSION: %s\n", driverversion);
351 printf(" DEVICE_VERSION: %s\n", deviceversion);
352 }
353
354 dt_pthread_mutex_init(&cl->dev[dev].lock, NULL);
355
356 cl->dev[dev].context = (cl->dlocl->symbols->dt_clCreateContext)(0, 1, &devid, NULL, NULL, &err);
357 if(err != CL_SUCCESS)
358 {
359 dt_print(DT_DEBUG_OPENCL, "[opencl_init] could not create context for device %d: %d\n", k, err);
360 res = -1;
361 goto end;
362 }
363 // create a command queue for first device the context reported
364 cl->dev[dev].cmd_queue = (cl->dlocl->symbols->dt_clCreateCommandQueue)(
365 cl->dev[dev].context, devid, (darktable.unmuted & DT_DEBUG_PERF) ? CL_QUEUE_PROFILING_ENABLE : 0, &err);
366 if(err != CL_SUCCESS)
367 {
368 dt_print(DT_DEBUG_OPENCL, "[opencl_init] could not create command queue for device %d: %d\n", k, err);
369 res = -1;
370 goto end;
371 }
372
373 double tstart, tend, tdiff;
374 dt_loc_get_user_cache_dir(dtcache, PATH_MAX * sizeof(char));
375
376 int len = MIN(strlen(infostr),1024 * sizeof(char));;
377 int j = 0;
378 // remove non-alphanumeric chars from device name
379 for(int i = 0; i < len; i++)
380 if(isalnum(infostr[i])) devname[j++] = infostr[i];
381 devname[j] = 0;
382 len = MIN(strlen(driverversion), 1024 * sizeof(char));
383 j = 0;
384 // remove non-alphanumeric chars from driver version
385 for(int i = 0; i < len; i++)
386 if(isalnum(driverversion[i])) drvversion[j++] = driverversion[i];
387 drvversion[j] = 0;
388 snprintf(cachedir, PATH_MAX * sizeof(char), "%s" G_DIR_SEPARATOR_S "cached_kernels_for_%s_%s", dtcache, devname, drvversion);
389 if(g_mkdir_with_parents(cachedir, 0700) == -1)
390 {
391 dt_print(DT_DEBUG_OPENCL, "[opencl_init] failed to create directory `%s'!\n", cachedir);
392 res = -1;
393 goto end;
394 }
395
396 dt_loc_get_kerneldir(kerneldir, sizeof(kerneldir));
397 dt_print(DT_DEBUG_DEV, "kernel directory: %s\n", kerneldir);
398
399 snprintf(filename, PATH_MAX * sizeof(char), "%s" G_DIR_SEPARATOR_S "programs.conf", kerneldir);
400
401 char *escapedkerneldir = NULL;
402 #ifndef __APPLE__
403 escapedkerneldir = g_strdup_printf("\"%s\"", kerneldir);
404 #else
405 escapedkerneldir = dt_util_str_replace(kerneldir, " ", "\\ ");
406 #endif
407
408 options = g_strdup_printf("-w -cl-fast-relaxed-math %s -D%s=1 -I%s",
409 (cl->dev[dev].nvidia_sm_20 ? " -DNVIDIA_SM_20=1" : ""),
410 dt_opencl_get_vendor_by_id(vendor_id), escapedkerneldir);
411 cl->dev[dev].options = strdup(options);
412
413 dt_print(DT_DEBUG_OPENCL, "[opencl_init] options for OpenCL compiler: %s\n", options);
414
415 g_free(options);
416 options = NULL;
417 g_free(escapedkerneldir);
418 escapedkerneldir = NULL;
419
420 const char *clincludes[DT_OPENCL_MAX_INCLUDES] = { "rgb_norms.h", "noise_generator.h", "color_conversion.h", "colorspaces.cl", "colorspace.h", "common.h", NULL };
421 char *includemd5[DT_OPENCL_MAX_INCLUDES] = { NULL };
422 dt_opencl_md5sum(clincludes, includemd5);
423
424 // now load all darktable cl kernels.
425 // TODO: compile as a job?
426 tstart = dt_get_wtime();
427 FILE *f = g_fopen(filename, "rb");
428 if(f)
429 {
430
431 while(!feof(f))
432 {
433 int prog = -1;
434 gchar *confline_pattern = g_strdup_printf("%%%zu[^\n]\n", PATH_MAX * sizeof(char) - 1);
435 int rd = fscanf(f, confline_pattern, confentry);
436 g_free(confline_pattern);
437 if(rd != 1) continue;
438 // remove comments:
439 size_t end = strlen(confentry);
440 for(size_t pos = 0; pos < end; pos++)
441 if(confentry[pos] == '#')
442 {
443 confentry[pos] = '\0';
444 for(int l = pos - 1; l >= 0; l--)
445 {
446 if(confentry[l] == ' ')
447 confentry[l] = '\0';
448 else
449 break;
450 }
451 break;
452 }
453 if(confentry[0] == '\0') continue;
454
455 const char *programname = NULL, *programnumber = NULL;
456 gchar **tokens = g_strsplit_set(confentry, " \t", 2);
457 if(tokens)
458 {
459 programname = tokens[0];
460 if(tokens[0])
461 programnumber = tokens[1]; // if the 0st wasn't NULL then we have at least the terminating NULL in [1]
462 }
463
464 prog = programnumber ? strtol(programnumber, NULL, 10) : -1;
465
466 if(!programname || programname[0] == '\0' || prog < 0)
467 {
468 dt_print(DT_DEBUG_OPENCL, "[opencl_init] malformed entry in programs.conf `%s'; ignoring it!\n", confentry);
469 continue;
470 }
471
472 snprintf(filename, PATH_MAX * sizeof(char), "%s" G_DIR_SEPARATOR_S "%s", kerneldir, programname);
473 snprintf(binname, PATH_MAX * sizeof(char), "%s" G_DIR_SEPARATOR_S "%s.bin", cachedir, programname);
474 dt_print(DT_DEBUG_OPENCL, "[opencl_init] compiling program `%s' ..\n", programname);
475 int loaded_cached;
476 char md5sum[33];
477 if(dt_opencl_load_program(dev, prog, filename, binname, cachedir, md5sum, includemd5, &loaded_cached)
478 && dt_opencl_build_program(dev, prog, binname, cachedir, md5sum, loaded_cached) != CL_SUCCESS)
479 {
480 dt_print(DT_DEBUG_OPENCL, "[opencl_init] failed to compile program `%s'!\n", programname);
481 fclose(f);
482 g_strfreev(tokens);
483 res = -1;
484 goto end;
485 }
486
487 g_strfreev(tokens);
488 }
489
490 fclose(f);
491 tend = dt_get_wtime();
492 tdiff = tend - tstart;
493 dt_print(DT_DEBUG_OPENCL, "[opencl_init] kernel loading time: %2.4lf \n", tdiff);
494 }
495 else
496 {
497 dt_print(DT_DEBUG_OPENCL, "[opencl_init] could not open `%s'!\n", filename);
498 res = -1;
499 goto end;
500 }
501 for(int n = 0; n < DT_OPENCL_MAX_INCLUDES; n++) g_free(includemd5[n]);
502
503 res = 0;
504
505 end:
506
507 free(infostr);
508 free(cname);
509 free(options);
510 free(vendor);
511 free(driverversion);
512 free(deviceversion);
513
514 free(dtcache);
515 free(cachedir);
516 free(devname);
517 free(drvversion);
518
519 free(filename);
520 free(confentry);
521 free(binname);
522
523 return res;
524 }
525
dt_opencl_init(dt_opencl_t * cl,const gboolean exclude_opencl,const gboolean print_statistics)526 void dt_opencl_init(dt_opencl_t *cl, const gboolean exclude_opencl, const gboolean print_statistics)
527 {
528 char *str;
529 dt_pthread_mutex_init(&cl->lock, NULL);
530 cl->inited = 0;
531 cl->enabled = 0;
532 cl->stopped = 0;
533 cl->error_count = 0;
534 cl->print_statistics = print_statistics;
535
536 // work-around to fix a bug in some AMD OpenCL compilers, which would fail parsing certain numerical
537 // constants if locale is different from "C".
538 // we save the current locale, set locale to "C", and restore the previous setting after OpenCL is
539 // initialized
540 char *locale = strdup(setlocale(LC_ALL, NULL));
541 setlocale(LC_ALL, "C");
542
543 int handles = dt_conf_get_int("opencl_number_event_handles");
544 handles = (handles < 0 ? 0x7fffffff : handles);
545 cl->number_event_handles = handles;
546 cl->use_events = (handles != 0);
547
548 cl->avoid_atomics = dt_conf_get_bool("opencl_avoid_atomics");
549 cl->async_pixelpipe = dt_conf_get_bool("opencl_async_pixelpipe");
550 cl->sync_cache = dt_opencl_get_sync_cache();
551 cl->micro_nap = dt_conf_get_int("opencl_micro_nap");
552 cl->crc = 5781;
553 cl->dlocl = NULL;
554 cl->dev_priority_image = NULL;
555 cl->dev_priority_preview = NULL;
556 cl->dev_priority_preview2 = NULL;
557 cl->dev_priority_export = NULL;
558 cl->dev_priority_thumbnail = NULL;
559
560 cl_platform_id *all_platforms = NULL;
561 cl_uint *all_num_devices = NULL;
562
563 // user selectable parameter defines minimum requirement on GPU memory
564 // default is 768MB
565 // values below 200 will be (re)set to 200
566 const int opencl_memory_requirement = MAX(200, dt_conf_get_int("opencl_memory_requirement"));
567 dt_conf_set_int("opencl_memory_requirement", opencl_memory_requirement);
568
569 if(exclude_opencl)
570 {
571 dt_print(DT_DEBUG_OPENCL, "[opencl_init] do not try to find and use an opencl runtime library due to "
572 "explicit user request\n");
573 goto finally;
574 }
575
576 dt_print(DT_DEBUG_OPENCL, "[opencl_init] opencl related configuration options:\n");
577 dt_print(DT_DEBUG_OPENCL, "[opencl_init] \n");
578 dt_print(DT_DEBUG_OPENCL, "[opencl_init] opencl: %d\n", dt_conf_get_bool("opencl"));
579 str = dt_conf_get_string("opencl_scheduling_profile");
580 dt_print(DT_DEBUG_OPENCL, "[opencl_init] opencl_scheduling_profile: '%s'\n", str);
581 g_free(str);
582 str = dt_conf_get_string("opencl_library");
583 dt_print(DT_DEBUG_OPENCL, "[opencl_init] opencl_library: '%s'\n", str);
584 g_free(str);
585 dt_print(DT_DEBUG_OPENCL, "[opencl_init] opencl_memory_requirement: %d\n",
586 dt_conf_get_int("opencl_memory_requirement"));
587 dt_print(DT_DEBUG_OPENCL, "[opencl_init] opencl_memory_headroom: %d\n",
588 dt_conf_get_int("opencl_memory_headroom"));
589 str = dt_conf_get_string("opencl_device_priority");
590 dt_print(DT_DEBUG_OPENCL, "[opencl_init] opencl_device_priority: '%s'\n", str);
591 g_free(str);
592 dt_print(DT_DEBUG_OPENCL, "[opencl_init] opencl_mandatory_timeout: %d\n",
593 dt_conf_get_int("opencl_mandatory_timeout"));
594 dt_print(DT_DEBUG_OPENCL, "[opencl_init] opencl_size_roundup: %d\n",
595 dt_conf_get_int("opencl_size_roundup"));
596 dt_print(DT_DEBUG_OPENCL, "[opencl_init] opencl_async_pixelpipe: %d\n",
597 dt_conf_get_bool("opencl_async_pixelpipe"));
598 str = dt_conf_get_string("opencl_synch_cache");
599 dt_print(DT_DEBUG_OPENCL, "[opencl_init] opencl_synch_cache: %s\n", str);
600 g_free(str);
601 dt_print(DT_DEBUG_OPENCL, "[opencl_init] opencl_number_event_handles: %d\n",
602 dt_conf_get_int("opencl_number_event_handles"));
603 dt_print(DT_DEBUG_OPENCL, "[opencl_init] opencl_micro_nap: %d\n", dt_conf_get_int("opencl_micro_nap"));
604 dt_print(DT_DEBUG_OPENCL, "[opencl_init] opencl_use_pinned_memory: %d\n",
605 dt_conf_get_bool("opencl_use_pinned_memory"));
606 dt_print(DT_DEBUG_OPENCL, "[opencl_init] opencl_use_cpu_devices: %d\n",
607 dt_conf_get_bool("opencl_use_cpu_devices"));
608
609 dt_print(DT_DEBUG_OPENCL, "[opencl_init] opencl_avoid_atomics: %d\n",
610 dt_conf_get_bool("opencl_avoid_atomics"));
611
612
613 dt_print(DT_DEBUG_OPENCL, "[opencl_init] \n");
614
615 // look for explicit definition of opencl_runtime library in preferences
616 char *library = dt_conf_get_string("opencl_library");
617
618 // dynamically load opencl runtime
619 if((cl->dlocl = dt_dlopencl_init(library)) == NULL)
620 {
621 dt_print(DT_DEBUG_OPENCL,
622 "[opencl_init] no working opencl library found. Continue with opencl disabled\n");
623 g_free(library);
624 goto finally;
625 }
626 else
627 {
628 dt_print(DT_DEBUG_OPENCL, "[opencl_init] opencl library '%s' found on your system and loaded\n",
629 cl->dlocl->library);
630 }
631 g_free(library);
632
633 cl_int err;
634 all_platforms = malloc(sizeof(cl_platform_id) * DT_OPENCL_MAX_PLATFORMS);
635 all_num_devices = malloc(sizeof(cl_uint) * DT_OPENCL_MAX_PLATFORMS);
636 cl_uint num_platforms = DT_OPENCL_MAX_PLATFORMS;
637 err = (cl->dlocl->symbols->dt_clGetPlatformIDs)(DT_OPENCL_MAX_PLATFORMS, all_platforms, &num_platforms);
638 if(err != CL_SUCCESS)
639 {
640 dt_print(DT_DEBUG_OPENCL, "[opencl_init] could not get platforms: %d\n", err);
641 goto finally;
642 }
643
644 if(num_platforms == 0)
645 {
646 dt_print(DT_DEBUG_OPENCL, "[opencl_init] no opencl platform available\n");
647 goto finally;
648 }
649 dt_print(DT_DEBUG_OPENCL, "[opencl_init] found %d platform%s\n", num_platforms,
650 num_platforms > 1 ? "s" : "");
651
652 for(int n = 0; n < num_platforms; n++)
653 {
654 cl_platform_id platform = all_platforms[n];
655 // get the number of GPU devices available to the platforms
656 // the other common option is CL_DEVICE_TYPE_GPU/CPU (but the latter doesn't work with the nvidia drivers)
657 err = (cl->dlocl->symbols->dt_clGetDeviceIDs)(platform, CL_DEVICE_TYPE_ALL, 0, NULL,
658 &(all_num_devices[n]));
659 if(err != CL_SUCCESS)
660 {
661 all_num_devices[n] = 0;
662 dt_print(DT_DEBUG_OPENCL, "[opencl_init] could not get device id size: %d\n", err);
663 }
664 }
665
666 cl_uint num_devices = 0;
667 for(int n = 0; n < num_platforms; n++) num_devices += all_num_devices[n];
668
669 // create the device list
670 cl_device_id *devices = 0;
671 if(num_devices)
672 {
673 cl->dev = (dt_opencl_device_t *)malloc(sizeof(dt_opencl_device_t) * num_devices);
674 devices = (cl_device_id *)malloc(sizeof(cl_device_id) * num_devices);
675 if(!cl->dev || !devices)
676 {
677 free(cl->dev);
678 cl->dev = NULL;
679 free(devices);
680 dt_print(DT_DEBUG_OPENCL, "[opencl_init] could not allocate memory\n");
681 goto finally;
682 }
683 }
684
685 cl_device_id *devs = devices;
686 for(int n = 0; n < num_platforms; n++)
687 {
688 if(all_num_devices[n])
689 {
690 cl_platform_id platform = all_platforms[n];
691 err = (cl->dlocl->symbols->dt_clGetDeviceIDs)(platform, CL_DEVICE_TYPE_ALL, all_num_devices[n], devs,
692 NULL);
693 if(err != CL_SUCCESS)
694 {
695 num_devices -= all_num_devices[n];
696 dt_print(DT_DEBUG_OPENCL, "[opencl_init] could not get devices list: %d\n", err);
697 }
698 devs += all_num_devices[n];
699 }
700 }
701 devs = NULL;
702
703 dt_print(DT_DEBUG_OPENCL, "[opencl_init] found %d device%s\n", num_devices, num_devices > 1 ? "s" : "");
704 if(num_devices == 0)
705 {
706 if(devices)
707 free(devices);
708 goto finally;
709 }
710
711 int dev = 0;
712 for(int k = 0; k < num_devices; k++)
713 {
714 const int res = dt_opencl_device_init(cl, dev, devices, k, opencl_memory_requirement);
715
716 if(res != 0)
717 continue;
718
719 // increase dev only if dt_opencl_device_init was successful (res == 0)
720
721 ++dev;
722 }
723 free(devices);
724 devices = NULL;
725
726 if(dev > 0)
727 {
728 cl->num_devs = dev;
729 cl->inited = 1;
730 cl->enabled = dt_conf_get_bool("opencl");
731 memset(cl->mandatory, 0, sizeof(cl->mandatory));
732 cl->dev_priority_image = (int *)malloc(sizeof(int) * (dev + 1));
733 cl->dev_priority_preview = (int *)malloc(sizeof(int) * (dev + 1));
734 cl->dev_priority_preview2 = (int *)malloc(sizeof(int) * (dev + 1));
735 cl->dev_priority_export = (int *)malloc(sizeof(int) * (dev + 1));
736 cl->dev_priority_thumbnail = (int *)malloc(sizeof(int) * (dev + 1));
737
738 // only check successful malloc in debug mode; darktable will crash anyhow sooner or later if mallocs that
739 // small would fail
740 assert(cl->dev_priority_image != NULL && cl->dev_priority_preview != NULL && cl->dev_priority_preview2 != NULL
741 && cl->dev_priority_export != NULL && cl->dev_priority_thumbnail != NULL);
742
743 dt_print(DT_DEBUG_OPENCL, "[opencl_init] OpenCL successfully initialized.\n");
744 dt_print(
745 DT_DEBUG_OPENCL,
746 "[opencl_init] here are the internal numbers and names of OpenCL devices available to darktable:\n");
747 for(int i = 0; i < dev; i++) dt_print(DT_DEBUG_OPENCL, "[opencl_init]\t\t%d\t'%s'\n", i, cl->dev[i].name);
748 }
749 else
750 {
751 dt_print(DT_DEBUG_OPENCL, "[opencl_init] no suitable devices found.\n");
752 }
753
754 finally:
755 dt_print(DT_DEBUG_OPENCL, "[opencl_init] FINALLY: opencl is %sAVAILABLE on this system.\n",
756 cl->inited ? "" : "NOT ");
757 dt_print(DT_DEBUG_OPENCL, "[opencl_init] initial status of opencl enabled flag is %s.\n",
758 cl->enabled ? "ON" : "OFF");
759 if(cl->inited)
760 {
761 dt_capabilities_add("opencl");
762 cl->blendop = dt_develop_blend_init_cl_global();
763 cl->bilateral = dt_bilateral_init_cl_global();
764 cl->gaussian = dt_gaussian_init_cl_global();
765 cl->interpolation = dt_interpolation_init_cl_global();
766 cl->local_laplacian = dt_local_laplacian_init_cl_global();
767 cl->dwt = dt_dwt_init_cl_global();
768 cl->heal = dt_heal_init_cl_global();
769 cl->colorspaces = dt_colorspaces_init_cl_global();
770 cl->guided_filter = dt_guided_filter_init_cl_global();
771
772 char checksum[64];
773 snprintf(checksum, sizeof(checksum), "%u", cl->crc);
774 char *oldchecksum = dt_conf_get_string("opencl_checksum");
775
776 // check if the configuration (OpenCL device setup) has changed, indicated by checksum != oldchecksum
777 if(strcasecmp(oldchecksum, "OFF") != 0 && strcmp(oldchecksum, checksum) != 0)
778 {
779 // store new checksum value in config
780 dt_conf_set_string("opencl_checksum", checksum);
781 // do CPU bencharking
782 float tcpu = dt_opencl_benchmark_cpu(1024, 1024, 5, 100.0f);
783 // get best benchmarking value of all detected OpenCL devices
784 float tgpumin = INFINITY;
785 for(int n = 0; n < cl->num_devs; n++)
786 {
787 float tgpu = cl->dev[n].benchmark = dt_opencl_benchmark_gpu(n, 1024, 1024, 5, 100.0f);
788 tgpumin = fmin(tgpu, tgpumin);
789 }
790 dt_print(DT_DEBUG_OPENCL, "[opencl_init] benchmarking results: %f seconds for fastest GPU versus %f seconds for CPU.\n",
791 tgpumin, tcpu);
792
793 if(tcpu <= 1.5f * tgpumin)
794 {
795 // de-activate opencl for darktable in case of too slow GPU(s). user can always manually overrule this later.
796 cl->enabled = FALSE;
797 dt_conf_set_bool("opencl", FALSE);
798 dt_print(DT_DEBUG_OPENCL, "[opencl_init] due to a slow GPU the opencl flag has been set to OFF.\n");
799 dt_control_log(_("due to a slow GPU hardware acceleration via opencl has been de-activated."));
800 }
801 else if(cl->num_devs >= 2)
802 {
803 // set scheduling profile to "multiple GPUs" if more than one device has been found
804 dt_conf_set_string("opencl_scheduling_profile", "multiple GPUs");
805 dt_print(DT_DEBUG_OPENCL, "[opencl_init] set scheduling profile for multiple GPUs.\n");
806 dt_control_log(_("multiple GPUs detected - opencl scheduling profile has been set accordingly."));
807 }
808 else if(tcpu >= 6.0f * tgpumin)
809 {
810 // set scheduling profile to "very fast GPU" if CPU is way too slow
811 dt_conf_set_string("opencl_scheduling_profile", "very fast GPU");
812 dt_print(DT_DEBUG_OPENCL, "[opencl_init] set scheduling profile for very fast GPU.\n");
813 dt_control_log(_("very fast GPU detected - opencl scheduling profile has been set accordingly."));
814 }
815 else
816 {
817 // set scheduling profile to "default"
818 dt_conf_set_string("opencl_scheduling_profile", "default");
819 dt_print(DT_DEBUG_OPENCL, "[opencl_init] set scheduling profile to default.\n");
820 dt_control_log(_("opencl scheduling profile set to default."));
821 }
822 }
823 g_free(oldchecksum);
824
825 // apply config settings for scheduling profile: sets device priorities and pixelpipe synchronization timeout
826 dt_opencl_scheduling_profile_t profile = dt_opencl_get_scheduling_profile();
827 dt_opencl_apply_scheduling_profile(profile);
828 }
829 else // initialization failed
830 {
831 for(int i = 0; cl->dev && i < cl->num_devs; i++)
832 {
833 dt_pthread_mutex_destroy(&cl->dev[i].lock);
834 for(int k = 0; k < DT_OPENCL_MAX_KERNELS; k++)
835 if(cl->dev[i].kernel_used[k]) (cl->dlocl->symbols->dt_clReleaseKernel)(cl->dev[i].kernel[k]);
836 for(int k = 0; k < DT_OPENCL_MAX_PROGRAMS; k++)
837 if(cl->dev[i].program_used[k]) (cl->dlocl->symbols->dt_clReleaseProgram)(cl->dev[i].program[k]);
838 (cl->dlocl->symbols->dt_clReleaseCommandQueue)(cl->dev[i].cmd_queue);
839 (cl->dlocl->symbols->dt_clReleaseContext)(cl->dev[i].context);
840 if(cl->use_events)
841 {
842 dt_opencl_events_reset(i);
843 free(cl->dev[i].eventlist);
844 free(cl->dev[i].eventtags);
845 }
846 free((void *)(cl->dev[i].vendor));
847 free((void *)(cl->dev[i].name));
848 free((void *)(cl->dev[i].cname));
849 free((void *)(cl->dev[i].options));
850 }
851 }
852
853 free(all_num_devices);
854 free(all_platforms);
855
856 if(locale)
857 {
858 setlocale(LC_ALL, locale);
859 free(locale);
860 }
861
862 return;
863 }
864
dt_opencl_cleanup(dt_opencl_t * cl)865 void dt_opencl_cleanup(dt_opencl_t *cl)
866 {
867 if(cl->inited)
868 {
869 dt_develop_blend_free_cl_global(cl->blendop);
870 dt_bilateral_free_cl_global(cl->bilateral);
871 dt_gaussian_free_cl_global(cl->gaussian);
872 dt_interpolation_free_cl_global(cl->interpolation);
873 dt_dwt_free_cl_global(cl->dwt);
874 dt_heal_free_cl_global(cl->heal);
875 dt_colorspaces_free_cl_global(cl->colorspaces);
876 dt_guided_filter_free_cl_global(cl->guided_filter);
877
878 for(int i = 0; i < cl->num_devs; i++)
879 {
880 dt_pthread_mutex_destroy(&cl->dev[i].lock);
881 for(int k = 0; k < DT_OPENCL_MAX_KERNELS; k++)
882 if(cl->dev[i].kernel_used[k]) (cl->dlocl->symbols->dt_clReleaseKernel)(cl->dev[i].kernel[k]);
883 for(int k = 0; k < DT_OPENCL_MAX_PROGRAMS; k++)
884 if(cl->dev[i].program_used[k]) (cl->dlocl->symbols->dt_clReleaseProgram)(cl->dev[i].program[k]);
885 (cl->dlocl->symbols->dt_clReleaseCommandQueue)(cl->dev[i].cmd_queue);
886 (cl->dlocl->symbols->dt_clReleaseContext)(cl->dev[i].context);
887
888 if(cl->print_statistics && (darktable.unmuted & DT_DEBUG_MEMORY))
889 {
890 dt_print(DT_DEBUG_OPENCL, "[opencl_summary_statistics] device '%s' (%d): peak memory usage %zu bytes (%.1f MB)\n",
891 cl->dev[i].name, i, cl->dev[i].peak_memory, (float)cl->dev[i].peak_memory/(1024*1024));
892 }
893
894 if(cl->print_statistics && cl->use_events)
895 {
896 if(cl->dev[i].totalevents)
897 {
898 dt_print(DT_DEBUG_OPENCL, "[opencl_summary_statistics] device '%s' (%d): %d out of %d events were "
899 "successful and %d events lost\n",
900 cl->dev[i].name, i, cl->dev[i].totalsuccess, cl->dev[i].totalevents, cl->dev[i].totallost);
901 }
902 else
903 {
904 dt_print(DT_DEBUG_OPENCL, "[opencl_summary_statistics] device '%s' (%d): NOT utilized\n",
905 cl->dev[i].name, i);
906 }
907 }
908
909 if(cl->use_events)
910 {
911 dt_opencl_events_reset(i);
912
913 free(cl->dev[i].eventlist);
914 free(cl->dev[i].eventtags);
915 }
916
917 free((void *)(cl->dev[i].vendor));
918 free((void *)(cl->dev[i].name));
919 free((void *)(cl->dev[i].cname));
920 free((void *)(cl->dev[i].options));
921 }
922 free(cl->dev_priority_image);
923 free(cl->dev_priority_preview);
924 free(cl->dev_priority_preview2);
925 free(cl->dev_priority_export);
926 free(cl->dev_priority_thumbnail);
927 }
928
929 if(cl->dlocl)
930 {
931 free(cl->dlocl->symbols);
932 g_free(cl->dlocl->library);
933 free(cl->dlocl);
934 }
935
936 free(cl->dev);
937 dt_pthread_mutex_destroy(&cl->lock);
938 }
939
dt_opencl_get_vendor_by_id(unsigned int id)940 static const char *dt_opencl_get_vendor_by_id(unsigned int id)
941 {
942 const char *vendor;
943
944 switch(id)
945 {
946 case 4098:
947 vendor = "AMD";
948 break;
949 case 4318:
950 vendor = "NVIDIA";
951 break;
952 case 0x8086u:
953 vendor = "INTEL";
954 break;
955 default:
956 vendor = "UNKNOWN";
957 }
958
959 return vendor;
960 }
961
dt_opencl_benchmark_gpu(const int devid,const size_t width,const size_t height,const int count,const float sigma)962 static float dt_opencl_benchmark_gpu(const int devid, const size_t width, const size_t height, const int count, const float sigma)
963 {
964 const int bpp = 4 * sizeof(float);
965 cl_int err = 666;
966 cl_mem dev_mem = NULL;
967 float *buf = NULL;
968 dt_gaussian_cl_t *g = NULL;
969
970 const float Labmax[] = { INFINITY, INFINITY, INFINITY, INFINITY };
971 const float Labmin[] = { -INFINITY, -INFINITY, -INFINITY, -INFINITY };
972
973 unsigned int *const tea_states = alloc_tea_states(dt_get_num_threads());
974
975 buf = dt_alloc_align(64, width * height * bpp);
976 if(buf == NULL) goto error;
977
978 #ifdef _OPENMP
979 #pragma omp parallel for default(none) \
980 dt_omp_firstprivate(height, tea_states, width) \
981 shared(buf)
982 #endif
983 for(size_t j = 0; j < height; j++)
984 {
985 unsigned int *tea_state = get_tea_state(tea_states,dt_get_thread_num());
986 tea_state[0] = j + dt_get_thread_num();
987 size_t index = j * 4 * width;
988 for(int i = 0; i < 4 * width; i++)
989 {
990 encrypt_tea(tea_state);
991 buf[index + i] = 100.0f * tpdf(tea_state[0]);
992 }
993 }
994
995 // start timer
996 double start = dt_get_wtime();
997
998 // allocate dev_mem buffer
999 dev_mem = dt_opencl_alloc_device_use_host_pointer(devid, width, height, bpp, width*bpp, buf);
1000 if(dev_mem == NULL) goto error;
1001
1002 // prepare gaussian filter
1003 g = dt_gaussian_init_cl(devid, width, height, 4, Labmax, Labmin, sigma, 0);
1004 if(!g) goto error;
1005
1006 // gaussian blur
1007 for(int n = 0; n < count; n++)
1008 {
1009 err = dt_gaussian_blur_cl(g, dev_mem, dev_mem);
1010 if(err != CL_SUCCESS) goto error;
1011 }
1012
1013 // cleanup gaussian filter
1014 dt_gaussian_free_cl(g);
1015 g = NULL;
1016
1017 // copy dev_mem -> buf
1018 err = dt_opencl_copy_device_to_host(devid, buf, dev_mem, width, height, bpp);
1019 if(err != CL_SUCCESS) goto error;
1020
1021 // free dev_mem
1022 dt_opencl_release_mem_object(dev_mem);
1023
1024 // end timer
1025 double end = dt_get_wtime();
1026
1027 dt_free_align(buf);
1028 free_tea_states(tea_states);
1029 return (end - start);
1030
1031 error:
1032 dt_gaussian_free_cl(g);
1033 dt_free_align(buf);
1034 free_tea_states(tea_states);
1035 dt_opencl_release_mem_object(dev_mem);
1036 return INFINITY;
1037 }
1038
dt_opencl_benchmark_cpu(const size_t width,const size_t height,const int count,const float sigma)1039 static float dt_opencl_benchmark_cpu(const size_t width, const size_t height, const int count, const float sigma)
1040 {
1041 const int bpp = 4 * sizeof(float);
1042 float *buf = NULL;
1043 dt_gaussian_t *g = NULL;
1044
1045 const float Labmax[] = { INFINITY, INFINITY, INFINITY, INFINITY };
1046 const float Labmin[] = { -INFINITY, -INFINITY, -INFINITY, -INFINITY };
1047
1048 unsigned int *const tea_states = alloc_tea_states(dt_get_num_threads());
1049
1050 buf = dt_alloc_align(64, width * height * bpp);
1051 if(buf == NULL) goto error;
1052
1053 #ifdef _OPENMP
1054 #pragma omp parallel for default(none) \
1055 dt_omp_firstprivate(height, width, tea_states) \
1056 shared(buf)
1057 #endif
1058 for(size_t j = 0; j < height; j++)
1059 {
1060 unsigned int *tea_state = get_tea_state(tea_states,dt_get_thread_num());
1061 tea_state[0] = j + dt_get_thread_num();
1062 size_t index = j * 4 * width;
1063 for(int i = 0; i < 4 * width; i++)
1064 {
1065 encrypt_tea(tea_state);
1066 buf[index + i] = 100.0f * tpdf(tea_state[0]);
1067 }
1068 }
1069
1070 // start timer
1071 double start = dt_get_wtime();
1072
1073 // prepare gaussian filter
1074 g = dt_gaussian_init(width, height, 4, Labmax, Labmin, sigma, 0);
1075 if(!g) goto error;
1076
1077 // gaussian blur
1078 for(int n = 0; n < count; n++)
1079 {
1080 dt_gaussian_blur(g, buf, buf);
1081 }
1082
1083 // cleanup gaussian filter
1084 dt_gaussian_free(g);
1085 g = NULL;
1086
1087 // end timer
1088 double end = dt_get_wtime();
1089
1090 dt_free_align(buf);
1091 free_tea_states(tea_states);
1092 return (end - start);
1093
1094 error:
1095 dt_gaussian_free(g);
1096 dt_free_align(buf);
1097 free_tea_states(tea_states);
1098 return INFINITY;
1099 }
1100
1101
dt_opencl_finish(const int devid)1102 int dt_opencl_finish(const int devid)
1103 {
1104 dt_opencl_t *cl = darktable.opencl;
1105 if(!cl->inited || devid < 0) return -1;
1106
1107 cl_int err = (cl->dlocl->symbols->dt_clFinish)(cl->dev[devid].cmd_queue);
1108
1109 // take the opportunity to release some event handles, but without printing
1110 // summary statistics
1111 cl_int success = dt_opencl_events_flush(devid, 0);
1112
1113 return (err == CL_SUCCESS && success == CL_COMPLETE);
1114 }
1115
dt_opencl_enqueue_barrier(const int devid)1116 int dt_opencl_enqueue_barrier(const int devid)
1117 {
1118 dt_opencl_t *cl = darktable.opencl;
1119 if(!cl->inited || devid < 0) return -1;
1120 return (cl->dlocl->symbols->dt_clEnqueueBarrier)(cl->dev[devid].cmd_queue);
1121 }
1122
_take_from_list(int * list,int value)1123 static int _take_from_list(int *list, int value)
1124 {
1125 int result = -1;
1126
1127 while(*list != -1 && *list != value) list++;
1128 result = *list;
1129
1130 while(*list != -1)
1131 {
1132 *list = *(list + 1);
1133 list++;
1134 }
1135
1136 return result;
1137 }
1138
1139
_device_by_cname(const char * name)1140 static int _device_by_cname(const char *name)
1141 {
1142 dt_opencl_t *cl = darktable.opencl;
1143 int devs = cl->num_devs;
1144 char tmp[2048] = { 0 };
1145 int result = -1;
1146
1147 _ascii_str_canonical(name, tmp, sizeof(tmp));
1148
1149 for(int i = 0; i < devs; i++)
1150 {
1151 if(!strcmp(tmp, cl->dev[i].cname))
1152 {
1153 result = i;
1154 break;
1155 }
1156 }
1157
1158 return result;
1159 }
1160
1161
_ascii_str_canonical(const char * in,char * out,int maxlen)1162 static char *_ascii_str_canonical(const char *in, char *out, int maxlen)
1163 {
1164 if(out == NULL)
1165 {
1166 maxlen = strlen(in) + 1;
1167 out = malloc(maxlen);
1168 if(out == NULL) return NULL;
1169 }
1170
1171 int len = 0;
1172
1173 while(*in != '\0' && len < maxlen - 1)
1174 {
1175 int n = strcspn(in, "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ");
1176 in += n;
1177 if(n != 0) continue;
1178 out[len] = tolower(*in);
1179 len++;
1180 in++;
1181 }
1182 out[len] = '\0';
1183
1184 return out;
1185 }
1186
1187
_strsep(char ** stringp,const char * delim)1188 static char *_strsep(char **stringp, const char *delim)
1189 {
1190 char *begin, *end;
1191
1192 begin = *stringp;
1193 if(begin == NULL) return NULL;
1194
1195 if(delim[0] == '\0' || delim[1] == '\0')
1196 {
1197 char ch = delim[0];
1198
1199 if(ch == '\0')
1200 end = NULL;
1201 else
1202 {
1203 if(*begin == ch)
1204 end = begin;
1205 else if(*begin == '\0')
1206 end = NULL;
1207 else
1208 end = strchr(begin + 1, ch);
1209 }
1210 }
1211 else
1212 end = strpbrk(begin, delim);
1213
1214 if(end)
1215 {
1216 *end++ = '\0';
1217 *stringp = end;
1218 }
1219 else
1220 *stringp = NULL;
1221
1222 return begin;
1223 }
1224
1225
1226 // parse a single token of priority string and store priorities in priority_list
dt_opencl_priority_parse(dt_opencl_t * cl,char * configstr,int * priority_list,int * mandatory)1227 static void dt_opencl_priority_parse(dt_opencl_t *cl, char *configstr, int *priority_list, int *mandatory)
1228 {
1229 int devs = cl->num_devs;
1230 int count = 0;
1231 int *full = malloc(sizeof(int) * (devs + 1));
1232 int mnd = 0;
1233
1234 // NULL or empty configstring?
1235 if(configstr == NULL || *configstr == '\0')
1236 {
1237 priority_list[0] = -1;
1238 *mandatory = 0;
1239 free(full);
1240 return;
1241 }
1242
1243 // check if user wants us to force-use opencl device(s)
1244 if(configstr[0] == '+')
1245 {
1246 mnd = 1;
1247 configstr++;
1248 }
1249
1250 // first start with a full list of devices to take from
1251 for(int i = 0; i < devs; i++) full[i] = i;
1252 full[devs] = -1;
1253
1254 gchar **tokens = g_strsplit(configstr, ",", 0);
1255 gchar **tokens_ptr = tokens;
1256
1257 while(tokens != NULL && *tokens_ptr != NULL && count < devs + 1 && full[0] != -1)
1258 {
1259 gchar *str = *tokens_ptr;
1260 int not = 0;
1261 int all = 0;
1262
1263 switch(*str)
1264 {
1265 case '*':
1266 all = 1;
1267 break;
1268 case '!':
1269 not = 1;
1270 while(*str == '!') str++;
1271 break;
1272 }
1273
1274 if(all)
1275 {
1276 // copy all remaining device numbers from full to priority list
1277 for(int i = 0; i < devs && full[i] != -1; i++)
1278 {
1279 priority_list[count] = full[i];
1280 count++;
1281 }
1282 full[0] = -1; // mark full list as empty
1283 }
1284 else if(*str != '\0')
1285 {
1286 char *endptr = NULL;
1287
1288 // first check if str corresponds to an existing canonical device name
1289 long number = _device_by_cname(str);
1290
1291 // if not try to convert string into decimal device number
1292 if(number < 0) number = strtol(str, &endptr, 10);
1293
1294 // still not found or negative number given? set number to -1
1295 if(number < 0 || (number == 0 && endptr == str)) number = -1;
1296
1297 // try to take number out of remaining device list
1298 int dev_number = _take_from_list(full, number);
1299
1300 if(!not&&dev_number != -1)
1301 {
1302 priority_list[count] = dev_number;
1303 count++;
1304 }
1305 }
1306
1307 tokens_ptr++;
1308 }
1309
1310 g_strfreev(tokens);
1311
1312 // terminate priority list with -1
1313 while(count < devs + 1) priority_list[count++] = -1;
1314
1315 // opencl use can only be mandatory if at least one opencl device is given
1316 *mandatory = (priority_list[0] != -1) ? mnd : 0;
1317
1318 free(full);
1319 }
1320
1321 // parse a complete priority string
dt_opencl_priorities_parse(dt_opencl_t * cl,const char * configstr)1322 static void dt_opencl_priorities_parse(dt_opencl_t *cl, const char *configstr)
1323 {
1324 char tmp[2048];
1325 int len = 0;
1326
1327 // first get rid of all invalid characters
1328 while(*configstr != '\0' && len < sizeof(tmp) - 1)
1329 {
1330 int n = strcspn(configstr, "/!,*+0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ");
1331 configstr += n;
1332 if(n != 0) continue;
1333 tmp[len] = *configstr;
1334 len++;
1335 configstr++;
1336 }
1337 tmp[len] = '\0';
1338
1339 char *str = tmp;
1340
1341 // now split config string into tokens, separated by '/' and parse them one after the other
1342 char *prio = _strsep(&str, "/");
1343 dt_opencl_priority_parse(cl, prio, cl->dev_priority_image, &cl->mandatory[0]);
1344
1345 prio = _strsep(&str, "/");
1346 dt_opencl_priority_parse(cl, prio, cl->dev_priority_preview, &cl->mandatory[1]);
1347
1348 prio = _strsep(&str, "/");
1349 dt_opencl_priority_parse(cl, prio, cl->dev_priority_export, &cl->mandatory[2]);
1350
1351 prio = _strsep(&str, "/");
1352 dt_opencl_priority_parse(cl, prio, cl->dev_priority_thumbnail, &cl->mandatory[3]);
1353
1354 prio = _strsep(&str, "/");
1355 dt_opencl_priority_parse(cl, prio, cl->dev_priority_preview2, &cl->mandatory[4]);
1356 }
1357
1358 // set device priorities according to config string
dt_opencl_update_priorities(const char * configstr)1359 static void dt_opencl_update_priorities(const char *configstr)
1360 {
1361 dt_opencl_t *cl = darktable.opencl;
1362 dt_opencl_priorities_parse(cl, configstr);
1363
1364 dt_print(DT_DEBUG_OPENCL, "[opencl_priorities] these are your device priorities:\n");
1365 dt_print(DT_DEBUG_OPENCL, "[opencl_priorities] \t\timage\tpreview\texport\tthumbs\tpreview2\n");
1366 for(int i = 0; i < cl->num_devs; i++)
1367 dt_print(DT_DEBUG_OPENCL, "[opencl_priorities]\t\t%d\t%d\t%d\t%d\t%d\n", cl->dev_priority_image[i],
1368 cl->dev_priority_preview[i], cl->dev_priority_export[i], cl->dev_priority_thumbnail[i], cl->dev_priority_preview2[i]);
1369 dt_print(DT_DEBUG_OPENCL, "[opencl_priorities] show if opencl use is mandatory for a given pixelpipe:\n");
1370 dt_print(DT_DEBUG_OPENCL, "[opencl_priorities] \t\timage\tpreview\texport\tthumbs\tpreview2\n");
1371 dt_print(DT_DEBUG_OPENCL, "[opencl_priorities]\t\t%d\t%d\t%d\t%d\t%d\n", cl->mandatory[0],
1372 cl->mandatory[1], cl->mandatory[2], cl->mandatory[3], cl->mandatory[4]);
1373 }
1374
dt_opencl_lock_device(const int pipetype)1375 int dt_opencl_lock_device(const int pipetype)
1376 {
1377 dt_opencl_t *cl = darktable.opencl;
1378 if(!cl->inited) return -1;
1379
1380
1381 dt_pthread_mutex_lock(&cl->lock);
1382
1383 size_t prio_size = sizeof(int) * (cl->num_devs + 1);
1384 int *priority = (int *)malloc(prio_size);
1385 int mandatory;
1386
1387 switch(pipetype)
1388 {
1389 case DT_DEV_PIXELPIPE_FULL:
1390 memcpy(priority, cl->dev_priority_image, prio_size);
1391 mandatory = cl->mandatory[0];
1392 break;
1393 case DT_DEV_PIXELPIPE_PREVIEW:
1394 memcpy(priority, cl->dev_priority_preview, prio_size);
1395 mandatory = cl->mandatory[1];
1396 break;
1397 case DT_DEV_PIXELPIPE_EXPORT:
1398 memcpy(priority, cl->dev_priority_export, prio_size);
1399 mandatory = cl->mandatory[2];
1400 break;
1401 case DT_DEV_PIXELPIPE_THUMBNAIL:
1402 memcpy(priority, cl->dev_priority_thumbnail, prio_size);
1403 mandatory = cl->mandatory[3];
1404 break;
1405 case DT_DEV_PIXELPIPE_PREVIEW2:
1406 memcpy(priority, cl->dev_priority_preview2, prio_size);
1407 mandatory = cl->mandatory[4];
1408 break;
1409 default:
1410 free(priority);
1411 priority = NULL;
1412 mandatory = 0;
1413 }
1414
1415 dt_pthread_mutex_unlock(&cl->lock);
1416
1417 if(priority)
1418 {
1419 const int usec = 5000;
1420 const int nloop = MAX(0, dt_conf_get_int("opencl_mandatory_timeout"));
1421
1422 // check for free opencl device repeatedly if mandatory is TRUE, else give up after first try
1423 for(int n = 0; n < nloop; n++)
1424 {
1425 const int *prio = priority;
1426
1427 while(*prio != -1)
1428 {
1429 if(!dt_pthread_mutex_BAD_trylock(&cl->dev[*prio].lock))
1430 {
1431 int devid = *prio;
1432 free(priority);
1433 return devid;
1434 }
1435 prio++;
1436 }
1437
1438 if(!mandatory)
1439 {
1440 free(priority);
1441 return -1;
1442 }
1443
1444 dt_iop_nap(usec);
1445 }
1446 }
1447 else
1448 {
1449 // only a fallback if a new pipe type would be added and we forget to take care of it in opencl.c
1450 for(int try_dev = 0; try_dev < cl->num_devs; try_dev++)
1451 {
1452 // get first currently unused processor
1453 if(!dt_pthread_mutex_BAD_trylock(&cl->dev[try_dev].lock)) return try_dev;
1454 }
1455 }
1456
1457 free(priority);
1458
1459 // no free GPU :(
1460 // use CPU processing, if no free device:
1461 return -1;
1462 }
1463
dt_opencl_unlock_device(const int dev)1464 void dt_opencl_unlock_device(const int dev)
1465 {
1466 dt_opencl_t *cl = darktable.opencl;
1467 if(!cl->inited) return;
1468 if(dev < 0 || dev >= cl->num_devs) return;
1469 dt_pthread_mutex_BAD_unlock(&cl->dev[dev].lock);
1470 }
1471
fopen_stat(const char * filename,struct stat * st)1472 static FILE *fopen_stat(const char *filename, struct stat *st)
1473 {
1474 FILE *f = g_fopen(filename, "rb");
1475 if(!f)
1476 {
1477 dt_print(DT_DEBUG_OPENCL, "[opencl_fopen_stat] could not open file `%s'!\n", filename);
1478 return NULL;
1479 }
1480 int fd = fileno(f);
1481 if(fstat(fd, st) < 0)
1482 {
1483 dt_print(DT_DEBUG_OPENCL, "[opencl_fopen_stat] could not stat file `%s'!\n", filename);
1484 return NULL;
1485 }
1486 return f;
1487 }
1488
1489
dt_opencl_md5sum(const char ** files,char ** md5sums)1490 void dt_opencl_md5sum(const char **files, char **md5sums)
1491 {
1492 char kerneldir[PATH_MAX] = { 0 };
1493 char filename[PATH_MAX] = { 0 };
1494 dt_loc_get_kerneldir(kerneldir, sizeof(kerneldir));
1495
1496 for(int n = 0; n < DT_OPENCL_MAX_INCLUDES; n++, files++, md5sums++)
1497 {
1498 if(!*files)
1499 {
1500 *md5sums = NULL;
1501 continue;
1502 }
1503
1504 snprintf(filename, sizeof(filename), "%s" G_DIR_SEPARATOR_S "%s", kerneldir, *files);
1505
1506 struct stat filestat;
1507 FILE *f = fopen_stat(filename, &filestat);
1508
1509 if(!f)
1510 {
1511 dt_print(DT_DEBUG_OPENCL, "[opencl_md5sums] could not open file `%s'!\n", filename);
1512 *md5sums = NULL;
1513 continue;
1514 }
1515
1516 size_t filesize = filestat.st_size;
1517 char *file = (char *)malloc(filesize);
1518
1519 if(!file)
1520 {
1521 dt_print(DT_DEBUG_OPENCL, "[opencl_md5sums] could not allocate buffer for file `%s'!\n", filename);
1522 *md5sums = NULL;
1523 fclose(f);
1524 continue;
1525 }
1526
1527 size_t rd = fread(file, sizeof(char), filesize, f);
1528 fclose(f);
1529
1530 if(rd != filesize)
1531 {
1532 free(file);
1533 dt_print(DT_DEBUG_OPENCL, "[opencl_md5sums] could not read all of file `%s'!\n", filename);
1534 *md5sums = NULL;
1535 continue;
1536 }
1537
1538 *md5sums = g_compute_checksum_for_data(G_CHECKSUM_MD5, (guchar *)file, filesize);
1539
1540 free(file);
1541 }
1542 }
1543
dt_opencl_load_program(const int dev,const int prog,const char * filename,const char * binname,const char * cachedir,char * md5sum,char ** includemd5,int * loaded_cached)1544 int dt_opencl_load_program(const int dev, const int prog, const char *filename, const char *binname,
1545 const char *cachedir, char *md5sum, char **includemd5, int *loaded_cached)
1546 {
1547 cl_int err;
1548 dt_opencl_t *cl = darktable.opencl;
1549
1550 struct stat filestat, cachedstat;
1551 *loaded_cached = 0;
1552
1553 if(prog < 0 || prog >= DT_OPENCL_MAX_PROGRAMS)
1554 {
1555 dt_print(DT_DEBUG_OPENCL, "[opencl_load_source] invalid program number `%d' of file `%s'!\n", prog,
1556 filename);
1557 return 0;
1558 }
1559
1560 if(cl->dev[dev].program_used[prog])
1561 {
1562 dt_print(DT_DEBUG_OPENCL,
1563 "[opencl_load_source] program number `%d' already in use when loading file `%s'!\n", prog,
1564 filename);
1565 return 0;
1566 }
1567
1568 FILE *f = fopen_stat(filename, &filestat);
1569 if(!f) return 0;
1570
1571 size_t filesize = filestat.st_size;
1572 char *file = (char *)malloc(filesize + 2048);
1573 size_t rd = fread(file, sizeof(char), filesize, f);
1574 fclose(f);
1575 if(rd != filesize)
1576 {
1577 free(file);
1578 dt_print(DT_DEBUG_OPENCL, "[opencl_load_source] could not read all of file `%s'!\n", filename);
1579 return 0;
1580 }
1581
1582 char *start = file + filesize;
1583 char *end = start + 2048;
1584 size_t len;
1585
1586 cl_device_id devid = cl->dev[dev].devid;
1587 (cl->dlocl->symbols->dt_clGetDeviceInfo)(devid, CL_DRIVER_VERSION, end - start, start, &len);
1588 start += len;
1589
1590 cl_platform_id platform;
1591 (cl->dlocl->symbols->dt_clGetDeviceInfo)(devid, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &platform, NULL);
1592
1593 (cl->dlocl->symbols->dt_clGetPlatformInfo)(platform, CL_PLATFORM_VERSION, end - start, start, &len);
1594 start += len;
1595
1596 len = g_strlcpy(start, cl->dev[dev].options, end - start);
1597 start += len;
1598
1599 /* make sure that the md5sums of all the includes are applied as well */
1600 for(int n = 0; n < DT_OPENCL_MAX_INCLUDES; n++)
1601 {
1602 if(!includemd5[n]) continue;
1603 len = g_strlcpy(start, includemd5[n], end - start);
1604 start += len;
1605 }
1606
1607 char *source_md5 = g_compute_checksum_for_data(G_CHECKSUM_MD5, (guchar *)file, start - file);
1608 g_strlcpy(md5sum, source_md5, 33);
1609 g_free(source_md5);
1610
1611 file[filesize] = '\0';
1612
1613 char linkedfile[PATH_MAX] = { 0 };
1614 ssize_t linkedfile_len = 0;
1615
1616 #if defined(_WIN32)
1617 // No symlinks on Windows
1618 // Have to figure out the name using the filename + md5sum
1619 char dup[PATH_MAX] = { 0 };
1620 snprintf(dup, sizeof(dup), "%s.%s", binname, md5sum);
1621 FILE *cached = fopen_stat(dup, &cachedstat);
1622 g_strlcpy(linkedfile, md5sum, sizeof(linkedfile));
1623 linkedfile_len = strlen(md5sum);
1624 #else
1625 FILE *cached = fopen_stat(binname, &cachedstat);
1626 #endif
1627
1628 if(cached)
1629 {
1630 #if !defined(_WIN32)
1631 linkedfile_len = readlink(binname, linkedfile, sizeof(linkedfile) - 1);
1632 #endif // !defined(_WIN32)
1633 if(linkedfile_len > 0)
1634 {
1635 linkedfile[linkedfile_len] = '\0';
1636
1637 if(strncmp(linkedfile, md5sum, 33) == 0)
1638 {
1639 // md5sum matches, load cached binary
1640 size_t cached_filesize = cachedstat.st_size;
1641
1642 unsigned char *cached_content = (unsigned char *)malloc(cached_filesize + 1);
1643 rd = fread(cached_content, sizeof(char), cached_filesize, cached);
1644 if(rd != cached_filesize)
1645 {
1646 dt_print(DT_DEBUG_OPENCL, "[opencl_load_program] could not read all of file '%s' MD5: %s!\n", binname, md5sum);
1647 }
1648 else
1649 {
1650 cl->dev[dev].program[prog] = (cl->dlocl->symbols->dt_clCreateProgramWithBinary)(
1651 cl->dev[dev].context, 1, &(cl->dev[dev].devid), &cached_filesize,
1652 (const unsigned char **)&cached_content, NULL, &err);
1653 if(err != CL_SUCCESS)
1654 {
1655 dt_print(DT_DEBUG_OPENCL,
1656 "[opencl_load_program] could not load cached binary program from file '%s' MD5: '%s'! (%d)\n",
1657 binname, md5sum, err);
1658 }
1659 else
1660 {
1661 cl->dev[dev].program_used[prog] = 1;
1662 *loaded_cached = 1;
1663 }
1664 }
1665 free(cached_content);
1666 }
1667 }
1668 fclose(cached);
1669 }
1670
1671
1672 if(*loaded_cached == 0)
1673 {
1674 // if loading cached was unsuccessful for whatever reason,
1675 // try to remove cached binary & link
1676 #if !defined(_WIN32)
1677 if(linkedfile_len > 0)
1678 {
1679 char link_dest[PATH_MAX] = { 0 };
1680 snprintf(link_dest, sizeof(link_dest), "%s" G_DIR_SEPARATOR_S "%s", cachedir, linkedfile);
1681 g_unlink(link_dest);
1682 }
1683 g_unlink(binname);
1684 #else
1685 // delete the file which contains the MD5 name
1686 g_unlink(dup);
1687 #endif //!defined(_WIN32)
1688
1689 dt_print(DT_DEBUG_OPENCL,
1690 "[opencl_load_program] could not load cached binary program, trying to compile source\n");
1691
1692 cl->dev[dev].program[prog] = (cl->dlocl->symbols->dt_clCreateProgramWithSource)(
1693 cl->dev[dev].context, 1, (const char **)&file, &filesize, &err);
1694 free(file);
1695 if(err != CL_SUCCESS)
1696 {
1697 dt_print(DT_DEBUG_OPENCL, "[opencl_load_source] could not create program from file `%s'! (%d)\n",
1698 filename, err);
1699 return 0;
1700 }
1701 else
1702 {
1703 cl->dev[dev].program_used[prog] = 1;
1704 }
1705 }
1706 else
1707 {
1708 free(file);
1709 dt_print(DT_DEBUG_OPENCL, "[opencl_load_program] loaded cached binary program from file '%s' MD5: '%s' \n", binname, md5sum);
1710 }
1711
1712 dt_print(DT_DEBUG_OPENCL, "[opencl_load_program] successfully loaded program from '%s' MD5: '%s'\n", filename, md5sum);
1713
1714 return 1;
1715 }
1716
dt_opencl_build_program(const int dev,const int prog,const char * binname,const char * cachedir,char * md5sum,int loaded_cached)1717 int dt_opencl_build_program(const int dev, const int prog, const char *binname, const char *cachedir,
1718 char *md5sum, int loaded_cached)
1719 {
1720 if(prog < 0 || prog >= DT_OPENCL_MAX_PROGRAMS) return -1;
1721 dt_opencl_t *cl = darktable.opencl;
1722 cl_program program = cl->dev[dev].program[prog];
1723 cl_int err;
1724 err = (cl->dlocl->symbols->dt_clBuildProgram)(program, 1, &(cl->dev[dev].devid), cl->dev[dev].options, 0, 0);
1725
1726 if(err != CL_SUCCESS)
1727 dt_print(DT_DEBUG_OPENCL, "[opencl_build_program] could not build program: %d\n", err);
1728 else
1729 dt_print(DT_DEBUG_OPENCL, "[opencl_build_program] successfully built program\n");
1730
1731 cl_build_status build_status;
1732 (cl->dlocl->symbols->dt_clGetProgramBuildInfo)(program, cl->dev[dev].devid, CL_PROGRAM_BUILD_STATUS,
1733 sizeof(cl_build_status), &build_status, NULL);
1734 dt_print(DT_DEBUG_OPENCL, "[opencl_build_program] BUILD STATUS: %d\n", build_status);
1735
1736 char *build_log;
1737 size_t ret_val_size;
1738 (cl->dlocl->symbols->dt_clGetProgramBuildInfo)(program, cl->dev[dev].devid, CL_PROGRAM_BUILD_LOG, 0, NULL,
1739 &ret_val_size);
1740 if(ret_val_size != SIZE_MAX)
1741 {
1742 build_log = (char *)malloc(sizeof(char) * (ret_val_size + 1));
1743 if(build_log)
1744 {
1745 (cl->dlocl->symbols->dt_clGetProgramBuildInfo)(program, cl->dev[dev].devid, CL_PROGRAM_BUILD_LOG,
1746 ret_val_size, build_log, NULL);
1747
1748 build_log[ret_val_size] = '\0';
1749
1750 dt_print(DT_DEBUG_OPENCL, "BUILD LOG:\n");
1751 dt_print(DT_DEBUG_OPENCL, "%s\n", build_log);
1752
1753 free(build_log);
1754 }
1755 }
1756
1757 if(err != CL_SUCCESS)
1758 return err;
1759 else
1760 {
1761 if(!loaded_cached)
1762 {
1763 dt_print(DT_DEBUG_OPENCL, "[opencl_build_program] saving binary\n");
1764
1765 cl_uint numdev = 0;
1766 err = (cl->dlocl->symbols->dt_clGetProgramInfo)(program, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint),
1767 &numdev, NULL);
1768 if(err != CL_SUCCESS)
1769 {
1770 dt_print(DT_DEBUG_OPENCL, "[opencl_build_program] CL_PROGRAM_NUM_DEVICES failed: %d\n", err);
1771 return CL_SUCCESS;
1772 }
1773
1774 cl_device_id *devices = malloc(sizeof(cl_device_id) * numdev);
1775 err = (cl->dlocl->symbols->dt_clGetProgramInfo)(program, CL_PROGRAM_DEVICES,
1776 sizeof(cl_device_id) * numdev, devices, NULL);
1777 if(err != CL_SUCCESS)
1778 {
1779 dt_print(DT_DEBUG_OPENCL, "[opencl_build_program] CL_PROGRAM_DEVICES failed: %d\n", err);
1780 free(devices);
1781 return CL_SUCCESS;
1782 }
1783
1784 size_t *binary_sizes = malloc(sizeof(size_t) * numdev);
1785 err = (cl->dlocl->symbols->dt_clGetProgramInfo)(program, CL_PROGRAM_BINARY_SIZES,
1786 sizeof(size_t) * numdev, binary_sizes, NULL);
1787 if(err != CL_SUCCESS)
1788 {
1789 dt_print(DT_DEBUG_OPENCL, "[opencl_build_program] CL_PROGRAM_BINARY_SIZES failed: %d\n", err);
1790 free(binary_sizes);
1791 free(devices);
1792 return CL_SUCCESS;
1793 }
1794
1795 unsigned char **binaries = malloc(sizeof(unsigned char *) * numdev);
1796 for(int i = 0; i < numdev; i++) binaries[i] = (unsigned char *)malloc(binary_sizes[i]);
1797 err = (cl->dlocl->symbols->dt_clGetProgramInfo)(program, CL_PROGRAM_BINARIES,
1798 sizeof(unsigned char *) * numdev, binaries, NULL);
1799 if(err != CL_SUCCESS)
1800 {
1801 dt_print(DT_DEBUG_OPENCL, "[opencl_build_program] CL_PROGRAM_BINARIES failed: %d\n", err);
1802 goto ret;
1803 }
1804
1805 for(int i = 0; i < numdev; i++)
1806 if(cl->dev[dev].devid == devices[i])
1807 {
1808 // save opencl compiled binary as md5sum-named file
1809 char link_dest[PATH_MAX] = { 0 };
1810 snprintf(link_dest, sizeof(link_dest), "%s" G_DIR_SEPARATOR_S "%s", cachedir, md5sum);
1811 FILE *f = g_fopen(link_dest, "wb");
1812 if(!f) goto ret;
1813 size_t bytes_written = fwrite(binaries[i], sizeof(char), binary_sizes[i], f);
1814 if(bytes_written != binary_sizes[i]) goto ret;
1815 fclose(f);
1816
1817 // create link (e.g. basic.cl.bin -> f1430102c53867c162bb60af6c163328)
1818 char cwd[PATH_MAX] = { 0 };
1819 if(!getcwd(cwd, sizeof(cwd))) goto ret;
1820 if(chdir(cachedir) != 0) goto ret;
1821 char dup[PATH_MAX] = { 0 };
1822 g_strlcpy(dup, binname, sizeof(dup));
1823 char *bname = basename(dup);
1824 #if defined(_WIN32)
1825 //CreateSymbolicLink in Windows requires admin privileges, which we don't want/need
1826 //store has using a simple filerename
1827 char finalfilename[PATH_MAX] = { 0 };
1828 snprintf(finalfilename, sizeof(finalfilename), "%s" G_DIR_SEPARATOR_S "%s.%s", cachedir, bname, md5sum);
1829 rename(link_dest, finalfilename);
1830 #else
1831 if(symlink(md5sum, bname) != 0) goto ret;
1832 #endif //!defined(_WIN32)
1833 if(chdir(cwd) != 0) goto ret;
1834 }
1835
1836 ret:
1837 for(int i = 0; i < numdev; i++) free(binaries[i]);
1838 free(binaries);
1839 free(binary_sizes);
1840 free(devices);
1841 }
1842 return CL_SUCCESS;
1843 }
1844 }
1845
dt_opencl_create_kernel(const int prog,const char * name)1846 int dt_opencl_create_kernel(const int prog, const char *name)
1847 {
1848 dt_opencl_t *cl = darktable.opencl;
1849 if(!cl->inited) return -1;
1850 if(prog < 0 || prog >= DT_OPENCL_MAX_PROGRAMS) return -1;
1851 dt_pthread_mutex_lock(&cl->lock);
1852 int k = 0;
1853 for(int dev = 0; dev < cl->num_devs; dev++)
1854 {
1855 cl_int err;
1856 for(; k < DT_OPENCL_MAX_KERNELS; k++)
1857 if(!cl->dev[dev].kernel_used[k])
1858 {
1859 cl->dev[dev].kernel_used[k] = 1;
1860 cl->dev[dev].kernel[k]
1861 = (cl->dlocl->symbols->dt_clCreateKernel)(cl->dev[dev].program[prog], name, &err);
1862 if(err != CL_SUCCESS)
1863 {
1864 dt_print(DT_DEBUG_OPENCL, "[opencl_create_kernel] could not create kernel `%s'! (%d)\n", name, err);
1865 cl->dev[dev].kernel_used[k] = 0;
1866 goto error;
1867 }
1868 else
1869 break;
1870 }
1871 if(k < DT_OPENCL_MAX_KERNELS)
1872 {
1873 dt_print(DT_DEBUG_OPENCL, "[opencl_create_kernel] successfully loaded kernel `%s' (%d) for device %d\n",
1874 name, k, dev);
1875 }
1876 else
1877 {
1878 dt_print(DT_DEBUG_OPENCL, "[opencl_create_kernel] too many kernels! can't create kernel `%s'\n", name);
1879 goto error;
1880 }
1881 }
1882 dt_pthread_mutex_unlock(&cl->lock);
1883 return k;
1884 error:
1885 dt_pthread_mutex_unlock(&cl->lock);
1886 return -1;
1887 }
1888
dt_opencl_free_kernel(const int kernel)1889 void dt_opencl_free_kernel(const int kernel)
1890 {
1891 dt_opencl_t *cl = darktable.opencl;
1892 if(!cl->inited) return;
1893 if(kernel < 0 || kernel >= DT_OPENCL_MAX_KERNELS) return;
1894 dt_pthread_mutex_lock(&cl->lock);
1895 for(int dev = 0; dev < cl->num_devs; dev++)
1896 {
1897 cl->dev[dev].kernel_used[kernel] = 0;
1898 (cl->dlocl->symbols->dt_clReleaseKernel)(cl->dev[dev].kernel[kernel]);
1899 }
1900 dt_pthread_mutex_unlock(&cl->lock);
1901 }
1902
dt_opencl_get_max_work_item_sizes(const int dev,size_t * sizes)1903 int dt_opencl_get_max_work_item_sizes(const int dev, size_t *sizes)
1904 {
1905 dt_opencl_t *cl = darktable.opencl;
1906 if(!cl->inited || dev < 0) return -1;
1907 return (cl->dlocl->symbols->dt_clGetDeviceInfo)(cl->dev[dev].devid, CL_DEVICE_MAX_WORK_ITEM_SIZES,
1908 sizeof(size_t) * 3, sizes, NULL);
1909 }
1910
dt_opencl_get_work_group_limits(const int dev,size_t * sizes,size_t * workgroupsize,unsigned long * localmemsize)1911 int dt_opencl_get_work_group_limits(const int dev, size_t *sizes, size_t *workgroupsize,
1912 unsigned long *localmemsize)
1913 {
1914 dt_opencl_t *cl = darktable.opencl;
1915 if(!cl->inited || dev < 0) return -1;
1916 cl_ulong lmemsize;
1917 cl_int err;
1918
1919 err = (cl->dlocl->symbols->dt_clGetDeviceInfo)(cl->dev[dev].devid, CL_DEVICE_LOCAL_MEM_SIZE,
1920 sizeof(cl_ulong), &lmemsize, NULL);
1921 if(err != CL_SUCCESS) return err;
1922
1923 *localmemsize = lmemsize;
1924
1925 err = (cl->dlocl->symbols->dt_clGetDeviceInfo)(cl->dev[dev].devid, CL_DEVICE_MAX_WORK_GROUP_SIZE,
1926 sizeof(size_t), workgroupsize, NULL);
1927 if(err != CL_SUCCESS) return err;
1928
1929 return dt_opencl_get_max_work_item_sizes(dev, sizes);
1930 }
1931
1932
dt_opencl_get_kernel_work_group_size(const int dev,const int kernel,size_t * kernelworkgroupsize)1933 int dt_opencl_get_kernel_work_group_size(const int dev, const int kernel, size_t *kernelworkgroupsize)
1934 {
1935 dt_opencl_t *cl = darktable.opencl;
1936 if(!cl->inited || dev < 0) return -1;
1937 if(kernel < 0 || kernel >= DT_OPENCL_MAX_KERNELS) return -1;
1938
1939 return (cl->dlocl->symbols->dt_clGetKernelWorkGroupInfo)(cl->dev[dev].kernel[kernel], cl->dev[dev].devid,
1940 CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t),
1941 kernelworkgroupsize, NULL);
1942 }
1943
1944
dt_opencl_set_kernel_arg(const int dev,const int kernel,const int num,const size_t size,const void * arg)1945 int dt_opencl_set_kernel_arg(const int dev, const int kernel, const int num, const size_t size,
1946 const void *arg)
1947 {
1948 dt_opencl_t *cl = darktable.opencl;
1949 if(!cl->inited || dev < 0) return -1;
1950 if(kernel < 0 || kernel >= DT_OPENCL_MAX_KERNELS) return -1;
1951 return (cl->dlocl->symbols->dt_clSetKernelArg)(cl->dev[dev].kernel[kernel], num, size, arg);
1952 }
1953
dt_opencl_enqueue_kernel_2d(const int dev,const int kernel,const size_t * sizes)1954 int dt_opencl_enqueue_kernel_2d(const int dev, const int kernel, const size_t *sizes)
1955 {
1956 return dt_opencl_enqueue_kernel_2d_with_local(dev, kernel, sizes, NULL);
1957 }
1958
1959
dt_opencl_enqueue_kernel_2d_with_local(const int dev,const int kernel,const size_t * sizes,const size_t * local)1960 int dt_opencl_enqueue_kernel_2d_with_local(const int dev, const int kernel, const size_t *sizes,
1961 const size_t *local)
1962 {
1963 dt_opencl_t *cl = darktable.opencl;
1964 if(!cl->inited || dev < 0) return -1;
1965 if(kernel < 0 || kernel >= DT_OPENCL_MAX_KERNELS) return -1;
1966 int err;
1967 char buf[256];
1968 buf[0] = '\0';
1969 if(darktable.unmuted & DT_DEBUG_OPENCL)
1970 (cl->dlocl->symbols->dt_clGetKernelInfo)(cl->dev[dev].kernel[kernel], CL_KERNEL_FUNCTION_NAME, 256, buf,
1971 NULL);
1972 cl_event *eventp = dt_opencl_events_get_slot(dev, buf);
1973 err = (cl->dlocl->symbols->dt_clEnqueueNDRangeKernel)(cl->dev[dev].cmd_queue, cl->dev[dev].kernel[kernel],
1974 2, NULL, sizes, local, 0, NULL, eventp);
1975 // if (err == CL_SUCCESS) err = dt_opencl_finish(dev);
1976 return err;
1977 }
1978
dt_opencl_copy_device_to_host(const int devid,void * host,void * device,const int width,const int height,const int bpp)1979 int dt_opencl_copy_device_to_host(const int devid, void *host, void *device, const int width,
1980 const int height, const int bpp)
1981 {
1982 return dt_opencl_read_host_from_device(devid, host, device, width, height, bpp);
1983 }
1984
dt_opencl_read_host_from_device(const int devid,void * host,void * device,const int width,const int height,const int bpp)1985 int dt_opencl_read_host_from_device(const int devid, void *host, void *device, const int width,
1986 const int height, const int bpp)
1987 {
1988 return dt_opencl_read_host_from_device_rowpitch(devid, host, device, width, height, bpp * width);
1989 }
1990
dt_opencl_read_host_from_device_rowpitch(const int devid,void * host,void * device,const int width,const int height,const int rowpitch)1991 int dt_opencl_read_host_from_device_rowpitch(const int devid, void *host, void *device, const int width,
1992 const int height, const int rowpitch)
1993 {
1994 if(!darktable.opencl->inited || devid < 0) return -1;
1995 const size_t origin[] = { 0, 0, 0 };
1996 const size_t region[] = { width, height, 1 };
1997 // blocking.
1998 return dt_opencl_read_host_from_device_raw(devid, host, device, origin, region, rowpitch, CL_TRUE);
1999 }
2000
dt_opencl_read_host_from_device_non_blocking(const int devid,void * host,void * device,const int width,const int height,const int bpp)2001 int dt_opencl_read_host_from_device_non_blocking(const int devid, void *host, void *device, const int width,
2002 const int height, const int bpp)
2003 {
2004 return dt_opencl_read_host_from_device_rowpitch_non_blocking(devid, host, device, width, height,
2005 bpp * width);
2006 }
2007
dt_opencl_read_host_from_device_rowpitch_non_blocking(const int devid,void * host,void * device,const int width,const int height,const int rowpitch)2008 int dt_opencl_read_host_from_device_rowpitch_non_blocking(const int devid, void *host, void *device,
2009 const int width, const int height,
2010 const int rowpitch)
2011 {
2012 if(!darktable.opencl->inited || devid < 0) return -1;
2013 const size_t origin[] = { 0, 0, 0 };
2014 const size_t region[] = { width, height, 1 };
2015 // non-blocking.
2016 return dt_opencl_read_host_from_device_raw(devid, host, device, origin, region, rowpitch, CL_FALSE);
2017 }
2018
2019
dt_opencl_read_host_from_device_raw(const int devid,void * host,void * device,const size_t * origin,const size_t * region,const int rowpitch,const int blocking)2020 int dt_opencl_read_host_from_device_raw(const int devid, void *host, void *device, const size_t *origin,
2021 const size_t *region, const int rowpitch, const int blocking)
2022 {
2023 if(!darktable.opencl->inited) return -1;
2024
2025 cl_event *eventp = dt_opencl_events_get_slot(devid, "[Read Image (from device to host)]");
2026
2027 return (darktable.opencl->dlocl->symbols->dt_clEnqueueReadImage)(darktable.opencl->dev[devid].cmd_queue,
2028 device, blocking, origin, region, rowpitch,
2029 0, host, 0, NULL, eventp);
2030 }
2031
dt_opencl_write_host_to_device(const int devid,void * host,void * device,const int width,const int height,const int bpp)2032 int dt_opencl_write_host_to_device(const int devid, void *host, void *device, const int width,
2033 const int height, const int bpp)
2034 {
2035 return dt_opencl_write_host_to_device_rowpitch(devid, host, device, width, height, width * bpp);
2036 }
2037
dt_opencl_write_host_to_device_rowpitch(const int devid,void * host,void * device,const int width,const int height,const int rowpitch)2038 int dt_opencl_write_host_to_device_rowpitch(const int devid, void *host, void *device, const int width,
2039 const int height, const int rowpitch)
2040 {
2041 if(!darktable.opencl->inited || devid < 0) return -1;
2042 const size_t origin[] = { 0, 0, 0 };
2043 const size_t region[] = { width, height, 1 };
2044 // blocking.
2045 return dt_opencl_write_host_to_device_raw(devid, host, device, origin, region, rowpitch, CL_TRUE);
2046 }
2047
dt_opencl_write_host_to_device_non_blocking(const int devid,void * host,void * device,const int width,const int height,const int bpp)2048 int dt_opencl_write_host_to_device_non_blocking(const int devid, void *host, void *device, const int width,
2049 const int height, const int bpp)
2050 {
2051 return dt_opencl_write_host_to_device_rowpitch_non_blocking(devid, host, device, width, height, width * bpp);
2052 }
2053
dt_opencl_write_host_to_device_rowpitch_non_blocking(const int devid,void * host,void * device,const int width,const int height,const int rowpitch)2054 int dt_opencl_write_host_to_device_rowpitch_non_blocking(const int devid, void *host, void *device,
2055 const int width, const int height,
2056 const int rowpitch)
2057 {
2058 if(!darktable.opencl->inited || devid < 0) return -1;
2059 const size_t origin[] = { 0, 0, 0 };
2060 const size_t region[] = { width, height, 1 };
2061 // non-blocking.
2062 return dt_opencl_write_host_to_device_raw(devid, host, device, origin, region, rowpitch, CL_FALSE);
2063 }
2064
dt_opencl_write_host_to_device_raw(const int devid,void * host,void * device,const size_t * origin,const size_t * region,const int rowpitch,const int blocking)2065 int dt_opencl_write_host_to_device_raw(const int devid, void *host, void *device, const size_t *origin,
2066 const size_t *region, const int rowpitch, const int blocking)
2067 {
2068 if(!darktable.opencl->inited) return -1;
2069
2070 cl_event *eventp = dt_opencl_events_get_slot(devid, "[Write Image (from host to device)]");
2071
2072 return (darktable.opencl->dlocl->symbols->dt_clEnqueueWriteImage)(darktable.opencl->dev[devid].cmd_queue,
2073 device, blocking, origin, region,
2074 rowpitch, 0, host, 0, NULL, eventp);
2075 }
2076
dt_opencl_enqueue_copy_image(const int devid,cl_mem src,cl_mem dst,size_t * orig_src,size_t * orig_dst,size_t * region)2077 int dt_opencl_enqueue_copy_image(const int devid, cl_mem src, cl_mem dst, size_t *orig_src, size_t *orig_dst,
2078 size_t *region)
2079 {
2080 if(!darktable.opencl->inited || devid < 0) return -1;
2081 cl_int err;
2082 cl_event *eventp = dt_opencl_events_get_slot(devid, "[Copy Image (on device)]");
2083 err = (darktable.opencl->dlocl->symbols->dt_clEnqueueCopyImage)(
2084 darktable.opencl->dev[devid].cmd_queue, src, dst, orig_src, orig_dst, region, 0, NULL, eventp);
2085 if(err != CL_SUCCESS) dt_print(DT_DEBUG_OPENCL, "[opencl copy_image] could not copy image: %d\n", err);
2086 return err;
2087 }
2088
dt_opencl_enqueue_copy_image_to_buffer(const int devid,cl_mem src_image,cl_mem dst_buffer,size_t * origin,size_t * region,size_t offset)2089 int dt_opencl_enqueue_copy_image_to_buffer(const int devid, cl_mem src_image, cl_mem dst_buffer,
2090 size_t *origin, size_t *region, size_t offset)
2091 {
2092 if(!darktable.opencl->inited) return -1;
2093 cl_int err;
2094 cl_event *eventp = dt_opencl_events_get_slot(devid, "[Copy Image to Buffer (on device)]");
2095 err = (darktable.opencl->dlocl->symbols->dt_clEnqueueCopyImageToBuffer)(
2096 darktable.opencl->dev[devid].cmd_queue, src_image, dst_buffer, origin, region, offset, 0, NULL, eventp);
2097 if(err != CL_SUCCESS)
2098 dt_print(DT_DEBUG_OPENCL, "[opencl copy_image_to_buffer] could not copy image: %d\n", err);
2099 return err;
2100 }
2101
dt_opencl_enqueue_copy_buffer_to_image(const int devid,cl_mem src_buffer,cl_mem dst_image,size_t offset,size_t * origin,size_t * region)2102 int dt_opencl_enqueue_copy_buffer_to_image(const int devid, cl_mem src_buffer, cl_mem dst_image,
2103 size_t offset, size_t *origin, size_t *region)
2104 {
2105 if(!darktable.opencl->inited) return -1;
2106 cl_int err;
2107 cl_event *eventp = dt_opencl_events_get_slot(devid, "[Copy Buffer to Image (on device)]");
2108 err = (darktable.opencl->dlocl->symbols->dt_clEnqueueCopyBufferToImage)(
2109 darktable.opencl->dev[devid].cmd_queue, src_buffer, dst_image, offset, origin, region, 0, NULL, eventp);
2110 if(err != CL_SUCCESS)
2111 dt_print(DT_DEBUG_OPENCL, "[opencl copy_buffer_to_image] could not copy buffer: %d\n", err);
2112 return err;
2113 }
2114
dt_opencl_enqueue_copy_buffer_to_buffer(const int devid,cl_mem src_buffer,cl_mem dst_buffer,size_t srcoffset,size_t dstoffset,size_t size)2115 int dt_opencl_enqueue_copy_buffer_to_buffer(const int devid, cl_mem src_buffer, cl_mem dst_buffer,
2116 size_t srcoffset, size_t dstoffset, size_t size)
2117 {
2118 if(!darktable.opencl->inited) return -1;
2119 cl_int err;
2120 cl_event *eventp = dt_opencl_events_get_slot(devid, "[Copy Buffer to Buffer (on device)]");
2121 err = (darktable.opencl->dlocl->symbols->dt_clEnqueueCopyBuffer)(darktable.opencl->dev[devid].cmd_queue,
2122 src_buffer, dst_buffer, srcoffset,
2123 dstoffset, size, 0, NULL, eventp);
2124 if(err != CL_SUCCESS)
2125 dt_print(DT_DEBUG_OPENCL, "[opencl copy_buffer_to_buffer] could not copy buffer: %d\n", err);
2126 return err;
2127 }
2128
dt_opencl_read_buffer_from_device(const int devid,void * host,void * device,const size_t offset,const size_t size,const int blocking)2129 int dt_opencl_read_buffer_from_device(const int devid, void *host, void *device, const size_t offset,
2130 const size_t size, const int blocking)
2131 {
2132 if(!darktable.opencl->inited) return -1;
2133
2134 cl_event *eventp = dt_opencl_events_get_slot(devid, "[Read Buffer (from device to host)]");
2135
2136 return (darktable.opencl->dlocl->symbols->dt_clEnqueueReadBuffer)(
2137 darktable.opencl->dev[devid].cmd_queue, device, blocking, offset, size, host, 0, NULL, eventp);
2138 }
2139
dt_opencl_write_buffer_to_device(const int devid,void * host,void * device,const size_t offset,const size_t size,const int blocking)2140 int dt_opencl_write_buffer_to_device(const int devid, void *host, void *device, const size_t offset,
2141 const size_t size, const int blocking)
2142 {
2143 if(!darktable.opencl->inited) return -1;
2144
2145 cl_event *eventp = dt_opencl_events_get_slot(devid, "[Write Buffer (from host to device)]");
2146
2147 return (darktable.opencl->dlocl->symbols->dt_clEnqueueWriteBuffer)(
2148 darktable.opencl->dev[devid].cmd_queue, device, blocking, offset, size, host, 0, NULL, eventp);
2149 }
2150
2151
dt_opencl_copy_host_to_device_constant(const int devid,const size_t size,void * host)2152 void *dt_opencl_copy_host_to_device_constant(const int devid, const size_t size, void *host)
2153 {
2154 if(!darktable.opencl->inited || devid < 0) return NULL;
2155 cl_int err;
2156 cl_mem dev = (darktable.opencl->dlocl->symbols->dt_clCreateBuffer)(
2157 darktable.opencl->dev[devid].context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, size, host, &err);
2158 if(err != CL_SUCCESS)
2159 dt_print(DT_DEBUG_OPENCL,
2160 "[opencl copy_host_to_device_constant] could not alloc buffer on device %d: %d\n", devid, err);
2161
2162 dt_opencl_memory_statistics(devid, dev, OPENCL_MEMORY_ADD);
2163
2164 return dev;
2165 }
2166
dt_opencl_copy_host_to_device(const int devid,void * host,const int width,const int height,const int bpp)2167 void *dt_opencl_copy_host_to_device(const int devid, void *host, const int width, const int height,
2168 const int bpp)
2169 {
2170 return dt_opencl_copy_host_to_device_rowpitch(devid, host, width, height, bpp, 0);
2171 }
2172
dt_opencl_copy_host_to_device_rowpitch(const int devid,void * host,const int width,const int height,const int bpp,const int rowpitch)2173 void *dt_opencl_copy_host_to_device_rowpitch(const int devid, void *host, const int width, const int height,
2174 const int bpp, const int rowpitch)
2175 {
2176 if(!darktable.opencl->inited || devid < 0) return NULL;
2177 cl_int err;
2178 cl_image_format fmt;
2179 // guess pixel format from bytes per pixel
2180 if(bpp == 4 * sizeof(float))
2181 fmt = (cl_image_format){ CL_RGBA, CL_FLOAT };
2182 else if(bpp == sizeof(float))
2183 fmt = (cl_image_format){ CL_R, CL_FLOAT };
2184 else if(bpp == sizeof(uint16_t))
2185 fmt = (cl_image_format){ CL_R, CL_UNSIGNED_INT16 };
2186 else
2187 return NULL;
2188
2189 // TODO: if fmt = uint16_t, blow up to 4xuint16_t and copy manually!
2190 cl_mem dev = (darktable.opencl->dlocl->symbols->dt_clCreateImage2D)(
2191 darktable.opencl->dev[devid].context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, &fmt, width, height,
2192 rowpitch, host, &err);
2193 if(err != CL_SUCCESS)
2194 dt_print(DT_DEBUG_OPENCL,
2195 "[opencl copy_host_to_device] could not alloc/copy img buffer on device %d: %d\n", devid, err);
2196
2197 dt_opencl_memory_statistics(devid, dev, OPENCL_MEMORY_ADD);
2198
2199 return dev;
2200 }
2201
2202
dt_opencl_release_mem_object(cl_mem mem)2203 void dt_opencl_release_mem_object(cl_mem mem)
2204 {
2205 if(!darktable.opencl->inited) return;
2206
2207 // the OpenCL specs are not absolutely clear if clReleaseMemObject(NULL) is a no-op. we take care of the
2208 // case in a centralized way at this place
2209 if(mem == NULL) return;
2210
2211 dt_opencl_memory_statistics(-1, mem, OPENCL_MEMORY_SUB);
2212
2213 (darktable.opencl->dlocl->symbols->dt_clReleaseMemObject)(mem);
2214 }
2215
dt_opencl_map_buffer(const int devid,cl_mem buffer,const int blocking,const int flags,size_t offset,size_t size)2216 void *dt_opencl_map_buffer(const int devid, cl_mem buffer, const int blocking, const int flags, size_t offset,
2217 size_t size)
2218 {
2219 if(!darktable.opencl->inited) return NULL;
2220 cl_int err;
2221 void *ptr;
2222 cl_event *eventp = dt_opencl_events_get_slot(devid, "[Map Buffer]");
2223 ptr = (darktable.opencl->dlocl->symbols->dt_clEnqueueMapBuffer)(
2224 darktable.opencl->dev[devid].cmd_queue, buffer, blocking, flags, offset, size, 0, NULL, eventp, &err);
2225 if(err != CL_SUCCESS) dt_print(DT_DEBUG_OPENCL, "[opencl map buffer] could not map buffer: %d\n", err);
2226 return ptr;
2227 }
2228
dt_opencl_unmap_mem_object(const int devid,cl_mem mem_object,void * mapped_ptr)2229 int dt_opencl_unmap_mem_object(const int devid, cl_mem mem_object, void *mapped_ptr)
2230 {
2231 if(!darktable.opencl->inited) return -1;
2232 cl_int err;
2233 cl_event *eventp = dt_opencl_events_get_slot(devid, "[Unmap Mem Object]");
2234 err = (darktable.opencl->dlocl->symbols->dt_clEnqueueUnmapMemObject)(
2235 darktable.opencl->dev[devid].cmd_queue, mem_object, mapped_ptr, 0, NULL, eventp);
2236 if(err != CL_SUCCESS)
2237 dt_print(DT_DEBUG_OPENCL, "[opencl unmap mem object] could not unmap mem object: %d\n", err);
2238 return err;
2239 }
2240
dt_opencl_alloc_device(const int devid,const int width,const int height,const int bpp)2241 void *dt_opencl_alloc_device(const int devid, const int width, const int height, const int bpp)
2242 {
2243 if(!darktable.opencl->inited || devid < 0) return NULL;
2244 cl_int err;
2245 cl_image_format fmt;
2246 // guess pixel format from bytes per pixel
2247 if(bpp == 4 * sizeof(float))
2248 fmt = (cl_image_format){ CL_RGBA, CL_FLOAT };
2249 else if(bpp == sizeof(float))
2250 fmt = (cl_image_format){ CL_R, CL_FLOAT };
2251 else if(bpp == sizeof(uint16_t))
2252 fmt = (cl_image_format){ CL_R, CL_UNSIGNED_INT16 };
2253 else
2254 return NULL;
2255
2256 cl_mem dev = (darktable.opencl->dlocl->symbols->dt_clCreateImage2D)(
2257 darktable.opencl->dev[devid].context, CL_MEM_READ_WRITE, &fmt, width, height, 0, NULL, &err);
2258 if(err != CL_SUCCESS)
2259 dt_print(DT_DEBUG_OPENCL, "[opencl alloc_device] could not alloc img buffer on device %d: %d\n", devid,
2260 err);
2261
2262 dt_opencl_memory_statistics(devid, dev, OPENCL_MEMORY_ADD);
2263
2264 return dev;
2265 }
2266
2267
dt_opencl_alloc_device_use_host_pointer(const int devid,const int width,const int height,const int bpp,const int rowpitch,void * host)2268 void *dt_opencl_alloc_device_use_host_pointer(const int devid, const int width, const int height,
2269 const int bpp, const int rowpitch, void *host)
2270 {
2271 if(!darktable.opencl->inited || devid < 0) return NULL;
2272 cl_int err;
2273 cl_image_format fmt;
2274 // guess pixel format from bytes per pixel
2275 if(bpp == 4 * sizeof(float))
2276 fmt = (cl_image_format){ CL_RGBA, CL_FLOAT };
2277 else if(bpp == sizeof(float))
2278 fmt = (cl_image_format){ CL_R, CL_FLOAT };
2279 else if(bpp == sizeof(uint16_t))
2280 fmt = (cl_image_format){ CL_R, CL_UNSIGNED_INT16 };
2281 else
2282 return NULL;
2283
2284 cl_mem dev = (darktable.opencl->dlocl->symbols->dt_clCreateImage2D)(
2285 darktable.opencl->dev[devid].context,
2286 CL_MEM_READ_WRITE | ((host == NULL) ? CL_MEM_ALLOC_HOST_PTR : CL_MEM_USE_HOST_PTR), &fmt, width, height,
2287 rowpitch, host, &err);
2288 if(err != CL_SUCCESS)
2289 dt_print(DT_DEBUG_OPENCL,
2290 "[opencl alloc_device_use_host_pointer] could not alloc img buffer on device %d: %d\n", devid,
2291 err);
2292
2293 dt_opencl_memory_statistics(devid, dev, OPENCL_MEMORY_ADD);
2294
2295 return dev;
2296 }
2297
2298
dt_opencl_alloc_device_buffer(const int devid,const size_t size)2299 void *dt_opencl_alloc_device_buffer(const int devid, const size_t size)
2300 {
2301 if(!darktable.opencl->inited) return NULL;
2302 cl_int err;
2303
2304 cl_mem buf = (darktable.opencl->dlocl->symbols->dt_clCreateBuffer)(darktable.opencl->dev[devid].context,
2305 CL_MEM_READ_WRITE, size, NULL, &err);
2306 if(err != CL_SUCCESS)
2307 dt_print(DT_DEBUG_OPENCL, "[opencl alloc_device_buffer] could not alloc buffer on device %d: %d\n", devid,
2308 err);
2309
2310 dt_opencl_memory_statistics(devid, buf, OPENCL_MEMORY_ADD);
2311
2312 return buf;
2313 }
2314
dt_opencl_alloc_device_buffer_with_flags(const int devid,const size_t size,const int flags)2315 void *dt_opencl_alloc_device_buffer_with_flags(const int devid, const size_t size, const int flags)
2316 {
2317 if(!darktable.opencl->inited) return NULL;
2318 cl_int err;
2319
2320 cl_mem buf = (darktable.opencl->dlocl->symbols->dt_clCreateBuffer)(darktable.opencl->dev[devid].context,
2321 flags, size, NULL, &err);
2322 if(err != CL_SUCCESS)
2323 dt_print(DT_DEBUG_OPENCL, "[opencl alloc_device_buffer] could not alloc buffer on device %d: %d\n", devid,
2324 err);
2325
2326 dt_opencl_memory_statistics(devid, buf, OPENCL_MEMORY_ADD);
2327
2328 return buf;
2329 }
2330
dt_opencl_get_mem_object_size(cl_mem mem)2331 size_t dt_opencl_get_mem_object_size(cl_mem mem)
2332 {
2333 cl_int err;
2334 size_t size;
2335 if(mem == NULL) return 0;
2336
2337 err = (darktable.opencl->dlocl->symbols->dt_clGetMemObjectInfo)(mem, CL_MEM_SIZE, sizeof(size), &size, NULL);
2338
2339 return (err == CL_SUCCESS) ? size : 0;
2340 }
2341
dt_opencl_get_mem_context_id(cl_mem mem)2342 int dt_opencl_get_mem_context_id(cl_mem mem)
2343 {
2344 cl_int err;
2345 cl_context context;
2346 if(mem == NULL) return -1;
2347
2348 err = (darktable.opencl->dlocl->symbols->dt_clGetMemObjectInfo)(mem, CL_MEM_CONTEXT, sizeof(context), &context, NULL);
2349 if(err != CL_SUCCESS)
2350 return -1;
2351
2352 for(int devid = 0; devid < darktable.opencl->num_devs; devid++)
2353 {
2354 if(darktable.opencl->dev[devid].context == context)
2355 return devid;
2356 }
2357
2358 return -1;
2359 }
2360
dt_opencl_get_image_width(cl_mem mem)2361 int dt_opencl_get_image_width(cl_mem mem)
2362 {
2363 cl_int err;
2364 size_t size;
2365 if(mem == NULL) return 0;
2366
2367 err = (darktable.opencl->dlocl->symbols->dt_clGetImageInfo)(mem, CL_IMAGE_WIDTH, sizeof(size), &size, NULL);
2368 if(size > INT_MAX) size = 0;
2369
2370 return (err == CL_SUCCESS) ? (int)size : 0;
2371 }
2372
dt_opencl_get_image_height(cl_mem mem)2373 int dt_opencl_get_image_height(cl_mem mem)
2374 {
2375 cl_int err;
2376 size_t size;
2377 if(mem == NULL) return 0;
2378
2379 err = (darktable.opencl->dlocl->symbols->dt_clGetImageInfo)(mem, CL_IMAGE_HEIGHT, sizeof(size), &size, NULL);
2380 if(size > INT_MAX) size = 0;
2381
2382 return (err == CL_SUCCESS) ? (int)size : 0;
2383 }
2384
dt_opencl_get_image_element_size(cl_mem mem)2385 int dt_opencl_get_image_element_size(cl_mem mem)
2386 {
2387 cl_int err;
2388 size_t size;
2389 if(mem == NULL) return 0;
2390
2391 err = (darktable.opencl->dlocl->symbols->dt_clGetImageInfo)(mem, CL_IMAGE_ELEMENT_SIZE, sizeof(size), &size,
2392 NULL);
2393 if(size > INT_MAX) size = 0;
2394
2395 return (err == CL_SUCCESS) ? (int)size : 0;
2396 }
2397
dt_opencl_memory_statistics(int devid,cl_mem mem,dt_opencl_memory_t action)2398 void dt_opencl_memory_statistics(int devid, cl_mem mem, dt_opencl_memory_t action)
2399 {
2400 if(!((darktable.unmuted & DT_DEBUG_MEMORY) && (darktable.unmuted & DT_DEBUG_OPENCL)))
2401 return;
2402
2403 if(devid < 0)
2404 devid = dt_opencl_get_mem_context_id(mem);
2405
2406 if(devid < 0)
2407 return;
2408
2409 if(action == OPENCL_MEMORY_ADD)
2410 darktable.opencl->dev[devid].memory_in_use += dt_opencl_get_mem_object_size(mem);
2411 else
2412 darktable.opencl->dev[devid].memory_in_use -= dt_opencl_get_mem_object_size(mem);
2413
2414 darktable.opencl->dev[devid].peak_memory = MAX(darktable.opencl->dev[devid].peak_memory,
2415 darktable.opencl->dev[devid].memory_in_use);
2416
2417 if(darktable.unmuted & DT_DEBUG_MEMORY)
2418 dt_print(DT_DEBUG_OPENCL,
2419 "[opencl memory] device %d: %zu bytes (%.1f MB) in use\n", devid, darktable.opencl->dev[devid].memory_in_use,
2420 (float)darktable.opencl->dev[devid].memory_in_use/(1024*1024));
2421 }
2422
2423 /** check if image size fit into limits given by OpenCL runtime */
dt_opencl_image_fits_device(const int devid,const size_t width,const size_t height,const unsigned bpp,const float factor,const size_t overhead)2424 int dt_opencl_image_fits_device(const int devid, const size_t width, const size_t height, const unsigned bpp,
2425 const float factor, const size_t overhead)
2426 {
2427 static float headroom = -1.0f;
2428
2429 if(!darktable.opencl->inited || devid < 0) return FALSE;
2430
2431 /* first time run */
2432 if(headroom < 0.0f)
2433 {
2434 headroom = dt_conf_get_float("opencl_memory_headroom") * 1024.0f * 1024.0f;
2435
2436 /* don't let the user play games with us */
2437 headroom = fmin((float)darktable.opencl->dev[devid].max_global_mem, fmax(headroom, 0.0f));
2438 dt_conf_set_int("opencl_memory_headroom", headroom / 1024 / 1024);
2439 }
2440
2441 float singlebuffer = (float)width * height * bpp;
2442 float total = factor * singlebuffer + overhead;
2443
2444 if(darktable.opencl->dev[devid].max_image_width < width
2445 || darktable.opencl->dev[devid].max_image_height < height)
2446 return FALSE;
2447
2448 if(darktable.opencl->dev[devid].max_mem_alloc < singlebuffer) return FALSE;
2449
2450 if(darktable.opencl->dev[devid].max_global_mem < total + headroom) return FALSE;
2451
2452 return TRUE;
2453 }
2454
2455
2456 /** round size to a multiple of the value given in config parameter opencl_size_roundup */
dt_opencl_roundup(int size)2457 int dt_opencl_roundup(int size)
2458 {
2459 static int roundup = -1;
2460
2461 /* first time run */
2462 if(roundup < 0)
2463 {
2464 roundup = dt_conf_get_int("opencl_size_roundup");
2465
2466 /* if not yet defined (or unsane), set a sane default */
2467 if(roundup <= 0)
2468 {
2469 roundup = 16;
2470 dt_conf_set_int("opencl_size_roundup", roundup);
2471 }
2472 }
2473
2474 return (size % roundup == 0 ? size : (size / roundup + 1) * roundup);
2475 }
2476
2477
2478 /** check if opencl is inited */
dt_opencl_is_inited(void)2479 int dt_opencl_is_inited(void)
2480 {
2481 return darktable.opencl->inited;
2482 }
2483
2484
2485 /** check if opencl is enabled */
dt_opencl_is_enabled(void)2486 int dt_opencl_is_enabled(void)
2487 {
2488 if(!darktable.opencl->inited) return FALSE;
2489 return darktable.opencl->enabled;
2490 }
2491
2492
2493 /** disable opencl */
dt_opencl_disable(void)2494 void dt_opencl_disable(void)
2495 {
2496 if(!darktable.opencl->inited) return;
2497 darktable.opencl->enabled = FALSE;
2498 dt_conf_set_bool("opencl", FALSE);
2499 }
2500
2501
2502 /** update enabled flag and profile with value from preferences, returns enabled flag */
dt_opencl_update_settings(void)2503 int dt_opencl_update_settings(void)
2504 {
2505 // FIXME: This pulls in prefs every time the pixelpipe runs. Instead have a callback for DT_SIGNAL_PREFERENCES_CHANGE?
2506 if(!darktable.opencl->inited) return FALSE;
2507 const int prefs = dt_conf_get_bool("opencl");
2508
2509 if(darktable.opencl->enabled != prefs)
2510 {
2511 darktable.opencl->enabled = prefs;
2512 darktable.opencl->stopped = 0;
2513 darktable.opencl->error_count = 0;
2514 dt_print(DT_DEBUG_OPENCL, "[opencl_update_enabled] enabled flag set to %s\n", prefs ? "ON" : "OFF");
2515 }
2516
2517 dt_opencl_scheduling_profile_t profile = dt_opencl_get_scheduling_profile();
2518
2519 if(darktable.opencl->scheduling_profile != profile)
2520 {
2521 char *pstr = dt_conf_get_string("opencl_scheduling_profile");
2522 dt_print(DT_DEBUG_OPENCL, "[opencl_update_scheduling_profile] scheduling profile set to %s\n", pstr);
2523 g_free(pstr);
2524 dt_opencl_apply_scheduling_profile(profile);
2525 }
2526
2527 dt_opencl_sync_cache_t sync = dt_opencl_get_sync_cache();
2528
2529 if(darktable.opencl->sync_cache != sync)
2530 {
2531 char *pstr = dt_conf_get_string("opencl_synch_cache");
2532 dt_print(DT_DEBUG_OPENCL, "[opencl_update_synch_cache] sync cache set to %s\n", pstr);
2533 g_free(pstr);
2534 darktable.opencl->sync_cache = sync;
2535 }
2536
2537 return (darktable.opencl->enabled && !darktable.opencl->stopped);
2538 }
2539
2540 /** read scheduling profile for config variables */
dt_opencl_get_scheduling_profile(void)2541 static dt_opencl_scheduling_profile_t dt_opencl_get_scheduling_profile(void)
2542 {
2543 char *pstr = dt_conf_get_string("opencl_scheduling_profile");
2544 if(!pstr) return OPENCL_PROFILE_DEFAULT;
2545
2546 dt_opencl_scheduling_profile_t profile = OPENCL_PROFILE_DEFAULT;
2547
2548 if(!strcmp(pstr, "multiple GPUs"))
2549 profile = OPENCL_PROFILE_MULTIPLE_GPUS;
2550 else if(!strcmp(pstr, "very fast GPU"))
2551 profile = OPENCL_PROFILE_VERYFAST_GPU;
2552
2553 g_free(pstr);
2554
2555 return profile;
2556 }
2557
2558 /** read config of when/if to synch to cache */
dt_opencl_get_sync_cache(void)2559 static dt_opencl_sync_cache_t dt_opencl_get_sync_cache(void)
2560 {
2561 char *pstr = dt_conf_get_string("opencl_synch_cache");
2562 if(!pstr) return OPENCL_SYNC_ACTIVE_MODULE;
2563
2564 dt_opencl_sync_cache_t sync = OPENCL_SYNC_ACTIVE_MODULE;
2565
2566 if(!strcmp(pstr, "true"))
2567 sync = OPENCL_SYNC_TRUE;
2568 else if(!strcmp(pstr, "false"))
2569 sync = OPENCL_SYNC_FALSE;
2570
2571 g_free(pstr);
2572
2573 return sync;
2574 }
2575
2576 /** set opencl specific synchronization timeout */
dt_opencl_set_synchronization_timeout(int value)2577 static void dt_opencl_set_synchronization_timeout(int value)
2578 {
2579 darktable.opencl->opencl_synchronization_timeout = value;
2580 dt_print(DT_DEBUG_OPENCL, "[opencl_synchronization_timeout] synchronization timeout set to %d\n", value);
2581 }
2582
2583 /** adjust opencl subsystem according to scheduling profile */
dt_opencl_apply_scheduling_profile(dt_opencl_scheduling_profile_t profile)2584 static void dt_opencl_apply_scheduling_profile(dt_opencl_scheduling_profile_t profile)
2585 {
2586 char *str;
2587
2588 dt_pthread_mutex_lock(&darktable.opencl->lock);
2589 darktable.opencl->scheduling_profile = profile;
2590
2591 switch(profile)
2592 {
2593 case OPENCL_PROFILE_MULTIPLE_GPUS:
2594 dt_opencl_update_priorities("*/*/*/*/*");
2595 dt_opencl_set_synchronization_timeout(20);
2596 break;
2597 case OPENCL_PROFILE_VERYFAST_GPU:
2598 dt_opencl_update_priorities("+*/+*/+*/+*/+*");
2599 dt_opencl_set_synchronization_timeout(0);
2600 break;
2601 case OPENCL_PROFILE_DEFAULT:
2602 default:
2603 str = dt_conf_get_string("opencl_device_priority");
2604 dt_opencl_update_priorities(str);
2605 g_free(str);
2606 dt_opencl_set_synchronization_timeout(dt_conf_get_int("pixelpipe_synchronization_timeout"));
2607 break;
2608 }
2609 dt_pthread_mutex_unlock(&darktable.opencl->lock);
2610 }
2611
2612 /** get global memory of device */
dt_opencl_get_max_global_mem(const int devid)2613 cl_ulong dt_opencl_get_max_global_mem(const int devid)
2614 {
2615 if(!darktable.opencl->inited || devid < 0) return 0;
2616 return darktable.opencl->dev[devid].max_global_mem;
2617 }
2618
2619
2620 /** the following eventlist functions assume that affected structures are locked upstream */
2621
2622 /** get next free slot in eventlist (and manage size of eventlist) */
dt_opencl_events_get_slot(const int devid,const char * tag)2623 cl_event *dt_opencl_events_get_slot(const int devid, const char *tag)
2624 {
2625 dt_opencl_t *cl = darktable.opencl;
2626 if(!cl->inited || devid < 0) return NULL;
2627 if(!cl->use_events) return NULL;
2628
2629 static const cl_event zeroevent[1]; // implicitly initialized to zero
2630 cl_event **eventlist = &(cl->dev[devid].eventlist);
2631 dt_opencl_eventtag_t **eventtags = &(cl->dev[devid].eventtags);
2632 int *numevents = &(cl->dev[devid].numevents);
2633 int *maxevents = &(cl->dev[devid].maxevents);
2634 int *eventsconsolidated = &(cl->dev[devid].eventsconsolidated);
2635 int *lostevents = &(cl->dev[devid].lostevents);
2636 int *totalevents = &(cl->dev[devid].totalevents);
2637 int *totallost = &(cl->dev[devid].totallost);
2638
2639 // if first time called: allocate initial buffers
2640 if(*eventlist == NULL)
2641 {
2642 int newevents = DT_OPENCL_EVENTLISTSIZE;
2643 *eventlist = calloc(newevents, sizeof(cl_event));
2644 *eventtags = calloc(newevents, sizeof(dt_opencl_eventtag_t));
2645 if(!*eventlist || !*eventtags)
2646 {
2647 free(*eventlist);
2648 free(*eventtags);
2649 *eventlist = NULL;
2650 *eventtags = NULL;
2651 return NULL;
2652 }
2653 *maxevents = newevents;
2654 }
2655
2656 // check if currently highest event slot was actually consumed. If not use it again
2657 if(*numevents > 0 && !memcmp((*eventlist) + *numevents - 1, zeroevent, sizeof(cl_event)))
2658 {
2659 (*lostevents)++;
2660 (*totallost)++;
2661 if(tag != NULL)
2662 {
2663 g_strlcpy((*eventtags)[*numevents - 1].tag, tag, DT_OPENCL_EVENTNAMELENGTH);
2664 }
2665 else
2666 {
2667 (*eventtags)[*numevents - 1].tag[0] = '\0';
2668 }
2669
2670 (*totalevents)++;
2671 return (*eventlist) + *numevents - 1;
2672 }
2673
2674 // check if we would exceed the number of available event handles. In that case first flush existing handles
2675 if(*numevents - *eventsconsolidated + 1 > darktable.opencl->number_event_handles)
2676 (void)dt_opencl_events_flush(devid, FALSE);
2677
2678
2679 // if no more space left in eventlist: grow buffer
2680 if(*numevents == *maxevents)
2681 {
2682 int newevents = *maxevents + DT_OPENCL_EVENTLISTSIZE;
2683 cl_event *neweventlist = calloc(newevents, sizeof(cl_event));
2684 dt_opencl_eventtag_t *neweventtags = calloc(newevents, sizeof(dt_opencl_eventtag_t));
2685 if(!neweventlist || !neweventtags)
2686 {
2687 free(neweventlist);
2688 free(neweventtags);
2689 return NULL;
2690 }
2691 memcpy(neweventlist, *eventlist, sizeof(cl_event) * *maxevents);
2692 memcpy(neweventtags, *eventtags, sizeof(dt_opencl_eventtag_t) * *maxevents);
2693 free(*eventlist);
2694 free(*eventtags);
2695 *eventlist = neweventlist;
2696 *eventtags = neweventtags;
2697 *maxevents = newevents;
2698 }
2699
2700 // init next event slot and return it
2701 (*numevents)++;
2702 memcpy((*eventlist) + *numevents - 1, zeroevent, sizeof(cl_event));
2703 if(tag != NULL)
2704 {
2705 g_strlcpy((*eventtags)[*numevents - 1].tag, tag, DT_OPENCL_EVENTNAMELENGTH);
2706 }
2707 else
2708 {
2709 (*eventtags)[*numevents - 1].tag[0] = '\0';
2710 }
2711
2712 (*totalevents)++;
2713 return (*eventlist) + *numevents - 1;
2714 }
2715
2716
2717 /** reset eventlist to empty state */
dt_opencl_events_reset(const int devid)2718 void dt_opencl_events_reset(const int devid)
2719 {
2720 dt_opencl_t *cl = darktable.opencl;
2721 if(!cl->inited || devid < 0) return;
2722 if(!cl->use_events) return;
2723
2724 cl_event **eventlist = &(cl->dev[devid].eventlist);
2725 dt_opencl_eventtag_t **eventtags = &(cl->dev[devid].eventtags);
2726 int *numevents = &(cl->dev[devid].numevents);
2727 int *maxevents = &(cl->dev[devid].maxevents);
2728 int *eventsconsolidated = &(cl->dev[devid].eventsconsolidated);
2729 int *lostevents = &(cl->dev[devid].lostevents);
2730 cl_int *summary = &(cl->dev[devid].summary);
2731
2732 if(*eventlist == NULL || *numevents == 0) return; // nothing to do
2733
2734 // release all remaining events in eventlist, not to waste resources
2735 for(int k = *eventsconsolidated; k < *numevents; k++)
2736 {
2737 (cl->dlocl->symbols->dt_clReleaseEvent)((*eventlist)[k]);
2738 }
2739
2740 memset(*eventtags, 0, sizeof(dt_opencl_eventtag_t) * *maxevents);
2741 *numevents = 0;
2742 *eventsconsolidated = 0;
2743 *lostevents = 0;
2744 *summary = CL_COMPLETE;
2745 return;
2746 }
2747
2748
2749 /** Wait for events in eventlist to terminate -> this is a blocking synchronization point!
2750 Does not flush eventlist. Side effect: might adjust numevents. */
dt_opencl_events_wait_for(const int devid)2751 void dt_opencl_events_wait_for(const int devid)
2752 {
2753 dt_opencl_t *cl = darktable.opencl;
2754 if(!cl->inited || devid < 0) return;
2755 if(!cl->use_events) return;
2756
2757 static const cl_event zeroevent[1]; // implicitly initialized to zero
2758 cl_event **eventlist = &(cl->dev[devid].eventlist);
2759 int *numevents = &(cl->dev[devid].numevents);
2760 int *lostevents = &(cl->dev[devid].lostevents);
2761 int *totallost = &(cl->dev[devid].totallost);
2762 int *eventsconsolidated = &(cl->dev[devid].eventsconsolidated);
2763
2764 if(*eventlist == NULL || *numevents == 0) return; // nothing to do
2765
2766 // check if last event slot was actually used and correct numevents if needed
2767 if(!memcmp((*eventlist) + *numevents - 1, zeroevent, sizeof(cl_event)))
2768 {
2769 (*numevents)--;
2770 (*lostevents)++;
2771 (*totallost)++;
2772 }
2773
2774 if(*numevents == *eventsconsolidated) return; // nothing to do
2775
2776 assert(*numevents > *eventsconsolidated);
2777
2778 // now wait for all remaining events to terminate
2779 // Risk: might never return in case of OpenCL blocks or endless loops
2780 // TODO: run clWaitForEvents in separate thread and implement watchdog timer
2781 (cl->dlocl->symbols->dt_clWaitForEvents)(*numevents - *eventsconsolidated,
2782 (*eventlist) + *eventsconsolidated);
2783
2784 return;
2785 }
2786
2787
2788 /** Wait for events in eventlist to terminate, check for return status and profiling
2789 info of events.
2790 If "reset" is TRUE report summary info (would be CL_COMPLETE or last error code) and
2791 print profiling info if needed.
2792 If "reset" is FALSE just store info (success value, profiling) from terminated events
2793 and release events for re-use by OpenCL driver. */
dt_opencl_events_flush(const int devid,const int reset)2794 cl_int dt_opencl_events_flush(const int devid, const int reset)
2795 {
2796 dt_opencl_t *cl = darktable.opencl;
2797 if(!cl->inited || devid < 0) return FALSE;
2798 if(!cl->use_events) return FALSE;
2799
2800 cl_event **eventlist = &(cl->dev[devid].eventlist);
2801 dt_opencl_eventtag_t **eventtags = &(cl->dev[devid].eventtags);
2802 int *numevents = &(cl->dev[devid].numevents);
2803 int *eventsconsolidated = &(cl->dev[devid].eventsconsolidated);
2804 int *lostevents = &(cl->dev[devid].lostevents);
2805 int *totalsuccess = &(cl->dev[devid].totalsuccess);
2806
2807 cl_int *summary = &(cl->dev[devid].summary);
2808
2809 if(*eventlist == NULL || *numevents == 0) return CL_COMPLETE; // nothing to do, no news is good news
2810
2811 // Wait for command queue to terminate (side effect: might adjust *numevents)
2812 dt_opencl_events_wait_for(devid);
2813
2814 // now check return status and profiling data of all newly terminated events
2815 for(int k = *eventsconsolidated; k < *numevents; k++)
2816 {
2817 cl_int err;
2818 char *tag = (*eventtags)[k].tag;
2819 cl_int *retval = &((*eventtags)[k].retval);
2820
2821 // get return value of event
2822 err = (cl->dlocl->symbols->dt_clGetEventInfo)((*eventlist)[k], CL_EVENT_COMMAND_EXECUTION_STATUS,
2823 sizeof(cl_int), retval, NULL);
2824 if(err != CL_SUCCESS)
2825 {
2826 dt_print(DT_DEBUG_OPENCL, "[opencl_events_flush] could not get event info for '%s': %d\n",
2827 tag[0] == '\0' ? "<?>" : tag, err);
2828 }
2829 else if(*retval != CL_COMPLETE)
2830 {
2831 dt_print(DT_DEBUG_OPENCL, "[opencl_events_flush] execution of '%s' %s: %d\n",
2832 tag[0] == '\0' ? "<?>" : tag, *retval == CL_COMPLETE ? "was successful" : "failed", *retval);
2833 *summary = *retval;
2834 }
2835 else
2836 (*totalsuccess)++;
2837
2838 if(darktable.unmuted & DT_DEBUG_PERF)
2839 {
2840 // get profiling info of event (only if darktable was called with '-d perf')
2841 cl_ulong start;
2842 cl_ulong end;
2843 cl_int errs = (cl->dlocl->symbols->dt_clGetEventProfilingInfo)(
2844 (*eventlist)[k], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
2845 cl_int erre = (cl->dlocl->symbols->dt_clGetEventProfilingInfo)((*eventlist)[k], CL_PROFILING_COMMAND_END,
2846 sizeof(cl_ulong), &end, NULL);
2847 if(errs == CL_SUCCESS && erre == CL_SUCCESS)
2848 {
2849 (*eventtags)[k].timelapsed = end - start;
2850 }
2851 else
2852 {
2853 (*eventtags)[k].timelapsed = 0;
2854 (*lostevents)++;
2855 }
2856 }
2857 else
2858 (*eventtags)[k].timelapsed = 0;
2859
2860 // finally release event to be re-used by driver
2861 (cl->dlocl->symbols->dt_clReleaseEvent)((*eventlist)[k]);
2862 (*eventsconsolidated)++;
2863 }
2864
2865 cl_int result = *summary;
2866
2867 // do we want to get rid of all stored info?
2868 if(reset)
2869 {
2870 // output profiling info if wanted
2871 if(darktable.unmuted & DT_DEBUG_PERF) dt_opencl_events_profiling(devid, 1);
2872
2873 // reset eventlist structures to empty state
2874 dt_opencl_events_reset(devid);
2875 }
2876
2877 return result == CL_COMPLETE ? 0 : result;
2878 }
2879
2880
2881 /** display OpenCL profiling information. If "aggregated" is TRUE, try to generate summarized info for each
2882 * kernel */
dt_opencl_events_profiling(const int devid,const int aggregated)2883 void dt_opencl_events_profiling(const int devid, const int aggregated)
2884 {
2885 dt_opencl_t *cl = darktable.opencl;
2886 if(!cl->inited || devid < 0) return;
2887 if(!cl->use_events) return;
2888
2889 cl_event **eventlist = &(cl->dev[devid].eventlist);
2890 dt_opencl_eventtag_t **eventtags = &(cl->dev[devid].eventtags);
2891 int *numevents = &(cl->dev[devid].numevents);
2892 int *eventsconsolidated = &(cl->dev[devid].eventsconsolidated);
2893 int *lostevents = &(cl->dev[devid].lostevents);
2894
2895 if(*eventlist == NULL || *numevents == 0 || *eventtags == NULL || *eventsconsolidated == 0)
2896 return; // nothing to do
2897
2898 char **tags = malloc(sizeof(char *) * (*eventsconsolidated + 1));
2899 float *timings = malloc(sizeof(float) * (*eventsconsolidated + 1));
2900 int items = 1;
2901 tags[0] = "";
2902 timings[0] = 0.0f;
2903
2904 // get profiling info and arrange it
2905 for(int k = 0; k < *eventsconsolidated; k++)
2906 {
2907 // if aggregated is TRUE, try to sum up timings for multiple runs of each kernel
2908 if(aggregated)
2909 {
2910 // linear search: this is not efficient at all but acceptable given the limited
2911 // number of events (ca. 10 - 20)
2912 int tagfound = -1;
2913 for(int i = 0; i < items; i++)
2914 {
2915 if(!strncmp(tags[i], (*eventtags)[k].tag, DT_OPENCL_EVENTNAMELENGTH))
2916 {
2917 tagfound = i;
2918 break;
2919 }
2920 }
2921
2922 if(tagfound >= 0) // tag was already detected before
2923 {
2924 // sum up timings
2925 timings[tagfound] += (*eventtags)[k].timelapsed * 1e-9;
2926 }
2927 else // tag is new
2928 {
2929 // make new entry
2930 items++;
2931 tags[items - 1] = (*eventtags)[k].tag;
2932 timings[items - 1] = (*eventtags)[k].timelapsed * 1e-9;
2933 }
2934 }
2935
2936 else // no aggregated info wanted -> arrange event by event
2937 {
2938 items++;
2939 tags[items - 1] = (*eventtags)[k].tag;
2940 timings[items - 1] = (*eventtags)[k].timelapsed * 1e-9;
2941 }
2942 }
2943
2944 // now display profiling info
2945 dt_print(DT_DEBUG_OPENCL,
2946 "[opencl_profiling] profiling device %d ('%s'):\n", devid,
2947 cl->dev[devid].name);
2948
2949 float total = 0.0f;
2950 for(int i = 1; i < items; i++)
2951 {
2952 dt_print(DT_DEBUG_OPENCL, "[opencl_profiling] spent %7.4f seconds in %s\n", (double)timings[i],
2953 tags[i][0] == '\0' ? "<?>" : tags[i]);
2954 total += timings[i];
2955 }
2956 // aggregated timing info for items without tag (if any)
2957 if(timings[0] != 0.0f)
2958 {
2959 dt_print(DT_DEBUG_OPENCL, "[opencl_profiling] spent %7.4f seconds (unallocated)\n", (double)timings[0]);
2960 total += timings[0];
2961 }
2962
2963 dt_print(DT_DEBUG_OPENCL,
2964 "[opencl_profiling] spent %7.4f seconds totally in command queue (with %d event%s missing)\n",
2965 (double)total, *lostevents, *lostevents == 1 ? "" : "s");
2966
2967 free(timings);
2968 free(tags);
2969
2970 return;
2971 }
2972
nextpow2(int n)2973 static int nextpow2(int n)
2974 {
2975 int k = 1;
2976 while (k < n)
2977 k <<= 1;
2978 return k;
2979 }
2980
2981 // utility function to calculate optimal work group dimensions for a given kernel
2982 // taking device specific restrictions and local memory limitations into account
dt_opencl_local_buffer_opt(const int devid,const int kernel,dt_opencl_local_buffer_t * factors)2983 int dt_opencl_local_buffer_opt(const int devid, const int kernel, dt_opencl_local_buffer_t *factors)
2984 {
2985 dt_opencl_t *cl = darktable.opencl;
2986 if(!cl->inited || devid < 0) return FALSE;
2987
2988 size_t maxsizes[3] = { 0 }; // the maximum dimensions for a work group
2989 size_t workgroupsize = 0; // the maximum number of items in a work group
2990 unsigned long localmemsize = 0; // the maximum amount of local memory we can use
2991 size_t kernelworkgroupsize = 0; // the maximum amount of items in work group for this kernel
2992
2993 int *blocksizex = &factors->sizex;
2994 int *blocksizey = &factors->sizey;
2995
2996 // initial values must be supplied in sizex and sizey.
2997 // we make sure that these are a power of 2 and lie within reasonable limits.
2998 *blocksizex = CLAMP(nextpow2(*blocksizex), 1, 1 << 16);
2999 *blocksizey = CLAMP(nextpow2(*blocksizey), 1, 1 << 16);
3000
3001 if(dt_opencl_get_work_group_limits(devid, maxsizes, &workgroupsize, &localmemsize) == CL_SUCCESS
3002 && dt_opencl_get_kernel_work_group_size(devid, kernel, &kernelworkgroupsize) == CL_SUCCESS)
3003 {
3004 while(maxsizes[0] < *blocksizex || maxsizes[1] < *blocksizey
3005 || localmemsize < ((factors->xfactor * (*blocksizex) + factors->xoffset) *
3006 (factors->yfactor * (*blocksizey) + factors->yoffset)) * factors->cellsize + factors->overhead
3007 || workgroupsize < (size_t)(*blocksizex) * (*blocksizey) || kernelworkgroupsize < (size_t)(*blocksizex) * (*blocksizey))
3008 {
3009 if(*blocksizex == 1 && *blocksizey == 1) return FALSE;
3010
3011 if(*blocksizex > *blocksizey)
3012 *blocksizex >>= 1;
3013 else
3014 *blocksizey >>= 1;
3015 }
3016 }
3017 else
3018 {
3019 dt_print(DT_DEBUG_OPENCL,
3020 "[opencl_demosaic] can not identify resource limits for device %d\n", devid);
3021 return FALSE;
3022 }
3023
3024 return TRUE;
3025 }
3026
3027
3028 #endif
3029
3030 // modelines: These editor modelines have been set for all relevant files by tools/update_modelines.sh
3031 // vim: shiftwidth=2 expandtab tabstop=2 cindent
3032 // kate: tab-indents: off; indent-width 2; replace-tabs on; indent-mode cstyle; remove-trailing-spaces modified;
3033