1 /*
2     This file is part of darktable,
3     Copyright (C) 2010-2021 darktable developers.
4 
5     darktable is free software: you can redistribute it and/or modify
6     it under the terms of the GNU General Public License as published by
7     the Free Software Foundation, either version 3 of the License, or
8     (at your option) any later version.
9 
10     darktable is distributed in the hope that it will be useful,
11     but WITHOUT ANY WARRANTY; without even the implied warranty of
12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13     GNU General Public License for more details.
14 
15     You should have received a copy of the GNU General Public License
16     along with darktable.  If not, see <http://www.gnu.org/licenses/>.
17 */
18 
19 #ifdef HAVE_OPENCL
20 
21 #include "common/opencl.h"
22 #include "common/bilateralcl.h"
23 #include "common/darktable.h"
24 #include "common/dlopencl.h"
25 #include "common/dwt.h"
26 #include "common/file_location.h"
27 #include "common/gaussian.h"
28 #include "common/guided_filter.h"
29 #include "common/heal.h"
30 #include "common/interpolation.h"
31 #include "common/locallaplaciancl.h"
32 #include "common/nvidia_gpus.h"
33 #include "common/opencl_drivers_blacklist.h"
34 #include "common/tea.h"
35 #include "control/conf.h"
36 #include "control/control.h"
37 #include "develop/blend.h"
38 #include "develop/pixelpipe.h"
39 
40 #include <assert.h>
41 #include <locale.h>
42 #include <stdio.h>
43 #include <string.h>
44 #include <strings.h>
45 
46 #include <ctype.h>
47 #include <errno.h>
48 #include <libgen.h>
49 #include <sys/stat.h>
50 #include <zlib.h>
51 
52 static const char *dt_opencl_get_vendor_by_id(unsigned int id);
53 static float dt_opencl_benchmark_gpu(const int devid, const size_t width, const size_t height, const int count, const float sigma);
54 static float dt_opencl_benchmark_cpu(const size_t width, const size_t height, const int count, const float sigma);
55 static char *_ascii_str_canonical(const char *in, char *out, int maxlen);
56 /** parse a single token of priority string and store priorities in priority_list */
57 static void dt_opencl_priority_parse(dt_opencl_t *cl, char *configstr, int *priority_list, int *mandatory);
58 /** parse a complete priority string */
59 static void dt_opencl_priorities_parse(dt_opencl_t *cl, const char *configstr);
60 /** set device priorities according to config string */
61 static void dt_opencl_update_priorities(const char *configstr);
62 /** read scheduling profile for config variables */
63 static dt_opencl_scheduling_profile_t dt_opencl_get_scheduling_profile(void);
64 /** read config of when/if to sync to cache */
65 static dt_opencl_sync_cache_t dt_opencl_get_sync_cache(void);
66 /** adjust opencl subsystem according to scheduling profile */
67 static void dt_opencl_apply_scheduling_profile(dt_opencl_scheduling_profile_t profile);
68 /** set opencl specific synchronization timeout */
69 static void dt_opencl_set_synchronization_timeout(int value);
70 
71 
dt_opencl_get_device_info(dt_opencl_t * cl,cl_device_id device,cl_device_info param_name,void ** param_value,size_t * param_value_size)72 int dt_opencl_get_device_info(dt_opencl_t *cl, cl_device_id device, cl_device_info param_name, void **param_value,
73                               size_t *param_value_size)
74 {
75   cl_int err;
76 
77   *param_value_size = SIZE_MAX;
78 
79   // 1. figure out how much memory is needed
80   err = (cl->dlocl->symbols->dt_clGetDeviceInfo)(device, param_name, 0, NULL, param_value_size);
81   if(err != CL_SUCCESS)
82   {
83     dt_print(DT_DEBUG_OPENCL,
84              "[dt_opencl_get_device_info] could not query the actual size in bytes of info %d: %d\n", param_name,
85              err);
86     goto error;
87   }
88 
89   // 2. did we /actually/ get the size?
90   if(*param_value_size == SIZE_MAX || *param_value_size == 0)
91   {
92     // both of these sizes make no sense. either i failed to parse spec, or opencl implementation bug?
93     dt_print(DT_DEBUG_OPENCL,
94              "[dt_opencl_get_device_info] ERROR: no size returned, or zero size returned for data %d: %zu\n",
95              param_name, *param_value_size);
96     err = CL_INVALID_VALUE; // FIXME: anything better?
97     goto error;
98   }
99 
100   // 3. make sure that *param_value points to big-enough memory block
101   {
102     void *ptr = realloc(*param_value, *param_value_size);
103     if(!ptr)
104     {
105       dt_print(DT_DEBUG_OPENCL,
106                "[dt_opencl_get_device_info] memory allocation failed! tried to allocate %zu bytes for data %d: %d",
107                *param_value_size, param_name, err);
108       err = CL_OUT_OF_HOST_MEMORY;
109       goto error;
110     }
111 
112     // allocation succeeded, update pointer.
113     *param_value = ptr;
114   }
115 
116   // 4. actually get the value
117   err = (cl->dlocl->symbols->dt_clGetDeviceInfo)(device, param_name, *param_value_size, *param_value, NULL);
118   if(err != CL_SUCCESS)
119   {
120     dt_print(DT_DEBUG_OPENCL, "[dt_opencl_get_device_info] could not query info %d: %d\n", param_name, err);
121     goto error;
122   }
123 
124   return CL_SUCCESS;
125 
126 error:
127   free(*param_value);
128   *param_value = NULL;
129   *param_value_size = 0;
130   return err;
131 }
132 
133 // returns 0 if all ok
134 // returns 1 if we failed hard, and need to skip opencl initialization
135 // returns -1 if we failed to init this device
dt_opencl_device_init(dt_opencl_t * cl,const int dev,cl_device_id * devices,const int k,const int opencl_memory_requirement)136 static int dt_opencl_device_init(dt_opencl_t *cl, const int dev, cl_device_id *devices, const int k,
137                                  const int opencl_memory_requirement)
138 {
139   int res;
140   cl_int err;
141 
142   memset(cl->dev[dev].program, 0x0, sizeof(cl_program) * DT_OPENCL_MAX_PROGRAMS);
143   memset(cl->dev[dev].program_used, 0x0, sizeof(int) * DT_OPENCL_MAX_PROGRAMS);
144   memset(cl->dev[dev].kernel, 0x0, sizeof(cl_kernel) * DT_OPENCL_MAX_KERNELS);
145   memset(cl->dev[dev].kernel_used, 0x0, sizeof(int) * DT_OPENCL_MAX_KERNELS);
146   cl->dev[dev].eventlist = NULL;
147   cl->dev[dev].eventtags = NULL;
148   cl->dev[dev].numevents = 0;
149   cl->dev[dev].eventsconsolidated = 0;
150   cl->dev[dev].maxevents = 0;
151   cl->dev[dev].lostevents = 0;
152   cl->dev[dev].totalevents = 0;
153   cl->dev[dev].totalsuccess = 0;
154   cl->dev[dev].totallost = 0;
155   cl->dev[dev].summary = CL_COMPLETE;
156   cl->dev[dev].used_global_mem = 0;
157   cl->dev[dev].nvidia_sm_20 = 0;
158   cl->dev[dev].vendor = NULL;
159   cl->dev[dev].name = NULL;
160   cl->dev[dev].cname = NULL;
161   cl->dev[dev].options = NULL;
162   cl->dev[dev].memory_in_use = 0;
163   cl->dev[dev].peak_memory = 0;
164   cl_device_id devid = cl->dev[dev].devid = devices[k];
165 
166   char *infostr = NULL;
167   size_t infostr_size;
168 
169   char *cname = NULL;
170   size_t cname_size;
171 
172   char *options = NULL;
173 
174   char *vendor = NULL;
175   size_t vendor_size;
176 
177   char *driverversion = NULL;
178   size_t driverversion_size;
179 
180   char *deviceversion = NULL;
181   size_t deviceversion_size;
182 
183   size_t infoint;
184   size_t *infointtab = NULL;
185   cl_device_type type;
186   cl_bool image_support = 0;
187   cl_bool device_available = 0;
188   cl_uint vendor_id = 0;
189   cl_bool little_endian = 0;
190 
191   char *dtcache = calloc(PATH_MAX, sizeof(char));
192   char *cachedir = calloc(PATH_MAX, sizeof(char));
193   char *devname = calloc(1024, sizeof(char));
194   char *drvversion = calloc(1024, sizeof(char));
195 
196   char kerneldir[PATH_MAX] = { 0 };
197   char *filename = calloc(PATH_MAX, sizeof(char));
198   char *confentry = calloc(PATH_MAX, sizeof(char));
199   char *binname = calloc(PATH_MAX, sizeof(char));
200 
201   // test GPU availability, vendor, memory, image support etc:
202   (cl->dlocl->symbols->dt_clGetDeviceInfo)(devid, CL_DEVICE_AVAILABLE, sizeof(cl_bool), &device_available, NULL);
203 
204   err = dt_opencl_get_device_info(cl, devid, CL_DEVICE_VENDOR, (void **)&vendor, &vendor_size);
205   if(err != CL_SUCCESS)
206   {
207     dt_print(DT_DEBUG_OPENCL, "[opencl_init] could not get vendor name of device %d: %d\n", k, err);
208     res = -1;
209     goto end;
210   }
211 
212   (cl->dlocl->symbols->dt_clGetDeviceInfo)(devid, CL_DEVICE_VENDOR_ID, sizeof(cl_uint), &vendor_id, NULL);
213 
214   err = dt_opencl_get_device_info(cl, devid, CL_DEVICE_NAME, (void **)&infostr, &infostr_size);
215   if(err != CL_SUCCESS)
216   {
217     dt_print(DT_DEBUG_OPENCL, "[opencl_init] could not get device name of device %d: %d\n", k, err);
218     res = -1;
219     goto end;
220   }
221 
222   err = dt_opencl_get_device_info(cl, devid, CL_DRIVER_VERSION, (void **)&driverversion, &driverversion_size);
223   if(err != CL_SUCCESS)
224   {
225     dt_print(DT_DEBUG_OPENCL, "[opencl_init] could not get driver version of device %d `%s': %d\n", k, infostr, err);
226     res = -1;
227     goto end;
228   }
229 
230   err = dt_opencl_get_device_info(cl, devid, CL_DEVICE_VERSION, (void **)&deviceversion, &deviceversion_size);
231   if(err != CL_SUCCESS)
232   {
233     dt_print(DT_DEBUG_OPENCL, "[opencl_init] could not get device version of device %d `%s': %d\n", k, infostr, err);
234     res = -1;
235     goto end;
236   }
237 
238   (cl->dlocl->symbols->dt_clGetDeviceInfo)(devid, CL_DEVICE_TYPE, sizeof(cl_device_type), &type, NULL);
239   (cl->dlocl->symbols->dt_clGetDeviceInfo)(devid, CL_DEVICE_IMAGE_SUPPORT, sizeof(cl_bool), &image_support, NULL);
240   (cl->dlocl->symbols->dt_clGetDeviceInfo)(devid, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(size_t),
241                                            &(cl->dev[dev].max_image_height), NULL);
242   (cl->dlocl->symbols->dt_clGetDeviceInfo)(devid, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(size_t),
243                                            &(cl->dev[dev].max_image_width), NULL);
244   (cl->dlocl->symbols->dt_clGetDeviceInfo)(devid, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong),
245                                            &(cl->dev[dev].max_mem_alloc), NULL);
246   (cl->dlocl->symbols->dt_clGetDeviceInfo)(devid, CL_DEVICE_ENDIAN_LITTLE, sizeof(cl_bool), &little_endian, NULL);
247 
248 
249   cname_size = infostr_size;
250   cname = malloc(cname_size);
251   _ascii_str_canonical(infostr, cname, sizeof(cname_size));
252 
253   if(!strncasecmp(vendor, "NVIDIA", 6))
254   {
255     // very lame attempt to detect support for atomic float add in global memory.
256     // we need compute model sm_20, but let's try for all nvidia devices :(
257     cl->dev[dev].nvidia_sm_20 = dt_nvidia_gpu_supports_sm_20(infostr);
258     dt_print(DT_DEBUG_OPENCL, "[opencl_init] device %d `%s' %s sm_20 support.\n", k, infostr,
259              cl->dev[dev].nvidia_sm_20 ? "has" : "doesn't have");
260   }
261 
262   if(((type & CL_DEVICE_TYPE_CPU) == CL_DEVICE_TYPE_CPU) && !dt_conf_get_bool("opencl_use_cpu_devices"))
263   {
264     dt_print(DT_DEBUG_OPENCL, "[opencl_init] discarding CPU device %d `%s'.\n", k, infostr);
265     res = -1;
266     goto end;
267   }
268 
269   if(dt_opencl_check_driver_blacklist(deviceversion) && !dt_conf_get_bool("opencl_disable_drivers_blacklist"))
270   {
271     dt_print(DT_DEBUG_OPENCL, "[opencl_init] discarding device %d `%s' because the driver `%s' is blacklisted.\n",
272              k, infostr, deviceversion);
273     res = -1;
274     goto end;
275   }
276 
277   if(!device_available)
278   {
279     dt_print(DT_DEBUG_OPENCL, "[opencl_init] discarding device %d `%s' as it is not available.\n", k, infostr);
280     res = -1;
281     goto end;
282   }
283 
284   if(!image_support)
285   {
286     dt_print(DT_DEBUG_OPENCL,
287              "[opencl_init] discarding device %d `%s' - The OpenCL driver "
288              "doesn't provide image support. See also 'clinfo' output.\n",
289              k, infostr);
290     res = -1;
291     goto end;
292   }
293 
294   if(!little_endian)
295   {
296     dt_print(DT_DEBUG_OPENCL, "[opencl_init] discarding device %d `%s' as it is not little endian.\n", k, infostr);
297     res = -1;
298     goto end;
299   }
300 
301   (cl->dlocl->symbols->dt_clGetDeviceInfo)(devid, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong),
302                                            &(cl->dev[dev].max_global_mem), NULL);
303   if(cl->dev[dev].max_global_mem < (uint64_t)opencl_memory_requirement * 1024 * 1024)
304   {
305     dt_print(DT_DEBUG_OPENCL,
306              "[opencl_init] discarding device %d `%s' due to insufficient global memory (%" PRIu64 "MB).\n", k,
307              infostr, cl->dev[dev].max_global_mem / 1024 / 1024);
308     res = -1;
309     goto end;
310   }
311 
312   cl->dev[dev].vendor = strdup(dt_opencl_get_vendor_by_id(vendor_id));
313   cl->dev[dev].name = strdup(infostr);
314   cl->dev[dev].cname = strdup(cname);
315 
316   cl->crc = crc32(cl->crc, (const unsigned char *)infostr, strlen(infostr));
317 
318   dt_print(DT_DEBUG_OPENCL, "[opencl_init] device %d `%s' supports image sizes of %zd x %zd\n", k, infostr,
319            cl->dev[dev].max_image_width, cl->dev[dev].max_image_height);
320   dt_print(DT_DEBUG_OPENCL, "[opencl_init] device %d `%s' allows GPU memory allocations of up to %" PRIu64 "MB\n",
321            k, infostr, cl->dev[dev].max_mem_alloc / 1024 / 1024);
322 
323   if(darktable.unmuted & DT_DEBUG_OPENCL)
324   {
325     printf("[opencl_init] device %d: %s \n", k, infostr);
326     printf("     GLOBAL_MEM_SIZE:          %.0fMB\n", (double)cl->dev[dev].max_global_mem / 1024.0 / 1024.0);
327     (cl->dlocl->symbols->dt_clGetDeviceInfo)(devid, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(infoint), &infoint, NULL);
328     printf("     MAX_WORK_GROUP_SIZE:      %zu\n", infoint);
329     (cl->dlocl->symbols->dt_clGetDeviceInfo)(devid, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(infoint), &infoint,
330                                              NULL);
331     printf("     MAX_WORK_ITEM_DIMENSIONS: %zu\n", infoint);
332     printf("     MAX_WORK_ITEM_SIZES:      [ ");
333 
334     size_t infointtab_size;
335     err = dt_opencl_get_device_info(cl, devid, CL_DEVICE_MAX_WORK_ITEM_SIZES, (void **)&infointtab,
336                                     &infointtab_size);
337     if(err == CL_SUCCESS)
338     {
339       for(size_t i = 0; i < infoint; i++) printf("%zu ", infointtab[i]);
340       free(infointtab);
341       infointtab = NULL;
342     }
343     else
344     {
345       res = -1;
346       goto end;
347     }
348 
349     printf("]\n");
350     printf("     DRIVER_VERSION:           %s\n", driverversion);
351     printf("     DEVICE_VERSION:           %s\n", deviceversion);
352   }
353 
354   dt_pthread_mutex_init(&cl->dev[dev].lock, NULL);
355 
356   cl->dev[dev].context = (cl->dlocl->symbols->dt_clCreateContext)(0, 1, &devid, NULL, NULL, &err);
357   if(err != CL_SUCCESS)
358   {
359     dt_print(DT_DEBUG_OPENCL, "[opencl_init] could not create context for device %d: %d\n", k, err);
360     res = -1;
361     goto end;
362   }
363   // create a command queue for first device the context reported
364   cl->dev[dev].cmd_queue = (cl->dlocl->symbols->dt_clCreateCommandQueue)(
365       cl->dev[dev].context, devid, (darktable.unmuted & DT_DEBUG_PERF) ? CL_QUEUE_PROFILING_ENABLE : 0, &err);
366   if(err != CL_SUCCESS)
367   {
368     dt_print(DT_DEBUG_OPENCL, "[opencl_init] could not create command queue for device %d: %d\n", k, err);
369     res = -1;
370     goto end;
371   }
372 
373   double tstart, tend, tdiff;
374   dt_loc_get_user_cache_dir(dtcache, PATH_MAX * sizeof(char));
375 
376   int len = MIN(strlen(infostr),1024 * sizeof(char));;
377   int j = 0;
378   // remove non-alphanumeric chars from device name
379   for(int i = 0; i < len; i++)
380     if(isalnum(infostr[i])) devname[j++] = infostr[i];
381   devname[j] = 0;
382   len = MIN(strlen(driverversion), 1024 * sizeof(char));
383   j = 0;
384   // remove non-alphanumeric chars from driver version
385   for(int i = 0; i < len; i++)
386     if(isalnum(driverversion[i])) drvversion[j++] = driverversion[i];
387   drvversion[j] = 0;
388   snprintf(cachedir, PATH_MAX * sizeof(char), "%s" G_DIR_SEPARATOR_S "cached_kernels_for_%s_%s", dtcache, devname, drvversion);
389   if(g_mkdir_with_parents(cachedir, 0700) == -1)
390   {
391     dt_print(DT_DEBUG_OPENCL, "[opencl_init] failed to create directory `%s'!\n", cachedir);
392     res = -1;
393     goto end;
394   }
395 
396   dt_loc_get_kerneldir(kerneldir, sizeof(kerneldir));
397   dt_print(DT_DEBUG_DEV, "kernel directory: %s\n", kerneldir);
398 
399   snprintf(filename, PATH_MAX * sizeof(char), "%s" G_DIR_SEPARATOR_S "programs.conf", kerneldir);
400 
401   char *escapedkerneldir = NULL;
402 #ifndef __APPLE__
403   escapedkerneldir = g_strdup_printf("\"%s\"", kerneldir);
404 #else
405   escapedkerneldir = dt_util_str_replace(kerneldir, " ", "\\ ");
406 #endif
407 
408   options = g_strdup_printf("-w -cl-fast-relaxed-math %s -D%s=1 -I%s",
409                             (cl->dev[dev].nvidia_sm_20 ? " -DNVIDIA_SM_20=1" : ""),
410                             dt_opencl_get_vendor_by_id(vendor_id), escapedkerneldir);
411   cl->dev[dev].options = strdup(options);
412 
413   dt_print(DT_DEBUG_OPENCL, "[opencl_init] options for OpenCL compiler: %s\n", options);
414 
415   g_free(options);
416   options = NULL;
417   g_free(escapedkerneldir);
418   escapedkerneldir = NULL;
419 
420   const char *clincludes[DT_OPENCL_MAX_INCLUDES] = { "rgb_norms.h", "noise_generator.h", "color_conversion.h", "colorspaces.cl", "colorspace.h", "common.h", NULL };
421   char *includemd5[DT_OPENCL_MAX_INCLUDES] = { NULL };
422   dt_opencl_md5sum(clincludes, includemd5);
423 
424   // now load all darktable cl kernels.
425   // TODO: compile as a job?
426   tstart = dt_get_wtime();
427   FILE *f = g_fopen(filename, "rb");
428   if(f)
429   {
430 
431     while(!feof(f))
432     {
433       int prog = -1;
434       gchar *confline_pattern = g_strdup_printf("%%%zu[^\n]\n", PATH_MAX * sizeof(char) - 1);
435       int rd = fscanf(f, confline_pattern, confentry);
436       g_free(confline_pattern);
437       if(rd != 1) continue;
438       // remove comments:
439       size_t end = strlen(confentry);
440       for(size_t pos = 0; pos < end; pos++)
441         if(confentry[pos] == '#')
442         {
443           confentry[pos] = '\0';
444           for(int l = pos - 1; l >= 0; l--)
445           {
446             if(confentry[l] == ' ')
447               confentry[l] = '\0';
448             else
449               break;
450           }
451           break;
452         }
453       if(confentry[0] == '\0') continue;
454 
455       const char *programname = NULL, *programnumber = NULL;
456       gchar **tokens = g_strsplit_set(confentry, " \t", 2);
457       if(tokens)
458       {
459         programname = tokens[0];
460         if(tokens[0])
461           programnumber = tokens[1]; // if the 0st wasn't NULL then we have at least the terminating NULL in [1]
462       }
463 
464       prog = programnumber ? strtol(programnumber, NULL, 10) : -1;
465 
466       if(!programname || programname[0] == '\0' || prog < 0)
467       {
468         dt_print(DT_DEBUG_OPENCL, "[opencl_init] malformed entry in programs.conf `%s'; ignoring it!\n", confentry);
469         continue;
470       }
471 
472       snprintf(filename, PATH_MAX * sizeof(char), "%s" G_DIR_SEPARATOR_S "%s", kerneldir, programname);
473       snprintf(binname, PATH_MAX * sizeof(char), "%s" G_DIR_SEPARATOR_S "%s.bin", cachedir, programname);
474       dt_print(DT_DEBUG_OPENCL, "[opencl_init] compiling program `%s' ..\n", programname);
475       int loaded_cached;
476       char md5sum[33];
477       if(dt_opencl_load_program(dev, prog, filename, binname, cachedir, md5sum, includemd5, &loaded_cached)
478          && dt_opencl_build_program(dev, prog, binname, cachedir, md5sum, loaded_cached) != CL_SUCCESS)
479       {
480         dt_print(DT_DEBUG_OPENCL, "[opencl_init] failed to compile program `%s'!\n", programname);
481         fclose(f);
482         g_strfreev(tokens);
483         res = -1;
484         goto end;
485       }
486 
487       g_strfreev(tokens);
488     }
489 
490     fclose(f);
491     tend = dt_get_wtime();
492     tdiff = tend - tstart;
493     dt_print(DT_DEBUG_OPENCL, "[opencl_init] kernel loading time: %2.4lf \n", tdiff);
494   }
495   else
496   {
497     dt_print(DT_DEBUG_OPENCL, "[opencl_init] could not open `%s'!\n", filename);
498     res = -1;
499     goto end;
500   }
501   for(int n = 0; n < DT_OPENCL_MAX_INCLUDES; n++) g_free(includemd5[n]);
502 
503   res = 0;
504 
505 end:
506 
507   free(infostr);
508   free(cname);
509   free(options);
510   free(vendor);
511   free(driverversion);
512   free(deviceversion);
513 
514   free(dtcache);
515   free(cachedir);
516   free(devname);
517   free(drvversion);
518 
519   free(filename);
520   free(confentry);
521   free(binname);
522 
523   return res;
524 }
525 
dt_opencl_init(dt_opencl_t * cl,const gboolean exclude_opencl,const gboolean print_statistics)526 void dt_opencl_init(dt_opencl_t *cl, const gboolean exclude_opencl, const gboolean print_statistics)
527 {
528   char *str;
529   dt_pthread_mutex_init(&cl->lock, NULL);
530   cl->inited = 0;
531   cl->enabled = 0;
532   cl->stopped = 0;
533   cl->error_count = 0;
534   cl->print_statistics = print_statistics;
535 
536   // work-around to fix a bug in some AMD OpenCL compilers, which would fail parsing certain numerical
537   // constants if locale is different from "C".
538   // we save the current locale, set locale to "C", and restore the previous setting after OpenCL is
539   // initialized
540   char *locale = strdup(setlocale(LC_ALL, NULL));
541   setlocale(LC_ALL, "C");
542 
543   int handles = dt_conf_get_int("opencl_number_event_handles");
544   handles = (handles < 0 ? 0x7fffffff : handles);
545   cl->number_event_handles = handles;
546   cl->use_events = (handles != 0);
547 
548   cl->avoid_atomics = dt_conf_get_bool("opencl_avoid_atomics");
549   cl->async_pixelpipe = dt_conf_get_bool("opencl_async_pixelpipe");
550   cl->sync_cache = dt_opencl_get_sync_cache();
551   cl->micro_nap = dt_conf_get_int("opencl_micro_nap");
552   cl->crc = 5781;
553   cl->dlocl = NULL;
554   cl->dev_priority_image = NULL;
555   cl->dev_priority_preview = NULL;
556   cl->dev_priority_preview2 = NULL;
557   cl->dev_priority_export = NULL;
558   cl->dev_priority_thumbnail = NULL;
559 
560   cl_platform_id *all_platforms = NULL;
561   cl_uint *all_num_devices = NULL;
562 
563   // user selectable parameter defines minimum requirement on GPU memory
564   // default is 768MB
565   // values below 200 will be (re)set to 200
566   const int opencl_memory_requirement = MAX(200, dt_conf_get_int("opencl_memory_requirement"));
567   dt_conf_set_int("opencl_memory_requirement", opencl_memory_requirement);
568 
569   if(exclude_opencl)
570   {
571     dt_print(DT_DEBUG_OPENCL, "[opencl_init] do not try to find and use an opencl runtime library due to "
572                               "explicit user request\n");
573     goto finally;
574   }
575 
576   dt_print(DT_DEBUG_OPENCL, "[opencl_init] opencl related configuration options:\n");
577   dt_print(DT_DEBUG_OPENCL, "[opencl_init] \n");
578   dt_print(DT_DEBUG_OPENCL, "[opencl_init] opencl: %d\n", dt_conf_get_bool("opencl"));
579   str = dt_conf_get_string("opencl_scheduling_profile");
580   dt_print(DT_DEBUG_OPENCL, "[opencl_init] opencl_scheduling_profile: '%s'\n", str);
581   g_free(str);
582   str = dt_conf_get_string("opencl_library");
583   dt_print(DT_DEBUG_OPENCL, "[opencl_init] opencl_library: '%s'\n", str);
584   g_free(str);
585   dt_print(DT_DEBUG_OPENCL, "[opencl_init] opencl_memory_requirement: %d\n",
586            dt_conf_get_int("opencl_memory_requirement"));
587   dt_print(DT_DEBUG_OPENCL, "[opencl_init] opencl_memory_headroom: %d\n",
588            dt_conf_get_int("opencl_memory_headroom"));
589   str = dt_conf_get_string("opencl_device_priority");
590   dt_print(DT_DEBUG_OPENCL, "[opencl_init] opencl_device_priority: '%s'\n", str);
591   g_free(str);
592   dt_print(DT_DEBUG_OPENCL, "[opencl_init] opencl_mandatory_timeout: %d\n",
593            dt_conf_get_int("opencl_mandatory_timeout"));
594   dt_print(DT_DEBUG_OPENCL, "[opencl_init] opencl_size_roundup: %d\n",
595            dt_conf_get_int("opencl_size_roundup"));
596   dt_print(DT_DEBUG_OPENCL, "[opencl_init] opencl_async_pixelpipe: %d\n",
597            dt_conf_get_bool("opencl_async_pixelpipe"));
598   str = dt_conf_get_string("opencl_synch_cache");
599   dt_print(DT_DEBUG_OPENCL, "[opencl_init] opencl_synch_cache: %s\n", str);
600   g_free(str);
601   dt_print(DT_DEBUG_OPENCL, "[opencl_init] opencl_number_event_handles: %d\n",
602            dt_conf_get_int("opencl_number_event_handles"));
603   dt_print(DT_DEBUG_OPENCL, "[opencl_init] opencl_micro_nap: %d\n", dt_conf_get_int("opencl_micro_nap"));
604   dt_print(DT_DEBUG_OPENCL, "[opencl_init] opencl_use_pinned_memory: %d\n",
605            dt_conf_get_bool("opencl_use_pinned_memory"));
606   dt_print(DT_DEBUG_OPENCL, "[opencl_init] opencl_use_cpu_devices: %d\n",
607            dt_conf_get_bool("opencl_use_cpu_devices"));
608 
609   dt_print(DT_DEBUG_OPENCL, "[opencl_init] opencl_avoid_atomics: %d\n",
610            dt_conf_get_bool("opencl_avoid_atomics"));
611 
612 
613   dt_print(DT_DEBUG_OPENCL, "[opencl_init] \n");
614 
615   // look for explicit definition of opencl_runtime library in preferences
616   char *library = dt_conf_get_string("opencl_library");
617 
618   // dynamically load opencl runtime
619   if((cl->dlocl = dt_dlopencl_init(library)) == NULL)
620   {
621     dt_print(DT_DEBUG_OPENCL,
622              "[opencl_init] no working opencl library found. Continue with opencl disabled\n");
623     g_free(library);
624     goto finally;
625   }
626   else
627   {
628     dt_print(DT_DEBUG_OPENCL, "[opencl_init] opencl library '%s' found on your system and loaded\n",
629              cl->dlocl->library);
630   }
631   g_free(library);
632 
633   cl_int err;
634   all_platforms = malloc(sizeof(cl_platform_id) * DT_OPENCL_MAX_PLATFORMS);
635   all_num_devices = malloc(sizeof(cl_uint) * DT_OPENCL_MAX_PLATFORMS);
636   cl_uint num_platforms = DT_OPENCL_MAX_PLATFORMS;
637   err = (cl->dlocl->symbols->dt_clGetPlatformIDs)(DT_OPENCL_MAX_PLATFORMS, all_platforms, &num_platforms);
638   if(err != CL_SUCCESS)
639   {
640     dt_print(DT_DEBUG_OPENCL, "[opencl_init] could not get platforms: %d\n", err);
641     goto finally;
642   }
643 
644   if(num_platforms == 0)
645   {
646     dt_print(DT_DEBUG_OPENCL, "[opencl_init] no opencl platform available\n");
647     goto finally;
648   }
649   dt_print(DT_DEBUG_OPENCL, "[opencl_init] found %d platform%s\n", num_platforms,
650            num_platforms > 1 ? "s" : "");
651 
652   for(int n = 0; n < num_platforms; n++)
653   {
654     cl_platform_id platform = all_platforms[n];
655     // get the number of GPU devices available to the platforms
656     // the other common option is CL_DEVICE_TYPE_GPU/CPU (but the latter doesn't work with the nvidia drivers)
657     err = (cl->dlocl->symbols->dt_clGetDeviceIDs)(platform, CL_DEVICE_TYPE_ALL, 0, NULL,
658                                                   &(all_num_devices[n]));
659     if(err != CL_SUCCESS)
660     {
661       all_num_devices[n] = 0;
662       dt_print(DT_DEBUG_OPENCL, "[opencl_init] could not get device id size: %d\n", err);
663     }
664   }
665 
666   cl_uint num_devices = 0;
667   for(int n = 0; n < num_platforms; n++) num_devices += all_num_devices[n];
668 
669   // create the device list
670   cl_device_id *devices = 0;
671   if(num_devices)
672   {
673     cl->dev = (dt_opencl_device_t *)malloc(sizeof(dt_opencl_device_t) * num_devices);
674     devices = (cl_device_id *)malloc(sizeof(cl_device_id) * num_devices);
675     if(!cl->dev || !devices)
676     {
677       free(cl->dev);
678       cl->dev = NULL;
679       free(devices);
680       dt_print(DT_DEBUG_OPENCL, "[opencl_init] could not allocate memory\n");
681       goto finally;
682     }
683   }
684 
685   cl_device_id *devs = devices;
686   for(int n = 0; n < num_platforms; n++)
687   {
688     if(all_num_devices[n])
689     {
690       cl_platform_id platform = all_platforms[n];
691       err = (cl->dlocl->symbols->dt_clGetDeviceIDs)(platform, CL_DEVICE_TYPE_ALL, all_num_devices[n], devs,
692                                                     NULL);
693       if(err != CL_SUCCESS)
694       {
695         num_devices -= all_num_devices[n];
696         dt_print(DT_DEBUG_OPENCL, "[opencl_init] could not get devices list: %d\n", err);
697       }
698       devs += all_num_devices[n];
699     }
700   }
701   devs = NULL;
702 
703   dt_print(DT_DEBUG_OPENCL, "[opencl_init] found %d device%s\n", num_devices, num_devices > 1 ? "s" : "");
704   if(num_devices == 0)
705   {
706     if(devices)
707       free(devices);
708     goto finally;
709   }
710 
711   int dev = 0;
712   for(int k = 0; k < num_devices; k++)
713   {
714     const int res = dt_opencl_device_init(cl, dev, devices, k, opencl_memory_requirement);
715 
716     if(res != 0)
717       continue;
718 
719     // increase dev only if dt_opencl_device_init was successful (res == 0)
720 
721     ++dev;
722   }
723   free(devices);
724   devices = NULL;
725 
726   if(dev > 0)
727   {
728     cl->num_devs = dev;
729     cl->inited = 1;
730     cl->enabled = dt_conf_get_bool("opencl");
731     memset(cl->mandatory, 0, sizeof(cl->mandatory));
732     cl->dev_priority_image = (int *)malloc(sizeof(int) * (dev + 1));
733     cl->dev_priority_preview = (int *)malloc(sizeof(int) * (dev + 1));
734     cl->dev_priority_preview2 = (int *)malloc(sizeof(int) * (dev + 1));
735     cl->dev_priority_export = (int *)malloc(sizeof(int) * (dev + 1));
736     cl->dev_priority_thumbnail = (int *)malloc(sizeof(int) * (dev + 1));
737 
738     // only check successful malloc in debug mode; darktable will crash anyhow sooner or later if mallocs that
739     // small would fail
740     assert(cl->dev_priority_image != NULL && cl->dev_priority_preview != NULL && cl->dev_priority_preview2 != NULL
741            && cl->dev_priority_export != NULL && cl->dev_priority_thumbnail != NULL);
742 
743     dt_print(DT_DEBUG_OPENCL, "[opencl_init] OpenCL successfully initialized.\n");
744     dt_print(
745         DT_DEBUG_OPENCL,
746         "[opencl_init] here are the internal numbers and names of OpenCL devices available to darktable:\n");
747     for(int i = 0; i < dev; i++) dt_print(DT_DEBUG_OPENCL, "[opencl_init]\t\t%d\t'%s'\n", i, cl->dev[i].name);
748   }
749   else
750   {
751     dt_print(DT_DEBUG_OPENCL, "[opencl_init] no suitable devices found.\n");
752   }
753 
754 finally:
755   dt_print(DT_DEBUG_OPENCL, "[opencl_init] FINALLY: opencl is %sAVAILABLE on this system.\n",
756            cl->inited ? "" : "NOT ");
757   dt_print(DT_DEBUG_OPENCL, "[opencl_init] initial status of opencl enabled flag is %s.\n",
758            cl->enabled ? "ON" : "OFF");
759   if(cl->inited)
760   {
761     dt_capabilities_add("opencl");
762     cl->blendop = dt_develop_blend_init_cl_global();
763     cl->bilateral = dt_bilateral_init_cl_global();
764     cl->gaussian = dt_gaussian_init_cl_global();
765     cl->interpolation = dt_interpolation_init_cl_global();
766     cl->local_laplacian = dt_local_laplacian_init_cl_global();
767     cl->dwt = dt_dwt_init_cl_global();
768     cl->heal = dt_heal_init_cl_global();
769     cl->colorspaces = dt_colorspaces_init_cl_global();
770     cl->guided_filter = dt_guided_filter_init_cl_global();
771 
772     char checksum[64];
773     snprintf(checksum, sizeof(checksum), "%u", cl->crc);
774     char *oldchecksum = dt_conf_get_string("opencl_checksum");
775 
776     // check if the configuration (OpenCL device setup) has changed, indicated by checksum != oldchecksum
777     if(strcasecmp(oldchecksum, "OFF") != 0 && strcmp(oldchecksum, checksum) != 0)
778     {
779       // store new checksum value in config
780       dt_conf_set_string("opencl_checksum", checksum);
781       // do CPU bencharking
782       float tcpu = dt_opencl_benchmark_cpu(1024, 1024, 5, 100.0f);
783       // get best benchmarking value of all detected OpenCL devices
784       float tgpumin = INFINITY;
785       for(int n = 0; n < cl->num_devs; n++)
786       {
787         float tgpu = cl->dev[n].benchmark = dt_opencl_benchmark_gpu(n, 1024, 1024, 5, 100.0f);
788         tgpumin = fmin(tgpu, tgpumin);
789       }
790       dt_print(DT_DEBUG_OPENCL, "[opencl_init] benchmarking results: %f seconds for fastest GPU versus %f seconds for CPU.\n",
791            tgpumin, tcpu);
792 
793       if(tcpu <= 1.5f * tgpumin)
794       {
795         // de-activate opencl for darktable in case of too slow GPU(s). user can always manually overrule this later.
796         cl->enabled = FALSE;
797         dt_conf_set_bool("opencl", FALSE);
798         dt_print(DT_DEBUG_OPENCL, "[opencl_init] due to a slow GPU the opencl flag has been set to OFF.\n");
799         dt_control_log(_("due to a slow GPU hardware acceleration via opencl has been de-activated."));
800       }
801       else if(cl->num_devs >= 2)
802       {
803         // set scheduling profile to "multiple GPUs" if more than one device has been found
804         dt_conf_set_string("opencl_scheduling_profile", "multiple GPUs");
805         dt_print(DT_DEBUG_OPENCL, "[opencl_init] set scheduling profile for multiple GPUs.\n");
806         dt_control_log(_("multiple GPUs detected - opencl scheduling profile has been set accordingly."));
807       }
808       else if(tcpu >= 6.0f * tgpumin)
809       {
810         // set scheduling profile to "very fast GPU" if CPU is way too slow
811         dt_conf_set_string("opencl_scheduling_profile", "very fast GPU");
812         dt_print(DT_DEBUG_OPENCL, "[opencl_init] set scheduling profile for very fast GPU.\n");
813         dt_control_log(_("very fast GPU detected - opencl scheduling profile has been set accordingly."));
814       }
815       else
816       {
817         // set scheduling profile to "default"
818         dt_conf_set_string("opencl_scheduling_profile", "default");
819         dt_print(DT_DEBUG_OPENCL, "[opencl_init] set scheduling profile to default.\n");
820         dt_control_log(_("opencl scheduling profile set to default."));
821       }
822     }
823     g_free(oldchecksum);
824 
825     // apply config settings for scheduling profile: sets device priorities and pixelpipe synchronization timeout
826     dt_opencl_scheduling_profile_t profile = dt_opencl_get_scheduling_profile();
827     dt_opencl_apply_scheduling_profile(profile);
828   }
829   else // initialization failed
830   {
831     for(int i = 0; cl->dev && i < cl->num_devs; i++)
832     {
833       dt_pthread_mutex_destroy(&cl->dev[i].lock);
834       for(int k = 0; k < DT_OPENCL_MAX_KERNELS; k++)
835         if(cl->dev[i].kernel_used[k]) (cl->dlocl->symbols->dt_clReleaseKernel)(cl->dev[i].kernel[k]);
836       for(int k = 0; k < DT_OPENCL_MAX_PROGRAMS; k++)
837         if(cl->dev[i].program_used[k]) (cl->dlocl->symbols->dt_clReleaseProgram)(cl->dev[i].program[k]);
838       (cl->dlocl->symbols->dt_clReleaseCommandQueue)(cl->dev[i].cmd_queue);
839       (cl->dlocl->symbols->dt_clReleaseContext)(cl->dev[i].context);
840       if(cl->use_events)
841       {
842         dt_opencl_events_reset(i);
843         free(cl->dev[i].eventlist);
844         free(cl->dev[i].eventtags);
845       }
846       free((void *)(cl->dev[i].vendor));
847       free((void *)(cl->dev[i].name));
848       free((void *)(cl->dev[i].cname));
849       free((void *)(cl->dev[i].options));
850     }
851   }
852 
853   free(all_num_devices);
854   free(all_platforms);
855 
856   if(locale)
857   {
858     setlocale(LC_ALL, locale);
859     free(locale);
860   }
861 
862   return;
863 }
864 
dt_opencl_cleanup(dt_opencl_t * cl)865 void dt_opencl_cleanup(dt_opencl_t *cl)
866 {
867   if(cl->inited)
868   {
869     dt_develop_blend_free_cl_global(cl->blendop);
870     dt_bilateral_free_cl_global(cl->bilateral);
871     dt_gaussian_free_cl_global(cl->gaussian);
872     dt_interpolation_free_cl_global(cl->interpolation);
873     dt_dwt_free_cl_global(cl->dwt);
874     dt_heal_free_cl_global(cl->heal);
875     dt_colorspaces_free_cl_global(cl->colorspaces);
876     dt_guided_filter_free_cl_global(cl->guided_filter);
877 
878     for(int i = 0; i < cl->num_devs; i++)
879     {
880       dt_pthread_mutex_destroy(&cl->dev[i].lock);
881       for(int k = 0; k < DT_OPENCL_MAX_KERNELS; k++)
882         if(cl->dev[i].kernel_used[k]) (cl->dlocl->symbols->dt_clReleaseKernel)(cl->dev[i].kernel[k]);
883       for(int k = 0; k < DT_OPENCL_MAX_PROGRAMS; k++)
884         if(cl->dev[i].program_used[k]) (cl->dlocl->symbols->dt_clReleaseProgram)(cl->dev[i].program[k]);
885       (cl->dlocl->symbols->dt_clReleaseCommandQueue)(cl->dev[i].cmd_queue);
886       (cl->dlocl->symbols->dt_clReleaseContext)(cl->dev[i].context);
887 
888       if(cl->print_statistics && (darktable.unmuted & DT_DEBUG_MEMORY))
889       {
890         dt_print(DT_DEBUG_OPENCL, "[opencl_summary_statistics] device '%s' (%d): peak memory usage %zu bytes (%.1f MB)\n",
891                    cl->dev[i].name, i, cl->dev[i].peak_memory, (float)cl->dev[i].peak_memory/(1024*1024));
892       }
893 
894       if(cl->print_statistics && cl->use_events)
895       {
896         if(cl->dev[i].totalevents)
897         {
898           dt_print(DT_DEBUG_OPENCL, "[opencl_summary_statistics] device '%s' (%d): %d out of %d events were "
899                                     "successful and %d events lost\n",
900                    cl->dev[i].name, i, cl->dev[i].totalsuccess, cl->dev[i].totalevents, cl->dev[i].totallost);
901         }
902         else
903         {
904           dt_print(DT_DEBUG_OPENCL, "[opencl_summary_statistics] device '%s' (%d): NOT utilized\n",
905                    cl->dev[i].name, i);
906         }
907       }
908 
909       if(cl->use_events)
910       {
911         dt_opencl_events_reset(i);
912 
913         free(cl->dev[i].eventlist);
914         free(cl->dev[i].eventtags);
915       }
916 
917       free((void *)(cl->dev[i].vendor));
918       free((void *)(cl->dev[i].name));
919       free((void *)(cl->dev[i].cname));
920       free((void *)(cl->dev[i].options));
921     }
922     free(cl->dev_priority_image);
923     free(cl->dev_priority_preview);
924     free(cl->dev_priority_preview2);
925     free(cl->dev_priority_export);
926     free(cl->dev_priority_thumbnail);
927   }
928 
929   if(cl->dlocl)
930   {
931     free(cl->dlocl->symbols);
932     g_free(cl->dlocl->library);
933     free(cl->dlocl);
934   }
935 
936   free(cl->dev);
937   dt_pthread_mutex_destroy(&cl->lock);
938 }
939 
dt_opencl_get_vendor_by_id(unsigned int id)940 static const char *dt_opencl_get_vendor_by_id(unsigned int id)
941 {
942   const char *vendor;
943 
944   switch(id)
945   {
946     case 4098:
947       vendor = "AMD";
948       break;
949     case 4318:
950       vendor = "NVIDIA";
951       break;
952     case 0x8086u:
953       vendor = "INTEL";
954       break;
955     default:
956       vendor = "UNKNOWN";
957   }
958 
959   return vendor;
960 }
961 
dt_opencl_benchmark_gpu(const int devid,const size_t width,const size_t height,const int count,const float sigma)962 static float dt_opencl_benchmark_gpu(const int devid, const size_t width, const size_t height, const int count, const float sigma)
963 {
964   const int bpp = 4 * sizeof(float);
965   cl_int err = 666;
966   cl_mem dev_mem = NULL;
967   float *buf = NULL;
968   dt_gaussian_cl_t *g = NULL;
969 
970   const float Labmax[] = { INFINITY, INFINITY, INFINITY, INFINITY };
971   const float Labmin[] = { -INFINITY, -INFINITY, -INFINITY, -INFINITY };
972 
973   unsigned int *const tea_states = alloc_tea_states(dt_get_num_threads());
974 
975   buf = dt_alloc_align(64, width * height * bpp);
976   if(buf == NULL) goto error;
977 
978 #ifdef _OPENMP
979 #pragma omp parallel for default(none) \
980   dt_omp_firstprivate(height, tea_states, width) \
981   shared(buf)
982 #endif
983   for(size_t j = 0; j < height; j++)
984   {
985     unsigned int *tea_state = get_tea_state(tea_states,dt_get_thread_num());
986     tea_state[0] = j + dt_get_thread_num();
987     size_t index = j * 4 * width;
988     for(int i = 0; i < 4 * width; i++)
989     {
990       encrypt_tea(tea_state);
991       buf[index + i] = 100.0f * tpdf(tea_state[0]);
992     }
993   }
994 
995   // start timer
996   double start = dt_get_wtime();
997 
998   // allocate dev_mem buffer
999   dev_mem = dt_opencl_alloc_device_use_host_pointer(devid, width, height, bpp, width*bpp, buf);
1000   if(dev_mem == NULL) goto error;
1001 
1002   // prepare gaussian filter
1003   g = dt_gaussian_init_cl(devid, width, height, 4, Labmax, Labmin, sigma, 0);
1004   if(!g) goto error;
1005 
1006   // gaussian blur
1007   for(int n = 0; n < count; n++)
1008   {
1009     err = dt_gaussian_blur_cl(g, dev_mem, dev_mem);
1010     if(err != CL_SUCCESS) goto error;
1011   }
1012 
1013   // cleanup gaussian filter
1014   dt_gaussian_free_cl(g);
1015   g = NULL;
1016 
1017   // copy dev_mem -> buf
1018   err = dt_opencl_copy_device_to_host(devid, buf, dev_mem, width, height, bpp);
1019   if(err != CL_SUCCESS) goto error;
1020 
1021   // free dev_mem
1022   dt_opencl_release_mem_object(dev_mem);
1023 
1024   // end timer
1025   double end = dt_get_wtime();
1026 
1027   dt_free_align(buf);
1028   free_tea_states(tea_states);
1029   return (end - start);
1030 
1031 error:
1032   dt_gaussian_free_cl(g);
1033   dt_free_align(buf);
1034   free_tea_states(tea_states);
1035   dt_opencl_release_mem_object(dev_mem);
1036   return INFINITY;
1037 }
1038 
dt_opencl_benchmark_cpu(const size_t width,const size_t height,const int count,const float sigma)1039 static float dt_opencl_benchmark_cpu(const size_t width, const size_t height, const int count, const float sigma)
1040 {
1041   const int bpp = 4 * sizeof(float);
1042   float *buf = NULL;
1043   dt_gaussian_t *g = NULL;
1044 
1045   const float Labmax[] = { INFINITY, INFINITY, INFINITY, INFINITY };
1046   const float Labmin[] = { -INFINITY, -INFINITY, -INFINITY, -INFINITY };
1047 
1048   unsigned int *const tea_states = alloc_tea_states(dt_get_num_threads());
1049 
1050   buf = dt_alloc_align(64, width * height * bpp);
1051   if(buf == NULL) goto error;
1052 
1053 #ifdef _OPENMP
1054 #pragma omp parallel for default(none) \
1055   dt_omp_firstprivate(height, width, tea_states) \
1056   shared(buf)
1057 #endif
1058   for(size_t j = 0; j < height; j++)
1059   {
1060     unsigned int *tea_state = get_tea_state(tea_states,dt_get_thread_num());
1061     tea_state[0] = j + dt_get_thread_num();
1062     size_t index = j * 4 * width;
1063     for(int i = 0; i < 4 * width; i++)
1064     {
1065       encrypt_tea(tea_state);
1066       buf[index + i] = 100.0f * tpdf(tea_state[0]);
1067     }
1068   }
1069 
1070   // start timer
1071   double start = dt_get_wtime();
1072 
1073   // prepare gaussian filter
1074   g = dt_gaussian_init(width, height, 4, Labmax, Labmin, sigma, 0);
1075   if(!g) goto error;
1076 
1077   // gaussian blur
1078   for(int n = 0; n < count; n++)
1079   {
1080     dt_gaussian_blur(g, buf, buf);
1081   }
1082 
1083   // cleanup gaussian filter
1084   dt_gaussian_free(g);
1085   g = NULL;
1086 
1087   // end timer
1088   double end = dt_get_wtime();
1089 
1090   dt_free_align(buf);
1091   free_tea_states(tea_states);
1092   return (end - start);
1093 
1094 error:
1095   dt_gaussian_free(g);
1096   dt_free_align(buf);
1097   free_tea_states(tea_states);
1098   return INFINITY;
1099 }
1100 
1101 
dt_opencl_finish(const int devid)1102 int dt_opencl_finish(const int devid)
1103 {
1104   dt_opencl_t *cl = darktable.opencl;
1105   if(!cl->inited || devid < 0) return -1;
1106 
1107   cl_int err = (cl->dlocl->symbols->dt_clFinish)(cl->dev[devid].cmd_queue);
1108 
1109   // take the opportunity to release some event handles, but without printing
1110   // summary statistics
1111   cl_int success = dt_opencl_events_flush(devid, 0);
1112 
1113   return (err == CL_SUCCESS && success == CL_COMPLETE);
1114 }
1115 
dt_opencl_enqueue_barrier(const int devid)1116 int dt_opencl_enqueue_barrier(const int devid)
1117 {
1118   dt_opencl_t *cl = darktable.opencl;
1119   if(!cl->inited || devid < 0) return -1;
1120   return (cl->dlocl->symbols->dt_clEnqueueBarrier)(cl->dev[devid].cmd_queue);
1121 }
1122 
_take_from_list(int * list,int value)1123 static int _take_from_list(int *list, int value)
1124 {
1125   int result = -1;
1126 
1127   while(*list != -1 && *list != value) list++;
1128   result = *list;
1129 
1130   while(*list != -1)
1131   {
1132     *list = *(list + 1);
1133     list++;
1134   }
1135 
1136   return result;
1137 }
1138 
1139 
_device_by_cname(const char * name)1140 static int _device_by_cname(const char *name)
1141 {
1142   dt_opencl_t *cl = darktable.opencl;
1143   int devs = cl->num_devs;
1144   char tmp[2048] = { 0 };
1145   int result = -1;
1146 
1147   _ascii_str_canonical(name, tmp, sizeof(tmp));
1148 
1149   for(int i = 0; i < devs; i++)
1150   {
1151     if(!strcmp(tmp, cl->dev[i].cname))
1152     {
1153       result = i;
1154       break;
1155     }
1156   }
1157 
1158   return result;
1159 }
1160 
1161 
_ascii_str_canonical(const char * in,char * out,int maxlen)1162 static char *_ascii_str_canonical(const char *in, char *out, int maxlen)
1163 {
1164   if(out == NULL)
1165   {
1166     maxlen = strlen(in) + 1;
1167     out = malloc(maxlen);
1168     if(out == NULL) return NULL;
1169   }
1170 
1171   int len = 0;
1172 
1173   while(*in != '\0' && len < maxlen - 1)
1174   {
1175     int n = strcspn(in, "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ");
1176     in += n;
1177     if(n != 0) continue;
1178     out[len] = tolower(*in);
1179     len++;
1180     in++;
1181   }
1182   out[len] = '\0';
1183 
1184   return out;
1185 }
1186 
1187 
_strsep(char ** stringp,const char * delim)1188 static char *_strsep(char **stringp, const char *delim)
1189 {
1190   char *begin, *end;
1191 
1192   begin = *stringp;
1193   if(begin == NULL) return NULL;
1194 
1195   if(delim[0] == '\0' || delim[1] == '\0')
1196   {
1197     char ch = delim[0];
1198 
1199     if(ch == '\0')
1200       end = NULL;
1201     else
1202     {
1203       if(*begin == ch)
1204         end = begin;
1205       else if(*begin == '\0')
1206         end = NULL;
1207       else
1208         end = strchr(begin + 1, ch);
1209     }
1210   }
1211   else
1212     end = strpbrk(begin, delim);
1213 
1214   if(end)
1215   {
1216     *end++ = '\0';
1217     *stringp = end;
1218   }
1219   else
1220     *stringp = NULL;
1221 
1222   return begin;
1223 }
1224 
1225 
1226 // parse a single token of priority string and store priorities in priority_list
dt_opencl_priority_parse(dt_opencl_t * cl,char * configstr,int * priority_list,int * mandatory)1227 static void dt_opencl_priority_parse(dt_opencl_t *cl, char *configstr, int *priority_list, int *mandatory)
1228 {
1229   int devs = cl->num_devs;
1230   int count = 0;
1231   int *full = malloc(sizeof(int) * (devs + 1));
1232   int mnd = 0;
1233 
1234   // NULL or empty configstring?
1235   if(configstr == NULL || *configstr == '\0')
1236   {
1237     priority_list[0] = -1;
1238     *mandatory = 0;
1239     free(full);
1240     return;
1241   }
1242 
1243   // check if user wants us to force-use opencl device(s)
1244   if(configstr[0] == '+')
1245   {
1246     mnd = 1;
1247     configstr++;
1248   }
1249 
1250   // first start with a full list of devices to take from
1251   for(int i = 0; i < devs; i++) full[i] = i;
1252   full[devs] = -1;
1253 
1254   gchar **tokens = g_strsplit(configstr, ",", 0);
1255   gchar **tokens_ptr = tokens;
1256 
1257   while(tokens != NULL && *tokens_ptr != NULL && count < devs + 1 && full[0] != -1)
1258   {
1259     gchar *str = *tokens_ptr;
1260     int not = 0;
1261     int all = 0;
1262 
1263     switch(*str)
1264     {
1265       case '*':
1266         all = 1;
1267         break;
1268       case '!':
1269         not = 1;
1270         while(*str == '!') str++;
1271         break;
1272     }
1273 
1274     if(all)
1275     {
1276       // copy all remaining device numbers from full to priority list
1277       for(int i = 0; i < devs && full[i] != -1; i++)
1278       {
1279         priority_list[count] = full[i];
1280         count++;
1281       }
1282       full[0] = -1; // mark full list as empty
1283     }
1284     else if(*str != '\0')
1285     {
1286       char *endptr = NULL;
1287 
1288       // first check if str corresponds to an existing canonical device name
1289       long number = _device_by_cname(str);
1290 
1291       // if not try to convert string into decimal device number
1292       if(number < 0) number = strtol(str, &endptr, 10);
1293 
1294       // still not found or negative number given? set number to -1
1295       if(number < 0 || (number == 0 && endptr == str)) number = -1;
1296 
1297       // try to take number out of remaining device list
1298       int dev_number = _take_from_list(full, number);
1299 
1300       if(!not&&dev_number != -1)
1301       {
1302         priority_list[count] = dev_number;
1303         count++;
1304       }
1305     }
1306 
1307     tokens_ptr++;
1308   }
1309 
1310   g_strfreev(tokens);
1311 
1312   // terminate priority list with -1
1313   while(count < devs + 1) priority_list[count++] = -1;
1314 
1315   // opencl use can only be mandatory if at least one opencl device is given
1316   *mandatory = (priority_list[0] != -1) ? mnd : 0;
1317 
1318   free(full);
1319 }
1320 
1321 // parse a complete priority string
dt_opencl_priorities_parse(dt_opencl_t * cl,const char * configstr)1322 static void dt_opencl_priorities_parse(dt_opencl_t *cl, const char *configstr)
1323 {
1324   char tmp[2048];
1325   int len = 0;
1326 
1327   // first get rid of all invalid characters
1328   while(*configstr != '\0' && len < sizeof(tmp) - 1)
1329   {
1330     int n = strcspn(configstr, "/!,*+0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ");
1331     configstr += n;
1332     if(n != 0) continue;
1333     tmp[len] = *configstr;
1334     len++;
1335     configstr++;
1336   }
1337   tmp[len] = '\0';
1338 
1339   char *str = tmp;
1340 
1341   // now split config string into tokens, separated by '/' and parse them one after the other
1342   char *prio = _strsep(&str, "/");
1343   dt_opencl_priority_parse(cl, prio, cl->dev_priority_image, &cl->mandatory[0]);
1344 
1345   prio = _strsep(&str, "/");
1346   dt_opencl_priority_parse(cl, prio, cl->dev_priority_preview, &cl->mandatory[1]);
1347 
1348   prio = _strsep(&str, "/");
1349   dt_opencl_priority_parse(cl, prio, cl->dev_priority_export, &cl->mandatory[2]);
1350 
1351   prio = _strsep(&str, "/");
1352   dt_opencl_priority_parse(cl, prio, cl->dev_priority_thumbnail, &cl->mandatory[3]);
1353 
1354   prio = _strsep(&str, "/");
1355   dt_opencl_priority_parse(cl, prio, cl->dev_priority_preview2, &cl->mandatory[4]);
1356 }
1357 
1358 // set device priorities according to config string
dt_opencl_update_priorities(const char * configstr)1359 static void dt_opencl_update_priorities(const char *configstr)
1360 {
1361   dt_opencl_t *cl = darktable.opencl;
1362   dt_opencl_priorities_parse(cl, configstr);
1363 
1364   dt_print(DT_DEBUG_OPENCL, "[opencl_priorities] these are your device priorities:\n");
1365   dt_print(DT_DEBUG_OPENCL, "[opencl_priorities] \t\timage\tpreview\texport\tthumbs\tpreview2\n");
1366   for(int i = 0; i < cl->num_devs; i++)
1367     dt_print(DT_DEBUG_OPENCL, "[opencl_priorities]\t\t%d\t%d\t%d\t%d\t%d\n", cl->dev_priority_image[i],
1368              cl->dev_priority_preview[i], cl->dev_priority_export[i], cl->dev_priority_thumbnail[i], cl->dev_priority_preview2[i]);
1369   dt_print(DT_DEBUG_OPENCL, "[opencl_priorities] show if opencl use is mandatory for a given pixelpipe:\n");
1370   dt_print(DT_DEBUG_OPENCL, "[opencl_priorities] \t\timage\tpreview\texport\tthumbs\tpreview2\n");
1371   dt_print(DT_DEBUG_OPENCL, "[opencl_priorities]\t\t%d\t%d\t%d\t%d\t%d\n", cl->mandatory[0],
1372              cl->mandatory[1], cl->mandatory[2], cl->mandatory[3], cl->mandatory[4]);
1373 }
1374 
dt_opencl_lock_device(const int pipetype)1375 int dt_opencl_lock_device(const int pipetype)
1376 {
1377   dt_opencl_t *cl = darktable.opencl;
1378   if(!cl->inited) return -1;
1379 
1380 
1381   dt_pthread_mutex_lock(&cl->lock);
1382 
1383   size_t prio_size = sizeof(int) * (cl->num_devs + 1);
1384   int *priority = (int *)malloc(prio_size);
1385   int mandatory;
1386 
1387   switch(pipetype)
1388   {
1389     case DT_DEV_PIXELPIPE_FULL:
1390       memcpy(priority, cl->dev_priority_image, prio_size);
1391       mandatory = cl->mandatory[0];
1392       break;
1393     case DT_DEV_PIXELPIPE_PREVIEW:
1394       memcpy(priority, cl->dev_priority_preview, prio_size);
1395       mandatory = cl->mandatory[1];
1396       break;
1397     case DT_DEV_PIXELPIPE_EXPORT:
1398       memcpy(priority, cl->dev_priority_export, prio_size);
1399       mandatory = cl->mandatory[2];
1400       break;
1401     case DT_DEV_PIXELPIPE_THUMBNAIL:
1402       memcpy(priority, cl->dev_priority_thumbnail, prio_size);
1403       mandatory = cl->mandatory[3];
1404       break;
1405     case DT_DEV_PIXELPIPE_PREVIEW2:
1406       memcpy(priority, cl->dev_priority_preview2, prio_size);
1407       mandatory = cl->mandatory[4];
1408       break;
1409     default:
1410       free(priority);
1411       priority = NULL;
1412       mandatory = 0;
1413   }
1414 
1415   dt_pthread_mutex_unlock(&cl->lock);
1416 
1417   if(priority)
1418   {
1419     const int usec = 5000;
1420     const int nloop = MAX(0, dt_conf_get_int("opencl_mandatory_timeout"));
1421 
1422     // check for free opencl device repeatedly if mandatory is TRUE, else give up after first try
1423     for(int n = 0; n < nloop; n++)
1424     {
1425       const int *prio = priority;
1426 
1427       while(*prio != -1)
1428       {
1429         if(!dt_pthread_mutex_BAD_trylock(&cl->dev[*prio].lock))
1430         {
1431           int devid = *prio;
1432           free(priority);
1433           return devid;
1434         }
1435         prio++;
1436       }
1437 
1438       if(!mandatory)
1439       {
1440         free(priority);
1441         return -1;
1442       }
1443 
1444       dt_iop_nap(usec);
1445     }
1446   }
1447   else
1448   {
1449     // only a fallback if a new pipe type would be added and we forget to take care of it in opencl.c
1450     for(int try_dev = 0; try_dev < cl->num_devs; try_dev++)
1451     {
1452       // get first currently unused processor
1453       if(!dt_pthread_mutex_BAD_trylock(&cl->dev[try_dev].lock)) return try_dev;
1454     }
1455   }
1456 
1457   free(priority);
1458 
1459   // no free GPU :(
1460   // use CPU processing, if no free device:
1461   return -1;
1462 }
1463 
dt_opencl_unlock_device(const int dev)1464 void dt_opencl_unlock_device(const int dev)
1465 {
1466   dt_opencl_t *cl = darktable.opencl;
1467   if(!cl->inited) return;
1468   if(dev < 0 || dev >= cl->num_devs) return;
1469   dt_pthread_mutex_BAD_unlock(&cl->dev[dev].lock);
1470 }
1471 
fopen_stat(const char * filename,struct stat * st)1472 static FILE *fopen_stat(const char *filename, struct stat *st)
1473 {
1474   FILE *f = g_fopen(filename, "rb");
1475   if(!f)
1476   {
1477     dt_print(DT_DEBUG_OPENCL, "[opencl_fopen_stat] could not open file `%s'!\n", filename);
1478     return NULL;
1479   }
1480   int fd = fileno(f);
1481   if(fstat(fd, st) < 0)
1482   {
1483     dt_print(DT_DEBUG_OPENCL, "[opencl_fopen_stat] could not stat file `%s'!\n", filename);
1484     return NULL;
1485   }
1486   return f;
1487 }
1488 
1489 
dt_opencl_md5sum(const char ** files,char ** md5sums)1490 void dt_opencl_md5sum(const char **files, char **md5sums)
1491 {
1492   char kerneldir[PATH_MAX] = { 0 };
1493   char filename[PATH_MAX] = { 0 };
1494   dt_loc_get_kerneldir(kerneldir, sizeof(kerneldir));
1495 
1496   for(int n = 0; n < DT_OPENCL_MAX_INCLUDES; n++, files++, md5sums++)
1497   {
1498     if(!*files)
1499     {
1500       *md5sums = NULL;
1501       continue;
1502     }
1503 
1504     snprintf(filename, sizeof(filename), "%s" G_DIR_SEPARATOR_S "%s", kerneldir, *files);
1505 
1506     struct stat filestat;
1507     FILE *f = fopen_stat(filename, &filestat);
1508 
1509     if(!f)
1510     {
1511       dt_print(DT_DEBUG_OPENCL, "[opencl_md5sums] could not open file `%s'!\n", filename);
1512       *md5sums = NULL;
1513       continue;
1514     }
1515 
1516     size_t filesize = filestat.st_size;
1517     char *file = (char *)malloc(filesize);
1518 
1519     if(!file)
1520     {
1521       dt_print(DT_DEBUG_OPENCL, "[opencl_md5sums] could not allocate buffer for file `%s'!\n", filename);
1522       *md5sums = NULL;
1523       fclose(f);
1524       continue;
1525     }
1526 
1527     size_t rd = fread(file, sizeof(char), filesize, f);
1528     fclose(f);
1529 
1530     if(rd != filesize)
1531     {
1532       free(file);
1533       dt_print(DT_DEBUG_OPENCL, "[opencl_md5sums] could not read all of file `%s'!\n", filename);
1534       *md5sums = NULL;
1535       continue;
1536     }
1537 
1538     *md5sums = g_compute_checksum_for_data(G_CHECKSUM_MD5, (guchar *)file, filesize);
1539 
1540     free(file);
1541   }
1542 }
1543 
dt_opencl_load_program(const int dev,const int prog,const char * filename,const char * binname,const char * cachedir,char * md5sum,char ** includemd5,int * loaded_cached)1544 int dt_opencl_load_program(const int dev, const int prog, const char *filename, const char *binname,
1545                            const char *cachedir, char *md5sum, char **includemd5, int *loaded_cached)
1546 {
1547   cl_int err;
1548   dt_opencl_t *cl = darktable.opencl;
1549 
1550   struct stat filestat, cachedstat;
1551   *loaded_cached = 0;
1552 
1553   if(prog < 0 || prog >= DT_OPENCL_MAX_PROGRAMS)
1554   {
1555     dt_print(DT_DEBUG_OPENCL, "[opencl_load_source] invalid program number `%d' of file `%s'!\n", prog,
1556              filename);
1557     return 0;
1558   }
1559 
1560   if(cl->dev[dev].program_used[prog])
1561   {
1562     dt_print(DT_DEBUG_OPENCL,
1563              "[opencl_load_source] program number `%d' already in use when loading file `%s'!\n", prog,
1564              filename);
1565     return 0;
1566   }
1567 
1568   FILE *f = fopen_stat(filename, &filestat);
1569   if(!f) return 0;
1570 
1571   size_t filesize = filestat.st_size;
1572   char *file = (char *)malloc(filesize + 2048);
1573   size_t rd = fread(file, sizeof(char), filesize, f);
1574   fclose(f);
1575   if(rd != filesize)
1576   {
1577     free(file);
1578     dt_print(DT_DEBUG_OPENCL, "[opencl_load_source] could not read all of file `%s'!\n", filename);
1579     return 0;
1580   }
1581 
1582   char *start = file + filesize;
1583   char *end = start + 2048;
1584   size_t len;
1585 
1586   cl_device_id devid = cl->dev[dev].devid;
1587   (cl->dlocl->symbols->dt_clGetDeviceInfo)(devid, CL_DRIVER_VERSION, end - start, start, &len);
1588   start += len;
1589 
1590   cl_platform_id platform;
1591   (cl->dlocl->symbols->dt_clGetDeviceInfo)(devid, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &platform, NULL);
1592 
1593   (cl->dlocl->symbols->dt_clGetPlatformInfo)(platform, CL_PLATFORM_VERSION, end - start, start, &len);
1594   start += len;
1595 
1596   len = g_strlcpy(start, cl->dev[dev].options, end - start);
1597   start += len;
1598 
1599   /* make sure that the md5sums of all the includes are applied as well */
1600   for(int n = 0; n < DT_OPENCL_MAX_INCLUDES; n++)
1601   {
1602     if(!includemd5[n]) continue;
1603     len = g_strlcpy(start, includemd5[n], end - start);
1604     start += len;
1605   }
1606 
1607   char *source_md5 = g_compute_checksum_for_data(G_CHECKSUM_MD5, (guchar *)file, start - file);
1608   g_strlcpy(md5sum, source_md5, 33);
1609   g_free(source_md5);
1610 
1611   file[filesize] = '\0';
1612 
1613   char linkedfile[PATH_MAX] = { 0 };
1614   ssize_t linkedfile_len = 0;
1615 
1616 #if defined(_WIN32)
1617   // No symlinks on Windows
1618   // Have to figure out the name using the filename + md5sum
1619   char dup[PATH_MAX] = { 0 };
1620   snprintf(dup, sizeof(dup), "%s.%s", binname, md5sum);
1621   FILE *cached = fopen_stat(dup, &cachedstat);
1622   g_strlcpy(linkedfile, md5sum, sizeof(linkedfile));
1623   linkedfile_len = strlen(md5sum);
1624 #else
1625   FILE *cached = fopen_stat(binname, &cachedstat);
1626 #endif
1627 
1628   if(cached)
1629   {
1630 #if !defined(_WIN32)
1631     linkedfile_len = readlink(binname, linkedfile, sizeof(linkedfile) - 1);
1632 #endif // !defined(_WIN32)
1633     if(linkedfile_len > 0)
1634     {
1635       linkedfile[linkedfile_len] = '\0';
1636 
1637       if(strncmp(linkedfile, md5sum, 33) == 0)
1638       {
1639         // md5sum matches, load cached binary
1640         size_t cached_filesize = cachedstat.st_size;
1641 
1642         unsigned char *cached_content = (unsigned char *)malloc(cached_filesize + 1);
1643         rd = fread(cached_content, sizeof(char), cached_filesize, cached);
1644         if(rd != cached_filesize)
1645         {
1646           dt_print(DT_DEBUG_OPENCL, "[opencl_load_program] could not read all of file '%s' MD5: %s!\n", binname, md5sum);
1647         }
1648         else
1649         {
1650           cl->dev[dev].program[prog] = (cl->dlocl->symbols->dt_clCreateProgramWithBinary)(
1651               cl->dev[dev].context, 1, &(cl->dev[dev].devid), &cached_filesize,
1652               (const unsigned char **)&cached_content, NULL, &err);
1653           if(err != CL_SUCCESS)
1654           {
1655             dt_print(DT_DEBUG_OPENCL,
1656                      "[opencl_load_program] could not load cached binary program from file '%s' MD5: '%s'! (%d)\n",
1657                      binname, md5sum, err);
1658           }
1659           else
1660           {
1661             cl->dev[dev].program_used[prog] = 1;
1662             *loaded_cached = 1;
1663           }
1664         }
1665         free(cached_content);
1666       }
1667     }
1668     fclose(cached);
1669   }
1670 
1671 
1672   if(*loaded_cached == 0)
1673   {
1674     // if loading cached was unsuccessful for whatever reason,
1675     // try to remove cached binary & link
1676 #if !defined(_WIN32)
1677     if(linkedfile_len > 0)
1678     {
1679       char link_dest[PATH_MAX] = { 0 };
1680       snprintf(link_dest, sizeof(link_dest), "%s" G_DIR_SEPARATOR_S "%s", cachedir, linkedfile);
1681       g_unlink(link_dest);
1682     }
1683     g_unlink(binname);
1684 #else
1685     // delete the file which contains the MD5 name
1686     g_unlink(dup);
1687 #endif //!defined(_WIN32)
1688 
1689     dt_print(DT_DEBUG_OPENCL,
1690              "[opencl_load_program] could not load cached binary program, trying to compile source\n");
1691 
1692     cl->dev[dev].program[prog] = (cl->dlocl->symbols->dt_clCreateProgramWithSource)(
1693         cl->dev[dev].context, 1, (const char **)&file, &filesize, &err);
1694     free(file);
1695     if(err != CL_SUCCESS)
1696     {
1697       dt_print(DT_DEBUG_OPENCL, "[opencl_load_source] could not create program from file `%s'! (%d)\n",
1698                filename, err);
1699       return 0;
1700     }
1701     else
1702     {
1703       cl->dev[dev].program_used[prog] = 1;
1704     }
1705   }
1706   else
1707   {
1708     free(file);
1709     dt_print(DT_DEBUG_OPENCL, "[opencl_load_program] loaded cached binary program from file '%s' MD5: '%s' \n", binname, md5sum);
1710   }
1711 
1712   dt_print(DT_DEBUG_OPENCL, "[opencl_load_program] successfully loaded program from '%s' MD5: '%s'\n", filename, md5sum);
1713 
1714   return 1;
1715 }
1716 
dt_opencl_build_program(const int dev,const int prog,const char * binname,const char * cachedir,char * md5sum,int loaded_cached)1717 int dt_opencl_build_program(const int dev, const int prog, const char *binname, const char *cachedir,
1718                             char *md5sum, int loaded_cached)
1719 {
1720   if(prog < 0 || prog >= DT_OPENCL_MAX_PROGRAMS) return -1;
1721   dt_opencl_t *cl = darktable.opencl;
1722   cl_program program = cl->dev[dev].program[prog];
1723   cl_int err;
1724   err = (cl->dlocl->symbols->dt_clBuildProgram)(program, 1, &(cl->dev[dev].devid), cl->dev[dev].options, 0, 0);
1725 
1726   if(err != CL_SUCCESS)
1727     dt_print(DT_DEBUG_OPENCL, "[opencl_build_program] could not build program: %d\n", err);
1728   else
1729     dt_print(DT_DEBUG_OPENCL, "[opencl_build_program] successfully built program\n");
1730 
1731   cl_build_status build_status;
1732   (cl->dlocl->symbols->dt_clGetProgramBuildInfo)(program, cl->dev[dev].devid, CL_PROGRAM_BUILD_STATUS,
1733                                                  sizeof(cl_build_status), &build_status, NULL);
1734   dt_print(DT_DEBUG_OPENCL, "[opencl_build_program] BUILD STATUS: %d\n", build_status);
1735 
1736   char *build_log;
1737   size_t ret_val_size;
1738   (cl->dlocl->symbols->dt_clGetProgramBuildInfo)(program, cl->dev[dev].devid, CL_PROGRAM_BUILD_LOG, 0, NULL,
1739                                                  &ret_val_size);
1740   if(ret_val_size != SIZE_MAX)
1741   {
1742     build_log = (char *)malloc(sizeof(char) * (ret_val_size + 1));
1743     if(build_log)
1744     {
1745       (cl->dlocl->symbols->dt_clGetProgramBuildInfo)(program, cl->dev[dev].devid, CL_PROGRAM_BUILD_LOG,
1746                                                      ret_val_size, build_log, NULL);
1747 
1748       build_log[ret_val_size] = '\0';
1749 
1750       dt_print(DT_DEBUG_OPENCL, "BUILD LOG:\n");
1751       dt_print(DT_DEBUG_OPENCL, "%s\n", build_log);
1752 
1753       free(build_log);
1754     }
1755   }
1756 
1757   if(err != CL_SUCCESS)
1758     return err;
1759   else
1760   {
1761     if(!loaded_cached)
1762     {
1763       dt_print(DT_DEBUG_OPENCL, "[opencl_build_program] saving binary\n");
1764 
1765       cl_uint numdev = 0;
1766       err = (cl->dlocl->symbols->dt_clGetProgramInfo)(program, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint),
1767                                                       &numdev, NULL);
1768       if(err != CL_SUCCESS)
1769       {
1770         dt_print(DT_DEBUG_OPENCL, "[opencl_build_program] CL_PROGRAM_NUM_DEVICES failed: %d\n", err);
1771         return CL_SUCCESS;
1772       }
1773 
1774       cl_device_id *devices = malloc(sizeof(cl_device_id) * numdev);
1775       err = (cl->dlocl->symbols->dt_clGetProgramInfo)(program, CL_PROGRAM_DEVICES,
1776                                                       sizeof(cl_device_id) * numdev, devices, NULL);
1777       if(err != CL_SUCCESS)
1778       {
1779         dt_print(DT_DEBUG_OPENCL, "[opencl_build_program] CL_PROGRAM_DEVICES failed: %d\n", err);
1780         free(devices);
1781         return CL_SUCCESS;
1782       }
1783 
1784       size_t *binary_sizes = malloc(sizeof(size_t) * numdev);
1785       err = (cl->dlocl->symbols->dt_clGetProgramInfo)(program, CL_PROGRAM_BINARY_SIZES,
1786                                                       sizeof(size_t) * numdev, binary_sizes, NULL);
1787       if(err != CL_SUCCESS)
1788       {
1789         dt_print(DT_DEBUG_OPENCL, "[opencl_build_program] CL_PROGRAM_BINARY_SIZES failed: %d\n", err);
1790         free(binary_sizes);
1791         free(devices);
1792         return CL_SUCCESS;
1793       }
1794 
1795       unsigned char **binaries = malloc(sizeof(unsigned char *) * numdev);
1796       for(int i = 0; i < numdev; i++) binaries[i] = (unsigned char *)malloc(binary_sizes[i]);
1797       err = (cl->dlocl->symbols->dt_clGetProgramInfo)(program, CL_PROGRAM_BINARIES,
1798                                                       sizeof(unsigned char *) * numdev, binaries, NULL);
1799       if(err != CL_SUCCESS)
1800       {
1801         dt_print(DT_DEBUG_OPENCL, "[opencl_build_program] CL_PROGRAM_BINARIES failed: %d\n", err);
1802         goto ret;
1803       }
1804 
1805       for(int i = 0; i < numdev; i++)
1806         if(cl->dev[dev].devid == devices[i])
1807         {
1808           // save opencl compiled binary as md5sum-named file
1809           char link_dest[PATH_MAX] = { 0 };
1810           snprintf(link_dest, sizeof(link_dest), "%s" G_DIR_SEPARATOR_S "%s", cachedir, md5sum);
1811           FILE *f = g_fopen(link_dest, "wb");
1812           if(!f) goto ret;
1813           size_t bytes_written = fwrite(binaries[i], sizeof(char), binary_sizes[i], f);
1814           if(bytes_written != binary_sizes[i]) goto ret;
1815           fclose(f);
1816 
1817           // create link (e.g. basic.cl.bin -> f1430102c53867c162bb60af6c163328)
1818           char cwd[PATH_MAX] = { 0 };
1819           if(!getcwd(cwd, sizeof(cwd))) goto ret;
1820           if(chdir(cachedir) != 0) goto ret;
1821           char dup[PATH_MAX] = { 0 };
1822           g_strlcpy(dup, binname, sizeof(dup));
1823           char *bname = basename(dup);
1824 #if defined(_WIN32)
1825           //CreateSymbolicLink in Windows requires admin privileges, which we don't want/need
1826           //store has using a simple filerename
1827           char finalfilename[PATH_MAX] = { 0 };
1828           snprintf(finalfilename, sizeof(finalfilename), "%s" G_DIR_SEPARATOR_S "%s.%s", cachedir, bname, md5sum);
1829           rename(link_dest, finalfilename);
1830 #else
1831           if(symlink(md5sum, bname) != 0) goto ret;
1832 #endif //!defined(_WIN32)
1833           if(chdir(cwd) != 0) goto ret;
1834         }
1835 
1836     ret:
1837       for(int i = 0; i < numdev; i++) free(binaries[i]);
1838       free(binaries);
1839       free(binary_sizes);
1840       free(devices);
1841     }
1842     return CL_SUCCESS;
1843   }
1844 }
1845 
dt_opencl_create_kernel(const int prog,const char * name)1846 int dt_opencl_create_kernel(const int prog, const char *name)
1847 {
1848   dt_opencl_t *cl = darktable.opencl;
1849   if(!cl->inited) return -1;
1850   if(prog < 0 || prog >= DT_OPENCL_MAX_PROGRAMS) return -1;
1851   dt_pthread_mutex_lock(&cl->lock);
1852   int k = 0;
1853   for(int dev = 0; dev < cl->num_devs; dev++)
1854   {
1855     cl_int err;
1856     for(; k < DT_OPENCL_MAX_KERNELS; k++)
1857       if(!cl->dev[dev].kernel_used[k])
1858       {
1859         cl->dev[dev].kernel_used[k] = 1;
1860         cl->dev[dev].kernel[k]
1861             = (cl->dlocl->symbols->dt_clCreateKernel)(cl->dev[dev].program[prog], name, &err);
1862         if(err != CL_SUCCESS)
1863         {
1864           dt_print(DT_DEBUG_OPENCL, "[opencl_create_kernel] could not create kernel `%s'! (%d)\n", name, err);
1865           cl->dev[dev].kernel_used[k] = 0;
1866           goto error;
1867         }
1868         else
1869           break;
1870       }
1871     if(k < DT_OPENCL_MAX_KERNELS)
1872     {
1873       dt_print(DT_DEBUG_OPENCL, "[opencl_create_kernel] successfully loaded kernel `%s' (%d) for device %d\n",
1874                name, k, dev);
1875     }
1876     else
1877     {
1878       dt_print(DT_DEBUG_OPENCL, "[opencl_create_kernel] too many kernels! can't create kernel `%s'\n", name);
1879       goto error;
1880     }
1881   }
1882   dt_pthread_mutex_unlock(&cl->lock);
1883   return k;
1884 error:
1885   dt_pthread_mutex_unlock(&cl->lock);
1886   return -1;
1887 }
1888 
dt_opencl_free_kernel(const int kernel)1889 void dt_opencl_free_kernel(const int kernel)
1890 {
1891   dt_opencl_t *cl = darktable.opencl;
1892   if(!cl->inited) return;
1893   if(kernel < 0 || kernel >= DT_OPENCL_MAX_KERNELS) return;
1894   dt_pthread_mutex_lock(&cl->lock);
1895   for(int dev = 0; dev < cl->num_devs; dev++)
1896   {
1897     cl->dev[dev].kernel_used[kernel] = 0;
1898     (cl->dlocl->symbols->dt_clReleaseKernel)(cl->dev[dev].kernel[kernel]);
1899   }
1900   dt_pthread_mutex_unlock(&cl->lock);
1901 }
1902 
dt_opencl_get_max_work_item_sizes(const int dev,size_t * sizes)1903 int dt_opencl_get_max_work_item_sizes(const int dev, size_t *sizes)
1904 {
1905   dt_opencl_t *cl = darktable.opencl;
1906   if(!cl->inited || dev < 0) return -1;
1907   return (cl->dlocl->symbols->dt_clGetDeviceInfo)(cl->dev[dev].devid, CL_DEVICE_MAX_WORK_ITEM_SIZES,
1908                                                   sizeof(size_t) * 3, sizes, NULL);
1909 }
1910 
dt_opencl_get_work_group_limits(const int dev,size_t * sizes,size_t * workgroupsize,unsigned long * localmemsize)1911 int dt_opencl_get_work_group_limits(const int dev, size_t *sizes, size_t *workgroupsize,
1912                                     unsigned long *localmemsize)
1913 {
1914   dt_opencl_t *cl = darktable.opencl;
1915   if(!cl->inited || dev < 0) return -1;
1916   cl_ulong lmemsize;
1917   cl_int err;
1918 
1919   err = (cl->dlocl->symbols->dt_clGetDeviceInfo)(cl->dev[dev].devid, CL_DEVICE_LOCAL_MEM_SIZE,
1920                                                  sizeof(cl_ulong), &lmemsize, NULL);
1921   if(err != CL_SUCCESS) return err;
1922 
1923   *localmemsize = lmemsize;
1924 
1925   err = (cl->dlocl->symbols->dt_clGetDeviceInfo)(cl->dev[dev].devid, CL_DEVICE_MAX_WORK_GROUP_SIZE,
1926                                                  sizeof(size_t), workgroupsize, NULL);
1927   if(err != CL_SUCCESS) return err;
1928 
1929   return dt_opencl_get_max_work_item_sizes(dev, sizes);
1930 }
1931 
1932 
dt_opencl_get_kernel_work_group_size(const int dev,const int kernel,size_t * kernelworkgroupsize)1933 int dt_opencl_get_kernel_work_group_size(const int dev, const int kernel, size_t *kernelworkgroupsize)
1934 {
1935   dt_opencl_t *cl = darktable.opencl;
1936   if(!cl->inited || dev < 0) return -1;
1937   if(kernel < 0 || kernel >= DT_OPENCL_MAX_KERNELS) return -1;
1938 
1939   return (cl->dlocl->symbols->dt_clGetKernelWorkGroupInfo)(cl->dev[dev].kernel[kernel], cl->dev[dev].devid,
1940                                                            CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t),
1941                                                            kernelworkgroupsize, NULL);
1942 }
1943 
1944 
dt_opencl_set_kernel_arg(const int dev,const int kernel,const int num,const size_t size,const void * arg)1945 int dt_opencl_set_kernel_arg(const int dev, const int kernel, const int num, const size_t size,
1946                              const void *arg)
1947 {
1948   dt_opencl_t *cl = darktable.opencl;
1949   if(!cl->inited || dev < 0) return -1;
1950   if(kernel < 0 || kernel >= DT_OPENCL_MAX_KERNELS) return -1;
1951   return (cl->dlocl->symbols->dt_clSetKernelArg)(cl->dev[dev].kernel[kernel], num, size, arg);
1952 }
1953 
dt_opencl_enqueue_kernel_2d(const int dev,const int kernel,const size_t * sizes)1954 int dt_opencl_enqueue_kernel_2d(const int dev, const int kernel, const size_t *sizes)
1955 {
1956   return dt_opencl_enqueue_kernel_2d_with_local(dev, kernel, sizes, NULL);
1957 }
1958 
1959 
dt_opencl_enqueue_kernel_2d_with_local(const int dev,const int kernel,const size_t * sizes,const size_t * local)1960 int dt_opencl_enqueue_kernel_2d_with_local(const int dev, const int kernel, const size_t *sizes,
1961                                            const size_t *local)
1962 {
1963   dt_opencl_t *cl = darktable.opencl;
1964   if(!cl->inited || dev < 0) return -1;
1965   if(kernel < 0 || kernel >= DT_OPENCL_MAX_KERNELS) return -1;
1966   int err;
1967   char buf[256];
1968   buf[0] = '\0';
1969   if(darktable.unmuted & DT_DEBUG_OPENCL)
1970     (cl->dlocl->symbols->dt_clGetKernelInfo)(cl->dev[dev].kernel[kernel], CL_KERNEL_FUNCTION_NAME, 256, buf,
1971                                             NULL);
1972   cl_event *eventp = dt_opencl_events_get_slot(dev, buf);
1973   err = (cl->dlocl->symbols->dt_clEnqueueNDRangeKernel)(cl->dev[dev].cmd_queue, cl->dev[dev].kernel[kernel],
1974                                                         2, NULL, sizes, local, 0, NULL, eventp);
1975   // if (err == CL_SUCCESS) err = dt_opencl_finish(dev);
1976   return err;
1977 }
1978 
dt_opencl_copy_device_to_host(const int devid,void * host,void * device,const int width,const int height,const int bpp)1979 int dt_opencl_copy_device_to_host(const int devid, void *host, void *device, const int width,
1980                                   const int height, const int bpp)
1981 {
1982   return dt_opencl_read_host_from_device(devid, host, device, width, height, bpp);
1983 }
1984 
dt_opencl_read_host_from_device(const int devid,void * host,void * device,const int width,const int height,const int bpp)1985 int dt_opencl_read_host_from_device(const int devid, void *host, void *device, const int width,
1986                                     const int height, const int bpp)
1987 {
1988   return dt_opencl_read_host_from_device_rowpitch(devid, host, device, width, height, bpp * width);
1989 }
1990 
dt_opencl_read_host_from_device_rowpitch(const int devid,void * host,void * device,const int width,const int height,const int rowpitch)1991 int dt_opencl_read_host_from_device_rowpitch(const int devid, void *host, void *device, const int width,
1992                                              const int height, const int rowpitch)
1993 {
1994   if(!darktable.opencl->inited || devid < 0) return -1;
1995   const size_t origin[] = { 0, 0, 0 };
1996   const size_t region[] = { width, height, 1 };
1997   // blocking.
1998   return dt_opencl_read_host_from_device_raw(devid, host, device, origin, region, rowpitch, CL_TRUE);
1999 }
2000 
dt_opencl_read_host_from_device_non_blocking(const int devid,void * host,void * device,const int width,const int height,const int bpp)2001 int dt_opencl_read_host_from_device_non_blocking(const int devid, void *host, void *device, const int width,
2002                                                  const int height, const int bpp)
2003 {
2004   return dt_opencl_read_host_from_device_rowpitch_non_blocking(devid, host, device, width, height,
2005                                                                bpp * width);
2006 }
2007 
dt_opencl_read_host_from_device_rowpitch_non_blocking(const int devid,void * host,void * device,const int width,const int height,const int rowpitch)2008 int dt_opencl_read_host_from_device_rowpitch_non_blocking(const int devid, void *host, void *device,
2009                                                           const int width, const int height,
2010                                                           const int rowpitch)
2011 {
2012   if(!darktable.opencl->inited || devid < 0) return -1;
2013   const size_t origin[] = { 0, 0, 0 };
2014   const size_t region[] = { width, height, 1 };
2015   // non-blocking.
2016   return dt_opencl_read_host_from_device_raw(devid, host, device, origin, region, rowpitch, CL_FALSE);
2017 }
2018 
2019 
dt_opencl_read_host_from_device_raw(const int devid,void * host,void * device,const size_t * origin,const size_t * region,const int rowpitch,const int blocking)2020 int dt_opencl_read_host_from_device_raw(const int devid, void *host, void *device, const size_t *origin,
2021                                         const size_t *region, const int rowpitch, const int blocking)
2022 {
2023   if(!darktable.opencl->inited) return -1;
2024 
2025   cl_event *eventp = dt_opencl_events_get_slot(devid, "[Read Image (from device to host)]");
2026 
2027   return (darktable.opencl->dlocl->symbols->dt_clEnqueueReadImage)(darktable.opencl->dev[devid].cmd_queue,
2028                                                                    device, blocking, origin, region, rowpitch,
2029                                                                    0, host, 0, NULL, eventp);
2030 }
2031 
dt_opencl_write_host_to_device(const int devid,void * host,void * device,const int width,const int height,const int bpp)2032 int dt_opencl_write_host_to_device(const int devid, void *host, void *device, const int width,
2033                                    const int height, const int bpp)
2034 {
2035   return dt_opencl_write_host_to_device_rowpitch(devid, host, device, width, height, width * bpp);
2036 }
2037 
dt_opencl_write_host_to_device_rowpitch(const int devid,void * host,void * device,const int width,const int height,const int rowpitch)2038 int dt_opencl_write_host_to_device_rowpitch(const int devid, void *host, void *device, const int width,
2039                                             const int height, const int rowpitch)
2040 {
2041   if(!darktable.opencl->inited || devid < 0) return -1;
2042   const size_t origin[] = { 0, 0, 0 };
2043   const size_t region[] = { width, height, 1 };
2044   // blocking.
2045   return dt_opencl_write_host_to_device_raw(devid, host, device, origin, region, rowpitch, CL_TRUE);
2046 }
2047 
dt_opencl_write_host_to_device_non_blocking(const int devid,void * host,void * device,const int width,const int height,const int bpp)2048 int dt_opencl_write_host_to_device_non_blocking(const int devid, void *host, void *device, const int width,
2049                                                 const int height, const int bpp)
2050 {
2051   return dt_opencl_write_host_to_device_rowpitch_non_blocking(devid, host, device, width, height, width * bpp);
2052 }
2053 
dt_opencl_write_host_to_device_rowpitch_non_blocking(const int devid,void * host,void * device,const int width,const int height,const int rowpitch)2054 int dt_opencl_write_host_to_device_rowpitch_non_blocking(const int devid, void *host, void *device,
2055                                                          const int width, const int height,
2056                                                          const int rowpitch)
2057 {
2058   if(!darktable.opencl->inited || devid < 0) return -1;
2059   const size_t origin[] = { 0, 0, 0 };
2060   const size_t region[] = { width, height, 1 };
2061   // non-blocking.
2062   return dt_opencl_write_host_to_device_raw(devid, host, device, origin, region, rowpitch, CL_FALSE);
2063 }
2064 
dt_opencl_write_host_to_device_raw(const int devid,void * host,void * device,const size_t * origin,const size_t * region,const int rowpitch,const int blocking)2065 int dt_opencl_write_host_to_device_raw(const int devid, void *host, void *device, const size_t *origin,
2066                                        const size_t *region, const int rowpitch, const int blocking)
2067 {
2068   if(!darktable.opencl->inited) return -1;
2069 
2070   cl_event *eventp = dt_opencl_events_get_slot(devid, "[Write Image (from host to device)]");
2071 
2072   return (darktable.opencl->dlocl->symbols->dt_clEnqueueWriteImage)(darktable.opencl->dev[devid].cmd_queue,
2073                                                                     device, blocking, origin, region,
2074                                                                     rowpitch, 0, host, 0, NULL, eventp);
2075 }
2076 
dt_opencl_enqueue_copy_image(const int devid,cl_mem src,cl_mem dst,size_t * orig_src,size_t * orig_dst,size_t * region)2077 int dt_opencl_enqueue_copy_image(const int devid, cl_mem src, cl_mem dst, size_t *orig_src, size_t *orig_dst,
2078                                  size_t *region)
2079 {
2080   if(!darktable.opencl->inited || devid < 0) return -1;
2081   cl_int err;
2082   cl_event *eventp = dt_opencl_events_get_slot(devid, "[Copy Image (on device)]");
2083   err = (darktable.opencl->dlocl->symbols->dt_clEnqueueCopyImage)(
2084       darktable.opencl->dev[devid].cmd_queue, src, dst, orig_src, orig_dst, region, 0, NULL, eventp);
2085   if(err != CL_SUCCESS) dt_print(DT_DEBUG_OPENCL, "[opencl copy_image] could not copy image: %d\n", err);
2086   return err;
2087 }
2088 
dt_opencl_enqueue_copy_image_to_buffer(const int devid,cl_mem src_image,cl_mem dst_buffer,size_t * origin,size_t * region,size_t offset)2089 int dt_opencl_enqueue_copy_image_to_buffer(const int devid, cl_mem src_image, cl_mem dst_buffer,
2090                                            size_t *origin, size_t *region, size_t offset)
2091 {
2092   if(!darktable.opencl->inited) return -1;
2093   cl_int err;
2094   cl_event *eventp = dt_opencl_events_get_slot(devid, "[Copy Image to Buffer (on device)]");
2095   err = (darktable.opencl->dlocl->symbols->dt_clEnqueueCopyImageToBuffer)(
2096       darktable.opencl->dev[devid].cmd_queue, src_image, dst_buffer, origin, region, offset, 0, NULL, eventp);
2097   if(err != CL_SUCCESS)
2098     dt_print(DT_DEBUG_OPENCL, "[opencl copy_image_to_buffer] could not copy image: %d\n", err);
2099   return err;
2100 }
2101 
dt_opencl_enqueue_copy_buffer_to_image(const int devid,cl_mem src_buffer,cl_mem dst_image,size_t offset,size_t * origin,size_t * region)2102 int dt_opencl_enqueue_copy_buffer_to_image(const int devid, cl_mem src_buffer, cl_mem dst_image,
2103                                            size_t offset, size_t *origin, size_t *region)
2104 {
2105   if(!darktable.opencl->inited) return -1;
2106   cl_int err;
2107   cl_event *eventp = dt_opencl_events_get_slot(devid, "[Copy Buffer to Image (on device)]");
2108   err = (darktable.opencl->dlocl->symbols->dt_clEnqueueCopyBufferToImage)(
2109       darktable.opencl->dev[devid].cmd_queue, src_buffer, dst_image, offset, origin, region, 0, NULL, eventp);
2110   if(err != CL_SUCCESS)
2111     dt_print(DT_DEBUG_OPENCL, "[opencl copy_buffer_to_image] could not copy buffer: %d\n", err);
2112   return err;
2113 }
2114 
dt_opencl_enqueue_copy_buffer_to_buffer(const int devid,cl_mem src_buffer,cl_mem dst_buffer,size_t srcoffset,size_t dstoffset,size_t size)2115 int dt_opencl_enqueue_copy_buffer_to_buffer(const int devid, cl_mem src_buffer, cl_mem dst_buffer,
2116                                             size_t srcoffset, size_t dstoffset, size_t size)
2117 {
2118   if(!darktable.opencl->inited) return -1;
2119   cl_int err;
2120   cl_event *eventp = dt_opencl_events_get_slot(devid, "[Copy Buffer to Buffer (on device)]");
2121   err = (darktable.opencl->dlocl->symbols->dt_clEnqueueCopyBuffer)(darktable.opencl->dev[devid].cmd_queue,
2122                                                                    src_buffer, dst_buffer, srcoffset,
2123                                                                    dstoffset, size, 0, NULL, eventp);
2124   if(err != CL_SUCCESS)
2125     dt_print(DT_DEBUG_OPENCL, "[opencl copy_buffer_to_buffer] could not copy buffer: %d\n", err);
2126   return err;
2127 }
2128 
dt_opencl_read_buffer_from_device(const int devid,void * host,void * device,const size_t offset,const size_t size,const int blocking)2129 int dt_opencl_read_buffer_from_device(const int devid, void *host, void *device, const size_t offset,
2130                                       const size_t size, const int blocking)
2131 {
2132   if(!darktable.opencl->inited) return -1;
2133 
2134   cl_event *eventp = dt_opencl_events_get_slot(devid, "[Read Buffer (from device to host)]");
2135 
2136   return (darktable.opencl->dlocl->symbols->dt_clEnqueueReadBuffer)(
2137       darktable.opencl->dev[devid].cmd_queue, device, blocking, offset, size, host, 0, NULL, eventp);
2138 }
2139 
dt_opencl_write_buffer_to_device(const int devid,void * host,void * device,const size_t offset,const size_t size,const int blocking)2140 int dt_opencl_write_buffer_to_device(const int devid, void *host, void *device, const size_t offset,
2141                                      const size_t size, const int blocking)
2142 {
2143   if(!darktable.opencl->inited) return -1;
2144 
2145   cl_event *eventp = dt_opencl_events_get_slot(devid, "[Write Buffer (from host to device)]");
2146 
2147   return (darktable.opencl->dlocl->symbols->dt_clEnqueueWriteBuffer)(
2148       darktable.opencl->dev[devid].cmd_queue, device, blocking, offset, size, host, 0, NULL, eventp);
2149 }
2150 
2151 
dt_opencl_copy_host_to_device_constant(const int devid,const size_t size,void * host)2152 void *dt_opencl_copy_host_to_device_constant(const int devid, const size_t size, void *host)
2153 {
2154   if(!darktable.opencl->inited || devid < 0) return NULL;
2155   cl_int err;
2156   cl_mem dev = (darktable.opencl->dlocl->symbols->dt_clCreateBuffer)(
2157       darktable.opencl->dev[devid].context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, size, host, &err);
2158   if(err != CL_SUCCESS)
2159     dt_print(DT_DEBUG_OPENCL,
2160              "[opencl copy_host_to_device_constant] could not alloc buffer on device %d: %d\n", devid, err);
2161 
2162   dt_opencl_memory_statistics(devid, dev, OPENCL_MEMORY_ADD);
2163 
2164   return dev;
2165 }
2166 
dt_opencl_copy_host_to_device(const int devid,void * host,const int width,const int height,const int bpp)2167 void *dt_opencl_copy_host_to_device(const int devid, void *host, const int width, const int height,
2168                                     const int bpp)
2169 {
2170   return dt_opencl_copy_host_to_device_rowpitch(devid, host, width, height, bpp, 0);
2171 }
2172 
dt_opencl_copy_host_to_device_rowpitch(const int devid,void * host,const int width,const int height,const int bpp,const int rowpitch)2173 void *dt_opencl_copy_host_to_device_rowpitch(const int devid, void *host, const int width, const int height,
2174                                              const int bpp, const int rowpitch)
2175 {
2176   if(!darktable.opencl->inited || devid < 0) return NULL;
2177   cl_int err;
2178   cl_image_format fmt;
2179   // guess pixel format from bytes per pixel
2180   if(bpp == 4 * sizeof(float))
2181     fmt = (cl_image_format){ CL_RGBA, CL_FLOAT };
2182   else if(bpp == sizeof(float))
2183     fmt = (cl_image_format){ CL_R, CL_FLOAT };
2184   else if(bpp == sizeof(uint16_t))
2185     fmt = (cl_image_format){ CL_R, CL_UNSIGNED_INT16 };
2186   else
2187     return NULL;
2188 
2189   // TODO: if fmt = uint16_t, blow up to 4xuint16_t and copy manually!
2190   cl_mem dev = (darktable.opencl->dlocl->symbols->dt_clCreateImage2D)(
2191       darktable.opencl->dev[devid].context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, &fmt, width, height,
2192       rowpitch, host, &err);
2193   if(err != CL_SUCCESS)
2194     dt_print(DT_DEBUG_OPENCL,
2195              "[opencl copy_host_to_device] could not alloc/copy img buffer on device %d: %d\n", devid, err);
2196 
2197   dt_opencl_memory_statistics(devid, dev, OPENCL_MEMORY_ADD);
2198 
2199   return dev;
2200 }
2201 
2202 
dt_opencl_release_mem_object(cl_mem mem)2203 void dt_opencl_release_mem_object(cl_mem mem)
2204 {
2205   if(!darktable.opencl->inited) return;
2206 
2207   // the OpenCL specs are not absolutely clear if clReleaseMemObject(NULL) is a no-op. we take care of the
2208   // case in a centralized way at this place
2209   if(mem == NULL) return;
2210 
2211   dt_opencl_memory_statistics(-1, mem, OPENCL_MEMORY_SUB);
2212 
2213   (darktable.opencl->dlocl->symbols->dt_clReleaseMemObject)(mem);
2214 }
2215 
dt_opencl_map_buffer(const int devid,cl_mem buffer,const int blocking,const int flags,size_t offset,size_t size)2216 void *dt_opencl_map_buffer(const int devid, cl_mem buffer, const int blocking, const int flags, size_t offset,
2217                            size_t size)
2218 {
2219   if(!darktable.opencl->inited) return NULL;
2220   cl_int err;
2221   void *ptr;
2222   cl_event *eventp = dt_opencl_events_get_slot(devid, "[Map Buffer]");
2223   ptr = (darktable.opencl->dlocl->symbols->dt_clEnqueueMapBuffer)(
2224       darktable.opencl->dev[devid].cmd_queue, buffer, blocking, flags, offset, size, 0, NULL, eventp, &err);
2225   if(err != CL_SUCCESS) dt_print(DT_DEBUG_OPENCL, "[opencl map buffer] could not map buffer: %d\n", err);
2226   return ptr;
2227 }
2228 
dt_opencl_unmap_mem_object(const int devid,cl_mem mem_object,void * mapped_ptr)2229 int dt_opencl_unmap_mem_object(const int devid, cl_mem mem_object, void *mapped_ptr)
2230 {
2231   if(!darktable.opencl->inited) return -1;
2232   cl_int err;
2233   cl_event *eventp = dt_opencl_events_get_slot(devid, "[Unmap Mem Object]");
2234   err = (darktable.opencl->dlocl->symbols->dt_clEnqueueUnmapMemObject)(
2235       darktable.opencl->dev[devid].cmd_queue, mem_object, mapped_ptr, 0, NULL, eventp);
2236   if(err != CL_SUCCESS)
2237     dt_print(DT_DEBUG_OPENCL, "[opencl unmap mem object] could not unmap mem object: %d\n", err);
2238   return err;
2239 }
2240 
dt_opencl_alloc_device(const int devid,const int width,const int height,const int bpp)2241 void *dt_opencl_alloc_device(const int devid, const int width, const int height, const int bpp)
2242 {
2243   if(!darktable.opencl->inited || devid < 0) return NULL;
2244   cl_int err;
2245   cl_image_format fmt;
2246   // guess pixel format from bytes per pixel
2247   if(bpp == 4 * sizeof(float))
2248     fmt = (cl_image_format){ CL_RGBA, CL_FLOAT };
2249   else if(bpp == sizeof(float))
2250     fmt = (cl_image_format){ CL_R, CL_FLOAT };
2251   else if(bpp == sizeof(uint16_t))
2252     fmt = (cl_image_format){ CL_R, CL_UNSIGNED_INT16 };
2253   else
2254     return NULL;
2255 
2256   cl_mem dev = (darktable.opencl->dlocl->symbols->dt_clCreateImage2D)(
2257       darktable.opencl->dev[devid].context, CL_MEM_READ_WRITE, &fmt, width, height, 0, NULL, &err);
2258   if(err != CL_SUCCESS)
2259     dt_print(DT_DEBUG_OPENCL, "[opencl alloc_device] could not alloc img buffer on device %d: %d\n", devid,
2260              err);
2261 
2262   dt_opencl_memory_statistics(devid, dev, OPENCL_MEMORY_ADD);
2263 
2264   return dev;
2265 }
2266 
2267 
dt_opencl_alloc_device_use_host_pointer(const int devid,const int width,const int height,const int bpp,const int rowpitch,void * host)2268 void *dt_opencl_alloc_device_use_host_pointer(const int devid, const int width, const int height,
2269                                               const int bpp, const int rowpitch, void *host)
2270 {
2271   if(!darktable.opencl->inited || devid < 0) return NULL;
2272   cl_int err;
2273   cl_image_format fmt;
2274   // guess pixel format from bytes per pixel
2275   if(bpp == 4 * sizeof(float))
2276     fmt = (cl_image_format){ CL_RGBA, CL_FLOAT };
2277   else if(bpp == sizeof(float))
2278     fmt = (cl_image_format){ CL_R, CL_FLOAT };
2279   else if(bpp == sizeof(uint16_t))
2280     fmt = (cl_image_format){ CL_R, CL_UNSIGNED_INT16 };
2281   else
2282     return NULL;
2283 
2284   cl_mem dev = (darktable.opencl->dlocl->symbols->dt_clCreateImage2D)(
2285       darktable.opencl->dev[devid].context,
2286       CL_MEM_READ_WRITE | ((host == NULL) ? CL_MEM_ALLOC_HOST_PTR : CL_MEM_USE_HOST_PTR), &fmt, width, height,
2287       rowpitch, host, &err);
2288   if(err != CL_SUCCESS)
2289     dt_print(DT_DEBUG_OPENCL,
2290              "[opencl alloc_device_use_host_pointer] could not alloc img buffer on device %d: %d\n", devid,
2291              err);
2292 
2293   dt_opencl_memory_statistics(devid, dev, OPENCL_MEMORY_ADD);
2294 
2295   return dev;
2296 }
2297 
2298 
dt_opencl_alloc_device_buffer(const int devid,const size_t size)2299 void *dt_opencl_alloc_device_buffer(const int devid, const size_t size)
2300 {
2301   if(!darktable.opencl->inited) return NULL;
2302   cl_int err;
2303 
2304   cl_mem buf = (darktable.opencl->dlocl->symbols->dt_clCreateBuffer)(darktable.opencl->dev[devid].context,
2305                                                                      CL_MEM_READ_WRITE, size, NULL, &err);
2306   if(err != CL_SUCCESS)
2307     dt_print(DT_DEBUG_OPENCL, "[opencl alloc_device_buffer] could not alloc buffer on device %d: %d\n", devid,
2308              err);
2309 
2310   dt_opencl_memory_statistics(devid, buf, OPENCL_MEMORY_ADD);
2311 
2312   return buf;
2313 }
2314 
dt_opencl_alloc_device_buffer_with_flags(const int devid,const size_t size,const int flags)2315 void *dt_opencl_alloc_device_buffer_with_flags(const int devid, const size_t size, const int flags)
2316 {
2317   if(!darktable.opencl->inited) return NULL;
2318   cl_int err;
2319 
2320   cl_mem buf = (darktable.opencl->dlocl->symbols->dt_clCreateBuffer)(darktable.opencl->dev[devid].context,
2321                                                                      flags, size, NULL, &err);
2322   if(err != CL_SUCCESS)
2323     dt_print(DT_DEBUG_OPENCL, "[opencl alloc_device_buffer] could not alloc buffer on device %d: %d\n", devid,
2324              err);
2325 
2326   dt_opencl_memory_statistics(devid, buf, OPENCL_MEMORY_ADD);
2327 
2328   return buf;
2329 }
2330 
dt_opencl_get_mem_object_size(cl_mem mem)2331 size_t dt_opencl_get_mem_object_size(cl_mem mem)
2332 {
2333   cl_int err;
2334   size_t size;
2335   if(mem == NULL) return 0;
2336 
2337   err = (darktable.opencl->dlocl->symbols->dt_clGetMemObjectInfo)(mem, CL_MEM_SIZE, sizeof(size), &size, NULL);
2338 
2339   return (err == CL_SUCCESS) ? size : 0;
2340 }
2341 
dt_opencl_get_mem_context_id(cl_mem mem)2342 int dt_opencl_get_mem_context_id(cl_mem mem)
2343 {
2344   cl_int err;
2345   cl_context context;
2346   if(mem == NULL) return -1;
2347 
2348   err = (darktable.opencl->dlocl->symbols->dt_clGetMemObjectInfo)(mem, CL_MEM_CONTEXT, sizeof(context), &context, NULL);
2349   if(err != CL_SUCCESS)
2350     return -1;
2351 
2352   for(int devid = 0; devid < darktable.opencl->num_devs; devid++)
2353   {
2354     if(darktable.opencl->dev[devid].context == context)
2355       return devid;
2356   }
2357 
2358   return -1;
2359 }
2360 
dt_opencl_get_image_width(cl_mem mem)2361 int dt_opencl_get_image_width(cl_mem mem)
2362 {
2363   cl_int err;
2364   size_t size;
2365   if(mem == NULL) return 0;
2366 
2367   err = (darktable.opencl->dlocl->symbols->dt_clGetImageInfo)(mem, CL_IMAGE_WIDTH, sizeof(size), &size, NULL);
2368   if(size > INT_MAX) size = 0;
2369 
2370   return (err == CL_SUCCESS) ? (int)size : 0;
2371 }
2372 
dt_opencl_get_image_height(cl_mem mem)2373 int dt_opencl_get_image_height(cl_mem mem)
2374 {
2375   cl_int err;
2376   size_t size;
2377   if(mem == NULL) return 0;
2378 
2379   err = (darktable.opencl->dlocl->symbols->dt_clGetImageInfo)(mem, CL_IMAGE_HEIGHT, sizeof(size), &size, NULL);
2380   if(size > INT_MAX) size = 0;
2381 
2382   return (err == CL_SUCCESS) ? (int)size : 0;
2383 }
2384 
dt_opencl_get_image_element_size(cl_mem mem)2385 int dt_opencl_get_image_element_size(cl_mem mem)
2386 {
2387   cl_int err;
2388   size_t size;
2389   if(mem == NULL) return 0;
2390 
2391   err = (darktable.opencl->dlocl->symbols->dt_clGetImageInfo)(mem, CL_IMAGE_ELEMENT_SIZE, sizeof(size), &size,
2392                                                               NULL);
2393   if(size > INT_MAX) size = 0;
2394 
2395   return (err == CL_SUCCESS) ? (int)size : 0;
2396 }
2397 
dt_opencl_memory_statistics(int devid,cl_mem mem,dt_opencl_memory_t action)2398 void dt_opencl_memory_statistics(int devid, cl_mem mem, dt_opencl_memory_t action)
2399 {
2400   if(!((darktable.unmuted & DT_DEBUG_MEMORY) && (darktable.unmuted & DT_DEBUG_OPENCL)))
2401     return;
2402 
2403   if(devid < 0)
2404     devid = dt_opencl_get_mem_context_id(mem);
2405 
2406   if(devid < 0)
2407     return;
2408 
2409   if(action == OPENCL_MEMORY_ADD)
2410     darktable.opencl->dev[devid].memory_in_use += dt_opencl_get_mem_object_size(mem);
2411   else
2412     darktable.opencl->dev[devid].memory_in_use -= dt_opencl_get_mem_object_size(mem);
2413 
2414   darktable.opencl->dev[devid].peak_memory = MAX(darktable.opencl->dev[devid].peak_memory,
2415                                                  darktable.opencl->dev[devid].memory_in_use);
2416 
2417   if(darktable.unmuted & DT_DEBUG_MEMORY)
2418     dt_print(DT_DEBUG_OPENCL,
2419               "[opencl memory] device %d: %zu bytes (%.1f MB) in use\n", devid, darktable.opencl->dev[devid].memory_in_use,
2420                                       (float)darktable.opencl->dev[devid].memory_in_use/(1024*1024));
2421 }
2422 
2423 /** check if image size fit into limits given by OpenCL runtime */
dt_opencl_image_fits_device(const int devid,const size_t width,const size_t height,const unsigned bpp,const float factor,const size_t overhead)2424 int dt_opencl_image_fits_device(const int devid, const size_t width, const size_t height, const unsigned bpp,
2425                                 const float factor, const size_t overhead)
2426 {
2427   static float headroom = -1.0f;
2428 
2429   if(!darktable.opencl->inited || devid < 0) return FALSE;
2430 
2431   /* first time run */
2432   if(headroom < 0.0f)
2433   {
2434     headroom = dt_conf_get_float("opencl_memory_headroom") * 1024.0f * 1024.0f;
2435 
2436     /* don't let the user play games with us */
2437     headroom = fmin((float)darktable.opencl->dev[devid].max_global_mem, fmax(headroom, 0.0f));
2438     dt_conf_set_int("opencl_memory_headroom", headroom / 1024 / 1024);
2439   }
2440 
2441   float singlebuffer = (float)width * height * bpp;
2442   float total = factor * singlebuffer + overhead;
2443 
2444   if(darktable.opencl->dev[devid].max_image_width < width
2445      || darktable.opencl->dev[devid].max_image_height < height)
2446     return FALSE;
2447 
2448   if(darktable.opencl->dev[devid].max_mem_alloc < singlebuffer) return FALSE;
2449 
2450   if(darktable.opencl->dev[devid].max_global_mem < total + headroom) return FALSE;
2451 
2452   return TRUE;
2453 }
2454 
2455 
2456 /** round size to a multiple of the value given in config parameter opencl_size_roundup */
dt_opencl_roundup(int size)2457 int dt_opencl_roundup(int size)
2458 {
2459   static int roundup = -1;
2460 
2461   /* first time run */
2462   if(roundup < 0)
2463   {
2464     roundup = dt_conf_get_int("opencl_size_roundup");
2465 
2466     /* if not yet defined (or unsane), set a sane default */
2467     if(roundup <= 0)
2468     {
2469       roundup = 16;
2470       dt_conf_set_int("opencl_size_roundup", roundup);
2471     }
2472   }
2473 
2474   return (size % roundup == 0 ? size : (size / roundup + 1) * roundup);
2475 }
2476 
2477 
2478 /** check if opencl is inited */
dt_opencl_is_inited(void)2479 int dt_opencl_is_inited(void)
2480 {
2481   return darktable.opencl->inited;
2482 }
2483 
2484 
2485 /** check if opencl is enabled */
dt_opencl_is_enabled(void)2486 int dt_opencl_is_enabled(void)
2487 {
2488   if(!darktable.opencl->inited) return FALSE;
2489   return darktable.opencl->enabled;
2490 }
2491 
2492 
2493 /** disable opencl */
dt_opencl_disable(void)2494 void dt_opencl_disable(void)
2495 {
2496   if(!darktable.opencl->inited) return;
2497   darktable.opencl->enabled = FALSE;
2498   dt_conf_set_bool("opencl", FALSE);
2499 }
2500 
2501 
2502 /** update enabled flag and profile with value from preferences, returns enabled flag */
dt_opencl_update_settings(void)2503 int dt_opencl_update_settings(void)
2504 {
2505   // FIXME: This pulls in prefs every time the pixelpipe runs. Instead have a callback for DT_SIGNAL_PREFERENCES_CHANGE?
2506   if(!darktable.opencl->inited) return FALSE;
2507   const int prefs = dt_conf_get_bool("opencl");
2508 
2509   if(darktable.opencl->enabled != prefs)
2510   {
2511     darktable.opencl->enabled = prefs;
2512     darktable.opencl->stopped = 0;
2513     darktable.opencl->error_count = 0;
2514     dt_print(DT_DEBUG_OPENCL, "[opencl_update_enabled] enabled flag set to %s\n", prefs ? "ON" : "OFF");
2515   }
2516 
2517   dt_opencl_scheduling_profile_t profile = dt_opencl_get_scheduling_profile();
2518 
2519   if(darktable.opencl->scheduling_profile != profile)
2520   {
2521     char *pstr = dt_conf_get_string("opencl_scheduling_profile");
2522     dt_print(DT_DEBUG_OPENCL, "[opencl_update_scheduling_profile] scheduling profile set to %s\n", pstr);
2523     g_free(pstr);
2524     dt_opencl_apply_scheduling_profile(profile);
2525   }
2526 
2527   dt_opencl_sync_cache_t sync = dt_opencl_get_sync_cache();
2528 
2529   if(darktable.opencl->sync_cache != sync)
2530   {
2531     char *pstr = dt_conf_get_string("opencl_synch_cache");
2532     dt_print(DT_DEBUG_OPENCL, "[opencl_update_synch_cache] sync cache set to %s\n", pstr);
2533     g_free(pstr);
2534     darktable.opencl->sync_cache = sync;
2535   }
2536 
2537   return (darktable.opencl->enabled && !darktable.opencl->stopped);
2538 }
2539 
2540 /** read scheduling profile for config variables */
dt_opencl_get_scheduling_profile(void)2541 static dt_opencl_scheduling_profile_t dt_opencl_get_scheduling_profile(void)
2542 {
2543   char *pstr = dt_conf_get_string("opencl_scheduling_profile");
2544   if(!pstr) return OPENCL_PROFILE_DEFAULT;
2545 
2546   dt_opencl_scheduling_profile_t profile = OPENCL_PROFILE_DEFAULT;
2547 
2548   if(!strcmp(pstr, "multiple GPUs"))
2549     profile = OPENCL_PROFILE_MULTIPLE_GPUS;
2550   else if(!strcmp(pstr, "very fast GPU"))
2551     profile = OPENCL_PROFILE_VERYFAST_GPU;
2552 
2553   g_free(pstr);
2554 
2555   return profile;
2556 }
2557 
2558 /** read config of when/if to synch to cache */
dt_opencl_get_sync_cache(void)2559 static dt_opencl_sync_cache_t dt_opencl_get_sync_cache(void)
2560 {
2561   char *pstr = dt_conf_get_string("opencl_synch_cache");
2562   if(!pstr) return OPENCL_SYNC_ACTIVE_MODULE;
2563 
2564   dt_opencl_sync_cache_t sync = OPENCL_SYNC_ACTIVE_MODULE;
2565 
2566   if(!strcmp(pstr, "true"))
2567     sync = OPENCL_SYNC_TRUE;
2568   else if(!strcmp(pstr, "false"))
2569     sync = OPENCL_SYNC_FALSE;
2570 
2571   g_free(pstr);
2572 
2573   return sync;
2574 }
2575 
2576 /** set opencl specific synchronization timeout */
dt_opencl_set_synchronization_timeout(int value)2577 static void dt_opencl_set_synchronization_timeout(int value)
2578 {
2579   darktable.opencl->opencl_synchronization_timeout = value;
2580   dt_print(DT_DEBUG_OPENCL, "[opencl_synchronization_timeout] synchronization timeout set to %d\n", value);
2581 }
2582 
2583 /** adjust opencl subsystem according to scheduling profile */
dt_opencl_apply_scheduling_profile(dt_opencl_scheduling_profile_t profile)2584 static void dt_opencl_apply_scheduling_profile(dt_opencl_scheduling_profile_t profile)
2585 {
2586   char *str;
2587 
2588   dt_pthread_mutex_lock(&darktable.opencl->lock);
2589   darktable.opencl->scheduling_profile = profile;
2590 
2591   switch(profile)
2592   {
2593     case OPENCL_PROFILE_MULTIPLE_GPUS:
2594       dt_opencl_update_priorities("*/*/*/*/*");
2595       dt_opencl_set_synchronization_timeout(20);
2596       break;
2597     case OPENCL_PROFILE_VERYFAST_GPU:
2598       dt_opencl_update_priorities("+*/+*/+*/+*/+*");
2599       dt_opencl_set_synchronization_timeout(0);
2600       break;
2601     case OPENCL_PROFILE_DEFAULT:
2602     default:
2603       str = dt_conf_get_string("opencl_device_priority");
2604       dt_opencl_update_priorities(str);
2605       g_free(str);
2606       dt_opencl_set_synchronization_timeout(dt_conf_get_int("pixelpipe_synchronization_timeout"));
2607       break;
2608   }
2609   dt_pthread_mutex_unlock(&darktable.opencl->lock);
2610 }
2611 
2612 /** get global memory of device */
dt_opencl_get_max_global_mem(const int devid)2613 cl_ulong dt_opencl_get_max_global_mem(const int devid)
2614 {
2615   if(!darktable.opencl->inited || devid < 0) return 0;
2616   return darktable.opencl->dev[devid].max_global_mem;
2617 }
2618 
2619 
2620 /** the following eventlist functions assume that affected structures are locked upstream */
2621 
2622 /** get next free slot in eventlist (and manage size of eventlist) */
dt_opencl_events_get_slot(const int devid,const char * tag)2623 cl_event *dt_opencl_events_get_slot(const int devid, const char *tag)
2624 {
2625   dt_opencl_t *cl = darktable.opencl;
2626   if(!cl->inited || devid < 0) return NULL;
2627   if(!cl->use_events) return NULL;
2628 
2629   static const cl_event zeroevent[1]; // implicitly initialized to zero
2630   cl_event **eventlist = &(cl->dev[devid].eventlist);
2631   dt_opencl_eventtag_t **eventtags = &(cl->dev[devid].eventtags);
2632   int *numevents = &(cl->dev[devid].numevents);
2633   int *maxevents = &(cl->dev[devid].maxevents);
2634   int *eventsconsolidated = &(cl->dev[devid].eventsconsolidated);
2635   int *lostevents = &(cl->dev[devid].lostevents);
2636   int *totalevents = &(cl->dev[devid].totalevents);
2637   int *totallost = &(cl->dev[devid].totallost);
2638 
2639   // if first time called: allocate initial buffers
2640   if(*eventlist == NULL)
2641   {
2642     int newevents = DT_OPENCL_EVENTLISTSIZE;
2643     *eventlist = calloc(newevents, sizeof(cl_event));
2644     *eventtags = calloc(newevents, sizeof(dt_opencl_eventtag_t));
2645     if(!*eventlist || !*eventtags)
2646     {
2647       free(*eventlist);
2648       free(*eventtags);
2649       *eventlist = NULL;
2650       *eventtags = NULL;
2651       return NULL;
2652     }
2653     *maxevents = newevents;
2654   }
2655 
2656   // check if currently highest event slot was actually consumed. If not use it again
2657   if(*numevents > 0 && !memcmp((*eventlist) + *numevents - 1, zeroevent, sizeof(cl_event)))
2658   {
2659     (*lostevents)++;
2660     (*totallost)++;
2661     if(tag != NULL)
2662     {
2663       g_strlcpy((*eventtags)[*numevents - 1].tag, tag, DT_OPENCL_EVENTNAMELENGTH);
2664     }
2665     else
2666     {
2667       (*eventtags)[*numevents - 1].tag[0] = '\0';
2668     }
2669 
2670     (*totalevents)++;
2671     return (*eventlist) + *numevents - 1;
2672   }
2673 
2674   // check if we would exceed the number of available event handles. In that case first flush existing handles
2675   if(*numevents - *eventsconsolidated + 1 > darktable.opencl->number_event_handles)
2676     (void)dt_opencl_events_flush(devid, FALSE);
2677 
2678 
2679   // if no more space left in eventlist: grow buffer
2680   if(*numevents == *maxevents)
2681   {
2682     int newevents = *maxevents + DT_OPENCL_EVENTLISTSIZE;
2683     cl_event *neweventlist = calloc(newevents, sizeof(cl_event));
2684     dt_opencl_eventtag_t *neweventtags = calloc(newevents, sizeof(dt_opencl_eventtag_t));
2685     if(!neweventlist || !neweventtags)
2686     {
2687       free(neweventlist);
2688       free(neweventtags);
2689       return NULL;
2690     }
2691     memcpy(neweventlist, *eventlist, sizeof(cl_event) * *maxevents);
2692     memcpy(neweventtags, *eventtags, sizeof(dt_opencl_eventtag_t) * *maxevents);
2693     free(*eventlist);
2694     free(*eventtags);
2695     *eventlist = neweventlist;
2696     *eventtags = neweventtags;
2697     *maxevents = newevents;
2698   }
2699 
2700   // init next event slot and return it
2701   (*numevents)++;
2702   memcpy((*eventlist) + *numevents - 1, zeroevent, sizeof(cl_event));
2703   if(tag != NULL)
2704   {
2705     g_strlcpy((*eventtags)[*numevents - 1].tag, tag, DT_OPENCL_EVENTNAMELENGTH);
2706   }
2707   else
2708   {
2709     (*eventtags)[*numevents - 1].tag[0] = '\0';
2710   }
2711 
2712   (*totalevents)++;
2713   return (*eventlist) + *numevents - 1;
2714 }
2715 
2716 
2717 /** reset eventlist to empty state */
dt_opencl_events_reset(const int devid)2718 void dt_opencl_events_reset(const int devid)
2719 {
2720   dt_opencl_t *cl = darktable.opencl;
2721   if(!cl->inited || devid < 0) return;
2722   if(!cl->use_events) return;
2723 
2724   cl_event **eventlist = &(cl->dev[devid].eventlist);
2725   dt_opencl_eventtag_t **eventtags = &(cl->dev[devid].eventtags);
2726   int *numevents = &(cl->dev[devid].numevents);
2727   int *maxevents = &(cl->dev[devid].maxevents);
2728   int *eventsconsolidated = &(cl->dev[devid].eventsconsolidated);
2729   int *lostevents = &(cl->dev[devid].lostevents);
2730   cl_int *summary = &(cl->dev[devid].summary);
2731 
2732   if(*eventlist == NULL || *numevents == 0) return; // nothing to do
2733 
2734   // release all remaining events in eventlist, not to waste resources
2735   for(int k = *eventsconsolidated; k < *numevents; k++)
2736   {
2737     (cl->dlocl->symbols->dt_clReleaseEvent)((*eventlist)[k]);
2738   }
2739 
2740   memset(*eventtags, 0, sizeof(dt_opencl_eventtag_t) * *maxevents);
2741   *numevents = 0;
2742   *eventsconsolidated = 0;
2743   *lostevents = 0;
2744   *summary = CL_COMPLETE;
2745   return;
2746 }
2747 
2748 
2749 /** Wait for events in eventlist to terminate -> this is a blocking synchronization point!
2750     Does not flush eventlist. Side effect: might adjust numevents. */
dt_opencl_events_wait_for(const int devid)2751 void dt_opencl_events_wait_for(const int devid)
2752 {
2753   dt_opencl_t *cl = darktable.opencl;
2754   if(!cl->inited || devid < 0) return;
2755   if(!cl->use_events) return;
2756 
2757   static const cl_event zeroevent[1]; // implicitly initialized to zero
2758   cl_event **eventlist = &(cl->dev[devid].eventlist);
2759   int *numevents = &(cl->dev[devid].numevents);
2760   int *lostevents = &(cl->dev[devid].lostevents);
2761   int *totallost = &(cl->dev[devid].totallost);
2762   int *eventsconsolidated = &(cl->dev[devid].eventsconsolidated);
2763 
2764   if(*eventlist == NULL || *numevents == 0) return; // nothing to do
2765 
2766   // check if last event slot was actually used and correct numevents if needed
2767   if(!memcmp((*eventlist) + *numevents - 1, zeroevent, sizeof(cl_event)))
2768   {
2769     (*numevents)--;
2770     (*lostevents)++;
2771     (*totallost)++;
2772   }
2773 
2774   if(*numevents == *eventsconsolidated) return; // nothing to do
2775 
2776   assert(*numevents > *eventsconsolidated);
2777 
2778   // now wait for all remaining events to terminate
2779   // Risk: might never return in case of OpenCL blocks or endless loops
2780   // TODO: run clWaitForEvents in separate thread and implement watchdog timer
2781   (cl->dlocl->symbols->dt_clWaitForEvents)(*numevents - *eventsconsolidated,
2782                                            (*eventlist) + *eventsconsolidated);
2783 
2784   return;
2785 }
2786 
2787 
2788 /** Wait for events in eventlist to terminate, check for return status and profiling
2789 info of events.
2790 If "reset" is TRUE report summary info (would be CL_COMPLETE or last error code) and
2791 print profiling info if needed.
2792 If "reset" is FALSE just store info (success value, profiling) from terminated events
2793 and release events for re-use by OpenCL driver. */
dt_opencl_events_flush(const int devid,const int reset)2794 cl_int dt_opencl_events_flush(const int devid, const int reset)
2795 {
2796   dt_opencl_t *cl = darktable.opencl;
2797   if(!cl->inited || devid < 0) return FALSE;
2798   if(!cl->use_events) return FALSE;
2799 
2800   cl_event **eventlist = &(cl->dev[devid].eventlist);
2801   dt_opencl_eventtag_t **eventtags = &(cl->dev[devid].eventtags);
2802   int *numevents = &(cl->dev[devid].numevents);
2803   int *eventsconsolidated = &(cl->dev[devid].eventsconsolidated);
2804   int *lostevents = &(cl->dev[devid].lostevents);
2805   int *totalsuccess = &(cl->dev[devid].totalsuccess);
2806 
2807   cl_int *summary = &(cl->dev[devid].summary);
2808 
2809   if(*eventlist == NULL || *numevents == 0) return CL_COMPLETE; // nothing to do, no news is good news
2810 
2811   // Wait for command queue to terminate (side effect: might adjust *numevents)
2812   dt_opencl_events_wait_for(devid);
2813 
2814   // now check return status and profiling data of all newly terminated events
2815   for(int k = *eventsconsolidated; k < *numevents; k++)
2816   {
2817     cl_int err;
2818     char *tag = (*eventtags)[k].tag;
2819     cl_int *retval = &((*eventtags)[k].retval);
2820 
2821     // get return value of event
2822     err = (cl->dlocl->symbols->dt_clGetEventInfo)((*eventlist)[k], CL_EVENT_COMMAND_EXECUTION_STATUS,
2823                                                   sizeof(cl_int), retval, NULL);
2824     if(err != CL_SUCCESS)
2825     {
2826       dt_print(DT_DEBUG_OPENCL, "[opencl_events_flush] could not get event info for '%s': %d\n",
2827                tag[0] == '\0' ? "<?>" : tag, err);
2828     }
2829     else if(*retval != CL_COMPLETE)
2830     {
2831       dt_print(DT_DEBUG_OPENCL, "[opencl_events_flush] execution of '%s' %s: %d\n",
2832                tag[0] == '\0' ? "<?>" : tag, *retval == CL_COMPLETE ? "was successful" : "failed", *retval);
2833       *summary = *retval;
2834     }
2835     else
2836       (*totalsuccess)++;
2837 
2838     if(darktable.unmuted & DT_DEBUG_PERF)
2839     {
2840       // get profiling info of event (only if darktable was called with '-d perf')
2841       cl_ulong start;
2842       cl_ulong end;
2843       cl_int errs = (cl->dlocl->symbols->dt_clGetEventProfilingInfo)(
2844           (*eventlist)[k], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
2845       cl_int erre = (cl->dlocl->symbols->dt_clGetEventProfilingInfo)((*eventlist)[k], CL_PROFILING_COMMAND_END,
2846                                                                    sizeof(cl_ulong), &end, NULL);
2847       if(errs == CL_SUCCESS && erre == CL_SUCCESS)
2848       {
2849         (*eventtags)[k].timelapsed = end - start;
2850       }
2851       else
2852       {
2853         (*eventtags)[k].timelapsed = 0;
2854         (*lostevents)++;
2855       }
2856     }
2857     else
2858       (*eventtags)[k].timelapsed = 0;
2859 
2860     // finally release event to be re-used by driver
2861     (cl->dlocl->symbols->dt_clReleaseEvent)((*eventlist)[k]);
2862     (*eventsconsolidated)++;
2863   }
2864 
2865   cl_int result = *summary;
2866 
2867   // do we want to get rid of all stored info?
2868   if(reset)
2869   {
2870     // output profiling info if wanted
2871     if(darktable.unmuted & DT_DEBUG_PERF) dt_opencl_events_profiling(devid, 1);
2872 
2873     // reset eventlist structures to empty state
2874     dt_opencl_events_reset(devid);
2875   }
2876 
2877   return result == CL_COMPLETE ? 0 : result;
2878 }
2879 
2880 
2881 /** display OpenCL profiling information. If "aggregated" is TRUE, try to generate summarized info for each
2882  * kernel */
dt_opencl_events_profiling(const int devid,const int aggregated)2883 void dt_opencl_events_profiling(const int devid, const int aggregated)
2884 {
2885   dt_opencl_t *cl = darktable.opencl;
2886   if(!cl->inited || devid < 0) return;
2887   if(!cl->use_events) return;
2888 
2889   cl_event **eventlist = &(cl->dev[devid].eventlist);
2890   dt_opencl_eventtag_t **eventtags = &(cl->dev[devid].eventtags);
2891   int *numevents = &(cl->dev[devid].numevents);
2892   int *eventsconsolidated = &(cl->dev[devid].eventsconsolidated);
2893   int *lostevents = &(cl->dev[devid].lostevents);
2894 
2895   if(*eventlist == NULL || *numevents == 0 || *eventtags == NULL || *eventsconsolidated == 0)
2896     return; // nothing to do
2897 
2898   char **tags = malloc(sizeof(char *) * (*eventsconsolidated + 1));
2899   float *timings = malloc(sizeof(float) * (*eventsconsolidated + 1));
2900   int items = 1;
2901   tags[0] = "";
2902   timings[0] = 0.0f;
2903 
2904   // get profiling info and arrange it
2905   for(int k = 0; k < *eventsconsolidated; k++)
2906   {
2907     // if aggregated is TRUE, try to sum up timings for multiple runs of each kernel
2908     if(aggregated)
2909     {
2910       // linear search: this is not efficient at all but acceptable given the limited
2911       // number of events (ca. 10 - 20)
2912       int tagfound = -1;
2913       for(int i = 0; i < items; i++)
2914       {
2915         if(!strncmp(tags[i], (*eventtags)[k].tag, DT_OPENCL_EVENTNAMELENGTH))
2916         {
2917           tagfound = i;
2918           break;
2919         }
2920       }
2921 
2922       if(tagfound >= 0) // tag was already detected before
2923       {
2924         // sum up timings
2925         timings[tagfound] += (*eventtags)[k].timelapsed * 1e-9;
2926       }
2927       else // tag is new
2928       {
2929         // make new entry
2930         items++;
2931         tags[items - 1] = (*eventtags)[k].tag;
2932         timings[items - 1] = (*eventtags)[k].timelapsed * 1e-9;
2933       }
2934     }
2935 
2936     else // no aggregated info wanted -> arrange event by event
2937     {
2938       items++;
2939       tags[items - 1] = (*eventtags)[k].tag;
2940       timings[items - 1] = (*eventtags)[k].timelapsed * 1e-9;
2941     }
2942   }
2943 
2944   // now display profiling info
2945   dt_print(DT_DEBUG_OPENCL,
2946            "[opencl_profiling] profiling device %d ('%s'):\n", devid,
2947            cl->dev[devid].name);
2948 
2949   float total = 0.0f;
2950   for(int i = 1; i < items; i++)
2951   {
2952     dt_print(DT_DEBUG_OPENCL, "[opencl_profiling] spent %7.4f seconds in %s\n", (double)timings[i],
2953              tags[i][0] == '\0' ? "<?>" : tags[i]);
2954     total += timings[i];
2955   }
2956   // aggregated timing info for items without tag (if any)
2957   if(timings[0] != 0.0f)
2958   {
2959     dt_print(DT_DEBUG_OPENCL, "[opencl_profiling] spent %7.4f seconds (unallocated)\n", (double)timings[0]);
2960     total += timings[0];
2961   }
2962 
2963   dt_print(DT_DEBUG_OPENCL,
2964            "[opencl_profiling] spent %7.4f seconds totally in command queue (with %d event%s missing)\n",
2965            (double)total, *lostevents, *lostevents == 1 ? "" : "s");
2966 
2967   free(timings);
2968   free(tags);
2969 
2970   return;
2971 }
2972 
nextpow2(int n)2973 static int nextpow2(int n)
2974 {
2975   int k = 1;
2976   while (k < n)
2977     k <<= 1;
2978   return k;
2979 }
2980 
2981 // utility function to calculate optimal work group dimensions for a given kernel
2982 // taking device specific restrictions and local memory limitations into account
dt_opencl_local_buffer_opt(const int devid,const int kernel,dt_opencl_local_buffer_t * factors)2983 int dt_opencl_local_buffer_opt(const int devid, const int kernel, dt_opencl_local_buffer_t *factors)
2984 {
2985   dt_opencl_t *cl = darktable.opencl;
2986   if(!cl->inited || devid < 0) return FALSE;
2987 
2988   size_t maxsizes[3] = { 0 };     // the maximum dimensions for a work group
2989   size_t workgroupsize = 0;       // the maximum number of items in a work group
2990   unsigned long localmemsize = 0; // the maximum amount of local memory we can use
2991   size_t kernelworkgroupsize = 0; // the maximum amount of items in work group for this kernel
2992 
2993   int *blocksizex = &factors->sizex;
2994   int *blocksizey = &factors->sizey;
2995 
2996   // initial values must be supplied in sizex and sizey.
2997   // we make sure that these are a power of 2 and lie within reasonable limits.
2998   *blocksizex = CLAMP(nextpow2(*blocksizex), 1, 1 << 16);
2999   *blocksizey = CLAMP(nextpow2(*blocksizey), 1, 1 << 16);
3000 
3001   if(dt_opencl_get_work_group_limits(devid, maxsizes, &workgroupsize, &localmemsize) == CL_SUCCESS
3002      && dt_opencl_get_kernel_work_group_size(devid, kernel, &kernelworkgroupsize) == CL_SUCCESS)
3003   {
3004     while(maxsizes[0] < *blocksizex || maxsizes[1] < *blocksizey
3005        || localmemsize < ((factors->xfactor * (*blocksizex) + factors->xoffset) *
3006                           (factors->yfactor * (*blocksizey) + factors->yoffset)) * factors->cellsize + factors->overhead
3007        || workgroupsize < (size_t)(*blocksizex) * (*blocksizey) || kernelworkgroupsize < (size_t)(*blocksizex) * (*blocksizey))
3008     {
3009       if(*blocksizex == 1 && *blocksizey == 1) return FALSE;
3010 
3011       if(*blocksizex > *blocksizey)
3012         *blocksizex >>= 1;
3013       else
3014         *blocksizey >>= 1;
3015     }
3016   }
3017   else
3018   {
3019     dt_print(DT_DEBUG_OPENCL,
3020          "[opencl_demosaic] can not identify resource limits for device %d\n", devid);
3021     return FALSE;
3022   }
3023 
3024   return TRUE;
3025 }
3026 
3027 
3028 #endif
3029 
3030 // modelines: These editor modelines have been set for all relevant files by tools/update_modelines.sh
3031 // vim: shiftwidth=2 expandtab tabstop=2 cindent
3032 // kate: tab-indents: off; indent-width 2; replace-tabs on; indent-mode cstyle; remove-trailing-spaces modified;
3033