1 /*
2     This file is part of darktable,
3     Copyright (C) 2010-2021 darktable developers.
4 
5     darktable is free software: you can redistribute it and/or modify
6     it under the terms of the GNU General Public License as published by
7     the Free Software Foundation, either version 3 of the License, or
8     (at your option) any later version.
9 
10     darktable is distributed in the hope that it will be useful,
11     but WITHOUT ANY WARRANTY; without even the implied warranty of
12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13     GNU General Public License for more details.
14 
15     You should have received a copy of the GNU General Public License
16     along with darktable.  If not, see <http://www.gnu.org/licenses/>.
17 */
18 
19 #pragma once
20 
21 #ifdef HAVE_CONFIG_H
22 #include "config.h"
23 #endif
24 
/* Compile-time limits and identifiers of the OpenCL subsystem.
 * Only some of them are referenced inside this header; the remaining ones
 * are presumably consumed by the implementation file -- TODO confirm. */
#define DT_OPENCL_MAX_PLATFORMS 5
#define DT_OPENCL_MAX_PROGRAMS 256   // sizes program[] / program_used[] in dt_opencl_device_t
#define DT_OPENCL_MAX_KERNELS 512    // sizes kernel[] / kernel_used[] in dt_opencl_device_t
#define DT_OPENCL_EVENTLISTSIZE 256
#define DT_OPENCL_EVENTNAMELENGTH 64 // sizes tag[] in dt_opencl_eventtag_t
#define DT_OPENCL_MAX_EVENTS 256
#define DT_OPENCL_MAX_ERRORS 5
#define DT_OPENCL_MAX_INCLUDES 7
/* Vendor identifiers. NOTE(review): the values equal the PCI vendor ids
 * 0x1002, 0x10DE and 0x8086; presumably compared against CL_DEVICE_VENDOR_ID.
 * Mind that only the Intel constant is an unsigned literal. */
#define DT_OPENCL_VENDOR_AMD 4098
#define DT_OPENCL_VENDOR_NVIDIA 4318
#define DT_OPENCL_VENDOR_INTEL 0x8086u
36 
37 #include "common/darktable.h"
38 
39 #ifdef HAVE_OPENCL
40 
41 #include "common/dlopencl.h"
42 #include "common/dtpthread.h"
43 #include "common/iop_profile.h"
44 #include "control/conf.h"
45 
46 // #pragma GCC diagnostic push
47 // #pragma GCC diagnostic ignored "-Wcomment"
48 #include <CL/cl.h>
49 // #pragma GCC diagnostic
50 
/* ROUNDUP(a, n): a if it is already a multiple of n, otherwise the next
 * multiple of n computed with integer division.
 * Both arguments are expanded more than once, so do not pass expressions
 * with side effects; n must not be 0. */
#define ROUNDUP(a, n) ((a) % (n) == 0 ? (a) : ((a) / (n)+1) * (n))
/* Width/height variants: both simply forward to dt_opencl_roundup(). */
#define ROUNDUPWD(a) dt_opencl_roundup(a)
#define ROUNDUPHT(a) dt_opencl_roundup(a)
54 
/** action selector passed to dt_opencl_memory_statistics().
 *  NOTE(review): presumably ADD accounts for a new allocation and SUB for a
 *  release -- confirm against the implementation. */
typedef enum dt_opencl_memory_t
{
  OPENCL_MEMORY_ADD,
  OPENCL_MEMORY_SUB
} dt_opencl_memory_t;
60 
/** scheduling profiles; the selected one is stored in
 *  dt_opencl_t::scheduling_profile. */
typedef enum dt_opencl_scheduling_profile_t
{
  OPENCL_PROFILE_DEFAULT,
  OPENCL_PROFILE_MULTIPLE_GPUS,
  OPENCL_PROFILE_VERYFAST_GPU
} dt_opencl_scheduling_profile_t;
67 
/** cache synchronization modes; the selected one is stored in
 *  dt_opencl_t::sync_cache. Note that OPENCL_SYNC_TRUE has the value 0, so
 *  the enum must not be evaluated as a plain boolean. */
typedef enum dt_opencl_sync_cache_t
{
  OPENCL_SYNC_TRUE,
  OPENCL_SYNC_ACTIVE_MODULE,
  OPENCL_SYNC_FALSE
} dt_opencl_sync_cache_t;
74 
75 /**
76  * Accounting information used for OpenCL events.
77  */
78 typedef struct dt_opencl_eventtag_t
79 {
80   cl_int retval;
81   cl_ulong timelapsed;
82   char tag[DT_OPENCL_EVENTNAMELENGTH];
83 } dt_opencl_eventtag_t;
84 
85 
86 /**
87  * to support multi-gpu and mixed systems with cpu support,
88  * we encapsulate devices and use separate command queues.
89  */
90 typedef struct dt_opencl_device_t
91 {
92   dt_pthread_mutex_t lock;
93   cl_device_id devid;
94   cl_context context;
95   cl_command_queue cmd_queue;
96   size_t max_image_width;
97   size_t max_image_height;
98   cl_ulong max_mem_alloc;
99   cl_ulong max_global_mem;
100   cl_ulong used_global_mem;
101   cl_program program[DT_OPENCL_MAX_PROGRAMS];
102   cl_kernel kernel[DT_OPENCL_MAX_KERNELS];
103   int program_used[DT_OPENCL_MAX_PROGRAMS];
104   int kernel_used[DT_OPENCL_MAX_KERNELS];
105   cl_event *eventlist;
106   dt_opencl_eventtag_t *eventtags;
107   int numevents;
108   int eventsconsolidated;
109   int maxevents;
110   int lostevents;
111   int totalevents;
112   int totalsuccess;
113   int totallost;
114   int nvidia_sm_20;
115   const char *vendor;
116   const char *name;
117   const char *cname;
118   const char *options;
119   cl_int summary;
120   float benchmark;
121   size_t memory_in_use;
122   size_t peak_memory;
123 } dt_opencl_device_t;
124 
// forward declarations of the shared kernel structs referenced by pointer
// from dt_opencl_t; their definitions live in other headers.
struct dt_blendop_cl_global_t; // blending operations
struct dt_bilateral_cl_global_t;
struct dt_gaussian_cl_global_t; // gaussian filtering
struct dt_interpolation_cl_global_t; // interpolation resampling
struct dt_local_laplacian_cl_global_t;
struct dt_dwt_cl_global_t; // wavelet decompose
struct dt_heal_cl_global_t; // healing
struct dt_colorspaces_cl_global_t; // colorspaces transform
struct dt_guided_filter_cl_global_t;
131 
132 /**
133  * main struct, stored in darktable.opencl.
134  * holds pointers to all
135  */
136 typedef struct dt_opencl_t
137 {
138   dt_pthread_mutex_t lock;
139   int inited;
140   int avoid_atomics;
141   int use_events;
142   int async_pixelpipe;
143   int number_event_handles;
144   int print_statistics;
145   dt_opencl_sync_cache_t sync_cache;
146   int micro_nap;
147   int enabled;
148   int stopped;
149   int num_devs;
150   int error_count;
151   int opencl_synchronization_timeout;
152   dt_opencl_scheduling_profile_t scheduling_profile;
153   uint32_t crc;
154   int mandatory[5];
155   int *dev_priority_image;
156   int *dev_priority_preview;
157   int *dev_priority_preview2;
158   int *dev_priority_export;
159   int *dev_priority_thumbnail;
160   dt_opencl_device_t *dev;
161   dt_dlopencl_t *dlocl;
162 
163   // global kernels for blending operations.
164   struct dt_blendop_cl_global_t *blendop;
165 
166   // global kernels for bilateral filtering, to be reused by a few plugins.
167   struct dt_bilateral_cl_global_t *bilateral;
168 
169   // global kernels for gaussian filtering, to be reused by a few plugins.
170   struct dt_gaussian_cl_global_t *gaussian;
171 
172   // global kernels for interpolation resampling.
173   struct dt_interpolation_cl_global_t *interpolation;
174 
175   // global kernels for local laplacian filter.
176   struct dt_local_laplacian_cl_global_t *local_laplacian;
177 
178   // global kernels for dwt filter.
179   struct dt_dwt_cl_global_t *dwt;
180 
181   // global kernels for heal filter.
182   struct dt_heal_cl_global_t *heal;
183 
184   // global kernels for colorspaces filter.
185   struct dt_colorspaces_cl_global_t *colorspaces;
186 
187   // global kernels for guided filter.
188   struct dt_guided_filter_cl_global_t *guided_filter;
189 } dt_opencl_t;
190 
/** description of memory requirements of local buffer
  * local buffer size will be calculated as:
  * (xoffset + xfactor * x) * (yoffset + yfactor * y) * cellsize + overhead;
  *
  * The six const members describe the requirement and therefore have to be
  * given at initialization time; only sizex/sizey are writable afterwards. */
typedef struct dt_opencl_local_buffer_t
{
  const int xoffset;
  const int xfactor;
  const int yoffset;
  const int yfactor;
  const size_t cellsize;
  const size_t overhead;
  int sizex;  // initial value and final values after optimization
  int sizey;  // initial value and final values after optimization
} dt_opencl_local_buffer_t;
205 
206 /** internally calls dt_clGetDeviceInfo, and takes care of memory allocation
207  * afterwards, *param_value will point to memory block of size at least *param_value
208  * which needs to be free()'d manually */
209 int dt_opencl_get_device_info(dt_opencl_t *cl, cl_device_id device, cl_device_info param_name, void **param_value,
210                               size_t *param_value_size);
211 
212 /** inits the opencl subsystem. */
213 void dt_opencl_init(dt_opencl_t *cl, const gboolean exclude_opencl, const gboolean print_statistics);
214 
215 /** cleans up the opencl subsystem. */
216 void dt_opencl_cleanup(dt_opencl_t *cl);
217 
218 /** cleans up command queue. */
219 int dt_opencl_finish(const int devid);
220 
221 /** enqueues a synchronization point. */
222 int dt_opencl_enqueue_barrier(const int devid);
223 
224 /** locks a device for your thread's exclusive use */
225 int dt_opencl_lock_device(const int pipetype);
226 
227 /** done with your command queue. */
228 void dt_opencl_unlock_device(const int dev);
229 
230 /** calculates md5sums for a list of CL include files. */
231 void dt_opencl_md5sum(const char **files, char **md5sums);
232 
233 /** loads the given .cl file and returns a reference to an internal program. */
234 int dt_opencl_load_program(const int dev, const int prog, const char *filename, const char *binname,
235                            const char *cachedir, char *md5sum, char **includemd5, int *loaded_cached);
236 
237 /** builds the given program. */
238 int dt_opencl_build_program(const int dev, const int prog, const char *binname, const char *cachedir,
239                             char *md5sum, int loaded_cached);
240 
/** inits a kernel. returns the index or -1 if fail. */
int dt_opencl_create_kernel(const int program, const char *name);

/** releases kernel resources again. */
void dt_opencl_free_kernel(const int kernel);

/** return max size in sizes[3]. */
int dt_opencl_get_max_work_item_sizes(const int dev, size_t *sizes);

/** return max size per dimension in sizes[3] and max total size in workgroupsize */
int dt_opencl_get_work_group_limits(const int dev, size_t *sizes, size_t *workgroupsize,
                                    unsigned long *localmemsize);

/** return max workgroup size for a specific kernel */
int dt_opencl_get_kernel_work_group_size(const int dev, const int kernel, size_t *kernelworkgroupsize);

/** attach arg. */
int dt_opencl_set_kernel_arg(const int dev, const int kernel, const int num, const size_t size,
                             const void *arg);

/** launch kernel! */
int dt_opencl_enqueue_kernel_2d(const int dev, const int kernel, const size_t *sizes);

/** launch kernel with defined local size! */
int dt_opencl_enqueue_kernel_2d_with_local(const int dev, const int kernel, const size_t *sizes,
                                           const size_t *local);

/** check if opencl is inited */
int dt_opencl_is_inited(void);

/** check if opencl is enabled */
int dt_opencl_is_enabled(void);

/** disable opencl */
void dt_opencl_disable(void);

/** update enabled flag and profile with value from preferences, returns enabled flag */
int dt_opencl_update_settings(void);
279 
280 /** HAVE_OPENCL mode only: copy and alloc buffers. */
281 int dt_opencl_copy_device_to_host(const int devid, void *host, void *device, const int width,
282                                   const int height, const int bpp);
283 
284 int dt_opencl_read_host_from_device(const int devid, void *host, void *device, const int width,
285                                     const int height, const int bpp);
286 
287 int dt_opencl_read_host_from_device_rowpitch(const int devid, void *host, void *device, const int width,
288                                              const int height, const int rowpitch);
289 
290 int dt_opencl_read_host_from_device_non_blocking(const int devid, void *host, void *device, const int width,
291                                                  const int height, const int bpp);
292 
293 int dt_opencl_read_host_from_device_rowpitch_non_blocking(const int devid, void *host, void *device,
294                                                           const int width, const int height,
295                                                           const int rowpitch);
296 
297 int dt_opencl_read_host_from_device_raw(const int devid, void *host, void *device, const size_t *origin,
298                                         const size_t *region, const int rowpitch, const int blocking);
299 
300 int dt_opencl_write_host_to_device(const int devid, void *host, void *device, const int width,
301                                    const int height, const int bpp);
302 
303 int dt_opencl_write_host_to_device_rowpitch(const int devid, void *host, void *device, const int width,
304                                             const int height, const int rowpitch);
305 
306 int dt_opencl_write_host_to_device_non_blocking(const int devid, void *host, void *device, const int width,
307                                                 const int height, const int bpp);
308 
309 int dt_opencl_write_host_to_device_rowpitch_non_blocking(const int devid, void *host, void *device,
310                                                          const int width, const int height,
311                                                          const int rowpitch);
312 
313 int dt_opencl_write_host_to_device_raw(const int devid, void *host, void *device, const size_t *origin,
314                                        const size_t *region, const int rowpitch, const int blocking);
315 
316 void *dt_opencl_copy_host_to_device(const int devid, void *host, const int width, const int height,
317                                     const int bpp);
318 
319 void *dt_opencl_copy_host_to_device_rowpitch(const int devid, void *host, const int width, const int height,
320                                              const int bpp, const int rowpitch);
321 
322 void *dt_opencl_copy_host_to_device_constant(const int devid, const size_t size, void *host);
323 
324 int dt_opencl_enqueue_copy_image(const int devid, cl_mem src, cl_mem dst, size_t *orig_src, size_t *orig_dst,
325                                  size_t *region);
326 
327 void *dt_opencl_alloc_device(const int devid, const int width, const int height, const int bpp);
328 
329 void *dt_opencl_alloc_device_use_host_pointer(const int devid, const int width, const int height,
330                                               const int bpp, const int rowpitch, void *host);
331 
332 int dt_opencl_enqueue_copy_image_to_buffer(const int devid, cl_mem src_image, cl_mem dst_buffer,
333                                            size_t *origin, size_t *region, size_t offset);
334 
335 int dt_opencl_enqueue_copy_buffer_to_image(const int devid, cl_mem src_buffer, cl_mem dst_image,
336                                            size_t offset, size_t *origin, size_t *region);
337 
338 int dt_opencl_enqueue_copy_buffer_to_buffer(const int devid, cl_mem src_buffer, cl_mem dst_buffer,
339                                             size_t srcoffset, size_t dstoffset, size_t size);
340 
341 int dt_opencl_read_buffer_from_device(const int devid, void *host, void *device, const size_t offset,
342                                       const size_t size, const int blocking);
343 
344 int dt_opencl_write_buffer_to_device(const int devid, void *host, void *device, const size_t offset,
345                                      const size_t size, const int blocking);
346 
347 void *dt_opencl_alloc_device_buffer(const int devid, const size_t size);
348 
349 void *dt_opencl_alloc_device_buffer_with_flags(const int devid, const size_t size, const int flags);
350 
351 void dt_opencl_release_mem_object(cl_mem mem);
352 
353 void *dt_opencl_map_buffer(const int devid, cl_mem buffer, const int blocking, const int flags, size_t offset,
354                            size_t size);
355 
356 int dt_opencl_unmap_mem_object(const int devid, cl_mem mem_object, void *mapped_ptr);
357 
358 size_t dt_opencl_get_mem_object_size(cl_mem mem);
359 
360 int dt_opencl_get_image_width(cl_mem mem);
361 
362 int dt_opencl_get_image_height(cl_mem mem);
363 
364 int dt_opencl_get_image_element_size(cl_mem mem);
365 
366 int dt_opencl_get_mem_context_id(cl_mem mem);
367 
368 void dt_opencl_memory_statistics(int devid, cl_mem mem, dt_opencl_memory_t action);
369 
370 /** check if image size fit into limits given by OpenCL runtime */
371 int dt_opencl_image_fits_device(const int devid, const size_t width, const size_t height, const unsigned bpp,
372                                 const float factor, const size_t overhead);
373 
374 /** round size to a multiple of the value given in config parameter opencl_size_roundup */
375 int dt_opencl_roundup(int size);
376 
377 /** get global memory of device */
378 cl_ulong dt_opencl_get_max_global_mem(const int devid);
379 
380 /** get next free slot in eventlist and manage size of eventlist */
381 cl_event *dt_opencl_events_get_slot(const int devid, const char *tag);
382 
383 /** reset eventlist to empty state */
384 void dt_opencl_events_reset(const int devid);
385 
386 /** Wait for events in eventlist to terminate -> this is a blocking synchronization point
387     Does not flush eventlist */
388 void dt_opencl_events_wait_for(const int devid);
389 
390 /** Wait for events in eventlist to terminate, check for return status of events and
391     report summary success info (CL_COMPLETE or last error code) */
392 cl_int dt_opencl_events_flush(const int devid, const int reset);
393 
394 /** display OpenCL profiling information. If summary is not 0, try to generate summarized info for kernels */
395 void dt_opencl_events_profiling(const int devid, const int aggregated);
396 
397 /** utility function to calculate optimal work group dimensions for a given kernel */
398 int dt_opencl_local_buffer_opt(const int devid, const int kernel, dt_opencl_local_buffer_t *factors);
399 
400 #else
401 #include "control/conf.h"
402 #include <stdlib.h>
/** reduced main struct for builds without OpenCL support (HAVE_OPENCL not
 *  defined): only the four state fields that dt_opencl_init() resets. */
typedef struct dt_opencl_t
{
  int inited;
  int enabled;
  int stopped;
  int error_count;
} dt_opencl_t;
/** init replacement for builds without OpenCL support.
 *  Resets all four fields of *cl to 0, stores FALSE in the "opencl" config
 *  key and emits a DT_DEBUG_OPENCL message. exclude_opencl and
 *  print_statistics are accepted for signature compatibility and ignored.
 *  cl must not be NULL. */
static inline void dt_opencl_init(dt_opencl_t *cl, const gboolean exclude_opencl, const gboolean print_statistics)
{
  cl->inited = 0;
  cl->enabled = 0;
  cl->stopped = 0;
  cl->error_count = 0;
  dt_conf_set_bool("opencl", FALSE);
  dt_print(DT_DEBUG_OPENCL, "[opencl_init] this version of darktable was built without opencl support\n");
}
/** cleanup replacement for builds without OpenCL support: does nothing,
 *  cl is not accessed. */
static inline void dt_opencl_cleanup(dt_opencl_t *cl)
{
}
/** stub for builds without OpenCL support: ignores devid and always
 *  returns -1. */
static inline int dt_opencl_finish(const int devid)
{
  return -1;
}
/** stub for builds without OpenCL support: ignores devid and always
 *  returns -1. */
static inline int dt_opencl_enqueue_barrier(const int devid)
{
  return -1;
}
/** stub for builds without OpenCL support: never hands out a device,
 *  always returns -1.
 *  NOTE(review): the OpenCL build names this parameter pipetype. */
static inline int dt_opencl_lock_device(const int dev)
{
  return -1;
}
/** stub for builds without OpenCL support: does nothing. */
static inline void dt_opencl_unlock_device(const int dev)
{
}
/** stub for builds without OpenCL support: nothing is loaded, always
 *  returns -1.
 *  NOTE(review): takes only (dev, filename) while the prototype used in
 *  OpenCL builds has eight parameters, so the two are not call-compatible. */
static inline int dt_opencl_load_program(const int dev, const char *filename)
{
  return -1;
}
/** stub for builds without OpenCL support: nothing is built, always
 *  returns -1.
 *  NOTE(review): takes only (dev, program) while the prototype used in
 *  OpenCL builds has six parameters, so the two are not call-compatible. */
static inline int dt_opencl_build_program(const int dev, const int program)
{
  return -1;
}
/** stub for builds without OpenCL support: no kernel is created, always
 *  returns -1 (the value the OpenCL build documents as failure). */
static inline int dt_opencl_create_kernel(const int program, const char *name)
{
  return -1;
}
/** stub for builds without OpenCL support: does nothing. */
static inline void dt_opencl_free_kernel(const int kernel)
{
}
/** stub for builds without OpenCL support: always returns -1 and leaves
 *  sizes untouched. */
static inline int dt_opencl_get_max_work_item_sizes(const int dev, size_t *sizes)
{
  return -1;
}
/** stub for builds without OpenCL support: always returns -1 and writes
 *  none of the output parameters. */
static inline int dt_opencl_get_work_group_limits(const int dev, size_t *sizes, size_t *workgroupsize,
                                                  unsigned long *localmemsize)
{
  return -1;
}
/** stub for builds without OpenCL support: always returns -1 and leaves
 *  kernelworkgroupsize untouched. */
static inline int dt_opencl_get_kernel_work_group_size(const int dev, const int kernel,
                                                       size_t *kernelworkgroupsize)
{
  return -1;
}
/** stub for builds without OpenCL support: no argument is attached,
 *  always returns -1.
 *  NOTE(review): lacks the num parameter that the prototype used in OpenCL
 *  builds has, so the two are not call-compatible. */
static inline int dt_opencl_set_kernel_arg(const int dev, const int kernel, const size_t size, const void *arg)
{
  return -1;
}
/** stub for builds without OpenCL support: nothing is launched, always
 *  returns -1. */
static inline int dt_opencl_enqueue_kernel_2d(const int dev, const int kernel, const size_t *sizes)
{
  return -1;
}
/** stub for builds without OpenCL support: nothing is launched, always
 *  returns -1. */
static inline int dt_opencl_enqueue_kernel_2d_with_local(const int dev, const int kernel, const size_t *sizes,
                                                         const size_t *local)
{
  return -1;
}
/** stub for builds without OpenCL support: always returns 0. */
static inline int dt_opencl_is_inited(void)
{
  return 0;
}
/** stub for builds without OpenCL support: always returns 0. */
static inline int dt_opencl_is_enabled(void)
{
  return 0;
}
/** stub for builds without OpenCL support: does nothing. */
static inline void dt_opencl_disable(void)
{
}
/** stub for builds without OpenCL support: reads no preferences and
 *  always returns 0. */
static inline int dt_opencl_update_settings(void)
{
  return 0;
}
/** stub for builds without OpenCL support: always returns 0, whatever
 *  image dimensions are passed. */
static inline int dt_opencl_image_fits_device(const int devid, const size_t width, const size_t height,
                                              const unsigned bpp, const float factor, const size_t overhead)
{
  return 0;
}
/** stub for builds without OpenCL support: always returns 0.
 *  NOTE(review): returns int here, whereas the prototype used in OpenCL
 *  builds returns cl_ulong. */
static inline int dt_opencl_get_max_global_mem(const int devid)
{
  return 0;
}
/** stub for builds without OpenCL support: does nothing; mem is neither
 *  dereferenced nor freed. Takes void * because cl_mem is unavailable in
 *  this branch. */
static inline void dt_opencl_release_mem_object(void *mem)
{
}
/** stub for builds without OpenCL support: there is no event list, always
 *  returns NULL. Returns void * because cl_event is unavailable in this
 *  branch. */
static inline void *dt_opencl_events_get_slot(const int devid, const char *tag)
{
  return NULL;
}
/** stub for builds without OpenCL support: does nothing. */
static inline void dt_opencl_events_reset(const int devid)
{
}
/** stub for builds without OpenCL support: returns immediately without
 *  waiting for anything. */
static inline void dt_opencl_events_wait_for(const int devid)
{
}
/** stub for builds without OpenCL support: always returns 0.
 *  NOTE(review): returns int here, whereas the prototype used in OpenCL
 *  builds returns cl_int. */
static inline int dt_opencl_events_flush(const int devid, const int reset)
{
  return 0;
}
/** stub for builds without OpenCL support: prints nothing, does nothing. */
static inline void dt_opencl_events_profiling(const int devid, const int aggregated)
{
}
523 #endif
524 
525 // modelines: These editor modelines have been set for all relevant files by tools/update_modelines.sh
526 // vim: shiftwidth=2 expandtab tabstop=2 cindent
527 // kate: tab-indents: off; indent-width 2; replace-tabs on; indent-mode cstyle; remove-trailing-spaces modified;
528