/*
    This file is part of darktable,
    Copyright (C) 2010-2021 darktable developers.

    darktable is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    darktable is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with darktable. If not, see <http://www.gnu.org/licenses/>.
*/
18
19 #pragma once
20
21 #ifdef HAVE_CONFIG_H
22 #include "config.h"
23 #endif
24
/* Compile-time limits of the OpenCL subsystem.
 * DT_OPENCL_MAX_PROGRAMS and DT_OPENCL_MAX_KERNELS size the per-device program/kernel
 * tables, DT_OPENCL_EVENTNAMELENGTH sizes the tag buffer of an event tag. */
#define DT_OPENCL_MAX_PLATFORMS 5
#define DT_OPENCL_MAX_PROGRAMS 256
#define DT_OPENCL_MAX_KERNELS 512
#define DT_OPENCL_EVENTLISTSIZE 256
#define DT_OPENCL_EVENTNAMELENGTH 64
#define DT_OPENCL_MAX_EVENTS 256
#define DT_OPENCL_MAX_ERRORS 5
#define DT_OPENCL_MAX_INCLUDES 7

/* Numeric vendor identifiers.
 * NOTE(review): presumably compared against the vendor id reported by the OpenCL
 * runtime for a device -- confirm against the implementation file. */
#define DT_OPENCL_VENDOR_AMD 4098    // == 0x1002
#define DT_OPENCL_VENDOR_NVIDIA 4318 // == 0x10de
#define DT_OPENCL_VENDOR_INTEL 0x8086u
36
37 #include "common/darktable.h"
38
39 #ifdef HAVE_OPENCL
40
41 #include "common/dlopencl.h"
42 #include "common/dtpthread.h"
43 #include "common/iop_profile.h"
44 #include "control/conf.h"
45
46 // #pragma GCC diagnostic push
47 // #pragma GCC diagnostic ignored "-Wcomment"
48 #include <CL/cl.h>
49 // #pragma GCC diagnostic
50
/* Round a up to the next multiple of n; a is returned unchanged if it already is a multiple.
 * Both arguments are evaluated more than once, so pass side-effect free expressions only.
 * n must be non-zero; the result is only meaningful for non-negative a. */
#define ROUNDUP(a, n) ((a) % (n) == 0 ? (a) : ((a) / (n) + 1) * (n))
/* Width / height rounding, both delegating to dt_opencl_roundup(). */
#define ROUNDUPWD(a) dt_opencl_roundup(a)
#define ROUNDUPHT(a) dt_opencl_roundup(a)
54
/** Action selector passed to dt_opencl_memory_statistics(). */
typedef enum dt_opencl_memory_t
{
  OPENCL_MEMORY_ADD, // NOTE(review): presumably "account for a newly allocated object" -- confirm
  OPENCL_MEMORY_SUB  // NOTE(review): presumably "account for a released object" -- confirm
} dt_opencl_memory_t;
60
/** Scheduling profile stored in dt_opencl_t.scheduling_profile. */
typedef enum dt_opencl_scheduling_profile_t
{
  OPENCL_PROFILE_DEFAULT,
  OPENCL_PROFILE_MULTIPLE_GPUS,
  OPENCL_PROFILE_VERYFAST_GPU
} dt_opencl_scheduling_profile_t;
67
/** Cache synchronisation mode stored in dt_opencl_t.sync_cache. */
typedef enum dt_opencl_sync_cache_t
{
  OPENCL_SYNC_TRUE,
  OPENCL_SYNC_ACTIVE_MODULE,
  OPENCL_SYNC_FALSE
} dt_opencl_sync_cache_t;
74
75 /**
76 * Accounting information used for OpenCL events.
77 */
78 typedef struct dt_opencl_eventtag_t
79 {
80 cl_int retval;
81 cl_ulong timelapsed;
82 char tag[DT_OPENCL_EVENTNAMELENGTH];
83 } dt_opencl_eventtag_t;
84
85
86 /**
87 * to support multi-gpu and mixed systems with cpu support,
88 * we encapsulate devices and use separate command queues.
89 */
90 typedef struct dt_opencl_device_t
91 {
92 dt_pthread_mutex_t lock;
93 cl_device_id devid;
94 cl_context context;
95 cl_command_queue cmd_queue;
96 size_t max_image_width;
97 size_t max_image_height;
98 cl_ulong max_mem_alloc;
99 cl_ulong max_global_mem;
100 cl_ulong used_global_mem;
101 cl_program program[DT_OPENCL_MAX_PROGRAMS];
102 cl_kernel kernel[DT_OPENCL_MAX_KERNELS];
103 int program_used[DT_OPENCL_MAX_PROGRAMS];
104 int kernel_used[DT_OPENCL_MAX_KERNELS];
105 cl_event *eventlist;
106 dt_opencl_eventtag_t *eventtags;
107 int numevents;
108 int eventsconsolidated;
109 int maxevents;
110 int lostevents;
111 int totalevents;
112 int totalsuccess;
113 int totallost;
114 int nvidia_sm_20;
115 const char *vendor;
116 const char *name;
117 const char *cname;
118 const char *options;
119 cl_int summary;
120 float benchmark;
121 size_t memory_in_use;
122 size_t peak_memory;
123 } dt_opencl_device_t;
124
/* Forward declarations of the shared kernel collections that dt_opencl_t points to.
 * Only pointers to these types are stored in this header, so incomplete types suffice. */
struct dt_blendop_cl_global_t;         // blending operations
struct dt_bilateral_cl_global_t;       // bilateral filtering
struct dt_gaussian_cl_global_t;        // gaussian filtering
struct dt_interpolation_cl_global_t;   // interpolation resampling
struct dt_local_laplacian_cl_global_t; // local laplacian filter
struct dt_dwt_cl_global_t;             // wavelet decompose
struct dt_heal_cl_global_t;            // healing
struct dt_colorspaces_cl_global_t;     // colorspaces transform
struct dt_guided_filter_cl_global_t;   // guided filter
131
132 /**
133 * main struct, stored in darktable.opencl.
134 * holds pointers to all
135 */
136 typedef struct dt_opencl_t
137 {
138 dt_pthread_mutex_t lock;
139 int inited;
140 int avoid_atomics;
141 int use_events;
142 int async_pixelpipe;
143 int number_event_handles;
144 int print_statistics;
145 dt_opencl_sync_cache_t sync_cache;
146 int micro_nap;
147 int enabled;
148 int stopped;
149 int num_devs;
150 int error_count;
151 int opencl_synchronization_timeout;
152 dt_opencl_scheduling_profile_t scheduling_profile;
153 uint32_t crc;
154 int mandatory[5];
155 int *dev_priority_image;
156 int *dev_priority_preview;
157 int *dev_priority_preview2;
158 int *dev_priority_export;
159 int *dev_priority_thumbnail;
160 dt_opencl_device_t *dev;
161 dt_dlopencl_t *dlocl;
162
163 // global kernels for blending operations.
164 struct dt_blendop_cl_global_t *blendop;
165
166 // global kernels for bilateral filtering, to be reused by a few plugins.
167 struct dt_bilateral_cl_global_t *bilateral;
168
169 // global kernels for gaussian filtering, to be reused by a few plugins.
170 struct dt_gaussian_cl_global_t *gaussian;
171
172 // global kernels for interpolation resampling.
173 struct dt_interpolation_cl_global_t *interpolation;
174
175 // global kernels for local laplacian filter.
176 struct dt_local_laplacian_cl_global_t *local_laplacian;
177
178 // global kernels for dwt filter.
179 struct dt_dwt_cl_global_t *dwt;
180
181 // global kernels for heal filter.
182 struct dt_heal_cl_global_t *heal;
183
184 // global kernels for colorspaces filter.
185 struct dt_colorspaces_cl_global_t *colorspaces;
186
187 // global kernels for guided filter.
188 struct dt_guided_filter_cl_global_t *guided_filter;
189 } dt_opencl_t;
190
/** description of memory requirements of local buffer
 * local buffer size will be calculated as:
 * (xoffset + xfactor * x) * (yoffset + yfactor * y) * cellsize + overhead;
 * The first six members are const and therefore have to be set when the object
 * is initialised; only sizex / sizey can be modified afterwards. */
typedef struct dt_opencl_local_buffer_t
{
  const int xoffset;
  const int xfactor;
  const int yoffset;
  const int yfactor;
  const size_t cellsize;
  const size_t overhead;
  int sizex; // initial value and final values after optimization
  int sizey; // initial value and final values after optimization
} dt_opencl_local_buffer_t;
205
206 /** internally calls dt_clGetDeviceInfo, and takes care of memory allocation
207 * afterwards, *param_value will point to memory block of size at least *param_value
208 * which needs to be free()'d manually */
209 int dt_opencl_get_device_info(dt_opencl_t *cl, cl_device_id device, cl_device_info param_name, void **param_value,
210 size_t *param_value_size);
211
212 /** inits the opencl subsystem. */
213 void dt_opencl_init(dt_opencl_t *cl, const gboolean exclude_opencl, const gboolean print_statistics);
214
215 /** cleans up the opencl subsystem. */
216 void dt_opencl_cleanup(dt_opencl_t *cl);
217
/** cleans up command queue. */
int dt_opencl_finish(const int devid);

/** enqueues a synchronization point. */
int dt_opencl_enqueue_barrier(const int devid);

/** locks a device for your thread's exclusive use.
 * NOTE(review): takes a pipe type, not a device number; presumably the return value is the
 * device to hand to dt_opencl_unlock_device() -- confirm. */
int dt_opencl_lock_device(const int pipetype);

/** done with your command queue. */
void dt_opencl_unlock_device(const int dev);

/** calculates md5sums for a list of CL include files. */
void dt_opencl_md5sum(const char **files, char **md5sums);
232
/** loads the given .cl file and returns a reference to an internal program. */
int dt_opencl_load_program(const int dev, const int prog, const char *filename, const char *binname,
                           const char *cachedir, char *md5sum, char **includemd5, int *loaded_cached);

/** builds the given program. */
int dt_opencl_build_program(const int dev, const int prog, const char *binname, const char *cachedir,
                            char *md5sum, int loaded_cached);

/** inits a kernel. returns the index or -1 if fail. */
int dt_opencl_create_kernel(const int program, const char *name);

/** releases kernel resources again. */
void dt_opencl_free_kernel(const int kernel);

/** return max size in sizes[3]. */
int dt_opencl_get_max_work_item_sizes(const int dev, size_t *sizes);

/** return max size per dimension in sizes[3] and max total size in workgroupsize */
int dt_opencl_get_work_group_limits(const int dev, size_t *sizes, size_t *workgroupsize,
                                    unsigned long *localmemsize);

/** return max workgroup size for a specific kernel */
int dt_opencl_get_kernel_work_group_size(const int dev, const int kernel, size_t *kernelworkgroupsize);

/** attach arg number num (size bytes read from arg) to the kernel. */
int dt_opencl_set_kernel_arg(const int dev, const int kernel, const int num, const size_t size,
                             const void *arg);

/** launch kernel! */
int dt_opencl_enqueue_kernel_2d(const int dev, const int kernel, const size_t *sizes);

/** launch kernel with defined local size! */
int dt_opencl_enqueue_kernel_2d_with_local(const int dev, const int kernel, const size_t *sizes,
                                           const size_t *local);
267
/** check if opencl is inited */
int dt_opencl_is_inited(void);

/** check if opencl is enabled */
int dt_opencl_is_enabled(void);

/** disable opencl */
void dt_opencl_disable(void);

/** update enabled flag and profile with value from preferences, returns enabled flag */
int dt_opencl_update_settings(void);
279
/** HAVE_OPENCL mode only: copy and alloc buffers. */
int dt_opencl_copy_device_to_host(const int devid, void *host, void *device, const int width,
                                  const int height, const int bpp);

/* device -> host transfers. The variants differ in how the row layout is given (bpp vs.
 * rowpitch), in an optional _non_blocking suffix, and in a _raw form that takes
 * origin / region and an explicit blocking flag. */
int dt_opencl_read_host_from_device(const int devid, void *host, void *device, const int width,
                                    const int height, const int bpp);

int dt_opencl_read_host_from_device_rowpitch(const int devid, void *host, void *device, const int width,
                                             const int height, const int rowpitch);

int dt_opencl_read_host_from_device_non_blocking(const int devid, void *host, void *device, const int width,
                                                 const int height, const int bpp);

int dt_opencl_read_host_from_device_rowpitch_non_blocking(const int devid, void *host, void *device,
                                                          const int width, const int height,
                                                          const int rowpitch);

int dt_opencl_read_host_from_device_raw(const int devid, void *host, void *device, const size_t *origin,
                                        const size_t *region, const int rowpitch, const int blocking);

/* host -> device transfers, same set of variants as for the read direction. */
int dt_opencl_write_host_to_device(const int devid, void *host, void *device, const int width,
                                   const int height, const int bpp);

int dt_opencl_write_host_to_device_rowpitch(const int devid, void *host, void *device, const int width,
                                            const int height, const int rowpitch);

int dt_opencl_write_host_to_device_non_blocking(const int devid, void *host, void *device, const int width,
                                                const int height, const int bpp);

int dt_opencl_write_host_to_device_rowpitch_non_blocking(const int devid, void *host, void *device,
                                                         const int width, const int height,
                                                         const int rowpitch);

int dt_opencl_write_host_to_device_raw(const int devid, void *host, void *device, const size_t *origin,
                                       const size_t *region, const int rowpitch, const int blocking);

/* The functions below return a pointer instead of a status code.
 * NOTE(review): presumably a handle to a newly created device object that the caller has
 * to release -- confirm against the implementation. */
void *dt_opencl_copy_host_to_device(const int devid, void *host, const int width, const int height,
                                    const int bpp);

void *dt_opencl_copy_host_to_device_rowpitch(const int devid, void *host, const int width, const int height,
                                             const int bpp, const int rowpitch);

void *dt_opencl_copy_host_to_device_constant(const int devid, const size_t size, void *host);
323
324 int dt_opencl_enqueue_copy_image(const int devid, cl_mem src, cl_mem dst, size_t *orig_src, size_t *orig_dst,
325 size_t *region);
326
327 void *dt_opencl_alloc_device(const int devid, const int width, const int height, const int bpp);
328
329 void *dt_opencl_alloc_device_use_host_pointer(const int devid, const int width, const int height,
330 const int bpp, const int rowpitch, void *host);
331
332 int dt_opencl_enqueue_copy_image_to_buffer(const int devid, cl_mem src_image, cl_mem dst_buffer,
333 size_t *origin, size_t *region, size_t offset);
334
335 int dt_opencl_enqueue_copy_buffer_to_image(const int devid, cl_mem src_buffer, cl_mem dst_image,
336 size_t offset, size_t *origin, size_t *region);
337
338 int dt_opencl_enqueue_copy_buffer_to_buffer(const int devid, cl_mem src_buffer, cl_mem dst_buffer,
339 size_t srcoffset, size_t dstoffset, size_t size);
340
341 int dt_opencl_read_buffer_from_device(const int devid, void *host, void *device, const size_t offset,
342 const size_t size, const int blocking);
343
344 int dt_opencl_write_buffer_to_device(const int devid, void *host, void *device, const size_t offset,
345 const size_t size, const int blocking);
346
347 void *dt_opencl_alloc_device_buffer(const int devid, const size_t size);
348
349 void *dt_opencl_alloc_device_buffer_with_flags(const int devid, const size_t size, const int flags);
350
351 void dt_opencl_release_mem_object(cl_mem mem);
352
353 void *dt_opencl_map_buffer(const int devid, cl_mem buffer, const int blocking, const int flags, size_t offset,
354 size_t size);
355
356 int dt_opencl_unmap_mem_object(const int devid, cl_mem mem_object, void *mapped_ptr);
357
358 size_t dt_opencl_get_mem_object_size(cl_mem mem);
359
360 int dt_opencl_get_image_width(cl_mem mem);
361
362 int dt_opencl_get_image_height(cl_mem mem);
363
364 int dt_opencl_get_image_element_size(cl_mem mem);
365
366 int dt_opencl_get_mem_context_id(cl_mem mem);
367
368 void dt_opencl_memory_statistics(int devid, cl_mem mem, dt_opencl_memory_t action);
369
370 /** check if image size fit into limits given by OpenCL runtime */
371 int dt_opencl_image_fits_device(const int devid, const size_t width, const size_t height, const unsigned bpp,
372 const float factor, const size_t overhead);
373
374 /** round size to a multiple of the value given in config parameter opencl_size_roundup */
375 int dt_opencl_roundup(int size);
376
377 /** get global memory of device */
378 cl_ulong dt_opencl_get_max_global_mem(const int devid);
379
380 /** get next free slot in eventlist and manage size of eventlist */
381 cl_event *dt_opencl_events_get_slot(const int devid, const char *tag);
382
383 /** reset eventlist to empty state */
384 void dt_opencl_events_reset(const int devid);
385
386 /** Wait for events in eventlist to terminate -> this is a blocking synchronization point
387 Does not flush eventlist */
388 void dt_opencl_events_wait_for(const int devid);
389
390 /** Wait for events in eventlist to terminate, check for return status of events and
391 report summary success info (CL_COMPLETE or last error code) */
392 cl_int dt_opencl_events_flush(const int devid, const int reset);
393
394 /** display OpenCL profiling information. If summary is not 0, try to generate summarized info for kernels */
395 void dt_opencl_events_profiling(const int devid, const int aggregated);
396
397 /** utility function to calculate optimal work group dimensions for a given kernel */
398 int dt_opencl_local_buffer_opt(const int devid, const int kernel, dt_opencl_local_buffer_t *factors);
399
400 #else
401 #include "control/conf.h"
402 #include <stdlib.h>
/** Reduced variant of the main struct for builds without OpenCL support:
 * only the state flags that dt_opencl_init() resets in this configuration. */
typedef struct dt_opencl_t
{
  int inited;
  int enabled;
  int stopped;
  int error_count;
} dt_opencl_t;
/** No-OpenCL build: resets all state flags of cl to 0, sets the "opencl" configuration
 * key to FALSE and prints a debug message stating that OpenCL support is not built in.
 * exclude_opencl and print_statistics are not used. */
static inline void dt_opencl_init(dt_opencl_t *cl, const gboolean exclude_opencl, const gboolean print_statistics)
{
  cl->inited = 0;
  cl->enabled = 0;
  cl->stopped = 0;
  cl->error_count = 0;
  dt_conf_set_bool("opencl", FALSE);
  dt_print(DT_DEBUG_OPENCL, "[opencl_init] this version of darktable was built without opencl support\n");
}
/** No-OpenCL build: nothing to clean up, cl is left untouched. */
static inline void dt_opencl_cleanup(dt_opencl_t *cl)
{
}
/** No-OpenCL build: always returns -1. */
static inline int dt_opencl_finish(const int devid)
{
  return -1;
}
/** No-OpenCL build: always returns -1. */
static inline int dt_opencl_enqueue_barrier(const int devid)
{
  return -1;
}
/** No-OpenCL build: no device can be locked, always returns -1. */
static inline int dt_opencl_lock_device(const int dev)
{
  return -1;
}
/** No-OpenCL build: does nothing. */
static inline void dt_opencl_unlock_device(const int dev)
{
}
/** No-OpenCL build: always returns -1.
 * NOTE(review): takes only (dev, filename) while the HAVE_OPENCL prototype has eight
 * parameters, so the two variants are not call-compatible. */
static inline int dt_opencl_load_program(const int dev, const char *filename)
{
  return -1;
}
/** No-OpenCL build: always returns -1.
 * NOTE(review): takes only (dev, program) while the HAVE_OPENCL prototype has six
 * parameters, so the two variants are not call-compatible. */
static inline int dt_opencl_build_program(const int dev, const int program)
{
  return -1;
}
/** No-OpenCL build: no kernel is created, always returns -1. */
static inline int dt_opencl_create_kernel(const int program, const char *name)
{
  return -1;
}
/** No-OpenCL build: does nothing. */
static inline void dt_opencl_free_kernel(const int kernel)
{
}
/** No-OpenCL build: always returns -1; sizes is not written. */
static inline int dt_opencl_get_max_work_item_sizes(const int dev, size_t *sizes)
{
  return -1;
}
/** No-OpenCL build: always returns -1; none of the output parameters is written. */
static inline int dt_opencl_get_work_group_limits(const int dev, size_t *sizes, size_t *workgroupsize,
                                                  unsigned long *localmemsize)
{
  return -1;
}
/** No-OpenCL build: always returns -1; kernelworkgroupsize is not written. */
static inline int dt_opencl_get_kernel_work_group_size(const int dev, const int kernel,
                                                       size_t *kernelworkgroupsize)
{
  return -1;
}
/** No-OpenCL build: always returns -1.
 * NOTE(review): lacks the argument index `num` that the HAVE_OPENCL prototype takes
 * between kernel and size, so the two variants are not call-compatible. */
static inline int dt_opencl_set_kernel_arg(const int dev, const int kernel, const size_t size, const void *arg)
{
  return -1;
}
/** No-OpenCL build: nothing is launched, always returns -1. */
static inline int dt_opencl_enqueue_kernel_2d(const int dev, const int kernel, const size_t *sizes)
{
  return -1;
}
/** No-OpenCL build: nothing is launched, always returns -1. */
static inline int dt_opencl_enqueue_kernel_2d_with_local(const int dev, const int kernel, const size_t *sizes,
                                                         const size_t *local)
{
  return -1;
}
/** No-OpenCL build: OpenCL is never inited, always returns 0. */
static inline int dt_opencl_is_inited(void)
{
  return 0;
}
/** No-OpenCL build: OpenCL is never enabled, always returns 0. */
static inline int dt_opencl_is_enabled(void)
{
  return 0;
}
/** No-OpenCL build: does nothing. */
static inline void dt_opencl_disable(void)
{
}
/** No-OpenCL build: no settings are read, always returns 0. */
static inline int dt_opencl_update_settings(void)
{
  return 0;
}
/** No-OpenCL build: always returns 0, whatever size is asked for. */
static inline int dt_opencl_image_fits_device(const int devid, const size_t width, const size_t height,
                                              const unsigned bpp, const float factor, const size_t overhead)
{
  return 0;
}
/** No-OpenCL build: always returns 0.
 * NOTE(review): returns int here while the HAVE_OPENCL prototype returns cl_ulong. */
static inline int dt_opencl_get_max_global_mem(const int devid)
{
  return 0;
}
/** No-OpenCL build: does nothing; mem is neither read nor freed.
 * NOTE(review): takes void * here while the HAVE_OPENCL prototype takes cl_mem. */
static inline void dt_opencl_release_mem_object(void *mem)
{
}
/** No-OpenCL build: there is no event list, always returns NULL.
 * NOTE(review): returns void * here while the HAVE_OPENCL prototype returns cl_event *. */
static inline void *dt_opencl_events_get_slot(const int devid, const char *tag)
{
  return NULL;
}
/** No-OpenCL build: does nothing. */
static inline void dt_opencl_events_reset(const int devid)
{
}
/** No-OpenCL build: returns immediately without waiting for anything. */
static inline void dt_opencl_events_wait_for(const int devid)
{
}
/** No-OpenCL build: always returns 0, independent of devid and reset. */
static inline int dt_opencl_events_flush(const int devid, const int reset)
{
  return 0;
}
/** No-OpenCL build: does nothing, no profiling information is printed. */
static inline void dt_opencl_events_profiling(const int devid, const int aggregated)
{
}
523 #endif
524
// modelines: These editor modelines have been set for all relevant files by tools/update_modelines.sh
// vim: shiftwidth=2 expandtab tabstop=2 cindent
// kate: tab-indents: off; indent-width 2; replace-tabs on; indent-mode cstyle; remove-trailing-spaces modified;
528