1 #ifdef HAVE_OPENCL
2 /*
3 This file is part of darktable,
4 Copyright (C) 2016-2020 darktable developers.
5
6 darktable is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
10
11 darktable is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with darktable. If not, see <http://www.gnu.org/licenses/>.
18 */
19 #include "common/darktable.h"
20 #include "common/opencl.h"
21 #include "common/locallaplaciancl.h"
22
// upper bound on the number of pyramid levels; also the length of every
// per-level buffer table allocated in this file
#define max_levels 30
// number of curve-processed pyramids; pyramid k is built for the sample
// value (k+.5)/num_gamma
#define num_gamma 6
25
// Extent of one image dimension (width or height) at pyramid level `level`.
// Every level maps size -> (size-1)/2 + 1, i.e. half the size rounded up for
// size >= 1. A level <= 0 returns size unchanged.
static inline uint64_t dl(uint64_t size, const int level)
{
  int remaining = level;
  while(remaining-- > 0)
    size = ((size - 1) >> 1) + 1;
  return size;
}
33
dt_local_laplacian_init_cl_global()34 dt_local_laplacian_cl_global_t *dt_local_laplacian_init_cl_global()
35 {
36 dt_local_laplacian_cl_global_t *g = malloc(sizeof(dt_local_laplacian_cl_global_t));
37
38 const int program = 19; // locallaplacian.cl, from programs.conf
39 g->kernel_pad_input = dt_opencl_create_kernel(program, "pad_input");
40 g->kernel_gauss_expand = dt_opencl_create_kernel(program, "gauss_expand");
41 g->kernel_gauss_reduce = dt_opencl_create_kernel(program, "gauss_reduce");
42 g->kernel_laplacian_assemble = dt_opencl_create_kernel(program, "laplacian_assemble");
43 g->kernel_process_curve = dt_opencl_create_kernel(program, "process_curve");
44 g->kernel_write_back = dt_opencl_create_kernel(program, "write_back");
45 return g;
46 }
47
// Release a context obtained from dt_local_laplacian_init_cl(): all per-level
// device buffers, the host-side tables holding them, and the context itself.
// Passing NULL is a no-op. Any of the buffer tables may be NULL (context that
// was only partially set up); such tables are skipped.
void dt_local_laplacian_free_cl(dt_local_laplacian_cl_t *g)
{
  if(!g) return;
  // be sure we're done with the memory:
  dt_opencl_finish(g->devid);

  // free device mem
  for(int l = 0; l < max_levels; l++)
  {
    if(g->dev_padded) dt_opencl_release_mem_object(g->dev_padded[l]);
    if(g->dev_output) dt_opencl_release_mem_object(g->dev_output[l]);
    if(g->dev_processed)
    {
      for(int k = 0; k < num_gamma; k++)
        if(g->dev_processed[k]) dt_opencl_release_mem_object(g->dev_processed[k][l]);
    }
  }

  // free the host tables; free(NULL) is fine, only the outer table needs a guard
  if(g->dev_processed)
  {
    for(int k = 0; k < num_gamma; k++) free(g->dev_processed[k]);
  }
  free(g->dev_padded);
  free(g->dev_output);
  free(g->dev_processed);
  free(g);
}
70
dt_local_laplacian_init_cl(const int devid,const int width,const int height,const float sigma,const float shadows,const float highlights,const float clarity)71 dt_local_laplacian_cl_t *dt_local_laplacian_init_cl(
72 const int devid,
73 const int width, // width of input image
74 const int height, // height of input image
75 const float sigma, // user param: separate shadows/midtones/highlights
76 const float shadows, // user param: lift shadows
77 const float highlights, // user param: compress highlights
78 const float clarity) // user param: increase clarity/local contrast
79 {
80 dt_local_laplacian_cl_t *g = malloc(sizeof(dt_local_laplacian_cl_t));
81 if(!g) return NULL;
82
83 g->global = darktable.opencl->local_laplacian;
84 g->devid = devid;
85 g->width = width;
86 g->height = height;
87 g->sigma = sigma;
88 g->shadows = shadows;
89 g->highlights = highlights;
90 g->clarity = clarity;
91 g->dev_padded = calloc(max_levels, sizeof(cl_mem));
92 g->dev_output = calloc(max_levels, sizeof(cl_mem));
93 g->dev_processed = calloc(num_gamma, sizeof(cl_mem *));
94 for(int k=0;k<num_gamma;k++)
95 g->dev_processed[k] = calloc(max_levels, sizeof(cl_mem));
96
97 g->num_levels = MIN(max_levels, 31-__builtin_clz(MIN(width,height)));
98 g->max_supp = 1<<(g->num_levels-1);
99 g->bwidth = ROUNDUPWD(width + 2*g->max_supp);
100 g->bheight = ROUNDUPHT(height + 2*g->max_supp);
101
102 // get intermediate vector buffers with read-write access
103 for(int l=0;l<g->num_levels;l++)
104 {
105 g->dev_padded[l] = dt_opencl_alloc_device(devid, ROUNDUPWD(dl(g->bwidth, l)), ROUNDUPHT(dl(g->bheight, l)), sizeof(float));
106 if(!g->dev_padded[l]) goto error;
107 g->dev_output[l] = dt_opencl_alloc_device(devid, ROUNDUPWD(dl(g->bwidth, l)), ROUNDUPHT(dl(g->bheight, l)), sizeof(float));
108 if(!g->dev_output[l]) goto error;
109 for(int k=0;k<num_gamma;k++)
110 {
111 g->dev_processed[k][l] = dt_opencl_alloc_device(devid, ROUNDUPWD(dl(g->bwidth, l)), ROUNDUPHT(dl(g->bheight, l)), sizeof(float));
112 if(!g->dev_processed[k][l]) goto error;
113 }
114 }
115
116 return g;
117
118 error:
119 fprintf(stderr, "[local laplacian cl] could not allocate temporary buffers\n");
120 dt_local_laplacian_free_cl(g);
121 return NULL;
122 }
123
dt_local_laplacian_cl(dt_local_laplacian_cl_t * b,cl_mem input,cl_mem output)124 cl_int dt_local_laplacian_cl(
125 dt_local_laplacian_cl_t *b, // opencl context with temp buffers
126 cl_mem input, // input buffer in some Labx or yuvx format
127 cl_mem output) // output buffer with colour
128 {
129 cl_int err = -666;
130
131 if(b->bwidth <= 1 || b->bheight <= 1) return err;
132
133 size_t sizes_pad[] = { ROUNDUPWD(b->bwidth), ROUNDUPHT(b->bheight), 1 };
134 dt_opencl_set_kernel_arg(b->devid, b->global->kernel_pad_input, 0, sizeof(cl_mem), &input);
135 dt_opencl_set_kernel_arg(b->devid, b->global->kernel_pad_input, 1, sizeof(cl_mem), &b->dev_padded[0]);
136 dt_opencl_set_kernel_arg(b->devid, b->global->kernel_pad_input, 2, sizeof(int), &b->width);
137 dt_opencl_set_kernel_arg(b->devid, b->global->kernel_pad_input, 3, sizeof(int), &b->height);
138 dt_opencl_set_kernel_arg(b->devid, b->global->kernel_pad_input, 4, sizeof(int), &b->max_supp);
139 dt_opencl_set_kernel_arg(b->devid, b->global->kernel_pad_input, 5, sizeof(int), &b->bwidth);
140 dt_opencl_set_kernel_arg(b->devid, b->global->kernel_pad_input, 6, sizeof(int), &b->bheight);
141 err = dt_opencl_enqueue_kernel_2d(b->devid, b->global->kernel_pad_input, sizes_pad);
142 if(err != CL_SUCCESS) goto error;
143
144 // create gauss pyramid of padded input, write coarse directly to output
145 for(int l=1;l<b->num_levels;l++)
146 {
147 const int wd = dl(b->bwidth, l), ht = dl(b->bheight, l);
148 size_t sizes[] = { ROUNDUPWD(wd), ROUNDUPHT(ht), 1 };
149 dt_opencl_set_kernel_arg(b->devid, b->global->kernel_gauss_reduce, 0, sizeof(cl_mem), &b->dev_padded[l-1]);
150 if(l == b->num_levels-1)
151 dt_opencl_set_kernel_arg(b->devid, b->global->kernel_gauss_reduce, 1, sizeof(cl_mem), &b->dev_output[l]);
152 else
153 dt_opencl_set_kernel_arg(b->devid, b->global->kernel_gauss_reduce, 1, sizeof(cl_mem), &b->dev_padded[l]);
154 dt_opencl_set_kernel_arg(b->devid, b->global->kernel_gauss_reduce, 2, sizeof(int), &wd);
155 dt_opencl_set_kernel_arg(b->devid, b->global->kernel_gauss_reduce, 3, sizeof(int), &ht);
156 err = dt_opencl_enqueue_kernel_2d(b->devid, b->global->kernel_gauss_reduce, sizes);
157 if(err != CL_SUCCESS) goto error;
158 }
159
160 for(int k=0;k<num_gamma;k++)
161 { // process images
162 const float g = (k+.5f)/(float)num_gamma;
163 dt_opencl_set_kernel_arg(b->devid, b->global->kernel_process_curve, 0, sizeof(cl_mem), &b->dev_padded[0]);
164 dt_opencl_set_kernel_arg(b->devid, b->global->kernel_process_curve, 1, sizeof(cl_mem), &b->dev_processed[k][0]);
165 dt_opencl_set_kernel_arg(b->devid, b->global->kernel_process_curve, 2, sizeof(float), &g);
166 dt_opencl_set_kernel_arg(b->devid, b->global->kernel_process_curve, 3, sizeof(float), &b->sigma);
167 dt_opencl_set_kernel_arg(b->devid, b->global->kernel_process_curve, 4, sizeof(float), &b->shadows);
168 dt_opencl_set_kernel_arg(b->devid, b->global->kernel_process_curve, 5, sizeof(float), &b->highlights);
169 dt_opencl_set_kernel_arg(b->devid, b->global->kernel_process_curve, 6, sizeof(float), &b->clarity);
170 dt_opencl_set_kernel_arg(b->devid, b->global->kernel_process_curve, 7, sizeof(int), &b->bwidth);
171 dt_opencl_set_kernel_arg(b->devid, b->global->kernel_process_curve, 8, sizeof(int), &b->bheight);
172 err = dt_opencl_enqueue_kernel_2d(b->devid, b->global->kernel_process_curve, sizes_pad);
173 if(err != CL_SUCCESS) goto error;
174
175 // create gaussian pyramids
176 for(int l=1;l<b->num_levels;l++)
177 {
178 const int wd = dl(b->bwidth, l), ht = dl(b->bheight, l);
179 size_t sizes[] = { ROUNDUPWD(wd), ROUNDUPHT(ht), 1 };
180 dt_opencl_set_kernel_arg(b->devid, b->global->kernel_gauss_reduce, 0, sizeof(cl_mem), &b->dev_processed[k][l-1]);
181 dt_opencl_set_kernel_arg(b->devid, b->global->kernel_gauss_reduce, 1, sizeof(cl_mem), &b->dev_processed[k][l]);
182 dt_opencl_set_kernel_arg(b->devid, b->global->kernel_gauss_reduce, 2, sizeof(int), &wd);
183 dt_opencl_set_kernel_arg(b->devid, b->global->kernel_gauss_reduce, 3, sizeof(int), &ht);
184 err = dt_opencl_enqueue_kernel_2d(b->devid, b->global->kernel_gauss_reduce, sizes);
185 if(err != CL_SUCCESS) goto error;
186 }
187 }
188
189 // assemble output pyramid coarse to fine
190 for(int l=b->num_levels-2;l >= 0; l--)
191 {
192 const int pw = dl(b->bwidth,l), ph = dl(b->bheight,l);
193 size_t sizes[] = { ROUNDUPWD(pw), ROUNDUPHT(ph), 1 };
194 // this is so dumb:
195 dt_opencl_set_kernel_arg(b->devid, b->global->kernel_laplacian_assemble, 0, sizeof(cl_mem), &b->dev_padded[l]);
196 dt_opencl_set_kernel_arg(b->devid, b->global->kernel_laplacian_assemble, 1, sizeof(cl_mem), &b->dev_output[l+1]);
197 dt_opencl_set_kernel_arg(b->devid, b->global->kernel_laplacian_assemble, 2, sizeof(cl_mem), &b->dev_output[l]);
198 dt_opencl_set_kernel_arg(b->devid, b->global->kernel_laplacian_assemble, 3, sizeof(cl_mem), &b->dev_processed[0][l]);
199 dt_opencl_set_kernel_arg(b->devid, b->global->kernel_laplacian_assemble, 4, sizeof(cl_mem), &b->dev_processed[0][l+1]);
200 dt_opencl_set_kernel_arg(b->devid, b->global->kernel_laplacian_assemble, 5, sizeof(cl_mem), &b->dev_processed[1][l]);
201 dt_opencl_set_kernel_arg(b->devid, b->global->kernel_laplacian_assemble, 6, sizeof(cl_mem), &b->dev_processed[1][l+1]);
202 dt_opencl_set_kernel_arg(b->devid, b->global->kernel_laplacian_assemble, 7, sizeof(cl_mem), &b->dev_processed[2][l]);
203 dt_opencl_set_kernel_arg(b->devid, b->global->kernel_laplacian_assemble, 8, sizeof(cl_mem), &b->dev_processed[2][l+1]);
204 dt_opencl_set_kernel_arg(b->devid, b->global->kernel_laplacian_assemble, 9, sizeof(cl_mem), &b->dev_processed[3][l]);
205 dt_opencl_set_kernel_arg(b->devid, b->global->kernel_laplacian_assemble, 10, sizeof(cl_mem), &b->dev_processed[3][l+1]);
206 dt_opencl_set_kernel_arg(b->devid, b->global->kernel_laplacian_assemble, 11, sizeof(cl_mem), &b->dev_processed[4][l]);
207 dt_opencl_set_kernel_arg(b->devid, b->global->kernel_laplacian_assemble, 12, sizeof(cl_mem), &b->dev_processed[4][l+1]);
208 dt_opencl_set_kernel_arg(b->devid, b->global->kernel_laplacian_assemble, 13, sizeof(cl_mem), &b->dev_processed[5][l]);
209 dt_opencl_set_kernel_arg(b->devid, b->global->kernel_laplacian_assemble, 14, sizeof(cl_mem), &b->dev_processed[5][l+1]);
210 // dt_opencl_set_kernel_arg(b->devid, b->global->kernel_laplacian_assemble, 15, sizeof(cl_mem), &b->dev_processed[6][l]);
211 // dt_opencl_set_kernel_arg(b->devid, b->global->kernel_laplacian_assemble, 16, sizeof(cl_mem), &b->dev_processed[6][l+1]);
212 // dt_opencl_set_kernel_arg(b->devid, b->global->kernel_laplacian_assemble, 17, sizeof(cl_mem), &b->dev_processed[7][l]);
213 // dt_opencl_set_kernel_arg(b->devid, b->global->kernel_laplacian_assemble, 18, sizeof(cl_mem), &b->dev_processed[7][l+1]);
214 dt_opencl_set_kernel_arg(b->devid, b->global->kernel_laplacian_assemble, 15, sizeof(int), &pw);
215 dt_opencl_set_kernel_arg(b->devid, b->global->kernel_laplacian_assemble, 16, sizeof(int), &ph);
216 err = dt_opencl_enqueue_kernel_2d(b->devid, b->global->kernel_laplacian_assemble, sizes);
217 if(err != CL_SUCCESS) goto error;
218 }
219
220 // read back processed L channel and copy colours:
221 size_t sizes[] = { ROUNDUPWD(b->width), ROUNDUPHT(b->height), 1 };
222 dt_opencl_set_kernel_arg(b->devid, b->global->kernel_write_back, 0, sizeof(cl_mem), &input);
223 dt_opencl_set_kernel_arg(b->devid, b->global->kernel_write_back, 1, sizeof(cl_mem), &b->dev_output[0]);
224 dt_opencl_set_kernel_arg(b->devid, b->global->kernel_write_back, 2, sizeof(cl_mem), &output);
225 dt_opencl_set_kernel_arg(b->devid, b->global->kernel_write_back, 3, sizeof(int), &b->max_supp);
226 dt_opencl_set_kernel_arg(b->devid, b->global->kernel_write_back, 4, sizeof(int), &b->width);
227 dt_opencl_set_kernel_arg(b->devid, b->global->kernel_write_back, 5, sizeof(int), &b->height);
228 err = dt_opencl_enqueue_kernel_2d(b->devid, b->global->kernel_write_back, sizes);
229 if(err != CL_SUCCESS) goto error;
230
231 return CL_SUCCESS;
232
233 error:
234 fprintf(stderr, "[local laplacian cl] failed: %d\n", err);
235 return err;
236 }
237
238 #undef max_levels
239 #undef num_gamma
240 #endif
241