1 #ifdef HAVE_OPENCL
2 /*
3     This file is part of darktable,
4     Copyright (C) 2016-2020 darktable developers.
5 
6     darktable is free software: you can redistribute it and/or modify
7     it under the terms of the GNU General Public License as published by
8     the Free Software Foundation, either version 3 of the License, or
9     (at your option) any later version.
10 
11     darktable is distributed in the hope that it will be useful,
12     but WITHOUT ANY WARRANTY; without even the implied warranty of
13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14     GNU General Public License for more details.
15 
16     You should have received a copy of the GNU General Public License
17     along with darktable.  If not, see <http://www.gnu.org/licenses/>.
18 */
19 #include "common/darktable.h"
20 #include "common/opencl.h"
21 #include "common/locallaplaciancl.h"
22 
23 #define max_levels 30
24 #define num_gamma 6
25 
// downsample width/height to given level: halve `size` `level` times,
// rounding up at every step. a size of zero stays zero and a non-positive
// `level` returns `size` unchanged.
static inline uint64_t dl(uint64_t size, const int level)
{
  for(int l = 0; l < level; l++)
    size = size / 2 + (size & 1); // ceil(size/2); unlike (size-1)/2+1 this cannot wrap at size == 0
  return size;
}
33 
dt_local_laplacian_init_cl_global()34 dt_local_laplacian_cl_global_t *dt_local_laplacian_init_cl_global()
35 {
36   dt_local_laplacian_cl_global_t *g = malloc(sizeof(dt_local_laplacian_cl_global_t));
37 
38   const int program = 19; // locallaplacian.cl, from programs.conf
39   g->kernel_pad_input          = dt_opencl_create_kernel(program, "pad_input");
40   g->kernel_gauss_expand       = dt_opencl_create_kernel(program, "gauss_expand");
41   g->kernel_gauss_reduce       = dt_opencl_create_kernel(program, "gauss_reduce");
42   g->kernel_laplacian_assemble = dt_opencl_create_kernel(program, "laplacian_assemble");
43   g->kernel_process_curve      = dt_opencl_create_kernel(program, "process_curve");
44   g->kernel_write_back         = dt_opencl_create_kernel(program, "write_back");
45   return g;
46 }
47 
/* Destroy a context created by dt_local_laplacian_init_cl().
 *
 * Waits for the device queue via dt_opencl_finish(), releases every device
 * buffer slot of every pyramid level, then frees the host side tables and the
 * context itself. Passing NULL is a no-op. Tables that are NULL (host
 * allocation failed while the context was being built) are skipped instead of
 * being dereferenced.
 */
void dt_local_laplacian_free_cl(dt_local_laplacian_cl_t *g)
{
  if(!g) return;
  // be sure we're done with the memory:
  dt_opencl_finish(g->devid);

  // free device mem of the input pyramid and the output pyramid
  for(int l = 0; l < max_levels; l++)
  {
    if(g->dev_padded) dt_opencl_release_mem_object(g->dev_padded[l]);
    if(g->dev_output) dt_opencl_release_mem_object(g->dev_output[l]);
  }

  // free device mem of the per-gamma pyramids, then their host tables
  if(g->dev_processed)
  {
    for(int k = 0; k < num_gamma; k++)
    {
      if(!g->dev_processed[k]) continue;
      for(int l = 0; l < max_levels; l++)
        dt_opencl_release_mem_object(g->dev_processed[k][l]);
      free(g->dev_processed[k]);
    }
  }

  free(g->dev_padded);
  free(g->dev_output);
  free(g->dev_processed);
  free(g);
}
70 
dt_local_laplacian_init_cl(const int devid,const int width,const int height,const float sigma,const float shadows,const float highlights,const float clarity)71 dt_local_laplacian_cl_t *dt_local_laplacian_init_cl(
72     const int devid,
73     const int width,            // width of input image
74     const int height,           // height of input image
75     const float sigma,          // user param: separate shadows/midtones/highlights
76     const float shadows,        // user param: lift shadows
77     const float highlights,     // user param: compress highlights
78     const float clarity)        // user param: increase clarity/local contrast
79 {
80   dt_local_laplacian_cl_t *g = malloc(sizeof(dt_local_laplacian_cl_t));
81   if(!g) return NULL;
82 
83   g->global = darktable.opencl->local_laplacian;
84   g->devid = devid;
85   g->width = width;
86   g->height = height;
87   g->sigma = sigma;
88   g->shadows = shadows;
89   g->highlights = highlights;
90   g->clarity = clarity;
91   g->dev_padded = calloc(max_levels, sizeof(cl_mem));
92   g->dev_output = calloc(max_levels, sizeof(cl_mem));
93   g->dev_processed = calloc(num_gamma, sizeof(cl_mem *));
94   for(int k=0;k<num_gamma;k++)
95     g->dev_processed[k] = calloc(max_levels, sizeof(cl_mem));
96 
97   g->num_levels = MIN(max_levels, 31-__builtin_clz(MIN(width,height)));
98   g->max_supp = 1<<(g->num_levels-1);
99   g->bwidth = ROUNDUPWD(width  + 2*g->max_supp);
100   g->bheight = ROUNDUPHT(height + 2*g->max_supp);
101 
102   // get intermediate vector buffers with read-write access
103   for(int l=0;l<g->num_levels;l++)
104   {
105     g->dev_padded[l] = dt_opencl_alloc_device(devid, ROUNDUPWD(dl(g->bwidth, l)), ROUNDUPHT(dl(g->bheight, l)), sizeof(float));
106     if(!g->dev_padded[l]) goto error;
107     g->dev_output[l] = dt_opencl_alloc_device(devid, ROUNDUPWD(dl(g->bwidth, l)), ROUNDUPHT(dl(g->bheight, l)), sizeof(float));
108     if(!g->dev_output[l]) goto error;
109     for(int k=0;k<num_gamma;k++)
110     {
111       g->dev_processed[k][l] = dt_opencl_alloc_device(devid, ROUNDUPWD(dl(g->bwidth, l)), ROUNDUPHT(dl(g->bheight, l)), sizeof(float));
112       if(!g->dev_processed[k][l]) goto error;
113     }
114   }
115 
116   return g;
117 
118 error:
119   fprintf(stderr, "[local laplacian cl] could not allocate temporary buffers\n");
120   dt_local_laplacian_free_cl(g);
121   return NULL;
122 }
123 
dt_local_laplacian_cl(dt_local_laplacian_cl_t * b,cl_mem input,cl_mem output)124 cl_int dt_local_laplacian_cl(
125     dt_local_laplacian_cl_t *b, // opencl context with temp buffers
126     cl_mem input,               // input buffer in some Labx or yuvx format
127     cl_mem output)              // output buffer with colour
128 {
129   cl_int err = -666;
130 
131   if(b->bwidth <= 1 || b->bheight <= 1) return err;
132 
133   size_t sizes_pad[] = { ROUNDUPWD(b->bwidth), ROUNDUPHT(b->bheight), 1 };
134   dt_opencl_set_kernel_arg(b->devid, b->global->kernel_pad_input, 0, sizeof(cl_mem), &input);
135   dt_opencl_set_kernel_arg(b->devid, b->global->kernel_pad_input, 1, sizeof(cl_mem), &b->dev_padded[0]);
136   dt_opencl_set_kernel_arg(b->devid, b->global->kernel_pad_input, 2, sizeof(int), &b->width);
137   dt_opencl_set_kernel_arg(b->devid, b->global->kernel_pad_input, 3, sizeof(int), &b->height);
138   dt_opencl_set_kernel_arg(b->devid, b->global->kernel_pad_input, 4, sizeof(int), &b->max_supp);
139   dt_opencl_set_kernel_arg(b->devid, b->global->kernel_pad_input, 5, sizeof(int), &b->bwidth);
140   dt_opencl_set_kernel_arg(b->devid, b->global->kernel_pad_input, 6, sizeof(int), &b->bheight);
141   err = dt_opencl_enqueue_kernel_2d(b->devid, b->global->kernel_pad_input, sizes_pad);
142   if(err != CL_SUCCESS) goto error;
143 
144   // create gauss pyramid of padded input, write coarse directly to output
145   for(int l=1;l<b->num_levels;l++)
146   {
147     const int wd = dl(b->bwidth, l), ht = dl(b->bheight, l);
148     size_t sizes[] = { ROUNDUPWD(wd), ROUNDUPHT(ht), 1 };
149     dt_opencl_set_kernel_arg(b->devid, b->global->kernel_gauss_reduce, 0, sizeof(cl_mem), &b->dev_padded[l-1]);
150     if(l == b->num_levels-1)
151       dt_opencl_set_kernel_arg(b->devid, b->global->kernel_gauss_reduce, 1, sizeof(cl_mem), &b->dev_output[l]);
152     else
153       dt_opencl_set_kernel_arg(b->devid, b->global->kernel_gauss_reduce, 1, sizeof(cl_mem), &b->dev_padded[l]);
154     dt_opencl_set_kernel_arg(b->devid, b->global->kernel_gauss_reduce, 2, sizeof(int), &wd);
155     dt_opencl_set_kernel_arg(b->devid, b->global->kernel_gauss_reduce, 3, sizeof(int), &ht);
156     err = dt_opencl_enqueue_kernel_2d(b->devid, b->global->kernel_gauss_reduce, sizes);
157     if(err != CL_SUCCESS) goto error;
158   }
159 
160   for(int k=0;k<num_gamma;k++)
161   { // process images
162     const float g = (k+.5f)/(float)num_gamma;
163     dt_opencl_set_kernel_arg(b->devid, b->global->kernel_process_curve, 0, sizeof(cl_mem), &b->dev_padded[0]);
164     dt_opencl_set_kernel_arg(b->devid, b->global->kernel_process_curve, 1, sizeof(cl_mem), &b->dev_processed[k][0]);
165     dt_opencl_set_kernel_arg(b->devid, b->global->kernel_process_curve, 2, sizeof(float), &g);
166     dt_opencl_set_kernel_arg(b->devid, b->global->kernel_process_curve, 3, sizeof(float), &b->sigma);
167     dt_opencl_set_kernel_arg(b->devid, b->global->kernel_process_curve, 4, sizeof(float), &b->shadows);
168     dt_opencl_set_kernel_arg(b->devid, b->global->kernel_process_curve, 5, sizeof(float), &b->highlights);
169     dt_opencl_set_kernel_arg(b->devid, b->global->kernel_process_curve, 6, sizeof(float), &b->clarity);
170     dt_opencl_set_kernel_arg(b->devid, b->global->kernel_process_curve, 7, sizeof(int), &b->bwidth);
171     dt_opencl_set_kernel_arg(b->devid, b->global->kernel_process_curve, 8, sizeof(int), &b->bheight);
172     err = dt_opencl_enqueue_kernel_2d(b->devid, b->global->kernel_process_curve, sizes_pad);
173     if(err != CL_SUCCESS) goto error;
174 
175     // create gaussian pyramids
176     for(int l=1;l<b->num_levels;l++)
177     {
178       const int wd = dl(b->bwidth, l), ht = dl(b->bheight, l);
179       size_t sizes[] = { ROUNDUPWD(wd), ROUNDUPHT(ht), 1 };
180       dt_opencl_set_kernel_arg(b->devid, b->global->kernel_gauss_reduce, 0, sizeof(cl_mem), &b->dev_processed[k][l-1]);
181       dt_opencl_set_kernel_arg(b->devid, b->global->kernel_gauss_reduce, 1, sizeof(cl_mem), &b->dev_processed[k][l]);
182       dt_opencl_set_kernel_arg(b->devid, b->global->kernel_gauss_reduce, 2, sizeof(int), &wd);
183       dt_opencl_set_kernel_arg(b->devid, b->global->kernel_gauss_reduce, 3, sizeof(int), &ht);
184       err = dt_opencl_enqueue_kernel_2d(b->devid, b->global->kernel_gauss_reduce, sizes);
185       if(err != CL_SUCCESS) goto error;
186     }
187   }
188 
189   // assemble output pyramid coarse to fine
190   for(int l=b->num_levels-2;l >= 0; l--)
191   {
192     const int pw = dl(b->bwidth,l), ph = dl(b->bheight,l);
193     size_t sizes[] = { ROUNDUPWD(pw), ROUNDUPHT(ph), 1 };
194     // this is so dumb:
195     dt_opencl_set_kernel_arg(b->devid, b->global->kernel_laplacian_assemble,  0, sizeof(cl_mem), &b->dev_padded[l]);
196     dt_opencl_set_kernel_arg(b->devid, b->global->kernel_laplacian_assemble,  1, sizeof(cl_mem), &b->dev_output[l+1]);
197     dt_opencl_set_kernel_arg(b->devid, b->global->kernel_laplacian_assemble,  2, sizeof(cl_mem), &b->dev_output[l]);
198     dt_opencl_set_kernel_arg(b->devid, b->global->kernel_laplacian_assemble,  3, sizeof(cl_mem), &b->dev_processed[0][l]);
199     dt_opencl_set_kernel_arg(b->devid, b->global->kernel_laplacian_assemble,  4, sizeof(cl_mem), &b->dev_processed[0][l+1]);
200     dt_opencl_set_kernel_arg(b->devid, b->global->kernel_laplacian_assemble,  5, sizeof(cl_mem), &b->dev_processed[1][l]);
201     dt_opencl_set_kernel_arg(b->devid, b->global->kernel_laplacian_assemble,  6, sizeof(cl_mem), &b->dev_processed[1][l+1]);
202     dt_opencl_set_kernel_arg(b->devid, b->global->kernel_laplacian_assemble,  7, sizeof(cl_mem), &b->dev_processed[2][l]);
203     dt_opencl_set_kernel_arg(b->devid, b->global->kernel_laplacian_assemble,  8, sizeof(cl_mem), &b->dev_processed[2][l+1]);
204     dt_opencl_set_kernel_arg(b->devid, b->global->kernel_laplacian_assemble,  9, sizeof(cl_mem), &b->dev_processed[3][l]);
205     dt_opencl_set_kernel_arg(b->devid, b->global->kernel_laplacian_assemble, 10, sizeof(cl_mem), &b->dev_processed[3][l+1]);
206     dt_opencl_set_kernel_arg(b->devid, b->global->kernel_laplacian_assemble, 11, sizeof(cl_mem), &b->dev_processed[4][l]);
207     dt_opencl_set_kernel_arg(b->devid, b->global->kernel_laplacian_assemble, 12, sizeof(cl_mem), &b->dev_processed[4][l+1]);
208     dt_opencl_set_kernel_arg(b->devid, b->global->kernel_laplacian_assemble, 13, sizeof(cl_mem), &b->dev_processed[5][l]);
209     dt_opencl_set_kernel_arg(b->devid, b->global->kernel_laplacian_assemble, 14, sizeof(cl_mem), &b->dev_processed[5][l+1]);
210     // dt_opencl_set_kernel_arg(b->devid, b->global->kernel_laplacian_assemble, 15, sizeof(cl_mem), &b->dev_processed[6][l]);
211     // dt_opencl_set_kernel_arg(b->devid, b->global->kernel_laplacian_assemble, 16, sizeof(cl_mem), &b->dev_processed[6][l+1]);
212     // dt_opencl_set_kernel_arg(b->devid, b->global->kernel_laplacian_assemble, 17, sizeof(cl_mem), &b->dev_processed[7][l]);
213     // dt_opencl_set_kernel_arg(b->devid, b->global->kernel_laplacian_assemble, 18, sizeof(cl_mem), &b->dev_processed[7][l+1]);
214     dt_opencl_set_kernel_arg(b->devid, b->global->kernel_laplacian_assemble, 15, sizeof(int), &pw);
215     dt_opencl_set_kernel_arg(b->devid, b->global->kernel_laplacian_assemble, 16, sizeof(int), &ph);
216     err = dt_opencl_enqueue_kernel_2d(b->devid, b->global->kernel_laplacian_assemble, sizes);
217     if(err != CL_SUCCESS) goto error;
218   }
219 
220   // read back processed L channel and copy colours:
221   size_t sizes[] = { ROUNDUPWD(b->width), ROUNDUPHT(b->height), 1 };
222   dt_opencl_set_kernel_arg(b->devid, b->global->kernel_write_back, 0, sizeof(cl_mem), &input);
223   dt_opencl_set_kernel_arg(b->devid, b->global->kernel_write_back, 1, sizeof(cl_mem), &b->dev_output[0]);
224   dt_opencl_set_kernel_arg(b->devid, b->global->kernel_write_back, 2, sizeof(cl_mem), &output);
225   dt_opencl_set_kernel_arg(b->devid, b->global->kernel_write_back, 3, sizeof(int), &b->max_supp);
226   dt_opencl_set_kernel_arg(b->devid, b->global->kernel_write_back, 4, sizeof(int), &b->width);
227   dt_opencl_set_kernel_arg(b->devid, b->global->kernel_write_back, 5, sizeof(int), &b->height);
228   err = dt_opencl_enqueue_kernel_2d(b->devid, b->global->kernel_write_back, sizes);
229   if(err != CL_SUCCESS) goto error;
230 
231   return CL_SUCCESS;
232 
233 error:
234   fprintf(stderr, "[local laplacian cl] failed: %d\n", err);
235   return err;
236 }
237 
238 #undef max_levels
239 #undef num_gamma
240 #endif
241