1 /*
2     This file is part of darktable,
3     Copyright (C) 2017-2021 darktable developers.
4 
5     darktable is free software: you can redistribute it and/or modify
6     it under the terms of the GNU General Public License as published by
7     the Free Software Foundation, either version 3 of the License, or
8     (at your option) any later version.
9 
10     darktable is distributed in the hope that it will be useful,
11     but WITHOUT ANY WARRANTY; without even the implied warranty of
12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13     GNU General Public License for more details.
14 
15     You should have received a copy of the GNU General Public License
16     along with darktable.  If not, see <http://www.gnu.org/licenses/>.
17 
18 
19     Implementation of the guided image filter as described in
20 
21     "Guided Image Filtering" by Kaiming He, Jian Sun, and Xiaoou Tang in
22     K. Daniilidis, P. Maragos, N. Paragios (Eds.): ECCV 2010, Part I,
23     LNCS 6311, pp. 1-14, 2010. Springer-Verlag Berlin Heidelberg 2010
24 
25     "Guided Image Filtering" by Kaiming He, Jian Sun, and Xiaoou Tang in
26     IEEE Transactions on Pattern Analysis and Machine Intelligence, vol. 35,
27     no. 6, June 2013, 1397-1409
28 
29 */
30 
31 #include "common/box_filters.h"
32 #include "common/guided_filter.h"
33 #include "common/math.h"
34 #include "common/opencl.h"
35 #include <assert.h>
36 #include <float.h>
37 #include <stdlib.h>
38 #include <string.h>
39 
40 // processing is split into tiles of this size (or three times the filter
41 // width, if greater) to keep memory use under control.
42 #define GF_TILE_SIZE 512
43 
44 // some shorthand to make code more legible
45 // if we have OpenMP simd enabled, declare a vectorizable for loop;
46 // otherwise, just leave it a plain for()
47 #if defined(_OPENMP) && defined(OPENMP_SIMD_)
48 #define SIMD_FOR \
49   _Pragma("omp simd") \
50   for
51 #else
52 #define SIMD_FOR for
53 #endif
54 
55 // avoid cluttering the scalar codepath with #ifdefs by hiding the dependency on SSE2
56 #ifndef __SSE2__
57 # define _mm_prefetch(where,hint)
58 #endif
59 
60 // the filter does internal tiling to keep memory requirements reasonable, so this structure
61 // defines the position of the tile being processed
62 typedef struct tile
63 {
64   int left, right, lower, upper;
65 } tile;
66 
67 typedef struct color_image
68 {
69   float *data;
70   int width, height, stride;
71 } color_image;
72 
73 // allocate space for n-component image of size width x height
new_color_image(int width,int height,int ch)74 static inline color_image new_color_image(int width, int height, int ch)
75 {
76   return (color_image){ dt_alloc_align_float((size_t)width * height * ch), width, height, ch };
77 }
78 
79 // free space for n-component image
free_color_image(color_image * img_p)80 static inline void free_color_image(color_image *img_p)
81 {
82   dt_free_align(img_p->data);
83   img_p->data = NULL;
84 }
85 
86 // get a pointer to pixel number 'i' within the image
get_color_pixel(color_image img,size_t i)87 static inline float *get_color_pixel(color_image img, size_t i)
88 {
89   return img.data + i * img.stride;
90 }
91 
92 
93 // apply guided filter to single-component image img using the 3-components image imgg as a guide
94 // the filtering applies a monochrome box filter to a total of 13 image channels:
95 //    1 monochrome input image
96 //    3 color guide image
97 //    3 covariance (R, G, B)
98 //    6 variance (R-R, R-G, R-B, G-G, G-B, B-B)
99 // for computational efficiency, we'll pack them into a four-channel image and a 9-channel image
100 // image instead of running 13 separate box filters: guide+input, R/G/B/R-R/R-G/R-B/G-G/G-B/B-B.
guided_filter_tiling(color_image imgg,gray_image img,gray_image img_out,tile target,const int w,const float eps,const float guide_weight,const float min,const float max)101 static void guided_filter_tiling(color_image imgg, gray_image img, gray_image img_out, tile target, const int w,
102                                  const float eps, const float guide_weight, const float min, const float max)
103 {
104   const tile source = { max_i(target.left - 2 * w, 0), min_i(target.right + 2 * w, imgg.width),
105                         max_i(target.lower - 2 * w, 0), min_i(target.upper + 2 * w, imgg.height) };
106   const int width = source.right - source.left;
107   const int height = source.upper - source.lower;
108   size_t size = (size_t)width * (size_t)height;
109 // since we're packing multiple monochrome planes into a color image, define symbolic constants so that
110 // we can keep track of which values we're actually using
111 #define INP_MEAN 0
112 #define GUIDE_MEAN_R 1
113 #define GUIDE_MEAN_G 2
114 #define GUIDE_MEAN_B 3
115 #define COV_R 0
116 #define COV_G 1
117 #define COV_B 2
118 #define VAR_RR 3
119 #define VAR_RG 4
120 #define VAR_RB 5
121 #define VAR_GG 6
122 #define VAR_BB 8
123 #define VAR_GB 7
124   color_image mean = new_color_image(width, height, 4);
125   color_image variance = new_color_image(width, height, 9);
126   const size_t img_dimen = mean.width;
127   size_t img_bak_sz;
128   float *img_bak = dt_alloc_perthread_float(9*img_dimen, &img_bak_sz);
129 #ifdef _OPENMP
130 #pragma omp parallel for schedule(static) default(none) shared(img, imgg, mean, variance, img_bak) \
131   dt_omp_firstprivate(img_bak_sz, img_dimen, w, guide_weight) dt_omp_sharedconst(source)
132 #endif
133   for(int j_imgg = source.lower; j_imgg < source.upper; j_imgg++)
134   {
135     int j = j_imgg - source.lower;
136     float *const restrict meanpx = mean.data + 4 * j * mean.width;
137     float *const restrict varpx = variance.data + 9 * j * variance.width;
138     for(int i_imgg = source.left; i_imgg < source.right; i_imgg++)
139     {
140       size_t i = i_imgg - source.left;
141       const float *pixel_ = get_color_pixel(imgg, i_imgg + (size_t)j_imgg * imgg.width);
142       float DT_ALIGNED_PIXEL pixel[4] =
143         { pixel_[0] * guide_weight, pixel_[1] * guide_weight, pixel_[2] * guide_weight, pixel_[3] * guide_weight };
144       const float input = img.data[i_imgg + (size_t)j_imgg * img.width];
145       meanpx[4*i+INP_MEAN] = input;
146       meanpx[4*i+GUIDE_MEAN_R] = pixel[0];
147       meanpx[4*i+GUIDE_MEAN_G] = pixel[1];
148       meanpx[4*i+GUIDE_MEAN_B] = pixel[2];
149       varpx[9*i+COV_R] = pixel[0] * input;
150       varpx[9*i+COV_G] = pixel[1] * input;
151       varpx[9*i+COV_B] = pixel[2] * input;
152       varpx[9*i+VAR_RR] = pixel[0] * pixel[0];
153       varpx[9*i+VAR_RG] = pixel[0] * pixel[1];
154       varpx[9*i+VAR_RB] = pixel[0] * pixel[2];
155       varpx[9*i+VAR_GG] = pixel[1] * pixel[1];
156       varpx[9*i+VAR_GB] = pixel[1] * pixel[2];
157       varpx[9*i+VAR_BB] = pixel[2] * pixel[2];
158     }
159     // apply horizontal pass of box mean filter while the cache is still hot
160     float *const restrict scratch = dt_get_perthread(img_bak, img_bak_sz);
161     dt_box_mean_horizontal(meanpx, mean.width, 4|BOXFILTER_KAHAN_SUM, w, scratch);
162     dt_box_mean_horizontal(varpx, variance.width, 9|BOXFILTER_KAHAN_SUM, w, scratch);
163   }
164   dt_free_align(img_bak);
165   dt_box_mean_vertical(mean.data, mean.height, mean.width, 4|BOXFILTER_KAHAN_SUM, w);
166   dt_box_mean_vertical(variance.data, variance.height, variance.width, 9|BOXFILTER_KAHAN_SUM, w);
167   // we will recycle memory of 'mean' for the new coefficient arrays a_? and b to reduce memory foot print
168   color_image a_b = mean;
169   #define A_RED 0
170   #define A_GREEN 1
171   #define A_BLUE 2
172   #define B 3
173 #ifdef _OPENMP
174 #pragma omp parallel for schedule(static) default(none) \
175   dt_omp_firstprivate(size, eps) shared(mean, variance, a_b)
176 #endif
177   for(size_t i = 0; i < size; i++)
178   {
179     const float *meanpx = get_color_pixel(mean, i);
180     const float inp_mean = meanpx[INP_MEAN];
181     const float guide_r = meanpx[GUIDE_MEAN_R];
182     const float guide_g = meanpx[GUIDE_MEAN_G];
183     const float guide_b = meanpx[GUIDE_MEAN_B];
184     float *const varpx = get_color_pixel(variance, i);
185     // solve linear system of equations of size 3x3 via Cramer's rule
186     // symmetric coefficient matrix
187     const float Sigma_0_0 = varpx[VAR_RR] - (guide_r * guide_r) + eps;
188     const float Sigma_0_1 = varpx[VAR_RG] - (guide_r * guide_g);
189     const float Sigma_0_2 = varpx[VAR_RB] - (guide_r * guide_b);
190     const float Sigma_1_1 = varpx[VAR_GG] - (guide_g * guide_g) + eps;;
191     const float Sigma_1_2 = varpx[VAR_GB] - (guide_g * guide_b);
192     const float Sigma_2_2 = varpx[VAR_BB] - (guide_b * guide_b) + eps;
193     const float det0 = Sigma_0_0 * (Sigma_1_1 * Sigma_2_2 - Sigma_1_2 * Sigma_1_2)
194       - Sigma_0_1 * (Sigma_0_1 * Sigma_2_2 - Sigma_0_2 * Sigma_1_2)
195       + Sigma_0_2 * (Sigma_0_1 * Sigma_1_2 - Sigma_0_2 * Sigma_1_1);
196     float a_r_, a_g_, a_b_, b_;
197     if(fabsf(det0) > 4.f * FLT_EPSILON)
198     {
199       const float cov_r = varpx[COV_R] - guide_r * inp_mean;
200       const float cov_g = varpx[COV_G] - guide_g * inp_mean;
201       const float cov_b = varpx[COV_B] - guide_b * inp_mean;
202       const float det1 = cov_r * (Sigma_1_1 * Sigma_2_2 - Sigma_1_2 * Sigma_1_2)
203         - Sigma_0_1 * (cov_g * Sigma_2_2 - cov_b * Sigma_1_2)
204         + Sigma_0_2 * (cov_g * Sigma_1_2 - cov_b * Sigma_1_1);
205       const float det2 = Sigma_0_0 * (cov_g * Sigma_2_2 - cov_b * Sigma_1_2)
206         - cov_r * (Sigma_0_1 * Sigma_2_2 - Sigma_0_2 * Sigma_1_2)
207         + Sigma_0_2 * (Sigma_0_1 * cov_b - Sigma_0_2 * cov_g);
208       const float det3 = Sigma_0_0 * (Sigma_1_1 * cov_b - Sigma_1_2 * cov_g)
209         - Sigma_0_1 * (Sigma_0_1 * cov_b - Sigma_0_2 * cov_g)
210         + cov_r * (Sigma_0_1 * Sigma_1_2 - Sigma_0_2 * Sigma_1_1);
211       a_r_ = det1 / det0;
212       a_g_ = det2 / det0;
213       a_b_ = det3 / det0;
214       b_ = inp_mean - a_r_ * guide_r - a_g_ * guide_g - a_b_ * guide_b;
215     }
216     else
217     {
218       // linear system is singular
219       a_r_ = 0.f;
220       a_g_ = 0.f;
221       a_b_ = 0.f;
222       b_ = get_color_pixel(mean, i)[INP_MEAN];
223     }
224     // now data of imgg_mean_? is no longer needed, we can safely overwrite aliasing arrays
225     a_b.data[4*i+A_RED] = a_r_;
226     a_b.data[4*i+A_GREEN] = a_g_;
227     a_b.data[4*i+A_BLUE] = a_b_;
228     a_b.data[4*i+B] = b_;
229   }
230   free_color_image(&variance);
231 
232   dt_box_mean(a_b.data, a_b.height, a_b.width, a_b.stride|BOXFILTER_KAHAN_SUM, w, 1);
233 
234 #ifdef _OPENMP
235 #pragma omp parallel for schedule(static) default(none) \
236   shared(target, imgg, a_b, img_out) dt_omp_sharedconst(source) dt_omp_firstprivate(min, max, width, guide_weight)
237 #endif
238   for(int j_imgg = target.lower; j_imgg < target.upper; j_imgg++)
239   {
240     // index of the left most target pixel in the current row
241     size_t l = target.left + (size_t)j_imgg * imgg.width;
242     // index of the left most source pixel in the current row of the
243     // smaller auxiliary gray-scale images a_r, a_g, a_b, and b
244     // excluding boundary data from neighboring tiles
245     size_t k = (target.left - source.left) + (size_t)(j_imgg - source.lower) * width;
246     for(int i_imgg = target.left; i_imgg < target.right; i_imgg++, k++, l++)
247     {
248       const float *pixel = get_color_pixel(imgg, l);
249       const float *px_ab = get_color_pixel(a_b, k);
250       float res = guide_weight * (px_ab[A_RED] * pixel[0] + px_ab[A_GREEN] * pixel[1] + px_ab[A_BLUE] * pixel[2]);
251       res += px_ab[B];
252       img_out.data[i_imgg + (size_t)j_imgg * imgg.width] = CLAMP(res, min, max);
253     }
254   }
255   free_color_image(&mean);
256 }
257 
compute_tile_height(const int height,const int w)258 static int compute_tile_height(const int height, const int w)
259 {
260   int tile_h = max_i(3 * w, GF_TILE_SIZE);
261 #if 0 // enabling the below doesn't make any measureable speed difference, but does cause a handful of pixels
262       // to round off differently (as does changing GF_TILE_SIZE)
263   if ((height % tile_h) > 0 && (height % tile_h) < GF_TILE_SIZE/3)
264   {
265     // if there's just a sliver left over for the last row of tiles, see whether slicing off a few pixels
266     // gives us a mostly-full tile
267     if (height % (tile_h - 8) >= GF_TILE_SIZE/3)
268       tile_h -= 8;
269     else  if (height % (tile_h - w/4) >= GF_TILE_SIZE/3)
270       tile_h -= (w/4);
271     else  if (height % (tile_h - w/2) >= GF_TILE_SIZE/3)
272       tile_h -= (w/2);
273     // try adding a few pixels
274     else if (height % (tile_h + 8) >= GF_TILE_SIZE/3)
275       tile_h += 8;
276     else if (height % (tile_h + 16) >= GF_TILE_SIZE/3)
277       tile_h += 16;
278   }
279 #endif
280   return tile_h;
281 }
282 
compute_tile_width(const int width,const int w)283 static int compute_tile_width(const int width, const int w)
284 {
285   int tile_w = max_i(3 * w, GF_TILE_SIZE);
286 #if 0 // enabling the below doesn't make any measureable speed difference, but does cause a handful of pixels
287       // to round off differently (as does changing GF_TILE_SIZE)
288   if ((width % tile_w) > 0 && (width % tile_w) < GF_TILE_SIZE/2)
289   {
290     // if there's just a sliver left over for the last column of tiles, see whether slicing off a few pixels
291     // gives us a mostly-full tile
292     if (width % (tile_w - 8) >= GF_TILE_SIZE/3)
293       tile_w -= 8;
294     else  if (width % (tile_w - w/4) >= GF_TILE_SIZE/3)
295       tile_w -= (w/4);
296     else  if (width % (tile_w - w/2) >= GF_TILE_SIZE/3)
297       tile_w -= (w/2);
298     // try adding a few pixels
299     else if (width % (tile_w + 8) >= GF_TILE_SIZE/3)
300       tile_w += 8;
301     else if (width % (tile_w + 16) >= GF_TILE_SIZE/3)
302       tile_w += 16;
303   }
304 #endif
305   return tile_w;
306 }
307 
guided_filter(const float * const guide,const float * const in,float * const out,const int width,const int height,const int ch,const int w,const float sqrt_eps,const float guide_weight,const float min,const float max)308 void guided_filter(const float *const guide, const float *const in, float *const out, const int width,
309                    const int height, const int ch,
310                    const int w,              // window size
311                    const float sqrt_eps,     // regularization parameter
312                    const float guide_weight, // to balance the amplitudes in the guiding image and the input image
313                    const float min, const float max)
314 {
315   assert(ch >= 3);
316   assert(w >= 1);
317 
318   color_image img_guide = (color_image){ (float *)guide, width, height, ch };
319   gray_image img_in = (gray_image){ (float *)in, width, height };
320   gray_image img_out = (gray_image){ out, width, height };
321   const int tile_width = compute_tile_width(width,w);
322   const int tile_height = compute_tile_height(height,w);
323   const float eps = sqrt_eps * sqrt_eps; // this is the regularization parameter of the original papers
324 
325   for(int j = 0; j < height; j += tile_height)
326   {
327     for(int i = 0; i < width; i += tile_width)
328     {
329       tile target = { i, min_i(i + tile_width, width), j, min_i(j + tile_height, height) };
330       guided_filter_tiling(img_guide, img_in, img_out, target, w, eps, guide_weight, min, max);
331     }
332   }
333 }
334 
335 #ifdef HAVE_OPENCL
336 
dt_guided_filter_init_cl_global()337 dt_guided_filter_cl_global_t *dt_guided_filter_init_cl_global()
338 {
339   dt_guided_filter_cl_global_t *g = malloc(sizeof(*g));
340   const int program = 26; // guided_filter.cl, from programs.conf
341   g->kernel_guided_filter_split_rgb = dt_opencl_create_kernel(program, "guided_filter_split_rgb_image");
342   g->kernel_guided_filter_box_mean_x = dt_opencl_create_kernel(program, "guided_filter_box_mean_x");
343   g->kernel_guided_filter_box_mean_y = dt_opencl_create_kernel(program, "guided_filter_box_mean_y");
344   g->kernel_guided_filter_guided_filter_covariances
345       = dt_opencl_create_kernel(program, "guided_filter_covariances");
346   g->kernel_guided_filter_guided_filter_variances = dt_opencl_create_kernel(program, "guided_filter_variances");
347   g->kernel_guided_filter_update_covariance = dt_opencl_create_kernel(program, "guided_filter_update_covariance");
348   g->kernel_guided_filter_solve = dt_opencl_create_kernel(program, "guided_filter_solve");
349   g->kernel_guided_filter_generate_result = dt_opencl_create_kernel(program, "guided_filter_generate_result");
350   return g;
351 }
352 
353 
dt_guided_filter_free_cl_global(dt_guided_filter_cl_global_t * g)354 void dt_guided_filter_free_cl_global(dt_guided_filter_cl_global_t *g)
355 {
356   if(!g) return;
357   // destroy kernels
358   dt_opencl_free_kernel(g->kernel_guided_filter_split_rgb);
359   dt_opencl_free_kernel(g->kernel_guided_filter_box_mean_x);
360   dt_opencl_free_kernel(g->kernel_guided_filter_box_mean_y);
361   dt_opencl_free_kernel(g->kernel_guided_filter_guided_filter_covariances);
362   dt_opencl_free_kernel(g->kernel_guided_filter_guided_filter_variances);
363   dt_opencl_free_kernel(g->kernel_guided_filter_update_covariance);
364   dt_opencl_free_kernel(g->kernel_guided_filter_solve);
365   dt_opencl_free_kernel(g->kernel_guided_filter_generate_result);
366   free(g);
367 }
368 
369 
cl_split_rgb(const int devid,const int width,const int height,cl_mem guide,cl_mem imgg_r,cl_mem imgg_g,cl_mem imgg_b,const float guide_weight)370 static int cl_split_rgb(const int devid, const int width, const int height, cl_mem guide, cl_mem imgg_r,
371                         cl_mem imgg_g, cl_mem imgg_b, const float guide_weight)
372 {
373   const int kernel = darktable.opencl->guided_filter->kernel_guided_filter_split_rgb;
374   dt_opencl_set_kernel_arg(devid, kernel, 0, sizeof(width), &width);
375   dt_opencl_set_kernel_arg(devid, kernel, 1, sizeof(height), &height);
376   dt_opencl_set_kernel_arg(devid, kernel, 2, sizeof(guide), &guide);
377   dt_opencl_set_kernel_arg(devid, kernel, 3, sizeof(imgg_r), &imgg_r);
378   dt_opencl_set_kernel_arg(devid, kernel, 4, sizeof(imgg_g), &imgg_g);
379   dt_opencl_set_kernel_arg(devid, kernel, 5, sizeof(imgg_b), &imgg_b);
380   dt_opencl_set_kernel_arg(devid, kernel, 6, sizeof(guide_weight), &guide_weight);
381   const size_t sizes[] = { ROUNDUPWD(width), ROUNDUPWD(height) };
382   return dt_opencl_enqueue_kernel_2d(devid, kernel, sizes);
383 }
384 
385 
cl_box_mean(const int devid,const int width,const int height,const int w,cl_mem in,cl_mem out,cl_mem temp)386 static int cl_box_mean(const int devid, const int width, const int height, const int w, cl_mem in, cl_mem out,
387                        cl_mem temp)
388 {
389   const int kernel_x = darktable.opencl->guided_filter->kernel_guided_filter_box_mean_x;
390   dt_opencl_set_kernel_arg(devid, kernel_x, 0, sizeof(width), &width);
391   dt_opencl_set_kernel_arg(devid, kernel_x, 1, sizeof(height), &height);
392   dt_opencl_set_kernel_arg(devid, kernel_x, 2, sizeof(in), &in);
393   dt_opencl_set_kernel_arg(devid, kernel_x, 3, sizeof(temp), &temp);
394   dt_opencl_set_kernel_arg(devid, kernel_x, 4, sizeof(w), &w);
395   const size_t sizes_x[] = { 1, ROUNDUPWD(height) };
396   const int err = dt_opencl_enqueue_kernel_2d(devid, kernel_x, sizes_x);
397   if(err != CL_SUCCESS) return err;
398 
399   const int kernel_y = darktable.opencl->guided_filter->kernel_guided_filter_box_mean_y;
400   dt_opencl_set_kernel_arg(devid, kernel_y, 0, sizeof(width), &width);
401   dt_opencl_set_kernel_arg(devid, kernel_y, 1, sizeof(height), &height);
402   dt_opencl_set_kernel_arg(devid, kernel_y, 2, sizeof(temp), &temp);
403   dt_opencl_set_kernel_arg(devid, kernel_y, 3, sizeof(out), &out);
404   dt_opencl_set_kernel_arg(devid, kernel_y, 4, sizeof(w), &w);
405   const size_t sizes_y[] = { ROUNDUPWD(width), 1 };
406   return dt_opencl_enqueue_kernel_2d(devid, kernel_y, sizes_y);
407 }
408 
409 
cl_covariances(const int devid,const int width,const int height,cl_mem guide,cl_mem in,cl_mem cov_imgg_img_r,cl_mem cov_imgg_img_g,cl_mem cov_imgg_img_b,const float guide_weight)410 static int cl_covariances(const int devid, const int width, const int height, cl_mem guide, cl_mem in,
411                           cl_mem cov_imgg_img_r, cl_mem cov_imgg_img_g, cl_mem cov_imgg_img_b,
412                           const float guide_weight)
413 {
414   const int kernel = darktable.opencl->guided_filter->kernel_guided_filter_guided_filter_covariances;
415   dt_opencl_set_kernel_arg(devid, kernel, 0, sizeof(width), &width);
416   dt_opencl_set_kernel_arg(devid, kernel, 1, sizeof(height), &height);
417   dt_opencl_set_kernel_arg(devid, kernel, 2, sizeof(guide), &guide);
418   dt_opencl_set_kernel_arg(devid, kernel, 3, sizeof(in), &in);
419   dt_opencl_set_kernel_arg(devid, kernel, 4, sizeof(cov_imgg_img_r), &cov_imgg_img_r);
420   dt_opencl_set_kernel_arg(devid, kernel, 5, sizeof(cov_imgg_img_g), &cov_imgg_img_g);
421   dt_opencl_set_kernel_arg(devid, kernel, 6, sizeof(cov_imgg_img_b), &cov_imgg_img_b);
422   dt_opencl_set_kernel_arg(devid, kernel, 7, sizeof(guide_weight), &guide_weight);
423   const size_t sizes[] = { ROUNDUPWD(width), ROUNDUPWD(height) };
424   return dt_opencl_enqueue_kernel_2d(devid, kernel, sizes);
425 }
426 
427 
cl_variances(const int devid,const int width,const int height,cl_mem guide,cl_mem var_imgg_rr,cl_mem var_imgg_rg,cl_mem var_imgg_rb,cl_mem var_imgg_gg,cl_mem var_imgg_gb,cl_mem var_imgg_bb,const float guide_weight)428 static int cl_variances(const int devid, const int width, const int height, cl_mem guide, cl_mem var_imgg_rr,
429                         cl_mem var_imgg_rg, cl_mem var_imgg_rb, cl_mem var_imgg_gg, cl_mem var_imgg_gb,
430                         cl_mem var_imgg_bb, const float guide_weight)
431 {
432   const int kernel = darktable.opencl->guided_filter->kernel_guided_filter_guided_filter_variances;
433   dt_opencl_set_kernel_arg(devid, kernel, 0, sizeof(width), &width);
434   dt_opencl_set_kernel_arg(devid, kernel, 1, sizeof(height), &height);
435   dt_opencl_set_kernel_arg(devid, kernel, 2, sizeof(guide), &guide);
436   dt_opencl_set_kernel_arg(devid, kernel, 3, sizeof(var_imgg_rr), &var_imgg_rr);
437   dt_opencl_set_kernel_arg(devid, kernel, 4, sizeof(var_imgg_rg), &var_imgg_rg);
438   dt_opencl_set_kernel_arg(devid, kernel, 5, sizeof(var_imgg_rb), &var_imgg_rb);
439   dt_opencl_set_kernel_arg(devid, kernel, 6, sizeof(var_imgg_gg), &var_imgg_gg);
440   dt_opencl_set_kernel_arg(devid, kernel, 7, sizeof(var_imgg_gb), &var_imgg_gb);
441   dt_opencl_set_kernel_arg(devid, kernel, 8, sizeof(var_imgg_bb), &var_imgg_bb);
442   dt_opencl_set_kernel_arg(devid, kernel, 9, sizeof(guide_weight), &guide_weight);
443   size_t sizes[] = { ROUNDUPWD(width), ROUNDUPWD(height) };
444   return dt_opencl_enqueue_kernel_2d(devid, kernel, sizes);
445 }
446 
447 
cl_update_covariance(const int devid,const int width,const int height,cl_mem in,cl_mem out,cl_mem a,cl_mem b,float eps)448 static int cl_update_covariance(const int devid, const int width, const int height, cl_mem in, cl_mem out,
449                                 cl_mem a, cl_mem b, float eps)
450 {
451   const int kernel = darktable.opencl->guided_filter->kernel_guided_filter_update_covariance;
452   dt_opencl_set_kernel_arg(devid, kernel, 0, sizeof(width), &width);
453   dt_opencl_set_kernel_arg(devid, kernel, 1, sizeof(height), &height);
454   dt_opencl_set_kernel_arg(devid, kernel, 2, sizeof(in), &in);
455   dt_opencl_set_kernel_arg(devid, kernel, 3, sizeof(out), &out);
456   dt_opencl_set_kernel_arg(devid, kernel, 4, sizeof(a), &a);
457   dt_opencl_set_kernel_arg(devid, kernel, 5, sizeof(b), &b);
458   dt_opencl_set_kernel_arg(devid, kernel, 6, sizeof(eps), &eps);
459   const size_t sizes[] = { ROUNDUPWD(width), ROUNDUPWD(height) };
460   return dt_opencl_enqueue_kernel_2d(devid, kernel, sizes);
461 }
462 
463 
cl_solve(const int devid,const int width,const int height,cl_mem img_mean,cl_mem imgg_mean_r,cl_mem imgg_mean_g,cl_mem imgg_mean_b,cl_mem cov_imgg_img_r,cl_mem cov_imgg_img_g,cl_mem cov_imgg_img_b,cl_mem var_imgg_rr,cl_mem var_imgg_rg,cl_mem var_imgg_rb,cl_mem var_imgg_gg,cl_mem var_imgg_gb,cl_mem var_imgg_bb,cl_mem a_r,cl_mem a_g,cl_mem a_b,cl_mem b)464 static int cl_solve(const int devid, const int width, const int height, cl_mem img_mean, cl_mem imgg_mean_r,
465                     cl_mem imgg_mean_g, cl_mem imgg_mean_b, cl_mem cov_imgg_img_r, cl_mem cov_imgg_img_g,
466                     cl_mem cov_imgg_img_b, cl_mem var_imgg_rr, cl_mem var_imgg_rg, cl_mem var_imgg_rb,
467                     cl_mem var_imgg_gg, cl_mem var_imgg_gb, cl_mem var_imgg_bb, cl_mem a_r, cl_mem a_g, cl_mem a_b,
468                     cl_mem b)
469 {
470   const int kernel = darktable.opencl->guided_filter->kernel_guided_filter_solve;
471   dt_opencl_set_kernel_arg(devid, kernel, 0, sizeof(width), &width);
472   dt_opencl_set_kernel_arg(devid, kernel, 1, sizeof(height), &height);
473   dt_opencl_set_kernel_arg(devid, kernel, 2, sizeof(img_mean), &img_mean);
474   dt_opencl_set_kernel_arg(devid, kernel, 3, sizeof(imgg_mean_r), &imgg_mean_r);
475   dt_opencl_set_kernel_arg(devid, kernel, 4, sizeof(imgg_mean_g), &imgg_mean_g);
476   dt_opencl_set_kernel_arg(devid, kernel, 5, sizeof(imgg_mean_b), &imgg_mean_b);
477   dt_opencl_set_kernel_arg(devid, kernel, 6, sizeof(cov_imgg_img_r), &cov_imgg_img_r);
478   dt_opencl_set_kernel_arg(devid, kernel, 7, sizeof(cov_imgg_img_g), &cov_imgg_img_g);
479   dt_opencl_set_kernel_arg(devid, kernel, 8, sizeof(cov_imgg_img_b), &cov_imgg_img_b);
480   dt_opencl_set_kernel_arg(devid, kernel, 9, sizeof(var_imgg_rr), &var_imgg_rr);
481   dt_opencl_set_kernel_arg(devid, kernel, 10, sizeof(var_imgg_rg), &var_imgg_rg);
482   dt_opencl_set_kernel_arg(devid, kernel, 11, sizeof(var_imgg_rb), &var_imgg_rb);
483   dt_opencl_set_kernel_arg(devid, kernel, 12, sizeof(var_imgg_gg), &var_imgg_gg);
484   dt_opencl_set_kernel_arg(devid, kernel, 13, sizeof(var_imgg_gb), &var_imgg_gb);
485   dt_opencl_set_kernel_arg(devid, kernel, 14, sizeof(var_imgg_bb), &var_imgg_bb);
486   dt_opencl_set_kernel_arg(devid, kernel, 15, sizeof(a_r), &a_r);
487   dt_opencl_set_kernel_arg(devid, kernel, 16, sizeof(a_g), &a_g);
488   dt_opencl_set_kernel_arg(devid, kernel, 17, sizeof(a_b), &a_b);
489   dt_opencl_set_kernel_arg(devid, kernel, 18, sizeof(b), &b);
490   const size_t sizes[] = { ROUNDUPWD(width), ROUNDUPWD(height) };
491   return dt_opencl_enqueue_kernel_2d(devid, kernel, sizes);
492 }
493 
494 
cl_generate_result(const int devid,const int width,const int height,cl_mem guide,cl_mem a_r,cl_mem a_g,cl_mem a_b,cl_mem b,cl_mem out,const float guide_weight,const float min,const float max)495 static int cl_generate_result(const int devid, const int width, const int height, cl_mem guide, cl_mem a_r,
496                               cl_mem a_g, cl_mem a_b, cl_mem b, cl_mem out, const float guide_weight,
497                               const float min, const float max)
498 {
499   const int kernel = darktable.opencl->guided_filter->kernel_guided_filter_generate_result;
500   dt_opencl_set_kernel_arg(devid, kernel, 0, sizeof(width), &width);
501   dt_opencl_set_kernel_arg(devid, kernel, 1, sizeof(height), &height);
502   dt_opencl_set_kernel_arg(devid, kernel, 2, sizeof(guide), &guide);
503   dt_opencl_set_kernel_arg(devid, kernel, 3, sizeof(a_r), &a_r);
504   dt_opencl_set_kernel_arg(devid, kernel, 4, sizeof(a_g), &a_g);
505   dt_opencl_set_kernel_arg(devid, kernel, 5, sizeof(a_b), &a_b);
506   dt_opencl_set_kernel_arg(devid, kernel, 6, sizeof(b), &b);
507   dt_opencl_set_kernel_arg(devid, kernel, 7, sizeof(out), &out);
508   dt_opencl_set_kernel_arg(devid, kernel, 8, sizeof(guide_weight), &guide_weight);
509   dt_opencl_set_kernel_arg(devid, kernel, 9, sizeof(min), &min);
510   dt_opencl_set_kernel_arg(devid, kernel, 10, sizeof(max), &max);
511   const size_t sizes[] = { ROUNDUPWD(width), ROUNDUPWD(height) };
512   return dt_opencl_enqueue_kernel_2d(devid, kernel, sizes);
513 }
514 
515 
guided_filter_cl_impl(int devid,cl_mem guide,cl_mem in,cl_mem out,const int width,const int height,const int ch,const int w,const float sqrt_eps,const float guide_weight,const float min,const float max)516 static int guided_filter_cl_impl(int devid, cl_mem guide, cl_mem in, cl_mem out, const int width, const int height,
517                                  const int ch,
518                                  const int w,              // window size
519                                  const float sqrt_eps,     // regularization parameter
520                                  const float guide_weight, // to balance the amplitudes in the guiding image and
521                                                            // the input// image
522                                  const float min, const float max)
523 {
524   const float eps = sqrt_eps * sqrt_eps; // this is the regularization parameter of the original papers
525 
526   void *temp1 = dt_opencl_alloc_device(devid, width, height, (int)sizeof(float));
527   void *temp2 = dt_opencl_alloc_device(devid, width, height, (int)sizeof(float));
528   void *imgg_mean_r = dt_opencl_alloc_device(devid, width, height, (int)sizeof(float));
529   void *imgg_mean_g = dt_opencl_alloc_device(devid, width, height, (int)sizeof(float));
530   void *imgg_mean_b = dt_opencl_alloc_device(devid, width, height, (int)sizeof(float));
531   void *img_mean = dt_opencl_alloc_device(devid, width, height, (int)sizeof(float));
532   void *cov_imgg_img_r = dt_opencl_alloc_device(devid, width, height, (int)sizeof(float));
533   void *cov_imgg_img_g = dt_opencl_alloc_device(devid, width, height, (int)sizeof(float));
534   void *cov_imgg_img_b = dt_opencl_alloc_device(devid, width, height, (int)sizeof(float));
535   void *var_imgg_rr = dt_opencl_alloc_device(devid, width, height, (int)sizeof(float));
536   void *var_imgg_gg = dt_opencl_alloc_device(devid, width, height, (int)sizeof(float));
537   void *var_imgg_bb = dt_opencl_alloc_device(devid, width, height, (int)sizeof(float));
538   void *var_imgg_rg = dt_opencl_alloc_device(devid, width, height, (int)sizeof(float));
539   void *var_imgg_rb = dt_opencl_alloc_device(devid, width, height, (int)sizeof(float));
540   void *var_imgg_gb = dt_opencl_alloc_device(devid, width, height, (int)sizeof(float));
541   void *a_r = dt_opencl_alloc_device(devid, width, height, (int)sizeof(float));
542   void *a_g = dt_opencl_alloc_device(devid, width, height, (int)sizeof(float));
543   void *a_b = dt_opencl_alloc_device(devid, width, height, (int)sizeof(float));
544   void *b = temp2;
545 
546   int err = CL_SUCCESS;
547   if(temp1 == NULL || temp2 == NULL ||                                                        //
548      imgg_mean_r == NULL || imgg_mean_g == NULL || imgg_mean_b == NULL || img_mean == NULL || //
549      cov_imgg_img_r == NULL || cov_imgg_img_g == NULL || cov_imgg_img_b == NULL ||            //
550      var_imgg_rr == NULL || var_imgg_gg == NULL || var_imgg_bb == NULL ||                     //
551      var_imgg_rg == NULL || var_imgg_rb == NULL || var_imgg_gb == NULL ||                     //
552      a_r == NULL || a_g == NULL || a_b == NULL)
553   {
554     err = CL_MEM_OBJECT_ALLOCATION_FAILURE;
555     goto error;
556   }
557 
558   err = cl_split_rgb(devid, width, height, guide, imgg_mean_r, imgg_mean_g, imgg_mean_b, guide_weight);
559   if(err != CL_SUCCESS) goto error;
560 
561   err = cl_box_mean(devid, width, height, w, in, img_mean, temp1);
562   if(err != CL_SUCCESS) goto error;
563   err = cl_box_mean(devid, width, height, w, imgg_mean_r, imgg_mean_r, temp1);
564   if(err != CL_SUCCESS) goto error;
565   err = cl_box_mean(devid, width, height, w, imgg_mean_g, imgg_mean_g, temp1);
566   if(err != CL_SUCCESS) goto error;
567   err = cl_box_mean(devid, width, height, w, imgg_mean_b, imgg_mean_b, temp1);
568   if(err != CL_SUCCESS) goto error;
569 
570   err = cl_covariances(devid, width, height, guide, in, cov_imgg_img_r, cov_imgg_img_g, cov_imgg_img_b,
571                        guide_weight);
572   if(err != CL_SUCCESS) goto error;
573 
574   err = cl_variances(devid, width, height, guide, var_imgg_rr, var_imgg_rg, var_imgg_rb, var_imgg_gg, var_imgg_gb,
575                      var_imgg_bb, guide_weight);
576   if(err != CL_SUCCESS) goto error;
577 
578   err = cl_box_mean(devid, width, height, w, cov_imgg_img_r, temp2, temp1);
579   if(err != CL_SUCCESS) goto error;
580   err = cl_update_covariance(devid, width, height, temp2, cov_imgg_img_r, imgg_mean_r, img_mean, 0.f);
581   if(err != CL_SUCCESS) goto error;
582   err = cl_box_mean(devid, width, height, w, cov_imgg_img_g, temp2, temp1);
583   if(err != CL_SUCCESS) goto error;
584   err = cl_update_covariance(devid, width, height, temp2, cov_imgg_img_g, imgg_mean_g, img_mean, 0.f);
585   if(err != CL_SUCCESS) goto error;
586   err = cl_box_mean(devid, width, height, w, cov_imgg_img_b, temp2, temp1);
587   if(err != CL_SUCCESS) goto error;
588   err = cl_update_covariance(devid, width, height, temp2, cov_imgg_img_b, imgg_mean_b, img_mean, 0.f);
589   if(err != CL_SUCCESS) goto error;
590   err = cl_box_mean(devid, width, height, w, var_imgg_rr, temp2, temp1);
591   if(err != CL_SUCCESS) goto error;
592   err = cl_update_covariance(devid, width, height, temp2, var_imgg_rr, imgg_mean_r, imgg_mean_r, eps);
593   if(err != CL_SUCCESS) goto error;
594   err = cl_box_mean(devid, width, height, w, var_imgg_rg, temp2, temp1);
595   if(err != CL_SUCCESS) goto error;
596   err = cl_update_covariance(devid, width, height, temp2, var_imgg_rg, imgg_mean_r, imgg_mean_g, 0.f);
597   if(err != CL_SUCCESS) goto error;
598   err = cl_box_mean(devid, width, height, w, var_imgg_rb, temp2, temp1);
599   if(err != CL_SUCCESS) goto error;
600   err = cl_update_covariance(devid, width, height, temp2, var_imgg_rb, imgg_mean_r, imgg_mean_b, 0.f);
601   if(err != CL_SUCCESS) goto error;
602   err = cl_box_mean(devid, width, height, w, var_imgg_gg, temp2, temp1);
603   if(err != CL_SUCCESS) goto error;
604   err = cl_update_covariance(devid, width, height, temp2, var_imgg_gg, imgg_mean_g, imgg_mean_g, eps);
605   if(err != CL_SUCCESS) goto error;
606   err = cl_box_mean(devid, width, height, w, var_imgg_gb, temp2, temp1);
607   if(err != CL_SUCCESS) goto error;
608   err = cl_update_covariance(devid, width, height, temp2, var_imgg_gb, imgg_mean_g, imgg_mean_b, 0.f);
609   if(err != CL_SUCCESS) goto error;
610   err = cl_box_mean(devid, width, height, w, var_imgg_bb, temp2, temp1);
611   if(err != CL_SUCCESS) goto error;
612   err = cl_update_covariance(devid, width, height, temp2, var_imgg_bb, imgg_mean_b, imgg_mean_b, eps);
613   if(err != CL_SUCCESS) goto error;
614 
615   err = cl_solve(devid, width, height, img_mean, imgg_mean_r, imgg_mean_g, imgg_mean_b, cov_imgg_img_r,
616                  cov_imgg_img_g, cov_imgg_img_b, var_imgg_rr, var_imgg_rg, var_imgg_rb, var_imgg_gg, var_imgg_gb,
617                  var_imgg_bb, a_r, a_g, a_b, b);
618   if(err != CL_SUCCESS) goto error;
619 
620   err = cl_box_mean(devid, width, height, w, a_r, a_r, temp1);
621   if(err != CL_SUCCESS) goto error;
622   err = cl_box_mean(devid, width, height, w, a_g, a_g, temp1);
623   if(err != CL_SUCCESS) goto error;
624   err = cl_box_mean(devid, width, height, w, a_b, a_b, temp1);
625   if(err != CL_SUCCESS) goto error;
626   err = cl_box_mean(devid, width, height, w, b, b, temp1);
627   if(err != CL_SUCCESS) goto error;
628 
629   err = cl_generate_result(devid, width, height, guide, a_r, a_g, a_b, b, out, guide_weight, min, max);
630 
631 error:
632   if(err != CL_SUCCESS) dt_print(DT_DEBUG_OPENCL, "[guided filter] unknown error: %d\n", err);
633 
634   dt_opencl_release_mem_object(a_r);
635   dt_opencl_release_mem_object(a_g);
636   dt_opencl_release_mem_object(a_b);
637   dt_opencl_release_mem_object(var_imgg_rr);
638   dt_opencl_release_mem_object(var_imgg_rg);
639   dt_opencl_release_mem_object(var_imgg_rb);
640   dt_opencl_release_mem_object(var_imgg_gg);
641   dt_opencl_release_mem_object(var_imgg_gb);
642   dt_opencl_release_mem_object(var_imgg_bb);
643   dt_opencl_release_mem_object(cov_imgg_img_r);
644   dt_opencl_release_mem_object(cov_imgg_img_g);
645   dt_opencl_release_mem_object(cov_imgg_img_b);
646   dt_opencl_release_mem_object(img_mean);
647   dt_opencl_release_mem_object(imgg_mean_r);
648   dt_opencl_release_mem_object(imgg_mean_g);
649   dt_opencl_release_mem_object(imgg_mean_b);
650   dt_opencl_release_mem_object(temp1);
651   dt_opencl_release_mem_object(temp2);
652 
653   return err;
654 }
655 
656 
guided_filter_cl_fallback(int devid,cl_mem guide,cl_mem in,cl_mem out,const int width,const int height,const int ch,const int w,const float sqrt_eps,const float guide_weight,const float min,const float max)657 static void guided_filter_cl_fallback(int devid, cl_mem guide, cl_mem in, cl_mem out, const int width,
658                                       const int height, const int ch,
659                                       const int w,              // window size
660                                       const float sqrt_eps,     // regularization parameter
661                                       const float guide_weight, // to balance the amplitudes in the guiding image
662                                                                 // and the input// image
663                                       const float min, const float max)
664 {
665   // fall-back implementation: copy data from device memory to host memory and perform filter
666   // by CPU until there is a proper OpenCL implementation
667   float *guide_host = dt_alloc_align(64, sizeof(*guide_host) * width * height * ch);
668   float *in_host = dt_alloc_align(64, sizeof(*in_host) * width * height);
669   float *out_host = dt_alloc_align(64, sizeof(*out_host) * width * height);
670   int err;
671   err = dt_opencl_read_host_from_device(devid, guide_host, guide, width, height, ch * sizeof(float));
672   if(err != CL_SUCCESS) goto error;
673   err = dt_opencl_read_host_from_device(devid, in_host, in, width, height, sizeof(float));
674   if(err != CL_SUCCESS) goto error;
675   guided_filter(guide_host, in_host, out_host, width, height, ch, w, sqrt_eps, guide_weight, min, max);
676   err = dt_opencl_write_host_to_device(devid, out_host, out, width, height, sizeof(float));
677   if(err != CL_SUCCESS) goto error;
678 error:
679   dt_free_align(guide_host);
680   dt_free_align(in_host);
681   dt_free_align(out_host);
682 }
683 
684 
guided_filter_cl(int devid,cl_mem guide,cl_mem in,cl_mem out,const int width,const int height,const int ch,const int w,const float sqrt_eps,const float guide_weight,const float min,const float max)685 void guided_filter_cl(int devid, cl_mem guide, cl_mem in, cl_mem out, const int width, const int height,
686                       const int ch,
687                       const int w,              // window size
688                       const float sqrt_eps,     // regularization parameter
689                       const float guide_weight, // to balance the amplitudes in the guiding image and the input
690                                                 // image
691                       const float min, const float max)
692 {
693   assert(ch >= 3);
694   assert(w >= 1);
695 
696   const cl_ulong max_global_mem = dt_opencl_get_max_global_mem(devid);
697   const size_t reserved_memory = (size_t)(dt_conf_get_float("opencl_memory_headroom") * 1024 * 1024);
698   // estimate required memory for OpenCL code path with a safety factor of 5/4
699   const size_t required_memory
700       = darktable.opencl->dev[devid].memory_in_use + (size_t)width * height * sizeof(float) * 18 * 5 / 4;
701   int err = CL_MEM_OBJECT_ALLOCATION_FAILURE;
702   if(max_global_mem - reserved_memory > required_memory)
703     err = guided_filter_cl_impl(devid, guide, in, out, width, height, ch, w, sqrt_eps, guide_weight, min, max);
704   if(err != CL_SUCCESS)
705   {
706     dt_print(DT_DEBUG_OPENCL, "[guided filter] fall back to cpu implementation due to insufficient gpu memory\n");
707     guided_filter_cl_fallback(devid, guide, in, out, width, height, ch, w, sqrt_eps, guide_weight, min, max);
708   }
709 }
710 
711 #endif
712