1 /*
2 This file is part of darktable,
3 Copyright (C) 2017-2021 darktable developers.
4
5 darktable is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3 of the License, or
8 (at your option) any later version.
9
10 darktable is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with darktable. If not, see <http://www.gnu.org/licenses/>.
17
18
19 Implementation of the guided image filter as described in
20
21 "Guided Image Filtering" by Kaiming He, Jian Sun, and Xiaoou Tang in
22 K. Daniilidis, P. Maragos, N. Paragios (Eds.): ECCV 2010, Part I,
23 LNCS 6311, pp. 1-14, 2010. Springer-Verlag Berlin Heidelberg 2010
24
25 "Guided Image Filtering" by Kaiming He, Jian Sun, and Xiaoou Tang in
26 IEEE Transactions on Pattern Analysis and Machine Intelligence, vol. 35,
27 no. 6, June 2013, 1397-1409
28
29 */
30
31 #include "common/box_filters.h"
32 #include "common/guided_filter.h"
33 #include "common/math.h"
34 #include "common/opencl.h"
35 #include <assert.h>
36 #include <float.h>
37 #include <stdlib.h>
38 #include <string.h>
39
40 // processing is split into tiles of this size (or three times the filter
41 // width, if greater) to keep memory use under control.
42 #define GF_TILE_SIZE 512
43
44 // some shorthand to make code more legible
45 // if we have OpenMP simd enabled, declare a vectorizable for loop;
46 // otherwise, just leave it a plain for()
47 #if defined(_OPENMP) && defined(OPENMP_SIMD_)
48 #define SIMD_FOR \
49 _Pragma("omp simd") \
50 for
51 #else
52 #define SIMD_FOR for
53 #endif
54
55 // avoid cluttering the scalar codepath with #ifdefs by hiding the dependency on SSE2
56 #ifndef __SSE2__
57 # define _mm_prefetch(where,hint)
58 #endif
59
60 // the filter does internal tiling to keep memory requirements reasonable, so this structure
61 // defines the position of the tile being processed
62 typedef struct tile
63 {
64 int left, right, lower, upper;
65 } tile;
66
67 typedef struct color_image
68 {
69 float *data;
70 int width, height, stride;
71 } color_image;
72
73 // allocate space for n-component image of size width x height
new_color_image(int width,int height,int ch)74 static inline color_image new_color_image(int width, int height, int ch)
75 {
76 return (color_image){ dt_alloc_align_float((size_t)width * height * ch), width, height, ch };
77 }
78
79 // free space for n-component image
free_color_image(color_image * img_p)80 static inline void free_color_image(color_image *img_p)
81 {
82 dt_free_align(img_p->data);
83 img_p->data = NULL;
84 }
85
86 // get a pointer to pixel number 'i' within the image
get_color_pixel(color_image img,size_t i)87 static inline float *get_color_pixel(color_image img, size_t i)
88 {
89 return img.data + i * img.stride;
90 }
91
92
93 // apply guided filter to single-component image img using the 3-components image imgg as a guide
94 // the filtering applies a monochrome box filter to a total of 13 image channels:
95 // 1 monochrome input image
96 // 3 color guide image
97 // 3 covariance (R, G, B)
98 // 6 variance (R-R, R-G, R-B, G-G, G-B, B-B)
99 // for computational efficiency, we'll pack them into a four-channel image and a 9-channel image
100 // image instead of running 13 separate box filters: guide+input, R/G/B/R-R/R-G/R-B/G-G/G-B/B-B.
guided_filter_tiling(color_image imgg,gray_image img,gray_image img_out,tile target,const int w,const float eps,const float guide_weight,const float min,const float max)101 static void guided_filter_tiling(color_image imgg, gray_image img, gray_image img_out, tile target, const int w,
102 const float eps, const float guide_weight, const float min, const float max)
103 {
104 const tile source = { max_i(target.left - 2 * w, 0), min_i(target.right + 2 * w, imgg.width),
105 max_i(target.lower - 2 * w, 0), min_i(target.upper + 2 * w, imgg.height) };
106 const int width = source.right - source.left;
107 const int height = source.upper - source.lower;
108 size_t size = (size_t)width * (size_t)height;
109 // since we're packing multiple monochrome planes into a color image, define symbolic constants so that
110 // we can keep track of which values we're actually using
111 #define INP_MEAN 0
112 #define GUIDE_MEAN_R 1
113 #define GUIDE_MEAN_G 2
114 #define GUIDE_MEAN_B 3
115 #define COV_R 0
116 #define COV_G 1
117 #define COV_B 2
118 #define VAR_RR 3
119 #define VAR_RG 4
120 #define VAR_RB 5
121 #define VAR_GG 6
122 #define VAR_BB 8
123 #define VAR_GB 7
124 color_image mean = new_color_image(width, height, 4);
125 color_image variance = new_color_image(width, height, 9);
126 const size_t img_dimen = mean.width;
127 size_t img_bak_sz;
128 float *img_bak = dt_alloc_perthread_float(9*img_dimen, &img_bak_sz);
129 #ifdef _OPENMP
130 #pragma omp parallel for schedule(static) default(none) shared(img, imgg, mean, variance, img_bak) \
131 dt_omp_firstprivate(img_bak_sz, img_dimen, w, guide_weight) dt_omp_sharedconst(source)
132 #endif
133 for(int j_imgg = source.lower; j_imgg < source.upper; j_imgg++)
134 {
135 int j = j_imgg - source.lower;
136 float *const restrict meanpx = mean.data + 4 * j * mean.width;
137 float *const restrict varpx = variance.data + 9 * j * variance.width;
138 for(int i_imgg = source.left; i_imgg < source.right; i_imgg++)
139 {
140 size_t i = i_imgg - source.left;
141 const float *pixel_ = get_color_pixel(imgg, i_imgg + (size_t)j_imgg * imgg.width);
142 float DT_ALIGNED_PIXEL pixel[4] =
143 { pixel_[0] * guide_weight, pixel_[1] * guide_weight, pixel_[2] * guide_weight, pixel_[3] * guide_weight };
144 const float input = img.data[i_imgg + (size_t)j_imgg * img.width];
145 meanpx[4*i+INP_MEAN] = input;
146 meanpx[4*i+GUIDE_MEAN_R] = pixel[0];
147 meanpx[4*i+GUIDE_MEAN_G] = pixel[1];
148 meanpx[4*i+GUIDE_MEAN_B] = pixel[2];
149 varpx[9*i+COV_R] = pixel[0] * input;
150 varpx[9*i+COV_G] = pixel[1] * input;
151 varpx[9*i+COV_B] = pixel[2] * input;
152 varpx[9*i+VAR_RR] = pixel[0] * pixel[0];
153 varpx[9*i+VAR_RG] = pixel[0] * pixel[1];
154 varpx[9*i+VAR_RB] = pixel[0] * pixel[2];
155 varpx[9*i+VAR_GG] = pixel[1] * pixel[1];
156 varpx[9*i+VAR_GB] = pixel[1] * pixel[2];
157 varpx[9*i+VAR_BB] = pixel[2] * pixel[2];
158 }
159 // apply horizontal pass of box mean filter while the cache is still hot
160 float *const restrict scratch = dt_get_perthread(img_bak, img_bak_sz);
161 dt_box_mean_horizontal(meanpx, mean.width, 4|BOXFILTER_KAHAN_SUM, w, scratch);
162 dt_box_mean_horizontal(varpx, variance.width, 9|BOXFILTER_KAHAN_SUM, w, scratch);
163 }
164 dt_free_align(img_bak);
165 dt_box_mean_vertical(mean.data, mean.height, mean.width, 4|BOXFILTER_KAHAN_SUM, w);
166 dt_box_mean_vertical(variance.data, variance.height, variance.width, 9|BOXFILTER_KAHAN_SUM, w);
167 // we will recycle memory of 'mean' for the new coefficient arrays a_? and b to reduce memory foot print
168 color_image a_b = mean;
169 #define A_RED 0
170 #define A_GREEN 1
171 #define A_BLUE 2
172 #define B 3
173 #ifdef _OPENMP
174 #pragma omp parallel for schedule(static) default(none) \
175 dt_omp_firstprivate(size, eps) shared(mean, variance, a_b)
176 #endif
177 for(size_t i = 0; i < size; i++)
178 {
179 const float *meanpx = get_color_pixel(mean, i);
180 const float inp_mean = meanpx[INP_MEAN];
181 const float guide_r = meanpx[GUIDE_MEAN_R];
182 const float guide_g = meanpx[GUIDE_MEAN_G];
183 const float guide_b = meanpx[GUIDE_MEAN_B];
184 float *const varpx = get_color_pixel(variance, i);
185 // solve linear system of equations of size 3x3 via Cramer's rule
186 // symmetric coefficient matrix
187 const float Sigma_0_0 = varpx[VAR_RR] - (guide_r * guide_r) + eps;
188 const float Sigma_0_1 = varpx[VAR_RG] - (guide_r * guide_g);
189 const float Sigma_0_2 = varpx[VAR_RB] - (guide_r * guide_b);
190 const float Sigma_1_1 = varpx[VAR_GG] - (guide_g * guide_g) + eps;;
191 const float Sigma_1_2 = varpx[VAR_GB] - (guide_g * guide_b);
192 const float Sigma_2_2 = varpx[VAR_BB] - (guide_b * guide_b) + eps;
193 const float det0 = Sigma_0_0 * (Sigma_1_1 * Sigma_2_2 - Sigma_1_2 * Sigma_1_2)
194 - Sigma_0_1 * (Sigma_0_1 * Sigma_2_2 - Sigma_0_2 * Sigma_1_2)
195 + Sigma_0_2 * (Sigma_0_1 * Sigma_1_2 - Sigma_0_2 * Sigma_1_1);
196 float a_r_, a_g_, a_b_, b_;
197 if(fabsf(det0) > 4.f * FLT_EPSILON)
198 {
199 const float cov_r = varpx[COV_R] - guide_r * inp_mean;
200 const float cov_g = varpx[COV_G] - guide_g * inp_mean;
201 const float cov_b = varpx[COV_B] - guide_b * inp_mean;
202 const float det1 = cov_r * (Sigma_1_1 * Sigma_2_2 - Sigma_1_2 * Sigma_1_2)
203 - Sigma_0_1 * (cov_g * Sigma_2_2 - cov_b * Sigma_1_2)
204 + Sigma_0_2 * (cov_g * Sigma_1_2 - cov_b * Sigma_1_1);
205 const float det2 = Sigma_0_0 * (cov_g * Sigma_2_2 - cov_b * Sigma_1_2)
206 - cov_r * (Sigma_0_1 * Sigma_2_2 - Sigma_0_2 * Sigma_1_2)
207 + Sigma_0_2 * (Sigma_0_1 * cov_b - Sigma_0_2 * cov_g);
208 const float det3 = Sigma_0_0 * (Sigma_1_1 * cov_b - Sigma_1_2 * cov_g)
209 - Sigma_0_1 * (Sigma_0_1 * cov_b - Sigma_0_2 * cov_g)
210 + cov_r * (Sigma_0_1 * Sigma_1_2 - Sigma_0_2 * Sigma_1_1);
211 a_r_ = det1 / det0;
212 a_g_ = det2 / det0;
213 a_b_ = det3 / det0;
214 b_ = inp_mean - a_r_ * guide_r - a_g_ * guide_g - a_b_ * guide_b;
215 }
216 else
217 {
218 // linear system is singular
219 a_r_ = 0.f;
220 a_g_ = 0.f;
221 a_b_ = 0.f;
222 b_ = get_color_pixel(mean, i)[INP_MEAN];
223 }
224 // now data of imgg_mean_? is no longer needed, we can safely overwrite aliasing arrays
225 a_b.data[4*i+A_RED] = a_r_;
226 a_b.data[4*i+A_GREEN] = a_g_;
227 a_b.data[4*i+A_BLUE] = a_b_;
228 a_b.data[4*i+B] = b_;
229 }
230 free_color_image(&variance);
231
232 dt_box_mean(a_b.data, a_b.height, a_b.width, a_b.stride|BOXFILTER_KAHAN_SUM, w, 1);
233
234 #ifdef _OPENMP
235 #pragma omp parallel for schedule(static) default(none) \
236 shared(target, imgg, a_b, img_out) dt_omp_sharedconst(source) dt_omp_firstprivate(min, max, width, guide_weight)
237 #endif
238 for(int j_imgg = target.lower; j_imgg < target.upper; j_imgg++)
239 {
240 // index of the left most target pixel in the current row
241 size_t l = target.left + (size_t)j_imgg * imgg.width;
242 // index of the left most source pixel in the current row of the
243 // smaller auxiliary gray-scale images a_r, a_g, a_b, and b
244 // excluding boundary data from neighboring tiles
245 size_t k = (target.left - source.left) + (size_t)(j_imgg - source.lower) * width;
246 for(int i_imgg = target.left; i_imgg < target.right; i_imgg++, k++, l++)
247 {
248 const float *pixel = get_color_pixel(imgg, l);
249 const float *px_ab = get_color_pixel(a_b, k);
250 float res = guide_weight * (px_ab[A_RED] * pixel[0] + px_ab[A_GREEN] * pixel[1] + px_ab[A_BLUE] * pixel[2]);
251 res += px_ab[B];
252 img_out.data[i_imgg + (size_t)j_imgg * imgg.width] = CLAMP(res, min, max);
253 }
254 }
255 free_color_image(&mean);
256 }
257
compute_tile_height(const int height,const int w)258 static int compute_tile_height(const int height, const int w)
259 {
260 int tile_h = max_i(3 * w, GF_TILE_SIZE);
261 #if 0 // enabling the below doesn't make any measureable speed difference, but does cause a handful of pixels
262 // to round off differently (as does changing GF_TILE_SIZE)
263 if ((height % tile_h) > 0 && (height % tile_h) < GF_TILE_SIZE/3)
264 {
265 // if there's just a sliver left over for the last row of tiles, see whether slicing off a few pixels
266 // gives us a mostly-full tile
267 if (height % (tile_h - 8) >= GF_TILE_SIZE/3)
268 tile_h -= 8;
269 else if (height % (tile_h - w/4) >= GF_TILE_SIZE/3)
270 tile_h -= (w/4);
271 else if (height % (tile_h - w/2) >= GF_TILE_SIZE/3)
272 tile_h -= (w/2);
273 // try adding a few pixels
274 else if (height % (tile_h + 8) >= GF_TILE_SIZE/3)
275 tile_h += 8;
276 else if (height % (tile_h + 16) >= GF_TILE_SIZE/3)
277 tile_h += 16;
278 }
279 #endif
280 return tile_h;
281 }
282
compute_tile_width(const int width,const int w)283 static int compute_tile_width(const int width, const int w)
284 {
285 int tile_w = max_i(3 * w, GF_TILE_SIZE);
286 #if 0 // enabling the below doesn't make any measureable speed difference, but does cause a handful of pixels
287 // to round off differently (as does changing GF_TILE_SIZE)
288 if ((width % tile_w) > 0 && (width % tile_w) < GF_TILE_SIZE/2)
289 {
290 // if there's just a sliver left over for the last column of tiles, see whether slicing off a few pixels
291 // gives us a mostly-full tile
292 if (width % (tile_w - 8) >= GF_TILE_SIZE/3)
293 tile_w -= 8;
294 else if (width % (tile_w - w/4) >= GF_TILE_SIZE/3)
295 tile_w -= (w/4);
296 else if (width % (tile_w - w/2) >= GF_TILE_SIZE/3)
297 tile_w -= (w/2);
298 // try adding a few pixels
299 else if (width % (tile_w + 8) >= GF_TILE_SIZE/3)
300 tile_w += 8;
301 else if (width % (tile_w + 16) >= GF_TILE_SIZE/3)
302 tile_w += 16;
303 }
304 #endif
305 return tile_w;
306 }
307
guided_filter(const float * const guide,const float * const in,float * const out,const int width,const int height,const int ch,const int w,const float sqrt_eps,const float guide_weight,const float min,const float max)308 void guided_filter(const float *const guide, const float *const in, float *const out, const int width,
309 const int height, const int ch,
310 const int w, // window size
311 const float sqrt_eps, // regularization parameter
312 const float guide_weight, // to balance the amplitudes in the guiding image and the input image
313 const float min, const float max)
314 {
315 assert(ch >= 3);
316 assert(w >= 1);
317
318 color_image img_guide = (color_image){ (float *)guide, width, height, ch };
319 gray_image img_in = (gray_image){ (float *)in, width, height };
320 gray_image img_out = (gray_image){ out, width, height };
321 const int tile_width = compute_tile_width(width,w);
322 const int tile_height = compute_tile_height(height,w);
323 const float eps = sqrt_eps * sqrt_eps; // this is the regularization parameter of the original papers
324
325 for(int j = 0; j < height; j += tile_height)
326 {
327 for(int i = 0; i < width; i += tile_width)
328 {
329 tile target = { i, min_i(i + tile_width, width), j, min_i(j + tile_height, height) };
330 guided_filter_tiling(img_guide, img_in, img_out, target, w, eps, guide_weight, min, max);
331 }
332 }
333 }
334
335 #ifdef HAVE_OPENCL
336
dt_guided_filter_init_cl_global()337 dt_guided_filter_cl_global_t *dt_guided_filter_init_cl_global()
338 {
339 dt_guided_filter_cl_global_t *g = malloc(sizeof(*g));
340 const int program = 26; // guided_filter.cl, from programs.conf
341 g->kernel_guided_filter_split_rgb = dt_opencl_create_kernel(program, "guided_filter_split_rgb_image");
342 g->kernel_guided_filter_box_mean_x = dt_opencl_create_kernel(program, "guided_filter_box_mean_x");
343 g->kernel_guided_filter_box_mean_y = dt_opencl_create_kernel(program, "guided_filter_box_mean_y");
344 g->kernel_guided_filter_guided_filter_covariances
345 = dt_opencl_create_kernel(program, "guided_filter_covariances");
346 g->kernel_guided_filter_guided_filter_variances = dt_opencl_create_kernel(program, "guided_filter_variances");
347 g->kernel_guided_filter_update_covariance = dt_opencl_create_kernel(program, "guided_filter_update_covariance");
348 g->kernel_guided_filter_solve = dt_opencl_create_kernel(program, "guided_filter_solve");
349 g->kernel_guided_filter_generate_result = dt_opencl_create_kernel(program, "guided_filter_generate_result");
350 return g;
351 }
352
353
dt_guided_filter_free_cl_global(dt_guided_filter_cl_global_t * g)354 void dt_guided_filter_free_cl_global(dt_guided_filter_cl_global_t *g)
355 {
356 if(!g) return;
357 // destroy kernels
358 dt_opencl_free_kernel(g->kernel_guided_filter_split_rgb);
359 dt_opencl_free_kernel(g->kernel_guided_filter_box_mean_x);
360 dt_opencl_free_kernel(g->kernel_guided_filter_box_mean_y);
361 dt_opencl_free_kernel(g->kernel_guided_filter_guided_filter_covariances);
362 dt_opencl_free_kernel(g->kernel_guided_filter_guided_filter_variances);
363 dt_opencl_free_kernel(g->kernel_guided_filter_update_covariance);
364 dt_opencl_free_kernel(g->kernel_guided_filter_solve);
365 dt_opencl_free_kernel(g->kernel_guided_filter_generate_result);
366 free(g);
367 }
368
369
cl_split_rgb(const int devid,const int width,const int height,cl_mem guide,cl_mem imgg_r,cl_mem imgg_g,cl_mem imgg_b,const float guide_weight)370 static int cl_split_rgb(const int devid, const int width, const int height, cl_mem guide, cl_mem imgg_r,
371 cl_mem imgg_g, cl_mem imgg_b, const float guide_weight)
372 {
373 const int kernel = darktable.opencl->guided_filter->kernel_guided_filter_split_rgb;
374 dt_opencl_set_kernel_arg(devid, kernel, 0, sizeof(width), &width);
375 dt_opencl_set_kernel_arg(devid, kernel, 1, sizeof(height), &height);
376 dt_opencl_set_kernel_arg(devid, kernel, 2, sizeof(guide), &guide);
377 dt_opencl_set_kernel_arg(devid, kernel, 3, sizeof(imgg_r), &imgg_r);
378 dt_opencl_set_kernel_arg(devid, kernel, 4, sizeof(imgg_g), &imgg_g);
379 dt_opencl_set_kernel_arg(devid, kernel, 5, sizeof(imgg_b), &imgg_b);
380 dt_opencl_set_kernel_arg(devid, kernel, 6, sizeof(guide_weight), &guide_weight);
381 const size_t sizes[] = { ROUNDUPWD(width), ROUNDUPWD(height) };
382 return dt_opencl_enqueue_kernel_2d(devid, kernel, sizes);
383 }
384
385
cl_box_mean(const int devid,const int width,const int height,const int w,cl_mem in,cl_mem out,cl_mem temp)386 static int cl_box_mean(const int devid, const int width, const int height, const int w, cl_mem in, cl_mem out,
387 cl_mem temp)
388 {
389 const int kernel_x = darktable.opencl->guided_filter->kernel_guided_filter_box_mean_x;
390 dt_opencl_set_kernel_arg(devid, kernel_x, 0, sizeof(width), &width);
391 dt_opencl_set_kernel_arg(devid, kernel_x, 1, sizeof(height), &height);
392 dt_opencl_set_kernel_arg(devid, kernel_x, 2, sizeof(in), &in);
393 dt_opencl_set_kernel_arg(devid, kernel_x, 3, sizeof(temp), &temp);
394 dt_opencl_set_kernel_arg(devid, kernel_x, 4, sizeof(w), &w);
395 const size_t sizes_x[] = { 1, ROUNDUPWD(height) };
396 const int err = dt_opencl_enqueue_kernel_2d(devid, kernel_x, sizes_x);
397 if(err != CL_SUCCESS) return err;
398
399 const int kernel_y = darktable.opencl->guided_filter->kernel_guided_filter_box_mean_y;
400 dt_opencl_set_kernel_arg(devid, kernel_y, 0, sizeof(width), &width);
401 dt_opencl_set_kernel_arg(devid, kernel_y, 1, sizeof(height), &height);
402 dt_opencl_set_kernel_arg(devid, kernel_y, 2, sizeof(temp), &temp);
403 dt_opencl_set_kernel_arg(devid, kernel_y, 3, sizeof(out), &out);
404 dt_opencl_set_kernel_arg(devid, kernel_y, 4, sizeof(w), &w);
405 const size_t sizes_y[] = { ROUNDUPWD(width), 1 };
406 return dt_opencl_enqueue_kernel_2d(devid, kernel_y, sizes_y);
407 }
408
409
cl_covariances(const int devid,const int width,const int height,cl_mem guide,cl_mem in,cl_mem cov_imgg_img_r,cl_mem cov_imgg_img_g,cl_mem cov_imgg_img_b,const float guide_weight)410 static int cl_covariances(const int devid, const int width, const int height, cl_mem guide, cl_mem in,
411 cl_mem cov_imgg_img_r, cl_mem cov_imgg_img_g, cl_mem cov_imgg_img_b,
412 const float guide_weight)
413 {
414 const int kernel = darktable.opencl->guided_filter->kernel_guided_filter_guided_filter_covariances;
415 dt_opencl_set_kernel_arg(devid, kernel, 0, sizeof(width), &width);
416 dt_opencl_set_kernel_arg(devid, kernel, 1, sizeof(height), &height);
417 dt_opencl_set_kernel_arg(devid, kernel, 2, sizeof(guide), &guide);
418 dt_opencl_set_kernel_arg(devid, kernel, 3, sizeof(in), &in);
419 dt_opencl_set_kernel_arg(devid, kernel, 4, sizeof(cov_imgg_img_r), &cov_imgg_img_r);
420 dt_opencl_set_kernel_arg(devid, kernel, 5, sizeof(cov_imgg_img_g), &cov_imgg_img_g);
421 dt_opencl_set_kernel_arg(devid, kernel, 6, sizeof(cov_imgg_img_b), &cov_imgg_img_b);
422 dt_opencl_set_kernel_arg(devid, kernel, 7, sizeof(guide_weight), &guide_weight);
423 const size_t sizes[] = { ROUNDUPWD(width), ROUNDUPWD(height) };
424 return dt_opencl_enqueue_kernel_2d(devid, kernel, sizes);
425 }
426
427
cl_variances(const int devid,const int width,const int height,cl_mem guide,cl_mem var_imgg_rr,cl_mem var_imgg_rg,cl_mem var_imgg_rb,cl_mem var_imgg_gg,cl_mem var_imgg_gb,cl_mem var_imgg_bb,const float guide_weight)428 static int cl_variances(const int devid, const int width, const int height, cl_mem guide, cl_mem var_imgg_rr,
429 cl_mem var_imgg_rg, cl_mem var_imgg_rb, cl_mem var_imgg_gg, cl_mem var_imgg_gb,
430 cl_mem var_imgg_bb, const float guide_weight)
431 {
432 const int kernel = darktable.opencl->guided_filter->kernel_guided_filter_guided_filter_variances;
433 dt_opencl_set_kernel_arg(devid, kernel, 0, sizeof(width), &width);
434 dt_opencl_set_kernel_arg(devid, kernel, 1, sizeof(height), &height);
435 dt_opencl_set_kernel_arg(devid, kernel, 2, sizeof(guide), &guide);
436 dt_opencl_set_kernel_arg(devid, kernel, 3, sizeof(var_imgg_rr), &var_imgg_rr);
437 dt_opencl_set_kernel_arg(devid, kernel, 4, sizeof(var_imgg_rg), &var_imgg_rg);
438 dt_opencl_set_kernel_arg(devid, kernel, 5, sizeof(var_imgg_rb), &var_imgg_rb);
439 dt_opencl_set_kernel_arg(devid, kernel, 6, sizeof(var_imgg_gg), &var_imgg_gg);
440 dt_opencl_set_kernel_arg(devid, kernel, 7, sizeof(var_imgg_gb), &var_imgg_gb);
441 dt_opencl_set_kernel_arg(devid, kernel, 8, sizeof(var_imgg_bb), &var_imgg_bb);
442 dt_opencl_set_kernel_arg(devid, kernel, 9, sizeof(guide_weight), &guide_weight);
443 size_t sizes[] = { ROUNDUPWD(width), ROUNDUPWD(height) };
444 return dt_opencl_enqueue_kernel_2d(devid, kernel, sizes);
445 }
446
447
cl_update_covariance(const int devid,const int width,const int height,cl_mem in,cl_mem out,cl_mem a,cl_mem b,float eps)448 static int cl_update_covariance(const int devid, const int width, const int height, cl_mem in, cl_mem out,
449 cl_mem a, cl_mem b, float eps)
450 {
451 const int kernel = darktable.opencl->guided_filter->kernel_guided_filter_update_covariance;
452 dt_opencl_set_kernel_arg(devid, kernel, 0, sizeof(width), &width);
453 dt_opencl_set_kernel_arg(devid, kernel, 1, sizeof(height), &height);
454 dt_opencl_set_kernel_arg(devid, kernel, 2, sizeof(in), &in);
455 dt_opencl_set_kernel_arg(devid, kernel, 3, sizeof(out), &out);
456 dt_opencl_set_kernel_arg(devid, kernel, 4, sizeof(a), &a);
457 dt_opencl_set_kernel_arg(devid, kernel, 5, sizeof(b), &b);
458 dt_opencl_set_kernel_arg(devid, kernel, 6, sizeof(eps), &eps);
459 const size_t sizes[] = { ROUNDUPWD(width), ROUNDUPWD(height) };
460 return dt_opencl_enqueue_kernel_2d(devid, kernel, sizes);
461 }
462
463
cl_solve(const int devid,const int width,const int height,cl_mem img_mean,cl_mem imgg_mean_r,cl_mem imgg_mean_g,cl_mem imgg_mean_b,cl_mem cov_imgg_img_r,cl_mem cov_imgg_img_g,cl_mem cov_imgg_img_b,cl_mem var_imgg_rr,cl_mem var_imgg_rg,cl_mem var_imgg_rb,cl_mem var_imgg_gg,cl_mem var_imgg_gb,cl_mem var_imgg_bb,cl_mem a_r,cl_mem a_g,cl_mem a_b,cl_mem b)464 static int cl_solve(const int devid, const int width, const int height, cl_mem img_mean, cl_mem imgg_mean_r,
465 cl_mem imgg_mean_g, cl_mem imgg_mean_b, cl_mem cov_imgg_img_r, cl_mem cov_imgg_img_g,
466 cl_mem cov_imgg_img_b, cl_mem var_imgg_rr, cl_mem var_imgg_rg, cl_mem var_imgg_rb,
467 cl_mem var_imgg_gg, cl_mem var_imgg_gb, cl_mem var_imgg_bb, cl_mem a_r, cl_mem a_g, cl_mem a_b,
468 cl_mem b)
469 {
470 const int kernel = darktable.opencl->guided_filter->kernel_guided_filter_solve;
471 dt_opencl_set_kernel_arg(devid, kernel, 0, sizeof(width), &width);
472 dt_opencl_set_kernel_arg(devid, kernel, 1, sizeof(height), &height);
473 dt_opencl_set_kernel_arg(devid, kernel, 2, sizeof(img_mean), &img_mean);
474 dt_opencl_set_kernel_arg(devid, kernel, 3, sizeof(imgg_mean_r), &imgg_mean_r);
475 dt_opencl_set_kernel_arg(devid, kernel, 4, sizeof(imgg_mean_g), &imgg_mean_g);
476 dt_opencl_set_kernel_arg(devid, kernel, 5, sizeof(imgg_mean_b), &imgg_mean_b);
477 dt_opencl_set_kernel_arg(devid, kernel, 6, sizeof(cov_imgg_img_r), &cov_imgg_img_r);
478 dt_opencl_set_kernel_arg(devid, kernel, 7, sizeof(cov_imgg_img_g), &cov_imgg_img_g);
479 dt_opencl_set_kernel_arg(devid, kernel, 8, sizeof(cov_imgg_img_b), &cov_imgg_img_b);
480 dt_opencl_set_kernel_arg(devid, kernel, 9, sizeof(var_imgg_rr), &var_imgg_rr);
481 dt_opencl_set_kernel_arg(devid, kernel, 10, sizeof(var_imgg_rg), &var_imgg_rg);
482 dt_opencl_set_kernel_arg(devid, kernel, 11, sizeof(var_imgg_rb), &var_imgg_rb);
483 dt_opencl_set_kernel_arg(devid, kernel, 12, sizeof(var_imgg_gg), &var_imgg_gg);
484 dt_opencl_set_kernel_arg(devid, kernel, 13, sizeof(var_imgg_gb), &var_imgg_gb);
485 dt_opencl_set_kernel_arg(devid, kernel, 14, sizeof(var_imgg_bb), &var_imgg_bb);
486 dt_opencl_set_kernel_arg(devid, kernel, 15, sizeof(a_r), &a_r);
487 dt_opencl_set_kernel_arg(devid, kernel, 16, sizeof(a_g), &a_g);
488 dt_opencl_set_kernel_arg(devid, kernel, 17, sizeof(a_b), &a_b);
489 dt_opencl_set_kernel_arg(devid, kernel, 18, sizeof(b), &b);
490 const size_t sizes[] = { ROUNDUPWD(width), ROUNDUPWD(height) };
491 return dt_opencl_enqueue_kernel_2d(devid, kernel, sizes);
492 }
493
494
cl_generate_result(const int devid,const int width,const int height,cl_mem guide,cl_mem a_r,cl_mem a_g,cl_mem a_b,cl_mem b,cl_mem out,const float guide_weight,const float min,const float max)495 static int cl_generate_result(const int devid, const int width, const int height, cl_mem guide, cl_mem a_r,
496 cl_mem a_g, cl_mem a_b, cl_mem b, cl_mem out, const float guide_weight,
497 const float min, const float max)
498 {
499 const int kernel = darktable.opencl->guided_filter->kernel_guided_filter_generate_result;
500 dt_opencl_set_kernel_arg(devid, kernel, 0, sizeof(width), &width);
501 dt_opencl_set_kernel_arg(devid, kernel, 1, sizeof(height), &height);
502 dt_opencl_set_kernel_arg(devid, kernel, 2, sizeof(guide), &guide);
503 dt_opencl_set_kernel_arg(devid, kernel, 3, sizeof(a_r), &a_r);
504 dt_opencl_set_kernel_arg(devid, kernel, 4, sizeof(a_g), &a_g);
505 dt_opencl_set_kernel_arg(devid, kernel, 5, sizeof(a_b), &a_b);
506 dt_opencl_set_kernel_arg(devid, kernel, 6, sizeof(b), &b);
507 dt_opencl_set_kernel_arg(devid, kernel, 7, sizeof(out), &out);
508 dt_opencl_set_kernel_arg(devid, kernel, 8, sizeof(guide_weight), &guide_weight);
509 dt_opencl_set_kernel_arg(devid, kernel, 9, sizeof(min), &min);
510 dt_opencl_set_kernel_arg(devid, kernel, 10, sizeof(max), &max);
511 const size_t sizes[] = { ROUNDUPWD(width), ROUNDUPWD(height) };
512 return dt_opencl_enqueue_kernel_2d(devid, kernel, sizes);
513 }
514
515
guided_filter_cl_impl(int devid,cl_mem guide,cl_mem in,cl_mem out,const int width,const int height,const int ch,const int w,const float sqrt_eps,const float guide_weight,const float min,const float max)516 static int guided_filter_cl_impl(int devid, cl_mem guide, cl_mem in, cl_mem out, const int width, const int height,
517 const int ch,
518 const int w, // window size
519 const float sqrt_eps, // regularization parameter
520 const float guide_weight, // to balance the amplitudes in the guiding image and
521 // the input// image
522 const float min, const float max)
523 {
524 const float eps = sqrt_eps * sqrt_eps; // this is the regularization parameter of the original papers
525
526 void *temp1 = dt_opencl_alloc_device(devid, width, height, (int)sizeof(float));
527 void *temp2 = dt_opencl_alloc_device(devid, width, height, (int)sizeof(float));
528 void *imgg_mean_r = dt_opencl_alloc_device(devid, width, height, (int)sizeof(float));
529 void *imgg_mean_g = dt_opencl_alloc_device(devid, width, height, (int)sizeof(float));
530 void *imgg_mean_b = dt_opencl_alloc_device(devid, width, height, (int)sizeof(float));
531 void *img_mean = dt_opencl_alloc_device(devid, width, height, (int)sizeof(float));
532 void *cov_imgg_img_r = dt_opencl_alloc_device(devid, width, height, (int)sizeof(float));
533 void *cov_imgg_img_g = dt_opencl_alloc_device(devid, width, height, (int)sizeof(float));
534 void *cov_imgg_img_b = dt_opencl_alloc_device(devid, width, height, (int)sizeof(float));
535 void *var_imgg_rr = dt_opencl_alloc_device(devid, width, height, (int)sizeof(float));
536 void *var_imgg_gg = dt_opencl_alloc_device(devid, width, height, (int)sizeof(float));
537 void *var_imgg_bb = dt_opencl_alloc_device(devid, width, height, (int)sizeof(float));
538 void *var_imgg_rg = dt_opencl_alloc_device(devid, width, height, (int)sizeof(float));
539 void *var_imgg_rb = dt_opencl_alloc_device(devid, width, height, (int)sizeof(float));
540 void *var_imgg_gb = dt_opencl_alloc_device(devid, width, height, (int)sizeof(float));
541 void *a_r = dt_opencl_alloc_device(devid, width, height, (int)sizeof(float));
542 void *a_g = dt_opencl_alloc_device(devid, width, height, (int)sizeof(float));
543 void *a_b = dt_opencl_alloc_device(devid, width, height, (int)sizeof(float));
544 void *b = temp2;
545
546 int err = CL_SUCCESS;
547 if(temp1 == NULL || temp2 == NULL || //
548 imgg_mean_r == NULL || imgg_mean_g == NULL || imgg_mean_b == NULL || img_mean == NULL || //
549 cov_imgg_img_r == NULL || cov_imgg_img_g == NULL || cov_imgg_img_b == NULL || //
550 var_imgg_rr == NULL || var_imgg_gg == NULL || var_imgg_bb == NULL || //
551 var_imgg_rg == NULL || var_imgg_rb == NULL || var_imgg_gb == NULL || //
552 a_r == NULL || a_g == NULL || a_b == NULL)
553 {
554 err = CL_MEM_OBJECT_ALLOCATION_FAILURE;
555 goto error;
556 }
557
558 err = cl_split_rgb(devid, width, height, guide, imgg_mean_r, imgg_mean_g, imgg_mean_b, guide_weight);
559 if(err != CL_SUCCESS) goto error;
560
561 err = cl_box_mean(devid, width, height, w, in, img_mean, temp1);
562 if(err != CL_SUCCESS) goto error;
563 err = cl_box_mean(devid, width, height, w, imgg_mean_r, imgg_mean_r, temp1);
564 if(err != CL_SUCCESS) goto error;
565 err = cl_box_mean(devid, width, height, w, imgg_mean_g, imgg_mean_g, temp1);
566 if(err != CL_SUCCESS) goto error;
567 err = cl_box_mean(devid, width, height, w, imgg_mean_b, imgg_mean_b, temp1);
568 if(err != CL_SUCCESS) goto error;
569
570 err = cl_covariances(devid, width, height, guide, in, cov_imgg_img_r, cov_imgg_img_g, cov_imgg_img_b,
571 guide_weight);
572 if(err != CL_SUCCESS) goto error;
573
574 err = cl_variances(devid, width, height, guide, var_imgg_rr, var_imgg_rg, var_imgg_rb, var_imgg_gg, var_imgg_gb,
575 var_imgg_bb, guide_weight);
576 if(err != CL_SUCCESS) goto error;
577
578 err = cl_box_mean(devid, width, height, w, cov_imgg_img_r, temp2, temp1);
579 if(err != CL_SUCCESS) goto error;
580 err = cl_update_covariance(devid, width, height, temp2, cov_imgg_img_r, imgg_mean_r, img_mean, 0.f);
581 if(err != CL_SUCCESS) goto error;
582 err = cl_box_mean(devid, width, height, w, cov_imgg_img_g, temp2, temp1);
583 if(err != CL_SUCCESS) goto error;
584 err = cl_update_covariance(devid, width, height, temp2, cov_imgg_img_g, imgg_mean_g, img_mean, 0.f);
585 if(err != CL_SUCCESS) goto error;
586 err = cl_box_mean(devid, width, height, w, cov_imgg_img_b, temp2, temp1);
587 if(err != CL_SUCCESS) goto error;
588 err = cl_update_covariance(devid, width, height, temp2, cov_imgg_img_b, imgg_mean_b, img_mean, 0.f);
589 if(err != CL_SUCCESS) goto error;
590 err = cl_box_mean(devid, width, height, w, var_imgg_rr, temp2, temp1);
591 if(err != CL_SUCCESS) goto error;
592 err = cl_update_covariance(devid, width, height, temp2, var_imgg_rr, imgg_mean_r, imgg_mean_r, eps);
593 if(err != CL_SUCCESS) goto error;
594 err = cl_box_mean(devid, width, height, w, var_imgg_rg, temp2, temp1);
595 if(err != CL_SUCCESS) goto error;
596 err = cl_update_covariance(devid, width, height, temp2, var_imgg_rg, imgg_mean_r, imgg_mean_g, 0.f);
597 if(err != CL_SUCCESS) goto error;
598 err = cl_box_mean(devid, width, height, w, var_imgg_rb, temp2, temp1);
599 if(err != CL_SUCCESS) goto error;
600 err = cl_update_covariance(devid, width, height, temp2, var_imgg_rb, imgg_mean_r, imgg_mean_b, 0.f);
601 if(err != CL_SUCCESS) goto error;
602 err = cl_box_mean(devid, width, height, w, var_imgg_gg, temp2, temp1);
603 if(err != CL_SUCCESS) goto error;
604 err = cl_update_covariance(devid, width, height, temp2, var_imgg_gg, imgg_mean_g, imgg_mean_g, eps);
605 if(err != CL_SUCCESS) goto error;
606 err = cl_box_mean(devid, width, height, w, var_imgg_gb, temp2, temp1);
607 if(err != CL_SUCCESS) goto error;
608 err = cl_update_covariance(devid, width, height, temp2, var_imgg_gb, imgg_mean_g, imgg_mean_b, 0.f);
609 if(err != CL_SUCCESS) goto error;
610 err = cl_box_mean(devid, width, height, w, var_imgg_bb, temp2, temp1);
611 if(err != CL_SUCCESS) goto error;
612 err = cl_update_covariance(devid, width, height, temp2, var_imgg_bb, imgg_mean_b, imgg_mean_b, eps);
613 if(err != CL_SUCCESS) goto error;
614
615 err = cl_solve(devid, width, height, img_mean, imgg_mean_r, imgg_mean_g, imgg_mean_b, cov_imgg_img_r,
616 cov_imgg_img_g, cov_imgg_img_b, var_imgg_rr, var_imgg_rg, var_imgg_rb, var_imgg_gg, var_imgg_gb,
617 var_imgg_bb, a_r, a_g, a_b, b);
618 if(err != CL_SUCCESS) goto error;
619
620 err = cl_box_mean(devid, width, height, w, a_r, a_r, temp1);
621 if(err != CL_SUCCESS) goto error;
622 err = cl_box_mean(devid, width, height, w, a_g, a_g, temp1);
623 if(err != CL_SUCCESS) goto error;
624 err = cl_box_mean(devid, width, height, w, a_b, a_b, temp1);
625 if(err != CL_SUCCESS) goto error;
626 err = cl_box_mean(devid, width, height, w, b, b, temp1);
627 if(err != CL_SUCCESS) goto error;
628
629 err = cl_generate_result(devid, width, height, guide, a_r, a_g, a_b, b, out, guide_weight, min, max);
630
631 error:
632 if(err != CL_SUCCESS) dt_print(DT_DEBUG_OPENCL, "[guided filter] unknown error: %d\n", err);
633
634 dt_opencl_release_mem_object(a_r);
635 dt_opencl_release_mem_object(a_g);
636 dt_opencl_release_mem_object(a_b);
637 dt_opencl_release_mem_object(var_imgg_rr);
638 dt_opencl_release_mem_object(var_imgg_rg);
639 dt_opencl_release_mem_object(var_imgg_rb);
640 dt_opencl_release_mem_object(var_imgg_gg);
641 dt_opencl_release_mem_object(var_imgg_gb);
642 dt_opencl_release_mem_object(var_imgg_bb);
643 dt_opencl_release_mem_object(cov_imgg_img_r);
644 dt_opencl_release_mem_object(cov_imgg_img_g);
645 dt_opencl_release_mem_object(cov_imgg_img_b);
646 dt_opencl_release_mem_object(img_mean);
647 dt_opencl_release_mem_object(imgg_mean_r);
648 dt_opencl_release_mem_object(imgg_mean_g);
649 dt_opencl_release_mem_object(imgg_mean_b);
650 dt_opencl_release_mem_object(temp1);
651 dt_opencl_release_mem_object(temp2);
652
653 return err;
654 }
655
656
guided_filter_cl_fallback(int devid,cl_mem guide,cl_mem in,cl_mem out,const int width,const int height,const int ch,const int w,const float sqrt_eps,const float guide_weight,const float min,const float max)657 static void guided_filter_cl_fallback(int devid, cl_mem guide, cl_mem in, cl_mem out, const int width,
658 const int height, const int ch,
659 const int w, // window size
660 const float sqrt_eps, // regularization parameter
661 const float guide_weight, // to balance the amplitudes in the guiding image
662 // and the input// image
663 const float min, const float max)
664 {
665 // fall-back implementation: copy data from device memory to host memory and perform filter
666 // by CPU until there is a proper OpenCL implementation
667 float *guide_host = dt_alloc_align(64, sizeof(*guide_host) * width * height * ch);
668 float *in_host = dt_alloc_align(64, sizeof(*in_host) * width * height);
669 float *out_host = dt_alloc_align(64, sizeof(*out_host) * width * height);
670 int err;
671 err = dt_opencl_read_host_from_device(devid, guide_host, guide, width, height, ch * sizeof(float));
672 if(err != CL_SUCCESS) goto error;
673 err = dt_opencl_read_host_from_device(devid, in_host, in, width, height, sizeof(float));
674 if(err != CL_SUCCESS) goto error;
675 guided_filter(guide_host, in_host, out_host, width, height, ch, w, sqrt_eps, guide_weight, min, max);
676 err = dt_opencl_write_host_to_device(devid, out_host, out, width, height, sizeof(float));
677 if(err != CL_SUCCESS) goto error;
678 error:
679 dt_free_align(guide_host);
680 dt_free_align(in_host);
681 dt_free_align(out_host);
682 }
683
684
guided_filter_cl(int devid,cl_mem guide,cl_mem in,cl_mem out,const int width,const int height,const int ch,const int w,const float sqrt_eps,const float guide_weight,const float min,const float max)685 void guided_filter_cl(int devid, cl_mem guide, cl_mem in, cl_mem out, const int width, const int height,
686 const int ch,
687 const int w, // window size
688 const float sqrt_eps, // regularization parameter
689 const float guide_weight, // to balance the amplitudes in the guiding image and the input
690 // image
691 const float min, const float max)
692 {
693 assert(ch >= 3);
694 assert(w >= 1);
695
696 const cl_ulong max_global_mem = dt_opencl_get_max_global_mem(devid);
697 const size_t reserved_memory = (size_t)(dt_conf_get_float("opencl_memory_headroom") * 1024 * 1024);
698 // estimate required memory for OpenCL code path with a safety factor of 5/4
699 const size_t required_memory
700 = darktable.opencl->dev[devid].memory_in_use + (size_t)width * height * sizeof(float) * 18 * 5 / 4;
701 int err = CL_MEM_OBJECT_ALLOCATION_FAILURE;
702 if(max_global_mem - reserved_memory > required_memory)
703 err = guided_filter_cl_impl(devid, guide, in, out, width, height, ch, w, sqrt_eps, guide_weight, min, max);
704 if(err != CL_SUCCESS)
705 {
706 dt_print(DT_DEBUG_OPENCL, "[guided filter] fall back to cpu implementation due to insufficient gpu memory\n");
707 guided_filter_cl_fallback(devid, guide, in, out, width, height, ch, w, sqrt_eps, guide_weight, min, max);
708 }
709 }
710
711 #endif
712