1 /*
2     This file is part of darktable,
3     Copyright (C) 2017-2020 darktable developers.
4 
5     darktable is free software: you can redistribute it and/or modify
6     it under the terms of the GNU General Public License as published by
7     the Free Software Foundation, either version 3 of the License, or
8     (at your option) any later version.
9 
10     darktable is distributed in the hope that it will be useful,
11     but WITHOUT ANY WARRANTY; without even the implied warranty of
12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13     GNU General Public License for more details.
14 
15     You should have received a copy of the GNU General Public License
16     along with darktable.  If not, see <http://www.gnu.org/licenses/>.
17 */
18 
19 #include "control/control.h"
20 #include "develop/imageop.h"
21 #include "develop/openmp_maths.h"
22 #include "heal.h"
23 
24 /* Based on the original source code of GIMP's Healing Tool, by Jean-Yves Couleaud
25  *
26  * http://www.gimp.org/
27  *
28  * */
29 
30 /* NOTES
31  *
32  * The method used here is similar to the lighting invariant correction
33  * method but slightly different: we do not divide the RGB components,
34  * but subtract them I2 = I0 - I1, where I0 is the sample image to be
35  * corrected, I1 is the reference pattern. Then we solve DeltaI=0
36  * (Laplace) with I2 Dirichlet conditions at the borders of the
37  * mask. The solver is a red/black checker Gauss-Seidel with over-relaxation.
38  * It could benefit from a multi-grid evaluation of an initial solution
39  * before the main iteration loop.
40  *
41  * I reduced the convergence criteria to 0.1% (0.001) as we are
42  * dealing here with RGB integer components, more is overkill.
43  *
44  * Jean-Yves Couleaud cjyves@free.fr
45  */
46 
47 
48 // Subtract bottom from top and store in result as a float
dt_heal_sub(const float * const top_buffer,const float * const bottom_buffer,float * const restrict result_buffer,const int width,const int height)49 static void dt_heal_sub(const float *const top_buffer, const float *const bottom_buffer,
50                         float *const restrict result_buffer, const int width, const int height)
51 {
52   const size_t i_size = (size_t)width * height * 4;
53 
54 #ifdef _OPENMP
55 #pragma omp parallel for default(none) \
56   dt_omp_firstprivate(top_buffer, bottom_buffer, i_size) \
57   dt_omp_sharedconst(result_buffer) \
58   schedule(static)
59 #endif
60   for(size_t i = 0; i < i_size; i++) result_buffer[i] = top_buffer[i] - bottom_buffer[i];
61 }
62 
63 // Add first to second and store in result
dt_heal_add(const float * const restrict first_buffer,const float * const restrict second_buffer,float * const restrict result_buffer,const int width,const int height)64 static void dt_heal_add(const float *const restrict first_buffer, const float *const restrict second_buffer,
65                         float *const restrict result_buffer, const int width, const int height)
66 {
67   const size_t i_size = (size_t)width * height * 4;
68 
69 #ifdef _OPENMP
70 #pragma omp parallel for default(none) \
71   dt_omp_firstprivate(first_buffer, second_buffer, i_size) \
72   dt_omp_sharedconst(result_buffer) \
73   schedule(static)
74 #endif
75   for(size_t i = 0; i < i_size; i++) result_buffer[i] = first_buffer[i] + second_buffer[i];
76 }
77 
78 // define a custom reduction operation to handle a 3-vector of floats
79 // we can't return an array from a function, so wrap the array type in a struct
80 typedef struct _aligned_pixel { dt_aligned_pixel_t v; } _aligned_pixel;
81 #ifdef _OPENMP
add_float4(_aligned_pixel acc,_aligned_pixel newval)82 static inline _aligned_pixel add_float4(_aligned_pixel acc, _aligned_pixel newval)
83 {
84   for_each_channel(c) acc.v[c] += newval.v[c];
85   return acc;
86 }
87 #pragma omp declare reduction(vsum:_aligned_pixel:omp_out=add_float4(omp_out,omp_in)) \
88   initializer(omp_priv = { { 0.0f, 0.0f, 0.0f, 0.0f } })
89 #endif
90 
91 // Perform one iteration of Gauss-Seidel, and return the sum squared residual.
dt_heal_laplace_iteration(float * const restrict pixels,const float * const restrict Adiag,const int * const restrict Aidx,const float w,const int nmask_from,const int nmask_to)92 static float dt_heal_laplace_iteration(float *const restrict pixels, const float *const restrict Adiag,
93                                        const int *const restrict Aidx, const float w,
94                                        const int nmask_from, const int nmask_to)
95 {
96   _aligned_pixel err = { { 0.f } };
97 
98 #if !(defined(__apple_build_version__) && __apple_build_version__ < 11030000) //makes Xcode 11.3.1 compiler crash
99 #ifdef _OPENMP
100 #pragma omp parallel for default(none) \
101   dt_omp_firstprivate(Adiag, Aidx, w, nmask_from, nmask_to) \
102   dt_omp_sharedconst(pixels) \
103   schedule(static) \
104   reduction(vsum : err)
105 #endif
106 #endif
107   for(int i = nmask_from; i < nmask_to; i++)
108   {
109     const size_t j0 = Aidx[i * 5 + 0];
110     const size_t j1 = Aidx[i * 5 + 1];
111     const size_t j2 = Aidx[i * 5 + 2];
112     const size_t j3 = Aidx[i * 5 + 3];
113     const size_t j4 = Aidx[i * 5 + 4];
114     const float a = Adiag[i];
115 
116     dt_aligned_pixel_t diff;
117     for_each_channel(k,aligned(pixels))
118     {
119       diff[k] = w * (a * pixels[j0 + k] - (pixels[j1 + k] + pixels[j2 + k] + pixels[j3 + k] + pixels[j4 + k]));
120       pixels[j0 + k] -= diff[k];
121       err.v[k] += diff[k] * diff[k];
122     }
123   }
124 
125   return err.v[0] + err.v[1] + err.v[2];
126 }
127 
128 // Solve the laplace equation for pixels and store the result in-place.
dt_heal_laplace_loop(float * pixels,const int width,const int height,const float * const mask)129 static void dt_heal_laplace_loop(float *pixels, const int width, const int height,
130                                  const float *const mask)
131 {
132   int nmask = 0;
133   int nmask2 = 0;
134 
135   float *Adiag = dt_alloc_align_float((size_t)width * height);
136   int *Aidx = dt_alloc_align(64, sizeof(int) * 5 * width * height);
137 
138   if((Adiag == NULL) || (Aidx == NULL))
139   {
140     fprintf(stderr, "dt_heal_laplace_loop: error allocating memory for healing\n");
141     goto cleanup;
142   }
143 
144   /* All off-diagonal elements of A are either -1 or 0. We could store it as a
145    * general-purpose sparse matrix, but that adds some unnecessary overhead to
146    * the inner loop. Instead, assume exactly 4 off-diagonal elements in each
147    * row, all of which have value -1. Any row that in fact wants less than 4
148    * coefs can put them in a dummy column to be multiplied by an empty pixel.
149    */
150   const int zero = 4 * width * height;
151   memset(pixels + zero, 0, sizeof(float) * 4);
152 
153   /* Construct the system of equations.
154    * Arrange Aidx in checkerboard order, so that a single linear pass over that
155    * array results updating all of the red cells and then all of the black cells.
156    */
157   for(int parity = 0; parity < 2; parity++)
158   {
159     if(parity == 1) nmask2 = nmask;
160 
161     for(int i = 0; i < height; i++)
162     {
163       for(int j = (i & 1) ^ parity; j < width; j += 2)
164       {
165         if(mask[j + i * width])
166         {
167 #define A_POS(di, dj) (((i + di) * width + (j  + dj)) * 4)
168 
169           /* Omit Dirichlet conditions for any neighbors off the
170            * edge of the canvas.
171            */
172           Adiag[nmask] = 4 - (i == 0) - (j == 0) - (i == height - 1) - (j == width - 1);
173           Aidx[5 * nmask] = A_POS(0,0);
174           Aidx[5 * nmask + 1] = (j == width-1) ? zero : A_POS(0,1);
175           Aidx[5 * nmask + 2] = (i == height-1) ? zero : A_POS(1,0);
176           Aidx[5 * nmask + 3] = (j == 0) ? zero : A_POS(0,-1);
177           Aidx[5 * nmask + 4] = (i == 0) ? zero : A_POS(-1,0);
178           nmask++;
179         }
180       }
181     }
182   }
183 
184 #undef A_POS
185 
186   /* Empirically optimal over-relaxation factor. (Benchmarked on
187    * round brushes, at least. I don't know whether aspect ratio
188    * affects it.)
189    */
190   const float w = ((2.0f - 1.0f / (0.1575f * sqrtf(nmask) + 0.8f)) * .25f);
191 
192   const int max_iter = 1000;
193   const float epsilon = (0.1 / 255);
194   const float err_exit = epsilon * epsilon * w * w;
195 
196   /* Gauss-Seidel with successive over-relaxation */
197   for(int iter = 0; iter < max_iter; iter++)
198   {
199     // process red/black cells separate
200     float err = dt_heal_laplace_iteration(pixels, Adiag, Aidx, w, 0, nmask2);
201     err += dt_heal_laplace_iteration(pixels, Adiag, Aidx, w, nmask2, nmask);
202 
203     if(err < err_exit) break;
204   }
205 
206 cleanup:
207   if(Adiag) dt_free_align(Adiag);
208   if(Aidx) dt_free_align(Aidx);
209 }
210 
211 
212 /* Original Algorithm Design:
213  *
214  * T. Georgiev, "Photoshop Healing Brush: a Tool for Seamless Cloning
215  * http://www.tgeorgiev.net/Photoshop_Healing.pdf
216  */
dt_heal(const float * const src_buffer,float * dest_buffer,const float * const mask_buffer,const int width,const int height,const int ch)217 void dt_heal(const float *const src_buffer, float *dest_buffer, const float *const mask_buffer, const int width,
218              const int height, const int ch)
219 {
220   if(ch != 4)
221   {
222     fprintf(stderr,"dt_heal: full-color image required\n");
223     return;
224   }
225   float *const restrict diff_buffer = dt_alloc_align_float((size_t)ch * width * (height + 1));
226 
227   if(diff_buffer == NULL)
228   {
229     fprintf(stderr, "dt_heal: error allocating memory for healing\n");
230     goto cleanup;
231   }
232 
233   /* subtract pattern from image and store the result in diff */
234   dt_heal_sub(dest_buffer, src_buffer, diff_buffer, width, height);
235 
236   dt_heal_laplace_loop(diff_buffer, width, height, mask_buffer);
237 
238   /* add solution to original image and store in dest */
239   dt_heal_add(diff_buffer, src_buffer, dest_buffer, width, height);
240 
241 cleanup:
242   if(diff_buffer) dt_free_align(diff_buffer);
243 }
244 
245 #ifdef HAVE_OPENCL
246 
dt_heal_init_cl_global()247 dt_heal_cl_global_t *dt_heal_init_cl_global()
248 {
249   dt_heal_cl_global_t *g = (dt_heal_cl_global_t *)malloc(sizeof(dt_heal_cl_global_t));
250 
251   return g;
252 }
253 
dt_heal_free_cl_global(dt_heal_cl_global_t * g)254 void dt_heal_free_cl_global(dt_heal_cl_global_t *g)
255 {
256   if(!g) return;
257 
258   free(g);
259 }
260 
dt_heal_init_cl(const int devid)261 heal_params_cl_t *dt_heal_init_cl(const int devid)
262 {
263 
264   heal_params_cl_t *p = (heal_params_cl_t *)malloc(sizeof(heal_params_cl_t));
265   if(!p) return NULL;
266 
267   p->global = darktable.opencl->heal;
268   p->devid = devid;
269 
270   return p;
271 }
272 
dt_heal_free_cl(heal_params_cl_t * p)273 void dt_heal_free_cl(heal_params_cl_t *p)
274 {
275   if(!p) return;
276 
277   // be sure we're done with the memory:
278   dt_opencl_finish(p->devid);
279 
280   free(p);
281 }
282 
dt_heal_cl(heal_params_cl_t * p,cl_mem dev_src,cl_mem dev_dest,const float * const mask_buffer,const int width,const int height)283 cl_int dt_heal_cl(heal_params_cl_t *p, cl_mem dev_src, cl_mem dev_dest, const float *const mask_buffer,
284                   const int width, const int height)
285 {
286   cl_int err = CL_SUCCESS;
287 
288   const int ch = 4;
289 
290   float *src_buffer = NULL;
291   float *dest_buffer = NULL;
292 
293   src_buffer = dt_alloc_align_float((size_t)ch * width * height);
294   if(src_buffer == NULL)
295   {
296     fprintf(stderr, "dt_heal_cl: error allocating memory for healing\n");
297     err = CL_MEM_OBJECT_ALLOCATION_FAILURE;
298     goto cleanup;
299   }
300 
301   dest_buffer = dt_alloc_align_float((size_t)ch * width * height);
302   if(dest_buffer == NULL)
303   {
304     fprintf(stderr, "dt_heal_cl: error allocating memory for healing\n");
305     err = CL_MEM_OBJECT_ALLOCATION_FAILURE;
306     goto cleanup;
307   }
308 
309   err = dt_opencl_read_buffer_from_device(p->devid, (void *)src_buffer, dev_src, 0,
310                                           (size_t)width * height * ch * sizeof(float), CL_TRUE);
311   if(err != CL_SUCCESS)
312   {
313     goto cleanup;
314   }
315 
316   err = dt_opencl_read_buffer_from_device(p->devid, (void *)dest_buffer, dev_dest, 0,
317                                           (size_t)width * height * ch * sizeof(float), CL_TRUE);
318   if(err != CL_SUCCESS)
319   {
320     goto cleanup;
321   }
322 
323   // I couldn't make it run fast on opencl (the reduction takes forever), so just call the cpu version
324   dt_heal(src_buffer, dest_buffer, mask_buffer, width, height, ch);
325 
326   err = dt_opencl_write_buffer_to_device(p->devid, dest_buffer, dev_dest, 0, sizeof(float) * width * height * ch,
327                                          TRUE);
328   if(err != CL_SUCCESS)
329   {
330     goto cleanup;
331   }
332 
333 cleanup:
334   if(src_buffer) dt_free_align(src_buffer);
335   if(dest_buffer) dt_free_align(dest_buffer);
336 
337   return err;
338 }
339 
340 #endif
341