1 /*
2     This file is part of darktable,
3     Copyright (C) 2011-2020 darktable developers.
4 
5     darktable is free software: you can redistribute it and/or modify
6     it under the terms of the GNU General Public License as published by
7     the Free Software Foundation, either version 3 of the License, or
8     (at your option) any later version.
9 
10     darktable is distributed in the hope that it will be useful,
11     but WITHOUT ANY WARRANTY; without even the implied warranty of
12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13     GNU General Public License for more details.
14 
15     You should have received a copy of the GNU General Public License
16     along with darktable.  If not, see <http://www.gnu.org/licenses/>.
17 */
18 
19 
20 #include "develop/tiling.h"
21 #include "common/opencl.h"
22 #include "control/control.h"
23 #include "develop/blend.h"
24 #include "develop/pixelpipe.h"
25 
26 #include <assert.h>
27 #include <math.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <strings.h>
31 #include <unistd.h>
32 
33 #define CLAMPI(a, mn, mx) ((a) < (mn) ? (mn) : ((a) > (mx) ? (mx) : (a)))
34 
35 
36 /* this defines an additional alignment requirement for opencl image width.
37    It can have strong effects on processing speed. Reasonable values are a
38    power of 2. set to 1 for no effect. */
39 #define CL_ALIGNMENT 4
40 
41 /* parameter RESERVE for extended roi_in sizes due to inaccuracies when doing
42    roi_out -> roi_in estimations.
43    Needs to be increased if tiling fails due to insufficient buffer sizes. */
44 #define RESERVE 5
45 
46 
47 /* greatest common divisor */
_gcd(unsigned a,unsigned b)48 static unsigned _gcd(unsigned a, unsigned b)
49 {
50   unsigned t;
51   while(b != 0)
52   {
53     t = b;
54     b = a % b;
55     a = t;
56   }
57   return MAX(a, 1);
58 }
59 
60 /* least common multiple */
_lcm(unsigned a,unsigned b)61 static unsigned _lcm(unsigned a, unsigned b)
62 {
63   return (((unsigned long)a * b) / _gcd(a, b));
64 }
65 
66 
_min(int a,int b)67 static inline int _min(int a, int b)
68 {
69   return a < b ? a : b;
70 }
71 
_max(int a,int b)72 static inline int _max(int a, int b)
73 {
74   return a > b ? a : b;
75 }
76 
77 
_align_up(int n,int a)78 static inline int _align_up(int n, int a)
79 {
80   return n % a != 0 ? (n / a + 1) * a : n;
81 }
82 
_align_down(int n,int a)83 static inline int _align_down(int n, int a)
84 {
85   return n % a != 0 ? (n / a) * a : n;
86 }
87 
88 
_print_roi(const dt_iop_roi_t * roi,const char * label)89 void _print_roi(const dt_iop_roi_t *roi, const char *label)
90 {
91   printf("{ %5d  %5d  %5d  %5d  %.6f } %s\n", roi->x, roi->y, roi->width, roi->height, roi->scale, label);
92 }
93 
94 
95 #if 0
96 static void
97 _nm_constraints(double x[], int n)
98 {
99   x[0] = fabs(x[0]);
100   x[1] = fabs(x[1]);
101   x[2] = fabs(x[2]);
102   x[3] = fabs(x[3]);
103 
104   if(x[0] > 1.0) x[0] = 1.0 - x[0];
105   if(x[1] > 1.0) x[1] = 1.0 - x[1];
106   if(x[2] > 1.0) x[2] = 1.0 - x[2];
107   if(x[3] > 1.0) x[3] = 1.0 - x[3];
108 
109 }
110 #endif
111 
_nm_fitness(double x[],void * rest[])112 static double _nm_fitness(double x[], void *rest[])
113 {
114   struct dt_iop_module_t *self = (struct dt_iop_module_t *)rest[0];
115   struct dt_dev_pixelpipe_iop_t *piece = (struct dt_dev_pixelpipe_iop_t *)rest[1];
116   struct dt_iop_roi_t *iroi = (struct dt_iop_roi_t *)rest[2];
117   struct dt_iop_roi_t *oroi = (struct dt_iop_roi_t *)rest[3];
118 
119   dt_iop_roi_t oroi_test = *oroi;
120   oroi_test.x = x[0] * piece->iwidth;
121   oroi_test.y = x[1] * piece->iheight;
122   oroi_test.width = x[2] * piece->iwidth;
123   oroi_test.height = x[3] * piece->iheight;
124 
125   dt_iop_roi_t iroi_probe = *iroi;
126   self->modify_roi_in(self, piece, &oroi_test, &iroi_probe);
127 
128   double fitness = 0.0;
129 
130   fitness += (double)(iroi_probe.x - iroi->x) * (iroi_probe.x - iroi->x);
131   fitness += (double)(iroi_probe.y - iroi->y) * (iroi_probe.y - iroi->y);
132   fitness += (double)(iroi_probe.width - iroi->width) * (iroi_probe.width - iroi->width);
133   fitness += (double)(iroi_probe.height - iroi->height) * (iroi_probe.height - iroi->height);
134 
135   return fitness;
136 }
137 
138 
139 /* We use a Nelder-Mead simplex algorithm based on an implementation of Michael F. Hutt.
140    It is covered by the following copyright notice: */
141 /*
142  * Program: nmsimplex.c
143  * Author : Michael F. Hutt
144  * http://www.mikehutt.com
145  * 11/3/97
146  *
147  * An implementation of the Nelder-Mead simplex method.
148  *
149  * Copyright (c) 1997-2011 <Michael F. Hutt>
150  *
151  * Permission is hereby granted, free of charge, to any person obtaining
152  * a copy of this software and associated documentation files (the
153  * "Software"), to deal in the Software without restriction, including
154  * without limitation the rights to use, copy, modify, merge, publish,
155  * distribute, sublicense, and/or sell copies of the Software, and to
156  * permit persons to whom the Software is furnished to do so, subject to
157  * the following conditions:
158  *
159  * The above copyright notice and this permission notice shall be
160  * included in all copies or substantial portions of the Software.
161  *
162  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
163  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
164  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
165  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
166  * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
167  * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
168  * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
169  *
170  */
171 
172 #define MAX_IT 1000 /* maximum number of iterations */
173 #define ALPHA 1.0   /* reflection coefficient */
174 #define BETA 0.5    /* contraction coefficient */
175 #define GAMMA 2.0   /* expansion coefficient */
176 
_simplex(double (* objfunc)(double[],void * []),double start[],int n,double EPSILON,double scale,int maxiter,void (* constrain)(double[],int n),void * rest[])177 static int _simplex(double (*objfunc)(double[], void *[]), double start[], int n, double EPSILON,
178                     double scale, int maxiter, void (*constrain)(double[], int n), void *rest[])
179 {
180 
181   int vs; /* vertex with smallest value */
182   int vh; /* vertex with next smallest value */
183   int vg; /* vertex with largest value */
184 
185   int i, j = 0, m, row;
186   int k;   /* track the number of function evaluations */
187   int itr; /* track the number of iterations */
188 
189   double **v;    /* holds vertices of simplex */
190   double pn, qn; /* values used to create initial simplex */
191   double *f;     /* value of function at each vertex */
192   double fr;     /* value of function at reflection point */
193   double fe;     /* value of function at expansion point */
194   double fc;     /* value of function at contraction point */
195   double *vr;    /* reflection - coordinates */
196   double *ve;    /* expansion - coordinates */
197   double *vc;    /* contraction - coordinates */
198   double *vm;    /* centroid - coordinates */
199 
200   double fsum, favg, s, cent;
201 
202   /* dynamically allocate arrays */
203 
204   /* allocate the rows of the arrays */
205   v = (double **)malloc(sizeof(double *) * (n + 1));
206   f = (double *)malloc(sizeof(double) * (n + 1));
207   vr = (double *)malloc(sizeof(double) * n);
208   ve = (double *)malloc(sizeof(double) * n);
209   vc = (double *)malloc(sizeof(double) * n);
210   vm = (double *)malloc(sizeof(double) * n);
211 
212   /* allocate the columns of the arrays */
213   for(i = 0; i <= n; i++)
214   {
215     v[i] = (double *)malloc(sizeof(double) * n);
216   }
217 
218   /* create the initial simplex */
219   /* assume one of the vertices is 0,0 */
220 
221   pn = scale * (sqrt(n + 1) - 1 + n) / (n * sqrt(2));
222   qn = scale * (sqrt(n + 1) - 1) / (n * sqrt(2));
223 
224   for(i = 0; i < n; i++)
225   {
226     v[0][i] = start[i];
227   }
228 
229   for(i = 1; i <= n; i++)
230   {
231     for(j = 0; j < n; j++)
232     {
233       if(i - 1 == j)
234       {
235         v[i][j] = pn + start[j];
236       }
237       else
238       {
239         v[i][j] = qn + start[j];
240       }
241     }
242   }
243 
244   if(constrain != NULL)
245   {
246     constrain(v[j], n);
247   }
248   /* find the initial function values */
249   for(j = 0; j <= n; j++)
250   {
251     f[j] = objfunc(v[j], rest);
252   }
253 
254   k = n + 1;
255 
256 #if 0
257   /* print out the initial values */
258   printf ("Initial Values\n");
259   for (j = 0; j <= n; j++)
260   {
261     for (i = 0; i < n; i++)
262     {
263       printf ("%f %f\n", v[j][i], f[j]);
264     }
265   }
266 #endif
267 
268   /* begin the main loop of the minimization */
269   for(itr = 1; itr <= maxiter; itr++)
270   {
271     /* find the index of the largest value */
272     vg = 0;
273     for(j = 0; j <= n; j++)
274     {
275       if(f[j] > f[vg])
276       {
277         vg = j;
278       }
279     }
280 
281     /* find the index of the smallest value */
282     vs = 0;
283     for(j = 0; j <= n; j++)
284     {
285       if(f[j] < f[vs])
286       {
287         vs = j;
288       }
289     }
290 
291     /* find the index of the second largest value */
292     vh = vs;
293     for(j = 0; j <= n; j++)
294     {
295       if(f[j] > f[vh] && f[j] < f[vg])
296       {
297         vh = j;
298       }
299     }
300 
301     /* calculate the centroid */
302     for(j = 0; j <= n - 1; j++)
303     {
304       cent = 0.0;
305       for(m = 0; m <= n; m++)
306       {
307         if(m != vg)
308         {
309           cent += v[m][j];
310         }
311       }
312       vm[j] = cent / n;
313     }
314 
315     /* reflect vg to new vertex vr */
316     for(j = 0; j <= n - 1; j++)
317     {
318       /*vr[j] = (1+ALPHA)*vm[j] - ALPHA*v[vg][j]; */
319       vr[j] = vm[j] + ALPHA * (vm[j] - v[vg][j]);
320     }
321     if(constrain != NULL)
322     {
323       constrain(vr, n);
324     }
325     fr = objfunc(vr, rest);
326     k++;
327 
328     if(fr < f[vh] && fr >= f[vs])
329     {
330       for(j = 0; j <= n - 1; j++)
331       {
332         v[vg][j] = vr[j];
333       }
334       f[vg] = fr;
335     }
336 
337     /* investigate a step further in this direction */
338     if(fr < f[vs])
339     {
340       for(j = 0; j <= n - 1; j++)
341       {
342         /*ve[j] = GAMMA*vr[j] + (1-GAMMA)*vm[j]; */
343         ve[j] = vm[j] + GAMMA * (vr[j] - vm[j]);
344       }
345       if(constrain != NULL)
346       {
347         constrain(ve, n);
348       }
349       fe = objfunc(ve, rest);
350       k++;
351 
352       /* by making fe < fr as opposed to fe < f[vs],
353          Rosenbrocks function takes 63 iterations as opposed
354          to 64 when using double variables. */
355 
356       if(fe < fr)
357       {
358         for(j = 0; j <= n - 1; j++)
359         {
360           v[vg][j] = ve[j];
361         }
362         f[vg] = fe;
363       }
364       else
365       {
366         for(j = 0; j <= n - 1; j++)
367         {
368           v[vg][j] = vr[j];
369         }
370         f[vg] = fr;
371       }
372     }
373 
374     /* check to see if a contraction is necessary */
375     if(fr >= f[vh])
376     {
377       if(fr < f[vg] && fr >= f[vh])
378       {
379         /* perform outside contraction */
380         for(j = 0; j <= n - 1; j++)
381         {
382           /*vc[j] = BETA*v[vg][j] + (1-BETA)*vm[j]; */
383           vc[j] = vm[j] + BETA * (vr[j] - vm[j]);
384         }
385         if(constrain != NULL)
386         {
387           constrain(vc, n);
388         }
389         fc = objfunc(vc, rest);
390         k++;
391       }
392       else
393       {
394         /* perform inside contraction */
395         for(j = 0; j <= n - 1; j++)
396         {
397           /*vc[j] = BETA*v[vg][j] + (1-BETA)*vm[j]; */
398           vc[j] = vm[j] - BETA * (vm[j] - v[vg][j]);
399         }
400         if(constrain != NULL)
401         {
402           constrain(vc, n);
403         }
404         fc = objfunc(vc, rest);
405         k++;
406       }
407 
408 
409       if(fc < f[vg])
410       {
411         for(j = 0; j <= n - 1; j++)
412         {
413           v[vg][j] = vc[j];
414         }
415         f[vg] = fc;
416       }
417       /* at this point the contraction is not successful,
418          we must halve the distance from vs to all the
419          vertices of the simplex and then continue.
420          10/31/97 - modified to account for ALL vertices.
421        */
422       else
423       {
424         for(row = 0; row <= n; row++)
425         {
426           if(row != vs)
427           {
428             for(j = 0; j <= n - 1; j++)
429             {
430               v[row][j] = v[vs][j] + (v[row][j] - v[vs][j]) / 2.0;
431             }
432           }
433         }
434         if(constrain != NULL)
435         {
436           constrain(v[vg], n);
437         }
438         f[vg] = objfunc(v[vg], rest);
439         k++;
440         if(constrain != NULL)
441         {
442           constrain(v[vh], n);
443         }
444         f[vh] = objfunc(v[vh], rest);
445         k++;
446       }
447     }
448 
449 #if 0
450     /* print out the value at each iteration */
451     printf ("Iteration %d\n", itr);
452     for (j = 0; j <= n; j++)
453     {
454       for (i = 0; i < n; i++)
455       {
456         printf ("%f %f\n", v[j][i], f[j]);
457       }
458     }
459 #endif
460 
461     /* test for convergence */
462     fsum = 0.0;
463     for(j = 0; j <= n; j++)
464     {
465       fsum += f[j];
466     }
467     favg = fsum / (n + 1);
468     s = 0.0;
469     for(j = 0; j <= n; j++)
470     {
471       s += pow((f[j] - favg), 2.0) / (n);
472     }
473     s = sqrt(s);
474     if(s < EPSILON) break;
475   }
476   /* end main loop of the minimization */
477 
478   /* find the index of the smallest value */
479   vs = 0;
480   for(j = 0; j <= n; j++)
481   {
482     if(f[j] < f[vs])
483     {
484       vs = j;
485     }
486   }
487 
488 #if 0
489   printf ("The minimum was found at\n");
490   for (j = 0; j < n; j++)
491   {
492     printf ("%e\n", v[vs][j]);
493     start[j] = v[vs][j];
494   }
495   double min = objfunc (v[vs], rest);
496   printf ("Function value at minimum %f\n", min);
497   k++;
498   printf ("%d Function Evaluations\n", k);
499   printf ("%d Iterations through program\n", itr);
500 #endif
501 
502   free(f);
503   free(vr);
504   free(ve);
505   free(vc);
506   free(vm);
507   for(i = 0; i <= n; i++)
508   {
509     free(v[i]);
510   }
511   free(v);
512   return itr;
513 }
514 
515 
_nm_fit_output_to_input_roi(struct dt_iop_module_t * self,struct dt_dev_pixelpipe_iop_t * piece,const dt_iop_roi_t * iroi,dt_iop_roi_t * oroi,int delta)516 static int _nm_fit_output_to_input_roi(struct dt_iop_module_t *self, struct dt_dev_pixelpipe_iop_t *piece,
517                                        const dt_iop_roi_t *iroi, dt_iop_roi_t *oroi, int delta)
518 {
519   void *rest[4] = { (void *)self, (void *)piece, (void *)iroi, (void *)oroi };
520   double start[4] = { (float)oroi->x / piece->iwidth, (float)oroi->y / piece->iheight,
521                       (float)oroi->width / piece->iwidth, (float)oroi->height / piece->iheight };
522   double epsilon = (double)delta / MIN(piece->iwidth, piece->iheight);
523   int maxiter = 1000;
524 
525   int iter = _simplex(_nm_fitness, start, 4, epsilon, 1.0, maxiter, NULL, rest);
526 
527   // printf("_simplex: %d, delta: %d, epsilon: %f\n", iter, delta, epsilon);
528 
529   oroi->x = start[0] * piece->iwidth;
530   oroi->y = start[1] * piece->iheight;
531   oroi->width = start[2] * piece->iwidth;
532   oroi->height = start[3] * piece->iheight;
533 
534   return (iter <= maxiter);
535 }
536 
537 
538 
539 /* find a matching oroi_full by probing start value of oroi and get corresponding input roi into iroi_probe.
540    We search in two steps. first by a simplicistic iterative search which will succeed in most cases.
541    If this does not converge, we do a downhill simplex (nelder-mead) fitting */
_fit_output_to_input_roi(struct dt_iop_module_t * self,struct dt_dev_pixelpipe_iop_t * piece,const dt_iop_roi_t * iroi,dt_iop_roi_t * oroi,int delta,int iter)542 static int _fit_output_to_input_roi(struct dt_iop_module_t *self, struct dt_dev_pixelpipe_iop_t *piece,
543                                     const dt_iop_roi_t *iroi, dt_iop_roi_t *oroi, int delta, int iter)
544 {
545   dt_iop_roi_t iroi_probe = *iroi;
546   dt_iop_roi_t save_oroi = *oroi;
547 
548   // try to go the easy way. this works in many cases where output is
549   // just like input, only scaled down
550   self->modify_roi_in(self, piece, oroi, &iroi_probe);
551   while((abs((int)iroi_probe.x - (int)iroi->x) > delta || abs((int)iroi_probe.y - (int)iroi->y) > delta
552          || abs((int)iroi_probe.width - (int)iroi->width) > delta
553          || abs((int)iroi_probe.height - (int)iroi->height) > delta) && iter > 0)
554   {
555     //_print_roi(&iroi_probe, "tile iroi_probe");
556     //_print_roi(oroi, "tile oroi old");
557 
558     oroi->x += (iroi->x - iroi_probe.x) * oroi->scale / iroi->scale;
559     oroi->y += (iroi->y - iroi_probe.y) * oroi->scale / iroi->scale;
560     oroi->width += (iroi->width - iroi_probe.width) * oroi->scale / iroi->scale;
561     oroi->height += (iroi->height - iroi_probe.height) * oroi->scale / iroi->scale;
562 
563     //_print_roi(oroi, "tile oroi new");
564 
565     self->modify_roi_in(self, piece, oroi, &iroi_probe);
566     iter--;
567   }
568 
569   if(iter > 0) return TRUE;
570 
571   *oroi = save_oroi;
572 
573   // simplicistic approach did not converge.
574   // try simplex downhill fitting now.
575   // it's crucial that we have a good starting point in oroi, else this
576   // will not converge as well.
577   int fit = _nm_fit_output_to_input_roi(self, piece, iroi, oroi, delta);
578   return fit;
579 }
580 
581 
582 /* simple tiling algorithm for roi_in == roi_out, i.e. for pixel to pixel modules/operations */
_default_process_tiling_ptp(struct dt_iop_module_t * self,struct dt_dev_pixelpipe_iop_t * piece,const void * const ivoid,void * const ovoid,const dt_iop_roi_t * const roi_in,const dt_iop_roi_t * const roi_out,const int in_bpp)583 static void _default_process_tiling_ptp(struct dt_iop_module_t *self, struct dt_dev_pixelpipe_iop_t *piece,
584                                         const void *const ivoid, void *const ovoid,
585                                         const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out,
586                                         const int in_bpp)
587 {
588   void *input = NULL;
589   void *output = NULL;
590   dt_iop_buffer_dsc_t dsc;
591   self->output_format(self, piece->pipe, piece, &dsc);
592   const int out_bpp = dt_iop_buffer_dsc_to_bpp(&dsc);
593 
594   const int ipitch = roi_in->width * in_bpp;
595   const int opitch = roi_out->width * out_bpp;
596   const int max_bpp = _max(in_bpp, out_bpp);
597 
598   /* get tiling requirements of module */
599   dt_develop_tiling_t tiling = { 0 };
600   self->tiling_callback(self, piece, roi_in, roi_out, &tiling);
601 
602   /* tiling really does not make sense in these cases. standard process() is not better or worse than we are
603    */
604   if(tiling.factor < 2.2f && tiling.overhead < 0.2f * roi_in->width * roi_in->height * max_bpp)
605   {
606     dt_print(DT_DEBUG_DEV, "[default_process_tiling_ptp] no need to use tiling for module '%s' as no real "
607                            "memory saving to be expected\n",
608              self->op);
609     goto fallback;
610   }
611 
612   /* calculate optimal size of tiles */
613   float available = dt_conf_get_float("host_memory_limit") * 1024.0f * 1024.0f;
614   assert(available >= 500.0f * 1024.0f * 1024.0f);
615   /* correct for size of ivoid and ovoid which are needed on top of tiling */
616   available = fmax(available - ((float)roi_out->width * roi_out->height * out_bpp)
617                    - ((float)roi_in->width * roi_in->height * in_bpp) - tiling.overhead,
618                    0);
619 
620   /* we ignore the above value if singlebuffer_limit (is defined and) is higher than available/tiling.factor.
621      this will mainly allow tiling for modules with high and "unpredictable" memory demand which is
622      reflected in high values of tiling.factor (take bilateral noise reduction as an example). */
623   float singlebuffer = dt_conf_get_float("singlebuffer_limit") * 1024.0f * 1024.0f;
624   singlebuffer = fmax(singlebuffer, 2.0f * 1024.0f * 1024.0f);
625   const float factor = fmax(tiling.factor, 1.0f);
626   const float maxbuf = fmax(tiling.maxbuf, 1.0f);
627   singlebuffer = fmax(available / factor, singlebuffer);
628 
629   int width = roi_in->width;
630   int height = roi_in->height;
631 
632   /* shrink tile size in case it would exceed singlebuffer size */
633   if((float)width * height * max_bpp * maxbuf > singlebuffer)
634   {
635     const float scale = singlebuffer / ((float)width * height * max_bpp * maxbuf);
636 
637     /* TODO: can we make this more efficient to minimize total overlap between tiles? */
638     if(width < height && scale >= 0.333f)
639     {
640       height = floorf(height * scale);
641     }
642     else if(height <= width && scale >= 0.333f)
643     {
644       width = floorf(width * scale);
645     }
646     else
647     {
648       width = floorf(width * sqrtf(scale));
649       height = floorf(height * sqrtf(scale));
650     }
651   }
652 
653   /* make sure we have a reasonably effective tile dimension. if not try square tiles */
654   if(3 * tiling.overlap > width || 3 * tiling.overlap > height)
655   {
656     width = height = floorf(sqrtf((float)width * height));
657   }
658 
659   /* Alignment rules: we need to make sure that alignment requirements of module are fulfilled.
660      Modules will report alignment requirements via xalign and yalign within tiling_callback().
661      Typical use case is demosaic where Bayer pattern requires alignment to a multiple of 2 in x and y
662      direction.
663      We guarantee alignment by selecting image width/height and overlap accordingly. For a tile width/height
664      that is identical to image width/height no special alignment is needed. */
665 
666   const unsigned int xyalign = _lcm(tiling.xalign, tiling.yalign);
667 
668   assert(xyalign != 0);
669 
670   /* properly align tile width and height by making them smaller if needed */
671   if(width < roi_in->width) width = (width / xyalign) * xyalign;
672   if(height < roi_in->height) height = (height / xyalign) * xyalign;
673 
674   /* also make sure that overlap follows alignment rules by making it wider when needed */
675   const int overlap = tiling.overlap % xyalign != 0 ? (tiling.overlap / xyalign + 1) * xyalign
676                                                     : tiling.overlap;
677 
678   /* calculate effective tile size */
679   const int tile_wd = width - 2 * overlap > 0 ? width - 2 * overlap : 1;
680   const int tile_ht = height - 2 * overlap > 0 ? height - 2 * overlap : 1;
681 
682   /* calculate number of tiles */
683   const int tiles_x = width < roi_in->width ? ceilf(roi_in->width / (float)tile_wd) : 1;
684   const int tiles_y = height < roi_in->height ? ceilf(roi_in->height / (float)tile_ht) : 1;
685 
686   /* sanity check: don't run wild on too many tiles */
687   if(tiles_x * tiles_y > dt_conf_get_int("maximum_number_tiles"))
688   {
689     dt_print(DT_DEBUG_DEV,
690              "[default_process_tiling_ptp] gave up tiling for module '%s'. too many tiles: %d x %d\n",
691              self->op, tiles_x, tiles_y);
692     goto error;
693   }
694 
695 
696   dt_print(DT_DEBUG_DEV,
697            "[default_process_tiling_ptp] use tiling on module '%s' for image with full size %d x %d\n",
698            self->op, roi_in->width, roi_in->height);
699   dt_print(DT_DEBUG_DEV,
700            "[default_process_tiling_ptp] (%d x %d) tiles with max dimensions %d x %d and overlap %d\n",
701            tiles_x, tiles_y, width, height, overlap);
702 
703   /* reserve input and output buffers for tiles */
704   input = dt_alloc_align(64, (size_t)width * height * in_bpp);
705   if(input == NULL)
706   {
707     dt_print(DT_DEBUG_DEV, "[default_process_tiling_ptp] could not alloc input buffer for module '%s'\n",
708              self->op);
709     goto error;
710   }
711   output = dt_alloc_align(64, (size_t)width * height * out_bpp);
712   if(output == NULL)
713   {
714     dt_print(DT_DEBUG_DEV, "[default_process_tiling_ptp] could not alloc output buffer for module '%s'\n",
715              self->op);
716     goto error;
717   }
718 
719   /* store processed_maximum to be re-used and aggregated */
720   float processed_maximum_saved[4];
721   float processed_maximum_new[4] = { 1.0f };
722   for(int k = 0; k < 4; k++) processed_maximum_saved[k] = piece->pipe->dsc.processed_maximum[k];
723 
724 
725   /* iterate over tiles */
726   for(size_t tx = 0; tx < tiles_x; tx++)
727   {
728     const size_t wd = tx * tile_wd + width > roi_in->width ? roi_in->width - tx * tile_wd : width;
729     for(size_t ty = 0; ty < tiles_y; ty++)
730     {
731       piece->pipe->tiling = 1;
732 
733       const size_t ht = ty * tile_ht + height > roi_in->height ? roi_in->height - ty * tile_ht : height;
734 
735       /* no need to process end-tiles that are smaller than the total overlap area */
736       if((wd <= 2 * overlap && tx > 0) || (ht <= 2 * overlap && ty > 0)) continue;
737 
738       /* origin and region of effective part of tile, which we want to store later */
739       size_t origin[] = { 0, 0, 0 };
740       size_t region[] = { wd, ht, 1 };
741 
742       /* roi_in and roi_out for process_cl on subbuffer */
743       dt_iop_roi_t iroi = { roi_in->x + tx * tile_wd, roi_in->y + ty * tile_ht, wd, ht, roi_in->scale };
744       dt_iop_roi_t oroi = { roi_out->x + tx * tile_wd, roi_out->y + ty * tile_ht, wd, ht, roi_out->scale };
745 
746       /* offsets of tile into ivoid and ovoid */
747       const size_t ioffs = (ty * tile_ht) * ipitch + (tx * tile_wd) * in_bpp;
748       size_t ooffs = (ty * tile_ht) * opitch + (tx * tile_wd) * out_bpp;
749 
750 
751       dt_print(DT_DEBUG_DEV, "[default_process_tiling_ptp] tile (%zu, %zu) with %zu x %zu at origin [%zu, %zu]\n",
752                tx, ty, wd, ht, tx * tile_wd, ty * tile_ht);
753 
754 /* prepare input tile buffer */
755 #ifdef _OPENMP
756 #pragma omp parallel for default(none) \
757       dt_omp_firstprivate(ht, in_bpp, ipitch, ivoid, wd) \
758       dt_omp_sharedconst(ioffs) \
759       shared(input, width) \
760       schedule(static)
761 #endif
762       for(size_t j = 0; j < ht; j++)
763         memcpy((char *)input + j * wd * in_bpp, (char *)ivoid + ioffs + j * ipitch, (size_t)wd * in_bpp);
764 
765       /* take original processed_maximum as starting point */
766       for(int k = 0; k < 4; k++) piece->pipe->dsc.processed_maximum[k] = processed_maximum_saved[k];
767 
768       /* call process() of module */
769       self->process(self, piece, input, output, &iroi, &oroi);
770 
771       /* aggregate resulting processed_maximum */
772       /* TODO: check if there really can be differences between tiles and take
773                appropriate action (calculate minimum, maximum, average, ...?) */
774       for(int k = 0; k < 4; k++)
775       {
776         if(tx + ty > 0 && fabs(processed_maximum_new[k] - piece->pipe->dsc.processed_maximum[k]) > 1.0e-6f)
777           dt_print(
778               DT_DEBUG_DEV,
779               "[default_process_tiling_ptp] processed_maximum[%d] differs between tiles in module '%s'\n", k,
780               self->op);
781         processed_maximum_new[k] = piece->pipe->dsc.processed_maximum[k];
782       }
783 
784       /* correct origin and region of tile for overlap.
785          make sure that we only copy back the "good" part. */
786       if(tx > 0)
787       {
788         origin[0] += overlap;
789         region[0] -= overlap;
790         ooffs += (size_t)overlap * out_bpp;
791       }
792       if(ty > 0)
793       {
794         origin[1] += overlap;
795         region[1] -= overlap;
796         ooffs += (size_t)overlap * opitch;
797       }
798 
799 /* copy "good" part of tile to output buffer */
800 #ifdef _OPENMP
801 #pragma omp parallel for default(none) \
802       dt_omp_firstprivate(opitch, out_bpp, ovoid, wd) \
803       shared(ooffs, output, width, origin, region) \
804       schedule(static)
805 #endif
806       for(size_t j = 0; j < region[1]; j++)
807         memcpy((char *)ovoid + ooffs + j * opitch,
808                (char *)output + ((j + origin[1]) * wd + origin[0]) * out_bpp, (size_t)region[0] * out_bpp);
809     }
810   }
811 
812   /* copy back final processed_maximum */
813   for(int k = 0; k < 4; k++) piece->pipe->dsc.processed_maximum[k] = processed_maximum_new[k];
814 
815   if(input != NULL) dt_free_align(input);
816   if(output != NULL) dt_free_align(output);
817   piece->pipe->tiling = 0;
818   return;
819 
820 error:
821   dt_control_log(_("tiling failed for module '%s'. output might be garbled."), self->op);
822 // fall through
823 
824 fallback:
825   if(input != NULL) dt_free_align(input);
826   if(output != NULL) dt_free_align(output);
827   piece->pipe->tiling = 0;
828   dt_print(DT_DEBUG_DEV, "[default_process_tiling_ptp] fall back to standard processing for module '%s'\n",
829            self->op);
830   self->process(self, piece, ivoid, ovoid, roi_in, roi_out);
831   return;
832 }
833 
834 
835 
836 /* more elaborate tiling algorithm for roi_in != roi_out: slower than the ptp variant,
837    more tiles and larger overlap */
_default_process_tiling_roi(struct dt_iop_module_t * self,struct dt_dev_pixelpipe_iop_t * piece,const void * const ivoid,void * const ovoid,const dt_iop_roi_t * const roi_in,const dt_iop_roi_t * const roi_out,const int in_bpp)838 static void _default_process_tiling_roi(struct dt_iop_module_t *self, struct dt_dev_pixelpipe_iop_t *piece,
839                                         const void *const ivoid, void *const ovoid,
840                                         const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out,
841                                         const int in_bpp)
842 {
843   void *input = NULL;
844   void *output = NULL;
845 
846   //_print_roi(roi_in, "module roi_in");
847   //_print_roi(roi_out, "module roi_out");
848 
849   dt_iop_buffer_dsc_t dsc;
850   self->output_format(self, piece->pipe, piece, &dsc);
851   const int out_bpp = dt_iop_buffer_dsc_to_bpp(&dsc);
852 
853   const int ipitch = roi_in->width * in_bpp;
854   const int opitch = roi_out->width * out_bpp;
855   const int max_bpp = _max(in_bpp, out_bpp);
856 
857   float fullscale = fmax(roi_in->scale / roi_out->scale, sqrtf(((float)roi_in->width * roi_in->height)
858                                                               / ((float)roi_out->width * roi_out->height)));
859 
860   /* inaccuracy for roi_in elements in roi_out -> roi_in calculations */
861   const int delta = ceilf(fullscale);
862 
863   /* estimate for additional (space) requirement in buffer dimensions due to inaccuracies */
864   const int inacc = RESERVE * delta;
865 
866   /* get tiling requirements of module */
867   dt_develop_tiling_t tiling = { 0 };
868   self->tiling_callback(self, piece, roi_in, roi_out, &tiling);
869 
870   /* tiling really does not make sense in these cases. standard process() is not better or worse than we are
871    */
872   if(tiling.factor < 2.2f && tiling.overhead < 0.2f * roi_in->width * roi_in->height * max_bpp)
873   {
874     dt_print(DT_DEBUG_DEV, "[default_process_tiling_roi] no need to use tiling for module '%s' as no real "
875                            "memory saving to be expected\n",
876              self->op);
877     goto fallback;
878   }
879 
880   /* calculate optimal size of tiles */
881   float available = dt_conf_get_float("host_memory_limit") * 1024.0f * 1024.0f;
882   assert(available >= 500.0f * 1024.0f * 1024.0f);
883   /* correct for size of ivoid and ovoid which are needed on top of tiling */
884   available = fmax(available - ((float)roi_out->width * roi_out->height * out_bpp)
885                    - ((float)roi_in->width * roi_in->height * in_bpp) - tiling.overhead,
886                    0);
887 
888   /* we ignore the above value if singlebuffer_limit (is defined and) is higher than available/tiling.factor.
889      this will mainly allow tiling for modules with high and "unpredictable" memory demand which is
890      reflected in high values of tiling.factor (take bilateral noise reduction as an example). */
891   float singlebuffer = dt_conf_get_float("singlebuffer_limit") * 1024.0f * 1024.0f;
892   singlebuffer = fmax(singlebuffer, 2.0f * 1024.0f * 1024.0f);
893   const float factor = fmax(tiling.factor, 1.0f);
894   const float maxbuf = fmax(tiling.maxbuf, 1.0f);
895   singlebuffer = fmax(available / factor, singlebuffer);
896 
897   int width = _max(roi_in->width, roi_out->width);
898   int height = _max(roi_in->height, roi_out->height);
899 
900   /* shrink tile size in case it would exceed singlebuffer size */
901   if((float)width * height * max_bpp * maxbuf > singlebuffer)
902   {
903     const float scale = singlebuffer / ((float)width * height * max_bpp * maxbuf);
904 
905     /* TODO: can we make this more efficient to minimize total overlap between tiles? */
906     if(width < height && scale >= 0.333f)
907     {
908       height = floorf(height * scale);
909     }
910     else if(height <= width && scale >= 0.333f)
911     {
912       width = floorf(width * scale);
913     }
914     else
915     {
916       width = floorf(width * sqrtf(scale));
917       height = floorf(height * sqrtf(scale));
918     }
919   }
920 
921   /* make sure we have a reasonably effective tile dimension. if not try square tiles */
922   if(3 * tiling.overlap > width || 3 * tiling.overlap > height)
923   {
924     width = height = floorf(sqrtf((float)width * height));
925   }
926 
927   /* Alignment rules: we need to make sure that alignment requirements of module are fulfilled.
928      Modules will report alignment requirements via xalign and yalign within tiling_callback().
929      Typical use case is demosaic where Bayer pattern requires alignment to a multiple of 2 in x and y
930      direction. */
931 
932   /* for simplicity reasons we use only one alignment that fits to x and y requirements at the same time */
933   const unsigned int xyalign = _lcm(tiling.xalign, tiling.yalign);
934 
935   assert(xyalign != 0);
936 
937   /* make sure that overlap follows alignment rules by making it wider when needed.
938      overlap_in needs to be aligned, overlap_out is only here to calculate output buffer size */
939   const int overlap_in = _align_up(tiling.overlap, xyalign);
940   const int overlap_out = ceilf((float)overlap_in / fullscale);
941 
942   int tiles_x = 1, tiles_y = 1;
943 
944   /* calculate number of tiles taking the larger buffer (input or output) as a guiding one.
945      normally it is roi_in > roi_out; but let's be prepared */
946   if(roi_in->width > roi_out->width)
947     tiles_x = width < roi_in->width
948                   ? ceilf((float)roi_in->width / (float)_max(width - 2 * overlap_in - inacc, 1))
949                   : 1;
950   else
951     tiles_x = width < roi_out->width ? ceilf((float)roi_out->width / (float)_max(width - 2 * overlap_out, 1))
952                                      : 1;
953 
954   if(roi_in->height > roi_out->height)
955     tiles_y = height < roi_in->height
956                   ? ceilf((float)roi_in->height / (float)_max(height - 2 * overlap_in - inacc, 1))
957                   : 1;
958   else
959     tiles_y = height < roi_out->height
960                   ? ceilf((float)roi_out->height / (float)_max(height - 2 * overlap_out, 1))
961                   : 1;
962 
963   /* sanity check: don't run wild on too many tiles */
964   if(tiles_x * tiles_y > dt_conf_get_int("maximum_number_tiles"))
965   {
966     dt_print(DT_DEBUG_DEV,
967              "[default_process_tiling_roi] gave up tiling for module '%s'. too many tiles: %d x %d\n",
968              self->op, tiles_x, tiles_y);
969     goto error;
970   }
971 
972 
973   /* calculate tile width and height excl. overlap (i.e. the good part) for output.
974      values are important for all following processing steps. */
975   const int tile_wd = _align_up(
976       roi_out->width % tiles_x == 0 ? roi_out->width / tiles_x : roi_out->width / tiles_x + 1, xyalign);
977   const int tile_ht = _align_up(
978       roi_out->height % tiles_y == 0 ? roi_out->height / tiles_y : roi_out->height / tiles_y + 1, xyalign);
979 
980   dt_print(DT_DEBUG_DEV,
981            "[default_process_tiling_roi] use tiling on module '%s' for image with full input size %d x %d\n",
982            self->op, roi_in->width, roi_in->height);
983   dt_print(DT_DEBUG_DEV, "[default_process_tiling_roi] (%d x %d) tiles with max dimensions %d x %d\n",
984            tiles_x, tiles_y, width, height);
985 
986 
987   /* store processed_maximum to be re-used and aggregated */
988   float processed_maximum_saved[4];
989   float processed_maximum_new[4] = { 1.0f };
990   for(int k = 0; k < 4; k++) processed_maximum_saved[k] = piece->pipe->dsc.processed_maximum[k];
991 
992   /* iterate over tiles */
993   for(size_t tx = 0; tx < tiles_x; tx++)
994     for(size_t ty = 0; ty < tiles_y; ty++)
995     {
996       piece->pipe->tiling = 1;
997 
998       /* the output dimensions of the good part of this specific tile */
999       size_t wd = (tx + 1) * tile_wd > roi_out->width ? roi_out->width - tx * tile_wd : tile_wd;
1000       size_t ht = (ty + 1) * tile_ht > roi_out->height ? roi_out->height - ty * tile_ht : tile_ht;
1001 
1002       /* roi_in and roi_out of good part: oroi_good easy to calculate based on number and dimension of tile.
1003          iroi_good is calculated by modify_roi_in() of respective module */
1004       dt_iop_roi_t iroi_good = { roi_in->x + tx * tile_wd, roi_in->y + ty * tile_ht, wd, ht, roi_in->scale };
1005       dt_iop_roi_t oroi_good
1006           = { roi_out->x + tx * tile_wd, roi_out->y + ty * tile_ht, wd, ht, roi_out->scale };
1007 
1008       self->modify_roi_in(self, piece, &oroi_good, &iroi_good);
1009 
1010       /* clamp iroi_good to not exceed roi_in */
1011       iroi_good.x = _max(iroi_good.x, roi_in->x);
1012       iroi_good.y = _max(iroi_good.y, roi_in->y);
1013       iroi_good.width = _min(iroi_good.width, roi_in->width + roi_in->x - iroi_good.x);
1014       iroi_good.height = _min(iroi_good.height, roi_in->height + roi_in->y - iroi_good.y);
1015 
1016       //_print_roi(&iroi_good, "tile iroi_good");
1017       //_print_roi(&oroi_good, "tile oroi_good");
1018 
1019       /* now we need to calculate full region of this tile: increase input roi to take care of overlap
1020          requirements
1021          and alignment and add additional delta to correct for possible rounding errors in modify_roi_in()
1022          -> generates first estimate of iroi_full */
1023       const int x_in = iroi_good.x;
1024       const int y_in = iroi_good.y;
1025       const int width_in = iroi_good.width;
1026       const int height_in = iroi_good.height;
1027       const int new_x_in = _max(_align_down(x_in - overlap_in - delta, xyalign), roi_in->x);
1028       const int new_y_in = _max(_align_down(y_in - overlap_in - delta, xyalign), roi_in->y);
1029       const int new_width_in = _min(_align_up(width_in + overlap_in + delta + (x_in - new_x_in), xyalign),
1030                                     roi_in->width + roi_in->x - new_x_in);
1031       const int new_height_in = _min(_align_up(height_in + overlap_in + delta + (y_in - new_y_in), xyalign),
1032                                      roi_in->height + roi_in->y - new_y_in);
1033 
1034       /* iroi_full based on calculated numbers and dimensions. oroi_full just set as a starting point for the
1035        * following iterative search */
1036       dt_iop_roi_t iroi_full = { new_x_in, new_y_in, new_width_in, new_height_in, iroi_good.scale };
1037       dt_iop_roi_t oroi_full = oroi_good; // a good starting point for optimization
1038 
1039       //_print_roi(&iroi_full, "tile iroi_full before optimization");
1040       //_print_roi(&oroi_full, "tile oroi_full before optimization");
1041 
1042       /* try to find a matching oroi_full */
1043       if(!_fit_output_to_input_roi(self, piece, &iroi_full, &oroi_full, delta, 10))
1044       {
1045         dt_print(DT_DEBUG_DEV, "[default_process_tiling_roi] can not handle requested roi's. tiling for "
1046                                "module '%s' not possible.\n",
1047                  self->op);
1048         goto error;
1049       }
1050 
1051       //_print_roi(&iroi_full, "tile iroi_full after optimization");
1052       //_print_roi(&oroi_full, "tile oroi_full after optimization");
1053 
1054       /* make sure that oroi_full at least covers the range of oroi_good.
1055          this step is needed due to the possibility of rounding errors */
1056       oroi_full.x = _min(oroi_full.x, oroi_good.x);
1057       oroi_full.y = _min(oroi_full.y, oroi_good.y);
1058       oroi_full.width = _max(oroi_full.width, oroi_good.x + oroi_good.width - oroi_full.x);
1059       oroi_full.height = _max(oroi_full.height, oroi_good.y + oroi_good.height - oroi_full.y);
1060 
1061       /* clamp oroi_full to not exceed roi_out */
1062       oroi_full.x = _max(oroi_full.x, roi_out->x);
1063       oroi_full.y = _max(oroi_full.y, roi_out->y);
1064       oroi_full.width = _min(oroi_full.width, roi_out->width + roi_out->x - oroi_full.x);
1065       oroi_full.height = _min(oroi_full.height, roi_out->height + roi_out->y - oroi_full.y);
1066 
1067       /* calculate final iroi_full */
1068       self->modify_roi_in(self, piece, &oroi_full, &iroi_full);
1069 
1070       /* clamp iroi_full to not exceed roi_in */
1071       iroi_full.x = _max(iroi_full.x, roi_in->x);
1072       iroi_full.y = _max(iroi_full.y, roi_in->y);
1073       iroi_full.width = _min(iroi_full.width, roi_in->width + roi_in->x - iroi_full.x);
1074       iroi_full.height = _min(iroi_full.height, roi_in->height + roi_in->y - iroi_full.y);
1075 
1076 
1077       //_print_roi(&iroi_full, "tile iroi_full final");
1078       //_print_roi(&oroi_full, "tile oroi_full final");
1079 
1080       /* offsets of tile into ivoid and ovoid */
1081       const size_t ioffs = ((size_t)iroi_full.y - roi_in->y) * ipitch + ((size_t)iroi_full.x - roi_in->x) * in_bpp;
1082       size_t ooffs = ((size_t)oroi_good.y - roi_out->y) * opitch
1083                      + ((size_t)oroi_good.x - roi_out->x) * out_bpp;
1084 
1085       dt_print(DT_DEBUG_DEV, "[default_process_tiling_roi] tile (%zu, %zu) with %d x %d at origin [%d, %d]\n",
1086                tx, ty, iroi_full.width, iroi_full.height, iroi_full.x, iroi_full.y);
1087 
1088 
1089       /* prepare input tile buffer */
1090       input = dt_alloc_align(64, (size_t)iroi_full.width * iroi_full.height * in_bpp);
1091       if(input == NULL)
1092       {
1093         dt_print(DT_DEBUG_DEV, "[default_process_tiling_roi] could not alloc input buffer for module '%s'\n",
1094                  self->op);
1095         goto error;
1096       }
1097       output = dt_alloc_align(64, (size_t)oroi_full.width * oroi_full.height * out_bpp);
1098       if(output == NULL)
1099       {
1100         dt_print(DT_DEBUG_DEV, "[default_process_tiling_roi] could not alloc output buffer for module '%s'\n",
1101                  self->op);
1102         goto error;
1103       }
1104 
1105 #ifdef _OPENMP
1106 #pragma omp parallel for default(none) \
1107       dt_omp_firstprivate(in_bpp, ipitch, ivoid) \
1108       dt_omp_sharedconst(ioffs) shared(input, iroi_full) \
1109       schedule(static)
1110 #endif
1111       for(size_t j = 0; j < iroi_full.height; j++)
1112         memcpy((char *)input + j * iroi_full.width * in_bpp, (char *)ivoid + ioffs + j * ipitch,
1113                (size_t)iroi_full.width * in_bpp);
1114 
1115       /* take original processed_maximum as starting point */
1116       for(int k = 0; k < 4; k++) piece->pipe->dsc.processed_maximum[k] = processed_maximum_saved[k];
1117 
1118       /* call process() of module */
1119       self->process(self, piece, input, output, &iroi_full, &oroi_full);
1120 
1121       /* aggregate resulting processed_maximum */
1122       /* TODO: check if there really can be differences between tiles and take
1123                appropriate action (calculate minimum, maximum, average, ...?) */
1124       for(int k = 0; k < 4; k++)
1125       {
1126         if(tx + ty > 0 && fabs(processed_maximum_new[k] - piece->pipe->dsc.processed_maximum[k]) > 1.0e-6f)
1127           dt_print(
1128               DT_DEBUG_DEV,
1129               "[default_process_tiling_roi] processed_maximum[%d] differs between tiles in module '%s'\n", k,
1130               self->op);
1131         processed_maximum_new[k] = piece->pipe->dsc.processed_maximum[k];
1132       }
1133 
1134       /* copy "good" part of tile to output buffer */
1135       const int origin_x = oroi_good.x - oroi_full.x;
1136       const int origin_y = oroi_good.y - oroi_full.y;
1137 #ifdef _OPENMP
1138 #pragma omp parallel for default(none) \
1139       dt_omp_firstprivate(opitch, origin_x, origin_y, out_bpp, ovoid) \
1140       shared(ooffs, output, oroi_good, oroi_full) \
1141       schedule(static)
1142 #endif
1143       for(size_t j = 0; j < oroi_good.height; j++)
1144         memcpy((char *)ovoid + ooffs + j * opitch,
1145                (char *)output + ((j + origin_y) * oroi_full.width + origin_x) * out_bpp,
1146                (size_t)oroi_good.width * out_bpp);
1147 
1148       dt_free_align(input);
1149       dt_free_align(output);
1150       input = output = NULL;
1151     }
1152 
1153   /* copy back final processed_maximum */
1154   for(int k = 0; k < 4; k++) piece->pipe->dsc.processed_maximum[k] = processed_maximum_new[k];
1155 
1156   if(input != NULL) dt_free_align(input);
1157   if(output != NULL) dt_free_align(output);
1158   piece->pipe->tiling = 0;
1159   return;
1160 
1161 error:
1162   dt_control_log(_("tiling failed for module '%s'. output might be garbled."), self->op);
1163 // fall through
1164 
1165 fallback:
1166   if(input != NULL) dt_free_align(input);
1167   if(output != NULL) dt_free_align(output);
1168   piece->pipe->tiling = 0;
1169   dt_print(DT_DEBUG_DEV, "[default_process_tiling_roi] fall back to standard processing for module '%s'\n",
1170            self->op);
1171   self->process(self, piece, ivoid, ovoid, roi_in, roi_out);
1172   return;
1173 }
1174 
1175 
1176 
1177 /* if a module does not implement process_tiling() by itself, this function is called instead.
1178    _default_process_tiling_ptp() is able to handle standard cases where pixels do not change their places.
1179    _default_process_tiling_roi() takes care of all other cases where image gets distorted and for module
1180    "clipping",
1181    "flip" as this may flip or mirror the image. */
default_process_tiling(struct dt_iop_module_t * self,struct dt_dev_pixelpipe_iop_t * piece,const void * const ivoid,void * const ovoid,const dt_iop_roi_t * const roi_in,const dt_iop_roi_t * const roi_out,const int in_bpp)1182 void default_process_tiling(struct dt_iop_module_t *self, struct dt_dev_pixelpipe_iop_t *piece,
1183                             const void *const ivoid, void *const ovoid, const dt_iop_roi_t *const roi_in,
1184                             const dt_iop_roi_t *const roi_out, const int in_bpp)
1185 {
1186   if(memcmp(roi_in, roi_out, sizeof(struct dt_iop_roi_t)) || (self->flags() & IOP_FLAGS_TILING_FULL_ROI))
1187     _default_process_tiling_roi(self, piece, ivoid, ovoid, roi_in, roi_out, in_bpp);
1188   else
1189     _default_process_tiling_ptp(self, piece, ivoid, ovoid, roi_in, roi_out, in_bpp);
1190   return;
1191 }
1192 
1193 
1194 
1195 #ifdef HAVE_OPENCL
1196 /* simple tiling algorithm for roi_in == roi_out, i.e. for pixel to pixel modules/operations */
_default_process_tiling_cl_ptp(struct dt_iop_module_t * self,struct dt_dev_pixelpipe_iop_t * piece,const void * const ivoid,void * const ovoid,const dt_iop_roi_t * const roi_in,const dt_iop_roi_t * const roi_out,const int in_bpp)1197 static int _default_process_tiling_cl_ptp(struct dt_iop_module_t *self, struct dt_dev_pixelpipe_iop_t *piece,
1198                                           const void *const ivoid, void *const ovoid,
1199                                           const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out,
1200                                           const int in_bpp)
1201 {
1202   cl_int err = -999;
1203   cl_mem input = NULL;
1204   cl_mem output = NULL;
1205   cl_mem pinned_input = NULL;
1206   cl_mem pinned_output = NULL;
1207   void *input_buffer = NULL;
1208   void *output_buffer = NULL;
1209 
1210   dt_iop_buffer_dsc_t dsc;
1211   self->output_format(self, piece->pipe, piece, &dsc);
1212   const int out_bpp = dt_iop_buffer_dsc_to_bpp(&dsc);
1213 
1214   const int devid = piece->pipe->devid;
1215   const int ipitch = roi_in->width * in_bpp;
1216   const int opitch = roi_out->width * out_bpp;
1217   const int max_bpp = _max(in_bpp, out_bpp);
1218 
1219   /* get tiling requirements of module */
1220   dt_develop_tiling_t tiling = { 0 };
1221   self->tiling_callback(self, piece, roi_in, roi_out, &tiling);
1222 
1223   /* shall we use pinned memory transfers? */
1224   int use_pinned_memory = dt_conf_get_bool("opencl_use_pinned_memory");
1225   const int pinned_buffer_overhead = use_pinned_memory ? 2 : 0; // add two additional pinned memory buffers
1226                                                                 // which seemingly get allocated not only on
1227                                                                 // host but also on device (why???)
1228   const float pinned_buffer_slack
1229       = use_pinned_memory
1230             ? 0.85f
1231             : 1.0f; // avoid problems when pinned buffer size gets too close to max_mem_alloc size
1232 
1233   /* calculate optimal size of tiles */
1234   float headroom = dt_conf_get_float("opencl_memory_headroom") * 1024.0f * 1024.0f;
1235   headroom = fmin(fmax(headroom, 0.0f), (float)darktable.opencl->dev[devid].max_global_mem);
1236   const float available = darktable.opencl->dev[devid].max_global_mem - headroom;
1237   const float factor = fmax(tiling.factor_cl + pinned_buffer_overhead, 1.0f);
1238   const float singlebuffer = fmin(fmax((available - tiling.overhead) / factor, 0.0f),
1239                                   pinned_buffer_slack * darktable.opencl->dev[devid].max_mem_alloc);
1240   const float maxbuf = fmax(tiling.maxbuf_cl, 1.0f);
1241   int width = _min(roi_in->width, darktable.opencl->dev[devid].max_image_width);
1242   int height = _min(roi_in->height, darktable.opencl->dev[devid].max_image_height);
1243 
1244   /* shrink tile size in case it would exceed singlebuffer size */
1245   if((float)width * height * max_bpp * maxbuf > singlebuffer)
1246   {
1247     const float scale = singlebuffer / ((float)width * height * max_bpp * maxbuf);
1248 
1249     if(width < height && scale >= 0.333f)
1250     {
1251       height = floorf(height * scale);
1252     }
1253     else if(height <= width && scale >= 0.333f)
1254     {
1255       width = floorf(width * scale);
1256     }
1257     else
1258     {
1259       width = floorf(width * sqrtf(scale));
1260       height = floorf(height * sqrtf(scale));
1261     }
1262   }
1263 
1264   /* make sure we have a reasonably effective tile dimension. if not try square tiles */
1265   if(3 * tiling.overlap > width || 3 * tiling.overlap > height)
1266   {
1267     width = height = floorf(sqrtf((float)width * height));
1268   }
1269 
1270 
1271   /* Alignment rules: we need to make sure that alignment requirements of module are fulfilled.
1272      Modules will report alignment requirements via xalign and yalign within tiling_callback().
1273      Typical use case is demosaic where Bayer pattern requires alignment to a multiple of 2 in x and y
1274      direction. Additional alignment requirements are set via definition of CL_ALIGNMENT.
1275      We guarantee alignment by selecting image width/height and overlap accordingly. For a tile width/height
1276      that is identical to image width/height no special alignment is done. */
1277 
1278   /* for simplicity reasons we use only one alignment that fits to x and y requirements at the same time */
1279   const unsigned int xyalign = _lcm(tiling.xalign, tiling.yalign);
1280 
1281   /* determining alignment requirement for tile width/height.
1282      in case of tile width also align according to definition of CL_ALIGNMENT */
1283   const unsigned int walign = _lcm(xyalign, CL_ALIGNMENT);
1284   const unsigned int halign = xyalign;
1285 
1286   assert(xyalign != 0 && walign != 0 && halign != 0);
1287 
1288   /* properly align tile width and height by making them smaller if needed */
1289   if(width < roi_in->width) width = (width / walign) * walign;
1290   if(height < roi_in->height) height = (height / halign) * halign;
1291 
1292   /* also make sure that overlap follows alignment rules by making it wider when needed */
1293   const int overlap = tiling.overlap % xyalign != 0 ? (tiling.overlap / xyalign + 1) * xyalign
1294                                                     : tiling.overlap;
1295 
1296 
1297   /* calculate effective tile size */
1298   const int tile_wd = width - 2 * overlap > 0 ? width - 2 * overlap : 1;
1299   const int tile_ht = height - 2 * overlap > 0 ? height - 2 * overlap : 1;
1300 
1301 
1302   /* calculate number of tiles */
1303   const int tiles_x = width < roi_in->width ? ceilf(roi_in->width / (float)tile_wd) : 1;
1304   const int tiles_y = height < roi_in->height ? ceilf(roi_in->height / (float)tile_ht) : 1;
1305 
1306   /* sanity check: don't run wild on too many tiles */
1307   if(tiles_x * tiles_y > dt_conf_get_int("maximum_number_tiles"))
1308   {
1309     dt_print(DT_DEBUG_OPENCL,
1310              "[default_process_tiling_cl_ptp] aborted tiling for module '%s'. too many tiles: %d x %d\n",
1311              self->op, tiles_x, tiles_y);
1312     return FALSE;
1313   }
1314 
1315 
1316   dt_print(DT_DEBUG_OPENCL,
1317            "[default_process_tiling_cl_ptp] use tiling on module '%s' for image with full size %d x %d\n",
1318            self->op, roi_in->width, roi_in->height);
1319   dt_print(DT_DEBUG_OPENCL,
1320            "[default_process_tiling_cl_ptp] (%d x %d) tiles with max dimensions %d x %d and overlap %d\n",
1321            tiles_x, tiles_y, width, height, overlap);
1322 
1323   /* store processed_maximum to be re-used and aggregated */
1324   float processed_maximum_saved[4];
1325   float processed_maximum_new[4] = { 1.0f };
1326   for(int k = 0; k < 4; k++) processed_maximum_saved[k] = piece->pipe->dsc.processed_maximum[k];
1327 
1328   /* reserve pinned input and output memory for host<->device data transfer */
1329   if(use_pinned_memory)
1330   {
1331     pinned_input = dt_opencl_alloc_device_buffer_with_flags(devid, (size_t)width * height * in_bpp,
1332                                                             CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR);
1333     if(pinned_input == NULL)
1334     {
1335       dt_print(DT_DEBUG_OPENCL,
1336                "[default_process_tiling_cl_ptp] could not alloc pinned input buffer for module '%s'\n",
1337                self->op);
1338       use_pinned_memory = 0;
1339     }
1340   }
1341 
1342   if(use_pinned_memory)
1343   {
1344 
1345     input_buffer = dt_opencl_map_buffer(devid, pinned_input, CL_TRUE, CL_MAP_WRITE, 0,
1346                                         (size_t)width * height * in_bpp);
1347     if(input_buffer == NULL)
1348     {
1349       dt_print(DT_DEBUG_OPENCL, "[default_process_tiling_cl_ptp] could not map pinned input buffer to host "
1350                                 "memory for module '%s'\n",
1351                self->op);
1352       use_pinned_memory = 0;
1353     }
1354   }
1355 
1356   if(use_pinned_memory)
1357   {
1358 
1359     pinned_output = dt_opencl_alloc_device_buffer_with_flags(devid, (size_t)width * height * out_bpp,
1360                                                              CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR);
1361     if(pinned_output == NULL)
1362     {
1363       dt_print(DT_DEBUG_OPENCL,
1364                "[default_process_tiling_cl_ptp] could not alloc pinned output buffer for module '%s'\n",
1365                self->op);
1366       use_pinned_memory = 0;
1367     }
1368   }
1369 
1370   if(use_pinned_memory)
1371   {
1372 
1373     output_buffer = dt_opencl_map_buffer(devid, pinned_output, CL_TRUE, CL_MAP_READ, 0,
1374                                          (size_t)width * height * out_bpp);
1375     if(output_buffer == NULL)
1376     {
1377       dt_print(DT_DEBUG_OPENCL, "[default_process_tiling_cl_ptp] could not map pinned output buffer to host "
1378                                 "memory for module '%s'\n",
1379                self->op);
1380       use_pinned_memory = 0;
1381     }
1382   }
1383 
1384   /* iterate over tiles */
1385   for(size_t tx = 0; tx < tiles_x; tx++)
1386     for(size_t ty = 0; ty < tiles_y; ty++)
1387     {
1388       piece->pipe->tiling = 1;
1389 
1390       const size_t wd = tx * tile_wd + width > roi_in->width ? roi_in->width - tx * tile_wd : width;
1391       const size_t ht = ty * tile_ht + height > roi_in->height ? roi_in->height - ty * tile_ht : height;
1392 
1393       /* no need to process (end)tiles that are smaller than the total overlap area */
1394       if((wd <= 2 * overlap && tx > 0) || (ht <= 2 * overlap && ty > 0)) continue;
1395 
1396       /* origin and region of effective part of tile, which we want to store later */
1397       size_t origin[] = { 0, 0, 0 };
1398       size_t region[] = { wd, ht, 1 };
1399 
1400       /* roi_in and roi_out for process_cl on subbuffer */
1401       dt_iop_roi_t iroi = { roi_in->x + tx * tile_wd, roi_in->y + ty * tile_ht, wd, ht, roi_in->scale };
1402       dt_iop_roi_t oroi = { roi_out->x + tx * tile_wd, roi_out->y + ty * tile_ht, wd, ht, roi_out->scale };
1403 
1404 
1405       /* offsets of tile into ivoid and ovoid */
1406       const size_t ioffs = (ty * tile_ht) * ipitch + (tx * tile_wd) * in_bpp;
1407       size_t ooffs = (ty * tile_ht) * opitch + (tx * tile_wd) * out_bpp;
1408 
1409 
1410       dt_print(DT_DEBUG_OPENCL,
1411                "[default_process_tiling_cl_ptp] tile (%zu, %zu) with %zu x %zu at origin [%zu, %zu]\n", tx, ty, wd,
1412                ht, tx * tile_wd, ty * tile_ht);
1413 
1414       /* get input and output buffers */
1415       input = dt_opencl_alloc_device(devid, wd, ht, in_bpp);
1416       if(input == NULL) goto error;
1417       output = dt_opencl_alloc_device(devid, wd, ht, out_bpp);
1418       if(output == NULL) goto error;
1419 
1420       if(use_pinned_memory)
1421       {
1422 /* prepare pinned input tile buffer: copy part of input image */
1423 #ifdef _OPENMP
1424 #pragma omp parallel for default(none) \
1425         dt_omp_firstprivate(in_bpp, ipitch, ivoid) \
1426         dt_omp_sharedconst(ioffs, wd, ht) shared(input_buffer, width) \
1427         schedule(static)
1428 #endif
1429         for(size_t j = 0; j < ht; j++)
1430           memcpy((char *)input_buffer + j * wd * in_bpp, (char *)ivoid + ioffs + j * ipitch,
1431                  (size_t)wd * in_bpp);
1432 
1433         /* blocking memory transfer: pinned host input buffer -> opencl/device tile */
1434         err = dt_opencl_write_host_to_device_raw(devid, (char *)input_buffer, input, origin, region,
1435                                                  wd * in_bpp, CL_TRUE);
1436         if(err != CL_SUCCESS) goto error;
1437       }
1438       else
1439       {
1440         /* blocking direct memory transfer: host input image -> opencl/device tile */
1441         err = dt_opencl_write_host_to_device_raw(devid, (char *)ivoid + ioffs, input, origin, region, ipitch,
1442                                                  CL_TRUE);
1443         if(err != CL_SUCCESS) goto error;
1444       }
1445 
1446       /* take original processed_maximum as starting point */
1447       for(int k = 0; k < 4; k++) piece->pipe->dsc.processed_maximum[k] = processed_maximum_saved[k];
1448 
1449       /* call process_cl of module */
1450       if(!self->process_cl(self, piece, input, output, &iroi, &oroi)) goto error;
1451 
1452       /* aggregate resulting processed_maximum */
1453       /* TODO: check if there really can be differences between tiles and take
1454                appropriate action (calculate minimum, maximum, average, ...?) */
1455       for(int k = 0; k < 4; k++)
1456       {
1457         if(tx + ty > 0 && fabs(processed_maximum_new[k] - piece->pipe->dsc.processed_maximum[k]) > 1.0e-6f)
1458           dt_print(
1459               DT_DEBUG_OPENCL,
1460               "[default_process_tiling_cl_ptp] processed_maximum[%d] differs between tiles in module '%s'\n",
1461               k, self->op);
1462         processed_maximum_new[k] = piece->pipe->dsc.processed_maximum[k];
1463       }
1464 
1465       if(use_pinned_memory)
1466       {
1467         /* blocking memory transfer: complete opencl/device tile -> pinned host output buffer */
1468         err = dt_opencl_read_host_from_device_raw(devid, (char *)output_buffer, output, origin, region,
1469                                                   wd * out_bpp, CL_TRUE);
1470         if(err != CL_SUCCESS) goto error;
1471       }
1472 
1473       /* correct origin and region of tile for overlap.
1474          makes sure that we only copy back the "good" part. */
1475       if(tx > 0)
1476       {
1477         origin[0] += overlap;
1478         region[0] -= overlap;
1479         ooffs += (size_t)overlap * out_bpp;
1480       }
1481       if(ty > 0)
1482       {
1483         origin[1] += overlap;
1484         region[1] -= overlap;
1485         ooffs += (size_t)overlap * opitch;
1486       }
1487 
1488       if(use_pinned_memory)
1489       {
1490 /* copy "good" part of tile from pinned output buffer to output image */
1491 #if 0 // def _OPENMP
1492 #pragma omp parallel for default(none) shared(ovoid, ooffs, output_buffer, width, origin, region,            \
1493                                               wd) schedule(static)
1494 #endif
1495         for(size_t j = 0; j < region[1]; j++)
1496           memcpy((char *)ovoid + ooffs + j * opitch,
1497                  (char *)output_buffer + ((j + origin[1]) * wd + origin[0]) * out_bpp,
1498                  (size_t)region[0] * out_bpp);
1499       }
1500       else
1501       {
1502         /* blocking direct memory transfer: good part of opencl/device tile -> host output image */
1503         err = dt_opencl_read_host_from_device_raw(devid, (char *)ovoid + ooffs, output, origin, region,
1504                                                   opitch, CL_TRUE);
1505         if(err != CL_SUCCESS) goto error;
1506       }
1507 
1508       /* release input and output buffers */
1509       dt_opencl_release_mem_object(input);
1510       input = NULL;
1511       dt_opencl_release_mem_object(output);
1512       output = NULL;
1513 
1514       /* block until opencl queue has finished to free all used event handlers */
1515       if(!darktable.opencl->async_pixelpipe || piece->pipe->type == DT_DEV_PIXELPIPE_EXPORT)
1516         dt_opencl_finish(devid);
1517     }
1518 
1519   /* copy back final processed_maximum */
1520   for(int k = 0; k < 4; k++) piece->pipe->dsc.processed_maximum[k] = processed_maximum_new[k];
1521 
1522   if(input_buffer != NULL) dt_opencl_unmap_mem_object(devid, pinned_input, input_buffer);
1523   dt_opencl_release_mem_object(pinned_input);
1524   if(output_buffer != NULL) dt_opencl_unmap_mem_object(devid, pinned_output, output_buffer);
1525   dt_opencl_release_mem_object(pinned_output);
1526   dt_opencl_release_mem_object(input);
1527   dt_opencl_release_mem_object(output);
1528   piece->pipe->tiling = 0;
1529   return TRUE;
1530 
1531 error:
1532   /* copy back stored processed_maximum */
1533   for(int k = 0; k < 4; k++) piece->pipe->dsc.processed_maximum[k] = processed_maximum_saved[k];
1534   if(input_buffer != NULL) dt_opencl_unmap_mem_object(devid, pinned_input, input_buffer);
1535   dt_opencl_release_mem_object(pinned_input);
1536   if(output_buffer != NULL) dt_opencl_unmap_mem_object(devid, pinned_output, output_buffer);
1537   dt_opencl_release_mem_object(pinned_output);
1538   dt_opencl_release_mem_object(input);
1539   dt_opencl_release_mem_object(output);
1540   piece->pipe->tiling = 0;
1541   dt_print(
1542       DT_DEBUG_OPENCL,
1543       "[default_process_tiling_opencl_ptp] couldn't run process_cl() for module '%s' in tiling mode: %d\n",
1544       self->op, err);
1545   return FALSE;
1546 }
1547 
1548 
1549 /* more elaborate tiling algorithm for roi_in != roi_out: slower than the ptp variant,
1550    more tiles and larger overlap */
_default_process_tiling_cl_roi(struct dt_iop_module_t * self,struct dt_dev_pixelpipe_iop_t * piece,const void * const ivoid,void * const ovoid,const dt_iop_roi_t * const roi_in,const dt_iop_roi_t * const roi_out,const int in_bpp)1551 static int _default_process_tiling_cl_roi(struct dt_iop_module_t *self, struct dt_dev_pixelpipe_iop_t *piece,
1552                                           const void *const ivoid, void *const ovoid,
1553                                           const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out,
1554                                           const int in_bpp)
1555 {
1556   cl_int err = -999;
1557   cl_mem input = NULL;
1558   cl_mem output = NULL;
1559   cl_mem pinned_input = NULL;
1560   cl_mem pinned_output = NULL;
1561   void *input_buffer = NULL;
1562   void *output_buffer = NULL;
1563 
1564 
1565   //_print_roi(roi_in, "module roi_in");
1566   //_print_roi(roi_out, "module roi_out");
1567 
1568   dt_iop_buffer_dsc_t dsc;
1569   self->output_format(self, piece->pipe, piece, &dsc);
1570   const int out_bpp = dt_iop_buffer_dsc_to_bpp(&dsc);
1571 
1572   const int devid = piece->pipe->devid;
1573   const int ipitch = roi_in->width * in_bpp;
1574   const int opitch = roi_out->width * out_bpp;
1575   const int max_bpp = _max(in_bpp, out_bpp);
1576 
1577   const float fullscale = fmax(roi_in->scale / roi_out->scale, sqrtf(((float)roi_in->width * roi_in->height)
1578                                                               / ((float)roi_out->width * roi_out->height)));
1579 
1580   /* inaccuracy for roi_in elements in roi_out -> roi_in calculations */
1581   const int delta = ceilf(fullscale);
1582 
1583   /* estimate for additional (space) requirement in buffer dimensions due to inaccuracies */
1584   const int inacc = RESERVE * delta;
1585 
1586   /* get tiling requirements of module */
1587   dt_develop_tiling_t tiling = { 0 };
1588   self->tiling_callback(self, piece, roi_in, roi_out, &tiling);
1589 
1590   /* shall we use pinned memory transfers? */
1591   int use_pinned_memory = dt_conf_get_bool("opencl_use_pinned_memory");
1592   const int pinned_buffer_overhead = use_pinned_memory ? 2 : 0; // add two additional pinned memory buffers
1593                                                                 // which seemingly get allocated not only on
1594                                                                 // host but also on device (why???)
1595   const float pinned_buffer_slack
1596       = use_pinned_memory
1597             ? 0.85f
1598             : 1.0f; // avoid problems when pinned buffer size gets too close to max_mem_alloc size
1599 
1600   /* calculate optimal size of tiles */
1601   float headroom = dt_conf_get_float("opencl_memory_headroom") * 1024.0f * 1024.0f;
1602   headroom = fmin(fmax(headroom, 0.0f), (float)darktable.opencl->dev[devid].max_global_mem);
1603   const float available = darktable.opencl->dev[devid].max_global_mem - headroom;
1604   const float factor = fmax(tiling.factor_cl + pinned_buffer_overhead, 1.0f);
1605   const float singlebuffer = fmin(fmax((available - tiling.overhead) / factor, 0.0f),
1606                                   pinned_buffer_slack * darktable.opencl->dev[devid].max_mem_alloc);
1607   const float maxbuf = fmax(tiling.maxbuf_cl, 1.0f);
1608 
1609   int width = _min(_max(roi_in->width, roi_out->width), darktable.opencl->dev[devid].max_image_width);
1610   int height = _min(_max(roi_in->height, roi_out->height), darktable.opencl->dev[devid].max_image_height);
1611 
1612   /* shrink tile size in case it would exceed singlebuffer size */
1613   if((float)width * height * max_bpp * maxbuf > singlebuffer)
1614   {
1615     const float scale = singlebuffer / ((float)width * height * max_bpp * maxbuf);
1616 
1617     if(width < height && scale >= 0.333f)
1618     {
1619       height = floorf(height * scale);
1620     }
1621     else if(height <= width && scale >= 0.333f)
1622     {
1623       width = floorf(width * scale);
1624     }
1625     else
1626     {
1627       width = floorf(width * sqrtf(scale));
1628       height = floorf(height * sqrtf(scale));
1629     }
1630   }
1631 
1632   /* make sure we have a reasonably effective tile dimension. if not try square tiles */
1633   if(3 * tiling.overlap > width || 3 * tiling.overlap > height)
1634   {
1635     width = height = floorf(sqrtf((float)width * height));
1636   }
1637 
1638 
1639   /* Alignment rules: we need to make sure that alignment requirements of module are fulfilled.
1640      Modules will report alignment requirements via xalign and yalign within tiling_callback().
1641      Typical use case is demosaic where Bayer pattern requires alignment to a multiple of 2 in x and y
1642      direction. Additional alignment requirements are set via definition of CL_ALIGNMENT. */
1643 
1644   /* for simplicity reasons we use only one alignment that fits to x and y requirements at the same time */
1645   unsigned int xyalign = _lcm(tiling.xalign, tiling.yalign);
1646   xyalign = _lcm(xyalign, CL_ALIGNMENT);
1647 
1648   assert(xyalign != 0);
1649 
1650   /* make sure that overlap follows alignment rules by making it wider when needed.
1651      overlap_in needs to be aligned, overlap_out is only here to calculate output buffer size */
1652   const int overlap_in = _align_up(tiling.overlap, xyalign);
1653   const int overlap_out = ceilf((float)overlap_in / fullscale);
1654 
1655   int tiles_x = 1, tiles_y = 1;
1656 
1657   /* calculate number of tiles taking the larger buffer (input or output) as a guiding one.
1658      normally it is roi_in > roi_out; but let's be prepared */
1659   if(roi_in->width > roi_out->width)
1660     tiles_x = width < roi_in->width
1661                   ? ceilf((float)roi_in->width / (float)_max(width - 2 * overlap_in - inacc, 1))
1662                   : 1;
1663   else
1664     tiles_x = width < roi_out->width ? ceilf((float)roi_out->width / (float)_max(width - 2 * overlap_out, 1))
1665                                      : 1;
1666 
1667   if(roi_in->height > roi_out->height)
1668     tiles_y = height < roi_in->height
1669                   ? ceilf((float)roi_in->height / (float)_max(height - 2 * overlap_in - inacc, 1))
1670                   : 1;
1671   else
1672     tiles_y = height < roi_out->height
1673                   ? ceilf((float)roi_out->height / (float)_max(height - 2 * overlap_out, 1))
1674                   : 1;
1675 
1676   /* sanity check: don't run wild on too many tiles */
1677   if(tiles_x * tiles_y > dt_conf_get_int("maximum_number_tiles"))
1678   {
1679     dt_print(DT_DEBUG_OPENCL,
1680              "[default_process_tiling_cl_roi] aborted tiling for module '%s'. too many tiles: %d x %d\n",
1681              self->op, tiles_x, tiles_y);
1682     return FALSE;
1683   }
1684 
1685   /* calculate tile width and height excl. overlap (i.e. the good part) for output.
1686      important for all following processing steps. */
1687   const int tile_wd = _align_up(
1688       roi_out->width % tiles_x == 0 ? roi_out->width / tiles_x : roi_out->width / tiles_x + 1, xyalign);
1689   const int tile_ht = _align_up(
1690       roi_out->height % tiles_y == 0 ? roi_out->height / tiles_y : roi_out->height / tiles_y + 1, xyalign);
1691 
1692 
1693   dt_print(
1694       DT_DEBUG_OPENCL,
1695       "[default_process_tiling_cl_roi] use tiling on module '%s' for image with full input size %d x %d\n",
1696       self->op, roi_in->width, roi_in->height);
1697   dt_print(DT_DEBUG_OPENCL,
1698            "[default_process_tiling_cl_roi] (%d x %d) tiles with max input dimensions %d x %d\n", tiles_x,
1699            tiles_y, width, height);
1700 
1701 
1702   /* store processed_maximum to be re-used and aggregated */
1703   float processed_maximum_saved[4];
1704   float processed_maximum_new[4] = { 1.0f };
1705   for(int k = 0; k < 4; k++) processed_maximum_saved[k] = piece->pipe->dsc.processed_maximum[k];
1706 
1707   /* reserve pinned input and output memory for host<->device data transfer */
1708   if(use_pinned_memory)
1709   {
1710     pinned_input = dt_opencl_alloc_device_buffer_with_flags(devid, (size_t)width * height * in_bpp,
1711                                                             CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR);
1712     if(pinned_input == NULL)
1713     {
1714       dt_print(DT_DEBUG_OPENCL,
1715                "[default_process_tiling_cl_roi] could not alloc pinned input buffer for module '%s'\n",
1716                self->op);
1717       use_pinned_memory = 0;
1718     }
1719   }
1720 
1721   if(use_pinned_memory)
1722   {
1723 
1724     input_buffer = dt_opencl_map_buffer(devid, pinned_input, CL_TRUE, CL_MAP_WRITE, 0,
1725                                         (size_t)width * height * in_bpp);
1726     if(input_buffer == NULL)
1727     {
1728       dt_print(DT_DEBUG_OPENCL, "[default_process_tiling_cl_roi] could not map pinned input buffer to host "
1729                                 "memory for module '%s'\n",
1730                self->op);
1731       use_pinned_memory = 0;
1732     }
1733   }
1734 
1735   if(use_pinned_memory)
1736   {
1737 
1738     pinned_output = dt_opencl_alloc_device_buffer_with_flags(devid, (size_t)width * height * out_bpp,
1739                                                              CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR);
1740     if(pinned_output == NULL)
1741     {
1742       dt_print(DT_DEBUG_OPENCL,
1743                "[default_process_tiling_cl_roi] could not alloc pinned output buffer for module '%s'\n",
1744                self->op);
1745       use_pinned_memory = 0;
1746     }
1747   }
1748 
1749   if(use_pinned_memory)
1750   {
1751 
1752     output_buffer = dt_opencl_map_buffer(devid, pinned_output, CL_TRUE, CL_MAP_READ, 0,
1753                                          (size_t)width * height * out_bpp);
1754     if(output_buffer == NULL)
1755     {
1756       dt_print(DT_DEBUG_OPENCL, "[default_process_tiling_cl_roi] could not map pinned output buffer to host "
1757                                 "memory for module '%s'\n",
1758                self->op);
1759       use_pinned_memory = 0;
1760     }
1761   }
1762 
1763 
1764   /* iterate over tiles */
1765   for(size_t tx = 0; tx < tiles_x; tx++)
1766     for(size_t ty = 0; ty < tiles_y; ty++)
1767     {
1768       piece->pipe->tiling = 1;
1769 
1770       /* the output dimensions of the good part of this specific tile */
1771       const size_t wd = (tx + 1) * tile_wd > roi_out->width ? roi_out->width - tx * tile_wd : tile_wd;
1772       const size_t ht = (ty + 1) * tile_ht > roi_out->height ? roi_out->height - ty * tile_ht : tile_ht;
1773 
1774       /* roi_in and roi_out of good part: oroi_good easy to calculate based on number and dimension of tile.
1775          iroi_good is calculated by modify_roi_in() of respective module */
1776       dt_iop_roi_t iroi_good = { roi_in->x + tx * tile_wd, roi_in->y + ty * tile_ht, wd, ht, roi_in->scale };
1777       dt_iop_roi_t oroi_good
1778           = { roi_out->x + tx * tile_wd, roi_out->y + ty * tile_ht, wd, ht, roi_out->scale };
1779 
1780       self->modify_roi_in(self, piece, &oroi_good, &iroi_good);
1781 
1782       /* clamp iroi_good to not exceed roi_in */
1783       iroi_good.x = _max(iroi_good.x, roi_in->x);
1784       iroi_good.y = _max(iroi_good.y, roi_in->y);
1785       iroi_good.width = _min(iroi_good.width, roi_in->width + roi_in->x - iroi_good.x);
1786       iroi_good.height = _min(iroi_good.height, roi_in->height + roi_in->y - iroi_good.y);
1787 
1788       //_print_roi(&iroi_good, "tile iroi_good");
1789       //_print_roi(&oroi_good, "tile oroi_good");
1790 
1791       /* now we need to calculate full region of this tile: increase input roi to take care of overlap
1792          requirements
1793          and alignment and add additional delta to correct for possible rounding errors in modify_roi_in()
1794          -> generates first estimate of iroi_full */
1795       const int x_in = iroi_good.x;
1796       const int y_in = iroi_good.y;
1797       const int width_in = iroi_good.width;
1798       const int height_in = iroi_good.height;
1799       const int new_x_in = _max(_align_down(x_in - overlap_in - delta, xyalign), roi_in->x);
1800       const int new_y_in = _max(_align_down(y_in - overlap_in - delta, xyalign), roi_in->y);
1801       const int new_width_in = _min(_align_up(width_in + overlap_in + delta + (x_in - new_x_in), xyalign),
1802                                     roi_in->width + roi_in->x - new_x_in);
1803       const int new_height_in = _min(_align_up(height_in + overlap_in + delta + (y_in - new_y_in), xyalign),
1804                                      roi_in->height + roi_in->y - new_y_in);
1805 
1806       /* iroi_full based on calculated numbers and dimensions. oroi_full just set as a starting point for the
1807        * following iterative search */
1808       dt_iop_roi_t iroi_full = { new_x_in, new_y_in, new_width_in, new_height_in, iroi_good.scale };
1809       dt_iop_roi_t oroi_full = oroi_good; // a good starting point for optimization
1810 
1811       //_print_roi(&iroi_full, "tile iroi_full before optimization");
1812       //_print_roi(&oroi_full, "tile oroi_full before optimization");
1813 
1814       /* try to find a matching oroi_full */
1815       if(!_fit_output_to_input_roi(self, piece, &iroi_full, &oroi_full, delta, 10))
1816       {
1817         dt_print(DT_DEBUG_OPENCL, "[default_process_tiling_cl_roi] can not handle requested roi's. tiling "
1818                                   "for module '%s' not possible.\n",
1819                  self->op);
1820         goto error;
1821       }
1822 
1823 
1824       /* make sure that oroi_full at least covers the range of oroi_good.
1825          this step is needed due to the possibility of rounding errors */
1826       oroi_full.x = _min(oroi_full.x, oroi_good.x);
1827       oroi_full.y = _min(oroi_full.y, oroi_good.y);
1828       oroi_full.width = _max(oroi_full.width, oroi_good.x + oroi_good.width - oroi_full.x);
1829       oroi_full.height = _max(oroi_full.height, oroi_good.y + oroi_good.height - oroi_full.y);
1830 
1831       /* clamp oroi_full to not exceed roi_out */
1832       oroi_full.x = _max(oroi_full.x, roi_out->x);
1833       oroi_full.y = _max(oroi_full.y, roi_out->y);
1834       oroi_full.width = _min(oroi_full.width, roi_out->width + roi_out->x - oroi_full.x);
1835       oroi_full.height = _min(oroi_full.height, roi_out->height + roi_out->y - oroi_full.y);
1836 
1837 
1838       /* calculate final iroi_full */
1839       self->modify_roi_in(self, piece, &oroi_full, &iroi_full);
1840 
1841       /* clamp iroi_full to not exceed roi_in */
1842       iroi_full.x = _max(iroi_full.x, roi_in->x);
1843       iroi_full.y = _max(iroi_full.y, roi_in->y);
1844       iroi_full.width = _min(iroi_full.width, roi_in->width + roi_in->x - iroi_full.x);
1845       iroi_full.height = _min(iroi_full.height, roi_in->height + roi_in->y - iroi_full.y);
1846 
1847       //_print_roi(&iroi_full, "tile iroi_full");
1848       //_print_roi(&oroi_full, "tile oroi_full");
1849 
1850       /* offsets of tile into ivoid and ovoid */
1851       const size_t ioffs = ((size_t)iroi_full.y - roi_in->y) * ipitch + ((size_t)iroi_full.x - roi_in->x) * in_bpp;
1852       const size_t ooffs = ((size_t)oroi_good.y - roi_out->y) * opitch
1853                            + ((size_t)oroi_good.x - roi_out->x) * out_bpp;
1854 
1855       dt_print(DT_DEBUG_OPENCL,
1856                "[default_process_tiling_cl_roi] tile (%zu, %zu) with %d x %d at origin [%d, %d]\n", tx, ty,
1857                iroi_full.width, iroi_full.height, iroi_full.x, iroi_full.y);
1858 
1859       /* origin and region of full input tile */
1860       size_t iorigin[] = { 0, 0, 0 };
1861       size_t iregion[] = { iroi_full.width, iroi_full.height, 1 };
1862 
1863       /* origin and region of full output tile */
1864       size_t oforigin[] = { 0, 0, 0 };
1865       size_t ofregion[] = { oroi_full.width, oroi_full.height, 1 };
1866 
1867       /* origin and region of good part of output tile */
1868       size_t oorigin[] = { oroi_good.x - oroi_full.x, oroi_good.y - oroi_full.y, 0 };
1869       size_t oregion[] = { oroi_good.width, oroi_good.height, 1 };
1870 
1871       /* get opencl input and output buffers */
1872       input = dt_opencl_alloc_device(devid, iroi_full.width, iroi_full.height, in_bpp);
1873       if(input == NULL) goto error;
1874 
1875       output = dt_opencl_alloc_device(devid, oroi_full.width, oroi_full.height, out_bpp);
1876       if(output == NULL) goto error;
1877 
1878       if(use_pinned_memory)
1879       {
1880 /* prepare pinned input tile buffer: copy part of input image */
1881 #ifdef _OPENMP
1882 #pragma omp parallel for default(none) \
1883         dt_omp_firstprivate(in_bpp, ipitch, ivoid) \
1884         dt_omp_sharedconst(ioffs) shared(input_buffer, width, iroi_full) schedule(static)
1885 #endif
1886         for(size_t j = 0; j < iroi_full.height; j++)
1887           memcpy((char *)input_buffer + j * iroi_full.width * in_bpp, (char *)ivoid + ioffs + j * ipitch,
1888                  (size_t)iroi_full.width * in_bpp);
1889 
1890         /* blocking memory transfer: pinned host input buffer -> opencl/device tile */
1891         err = dt_opencl_write_host_to_device_raw(devid, (char *)input_buffer, input, iorigin, iregion,
1892                                                  (size_t)iroi_full.width * in_bpp, CL_TRUE);
1893         if(err != CL_SUCCESS) goto error;
1894       }
1895       else
1896       {
1897         /* blocking direct memory transfer: host input image -> opencl/device tile */
1898         err = dt_opencl_write_host_to_device_raw(devid, (char *)ivoid + ioffs, input, iorigin, iregion,
1899                                                  ipitch, CL_TRUE);
1900         if(err != CL_SUCCESS) goto error;
1901       }
1902 
1903       /* take original processed_maximum as starting point */
1904       for(int k = 0; k < 4; k++) piece->pipe->dsc.processed_maximum[k] = processed_maximum_saved[k];
1905 
1906       /* call process_cl of module */
1907       if(!self->process_cl(self, piece, input, output, &iroi_full, &oroi_full)) goto error;
1908 
1909       /* aggregate resulting processed_maximum */
1910       /* TODO: check if there really can be differences between tiles and take
1911                appropriate action (calculate minimum, maximum, average, ...?) */
1912       for(int k = 0; k < 4; k++)
1913       {
1914         if(tx + ty > 0 && fabs(processed_maximum_new[k] - piece->pipe->dsc.processed_maximum[k]) > 1.0e-6f)
1915           dt_print(
1916               DT_DEBUG_OPENCL,
1917               "[default_process_tiling_cl_roi] processed_maximum[%d] differs between tiles in module '%s'\n",
1918               k, self->op);
1919         processed_maximum_new[k] = piece->pipe->dsc.processed_maximum[k];
1920       }
1921 
1922       if(use_pinned_memory)
1923       {
1924         /* blocking memory transfer: complete opencl/device tile -> pinned host output buffer */
1925         err = dt_opencl_read_host_from_device_raw(devid, (char *)output_buffer, output, oforigin, ofregion,
1926                                                   (size_t)oroi_full.width * out_bpp, CL_TRUE);
1927         if(err != CL_SUCCESS) goto error;
1928 
1929 /* copy "good" part of tile from pinned output buffer to output image */
1930 #ifdef _OPENMP
1931 #pragma omp parallel for default(none) \
1932         dt_omp_firstprivate(ipitch, opitch, ovoid, out_bpp) \
1933         dt_omp_sharedconst(ooffs) shared(output_buffer, oroi_full, oorigin, oregion) \
1934         schedule(static)
1935 #endif
1936         for(size_t j = 0; j < oregion[1]; j++)
1937           memcpy((char *)ovoid + ooffs + j * opitch,
1938                  (char *)output_buffer + ((j + oorigin[1]) * oroi_full.width + oorigin[0]) * out_bpp,
1939                  (size_t)oregion[0] * out_bpp);
1940       }
1941       else
1942       {
1943         /* blocking direct memory transfer: good part of opencl/device tile -> host output image */
1944         err = dt_opencl_read_host_from_device_raw(devid, (char *)ovoid + ooffs, output, oorigin, oregion,
1945                                                   opitch, CL_TRUE);
1946         if(err != CL_SUCCESS) goto error;
1947       }
1948 
1949       /* release input and output buffers */
1950       dt_opencl_release_mem_object(input);
1951       input = NULL;
1952       dt_opencl_release_mem_object(output);
1953       output = NULL;
1954 
1955       /* block until opencl queue has finished to free all used event handlers */
1956       if(!darktable.opencl->async_pixelpipe || piece->pipe->type == DT_DEV_PIXELPIPE_EXPORT)
1957         dt_opencl_finish(devid);
1958     }
1959 
1960   /* copy back final processed_maximum */
1961   for(int k = 0; k < 4; k++) piece->pipe->dsc.processed_maximum[k] = processed_maximum_new[k];
1962   if(input_buffer != NULL) dt_opencl_unmap_mem_object(devid, pinned_input, input_buffer);
1963   dt_opencl_release_mem_object(pinned_input);
1964   if(output_buffer != NULL) dt_opencl_unmap_mem_object(devid, pinned_output, output_buffer);
1965   dt_opencl_release_mem_object(pinned_output);
1966   dt_opencl_release_mem_object(input);
1967   dt_opencl_release_mem_object(output);
1968   piece->pipe->tiling = 0;
1969   return TRUE;
1970 
1971 error:
1972   /* copy back stored processed_maximum */
1973   for(int k = 0; k < 4; k++) piece->pipe->dsc.processed_maximum[k] = processed_maximum_saved[k];
1974   if(input_buffer != NULL) dt_opencl_unmap_mem_object(devid, pinned_input, input_buffer);
1975   dt_opencl_release_mem_object(pinned_input);
1976   if(output_buffer != NULL) dt_opencl_unmap_mem_object(devid, pinned_output, output_buffer);
1977   dt_opencl_release_mem_object(pinned_output);
1978   dt_opencl_release_mem_object(input);
1979   dt_opencl_release_mem_object(output);
1980   piece->pipe->tiling = 0;
1981   dt_print(
1982       DT_DEBUG_OPENCL,
1983       "[default_process_tiling_opencl_roi] couldn't run process_cl() for module '%s' in tiling mode: %d\n",
1984       self->op, err);
1985   return FALSE;
1986 }
1987 
1988 
1989 
1990 /* if a module does not implement process_tiling_cl() by itself, this function is called instead.
1991    _default_process_tiling_cl_ptp() is able to handle standard cases where pixels do not change their places.
1992    _default_process_tiling_cl_roi() takes care of all other cases where image gets distorted. */
default_process_tiling_cl(struct dt_iop_module_t * self,struct dt_dev_pixelpipe_iop_t * piece,const void * const ivoid,void * const ovoid,const dt_iop_roi_t * const roi_in,const dt_iop_roi_t * const roi_out,const int in_bpp)1993 int default_process_tiling_cl(struct dt_iop_module_t *self, struct dt_dev_pixelpipe_iop_t *piece,
1994                               const void *const ivoid, void *const ovoid, const dt_iop_roi_t *const roi_in,
1995                               const dt_iop_roi_t *const roi_out, const int in_bpp)
1996 {
1997   if(memcmp(roi_in, roi_out, sizeof(struct dt_iop_roi_t)) || (self->flags() & IOP_FLAGS_TILING_FULL_ROI))
1998     return _default_process_tiling_cl_roi(self, piece, ivoid, ovoid, roi_in, roi_out, in_bpp);
1999   else
2000     return _default_process_tiling_cl_ptp(self, piece, ivoid, ovoid, roi_in, roi_out, in_bpp);
2001 }
2002 
2003 #else
default_process_tiling_cl(struct dt_iop_module_t * self,struct dt_dev_pixelpipe_iop_t * piece,const void * const ivoid,void * const ovoid,const dt_iop_roi_t * const roi_in,const dt_iop_roi_t * const roi_out,const int in_bpp)2004 int default_process_tiling_cl(struct dt_iop_module_t *self, struct dt_dev_pixelpipe_iop_t *piece,
2005                               const void *const ivoid, void *const ovoid, const dt_iop_roi_t *const roi_in,
2006                               const dt_iop_roi_t *const roi_out, const int in_bpp)
2007 {
2008   return FALSE;
2009 }
2010 #endif
2011 
2012 
2013 /* If a module does not implement tiling_callback() by itself, this function is called instead.
2014    Default is an image size factor of 2 (i.e. input + output buffer needed), no overhead (1),
2015    no overlap between tiles, and an pixel alignment of 1 in x and y direction, i.e. no special
2016    alignment required. Simple pixel to pixel modules (take tonecurve as an example) can happily
2017    live with that.
2018    (1) Small overhead like look-up-tables in tonecurve can be ignored safely. */
default_tiling_callback(struct dt_iop_module_t * self,struct dt_dev_pixelpipe_iop_t * piece,const dt_iop_roi_t * roi_in,const dt_iop_roi_t * roi_out,struct dt_develop_tiling_t * tiling)2019 void default_tiling_callback(struct dt_iop_module_t *self, struct dt_dev_pixelpipe_iop_t *piece,
2020                              const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out,
2021                              struct dt_develop_tiling_t *tiling)
2022 {
2023   const float ioratio
2024       = ((float)roi_out->width * (float)roi_out->height) / ((float)roi_in->width * (float)roi_in->height);
2025 
2026   tiling->factor = 1.0f + ioratio;
2027   tiling->factor_cl = tiling->factor;  // by default, we need the same memory on host or GPU
2028   tiling->maxbuf = 1.0f;
2029   tiling->maxbuf_cl = tiling->maxbuf;
2030   tiling->overhead = 0;
2031   tiling->overlap = 0;
2032   tiling->xalign = 1;
2033   tiling->yalign = 1;
2034 
2035   if((self->flags() & IOP_FLAGS_TILING_FULL_ROI) == IOP_FLAGS_TILING_FULL_ROI) tiling->overlap = 4;
2036 
2037   if(self->iop_order > dt_ioppr_get_iop_order(piece->pipe->iop_order_list, "demosaic", 0)) return;
2038 
2039   // all operations that work with mosaiced data should respect pattern size!
2040 
2041   if(!piece->pipe->dsc.filters) return;
2042 
2043   if(piece->pipe->dsc.filters == 9u)
2044   {
2045     // X-Trans, sensor is 6x6
2046     tiling->xalign = 6;
2047     tiling->yalign = 6;
2048   }
2049   else
2050   {
2051     // Bayer, good old 2x2
2052     tiling->xalign = 2;
2053     tiling->yalign = 2;
2054   }
2055 
2056   return;
2057 }
2058 
dt_tiling_piece_fits_host_memory(const size_t width,const size_t height,const unsigned bpp,const float factor,const size_t overhead)2059 int dt_tiling_piece_fits_host_memory(const size_t width, const size_t height, const unsigned bpp,
2060                                      const float factor, const size_t overhead)
2061 {
2062   static int host_memory_limit = -1;
2063 
2064   /* first time run */
2065   if(host_memory_limit < 0)
2066   {
2067     host_memory_limit = dt_conf_get_int("host_memory_limit");
2068 
2069     /* don't let the user play games with us */
2070     if(host_memory_limit != 0) host_memory_limit = CLAMPI(host_memory_limit, 500, 50000);
2071     dt_conf_set_int("host_memory_limit", host_memory_limit);
2072   }
2073 
2074   const float requirement = factor * width * height * bpp + overhead;
2075 
2076   if(host_memory_limit == 0 || requirement <= host_memory_limit * 1024.0f * 1024.0f)
2077     return TRUE;
2078   else
2079     return FALSE;
2080 }
2081 
2082 // modelines: These editor modelines have been set for all relevant files by tools/update_modelines.sh
2083 // vim: shiftwidth=2 expandtab tabstop=2 cindent
2084 // kate: tab-indents: off; indent-width 2; replace-tabs on; indent-mode cstyle; remove-trailing-spaces modified;
2085