1 /*
2     This file is part of darktable,
3     Copyright (C) 2011-2020 darktable developers.
4 
5     darktable is free software: you can redistribute it and/or modify
6     it under the terms of the GNU General Public License as published by
7     the Free Software Foundation, either version 3 of the License, or
8     (at your option) any later version.
9 
10     darktable is distributed in the hope that it will be useful,
11     but WITHOUT ANY WARRANTY; without even the implied warranty of
12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13     GNU General Public License for more details.
14 
15     You should have received a copy of the GNU General Public License
16     along with darktable.  If not, see <http://www.gnu.org/licenses/>.
17 */
18 
19 
20 #include "develop/tiling.h"
21 #include "common/opencl.h"
22 #include "control/control.h"
23 #include "develop/blend.h"
24 #include "develop/pixelpipe.h"
25 
26 #include <assert.h>
27 #include <math.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <strings.h>
31 #include <unistd.h>
32 
33 #define CLAMPI(a, mn, mx) ((a) < (mn) ? (mn) : ((a) > (mx) ? (mx) : (a)))
34 
35 
36 /* this defines an additional alignment requirement for opencl image width.
37    It can have strong effects on processing speed. Reasonable values are a
38    power of 2. set to 1 for no effect. */
39 #define CL_ALIGNMENT 4
40 
41 /* parameter RESERVE for extended roi_in sizes due to inaccuracies when doing
42    roi_out -> roi_in estimations.
43    Needs to be increased if tiling fails due to insufficient buffer sizes. */
44 #define RESERVE 5
45 
46 
47 /* greatest common divisor */
_gcd(unsigned a,unsigned b)48 static unsigned _gcd(unsigned a, unsigned b)
49 {
50   unsigned t;
51   while(b != 0)
52   {
53     t = b;
54     b = a % b;
55     a = t;
56   }
57   return MAX(a, 1);
58 }
59 
60 /* least common multiple */
_lcm(unsigned a,unsigned b)61 static unsigned _lcm(unsigned a, unsigned b)
62 {
63   return (((unsigned long)a * b) / _gcd(a, b));
64 }
65 
66 
_min(int a,int b)67 static inline int _min(int a, int b)
68 {
69   return a < b ? a : b;
70 }
71 
_max(int a,int b)72 static inline int _max(int a, int b)
73 {
74   return a > b ? a : b;
75 }
76 
77 
_align_up(int n,int a)78 static inline int _align_up(int n, int a)
79 {
80   return n % a != 0 ? (n / a + 1) * a : n;
81 }
82 
_align_down(int n,int a)83 static inline int _align_down(int n, int a)
84 {
85   return n % a != 0 ? (n / a) * a : n;
86 }
87 
88 
_print_roi(const dt_iop_roi_t * roi,const char * label)89 void _print_roi(const dt_iop_roi_t *roi, const char *label)
90 {
91   printf("{ %5d  %5d  %5d  %5d  %.6f } %s\n", roi->x, roi->y, roi->width, roi->height, roi->scale, label);
92 }
93 
94 
95 #if 0
96 static void
97 _nm_constraints(double x[], int n)
98 {
99   x[0] = fabs(x[0]);
100   x[1] = fabs(x[1]);
101   x[2] = fabs(x[2]);
102   x[3] = fabs(x[3]);
103 
104   if(x[0] > 1.0) x[0] = 1.0 - x[0];
105   if(x[1] > 1.0) x[1] = 1.0 - x[1];
106   if(x[2] > 1.0) x[2] = 1.0 - x[2];
107   if(x[3] > 1.0) x[3] = 1.0 - x[3];
108 
109 }
110 #endif
111 
_nm_fitness(double x[],void * rest[])112 static double _nm_fitness(double x[], void *rest[])
113 {
114   struct dt_iop_module_t *self = (struct dt_iop_module_t *)rest[0];
115   struct dt_dev_pixelpipe_iop_t *piece = (struct dt_dev_pixelpipe_iop_t *)rest[1];
116   struct dt_iop_roi_t *iroi = (struct dt_iop_roi_t *)rest[2];
117   struct dt_iop_roi_t *oroi = (struct dt_iop_roi_t *)rest[3];
118 
119   dt_iop_roi_t oroi_test = *oroi;
120   oroi_test.x = x[0] * piece->iwidth;
121   oroi_test.y = x[1] * piece->iheight;
122   oroi_test.width = x[2] * piece->iwidth;
123   oroi_test.height = x[3] * piece->iheight;
124 
125   dt_iop_roi_t iroi_probe = *iroi;
126   self->modify_roi_in(self, piece, &oroi_test, &iroi_probe);
127 
128   double fitness = 0.0;
129 
130   fitness += (double)(iroi_probe.x - iroi->x) * (iroi_probe.x - iroi->x);
131   fitness += (double)(iroi_probe.y - iroi->y) * (iroi_probe.y - iroi->y);
132   fitness += (double)(iroi_probe.width - iroi->width) * (iroi_probe.width - iroi->width);
133   fitness += (double)(iroi_probe.height - iroi->height) * (iroi_probe.height - iroi->height);
134 
135   return fitness;
136 }
137 
138 
139 /* We use a Nelder-Mead simplex algorithm based on an implementation of Michael F. Hutt.
140    It is covered by the following copyright notice: */
141 /*
142  * Program: nmsimplex.c
143  * Author : Michael F. Hutt
144  * http://www.mikehutt.com
145  * 11/3/97
146  *
147  * An implementation of the Nelder-Mead simplex method.
148  *
149  * Copyright (c) 1997-2011 <Michael F. Hutt>
150  *
151  * Permission is hereby granted, free of charge, to any person obtaining
152  * a copy of this software and associated documentation files (the
153  * "Software"), to deal in the Software without restriction, including
154  * without limitation the rights to use, copy, modify, merge, publish,
155  * distribute, sublicense, and/or sell copies of the Software, and to
156  * permit persons to whom the Software is furnished to do so, subject to
157  * the following conditions:
158  *
159  * The above copyright notice and this permission notice shall be
160  * included in all copies or substantial portions of the Software.
161  *
162  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
163  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
164  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
165  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
166  * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
167  * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
168  * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
169  *
170  */
171 
172 #define MAX_IT 1000 /* maximum number of iterations */
173 #define ALPHA 1.0   /* reflection coefficient */
174 #define BETA 0.5    /* contraction coefficient */
175 #define GAMMA 2.0   /* expansion coefficient */
176 
_simplex(double (* objfunc)(double[],void * []),double start[],int n,double EPSILON,double scale,int maxiter,void (* constrain)(double[],int n),void * rest[])177 static int _simplex(double (*objfunc)(double[], void *[]), double start[], int n, double EPSILON,
178                     double scale, int maxiter, void (*constrain)(double[], int n), void *rest[])
179 {
180 
181   int vs; /* vertex with smallest value */
182   int vh; /* vertex with next smallest value */
183   int vg; /* vertex with largest value */
184 
185   int i, j = 0, m, row;
186   int k;   /* track the number of function evaluations */
187   int itr; /* track the number of iterations */
188 
189   double **v;    /* holds vertices of simplex */
190   double pn, qn; /* values used to create initial simplex */
191   double *f;     /* value of function at each vertex */
192   double fr;     /* value of function at reflection point */
193   double fe;     /* value of function at expansion point */
194   double fc;     /* value of function at contraction point */
195   double *vr;    /* reflection - coordinates */
196   double *ve;    /* expansion - coordinates */
197   double *vc;    /* contraction - coordinates */
198   double *vm;    /* centroid - coordinates */
199 
200   double fsum, favg, s, cent;
201 
202   /* dynamically allocate arrays */
203 
204   /* allocate the rows of the arrays */
205   v = (double **)malloc(sizeof(double *) * (n + 1));
206   f = (double *)malloc(sizeof(double) * (n + 1));
207   vr = (double *)malloc(sizeof(double) * n);
208   ve = (double *)malloc(sizeof(double) * n);
209   vc = (double *)malloc(sizeof(double) * n);
210   vm = (double *)malloc(sizeof(double) * n);
211 
212   /* allocate the columns of the arrays */
213   for(i = 0; i <= n; i++)
214   {
215     v[i] = (double *)malloc(sizeof(double) * n);
216   }
217 
218   /* create the initial simplex */
219   /* assume one of the vertices is 0,0 */
220 
221   pn = scale * (sqrt(n + 1) - 1 + n) / (n * sqrt(2));
222   qn = scale * (sqrt(n + 1) - 1) / (n * sqrt(2));
223 
224   for(i = 0; i < n; i++)
225   {
226     v[0][i] = start[i];
227   }
228 
229   for(i = 1; i <= n; i++)
230   {
231     for(j = 0; j < n; j++)
232     {
233       if(i - 1 == j)
234       {
235         v[i][j] = pn + start[j];
236       }
237       else
238       {
239         v[i][j] = qn + start[j];
240       }
241     }
242   }
243 
244   if(constrain != NULL)
245   {
246     constrain(v[j], n);
247   }
248   /* find the initial function values */
249   for(j = 0; j <= n; j++)
250   {
251     f[j] = objfunc(v[j], rest);
252   }
253 
254   k = n + 1;
255 
256 #if 0
257   /* print out the initial values */
258   printf ("Initial Values\n");
259   for (j = 0; j <= n; j++)
260   {
261     for (i = 0; i < n; i++)
262     {
263       printf ("%f %f\n", v[j][i], f[j]);
264     }
265   }
266 #endif
267 
268   /* begin the main loop of the minimization */
269   for(itr = 1; itr <= maxiter; itr++)
270   {
271     /* find the index of the largest value */
272     vg = 0;
273     for(j = 0; j <= n; j++)
274     {
275       if(f[j] > f[vg])
276       {
277         vg = j;
278       }
279     }
280 
281     /* find the index of the smallest value */
282     vs = 0;
283     for(j = 0; j <= n; j++)
284     {
285       if(f[j] < f[vs])
286       {
287         vs = j;
288       }
289     }
290 
291     /* find the index of the second largest value */
292     vh = vs;
293     for(j = 0; j <= n; j++)
294     {
295       if(f[j] > f[vh] && f[j] < f[vg])
296       {
297         vh = j;
298       }
299     }
300 
301     /* calculate the centroid */
302     for(j = 0; j <= n - 1; j++)
303     {
304       cent = 0.0;
305       for(m = 0; m <= n; m++)
306       {
307         if(m != vg)
308         {
309           cent += v[m][j];
310         }
311       }
312       vm[j] = cent / n;
313     }
314 
315     /* reflect vg to new vertex vr */
316     for(j = 0; j <= n - 1; j++)
317     {
318       /*vr[j] = (1+ALPHA)*vm[j] - ALPHA*v[vg][j]; */
319       vr[j] = vm[j] + ALPHA * (vm[j] - v[vg][j]);
320     }
321     if(constrain != NULL)
322     {
323       constrain(vr, n);
324     }
325     fr = objfunc(vr, rest);
326     k++;
327 
328     if(fr < f[vh] && fr >= f[vs])
329     {
330       for(j = 0; j <= n - 1; j++)
331       {
332         v[vg][j] = vr[j];
333       }
334       f[vg] = fr;
335     }
336 
337     /* investigate a step further in this direction */
338     if(fr < f[vs])
339     {
340       for(j = 0; j <= n - 1; j++)
341       {
342         /*ve[j] = GAMMA*vr[j] + (1-GAMMA)*vm[j]; */
343         ve[j] = vm[j] + GAMMA * (vr[j] - vm[j]);
344       }
345       if(constrain != NULL)
346       {
347         constrain(ve, n);
348       }
349       fe = objfunc(ve, rest);
350       k++;
351 
352       /* by making fe < fr as opposed to fe < f[vs],
353          Rosenbrocks function takes 63 iterations as opposed
354          to 64 when using double variables. */
355 
356       if(fe < fr)
357       {
358         for(j = 0; j <= n - 1; j++)
359         {
360           v[vg][j] = ve[j];
361         }
362         f[vg] = fe;
363       }
364       else
365       {
366         for(j = 0; j <= n - 1; j++)
367         {
368           v[vg][j] = vr[j];
369         }
370         f[vg] = fr;
371       }
372     }
373 
374     /* check to see if a contraction is necessary */
375     if(fr >= f[vh])
376     {
377       if(fr < f[vg] && fr >= f[vh])
378       {
379         /* perform outside contraction */
380         for(j = 0; j <= n - 1; j++)
381         {
382           /*vc[j] = BETA*v[vg][j] + (1-BETA)*vm[j]; */
383           vc[j] = vm[j] + BETA * (vr[j] - vm[j]);
384         }
385         if(constrain != NULL)
386         {
387           constrain(vc, n);
388         }
389         fc = objfunc(vc, rest);
390         k++;
391       }
392       else
393       {
394         /* perform inside contraction */
395         for(j = 0; j <= n - 1; j++)
396         {
397           /*vc[j] = BETA*v[vg][j] + (1-BETA)*vm[j]; */
398           vc[j] = vm[j] - BETA * (vm[j] - v[vg][j]);
399         }
400         if(constrain != NULL)
401         {
402           constrain(vc, n);
403         }
404         fc = objfunc(vc, rest);
405         k++;
406       }
407 
408 
409       if(fc < f[vg])
410       {
411         for(j = 0; j <= n - 1; j++)
412         {
413           v[vg][j] = vc[j];
414         }
415         f[vg] = fc;
416       }
417       /* at this point the contraction is not successful,
418          we must halve the distance from vs to all the
419          vertices of the simplex and then continue.
420          10/31/97 - modified to account for ALL vertices.
421        */
422       else
423       {
424         for(row = 0; row <= n; row++)
425         {
426           if(row != vs)
427           {
428             for(j = 0; j <= n - 1; j++)
429             {
430               v[row][j] = v[vs][j] + (v[row][j] - v[vs][j]) / 2.0;
431             }
432           }
433         }
434         if(constrain != NULL)
435         {
436           constrain(v[vg], n);
437         }
438         f[vg] = objfunc(v[vg], rest);
439         k++;
440         if(constrain != NULL)
441         {
442           constrain(v[vh], n);
443         }
444         f[vh] = objfunc(v[vh], rest);
445         k++;
446       }
447     }
448 
449 #if 0
450     /* print out the value at each iteration */
451     printf ("Iteration %d\n", itr);
452     for (j = 0; j <= n; j++)
453     {
454       for (i = 0; i < n; i++)
455       {
456         printf ("%f %f\n", v[j][i], f[j]);
457       }
458     }
459 #endif
460 
461     /* test for convergence */
462     fsum = 0.0;
463     for(j = 0; j <= n; j++)
464     {
465       fsum += f[j];
466     }
467     favg = fsum / (n + 1);
468     s = 0.0;
469     for(j = 0; j <= n; j++)
470     {
471       s += pow((f[j] - favg), 2.0) / (n);
472     }
473     s = sqrt(s);
474     if(s < EPSILON) break;
475   }
476   /* end main loop of the minimization */
477 
478   /* find the index of the smallest value */
479   vs = 0;
480   for(j = 0; j <= n; j++)
481   {
482     if(f[j] < f[vs])
483     {
484       vs = j;
485     }
486   }
487 
488 #if 0
489   printf ("The minimum was found at\n");
490   for (j = 0; j < n; j++)
491   {
492     printf ("%e\n", v[vs][j]);
493     start[j] = v[vs][j];
494   }
495   double min = objfunc (v[vs], rest);
496   printf ("Function value at minimum %f\n", min);
497   k++;
498   printf ("%d Function Evaluations\n", k);
499   printf ("%d Iterations through program\n", itr);
500 #endif
501 
502   free(f);
503   free(vr);
504   free(ve);
505   free(vc);
506   free(vm);
507   for(i = 0; i <= n; i++)
508   {
509     free(v[i]);
510   }
511   free(v);
512   return itr;
513 }
514 
515 
_nm_fit_output_to_input_roi(struct dt_iop_module_t * self,struct dt_dev_pixelpipe_iop_t * piece,const dt_iop_roi_t * iroi,dt_iop_roi_t * oroi,int delta)516 static int _nm_fit_output_to_input_roi(struct dt_iop_module_t *self, struct dt_dev_pixelpipe_iop_t *piece,
517                                        const dt_iop_roi_t *iroi, dt_iop_roi_t *oroi, int delta)
518 {
519   void *rest[4] = { (void *)self, (void *)piece, (void *)iroi, (void *)oroi };
520   double start[4] = { (float)oroi->x / piece->iwidth, (float)oroi->y / piece->iheight,
521                       (float)oroi->width / piece->iwidth, (float)oroi->height / piece->iheight };
522   double epsilon = (double)delta / MIN(piece->iwidth, piece->iheight);
523   int maxiter = 1000;
524 
525   int iter = _simplex(_nm_fitness, start, 4, epsilon, 1.0, maxiter, NULL, rest);
526 
527   // printf("_simplex: %d, delta: %d, epsilon: %f\n", iter, delta, epsilon);
528 
529   oroi->x = start[0] * piece->iwidth;
530   oroi->y = start[1] * piece->iheight;
531   oroi->width = start[2] * piece->iwidth;
532   oroi->height = start[3] * piece->iheight;
533 
534   return (iter <= maxiter);
535 }
536 
537 
538 
539 /* find a matching oroi_full by probing start value of oroi and get corresponding input roi into iroi_probe.
540    We search in two steps. first by a simplicistic iterative search which will succeed in most cases.
541    If this does not converge, we do a downhill simplex (nelder-mead) fitting */
_fit_output_to_input_roi(struct dt_iop_module_t * self,struct dt_dev_pixelpipe_iop_t * piece,const dt_iop_roi_t * iroi,dt_iop_roi_t * oroi,int delta,int iter)542 static int _fit_output_to_input_roi(struct dt_iop_module_t *self, struct dt_dev_pixelpipe_iop_t *piece,
543                                     const dt_iop_roi_t *iroi, dt_iop_roi_t *oroi, int delta, int iter)
544 {
545   dt_iop_roi_t iroi_probe = *iroi;
546   dt_iop_roi_t save_oroi = *oroi;
547 
548   // try to go the easy way. this works in many cases where output is
549   // just like input, only scaled down
550   self->modify_roi_in(self, piece, oroi, &iroi_probe);
551   while((abs((int)iroi_probe.x - (int)iroi->x) > delta || abs((int)iroi_probe.y - (int)iroi->y) > delta
552          || abs((int)iroi_probe.width - (int)iroi->width) > delta
553          || abs((int)iroi_probe.height - (int)iroi->height) > delta) && iter > 0)
554   {
555     //_print_roi(&iroi_probe, "tile iroi_probe");
556     //_print_roi(oroi, "tile oroi old");
557 
558     oroi->x += (iroi->x - iroi_probe.x) * oroi->scale / iroi->scale;
559     oroi->y += (iroi->y - iroi_probe.y) * oroi->scale / iroi->scale;
560     oroi->width += (iroi->width - iroi_probe.width) * oroi->scale / iroi->scale;
561     oroi->height += (iroi->height - iroi_probe.height) * oroi->scale / iroi->scale;
562 
563     //_print_roi(oroi, "tile oroi new");
564 
565     self->modify_roi_in(self, piece, oroi, &iroi_probe);
566     iter--;
567   }
568 
569   if(iter > 0) return TRUE;
570 
571   *oroi = save_oroi;
572 
573   // simplicistic approach did not converge.
574   // try simplex downhill fitting now.
575   // it's crucial that we have a good starting point in oroi, else this
576   // will not converge as well.
577   int fit = _nm_fit_output_to_input_roi(self, piece, iroi, oroi, delta);
578   return fit;
579 }
580 
581 
582 /* simple tiling algorithm for roi_in == roi_out, i.e. for pixel to pixel modules/operations */
_default_process_tiling_ptp(struct dt_iop_module_t * self,struct dt_dev_pixelpipe_iop_t * piece,const void * const ivoid,void * const ovoid,const dt_iop_roi_t * const roi_in,const dt_iop_roi_t * const roi_out,const int in_bpp)583 static void _default_process_tiling_ptp(struct dt_iop_module_t *self, struct dt_dev_pixelpipe_iop_t *piece,
584                                         const void *const ivoid, void *const ovoid,
585                                         const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out,
586                                         const int in_bpp)
587 {
588   void *input = NULL;
589   void *output = NULL;
590   dt_iop_buffer_dsc_t dsc;
591   self->output_format(self, piece->pipe, piece, &dsc);
592   const int out_bpp = dt_iop_buffer_dsc_to_bpp(&dsc);
593 
594   const int ipitch = roi_in->width * in_bpp;
595   const int opitch = roi_out->width * out_bpp;
596   const int max_bpp = _max(in_bpp, out_bpp);
597 
598   /* get tiling requirements of module */
599   dt_develop_tiling_t tiling = { 0 };
600   self->tiling_callback(self, piece, roi_in, roi_out, &tiling);
601 
602   /* shall we enforce tiling */
603   const gboolean force_tile = (darktable.unmuted & DT_DEBUG_TILING);
604   /* tiling really does not make sense in these cases. standard process() is not better or worse than we are
605    */
606   if((tiling.factor < 2.2f && tiling.overhead < 0.2f * roi_in->width * roi_in->height * max_bpp) && !force_tile)
607   {
608     dt_print(DT_DEBUG_DEV, "[default_process_tiling_ptp] no need to use tiling for module '%s' as no real "
609                            "memory saving to be expected\n",
610              self->op);
611     goto fallback;
612   }
613 
614   /* calculate optimal size of tiles */
615   float available = force_tile ? 500.0f * 1024.0f * 1024.0f : dt_conf_get_float("host_memory_limit") * 1024.0f * 1024.0f;
616   assert(available >= 500.0f * 1024.0f * 1024.0f);
617   /* correct for size of ivoid and ovoid which are needed on top of tiling */
618   available = fmax(available - ((float)roi_out->width * roi_out->height * out_bpp)
619                    - ((float)roi_in->width * roi_in->height * in_bpp) - tiling.overhead,
620                    0);
621 
622   /* we ignore the above value if singlebuffer_limit (is defined and) is higher than available/tiling.factor.
623      this will mainly allow tiling for modules with high and "unpredictable" memory demand which is
624      reflected in high values of tiling.factor (take bilateral noise reduction as an example). */
625   float singlebuffer = force_tile ? 2.0f * 1024.0f * 1024.0f : dt_conf_get_float("singlebuffer_limit") * 1024.0f * 1024.0f;
626   singlebuffer = fmax(singlebuffer, 2.0f * 1024.0f * 1024.0f);
627   const float factor = fmax(tiling.factor, 1.0f);
628   const float maxbuf = fmax(tiling.maxbuf, 1.0f);
629   singlebuffer = fmax(available / factor, singlebuffer);
630 
631   int width = roi_in->width;
632   int height = roi_in->height;
633 
634   /* shrink tile size in case it would exceed singlebuffer size */
635   if((float)width * height * max_bpp * maxbuf > singlebuffer)
636   {
637     const float scale = singlebuffer / ((float)width * height * max_bpp * maxbuf);
638 
639     /* TODO: can we make this more efficient to minimize total overlap between tiles? */
640     if(width < height && scale >= 0.333f)
641     {
642       height = floorf(height * scale);
643     }
644     else if(height <= width && scale >= 0.333f)
645     {
646       width = floorf(width * scale);
647     }
648     else
649     {
650       width = floorf(width * sqrtf(scale));
651       height = floorf(height * sqrtf(scale));
652     }
653   }
654 
655   /* make sure we have a reasonably effective tile dimension. if not try square tiles */
656   if(3 * tiling.overlap > width || 3 * tiling.overlap > height)
657   {
658     width = height = floorf(sqrtf((float)width * height));
659   }
660 
661   /* Alignment rules: we need to make sure that alignment requirements of module are fulfilled.
662      Modules will report alignment requirements via xalign and yalign within tiling_callback().
663      Typical use case is demosaic where Bayer pattern requires alignment to a multiple of 2 in x and y
664      direction.
665      We guarantee alignment by selecting image width/height and overlap accordingly. For a tile width/height
666      that is identical to image width/height no special alignment is needed. */
667 
668   const unsigned int xyalign = _lcm(tiling.xalign, tiling.yalign);
669 
670   assert(xyalign != 0);
671 
672   /* properly align tile width and height by making them smaller if needed */
673   if(width < roi_in->width) width = (width / xyalign) * xyalign;
674   if(height < roi_in->height) height = (height / xyalign) * xyalign;
675 
676   /* also make sure that overlap follows alignment rules by making it wider when needed */
677   const int overlap = tiling.overlap % xyalign != 0 ? (tiling.overlap / xyalign + 1) * xyalign
678                                                     : tiling.overlap;
679 
680   /* calculate effective tile size */
681   const int tile_wd = width - 2 * overlap > 0 ? width - 2 * overlap : 1;
682   const int tile_ht = height - 2 * overlap > 0 ? height - 2 * overlap : 1;
683 
684   /* calculate number of tiles */
685   const int tiles_x = width < roi_in->width ? ceilf(roi_in->width / (float)tile_wd) : 1;
686   const int tiles_y = height < roi_in->height ? ceilf(roi_in->height / (float)tile_ht) : 1;
687 
688   /* sanity check: don't run wild on too many tiles */
689   if(tiles_x * tiles_y > dt_conf_get_int("maximum_number_tiles"))
690   {
691     dt_print(DT_DEBUG_DEV,
692              "[default_process_tiling_ptp] gave up tiling for module '%s'. too many tiles: %d x %d\n",
693              self->op, tiles_x, tiles_y);
694     goto error;
695   }
696 
697 
698   dt_print(DT_DEBUG_DEV,
699            "[default_process_tiling_ptp] use tiling on module '%s' for image with full size %d x %d\n",
700            self->op, roi_in->width, roi_in->height);
701   dt_print(DT_DEBUG_DEV,
702            "[default_process_tiling_ptp] (%d x %d) tiles with max dimensions %d x %d and overlap %d\n",
703            tiles_x, tiles_y, width, height, overlap);
704 
705   /* reserve input and output buffers for tiles */
706   input = dt_alloc_align(64, (size_t)width * height * in_bpp);
707   if(input == NULL)
708   {
709     dt_print(DT_DEBUG_DEV, "[default_process_tiling_ptp] could not alloc input buffer for module '%s'\n",
710              self->op);
711     goto error;
712   }
713   output = dt_alloc_align(64, (size_t)width * height * out_bpp);
714   if(output == NULL)
715   {
716     dt_print(DT_DEBUG_DEV, "[default_process_tiling_ptp] could not alloc output buffer for module '%s'\n",
717              self->op);
718     goto error;
719   }
720 
721   /* store processed_maximum to be re-used and aggregated */
722   dt_aligned_pixel_t processed_maximum_saved;
723   dt_aligned_pixel_t processed_maximum_new = { 1.0f };
724   for_four_channels(k) processed_maximum_saved[k] = piece->pipe->dsc.processed_maximum[k];
725 
726   /* iterate over tiles */
727   for(size_t tx = 0; tx < tiles_x; tx++)
728   {
729     const size_t wd = tx * tile_wd + width > roi_in->width ? roi_in->width - tx * tile_wd : width;
730     for(size_t ty = 0; ty < tiles_y; ty++)
731     {
732       piece->pipe->tiling = 1;
733 
734       const size_t ht = ty * tile_ht + height > roi_in->height ? roi_in->height - ty * tile_ht : height;
735 
736       /* no need to process end-tiles that are smaller than the total overlap area */
737       if((wd <= 2 * overlap && tx > 0) || (ht <= 2 * overlap && ty > 0)) continue;
738 
739       /* origin and region of effective part of tile, which we want to store later */
740       size_t origin[] = { 0, 0, 0 };
741       size_t region[] = { wd, ht, 1 };
742 
743       /* roi_in and roi_out for process_cl on subbuffer */
744       dt_iop_roi_t iroi = { roi_in->x + tx * tile_wd, roi_in->y + ty * tile_ht, wd, ht, roi_in->scale };
745       dt_iop_roi_t oroi = { roi_out->x + tx * tile_wd, roi_out->y + ty * tile_ht, wd, ht, roi_out->scale };
746 
747       /* offsets of tile into ivoid and ovoid */
748       const size_t ioffs = (ty * tile_ht) * ipitch + (tx * tile_wd) * in_bpp;
749       size_t ooffs = (ty * tile_ht) * opitch + (tx * tile_wd) * out_bpp;
750 
751 
752       dt_print(DT_DEBUG_DEV, "[default_process_tiling_ptp] tile (%zu, %zu) with %zu x %zu at origin [%zu, %zu]\n",
753                tx, ty, wd, ht, tx * tile_wd, ty * tile_ht);
754 
755 /* prepare input tile buffer */
756 #ifdef _OPENMP
757 #pragma omp parallel for default(none) \
758       dt_omp_firstprivate(ht, in_bpp, ipitch, ivoid, wd) \
759       dt_omp_sharedconst(ioffs) \
760       shared(input, width) \
761       schedule(static)
762 #endif
763       for(size_t j = 0; j < ht; j++)
764         memcpy((char *)input + j * wd * in_bpp, (char *)ivoid + ioffs + j * ipitch, (size_t)wd * in_bpp);
765 
766       /* take original processed_maximum as starting point */
767       for(int k = 0; k < 4; k++) piece->pipe->dsc.processed_maximum[k] = processed_maximum_saved[k];
768 
769       /* call process() of module */
770       self->process(self, piece, input, output, &iroi, &oroi);
771 
772       /* aggregate resulting processed_maximum */
773       /* TODO: check if there really can be differences between tiles and take
774                appropriate action (calculate minimum, maximum, average, ...?) */
775       for(int k = 0; k < 4; k++)
776       {
777         if(tx + ty > 0 && fabs(processed_maximum_new[k] - piece->pipe->dsc.processed_maximum[k]) > 1.0e-6f)
778           dt_print(
779               DT_DEBUG_DEV,
780               "[default_process_tiling_ptp] processed_maximum[%d] differs between tiles in module '%s'\n", k,
781               self->op);
782         processed_maximum_new[k] = piece->pipe->dsc.processed_maximum[k];
783       }
784 
785       /* correct origin and region of tile for overlap.
786          make sure that we only copy back the "good" part. */
787       if(tx > 0)
788       {
789         origin[0] += overlap;
790         region[0] -= overlap;
791         ooffs += (size_t)overlap * out_bpp;
792       }
793       if(ty > 0)
794       {
795         origin[1] += overlap;
796         region[1] -= overlap;
797         ooffs += (size_t)overlap * opitch;
798       }
799 
800 /* copy "good" part of tile to output buffer */
801 #ifdef _OPENMP
802 #pragma omp parallel for default(none) \
803       dt_omp_firstprivate(opitch, out_bpp, ovoid, wd) \
804       shared(ooffs, output, width, origin, region) \
805       schedule(static)
806 #endif
807       for(size_t j = 0; j < region[1]; j++)
808         memcpy((char *)ovoid + ooffs + j * opitch,
809                (char *)output + ((j + origin[1]) * wd + origin[0]) * out_bpp, (size_t)region[0] * out_bpp);
810     }
811   }
812 
813   /* copy back final processed_maximum */
814   for(int k = 0; k < 4; k++) piece->pipe->dsc.processed_maximum[k] = processed_maximum_new[k];
815 
816   if(input != NULL) dt_free_align(input);
817   if(output != NULL) dt_free_align(output);
818   piece->pipe->tiling = 0;
819   return;
820 
821 error:
822   dt_control_log(_("tiling failed for module '%s'. output might be garbled."), self->op);
823 // fall through
824 
825 fallback:
826   if(input != NULL) dt_free_align(input);
827   if(output != NULL) dt_free_align(output);
828   piece->pipe->tiling = 0;
829   dt_print(DT_DEBUG_DEV, "[default_process_tiling_ptp] fall back to standard processing for module '%s'\n",
830            self->op);
831   self->process(self, piece, ivoid, ovoid, roi_in, roi_out);
832   return;
833 }
834 
835 
836 
837 /* more elaborate tiling algorithm for roi_in != roi_out: slower than the ptp variant,
838    more tiles and larger overlap */
_default_process_tiling_roi(struct dt_iop_module_t * self,struct dt_dev_pixelpipe_iop_t * piece,const void * const ivoid,void * const ovoid,const dt_iop_roi_t * const roi_in,const dt_iop_roi_t * const roi_out,const int in_bpp)839 static void _default_process_tiling_roi(struct dt_iop_module_t *self, struct dt_dev_pixelpipe_iop_t *piece,
840                                         const void *const ivoid, void *const ovoid,
841                                         const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out,
842                                         const int in_bpp)
843 {
844   void *input = NULL;
845   void *output = NULL;
846 
847   //_print_roi(roi_in, "module roi_in");
848   //_print_roi(roi_out, "module roi_out");
849 
850   dt_iop_buffer_dsc_t dsc;
851   self->output_format(self, piece->pipe, piece, &dsc);
852   const int out_bpp = dt_iop_buffer_dsc_to_bpp(&dsc);
853 
854   const int ipitch = roi_in->width * in_bpp;
855   const int opitch = roi_out->width * out_bpp;
856   const int max_bpp = _max(in_bpp, out_bpp);
857 
858   float fullscale = fmax(roi_in->scale / roi_out->scale, sqrtf(((float)roi_in->width * roi_in->height)
859                                                               / ((float)roi_out->width * roi_out->height)));
860 
861   /* inaccuracy for roi_in elements in roi_out -> roi_in calculations */
862   const int delta = ceilf(fullscale);
863 
864   /* estimate for additional (space) requirement in buffer dimensions due to inaccuracies */
865   const int inacc = RESERVE * delta;
866 
867   /* get tiling requirements of module */
868   dt_develop_tiling_t tiling = { 0 };
869   self->tiling_callback(self, piece, roi_in, roi_out, &tiling);
870 
871   /* shall we enforce tiling */
872   const gboolean force_tile = (darktable.unmuted & DT_DEBUG_TILING);
873   /* tiling really does not make sense in these cases. standard process() is not better or worse than we are
874    */
875   if((tiling.factor < 2.2f && tiling.overhead < 0.2f * roi_in->width * roi_in->height * max_bpp) && !force_tile)
876   {
877     dt_print(DT_DEBUG_DEV, "[default_process_tiling_roi] no need to use tiling for module '%s' as no real "
878                            "memory saving to be expected\n",
879              self->op);
880     goto fallback;
881   }
882 
883   /* calculate optimal size of tiles */
884   float available = force_tile ? 500.0f * 1024.0f * 1024.0f : dt_conf_get_float("host_memory_limit") * 1024.0f * 1024.0f;
885   assert(available >= 500.0f * 1024.0f * 1024.0f);
886   /* correct for size of ivoid and ovoid which are needed on top of tiling */
887   available = fmax(available - ((float)roi_out->width * roi_out->height * out_bpp)
888                    - ((float)roi_in->width * roi_in->height * in_bpp) - tiling.overhead,
889                    0);
890 
891   /* we ignore the above value if singlebuffer_limit (is defined and) is higher than available/tiling.factor.
892      this will mainly allow tiling for modules with high and "unpredictable" memory demand which is
893      reflected in high values of tiling.factor (take bilateral noise reduction as an example). */
894   float singlebuffer = force_tile ? 2.0f * 1024.0f * 1024.0f : dt_conf_get_float("singlebuffer_limit") * 1024.0f * 1024.0f;
895   singlebuffer = fmax(singlebuffer, 2.0f * 1024.0f * 1024.0f);
896   const float factor = fmax(tiling.factor, 1.0f);
897   const float maxbuf = fmax(tiling.maxbuf, 1.0f);
898   singlebuffer = fmax(available / factor, singlebuffer);
899 
900   int width = _max(roi_in->width, roi_out->width);
901   int height = _max(roi_in->height, roi_out->height);
902 
903   /* shrink tile size in case it would exceed singlebuffer size */
904   if((float)width * height * max_bpp * maxbuf > singlebuffer)
905   {
906     const float scale = singlebuffer / ((float)width * height * max_bpp * maxbuf);
907 
908     /* TODO: can we make this more efficient to minimize total overlap between tiles? */
909     if(width < height && scale >= 0.333f)
910     {
911       height = floorf(height * scale);
912     }
913     else if(height <= width && scale >= 0.333f)
914     {
915       width = floorf(width * scale);
916     }
917     else
918     {
919       width = floorf(width * sqrtf(scale));
920       height = floorf(height * sqrtf(scale));
921     }
922   }
923 
924   /* make sure we have a reasonably effective tile dimension. if not try square tiles */
925   if(3 * tiling.overlap > width || 3 * tiling.overlap > height)
926   {
927     width = height = floorf(sqrtf((float)width * height));
928   }
929 
930   /* Alignment rules: we need to make sure that alignment requirements of module are fulfilled.
931      Modules will report alignment requirements via xalign and yalign within tiling_callback().
932      Typical use case is demosaic where Bayer pattern requires alignment to a multiple of 2 in x and y
933      direction. */
934 
935   /* for simplicity reasons we use only one alignment that fits to x and y requirements at the same time */
936   const unsigned int xyalign = _lcm(tiling.xalign, tiling.yalign);
937 
938   assert(xyalign != 0);
939 
940   /* make sure that overlap follows alignment rules by making it wider when needed.
941      overlap_in needs to be aligned, overlap_out is only here to calculate output buffer size */
942   const int overlap_in = _align_up(tiling.overlap, xyalign);
943   const int overlap_out = ceilf((float)overlap_in / fullscale);
944 
945   int tiles_x = 1, tiles_y = 1;
946 
947   /* calculate number of tiles taking the larger buffer (input or output) as a guiding one.
948      normally it is roi_in > roi_out; but let's be prepared */
949   if(roi_in->width > roi_out->width)
950     tiles_x = width < roi_in->width
951                   ? ceilf((float)roi_in->width / (float)_max(width - 2 * overlap_in - inacc, 1))
952                   : 1;
953   else
954     tiles_x = width < roi_out->width ? ceilf((float)roi_out->width / (float)_max(width - 2 * overlap_out, 1))
955                                      : 1;
956 
957   if(roi_in->height > roi_out->height)
958     tiles_y = height < roi_in->height
959                   ? ceilf((float)roi_in->height / (float)_max(height - 2 * overlap_in - inacc, 1))
960                   : 1;
961   else
962     tiles_y = height < roi_out->height
963                   ? ceilf((float)roi_out->height / (float)_max(height - 2 * overlap_out, 1))
964                   : 1;
965 
966   /* sanity check: don't run wild on too many tiles */
967   if(tiles_x * tiles_y > dt_conf_get_int("maximum_number_tiles"))
968   {
969     dt_print(DT_DEBUG_DEV,
970              "[default_process_tiling_roi] gave up tiling for module '%s'. too many tiles: %d x %d\n",
971              self->op, tiles_x, tiles_y);
972     goto error;
973   }
974 
975 
976   /* calculate tile width and height excl. overlap (i.e. the good part) for output.
977      values are important for all following processing steps. */
978   const int tile_wd = _align_up(
979       roi_out->width % tiles_x == 0 ? roi_out->width / tiles_x : roi_out->width / tiles_x + 1, xyalign);
980   const int tile_ht = _align_up(
981       roi_out->height % tiles_y == 0 ? roi_out->height / tiles_y : roi_out->height / tiles_y + 1, xyalign);
982 
983   dt_print(DT_DEBUG_DEV,
984            "[default_process_tiling_roi] use tiling on module '%s' for image with full input size %d x %d\n",
985            self->op, roi_in->width, roi_in->height);
986   dt_print(DT_DEBUG_DEV, "[default_process_tiling_roi] (%d x %d) tiles with max dimensions %d x %d\n",
987            tiles_x, tiles_y, width, height);
988 
989 
990   /* store processed_maximum to be re-used and aggregated */
991   dt_aligned_pixel_t processed_maximum_saved;
992   dt_aligned_pixel_t processed_maximum_new = { 1.0f };
993   for_four_channels(k) processed_maximum_saved[k] = piece->pipe->dsc.processed_maximum[k];
994 
995   /* iterate over tiles */
996   for(size_t tx = 0; tx < tiles_x; tx++)
997     for(size_t ty = 0; ty < tiles_y; ty++)
998     {
999       piece->pipe->tiling = 1;
1000 
1001       /* the output dimensions of the good part of this specific tile */
1002       const size_t wd = (tx + 1) * tile_wd > roi_out->width ? (size_t)roi_out->width - tx * tile_wd : tile_wd;
1003       const size_t ht = (ty + 1) * tile_ht > roi_out->height ? (size_t)roi_out->height - ty * tile_ht : tile_ht;
1004 
1005       /* roi_in and roi_out of good part: oroi_good easy to calculate based on number and dimension of tile.
1006          iroi_good is calculated by modify_roi_in() of respective module */
1007       dt_iop_roi_t iroi_good = { roi_in->x + tx * tile_wd, roi_in->y + ty * tile_ht, wd, ht, roi_in->scale };
1008       dt_iop_roi_t oroi_good
1009           = { roi_out->x + tx * tile_wd, roi_out->y + ty * tile_ht, wd, ht, roi_out->scale };
1010 
1011       self->modify_roi_in(self, piece, &oroi_good, &iroi_good);
1012 
1013       /* clamp iroi_good to not exceed roi_in */
1014       iroi_good.x = _max(iroi_good.x, roi_in->x);
1015       iroi_good.y = _max(iroi_good.y, roi_in->y);
1016       iroi_good.width = _min(iroi_good.width, roi_in->width + roi_in->x - iroi_good.x);
1017       iroi_good.height = _min(iroi_good.height, roi_in->height + roi_in->y - iroi_good.y);
1018 
1019       //_print_roi(&iroi_good, "tile iroi_good");
1020       //_print_roi(&oroi_good, "tile oroi_good");
1021 
1022       /* now we need to calculate full region of this tile: increase input roi to take care of overlap
1023          requirements
1024          and alignment and add additional delta to correct for possible rounding errors in modify_roi_in()
1025          -> generates first estimate of iroi_full */
1026       const int x_in = iroi_good.x;
1027       const int y_in = iroi_good.y;
1028       const int width_in = iroi_good.width;
1029       const int height_in = iroi_good.height;
1030       const int new_x_in = _max(_align_down(x_in - overlap_in - delta, xyalign), roi_in->x);
1031       const int new_y_in = _max(_align_down(y_in - overlap_in - delta, xyalign), roi_in->y);
1032       const int new_width_in = _min(_align_up(width_in + overlap_in + delta + (x_in - new_x_in), xyalign),
1033                                     roi_in->width + roi_in->x - new_x_in);
1034       const int new_height_in = _min(_align_up(height_in + overlap_in + delta + (y_in - new_y_in), xyalign),
1035                                      roi_in->height + roi_in->y - new_y_in);
1036 
1037       /* iroi_full based on calculated numbers and dimensions. oroi_full just set as a starting point for the
1038        * following iterative search */
1039       dt_iop_roi_t iroi_full = { new_x_in, new_y_in, new_width_in, new_height_in, iroi_good.scale };
1040       dt_iop_roi_t oroi_full = oroi_good; // a good starting point for optimization
1041 
1042       //_print_roi(&iroi_full, "tile iroi_full before optimization");
1043       //_print_roi(&oroi_full, "tile oroi_full before optimization");
1044 
1045       /* try to find a matching oroi_full */
1046       if(!_fit_output_to_input_roi(self, piece, &iroi_full, &oroi_full, delta, 10))
1047       {
1048         dt_print(DT_DEBUG_DEV, "[default_process_tiling_roi] can not handle requested roi's. tiling for "
1049                                "module '%s' not possible.\n",
1050                  self->op);
1051         goto error;
1052       }
1053 
1054       //_print_roi(&iroi_full, "tile iroi_full after optimization");
1055       //_print_roi(&oroi_full, "tile oroi_full after optimization");
1056 
1057       /* make sure that oroi_full at least covers the range of oroi_good.
1058          this step is needed due to the possibility of rounding errors */
1059       oroi_full.x = _min(oroi_full.x, oroi_good.x);
1060       oroi_full.y = _min(oroi_full.y, oroi_good.y);
1061       oroi_full.width = _max(oroi_full.width, oroi_good.x + oroi_good.width - oroi_full.x);
1062       oroi_full.height = _max(oroi_full.height, oroi_good.y + oroi_good.height - oroi_full.y);
1063 
1064       /* clamp oroi_full to not exceed roi_out */
1065       oroi_full.x = _max(oroi_full.x, roi_out->x);
1066       oroi_full.y = _max(oroi_full.y, roi_out->y);
1067       oroi_full.width = _min(oroi_full.width, roi_out->width + roi_out->x - oroi_full.x);
1068       oroi_full.height = _min(oroi_full.height, roi_out->height + roi_out->y - oroi_full.y);
1069 
1070       /* calculate final iroi_full */
1071       self->modify_roi_in(self, piece, &oroi_full, &iroi_full);
1072 
1073       /* clamp iroi_full to not exceed roi_in */
1074       iroi_full.x = _max(iroi_full.x, roi_in->x);
1075       iroi_full.y = _max(iroi_full.y, roi_in->y);
1076       iroi_full.width = _min(iroi_full.width, roi_in->width + roi_in->x - iroi_full.x);
1077       iroi_full.height = _min(iroi_full.height, roi_in->height + roi_in->y - iroi_full.y);
1078 
1079 
1080       //_print_roi(&iroi_full, "tile iroi_full final");
1081       //_print_roi(&oroi_full, "tile oroi_full final");
1082 
1083       /* offsets of tile into ivoid and ovoid */
1084       const size_t ioffs = ((size_t)iroi_full.y - roi_in->y) * ipitch + ((size_t)iroi_full.x - roi_in->x) * in_bpp;
1085       size_t ooffs = ((size_t)oroi_good.y - roi_out->y) * opitch
1086                      + ((size_t)oroi_good.x - roi_out->x) * out_bpp;
1087 
1088       dt_print(DT_DEBUG_DEV, "[default_process_tiling_roi] tile (%zu, %zu) with %d x %d at origin [%d, %d]\n",
1089                tx, ty, iroi_full.width, iroi_full.height, iroi_full.x, iroi_full.y);
1090 
1091 
1092       /* prepare input tile buffer */
1093       input = dt_alloc_align(64, (size_t)iroi_full.width * iroi_full.height * in_bpp);
1094       if(input == NULL)
1095       {
1096         dt_print(DT_DEBUG_DEV, "[default_process_tiling_roi] could not alloc input buffer for module '%s'\n",
1097                  self->op);
1098         goto error;
1099       }
1100       output = dt_alloc_align(64, (size_t)oroi_full.width * oroi_full.height * out_bpp);
1101       if(output == NULL)
1102       {
1103         dt_print(DT_DEBUG_DEV, "[default_process_tiling_roi] could not alloc output buffer for module '%s'\n",
1104                  self->op);
1105         goto error;
1106       }
1107 
1108 #ifdef _OPENMP
1109 #pragma omp parallel for default(none) \
1110       dt_omp_firstprivate(in_bpp, ipitch, ivoid) \
1111       dt_omp_sharedconst(ioffs) shared(input, iroi_full) \
1112       schedule(static)
1113 #endif
1114       for(size_t j = 0; j < iroi_full.height; j++)
1115         memcpy((char *)input + j * iroi_full.width * in_bpp, (char *)ivoid + ioffs + j * ipitch,
1116                (size_t)iroi_full.width * in_bpp);
1117 
1118       /* take original processed_maximum as starting point */
1119       for(int k = 0; k < 4; k++) piece->pipe->dsc.processed_maximum[k] = processed_maximum_saved[k];
1120 
1121       /* call process() of module */
1122       self->process(self, piece, input, output, &iroi_full, &oroi_full);
1123 
1124       /* aggregate resulting processed_maximum */
1125       /* TODO: check if there really can be differences between tiles and take
1126                appropriate action (calculate minimum, maximum, average, ...?) */
1127       for(int k = 0; k < 4; k++)
1128       {
1129         if(tx + ty > 0 && fabs(processed_maximum_new[k] - piece->pipe->dsc.processed_maximum[k]) > 1.0e-6f)
1130           dt_print(
1131               DT_DEBUG_DEV,
1132               "[default_process_tiling_roi] processed_maximum[%d] differs between tiles in module '%s'\n", k,
1133               self->op);
1134         processed_maximum_new[k] = piece->pipe->dsc.processed_maximum[k];
1135       }
1136 
1137       /* copy "good" part of tile to output buffer */
1138       const int origin_x = oroi_good.x - oroi_full.x;
1139       const int origin_y = oroi_good.y - oroi_full.y;
1140 #ifdef _OPENMP
1141 #pragma omp parallel for default(none) \
1142       dt_omp_firstprivate(opitch, origin_x, origin_y, out_bpp, ovoid) \
1143       shared(ooffs, output, oroi_good, oroi_full) \
1144       schedule(static)
1145 #endif
1146       for(size_t j = 0; j < oroi_good.height; j++)
1147         memcpy((char *)ovoid + ooffs + j * opitch,
1148                (char *)output + ((j + origin_y) * oroi_full.width + origin_x) * out_bpp,
1149                (size_t)oroi_good.width * out_bpp);
1150 
1151       dt_free_align(input);
1152       dt_free_align(output);
1153       input = output = NULL;
1154     }
1155 
1156   /* copy back final processed_maximum */
1157   for(int k = 0; k < 4; k++) piece->pipe->dsc.processed_maximum[k] = processed_maximum_new[k];
1158 
1159   if(input != NULL) dt_free_align(input);
1160   if(output != NULL) dt_free_align(output);
1161   piece->pipe->tiling = 0;
1162   return;
1163 
1164 error:
1165   dt_control_log(_("tiling failed for module '%s'. output might be garbled."), self->op);
1166 // fall through
1167 
1168 fallback:
1169   if(input != NULL) dt_free_align(input);
1170   if(output != NULL) dt_free_align(output);
1171   piece->pipe->tiling = 0;
1172   dt_print(DT_DEBUG_DEV, "[default_process_tiling_roi] fall back to standard processing for module '%s'\n",
1173            self->op);
1174   self->process(self, piece, ivoid, ovoid, roi_in, roi_out);
1175   return;
1176 }
1177 
1178 
1179 
1180 /* if a module does not implement process_tiling() by itself, this function is called instead.
1181    _default_process_tiling_ptp() is able to handle standard cases where pixels do not change their places.
1182    _default_process_tiling_roi() takes care of all other cases where image gets distorted and for module
1183    "clipping",
1184    "flip" as this may flip or mirror the image. */
default_process_tiling(struct dt_iop_module_t * self,struct dt_dev_pixelpipe_iop_t * piece,const void * const ivoid,void * const ovoid,const dt_iop_roi_t * const roi_in,const dt_iop_roi_t * const roi_out,const int in_bpp)1185 void default_process_tiling(struct dt_iop_module_t *self, struct dt_dev_pixelpipe_iop_t *piece,
1186                             const void *const ivoid, void *const ovoid, const dt_iop_roi_t *const roi_in,
1187                             const dt_iop_roi_t *const roi_out, const int in_bpp)
1188 {
1189   if(memcmp(roi_in, roi_out, sizeof(struct dt_iop_roi_t)) || (self->flags() & IOP_FLAGS_TILING_FULL_ROI))
1190     _default_process_tiling_roi(self, piece, ivoid, ovoid, roi_in, roi_out, in_bpp);
1191   else
1192     _default_process_tiling_ptp(self, piece, ivoid, ovoid, roi_in, roi_out, in_bpp);
1193   return;
1194 }
1195 
1196 
1197 
1198 #ifdef HAVE_OPENCL
1199 /* simple tiling algorithm for roi_in == roi_out, i.e. for pixel to pixel modules/operations */
_default_process_tiling_cl_ptp(struct dt_iop_module_t * self,struct dt_dev_pixelpipe_iop_t * piece,const void * const ivoid,void * const ovoid,const dt_iop_roi_t * const roi_in,const dt_iop_roi_t * const roi_out,const int in_bpp)1200 static int _default_process_tiling_cl_ptp(struct dt_iop_module_t *self, struct dt_dev_pixelpipe_iop_t *piece,
1201                                           const void *const ivoid, void *const ovoid,
1202                                           const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out,
1203                                           const int in_bpp)
1204 {
1205   cl_int err = -999;
1206   cl_mem input = NULL;
1207   cl_mem output = NULL;
1208   cl_mem pinned_input = NULL;
1209   cl_mem pinned_output = NULL;
1210   void *input_buffer = NULL;
1211   void *output_buffer = NULL;
1212 
1213   dt_iop_buffer_dsc_t dsc;
1214   self->output_format(self, piece->pipe, piece, &dsc);
1215   const int out_bpp = dt_iop_buffer_dsc_to_bpp(&dsc);
1216 
1217   const int devid = piece->pipe->devid;
1218   const int ipitch = roi_in->width * in_bpp;
1219   const int opitch = roi_out->width * out_bpp;
1220   const int max_bpp = _max(in_bpp, out_bpp);
1221 
1222   /* get tiling requirements of module */
1223   dt_develop_tiling_t tiling = { 0 };
1224   self->tiling_callback(self, piece, roi_in, roi_out, &tiling);
1225 
1226   /* shall we use pinned memory transfers? */
1227   int use_pinned_memory = dt_conf_get_bool("opencl_use_pinned_memory");
1228   const int pinned_buffer_overhead = use_pinned_memory ? 2 : 0; // add two additional pinned memory buffers
1229                                                                 // which seemingly get allocated not only on
1230                                                                 // host but also on device (why???)
1231   const float pinned_buffer_slack
1232       = use_pinned_memory
1233             ? 0.85f
1234             : 1.0f; // avoid problems when pinned buffer size gets too close to max_mem_alloc size
1235 
1236   /* shall we enforce tiling */
1237   const gboolean force_tile = (darktable.unmuted & DT_DEBUG_TILING);
1238   /* calculate optimal size of tiles */
1239   float headroom = dt_conf_get_float("opencl_memory_headroom") * 1024.0f * 1024.0f;
1240   headroom = fmin(fmax(headroom, 0.0f), (float)darktable.opencl->dev[devid].max_global_mem);
1241   float available = darktable.opencl->dev[devid].max_global_mem - headroom;
1242   if(force_tile) available = fmin(available, 500.0f * 1024.0f * 1024.0f);
1243   const float factor = fmax(tiling.factor_cl + pinned_buffer_overhead, 1.0f);
1244   const float singlebuffer = fmin(fmax((available - tiling.overhead) / factor, 0.0f),
1245                                   pinned_buffer_slack * darktable.opencl->dev[devid].max_mem_alloc);
1246   const float maxbuf = fmax(tiling.maxbuf_cl, 1.0f);
1247   int width = _min(roi_in->width, darktable.opencl->dev[devid].max_image_width);
1248   int height = _min(roi_in->height, darktable.opencl->dev[devid].max_image_height);
1249 
1250   /* shrink tile size in case it would exceed singlebuffer size */
1251   if((float)width * height * max_bpp * maxbuf > singlebuffer)
1252   {
1253     const float scale = singlebuffer / ((float)width * height * max_bpp * maxbuf);
1254 
1255     if(width < height && scale >= 0.333f)
1256     {
1257       height = floorf(height * scale);
1258     }
1259     else if(height <= width && scale >= 0.333f)
1260     {
1261       width = floorf(width * scale);
1262     }
1263     else
1264     {
1265       width = floorf(width * sqrtf(scale));
1266       height = floorf(height * sqrtf(scale));
1267     }
1268   }
1269 
1270   /* make sure we have a reasonably effective tile dimension. if not try square tiles */
1271   if(3 * tiling.overlap > width || 3 * tiling.overlap > height)
1272   {
1273     width = height = floorf(sqrtf((float)width * height));
1274   }
1275 
1276 
1277   /* Alignment rules: we need to make sure that alignment requirements of module are fulfilled.
1278      Modules will report alignment requirements via xalign and yalign within tiling_callback().
1279      Typical use case is demosaic where Bayer pattern requires alignment to a multiple of 2 in x and y
1280      direction. Additional alignment requirements are set via definition of CL_ALIGNMENT.
1281      We guarantee alignment by selecting image width/height and overlap accordingly. For a tile width/height
1282      that is identical to image width/height no special alignment is done. */
1283 
1284   /* for simplicity reasons we use only one alignment that fits to x and y requirements at the same time */
1285   const unsigned int xyalign = _lcm(tiling.xalign, tiling.yalign);
1286 
1287   /* determining alignment requirement for tile width/height.
1288      in case of tile width also align according to definition of CL_ALIGNMENT */
1289   const unsigned int walign = _lcm(xyalign, CL_ALIGNMENT);
1290   const unsigned int halign = xyalign;
1291 
1292   assert(xyalign != 0 && walign != 0 && halign != 0);
1293 
1294   /* properly align tile width and height by making them smaller if needed */
1295   if(width < roi_in->width) width = (width / walign) * walign;
1296   if(height < roi_in->height) height = (height / halign) * halign;
1297 
1298   /* also make sure that overlap follows alignment rules by making it wider when needed */
1299   const int overlap = tiling.overlap % xyalign != 0 ? (tiling.overlap / xyalign + 1) * xyalign
1300                                                     : tiling.overlap;
1301 
1302 
1303   /* calculate effective tile size */
1304   const int tile_wd = width - 2 * overlap > 0 ? width - 2 * overlap : 1;
1305   const int tile_ht = height - 2 * overlap > 0 ? height - 2 * overlap : 1;
1306 
1307 
1308   /* calculate number of tiles */
1309   const int tiles_x = width < roi_in->width ? ceilf(roi_in->width / (float)tile_wd) : 1;
1310   const int tiles_y = height < roi_in->height ? ceilf(roi_in->height / (float)tile_ht) : 1;
1311 
1312   /* sanity check: don't run wild on too many tiles */
1313   if(tiles_x * tiles_y > dt_conf_get_int("maximum_number_tiles"))
1314   {
1315     dt_print(DT_DEBUG_OPENCL,
1316              "[default_process_tiling_cl_ptp] aborted tiling for module '%s'. too many tiles: %d x %d\n",
1317              self->op, tiles_x, tiles_y);
1318     return FALSE;
1319   }
1320 
1321 
1322   dt_print(DT_DEBUG_OPENCL,
1323            "[default_process_tiling_cl_ptp] use tiling on module '%s' for image with full size %d x %d\n",
1324            self->op, roi_in->width, roi_in->height);
1325   dt_print(DT_DEBUG_OPENCL,
1326            "[default_process_tiling_cl_ptp] (%d x %d) tiles with max dimensions %d x %d and overlap %d\n",
1327            tiles_x, tiles_y, width, height, overlap);
1328 
1329   /* store processed_maximum to be re-used and aggregated */
1330   dt_aligned_pixel_t processed_maximum_saved;
1331   dt_aligned_pixel_t processed_maximum_new = { 1.0f };
1332   for_four_channels(k) processed_maximum_saved[k] = piece->pipe->dsc.processed_maximum[k];
1333 
1334   /* reserve pinned input and output memory for host<->device data transfer */
1335   if(use_pinned_memory)
1336   {
1337     pinned_input = dt_opencl_alloc_device_buffer_with_flags(devid, (size_t)width * height * in_bpp,
1338                                                             CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR);
1339     if(pinned_input == NULL)
1340     {
1341       dt_print(DT_DEBUG_OPENCL,
1342                "[default_process_tiling_cl_ptp] could not alloc pinned input buffer for module '%s'\n",
1343                self->op);
1344       use_pinned_memory = 0;
1345     }
1346   }
1347 
1348   if(use_pinned_memory)
1349   {
1350 
1351     input_buffer = dt_opencl_map_buffer(devid, pinned_input, CL_TRUE, CL_MAP_WRITE, 0,
1352                                         (size_t)width * height * in_bpp);
1353     if(input_buffer == NULL)
1354     {
1355       dt_print(DT_DEBUG_OPENCL, "[default_process_tiling_cl_ptp] could not map pinned input buffer to host "
1356                                 "memory for module '%s'\n",
1357                self->op);
1358       use_pinned_memory = 0;
1359     }
1360   }
1361 
1362   if(use_pinned_memory)
1363   {
1364 
1365     pinned_output = dt_opencl_alloc_device_buffer_with_flags(devid, (size_t)width * height * out_bpp,
1366                                                              CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR);
1367     if(pinned_output == NULL)
1368     {
1369       dt_print(DT_DEBUG_OPENCL,
1370                "[default_process_tiling_cl_ptp] could not alloc pinned output buffer for module '%s'\n",
1371                self->op);
1372       use_pinned_memory = 0;
1373     }
1374   }
1375 
1376   if(use_pinned_memory)
1377   {
1378 
1379     output_buffer = dt_opencl_map_buffer(devid, pinned_output, CL_TRUE, CL_MAP_READ, 0,
1380                                          (size_t)width * height * out_bpp);
1381     if(output_buffer == NULL)
1382     {
1383       dt_print(DT_DEBUG_OPENCL, "[default_process_tiling_cl_ptp] could not map pinned output buffer to host "
1384                                 "memory for module '%s'\n",
1385                self->op);
1386       use_pinned_memory = 0;
1387     }
1388   }
1389 
1390   /* iterate over tiles */
1391   for(size_t tx = 0; tx < tiles_x; tx++)
1392     for(size_t ty = 0; ty < tiles_y; ty++)
1393     {
1394       piece->pipe->tiling = 1;
1395 
1396       const size_t wd = tx * tile_wd + width > roi_in->width ? roi_in->width - tx * tile_wd : width;
1397       const size_t ht = ty * tile_ht + height > roi_in->height ? roi_in->height - ty * tile_ht : height;
1398 
1399       /* no need to process (end)tiles that are smaller than the total overlap area */
1400       if((wd <= 2 * overlap && tx > 0) || (ht <= 2 * overlap && ty > 0)) continue;
1401 
1402       /* origin and region of effective part of tile, which we want to store later */
1403       size_t origin[] = { 0, 0, 0 };
1404       size_t region[] = { wd, ht, 1 };
1405 
1406       /* roi_in and roi_out for process_cl on subbuffer */
1407       dt_iop_roi_t iroi = { roi_in->x + tx * tile_wd, roi_in->y + ty * tile_ht, wd, ht, roi_in->scale };
1408       dt_iop_roi_t oroi = { roi_out->x + tx * tile_wd, roi_out->y + ty * tile_ht, wd, ht, roi_out->scale };
1409 
1410 
1411       /* offsets of tile into ivoid and ovoid */
1412       const size_t ioffs = (ty * tile_ht) * ipitch + (tx * tile_wd) * in_bpp;
1413       size_t ooffs = (ty * tile_ht) * opitch + (tx * tile_wd) * out_bpp;
1414 
1415 
1416       dt_print(DT_DEBUG_OPENCL,
1417                "[default_process_tiling_cl_ptp] tile (%zu, %zu) with %zu x %zu at origin [%zu, %zu]\n", tx, ty, wd,
1418                ht, tx * tile_wd, ty * tile_ht);
1419 
1420       /* get input and output buffers */
1421       input = dt_opencl_alloc_device(devid, wd, ht, in_bpp);
1422       if(input == NULL) goto error;
1423       output = dt_opencl_alloc_device(devid, wd, ht, out_bpp);
1424       if(output == NULL) goto error;
1425 
1426       if(use_pinned_memory)
1427       {
1428 /* prepare pinned input tile buffer: copy part of input image */
1429 #ifdef _OPENMP
1430 #pragma omp parallel for default(none) \
1431         dt_omp_firstprivate(in_bpp, ipitch, ivoid) \
1432         dt_omp_sharedconst(ioffs, wd, ht) shared(input_buffer, width) \
1433         schedule(static)
1434 #endif
1435         for(size_t j = 0; j < ht; j++)
1436           memcpy((char *)input_buffer + j * wd * in_bpp, (char *)ivoid + ioffs + j * ipitch,
1437                  (size_t)wd * in_bpp);
1438 
1439         /* blocking memory transfer: pinned host input buffer -> opencl/device tile */
1440         err = dt_opencl_write_host_to_device_raw(devid, (char *)input_buffer, input, origin, region,
1441                                                  wd * in_bpp, CL_TRUE);
1442         if(err != CL_SUCCESS) goto error;
1443       }
1444       else
1445       {
1446         /* blocking direct memory transfer: host input image -> opencl/device tile */
1447         err = dt_opencl_write_host_to_device_raw(devid, (char *)ivoid + ioffs, input, origin, region, ipitch,
1448                                                  CL_TRUE);
1449         if(err != CL_SUCCESS) goto error;
1450       }
1451 
1452       /* take original processed_maximum as starting point */
1453       for(int k = 0; k < 4; k++) piece->pipe->dsc.processed_maximum[k] = processed_maximum_saved[k];
1454 
1455       /* call process_cl of module */
1456       if(!self->process_cl(self, piece, input, output, &iroi, &oroi)) goto error;
1457 
1458       /* aggregate resulting processed_maximum */
1459       /* TODO: check if there really can be differences between tiles and take
1460                appropriate action (calculate minimum, maximum, average, ...?) */
1461       for(int k = 0; k < 4; k++)
1462       {
1463         if(tx + ty > 0 && fabs(processed_maximum_new[k] - piece->pipe->dsc.processed_maximum[k]) > 1.0e-6f)
1464           dt_print(
1465               DT_DEBUG_OPENCL,
1466               "[default_process_tiling_cl_ptp] processed_maximum[%d] differs between tiles in module '%s'\n",
1467               k, self->op);
1468         processed_maximum_new[k] = piece->pipe->dsc.processed_maximum[k];
1469       }
1470 
1471       if(use_pinned_memory)
1472       {
1473         /* blocking memory transfer: complete opencl/device tile -> pinned host output buffer */
1474         err = dt_opencl_read_host_from_device_raw(devid, (char *)output_buffer, output, origin, region,
1475                                                   wd * out_bpp, CL_TRUE);
1476         if(err != CL_SUCCESS) goto error;
1477       }
1478 
1479       /* correct origin and region of tile for overlap.
1480          makes sure that we only copy back the "good" part. */
1481       if(tx > 0)
1482       {
1483         origin[0] += overlap;
1484         region[0] -= overlap;
1485         ooffs += (size_t)overlap * out_bpp;
1486       }
1487       if(ty > 0)
1488       {
1489         origin[1] += overlap;
1490         region[1] -= overlap;
1491         ooffs += (size_t)overlap * opitch;
1492       }
1493 
1494       if(use_pinned_memory)
1495       {
1496 /* copy "good" part of tile from pinned output buffer to output image */
1497 #if 0 // def _OPENMP
1498 #pragma omp parallel for default(none) shared(ovoid, ooffs, output_buffer, width, origin, region,            \
1499                                               wd) schedule(static)
1500 #endif
1501         for(size_t j = 0; j < region[1]; j++)
1502           memcpy((char *)ovoid + ooffs + j * opitch,
1503                  (char *)output_buffer + ((j + origin[1]) * wd + origin[0]) * out_bpp,
1504                  (size_t)region[0] * out_bpp);
1505       }
1506       else
1507       {
1508         /* blocking direct memory transfer: good part of opencl/device tile -> host output image */
1509         err = dt_opencl_read_host_from_device_raw(devid, (char *)ovoid + ooffs, output, origin, region,
1510                                                   opitch, CL_TRUE);
1511         if(err != CL_SUCCESS) goto error;
1512       }
1513 
1514       /* release input and output buffers */
1515       dt_opencl_release_mem_object(input);
1516       input = NULL;
1517       dt_opencl_release_mem_object(output);
1518       output = NULL;
1519 
1520       /* block until opencl queue has finished to free all used event handlers */
1521       if(!darktable.opencl->async_pixelpipe || piece->pipe->type == DT_DEV_PIXELPIPE_EXPORT)
1522         dt_opencl_finish(devid);
1523     }
1524 
1525   /* copy back final processed_maximum */
1526   for(int k = 0; k < 4; k++) piece->pipe->dsc.processed_maximum[k] = processed_maximum_new[k];
1527 
1528   if(input_buffer != NULL) dt_opencl_unmap_mem_object(devid, pinned_input, input_buffer);
1529   dt_opencl_release_mem_object(pinned_input);
1530   if(output_buffer != NULL) dt_opencl_unmap_mem_object(devid, pinned_output, output_buffer);
1531   dt_opencl_release_mem_object(pinned_output);
1532   dt_opencl_release_mem_object(input);
1533   dt_opencl_release_mem_object(output);
1534   piece->pipe->tiling = 0;
1535   return TRUE;
1536 
1537 error:
1538   /* copy back stored processed_maximum */
1539   for(int k = 0; k < 4; k++) piece->pipe->dsc.processed_maximum[k] = processed_maximum_saved[k];
1540   if(input_buffer != NULL) dt_opencl_unmap_mem_object(devid, pinned_input, input_buffer);
1541   dt_opencl_release_mem_object(pinned_input);
1542   if(output_buffer != NULL) dt_opencl_unmap_mem_object(devid, pinned_output, output_buffer);
1543   dt_opencl_release_mem_object(pinned_output);
1544   dt_opencl_release_mem_object(input);
1545   dt_opencl_release_mem_object(output);
1546   piece->pipe->tiling = 0;
1547   dt_print(
1548       DT_DEBUG_OPENCL,
1549       "[default_process_tiling_opencl_ptp] couldn't run process_cl() for module '%s' in tiling mode: %d\n",
1550       self->op, err);
1551   return FALSE;
1552 }
1553 
1554 
1555 /* more elaborate tiling algorithm for roi_in != roi_out: slower than the ptp variant,
1556    more tiles and larger overlap */
_default_process_tiling_cl_roi(struct dt_iop_module_t * self,struct dt_dev_pixelpipe_iop_t * piece,const void * const ivoid,void * const ovoid,const dt_iop_roi_t * const roi_in,const dt_iop_roi_t * const roi_out,const int in_bpp)1557 static int _default_process_tiling_cl_roi(struct dt_iop_module_t *self, struct dt_dev_pixelpipe_iop_t *piece,
1558                                           const void *const ivoid, void *const ovoid,
1559                                           const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out,
1560                                           const int in_bpp)
1561 {
1562   cl_int err = -999;
1563   cl_mem input = NULL;
1564   cl_mem output = NULL;
1565   cl_mem pinned_input = NULL;
1566   cl_mem pinned_output = NULL;
1567   void *input_buffer = NULL;
1568   void *output_buffer = NULL;
1569 
1570 
1571   //_print_roi(roi_in, "module roi_in");
1572   //_print_roi(roi_out, "module roi_out");
1573 
1574   dt_iop_buffer_dsc_t dsc;
1575   self->output_format(self, piece->pipe, piece, &dsc);
1576   const int out_bpp = dt_iop_buffer_dsc_to_bpp(&dsc);
1577 
1578   const int devid = piece->pipe->devid;
1579   const int ipitch = roi_in->width * in_bpp;
1580   const int opitch = roi_out->width * out_bpp;
1581   const int max_bpp = _max(in_bpp, out_bpp);
1582 
1583   const float fullscale = fmax(roi_in->scale / roi_out->scale, sqrtf(((float)roi_in->width * roi_in->height)
1584                                                               / ((float)roi_out->width * roi_out->height)));
1585 
1586   /* inaccuracy for roi_in elements in roi_out -> roi_in calculations */
1587   const int delta = ceilf(fullscale);
1588 
1589   /* estimate for additional (space) requirement in buffer dimensions due to inaccuracies */
1590   const int inacc = RESERVE * delta;
1591 
1592   /* get tiling requirements of module */
1593   dt_develop_tiling_t tiling = { 0 };
1594   self->tiling_callback(self, piece, roi_in, roi_out, &tiling);
1595 
1596   /* shall we use pinned memory transfers? */
1597   int use_pinned_memory = dt_conf_get_bool("opencl_use_pinned_memory");
1598   const int pinned_buffer_overhead = use_pinned_memory ? 2 : 0; // add two additional pinned memory buffers
1599                                                                 // which seemingly get allocated not only on
1600                                                                 // host but also on device (why???)
1601   const float pinned_buffer_slack
1602       = use_pinned_memory
1603             ? 0.85f
1604             : 1.0f; // avoid problems when pinned buffer size gets too close to max_mem_alloc size
1605 
1606   /* shall we enforce tiling */
1607   const gboolean force_tile = (darktable.unmuted & DT_DEBUG_TILING);
1608   /* calculate optimal size of tiles */
1609   float headroom = dt_conf_get_float("opencl_memory_headroom") * 1024.0f * 1024.0f;
1610   headroom = fmin(fmax(headroom, 0.0f), (float)darktable.opencl->dev[devid].max_global_mem);
1611   float available = darktable.opencl->dev[devid].max_global_mem - headroom;
1612   if(force_tile) available = fmin(available, 500.0f * 1024.0f * 1024.0f);
1613   const float factor = fmax(tiling.factor_cl + pinned_buffer_overhead, 1.0f);
1614   const float singlebuffer = fmin(fmax((available - tiling.overhead) / factor, 0.0f),
1615                                   pinned_buffer_slack * darktable.opencl->dev[devid].max_mem_alloc);
1616   const float maxbuf = fmax(tiling.maxbuf_cl, 1.0f);
1617 
1618   int width = _min(_max(roi_in->width, roi_out->width), darktable.opencl->dev[devid].max_image_width);
1619   int height = _min(_max(roi_in->height, roi_out->height), darktable.opencl->dev[devid].max_image_height);
1620 
1621   /* shrink tile size in case it would exceed singlebuffer size */
1622   if((float)width * height * max_bpp * maxbuf > singlebuffer)
1623   {
1624     const float scale = singlebuffer / ((float)width * height * max_bpp * maxbuf);
1625 
1626     if(width < height && scale >= 0.333f)
1627     {
1628       height = floorf(height * scale);
1629     }
1630     else if(height <= width && scale >= 0.333f)
1631     {
1632       width = floorf(width * scale);
1633     }
1634     else
1635     {
1636       width = floorf(width * sqrtf(scale));
1637       height = floorf(height * sqrtf(scale));
1638     }
1639   }
1640 
1641   /* make sure we have a reasonably effective tile dimension. if not try square tiles */
1642   if(3 * tiling.overlap > width || 3 * tiling.overlap > height)
1643   {
1644     width = height = floorf(sqrtf((float)width * height));
1645   }
1646 
1647 
1648   /* Alignment rules: we need to make sure that alignment requirements of module are fulfilled.
1649      Modules will report alignment requirements via xalign and yalign within tiling_callback().
1650      Typical use case is demosaic where Bayer pattern requires alignment to a multiple of 2 in x and y
1651      direction. Additional alignment requirements are set via definition of CL_ALIGNMENT. */
1652 
1653   /* for simplicity reasons we use only one alignment that fits to x and y requirements at the same time */
1654   unsigned int xyalign = _lcm(tiling.xalign, tiling.yalign);
1655   xyalign = _lcm(xyalign, CL_ALIGNMENT);
1656 
1657   assert(xyalign != 0);
1658 
1659   /* make sure that overlap follows alignment rules by making it wider when needed.
1660      overlap_in needs to be aligned, overlap_out is only here to calculate output buffer size */
1661   const int overlap_in = _align_up(tiling.overlap, xyalign);
1662   const int overlap_out = ceilf((float)overlap_in / fullscale);
1663 
1664   int tiles_x = 1, tiles_y = 1;
1665 
1666   /* calculate number of tiles taking the larger buffer (input or output) as a guiding one.
1667      normally it is roi_in > roi_out; but let's be prepared */
1668   if(roi_in->width > roi_out->width)
1669     tiles_x = width < roi_in->width
1670                   ? ceilf((float)roi_in->width / (float)_max(width - 2 * overlap_in - inacc, 1))
1671                   : 1;
1672   else
1673     tiles_x = width < roi_out->width ? ceilf((float)roi_out->width / (float)_max(width - 2 * overlap_out, 1))
1674                                      : 1;
1675 
1676   if(roi_in->height > roi_out->height)
1677     tiles_y = height < roi_in->height
1678                   ? ceilf((float)roi_in->height / (float)_max(height - 2 * overlap_in - inacc, 1))
1679                   : 1;
1680   else
1681     tiles_y = height < roi_out->height
1682                   ? ceilf((float)roi_out->height / (float)_max(height - 2 * overlap_out, 1))
1683                   : 1;
1684 
1685   /* sanity check: don't run wild on too many tiles */
1686   if(tiles_x * tiles_y > dt_conf_get_int("maximum_number_tiles"))
1687   {
1688     dt_print(DT_DEBUG_OPENCL,
1689              "[default_process_tiling_cl_roi] aborted tiling for module '%s'. too many tiles: %d x %d\n",
1690              self->op, tiles_x, tiles_y);
1691     return FALSE;
1692   }
1693 
1694   /* calculate tile width and height excl. overlap (i.e. the good part) for output.
1695      important for all following processing steps. */
1696   const int tile_wd = _align_up(
1697       roi_out->width % tiles_x == 0 ? roi_out->width / tiles_x : roi_out->width / tiles_x + 1, xyalign);
1698   const int tile_ht = _align_up(
1699       roi_out->height % tiles_y == 0 ? roi_out->height / tiles_y : roi_out->height / tiles_y + 1, xyalign);
1700 
1701 
1702   dt_print(
1703       DT_DEBUG_OPENCL,
1704       "[default_process_tiling_cl_roi] use tiling on module '%s' for image with full input size %d x %d\n",
1705       self->op, roi_in->width, roi_in->height);
1706   dt_print(DT_DEBUG_OPENCL,
1707            "[default_process_tiling_cl_roi] (%d x %d) tiles with max input dimensions %d x %d\n", tiles_x,
1708            tiles_y, width, height);
1709 
1710 
1711   /* store processed_maximum to be re-used and aggregated */
1712   dt_aligned_pixel_t processed_maximum_saved;
1713   dt_aligned_pixel_t processed_maximum_new = { 1.0f };
1714   for_four_channels(k) processed_maximum_saved[k] = piece->pipe->dsc.processed_maximum[k];
1715 
1716   /* reserve pinned input and output memory for host<->device data transfer */
1717   if(use_pinned_memory)
1718   {
1719     pinned_input = dt_opencl_alloc_device_buffer_with_flags(devid, (size_t)width * height * in_bpp,
1720                                                             CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR);
1721     if(pinned_input == NULL)
1722     {
1723       dt_print(DT_DEBUG_OPENCL,
1724                "[default_process_tiling_cl_roi] could not alloc pinned input buffer for module '%s'\n",
1725                self->op);
1726       use_pinned_memory = 0;
1727     }
1728   }
1729 
1730   if(use_pinned_memory)
1731   {
1732 
1733     input_buffer = dt_opencl_map_buffer(devid, pinned_input, CL_TRUE, CL_MAP_WRITE, 0,
1734                                         (size_t)width * height * in_bpp);
1735     if(input_buffer == NULL)
1736     {
1737       dt_print(DT_DEBUG_OPENCL, "[default_process_tiling_cl_roi] could not map pinned input buffer to host "
1738                                 "memory for module '%s'\n",
1739                self->op);
1740       use_pinned_memory = 0;
1741     }
1742   }
1743 
1744   if(use_pinned_memory)
1745   {
1746 
1747     pinned_output = dt_opencl_alloc_device_buffer_with_flags(devid, (size_t)width * height * out_bpp,
1748                                                              CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR);
1749     if(pinned_output == NULL)
1750     {
1751       dt_print(DT_DEBUG_OPENCL,
1752                "[default_process_tiling_cl_roi] could not alloc pinned output buffer for module '%s'\n",
1753                self->op);
1754       use_pinned_memory = 0;
1755     }
1756   }
1757 
1758   if(use_pinned_memory)
1759   {
1760 
1761     output_buffer = dt_opencl_map_buffer(devid, pinned_output, CL_TRUE, CL_MAP_READ, 0,
1762                                          (size_t)width * height * out_bpp);
1763     if(output_buffer == NULL)
1764     {
1765       dt_print(DT_DEBUG_OPENCL, "[default_process_tiling_cl_roi] could not map pinned output buffer to host "
1766                                 "memory for module '%s'\n",
1767                self->op);
1768       use_pinned_memory = 0;
1769     }
1770   }
1771 
1772 
1773   /* iterate over tiles */
1774   for(size_t tx = 0; tx < tiles_x; tx++)
1775     for(size_t ty = 0; ty < tiles_y; ty++)
1776     {
1777       piece->pipe->tiling = 1;
1778 
1779       /* the output dimensions of the good part of this specific tile */
1780       const size_t wd = (tx + 1) * tile_wd > roi_out->width ? (size_t)roi_out->width - tx * tile_wd : tile_wd;
1781       const size_t ht = (ty + 1) * tile_ht > roi_out->height ? (size_t)roi_out->height - ty * tile_ht : tile_ht;
1782 
1783       /* roi_in and roi_out of good part: oroi_good easy to calculate based on number and dimension of tile.
1784          iroi_good is calculated by modify_roi_in() of respective module */
1785       dt_iop_roi_t iroi_good = { roi_in->x + tx * tile_wd, roi_in->y + ty * tile_ht, wd, ht, roi_in->scale };
1786       dt_iop_roi_t oroi_good
1787           = { roi_out->x + tx * tile_wd, roi_out->y + ty * tile_ht, wd, ht, roi_out->scale };
1788 
1789       self->modify_roi_in(self, piece, &oroi_good, &iroi_good);
1790 
1791       /* clamp iroi_good to not exceed roi_in */
1792       iroi_good.x = _max(iroi_good.x, roi_in->x);
1793       iroi_good.y = _max(iroi_good.y, roi_in->y);
1794       iroi_good.width = _min(iroi_good.width, roi_in->width + roi_in->x - iroi_good.x);
1795       iroi_good.height = _min(iroi_good.height, roi_in->height + roi_in->y - iroi_good.y);
1796 
1797       //_print_roi(&iroi_good, "tile iroi_good");
1798       //_print_roi(&oroi_good, "tile oroi_good");
1799 
1800       /* now we need to calculate full region of this tile: increase input roi to take care of overlap
1801          requirements
1802          and alignment and add additional delta to correct for possible rounding errors in modify_roi_in()
1803          -> generates first estimate of iroi_full */
1804       const int x_in = iroi_good.x;
1805       const int y_in = iroi_good.y;
1806       const int width_in = iroi_good.width;
1807       const int height_in = iroi_good.height;
1808       const int new_x_in = _max(_align_down(x_in - overlap_in - delta, xyalign), roi_in->x);
1809       const int new_y_in = _max(_align_down(y_in - overlap_in - delta, xyalign), roi_in->y);
1810       const int new_width_in = _min(_align_up(width_in + overlap_in + delta + (x_in - new_x_in), xyalign),
1811                                     roi_in->width + roi_in->x - new_x_in);
1812       const int new_height_in = _min(_align_up(height_in + overlap_in + delta + (y_in - new_y_in), xyalign),
1813                                      roi_in->height + roi_in->y - new_y_in);
1814 
1815       /* iroi_full based on calculated numbers and dimensions. oroi_full just set as a starting point for the
1816        * following iterative search */
1817       dt_iop_roi_t iroi_full = { new_x_in, new_y_in, new_width_in, new_height_in, iroi_good.scale };
1818       dt_iop_roi_t oroi_full = oroi_good; // a good starting point for optimization
1819 
1820       //_print_roi(&iroi_full, "tile iroi_full before optimization");
1821       //_print_roi(&oroi_full, "tile oroi_full before optimization");
1822 
1823       /* try to find a matching oroi_full */
1824       if(!_fit_output_to_input_roi(self, piece, &iroi_full, &oroi_full, delta, 10))
1825       {
1826         dt_print(DT_DEBUG_OPENCL, "[default_process_tiling_cl_roi] can not handle requested roi's. tiling "
1827                                   "for module '%s' not possible.\n",
1828                  self->op);
1829         goto error;
1830       }
1831 
1832 
1833       /* make sure that oroi_full at least covers the range of oroi_good.
1834          this step is needed due to the possibility of rounding errors */
1835       oroi_full.x = _min(oroi_full.x, oroi_good.x);
1836       oroi_full.y = _min(oroi_full.y, oroi_good.y);
1837       oroi_full.width = _max(oroi_full.width, oroi_good.x + oroi_good.width - oroi_full.x);
1838       oroi_full.height = _max(oroi_full.height, oroi_good.y + oroi_good.height - oroi_full.y);
1839 
1840       /* clamp oroi_full to not exceed roi_out */
1841       oroi_full.x = _max(oroi_full.x, roi_out->x);
1842       oroi_full.y = _max(oroi_full.y, roi_out->y);
1843       oroi_full.width = _min(oroi_full.width, roi_out->width + roi_out->x - oroi_full.x);
1844       oroi_full.height = _min(oroi_full.height, roi_out->height + roi_out->y - oroi_full.y);
1845 
1846 
1847       /* calculate final iroi_full */
1848       self->modify_roi_in(self, piece, &oroi_full, &iroi_full);
1849 
1850       /* clamp iroi_full to not exceed roi_in */
1851       iroi_full.x = _max(iroi_full.x, roi_in->x);
1852       iroi_full.y = _max(iroi_full.y, roi_in->y);
1853       iroi_full.width = _min(iroi_full.width, roi_in->width + roi_in->x - iroi_full.x);
1854       iroi_full.height = _min(iroi_full.height, roi_in->height + roi_in->y - iroi_full.y);
1855 
1856       //_print_roi(&iroi_full, "tile iroi_full");
1857       //_print_roi(&oroi_full, "tile oroi_full");
1858 
1859       /* offsets of tile into ivoid and ovoid */
1860       const size_t ioffs = ((size_t)iroi_full.y - roi_in->y) * ipitch + ((size_t)iroi_full.x - roi_in->x) * in_bpp;
1861       const size_t ooffs = ((size_t)oroi_good.y - roi_out->y) * opitch
1862                            + ((size_t)oroi_good.x - roi_out->x) * out_bpp;
1863 
1864       dt_print(DT_DEBUG_OPENCL,
1865                "[default_process_tiling_cl_roi] tile (%zu, %zu) with %d x %d at origin [%d, %d]\n", tx, ty,
1866                iroi_full.width, iroi_full.height, iroi_full.x, iroi_full.y);
1867 
1868       /* origin and region of full input tile */
1869       size_t iorigin[] = { 0, 0, 0 };
1870       size_t iregion[] = { iroi_full.width, iroi_full.height, 1 };
1871 
1872       /* origin and region of full output tile */
1873       size_t oforigin[] = { 0, 0, 0 };
1874       size_t ofregion[] = { oroi_full.width, oroi_full.height, 1 };
1875 
1876       /* origin and region of good part of output tile */
1877       size_t oorigin[] = { oroi_good.x - oroi_full.x, oroi_good.y - oroi_full.y, 0 };
1878       size_t oregion[] = { oroi_good.width, oroi_good.height, 1 };
1879 
1880       /* get opencl input and output buffers */
1881       input = dt_opencl_alloc_device(devid, iroi_full.width, iroi_full.height, in_bpp);
1882       if(input == NULL) goto error;
1883 
1884       output = dt_opencl_alloc_device(devid, oroi_full.width, oroi_full.height, out_bpp);
1885       if(output == NULL) goto error;
1886 
1887       if(use_pinned_memory)
1888       {
1889 /* prepare pinned input tile buffer: copy part of input image */
1890 #ifdef _OPENMP
1891 #pragma omp parallel for default(none) \
1892         dt_omp_firstprivate(in_bpp, ipitch, ivoid) \
1893         dt_omp_sharedconst(ioffs) shared(input_buffer, width, iroi_full) schedule(static)
1894 #endif
1895         for(size_t j = 0; j < iroi_full.height; j++)
1896           memcpy((char *)input_buffer + j * iroi_full.width * in_bpp, (char *)ivoid + ioffs + j * ipitch,
1897                  (size_t)iroi_full.width * in_bpp);
1898 
1899         /* blocking memory transfer: pinned host input buffer -> opencl/device tile */
1900         err = dt_opencl_write_host_to_device_raw(devid, (char *)input_buffer, input, iorigin, iregion,
1901                                                  (size_t)iroi_full.width * in_bpp, CL_TRUE);
1902         if(err != CL_SUCCESS) goto error;
1903       }
1904       else
1905       {
1906         /* blocking direct memory transfer: host input image -> opencl/device tile */
1907         err = dt_opencl_write_host_to_device_raw(devid, (char *)ivoid + ioffs, input, iorigin, iregion,
1908                                                  ipitch, CL_TRUE);
1909         if(err != CL_SUCCESS) goto error;
1910       }
1911 
1912       /* take original processed_maximum as starting point */
1913       for(int k = 0; k < 4; k++) piece->pipe->dsc.processed_maximum[k] = processed_maximum_saved[k];
1914 
1915       /* call process_cl of module */
1916       if(!self->process_cl(self, piece, input, output, &iroi_full, &oroi_full)) goto error;
1917 
1918       /* aggregate resulting processed_maximum */
1919       /* TODO: check if there really can be differences between tiles and take
1920                appropriate action (calculate minimum, maximum, average, ...?) */
1921       for(int k = 0; k < 4; k++)
1922       {
1923         if(tx + ty > 0 && fabs(processed_maximum_new[k] - piece->pipe->dsc.processed_maximum[k]) > 1.0e-6f)
1924           dt_print(
1925               DT_DEBUG_OPENCL,
1926               "[default_process_tiling_cl_roi] processed_maximum[%d] differs between tiles in module '%s'\n",
1927               k, self->op);
1928         processed_maximum_new[k] = piece->pipe->dsc.processed_maximum[k];
1929       }
1930 
1931       if(use_pinned_memory)
1932       {
1933         /* blocking memory transfer: complete opencl/device tile -> pinned host output buffer */
1934         err = dt_opencl_read_host_from_device_raw(devid, (char *)output_buffer, output, oforigin, ofregion,
1935                                                   (size_t)oroi_full.width * out_bpp, CL_TRUE);
1936         if(err != CL_SUCCESS) goto error;
1937 
1938 /* copy "good" part of tile from pinned output buffer to output image */
1939 #ifdef _OPENMP
1940 #pragma omp parallel for default(none) \
1941         dt_omp_firstprivate(ipitch, opitch, ovoid, out_bpp) \
1942         dt_omp_sharedconst(ooffs) shared(output_buffer, oroi_full, oorigin, oregion) \
1943         schedule(static)
1944 #endif
1945         for(size_t j = 0; j < oregion[1]; j++)
1946           memcpy((char *)ovoid + ooffs + j * opitch,
1947                  (char *)output_buffer + ((j + oorigin[1]) * oroi_full.width + oorigin[0]) * out_bpp,
1948                  (size_t)oregion[0] * out_bpp);
1949       }
1950       else
1951       {
1952         /* blocking direct memory transfer: good part of opencl/device tile -> host output image */
1953         err = dt_opencl_read_host_from_device_raw(devid, (char *)ovoid + ooffs, output, oorigin, oregion,
1954                                                   opitch, CL_TRUE);
1955         if(err != CL_SUCCESS) goto error;
1956       }
1957 
1958       /* release input and output buffers */
1959       dt_opencl_release_mem_object(input);
1960       input = NULL;
1961       dt_opencl_release_mem_object(output);
1962       output = NULL;
1963 
1964       /* block until opencl queue has finished to free all used event handlers */
1965       if(!darktable.opencl->async_pixelpipe || piece->pipe->type == DT_DEV_PIXELPIPE_EXPORT)
1966         dt_opencl_finish(devid);
1967     }
1968 
1969   /* copy back final processed_maximum */
1970   for(int k = 0; k < 4; k++) piece->pipe->dsc.processed_maximum[k] = processed_maximum_new[k];
1971   if(input_buffer != NULL) dt_opencl_unmap_mem_object(devid, pinned_input, input_buffer);
1972   dt_opencl_release_mem_object(pinned_input);
1973   if(output_buffer != NULL) dt_opencl_unmap_mem_object(devid, pinned_output, output_buffer);
1974   dt_opencl_release_mem_object(pinned_output);
1975   dt_opencl_release_mem_object(input);
1976   dt_opencl_release_mem_object(output);
1977   piece->pipe->tiling = 0;
1978   return TRUE;
1979 
1980 error:
1981   /* copy back stored processed_maximum */
1982   for(int k = 0; k < 4; k++) piece->pipe->dsc.processed_maximum[k] = processed_maximum_saved[k];
1983   if(input_buffer != NULL) dt_opencl_unmap_mem_object(devid, pinned_input, input_buffer);
1984   dt_opencl_release_mem_object(pinned_input);
1985   if(output_buffer != NULL) dt_opencl_unmap_mem_object(devid, pinned_output, output_buffer);
1986   dt_opencl_release_mem_object(pinned_output);
1987   dt_opencl_release_mem_object(input);
1988   dt_opencl_release_mem_object(output);
1989   piece->pipe->tiling = 0;
1990   dt_print(
1991       DT_DEBUG_OPENCL,
1992       "[default_process_tiling_opencl_roi] couldn't run process_cl() for module '%s' in tiling mode: %d\n",
1993       self->op, err);
1994   return FALSE;
1995 }
1996 
1997 
1998 
1999 /* if a module does not implement process_tiling_cl() by itself, this function is called instead.
2000    _default_process_tiling_cl_ptp() is able to handle standard cases where pixels do not change their places.
2001    _default_process_tiling_cl_roi() takes care of all other cases where image gets distorted. */
default_process_tiling_cl(struct dt_iop_module_t * self,struct dt_dev_pixelpipe_iop_t * piece,const void * const ivoid,void * const ovoid,const dt_iop_roi_t * const roi_in,const dt_iop_roi_t * const roi_out,const int in_bpp)2002 int default_process_tiling_cl(struct dt_iop_module_t *self, struct dt_dev_pixelpipe_iop_t *piece,
2003                               const void *const ivoid, void *const ovoid, const dt_iop_roi_t *const roi_in,
2004                               const dt_iop_roi_t *const roi_out, const int in_bpp)
2005 {
2006   if(memcmp(roi_in, roi_out, sizeof(struct dt_iop_roi_t)) || (self->flags() & IOP_FLAGS_TILING_FULL_ROI))
2007     return _default_process_tiling_cl_roi(self, piece, ivoid, ovoid, roi_in, roi_out, in_bpp);
2008   else
2009     return _default_process_tiling_cl_ptp(self, piece, ivoid, ovoid, roi_in, roi_out, in_bpp);
2010 }
2011 
2012 #else
default_process_tiling_cl(struct dt_iop_module_t * self,struct dt_dev_pixelpipe_iop_t * piece,const void * const ivoid,void * const ovoid,const dt_iop_roi_t * const roi_in,const dt_iop_roi_t * const roi_out,const int in_bpp)2013 int default_process_tiling_cl(struct dt_iop_module_t *self, struct dt_dev_pixelpipe_iop_t *piece,
2014                               const void *const ivoid, void *const ovoid, const dt_iop_roi_t *const roi_in,
2015                               const dt_iop_roi_t *const roi_out, const int in_bpp)
2016 {
2017   return FALSE;
2018 }
2019 #endif
2020 
2021 
2022 /* If a module does not implement tiling_callback() by itself, this function is called instead.
2023    Default is an image size factor of 2 (i.e. input + output buffer needed), no overhead (1),
2024    no overlap between tiles, and an pixel alignment of 1 in x and y direction, i.e. no special
2025    alignment required. Simple pixel to pixel modules (take tonecurve as an example) can happily
2026    live with that.
2027    (1) Small overhead like look-up-tables in tonecurve can be ignored safely. */
default_tiling_callback(struct dt_iop_module_t * self,struct dt_dev_pixelpipe_iop_t * piece,const dt_iop_roi_t * roi_in,const dt_iop_roi_t * roi_out,struct dt_develop_tiling_t * tiling)2028 void default_tiling_callback(struct dt_iop_module_t *self, struct dt_dev_pixelpipe_iop_t *piece,
2029                              const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out,
2030                              struct dt_develop_tiling_t *tiling)
2031 {
2032   const float ioratio
2033       = ((float)roi_out->width * (float)roi_out->height) / ((float)roi_in->width * (float)roi_in->height);
2034 
2035   tiling->factor = 1.0f + ioratio;
2036   tiling->factor_cl = tiling->factor;  // by default, we need the same memory on host or GPU
2037   tiling->maxbuf = 1.0f;
2038   tiling->maxbuf_cl = tiling->maxbuf;
2039   tiling->overhead = 0;
2040   tiling->overlap = 0;
2041   tiling->xalign = 1;
2042   tiling->yalign = 1;
2043 
2044   if((self->flags() & IOP_FLAGS_TILING_FULL_ROI) == IOP_FLAGS_TILING_FULL_ROI) tiling->overlap = 4;
2045 
2046   if(self->iop_order > dt_ioppr_get_iop_order(piece->pipe->iop_order_list, "demosaic", 0)) return;
2047 
2048   // all operations that work with mosaiced data should respect pattern size!
2049 
2050   if(!piece->pipe->dsc.filters) return;
2051 
2052   if(piece->pipe->dsc.filters == 9u)
2053   {
2054     // X-Trans, sensor is 6x6
2055     tiling->xalign = 6;
2056     tiling->yalign = 6;
2057   }
2058   else
2059   {
2060     // Bayer, good old 2x2
2061     tiling->xalign = 2;
2062     tiling->yalign = 2;
2063   }
2064 
2065   return;
2066 }
2067 
dt_tiling_piece_fits_host_memory(const size_t width,const size_t height,const unsigned bpp,const float factor,const size_t overhead)2068 int dt_tiling_piece_fits_host_memory(const size_t width, const size_t height, const unsigned bpp,
2069                                      const float factor, const size_t overhead)
2070 {
2071   static int host_memory_limit = -1;
2072 
2073   /* first time run */
2074   if(host_memory_limit < 0)
2075   {
2076     host_memory_limit = dt_conf_get_int("host_memory_limit");
2077 
2078     /* don't let the user play games with us */
2079     if(host_memory_limit != 0) host_memory_limit = CLAMPI(host_memory_limit, 500, 50000);
2080     dt_conf_set_int("host_memory_limit", host_memory_limit);
2081   }
2082 
2083   const float requirement = factor * width * height * bpp + overhead;
2084 
2085   if(host_memory_limit == 0 || requirement <= host_memory_limit * 1024.0f * 1024.0f)
2086     return TRUE;
2087   else
2088     return FALSE;
2089 }
2090 
2091 // modelines: These editor modelines have been set for all relevant files by tools/update_modelines.sh
2092 // vim: shiftwidth=2 expandtab tabstop=2 cindent
2093 // kate: tab-indents: off; indent-width 2; replace-tabs on; indent-mode cstyle; remove-trailing-spaces modified;
2094