1 /*
2 This file is part of darktable,
3 Copyright (C) 2011-2020 darktable developers.
4
5 darktable is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3 of the License, or
8 (at your option) any later version.
9
10 darktable is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with darktable. If not, see <http://www.gnu.org/licenses/>.
17 */
18
19
20 #include "develop/tiling.h"
21 #include "common/opencl.h"
22 #include "control/control.h"
23 #include "develop/blend.h"
24 #include "develop/pixelpipe.h"
25
26 #include <assert.h>
27 #include <math.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <strings.h>
31 #include <unistd.h>
32
33 #define CLAMPI(a, mn, mx) ((a) < (mn) ? (mn) : ((a) > (mx) ? (mx) : (a)))
34
35
36 /* this defines an additional alignment requirement for opencl image width.
37 It can have strong effects on processing speed. Reasonable values are a
38 power of 2. set to 1 for no effect. */
39 #define CL_ALIGNMENT 4
40
41 /* parameter RESERVE for extended roi_in sizes due to inaccuracies when doing
42 roi_out -> roi_in estimations.
43 Needs to be increased if tiling fails due to insufficient buffer sizes. */
44 #define RESERVE 5
45
46
47 /* greatest common divisor */
_gcd(unsigned a,unsigned b)48 static unsigned _gcd(unsigned a, unsigned b)
49 {
50 unsigned t;
51 while(b != 0)
52 {
53 t = b;
54 b = a % b;
55 a = t;
56 }
57 return MAX(a, 1);
58 }
59
60 /* least common multiple */
_lcm(unsigned a,unsigned b)61 static unsigned _lcm(unsigned a, unsigned b)
62 {
63 return (((unsigned long)a * b) / _gcd(a, b));
64 }
65
66
_min(int a,int b)67 static inline int _min(int a, int b)
68 {
69 return a < b ? a : b;
70 }
71
_max(int a,int b)72 static inline int _max(int a, int b)
73 {
74 return a > b ? a : b;
75 }
76
77
_align_up(int n,int a)78 static inline int _align_up(int n, int a)
79 {
80 return n % a != 0 ? (n / a + 1) * a : n;
81 }
82
_align_down(int n,int a)83 static inline int _align_down(int n, int a)
84 {
85 return n % a != 0 ? (n / a) * a : n;
86 }
87
88
_print_roi(const dt_iop_roi_t * roi,const char * label)89 void _print_roi(const dt_iop_roi_t *roi, const char *label)
90 {
91 printf("{ %5d %5d %5d %5d %.6f } %s\n", roi->x, roi->y, roi->width, roi->height, roi->scale, label);
92 }
93
94
95 #if 0
96 static void
97 _nm_constraints(double x[], int n)
98 {
99 x[0] = fabs(x[0]);
100 x[1] = fabs(x[1]);
101 x[2] = fabs(x[2]);
102 x[3] = fabs(x[3]);
103
104 if(x[0] > 1.0) x[0] = 1.0 - x[0];
105 if(x[1] > 1.0) x[1] = 1.0 - x[1];
106 if(x[2] > 1.0) x[2] = 1.0 - x[2];
107 if(x[3] > 1.0) x[3] = 1.0 - x[3];
108
109 }
110 #endif
111
_nm_fitness(double x[],void * rest[])112 static double _nm_fitness(double x[], void *rest[])
113 {
114 struct dt_iop_module_t *self = (struct dt_iop_module_t *)rest[0];
115 struct dt_dev_pixelpipe_iop_t *piece = (struct dt_dev_pixelpipe_iop_t *)rest[1];
116 struct dt_iop_roi_t *iroi = (struct dt_iop_roi_t *)rest[2];
117 struct dt_iop_roi_t *oroi = (struct dt_iop_roi_t *)rest[3];
118
119 dt_iop_roi_t oroi_test = *oroi;
120 oroi_test.x = x[0] * piece->iwidth;
121 oroi_test.y = x[1] * piece->iheight;
122 oroi_test.width = x[2] * piece->iwidth;
123 oroi_test.height = x[3] * piece->iheight;
124
125 dt_iop_roi_t iroi_probe = *iroi;
126 self->modify_roi_in(self, piece, &oroi_test, &iroi_probe);
127
128 double fitness = 0.0;
129
130 fitness += (double)(iroi_probe.x - iroi->x) * (iroi_probe.x - iroi->x);
131 fitness += (double)(iroi_probe.y - iroi->y) * (iroi_probe.y - iroi->y);
132 fitness += (double)(iroi_probe.width - iroi->width) * (iroi_probe.width - iroi->width);
133 fitness += (double)(iroi_probe.height - iroi->height) * (iroi_probe.height - iroi->height);
134
135 return fitness;
136 }
137
138
139 /* We use a Nelder-Mead simplex algorithm based on an implementation of Michael F. Hutt.
140 It is covered by the following copyright notice: */
141 /*
142 * Program: nmsimplex.c
143 * Author : Michael F. Hutt
144 * http://www.mikehutt.com
145 * 11/3/97
146 *
147 * An implementation of the Nelder-Mead simplex method.
148 *
149 * Copyright (c) 1997-2011 <Michael F. Hutt>
150 *
151 * Permission is hereby granted, free of charge, to any person obtaining
152 * a copy of this software and associated documentation files (the
153 * "Software"), to deal in the Software without restriction, including
154 * without limitation the rights to use, copy, modify, merge, publish,
155 * distribute, sublicense, and/or sell copies of the Software, and to
156 * permit persons to whom the Software is furnished to do so, subject to
157 * the following conditions:
158 *
159 * The above copyright notice and this permission notice shall be
160 * included in all copies or substantial portions of the Software.
161 *
162 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
163 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
164 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
165 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
166 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
167 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
168 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
169 *
170 */
171
172 #define MAX_IT 1000 /* maximum number of iterations */
173 #define ALPHA 1.0 /* reflection coefficient */
174 #define BETA 0.5 /* contraction coefficient */
175 #define GAMMA 2.0 /* expansion coefficient */
176
_simplex(double (* objfunc)(double[],void * []),double start[],int n,double EPSILON,double scale,int maxiter,void (* constrain)(double[],int n),void * rest[])177 static int _simplex(double (*objfunc)(double[], void *[]), double start[], int n, double EPSILON,
178 double scale, int maxiter, void (*constrain)(double[], int n), void *rest[])
179 {
180
181 int vs; /* vertex with smallest value */
182 int vh; /* vertex with next smallest value */
183 int vg; /* vertex with largest value */
184
185 int i, j = 0, m, row;
186 int k; /* track the number of function evaluations */
187 int itr; /* track the number of iterations */
188
189 double **v; /* holds vertices of simplex */
190 double pn, qn; /* values used to create initial simplex */
191 double *f; /* value of function at each vertex */
192 double fr; /* value of function at reflection point */
193 double fe; /* value of function at expansion point */
194 double fc; /* value of function at contraction point */
195 double *vr; /* reflection - coordinates */
196 double *ve; /* expansion - coordinates */
197 double *vc; /* contraction - coordinates */
198 double *vm; /* centroid - coordinates */
199
200 double fsum, favg, s, cent;
201
202 /* dynamically allocate arrays */
203
204 /* allocate the rows of the arrays */
205 v = (double **)malloc(sizeof(double *) * (n + 1));
206 f = (double *)malloc(sizeof(double) * (n + 1));
207 vr = (double *)malloc(sizeof(double) * n);
208 ve = (double *)malloc(sizeof(double) * n);
209 vc = (double *)malloc(sizeof(double) * n);
210 vm = (double *)malloc(sizeof(double) * n);
211
212 /* allocate the columns of the arrays */
213 for(i = 0; i <= n; i++)
214 {
215 v[i] = (double *)malloc(sizeof(double) * n);
216 }
217
218 /* create the initial simplex */
219 /* assume one of the vertices is 0,0 */
220
221 pn = scale * (sqrt(n + 1) - 1 + n) / (n * sqrt(2));
222 qn = scale * (sqrt(n + 1) - 1) / (n * sqrt(2));
223
224 for(i = 0; i < n; i++)
225 {
226 v[0][i] = start[i];
227 }
228
229 for(i = 1; i <= n; i++)
230 {
231 for(j = 0; j < n; j++)
232 {
233 if(i - 1 == j)
234 {
235 v[i][j] = pn + start[j];
236 }
237 else
238 {
239 v[i][j] = qn + start[j];
240 }
241 }
242 }
243
244 if(constrain != NULL)
245 {
246 constrain(v[j], n);
247 }
248 /* find the initial function values */
249 for(j = 0; j <= n; j++)
250 {
251 f[j] = objfunc(v[j], rest);
252 }
253
254 k = n + 1;
255
256 #if 0
257 /* print out the initial values */
258 printf ("Initial Values\n");
259 for (j = 0; j <= n; j++)
260 {
261 for (i = 0; i < n; i++)
262 {
263 printf ("%f %f\n", v[j][i], f[j]);
264 }
265 }
266 #endif
267
268 /* begin the main loop of the minimization */
269 for(itr = 1; itr <= maxiter; itr++)
270 {
271 /* find the index of the largest value */
272 vg = 0;
273 for(j = 0; j <= n; j++)
274 {
275 if(f[j] > f[vg])
276 {
277 vg = j;
278 }
279 }
280
281 /* find the index of the smallest value */
282 vs = 0;
283 for(j = 0; j <= n; j++)
284 {
285 if(f[j] < f[vs])
286 {
287 vs = j;
288 }
289 }
290
291 /* find the index of the second largest value */
292 vh = vs;
293 for(j = 0; j <= n; j++)
294 {
295 if(f[j] > f[vh] && f[j] < f[vg])
296 {
297 vh = j;
298 }
299 }
300
301 /* calculate the centroid */
302 for(j = 0; j <= n - 1; j++)
303 {
304 cent = 0.0;
305 for(m = 0; m <= n; m++)
306 {
307 if(m != vg)
308 {
309 cent += v[m][j];
310 }
311 }
312 vm[j] = cent / n;
313 }
314
315 /* reflect vg to new vertex vr */
316 for(j = 0; j <= n - 1; j++)
317 {
318 /*vr[j] = (1+ALPHA)*vm[j] - ALPHA*v[vg][j]; */
319 vr[j] = vm[j] + ALPHA * (vm[j] - v[vg][j]);
320 }
321 if(constrain != NULL)
322 {
323 constrain(vr, n);
324 }
325 fr = objfunc(vr, rest);
326 k++;
327
328 if(fr < f[vh] && fr >= f[vs])
329 {
330 for(j = 0; j <= n - 1; j++)
331 {
332 v[vg][j] = vr[j];
333 }
334 f[vg] = fr;
335 }
336
337 /* investigate a step further in this direction */
338 if(fr < f[vs])
339 {
340 for(j = 0; j <= n - 1; j++)
341 {
342 /*ve[j] = GAMMA*vr[j] + (1-GAMMA)*vm[j]; */
343 ve[j] = vm[j] + GAMMA * (vr[j] - vm[j]);
344 }
345 if(constrain != NULL)
346 {
347 constrain(ve, n);
348 }
349 fe = objfunc(ve, rest);
350 k++;
351
352 /* by making fe < fr as opposed to fe < f[vs],
353 Rosenbrocks function takes 63 iterations as opposed
354 to 64 when using double variables. */
355
356 if(fe < fr)
357 {
358 for(j = 0; j <= n - 1; j++)
359 {
360 v[vg][j] = ve[j];
361 }
362 f[vg] = fe;
363 }
364 else
365 {
366 for(j = 0; j <= n - 1; j++)
367 {
368 v[vg][j] = vr[j];
369 }
370 f[vg] = fr;
371 }
372 }
373
374 /* check to see if a contraction is necessary */
375 if(fr >= f[vh])
376 {
377 if(fr < f[vg] && fr >= f[vh])
378 {
379 /* perform outside contraction */
380 for(j = 0; j <= n - 1; j++)
381 {
382 /*vc[j] = BETA*v[vg][j] + (1-BETA)*vm[j]; */
383 vc[j] = vm[j] + BETA * (vr[j] - vm[j]);
384 }
385 if(constrain != NULL)
386 {
387 constrain(vc, n);
388 }
389 fc = objfunc(vc, rest);
390 k++;
391 }
392 else
393 {
394 /* perform inside contraction */
395 for(j = 0; j <= n - 1; j++)
396 {
397 /*vc[j] = BETA*v[vg][j] + (1-BETA)*vm[j]; */
398 vc[j] = vm[j] - BETA * (vm[j] - v[vg][j]);
399 }
400 if(constrain != NULL)
401 {
402 constrain(vc, n);
403 }
404 fc = objfunc(vc, rest);
405 k++;
406 }
407
408
409 if(fc < f[vg])
410 {
411 for(j = 0; j <= n - 1; j++)
412 {
413 v[vg][j] = vc[j];
414 }
415 f[vg] = fc;
416 }
417 /* at this point the contraction is not successful,
418 we must halve the distance from vs to all the
419 vertices of the simplex and then continue.
420 10/31/97 - modified to account for ALL vertices.
421 */
422 else
423 {
424 for(row = 0; row <= n; row++)
425 {
426 if(row != vs)
427 {
428 for(j = 0; j <= n - 1; j++)
429 {
430 v[row][j] = v[vs][j] + (v[row][j] - v[vs][j]) / 2.0;
431 }
432 }
433 }
434 if(constrain != NULL)
435 {
436 constrain(v[vg], n);
437 }
438 f[vg] = objfunc(v[vg], rest);
439 k++;
440 if(constrain != NULL)
441 {
442 constrain(v[vh], n);
443 }
444 f[vh] = objfunc(v[vh], rest);
445 k++;
446 }
447 }
448
449 #if 0
450 /* print out the value at each iteration */
451 printf ("Iteration %d\n", itr);
452 for (j = 0; j <= n; j++)
453 {
454 for (i = 0; i < n; i++)
455 {
456 printf ("%f %f\n", v[j][i], f[j]);
457 }
458 }
459 #endif
460
461 /* test for convergence */
462 fsum = 0.0;
463 for(j = 0; j <= n; j++)
464 {
465 fsum += f[j];
466 }
467 favg = fsum / (n + 1);
468 s = 0.0;
469 for(j = 0; j <= n; j++)
470 {
471 s += pow((f[j] - favg), 2.0) / (n);
472 }
473 s = sqrt(s);
474 if(s < EPSILON) break;
475 }
476 /* end main loop of the minimization */
477
478 /* find the index of the smallest value */
479 vs = 0;
480 for(j = 0; j <= n; j++)
481 {
482 if(f[j] < f[vs])
483 {
484 vs = j;
485 }
486 }
487
488 #if 0
489 printf ("The minimum was found at\n");
490 for (j = 0; j < n; j++)
491 {
492 printf ("%e\n", v[vs][j]);
493 start[j] = v[vs][j];
494 }
495 double min = objfunc (v[vs], rest);
496 printf ("Function value at minimum %f\n", min);
497 k++;
498 printf ("%d Function Evaluations\n", k);
499 printf ("%d Iterations through program\n", itr);
500 #endif
501
502 free(f);
503 free(vr);
504 free(ve);
505 free(vc);
506 free(vm);
507 for(i = 0; i <= n; i++)
508 {
509 free(v[i]);
510 }
511 free(v);
512 return itr;
513 }
514
515
_nm_fit_output_to_input_roi(struct dt_iop_module_t * self,struct dt_dev_pixelpipe_iop_t * piece,const dt_iop_roi_t * iroi,dt_iop_roi_t * oroi,int delta)516 static int _nm_fit_output_to_input_roi(struct dt_iop_module_t *self, struct dt_dev_pixelpipe_iop_t *piece,
517 const dt_iop_roi_t *iroi, dt_iop_roi_t *oroi, int delta)
518 {
519 void *rest[4] = { (void *)self, (void *)piece, (void *)iroi, (void *)oroi };
520 double start[4] = { (float)oroi->x / piece->iwidth, (float)oroi->y / piece->iheight,
521 (float)oroi->width / piece->iwidth, (float)oroi->height / piece->iheight };
522 double epsilon = (double)delta / MIN(piece->iwidth, piece->iheight);
523 int maxiter = 1000;
524
525 int iter = _simplex(_nm_fitness, start, 4, epsilon, 1.0, maxiter, NULL, rest);
526
527 // printf("_simplex: %d, delta: %d, epsilon: %f\n", iter, delta, epsilon);
528
529 oroi->x = start[0] * piece->iwidth;
530 oroi->y = start[1] * piece->iheight;
531 oroi->width = start[2] * piece->iwidth;
532 oroi->height = start[3] * piece->iheight;
533
534 return (iter <= maxiter);
535 }
536
537
538
539 /* find a matching oroi_full by probing start value of oroi and get corresponding input roi into iroi_probe.
540 We search in two steps. first by a simplicistic iterative search which will succeed in most cases.
541 If this does not converge, we do a downhill simplex (nelder-mead) fitting */
_fit_output_to_input_roi(struct dt_iop_module_t * self,struct dt_dev_pixelpipe_iop_t * piece,const dt_iop_roi_t * iroi,dt_iop_roi_t * oroi,int delta,int iter)542 static int _fit_output_to_input_roi(struct dt_iop_module_t *self, struct dt_dev_pixelpipe_iop_t *piece,
543 const dt_iop_roi_t *iroi, dt_iop_roi_t *oroi, int delta, int iter)
544 {
545 dt_iop_roi_t iroi_probe = *iroi;
546 dt_iop_roi_t save_oroi = *oroi;
547
548 // try to go the easy way. this works in many cases where output is
549 // just like input, only scaled down
550 self->modify_roi_in(self, piece, oroi, &iroi_probe);
551 while((abs((int)iroi_probe.x - (int)iroi->x) > delta || abs((int)iroi_probe.y - (int)iroi->y) > delta
552 || abs((int)iroi_probe.width - (int)iroi->width) > delta
553 || abs((int)iroi_probe.height - (int)iroi->height) > delta) && iter > 0)
554 {
555 //_print_roi(&iroi_probe, "tile iroi_probe");
556 //_print_roi(oroi, "tile oroi old");
557
558 oroi->x += (iroi->x - iroi_probe.x) * oroi->scale / iroi->scale;
559 oroi->y += (iroi->y - iroi_probe.y) * oroi->scale / iroi->scale;
560 oroi->width += (iroi->width - iroi_probe.width) * oroi->scale / iroi->scale;
561 oroi->height += (iroi->height - iroi_probe.height) * oroi->scale / iroi->scale;
562
563 //_print_roi(oroi, "tile oroi new");
564
565 self->modify_roi_in(self, piece, oroi, &iroi_probe);
566 iter--;
567 }
568
569 if(iter > 0) return TRUE;
570
571 *oroi = save_oroi;
572
573 // simplicistic approach did not converge.
574 // try simplex downhill fitting now.
575 // it's crucial that we have a good starting point in oroi, else this
576 // will not converge as well.
577 int fit = _nm_fit_output_to_input_roi(self, piece, iroi, oroi, delta);
578 return fit;
579 }
580
581
582 /* simple tiling algorithm for roi_in == roi_out, i.e. for pixel to pixel modules/operations */
_default_process_tiling_ptp(struct dt_iop_module_t * self,struct dt_dev_pixelpipe_iop_t * piece,const void * const ivoid,void * const ovoid,const dt_iop_roi_t * const roi_in,const dt_iop_roi_t * const roi_out,const int in_bpp)583 static void _default_process_tiling_ptp(struct dt_iop_module_t *self, struct dt_dev_pixelpipe_iop_t *piece,
584 const void *const ivoid, void *const ovoid,
585 const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out,
586 const int in_bpp)
587 {
588 void *input = NULL;
589 void *output = NULL;
590 dt_iop_buffer_dsc_t dsc;
591 self->output_format(self, piece->pipe, piece, &dsc);
592 const int out_bpp = dt_iop_buffer_dsc_to_bpp(&dsc);
593
594 const int ipitch = roi_in->width * in_bpp;
595 const int opitch = roi_out->width * out_bpp;
596 const int max_bpp = _max(in_bpp, out_bpp);
597
598 /* get tiling requirements of module */
599 dt_develop_tiling_t tiling = { 0 };
600 self->tiling_callback(self, piece, roi_in, roi_out, &tiling);
601
602 /* tiling really does not make sense in these cases. standard process() is not better or worse than we are
603 */
604 if(tiling.factor < 2.2f && tiling.overhead < 0.2f * roi_in->width * roi_in->height * max_bpp)
605 {
606 dt_print(DT_DEBUG_DEV, "[default_process_tiling_ptp] no need to use tiling for module '%s' as no real "
607 "memory saving to be expected\n",
608 self->op);
609 goto fallback;
610 }
611
612 /* calculate optimal size of tiles */
613 float available = dt_conf_get_float("host_memory_limit") * 1024.0f * 1024.0f;
614 assert(available >= 500.0f * 1024.0f * 1024.0f);
615 /* correct for size of ivoid and ovoid which are needed on top of tiling */
616 available = fmax(available - ((float)roi_out->width * roi_out->height * out_bpp)
617 - ((float)roi_in->width * roi_in->height * in_bpp) - tiling.overhead,
618 0);
619
620 /* we ignore the above value if singlebuffer_limit (is defined and) is higher than available/tiling.factor.
621 this will mainly allow tiling for modules with high and "unpredictable" memory demand which is
622 reflected in high values of tiling.factor (take bilateral noise reduction as an example). */
623 float singlebuffer = dt_conf_get_float("singlebuffer_limit") * 1024.0f * 1024.0f;
624 singlebuffer = fmax(singlebuffer, 2.0f * 1024.0f * 1024.0f);
625 const float factor = fmax(tiling.factor, 1.0f);
626 const float maxbuf = fmax(tiling.maxbuf, 1.0f);
627 singlebuffer = fmax(available / factor, singlebuffer);
628
629 int width = roi_in->width;
630 int height = roi_in->height;
631
632 /* shrink tile size in case it would exceed singlebuffer size */
633 if((float)width * height * max_bpp * maxbuf > singlebuffer)
634 {
635 const float scale = singlebuffer / ((float)width * height * max_bpp * maxbuf);
636
637 /* TODO: can we make this more efficient to minimize total overlap between tiles? */
638 if(width < height && scale >= 0.333f)
639 {
640 height = floorf(height * scale);
641 }
642 else if(height <= width && scale >= 0.333f)
643 {
644 width = floorf(width * scale);
645 }
646 else
647 {
648 width = floorf(width * sqrtf(scale));
649 height = floorf(height * sqrtf(scale));
650 }
651 }
652
653 /* make sure we have a reasonably effective tile dimension. if not try square tiles */
654 if(3 * tiling.overlap > width || 3 * tiling.overlap > height)
655 {
656 width = height = floorf(sqrtf((float)width * height));
657 }
658
659 /* Alignment rules: we need to make sure that alignment requirements of module are fulfilled.
660 Modules will report alignment requirements via xalign and yalign within tiling_callback().
661 Typical use case is demosaic where Bayer pattern requires alignment to a multiple of 2 in x and y
662 direction.
663 We guarantee alignment by selecting image width/height and overlap accordingly. For a tile width/height
664 that is identical to image width/height no special alignment is needed. */
665
666 const unsigned int xyalign = _lcm(tiling.xalign, tiling.yalign);
667
668 assert(xyalign != 0);
669
670 /* properly align tile width and height by making them smaller if needed */
671 if(width < roi_in->width) width = (width / xyalign) * xyalign;
672 if(height < roi_in->height) height = (height / xyalign) * xyalign;
673
674 /* also make sure that overlap follows alignment rules by making it wider when needed */
675 const int overlap = tiling.overlap % xyalign != 0 ? (tiling.overlap / xyalign + 1) * xyalign
676 : tiling.overlap;
677
678 /* calculate effective tile size */
679 const int tile_wd = width - 2 * overlap > 0 ? width - 2 * overlap : 1;
680 const int tile_ht = height - 2 * overlap > 0 ? height - 2 * overlap : 1;
681
682 /* calculate number of tiles */
683 const int tiles_x = width < roi_in->width ? ceilf(roi_in->width / (float)tile_wd) : 1;
684 const int tiles_y = height < roi_in->height ? ceilf(roi_in->height / (float)tile_ht) : 1;
685
686 /* sanity check: don't run wild on too many tiles */
687 if(tiles_x * tiles_y > dt_conf_get_int("maximum_number_tiles"))
688 {
689 dt_print(DT_DEBUG_DEV,
690 "[default_process_tiling_ptp] gave up tiling for module '%s'. too many tiles: %d x %d\n",
691 self->op, tiles_x, tiles_y);
692 goto error;
693 }
694
695
696 dt_print(DT_DEBUG_DEV,
697 "[default_process_tiling_ptp] use tiling on module '%s' for image with full size %d x %d\n",
698 self->op, roi_in->width, roi_in->height);
699 dt_print(DT_DEBUG_DEV,
700 "[default_process_tiling_ptp] (%d x %d) tiles with max dimensions %d x %d and overlap %d\n",
701 tiles_x, tiles_y, width, height, overlap);
702
703 /* reserve input and output buffers for tiles */
704 input = dt_alloc_align(64, (size_t)width * height * in_bpp);
705 if(input == NULL)
706 {
707 dt_print(DT_DEBUG_DEV, "[default_process_tiling_ptp] could not alloc input buffer for module '%s'\n",
708 self->op);
709 goto error;
710 }
711 output = dt_alloc_align(64, (size_t)width * height * out_bpp);
712 if(output == NULL)
713 {
714 dt_print(DT_DEBUG_DEV, "[default_process_tiling_ptp] could not alloc output buffer for module '%s'\n",
715 self->op);
716 goto error;
717 }
718
719 /* store processed_maximum to be re-used and aggregated */
720 float processed_maximum_saved[4];
721 float processed_maximum_new[4] = { 1.0f };
722 for(int k = 0; k < 4; k++) processed_maximum_saved[k] = piece->pipe->dsc.processed_maximum[k];
723
724
725 /* iterate over tiles */
726 for(size_t tx = 0; tx < tiles_x; tx++)
727 {
728 const size_t wd = tx * tile_wd + width > roi_in->width ? roi_in->width - tx * tile_wd : width;
729 for(size_t ty = 0; ty < tiles_y; ty++)
730 {
731 piece->pipe->tiling = 1;
732
733 const size_t ht = ty * tile_ht + height > roi_in->height ? roi_in->height - ty * tile_ht : height;
734
735 /* no need to process end-tiles that are smaller than the total overlap area */
736 if((wd <= 2 * overlap && tx > 0) || (ht <= 2 * overlap && ty > 0)) continue;
737
738 /* origin and region of effective part of tile, which we want to store later */
739 size_t origin[] = { 0, 0, 0 };
740 size_t region[] = { wd, ht, 1 };
741
742 /* roi_in and roi_out for process_cl on subbuffer */
743 dt_iop_roi_t iroi = { roi_in->x + tx * tile_wd, roi_in->y + ty * tile_ht, wd, ht, roi_in->scale };
744 dt_iop_roi_t oroi = { roi_out->x + tx * tile_wd, roi_out->y + ty * tile_ht, wd, ht, roi_out->scale };
745
746 /* offsets of tile into ivoid and ovoid */
747 const size_t ioffs = (ty * tile_ht) * ipitch + (tx * tile_wd) * in_bpp;
748 size_t ooffs = (ty * tile_ht) * opitch + (tx * tile_wd) * out_bpp;
749
750
751 dt_print(DT_DEBUG_DEV, "[default_process_tiling_ptp] tile (%zu, %zu) with %zu x %zu at origin [%zu, %zu]\n",
752 tx, ty, wd, ht, tx * tile_wd, ty * tile_ht);
753
754 /* prepare input tile buffer */
755 #ifdef _OPENMP
756 #pragma omp parallel for default(none) \
757 dt_omp_firstprivate(ht, in_bpp, ipitch, ivoid, wd) \
758 dt_omp_sharedconst(ioffs) \
759 shared(input, width) \
760 schedule(static)
761 #endif
762 for(size_t j = 0; j < ht; j++)
763 memcpy((char *)input + j * wd * in_bpp, (char *)ivoid + ioffs + j * ipitch, (size_t)wd * in_bpp);
764
765 /* take original processed_maximum as starting point */
766 for(int k = 0; k < 4; k++) piece->pipe->dsc.processed_maximum[k] = processed_maximum_saved[k];
767
768 /* call process() of module */
769 self->process(self, piece, input, output, &iroi, &oroi);
770
771 /* aggregate resulting processed_maximum */
772 /* TODO: check if there really can be differences between tiles and take
773 appropriate action (calculate minimum, maximum, average, ...?) */
774 for(int k = 0; k < 4; k++)
775 {
776 if(tx + ty > 0 && fabs(processed_maximum_new[k] - piece->pipe->dsc.processed_maximum[k]) > 1.0e-6f)
777 dt_print(
778 DT_DEBUG_DEV,
779 "[default_process_tiling_ptp] processed_maximum[%d] differs between tiles in module '%s'\n", k,
780 self->op);
781 processed_maximum_new[k] = piece->pipe->dsc.processed_maximum[k];
782 }
783
784 /* correct origin and region of tile for overlap.
785 make sure that we only copy back the "good" part. */
786 if(tx > 0)
787 {
788 origin[0] += overlap;
789 region[0] -= overlap;
790 ooffs += (size_t)overlap * out_bpp;
791 }
792 if(ty > 0)
793 {
794 origin[1] += overlap;
795 region[1] -= overlap;
796 ooffs += (size_t)overlap * opitch;
797 }
798
799 /* copy "good" part of tile to output buffer */
800 #ifdef _OPENMP
801 #pragma omp parallel for default(none) \
802 dt_omp_firstprivate(opitch, out_bpp, ovoid, wd) \
803 shared(ooffs, output, width, origin, region) \
804 schedule(static)
805 #endif
806 for(size_t j = 0; j < region[1]; j++)
807 memcpy((char *)ovoid + ooffs + j * opitch,
808 (char *)output + ((j + origin[1]) * wd + origin[0]) * out_bpp, (size_t)region[0] * out_bpp);
809 }
810 }
811
812 /* copy back final processed_maximum */
813 for(int k = 0; k < 4; k++) piece->pipe->dsc.processed_maximum[k] = processed_maximum_new[k];
814
815 if(input != NULL) dt_free_align(input);
816 if(output != NULL) dt_free_align(output);
817 piece->pipe->tiling = 0;
818 return;
819
820 error:
821 dt_control_log(_("tiling failed for module '%s'. output might be garbled."), self->op);
822 // fall through
823
824 fallback:
825 if(input != NULL) dt_free_align(input);
826 if(output != NULL) dt_free_align(output);
827 piece->pipe->tiling = 0;
828 dt_print(DT_DEBUG_DEV, "[default_process_tiling_ptp] fall back to standard processing for module '%s'\n",
829 self->op);
830 self->process(self, piece, ivoid, ovoid, roi_in, roi_out);
831 return;
832 }
833
834
835
836 /* more elaborate tiling algorithm for roi_in != roi_out: slower than the ptp variant,
837 more tiles and larger overlap */
_default_process_tiling_roi(struct dt_iop_module_t * self,struct dt_dev_pixelpipe_iop_t * piece,const void * const ivoid,void * const ovoid,const dt_iop_roi_t * const roi_in,const dt_iop_roi_t * const roi_out,const int in_bpp)838 static void _default_process_tiling_roi(struct dt_iop_module_t *self, struct dt_dev_pixelpipe_iop_t *piece,
839 const void *const ivoid, void *const ovoid,
840 const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out,
841 const int in_bpp)
842 {
843 void *input = NULL;
844 void *output = NULL;
845
846 //_print_roi(roi_in, "module roi_in");
847 //_print_roi(roi_out, "module roi_out");
848
849 dt_iop_buffer_dsc_t dsc;
850 self->output_format(self, piece->pipe, piece, &dsc);
851 const int out_bpp = dt_iop_buffer_dsc_to_bpp(&dsc);
852
853 const int ipitch = roi_in->width * in_bpp;
854 const int opitch = roi_out->width * out_bpp;
855 const int max_bpp = _max(in_bpp, out_bpp);
856
857 float fullscale = fmax(roi_in->scale / roi_out->scale, sqrtf(((float)roi_in->width * roi_in->height)
858 / ((float)roi_out->width * roi_out->height)));
859
860 /* inaccuracy for roi_in elements in roi_out -> roi_in calculations */
861 const int delta = ceilf(fullscale);
862
863 /* estimate for additional (space) requirement in buffer dimensions due to inaccuracies */
864 const int inacc = RESERVE * delta;
865
866 /* get tiling requirements of module */
867 dt_develop_tiling_t tiling = { 0 };
868 self->tiling_callback(self, piece, roi_in, roi_out, &tiling);
869
870 /* tiling really does not make sense in these cases. standard process() is not better or worse than we are
871 */
872 if(tiling.factor < 2.2f && tiling.overhead < 0.2f * roi_in->width * roi_in->height * max_bpp)
873 {
874 dt_print(DT_DEBUG_DEV, "[default_process_tiling_roi] no need to use tiling for module '%s' as no real "
875 "memory saving to be expected\n",
876 self->op);
877 goto fallback;
878 }
879
880 /* calculate optimal size of tiles */
881 float available = dt_conf_get_float("host_memory_limit") * 1024.0f * 1024.0f;
882 assert(available >= 500.0f * 1024.0f * 1024.0f);
883 /* correct for size of ivoid and ovoid which are needed on top of tiling */
884 available = fmax(available - ((float)roi_out->width * roi_out->height * out_bpp)
885 - ((float)roi_in->width * roi_in->height * in_bpp) - tiling.overhead,
886 0);
887
888 /* we ignore the above value if singlebuffer_limit (is defined and) is higher than available/tiling.factor.
889 this will mainly allow tiling for modules with high and "unpredictable" memory demand which is
890 reflected in high values of tiling.factor (take bilateral noise reduction as an example). */
891 float singlebuffer = dt_conf_get_float("singlebuffer_limit") * 1024.0f * 1024.0f;
892 singlebuffer = fmax(singlebuffer, 2.0f * 1024.0f * 1024.0f);
893 const float factor = fmax(tiling.factor, 1.0f);
894 const float maxbuf = fmax(tiling.maxbuf, 1.0f);
895 singlebuffer = fmax(available / factor, singlebuffer);
896
897 int width = _max(roi_in->width, roi_out->width);
898 int height = _max(roi_in->height, roi_out->height);
899
900 /* shrink tile size in case it would exceed singlebuffer size */
901 if((float)width * height * max_bpp * maxbuf > singlebuffer)
902 {
903 const float scale = singlebuffer / ((float)width * height * max_bpp * maxbuf);
904
905 /* TODO: can we make this more efficient to minimize total overlap between tiles? */
906 if(width < height && scale >= 0.333f)
907 {
908 height = floorf(height * scale);
909 }
910 else if(height <= width && scale >= 0.333f)
911 {
912 width = floorf(width * scale);
913 }
914 else
915 {
916 width = floorf(width * sqrtf(scale));
917 height = floorf(height * sqrtf(scale));
918 }
919 }
920
921 /* make sure we have a reasonably effective tile dimension. if not try square tiles */
922 if(3 * tiling.overlap > width || 3 * tiling.overlap > height)
923 {
924 width = height = floorf(sqrtf((float)width * height));
925 }
926
927 /* Alignment rules: we need to make sure that alignment requirements of module are fulfilled.
928 Modules will report alignment requirements via xalign and yalign within tiling_callback().
929 Typical use case is demosaic where Bayer pattern requires alignment to a multiple of 2 in x and y
930 direction. */
931
932 /* for simplicity reasons we use only one alignment that fits to x and y requirements at the same time */
933 const unsigned int xyalign = _lcm(tiling.xalign, tiling.yalign);
934
935 assert(xyalign != 0);
936
937 /* make sure that overlap follows alignment rules by making it wider when needed.
938 overlap_in needs to be aligned, overlap_out is only here to calculate output buffer size */
939 const int overlap_in = _align_up(tiling.overlap, xyalign);
940 const int overlap_out = ceilf((float)overlap_in / fullscale);
941
942 int tiles_x = 1, tiles_y = 1;
943
944 /* calculate number of tiles taking the larger buffer (input or output) as a guiding one.
945 normally it is roi_in > roi_out; but let's be prepared */
946 if(roi_in->width > roi_out->width)
947 tiles_x = width < roi_in->width
948 ? ceilf((float)roi_in->width / (float)_max(width - 2 * overlap_in - inacc, 1))
949 : 1;
950 else
951 tiles_x = width < roi_out->width ? ceilf((float)roi_out->width / (float)_max(width - 2 * overlap_out, 1))
952 : 1;
953
954 if(roi_in->height > roi_out->height)
955 tiles_y = height < roi_in->height
956 ? ceilf((float)roi_in->height / (float)_max(height - 2 * overlap_in - inacc, 1))
957 : 1;
958 else
959 tiles_y = height < roi_out->height
960 ? ceilf((float)roi_out->height / (float)_max(height - 2 * overlap_out, 1))
961 : 1;
962
963 /* sanity check: don't run wild on too many tiles */
964 if(tiles_x * tiles_y > dt_conf_get_int("maximum_number_tiles"))
965 {
966 dt_print(DT_DEBUG_DEV,
967 "[default_process_tiling_roi] gave up tiling for module '%s'. too many tiles: %d x %d\n",
968 self->op, tiles_x, tiles_y);
969 goto error;
970 }
971
972
973 /* calculate tile width and height excl. overlap (i.e. the good part) for output.
974 values are important for all following processing steps. */
975 const int tile_wd = _align_up(
976 roi_out->width % tiles_x == 0 ? roi_out->width / tiles_x : roi_out->width / tiles_x + 1, xyalign);
977 const int tile_ht = _align_up(
978 roi_out->height % tiles_y == 0 ? roi_out->height / tiles_y : roi_out->height / tiles_y + 1, xyalign);
979
980 dt_print(DT_DEBUG_DEV,
981 "[default_process_tiling_roi] use tiling on module '%s' for image with full input size %d x %d\n",
982 self->op, roi_in->width, roi_in->height);
983 dt_print(DT_DEBUG_DEV, "[default_process_tiling_roi] (%d x %d) tiles with max dimensions %d x %d\n",
984 tiles_x, tiles_y, width, height);
985
986
987 /* store processed_maximum to be re-used and aggregated */
988 float processed_maximum_saved[4];
989 float processed_maximum_new[4] = { 1.0f };
990 for(int k = 0; k < 4; k++) processed_maximum_saved[k] = piece->pipe->dsc.processed_maximum[k];
991
992 /* iterate over tiles */
993 for(size_t tx = 0; tx < tiles_x; tx++)
994 for(size_t ty = 0; ty < tiles_y; ty++)
995 {
996 piece->pipe->tiling = 1;
997
998 /* the output dimensions of the good part of this specific tile */
999 size_t wd = (tx + 1) * tile_wd > roi_out->width ? roi_out->width - tx * tile_wd : tile_wd;
1000 size_t ht = (ty + 1) * tile_ht > roi_out->height ? roi_out->height - ty * tile_ht : tile_ht;
1001
1002 /* roi_in and roi_out of good part: oroi_good easy to calculate based on number and dimension of tile.
1003 iroi_good is calculated by modify_roi_in() of respective module */
1004 dt_iop_roi_t iroi_good = { roi_in->x + tx * tile_wd, roi_in->y + ty * tile_ht, wd, ht, roi_in->scale };
1005 dt_iop_roi_t oroi_good
1006 = { roi_out->x + tx * tile_wd, roi_out->y + ty * tile_ht, wd, ht, roi_out->scale };
1007
1008 self->modify_roi_in(self, piece, &oroi_good, &iroi_good);
1009
1010 /* clamp iroi_good to not exceed roi_in */
1011 iroi_good.x = _max(iroi_good.x, roi_in->x);
1012 iroi_good.y = _max(iroi_good.y, roi_in->y);
1013 iroi_good.width = _min(iroi_good.width, roi_in->width + roi_in->x - iroi_good.x);
1014 iroi_good.height = _min(iroi_good.height, roi_in->height + roi_in->y - iroi_good.y);
1015
1016 //_print_roi(&iroi_good, "tile iroi_good");
1017 //_print_roi(&oroi_good, "tile oroi_good");
1018
1019 /* now we need to calculate full region of this tile: increase input roi to take care of overlap
1020 requirements
1021 and alignment and add additional delta to correct for possible rounding errors in modify_roi_in()
1022 -> generates first estimate of iroi_full */
1023 const int x_in = iroi_good.x;
1024 const int y_in = iroi_good.y;
1025 const int width_in = iroi_good.width;
1026 const int height_in = iroi_good.height;
1027 const int new_x_in = _max(_align_down(x_in - overlap_in - delta, xyalign), roi_in->x);
1028 const int new_y_in = _max(_align_down(y_in - overlap_in - delta, xyalign), roi_in->y);
1029 const int new_width_in = _min(_align_up(width_in + overlap_in + delta + (x_in - new_x_in), xyalign),
1030 roi_in->width + roi_in->x - new_x_in);
1031 const int new_height_in = _min(_align_up(height_in + overlap_in + delta + (y_in - new_y_in), xyalign),
1032 roi_in->height + roi_in->y - new_y_in);
1033
1034 /* iroi_full based on calculated numbers and dimensions. oroi_full just set as a starting point for the
1035 * following iterative search */
1036 dt_iop_roi_t iroi_full = { new_x_in, new_y_in, new_width_in, new_height_in, iroi_good.scale };
1037 dt_iop_roi_t oroi_full = oroi_good; // a good starting point for optimization
1038
1039 //_print_roi(&iroi_full, "tile iroi_full before optimization");
1040 //_print_roi(&oroi_full, "tile oroi_full before optimization");
1041
1042 /* try to find a matching oroi_full */
1043 if(!_fit_output_to_input_roi(self, piece, &iroi_full, &oroi_full, delta, 10))
1044 {
1045 dt_print(DT_DEBUG_DEV, "[default_process_tiling_roi] can not handle requested roi's. tiling for "
1046 "module '%s' not possible.\n",
1047 self->op);
1048 goto error;
1049 }
1050
1051 //_print_roi(&iroi_full, "tile iroi_full after optimization");
1052 //_print_roi(&oroi_full, "tile oroi_full after optimization");
1053
1054 /* make sure that oroi_full at least covers the range of oroi_good.
1055 this step is needed due to the possibility of rounding errors */
1056 oroi_full.x = _min(oroi_full.x, oroi_good.x);
1057 oroi_full.y = _min(oroi_full.y, oroi_good.y);
1058 oroi_full.width = _max(oroi_full.width, oroi_good.x + oroi_good.width - oroi_full.x);
1059 oroi_full.height = _max(oroi_full.height, oroi_good.y + oroi_good.height - oroi_full.y);
1060
1061 /* clamp oroi_full to not exceed roi_out */
1062 oroi_full.x = _max(oroi_full.x, roi_out->x);
1063 oroi_full.y = _max(oroi_full.y, roi_out->y);
1064 oroi_full.width = _min(oroi_full.width, roi_out->width + roi_out->x - oroi_full.x);
1065 oroi_full.height = _min(oroi_full.height, roi_out->height + roi_out->y - oroi_full.y);
1066
1067 /* calculate final iroi_full */
1068 self->modify_roi_in(self, piece, &oroi_full, &iroi_full);
1069
1070 /* clamp iroi_full to not exceed roi_in */
1071 iroi_full.x = _max(iroi_full.x, roi_in->x);
1072 iroi_full.y = _max(iroi_full.y, roi_in->y);
1073 iroi_full.width = _min(iroi_full.width, roi_in->width + roi_in->x - iroi_full.x);
1074 iroi_full.height = _min(iroi_full.height, roi_in->height + roi_in->y - iroi_full.y);
1075
1076
1077 //_print_roi(&iroi_full, "tile iroi_full final");
1078 //_print_roi(&oroi_full, "tile oroi_full final");
1079
1080 /* offsets of tile into ivoid and ovoid */
1081 const size_t ioffs = ((size_t)iroi_full.y - roi_in->y) * ipitch + ((size_t)iroi_full.x - roi_in->x) * in_bpp;
1082 size_t ooffs = ((size_t)oroi_good.y - roi_out->y) * opitch
1083 + ((size_t)oroi_good.x - roi_out->x) * out_bpp;
1084
1085 dt_print(DT_DEBUG_DEV, "[default_process_tiling_roi] tile (%zu, %zu) with %d x %d at origin [%d, %d]\n",
1086 tx, ty, iroi_full.width, iroi_full.height, iroi_full.x, iroi_full.y);
1087
1088
1089 /* prepare input tile buffer */
1090 input = dt_alloc_align(64, (size_t)iroi_full.width * iroi_full.height * in_bpp);
1091 if(input == NULL)
1092 {
1093 dt_print(DT_DEBUG_DEV, "[default_process_tiling_roi] could not alloc input buffer for module '%s'\n",
1094 self->op);
1095 goto error;
1096 }
1097 output = dt_alloc_align(64, (size_t)oroi_full.width * oroi_full.height * out_bpp);
1098 if(output == NULL)
1099 {
1100 dt_print(DT_DEBUG_DEV, "[default_process_tiling_roi] could not alloc output buffer for module '%s'\n",
1101 self->op);
1102 goto error;
1103 }
1104
1105 #ifdef _OPENMP
1106 #pragma omp parallel for default(none) \
1107 dt_omp_firstprivate(in_bpp, ipitch, ivoid) \
1108 dt_omp_sharedconst(ioffs) shared(input, iroi_full) \
1109 schedule(static)
1110 #endif
1111 for(size_t j = 0; j < iroi_full.height; j++)
1112 memcpy((char *)input + j * iroi_full.width * in_bpp, (char *)ivoid + ioffs + j * ipitch,
1113 (size_t)iroi_full.width * in_bpp);
1114
1115 /* take original processed_maximum as starting point */
1116 for(int k = 0; k < 4; k++) piece->pipe->dsc.processed_maximum[k] = processed_maximum_saved[k];
1117
1118 /* call process() of module */
1119 self->process(self, piece, input, output, &iroi_full, &oroi_full);
1120
1121 /* aggregate resulting processed_maximum */
1122 /* TODO: check if there really can be differences between tiles and take
1123 appropriate action (calculate minimum, maximum, average, ...?) */
1124 for(int k = 0; k < 4; k++)
1125 {
1126 if(tx + ty > 0 && fabs(processed_maximum_new[k] - piece->pipe->dsc.processed_maximum[k]) > 1.0e-6f)
1127 dt_print(
1128 DT_DEBUG_DEV,
1129 "[default_process_tiling_roi] processed_maximum[%d] differs between tiles in module '%s'\n", k,
1130 self->op);
1131 processed_maximum_new[k] = piece->pipe->dsc.processed_maximum[k];
1132 }
1133
1134 /* copy "good" part of tile to output buffer */
1135 const int origin_x = oroi_good.x - oroi_full.x;
1136 const int origin_y = oroi_good.y - oroi_full.y;
1137 #ifdef _OPENMP
1138 #pragma omp parallel for default(none) \
1139 dt_omp_firstprivate(opitch, origin_x, origin_y, out_bpp, ovoid) \
1140 shared(ooffs, output, oroi_good, oroi_full) \
1141 schedule(static)
1142 #endif
1143 for(size_t j = 0; j < oroi_good.height; j++)
1144 memcpy((char *)ovoid + ooffs + j * opitch,
1145 (char *)output + ((j + origin_y) * oroi_full.width + origin_x) * out_bpp,
1146 (size_t)oroi_good.width * out_bpp);
1147
1148 dt_free_align(input);
1149 dt_free_align(output);
1150 input = output = NULL;
1151 }
1152
1153 /* copy back final processed_maximum */
1154 for(int k = 0; k < 4; k++) piece->pipe->dsc.processed_maximum[k] = processed_maximum_new[k];
1155
1156 if(input != NULL) dt_free_align(input);
1157 if(output != NULL) dt_free_align(output);
1158 piece->pipe->tiling = 0;
1159 return;
1160
1161 error:
1162 dt_control_log(_("tiling failed for module '%s'. output might be garbled."), self->op);
1163 // fall through
1164
1165 fallback:
1166 if(input != NULL) dt_free_align(input);
1167 if(output != NULL) dt_free_align(output);
1168 piece->pipe->tiling = 0;
1169 dt_print(DT_DEBUG_DEV, "[default_process_tiling_roi] fall back to standard processing for module '%s'\n",
1170 self->op);
1171 self->process(self, piece, ivoid, ovoid, roi_in, roi_out);
1172 return;
1173 }
1174
1175
1176
1177 /* if a module does not implement process_tiling() by itself, this function is called instead.
1178 _default_process_tiling_ptp() is able to handle standard cases where pixels do not change their places.
1179 _default_process_tiling_roi() takes care of all other cases where image gets distorted and for module
1180 "clipping",
1181 "flip" as this may flip or mirror the image. */
default_process_tiling(struct dt_iop_module_t * self,struct dt_dev_pixelpipe_iop_t * piece,const void * const ivoid,void * const ovoid,const dt_iop_roi_t * const roi_in,const dt_iop_roi_t * const roi_out,const int in_bpp)1182 void default_process_tiling(struct dt_iop_module_t *self, struct dt_dev_pixelpipe_iop_t *piece,
1183 const void *const ivoid, void *const ovoid, const dt_iop_roi_t *const roi_in,
1184 const dt_iop_roi_t *const roi_out, const int in_bpp)
1185 {
1186 if(memcmp(roi_in, roi_out, sizeof(struct dt_iop_roi_t)) || (self->flags() & IOP_FLAGS_TILING_FULL_ROI))
1187 _default_process_tiling_roi(self, piece, ivoid, ovoid, roi_in, roi_out, in_bpp);
1188 else
1189 _default_process_tiling_ptp(self, piece, ivoid, ovoid, roi_in, roi_out, in_bpp);
1190 return;
1191 }
1192
1193
1194
1195 #ifdef HAVE_OPENCL
1196 /* simple tiling algorithm for roi_in == roi_out, i.e. for pixel to pixel modules/operations */
_default_process_tiling_cl_ptp(struct dt_iop_module_t * self,struct dt_dev_pixelpipe_iop_t * piece,const void * const ivoid,void * const ovoid,const dt_iop_roi_t * const roi_in,const dt_iop_roi_t * const roi_out,const int in_bpp)1197 static int _default_process_tiling_cl_ptp(struct dt_iop_module_t *self, struct dt_dev_pixelpipe_iop_t *piece,
1198 const void *const ivoid, void *const ovoid,
1199 const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out,
1200 const int in_bpp)
1201 {
1202 cl_int err = -999;
1203 cl_mem input = NULL;
1204 cl_mem output = NULL;
1205 cl_mem pinned_input = NULL;
1206 cl_mem pinned_output = NULL;
1207 void *input_buffer = NULL;
1208 void *output_buffer = NULL;
1209
1210 dt_iop_buffer_dsc_t dsc;
1211 self->output_format(self, piece->pipe, piece, &dsc);
1212 const int out_bpp = dt_iop_buffer_dsc_to_bpp(&dsc);
1213
1214 const int devid = piece->pipe->devid;
1215 const int ipitch = roi_in->width * in_bpp;
1216 const int opitch = roi_out->width * out_bpp;
1217 const int max_bpp = _max(in_bpp, out_bpp);
1218
1219 /* get tiling requirements of module */
1220 dt_develop_tiling_t tiling = { 0 };
1221 self->tiling_callback(self, piece, roi_in, roi_out, &tiling);
1222
1223 /* shall we use pinned memory transfers? */
1224 int use_pinned_memory = dt_conf_get_bool("opencl_use_pinned_memory");
1225 const int pinned_buffer_overhead = use_pinned_memory ? 2 : 0; // add two additional pinned memory buffers
1226 // which seemingly get allocated not only on
1227 // host but also on device (why???)
1228 const float pinned_buffer_slack
1229 = use_pinned_memory
1230 ? 0.85f
1231 : 1.0f; // avoid problems when pinned buffer size gets too close to max_mem_alloc size
1232
1233 /* calculate optimal size of tiles */
1234 float headroom = dt_conf_get_float("opencl_memory_headroom") * 1024.0f * 1024.0f;
1235 headroom = fmin(fmax(headroom, 0.0f), (float)darktable.opencl->dev[devid].max_global_mem);
1236 const float available = darktable.opencl->dev[devid].max_global_mem - headroom;
1237 const float factor = fmax(tiling.factor_cl + pinned_buffer_overhead, 1.0f);
1238 const float singlebuffer = fmin(fmax((available - tiling.overhead) / factor, 0.0f),
1239 pinned_buffer_slack * darktable.opencl->dev[devid].max_mem_alloc);
1240 const float maxbuf = fmax(tiling.maxbuf_cl, 1.0f);
1241 int width = _min(roi_in->width, darktable.opencl->dev[devid].max_image_width);
1242 int height = _min(roi_in->height, darktable.opencl->dev[devid].max_image_height);
1243
1244 /* shrink tile size in case it would exceed singlebuffer size */
1245 if((float)width * height * max_bpp * maxbuf > singlebuffer)
1246 {
1247 const float scale = singlebuffer / ((float)width * height * max_bpp * maxbuf);
1248
1249 if(width < height && scale >= 0.333f)
1250 {
1251 height = floorf(height * scale);
1252 }
1253 else if(height <= width && scale >= 0.333f)
1254 {
1255 width = floorf(width * scale);
1256 }
1257 else
1258 {
1259 width = floorf(width * sqrtf(scale));
1260 height = floorf(height * sqrtf(scale));
1261 }
1262 }
1263
1264 /* make sure we have a reasonably effective tile dimension. if not try square tiles */
1265 if(3 * tiling.overlap > width || 3 * tiling.overlap > height)
1266 {
1267 width = height = floorf(sqrtf((float)width * height));
1268 }
1269
1270
1271 /* Alignment rules: we need to make sure that alignment requirements of module are fulfilled.
1272 Modules will report alignment requirements via xalign and yalign within tiling_callback().
1273 Typical use case is demosaic where Bayer pattern requires alignment to a multiple of 2 in x and y
1274 direction. Additional alignment requirements are set via definition of CL_ALIGNMENT.
1275 We guarantee alignment by selecting image width/height and overlap accordingly. For a tile width/height
1276 that is identical to image width/height no special alignment is done. */
1277
1278 /* for simplicity reasons we use only one alignment that fits to x and y requirements at the same time */
1279 const unsigned int xyalign = _lcm(tiling.xalign, tiling.yalign);
1280
1281 /* determining alignment requirement for tile width/height.
1282 in case of tile width also align according to definition of CL_ALIGNMENT */
1283 const unsigned int walign = _lcm(xyalign, CL_ALIGNMENT);
1284 const unsigned int halign = xyalign;
1285
1286 assert(xyalign != 0 && walign != 0 && halign != 0);
1287
1288 /* properly align tile width and height by making them smaller if needed */
1289 if(width < roi_in->width) width = (width / walign) * walign;
1290 if(height < roi_in->height) height = (height / halign) * halign;
1291
1292 /* also make sure that overlap follows alignment rules by making it wider when needed */
1293 const int overlap = tiling.overlap % xyalign != 0 ? (tiling.overlap / xyalign + 1) * xyalign
1294 : tiling.overlap;
1295
1296
1297 /* calculate effective tile size */
1298 const int tile_wd = width - 2 * overlap > 0 ? width - 2 * overlap : 1;
1299 const int tile_ht = height - 2 * overlap > 0 ? height - 2 * overlap : 1;
1300
1301
1302 /* calculate number of tiles */
1303 const int tiles_x = width < roi_in->width ? ceilf(roi_in->width / (float)tile_wd) : 1;
1304 const int tiles_y = height < roi_in->height ? ceilf(roi_in->height / (float)tile_ht) : 1;
1305
1306 /* sanity check: don't run wild on too many tiles */
1307 if(tiles_x * tiles_y > dt_conf_get_int("maximum_number_tiles"))
1308 {
1309 dt_print(DT_DEBUG_OPENCL,
1310 "[default_process_tiling_cl_ptp] aborted tiling for module '%s'. too many tiles: %d x %d\n",
1311 self->op, tiles_x, tiles_y);
1312 return FALSE;
1313 }
1314
1315
1316 dt_print(DT_DEBUG_OPENCL,
1317 "[default_process_tiling_cl_ptp] use tiling on module '%s' for image with full size %d x %d\n",
1318 self->op, roi_in->width, roi_in->height);
1319 dt_print(DT_DEBUG_OPENCL,
1320 "[default_process_tiling_cl_ptp] (%d x %d) tiles with max dimensions %d x %d and overlap %d\n",
1321 tiles_x, tiles_y, width, height, overlap);
1322
1323 /* store processed_maximum to be re-used and aggregated */
1324 float processed_maximum_saved[4];
1325 float processed_maximum_new[4] = { 1.0f };
1326 for(int k = 0; k < 4; k++) processed_maximum_saved[k] = piece->pipe->dsc.processed_maximum[k];
1327
1328 /* reserve pinned input and output memory for host<->device data transfer */
1329 if(use_pinned_memory)
1330 {
1331 pinned_input = dt_opencl_alloc_device_buffer_with_flags(devid, (size_t)width * height * in_bpp,
1332 CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR);
1333 if(pinned_input == NULL)
1334 {
1335 dt_print(DT_DEBUG_OPENCL,
1336 "[default_process_tiling_cl_ptp] could not alloc pinned input buffer for module '%s'\n",
1337 self->op);
1338 use_pinned_memory = 0;
1339 }
1340 }
1341
1342 if(use_pinned_memory)
1343 {
1344
1345 input_buffer = dt_opencl_map_buffer(devid, pinned_input, CL_TRUE, CL_MAP_WRITE, 0,
1346 (size_t)width * height * in_bpp);
1347 if(input_buffer == NULL)
1348 {
1349 dt_print(DT_DEBUG_OPENCL, "[default_process_tiling_cl_ptp] could not map pinned input buffer to host "
1350 "memory for module '%s'\n",
1351 self->op);
1352 use_pinned_memory = 0;
1353 }
1354 }
1355
1356 if(use_pinned_memory)
1357 {
1358
1359 pinned_output = dt_opencl_alloc_device_buffer_with_flags(devid, (size_t)width * height * out_bpp,
1360 CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR);
1361 if(pinned_output == NULL)
1362 {
1363 dt_print(DT_DEBUG_OPENCL,
1364 "[default_process_tiling_cl_ptp] could not alloc pinned output buffer for module '%s'\n",
1365 self->op);
1366 use_pinned_memory = 0;
1367 }
1368 }
1369
1370 if(use_pinned_memory)
1371 {
1372
1373 output_buffer = dt_opencl_map_buffer(devid, pinned_output, CL_TRUE, CL_MAP_READ, 0,
1374 (size_t)width * height * out_bpp);
1375 if(output_buffer == NULL)
1376 {
1377 dt_print(DT_DEBUG_OPENCL, "[default_process_tiling_cl_ptp] could not map pinned output buffer to host "
1378 "memory for module '%s'\n",
1379 self->op);
1380 use_pinned_memory = 0;
1381 }
1382 }
1383
1384 /* iterate over tiles */
1385 for(size_t tx = 0; tx < tiles_x; tx++)
1386 for(size_t ty = 0; ty < tiles_y; ty++)
1387 {
1388 piece->pipe->tiling = 1;
1389
1390 const size_t wd = tx * tile_wd + width > roi_in->width ? roi_in->width - tx * tile_wd : width;
1391 const size_t ht = ty * tile_ht + height > roi_in->height ? roi_in->height - ty * tile_ht : height;
1392
1393 /* no need to process (end)tiles that are smaller than the total overlap area */
1394 if((wd <= 2 * overlap && tx > 0) || (ht <= 2 * overlap && ty > 0)) continue;
1395
1396 /* origin and region of effective part of tile, which we want to store later */
1397 size_t origin[] = { 0, 0, 0 };
1398 size_t region[] = { wd, ht, 1 };
1399
1400 /* roi_in and roi_out for process_cl on subbuffer */
1401 dt_iop_roi_t iroi = { roi_in->x + tx * tile_wd, roi_in->y + ty * tile_ht, wd, ht, roi_in->scale };
1402 dt_iop_roi_t oroi = { roi_out->x + tx * tile_wd, roi_out->y + ty * tile_ht, wd, ht, roi_out->scale };
1403
1404
1405 /* offsets of tile into ivoid and ovoid */
1406 const size_t ioffs = (ty * tile_ht) * ipitch + (tx * tile_wd) * in_bpp;
1407 size_t ooffs = (ty * tile_ht) * opitch + (tx * tile_wd) * out_bpp;
1408
1409
1410 dt_print(DT_DEBUG_OPENCL,
1411 "[default_process_tiling_cl_ptp] tile (%zu, %zu) with %zu x %zu at origin [%zu, %zu]\n", tx, ty, wd,
1412 ht, tx * tile_wd, ty * tile_ht);
1413
1414 /* get input and output buffers */
1415 input = dt_opencl_alloc_device(devid, wd, ht, in_bpp);
1416 if(input == NULL) goto error;
1417 output = dt_opencl_alloc_device(devid, wd, ht, out_bpp);
1418 if(output == NULL) goto error;
1419
1420 if(use_pinned_memory)
1421 {
1422 /* prepare pinned input tile buffer: copy part of input image */
1423 #ifdef _OPENMP
1424 #pragma omp parallel for default(none) \
1425 dt_omp_firstprivate(in_bpp, ipitch, ivoid) \
1426 dt_omp_sharedconst(ioffs, wd, ht) shared(input_buffer, width) \
1427 schedule(static)
1428 #endif
1429 for(size_t j = 0; j < ht; j++)
1430 memcpy((char *)input_buffer + j * wd * in_bpp, (char *)ivoid + ioffs + j * ipitch,
1431 (size_t)wd * in_bpp);
1432
1433 /* blocking memory transfer: pinned host input buffer -> opencl/device tile */
1434 err = dt_opencl_write_host_to_device_raw(devid, (char *)input_buffer, input, origin, region,
1435 wd * in_bpp, CL_TRUE);
1436 if(err != CL_SUCCESS) goto error;
1437 }
1438 else
1439 {
1440 /* blocking direct memory transfer: host input image -> opencl/device tile */
1441 err = dt_opencl_write_host_to_device_raw(devid, (char *)ivoid + ioffs, input, origin, region, ipitch,
1442 CL_TRUE);
1443 if(err != CL_SUCCESS) goto error;
1444 }
1445
1446 /* take original processed_maximum as starting point */
1447 for(int k = 0; k < 4; k++) piece->pipe->dsc.processed_maximum[k] = processed_maximum_saved[k];
1448
1449 /* call process_cl of module */
1450 if(!self->process_cl(self, piece, input, output, &iroi, &oroi)) goto error;
1451
1452 /* aggregate resulting processed_maximum */
1453 /* TODO: check if there really can be differences between tiles and take
1454 appropriate action (calculate minimum, maximum, average, ...?) */
1455 for(int k = 0; k < 4; k++)
1456 {
1457 if(tx + ty > 0 && fabs(processed_maximum_new[k] - piece->pipe->dsc.processed_maximum[k]) > 1.0e-6f)
1458 dt_print(
1459 DT_DEBUG_OPENCL,
1460 "[default_process_tiling_cl_ptp] processed_maximum[%d] differs between tiles in module '%s'\n",
1461 k, self->op);
1462 processed_maximum_new[k] = piece->pipe->dsc.processed_maximum[k];
1463 }
1464
1465 if(use_pinned_memory)
1466 {
1467 /* blocking memory transfer: complete opencl/device tile -> pinned host output buffer */
1468 err = dt_opencl_read_host_from_device_raw(devid, (char *)output_buffer, output, origin, region,
1469 wd * out_bpp, CL_TRUE);
1470 if(err != CL_SUCCESS) goto error;
1471 }
1472
1473 /* correct origin and region of tile for overlap.
1474 makes sure that we only copy back the "good" part. */
1475 if(tx > 0)
1476 {
1477 origin[0] += overlap;
1478 region[0] -= overlap;
1479 ooffs += (size_t)overlap * out_bpp;
1480 }
1481 if(ty > 0)
1482 {
1483 origin[1] += overlap;
1484 region[1] -= overlap;
1485 ooffs += (size_t)overlap * opitch;
1486 }
1487
1488 if(use_pinned_memory)
1489 {
1490 /* copy "good" part of tile from pinned output buffer to output image */
1491 #if 0 // def _OPENMP
1492 #pragma omp parallel for default(none) shared(ovoid, ooffs, output_buffer, width, origin, region, \
1493 wd) schedule(static)
1494 #endif
1495 for(size_t j = 0; j < region[1]; j++)
1496 memcpy((char *)ovoid + ooffs + j * opitch,
1497 (char *)output_buffer + ((j + origin[1]) * wd + origin[0]) * out_bpp,
1498 (size_t)region[0] * out_bpp);
1499 }
1500 else
1501 {
1502 /* blocking direct memory transfer: good part of opencl/device tile -> host output image */
1503 err = dt_opencl_read_host_from_device_raw(devid, (char *)ovoid + ooffs, output, origin, region,
1504 opitch, CL_TRUE);
1505 if(err != CL_SUCCESS) goto error;
1506 }
1507
1508 /* release input and output buffers */
1509 dt_opencl_release_mem_object(input);
1510 input = NULL;
1511 dt_opencl_release_mem_object(output);
1512 output = NULL;
1513
1514 /* block until opencl queue has finished to free all used event handlers */
1515 if(!darktable.opencl->async_pixelpipe || piece->pipe->type == DT_DEV_PIXELPIPE_EXPORT)
1516 dt_opencl_finish(devid);
1517 }
1518
1519 /* copy back final processed_maximum */
1520 for(int k = 0; k < 4; k++) piece->pipe->dsc.processed_maximum[k] = processed_maximum_new[k];
1521
1522 if(input_buffer != NULL) dt_opencl_unmap_mem_object(devid, pinned_input, input_buffer);
1523 dt_opencl_release_mem_object(pinned_input);
1524 if(output_buffer != NULL) dt_opencl_unmap_mem_object(devid, pinned_output, output_buffer);
1525 dt_opencl_release_mem_object(pinned_output);
1526 dt_opencl_release_mem_object(input);
1527 dt_opencl_release_mem_object(output);
1528 piece->pipe->tiling = 0;
1529 return TRUE;
1530
1531 error:
1532 /* copy back stored processed_maximum */
1533 for(int k = 0; k < 4; k++) piece->pipe->dsc.processed_maximum[k] = processed_maximum_saved[k];
1534 if(input_buffer != NULL) dt_opencl_unmap_mem_object(devid, pinned_input, input_buffer);
1535 dt_opencl_release_mem_object(pinned_input);
1536 if(output_buffer != NULL) dt_opencl_unmap_mem_object(devid, pinned_output, output_buffer);
1537 dt_opencl_release_mem_object(pinned_output);
1538 dt_opencl_release_mem_object(input);
1539 dt_opencl_release_mem_object(output);
1540 piece->pipe->tiling = 0;
1541 dt_print(
1542 DT_DEBUG_OPENCL,
1543 "[default_process_tiling_opencl_ptp] couldn't run process_cl() for module '%s' in tiling mode: %d\n",
1544 self->op, err);
1545 return FALSE;
1546 }
1547
1548
1549 /* more elaborate tiling algorithm for roi_in != roi_out: slower than the ptp variant,
1550 more tiles and larger overlap */
_default_process_tiling_cl_roi(struct dt_iop_module_t * self,struct dt_dev_pixelpipe_iop_t * piece,const void * const ivoid,void * const ovoid,const dt_iop_roi_t * const roi_in,const dt_iop_roi_t * const roi_out,const int in_bpp)1551 static int _default_process_tiling_cl_roi(struct dt_iop_module_t *self, struct dt_dev_pixelpipe_iop_t *piece,
1552 const void *const ivoid, void *const ovoid,
1553 const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out,
1554 const int in_bpp)
1555 {
1556 cl_int err = -999;
1557 cl_mem input = NULL;
1558 cl_mem output = NULL;
1559 cl_mem pinned_input = NULL;
1560 cl_mem pinned_output = NULL;
1561 void *input_buffer = NULL;
1562 void *output_buffer = NULL;
1563
1564
1565 //_print_roi(roi_in, "module roi_in");
1566 //_print_roi(roi_out, "module roi_out");
1567
1568 dt_iop_buffer_dsc_t dsc;
1569 self->output_format(self, piece->pipe, piece, &dsc);
1570 const int out_bpp = dt_iop_buffer_dsc_to_bpp(&dsc);
1571
1572 const int devid = piece->pipe->devid;
1573 const int ipitch = roi_in->width * in_bpp;
1574 const int opitch = roi_out->width * out_bpp;
1575 const int max_bpp = _max(in_bpp, out_bpp);
1576
1577 const float fullscale = fmax(roi_in->scale / roi_out->scale, sqrtf(((float)roi_in->width * roi_in->height)
1578 / ((float)roi_out->width * roi_out->height)));
1579
1580 /* inaccuracy for roi_in elements in roi_out -> roi_in calculations */
1581 const int delta = ceilf(fullscale);
1582
1583 /* estimate for additional (space) requirement in buffer dimensions due to inaccuracies */
1584 const int inacc = RESERVE * delta;
1585
1586 /* get tiling requirements of module */
1587 dt_develop_tiling_t tiling = { 0 };
1588 self->tiling_callback(self, piece, roi_in, roi_out, &tiling);
1589
1590 /* shall we use pinned memory transfers? */
1591 int use_pinned_memory = dt_conf_get_bool("opencl_use_pinned_memory");
1592 const int pinned_buffer_overhead = use_pinned_memory ? 2 : 0; // add two additional pinned memory buffers
1593 // which seemingly get allocated not only on
1594 // host but also on device (why???)
1595 const float pinned_buffer_slack
1596 = use_pinned_memory
1597 ? 0.85f
1598 : 1.0f; // avoid problems when pinned buffer size gets too close to max_mem_alloc size
1599
1600 /* calculate optimal size of tiles */
1601 float headroom = dt_conf_get_float("opencl_memory_headroom") * 1024.0f * 1024.0f;
1602 headroom = fmin(fmax(headroom, 0.0f), (float)darktable.opencl->dev[devid].max_global_mem);
1603 const float available = darktable.opencl->dev[devid].max_global_mem - headroom;
1604 const float factor = fmax(tiling.factor_cl + pinned_buffer_overhead, 1.0f);
1605 const float singlebuffer = fmin(fmax((available - tiling.overhead) / factor, 0.0f),
1606 pinned_buffer_slack * darktable.opencl->dev[devid].max_mem_alloc);
1607 const float maxbuf = fmax(tiling.maxbuf_cl, 1.0f);
1608
1609 int width = _min(_max(roi_in->width, roi_out->width), darktable.opencl->dev[devid].max_image_width);
1610 int height = _min(_max(roi_in->height, roi_out->height), darktable.opencl->dev[devid].max_image_height);
1611
1612 /* shrink tile size in case it would exceed singlebuffer size */
1613 if((float)width * height * max_bpp * maxbuf > singlebuffer)
1614 {
1615 const float scale = singlebuffer / ((float)width * height * max_bpp * maxbuf);
1616
1617 if(width < height && scale >= 0.333f)
1618 {
1619 height = floorf(height * scale);
1620 }
1621 else if(height <= width && scale >= 0.333f)
1622 {
1623 width = floorf(width * scale);
1624 }
1625 else
1626 {
1627 width = floorf(width * sqrtf(scale));
1628 height = floorf(height * sqrtf(scale));
1629 }
1630 }
1631
1632 /* make sure we have a reasonably effective tile dimension. if not try square tiles */
1633 if(3 * tiling.overlap > width || 3 * tiling.overlap > height)
1634 {
1635 width = height = floorf(sqrtf((float)width * height));
1636 }
1637
1638
1639 /* Alignment rules: we need to make sure that alignment requirements of module are fulfilled.
1640 Modules will report alignment requirements via xalign and yalign within tiling_callback().
1641 Typical use case is demosaic where Bayer pattern requires alignment to a multiple of 2 in x and y
1642 direction. Additional alignment requirements are set via definition of CL_ALIGNMENT. */
1643
1644 /* for simplicity reasons we use only one alignment that fits to x and y requirements at the same time */
1645 unsigned int xyalign = _lcm(tiling.xalign, tiling.yalign);
1646 xyalign = _lcm(xyalign, CL_ALIGNMENT);
1647
1648 assert(xyalign != 0);
1649
1650 /* make sure that overlap follows alignment rules by making it wider when needed.
1651 overlap_in needs to be aligned, overlap_out is only here to calculate output buffer size */
1652 const int overlap_in = _align_up(tiling.overlap, xyalign);
1653 const int overlap_out = ceilf((float)overlap_in / fullscale);
1654
1655 int tiles_x = 1, tiles_y = 1;
1656
1657 /* calculate number of tiles taking the larger buffer (input or output) as a guiding one.
1658 normally it is roi_in > roi_out; but let's be prepared */
1659 if(roi_in->width > roi_out->width)
1660 tiles_x = width < roi_in->width
1661 ? ceilf((float)roi_in->width / (float)_max(width - 2 * overlap_in - inacc, 1))
1662 : 1;
1663 else
1664 tiles_x = width < roi_out->width ? ceilf((float)roi_out->width / (float)_max(width - 2 * overlap_out, 1))
1665 : 1;
1666
1667 if(roi_in->height > roi_out->height)
1668 tiles_y = height < roi_in->height
1669 ? ceilf((float)roi_in->height / (float)_max(height - 2 * overlap_in - inacc, 1))
1670 : 1;
1671 else
1672 tiles_y = height < roi_out->height
1673 ? ceilf((float)roi_out->height / (float)_max(height - 2 * overlap_out, 1))
1674 : 1;
1675
1676 /* sanity check: don't run wild on too many tiles */
1677 if(tiles_x * tiles_y > dt_conf_get_int("maximum_number_tiles"))
1678 {
1679 dt_print(DT_DEBUG_OPENCL,
1680 "[default_process_tiling_cl_roi] aborted tiling for module '%s'. too many tiles: %d x %d\n",
1681 self->op, tiles_x, tiles_y);
1682 return FALSE;
1683 }
1684
1685 /* calculate tile width and height excl. overlap (i.e. the good part) for output.
1686 important for all following processing steps. */
1687 const int tile_wd = _align_up(
1688 roi_out->width % tiles_x == 0 ? roi_out->width / tiles_x : roi_out->width / tiles_x + 1, xyalign);
1689 const int tile_ht = _align_up(
1690 roi_out->height % tiles_y == 0 ? roi_out->height / tiles_y : roi_out->height / tiles_y + 1, xyalign);
1691
1692
1693 dt_print(
1694 DT_DEBUG_OPENCL,
1695 "[default_process_tiling_cl_roi] use tiling on module '%s' for image with full input size %d x %d\n",
1696 self->op, roi_in->width, roi_in->height);
1697 dt_print(DT_DEBUG_OPENCL,
1698 "[default_process_tiling_cl_roi] (%d x %d) tiles with max input dimensions %d x %d\n", tiles_x,
1699 tiles_y, width, height);
1700
1701
1702 /* store processed_maximum to be re-used and aggregated */
1703 float processed_maximum_saved[4];
1704 float processed_maximum_new[4] = { 1.0f };
1705 for(int k = 0; k < 4; k++) processed_maximum_saved[k] = piece->pipe->dsc.processed_maximum[k];
1706
1707 /* reserve pinned input and output memory for host<->device data transfer */
1708 if(use_pinned_memory)
1709 {
1710 pinned_input = dt_opencl_alloc_device_buffer_with_flags(devid, (size_t)width * height * in_bpp,
1711 CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR);
1712 if(pinned_input == NULL)
1713 {
1714 dt_print(DT_DEBUG_OPENCL,
1715 "[default_process_tiling_cl_roi] could not alloc pinned input buffer for module '%s'\n",
1716 self->op);
1717 use_pinned_memory = 0;
1718 }
1719 }
1720
1721 if(use_pinned_memory)
1722 {
1723
1724 input_buffer = dt_opencl_map_buffer(devid, pinned_input, CL_TRUE, CL_MAP_WRITE, 0,
1725 (size_t)width * height * in_bpp);
1726 if(input_buffer == NULL)
1727 {
1728 dt_print(DT_DEBUG_OPENCL, "[default_process_tiling_cl_roi] could not map pinned input buffer to host "
1729 "memory for module '%s'\n",
1730 self->op);
1731 use_pinned_memory = 0;
1732 }
1733 }
1734
1735 if(use_pinned_memory)
1736 {
1737
1738 pinned_output = dt_opencl_alloc_device_buffer_with_flags(devid, (size_t)width * height * out_bpp,
1739 CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR);
1740 if(pinned_output == NULL)
1741 {
1742 dt_print(DT_DEBUG_OPENCL,
1743 "[default_process_tiling_cl_roi] could not alloc pinned output buffer for module '%s'\n",
1744 self->op);
1745 use_pinned_memory = 0;
1746 }
1747 }
1748
1749 if(use_pinned_memory)
1750 {
1751
1752 output_buffer = dt_opencl_map_buffer(devid, pinned_output, CL_TRUE, CL_MAP_READ, 0,
1753 (size_t)width * height * out_bpp);
1754 if(output_buffer == NULL)
1755 {
1756 dt_print(DT_DEBUG_OPENCL, "[default_process_tiling_cl_roi] could not map pinned output buffer to host "
1757 "memory for module '%s'\n",
1758 self->op);
1759 use_pinned_memory = 0;
1760 }
1761 }
1762
1763
1764 /* iterate over tiles */
1765 for(size_t tx = 0; tx < tiles_x; tx++)
1766 for(size_t ty = 0; ty < tiles_y; ty++)
1767 {
1768 piece->pipe->tiling = 1;
1769
1770 /* the output dimensions of the good part of this specific tile */
1771 const size_t wd = (tx + 1) * tile_wd > roi_out->width ? roi_out->width - tx * tile_wd : tile_wd;
1772 const size_t ht = (ty + 1) * tile_ht > roi_out->height ? roi_out->height - ty * tile_ht : tile_ht;
1773
1774 /* roi_in and roi_out of good part: oroi_good easy to calculate based on number and dimension of tile.
1775 iroi_good is calculated by modify_roi_in() of respective module */
1776 dt_iop_roi_t iroi_good = { roi_in->x + tx * tile_wd, roi_in->y + ty * tile_ht, wd, ht, roi_in->scale };
1777 dt_iop_roi_t oroi_good
1778 = { roi_out->x + tx * tile_wd, roi_out->y + ty * tile_ht, wd, ht, roi_out->scale };
1779
1780 self->modify_roi_in(self, piece, &oroi_good, &iroi_good);
1781
1782 /* clamp iroi_good to not exceed roi_in */
1783 iroi_good.x = _max(iroi_good.x, roi_in->x);
1784 iroi_good.y = _max(iroi_good.y, roi_in->y);
1785 iroi_good.width = _min(iroi_good.width, roi_in->width + roi_in->x - iroi_good.x);
1786 iroi_good.height = _min(iroi_good.height, roi_in->height + roi_in->y - iroi_good.y);
1787
1788 //_print_roi(&iroi_good, "tile iroi_good");
1789 //_print_roi(&oroi_good, "tile oroi_good");
1790
1791 /* now we need to calculate full region of this tile: increase input roi to take care of overlap
1792 requirements
1793 and alignment and add additional delta to correct for possible rounding errors in modify_roi_in()
1794 -> generates first estimate of iroi_full */
1795 const int x_in = iroi_good.x;
1796 const int y_in = iroi_good.y;
1797 const int width_in = iroi_good.width;
1798 const int height_in = iroi_good.height;
1799 const int new_x_in = _max(_align_down(x_in - overlap_in - delta, xyalign), roi_in->x);
1800 const int new_y_in = _max(_align_down(y_in - overlap_in - delta, xyalign), roi_in->y);
1801 const int new_width_in = _min(_align_up(width_in + overlap_in + delta + (x_in - new_x_in), xyalign),
1802 roi_in->width + roi_in->x - new_x_in);
1803 const int new_height_in = _min(_align_up(height_in + overlap_in + delta + (y_in - new_y_in), xyalign),
1804 roi_in->height + roi_in->y - new_y_in);
1805
1806 /* iroi_full based on calculated numbers and dimensions. oroi_full just set as a starting point for the
1807 * following iterative search */
1808 dt_iop_roi_t iroi_full = { new_x_in, new_y_in, new_width_in, new_height_in, iroi_good.scale };
1809 dt_iop_roi_t oroi_full = oroi_good; // a good starting point for optimization
1810
1811 //_print_roi(&iroi_full, "tile iroi_full before optimization");
1812 //_print_roi(&oroi_full, "tile oroi_full before optimization");
1813
1814 /* try to find a matching oroi_full */
1815 if(!_fit_output_to_input_roi(self, piece, &iroi_full, &oroi_full, delta, 10))
1816 {
1817 dt_print(DT_DEBUG_OPENCL, "[default_process_tiling_cl_roi] can not handle requested roi's. tiling "
1818 "for module '%s' not possible.\n",
1819 self->op);
1820 goto error;
1821 }
1822
1823
1824 /* make sure that oroi_full at least covers the range of oroi_good.
1825 this step is needed due to the possibility of rounding errors */
1826 oroi_full.x = _min(oroi_full.x, oroi_good.x);
1827 oroi_full.y = _min(oroi_full.y, oroi_good.y);
1828 oroi_full.width = _max(oroi_full.width, oroi_good.x + oroi_good.width - oroi_full.x);
1829 oroi_full.height = _max(oroi_full.height, oroi_good.y + oroi_good.height - oroi_full.y);
1830
1831 /* clamp oroi_full to not exceed roi_out */
1832 oroi_full.x = _max(oroi_full.x, roi_out->x);
1833 oroi_full.y = _max(oroi_full.y, roi_out->y);
1834 oroi_full.width = _min(oroi_full.width, roi_out->width + roi_out->x - oroi_full.x);
1835 oroi_full.height = _min(oroi_full.height, roi_out->height + roi_out->y - oroi_full.y);
1836
1837
1838 /* calculate final iroi_full */
1839 self->modify_roi_in(self, piece, &oroi_full, &iroi_full);
1840
1841 /* clamp iroi_full to not exceed roi_in */
1842 iroi_full.x = _max(iroi_full.x, roi_in->x);
1843 iroi_full.y = _max(iroi_full.y, roi_in->y);
1844 iroi_full.width = _min(iroi_full.width, roi_in->width + roi_in->x - iroi_full.x);
1845 iroi_full.height = _min(iroi_full.height, roi_in->height + roi_in->y - iroi_full.y);
1846
1847 //_print_roi(&iroi_full, "tile iroi_full");
1848 //_print_roi(&oroi_full, "tile oroi_full");
1849
1850 /* offsets of tile into ivoid and ovoid */
1851 const size_t ioffs = ((size_t)iroi_full.y - roi_in->y) * ipitch + ((size_t)iroi_full.x - roi_in->x) * in_bpp;
1852 const size_t ooffs = ((size_t)oroi_good.y - roi_out->y) * opitch
1853 + ((size_t)oroi_good.x - roi_out->x) * out_bpp;
1854
1855 dt_print(DT_DEBUG_OPENCL,
1856 "[default_process_tiling_cl_roi] tile (%zu, %zu) with %d x %d at origin [%d, %d]\n", tx, ty,
1857 iroi_full.width, iroi_full.height, iroi_full.x, iroi_full.y);
1858
1859 /* origin and region of full input tile */
1860 size_t iorigin[] = { 0, 0, 0 };
1861 size_t iregion[] = { iroi_full.width, iroi_full.height, 1 };
1862
1863 /* origin and region of full output tile */
1864 size_t oforigin[] = { 0, 0, 0 };
1865 size_t ofregion[] = { oroi_full.width, oroi_full.height, 1 };
1866
1867 /* origin and region of good part of output tile */
1868 size_t oorigin[] = { oroi_good.x - oroi_full.x, oroi_good.y - oroi_full.y, 0 };
1869 size_t oregion[] = { oroi_good.width, oroi_good.height, 1 };
1870
1871 /* get opencl input and output buffers */
1872 input = dt_opencl_alloc_device(devid, iroi_full.width, iroi_full.height, in_bpp);
1873 if(input == NULL) goto error;
1874
1875 output = dt_opencl_alloc_device(devid, oroi_full.width, oroi_full.height, out_bpp);
1876 if(output == NULL) goto error;
1877
1878 if(use_pinned_memory)
1879 {
1880 /* prepare pinned input tile buffer: copy part of input image */
1881 #ifdef _OPENMP
1882 #pragma omp parallel for default(none) \
1883 dt_omp_firstprivate(in_bpp, ipitch, ivoid) \
1884 dt_omp_sharedconst(ioffs) shared(input_buffer, width, iroi_full) schedule(static)
1885 #endif
1886 for(size_t j = 0; j < iroi_full.height; j++)
1887 memcpy((char *)input_buffer + j * iroi_full.width * in_bpp, (char *)ivoid + ioffs + j * ipitch,
1888 (size_t)iroi_full.width * in_bpp);
1889
1890 /* blocking memory transfer: pinned host input buffer -> opencl/device tile */
1891 err = dt_opencl_write_host_to_device_raw(devid, (char *)input_buffer, input, iorigin, iregion,
1892 (size_t)iroi_full.width * in_bpp, CL_TRUE);
1893 if(err != CL_SUCCESS) goto error;
1894 }
1895 else
1896 {
1897 /* blocking direct memory transfer: host input image -> opencl/device tile */
1898 err = dt_opencl_write_host_to_device_raw(devid, (char *)ivoid + ioffs, input, iorigin, iregion,
1899 ipitch, CL_TRUE);
1900 if(err != CL_SUCCESS) goto error;
1901 }
1902
1903 /* take original processed_maximum as starting point */
1904 for(int k = 0; k < 4; k++) piece->pipe->dsc.processed_maximum[k] = processed_maximum_saved[k];
1905
1906 /* call process_cl of module */
1907 if(!self->process_cl(self, piece, input, output, &iroi_full, &oroi_full)) goto error;
1908
1909 /* aggregate resulting processed_maximum */
1910 /* TODO: check if there really can be differences between tiles and take
1911 appropriate action (calculate minimum, maximum, average, ...?) */
1912 for(int k = 0; k < 4; k++)
1913 {
1914 if(tx + ty > 0 && fabs(processed_maximum_new[k] - piece->pipe->dsc.processed_maximum[k]) > 1.0e-6f)
1915 dt_print(
1916 DT_DEBUG_OPENCL,
1917 "[default_process_tiling_cl_roi] processed_maximum[%d] differs between tiles in module '%s'\n",
1918 k, self->op);
1919 processed_maximum_new[k] = piece->pipe->dsc.processed_maximum[k];
1920 }
1921
1922 if(use_pinned_memory)
1923 {
1924 /* blocking memory transfer: complete opencl/device tile -> pinned host output buffer */
1925 err = dt_opencl_read_host_from_device_raw(devid, (char *)output_buffer, output, oforigin, ofregion,
1926 (size_t)oroi_full.width * out_bpp, CL_TRUE);
1927 if(err != CL_SUCCESS) goto error;
1928
1929 /* copy "good" part of tile from pinned output buffer to output image */
1930 #ifdef _OPENMP
1931 #pragma omp parallel for default(none) \
1932 dt_omp_firstprivate(ipitch, opitch, ovoid, out_bpp) \
1933 dt_omp_sharedconst(ooffs) shared(output_buffer, oroi_full, oorigin, oregion) \
1934 schedule(static)
1935 #endif
1936 for(size_t j = 0; j < oregion[1]; j++)
1937 memcpy((char *)ovoid + ooffs + j * opitch,
1938 (char *)output_buffer + ((j + oorigin[1]) * oroi_full.width + oorigin[0]) * out_bpp,
1939 (size_t)oregion[0] * out_bpp);
1940 }
1941 else
1942 {
1943 /* blocking direct memory transfer: good part of opencl/device tile -> host output image */
1944 err = dt_opencl_read_host_from_device_raw(devid, (char *)ovoid + ooffs, output, oorigin, oregion,
1945 opitch, CL_TRUE);
1946 if(err != CL_SUCCESS) goto error;
1947 }
1948
1949 /* release input and output buffers */
1950 dt_opencl_release_mem_object(input);
1951 input = NULL;
1952 dt_opencl_release_mem_object(output);
1953 output = NULL;
1954
1955 /* block until opencl queue has finished to free all used event handlers */
1956 if(!darktable.opencl->async_pixelpipe || piece->pipe->type == DT_DEV_PIXELPIPE_EXPORT)
1957 dt_opencl_finish(devid);
1958 }
1959
1960 /* copy back final processed_maximum */
1961 for(int k = 0; k < 4; k++) piece->pipe->dsc.processed_maximum[k] = processed_maximum_new[k];
1962 if(input_buffer != NULL) dt_opencl_unmap_mem_object(devid, pinned_input, input_buffer);
1963 dt_opencl_release_mem_object(pinned_input);
1964 if(output_buffer != NULL) dt_opencl_unmap_mem_object(devid, pinned_output, output_buffer);
1965 dt_opencl_release_mem_object(pinned_output);
1966 dt_opencl_release_mem_object(input);
1967 dt_opencl_release_mem_object(output);
1968 piece->pipe->tiling = 0;
1969 return TRUE;
1970
1971 error:
1972 /* copy back stored processed_maximum */
1973 for(int k = 0; k < 4; k++) piece->pipe->dsc.processed_maximum[k] = processed_maximum_saved[k];
1974 if(input_buffer != NULL) dt_opencl_unmap_mem_object(devid, pinned_input, input_buffer);
1975 dt_opencl_release_mem_object(pinned_input);
1976 if(output_buffer != NULL) dt_opencl_unmap_mem_object(devid, pinned_output, output_buffer);
1977 dt_opencl_release_mem_object(pinned_output);
1978 dt_opencl_release_mem_object(input);
1979 dt_opencl_release_mem_object(output);
1980 piece->pipe->tiling = 0;
1981 dt_print(
1982 DT_DEBUG_OPENCL,
1983 "[default_process_tiling_opencl_roi] couldn't run process_cl() for module '%s' in tiling mode: %d\n",
1984 self->op, err);
1985 return FALSE;
1986 }
1987
1988
1989
1990 /* if a module does not implement process_tiling_cl() by itself, this function is called instead.
1991 _default_process_tiling_cl_ptp() is able to handle standard cases where pixels do not change their places.
1992 _default_process_tiling_cl_roi() takes care of all other cases where image gets distorted. */
default_process_tiling_cl(struct dt_iop_module_t * self,struct dt_dev_pixelpipe_iop_t * piece,const void * const ivoid,void * const ovoid,const dt_iop_roi_t * const roi_in,const dt_iop_roi_t * const roi_out,const int in_bpp)1993 int default_process_tiling_cl(struct dt_iop_module_t *self, struct dt_dev_pixelpipe_iop_t *piece,
1994 const void *const ivoid, void *const ovoid, const dt_iop_roi_t *const roi_in,
1995 const dt_iop_roi_t *const roi_out, const int in_bpp)
1996 {
1997 if(memcmp(roi_in, roi_out, sizeof(struct dt_iop_roi_t)) || (self->flags() & IOP_FLAGS_TILING_FULL_ROI))
1998 return _default_process_tiling_cl_roi(self, piece, ivoid, ovoid, roi_in, roi_out, in_bpp);
1999 else
2000 return _default_process_tiling_cl_ptp(self, piece, ivoid, ovoid, roi_in, roi_out, in_bpp);
2001 }
2002
2003 #else
default_process_tiling_cl(struct dt_iop_module_t * self,struct dt_dev_pixelpipe_iop_t * piece,const void * const ivoid,void * const ovoid,const dt_iop_roi_t * const roi_in,const dt_iop_roi_t * const roi_out,const int in_bpp)2004 int default_process_tiling_cl(struct dt_iop_module_t *self, struct dt_dev_pixelpipe_iop_t *piece,
2005 const void *const ivoid, void *const ovoid, const dt_iop_roi_t *const roi_in,
2006 const dt_iop_roi_t *const roi_out, const int in_bpp)
2007 {
2008 return FALSE;
2009 }
2010 #endif
2011
2012
2013 /* If a module does not implement tiling_callback() by itself, this function is called instead.
2014 Default is an image size factor of 2 (i.e. input + output buffer needed), no overhead (1),
2015 no overlap between tiles, and an pixel alignment of 1 in x and y direction, i.e. no special
2016 alignment required. Simple pixel to pixel modules (take tonecurve as an example) can happily
2017 live with that.
2018 (1) Small overhead like look-up-tables in tonecurve can be ignored safely. */
default_tiling_callback(struct dt_iop_module_t * self,struct dt_dev_pixelpipe_iop_t * piece,const dt_iop_roi_t * roi_in,const dt_iop_roi_t * roi_out,struct dt_develop_tiling_t * tiling)2019 void default_tiling_callback(struct dt_iop_module_t *self, struct dt_dev_pixelpipe_iop_t *piece,
2020 const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out,
2021 struct dt_develop_tiling_t *tiling)
2022 {
2023 const float ioratio
2024 = ((float)roi_out->width * (float)roi_out->height) / ((float)roi_in->width * (float)roi_in->height);
2025
2026 tiling->factor = 1.0f + ioratio;
2027 tiling->factor_cl = tiling->factor; // by default, we need the same memory on host or GPU
2028 tiling->maxbuf = 1.0f;
2029 tiling->maxbuf_cl = tiling->maxbuf;
2030 tiling->overhead = 0;
2031 tiling->overlap = 0;
2032 tiling->xalign = 1;
2033 tiling->yalign = 1;
2034
2035 if((self->flags() & IOP_FLAGS_TILING_FULL_ROI) == IOP_FLAGS_TILING_FULL_ROI) tiling->overlap = 4;
2036
2037 if(self->iop_order > dt_ioppr_get_iop_order(piece->pipe->iop_order_list, "demosaic", 0)) return;
2038
2039 // all operations that work with mosaiced data should respect pattern size!
2040
2041 if(!piece->pipe->dsc.filters) return;
2042
2043 if(piece->pipe->dsc.filters == 9u)
2044 {
2045 // X-Trans, sensor is 6x6
2046 tiling->xalign = 6;
2047 tiling->yalign = 6;
2048 }
2049 else
2050 {
2051 // Bayer, good old 2x2
2052 tiling->xalign = 2;
2053 tiling->yalign = 2;
2054 }
2055
2056 return;
2057 }
2058
dt_tiling_piece_fits_host_memory(const size_t width,const size_t height,const unsigned bpp,const float factor,const size_t overhead)2059 int dt_tiling_piece_fits_host_memory(const size_t width, const size_t height, const unsigned bpp,
2060 const float factor, const size_t overhead)
2061 {
2062 static int host_memory_limit = -1;
2063
2064 /* first time run */
2065 if(host_memory_limit < 0)
2066 {
2067 host_memory_limit = dt_conf_get_int("host_memory_limit");
2068
2069 /* don't let the user play games with us */
2070 if(host_memory_limit != 0) host_memory_limit = CLAMPI(host_memory_limit, 500, 50000);
2071 dt_conf_set_int("host_memory_limit", host_memory_limit);
2072 }
2073
2074 const float requirement = factor * width * height * bpp + overhead;
2075
2076 if(host_memory_limit == 0 || requirement <= host_memory_limit * 1024.0f * 1024.0f)
2077 return TRUE;
2078 else
2079 return FALSE;
2080 }
2081
2082 // modelines: These editor modelines have been set for all relevant files by tools/update_modelines.sh
2083 // vim: shiftwidth=2 expandtab tabstop=2 cindent
2084 // kate: tab-indents: off; indent-width 2; replace-tabs on; indent-mode cstyle; remove-trailing-spaces modified;
2085