1 /*
2 This file is part of darktable,
3 Copyright (C) 2011-2020 darktable developers.
4
5 darktable is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3 of the License, or
8 (at your option) any later version.
9
10 darktable is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with darktable. If not, see <http://www.gnu.org/licenses/>.
17 */
18
19
20 #include "develop/tiling.h"
21 #include "common/opencl.h"
22 #include "control/control.h"
23 #include "develop/blend.h"
24 #include "develop/pixelpipe.h"
25
26 #include <assert.h>
27 #include <math.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <strings.h>
31 #include <unistd.h>
32
33 #define CLAMPI(a, mn, mx) ((a) < (mn) ? (mn) : ((a) > (mx) ? (mx) : (a)))
34
35
36 /* this defines an additional alignment requirement for opencl image width.
37 It can have strong effects on processing speed. Reasonable values are a
38 power of 2. set to 1 for no effect. */
39 #define CL_ALIGNMENT 4
40
41 /* parameter RESERVE for extended roi_in sizes due to inaccuracies when doing
42 roi_out -> roi_in estimations.
43 Needs to be increased if tiling fails due to insufficient buffer sizes. */
44 #define RESERVE 5
45
46
47 /* greatest common divisor */
_gcd(unsigned a,unsigned b)48 static unsigned _gcd(unsigned a, unsigned b)
49 {
50 unsigned t;
51 while(b != 0)
52 {
53 t = b;
54 b = a % b;
55 a = t;
56 }
57 return MAX(a, 1);
58 }
59
60 /* least common multiple */
_lcm(unsigned a,unsigned b)61 static unsigned _lcm(unsigned a, unsigned b)
62 {
63 return (((unsigned long)a * b) / _gcd(a, b));
64 }
65
66
_min(int a,int b)67 static inline int _min(int a, int b)
68 {
69 return a < b ? a : b;
70 }
71
_max(int a,int b)72 static inline int _max(int a, int b)
73 {
74 return a > b ? a : b;
75 }
76
77
_align_up(int n,int a)78 static inline int _align_up(int n, int a)
79 {
80 return n % a != 0 ? (n / a + 1) * a : n;
81 }
82
_align_down(int n,int a)83 static inline int _align_down(int n, int a)
84 {
85 return n % a != 0 ? (n / a) * a : n;
86 }
87
88
_print_roi(const dt_iop_roi_t * roi,const char * label)89 void _print_roi(const dt_iop_roi_t *roi, const char *label)
90 {
91 printf("{ %5d %5d %5d %5d %.6f } %s\n", roi->x, roi->y, roi->width, roi->height, roi->scale, label);
92 }
93
94
95 #if 0
96 static void
97 _nm_constraints(double x[], int n)
98 {
99 x[0] = fabs(x[0]);
100 x[1] = fabs(x[1]);
101 x[2] = fabs(x[2]);
102 x[3] = fabs(x[3]);
103
104 if(x[0] > 1.0) x[0] = 1.0 - x[0];
105 if(x[1] > 1.0) x[1] = 1.0 - x[1];
106 if(x[2] > 1.0) x[2] = 1.0 - x[2];
107 if(x[3] > 1.0) x[3] = 1.0 - x[3];
108
109 }
110 #endif
111
_nm_fitness(double x[],void * rest[])112 static double _nm_fitness(double x[], void *rest[])
113 {
114 struct dt_iop_module_t *self = (struct dt_iop_module_t *)rest[0];
115 struct dt_dev_pixelpipe_iop_t *piece = (struct dt_dev_pixelpipe_iop_t *)rest[1];
116 struct dt_iop_roi_t *iroi = (struct dt_iop_roi_t *)rest[2];
117 struct dt_iop_roi_t *oroi = (struct dt_iop_roi_t *)rest[3];
118
119 dt_iop_roi_t oroi_test = *oroi;
120 oroi_test.x = x[0] * piece->iwidth;
121 oroi_test.y = x[1] * piece->iheight;
122 oroi_test.width = x[2] * piece->iwidth;
123 oroi_test.height = x[3] * piece->iheight;
124
125 dt_iop_roi_t iroi_probe = *iroi;
126 self->modify_roi_in(self, piece, &oroi_test, &iroi_probe);
127
128 double fitness = 0.0;
129
130 fitness += (double)(iroi_probe.x - iroi->x) * (iroi_probe.x - iroi->x);
131 fitness += (double)(iroi_probe.y - iroi->y) * (iroi_probe.y - iroi->y);
132 fitness += (double)(iroi_probe.width - iroi->width) * (iroi_probe.width - iroi->width);
133 fitness += (double)(iroi_probe.height - iroi->height) * (iroi_probe.height - iroi->height);
134
135 return fitness;
136 }
137
138
139 /* We use a Nelder-Mead simplex algorithm based on an implementation of Michael F. Hutt.
140 It is covered by the following copyright notice: */
141 /*
142 * Program: nmsimplex.c
143 * Author : Michael F. Hutt
144 * http://www.mikehutt.com
145 * 11/3/97
146 *
147 * An implementation of the Nelder-Mead simplex method.
148 *
149 * Copyright (c) 1997-2011 <Michael F. Hutt>
150 *
151 * Permission is hereby granted, free of charge, to any person obtaining
152 * a copy of this software and associated documentation files (the
153 * "Software"), to deal in the Software without restriction, including
154 * without limitation the rights to use, copy, modify, merge, publish,
155 * distribute, sublicense, and/or sell copies of the Software, and to
156 * permit persons to whom the Software is furnished to do so, subject to
157 * the following conditions:
158 *
159 * The above copyright notice and this permission notice shall be
160 * included in all copies or substantial portions of the Software.
161 *
162 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
163 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
164 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
165 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
166 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
167 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
168 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
169 *
170 */
171
172 #define MAX_IT 1000 /* maximum number of iterations */
173 #define ALPHA 1.0 /* reflection coefficient */
174 #define BETA 0.5 /* contraction coefficient */
175 #define GAMMA 2.0 /* expansion coefficient */
176
_simplex(double (* objfunc)(double[],void * []),double start[],int n,double EPSILON,double scale,int maxiter,void (* constrain)(double[],int n),void * rest[])177 static int _simplex(double (*objfunc)(double[], void *[]), double start[], int n, double EPSILON,
178 double scale, int maxiter, void (*constrain)(double[], int n), void *rest[])
179 {
180
181 int vs; /* vertex with smallest value */
182 int vh; /* vertex with next smallest value */
183 int vg; /* vertex with largest value */
184
185 int i, j = 0, m, row;
186 int k; /* track the number of function evaluations */
187 int itr; /* track the number of iterations */
188
189 double **v; /* holds vertices of simplex */
190 double pn, qn; /* values used to create initial simplex */
191 double *f; /* value of function at each vertex */
192 double fr; /* value of function at reflection point */
193 double fe; /* value of function at expansion point */
194 double fc; /* value of function at contraction point */
195 double *vr; /* reflection - coordinates */
196 double *ve; /* expansion - coordinates */
197 double *vc; /* contraction - coordinates */
198 double *vm; /* centroid - coordinates */
199
200 double fsum, favg, s, cent;
201
202 /* dynamically allocate arrays */
203
204 /* allocate the rows of the arrays */
205 v = (double **)malloc(sizeof(double *) * (n + 1));
206 f = (double *)malloc(sizeof(double) * (n + 1));
207 vr = (double *)malloc(sizeof(double) * n);
208 ve = (double *)malloc(sizeof(double) * n);
209 vc = (double *)malloc(sizeof(double) * n);
210 vm = (double *)malloc(sizeof(double) * n);
211
212 /* allocate the columns of the arrays */
213 for(i = 0; i <= n; i++)
214 {
215 v[i] = (double *)malloc(sizeof(double) * n);
216 }
217
218 /* create the initial simplex */
219 /* assume one of the vertices is 0,0 */
220
221 pn = scale * (sqrt(n + 1) - 1 + n) / (n * sqrt(2));
222 qn = scale * (sqrt(n + 1) - 1) / (n * sqrt(2));
223
224 for(i = 0; i < n; i++)
225 {
226 v[0][i] = start[i];
227 }
228
229 for(i = 1; i <= n; i++)
230 {
231 for(j = 0; j < n; j++)
232 {
233 if(i - 1 == j)
234 {
235 v[i][j] = pn + start[j];
236 }
237 else
238 {
239 v[i][j] = qn + start[j];
240 }
241 }
242 }
243
244 if(constrain != NULL)
245 {
246 constrain(v[j], n);
247 }
248 /* find the initial function values */
249 for(j = 0; j <= n; j++)
250 {
251 f[j] = objfunc(v[j], rest);
252 }
253
254 k = n + 1;
255
256 #if 0
257 /* print out the initial values */
258 printf ("Initial Values\n");
259 for (j = 0; j <= n; j++)
260 {
261 for (i = 0; i < n; i++)
262 {
263 printf ("%f %f\n", v[j][i], f[j]);
264 }
265 }
266 #endif
267
268 /* begin the main loop of the minimization */
269 for(itr = 1; itr <= maxiter; itr++)
270 {
271 /* find the index of the largest value */
272 vg = 0;
273 for(j = 0; j <= n; j++)
274 {
275 if(f[j] > f[vg])
276 {
277 vg = j;
278 }
279 }
280
281 /* find the index of the smallest value */
282 vs = 0;
283 for(j = 0; j <= n; j++)
284 {
285 if(f[j] < f[vs])
286 {
287 vs = j;
288 }
289 }
290
291 /* find the index of the second largest value */
292 vh = vs;
293 for(j = 0; j <= n; j++)
294 {
295 if(f[j] > f[vh] && f[j] < f[vg])
296 {
297 vh = j;
298 }
299 }
300
301 /* calculate the centroid */
302 for(j = 0; j <= n - 1; j++)
303 {
304 cent = 0.0;
305 for(m = 0; m <= n; m++)
306 {
307 if(m != vg)
308 {
309 cent += v[m][j];
310 }
311 }
312 vm[j] = cent / n;
313 }
314
315 /* reflect vg to new vertex vr */
316 for(j = 0; j <= n - 1; j++)
317 {
318 /*vr[j] = (1+ALPHA)*vm[j] - ALPHA*v[vg][j]; */
319 vr[j] = vm[j] + ALPHA * (vm[j] - v[vg][j]);
320 }
321 if(constrain != NULL)
322 {
323 constrain(vr, n);
324 }
325 fr = objfunc(vr, rest);
326 k++;
327
328 if(fr < f[vh] && fr >= f[vs])
329 {
330 for(j = 0; j <= n - 1; j++)
331 {
332 v[vg][j] = vr[j];
333 }
334 f[vg] = fr;
335 }
336
337 /* investigate a step further in this direction */
338 if(fr < f[vs])
339 {
340 for(j = 0; j <= n - 1; j++)
341 {
342 /*ve[j] = GAMMA*vr[j] + (1-GAMMA)*vm[j]; */
343 ve[j] = vm[j] + GAMMA * (vr[j] - vm[j]);
344 }
345 if(constrain != NULL)
346 {
347 constrain(ve, n);
348 }
349 fe = objfunc(ve, rest);
350 k++;
351
352 /* by making fe < fr as opposed to fe < f[vs],
353 Rosenbrocks function takes 63 iterations as opposed
354 to 64 when using double variables. */
355
356 if(fe < fr)
357 {
358 for(j = 0; j <= n - 1; j++)
359 {
360 v[vg][j] = ve[j];
361 }
362 f[vg] = fe;
363 }
364 else
365 {
366 for(j = 0; j <= n - 1; j++)
367 {
368 v[vg][j] = vr[j];
369 }
370 f[vg] = fr;
371 }
372 }
373
374 /* check to see if a contraction is necessary */
375 if(fr >= f[vh])
376 {
377 if(fr < f[vg] && fr >= f[vh])
378 {
379 /* perform outside contraction */
380 for(j = 0; j <= n - 1; j++)
381 {
382 /*vc[j] = BETA*v[vg][j] + (1-BETA)*vm[j]; */
383 vc[j] = vm[j] + BETA * (vr[j] - vm[j]);
384 }
385 if(constrain != NULL)
386 {
387 constrain(vc, n);
388 }
389 fc = objfunc(vc, rest);
390 k++;
391 }
392 else
393 {
394 /* perform inside contraction */
395 for(j = 0; j <= n - 1; j++)
396 {
397 /*vc[j] = BETA*v[vg][j] + (1-BETA)*vm[j]; */
398 vc[j] = vm[j] - BETA * (vm[j] - v[vg][j]);
399 }
400 if(constrain != NULL)
401 {
402 constrain(vc, n);
403 }
404 fc = objfunc(vc, rest);
405 k++;
406 }
407
408
409 if(fc < f[vg])
410 {
411 for(j = 0; j <= n - 1; j++)
412 {
413 v[vg][j] = vc[j];
414 }
415 f[vg] = fc;
416 }
417 /* at this point the contraction is not successful,
418 we must halve the distance from vs to all the
419 vertices of the simplex and then continue.
420 10/31/97 - modified to account for ALL vertices.
421 */
422 else
423 {
424 for(row = 0; row <= n; row++)
425 {
426 if(row != vs)
427 {
428 for(j = 0; j <= n - 1; j++)
429 {
430 v[row][j] = v[vs][j] + (v[row][j] - v[vs][j]) / 2.0;
431 }
432 }
433 }
434 if(constrain != NULL)
435 {
436 constrain(v[vg], n);
437 }
438 f[vg] = objfunc(v[vg], rest);
439 k++;
440 if(constrain != NULL)
441 {
442 constrain(v[vh], n);
443 }
444 f[vh] = objfunc(v[vh], rest);
445 k++;
446 }
447 }
448
449 #if 0
450 /* print out the value at each iteration */
451 printf ("Iteration %d\n", itr);
452 for (j = 0; j <= n; j++)
453 {
454 for (i = 0; i < n; i++)
455 {
456 printf ("%f %f\n", v[j][i], f[j]);
457 }
458 }
459 #endif
460
461 /* test for convergence */
462 fsum = 0.0;
463 for(j = 0; j <= n; j++)
464 {
465 fsum += f[j];
466 }
467 favg = fsum / (n + 1);
468 s = 0.0;
469 for(j = 0; j <= n; j++)
470 {
471 s += pow((f[j] - favg), 2.0) / (n);
472 }
473 s = sqrt(s);
474 if(s < EPSILON) break;
475 }
476 /* end main loop of the minimization */
477
478 /* find the index of the smallest value */
479 vs = 0;
480 for(j = 0; j <= n; j++)
481 {
482 if(f[j] < f[vs])
483 {
484 vs = j;
485 }
486 }
487
488 #if 0
489 printf ("The minimum was found at\n");
490 for (j = 0; j < n; j++)
491 {
492 printf ("%e\n", v[vs][j]);
493 start[j] = v[vs][j];
494 }
495 double min = objfunc (v[vs], rest);
496 printf ("Function value at minimum %f\n", min);
497 k++;
498 printf ("%d Function Evaluations\n", k);
499 printf ("%d Iterations through program\n", itr);
500 #endif
501
502 free(f);
503 free(vr);
504 free(ve);
505 free(vc);
506 free(vm);
507 for(i = 0; i <= n; i++)
508 {
509 free(v[i]);
510 }
511 free(v);
512 return itr;
513 }
514
515
_nm_fit_output_to_input_roi(struct dt_iop_module_t * self,struct dt_dev_pixelpipe_iop_t * piece,const dt_iop_roi_t * iroi,dt_iop_roi_t * oroi,int delta)516 static int _nm_fit_output_to_input_roi(struct dt_iop_module_t *self, struct dt_dev_pixelpipe_iop_t *piece,
517 const dt_iop_roi_t *iroi, dt_iop_roi_t *oroi, int delta)
518 {
519 void *rest[4] = { (void *)self, (void *)piece, (void *)iroi, (void *)oroi };
520 double start[4] = { (float)oroi->x / piece->iwidth, (float)oroi->y / piece->iheight,
521 (float)oroi->width / piece->iwidth, (float)oroi->height / piece->iheight };
522 double epsilon = (double)delta / MIN(piece->iwidth, piece->iheight);
523 int maxiter = 1000;
524
525 int iter = _simplex(_nm_fitness, start, 4, epsilon, 1.0, maxiter, NULL, rest);
526
527 // printf("_simplex: %d, delta: %d, epsilon: %f\n", iter, delta, epsilon);
528
529 oroi->x = start[0] * piece->iwidth;
530 oroi->y = start[1] * piece->iheight;
531 oroi->width = start[2] * piece->iwidth;
532 oroi->height = start[3] * piece->iheight;
533
534 return (iter <= maxiter);
535 }
536
537
538
539 /* find a matching oroi_full by probing start value of oroi and get corresponding input roi into iroi_probe.
540 We search in two steps. first by a simplicistic iterative search which will succeed in most cases.
541 If this does not converge, we do a downhill simplex (nelder-mead) fitting */
_fit_output_to_input_roi(struct dt_iop_module_t * self,struct dt_dev_pixelpipe_iop_t * piece,const dt_iop_roi_t * iroi,dt_iop_roi_t * oroi,int delta,int iter)542 static int _fit_output_to_input_roi(struct dt_iop_module_t *self, struct dt_dev_pixelpipe_iop_t *piece,
543 const dt_iop_roi_t *iroi, dt_iop_roi_t *oroi, int delta, int iter)
544 {
545 dt_iop_roi_t iroi_probe = *iroi;
546 dt_iop_roi_t save_oroi = *oroi;
547
548 // try to go the easy way. this works in many cases where output is
549 // just like input, only scaled down
550 self->modify_roi_in(self, piece, oroi, &iroi_probe);
551 while((abs((int)iroi_probe.x - (int)iroi->x) > delta || abs((int)iroi_probe.y - (int)iroi->y) > delta
552 || abs((int)iroi_probe.width - (int)iroi->width) > delta
553 || abs((int)iroi_probe.height - (int)iroi->height) > delta) && iter > 0)
554 {
555 //_print_roi(&iroi_probe, "tile iroi_probe");
556 //_print_roi(oroi, "tile oroi old");
557
558 oroi->x += (iroi->x - iroi_probe.x) * oroi->scale / iroi->scale;
559 oroi->y += (iroi->y - iroi_probe.y) * oroi->scale / iroi->scale;
560 oroi->width += (iroi->width - iroi_probe.width) * oroi->scale / iroi->scale;
561 oroi->height += (iroi->height - iroi_probe.height) * oroi->scale / iroi->scale;
562
563 //_print_roi(oroi, "tile oroi new");
564
565 self->modify_roi_in(self, piece, oroi, &iroi_probe);
566 iter--;
567 }
568
569 if(iter > 0) return TRUE;
570
571 *oroi = save_oroi;
572
573 // simplicistic approach did not converge.
574 // try simplex downhill fitting now.
575 // it's crucial that we have a good starting point in oroi, else this
576 // will not converge as well.
577 int fit = _nm_fit_output_to_input_roi(self, piece, iroi, oroi, delta);
578 return fit;
579 }
580
581
582 /* simple tiling algorithm for roi_in == roi_out, i.e. for pixel to pixel modules/operations */
_default_process_tiling_ptp(struct dt_iop_module_t * self,struct dt_dev_pixelpipe_iop_t * piece,const void * const ivoid,void * const ovoid,const dt_iop_roi_t * const roi_in,const dt_iop_roi_t * const roi_out,const int in_bpp)583 static void _default_process_tiling_ptp(struct dt_iop_module_t *self, struct dt_dev_pixelpipe_iop_t *piece,
584 const void *const ivoid, void *const ovoid,
585 const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out,
586 const int in_bpp)
587 {
588 void *input = NULL;
589 void *output = NULL;
590 dt_iop_buffer_dsc_t dsc;
591 self->output_format(self, piece->pipe, piece, &dsc);
592 const int out_bpp = dt_iop_buffer_dsc_to_bpp(&dsc);
593
594 const int ipitch = roi_in->width * in_bpp;
595 const int opitch = roi_out->width * out_bpp;
596 const int max_bpp = _max(in_bpp, out_bpp);
597
598 /* get tiling requirements of module */
599 dt_develop_tiling_t tiling = { 0 };
600 self->tiling_callback(self, piece, roi_in, roi_out, &tiling);
601
602 /* shall we enforce tiling */
603 const gboolean force_tile = (darktable.unmuted & DT_DEBUG_TILING);
604 /* tiling really does not make sense in these cases. standard process() is not better or worse than we are
605 */
606 if((tiling.factor < 2.2f && tiling.overhead < 0.2f * roi_in->width * roi_in->height * max_bpp) && !force_tile)
607 {
608 dt_print(DT_DEBUG_DEV, "[default_process_tiling_ptp] no need to use tiling for module '%s' as no real "
609 "memory saving to be expected\n",
610 self->op);
611 goto fallback;
612 }
613
614 /* calculate optimal size of tiles */
615 float available = force_tile ? 500.0f * 1024.0f * 1024.0f : dt_conf_get_float("host_memory_limit") * 1024.0f * 1024.0f;
616 assert(available >= 500.0f * 1024.0f * 1024.0f);
617 /* correct for size of ivoid and ovoid which are needed on top of tiling */
618 available = fmax(available - ((float)roi_out->width * roi_out->height * out_bpp)
619 - ((float)roi_in->width * roi_in->height * in_bpp) - tiling.overhead,
620 0);
621
622 /* we ignore the above value if singlebuffer_limit (is defined and) is higher than available/tiling.factor.
623 this will mainly allow tiling for modules with high and "unpredictable" memory demand which is
624 reflected in high values of tiling.factor (take bilateral noise reduction as an example). */
625 float singlebuffer = force_tile ? 2.0f * 1024.0f * 1024.0f : dt_conf_get_float("singlebuffer_limit") * 1024.0f * 1024.0f;
626 singlebuffer = fmax(singlebuffer, 2.0f * 1024.0f * 1024.0f);
627 const float factor = fmax(tiling.factor, 1.0f);
628 const float maxbuf = fmax(tiling.maxbuf, 1.0f);
629 singlebuffer = fmax(available / factor, singlebuffer);
630
631 int width = roi_in->width;
632 int height = roi_in->height;
633
634 /* shrink tile size in case it would exceed singlebuffer size */
635 if((float)width * height * max_bpp * maxbuf > singlebuffer)
636 {
637 const float scale = singlebuffer / ((float)width * height * max_bpp * maxbuf);
638
639 /* TODO: can we make this more efficient to minimize total overlap between tiles? */
640 if(width < height && scale >= 0.333f)
641 {
642 height = floorf(height * scale);
643 }
644 else if(height <= width && scale >= 0.333f)
645 {
646 width = floorf(width * scale);
647 }
648 else
649 {
650 width = floorf(width * sqrtf(scale));
651 height = floorf(height * sqrtf(scale));
652 }
653 }
654
655 /* make sure we have a reasonably effective tile dimension. if not try square tiles */
656 if(3 * tiling.overlap > width || 3 * tiling.overlap > height)
657 {
658 width = height = floorf(sqrtf((float)width * height));
659 }
660
661 /* Alignment rules: we need to make sure that alignment requirements of module are fulfilled.
662 Modules will report alignment requirements via xalign and yalign within tiling_callback().
663 Typical use case is demosaic where Bayer pattern requires alignment to a multiple of 2 in x and y
664 direction.
665 We guarantee alignment by selecting image width/height and overlap accordingly. For a tile width/height
666 that is identical to image width/height no special alignment is needed. */
667
668 const unsigned int xyalign = _lcm(tiling.xalign, tiling.yalign);
669
670 assert(xyalign != 0);
671
672 /* properly align tile width and height by making them smaller if needed */
673 if(width < roi_in->width) width = (width / xyalign) * xyalign;
674 if(height < roi_in->height) height = (height / xyalign) * xyalign;
675
676 /* also make sure that overlap follows alignment rules by making it wider when needed */
677 const int overlap = tiling.overlap % xyalign != 0 ? (tiling.overlap / xyalign + 1) * xyalign
678 : tiling.overlap;
679
680 /* calculate effective tile size */
681 const int tile_wd = width - 2 * overlap > 0 ? width - 2 * overlap : 1;
682 const int tile_ht = height - 2 * overlap > 0 ? height - 2 * overlap : 1;
683
684 /* calculate number of tiles */
685 const int tiles_x = width < roi_in->width ? ceilf(roi_in->width / (float)tile_wd) : 1;
686 const int tiles_y = height < roi_in->height ? ceilf(roi_in->height / (float)tile_ht) : 1;
687
688 /* sanity check: don't run wild on too many tiles */
689 if(tiles_x * tiles_y > dt_conf_get_int("maximum_number_tiles"))
690 {
691 dt_print(DT_DEBUG_DEV,
692 "[default_process_tiling_ptp] gave up tiling for module '%s'. too many tiles: %d x %d\n",
693 self->op, tiles_x, tiles_y);
694 goto error;
695 }
696
697
698 dt_print(DT_DEBUG_DEV,
699 "[default_process_tiling_ptp] use tiling on module '%s' for image with full size %d x %d\n",
700 self->op, roi_in->width, roi_in->height);
701 dt_print(DT_DEBUG_DEV,
702 "[default_process_tiling_ptp] (%d x %d) tiles with max dimensions %d x %d and overlap %d\n",
703 tiles_x, tiles_y, width, height, overlap);
704
705 /* reserve input and output buffers for tiles */
706 input = dt_alloc_align(64, (size_t)width * height * in_bpp);
707 if(input == NULL)
708 {
709 dt_print(DT_DEBUG_DEV, "[default_process_tiling_ptp] could not alloc input buffer for module '%s'\n",
710 self->op);
711 goto error;
712 }
713 output = dt_alloc_align(64, (size_t)width * height * out_bpp);
714 if(output == NULL)
715 {
716 dt_print(DT_DEBUG_DEV, "[default_process_tiling_ptp] could not alloc output buffer for module '%s'\n",
717 self->op);
718 goto error;
719 }
720
721 /* store processed_maximum to be re-used and aggregated */
722 dt_aligned_pixel_t processed_maximum_saved;
723 dt_aligned_pixel_t processed_maximum_new = { 1.0f };
724 for_four_channels(k) processed_maximum_saved[k] = piece->pipe->dsc.processed_maximum[k];
725
726 /* iterate over tiles */
727 for(size_t tx = 0; tx < tiles_x; tx++)
728 {
729 const size_t wd = tx * tile_wd + width > roi_in->width ? roi_in->width - tx * tile_wd : width;
730 for(size_t ty = 0; ty < tiles_y; ty++)
731 {
732 piece->pipe->tiling = 1;
733
734 const size_t ht = ty * tile_ht + height > roi_in->height ? roi_in->height - ty * tile_ht : height;
735
736 /* no need to process end-tiles that are smaller than the total overlap area */
737 if((wd <= 2 * overlap && tx > 0) || (ht <= 2 * overlap && ty > 0)) continue;
738
739 /* origin and region of effective part of tile, which we want to store later */
740 size_t origin[] = { 0, 0, 0 };
741 size_t region[] = { wd, ht, 1 };
742
743 /* roi_in and roi_out for process_cl on subbuffer */
744 dt_iop_roi_t iroi = { roi_in->x + tx * tile_wd, roi_in->y + ty * tile_ht, wd, ht, roi_in->scale };
745 dt_iop_roi_t oroi = { roi_out->x + tx * tile_wd, roi_out->y + ty * tile_ht, wd, ht, roi_out->scale };
746
747 /* offsets of tile into ivoid and ovoid */
748 const size_t ioffs = (ty * tile_ht) * ipitch + (tx * tile_wd) * in_bpp;
749 size_t ooffs = (ty * tile_ht) * opitch + (tx * tile_wd) * out_bpp;
750
751
752 dt_print(DT_DEBUG_DEV, "[default_process_tiling_ptp] tile (%zu, %zu) with %zu x %zu at origin [%zu, %zu]\n",
753 tx, ty, wd, ht, tx * tile_wd, ty * tile_ht);
754
755 /* prepare input tile buffer */
756 #ifdef _OPENMP
757 #pragma omp parallel for default(none) \
758 dt_omp_firstprivate(ht, in_bpp, ipitch, ivoid, wd) \
759 dt_omp_sharedconst(ioffs) \
760 shared(input, width) \
761 schedule(static)
762 #endif
763 for(size_t j = 0; j < ht; j++)
764 memcpy((char *)input + j * wd * in_bpp, (char *)ivoid + ioffs + j * ipitch, (size_t)wd * in_bpp);
765
766 /* take original processed_maximum as starting point */
767 for(int k = 0; k < 4; k++) piece->pipe->dsc.processed_maximum[k] = processed_maximum_saved[k];
768
769 /* call process() of module */
770 self->process(self, piece, input, output, &iroi, &oroi);
771
772 /* aggregate resulting processed_maximum */
773 /* TODO: check if there really can be differences between tiles and take
774 appropriate action (calculate minimum, maximum, average, ...?) */
775 for(int k = 0; k < 4; k++)
776 {
777 if(tx + ty > 0 && fabs(processed_maximum_new[k] - piece->pipe->dsc.processed_maximum[k]) > 1.0e-6f)
778 dt_print(
779 DT_DEBUG_DEV,
780 "[default_process_tiling_ptp] processed_maximum[%d] differs between tiles in module '%s'\n", k,
781 self->op);
782 processed_maximum_new[k] = piece->pipe->dsc.processed_maximum[k];
783 }
784
785 /* correct origin and region of tile for overlap.
786 make sure that we only copy back the "good" part. */
787 if(tx > 0)
788 {
789 origin[0] += overlap;
790 region[0] -= overlap;
791 ooffs += (size_t)overlap * out_bpp;
792 }
793 if(ty > 0)
794 {
795 origin[1] += overlap;
796 region[1] -= overlap;
797 ooffs += (size_t)overlap * opitch;
798 }
799
800 /* copy "good" part of tile to output buffer */
801 #ifdef _OPENMP
802 #pragma omp parallel for default(none) \
803 dt_omp_firstprivate(opitch, out_bpp, ovoid, wd) \
804 shared(ooffs, output, width, origin, region) \
805 schedule(static)
806 #endif
807 for(size_t j = 0; j < region[1]; j++)
808 memcpy((char *)ovoid + ooffs + j * opitch,
809 (char *)output + ((j + origin[1]) * wd + origin[0]) * out_bpp, (size_t)region[0] * out_bpp);
810 }
811 }
812
813 /* copy back final processed_maximum */
814 for(int k = 0; k < 4; k++) piece->pipe->dsc.processed_maximum[k] = processed_maximum_new[k];
815
816 if(input != NULL) dt_free_align(input);
817 if(output != NULL) dt_free_align(output);
818 piece->pipe->tiling = 0;
819 return;
820
821 error:
822 dt_control_log(_("tiling failed for module '%s'. output might be garbled."), self->op);
823 // fall through
824
825 fallback:
826 if(input != NULL) dt_free_align(input);
827 if(output != NULL) dt_free_align(output);
828 piece->pipe->tiling = 0;
829 dt_print(DT_DEBUG_DEV, "[default_process_tiling_ptp] fall back to standard processing for module '%s'\n",
830 self->op);
831 self->process(self, piece, ivoid, ovoid, roi_in, roi_out);
832 return;
833 }
834
835
836
837 /* more elaborate tiling algorithm for roi_in != roi_out: slower than the ptp variant,
838 more tiles and larger overlap */
_default_process_tiling_roi(struct dt_iop_module_t * self,struct dt_dev_pixelpipe_iop_t * piece,const void * const ivoid,void * const ovoid,const dt_iop_roi_t * const roi_in,const dt_iop_roi_t * const roi_out,const int in_bpp)839 static void _default_process_tiling_roi(struct dt_iop_module_t *self, struct dt_dev_pixelpipe_iop_t *piece,
840 const void *const ivoid, void *const ovoid,
841 const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out,
842 const int in_bpp)
843 {
844 void *input = NULL;
845 void *output = NULL;
846
847 //_print_roi(roi_in, "module roi_in");
848 //_print_roi(roi_out, "module roi_out");
849
850 dt_iop_buffer_dsc_t dsc;
851 self->output_format(self, piece->pipe, piece, &dsc);
852 const int out_bpp = dt_iop_buffer_dsc_to_bpp(&dsc);
853
854 const int ipitch = roi_in->width * in_bpp;
855 const int opitch = roi_out->width * out_bpp;
856 const int max_bpp = _max(in_bpp, out_bpp);
857
858 float fullscale = fmax(roi_in->scale / roi_out->scale, sqrtf(((float)roi_in->width * roi_in->height)
859 / ((float)roi_out->width * roi_out->height)));
860
861 /* inaccuracy for roi_in elements in roi_out -> roi_in calculations */
862 const int delta = ceilf(fullscale);
863
864 /* estimate for additional (space) requirement in buffer dimensions due to inaccuracies */
865 const int inacc = RESERVE * delta;
866
867 /* get tiling requirements of module */
868 dt_develop_tiling_t tiling = { 0 };
869 self->tiling_callback(self, piece, roi_in, roi_out, &tiling);
870
871 /* shall we enforce tiling */
872 const gboolean force_tile = (darktable.unmuted & DT_DEBUG_TILING);
873 /* tiling really does not make sense in these cases. standard process() is not better or worse than we are
874 */
875 if((tiling.factor < 2.2f && tiling.overhead < 0.2f * roi_in->width * roi_in->height * max_bpp) && !force_tile)
876 {
877 dt_print(DT_DEBUG_DEV, "[default_process_tiling_roi] no need to use tiling for module '%s' as no real "
878 "memory saving to be expected\n",
879 self->op);
880 goto fallback;
881 }
882
883 /* calculate optimal size of tiles */
884 float available = force_tile ? 500.0f * 1024.0f * 1024.0f : dt_conf_get_float("host_memory_limit") * 1024.0f * 1024.0f;
885 assert(available >= 500.0f * 1024.0f * 1024.0f);
886 /* correct for size of ivoid and ovoid which are needed on top of tiling */
887 available = fmax(available - ((float)roi_out->width * roi_out->height * out_bpp)
888 - ((float)roi_in->width * roi_in->height * in_bpp) - tiling.overhead,
889 0);
890
891 /* we ignore the above value if singlebuffer_limit (is defined and) is higher than available/tiling.factor.
892 this will mainly allow tiling for modules with high and "unpredictable" memory demand which is
893 reflected in high values of tiling.factor (take bilateral noise reduction as an example). */
894 float singlebuffer = force_tile ? 2.0f * 1024.0f * 1024.0f : dt_conf_get_float("singlebuffer_limit") * 1024.0f * 1024.0f;
895 singlebuffer = fmax(singlebuffer, 2.0f * 1024.0f * 1024.0f);
896 const float factor = fmax(tiling.factor, 1.0f);
897 const float maxbuf = fmax(tiling.maxbuf, 1.0f);
898 singlebuffer = fmax(available / factor, singlebuffer);
899
900 int width = _max(roi_in->width, roi_out->width);
901 int height = _max(roi_in->height, roi_out->height);
902
903 /* shrink tile size in case it would exceed singlebuffer size */
904 if((float)width * height * max_bpp * maxbuf > singlebuffer)
905 {
906 const float scale = singlebuffer / ((float)width * height * max_bpp * maxbuf);
907
908 /* TODO: can we make this more efficient to minimize total overlap between tiles? */
909 if(width < height && scale >= 0.333f)
910 {
911 height = floorf(height * scale);
912 }
913 else if(height <= width && scale >= 0.333f)
914 {
915 width = floorf(width * scale);
916 }
917 else
918 {
919 width = floorf(width * sqrtf(scale));
920 height = floorf(height * sqrtf(scale));
921 }
922 }
923
924 /* make sure we have a reasonably effective tile dimension. if not try square tiles */
925 if(3 * tiling.overlap > width || 3 * tiling.overlap > height)
926 {
927 width = height = floorf(sqrtf((float)width * height));
928 }
929
930 /* Alignment rules: we need to make sure that alignment requirements of module are fulfilled.
931 Modules will report alignment requirements via xalign and yalign within tiling_callback().
932 Typical use case is demosaic where Bayer pattern requires alignment to a multiple of 2 in x and y
933 direction. */
934
935 /* for simplicity reasons we use only one alignment that fits to x and y requirements at the same time */
936 const unsigned int xyalign = _lcm(tiling.xalign, tiling.yalign);
937
938 assert(xyalign != 0);
939
940 /* make sure that overlap follows alignment rules by making it wider when needed.
941 overlap_in needs to be aligned, overlap_out is only here to calculate output buffer size */
942 const int overlap_in = _align_up(tiling.overlap, xyalign);
943 const int overlap_out = ceilf((float)overlap_in / fullscale);
944
945 int tiles_x = 1, tiles_y = 1;
946
947 /* calculate number of tiles taking the larger buffer (input or output) as a guiding one.
948 normally it is roi_in > roi_out; but let's be prepared */
949 if(roi_in->width > roi_out->width)
950 tiles_x = width < roi_in->width
951 ? ceilf((float)roi_in->width / (float)_max(width - 2 * overlap_in - inacc, 1))
952 : 1;
953 else
954 tiles_x = width < roi_out->width ? ceilf((float)roi_out->width / (float)_max(width - 2 * overlap_out, 1))
955 : 1;
956
957 if(roi_in->height > roi_out->height)
958 tiles_y = height < roi_in->height
959 ? ceilf((float)roi_in->height / (float)_max(height - 2 * overlap_in - inacc, 1))
960 : 1;
961 else
962 tiles_y = height < roi_out->height
963 ? ceilf((float)roi_out->height / (float)_max(height - 2 * overlap_out, 1))
964 : 1;
965
966 /* sanity check: don't run wild on too many tiles */
967 if(tiles_x * tiles_y > dt_conf_get_int("maximum_number_tiles"))
968 {
969 dt_print(DT_DEBUG_DEV,
970 "[default_process_tiling_roi] gave up tiling for module '%s'. too many tiles: %d x %d\n",
971 self->op, tiles_x, tiles_y);
972 goto error;
973 }
974
975
976 /* calculate tile width and height excl. overlap (i.e. the good part) for output.
977 values are important for all following processing steps. */
978 const int tile_wd = _align_up(
979 roi_out->width % tiles_x == 0 ? roi_out->width / tiles_x : roi_out->width / tiles_x + 1, xyalign);
980 const int tile_ht = _align_up(
981 roi_out->height % tiles_y == 0 ? roi_out->height / tiles_y : roi_out->height / tiles_y + 1, xyalign);
982
983 dt_print(DT_DEBUG_DEV,
984 "[default_process_tiling_roi] use tiling on module '%s' for image with full input size %d x %d\n",
985 self->op, roi_in->width, roi_in->height);
986 dt_print(DT_DEBUG_DEV, "[default_process_tiling_roi] (%d x %d) tiles with max dimensions %d x %d\n",
987 tiles_x, tiles_y, width, height);
988
989
990 /* store processed_maximum to be re-used and aggregated */
991 dt_aligned_pixel_t processed_maximum_saved;
992 dt_aligned_pixel_t processed_maximum_new = { 1.0f };
993 for_four_channels(k) processed_maximum_saved[k] = piece->pipe->dsc.processed_maximum[k];
994
995 /* iterate over tiles */
996 for(size_t tx = 0; tx < tiles_x; tx++)
997 for(size_t ty = 0; ty < tiles_y; ty++)
998 {
999 piece->pipe->tiling = 1;
1000
1001 /* the output dimensions of the good part of this specific tile */
1002 const size_t wd = (tx + 1) * tile_wd > roi_out->width ? (size_t)roi_out->width - tx * tile_wd : tile_wd;
1003 const size_t ht = (ty + 1) * tile_ht > roi_out->height ? (size_t)roi_out->height - ty * tile_ht : tile_ht;
1004
1005 /* roi_in and roi_out of good part: oroi_good easy to calculate based on number and dimension of tile.
1006 iroi_good is calculated by modify_roi_in() of respective module */
1007 dt_iop_roi_t iroi_good = { roi_in->x + tx * tile_wd, roi_in->y + ty * tile_ht, wd, ht, roi_in->scale };
1008 dt_iop_roi_t oroi_good
1009 = { roi_out->x + tx * tile_wd, roi_out->y + ty * tile_ht, wd, ht, roi_out->scale };
1010
1011 self->modify_roi_in(self, piece, &oroi_good, &iroi_good);
1012
1013 /* clamp iroi_good to not exceed roi_in */
1014 iroi_good.x = _max(iroi_good.x, roi_in->x);
1015 iroi_good.y = _max(iroi_good.y, roi_in->y);
1016 iroi_good.width = _min(iroi_good.width, roi_in->width + roi_in->x - iroi_good.x);
1017 iroi_good.height = _min(iroi_good.height, roi_in->height + roi_in->y - iroi_good.y);
1018
1019 //_print_roi(&iroi_good, "tile iroi_good");
1020 //_print_roi(&oroi_good, "tile oroi_good");
1021
1022 /* now we need to calculate full region of this tile: increase input roi to take care of overlap
1023 requirements
1024 and alignment and add additional delta to correct for possible rounding errors in modify_roi_in()
1025 -> generates first estimate of iroi_full */
1026 const int x_in = iroi_good.x;
1027 const int y_in = iroi_good.y;
1028 const int width_in = iroi_good.width;
1029 const int height_in = iroi_good.height;
1030 const int new_x_in = _max(_align_down(x_in - overlap_in - delta, xyalign), roi_in->x);
1031 const int new_y_in = _max(_align_down(y_in - overlap_in - delta, xyalign), roi_in->y);
1032 const int new_width_in = _min(_align_up(width_in + overlap_in + delta + (x_in - new_x_in), xyalign),
1033 roi_in->width + roi_in->x - new_x_in);
1034 const int new_height_in = _min(_align_up(height_in + overlap_in + delta + (y_in - new_y_in), xyalign),
1035 roi_in->height + roi_in->y - new_y_in);
1036
1037 /* iroi_full based on calculated numbers and dimensions. oroi_full just set as a starting point for the
1038 * following iterative search */
1039 dt_iop_roi_t iroi_full = { new_x_in, new_y_in, new_width_in, new_height_in, iroi_good.scale };
1040 dt_iop_roi_t oroi_full = oroi_good; // a good starting point for optimization
1041
1042 //_print_roi(&iroi_full, "tile iroi_full before optimization");
1043 //_print_roi(&oroi_full, "tile oroi_full before optimization");
1044
1045 /* try to find a matching oroi_full */
1046 if(!_fit_output_to_input_roi(self, piece, &iroi_full, &oroi_full, delta, 10))
1047 {
1048 dt_print(DT_DEBUG_DEV, "[default_process_tiling_roi] can not handle requested roi's. tiling for "
1049 "module '%s' not possible.\n",
1050 self->op);
1051 goto error;
1052 }
1053
1054 //_print_roi(&iroi_full, "tile iroi_full after optimization");
1055 //_print_roi(&oroi_full, "tile oroi_full after optimization");
1056
1057 /* make sure that oroi_full at least covers the range of oroi_good.
1058 this step is needed due to the possibility of rounding errors */
1059 oroi_full.x = _min(oroi_full.x, oroi_good.x);
1060 oroi_full.y = _min(oroi_full.y, oroi_good.y);
1061 oroi_full.width = _max(oroi_full.width, oroi_good.x + oroi_good.width - oroi_full.x);
1062 oroi_full.height = _max(oroi_full.height, oroi_good.y + oroi_good.height - oroi_full.y);
1063
1064 /* clamp oroi_full to not exceed roi_out */
1065 oroi_full.x = _max(oroi_full.x, roi_out->x);
1066 oroi_full.y = _max(oroi_full.y, roi_out->y);
1067 oroi_full.width = _min(oroi_full.width, roi_out->width + roi_out->x - oroi_full.x);
1068 oroi_full.height = _min(oroi_full.height, roi_out->height + roi_out->y - oroi_full.y);
1069
1070 /* calculate final iroi_full */
1071 self->modify_roi_in(self, piece, &oroi_full, &iroi_full);
1072
1073 /* clamp iroi_full to not exceed roi_in */
1074 iroi_full.x = _max(iroi_full.x, roi_in->x);
1075 iroi_full.y = _max(iroi_full.y, roi_in->y);
1076 iroi_full.width = _min(iroi_full.width, roi_in->width + roi_in->x - iroi_full.x);
1077 iroi_full.height = _min(iroi_full.height, roi_in->height + roi_in->y - iroi_full.y);
1078
1079
1080 //_print_roi(&iroi_full, "tile iroi_full final");
1081 //_print_roi(&oroi_full, "tile oroi_full final");
1082
1083 /* offsets of tile into ivoid and ovoid */
1084 const size_t ioffs = ((size_t)iroi_full.y - roi_in->y) * ipitch + ((size_t)iroi_full.x - roi_in->x) * in_bpp;
1085 size_t ooffs = ((size_t)oroi_good.y - roi_out->y) * opitch
1086 + ((size_t)oroi_good.x - roi_out->x) * out_bpp;
1087
1088 dt_print(DT_DEBUG_DEV, "[default_process_tiling_roi] tile (%zu, %zu) with %d x %d at origin [%d, %d]\n",
1089 tx, ty, iroi_full.width, iroi_full.height, iroi_full.x, iroi_full.y);
1090
1091
1092 /* prepare input tile buffer */
1093 input = dt_alloc_align(64, (size_t)iroi_full.width * iroi_full.height * in_bpp);
1094 if(input == NULL)
1095 {
1096 dt_print(DT_DEBUG_DEV, "[default_process_tiling_roi] could not alloc input buffer for module '%s'\n",
1097 self->op);
1098 goto error;
1099 }
1100 output = dt_alloc_align(64, (size_t)oroi_full.width * oroi_full.height * out_bpp);
1101 if(output == NULL)
1102 {
1103 dt_print(DT_DEBUG_DEV, "[default_process_tiling_roi] could not alloc output buffer for module '%s'\n",
1104 self->op);
1105 goto error;
1106 }
1107
1108 #ifdef _OPENMP
1109 #pragma omp parallel for default(none) \
1110 dt_omp_firstprivate(in_bpp, ipitch, ivoid) \
1111 dt_omp_sharedconst(ioffs) shared(input, iroi_full) \
1112 schedule(static)
1113 #endif
1114 for(size_t j = 0; j < iroi_full.height; j++)
1115 memcpy((char *)input + j * iroi_full.width * in_bpp, (char *)ivoid + ioffs + j * ipitch,
1116 (size_t)iroi_full.width * in_bpp);
1117
1118 /* take original processed_maximum as starting point */
1119 for(int k = 0; k < 4; k++) piece->pipe->dsc.processed_maximum[k] = processed_maximum_saved[k];
1120
1121 /* call process() of module */
1122 self->process(self, piece, input, output, &iroi_full, &oroi_full);
1123
1124 /* aggregate resulting processed_maximum */
1125 /* TODO: check if there really can be differences between tiles and take
1126 appropriate action (calculate minimum, maximum, average, ...?) */
1127 for(int k = 0; k < 4; k++)
1128 {
1129 if(tx + ty > 0 && fabs(processed_maximum_new[k] - piece->pipe->dsc.processed_maximum[k]) > 1.0e-6f)
1130 dt_print(
1131 DT_DEBUG_DEV,
1132 "[default_process_tiling_roi] processed_maximum[%d] differs between tiles in module '%s'\n", k,
1133 self->op);
1134 processed_maximum_new[k] = piece->pipe->dsc.processed_maximum[k];
1135 }
1136
1137 /* copy "good" part of tile to output buffer */
1138 const int origin_x = oroi_good.x - oroi_full.x;
1139 const int origin_y = oroi_good.y - oroi_full.y;
1140 #ifdef _OPENMP
1141 #pragma omp parallel for default(none) \
1142 dt_omp_firstprivate(opitch, origin_x, origin_y, out_bpp, ovoid) \
1143 shared(ooffs, output, oroi_good, oroi_full) \
1144 schedule(static)
1145 #endif
1146 for(size_t j = 0; j < oroi_good.height; j++)
1147 memcpy((char *)ovoid + ooffs + j * opitch,
1148 (char *)output + ((j + origin_y) * oroi_full.width + origin_x) * out_bpp,
1149 (size_t)oroi_good.width * out_bpp);
1150
1151 dt_free_align(input);
1152 dt_free_align(output);
1153 input = output = NULL;
1154 }
1155
1156 /* copy back final processed_maximum */
1157 for(int k = 0; k < 4; k++) piece->pipe->dsc.processed_maximum[k] = processed_maximum_new[k];
1158
1159 if(input != NULL) dt_free_align(input);
1160 if(output != NULL) dt_free_align(output);
1161 piece->pipe->tiling = 0;
1162 return;
1163
1164 error:
1165 dt_control_log(_("tiling failed for module '%s'. output might be garbled."), self->op);
1166 // fall through
1167
1168 fallback:
1169 if(input != NULL) dt_free_align(input);
1170 if(output != NULL) dt_free_align(output);
1171 piece->pipe->tiling = 0;
1172 dt_print(DT_DEBUG_DEV, "[default_process_tiling_roi] fall back to standard processing for module '%s'\n",
1173 self->op);
1174 self->process(self, piece, ivoid, ovoid, roi_in, roi_out);
1175 return;
1176 }
1177
1178
1179
1180 /* if a module does not implement process_tiling() by itself, this function is called instead.
1181 _default_process_tiling_ptp() is able to handle standard cases where pixels do not change their places.
1182 _default_process_tiling_roi() takes care of all other cases where image gets distorted and for module
1183 "clipping",
1184 "flip" as this may flip or mirror the image. */
default_process_tiling(struct dt_iop_module_t * self,struct dt_dev_pixelpipe_iop_t * piece,const void * const ivoid,void * const ovoid,const dt_iop_roi_t * const roi_in,const dt_iop_roi_t * const roi_out,const int in_bpp)1185 void default_process_tiling(struct dt_iop_module_t *self, struct dt_dev_pixelpipe_iop_t *piece,
1186 const void *const ivoid, void *const ovoid, const dt_iop_roi_t *const roi_in,
1187 const dt_iop_roi_t *const roi_out, const int in_bpp)
1188 {
1189 if(memcmp(roi_in, roi_out, sizeof(struct dt_iop_roi_t)) || (self->flags() & IOP_FLAGS_TILING_FULL_ROI))
1190 _default_process_tiling_roi(self, piece, ivoid, ovoid, roi_in, roi_out, in_bpp);
1191 else
1192 _default_process_tiling_ptp(self, piece, ivoid, ovoid, roi_in, roi_out, in_bpp);
1193 return;
1194 }
1195
1196
1197
1198 #ifdef HAVE_OPENCL
1199 /* simple tiling algorithm for roi_in == roi_out, i.e. for pixel to pixel modules/operations */
_default_process_tiling_cl_ptp(struct dt_iop_module_t * self,struct dt_dev_pixelpipe_iop_t * piece,const void * const ivoid,void * const ovoid,const dt_iop_roi_t * const roi_in,const dt_iop_roi_t * const roi_out,const int in_bpp)1200 static int _default_process_tiling_cl_ptp(struct dt_iop_module_t *self, struct dt_dev_pixelpipe_iop_t *piece,
1201 const void *const ivoid, void *const ovoid,
1202 const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out,
1203 const int in_bpp)
1204 {
1205 cl_int err = -999;
1206 cl_mem input = NULL;
1207 cl_mem output = NULL;
1208 cl_mem pinned_input = NULL;
1209 cl_mem pinned_output = NULL;
1210 void *input_buffer = NULL;
1211 void *output_buffer = NULL;
1212
1213 dt_iop_buffer_dsc_t dsc;
1214 self->output_format(self, piece->pipe, piece, &dsc);
1215 const int out_bpp = dt_iop_buffer_dsc_to_bpp(&dsc);
1216
1217 const int devid = piece->pipe->devid;
1218 const int ipitch = roi_in->width * in_bpp;
1219 const int opitch = roi_out->width * out_bpp;
1220 const int max_bpp = _max(in_bpp, out_bpp);
1221
1222 /* get tiling requirements of module */
1223 dt_develop_tiling_t tiling = { 0 };
1224 self->tiling_callback(self, piece, roi_in, roi_out, &tiling);
1225
1226 /* shall we use pinned memory transfers? */
1227 int use_pinned_memory = dt_conf_get_bool("opencl_use_pinned_memory");
1228 const int pinned_buffer_overhead = use_pinned_memory ? 2 : 0; // add two additional pinned memory buffers
1229 // which seemingly get allocated not only on
1230 // host but also on device (why???)
1231 const float pinned_buffer_slack
1232 = use_pinned_memory
1233 ? 0.85f
1234 : 1.0f; // avoid problems when pinned buffer size gets too close to max_mem_alloc size
1235
1236 /* shall we enforce tiling */
1237 const gboolean force_tile = (darktable.unmuted & DT_DEBUG_TILING);
1238 /* calculate optimal size of tiles */
1239 float headroom = dt_conf_get_float("opencl_memory_headroom") * 1024.0f * 1024.0f;
1240 headroom = fmin(fmax(headroom, 0.0f), (float)darktable.opencl->dev[devid].max_global_mem);
1241 float available = darktable.opencl->dev[devid].max_global_mem - headroom;
1242 if(force_tile) available = fmin(available, 500.0f * 1024.0f * 1024.0f);
1243 const float factor = fmax(tiling.factor_cl + pinned_buffer_overhead, 1.0f);
1244 const float singlebuffer = fmin(fmax((available - tiling.overhead) / factor, 0.0f),
1245 pinned_buffer_slack * darktable.opencl->dev[devid].max_mem_alloc);
1246 const float maxbuf = fmax(tiling.maxbuf_cl, 1.0f);
1247 int width = _min(roi_in->width, darktable.opencl->dev[devid].max_image_width);
1248 int height = _min(roi_in->height, darktable.opencl->dev[devid].max_image_height);
1249
1250 /* shrink tile size in case it would exceed singlebuffer size */
1251 if((float)width * height * max_bpp * maxbuf > singlebuffer)
1252 {
1253 const float scale = singlebuffer / ((float)width * height * max_bpp * maxbuf);
1254
1255 if(width < height && scale >= 0.333f)
1256 {
1257 height = floorf(height * scale);
1258 }
1259 else if(height <= width && scale >= 0.333f)
1260 {
1261 width = floorf(width * scale);
1262 }
1263 else
1264 {
1265 width = floorf(width * sqrtf(scale));
1266 height = floorf(height * sqrtf(scale));
1267 }
1268 }
1269
1270 /* make sure we have a reasonably effective tile dimension. if not try square tiles */
1271 if(3 * tiling.overlap > width || 3 * tiling.overlap > height)
1272 {
1273 width = height = floorf(sqrtf((float)width * height));
1274 }
1275
1276
1277 /* Alignment rules: we need to make sure that alignment requirements of module are fulfilled.
1278 Modules will report alignment requirements via xalign and yalign within tiling_callback().
1279 Typical use case is demosaic where Bayer pattern requires alignment to a multiple of 2 in x and y
1280 direction. Additional alignment requirements are set via definition of CL_ALIGNMENT.
1281 We guarantee alignment by selecting image width/height and overlap accordingly. For a tile width/height
1282 that is identical to image width/height no special alignment is done. */
1283
1284 /* for simplicity reasons we use only one alignment that fits to x and y requirements at the same time */
1285 const unsigned int xyalign = _lcm(tiling.xalign, tiling.yalign);
1286
1287 /* determining alignment requirement for tile width/height.
1288 in case of tile width also align according to definition of CL_ALIGNMENT */
1289 const unsigned int walign = _lcm(xyalign, CL_ALIGNMENT);
1290 const unsigned int halign = xyalign;
1291
1292 assert(xyalign != 0 && walign != 0 && halign != 0);
1293
1294 /* properly align tile width and height by making them smaller if needed */
1295 if(width < roi_in->width) width = (width / walign) * walign;
1296 if(height < roi_in->height) height = (height / halign) * halign;
1297
1298 /* also make sure that overlap follows alignment rules by making it wider when needed */
1299 const int overlap = tiling.overlap % xyalign != 0 ? (tiling.overlap / xyalign + 1) * xyalign
1300 : tiling.overlap;
1301
1302
1303 /* calculate effective tile size */
1304 const int tile_wd = width - 2 * overlap > 0 ? width - 2 * overlap : 1;
1305 const int tile_ht = height - 2 * overlap > 0 ? height - 2 * overlap : 1;
1306
1307
1308 /* calculate number of tiles */
1309 const int tiles_x = width < roi_in->width ? ceilf(roi_in->width / (float)tile_wd) : 1;
1310 const int tiles_y = height < roi_in->height ? ceilf(roi_in->height / (float)tile_ht) : 1;
1311
1312 /* sanity check: don't run wild on too many tiles */
1313 if(tiles_x * tiles_y > dt_conf_get_int("maximum_number_tiles"))
1314 {
1315 dt_print(DT_DEBUG_OPENCL,
1316 "[default_process_tiling_cl_ptp] aborted tiling for module '%s'. too many tiles: %d x %d\n",
1317 self->op, tiles_x, tiles_y);
1318 return FALSE;
1319 }
1320
1321
1322 dt_print(DT_DEBUG_OPENCL,
1323 "[default_process_tiling_cl_ptp] use tiling on module '%s' for image with full size %d x %d\n",
1324 self->op, roi_in->width, roi_in->height);
1325 dt_print(DT_DEBUG_OPENCL,
1326 "[default_process_tiling_cl_ptp] (%d x %d) tiles with max dimensions %d x %d and overlap %d\n",
1327 tiles_x, tiles_y, width, height, overlap);
1328
1329 /* store processed_maximum to be re-used and aggregated */
1330 dt_aligned_pixel_t processed_maximum_saved;
1331 dt_aligned_pixel_t processed_maximum_new = { 1.0f };
1332 for_four_channels(k) processed_maximum_saved[k] = piece->pipe->dsc.processed_maximum[k];
1333
1334 /* reserve pinned input and output memory for host<->device data transfer */
1335 if(use_pinned_memory)
1336 {
1337 pinned_input = dt_opencl_alloc_device_buffer_with_flags(devid, (size_t)width * height * in_bpp,
1338 CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR);
1339 if(pinned_input == NULL)
1340 {
1341 dt_print(DT_DEBUG_OPENCL,
1342 "[default_process_tiling_cl_ptp] could not alloc pinned input buffer for module '%s'\n",
1343 self->op);
1344 use_pinned_memory = 0;
1345 }
1346 }
1347
1348 if(use_pinned_memory)
1349 {
1350
1351 input_buffer = dt_opencl_map_buffer(devid, pinned_input, CL_TRUE, CL_MAP_WRITE, 0,
1352 (size_t)width * height * in_bpp);
1353 if(input_buffer == NULL)
1354 {
1355 dt_print(DT_DEBUG_OPENCL, "[default_process_tiling_cl_ptp] could not map pinned input buffer to host "
1356 "memory for module '%s'\n",
1357 self->op);
1358 use_pinned_memory = 0;
1359 }
1360 }
1361
1362 if(use_pinned_memory)
1363 {
1364
1365 pinned_output = dt_opencl_alloc_device_buffer_with_flags(devid, (size_t)width * height * out_bpp,
1366 CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR);
1367 if(pinned_output == NULL)
1368 {
1369 dt_print(DT_DEBUG_OPENCL,
1370 "[default_process_tiling_cl_ptp] could not alloc pinned output buffer for module '%s'\n",
1371 self->op);
1372 use_pinned_memory = 0;
1373 }
1374 }
1375
1376 if(use_pinned_memory)
1377 {
1378
1379 output_buffer = dt_opencl_map_buffer(devid, pinned_output, CL_TRUE, CL_MAP_READ, 0,
1380 (size_t)width * height * out_bpp);
1381 if(output_buffer == NULL)
1382 {
1383 dt_print(DT_DEBUG_OPENCL, "[default_process_tiling_cl_ptp] could not map pinned output buffer to host "
1384 "memory for module '%s'\n",
1385 self->op);
1386 use_pinned_memory = 0;
1387 }
1388 }
1389
1390 /* iterate over tiles */
1391 for(size_t tx = 0; tx < tiles_x; tx++)
1392 for(size_t ty = 0; ty < tiles_y; ty++)
1393 {
1394 piece->pipe->tiling = 1;
1395
1396 const size_t wd = tx * tile_wd + width > roi_in->width ? roi_in->width - tx * tile_wd : width;
1397 const size_t ht = ty * tile_ht + height > roi_in->height ? roi_in->height - ty * tile_ht : height;
1398
1399 /* no need to process (end)tiles that are smaller than the total overlap area */
1400 if((wd <= 2 * overlap && tx > 0) || (ht <= 2 * overlap && ty > 0)) continue;
1401
1402 /* origin and region of effective part of tile, which we want to store later */
1403 size_t origin[] = { 0, 0, 0 };
1404 size_t region[] = { wd, ht, 1 };
1405
1406 /* roi_in and roi_out for process_cl on subbuffer */
1407 dt_iop_roi_t iroi = { roi_in->x + tx * tile_wd, roi_in->y + ty * tile_ht, wd, ht, roi_in->scale };
1408 dt_iop_roi_t oroi = { roi_out->x + tx * tile_wd, roi_out->y + ty * tile_ht, wd, ht, roi_out->scale };
1409
1410
1411 /* offsets of tile into ivoid and ovoid */
1412 const size_t ioffs = (ty * tile_ht) * ipitch + (tx * tile_wd) * in_bpp;
1413 size_t ooffs = (ty * tile_ht) * opitch + (tx * tile_wd) * out_bpp;
1414
1415
1416 dt_print(DT_DEBUG_OPENCL,
1417 "[default_process_tiling_cl_ptp] tile (%zu, %zu) with %zu x %zu at origin [%zu, %zu]\n", tx, ty, wd,
1418 ht, tx * tile_wd, ty * tile_ht);
1419
1420 /* get input and output buffers */
1421 input = dt_opencl_alloc_device(devid, wd, ht, in_bpp);
1422 if(input == NULL) goto error;
1423 output = dt_opencl_alloc_device(devid, wd, ht, out_bpp);
1424 if(output == NULL) goto error;
1425
1426 if(use_pinned_memory)
1427 {
1428 /* prepare pinned input tile buffer: copy part of input image */
1429 #ifdef _OPENMP
1430 #pragma omp parallel for default(none) \
1431 dt_omp_firstprivate(in_bpp, ipitch, ivoid) \
1432 dt_omp_sharedconst(ioffs, wd, ht) shared(input_buffer, width) \
1433 schedule(static)
1434 #endif
1435 for(size_t j = 0; j < ht; j++)
1436 memcpy((char *)input_buffer + j * wd * in_bpp, (char *)ivoid + ioffs + j * ipitch,
1437 (size_t)wd * in_bpp);
1438
1439 /* blocking memory transfer: pinned host input buffer -> opencl/device tile */
1440 err = dt_opencl_write_host_to_device_raw(devid, (char *)input_buffer, input, origin, region,
1441 wd * in_bpp, CL_TRUE);
1442 if(err != CL_SUCCESS) goto error;
1443 }
1444 else
1445 {
1446 /* blocking direct memory transfer: host input image -> opencl/device tile */
1447 err = dt_opencl_write_host_to_device_raw(devid, (char *)ivoid + ioffs, input, origin, region, ipitch,
1448 CL_TRUE);
1449 if(err != CL_SUCCESS) goto error;
1450 }
1451
1452 /* take original processed_maximum as starting point */
1453 for(int k = 0; k < 4; k++) piece->pipe->dsc.processed_maximum[k] = processed_maximum_saved[k];
1454
1455 /* call process_cl of module */
1456 if(!self->process_cl(self, piece, input, output, &iroi, &oroi)) goto error;
1457
1458 /* aggregate resulting processed_maximum */
1459 /* TODO: check if there really can be differences between tiles and take
1460 appropriate action (calculate minimum, maximum, average, ...?) */
1461 for(int k = 0; k < 4; k++)
1462 {
1463 if(tx + ty > 0 && fabs(processed_maximum_new[k] - piece->pipe->dsc.processed_maximum[k]) > 1.0e-6f)
1464 dt_print(
1465 DT_DEBUG_OPENCL,
1466 "[default_process_tiling_cl_ptp] processed_maximum[%d] differs between tiles in module '%s'\n",
1467 k, self->op);
1468 processed_maximum_new[k] = piece->pipe->dsc.processed_maximum[k];
1469 }
1470
1471 if(use_pinned_memory)
1472 {
1473 /* blocking memory transfer: complete opencl/device tile -> pinned host output buffer */
1474 err = dt_opencl_read_host_from_device_raw(devid, (char *)output_buffer, output, origin, region,
1475 wd * out_bpp, CL_TRUE);
1476 if(err != CL_SUCCESS) goto error;
1477 }
1478
1479 /* correct origin and region of tile for overlap.
1480 makes sure that we only copy back the "good" part. */
1481 if(tx > 0)
1482 {
1483 origin[0] += overlap;
1484 region[0] -= overlap;
1485 ooffs += (size_t)overlap * out_bpp;
1486 }
1487 if(ty > 0)
1488 {
1489 origin[1] += overlap;
1490 region[1] -= overlap;
1491 ooffs += (size_t)overlap * opitch;
1492 }
1493
1494 if(use_pinned_memory)
1495 {
1496 /* copy "good" part of tile from pinned output buffer to output image */
1497 #if 0 // def _OPENMP
1498 #pragma omp parallel for default(none) shared(ovoid, ooffs, output_buffer, width, origin, region, \
1499 wd) schedule(static)
1500 #endif
1501 for(size_t j = 0; j < region[1]; j++)
1502 memcpy((char *)ovoid + ooffs + j * opitch,
1503 (char *)output_buffer + ((j + origin[1]) * wd + origin[0]) * out_bpp,
1504 (size_t)region[0] * out_bpp);
1505 }
1506 else
1507 {
1508 /* blocking direct memory transfer: good part of opencl/device tile -> host output image */
1509 err = dt_opencl_read_host_from_device_raw(devid, (char *)ovoid + ooffs, output, origin, region,
1510 opitch, CL_TRUE);
1511 if(err != CL_SUCCESS) goto error;
1512 }
1513
1514 /* release input and output buffers */
1515 dt_opencl_release_mem_object(input);
1516 input = NULL;
1517 dt_opencl_release_mem_object(output);
1518 output = NULL;
1519
1520 /* block until opencl queue has finished to free all used event handlers */
1521 if(!darktable.opencl->async_pixelpipe || piece->pipe->type == DT_DEV_PIXELPIPE_EXPORT)
1522 dt_opencl_finish(devid);
1523 }
1524
1525 /* copy back final processed_maximum */
1526 for(int k = 0; k < 4; k++) piece->pipe->dsc.processed_maximum[k] = processed_maximum_new[k];
1527
1528 if(input_buffer != NULL) dt_opencl_unmap_mem_object(devid, pinned_input, input_buffer);
1529 dt_opencl_release_mem_object(pinned_input);
1530 if(output_buffer != NULL) dt_opencl_unmap_mem_object(devid, pinned_output, output_buffer);
1531 dt_opencl_release_mem_object(pinned_output);
1532 dt_opencl_release_mem_object(input);
1533 dt_opencl_release_mem_object(output);
1534 piece->pipe->tiling = 0;
1535 return TRUE;
1536
1537 error:
1538 /* copy back stored processed_maximum */
1539 for(int k = 0; k < 4; k++) piece->pipe->dsc.processed_maximum[k] = processed_maximum_saved[k];
1540 if(input_buffer != NULL) dt_opencl_unmap_mem_object(devid, pinned_input, input_buffer);
1541 dt_opencl_release_mem_object(pinned_input);
1542 if(output_buffer != NULL) dt_opencl_unmap_mem_object(devid, pinned_output, output_buffer);
1543 dt_opencl_release_mem_object(pinned_output);
1544 dt_opencl_release_mem_object(input);
1545 dt_opencl_release_mem_object(output);
1546 piece->pipe->tiling = 0;
1547 dt_print(
1548 DT_DEBUG_OPENCL,
1549 "[default_process_tiling_opencl_ptp] couldn't run process_cl() for module '%s' in tiling mode: %d\n",
1550 self->op, err);
1551 return FALSE;
1552 }
1553
1554
1555 /* more elaborate tiling algorithm for roi_in != roi_out: slower than the ptp variant,
1556 more tiles and larger overlap */
_default_process_tiling_cl_roi(struct dt_iop_module_t * self,struct dt_dev_pixelpipe_iop_t * piece,const void * const ivoid,void * const ovoid,const dt_iop_roi_t * const roi_in,const dt_iop_roi_t * const roi_out,const int in_bpp)1557 static int _default_process_tiling_cl_roi(struct dt_iop_module_t *self, struct dt_dev_pixelpipe_iop_t *piece,
1558 const void *const ivoid, void *const ovoid,
1559 const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out,
1560 const int in_bpp)
1561 {
1562 cl_int err = -999;
1563 cl_mem input = NULL;
1564 cl_mem output = NULL;
1565 cl_mem pinned_input = NULL;
1566 cl_mem pinned_output = NULL;
1567 void *input_buffer = NULL;
1568 void *output_buffer = NULL;
1569
1570
1571 //_print_roi(roi_in, "module roi_in");
1572 //_print_roi(roi_out, "module roi_out");
1573
1574 dt_iop_buffer_dsc_t dsc;
1575 self->output_format(self, piece->pipe, piece, &dsc);
1576 const int out_bpp = dt_iop_buffer_dsc_to_bpp(&dsc);
1577
1578 const int devid = piece->pipe->devid;
1579 const int ipitch = roi_in->width * in_bpp;
1580 const int opitch = roi_out->width * out_bpp;
1581 const int max_bpp = _max(in_bpp, out_bpp);
1582
1583 const float fullscale = fmax(roi_in->scale / roi_out->scale, sqrtf(((float)roi_in->width * roi_in->height)
1584 / ((float)roi_out->width * roi_out->height)));
1585
1586 /* inaccuracy for roi_in elements in roi_out -> roi_in calculations */
1587 const int delta = ceilf(fullscale);
1588
1589 /* estimate for additional (space) requirement in buffer dimensions due to inaccuracies */
1590 const int inacc = RESERVE * delta;
1591
1592 /* get tiling requirements of module */
1593 dt_develop_tiling_t tiling = { 0 };
1594 self->tiling_callback(self, piece, roi_in, roi_out, &tiling);
1595
1596 /* shall we use pinned memory transfers? */
1597 int use_pinned_memory = dt_conf_get_bool("opencl_use_pinned_memory");
1598 const int pinned_buffer_overhead = use_pinned_memory ? 2 : 0; // add two additional pinned memory buffers
1599 // which seemingly get allocated not only on
1600 // host but also on device (why???)
1601 const float pinned_buffer_slack
1602 = use_pinned_memory
1603 ? 0.85f
1604 : 1.0f; // avoid problems when pinned buffer size gets too close to max_mem_alloc size
1605
1606 /* shall we enforce tiling */
1607 const gboolean force_tile = (darktable.unmuted & DT_DEBUG_TILING);
1608 /* calculate optimal size of tiles */
1609 float headroom = dt_conf_get_float("opencl_memory_headroom") * 1024.0f * 1024.0f;
1610 headroom = fmin(fmax(headroom, 0.0f), (float)darktable.opencl->dev[devid].max_global_mem);
1611 float available = darktable.opencl->dev[devid].max_global_mem - headroom;
1612 if(force_tile) available = fmin(available, 500.0f * 1024.0f * 1024.0f);
1613 const float factor = fmax(tiling.factor_cl + pinned_buffer_overhead, 1.0f);
1614 const float singlebuffer = fmin(fmax((available - tiling.overhead) / factor, 0.0f),
1615 pinned_buffer_slack * darktable.opencl->dev[devid].max_mem_alloc);
1616 const float maxbuf = fmax(tiling.maxbuf_cl, 1.0f);
1617
1618 int width = _min(_max(roi_in->width, roi_out->width), darktable.opencl->dev[devid].max_image_width);
1619 int height = _min(_max(roi_in->height, roi_out->height), darktable.opencl->dev[devid].max_image_height);
1620
1621 /* shrink tile size in case it would exceed singlebuffer size */
1622 if((float)width * height * max_bpp * maxbuf > singlebuffer)
1623 {
1624 const float scale = singlebuffer / ((float)width * height * max_bpp * maxbuf);
1625
1626 if(width < height && scale >= 0.333f)
1627 {
1628 height = floorf(height * scale);
1629 }
1630 else if(height <= width && scale >= 0.333f)
1631 {
1632 width = floorf(width * scale);
1633 }
1634 else
1635 {
1636 width = floorf(width * sqrtf(scale));
1637 height = floorf(height * sqrtf(scale));
1638 }
1639 }
1640
1641 /* make sure we have a reasonably effective tile dimension. if not try square tiles */
1642 if(3 * tiling.overlap > width || 3 * tiling.overlap > height)
1643 {
1644 width = height = floorf(sqrtf((float)width * height));
1645 }
1646
1647
1648 /* Alignment rules: we need to make sure that alignment requirements of module are fulfilled.
1649 Modules will report alignment requirements via xalign and yalign within tiling_callback().
1650 Typical use case is demosaic where Bayer pattern requires alignment to a multiple of 2 in x and y
1651 direction. Additional alignment requirements are set via definition of CL_ALIGNMENT. */
1652
1653 /* for simplicity reasons we use only one alignment that fits to x and y requirements at the same time */
1654 unsigned int xyalign = _lcm(tiling.xalign, tiling.yalign);
1655 xyalign = _lcm(xyalign, CL_ALIGNMENT);
1656
1657 assert(xyalign != 0);
1658
1659 /* make sure that overlap follows alignment rules by making it wider when needed.
1660 overlap_in needs to be aligned, overlap_out is only here to calculate output buffer size */
1661 const int overlap_in = _align_up(tiling.overlap, xyalign);
1662 const int overlap_out = ceilf((float)overlap_in / fullscale);
1663
1664 int tiles_x = 1, tiles_y = 1;
1665
1666 /* calculate number of tiles taking the larger buffer (input or output) as a guiding one.
1667 normally it is roi_in > roi_out; but let's be prepared */
1668 if(roi_in->width > roi_out->width)
1669 tiles_x = width < roi_in->width
1670 ? ceilf((float)roi_in->width / (float)_max(width - 2 * overlap_in - inacc, 1))
1671 : 1;
1672 else
1673 tiles_x = width < roi_out->width ? ceilf((float)roi_out->width / (float)_max(width - 2 * overlap_out, 1))
1674 : 1;
1675
1676 if(roi_in->height > roi_out->height)
1677 tiles_y = height < roi_in->height
1678 ? ceilf((float)roi_in->height / (float)_max(height - 2 * overlap_in - inacc, 1))
1679 : 1;
1680 else
1681 tiles_y = height < roi_out->height
1682 ? ceilf((float)roi_out->height / (float)_max(height - 2 * overlap_out, 1))
1683 : 1;
1684
1685 /* sanity check: don't run wild on too many tiles */
1686 if(tiles_x * tiles_y > dt_conf_get_int("maximum_number_tiles"))
1687 {
1688 dt_print(DT_DEBUG_OPENCL,
1689 "[default_process_tiling_cl_roi] aborted tiling for module '%s'. too many tiles: %d x %d\n",
1690 self->op, tiles_x, tiles_y);
1691 return FALSE;
1692 }
1693
1694 /* calculate tile width and height excl. overlap (i.e. the good part) for output.
1695 important for all following processing steps. */
1696 const int tile_wd = _align_up(
1697 roi_out->width % tiles_x == 0 ? roi_out->width / tiles_x : roi_out->width / tiles_x + 1, xyalign);
1698 const int tile_ht = _align_up(
1699 roi_out->height % tiles_y == 0 ? roi_out->height / tiles_y : roi_out->height / tiles_y + 1, xyalign);
1700
1701
1702 dt_print(
1703 DT_DEBUG_OPENCL,
1704 "[default_process_tiling_cl_roi] use tiling on module '%s' for image with full input size %d x %d\n",
1705 self->op, roi_in->width, roi_in->height);
1706 dt_print(DT_DEBUG_OPENCL,
1707 "[default_process_tiling_cl_roi] (%d x %d) tiles with max input dimensions %d x %d\n", tiles_x,
1708 tiles_y, width, height);
1709
1710
1711 /* store processed_maximum to be re-used and aggregated */
1712 dt_aligned_pixel_t processed_maximum_saved;
1713 dt_aligned_pixel_t processed_maximum_new = { 1.0f };
1714 for_four_channels(k) processed_maximum_saved[k] = piece->pipe->dsc.processed_maximum[k];
1715
1716 /* reserve pinned input and output memory for host<->device data transfer */
1717 if(use_pinned_memory)
1718 {
1719 pinned_input = dt_opencl_alloc_device_buffer_with_flags(devid, (size_t)width * height * in_bpp,
1720 CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR);
1721 if(pinned_input == NULL)
1722 {
1723 dt_print(DT_DEBUG_OPENCL,
1724 "[default_process_tiling_cl_roi] could not alloc pinned input buffer for module '%s'\n",
1725 self->op);
1726 use_pinned_memory = 0;
1727 }
1728 }
1729
1730 if(use_pinned_memory)
1731 {
1732
1733 input_buffer = dt_opencl_map_buffer(devid, pinned_input, CL_TRUE, CL_MAP_WRITE, 0,
1734 (size_t)width * height * in_bpp);
1735 if(input_buffer == NULL)
1736 {
1737 dt_print(DT_DEBUG_OPENCL, "[default_process_tiling_cl_roi] could not map pinned input buffer to host "
1738 "memory for module '%s'\n",
1739 self->op);
1740 use_pinned_memory = 0;
1741 }
1742 }
1743
1744 if(use_pinned_memory)
1745 {
1746
1747 pinned_output = dt_opencl_alloc_device_buffer_with_flags(devid, (size_t)width * height * out_bpp,
1748 CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR);
1749 if(pinned_output == NULL)
1750 {
1751 dt_print(DT_DEBUG_OPENCL,
1752 "[default_process_tiling_cl_roi] could not alloc pinned output buffer for module '%s'\n",
1753 self->op);
1754 use_pinned_memory = 0;
1755 }
1756 }
1757
1758 if(use_pinned_memory)
1759 {
1760
1761 output_buffer = dt_opencl_map_buffer(devid, pinned_output, CL_TRUE, CL_MAP_READ, 0,
1762 (size_t)width * height * out_bpp);
1763 if(output_buffer == NULL)
1764 {
1765 dt_print(DT_DEBUG_OPENCL, "[default_process_tiling_cl_roi] could not map pinned output buffer to host "
1766 "memory for module '%s'\n",
1767 self->op);
1768 use_pinned_memory = 0;
1769 }
1770 }
1771
1772
1773 /* iterate over tiles */
1774 for(size_t tx = 0; tx < tiles_x; tx++)
1775 for(size_t ty = 0; ty < tiles_y; ty++)
1776 {
1777 piece->pipe->tiling = 1;
1778
1779 /* the output dimensions of the good part of this specific tile */
1780 const size_t wd = (tx + 1) * tile_wd > roi_out->width ? (size_t)roi_out->width - tx * tile_wd : tile_wd;
1781 const size_t ht = (ty + 1) * tile_ht > roi_out->height ? (size_t)roi_out->height - ty * tile_ht : tile_ht;
1782
1783 /* roi_in and roi_out of good part: oroi_good easy to calculate based on number and dimension of tile.
1784 iroi_good is calculated by modify_roi_in() of respective module */
1785 dt_iop_roi_t iroi_good = { roi_in->x + tx * tile_wd, roi_in->y + ty * tile_ht, wd, ht, roi_in->scale };
1786 dt_iop_roi_t oroi_good
1787 = { roi_out->x + tx * tile_wd, roi_out->y + ty * tile_ht, wd, ht, roi_out->scale };
1788
1789 self->modify_roi_in(self, piece, &oroi_good, &iroi_good);
1790
1791 /* clamp iroi_good to not exceed roi_in */
1792 iroi_good.x = _max(iroi_good.x, roi_in->x);
1793 iroi_good.y = _max(iroi_good.y, roi_in->y);
1794 iroi_good.width = _min(iroi_good.width, roi_in->width + roi_in->x - iroi_good.x);
1795 iroi_good.height = _min(iroi_good.height, roi_in->height + roi_in->y - iroi_good.y);
1796
1797 //_print_roi(&iroi_good, "tile iroi_good");
1798 //_print_roi(&oroi_good, "tile oroi_good");
1799
1800 /* now we need to calculate full region of this tile: increase input roi to take care of overlap
1801 requirements
1802 and alignment and add additional delta to correct for possible rounding errors in modify_roi_in()
1803 -> generates first estimate of iroi_full */
1804 const int x_in = iroi_good.x;
1805 const int y_in = iroi_good.y;
1806 const int width_in = iroi_good.width;
1807 const int height_in = iroi_good.height;
1808 const int new_x_in = _max(_align_down(x_in - overlap_in - delta, xyalign), roi_in->x);
1809 const int new_y_in = _max(_align_down(y_in - overlap_in - delta, xyalign), roi_in->y);
1810 const int new_width_in = _min(_align_up(width_in + overlap_in + delta + (x_in - new_x_in), xyalign),
1811 roi_in->width + roi_in->x - new_x_in);
1812 const int new_height_in = _min(_align_up(height_in + overlap_in + delta + (y_in - new_y_in), xyalign),
1813 roi_in->height + roi_in->y - new_y_in);
1814
1815 /* iroi_full based on calculated numbers and dimensions. oroi_full just set as a starting point for the
1816 * following iterative search */
1817 dt_iop_roi_t iroi_full = { new_x_in, new_y_in, new_width_in, new_height_in, iroi_good.scale };
1818 dt_iop_roi_t oroi_full = oroi_good; // a good starting point for optimization
1819
1820 //_print_roi(&iroi_full, "tile iroi_full before optimization");
1821 //_print_roi(&oroi_full, "tile oroi_full before optimization");
1822
1823 /* try to find a matching oroi_full */
1824 if(!_fit_output_to_input_roi(self, piece, &iroi_full, &oroi_full, delta, 10))
1825 {
1826 dt_print(DT_DEBUG_OPENCL, "[default_process_tiling_cl_roi] can not handle requested roi's. tiling "
1827 "for module '%s' not possible.\n",
1828 self->op);
1829 goto error;
1830 }
1831
1832
1833 /* make sure that oroi_full at least covers the range of oroi_good.
1834 this step is needed due to the possibility of rounding errors */
1835 oroi_full.x = _min(oroi_full.x, oroi_good.x);
1836 oroi_full.y = _min(oroi_full.y, oroi_good.y);
1837 oroi_full.width = _max(oroi_full.width, oroi_good.x + oroi_good.width - oroi_full.x);
1838 oroi_full.height = _max(oroi_full.height, oroi_good.y + oroi_good.height - oroi_full.y);
1839
1840 /* clamp oroi_full to not exceed roi_out */
1841 oroi_full.x = _max(oroi_full.x, roi_out->x);
1842 oroi_full.y = _max(oroi_full.y, roi_out->y);
1843 oroi_full.width = _min(oroi_full.width, roi_out->width + roi_out->x - oroi_full.x);
1844 oroi_full.height = _min(oroi_full.height, roi_out->height + roi_out->y - oroi_full.y);
1845
1846
1847 /* calculate final iroi_full */
1848 self->modify_roi_in(self, piece, &oroi_full, &iroi_full);
1849
1850 /* clamp iroi_full to not exceed roi_in */
1851 iroi_full.x = _max(iroi_full.x, roi_in->x);
1852 iroi_full.y = _max(iroi_full.y, roi_in->y);
1853 iroi_full.width = _min(iroi_full.width, roi_in->width + roi_in->x - iroi_full.x);
1854 iroi_full.height = _min(iroi_full.height, roi_in->height + roi_in->y - iroi_full.y);
1855
1856 //_print_roi(&iroi_full, "tile iroi_full");
1857 //_print_roi(&oroi_full, "tile oroi_full");
1858
1859 /* offsets of tile into ivoid and ovoid */
1860 const size_t ioffs = ((size_t)iroi_full.y - roi_in->y) * ipitch + ((size_t)iroi_full.x - roi_in->x) * in_bpp;
1861 const size_t ooffs = ((size_t)oroi_good.y - roi_out->y) * opitch
1862 + ((size_t)oroi_good.x - roi_out->x) * out_bpp;
1863
1864 dt_print(DT_DEBUG_OPENCL,
1865 "[default_process_tiling_cl_roi] tile (%zu, %zu) with %d x %d at origin [%d, %d]\n", tx, ty,
1866 iroi_full.width, iroi_full.height, iroi_full.x, iroi_full.y);
1867
1868 /* origin and region of full input tile */
1869 size_t iorigin[] = { 0, 0, 0 };
1870 size_t iregion[] = { iroi_full.width, iroi_full.height, 1 };
1871
1872 /* origin and region of full output tile */
1873 size_t oforigin[] = { 0, 0, 0 };
1874 size_t ofregion[] = { oroi_full.width, oroi_full.height, 1 };
1875
1876 /* origin and region of good part of output tile */
1877 size_t oorigin[] = { oroi_good.x - oroi_full.x, oroi_good.y - oroi_full.y, 0 };
1878 size_t oregion[] = { oroi_good.width, oroi_good.height, 1 };
1879
1880 /* get opencl input and output buffers */
1881 input = dt_opencl_alloc_device(devid, iroi_full.width, iroi_full.height, in_bpp);
1882 if(input == NULL) goto error;
1883
1884 output = dt_opencl_alloc_device(devid, oroi_full.width, oroi_full.height, out_bpp);
1885 if(output == NULL) goto error;
1886
1887 if(use_pinned_memory)
1888 {
1889 /* prepare pinned input tile buffer: copy part of input image */
1890 #ifdef _OPENMP
1891 #pragma omp parallel for default(none) \
1892 dt_omp_firstprivate(in_bpp, ipitch, ivoid) \
1893 dt_omp_sharedconst(ioffs) shared(input_buffer, width, iroi_full) schedule(static)
1894 #endif
1895 for(size_t j = 0; j < iroi_full.height; j++)
1896 memcpy((char *)input_buffer + j * iroi_full.width * in_bpp, (char *)ivoid + ioffs + j * ipitch,
1897 (size_t)iroi_full.width * in_bpp);
1898
1899 /* blocking memory transfer: pinned host input buffer -> opencl/device tile */
1900 err = dt_opencl_write_host_to_device_raw(devid, (char *)input_buffer, input, iorigin, iregion,
1901 (size_t)iroi_full.width * in_bpp, CL_TRUE);
1902 if(err != CL_SUCCESS) goto error;
1903 }
1904 else
1905 {
1906 /* blocking direct memory transfer: host input image -> opencl/device tile */
1907 err = dt_opencl_write_host_to_device_raw(devid, (char *)ivoid + ioffs, input, iorigin, iregion,
1908 ipitch, CL_TRUE);
1909 if(err != CL_SUCCESS) goto error;
1910 }
1911
1912 /* take original processed_maximum as starting point */
1913 for(int k = 0; k < 4; k++) piece->pipe->dsc.processed_maximum[k] = processed_maximum_saved[k];
1914
1915 /* call process_cl of module */
1916 if(!self->process_cl(self, piece, input, output, &iroi_full, &oroi_full)) goto error;
1917
1918 /* aggregate resulting processed_maximum */
1919 /* TODO: check if there really can be differences between tiles and take
1920 appropriate action (calculate minimum, maximum, average, ...?) */
1921 for(int k = 0; k < 4; k++)
1922 {
1923 if(tx + ty > 0 && fabs(processed_maximum_new[k] - piece->pipe->dsc.processed_maximum[k]) > 1.0e-6f)
1924 dt_print(
1925 DT_DEBUG_OPENCL,
1926 "[default_process_tiling_cl_roi] processed_maximum[%d] differs between tiles in module '%s'\n",
1927 k, self->op);
1928 processed_maximum_new[k] = piece->pipe->dsc.processed_maximum[k];
1929 }
1930
1931 if(use_pinned_memory)
1932 {
1933 /* blocking memory transfer: complete opencl/device tile -> pinned host output buffer */
1934 err = dt_opencl_read_host_from_device_raw(devid, (char *)output_buffer, output, oforigin, ofregion,
1935 (size_t)oroi_full.width * out_bpp, CL_TRUE);
1936 if(err != CL_SUCCESS) goto error;
1937
1938 /* copy "good" part of tile from pinned output buffer to output image */
1939 #ifdef _OPENMP
1940 #pragma omp parallel for default(none) \
1941 dt_omp_firstprivate(ipitch, opitch, ovoid, out_bpp) \
1942 dt_omp_sharedconst(ooffs) shared(output_buffer, oroi_full, oorigin, oregion) \
1943 schedule(static)
1944 #endif
1945 for(size_t j = 0; j < oregion[1]; j++)
1946 memcpy((char *)ovoid + ooffs + j * opitch,
1947 (char *)output_buffer + ((j + oorigin[1]) * oroi_full.width + oorigin[0]) * out_bpp,
1948 (size_t)oregion[0] * out_bpp);
1949 }
1950 else
1951 {
1952 /* blocking direct memory transfer: good part of opencl/device tile -> host output image */
1953 err = dt_opencl_read_host_from_device_raw(devid, (char *)ovoid + ooffs, output, oorigin, oregion,
1954 opitch, CL_TRUE);
1955 if(err != CL_SUCCESS) goto error;
1956 }
1957
1958 /* release input and output buffers */
1959 dt_opencl_release_mem_object(input);
1960 input = NULL;
1961 dt_opencl_release_mem_object(output);
1962 output = NULL;
1963
1964 /* block until opencl queue has finished to free all used event handlers */
1965 if(!darktable.opencl->async_pixelpipe || piece->pipe->type == DT_DEV_PIXELPIPE_EXPORT)
1966 dt_opencl_finish(devid);
1967 }
1968
1969 /* copy back final processed_maximum */
1970 for(int k = 0; k < 4; k++) piece->pipe->dsc.processed_maximum[k] = processed_maximum_new[k];
1971 if(input_buffer != NULL) dt_opencl_unmap_mem_object(devid, pinned_input, input_buffer);
1972 dt_opencl_release_mem_object(pinned_input);
1973 if(output_buffer != NULL) dt_opencl_unmap_mem_object(devid, pinned_output, output_buffer);
1974 dt_opencl_release_mem_object(pinned_output);
1975 dt_opencl_release_mem_object(input);
1976 dt_opencl_release_mem_object(output);
1977 piece->pipe->tiling = 0;
1978 return TRUE;
1979
1980 error:
1981 /* copy back stored processed_maximum */
1982 for(int k = 0; k < 4; k++) piece->pipe->dsc.processed_maximum[k] = processed_maximum_saved[k];
1983 if(input_buffer != NULL) dt_opencl_unmap_mem_object(devid, pinned_input, input_buffer);
1984 dt_opencl_release_mem_object(pinned_input);
1985 if(output_buffer != NULL) dt_opencl_unmap_mem_object(devid, pinned_output, output_buffer);
1986 dt_opencl_release_mem_object(pinned_output);
1987 dt_opencl_release_mem_object(input);
1988 dt_opencl_release_mem_object(output);
1989 piece->pipe->tiling = 0;
1990 dt_print(
1991 DT_DEBUG_OPENCL,
1992 "[default_process_tiling_opencl_roi] couldn't run process_cl() for module '%s' in tiling mode: %d\n",
1993 self->op, err);
1994 return FALSE;
1995 }
1996
1997
1998
1999 /* if a module does not implement process_tiling_cl() by itself, this function is called instead.
2000 _default_process_tiling_cl_ptp() is able to handle standard cases where pixels do not change their places.
2001 _default_process_tiling_cl_roi() takes care of all other cases where image gets distorted. */
default_process_tiling_cl(struct dt_iop_module_t * self,struct dt_dev_pixelpipe_iop_t * piece,const void * const ivoid,void * const ovoid,const dt_iop_roi_t * const roi_in,const dt_iop_roi_t * const roi_out,const int in_bpp)2002 int default_process_tiling_cl(struct dt_iop_module_t *self, struct dt_dev_pixelpipe_iop_t *piece,
2003 const void *const ivoid, void *const ovoid, const dt_iop_roi_t *const roi_in,
2004 const dt_iop_roi_t *const roi_out, const int in_bpp)
2005 {
2006 if(memcmp(roi_in, roi_out, sizeof(struct dt_iop_roi_t)) || (self->flags() & IOP_FLAGS_TILING_FULL_ROI))
2007 return _default_process_tiling_cl_roi(self, piece, ivoid, ovoid, roi_in, roi_out, in_bpp);
2008 else
2009 return _default_process_tiling_cl_ptp(self, piece, ivoid, ovoid, roi_in, roi_out, in_bpp);
2010 }
2011
2012 #else
default_process_tiling_cl(struct dt_iop_module_t * self,struct dt_dev_pixelpipe_iop_t * piece,const void * const ivoid,void * const ovoid,const dt_iop_roi_t * const roi_in,const dt_iop_roi_t * const roi_out,const int in_bpp)2013 int default_process_tiling_cl(struct dt_iop_module_t *self, struct dt_dev_pixelpipe_iop_t *piece,
2014 const void *const ivoid, void *const ovoid, const dt_iop_roi_t *const roi_in,
2015 const dt_iop_roi_t *const roi_out, const int in_bpp)
2016 {
2017 return FALSE;
2018 }
2019 #endif
2020
2021
2022 /* If a module does not implement tiling_callback() by itself, this function is called instead.
2023 Default is an image size factor of 2 (i.e. input + output buffer needed), no overhead (1),
2024 no overlap between tiles, and an pixel alignment of 1 in x and y direction, i.e. no special
2025 alignment required. Simple pixel to pixel modules (take tonecurve as an example) can happily
2026 live with that.
2027 (1) Small overhead like look-up-tables in tonecurve can be ignored safely. */
default_tiling_callback(struct dt_iop_module_t * self,struct dt_dev_pixelpipe_iop_t * piece,const dt_iop_roi_t * roi_in,const dt_iop_roi_t * roi_out,struct dt_develop_tiling_t * tiling)2028 void default_tiling_callback(struct dt_iop_module_t *self, struct dt_dev_pixelpipe_iop_t *piece,
2029 const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out,
2030 struct dt_develop_tiling_t *tiling)
2031 {
2032 const float ioratio
2033 = ((float)roi_out->width * (float)roi_out->height) / ((float)roi_in->width * (float)roi_in->height);
2034
2035 tiling->factor = 1.0f + ioratio;
2036 tiling->factor_cl = tiling->factor; // by default, we need the same memory on host or GPU
2037 tiling->maxbuf = 1.0f;
2038 tiling->maxbuf_cl = tiling->maxbuf;
2039 tiling->overhead = 0;
2040 tiling->overlap = 0;
2041 tiling->xalign = 1;
2042 tiling->yalign = 1;
2043
2044 if((self->flags() & IOP_FLAGS_TILING_FULL_ROI) == IOP_FLAGS_TILING_FULL_ROI) tiling->overlap = 4;
2045
2046 if(self->iop_order > dt_ioppr_get_iop_order(piece->pipe->iop_order_list, "demosaic", 0)) return;
2047
2048 // all operations that work with mosaiced data should respect pattern size!
2049
2050 if(!piece->pipe->dsc.filters) return;
2051
2052 if(piece->pipe->dsc.filters == 9u)
2053 {
2054 // X-Trans, sensor is 6x6
2055 tiling->xalign = 6;
2056 tiling->yalign = 6;
2057 }
2058 else
2059 {
2060 // Bayer, good old 2x2
2061 tiling->xalign = 2;
2062 tiling->yalign = 2;
2063 }
2064
2065 return;
2066 }
2067
dt_tiling_piece_fits_host_memory(const size_t width,const size_t height,const unsigned bpp,const float factor,const size_t overhead)2068 int dt_tiling_piece_fits_host_memory(const size_t width, const size_t height, const unsigned bpp,
2069 const float factor, const size_t overhead)
2070 {
2071 static int host_memory_limit = -1;
2072
2073 /* first time run */
2074 if(host_memory_limit < 0)
2075 {
2076 host_memory_limit = dt_conf_get_int("host_memory_limit");
2077
2078 /* don't let the user play games with us */
2079 if(host_memory_limit != 0) host_memory_limit = CLAMPI(host_memory_limit, 500, 50000);
2080 dt_conf_set_int("host_memory_limit", host_memory_limit);
2081 }
2082
2083 const float requirement = factor * width * height * bpp + overhead;
2084
2085 if(host_memory_limit == 0 || requirement <= host_memory_limit * 1024.0f * 1024.0f)
2086 return TRUE;
2087 else
2088 return FALSE;
2089 }
2090
2091 // modelines: These editor modelines have been set for all relevant files by tools/update_modelines.sh
2092 // vim: shiftwidth=2 expandtab tabstop=2 cindent
2093 // kate: tab-indents: off; indent-width 2; replace-tabs on; indent-mode cstyle; remove-trailing-spaces modified;
2094