1 /* --------------------------------------------------------------------------
2 This file is part of darktable,
3 Copyright (C) 2012-2021 darktable developers.
4
5 darktable is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3 of the License, or
8 (at your option) any later version.
9
10 darktable is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with darktable. If not, see <http://www.gnu.org/licenses/>.
17 * ------------------------------------------------------------------------*/
18
19 #include "common/interpolation.h"
20 #include "common/darktable.h"
21 #include "common/math.h"
22 #include "control/conf.h"
23
24 #include <assert.h>
25 #include <glib.h>
26 #include <inttypes.h>
27 #include <stddef.h>
28 #include <stdint.h>
29
30 /** Border extrapolation modes */
31 enum border_mode
32 {
33 BORDER_REPLICATE, // aaaa|abcdefg|gggg
34 BORDER_WRAP, // defg|abcdefg|abcd
35 BORDER_MIRROR, // edcb|abcdefg|fedc
36 BORDER_CLAMP // ....|abcdefg|....
37 };
38
39 /* Supporting them all might be overkill, let the compiler trim all
40 * unnecessary modes in clip for resampling codepath*/
41 #define RESAMPLING_BORDER_MODE BORDER_REPLICATE
42
43 /* Supporting them all might be overkill, let the compiler trim all
44 * unnecessary modes in interpolation codepath */
45 #define INTERPOLATION_BORDER_MODE BORDER_MIRROR
46
47 // Defines minimum alignment requirement for critical SIMD code
48 #define SSE_ALIGNMENT 64
49
50 // Defines the maximum kernel half length
51 // !! Make sure to sync this with the filter array !!
52 #define MAX_HALF_FILTER_WIDTH 3
53
54 // Add code for timing resampling function
55 #define DEBUG_RESAMPLING_TIMING 0
56
57 // Add debug info messages to stderr
58 #define DEBUG_PRINT_INFO 0
59
60 // Add *verbose* (like one msg per pixel out) debug message to stderr
61 #define DEBUG_PRINT_VERBOSE 0
62
63 /* --------------------------------------------------------------------------
64 * Debug helpers
65 * ------------------------------------------------------------------------*/
66
67 #if DEBUG_RESAMPLING_TIMING
68 #include <sys/time.h>
69 #endif
70
71 #if DEBUG_PRINT_INFO
72 #define debug_info(...) \
73 do \
74 { \
75 fprintf(stderr, __VA_ARGS__); \
76 } while(0)
77 #else
78 #define debug_info(...)
79 #endif
80
81 #if DEBUG_PRINT_VERBOSE
82 #define debug_extra(...) \
83 do \
84 { \
85 fprintf(stderr, __VA_ARGS__); \
86 } while(0)
87 #else
88 #define debug_extra(...)
89 #endif
90
91 #if DEBUG_RESAMPLING_TIMING
getts()92 static inline int64_t getts()
93 {
94 struct timeval t;
95 gettimeofday(&t, NULL);
96 return t.tv_sec * INT64_C(1000000) + t.tv_usec;
97 }
98 #endif
99
100 /* --------------------------------------------------------------------------
101 * Generic helpers
102 * ------------------------------------------------------------------------*/
103
104 /** Clip into specified range
105 * @param idx index to filter
106 * @param length length of line
107 */
clip(int i,int min,int max,enum border_mode mode)108 static inline int clip(int i, int min, int max, enum border_mode mode)
109 {
110 switch(mode)
111 {
112 case BORDER_REPLICATE:
113 if(i < min)
114 {
115 i = min;
116 }
117 else if(i > max)
118 {
119 i = max;
120 }
121 break;
122 case BORDER_MIRROR:
123 if(i < min)
124 {
125 i = min - i;
126 }
127 else if(i > max)
128 {
129 i = 2 * max - i;
130 }
131 break;
132 case BORDER_WRAP:
133 if(i < min)
134 {
135 i = max - (min - i);
136 }
137 else if(i > max)
138 {
139 i = min + (i - max);
140 }
141 break;
142 case BORDER_CLAMP:
143 if(i < min || i > max)
144 {
145 /* Should not be used as is, we prevent -1 usage, filtering the taps
146 * we clip the sample indexes for. So understand this function is
147 * specific to its caller. */
148 i = -1;
149 }
150 break;
151 }
152
153 return i;
154 }
155
prepare_tap_boundaries(int * tap_first,int * tap_last,const enum border_mode mode,const int filterwidth,const int t,const int max)156 static inline void prepare_tap_boundaries(int *tap_first, int *tap_last, const enum border_mode mode,
157 const int filterwidth, const int t, const int max)
158 {
159 /* Check lower bound pixel index and skip as many pixels as necessary to
160 * fall into range */
161 *tap_first = 0;
162 if(mode == BORDER_CLAMP && t < 0)
163 {
164 *tap_first = -t;
165 }
166
167 // Same for upper bound pixel
168 *tap_last = filterwidth;
169 if(mode == BORDER_CLAMP && t + filterwidth >= max)
170 {
171 *tap_last = max - t;
172 }
173 }
174
175 /** Make sure an aligned chunk will not misalign its following chunk
176 * proposing an adapted length
177 *
178 * @param l Length required for current chunk
179 * @param align Required alignment for next chunk
180 *
181 * @return Required length for keeping alignment ok if chaining data chunks
182 */
increase_for_alignment(size_t l,size_t align)183 static inline size_t increase_for_alignment(size_t l, size_t align)
184 {
185 align -= 1;
186 return (l + align) & (~align);
187 }
188
189 /* --------------------------------------------------------------------------
190 * Interpolation kernels
191 * ------------------------------------------------------------------------*/
192
193 /* --------------------------------------------------------------------------
194 * Bilinear interpolation
195 * ------------------------------------------------------------------------*/
196
bilinear(float width,float t)197 static inline float bilinear(float width, float t)
198 {
199 float r;
200 t = fabsf(t);
201 if(t > 1.f)
202 {
203 r = 0.f;
204 }
205 else
206 {
207 r = 1.f - t;
208 }
209 return r;
210 }
211
212 #if defined(__SSE2__)
bilinear_sse(__m128 width,__m128 t)213 static inline __m128 bilinear_sse(__m128 width, __m128 t)
214 {
215 static const __m128 one = { 1.f, 1.f, 1.f, 1.f };
216 return _mm_sub_ps(one, _mm_abs_ps(t));
217 }
218 #endif
219
220 /* --------------------------------------------------------------------------
221 * Bicubic interpolation
222 * ------------------------------------------------------------------------*/
223
bicubic(float width,float t)224 static inline float bicubic(float width, float t)
225 {
226 float r;
227 t = fabsf(t);
228 if(t >= 2.f)
229 {
230 r = 0.f;
231 }
232 else if(t > 1.f && t < 2.f)
233 {
234 float t2 = t * t;
235 r = 0.5f * (t * (-t2 + 5.f * t - 8.f) + 4.f);
236 }
237 else
238 {
239 float t2 = t * t;
240 r = 0.5f * (t * (3.f * t2 - 5.f * t) + 2.f);
241 }
242 return r;
243 }
244
245 #if defined(__SSE2__)
bicubic_sse(__m128 width,__m128 t)246 static inline __m128 bicubic_sse(__m128 width, __m128 t)
247 {
248 static const __m128 half = { .5f, .5f, .5f, .5f };
249 static const __m128 one = { 1.f, 1.f, 1.f, 1.f };
250 static const __m128 two = { 2.f, 2.f, 2.f, 2.f };
251 static const __m128 three = { 3.f, 3.f, 3.f, 3.f };
252 static const __m128 four = { 4.f, 4.f, 4.f, 4.f };
253 static const __m128 five = { 5.f, 5.f, 5.f, 5.f };
254 static const __m128 eight = { 8.f, 8.f, 8.f, 8.f };
255
256 t = _mm_abs_ps(t);
257 const __m128 t2 = _mm_mul_ps(t, t);
258
259 /* Compute 1 < t < 2 case:
260 * 0.5f*(t*(-t2 + 5.f*t - 8.f) + 4.f)
261 * half*(t*(mt2 + t5 - eight) + four)
262 * half*(t*(mt2 + t5_sub_8) + four)
263 * half*(t*(mt2_add_t5_sub_8) + four) */
264 const __m128 t5 = _mm_mul_ps(five, t);
265 const __m128 t5_sub_8 = _mm_sub_ps(t5, eight);
266 const __m128 zero = _mm_setzero_ps();
267 const __m128 mt2 = _mm_sub_ps(zero, t2);
268 const __m128 mt2_add_t5_sub_8 = _mm_add_ps(mt2, t5_sub_8);
269 const __m128 a = _mm_mul_ps(t, mt2_add_t5_sub_8);
270 const __m128 b = _mm_add_ps(a, four);
271 __m128 r12 = _mm_mul_ps(b, half);
272
273 /* Compute case < 1
274 * 0.5f*(t*(3.f*t2 - 5.f*t) + 2.f) */
275 const __m128 t23 = _mm_mul_ps(three, t2);
276 const __m128 c = _mm_sub_ps(t23, t5);
277 const __m128 d = _mm_mul_ps(t, c);
278 const __m128 e = _mm_add_ps(d, two);
279 __m128 r01 = _mm_mul_ps(half, e);
280
281 // Compute masks fr keeping correct components
282 const __m128 mask01 = _mm_cmple_ps(t, one);
283 const __m128 mask12 = _mm_cmpgt_ps(t, one);
284 r01 = _mm_and_ps(mask01, r01);
285 r12 = _mm_and_ps(mask12, r12);
286
287
288 return _mm_or_ps(r01, r12);
289 }
290 #endif
291
292 /* --------------------------------------------------------------------------
293 * Lanczos interpolation
294 * ------------------------------------------------------------------------*/
295
296 #define DT_LANCZOS_EPSILON (1e-9f)
297
298 #if 0
299 // Reference version left here for ... documentation
300 static inline float
301 lanczos(float width, float t)
302 {
303 float r;
304
305 if (t<-width || t>width)
306 {
307 r = 0.f;
308 }
309 else if (t>-DT_LANCZOS_EPSILON && t<DT_LANCZOS_EPSILON)
310 {
311 r = 1.f;
312 }
313 else
314 {
315 r = width*sinf(M_PI*t)*sinf(M_PI*t/width)/(M_PI*M_PI*t*t);
316 }
317 return r;
318 }
319 #endif
320
321 /* Fast lanczos version, no calls to math.h functions, too accurate, too slow
322 *
323 * Based on a forum entry at
324 * http://devmaster.net/forums/topic/4648-fast-and-accurate-sinecosine/
325 *
326 * Apart the fast sine function approximation, the only trick is to compute:
327 * sin(pi.t) = sin(a.pi + r.pi) where t = a + r = trunc(t) + r
328 * = sin(a.pi).cos(r.pi) + sin(r.pi).cos(a.pi)
329 * = 0*cos(r.pi) + sin(r.pi).cos(a.pi)
330 * = sign.sin(r.pi) where sign = 1 if the a is even
331 * = -1 if the a is odd
332 *
333 * Of course we know that lanczos func will only be called for
334 * the range -width < t < width so we can additionally avoid the
335 * range check. */
336
lanczos(float width,float t)337 static inline float lanczos(float width, float t)
338 {
339 /* Compute a value for sinf(pi.t) in [-pi pi] for which the value will be
340 * correct */
341 int a = (int)t;
342 float r = t - (float)a;
343
344 // Compute the correct sign for sinf(pi.r)
345 union
346 {
347 float f;
348 uint32_t i;
349 } sign;
350 sign.i = ((a & 1) << 31) | 0x3f800000;
351
352 return (DT_LANCZOS_EPSILON + width * sign.f * sinf_fast(M_PI_F * r) * sinf_fast(M_PI_F * t / width))
353 / (DT_LANCZOS_EPSILON + M_PI_F * M_PI_F * t * t);
354 }
355
356 #if defined(__SSE2__)
lanczos_sse2(__m128 width,__m128 t)357 static inline __m128 lanczos_sse2(__m128 width, __m128 t)
358 {
359 /* Compute a value for sinf(pi.t) in [-pi pi] for which the value will be
360 * correct */
361 __m128i a = _mm_cvtps_epi32(t);
362 __m128 r = _mm_sub_ps(t, _mm_cvtepi32_ps(a));
363
364 // Compute the correct sign for sinf(pi.r)
365 static const uint32_t fone[] __attribute__((aligned(SSE_ALIGNMENT)))
366 = { 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 };
367 static const uint32_t ione[] __attribute__((aligned(SSE_ALIGNMENT))) = { 1, 1, 1, 1 };
368 static const __m128 eps
369 = { DT_LANCZOS_EPSILON, DT_LANCZOS_EPSILON, DT_LANCZOS_EPSILON, DT_LANCZOS_EPSILON };
370 static const __m128 pi = { M_PI, M_PI, M_PI, M_PI };
371 static const __m128 pi2 = { M_PI * M_PI, M_PI * M_PI, M_PI * M_PI, M_PI * M_PI };
372
373 __m128i isign = _mm_and_si128(*(__m128i *)ione, a);
374 isign = _mm_slli_epi64(isign, 31);
375 isign = _mm_or_si128(*(__m128i *)fone, isign);
376 const __m128 fsign = _mm_castsi128_ps(isign);
377
378 __m128 num = _mm_mul_ps(width, fsign);
379 num = _mm_mul_ps(num, sinf_fast_sse(_mm_mul_ps(pi, r)));
380 num = _mm_mul_ps(num, sinf_fast_sse(_mm_div_ps(_mm_mul_ps(pi, t), width)));
381 num = _mm_add_ps(eps, num);
382
383 __m128 den = _mm_mul_ps(pi2, _mm_mul_ps(t, t));
384 den = _mm_add_ps(eps, den);
385
386 return _mm_div_ps(num, den);
387 }
388 #endif
389
390 #undef DT_LANCZOS_EPSILON
391
392 /* --------------------------------------------------------------------------
393 * All our known interpolators
394 * ------------------------------------------------------------------------*/
395
396 /* !!! !!! !!!
397 * Make sure MAX_HALF_FILTER_WIDTH is at least equal to the maximum width
398 * of this filter list. Otherwise bad things will happen
399 * !!! !!! !!!
400 */
401 static const struct dt_interpolation dt_interpolator[] = {
402 {.id = DT_INTERPOLATION_BILINEAR,
403 .name = "bilinear",
404 .width = 1,
405 .func = &bilinear,
406 #if defined(__SSE2__)
407 .funcsse = &bilinear_sse
408 #endif
409 },
410 {.id = DT_INTERPOLATION_BICUBIC,
411 .name = "bicubic",
412 .width = 2,
413 .func = &bicubic,
414 #if defined(__SSE2__)
415 .funcsse = &bicubic_sse
416 #endif
417 },
418 {.id = DT_INTERPOLATION_LANCZOS2,
419 .name = "lanczos2",
420 .width = 2,
421 .func = &lanczos,
422 #if defined(__SSE2__)
423 .funcsse = &lanczos_sse2
424 #endif
425 },
426 {.id = DT_INTERPOLATION_LANCZOS3,
427 .name = "lanczos3",
428 .width = 3,
429 .func = &lanczos,
430 #if defined(__SSE2__)
431 .funcsse = &lanczos_sse2
432 #endif
433 },
434 };
435
436 /* --------------------------------------------------------------------------
437 * Kernel utility methods
438 * ------------------------------------------------------------------------*/
439
440 /** Computes an upsampling filtering kernel
441 *
442 * @param itor [in] Interpolator used
443 * @param kernel [out] resulting itor->width*2 filter taps
444 * @param norm [out] Kernel norm
445 * @param first [out] first input sample index used
446 * @param t [in] Interpolated coordinate */
compute_upsampling_kernel_plain(const struct dt_interpolation * itor,float * kernel,float * norm,int * first,float t)447 static inline void compute_upsampling_kernel_plain(const struct dt_interpolation *itor, float *kernel,
448 float *norm, int *first, float t)
449 {
450 int f = (int)t - itor->width + 1;
451 if(first)
452 {
453 *first = f;
454 }
455
456 /* Find closest integer position and then offset that to match first
457 * filtered sample position */
458 t = t - (float)f;
459
460 // Will hold kernel norm
461 float n = 0.f;
462
463 // Compute the raw kernel
464 for(int i = 0; i < 2 * itor->width; i++)
465 {
466 float tap = itor->func((float)itor->width, t);
467 n += tap;
468 kernel[i] = tap;
469 t -= 1.f;
470 }
471 if(norm)
472 {
473 *norm = n;
474 }
475 }
476
477 #if defined(__SSE2__)
478 /** Computes an upsampling filtering kernel (SSE version, four taps per inner loop)
479 *
480 * @param itor [in] Interpolator used
481 * @param kernel [out] resulting itor->width*2 filter taps (array must be at least (itor->width*2+3)/4*4
482 *floats long)
483 * @param norm [out] Kernel norm
484 * @param first [out] first input sample index used
485 * @param t [in] Interpolated coordinate
486 *
487 * @return kernel norm
488 */
compute_upsampling_kernel_sse(const struct dt_interpolation * itor,float * kernel,float * norm,int * first,float t)489 static inline void compute_upsampling_kernel_sse(const struct dt_interpolation *itor, float *kernel,
490 float *norm, int *first, float t)
491 {
492 int f = (int)t - itor->width + 1;
493 if(first)
494 {
495 *first = f;
496 }
497
498 /* Find closest integer position and then offset that to match first
499 * filtered sample position */
500 t = t - (float)f;
501
502 // Prepare t vector to compute four values a loop
503 static const __m128 bootstrap = { 0.f, -1.f, -2.f, -3.f };
504 static const __m128 iter = { -4.f, -4.f, -4.f, -4.f };
505 __m128 vt = _mm_add_ps(_mm_set_ps1(t), bootstrap);
506 __m128 vw = _mm_set_ps1((float)itor->width);
507
508 // Prepare counters (math kept stupid for understanding)
509 int i = 0;
510 const int runs = (2 * itor->width + 3) / 4;
511
512 while(i < runs)
513 {
514 // Compute the values
515 const __m128 vr = itor->funcsse(vw, vt);
516
517 // Save result
518 *(__m128 *)kernel = vr;
519
520 // Prepare next iteration
521 vt = _mm_add_ps(vt, iter);
522 kernel += 4;
523 i++;
524 }
525
526 // compute norm now
527 if(norm)
528 {
529 float n = 0.f;
530 i = 0;
531 kernel -= 4 * runs;
532 while(i < 2 * itor->width)
533 {
534 n += *kernel;
535 kernel++;
536 i++;
537 }
538 *norm = n;
539 }
540 }
541 #endif
542
compute_upsampling_kernel(const struct dt_interpolation * itor,float * kernel,float * norm,int * first,float t)543 static inline void compute_upsampling_kernel(const struct dt_interpolation *itor, float *kernel, float *norm,
544 int *first, float t)
545 {
546 if(darktable.codepath.OPENMP_SIMD) return compute_upsampling_kernel_plain(itor, kernel, norm, first, t);
547 #if defined(__SSE2__)
548 else if(darktable.codepath.SSE2)
549 return compute_upsampling_kernel_sse(itor, kernel, norm, first, t);
550 #endif
551 else
552 dt_unreachable_codepath();
553 }
554
555 /** Computes a downsampling filtering kernel
556 *
557 * @param itor [in] Interpolator used
558 * @param kernelsize [out] Number of taps
559 * @param kernel [out] resulting taps (at least itor->width/inoout elements for no overflow)
560 * @param norm [out] Kernel norm
561 * @param first [out] index of the first sample for which the kernel is to be applied
562 * @param outoinratio [in] "out samples" over "in samples" ratio
563 * @param xout [in] Output coordinate */
compute_downsampling_kernel_plain(const struct dt_interpolation * itor,int * taps,int * first,float * kernel,float * norm,float outoinratio,int xout)564 static inline void compute_downsampling_kernel_plain(const struct dt_interpolation *itor, int *taps,
565 int *first, float *kernel, float *norm,
566 float outoinratio, int xout)
567 {
568 // Keep this at hand
569 const float w = (float)itor->width;
570
571 /* Compute the phase difference between output pixel and its
572 * input corresponding input pixel */
573 const float xin = ceil_fast(((float)xout - w) / outoinratio);
574 if(first)
575 {
576 *first = (int)xin;
577 }
578
579 // Compute first interpolator parameter
580 float t = xin * outoinratio - (float)xout;
581
582 // Will hold kernel norm
583 float n = 0.f;
584
585 // Compute all filter taps
586 *taps = (int)((w - t) / outoinratio);
587 for(int i = 0; i < *taps; i++)
588 {
589 *kernel = itor->func(w, t);
590 n += *kernel;
591 t += outoinratio;
592 kernel++;
593 }
594
595 if(norm)
596 {
597 *norm = n;
598 }
599 }
600
601
602 #if defined(__SSE2__)
603 /** Computes a downsampling filtering kernel (SSE version, four taps per inner loop iteration)
604 *
605 * @param itor [in] Interpolator used
606 * @param kernelsize [out] Number of taps
607 * @param kernel [out] resulting taps (at least itor->width/inoout + 4 elements for no overflow)
608 * @param norm [out] Kernel norm
609 * @param first [out] index of the first sample for which the kernel is to be applied
610 * @param outoinratio [in] "out samples" over "in samples" ratio
611 * @param xout [in] Output coordinate */
compute_downsampling_kernel_sse(const struct dt_interpolation * itor,int * taps,int * first,float * kernel,float * norm,float outoinratio,int xout)612 static inline void compute_downsampling_kernel_sse(const struct dt_interpolation *itor, int *taps, int *first,
613 float *kernel, float *norm, float outoinratio, int xout)
614 {
615 // Keep this at hand
616 const float w = (float)itor->width;
617
618 /* Compute the phase difference between output pixel and its
619 * input corresponding input pixel */
620 const float xin = ceil_fast(((float)xout - w) / outoinratio);
621 if(first)
622 {
623 *first = (int)xin;
624 }
625
626 // Compute first interpolator parameter
627 float t = xin * outoinratio - (float)xout;
628
629 // Compute all filter taps
630 *taps = (int)((w - t) / outoinratio);
631
632 // Bootstrap vector t
633 static const __m128 bootstrap = { 0.f, 1.f, 2.f, 3.f };
634 const __m128 iter = _mm_set_ps1(4.f * outoinratio);
635 const __m128 vw = _mm_set_ps1(w);
636 __m128 vt = _mm_add_ps(_mm_set_ps1(t), _mm_mul_ps(_mm_set_ps1(outoinratio), bootstrap));
637
638 // Prepare counters (math kept stupid for understanding)
639 int i = 0;
640 const int runs = (*taps + 3) / 4;
641
642 while(i < runs)
643 {
644 // Compute the values
645 const __m128 vr = itor->funcsse(vw, vt);
646
647 // Save result
648 *(__m128 *)kernel = vr;
649
650 // Prepare next iteration
651 vt = _mm_add_ps(vt, iter);
652 kernel += 4;
653 i++;
654 }
655
656 // compute norm now
657 if(norm)
658 {
659 float n = 0.f;
660 i = 0;
661 kernel -= 4 * runs;
662 while(i < *taps)
663 {
664 n += *kernel;
665 kernel++;
666 i++;
667 }
668 *norm = n;
669 }
670 }
671 #endif
672
compute_downsampling_kernel(const struct dt_interpolation * itor,int * taps,int * first,float * kernel,float * norm,float outoinratio,int xout)673 static inline void compute_downsampling_kernel(const struct dt_interpolation *itor, int *taps, int *first,
674 float *kernel, float *norm, float outoinratio, int xout)
675 {
676 if(darktable.codepath.OPENMP_SIMD)
677 return compute_downsampling_kernel_plain(itor, taps, first, kernel, norm, outoinratio, xout);
678 #if defined(__SSE2__)
679 else if(darktable.codepath.SSE2)
680 return compute_downsampling_kernel_sse(itor, taps, first, kernel, norm, outoinratio, xout);
681 #endif
682 else
683 dt_unreachable_codepath();
684 }
685
686 /* --------------------------------------------------------------------------
687 * Sample interpolation function (see usage in iop/lens.c and iop/clipping.c)
688 * ------------------------------------------------------------------------*/
689
690 #define MAX_KERNEL_REQ ((2 * (MAX_HALF_FILTER_WIDTH) + 3) & (~3))
691
dt_interpolation_compute_sample(const struct dt_interpolation * itor,const float * in,const float x,const float y,const int width,const int height,const int samplestride,const int linestride)692 float dt_interpolation_compute_sample(const struct dt_interpolation *itor, const float *in, const float x,
693 const float y, const int width, const int height,
694 const int samplestride, const int linestride)
695 {
696 assert(itor->width < (MAX_HALF_FILTER_WIDTH + 1));
697
698 float kernelh[MAX_KERNEL_REQ] __attribute__((aligned(SSE_ALIGNMENT)));
699 float kernelv[MAX_KERNEL_REQ] __attribute__((aligned(SSE_ALIGNMENT)));
700
701 // Compute both horizontal and vertical kernels
702 float normh;
703 float normv;
704 compute_upsampling_kernel(itor, kernelh, &normh, NULL, x);
705 compute_upsampling_kernel(itor, kernelv, &normv, NULL, y);
706
707 int ix = (int)x;
708 int iy = (int)y;
709
710 /* Now 2 cases, the pixel + filter width goes outside the image
711 * in that case we have to use index clipping to keep all reads
712 * in the input image (slow path) or we are sure it won't fall
713 * outside and can do more simple code */
714 float r;
715 if(ix >= (itor->width - 1) && iy >= (itor->width - 1) && ix < (width - itor->width)
716 && iy < (height - itor->width))
717 {
718 // Inside image boundary case
719
720 // Go to top left pixel
721 in = (float *)in + linestride * iy + ix * samplestride;
722 in = in - (itor->width - 1) * (samplestride + linestride);
723
724 // Apply the kernel
725 float s = 0.f;
726 for(int i = 0; i < 2 * itor->width; i++)
727 {
728 float h = 0.0f;
729 for(int j = 0; j < 2 * itor->width; j++)
730 {
731 h += kernelh[j] * in[j * samplestride];
732 }
733 s += kernelv[i] * h;
734 in += linestride;
735 }
736 r = s / (normh * normv);
737 }
738 else if(ix >= 0 && iy >= 0 && ix < width && iy < height)
739 {
740 // At least a valid coordinate
741
742 // Point to the upper left pixel index wise
743 iy -= itor->width - 1;
744 ix -= itor->width - 1;
745
746 static const enum border_mode bordermode = INTERPOLATION_BORDER_MODE;
747 assert(bordermode != BORDER_CLAMP); // XXX in clamp mode, norms would be wrong
748
749 int xtap_first;
750 int xtap_last;
751 prepare_tap_boundaries(&xtap_first, &xtap_last, bordermode, 2 * itor->width, ix, width);
752
753 int ytap_first;
754 int ytap_last;
755 prepare_tap_boundaries(&ytap_first, &ytap_last, bordermode, 2 * itor->width, iy, height);
756
757 // Apply the kernel
758 float s = 0.f;
759 for(int i = ytap_first; i < ytap_last; i++)
760 {
761 const int clip_y = clip(iy + i, 0, height - 1, bordermode);
762 float h = 0.0f;
763 for(int j = xtap_first; j < xtap_last; j++)
764 {
765 const int clip_x = clip(ix + j, 0, width - 1, bordermode);
766 const float *ipixel = in + clip_y * linestride + clip_x * samplestride;
767 h += kernelh[j] * ipixel[0];
768 }
769 s += kernelv[i] * h;
770 }
771
772 r = s / (normh * normv);
773 }
774 else
775 {
776 // invalid coordinate
777 r = 0.0f;
778 }
779 return r;
780 }
781
782 /* --------------------------------------------------------------------------
783 * Pixel interpolation function (see usage in iop/lens.c and iop/clipping.c)
784 * ------------------------------------------------------------------------*/
785
dt_interpolation_compute_pixel4c_plain(const struct dt_interpolation * itor,const float * in,float * out,const float x,const float y,const int width,const int height,const int linestride)786 static void dt_interpolation_compute_pixel4c_plain(const struct dt_interpolation *itor, const float *in,
787 float *out, const float x, const float y, const int width,
788 const int height, const int linestride)
789 {
790 assert(itor->width < (MAX_HALF_FILTER_WIDTH + 1));
791
792 // Quite a bit of space for kernels
793 float kernelh[MAX_KERNEL_REQ] __attribute__((aligned(SSE_ALIGNMENT)));
794 float kernelv[MAX_KERNEL_REQ] __attribute__((aligned(SSE_ALIGNMENT)));
795
796 // Compute both horizontal and vertical kernels
797 float normh;
798 float normv;
799 compute_upsampling_kernel(itor, kernelh, &normh, NULL, x);
800 compute_upsampling_kernel(itor, kernelv, &normv, NULL, y);
801
802 // Precompute the inverse of the filter norm for later use
803 const float oonorm = (1.f / (normh * normv));
804
805 /* Now 2 cases, the pixel + filter width goes outside the image
806 * in that case we have to use index clipping to keep all reads
807 * in the input image (slow path) or we are sure it won't fall
808 * outside and can do more simple code */
809 int ix = (int)x;
810 int iy = (int)y;
811
812 if(ix >= (itor->width - 1) && iy >= (itor->width - 1) && ix < (width - itor->width)
813 && iy < (height - itor->width))
814 {
815 // Inside image boundary case
816
817 // Go to top left pixel
818 in = (float *)in + linestride * iy + ix * 4;
819 in = in - (itor->width - 1) * (4 + linestride);
820
821 // Apply the kernel
822 float pixel[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
823 for(int i = 0; i < 2 * itor->width; i++)
824 {
825 float h[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
826 for(int j = 0; j < 2 * itor->width; j++)
827 {
828 for(int c = 0; c < 3; c++) h[c] += kernelh[j] * in[j * 4 + c];
829 }
830 for(int c = 0; c < 3; c++) pixel[c] += kernelv[i] * h[c];
831 in += linestride;
832 }
833
834 for(int c = 0; c < 3; c++) out[c] = oonorm * pixel[c];
835 }
836 else if(ix >= 0 && iy >= 0 && ix < width && iy < height)
837 {
838 // At least a valid coordinate
839
840 // Point to the upper left pixel index wise
841 iy -= itor->width - 1;
842 ix -= itor->width - 1;
843
844 static const enum border_mode bordermode = INTERPOLATION_BORDER_MODE;
845 assert(bordermode != BORDER_CLAMP); // XXX in clamp mode, norms would be wrong
846
847 int xtap_first;
848 int xtap_last;
849 prepare_tap_boundaries(&xtap_first, &xtap_last, bordermode, 2 * itor->width, ix, width);
850
851 int ytap_first;
852 int ytap_last;
853 prepare_tap_boundaries(&ytap_first, &ytap_last, bordermode, 2 * itor->width, iy, height);
854
855 // Apply the kernel
856 float pixel[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
857 for(int i = ytap_first; i < ytap_last; i++)
858 {
859 const int clip_y = clip(iy + i, 0, height - 1, bordermode);
860 float h[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
861 for(int j = xtap_first; j < xtap_last; j++)
862 {
863 const int clip_x = clip(ix + j, 0, width - 1, bordermode);
864 const float *ipixel = in + clip_y * linestride + clip_x * 4;
865 for(int c = 0; c < 3; c++) h[c] += kernelh[j] * ipixel[c];
866 }
867 for(int c = 0; c < 3; c++) pixel[c] += kernelv[i] * h[c];
868 }
869
870 for(int c = 0; c < 3; c++) out[c] = oonorm * pixel[c];
871 }
872 else
873 {
874 for(int c = 0; c < 3; c++) out[c] = 0.0f;
875 }
876 }
877
878 #if defined(__SSE2__)
dt_interpolation_compute_pixel4c_sse(const struct dt_interpolation * itor,const float * in,float * out,const float x,const float y,const int width,const int height,const int linestride)879 static void dt_interpolation_compute_pixel4c_sse(const struct dt_interpolation *itor, const float *in,
880 float *out, const float x, const float y, const int width,
881 const int height, const int linestride)
882 {
883 assert(itor->width < (MAX_HALF_FILTER_WIDTH + 1));
884
885 // Quite a bit of space for kernels
886 float kernelh[MAX_KERNEL_REQ] __attribute__((aligned(SSE_ALIGNMENT)));
887 float kernelv[MAX_KERNEL_REQ] __attribute__((aligned(SSE_ALIGNMENT)));
888 __m128 vkernelh[2 * MAX_HALF_FILTER_WIDTH];
889 __m128 vkernelv[2 * MAX_HALF_FILTER_WIDTH];
890
891 // Compute both horizontal and vertical kernels
892 float normh;
893 float normv;
894 compute_upsampling_kernel(itor, kernelh, &normh, NULL, x);
895 compute_upsampling_kernel(itor, kernelv, &normv, NULL, y);
896
897 // We will process four components a time, duplicate the information
898 for(int i = 0; i < 2 * itor->width; i++)
899 {
900 vkernelh[i] = _mm_set_ps1(kernelh[i]);
901 vkernelv[i] = _mm_set_ps1(kernelv[i]);
902 }
903
904 // Precompute the inverse of the filter norm for later use
905 const __m128 oonorm = _mm_set_ps1(1.f / (normh * normv));
906
907 /* Now 2 cases, the pixel + filter width goes outside the image
908 * in that case we have to use index clipping to keep all reads
909 * in the input image (slow path) or we are sure it won't fall
910 * outside and can do more simple code */
911 int ix = (int)x;
912 int iy = (int)y;
913
914 if(ix >= (itor->width - 1) && iy >= (itor->width - 1) && ix < (width - itor->width)
915 && iy < (height - itor->width))
916 {
917 // Inside image boundary case
918
919 // Go to top left pixel
920 in = (float *)in + linestride * iy + ix * 4;
921 in = in - (itor->width - 1) * (4 + linestride);
922
923 // Apply the kernel
924 __m128 pixel = _mm_setzero_ps();
925 for(int i = 0; i < 2 * itor->width; i++)
926 {
927 __m128 h = _mm_setzero_ps();
928 for(int j = 0; j < 2 * itor->width; j++)
929 {
930 h = _mm_add_ps(h, _mm_mul_ps(vkernelh[j], *(__m128 *)&in[j * 4]));
931 }
932 pixel = _mm_add_ps(pixel, _mm_mul_ps(vkernelv[i], h));
933 in += linestride;
934 }
935
936 *(__m128 *)out = _mm_mul_ps(pixel, oonorm);
937 }
938 else if(ix >= 0 && iy >= 0 && ix < width && iy < height)
939 {
940 // At least a valid coordinate
941
942 // Point to the upper left pixel index wise
943 iy -= itor->width - 1;
944 ix -= itor->width - 1;
945
946 static const enum border_mode bordermode = INTERPOLATION_BORDER_MODE;
947 assert(bordermode != BORDER_CLAMP); // XXX in clamp mode, norms would be wrong
948
949 int xtap_first;
950 int xtap_last;
951 prepare_tap_boundaries(&xtap_first, &xtap_last, bordermode, 2 * itor->width, ix, width);
952
953 int ytap_first;
954 int ytap_last;
955 prepare_tap_boundaries(&ytap_first, &ytap_last, bordermode, 2 * itor->width, iy, height);
956
957 // Apply the kernel
958 __m128 pixel = _mm_setzero_ps();
959 for(int i = ytap_first; i < ytap_last; i++)
960 {
961 int clip_y = clip(iy + i, 0, height - 1, bordermode);
962 __m128 h = _mm_setzero_ps();
963 for(int j = xtap_first; j < xtap_last; j++)
964 {
965 const int clip_x = clip(ix + j, 0, width - 1, bordermode);
966 const float *ipixel = in + clip_y * linestride + clip_x * 4;
967 h = _mm_add_ps(h, _mm_mul_ps(vkernelh[j], *(__m128 *)ipixel));
968 }
969 pixel = _mm_add_ps(pixel, _mm_mul_ps(vkernelv[i], h));
970 }
971
972 *(__m128 *)out = _mm_mul_ps(pixel, oonorm);
973 }
974 else
975 {
976 *(__m128 *)out = _mm_set_ps1(0.0f);
977 }
978 }
979 #endif
980
dt_interpolation_compute_pixel4c(const struct dt_interpolation * itor,const float * in,float * out,const float x,const float y,const int width,const int height,const int linestride)981 void dt_interpolation_compute_pixel4c(const struct dt_interpolation *itor, const float *in, float *out,
982 const float x, const float y, const int width, const int height,
983 const int linestride)
984 {
985 if(darktable.codepath.OPENMP_SIMD)
986 return dt_interpolation_compute_pixel4c_plain(itor, in, out, x, y, width, height, linestride);
987 #if defined(__SSE2__)
988 else if(darktable.codepath.SSE2)
989 return dt_interpolation_compute_pixel4c_sse(itor, in, out, x, y, width, height, linestride);
990 #endif
991 else
992 dt_unreachable_codepath();
993 }
994
dt_interpolation_compute_pixel1c_plain(const struct dt_interpolation * itor,const float * in,float * out,const float x,const float y,const int width,const int height,const int linestride)995 static void dt_interpolation_compute_pixel1c_plain(const struct dt_interpolation *itor, const float *in,
996 float *out, const float x, const float y, const int width,
997 const int height, const int linestride)
998 {
999 assert(itor->width < (MAX_HALF_FILTER_WIDTH + 1));
1000
1001 // Quite a bit of space for kernels
1002 float kernelh[MAX_KERNEL_REQ] __attribute__((aligned(SSE_ALIGNMENT)));
1003 float kernelv[MAX_KERNEL_REQ] __attribute__((aligned(SSE_ALIGNMENT)));
1004
1005 // Compute both horizontal and vertical kernels
1006 float normh;
1007 float normv;
1008 compute_upsampling_kernel(itor, kernelh, &normh, NULL, x);
1009 compute_upsampling_kernel(itor, kernelv, &normv, NULL, y);
1010
1011 // Precompute the inverse of the filter norm for later use
1012 const float oonorm = (1.f / (normh * normv));
1013
1014 /* Now 2 cases, the pixel + filter width goes outside the image
1015 * in that case we have to use index clipping to keep all reads
1016 * in the input image (slow path) or we are sure it won't fall
1017 * outside and can do more simple code */
1018 int ix = (int)x;
1019 int iy = (int)y;
1020
1021 if(ix >= (itor->width - 1) && iy >= (itor->width - 1) && ix < (width - itor->width)
1022 && iy < (height - itor->width))
1023 {
1024 // Inside image boundary case
1025
1026 // Go to top left pixel
1027 in = (float *)in + linestride * iy + ix;
1028 in = in - (itor->width - 1) * (1 + linestride);
1029
1030 // Apply the kernel
1031 float pixel = 0.0f;
1032 for(int i = 0; i < 2 * itor->width; i++)
1033 {
1034 float h = 0.0f;
1035 for(int j = 0; j < 2 * itor->width; j++)
1036 {
1037 h += kernelh[j] * in[j];
1038 }
1039 pixel += kernelv[i] * h;
1040 in += linestride;
1041 }
1042
1043 *out = oonorm * pixel;
1044 }
1045 else if(ix >= 0 && iy >= 0 && ix < width && iy < height)
1046 {
1047 // At least a valid coordinate
1048
1049 // Point to the upper left pixel index wise
1050 iy -= itor->width - 1;
1051 ix -= itor->width - 1;
1052
1053 static const enum border_mode bordermode = INTERPOLATION_BORDER_MODE;
1054 assert(bordermode != BORDER_CLAMP); // XXX in clamp mode, norms would be wrong
1055
1056 int xtap_first;
1057 int xtap_last;
1058 prepare_tap_boundaries(&xtap_first, &xtap_last, bordermode, 2 * itor->width, ix, width);
1059
1060 int ytap_first;
1061 int ytap_last;
1062 prepare_tap_boundaries(&ytap_first, &ytap_last, bordermode, 2 * itor->width, iy, height);
1063
1064 // Apply the kernel
1065 float pixel = 0.0f;
1066 for(int i = ytap_first; i < ytap_last; i++)
1067 {
1068 const int clip_y = clip(iy + i, 0, height - 1, bordermode);
1069 float h = 0.0f;
1070 for(int j = xtap_first; j < xtap_last; j++)
1071 {
1072 const int clip_x = clip(ix + j, 0, width - 1, bordermode);
1073 const float *ipixel = in + clip_y * linestride + clip_x;
1074 h += kernelh[j] * *ipixel;
1075 }
1076 pixel += kernelv[i] * h;
1077 }
1078
1079 *out = oonorm * pixel;
1080 }
1081 else
1082 {
1083 *out = 0.0f;
1084 }
1085 }
1086
dt_interpolation_compute_pixel1c(const struct dt_interpolation * itor,const float * in,float * out,const float x,const float y,const int width,const int height,const int linestride)1087 void dt_interpolation_compute_pixel1c(const struct dt_interpolation *itor, const float *in, float *out,
1088 const float x, const float y, const int width, const int height,
1089 const int linestride)
1090 {
1091 return dt_interpolation_compute_pixel1c_plain(itor, in, out, x, y, width, height, linestride);
1092 }
1093
1094 /* --------------------------------------------------------------------------
1095 * Interpolation factory
1096 * ------------------------------------------------------------------------*/
1097
dt_interpolation_new(enum dt_interpolation_type type)1098 const struct dt_interpolation *dt_interpolation_new(enum dt_interpolation_type type)
1099 {
1100 const struct dt_interpolation *itor = NULL;
1101
1102 if(type == DT_INTERPOLATION_USERPREF)
1103 {
1104 // Find user preferred interpolation method
1105 gchar *uipref = dt_conf_get_string("plugins/lighttable/export/pixel_interpolator");
1106 for(int i = DT_INTERPOLATION_FIRST; uipref && i < DT_INTERPOLATION_LAST; i++)
1107 {
1108 if(!strcmp(uipref, dt_interpolator[i].name))
1109 {
1110 // Found the one
1111 itor = &dt_interpolator[i];
1112 break;
1113 }
1114 }
1115 g_free(uipref);
1116
1117 /* In the case the search failed (!uipref or name not found),
1118 * prepare later search pass with default fallback */
1119 type = DT_INTERPOLATION_DEFAULT;
1120 }
1121 else if(type == DT_INTERPOLATION_USERPREF_WARP)
1122 {
1123 // Find user preferred interpolation method
1124 gchar *uipref = dt_conf_get_string("plugins/lighttable/export/pixel_interpolator_warp");
1125 for(int i = DT_INTERPOLATION_FIRST; uipref && i < DT_INTERPOLATION_LAST; i++)
1126 {
1127 if(!strcmp(uipref, dt_interpolator[i].name))
1128 {
1129 // Found the one
1130 itor = &dt_interpolator[i];
1131 break;
1132 }
1133 }
1134 g_free(uipref);
1135
1136 /* In the case the search failed (!uipref or name not found),
1137 * prepare later search pass with default fallback */
1138 type = DT_INTERPOLATION_DEFAULT_WARP;
1139 }
1140 if(!itor)
1141 {
1142 // Did not find the userpref one or we've been asked for a specific one
1143 for(int i = DT_INTERPOLATION_FIRST; i < DT_INTERPOLATION_LAST; i++)
1144 {
1145 if(dt_interpolator[i].id == type)
1146 {
1147 itor = &dt_interpolator[i];
1148 break;
1149 }
1150 if(dt_interpolator[i].id == DT_INTERPOLATION_DEFAULT)
1151 {
1152 itor = &dt_interpolator[i];
1153 }
1154 }
1155 }
1156
1157 return itor;
1158 }
1159
1160 /* --------------------------------------------------------------------------
1161 * Image resampling
1162 * ------------------------------------------------------------------------*/
1163
1164 /** Prepares a 1D resampling plan
1165 *
1166 * This consists of the following information
1167 * <ul>
1168 * <li>A list of lengths that tell how many pixels are relevant for the
1169 * next output</li>
1170 * <li>A list of required filter kernels</li>
1171 * <li>A list of sample indexes</li>
1172 * </ul>
1173 *
1174 * How to apply the resampling plan:
1175 * <ol>
1176 * <li>Pick a length from the length array</li>
1177 * <li>until length is reached
1178 * <ol>
1179 * <li>pick a kernel tap></li>
1180 * <li>pick the relevant sample according to the picked index</li>
1181 * <li>multiply them and accumulate</li>
1182 * </ol>
1183 * </li>
1184 * <li>here goes a single output sample</li>
1185 * </ol>
1186 *
1187 * This until you reach the number of output pixels
1188 *
1189 * @param itor interpolator used to resample
1190 * @param in [in] Number of input samples
1191 * @param out [in] Number of output samples
1192 * @param plength [out] Array of lengths for each pixel filtering (number
1193 * of taps/indexes to use). This array mus be freed with dt_free_align() when you're
1194 * done with the plan.
1195 * @param pkernel [out] Array of filter kernel taps
1196 * @param pindex [out] Array of sample indexes to be used for applying each kernel tap
1197 * arrays of information
1198 * @param pmeta [out] Array of int triplets (length, kernel, index) telling where to start for an arbitrary
1199 * out position meta[3*out]
1200 * @return 0 for success, !0 for failure
1201 */
prepare_resampling_plan(const struct dt_interpolation * itor,int in,const int in_x0,int out,const int out_x0,float scale,int ** plength,float ** pkernel,int ** pindex,int ** pmeta)1202 static int prepare_resampling_plan(const struct dt_interpolation *itor, int in, const int in_x0, int out,
1203 const int out_x0, float scale, int **plength, float **pkernel,
1204 int **pindex, int **pmeta)
1205 {
1206 // Safe return values
1207 *plength = NULL;
1208 *pkernel = NULL;
1209 *pindex = NULL;
1210 if(pmeta)
1211 {
1212 *pmeta = NULL;
1213 }
1214
1215 if(scale == 1.f)
1216 {
1217 // No resampling required
1218 return 0;
1219 }
1220
1221 // Compute common upsampling/downsampling memory requirements
1222 int maxtapsapixel;
1223 if(scale > 1.f)
1224 {
1225 // Upscale... the easy one. The values are exact
1226 maxtapsapixel = 2 * itor->width;
1227 }
1228 else
1229 {
1230 // Downscale... going for worst case values memory wise
1231 maxtapsapixel = ceil_fast((float)2 * (float)itor->width / scale);
1232 }
1233
1234 int nlengths = out;
1235 const int nindex = maxtapsapixel * out;
1236 const int nkernel = maxtapsapixel * out;
1237 const size_t lengthreq = increase_for_alignment(nlengths * sizeof(int), SSE_ALIGNMENT);
1238 const size_t indexreq = increase_for_alignment(nindex * sizeof(int), SSE_ALIGNMENT);
1239 const size_t kernelreq = increase_for_alignment(nkernel * sizeof(float), SSE_ALIGNMENT);
1240 const size_t scratchreq = maxtapsapixel * sizeof(float) + 4 * sizeof(float);
1241 // NB: because sse versions compute four taps a time
1242 const size_t metareq = pmeta ? 3 * sizeof(int) * out : 0;
1243
1244 void *blob = NULL;
1245 const size_t totalreq = kernelreq + lengthreq + indexreq + scratchreq + metareq;
1246 blob = dt_alloc_align(SSE_ALIGNMENT, totalreq);
1247 if(!blob)
1248 {
1249 return 1;
1250 }
1251
1252 int *lengths = (int *)blob;
1253 blob = (char *)blob + lengthreq;
1254 int *index = (int *)blob;
1255 blob = (char *)blob + indexreq;
1256 float *kernel = (float *)blob;
1257 blob = (char *)blob + kernelreq;
1258 float *scratchpad = scratchreq ? (float *)blob : NULL;
1259 blob = (char *)blob + scratchreq;
1260 int *meta = metareq ? (int *)blob : NULL;
1261 // blob = (char *)blob + metareq;
1262
1263 /* setting this as a const should help the compilers trim all unnecessary
1264 * codepaths */
1265 const enum border_mode bordermode = RESAMPLING_BORDER_MODE;
1266
1267 /* Upscale and downscale differ in subtle points, getting rid of code
1268 * duplication might have been tricky and i prefer keeping the code
1269 * as straight as possible */
1270 if(scale > 1.f)
1271 {
1272 int kidx = 0;
1273 int iidx = 0;
1274 int lidx = 0;
1275 int midx = 0;
1276 for(int x = 0; x < out; x++)
1277 {
1278 if(meta)
1279 {
1280 meta[midx++] = lidx;
1281 meta[midx++] = kidx;
1282 meta[midx++] = iidx;
1283 }
1284
1285 // Projected position in input samples
1286 float fx = (float)(out_x0 + x) / scale;
1287
1288 // Compute the filter kernel at that position
1289 int first;
1290 compute_upsampling_kernel(itor, scratchpad, NULL, &first, fx);
1291
1292 /* Check lower and higher bound pixel index and skip as many pixels as
1293 * necessary to fall into range */
1294 int tap_first;
1295 int tap_last;
1296 prepare_tap_boundaries(&tap_first, &tap_last, bordermode, 2 * itor->width, first, in);
1297
1298 // Track number of taps that will be used
1299 lengths[lidx++] = tap_last - tap_first;
1300
1301 // Precompute the inverse of the norm
1302 float norm = 0.f;
1303 for(int tap = tap_first; tap < tap_last; tap++)
1304 {
1305 norm += scratchpad[tap];
1306 }
1307 norm = 1.f / norm;
1308
1309 /* Unlike single pixel or single sample code, here it's interesting to
1310 * precompute the normalized filter kernel as this will avoid dividing
1311 * by the norm for all processed samples/pixels
1312 * NB: use the same loop to put in place the index list */
1313 first += tap_first;
1314 for(int tap = tap_first; tap < tap_last; tap++)
1315 {
1316 kernel[kidx++] = scratchpad[tap] * norm;
1317 index[iidx++] = clip(first++, 0, in - 1, bordermode);
1318 }
1319 }
1320 }
1321 else
1322 {
1323 int kidx = 0;
1324 int iidx = 0;
1325 int lidx = 0;
1326 int midx = 0;
1327 for(int x = 0; x < out; x++)
1328 {
1329 if(meta)
1330 {
1331 meta[midx++] = lidx;
1332 meta[midx++] = kidx;
1333 meta[midx++] = iidx;
1334 }
1335
1336 // Compute downsampling kernel centered on output position
1337 int taps;
1338 int first;
1339 compute_downsampling_kernel(itor, &taps, &first, scratchpad, NULL, scale, out_x0 + x);
1340
1341 /* Check lower and higher bound pixel index and skip as many pixels as
1342 * necessary to fall into range */
1343 int tap_first;
1344 int tap_last;
1345 prepare_tap_boundaries(&tap_first, &tap_last, bordermode, taps, first, in);
1346
1347 // Track number of taps that will be used
1348 lengths[lidx++] = tap_last - tap_first;
1349
1350 // Precompute the inverse of the norm
1351 float norm = 0.f;
1352 for(int tap = tap_first; tap < tap_last; tap++)
1353 {
1354 norm += scratchpad[tap];
1355 }
1356 norm = 1.f / norm;
1357
1358 /* Unlike single pixel or single sample code, here it's interesting to
1359 * precompute the normalized filter kernel as this will avoid dividing
1360 * by the norm for all processed samples/pixels
1361 * NB: use the same loop to put in place the index list */
1362 first += tap_first;
1363 for(int tap = tap_first; tap < tap_last; tap++)
1364 {
1365 kernel[kidx++] = scratchpad[tap] * norm;
1366 index[iidx++] = clip(first++, 0, in - 1, bordermode);
1367 }
1368 }
1369 }
1370
1371 // Validate plan wrt caller
1372 *plength = lengths;
1373 *pindex = index;
1374 *pkernel = kernel;
1375 if(pmeta)
1376 {
1377 *pmeta = meta;
1378 }
1379
1380 return 0;
1381 }
1382
dt_interpolation_resample_plain(const struct dt_interpolation * itor,float * out,const dt_iop_roi_t * const roi_out,const int32_t out_stride,const float * const in,const dt_iop_roi_t * const roi_in,const int32_t in_stride)1383 static void dt_interpolation_resample_plain(const struct dt_interpolation *itor, float *out,
1384 const dt_iop_roi_t *const roi_out, const int32_t out_stride,
1385 const float *const in, const dt_iop_roi_t *const roi_in,
1386 const int32_t in_stride)
1387 {
1388 int *hindex = NULL;
1389 int *hlength = NULL;
1390 float *hkernel = NULL;
1391 int *vindex = NULL;
1392 int *vlength = NULL;
1393 float *vkernel = NULL;
1394 int *vmeta = NULL;
1395
1396 const int32_t in_stride_floats = in_stride / sizeof(float);
1397 const int32_t out_stride_floats = out_stride / sizeof(float);
1398 int r;
1399
1400 debug_info("resampling %p (%dx%d@%dx%d scale %f) -> %p (%dx%d@%dx%d scale %f)\n", in, roi_in->width,
1401 roi_in->height, roi_in->x, roi_in->y, roi_in->scale, out, roi_out->width, roi_out->height,
1402 roi_out->x, roi_out->y, roi_out->scale);
1403
1404 // Fast code path for 1:1 copy, only cropping area can change
1405 if(roi_out->scale == 1.f)
1406 {
1407 const int x0 = roi_out->x * 4 * sizeof(float);
1408 #if DEBUG_RESAMPLING_TIMING
1409 int64_t ts_resampling = getts();
1410 #endif
1411 #ifdef _OPENMP
1412 #pragma omp parallel for default(none) \
1413 dt_omp_firstprivate(in, in_stride, out_stride, roi_out, x0) \
1414 shared(out)
1415 #endif
1416 for(int y = 0; y < roi_out->height; y++)
1417 {
1418 memcpy((char *)out + (size_t)out_stride * y,
1419 (char *)in + (size_t)in_stride * (y + roi_out->y) + x0,
1420 out_stride);
1421 }
1422 #if DEBUG_RESAMPLING_TIMING
1423 ts_resampling = getts() - ts_resampling;
1424 fprintf(stderr, "resampling %p plan:0us resampling:%" PRId64 "us\n", in, ts_resampling);
1425 #endif
1426 // All done, so easy case
1427 return;
1428 }
1429
1430 // Generic non 1:1 case... much more complicated :D
1431 #if DEBUG_RESAMPLING_TIMING
1432 int64_t ts_plan = getts();
1433 #endif
1434
1435 // Prepare resampling plans once and for all
1436 r = prepare_resampling_plan(itor, roi_in->width, roi_in->x, roi_out->width, roi_out->x, roi_out->scale,
1437 &hlength, &hkernel, &hindex, NULL);
1438 if(r)
1439 {
1440 goto exit;
1441 }
1442
1443 r = prepare_resampling_plan(itor, roi_in->height, roi_in->y, roi_out->height, roi_out->y, roi_out->scale,
1444 &vlength, &vkernel, &vindex, &vmeta);
1445 if(r)
1446 {
1447 goto exit;
1448 }
1449
1450 #if DEBUG_RESAMPLING_TIMING
1451 ts_plan = getts() - ts_plan;
1452 #endif
1453
1454 #if DEBUG_RESAMPLING_TIMING
1455 int64_t ts_resampling = getts();
1456 #endif
1457
1458 // Process each output line
1459 #ifdef _OPENMP
1460 #pragma omp parallel for default(none) \
1461 dt_omp_firstprivate(in, in_stride_floats, out_stride_floats, roi_out) \
1462 shared(out, hindex, hlength, hkernel, vindex, vlength, vkernel, vmeta)
1463 #endif
1464 for(int oy = 0; oy < roi_out->height; oy++)
1465 {
1466 // Initialize column resampling indexes
1467 int vlidx = vmeta[3 * oy + 0]; // V(ertical) L(ength) I(n)d(e)x
1468 int vkidx = vmeta[3 * oy + 1]; // V(ertical) K(ernel) I(n)d(e)x
1469 int viidx = vmeta[3 * oy + 2]; // V(ertical) I(ndex) I(n)d(e)x
1470
1471 // Initialize row resampling indexes
1472 int hlidx = 0; // H(orizontal) L(ength) I(n)d(e)x
1473 int hkidx = 0; // H(orizontal) K(ernel) I(n)d(e)x
1474 int hiidx = 0; // H(orizontal) I(ndex) I(n)d(e)x
1475
1476 // Number of lines contributing to the output line
1477 int vl = vlength[vlidx++]; // V(ertical) L(ength)
1478
1479 // Process each output column
1480 for(int ox = 0; ox < roi_out->width; ox++)
1481 {
1482 debug_extra("output %p [% 4d % 4d]\n", out, ox, oy);
1483
1484 // This will hold the resulting pixel
1485 DT_ALIGNED_PIXEL float vs[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
1486
1487 // Number of horizontal samples contributing to the output
1488 int hl = hlength[hlidx++]; // H(orizontal) L(ength)
1489
1490 for(int iy = 0; iy < vl; iy++)
1491 {
1492 // This is our input line
1493 size_t baseidx_vindex = (size_t)vindex[viidx++] * in_stride_floats;
1494
1495 DT_ALIGNED_PIXEL float vhs[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
1496
1497 for(int ix = 0; ix < hl; ix++)
1498 {
1499 // Apply the precomputed filter kernel
1500 const size_t baseidx = baseidx_vindex + (size_t)hindex[hiidx++] * 4;
1501 const float htap = hkernel[hkidx++];
1502 // Convince gcc 10 to vectorize
1503 DT_ALIGNED_PIXEL float tmp[4] = { in[baseidx], in[baseidx+1], in[baseidx+2], in[baseidx+3] };
1504 for_each_channel(c, aligned(tmp,vhs:16)) vhs[c] += tmp[c] * htap;
1505 }
1506
1507 // Accumulate contribution from this line
1508 const float vtap = vkernel[vkidx++];
1509 for_each_channel(c, aligned(vhs,vs:16)) vs[c] += vhs[c] * vtap;
1510
1511 // Reset horizontal resampling context
1512 hkidx -= hl;
1513 hiidx -= hl;
1514 }
1515
1516 // Output pixel is ready
1517 const size_t baseidx = (size_t)oy * out_stride_floats + (size_t)ox * 4;
1518
1519 // Clip negative RGB that may be produced by Lanczos undershooting
1520 // Negative RGB are invalid values no matter the RGB space (light is positive)
1521 for_each_channel(c, aligned(vs:16)) out[baseidx + c] = fmaxf(vs[c], 0.f);
1522
1523 // Reset vertical resampling context
1524 viidx -= vl;
1525 vkidx -= vl;
1526
1527 // Progress in horizontal context
1528 hiidx += hl;
1529 hkidx += hl;
1530 }
1531 }
1532
1533 #if DEBUG_RESAMPLING_TIMING
1534 ts_resampling = getts() - ts_resampling;
1535 fprintf(stderr, "resampling %p plan:%" PRId64 "us resampling:%" PRId64 "us\n", in, ts_plan, ts_resampling);
1536 #endif
1537
1538 exit:
1539 /* Free the resampling plans. It's nasty to optimize allocs like that, but
1540 * it simplifies the code :-D. The length array is in fact the only memory
1541 * allocated. */
1542 dt_free_align(hlength);
1543 dt_free_align(vlength);
1544 }
1545
1546 #if defined(__SSE2__)
dt_interpolation_resample_sse(const struct dt_interpolation * itor,float * out,const dt_iop_roi_t * const roi_out,const int32_t out_stride,const float * const in,const dt_iop_roi_t * const roi_in,const int32_t in_stride)1547 static void dt_interpolation_resample_sse(const struct dt_interpolation *itor, float *out,
1548 const dt_iop_roi_t *const roi_out, const int32_t out_stride,
1549 const float *const in, const dt_iop_roi_t *const roi_in,
1550 const int32_t in_stride)
1551 {
1552 int *hindex = NULL;
1553 int *hlength = NULL;
1554 float *hkernel = NULL;
1555 int *vindex = NULL;
1556 int *vlength = NULL;
1557 float *vkernel = NULL;
1558 int *vmeta = NULL;
1559
1560 int r;
1561
1562 debug_info("resampling %p (%dx%d@%dx%d scale %f) -> %p (%dx%d@%dx%d scale %f)\n", in, roi_in->width,
1563 roi_in->height, roi_in->x, roi_in->y, roi_in->scale, out, roi_out->width, roi_out->height,
1564 roi_out->x, roi_out->y, roi_out->scale);
1565
1566 // Fast code path for 1:1 copy, only cropping area can change
1567 if(roi_out->scale == 1.f)
1568 {
1569 const int x0 = roi_out->x * 4 * sizeof(float);
1570 #if DEBUG_RESAMPLING_TIMING
1571 int64_t ts_resampling = getts();
1572 #endif
1573 #ifdef _OPENMP
1574 #pragma omp parallel for default(none) \
1575 dt_omp_firstprivate(in, in_stride, out_stride, roi_out, x0) \
1576 shared(out)
1577 #endif
1578 for(int y = 0; y < roi_out->height; y++)
1579 {
1580 float *i = (float *)((char *)in + (size_t)in_stride * (y + roi_out->y) + x0);
1581 float *o = (float *)((char *)out + (size_t)out_stride * y);
1582 memcpy(o, i, out_stride);
1583 }
1584 #if DEBUG_RESAMPLING_TIMING
1585 ts_resampling = getts() - ts_resampling;
1586 fprintf(stderr, "resampling %p plan:0us resampling:%" PRId64 "us\n", in, ts_resampling);
1587 #endif
1588 // All done, so easy case
1589 return;
1590 }
1591
1592 // Generic non 1:1 case... much more complicated :D
1593 #if DEBUG_RESAMPLING_TIMING
1594 int64_t ts_plan = getts();
1595 #endif
1596
1597 // Prepare resampling plans once and for all
1598 r = prepare_resampling_plan(itor, roi_in->width, roi_in->x, roi_out->width, roi_out->x, roi_out->scale,
1599 &hlength, &hkernel, &hindex, NULL);
1600 if(r)
1601 {
1602 goto exit;
1603 }
1604
1605 r = prepare_resampling_plan(itor, roi_in->height, roi_in->y, roi_out->height, roi_out->y, roi_out->scale,
1606 &vlength, &vkernel, &vindex, &vmeta);
1607 if(r)
1608 {
1609 goto exit;
1610 }
1611
1612 #if DEBUG_RESAMPLING_TIMING
1613 ts_plan = getts() - ts_plan;
1614 #endif
1615
1616 #if DEBUG_RESAMPLING_TIMING
1617 int64_t ts_resampling = getts();
1618 #endif
1619
1620 // Process each output line
1621 #ifdef _OPENMP
1622 #pragma omp parallel for default(none) \
1623 dt_omp_firstprivate(in, in_stride, out_stride, roi_out) \
1624 shared(out, hindex, hlength, hkernel, vindex, vlength, vkernel, vmeta)
1625 #endif
1626 for(int oy = 0; oy < roi_out->height; oy++)
1627 {
1628 // Initialize column resampling indexes
1629 int vlidx = vmeta[3 * oy + 0]; // V(ertical) L(ength) I(n)d(e)x
1630 int vkidx = vmeta[3 * oy + 1]; // V(ertical) K(ernel) I(n)d(e)x
1631 int viidx = vmeta[3 * oy + 2]; // V(ertical) I(ndex) I(n)d(e)x
1632
1633 // Initialize row resampling indexes
1634 int hlidx = 0; // H(orizontal) L(ength) I(n)d(e)x
1635 int hkidx = 0; // H(orizontal) K(ernel) I(n)d(e)x
1636 int hiidx = 0; // H(orizontal) I(ndex) I(n)d(e)x
1637
1638 // Number of lines contributing to the output line
1639 int vl = vlength[vlidx++]; // V(ertical) L(ength)
1640
1641 // Process each output column
1642 for(int ox = 0; ox < roi_out->width; ox++)
1643 {
1644 debug_extra("output %p [% 4d % 4d]\n", out, ox, oy);
1645
1646 // This will hold the resulting pixel
1647 __m128 vs = _mm_setzero_ps();
1648
1649 // Number of horizontal samples contributing to the output
1650 const int hl = hlength[hlidx++]; // H(orizontal) L(ength)
1651
1652 for(int iy = 0; iy < vl; iy++)
1653 {
1654 // This is our input line
1655 const float *i = (float *)((char *)in + (size_t)in_stride * vindex[viidx++]);
1656
1657 __m128 vhs = _mm_setzero_ps();
1658
1659 for(int ix = 0; ix < hl; ix++)
1660 {
1661 // Apply the precomputed filter kernel
1662 const size_t baseidx = (size_t)hindex[hiidx++] * 4;
1663 const float htap = hkernel[hkidx++];
1664 const __m128 vhtap = _mm_set_ps1(htap);
1665 vhs = _mm_add_ps(vhs, _mm_mul_ps(*(__m128 *)&i[baseidx], vhtap));
1666 }
1667
1668 // Accumulate contribution from this line
1669 const float vtap = vkernel[vkidx++];
1670 const __m128 vvtap = _mm_set_ps1(vtap);
1671 vs = _mm_add_ps(vs, _mm_mul_ps(vhs, vvtap));
1672
1673 // Reset horizontal resampling context
1674 hkidx -= hl;
1675 hiidx -= hl;
1676 }
1677
1678 // Output pixel is ready
1679 float *o = (float *)((char *)out + (size_t)oy * out_stride + (size_t)ox * 4 * sizeof(float));
1680
1681 // Clip negative RGB that may be produced by Lanczos undershooting
1682 // Negative RGB are invalid values no matter the RGB space (light is positive)
1683 vs = _mm_max_ps(vs, _mm_setzero_ps());
1684 _mm_stream_ps(o, vs);
1685
1686 // Reset vertical resampling context
1687 viidx -= vl;
1688 vkidx -= vl;
1689
1690 // Progress in horizontal context
1691 hiidx += hl;
1692 hkidx += hl;
1693 }
1694 }
1695
1696 _mm_sfence();
1697
1698 #if DEBUG_RESAMPLING_TIMING
1699 ts_resampling = getts() - ts_resampling;
1700 fprintf(stderr, "resampling %p plan:%" PRId64 "us resampling:%" PRId64 "us\n", in, ts_plan, ts_resampling);
1701 #endif
1702
1703 exit:
1704 /* Free the resampling plans. It's nasty to optimize allocs like that, but
1705 * it simplifies the code :-D. The length array is in fact the only memory
1706 * allocated. */
1707 dt_free_align(hlength);
1708 dt_free_align(vlength);
1709 }
1710 #endif
1711
1712 /** Applies resampling (re-scaling) on *full* input and output buffers.
1713 * roi_in and roi_out define the part of the buffers that is affected.
1714 */
dt_interpolation_resample(const struct dt_interpolation * itor,float * out,const dt_iop_roi_t * const roi_out,const int32_t out_stride,const float * const in,const dt_iop_roi_t * const roi_in,const int32_t in_stride)1715 void dt_interpolation_resample(const struct dt_interpolation *itor, float *out,
1716 const dt_iop_roi_t *const roi_out, const int32_t out_stride,
1717 const float *const in, const dt_iop_roi_t *const roi_in,
1718 const int32_t in_stride)
1719 {
1720 if(darktable.codepath.OPENMP_SIMD)
1721 return dt_interpolation_resample_plain(itor, out, roi_out, out_stride, in, roi_in, in_stride);
1722 #if defined(__SSE2__)
1723 else if(darktable.codepath.SSE2)
1724 return dt_interpolation_resample_sse(itor, out, roi_out, out_stride, in, roi_in, in_stride);
1725 #endif
1726 else
1727 dt_unreachable_codepath();
1728 }
1729
1730 /** Applies resampling (re-scaling) on a specific region-of-interest of an image. The input
1731 * and output buffers hold exactly those roi's. roi_in and roi_out define the relative
1732 * positions of the roi's within the full input and output image, respectively.
1733 */
dt_interpolation_resample_roi(const struct dt_interpolation * itor,float * out,const dt_iop_roi_t * const roi_out,const int32_t out_stride,const float * const in,const dt_iop_roi_t * const roi_in,const int32_t in_stride)1734 void dt_interpolation_resample_roi(const struct dt_interpolation *itor, float *out,
1735 const dt_iop_roi_t *const roi_out, const int32_t out_stride,
1736 const float *const in, const dt_iop_roi_t *const roi_in,
1737 const int32_t in_stride)
1738 {
1739 dt_iop_roi_t oroi = *roi_out;
1740 oroi.x = oroi.y = 0;
1741
1742 dt_iop_roi_t iroi = *roi_in;
1743 iroi.x = iroi.y = 0;
1744
1745 dt_interpolation_resample(itor, out, &oroi, out_stride, in, &iroi, in_stride);
1746 }
1747
1748 #ifdef HAVE_OPENCL
dt_interpolation_init_cl_global()1749 dt_interpolation_cl_global_t *dt_interpolation_init_cl_global()
1750 {
1751 dt_interpolation_cl_global_t *g
1752 = (dt_interpolation_cl_global_t *)malloc(sizeof(dt_interpolation_cl_global_t));
1753
1754 const int program = 2; // basic.cl, from programs.conf
1755 g->kernel_interpolation_resample = dt_opencl_create_kernel(program, "interpolation_resample");
1756 return g;
1757 }
1758
dt_interpolation_free_cl_global(dt_interpolation_cl_global_t * g)1759 void dt_interpolation_free_cl_global(dt_interpolation_cl_global_t *g)
1760 {
1761 if(!g) return;
1762 // destroy kernels
1763 dt_opencl_free_kernel(g->kernel_interpolation_resample);
1764 free(g);
1765 }
1766
roundToNextPowerOfTwo(uint32_t x)1767 static uint32_t roundToNextPowerOfTwo(uint32_t x)
1768 {
1769 x--;
1770 x |= x >> 1;
1771 x |= x >> 2;
1772 x |= x >> 4;
1773 x |= x >> 8;
1774 x |= x >> 16;
1775 x++;
1776 return x;
1777 }
1778
1779 /** Applies resampling (re-scaling) on *full* input and output buffers.
1780 * roi_in and roi_out define the part of the buffers that is affected.
1781 */
dt_interpolation_resample_cl(const struct dt_interpolation * itor,int devid,cl_mem dev_out,const dt_iop_roi_t * const roi_out,cl_mem dev_in,const dt_iop_roi_t * const roi_in)1782 int dt_interpolation_resample_cl(const struct dt_interpolation *itor, int devid, cl_mem dev_out,
1783 const dt_iop_roi_t *const roi_out, cl_mem dev_in,
1784 const dt_iop_roi_t *const roi_in)
1785 {
1786 int *hindex = NULL;
1787 int *hlength = NULL;
1788 float *hkernel = NULL;
1789 int *hmeta = NULL;
1790 int *vindex = NULL;
1791 int *vlength = NULL;
1792 float *vkernel = NULL;
1793 int *vmeta = NULL;
1794
1795 int r;
1796 cl_int err = -999;
1797
1798 cl_mem dev_hindex = NULL;
1799 cl_mem dev_hlength = NULL;
1800 cl_mem dev_hkernel = NULL;
1801 cl_mem dev_hmeta = NULL;
1802 cl_mem dev_vindex = NULL;
1803 cl_mem dev_vlength = NULL;
1804 cl_mem dev_vkernel = NULL;
1805 cl_mem dev_vmeta = NULL;
1806
1807 debug_info("resampling_cl %p (%dx%d@%dx%d scale %f) -> %p (%dx%d@%dx%d scale %f)\n", (void *)dev_in,
1808 roi_in->width, roi_in->height, roi_in->x, roi_in->y, roi_in->scale, (void *)dev_out,
1809 roi_out->width, roi_out->height, roi_out->x, roi_out->y, roi_out->scale);
1810
1811 // Fast code path for 1:1 copy, only cropping area can change
1812 if(roi_out->scale == 1.f)
1813 {
1814 #if DEBUG_RESAMPLING_TIMING
1815 int64_t ts_resampling = getts();
1816 #endif
1817 size_t iorigin[] = { roi_out->x, roi_out->y, 0 };
1818 size_t oorigin[] = { 0, 0, 0 };
1819 size_t region[] = { roi_out->width, roi_out->height, 1 };
1820
1821 // copy original input from dev_in -> dev_out as starting point
1822 err = dt_opencl_enqueue_copy_image(devid, dev_in, dev_out, iorigin, oorigin, region);
1823 if(err != CL_SUCCESS) goto error;
1824
1825 #if DEBUG_RESAMPLING_TIMING
1826 ts_resampling = getts() - ts_resampling;
1827 fprintf(stderr, "resampling_cl %p plan:0us resampling:%" PRId64 "us\n", (void *)dev_in, ts_resampling);
1828 #endif
1829 // All done, so easy case
1830 return CL_SUCCESS;
1831 }
1832
1833 // Generic non 1:1 case... much more complicated :D
1834 #if DEBUG_RESAMPLING_TIMING
1835 int64_t ts_plan = getts();
1836 #endif
1837
1838 // Prepare resampling plans once and for all
1839 r = prepare_resampling_plan(itor, roi_in->width, roi_in->x, roi_out->width, roi_out->x, roi_out->scale,
1840 &hlength, &hkernel, &hindex, &hmeta);
1841 if(r)
1842 {
1843 goto error;
1844 }
1845
1846 r = prepare_resampling_plan(itor, roi_in->height, roi_in->y, roi_out->height, roi_out->y, roi_out->scale,
1847 &vlength, &vkernel, &vindex, &vmeta);
1848 if(r)
1849 {
1850 goto error;
1851 }
1852
1853 int hmaxtaps = -1, vmaxtaps = -1;
1854 for(int k = 0; k < roi_out->width; k++) hmaxtaps = MAX(hmaxtaps, hlength[k]);
1855 for(int k = 0; k < roi_out->height; k++) vmaxtaps = MAX(vmaxtaps, vlength[k]);
1856
1857 #if DEBUG_RESAMPLING_TIMING
1858 ts_plan = getts() - ts_plan;
1859 #endif
1860
1861 #if DEBUG_RESAMPLING_TIMING
1862 int64_t ts_resampling = getts();
1863 #endif
1864
1865 // strategy: process image column-wise (local[0] = 1). For each row generate
1866 // a number of parallel work items each taking care of one horizontal convolution,
1867 // then sum over work items to do the vertical convolution
1868
1869 const int kernel = darktable.opencl->interpolation->kernel_interpolation_resample;
1870 const int width = roi_out->width;
1871 const int height = roi_out->height;
1872
1873 // make sure blocksize is not too large
1874 const int taps = roundToNextPowerOfTwo(vmaxtaps); // the number of work items per row rounded up to a power of 2
1875 // (for quick recursive reduction)
1876 int vblocksize;
1877
1878 dt_opencl_local_buffer_t locopt
1879 = (dt_opencl_local_buffer_t){ .xoffset = 0, .xfactor = 1, .yoffset = 0, .yfactor = 1,
1880 .cellsize = 4 * sizeof(float), .overhead = hmaxtaps * sizeof(float) + hmaxtaps * sizeof(int),
1881 .sizex = 1, .sizey = (1 << 16) * taps };
1882
1883 if(dt_opencl_local_buffer_opt(devid, kernel, &locopt))
1884 vblocksize = locopt.sizey;
1885 else
1886 vblocksize = 1;
1887
1888 if(vblocksize < taps)
1889 {
1890 // our strategy does not work: the vertical number of taps exceeds the vertical workgroupsize;
1891 // there is no point in continuing on the GPU - that would be way too slow; let's delegate the stuff to
1892 // the CPU then.
1893 dt_print(
1894 DT_DEBUG_OPENCL,
1895 "[opencl_resampling] resampling plan cannot efficiently be run on the GPU - fall back to CPU.\n");
1896 goto error;
1897 }
1898
1899 size_t sizes[3] = { ROUNDUPWD(width), ROUNDUP(height * taps, vblocksize), 1 };
1900 size_t local[3] = { 1, vblocksize, 1 };
1901
1902 // store resampling plan to device memory
1903 // hindex, vindex, hkernel, vkernel: (v|h)maxtaps might be too small, so store a bit more than needed
1904 dev_hindex = dt_opencl_copy_host_to_device_constant(devid, sizeof(int) * width * (hmaxtaps + 1), hindex);
1905 if(dev_hindex == NULL) goto error;
1906
1907 dev_hlength = dt_opencl_copy_host_to_device_constant(devid, sizeof(int) * width, hlength);
1908 if(dev_hlength == NULL) goto error;
1909
1910 dev_hkernel
1911 = dt_opencl_copy_host_to_device_constant(devid, sizeof(float) * width * (hmaxtaps + 1), hkernel);
1912 if(dev_hkernel == NULL) goto error;
1913
1914 dev_hmeta = dt_opencl_copy_host_to_device_constant(devid, sizeof(int) * width * 3, hmeta);
1915 if(dev_hmeta == NULL) goto error;
1916
1917 dev_vindex = dt_opencl_copy_host_to_device_constant(devid, sizeof(int) * height * (vmaxtaps + 1), vindex);
1918 if(dev_vindex == NULL) goto error;
1919
1920 dev_vlength = dt_opencl_copy_host_to_device_constant(devid, sizeof(int) * height, vlength);
1921 if(dev_vlength == NULL) goto error;
1922
1923 dev_vkernel
1924 = dt_opencl_copy_host_to_device_constant(devid, sizeof(float) * height * (vmaxtaps + 1), vkernel);
1925 if(dev_vkernel == NULL) goto error;
1926
1927 dev_vmeta = dt_opencl_copy_host_to_device_constant(devid, sizeof(int) * height * 3, vmeta);
1928 if(dev_vmeta == NULL) goto error;
1929
1930 dt_opencl_set_kernel_arg(devid, kernel, 0, sizeof(cl_mem), (void *)&dev_in);
1931 dt_opencl_set_kernel_arg(devid, kernel, 1, sizeof(cl_mem), (void *)&dev_out);
1932 dt_opencl_set_kernel_arg(devid, kernel, 2, sizeof(int), (void *)&width);
1933 dt_opencl_set_kernel_arg(devid, kernel, 3, sizeof(int), (void *)&height);
1934 dt_opencl_set_kernel_arg(devid, kernel, 4, sizeof(cl_mem), (void *)&dev_hmeta);
1935 dt_opencl_set_kernel_arg(devid, kernel, 5, sizeof(cl_mem), (void *)&dev_vmeta);
1936 dt_opencl_set_kernel_arg(devid, kernel, 6, sizeof(cl_mem), (void *)&dev_hlength);
1937 dt_opencl_set_kernel_arg(devid, kernel, 7, sizeof(cl_mem), (void *)&dev_vlength);
1938 dt_opencl_set_kernel_arg(devid, kernel, 8, sizeof(cl_mem), (void *)&dev_hindex);
1939 dt_opencl_set_kernel_arg(devid, kernel, 9, sizeof(cl_mem), (void *)&dev_vindex);
1940 dt_opencl_set_kernel_arg(devid, kernel, 10, sizeof(cl_mem), (void *)&dev_hkernel);
1941 dt_opencl_set_kernel_arg(devid, kernel, 11, sizeof(cl_mem), (void *)&dev_vkernel);
1942 dt_opencl_set_kernel_arg(devid, kernel, 12, sizeof(int), (void *)&hmaxtaps);
1943 dt_opencl_set_kernel_arg(devid, kernel, 13, sizeof(int), (void *)&taps);
1944 dt_opencl_set_kernel_arg(devid, kernel, 14, hmaxtaps * sizeof(float), NULL);
1945 dt_opencl_set_kernel_arg(devid, kernel, 15, hmaxtaps * sizeof(int), NULL);
1946 dt_opencl_set_kernel_arg(devid, kernel, 16, vblocksize * 4 * sizeof(float), NULL);
1947 err = dt_opencl_enqueue_kernel_2d_with_local(devid, kernel, sizes, local);
1948 if(err != CL_SUCCESS) goto error;
1949
1950 #if DEBUG_RESAMPLING_TIMING
1951 ts_resampling = getts() - ts_resampling;
1952 fprintf(stderr, "resampling_cl %p plan:%" PRId64 "us resampling:%" PRId64 "us\n", (void *)dev_in, ts_plan,
1953 ts_resampling);
1954 #endif
1955
1956 dt_opencl_release_mem_object(dev_hindex);
1957 dt_opencl_release_mem_object(dev_hlength);
1958 dt_opencl_release_mem_object(dev_hkernel);
1959 dt_opencl_release_mem_object(dev_hmeta);
1960 dt_opencl_release_mem_object(dev_vindex);
1961 dt_opencl_release_mem_object(dev_vlength);
1962 dt_opencl_release_mem_object(dev_vkernel);
1963 dt_opencl_release_mem_object(dev_vmeta);
1964 dt_free_align(hlength);
1965 dt_free_align(vlength);
1966 return CL_SUCCESS;
1967
1968 error:
1969 dt_opencl_release_mem_object(dev_hindex);
1970 dt_opencl_release_mem_object(dev_hlength);
1971 dt_opencl_release_mem_object(dev_hkernel);
1972 dt_opencl_release_mem_object(dev_hmeta);
1973 dt_opencl_release_mem_object(dev_vindex);
1974 dt_opencl_release_mem_object(dev_vlength);
1975 dt_opencl_release_mem_object(dev_vkernel);
1976 dt_opencl_release_mem_object(dev_vmeta);
1977 dt_free_align(hlength);
1978 dt_free_align(vlength);
1979 dt_print(DT_DEBUG_OPENCL, "[opencl_resampling] couldn't enqueue kernel! %d\n", err);
1980 return err;
1981 }
1982
1983 /** Applies resampling (re-scaling) on a specific region-of-interest of an image. The input
1984 * and output buffers hold exactly those roi's. roi_in and roi_out define the relative
1985 * positions of the roi's within the full input and output image, respectively.
1986 */
dt_interpolation_resample_roi_cl(const struct dt_interpolation * itor,int devid,cl_mem dev_out,const dt_iop_roi_t * const roi_out,cl_mem dev_in,const dt_iop_roi_t * const roi_in)1987 int dt_interpolation_resample_roi_cl(const struct dt_interpolation *itor, int devid, cl_mem dev_out,
1988 const dt_iop_roi_t *const roi_out, cl_mem dev_in,
1989 const dt_iop_roi_t *const roi_in)
1990 {
1991 dt_iop_roi_t oroi = *roi_out;
1992 oroi.x = oroi.y = 0;
1993
1994 dt_iop_roi_t iroi = *roi_in;
1995 iroi.x = iroi.y = 0;
1996
1997 return dt_interpolation_resample_cl(itor, devid, dev_out, &oroi, dev_in, &iroi);
1998 }
1999 #endif
2000
dt_interpolation_resample_1c_plain(const struct dt_interpolation * itor,float * out,const dt_iop_roi_t * const roi_out,const int32_t out_stride,const float * const in,const dt_iop_roi_t * const roi_in,const int32_t in_stride)2001 static void dt_interpolation_resample_1c_plain(const struct dt_interpolation *itor, float *out,
2002 const dt_iop_roi_t *const roi_out, const int32_t out_stride,
2003 const float *const in, const dt_iop_roi_t *const roi_in,
2004 const int32_t in_stride)
2005 {
2006 int *hindex = NULL;
2007 int *hlength = NULL;
2008 float *hkernel = NULL;
2009 int *vindex = NULL;
2010 int *vlength = NULL;
2011 float *vkernel = NULL;
2012 int *vmeta = NULL;
2013
2014 int r;
2015
2016 debug_info("resampling %p (%dx%d@%dx%d scale %f) -> %p (%dx%d@%dx%d scale %f)\n", in, roi_in->width,
2017 roi_in->height, roi_in->x, roi_in->y, roi_in->scale, out, roi_out->width, roi_out->height,
2018 roi_out->x, roi_out->y, roi_out->scale);
2019
2020 // Fast code path for 1:1 copy, only cropping area can change
2021 if(roi_out->scale == 1.f)
2022 {
2023 const int x0 = roi_out->x * sizeof(float);
2024 #if DEBUG_RESAMPLING_TIMING
2025 int64_t ts_resampling = getts();
2026 #endif
2027 #ifdef _OPENMP
2028 #pragma omp parallel for default(none) \
2029 dt_omp_firstprivate(in, in_stride, out_stride, roi_out, x0) \
2030 shared(out)
2031 #endif
2032 for(int y = 0; y < roi_out->height; y++)
2033 {
2034 float *i = (float *)((char *)in + (size_t)in_stride * (y + roi_out->y) + x0);
2035 float *o = (float *)((char *)out + (size_t)out_stride * y);
2036 memcpy(o, i, out_stride);
2037 }
2038 #if DEBUG_RESAMPLING_TIMING
2039 ts_resampling = getts() - ts_resampling;
2040 fprintf(stderr, "resampling %p plan:0us resampling:%" PRId64 "us\n", in, ts_resampling);
2041 #endif
2042 // All done, so easy case
2043 return;
2044 }
2045
2046 // Generic non 1:1 case... much more complicated :D
2047 #if DEBUG_RESAMPLING_TIMING
2048 int64_t ts_plan = getts();
2049 #endif
2050
2051 // Prepare resampling plans once and for all
2052 r = prepare_resampling_plan(itor, roi_in->width, roi_in->x, roi_out->width, roi_out->x, roi_out->scale,
2053 &hlength, &hkernel, &hindex, NULL);
2054 if(r)
2055 {
2056 goto exit;
2057 }
2058
2059 r = prepare_resampling_plan(itor, roi_in->height, roi_in->y, roi_out->height, roi_out->y, roi_out->scale,
2060 &vlength, &vkernel, &vindex, &vmeta);
2061 if(r)
2062 {
2063 goto exit;
2064 }
2065
2066 #if DEBUG_RESAMPLING_TIMING
2067 ts_plan = getts() - ts_plan;
2068 #endif
2069
2070 #if DEBUG_RESAMPLING_TIMING
2071 int64_t ts_resampling = getts();
2072 #endif
2073
2074 // Process each output line
2075 #ifdef _OPENMP
2076 #pragma omp parallel for default(none) \
2077 dt_omp_firstprivate(in, in_stride, out_stride, roi_out) \
2078 shared(out, hindex, hlength, hkernel, vindex, vlength, vkernel, vmeta)
2079 #endif
2080 for(int oy = 0; oy < roi_out->height; oy++)
2081 {
2082 // Initialize column resampling indexes
2083 int vlidx = vmeta[3 * oy + 0]; // V(ertical) L(ength) I(n)d(e)x
2084 int vkidx = vmeta[3 * oy + 1]; // V(ertical) K(ernel) I(n)d(e)x
2085 int viidx = vmeta[3 * oy + 2]; // V(ertical) I(ndex) I(n)d(e)x
2086
2087 // Initialize row resampling indexes
2088 int hlidx = 0; // H(orizontal) L(ength) I(n)d(e)x
2089 int hkidx = 0; // H(orizontal) K(ernel) I(n)d(e)x
2090 int hiidx = 0; // H(orizontal) I(ndex) I(n)d(e)x
2091
2092 // Number of lines contributing to the output line
2093 int vl = vlength[vlidx++]; // V(ertical) L(ength)
2094
2095 // Process each output column
2096 for(int ox = 0; ox < roi_out->width; ox++)
2097 {
2098 debug_extra("output %p [% 4d % 4d]\n", out, ox, oy);
2099
2100 // This will hold the resulting pixel
2101 float vs = 0.0f;
2102
2103 // Number of horizontal samples contributing to the output
2104 const int hl = hlength[hlidx++]; // H(orizontal) L(ength)
2105
2106 for(int iy = 0; iy < vl; iy++)
2107 {
2108 // This is our input line
2109 const float *i = (float *)((char *)in + (size_t)in_stride * vindex[viidx++]);
2110
2111 float vhs = 0.0f;
2112
2113 for(int ix = 0; ix < hl; ix++)
2114 {
2115 // Apply the precomputed filter kernel
2116 const size_t baseidx = (size_t)hindex[hiidx++];
2117 const float htap = hkernel[hkidx++];
2118 vhs += i[baseidx] * htap;
2119 }
2120
2121 // Accumulate contribution from this line
2122 const float vtap = vkernel[vkidx++];
2123 vs += vhs * vtap;
2124
2125 // Reset horizontal resampling context
2126 hkidx -= hl;
2127 hiidx -= hl;
2128 }
2129
2130 // Output pixel is ready
2131 float *o = (float *)((char *)out + (size_t)oy * out_stride + (size_t)ox * sizeof(float));
2132 *o = vs;
2133
2134 // Reset vertical resampling context
2135 viidx -= vl;
2136 vkidx -= vl;
2137
2138 // Progress in horizontal context
2139 hiidx += hl;
2140 hkidx += hl;
2141 }
2142 }
2143
2144 #if DEBUG_RESAMPLING_TIMING
2145 ts_resampling = getts() - ts_resampling;
2146 fprintf(stderr, "resampling %p plan:%" PRId64 "us resampling:%" PRId64 "us\n", in, ts_plan, ts_resampling);
2147 #endif
2148
2149 exit:
2150 /* Free the resampling plans. It's nasty to optimize allocs like that, but
2151 * it simplifies the code :-D. The length array is in fact the only memory
2152 * allocated. */
2153 dt_free_align(hlength);
2154 dt_free_align(vlength);
2155 }
2156
2157 /** Applies resampling (re-scaling) on *full* input and output buffers.
2158 * roi_in and roi_out define the part of the buffers that is affected.
2159 */
dt_interpolation_resample_1c(const struct dt_interpolation * itor,float * out,const dt_iop_roi_t * const roi_out,const int32_t out_stride,const float * const in,const dt_iop_roi_t * const roi_in,const int32_t in_stride)2160 void dt_interpolation_resample_1c(const struct dt_interpolation *itor, float *out,
2161 const dt_iop_roi_t *const roi_out, const int32_t out_stride,
2162 const float *const in, const dt_iop_roi_t *const roi_in,
2163 const int32_t in_stride)
2164 {
2165 return dt_interpolation_resample_1c_plain(itor, out, roi_out, out_stride, in, roi_in, in_stride);
2166 }
2167
2168 /** Applies resampling (re-scaling) on a specific region-of-interest of an image. The input
2169 * and output buffers hold exactly those roi's. roi_in and roi_out define the relative
2170 * positions of the roi's within the full input and output image, respectively.
2171 */
dt_interpolation_resample_roi_1c(const struct dt_interpolation * itor,float * out,const dt_iop_roi_t * const roi_out,const int32_t out_stride,const float * const in,const dt_iop_roi_t * const roi_in,const int32_t in_stride)2172 void dt_interpolation_resample_roi_1c(const struct dt_interpolation *itor, float *out,
2173 const dt_iop_roi_t *const roi_out, const int32_t out_stride,
2174 const float *const in, const dt_iop_roi_t *const roi_in,
2175 const int32_t in_stride)
2176 {
2177 dt_iop_roi_t oroi = *roi_out;
2178 oroi.x = oroi.y = 0;
2179
2180 dt_iop_roi_t iroi = *roi_in;
2181 iroi.x = iroi.y = 0;
2182
2183 dt_interpolation_resample_1c(itor, out, &oroi, out_stride, in, &iroi, in_stride);
2184 }
2185
2186 // modelines: These editor modelines have been set for all relevant files by tools/update_modelines.sh
2187 // vim: shiftwidth=2 expandtab tabstop=2 cindent
2188 // kate: tab-indents: off; indent-width 2; replace-tabs on; indent-mode cstyle; remove-trailing-spaces modified;
2189