1 /* --------------------------------------------------------------------------
2 This file is part of darktable,
3 Copyright (C) 2012-2021 darktable developers.
4
5 darktable is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3 of the License, or
8 (at your option) any later version.
9
10 darktable is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with darktable. If not, see <http://www.gnu.org/licenses/>.
17 * ------------------------------------------------------------------------*/
18
19 #include "common/interpolation.h"
20 #include "common/darktable.h"
21 #include "common/math.h"
22 #include "control/conf.h"
23
24 #include <assert.h>
25 #include <glib.h>
26 #include <inttypes.h>
27 #include <stddef.h>
28 #include <stdint.h>
29
30 /** Border extrapolation modes */
31 enum border_mode
32 {
33 BORDER_REPLICATE, // aaaa|abcdefg|gggg
34 BORDER_WRAP, // defg|abcdefg|abcd
35 BORDER_MIRROR, // edcb|abcdefg|fedc
36 BORDER_CLAMP // ....|abcdefg|....
37 };
38
39 /* Supporting them all might be overkill, let the compiler trim all
40 * unnecessary modes in clip for resampling codepath*/
41 #define RESAMPLING_BORDER_MODE BORDER_REPLICATE
42
43 /* Supporting them all might be overkill, let the compiler trim all
44 * unnecessary modes in interpolation codepath */
45 #define INTERPOLATION_BORDER_MODE BORDER_MIRROR
46
47 // Defines minimum alignment requirement for critical SIMD code
48 #define SSE_ALIGNMENT 64
49
50 // Defines the maximum kernel half length
51 // !! Make sure to sync this with the filter array !!
52 #define MAX_HALF_FILTER_WIDTH 3
53
54 // Add code for timing resampling function
55 #define DEBUG_RESAMPLING_TIMING 0
56
57 // Add debug info messages to stderr
58 #define DEBUG_PRINT_INFO 0
59
60 // Add *verbose* (like one msg per pixel out) debug message to stderr
61 #define DEBUG_PRINT_VERBOSE 0
62
63 /* --------------------------------------------------------------------------
64 * Debug helpers
65 * ------------------------------------------------------------------------*/
66
67 #if DEBUG_RESAMPLING_TIMING
68 #include <sys/time.h>
69 #endif
70
71 #if DEBUG_PRINT_INFO
72 #define debug_info(...) \
73 do \
74 { \
75 fprintf(stderr, __VA_ARGS__); \
76 } while(0)
77 #else
78 #define debug_info(...)
79 #endif
80
81 #if DEBUG_PRINT_VERBOSE
82 #define debug_extra(...) \
83 do \
84 { \
85 fprintf(stderr, __VA_ARGS__); \
86 } while(0)
87 #else
88 #define debug_extra(...)
89 #endif
90
91 #if DEBUG_RESAMPLING_TIMING
getts()92 static inline int64_t getts()
93 {
94 struct timeval t;
95 gettimeofday(&t, NULL);
96 return t.tv_sec * INT64_C(1000000) + t.tv_usec;
97 }
98 #endif
99
100 /* --------------------------------------------------------------------------
101 * Generic helpers
102 * ------------------------------------------------------------------------*/
103
104 /** Clip into specified range
105 * @param idx index to filter
106 * @param length length of line
107 */
clip(int i,int min,int max,enum border_mode mode)108 static inline int clip(int i, int min, int max, enum border_mode mode)
109 {
110 switch(mode)
111 {
112 case BORDER_REPLICATE:
113 if(i < min)
114 {
115 i = min;
116 }
117 else if(i > max)
118 {
119 i = max;
120 }
121 break;
122 case BORDER_MIRROR:
123 if(i < min)
124 {
125 i = min - i;
126 }
127 else if(i > max)
128 {
129 i = 2 * max - i;
130 }
131 break;
132 case BORDER_WRAP:
133 if(i < min)
134 {
135 i = max - (min - i);
136 }
137 else if(i > max)
138 {
139 i = min + (i - max);
140 }
141 break;
142 case BORDER_CLAMP:
143 if(i < min || i > max)
144 {
145 /* Should not be used as is, we prevent -1 usage, filtering the taps
146 * we clip the sample indexes for. So understand this function is
147 * specific to its caller. */
148 i = -1;
149 }
150 break;
151 }
152
153 return i;
154 }
155
prepare_tap_boundaries(int * tap_first,int * tap_last,const enum border_mode mode,const int filterwidth,const int t,const int max)156 static inline void prepare_tap_boundaries(int *tap_first, int *tap_last, const enum border_mode mode,
157 const int filterwidth, const int t, const int max)
158 {
159 /* Check lower bound pixel index and skip as many pixels as necessary to
160 * fall into range */
161 *tap_first = 0;
162 if(mode == BORDER_CLAMP && t < 0)
163 {
164 *tap_first = -t;
165 }
166
167 // Same for upper bound pixel
168 *tap_last = filterwidth;
169 if(mode == BORDER_CLAMP && t + filterwidth >= max)
170 {
171 *tap_last = max - t;
172 }
173 }
174
175 /** Make sure an aligned chunk will not misalign its following chunk
176 * proposing an adapted length
177 *
178 * @param l Length required for current chunk
179 * @param align Required alignment for next chunk
180 *
181 * @return Required length for keeping alignment ok if chaining data chunks
182 */
increase_for_alignment(size_t l,size_t align)183 static inline size_t increase_for_alignment(size_t l, size_t align)
184 {
185 align -= 1;
186 return (l + align) & (~align);
187 }
188
189 /* --------------------------------------------------------------------------
190 * Interpolation kernels
191 * ------------------------------------------------------------------------*/
192
193 /* --------------------------------------------------------------------------
194 * Bilinear interpolation
195 * ------------------------------------------------------------------------*/
196
bilinear(float width,float t)197 static inline float bilinear(float width, float t)
198 {
199 float r;
200 t = fabsf(t);
201 if(t > 1.f)
202 {
203 r = 0.f;
204 }
205 else
206 {
207 r = 1.f - t;
208 }
209 return r;
210 }
211
212 #if defined(__SSE2__)
bilinear_sse(__m128 width,__m128 t)213 static inline __m128 bilinear_sse(__m128 width, __m128 t)
214 {
215 static const __m128 one = { 1.f, 1.f, 1.f, 1.f };
216 return _mm_sub_ps(one, _mm_abs_ps(t));
217 }
218 #endif
219
220 /* --------------------------------------------------------------------------
221 * Bicubic interpolation
222 * ------------------------------------------------------------------------*/
223
bicubic(float width,float t)224 static inline float bicubic(float width, float t)
225 {
226 float r;
227 t = fabsf(t);
228 if(t >= 2.f)
229 {
230 r = 0.f;
231 }
232 else if(t > 1.f && t < 2.f)
233 {
234 float t2 = t * t;
235 r = 0.5f * (t * (-t2 + 5.f * t - 8.f) + 4.f);
236 }
237 else
238 {
239 float t2 = t * t;
240 r = 0.5f * (t * (3.f * t2 - 5.f * t) + 2.f);
241 }
242 return r;
243 }
244
245 #if defined(__SSE2__)
bicubic_sse(__m128 width,__m128 t)246 static inline __m128 bicubic_sse(__m128 width, __m128 t)
247 {
248 static const __m128 half = { .5f, .5f, .5f, .5f };
249 static const __m128 one = { 1.f, 1.f, 1.f, 1.f };
250 static const __m128 two = { 2.f, 2.f, 2.f, 2.f };
251 static const __m128 three = { 3.f, 3.f, 3.f, 3.f };
252 static const __m128 four = { 4.f, 4.f, 4.f, 4.f };
253 static const __m128 five = { 5.f, 5.f, 5.f, 5.f };
254 static const __m128 eight = { 8.f, 8.f, 8.f, 8.f };
255
256 t = _mm_abs_ps(t);
257 const __m128 t2 = _mm_mul_ps(t, t);
258
259 /* Compute 1 < t < 2 case:
260 * 0.5f*(t*(-t2 + 5.f*t - 8.f) + 4.f)
261 * half*(t*(mt2 + t5 - eight) + four)
262 * half*(t*(mt2 + t5_sub_8) + four)
263 * half*(t*(mt2_add_t5_sub_8) + four) */
264 const __m128 t5 = _mm_mul_ps(five, t);
265 const __m128 t5_sub_8 = _mm_sub_ps(t5, eight);
266 const __m128 zero = _mm_setzero_ps();
267 const __m128 mt2 = _mm_sub_ps(zero, t2);
268 const __m128 mt2_add_t5_sub_8 = _mm_add_ps(mt2, t5_sub_8);
269 const __m128 a = _mm_mul_ps(t, mt2_add_t5_sub_8);
270 const __m128 b = _mm_add_ps(a, four);
271 __m128 r12 = _mm_mul_ps(b, half);
272
273 /* Compute case < 1
274 * 0.5f*(t*(3.f*t2 - 5.f*t) + 2.f) */
275 const __m128 t23 = _mm_mul_ps(three, t2);
276 const __m128 c = _mm_sub_ps(t23, t5);
277 const __m128 d = _mm_mul_ps(t, c);
278 const __m128 e = _mm_add_ps(d, two);
279 __m128 r01 = _mm_mul_ps(half, e);
280
281 // Compute masks fr keeping correct components
282 const __m128 mask01 = _mm_cmple_ps(t, one);
283 const __m128 mask12 = _mm_cmpgt_ps(t, one);
284 r01 = _mm_and_ps(mask01, r01);
285 r12 = _mm_and_ps(mask12, r12);
286
287
288 return _mm_or_ps(r01, r12);
289 }
290 #endif
291
292 /* --------------------------------------------------------------------------
293 * Lanczos interpolation
294 * ------------------------------------------------------------------------*/
295
296 #define DT_LANCZOS_EPSILON (1e-9f)
297
298 #if 0
299 // Reference version left here for ... documentation
300 static inline float
301 lanczos(float width, float t)
302 {
303 float r;
304
305 if (t<-width || t>width)
306 {
307 r = 0.f;
308 }
309 else if (t>-DT_LANCZOS_EPSILON && t<DT_LANCZOS_EPSILON)
310 {
311 r = 1.f;
312 }
313 else
314 {
315 r = width*sinf(M_PI*t)*sinf(M_PI*t/width)/(M_PI*M_PI*t*t);
316 }
317 return r;
318 }
319 #endif
320
321 /* Fast lanczos version, no calls to math.h functions, too accurate, too slow
322 *
323 * Based on a forum entry at
324 * http://devmaster.net/forums/topic/4648-fast-and-accurate-sinecosine/
325 *
326 * Apart the fast sine function approximation, the only trick is to compute:
327 * sin(pi.t) = sin(a.pi + r.pi) where t = a + r = trunc(t) + r
328 * = sin(a.pi).cos(r.pi) + sin(r.pi).cos(a.pi)
329 * = 0*cos(r.pi) + sin(r.pi).cos(a.pi)
330 * = sign.sin(r.pi) where sign = 1 if the a is even
331 * = -1 if the a is odd
332 *
333 * Of course we know that lanczos func will only be called for
334 * the range -width < t < width so we can additionally avoid the
335 * range check. */
336
lanczos(float width,float t)337 static inline float lanczos(float width, float t)
338 {
339 /* Compute a value for sinf(pi.t) in [-pi pi] for which the value will be
340 * correct */
341 int a = (int)t;
342 float r = t - (float)a;
343
344 // Compute the correct sign for sinf(pi.r)
345 union
346 {
347 float f;
348 uint32_t i;
349 } sign;
350 sign.i = ((a & 1) << 31) | 0x3f800000;
351
352 return (DT_LANCZOS_EPSILON + width * sign.f * sinf_fast(M_PI_F * r) * sinf_fast(M_PI_F * t / width))
353 / (DT_LANCZOS_EPSILON + M_PI_F * M_PI_F * t * t);
354 }
355
356 #if defined(__SSE2__)
lanczos_sse2(__m128 width,__m128 t)357 static inline __m128 lanczos_sse2(__m128 width, __m128 t)
358 {
359 /* Compute a value for sinf(pi.t) in [-pi pi] for which the value will be
360 * correct */
361 __m128i a = _mm_cvtps_epi32(t);
362 __m128 r = _mm_sub_ps(t, _mm_cvtepi32_ps(a));
363
364 // Compute the correct sign for sinf(pi.r)
365 static const uint32_t fone[] __attribute__((aligned(SSE_ALIGNMENT)))
366 = { 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 };
367 static const uint32_t ione[] __attribute__((aligned(SSE_ALIGNMENT))) = { 1, 1, 1, 1 };
368 static const __m128 eps
369 = { DT_LANCZOS_EPSILON, DT_LANCZOS_EPSILON, DT_LANCZOS_EPSILON, DT_LANCZOS_EPSILON };
370 static const __m128 pi = { M_PI, M_PI, M_PI, M_PI };
371 static const __m128 pi2 = { M_PI * M_PI, M_PI * M_PI, M_PI * M_PI, M_PI * M_PI };
372
373 __m128i isign = _mm_and_si128(*(__m128i *)ione, a);
374 isign = _mm_slli_epi64(isign, 31);
375 isign = _mm_or_si128(*(__m128i *)fone, isign);
376 const __m128 fsign = _mm_castsi128_ps(isign);
377
378 __m128 num = _mm_mul_ps(width, fsign);
379 num = _mm_mul_ps(num, sinf_fast_sse(_mm_mul_ps(pi, r)));
380 num = _mm_mul_ps(num, sinf_fast_sse(_mm_div_ps(_mm_mul_ps(pi, t), width)));
381 num = _mm_add_ps(eps, num);
382
383 __m128 den = _mm_mul_ps(pi2, _mm_mul_ps(t, t));
384 den = _mm_add_ps(eps, den);
385
386 return _mm_div_ps(num, den);
387 }
388 #endif
389
390 #undef DT_LANCZOS_EPSILON
391
392 /* --------------------------------------------------------------------------
393 * All our known interpolators
394 * ------------------------------------------------------------------------*/
395
396 /* !!! !!! !!!
397 * Make sure MAX_HALF_FILTER_WIDTH is at least equal to the maximum width
398 * of this filter list. Otherwise bad things will happen
399 * !!! !!! !!!
400 */
401 static const struct dt_interpolation dt_interpolator[] = {
402 {.id = DT_INTERPOLATION_BILINEAR,
403 .name = "bilinear",
404 .width = 1,
405 .func = &bilinear,
406 #if defined(__SSE2__)
407 .funcsse = &bilinear_sse
408 #endif
409 },
410 {.id = DT_INTERPOLATION_BICUBIC,
411 .name = "bicubic",
412 .width = 2,
413 .func = &bicubic,
414 #if defined(__SSE2__)
415 .funcsse = &bicubic_sse
416 #endif
417 },
418 {.id = DT_INTERPOLATION_LANCZOS2,
419 .name = "lanczos2",
420 .width = 2,
421 .func = &lanczos,
422 #if defined(__SSE2__)
423 .funcsse = &lanczos_sse2
424 #endif
425 },
426 {.id = DT_INTERPOLATION_LANCZOS3,
427 .name = "lanczos3",
428 .width = 3,
429 .func = &lanczos,
430 #if defined(__SSE2__)
431 .funcsse = &lanczos_sse2
432 #endif
433 },
434 };
435
436 /* --------------------------------------------------------------------------
437 * Kernel utility methods
438 * ------------------------------------------------------------------------*/
439
440 /** Computes an upsampling filtering kernel
441 *
442 * @param itor [in] Interpolator used
443 * @param kernel [out] resulting itor->width*2 filter taps
444 * @param norm [out] Kernel norm
445 * @param first [out] first input sample index used
446 * @param t [in] Interpolated coordinate */
compute_upsampling_kernel_plain(const struct dt_interpolation * itor,float * kernel,float * norm,int * first,float t)447 static inline void compute_upsampling_kernel_plain(const struct dt_interpolation *itor, float *kernel,
448 float *norm, int *first, float t)
449 {
450 int f = (int)t - itor->width + 1;
451 if(first)
452 {
453 *first = f;
454 }
455
456 /* Find closest integer position and then offset that to match first
457 * filtered sample position */
458 t = t - (float)f;
459
460 // Will hold kernel norm
461 float n = 0.f;
462
463 // Compute the raw kernel
464 for(int i = 0; i < 2 * itor->width; i++)
465 {
466 float tap = itor->func((float)itor->width, t);
467 n += tap;
468 kernel[i] = tap;
469 t -= 1.f;
470 }
471 if(norm)
472 {
473 *norm = n;
474 }
475 }
476
477 #if defined(__SSE2__)
478 /** Computes an upsampling filtering kernel (SSE version, four taps per inner loop)
479 *
480 * @param itor [in] Interpolator used
481 * @param kernel [out] resulting itor->width*2 filter taps (array must be at least (itor->width*2+3)/4*4
482 *floats long)
483 * @param norm [out] Kernel norm
484 * @param first [out] first input sample index used
485 * @param t [in] Interpolated coordinate
486 *
487 * @return kernel norm
488 */
compute_upsampling_kernel_sse(const struct dt_interpolation * itor,float * kernel,float * norm,int * first,float t)489 static inline void compute_upsampling_kernel_sse(const struct dt_interpolation *itor, float *kernel,
490 float *norm, int *first, float t)
491 {
492 int f = (int)t - itor->width + 1;
493 if(first)
494 {
495 *first = f;
496 }
497
498 /* Find closest integer position and then offset that to match first
499 * filtered sample position */
500 t = t - (float)f;
501
502 // Prepare t vector to compute four values a loop
503 static const __m128 bootstrap = { 0.f, -1.f, -2.f, -3.f };
504 static const __m128 iter = { -4.f, -4.f, -4.f, -4.f };
505 __m128 vt = _mm_add_ps(_mm_set_ps1(t), bootstrap);
506 __m128 vw = _mm_set_ps1((float)itor->width);
507
508 // Prepare counters (math kept stupid for understanding)
509 int i = 0;
510 const int runs = (2 * itor->width + 3) / 4;
511
512 while(i < runs)
513 {
514 // Compute the values
515 const __m128 vr = itor->funcsse(vw, vt);
516
517 // Save result
518 *(__m128 *)kernel = vr;
519
520 // Prepare next iteration
521 vt = _mm_add_ps(vt, iter);
522 kernel += 4;
523 i++;
524 }
525
526 // compute norm now
527 if(norm)
528 {
529 float n = 0.f;
530 i = 0;
531 kernel -= 4 * runs;
532 while(i < 2 * itor->width)
533 {
534 n += *kernel;
535 kernel++;
536 i++;
537 }
538 *norm = n;
539 }
540 }
541 #endif
542
compute_upsampling_kernel(const struct dt_interpolation * itor,float * kernel,float * norm,int * first,float t)543 static inline void compute_upsampling_kernel(const struct dt_interpolation *itor, float *kernel, float *norm,
544 int *first, float t)
545 {
546 if(darktable.codepath.OPENMP_SIMD) return compute_upsampling_kernel_plain(itor, kernel, norm, first, t);
547 #if defined(__SSE2__)
548 else if(darktable.codepath.SSE2)
549 return compute_upsampling_kernel_sse(itor, kernel, norm, first, t);
550 #endif
551 else
552 dt_unreachable_codepath();
553 }
554
555 /** Computes a downsampling filtering kernel
556 *
557 * @param itor [in] Interpolator used
558 * @param kernelsize [out] Number of taps
559 * @param kernel [out] resulting taps (at least itor->width/inoout elements for no overflow)
560 * @param norm [out] Kernel norm
561 * @param first [out] index of the first sample for which the kernel is to be applied
562 * @param outoinratio [in] "out samples" over "in samples" ratio
563 * @param xout [in] Output coordinate */
compute_downsampling_kernel_plain(const struct dt_interpolation * itor,int * taps,int * first,float * kernel,float * norm,float outoinratio,int xout)564 static inline void compute_downsampling_kernel_plain(const struct dt_interpolation *itor, int *taps,
565 int *first, float *kernel, float *norm,
566 float outoinratio, int xout)
567 {
568 // Keep this at hand
569 const float w = (float)itor->width;
570
571 /* Compute the phase difference between output pixel and its
572 * input corresponding input pixel */
573 const float xin = ceil_fast(((float)xout - w) / outoinratio);
574 if(first)
575 {
576 *first = (int)xin;
577 }
578
579 // Compute first interpolator parameter
580 float t = xin * outoinratio - (float)xout;
581
582 // Will hold kernel norm
583 float n = 0.f;
584
585 // Compute all filter taps
586 *taps = (int)((w - t) / outoinratio);
587 for(int i = 0; i < *taps; i++)
588 {
589 *kernel = itor->func(w, t);
590 n += *kernel;
591 t += outoinratio;
592 kernel++;
593 }
594
595 if(norm)
596 {
597 *norm = n;
598 }
599 }
600
601
602 #if defined(__SSE2__)
603 /** Computes a downsampling filtering kernel (SSE version, four taps per inner loop iteration)
604 *
605 * @param itor [in] Interpolator used
606 * @param kernelsize [out] Number of taps
607 * @param kernel [out] resulting taps (at least itor->width/inoout + 4 elements for no overflow)
608 * @param norm [out] Kernel norm
609 * @param first [out] index of the first sample for which the kernel is to be applied
610 * @param outoinratio [in] "out samples" over "in samples" ratio
611 * @param xout [in] Output coordinate */
compute_downsampling_kernel_sse(const struct dt_interpolation * itor,int * taps,int * first,float * kernel,float * norm,float outoinratio,int xout)612 static inline void compute_downsampling_kernel_sse(const struct dt_interpolation *itor, int *taps, int *first,
613 float *kernel, float *norm, float outoinratio, int xout)
614 {
615 // Keep this at hand
616 const float w = (float)itor->width;
617
618 /* Compute the phase difference between output pixel and its
619 * input corresponding input pixel */
620 const float xin = ceil_fast(((float)xout - w) / outoinratio);
621 if(first)
622 {
623 *first = (int)xin;
624 }
625
626 // Compute first interpolator parameter
627 float t = xin * outoinratio - (float)xout;
628
629 // Compute all filter taps
630 *taps = (int)((w - t) / outoinratio);
631
632 // Bootstrap vector t
633 static const __m128 bootstrap = { 0.f, 1.f, 2.f, 3.f };
634 const __m128 iter = _mm_set_ps1(4.f * outoinratio);
635 const __m128 vw = _mm_set_ps1(w);
636 __m128 vt = _mm_add_ps(_mm_set_ps1(t), _mm_mul_ps(_mm_set_ps1(outoinratio), bootstrap));
637
638 // Prepare counters (math kept stupid for understanding)
639 int i = 0;
640 const int runs = (*taps + 3) / 4;
641
642 while(i < runs)
643 {
644 // Compute the values
645 const __m128 vr = itor->funcsse(vw, vt);
646
647 // Save result
648 *(__m128 *)kernel = vr;
649
650 // Prepare next iteration
651 vt = _mm_add_ps(vt, iter);
652 kernel += 4;
653 i++;
654 }
655
656 // compute norm now
657 if(norm)
658 {
659 float n = 0.f;
660 i = 0;
661 kernel -= 4 * runs;
662 while(i < *taps)
663 {
664 n += *kernel;
665 kernel++;
666 i++;
667 }
668 *norm = n;
669 }
670 }
671 #endif
672
compute_downsampling_kernel(const struct dt_interpolation * itor,int * taps,int * first,float * kernel,float * norm,float outoinratio,int xout)673 static inline void compute_downsampling_kernel(const struct dt_interpolation *itor, int *taps, int *first,
674 float *kernel, float *norm, float outoinratio, int xout)
675 {
676 if(darktable.codepath.OPENMP_SIMD)
677 return compute_downsampling_kernel_plain(itor, taps, first, kernel, norm, outoinratio, xout);
678 #if defined(__SSE2__)
679 else if(darktable.codepath.SSE2)
680 return compute_downsampling_kernel_sse(itor, taps, first, kernel, norm, outoinratio, xout);
681 #endif
682 else
683 dt_unreachable_codepath();
684 }
685
686 /* --------------------------------------------------------------------------
687 * Sample interpolation function (see usage in iop/lens.c and iop/clipping.c)
688 * ------------------------------------------------------------------------*/
689
690 #define MAX_KERNEL_REQ ((2 * (MAX_HALF_FILTER_WIDTH) + 3) & (~3))
691
dt_interpolation_compute_sample(const struct dt_interpolation * itor,const float * in,const float x,const float y,const int width,const int height,const int samplestride,const int linestride)692 float dt_interpolation_compute_sample(const struct dt_interpolation *itor, const float *in, const float x,
693 const float y, const int width, const int height,
694 const int samplestride, const int linestride)
695 {
696 assert(itor->width < (MAX_HALF_FILTER_WIDTH + 1));
697
698 float kernelh[MAX_KERNEL_REQ] __attribute__((aligned(SSE_ALIGNMENT)));
699 float kernelv[MAX_KERNEL_REQ] __attribute__((aligned(SSE_ALIGNMENT)));
700
701 // Compute both horizontal and vertical kernels
702 float normh;
703 float normv;
704 compute_upsampling_kernel(itor, kernelh, &normh, NULL, x);
705 compute_upsampling_kernel(itor, kernelv, &normv, NULL, y);
706
707 int ix = (int)x;
708 int iy = (int)y;
709
710 /* Now 2 cases, the pixel + filter width goes outside the image
711 * in that case we have to use index clipping to keep all reads
712 * in the input image (slow path) or we are sure it won't fall
713 * outside and can do more simple code */
714 float r;
715 if(ix >= (itor->width - 1) && iy >= (itor->width - 1) && ix < (width - itor->width)
716 && iy < (height - itor->width))
717 {
718 // Inside image boundary case
719
720 // Go to top left pixel
721 in = (float *)in + linestride * iy + ix * samplestride;
722 in = in - (itor->width - 1) * (samplestride + linestride);
723
724 // Apply the kernel
725 float s = 0.f;
726 for(int i = 0; i < 2 * itor->width; i++)
727 {
728 float h = 0.0f;
729 for(int j = 0; j < 2 * itor->width; j++)
730 {
731 h += kernelh[j] * in[j * samplestride];
732 }
733 s += kernelv[i] * h;
734 in += linestride;
735 }
736 r = s / (normh * normv);
737 }
738 else if(ix >= 0 && iy >= 0 && ix < width && iy < height)
739 {
740 // At least a valid coordinate
741
742 // Point to the upper left pixel index wise
743 iy -= itor->width - 1;
744 ix -= itor->width - 1;
745
746 static const enum border_mode bordermode = INTERPOLATION_BORDER_MODE;
747 assert(bordermode != BORDER_CLAMP); // XXX in clamp mode, norms would be wrong
748
749 int xtap_first;
750 int xtap_last;
751 prepare_tap_boundaries(&xtap_first, &xtap_last, bordermode, 2 * itor->width, ix, width);
752
753 int ytap_first;
754 int ytap_last;
755 prepare_tap_boundaries(&ytap_first, &ytap_last, bordermode, 2 * itor->width, iy, height);
756
757 // Apply the kernel
758 float s = 0.f;
759 for(int i = ytap_first; i < ytap_last; i++)
760 {
761 const int clip_y = clip(iy + i, 0, height - 1, bordermode);
762 float h = 0.0f;
763 for(int j = xtap_first; j < xtap_last; j++)
764 {
765 const int clip_x = clip(ix + j, 0, width - 1, bordermode);
766 const float *ipixel = in + clip_y * linestride + clip_x * samplestride;
767 h += kernelh[j] * ipixel[0];
768 }
769 s += kernelv[i] * h;
770 }
771
772 r = s / (normh * normv);
773 }
774 else
775 {
776 // invalid coordinate
777 r = 0.0f;
778 }
779 return r;
780 }
781
782 /* --------------------------------------------------------------------------
783 * Pixel interpolation function (see usage in iop/lens.c and iop/clipping.c)
784 * ------------------------------------------------------------------------*/
785
dt_interpolation_compute_pixel4c_plain(const struct dt_interpolation * itor,const float * in,float * out,const float x,const float y,const int width,const int height,const int linestride)786 static void dt_interpolation_compute_pixel4c_plain(const struct dt_interpolation *itor, const float *in,
787 float *out, const float x, const float y, const int width,
788 const int height, const int linestride)
789 {
790 assert(itor->width < (MAX_HALF_FILTER_WIDTH + 1));
791
792 // Quite a bit of space for kernels
793 float kernelh[MAX_KERNEL_REQ] __attribute__((aligned(SSE_ALIGNMENT)));
794 float kernelv[MAX_KERNEL_REQ] __attribute__((aligned(SSE_ALIGNMENT)));
795
796 // Compute both horizontal and vertical kernels
797 float normh;
798 float normv;
799 compute_upsampling_kernel(itor, kernelh, &normh, NULL, x);
800 compute_upsampling_kernel(itor, kernelv, &normv, NULL, y);
801
802 // Precompute the inverse of the filter norm for later use
803 const float oonorm = (1.f / (normh * normv));
804
805 /* Now 2 cases, the pixel + filter width goes outside the image
806 * in that case we have to use index clipping to keep all reads
807 * in the input image (slow path) or we are sure it won't fall
808 * outside and can do more simple code */
809 int ix = (int)x;
810 int iy = (int)y;
811
812 if(ix >= (itor->width - 1) && iy >= (itor->width - 1) && ix < (width - itor->width)
813 && iy < (height - itor->width))
814 {
815 // Inside image boundary case
816
817 // Go to top left pixel
818 in = (float *)in + linestride * iy + ix * 4;
819 in = in - (itor->width - 1) * (4 + linestride);
820
821 // Apply the kernel
822 dt_aligned_pixel_t pixel = { 0.0f, 0.0f, 0.0f, 0.0f };
823 for(int i = 0; i < 2 * itor->width; i++)
824 {
825 dt_aligned_pixel_t h = { 0.0f, 0.0f, 0.0f, 0.0f };
826 for(int j = 0; j < 2 * itor->width; j++)
827 {
828 for(int c = 0; c < 3; c++) h[c] += kernelh[j] * in[j * 4 + c];
829 }
830 for(int c = 0; c < 3; c++) pixel[c] += kernelv[i] * h[c];
831 in += linestride;
832 }
833
834 for(int c = 0; c < 3; c++) out[c] = oonorm * pixel[c];
835 }
836 else if(ix >= 0 && iy >= 0 && ix < width && iy < height)
837 {
838 // At least a valid coordinate
839
840 // Point to the upper left pixel index wise
841 iy -= itor->width - 1;
842 ix -= itor->width - 1;
843
844 static const enum border_mode bordermode = INTERPOLATION_BORDER_MODE;
845 assert(bordermode != BORDER_CLAMP); // XXX in clamp mode, norms would be wrong
846
847 int xtap_first;
848 int xtap_last;
849 prepare_tap_boundaries(&xtap_first, &xtap_last, bordermode, 2 * itor->width, ix, width);
850
851 int ytap_first;
852 int ytap_last;
853 prepare_tap_boundaries(&ytap_first, &ytap_last, bordermode, 2 * itor->width, iy, height);
854
855 // Apply the kernel
856 dt_aligned_pixel_t pixel = { 0.0f, 0.0f, 0.0f, 0.0f };
857 for(int i = ytap_first; i < ytap_last; i++)
858 {
859 const int clip_y = clip(iy + i, 0, height - 1, bordermode);
860 dt_aligned_pixel_t h = { 0.0f, 0.0f, 0.0f, 0.0f };
861 for(int j = xtap_first; j < xtap_last; j++)
862 {
863 const int clip_x = clip(ix + j, 0, width - 1, bordermode);
864 const float *ipixel = in + clip_y * linestride + clip_x * 4;
865 for(int c = 0; c < 3; c++) h[c] += kernelh[j] * ipixel[c];
866 }
867 for(int c = 0; c < 3; c++) pixel[c] += kernelv[i] * h[c];
868 }
869
870 for(int c = 0; c < 3; c++) out[c] = oonorm * pixel[c];
871 }
872 else
873 {
874 for(int c = 0; c < 3; c++) out[c] = 0.0f;
875 }
876 }
877
878 #if defined(__SSE2__)
dt_interpolation_compute_pixel4c_sse(const struct dt_interpolation * itor,const float * in,float * out,const float x,const float y,const int width,const int height,const int linestride)879 static void dt_interpolation_compute_pixel4c_sse(const struct dt_interpolation *itor, const float *in,
880 float *out, const float x, const float y, const int width,
881 const int height, const int linestride)
882 {
883 assert(itor->width < (MAX_HALF_FILTER_WIDTH + 1));
884
885 // Quite a bit of space for kernels
886 float kernelh[MAX_KERNEL_REQ] __attribute__((aligned(SSE_ALIGNMENT)));
887 float kernelv[MAX_KERNEL_REQ] __attribute__((aligned(SSE_ALIGNMENT)));
888 __m128 vkernelh[2 * MAX_HALF_FILTER_WIDTH];
889 __m128 vkernelv[2 * MAX_HALF_FILTER_WIDTH];
890
891 // Compute both horizontal and vertical kernels
892 float normh;
893 float normv;
894 compute_upsampling_kernel(itor, kernelh, &normh, NULL, x);
895 compute_upsampling_kernel(itor, kernelv, &normv, NULL, y);
896
897 // We will process four components a time, duplicate the information
898 for(int i = 0; i < 2 * itor->width; i++)
899 {
900 vkernelh[i] = _mm_set_ps1(kernelh[i]);
901 vkernelv[i] = _mm_set_ps1(kernelv[i]);
902 }
903
904 // Precompute the inverse of the filter norm for later use
905 const __m128 oonorm = _mm_set_ps1(1.f / (normh * normv));
906
907 /* Now 2 cases, the pixel + filter width goes outside the image
908 * in that case we have to use index clipping to keep all reads
909 * in the input image (slow path) or we are sure it won't fall
910 * outside and can do more simple code */
911 int ix = (int)x;
912 int iy = (int)y;
913
914 if(ix >= (itor->width - 1) && iy >= (itor->width - 1) && ix < (width - itor->width)
915 && iy < (height - itor->width))
916 {
917 // Inside image boundary case
918
919 // Go to top left pixel
920 in = (float *)in + linestride * iy + ix * 4;
921 in = in - (itor->width - 1) * (4 + linestride);
922
923 // Apply the kernel
924 __m128 pixel = _mm_setzero_ps();
925 for(int i = 0; i < 2 * itor->width; i++)
926 {
927 __m128 h = _mm_setzero_ps();
928 for(int j = 0; j < 2 * itor->width; j++)
929 {
930 h = _mm_add_ps(h, _mm_mul_ps(vkernelh[j], *(__m128 *)&in[j * 4]));
931 }
932 pixel = _mm_add_ps(pixel, _mm_mul_ps(vkernelv[i], h));
933 in += linestride;
934 }
935
936 *(__m128 *)out = _mm_mul_ps(pixel, oonorm);
937 }
938 else if(ix >= 0 && iy >= 0 && ix < width && iy < height)
939 {
940 // At least a valid coordinate
941
942 // Point to the upper left pixel index wise
943 iy -= itor->width - 1;
944 ix -= itor->width - 1;
945
946 static const enum border_mode bordermode = INTERPOLATION_BORDER_MODE;
947 assert(bordermode != BORDER_CLAMP); // XXX in clamp mode, norms would be wrong
948
949 int xtap_first;
950 int xtap_last;
951 prepare_tap_boundaries(&xtap_first, &xtap_last, bordermode, 2 * itor->width, ix, width);
952
953 int ytap_first;
954 int ytap_last;
955 prepare_tap_boundaries(&ytap_first, &ytap_last, bordermode, 2 * itor->width, iy, height);
956
957 // Apply the kernel
958 __m128 pixel = _mm_setzero_ps();
959 for(int i = ytap_first; i < ytap_last; i++)
960 {
961 int clip_y = clip(iy + i, 0, height - 1, bordermode);
962 __m128 h = _mm_setzero_ps();
963 for(int j = xtap_first; j < xtap_last; j++)
964 {
965 const int clip_x = clip(ix + j, 0, width - 1, bordermode);
966 const float *ipixel = in + clip_y * linestride + clip_x * 4;
967 h = _mm_add_ps(h, _mm_mul_ps(vkernelh[j], *(__m128 *)ipixel));
968 }
969 pixel = _mm_add_ps(pixel, _mm_mul_ps(vkernelv[i], h));
970 }
971
972 *(__m128 *)out = _mm_mul_ps(pixel, oonorm);
973 }
974 else
975 {
976 *(__m128 *)out = _mm_set_ps1(0.0f);
977 }
978 }
979 #endif
980
dt_interpolation_compute_pixel4c(const struct dt_interpolation * itor,const float * in,float * out,const float x,const float y,const int width,const int height,const int linestride)981 void dt_interpolation_compute_pixel4c(const struct dt_interpolation *itor, const float *in, float *out,
982 const float x, const float y, const int width, const int height,
983 const int linestride)
984 {
985 if(darktable.codepath.OPENMP_SIMD)
986 return dt_interpolation_compute_pixel4c_plain(itor, in, out, x, y, width, height, linestride);
987 #if defined(__SSE2__)
988 else if(darktable.codepath.SSE2)
989 return dt_interpolation_compute_pixel4c_sse(itor, in, out, x, y, width, height, linestride);
990 #endif
991 else
992 dt_unreachable_codepath();
993 }
994
dt_interpolation_compute_pixel1c_plain(const struct dt_interpolation * itor,const float * in,float * out,const float x,const float y,const int width,const int height,const int linestride)995 static void dt_interpolation_compute_pixel1c_plain(const struct dt_interpolation *itor, const float *in,
996 float *out, const float x, const float y, const int width,
997 const int height, const int linestride)
998 {
999 assert(itor->width < (MAX_HALF_FILTER_WIDTH + 1));
1000
1001 // Quite a bit of space for kernels
1002 float kernelh[MAX_KERNEL_REQ] __attribute__((aligned(SSE_ALIGNMENT)));
1003 float kernelv[MAX_KERNEL_REQ] __attribute__((aligned(SSE_ALIGNMENT)));
1004
1005 // Compute both horizontal and vertical kernels
1006 float normh;
1007 float normv;
1008 compute_upsampling_kernel(itor, kernelh, &normh, NULL, x);
1009 compute_upsampling_kernel(itor, kernelv, &normv, NULL, y);
1010
1011 // Precompute the inverse of the filter norm for later use
1012 const float oonorm = (1.f / (normh * normv));
1013
1014 /* Now 2 cases, the pixel + filter width goes outside the image
1015 * in that case we have to use index clipping to keep all reads
1016 * in the input image (slow path) or we are sure it won't fall
1017 * outside and can do more simple code */
1018 int ix = (int)x;
1019 int iy = (int)y;
1020
1021 if(ix >= (itor->width - 1) && iy >= (itor->width - 1) && ix < (width - itor->width)
1022 && iy < (height - itor->width))
1023 {
1024 // Inside image boundary case
1025
1026 // Go to top left pixel
1027 in = (float *)in + linestride * iy + ix;
1028 in = in - (itor->width - 1) * (1 + linestride);
1029
1030 // Apply the kernel
1031 float pixel = 0.0f;
1032 for(int i = 0; i < 2 * itor->width; i++)
1033 {
1034 float h = 0.0f;
1035 for(int j = 0; j < 2 * itor->width; j++)
1036 {
1037 h += kernelh[j] * in[j];
1038 }
1039 pixel += kernelv[i] * h;
1040 in += linestride;
1041 }
1042
1043 *out = oonorm * pixel;
1044 }
1045 else if(ix >= 0 && iy >= 0 && ix < width && iy < height)
1046 {
1047 // At least a valid coordinate
1048
1049 // Point to the upper left pixel index wise
1050 iy -= itor->width - 1;
1051 ix -= itor->width - 1;
1052
1053 static const enum border_mode bordermode = INTERPOLATION_BORDER_MODE;
1054 assert(bordermode != BORDER_CLAMP); // XXX in clamp mode, norms would be wrong
1055
1056 int xtap_first;
1057 int xtap_last;
1058 prepare_tap_boundaries(&xtap_first, &xtap_last, bordermode, 2 * itor->width, ix, width);
1059
1060 int ytap_first;
1061 int ytap_last;
1062 prepare_tap_boundaries(&ytap_first, &ytap_last, bordermode, 2 * itor->width, iy, height);
1063
1064 // Apply the kernel
1065 float pixel = 0.0f;
1066 for(int i = ytap_first; i < ytap_last; i++)
1067 {
1068 const int clip_y = clip(iy + i, 0, height - 1, bordermode);
1069 float h = 0.0f;
1070 for(int j = xtap_first; j < xtap_last; j++)
1071 {
1072 const int clip_x = clip(ix + j, 0, width - 1, bordermode);
1073 const float *ipixel = in + clip_y * linestride + clip_x;
1074 h += kernelh[j] * *ipixel;
1075 }
1076 pixel += kernelv[i] * h;
1077 }
1078
1079 *out = oonorm * pixel;
1080 }
1081 else
1082 {
1083 *out = 0.0f;
1084 }
1085 }
1086
dt_interpolation_compute_pixel1c(const struct dt_interpolation * itor,const float * in,float * out,const float x,const float y,const int width,const int height,const int linestride)1087 void dt_interpolation_compute_pixel1c(const struct dt_interpolation *itor, const float *in, float *out,
1088 const float x, const float y, const int width, const int height,
1089 const int linestride)
1090 {
1091 return dt_interpolation_compute_pixel1c_plain(itor, in, out, x, y, width, height, linestride);
1092 }
1093
1094 /* --------------------------------------------------------------------------
1095 * Interpolation factory
1096 * ------------------------------------------------------------------------*/
1097
dt_interpolation_new(enum dt_interpolation_type type)1098 const struct dt_interpolation *dt_interpolation_new(enum dt_interpolation_type type)
1099 {
1100 const struct dt_interpolation *itor = NULL;
1101
1102 if(type == DT_INTERPOLATION_USERPREF)
1103 {
1104 // Find user preferred interpolation method
1105 const char *uipref = dt_conf_get_string_const("plugins/lighttable/export/pixel_interpolator");
1106 for(int i = DT_INTERPOLATION_FIRST; uipref && i < DT_INTERPOLATION_LAST; i++)
1107 {
1108 if(!strcmp(uipref, dt_interpolator[i].name))
1109 {
1110 // Found the one
1111 itor = &dt_interpolator[i];
1112 break;
1113 }
1114 }
1115
1116 /* In the case the search failed (!uipref or name not found),
1117 * prepare later search pass with default fallback */
1118 type = DT_INTERPOLATION_DEFAULT;
1119 }
1120 else if(type == DT_INTERPOLATION_USERPREF_WARP)
1121 {
1122 // Find user preferred interpolation method
1123 const char *uipref = dt_conf_get_string_const("plugins/lighttable/export/pixel_interpolator_warp");
1124 for(int i = DT_INTERPOLATION_FIRST; uipref && i < DT_INTERPOLATION_LAST; i++)
1125 {
1126 if(!strcmp(uipref, dt_interpolator[i].name))
1127 {
1128 // Found the one
1129 itor = &dt_interpolator[i];
1130 break;
1131 }
1132 }
1133
1134 /* In the case the search failed (!uipref or name not found),
1135 * prepare later search pass with default fallback */
1136 type = DT_INTERPOLATION_DEFAULT_WARP;
1137 }
1138 if(!itor)
1139 {
1140 // Did not find the userpref one or we've been asked for a specific one
1141 for(int i = DT_INTERPOLATION_FIRST; i < DT_INTERPOLATION_LAST; i++)
1142 {
1143 if(dt_interpolator[i].id == type)
1144 {
1145 itor = &dt_interpolator[i];
1146 break;
1147 }
1148 if(dt_interpolator[i].id == DT_INTERPOLATION_DEFAULT)
1149 {
1150 itor = &dt_interpolator[i];
1151 }
1152 }
1153 }
1154
1155 return itor;
1156 }
1157
1158 /* --------------------------------------------------------------------------
1159 * Image resampling
1160 * ------------------------------------------------------------------------*/
1161
1162 /** Prepares a 1D resampling plan
1163 *
1164 * This consists of the following information
1165 * <ul>
1166 * <li>A list of lengths that tell how many pixels are relevant for the
1167 * next output</li>
1168 * <li>A list of required filter kernels</li>
1169 * <li>A list of sample indexes</li>
1170 * </ul>
1171 *
1172 * How to apply the resampling plan:
1173 * <ol>
1174 * <li>Pick a length from the length array</li>
1175 * <li>until length is reached
1176 * <ol>
1177 * <li>pick a kernel tap></li>
1178 * <li>pick the relevant sample according to the picked index</li>
1179 * <li>multiply them and accumulate</li>
1180 * </ol>
1181 * </li>
1182 * <li>here goes a single output sample</li>
1183 * </ol>
1184 *
1185 * This until you reach the number of output pixels
1186 *
1187 * @param itor interpolator used to resample
1188 * @param in [in] Number of input samples
1189 * @param out [in] Number of output samples
1190 * @param plength [out] Array of lengths for each pixel filtering (number
1191 * of taps/indexes to use). This array mus be freed with dt_free_align() when you're
1192 * done with the plan.
1193 * @param pkernel [out] Array of filter kernel taps
1194 * @param pindex [out] Array of sample indexes to be used for applying each kernel tap
1195 * arrays of information
1196 * @param pmeta [out] Array of int triplets (length, kernel, index) telling where to start for an arbitrary
1197 * out position meta[3*out]
1198 * @return 0 for success, !0 for failure
1199 */
prepare_resampling_plan(const struct dt_interpolation * itor,int in,const int in_x0,int out,const int out_x0,float scale,int ** plength,float ** pkernel,int ** pindex,int ** pmeta)1200 static int prepare_resampling_plan(const struct dt_interpolation *itor, int in, const int in_x0, int out,
1201 const int out_x0, float scale, int **plength, float **pkernel,
1202 int **pindex, int **pmeta)
1203 {
1204 // Safe return values
1205 *plength = NULL;
1206 *pkernel = NULL;
1207 *pindex = NULL;
1208 if(pmeta)
1209 {
1210 *pmeta = NULL;
1211 }
1212
1213 if(scale == 1.f)
1214 {
1215 // No resampling required
1216 return 0;
1217 }
1218
1219 // Compute common upsampling/downsampling memory requirements
1220 int maxtapsapixel;
1221 if(scale > 1.f)
1222 {
1223 // Upscale... the easy one. The values are exact
1224 maxtapsapixel = 2 * itor->width;
1225 }
1226 else
1227 {
1228 // Downscale... going for worst case values memory wise
1229 maxtapsapixel = ceil_fast((float)2 * (float)itor->width / scale);
1230 }
1231
1232 int nlengths = out;
1233 const int nindex = maxtapsapixel * out;
1234 const int nkernel = maxtapsapixel * out;
1235 const size_t lengthreq = increase_for_alignment(nlengths * sizeof(int), SSE_ALIGNMENT);
1236 const size_t indexreq = increase_for_alignment(nindex * sizeof(int), SSE_ALIGNMENT);
1237 const size_t kernelreq = increase_for_alignment(nkernel * sizeof(float), SSE_ALIGNMENT);
1238 const size_t scratchreq = maxtapsapixel * sizeof(float) + 4 * sizeof(float);
1239 // NB: because sse versions compute four taps a time
1240 const size_t metareq = pmeta ? 3 * sizeof(int) * out : 0;
1241
1242 void *blob = NULL;
1243 const size_t totalreq = kernelreq + lengthreq + indexreq + scratchreq + metareq;
1244 blob = dt_alloc_align(SSE_ALIGNMENT, totalreq);
1245 if(!blob)
1246 {
1247 return 1;
1248 }
1249
1250 int *lengths = (int *)blob;
1251 blob = (char *)blob + lengthreq;
1252 int *index = (int *)blob;
1253 blob = (char *)blob + indexreq;
1254 float *kernel = (float *)blob;
1255 blob = (char *)blob + kernelreq;
1256 float *scratchpad = scratchreq ? (float *)blob : NULL;
1257 blob = (char *)blob + scratchreq;
1258 int *meta = metareq ? (int *)blob : NULL;
1259 // blob = (char *)blob + metareq;
1260
1261 /* setting this as a const should help the compilers trim all unnecessary
1262 * codepaths */
1263 const enum border_mode bordermode = RESAMPLING_BORDER_MODE;
1264
1265 /* Upscale and downscale differ in subtle points, getting rid of code
1266 * duplication might have been tricky and i prefer keeping the code
1267 * as straight as possible */
1268 if(scale > 1.f)
1269 {
1270 int kidx = 0;
1271 int iidx = 0;
1272 int lidx = 0;
1273 int midx = 0;
1274 for(int x = 0; x < out; x++)
1275 {
1276 if(meta)
1277 {
1278 meta[midx++] = lidx;
1279 meta[midx++] = kidx;
1280 meta[midx++] = iidx;
1281 }
1282
1283 // Projected position in input samples
1284 float fx = (float)(out_x0 + x) / scale;
1285
1286 // Compute the filter kernel at that position
1287 int first;
1288 compute_upsampling_kernel(itor, scratchpad, NULL, &first, fx);
1289
1290 /* Check lower and higher bound pixel index and skip as many pixels as
1291 * necessary to fall into range */
1292 int tap_first;
1293 int tap_last;
1294 prepare_tap_boundaries(&tap_first, &tap_last, bordermode, 2 * itor->width, first, in);
1295
1296 // Track number of taps that will be used
1297 lengths[lidx++] = tap_last - tap_first;
1298
1299 // Precompute the inverse of the norm
1300 float norm = 0.f;
1301 for(int tap = tap_first; tap < tap_last; tap++)
1302 {
1303 norm += scratchpad[tap];
1304 }
1305 norm = 1.f / norm;
1306
1307 /* Unlike single pixel or single sample code, here it's interesting to
1308 * precompute the normalized filter kernel as this will avoid dividing
1309 * by the norm for all processed samples/pixels
1310 * NB: use the same loop to put in place the index list */
1311 first += tap_first;
1312 for(int tap = tap_first; tap < tap_last; tap++)
1313 {
1314 kernel[kidx++] = scratchpad[tap] * norm;
1315 index[iidx++] = clip(first++, 0, in - 1, bordermode);
1316 }
1317 }
1318 }
1319 else
1320 {
1321 int kidx = 0;
1322 int iidx = 0;
1323 int lidx = 0;
1324 int midx = 0;
1325 for(int x = 0; x < out; x++)
1326 {
1327 if(meta)
1328 {
1329 meta[midx++] = lidx;
1330 meta[midx++] = kidx;
1331 meta[midx++] = iidx;
1332 }
1333
1334 // Compute downsampling kernel centered on output position
1335 int taps;
1336 int first;
1337 compute_downsampling_kernel(itor, &taps, &first, scratchpad, NULL, scale, out_x0 + x);
1338
1339 /* Check lower and higher bound pixel index and skip as many pixels as
1340 * necessary to fall into range */
1341 int tap_first;
1342 int tap_last;
1343 prepare_tap_boundaries(&tap_first, &tap_last, bordermode, taps, first, in);
1344
1345 // Track number of taps that will be used
1346 lengths[lidx++] = tap_last - tap_first;
1347
1348 // Precompute the inverse of the norm
1349 float norm = 0.f;
1350 for(int tap = tap_first; tap < tap_last; tap++)
1351 {
1352 norm += scratchpad[tap];
1353 }
1354 norm = 1.f / norm;
1355
1356 /* Unlike single pixel or single sample code, here it's interesting to
1357 * precompute the normalized filter kernel as this will avoid dividing
1358 * by the norm for all processed samples/pixels
1359 * NB: use the same loop to put in place the index list */
1360 first += tap_first;
1361 for(int tap = tap_first; tap < tap_last; tap++)
1362 {
1363 kernel[kidx++] = scratchpad[tap] * norm;
1364 index[iidx++] = clip(first++, 0, in - 1, bordermode);
1365 }
1366 }
1367 }
1368
1369 // Validate plan wrt caller
1370 *plength = lengths;
1371 *pindex = index;
1372 *pkernel = kernel;
1373 if(pmeta)
1374 {
1375 *pmeta = meta;
1376 }
1377
1378 return 0;
1379 }
1380
dt_interpolation_resample_plain(const struct dt_interpolation * itor,float * out,const dt_iop_roi_t * const roi_out,const int32_t out_stride,const float * const in,const dt_iop_roi_t * const roi_in,const int32_t in_stride)1381 static void dt_interpolation_resample_plain(const struct dt_interpolation *itor, float *out,
1382 const dt_iop_roi_t *const roi_out, const int32_t out_stride,
1383 const float *const in, const dt_iop_roi_t *const roi_in,
1384 const int32_t in_stride)
1385 {
1386 int *hindex = NULL;
1387 int *hlength = NULL;
1388 float *hkernel = NULL;
1389 int *vindex = NULL;
1390 int *vlength = NULL;
1391 float *vkernel = NULL;
1392 int *vmeta = NULL;
1393
1394 const int32_t in_stride_floats = in_stride / sizeof(float);
1395 const int32_t out_stride_floats = out_stride / sizeof(float);
1396 int r;
1397
1398 debug_info("resampling %p (%dx%d@%dx%d scale %f) -> %p (%dx%d@%dx%d scale %f)\n", in, roi_in->width,
1399 roi_in->height, roi_in->x, roi_in->y, roi_in->scale, out, roi_out->width, roi_out->height,
1400 roi_out->x, roi_out->y, roi_out->scale);
1401
1402 // Fast code path for 1:1 copy, only cropping area can change
1403 if(roi_out->scale == 1.f)
1404 {
1405 const int x0 = roi_out->x * 4 * sizeof(float);
1406 #if DEBUG_RESAMPLING_TIMING
1407 int64_t ts_resampling = getts();
1408 #endif
1409 #ifdef _OPENMP
1410 #pragma omp parallel for default(none) \
1411 dt_omp_firstprivate(in, in_stride, out_stride, roi_out, x0) \
1412 shared(out)
1413 #endif
1414 for(int y = 0; y < roi_out->height; y++)
1415 {
1416 memcpy((char *)out + (size_t)out_stride * y,
1417 (char *)in + (size_t)in_stride * (y + roi_out->y) + x0,
1418 out_stride);
1419 }
1420 #if DEBUG_RESAMPLING_TIMING
1421 ts_resampling = getts() - ts_resampling;
1422 fprintf(stderr, "resampling %p plan:0us resampling:%" PRId64 "us\n", in, ts_resampling);
1423 #endif
1424 // All done, so easy case
1425 return;
1426 }
1427
1428 // Generic non 1:1 case... much more complicated :D
1429 #if DEBUG_RESAMPLING_TIMING
1430 int64_t ts_plan = getts();
1431 #endif
1432
1433 // Prepare resampling plans once and for all
1434 r = prepare_resampling_plan(itor, roi_in->width, roi_in->x, roi_out->width, roi_out->x, roi_out->scale,
1435 &hlength, &hkernel, &hindex, NULL);
1436 if(r)
1437 {
1438 goto exit;
1439 }
1440
1441 r = prepare_resampling_plan(itor, roi_in->height, roi_in->y, roi_out->height, roi_out->y, roi_out->scale,
1442 &vlength, &vkernel, &vindex, &vmeta);
1443 if(r)
1444 {
1445 goto exit;
1446 }
1447
1448 #if DEBUG_RESAMPLING_TIMING
1449 ts_plan = getts() - ts_plan;
1450 #endif
1451
1452 #if DEBUG_RESAMPLING_TIMING
1453 int64_t ts_resampling = getts();
1454 #endif
1455
1456 // Process each output line
1457 #ifdef _OPENMP
1458 #pragma omp parallel for default(none) \
1459 dt_omp_firstprivate(in, in_stride_floats, out_stride_floats, roi_out) \
1460 shared(out, hindex, hlength, hkernel, vindex, vlength, vkernel, vmeta)
1461 #endif
1462 for(int oy = 0; oy < roi_out->height; oy++)
1463 {
1464 // Initialize column resampling indexes
1465 int vlidx = vmeta[3 * oy + 0]; // V(ertical) L(ength) I(n)d(e)x
1466 int vkidx = vmeta[3 * oy + 1]; // V(ertical) K(ernel) I(n)d(e)x
1467 int viidx = vmeta[3 * oy + 2]; // V(ertical) I(ndex) I(n)d(e)x
1468
1469 // Initialize row resampling indexes
1470 int hlidx = 0; // H(orizontal) L(ength) I(n)d(e)x
1471 int hkidx = 0; // H(orizontal) K(ernel) I(n)d(e)x
1472 int hiidx = 0; // H(orizontal) I(ndex) I(n)d(e)x
1473
1474 // Number of lines contributing to the output line
1475 int vl = vlength[vlidx++]; // V(ertical) L(ength)
1476
1477 // Process each output column
1478 for(int ox = 0; ox < roi_out->width; ox++)
1479 {
1480 debug_extra("output %p [% 4d % 4d]\n", out, ox, oy);
1481
1482 // This will hold the resulting pixel
1483 dt_aligned_pixel_t vs = { 0.0f, 0.0f, 0.0f, 0.0f };
1484
1485 // Number of horizontal samples contributing to the output
1486 int hl = hlength[hlidx++]; // H(orizontal) L(ength)
1487
1488 for(int iy = 0; iy < vl; iy++)
1489 {
1490 // This is our input line
1491 size_t baseidx_vindex = (size_t)vindex[viidx++] * in_stride_floats;
1492
1493 dt_aligned_pixel_t vhs = { 0.0f, 0.0f, 0.0f, 0.0f };
1494
1495 for(int ix = 0; ix < hl; ix++)
1496 {
1497 // Apply the precomputed filter kernel
1498 const size_t baseidx = baseidx_vindex + (size_t)hindex[hiidx++] * 4;
1499 const float htap = hkernel[hkidx++];
1500 // Convince gcc 10 to vectorize
1501 dt_aligned_pixel_t tmp = { in[baseidx], in[baseidx+1], in[baseidx+2], in[baseidx+3] };
1502 for_each_channel(c, aligned(tmp,vhs:16)) vhs[c] += tmp[c] * htap;
1503 }
1504
1505 // Accumulate contribution from this line
1506 const float vtap = vkernel[vkidx++];
1507 for_each_channel(c, aligned(vhs,vs:16)) vs[c] += vhs[c] * vtap;
1508
1509 // Reset horizontal resampling context
1510 hkidx -= hl;
1511 hiidx -= hl;
1512 }
1513
1514 // Output pixel is ready
1515 const size_t baseidx = (size_t)oy * out_stride_floats + (size_t)ox * 4;
1516
1517 // Clip negative RGB that may be produced by Lanczos undershooting
1518 // Negative RGB are invalid values no matter the RGB space (light is positive)
1519 for_each_channel(c, aligned(vs:16)) out[baseidx + c] = fmaxf(vs[c], 0.f);
1520
1521 // Reset vertical resampling context
1522 viidx -= vl;
1523 vkidx -= vl;
1524
1525 // Progress in horizontal context
1526 hiidx += hl;
1527 hkidx += hl;
1528 }
1529 }
1530
1531 #if DEBUG_RESAMPLING_TIMING
1532 ts_resampling = getts() - ts_resampling;
1533 fprintf(stderr, "resampling %p plan:%" PRId64 "us resampling:%" PRId64 "us\n", in, ts_plan, ts_resampling);
1534 #endif
1535
1536 exit:
1537 /* Free the resampling plans. It's nasty to optimize allocs like that, but
1538 * it simplifies the code :-D. The length array is in fact the only memory
1539 * allocated. */
1540 dt_free_align(hlength);
1541 dt_free_align(vlength);
1542 }
1543
1544 #if defined(__SSE2__)
dt_interpolation_resample_sse(const struct dt_interpolation * itor,float * out,const dt_iop_roi_t * const roi_out,const int32_t out_stride,const float * const in,const dt_iop_roi_t * const roi_in,const int32_t in_stride)1545 static void dt_interpolation_resample_sse(const struct dt_interpolation *itor, float *out,
1546 const dt_iop_roi_t *const roi_out, const int32_t out_stride,
1547 const float *const in, const dt_iop_roi_t *const roi_in,
1548 const int32_t in_stride)
1549 {
1550 int *hindex = NULL;
1551 int *hlength = NULL;
1552 float *hkernel = NULL;
1553 int *vindex = NULL;
1554 int *vlength = NULL;
1555 float *vkernel = NULL;
1556 int *vmeta = NULL;
1557
1558 int r;
1559
1560 debug_info("resampling %p (%dx%d@%dx%d scale %f) -> %p (%dx%d@%dx%d scale %f)\n", in, roi_in->width,
1561 roi_in->height, roi_in->x, roi_in->y, roi_in->scale, out, roi_out->width, roi_out->height,
1562 roi_out->x, roi_out->y, roi_out->scale);
1563
1564 // Fast code path for 1:1 copy, only cropping area can change
1565 if(roi_out->scale == 1.f)
1566 {
1567 const int x0 = roi_out->x * 4 * sizeof(float);
1568 #if DEBUG_RESAMPLING_TIMING
1569 int64_t ts_resampling = getts();
1570 #endif
1571 #ifdef _OPENMP
1572 #pragma omp parallel for default(none) \
1573 dt_omp_firstprivate(in, in_stride, out_stride, roi_out, x0) \
1574 shared(out)
1575 #endif
1576 for(int y = 0; y < roi_out->height; y++)
1577 {
1578 float *i = (float *)((char *)in + (size_t)in_stride * (y + roi_out->y) + x0);
1579 float *o = (float *)((char *)out + (size_t)out_stride * y);
1580 memcpy(o, i, out_stride);
1581 }
1582 #if DEBUG_RESAMPLING_TIMING
1583 ts_resampling = getts() - ts_resampling;
1584 fprintf(stderr, "resampling %p plan:0us resampling:%" PRId64 "us\n", in, ts_resampling);
1585 #endif
1586 // All done, so easy case
1587 return;
1588 }
1589
1590 // Generic non 1:1 case... much more complicated :D
1591 #if DEBUG_RESAMPLING_TIMING
1592 int64_t ts_plan = getts();
1593 #endif
1594
1595 // Prepare resampling plans once and for all
1596 r = prepare_resampling_plan(itor, roi_in->width, roi_in->x, roi_out->width, roi_out->x, roi_out->scale,
1597 &hlength, &hkernel, &hindex, NULL);
1598 if(r)
1599 {
1600 goto exit;
1601 }
1602
1603 r = prepare_resampling_plan(itor, roi_in->height, roi_in->y, roi_out->height, roi_out->y, roi_out->scale,
1604 &vlength, &vkernel, &vindex, &vmeta);
1605 if(r)
1606 {
1607 goto exit;
1608 }
1609
1610 #if DEBUG_RESAMPLING_TIMING
1611 ts_plan = getts() - ts_plan;
1612 #endif
1613
1614 #if DEBUG_RESAMPLING_TIMING
1615 int64_t ts_resampling = getts();
1616 #endif
1617
1618 // Process each output line
1619 #ifdef _OPENMP
1620 #pragma omp parallel for default(none) \
1621 dt_omp_firstprivate(in, in_stride, out_stride, roi_out) \
1622 shared(out, hindex, hlength, hkernel, vindex, vlength, vkernel, vmeta)
1623 #endif
1624 for(int oy = 0; oy < roi_out->height; oy++)
1625 {
1626 // Initialize column resampling indexes
1627 int vlidx = vmeta[3 * oy + 0]; // V(ertical) L(ength) I(n)d(e)x
1628 int vkidx = vmeta[3 * oy + 1]; // V(ertical) K(ernel) I(n)d(e)x
1629 int viidx = vmeta[3 * oy + 2]; // V(ertical) I(ndex) I(n)d(e)x
1630
1631 // Initialize row resampling indexes
1632 int hlidx = 0; // H(orizontal) L(ength) I(n)d(e)x
1633 int hkidx = 0; // H(orizontal) K(ernel) I(n)d(e)x
1634 int hiidx = 0; // H(orizontal) I(ndex) I(n)d(e)x
1635
1636 // Number of lines contributing to the output line
1637 int vl = vlength[vlidx++]; // V(ertical) L(ength)
1638
1639 // Process each output column
1640 for(int ox = 0; ox < roi_out->width; ox++)
1641 {
1642 debug_extra("output %p [% 4d % 4d]\n", out, ox, oy);
1643
1644 // This will hold the resulting pixel
1645 __m128 vs = _mm_setzero_ps();
1646
1647 // Number of horizontal samples contributing to the output
1648 const int hl = hlength[hlidx++]; // H(orizontal) L(ength)
1649
1650 for(int iy = 0; iy < vl; iy++)
1651 {
1652 // This is our input line
1653 const float *i = (float *)((char *)in + (size_t)in_stride * vindex[viidx++]);
1654
1655 __m128 vhs = _mm_setzero_ps();
1656
1657 for(int ix = 0; ix < hl; ix++)
1658 {
1659 // Apply the precomputed filter kernel
1660 const size_t baseidx = (size_t)hindex[hiidx++] * 4;
1661 const float htap = hkernel[hkidx++];
1662 const __m128 vhtap = _mm_set_ps1(htap);
1663 vhs = _mm_add_ps(vhs, _mm_mul_ps(*(__m128 *)&i[baseidx], vhtap));
1664 }
1665
1666 // Accumulate contribution from this line
1667 const float vtap = vkernel[vkidx++];
1668 const __m128 vvtap = _mm_set_ps1(vtap);
1669 vs = _mm_add_ps(vs, _mm_mul_ps(vhs, vvtap));
1670
1671 // Reset horizontal resampling context
1672 hkidx -= hl;
1673 hiidx -= hl;
1674 }
1675
1676 // Output pixel is ready
1677 float *o = (float *)((char *)out + (size_t)oy * out_stride + (size_t)ox * 4 * sizeof(float));
1678
1679 // Clip negative RGB that may be produced by Lanczos undershooting
1680 // Negative RGB are invalid values no matter the RGB space (light is positive)
1681 vs = _mm_max_ps(vs, _mm_setzero_ps());
1682 _mm_stream_ps(o, vs);
1683
1684 // Reset vertical resampling context
1685 viidx -= vl;
1686 vkidx -= vl;
1687
1688 // Progress in horizontal context
1689 hiidx += hl;
1690 hkidx += hl;
1691 }
1692 }
1693
1694 _mm_sfence();
1695
1696 #if DEBUG_RESAMPLING_TIMING
1697 ts_resampling = getts() - ts_resampling;
1698 fprintf(stderr, "resampling %p plan:%" PRId64 "us resampling:%" PRId64 "us\n", in, ts_plan, ts_resampling);
1699 #endif
1700
1701 exit:
1702 /* Free the resampling plans. It's nasty to optimize allocs like that, but
1703 * it simplifies the code :-D. The length array is in fact the only memory
1704 * allocated. */
1705 dt_free_align(hlength);
1706 dt_free_align(vlength);
1707 }
1708 #endif
1709
1710 /** Applies resampling (re-scaling) on *full* input and output buffers.
1711 * roi_in and roi_out define the part of the buffers that is affected.
1712 */
dt_interpolation_resample(const struct dt_interpolation * itor,float * out,const dt_iop_roi_t * const roi_out,const int32_t out_stride,const float * const in,const dt_iop_roi_t * const roi_in,const int32_t in_stride)1713 void dt_interpolation_resample(const struct dt_interpolation *itor, float *out,
1714 const dt_iop_roi_t *const roi_out, const int32_t out_stride,
1715 const float *const in, const dt_iop_roi_t *const roi_in,
1716 const int32_t in_stride)
1717 {
1718 if(darktable.codepath.OPENMP_SIMD)
1719 return dt_interpolation_resample_plain(itor, out, roi_out, out_stride, in, roi_in, in_stride);
1720 #if defined(__SSE2__)
1721 else if(darktable.codepath.SSE2)
1722 return dt_interpolation_resample_sse(itor, out, roi_out, out_stride, in, roi_in, in_stride);
1723 #endif
1724 else
1725 dt_unreachable_codepath();
1726 }
1727
1728 /** Applies resampling (re-scaling) on a specific region-of-interest of an image. The input
1729 * and output buffers hold exactly those roi's. roi_in and roi_out define the relative
1730 * positions of the roi's within the full input and output image, respectively.
1731 */
dt_interpolation_resample_roi(const struct dt_interpolation * itor,float * out,const dt_iop_roi_t * const roi_out,const int32_t out_stride,const float * const in,const dt_iop_roi_t * const roi_in,const int32_t in_stride)1732 void dt_interpolation_resample_roi(const struct dt_interpolation *itor, float *out,
1733 const dt_iop_roi_t *const roi_out, const int32_t out_stride,
1734 const float *const in, const dt_iop_roi_t *const roi_in,
1735 const int32_t in_stride)
1736 {
1737 dt_iop_roi_t oroi = *roi_out;
1738 oroi.x = oroi.y = 0;
1739
1740 dt_iop_roi_t iroi = *roi_in;
1741 iroi.x = iroi.y = 0;
1742
1743 dt_interpolation_resample(itor, out, &oroi, out_stride, in, &iroi, in_stride);
1744 }
1745
1746 #ifdef HAVE_OPENCL
dt_interpolation_init_cl_global()1747 dt_interpolation_cl_global_t *dt_interpolation_init_cl_global()
1748 {
1749 dt_interpolation_cl_global_t *g
1750 = (dt_interpolation_cl_global_t *)malloc(sizeof(dt_interpolation_cl_global_t));
1751
1752 const int program = 2; // basic.cl, from programs.conf
1753 g->kernel_interpolation_resample = dt_opencl_create_kernel(program, "interpolation_resample");
1754 return g;
1755 }
1756
dt_interpolation_free_cl_global(dt_interpolation_cl_global_t * g)1757 void dt_interpolation_free_cl_global(dt_interpolation_cl_global_t *g)
1758 {
1759 if(!g) return;
1760 // destroy kernels
1761 dt_opencl_free_kernel(g->kernel_interpolation_resample);
1762 free(g);
1763 }
1764
roundToNextPowerOfTwo(uint32_t x)1765 static uint32_t roundToNextPowerOfTwo(uint32_t x)
1766 {
1767 x--;
1768 x |= x >> 1;
1769 x |= x >> 2;
1770 x |= x >> 4;
1771 x |= x >> 8;
1772 x |= x >> 16;
1773 x++;
1774 return x;
1775 }
1776
1777 /** Applies resampling (re-scaling) on *full* input and output buffers.
1778 * roi_in and roi_out define the part of the buffers that is affected.
1779 */
dt_interpolation_resample_cl(const struct dt_interpolation * itor,int devid,cl_mem dev_out,const dt_iop_roi_t * const roi_out,cl_mem dev_in,const dt_iop_roi_t * const roi_in)1780 int dt_interpolation_resample_cl(const struct dt_interpolation *itor, int devid, cl_mem dev_out,
1781 const dt_iop_roi_t *const roi_out, cl_mem dev_in,
1782 const dt_iop_roi_t *const roi_in)
1783 {
1784 int *hindex = NULL;
1785 int *hlength = NULL;
1786 float *hkernel = NULL;
1787 int *hmeta = NULL;
1788 int *vindex = NULL;
1789 int *vlength = NULL;
1790 float *vkernel = NULL;
1791 int *vmeta = NULL;
1792
1793 int r;
1794 cl_int err = -999;
1795
1796 cl_mem dev_hindex = NULL;
1797 cl_mem dev_hlength = NULL;
1798 cl_mem dev_hkernel = NULL;
1799 cl_mem dev_hmeta = NULL;
1800 cl_mem dev_vindex = NULL;
1801 cl_mem dev_vlength = NULL;
1802 cl_mem dev_vkernel = NULL;
1803 cl_mem dev_vmeta = NULL;
1804
1805 debug_info("resampling_cl %p (%dx%d@%dx%d scale %f) -> %p (%dx%d@%dx%d scale %f)\n", (void *)dev_in,
1806 roi_in->width, roi_in->height, roi_in->x, roi_in->y, roi_in->scale, (void *)dev_out,
1807 roi_out->width, roi_out->height, roi_out->x, roi_out->y, roi_out->scale);
1808
1809 // Fast code path for 1:1 copy, only cropping area can change
1810 if(roi_out->scale == 1.f)
1811 {
1812 #if DEBUG_RESAMPLING_TIMING
1813 int64_t ts_resampling = getts();
1814 #endif
1815 size_t iorigin[] = { roi_out->x, roi_out->y, 0 };
1816 size_t oorigin[] = { 0, 0, 0 };
1817 size_t region[] = { roi_out->width, roi_out->height, 1 };
1818
1819 // copy original input from dev_in -> dev_out as starting point
1820 err = dt_opencl_enqueue_copy_image(devid, dev_in, dev_out, iorigin, oorigin, region);
1821 if(err != CL_SUCCESS) goto error;
1822
1823 #if DEBUG_RESAMPLING_TIMING
1824 ts_resampling = getts() - ts_resampling;
1825 fprintf(stderr, "resampling_cl %p plan:0us resampling:%" PRId64 "us\n", (void *)dev_in, ts_resampling);
1826 #endif
1827 // All done, so easy case
1828 return CL_SUCCESS;
1829 }
1830
1831 // Generic non 1:1 case... much more complicated :D
1832 #if DEBUG_RESAMPLING_TIMING
1833 int64_t ts_plan = getts();
1834 #endif
1835
1836 // Prepare resampling plans once and for all
1837 r = prepare_resampling_plan(itor, roi_in->width, roi_in->x, roi_out->width, roi_out->x, roi_out->scale,
1838 &hlength, &hkernel, &hindex, &hmeta);
1839 if(r)
1840 {
1841 goto error;
1842 }
1843
1844 r = prepare_resampling_plan(itor, roi_in->height, roi_in->y, roi_out->height, roi_out->y, roi_out->scale,
1845 &vlength, &vkernel, &vindex, &vmeta);
1846 if(r)
1847 {
1848 goto error;
1849 }
1850
1851 int hmaxtaps = -1, vmaxtaps = -1;
1852 for(int k = 0; k < roi_out->width; k++) hmaxtaps = MAX(hmaxtaps, hlength[k]);
1853 for(int k = 0; k < roi_out->height; k++) vmaxtaps = MAX(vmaxtaps, vlength[k]);
1854
1855 #if DEBUG_RESAMPLING_TIMING
1856 ts_plan = getts() - ts_plan;
1857 #endif
1858
1859 #if DEBUG_RESAMPLING_TIMING
1860 int64_t ts_resampling = getts();
1861 #endif
1862
1863 // strategy: process image column-wise (local[0] = 1). For each row generate
1864 // a number of parallel work items each taking care of one horizontal convolution,
1865 // then sum over work items to do the vertical convolution
1866
1867 const int kernel = darktable.opencl->interpolation->kernel_interpolation_resample;
1868 const int width = roi_out->width;
1869 const int height = roi_out->height;
1870
1871 // make sure blocksize is not too large
1872 const int taps = roundToNextPowerOfTwo(vmaxtaps); // the number of work items per row rounded up to a power of 2
1873 // (for quick recursive reduction)
1874 int vblocksize;
1875
1876 dt_opencl_local_buffer_t locopt
1877 = (dt_opencl_local_buffer_t){ .xoffset = 0, .xfactor = 1, .yoffset = 0, .yfactor = 1,
1878 .cellsize = 4 * sizeof(float), .overhead = hmaxtaps * sizeof(float) + hmaxtaps * sizeof(int),
1879 .sizex = 1, .sizey = (1 << 16) * taps };
1880
1881 if(dt_opencl_local_buffer_opt(devid, kernel, &locopt))
1882 vblocksize = locopt.sizey;
1883 else
1884 vblocksize = 1;
1885
1886 if(vblocksize < taps)
1887 {
1888 // our strategy does not work: the vertical number of taps exceeds the vertical workgroupsize;
1889 // there is no point in continuing on the GPU - that would be way too slow; let's delegate the stuff to
1890 // the CPU then.
1891 dt_print(
1892 DT_DEBUG_OPENCL,
1893 "[opencl_resampling] resampling plan cannot efficiently be run on the GPU - fall back to CPU.\n");
1894 goto error;
1895 }
1896
1897 size_t sizes[3] = { ROUNDUPWD(width), ROUNDUP(height * taps, vblocksize), 1 };
1898 size_t local[3] = { 1, vblocksize, 1 };
1899
1900 // store resampling plan to device memory
1901 // hindex, vindex, hkernel, vkernel: (v|h)maxtaps might be too small, so store a bit more than needed
1902 dev_hindex = dt_opencl_copy_host_to_device_constant(devid, sizeof(int) * width * (hmaxtaps + 1), hindex);
1903 if(dev_hindex == NULL) goto error;
1904
1905 dev_hlength = dt_opencl_copy_host_to_device_constant(devid, sizeof(int) * width, hlength);
1906 if(dev_hlength == NULL) goto error;
1907
1908 dev_hkernel
1909 = dt_opencl_copy_host_to_device_constant(devid, sizeof(float) * width * (hmaxtaps + 1), hkernel);
1910 if(dev_hkernel == NULL) goto error;
1911
1912 dev_hmeta = dt_opencl_copy_host_to_device_constant(devid, sizeof(int) * width * 3, hmeta);
1913 if(dev_hmeta == NULL) goto error;
1914
1915 dev_vindex = dt_opencl_copy_host_to_device_constant(devid, sizeof(int) * height * (vmaxtaps + 1), vindex);
1916 if(dev_vindex == NULL) goto error;
1917
1918 dev_vlength = dt_opencl_copy_host_to_device_constant(devid, sizeof(int) * height, vlength);
1919 if(dev_vlength == NULL) goto error;
1920
1921 dev_vkernel
1922 = dt_opencl_copy_host_to_device_constant(devid, sizeof(float) * height * (vmaxtaps + 1), vkernel);
1923 if(dev_vkernel == NULL) goto error;
1924
1925 dev_vmeta = dt_opencl_copy_host_to_device_constant(devid, sizeof(int) * height * 3, vmeta);
1926 if(dev_vmeta == NULL) goto error;
1927
1928 dt_opencl_set_kernel_arg(devid, kernel, 0, sizeof(cl_mem), (void *)&dev_in);
1929 dt_opencl_set_kernel_arg(devid, kernel, 1, sizeof(cl_mem), (void *)&dev_out);
1930 dt_opencl_set_kernel_arg(devid, kernel, 2, sizeof(int), (void *)&width);
1931 dt_opencl_set_kernel_arg(devid, kernel, 3, sizeof(int), (void *)&height);
1932 dt_opencl_set_kernel_arg(devid, kernel, 4, sizeof(cl_mem), (void *)&dev_hmeta);
1933 dt_opencl_set_kernel_arg(devid, kernel, 5, sizeof(cl_mem), (void *)&dev_vmeta);
1934 dt_opencl_set_kernel_arg(devid, kernel, 6, sizeof(cl_mem), (void *)&dev_hlength);
1935 dt_opencl_set_kernel_arg(devid, kernel, 7, sizeof(cl_mem), (void *)&dev_vlength);
1936 dt_opencl_set_kernel_arg(devid, kernel, 8, sizeof(cl_mem), (void *)&dev_hindex);
1937 dt_opencl_set_kernel_arg(devid, kernel, 9, sizeof(cl_mem), (void *)&dev_vindex);
1938 dt_opencl_set_kernel_arg(devid, kernel, 10, sizeof(cl_mem), (void *)&dev_hkernel);
1939 dt_opencl_set_kernel_arg(devid, kernel, 11, sizeof(cl_mem), (void *)&dev_vkernel);
1940 dt_opencl_set_kernel_arg(devid, kernel, 12, sizeof(int), (void *)&hmaxtaps);
1941 dt_opencl_set_kernel_arg(devid, kernel, 13, sizeof(int), (void *)&taps);
1942 dt_opencl_set_kernel_arg(devid, kernel, 14, hmaxtaps * sizeof(float), NULL);
1943 dt_opencl_set_kernel_arg(devid, kernel, 15, hmaxtaps * sizeof(int), NULL);
1944 dt_opencl_set_kernel_arg(devid, kernel, 16, vblocksize * 4 * sizeof(float), NULL);
1945 err = dt_opencl_enqueue_kernel_2d_with_local(devid, kernel, sizes, local);
1946 if(err != CL_SUCCESS) goto error;
1947
1948 #if DEBUG_RESAMPLING_TIMING
1949 ts_resampling = getts() - ts_resampling;
1950 fprintf(stderr, "resampling_cl %p plan:%" PRId64 "us resampling:%" PRId64 "us\n", (void *)dev_in, ts_plan,
1951 ts_resampling);
1952 #endif
1953
1954 dt_opencl_release_mem_object(dev_hindex);
1955 dt_opencl_release_mem_object(dev_hlength);
1956 dt_opencl_release_mem_object(dev_hkernel);
1957 dt_opencl_release_mem_object(dev_hmeta);
1958 dt_opencl_release_mem_object(dev_vindex);
1959 dt_opencl_release_mem_object(dev_vlength);
1960 dt_opencl_release_mem_object(dev_vkernel);
1961 dt_opencl_release_mem_object(dev_vmeta);
1962 dt_free_align(hlength);
1963 dt_free_align(vlength);
1964 return CL_SUCCESS;
1965
1966 error:
1967 dt_opencl_release_mem_object(dev_hindex);
1968 dt_opencl_release_mem_object(dev_hlength);
1969 dt_opencl_release_mem_object(dev_hkernel);
1970 dt_opencl_release_mem_object(dev_hmeta);
1971 dt_opencl_release_mem_object(dev_vindex);
1972 dt_opencl_release_mem_object(dev_vlength);
1973 dt_opencl_release_mem_object(dev_vkernel);
1974 dt_opencl_release_mem_object(dev_vmeta);
1975 dt_free_align(hlength);
1976 dt_free_align(vlength);
1977 dt_print(DT_DEBUG_OPENCL, "[opencl_resampling] couldn't enqueue kernel! %d\n", err);
1978 return err;
1979 }
1980
1981 /** Applies resampling (re-scaling) on a specific region-of-interest of an image. The input
1982 * and output buffers hold exactly those roi's. roi_in and roi_out define the relative
1983 * positions of the roi's within the full input and output image, respectively.
1984 */
dt_interpolation_resample_roi_cl(const struct dt_interpolation * itor,int devid,cl_mem dev_out,const dt_iop_roi_t * const roi_out,cl_mem dev_in,const dt_iop_roi_t * const roi_in)1985 int dt_interpolation_resample_roi_cl(const struct dt_interpolation *itor, int devid, cl_mem dev_out,
1986 const dt_iop_roi_t *const roi_out, cl_mem dev_in,
1987 const dt_iop_roi_t *const roi_in)
1988 {
1989 dt_iop_roi_t oroi = *roi_out;
1990 oroi.x = oroi.y = 0;
1991
1992 dt_iop_roi_t iroi = *roi_in;
1993 iroi.x = iroi.y = 0;
1994
1995 return dt_interpolation_resample_cl(itor, devid, dev_out, &oroi, dev_in, &iroi);
1996 }
1997 #endif
1998
dt_interpolation_resample_1c_plain(const struct dt_interpolation * itor,float * out,const dt_iop_roi_t * const roi_out,const int32_t out_stride,const float * const in,const dt_iop_roi_t * const roi_in,const int32_t in_stride)1999 static void dt_interpolation_resample_1c_plain(const struct dt_interpolation *itor, float *out,
2000 const dt_iop_roi_t *const roi_out, const int32_t out_stride,
2001 const float *const in, const dt_iop_roi_t *const roi_in,
2002 const int32_t in_stride)
2003 {
2004 int *hindex = NULL;
2005 int *hlength = NULL;
2006 float *hkernel = NULL;
2007 int *vindex = NULL;
2008 int *vlength = NULL;
2009 float *vkernel = NULL;
2010 int *vmeta = NULL;
2011
2012 int r;
2013
2014 debug_info("resampling %p (%dx%d@%dx%d scale %f) -> %p (%dx%d@%dx%d scale %f)\n", in, roi_in->width,
2015 roi_in->height, roi_in->x, roi_in->y, roi_in->scale, out, roi_out->width, roi_out->height,
2016 roi_out->x, roi_out->y, roi_out->scale);
2017
2018 // Fast code path for 1:1 copy, only cropping area can change
2019 if(roi_out->scale == 1.f)
2020 {
2021 const int x0 = roi_out->x * sizeof(float);
2022 #if DEBUG_RESAMPLING_TIMING
2023 int64_t ts_resampling = getts();
2024 #endif
2025 #ifdef _OPENMP
2026 #pragma omp parallel for default(none) \
2027 dt_omp_firstprivate(in, in_stride, out_stride, roi_out, x0) \
2028 shared(out)
2029 #endif
2030 for(int y = 0; y < roi_out->height; y++)
2031 {
2032 float *i = (float *)((char *)in + (size_t)in_stride * (y + roi_out->y) + x0);
2033 float *o = (float *)((char *)out + (size_t)out_stride * y);
2034 memcpy(o, i, out_stride);
2035 }
2036 #if DEBUG_RESAMPLING_TIMING
2037 ts_resampling = getts() - ts_resampling;
2038 fprintf(stderr, "resampling %p plan:0us resampling:%" PRId64 "us\n", in, ts_resampling);
2039 #endif
2040 // All done, so easy case
2041 return;
2042 }
2043
2044 // Generic non 1:1 case... much more complicated :D
2045 #if DEBUG_RESAMPLING_TIMING
2046 int64_t ts_plan = getts();
2047 #endif
2048
2049 // Prepare resampling plans once and for all
2050 r = prepare_resampling_plan(itor, roi_in->width, roi_in->x, roi_out->width, roi_out->x, roi_out->scale,
2051 &hlength, &hkernel, &hindex, NULL);
2052 if(r)
2053 {
2054 goto exit;
2055 }
2056
2057 r = prepare_resampling_plan(itor, roi_in->height, roi_in->y, roi_out->height, roi_out->y, roi_out->scale,
2058 &vlength, &vkernel, &vindex, &vmeta);
2059 if(r)
2060 {
2061 goto exit;
2062 }
2063
2064 #if DEBUG_RESAMPLING_TIMING
2065 ts_plan = getts() - ts_plan;
2066 #endif
2067
2068 #if DEBUG_RESAMPLING_TIMING
2069 int64_t ts_resampling = getts();
2070 #endif
2071
2072 // Process each output line
2073 #ifdef _OPENMP
2074 #pragma omp parallel for default(none) \
2075 dt_omp_firstprivate(in, in_stride, out_stride, roi_out) \
2076 shared(out, hindex, hlength, hkernel, vindex, vlength, vkernel, vmeta)
2077 #endif
2078 for(int oy = 0; oy < roi_out->height; oy++)
2079 {
2080 // Initialize column resampling indexes
2081 int vlidx = vmeta[3 * oy + 0]; // V(ertical) L(ength) I(n)d(e)x
2082 int vkidx = vmeta[3 * oy + 1]; // V(ertical) K(ernel) I(n)d(e)x
2083 int viidx = vmeta[3 * oy + 2]; // V(ertical) I(ndex) I(n)d(e)x
2084
2085 // Initialize row resampling indexes
2086 int hlidx = 0; // H(orizontal) L(ength) I(n)d(e)x
2087 int hkidx = 0; // H(orizontal) K(ernel) I(n)d(e)x
2088 int hiidx = 0; // H(orizontal) I(ndex) I(n)d(e)x
2089
2090 // Number of lines contributing to the output line
2091 int vl = vlength[vlidx++]; // V(ertical) L(ength)
2092
2093 // Process each output column
2094 for(int ox = 0; ox < roi_out->width; ox++)
2095 {
2096 debug_extra("output %p [% 4d % 4d]\n", out, ox, oy);
2097
2098 // This will hold the resulting pixel
2099 float vs = 0.0f;
2100
2101 // Number of horizontal samples contributing to the output
2102 const int hl = hlength[hlidx++]; // H(orizontal) L(ength)
2103
2104 for(int iy = 0; iy < vl; iy++)
2105 {
2106 // This is our input line
2107 const float *i = (float *)((char *)in + (size_t)in_stride * vindex[viidx++]);
2108
2109 float vhs = 0.0f;
2110
2111 for(int ix = 0; ix < hl; ix++)
2112 {
2113 // Apply the precomputed filter kernel
2114 const size_t baseidx = (size_t)hindex[hiidx++];
2115 const float htap = hkernel[hkidx++];
2116 vhs += i[baseidx] * htap;
2117 }
2118
2119 // Accumulate contribution from this line
2120 const float vtap = vkernel[vkidx++];
2121 vs += vhs * vtap;
2122
2123 // Reset horizontal resampling context
2124 hkidx -= hl;
2125 hiidx -= hl;
2126 }
2127
2128 // Output pixel is ready
2129 float *o = (float *)((char *)out + (size_t)oy * out_stride + (size_t)ox * sizeof(float));
2130 *o = vs;
2131
2132 // Reset vertical resampling context
2133 viidx -= vl;
2134 vkidx -= vl;
2135
2136 // Progress in horizontal context
2137 hiidx += hl;
2138 hkidx += hl;
2139 }
2140 }
2141
2142 #if DEBUG_RESAMPLING_TIMING
2143 ts_resampling = getts() - ts_resampling;
2144 fprintf(stderr, "resampling %p plan:%" PRId64 "us resampling:%" PRId64 "us\n", in, ts_plan, ts_resampling);
2145 #endif
2146
2147 exit:
2148 /* Free the resampling plans. It's nasty to optimize allocs like that, but
2149 * it simplifies the code :-D. The length array is in fact the only memory
2150 * allocated. */
2151 dt_free_align(hlength);
2152 dt_free_align(vlength);
2153 }
2154
2155 /** Applies resampling (re-scaling) on *full* input and output buffers.
2156 * roi_in and roi_out define the part of the buffers that is affected.
2157 */
dt_interpolation_resample_1c(const struct dt_interpolation * itor,float * out,const dt_iop_roi_t * const roi_out,const int32_t out_stride,const float * const in,const dt_iop_roi_t * const roi_in,const int32_t in_stride)2158 void dt_interpolation_resample_1c(const struct dt_interpolation *itor, float *out,
2159 const dt_iop_roi_t *const roi_out, const int32_t out_stride,
2160 const float *const in, const dt_iop_roi_t *const roi_in,
2161 const int32_t in_stride)
2162 {
2163 return dt_interpolation_resample_1c_plain(itor, out, roi_out, out_stride, in, roi_in, in_stride);
2164 }
2165
2166 /** Applies resampling (re-scaling) on a specific region-of-interest of an image. The input
2167 * and output buffers hold exactly those roi's. roi_in and roi_out define the relative
2168 * positions of the roi's within the full input and output image, respectively.
2169 */
dt_interpolation_resample_roi_1c(const struct dt_interpolation * itor,float * out,const dt_iop_roi_t * const roi_out,const int32_t out_stride,const float * const in,const dt_iop_roi_t * const roi_in,const int32_t in_stride)2170 void dt_interpolation_resample_roi_1c(const struct dt_interpolation *itor, float *out,
2171 const dt_iop_roi_t *const roi_out, const int32_t out_stride,
2172 const float *const in, const dt_iop_roi_t *const roi_in,
2173 const int32_t in_stride)
2174 {
2175 dt_iop_roi_t oroi = *roi_out;
2176 oroi.x = oroi.y = 0;
2177
2178 dt_iop_roi_t iroi = *roi_in;
2179 iroi.x = iroi.y = 0;
2180
2181 dt_interpolation_resample_1c(itor, out, &oroi, out_stride, in, &iroi, in_stride);
2182 }
2183
2184 // modelines: These editor modelines have been set for all relevant files by tools/update_modelines.sh
2185 // vim: shiftwidth=2 expandtab tabstop=2 cindent
2186 // kate: tab-indents: off; indent-width 2; replace-tabs on; indent-mode cstyle; remove-trailing-spaces modified;
2187