1 /* --------------------------------------------------------------------------
2     This file is part of darktable,
3     Copyright (C) 2012-2021 darktable developers.
4 
5     darktable is free software: you can redistribute it and/or modify
6     it under the terms of the GNU General Public License as published by
7     the Free Software Foundation, either version 3 of the License, or
8     (at your option) any later version.
9 
10     darktable is distributed in the hope that it will be useful,
11     but WITHOUT ANY WARRANTY; without even the implied warranty of
12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13     GNU General Public License for more details.
14 
15     You should have received a copy of the GNU General Public License
16     along with darktable.  If not, see <http://www.gnu.org/licenses/>.
17 * ------------------------------------------------------------------------*/
18 
19 #include "common/interpolation.h"
20 #include "common/darktable.h"
21 #include "common/math.h"
22 #include "control/conf.h"
23 
24 #include <assert.h>
25 #include <glib.h>
26 #include <inttypes.h>
27 #include <stddef.h>
28 #include <stdint.h>
29 
30 /** Border extrapolation modes */
31 enum border_mode
32 {
33   BORDER_REPLICATE, // aaaa|abcdefg|gggg
34   BORDER_WRAP,      // defg|abcdefg|abcd
35   BORDER_MIRROR,    // edcb|abcdefg|fedc
36   BORDER_CLAMP      // ....|abcdefg|....
37 };
38 
39 /* Supporting them all might be overkill, let the compiler trim all
40  * unnecessary modes in clip for resampling codepath*/
41 #define RESAMPLING_BORDER_MODE BORDER_REPLICATE
42 
43 /* Supporting them all might be overkill, let the compiler trim all
44  * unnecessary modes in interpolation codepath */
45 #define INTERPOLATION_BORDER_MODE BORDER_MIRROR
46 
47 // Defines minimum alignment requirement for critical SIMD code
48 #define SSE_ALIGNMENT 64
49 
50 // Defines the maximum kernel half length
51 // !! Make sure to sync this with the filter array !!
52 #define MAX_HALF_FILTER_WIDTH 3
53 
54 // Add code for timing resampling function
55 #define DEBUG_RESAMPLING_TIMING 0
56 
57 // Add debug info messages to stderr
58 #define DEBUG_PRINT_INFO 0
59 
60 // Add *verbose* (like one msg per pixel out) debug message to stderr
61 #define DEBUG_PRINT_VERBOSE 0
62 
63 /* --------------------------------------------------------------------------
64  * Debug helpers
65  * ------------------------------------------------------------------------*/
66 
67 #if DEBUG_RESAMPLING_TIMING
68 #include <sys/time.h>
69 #endif
70 
71 #if DEBUG_PRINT_INFO
72 #define debug_info(...)                                                                                      \
73   do                                                                                                         \
74   {                                                                                                          \
75     fprintf(stderr, __VA_ARGS__);                                                                            \
76   } while(0)
77 #else
78 #define debug_info(...)
79 #endif
80 
81 #if DEBUG_PRINT_VERBOSE
82 #define debug_extra(...)                                                                                     \
83   do                                                                                                         \
84   {                                                                                                          \
85     fprintf(stderr, __VA_ARGS__);                                                                            \
86   } while(0)
87 #else
88 #define debug_extra(...)
89 #endif
90 
91 #if DEBUG_RESAMPLING_TIMING
getts()92 static inline int64_t getts()
93 {
94   struct timeval t;
95   gettimeofday(&t, NULL);
96   return t.tv_sec * INT64_C(1000000) + t.tv_usec;
97 }
98 #endif
99 
100 /* --------------------------------------------------------------------------
101  * Generic helpers
102  * ------------------------------------------------------------------------*/
103 
104 /** Clip into specified range
105  * @param idx index to filter
106  * @param length length of line
107  */
clip(int i,int min,int max,enum border_mode mode)108 static inline int clip(int i, int min, int max, enum border_mode mode)
109 {
110   switch(mode)
111   {
112     case BORDER_REPLICATE:
113       if(i < min)
114       {
115         i = min;
116       }
117       else if(i > max)
118       {
119         i = max;
120       }
121       break;
122     case BORDER_MIRROR:
123       if(i < min)
124       {
125         i = min - i;
126       }
127       else if(i > max)
128       {
129         i = 2 * max - i;
130       }
131       break;
132     case BORDER_WRAP:
133       if(i < min)
134       {
135         i = max - (min - i);
136       }
137       else if(i > max)
138       {
139         i = min + (i - max);
140       }
141       break;
142     case BORDER_CLAMP:
143       if(i < min || i > max)
144       {
145         /* Should not be used as is, we prevent -1 usage, filtering the taps
146          * we clip the sample indexes for. So understand this function is
147          * specific to its caller. */
148         i = -1;
149       }
150       break;
151   }
152 
153   return i;
154 }
155 
prepare_tap_boundaries(int * tap_first,int * tap_last,const enum border_mode mode,const int filterwidth,const int t,const int max)156 static inline void prepare_tap_boundaries(int *tap_first, int *tap_last, const enum border_mode mode,
157                                           const int filterwidth, const int t, const int max)
158 {
159   /* Check lower bound pixel index and skip as many pixels as necessary to
160    * fall into range */
161   *tap_first = 0;
162   if(mode == BORDER_CLAMP && t < 0)
163   {
164     *tap_first = -t;
165   }
166 
167   // Same for upper bound pixel
168   *tap_last = filterwidth;
169   if(mode == BORDER_CLAMP && t + filterwidth >= max)
170   {
171     *tap_last = max - t;
172   }
173 }
174 
175 /** Make sure an aligned chunk will not misalign its following chunk
176  * proposing an adapted length
177  *
178  * @param l Length required for current chunk
179  * @param align Required alignment for next chunk
180  *
181  * @return Required length for keeping alignment ok if chaining data chunks
182  */
increase_for_alignment(size_t l,size_t align)183 static inline size_t increase_for_alignment(size_t l, size_t align)
184 {
185   align -= 1;
186   return (l + align) & (~align);
187 }
188 
189 /* --------------------------------------------------------------------------
190  * Interpolation kernels
191  * ------------------------------------------------------------------------*/
192 
193 /* --------------------------------------------------------------------------
194  * Bilinear interpolation
195  * ------------------------------------------------------------------------*/
196 
bilinear(float width,float t)197 static inline float bilinear(float width, float t)
198 {
199   float r;
200   t = fabsf(t);
201   if(t > 1.f)
202   {
203     r = 0.f;
204   }
205   else
206   {
207     r = 1.f - t;
208   }
209   return r;
210 }
211 
212 #if defined(__SSE2__)
bilinear_sse(__m128 width,__m128 t)213 static inline __m128 bilinear_sse(__m128 width, __m128 t)
214 {
215   static const __m128 one = { 1.f, 1.f, 1.f, 1.f };
216   return _mm_sub_ps(one, _mm_abs_ps(t));
217 }
218 #endif
219 
220 /* --------------------------------------------------------------------------
221  * Bicubic interpolation
222  * ------------------------------------------------------------------------*/
223 
bicubic(float width,float t)224 static inline float bicubic(float width, float t)
225 {
226   float r;
227   t = fabsf(t);
228   if(t >= 2.f)
229   {
230     r = 0.f;
231   }
232   else if(t > 1.f && t < 2.f)
233   {
234     float t2 = t * t;
235     r = 0.5f * (t * (-t2 + 5.f * t - 8.f) + 4.f);
236   }
237   else
238   {
239     float t2 = t * t;
240     r = 0.5f * (t * (3.f * t2 - 5.f * t) + 2.f);
241   }
242   return r;
243 }
244 
245 #if defined(__SSE2__)
bicubic_sse(__m128 width,__m128 t)246 static inline __m128 bicubic_sse(__m128 width, __m128 t)
247 {
248   static const __m128 half = { .5f, .5f, .5f, .5f };
249   static const __m128 one = { 1.f, 1.f, 1.f, 1.f };
250   static const __m128 two = { 2.f, 2.f, 2.f, 2.f };
251   static const __m128 three = { 3.f, 3.f, 3.f, 3.f };
252   static const __m128 four = { 4.f, 4.f, 4.f, 4.f };
253   static const __m128 five = { 5.f, 5.f, 5.f, 5.f };
254   static const __m128 eight = { 8.f, 8.f, 8.f, 8.f };
255 
256   t = _mm_abs_ps(t);
257   const __m128 t2 = _mm_mul_ps(t, t);
258 
259   /* Compute 1 < t < 2 case:
260    * 0.5f*(t*(-t2 + 5.f*t - 8.f) + 4.f)
261    * half*(t*(mt2 + t5 - eight) + four)
262    * half*(t*(mt2 + t5_sub_8) + four)
263    * half*(t*(mt2_add_t5_sub_8) + four) */
264   const __m128 t5 = _mm_mul_ps(five, t);
265   const __m128 t5_sub_8 = _mm_sub_ps(t5, eight);
266   const __m128 zero = _mm_setzero_ps();
267   const __m128 mt2 = _mm_sub_ps(zero, t2);
268   const __m128 mt2_add_t5_sub_8 = _mm_add_ps(mt2, t5_sub_8);
269   const __m128 a = _mm_mul_ps(t, mt2_add_t5_sub_8);
270   const __m128 b = _mm_add_ps(a, four);
271   __m128 r12 = _mm_mul_ps(b, half);
272 
273   /* Compute case < 1
274    * 0.5f*(t*(3.f*t2 - 5.f*t) + 2.f) */
275   const __m128 t23 = _mm_mul_ps(three, t2);
276   const __m128 c = _mm_sub_ps(t23, t5);
277   const __m128 d = _mm_mul_ps(t, c);
278   const __m128 e = _mm_add_ps(d, two);
279   __m128 r01 = _mm_mul_ps(half, e);
280 
281   // Compute masks fr keeping correct components
282   const __m128 mask01 = _mm_cmple_ps(t, one);
283   const __m128 mask12 = _mm_cmpgt_ps(t, one);
284   r01 = _mm_and_ps(mask01, r01);
285   r12 = _mm_and_ps(mask12, r12);
286 
287 
288   return _mm_or_ps(r01, r12);
289 }
290 #endif
291 
292 /* --------------------------------------------------------------------------
293  * Lanczos interpolation
294  * ------------------------------------------------------------------------*/
295 
296 #define DT_LANCZOS_EPSILON (1e-9f)
297 
298 #if 0
299 // Reference version left here for ... documentation
300 static inline float
301 lanczos(float width, float t)
302 {
303   float r;
304 
305   if (t<-width || t>width)
306   {
307     r = 0.f;
308   }
309   else if (t>-DT_LANCZOS_EPSILON && t<DT_LANCZOS_EPSILON)
310   {
311     r = 1.f;
312   }
313   else
314   {
315     r = width*sinf(M_PI*t)*sinf(M_PI*t/width)/(M_PI*M_PI*t*t);
316   }
317   return r;
318 }
319 #endif
320 
321 /* Fast lanczos version, no calls to math.h functions, too accurate, too slow
322  *
323  * Based on a forum entry at
324  * http://devmaster.net/forums/topic/4648-fast-and-accurate-sinecosine/
325  *
326  * Apart the fast sine function approximation, the only trick is to compute:
327  * sin(pi.t) = sin(a.pi + r.pi) where t = a + r = trunc(t) + r
328  *           = sin(a.pi).cos(r.pi) + sin(r.pi).cos(a.pi)
329  *           =         0*cos(r.pi) + sin(r.pi).cos(a.pi)
330  *           = sign.sin(r.pi) where sign =  1 if the a is even
331  *                                       = -1 if the a is odd
332  *
333  * Of course we know that lanczos func will only be called for
334  * the range -width < t < width so we can additionally avoid the
335  * range check.  */
336 
lanczos(float width,float t)337 static inline float lanczos(float width, float t)
338 {
339   /* Compute a value for sinf(pi.t) in [-pi pi] for which the value will be
340    * correct */
341   int a = (int)t;
342   float r = t - (float)a;
343 
344   // Compute the correct sign for sinf(pi.r)
345   union
346   {
347     float f;
348     uint32_t i;
349   } sign;
350   sign.i = ((a & 1) << 31) | 0x3f800000;
351 
352   return (DT_LANCZOS_EPSILON + width * sign.f * sinf_fast(M_PI_F * r) * sinf_fast(M_PI_F * t / width))
353          / (DT_LANCZOS_EPSILON + M_PI_F * M_PI_F * t * t);
354 }
355 
356 #if defined(__SSE2__)
lanczos_sse2(__m128 width,__m128 t)357 static inline __m128 lanczos_sse2(__m128 width, __m128 t)
358 {
359   /* Compute a value for sinf(pi.t) in [-pi pi] for which the value will be
360    * correct */
361   __m128i a = _mm_cvtps_epi32(t);
362   __m128 r = _mm_sub_ps(t, _mm_cvtepi32_ps(a));
363 
364   // Compute the correct sign for sinf(pi.r)
365   static const uint32_t fone[] __attribute__((aligned(SSE_ALIGNMENT)))
366   = { 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 };
367   static const uint32_t ione[] __attribute__((aligned(SSE_ALIGNMENT))) = { 1, 1, 1, 1 };
368   static const __m128 eps
369       = { DT_LANCZOS_EPSILON, DT_LANCZOS_EPSILON, DT_LANCZOS_EPSILON, DT_LANCZOS_EPSILON };
370   static const __m128 pi = { M_PI, M_PI, M_PI, M_PI };
371   static const __m128 pi2 = { M_PI * M_PI, M_PI * M_PI, M_PI * M_PI, M_PI * M_PI };
372 
373   __m128i isign = _mm_and_si128(*(__m128i *)ione, a);
374   isign = _mm_slli_epi64(isign, 31);
375   isign = _mm_or_si128(*(__m128i *)fone, isign);
376   const __m128 fsign = _mm_castsi128_ps(isign);
377 
378   __m128 num = _mm_mul_ps(width, fsign);
379   num = _mm_mul_ps(num, sinf_fast_sse(_mm_mul_ps(pi, r)));
380   num = _mm_mul_ps(num, sinf_fast_sse(_mm_div_ps(_mm_mul_ps(pi, t), width)));
381   num = _mm_add_ps(eps, num);
382 
383   __m128 den = _mm_mul_ps(pi2, _mm_mul_ps(t, t));
384   den = _mm_add_ps(eps, den);
385 
386   return _mm_div_ps(num, den);
387 }
388 #endif
389 
390 #undef DT_LANCZOS_EPSILON
391 
392 /* --------------------------------------------------------------------------
393  * All our known interpolators
394  * ------------------------------------------------------------------------*/
395 
396 /* !!! !!! !!!
397  * Make sure MAX_HALF_FILTER_WIDTH is at least equal to the maximum width
398  * of this filter list. Otherwise bad things will happen
399  * !!! !!! !!!
400  */
401 static const struct dt_interpolation dt_interpolator[] = {
402   {.id = DT_INTERPOLATION_BILINEAR,
403    .name = "bilinear",
404    .width = 1,
405    .func = &bilinear,
406 #if defined(__SSE2__)
407    .funcsse = &bilinear_sse
408 #endif
409   },
410   {.id = DT_INTERPOLATION_BICUBIC,
411    .name = "bicubic",
412    .width = 2,
413    .func = &bicubic,
414 #if defined(__SSE2__)
415    .funcsse = &bicubic_sse
416 #endif
417   },
418   {.id = DT_INTERPOLATION_LANCZOS2,
419    .name = "lanczos2",
420    .width = 2,
421    .func = &lanczos,
422 #if defined(__SSE2__)
423    .funcsse = &lanczos_sse2
424 #endif
425   },
426   {.id = DT_INTERPOLATION_LANCZOS3,
427    .name = "lanczos3",
428    .width = 3,
429    .func = &lanczos,
430 #if defined(__SSE2__)
431    .funcsse = &lanczos_sse2
432 #endif
433   },
434 };
435 
436 /* --------------------------------------------------------------------------
437  * Kernel utility methods
438  * ------------------------------------------------------------------------*/
439 
440 /** Computes an upsampling filtering kernel
441  *
442  * @param itor [in] Interpolator used
443  * @param kernel [out] resulting itor->width*2 filter taps
444  * @param norm [out] Kernel norm
445  * @param first [out] first input sample index used
446  * @param t [in] Interpolated coordinate */
compute_upsampling_kernel_plain(const struct dt_interpolation * itor,float * kernel,float * norm,int * first,float t)447 static inline void compute_upsampling_kernel_plain(const struct dt_interpolation *itor, float *kernel,
448                                                    float *norm, int *first, float t)
449 {
450   int f = (int)t - itor->width + 1;
451   if(first)
452   {
453     *first = f;
454   }
455 
456   /* Find closest integer position and then offset that to match first
457    * filtered sample position */
458   t = t - (float)f;
459 
460   // Will hold kernel norm
461   float n = 0.f;
462 
463   // Compute the raw kernel
464   for(int i = 0; i < 2 * itor->width; i++)
465   {
466     float tap = itor->func((float)itor->width, t);
467     n += tap;
468     kernel[i] = tap;
469     t -= 1.f;
470   }
471   if(norm)
472   {
473     *norm = n;
474   }
475 }
476 
477 #if defined(__SSE2__)
478 /** Computes an upsampling filtering kernel (SSE version, four taps per inner loop)
479  *
480  * @param itor [in] Interpolator used
481  * @param kernel [out] resulting itor->width*2 filter taps (array must be at least (itor->width*2+3)/4*4
482  *floats long)
483  * @param norm [out] Kernel norm
484  * @param first [out] first input sample index used
485  * @param t [in] Interpolated coordinate
486  *
487  * @return kernel norm
488  */
compute_upsampling_kernel_sse(const struct dt_interpolation * itor,float * kernel,float * norm,int * first,float t)489 static inline void compute_upsampling_kernel_sse(const struct dt_interpolation *itor, float *kernel,
490                                                  float *norm, int *first, float t)
491 {
492   int f = (int)t - itor->width + 1;
493   if(first)
494   {
495     *first = f;
496   }
497 
498   /* Find closest integer position and then offset that to match first
499    * filtered sample position */
500   t = t - (float)f;
501 
502   // Prepare t vector to compute four values a loop
503   static const __m128 bootstrap = { 0.f, -1.f, -2.f, -3.f };
504   static const __m128 iter = { -4.f, -4.f, -4.f, -4.f };
505   __m128 vt = _mm_add_ps(_mm_set_ps1(t), bootstrap);
506   __m128 vw = _mm_set_ps1((float)itor->width);
507 
508   // Prepare counters (math kept stupid for understanding)
509   int i = 0;
510   const int runs = (2 * itor->width + 3) / 4;
511 
512   while(i < runs)
513   {
514     // Compute the values
515     const __m128 vr = itor->funcsse(vw, vt);
516 
517     // Save result
518     *(__m128 *)kernel = vr;
519 
520     // Prepare next iteration
521     vt = _mm_add_ps(vt, iter);
522     kernel += 4;
523     i++;
524   }
525 
526   // compute norm now
527   if(norm)
528   {
529     float n = 0.f;
530     i = 0;
531     kernel -= 4 * runs;
532     while(i < 2 * itor->width)
533     {
534       n += *kernel;
535       kernel++;
536       i++;
537     }
538     *norm = n;
539   }
540 }
541 #endif
542 
compute_upsampling_kernel(const struct dt_interpolation * itor,float * kernel,float * norm,int * first,float t)543 static inline void compute_upsampling_kernel(const struct dt_interpolation *itor, float *kernel, float *norm,
544                                              int *first, float t)
545 {
546   if(darktable.codepath.OPENMP_SIMD) return compute_upsampling_kernel_plain(itor, kernel, norm, first, t);
547 #if defined(__SSE2__)
548   else if(darktable.codepath.SSE2)
549     return compute_upsampling_kernel_sse(itor, kernel, norm, first, t);
550 #endif
551   else
552     dt_unreachable_codepath();
553 }
554 
555 /** Computes a downsampling filtering kernel
556  *
557  * @param itor [in] Interpolator used
558  * @param kernelsize [out] Number of taps
559  * @param kernel [out] resulting taps (at least itor->width/inoout elements for no overflow)
560  * @param norm [out] Kernel norm
561  * @param first [out] index of the first sample for which the kernel is to be applied
562  * @param outoinratio [in] "out samples" over "in samples" ratio
563  * @param xout [in] Output coordinate */
compute_downsampling_kernel_plain(const struct dt_interpolation * itor,int * taps,int * first,float * kernel,float * norm,float outoinratio,int xout)564 static inline void compute_downsampling_kernel_plain(const struct dt_interpolation *itor, int *taps,
565                                                      int *first, float *kernel, float *norm,
566                                                      float outoinratio, int xout)
567 {
568   // Keep this at hand
569   const float w = (float)itor->width;
570 
571   /* Compute the phase difference between output pixel and its
572    * input corresponding input pixel */
573   const float xin = ceil_fast(((float)xout - w) / outoinratio);
574   if(first)
575   {
576     *first = (int)xin;
577   }
578 
579   // Compute first interpolator parameter
580   float t = xin * outoinratio - (float)xout;
581 
582   // Will hold kernel norm
583   float n = 0.f;
584 
585   // Compute all filter taps
586   *taps = (int)((w - t) / outoinratio);
587   for(int i = 0; i < *taps; i++)
588   {
589     *kernel = itor->func(w, t);
590     n += *kernel;
591     t += outoinratio;
592     kernel++;
593   }
594 
595   if(norm)
596   {
597     *norm = n;
598   }
599 }
600 
601 
602 #if defined(__SSE2__)
603 /** Computes a downsampling filtering kernel (SSE version, four taps per inner loop iteration)
604  *
605  * @param itor [in] Interpolator used
606  * @param kernelsize [out] Number of taps
607  * @param kernel [out] resulting taps (at least itor->width/inoout + 4 elements for no overflow)
608  * @param norm [out] Kernel norm
609  * @param first [out] index of the first sample for which the kernel is to be applied
610  * @param outoinratio [in] "out samples" over "in samples" ratio
611  * @param xout [in] Output coordinate */
compute_downsampling_kernel_sse(const struct dt_interpolation * itor,int * taps,int * first,float * kernel,float * norm,float outoinratio,int xout)612 static inline void compute_downsampling_kernel_sse(const struct dt_interpolation *itor, int *taps, int *first,
613                                                    float *kernel, float *norm, float outoinratio, int xout)
614 {
615   // Keep this at hand
616   const float w = (float)itor->width;
617 
618   /* Compute the phase difference between output pixel and its
619    * input corresponding input pixel */
620   const float xin = ceil_fast(((float)xout - w) / outoinratio);
621   if(first)
622   {
623     *first = (int)xin;
624   }
625 
626   // Compute first interpolator parameter
627   float t = xin * outoinratio - (float)xout;
628 
629   // Compute all filter taps
630   *taps = (int)((w - t) / outoinratio);
631 
632   // Bootstrap vector t
633   static const __m128 bootstrap = { 0.f, 1.f, 2.f, 3.f };
634   const __m128 iter = _mm_set_ps1(4.f * outoinratio);
635   const __m128 vw = _mm_set_ps1(w);
636   __m128 vt = _mm_add_ps(_mm_set_ps1(t), _mm_mul_ps(_mm_set_ps1(outoinratio), bootstrap));
637 
638   // Prepare counters (math kept stupid for understanding)
639   int i = 0;
640   const int runs = (*taps + 3) / 4;
641 
642   while(i < runs)
643   {
644     // Compute the values
645     const __m128 vr = itor->funcsse(vw, vt);
646 
647     // Save result
648     *(__m128 *)kernel = vr;
649 
650     // Prepare next iteration
651     vt = _mm_add_ps(vt, iter);
652     kernel += 4;
653     i++;
654   }
655 
656   // compute norm now
657   if(norm)
658   {
659     float n = 0.f;
660     i = 0;
661     kernel -= 4 * runs;
662     while(i < *taps)
663     {
664       n += *kernel;
665       kernel++;
666       i++;
667     }
668     *norm = n;
669   }
670 }
671 #endif
672 
compute_downsampling_kernel(const struct dt_interpolation * itor,int * taps,int * first,float * kernel,float * norm,float outoinratio,int xout)673 static inline void compute_downsampling_kernel(const struct dt_interpolation *itor, int *taps, int *first,
674                                                float *kernel, float *norm, float outoinratio, int xout)
675 {
676   if(darktable.codepath.OPENMP_SIMD)
677     return compute_downsampling_kernel_plain(itor, taps, first, kernel, norm, outoinratio, xout);
678 #if defined(__SSE2__)
679   else if(darktable.codepath.SSE2)
680     return compute_downsampling_kernel_sse(itor, taps, first, kernel, norm, outoinratio, xout);
681 #endif
682   else
683     dt_unreachable_codepath();
684 }
685 
686 /* --------------------------------------------------------------------------
687  * Sample interpolation function (see usage in iop/lens.c and iop/clipping.c)
688  * ------------------------------------------------------------------------*/
689 
690 #define MAX_KERNEL_REQ ((2 * (MAX_HALF_FILTER_WIDTH) + 3) & (~3))
691 
dt_interpolation_compute_sample(const struct dt_interpolation * itor,const float * in,const float x,const float y,const int width,const int height,const int samplestride,const int linestride)692 float dt_interpolation_compute_sample(const struct dt_interpolation *itor, const float *in, const float x,
693                                       const float y, const int width, const int height,
694                                       const int samplestride, const int linestride)
695 {
696   assert(itor->width < (MAX_HALF_FILTER_WIDTH + 1));
697 
698   float kernelh[MAX_KERNEL_REQ] __attribute__((aligned(SSE_ALIGNMENT)));
699   float kernelv[MAX_KERNEL_REQ] __attribute__((aligned(SSE_ALIGNMENT)));
700 
701   // Compute both horizontal and vertical kernels
702   float normh;
703   float normv;
704   compute_upsampling_kernel(itor, kernelh, &normh, NULL, x);
705   compute_upsampling_kernel(itor, kernelv, &normv, NULL, y);
706 
707   int ix = (int)x;
708   int iy = (int)y;
709 
710   /* Now 2 cases, the pixel + filter width goes outside the image
711    * in that case we have to use index clipping to keep all reads
712    * in the input image (slow path) or we are sure it won't fall
713    * outside and can do more simple code */
714   float r;
715   if(ix >= (itor->width - 1) && iy >= (itor->width - 1) && ix < (width - itor->width)
716      && iy < (height - itor->width))
717   {
718     // Inside image boundary case
719 
720     // Go to top left pixel
721     in = (float *)in + linestride * iy + ix * samplestride;
722     in = in - (itor->width - 1) * (samplestride + linestride);
723 
724     // Apply the kernel
725     float s = 0.f;
726     for(int i = 0; i < 2 * itor->width; i++)
727     {
728       float h = 0.0f;
729       for(int j = 0; j < 2 * itor->width; j++)
730       {
731         h += kernelh[j] * in[j * samplestride];
732       }
733       s += kernelv[i] * h;
734       in += linestride;
735     }
736     r = s / (normh * normv);
737   }
738   else if(ix >= 0 && iy >= 0 && ix < width && iy < height)
739   {
740     // At least a valid coordinate
741 
742     // Point to the upper left pixel index wise
743     iy -= itor->width - 1;
744     ix -= itor->width - 1;
745 
746     static const enum border_mode bordermode = INTERPOLATION_BORDER_MODE;
747     assert(bordermode != BORDER_CLAMP); // XXX in clamp mode, norms would be wrong
748 
749     int xtap_first;
750     int xtap_last;
751     prepare_tap_boundaries(&xtap_first, &xtap_last, bordermode, 2 * itor->width, ix, width);
752 
753     int ytap_first;
754     int ytap_last;
755     prepare_tap_boundaries(&ytap_first, &ytap_last, bordermode, 2 * itor->width, iy, height);
756 
757     // Apply the kernel
758     float s = 0.f;
759     for(int i = ytap_first; i < ytap_last; i++)
760     {
761       const int clip_y = clip(iy + i, 0, height - 1, bordermode);
762       float h = 0.0f;
763       for(int j = xtap_first; j < xtap_last; j++)
764       {
765         const int clip_x = clip(ix + j, 0, width - 1, bordermode);
766         const float *ipixel = in + clip_y * linestride + clip_x * samplestride;
767         h += kernelh[j] * ipixel[0];
768       }
769       s += kernelv[i] * h;
770     }
771 
772     r = s / (normh * normv);
773   }
774   else
775   {
776     // invalid coordinate
777     r = 0.0f;
778   }
779   return r;
780 }
781 
782 /* --------------------------------------------------------------------------
783  * Pixel interpolation function (see usage in iop/lens.c and iop/clipping.c)
784  * ------------------------------------------------------------------------*/
785 
dt_interpolation_compute_pixel4c_plain(const struct dt_interpolation * itor,const float * in,float * out,const float x,const float y,const int width,const int height,const int linestride)786 static void dt_interpolation_compute_pixel4c_plain(const struct dt_interpolation *itor, const float *in,
787                                                    float *out, const float x, const float y, const int width,
788                                                    const int height, const int linestride)
789 {
790   assert(itor->width < (MAX_HALF_FILTER_WIDTH + 1));
791 
792   // Quite a bit of space for kernels
793   float kernelh[MAX_KERNEL_REQ] __attribute__((aligned(SSE_ALIGNMENT)));
794   float kernelv[MAX_KERNEL_REQ] __attribute__((aligned(SSE_ALIGNMENT)));
795 
796   // Compute both horizontal and vertical kernels
797   float normh;
798   float normv;
799   compute_upsampling_kernel(itor, kernelh, &normh, NULL, x);
800   compute_upsampling_kernel(itor, kernelv, &normv, NULL, y);
801 
802   // Precompute the inverse of the filter norm for later use
803   const float oonorm = (1.f / (normh * normv));
804 
805   /* Now 2 cases, the pixel + filter width goes outside the image
806    * in that case we have to use index clipping to keep all reads
807    * in the input image (slow path) or we are sure it won't fall
808    * outside and can do more simple code */
809   int ix = (int)x;
810   int iy = (int)y;
811 
812   if(ix >= (itor->width - 1) && iy >= (itor->width - 1) && ix < (width - itor->width)
813      && iy < (height - itor->width))
814   {
815     // Inside image boundary case
816 
817     // Go to top left pixel
818     in = (float *)in + linestride * iy + ix * 4;
819     in = in - (itor->width - 1) * (4 + linestride);
820 
821     // Apply the kernel
822     dt_aligned_pixel_t pixel = { 0.0f, 0.0f, 0.0f, 0.0f };
823     for(int i = 0; i < 2 * itor->width; i++)
824     {
825       dt_aligned_pixel_t h = { 0.0f, 0.0f, 0.0f, 0.0f };
826       for(int j = 0; j < 2 * itor->width; j++)
827       {
828         for(int c = 0; c < 3; c++) h[c] += kernelh[j] * in[j * 4 + c];
829       }
830       for(int c = 0; c < 3; c++) pixel[c] += kernelv[i] * h[c];
831       in += linestride;
832     }
833 
834     for(int c = 0; c < 3; c++) out[c] = oonorm * pixel[c];
835   }
836   else if(ix >= 0 && iy >= 0 && ix < width && iy < height)
837   {
838     // At least a valid coordinate
839 
840     // Point to the upper left pixel index wise
841     iy -= itor->width - 1;
842     ix -= itor->width - 1;
843 
844     static const enum border_mode bordermode = INTERPOLATION_BORDER_MODE;
845     assert(bordermode != BORDER_CLAMP); // XXX in clamp mode, norms would be wrong
846 
847     int xtap_first;
848     int xtap_last;
849     prepare_tap_boundaries(&xtap_first, &xtap_last, bordermode, 2 * itor->width, ix, width);
850 
851     int ytap_first;
852     int ytap_last;
853     prepare_tap_boundaries(&ytap_first, &ytap_last, bordermode, 2 * itor->width, iy, height);
854 
855     // Apply the kernel
856     dt_aligned_pixel_t pixel = { 0.0f, 0.0f, 0.0f, 0.0f };
857     for(int i = ytap_first; i < ytap_last; i++)
858     {
859       const int clip_y = clip(iy + i, 0, height - 1, bordermode);
860       dt_aligned_pixel_t h = { 0.0f, 0.0f, 0.0f, 0.0f };
861       for(int j = xtap_first; j < xtap_last; j++)
862       {
863         const int clip_x = clip(ix + j, 0, width - 1, bordermode);
864         const float *ipixel = in + clip_y * linestride + clip_x * 4;
865         for(int c = 0; c < 3; c++) h[c] += kernelh[j] * ipixel[c];
866       }
867       for(int c = 0; c < 3; c++) pixel[c] += kernelv[i] * h[c];
868     }
869 
870     for(int c = 0; c < 3; c++) out[c] = oonorm * pixel[c];
871   }
872   else
873   {
874     for(int c = 0; c < 3; c++) out[c] = 0.0f;
875   }
876 }
877 
878 #if defined(__SSE2__)
dt_interpolation_compute_pixel4c_sse(const struct dt_interpolation * itor,const float * in,float * out,const float x,const float y,const int width,const int height,const int linestride)879 static void dt_interpolation_compute_pixel4c_sse(const struct dt_interpolation *itor, const float *in,
880                                                  float *out, const float x, const float y, const int width,
881                                                  const int height, const int linestride)
882 {
883   assert(itor->width < (MAX_HALF_FILTER_WIDTH + 1));
884 
885   // Quite a bit of space for kernels
886   float kernelh[MAX_KERNEL_REQ] __attribute__((aligned(SSE_ALIGNMENT)));
887   float kernelv[MAX_KERNEL_REQ] __attribute__((aligned(SSE_ALIGNMENT)));
888   __m128 vkernelh[2 * MAX_HALF_FILTER_WIDTH];
889   __m128 vkernelv[2 * MAX_HALF_FILTER_WIDTH];
890 
891   // Compute both horizontal and vertical kernels
892   float normh;
893   float normv;
894   compute_upsampling_kernel(itor, kernelh, &normh, NULL, x);
895   compute_upsampling_kernel(itor, kernelv, &normv, NULL, y);
896 
897   // We will process four components a time, duplicate the information
898   for(int i = 0; i < 2 * itor->width; i++)
899   {
900     vkernelh[i] = _mm_set_ps1(kernelh[i]);
901     vkernelv[i] = _mm_set_ps1(kernelv[i]);
902   }
903 
904   // Precompute the inverse of the filter norm for later use
905   const __m128 oonorm = _mm_set_ps1(1.f / (normh * normv));
906 
907   /* Now 2 cases, the pixel + filter width goes outside the image
908    * in that case we have to use index clipping to keep all reads
909    * in the input image (slow path) or we are sure it won't fall
910    * outside and can do more simple code */
911   int ix = (int)x;
912   int iy = (int)y;
913 
914   if(ix >= (itor->width - 1) && iy >= (itor->width - 1) && ix < (width - itor->width)
915      && iy < (height - itor->width))
916   {
917     // Inside image boundary case
918 
919     // Go to top left pixel
920     in = (float *)in + linestride * iy + ix * 4;
921     in = in - (itor->width - 1) * (4 + linestride);
922 
923     // Apply the kernel
924     __m128 pixel = _mm_setzero_ps();
925     for(int i = 0; i < 2 * itor->width; i++)
926     {
927       __m128 h = _mm_setzero_ps();
928       for(int j = 0; j < 2 * itor->width; j++)
929       {
930         h = _mm_add_ps(h, _mm_mul_ps(vkernelh[j], *(__m128 *)&in[j * 4]));
931       }
932       pixel = _mm_add_ps(pixel, _mm_mul_ps(vkernelv[i], h));
933       in += linestride;
934     }
935 
936     *(__m128 *)out = _mm_mul_ps(pixel, oonorm);
937   }
938   else if(ix >= 0 && iy >= 0 && ix < width && iy < height)
939   {
940     // At least a valid coordinate
941 
942     // Point to the upper left pixel index wise
943     iy -= itor->width - 1;
944     ix -= itor->width - 1;
945 
946     static const enum border_mode bordermode = INTERPOLATION_BORDER_MODE;
947     assert(bordermode != BORDER_CLAMP); // XXX in clamp mode, norms would be wrong
948 
949     int xtap_first;
950     int xtap_last;
951     prepare_tap_boundaries(&xtap_first, &xtap_last, bordermode, 2 * itor->width, ix, width);
952 
953     int ytap_first;
954     int ytap_last;
955     prepare_tap_boundaries(&ytap_first, &ytap_last, bordermode, 2 * itor->width, iy, height);
956 
957     // Apply the kernel
958     __m128 pixel = _mm_setzero_ps();
959     for(int i = ytap_first; i < ytap_last; i++)
960     {
961       int clip_y = clip(iy + i, 0, height - 1, bordermode);
962       __m128 h = _mm_setzero_ps();
963       for(int j = xtap_first; j < xtap_last; j++)
964       {
965         const int clip_x = clip(ix + j, 0, width - 1, bordermode);
966         const float *ipixel = in + clip_y * linestride + clip_x * 4;
967         h = _mm_add_ps(h, _mm_mul_ps(vkernelh[j], *(__m128 *)ipixel));
968       }
969       pixel = _mm_add_ps(pixel, _mm_mul_ps(vkernelv[i], h));
970     }
971 
972     *(__m128 *)out = _mm_mul_ps(pixel, oonorm);
973   }
974   else
975   {
976     *(__m128 *)out = _mm_set_ps1(0.0f);
977   }
978 }
979 #endif
980 
dt_interpolation_compute_pixel4c(const struct dt_interpolation * itor,const float * in,float * out,const float x,const float y,const int width,const int height,const int linestride)981 void dt_interpolation_compute_pixel4c(const struct dt_interpolation *itor, const float *in, float *out,
982                                       const float x, const float y, const int width, const int height,
983                                       const int linestride)
984 {
985   if(darktable.codepath.OPENMP_SIMD)
986     return dt_interpolation_compute_pixel4c_plain(itor, in, out, x, y, width, height, linestride);
987 #if defined(__SSE2__)
988   else if(darktable.codepath.SSE2)
989     return dt_interpolation_compute_pixel4c_sse(itor, in, out, x, y, width, height, linestride);
990 #endif
991   else
992     dt_unreachable_codepath();
993 }
994 
dt_interpolation_compute_pixel1c_plain(const struct dt_interpolation * itor,const float * in,float * out,const float x,const float y,const int width,const int height,const int linestride)995 static void dt_interpolation_compute_pixel1c_plain(const struct dt_interpolation *itor, const float *in,
996                                                    float *out, const float x, const float y, const int width,
997                                                    const int height, const int linestride)
998 {
999   assert(itor->width < (MAX_HALF_FILTER_WIDTH + 1));
1000 
1001   // Quite a bit of space for kernels
1002   float kernelh[MAX_KERNEL_REQ] __attribute__((aligned(SSE_ALIGNMENT)));
1003   float kernelv[MAX_KERNEL_REQ] __attribute__((aligned(SSE_ALIGNMENT)));
1004 
1005   // Compute both horizontal and vertical kernels
1006   float normh;
1007   float normv;
1008   compute_upsampling_kernel(itor, kernelh, &normh, NULL, x);
1009   compute_upsampling_kernel(itor, kernelv, &normv, NULL, y);
1010 
1011   // Precompute the inverse of the filter norm for later use
1012   const float oonorm = (1.f / (normh * normv));
1013 
1014   /* Now 2 cases, the pixel + filter width goes outside the image
1015    * in that case we have to use index clipping to keep all reads
1016    * in the input image (slow path) or we are sure it won't fall
1017    * outside and can do more simple code */
1018   int ix = (int)x;
1019   int iy = (int)y;
1020 
1021   if(ix >= (itor->width - 1) && iy >= (itor->width - 1) && ix < (width - itor->width)
1022     && iy < (height - itor->width))
1023   {
1024     // Inside image boundary case
1025 
1026     // Go to top left pixel
1027     in = (float *)in + linestride * iy + ix;
1028     in = in - (itor->width - 1) * (1 + linestride);
1029 
1030     // Apply the kernel
1031     float pixel = 0.0f;
1032     for(int i = 0; i < 2 * itor->width; i++)
1033     {
1034       float h = 0.0f;
1035       for(int j = 0; j < 2 * itor->width; j++)
1036       {
1037         h += kernelh[j] * in[j];
1038       }
1039       pixel += kernelv[i] * h;
1040       in += linestride;
1041     }
1042 
1043     *out = oonorm * pixel;
1044   }
1045   else if(ix >= 0 && iy >= 0 && ix < width && iy < height)
1046   {
1047     // At least a valid coordinate
1048 
1049     // Point to the upper left pixel index wise
1050     iy -= itor->width - 1;
1051     ix -= itor->width - 1;
1052 
1053     static const enum border_mode bordermode = INTERPOLATION_BORDER_MODE;
1054     assert(bordermode != BORDER_CLAMP); // XXX in clamp mode, norms would be wrong
1055 
1056     int xtap_first;
1057     int xtap_last;
1058     prepare_tap_boundaries(&xtap_first, &xtap_last, bordermode, 2 * itor->width, ix, width);
1059 
1060     int ytap_first;
1061     int ytap_last;
1062     prepare_tap_boundaries(&ytap_first, &ytap_last, bordermode, 2 * itor->width, iy, height);
1063 
1064     // Apply the kernel
1065     float pixel = 0.0f;
1066     for(int i = ytap_first; i < ytap_last; i++)
1067     {
1068       const int clip_y = clip(iy + i, 0, height - 1, bordermode);
1069       float h = 0.0f;
1070       for(int j = xtap_first; j < xtap_last; j++)
1071       {
1072         const int clip_x = clip(ix + j, 0, width - 1, bordermode);
1073         const float *ipixel = in + clip_y * linestride + clip_x;
1074         h += kernelh[j] * *ipixel;
1075       }
1076       pixel += kernelv[i] * h;
1077     }
1078 
1079     *out = oonorm * pixel;
1080   }
1081   else
1082   {
1083     *out = 0.0f;
1084   }
1085 }
1086 
dt_interpolation_compute_pixel1c(const struct dt_interpolation * itor,const float * in,float * out,const float x,const float y,const int width,const int height,const int linestride)1087 void dt_interpolation_compute_pixel1c(const struct dt_interpolation *itor, const float *in, float *out,
1088                                       const float x, const float y, const int width, const int height,
1089                                       const int linestride)
1090 {
1091   return dt_interpolation_compute_pixel1c_plain(itor, in, out, x, y, width, height, linestride);
1092 }
1093 
1094 /* --------------------------------------------------------------------------
1095  * Interpolation factory
1096  * ------------------------------------------------------------------------*/
1097 
dt_interpolation_new(enum dt_interpolation_type type)1098 const struct dt_interpolation *dt_interpolation_new(enum dt_interpolation_type type)
1099 {
1100   const struct dt_interpolation *itor = NULL;
1101 
1102   if(type == DT_INTERPOLATION_USERPREF)
1103   {
1104     // Find user preferred interpolation method
1105     const char *uipref = dt_conf_get_string_const("plugins/lighttable/export/pixel_interpolator");
1106     for(int i = DT_INTERPOLATION_FIRST; uipref && i < DT_INTERPOLATION_LAST; i++)
1107     {
1108       if(!strcmp(uipref, dt_interpolator[i].name))
1109       {
1110         // Found the one
1111         itor = &dt_interpolator[i];
1112         break;
1113       }
1114     }
1115 
1116     /* In the case the search failed (!uipref or name not found),
1117      * prepare later search pass with default fallback */
1118     type = DT_INTERPOLATION_DEFAULT;
1119   }
1120   else if(type == DT_INTERPOLATION_USERPREF_WARP)
1121   {
1122     // Find user preferred interpolation method
1123     const char *uipref = dt_conf_get_string_const("plugins/lighttable/export/pixel_interpolator_warp");
1124     for(int i = DT_INTERPOLATION_FIRST; uipref && i < DT_INTERPOLATION_LAST; i++)
1125     {
1126       if(!strcmp(uipref, dt_interpolator[i].name))
1127       {
1128         // Found the one
1129         itor = &dt_interpolator[i];
1130         break;
1131       }
1132     }
1133 
1134     /* In the case the search failed (!uipref or name not found),
1135      * prepare later search pass with default fallback */
1136     type = DT_INTERPOLATION_DEFAULT_WARP;
1137   }
1138   if(!itor)
1139   {
1140     // Did not find the userpref one or we've been asked for a specific one
1141     for(int i = DT_INTERPOLATION_FIRST; i < DT_INTERPOLATION_LAST; i++)
1142     {
1143       if(dt_interpolator[i].id == type)
1144       {
1145         itor = &dt_interpolator[i];
1146         break;
1147       }
1148       if(dt_interpolator[i].id == DT_INTERPOLATION_DEFAULT)
1149       {
1150         itor = &dt_interpolator[i];
1151       }
1152     }
1153   }
1154 
1155   return itor;
1156 }
1157 
1158 /* --------------------------------------------------------------------------
1159  * Image resampling
1160  * ------------------------------------------------------------------------*/
1161 
1162 /** Prepares a 1D resampling plan
1163  *
1164  * This consists of the following information
1165  * <ul>
1166  * <li>A list of lengths that tell how many pixels are relevant for the
1167  *    next output</li>
1168  * <li>A list of required filter kernels</li>
1169  * <li>A list of sample indexes</li>
1170  * </ul>
1171  *
1172  * How to apply the resampling plan:
1173  * <ol>
1174  * <li>Pick a length from the length array</li>
1175  * <li>until length is reached
1176  *     <ol>
1177  *     <li>pick a kernel tap></li>
1178  *     <li>pick the relevant sample according to the picked index</li>
1179  *     <li>multiply them and accumulate</li>
1180  *     </ol>
1181  * </li>
1182  * <li>here goes a single output sample</li>
1183  * </ol>
1184  *
1185  * This until you reach the number of output pixels
1186  *
1187  * @param itor interpolator used to resample
1188  * @param in [in] Number of input samples
1189  * @param out [in] Number of output samples
1190  * @param plength [out] Array of lengths for each pixel filtering (number
1191  * of taps/indexes to use). This array mus be freed with dt_free_align() when you're
1192  * done with the plan.
1193  * @param pkernel [out] Array of filter kernel taps
1194  * @param pindex [out] Array of sample indexes to be used for applying each kernel tap
1195  * arrays of information
1196  * @param pmeta [out] Array of int triplets (length, kernel, index) telling where to start for an arbitrary
1197  * out position meta[3*out]
1198  * @return 0 for success, !0 for failure
1199  */
prepare_resampling_plan(const struct dt_interpolation * itor,int in,const int in_x0,int out,const int out_x0,float scale,int ** plength,float ** pkernel,int ** pindex,int ** pmeta)1200 static int prepare_resampling_plan(const struct dt_interpolation *itor, int in, const int in_x0, int out,
1201                                    const int out_x0, float scale, int **plength, float **pkernel,
1202                                    int **pindex, int **pmeta)
1203 {
1204   // Safe return values
1205   *plength = NULL;
1206   *pkernel = NULL;
1207   *pindex = NULL;
1208   if(pmeta)
1209   {
1210     *pmeta = NULL;
1211   }
1212 
1213   if(scale == 1.f)
1214   {
1215     // No resampling required
1216     return 0;
1217   }
1218 
1219   // Compute common upsampling/downsampling memory requirements
1220   int maxtapsapixel;
1221   if(scale > 1.f)
1222   {
1223     // Upscale... the easy one. The values are exact
1224     maxtapsapixel = 2 * itor->width;
1225   }
1226   else
1227   {
1228     // Downscale... going for worst case values memory wise
1229     maxtapsapixel = ceil_fast((float)2 * (float)itor->width / scale);
1230   }
1231 
1232   int nlengths = out;
1233   const int nindex = maxtapsapixel * out;
1234   const int nkernel = maxtapsapixel * out;
1235   const size_t lengthreq = increase_for_alignment(nlengths * sizeof(int), SSE_ALIGNMENT);
1236   const size_t indexreq = increase_for_alignment(nindex * sizeof(int), SSE_ALIGNMENT);
1237   const size_t kernelreq = increase_for_alignment(nkernel * sizeof(float), SSE_ALIGNMENT);
1238   const size_t scratchreq = maxtapsapixel * sizeof(float) + 4 * sizeof(float);
1239   // NB: because sse versions compute four taps a time
1240   const size_t metareq = pmeta ? 3 * sizeof(int) * out : 0;
1241 
1242   void *blob = NULL;
1243   const size_t totalreq = kernelreq + lengthreq + indexreq + scratchreq + metareq;
1244   blob = dt_alloc_align(SSE_ALIGNMENT, totalreq);
1245   if(!blob)
1246   {
1247     return 1;
1248   }
1249 
1250   int *lengths = (int *)blob;
1251   blob = (char *)blob + lengthreq;
1252   int *index = (int *)blob;
1253   blob = (char *)blob + indexreq;
1254   float *kernel = (float *)blob;
1255   blob = (char *)blob + kernelreq;
1256   float *scratchpad = scratchreq ? (float *)blob : NULL;
1257   blob = (char *)blob + scratchreq;
1258   int *meta = metareq ? (int *)blob : NULL;
1259 //   blob = (char *)blob + metareq;
1260 
1261   /* setting this as a const should help the compilers trim all unnecessary
1262    * codepaths */
1263   const enum border_mode bordermode = RESAMPLING_BORDER_MODE;
1264 
1265   /* Upscale and downscale differ in subtle points, getting rid of code
1266    * duplication might have been tricky and i prefer keeping the code
1267    * as straight as possible */
1268   if(scale > 1.f)
1269   {
1270     int kidx = 0;
1271     int iidx = 0;
1272     int lidx = 0;
1273     int midx = 0;
1274     for(int x = 0; x < out; x++)
1275     {
1276       if(meta)
1277       {
1278         meta[midx++] = lidx;
1279         meta[midx++] = kidx;
1280         meta[midx++] = iidx;
1281       }
1282 
1283       // Projected position in input samples
1284       float fx = (float)(out_x0 + x) / scale;
1285 
1286       // Compute the filter kernel at that position
1287       int first;
1288       compute_upsampling_kernel(itor, scratchpad, NULL, &first, fx);
1289 
1290       /* Check lower and higher bound pixel index and skip as many pixels as
1291        * necessary to fall into range */
1292       int tap_first;
1293       int tap_last;
1294       prepare_tap_boundaries(&tap_first, &tap_last, bordermode, 2 * itor->width, first, in);
1295 
1296       // Track number of taps that will be used
1297       lengths[lidx++] = tap_last - tap_first;
1298 
1299       // Precompute the inverse of the norm
1300       float norm = 0.f;
1301       for(int tap = tap_first; tap < tap_last; tap++)
1302       {
1303         norm += scratchpad[tap];
1304       }
1305       norm = 1.f / norm;
1306 
1307       /* Unlike single pixel or single sample code, here it's interesting to
1308        * precompute the normalized filter kernel as this will avoid dividing
1309        * by the norm for all processed samples/pixels
1310        * NB: use the same loop to put in place the index list */
1311       first += tap_first;
1312       for(int tap = tap_first; tap < tap_last; tap++)
1313       {
1314         kernel[kidx++] = scratchpad[tap] * norm;
1315         index[iidx++] = clip(first++, 0, in - 1, bordermode);
1316       }
1317     }
1318   }
1319   else
1320   {
1321     int kidx = 0;
1322     int iidx = 0;
1323     int lidx = 0;
1324     int midx = 0;
1325     for(int x = 0; x < out; x++)
1326     {
1327       if(meta)
1328       {
1329         meta[midx++] = lidx;
1330         meta[midx++] = kidx;
1331         meta[midx++] = iidx;
1332       }
1333 
1334       // Compute downsampling kernel centered on output position
1335       int taps;
1336       int first;
1337       compute_downsampling_kernel(itor, &taps, &first, scratchpad, NULL, scale, out_x0 + x);
1338 
1339       /* Check lower and higher bound pixel index and skip as many pixels as
1340        * necessary to fall into range */
1341       int tap_first;
1342       int tap_last;
1343       prepare_tap_boundaries(&tap_first, &tap_last, bordermode, taps, first, in);
1344 
1345       // Track number of taps that will be used
1346       lengths[lidx++] = tap_last - tap_first;
1347 
1348       // Precompute the inverse of the norm
1349       float norm = 0.f;
1350       for(int tap = tap_first; tap < tap_last; tap++)
1351       {
1352         norm += scratchpad[tap];
1353       }
1354       norm = 1.f / norm;
1355 
1356       /* Unlike single pixel or single sample code, here it's interesting to
1357        * precompute the normalized filter kernel as this will avoid dividing
1358        * by the norm for all processed samples/pixels
1359        * NB: use the same loop to put in place the index list */
1360       first += tap_first;
1361       for(int tap = tap_first; tap < tap_last; tap++)
1362       {
1363         kernel[kidx++] = scratchpad[tap] * norm;
1364         index[iidx++] = clip(first++, 0, in - 1, bordermode);
1365       }
1366     }
1367   }
1368 
1369   // Validate plan wrt caller
1370   *plength = lengths;
1371   *pindex = index;
1372   *pkernel = kernel;
1373   if(pmeta)
1374   {
1375     *pmeta = meta;
1376   }
1377 
1378   return 0;
1379 }
1380 
dt_interpolation_resample_plain(const struct dt_interpolation * itor,float * out,const dt_iop_roi_t * const roi_out,const int32_t out_stride,const float * const in,const dt_iop_roi_t * const roi_in,const int32_t in_stride)1381 static void dt_interpolation_resample_plain(const struct dt_interpolation *itor, float *out,
1382                                             const dt_iop_roi_t *const roi_out, const int32_t out_stride,
1383                                             const float *const in, const dt_iop_roi_t *const roi_in,
1384                                             const int32_t in_stride)
1385 {
1386   int *hindex = NULL;
1387   int *hlength = NULL;
1388   float *hkernel = NULL;
1389   int *vindex = NULL;
1390   int *vlength = NULL;
1391   float *vkernel = NULL;
1392   int *vmeta = NULL;
1393 
1394   const int32_t in_stride_floats = in_stride / sizeof(float);
1395   const int32_t out_stride_floats = out_stride / sizeof(float);
1396   int r;
1397 
1398   debug_info("resampling %p (%dx%d@%dx%d scale %f) -> %p (%dx%d@%dx%d scale %f)\n", in, roi_in->width,
1399              roi_in->height, roi_in->x, roi_in->y, roi_in->scale, out, roi_out->width, roi_out->height,
1400              roi_out->x, roi_out->y, roi_out->scale);
1401 
1402   // Fast code path for 1:1 copy, only cropping area can change
1403   if(roi_out->scale == 1.f)
1404   {
1405     const int x0 = roi_out->x * 4 * sizeof(float);
1406 #if DEBUG_RESAMPLING_TIMING
1407     int64_t ts_resampling = getts();
1408 #endif
1409 #ifdef _OPENMP
1410 #pragma omp parallel for default(none) \
1411     dt_omp_firstprivate(in, in_stride, out_stride, roi_out, x0) \
1412     shared(out)
1413 #endif
1414     for(int y = 0; y < roi_out->height; y++)
1415     {
1416       memcpy((char *)out + (size_t)out_stride * y,
1417              (char *)in + (size_t)in_stride * (y + roi_out->y) + x0,
1418              out_stride);
1419     }
1420 #if DEBUG_RESAMPLING_TIMING
1421     ts_resampling = getts() - ts_resampling;
1422     fprintf(stderr, "resampling %p plan:0us resampling:%" PRId64 "us\n", in, ts_resampling);
1423 #endif
1424     // All done, so easy case
1425     return;
1426   }
1427 
1428 // Generic non 1:1 case... much more complicated :D
1429 #if DEBUG_RESAMPLING_TIMING
1430   int64_t ts_plan = getts();
1431 #endif
1432 
1433   // Prepare resampling plans once and for all
1434   r = prepare_resampling_plan(itor, roi_in->width, roi_in->x, roi_out->width, roi_out->x, roi_out->scale,
1435                               &hlength, &hkernel, &hindex, NULL);
1436   if(r)
1437   {
1438     goto exit;
1439   }
1440 
1441   r = prepare_resampling_plan(itor, roi_in->height, roi_in->y, roi_out->height, roi_out->y, roi_out->scale,
1442                               &vlength, &vkernel, &vindex, &vmeta);
1443   if(r)
1444   {
1445     goto exit;
1446   }
1447 
1448 #if DEBUG_RESAMPLING_TIMING
1449   ts_plan = getts() - ts_plan;
1450 #endif
1451 
1452 #if DEBUG_RESAMPLING_TIMING
1453   int64_t ts_resampling = getts();
1454 #endif
1455 
1456 // Process each output line
1457 #ifdef _OPENMP
1458 #pragma omp parallel for default(none) \
1459   dt_omp_firstprivate(in, in_stride_floats, out_stride_floats, roi_out) \
1460   shared(out, hindex, hlength, hkernel, vindex, vlength, vkernel, vmeta)
1461 #endif
1462   for(int oy = 0; oy < roi_out->height; oy++)
1463   {
1464     // Initialize column resampling indexes
1465     int vlidx = vmeta[3 * oy + 0]; // V(ertical) L(ength) I(n)d(e)x
1466     int vkidx = vmeta[3 * oy + 1]; // V(ertical) K(ernel) I(n)d(e)x
1467     int viidx = vmeta[3 * oy + 2]; // V(ertical) I(ndex) I(n)d(e)x
1468 
1469     // Initialize row resampling indexes
1470     int hlidx = 0; // H(orizontal) L(ength) I(n)d(e)x
1471     int hkidx = 0; // H(orizontal) K(ernel) I(n)d(e)x
1472     int hiidx = 0; // H(orizontal) I(ndex) I(n)d(e)x
1473 
1474     // Number of lines contributing to the output line
1475     int vl = vlength[vlidx++]; // V(ertical) L(ength)
1476 
1477     // Process each output column
1478     for(int ox = 0; ox < roi_out->width; ox++)
1479     {
1480       debug_extra("output %p [% 4d % 4d]\n", out, ox, oy);
1481 
1482       // This will hold the resulting pixel
1483       dt_aligned_pixel_t vs = { 0.0f, 0.0f, 0.0f, 0.0f };
1484 
1485       // Number of horizontal samples contributing to the output
1486       int hl = hlength[hlidx++]; // H(orizontal) L(ength)
1487 
1488       for(int iy = 0; iy < vl; iy++)
1489       {
1490         // This is our input line
1491         size_t baseidx_vindex = (size_t)vindex[viidx++] * in_stride_floats;
1492 
1493         dt_aligned_pixel_t vhs = { 0.0f, 0.0f, 0.0f, 0.0f };
1494 
1495         for(int ix = 0; ix < hl; ix++)
1496         {
1497           // Apply the precomputed filter kernel
1498           const size_t baseidx = baseidx_vindex + (size_t)hindex[hiidx++] * 4;
1499           const float htap = hkernel[hkidx++];
1500           // Convince gcc 10 to vectorize
1501           dt_aligned_pixel_t tmp = { in[baseidx], in[baseidx+1], in[baseidx+2], in[baseidx+3] };
1502           for_each_channel(c, aligned(tmp,vhs:16)) vhs[c] += tmp[c] * htap;
1503         }
1504 
1505         // Accumulate contribution from this line
1506         const float vtap = vkernel[vkidx++];
1507         for_each_channel(c, aligned(vhs,vs:16)) vs[c] += vhs[c] * vtap;
1508 
1509         // Reset horizontal resampling context
1510         hkidx -= hl;
1511         hiidx -= hl;
1512       }
1513 
1514       // Output pixel is ready
1515       const size_t baseidx = (size_t)oy * out_stride_floats + (size_t)ox * 4;
1516 
1517       // Clip negative RGB that may be produced by Lanczos undershooting
1518       // Negative RGB are invalid values no matter the RGB space (light is positive)
1519       for_each_channel(c, aligned(vs:16)) out[baseidx + c] = fmaxf(vs[c], 0.f);
1520 
1521       // Reset vertical resampling context
1522       viidx -= vl;
1523       vkidx -= vl;
1524 
1525       // Progress in horizontal context
1526       hiidx += hl;
1527       hkidx += hl;
1528     }
1529   }
1530 
1531 #if DEBUG_RESAMPLING_TIMING
1532   ts_resampling = getts() - ts_resampling;
1533   fprintf(stderr, "resampling %p plan:%" PRId64 "us resampling:%" PRId64 "us\n", in, ts_plan, ts_resampling);
1534 #endif
1535 
1536 exit:
1537   /* Free the resampling plans. It's nasty to optimize allocs like that, but
1538    * it simplifies the code :-D. The length array is in fact the only memory
1539    * allocated. */
1540   dt_free_align(hlength);
1541   dt_free_align(vlength);
1542 }
1543 
1544 #if defined(__SSE2__)
dt_interpolation_resample_sse(const struct dt_interpolation * itor,float * out,const dt_iop_roi_t * const roi_out,const int32_t out_stride,const float * const in,const dt_iop_roi_t * const roi_in,const int32_t in_stride)1545 static void dt_interpolation_resample_sse(const struct dt_interpolation *itor, float *out,
1546                                           const dt_iop_roi_t *const roi_out, const int32_t out_stride,
1547                                           const float *const in, const dt_iop_roi_t *const roi_in,
1548                                           const int32_t in_stride)
1549 {
1550   int *hindex = NULL;
1551   int *hlength = NULL;
1552   float *hkernel = NULL;
1553   int *vindex = NULL;
1554   int *vlength = NULL;
1555   float *vkernel = NULL;
1556   int *vmeta = NULL;
1557 
1558   int r;
1559 
1560   debug_info("resampling %p (%dx%d@%dx%d scale %f) -> %p (%dx%d@%dx%d scale %f)\n", in, roi_in->width,
1561              roi_in->height, roi_in->x, roi_in->y, roi_in->scale, out, roi_out->width, roi_out->height,
1562              roi_out->x, roi_out->y, roi_out->scale);
1563 
1564   // Fast code path for 1:1 copy, only cropping area can change
1565   if(roi_out->scale == 1.f)
1566   {
1567     const int x0 = roi_out->x * 4 * sizeof(float);
1568 #if DEBUG_RESAMPLING_TIMING
1569     int64_t ts_resampling = getts();
1570 #endif
1571 #ifdef _OPENMP
1572 #pragma omp parallel for default(none) \
1573     dt_omp_firstprivate(in, in_stride, out_stride, roi_out, x0) \
1574     shared(out)
1575 #endif
1576     for(int y = 0; y < roi_out->height; y++)
1577     {
1578       float *i = (float *)((char *)in + (size_t)in_stride * (y + roi_out->y) + x0);
1579       float *o = (float *)((char *)out + (size_t)out_stride * y);
1580       memcpy(o, i, out_stride);
1581     }
1582 #if DEBUG_RESAMPLING_TIMING
1583     ts_resampling = getts() - ts_resampling;
1584     fprintf(stderr, "resampling %p plan:0us resampling:%" PRId64 "us\n", in, ts_resampling);
1585 #endif
1586     // All done, so easy case
1587     return;
1588   }
1589 
1590 // Generic non 1:1 case... much more complicated :D
1591 #if DEBUG_RESAMPLING_TIMING
1592   int64_t ts_plan = getts();
1593 #endif
1594 
1595   // Prepare resampling plans once and for all
1596   r = prepare_resampling_plan(itor, roi_in->width, roi_in->x, roi_out->width, roi_out->x, roi_out->scale,
1597                               &hlength, &hkernel, &hindex, NULL);
1598   if(r)
1599   {
1600     goto exit;
1601   }
1602 
1603   r = prepare_resampling_plan(itor, roi_in->height, roi_in->y, roi_out->height, roi_out->y, roi_out->scale,
1604                               &vlength, &vkernel, &vindex, &vmeta);
1605   if(r)
1606   {
1607     goto exit;
1608   }
1609 
1610 #if DEBUG_RESAMPLING_TIMING
1611   ts_plan = getts() - ts_plan;
1612 #endif
1613 
1614 #if DEBUG_RESAMPLING_TIMING
1615   int64_t ts_resampling = getts();
1616 #endif
1617 
1618 // Process each output line
1619 #ifdef _OPENMP
1620 #pragma omp parallel for default(none) \
1621   dt_omp_firstprivate(in, in_stride, out_stride, roi_out) \
1622   shared(out, hindex, hlength, hkernel, vindex, vlength, vkernel, vmeta)
1623 #endif
1624   for(int oy = 0; oy < roi_out->height; oy++)
1625   {
1626     // Initialize column resampling indexes
1627     int vlidx = vmeta[3 * oy + 0]; // V(ertical) L(ength) I(n)d(e)x
1628     int vkidx = vmeta[3 * oy + 1]; // V(ertical) K(ernel) I(n)d(e)x
1629     int viidx = vmeta[3 * oy + 2]; // V(ertical) I(ndex) I(n)d(e)x
1630 
1631     // Initialize row resampling indexes
1632     int hlidx = 0; // H(orizontal) L(ength) I(n)d(e)x
1633     int hkidx = 0; // H(orizontal) K(ernel) I(n)d(e)x
1634     int hiidx = 0; // H(orizontal) I(ndex) I(n)d(e)x
1635 
1636     // Number of lines contributing to the output line
1637     int vl = vlength[vlidx++]; // V(ertical) L(ength)
1638 
1639     // Process each output column
1640     for(int ox = 0; ox < roi_out->width; ox++)
1641     {
1642       debug_extra("output %p [% 4d % 4d]\n", out, ox, oy);
1643 
1644       // This will hold the resulting pixel
1645       __m128 vs = _mm_setzero_ps();
1646 
1647       // Number of horizontal samples contributing to the output
1648       const int hl = hlength[hlidx++]; // H(orizontal) L(ength)
1649 
1650       for(int iy = 0; iy < vl; iy++)
1651       {
1652         // This is our input line
1653         const float *i = (float *)((char *)in + (size_t)in_stride * vindex[viidx++]);
1654 
1655         __m128 vhs = _mm_setzero_ps();
1656 
1657         for(int ix = 0; ix < hl; ix++)
1658         {
1659           // Apply the precomputed filter kernel
1660           const size_t baseidx = (size_t)hindex[hiidx++] * 4;
1661           const float htap = hkernel[hkidx++];
1662           const __m128 vhtap = _mm_set_ps1(htap);
1663           vhs = _mm_add_ps(vhs, _mm_mul_ps(*(__m128 *)&i[baseidx], vhtap));
1664         }
1665 
1666         // Accumulate contribution from this line
1667         const float vtap = vkernel[vkidx++];
1668         const __m128 vvtap = _mm_set_ps1(vtap);
1669         vs = _mm_add_ps(vs, _mm_mul_ps(vhs, vvtap));
1670 
1671         // Reset horizontal resampling context
1672         hkidx -= hl;
1673         hiidx -= hl;
1674       }
1675 
1676       // Output pixel is ready
1677       float *o = (float *)((char *)out + (size_t)oy * out_stride + (size_t)ox * 4 * sizeof(float));
1678 
1679       // Clip negative RGB that may be produced by Lanczos undershooting
1680       // Negative RGB are invalid values no matter the RGB space (light is positive)
1681       vs = _mm_max_ps(vs, _mm_setzero_ps());
1682       _mm_stream_ps(o, vs);
1683 
1684       // Reset vertical resampling context
1685       viidx -= vl;
1686       vkidx -= vl;
1687 
1688       // Progress in horizontal context
1689       hiidx += hl;
1690       hkidx += hl;
1691     }
1692   }
1693 
1694   _mm_sfence();
1695 
1696 #if DEBUG_RESAMPLING_TIMING
1697   ts_resampling = getts() - ts_resampling;
1698   fprintf(stderr, "resampling %p plan:%" PRId64 "us resampling:%" PRId64 "us\n", in, ts_plan, ts_resampling);
1699 #endif
1700 
1701 exit:
1702   /* Free the resampling plans. It's nasty to optimize allocs like that, but
1703    * it simplifies the code :-D. The length array is in fact the only memory
1704    * allocated. */
1705   dt_free_align(hlength);
1706   dt_free_align(vlength);
1707 }
1708 #endif
1709 
1710 /** Applies resampling (re-scaling) on *full* input and output buffers.
1711  *  roi_in and roi_out define the part of the buffers that is affected.
1712  */
dt_interpolation_resample(const struct dt_interpolation * itor,float * out,const dt_iop_roi_t * const roi_out,const int32_t out_stride,const float * const in,const dt_iop_roi_t * const roi_in,const int32_t in_stride)1713 void dt_interpolation_resample(const struct dt_interpolation *itor, float *out,
1714                                const dt_iop_roi_t *const roi_out, const int32_t out_stride,
1715                                const float *const in, const dt_iop_roi_t *const roi_in,
1716                                const int32_t in_stride)
1717 {
1718   if(darktable.codepath.OPENMP_SIMD)
1719     return dt_interpolation_resample_plain(itor, out, roi_out, out_stride, in, roi_in, in_stride);
1720 #if defined(__SSE2__)
1721   else if(darktable.codepath.SSE2)
1722     return dt_interpolation_resample_sse(itor, out, roi_out, out_stride, in, roi_in, in_stride);
1723 #endif
1724   else
1725     dt_unreachable_codepath();
1726 }
1727 
1728 /** Applies resampling (re-scaling) on a specific region-of-interest of an image. The input
1729  *  and output buffers hold exactly those roi's. roi_in and roi_out define the relative
1730  *  positions of the roi's within the full input and output image, respectively.
1731  */
dt_interpolation_resample_roi(const struct dt_interpolation * itor,float * out,const dt_iop_roi_t * const roi_out,const int32_t out_stride,const float * const in,const dt_iop_roi_t * const roi_in,const int32_t in_stride)1732 void dt_interpolation_resample_roi(const struct dt_interpolation *itor, float *out,
1733                                    const dt_iop_roi_t *const roi_out, const int32_t out_stride,
1734                                    const float *const in, const dt_iop_roi_t *const roi_in,
1735                                    const int32_t in_stride)
1736 {
1737   dt_iop_roi_t oroi = *roi_out;
1738   oroi.x = oroi.y = 0;
1739 
1740   dt_iop_roi_t iroi = *roi_in;
1741   iroi.x = iroi.y = 0;
1742 
1743   dt_interpolation_resample(itor, out, &oroi, out_stride, in, &iroi, in_stride);
1744 }
1745 
1746 #ifdef HAVE_OPENCL
dt_interpolation_init_cl_global()1747 dt_interpolation_cl_global_t *dt_interpolation_init_cl_global()
1748 {
1749   dt_interpolation_cl_global_t *g
1750       = (dt_interpolation_cl_global_t *)malloc(sizeof(dt_interpolation_cl_global_t));
1751 
1752   const int program = 2; // basic.cl, from programs.conf
1753   g->kernel_interpolation_resample = dt_opencl_create_kernel(program, "interpolation_resample");
1754   return g;
1755 }
1756 
dt_interpolation_free_cl_global(dt_interpolation_cl_global_t * g)1757 void dt_interpolation_free_cl_global(dt_interpolation_cl_global_t *g)
1758 {
1759   if(!g) return;
1760   // destroy kernels
1761   dt_opencl_free_kernel(g->kernel_interpolation_resample);
1762   free(g);
1763 }
1764 
roundToNextPowerOfTwo(uint32_t x)1765 static uint32_t roundToNextPowerOfTwo(uint32_t x)
1766 {
1767   x--;
1768   x |= x >> 1;
1769   x |= x >> 2;
1770   x |= x >> 4;
1771   x |= x >> 8;
1772   x |= x >> 16;
1773   x++;
1774   return x;
1775 }
1776 
1777 /** Applies resampling (re-scaling) on *full* input and output buffers.
1778  *  roi_in and roi_out define the part of the buffers that is affected.
1779  */
dt_interpolation_resample_cl(const struct dt_interpolation * itor,int devid,cl_mem dev_out,const dt_iop_roi_t * const roi_out,cl_mem dev_in,const dt_iop_roi_t * const roi_in)1780 int dt_interpolation_resample_cl(const struct dt_interpolation *itor, int devid, cl_mem dev_out,
1781                                  const dt_iop_roi_t *const roi_out, cl_mem dev_in,
1782                                  const dt_iop_roi_t *const roi_in)
1783 {
1784   int *hindex = NULL;
1785   int *hlength = NULL;
1786   float *hkernel = NULL;
1787   int *hmeta = NULL;
1788   int *vindex = NULL;
1789   int *vlength = NULL;
1790   float *vkernel = NULL;
1791   int *vmeta = NULL;
1792 
1793   int r;
1794   cl_int err = -999;
1795 
1796   cl_mem dev_hindex = NULL;
1797   cl_mem dev_hlength = NULL;
1798   cl_mem dev_hkernel = NULL;
1799   cl_mem dev_hmeta = NULL;
1800   cl_mem dev_vindex = NULL;
1801   cl_mem dev_vlength = NULL;
1802   cl_mem dev_vkernel = NULL;
1803   cl_mem dev_vmeta = NULL;
1804 
1805   debug_info("resampling_cl %p (%dx%d@%dx%d scale %f) -> %p (%dx%d@%dx%d scale %f)\n", (void *)dev_in,
1806              roi_in->width, roi_in->height, roi_in->x, roi_in->y, roi_in->scale, (void *)dev_out,
1807              roi_out->width, roi_out->height, roi_out->x, roi_out->y, roi_out->scale);
1808 
1809   // Fast code path for 1:1 copy, only cropping area can change
1810   if(roi_out->scale == 1.f)
1811   {
1812 #if DEBUG_RESAMPLING_TIMING
1813     int64_t ts_resampling = getts();
1814 #endif
1815     size_t iorigin[] = { roi_out->x, roi_out->y, 0 };
1816     size_t oorigin[] = { 0, 0, 0 };
1817     size_t region[] = { roi_out->width, roi_out->height, 1 };
1818 
1819     // copy original input from dev_in -> dev_out as starting point
1820     err = dt_opencl_enqueue_copy_image(devid, dev_in, dev_out, iorigin, oorigin, region);
1821     if(err != CL_SUCCESS) goto error;
1822 
1823 #if DEBUG_RESAMPLING_TIMING
1824     ts_resampling = getts() - ts_resampling;
1825     fprintf(stderr, "resampling_cl %p plan:0us resampling:%" PRId64 "us\n", (void *)dev_in, ts_resampling);
1826 #endif
1827     // All done, so easy case
1828     return CL_SUCCESS;
1829   }
1830 
1831 // Generic non 1:1 case... much more complicated :D
1832 #if DEBUG_RESAMPLING_TIMING
1833   int64_t ts_plan = getts();
1834 #endif
1835 
1836   // Prepare resampling plans once and for all
1837   r = prepare_resampling_plan(itor, roi_in->width, roi_in->x, roi_out->width, roi_out->x, roi_out->scale,
1838                               &hlength, &hkernel, &hindex, &hmeta);
1839   if(r)
1840   {
1841     goto error;
1842   }
1843 
1844   r = prepare_resampling_plan(itor, roi_in->height, roi_in->y, roi_out->height, roi_out->y, roi_out->scale,
1845                               &vlength, &vkernel, &vindex, &vmeta);
1846   if(r)
1847   {
1848     goto error;
1849   }
1850 
1851   int hmaxtaps = -1, vmaxtaps = -1;
1852   for(int k = 0; k < roi_out->width; k++) hmaxtaps = MAX(hmaxtaps, hlength[k]);
1853   for(int k = 0; k < roi_out->height; k++) vmaxtaps = MAX(vmaxtaps, vlength[k]);
1854 
1855 #if DEBUG_RESAMPLING_TIMING
1856   ts_plan = getts() - ts_plan;
1857 #endif
1858 
1859 #if DEBUG_RESAMPLING_TIMING
1860   int64_t ts_resampling = getts();
1861 #endif
1862 
1863   // strategy: process image column-wise (local[0] = 1). For each row generate
1864   // a number of parallel work items each taking care of one horizontal convolution,
1865   // then sum over work items to do the vertical convolution
1866 
1867   const int kernel = darktable.opencl->interpolation->kernel_interpolation_resample;
1868   const int width = roi_out->width;
1869   const int height = roi_out->height;
1870 
1871   // make sure blocksize is not too large
1872   const int taps = roundToNextPowerOfTwo(vmaxtaps); // the number of work items per row rounded up to a power of 2
1873                                                     // (for quick recursive reduction)
1874   int vblocksize;
1875 
1876   dt_opencl_local_buffer_t locopt
1877     = (dt_opencl_local_buffer_t){ .xoffset = 0, .xfactor = 1, .yoffset = 0, .yfactor = 1,
1878                                   .cellsize = 4 * sizeof(float), .overhead = hmaxtaps * sizeof(float) + hmaxtaps * sizeof(int),
1879                                   .sizex = 1, .sizey = (1 << 16) * taps };
1880 
1881   if(dt_opencl_local_buffer_opt(devid, kernel, &locopt))
1882     vblocksize = locopt.sizey;
1883   else
1884     vblocksize = 1;
1885 
1886   if(vblocksize < taps)
1887   {
1888     // our strategy does not work: the vertical number of taps exceeds the vertical workgroupsize;
1889     // there is no point in continuing on the GPU - that would be way too slow; let's delegate the stuff to
1890     // the CPU then.
1891     dt_print(
1892         DT_DEBUG_OPENCL,
1893         "[opencl_resampling] resampling plan cannot efficiently be run on the GPU - fall back to CPU.\n");
1894     goto error;
1895   }
1896 
1897   size_t sizes[3] = { ROUNDUPWD(width), ROUNDUP(height * taps, vblocksize), 1 };
1898   size_t local[3] = { 1, vblocksize, 1 };
1899 
1900   // store resampling plan to device memory
1901   // hindex, vindex, hkernel, vkernel: (v|h)maxtaps might be too small, so store a bit more than needed
1902   dev_hindex = dt_opencl_copy_host_to_device_constant(devid, sizeof(int) * width * (hmaxtaps + 1), hindex);
1903   if(dev_hindex == NULL) goto error;
1904 
1905   dev_hlength = dt_opencl_copy_host_to_device_constant(devid, sizeof(int) * width, hlength);
1906   if(dev_hlength == NULL) goto error;
1907 
1908   dev_hkernel
1909       = dt_opencl_copy_host_to_device_constant(devid, sizeof(float) * width * (hmaxtaps + 1), hkernel);
1910   if(dev_hkernel == NULL) goto error;
1911 
1912   dev_hmeta = dt_opencl_copy_host_to_device_constant(devid, sizeof(int) * width * 3, hmeta);
1913   if(dev_hmeta == NULL) goto error;
1914 
1915   dev_vindex = dt_opencl_copy_host_to_device_constant(devid, sizeof(int) * height * (vmaxtaps + 1), vindex);
1916   if(dev_vindex == NULL) goto error;
1917 
1918   dev_vlength = dt_opencl_copy_host_to_device_constant(devid, sizeof(int) * height, vlength);
1919   if(dev_vlength == NULL) goto error;
1920 
1921   dev_vkernel
1922       = dt_opencl_copy_host_to_device_constant(devid, sizeof(float) * height * (vmaxtaps + 1), vkernel);
1923   if(dev_vkernel == NULL) goto error;
1924 
1925   dev_vmeta = dt_opencl_copy_host_to_device_constant(devid, sizeof(int) * height * 3, vmeta);
1926   if(dev_vmeta == NULL) goto error;
1927 
1928   dt_opencl_set_kernel_arg(devid, kernel, 0, sizeof(cl_mem), (void *)&dev_in);
1929   dt_opencl_set_kernel_arg(devid, kernel, 1, sizeof(cl_mem), (void *)&dev_out);
1930   dt_opencl_set_kernel_arg(devid, kernel, 2, sizeof(int), (void *)&width);
1931   dt_opencl_set_kernel_arg(devid, kernel, 3, sizeof(int), (void *)&height);
1932   dt_opencl_set_kernel_arg(devid, kernel, 4, sizeof(cl_mem), (void *)&dev_hmeta);
1933   dt_opencl_set_kernel_arg(devid, kernel, 5, sizeof(cl_mem), (void *)&dev_vmeta);
1934   dt_opencl_set_kernel_arg(devid, kernel, 6, sizeof(cl_mem), (void *)&dev_hlength);
1935   dt_opencl_set_kernel_arg(devid, kernel, 7, sizeof(cl_mem), (void *)&dev_vlength);
1936   dt_opencl_set_kernel_arg(devid, kernel, 8, sizeof(cl_mem), (void *)&dev_hindex);
1937   dt_opencl_set_kernel_arg(devid, kernel, 9, sizeof(cl_mem), (void *)&dev_vindex);
1938   dt_opencl_set_kernel_arg(devid, kernel, 10, sizeof(cl_mem), (void *)&dev_hkernel);
1939   dt_opencl_set_kernel_arg(devid, kernel, 11, sizeof(cl_mem), (void *)&dev_vkernel);
1940   dt_opencl_set_kernel_arg(devid, kernel, 12, sizeof(int), (void *)&hmaxtaps);
1941   dt_opencl_set_kernel_arg(devid, kernel, 13, sizeof(int), (void *)&taps);
1942   dt_opencl_set_kernel_arg(devid, kernel, 14, hmaxtaps * sizeof(float), NULL);
1943   dt_opencl_set_kernel_arg(devid, kernel, 15, hmaxtaps * sizeof(int), NULL);
1944   dt_opencl_set_kernel_arg(devid, kernel, 16, vblocksize * 4 * sizeof(float), NULL);
1945   err = dt_opencl_enqueue_kernel_2d_with_local(devid, kernel, sizes, local);
1946   if(err != CL_SUCCESS) goto error;
1947 
1948 #if DEBUG_RESAMPLING_TIMING
1949   ts_resampling = getts() - ts_resampling;
1950   fprintf(stderr, "resampling_cl %p plan:%" PRId64 "us resampling:%" PRId64 "us\n", (void *)dev_in, ts_plan,
1951           ts_resampling);
1952 #endif
1953 
1954   dt_opencl_release_mem_object(dev_hindex);
1955   dt_opencl_release_mem_object(dev_hlength);
1956   dt_opencl_release_mem_object(dev_hkernel);
1957   dt_opencl_release_mem_object(dev_hmeta);
1958   dt_opencl_release_mem_object(dev_vindex);
1959   dt_opencl_release_mem_object(dev_vlength);
1960   dt_opencl_release_mem_object(dev_vkernel);
1961   dt_opencl_release_mem_object(dev_vmeta);
1962   dt_free_align(hlength);
1963   dt_free_align(vlength);
1964   return CL_SUCCESS;
1965 
1966 error:
1967   dt_opencl_release_mem_object(dev_hindex);
1968   dt_opencl_release_mem_object(dev_hlength);
1969   dt_opencl_release_mem_object(dev_hkernel);
1970   dt_opencl_release_mem_object(dev_hmeta);
1971   dt_opencl_release_mem_object(dev_vindex);
1972   dt_opencl_release_mem_object(dev_vlength);
1973   dt_opencl_release_mem_object(dev_vkernel);
1974   dt_opencl_release_mem_object(dev_vmeta);
1975   dt_free_align(hlength);
1976   dt_free_align(vlength);
1977   dt_print(DT_DEBUG_OPENCL, "[opencl_resampling] couldn't enqueue kernel! %d\n", err);
1978   return err;
1979 }
1980 
1981 /** Applies resampling (re-scaling) on a specific region-of-interest of an image. The input
1982  *  and output buffers hold exactly those roi's. roi_in and roi_out define the relative
1983  *  positions of the roi's within the full input and output image, respectively.
1984  */
dt_interpolation_resample_roi_cl(const struct dt_interpolation * itor,int devid,cl_mem dev_out,const dt_iop_roi_t * const roi_out,cl_mem dev_in,const dt_iop_roi_t * const roi_in)1985 int dt_interpolation_resample_roi_cl(const struct dt_interpolation *itor, int devid, cl_mem dev_out,
1986                                      const dt_iop_roi_t *const roi_out, cl_mem dev_in,
1987                                      const dt_iop_roi_t *const roi_in)
1988 {
1989   dt_iop_roi_t oroi = *roi_out;
1990   oroi.x = oroi.y = 0;
1991 
1992   dt_iop_roi_t iroi = *roi_in;
1993   iroi.x = iroi.y = 0;
1994 
1995   return dt_interpolation_resample_cl(itor, devid, dev_out, &oroi, dev_in, &iroi);
1996 }
1997 #endif
1998 
dt_interpolation_resample_1c_plain(const struct dt_interpolation * itor,float * out,const dt_iop_roi_t * const roi_out,const int32_t out_stride,const float * const in,const dt_iop_roi_t * const roi_in,const int32_t in_stride)1999 static void dt_interpolation_resample_1c_plain(const struct dt_interpolation *itor, float *out,
2000                                                const dt_iop_roi_t *const roi_out, const int32_t out_stride,
2001                                                const float *const in, const dt_iop_roi_t *const roi_in,
2002                                                const int32_t in_stride)
2003 {
2004   int *hindex = NULL;
2005   int *hlength = NULL;
2006   float *hkernel = NULL;
2007   int *vindex = NULL;
2008   int *vlength = NULL;
2009   float *vkernel = NULL;
2010   int *vmeta = NULL;
2011 
2012   int r;
2013 
2014   debug_info("resampling %p (%dx%d@%dx%d scale %f) -> %p (%dx%d@%dx%d scale %f)\n", in, roi_in->width,
2015              roi_in->height, roi_in->x, roi_in->y, roi_in->scale, out, roi_out->width, roi_out->height,
2016              roi_out->x, roi_out->y, roi_out->scale);
2017 
2018   // Fast code path for 1:1 copy, only cropping area can change
2019   if(roi_out->scale == 1.f)
2020   {
2021     const int x0 = roi_out->x * sizeof(float);
2022 #if DEBUG_RESAMPLING_TIMING
2023     int64_t ts_resampling = getts();
2024 #endif
2025 #ifdef _OPENMP
2026 #pragma omp parallel for default(none) \
2027     dt_omp_firstprivate(in, in_stride, out_stride, roi_out, x0) \
2028     shared(out)
2029 #endif
2030     for(int y = 0; y < roi_out->height; y++)
2031     {
2032       float *i = (float *)((char *)in + (size_t)in_stride * (y + roi_out->y) + x0);
2033       float *o = (float *)((char *)out + (size_t)out_stride * y);
2034       memcpy(o, i, out_stride);
2035     }
2036 #if DEBUG_RESAMPLING_TIMING
2037     ts_resampling = getts() - ts_resampling;
2038     fprintf(stderr, "resampling %p plan:0us resampling:%" PRId64 "us\n", in, ts_resampling);
2039 #endif
2040     // All done, so easy case
2041     return;
2042   }
2043 
2044   // Generic non 1:1 case... much more complicated :D
2045 #if DEBUG_RESAMPLING_TIMING
2046   int64_t ts_plan = getts();
2047 #endif
2048 
2049   // Prepare resampling plans once and for all
2050   r = prepare_resampling_plan(itor, roi_in->width, roi_in->x, roi_out->width, roi_out->x, roi_out->scale,
2051                               &hlength, &hkernel, &hindex, NULL);
2052   if(r)
2053   {
2054     goto exit;
2055   }
2056 
2057   r = prepare_resampling_plan(itor, roi_in->height, roi_in->y, roi_out->height, roi_out->y, roi_out->scale,
2058                               &vlength, &vkernel, &vindex, &vmeta);
2059   if(r)
2060   {
2061     goto exit;
2062   }
2063 
2064 #if DEBUG_RESAMPLING_TIMING
2065   ts_plan = getts() - ts_plan;
2066 #endif
2067 
2068 #if DEBUG_RESAMPLING_TIMING
2069   int64_t ts_resampling = getts();
2070 #endif
2071 
2072   // Process each output line
2073 #ifdef _OPENMP
2074 #pragma omp parallel for default(none) \
2075   dt_omp_firstprivate(in, in_stride, out_stride, roi_out) \
2076   shared(out, hindex, hlength, hkernel, vindex, vlength, vkernel, vmeta)
2077 #endif
2078   for(int oy = 0; oy < roi_out->height; oy++)
2079   {
2080     // Initialize column resampling indexes
2081     int vlidx = vmeta[3 * oy + 0]; // V(ertical) L(ength) I(n)d(e)x
2082     int vkidx = vmeta[3 * oy + 1]; // V(ertical) K(ernel) I(n)d(e)x
2083     int viidx = vmeta[3 * oy + 2]; // V(ertical) I(ndex) I(n)d(e)x
2084 
2085     // Initialize row resampling indexes
2086     int hlidx = 0; // H(orizontal) L(ength) I(n)d(e)x
2087     int hkidx = 0; // H(orizontal) K(ernel) I(n)d(e)x
2088     int hiidx = 0; // H(orizontal) I(ndex) I(n)d(e)x
2089 
2090     // Number of lines contributing to the output line
2091     int vl = vlength[vlidx++]; // V(ertical) L(ength)
2092 
2093     // Process each output column
2094     for(int ox = 0; ox < roi_out->width; ox++)
2095     {
2096       debug_extra("output %p [% 4d % 4d]\n", out, ox, oy);
2097 
2098       // This will hold the resulting pixel
2099       float vs = 0.0f;
2100 
2101       // Number of horizontal samples contributing to the output
2102       const int hl = hlength[hlidx++]; // H(orizontal) L(ength)
2103 
2104       for(int iy = 0; iy < vl; iy++)
2105       {
2106         // This is our input line
2107         const float *i = (float *)((char *)in + (size_t)in_stride * vindex[viidx++]);
2108 
2109         float vhs = 0.0f;
2110 
2111         for(int ix = 0; ix < hl; ix++)
2112         {
2113           // Apply the precomputed filter kernel
2114           const size_t baseidx = (size_t)hindex[hiidx++];
2115           const float htap = hkernel[hkidx++];
2116           vhs += i[baseidx] * htap;
2117         }
2118 
2119         // Accumulate contribution from this line
2120         const float vtap = vkernel[vkidx++];
2121         vs += vhs * vtap;
2122 
2123         // Reset horizontal resampling context
2124         hkidx -= hl;
2125         hiidx -= hl;
2126       }
2127 
2128       // Output pixel is ready
2129       float *o = (float *)((char *)out + (size_t)oy * out_stride + (size_t)ox * sizeof(float));
2130       *o = vs;
2131 
2132       // Reset vertical resampling context
2133       viidx -= vl;
2134       vkidx -= vl;
2135 
2136       // Progress in horizontal context
2137       hiidx += hl;
2138       hkidx += hl;
2139     }
2140   }
2141 
2142 #if DEBUG_RESAMPLING_TIMING
2143   ts_resampling = getts() - ts_resampling;
2144   fprintf(stderr, "resampling %p plan:%" PRId64 "us resampling:%" PRId64 "us\n", in, ts_plan, ts_resampling);
2145 #endif
2146 
2147   exit:
2148   /* Free the resampling plans. It's nasty to optimize allocs like that, but
2149    * it simplifies the code :-D. The length array is in fact the only memory
2150    * allocated. */
2151   dt_free_align(hlength);
2152   dt_free_align(vlength);
2153 }
2154 
2155 /** Applies resampling (re-scaling) on *full* input and output buffers.
2156  *  roi_in and roi_out define the part of the buffers that is affected.
2157  */
dt_interpolation_resample_1c(const struct dt_interpolation * itor,float * out,const dt_iop_roi_t * const roi_out,const int32_t out_stride,const float * const in,const dt_iop_roi_t * const roi_in,const int32_t in_stride)2158 void dt_interpolation_resample_1c(const struct dt_interpolation *itor, float *out,
2159                                   const dt_iop_roi_t *const roi_out, const int32_t out_stride,
2160                                   const float *const in, const dt_iop_roi_t *const roi_in,
2161                                   const int32_t in_stride)
2162 {
2163   return dt_interpolation_resample_1c_plain(itor, out, roi_out, out_stride, in, roi_in, in_stride);
2164 }
2165 
2166 /** Applies resampling (re-scaling) on a specific region-of-interest of an image. The input
2167  *  and output buffers hold exactly those roi's. roi_in and roi_out define the relative
2168  *  positions of the roi's within the full input and output image, respectively.
2169  */
dt_interpolation_resample_roi_1c(const struct dt_interpolation * itor,float * out,const dt_iop_roi_t * const roi_out,const int32_t out_stride,const float * const in,const dt_iop_roi_t * const roi_in,const int32_t in_stride)2170 void dt_interpolation_resample_roi_1c(const struct dt_interpolation *itor, float *out,
2171                                       const dt_iop_roi_t *const roi_out, const int32_t out_stride,
2172                                       const float *const in, const dt_iop_roi_t *const roi_in,
2173                                       const int32_t in_stride)
2174 {
2175   dt_iop_roi_t oroi = *roi_out;
2176   oroi.x = oroi.y = 0;
2177 
2178   dt_iop_roi_t iroi = *roi_in;
2179   iroi.x = iroi.y = 0;
2180 
2181   dt_interpolation_resample_1c(itor, out, &oroi, out_stride, in, &iroi, in_stride);
2182 }
2183 
2184 // modelines: These editor modelines have been set for all relevant files by tools/update_modelines.sh
2185 // vim: shiftwidth=2 expandtab tabstop=2 cindent
2186 // kate: tab-indents: off; indent-width 2; replace-tabs on; indent-mode cstyle; remove-trailing-spaces modified;
2187