1 /* --------------------------------------------------------------------------
2     This file is part of darktable,
3     Copyright (C) 2012-2021 darktable developers.
4 
5     darktable is free software: you can redistribute it and/or modify
6     it under the terms of the GNU General Public License as published by
7     the Free Software Foundation, either version 3 of the License, or
8     (at your option) any later version.
9 
10     darktable is distributed in the hope that it will be useful,
11     but WITHOUT ANY WARRANTY; without even the implied warranty of
12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13     GNU General Public License for more details.
14 
15     You should have received a copy of the GNU General Public License
16     along with darktable.  If not, see <http://www.gnu.org/licenses/>.
17 * ------------------------------------------------------------------------*/
18 
19 #include "common/interpolation.h"
20 #include "common/darktable.h"
21 #include "common/math.h"
22 #include "control/conf.h"
23 
24 #include <assert.h>
25 #include <glib.h>
26 #include <inttypes.h>
27 #include <stddef.h>
28 #include <stdint.h>
29 
30 /** Border extrapolation modes */
31 enum border_mode
32 {
33   BORDER_REPLICATE, // aaaa|abcdefg|gggg
34   BORDER_WRAP,      // defg|abcdefg|abcd
35   BORDER_MIRROR,    // edcb|abcdefg|fedc
36   BORDER_CLAMP      // ....|abcdefg|....
37 };
38 
39 /* Supporting them all might be overkill, let the compiler trim all
40  * unnecessary modes in clip for resampling codepath*/
41 #define RESAMPLING_BORDER_MODE BORDER_REPLICATE
42 
43 /* Supporting them all might be overkill, let the compiler trim all
44  * unnecessary modes in interpolation codepath */
45 #define INTERPOLATION_BORDER_MODE BORDER_MIRROR
46 
47 // Defines minimum alignment requirement for critical SIMD code
48 #define SSE_ALIGNMENT 64
49 
50 // Defines the maximum kernel half length
51 // !! Make sure to sync this with the filter array !!
52 #define MAX_HALF_FILTER_WIDTH 3
53 
54 // Add code for timing resampling function
55 #define DEBUG_RESAMPLING_TIMING 0
56 
57 // Add debug info messages to stderr
58 #define DEBUG_PRINT_INFO 0
59 
60 // Add *verbose* (like one msg per pixel out) debug message to stderr
61 #define DEBUG_PRINT_VERBOSE 0
62 
63 /* --------------------------------------------------------------------------
64  * Debug helpers
65  * ------------------------------------------------------------------------*/
66 
67 #if DEBUG_RESAMPLING_TIMING
68 #include <sys/time.h>
69 #endif
70 
71 #if DEBUG_PRINT_INFO
72 #define debug_info(...)                                                                                      \
73   do                                                                                                         \
74   {                                                                                                          \
75     fprintf(stderr, __VA_ARGS__);                                                                            \
76   } while(0)
77 #else
78 #define debug_info(...)
79 #endif
80 
81 #if DEBUG_PRINT_VERBOSE
82 #define debug_extra(...)                                                                                     \
83   do                                                                                                         \
84   {                                                                                                          \
85     fprintf(stderr, __VA_ARGS__);                                                                            \
86   } while(0)
87 #else
88 #define debug_extra(...)
89 #endif
90 
91 #if DEBUG_RESAMPLING_TIMING
getts()92 static inline int64_t getts()
93 {
94   struct timeval t;
95   gettimeofday(&t, NULL);
96   return t.tv_sec * INT64_C(1000000) + t.tv_usec;
97 }
98 #endif
99 
100 /* --------------------------------------------------------------------------
101  * Generic helpers
102  * ------------------------------------------------------------------------*/
103 
104 /** Clip into specified range
105  * @param idx index to filter
106  * @param length length of line
107  */
clip(int i,int min,int max,enum border_mode mode)108 static inline int clip(int i, int min, int max, enum border_mode mode)
109 {
110   switch(mode)
111   {
112     case BORDER_REPLICATE:
113       if(i < min)
114       {
115         i = min;
116       }
117       else if(i > max)
118       {
119         i = max;
120       }
121       break;
122     case BORDER_MIRROR:
123       if(i < min)
124       {
125         i = min - i;
126       }
127       else if(i > max)
128       {
129         i = 2 * max - i;
130       }
131       break;
132     case BORDER_WRAP:
133       if(i < min)
134       {
135         i = max - (min - i);
136       }
137       else if(i > max)
138       {
139         i = min + (i - max);
140       }
141       break;
142     case BORDER_CLAMP:
143       if(i < min || i > max)
144       {
145         /* Should not be used as is, we prevent -1 usage, filtering the taps
146          * we clip the sample indexes for. So understand this function is
147          * specific to its caller. */
148         i = -1;
149       }
150       break;
151   }
152 
153   return i;
154 }
155 
prepare_tap_boundaries(int * tap_first,int * tap_last,const enum border_mode mode,const int filterwidth,const int t,const int max)156 static inline void prepare_tap_boundaries(int *tap_first, int *tap_last, const enum border_mode mode,
157                                           const int filterwidth, const int t, const int max)
158 {
159   /* Check lower bound pixel index and skip as many pixels as necessary to
160    * fall into range */
161   *tap_first = 0;
162   if(mode == BORDER_CLAMP && t < 0)
163   {
164     *tap_first = -t;
165   }
166 
167   // Same for upper bound pixel
168   *tap_last = filterwidth;
169   if(mode == BORDER_CLAMP && t + filterwidth >= max)
170   {
171     *tap_last = max - t;
172   }
173 }
174 
175 /** Make sure an aligned chunk will not misalign its following chunk
176  * proposing an adapted length
177  *
178  * @param l Length required for current chunk
179  * @param align Required alignment for next chunk
180  *
181  * @return Required length for keeping alignment ok if chaining data chunks
182  */
increase_for_alignment(size_t l,size_t align)183 static inline size_t increase_for_alignment(size_t l, size_t align)
184 {
185   align -= 1;
186   return (l + align) & (~align);
187 }
188 
189 /* --------------------------------------------------------------------------
190  * Interpolation kernels
191  * ------------------------------------------------------------------------*/
192 
193 /* --------------------------------------------------------------------------
194  * Bilinear interpolation
195  * ------------------------------------------------------------------------*/
196 
bilinear(float width,float t)197 static inline float bilinear(float width, float t)
198 {
199   float r;
200   t = fabsf(t);
201   if(t > 1.f)
202   {
203     r = 0.f;
204   }
205   else
206   {
207     r = 1.f - t;
208   }
209   return r;
210 }
211 
212 #if defined(__SSE2__)
bilinear_sse(__m128 width,__m128 t)213 static inline __m128 bilinear_sse(__m128 width, __m128 t)
214 {
215   static const __m128 one = { 1.f, 1.f, 1.f, 1.f };
216   return _mm_sub_ps(one, _mm_abs_ps(t));
217 }
218 #endif
219 
220 /* --------------------------------------------------------------------------
221  * Bicubic interpolation
222  * ------------------------------------------------------------------------*/
223 
bicubic(float width,float t)224 static inline float bicubic(float width, float t)
225 {
226   float r;
227   t = fabsf(t);
228   if(t >= 2.f)
229   {
230     r = 0.f;
231   }
232   else if(t > 1.f && t < 2.f)
233   {
234     float t2 = t * t;
235     r = 0.5f * (t * (-t2 + 5.f * t - 8.f) + 4.f);
236   }
237   else
238   {
239     float t2 = t * t;
240     r = 0.5f * (t * (3.f * t2 - 5.f * t) + 2.f);
241   }
242   return r;
243 }
244 
245 #if defined(__SSE2__)
bicubic_sse(__m128 width,__m128 t)246 static inline __m128 bicubic_sse(__m128 width, __m128 t)
247 {
248   static const __m128 half = { .5f, .5f, .5f, .5f };
249   static const __m128 one = { 1.f, 1.f, 1.f, 1.f };
250   static const __m128 two = { 2.f, 2.f, 2.f, 2.f };
251   static const __m128 three = { 3.f, 3.f, 3.f, 3.f };
252   static const __m128 four = { 4.f, 4.f, 4.f, 4.f };
253   static const __m128 five = { 5.f, 5.f, 5.f, 5.f };
254   static const __m128 eight = { 8.f, 8.f, 8.f, 8.f };
255 
256   t = _mm_abs_ps(t);
257   const __m128 t2 = _mm_mul_ps(t, t);
258 
259   /* Compute 1 < t < 2 case:
260    * 0.5f*(t*(-t2 + 5.f*t - 8.f) + 4.f)
261    * half*(t*(mt2 + t5 - eight) + four)
262    * half*(t*(mt2 + t5_sub_8) + four)
263    * half*(t*(mt2_add_t5_sub_8) + four) */
264   const __m128 t5 = _mm_mul_ps(five, t);
265   const __m128 t5_sub_8 = _mm_sub_ps(t5, eight);
266   const __m128 zero = _mm_setzero_ps();
267   const __m128 mt2 = _mm_sub_ps(zero, t2);
268   const __m128 mt2_add_t5_sub_8 = _mm_add_ps(mt2, t5_sub_8);
269   const __m128 a = _mm_mul_ps(t, mt2_add_t5_sub_8);
270   const __m128 b = _mm_add_ps(a, four);
271   __m128 r12 = _mm_mul_ps(b, half);
272 
273   /* Compute case < 1
274    * 0.5f*(t*(3.f*t2 - 5.f*t) + 2.f) */
275   const __m128 t23 = _mm_mul_ps(three, t2);
276   const __m128 c = _mm_sub_ps(t23, t5);
277   const __m128 d = _mm_mul_ps(t, c);
278   const __m128 e = _mm_add_ps(d, two);
279   __m128 r01 = _mm_mul_ps(half, e);
280 
281   // Compute masks fr keeping correct components
282   const __m128 mask01 = _mm_cmple_ps(t, one);
283   const __m128 mask12 = _mm_cmpgt_ps(t, one);
284   r01 = _mm_and_ps(mask01, r01);
285   r12 = _mm_and_ps(mask12, r12);
286 
287 
288   return _mm_or_ps(r01, r12);
289 }
290 #endif
291 
292 /* --------------------------------------------------------------------------
293  * Lanczos interpolation
294  * ------------------------------------------------------------------------*/
295 
296 #define DT_LANCZOS_EPSILON (1e-9f)
297 
298 #if 0
299 // Reference version left here for ... documentation
300 static inline float
301 lanczos(float width, float t)
302 {
303   float r;
304 
305   if (t<-width || t>width)
306   {
307     r = 0.f;
308   }
309   else if (t>-DT_LANCZOS_EPSILON && t<DT_LANCZOS_EPSILON)
310   {
311     r = 1.f;
312   }
313   else
314   {
315     r = width*sinf(M_PI*t)*sinf(M_PI*t/width)/(M_PI*M_PI*t*t);
316   }
317   return r;
318 }
319 #endif
320 
321 /* Fast lanczos version, no calls to math.h functions, too accurate, too slow
322  *
323  * Based on a forum entry at
324  * http://devmaster.net/forums/topic/4648-fast-and-accurate-sinecosine/
325  *
326  * Apart the fast sine function approximation, the only trick is to compute:
327  * sin(pi.t) = sin(a.pi + r.pi) where t = a + r = trunc(t) + r
328  *           = sin(a.pi).cos(r.pi) + sin(r.pi).cos(a.pi)
329  *           =         0*cos(r.pi) + sin(r.pi).cos(a.pi)
330  *           = sign.sin(r.pi) where sign =  1 if the a is even
331  *                                       = -1 if the a is odd
332  *
333  * Of course we know that lanczos func will only be called for
334  * the range -width < t < width so we can additionally avoid the
335  * range check.  */
336 
lanczos(float width,float t)337 static inline float lanczos(float width, float t)
338 {
339   /* Compute a value for sinf(pi.t) in [-pi pi] for which the value will be
340    * correct */
341   int a = (int)t;
342   float r = t - (float)a;
343 
344   // Compute the correct sign for sinf(pi.r)
345   union
346   {
347     float f;
348     uint32_t i;
349   } sign;
350   sign.i = ((a & 1) << 31) | 0x3f800000;
351 
352   return (DT_LANCZOS_EPSILON + width * sign.f * sinf_fast(M_PI_F * r) * sinf_fast(M_PI_F * t / width))
353          / (DT_LANCZOS_EPSILON + M_PI_F * M_PI_F * t * t);
354 }
355 
356 #if defined(__SSE2__)
lanczos_sse2(__m128 width,__m128 t)357 static inline __m128 lanczos_sse2(__m128 width, __m128 t)
358 {
359   /* Compute a value for sinf(pi.t) in [-pi pi] for which the value will be
360    * correct */
361   __m128i a = _mm_cvtps_epi32(t);
362   __m128 r = _mm_sub_ps(t, _mm_cvtepi32_ps(a));
363 
364   // Compute the correct sign for sinf(pi.r)
365   static const uint32_t fone[] __attribute__((aligned(SSE_ALIGNMENT)))
366   = { 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 };
367   static const uint32_t ione[] __attribute__((aligned(SSE_ALIGNMENT))) = { 1, 1, 1, 1 };
368   static const __m128 eps
369       = { DT_LANCZOS_EPSILON, DT_LANCZOS_EPSILON, DT_LANCZOS_EPSILON, DT_LANCZOS_EPSILON };
370   static const __m128 pi = { M_PI, M_PI, M_PI, M_PI };
371   static const __m128 pi2 = { M_PI * M_PI, M_PI * M_PI, M_PI * M_PI, M_PI * M_PI };
372 
373   __m128i isign = _mm_and_si128(*(__m128i *)ione, a);
374   isign = _mm_slli_epi64(isign, 31);
375   isign = _mm_or_si128(*(__m128i *)fone, isign);
376   const __m128 fsign = _mm_castsi128_ps(isign);
377 
378   __m128 num = _mm_mul_ps(width, fsign);
379   num = _mm_mul_ps(num, sinf_fast_sse(_mm_mul_ps(pi, r)));
380   num = _mm_mul_ps(num, sinf_fast_sse(_mm_div_ps(_mm_mul_ps(pi, t), width)));
381   num = _mm_add_ps(eps, num);
382 
383   __m128 den = _mm_mul_ps(pi2, _mm_mul_ps(t, t));
384   den = _mm_add_ps(eps, den);
385 
386   return _mm_div_ps(num, den);
387 }
388 #endif
389 
390 #undef DT_LANCZOS_EPSILON
391 
392 /* --------------------------------------------------------------------------
393  * All our known interpolators
394  * ------------------------------------------------------------------------*/
395 
396 /* !!! !!! !!!
397  * Make sure MAX_HALF_FILTER_WIDTH is at least equal to the maximum width
398  * of this filter list. Otherwise bad things will happen
399  * !!! !!! !!!
400  */
401 static const struct dt_interpolation dt_interpolator[] = {
402   {.id = DT_INTERPOLATION_BILINEAR,
403    .name = "bilinear",
404    .width = 1,
405    .func = &bilinear,
406 #if defined(__SSE2__)
407    .funcsse = &bilinear_sse
408 #endif
409   },
410   {.id = DT_INTERPOLATION_BICUBIC,
411    .name = "bicubic",
412    .width = 2,
413    .func = &bicubic,
414 #if defined(__SSE2__)
415    .funcsse = &bicubic_sse
416 #endif
417   },
418   {.id = DT_INTERPOLATION_LANCZOS2,
419    .name = "lanczos2",
420    .width = 2,
421    .func = &lanczos,
422 #if defined(__SSE2__)
423    .funcsse = &lanczos_sse2
424 #endif
425   },
426   {.id = DT_INTERPOLATION_LANCZOS3,
427    .name = "lanczos3",
428    .width = 3,
429    .func = &lanczos,
430 #if defined(__SSE2__)
431    .funcsse = &lanczos_sse2
432 #endif
433   },
434 };
435 
436 /* --------------------------------------------------------------------------
437  * Kernel utility methods
438  * ------------------------------------------------------------------------*/
439 
440 /** Computes an upsampling filtering kernel
441  *
442  * @param itor [in] Interpolator used
443  * @param kernel [out] resulting itor->width*2 filter taps
444  * @param norm [out] Kernel norm
445  * @param first [out] first input sample index used
446  * @param t [in] Interpolated coordinate */
compute_upsampling_kernel_plain(const struct dt_interpolation * itor,float * kernel,float * norm,int * first,float t)447 static inline void compute_upsampling_kernel_plain(const struct dt_interpolation *itor, float *kernel,
448                                                    float *norm, int *first, float t)
449 {
450   int f = (int)t - itor->width + 1;
451   if(first)
452   {
453     *first = f;
454   }
455 
456   /* Find closest integer position and then offset that to match first
457    * filtered sample position */
458   t = t - (float)f;
459 
460   // Will hold kernel norm
461   float n = 0.f;
462 
463   // Compute the raw kernel
464   for(int i = 0; i < 2 * itor->width; i++)
465   {
466     float tap = itor->func((float)itor->width, t);
467     n += tap;
468     kernel[i] = tap;
469     t -= 1.f;
470   }
471   if(norm)
472   {
473     *norm = n;
474   }
475 }
476 
477 #if defined(__SSE2__)
478 /** Computes an upsampling filtering kernel (SSE version, four taps per inner loop)
479  *
480  * @param itor [in] Interpolator used
481  * @param kernel [out] resulting itor->width*2 filter taps (array must be at least (itor->width*2+3)/4*4
482  *floats long)
483  * @param norm [out] Kernel norm
484  * @param first [out] first input sample index used
485  * @param t [in] Interpolated coordinate
486  *
487  * @return kernel norm
488  */
compute_upsampling_kernel_sse(const struct dt_interpolation * itor,float * kernel,float * norm,int * first,float t)489 static inline void compute_upsampling_kernel_sse(const struct dt_interpolation *itor, float *kernel,
490                                                  float *norm, int *first, float t)
491 {
492   int f = (int)t - itor->width + 1;
493   if(first)
494   {
495     *first = f;
496   }
497 
498   /* Find closest integer position and then offset that to match first
499    * filtered sample position */
500   t = t - (float)f;
501 
502   // Prepare t vector to compute four values a loop
503   static const __m128 bootstrap = { 0.f, -1.f, -2.f, -3.f };
504   static const __m128 iter = { -4.f, -4.f, -4.f, -4.f };
505   __m128 vt = _mm_add_ps(_mm_set_ps1(t), bootstrap);
506   __m128 vw = _mm_set_ps1((float)itor->width);
507 
508   // Prepare counters (math kept stupid for understanding)
509   int i = 0;
510   const int runs = (2 * itor->width + 3) / 4;
511 
512   while(i < runs)
513   {
514     // Compute the values
515     const __m128 vr = itor->funcsse(vw, vt);
516 
517     // Save result
518     *(__m128 *)kernel = vr;
519 
520     // Prepare next iteration
521     vt = _mm_add_ps(vt, iter);
522     kernel += 4;
523     i++;
524   }
525 
526   // compute norm now
527   if(norm)
528   {
529     float n = 0.f;
530     i = 0;
531     kernel -= 4 * runs;
532     while(i < 2 * itor->width)
533     {
534       n += *kernel;
535       kernel++;
536       i++;
537     }
538     *norm = n;
539   }
540 }
541 #endif
542 
compute_upsampling_kernel(const struct dt_interpolation * itor,float * kernel,float * norm,int * first,float t)543 static inline void compute_upsampling_kernel(const struct dt_interpolation *itor, float *kernel, float *norm,
544                                              int *first, float t)
545 {
546   if(darktable.codepath.OPENMP_SIMD) return compute_upsampling_kernel_plain(itor, kernel, norm, first, t);
547 #if defined(__SSE2__)
548   else if(darktable.codepath.SSE2)
549     return compute_upsampling_kernel_sse(itor, kernel, norm, first, t);
550 #endif
551   else
552     dt_unreachable_codepath();
553 }
554 
555 /** Computes a downsampling filtering kernel
556  *
557  * @param itor [in] Interpolator used
558  * @param kernelsize [out] Number of taps
559  * @param kernel [out] resulting taps (at least itor->width/inoout elements for no overflow)
560  * @param norm [out] Kernel norm
561  * @param first [out] index of the first sample for which the kernel is to be applied
562  * @param outoinratio [in] "out samples" over "in samples" ratio
563  * @param xout [in] Output coordinate */
compute_downsampling_kernel_plain(const struct dt_interpolation * itor,int * taps,int * first,float * kernel,float * norm,float outoinratio,int xout)564 static inline void compute_downsampling_kernel_plain(const struct dt_interpolation *itor, int *taps,
565                                                      int *first, float *kernel, float *norm,
566                                                      float outoinratio, int xout)
567 {
568   // Keep this at hand
569   const float w = (float)itor->width;
570 
571   /* Compute the phase difference between output pixel and its
572    * input corresponding input pixel */
573   const float xin = ceil_fast(((float)xout - w) / outoinratio);
574   if(first)
575   {
576     *first = (int)xin;
577   }
578 
579   // Compute first interpolator parameter
580   float t = xin * outoinratio - (float)xout;
581 
582   // Will hold kernel norm
583   float n = 0.f;
584 
585   // Compute all filter taps
586   *taps = (int)((w - t) / outoinratio);
587   for(int i = 0; i < *taps; i++)
588   {
589     *kernel = itor->func(w, t);
590     n += *kernel;
591     t += outoinratio;
592     kernel++;
593   }
594 
595   if(norm)
596   {
597     *norm = n;
598   }
599 }
600 
601 
602 #if defined(__SSE2__)
603 /** Computes a downsampling filtering kernel (SSE version, four taps per inner loop iteration)
604  *
605  * @param itor [in] Interpolator used
606  * @param kernelsize [out] Number of taps
607  * @param kernel [out] resulting taps (at least itor->width/inoout + 4 elements for no overflow)
608  * @param norm [out] Kernel norm
609  * @param first [out] index of the first sample for which the kernel is to be applied
610  * @param outoinratio [in] "out samples" over "in samples" ratio
611  * @param xout [in] Output coordinate */
compute_downsampling_kernel_sse(const struct dt_interpolation * itor,int * taps,int * first,float * kernel,float * norm,float outoinratio,int xout)612 static inline void compute_downsampling_kernel_sse(const struct dt_interpolation *itor, int *taps, int *first,
613                                                    float *kernel, float *norm, float outoinratio, int xout)
614 {
615   // Keep this at hand
616   const float w = (float)itor->width;
617 
618   /* Compute the phase difference between output pixel and its
619    * input corresponding input pixel */
620   const float xin = ceil_fast(((float)xout - w) / outoinratio);
621   if(first)
622   {
623     *first = (int)xin;
624   }
625 
626   // Compute first interpolator parameter
627   float t = xin * outoinratio - (float)xout;
628 
629   // Compute all filter taps
630   *taps = (int)((w - t) / outoinratio);
631 
632   // Bootstrap vector t
633   static const __m128 bootstrap = { 0.f, 1.f, 2.f, 3.f };
634   const __m128 iter = _mm_set_ps1(4.f * outoinratio);
635   const __m128 vw = _mm_set_ps1(w);
636   __m128 vt = _mm_add_ps(_mm_set_ps1(t), _mm_mul_ps(_mm_set_ps1(outoinratio), bootstrap));
637 
638   // Prepare counters (math kept stupid for understanding)
639   int i = 0;
640   const int runs = (*taps + 3) / 4;
641 
642   while(i < runs)
643   {
644     // Compute the values
645     const __m128 vr = itor->funcsse(vw, vt);
646 
647     // Save result
648     *(__m128 *)kernel = vr;
649 
650     // Prepare next iteration
651     vt = _mm_add_ps(vt, iter);
652     kernel += 4;
653     i++;
654   }
655 
656   // compute norm now
657   if(norm)
658   {
659     float n = 0.f;
660     i = 0;
661     kernel -= 4 * runs;
662     while(i < *taps)
663     {
664       n += *kernel;
665       kernel++;
666       i++;
667     }
668     *norm = n;
669   }
670 }
671 #endif
672 
compute_downsampling_kernel(const struct dt_interpolation * itor,int * taps,int * first,float * kernel,float * norm,float outoinratio,int xout)673 static inline void compute_downsampling_kernel(const struct dt_interpolation *itor, int *taps, int *first,
674                                                float *kernel, float *norm, float outoinratio, int xout)
675 {
676   if(darktable.codepath.OPENMP_SIMD)
677     return compute_downsampling_kernel_plain(itor, taps, first, kernel, norm, outoinratio, xout);
678 #if defined(__SSE2__)
679   else if(darktable.codepath.SSE2)
680     return compute_downsampling_kernel_sse(itor, taps, first, kernel, norm, outoinratio, xout);
681 #endif
682   else
683     dt_unreachable_codepath();
684 }
685 
686 /* --------------------------------------------------------------------------
687  * Sample interpolation function (see usage in iop/lens.c and iop/clipping.c)
688  * ------------------------------------------------------------------------*/
689 
690 #define MAX_KERNEL_REQ ((2 * (MAX_HALF_FILTER_WIDTH) + 3) & (~3))
691 
dt_interpolation_compute_sample(const struct dt_interpolation * itor,const float * in,const float x,const float y,const int width,const int height,const int samplestride,const int linestride)692 float dt_interpolation_compute_sample(const struct dt_interpolation *itor, const float *in, const float x,
693                                       const float y, const int width, const int height,
694                                       const int samplestride, const int linestride)
695 {
696   assert(itor->width < (MAX_HALF_FILTER_WIDTH + 1));
697 
698   float kernelh[MAX_KERNEL_REQ] __attribute__((aligned(SSE_ALIGNMENT)));
699   float kernelv[MAX_KERNEL_REQ] __attribute__((aligned(SSE_ALIGNMENT)));
700 
701   // Compute both horizontal and vertical kernels
702   float normh;
703   float normv;
704   compute_upsampling_kernel(itor, kernelh, &normh, NULL, x);
705   compute_upsampling_kernel(itor, kernelv, &normv, NULL, y);
706 
707   int ix = (int)x;
708   int iy = (int)y;
709 
710   /* Now 2 cases, the pixel + filter width goes outside the image
711    * in that case we have to use index clipping to keep all reads
712    * in the input image (slow path) or we are sure it won't fall
713    * outside and can do more simple code */
714   float r;
715   if(ix >= (itor->width - 1) && iy >= (itor->width - 1) && ix < (width - itor->width)
716      && iy < (height - itor->width))
717   {
718     // Inside image boundary case
719 
720     // Go to top left pixel
721     in = (float *)in + linestride * iy + ix * samplestride;
722     in = in - (itor->width - 1) * (samplestride + linestride);
723 
724     // Apply the kernel
725     float s = 0.f;
726     for(int i = 0; i < 2 * itor->width; i++)
727     {
728       float h = 0.0f;
729       for(int j = 0; j < 2 * itor->width; j++)
730       {
731         h += kernelh[j] * in[j * samplestride];
732       }
733       s += kernelv[i] * h;
734       in += linestride;
735     }
736     r = s / (normh * normv);
737   }
738   else if(ix >= 0 && iy >= 0 && ix < width && iy < height)
739   {
740     // At least a valid coordinate
741 
742     // Point to the upper left pixel index wise
743     iy -= itor->width - 1;
744     ix -= itor->width - 1;
745 
746     static const enum border_mode bordermode = INTERPOLATION_BORDER_MODE;
747     assert(bordermode != BORDER_CLAMP); // XXX in clamp mode, norms would be wrong
748 
749     int xtap_first;
750     int xtap_last;
751     prepare_tap_boundaries(&xtap_first, &xtap_last, bordermode, 2 * itor->width, ix, width);
752 
753     int ytap_first;
754     int ytap_last;
755     prepare_tap_boundaries(&ytap_first, &ytap_last, bordermode, 2 * itor->width, iy, height);
756 
757     // Apply the kernel
758     float s = 0.f;
759     for(int i = ytap_first; i < ytap_last; i++)
760     {
761       const int clip_y = clip(iy + i, 0, height - 1, bordermode);
762       float h = 0.0f;
763       for(int j = xtap_first; j < xtap_last; j++)
764       {
765         const int clip_x = clip(ix + j, 0, width - 1, bordermode);
766         const float *ipixel = in + clip_y * linestride + clip_x * samplestride;
767         h += kernelh[j] * ipixel[0];
768       }
769       s += kernelv[i] * h;
770     }
771 
772     r = s / (normh * normv);
773   }
774   else
775   {
776     // invalid coordinate
777     r = 0.0f;
778   }
779   return r;
780 }
781 
782 /* --------------------------------------------------------------------------
783  * Pixel interpolation function (see usage in iop/lens.c and iop/clipping.c)
784  * ------------------------------------------------------------------------*/
785 
dt_interpolation_compute_pixel4c_plain(const struct dt_interpolation * itor,const float * in,float * out,const float x,const float y,const int width,const int height,const int linestride)786 static void dt_interpolation_compute_pixel4c_plain(const struct dt_interpolation *itor, const float *in,
787                                                    float *out, const float x, const float y, const int width,
788                                                    const int height, const int linestride)
789 {
790   assert(itor->width < (MAX_HALF_FILTER_WIDTH + 1));
791 
792   // Quite a bit of space for kernels
793   float kernelh[MAX_KERNEL_REQ] __attribute__((aligned(SSE_ALIGNMENT)));
794   float kernelv[MAX_KERNEL_REQ] __attribute__((aligned(SSE_ALIGNMENT)));
795 
796   // Compute both horizontal and vertical kernels
797   float normh;
798   float normv;
799   compute_upsampling_kernel(itor, kernelh, &normh, NULL, x);
800   compute_upsampling_kernel(itor, kernelv, &normv, NULL, y);
801 
802   // Precompute the inverse of the filter norm for later use
803   const float oonorm = (1.f / (normh * normv));
804 
805   /* Now 2 cases, the pixel + filter width goes outside the image
806    * in that case we have to use index clipping to keep all reads
807    * in the input image (slow path) or we are sure it won't fall
808    * outside and can do more simple code */
809   int ix = (int)x;
810   int iy = (int)y;
811 
812   if(ix >= (itor->width - 1) && iy >= (itor->width - 1) && ix < (width - itor->width)
813      && iy < (height - itor->width))
814   {
815     // Inside image boundary case
816 
817     // Go to top left pixel
818     in = (float *)in + linestride * iy + ix * 4;
819     in = in - (itor->width - 1) * (4 + linestride);
820 
821     // Apply the kernel
822     float pixel[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
823     for(int i = 0; i < 2 * itor->width; i++)
824     {
825       float h[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
826       for(int j = 0; j < 2 * itor->width; j++)
827       {
828         for(int c = 0; c < 3; c++) h[c] += kernelh[j] * in[j * 4 + c];
829       }
830       for(int c = 0; c < 3; c++) pixel[c] += kernelv[i] * h[c];
831       in += linestride;
832     }
833 
834     for(int c = 0; c < 3; c++) out[c] = oonorm * pixel[c];
835   }
836   else if(ix >= 0 && iy >= 0 && ix < width && iy < height)
837   {
838     // At least a valid coordinate
839 
840     // Point to the upper left pixel index wise
841     iy -= itor->width - 1;
842     ix -= itor->width - 1;
843 
844     static const enum border_mode bordermode = INTERPOLATION_BORDER_MODE;
845     assert(bordermode != BORDER_CLAMP); // XXX in clamp mode, norms would be wrong
846 
847     int xtap_first;
848     int xtap_last;
849     prepare_tap_boundaries(&xtap_first, &xtap_last, bordermode, 2 * itor->width, ix, width);
850 
851     int ytap_first;
852     int ytap_last;
853     prepare_tap_boundaries(&ytap_first, &ytap_last, bordermode, 2 * itor->width, iy, height);
854 
855     // Apply the kernel
856     float pixel[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
857     for(int i = ytap_first; i < ytap_last; i++)
858     {
859       const int clip_y = clip(iy + i, 0, height - 1, bordermode);
860       float h[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
861       for(int j = xtap_first; j < xtap_last; j++)
862       {
863         const int clip_x = clip(ix + j, 0, width - 1, bordermode);
864         const float *ipixel = in + clip_y * linestride + clip_x * 4;
865         for(int c = 0; c < 3; c++) h[c] += kernelh[j] * ipixel[c];
866       }
867       for(int c = 0; c < 3; c++) pixel[c] += kernelv[i] * h[c];
868     }
869 
870     for(int c = 0; c < 3; c++) out[c] = oonorm * pixel[c];
871   }
872   else
873   {
874     for(int c = 0; c < 3; c++) out[c] = 0.0f;
875   }
876 }
877 
878 #if defined(__SSE2__)
dt_interpolation_compute_pixel4c_sse(const struct dt_interpolation * itor,const float * in,float * out,const float x,const float y,const int width,const int height,const int linestride)879 static void dt_interpolation_compute_pixel4c_sse(const struct dt_interpolation *itor, const float *in,
880                                                  float *out, const float x, const float y, const int width,
881                                                  const int height, const int linestride)
882 {
883   assert(itor->width < (MAX_HALF_FILTER_WIDTH + 1));
884 
885   // Quite a bit of space for kernels
886   float kernelh[MAX_KERNEL_REQ] __attribute__((aligned(SSE_ALIGNMENT)));
887   float kernelv[MAX_KERNEL_REQ] __attribute__((aligned(SSE_ALIGNMENT)));
888   __m128 vkernelh[2 * MAX_HALF_FILTER_WIDTH];
889   __m128 vkernelv[2 * MAX_HALF_FILTER_WIDTH];
890 
891   // Compute both horizontal and vertical kernels
892   float normh;
893   float normv;
894   compute_upsampling_kernel(itor, kernelh, &normh, NULL, x);
895   compute_upsampling_kernel(itor, kernelv, &normv, NULL, y);
896 
897   // We will process four components a time, duplicate the information
898   for(int i = 0; i < 2 * itor->width; i++)
899   {
900     vkernelh[i] = _mm_set_ps1(kernelh[i]);
901     vkernelv[i] = _mm_set_ps1(kernelv[i]);
902   }
903 
904   // Precompute the inverse of the filter norm for later use
905   const __m128 oonorm = _mm_set_ps1(1.f / (normh * normv));
906 
907   /* Now 2 cases, the pixel + filter width goes outside the image
908    * in that case we have to use index clipping to keep all reads
909    * in the input image (slow path) or we are sure it won't fall
910    * outside and can do more simple code */
911   int ix = (int)x;
912   int iy = (int)y;
913 
914   if(ix >= (itor->width - 1) && iy >= (itor->width - 1) && ix < (width - itor->width)
915      && iy < (height - itor->width))
916   {
917     // Inside image boundary case
918 
919     // Go to top left pixel
920     in = (float *)in + linestride * iy + ix * 4;
921     in = in - (itor->width - 1) * (4 + linestride);
922 
923     // Apply the kernel
924     __m128 pixel = _mm_setzero_ps();
925     for(int i = 0; i < 2 * itor->width; i++)
926     {
927       __m128 h = _mm_setzero_ps();
928       for(int j = 0; j < 2 * itor->width; j++)
929       {
930         h = _mm_add_ps(h, _mm_mul_ps(vkernelh[j], *(__m128 *)&in[j * 4]));
931       }
932       pixel = _mm_add_ps(pixel, _mm_mul_ps(vkernelv[i], h));
933       in += linestride;
934     }
935 
936     *(__m128 *)out = _mm_mul_ps(pixel, oonorm);
937   }
938   else if(ix >= 0 && iy >= 0 && ix < width && iy < height)
939   {
940     // At least a valid coordinate
941 
942     // Point to the upper left pixel index wise
943     iy -= itor->width - 1;
944     ix -= itor->width - 1;
945 
946     static const enum border_mode bordermode = INTERPOLATION_BORDER_MODE;
947     assert(bordermode != BORDER_CLAMP); // XXX in clamp mode, norms would be wrong
948 
949     int xtap_first;
950     int xtap_last;
951     prepare_tap_boundaries(&xtap_first, &xtap_last, bordermode, 2 * itor->width, ix, width);
952 
953     int ytap_first;
954     int ytap_last;
955     prepare_tap_boundaries(&ytap_first, &ytap_last, bordermode, 2 * itor->width, iy, height);
956 
957     // Apply the kernel
958     __m128 pixel = _mm_setzero_ps();
959     for(int i = ytap_first; i < ytap_last; i++)
960     {
961       int clip_y = clip(iy + i, 0, height - 1, bordermode);
962       __m128 h = _mm_setzero_ps();
963       for(int j = xtap_first; j < xtap_last; j++)
964       {
965         const int clip_x = clip(ix + j, 0, width - 1, bordermode);
966         const float *ipixel = in + clip_y * linestride + clip_x * 4;
967         h = _mm_add_ps(h, _mm_mul_ps(vkernelh[j], *(__m128 *)ipixel));
968       }
969       pixel = _mm_add_ps(pixel, _mm_mul_ps(vkernelv[i], h));
970     }
971 
972     *(__m128 *)out = _mm_mul_ps(pixel, oonorm);
973   }
974   else
975   {
976     *(__m128 *)out = _mm_set_ps1(0.0f);
977   }
978 }
979 #endif
980 
dt_interpolation_compute_pixel4c(const struct dt_interpolation * itor,const float * in,float * out,const float x,const float y,const int width,const int height,const int linestride)981 void dt_interpolation_compute_pixel4c(const struct dt_interpolation *itor, const float *in, float *out,
982                                       const float x, const float y, const int width, const int height,
983                                       const int linestride)
984 {
985   if(darktable.codepath.OPENMP_SIMD)
986     return dt_interpolation_compute_pixel4c_plain(itor, in, out, x, y, width, height, linestride);
987 #if defined(__SSE2__)
988   else if(darktable.codepath.SSE2)
989     return dt_interpolation_compute_pixel4c_sse(itor, in, out, x, y, width, height, linestride);
990 #endif
991   else
992     dt_unreachable_codepath();
993 }
994 
dt_interpolation_compute_pixel1c_plain(const struct dt_interpolation * itor,const float * in,float * out,const float x,const float y,const int width,const int height,const int linestride)995 static void dt_interpolation_compute_pixel1c_plain(const struct dt_interpolation *itor, const float *in,
996                                                    float *out, const float x, const float y, const int width,
997                                                    const int height, const int linestride)
998 {
999   assert(itor->width < (MAX_HALF_FILTER_WIDTH + 1));
1000 
1001   // Quite a bit of space for kernels
1002   float kernelh[MAX_KERNEL_REQ] __attribute__((aligned(SSE_ALIGNMENT)));
1003   float kernelv[MAX_KERNEL_REQ] __attribute__((aligned(SSE_ALIGNMENT)));
1004 
1005   // Compute both horizontal and vertical kernels
1006   float normh;
1007   float normv;
1008   compute_upsampling_kernel(itor, kernelh, &normh, NULL, x);
1009   compute_upsampling_kernel(itor, kernelv, &normv, NULL, y);
1010 
1011   // Precompute the inverse of the filter norm for later use
1012   const float oonorm = (1.f / (normh * normv));
1013 
1014   /* Now 2 cases, the pixel + filter width goes outside the image
1015    * in that case we have to use index clipping to keep all reads
1016    * in the input image (slow path) or we are sure it won't fall
1017    * outside and can do more simple code */
1018   int ix = (int)x;
1019   int iy = (int)y;
1020 
1021   if(ix >= (itor->width - 1) && iy >= (itor->width - 1) && ix < (width - itor->width)
1022     && iy < (height - itor->width))
1023   {
1024     // Inside image boundary case
1025 
1026     // Go to top left pixel
1027     in = (float *)in + linestride * iy + ix;
1028     in = in - (itor->width - 1) * (1 + linestride);
1029 
1030     // Apply the kernel
1031     float pixel = 0.0f;
1032     for(int i = 0; i < 2 * itor->width; i++)
1033     {
1034       float h = 0.0f;
1035       for(int j = 0; j < 2 * itor->width; j++)
1036       {
1037         h += kernelh[j] * in[j];
1038       }
1039       pixel += kernelv[i] * h;
1040       in += linestride;
1041     }
1042 
1043     *out = oonorm * pixel;
1044   }
1045   else if(ix >= 0 && iy >= 0 && ix < width && iy < height)
1046   {
1047     // At least a valid coordinate
1048 
1049     // Point to the upper left pixel index wise
1050     iy -= itor->width - 1;
1051     ix -= itor->width - 1;
1052 
1053     static const enum border_mode bordermode = INTERPOLATION_BORDER_MODE;
1054     assert(bordermode != BORDER_CLAMP); // XXX in clamp mode, norms would be wrong
1055 
1056     int xtap_first;
1057     int xtap_last;
1058     prepare_tap_boundaries(&xtap_first, &xtap_last, bordermode, 2 * itor->width, ix, width);
1059 
1060     int ytap_first;
1061     int ytap_last;
1062     prepare_tap_boundaries(&ytap_first, &ytap_last, bordermode, 2 * itor->width, iy, height);
1063 
1064     // Apply the kernel
1065     float pixel = 0.0f;
1066     for(int i = ytap_first; i < ytap_last; i++)
1067     {
1068       const int clip_y = clip(iy + i, 0, height - 1, bordermode);
1069       float h = 0.0f;
1070       for(int j = xtap_first; j < xtap_last; j++)
1071       {
1072         const int clip_x = clip(ix + j, 0, width - 1, bordermode);
1073         const float *ipixel = in + clip_y * linestride + clip_x;
1074         h += kernelh[j] * *ipixel;
1075       }
1076       pixel += kernelv[i] * h;
1077     }
1078 
1079     *out = oonorm * pixel;
1080   }
1081   else
1082   {
1083     *out = 0.0f;
1084   }
1085 }
1086 
dt_interpolation_compute_pixel1c(const struct dt_interpolation * itor,const float * in,float * out,const float x,const float y,const int width,const int height,const int linestride)1087 void dt_interpolation_compute_pixel1c(const struct dt_interpolation *itor, const float *in, float *out,
1088                                       const float x, const float y, const int width, const int height,
1089                                       const int linestride)
1090 {
1091   return dt_interpolation_compute_pixel1c_plain(itor, in, out, x, y, width, height, linestride);
1092 }
1093 
1094 /* --------------------------------------------------------------------------
1095  * Interpolation factory
1096  * ------------------------------------------------------------------------*/
1097 
dt_interpolation_new(enum dt_interpolation_type type)1098 const struct dt_interpolation *dt_interpolation_new(enum dt_interpolation_type type)
1099 {
1100   const struct dt_interpolation *itor = NULL;
1101 
1102   if(type == DT_INTERPOLATION_USERPREF)
1103   {
1104     // Find user preferred interpolation method
1105     gchar *uipref = dt_conf_get_string("plugins/lighttable/export/pixel_interpolator");
1106     for(int i = DT_INTERPOLATION_FIRST; uipref && i < DT_INTERPOLATION_LAST; i++)
1107     {
1108       if(!strcmp(uipref, dt_interpolator[i].name))
1109       {
1110         // Found the one
1111         itor = &dt_interpolator[i];
1112         break;
1113       }
1114     }
1115     g_free(uipref);
1116 
1117     /* In the case the search failed (!uipref or name not found),
1118      * prepare later search pass with default fallback */
1119     type = DT_INTERPOLATION_DEFAULT;
1120   }
1121   else if(type == DT_INTERPOLATION_USERPREF_WARP)
1122   {
1123     // Find user preferred interpolation method
1124     gchar *uipref = dt_conf_get_string("plugins/lighttable/export/pixel_interpolator_warp");
1125     for(int i = DT_INTERPOLATION_FIRST; uipref && i < DT_INTERPOLATION_LAST; i++)
1126     {
1127       if(!strcmp(uipref, dt_interpolator[i].name))
1128       {
1129         // Found the one
1130         itor = &dt_interpolator[i];
1131         break;
1132       }
1133     }
1134     g_free(uipref);
1135 
1136     /* In the case the search failed (!uipref or name not found),
1137      * prepare later search pass with default fallback */
1138     type = DT_INTERPOLATION_DEFAULT_WARP;
1139   }
1140   if(!itor)
1141   {
1142     // Did not find the userpref one or we've been asked for a specific one
1143     for(int i = DT_INTERPOLATION_FIRST; i < DT_INTERPOLATION_LAST; i++)
1144     {
1145       if(dt_interpolator[i].id == type)
1146       {
1147         itor = &dt_interpolator[i];
1148         break;
1149       }
1150       if(dt_interpolator[i].id == DT_INTERPOLATION_DEFAULT)
1151       {
1152         itor = &dt_interpolator[i];
1153       }
1154     }
1155   }
1156 
1157   return itor;
1158 }
1159 
1160 /* --------------------------------------------------------------------------
1161  * Image resampling
1162  * ------------------------------------------------------------------------*/
1163 
1164 /** Prepares a 1D resampling plan
1165  *
1166  * This consists of the following information
1167  * <ul>
1168  * <li>A list of lengths that tell how many pixels are relevant for the
1169  *    next output</li>
1170  * <li>A list of required filter kernels</li>
1171  * <li>A list of sample indexes</li>
1172  * </ul>
1173  *
1174  * How to apply the resampling plan:
1175  * <ol>
1176  * <li>Pick a length from the length array</li>
1177  * <li>until length is reached
1178  *     <ol>
1179  *     <li>pick a kernel tap></li>
1180  *     <li>pick the relevant sample according to the picked index</li>
1181  *     <li>multiply them and accumulate</li>
1182  *     </ol>
1183  * </li>
1184  * <li>here goes a single output sample</li>
1185  * </ol>
1186  *
1187  * This until you reach the number of output pixels
1188  *
1189  * @param itor interpolator used to resample
1190  * @param in [in] Number of input samples
1191  * @param out [in] Number of output samples
1192  * @param plength [out] Array of lengths for each pixel filtering (number
1193  * of taps/indexes to use). This array mus be freed with dt_free_align() when you're
1194  * done with the plan.
1195  * @param pkernel [out] Array of filter kernel taps
1196  * @param pindex [out] Array of sample indexes to be used for applying each kernel tap
1197  * arrays of information
1198  * @param pmeta [out] Array of int triplets (length, kernel, index) telling where to start for an arbitrary
1199  * out position meta[3*out]
1200  * @return 0 for success, !0 for failure
1201  */
prepare_resampling_plan(const struct dt_interpolation * itor,int in,const int in_x0,int out,const int out_x0,float scale,int ** plength,float ** pkernel,int ** pindex,int ** pmeta)1202 static int prepare_resampling_plan(const struct dt_interpolation *itor, int in, const int in_x0, int out,
1203                                    const int out_x0, float scale, int **plength, float **pkernel,
1204                                    int **pindex, int **pmeta)
1205 {
1206   // Safe return values
1207   *plength = NULL;
1208   *pkernel = NULL;
1209   *pindex = NULL;
1210   if(pmeta)
1211   {
1212     *pmeta = NULL;
1213   }
1214 
1215   if(scale == 1.f)
1216   {
1217     // No resampling required
1218     return 0;
1219   }
1220 
1221   // Compute common upsampling/downsampling memory requirements
1222   int maxtapsapixel;
1223   if(scale > 1.f)
1224   {
1225     // Upscale... the easy one. The values are exact
1226     maxtapsapixel = 2 * itor->width;
1227   }
1228   else
1229   {
1230     // Downscale... going for worst case values memory wise
1231     maxtapsapixel = ceil_fast((float)2 * (float)itor->width / scale);
1232   }
1233 
1234   int nlengths = out;
1235   const int nindex = maxtapsapixel * out;
1236   const int nkernel = maxtapsapixel * out;
1237   const size_t lengthreq = increase_for_alignment(nlengths * sizeof(int), SSE_ALIGNMENT);
1238   const size_t indexreq = increase_for_alignment(nindex * sizeof(int), SSE_ALIGNMENT);
1239   const size_t kernelreq = increase_for_alignment(nkernel * sizeof(float), SSE_ALIGNMENT);
1240   const size_t scratchreq = maxtapsapixel * sizeof(float) + 4 * sizeof(float);
1241   // NB: because sse versions compute four taps a time
1242   const size_t metareq = pmeta ? 3 * sizeof(int) * out : 0;
1243 
1244   void *blob = NULL;
1245   const size_t totalreq = kernelreq + lengthreq + indexreq + scratchreq + metareq;
1246   blob = dt_alloc_align(SSE_ALIGNMENT, totalreq);
1247   if(!blob)
1248   {
1249     return 1;
1250   }
1251 
1252   int *lengths = (int *)blob;
1253   blob = (char *)blob + lengthreq;
1254   int *index = (int *)blob;
1255   blob = (char *)blob + indexreq;
1256   float *kernel = (float *)blob;
1257   blob = (char *)blob + kernelreq;
1258   float *scratchpad = scratchreq ? (float *)blob : NULL;
1259   blob = (char *)blob + scratchreq;
1260   int *meta = metareq ? (int *)blob : NULL;
1261 //   blob = (char *)blob + metareq;
1262 
1263   /* setting this as a const should help the compilers trim all unnecessary
1264    * codepaths */
1265   const enum border_mode bordermode = RESAMPLING_BORDER_MODE;
1266 
1267   /* Upscale and downscale differ in subtle points, getting rid of code
1268    * duplication might have been tricky and i prefer keeping the code
1269    * as straight as possible */
1270   if(scale > 1.f)
1271   {
1272     int kidx = 0;
1273     int iidx = 0;
1274     int lidx = 0;
1275     int midx = 0;
1276     for(int x = 0; x < out; x++)
1277     {
1278       if(meta)
1279       {
1280         meta[midx++] = lidx;
1281         meta[midx++] = kidx;
1282         meta[midx++] = iidx;
1283       }
1284 
1285       // Projected position in input samples
1286       float fx = (float)(out_x0 + x) / scale;
1287 
1288       // Compute the filter kernel at that position
1289       int first;
1290       compute_upsampling_kernel(itor, scratchpad, NULL, &first, fx);
1291 
1292       /* Check lower and higher bound pixel index and skip as many pixels as
1293        * necessary to fall into range */
1294       int tap_first;
1295       int tap_last;
1296       prepare_tap_boundaries(&tap_first, &tap_last, bordermode, 2 * itor->width, first, in);
1297 
1298       // Track number of taps that will be used
1299       lengths[lidx++] = tap_last - tap_first;
1300 
1301       // Precompute the inverse of the norm
1302       float norm = 0.f;
1303       for(int tap = tap_first; tap < tap_last; tap++)
1304       {
1305         norm += scratchpad[tap];
1306       }
1307       norm = 1.f / norm;
1308 
1309       /* Unlike single pixel or single sample code, here it's interesting to
1310        * precompute the normalized filter kernel as this will avoid dividing
1311        * by the norm for all processed samples/pixels
1312        * NB: use the same loop to put in place the index list */
1313       first += tap_first;
1314       for(int tap = tap_first; tap < tap_last; tap++)
1315       {
1316         kernel[kidx++] = scratchpad[tap] * norm;
1317         index[iidx++] = clip(first++, 0, in - 1, bordermode);
1318       }
1319     }
1320   }
1321   else
1322   {
1323     int kidx = 0;
1324     int iidx = 0;
1325     int lidx = 0;
1326     int midx = 0;
1327     for(int x = 0; x < out; x++)
1328     {
1329       if(meta)
1330       {
1331         meta[midx++] = lidx;
1332         meta[midx++] = kidx;
1333         meta[midx++] = iidx;
1334       }
1335 
1336       // Compute downsampling kernel centered on output position
1337       int taps;
1338       int first;
1339       compute_downsampling_kernel(itor, &taps, &first, scratchpad, NULL, scale, out_x0 + x);
1340 
1341       /* Check lower and higher bound pixel index and skip as many pixels as
1342        * necessary to fall into range */
1343       int tap_first;
1344       int tap_last;
1345       prepare_tap_boundaries(&tap_first, &tap_last, bordermode, taps, first, in);
1346 
1347       // Track number of taps that will be used
1348       lengths[lidx++] = tap_last - tap_first;
1349 
1350       // Precompute the inverse of the norm
1351       float norm = 0.f;
1352       for(int tap = tap_first; tap < tap_last; tap++)
1353       {
1354         norm += scratchpad[tap];
1355       }
1356       norm = 1.f / norm;
1357 
1358       /* Unlike single pixel or single sample code, here it's interesting to
1359        * precompute the normalized filter kernel as this will avoid dividing
1360        * by the norm for all processed samples/pixels
1361        * NB: use the same loop to put in place the index list */
1362       first += tap_first;
1363       for(int tap = tap_first; tap < tap_last; tap++)
1364       {
1365         kernel[kidx++] = scratchpad[tap] * norm;
1366         index[iidx++] = clip(first++, 0, in - 1, bordermode);
1367       }
1368     }
1369   }
1370 
1371   // Validate plan wrt caller
1372   *plength = lengths;
1373   *pindex = index;
1374   *pkernel = kernel;
1375   if(pmeta)
1376   {
1377     *pmeta = meta;
1378   }
1379 
1380   return 0;
1381 }
1382 
dt_interpolation_resample_plain(const struct dt_interpolation * itor,float * out,const dt_iop_roi_t * const roi_out,const int32_t out_stride,const float * const in,const dt_iop_roi_t * const roi_in,const int32_t in_stride)1383 static void dt_interpolation_resample_plain(const struct dt_interpolation *itor, float *out,
1384                                             const dt_iop_roi_t *const roi_out, const int32_t out_stride,
1385                                             const float *const in, const dt_iop_roi_t *const roi_in,
1386                                             const int32_t in_stride)
1387 {
1388   int *hindex = NULL;
1389   int *hlength = NULL;
1390   float *hkernel = NULL;
1391   int *vindex = NULL;
1392   int *vlength = NULL;
1393   float *vkernel = NULL;
1394   int *vmeta = NULL;
1395 
1396   const int32_t in_stride_floats = in_stride / sizeof(float);
1397   const int32_t out_stride_floats = out_stride / sizeof(float);
1398   int r;
1399 
1400   debug_info("resampling %p (%dx%d@%dx%d scale %f) -> %p (%dx%d@%dx%d scale %f)\n", in, roi_in->width,
1401              roi_in->height, roi_in->x, roi_in->y, roi_in->scale, out, roi_out->width, roi_out->height,
1402              roi_out->x, roi_out->y, roi_out->scale);
1403 
1404   // Fast code path for 1:1 copy, only cropping area can change
1405   if(roi_out->scale == 1.f)
1406   {
1407     const int x0 = roi_out->x * 4 * sizeof(float);
1408 #if DEBUG_RESAMPLING_TIMING
1409     int64_t ts_resampling = getts();
1410 #endif
1411 #ifdef _OPENMP
1412 #pragma omp parallel for default(none) \
1413     dt_omp_firstprivate(in, in_stride, out_stride, roi_out, x0) \
1414     shared(out)
1415 #endif
1416     for(int y = 0; y < roi_out->height; y++)
1417     {
1418       memcpy((char *)out + (size_t)out_stride * y,
1419              (char *)in + (size_t)in_stride * (y + roi_out->y) + x0,
1420              out_stride);
1421     }
1422 #if DEBUG_RESAMPLING_TIMING
1423     ts_resampling = getts() - ts_resampling;
1424     fprintf(stderr, "resampling %p plan:0us resampling:%" PRId64 "us\n", in, ts_resampling);
1425 #endif
1426     // All done, so easy case
1427     return;
1428   }
1429 
1430 // Generic non 1:1 case... much more complicated :D
1431 #if DEBUG_RESAMPLING_TIMING
1432   int64_t ts_plan = getts();
1433 #endif
1434 
1435   // Prepare resampling plans once and for all
1436   r = prepare_resampling_plan(itor, roi_in->width, roi_in->x, roi_out->width, roi_out->x, roi_out->scale,
1437                               &hlength, &hkernel, &hindex, NULL);
1438   if(r)
1439   {
1440     goto exit;
1441   }
1442 
1443   r = prepare_resampling_plan(itor, roi_in->height, roi_in->y, roi_out->height, roi_out->y, roi_out->scale,
1444                               &vlength, &vkernel, &vindex, &vmeta);
1445   if(r)
1446   {
1447     goto exit;
1448   }
1449 
1450 #if DEBUG_RESAMPLING_TIMING
1451   ts_plan = getts() - ts_plan;
1452 #endif
1453 
1454 #if DEBUG_RESAMPLING_TIMING
1455   int64_t ts_resampling = getts();
1456 #endif
1457 
1458 // Process each output line
1459 #ifdef _OPENMP
1460 #pragma omp parallel for default(none) \
1461   dt_omp_firstprivate(in, in_stride_floats, out_stride_floats, roi_out) \
1462   shared(out, hindex, hlength, hkernel, vindex, vlength, vkernel, vmeta)
1463 #endif
1464   for(int oy = 0; oy < roi_out->height; oy++)
1465   {
1466     // Initialize column resampling indexes
1467     int vlidx = vmeta[3 * oy + 0]; // V(ertical) L(ength) I(n)d(e)x
1468     int vkidx = vmeta[3 * oy + 1]; // V(ertical) K(ernel) I(n)d(e)x
1469     int viidx = vmeta[3 * oy + 2]; // V(ertical) I(ndex) I(n)d(e)x
1470 
1471     // Initialize row resampling indexes
1472     int hlidx = 0; // H(orizontal) L(ength) I(n)d(e)x
1473     int hkidx = 0; // H(orizontal) K(ernel) I(n)d(e)x
1474     int hiidx = 0; // H(orizontal) I(ndex) I(n)d(e)x
1475 
1476     // Number of lines contributing to the output line
1477     int vl = vlength[vlidx++]; // V(ertical) L(ength)
1478 
1479     // Process each output column
1480     for(int ox = 0; ox < roi_out->width; ox++)
1481     {
1482       debug_extra("output %p [% 4d % 4d]\n", out, ox, oy);
1483 
1484       // This will hold the resulting pixel
1485       DT_ALIGNED_PIXEL float vs[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
1486 
1487       // Number of horizontal samples contributing to the output
1488       int hl = hlength[hlidx++]; // H(orizontal) L(ength)
1489 
1490       for(int iy = 0; iy < vl; iy++)
1491       {
1492         // This is our input line
1493         size_t baseidx_vindex = (size_t)vindex[viidx++] * in_stride_floats;
1494 
1495         DT_ALIGNED_PIXEL float vhs[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
1496 
1497         for(int ix = 0; ix < hl; ix++)
1498         {
1499           // Apply the precomputed filter kernel
1500           const size_t baseidx = baseidx_vindex + (size_t)hindex[hiidx++] * 4;
1501           const float htap = hkernel[hkidx++];
1502           // Convince gcc 10 to vectorize
1503           DT_ALIGNED_PIXEL float tmp[4] = { in[baseidx], in[baseidx+1], in[baseidx+2], in[baseidx+3] };
1504           for_each_channel(c, aligned(tmp,vhs:16)) vhs[c] += tmp[c] * htap;
1505         }
1506 
1507         // Accumulate contribution from this line
1508         const float vtap = vkernel[vkidx++];
1509         for_each_channel(c, aligned(vhs,vs:16)) vs[c] += vhs[c] * vtap;
1510 
1511         // Reset horizontal resampling context
1512         hkidx -= hl;
1513         hiidx -= hl;
1514       }
1515 
1516       // Output pixel is ready
1517       const size_t baseidx = (size_t)oy * out_stride_floats + (size_t)ox * 4;
1518 
1519       // Clip negative RGB that may be produced by Lanczos undershooting
1520       // Negative RGB are invalid values no matter the RGB space (light is positive)
1521       for_each_channel(c, aligned(vs:16)) out[baseidx + c] = fmaxf(vs[c], 0.f);
1522 
1523       // Reset vertical resampling context
1524       viidx -= vl;
1525       vkidx -= vl;
1526 
1527       // Progress in horizontal context
1528       hiidx += hl;
1529       hkidx += hl;
1530     }
1531   }
1532 
1533 #if DEBUG_RESAMPLING_TIMING
1534   ts_resampling = getts() - ts_resampling;
1535   fprintf(stderr, "resampling %p plan:%" PRId64 "us resampling:%" PRId64 "us\n", in, ts_plan, ts_resampling);
1536 #endif
1537 
1538 exit:
1539   /* Free the resampling plans. It's nasty to optimize allocs like that, but
1540    * it simplifies the code :-D. The length array is in fact the only memory
1541    * allocated. */
1542   dt_free_align(hlength);
1543   dt_free_align(vlength);
1544 }
1545 
1546 #if defined(__SSE2__)
dt_interpolation_resample_sse(const struct dt_interpolation * itor,float * out,const dt_iop_roi_t * const roi_out,const int32_t out_stride,const float * const in,const dt_iop_roi_t * const roi_in,const int32_t in_stride)1547 static void dt_interpolation_resample_sse(const struct dt_interpolation *itor, float *out,
1548                                           const dt_iop_roi_t *const roi_out, const int32_t out_stride,
1549                                           const float *const in, const dt_iop_roi_t *const roi_in,
1550                                           const int32_t in_stride)
1551 {
1552   int *hindex = NULL;
1553   int *hlength = NULL;
1554   float *hkernel = NULL;
1555   int *vindex = NULL;
1556   int *vlength = NULL;
1557   float *vkernel = NULL;
1558   int *vmeta = NULL;
1559 
1560   int r;
1561 
1562   debug_info("resampling %p (%dx%d@%dx%d scale %f) -> %p (%dx%d@%dx%d scale %f)\n", in, roi_in->width,
1563              roi_in->height, roi_in->x, roi_in->y, roi_in->scale, out, roi_out->width, roi_out->height,
1564              roi_out->x, roi_out->y, roi_out->scale);
1565 
1566   // Fast code path for 1:1 copy, only cropping area can change
1567   if(roi_out->scale == 1.f)
1568   {
1569     const int x0 = roi_out->x * 4 * sizeof(float);
1570 #if DEBUG_RESAMPLING_TIMING
1571     int64_t ts_resampling = getts();
1572 #endif
1573 #ifdef _OPENMP
1574 #pragma omp parallel for default(none) \
1575     dt_omp_firstprivate(in, in_stride, out_stride, roi_out, x0) \
1576     shared(out)
1577 #endif
1578     for(int y = 0; y < roi_out->height; y++)
1579     {
1580       float *i = (float *)((char *)in + (size_t)in_stride * (y + roi_out->y) + x0);
1581       float *o = (float *)((char *)out + (size_t)out_stride * y);
1582       memcpy(o, i, out_stride);
1583     }
1584 #if DEBUG_RESAMPLING_TIMING
1585     ts_resampling = getts() - ts_resampling;
1586     fprintf(stderr, "resampling %p plan:0us resampling:%" PRId64 "us\n", in, ts_resampling);
1587 #endif
1588     // All done, so easy case
1589     return;
1590   }
1591 
1592 // Generic non 1:1 case... much more complicated :D
1593 #if DEBUG_RESAMPLING_TIMING
1594   int64_t ts_plan = getts();
1595 #endif
1596 
1597   // Prepare resampling plans once and for all
1598   r = prepare_resampling_plan(itor, roi_in->width, roi_in->x, roi_out->width, roi_out->x, roi_out->scale,
1599                               &hlength, &hkernel, &hindex, NULL);
1600   if(r)
1601   {
1602     goto exit;
1603   }
1604 
1605   r = prepare_resampling_plan(itor, roi_in->height, roi_in->y, roi_out->height, roi_out->y, roi_out->scale,
1606                               &vlength, &vkernel, &vindex, &vmeta);
1607   if(r)
1608   {
1609     goto exit;
1610   }
1611 
1612 #if DEBUG_RESAMPLING_TIMING
1613   ts_plan = getts() - ts_plan;
1614 #endif
1615 
1616 #if DEBUG_RESAMPLING_TIMING
1617   int64_t ts_resampling = getts();
1618 #endif
1619 
1620 // Process each output line
1621 #ifdef _OPENMP
1622 #pragma omp parallel for default(none) \
1623   dt_omp_firstprivate(in, in_stride, out_stride, roi_out) \
1624   shared(out, hindex, hlength, hkernel, vindex, vlength, vkernel, vmeta)
1625 #endif
1626   for(int oy = 0; oy < roi_out->height; oy++)
1627   {
1628     // Initialize column resampling indexes
1629     int vlidx = vmeta[3 * oy + 0]; // V(ertical) L(ength) I(n)d(e)x
1630     int vkidx = vmeta[3 * oy + 1]; // V(ertical) K(ernel) I(n)d(e)x
1631     int viidx = vmeta[3 * oy + 2]; // V(ertical) I(ndex) I(n)d(e)x
1632 
1633     // Initialize row resampling indexes
1634     int hlidx = 0; // H(orizontal) L(ength) I(n)d(e)x
1635     int hkidx = 0; // H(orizontal) K(ernel) I(n)d(e)x
1636     int hiidx = 0; // H(orizontal) I(ndex) I(n)d(e)x
1637 
1638     // Number of lines contributing to the output line
1639     int vl = vlength[vlidx++]; // V(ertical) L(ength)
1640 
1641     // Process each output column
1642     for(int ox = 0; ox < roi_out->width; ox++)
1643     {
1644       debug_extra("output %p [% 4d % 4d]\n", out, ox, oy);
1645 
1646       // This will hold the resulting pixel
1647       __m128 vs = _mm_setzero_ps();
1648 
1649       // Number of horizontal samples contributing to the output
1650       const int hl = hlength[hlidx++]; // H(orizontal) L(ength)
1651 
1652       for(int iy = 0; iy < vl; iy++)
1653       {
1654         // This is our input line
1655         const float *i = (float *)((char *)in + (size_t)in_stride * vindex[viidx++]);
1656 
1657         __m128 vhs = _mm_setzero_ps();
1658 
1659         for(int ix = 0; ix < hl; ix++)
1660         {
1661           // Apply the precomputed filter kernel
1662           const size_t baseidx = (size_t)hindex[hiidx++] * 4;
1663           const float htap = hkernel[hkidx++];
1664           const __m128 vhtap = _mm_set_ps1(htap);
1665           vhs = _mm_add_ps(vhs, _mm_mul_ps(*(__m128 *)&i[baseidx], vhtap));
1666         }
1667 
1668         // Accumulate contribution from this line
1669         const float vtap = vkernel[vkidx++];
1670         const __m128 vvtap = _mm_set_ps1(vtap);
1671         vs = _mm_add_ps(vs, _mm_mul_ps(vhs, vvtap));
1672 
1673         // Reset horizontal resampling context
1674         hkidx -= hl;
1675         hiidx -= hl;
1676       }
1677 
1678       // Output pixel is ready
1679       float *o = (float *)((char *)out + (size_t)oy * out_stride + (size_t)ox * 4 * sizeof(float));
1680 
1681       // Clip negative RGB that may be produced by Lanczos undershooting
1682       // Negative RGB are invalid values no matter the RGB space (light is positive)
1683       vs = _mm_max_ps(vs, _mm_setzero_ps());
1684       _mm_stream_ps(o, vs);
1685 
1686       // Reset vertical resampling context
1687       viidx -= vl;
1688       vkidx -= vl;
1689 
1690       // Progress in horizontal context
1691       hiidx += hl;
1692       hkidx += hl;
1693     }
1694   }
1695 
1696   _mm_sfence();
1697 
1698 #if DEBUG_RESAMPLING_TIMING
1699   ts_resampling = getts() - ts_resampling;
1700   fprintf(stderr, "resampling %p plan:%" PRId64 "us resampling:%" PRId64 "us\n", in, ts_plan, ts_resampling);
1701 #endif
1702 
1703 exit:
1704   /* Free the resampling plans. It's nasty to optimize allocs like that, but
1705    * it simplifies the code :-D. The length array is in fact the only memory
1706    * allocated. */
1707   dt_free_align(hlength);
1708   dt_free_align(vlength);
1709 }
1710 #endif
1711 
1712 /** Applies resampling (re-scaling) on *full* input and output buffers.
1713  *  roi_in and roi_out define the part of the buffers that is affected.
1714  */
dt_interpolation_resample(const struct dt_interpolation * itor,float * out,const dt_iop_roi_t * const roi_out,const int32_t out_stride,const float * const in,const dt_iop_roi_t * const roi_in,const int32_t in_stride)1715 void dt_interpolation_resample(const struct dt_interpolation *itor, float *out,
1716                                const dt_iop_roi_t *const roi_out, const int32_t out_stride,
1717                                const float *const in, const dt_iop_roi_t *const roi_in,
1718                                const int32_t in_stride)
1719 {
1720   if(darktable.codepath.OPENMP_SIMD)
1721     return dt_interpolation_resample_plain(itor, out, roi_out, out_stride, in, roi_in, in_stride);
1722 #if defined(__SSE2__)
1723   else if(darktable.codepath.SSE2)
1724     return dt_interpolation_resample_sse(itor, out, roi_out, out_stride, in, roi_in, in_stride);
1725 #endif
1726   else
1727     dt_unreachable_codepath();
1728 }
1729 
1730 /** Applies resampling (re-scaling) on a specific region-of-interest of an image. The input
1731  *  and output buffers hold exactly those roi's. roi_in and roi_out define the relative
1732  *  positions of the roi's within the full input and output image, respectively.
1733  */
dt_interpolation_resample_roi(const struct dt_interpolation * itor,float * out,const dt_iop_roi_t * const roi_out,const int32_t out_stride,const float * const in,const dt_iop_roi_t * const roi_in,const int32_t in_stride)1734 void dt_interpolation_resample_roi(const struct dt_interpolation *itor, float *out,
1735                                    const dt_iop_roi_t *const roi_out, const int32_t out_stride,
1736                                    const float *const in, const dt_iop_roi_t *const roi_in,
1737                                    const int32_t in_stride)
1738 {
1739   dt_iop_roi_t oroi = *roi_out;
1740   oroi.x = oroi.y = 0;
1741 
1742   dt_iop_roi_t iroi = *roi_in;
1743   iroi.x = iroi.y = 0;
1744 
1745   dt_interpolation_resample(itor, out, &oroi, out_stride, in, &iroi, in_stride);
1746 }
1747 
1748 #ifdef HAVE_OPENCL
dt_interpolation_init_cl_global()1749 dt_interpolation_cl_global_t *dt_interpolation_init_cl_global()
1750 {
1751   dt_interpolation_cl_global_t *g
1752       = (dt_interpolation_cl_global_t *)malloc(sizeof(dt_interpolation_cl_global_t));
1753 
1754   const int program = 2; // basic.cl, from programs.conf
1755   g->kernel_interpolation_resample = dt_opencl_create_kernel(program, "interpolation_resample");
1756   return g;
1757 }
1758 
dt_interpolation_free_cl_global(dt_interpolation_cl_global_t * g)1759 void dt_interpolation_free_cl_global(dt_interpolation_cl_global_t *g)
1760 {
1761   if(!g) return;
1762   // destroy kernels
1763   dt_opencl_free_kernel(g->kernel_interpolation_resample);
1764   free(g);
1765 }
1766 
roundToNextPowerOfTwo(uint32_t x)1767 static uint32_t roundToNextPowerOfTwo(uint32_t x)
1768 {
1769   x--;
1770   x |= x >> 1;
1771   x |= x >> 2;
1772   x |= x >> 4;
1773   x |= x >> 8;
1774   x |= x >> 16;
1775   x++;
1776   return x;
1777 }
1778 
1779 /** Applies resampling (re-scaling) on *full* input and output buffers.
1780  *  roi_in and roi_out define the part of the buffers that is affected.
1781  */
dt_interpolation_resample_cl(const struct dt_interpolation * itor,int devid,cl_mem dev_out,const dt_iop_roi_t * const roi_out,cl_mem dev_in,const dt_iop_roi_t * const roi_in)1782 int dt_interpolation_resample_cl(const struct dt_interpolation *itor, int devid, cl_mem dev_out,
1783                                  const dt_iop_roi_t *const roi_out, cl_mem dev_in,
1784                                  const dt_iop_roi_t *const roi_in)
1785 {
1786   int *hindex = NULL;
1787   int *hlength = NULL;
1788   float *hkernel = NULL;
1789   int *hmeta = NULL;
1790   int *vindex = NULL;
1791   int *vlength = NULL;
1792   float *vkernel = NULL;
1793   int *vmeta = NULL;
1794 
1795   int r;
1796   cl_int err = -999;
1797 
1798   cl_mem dev_hindex = NULL;
1799   cl_mem dev_hlength = NULL;
1800   cl_mem dev_hkernel = NULL;
1801   cl_mem dev_hmeta = NULL;
1802   cl_mem dev_vindex = NULL;
1803   cl_mem dev_vlength = NULL;
1804   cl_mem dev_vkernel = NULL;
1805   cl_mem dev_vmeta = NULL;
1806 
1807   debug_info("resampling_cl %p (%dx%d@%dx%d scale %f) -> %p (%dx%d@%dx%d scale %f)\n", (void *)dev_in,
1808              roi_in->width, roi_in->height, roi_in->x, roi_in->y, roi_in->scale, (void *)dev_out,
1809              roi_out->width, roi_out->height, roi_out->x, roi_out->y, roi_out->scale);
1810 
1811   // Fast code path for 1:1 copy, only cropping area can change
1812   if(roi_out->scale == 1.f)
1813   {
1814 #if DEBUG_RESAMPLING_TIMING
1815     int64_t ts_resampling = getts();
1816 #endif
1817     size_t iorigin[] = { roi_out->x, roi_out->y, 0 };
1818     size_t oorigin[] = { 0, 0, 0 };
1819     size_t region[] = { roi_out->width, roi_out->height, 1 };
1820 
1821     // copy original input from dev_in -> dev_out as starting point
1822     err = dt_opencl_enqueue_copy_image(devid, dev_in, dev_out, iorigin, oorigin, region);
1823     if(err != CL_SUCCESS) goto error;
1824 
1825 #if DEBUG_RESAMPLING_TIMING
1826     ts_resampling = getts() - ts_resampling;
1827     fprintf(stderr, "resampling_cl %p plan:0us resampling:%" PRId64 "us\n", (void *)dev_in, ts_resampling);
1828 #endif
1829     // All done, so easy case
1830     return CL_SUCCESS;
1831   }
1832 
1833 // Generic non 1:1 case... much more complicated :D
1834 #if DEBUG_RESAMPLING_TIMING
1835   int64_t ts_plan = getts();
1836 #endif
1837 
1838   // Prepare resampling plans once and for all
1839   r = prepare_resampling_plan(itor, roi_in->width, roi_in->x, roi_out->width, roi_out->x, roi_out->scale,
1840                               &hlength, &hkernel, &hindex, &hmeta);
1841   if(r)
1842   {
1843     goto error;
1844   }
1845 
1846   r = prepare_resampling_plan(itor, roi_in->height, roi_in->y, roi_out->height, roi_out->y, roi_out->scale,
1847                               &vlength, &vkernel, &vindex, &vmeta);
1848   if(r)
1849   {
1850     goto error;
1851   }
1852 
1853   int hmaxtaps = -1, vmaxtaps = -1;
1854   for(int k = 0; k < roi_out->width; k++) hmaxtaps = MAX(hmaxtaps, hlength[k]);
1855   for(int k = 0; k < roi_out->height; k++) vmaxtaps = MAX(vmaxtaps, vlength[k]);
1856 
1857 #if DEBUG_RESAMPLING_TIMING
1858   ts_plan = getts() - ts_plan;
1859 #endif
1860 
1861 #if DEBUG_RESAMPLING_TIMING
1862   int64_t ts_resampling = getts();
1863 #endif
1864 
1865   // strategy: process image column-wise (local[0] = 1). For each row generate
1866   // a number of parallel work items each taking care of one horizontal convolution,
1867   // then sum over work items to do the vertical convolution
1868 
1869   const int kernel = darktable.opencl->interpolation->kernel_interpolation_resample;
1870   const int width = roi_out->width;
1871   const int height = roi_out->height;
1872 
1873   // make sure blocksize is not too large
1874   const int taps = roundToNextPowerOfTwo(vmaxtaps); // the number of work items per row rounded up to a power of 2
1875                                                     // (for quick recursive reduction)
1876   int vblocksize;
1877 
1878   dt_opencl_local_buffer_t locopt
1879     = (dt_opencl_local_buffer_t){ .xoffset = 0, .xfactor = 1, .yoffset = 0, .yfactor = 1,
1880                                   .cellsize = 4 * sizeof(float), .overhead = hmaxtaps * sizeof(float) + hmaxtaps * sizeof(int),
1881                                   .sizex = 1, .sizey = (1 << 16) * taps };
1882 
1883   if(dt_opencl_local_buffer_opt(devid, kernel, &locopt))
1884     vblocksize = locopt.sizey;
1885   else
1886     vblocksize = 1;
1887 
1888   if(vblocksize < taps)
1889   {
1890     // our strategy does not work: the vertical number of taps exceeds the vertical workgroupsize;
1891     // there is no point in continuing on the GPU - that would be way too slow; let's delegate the stuff to
1892     // the CPU then.
1893     dt_print(
1894         DT_DEBUG_OPENCL,
1895         "[opencl_resampling] resampling plan cannot efficiently be run on the GPU - fall back to CPU.\n");
1896     goto error;
1897   }
1898 
1899   size_t sizes[3] = { ROUNDUPWD(width), ROUNDUP(height * taps, vblocksize), 1 };
1900   size_t local[3] = { 1, vblocksize, 1 };
1901 
1902   // store resampling plan to device memory
1903   // hindex, vindex, hkernel, vkernel: (v|h)maxtaps might be too small, so store a bit more than needed
1904   dev_hindex = dt_opencl_copy_host_to_device_constant(devid, sizeof(int) * width * (hmaxtaps + 1), hindex);
1905   if(dev_hindex == NULL) goto error;
1906 
1907   dev_hlength = dt_opencl_copy_host_to_device_constant(devid, sizeof(int) * width, hlength);
1908   if(dev_hlength == NULL) goto error;
1909 
1910   dev_hkernel
1911       = dt_opencl_copy_host_to_device_constant(devid, sizeof(float) * width * (hmaxtaps + 1), hkernel);
1912   if(dev_hkernel == NULL) goto error;
1913 
1914   dev_hmeta = dt_opencl_copy_host_to_device_constant(devid, sizeof(int) * width * 3, hmeta);
1915   if(dev_hmeta == NULL) goto error;
1916 
1917   dev_vindex = dt_opencl_copy_host_to_device_constant(devid, sizeof(int) * height * (vmaxtaps + 1), vindex);
1918   if(dev_vindex == NULL) goto error;
1919 
1920   dev_vlength = dt_opencl_copy_host_to_device_constant(devid, sizeof(int) * height, vlength);
1921   if(dev_vlength == NULL) goto error;
1922 
1923   dev_vkernel
1924       = dt_opencl_copy_host_to_device_constant(devid, sizeof(float) * height * (vmaxtaps + 1), vkernel);
1925   if(dev_vkernel == NULL) goto error;
1926 
1927   dev_vmeta = dt_opencl_copy_host_to_device_constant(devid, sizeof(int) * height * 3, vmeta);
1928   if(dev_vmeta == NULL) goto error;
1929 
1930   dt_opencl_set_kernel_arg(devid, kernel, 0, sizeof(cl_mem), (void *)&dev_in);
1931   dt_opencl_set_kernel_arg(devid, kernel, 1, sizeof(cl_mem), (void *)&dev_out);
1932   dt_opencl_set_kernel_arg(devid, kernel, 2, sizeof(int), (void *)&width);
1933   dt_opencl_set_kernel_arg(devid, kernel, 3, sizeof(int), (void *)&height);
1934   dt_opencl_set_kernel_arg(devid, kernel, 4, sizeof(cl_mem), (void *)&dev_hmeta);
1935   dt_opencl_set_kernel_arg(devid, kernel, 5, sizeof(cl_mem), (void *)&dev_vmeta);
1936   dt_opencl_set_kernel_arg(devid, kernel, 6, sizeof(cl_mem), (void *)&dev_hlength);
1937   dt_opencl_set_kernel_arg(devid, kernel, 7, sizeof(cl_mem), (void *)&dev_vlength);
1938   dt_opencl_set_kernel_arg(devid, kernel, 8, sizeof(cl_mem), (void *)&dev_hindex);
1939   dt_opencl_set_kernel_arg(devid, kernel, 9, sizeof(cl_mem), (void *)&dev_vindex);
1940   dt_opencl_set_kernel_arg(devid, kernel, 10, sizeof(cl_mem), (void *)&dev_hkernel);
1941   dt_opencl_set_kernel_arg(devid, kernel, 11, sizeof(cl_mem), (void *)&dev_vkernel);
1942   dt_opencl_set_kernel_arg(devid, kernel, 12, sizeof(int), (void *)&hmaxtaps);
1943   dt_opencl_set_kernel_arg(devid, kernel, 13, sizeof(int), (void *)&taps);
1944   dt_opencl_set_kernel_arg(devid, kernel, 14, hmaxtaps * sizeof(float), NULL);
1945   dt_opencl_set_kernel_arg(devid, kernel, 15, hmaxtaps * sizeof(int), NULL);
1946   dt_opencl_set_kernel_arg(devid, kernel, 16, vblocksize * 4 * sizeof(float), NULL);
1947   err = dt_opencl_enqueue_kernel_2d_with_local(devid, kernel, sizes, local);
1948   if(err != CL_SUCCESS) goto error;
1949 
1950 #if DEBUG_RESAMPLING_TIMING
1951   ts_resampling = getts() - ts_resampling;
1952   fprintf(stderr, "resampling_cl %p plan:%" PRId64 "us resampling:%" PRId64 "us\n", (void *)dev_in, ts_plan,
1953           ts_resampling);
1954 #endif
1955 
1956   dt_opencl_release_mem_object(dev_hindex);
1957   dt_opencl_release_mem_object(dev_hlength);
1958   dt_opencl_release_mem_object(dev_hkernel);
1959   dt_opencl_release_mem_object(dev_hmeta);
1960   dt_opencl_release_mem_object(dev_vindex);
1961   dt_opencl_release_mem_object(dev_vlength);
1962   dt_opencl_release_mem_object(dev_vkernel);
1963   dt_opencl_release_mem_object(dev_vmeta);
1964   dt_free_align(hlength);
1965   dt_free_align(vlength);
1966   return CL_SUCCESS;
1967 
1968 error:
1969   dt_opencl_release_mem_object(dev_hindex);
1970   dt_opencl_release_mem_object(dev_hlength);
1971   dt_opencl_release_mem_object(dev_hkernel);
1972   dt_opencl_release_mem_object(dev_hmeta);
1973   dt_opencl_release_mem_object(dev_vindex);
1974   dt_opencl_release_mem_object(dev_vlength);
1975   dt_opencl_release_mem_object(dev_vkernel);
1976   dt_opencl_release_mem_object(dev_vmeta);
1977   dt_free_align(hlength);
1978   dt_free_align(vlength);
1979   dt_print(DT_DEBUG_OPENCL, "[opencl_resampling] couldn't enqueue kernel! %d\n", err);
1980   return err;
1981 }
1982 
1983 /** Applies resampling (re-scaling) on a specific region-of-interest of an image. The input
1984  *  and output buffers hold exactly those roi's. roi_in and roi_out define the relative
1985  *  positions of the roi's within the full input and output image, respectively.
1986  */
dt_interpolation_resample_roi_cl(const struct dt_interpolation * itor,int devid,cl_mem dev_out,const dt_iop_roi_t * const roi_out,cl_mem dev_in,const dt_iop_roi_t * const roi_in)1987 int dt_interpolation_resample_roi_cl(const struct dt_interpolation *itor, int devid, cl_mem dev_out,
1988                                      const dt_iop_roi_t *const roi_out, cl_mem dev_in,
1989                                      const dt_iop_roi_t *const roi_in)
1990 {
1991   dt_iop_roi_t oroi = *roi_out;
1992   oroi.x = oroi.y = 0;
1993 
1994   dt_iop_roi_t iroi = *roi_in;
1995   iroi.x = iroi.y = 0;
1996 
1997   return dt_interpolation_resample_cl(itor, devid, dev_out, &oroi, dev_in, &iroi);
1998 }
1999 #endif
2000 
dt_interpolation_resample_1c_plain(const struct dt_interpolation * itor,float * out,const dt_iop_roi_t * const roi_out,const int32_t out_stride,const float * const in,const dt_iop_roi_t * const roi_in,const int32_t in_stride)2001 static void dt_interpolation_resample_1c_plain(const struct dt_interpolation *itor, float *out,
2002                                                const dt_iop_roi_t *const roi_out, const int32_t out_stride,
2003                                                const float *const in, const dt_iop_roi_t *const roi_in,
2004                                                const int32_t in_stride)
2005 {
2006   int *hindex = NULL;
2007   int *hlength = NULL;
2008   float *hkernel = NULL;
2009   int *vindex = NULL;
2010   int *vlength = NULL;
2011   float *vkernel = NULL;
2012   int *vmeta = NULL;
2013 
2014   int r;
2015 
2016   debug_info("resampling %p (%dx%d@%dx%d scale %f) -> %p (%dx%d@%dx%d scale %f)\n", in, roi_in->width,
2017              roi_in->height, roi_in->x, roi_in->y, roi_in->scale, out, roi_out->width, roi_out->height,
2018              roi_out->x, roi_out->y, roi_out->scale);
2019 
2020   // Fast code path for 1:1 copy, only cropping area can change
2021   if(roi_out->scale == 1.f)
2022   {
2023     const int x0 = roi_out->x * sizeof(float);
2024 #if DEBUG_RESAMPLING_TIMING
2025     int64_t ts_resampling = getts();
2026 #endif
2027 #ifdef _OPENMP
2028 #pragma omp parallel for default(none) \
2029     dt_omp_firstprivate(in, in_stride, out_stride, roi_out, x0) \
2030     shared(out)
2031 #endif
2032     for(int y = 0; y < roi_out->height; y++)
2033     {
2034       float *i = (float *)((char *)in + (size_t)in_stride * (y + roi_out->y) + x0);
2035       float *o = (float *)((char *)out + (size_t)out_stride * y);
2036       memcpy(o, i, out_stride);
2037     }
2038 #if DEBUG_RESAMPLING_TIMING
2039     ts_resampling = getts() - ts_resampling;
2040     fprintf(stderr, "resampling %p plan:0us resampling:%" PRId64 "us\n", in, ts_resampling);
2041 #endif
2042     // All done, so easy case
2043     return;
2044   }
2045 
2046   // Generic non 1:1 case... much more complicated :D
2047 #if DEBUG_RESAMPLING_TIMING
2048   int64_t ts_plan = getts();
2049 #endif
2050 
2051   // Prepare resampling plans once and for all
2052   r = prepare_resampling_plan(itor, roi_in->width, roi_in->x, roi_out->width, roi_out->x, roi_out->scale,
2053                               &hlength, &hkernel, &hindex, NULL);
2054   if(r)
2055   {
2056     goto exit;
2057   }
2058 
2059   r = prepare_resampling_plan(itor, roi_in->height, roi_in->y, roi_out->height, roi_out->y, roi_out->scale,
2060                               &vlength, &vkernel, &vindex, &vmeta);
2061   if(r)
2062   {
2063     goto exit;
2064   }
2065 
2066 #if DEBUG_RESAMPLING_TIMING
2067   ts_plan = getts() - ts_plan;
2068 #endif
2069 
2070 #if DEBUG_RESAMPLING_TIMING
2071   int64_t ts_resampling = getts();
2072 #endif
2073 
2074   // Process each output line
2075 #ifdef _OPENMP
2076 #pragma omp parallel for default(none) \
2077   dt_omp_firstprivate(in, in_stride, out_stride, roi_out) \
2078   shared(out, hindex, hlength, hkernel, vindex, vlength, vkernel, vmeta)
2079 #endif
2080   for(int oy = 0; oy < roi_out->height; oy++)
2081   {
2082     // Initialize column resampling indexes
2083     int vlidx = vmeta[3 * oy + 0]; // V(ertical) L(ength) I(n)d(e)x
2084     int vkidx = vmeta[3 * oy + 1]; // V(ertical) K(ernel) I(n)d(e)x
2085     int viidx = vmeta[3 * oy + 2]; // V(ertical) I(ndex) I(n)d(e)x
2086 
2087     // Initialize row resampling indexes
2088     int hlidx = 0; // H(orizontal) L(ength) I(n)d(e)x
2089     int hkidx = 0; // H(orizontal) K(ernel) I(n)d(e)x
2090     int hiidx = 0; // H(orizontal) I(ndex) I(n)d(e)x
2091 
2092     // Number of lines contributing to the output line
2093     int vl = vlength[vlidx++]; // V(ertical) L(ength)
2094 
2095     // Process each output column
2096     for(int ox = 0; ox < roi_out->width; ox++)
2097     {
2098       debug_extra("output %p [% 4d % 4d]\n", out, ox, oy);
2099 
2100       // This will hold the resulting pixel
2101       float vs = 0.0f;
2102 
2103       // Number of horizontal samples contributing to the output
2104       const int hl = hlength[hlidx++]; // H(orizontal) L(ength)
2105 
2106       for(int iy = 0; iy < vl; iy++)
2107       {
2108         // This is our input line
2109         const float *i = (float *)((char *)in + (size_t)in_stride * vindex[viidx++]);
2110 
2111         float vhs = 0.0f;
2112 
2113         for(int ix = 0; ix < hl; ix++)
2114         {
2115           // Apply the precomputed filter kernel
2116           const size_t baseidx = (size_t)hindex[hiidx++];
2117           const float htap = hkernel[hkidx++];
2118           vhs += i[baseidx] * htap;
2119         }
2120 
2121         // Accumulate contribution from this line
2122         const float vtap = vkernel[vkidx++];
2123         vs += vhs * vtap;
2124 
2125         // Reset horizontal resampling context
2126         hkidx -= hl;
2127         hiidx -= hl;
2128       }
2129 
2130       // Output pixel is ready
2131       float *o = (float *)((char *)out + (size_t)oy * out_stride + (size_t)ox * sizeof(float));
2132       *o = vs;
2133 
2134       // Reset vertical resampling context
2135       viidx -= vl;
2136       vkidx -= vl;
2137 
2138       // Progress in horizontal context
2139       hiidx += hl;
2140       hkidx += hl;
2141     }
2142   }
2143 
2144 #if DEBUG_RESAMPLING_TIMING
2145   ts_resampling = getts() - ts_resampling;
2146   fprintf(stderr, "resampling %p plan:%" PRId64 "us resampling:%" PRId64 "us\n", in, ts_plan, ts_resampling);
2147 #endif
2148 
2149   exit:
2150   /* Free the resampling plans. It's nasty to optimize allocs like that, but
2151    * it simplifies the code :-D. The length array is in fact the only memory
2152    * allocated. */
2153   dt_free_align(hlength);
2154   dt_free_align(vlength);
2155 }
2156 
2157 /** Applies resampling (re-scaling) on *full* input and output buffers.
2158  *  roi_in and roi_out define the part of the buffers that is affected.
2159  */
dt_interpolation_resample_1c(const struct dt_interpolation * itor,float * out,const dt_iop_roi_t * const roi_out,const int32_t out_stride,const float * const in,const dt_iop_roi_t * const roi_in,const int32_t in_stride)2160 void dt_interpolation_resample_1c(const struct dt_interpolation *itor, float *out,
2161                                   const dt_iop_roi_t *const roi_out, const int32_t out_stride,
2162                                   const float *const in, const dt_iop_roi_t *const roi_in,
2163                                   const int32_t in_stride)
2164 {
2165   return dt_interpolation_resample_1c_plain(itor, out, roi_out, out_stride, in, roi_in, in_stride);
2166 }
2167 
2168 /** Applies resampling (re-scaling) on a specific region-of-interest of an image. The input
2169  *  and output buffers hold exactly those roi's. roi_in and roi_out define the relative
2170  *  positions of the roi's within the full input and output image, respectively.
2171  */
dt_interpolation_resample_roi_1c(const struct dt_interpolation * itor,float * out,const dt_iop_roi_t * const roi_out,const int32_t out_stride,const float * const in,const dt_iop_roi_t * const roi_in,const int32_t in_stride)2172 void dt_interpolation_resample_roi_1c(const struct dt_interpolation *itor, float *out,
2173                                       const dt_iop_roi_t *const roi_out, const int32_t out_stride,
2174                                       const float *const in, const dt_iop_roi_t *const roi_in,
2175                                       const int32_t in_stride)
2176 {
2177   dt_iop_roi_t oroi = *roi_out;
2178   oroi.x = oroi.y = 0;
2179 
2180   dt_iop_roi_t iroi = *roi_in;
2181   iroi.x = iroi.y = 0;
2182 
2183   dt_interpolation_resample_1c(itor, out, &oroi, out_stride, in, &iroi, in_stride);
2184 }
2185 
2186 // modelines: These editor modelines have been set for all relevant files by tools/update_modelines.sh
2187 // vim: shiftwidth=2 expandtab tabstop=2 cindent
2188 // kate: tab-indents: off; indent-width 2; replace-tabs on; indent-mode cstyle; remove-trailing-spaces modified;
2189