1 /*
2     This file is part of darktable,
3     Copyright (C) 2016-2021 darktable developers.
4 
5     darktable is free software: you can redistribute it and/or modify
6     it under the terms of the GNU General Public License as published by
7     the Free Software Foundation, either version 3 of the License, or
8     (at your option) any later version.
9 
10     darktable is distributed in the hope that it will be useful,
11     but WITHOUT ANY WARRANTY; without even the implied warranty of
12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13     GNU General Public License for more details.
14 
15     You should have received a copy of the GNU General Public License
16     along with darktable.  If not, see <http://www.gnu.org/licenses/>.
17 */
18 
19 #include "common/darktable.h"
20 #include "common/locallaplacian.h"
21 #include "common/math.h"
22 
23 #include <string.h>
24 #include <stdint.h>
25 #include <stdlib.h>
26 #include <assert.h>
27 #include <stdio.h>
28 #if defined(__SSE2__)
29 #include <xmmintrin.h>
30 #endif
31 
32 // the maximum number of levels for the gaussian pyramid
33 #define max_levels 30
34 // the number of segments for the piecewise linear interpolation
35 #define num_gamma 6
36 
37 //#define DEBUG_DUMP
38 
39 // downsample width/height to given level
dl(int size,const int level)40 static inline int dl(int size, const int level)
41 {
42   for(int l=0;l<level;l++)
43     size = (size-1)/2+1;
44   return size;
45 }
46 
47 #ifdef DEBUG_DUMP
dump_PFM(const char * filename,const float * out,const uint32_t w,const uint32_t h)48 static void dump_PFM(const char *filename, const float* out, const uint32_t w, const uint32_t h)
49 {
50   FILE *f = g_fopen(filename, "wb");
51   fprintf(f, "PF\n%d %d\n-1.0\n", w, h);
52   for(int j=0;j<h;j++)
53     for(int i=0;i<w;i++)
54       for(int c=0;c<3;c++)
55         fwrite(out + w*j+i, 1, sizeof(float), f);
56   fclose(f);
57 }
58 #define debug_dump_PFM dump_PFM
59 #else
60 #define debug_dump_PFM(f,b,w,h)
61 #endif
62 
63 // needs a boundary of 1 or 2px around i,j or else it will crash.
64 // (translates to a 1px boundary around the corresponding pixel in the coarse buffer)
65 // more precisely, 1<=i<wd-1 for even wd and
66 //                 1<=i<wd-2 for odd wd (j likewise with ht)
ll_expand_gaussian(const float * const coarse,const int i,const int j,const int wd,const int ht)67 static inline float ll_expand_gaussian(
68     const float *const coarse,
69     const int i,
70     const int j,
71     const int wd,
72     const int ht)
73 {
74   assert(i > 0);
75   assert(i < wd-1);
76   assert(j > 0);
77   assert(j < ht-1);
78   assert(j/2 + 1 < (ht-1)/2+1);
79   assert(i/2 + 1 < (wd-1)/2+1);
80   const int cw = (wd-1)/2+1;
81   const int ind = (j/2)*cw+i/2;
82   // case 0:     case 1:     case 2:     case 3:
83   //  x . x . x   x . x . x   x . x . x   x . x . x
84   //  . . . . .   . . . . .   . .[.]. .   .[.]. . .
85   //  x .[x]. x   x[.]x . x   x . x . x   x . x . x
86   //  . . . . .   . . . . .   . . . . .   . . . . .
87   //  x . x . x   x . x . x   x . x . x   x . x . x
88   switch((i&1) + 2*(j&1))
89   {
90     case 0: // both are even, 3x3 stencil
91       return 4./256. * (
92           6.0f*(coarse[ind-cw] + coarse[ind-1] + 6.0f*coarse[ind] + coarse[ind+1] + coarse[ind+cw])
93           + coarse[ind-cw-1] + coarse[ind-cw+1] + coarse[ind+cw-1] + coarse[ind+cw+1]);
94     case 1: // i is odd, 2x3 stencil
95       return 4./256. * (
96           24.0*(coarse[ind] + coarse[ind+1]) +
97           4.0*(coarse[ind-cw] + coarse[ind-cw+1] + coarse[ind+cw] + coarse[ind+cw+1]));
98     case 2: // j is odd, 3x2 stencil
99       return 4./256. * (
100           24.0*(coarse[ind] + coarse[ind+cw]) +
101           4.0*(coarse[ind-1] + coarse[ind+1] + coarse[ind+cw-1] + coarse[ind+cw+1]));
102     default: // case 3: // both are odd, 2x2 stencil
103       return .25f * (coarse[ind] + coarse[ind+1] + coarse[ind+cw] + coarse[ind+cw+1]);
104   }
105 }
106 
107 // helper to fill in one pixel boundary by copying it
ll_fill_boundary1(float * const input,const int wd,const int ht)108 static inline void ll_fill_boundary1(
109     float *const input,
110     const int wd,
111     const int ht)
112 {
113   for(int j=1;j<ht-1;j++) input[j*wd] = input[j*wd+1];
114   for(int j=1;j<ht-1;j++) input[j*wd+wd-1] = input[j*wd+wd-2];
115   memcpy(input,    input+wd, sizeof(float)*wd);
116   memcpy(input+wd*(ht-1), input+wd*(ht-2), sizeof(float)*wd);
117 }
118 
119 // helper to fill in two pixels boundary by copying it
ll_fill_boundary2(float * const input,const int wd,const int ht)120 static inline void ll_fill_boundary2(
121     float *const input,
122     const int wd,
123     const int ht)
124 {
125   for(int j=1;j<ht-1;j++) input[j*wd] = input[j*wd+1];
126   if(wd & 1) for(int j=1;j<ht-1;j++) input[j*wd+wd-1] = input[j*wd+wd-2];
127   else       for(int j=1;j<ht-1;j++) input[j*wd+wd-1] = input[j*wd+wd-2] = input[j*wd+wd-3];
128   memcpy(input, input+wd, sizeof(float)*wd);
129   if(!(ht & 1)) memcpy(input+wd*(ht-2), input+wd*(ht-3), sizeof(float)*wd);
130   memcpy(input+wd*(ht-1), input+wd*(ht-2), sizeof(float)*wd);
131 }
132 
pad_by_replication(float * buf,const uint32_t w,const uint32_t h,const uint32_t padding)133 static void pad_by_replication(
134     float *buf,			// the buffer to be padded
135     const uint32_t w,		// width of a line
136     const uint32_t h,		// total height, including top and bottom padding
137     const uint32_t padding)	// number of lines of padding on each side
138 {
139 #ifdef _OPENMP
140 #pragma omp parallel for default(none) \
141   dt_omp_firstprivate(buf, padding, h, w) \
142   schedule(static)
143 #endif
144   for(int j=0;j<padding;j++)
145   {
146     memcpy(buf + w*j, buf+padding*w, sizeof(float)*w);
147     memcpy(buf + w*(h-padding+j), buf+w*(h-padding-1), sizeof(float)*w);
148   }
149 }
150 
gauss_expand(const float * const input,float * const fine,const int wd,const int ht)151 static inline void gauss_expand(
152     const float *const input, // coarse input
153     float *const fine,        // upsampled, blurry output
154     const int wd,             // fine res
155     const int ht)
156 {
157 #ifdef _OPENMP
158 #pragma omp parallel for default(none) \
159   dt_omp_firstprivate(fine, input, wd, ht) \
160   schedule(static) \
161   collapse(2)
162 #endif
163   for(int j=1;j<((ht-1)&~1);j++)  // even ht: two px boundary. odd ht: one px.
164     for(int i=1;i<((wd-1)&~1);i++)
165       fine[j*wd+i] = ll_expand_gaussian(input, i, j, wd, ht);
166   ll_fill_boundary2(fine, wd, ht);
167 }
168 
169 #if defined(__SSE2__)
convolve14641_vert(const float * in,const int wd)170 static inline __m128 convolve14641_vert(const float *in, const int wd)
171 {
172   const dt_aligned_pixel_t four = { 4.f, 4.f, 4.f, 4.f };
173   __m128 r0 = _mm_loadu_ps(in);
174   __m128 r1 = _mm_loadu_ps(in + wd);
175   __m128 r2 = _mm_loadu_ps(in + 2*wd);
176   __m128 r3 = _mm_loadu_ps(in + 3*wd);
177   __m128 r4 = _mm_loadu_ps(in + 4*wd);
178   _mm_prefetch(in+4,_MM_HINT_NTA);		// prefetch next column, which won't be used again afterwards
179   r0 = _mm_add_ps(r0, r4);                      // r0 = r0+r4
180   _mm_prefetch(in+4+wd,_MM_HINT_NTA);		// prefetch next column, which won't be used again afterwards
181   r1 = _mm_add_ps(_mm_add_ps(r1,r3), r2);       // r1 = r1+r2+r3
182   _mm_prefetch(in+4+2*wd,_MM_HINT_T0);
183   r0 = _mm_add_ps(r0, _mm_add_ps(r2, r2));     // r0 = r0+2*r2+r4
184   _mm_prefetch(in+4+3*wd, _MM_HINT_T0);
185   __m128 t = _mm_mul_ps(r1, _mm_load_ps(four)); // t= 4*r1+4*r2+4*r3
186   _mm_prefetch(in+4+4*wd, _MM_HINT_T0);
187   return _mm_add_ps(r0, t);                   // r0+4*r1+6*r2+4*r3+r4
188 }
189 #endif
190 
191 #if defined(__SSE2__)
gauss_reduce_sse2(const float * const input,float * const coarse,const int wd,const int ht)192 static inline void gauss_reduce_sse2(
193     const float *const input, // fine input buffer
194     float *const coarse,      // coarse scale, blurred input buf
195     const int wd,             // fine res
196     const int ht)
197 {
198   // blur, store only coarse res
199   const int cw = (wd-1)/2+1, ch = (ht-1)/2+1;
200 
201 #ifdef _OPENMP
202   // DON'T parallelize the very smallest levels of the pyramid, as the threading overhead
203   // is greater than the time needed to do it sequentially
204 #pragma omp parallel for default(none) if (ch*cw>1000)  \
205       dt_omp_firstprivate(cw, ch, input, wd, coarse) \
206       schedule(static)
207 #endif
208   for(int j=1;j<ch-1;j++)
209   {
210     const float *base = input + 2*(j-1)*wd;
211     float *const out = coarse + j*cw + 1;
212     // prime the vertical axis
213     const __m128 kernel = _mm_setr_ps(1.f, 4.f, 6.f, 4.f);
214     __m128 left = convolve14641_vert(base,wd);
215     for(int col=0; col<cw-3; col+=2)
216     {
217       // convolve the next four pixel wide vertical slice
218       base += 4;
219       __m128 right = convolve14641_vert(base,wd);
220       // horizontal pass, generate two output values from convolving with 1 4 6 4 1
221       // the first uses pixels 0-4, the second uses 2-6
222       __m128 conv = _mm_mul_ps(left,kernel);
223       out[col] = (conv[0] + conv[1] + conv[2] + conv[3] + right[0]) / 256.f;
224       out[col+1] = (left[2] + 4*(left[3]+right[1]) + 6*right[0] + right[2]) / 256.f;
225       // shift to next pair of output columns (four input columns)
226       left = right;
227     }
228     // handle the left-over pixel if the output size is odd
229     if (cw % 2)
230     {
231       base += 4;
232       float right = base[0] + 4*(base[wd]+base[3*wd]) + 6*base[2*wd] + base[4*wd];
233       __m128 conv = _mm_mul_ps(left,kernel);
234       out[cw-3] = (conv[0] + conv[1] + conv[2] + conv[3] + right) / 256.f;
235     }
236   }
237   ll_fill_boundary1(coarse, cw, ch);
238 }
239 #endif
240 
gauss_reduce(const float * const input,float * const coarse,const int wd,const int ht)241 static inline void gauss_reduce(
242     const float *const input, // fine input buffer
243     float *const coarse,      // coarse scale, blurred input buf
244     const int wd,             // fine res
245     const int ht)
246 {
247   // blur, store only coarse res
248   const int cw = (wd-1)/2+1, ch = (ht-1)/2+1;
249 
250   // this is the scalar (non-simd) code:
251   const float w[5] = { 1.f/16.f, 4.f/16.f, 6.f/16.f, 4.f/16.f, 1.f/16.f };
252   memset(coarse, 0, sizeof(float)*cw*ch);
253   // direct 5x5 stencil only on required pixels:
254 #ifdef _OPENMP
255   // DON'T parallelize the very smallest levels of the pyramid, as the threading overhead
256   // is greater than the time needed to do it sequentially
257 #pragma omp parallel for default(none) if (ch*cw>500)  \
258   dt_omp_firstprivate(coarse, cw, ch, input, w, wd) \
259   schedule(static) \
260   collapse(2)
261 #endif
262   for(int j=1;j<ch-1;j++)
263     for(int i=1;i<cw-1;i++)
264     {
265       for(int jj=-2;jj<=2;jj++)
266         for(int ii=-2;ii<=2;ii++)
267           coarse[j*cw+i] += input[(2*j+jj)*wd+2*i+ii] * w[ii+2] * w[jj+2];
268     }
269   ll_fill_boundary1(coarse, cw, ch);
270 }
271 
272 // allocate output buffer with monochrome brightness channel from input, padded
273 // up by max_supp on all four sides, dimensions written to wd2 ht2
ll_pad_input(const float * const input,const int wd,const int ht,const int max_supp,int * wd2,int * ht2,local_laplacian_boundary_t * b)274 static inline float *ll_pad_input(
275     const float *const input,
276     const int wd,
277     const int ht,
278     const int max_supp,
279     int *wd2,
280     int *ht2,
281     local_laplacian_boundary_t *b)
282 {
283   const int stride = 4;
284   *wd2 = 2*max_supp + wd;
285   *ht2 = 2*max_supp + ht;
286   float *const out = dt_alloc_align_float((size_t) *wd2 * *ht2);
287 
288   if(b && b->mode == 2)
289   { // pad by preview buffer
290 #ifdef _OPENMP
291 #pragma omp parallel for default(none) \
292     dt_omp_firstprivate(ht, input, max_supp, out, wd, stride) \
293     shared(wd2, ht2) \
294     schedule(static) \
295     collapse(2)
296 #endif // fill regular pixels:
297     for(int j=0;j<ht;j++) for(int i=0;i<wd;i++)
298       out[(j+max_supp)**wd2+i+max_supp] = input[stride*(wd*j+i)] * 0.01f; // L -> [0,1]
299 
300     // for all out of roi pixels on the boundary we wish to pad:
301     // compute coordinate in full image.
302     // if not out of buf:
303     //   compute padded preview pixel coordinate (clamp to padded preview buffer size)
304     // else
305     //   pad as usual (hi-res sample and hold)
306 #define LL_FILL(fallback) do {\
307     float isx = ((i - max_supp) + b->roi->x)/b->roi->scale;\
308     float isy = ((j - max_supp) + b->roi->y)/b->roi->scale;\
309     if(isx < 0 || isy >= b->buf->width\
310     || isy < 0 || isy >= b->buf->height)\
311       out[*wd2*j+i] = (fallback);\
312     else\
313     {\
314       int px = CLAMP(isx / (float)b->buf->width  * b->wd + (b->pwd-b->wd)/2, 0, b->pwd-1);\
315       int py = CLAMP(isy / (float)b->buf->height * b->ht + (b->pht-b->ht)/2, 0, b->pht-1);\
316       /* TODO: linear interpolation?*/\
317       out[*wd2*j+i] = b->pad0[b->pwd*py+px];\
318     } } while(0)
319 #ifdef _OPENMP
320 #pragma omp parallel for default(none) \
321     dt_omp_firstprivate(input, max_supp, out, wd, stride) \
322     shared(wd2, ht2, b) \
323     schedule(static) \
324     collapse(2)
325 #endif // left border
326     for(int j=max_supp;j<*ht2-max_supp;j++) for(int i=0;i<max_supp;i++)
327       LL_FILL(input[stride*wd*(j-max_supp)]* 0.01f);
328 #ifdef _OPENMP
329 #pragma omp parallel for default(none) \
330     dt_omp_firstprivate(input, max_supp, out, stride, wd) \
331     shared(wd2, ht2, b) \
332     schedule(static) \
333     collapse(2)
334 #endif // right border
335     for(int j=max_supp;j<*ht2-max_supp;j++) for(int i=wd+max_supp;i<*wd2;i++)
336       LL_FILL(input[stride*((j-max_supp)*wd+wd-1)] * 0.01f);
337 #ifdef _OPENMP
338 #pragma omp parallel for default(none) \
339     dt_omp_firstprivate(max_supp, out) \
340     shared(wd2, ht2, b) \
341     schedule(static) \
342     collapse(2)
343 #endif // top border
344     for(int j=0;j<max_supp;j++) for(int i=0;i<*wd2;i++)
345       LL_FILL(out[*wd2*max_supp+i]);
346 #ifdef _OPENMP
347 #pragma omp parallel for default(none) \
348     dt_omp_firstprivate(ht, max_supp, out) \
349     shared(wd2, ht2, b) \
350     schedule(static) \
351     collapse(2)
352 #endif // bottom border
353     for(int j=max_supp+ht;j<*ht2;j++) for(int i=0;i<*wd2;i++)
354       LL_FILL(out[*wd2*(max_supp+ht-1)+i]);
355 #undef LL_FILL
356   }
357   else
358   { // pad by replication:
359 #ifdef _OPENMP
360 #pragma omp parallel for default(none) \
361     dt_omp_firstprivate(input, ht, max_supp, out, wd, stride) \
362     shared(wd2, ht2) \
363     schedule(static)
364 #endif
365     for(int j=0;j<ht;j++)
366     {
367       for(int i=0;i<max_supp;i++)
368         out[(j+max_supp)**wd2+i] = input[stride*wd*j]* 0.01f; // L -> [0,1]
369       for(int i=0;i<wd;i++)
370         out[(j+max_supp)**wd2+i+max_supp] = input[stride*(wd*j+i)] * 0.01f; // L -> [0,1]
371       for(int i=wd+max_supp;i<*wd2;i++)
372         out[(j+max_supp)**wd2+i] = input[stride*(j*wd+wd-1)] * 0.01f; // L -> [0,1]
373     }
374     pad_by_replication(out, *wd2, *ht2, max_supp);
375   }
376 #ifdef DEBUG_DUMP
377   if(b && b->mode == 2)
378   {
379     dump_PFM("/tmp/padded.pfm",out,*wd2,*ht2);
380   }
381 #endif
382   return out;
383 }
384 
385 
ll_laplacian(const float * const coarse,const float * const fine,const int i,const int j,const int wd,const int ht)386 static inline float ll_laplacian(
387     const float *const coarse,   // coarse res gaussian
388     const float *const fine,     // fine res gaussian
389     const int i,                 // fine index
390     const int j,
391     const int wd,                // fine width
392     const int ht)                // fine height
393 {
394   const float c = ll_expand_gaussian(coarse,
395       CLAMPS(i, 1, ((wd-1)&~1)-1), CLAMPS(j, 1, ((ht-1)&~1)-1), wd, ht);
396   return fine[j*wd+i] - c;
397 }
398 
curve_scalar(const float x,const float g,const float sigma,const float shadows,const float highlights,const float clarity)399 static inline float curve_scalar(
400     const float x,
401     const float g,
402     const float sigma,
403     const float shadows,
404     const float highlights,
405     const float clarity)
406 {
407   const float c = x-g;
408   float val;
409   // blend in via quadratic bezier
410   if     (c >  2*sigma) val = g + sigma + shadows    * (c-sigma);
411   else if(c < -2*sigma) val = g - sigma + highlights * (c+sigma);
412   else if(c > 0.0f)
413   { // shadow contrast
414     const float t = CLAMPS(c / (2.0f*sigma), 0.0f, 1.0f);
415     const float t2 = t * t;
416     const float mt = 1.0f-t;
417     val = g + sigma * 2.0f*mt*t + t2*(sigma + sigma*shadows);
418   }
419   else
420   { // highlight contrast
421     const float t = CLAMPS(-c / (2.0f*sigma), 0.0f, 1.0f);
422     const float t2 = t * t;
423     const float mt = 1.0f-t;
424     val = g - sigma * 2.0f*mt*t + t2*(- sigma - sigma*highlights);
425   }
426   // midtone local contrast
427   val += clarity * c * expf(-c*c/(2.0*sigma*sigma/3.0f));
428   return val;
429 }
430 
431 #if defined(__SSE2__)
curve_vec4(const __m128 x,const __m128 g,const __m128 sigma,const __m128 shadows,const __m128 highlights,const __m128 clarity)432 static inline __m128 curve_vec4(
433     const __m128 x,
434     const __m128 g,
435     const __m128 sigma,
436     const __m128 shadows,
437     const __m128 highlights,
438     const __m128 clarity)
439 {
440   // TODO: pull these non-data dependent constants out of the loop to see
441   // whether the compiler fail to do so
442   const __m128 const0 = _mm_set_ps1(0x3f800000u);
443   const __m128 const1 = _mm_set_ps1((float)0x402DF854u); // for e^x
444   const __m128 sign_mask = _mm_set1_ps(-0.f); // -0.f = 1 << 31
445   const __m128 one = _mm_set1_ps(1.0f);
446   const __m128 two = _mm_set1_ps(2.0f);
447   const __m128 twothirds = _mm_set1_ps(2.0f/3.0f);
448   const __m128 twosig = _mm_mul_ps(two, sigma);
449   const __m128 sigma2 = _mm_mul_ps(sigma, sigma);
450   const __m128 s22 = _mm_mul_ps(twothirds, sigma2);
451 
452   const __m128 c = _mm_sub_ps(x, g);
453   const __m128 select = _mm_cmplt_ps(c, _mm_setzero_ps());
454   // select shadows or highlights as multiplier for linear part, based on c < 0
455   const __m128 shadhi = _mm_or_ps(_mm_andnot_ps(select, shadows), _mm_and_ps(select, highlights));
456   // flip sign bit of sigma based on c < 0 (c < 0 ? - sigma : sigma)
457   const __m128 ssigma = _mm_xor_ps(sigma, _mm_and_ps(select, sign_mask));
458   // this contains the linear parts valid for c > 2*sigma or c < - 2*sigma
459   const __m128 vlin = _mm_add_ps(g, _mm_add_ps(ssigma, _mm_mul_ps(shadhi, _mm_sub_ps(c, ssigma))));
460 
461   const __m128 t = _mm_min_ps(one, _mm_max_ps(_mm_setzero_ps(),
462         _mm_div_ps(c, _mm_mul_ps(two, ssigma))));
463   const __m128 t2 = _mm_mul_ps(t, t);
464   const __m128 mt = _mm_sub_ps(one, t);
465 
466   // midtone value fading over to linear part, without local contrast:
467   const __m128 vmid = _mm_add_ps(g,
468       _mm_add_ps(_mm_mul_ps(_mm_mul_ps(ssigma, two), _mm_mul_ps(mt, t)),
469         _mm_mul_ps(t2, _mm_add_ps(ssigma, _mm_mul_ps(ssigma, shadhi)))));
470 
471   // c > 2*sigma?
472   const __m128 linselect = _mm_cmpgt_ps(_mm_andnot_ps(sign_mask, c), twosig);
473   const __m128 val = _mm_or_ps(_mm_and_ps(linselect, vlin), _mm_andnot_ps(linselect, vmid));
474 
475   // midtone local contrast
476   // dt_fast_expf in sse:
477   const __m128 arg = _mm_xor_ps(sign_mask, _mm_div_ps(_mm_mul_ps(c, c), s22));
478   const __m128 k0 = _mm_add_ps(const0, _mm_mul_ps(arg, _mm_sub_ps(const1, const0)));
479   const __m128 k = _mm_max_ps(k0, _mm_setzero_ps());
480   const __m128i ki = _mm_cvtps_epi32(k);
481   const __m128 gauss = _mm_load_ps((float*)&ki);
482   const __m128 vcon = _mm_mul_ps(clarity, _mm_mul_ps(c, gauss));
483   return _mm_add_ps(val, vcon);
484 }
485 
486 // sse (4-wide)
apply_curve_sse2(float * const out,const float * const in,const uint32_t w,const uint32_t h,const uint32_t padding,const float g,const float sigma,const float shadows,const float highlights,const float clarity)487 void apply_curve_sse2(
488     float *const out,
489     const float *const in,
490     const uint32_t w,
491     const uint32_t h,
492     const uint32_t padding,
493     const float g,
494     const float sigma,
495     const float shadows,
496     const float highlights,
497     const float clarity)
498 {
499   // TODO: do all this in avx2 8-wide (should be straight forward):
500 #ifdef _OPENMP
501 #pragma omp parallel for default(none) \
502   dt_omp_firstprivate(clarity, g, h, highlights, in, out, padding, shadows, sigma, w) \
503   schedule(static)
504 #endif
505   for(uint32_t j=padding;j<h-padding;j++)
506   {
507     const float *in2  = in  + j*w + padding;
508     float *out2 = out + j*w + padding;
509     // find 4-byte aligned block in the middle:
510     const float *const beg = (float *)((size_t)(out2+3)&(size_t)0x10ul);
511     const float *const end = (float *)((size_t)(out2+w-padding)&(size_t)0x10ul);
512     const float *const fin = out2+w-padding;
513     const __m128 g4 = _mm_set1_ps(g);
514     const __m128 sig4 = _mm_set1_ps(sigma);
515     const __m128 shd4 = _mm_set1_ps(shadows);
516     const __m128 hil4 = _mm_set1_ps(highlights);
517     const __m128 clr4 = _mm_set1_ps(clarity);
518     for(;out2<beg;out2++,in2++)
519       *out2 = curve_scalar(*in2, g, sigma, shadows, highlights, clarity);
520     for(;out2<end;out2+=4,in2+=4)
521       _mm_stream_ps(out2, curve_vec4(_mm_load_ps(in2), g4, sig4, shd4, hil4, clr4));
522     for(;out2<fin;out2++,in2++)
523       *out2 = curve_scalar(*in2, g, sigma, shadows, highlights, clarity);
524     out2 = out + j*w;
525     for(int i=0;i<padding;i++)   out2[i] = out2[padding];
526     for(int i=w-padding;i<w;i++) out2[i] = out2[w-padding-1];
527   }
528   pad_by_replication(out, w, h, padding);
529 }
530 #endif
531 
532 // scalar version
apply_curve(float * const out,const float * const in,const uint32_t w,const uint32_t h,const uint32_t padding,const float g,const float sigma,const float shadows,const float highlights,const float clarity)533 void apply_curve(
534     float *const out,
535     const float *const in,
536     const uint32_t w,
537     const uint32_t h,
538     const uint32_t padding,
539     const float g,
540     const float sigma,
541     const float shadows,
542     const float highlights,
543     const float clarity)
544 {
545 #ifdef _OPENMP
546 #pragma omp parallel for default(none) \
547   dt_omp_firstprivate(clarity, g, h, highlights, in, out, padding, sigma, shadows, w) \
548   schedule(static)
549 #endif
550   for(uint32_t j=padding;j<h-padding;j++)
551   {
552     const float *in2  = in  + j*w + padding;
553     float *out2 = out + j*w + padding;
554     for(uint32_t i=padding;i<w-padding;i++)
555       (*out2++) = curve_scalar(*(in2++), g, sigma, shadows, highlights, clarity);
556     out2 = out + j*w;
557     for(int i=0;i<padding;i++)   out2[i] = out2[padding];
558     for(int i=w-padding;i<w;i++) out2[i] = out2[w-padding-1];
559   }
560   pad_by_replication(out, w, h, padding);
561 }
562 
local_laplacian_internal(const float * const input,float * const out,const int wd,const int ht,const float sigma,const float shadows,const float highlights,const float clarity,const int use_sse2,local_laplacian_boundary_t * b)563 void local_laplacian_internal(
564     const float *const input,   // input buffer in some Labx or yuvx format
565     float *const out,           // output buffer with colour
566     const int wd,               // width and
567     const int ht,               // height of the input buffer
568     const float sigma,          // user param: separate shadows/mid-tones/highlights
569     const float shadows,        // user param: lift shadows
570     const float highlights,     // user param: compress highlights
571     const float clarity,        // user param: increase clarity/local contrast
572     const int use_sse2,         // flag whether to use SSE version
573     local_laplacian_boundary_t *b)
574 {
575   if(wd <= 1 || ht <= 1) return;
576 
577   // don't divide by 2 more often than we can:
578   const int num_levels = MIN(max_levels, 31-__builtin_clz(MIN(wd,ht)));
579   int last_level = num_levels-1;
580   if(b && b->mode == 2) // higher number here makes it less prone to aliasing and slower.
581     last_level = num_levels > 4 ? 4 : num_levels-1;
582   const int max_supp = 1<<last_level;
583   int w, h;
584   float *padded[max_levels] = {0};
585   if(b && b->mode == 2)
586     padded[0] = ll_pad_input(input, wd, ht, max_supp, &w, &h, b);
587   else
588     padded[0] = ll_pad_input(input, wd, ht, max_supp, &w, &h, 0);
589 
590   // allocate pyramid pointers for padded input
591   for(int l=1;l<=last_level;l++)
592     padded[l] = dt_alloc_align_float((size_t)dl(w,l) * dl(h,l));
593 
594   // allocate pyramid pointers for output
595   float *output[max_levels] = {0};
596   for(int l=0;l<=last_level;l++)
597     output[l] = dt_alloc_align_float((size_t)dl(w,l) * dl(h,l));
598 
599   // create gauss pyramid of padded input, write coarse directly to output
600 #if defined(__SSE2__)
601   if(use_sse2)
602   {
603     for(int l=1;l<last_level;l++)
604       gauss_reduce_sse2(padded[l-1], padded[l], dl(w,l-1), dl(h,l-1));
605     gauss_reduce_sse2(padded[last_level-1], output[last_level], dl(w,last_level-1), dl(h,last_level-1));
606   }
607   else
608 #endif
609   {
610     for(int l=1;l<last_level;l++)
611       gauss_reduce(padded[l-1], padded[l], dl(w,l-1), dl(h,l-1));
612     gauss_reduce(padded[last_level-1], output[last_level], dl(w,last_level-1), dl(h,last_level-1));
613   }
614 
615   // evenly sample brightness [0,1]:
616   float gamma[num_gamma] = {0.0f};
617   for(int k=0;k<num_gamma;k++) gamma[k] = (k+.5f)/(float)num_gamma;
618   // for(int k=0;k<num_gamma;k++) gamma[k] = k/(num_gamma-1.0f);
619 
620   // allocate memory for intermediate laplacian pyramids
621   float *buf[num_gamma][max_levels] = {{0}};
622   for(int k=0;k<num_gamma;k++) for(int l=0;l<=last_level;l++)
623     buf[k][l] = dt_alloc_align_float((size_t)dl(w,l)*dl(h,l));
624 
625   // the paper says remapping only level 3 not 0 does the trick, too
626   // (but i really like the additional octave of sharpness we get,
627   // willing to pay the cost).
628   for(int k=0;k<num_gamma;k++)
629   { // process images
630 #if defined(__SSE2__)
631     if(use_sse2)
632       apply_curve_sse2(buf[k][0], padded[0], w, h, max_supp, gamma[k], sigma, shadows, highlights, clarity);
633     else // brackets in next line needed for silly gcc warning:
634 #endif
635     {apply_curve(buf[k][0], padded[0], w, h, max_supp, gamma[k], sigma, shadows, highlights, clarity);}
636 
637     // create gaussian pyramids
638     for(int l=1;l<=last_level;l++)
639 #if defined(__SSE2__)
640       if(use_sse2)
641         gauss_reduce_sse2(buf[k][l-1], buf[k][l], dl(w,l-1), dl(h,l-1));
642       else
643 #endif
644         gauss_reduce(buf[k][l-1], buf[k][l], dl(w,l-1), dl(h,l-1));
645   }
646 
647   // resample output[last_level] from preview
648   // requires to transform from padded/downsampled to full image and then
649   // to padded/downsampled in preview
650   if(b && b->mode == 2)
651   {
652     const float isize = powf(2.0f, last_level) / b->roi->scale; // pixel size of coarsest level in image space
653     const float psize = isize / b->buf->width * b->wd; // pixel footprint rescaled to preview buffer
654     const float pl = log2f(psize); // mip level in preview buffer
655     const int pl0 = CLAMP((int)pl, 0, b->num_levels-1), pl1 = CLAMP((int)(pl+1), 0, b->num_levels-1);
656     const float weight = CLAMP(pl-pl0, 0, 1); // weight between mip levels
657     const float mul0 = 1.0/powf(2.0f, pl0);
658     const float mul1 = 1.0/powf(2.0f, pl1);
659     const float mul = powf(2.0f, last_level);
660     const int pw = dl(w,last_level), ph = dl(h,last_level);
661     const int pw0 = dl(b->pwd, pl0), ph0 = dl(b->pht, pl0);
662     const int pw1 = dl(b->pwd, pl1), ph1 = dl(b->pht, pl1);
663     debug_dump_PFM("/tmp/coarse.pfm", b->output[pl0], pw0, ph0);
664     debug_dump_PFM("/tmp/oldcoarse.pfm", output[last_level], pw, ph);
665 #ifdef _OPENMP
666 #pragma omp parallel for schedule(static) collapse(2) default(shared)
667 #endif
668     for(int j=0;j<ph;j++) for(int i=0;i<pw;i++)
669     {
670       // image coordinates in full buffer
671       float ix = ((i*mul - max_supp) + b->roi->x)/b->roi->scale;
672       float iy = ((j*mul - max_supp) + b->roi->y)/b->roi->scale;
673       // coordinates in padded preview buffer (
674       float px = CLAMP(ix / (float)b->buf->width  * b->wd + (b->pwd-b->wd)/2.0f, 0, b->pwd);
675       float py = CLAMP(iy / (float)b->buf->height * b->ht + (b->pht-b->ht)/2.0f, 0, b->pht);
676       // trilinear lookup:
677       int px0 = CLAMP(px*mul0, 0, pw0-1);
678       int py0 = CLAMP(py*mul0, 0, ph0-1);
679       int px1 = CLAMP(px*mul1, 0, pw1-1);
680       int py1 = CLAMP(py*mul1, 0, ph1-1);
681 #if 1
682       float f0x = CLAMP(px*mul0 - px0, 0.0f, 1.0f);
683       float f0y = CLAMP(py*mul0 - py0, 0.0f, 1.0f);
684       float f1x = CLAMP(px*mul1 - px1, 0.0f, 1.0f);
685       float f1y = CLAMP(py*mul1 - py1, 0.0f, 1.0f);
686       float c0 =
687         (1.0f-f0x)*(1.0f-f0y)*b->output[pl0][CLAMP(py0  , 0, ph0-1)*pw0 + CLAMP(px0  , 0, pw0-1)]+
688         (     f0x)*(1.0f-f0y)*b->output[pl0][CLAMP(py0  , 0, ph0-1)*pw0 + CLAMP(px0+1, 0, pw0-1)]+
689         (1.0f-f0x)*(     f0y)*b->output[pl0][CLAMP(py0+1, 0, ph0-1)*pw0 + CLAMP(px0  , 0, pw0-1)]+
690         (     f0x)*(     f0y)*b->output[pl0][CLAMP(py0+1, 0, ph0-1)*pw0 + CLAMP(px0+1, 0, pw0-1)];
691       float c1 =
692         (1.0f-f1x)*(1.0f-f1y)*b->output[pl1][CLAMP(py1  , 0, ph1-1)*pw1 + CLAMP(px1  , 0, pw1-1)]+
693         (     f1x)*(1.0f-f1y)*b->output[pl1][CLAMP(py1  , 0, ph1-1)*pw1 + CLAMP(px1+1, 0, pw1-1)]+
694         (1.0f-f1x)*(     f1y)*b->output[pl1][CLAMP(py1+1, 0, ph1-1)*pw1 + CLAMP(px1  , 0, pw1-1)]+
695         (     f1x)*(     f1y)*b->output[pl1][CLAMP(py1+1, 0, ph1-1)*pw1 + CLAMP(px1+1, 0, pw1-1)];
696 #else
697       float c0 = b->output[pl0][py0*pw0 + px0];
698       float c1 = b->output[pl1][py1*pw1 + px1];
699 #endif
700       output[last_level][j*pw+i] = weight * c1 + (1.0f-weight) * c0;
701     }
702     debug_dump_PFM("/tmp/newcoarse.pfm", output[last_level], pw, ph);
703   }
704 
705   // assemble output pyramid coarse to fine
706   for(int l=last_level-1;l >= 0; l--)
707   {
708     const int pw = dl(w,l), ph = dl(h,l);
709 
710     gauss_expand(output[l+1], output[l], pw, ph);
711     // go through all coefficients in the upsampled gauss buffer:
712 #ifdef _OPENMP
713 #pragma omp parallel for default(none) \
714     dt_omp_firstprivate(ph, pw) \
715     shared(w,h,buf,output,l,gamma,padded) \
716     schedule(static) \
717     collapse(2)
718 #endif
719     for(int j=0;j<ph;j++) for(int i=0;i<pw;i++)
720     {
721       const float v = padded[l][j*pw+i];
722       int hi = 1;
723       for(;hi<num_gamma-1 && gamma[hi] <= v;hi++);
724       int lo = hi-1;
725       const float a = CLAMPS((v - gamma[lo])/(gamma[hi]-gamma[lo]), 0.0f, 1.0f);
726       const float l0 = ll_laplacian(buf[lo][l+1], buf[lo][l], i, j, pw, ph);
727       const float l1 = ll_laplacian(buf[hi][l+1], buf[hi][l], i, j, pw, ph);
728       output[l][j*pw+i] += l0 * (1.0f-a) + l1 * a;
729       // we could do this to save on memory (no need for finest buf[][]).
730       // unfortunately it results in a quite noticeable loss of sharpness, i think
731       // the extra level is worth it.
732       // else if(l == 0) // use finest scale from input to not amplify noise (and use less memory)
733       //   output[l][j*pw+i] += ll_laplacian(padded[l+1], padded[l], i, j, pw, ph);
734     }
735   }
736 #ifdef _OPENMP
737 #pragma omp parallel for default(none) \
738   dt_omp_firstprivate(ht, input, max_supp, out, wd) \
739   shared(w,output,buf) \
740   schedule(static) \
741   collapse(2)
742 #endif
743   for(int j=0;j<ht;j++) for(int i=0;i<wd;i++)
744   {
745     out[4*(j*wd+i)+0] = 100.0f * output[0][(j+max_supp)*w+max_supp+i]; // [0,1] -> L
746     out[4*(j*wd+i)+1] = input[4*(j*wd+i)+1]; // copy original colour channels
747     out[4*(j*wd+i)+2] = input[4*(j*wd+i)+2];
748   }
749   if(b && b->mode == 1)
750   { // output the buffers for later re-use
751     b->pad0 = padded[0];
752     b->wd = wd;
753     b->ht = ht;
754     b->pwd = w;
755     b->pht = h;
756     b->num_levels = num_levels;
757     for(int l=0;l<num_levels;l++) b->output[l] = output[l];
758   }
759   // free all buffers except the ones passed out for preview rendering
760   for(int l=0;l<max_levels;l++)
761   {
762     if(!b || b->mode != 1 || l)   dt_free_align(padded[l]);
763     if(!b || b->mode != 1)        dt_free_align(output[l]);
764     for(int k=0; k<num_gamma;k++) dt_free_align(buf[k][l]);
765   }
766 }
767 
768 
local_laplacian_memory_use(const int width,const int height)769 size_t local_laplacian_memory_use(const int width,     // width of input image
770                                   const int height)    // height of input image
771 {
772   const int num_levels = MIN(max_levels, 31-__builtin_clz(MIN(width,height)));
773   const int max_supp = 1<<(num_levels-1);
774   const int paddwd = width  + 2*max_supp;
775   const int paddht = height + 2*max_supp;
776 
777   size_t memory_use = 0;
778 
779   for(int l=0;l<num_levels;l++)
780     memory_use += sizeof(float) * (2 + num_gamma) * dl(paddwd, l) * dl(paddht, l);
781 
782   return memory_use;
783 }
784 
local_laplacian_singlebuffer_size(const int width,const int height)785 size_t local_laplacian_singlebuffer_size(const int width,     // width of input image
786                                          const int height)    // height of input image
787 {
788   const int num_levels = MIN(max_levels, 31-__builtin_clz(MIN(width,height)));
789   const int max_supp = 1<<(num_levels-1);
790   const int paddwd = width  + 2*max_supp;
791   const int paddht = height + 2*max_supp;
792 
793   return sizeof(float) * dl(paddwd, 0) * dl(paddht, 0);
794 }
795