/*
    This file is part of darktable,
    Copyright (C) 2016-2021 darktable developers.

    darktable is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    darktable is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with darktable. If not, see <http://www.gnu.org/licenses/>.
*/

#include "common/darktable.h"
#include "common/locallaplacian.h"
#include "common/math.h"

#include <string.h>
#include <stdint.h>
#include <stdlib.h>
#include <assert.h>
#include <stdio.h>
#if defined(__SSE2__)
#include <xmmintrin.h>
#endif

// the maximum number of levels for the gaussian pyramid
#define max_levels 30
// the number of segments for the piecewise linear interpolation
#define num_gamma 6
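// this follows the fast local laplacian filters idea: the remapping curve is
// only evaluated at num_gamma fixed brightness levels, one full laplacian
// pyramid is built per level, and the output coefficients are linearly
// interpolated between the two nearest levels in the assembly loop at the
// end of local_laplacian_internal().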

//#define DEBUG_DUMP

// downsample width/height to given level
static inline int dl(int size, const int level)
{
  for(int l=0;l<level;l++)
    size = (size-1)/2+1;
  return size;
}
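// for example a 9 px wide buffer shrinks as 9 -> 5 -> 3 -> 2 over levels 0..3;
// dl() is used throughout to compute the buffer dimensions of pyramid level l.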

#ifdef DEBUG_DUMP
static void dump_PFM(const char *filename, const float* out, const uint32_t w, const uint32_t h)
{
  FILE *f = g_fopen(filename, "wb");
  fprintf(f, "PF\n%d %d\n-1.0\n", w, h);
  for(int j=0;j<h;j++)
    for(int i=0;i<w;i++)
      for(int c=0;c<3;c++)
        fwrite(out + w*j+i, 1, sizeof(float), f);
  fclose(f);
}
#define debug_dump_PFM dump_PFM
#else
#define debug_dump_PFM(f,b,w,h)
#endif

// needs a boundary of 1 or 2px around i,j or else it will crash.
// (translates to a 1px boundary around the corresponding pixel in the coarse buffer)
// more precisely, 1<=i<wd-1 for even wd and
// 1<=i<wd-2 for odd wd (j likewise with ht)
static inline float ll_expand_gaussian(
    const float *const coarse,
    const int i,
    const int j,
    const int wd,
    const int ht)
{
  assert(i > 0);
  assert(i < wd-1);
  assert(j > 0);
  assert(j < ht-1);
  assert(j/2 + 1 < (ht-1)/2+1);
  assert(i/2 + 1 < (wd-1)/2+1);
  const int cw = (wd-1)/2+1;
  const int ind = (j/2)*cw+i/2;
  // case 0:      case 1:      case 2:      case 3:
  //  x . x . x    x . x . x    x . x . x    x . x . x
  //  . . . . .    . . . . .    . .[.]. .    .[.]. . .
  //  x .[x]. x    x[.]x . x    x . x . x    x . x . x
  //  . . . . .    . . . . .    . . . . .    . . . . .
  //  x . x . x    x . x . x    x . x . x    x . x . x
  switch((i&1) + 2*(j&1))
  {
    case 0: // both are even, 3x3 stencil
      return 4./256. * (
          6.0f*(coarse[ind-cw] + coarse[ind-1] + 6.0f*coarse[ind] + coarse[ind+1] + coarse[ind+cw])
          + coarse[ind-cw-1] + coarse[ind-cw+1] + coarse[ind+cw-1] + coarse[ind+cw+1]);
    case 1: // i is odd, 2x3 stencil
      return 4./256. * (
          24.0*(coarse[ind] + coarse[ind+1]) +
          4.0*(coarse[ind-cw] + coarse[ind-cw+1] + coarse[ind+cw] + coarse[ind+cw+1]));
    case 2: // j is odd, 3x2 stencil
      return 4./256. * (
          24.0*(coarse[ind] + coarse[ind+cw]) +
          4.0*(coarse[ind-1] + coarse[ind+1] + coarse[ind+cw-1] + coarse[ind+cw+1]));
    default: // case 3: // both are odd, 2x2 stencil
      return .25f * (coarse[ind] + coarse[ind+1] + coarse[ind+cw] + coarse[ind+cw+1]);
  }
}
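// a note on the weights above: upsampling by two and blurring with the 5-tap
// binomial kernel (1 4 6 4 1)/16 means each fine pixel only sees the taps that
// land on actual coarse samples; the factor 4 makes up for the taps landing on
// the zero in-between positions, so every stencil above sums to 256/4 = 64 and
// the interpolation stays normalized.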

// helper to fill in one pixel boundary by copying it
static inline void ll_fill_boundary1(
    float *const input,
    const int wd,
    const int ht)
{
  for(int j=1;j<ht-1;j++) input[j*wd] = input[j*wd+1];
  for(int j=1;j<ht-1;j++) input[j*wd+wd-1] = input[j*wd+wd-2];
  memcpy(input, input+wd, sizeof(float)*wd);
  memcpy(input+wd*(ht-1), input+wd*(ht-2), sizeof(float)*wd);
}

// helper to fill in two pixels boundary by copying it
static inline void ll_fill_boundary2(
    float *const input,
    const int wd,
    const int ht)
{
  for(int j=1;j<ht-1;j++) input[j*wd] = input[j*wd+1];
  if(wd & 1) for(int j=1;j<ht-1;j++) input[j*wd+wd-1] = input[j*wd+wd-2];
  else       for(int j=1;j<ht-1;j++) input[j*wd+wd-1] = input[j*wd+wd-2] = input[j*wd+wd-3];
  memcpy(input, input+wd, sizeof(float)*wd);
  if(!(ht & 1)) memcpy(input+wd*(ht-2), input+wd*(ht-3), sizeof(float)*wd);
  memcpy(input+wd*(ht-1), input+wd*(ht-2), sizeof(float)*wd);
}
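// gauss_expand() below only writes 1 <= i < ((wd-1)&~1), so for even wd the
// two rightmost columns (and for even ht the two bottom rows) are never
// touched by ll_expand_gaussian(); ll_fill_boundary2() replicates those
// afterwards, while the one pixel variant is enough after the reduce step.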

static void pad_by_replication(
    float *buf,             // the buffer to be padded
    const uint32_t w,       // width of a line
    const uint32_t h,       // total height, including top and bottom padding
    const uint32_t padding) // number of lines of padding on each side
{
#ifdef _OPENMP
#pragma omp parallel for default(none) \
  dt_omp_firstprivate(buf, padding, h, w) \
  schedule(static)
#endif
  for(int j=0;j<padding;j++)
  {
    memcpy(buf + w*j, buf+padding*w, sizeof(float)*w);
    memcpy(buf + w*(h-padding+j), buf+w*(h-padding-1), sizeof(float)*w);
  }
}

static inline void gauss_expand(
    const float *const input, // coarse input
    float *const fine,        // upsampled, blurry output
    const int wd,             // fine res
    const int ht)
{
#ifdef _OPENMP
#pragma omp parallel for default(none) \
  dt_omp_firstprivate(fine, input, wd, ht) \
  schedule(static) \
  collapse(2)
#endif
  for(int j=1;j<((ht-1)&~1);j++) // even ht: two px boundary. odd ht: one px.
    for(int i=1;i<((wd-1)&~1);i++)
      fine[j*wd+i] = ll_expand_gaussian(input, i, j, wd, ht);
  ll_fill_boundary2(fine, wd, ht);
}

#if defined(__SSE2__)
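// vertical 1-4-6-4-1 convolution of four adjacent columns at once: load five
// consecutive rows of four floats each and return the weighted column sums, so
// the caller only has to finish the horizontal 1-4-6-4-1 pass.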
static inline __m128 convolve14641_vert(const float *in, const int wd)
{
  const dt_aligned_pixel_t four = { 4.f, 4.f, 4.f, 4.f };
  __m128 r0 = _mm_loadu_ps(in);
  __m128 r1 = _mm_loadu_ps(in + wd);
  __m128 r2 = _mm_loadu_ps(in + 2*wd);
  __m128 r3 = _mm_loadu_ps(in + 3*wd);
  __m128 r4 = _mm_loadu_ps(in + 4*wd);
  _mm_prefetch(in+4,_MM_HINT_NTA); // prefetch next column, which won't be used again afterwards
  r0 = _mm_add_ps(r0, r4);         // r0 = r0+r4
  _mm_prefetch(in+4+wd,_MM_HINT_NTA); // prefetch next column, which won't be used again afterwards
  r1 = _mm_add_ps(_mm_add_ps(r1,r3), r2); // r1 = r1+r2+r3
  _mm_prefetch(in+4+2*wd,_MM_HINT_T0);
  r0 = _mm_add_ps(r0, _mm_add_ps(r2, r2)); // r0 = r0+2*r2+r4
  _mm_prefetch(in+4+3*wd, _MM_HINT_T0);
  __m128 t = _mm_mul_ps(r1, _mm_load_ps(four)); // t = 4*r1+4*r2+4*r3
  _mm_prefetch(in+4+4*wd, _MM_HINT_T0);
  return _mm_add_ps(r0, t); // r0+4*r1+6*r2+4*r3+r4
}
#endif

#if defined(__SSE2__)
static inline void gauss_reduce_sse2(
    const float *const input, // fine input buffer
    float *const coarse,      // coarse scale, blurred input buf
    const int wd,             // fine res
    const int ht)
{
  // blur, store only coarse res
  const int cw = (wd-1)/2+1, ch = (ht-1)/2+1;

#ifdef _OPENMP
  // DON'T parallelize the very smallest levels of the pyramid, as the threading overhead
  // is greater than the time needed to do it sequentially
#pragma omp parallel for default(none) if (ch*cw>1000) \
  dt_omp_firstprivate(cw, ch, input, wd, coarse) \
  schedule(static)
#endif
  for(int j=1;j<ch-1;j++)
  {
    const float *base = input + 2*(j-1)*wd;
    float *const out = coarse + j*cw + 1;
    // prime the vertical axis
    const __m128 kernel = _mm_setr_ps(1.f, 4.f, 6.f, 4.f);
    __m128 left = convolve14641_vert(base,wd);
    for(int col=0; col<cw-3; col+=2)
    {
      // convolve the next four pixel wide vertical slice
      base += 4;
      __m128 right = convolve14641_vert(base,wd);
      // horizontal pass, generate two output values from convolving with 1 4 6 4 1
      // the first uses pixels 0-4, the second uses 2-6
      __m128 conv = _mm_mul_ps(left,kernel);
      out[col] = (conv[0] + conv[1] + conv[2] + conv[3] + right[0]) / 256.f;
      out[col+1] = (left[2] + 4*(left[3]+right[1]) + 6*right[0] + right[2]) / 256.f;
      // shift to next pair of output columns (four input columns)
      left = right;
    }
    // handle the left-over pixel if the output size is odd
    if (cw % 2)
    {
      base += 4;
      float right = base[0] + 4*(base[wd]+base[3*wd]) + 6*base[2*wd] + base[4*wd];
      __m128 conv = _mm_mul_ps(left,kernel);
      out[cw-3] = (conv[0] + conv[1] + conv[2] + conv[3] + right) / 256.f;
    }
  }
  ll_fill_boundary1(coarse, cw, ch);
}
#endif

static inline void gauss_reduce(
    const float *const input, // fine input buffer
    float *const coarse,      // coarse scale, blurred input buf
    const int wd,             // fine res
    const int ht)
{
  // blur, store only coarse res
  const int cw = (wd-1)/2+1, ch = (ht-1)/2+1;

  // this is the scalar (non-simd) code:
  const float w[5] = { 1.f/16.f, 4.f/16.f, 6.f/16.f, 4.f/16.f, 1.f/16.f };
  memset(coarse, 0, sizeof(float)*cw*ch);
  // direct 5x5 stencil only on required pixels:
#ifdef _OPENMP
  // DON'T parallelize the very smallest levels of the pyramid, as the threading overhead
  // is greater than the time needed to do it sequentially
#pragma omp parallel for default(none) if (ch*cw>500) \
  dt_omp_firstprivate(coarse, cw, ch, input, w, wd) \
  schedule(static) \
  collapse(2)
#endif
  for(int j=1;j<ch-1;j++)
    for(int i=1;i<cw-1;i++)
    {
      for(int jj=-2;jj<=2;jj++)
        for(int ii=-2;ii<=2;ii++)
          coarse[j*cw+i] += input[(2*j+jj)*wd+2*i+ii] * w[ii+2] * w[jj+2];
    }
  ll_fill_boundary1(coarse, cw, ch);
}
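// both reduce variants compute the same thing: every coarse pixel (i,j) is the
// fine buffer sampled at (2i,2j) and blurred with the separable 5x5 binomial
// kernel, i.e. the outer product of (1 4 6 4 1)/16 with itself -- hence the
// single /256 in the sse2 path, which folds both normalisations into one divide.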

// allocate output buffer with monochrome brightness channel from input, padded
// up by max_supp on all four sides, dimensions written to wd2 ht2
static inline float *ll_pad_input(
    const float *const input,
    const int wd,
    const int ht,
    const int max_supp,
    int *wd2,
    int *ht2,
    local_laplacian_boundary_t *b)
{
  const int stride = 4;
  *wd2 = 2*max_supp + wd;
  *ht2 = 2*max_supp + ht;
  float *const out = dt_alloc_align_float((size_t) *wd2 * *ht2);

  if(b && b->mode == 2)
  { // pad by preview buffer
#ifdef _OPENMP
#pragma omp parallel for default(none) \
    dt_omp_firstprivate(ht, input, max_supp, out, wd, stride) \
    shared(wd2, ht2) \
    schedule(static) \
    collapse(2)
#endif // fill regular pixels:
    for(int j=0;j<ht;j++) for(int i=0;i<wd;i++)
      out[(j+max_supp)**wd2+i+max_supp] = input[stride*(wd*j+i)] * 0.01f; // L -> [0,1]

    // for all out of roi pixels on the boundary we wish to pad:
    // compute coordinate in full image.
    // if not out of buf:
    //   compute padded preview pixel coordinate (clamp to padded preview buffer size)
    // else
    //   pad as usual (hi-res sample and hold)
#define LL_FILL(fallback) do {\
    float isx = ((i - max_supp) + b->roi->x)/b->roi->scale;\
    float isy = ((j - max_supp) + b->roi->y)/b->roi->scale;\
    if(isx < 0 || isx >= b->buf->width\
    || isy < 0 || isy >= b->buf->height)\
      out[*wd2*j+i] = (fallback);\
    else\
    {\
      int px = CLAMP(isx / (float)b->buf->width  * b->wd + (b->pwd-b->wd)/2, 0, b->pwd-1);\
      int py = CLAMP(isy / (float)b->buf->height * b->ht + (b->pht-b->ht)/2, 0, b->pht-1);\
      /* TODO: linear interpolation?*/\
      out[*wd2*j+i] = b->pad0[b->pwd*py+px];\
    } } while(0)
#ifdef _OPENMP
#pragma omp parallel for default(none) \
    dt_omp_firstprivate(input, max_supp, out, wd, stride) \
    shared(wd2, ht2, b) \
    schedule(static) \
    collapse(2)
#endif // left border
    for(int j=max_supp;j<*ht2-max_supp;j++) for(int i=0;i<max_supp;i++)
      LL_FILL(input[stride*wd*(j-max_supp)]* 0.01f);
#ifdef _OPENMP
#pragma omp parallel for default(none) \
    dt_omp_firstprivate(input, max_supp, out, stride, wd) \
    shared(wd2, ht2, b) \
    schedule(static) \
    collapse(2)
#endif // right border
    for(int j=max_supp;j<*ht2-max_supp;j++) for(int i=wd+max_supp;i<*wd2;i++)
      LL_FILL(input[stride*((j-max_supp)*wd+wd-1)] * 0.01f);
#ifdef _OPENMP
#pragma omp parallel for default(none) \
    dt_omp_firstprivate(max_supp, out) \
    shared(wd2, ht2, b) \
    schedule(static) \
    collapse(2)
#endif // top border
    for(int j=0;j<max_supp;j++) for(int i=0;i<*wd2;i++)
      LL_FILL(out[*wd2*max_supp+i]);
#ifdef _OPENMP
#pragma omp parallel for default(none) \
    dt_omp_firstprivate(ht, max_supp, out) \
    shared(wd2, ht2, b) \
    schedule(static) \
    collapse(2)
#endif // bottom border
    for(int j=max_supp+ht;j<*ht2;j++) for(int i=0;i<*wd2;i++)
      LL_FILL(out[*wd2*(max_supp+ht-1)+i]);
#undef LL_FILL
  }
  else
  { // pad by replication:
#ifdef _OPENMP
#pragma omp parallel for default(none) \
    dt_omp_firstprivate(input, ht, max_supp, out, wd, stride) \
    shared(wd2, ht2) \
    schedule(static)
#endif
    for(int j=0;j<ht;j++)
    {
      for(int i=0;i<max_supp;i++)
        out[(j+max_supp)**wd2+i] = input[stride*wd*j]* 0.01f; // L -> [0,1]
      for(int i=0;i<wd;i++)
        out[(j+max_supp)**wd2+i+max_supp] = input[stride*(wd*j+i)] * 0.01f; // L -> [0,1]
      for(int i=wd+max_supp;i<*wd2;i++)
        out[(j+max_supp)**wd2+i] = input[stride*(j*wd+wd-1)] * 0.01f; // L -> [0,1]
    }
    pad_by_replication(out, *wd2, *ht2, max_supp);
  }
#ifdef DEBUG_DUMP
  if(b && b->mode == 2)
  {
    dump_PFM("/tmp/padded.pfm",out,*wd2,*ht2);
  }
#endif
  return out;
}


static inline float ll_laplacian(
    const float *const coarse, // coarse res gaussian
    const float *const fine,   // fine res gaussian
    const int i,               // fine index
    const int j,
    const int wd,              // fine width
    const int ht)              // fine height
{
  const float c = ll_expand_gaussian(coarse,
      CLAMPS(i, 1, ((wd-1)&~1)-1), CLAMPS(j, 1, ((ht-1)&~1)-1), wd, ht);
  return fine[j*wd+i] - c;
}

static inline float curve_scalar(
    const float x,
    const float g,
    const float sigma,
    const float shadows,
    const float highlights,
    const float clarity)
{
  const float c = x-g;
  float val;
  // blend in via quadratic bezier
  if     (c >  2*sigma) val = g + sigma + shadows    * (c-sigma);
  else if(c < -2*sigma) val = g - sigma + highlights * (c+sigma);
  else if(c > 0.0f)
  { // shadow contrast
    const float t = CLAMPS(c / (2.0f*sigma), 0.0f, 1.0f);
    const float t2 = t * t;
    const float mt = 1.0f-t;
    val = g + sigma * 2.0f*mt*t + t2*(sigma + sigma*shadows);
  }
  else
  { // highlight contrast
    const float t = CLAMPS(-c / (2.0f*sigma), 0.0f, 1.0f);
    const float t2 = t * t;
    const float mt = 1.0f-t;
    val = g - sigma * 2.0f*mt*t + t2*(- sigma - sigma*highlights);
  }
  // midtone local contrast
  val += clarity * c * expf(-c*c/(2.0*sigma*sigma/3.0f));
  return val;
}
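// the curve above is the per-level remapping function: within +/- 2*sigma of
// the reference brightness g it follows a quadratic bezier that blends from
// slope 1 at g over to the shadows/highlights slope, outside that range it
// continues linearly with that slope, and the clarity term adds a
// gaussian-windowed boost of the detail c = x-g for midtone local contrast.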

#if defined(__SSE2__)
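// same curve, four pixels at a time: the c < 0 case is handled branchlessly by
// flipping the sign bit of sigma and selecting shadows vs. highlights with
// masks, and expf() is replaced by the usual bit-level approximation (as in
// dt_fast_expf) which blends the integer representations of 1.0f and e and
// reinterprets the result as a float.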
static inline __m128 curve_vec4(
    const __m128 x,
    const __m128 g,
    const __m128 sigma,
    const __m128 shadows,
    const __m128 highlights,
    const __m128 clarity)
{
  // TODO: pull these non-data-dependent constants out of the loop to see
  // whether the compiler fails to do so
  const __m128 const0 = _mm_set_ps1(0x3f800000u);
  const __m128 const1 = _mm_set_ps1((float)0x402DF854u); // for e^x
  const __m128 sign_mask = _mm_set1_ps(-0.f); // -0.f = 1 << 31
  const __m128 one = _mm_set1_ps(1.0f);
  const __m128 two = _mm_set1_ps(2.0f);
  const __m128 twothirds = _mm_set1_ps(2.0f/3.0f);
  const __m128 twosig = _mm_mul_ps(two, sigma);
  const __m128 sigma2 = _mm_mul_ps(sigma, sigma);
  const __m128 s22 = _mm_mul_ps(twothirds, sigma2);

  const __m128 c = _mm_sub_ps(x, g);
  const __m128 select = _mm_cmplt_ps(c, _mm_setzero_ps());
  // select shadows or highlights as multiplier for linear part, based on c < 0
  const __m128 shadhi = _mm_or_ps(_mm_andnot_ps(select, shadows), _mm_and_ps(select, highlights));
  // flip sign bit of sigma based on c < 0 (c < 0 ? - sigma : sigma)
  const __m128 ssigma = _mm_xor_ps(sigma, _mm_and_ps(select, sign_mask));
  // this contains the linear parts valid for c > 2*sigma or c < - 2*sigma
  const __m128 vlin = _mm_add_ps(g, _mm_add_ps(ssigma, _mm_mul_ps(shadhi, _mm_sub_ps(c, ssigma))));

  const __m128 t = _mm_min_ps(one, _mm_max_ps(_mm_setzero_ps(),
        _mm_div_ps(c, _mm_mul_ps(two, ssigma))));
  const __m128 t2 = _mm_mul_ps(t, t);
  const __m128 mt = _mm_sub_ps(one, t);

  // midtone value fading over to linear part, without local contrast:
  const __m128 vmid = _mm_add_ps(g,
      _mm_add_ps(_mm_mul_ps(_mm_mul_ps(ssigma, two), _mm_mul_ps(mt, t)),
        _mm_mul_ps(t2, _mm_add_ps(ssigma, _mm_mul_ps(ssigma, shadhi)))));

  // c > 2*sigma?
  const __m128 linselect = _mm_cmpgt_ps(_mm_andnot_ps(sign_mask, c), twosig);
  const __m128 val = _mm_or_ps(_mm_and_ps(linselect, vlin), _mm_andnot_ps(linselect, vmid));

  // midtone local contrast
  // dt_fast_expf in sse:
  const __m128 arg = _mm_xor_ps(sign_mask, _mm_div_ps(_mm_mul_ps(c, c), s22));
  const __m128 k0 = _mm_add_ps(const0, _mm_mul_ps(arg, _mm_sub_ps(const1, const0)));
  const __m128 k = _mm_max_ps(k0, _mm_setzero_ps());
  const __m128i ki = _mm_cvtps_epi32(k);
  const __m128 gauss = _mm_load_ps((float*)&ki);
  const __m128 vcon = _mm_mul_ps(clarity, _mm_mul_ps(c, gauss));
  return _mm_add_ps(val, vcon);
}

// sse (4-wide)
void apply_curve_sse2(
    float *const out,
    const float *const in,
    const uint32_t w,
    const uint32_t h,
    const uint32_t padding,
    const float g,
    const float sigma,
    const float shadows,
    const float highlights,
    const float clarity)
{
  // TODO: do all this in avx2 8-wide (should be straightforward):
#ifdef _OPENMP
#pragma omp parallel for default(none) \
  dt_omp_firstprivate(clarity, g, h, highlights, in, out, padding, shadows, sigma, w) \
  schedule(static)
#endif
  for(uint32_t j=padding;j<h-padding;j++)
  {
    const float *in2 = in + j*w + padding;
    float *out2 = out + j*w + padding;
    // find the 16-byte aligned block in the middle (in2 shares out2's offset,
    // so it ends up 16-byte aligned as well):
    const float *const beg = (float *)(((size_t)(out2+3)) & ~(size_t)0xf);
    const float *const end = (float *)(((size_t)(out2+w-padding)) & ~(size_t)0xf);
    const float *const fin = out2+w-padding;
    const __m128 g4 = _mm_set1_ps(g);
    const __m128 sig4 = _mm_set1_ps(sigma);
    const __m128 shd4 = _mm_set1_ps(shadows);
    const __m128 hil4 = _mm_set1_ps(highlights);
    const __m128 clr4 = _mm_set1_ps(clarity);
    for(;out2<beg;out2++,in2++)
      *out2 = curve_scalar(*in2, g, sigma, shadows, highlights, clarity);
    for(;out2<end;out2+=4,in2+=4)
      _mm_stream_ps(out2, curve_vec4(_mm_load_ps(in2), g4, sig4, shd4, hil4, clr4));
    for(;out2<fin;out2++,in2++)
      *out2 = curve_scalar(*in2, g, sigma, shadows, highlights, clarity);
    out2 = out + j*w;
    for(int i=0;i<padding;i++)   out2[i] = out2[padding];
    for(int i=w-padding;i<w;i++) out2[i] = out2[w-padding-1];
  }
  pad_by_replication(out, w, h, padding);
}
#endif

// scalar version
void apply_curve(
    float *const out,
    const float *const in,
    const uint32_t w,
    const uint32_t h,
    const uint32_t padding,
    const float g,
    const float sigma,
    const float shadows,
    const float highlights,
    const float clarity)
{
#ifdef _OPENMP
#pragma omp parallel for default(none) \
  dt_omp_firstprivate(clarity, g, h, highlights, in, out, padding, sigma, shadows, w) \
  schedule(static)
#endif
  for(uint32_t j=padding;j<h-padding;j++)
  {
    const float *in2 = in + j*w + padding;
    float *out2 = out + j*w + padding;
    for(uint32_t i=padding;i<w-padding;i++)
      (*out2++) = curve_scalar(*(in2++), g, sigma, shadows, highlights, clarity);
    out2 = out + j*w;
    for(int i=0;i<padding;i++)   out2[i] = out2[padding];
    for(int i=w-padding;i<w;i++) out2[i] = out2[w-padding-1];
  }
  pad_by_replication(out, w, h, padding);
}

void local_laplacian_internal(
    const float *const input,  // input buffer in some Labx or yuvx format
    float *const out,          // output buffer with colour
    const int wd,              // width and
    const int ht,              // height of the input buffer
    const float sigma,         // user param: separate shadows/mid-tones/highlights
    const float shadows,       // user param: lift shadows
    const float highlights,    // user param: compress highlights
    const float clarity,       // user param: increase clarity/local contrast
    const int use_sse2,        // flag whether to use SSE version
    local_laplacian_boundary_t *b)
{
  if(wd <= 1 || ht <= 1) return;

  // don't divide by 2 more often than we can:
  const int num_levels = MIN(max_levels, 31-__builtin_clz(MIN(wd,ht)));
  int last_level = num_levels-1;
  if(b && b->mode == 2) // higher number here makes it less prone to aliasing and slower.
    last_level = num_levels > 4 ? 4 : num_levels-1;
  const int max_supp = 1<<last_level;
  int w, h;
  float *padded[max_levels] = {0};
  if(b && b->mode == 2)
    padded[0] = ll_pad_input(input, wd, ht, max_supp, &w, &h, b);
  else
    padded[0] = ll_pad_input(input, wd, ht, max_supp, &w, &h, 0);

  // allocate pyramid pointers for padded input
  for(int l=1;l<=last_level;l++)
    padded[l] = dt_alloc_align_float((size_t)dl(w,l) * dl(h,l));

  // allocate pyramid pointers for output
  float *output[max_levels] = {0};
  for(int l=0;l<=last_level;l++)
    output[l] = dt_alloc_align_float((size_t)dl(w,l) * dl(h,l));

  // create gauss pyramid of padded input, write coarse directly to output
#if defined(__SSE2__)
  if(use_sse2)
  {
    for(int l=1;l<last_level;l++)
      gauss_reduce_sse2(padded[l-1], padded[l], dl(w,l-1), dl(h,l-1));
    gauss_reduce_sse2(padded[last_level-1], output[last_level], dl(w,last_level-1), dl(h,last_level-1));
  }
  else
#endif
  {
    for(int l=1;l<last_level;l++)
      gauss_reduce(padded[l-1], padded[l], dl(w,l-1), dl(h,l-1));
    gauss_reduce(padded[last_level-1], output[last_level], dl(w,last_level-1), dl(h,last_level-1));
  }

  // evenly sample brightness [0,1]:
  float gamma[num_gamma] = {0.0f};
  for(int k=0;k<num_gamma;k++) gamma[k] = (k+.5f)/(float)num_gamma;
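  // with num_gamma = 6 this yields gamma = { 1/12, 3/12, 5/12, 7/12, 9/12, 11/12 },
  // i.e. the remapping curve is only ever evaluated at these six reference
  // brightnesses; everything in between is interpolated further down.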
  // for(int k=0;k<num_gamma;k++) gamma[k] = k/(num_gamma-1.0f);

  // allocate memory for intermediate laplacian pyramids
  float *buf[num_gamma][max_levels] = {{0}};
  for(int k=0;k<num_gamma;k++) for(int l=0;l<=last_level;l++)
    buf[k][l] = dt_alloc_align_float((size_t)dl(w,l)*dl(h,l));

  // the paper says remapping only level 3 not 0 does the trick, too
  // (but i really like the additional octave of sharpness we get,
  // willing to pay the cost).
  for(int k=0;k<num_gamma;k++)
  { // process images
#if defined(__SSE2__)
    if(use_sse2)
      apply_curve_sse2(buf[k][0], padded[0], w, h, max_supp, gamma[k], sigma, shadows, highlights, clarity);
    else // brackets in next line needed for silly gcc warning:
#endif
    {apply_curve(buf[k][0], padded[0], w, h, max_supp, gamma[k], sigma, shadows, highlights, clarity);}

    // create gaussian pyramids
    for(int l=1;l<=last_level;l++)
#if defined(__SSE2__)
      if(use_sse2)
        gauss_reduce_sse2(buf[k][l-1], buf[k][l], dl(w,l-1), dl(h,l-1));
      else
#endif
        gauss_reduce(buf[k][l-1], buf[k][l], dl(w,l-1), dl(h,l-1));
  }

  // resample output[last_level] from preview
  // requires to transform from padded/downsampled to full image and then
  // to padded/downsampled in preview
  if(b && b->mode == 2)
  {
    const float isize = powf(2.0f, last_level) / b->roi->scale; // pixel size of coarsest level in image space
    const float psize = isize / b->buf->width * b->wd; // pixel footprint rescaled to preview buffer
    const float pl = log2f(psize); // mip level in preview buffer
    const int pl0 = CLAMP((int)pl, 0, b->num_levels-1), pl1 = CLAMP((int)(pl+1), 0, b->num_levels-1);
    const float weight = CLAMP(pl-pl0, 0, 1); // weight between mip levels
    const float mul0 = 1.0/powf(2.0f, pl0);
    const float mul1 = 1.0/powf(2.0f, pl1);
    const float mul = powf(2.0f, last_level);
    const int pw = dl(w,last_level), ph = dl(h,last_level);
    const int pw0 = dl(b->pwd, pl0), ph0 = dl(b->pht, pl0);
    const int pw1 = dl(b->pwd, pl1), ph1 = dl(b->pht, pl1);
    debug_dump_PFM("/tmp/coarse.pfm", b->output[pl0], pw0, ph0);
    debug_dump_PFM("/tmp/oldcoarse.pfm", output[last_level], pw, ph);
#ifdef _OPENMP
#pragma omp parallel for schedule(static) collapse(2) default(shared)
#endif
    for(int j=0;j<ph;j++) for(int i=0;i<pw;i++)
    {
      // image coordinates in full buffer
      float ix = ((i*mul - max_supp) + b->roi->x)/b->roi->scale;
      float iy = ((j*mul - max_supp) + b->roi->y)/b->roi->scale;
      // coordinates in padded preview buffer
      float px = CLAMP(ix / (float)b->buf->width  * b->wd + (b->pwd-b->wd)/2.0f, 0, b->pwd);
      float py = CLAMP(iy / (float)b->buf->height * b->ht + (b->pht-b->ht)/2.0f, 0, b->pht);
      // trilinear lookup:
      int px0 = CLAMP(px*mul0, 0, pw0-1);
      int py0 = CLAMP(py*mul0, 0, ph0-1);
      int px1 = CLAMP(px*mul1, 0, pw1-1);
      int py1 = CLAMP(py*mul1, 0, ph1-1);
#if 1
      float f0x = CLAMP(px*mul0 - px0, 0.0f, 1.0f);
      float f0y = CLAMP(py*mul0 - py0, 0.0f, 1.0f);
      float f1x = CLAMP(px*mul1 - px1, 0.0f, 1.0f);
      float f1y = CLAMP(py*mul1 - py1, 0.0f, 1.0f);
      float c0 =
        (1.0f-f0x)*(1.0f-f0y)*b->output[pl0][CLAMP(py0  , 0, ph0-1)*pw0 + CLAMP(px0  , 0, pw0-1)]+
        (     f0x)*(1.0f-f0y)*b->output[pl0][CLAMP(py0  , 0, ph0-1)*pw0 + CLAMP(px0+1, 0, pw0-1)]+
        (1.0f-f0x)*(     f0y)*b->output[pl0][CLAMP(py0+1, 0, ph0-1)*pw0 + CLAMP(px0  , 0, pw0-1)]+
        (     f0x)*(     f0y)*b->output[pl0][CLAMP(py0+1, 0, ph0-1)*pw0 + CLAMP(px0+1, 0, pw0-1)];
      float c1 =
        (1.0f-f1x)*(1.0f-f1y)*b->output[pl1][CLAMP(py1  , 0, ph1-1)*pw1 + CLAMP(px1  , 0, pw1-1)]+
        (     f1x)*(1.0f-f1y)*b->output[pl1][CLAMP(py1  , 0, ph1-1)*pw1 + CLAMP(px1+1, 0, pw1-1)]+
        (1.0f-f1x)*(     f1y)*b->output[pl1][CLAMP(py1+1, 0, ph1-1)*pw1 + CLAMP(px1  , 0, pw1-1)]+
        (     f1x)*(     f1y)*b->output[pl1][CLAMP(py1+1, 0, ph1-1)*pw1 + CLAMP(px1+1, 0, pw1-1)];
#else
      float c0 = b->output[pl0][py0*pw0 + px0];
      float c1 = b->output[pl1][py1*pw1 + px1];
#endif
      output[last_level][j*pw+i] = weight * c1 + (1.0f-weight) * c0;
    }
    debug_dump_PFM("/tmp/newcoarse.pfm", output[last_level], pw, ph);
  }

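  // each output coefficient below interpolates between the two remapped
  // pyramids whose reference brightnesses gamma[lo], gamma[hi] bracket the
  // local gaussian value v -- the piecewise linear approximation (num_gamma
  // segments) of the full local laplacian filter.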
  // assemble output pyramid coarse to fine
  for(int l=last_level-1;l >= 0; l--)
  {
    const int pw = dl(w,l), ph = dl(h,l);

    gauss_expand(output[l+1], output[l], pw, ph);
    // go through all coefficients in the upsampled gauss buffer:
#ifdef _OPENMP
#pragma omp parallel for default(none) \
    dt_omp_firstprivate(ph, pw) \
    shared(w,h,buf,output,l,gamma,padded) \
    schedule(static) \
    collapse(2)
#endif
    for(int j=0;j<ph;j++) for(int i=0;i<pw;i++)
    {
      const float v = padded[l][j*pw+i];
      int hi = 1;
      for(;hi<num_gamma-1 && gamma[hi] <= v;hi++);
      int lo = hi-1;
      const float a = CLAMPS((v - gamma[lo])/(gamma[hi]-gamma[lo]), 0.0f, 1.0f);
      const float l0 = ll_laplacian(buf[lo][l+1], buf[lo][l], i, j, pw, ph);
      const float l1 = ll_laplacian(buf[hi][l+1], buf[hi][l], i, j, pw, ph);
      output[l][j*pw+i] += l0 * (1.0f-a) + l1 * a;
      // we could do this to save on memory (no need for finest buf[][]).
      // unfortunately it results in a quite noticeable loss of sharpness, i think
      // the extra level is worth it.
      // else if(l == 0) // use finest scale from input to not amplify noise (and use less memory)
      //   output[l][j*pw+i] += ll_laplacian(padded[l+1], padded[l], i, j, pw, ph);
    }
  }
#ifdef _OPENMP
#pragma omp parallel for default(none) \
  dt_omp_firstprivate(ht, input, max_supp, out, wd) \
  shared(w,output,buf) \
  schedule(static) \
  collapse(2)
#endif
  for(int j=0;j<ht;j++) for(int i=0;i<wd;i++)
  {
    out[4*(j*wd+i)+0] = 100.0f * output[0][(j+max_supp)*w+max_supp+i]; // [0,1] -> L
    out[4*(j*wd+i)+1] = input[4*(j*wd+i)+1]; // copy original colour channels
    out[4*(j*wd+i)+2] = input[4*(j*wd+i)+2];
  }
  if(b && b->mode == 1)
  { // output the buffers for later re-use
    b->pad0 = padded[0];
    b->wd = wd;
    b->ht = ht;
    b->pwd = w;
    b->pht = h;
    b->num_levels = num_levels;
    for(int l=0;l<num_levels;l++) b->output[l] = output[l];
  }
  // free all buffers except the ones passed out for preview rendering
  for(int l=0;l<max_levels;l++)
  {
    if(!b || b->mode != 1 || l) dt_free_align(padded[l]);
    if(!b || b->mode != 1)      dt_free_align(output[l]);
    for(int k=0; k<num_gamma;k++) dt_free_align(buf[k][l]);
  }
}


size_t local_laplacian_memory_use(const int width,  // width of input image
                                  const int height) // height of input image
{
  const int num_levels = MIN(max_levels, 31-__builtin_clz(MIN(width,height)));
  const int max_supp = 1<<(num_levels-1);
  const int paddwd = width  + 2*max_supp;
  const int paddht = height + 2*max_supp;

  size_t memory_use = 0;

  for(int l=0;l<num_levels;l++)
    memory_use += sizeof(float) * (2 + num_gamma) * dl(paddwd, l) * dl(paddht, l);

  return memory_use;
}
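// rough estimate: every level costs (2 + num_gamma) = 8 float planes (padded
// input, output, and one per gamma value) and the levels shrink by about 4x
// each, so the total is close to 8 * 4/3 * paddwd * paddht floats -- on the
// order of 2 GB for a 24 Mpix input once the padding is included.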

size_t local_laplacian_singlebuffer_size(const int width,  // width of input image
                                         const int height) // height of input image
{
  const int num_levels = MIN(max_levels, 31-__builtin_clz(MIN(width,height)));
  const int max_supp = 1<<(num_levels-1);
  const int paddwd = width  + 2*max_supp;
  const int paddht = height + 2*max_supp;

  return sizeof(float) * dl(paddwd, 0) * dl(paddht, 0);
}