1 /*
2 * This file is part of RawTherapee.
3 *
4 * Copyright (c) 2017-2018 Ingo Weyrich <heckflosse67@gmx.de>
5 *
6 * RawTherapee is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
10 *
11 * RawTherapee is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with RawTherapee. If not, see <https://www.gnu.org/licenses/>.
18 */
19
20 #include <algorithm>
21 #include <cassert>
22 #include <cmath>
23 #include <cstddef>
24 #include <cstdint>
25 #include <vector>
26 #ifdef _OPENMP
27 #include <omp.h>
28 #endif
29
30 #include "gauss.h"
31 #include "opthelper.h"
32 #include "rt_algo.h"
33 #include "rt_math.h"
34 #include "sleef.h"
35
36 namespace {
calcBlendFactor(float val,float threshold)37 float calcBlendFactor(float val, float threshold) {
38 // sigmoid function
39 // result is in ]0;1] range
40 // inflexion point is at (x, y) (threshold, 0.5)
41 return 1.f / (1.f + xexpf(16.f - 16.f * val / threshold));
42 }
43
44 #ifdef __SSE2__
calcBlendFactor(vfloat valv,vfloat thresholdv)45 vfloat calcBlendFactor(vfloat valv, vfloat thresholdv) {
46 // sigmoid function
47 // result is in ]0;1] range
48 // inflexion point is at (x, y) (threshold, 0.5)
49 const vfloat onev = F2V(1.f);
50 const vfloat c16v = F2V(16.f);
51 return onev / (onev + xexpf(c16v - c16v * valv / thresholdv));
52 }
53 #endif
54
tileAverage(const float * const * data,size_t tileY,size_t tileX,size_t tilesize)55 float tileAverage(const float * const *data, size_t tileY, size_t tileX, size_t tilesize) {
56
57 float avg = 0.f;
58 #ifdef __SSE2__
59 vfloat avgv = ZEROV;
60 #endif
61 for (std::size_t y = tileY; y < tileY + tilesize; ++y) {
62 std::size_t x = tileX;
63 #ifdef __SSE2__
64 for (; x < tileX + tilesize - 3; x += 4) {
65 avgv += LVFU(data[y][x]);
66 }
67 #endif
68 for (; x < tileX + tilesize; ++x) {
69 avg += data[y][x];
70 }
71 }
72 #ifdef __SSE2__
73 avg += vhadd(avgv);
74 #endif
75 return avg / rtengine::SQR(tilesize);
76 }
77
tileVariance(const float * const * data,size_t tileY,size_t tileX,size_t tilesize,float avg)78 float tileVariance(const float * const *data, size_t tileY, size_t tileX, size_t tilesize, float avg) {
79
80 float var = 0.f;
81 #ifdef __SSE2__
82 vfloat varv = ZEROV;
83 const vfloat avgv = F2V(avg);
84 #endif
85 for (std::size_t y = tileY; y < tileY + tilesize; ++y) {
86 std::size_t x = tileX;
87 #ifdef __SSE2__
88 for (; x < tileX + tilesize - 3; x += 4) {
89 varv += SQRV(LVFU(data[y][x]) - avgv);
90 }
91 #endif
92 for (; x < tileX + tilesize; ++x) {
93 var += rtengine::SQR(data[y][x] - avg);
94 }
95 }
96 #ifdef __SSE2__
97 var += vhadd(varv);
98 #endif
99 return var / (rtengine::SQR(tilesize) * avg);
100 }
101
calcContrastThreshold(const float * const * luminance,int tileY,int tileX,int tilesize)102 float calcContrastThreshold(const float* const * luminance, int tileY, int tileX, int tilesize) {
103
104 constexpr float scale = 0.0625f / 327.68f;
105 std::vector<std::vector<float>> blend(tilesize - 4, std::vector<float>(tilesize - 4));
106
107 #ifdef __SSE2__
108 const vfloat scalev = F2V(scale);
109 #endif
110
111 for(int j = tileY + 2; j < tileY + tilesize - 2; ++j) {
112 int i = tileX + 2;
113 #ifdef __SSE2__
114 for(; i < tileX + tilesize - 5; i += 4) {
115 vfloat contrastv = vsqrtf(SQRV(LVFU(luminance[j][i+1]) - LVFU(luminance[j][i-1])) + SQRV(LVFU(luminance[j+1][i]) - LVFU(luminance[j-1][i])) +
116 SQRV(LVFU(luminance[j][i+2]) - LVFU(luminance[j][i-2])) + SQRV(LVFU(luminance[j+2][i]) - LVFU(luminance[j-2][i]))) * scalev;
117 STVFU(blend[j - tileY - 2][i - tileX - 2], contrastv);
118 }
119 #endif
120 for(; i < tileX + tilesize - 2; ++i) {
121
122 float contrast = sqrtf(rtengine::SQR(luminance[j][i+1] - luminance[j][i-1]) + rtengine::SQR(luminance[j+1][i] - luminance[j-1][i]) +
123 rtengine::SQR(luminance[j][i+2] - luminance[j][i-2]) + rtengine::SQR(luminance[j+2][i] - luminance[j-2][i])) * scale;
124
125 blend[j - tileY - 2][i - tileX - 2] = contrast;
126 }
127 }
128
129 const float limit = rtengine::SQR(tilesize - 4) / 100.f;
130
131 int c;
132 for (c = 1; c < 100; ++c) {
133 const float contrastThreshold = c / 100.f;
134 float sum = 0.f;
135 #ifdef __SSE2__
136 const vfloat contrastThresholdv = F2V(contrastThreshold);
137 vfloat sumv = ZEROV;
138 #endif
139
140 for(int j = 0; j < tilesize - 4; ++j) {
141 int i = 0;
142 #ifdef __SSE2__
143 for(; i < tilesize - 7; i += 4) {
144 sumv += calcBlendFactor(LVFU(blend[j][i]), contrastThresholdv);
145 }
146 #endif
147 for(; i < tilesize - 4; ++i) {
148 sum += calcBlendFactor(blend[j][i], contrastThreshold);
149 }
150 }
151 #ifdef __SSE2__
152 sum += vhadd(sumv);
153 #endif
154 if (sum <= limit) {
155 break;
156 }
157 }
158
159 return (c + 1) / 100.f;
160 }
161 }
162
163 namespace rtengine
164 {
165
findMinMaxPercentile(const float * data,size_t size,float minPrct,float & minOut,float maxPrct,float & maxOut,bool multithread)166 void findMinMaxPercentile(const float* data, size_t size, float minPrct, float& minOut, float maxPrct, float& maxOut, bool multithread)
167 {
168 // Copyright (c) 2017 Ingo Weyrich <heckflosse67@gmx.de>
169 // We need to find the (minPrct*size) smallest value and the (maxPrct*size) smallest value in data.
170 // We use a histogram based search for speed and to reduce memory usage.
171 // Memory usage of this method is histoSize * sizeof(uint32_t) * (t + 1) byte,
172 // where t is the number of threads and histoSize is in [1;65536].
173 // Processing time is O(n) where n is size of the input array.
174 // It scales well with multiple threads if the size of the input array is large.
175 // The current implementation is not guaranteed to work correctly if size > 2^32 (4294967296).
176
177 assert(minPrct <= maxPrct);
178
179 if (size == 0) {
180 return;
181 }
182
183 size_t numThreads = 1;
184 #ifdef _OPENMP
185 // Because we have an overhead in the critical region of the main loop for each thread
186 // we make a rough calculation to reduce the number of threads for small data size.
187 // This also works fine for the minmax loop.
188 if (multithread) {
189 const size_t maxThreads = omp_get_max_threads();
190 while (size > numThreads * numThreads * 16384 && numThreads < maxThreads) {
191 ++numThreads;
192 }
193 }
194 #endif
195
196 // We need min and max value of data to calculate the scale factor for the histogram
197 float minVal = data[0];
198 float maxVal = data[0];
199 #ifdef _OPENMP
200 #pragma omp parallel for reduction(min:minVal) reduction(max:maxVal) num_threads(numThreads)
201 #endif
202 for (size_t i = 1; i < size; ++i) {
203 minVal = std::min(minVal, data[i]);
204 maxVal = std::max(maxVal, data[i]);
205 }
206
207 if (std::fabs(maxVal - minVal) == 0.f) { // fast exit, also avoids division by zero in calculation of scale factor
208 minOut = maxOut = minVal;
209 return;
210 }
211
212 // Caution: Currently this works correctly only for histoSize in range[1;65536].
213 // For small data size (i.e. thumbnails) we reduce the size of the histogram to the size of data.
214 const unsigned int histoSize = std::min<size_t>(65536, size);
215
216 // calculate scale factor to use full range of histogram
217 const float scale = (histoSize - 1) / (maxVal - minVal);
218
219 // We need one main histogram
220 std::vector<uint32_t> histo(histoSize, 0);
221
222 if (numThreads == 1) {
223 // just one thread => use main histogram
224 for (size_t i = 0; i < size; ++i) {
225 // we have to subtract minVal and multiply with scale to get the data in [0;histosize] range
226 histo[static_cast<uint16_t>(scale * (data[i] - minVal))]++;
227 }
228 } else {
229 #ifdef _OPENMP
230 #pragma omp parallel num_threads(numThreads)
231 #endif
232 {
233 // We need one histogram per thread
234 std::vector<uint32_t> histothr(histoSize, 0);
235
236 #ifdef _OPENMP
237 #pragma omp for nowait
238 #endif
239 for (size_t i = 0; i < size; ++i) {
240 // we have to subtract minVal and multiply with scale to get the data in [0;histosize] range
241 histothr[static_cast<uint16_t>(scale * (data[i] - minVal))]++;
242 }
243
244 #ifdef _OPENMP
245 #pragma omp critical
246 #endif
247 {
248 // add per thread histogram to main histogram
249 #ifdef _OPENMP
250 #pragma omp simd
251 #endif
252
253 for (size_t i = 0; i < histoSize; ++i) {
254 histo[i] += histothr[i];
255 }
256 }
257 }
258 }
259
260 size_t k = 0;
261 size_t count = 0;
262
263 // find (minPrct*size) smallest value
264 const float threshmin = minPrct * size;
265 while (count < threshmin) {
266 count += histo[k++];
267 }
268
269 if (k > 0) { // interpolate
270 const size_t count_ = count - histo[k - 1];
271 const float c0 = count - threshmin;
272 const float c1 = threshmin - count_;
273 minOut = (c1 * k + c0 * (k - 1)) / (c0 + c1);
274 } else {
275 minOut = k;
276 }
277 // go back to original range
278 minOut /= scale;
279 minOut += minVal;
280 minOut = rtengine::LIM(minOut, minVal, maxVal);
281
282 // find (maxPrct*size) smallest value
283 const float threshmax = maxPrct * size;
284 while (count < threshmax) {
285 count += histo[k++];
286 }
287
288 if (k > 0) { // interpolate
289 const size_t count_ = count - histo[k - 1];
290 const float c0 = count - threshmax;
291 const float c1 = threshmax - count_;
292 maxOut = (c1 * k + c0 * (k - 1)) / (c0 + c1);
293 } else {
294 maxOut = k;
295 }
296 // go back to original range
297 maxOut /= scale;
298 maxOut += minVal;
299 maxOut = rtengine::LIM(maxOut, minVal, maxVal);
300 }
301
buildBlendMask(const float * const * luminance,float ** blend,int W,int H,float & contrastThreshold,bool autoContrast,float ** clipMask)302 void buildBlendMask(const float* const * luminance, float **blend, int W, int H, float &contrastThreshold, bool autoContrast, float ** clipMask) {
303
304 if (autoContrast) {
305 constexpr float minLuminance = 2000.f;
306 constexpr float maxLuminance = 20000.f;
307 constexpr float minTileVariance = 0.5f;
308 for (int pass = 0; pass < 2; ++pass) {
309 const int tilesize = 80 / (pass + 1);
310 const int skip = pass == 0 ? tilesize : tilesize / 4;
311 const int numTilesW = W / skip - 3 * pass;
312 const int numTilesH = H / skip - 3 * pass;
313 std::vector<std::vector<float>> variances(numTilesH, std::vector<float>(numTilesW));
314
315 #ifdef _OPENMP
316 #pragma omp parallel for schedule(dynamic)
317 #endif
318 for (int i = 0; i < numTilesH; ++i) {
319 const int tileY = i * skip;
320 for (int j = 0; j < numTilesW; ++j) {
321 const int tileX = j * skip;
322 const float avg = tileAverage(luminance, tileY, tileX, tilesize);
323 if (avg < minLuminance || avg > maxLuminance) {
324 // too dark or too bright => skip the tile
325 variances[i][j] = RT_INFINITY_F;
326 continue;
327 } else {
328 variances[i][j] = tileVariance(luminance, tileY, tileX, tilesize, avg);
329 // exclude tiles with a variance less than minTileVariance
330 variances[i][j] = variances[i][j] < minTileVariance ? RT_INFINITY_F : variances[i][j];
331 }
332 }
333 }
334
335 float minvar = RT_INFINITY_F;
336 int minI = 0, minJ = 0;
337 for (int i = 0; i < numTilesH; ++i) {
338 for (int j = 0; j < numTilesW; ++j) {
339 if (variances[i][j] < minvar) {
340 minvar = variances[i][j];
341 minI = i;
342 minJ = j;
343 }
344 }
345 }
346
347 if (minvar <= 1.f || pass == 1) {
348 const int minY = skip * minI;
349 const int minX = skip * minJ;
350 if (pass == 0) {
351 // a variance <= 1 means we already found a flat region and can skip second pass
352 contrastThreshold = calcContrastThreshold(luminance, minY, minX, tilesize);
353 break;
354 } else {
355 // in second pass we allow a variance of 8
356 // we additionally scan the tiles +-skip pixels around the best tile from pass 2
357 // Means we scan (2 * skip + 1)^2 tiles in this step to get a better hit rate
358 // fortunately the scan is quite fast, so we use only one core and don't parallelize
359 const int topLeftYStart = std::max(minY - skip, 0);
360 const int topLeftXStart = std::max(minX - skip, 0);
361 const int topLeftYEnd = std::min(minY + skip, H - tilesize);
362 const int topLeftXEnd = std::min(minX + skip, W - tilesize);
363 const int numTilesH = topLeftYEnd - topLeftYStart + 1;
364 const int numTilesW = topLeftXEnd - topLeftXStart + 1;
365
366 std::vector<std::vector<float>> variances(numTilesH, std::vector<float>(numTilesW));
367 for (int i = 0; i < numTilesH; ++i) {
368 const int tileY = topLeftYStart + i;
369 for (int j = 0; j < numTilesW; ++j) {
370 const int tileX = topLeftXStart + j;
371 const float avg = tileAverage(luminance, tileY, tileX, tilesize);
372
373 if (avg < minLuminance || avg > maxLuminance) {
374 // too dark or too bright => skip the tile
375 variances[i][j] = RT_INFINITY_F;
376 continue;
377 } else {
378 variances[i][j] = tileVariance(luminance, tileY, tileX, tilesize, avg);
379 // exclude tiles with a variance less than minTileVariance
380 variances[i][j] = variances[i][j] < minTileVariance ? RT_INFINITY_F : variances[i][j];
381 }
382 }
383 }
384
385 float minvar = RT_INFINITY_F;
386 int minI = 0, minJ = 0;
387 for (int i = 0; i < numTilesH; ++i) {
388 for (int j = 0; j < numTilesW; ++j) {
389 if (variances[i][j] < minvar) {
390 minvar = variances[i][j];
391 minI = i;
392 minJ = j;
393 }
394 }
395 }
396
397 contrastThreshold = minvar <= 8.f ? calcContrastThreshold(luminance, topLeftYStart + minI, topLeftXStart + minJ, tilesize) : 0.f;
398 }
399 }
400 }
401 }
402
403 if(contrastThreshold == 0.f) {
404 for(int j = 0; j < H; ++j) {
405 for(int i = 0; i < W; ++i) {
406 blend[j][i] = 1.f;
407 }
408 }
409 } else {
410 constexpr float scale = 0.0625f / 327.68f;
411 #ifdef _OPENMP
412 #pragma omp parallel
413 #endif
414 {
415 #ifdef __SSE2__
416 const vfloat contrastThresholdv = F2V(contrastThreshold);
417 const vfloat scalev = F2V(scale);
418 #endif
419 #ifdef _OPENMP
420 #pragma omp for schedule(dynamic,16)
421 #endif
422
423 for(int j = 2; j < H - 2; ++j) {
424 int i = 2;
425 #ifdef __SSE2__
426 if (clipMask) {
427 for(; i < W - 5; i += 4) {
428 vfloat contrastv = vsqrtf(SQRV(LVFU(luminance[j][i+1]) - LVFU(luminance[j][i-1])) + SQRV(LVFU(luminance[j+1][i]) - LVFU(luminance[j-1][i])) +
429 SQRV(LVFU(luminance[j][i+2]) - LVFU(luminance[j][i-2])) + SQRV(LVFU(luminance[j+2][i]) - LVFU(luminance[j-2][i]))) * scalev;
430
431 STVFU(blend[j][i], LVFU(clipMask[j][i]) * calcBlendFactor(contrastv, contrastThresholdv));
432 }
433 } else {
434 for(; i < W - 5; i += 4) {
435 vfloat contrastv = vsqrtf(SQRV(LVFU(luminance[j][i+1]) - LVFU(luminance[j][i-1])) + SQRV(LVFU(luminance[j+1][i]) - LVFU(luminance[j-1][i])) +
436 SQRV(LVFU(luminance[j][i+2]) - LVFU(luminance[j][i-2])) + SQRV(LVFU(luminance[j+2][i]) - LVFU(luminance[j-2][i]))) * scalev;
437
438 STVFU(blend[j][i], calcBlendFactor(contrastv, contrastThresholdv));
439 }
440 }
441 #endif
442 for(; i < W - 2; ++i) {
443
444 float contrast = sqrtf(rtengine::SQR(luminance[j][i+1] - luminance[j][i-1]) + rtengine::SQR(luminance[j+1][i] - luminance[j-1][i]) +
445 rtengine::SQR(luminance[j][i+2] - luminance[j][i-2]) + rtengine::SQR(luminance[j+2][i] - luminance[j-2][i])) * scale;
446
447 blend[j][i] = (clipMask ? clipMask[j][i] : 1.f) * calcBlendFactor(contrast, contrastThreshold);
448 }
449 }
450
451 #ifdef _OPENMP
452 #pragma omp single
453 #endif
454 {
455 // upper border
456 for(int j = 0; j < 2; ++j) {
457 for(int i = 2; i < W - 2; ++i) {
458 blend[j][i] = blend[2][i];
459 }
460 }
461 // lower border
462 for(int j = H - 2; j < H; ++j) {
463 for(int i = 2; i < W - 2; ++i) {
464 blend[j][i] = blend[H-3][i];
465 }
466 }
467 for(int j = 0; j < H; ++j) {
468 // left border
469 blend[j][0] = blend[j][1] = blend[j][2];
470 // right border
471 blend[j][W - 2] = blend[j][W - 1] = blend[j][W - 3];
472 }
473 }
474
475 #ifdef __SSE2__
476 // flush denormals to zero for gaussian blur to avoid performance penalty if there are a lot of zero values in the mask
477 const auto oldMode = _MM_GET_FLUSH_ZERO_MODE();
478 _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
479 #endif
480
481 // blur blend mask to smooth transitions
482 gaussianBlur(blend, blend, W, H, 2.0);
483
484 #ifdef __SSE2__
485 _MM_SET_FLUSH_ZERO_MODE(oldMode);
486 #endif
487 }
488 }
489 }
490
491 }
492