1 /*
2  *  This file is part of RawTherapee.
3  *
4  *  Copyright (c) 2017-2018 Ingo Weyrich <heckflosse67@gmx.de>
5  *
6  *  RawTherapee is free software: you can redistribute it and/or modify
7  *  it under the terms of the GNU General Public License as published by
8  *  the Free Software Foundation, either version 3 of the License, or
9  *  (at your option) any later version.
10  *
11  *  RawTherapee is distributed in the hope that it will be useful,
12  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  *  GNU General Public License for more details.
15  *
16  *  You should have received a copy of the GNU General Public License
17  *  along with RawTherapee.  If not, see <https://www.gnu.org/licenses/>.
18  */
19 
20 #include <algorithm>
21 #include <cassert>
22 #include <cmath>
23 #include <cstddef>
24 #include <cstdint>
25 #include <vector>
26 #ifdef _OPENMP
27 #include <omp.h>
28 #endif
29 
30 #include "gauss.h"
31 #include "opthelper.h"
32 #include "rt_algo.h"
33 #include "rt_math.h"
34 #include "core/sleef.h"
35 
36 namespace {
calcBlendFactor(float val,float threshold)37 float calcBlendFactor(float val, float threshold) {
38     // sigmoid function
39     // result is in ]0;1] range
40     // inflexion point is at (x, y) (threshold, 0.5)
41     return 1.f / (1.f + xexpf(16.f - 16.f * val / threshold));
42 }
43 
44 #ifdef __SSE2__
calcBlendFactor(vfloat valv,vfloat thresholdv)45 vfloat calcBlendFactor(vfloat valv, vfloat thresholdv) {
46     // sigmoid function
47     // result is in ]0;1] range
48     // inflexion point is at (x, y) (threshold, 0.5)
49     const vfloat onev = F2V(1.f);
50     const vfloat c16v = F2V(16.f);
51     return onev / (onev + xexpf(c16v - c16v * valv / thresholdv));
52 }
53 #endif
54 
tileAverage(const float * const * data,size_t tileY,size_t tileX,size_t tilesize)55 float tileAverage(const float * const *data, size_t tileY, size_t tileX, size_t tilesize) {
56 
57     float avg = 0.f;
58 #ifdef __SSE2__
59     vfloat avgv = ZEROV;
60 #endif
61     for (std::size_t y = tileY; y < tileY + tilesize; ++y) {
62         std::size_t x = tileX;
63 #ifdef __SSE2__
64         for (; x < tileX + tilesize - 3; x += 4) {
65             avgv += LVFU(data[y][x]);
66         }
67 #endif
68         for (; x < tileX + tilesize; ++x) {
69             avg += data[y][x];
70         }
71     }
72 #ifdef __SSE2__
73     avg += vhadd(avgv);
74 #endif
75     return avg / rtengine::SQR(tilesize);
76 }
77 
tileVariance(const float * const * data,size_t tileY,size_t tileX,size_t tilesize,float avg)78 float tileVariance(const float * const *data, size_t tileY, size_t tileX, size_t tilesize, float avg) {
79 
80     float var = 0.f;
81 #ifdef __SSE2__
82     vfloat varv = ZEROV;
83     const vfloat avgv = F2V(avg);
84 #endif
85     for (std::size_t y = tileY; y < tileY + tilesize; ++y) {
86         std::size_t x = tileX;
87 #ifdef __SSE2__
88         for (; x < tileX + tilesize - 3; x += 4) {
89             varv += SQRV(LVFU(data[y][x]) - avgv);
90         }
91 #endif
92         for (; x < tileX + tilesize; ++x) {
93             var += rtengine::SQR(data[y][x] - avg);
94         }
95     }
96 #ifdef __SSE2__
97     var += vhadd(varv);
98 #endif
99     return var / (rtengine::SQR(tilesize) * avg);
100 }
101 
calcContrastThreshold(const float * const * luminance,int tileY,int tileX,int tilesize)102 float calcContrastThreshold(const float* const * luminance, int tileY, int tileX, int tilesize) {
103 
104     constexpr float scale = 0.0625f / 327.68f;
105     std::vector<std::vector<float>> blend(tilesize - 4, std::vector<float>(tilesize - 4));
106 
107 #ifdef __SSE2__
108     const vfloat scalev = F2V(scale);
109 #endif
110 
111     for(int j = tileY + 2; j < tileY + tilesize - 2; ++j) {
112         int i = tileX + 2;
113 #ifdef __SSE2__
114         for(; i < tileX + tilesize - 5; i += 4) {
115             vfloat contrastv = vsqrtf(SQRV(LVFU(luminance[j][i+1]) - LVFU(luminance[j][i-1])) + SQRV(LVFU(luminance[j+1][i]) - LVFU(luminance[j-1][i])) +
116                                       SQRV(LVFU(luminance[j][i+2]) - LVFU(luminance[j][i-2])) + SQRV(LVFU(luminance[j+2][i]) - LVFU(luminance[j-2][i]))) * scalev;
117             STVFU(blend[j - tileY - 2][i - tileX - 2], contrastv);
118         }
119 #endif
120         for(; i < tileX + tilesize - 2; ++i) {
121 
122             float contrast = sqrtf(rtengine::SQR(luminance[j][i+1] - luminance[j][i-1]) + rtengine::SQR(luminance[j+1][i] - luminance[j-1][i]) +
123                                    rtengine::SQR(luminance[j][i+2] - luminance[j][i-2]) + rtengine::SQR(luminance[j+2][i] - luminance[j-2][i])) * scale;
124 
125             blend[j - tileY - 2][i - tileX - 2] = contrast;
126         }
127     }
128 
129     const float limit = rtengine::SQR(tilesize - 4) / 100.f;
130 
131     int c;
132     for (c = 1; c < 100; ++c) {
133         const float contrastThreshold = c / 100.f;
134         float sum = 0.f;
135 #ifdef __SSE2__
136         const vfloat contrastThresholdv = F2V(contrastThreshold);
137         vfloat sumv = ZEROV;
138 #endif
139 
140         for(int j = 0; j < tilesize - 4; ++j) {
141             int i = 0;
142 #ifdef __SSE2__
143             for(; i < tilesize - 7; i += 4) {
144                 sumv += calcBlendFactor(LVFU(blend[j][i]), contrastThresholdv);
145             }
146 #endif
147             for(; i < tilesize - 4; ++i) {
148                 sum += calcBlendFactor(blend[j][i], contrastThreshold);
149             }
150         }
151 #ifdef __SSE2__
152         sum += vhadd(sumv);
153 #endif
154         if (sum <= limit) {
155             break;
156         }
157     }
158 
159     return (c + 1) / 100.f;
160 }
161 }
162 
163 namespace rtengine
164 {
165 
166 extern "C" void findMinMaxPercentile(const float* data, size_t size, float minPrct, float* minOut, float maxPrct, float* maxOut, int multithread);
167 
findMinMaxPercentile(const float * data,size_t size,float minPrct,float * minOut,float maxPrct,float * maxOut,int multithread)168 void findMinMaxPercentile(const float* data, size_t size, float minPrct, float* minOut, float maxPrct, float* maxOut, int multithread)
169 {
170     // Copyright (c) 2017 Ingo Weyrich <heckflosse67@gmx.de>
171     // We need to find the (minPrct*size) smallest value and the (maxPrct*size) smallest value in data.
172     // We use a histogram based search for speed and to reduce memory usage.
173     // Memory usage of this method is histoSize * sizeof(uint32_t) * (t + 1) byte,
174     // where t is the number of threads and histoSize is in [1;65536].
175     // Processing time is O(n) where n is size of the input array.
176     // It scales well with multiple threads if the size of the input array is large.
177     // The current implementation is not guaranteed to work correctly if size > 2^32 (4294967296).
178 
179     assert(minPrct <= maxPrct);
180 
181     if (size == 0) {
182         return;
183     }
184 
185     size_t numThreads = 1;
186 #ifdef _OPENMP
187     // Because we have an overhead in the critical region of the main loop for each thread
188     // we make a rough calculation to reduce the number of threads for small data size.
189     // This also works fine for the minmax loop.
190     if (multithread) {
191         const size_t maxThreads = omp_get_max_threads();
192         while (size > numThreads * numThreads * 16384 && numThreads < maxThreads) {
193             ++numThreads;
194         }
195     }
196 #endif
197 
198     // We need min and max value of data to calculate the scale factor for the histogram
199     float minVal = data[0];
200     float maxVal = data[0];
201 #ifdef _OPENMP
202     #pragma omp parallel for reduction(min:minVal) reduction(max:maxVal) num_threads(numThreads)
203 #endif
204     for (size_t i = 1; i < size; ++i) {
205         minVal = std::min(minVal, data[i]);
206         maxVal = std::max(maxVal, data[i]);
207     }
208 
209     if (std::fabs(maxVal - minVal) == 0.f) { // fast exit, also avoids division by zero in calculation of scale factor
210         *minOut = *maxOut = minVal;
211         return;
212     }
213 
214     // Caution: Currently this works correctly only for histoSize in range[1;65536].
215     // For small data size (i.e. thumbnails) we reduce the size of the histogram to the size of data.
216     const unsigned int histoSize = std::min<size_t>(65536, size);
217 
218     // calculate scale factor to use full range of histogram
219     const float scale = (histoSize - 1) / (maxVal - minVal);
220 
221     // We need one main histogram
222     std::vector<uint32_t> histo(histoSize, 0);
223 
224     if (numThreads == 1) {
225         // just one thread => use main histogram
226         for (size_t i = 0; i < size; ++i) {
227             // we have to subtract minVal and multiply with scale to get the data in [0;histosize] range
228             histo[static_cast<uint16_t>(scale * (data[i] - minVal))]++;
229         }
230     } else {
231 #ifdef _OPENMP
232     #pragma omp parallel num_threads(numThreads)
233 #endif
234         {
235             // We need one histogram per thread
236             std::vector<uint32_t> histothr(histoSize, 0);
237 
238 #ifdef _OPENMP
239             #pragma omp for nowait
240 #endif
241             for (size_t i = 0; i < size; ++i) {
242                 // we have to subtract minVal and multiply with scale to get the data in [0;histosize] range
243                 histothr[static_cast<uint16_t>(scale * (data[i] - minVal))]++;
244             }
245 
246 #ifdef _OPENMP
247             #pragma omp critical
248 #endif
249             {
250                 // add per thread histogram to main histogram
251 #ifdef _OPENMP
252                 #pragma omp simd
253 #endif
254 
255                 for (size_t i = 0; i < histoSize; ++i) {
256                     histo[i] += histothr[i];
257                 }
258             }
259         }
260     }
261 
262     size_t k = 0;
263     size_t count = 0;
264 
265     // find (minPrct*size) smallest value
266     const float threshmin = minPrct * size;
267     while (count < threshmin) {
268         count += histo[k++];
269     }
270 
271     if (k > 0) { // interpolate
272         const size_t count_ = count - histo[k - 1];
273         const float c0 = count - threshmin;
274         const float c1 = threshmin - count_;
275         *minOut = (c1 * k + c0 * (k - 1)) / (c0 + c1);
276     } else {
277         *minOut = k;
278     }
279     // go back to original range
280     *minOut /= scale;
281     *minOut += minVal;
282     *minOut = rtengine::LIM(*minOut, minVal, maxVal);
283 
284     // find (maxPrct*size) smallest value
285     const float threshmax = maxPrct * size;
286     while (count < threshmax) {
287         count += histo[k++];
288     }
289 
290     if (k > 0) { // interpolate
291         const size_t count_ = count - histo[k - 1];
292         const float c0 = count - threshmax;
293         const float c1 = threshmax - count_;
294         *maxOut = (c1 * k + c0 * (k - 1)) / (c0 + c1);
295     } else {
296         *maxOut = k;
297     }
298     // go back to original range
299     *maxOut /= scale;
300     *maxOut += minVal;
301     *maxOut = rtengine::LIM(*maxOut, minVal, maxVal);
302 }
303 
buildBlendMask(const float * const * luminance,float ** blend,int W,int H,float & contrastThreshold,bool autoContrast,float ** clipMask)304 void buildBlendMask(const float* const * luminance, float **blend, int W, int H, float &contrastThreshold, bool autoContrast, float ** clipMask) {
305 
306     if (autoContrast) {
307         constexpr float minLuminance = 2000.f;
308         constexpr float maxLuminance = 20000.f;
309         constexpr float minTileVariance = 0.5f;
310         for (int pass = 0; pass < 2; ++pass) {
311             const int tilesize = 80 / (pass + 1);
312             const int skip = pass == 0 ? tilesize : tilesize / 4;
313             const int numTilesW = W / skip - 3 * pass;
314             const int numTilesH = H / skip - 3 * pass;
315             std::vector<std::vector<float>> variances(numTilesH, std::vector<float>(numTilesW));
316 
317 #ifdef _OPENMP
318             #pragma omp parallel for schedule(dynamic)
319 #endif
320             for (int i = 0; i < numTilesH; ++i) {
321                 const int tileY = i * skip;
322                 for (int j = 0; j < numTilesW; ++j) {
323                     const int tileX = j * skip;
324                     const float avg = tileAverage(luminance, tileY, tileX, tilesize);
325                     if (avg < minLuminance || avg > maxLuminance) {
326                         // too dark or too bright => skip the tile
327                         variances[i][j] = RT_INFINITY_F;
328                         continue;
329                     } else {
330                         variances[i][j] = tileVariance(luminance, tileY, tileX, tilesize, avg);
331                         // exclude tiles with a variance less than minTileVariance
332                         variances[i][j] = variances[i][j] < minTileVariance ? RT_INFINITY_F : variances[i][j];
333                     }
334                 }
335             }
336 
337             float minvar = RT_INFINITY_F;
338             int minI = 0, minJ = 0;
339             for (int i = 0; i < numTilesH; ++i) {
340                 for (int j = 0; j < numTilesW; ++j) {
341                     if (variances[i][j] < minvar) {
342                         minvar = variances[i][j];
343                         minI = i;
344                         minJ = j;
345                     }
346                 }
347             }
348 
349             if (minvar <= 1.f || pass == 1) {
350                 const int minY = skip * minI;
351                 const int minX = skip * minJ;
352                 if (pass == 0) {
353                     // a variance <= 1 means we already found a flat region and can skip second pass
354                     contrastThreshold = calcContrastThreshold(luminance, minY, minX, tilesize);
355                     break;
356                 } else {
357                     // in second pass we allow a variance of 8
358                     // we additionally scan the tiles +-skip pixels around the best tile from pass 2
359                     // Means we scan (2 * skip + 1)^2 tiles in this step to get a better hit rate
360                     // fortunately the scan is quite fast, so we use only one core and don't parallelize
361                     const int topLeftYStart = std::max(minY - skip, 0);
362                     const int topLeftXStart = std::max(minX - skip, 0);
363                     const int topLeftYEnd = std::min(minY + skip, H - tilesize);
364                     const int topLeftXEnd = std::min(minX + skip, W - tilesize);
365                     const int numTilesH = topLeftYEnd - topLeftYStart + 1;
366                     const int numTilesW = topLeftXEnd - topLeftXStart + 1;
367 
368                     std::vector<std::vector<float>> variances(numTilesH, std::vector<float>(numTilesW));
369                     for (int i = 0; i < numTilesH; ++i) {
370                         const int tileY = topLeftYStart + i;
371                         for (int j = 0; j < numTilesW; ++j) {
372                             const int tileX = topLeftXStart + j;
373                             const float avg = tileAverage(luminance, tileY, tileX, tilesize);
374 
375                             if (avg < minLuminance || avg > maxLuminance) {
376                                 // too dark or too bright => skip the tile
377                                 variances[i][j] = RT_INFINITY_F;
378                                 continue;
379                             } else {
380                                 variances[i][j] = tileVariance(luminance, tileY, tileX, tilesize, avg);
381                             // exclude tiles with a variance less than minTileVariance
382                             variances[i][j] = variances[i][j] < minTileVariance ? RT_INFINITY_F : variances[i][j];
383                             }
384                         }
385                     }
386 
387                     float minvar = RT_INFINITY_F;
388                     int minI = 0, minJ = 0;
389                     for (int i = 0; i < numTilesH; ++i) {
390                         for (int j = 0; j < numTilesW; ++j) {
391                             if (variances[i][j] < minvar) {
392                                 minvar = variances[i][j];
393                                 minI = i;
394                                 minJ = j;
395                             }
396                         }
397                     }
398 
399                     contrastThreshold = minvar <= 8.f ? calcContrastThreshold(luminance, topLeftYStart + minI, topLeftXStart + minJ, tilesize) : 0.f;
400                 }
401             }
402         }
403     }
404 
405     if(contrastThreshold == 0.f && !clipMask) {
406         for(int j = 0; j < H; ++j) {
407             for(int i = 0; i < W; ++i) {
408                 blend[j][i] = 1.f;
409             }
410         }
411     } else {
412         constexpr float scale = 0.0625f / 327.68f;
413 #ifdef _OPENMP
414         #pragma omp parallel
415 #endif
416         {
417 #ifdef __SSE2__
418             const vfloat contrastThresholdv = F2V(contrastThreshold);
419             const vfloat scalev = F2V(scale);
420 #endif
421 #ifdef _OPENMP
422             #pragma omp for schedule(dynamic,16)
423 #endif
424 
425             for(int j = 2; j < H - 2; ++j) {
426                 int i = 2;
427 #ifdef __SSE2__
428                 if (clipMask) {
429                     for(; i < W - 5; i += 4) {
430                         vfloat contrastv = vsqrtf(SQRV(LVFU(luminance[j][i+1]) - LVFU(luminance[j][i-1])) + SQRV(LVFU(luminance[j+1][i]) - LVFU(luminance[j-1][i])) +
431                                                   SQRV(LVFU(luminance[j][i+2]) - LVFU(luminance[j][i-2])) + SQRV(LVFU(luminance[j+2][i]) - LVFU(luminance[j-2][i]))) * scalev;
432 
433                         STVFU(blend[j][i], LVFU(clipMask[j][i]) * calcBlendFactor(contrastv, contrastThresholdv));
434                     }
435                 } else {
436                     for(; i < W - 5; i += 4) {
437                         vfloat contrastv = vsqrtf(SQRV(LVFU(luminance[j][i+1]) - LVFU(luminance[j][i-1])) + SQRV(LVFU(luminance[j+1][i]) - LVFU(luminance[j-1][i])) +
438                                                   SQRV(LVFU(luminance[j][i+2]) - LVFU(luminance[j][i-2])) + SQRV(LVFU(luminance[j+2][i]) - LVFU(luminance[j-2][i]))) * scalev;
439 
440                         STVFU(blend[j][i], calcBlendFactor(contrastv, contrastThresholdv));
441                     }
442                 }
443 #endif
444                 for(; i < W - 2; ++i) {
445 
446                     float contrast = sqrtf(rtengine::SQR(luminance[j][i+1] - luminance[j][i-1]) + rtengine::SQR(luminance[j+1][i] - luminance[j-1][i]) +
447                                            rtengine::SQR(luminance[j][i+2] - luminance[j][i-2]) + rtengine::SQR(luminance[j+2][i] - luminance[j-2][i])) * scale;
448 
449                     blend[j][i] = (clipMask ? clipMask[j][i] : 1.f) * calcBlendFactor(contrast, contrastThreshold);
450                 }
451             }
452 
453 #ifdef _OPENMP
454             #pragma omp single
455 #endif
456             {
457                 // upper border
458                 for(int j = 0; j < 2; ++j) {
459                     for(int i = 2; i < W - 2; ++i) {
460                         blend[j][i] = blend[2][i];
461                     }
462                 }
463                 // lower border
464                 for(int j = H - 2; j < H; ++j) {
465                     for(int i = 2; i < W - 2; ++i) {
466                         blend[j][i] = blend[H-3][i];
467                     }
468                 }
469                 for(int j = 0; j < H; ++j) {
470                     // left border
471                     blend[j][0] = blend[j][1] = blend[j][2];
472                     // right border
473                     blend[j][W - 2] = blend[j][W - 1] = blend[j][W - 3];
474                 }
475             }
476         }
477     }
478 }
479 
480 }
481