1 /* -*- C++ -*-
2  *
3  *  This file is part of RawTherapee.
4  *
5  *  Ported from LuminanceHDR by Alberto Griggio <alberto.griggio@gmail.com>
6  *
7  *  RawTherapee is free software: you can redistribute it and/or modify
8  *  it under the terms of the GNU General Public License as published by
9  *  the Free Software Foundation, either version 3 of the License, or
10  *  (at your option) any later version.
11  *
12  *  RawTherapee is distributed in the hope that it will be useful,
13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  *  GNU General Public License for more details.
16  *
17  *  You should have received a copy of the GNU General Public License
18  *  along with RawTherapee.  If not, see <http://www.gnu.org/licenses/>.
19  */
20 
21 /**
22  * @file tmo_fattal02.cpp
23  * @brief TMO: Gradient Domain High Dynamic Range Compression
24  *
25  * Implementation of Gradient Domain High Dynamic Range Compression
26  * by Raanan Fattal, Dani Lischinski, Michael Werman.
27  *
28  * @author Grzegorz Krawczyk, <krawczyk@mpi-sb.mpg.de>
29  *
30  *
31  * This file is a part of LuminanceHDR package, based on pfstmo.
32  * ----------------------------------------------------------------------
33  * Copyright (C) 2003,2004 Grzegorz Krawczyk
34  *
35  *  This program is free software; you can redistribute it and/or modify
36  *  it under the terms of the GNU General Public License as published by
37  *  the Free Software Foundation; either version 2 of the License, or
38  *  (at your option) any later version.
39  *
40  *  This program is distributed in the hope that it will be useful,
41  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
42  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
43  *  GNU General Public License for more details.
44  *
45  *  You should have received a copy of the GNU General Public License
46  *  along with this program; if not, write to the Free Software
47  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
48  * ----------------------------------------------------------------------
49  *
50  * $Id: tmo_fattal02.cpp,v 1.3 2008/11/04 23:43:08 rafm Exp $
51  */
52 
53 
54 #ifdef _OPENMP
55 #include <omp.h>
56 #endif
57 #include <cstdio>
58 #include <iostream>
59 #include <iterator>
60 #include <vector>
61 #include <algorithm>
62 #include <limits>
63 
64 #include <math.h>
65 #include <assert.h>
66 #include <fftw3.h>
67 
68 #include "array2D.h"
69 #include "improcfun.h"
70 #include "settings.h"
71 #include "iccstore.h"
72 #include "StopWatch.h"
73 #include "sleef.h"
74 #include "opthelper.h"
75 #include "rt_algo.h"
76 #include "rescale.h"
77 #include "ipdenoise.h"
78 
79 namespace rtengine
80 {
81 
82 /******************************************************************************
83  * RT code
84  ******************************************************************************/
85 
86 extern const Settings *settings;
87 extern MyMutex *fftwMutex;
88 
89 using namespace std;
90 
91 namespace
92 {
93 
94 class Array2Df: public array2D<float>
95 {
96     typedef array2D<float> Super;
97 public:
Array2Df()98     Array2Df(): Super() {}
Array2Df(int w,int h)99     Array2Df(int w, int h): Super(w, h) {}
Array2Df(int w,int h,float ** data)100     Array2Df(int w, int h, float **data):
101         Super(w, h, data, ARRAY2D_BYREFERENCE) {}
102 
operator ()(int w,int h)103     float &operator() (int w, int h)
104     {
105         return (*this)[h][w];
106     }
107 
operator ()(int w,int h) const108     const float &operator() (int w, int h) const
109     {
110         return (*this)[h][w];
111     }
112 
operator ()(int i)113     float &operator() (int i)
114     {
115         return static_cast<float *> (*this)[i];
116     }
117 
operator ()(int i) const118     const float &operator() (int i) const
119     {
120         return const_cast<Array2Df &> (*this).operator() (i);
121     }
122 
getRows() const123     int getRows() const
124     {
125         return const_cast<Array2Df &> (*this).height();
126     }
127 
getCols() const128     int getCols() const
129     {
130         return const_cast<Array2Df &> (*this).width();
131     }
132 
data()133     float *data()
134     {
135         return static_cast<float *> (*this);
136     }
137 
data() const138     const float *data() const
139     {
140         return const_cast<Array2Df &> (*this).data();
141     }
142 
143     // operator float **() const
144     // {
145     //     return static_cast<float **>(const_cast<Array2Df &>(*this));
146     // }
147 };
148 
149 // upper bound on image dimension used in tmo_fattal02 -- see the comment there
150 const int RT_dimension_cap = 1920;
151 
152 void rescale_bilinear (const Array2Df &src, Array2Df &dst, bool multithread);
153 
154 
155 /******************************************************************************
156  * Luminance HDR code (modifications are marked with an RT comment)
157  ******************************************************************************/
158 
downSample(const Array2Df & A,Array2Df & B)159 void downSample (const Array2Df& A, Array2Df& B)
160 {
161     const int width = B.getCols();
162     const int height = B.getRows();
163 
164     // Note, I've uncommented all omp directives. They are all ok but are
165     // applied to too small problems and in total don't lead to noticeable
166     // speed improvements. The main issue is the pde solver and in case of the
167     // fft solver uses optimised threaded fftw routines.
168     //#pragma omp parallel for
169     for ( int y = 0 ; y < height ; y++ ) {
170         for ( int x = 0 ; x < width ; x++ ) {
171             float p = A (2 * x, 2 * y);
172             p += A (2 * x + 1, 2 * y);
173             p += A (2 * x, 2 * y + 1);
174             p += A (2 * x + 1, 2 * y + 1);
175             B (x, y) = p * 0.25f; // p / 4.0f;
176         }
177     }
178 }
179 
gaussianBlur(const Array2Df & I,Array2Df & L,bool multithread)180 void gaussianBlur (const Array2Df& I, Array2Df& L, bool multithread)
181 {
182     const int width = I.getCols();
183     const int height = I.getRows();
184 
185     if (width < 3 || height < 3) {
186         if (&I != &L) {
187             for (int i = 0, n = width * height; i < n; ++i) {
188                 L (i) = I (i);
189             }
190         }
191 
192         return;
193     }
194 
195     Array2Df T (width, height);
196 
197     //--- X blur
198 #ifdef _OPENMP
199     #pragma omp parallel for shared(I, T) if(multithread)
200 #endif
201 
202     for ( int y = 0 ; y < height ; y++ ) {
203         for ( int x = 1 ; x < width - 1 ; x++ ) {
204             float t = 2.f * I (x, y);
205             t += I (x - 1, y);
206             t += I (x + 1, y);
207             T (x, y) = t * 0.25f; // t / 4.f;
208         }
209 
210         T (0, y) = ( 3.f * I (0, y) + I (1, y) ) * 0.25f; // / 4.f;
211         T (width - 1, y) = ( 3.f * I (width - 1, y) + I (width - 2, y) ) * 0.25f; // / 4.f;
212     }
213 
214     //--- Y blur
215 #ifdef _OPENMP
216     #pragma omp parallel for if(multithread)
217 #endif
218 
219     for ( int x = 0 ; x < width - 7 ; x += 8 ) {
220         for ( int y = 1 ; y < height - 1 ; y++ ) {
221             for (int xx = 0; xx < 8; ++xx) {
222                 float t = 2.f * T (x + xx, y);
223                 t += T (x + xx, y - 1);
224                 t += T (x + xx, y + 1);
225                 L (x + xx, y) = t * 0.25f; // t/4.0f;
226             }
227         }
228 
229         for (int xx = 0; xx < 8; ++xx) {
230             L (x + xx, 0) = ( 3.f * T (x + xx, 0) + T (x + xx, 1) ) * 0.25f; // / 4.0f;
231             L (x + xx, height - 1) = ( 3.f * T (x + xx, height - 1) + T (x + xx, height - 2) ) * 0.25f; // / 4.0f;
232         }
233     }
234 
235     for ( int x = width - (width % 8) ; x < width ; x++ ) {
236         for ( int y = 1 ; y < height - 1 ; y++ ) {
237             float t = 2.f * T (x, y);
238             t += T (x, y - 1);
239             t += T (x, y + 1);
240             L (x, y) = t * 0.25f; // t/4.0f;
241         }
242 
243         L (x, 0) = ( 3.f * T (x, 0) + T (x, 1) ) * 0.25f; // / 4.0f;
244         L (x, height - 1) = ( 3.f * T (x, height - 1) + T (x, height - 2) ) * 0.25f; // / 4.0f;
245     }
246 }
247 
createGaussianPyramids(Array2Df ** pyramids,int nlevels,bool multithread)248 void createGaussianPyramids (Array2Df** pyramids, int nlevels, bool multithread)
249 {
250     // pyramids[0] is already set
251     int width = pyramids[0]->getCols();
252     int height = pyramids[0]->getRows();
253 
254 
255     Array2Df* L = new Array2Df (width, height);
256     gaussianBlur ( *pyramids[0], *L, multithread );
257 
258     for ( int k = 1 ; k < nlevels ; k++ ) {
259         if (width > 2 && height > 2) {
260             width /= 2;
261             height /= 2;
262             pyramids[k] = new Array2Df (width, height);
263             downSample (*L, *pyramids[k]);
264         } else {
265             // RT - now nlevels is fixed in tmo_fattal02 (see the comment in
266             // there), so it might happen that we have to add some padding to
267             // the gaussian pyramids
268             pyramids[k] = new Array2Df (width, height);
269 
270             for (int j = 0, n = width * height; j < n; ++j) {
271                 (*pyramids[k]) (j) = (*L) (j);
272             }
273         }
274 
275         if (k < nlevels - 1) {
276             delete L;
277             L = new Array2Df (width, height);
278             gaussianBlur ( *pyramids[k], *L, multithread );
279         }
280     }
281 
282     delete L;
283 }
284 
285 //--------------------------------------------------------------------
286 
calculateGradients(Array2Df * H,Array2Df * G,int k,bool multithread)287 float calculateGradients (Array2Df* H, Array2Df* G, int k, bool multithread)
288 {
289     const int width = H->getCols();
290     const int height = H->getRows();
291     const float divider = pow ( 2.0f, k + 1 );
292     double avgGrad = 0.0; // use double precision for large summations
293 
294 #ifdef _OPENMP
295     #pragma omp parallel for reduction(+:avgGrad) if(multithread)
296 #endif
297 
298     for ( int y = 0 ; y < height ; y++ ) {
299         int n = (y == 0 ? 0 : y - 1);
300         int s = (y + 1 == height ? y : y + 1);
301 
302         for ( int x = 0 ; x < width ; x++ ) {
303             float gx, gy;
304             int w, e;
305             w = (x == 0 ? 0 : x - 1);
306             e = (x + 1 == width ? x : x + 1);
307 
308             gx = ((*H) (w, y) - (*H) (e, y));
309 
310             gy = ((*H) (x, s) - (*H) (x, n));
311             // note this implicitly assumes that H(-1)=H(0)
312             // for the fft-pde slover this would need adjustment as H(-1)=H(1)
313             // is assumed, which means gx=0.0, gy=0.0 at the boundaries
314             // however, the impact is not visible so we ignore this here
315 
316             (*G) (x, y) = sqrt (gx * gx + gy * gy) / divider;
317             avgGrad += (*G) (x, y);
318         }
319     }
320 
321     return avgGrad / (width * height);
322 }
323 
324 //--------------------------------------------------------------------
325 
upSample(const Array2Df & A,Array2Df & B)326 void upSample (const Array2Df& A, Array2Df& B)
327 {
328     const int width = B.getCols();
329     const int height = B.getRows();
330     const int awidth = A.getCols();
331     const int aheight = A.getRows();
332 
333     //#pragma omp parallel for shared(A, B)
334     for ( int y = 0 ; y < height ; y++ ) {
335         for ( int x = 0 ; x < width ; x++ ) {
336             int ax = static_cast<int> (x * 0.5f); //x / 2.f;
337             int ay = static_cast<int> (y * 0.5f); //y / 2.f;
338             ax = (ax < awidth) ? ax : awidth - 1;
339             ay = (ay < aheight) ? ay : aheight - 1;
340 
341             B (x, y) = A (ax, ay);
342         }
343     }
344 
345 //--- this code below produces 'use of uninitialized value error'
346 //   int width = A->getCols();
347 //   int height = A->getRows();
348 //   int x,y;
349 
350 //   for( y=0 ; y<height ; y++ )
351 //     for( x=0 ; x<width ; x++ )
352 //     {
353 //       (*B)(2*x,2*y) = (*A)(x,y);
354 //       (*B)(2*x+1,2*y) = (*A)(x,y);
355 //       (*B)(2*x,2*y+1) = (*A)(x,y);
356 //       (*B)(2*x+1,2*y+1) = (*A)(x,y);
357 //     }
358 }
359 
360 
calculateFiMatrix(Array2Df * FI,Array2Df * gradients[],float avgGrad[],int nlevels,int detail_level,float alfa,float beta,float noise,bool multithread)361 void calculateFiMatrix (Array2Df* FI, Array2Df* gradients[],
362                         float avgGrad[], int nlevels, int detail_level,
363                         float alfa, float beta, float noise, bool multithread)
364 {
365     int width = gradients[nlevels - 1]->getCols();
366     int height = gradients[nlevels - 1]->getRows();
367     Array2Df** fi = new Array2Df*[nlevels];
368 
369     fi[nlevels - 1] = new Array2Df (width, height);
370 
371 #ifdef _OPENMP
372     #pragma omp parallel for shared(fi) if(multithread)
373 #endif
374     for ( int k = 0 ; k < width * height ; k++ ) {
375         (*fi[nlevels - 1]) (k) = 1.0f;
376     }
377 
378     for ( int k = nlevels - 1; k >= 0 ; k-- ) {
379         width = gradients[k]->getCols();
380         height = gradients[k]->getRows();
381 
382         // only apply gradients to levels>=detail_level but at least to the coarsest
383         if ((k >= detail_level || k == nlevels - 1) && beta != 1.f)  {
384             const float a = alfa * avgGrad[k];
385             //DEBUG_STR << "calculateFiMatrix: apply gradient to level " << k << endl;
386 #ifdef _OPENMP
387             #pragma omp parallel for shared(fi,avgGrad) if(multithread)
388 #endif
389             for ( int y = 0; y < height; y++ ) {
390                 for ( int x = 0; x < width; x++ ) {
391                     float grad = ((*gradients[k]) (x, y) < 1e-4f) ? 1e-4 : (*gradients[k]) (x, y);
392                     float value = pow ((grad + noise) / a, beta - 1.0f);
393 
394                     (*fi[k]) (x, y) *= value;
395                 }
396             }
397         }
398 
399         // create next level
400         if ( k > 1 ) {
401             width = gradients[k - 1]->getCols();
402             height = gradients[k - 1]->getRows();
403             fi[k - 1] = new Array2Df (width, height);
404         } else {
405             fi[0] = FI;    // highest level -> result
406         }
407 
408         if (k > 0) {
409             upSample (*fi[k], *fi[k - 1]);        // upsample to next level
410             gaussianBlur (*fi[k - 1], *fi[k - 1], multithread);
411         }
412     }
413 
414     for ( int k = 1 ; k < nlevels ; k++ ) {
415         delete fi[k];
416     }
417 
418     delete[] fi;
419 }
420 
421 void solve_pde_fft (Array2Df *F, Array2Df *U, Array2Df *buf, bool multithread);
422 
tmo_fattal02(size_t width,size_t height,const Array2Df & Y,Array2Df & L,float alfa,float beta,float noise,int detail_level,bool multithread)423 void tmo_fattal02 (size_t width,
424                    size_t height,
425                    const Array2Df& Y,
426                    Array2Df& L,
427                    float alfa,
428                    float beta,
429                    float noise,
430                    int detail_level,
431                    bool multithread)
432 {
433 // #ifdef TIMER_PROFILING
434 //     msec_timer stop_watch;
435 //     stop_watch.start();
436 // #endif
437     // static const float black_point = 0.1f;
438     // static const float white_point = 0.5f;
439     // static const float gamma = 1.0f; // 0.8f;
440 
441     // static const int   detail_level = 3;
442     if ( detail_level < 0 ) {
443         detail_level = 0;
444     }
445 
446     if ( detail_level > 3 ) {
447         detail_level = 3;
448     }
449 
450     // ph.setValue(2);
451     // if (ph.canceled()) return;
452 
453     /* RT -- we use a hardcoded value for nlevels, to limit the
454      * dependency of the result on the image size. When using an auto computed
455      * nlevels value, you would get vastly different results with different
456      * image sizes, making it essentially impossible to preview the tool
457      * inside RT. With a hardcoded value, the results for the preview are much
458      * closer to those for the final image */
459     // int MSIZE = 32;         // minimum size of gaussian pyramid
460     // // I believe a smaller value than 32 results in slightly better overall
461     // // quality but I'm only applying this if the newly implemented fft solver
462     // // is used in order not to change behaviour of the old version
463     // // TODO: best let the user decide this value
464     // // if (fftsolver)
465     // {
466     //    MSIZE = 8;
467     // }
468 
469 
470     // int size = width * height;
471 
472     // find max value, normalize to range 0..100 and take logarithm
473     // float minLum = Y (0, 0);
474 //     float maxLum = Y (0, 0);
475 
476 // #ifdef _OPENMP
477 //     #pragma omp parallel for reduction(max:maxLum) if(multithread)
478 // #endif
479 
480 //     for ( int i = 0 ; i < size ; i++ ) {
481 //         maxLum = std::max (maxLum, Y (i));
482 //     }
483 
484     Array2Df* H = new Array2Df (width, height);
485     //float temp = 100.f / maxLum;
486     float eps = 1e-4f;
487 #ifdef _OPENMP
488     #pragma omp parallel if(multithread)
489 #endif
490     {
491 #ifdef __SSE2__
492         vfloat epsv = F2V (eps);
493         //vfloat tempv = F2V (temp);
494 #endif
495 #ifdef _OPENMP
496         #pragma omp for schedule(dynamic,16)
497 #endif
498 
499         for (size_t i = 0 ; i < height ; ++i) {
500             size_t j = 0;
501 #ifdef __SSE2__
502 
503             for (; j < width - 3; j += 4) {
504                 STVFU ((*H)[i][j], xlogf(/*tempv **/ LVFU (Y[i][j]) + epsv));
505             }
506 
507 #endif
508 
509             for (; j < width; ++j) {
510                 (*H)[i][j] = xlogf(/*temp **/ Y[i][j] + eps);
511             }
512         }
513     }
514 
515     /** RT - this is also here to reduce the dependency of the results on the
516      * input image size, with the primary aim of having a preview in RT that is
517      * reasonably close to the actual output image. Intuitively, what we do is
518      * to put a cap on the dimension of the image processed, so that it is close
519      * in size to the typical preview that you will see on a normal consumer
520      * monitor. (That's where the 1920 value for RT_dimension_cap comes from.)
521      * However, we can't simply downscale the input Y array and then upscale it
522      * on output, because that would cause a big loss of sharpness (confirmed by
523      * testing).
524      * So, we use a different method: we downscale the H array, so that we
525      * compute a downscaled gaussian pyramid and a downscaled FI matrix. Then,
526      * we upscale the FI matrix later on, before it gets combined with the
527      * original input luminance array H. This seems to preserve the input
528      * sharpness and at the same time significantly reduce the dependency of the
529      * result on the input size. Clearly this is a hack, and keep in mind that I
530      * do not really know how Fattal works (it comes from LuminanceHDR almost
531      * verbatim), so this should probably be revised/reviewed by someone who
532      * knows better... also, we use a quite naive bilinear interpolation
533      * algorithm (see rescale_bilinear below), which could definitely be
534      * improved */
535     int fullwidth = width;
536     int fullheight = height;
537     int dim = std::max (width, height);
538     Array2Df *fullH = nullptr;
539 
540     if (dim > RT_dimension_cap) {
541         float s = float (RT_dimension_cap) / float (dim);
542         Array2Df *HH = new Array2Df (width * s, height * s);
543         rescale_bilinear (*H, *HH, multithread);
544         fullH = H;
545         H = HH;
546         width = H->getCols();
547         height = H->getRows();
548     }
549 
550     /** RT */
551 
552     const int nlevels = 7; // RT -- see above
553 
554     Array2Df* pyramids[nlevels];
555     pyramids[0] = H;
556     createGaussianPyramids (pyramids, nlevels, multithread);
557 
558     // calculate gradients and its average values on pyramid levels
559     Array2Df* gradients[nlevels];
560     float avgGrad[nlevels];
561 
562     for ( int k = 0 ; k < nlevels ; k++ ) {
563         gradients[k] = new Array2Df (pyramids[k]->getCols(), pyramids[k]->getRows());
564         avgGrad[k] = calculateGradients (pyramids[k], gradients[k], k, multithread);
565         if(k != 0) // pyramids[0] is H. Will be deleted later
566             delete pyramids[k];
567     }
568 
569 
570     // calculate fi matrix
571     Array2Df* FI = new Array2Df (width, height);
572     calculateFiMatrix (FI, gradients, avgGrad, nlevels, detail_level, alfa, beta, noise, multithread);
573 
574     for ( int i = 0 ; i < nlevels ; i++ ) {
575         delete gradients[i];
576     }
577 
578     /** - RT - bring back the FI image to the input size if it was downscaled */
579     if (fullH) {
580         delete H;
581         H = fullH;
582         Array2Df *FI2 = new Array2Df (fullwidth, fullheight);
583         rescale_bilinear (*FI, *FI2, multithread);
584         delete FI;
585         FI = FI2;
586         width = fullwidth;
587         height = fullheight;
588     }
589 
590     /** RT */
591 
592     // attenuate gradients
593     Array2Df* Gx = new Array2Df (width, height);
594     Array2Df* Gy = &L; // use L as buffer for Gy
595 
596     // the fft solver solves the Poisson pde but with slightly different
597     // boundary conditions, so we need to adjust the assembly of the right hand
598     // side accordingly (basically fft solver assumes U(-1) = U(1), whereas zero
599     // Neumann conditions assume U(-1)=U(0)), see also divergence calculation
600 #ifdef _OPENMP
601     #pragma omp parallel for if(multithread)
602 #endif
603 
604     for ( size_t y = 0 ; y < height ; y++ ) {
605         // sets index+1 based on the boundary assumption H(N+1)=H(N-1)
606         unsigned int yp1 = (y + 1 >= height ? height - 2 : y + 1);
607 
608         for ( size_t x = 0 ; x < width ; x++ ) {
609             // sets index+1 based on the boundary assumption H(N+1)=H(N-1)
610             unsigned int xp1 = (x + 1 >= width ?  width - 2  : x + 1);
611             // forward differences in H, so need to use between-points approx of FI
612             (*Gx) (x, y) = ((*H) (xp1, y) - (*H) (x, y)) * 0.5 * ((*FI) (xp1, y) + (*FI) (x, y));
613             (*Gy) (x, y) = ((*H) (x, yp1) - (*H) (x, y)) * 0.5 * ((*FI) (x, yp1) + (*FI) (x, y));
614         }
615     }
616 
617     delete H;
618 
619     // calculate divergence
620 #ifdef _OPENMP
621     #pragma omp parallel for if(multithread)
622 #endif
623 
624     for ( size_t y = 0; y < height; ++y ) {
625         for ( size_t x = 0; x < width; ++x ) {
626             (*FI) (x, y) = (*Gx) (x, y) + (*Gy) (x, y);
627 
628             if ( x > 0 ) {
629                 (*FI) (x, y) -= (*Gx) (x - 1, y);
630             }
631 
632             if ( y > 0 ) {
633                 (*FI) (x, y) -= (*Gy) (x, y - 1);
634             }
635 
636             if (x == 0) {
637                 (*FI) (x, y) += (*Gx) (x, y);
638             }
639 
640             if (y == 0) {
641                 (*FI) (x, y) += (*Gy) (x, y);
642             }
643 
644         }
645     }
646 
647     //delete Gx; // RT - reused as temp buffer in solve_pde_fft, deleted later
648 
649     // solve pde and exponentiate (ie recover compressed image)
650     {
651         MyMutex::MyLock lock (*fftwMutex);
652         solve_pde_fft (FI, &L, Gx, multithread);
653     }
654     delete Gx;
655     delete FI;
656 
657 #ifdef _OPENMP
658     #pragma omp parallel if(multithread)
659 #endif
660     {
661 // #ifdef __SSE2__
662 //         vfloat gammav = F2V (gamma);
663 // #endif
664 #ifdef _OPENMP
665         #pragma omp for schedule(dynamic,16)
666 #endif
667 
668         for (size_t i = 0 ; i < height ; i++) {
669             size_t j = 0;
670 #ifdef __SSE2__
671 
672             for (; j < width - 3; j += 4) {
673                 STVFU (L[i][j], xexpf (/*gammav **/ LVFU (L[i][j])));
674             }
675 
676 #endif
677 
678             for (; j < width; j++) {
679                 L[i][j] = xexpf ( /*gamma **/ L[i][j]);
680             }
681         }
682     }
683 }
684 
685 
686 /**
687  *
688  * @file pde_fft.cpp
689  * @brief Direct Poisson solver using the discrete cosine transform
690  *
691  * @author Tino Kluge (tino.kluge@hrz.tu-chemnitz.de)
692  *
693  */
694 
695 //////////////////////////////////////////////////////////////////////
696 // Direct Poisson solver using the discrete cosine transform
697 //////////////////////////////////////////////////////////////////////
698 // by Tino Kluge (tino.kluge@hrz.tu-chemnitz.de)
699 //
700 // let U and F be matrices of order (n1,n2), ie n1=height, n2=width
701 // and L_x of order (n2,n2) and L_y of order (n1,n1) and both
702 // representing the 1d Laplace operator with Neumann boundary conditions,
703 // ie L_x and L_y are tridiagonal matrices of the form
704 //
705 //  ( -2  2          )
706 //  (  1 -2  1       )
707 //  (     .  .  .    )
708 //  (        1 -2  1 )
709 //  (           2 -2 )
710 //
711 // then this solver computes U given F based on the equation
712 //
713 //  -------------------------
714 //  L_y U + (L_x U^tr)^tr = F
715 //  -------------------------
716 //
717 // Note, if the first and last row of L_x and L_y contained one's instead of
718 // two's then this equation would be exactly the 2d Poisson equation with
719 // Neumann boundary conditions. As a simple rule:
720 // - Neumann: assume U(-1)=U(0) --> U(i-1) - 2 U(i) + U(i+1) becomes
721 //        i=0: U(0) - 2 U(0) + U(1) = -U(0) + U(1)
722 // - our system: assume U(-1)=U(1) --> this becomes
723 //        i=0: U(1) - 2(0) + U(1) = -2 U(0) + 2 U(1)
724 //
725 // The multi grid solver solve_pde_multigrid() solves the 2d Poisson pde
726 // with the right Neumann boundary conditions, U(-1)=U(0), see function
727 // atimes(). This means the assembly of the right hand side F is different
728 // for both solvers.
729 
730 
731 // returns T = EVy A EVx^tr
732 // note, modifies input data
transform_ev2normal(Array2Df * A,Array2Df * T,bool multithread)733 void transform_ev2normal (Array2Df *A, Array2Df *T, bool multithread)
734 {
735     int width = A->getCols();
736     int height = A->getRows();
737     assert ((int)T->getCols() == width && (int)T->getRows() == height);
738 
739     // the discrete cosine transform is not exactly the transform needed
740     // need to scale input values to get the right transformation
741 #ifdef _OPENMP
742     #pragma omp parallel for if(multithread)
743 #endif
744 
745     for (int y = 1 ; y < height - 1 ; y++ )
746         for (int x = 1 ; x < width - 1 ; x++ ) {
747             (*A) (x, y) *= 0.25f;
748         }
749 
750     for (int x = 1 ; x < width - 1 ; x++ ) {
751         (*A) (x, 0) *= 0.5f;
752         (*A) (x, height - 1) *= 0.5f;
753     }
754 
755     for (int y = 1 ; y < height - 1 ; y++ ) {
756         (*A) (0, y) *= 0.5;
757         (*A) (width - 1, y) *= 0.5f;
758     }
759 
760     // note, fftw provides its own memory allocation routines which
761     // ensure that memory is properly 16/32 byte aligned so it can
762     // use SSE/AVX operations (2/4 double ops in parallel), if our
763     // data is not properly aligned fftw won't use SSE/AVX
764     // (I believe new() aligns memory to 16 byte so avoid overhead here)
765     //
766     // double* in = (double*) fftwf_malloc(sizeof(double) * width*height);
767     // fftwf_free(in);
768 
769     // executes 2d discrete cosine transform
770     fftwf_plan p;
771     p = fftwf_plan_r2r_2d (height, width, A->data(), T->data(),
772                            FFTW_REDFT00, FFTW_REDFT00, FFTW_ESTIMATE);
773     fftwf_execute (p);
774     fftwf_destroy_plan (p);
775 }
776 
777 
778 // returns T = EVy^-1 * A * (EVx^-1)^tr
transform_normal2ev(Array2Df * A,Array2Df * T,bool multithread)779 void transform_normal2ev (Array2Df *A, Array2Df *T, bool multithread)
780 {
781     int width = A->getCols();
782     int height = A->getRows();
783     assert ((int)T->getCols() == width && (int)T->getRows() == height);
784 
785     // executes 2d discrete cosine transform
786     fftwf_plan p;
787     p = fftwf_plan_r2r_2d (height, width, A->data(), T->data(),
788                            FFTW_REDFT00, FFTW_REDFT00, FFTW_ESTIMATE);
789     fftwf_execute (p);
790     fftwf_destroy_plan (p);
791 
792     // need to scale the output matrix to get the right transform
793     float factor = (1.0f / ((height - 1) * (width - 1)));
794 #ifdef _OPENMP
795     #pragma omp parallel for if(multithread)
796 #endif
797 
798     for (int y = 0 ; y < height ; y++ )
799         for (int x = 0 ; x < width ; x++ ) {
800             (*T) (x, y) *= factor;
801         }
802 
803     for (int x = 0 ; x < width ; x++ ) {
804         (*T) (x, 0) *= 0.5f;
805         (*T) (x, height - 1) *= 0.5f;
806     }
807 
808     for (int y = 0 ; y < height ; y++ ) {
809         (*T) (0, y) *= 0.5f;
810         (*T) (width - 1, y) *= 0.5f;
811     }
812 }
813 
814 // returns the eigenvalues of the 1d laplace operator
get_lambda(int n)815 std::vector<double> get_lambda (int n)
816 {
817     assert (n > 1);
818     std::vector<double> v (n);
819 
820     for (int i = 0; i < n; i++) {
821         v[i] = -4.0 * SQR (sin ((double)i / (2 * (n - 1)) * RT_PI));
822     }
823 
824     return v;
825 }
826 
827 // // makes boundary conditions compatible so that a solution exists
828 // void make_compatible_boundary(Array2Df *F)
829 // {
830 //   int width = F->getCols();
831 //   int height = F->getRows();
832 
833 //   double sum=0.0;
834 //   for(int y=1 ; y<height-1 ; y++ )
835 //     for(int x=1 ; x<width-1 ; x++ )
836 //       sum+=(*F)(x,y);
837 
838 //   for(int x=1 ; x<width-1 ; x++ )
839 //     sum+=0.5*((*F)(x,0)+(*F)(x,height-1));
840 
841 //   for(int y=1 ; y<height-1 ; y++ )
842 //     sum+=0.5*((*F)(0,y)+(*F)(width-1,y));
843 
844 //   sum+=0.25*((*F)(0,0)+(*F)(0,height-1)+(*F)(width-1,0)+(*F)(width-1,height-1));
845 
846 //   //DEBUG_STR << "compatible_boundary: int F = " << sum ;
847 //   //DEBUG_STR << " (should be 0 to be solvable)" << std::endl;
848 
849 //   double add=-sum/(height+width-3);
850 //   //DEBUG_STR << "compatible_boundary: adjusting boundary by " << add << std::endl;
851 //   for(int x=0 ; x<width ; x++ )
852 //   {
853 //     (*F)(x,0)+=add;
854 //     (*F)(x,height-1)+=add;
855 //   }
856 //   for(int y=1 ; y<height-1 ; y++ )
857 //   {
858 //     (*F)(0,y)+=add;
859 //     (*F)(width-1,y)+=add;
860 //   }
861 // }
862 
863 
864 
865 // solves Laplace U = F with Neumann boundary conditions
866 // if adjust_bound is true then boundary values in F are modified so that
867 // the equation has a solution, if adjust_bound is set to false then F is
868 // not modified and the equation might not have a solution but an
869 // approximate solution with a minimum error is then calculated
870 // double precision version
solve_pde_fft(Array2Df * F,Array2Df * U,Array2Df * buf,bool multithread)871 void solve_pde_fft (Array2Df *F, Array2Df *U, Array2Df *buf, bool multithread)/*, pfs::Progress &ph,
872                                               bool adjust_bound)*/
873 {
874     // ph.setValue(20);
875     //DEBUG_STR << "solve_pde_fft: solving Laplace U = F ..." << std::endl;
876     int width = F->getCols();
877     int height = F->getRows();
878     assert ((int)U->getCols() == width && (int)U->getRows() == height);
879     assert (buf->getCols() == width && buf->getRows() == height);
880 
881     // activate parallel execution of fft routines
882 #ifdef RT_FFTW3F_OMP
883 
884     if (multithread) {
885         fftwf_init_threads();
886         fftwf_plan_with_nthreads ( omp_get_max_threads() );
887     }
888 
889 // #else
890 //   fftwf_plan_with_nthreads( 2 );
891 #endif
892 
893     // in general there might not be a solution to the Poisson pde
894     // with Neumann boundary conditions unless the boundary satisfies
895     // an integral condition, this function modifies the boundary so that
896     // the condition is exactly satisfied
897     // if(adjust_bound)
898     // {
899     //   //DEBUG_STR << "solve_pde_fft: checking boundary conditions" << std::endl;
900     //   make_compatible_boundary(F);
901     // }
902 
903     // transforms F into eigenvector space: Ftr =
904     //DEBUG_STR << "solve_pde_fft: transform F to ev space (fft)" << std::endl;
905     Array2Df* F_tr = buf;
906     transform_normal2ev (F, F_tr, multithread);
907     // TODO: F no longer needed so could release memory, but as it is an
908     // input parameter we won't do that
909 
910 
911     // in the eigenvector space the solution is very simple
912     std::vector<double> l1 = get_lambda (height);
913     std::vector<double> l2 = get_lambda (width);
914 
915 #ifdef _OPENMP
916     #pragma omp parallel for if(multithread)
917 #endif
918 
919     for (int y = 0 ; y < height ; y++ ) {
920         for (int x = 0 ; x < width ; x++ ) {
921             (*F_tr) (x, y) = (*F_tr) (x, y) / (l1[y] + l2[x]);
922         }
923     }
924 
925     (*F_tr) (0, 0) = 0.f; // any value ok, only adds a const to the solution
926 
927     // transforms F_tr back to the normal space
928     transform_ev2normal (F_tr, U, multithread);
929 
930     // the solution U as calculated will satisfy something like int U = 0
931     // since for any constant c, U-c is also a solution and we are mainly
932     // working in the logspace of (0,1) data we prefer to have
933     // a solution which has no positive values: U_new(x,y)=U(x,y)-max
934     // (not really needed but good for numerics as we later take exp(U))
935     //DEBUG_STR << "solve_pde_fft: removing constant from solution" << std::endl;
936 //     float max = 0.f;
937 // #ifdef _OPENMP
938 //     #pragma omp parallel for reduction(max:max) if(multithread)
939 // #endif
940 
941 //     for (int i = 0; i < width * height; i++) {
942 //         max = std::max (max, (*U) (i));
943 //     }
944 
945 // #ifdef _OPENMP
946 //     #pragma omp parallel for if(multithread)
947 // #endif
948 
949 //     for (int i = 0; i < width * height; i++) {
950 //         (*U) (i) -= max;
951 //     }
952 }
953 
954 
955 // ---------------------------------------------------------------------
956 // the functions below are only for test purposes to check the accuracy
957 // of the pde solvers
958 
959 
960 // // returns the norm of (Laplace U - F) of all interior points
961 // // useful to compare solvers
962 // float residual_pde(Array2Df* U, Array2Df* F)
963 // {
964 //   int width = U->getCols();
965 //   int height = U->getRows();
966 //   assert((int)F->getCols()==width && (int)F->getRows()==height);
967 
968 //   double res=0.0;
969 //   for(int y=1;y<height-1;y++)
970 //     for(int x=1;x<width-1;x++)
971 //     {
972 //       double laplace=-4.0*(*U)(x,y)+(*U)(x-1,y)+(*U)(x+1,y)
973 //                      +(*U)(x,y-1)+(*U)(x,y+1);
974 //       res += SQR( laplace-(*F)(x,y) );
975 //     }
976 //   return static_cast<float>( sqrt(res) );
977 // }
978 
979 
980 /*****************************************************************************
981  * RT code from here on
982  *****************************************************************************/
983 
rescale_bilinear(const Array2Df & src,Array2Df & dst,bool multithread)984 inline void rescale_bilinear (const Array2Df &src, Array2Df &dst, bool multithread)
985 {
986     rescaleBilinear(src, dst, multithread);
987 }
988 
rescale_nearest(const Array2Df & src,Array2Df & dst,bool multithread)989 inline void rescale_nearest (const Array2Df &src, Array2Df &dst, bool multithread)
990 {
991     rescaleNearest(src, dst, multithread);
992 }
993 
994 
luminance(float r,float g,float b,TMatrix ws)995 inline float luminance (float r, float g, float b, TMatrix ws)
996 {
997     return Color::rgbLuminance(r, g, b, ws);
998 }
999 
1000 
round_up_pow2(int dim)1001 inline int round_up_pow2 (int dim)
1002 {
1003     // from https://graphics.stanford.edu/~seander/bithacks.html
1004     assert (dim > 0);
1005     unsigned int v = dim;
1006     v--;
1007     v |= v >> 1;
1008     v |= v >> 2;
1009     v |= v >> 4;
1010     v |= v >> 8;
1011     v |= v >> 16;
1012     v++;
1013     return v;
1014 }
1015 
find_fast_dim(int dim)1016 inline int find_fast_dim (int dim)
1017 {
1018     // as per the FFTW docs:
1019     //
1020     //   FFTW is generally best at handling sizes of the form
1021     //     2^a 3^b 5^c 7^d 11^e 13^f,
1022     //   where e+f is either 0 or 1.
1023     //
1024     // Here, we try to round up to the nearest dim that can be expressed in
1025     // the above form. This is not exhaustive, but should be ok for pictures
1026     // up to 100MPix at least
1027 
1028     int d1 = round_up_pow2 (dim);
1029     std::vector<int> d = {
1030         d1 / 128 * 65,
1031         d1 / 64 * 33,
1032         d1 / 512 * 273,
1033         d1 / 16 * 9,
1034         d1 / 8 * 5,
1035         d1 / 16 * 11,
1036         d1 / 128 * 91,
1037         d1 / 4 * 3,
1038         d1 / 64 * 49,
1039         d1 / 16 * 13,
1040         d1 / 8 * 7,
1041         d1
1042     };
1043 
1044     for (size_t i = 0; i < d.size(); ++i) {
1045         if (d[i] >= dim) {
1046             return d[i];
1047         }
1048     }
1049 
1050     assert (false);
1051     return dim;
1052 }
1053 
1054 
ToneMapFattal02(Imagefloat * rgb,ImProcFunctions * ipf,const ProcParams * params,bool multiThread)1055 void ToneMapFattal02(Imagefloat *rgb, ImProcFunctions *ipf, const ProcParams *params, bool multiThread)
1056 {
1057 //    BENCHFUN
1058     const int detail_level = 3;
1059 
1060     float alpha = 1.f;
1061     if (params->fattal.threshold < 0) {
1062         alpha += (params->fattal.threshold * 0.9f) / 100.f;
1063     } else if (params->fattal.threshold > 0) {
1064         alpha += params->fattal.threshold / 100.f;
1065     }
1066 
1067     float beta = 1.f - (params->fattal.amount * 0.3f) / 100.f;
1068 
1069     // sanity check
1070     if (alpha <= 0 || beta <= 0) {
1071         return;
1072     }
1073 
1074     int w = rgb->getWidth();
1075     int h = rgb->getHeight();
1076 
1077     Array2Df Yr (w, h);
1078 
1079     constexpr float epsilon = 1e-4f;
1080     constexpr float luminance_noise_floor = 65.535f;
1081     constexpr float min_luminance = 1.f;
1082 
1083     TMatrix ws = ICCStore::getInstance()->workingSpaceMatrix (params->icm.workingProfile);
1084 
1085 #ifdef _OPENMP
1086     #pragma omp parallel for if(multiThread)
1087 #endif
1088     for (int y = 0; y < h; y++) {
1089         for (int x = 0; x < w; x++) {
1090             Yr (x, y) = std::max (luminance (rgb->r (y, x), rgb->g (y, x), rgb->b (y, x), ws), min_luminance); // clip really black pixels
1091         }
1092     }
1093 
1094     // median filter on the deep shadows, to avoid boosting noise
1095     // because w2 >= w and h2 >= h, we can use the L buffer as temporary buffer for Median_Denoise()
1096     int w2 = find_fast_dim (w) + 1;
1097     int h2 = find_fast_dim (h) + 1;
1098     Array2Df L (w2, h2);
1099     {
1100 #ifdef _OPENMP
1101         int num_threads = multiThread ? omp_get_max_threads() : 1;
1102 #else
1103         int num_threads = 1;
1104 #endif
1105         float r = float (std::max (w, h)) / float (RT_dimension_cap);
1106         denoise::Median med;
1107 
1108         if (r >= 3) {
1109             med = denoise::Median::TYPE_7X7;
1110         } else if (r >= 2) {
1111             med = denoise::Median::TYPE_5X5_STRONG;
1112         } else if (r >= 1) {
1113             med = denoise::Median::TYPE_5X5_SOFT;
1114         } else {
1115             med = denoise::Median::TYPE_3X3_STRONG;
1116         }
1117 
1118         denoise::Median_Denoise(Yr, Yr, luminance_noise_floor, w, h, med, 1, num_threads, L);
1119     }
1120 
1121     float noise = alpha * 0.01f;
1122 
1123     if (settings->verbose) {
1124         std::cout << "ToneMapFattal02: alpha = " << alpha << ", beta = " << beta
1125                   << ", detail_level = " << detail_level << std::endl;
1126     }
1127 
1128     rescale_nearest (Yr, L, multiThread);
1129     tmo_fattal02 (w2, h2, L, L, alpha, beta, noise, detail_level, multiThread);
1130 
1131     const float hr = float(h2) / float(h);
1132     const float wr = float(w2) / float(w);
1133 
1134     float scale = 65535.f;
1135     float offset = 0.f;
1136     {
1137         float ratio = 0.f;
1138         int ww, hh;
1139         if (w >= h) {
1140             ratio = 200.f / w;
1141             ww = 200;
1142             hh = ratio * h;
1143         } else {
1144             ratio = 200.f / h;
1145             hh = 200;
1146             ww = ratio * w;
1147         }
1148         Array2Df tmp(ww, hh);
1149         int sz = ww * hh;
1150         int idx = sz / 2;
1151         int oidx = LIM(int(sz * 0.05f + 0.5f), 1, sz-1);
1152         rescale_nearest(Yr, tmp, multiThread);
1153         std::sort(tmp.data(), tmp.data() + sz);
1154         float oldMedian = tmp(idx);
1155         float old_min = 0.f;
1156         for (int i = 0; i <= oidx; ++i) {
1157             old_min += tmp(i);
1158         }
1159         old_min /= oidx;
1160         rescale_nearest(L, tmp, multiThread);
1161         std::sort(tmp.data(), tmp.data() + sz);
1162         float newMedian = tmp(idx);
1163         scale = (oldMedian == 0.f || newMedian == 0.f) ? 65535.f : (oldMedian / newMedian); // avoid Nan
1164         float new_min = 0.f;
1165         for (int i = 0; i <= oidx; ++i) {
1166             new_min += tmp(i);
1167         }
1168         new_min /= oidx;
1169         offset = old_min - new_min;
1170     }
1171 
1172     const bool satcontrol = params->fattal.satcontrol;
1173 
1174 #ifdef _OPENMP
1175     #pragma omp parallel for schedule(dynamic,16) if(multiThread)
1176 #endif
1177     for (int y = 0; y < h; y++) {
1178         int yy = std::min(int(y * hr + 1), h2-1);
1179 
1180         for (int x = 0; x < w; x++) {
1181             int xx = std::min(int(x * wr + 1), w2-1);
1182 
1183             float Y = std::max(Yr(x, y), epsilon);
1184             float l = std::max(L(xx, yy), epsilon) * (scale / Y);
1185             float &r = rgb->r(y, x);
1186             float &g = rgb->g(y, x);
1187             float &b = rgb->b(y, x);
1188             float s = 1.f;
1189             if (l > 1.f) {
1190                 r = max(r * l - offset, r);
1191                 g = max(g * l - offset, g);
1192                 b = max(b * l - offset, b);
1193                 if (satcontrol) {
1194                     s = pow_F(1.f / l, 0.3f);
1195                 }
1196             } else {
1197                 r *= l;
1198                 g *= l;
1199                 b *= l;
1200                 if (satcontrol) {
1201                     s = pow_F(l, 0.3f);
1202                 }
1203             }
1204             if (satcontrol && s != 1.f) {
1205                 float ll = luminance(r, g, b, ws);
1206                 float rl = r - ll;
1207                 float gl = g - ll;
1208                 float bl = b - ll;
1209                 r = ll + s * rl;
1210                 g = ll + s * gl;
1211                 b = ll + s * bl;
1212             }
1213 
1214             assert(std::isfinite(rgb->r(y, x)));
1215             assert(std::isfinite(rgb->g(y, x)));
1216             assert(std::isfinite(rgb->b(y, x)));
1217         }
1218     }
1219 }
1220 
1221 } // namespace
1222 
1223 
dynamicRangeCompression(Imagefloat * rgb)1224 void ImProcFunctions::dynamicRangeCompression(Imagefloat *rgb)
1225 {
1226     if (params->fattal.enabled) {
1227         ToneMapFattal02(rgb, this, params, multiThread);
1228     }
1229 }
1230 
1231 
buildGradientsMask(int W,int H,float ** luminance,float ** out,float amount,int nlevels,int detail_level,float alfa,float beta,bool multithread)1232 void buildGradientsMask(int W, int H, float **luminance, float **out,
1233                         float amount, int nlevels, int detail_level,
1234                         float alfa, float beta, bool multithread)
1235 {
1236     Array2Df Y(W, H, luminance);
1237     const float noise = alfa * 0.01f;
1238 
1239     Array2Df *pyramids[nlevels];
1240     pyramids[0] = &Y;
1241     createGaussianPyramids(pyramids, nlevels, multithread);
1242 
1243     // calculate gradients and its average values on pyramid levels
1244     Array2Df *gradients[nlevels];
1245     float avgGrad[nlevels];
1246 
1247     for (int k = 0 ; k < nlevels ; k++) {
1248         gradients[k] = new Array2Df(pyramids[k]->getCols(), pyramids[k]->getRows());
1249         avgGrad[k] = calculateGradients(pyramids[k], gradients[k], k, multithread);
1250         if (k != 0) { // pyramids[0] is Y
1251             delete pyramids[k];
1252         }
1253     }
1254 
1255 
1256     // calculate fi matrix
1257     Array2Df FI(W, H, out);
1258     calculateFiMatrix(&FI, gradients, avgGrad, nlevels, detail_level, alfa, beta, noise, multithread);
1259 
1260     for (int i = 0 ; i < nlevels ; i++) {
1261         delete gradients[i];
1262     }
1263 
1264     // rescale the mask
1265     float m = out[0][0];
1266 #ifdef _OPENMP
1267 #   pragma omp parallel for reduction(max:m) if (multithread)
1268 #endif
1269     for (int y = 0; y < H; ++y) {
1270         for (int x = 0; x < W; ++x) {
1271             float v = std::abs(out[y][x]);
1272             out[y][x] = v;
1273             m = std::max(v, m);
1274         }
1275     }
1276     if (m > 0.f) {
1277         const float f = amount / m;
1278 #ifdef _OPENMP
1279 #       pragma omp parallel for reduction(max:m) if (multithread)
1280 #endif
1281         for (int y = 0; y < H; ++y) {
1282             for (int x = 0; x < W; ++x) {
1283                 out[y][x] *= f;
1284             }
1285         }
1286     }
1287 
1288     // {
1289     //     Imagefloat tmp(W, H);
1290     //     for (int y = 0; y < H; ++y) {
1291     //         for (int x = 0; x < W; ++x) {
1292     //             tmp.r(y, x) = tmp.g(y, x) = tmp.b(y, x) = out[y][x] * 65535.f;
1293     //         }
1294     //     }
1295     //     std::ostringstream name;
1296     //     name << "/tmp/FI-" << W << "x" << H << ".tif";
1297     //     tmp.saveAsTIFF(name.str(), 16);
1298     // }
1299 }
1300 
1301 
1302 } // namespace rtengine
1303