1 #include <cmath>
2 #include "rt_math.h"
3 #include "EdgePreservingDecomposition.h"
4 #ifdef _OPENMP
5 #include <omp.h>
6 #endif
7 #include "sleef.h"
8 
9 #define DIAGONALS 5
10 #define DIAGONALSP1 6
11 
12 /* Solves A x = b by the conjugate gradient method, where instead of feeding it the matrix A you feed it a function which
13 calculates A x where x is some vector. Stops when rms residual < RMSResidual or when maximum iterates is reached.
14 Stops at n iterates if MaximumIterates = 0 since that many iterates gives exact solution. Applicable to symmetric positive
15 definite problems only, which is what unconstrained smooth optimization pretty much always is.
16 Parameter pass can be passed through, containing whatever info you like it to contain (matrix info?).
17 Takes less memory with OkToModify_b = true, and Preconditioner = nullptr. */
SparseConjugateGradient(void Ax (float * Product,float * x,void * Pass),float * b,int n,bool OkToModify_b,float * x,float RMSResidual,void * Pass,int MaximumIterates,void Preconditioner (float * Product,float * x,void * Pass))18 float *SparseConjugateGradient(void Ax(float *Product, float *x, void *Pass), float *b, int n, bool OkToModify_b,
19                                float *x, float RMSResidual, void *Pass, int MaximumIterates, void Preconditioner(float *Product, float *x, void *Pass))
20 {
21     int iterate;
22 
23     float* buffer = (float*)malloc(2 * n * sizeof(float) + 128);
24     float *r = (buffer + 16);
25 
26     //Start r and x.
27     if(x == nullptr) {
28         x = new float[n];
29 
30         memset(x, 0, sizeof(float)*n);      //Zero initial guess if x == nullptr.
31         memcpy(r, b, sizeof(float)*n);
32     } else {
33         Ax(r, x, Pass);
34 #ifdef _OPENMP
35         #pragma omp parallel for           // removed schedule(dynamic,10)
36 #endif
37 
38         for(int ii = 0; ii < n; ii++) {
39             r[ii] = b[ii] - r[ii];    //r = b - A x.
40         }
41     }
42 
43     //s is preconditionment of r. Without, direct to r.
44     float *s = r;
45     double rs = 0.0; // use double precision for large summations
46 
47     if(Preconditioner != nullptr) {
48         s = new float[n];
49 
50         Preconditioner(s, r, Pass);
51     }
52 
53 #ifdef _OPENMP
54     #pragma omp parallel for reduction(+:rs)  // removed schedule(dynamic,10)
55 #endif
56 
57     for(int ii = 0; ii < n; ii++) {
58         rs += r[ii] * s[ii];
59     }
60 
61     //Search direction d.
62     float *d = (buffer + n + 32);
63 
64     memcpy(d, s, sizeof(float)*n);
65 
66     //Store calculations of Ax in this.
67     float *ax = b;
68 
69     if(!OkToModify_b) {
70         ax = new float[n];
71     }
72 
73     //Start iterating!
74     if(MaximumIterates == 0) {
75         MaximumIterates = n;
76     }
77 
78     for(iterate = 0; iterate < MaximumIterates; iterate++) {
79         //Get step size alpha, store ax while at it.
80         double ab = 0.0; // use double precision for large summations
81         Ax(ax, d, Pass);
82 #ifdef _OPENMP
83         #pragma omp parallel for reduction(+:ab)
84 #endif
85 
86         for(int ii = 0; ii < n; ii++) {
87             ab += d[ii] * ax[ii];
88         }
89 
90         if(ab == 0.0f) {
91             break;    //So unlikely. It means perfectly converged or singular, stop either way.
92         }
93 
94         ab = rs / ab;
95 
96         //Update x and r with this step size.
97         double rms = 0.0; // use double precision for large summations
98 #ifdef _OPENMP
99         #pragma omp parallel for reduction(+:rms)
100 #endif
101 
102         for(int ii = 0; ii < n; ii++) {
103             x[ii] += ab * d[ii];
104             r[ii] -= ab * ax[ii]; //"Fast recursive formula", use explicit r = b - Ax occasionally?
105             rms += r[ii] * r[ii];
106         }
107 
108         rms = sqrtf(rms / n);
109 
110         //Quit? This probably isn't the best stopping condition, but ok.
111         if(rms < RMSResidual) {
112             break;
113         }
114 
115         if(Preconditioner != nullptr) {
116             Preconditioner(s, r, Pass);
117         }
118 
119         //Get beta.
120         ab = rs;
121         rs = 0.0f;
122 
123 #ifdef _OPENMP
124         #pragma omp parallel
125 #endif
126         {
127 #ifdef _OPENMP
128             #pragma omp for reduction(+:rs)
129 #endif
130 
131             for(int ii = 0; ii < n; ii++) {
132                 rs += r[ii] * s[ii];
133             }
134 
135         }
136 
137         ab = rs / ab;
138 
139         //Update search direction p.
140 #ifdef _OPENMP
141         #pragma omp parallel for
142 #endif
143 
144         for(int ii = 0; ii < n; ii++) {
145             d[ii] = s[ii] + ab * d[ii];
146         }
147 
148 
149     }
150 
151     if(iterate == MaximumIterates)
152         if(iterate != n && RMSResidual != 0.0f) {
153             printf("Warning: MaximumIterates (%u) reached in SparseConjugateGradient.\n", MaximumIterates);
154         }
155 
156     if(ax != b) {
157         delete[] ax;
158     }
159 
160     if(s != r) {
161         delete[] s;
162     }
163 
164     free(buffer);
165     return x;
166 }
167 
MultiDiagonalSymmetricMatrix(int Dimension,int NumberOfDiagonalsInLowerTriangle)168 MultiDiagonalSymmetricMatrix::MultiDiagonalSymmetricMatrix(int Dimension, int NumberOfDiagonalsInLowerTriangle) : buffer(nullptr), DiagBuffer(nullptr)
169 {
170     n = Dimension;
171     m = NumberOfDiagonalsInLowerTriangle;
172     IncompleteCholeskyFactorization = nullptr;
173 
174     Diagonals = new float *[m];
175     StartRows = new int [m + 1];
176     memset(Diagonals, 0, sizeof(float *)*m);
177     memset(StartRows, 0, sizeof(int) * (m + 1));
178     StartRows[m] = n + 1;
179 }
180 
~MultiDiagonalSymmetricMatrix()181 MultiDiagonalSymmetricMatrix::~MultiDiagonalSymmetricMatrix()
182 {
183     if(DiagBuffer != nullptr) {
184         free(buffer);
185     } else
186         for(int i = 0; i < m; i++) {
187             delete[] Diagonals[i];
188         }
189 
190     delete[] Diagonals;
191     delete[] StartRows;
192 }
193 
CreateDiagonal(int index,int StartRow)194 bool MultiDiagonalSymmetricMatrix::CreateDiagonal(int index, int StartRow)
195 {
196     // Changed memory allocation for diagonals to avoid L1 conflict misses
197     // Falls back to original version if big block could not be allocated
198     int padding = 4096 - ((n * m * sizeof(float)) % 4096);
199 
200     if(index == 0) {
201         buffer = (char*)calloc( (n + padding) * m * sizeof(float) + (m + 16) * 64 + 63, 1);
202 
203         if(buffer == nullptr)
204             // no big memory block available => try to allocate smaller blocks
205         {
206             DiagBuffer = nullptr;
207         } else {
208             DiagBuffer = (float*)( ( uintptr_t(buffer) + uintptr_t(63)) / 64 * 64);
209         }
210     }
211 
212     if(index >= m) {
213         printf("Error in MultiDiagonalSymmetricMatrix::CreateDiagonal: invalid index.\n");
214         return false;
215     }
216 
217     if(index > 0)
218         if(StartRow <= StartRows[index - 1]) {
219             printf("Error in MultiDiagonalSymmetricMatrix::CreateDiagonal: each StartRow must exceed the previous.\n");
220             return false;
221         }
222 
223     if(DiagBuffer != nullptr) {
224         Diagonals[index] = (DiagBuffer + (index * (n + padding)) + ((index + 16) * 16));
225     } else {
226         Diagonals[index] = new float[DiagonalLength(StartRow)];
227 
228         if(Diagonals[index] == nullptr) {
229             printf("Error in MultiDiagonalSymmetricMatrix::CreateDiagonal: memory allocation failed. Out of memory?\n");
230             return false;
231         }
232 
233         memset(Diagonals[index], 0, sizeof(float)*DiagonalLength(StartRow));
234     }
235 
236     StartRows[index] = StartRow;
237     return true;
238 }
239 
FindIndex(int StartRow)240 inline int MultiDiagonalSymmetricMatrix::FindIndex(int StartRow)
241 {
242     //There's GOT to be a better way to do this. "Bidirectional map?"
243     // Issue 1895 : Changed start of loop from zero to one
244     // m is small (5 or 6)
245     for(int i = 1; i < m; i++)
246         if(StartRows[i] == StartRow) {
247             return i;
248         }
249 
250     return -1;
251 }
252 
LazySetEntry(float value,int row,int column)253 bool MultiDiagonalSymmetricMatrix::LazySetEntry(float value, int row, int column)
254 {
255     //On the strict upper triangle? Swap, this is ok due to symmetry.
256     int i, sr;
257 
258     if(column > row)
259         i = column,
260         column = row,
261         row = i;
262 
263     if(row >= n) {
264         return false;
265     }
266 
267     sr = row - column;
268 
269     //Locate the relevant diagonal.
270     i = FindIndex(sr);
271 
272     if(i < 0) {
273         return false;
274     }
275 
276     Diagonals[i][column] = value;
277     return true;
278 }
279 
VectorProduct(float * RESTRICT Product,float * RESTRICT x)280 void MultiDiagonalSymmetricMatrix::VectorProduct(float* RESTRICT Product, float* RESTRICT x)
281 {
282 
283     int srm = StartRows[m - 1];
284     int lm = DiagonalLength(srm);
285 #ifdef _OPENMP
286 #ifdef __SSE2__
287     const int chunkSize = (lm - srm) / (omp_get_num_procs() * 32);
288 #else
289     const int chunkSize = (lm - srm) / (omp_get_num_procs() * 8);
290 #endif
291     #pragma omp parallel
292 #endif
293     {
294         // First fill the big part in the middle
295         // This can be done without intermediate stores to memory and it can be parallelized too
296 #ifdef _OPENMP
297         #pragma omp for schedule(dynamic,chunkSize) nowait
298 #endif
299 #ifdef __SSE2__
300 
301         for(int j = srm; j < lm - 3; j += 4) {
302             __m128 prodv = LVFU(Diagonals[0][j]) * LVFU(x[j]);
303 
304             for(int i = m - 1; i > 0; i--) {
305                 int s = StartRows[i];
306                 prodv += (LVFU(Diagonals[i][j - s]) * LVFU(x[j - s])) + (LVFU(Diagonals[i][j]) * LVFU(x[j + s]));
307             }
308 
309             _mm_storeu_ps(&Product[j], prodv);
310         }
311 
312 #else
313 
314         for(int j = srm; j < lm; j++) {
315             float prod = Diagonals[0][j] * x[j];
316 
317             for(int i = m - 1; i > 0; i--) {
318                 int s = StartRows[i];
319                 prod += (Diagonals[i][j - s] * x[j - s]) + (Diagonals[i][j] * x[j + s]);
320             }
321 
322             Product[j] = prod;
323         }
324 
325 #endif
326 #ifdef _OPENMP
327         #pragma omp single
328 #endif
329         {
330 #ifdef __SSE2__
331 
332             for(int j = lm - ((lm - srm) % 4); j < lm; j++) {
333                 float prod = Diagonals[0][j] * x[j];
334 
335                 for(int i = m - 1; i > 0; i--) {
336                     int s = StartRows[i];
337                     prod += (Diagonals[i][j - s] * x[j - s]) + (Diagonals[i][j] * x[j + s]);
338                 }
339 
340                 Product[j] = prod;
341             }
342 
343 #endif
344 
345             // Fill remaining area
346             // Loop over the stored diagonals.
347             for(int i = 0; i < m; i++) {
348                 int sr = StartRows[i];
349                 float *a = Diagonals[i];    //One fewer dereference.
350                 int l = DiagonalLength(sr);
351 
352                 if(sr == 0) {
353                     for(int j = 0; j < srm; j++) {
354                         Product[j] = a[j] * x[j];    //Separate, fairly simple treatment for the main diagonal.
355                     }
356 
357                     for(int j = lm; j < l; j++) {
358                         Product[j] = a[j] * x[j];    //Separate, fairly simple treatment for the main diagonal.
359                     }
360                 } else {
361 // Split the loop in 3 parts, so now the big one in the middle can be parallelized without race conditions
362                     // updates 0 to sr - 1. Because sr is small (in the range of image-width) no benefit by omp
363                     for(int j = 0; j < sr; j++) {
364                         Product[j] += a[j] * x[j + sr];     //Contribution from upper triangle
365                     }
366 
367                     // Updates sr to l - 1. Because sr is small and l is big, this loop is parallelized
368                     for(int j = sr; j < srm; j++) {
369                         Product[j] += a[j - sr] * x[j - sr] + a[j] * x[j + sr]; //Contribution from lower and upper triangle
370                     }
371 
372                     for(int j = lm; j < l; j++) {
373                         Product[j] += a[j - sr] * x[j - sr] + a[j] * x[j + sr]; //Contribution from lower and upper triangle
374                     }
375 
376                     // Updates l to l + sr - 1. Because sr is small (in the range of image-width) no benefit by omp
377                     for(int j = l; j < l + sr; j++) {
378                         Product[j] += a[j - sr] * x[j - sr]; //Contribution from lower triangle
379                     }
380                 }
381             }
382         }
383     }
384 #ifdef _OPENMP
385     static_cast<void>(chunkSize); // to silence cppcheck warning
386 #endif
387 }
388 
CreateIncompleteCholeskyFactorization(int MaxFillAbove)389 bool MultiDiagonalSymmetricMatrix::CreateIncompleteCholeskyFactorization(int MaxFillAbove)
390 {
391     if(m == 1) {
392         printf("Error in MultiDiagonalSymmetricMatrix::CreateIncompleteCholeskyFactorization: just one diagonal? Can you divide?\n");
393         return false;
394     }
395 
396     if(StartRows[0] != 0) {
397         printf("Error in MultiDiagonalSymmetricMatrix::CreateIncompleteCholeskyFactorization: main diagonal required to exist for this math.\n");
398         return false;
399     }
400 
401     //How many diagonals in the decomposition?
402     MaxFillAbove++; //Conceptually, now "fill" includes an existing diagonal. Simpler in the math that follows.
403     int mic = 1;
404 
405     for(int ii = 1; ii < m; ii++) {
406         mic += rtengine::min(StartRows[ii] - StartRows[ii - 1], MaxFillAbove);    //Guaranteed positive since StartRows must be created in increasing order.
407     }
408 
409     //Initialize the decomposition - setup memory, start rows, etc.
410 
411     MultiDiagonalSymmetricMatrix *ic = new MultiDiagonalSymmetricMatrix(n, mic);
412     if(!ic->CreateDiagonal(0, 0)) { //There's always a main diagonal in this type of decomposition.
413         delete ic;
414         return false;
415     }
416     mic = 1;
417 
418     for(int ii = 1; ii < m; ii++) {
419         //Set j to the number of diagonals to be created corresponding to a diagonal on this source matrix...
420         int j = rtengine::min(StartRows[ii] - StartRows[ii - 1], MaxFillAbove);
421 
422         //...and create those diagonals. I want to take a moment to tell you about how much I love minimalistic loops: very much.
423         while(j-- != 0)
424             if(!ic->CreateDiagonal(mic++, StartRows[ii] - j)) {
425                 //Beware of out of memory, possible for large, sparse problems if you ask for too much fill.
426                 printf("Error in MultiDiagonalSymmetricMatrix::CreateIncompleteCholeskyFactorization: Out of memory. Ask for less fill?\n");
427                 delete ic;
428                 return false;
429             }
430     }
431 
432     //It's all initialized? Uhkay. Do the actual math then.
433     int sss;
434    // int MaxStartRow = StartRows[m - 1];  //Handy number.
435     float **l = ic->Diagonals;
436     float  *d = ic->Diagonals[0];       //Describes D in LDLt.
437     int icm = ic->m;
438     int icn = ic->n;
439     int* RESTRICT icStartRows = ic->StartRows;
440 
441     //Loop over the columns.
442 
443     // create array for quicker access to ic->StartRows
444     struct s_diagmap {
445         int sss;
446         int ss;
447         int k;
448     };
449 
450 
451     // Pass one: count number of needed entries
452     int entrycount = 0;
453 
454     for(int i = 1; i < icm; i++) {
455         for(int j = 1; j < icm; j++) {
456             if(ic->FindIndex( icStartRows[i] + icStartRows[j]) > 0) {
457                 entrycount ++;
458             }
459         }
460     }
461 
462     // now we can create the array
463     struct s_diagmap* RESTRICT DiagMap = new s_diagmap[entrycount];
464     // we also need the maxvalues
465     int entrynumber = 0;
466     int index;
467     int* RESTRICT MaxIndizes = new int[icm];
468 
469     for(int i = 1; i < icm; i++) {
470         for(int j = 1; j < icm; j++) {
471             index = ic->FindIndex( icStartRows[i] + icStartRows[j]);
472 
473             if(index > 0) {
474                 DiagMap[entrynumber].ss = j;
475                 DiagMap[entrynumber].sss = index;
476                 DiagMap[entrynumber].k = icStartRows[j];
477                 entrynumber ++;
478             }
479         }
480 
481         MaxIndizes[i] = entrynumber - 1;
482     }
483 
484     int* RESTRICT findmap = new int[icm];
485 
486     for(int j = 0; j < icm; j++) {
487         findmap[j] = FindIndex( icStartRows[j]);
488     }
489 
490     for(int j = 0; j < n; j++) {
491         //Calculate d for this column.
492         d[j] = Diagonals[0][j];
493 
494         //This is a loop over k from 1 to j, inclusive. We'll cover that by looping over the index of the diagonals (s), and get k from it.
495         //The first diagonal is d (k = 0), so skip that and have s start at 1. Cover all available s but stop if k exceeds j.
496         int s = 1;
497         int k = icStartRows[s];
498 
499         while(k <= j) {
500             d[j] -= l[s][j - k] * l[s][j - k] * d[j - k];
501             s++;
502             k = icStartRows[s];
503         }
504 
505         if(UNLIKELY(d[j] == 0.0f)) {
506             printf("Error in MultiDiagonalSymmetricMatrix::CreateIncompleteCholeskyFactorization: division by zero. Matrix not decomposable.\n");
507             delete ic;
508             delete[] DiagMap;
509             delete[] MaxIndizes;
510             delete[] findmap;
511             return false;
512         }
513 
514         float id = 1.0f / d[j];
515         //Now, calculate l from top down along this column.
516 
517         int mapindex = 0;
518         int jMax = icn - j;
519 
520         for(s = 1; s < icm; s++) {
521             if(icStartRows[s] >= jMax) {
522                 break;    //Possible values of j are limited
523             }
524 
525             float temp = 0.0f;
526 
527             while(mapindex <= MaxIndizes[s] && ( k = DiagMap[mapindex].k) <= j) {
528                 temp -= l[DiagMap[mapindex].sss][j - k] * l[DiagMap[mapindex].ss][j - k] * d[j - k];
529                 mapindex ++;
530             }
531 
532             sss = findmap[s];
533             l[s][j] = id * (sss < 0 ? temp : (Diagonals[sss][j] + temp));
534         }
535     }
536 
537     delete[] DiagMap;
538     delete[] MaxIndizes;
539     delete[] findmap;
540     IncompleteCholeskyFactorization = ic;
541     return true;
542 }
543 
KillIncompleteCholeskyFactorization()544 void MultiDiagonalSymmetricMatrix::KillIncompleteCholeskyFactorization()
545 {
546     delete IncompleteCholeskyFactorization;
547 }
548 
CholeskyBackSolve(float * RESTRICT x,float * RESTRICT b)549 void MultiDiagonalSymmetricMatrix::CholeskyBackSolve(float* RESTRICT x, float* RESTRICT b)
550 {
551     //We want to solve L D Lt x = b where D is a diagonal matrix described by Diagonals[0] and L is a unit lower triagular matrix described by the rest of the diagonals.
552     //Let D Lt x = y. Then, first solve L y = b.
553     float* RESTRICT  *d = IncompleteCholeskyFactorization->Diagonals;
554     int* RESTRICT s = IncompleteCholeskyFactorization->StartRows;
555     int M = IncompleteCholeskyFactorization->m, N = IncompleteCholeskyFactorization->n;
556 
557     if(M != DIAGONALSP1) {                  // can happen in theory
558         for(int j = 0; j < N; j++) {
559             float sub = b[j];                   // using local var to reduce memory writes, gave a big speedup
560             int i = 1;
561             int c = j - s[i];
562 
563             while(c >= 0) {
564                 sub -= d[i][c] * x[c];
565                 i++;
566                 c = j - s[i];
567             }
568 
569             x[j] = sub;                         // only one memory-write per j
570         }
571     } else {                               // that's the case almost every time
572         for(int j = 0; j <= s[M - 1]; j++) {
573             float sub = b[j];                   // using local var to reduce memory writes, gave a big speedup
574             int i = 1;
575             int c = j - s[1];
576 
577             while(c >= 0) {
578                 sub -= d[i][c] * x[c];
579                 i++;
580                 c = j - s[i];
581             }
582 
583             x[j] = sub;                         // only one memory-write per j
584         }
585 
586         for(int j = s[M - 1] + 1; j < N; j++) {
587             float sub = b[j];                   // using local var to reduce memory writes, gave a big speedup
588 
589             for(int i = DIAGONALSP1 - 1; i > 0; i--) { // using a constant upperbound allows the compiler to unroll this loop (gives a good speedup)
590                 sub -= d[i][j - s[i]] * x[j - s[i]];
591             }
592 
593             x[j] = sub;                         // only one memory-write per j
594         }
595     }
596 
597     //Now, solve x from D Lt x = y -> Lt x = D^-1 y
598 // Took this one out of the while, so it can be parallelized now, which speeds up, because division is expensive
599 #ifdef _OPENMP
600     #pragma omp parallel for
601 #endif
602 
603     for(int j = 0; j < N; j++) {
604         x[j] = x[j] / d[0][j];
605     }
606 
607     if(M != DIAGONALSP1) {                  // can happen in theory
608         int j = N;
609         while(j-- > 0) {
610             float sub = x[j];                   // using local var to reduce memory writes, gave a big speedup
611             int i = 1;
612             int c = j + s[1];
613 
614             while(c < N) {
615                 sub -= d[i][j] * x[c];
616                 i++;
617                 c = j + s[i];
618             }
619 
620             x[j] = sub;                         // only one memory-write per j
621         }
622     } else {                                // that's the case almost every time
623         for(int j = N - 1; j >= (N - 1) - s[M - 1]; j--) {
624             float sub = x[j];                   // using local var to reduce memory writes, gave a big speedup
625             int i = 1;
626             int c = j + s[1];
627 
628             while(c < N) {
629                 sub -= d[i][j] * x[j + s[i]];
630                 i++;
631                 c = j + s[i];
632             }
633 
634             x[j] = sub;                         // only one memory-write per j
635         }
636 
637         for(int j = (N - 2) - s[M - 1]; j >= 0; j--) {
638             float sub = x[j];                   // using local var to reduce memory writes, gave a big speedup
639 
640             for(int i = DIAGONALSP1 - 1; i > 0; i--) { // using a constant upperbound allows the compiler to unroll this loop (gives a good speedup)
641                 sub -= d[i][j] * x[j + s[i]];
642             }
643 
644             x[j] = sub;                         // only one memory-write per j
645         }
646     }
647 }
648 
EdgePreservingDecomposition(int width,int height)649 EdgePreservingDecomposition::EdgePreservingDecomposition(int width, int height) : a0(nullptr) , a_1(nullptr), a_w(nullptr), a_w_1(nullptr), a_w1(nullptr)
650 {
651     w = width;
652     h = height;
653     n = w * h;
654 
655     //Initialize the matrix just once at construction.
656     A = new MultiDiagonalSymmetricMatrix(n, DIAGONALS);
657 
658     if(!(
659                 A->CreateDiagonal(0, 0) &&
660                 A->CreateDiagonal(1, 1) &&
661                 A->CreateDiagonal(2, w - 1) &&
662                 A->CreateDiagonal(3, w) &&
663                 A->CreateDiagonal(4, w + 1))) {
664         delete A;
665         A = nullptr;
666         printf("Error in EdgePreservingDecomposition construction: out of memory.\n");
667     } else {
668         a0    = A->Diagonals[0];
669         a_1   = A->Diagonals[1];
670         a_w1  = A->Diagonals[2];
671         a_w   = A->Diagonals[3];
672         a_w_1 = A->Diagonals[4];
673     }
674 }
675 
~EdgePreservingDecomposition()676 EdgePreservingDecomposition::~EdgePreservingDecomposition()
677 {
678     delete A;
679 }
680 
CreateBlur(float * Source,float Scale,float EdgeStopping,int Iterates,float * Blur,bool UseBlurForEdgeStop)681 float *EdgePreservingDecomposition::CreateBlur(float *Source, float Scale, float EdgeStopping, int Iterates, float *Blur, bool UseBlurForEdgeStop)
682 {
683 
684     if(Blur == nullptr)
685         UseBlurForEdgeStop = false, //Use source if there's no supplied Blur.
686         Blur = new float[n];
687 
688     if(Scale == 0.0f) {
689         memcpy(Blur, Source, n * sizeof(float));
690         return Blur;
691     }
692 
693     //Create the edge stopping function a, rotationally symmetric and just one instead of (ax, ay). Maybe don't need Blur yet, so use its memory.
694     float* RESTRICT a;
695     float* RESTRICT g;
696 
697     if(UseBlurForEdgeStop) {
698         a = new float[n], g = Blur;
699     } else {
700         a = Blur, g = Source;
701     }
702 
703     int w1 = w - 1, h1 = h - 1;
704     const float sqreps = 0.0004f;                           // removed eps*eps from inner loop
705 
706 
707 #ifdef _OPENMP
708     #pragma omp parallel
709 #endif
710     {
711 #ifdef __SSE2__
712         int x;
713         __m128 gxv, gyv;
714         __m128 Scalev = _mm_set1_ps( Scale );
715         __m128 sqrepsv = _mm_set1_ps( sqreps );
716         __m128 EdgeStoppingv = _mm_set1_ps( -EdgeStopping );
717         __m128 zd5v = _mm_set1_ps( 0.5f );
718 #endif
719 #ifdef _OPENMP
720         #pragma omp for
721 #endif
722 
723         for(int y = 0; y < h1; y++) {
724             float *rg = &g[w * y];
725 #ifdef __SSE2__
726 
727             for(x = 0; x < w1 - 3; x += 4) {
728                 //Estimate the central difference gradient in the center of a four pixel square. (gx, gy) is actually 2*gradient.
729                 gxv = (LVFU(rg[x + 1]) -  LVFU(rg[x])) + (LVFU(rg[x + w + 1]) - LVFU(rg[x + w]));
730                 gyv = (LVFU(rg[x + w]) -  LVFU(rg[x])) + (LVFU(rg[x + w + 1]) - LVFU(rg[x + 1]));
731                 //Apply power to the magnitude of the gradient to get the edge stopping function.
732                 _mm_storeu_ps( &a[x + w * y], Scalev * pow_F((zd5v * vsqrtf(gxv * gxv + gyv * gyv + sqrepsv)), EdgeStoppingv) );
733             }
734 
735             for(; x < w1; x++) {
736                 //Estimate the central difference gradient in the center of a four pixel square. (gx, gy) is actually 2*gradient.
737                 float gx = (rg[x + 1] - rg[x]) + (rg[x + w + 1] - rg[x + w]);
738                 float gy = (rg[x + w] - rg[x]) + (rg[x + w + 1] - rg[x + 1]);
739                 //Apply power to the magnitude of the gradient to get the edge stopping function.
740                 a[x + w * y] = Scale * pow_F(0.5f * sqrtf(gx * gx + gy * gy + sqreps), -EdgeStopping);
741             }
742 
743 #else
744 
745             for(int x = 0; x < w1; x++) {
746                 //Estimate the central difference gradient in the center of a four pixel square. (gx, gy) is actually 2*gradient.
747                 float gx = (rg[x + 1] - rg[x]) + (rg[x + w + 1] - rg[x + w]);
748                 float gy = (rg[x + w] - rg[x]) + (rg[x + w + 1] - rg[x + 1]);
749 
750                 //Apply power to the magnitude of the gradient to get the edge stopping function.
751                 a[x + w * y] = Scale * pow_F(0.5f * sqrtf(gx * gx + gy * gy + sqreps), -EdgeStopping);
752             }
753 
754 #endif
755         }
756     }
757 
758 
759     /* Now setup the linear problem. I use the Maxima CAS, here's code for making an FEM formulation for the smoothness term:
760         p(x, y) := (1 - x)*(1 - y);
761         P(m, n) := A[m][n]*p(x, y) + A[m + 1][n]*p(1 - x, y) + A[m + 1][n + 1]*p(1 - x, 1 - y) + A[m][n + 1]*p(x, 1 - y);
762         Integrate(f) := integrate(integrate(f, x, 0, 1), y, 0, 1);
763 
764         Integrate(diff(P(u, v), x)*diff(p(x, y), x) + diff(P(u, v), y)*diff(p(x, y), y));
765         Integrate(diff(P(u - 1, v), x)*diff(p(1 - x, y), x) + diff(P(u - 1, v), y)*diff(p(1 - x, y), y));
766         Integrate(diff(P(u - 1, v - 1), x)*diff(p(1 - x, 1 - y), x) + diff(P(u - 1, v - 1), y)*diff(p(1 - x, 1 - y), y));
767         Integrate(diff(P(u, v - 1), x)*diff(p(x, 1 - y), x) + diff(P(u, v - 1), y)*diff(p(x, 1 - y), y));
768     So yeah. Use the numeric results of that to fill the matrix A.*/
769 
770     memset(a_1, 0, A->DiagonalLength(1)*sizeof(float));
771     memset(a_w1, 0, A->DiagonalLength(w - 1)*sizeof(float));
772     memset(a_w, 0, A->DiagonalLength(w)*sizeof(float));
773     memset(a_w_1, 0, A->DiagonalLength(w + 1)*sizeof(float));
774 
775 
776 // checked for race condition here
777 // a0[] is read and write but addressed by i only
778 // a[] is read only
779 // a_w_1 is write only
780 // a_w is write only
781 // a_w1 is write only
782 // a_1 is write only
783 // So, there should be no race conditions
784 
785 #ifdef _OPENMP
786     #pragma omp parallel for
787 #endif
788 
789     for(int y = 0; y < h; y++) {
790         int i = y * w;
791 
792         for(int x = 0; x < w; x++, i++) {
793             float ac, a0temp;
794             a0temp = 0.25f;
795 
796             //Remember, only fill the lower triangle. Memory for upper is never made. It's symmetric. Trust.
797             if(x > 0 && y > 0) {
798                 ac = a[i - w - 1] / 6.0f;
799                 a_w_1[i - w - 1] -= 2.0f * ac;
800                 a_w[i - w] -= ac;
801                 a_1[i - 1] -= ac;
802                 a0temp += ac;
803             }
804 
805             if(x < w1 && y > 0) {
806                 ac = a[i - w] / 6.0f;
807                 a_w[i - w] -= ac;
808                 a_w1[i - w + 1] -= 2.0f * ac;
809                 a0temp += ac;
810             }
811 
812             if(x > 0 && y < h1) {
813                 ac = a[i - 1] / 6.0f;
814                 a_1[i - 1] -= ac;
815                 a0temp += ac;
816             }
817 
818             if(x < w1 && y < h1) {
819                 a0temp += a[i] / 6.0f;
820             }
821 
822             a0[i] = 4.0f * a0temp;
823         }
824     }
825 
826     if(UseBlurForEdgeStop) {
827         delete[] a;
828     }
829 
830     //Solve & return.
831     bool success = A->CreateIncompleteCholeskyFactorization(1); //Fill-in of 1 seems to work really good. More doesn't really help and less hurts (slightly).
832 
833     if(!success) {
834         fprintf(stderr, "Error: Tonemapping has failed.\n");
835         memset(Blur, 0, sizeof(float)*n);  // On failure, set the blur to zero.  This is subsequently exponentiated in CompressDynamicRange.
836         return Blur;
837     }
838 
839     if(!UseBlurForEdgeStop) {
840         memcpy(Blur, Source, n * sizeof(float));
841     }
842 
843     SparseConjugateGradient(A->PassThroughVectorProduct, Source, n, false, Blur, 0.0f, (void *)A, Iterates, A->PassThroughCholeskyBackSolve);
844     A->KillIncompleteCholeskyFactorization();
845     return Blur;
846 }
847 
CreateIteratedBlur(float * Source,float Scale,float EdgeStopping,int Iterates,int Reweightings,float * Blur)848 float *EdgePreservingDecomposition::CreateIteratedBlur(float *Source, float Scale, float EdgeStopping, int Iterates, int Reweightings, float *Blur)
849 {
850     //Simpler outcome?
851     if(Reweightings == 0) {
852         return CreateBlur(Source, Scale, EdgeStopping, Iterates, Blur);
853     }
854 
855     //Create a blur here, initialize.
856     if(Blur == nullptr) {
857         Blur = new float[n];
858     }
859 
860     memcpy(Blur, Source, n * sizeof(float));
861 
862     //Iteratively improve the blur.
863     Reweightings++;
864 
865     for(int i = 0; i < Reweightings; i++) {
866         CreateBlur(Source, Scale, EdgeStopping, Iterates, Blur, true);
867     }
868 
869     return Blur;
870 }
871 
CompressDynamicRange(float * Source,float Scale,float EdgeStopping,float CompressionExponent,float DetailBoost,int Iterates,int Reweightings)872 void EdgePreservingDecomposition::CompressDynamicRange(float *Source, float Scale, float EdgeStopping, float CompressionExponent, float DetailBoost, int Iterates, int Reweightings)
873 {
874     if(w < 300 && h < 300) { // set number of Reweightings to zero for small images (thumbnails). We could try to find a better solution here.
875         Reweightings = 0;
876     }
877 
878     //Small number intended to prevent division by zero. This is different from the eps in CreateBlur.
879     const float eps = 0.0001f;
880 
881     //We're working with luminance, which does better logarithmic.
882 #ifdef __SSE2__
883 #ifdef _OPENMP
884     #pragma omp parallel
885 #endif
886     {
887         __m128 epsv = _mm_set1_ps( eps );
888 #ifdef _OPENMP
889         #pragma omp for
890 #endif
891 
892         for(int ii = 0; ii < n - 3; ii += 4) {
893             _mm_storeu_ps( &Source[ii], xlogf(vmaxf(LVFU(Source[ii]), ZEROV) + epsv));
894         }
895     }
896 
897     for(int ii = n - (n % 4); ii < n; ii++) {
898         Source[ii] = xlogf(std::max(Source[ii], 0.f) + eps);
899     }
900 
901 #else
902 #ifdef _OPENMP
903     #pragma omp parallel for
904 #endif
905 
906     for(int ii = 0; ii < n; ii++) {
907         Source[ii] = xlogf(std::max(Source[ii], 0.f) + eps);
908     }
909 
910 #endif
911 
912     //Blur. Also setup memory for Compressed (we can just use u since each element of u is used in one calculation).
913     float *u = CreateIteratedBlur(Source, Scale, EdgeStopping, Iterates, Reweightings);
914 
915     //Apply compression, detail boost, unlogging. Compression is done on the logged data and detail boost on unlogged.
916     float temp;
917 
918     if(DetailBoost > 0.f) {
919         float betemp = expf(-(2.f - DetailBoost + 0.694f)) - 1.f; //0.694 = log(2)
920         temp = 1.2f * xlogf( -betemp);
921     } else {
922         temp = CompressionExponent - 1.0f;
923     }
924 
925 #ifdef __SSE2__
926 #ifdef _OPENMP
927     #pragma omp parallel
928 #endif
929     {
930         __m128 cev, uev, sourcev;
931         __m128 epsv = _mm_set1_ps( eps );
932         __m128 DetailBoostv = _mm_set1_ps( DetailBoost );
933         __m128 tempv = _mm_set1_ps( temp );
934 #ifdef _OPENMP
935         #pragma omp for
936 #endif
937 
938         for(int i = 0; i < n - 3; i += 4) {
939             cev = xexpf(LVFU(Source[i]) + LVFU(u[i]) * (tempv)) - epsv;
940             uev = xexpf(LVFU(u[i])) - epsv;
941             sourcev = xexpf(LVFU(Source[i])) - epsv;
942             _mm_storeu_ps( &Source[i], cev + DetailBoostv * (sourcev - uev) );
943         }
944     }
945 
946     for(int i = n - (n % 4); i < n; i++) {
947         float ce = xexpf(Source[i] + u[i] * (temp)) - eps;
948         float ue = xexpf(u[i]) - eps;
949         Source[i] = xexpf(Source[i]) - eps;
950         Source[i] = ce + DetailBoost * (Source[i] - ue);
951     }
952 
953 #else
954 #ifdef _OPENMP
955     #pragma omp parallel for
956 #endif
957 
958     for(int i = 0; i < n; i++) {
959         float ce = xexpf(Source[i] + u[i] * (temp)) - eps;
960         float ue = xexpf(u[i]) - eps;
961         Source[i] = xexpf(Source[i]) - eps;
962         Source[i] = ce + DetailBoost * (Source[i] - ue);
963     }
964 
965 #endif
966 
967     delete[] u;
968 }
969 
970