#include <cmath>
// Standard headers for memcpy/memset, printf/fprintf, malloc/calloc/free and uintptr_t used below.
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include "rt_math.h"
#include "EdgePreservingDecomposition.h"
#ifdef _OPENMP
#include <omp.h>
#endif
#include "sleef.h"

#define DIAGONALS 5
#define DIAGONALSP1 6
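
// Note: DIAGONALS matches the five lower-triangle diagonals of the FEM matrix assembled in
// EdgePreservingDecomposition below (offsets 0, 1, w - 1, w and w + 1), and DIAGONALSP1 is the diagonal
// count of its incomplete Cholesky factor at the fill-in of 1 used by CreateBlur, which is why
// CholeskyBackSolve special-cases that value.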

/* Solves A x = b by the conjugate gradient method, where instead of feeding it the matrix A you feed it a function which
   calculates A x where x is some vector. Stops when the rms residual < RMSResidual or when the maximum number of iterates is reached.
   Stops at n iterates if MaximumIterates = 0, since that many iterates gives the exact solution. Applicable to symmetric positive
   definite problems only, which is what unconstrained smooth optimization pretty much always is.
   The Pass parameter is passed through to the callbacks, containing whatever info you like it to contain (matrix info?).
   Takes less memory with OkToModify_b = true, and Preconditioner = nullptr. */
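/* For reference, the preconditioned CG recurrences the loop below implements, with residual r, search
   direction d and preconditioned residual s = M^-1 r (s simply aliases r when Preconditioner == nullptr):
       alpha = (r . s) / (d . A d),   x += alpha * d,   r -= alpha * A d,
       beta  = (r_new . s_new) / (r_old . s_old),       d = s_new + beta * d.
   This only summarizes the code that follows; it adds no behaviour. */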
float *SparseConjugateGradient(void Ax(float *Product, float *x, void *Pass), float *b, int n, bool OkToModify_b,
                               float *x, float RMSResidual, void *Pass, int MaximumIterates, void Preconditioner(float *Product, float *x, void *Pass))
{
    int iterate;

    float* buffer = (float*)malloc(2 * n * sizeof(float) + 128);
    float *r = (buffer + 16);

    //Start r and x.
    if(x == nullptr) {
        x = new float[n];

        memset(x, 0, sizeof(float)*n);  //Zero initial guess if x == nullptr.
        memcpy(r, b, sizeof(float)*n);
    } else {
        Ax(r, x, Pass);
#ifdef _OPENMP
        #pragma omp parallel for // removed schedule(dynamic,10)
#endif

        for(int ii = 0; ii < n; ii++) {
            r[ii] = b[ii] - r[ii];  //r = b - A x.
        }
    }

    //s is the preconditioned r. Without a preconditioner it just aliases r.
    float *s = r;
    double rs = 0.0; // use double precision for large summations

    if(Preconditioner != nullptr) {
        s = new float[n];

        Preconditioner(s, r, Pass);
    }

#ifdef _OPENMP
    #pragma omp parallel for reduction(+:rs) // removed schedule(dynamic,10)
#endif

    for(int ii = 0; ii < n; ii++) {
        rs += r[ii] * s[ii];
    }

    //Search direction d.
    float *d = (buffer + n + 32);

    memcpy(d, s, sizeof(float)*n);

    //Store calculations of Ax in this.
    float *ax = b;

    if(!OkToModify_b) {
        ax = new float[n];
    }

    //Start iterating!
    if(MaximumIterates == 0) {
        MaximumIterates = n;
    }

    for(iterate = 0; iterate < MaximumIterates; iterate++) {
        //Get step size alpha, store ax while at it.
        double ab = 0.0; // use double precision for large summations
        Ax(ax, d, Pass);
#ifdef _OPENMP
        #pragma omp parallel for reduction(+:ab)
#endif

        for(int ii = 0; ii < n; ii++) {
            ab += d[ii] * ax[ii];
        }

        if(ab == 0.0) {
            break;    //So unlikely. It means perfectly converged or singular, stop either way.
        }

        ab = rs / ab;

        //Update x and r with this step size.
        double rms = 0.0; // use double precision for large summations
#ifdef _OPENMP
        #pragma omp parallel for reduction(+:rms)
#endif

        for(int ii = 0; ii < n; ii++) {
            x[ii] += ab * d[ii];
            r[ii] -= ab * ax[ii]; //"Fast recursive formula", use explicit r = b - Ax occasionally?
            rms += r[ii] * r[ii];
        }

        rms = sqrt(rms / n);

        //Quit? This probably isn't the best stopping condition, but ok.
        if(rms < RMSResidual) {
            break;
        }

        if(Preconditioner != nullptr) {
            Preconditioner(s, r, Pass);
        }

        //Get beta.
        ab = rs;
        rs = 0.0;

#ifdef _OPENMP
        #pragma omp parallel
#endif
        {
#ifdef _OPENMP
            #pragma omp for reduction(+:rs)
#endif

            for(int ii = 0; ii < n; ii++) {
                rs += r[ii] * s[ii];
            }

        }

        ab = rs / ab;

        //Update search direction d.
#ifdef _OPENMP
        #pragma omp parallel for
#endif

        for(int ii = 0; ii < n; ii++) {
            d[ii] = s[ii] + ab * d[ii];
        }
    }

    if(iterate == MaximumIterates)
        if(iterate != n && RMSResidual != 0.0f) {
            printf("Warning: MaximumIterates (%d) reached in SparseConjugateGradient.\n", MaximumIterates);
        }

    if(ax != b) {
        delete[] ax;
    }

    if(s != r) {
        delete[] s;
    }

    free(buffer);
    return x;
}
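
/* Illustrative usage sketch (not part of the original code): the matrix is supplied as a callback plus an
   opaque Pass pointer. This mirrors how EdgePreservingDecomposition::CreateBlur calls the solver further
   below; PassThroughVectorProduct and PassThroughCholeskyBackSolve are assumed to be the static trampolines
   declared in EdgePreservingDecomposition.h that forward to the matrix held in Pass.

       // MultiDiagonalSymmetricMatrix *A = ...;   // assembled and factorized elsewhere
       // float *b = ...;                          // right-hand side, length n
       // float *x = SparseConjugateGradient(MultiDiagonalSymmetricMatrix::PassThroughVectorProduct,
       //                                    b, n, false, nullptr, 0.0f, (void *)A, 50,
       //                                    MultiDiagonalSymmetricMatrix::PassThroughCholeskyBackSolve);
       // ... use x ...
       // delete[] x;                              // x was allocated by the solver because we passed nullptr
*/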

MultiDiagonalSymmetricMatrix::MultiDiagonalSymmetricMatrix(int Dimension, int NumberOfDiagonalsInLowerTriangle) : buffer(nullptr), DiagBuffer(nullptr)
{
    n = Dimension;
    m = NumberOfDiagonalsInLowerTriangle;
    IncompleteCholeskyFactorization = nullptr;

    Diagonals = new float *[m];
    StartRows = new int [m + 1];
    memset(Diagonals, 0, sizeof(float *)*m);
    memset(StartRows, 0, sizeof(int) * (m + 1));
    StartRows[m] = n + 1;
}

MultiDiagonalSymmetricMatrix::~MultiDiagonalSymmetricMatrix()
{
    if(DiagBuffer != nullptr) {
        free(buffer);
    } else
        for(int i = 0; i < m; i++) {
            delete[] Diagonals[i];
        }

    delete[] Diagonals;
    delete[] StartRows;
}

bool MultiDiagonalSymmetricMatrix::CreateDiagonal(int index, int StartRow)
{
    // Changed memory allocation for diagonals to avoid L1 conflict misses
    // Falls back to original version if big block could not be allocated
    int padding = 4096 - ((n * m * sizeof(float)) % 4096);

    if(index == 0) {
        buffer = (char*)calloc( (n + padding) * m * sizeof(float) + (m + 16) * 64 + 63, 1);

        if(buffer == nullptr)
            // no big memory block available => try to allocate smaller blocks
        {
            DiagBuffer = nullptr;
        } else {
            DiagBuffer = (float*)( ( uintptr_t(buffer) + uintptr_t(63)) / 64 * 64);
        }
    }

    if(index >= m) {
        printf("Error in MultiDiagonalSymmetricMatrix::CreateDiagonal: invalid index.\n");
        return false;
    }

    if(index > 0)
        if(StartRow <= StartRows[index - 1]) {
            printf("Error in MultiDiagonalSymmetricMatrix::CreateDiagonal: each StartRow must exceed the previous.\n");
            return false;
        }

    if(DiagBuffer != nullptr) {
        Diagonals[index] = (DiagBuffer + (index * (n + padding)) + ((index + 16) * 16));
    } else {
        Diagonals[index] = new float[DiagonalLength(StartRow)];

        if(Diagonals[index] == nullptr) {
            printf("Error in MultiDiagonalSymmetricMatrix::CreateDiagonal: memory allocation failed. Out of memory?\n");
            return false;
        }

        memset(Diagonals[index], 0, sizeof(float)*DiagonalLength(StartRow));
    }

    StartRows[index] = StartRow;
    return true;
}
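
/* Storage convention used by the methods below: Diagonals[i][col] holds the matrix entry
   A(col + StartRows[i], col), i.e. each array is one lower-triangle diagonal indexed by its column.
   Only the lower triangle is stored; the upper triangle follows from symmetry. */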

inline int MultiDiagonalSymmetricMatrix::FindIndex(int StartRow)
{
    //There's GOT to be a better way to do this. "Bidirectional map?"
    // Issue 1895 : Changed start of loop from zero to one
    // m is small (5 or 6)
    for(int i = 1; i < m; i++)
        if(StartRows[i] == StartRow) {
            return i;
        }

    return -1;
}

bool MultiDiagonalSymmetricMatrix::LazySetEntry(float value, int row, int column)
{
    //On the strict upper triangle? Swap, this is ok due to symmetry.
    int i, sr;

    if(column > row) {
        i = column;
        column = row;
        row = i;
    }

    if(row >= n) {
        return false;
    }

    sr = row - column;

    //Locate the relevant diagonal.
    i = FindIndex(sr);

    if(i < 0) {
        return false;
    }

    Diagonals[i][column] = value;
    return true;
}

void MultiDiagonalSymmetricMatrix::VectorProduct(float* RESTRICT Product, float* RESTRICT x)
{

    int srm = StartRows[m - 1];
    int lm = DiagonalLength(srm);
#ifdef _OPENMP
#ifdef __SSE2__
    const int chunkSize = (lm - srm) / (omp_get_num_procs() * 32);
#else
    const int chunkSize = (lm - srm) / (omp_get_num_procs() * 8);
#endif
    #pragma omp parallel
#endif
    {
        // First fill the big part in the middle
        // This can be done without intermediate stores to memory and it can be parallelized too
#ifdef _OPENMP
        #pragma omp for schedule(dynamic,chunkSize) nowait
#endif
#ifdef __SSE2__

        for(int j = srm; j < lm - 3; j += 4) {
            __m128 prodv = LVFU(Diagonals[0][j]) * LVFU(x[j]);

            for(int i = m - 1; i > 0; i--) {
                int s = StartRows[i];
                prodv += (LVFU(Diagonals[i][j - s]) * LVFU(x[j - s])) + (LVFU(Diagonals[i][j]) * LVFU(x[j + s]));
            }

            _mm_storeu_ps(&Product[j], prodv);
        }

#else

        for(int j = srm; j < lm; j++) {
            float prod = Diagonals[0][j] * x[j];

            for(int i = m - 1; i > 0; i--) {
                int s = StartRows[i];
                prod += (Diagonals[i][j - s] * x[j - s]) + (Diagonals[i][j] * x[j + s]);
            }

            Product[j] = prod;
        }

#endif
#ifdef _OPENMP
        #pragma omp single
#endif
        {
#ifdef __SSE2__

            for(int j = lm - ((lm - srm) % 4); j < lm; j++) {
                float prod = Diagonals[0][j] * x[j];

                for(int i = m - 1; i > 0; i--) {
                    int s = StartRows[i];
                    prod += (Diagonals[i][j - s] * x[j - s]) + (Diagonals[i][j] * x[j + s]);
                }

                Product[j] = prod;
            }

#endif

            // Fill remaining area
            // Loop over the stored diagonals.
            for(int i = 0; i < m; i++) {
                int sr = StartRows[i];
                float *a = Diagonals[i];    //One fewer dereference.
                int l = DiagonalLength(sr);

                if(sr == 0) {
                    for(int j = 0; j < srm; j++) {
                        Product[j] = a[j] * x[j];    //Separate, fairly simple treatment for the main diagonal.
                    }

                    for(int j = lm; j < l; j++) {
                        Product[j] = a[j] * x[j];    //Separate, fairly simple treatment for the main diagonal.
                    }
                } else {
                    // Split the loop in 3 parts, so now the big one in the middle can be parallelized without race conditions
                    // updates 0 to sr - 1. Because sr is small (in the range of image-width) no benefit by omp
                    for(int j = 0; j < sr; j++) {
                        Product[j] += a[j] * x[j + sr];    //Contribution from upper triangle
                    }

                    // Updates sr to l - 1. Because sr is small and l is big, this loop is parallelized
                    for(int j = sr; j < srm; j++) {
                        Product[j] += a[j - sr] * x[j - sr] + a[j] * x[j + sr];    //Contribution from lower and upper triangle
                    }

                    for(int j = lm; j < l; j++) {
                        Product[j] += a[j - sr] * x[j - sr] + a[j] * x[j + sr];    //Contribution from lower and upper triangle
                    }

                    // Updates l to l + sr - 1. Because sr is small (in the range of image-width) no benefit by omp
                    for(int j = l; j < l + sr; j++) {
                        Product[j] += a[j - sr] * x[j - sr];    //Contribution from lower triangle
                    }
                }
            }
        }
    }
#ifdef _OPENMP
    static_cast<void>(chunkSize); // to silence cppcheck warning
#endif
}

bool MultiDiagonalSymmetricMatrix::CreateIncompleteCholeskyFactorization(int MaxFillAbove)
{
    if(m == 1) {
        printf("Error in MultiDiagonalSymmetricMatrix::CreateIncompleteCholeskyFactorization: just one diagonal? Can you divide?\n");
        return false;
    }

    if(StartRows[0] != 0) {
        printf("Error in MultiDiagonalSymmetricMatrix::CreateIncompleteCholeskyFactorization: main diagonal required to exist for this math.\n");
        return false;
    }

    //How many diagonals in the decomposition?
    MaxFillAbove++; //Conceptually, now "fill" includes an existing diagonal. Simpler in the math that follows.
    int mic = 1;

    for(int ii = 1; ii < m; ii++) {
        mic += rtengine::min(StartRows[ii] - StartRows[ii - 1], MaxFillAbove);    //Guaranteed positive since StartRows must be created in increasing order.
    }

    //Initialize the decomposition - setup memory, start rows, etc.

    MultiDiagonalSymmetricMatrix *ic = new MultiDiagonalSymmetricMatrix(n, mic);

    if(!ic->CreateDiagonal(0, 0)) { //There's always a main diagonal in this type of decomposition.
        delete ic;
        return false;
    }

    mic = 1;

    for(int ii = 1; ii < m; ii++) {
        //Set j to the number of diagonals to be created corresponding to a diagonal on this source matrix...
        int j = rtengine::min(StartRows[ii] - StartRows[ii - 1], MaxFillAbove);

        //...and create those diagonals. I want to take a moment to tell you about how much I love minimalistic loops: very much.
        while(j-- != 0)
            if(!ic->CreateDiagonal(mic++, StartRows[ii] - j)) {
                //Beware of out of memory, possible for large, sparse problems if you ask for too much fill.
                printf("Error in MultiDiagonalSymmetricMatrix::CreateIncompleteCholeskyFactorization: Out of memory. Ask for less fill?\n");
                delete ic;
                return false;
            }
    }

    //It's all initialized? Uhkay. Do the actual math then.
    int sss;
    // int MaxStartRow = StartRows[m - 1]; //Handy number.
    float **l = ic->Diagonals;
    float *d = ic->Diagonals[0];    //Describes D in LDLt.
    int icm = ic->m;
    int icn = ic->n;
    int* RESTRICT icStartRows = ic->StartRows;

    //Loop over the columns.

    // create array for quicker access to ic->StartRows
    struct s_diagmap {
        int sss;
        int ss;
        int k;
    };


    // Pass one: count number of needed entries
    int entrycount = 0;

    for(int i = 1; i < icm; i++) {
        for(int j = 1; j < icm; j++) {
            if(ic->FindIndex( icStartRows[i] + icStartRows[j]) > 0) {
                entrycount ++;
            }
        }
    }

    // now we can create the array
    struct s_diagmap* RESTRICT DiagMap = new s_diagmap[entrycount];
    // we also need the maxvalues
    int entrynumber = 0;
    int index;
    int* RESTRICT MaxIndizes = new int[icm];

    for(int i = 1; i < icm; i++) {
        for(int j = 1; j < icm; j++) {
            index = ic->FindIndex( icStartRows[i] + icStartRows[j]);

            if(index > 0) {
                DiagMap[entrynumber].ss = j;
                DiagMap[entrynumber].sss = index;
                DiagMap[entrynumber].k = icStartRows[j];
                entrynumber ++;
            }
        }

        MaxIndizes[i] = entrynumber - 1;
    }

    int* RESTRICT findmap = new int[icm];

    for(int j = 0; j < icm; j++) {
        findmap[j] = FindIndex( icStartRows[j]);
    }

    for(int j = 0; j < n; j++) {
        //Calculate d for this column.
        d[j] = Diagonals[0][j];

        //This is a loop over k from 1 to j, inclusive. We'll cover that by looping over the index of the diagonals (s), and get k from it.
        //The first diagonal is d (k = 0), so skip that and have s start at 1. Cover all available s but stop if k exceeds j.
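        //In matrix terms these are the (incomplete) LDLt recurrences, restricted to the stored diagonals:
        //    D(j)   = A(j,j) - sum_{c<j} L(j,c)^2 * D(c)
        //    L(i,j) = ( A(i,j) - sum_{c<j} L(i,c) * L(j,c) * D(c) ) / D(j)   for each stored row i > j
        //where the entry L(j, j - k) is stored as l[s][j - k] with icStartRows[s] == k.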
        int s = 1;
        int k = icStartRows[s];

        while(k <= j) {
            d[j] -= l[s][j - k] * l[s][j - k] * d[j - k];
            s++;
            k = icStartRows[s];
        }

        if(UNLIKELY(d[j] == 0.0f)) {
            printf("Error in MultiDiagonalSymmetricMatrix::CreateIncompleteCholeskyFactorization: division by zero. Matrix not decomposable.\n");
            delete ic;
            delete[] DiagMap;
            delete[] MaxIndizes;
            delete[] findmap;
            return false;
        }

        float id = 1.0f / d[j];
        //Now, calculate l from top down along this column.

        int mapindex = 0;
        int jMax = icn - j;

        for(s = 1; s < icm; s++) {
            if(icStartRows[s] >= jMax) {
                break;    //Possible values of j are limited
            }

            float temp = 0.0f;

            while(mapindex <= MaxIndizes[s] && ( k = DiagMap[mapindex].k) <= j) {
                temp -= l[DiagMap[mapindex].sss][j - k] * l[DiagMap[mapindex].ss][j - k] * d[j - k];
                mapindex ++;
            }

            sss = findmap[s];
            l[s][j] = id * (sss < 0 ? temp : (Diagonals[sss][j] + temp));
        }
    }

    delete[] DiagMap;
    delete[] MaxIndizes;
    delete[] findmap;
    IncompleteCholeskyFactorization = ic;
    return true;
}

void MultiDiagonalSymmetricMatrix::KillIncompleteCholeskyFactorization()
{
    delete IncompleteCholeskyFactorization;
}

void MultiDiagonalSymmetricMatrix::CholeskyBackSolve(float* RESTRICT x, float* RESTRICT b)
{
    //We want to solve L D Lt x = b where D is a diagonal matrix described by Diagonals[0] and L is a unit lower triangular matrix described by the rest of the diagonals.
    //Let D Lt x = y. Then, first solve L y = b.
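    //Concretely, with y = D Lt x, the three passes below are (all done in place in x):
    //    forward substitution:  y[j]  = b[j] - sum_i L(j, j - s[i]) * y[j - s[i]]
    //    diagonal scaling:      y'[j] = y[j] / D(j)
    //    backward substitution: x[j]  = y'[j] - sum_i L(j + s[i], j) * x[j + s[i]]
    //The DIAGONALSP1 special case only lets the compiler unroll the inner loops; the math is identical.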
    float* RESTRICT *d = IncompleteCholeskyFactorization->Diagonals;
    int* RESTRICT s = IncompleteCholeskyFactorization->StartRows;
    int M = IncompleteCholeskyFactorization->m, N = IncompleteCholeskyFactorization->n;

    if(M != DIAGONALSP1) {    // can happen in theory
        for(int j = 0; j < N; j++) {
            float sub = b[j];    // using local var to reduce memory writes, gave a big speedup
            int i = 1;
            int c = j - s[i];

            while(c >= 0) {
                sub -= d[i][c] * x[c];
                i++;
                c = j - s[i];
            }

            x[j] = sub;    // only one memory-write per j
        }
    } else {    // that's the case almost every time
        for(int j = 0; j <= s[M - 1]; j++) {
            float sub = b[j];    // using local var to reduce memory writes, gave a big speedup
            int i = 1;
            int c = j - s[1];

            while(c >= 0) {
                sub -= d[i][c] * x[c];
                i++;
                c = j - s[i];
            }

            x[j] = sub;    // only one memory-write per j
        }

        for(int j = s[M - 1] + 1; j < N; j++) {
            float sub = b[j];    // using local var to reduce memory writes, gave a big speedup

            for(int i = DIAGONALSP1 - 1; i > 0; i--) {    // using a constant upperbound allows the compiler to unroll this loop (gives a good speedup)
                sub -= d[i][j - s[i]] * x[j - s[i]];
            }

            x[j] = sub;    // only one memory-write per j
        }
    }

    //Now, solve x from D Lt x = y -> Lt x = D^-1 y
    // Took this one out of the while, so it can be parallelized now, which speeds up, because division is expensive
#ifdef _OPENMP
    #pragma omp parallel for
#endif

    for(int j = 0; j < N; j++) {
        x[j] = x[j] / d[0][j];
    }

    if(M != DIAGONALSP1) {    // can happen in theory
        int j = N;

        while(j-- > 0) {
            float sub = x[j];    // using local var to reduce memory writes, gave a big speedup
            int i = 1;
            int c = j + s[1];

            while(c < N) {
                sub -= d[i][j] * x[c];
                i++;
                c = j + s[i];
            }

            x[j] = sub;    // only one memory-write per j
        }
    } else {    // that's the case almost every time
        for(int j = N - 1; j >= (N - 1) - s[M - 1]; j--) {
            float sub = x[j];    // using local var to reduce memory writes, gave a big speedup
            int i = 1;
            int c = j + s[1];

            while(c < N) {
                sub -= d[i][j] * x[j + s[i]];
                i++;
                c = j + s[i];
            }

            x[j] = sub;    // only one memory-write per j
        }

        for(int j = (N - 2) - s[M - 1]; j >= 0; j--) {
            float sub = x[j];    // using local var to reduce memory writes, gave a big speedup

            for(int i = DIAGONALSP1 - 1; i > 0; i--) {    // using a constant upperbound allows the compiler to unroll this loop (gives a good speedup)
                sub -= d[i][j] * x[j + s[i]];
            }

            x[j] = sub;    // only one memory-write per j
        }
    }
}

EdgePreservingDecomposition::EdgePreservingDecomposition(int width, int height) : a0(nullptr), a_1(nullptr), a_w(nullptr), a_w_1(nullptr), a_w1(nullptr)
{
    w = width;
    h = height;
    n = w * h;

    //Initialize the matrix just once at construction.
    A = new MultiDiagonalSymmetricMatrix(n, DIAGONALS);

    if(!(
                A->CreateDiagonal(0, 0) &&
                A->CreateDiagonal(1, 1) &&
                A->CreateDiagonal(2, w - 1) &&
                A->CreateDiagonal(3, w) &&
                A->CreateDiagonal(4, w + 1))) {
        delete A;
        A = nullptr;
        printf("Error in EdgePreservingDecomposition construction: out of memory.\n");
    } else {
        a0    = A->Diagonals[0];
        a_1   = A->Diagonals[1];
        a_w1  = A->Diagonals[2];
        a_w   = A->Diagonals[3];
        a_w_1 = A->Diagonals[4];
    }
}

EdgePreservingDecomposition::~EdgePreservingDecomposition()
{
    delete A;
}

float *EdgePreservingDecomposition::CreateBlur(float *Source, float Scale, float EdgeStopping, int Iterates, float *Blur, bool UseBlurForEdgeStop)
{

    if(Blur == nullptr) {
        UseBlurForEdgeStop = false;    //Use source if there's no supplied Blur.
        Blur = new float[n];
    }

    if(Scale == 0.0f) {
        memcpy(Blur, Source, n * sizeof(float));
        return Blur;
    }

    //Create the edge stopping function a, rotationally symmetric and just one instead of (ax, ay). Maybe don't need Blur yet, so use its memory.
    float* RESTRICT a;
    float* RESTRICT g;

    if(UseBlurForEdgeStop) {
        a = new float[n];
        g = Blur;
    } else {
        a = Blur;
        g = Source;
    }

    int w1 = w - 1, h1 = h - 1;
    const float sqreps = 0.0004f;    // removed eps*eps from inner loop
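
    // The loops below evaluate the edge-stopping function
    //     a(x, y) = Scale * ( 0.5 * sqrt(gx^2 + gy^2 + sqreps) ) ^ (-EdgeStopping)
    // where (gx, gy) is twice the central-difference gradient of g over a 2x2 pixel square and sqreps = eps^2.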

#ifdef _OPENMP
    #pragma omp parallel
#endif
    {
#ifdef __SSE2__
        int x;
        __m128 gxv, gyv;
        __m128 Scalev = _mm_set1_ps( Scale );
        __m128 sqrepsv = _mm_set1_ps( sqreps );
        __m128 EdgeStoppingv = _mm_set1_ps( -EdgeStopping );
        __m128 zd5v = _mm_set1_ps( 0.5f );
#endif
#ifdef _OPENMP
        #pragma omp for
#endif

        for(int y = 0; y < h1; y++) {
            float *rg = &g[w * y];
#ifdef __SSE2__

            for(x = 0; x < w1 - 3; x += 4) {
                //Estimate the central difference gradient in the center of a four pixel square. (gx, gy) is actually 2*gradient.
                gxv = (LVFU(rg[x + 1]) - LVFU(rg[x])) + (LVFU(rg[x + w + 1]) - LVFU(rg[x + w]));
                gyv = (LVFU(rg[x + w]) - LVFU(rg[x])) + (LVFU(rg[x + w + 1]) - LVFU(rg[x + 1]));
                //Apply power to the magnitude of the gradient to get the edge stopping function.
                _mm_storeu_ps( &a[x + w * y], Scalev * pow_F((zd5v * vsqrtf(gxv * gxv + gyv * gyv + sqrepsv)), EdgeStoppingv) );
            }

            for(; x < w1; x++) {
                //Estimate the central difference gradient in the center of a four pixel square. (gx, gy) is actually 2*gradient.
                float gx = (rg[x + 1] - rg[x]) + (rg[x + w + 1] - rg[x + w]);
                float gy = (rg[x + w] - rg[x]) + (rg[x + w + 1] - rg[x + 1]);
                //Apply power to the magnitude of the gradient to get the edge stopping function.
                a[x + w * y] = Scale * pow_F(0.5f * sqrtf(gx * gx + gy * gy + sqreps), -EdgeStopping);
            }

#else

            for(int x = 0; x < w1; x++) {
                //Estimate the central difference gradient in the center of a four pixel square. (gx, gy) is actually 2*gradient.
                float gx = (rg[x + 1] - rg[x]) + (rg[x + w + 1] - rg[x + w]);
                float gy = (rg[x + w] - rg[x]) + (rg[x + w + 1] - rg[x + 1]);

                //Apply power to the magnitude of the gradient to get the edge stopping function.
                a[x + w * y] = Scale * pow_F(0.5f * sqrtf(gx * gx + gy * gy + sqreps), -EdgeStopping);
            }

#endif
        }
    }


    /* Now setup the linear problem. I use the Maxima CAS, here's code for making an FEM formulation for the smoothness term:
        p(x, y) := (1 - x)*(1 - y);
        P(m, n) := A[m][n]*p(x, y) + A[m + 1][n]*p(1 - x, y) + A[m + 1][n + 1]*p(1 - x, 1 - y) + A[m][n + 1]*p(x, 1 - y);
        Integrate(f) := integrate(integrate(f, x, 0, 1), y, 0, 1);

        Integrate(diff(P(u, v), x)*diff(p(x, y), x) + diff(P(u, v), y)*diff(p(x, y), y));
        Integrate(diff(P(u - 1, v), x)*diff(p(1 - x, y), x) + diff(P(u - 1, v), y)*diff(p(1 - x, y), y));
        Integrate(diff(P(u - 1, v - 1), x)*diff(p(1 - x, 1 - y), x) + diff(P(u - 1, v - 1), y)*diff(p(1 - x, 1 - y), y));
        Integrate(diff(P(u, v - 1), x)*diff(p(x, 1 - y), x) + diff(P(u, v - 1), y)*diff(p(x, 1 - y), y));
    So yeah. Use the numeric results of that to fill the matrix A.*/
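
    /* In the assembly loop below, each element weight a[i] contributes a[i] / 6 to the couplings with its
       neighbours -- twice that on the diagonal couplings (offsets w - 1 and w + 1), once on the axis
       couplings (offsets 1 and w) -- and the main diagonal collects a0[i] = 4 * (0.25 + sum of the
       surrounding a/6 terms). This is a reading of the code that follows, not a separate derivation of the
       Maxima output above. */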

    memset(a_1, 0, A->DiagonalLength(1)*sizeof(float));
    memset(a_w1, 0, A->DiagonalLength(w - 1)*sizeof(float));
    memset(a_w, 0, A->DiagonalLength(w)*sizeof(float));
    memset(a_w_1, 0, A->DiagonalLength(w + 1)*sizeof(float));


    // checked for race condition here
    // a0[] is read and write but addressed by i only
    // a[] is read only
    // a_w_1 is write only
    // a_w is write only
    // a_w1 is write only
    // a_1 is write only
    // So, there should be no race conditions

#ifdef _OPENMP
    #pragma omp parallel for
#endif

    for(int y = 0; y < h; y++) {
        int i = y * w;

        for(int x = 0; x < w; x++, i++) {
            float ac, a0temp;
            a0temp = 0.25f;

            //Remember, only fill the lower triangle. Memory for upper is never made. It's symmetric. Trust.
            if(x > 0 && y > 0) {
                ac = a[i - w - 1] / 6.0f;
                a_w_1[i - w - 1] -= 2.0f * ac;
                a_w[i - w] -= ac;
                a_1[i - 1] -= ac;
                a0temp += ac;
            }

            if(x < w1 && y > 0) {
                ac = a[i - w] / 6.0f;
                a_w[i - w] -= ac;
                a_w1[i - w + 1] -= 2.0f * ac;
                a0temp += ac;
            }

            if(x > 0 && y < h1) {
                ac = a[i - 1] / 6.0f;
                a_1[i - 1] -= ac;
                a0temp += ac;
            }

            if(x < w1 && y < h1) {
                a0temp += a[i] / 6.0f;
            }

            a0[i] = 4.0f * a0temp;
        }
    }

    if(UseBlurForEdgeStop) {
        delete[] a;
    }

    //Solve & return.
    bool success = A->CreateIncompleteCholeskyFactorization(1); //Fill-in of 1 seems to work really well. More doesn't really help and less hurts (slightly).

    if(!success) {
        fprintf(stderr, "Error: Tonemapping has failed.\n");
        memset(Blur, 0, sizeof(float)*n);  // On failure, set the blur to zero. This is subsequently exponentiated in CompressDynamicRange.
        return Blur;
    }

    if(!UseBlurForEdgeStop) {
        memcpy(Blur, Source, n * sizeof(float));
    }

    SparseConjugateGradient(A->PassThroughVectorProduct, Source, n, false, Blur, 0.0f, (void *)A, Iterates, A->PassThroughCholeskyBackSolve);
    A->KillIncompleteCholeskyFactorization();
    return Blur;
}

float *EdgePreservingDecomposition::CreateIteratedBlur(float *Source, float Scale, float EdgeStopping, int Iterates, int Reweightings, float *Blur)
{
    //Simpler outcome?
    if(Reweightings == 0) {
        return CreateBlur(Source, Scale, EdgeStopping, Iterates, Blur);
    }

    //Create a blur here, initialize.
    if(Blur == nullptr) {
        Blur = new float[n];
    }

    memcpy(Blur, Source, n * sizeof(float));

    //Iteratively improve the blur.
    Reweightings++;

    for(int i = 0; i < Reweightings; i++) {
        CreateBlur(Source, Scale, EdgeStopping, Iterates, Blur, true);
    }

    return Blur;
}

void EdgePreservingDecomposition::CompressDynamicRange(float *Source, float Scale, float EdgeStopping, float CompressionExponent, float DetailBoost, int Iterates, int Reweightings)
{
    if(w < 300 && h < 300) { // set number of Reweightings to zero for small images (thumbnails). We could try to find a better solution here.
        Reweightings = 0;
    }

    //Small number intended to prevent division by zero. This is different from the eps in CreateBlur.
    const float eps = 0.0001f;

    //We're working with luminance, which does better logarithmic.
#ifdef __SSE2__
#ifdef _OPENMP
    #pragma omp parallel
#endif
    {
        __m128 epsv = _mm_set1_ps( eps );
#ifdef _OPENMP
        #pragma omp for
#endif

        for(int ii = 0; ii < n - 3; ii += 4) {
            _mm_storeu_ps( &Source[ii], xlogf(vmaxf(LVFU(Source[ii]), ZEROV) + epsv));
        }
    }

    for(int ii = n - (n % 4); ii < n; ii++) {
        Source[ii] = xlogf(std::max(Source[ii], 0.f) + eps);
    }

#else
#ifdef _OPENMP
    #pragma omp parallel for
#endif

    for(int ii = 0; ii < n; ii++) {
        Source[ii] = xlogf(std::max(Source[ii], 0.f) + eps);
    }

#endif

    //Blur. Also setup memory for Compressed (we can just use u since each element of u is used in one calculation).
    float *u = CreateIteratedBlur(Source, Scale, EdgeStopping, Iterates, Reweightings);

    //Apply compression, detail boost, unlogging. Compression is done on the logged data and detail boost on unlogged.
    float temp;

    if(DetailBoost > 0.f) {
        float betemp = expf(-(2.f - DetailBoost + 0.694f)) - 1.f; //0.694 = log(2)
        temp = 1.2f * xlogf( -betemp);
    } else {
        temp = CompressionExponent - 1.0f;
    }
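
    // With S = log(source + eps) (computed above) and u its edge-preserving blur, the loops below compute
    //     compressed = exp(S + temp * u) - eps
    //     result     = compressed + DetailBoost * ((exp(S) - eps) - (exp(u) - eps))
    // i.e. compression driven by the smooth base layer u plus a boosted (source minus base) detail term.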

#ifdef __SSE2__
#ifdef _OPENMP
    #pragma omp parallel
#endif
    {
        __m128 cev, uev, sourcev;
        __m128 epsv = _mm_set1_ps( eps );
        __m128 DetailBoostv = _mm_set1_ps( DetailBoost );
        __m128 tempv = _mm_set1_ps( temp );
#ifdef _OPENMP
        #pragma omp for
#endif

        for(int i = 0; i < n - 3; i += 4) {
            cev = xexpf(LVFU(Source[i]) + LVFU(u[i]) * (tempv)) - epsv;
            uev = xexpf(LVFU(u[i])) - epsv;
            sourcev = xexpf(LVFU(Source[i])) - epsv;
            _mm_storeu_ps( &Source[i], cev + DetailBoostv * (sourcev - uev) );
        }
    }

    for(int i = n - (n % 4); i < n; i++) {
        float ce = xexpf(Source[i] + u[i] * (temp)) - eps;
        float ue = xexpf(u[i]) - eps;
        Source[i] = xexpf(Source[i]) - eps;
        Source[i] = ce + DetailBoost * (Source[i] - ue);
    }

#else
#ifdef _OPENMP
    #pragma omp parallel for
#endif

    for(int i = 0; i < n; i++) {
        float ce = xexpf(Source[i] + u[i] * (temp)) - eps;
        float ue = xexpf(u[i]) - eps;
        Source[i] = xexpf(Source[i]) - eps;
        Source[i] = ce + DetailBoost * (Source[i] - ue);
    }

#endif

    delete[] u;
}