1 /*
2  *  This file is part of RawTherapee.
3  *
4  *  RawTherapee is free software: you can redistribute it and/or modify
5  *  it under the terms of the GNU General Public License as published by
6  *  the Free Software Foundation, either version 3 of the License, or
7  *  (at your option) any later version.
8  *
9  *  RawTherapee is distributed in the hope that it will be useful,
10  *  but widthITheightOUT ANY widthARRANTY; without even the implied warranty of
11  *  MERCheightANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  *  GNU General Public License for more details.
13  *
14  *  You should have received a copy of the GNU General Public License
15  *  along with RawTherapee.  If not, see <https://www.gnu.org/licenses/>.
16  *
17  *  2010 Emil Martinec <ejmartin@uchicago.edu>
18  *
19  */
20 #include <cstddef>
21 #include "rt_math.h"
22 #include "labimage.h"
23 #include "improcfun.h"
24 #include "cieimage.h"
25 #include "sleef.h"
26 #include "opthelper.h"
27 #include "gauss.h"
28 
29 using namespace std;
30 
31 namespace rtengine
32 {
33 
impulse_nr(LabImage * lab,double thresh)34 void ImProcFunctions::impulse_nr (LabImage* lab, double thresh)
35 {
36     // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
37     // impulse noise removal
38     // local variables
39 
40     int width = lab->W;
41     int height = lab->H;
42 
43     // buffer for the lowpass image
44     float * lpf[height] ALIGNED16;
45     lpf[0] = new float [width * height];
46     // buffer for the highpass image
47     char * impish[height] ALIGNED16;
48     impish[0] = new char [width * height];
49 
50     for (int i = 1; i < height; i++) {
51         lpf[i] = lpf[i - 1] + width;
52         impish[i] = impish[i - 1] + width;
53     }
54 
55 
56     //The cleaning algorithm starts here
57 
58     //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
59     // modified bilateral filter for lowpass image, omitting input pixel; or Gaussian blur
60 
61     const float eps = 1.0;
62 
63 #ifdef _OPENMP
64     #pragma omp parallel
65 #endif
66     {
67         gaussianBlur (lab->L, lpf, width, height, max(2.0, thresh - 1.0));
68     }
69 
70     //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
71 
72     float impthr = max(1.0, 5.5 - thresh);
73     float impthrDiv24 = impthr / 24.0f;         //Issue 1671: moved the Division outside the loop, impthr can be optimized out too, but I let in the code at the moment
74 
75 
76 #ifdef _OPENMP
77     #pragma omp parallel
78 #endif
79     {
80         int i1, j1, j;
81         float hpfabs, hfnbrave;
82 #ifdef __SSE2__
83         vfloat hfnbravev, hpfabsv;
84         vfloat impthrDiv24v = F2V( impthrDiv24 );
85 #endif
86 #ifdef _OPENMP
87         #pragma omp for
88 #endif
89 
90         for (int i = 0; i < height; i++) {
91             for (j = 0; j < 2; j++) {
92                 hpfabs = fabs(lab->L[i][j] - lpf[i][j]);
93 
94                 //block average of high pass data
95                 for (i1 = max(0, i - 2), hfnbrave = 0; i1 <= min(i + 2, height - 1); i1++ )
96                     for (j1 = 0; j1 <= j + 2; j1++) {
97                         hfnbrave += fabs(lab->L[i1][j1] - lpf[i1][j1]);
98                     }
99 
100                 impish[i][j] = (hpfabs > ((hfnbrave - hpfabs) * impthrDiv24));
101             }
102 
103 #ifdef __SSE2__
104 
105             for (; j < width - 5; j += 4) {
106                 hfnbravev = ZEROV;
107                 hpfabsv = vabsf(LVFU(lab->L[i][j]) - LVFU(lpf[i][j]));
108 
109                 //block average of high pass data
110                 for (i1 = max(0, i - 2); i1 <= min(i + 2, height - 1); i1++ ) {
111                     for (j1 = j - 2; j1 <= j + 2; j1++) {
112                         hfnbravev += vabsf(LVFU(lab->L[i1][j1]) - LVFU(lpf[i1][j1]));
113                     }
114                 }
115 
116                 int mask = _mm_movemask_ps((hfnbravev - hpfabsv) * impthrDiv24v - hpfabsv);
117                 impish[i][j] = (mask & 1);
118                 impish[i][j + 1] = ((mask & 2) >> 1);
119                 impish[i][j + 2] = ((mask & 4) >> 2);
120                 impish[i][j + 3] = ((mask & 8) >> 3);
121             }
122 
123 #endif
124 
125             for (; j < width - 2; j++) {
126                 hpfabs = fabs(lab->L[i][j] - lpf[i][j]);
127 
128                 //block average of high pass data
129                 for (i1 = max(0, i - 2), hfnbrave = 0; i1 <= min(i + 2, height - 1); i1++ )
130                     for (j1 = j - 2; j1 <= j + 2; j1++) {
131                         hfnbrave += fabs(lab->L[i1][j1] - lpf[i1][j1]);
132                     }
133 
134                 impish[i][j] = (hpfabs > ((hfnbrave - hpfabs) * impthrDiv24));
135             }
136 
137             for (; j < width; j++) {
138                 hpfabs = fabs(lab->L[i][j] - lpf[i][j]);
139 
140                 //block average of high pass data
141                 for (i1 = max(0, i - 2), hfnbrave = 0; i1 <= min(i + 2, height - 1); i1++ )
142                     for (j1 = j - 2; j1 < width; j1++) {
143                         hfnbrave += fabs(lab->L[i1][j1] - lpf[i1][j1]);
144                     }
145 
146                 impish[i][j] = (hpfabs > ((hfnbrave - hpfabs) * impthrDiv24));
147             }
148         }
149     }
150 
151 //now impulsive values have been identified
152 
153 // Issue 1671:
154 // often, noise isn't evenly distributed, e.g. only a few noisy pixels in the bright sky, but many in the dark foreground,
155 // so it's better to schedule dynamic and let every thread only process 16 rows, to avoid running big threads out of work
156 // Measured it and in fact gives better performance than without schedule(dynamic,16). Of course, there could be a better
157 // choice for the chunk_size than 16
158 // race conditions are avoided by the array impish
159 #ifdef _OPENMP
160     #pragma omp parallel
161 #endif
162     {
163         int i1, j1, j;
164         float wtdsum[3], dirwt, norm;
165 #ifdef _OPENMP
166         #pragma omp for schedule(dynamic,16)
167 #endif
168 
169         for (int i = 0; i < height; i++) {
170             for (j = 0; j < 2; j++) {
171                 if (!impish[i][j]) {
172                     continue;
173                 }
174 
175                 norm = 0.0;
176                 wtdsum[0] = wtdsum[1] = wtdsum[2] = 0.0;
177 
178                 for (i1 = max(0, i - 2); i1 <= min(i + 2, height - 1); i1++ )
179                     for (j1 = 0; j1 <= j + 2; j1++ ) {
180                         if (impish[i1][j1]) {
181                             continue;
182                         }
183 
184                         dirwt = 1 / (SQR(lab->L[i1][j1] - lab->L[i][j]) + eps); //use more sophisticated rangefn???
185                         wtdsum[0] += dirwt * lab->L[i1][j1];
186                         wtdsum[1] += dirwt * lab->a[i1][j1];
187                         wtdsum[2] += dirwt * lab->b[i1][j1];
188                         norm += dirwt;
189                     }
190 
191                 if (norm) {
192                     lab->L[i][j] = wtdsum[0] / norm; //low pass filter
193                     lab->a[i][j] = wtdsum[1] / norm; //low pass filter
194                     lab->b[i][j] = wtdsum[2] / norm; //low pass filter
195                 }
196             }
197 
198             for (; j < width - 2; j++) {
199                 if (!impish[i][j]) {
200                     continue;
201                 }
202 
203                 norm = 0.0;
204                 wtdsum[0] = wtdsum[1] = wtdsum[2] = 0.0;
205 
206                 for (i1 = max(0, i - 2); i1 <= min(i + 2, height - 1); i1++ )
207                     for (j1 = j - 2; j1 <= j + 2; j1++ ) {
208                         if (impish[i1][j1]) {
209                             continue;
210                         }
211 
212                         dirwt = 1 / (SQR(lab->L[i1][j1] - lab->L[i][j]) + eps); //use more sophisticated rangefn???
213                         wtdsum[0] += dirwt * lab->L[i1][j1];
214                         wtdsum[1] += dirwt * lab->a[i1][j1];
215                         wtdsum[2] += dirwt * lab->b[i1][j1];
216                         norm += dirwt;
217                     }
218 
219                 if (norm) {
220                     lab->L[i][j] = wtdsum[0] / norm; //low pass filter
221                     lab->a[i][j] = wtdsum[1] / norm; //low pass filter
222                     lab->b[i][j] = wtdsum[2] / norm; //low pass filter
223                 }
224             }
225 
226             for (; j < width; j++) {
227                 if (!impish[i][j]) {
228                     continue;
229                 }
230 
231                 norm = 0.0;
232                 wtdsum[0] = wtdsum[1] = wtdsum[2] = 0.0;
233 
234                 for (i1 = max(0, i - 2); i1 <= min(i + 2, height - 1); i1++ )
235                     for (j1 = j - 2; j1 < width; j1++ ) {
236                         if (impish[i1][j1]) {
237                             continue;
238                         }
239 
240                         dirwt = 1 / (SQR(lab->L[i1][j1] - lab->L[i][j]) + eps); //use more sophisticated rangefn???
241                         wtdsum[0] += dirwt * lab->L[i1][j1];
242                         wtdsum[1] += dirwt * lab->a[i1][j1];
243                         wtdsum[2] += dirwt * lab->b[i1][j1];
244                         norm += dirwt;
245                     }
246 
247                 if (norm) {
248                     lab->L[i][j] = wtdsum[0] / norm; //low pass filter
249                     lab->a[i][j] = wtdsum[1] / norm; //low pass filter
250                     lab->b[i][j] = wtdsum[2] / norm; //low pass filter
251                 }
252             }
253         }
254     }
255 //now impulsive values have been corrected
256 
257     delete [] lpf[0];
258     delete [] impish[0];
259 
260 }
261 
262 
impulse_nrcam(CieImage * ncie,double thresh,float ** buffers[3])263 void ImProcFunctions::impulse_nrcam (CieImage* ncie, double thresh, float **buffers[3])
264 {
265     // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
266     // impulse noise removal
267     // local variables
268 
269     int width = ncie->W;
270     int height = ncie->H;
271 
272 
273     float piid = 3.14159265f / 180.f;
274 
275     // buffer for the lowpass image
276     float ** lpf = buffers[0];
277     // buffer for the highpass image
278     float ** impish = buffers[1];
279 
280     //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
281     // modified bilateral filter for lowpass image, omitting input pixel; or Gaussian blur
282 
283 
284 
285     //The cleaning algorithm starts here
286 
287     //rangeblur<unsigned short, unsigned int> (lab->L, lpf, impish /*used as buffer here*/, width, height, thresh, false);
288 #ifdef _OPENMP
289     #pragma omp parallel
290 #endif
291     {
292         gaussianBlur (ncie->sh_p, lpf, width, height, max(2.0, thresh - 1.0));
293     }
294 
295     //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
296 
297     float impthr = max(1.0f, 5.0f - (float)thresh);
298     float impthrDiv24 = impthr / 24.0f;         //Issue 1671: moved the Division outside the loop, impthr can be optimized out too, but I let in the code at the moment
299 
300 #ifdef _OPENMP
301     #pragma omp parallel
302 #endif
303     {
304         int i1, j1, j;
305         float hpfabs, hfnbrave;
306 #ifdef __SSE2__
307         vfloat hfnbravev, hpfabsv;
308         vfloat impthrDiv24v = F2V( impthrDiv24 );
309         vfloat onev = F2V( 1.0f );
310 #endif
311 #ifdef _OPENMP
312         #pragma omp for
313 #endif
314 
315         for (int i = 0; i < height; i++) {
316             for (j = 0; j < 2; j++) {
317                 hpfabs = fabs(ncie->sh_p[i][j] - lpf[i][j]);
318 
319                 //block average of high pass data
320                 for (i1 = max(0, i - 2), hfnbrave = 0; i1 <= min(i + 2, height - 1); i1++ )
321                     for (j1 = 0; j1 <= j + 2; j1++) {
322                         hfnbrave += fabs(ncie->sh_p[i1][j1] - lpf[i1][j1]);
323                     }
324 
325                 impish[i][j] = (hpfabs > ((hfnbrave - hpfabs) * impthrDiv24));
326             }
327 
328 #ifdef __SSE2__
329 
330             for (; j < width - 5; j += 4) {
331                 hpfabsv = vabsf(LVFU(ncie->sh_p[i][j]) - LVFU(lpf[i][j]));
332                 hfnbravev = ZEROV;
333 
334                 //block average of high pass data
335                 for (i1 = max(0, i - 2); i1 <= min(i + 2, height - 1); i1++ ) {
336                     for (j1 = j - 2; j1 <= j + 2; j1++ ) {
337                         hfnbravev += vabsf(LVFU(ncie->sh_p[i1][j1]) - LVFU(lpf[i1][j1]));
338                     }
339 
340                 }
341 
342                 STVFU(impish[i][j], vselfzero(vmaskf_gt(hpfabsv, (hfnbravev - hpfabsv)*impthrDiv24v), onev));
343             }
344 
345 #endif
346 
347             for (; j < width - 2; j++) {
348                 hpfabs = fabs(ncie->sh_p[i][j] - lpf[i][j]);
349 
350                 //block average of high pass data
351                 for (i1 = max(0, i - 2), hfnbrave = 0; i1 <= min(i + 2, height - 1); i1++ )
352                     for (j1 = j - 2; j1 <= j + 2; j1++ ) {
353                         hfnbrave += fabs(ncie->sh_p[i1][j1] - lpf[i1][j1]);
354                     }
355 
356                 impish[i][j] = (hpfabs > ((hfnbrave - hpfabs) * impthrDiv24));
357             }
358 
359             for (; j < width; j++) {
360                 hpfabs = fabs(ncie->sh_p[i][j] - lpf[i][j]);
361 
362                 //block average of high pass data
363                 for (i1 = max(0, i - 2), hfnbrave = 0; i1 <= min(i + 2, height - 1); i1++ )
364                     for (j1 = j - 2; j1 < width; j1++ ) {
365                         hfnbrave += fabs(ncie->sh_p[i1][j1] - lpf[i1][j1]);
366                     }
367 
368                 impish[i][j] = (hpfabs > ((hfnbrave - hpfabs) * impthrDiv24));
369             }
370         }
371     }
372 
373 //now impulsive values have been identified
374 
375     const float eps = 1.0f;
376 
377     float** sraa = buffers[0]; // we can reuse buffers[0] because lpf is not needed anymore at this point
378     float** srbb = buffers[2];
379 
380 #ifdef _OPENMP
381     #pragma omp parallel
382 #endif
383     {
384 
385 #ifdef __SSE2__
386         vfloat2 sincosvalv;
387         vfloat piidv = F2V( piid );
388         vfloat tempv;
389 #endif
390 #ifdef _OPENMP
391         #pragma omp for
392 #endif
393 
394         for (int i = 0; i < height; i++) {
395             int j = 0;
396 #ifdef __SSE2__
397 
398             for (; j < width - 3; j += 4) {
399                 sincosvalv = xsincosf(piidv * LVFU(ncie->h_p[i][j]));
400                 tempv = LVFU(ncie->C_p[i][j]);
401                 STVFU(sraa[i][j], tempv * sincosvalv.y);
402                 STVFU(srbb[i][j], tempv * sincosvalv.x);
403             }
404 
405 #endif
406 
407             for (; j < width; j++) {
408                 float2 sincosval = xsincosf(piid * ncie->h_p[i][j]);
409                 sraa[i][j] = ncie->C_p[i][j] * sincosval.y;
410                 srbb[i][j] = ncie->C_p[i][j] * sincosval.x;
411             }
412         }
413     }
414 
415 // Issue 1671:
416 // often, noise isn't evenly distributed, e.g. only a few noisy pixels in the bright sky, but many in the dark foreground,
417 // so it's better to schedule dynamic and let every thread only process 16 rows, to avoid running big threads out of work
418 // Measured it and in fact gives better performance than without schedule(dynamic,16). Of course, there could be a better
419 // choice for the chunk_size than 16
420 // race conditions are avoided by the array impish
421 #ifdef _OPENMP
422     #pragma omp parallel
423 #endif
424     {
425         int i1, j1, j;
426         float wtdsum[3], dirwt, norm;
427 #ifdef _OPENMP
428         #pragma omp for schedule(dynamic,16)
429 #endif
430 
431         for (int i = 0; i < height; i++) {
432             for (j = 0; j < 2; j++) {
433                 if (!impish[i][j]) {
434                     continue;
435                 }
436 
437                 norm = 0.0f;
438                 wtdsum[0] = wtdsum[1] = wtdsum[2] = 0.0f;
439 
440                 for (i1 = max(0, i - 2); i1 <= min(i + 2, height - 1); i1++ )
441                     for (j1 = 0; j1 <= j + 2; j1++ ) {
442                         if (impish[i1][j1]) {
443                             continue;
444                         }
445 
446                         dirwt = 1.f / (SQR(ncie->sh_p[i1][j1] - ncie->sh_p[i][j]) + eps); //use more sophisticated rangefn???
447                         wtdsum[0] += dirwt * ncie->sh_p[i1][j1];
448                         wtdsum[1] += dirwt * sraa[i1][j1];
449                         wtdsum[2] += dirwt * srbb[i1][j1];
450                         norm += dirwt;
451                     }
452 
453                 if (norm) {
454                     ncie->sh_p[i][j] = wtdsum[0] / norm; //low pass filter
455                     sraa[i][j] = wtdsum[1] / norm; //low pass filter
456                     srbb[i][j] = wtdsum[2] / norm; //low pass filter
457                 }
458             }
459 
460             for (; j < width - 2; j++) {
461                 if (!impish[i][j]) {
462                     continue;
463                 }
464 
465                 norm = 0.0f;
466                 wtdsum[0] = wtdsum[1] = wtdsum[2] = 0.0f;
467 
468                 for (i1 = max(0, i - 2); i1 <= min(i + 2, height - 1); i1++ )
469                     for (j1 = j - 2; j1 <= j + 2; j1++ ) {
470                         if (impish[i1][j1]) {
471                             continue;
472                         }
473 
474                         dirwt = 1.f / (SQR(ncie->sh_p[i1][j1] - ncie->sh_p[i][j]) + eps); //use more sophisticated rangefn???
475                         wtdsum[0] += dirwt * ncie->sh_p[i1][j1];
476                         wtdsum[1] += dirwt * sraa[i1][j1];
477                         wtdsum[2] += dirwt * srbb[i1][j1];
478                         norm += dirwt;
479                     }
480 
481                 if (norm) {
482                     ncie->sh_p[i][j] = wtdsum[0] / norm; //low pass filter
483                     sraa[i][j] = wtdsum[1] / norm; //low pass filter
484                     srbb[i][j] = wtdsum[2] / norm; //low pass filter
485                 }
486             }
487 
488             for (; j < width; j++) {
489                 if (!impish[i][j]) {
490                     continue;
491                 }
492 
493                 norm = 0.0f;
494                 wtdsum[0] = wtdsum[1] = wtdsum[2] = 0.0f;
495 
496                 for (i1 = max(0, i - 2); i1 <= min(i + 2, height - 1); i1++ )
497                     for (j1 = j - 2; j1 < width; j1++ ) {
498                         if (impish[i1][j1]) {
499                             continue;
500                         }
501 
502                         dirwt = 1.f / (SQR(ncie->sh_p[i1][j1] - ncie->sh_p[i][j]) + eps); //use more sophisticated rangefn???
503                         wtdsum[0] += dirwt * ncie->sh_p[i1][j1];
504                         wtdsum[1] += dirwt * sraa[i1][j1];
505                         wtdsum[2] += dirwt * srbb[i1][j1];
506                         norm += dirwt;
507                     }
508 
509                 if (norm) {
510                     ncie->sh_p[i][j] = wtdsum[0] / norm; //low pass filter
511                     sraa[i][j] = wtdsum[1] / norm; //low pass filter
512                     srbb[i][j] = wtdsum[2] / norm; //low pass filter
513                 }
514             }
515         }
516     }
517 
518 //now impulsive values have been corrected
519 
520 #ifdef _OPENMP
521     #pragma omp parallel
522 #endif
523     {
524 #ifdef __SSE2__
525         vfloat interav, interbv;
526         vfloat piidv = F2V(piid);
527 #endif // __SSE2__
528 #ifdef _OPENMP
529         #pragma omp for
530 #endif
531 
532         for(int i = 0; i < height; i++ ) {
533             int j = 0;
534 #ifdef __SSE2__
535 
536             for(; j < width - 3; j += 4) {
537                 interav = LVFU(sraa[i][j]);
538                 interbv = LVFU(srbb[i][j]);
539                 STVFU(ncie->h_p[i][j], (xatan2f(interbv, interav)) / piidv);
540                 STVFU(ncie->C_p[i][j], vsqrtf(SQRV(interbv) + SQRV(interav)));
541             }
542 
543 #endif
544 
545             for(; j < width; j++) {
546                 float intera = sraa[i][j];
547                 float interb = srbb[i][j];
548                 ncie->h_p[i][j] = (xatan2f(interb, intera)) / piid;
549                 ncie->C_p[i][j] = sqrt(SQR(interb) + SQR(intera));
550             }
551         }
552     }
553 
554 }
555 
556 
557 }
558