1 /*
2  */
3 
4 /*
5 
6     Copyright (C) 2014 Ferrero Andrea
7 
8     This program is free software: you can redistribute it and/or modify
9     it under the terms of the GNU General Public License as published by
10     the Free Software Foundation, either version 3 of the License, or
11     (at your option) any later version.
12 
13     This program is distributed in the hope that it will be useful,
14     but WITHOUT ANY WARRANTY; without even the implied warranty of
15     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16     GNU General Public License for more details.
17 
18     You should have received a copy of the GNU General Public License
19     along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21 
22  */
23 
24 /*
25 
26     These files are distributed with PhotoFlow - http://aferrero2707.github.io/PhotoFlow/
27 
28  */
29 
30 #ifndef PF_NLMEANS_H
31 #define PF_NLMEANS_H
32 
33 #include <string.h>
34 
35 #include "../base/processor.hh"
36 
37 namespace PF
38 {
39 
40 class NonLocalMeansPar: public OpParBase
41 {
42   Property<float> radius, strength, luma_frac, chroma_frac;
43 
44   int P, K;
45   float sharpness;
46 
47   ProcessorBase* convert2lab;
48   ProcessorBase* convert2input;
49   ProcessorBase* nlmeans_algo;
50 
51   PF::ICCProfile* in_profile;
52 
53 public:
54   NonLocalMeansPar();
55 
has_intensity()56   bool has_intensity() { return false; }
needs_caching()57   bool needs_caching() { return false; }
58 
set_radius(float r)59   void set_radius( float r ) { radius.set(r); }
set_strength(float s)60   void set_strength( float s ) { strength.set(s); }
set_luma_frac(float f)61   void set_luma_frac( float f ) { luma_frac.set(f); }
set_chroma_frac(float f)62   void set_chroma_frac( float f ) { chroma_frac.set(f); }
63 
64 
get_P()65   int get_P() {return P; }
get_K()66   int get_K() { return K; }
get_sharpness()67   float get_sharpness() { return sharpness; }
get_luma_frac()68   float get_luma_frac() { return luma_frac.get(); }
get_chroma_frac()69   float get_chroma_frac() { return chroma_frac.get(); }
70 
get_padding()71   int get_padding() { return P+K; }
72 
73   VipsImage* build(std::vector<VipsImage*>& in, int first,
74       VipsImage* imap, VipsImage* omap,
75       unsigned int& level);
76 };
77 
78 
79 
80 class NonLocalMeans_DTAlgo_Par: public OpParBase
81 {
82   int P, K;
83   float sharpness, luma_frac, chroma_frac;
84 
85 public:
NonLocalMeans_DTAlgo_Par()86   NonLocalMeans_DTAlgo_Par(): OpParBase() {set_type("nlmeans_DT");}
87 
has_intensity()88   bool has_intensity() { return false; }
needs_caching()89   bool needs_caching() { return false; }
90 
get_P()91   int get_P() {return P; }
set_P(int val)92   void set_P(int val) { P = val; }
get_K()93   int get_K() { return K; }
set_K(int val)94   void set_K(int val) { K = val; }
get_sharpness()95   float get_sharpness() { return sharpness; }
set_sharpness(float val)96   void set_sharpness(float val) { sharpness = val; }
get_luma_frac()97   float get_luma_frac() { return luma_frac; }
set_luma_frac(float val)98   void set_luma_frac(float val) { luma_frac = val; }
get_chroma_frac()99   float get_chroma_frac() { return chroma_frac; }
set_chroma_frac(float val)100   void set_chroma_frac(float val) { chroma_frac = val; }
101 
get_padding()102   int get_padding() { return P+K; }
103 
104   /* Function to derive the output area from the input area
105    */
transform(const VipsRect * rin,VipsRect * rout,int)106   virtual void transform(const VipsRect* rin, VipsRect* rout, int /*id*/)
107   {
108     int pad = get_padding();
109     rout->left = rin->left+pad;
110     rout->top = rin->top+pad;
111     rout->width = rin->width-pad*2;
112     rout->height = rin->height-pad*2;
113   }
114 
115   /* Function to derive the area to be read from input images,
116      based on the requested output area
117    */
transform_inv(const VipsRect * rout,VipsRect * rin,int)118   virtual void transform_inv(const VipsRect* rout, VipsRect* rin, int /*id*/)
119   {
120     int pad = get_padding();
121     rin->left = rout->left-pad;
122     rin->top = rout->top-pad;
123     rin->width = rout->width+pad*2;
124     rin->height = rout->height+pad*2;
125   }
126 };
127 
128 
129 
130 template < OP_TEMPLATE_DEF >
131 class NonLocalMeansProc
132 {
133 public:
render(VipsRegion ** in,int n,int in_first,VipsRegion * imap,VipsRegion * omap,VipsRegion * out,OpParBase * par)134   void render(VipsRegion** in, int n, int in_first,
135       VipsRegion* imap, VipsRegion* omap,
136       VipsRegion* out, OpParBase* par)
137   {
138     std::cout<<"NonLocalMeansProc::render() called."<<std::endl;
139   }
140 };
141 
142 
143 typedef union floatint_t
144 {
145   float f;
146   uint32_t i;
147 } floatint_t;
148 
149 /* Non-local means filter, adapted from Darktable
150  */
151 template < OP_TEMPLATE_DEF >
152 class NonLocalMeans_DTAlgo_Proc
153 {
fast_mexp2f(const float x)154   float fast_mexp2f(const float x)
155   {
156     const float i1 = (float)0x3f800000u; // 2^0
157     const float i2 = (float)0x3f000000u; // 2^-1
158     const float k0 = i1 + x * (i2 - i1);
159     floatint_t k;
160     k.i = k0 >= (float)0x800000u ? k0 : 0;
161     return k.f;
162   }
163 
gh(const float f,const float sharpness)164   float gh(const float f, const float sharpness)
165   {
166     const float f2 = f * sharpness;
167     return fast_mexp2f(f2);
168     // return 0.0001f + dt_fast_expf(-fabsf(f)*800.0f);
169     // return 1.0f/(1.0f + f*f);
170     // make spread bigger: less smoothing
171     // const float spread = 100.f;
172     // return 1.0f/(1.0f + fabsf(f)*spread);
173   }
174 
175 
176 public:
render(VipsRegion ** ireg,int n,int in_first,VipsRegion * imap,VipsRegion * omap,VipsRegion * oreg,OpParBase * par)177   void render(VipsRegion** ireg, int n, int in_first,
178       VipsRegion* imap, VipsRegion* omap,
179       VipsRegion* oreg, OpParBase* par)
180   {
181     //std::cout<<"NonLocalMeansProc::render(RGB) called: n="<<n<<std::endl;
182     if( n < 1 ) return;
183     if( ireg[0] == NULL ) {std::cout<<"NonLocalMeansProc::render(RGB): ireg[0]==NULL"<<std::endl;return;}
184 
185     //int iwidth = ireg[0]->valid.width;
186     //int iheight = ireg[0]->valid.height;
187 
188     //The cleaning algorithm starts here
189 
190     NonLocalMeans_DTAlgo_Par* opar = dynamic_cast<NonLocalMeans_DTAlgo_Par*>(par);
191     if( !opar ) {
192       std::cout<<"NonLocalMeansProc::render(RGB): opar==NULL"<<std::endl;
193       return;
194     }
195 
196     VipsRect *ir = &ireg[0]->valid;
197     VipsRect roi = {ir->left+2, ir->top+2, ir->width-4, ir->height-4};
198     VipsRect *r = &oreg->valid;
199     vips_rect_intersectrect( &roi, r, &roi );
200     int nch = oreg->im->Bands;
201     int line_size = roi.width * oreg->im->Bands;
202     int width = ir->width;
203     int height = ir->height;
204     int dx = r->left - ir->left;
205     int dy = r->top - ir->top;
206     int padding = r->top - ir->top;
207 
208     //std::cout<<"NonLocalMeansProc::render(RGB):"<<std::endl
209     //    <<"ir: "<<ir->width<<","<<ir->height<<"+"<<ir->left<<"+"<<ir->top<<std::endl
210     //    <<"r:  "<<r->width<<","<<r->height<<"+"<<r->left<<"+"<<r->top<<std::endl;
211 
212     //T* pin0;
213     ///T* pin1;
214     //T* pin2;
215     //T* pin20;
216     //T* pin21;
217     //T* pin22;
218     //T* pout;
219     //int x, y, oy, x1, y1, pos, ipos, pos1;
220 
221     // adjust to zoom size:
222     const int P = opar->get_P(); //ceilf(d->radius * fmin(roi_in->scale, 2.0f) / fmax(piece->iscale, 1.0f)); // pixel filter size
223     const int K = opar->get_K(); //ceilf(7 * fmin(roi_in->scale, 2.0f) / fmax(piece->iscale, 1.0f));         // nbhood
224     const float sharpness = opar->get_sharpness(); //3000.0f / (1.0f + d->strength);
225     float luma = opar->get_luma_frac();
226     float chroma = opar->get_chroma_frac();
227 
228     //int padding = opar->get_padding();
229 
230     // adjust to Lab, make L more important
231     // float max_L = 100.0f, max_C = 256.0f;
232     // float nL = 1.0f/(d->luma*max_L), nC = 1.0f/(d->chroma*max_C);
233     //float max_L = 120.0f, max_C = 512.0f;
234     float max_L = 1.2f, max_C = 2.0f;
235     float nL = 1.0f / max_L, nC = 1.0f / max_C;
236     const float norm2[4] = { nL * nL, nC * nC, nC * nC, 1.0f };
237     //const float norm2[4] = { 1.0f, 1.0f, .0f, 1.0f };
238 
239     /*
240     {
241       for(int j = 0; j < 10; j++) {
242       T* pin = (T*)VIPS_REGION_ADDR( ireg[0], ir->left, ir->top+j );
243       for( int i = 0; i < 10*nch; i+=nch ) printf("%f ", pin[i]*100); printf("\n");
244       }
245     }
246     */
247 
248     float *Sa = (float*)malloc( sizeof(float) * width);
249     float *out_norm = (float*)malloc( sizeof(float) * width * height);
250     memset(out_norm, 0x0, sizeof(float) * width * height);
251 
252     // we want to sum up weights in col[3], so need to init to 0:
253     for( int y = 0; y < r->height; y++ ) {
254       T* pout = (T*)VIPS_REGION_ADDR( oreg, r->left, r->top + y );
255       memset( pout, 0x0, VIPS_REGION_SIZEOF_LINE(oreg) );
256     }
257     //memset(ovoid, 0x0, (size_t)sizeof(float) * roi_out->width * roi_out->height * 4);
258 
259     // for each shift vector
260     for(int kj = -K; kj <= K; kj++) {
261       for(int ki = -K; ki <= K; ki++) {
262         int inited_slide = 0;
263         // don't construct summed area tables but use sliding window! (applies to cpu version res < 1k only, or else
264         // we will add up errors)
265         // do this in parallel with a little threading overhead. could parallelize the outer loops with a bit more
266         // memory
267 //#ifdef _OPENMP
268 //#pragma omp parallel for schedule(static) default(none) firstprivate(inited_slide) shared(kj, ki, Sa)
269 //#endif
270         for(int j = 0; j < height; j++) {
271           if(j + kj < 0 || j + kj >= height) continue;
272           float *S = Sa;
273           //const float *ins = ((float *)ivoid) + 4 * ((size_t)roi_in->width * (j + kj) + ki);
274           const T *ins = (T*)VIPS_REGION_ADDR( ireg[0], ir->left + ki, ir->top + j + kj );
275           //float *out = ((float *)ovoid) + 4 * (size_t)roi_out->width * j;
276 
277           const int Pm = MIN(MIN(P, j + kj), j);
278           const int PM = MIN(MIN(P, height - 1 - j - kj), height - 1 - j);
279           // first line of every thread
280           // TODO: also every once in a while to assert numerical precision!
281           if(!inited_slide) {
282             // sum up a line
283             memset(Sa, 0x0, sizeof(float) * width);
284             for(int jj = -Pm; jj <= PM; jj++) {
285               int i = MAX(0, -ki);
286               float *s = S + i;
287               //const float *inp = ((float *)ivoid) + 4 * i + 4 * (size_t)roi_in->width * (j + jj);
288               const T* inp = (T*)VIPS_REGION_ADDR( ireg[0], ir->left + i, ir->top + j + jj );
289               //printf("inp: left=%d top=%d\n", ir->left + i, ir->top + j + jj);
290               //const float *inps = ((float *)ivoid) + 4 * i + 4 * ((size_t)roi_in->width * (j + jj + kj) + ki);
291               const T* inps = (T*)VIPS_REGION_ADDR( ireg[0], ir->left + i + ki, ir->top + j + jj + kj);
292               //printf("inps: left=%d top=%d\n", ir->left + i + ki, ir->top + j + jj + kj);
293               const int last = width + MIN(0, -ki);
294               for(; i < last; i++, inp += nch, inps += nch, s++)
295               {
296                 for(int k = 0; k < nch; k++) {
297                   //float temp = (inp[k] - inps[k]) * (inp[k] - inps[k]) * norm2[k];
298                   s[0] += (inp[k] - inps[k]) * (inp[k] - inps[k]) * norm2[k];
299                   if( false && /*j < 10 && i < 10 &&*/ k==0)
300                     printf("  kj=%d  ki=%d  j=%d  Pm=%d  PM=%d  jj=%d  i=%d  inp[%d]=%f inps[%d]=%f s[0]+=%f\n",
301                         kj, ki, j, Pm, PM, jj, i, k, (float)inp[k]*100, k, (float)inps[k]*100,
302                         ((inp[k] - inps[k]) * (inp[k] - inps[k]) * norm2[k]));
303                 }
304               }
305               if( false && /*j< 10*/ true )
306                 printf("  kj=%d  ki=%d  j=%d  Pm=%d  PM=%d  jj=%d  i=%d  last=%d  s[0]=%f\n",
307                     kj, ki, j, Pm, PM, jj, i, last, s[0]);
308             }
309             // only reuse this if we had a full stripe
310             if(Pm == P && PM == P) inited_slide = 1;
311           }
312           //continue;
313           //std::cout<<"inited_slide="<<inited_slide<<std::endl;
314 
315           // sliding window for this line:
316           T* out = (T*)VIPS_REGION_ADDR( oreg, ir->left, ir->top + j );
317           float* on = out_norm + width*j;
318           float *s = S;
319           float slide = 0.0f;
320           // sum up the first -P..P
321           for(int i = 0; i < 2 * P + 1; i++) slide += s[i];
322           for(int i = 0; i < width; i++, s++, ins += nch, out += nch, on += 1) {
323             if(i - P > 0 && i + P < width)
324               slide += s[P] - s[-P - 1];
325             if(i >= padding && i < width - padding && i + ki >= 0 && i + ki < width && j >= padding && j < height - padding) {
326               const float iv[4] = { static_cast<float>(ins[0]), static_cast<float>(ins[1]), static_cast<float>(ins[2]), 1.0f };
327               //#if defined(_OPENMP) && defined(OPENMP_SIMD_)
328               //#pragma omp SIMD()
329               //#endif
330               float gh_val = gh(slide, sharpness);
331               for(int c = 0; c < nch; c++)
332               {
333                 //printf("  i=%d    out[0] = %f + %f * gh(%f, %f) = %f + %f * %f = %f\n",
334                 //    i, out[c], iv[c], slide, sharpness,
335                 //    out[c], iv[c], gh_val,
336                 //    out[c] + iv[c] * gh_val);
337                 out[c] += iv[c] * gh_val;
338               }
339               //printf("  i=%d    on[0] = %f + %f = %f\n", i, on[0], gh_val, on[0] + gh_val);
340               on[0] += gh_val;
341             }
342           }
343           //continue;
344           if(inited_slide && j + P + 1 + MAX(0, kj) < height)
345           {
346             // sliding window in j direction:
347             int i = MAX(0, -ki);
348             float *s = S + i;
349             //const float *inp = ((float *)ivoid) + 4 * i + 4 * (size_t)roi_in->width * (j + P + 1);
350             const T* inp = (T*)VIPS_REGION_ADDR( ireg[0], ir->left + i, ir->top + j + P + 1 );
351             //const float *inps = ((float *)ivoid) + 4 * i + 4 * ((size_t)roi_in->width * (j + P + 1 + kj) + ki);
352             const T* inps = (T*)VIPS_REGION_ADDR( ireg[0], ir->left + i + ki, ir->top + j + P + 1 + kj );
353             //const float *inm = ((float *)ivoid) + 4 * i + 4 * (size_t)roi_in->width * (j - P);
354             const T* inm = (T*)VIPS_REGION_ADDR( ireg[0], ir->left + i, ir->top + j - P );
355             //const float *inms = ((float *)ivoid) + 4 * i + 4 * ((size_t)roi_in->width * (j - P + kj) + ki);
356             const T* inms = (T*)VIPS_REGION_ADDR( ireg[0], ir->left + i + ki, ir->top + j - P + kj );
357             const int last = width + MIN(0, -ki);
358             for(; i < last; i++, inp += nch, inps += nch, inm += nch, inms += nch, s++)
359             {
360               float stmp = s[0];
361               for(int k = 0; k < nch; k++)
362                 stmp += ((inp[k] - inps[k]) * (inp[k] - inps[k]) - (inm[k] - inms[k]) * (inm[k] - inms[k])) * norm2[k];
363               s[0] = stmp;
364             }
365           }
366           else
367             inited_slide = 0;
368         }
369       }
370     }
371 
372     // normalize and apply chroma/luma blending
373     const float weight[4] = { luma, chroma, chroma, 1.0f };
374     const float invert[4] = { 1.0f - luma, 1.0f - chroma, 1.0f - chroma, 0.0f };
375 
376     for(int j = 0; j < r->height; j++) {
377       T* in = ( (T*)VIPS_REGION_ADDR(ireg[0], r->left, r->top + j) );
378       T* out = ( (T*)VIPS_REGION_ADDR(oreg, r->left, r->top + j) );
379       float* on = out_norm + (j+padding) * width + padding;
380       for(int i = 0; i < r->width; i++, in+=nch, out+=nch, on+=1) {
381         for(int c = 0; c < nch; c++) {
382           //printf("j=%d i=%d  out[c]=%f * %f / %f = %f\n", j, i, out[c], weight[c], on[0], out[c] * weight[c] / on[0]);
383           out[c] = in[c] * invert[c] + out[c] * weight[c] / on[0];
384         }
385         //out[1] = out[2] = 0.5;
386       }
387     }
388 
389     //#ifdef _OPENMP
390     //#pragma omp parallel for SIMD() default(none) schedule(static) collapse(2)
391     //#endif
392     //    for(size_t k = 0; k < (size_t)ch * roi_out->width * roi_out->height; k += ch)
393     //    {
394     //      for(size_t c = 0; c < 4; c++)
395     //      {
396     //        out[k + c] = (in[k + c] * invert[c]) + (out[k + c] * (weight[c] / out[k + 3]));
397     //      }
398     //    }
399 
400     // free shared tmp memory:
401     free(Sa);
402     free(out_norm);
403   }
404 };
405 
406 
407 ProcessorBase* new_nlmeans_algo();
408 
409 ProcessorBase* new_nlmeans();
410 
411 }
412 
413 #endif
414 
415 
416