1 ////////////////////////////////////////////////////////////////
2 //
3 //          AMaZE demosaic algorithm
4 // (Aliasing Minimization and Zipper Elimination)
5 //
6 //  copyright (c) 2008-2010  Emil Martinec <ejmartin@uchicago.edu>
7 //  optimized for speed by Ingo Weyrich
8 //
9 // incorporating ideas of Luis Sanz Rodrigues and Paul Lee
10 //
11 // code dated: May 27, 2010
12 // latest modification: Ingo Weyrich, January 25, 2016
13 //
14 //  amaze.cc is free software: you can redistribute it and/or modify
15 //  it under the terms of the GNU General Public License as published by
16 //  the Free Software Foundation, either version 3 of the License, or
17 //  (at your option) any later version.
18 //
19 //  This program is distributed in the hope that it will be useful,
20 //  but WITHOUT ANY WARRANTY; without even the implied warranty of
21 //  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
22 //  GNU General Public License for more details.
23 //
24 //  You should have received a copy of the GNU General Public License
25 //  along with this program.  If not, see <http://www.gnu.org/licenses/>.
26 //
27 ////////////////////////////////////////////////////////////////
28 
29 #include <algorithm>
30 #include <array>
31 #include <cstring>
32 #include <memory>
33 
34 #include "bayerhelper.h"
35 #include "librtprocess.h"
36 #include "rt_math.h"
37 #include "sleef.h"
38 #include "opthelper.h"
39 #include "median.h"
40 #include "StopWatch.h"
41 
42 using namespace librtprocess;
43 
amaze_demosaic(int raw_width,int raw_height,int winx,int winy,int winw,int winh,const float * const * rawData,float ** red,float ** green,float ** blue,const unsigned cfarray[2][2],const std::function<bool (double)> & setProgCancel,double initGain,int border,float inputScale,float outputScale,size_t chunkSize,bool measure)44 rpError amaze_demosaic(int raw_width, int raw_height, int winx, int winy, int winw, int winh, const float * const *rawData, float **red, float **green, float **blue, const unsigned cfarray[2][2], const std::function<bool(double)> &setProgCancel, double initGain, int border, float inputScale, float outputScale, size_t chunkSize, bool measure)
45 {
46     BENCHFUN
47     std::unique_ptr<StopWatch> stop;
48 
49     if (measure) {
50         std::cout << "Demosaicing " << winw << "x" << winh << " image using AMaZE with " << chunkSize << " Tiles per Thread" << std::endl;
51         stop.reset(new StopWatch("amaze demosaic"));
52     }
53 
54     if (!validateBayerCfa(3, cfarray)) {
55         return RP_WRONG_CFA;
56     }
57     rpError rc = RP_NO_ERROR;
58 
59     double progress = 0.0;
60     setProgCancel(progress);
61 
62     const int width = winw, height = winh;
63     const float clip_pt = 1.0 / initGain;
64     const float clip_pt8 = 0.8 / initGain;
65 
66 // this allows to pass AMAZETS to the code. On some machines larger AMAZETS is faster
67 // If AMAZETS is undefined it will be set to 160, which is the fastest on modern x86/64 machines
68 #ifndef AMAZETS
69 #define AMAZETS 160
70 #endif
71     // Tile size; the image is processed in square tiles to lower memory requirements and facilitate multi-threading
72     // We assure that Tile size is a multiple of 32 in the range [96;992]
73     constexpr int ts = (AMAZETS & 992) < 96 ? 96 : (AMAZETS & 992);
74     constexpr int tsh = ts / 2; // half of Tile size
75 
76     //offset of R pixel within a Bayer quartet
77     int ex, ey;
78 
79     //determine GRBG coset; (ey,ex) is the offset of the R subarray
80     if (fc(cfarray, 0, 0) == 1) { //first pixel is G
81         if (fc(cfarray, 0, 1) == 0) {
82             ey = 0;
83             ex = 1;
84         } else {
85             ey = 1;
86             ex = 0;
87         }
88     } else {//first pixel is R or B
89         if (fc(cfarray, 0, 0) == 0) {
90             ey = 0;
91             ex = 0;
92         } else {
93             ey = 1;
94             ex = 1;
95         }
96     }
97 
98     //shifts of pointer value to access pixels in vertical and diagonal directions
99     constexpr int v1 = ts, v2 = 2 * ts, v3 = 3 * ts, p1 = -ts + 1, p2 = -2 * ts + 2, p3 = -3 * ts + 3, m1 = ts + 1, m2 = 2 * ts + 2, m3 = 3 * ts + 3;
100 
101     //tolerance to avoid dividing by zero
102     constexpr float eps = 1e-5, epssq = 1e-10;       //tolerance to avoid dividing by zero
103 
104     //adaptive ratios threshold
105     constexpr float arthresh = 0.75;
106 
107     //gaussian on 5x5 quincunx, sigma=1.2
108     constexpr float gaussodd[4] = {0.14659727707323927f, 0.103592713382435f, 0.0732036125103057f, 0.0365543548389495f};
109     //nyquist texture test threshold
110     constexpr float nyqthresh = 0.5;
111     //gaussian on 5x5, sigma=1.2, multiplied with nyqthresh to save some time later in loop
112     // Is this really sigma=1.2????, seems more like sigma = 1.672
113     constexpr float gaussgrad[6] = {nyqthresh * 0.07384411893421103f, nyqthresh * 0.06207511968171489f, nyqthresh * 0.0521818194747806f,
114                                     nyqthresh * 0.03687419286733595f, nyqthresh * 0.03099732204057846f, nyqthresh * 0.018413194161458882f
115                                    };
116     //gaussian on 5x5 alt quincunx, sigma=1.5
117     constexpr float gausseven[2] = {0.13719494435797422f, 0.05640252782101291f};
118     //gaussian on quincunx grid
119     constexpr float gquinc[4] = {0.169917f, 0.108947f, 0.069855f, 0.0287182f};
120 
121     typedef struct {
122         float h;
123         float v;
124     } s_hv;
125 
126 #ifdef _OPENMP
127     #pragma omp parallel
128 #endif
129     {
130         int progresscounter = 0;
131 
132         constexpr int cldf = 2; // factor to multiply cache line distance. 1 = 64 bytes, 2 = 128 bytes ...
133         // assign working space
134         char *buffer = (char *) calloc(14 * sizeof(float) * ts * ts + sizeof(char) * ts * tsh + 18 * cldf * 64 + 63, 1);
135 #ifdef _OPENMP
136         #pragma omp critical
137 #endif
138         {
139             if (!buffer) {
140                 rc = RP_MEMORY_ERROR;
141             }
142         }
143 #ifdef _OPENMP
144         #pragma omp barrier
145 #endif
146         if (!rc) {
147             // aligned to 64 byte boundary
148             float *data = (float*)((uintptr_t(buffer) + uintptr_t(63)) / 64 * 64);
149 
150             // green values
151             float *rgbgreen         = data;
152             // sum of square of horizontal gradient and square of vertical gradient
153             float *delhvsqsum       = rgbgreen + ts * ts + cldf * 16;
154             // gradient based directional weights for interpolation
155             float *dirwts0          = delhvsqsum + ts * ts + cldf * 16;
156             float *dirwts1          = dirwts0 + ts * ts + cldf * 16;
157             // vertically interpolated colour differences G-R, G-B
158             float *vcd              = dirwts1 + ts * ts + cldf * 16;
159             // horizontally interpolated colour differences
160             float *hcd              = vcd + ts * ts + cldf * 16;
161             // alternative vertical interpolation
162             float *vcdalt           = hcd + ts * ts + cldf * 16;
163             // alternative horizontal interpolation
164             float *hcdalt           = vcdalt + ts * ts + cldf * 16;
165             // square of average colour difference
166             float *cddiffsq         = hcdalt + ts * ts + cldf * 16;
167             // weight to give horizontal vs vertical interpolation
168             float *hvwt             = cddiffsq + ts * ts + 2 * cldf * 16;
169             // final interpolated colour difference
170             float (*Dgrb)[ts * tsh] = (float (*)[ts * tsh]) vcdalt; // there is no overlap in buffer usage => share
171             // gradient in plus (NE/SW) direction
172             float *delp             = cddiffsq; // there is no overlap in buffer usage => share
173             // gradient in minus (NW/SE) direction
174             float *delm             = delp + ts * tsh + cldf * 16;
175             // diagonal interpolation of R+B
176             float *rbint            = delm; // there is no overlap in buffer usage => share
177             // difference between up/down interpolations of G
178             float *dgintv           = hvwt + ts * tsh + cldf * 16;
179             // difference between left/right interpolations of G
180             float *dginth           = dgintv + ts * ts + cldf * 16;
181             // horizontal and vertical curvature of interpolated G (used to refine interpolation in Nyquist texture regions)
182             s_hv  *Dgrb2            = (s_hv  (*)) dgintv; // there is no overlap in buffer usage => share
183             // square of diagonal colour differences
184             float *Dgrbsq1m         = dginth + ts * ts + cldf * 16;
185             float *Dgrbsq1p         = Dgrbsq1m + ts * tsh + cldf * 16;
186             // tile raw data
187             float *cfa              = Dgrbsq1p + ts * tsh + cldf * 16;
188             // relative weight for combining plus and minus diagonal interpolations
189             float *pmwt             = delhvsqsum;  // there is no overlap in buffer usage => share
190             // interpolated colour difference R-B in minus and plus direction
191             float *rbm              = vcd;  // there is no overlap in buffer usage => share
192             float *rbp              = rbm + ts * tsh + cldf * 16;
193             // nyquist texture flags 1=nyquist, 0=not nyquist
194             unsigned char *nyquist  = (unsigned char (*)) (cfa + ts * ts + cldf * 16);            // 1
195             unsigned char *nyquist2 = (unsigned char (*)) cddiffsq;
196             float *nyqutest = (float(*)) (nyquist + sizeof(unsigned char) * ts * tsh + cldf * 64);                // 1
197 
198             // Main algorithm: Tile loop
199             // use collapse(2) to collapse the 2 loops to one large loop, so there is better scaling
200 #ifdef _OPENMP
201             #pragma omp for schedule(dynamic, chunkSize) collapse(2) nowait
202 #endif
203 
204             for (int top = winy - 16; top < winy + height; top += ts - 32) {
205                 for (int left = winx - 16; left < winx + width; left += ts - 32) {
206                     memset(&nyquist[3 * tsh], 0, sizeof(unsigned char) * (ts - 6) * tsh);
207                     //location of tile bottom edge
208                     int bottom = min(top + ts, winy + height + 16);
209                     //location of tile right edge
210                     int right  = min(left + ts, winx + width + 16);
211                     //tile width  (=ts except for right edge of image)
212                     int rr1 = bottom - top;
213                     //tile height (=ts except for bottom edge of image)
214                     int cc1 = right - left;
215                     // bookkeeping for borders
216                     // min and max row/column in the tile
217                     int rrmin = top < 0 ? 16 : 0;
218                     int ccmin = left < 0 ? 16 : 0;
219                     //int rrmax = bottom > (winy + height) ? winy + height - top : rr1;
220                     int rrmax = bottom > (raw_height) ? winy + height - top : rr1;
221                     //int ccmax = right > (winx + width) ? winx + width - left : cc1;
222                     int ccmax = right > (raw_width) ? winx + width - left : cc1;
223 
224                     // rgb from input CFA data
225                     // rgb values should be floating point number between 0 and 1
226                     // after white balance multipliers are applied
227                     // a 16 pixel border is added to each side of the image
228 
229                     // begin of tile initialization
230 #ifdef __SSE2__
231                     vfloat cinScalev = F2V( inputScale );
232 
233                     //fill upper border
234                     if (rrmin > 0) {
235                         for (int rr = 0; rr < 16; rr++) {
236                             int row = 32 - rr + top;
237 
238                             for (int cc = ccmin; cc < ccmax; cc += 4) {
239                                 int indx1 = rr * ts + cc;
240                                 vfloat tempv = LVFU(rawData[row][cc + left]) / cinScalev;
241                                 STVF(cfa[indx1], tempv);
242                                 STVF(rgbgreen[indx1], tempv );
243                             }
244                         }
245                     }
246 
247                     // fill inner part
248                     for (int rr = rrmin; rr < rrmax; rr++) {
249                         int row = rr + top;
250                         int cc = ccmin;
251                         for (; cc < ccmax - 3; cc += 4) {
252                             int indx1 = rr * ts + cc;
253                             vfloat tempv = LVFU(rawData[row][cc + left]) / cinScalev;
254                             STVF(cfa[indx1], tempv );
255                             STVF(rgbgreen[indx1], tempv );
256                         }
257                         for (; cc < ccmax; ++cc) {
258                             int indx1 = rr * ts + cc;
259                             float temp = rawData[row][cc + left] / inputScale;
260                             cfa[indx1] = temp;
261                             rgbgreen[indx1] = temp;
262                         }
263                     }
264 
265                     //fill lower border
266                     if (rrmax < rr1) {
267                         for (int rr = 0; rr < 16; rr++)
268                             for (int cc = ccmin; cc < ccmax; cc += 4) {
269                                 int indx1 = (rrmax + rr) * ts + cc;
270                                 vfloat tempv = LVFU(rawData[(winy + height - rr - 2)][left + cc]) / cinScalev;
271                                 STVF(cfa[indx1], tempv );
272                                 STVF(rgbgreen[indx1], tempv );
273                             }
274                     }
275 
276 #else
277 
278                     //fill upper border
279                     if (rrmin > 0) {
280                         for (int rr = 0; rr < 16; rr++)
281                             for (int cc = ccmin, row = 32 - rr + top; cc < ccmax; cc++) {
282                                 cfa[rr * ts + cc] = (rawData[row][cc + left]) / inputScale;
283                                 rgbgreen[rr * ts + cc] = cfa[rr * ts + cc];
284                             }
285                     }
286 
287                     // fill inner part
288                     for (int rr = rrmin; rr < rrmax; rr++) {
289                         int row = rr + top;
290 
291                         for (int cc = ccmin; cc < ccmax; cc++) {
292                             int indx1 = rr * ts + cc;
293                             cfa[indx1] = (rawData[row][cc + left]) / inputScale;
294                             rgbgreen[indx1] = cfa[indx1];
295                         }
296                     }
297 
298                     //fill lower border
299                     if (rrmax < rr1) {
300                         for (int rr = 0; rr < 16; rr++)
301                             for (int cc = ccmin; cc < ccmax; cc++) {
302                                 cfa[(rrmax + rr)*ts + cc] = (rawData[(winy + height - rr - 2)][left + cc]) / inputScale;
303                                 rgbgreen[(rrmax + rr)*ts + cc] = cfa[(rrmax + rr) * ts + cc];
304                             }
305                     }
306 
307 #endif
308 
309                     //fill left border
310                     if (ccmin > 0) {
311                         for (int rr = rrmin; rr < rrmax; rr++)
312                             for (int cc = 0, row = rr + top; cc < 16; cc++) {
313                                 cfa[rr * ts + cc] = (rawData[row][32 - cc + left]) / inputScale;
314                                 rgbgreen[rr * ts + cc] = cfa[rr * ts + cc];
315                             }
316                     }
317 
318                     //fill right border
319                     if (ccmax < cc1) {
320                         for (int rr = rrmin; rr < rrmax; rr++)
321                             for (int cc = 0; cc < 16; cc++) {
322                                 cfa[rr * ts + ccmax + cc] = (rawData[(top + rr)][(winx + width - cc - 2)]) / inputScale;
323                                 rgbgreen[rr * ts + ccmax + cc] = cfa[rr * ts + ccmax + cc];
324                             }
325                     }
326 
327                     //also, fill the image corners
328                     if (rrmin > 0 && ccmin > 0) {
329                         for (int rr = 0; rr < 16; rr++)
330                             for (int cc = 0; cc < 16; cc++) {
331                                 cfa[(rr)*ts + cc] = (rawData[winy + 32 - rr][winx + 32 - cc]) / inputScale;
332                                 rgbgreen[(rr)*ts + cc] = cfa[(rr) * ts + cc];
333                             }
334                     }
335 
336                     if (rrmax < rr1 && ccmax < cc1) {
337                         for (int rr = 0; rr < 16; rr++)
338                             for (int cc = 0; cc < 16; cc++) {
339                                 cfa[(rrmax + rr)*ts + ccmax + cc] = (rawData[(winy + height - rr - 2)][(winx + width - cc - 2)]) / inputScale;
340                                 rgbgreen[(rrmax + rr)*ts + ccmax + cc] = cfa[(rrmax + rr) * ts + ccmax + cc];
341                             }
342                     }
343 
344                     if (rrmin > 0 && ccmax < cc1) {
345                         for (int rr = 0; rr < 16; rr++)
346                             for (int cc = 0; cc < 16; cc++) {
347                                 cfa[(rr)*ts + ccmax + cc] = (rawData[(winy + 32 - rr)][(winx + width - cc - 2)]) / inputScale;
348                                 rgbgreen[(rr)*ts + ccmax + cc] = cfa[(rr) * ts + ccmax + cc];
349                             }
350                     }
351 
352                     if (rrmax < rr1 && ccmin > 0) {
353                         for (int rr = 0; rr < 16; rr++)
354                             for (int cc = 0; cc < 16; cc++) {
355                                 cfa[(rrmax + rr)*ts + cc] = (rawData[(winy + height - rr - 2)][(winx + 32 - cc)]) / inputScale;
356                                 rgbgreen[(rrmax + rr)*ts + cc] = cfa[(rrmax + rr) * ts + cc];
357                             }
358                     }
359 
360                     // end of tile initialization
361 
362                     // horizontal and vertical gradients
363 #ifdef __SSE2__
364                     vfloat epsv = F2V( eps );
365 
366                     for (int rr = 2; rr < rr1 - 2; rr++) {
367                         for (int indx = rr * ts; indx < rr * ts + cc1; indx += 4) {
368                             vfloat delhv = vabsf( LVFU( cfa[indx + 1] ) -  LVFU( cfa[indx - 1] ) );
369                             vfloat delvv = vabsf( LVF( cfa[indx + v1] ) -  LVF( cfa[indx - v1] ) );
370                             STVF(dirwts1[indx], epsv + vabsf( LVFU( cfa[indx + 2] ) - LVF( cfa[indx] )) + vabsf( LVF( cfa[indx] ) - LVFU( cfa[indx - 2] )) + delhv );
371                             STVF(dirwts0[indx], epsv + vabsf( LVF( cfa[indx + v2] ) - LVF( cfa[indx] )) + vabsf( LVF( cfa[indx] ) - LVF( cfa[indx - v2] )) + delvv );
372                             STVF(delhvsqsum[indx], SQRV(delhv) + SQRV(delvv));
373                         }
374                     }
375 
376 #else
377 
378                     for (int rr = 2; rr < rr1 - 2; rr++)
379                         for (int cc = 2, indx = (rr) * ts + cc; cc < cc1 - 2; cc++, indx++) {
380                             float delh = fabsf(cfa[indx + 1] - cfa[indx - 1]);
381                             float delv = fabsf(cfa[indx + v1] - cfa[indx - v1]);
382                             dirwts0[indx] = eps + fabsf(cfa[indx + v2] - cfa[indx]) + fabsf(cfa[indx] - cfa[indx - v2]) + delv;
383                             dirwts1[indx] = eps + fabsf(cfa[indx + 2] - cfa[indx]) + fabsf(cfa[indx] - cfa[indx - 2]) + delh;
384                             delhvsqsum[indx] = SQR(delh) + SQR(delv);
385                         }
386 
387 #endif
388 
389                     //interpolate vertical and horizontal colour differences
390 #ifdef __SSE2__
391                     vfloat sgnv;
392 
393                     if( !(fc(cfarray, 4, 4) & 1) ) {
394                         sgnv = _mm_set_ps( 1.f, -1.f, 1.f, -1.f );
395                     } else {
396                         sgnv = _mm_set_ps( -1.f, 1.f, -1.f, 1.f );
397                     }
398 
399                     vfloat zd5v = F2V( 0.5f );
400                     vfloat onev = F2V( 1.f );
401                     vfloat arthreshv = F2V( arthresh );
402                     vfloat clip_pt8v = F2V( clip_pt8 );
403 
404                     for (int rr = 4; rr < rr1 - 4; rr++) {
405                         sgnv = -sgnv;
406 
407                         for (int indx = rr * ts + 4; indx < rr * ts + cc1 - 7; indx += 4) {
408                             //colour ratios in each cardinal direction
409                             vfloat cfav = LVF(cfa[indx]);
410                             vfloat cruv = LVF(cfa[indx - v1]) * (LVF(dirwts0[indx - v2]) + LVF(dirwts0[indx])) / (LVF(dirwts0[indx - v2]) * (epsv + cfav) + LVF(dirwts0[indx]) * (epsv + LVF(cfa[indx - v2])));
411                             vfloat crdv = LVF(cfa[indx + v1]) * (LVF(dirwts0[indx + v2]) + LVF(dirwts0[indx])) / (LVF(dirwts0[indx + v2]) * (epsv + cfav) + LVF(dirwts0[indx]) * (epsv + LVF(cfa[indx + v2])));
412                             vfloat crlv = LVFU(cfa[indx - 1]) * (LVFU(dirwts1[indx - 2]) + LVF(dirwts1[indx])) / (LVFU(dirwts1[indx - 2]) * (epsv + cfav) + LVF(dirwts1[indx]) * (epsv + LVFU(cfa[indx - 2])));
413                             vfloat crrv = LVFU(cfa[indx + 1]) * (LVFU(dirwts1[indx + 2]) + LVF(dirwts1[indx])) / (LVFU(dirwts1[indx + 2]) * (epsv + cfav) + LVF(dirwts1[indx]) * (epsv + LVFU(cfa[indx + 2])));
414 
415                             //G interpolated in vert/hor directions using Hamilton-Adams method
416                             vfloat guhav = LVF(cfa[indx - v1]) + zd5v * (cfav - LVF(cfa[indx - v2]));
417                             vfloat gdhav = LVF(cfa[indx + v1]) + zd5v * (cfav - LVF(cfa[indx + v2]));
418                             vfloat glhav = LVFU(cfa[indx - 1]) + zd5v * (cfav - LVFU(cfa[indx - 2]));
419                             vfloat grhav = LVFU(cfa[indx + 1]) + zd5v * (cfav - LVFU(cfa[indx + 2]));
420 
421                             //G interpolated in vert/hor directions using adaptive ratios
422                             vfloat guarv = vself(vmaskf_lt(vabsf(onev - cruv), arthreshv), cfav * cruv, guhav);
423                             vfloat gdarv = vself(vmaskf_lt(vabsf(onev - crdv), arthreshv), cfav * crdv, gdhav);
424                             vfloat glarv = vself(vmaskf_lt(vabsf(onev - crlv), arthreshv), cfav * crlv, glhav);
425                             vfloat grarv = vself(vmaskf_lt(vabsf(onev - crrv), arthreshv), cfav * crrv, grhav);
426 
427                             //adaptive weights for vertical/horizontal directions
428                             vfloat hwtv = LVFU(dirwts1[indx - 1]) / (LVFU(dirwts1[indx - 1]) + LVFU(dirwts1[indx + 1]));
429                             vfloat vwtv = LVF(dirwts0[indx - v1]) / (LVF(dirwts0[indx + v1]) + LVF(dirwts0[indx - v1]));
430 
431                             //interpolated G via adaptive weights of cardinal evaluations
432                             vfloat Ginthhav = vintpf(hwtv, grhav, glhav);
433                             vfloat Gintvhav = vintpf(vwtv, gdhav, guhav);
434 
435                             //interpolated colour differences
436                             vfloat hcdaltv = sgnv * (Ginthhav - cfav);
437                             vfloat vcdaltv = sgnv * (Gintvhav - cfav);
438                             STVF(hcdalt[indx], hcdaltv);
439                             STVF(vcdalt[indx], vcdaltv);
440 
441                             vmask clipmask = vorm( vorm( vmaskf_gt( cfav, clip_pt8v ), vmaskf_gt( Gintvhav, clip_pt8v ) ), vmaskf_gt( Ginthhav, clip_pt8v ));
442                             guarv = vself( clipmask, guhav, guarv);
443                             gdarv = vself( clipmask, gdhav, gdarv);
444                             glarv = vself( clipmask, glhav, glarv);
445                             grarv = vself( clipmask, grhav, grarv);
446 
447                             //use HA if highlights are (nearly) clipped
448                             STVF(vcd[indx], vself( clipmask, vcdaltv, sgnv * (vintpf(vwtv, gdarv, guarv) - cfav)));
449                             STVF(hcd[indx], vself( clipmask, hcdaltv, sgnv * (vintpf(hwtv, grarv, glarv) - cfav)));
450 
451                             //differences of interpolations in opposite directions
452                             STVF(dgintv[indx], vminf(SQRV(guhav - gdhav), SQRV(guarv - gdarv)));
453                             STVF(dginth[indx], vminf(SQRV(glhav - grhav), SQRV(glarv - grarv)));
454                         }
455                     }
456 
457 #else
458 
459                     for (int rr = 4; rr < rr1 - 4; rr++) {
460                         bool fcswitch = fc(cfarray, rr, 4) & 1;
461 
462                         for (int cc = 4, indx = rr * ts + cc; cc < cc1 - 4; cc++, indx++) {
463 
464                             //colour ratios in each cardinal direction
465                             float cru = cfa[indx - v1] * (dirwts0[indx - v2] + dirwts0[indx]) / (dirwts0[indx - v2] * (eps + cfa[indx]) + dirwts0[indx] * (eps + cfa[indx - v2]));
466                             float crd = cfa[indx + v1] * (dirwts0[indx + v2] + dirwts0[indx]) / (dirwts0[indx + v2] * (eps + cfa[indx]) + dirwts0[indx] * (eps + cfa[indx + v2]));
467                             float crl = cfa[indx - 1] * (dirwts1[indx - 2] + dirwts1[indx]) / (dirwts1[indx - 2] * (eps + cfa[indx]) + dirwts1[indx] * (eps + cfa[indx - 2]));
468                             float crr = cfa[indx + 1] * (dirwts1[indx + 2] + dirwts1[indx]) / (dirwts1[indx + 2] * (eps + cfa[indx]) + dirwts1[indx] * (eps + cfa[indx + 2]));
469 
470                             //G interpolated in vert/hor directions using Hamilton-Adams method
471                             float guha = cfa[indx - v1] + xdiv2f(cfa[indx] - cfa[indx - v2]);
472                             float gdha = cfa[indx + v1] + xdiv2f(cfa[indx] - cfa[indx + v2]);
473                             float glha = cfa[indx - 1] + xdiv2f(cfa[indx] - cfa[indx - 2]);
474                             float grha = cfa[indx + 1] + xdiv2f(cfa[indx] - cfa[indx + 2]);
475 
476                             //G interpolated in vert/hor directions using adaptive ratios
477                             float guar, gdar, glar, grar;
478 
479                             if (fabsf(1.f - cru) < arthresh) {
480                                 guar = cfa[indx] * cru;
481                             } else {
482                                 guar = guha;
483                             }
484 
485                             if (fabsf(1.f - crd) < arthresh) {
486                                 gdar = cfa[indx] * crd;
487                             } else {
488                                 gdar = gdha;
489                             }
490 
491                             if (fabsf(1.f - crl) < arthresh) {
492                                 glar = cfa[indx] * crl;
493                             } else {
494                                 glar = glha;
495                             }
496 
497                             if (fabsf(1.f - crr) < arthresh) {
498                                 grar = cfa[indx] * crr;
499                             } else {
500                                 grar = grha;
501                             }
502 
503                             //adaptive weights for vertical/horizontal directions
504                             float hwt = dirwts1[indx - 1] / (dirwts1[indx - 1] + dirwts1[indx + 1]);
505                             float vwt = dirwts0[indx - v1] / (dirwts0[indx + v1] + dirwts0[indx - v1]);
506 
507                             //interpolated G via adaptive weights of cardinal evaluations
508                             float Gintvha = vwt * gdha + (1.f - vwt) * guha;
509                             float Ginthha = hwt * grha + (1.f - hwt) * glha;
510 
511                             //interpolated colour differences
512                             if (fcswitch) {
513                                 vcd[indx] = cfa[indx] - (vwt * gdar + (1.f - vwt) * guar);
514                                 hcd[indx] = cfa[indx] - (hwt * grar + (1.f - hwt) * glar);
515                                 vcdalt[indx] = cfa[indx] - Gintvha;
516                                 hcdalt[indx] = cfa[indx] - Ginthha;
517                             } else {
518                                 //interpolated colour differences
519                                 vcd[indx] = (vwt * gdar + (1.f - vwt) * guar) - cfa[indx];
520                                 hcd[indx] = (hwt * grar + (1.f - hwt) * glar) - cfa[indx];
521                                 vcdalt[indx] = Gintvha - cfa[indx];
522                                 hcdalt[indx] = Ginthha - cfa[indx];
523                             }
524 
525                             fcswitch = !fcswitch;
526 
527                             if (cfa[indx] > clip_pt8 || Gintvha > clip_pt8 || Ginthha > clip_pt8) {
528                                 //use HA if highlights are (nearly) clipped
529                                 guar = guha;
530                                 gdar = gdha;
531                                 glar = glha;
532                                 grar = grha;
533                                 vcd[indx] = vcdalt[indx];
534                                 hcd[indx] = hcdalt[indx];
535                             }
536 
537                             //differences of interpolations in opposite directions
538                             dgintv[indx] = min(SQR(guha - gdha), SQR(guar - gdar));
539                             dginth[indx] = min(SQR(glha - grha), SQR(glar - grar));
540 
541                         }
542                     }
543 
544 #endif
545 
546 
547 
548 #ifdef __SSE2__
549                     vfloat  clip_ptv = F2V( clip_pt );
550                     vfloat  sgn3v;
551 
552                     if( !(fc(cfarray, 4, 4) & 1) ) {
553                         sgnv = _mm_set_ps( 1.f, -1.f, 1.f, -1.f );
554                     } else {
555                         sgnv = _mm_set_ps( -1.f, 1.f, -1.f, 1.f );
556                     }
557 
558                     sgn3v = sgnv + sgnv + sgnv;
559 
560                     for (int rr = 4; rr < rr1 - 4; rr++) {
561                         vfloat nsgnv = sgnv;
562                         sgnv = -sgnv;
563                         sgn3v = -sgn3v;
564 
565                         for (int indx = rr * ts + 4; indx < rr * ts + cc1 - 4; indx += 4) {
566                             vfloat hcdv = LVF( hcd[indx] );
567                             vfloat hcdvarv = SQRV(LVFU(hcd[indx - 2]) - hcdv) + SQRV(LVFU(hcd[indx - 2]) - LVFU(hcd[indx + 2])) + SQRV(hcdv - LVFU(hcd[indx + 2]));
568                             vfloat hcdaltv = LVF( hcdalt[indx] );
569                             vfloat hcdaltvarv = SQRV(LVFU(hcdalt[indx - 2]) - hcdaltv) + SQRV(LVFU(hcdalt[indx - 2]) - LVFU(hcdalt[indx + 2])) + SQRV(hcdaltv - LVFU(hcdalt[indx + 2]));
570                             vfloat vcdv = LVF( vcd[indx] );
571                             vfloat vcdvarv = SQRV(LVF(vcd[indx - v2]) - vcdv) + SQRV(LVF(vcd[indx - v2]) - LVF(vcd[indx + v2])) + SQRV(vcdv - LVF(vcd[indx + v2]));
572                             vfloat vcdaltv = LVF( vcdalt[indx] );
573                             vfloat vcdaltvarv = SQRV(LVF(vcdalt[indx - v2]) - vcdaltv) + SQRV(LVF(vcdalt[indx - v2]) - LVF(vcdalt[indx + v2])) + SQRV(vcdaltv - LVF(vcdalt[indx + v2]));
574 
575                             //choose the smallest variance; this yields a smoother interpolation
576                             hcdv = vself( vmaskf_lt( hcdaltvarv, hcdvarv ), hcdaltv, hcdv);
577                             vcdv = vself( vmaskf_lt( vcdaltvarv, vcdvarv ), vcdaltv, vcdv);
578 
579                             //bound the interpolation in regions of high saturation
580                             //vertical and horizontal G interpolations
581                             vfloat Ginthv = sgnv * hcdv + LVF( cfa[indx] );
582                             vfloat temp2v = sgn3v * hcdv;
583                             vfloat hwtv = onev + temp2v / ( epsv + Ginthv + LVF( cfa[indx]));
584                             vmask hcdmask = vmaskf_gt( nsgnv * hcdv, ZEROV );
585                             vfloat hcdoldv = hcdv;
586                             vfloat tempv = nsgnv * (LVF(cfa[indx]) - median( Ginthv, LVFU(cfa[indx - 1]), LVFU(cfa[indx + 1]) ));
587                             hcdv = vself( vmaskf_lt( temp2v, -(LVF(cfa[indx]) + Ginthv)), tempv, vintpf(hwtv, hcdv, tempv));
588                             hcdv = vself( hcdmask, hcdv, hcdoldv );
589                             hcdv = vself( vmaskf_gt( Ginthv, clip_ptv), tempv, hcdv);
590                             STVF(hcd[indx], hcdv);
591 
592                             vfloat Gintvv = sgnv * vcdv + LVF( cfa[indx] );
593                             temp2v = sgn3v * vcdv;
594                             vfloat vwtv = onev + temp2v / ( epsv + Gintvv + LVF( cfa[indx]));
595                             vmask vcdmask = vmaskf_gt( nsgnv * vcdv, ZEROV );
596                             vfloat vcdoldv = vcdv;
597                             tempv = nsgnv * (LVF(cfa[indx]) - median( Gintvv, LVF(cfa[indx - v1]), LVF(cfa[indx + v1]) ));
598                             vcdv = vself( vmaskf_lt( temp2v, -(LVF(cfa[indx]) + Gintvv)), tempv, vintpf(vwtv, vcdv, tempv));
599                             vcdv = vself( vcdmask, vcdv, vcdoldv );
600                             vcdv = vself( vmaskf_gt( Gintvv, clip_ptv), tempv, vcdv);
601                             STVF(vcd[indx], vcdv);
602                             STVFU(cddiffsq[indx], SQRV(vcdv - hcdv));
603                         }
604 
605                     }
606 
607 #else
608 
609                     for (int rr = 4; rr < rr1 - 4; rr++) {
610                         for (int cc = 4, indx = rr * ts + cc, c = fc(cfarray, rr, cc) & 1; cc < cc1 - 4; cc++, indx++) {
611                             float hcdvar = 3.f * (SQR(hcd[indx - 2]) + SQR(hcd[indx]) + SQR(hcd[indx + 2])) - SQR(hcd[indx - 2] + hcd[indx] + hcd[indx + 2]);
612                             float hcdaltvar = 3.f * (SQR(hcdalt[indx - 2]) + SQR(hcdalt[indx]) + SQR(hcdalt[indx + 2])) - SQR(hcdalt[indx - 2] + hcdalt[indx] + hcdalt[indx + 2]);
613                             float vcdvar = 3.f * (SQR(vcd[indx - v2]) + SQR(vcd[indx]) + SQR(vcd[indx + v2])) - SQR(vcd[indx - v2] + vcd[indx] + vcd[indx + v2]);
614                             float vcdaltvar = 3.f * (SQR(vcdalt[indx - v2]) + SQR(vcdalt[indx]) + SQR(vcdalt[indx + v2])) - SQR(vcdalt[indx - v2] + vcdalt[indx] + vcdalt[indx + v2]);
615 
616                             //choose the smallest variance; this yields a smoother interpolation
617                             if (hcdaltvar < hcdvar) {
618                                 hcd[indx] = hcdalt[indx];
619                             }
620 
621                             if (vcdaltvar < vcdvar) {
622                                 vcd[indx] = vcdalt[indx];
623                             }
624 
625                             //bound the interpolation in regions of high saturation
626                             //vertical and horizontal G interpolations
627                             float Gintv, Ginth;
628 
629                             if (c) {//G site
630                                 Ginth = -hcd[indx] + cfa[indx]; //R or B
631                                 Gintv = -vcd[indx] + cfa[indx]; //B or R
632 
633                                 if (hcd[indx] > 0) {
634                                     if (3.f * hcd[indx] > (Ginth + cfa[indx])) {
635                                         hcd[indx] = -median(Ginth, cfa[indx - 1], cfa[indx + 1]) + cfa[indx];
636                                     } else {
637                                         float hwt = 1.f - 3.f * hcd[indx] / (eps + Ginth + cfa[indx]);
638                                         hcd[indx] = hwt * hcd[indx] + (1.f - hwt) * (-median(Ginth, cfa[indx - 1], cfa[indx + 1]) + cfa[indx]);
639                                     }
640                                 }
641 
642                                 if (vcd[indx] > 0) {
643                                     if (3.f * vcd[indx] > (Gintv + cfa[indx])) {
644                                         vcd[indx] = -median(Gintv, cfa[indx - v1], cfa[indx + v1]) + cfa[indx];
645                                     } else {
646                                         float vwt = 1.f - 3.f * vcd[indx] / (eps + Gintv + cfa[indx]);
647                                         vcd[indx] = vwt * vcd[indx] + (1.f - vwt) * (-median(Gintv, cfa[indx - v1], cfa[indx + v1]) + cfa[indx]);
648                                     }
649                                 }
650 
651                                 if (Ginth > clip_pt) {
652                                     hcd[indx] = -median(Ginth, cfa[indx - 1], cfa[indx + 1]) + cfa[indx];
653                                 }
654 
655                                 if (Gintv > clip_pt) {
656                                     vcd[indx] = -median(Gintv, cfa[indx - v1], cfa[indx + v1]) + cfa[indx];
657                                 }
658 
659 
660                             } else {//R or B site
661 
662                                 Ginth = hcd[indx] + cfa[indx]; //interpolated G
663                                 Gintv = vcd[indx] + cfa[indx];
664 
665                                 if (hcd[indx] < 0) {
666                                     if (3.f * hcd[indx] < -(Ginth + cfa[indx])) {
667                                         hcd[indx] = median(Ginth, cfa[indx - 1], cfa[indx + 1]) - cfa[indx];
668                                     } else {
669                                         float hwt = 1.f + 3.f * hcd[indx] / (eps + Ginth + cfa[indx]);
670                                         hcd[indx] = hwt * hcd[indx] + (1.f - hwt) * (median(Ginth, cfa[indx - 1], cfa[indx + 1]) - cfa[indx]);
671                                     }
672                                 }
673 
674                                 if (vcd[indx] < 0) {
675                                     if (3.f * vcd[indx] < -(Gintv + cfa[indx])) {
676                                         vcd[indx] = median(Gintv, cfa[indx - v1], cfa[indx + v1]) - cfa[indx];
677                                     } else {
678                                         float vwt = 1.f + 3.f * vcd[indx] / (eps + Gintv + cfa[indx]);
679                                         vcd[indx] = vwt * vcd[indx] + (1.f - vwt) * (median(Gintv, cfa[indx - v1], cfa[indx + v1]) - cfa[indx]);
680                                     }
681                                 }
682 
683                                 if (Ginth > clip_pt) {
684                                     hcd[indx] = median(Ginth, cfa[indx - 1], cfa[indx + 1]) - cfa[indx];
685                                 }
686 
687                                 if (Gintv > clip_pt) {
688                                     vcd[indx] = median(Gintv, cfa[indx - v1], cfa[indx + v1]) - cfa[indx];
689                                 }
690 
691                                 cddiffsq[indx] = SQR(vcd[indx] - hcd[indx]);
692                             }
693 
694                             c = !c;
695                         }
696                     }
697 
698 #endif
699 
700 
701 
702 #ifdef __SSE2__
703                     vfloat  epssqv = F2V( epssq );
704 
705                     for (int rr = 6; rr < rr1 - 6; rr++) {
706                         for (int indx = rr * ts + 6 + (fc(cfarray, rr, 2) & 1); indx < rr * ts + cc1 - 6; indx += 8) {
707                             //compute colour difference variances in cardinal directions
708                             vfloat tempv = LC2VFU(vcd[indx]);
709                             vfloat uavev = tempv + LC2VFU(vcd[indx - v1]) + LC2VFU(vcd[indx - v2]) + LC2VFU(vcd[indx - v3]);
710                             vfloat davev = tempv + LC2VFU(vcd[indx + v1]) + LC2VFU(vcd[indx + v2]) + LC2VFU(vcd[indx + v3]);
711                             vfloat Dgrbvvaruv = SQRV(tempv - uavev) + SQRV(LC2VFU(vcd[indx - v1]) - uavev) + SQRV(LC2VFU(vcd[indx - v2]) - uavev) + SQRV(LC2VFU(vcd[indx - v3]) - uavev);
712                             vfloat Dgrbvvardv = SQRV(tempv - davev) + SQRV(LC2VFU(vcd[indx + v1]) - davev) + SQRV(LC2VFU(vcd[indx + v2]) - davev) + SQRV(LC2VFU(vcd[indx + v3]) - davev);
713 
714                             vfloat hwtv = vadivapb(LC2VFU(dirwts1[indx - 1]), LC2VFU(dirwts1[indx + 1]));
715                             vfloat vwtv = vadivapb(LC2VFU(dirwts0[indx - v1]), LC2VFU(dirwts0[indx + v1]));
716 
717                             tempv = LC2VFU(hcd[indx]);
718                             vfloat lavev = tempv + vaddc2vfu(hcd[indx - 3]) + LC2VFU(hcd[indx - 1]);
719                             vfloat ravev = tempv + vaddc2vfu(hcd[indx + 1]) + LC2VFU(hcd[indx + 3]);
720 
721                             vfloat Dgrbhvarlv = SQRV(tempv - lavev) + SQRV(LC2VFU(hcd[indx - 1]) - lavev) + SQRV(LC2VFU(hcd[indx - 2]) - lavev) + SQRV(LC2VFU(hcd[indx - 3]) - lavev);
722                             vfloat Dgrbhvarrv = SQRV(tempv - ravev) + SQRV(LC2VFU(hcd[indx + 1]) - ravev) + SQRV(LC2VFU(hcd[indx + 2]) - ravev) + SQRV(LC2VFU(hcd[indx + 3]) - ravev);
723 
724 
725                             vfloat vcdvarv = epssqv + vintpf(vwtv, Dgrbvvardv, Dgrbvvaruv);
726                             vfloat hcdvarv = epssqv + vintpf(hwtv, Dgrbhvarrv, Dgrbhvarlv);
727 
728                             //compute fluctuations in up/down and left/right interpolations of colours
729                             Dgrbvvaruv = LC2VFU(dgintv[indx - v1]) + LC2VFU(dgintv[indx - v2]);
730                             Dgrbvvardv = LC2VFU(dgintv[indx + v1]) + LC2VFU(dgintv[indx + v2]);
731 
732                             Dgrbhvarlv = vaddc2vfu(dginth[indx - 2]);
733                             Dgrbhvarrv = vaddc2vfu(dginth[indx + 1]);
734 
735                             vfloat vcdvar1v = epssqv + LC2VFU(dgintv[indx]) + vintpf(vwtv, Dgrbvvardv, Dgrbvvaruv);
736                             vfloat hcdvar1v = epssqv + LC2VFU(dginth[indx]) + vintpf(hwtv, Dgrbhvarrv, Dgrbhvarlv);
737 
738                             //determine adaptive weights for G interpolation
739                             vfloat varwtv = hcdvarv / (vcdvarv + hcdvarv);
740                             vfloat diffwtv = hcdvar1v / (vcdvar1v + hcdvar1v);
741 
742                             //if both agree on interpolation direction, choose the one with strongest directional discrimination;
743                             //otherwise, choose the u/d and l/r difference fluctuation weights
744                             vmask decmask = vandm( vmaskf_gt( (zd5v - varwtv) * (zd5v - diffwtv), ZEROV ), vmaskf_lt( vabsf( zd5v - diffwtv), vabsf( zd5v - varwtv) ) );
745                             STVFU(hvwt[indx >> 1], vself( decmask, varwtv, diffwtv));
746                         }
747                     }
748 
749 #else
750 
751                     for (int rr = 6; rr < rr1 - 6; rr++) {
752                         for (int cc = 6 + (fc(cfarray, rr, 2) & 1), indx = rr * ts + cc; cc < cc1 - 6; cc += 2, indx += 2) {
753 
754                             //compute colour difference variances in cardinal directions
755 
756                             float uave = vcd[indx] + vcd[indx - v1] + vcd[indx - v2] + vcd[indx - v3];
757                             float dave = vcd[indx] + vcd[indx + v1] + vcd[indx + v2] + vcd[indx + v3];
758                             float lave = hcd[indx] + hcd[indx - 1] + hcd[indx - 2] + hcd[indx - 3];
759                             float rave = hcd[indx] + hcd[indx + 1] + hcd[indx + 2] + hcd[indx + 3];
760 
761                             //colour difference (G-R or G-B) variance in up/down/left/right directions
762                             float Dgrbvvaru = SQR(vcd[indx] - uave) + SQR(vcd[indx - v1] - uave) + SQR(vcd[indx - v2] - uave) + SQR(vcd[indx - v3] - uave);
763                             float Dgrbvvard = SQR(vcd[indx] - dave) + SQR(vcd[indx + v1] - dave) + SQR(vcd[indx + v2] - dave) + SQR(vcd[indx + v3] - dave);
764                             float Dgrbhvarl = SQR(hcd[indx] - lave) + SQR(hcd[indx - 1] - lave) + SQR(hcd[indx - 2] - lave) + SQR(hcd[indx - 3] - lave);
765                             float Dgrbhvarr = SQR(hcd[indx] - rave) + SQR(hcd[indx + 1] - rave) + SQR(hcd[indx + 2] - rave) + SQR(hcd[indx + 3] - rave);
766 
767                             float hwt = dirwts1[indx - 1] / (dirwts1[indx - 1] + dirwts1[indx + 1]);
768                             float vwt = dirwts0[indx - v1] / (dirwts0[indx + v1] + dirwts0[indx - v1]);
769 
770                             float vcdvar = epssq + vwt * Dgrbvvard + (1.f - vwt) * Dgrbvvaru;
771                             float hcdvar = epssq + hwt * Dgrbhvarr + (1.f - hwt) * Dgrbhvarl;
772 
773                             //compute fluctuations in up/down and left/right interpolations of colours
774                             Dgrbvvaru = (dgintv[indx]) + (dgintv[indx - v1]) + (dgintv[indx - v2]);
775                             Dgrbvvard = (dgintv[indx]) + (dgintv[indx + v1]) + (dgintv[indx + v2]);
776                             Dgrbhvarl = (dginth[indx]) + (dginth[indx - 1]) + (dginth[indx - 2]);
777                             Dgrbhvarr = (dginth[indx]) + (dginth[indx + 1]) + (dginth[indx + 2]);
778 
779                             float vcdvar1 = epssq + vwt * Dgrbvvard + (1.f - vwt) * Dgrbvvaru;
780                             float hcdvar1 = epssq + hwt * Dgrbhvarr + (1.f - hwt) * Dgrbhvarl;
781 
782                             //determine adaptive weights for G interpolation
783                             float varwt = hcdvar / (vcdvar + hcdvar);
784                             float diffwt = hcdvar1 / (vcdvar1 + hcdvar1);
785 
786                             //if both agree on interpolation direction, choose the one with strongest directional discrimination;
787                             //otherwise, choose the u/d and l/r difference fluctuation weights
788                             if ((0.5 - varwt) * (0.5 - diffwt) > 0 && fabsf(0.5 - diffwt) < fabsf(0.5 - varwt)) {
789                                 hvwt[indx >> 1] = varwt;
790                             } else {
791                                 hvwt[indx >> 1] = diffwt;
792                             }
793                         }
794                     }
795 
796 #endif
797 
798 #ifdef __SSE2__
799                     vfloat gaussg0 = F2V(gaussgrad[0]);
800                     vfloat gaussg1 = F2V(gaussgrad[1]);
801                     vfloat gaussg2 = F2V(gaussgrad[2]);
802                     vfloat gaussg3 = F2V(gaussgrad[3]);
803                     vfloat gaussg4 = F2V(gaussgrad[4]);
804                     vfloat gaussg5 = F2V(gaussgrad[5]);
805                     vfloat gausso0 = F2V(gaussodd[0]);
806                     vfloat gausso1 = F2V(gaussodd[1]);
807                     vfloat gausso2 = F2V(gaussodd[2]);
808                     vfloat gausso3 = F2V(gaussodd[3]);
809 
810 #endif
811 
812                     // precompute nyquist
813                     for (int rr = 6; rr < rr1 - 6; rr++) {
814                         int cc = 6 + (fc(cfarray, rr, 2) & 1);
815                         int indx = rr * ts + cc;
816 
817 #ifdef __SSE2__
818 
819                         for (; cc < cc1 - 7; cc += 8, indx += 8) {
820                             vfloat valv = (gausso0 * LC2VFU(cddiffsq[indx]) +
821                                            gausso1 * (LC2VFU(cddiffsq[(indx - m1)]) + LC2VFU(cddiffsq[(indx + p1)]) +
822                                                       LC2VFU(cddiffsq[(indx - p1)]) + LC2VFU(cddiffsq[(indx + m1)])) +
823                                            gausso2 * (LC2VFU(cddiffsq[(indx - v2)]) + LC2VFU(cddiffsq[(indx - 2)]) +
824                                                       LC2VFU(cddiffsq[(indx + 2)]) + LC2VFU(cddiffsq[(indx + v2)])) +
825                                            gausso3 * (LC2VFU(cddiffsq[(indx - m2)]) + LC2VFU(cddiffsq[(indx + p2)]) +
826                                                       LC2VFU(cddiffsq[(indx - p2)]) + LC2VFU(cddiffsq[(indx + m2)]))) -
827                                           (gaussg0 * LC2VFU(delhvsqsum[indx]) +
828                                            gaussg1 * (LC2VFU(delhvsqsum[indx - v1]) + LC2VFU(delhvsqsum[indx - 1]) +
829                                                       LC2VFU(delhvsqsum[indx + 1]) + LC2VFU(delhvsqsum[indx + v1])) +
830                                            gaussg2 * (LC2VFU(delhvsqsum[indx - m1]) + LC2VFU(delhvsqsum[indx + p1]) +
831                                                       LC2VFU(delhvsqsum[indx - p1]) + LC2VFU(delhvsqsum[indx + m1])) +
832                                            gaussg3 * (LC2VFU(delhvsqsum[indx - v2]) + LC2VFU(delhvsqsum[indx - 2]) +
833                                                       LC2VFU(delhvsqsum[indx + 2]) + LC2VFU(delhvsqsum[indx + v2])) +
834                                            gaussg4 * (LC2VFU(delhvsqsum[indx - v2 - 1]) + LC2VFU(delhvsqsum[indx - v2 + 1]) +
835                                                       LC2VFU(delhvsqsum[indx - ts - 2]) + LC2VFU(delhvsqsum[indx - ts + 2]) +
836                                                       LC2VFU(delhvsqsum[indx + ts - 2]) + LC2VFU(delhvsqsum[indx + ts + 2]) +
837                                                       LC2VFU(delhvsqsum[indx + v2 - 1]) + LC2VFU(delhvsqsum[indx + v2 + 1])) +
838                                            gaussg5 * (LC2VFU(delhvsqsum[indx - m2]) + LC2VFU(delhvsqsum[indx + p2]) +
839                                                       LC2VFU(delhvsqsum[indx - p2]) + LC2VFU(delhvsqsum[indx + m2])));
840                             STVFU(nyqutest[indx >> 1], valv);
841 
842                         }
843 
844 #endif
845 
846                         for (; cc < cc1 - 6; cc += 2, indx += 2) {
847                             nyqutest[indx >> 1] = (gaussodd[0] * cddiffsq[indx] +
848                                                    gaussodd[1] * (cddiffsq[(indx - m1)] + cddiffsq[(indx + p1)] +
849                                                                   cddiffsq[(indx - p1)] + cddiffsq[(indx + m1)]) +
850                                                    gaussodd[2] * (cddiffsq[(indx - v2)] + cddiffsq[(indx - 2)] +
851                                                                   cddiffsq[(indx + 2)] + cddiffsq[(indx + v2)]) +
852                                                    gaussodd[3] * (cddiffsq[(indx - m2)] + cddiffsq[(indx + p2)] +
853                                                                   cddiffsq[(indx - p2)] + cddiffsq[(indx + m2)])) -
854                                                   (gaussgrad[0] *  delhvsqsum[indx] +
855                                                    gaussgrad[1] * (delhvsqsum[indx - v1] + delhvsqsum[indx + 1] +
856                                                                    delhvsqsum[indx - 1] + delhvsqsum[indx + v1]) +
857                                                    gaussgrad[2] * (delhvsqsum[indx - m1] + delhvsqsum[indx + p1] +
858                                                                    delhvsqsum[indx - p1] + delhvsqsum[indx + m1]) +
859                                                    gaussgrad[3] * (delhvsqsum[indx - v2] + delhvsqsum[indx - 2] +
860                                                                    delhvsqsum[indx + 2] + delhvsqsum[indx + v2]) +
861                                                    gaussgrad[4] * (delhvsqsum[indx - v2 - 1] + delhvsqsum[indx - v2 + 1] +
862                                                                    delhvsqsum[indx - ts - 2] + delhvsqsum[indx - ts + 2] +
863                                                                    delhvsqsum[indx + ts - 2] + delhvsqsum[indx + ts + 2] +
864                                                                    delhvsqsum[indx + v2 - 1] + delhvsqsum[indx + v2 + 1]) +
865                                                    gaussgrad[5] * (delhvsqsum[indx - m2] + delhvsqsum[indx + p2] +
866                                                                    delhvsqsum[indx - p2] + delhvsqsum[indx + m2]));
867                         }
868                     }
869 
870                     // Nyquist test
871                     int nystartrow = 0;
872                     int nyendrow = 0;
873                     int nystartcol = ts + 1;
874                     int nyendcol = 0;
875 
876                     for (int rr = 6; rr < rr1 - 6; rr++) {
877                         for (int cc = 6 + (fc(cfarray, rr, 2) & 1), indx = rr * ts + cc; cc < cc1 - 6; cc += 2, indx += 2) {
878 
879                             //nyquist texture test: ask if difference of vcd compared to hcd is larger or smaller than RGGB gradients
880                             if(nyqutest[indx >> 1] > 0.f) {
881                                 nyquist[indx >> 1] = 1;    //nyquist=1 for nyquist region
882                                 nystartrow = nystartrow ? nystartrow : rr;
883                                 nyendrow = rr;
884                                 nystartcol = nystartcol > cc ? cc : nystartcol;
885                                 nyendcol = nyendcol < cc ? cc : nyendcol;
886                             }
887                         }
888                     }
889 
890 
891                     bool doNyquist = nystartrow != nyendrow && nystartcol != nyendcol;
892 
893                     if(doNyquist) {
894                         nyendrow ++; // because of < condition
895                         nyendcol ++; // because of < condition
896                         nystartcol -= (nystartcol & 1);
897                         nystartrow = std::max(8, nystartrow);
898                         nyendrow = std::min(rr1 - 8, nyendrow);
899                         nystartcol = std::max(8, nystartcol);
900                         nyendcol = std::min(cc1 - 8, nyendcol);
901                         memset(&nyquist2[4 * tsh], 0, sizeof(char) * (ts - 8) * tsh);
902 
903 #ifdef __SSE2__
904                         vint fourvb = _mm_set1_epi8(4);
905                         vint onevb = _mm_set1_epi8(1);
906 
907 #endif
908 
909                         for (int rr = nystartrow; rr < nyendrow; rr++) {
910 #ifdef __SSE2__
911 
912                             for (int indx = rr * ts; indx < rr * ts + cc1; indx += 32) {
913                                 vint nyquisttemp1v = _mm_adds_epi8(_mm_load_si128((vint*)&nyquist[(indx - v2) >> 1]), _mm_loadu_si128((vint*)&nyquist[(indx - m1) >> 1]));
914                                 vint nyquisttemp2v = _mm_adds_epi8(_mm_loadu_si128((vint*)&nyquist[(indx + p1) >> 1]), _mm_loadu_si128((vint*)&nyquist[(indx - 2) >> 1]));
915                                 vint nyquisttemp3v = _mm_adds_epi8(_mm_loadu_si128((vint*)&nyquist[(indx +  2) >> 1]), _mm_loadu_si128((vint*)&nyquist[(indx - p1) >> 1]));
916                                 vint valv = _mm_load_si128((vint*)&nyquist[indx >> 1]);
917                                 vint nyquisttemp4v = _mm_adds_epi8(_mm_loadu_si128((vint*)&nyquist[(indx + m1) >> 1]), _mm_load_si128((vint*)&nyquist[(indx + v2) >> 1]));
918                                 nyquisttemp1v = _mm_adds_epi8(nyquisttemp1v, nyquisttemp3v);
919                                 nyquisttemp2v = _mm_adds_epi8(nyquisttemp2v, nyquisttemp4v);
920                                 nyquisttemp1v = _mm_adds_epi8(nyquisttemp1v, nyquisttemp2v);
921                                 valv = vselc(_mm_cmpgt_epi8(nyquisttemp1v, fourvb), onevb, valv);
922                                 valv = vselinotzero(_mm_cmplt_epi8(nyquisttemp1v, fourvb), valv);
923                                 _mm_store_si128((vint*)&nyquist2[indx >> 1], valv);
924                             }
925 
926 #else
927 
928                             for (int indx = rr * ts + nystartcol + (fc(cfarray, rr, 2) & 1); indx < rr * ts + nyendcol; indx += 2) {
929                                 unsigned int nyquisttemp = (nyquist[(indx - v2) >> 1] + nyquist[(indx - m1) >> 1] + nyquist[(indx + p1) >> 1] +
930                                                             nyquist[(indx - 2) >> 1] + nyquist[(indx + 2) >> 1] +
931                                                             nyquist[(indx - p1) >> 1] + nyquist[(indx + m1) >> 1] + nyquist[(indx + v2) >> 1]);
932                                 //if most of your neighbours are named Nyquist, it's likely that you're one too, or not
933                                 nyquist2[indx >> 1] = nyquisttemp > 4 ? 1 : (nyquisttemp < 4 ? 0 : nyquist[indx >> 1]);
934                             }
935 
936 #endif
937                         }
938 
939                         // end of Nyquist test
940 
941                         // in areas of Nyquist texture, do area interpolation
942                         for (int rr = nystartrow; rr < nyendrow; rr++)
943                             for (int indx = rr * ts + nystartcol + (fc(cfarray, rr, 2) & 1); indx < rr * ts + nyendcol; indx += 2) {
944 
945                                 if (nyquist2[indx >> 1]) {
946                                     // area interpolation
947 
948                                     float sumcfa = 0.f, sumh = 0.f, sumv = 0.f, sumsqh = 0.f, sumsqv = 0.f, areawt = 0.f;
949 
950                                     for (int i = -6; i < 7; i += 2) {
951                                         int indx1 = indx + (i * ts) - 6;
952 
953                                         for (int j = -6; j < 7; j += 2, indx1 += 2) {
954                                             if (nyquist2[indx1 >> 1]) {
955                                                 float cfatemp = cfa[indx1];
956                                                 sumcfa += cfatemp;
957                                                 sumh += (cfa[indx1 - 1] + cfa[indx1 + 1]);
958                                                 sumv += (cfa[indx1 - v1] + cfa[indx1 + v1]);
959                                                 sumsqh += SQR(cfatemp - cfa[indx1 - 1]) + SQR(cfatemp - cfa[indx1 + 1]);
960                                                 sumsqv += SQR(cfatemp - cfa[indx1 - v1]) + SQR(cfatemp - cfa[indx1 + v1]);
961                                                 areawt += 1;
962                                             }
963                                         }
964                                     }
965 
966                                     //horizontal and vertical colour differences, and adaptive weight
967                                     sumh = sumcfa - xdiv2f(sumh);
968                                     sumv = sumcfa - xdiv2f(sumv);
969                                     areawt = xdiv2f(areawt);
970                                     float hcdvar = epssq + fabsf(areawt * sumsqh - sumh * sumh);
971                                     float vcdvar = epssq + fabsf(areawt * sumsqv - sumv * sumv);
972                                     hvwt[indx >> 1] = hcdvar / (vcdvar + hcdvar);
973 
974                                     // end of area interpolation
975 
976                                 }
977                             }
978                     }
979 
980 
981                     //populate G at R/B sites
982                     for (int rr = 8; rr < rr1 - 8; rr++)
983                         for (int indx = rr * ts + 8 + (fc(cfarray, rr, 2) & 1); indx < rr * ts + cc1 - 8; indx += 2) {
984 
985                             //first ask if one gets more directional discrimination from nearby B/R sites
986                             float hvwtalt = xdivf(hvwt[(indx - m1) >> 1] + hvwt[(indx + p1) >> 1] + hvwt[(indx - p1) >> 1] + hvwt[(indx + m1) >> 1], 2);
987 
988                             hvwt[indx >> 1] = fabsf(0.5f - hvwt[indx >> 1]) < fabsf(0.5f - hvwtalt) ? hvwtalt : hvwt[indx >> 1];
989                             //a better result was obtained from the neighbours
990 
991                             Dgrb[0][indx >> 1] = intp(hvwt[indx >> 1], vcd[indx], hcd[indx]); //evaluate colour differences
992 
993                             rgbgreen[indx] = cfa[indx] + Dgrb[0][indx >> 1]; //evaluate G (finally!)
994 
995                             //local curvature in G (preparation for nyquist refinement step)
996                             Dgrb2[indx >> 1].h = nyquist2[indx >> 1] ? SQR(rgbgreen[indx] - xdiv2f(rgbgreen[indx - 1] + rgbgreen[indx + 1])) : 0.f;
997                             Dgrb2[indx >> 1].v = nyquist2[indx >> 1] ? SQR(rgbgreen[indx] - xdiv2f(rgbgreen[indx - v1] + rgbgreen[indx + v1])) : 0.f;
998                         }
999 
1000 
1001                     //end of standard interpolation
1002 
1003                     // refine Nyquist areas using G curvatures
1004                     if(doNyquist) {
1005                         for (int rr = nystartrow; rr < nyendrow; rr++)
1006                             for (int indx = rr * ts + nystartcol + (fc(cfarray, rr, 2) & 1); indx < rr * ts + nyendcol; indx += 2) {
1007 
1008                                 if (nyquist2[indx >> 1]) {
1009                                     //local averages (over Nyquist pixels only) of G curvature squared
1010                                     float gvarh = epssq + (gquinc[0] * Dgrb2[indx >> 1].h +
1011                                                            gquinc[1] * (Dgrb2[(indx - m1) >> 1].h + Dgrb2[(indx + p1) >> 1].h + Dgrb2[(indx - p1) >> 1].h + Dgrb2[(indx + m1) >> 1].h) +
1012                                                            gquinc[2] * (Dgrb2[(indx - v2) >> 1].h + Dgrb2[(indx - 2) >> 1].h + Dgrb2[(indx + 2) >> 1].h + Dgrb2[(indx + v2) >> 1].h) +
1013                                                            gquinc[3] * (Dgrb2[(indx - m2) >> 1].h + Dgrb2[(indx + p2) >> 1].h + Dgrb2[(indx - p2) >> 1].h + Dgrb2[(indx + m2) >> 1].h));
1014                                     float gvarv = epssq + (gquinc[0] * Dgrb2[indx >> 1].v +
1015                                                            gquinc[1] * (Dgrb2[(indx - m1) >> 1].v + Dgrb2[(indx + p1) >> 1].v + Dgrb2[(indx - p1) >> 1].v + Dgrb2[(indx + m1) >> 1].v) +
1016                                                            gquinc[2] * (Dgrb2[(indx - v2) >> 1].v + Dgrb2[(indx - 2) >> 1].v + Dgrb2[(indx + 2) >> 1].v + Dgrb2[(indx + v2) >> 1].v) +
1017                                                            gquinc[3] * (Dgrb2[(indx - m2) >> 1].v + Dgrb2[(indx + p2) >> 1].v + Dgrb2[(indx - p2) >> 1].v + Dgrb2[(indx + m2) >> 1].v));
1018                                     //use the results as weights for refined G interpolation
1019                                     Dgrb[0][indx >> 1] = (hcd[indx] * gvarv + vcd[indx] * gvarh) / (gvarv + gvarh);
1020                                     rgbgreen[indx] = cfa[indx] + Dgrb[0][indx >> 1];
1021                                 }
1022                             }
1023                     }
1024 
1025 
1026 #ifdef __SSE2__
1027 
1028                     for (int rr = 6; rr < rr1 - 6; rr++) {
1029                         if((fc(cfarray, rr, 2) & 1) == 0) {
1030                             for (int cc = 6, indx = rr * ts + cc; cc < cc1 - 6; cc += 8, indx += 8) {
1031                                 vfloat tempv = LC2VFU(cfa[indx + 1]);
1032                                 vfloat Dgrbsq1pv = (SQRV(tempv - LC2VFU(cfa[indx + 1 - p1])) + SQRV(tempv - LC2VFU(cfa[indx + 1 + p1])));
1033                                 STVFU(delp[indx >> 1], vabsf(LC2VFU(cfa[indx + p1]) - LC2VFU(cfa[indx - p1])));
1034                                 STVFU(delm[indx >> 1], vabsf(LC2VFU(cfa[indx + m1]) - LC2VFU(cfa[indx - m1])));
1035                                 vfloat Dgrbsq1mv = (SQRV(tempv - LC2VFU(cfa[indx + 1 - m1])) + SQRV(tempv - LC2VFU(cfa[indx + 1 + m1])));
1036                                 STVFU(Dgrbsq1m[indx >> 1], Dgrbsq1mv );
1037                                 STVFU(Dgrbsq1p[indx >> 1], Dgrbsq1pv );
1038                             }
1039                         } else {
1040                             for (int cc = 6, indx = rr * ts + cc; cc < cc1 - 6; cc += 8, indx += 8) {
1041                                 vfloat tempv = LC2VFU(cfa[indx]);
1042                                 vfloat Dgrbsq1pv = (SQRV(tempv - LC2VFU(cfa[indx - p1])) + SQRV(tempv - LC2VFU(cfa[indx + p1])));
1043                                 STVFU(delp[indx >> 1], vabsf(LC2VFU(cfa[indx + 1 + p1]) - LC2VFU(cfa[indx + 1 - p1])));
1044                                 STVFU(delm[indx >> 1], vabsf(LC2VFU(cfa[indx + 1 + m1]) - LC2VFU(cfa[indx + 1 - m1])));
1045                                 vfloat Dgrbsq1mv = (SQRV(tempv - LC2VFU(cfa[indx - m1])) + SQRV(tempv - LC2VFU(cfa[indx + m1])));
1046                                 STVFU(Dgrbsq1m[indx >> 1], Dgrbsq1mv );
1047                                 STVFU(Dgrbsq1p[indx >> 1], Dgrbsq1pv );
1048                             }
1049                         }
1050                     }
1051 
1052 #else
1053 
1054                     for (int rr = 6; rr < rr1 - 6; rr++) {
1055                         if((fc(cfarray, rr, 2) & 1) == 0) {
1056                             for (int cc = 6, indx = rr * ts + cc; cc < cc1 - 6; cc += 2, indx += 2) {
1057                                 delp[indx >> 1] = fabsf(cfa[indx + p1] - cfa[indx - p1]);
1058                                 delm[indx >> 1] = fabsf(cfa[indx + m1] - cfa[indx - m1]);
1059                                 Dgrbsq1p[indx >> 1] = (SQR(cfa[indx + 1] - cfa[indx + 1 - p1]) + SQR(cfa[indx + 1] - cfa[indx + 1 + p1]));
1060                                 Dgrbsq1m[indx >> 1] = (SQR(cfa[indx + 1] - cfa[indx + 1 - m1]) + SQR(cfa[indx + 1] - cfa[indx + 1 + m1]));
1061                             }
1062                         } else {
1063                             for (int cc = 6, indx = rr * ts + cc; cc < cc1 - 6; cc += 2, indx += 2) {
1064                                 Dgrbsq1p[indx >> 1] = (SQR(cfa[indx] - cfa[indx - p1]) + SQR(cfa[indx] - cfa[indx + p1]));
1065                                 Dgrbsq1m[indx >> 1] = (SQR(cfa[indx] - cfa[indx - m1]) + SQR(cfa[indx] - cfa[indx + m1]));
1066                                 delp[indx >> 1] = fabsf(cfa[indx + 1 + p1] - cfa[indx + 1 - p1]);
1067                                 delm[indx >> 1] = fabsf(cfa[indx + 1 + m1] - cfa[indx + 1 - m1]);
1068                             }
1069                         }
1070                     }
1071 
1072 #endif
1073 
1074                     // diagonal interpolation correction
1075 
1076 #ifdef __SSE2__
1077                     vfloat gausseven0v = F2V(gausseven[0]);
1078                     vfloat gausseven1v = F2V(gausseven[1]);
1079 #endif
1080 
1081                     for (int rr = 8; rr < rr1 - 8; rr++) {
1082 #ifdef __SSE2__
1083 
1084                         for (int indx = rr * ts + 8 + (fc(cfarray, rr, 2) & 1), indx1 = indx >> 1; indx < rr * ts + cc1 - 8; indx += 8, indx1 += 4) {
1085 
1086                             //diagonal colour ratios
1087                             vfloat cfav = LC2VFU(cfa[indx]);
1088 
1089                             vfloat temp1v = LC2VFU(cfa[indx + m1]);
1090                             vfloat temp2v = LC2VFU(cfa[indx + m2]);
1091                             vfloat rbsev = vmul2f(temp1v) / (epsv + cfav + temp2v );
1092                             rbsev = vself(vmaskf_lt(vabsf(onev - rbsev), arthreshv), cfav * rbsev, temp1v + zd5v * (cfav - temp2v));
1093 
1094                             temp1v = LC2VFU(cfa[indx - m1]);
1095                             temp2v = LC2VFU(cfa[indx - m2]);
1096                             vfloat rbnwv = vmul2f(temp1v) / (epsv + cfav + temp2v );
1097                             rbnwv = vself(vmaskf_lt(vabsf(onev - rbnwv), arthreshv), cfav * rbnwv, temp1v + zd5v * (cfav - temp2v));
1098 
1099                             temp1v = epsv + LVFU(delm[indx1]);
1100                             vfloat wtsev = temp1v + LVFU(delm[(indx + m1) >> 1]) + LVFU(delm[(indx + m2) >> 1]); //same as for wtu,wtd,wtl,wtr
1101                             vfloat wtnwv = temp1v + LVFU(delm[(indx - m1) >> 1]) + LVFU(delm[(indx - m2) >> 1]);
1102 
1103                             vfloat rbmv = (wtsev * rbnwv + wtnwv * rbsev) / (wtsev + wtnwv);
1104 
1105                             temp1v = median(rbmv , LC2VFU(cfa[indx - m1]), LC2VFU(cfa[indx + m1]));
1106                             vfloat wtv = vmul2f(cfav - rbmv) / (epsv + rbmv + cfav);
1107                             temp2v = vintpf(wtv, rbmv, temp1v);
1108 
1109                             temp2v = vself(vmaskf_lt(rbmv + rbmv, cfav), temp1v, temp2v);
1110                             temp2v = vself(vmaskf_lt(rbmv, cfav), temp2v, rbmv);
1111                             STVFU(rbm[indx1], vself(vmaskf_gt(temp2v, clip_ptv), median(temp2v , LC2VFU(cfa[indx - m1]), LC2VFU(cfa[indx + m1])), temp2v ));
1112 
1113 
1114                             temp1v = LC2VFU(cfa[indx + p1]);
1115                             temp2v = LC2VFU(cfa[indx + p2]);
1116                             vfloat rbnev = vmul2f(temp1v) / (epsv + cfav + temp2v );
1117                             rbnev = vself(vmaskf_lt(vabsf(onev - rbnev), arthreshv), cfav * rbnev, temp1v + zd5v * (cfav - temp2v));
1118 
1119                             temp1v = LC2VFU(cfa[indx - p1]);
1120                             temp2v = LC2VFU(cfa[indx - p2]);
1121                             vfloat rbswv = vmul2f(temp1v) / (epsv + cfav + temp2v );
1122                             rbswv = vself(vmaskf_lt(vabsf(onev - rbswv), arthreshv), cfav * rbswv, temp1v + zd5v * (cfav - temp2v));
1123 
1124                             temp1v = epsv + LVFU(delp[indx1]);
1125                             vfloat wtnev = temp1v + LVFU(delp[(indx + p1) >> 1]) + LVFU(delp[(indx + p2) >> 1]);
1126                             vfloat wtswv = temp1v + LVFU(delp[(indx - p1) >> 1]) + LVFU(delp[(indx - p2) >> 1]);
1127 
1128                             vfloat rbpv = (wtnev * rbswv + wtswv * rbnev) / (wtnev + wtswv);
1129 
1130                             temp1v = median(rbpv , LC2VFU(cfa[indx - p1]), LC2VFU(cfa[indx + p1]));
1131                             wtv = vmul2f(cfav - rbpv) / (epsv + rbpv + cfav);
1132                             temp2v = vintpf(wtv, rbpv, temp1v);
1133 
1134                             temp2v = vself(vmaskf_lt(rbpv + rbpv, cfav), temp1v, temp2v);
1135                             temp2v = vself(vmaskf_lt(rbpv, cfav), temp2v, rbpv);
1136                             STVFU(rbp[indx1], vself(vmaskf_gt(temp2v, clip_ptv), median(temp2v , LC2VFU(cfa[indx - p1]), LC2VFU(cfa[indx + p1])), temp2v ));
1137 
1138                             vfloat rbvarmv = epssqv + (gausseven0v * (LVFU(Dgrbsq1m[(indx - v1) >> 1]) + LVFU(Dgrbsq1m[(indx - 1) >> 1]) + LVFU(Dgrbsq1m[(indx + 1) >> 1]) + LVFU(Dgrbsq1m[(indx + v1) >> 1])) +
1139                                                        gausseven1v * (LVFU(Dgrbsq1m[(indx - v2 - 1) >> 1]) + LVFU(Dgrbsq1m[(indx - v2 + 1) >> 1]) + LVFU(Dgrbsq1m[(indx - 2 - v1) >> 1]) + LVFU(Dgrbsq1m[(indx + 2 - v1) >> 1]) +
1140                                                                LVFU(Dgrbsq1m[(indx - 2 + v1) >> 1]) + LVFU(Dgrbsq1m[(indx + 2 + v1) >> 1]) + LVFU(Dgrbsq1m[(indx + v2 - 1) >> 1]) + LVFU(Dgrbsq1m[(indx + v2 + 1) >> 1])));
1141                             STVFU(pmwt[indx1] , rbvarmv / ((epssqv + (gausseven0v * (LVFU(Dgrbsq1p[(indx - v1) >> 1]) + LVFU(Dgrbsq1p[(indx - 1) >> 1]) + LVFU(Dgrbsq1p[(indx + 1) >> 1]) + LVFU(Dgrbsq1p[(indx + v1) >> 1])) +
1142                                                             gausseven1v * (LVFU(Dgrbsq1p[(indx - v2 - 1) >> 1]) + LVFU(Dgrbsq1p[(indx - v2 + 1) >> 1]) + LVFU(Dgrbsq1p[(indx - 2 - v1) >> 1]) + LVFU(Dgrbsq1p[(indx + 2 - v1) >> 1]) +
1143                                                                     LVFU(Dgrbsq1p[(indx - 2 + v1) >> 1]) + LVFU(Dgrbsq1p[(indx + 2 + v1) >> 1]) + LVFU(Dgrbsq1p[(indx + v2 - 1) >> 1]) + LVFU(Dgrbsq1p[(indx + v2 + 1) >> 1])))) + rbvarmv));
1144 
1145                         }
1146 
1147 #else
1148 
1149                         for (int cc = 8 + (fc(cfarray, rr, 2) & 1), indx = rr * ts + cc, indx1 = indx >> 1; cc < cc1 - 8; cc += 2, indx += 2, indx1++) {
1150 
1151                             //diagonal colour ratios
1152                             float crse = xmul2f(cfa[indx + m1]) / (eps + cfa[indx] + (cfa[indx + m2]));
1153                             float crnw = xmul2f(cfa[indx - m1]) / (eps + cfa[indx] + (cfa[indx - m2]));
1154                             float crne = xmul2f(cfa[indx + p1]) / (eps + cfa[indx] + (cfa[indx + p2]));
1155                             float crsw = xmul2f(cfa[indx - p1]) / (eps + cfa[indx] + (cfa[indx - p2]));
1156                             //colour differences in diagonal directions
1157                             float rbse, rbnw, rbne, rbsw;
1158 
1159                             //assign B/R at R/B sites
1160                             if (fabsf(1.f - crse) < arthresh) {
1161                                 rbse = cfa[indx] * crse;    //use this if more precise diag interp is necessary
1162                             } else {
1163                                 rbse = (cfa[indx + m1]) + xdiv2f(cfa[indx] - cfa[indx + m2]);
1164                             }
1165 
1166                             if (fabsf(1.f - crnw) < arthresh) {
1167                                 rbnw = cfa[indx] * crnw;
1168                             } else {
1169                                 rbnw = (cfa[indx - m1]) + xdiv2f(cfa[indx] - cfa[indx - m2]);
1170                             }
1171 
1172                             if (fabsf(1.f - crne) < arthresh) {
1173                                 rbne = cfa[indx] * crne;
1174                             } else {
1175                                 rbne = (cfa[indx + p1]) + xdiv2f(cfa[indx] - cfa[indx + p2]);
1176                             }
1177 
1178                             if (fabsf(1.f - crsw) < arthresh) {
1179                                 rbsw = cfa[indx] * crsw;
1180                             } else {
1181                                 rbsw = (cfa[indx - p1]) + xdiv2f(cfa[indx] - cfa[indx - p2]);
1182                             }
1183 
1184                             float wtse = eps + delm[indx1] + delm[(indx + m1) >> 1] + delm[(indx + m2) >> 1]; //same as for wtu,wtd,wtl,wtr
1185                             float wtnw = eps + delm[indx1] + delm[(indx - m1) >> 1] + delm[(indx - m2) >> 1];
1186                             float wtne = eps + delp[indx1] + delp[(indx + p1) >> 1] + delp[(indx + p2) >> 1];
1187                             float wtsw = eps + delp[indx1] + delp[(indx - p1) >> 1] + delp[(indx - p2) >> 1];
1188 
1189 
1190                             rbm[indx1] = (wtse * rbnw + wtnw * rbse) / (wtse + wtnw);
1191                             rbp[indx1] = (wtne * rbsw + wtsw * rbne) / (wtne + wtsw);
1192 
1193                             //variance of R-B in plus/minus directions
1194                             float rbvarm = epssq + (gausseven[0] * (Dgrbsq1m[(indx - v1) >> 1] + Dgrbsq1m[(indx - 1) >> 1] + Dgrbsq1m[(indx + 1) >> 1] + Dgrbsq1m[(indx + v1) >> 1]) +
1195                                                     gausseven[1] * (Dgrbsq1m[(indx - v2 - 1) >> 1] + Dgrbsq1m[(indx - v2 + 1) >> 1] + Dgrbsq1m[(indx - 2 - v1) >> 1] + Dgrbsq1m[(indx + 2 - v1) >> 1] +
1196                                                                     Dgrbsq1m[(indx - 2 + v1) >> 1] + Dgrbsq1m[(indx + 2 + v1) >> 1] + Dgrbsq1m[(indx + v2 - 1) >> 1] + Dgrbsq1m[(indx + v2 + 1) >> 1]));
1197                             pmwt[indx1] = rbvarm / ((epssq + (gausseven[0] * (Dgrbsq1p[(indx - v1) >> 1] + Dgrbsq1p[(indx - 1) >> 1] + Dgrbsq1p[(indx + 1) >> 1] + Dgrbsq1p[(indx + v1) >> 1]) +
1198                                                               gausseven[1] * (Dgrbsq1p[(indx - v2 - 1) >> 1] + Dgrbsq1p[(indx - v2 + 1) >> 1] + Dgrbsq1p[(indx - 2 - v1) >> 1] + Dgrbsq1p[(indx + 2 - v1) >> 1] +
1199                                                                       Dgrbsq1p[(indx - 2 + v1) >> 1] + Dgrbsq1p[(indx + 2 + v1) >> 1] + Dgrbsq1p[(indx + v2 - 1) >> 1] + Dgrbsq1p[(indx + v2 + 1) >> 1]))) + rbvarm);
1200 
1201                             //bound the interpolation in regions of high saturation
1202 
1203                             if (rbp[indx1] < cfa[indx]) {
1204                                 if (xmul2f(rbp[indx1]) < cfa[indx]) {
1205                                     rbp[indx1] = median(rbp[indx1] , cfa[indx - p1], cfa[indx + p1]);
1206                                 } else {
1207                                     float pwt = xmul2f(cfa[indx] - rbp[indx1]) / (eps + rbp[indx1] + cfa[indx]);
1208                                     rbp[indx1] = pwt * rbp[indx1] + (1.f - pwt) * median(rbp[indx1], cfa[indx - p1], cfa[indx + p1]);
1209                                 }
1210                             }
1211 
1212                             if (rbm[indx1] < cfa[indx]) {
1213                                 if (xmul2f(rbm[indx1]) < cfa[indx]) {
1214                                     rbm[indx1] = median(rbm[indx1] , cfa[indx - m1], cfa[indx + m1]);
1215                                 } else {
1216                                     float mwt = xmul2f(cfa[indx] - rbm[indx1]) / (eps + rbm[indx1] + cfa[indx]);
1217                                     rbm[indx1] = mwt * rbm[indx1] + (1.f - mwt) * median(rbm[indx1], cfa[indx - m1], cfa[indx + m1]);
1218                                 }
1219                             }
1220 
1221                             if (rbp[indx1] > clip_pt) {
1222                                 rbp[indx1] = median(rbp[indx1], cfa[indx - p1], cfa[indx + p1]);
1223                             }
1224 
1225                             if (rbm[indx1] > clip_pt) {
1226                                 rbm[indx1] = median(rbm[indx1], cfa[indx - m1], cfa[indx + m1]);
1227                             }
1228                         }
1229 
1230 #endif
1231                     }
1232 
1233 #ifdef __SSE2__
1234                     vfloat zd25v = F2V(0.25f);
1235 #endif
1236 
1237                     for (int rr = 10; rr < rr1 - 10; rr++)
1238 #ifdef __SSE2__
1239                         for (int indx = rr * ts + 10 + (fc(cfarray, rr, 2) & 1), indx1 = indx >> 1; indx < rr * ts + cc1 - 10; indx += 8, indx1 += 4) {
1240 
1241                             //first ask if one gets more directional discrimination from nearby B/R sites
1242                             vfloat pmwtaltv = zd25v * (LVFU(pmwt[(indx - m1) >> 1]) + LVFU(pmwt[(indx + p1) >> 1]) + LVFU(pmwt[(indx - p1) >> 1]) + LVFU(pmwt[(indx + m1) >> 1]));
1243                             vfloat tempv = LVFU(pmwt[indx1]);
1244                             tempv = vself(vmaskf_lt(vabsf(zd5v - tempv), vabsf(zd5v - pmwtaltv)), pmwtaltv, tempv);
1245                             STVFU(pmwt[indx1], tempv);
1246                             STVFU(rbint[indx1], zd5v * (LC2VFU(cfa[indx]) + vintpf(tempv, LVFU(rbp[indx1]), LVFU(rbm[indx1]))));
1247                         }
1248 
1249 #else
1250 
1251                         for (int cc = 10 + (fc(cfarray, rr, 2) & 1), indx = rr * ts + cc, indx1 = indx >> 1; cc < cc1 - 10; cc += 2, indx += 2, indx1++) {
1252 
1253                             //first ask if one gets more directional discrimination from nearby B/R sites
1254                             float pmwtalt = xdivf(pmwt[(indx - m1) >> 1] + pmwt[(indx + p1) >> 1] + pmwt[(indx - p1) >> 1] + pmwt[(indx + m1) >> 1], 2);
1255 
1256                             if (fabsf(0.5 - pmwt[indx1]) < fabsf(0.5 - pmwtalt)) {
1257                                 pmwt[indx1] = pmwtalt;   //a better result was obtained from the neighbours
1258                             }
1259 
1260                             rbint[indx1] = xdiv2f(cfa[indx] + rbm[indx1] * (1.f - pmwt[indx1]) + rbp[indx1] * pmwt[indx1]); //this is R+B, interpolated
1261                         }
1262 
1263 #endif
1264 
1265                     for (int rr = 12; rr < rr1 - 12; rr++)
1266 #ifdef __SSE2__
1267                         for (int indx = rr * ts + 12 + (fc(cfarray, rr, 2) & 1), indx1 = indx >> 1; indx < rr * ts + cc1 - 12; indx += 8, indx1 += 4) {
1268                             vmask copymask = vmaskf_ge(vabsf(zd5v - LVFU(pmwt[indx1])), vabsf(zd5v - LVFU(hvwt[indx1])));
1269 
1270                             if(_mm_movemask_ps((vfloat)copymask)) { // if for any of the 4 pixels the condition is true, do the maths for all 4 pixels and mask the unused out at the end
1271                                 //now interpolate G vertically/horizontally using R+B values
1272                                 //unfortunately, since G interpolation cannot be done diagonally this may lead to colour shifts
1273                                 //colour ratios for G interpolation
1274                                 vfloat rbintv = LVFU(rbint[indx1]);
1275 
1276                                 //interpolated G via adaptive ratios or Hamilton-Adams in each cardinal direction
1277                                 vfloat cruv = vmul2f(LC2VFU(cfa[indx - v1])) / (epsv + rbintv + LVFU(rbint[(indx1 - v1)]));
1278                                 vfloat guv = rbintv * cruv;
1279                                 vfloat gu2v = LC2VFU(cfa[indx - v1]) + zd5v * (rbintv - LVFU(rbint[(indx1 - v1)]));
1280                                 guv = vself(vmaskf_lt(vabsf(onev - cruv), arthreshv), guv, gu2v);
1281 
1282                                 vfloat crdv = vmul2f(LC2VFU(cfa[indx + v1])) / (epsv + rbintv + LVFU(rbint[(indx1 + v1)]));
1283                                 vfloat gdv = rbintv * crdv;
1284                                 vfloat gd2v = LC2VFU(cfa[indx + v1]) + zd5v * (rbintv - LVFU(rbint[(indx1 + v1)]));
1285                                 gdv = vself(vmaskf_lt(vabsf(onev - crdv), arthreshv), gdv, gd2v);
1286 
1287                                 vfloat Gintvv = (LC2VFU(dirwts0[indx - v1]) * gdv + LC2VFU(dirwts0[indx + v1]) * guv) / (LC2VFU(dirwts0[indx + v1]) + LC2VFU(dirwts0[indx - v1]));
1288                                 vfloat Gint1v = median(Gintvv , LC2VFU(cfa[indx - v1]), LC2VFU(cfa[indx + v1]));
1289                                 vfloat vwtv = vmul2f(rbintv - Gintvv) / (epsv + Gintvv + rbintv);
1290                                 vfloat Gint2v = vintpf(vwtv, Gintvv, Gint1v);
1291                                 Gint1v = vself(vmaskf_lt(vmul2f(Gintvv), rbintv), Gint1v, Gint2v);
1292                                 Gintvv = vself(vmaskf_lt(Gintvv, rbintv), Gint1v, Gintvv);
1293                                 Gintvv = vself(vmaskf_gt(Gintvv, clip_ptv), median(Gintvv, LC2VFU(cfa[indx - v1]), LC2VFU(cfa[indx + v1])), Gintvv);
1294 
1295                                 vfloat crlv = vmul2f(LC2VFU(cfa[indx - 1])) / (epsv + rbintv + LVFU(rbint[(indx1 - 1)]));
1296                                 vfloat glv = rbintv * crlv;
1297                                 vfloat gl2v = LC2VFU(cfa[indx - 1]) + zd5v * (rbintv - LVFU(rbint[(indx1 - 1)]));
1298                                 glv = vself(vmaskf_lt(vabsf(onev - crlv), arthreshv), glv, gl2v);
1299 
1300                                 vfloat crrv = vmul2f(LC2VFU(cfa[indx + 1])) / (epsv + rbintv + LVFU(rbint[(indx1 + 1)]));
1301                                 vfloat grv = rbintv * crrv;
1302                                 vfloat gr2v = LC2VFU(cfa[indx + 1]) + zd5v * (rbintv - LVFU(rbint[(indx1 + 1)]));
1303                                 grv = vself(vmaskf_lt(vabsf(onev - crrv), arthreshv), grv, gr2v);
1304 
1305                                 vfloat Ginthv = (LC2VFU(dirwts1[indx - 1]) * grv + LC2VFU(dirwts1[indx + 1]) * glv) / (LC2VFU(dirwts1[indx - 1]) + LC2VFU(dirwts1[indx + 1]));
1306                                 vfloat Gint1h = median(Ginthv , LC2VFU(cfa[indx - 1]), LC2VFU(cfa[indx + 1]));
1307                                 vfloat hwtv = vmul2f(rbintv - Ginthv) / (epsv + Ginthv + rbintv);
1308                                 vfloat Gint2h = vintpf(hwtv, Ginthv, Gint1h);
1309                                 Gint1h = vself(vmaskf_lt(vmul2f(Ginthv), rbintv), Gint1h, Gint2h);
1310                                 Ginthv = vself(vmaskf_lt(Ginthv, rbintv), Gint1h, Ginthv);
1311                                 Ginthv = vself(vmaskf_gt(Ginthv, clip_ptv), median(Ginthv, LC2VFU(cfa[indx - 1]), LC2VFU(cfa[indx + 1])), Ginthv);
1312 
1313                                 vfloat greenv = vself(copymask, vintpf(LVFU(hvwt[indx1]), Gintvv, Ginthv), LC2VFU(rgbgreen[indx]));
1314                                 STC2VFU(rgbgreen[indx], greenv);
1315 
1316                                 STVFU(Dgrb[0][indx1], vself(copymask, greenv - LC2VFU(cfa[indx]), LVFU(Dgrb[0][indx1])));
1317                             }
1318                         }
1319 
1320 #else
1321 
1322                         for (int cc = 12 + (fc(cfarray, rr, 2) & 1), indx = rr * ts + cc, indx1 = indx >> 1; cc < cc1 - 12; cc += 2, indx += 2, indx1++) {
1323 
1324                             if (fabsf(0.5 - pmwt[indx >> 1]) < fabsf(0.5 - hvwt[indx >> 1]) ) {
1325                                 continue;
1326                             }
1327 
1328                             //now interpolate G vertically/horizontally using R+B values
1329                             //unfortunately, since G interpolation cannot be done diagonally this may lead to colour shifts
1330 
1331                             //colour ratios for G interpolation
1332                             float cru = cfa[indx - v1] * 2.0 / (eps + rbint[indx1] + rbint[(indx1 - v1)]);
1333                             float crd = cfa[indx + v1] * 2.0 / (eps + rbint[indx1] + rbint[(indx1 + v1)]);
1334                             float crl = cfa[indx - 1] * 2.0 / (eps + rbint[indx1] + rbint[(indx1 - 1)]);
1335                             float crr = cfa[indx + 1] * 2.0 / (eps + rbint[indx1] + rbint[(indx1 + 1)]);
1336 
1337                             //interpolation of G in four directions
1338                             float gu, gd, gl, gr;
1339 
1340                             //interpolated G via adaptive ratios or Hamilton-Adams in each cardinal direction
1341                             if (fabsf(1.f - cru) < arthresh) {
1342                                 gu = rbint[indx1] * cru;
1343                             } else {
1344                                 gu = cfa[indx - v1] + xdiv2f(rbint[indx1] - rbint[(indx1 - v1)]);
1345                             }
1346 
1347                             if (fabsf(1.f - crd) < arthresh) {
1348                                 gd = rbint[indx1] * crd;
1349                             } else {
1350                                 gd = cfa[indx + v1] + xdiv2f(rbint[indx1] - rbint[(indx1 + v1)]);
1351                             }
1352 
1353                             if (fabsf(1.f - crl) < arthresh) {
1354                                 gl = rbint[indx1] * crl;
1355                             } else {
1356                                 gl = cfa[indx - 1] + xdiv2f(rbint[indx1] - rbint[(indx1 - 1)]);
1357                             }
1358 
1359                             if (fabsf(1.f - crr) < arthresh) {
1360                                 gr = rbint[indx1] * crr;
1361                             } else {
1362                                 gr = cfa[indx + 1] + xdiv2f(rbint[indx1] - rbint[(indx1 + 1)]);
1363                             }
1364 
1365                             //interpolated G via adaptive weights of cardinal evaluations
1366                             float Gintv = (dirwts0[indx - v1] * gd + dirwts0[indx + v1] * gu) / (dirwts0[indx + v1] + dirwts0[indx - v1]);
1367                             float Ginth = (dirwts1[indx - 1] * gr + dirwts1[indx + 1] * gl) / (dirwts1[indx - 1] + dirwts1[indx + 1]);
1368 
1369                             //bound the interpolation in regions of high saturation
1370                             if (Gintv < rbint[indx1]) {
1371                                 if (2 * Gintv < rbint[indx1]) {
1372                                     Gintv = median(Gintv , cfa[indx - v1], cfa[indx + v1]);
1373                                 } else {
1374                                     float vwt = 2.0 * (rbint[indx1] - Gintv) / (eps + Gintv + rbint[indx1]);
1375                                     Gintv = vwt * Gintv + (1.f - vwt) * median(Gintv, cfa[indx - v1], cfa[indx + v1]);
1376                                 }
1377                             }
1378 
1379                             if (Ginth < rbint[indx1]) {
1380                                 if (2 * Ginth < rbint[indx1]) {
1381                                     Ginth = median(Ginth , cfa[indx - 1], cfa[indx + 1]);
1382                                 } else {
1383                                     float hwt = 2.0 * (rbint[indx1] - Ginth) / (eps + Ginth + rbint[indx1]);
1384                                     Ginth = hwt * Ginth + (1.f - hwt) * median(Ginth, cfa[indx - 1], cfa[indx + 1]);
1385                                 }
1386                             }
1387 
1388                             if (Ginth > clip_pt) {
1389                                 Ginth = median(Ginth, cfa[indx - 1], cfa[indx + 1]);
1390                             }
1391 
1392                             if (Gintv > clip_pt) {
1393                                 Gintv = median(Gintv, cfa[indx - v1], cfa[indx + v1]);
1394                             }
1395 
1396                             rgbgreen[indx] = Ginth * (1.f - hvwt[indx1]) + Gintv * hvwt[indx1];
1397                             Dgrb[0][indx >> 1] = rgbgreen[indx] - cfa[indx];
1398                         }
1399 
1400 #endif
1401 
1402                     //end of diagonal interpolation correction
1403 
1404                     //fancy chrominance interpolation
1405                     //(ey,ex) is location of R site
1406                     for (int rr = 13 - ey; rr < rr1 - 12; rr += 2)
1407                         for (int indx1 = (rr * ts + 13 - ex) >> 1; indx1 < (rr * ts + cc1 - 12) >> 1; indx1++) { //B coset
1408                             Dgrb[1][indx1] = Dgrb[0][indx1]; //split out G-B from G-R
1409                             Dgrb[0][indx1] = 0;
1410                         }
1411 
1412 #ifdef __SSE2__
1413                     vfloat oned325v = F2V( 1.325f );
1414                     vfloat zd175v = F2V( 0.175f );
1415                     vfloat zd075v = F2V( 0.075f );
1416 #endif
1417 
1418                     for (int rr = 14; rr < rr1 - 14; rr++)
1419 #ifdef __SSE2__
1420                         for (int cc = 14 + (fc(cfarray, rr, 2) & 1), indx = rr * ts + cc, c = 1 - fc(cfarray, rr, cc) / 2; cc < cc1 - 14; cc += 8, indx += 8) {
1421                             vfloat tempv = epsv + vabsf(LVFU(Dgrb[c][(indx - m1) >> 1]) - LVFU(Dgrb[c][(indx + m1) >> 1]));
1422                             vfloat temp2v = epsv + vabsf(LVFU(Dgrb[c][(indx + p1) >> 1]) - LVFU(Dgrb[c][(indx - p1) >> 1]));
1423                             vfloat wtnwv = onev / (tempv + vabsf(LVFU(Dgrb[c][(indx - m1) >> 1]) - LVFU(Dgrb[c][(indx - m3) >> 1])) + vabsf(LVFU(Dgrb[c][(indx + m1) >> 1]) - LVFU(Dgrb[c][(indx - m3) >> 1])));
1424                             vfloat wtnev = onev / (temp2v + vabsf(LVFU(Dgrb[c][(indx + p1) >> 1]) - LVFU(Dgrb[c][(indx + p3) >> 1])) + vabsf(LVFU(Dgrb[c][(indx - p1) >> 1]) - LVFU(Dgrb[c][(indx + p3) >> 1])));
1425                             vfloat wtswv = onev / (temp2v + vabsf(LVFU(Dgrb[c][(indx - p1) >> 1]) - LVFU(Dgrb[c][(indx + m3) >> 1])) + vabsf(LVFU(Dgrb[c][(indx + p1) >> 1]) - LVFU(Dgrb[c][(indx - p3) >> 1])));
1426                             vfloat wtsev = onev / (tempv + vabsf(LVFU(Dgrb[c][(indx + m1) >> 1]) - LVFU(Dgrb[c][(indx - p3) >> 1])) + vabsf(LVFU(Dgrb[c][(indx - m1) >> 1]) - LVFU(Dgrb[c][(indx + m3) >> 1])));
1427 
1428                             STVFU(Dgrb[c][indx >> 1], (wtnwv * (oned325v * LVFU(Dgrb[c][(indx - m1) >> 1]) - zd175v * LVFU(Dgrb[c][(indx - m3) >> 1]) - zd075v * (LVFU(Dgrb[c][(indx - m1 - 2) >> 1]) + LVFU(Dgrb[c][(indx - m1 - v2) >> 1])) ) +
1429                                                        wtnev * (oned325v * LVFU(Dgrb[c][(indx + p1) >> 1]) - zd175v * LVFU(Dgrb[c][(indx + p3) >> 1]) - zd075v * (LVFU(Dgrb[c][(indx + p1 + 2) >> 1]) + LVFU(Dgrb[c][(indx + p1 + v2) >> 1])) ) +
1430                                                        wtswv * (oned325v * LVFU(Dgrb[c][(indx - p1) >> 1]) - zd175v * LVFU(Dgrb[c][(indx - p3) >> 1]) - zd075v * (LVFU(Dgrb[c][(indx - p1 - 2) >> 1]) + LVFU(Dgrb[c][(indx - p1 - v2) >> 1])) ) +
1431                                                        wtsev * (oned325v * LVFU(Dgrb[c][(indx + m1) >> 1]) - zd175v * LVFU(Dgrb[c][(indx + m3) >> 1]) - zd075v * (LVFU(Dgrb[c][(indx + m1 + 2) >> 1]) + LVFU(Dgrb[c][(indx + m1 + v2) >> 1])) )) / (wtnwv + wtnev + wtswv + wtsev));
1432                         }
1433 
1434 #else
1435 
1436                         for (int cc = 14 + (fc(cfarray, rr, 2) & 1), indx = rr * ts + cc, c = 1 - fc(cfarray, rr, cc) / 2; cc < cc1 - 14; cc += 2, indx += 2) {
1437                             float wtnw = 1.f / (eps + fabsf(Dgrb[c][(indx - m1) >> 1] - Dgrb[c][(indx + m1) >> 1]) + fabsf(Dgrb[c][(indx - m1) >> 1] - Dgrb[c][(indx - m3) >> 1]) + fabsf(Dgrb[c][(indx + m1) >> 1] - Dgrb[c][(indx - m3) >> 1]));
1438                             float wtne = 1.f / (eps + fabsf(Dgrb[c][(indx + p1) >> 1] - Dgrb[c][(indx - p1) >> 1]) + fabsf(Dgrb[c][(indx + p1) >> 1] - Dgrb[c][(indx + p3) >> 1]) + fabsf(Dgrb[c][(indx - p1) >> 1] - Dgrb[c][(indx + p3) >> 1]));
1439                             float wtsw = 1.f / (eps + fabsf(Dgrb[c][(indx - p1) >> 1] - Dgrb[c][(indx + p1) >> 1]) + fabsf(Dgrb[c][(indx - p1) >> 1] - Dgrb[c][(indx + m3) >> 1]) + fabsf(Dgrb[c][(indx + p1) >> 1] - Dgrb[c][(indx - p3) >> 1]));
1440                             float wtse = 1.f / (eps + fabsf(Dgrb[c][(indx + m1) >> 1] - Dgrb[c][(indx - m1) >> 1]) + fabsf(Dgrb[c][(indx + m1) >> 1] - Dgrb[c][(indx - p3) >> 1]) + fabsf(Dgrb[c][(indx - m1) >> 1] - Dgrb[c][(indx + m3) >> 1]));
1441 
1442                             Dgrb[c][indx >> 1] = (wtnw * (1.325f * Dgrb[c][(indx - m1) >> 1] - 0.175f * Dgrb[c][(indx - m3) >> 1] - 0.075f * Dgrb[c][(indx - m1 - 2) >> 1] - 0.075f * Dgrb[c][(indx - m1 - v2) >> 1] ) +
1443                                                   wtne * (1.325f * Dgrb[c][(indx + p1) >> 1] - 0.175f * Dgrb[c][(indx + p3) >> 1] - 0.075f * Dgrb[c][(indx + p1 + 2) >> 1] - 0.075f * Dgrb[c][(indx + p1 + v2) >> 1] ) +
1444                                                   wtsw * (1.325f * Dgrb[c][(indx - p1) >> 1] - 0.175f * Dgrb[c][(indx - p3) >> 1] - 0.075f * Dgrb[c][(indx - p1 - 2) >> 1] - 0.075f * Dgrb[c][(indx - p1 - v2) >> 1] ) +
1445                                                   wtse * (1.325f * Dgrb[c][(indx + m1) >> 1] - 0.175f * Dgrb[c][(indx + m3) >> 1] - 0.075f * Dgrb[c][(indx + m1 + 2) >> 1] - 0.075f * Dgrb[c][(indx + m1 + v2) >> 1] )) / (wtnw + wtne + wtsw + wtse);
1446                         }
1447 
1448 #endif
1449 
1450 #ifdef __SSE2__
1451                     int offset;
1452                     vfloat twov = F2V(2.f);
1453                     vfloat coutscalev = F2V(outputScale);
1454                     vmask selmask;
1455 
1456                     if((fc(cfarray, 16, 2) & 1) == 1) {
1457                         selmask = _mm_set_epi32(0xffffffff, 0, 0xffffffff, 0);
1458                         offset = 1;
1459                     } else {
1460                         selmask = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);
1461                         offset = 0;
1462                     }
1463 
1464 #endif
1465 
1466                     for (int rr = 16; rr < rr1 - 16; rr++) {
1467                         int row = rr + top;
1468                         int col = left + 16;
1469                         int indx = rr * ts + 16;
1470 #ifdef __SSE2__
1471                         offset = 1 - offset;
1472                         selmask = vnotm(selmask);
1473 
1474                         for (; indx < rr * ts + cc1 - 18 - (cc1 & 1); indx += 4, col += 4) {
1475                             vfloat greenv = LVF(rgbgreen[indx]);
1476                             vfloat temp00v = vdup(LVFU(hvwt[(indx - v1) >> 1]));
1477                             vfloat temp01v = vdup(LVFU(hvwt[(indx + v1) >> 1]));
1478                             vfloat tempv =  onev / (temp00v + twov - vdup(LVFU(hvwt[(indx + 1 + offset) >> 1])) - vdup(LVFU(hvwt[(indx - 1 + offset) >> 1])) + temp01v);
1479 
1480                             vfloat redv1  = greenv - (temp00v * vdup(LVFU(Dgrb[0][(indx - v1) >> 1])) + (onev - vdup(LVFU(hvwt[(indx + 1 + offset) >> 1]))) * vdup(LVFU(Dgrb[0][(indx + 1 + offset) >> 1])) + (onev - vdup(LVFU(hvwt[(indx - 1 + offset) >> 1]))) * vdup(LVFU(Dgrb[0][(indx - 1 + offset) >> 1])) + temp01v * vdup(LVFU(Dgrb[0][(indx + v1) >> 1]))) * tempv;
1481                             vfloat bluev1 = greenv - (temp00v * vdup(LVFU(Dgrb[1][(indx - v1) >> 1])) + (onev - vdup(LVFU(hvwt[(indx + 1 + offset) >> 1]))) * vdup(LVFU(Dgrb[1][(indx + 1 + offset) >> 1])) + (onev - vdup(LVFU(hvwt[(indx - 1 + offset) >> 1]))) * vdup(LVFU(Dgrb[1][(indx - 1 + offset) >> 1])) + temp01v * vdup(LVFU(Dgrb[1][(indx + v1) >> 1]))) * tempv;
1482                             vfloat redv2  = greenv - vdup(LVFU(Dgrb[0][indx >> 1]));
1483                             vfloat bluev2 = greenv - vdup(LVFU(Dgrb[1][indx >> 1]));
1484                             STVFU(red[row][col], coutscalev * vself(selmask, redv1, redv2));
1485                             STVFU(blue[row][col], coutscalev * vself(selmask, bluev1, bluev2));
1486                         }
1487 
1488                         if(offset == 0) {
1489                             for (; indx < rr * ts + cc1 - 16 - (cc1 & 1); indx++, col++) {
1490                                 float temp =  1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
1491                                 red[row][col] = outputScale * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) *
1492                                                             temp);
1493                                 blue[row][col] = outputScale * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) *
1494                                                              temp);
1495 
1496                                 indx++;
1497                                 col++;
1498                                 red[row][col] = outputScale * (rgbgreen[indx] - Dgrb[0][indx >> 1]);
1499                                 blue[row][col] = outputScale * (rgbgreen[indx] - Dgrb[1][indx >> 1]);
1500                             }
1501 
1502                             if(cc1 & 1) { // width of tile is odd
1503                                 float temp =  1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
1504                                 red[row][col] = outputScale * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) *
1505                                                             temp);
1506                                 blue[row][col] = outputScale * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) *
1507                                                              temp);
1508                             }
1509                         } else {
1510                             for (; indx < rr * ts + cc1 - 16 - (cc1 & 1); indx++, col++) {
1511                                 red[row][col] = outputScale * (rgbgreen[indx] - Dgrb[0][indx >> 1]);
1512                                 blue[row][col] = outputScale * (rgbgreen[indx] - Dgrb[1][indx >> 1]);
1513 
1514                                 indx++;
1515                                 col++;
1516                                 float temp =  1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
1517                                 red[row][col] = outputScale * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) *
1518                                                             temp);
1519                                 blue[row][col] = outputScale * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) *
1520                                                              temp);
1521                             }
1522 
1523                             if(cc1 & 1) { // width of tile is odd
1524                                 red[row][col] = outputScale * (rgbgreen[indx] - Dgrb[0][indx >> 1]);
1525                                 blue[row][col] = outputScale * (rgbgreen[indx] - Dgrb[1][indx >> 1]);
1526                             }
1527                         }
1528 
1529 #else
1530 
1531                         if((fc(cfarray, rr, 2) & 1) == 1) {
1532                             for (; indx < rr * ts + cc1 - 16 - (cc1 & 1); indx++, col++) {
1533                                 float temp =  1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
1534                                 red[row][col] = outputScale * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) *
1535                                                             temp);
1536                                 blue[row][col] = outputScale * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) *
1537                                                              temp);
1538 
1539                                 indx++;
1540                                 col++;
1541                                 red[row][col] = outputScale* (rgbgreen[indx] - Dgrb[0][indx >> 1]);
1542                                 blue[row][col] = outputScale * (rgbgreen[indx] - Dgrb[1][indx >> 1]);
1543                             }
1544 
1545                             if(cc1 & 1) { // width of tile is odd
1546                                 float temp =  1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
1547                                 red[row][col] = outputScale * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) *
1548                                                             temp);
1549                                 blue[row][col] = outputScale * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) *
1550                                                              temp);
1551                             }
1552                         } else {
1553                             for (; indx < rr * ts + cc1 - 16 - (cc1 & 1); indx++, col++) {
1554                                 red[row][col] = outputScale * (rgbgreen[indx] - Dgrb[0][indx >> 1]);
1555                                 blue[row][col] = outputScale * (rgbgreen[indx] - Dgrb[1][indx >> 1]);
1556 
1557                                 indx++;
1558                                 col++;
1559                                 float temp =  1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
1560                                 red[row][col] = outputScale * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) *
1561                                                             temp);
1562                                 blue[row][col] = outputScale * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) *
1563                                                              temp);
1564                             }
1565 
1566                             if(cc1 & 1) { // width of tile is odd
1567                                 red[row][col] = outputScale * (rgbgreen[indx] - Dgrb[0][indx >> 1]);
1568                                 blue[row][col] = outputScale * (rgbgreen[indx] - Dgrb[1][indx >> 1]);
1569                             }
1570                         }
1571 
1572 #endif
1573                     }
1574 
1575                     // copy smoothed results back to image matrix
1576                     for (int rr = 16; rr < rr1 - 16; rr++) {
1577                         int row = rr + top;
1578                         int cc = 16;
1579 #ifdef __SSE2__
1580 
1581                         for (; cc < cc1 - 19; cc += 4) {
1582                             STVFU(green[row][cc + left], LVF(rgbgreen[rr * ts + cc]) * coutscalev);
1583                         }
1584 
1585 #endif
1586 
1587                         for (; cc < cc1 - 16; cc++) {
1588                             green[row][cc + left] = outputScale * rgbgreen[rr * ts + cc];
1589                         }
1590                     }
1591 
1592                     progresscounter++;
1593 
1594                     if(progresscounter % 32 == 0) {
1595 #ifdef _OPENMP
1596                         #pragma omp critical (amazeprogress)
1597 #endif
1598                         {
1599                             progress += 32.0 * SQR(ts - 32) / (height * width);
1600                             progress = std::min(progress, 1.0);
1601                             setProgCancel(progress);
1602                         }
1603                     }
1604                 }
1605             }  //end of main loop
1606         }
1607         // clean up
1608         if (buffer) {
1609             free(buffer);
1610         }
1611     }
1612     if(border < 4 && rc == RP_NO_ERROR) {
1613         rc = bayerborder_demosaic(width, height, 3, rawData, red, green, blue, cfarray);
1614     }
1615 
1616     setProgCancel(1.0);
1617 
1618     return rc;
1619 }
1620