1 ////////////////////////////////////////////////////////////////
2 //
3 //          AMaZE demosaic algorithm
4 // (Aliasing Minimization and Zipper Elimination)
5 //
6 //  copyright (c) 2008-2010  Emil Martinec <ejmartin@uchicago.edu>
7 //  optimized for speed by Ingo Weyrich
8 //
9 // incorporating ideas of Luis Sanz Rodrigues and Paul Lee
10 //
11 // code dated: May 27, 2010
12 // latest modification: Ingo Weyrich, January 25, 2016
13 //
14 //  amaze_interpolate_RT.cc is free software: you can redistribute it and/or modify
15 //  it under the terms of the GNU General Public License as published by
16 //  the Free Software Foundation, either version 3 of the License, or
17 //  (at your option) any later version.
18 //
19 //  This program is distributed in the hope that it will be useful,
20 //  but WITHOUT ANY WARRANTY; without even the implied warranty of
21 //  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
22 //  GNU General Public License for more details.
23 //
24 //  You should have received a copy of the GNU General Public License
25 //  along with this program.  If not, see <https://www.gnu.org/licenses/>.
26 //
27 ////////////////////////////////////////////////////////////////
28 
29 #include "rtengine.h"
30 #include "rawimagesource.h"
31 #include "rt_math.h"
32 #include "../rtgui/multilangmgr.h"
33 #include "sleef.h"
34 #include "opthelper.h"
35 #include "median.h"
36 #include "StopWatch.h"
37 
38 namespace
39 {
fc(const unsigned int cfa[2][2],int r,int c)40 unsigned fc(const unsigned int cfa[2][2], int r, int c) {
41     return cfa[r & 1][c & 1];
42 }
43 }
44 
45 namespace rtengine
46 {
47 
amaze_demosaic_RT(int winx,int winy,int winw,int winh,const array2D<float> & rawData,array2D<float> & red,array2D<float> & green,array2D<float> & blue,size_t chunkSize,bool measure)48 void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, int winh, const array2D<float> &rawData, array2D<float> &red, array2D<float> &green, array2D<float> &blue, size_t chunkSize, bool measure)
49 {
50 
51     std::unique_ptr<StopWatch> stop;
52 
53     if (measure) {
54         std::cout << "Demosaicing " << W << "x" << H << " image using AMaZE with " << chunkSize << " Tiles per Thread" << std::endl;
55         stop.reset(new StopWatch("amaze demosaic"));
56     }
57 
58     double progress = 0.0;
59 
60     if (plistener) {
61         plistener->setProgressStr(Glib::ustring::compose(M("TP_RAW_DMETHOD_PROGRESSBAR"), M("TP_RAW_AMAZE")));
62         plistener->setProgress(progress);
63     }
64 
65     const unsigned int cfarray[2][2] = {{FC(0,0), FC(0,1)}, {FC(1,0), FC(1,1)}};
66     const int width = winw, height = winh;
67     const float clip_pt = 1.0 / initialGain;
68     const float clip_pt8 = 0.8 / initialGain;
69 
70 // this allows to pass AMAZETS to the code. On some machines larger AMAZETS is faster
71 // If AMAZETS is undefined it will be set to 160, which is the fastest on modern x86/64 machines
72 #ifndef AMAZETS
73 #define AMAZETS 160
74 #endif
75     // Tile size; the image is processed in square tiles to lower memory requirements and facilitate multi-threading
76     // We assure that Tile size is a multiple of 32 in the range [96;992]
77     constexpr int ts = (AMAZETS & 992) < 96 ? 96 : (AMAZETS & 992);
78     constexpr int tsh = ts / 2; // half of Tile size
79 
80     //offset of R pixel within a Bayer quartet
81     int ex, ey;
82 
83     //determine GRBG coset; (ey,ex) is the offset of the R subarray
84     if (fc(cfarray, 0, 0) == 1) { //first pixel is G
85         if (fc(cfarray, 0, 1) == 0) {
86             ey = 0;
87             ex = 1;
88         } else {
89             ey = 1;
90             ex = 0;
91         }
92     } else {//first pixel is R or B
93         if (fc(cfarray, 0, 0) == 0) {
94             ey = 0;
95             ex = 0;
96         } else {
97             ey = 1;
98             ex = 1;
99         }
100     }
101 
102     //shifts of pointer value to access pixels in vertical and diagonal directions
103     constexpr int v1 = ts, v2 = 2 * ts, v3 = 3 * ts, p1 = -ts + 1, p2 = -2 * ts + 2, p3 = -3 * ts + 3, m1 = ts + 1, m2 = 2 * ts + 2, m3 = 3 * ts + 3;
104 
105     //tolerance to avoid dividing by zero
106     constexpr float eps = 1e-5, epssq = 1e-10;       //tolerance to avoid dividing by zero
107 
108     //adaptive ratios threshold
109     constexpr float arthresh = 0.75;
110 
111     //gaussian on 5x5 quincunx, sigma=1.2
112     constexpr float gaussodd[4] = {0.14659727707323927f, 0.103592713382435f, 0.0732036125103057f, 0.0365543548389495f};
113     //nyquist texture test threshold
114     constexpr float nyqthresh = 0.5;
115     //gaussian on 5x5, sigma=1.2, multiplied with nyqthresh to save some time later in loop
116     // Is this really sigma=1.2????, seems more like sigma = 1.672
117     constexpr float gaussgrad[6] = {nyqthresh * 0.07384411893421103f, nyqthresh * 0.06207511968171489f, nyqthresh * 0.0521818194747806f,
118                                     nyqthresh * 0.03687419286733595f, nyqthresh * 0.03099732204057846f, nyqthresh * 0.018413194161458882f
119                                    };
120     //gaussian on 5x5 alt quincunx, sigma=1.5
121     constexpr float gausseven[2] = {0.13719494435797422f, 0.05640252782101291f};
122     //gaussian on quincunx grid
123     constexpr float gquinc[4] = {0.169917f, 0.108947f, 0.069855f, 0.0287182f};
124 
125     typedef struct {
126         float h;
127         float v;
128     } s_hv;
129 
130 #ifdef _OPENMP
131     #pragma omp parallel
132 #endif
133     {
134         int progresscounter = 0;
135 
136         constexpr int cldf = 2; // factor to multiply cache line distance. 1 = 64 bytes, 2 = 128 bytes ...
137         // assign working space
138         char *buffer = (char *) calloc(14 * sizeof(float) * ts * ts + sizeof(char) * ts * tsh + 18 * cldf * 64 + 63, 1);
139         // aligned to 64 byte boundary
140         char *data = (char*)( ( uintptr_t(buffer) + uintptr_t(63)) / 64 * 64);
141 
142         // green values
143         float *rgbgreen         = (float (*))         data;
144         // sum of square of horizontal gradient and square of vertical gradient
145         float *delhvsqsum       = (float (*))         ((char*)rgbgreen + sizeof(float) * ts * ts + cldf * 64);       // 1
146         // gradient based directional weights for interpolation
147         float *dirwts0          = (float (*))         ((char*)delhvsqsum + sizeof(float) * ts * ts + cldf * 64);     // 1
148         float *dirwts1          = (float (*))         ((char*)dirwts0 + sizeof(float) * ts * ts + cldf * 64);        // 1
149         // vertically interpolated colour differences G-R, G-B
150         float *vcd              = (float (*))         ((char*)dirwts1 + sizeof(float) * ts * ts + cldf * 64);        // 1
151         // horizontally interpolated colour differences
152         float *hcd              = (float (*))         ((char*)vcd + sizeof(float) * ts * ts + cldf * 64);            // 1
153         // alternative vertical interpolation
154         float *vcdalt           = (float (*))         ((char*)hcd + sizeof(float) * ts * ts + cldf * 64);            // 1
155         // alternative horizontal interpolation
156         float *hcdalt           = (float (*))         ((char*)vcdalt + sizeof(float) * ts * ts + cldf * 64);         // 1
157         // square of average colour difference
158         float *cddiffsq         = (float (*))         ((char*)hcdalt + sizeof(float) * ts * ts + cldf * 64);         // 1
159         // weight to give horizontal vs vertical interpolation
160         float *hvwt             = (float (*))         ((char*)cddiffsq + sizeof(float) * ts * ts + 2 * cldf * 64);   // 1
161         // final interpolated colour difference
162         float (*Dgrb)[ts * tsh] = (float (*)[ts * tsh])vcdalt; // there is no overlap in buffer usage => share
163         // gradient in plus (NE/SW) direction
164         float *delp             = (float (*))cddiffsq; // there is no overlap in buffer usage => share
165         // gradient in minus (NW/SE) direction
166         float *delm             = (float (*))         ((char*)delp + sizeof(float) * ts * tsh + cldf * 64);
167         // diagonal interpolation of R+B
168         float *rbint            = (float (*))delm; // there is no overlap in buffer usage => share
169         // horizontal and vertical curvature of interpolated G (used to refine interpolation in Nyquist texture regions)
170         s_hv  *Dgrb2            = (s_hv  (*))         ((char*)hvwt + sizeof(float) * ts * tsh + cldf * 64);          // 1
171         // difference between up/down interpolations of G
172         float *dgintv           = (float (*))Dgrb2;   // there is no overlap in buffer usage => share
173         // difference between left/right interpolations of G
174         float *dginth           = (float (*))         ((char*)dgintv + sizeof(float) * ts * ts + cldf * 64);         // 1
175         // square of diagonal colour differences
176         float *Dgrbsq1m         = (float (*))         ((char*)dginth + sizeof(float) * ts * ts + cldf * 64);         // 1
177         float *Dgrbsq1p         = (float (*))         ((char*)Dgrbsq1m + sizeof(float) * ts * tsh + cldf * 64);      // 1
178         // tile raw data
179         float *cfa              = (float (*))         ((char*)Dgrbsq1p + sizeof(float) * ts * tsh + cldf * 64);      // 1
180         // relative weight for combining plus and minus diagonal interpolations
181         float *pmwt             = (float (*))delhvsqsum;  // there is no overlap in buffer usage => share
182         // interpolated colour difference R-B in minus and plus direction
183         float *rbm              = (float (*))vcd;  // there is no overlap in buffer usage => share
184         float *rbp              = (float (*))         ((char*)rbm + sizeof(float) * ts * tsh + cldf * 64);
185         // nyquist texture flags 1=nyquist, 0=not nyquist
186         unsigned char *nyquist  = (unsigned char (*)) ((char*)cfa + sizeof(float) * ts * ts + cldf * 64);            // 1
187         unsigned char *nyquist2 = (unsigned char (*))cddiffsq;
188         float *nyqutest = (float(*)) ((char*)nyquist + sizeof(unsigned char) * ts * tsh + cldf * 64);                // 1
189 
190         // Main algorithm: Tile loop
191         // use collapse(2) to collapse the 2 loops to one large loop, so there is better scaling
192 #ifdef _OPENMP
193         #pragma omp for schedule(dynamic, chunkSize) collapse(2) nowait
194 #endif
195 
196         for (int top = winy - 16; top < winy + height; top += ts - 32) {
197             for (int left = winx - 16; left < winx + width; left += ts - 32) {
198                 memset(&nyquist[3 * tsh], 0, sizeof(unsigned char) * (ts - 6) * tsh);
199                 //location of tile bottom edge
200                 int bottom = min(top + ts, winy + height + 16);
201                 //location of tile right edge
202                 int right  = min(left + ts, winx + width + 16);
203                 //tile width  (=ts except for right edge of image)
204                 int rr1 = bottom - top;
205                 //tile height (=ts except for bottom edge of image)
206                 int cc1 = right - left;
207                 // bookkeeping for borders
208                 // min and max row/column in the tile
209                 int rrmin = top < winy ? 16 : 0;
210                 int ccmin = left < winx ? 16 : 0;
211                 int rrmax = bottom > (winy + height) ? winy + height - top : rr1;
212                 int ccmax = right > (winx + width) ? winx + width - left : cc1;
213 
214                 // rgb from input CFA data
215                 // rgb values should be floating point number between 0 and 1
216                 // after white balance multipliers are applied
217                 // a 16 pixel border is added to each side of the image
218 
219                 // begin of tile initialization
220 #ifdef __SSE2__
221                 vfloat c65535v = F2V( 65535.f );
222 
223                 //fill upper border
224                 if (rrmin > 0) {
225                     for (int rr = 0; rr < 16; rr++) {
226                         int row = 32 - rr + top;
227 
228                         for (int cc = ccmin; cc < ccmax; cc += 4) {
229                             int indx1 = rr * ts + cc;
230                             vfloat tempv = LVFU(rawData[row][cc + left]) / c65535v;
231                             STVF(cfa[indx1], tempv);
232                             STVF(rgbgreen[indx1], tempv );
233                         }
234                     }
235                 }
236 
237                 // fill inner part
238                 for (int rr = rrmin; rr < rrmax; rr++) {
239                     int row = rr + top;
240                     int cc = ccmin;
241                     for (; cc < ccmax - 3; cc += 4) {
242                         int indx1 = rr * ts + cc;
243                         vfloat tempv = LVFU(rawData[row][cc + left]) / c65535v;
244                         STVF(cfa[indx1], tempv );
245                         STVF(rgbgreen[indx1], tempv );
246                     }
247                     for (; cc < ccmax; ++cc) {
248                         int indx1 = rr * ts + cc;
249                         float temp = rawData[row][cc + left] / 65535.f;
250                         cfa[indx1] = temp;
251                         rgbgreen[indx1] = temp;
252                     }
253                 }
254 
255                 //fill lower border
256                 if (rrmax < rr1) {
257                     for (int rr = 0; rr < 16; rr++)
258                         for (int cc = ccmin; cc < ccmax; cc += 4) {
259                             int indx1 = (rrmax + rr) * ts + cc;
260                             vfloat tempv = LVFU(rawData[(winy + height - rr - 2)][left + cc]) / c65535v;
261                             STVF(cfa[indx1], tempv );
262                             STVF(rgbgreen[indx1], tempv );
263                         }
264                 }
265 
266 #else
267 
268                 //fill upper border
269                 if (rrmin > 0) {
270                     for (int rr = 0; rr < 16; rr++)
271                         for (int cc = ccmin, row = 32 - rr + top; cc < ccmax; cc++) {
272                             cfa[rr * ts + cc] = (rawData[row][cc + left]) / 65535.f;
273                             rgbgreen[rr * ts + cc] = cfa[rr * ts + cc];
274                         }
275                 }
276 
277                 // fill inner part
278                 for (int rr = rrmin; rr < rrmax; rr++) {
279                     int row = rr + top;
280 
281                     for (int cc = ccmin; cc < ccmax; cc++) {
282                         int indx1 = rr * ts + cc;
283                         cfa[indx1] = (rawData[row][cc + left]) / 65535.f;
284                         rgbgreen[indx1] = cfa[indx1];
285                     }
286                 }
287 
288                 //fill lower border
289                 if (rrmax < rr1) {
290                     for (int rr = 0; rr < 16; rr++)
291                         for (int cc = ccmin; cc < ccmax; cc++) {
292                             cfa[(rrmax + rr)*ts + cc] = (rawData[(winy + height - rr - 2)][left + cc]) / 65535.f;
293                             rgbgreen[(rrmax + rr)*ts + cc] = cfa[(rrmax + rr) * ts + cc];
294                         }
295                 }
296 
297 #endif
298 
299                 //fill left border
300                 if (ccmin > 0) {
301                     for (int rr = rrmin; rr < rrmax; rr++)
302                         for (int cc = 0, row = rr + top; cc < 16; cc++) {
303                             cfa[rr * ts + cc] = (rawData[row][32 - cc + left]) / 65535.f;
304                             rgbgreen[rr * ts + cc] = cfa[rr * ts + cc];
305                         }
306                 }
307 
308                 //fill right border
309                 if (ccmax < cc1) {
310                     for (int rr = rrmin; rr < rrmax; rr++)
311                         for (int cc = 0; cc < 16; cc++) {
312                             cfa[rr * ts + ccmax + cc] = (rawData[(top + rr)][(winx + width - cc - 2)]) / 65535.f;
313                             rgbgreen[rr * ts + ccmax + cc] = cfa[rr * ts + ccmax + cc];
314                         }
315                 }
316 
317                 //also, fill the image corners
318                 if (rrmin > 0 && ccmin > 0) {
319                     for (int rr = 0; rr < 16; rr++)
320                         for (int cc = 0; cc < 16; cc++) {
321                             cfa[(rr)*ts + cc] = (rawData[winy + 32 - rr][winx + 32 - cc]) / 65535.f;
322                             rgbgreen[(rr)*ts + cc] = cfa[(rr) * ts + cc];
323                         }
324                 }
325 
326                 if (rrmax < rr1 && ccmax < cc1) {
327                     for (int rr = 0; rr < 16; rr++)
328                         for (int cc = 0; cc < 16; cc++) {
329                             cfa[(rrmax + rr)*ts + ccmax + cc] = (rawData[(winy + height - rr - 2)][(winx + width - cc - 2)]) / 65535.f;
330                             rgbgreen[(rrmax + rr)*ts + ccmax + cc] = cfa[(rrmax + rr) * ts + ccmax + cc];
331                         }
332                 }
333 
334                 if (rrmin > 0 && ccmax < cc1) {
335                     for (int rr = 0; rr < 16; rr++)
336                         for (int cc = 0; cc < 16; cc++) {
337                             cfa[(rr)*ts + ccmax + cc] = (rawData[(winy + 32 - rr)][(winx + width - cc - 2)]) / 65535.f;
338                             rgbgreen[(rr)*ts + ccmax + cc] = cfa[(rr) * ts + ccmax + cc];
339                         }
340                 }
341 
342                 if (rrmax < rr1 && ccmin > 0) {
343                     for (int rr = 0; rr < 16; rr++)
344                         for (int cc = 0; cc < 16; cc++) {
345                             cfa[(rrmax + rr)*ts + cc] = (rawData[(winy + height - rr - 2)][(winx + 32 - cc)]) / 65535.f;
346                             rgbgreen[(rrmax + rr)*ts + cc] = cfa[(rrmax + rr) * ts + cc];
347                         }
348                 }
349 
350                 // end of tile initialization
351 
352                 // horizontal and vertical gradients
353 #ifdef __SSE2__
354                 vfloat epsv = F2V( eps );
355 
356                 for (int rr = 2; rr < rr1 - 2; rr++) {
357                     for (int indx = rr * ts; indx < rr * ts + cc1; indx += 4) {
358                         vfloat delhv = vabsf( LVFU( cfa[indx + 1] ) -  LVFU( cfa[indx - 1] ) );
359                         vfloat delvv = vabsf( LVF( cfa[indx + v1] ) -  LVF( cfa[indx - v1] ) );
360                         STVF(dirwts1[indx], epsv + vabsf( LVFU( cfa[indx + 2] ) - LVF( cfa[indx] )) + vabsf( LVF( cfa[indx] ) - LVFU( cfa[indx - 2] )) + delhv );
361                         STVF(dirwts0[indx], epsv + vabsf( LVF( cfa[indx + v2] ) - LVF( cfa[indx] )) + vabsf( LVF( cfa[indx] ) - LVF( cfa[indx - v2] )) + delvv );
362                         STVF(delhvsqsum[indx], SQRV(delhv) + SQRV(delvv));
363                     }
364                 }
365 
366 #else
367 
368                 for (int rr = 2; rr < rr1 - 2; rr++)
369                     for (int cc = 2, indx = (rr) * ts + cc; cc < cc1 - 2; cc++, indx++) {
370                         float delh = fabsf(cfa[indx + 1] - cfa[indx - 1]);
371                         float delv = fabsf(cfa[indx + v1] - cfa[indx - v1]);
372                         dirwts0[indx] = eps + fabsf(cfa[indx + v2] - cfa[indx]) + fabsf(cfa[indx] - cfa[indx - v2]) + delv;
373                         dirwts1[indx] = eps + fabsf(cfa[indx + 2] - cfa[indx]) + fabsf(cfa[indx] - cfa[indx - 2]) + delh;
374                         delhvsqsum[indx] = SQR(delh) + SQR(delv);
375                     }
376 
377 #endif
378 
379                 //interpolate vertical and horizontal colour differences
380 #ifdef __SSE2__
381                 vfloat sgnv;
382 
383                 if( !(fc(cfarray, 4, 4) & 1) ) {
384                     sgnv = _mm_set_ps( 1.f, -1.f, 1.f, -1.f );
385                 } else {
386                     sgnv = _mm_set_ps( -1.f, 1.f, -1.f, 1.f );
387                 }
388 
389                 vfloat zd5v = F2V( 0.5f );
390                 vfloat onev = F2V( 1.f );
391                 vfloat arthreshv = F2V( arthresh );
392                 vfloat clip_pt8v = F2V( clip_pt8 );
393 
394                 for (int rr = 4; rr < rr1 - 4; rr++) {
395                     sgnv = -sgnv;
396 
397                     for (int indx = rr * ts + 4; indx < rr * ts + cc1 - 7; indx += 4) {
398                         //colour ratios in each cardinal direction
399                         vfloat cfav = LVF(cfa[indx]);
400                         vfloat cruv = LVF(cfa[indx - v1]) * (LVF(dirwts0[indx - v2]) + LVF(dirwts0[indx])) / (LVF(dirwts0[indx - v2]) * (epsv + cfav) + LVF(dirwts0[indx]) * (epsv + LVF(cfa[indx - v2])));
401                         vfloat crdv = LVF(cfa[indx + v1]) * (LVF(dirwts0[indx + v2]) + LVF(dirwts0[indx])) / (LVF(dirwts0[indx + v2]) * (epsv + cfav) + LVF(dirwts0[indx]) * (epsv + LVF(cfa[indx + v2])));
402                         vfloat crlv = LVFU(cfa[indx - 1]) * (LVFU(dirwts1[indx - 2]) + LVF(dirwts1[indx])) / (LVFU(dirwts1[indx - 2]) * (epsv + cfav) + LVF(dirwts1[indx]) * (epsv + LVFU(cfa[indx - 2])));
403                         vfloat crrv = LVFU(cfa[indx + 1]) * (LVFU(dirwts1[indx + 2]) + LVF(dirwts1[indx])) / (LVFU(dirwts1[indx + 2]) * (epsv + cfav) + LVF(dirwts1[indx]) * (epsv + LVFU(cfa[indx + 2])));
404 
405                         //G interpolated in vert/hor directions using Hamilton-Adams method
406                         vfloat guhav = LVF(cfa[indx - v1]) + zd5v * (cfav - LVF(cfa[indx - v2]));
407                         vfloat gdhav = LVF(cfa[indx + v1]) + zd5v * (cfav - LVF(cfa[indx + v2]));
408                         vfloat glhav = LVFU(cfa[indx - 1]) + zd5v * (cfav - LVFU(cfa[indx - 2]));
409                         vfloat grhav = LVFU(cfa[indx + 1]) + zd5v * (cfav - LVFU(cfa[indx + 2]));
410 
411                         //G interpolated in vert/hor directions using adaptive ratios
412                         vfloat guarv = vself(vmaskf_lt(vabsf(onev - cruv), arthreshv), cfav * cruv, guhav);
413                         vfloat gdarv = vself(vmaskf_lt(vabsf(onev - crdv), arthreshv), cfav * crdv, gdhav);
414                         vfloat glarv = vself(vmaskf_lt(vabsf(onev - crlv), arthreshv), cfav * crlv, glhav);
415                         vfloat grarv = vself(vmaskf_lt(vabsf(onev - crrv), arthreshv), cfav * crrv, grhav);
416 
417                         //adaptive weights for vertical/horizontal directions
418                         vfloat hwtv = LVFU(dirwts1[indx - 1]) / (LVFU(dirwts1[indx - 1]) + LVFU(dirwts1[indx + 1]));
419                         vfloat vwtv = LVF(dirwts0[indx - v1]) / (LVF(dirwts0[indx + v1]) + LVF(dirwts0[indx - v1]));
420 
421                         //interpolated G via adaptive weights of cardinal evaluations
422                         vfloat Ginthhav = vintpf(hwtv, grhav, glhav);
423                         vfloat Gintvhav = vintpf(vwtv, gdhav, guhav);
424 
425                         //interpolated colour differences
426                         vfloat hcdaltv = sgnv * (Ginthhav - cfav);
427                         vfloat vcdaltv = sgnv * (Gintvhav - cfav);
428                         STVF(hcdalt[indx], hcdaltv);
429                         STVF(vcdalt[indx], vcdaltv);
430 
431                         vmask clipmask = vorm( vorm( vmaskf_gt( cfav, clip_pt8v ), vmaskf_gt( Gintvhav, clip_pt8v ) ), vmaskf_gt( Ginthhav, clip_pt8v ));
432                         guarv = vself( clipmask, guhav, guarv);
433                         gdarv = vself( clipmask, gdhav, gdarv);
434                         glarv = vself( clipmask, glhav, glarv);
435                         grarv = vself( clipmask, grhav, grarv);
436 
437                         //use HA if highlights are (nearly) clipped
438                         STVF(vcd[indx], vself( clipmask, vcdaltv, sgnv * (vintpf(vwtv, gdarv, guarv) - cfav)));
439                         STVF(hcd[indx], vself( clipmask, hcdaltv, sgnv * (vintpf(hwtv, grarv, glarv) - cfav)));
440 
441                         //differences of interpolations in opposite directions
442                         STVF(dgintv[indx], vminf(SQRV(guhav - gdhav), SQRV(guarv - gdarv)));
443                         STVF(dginth[indx], vminf(SQRV(glhav - grhav), SQRV(glarv - grarv)));
444                     }
445                 }
446 
447 #else
448 
449                 for (int rr = 4; rr < rr1 - 4; rr++) {
450                     bool fcswitch = fc(cfarray, rr, 4) & 1;
451 
452                     for (int cc = 4, indx = rr * ts + cc; cc < cc1 - 4; cc++, indx++) {
453 
454                         //colour ratios in each cardinal direction
455                         float cru = cfa[indx - v1] * (dirwts0[indx - v2] + dirwts0[indx]) / (dirwts0[indx - v2] * (eps + cfa[indx]) + dirwts0[indx] * (eps + cfa[indx - v2]));
456                         float crd = cfa[indx + v1] * (dirwts0[indx + v2] + dirwts0[indx]) / (dirwts0[indx + v2] * (eps + cfa[indx]) + dirwts0[indx] * (eps + cfa[indx + v2]));
457                         float crl = cfa[indx - 1] * (dirwts1[indx - 2] + dirwts1[indx]) / (dirwts1[indx - 2] * (eps + cfa[indx]) + dirwts1[indx] * (eps + cfa[indx - 2]));
458                         float crr = cfa[indx + 1] * (dirwts1[indx + 2] + dirwts1[indx]) / (dirwts1[indx + 2] * (eps + cfa[indx]) + dirwts1[indx] * (eps + cfa[indx + 2]));
459 
460                         //G interpolated in vert/hor directions using Hamilton-Adams method
461                         float guha = cfa[indx - v1] + xdiv2f(cfa[indx] - cfa[indx - v2]);
462                         float gdha = cfa[indx + v1] + xdiv2f(cfa[indx] - cfa[indx + v2]);
463                         float glha = cfa[indx - 1] + xdiv2f(cfa[indx] - cfa[indx - 2]);
464                         float grha = cfa[indx + 1] + xdiv2f(cfa[indx] - cfa[indx + 2]);
465 
466                         //G interpolated in vert/hor directions using adaptive ratios
467                         float guar, gdar, glar, grar;
468 
469                         if (fabsf(1.f - cru) < arthresh) {
470                             guar = cfa[indx] * cru;
471                         } else {
472                             guar = guha;
473                         }
474 
475                         if (fabsf(1.f - crd) < arthresh) {
476                             gdar = cfa[indx] * crd;
477                         } else {
478                             gdar = gdha;
479                         }
480 
481                         if (fabsf(1.f - crl) < arthresh) {
482                             glar = cfa[indx] * crl;
483                         } else {
484                             glar = glha;
485                         }
486 
487                         if (fabsf(1.f - crr) < arthresh) {
488                             grar = cfa[indx] * crr;
489                         } else {
490                             grar = grha;
491                         }
492 
493                         //adaptive weights for vertical/horizontal directions
494                         float hwt = dirwts1[indx - 1] / (dirwts1[indx - 1] + dirwts1[indx + 1]);
495                         float vwt = dirwts0[indx - v1] / (dirwts0[indx + v1] + dirwts0[indx - v1]);
496 
497                         //interpolated G via adaptive weights of cardinal evaluations
498                         float Gintvha = vwt * gdha + (1.f - vwt) * guha;
499                         float Ginthha = hwt * grha + (1.f - hwt) * glha;
500 
501                         //interpolated colour differences
502                         if (fcswitch) {
503                             vcd[indx] = cfa[indx] - (vwt * gdar + (1.f - vwt) * guar);
504                             hcd[indx] = cfa[indx] - (hwt * grar + (1.f - hwt) * glar);
505                             vcdalt[indx] = cfa[indx] - Gintvha;
506                             hcdalt[indx] = cfa[indx] - Ginthha;
507                         } else {
508                             //interpolated colour differences
509                             vcd[indx] = (vwt * gdar + (1.f - vwt) * guar) - cfa[indx];
510                             hcd[indx] = (hwt * grar + (1.f - hwt) * glar) - cfa[indx];
511                             vcdalt[indx] = Gintvha - cfa[indx];
512                             hcdalt[indx] = Ginthha - cfa[indx];
513                         }
514 
515                         fcswitch = !fcswitch;
516 
517                         if (cfa[indx] > clip_pt8 || Gintvha > clip_pt8 || Ginthha > clip_pt8) {
518                             //use HA if highlights are (nearly) clipped
519                             guar = guha;
520                             gdar = gdha;
521                             glar = glha;
522                             grar = grha;
523                             vcd[indx] = vcdalt[indx];
524                             hcd[indx] = hcdalt[indx];
525                         }
526 
527                         //differences of interpolations in opposite directions
528                         dgintv[indx] = min(SQR(guha - gdha), SQR(guar - gdar));
529                         dginth[indx] = min(SQR(glha - grha), SQR(glar - grar));
530 
531                     }
532                 }
533 
534 #endif
535 
536 
537 
538 #ifdef __SSE2__
539                 vfloat  clip_ptv = F2V( clip_pt );
540                 vfloat  sgn3v;
541 
542                 if( !(fc(cfarray, 4, 4) & 1) ) {
543                     sgnv = _mm_set_ps( 1.f, -1.f, 1.f, -1.f );
544                 } else {
545                     sgnv = _mm_set_ps( -1.f, 1.f, -1.f, 1.f );
546                 }
547 
548                 sgn3v = sgnv + sgnv + sgnv;
549 
550                 for (int rr = 4; rr < rr1 - 4; rr++) {
551                     vfloat nsgnv = sgnv;
552                     sgnv = -sgnv;
553                     sgn3v = -sgn3v;
554 
555                     for (int indx = rr * ts + 4; indx < rr * ts + cc1 - 4; indx += 4) {
556                         vfloat hcdv = LVF( hcd[indx] );
557                         vfloat hcdvarv = SQRV(LVFU(hcd[indx - 2]) - hcdv) + SQRV(LVFU(hcd[indx - 2]) - LVFU(hcd[indx + 2])) + SQRV(hcdv - LVFU(hcd[indx + 2]));
558                         vfloat hcdaltv = LVF( hcdalt[indx] );
559                         vfloat hcdaltvarv = SQRV(LVFU(hcdalt[indx - 2]) - hcdaltv) + SQRV(LVFU(hcdalt[indx - 2]) - LVFU(hcdalt[indx + 2])) + SQRV(hcdaltv - LVFU(hcdalt[indx + 2]));
560                         vfloat vcdv = LVF( vcd[indx] );
561                         vfloat vcdvarv = SQRV(LVF(vcd[indx - v2]) - vcdv) + SQRV(LVF(vcd[indx - v2]) - LVF(vcd[indx + v2])) + SQRV(vcdv - LVF(vcd[indx + v2]));
562                         vfloat vcdaltv = LVF( vcdalt[indx] );
563                         vfloat vcdaltvarv = SQRV(LVF(vcdalt[indx - v2]) - vcdaltv) + SQRV(LVF(vcdalt[indx - v2]) - LVF(vcdalt[indx + v2])) + SQRV(vcdaltv - LVF(vcdalt[indx + v2]));
564 
565                         //choose the smallest variance; this yields a smoother interpolation
566                         hcdv = vself( vmaskf_lt( hcdaltvarv, hcdvarv ), hcdaltv, hcdv);
567                         vcdv = vself( vmaskf_lt( vcdaltvarv, vcdvarv ), vcdaltv, vcdv);
568 
569                         //bound the interpolation in regions of high saturation
570                         //vertical and horizontal G interpolations
571                         vfloat Ginthv = sgnv * hcdv + LVF( cfa[indx] );
572                         vfloat temp2v = sgn3v * hcdv;
573                         vfloat hwtv = onev + temp2v / ( epsv + Ginthv + LVF( cfa[indx]));
574                         vmask hcdmask = vmaskf_gt( nsgnv * hcdv, ZEROV );
575                         vfloat hcdoldv = hcdv;
576                         vfloat tempv = nsgnv * (LVF(cfa[indx]) - median( Ginthv, LVFU(cfa[indx - 1]), LVFU(cfa[indx + 1]) ));
577                         hcdv = vself( vmaskf_lt( temp2v, -(LVF(cfa[indx]) + Ginthv)), tempv, vintpf(hwtv, hcdv, tempv));
578                         hcdv = vself( hcdmask, hcdv, hcdoldv );
579                         hcdv = vself( vmaskf_gt( Ginthv, clip_ptv), tempv, hcdv);
580                         STVF(hcd[indx], hcdv);
581 
582                         vfloat Gintvv = sgnv * vcdv + LVF( cfa[indx] );
583                         temp2v = sgn3v * vcdv;
584                         vfloat vwtv = onev + temp2v / ( epsv + Gintvv + LVF( cfa[indx]));
585                         vmask vcdmask = vmaskf_gt( nsgnv * vcdv, ZEROV );
586                         vfloat vcdoldv = vcdv;
587                         tempv = nsgnv * (LVF(cfa[indx]) - median( Gintvv, LVF(cfa[indx - v1]), LVF(cfa[indx + v1]) ));
588                         vcdv = vself( vmaskf_lt( temp2v, -(LVF(cfa[indx]) + Gintvv)), tempv, vintpf(vwtv, vcdv, tempv));
589                         vcdv = vself( vcdmask, vcdv, vcdoldv );
590                         vcdv = vself( vmaskf_gt( Gintvv, clip_ptv), tempv, vcdv);
591                         STVF(vcd[indx], vcdv);
592                         STVFU(cddiffsq[indx], SQRV(vcdv - hcdv));
593                     }
594 
595                 }
596 
597 #else
598 
599                 for (int rr = 4; rr < rr1 - 4; rr++) {
600                     for (int cc = 4, indx = rr * ts + cc, c = fc(cfarray, rr, cc) & 1; cc < cc1 - 4; cc++, indx++) {
601                         float hcdvar = 3.f * (SQR(hcd[indx - 2]) + SQR(hcd[indx]) + SQR(hcd[indx + 2])) - SQR(hcd[indx - 2] + hcd[indx] + hcd[indx + 2]);
602                         float hcdaltvar = 3.f * (SQR(hcdalt[indx - 2]) + SQR(hcdalt[indx]) + SQR(hcdalt[indx + 2])) - SQR(hcdalt[indx - 2] + hcdalt[indx] + hcdalt[indx + 2]);
603                         float vcdvar = 3.f * (SQR(vcd[indx - v2]) + SQR(vcd[indx]) + SQR(vcd[indx + v2])) - SQR(vcd[indx - v2] + vcd[indx] + vcd[indx + v2]);
604                         float vcdaltvar = 3.f * (SQR(vcdalt[indx - v2]) + SQR(vcdalt[indx]) + SQR(vcdalt[indx + v2])) - SQR(vcdalt[indx - v2] + vcdalt[indx] + vcdalt[indx + v2]);
605 
606                         //choose the smallest variance; this yields a smoother interpolation
607                         if (hcdaltvar < hcdvar) {
608                             hcd[indx] = hcdalt[indx];
609                         }
610 
611                         if (vcdaltvar < vcdvar) {
612                             vcd[indx] = vcdalt[indx];
613                         }
614 
615                         //bound the interpolation in regions of high saturation
616                         //vertical and horizontal G interpolations
617                         float Gintv, Ginth;
618 
619                         if (c) {//G site
620                             Ginth = -hcd[indx] + cfa[indx]; //R or B
621                             Gintv = -vcd[indx] + cfa[indx]; //B or R
622 
623                             if (hcd[indx] > 0) {
624                                 if (3.f * hcd[indx] > (Ginth + cfa[indx])) {
625                                     hcd[indx] = -median(Ginth, cfa[indx - 1], cfa[indx + 1]) + cfa[indx];
626                                 } else {
627                                     float hwt = 1.f - 3.f * hcd[indx] / (eps + Ginth + cfa[indx]);
628                                     hcd[indx] = hwt * hcd[indx] + (1.f - hwt) * (-median(Ginth, cfa[indx - 1], cfa[indx + 1]) + cfa[indx]);
629                                 }
630                             }
631 
632                             if (vcd[indx] > 0) {
633                                 if (3.f * vcd[indx] > (Gintv + cfa[indx])) {
634                                     vcd[indx] = -median(Gintv, cfa[indx - v1], cfa[indx + v1]) + cfa[indx];
635                                 } else {
636                                     float vwt = 1.f - 3.f * vcd[indx] / (eps + Gintv + cfa[indx]);
637                                     vcd[indx] = vwt * vcd[indx] + (1.f - vwt) * (-median(Gintv, cfa[indx - v1], cfa[indx + v1]) + cfa[indx]);
638                                 }
639                             }
640 
641                             if (Ginth > clip_pt) {
642                                 hcd[indx] = -median(Ginth, cfa[indx - 1], cfa[indx + 1]) + cfa[indx];
643                             }
644 
645                             if (Gintv > clip_pt) {
646                                 vcd[indx] = -median(Gintv, cfa[indx - v1], cfa[indx + v1]) + cfa[indx];
647                             }
648 
649 
650                         } else {//R or B site
651 
652                             Ginth = hcd[indx] + cfa[indx]; //interpolated G
653                             Gintv = vcd[indx] + cfa[indx];
654 
655                             if (hcd[indx] < 0) {
656                                 if (3.f * hcd[indx] < -(Ginth + cfa[indx])) {
657                                     hcd[indx] = median(Ginth, cfa[indx - 1], cfa[indx + 1]) - cfa[indx];
658                                 } else {
659                                     float hwt = 1.f + 3.f * hcd[indx] / (eps + Ginth + cfa[indx]);
660                                     hcd[indx] = hwt * hcd[indx] + (1.f - hwt) * (median(Ginth, cfa[indx - 1], cfa[indx + 1]) - cfa[indx]);
661                                 }
662                             }
663 
664                             if (vcd[indx] < 0) {
665                                 if (3.f * vcd[indx] < -(Gintv + cfa[indx])) {
666                                     vcd[indx] = median(Gintv, cfa[indx - v1], cfa[indx + v1]) - cfa[indx];
667                                 } else {
668                                     float vwt = 1.f + 3.f * vcd[indx] / (eps + Gintv + cfa[indx]);
669                                     vcd[indx] = vwt * vcd[indx] + (1.f - vwt) * (median(Gintv, cfa[indx - v1], cfa[indx + v1]) - cfa[indx]);
670                                 }
671                             }
672 
673                             if (Ginth > clip_pt) {
674                                 hcd[indx] = median(Ginth, cfa[indx - 1], cfa[indx + 1]) - cfa[indx];
675                             }
676 
677                             if (Gintv > clip_pt) {
678                                 vcd[indx] = median(Gintv, cfa[indx - v1], cfa[indx + v1]) - cfa[indx];
679                             }
680 
681                             cddiffsq[indx] = SQR(vcd[indx] - hcd[indx]);
682                         }
683 
684                         c = !c;
685                     }
686                 }
687 
688 #endif
689 
690 
691 
692 #ifdef __SSE2__
693                 vfloat  epssqv = F2V( epssq );
694 
695                 for (int rr = 6; rr < rr1 - 6; rr++) {
696                     for (int indx = rr * ts + 6 + (fc(cfarray, rr, 2) & 1); indx < rr * ts + cc1 - 6; indx += 8) {
697                         //compute colour difference variances in cardinal directions
698                         vfloat tempv = LC2VFU(vcd[indx]);
699                         vfloat uavev = tempv + LC2VFU(vcd[indx - v1]) + LC2VFU(vcd[indx - v2]) + LC2VFU(vcd[indx - v3]);
700                         vfloat davev = tempv + LC2VFU(vcd[indx + v1]) + LC2VFU(vcd[indx + v2]) + LC2VFU(vcd[indx + v3]);
701                         vfloat Dgrbvvaruv = SQRV(tempv - uavev) + SQRV(LC2VFU(vcd[indx - v1]) - uavev) + SQRV(LC2VFU(vcd[indx - v2]) - uavev) + SQRV(LC2VFU(vcd[indx - v3]) - uavev);
702                         vfloat Dgrbvvardv = SQRV(tempv - davev) + SQRV(LC2VFU(vcd[indx + v1]) - davev) + SQRV(LC2VFU(vcd[indx + v2]) - davev) + SQRV(LC2VFU(vcd[indx + v3]) - davev);
703 
704                         vfloat hwtv = vadivapb(LC2VFU(dirwts1[indx - 1]), LC2VFU(dirwts1[indx + 1]));
705                         vfloat vwtv = vadivapb(LC2VFU(dirwts0[indx - v1]), LC2VFU(dirwts0[indx + v1]));
706 
707                         tempv = LC2VFU(hcd[indx]);
708                         vfloat lavev = tempv + vaddc2vfu(hcd[indx - 3]) + LC2VFU(hcd[indx - 1]);
709                         vfloat ravev = tempv + vaddc2vfu(hcd[indx + 1]) + LC2VFU(hcd[indx + 3]);
710 
711                         vfloat Dgrbhvarlv = SQRV(tempv - lavev) + SQRV(LC2VFU(hcd[indx - 1]) - lavev) + SQRV(LC2VFU(hcd[indx - 2]) - lavev) + SQRV(LC2VFU(hcd[indx - 3]) - lavev);
712                         vfloat Dgrbhvarrv = SQRV(tempv - ravev) + SQRV(LC2VFU(hcd[indx + 1]) - ravev) + SQRV(LC2VFU(hcd[indx + 2]) - ravev) + SQRV(LC2VFU(hcd[indx + 3]) - ravev);
713 
714 
715                         vfloat vcdvarv = epssqv + vintpf(vwtv, Dgrbvvardv, Dgrbvvaruv);
716                         vfloat hcdvarv = epssqv + vintpf(hwtv, Dgrbhvarrv, Dgrbhvarlv);
717 
718                         //compute fluctuations in up/down and left/right interpolations of colours
719                         Dgrbvvaruv = LC2VFU(dgintv[indx - v1]) + LC2VFU(dgintv[indx - v2]);
720                         Dgrbvvardv = LC2VFU(dgintv[indx + v1]) + LC2VFU(dgintv[indx + v2]);
721 
722                         Dgrbhvarlv = vaddc2vfu(dginth[indx - 2]);
723                         Dgrbhvarrv = vaddc2vfu(dginth[indx + 1]);
724 
725                         vfloat vcdvar1v = epssqv + LC2VFU(dgintv[indx]) + vintpf(vwtv, Dgrbvvardv, Dgrbvvaruv);
726                         vfloat hcdvar1v = epssqv + LC2VFU(dginth[indx]) + vintpf(hwtv, Dgrbhvarrv, Dgrbhvarlv);
727 
728                         //determine adaptive weights for G interpolation
729                         vfloat varwtv = hcdvarv / (vcdvarv + hcdvarv);
730                         vfloat diffwtv = hcdvar1v / (vcdvar1v + hcdvar1v);
731 
732                         //if both agree on interpolation direction, choose the one with strongest directional discrimination;
733                         //otherwise, choose the u/d and l/r difference fluctuation weights
734                         vmask decmask = vandm( vmaskf_gt( (zd5v - varwtv) * (zd5v - diffwtv), ZEROV ), vmaskf_lt( vabsf( zd5v - diffwtv), vabsf( zd5v - varwtv) ) );
735                         STVFU(hvwt[indx >> 1], vself( decmask, varwtv, diffwtv));
736                     }
737                 }
738 
739 #else
740 
741                 for (int rr = 6; rr < rr1 - 6; rr++) {
742                     for (int cc = 6 + (fc(cfarray, rr, 2) & 1), indx = rr * ts + cc; cc < cc1 - 6; cc += 2, indx += 2) {
743 
744                         //compute colour difference variances in cardinal directions
745 
746                         float uave = vcd[indx] + vcd[indx - v1] + vcd[indx - v2] + vcd[indx - v3];
747                         float dave = vcd[indx] + vcd[indx + v1] + vcd[indx + v2] + vcd[indx + v3];
748                         float lave = hcd[indx] + hcd[indx - 1] + hcd[indx - 2] + hcd[indx - 3];
749                         float rave = hcd[indx] + hcd[indx + 1] + hcd[indx + 2] + hcd[indx + 3];
750 
751                         //colour difference (G-R or G-B) variance in up/down/left/right directions
752                         float Dgrbvvaru = SQR(vcd[indx] - uave) + SQR(vcd[indx - v1] - uave) + SQR(vcd[indx - v2] - uave) + SQR(vcd[indx - v3] - uave);
753                         float Dgrbvvard = SQR(vcd[indx] - dave) + SQR(vcd[indx + v1] - dave) + SQR(vcd[indx + v2] - dave) + SQR(vcd[indx + v3] - dave);
754                         float Dgrbhvarl = SQR(hcd[indx] - lave) + SQR(hcd[indx - 1] - lave) + SQR(hcd[indx - 2] - lave) + SQR(hcd[indx - 3] - lave);
755                         float Dgrbhvarr = SQR(hcd[indx] - rave) + SQR(hcd[indx + 1] - rave) + SQR(hcd[indx + 2] - rave) + SQR(hcd[indx + 3] - rave);
756 
757                         float hwt = dirwts1[indx - 1] / (dirwts1[indx - 1] + dirwts1[indx + 1]);
758                         float vwt = dirwts0[indx - v1] / (dirwts0[indx + v1] + dirwts0[indx - v1]);
759 
760                         float vcdvar = epssq + vwt * Dgrbvvard + (1.f - vwt) * Dgrbvvaru;
761                         float hcdvar = epssq + hwt * Dgrbhvarr + (1.f - hwt) * Dgrbhvarl;
762 
763                         //compute fluctuations in up/down and left/right interpolations of colours
764                         Dgrbvvaru = (dgintv[indx]) + (dgintv[indx - v1]) + (dgintv[indx - v2]);
765                         Dgrbvvard = (dgintv[indx]) + (dgintv[indx + v1]) + (dgintv[indx + v2]);
766                         Dgrbhvarl = (dginth[indx]) + (dginth[indx - 1]) + (dginth[indx - 2]);
767                         Dgrbhvarr = (dginth[indx]) + (dginth[indx + 1]) + (dginth[indx + 2]);
768 
769                         float vcdvar1 = epssq + vwt * Dgrbvvard + (1.f - vwt) * Dgrbvvaru;
770                         float hcdvar1 = epssq + hwt * Dgrbhvarr + (1.f - hwt) * Dgrbhvarl;
771 
772                         //determine adaptive weights for G interpolation
773                         float varwt = hcdvar / (vcdvar + hcdvar);
774                         float diffwt = hcdvar1 / (vcdvar1 + hcdvar1);
775 
776                         //if both agree on interpolation direction, choose the one with strongest directional discrimination;
777                         //otherwise, choose the u/d and l/r difference fluctuation weights
778                         if ((0.5f - varwt) * (0.5f - diffwt) > 0.f && fabsf(0.5f - diffwt) < fabsf(0.5f - varwt)) {
779                             hvwt[indx >> 1] = varwt;
780                         } else {
781                             hvwt[indx >> 1] = diffwt;
782                         }
783                     }
784                 }
785 
786 #endif
787 
788 #ifdef __SSE2__
789                 vfloat gaussg0 = F2V(gaussgrad[0]);
790                 vfloat gaussg1 = F2V(gaussgrad[1]);
791                 vfloat gaussg2 = F2V(gaussgrad[2]);
792                 vfloat gaussg3 = F2V(gaussgrad[3]);
793                 vfloat gaussg4 = F2V(gaussgrad[4]);
794                 vfloat gaussg5 = F2V(gaussgrad[5]);
795                 vfloat gausso0 = F2V(gaussodd[0]);
796                 vfloat gausso1 = F2V(gaussodd[1]);
797                 vfloat gausso2 = F2V(gaussodd[2]);
798                 vfloat gausso3 = F2V(gaussodd[3]);
799 
800 #endif
801 
802                 // precompute nyquist
803                 for (int rr = 6; rr < rr1 - 6; rr++) {
804                     int cc = 6 + (fc(cfarray, rr, 2) & 1);
805                     int indx = rr * ts + cc;
806 
807 #ifdef __SSE2__
808 
809                     for (; cc < cc1 - 7; cc += 8, indx += 8) {
810                         vfloat valv = (gausso0 * LC2VFU(cddiffsq[indx]) +
811                                        gausso1 * (LC2VFU(cddiffsq[(indx - m1)]) + LC2VFU(cddiffsq[(indx + p1)]) +
812                                                   LC2VFU(cddiffsq[(indx - p1)]) + LC2VFU(cddiffsq[(indx + m1)])) +
813                                        gausso2 * (LC2VFU(cddiffsq[(indx - v2)]) + LC2VFU(cddiffsq[(indx - 2)]) +
814                                                   LC2VFU(cddiffsq[(indx + 2)]) + LC2VFU(cddiffsq[(indx + v2)])) +
815                                        gausso3 * (LC2VFU(cddiffsq[(indx - m2)]) + LC2VFU(cddiffsq[(indx + p2)]) +
816                                                   LC2VFU(cddiffsq[(indx - p2)]) + LC2VFU(cddiffsq[(indx + m2)]))) -
817                                       (gaussg0 * LC2VFU(delhvsqsum[indx]) +
818                                        gaussg1 * (LC2VFU(delhvsqsum[indx - v1]) + LC2VFU(delhvsqsum[indx - 1]) +
819                                                   LC2VFU(delhvsqsum[indx + 1]) + LC2VFU(delhvsqsum[indx + v1])) +
820                                        gaussg2 * (LC2VFU(delhvsqsum[indx - m1]) + LC2VFU(delhvsqsum[indx + p1]) +
821                                                   LC2VFU(delhvsqsum[indx - p1]) + LC2VFU(delhvsqsum[indx + m1])) +
822                                        gaussg3 * (LC2VFU(delhvsqsum[indx - v2]) + LC2VFU(delhvsqsum[indx - 2]) +
823                                                   LC2VFU(delhvsqsum[indx + 2]) + LC2VFU(delhvsqsum[indx + v2])) +
824                                        gaussg4 * (LC2VFU(delhvsqsum[indx - v2 - 1]) + LC2VFU(delhvsqsum[indx - v2 + 1]) +
825                                                   LC2VFU(delhvsqsum[indx - ts - 2]) + LC2VFU(delhvsqsum[indx - ts + 2]) +
826                                                   LC2VFU(delhvsqsum[indx + ts - 2]) + LC2VFU(delhvsqsum[indx + ts + 2]) +
827                                                   LC2VFU(delhvsqsum[indx + v2 - 1]) + LC2VFU(delhvsqsum[indx + v2 + 1])) +
828                                        gaussg5 * (LC2VFU(delhvsqsum[indx - m2]) + LC2VFU(delhvsqsum[indx + p2]) +
829                                                   LC2VFU(delhvsqsum[indx - p2]) + LC2VFU(delhvsqsum[indx + m2])));
830                         STVFU(nyqutest[indx >> 1], valv);
831 
832                     }
833 
834 #endif
835 
836                     for (; cc < cc1 - 6; cc += 2, indx += 2) {
837                         nyqutest[indx >> 1] = (gaussodd[0] * cddiffsq[indx] +
838                                                gaussodd[1] * (cddiffsq[(indx - m1)] + cddiffsq[(indx + p1)] +
839                                                               cddiffsq[(indx - p1)] + cddiffsq[(indx + m1)]) +
840                                                gaussodd[2] * (cddiffsq[(indx - v2)] + cddiffsq[(indx - 2)] +
841                                                               cddiffsq[(indx + 2)] + cddiffsq[(indx + v2)]) +
842                                                gaussodd[3] * (cddiffsq[(indx - m2)] + cddiffsq[(indx + p2)] +
843                                                               cddiffsq[(indx - p2)] + cddiffsq[(indx + m2)])) -
844                                               (gaussgrad[0] *  delhvsqsum[indx] +
845                                                gaussgrad[1] * (delhvsqsum[indx - v1] + delhvsqsum[indx + 1] +
846                                                                delhvsqsum[indx - 1] + delhvsqsum[indx + v1]) +
847                                                gaussgrad[2] * (delhvsqsum[indx - m1] + delhvsqsum[indx + p1] +
848                                                                delhvsqsum[indx - p1] + delhvsqsum[indx + m1]) +
849                                                gaussgrad[3] * (delhvsqsum[indx - v2] + delhvsqsum[indx - 2] +
850                                                                delhvsqsum[indx + 2] + delhvsqsum[indx + v2]) +
851                                                gaussgrad[4] * (delhvsqsum[indx - v2 - 1] + delhvsqsum[indx - v2 + 1] +
852                                                                delhvsqsum[indx - ts - 2] + delhvsqsum[indx - ts + 2] +
853                                                                delhvsqsum[indx + ts - 2] + delhvsqsum[indx + ts + 2] +
854                                                                delhvsqsum[indx + v2 - 1] + delhvsqsum[indx + v2 + 1]) +
855                                                gaussgrad[5] * (delhvsqsum[indx - m2] + delhvsqsum[indx + p2] +
856                                                                delhvsqsum[indx - p2] + delhvsqsum[indx + m2]));
857                     }
858                 }
859 
860                 // Nyquist test
861                 int nystartrow = 0;
862                 int nyendrow = 0;
863                 int nystartcol = ts + 1;
864                 int nyendcol = 0;
865 
866                 for (int rr = 6; rr < rr1 - 6; rr++) {
867                     for (int cc = 6 + (fc(cfarray, rr, 2) & 1), indx = rr * ts + cc; cc < cc1 - 6; cc += 2, indx += 2) {
868 
869                         //nyquist texture test: ask if difference of vcd compared to hcd is larger or smaller than RGGB gradients
870                         if(nyqutest[indx >> 1] > 0.f) {
871                             nyquist[indx >> 1] = 1;    //nyquist=1 for nyquist region
872                             nystartrow = nystartrow ? nystartrow : rr;
873                             nyendrow = rr;
874                             nystartcol = nystartcol > cc ? cc : nystartcol;
875                             nyendcol = nyendcol < cc ? cc : nyendcol;
876                         }
877                     }
878                 }
879 
880 
881                 bool doNyquist = nystartrow != nyendrow && nystartcol != nyendcol;
882 
883                 if(doNyquist) {
884                     nyendrow ++; // because of < condition
885                     nyendcol ++; // because of < condition
886                     nystartcol -= (nystartcol & 1);
887                     nystartrow = std::max(8, nystartrow);
888                     nyendrow = std::min(rr1 - 8, nyendrow);
889                     nystartcol = std::max(8, nystartcol);
890                     nyendcol = std::min(cc1 - 8, nyendcol);
891                     memset(&nyquist2[4 * tsh], 0, sizeof(char) * (ts - 8) * tsh);
892 
893 #ifdef __SSE2__
894                     vint fourvb = _mm_set1_epi8(4);
895                     vint onevb = _mm_set1_epi8(1);
896 
897 #endif
898 
899                     for (int rr = nystartrow; rr < nyendrow; rr++) {
900 #ifdef __SSE2__
901 
902                         for (int indx = rr * ts; indx < rr * ts + cc1; indx += 32) {
903                             vint nyquisttemp1v = _mm_adds_epi8(_mm_load_si128((vint*)&nyquist[(indx - v2) >> 1]), _mm_loadu_si128((vint*)&nyquist[(indx - m1) >> 1]));
904                             vint nyquisttemp2v = _mm_adds_epi8(_mm_loadu_si128((vint*)&nyquist[(indx + p1) >> 1]), _mm_loadu_si128((vint*)&nyquist[(indx - 2) >> 1]));
905                             vint nyquisttemp3v = _mm_adds_epi8(_mm_loadu_si128((vint*)&nyquist[(indx +  2) >> 1]), _mm_loadu_si128((vint*)&nyquist[(indx - p1) >> 1]));
906                             vint valv = _mm_load_si128((vint*)&nyquist[indx >> 1]);
907                             vint nyquisttemp4v = _mm_adds_epi8(_mm_loadu_si128((vint*)&nyquist[(indx + m1) >> 1]), _mm_load_si128((vint*)&nyquist[(indx + v2) >> 1]));
908                             nyquisttemp1v = _mm_adds_epi8(nyquisttemp1v, nyquisttemp3v);
909                             nyquisttemp2v = _mm_adds_epi8(nyquisttemp2v, nyquisttemp4v);
910                             nyquisttemp1v = _mm_adds_epi8(nyquisttemp1v, nyquisttemp2v);
911                             valv = vselc(_mm_cmpgt_epi8(nyquisttemp1v, fourvb), onevb, valv);
912                             valv = vselinotzero(_mm_cmplt_epi8(nyquisttemp1v, fourvb), valv);
913                             _mm_store_si128((vint*)&nyquist2[indx >> 1], valv);
914                         }
915 
916 #else
917 
918                         for (int indx = rr * ts + nystartcol + (fc(cfarray, rr, 2) & 1); indx < rr * ts + nyendcol; indx += 2) {
919                             unsigned int nyquisttemp = (nyquist[(indx - v2) >> 1] + nyquist[(indx - m1) >> 1] + nyquist[(indx + p1) >> 1] +
920                                                         nyquist[(indx - 2) >> 1] + nyquist[(indx + 2) >> 1] +
921                                                         nyquist[(indx - p1) >> 1] + nyquist[(indx + m1) >> 1] + nyquist[(indx + v2) >> 1]);
922                             //if most of your neighbours are named Nyquist, it's likely that you're one too, or not
923                             nyquist2[indx >> 1] = nyquisttemp > 4 ? 1 : (nyquisttemp < 4 ? 0 : nyquist[indx >> 1]);
924                         }
925 
926 #endif
927                     }
928 
929                     // end of Nyquist test
930 
931                     // in areas of Nyquist texture, do area interpolation
932                     for (int rr = nystartrow; rr < nyendrow; rr++)
933                         for (int indx = rr * ts + nystartcol + (fc(cfarray, rr, 2) & 1); indx < rr * ts + nyendcol; indx += 2) {
934 
935                             if (nyquist2[indx >> 1]) {
936                                 // area interpolation
937 
938                                 float sumcfa = 0.f, sumh = 0.f, sumv = 0.f, sumsqh = 0.f, sumsqv = 0.f, areawt = 0.f;
939 
940                                 for (int i = -6; i < 7; i += 2) {
941                                     int indx1 = indx + (i * ts) - 6;
942 
943                                     for (int j = -6; j < 7; j += 2, indx1 += 2) {
944                                         if (nyquist2[indx1 >> 1]) {
945                                             float cfatemp = cfa[indx1];
946                                             sumcfa += cfatemp;
947                                             sumh += (cfa[indx1 - 1] + cfa[indx1 + 1]);
948                                             sumv += (cfa[indx1 - v1] + cfa[indx1 + v1]);
949                                             sumsqh += SQR(cfatemp - cfa[indx1 - 1]) + SQR(cfatemp - cfa[indx1 + 1]);
950                                             sumsqv += SQR(cfatemp - cfa[indx1 - v1]) + SQR(cfatemp - cfa[indx1 + v1]);
951                                             areawt += 1;
952                                         }
953                                     }
954                                 }
955 
956                                 //horizontal and vertical colour differences, and adaptive weight
957                                 sumh = sumcfa - xdiv2f(sumh);
958                                 sumv = sumcfa - xdiv2f(sumv);
959                                 areawt = xdiv2f(areawt);
960                                 float hcdvar = epssq + fabsf(areawt * sumsqh - sumh * sumh);
961                                 float vcdvar = epssq + fabsf(areawt * sumsqv - sumv * sumv);
962                                 hvwt[indx >> 1] = hcdvar / (vcdvar + hcdvar);
963 
964                                 // end of area interpolation
965 
966                             }
967                         }
968                 }
969 
970 
971                 //populate G at R/B sites
972                 for (int rr = 8; rr < rr1 - 8; rr++)
973                     for (int indx = rr * ts + 8 + (fc(cfarray, rr, 2) & 1); indx < rr * ts + cc1 - 8; indx += 2) {
974 
975                         //first ask if one gets more directional discrimination from nearby B/R sites
976                         float hvwtalt = xdivf(hvwt[(indx - m1) >> 1] + hvwt[(indx + p1) >> 1] + hvwt[(indx - p1) >> 1] + hvwt[(indx + m1) >> 1], 2);
977 
978                         hvwt[indx >> 1] = fabsf(0.5f - hvwt[indx >> 1]) < fabsf(0.5f - hvwtalt) ? hvwtalt : hvwt[indx >> 1];
979                         //a better result was obtained from the neighbours
980 
981                         Dgrb[0][indx >> 1] = intp(hvwt[indx >> 1], vcd[indx], hcd[indx]); //evaluate colour differences
982 
983                         rgbgreen[indx] = cfa[indx] + Dgrb[0][indx >> 1]; //evaluate G (finally!)
984 
985                         //local curvature in G (preparation for nyquist refinement step)
986                         Dgrb2[indx >> 1].h = nyquist2[indx >> 1] ? SQR(rgbgreen[indx] - xdiv2f(rgbgreen[indx - 1] + rgbgreen[indx + 1])) : 0.f;
987                         Dgrb2[indx >> 1].v = nyquist2[indx >> 1] ? SQR(rgbgreen[indx] - xdiv2f(rgbgreen[indx - v1] + rgbgreen[indx + v1])) : 0.f;
988                     }
989 
990 
991                 //end of standard interpolation
992 
993                 // refine Nyquist areas using G curvatures
994                 if(doNyquist) {
995                     for (int rr = nystartrow; rr < nyendrow; rr++)
996                         for (int indx = rr * ts + nystartcol + (fc(cfarray, rr, 2) & 1); indx < rr * ts + nyendcol; indx += 2) {
997 
998                             if (nyquist2[indx >> 1]) {
999                                 //local averages (over Nyquist pixels only) of G curvature squared
1000                                 float gvarh = epssq + (gquinc[0] * Dgrb2[indx >> 1].h +
1001                                                        gquinc[1] * (Dgrb2[(indx - m1) >> 1].h + Dgrb2[(indx + p1) >> 1].h + Dgrb2[(indx - p1) >> 1].h + Dgrb2[(indx + m1) >> 1].h) +
1002                                                        gquinc[2] * (Dgrb2[(indx - v2) >> 1].h + Dgrb2[(indx - 2) >> 1].h + Dgrb2[(indx + 2) >> 1].h + Dgrb2[(indx + v2) >> 1].h) +
1003                                                        gquinc[3] * (Dgrb2[(indx - m2) >> 1].h + Dgrb2[(indx + p2) >> 1].h + Dgrb2[(indx - p2) >> 1].h + Dgrb2[(indx + m2) >> 1].h));
1004                                 float gvarv = epssq + (gquinc[0] * Dgrb2[indx >> 1].v +
1005                                                        gquinc[1] * (Dgrb2[(indx - m1) >> 1].v + Dgrb2[(indx + p1) >> 1].v + Dgrb2[(indx - p1) >> 1].v + Dgrb2[(indx + m1) >> 1].v) +
1006                                                        gquinc[2] * (Dgrb2[(indx - v2) >> 1].v + Dgrb2[(indx - 2) >> 1].v + Dgrb2[(indx + 2) >> 1].v + Dgrb2[(indx + v2) >> 1].v) +
1007                                                        gquinc[3] * (Dgrb2[(indx - m2) >> 1].v + Dgrb2[(indx + p2) >> 1].v + Dgrb2[(indx - p2) >> 1].v + Dgrb2[(indx + m2) >> 1].v));
1008                                 //use the results as weights for refined G interpolation
1009                                 Dgrb[0][indx >> 1] = (hcd[indx] * gvarv + vcd[indx] * gvarh) / (gvarv + gvarh);
1010                                 rgbgreen[indx] = cfa[indx] + Dgrb[0][indx >> 1];
1011                             }
1012                         }
1013                 }
1014 
1015 
1016 #ifdef __SSE2__
1017 
1018                 for (int rr = 6; rr < rr1 - 6; rr++) {
1019                     if((fc(cfarray, rr, 2) & 1) == 0) {
1020                         for (int cc = 6, indx = rr * ts + cc; cc < cc1 - 6; cc += 8, indx += 8) {
1021                             vfloat tempv = LC2VFU(cfa[indx + 1]);
1022                             vfloat Dgrbsq1pv = (SQRV(tempv - LC2VFU(cfa[indx + 1 - p1])) + SQRV(tempv - LC2VFU(cfa[indx + 1 + p1])));
1023                             STVFU(delp[indx >> 1], vabsf(LC2VFU(cfa[indx + p1]) - LC2VFU(cfa[indx - p1])));
1024                             STVFU(delm[indx >> 1], vabsf(LC2VFU(cfa[indx + m1]) - LC2VFU(cfa[indx - m1])));
1025                             vfloat Dgrbsq1mv = (SQRV(tempv - LC2VFU(cfa[indx + 1 - m1])) + SQRV(tempv - LC2VFU(cfa[indx + 1 + m1])));
1026                             STVFU(Dgrbsq1m[indx >> 1], Dgrbsq1mv );
1027                             STVFU(Dgrbsq1p[indx >> 1], Dgrbsq1pv );
1028                         }
1029                     } else {
1030                         for (int cc = 6, indx = rr * ts + cc; cc < cc1 - 6; cc += 8, indx += 8) {
1031                             vfloat tempv = LC2VFU(cfa[indx]);
1032                             vfloat Dgrbsq1pv = (SQRV(tempv - LC2VFU(cfa[indx - p1])) + SQRV(tempv - LC2VFU(cfa[indx + p1])));
1033                             STVFU(delp[indx >> 1], vabsf(LC2VFU(cfa[indx + 1 + p1]) - LC2VFU(cfa[indx + 1 - p1])));
1034                             STVFU(delm[indx >> 1], vabsf(LC2VFU(cfa[indx + 1 + m1]) - LC2VFU(cfa[indx + 1 - m1])));
1035                             vfloat Dgrbsq1mv = (SQRV(tempv - LC2VFU(cfa[indx - m1])) + SQRV(tempv - LC2VFU(cfa[indx + m1])));
1036                             STVFU(Dgrbsq1m[indx >> 1], Dgrbsq1mv );
1037                             STVFU(Dgrbsq1p[indx >> 1], Dgrbsq1pv );
1038                         }
1039                     }
1040                 }
1041 
1042 #else
1043 
1044                 for (int rr = 6; rr < rr1 - 6; rr++) {
1045                     if((fc(cfarray, rr, 2) & 1) == 0) {
1046                         for (int cc = 6, indx = rr * ts + cc; cc < cc1 - 6; cc += 2, indx += 2) {
1047                             delp[indx >> 1] = fabsf(cfa[indx + p1] - cfa[indx - p1]);
1048                             delm[indx >> 1] = fabsf(cfa[indx + m1] - cfa[indx - m1]);
1049                             Dgrbsq1p[indx >> 1] = (SQR(cfa[indx + 1] - cfa[indx + 1 - p1]) + SQR(cfa[indx + 1] - cfa[indx + 1 + p1]));
1050                             Dgrbsq1m[indx >> 1] = (SQR(cfa[indx + 1] - cfa[indx + 1 - m1]) + SQR(cfa[indx + 1] - cfa[indx + 1 + m1]));
1051                         }
1052                     } else {
1053                         for (int cc = 6, indx = rr * ts + cc; cc < cc1 - 6; cc += 2, indx += 2) {
1054                             Dgrbsq1p[indx >> 1] = (SQR(cfa[indx] - cfa[indx - p1]) + SQR(cfa[indx] - cfa[indx + p1]));
1055                             Dgrbsq1m[indx >> 1] = (SQR(cfa[indx] - cfa[indx - m1]) + SQR(cfa[indx] - cfa[indx + m1]));
1056                             delp[indx >> 1] = fabsf(cfa[indx + 1 + p1] - cfa[indx + 1 - p1]);
1057                             delm[indx >> 1] = fabsf(cfa[indx + 1 + m1] - cfa[indx + 1 - m1]);
1058                         }
1059                     }
1060                 }
1061 
1062 #endif
1063 
1064                 // diagonal interpolation correction
1065 
1066 #ifdef __SSE2__
1067                 vfloat gausseven0v = F2V(gausseven[0]);
1068                 vfloat gausseven1v = F2V(gausseven[1]);
1069 #endif
1070 
1071                 for (int rr = 8; rr < rr1 - 8; rr++) {
1072 #ifdef __SSE2__
1073 
1074                     for (int indx = rr * ts + 8 + (fc(cfarray, rr, 2) & 1), indx1 = indx >> 1; indx < rr * ts + cc1 - 8; indx += 8, indx1 += 4) {
1075 
1076                         //diagonal colour ratios
1077                         vfloat cfav = LC2VFU(cfa[indx]);
1078 
1079                         vfloat temp1v = LC2VFU(cfa[indx + m1]);
1080                         vfloat temp2v = LC2VFU(cfa[indx + m2]);
1081                         vfloat rbsev = vmul2f(temp1v) / (epsv + cfav + temp2v );
1082                         rbsev = vself(vmaskf_lt(vabsf(onev - rbsev), arthreshv), cfav * rbsev, temp1v + zd5v * (cfav - temp2v));
1083 
1084                         temp1v = LC2VFU(cfa[indx - m1]);
1085                         temp2v = LC2VFU(cfa[indx - m2]);
1086                         vfloat rbnwv = vmul2f(temp1v) / (epsv + cfav + temp2v );
1087                         rbnwv = vself(vmaskf_lt(vabsf(onev - rbnwv), arthreshv), cfav * rbnwv, temp1v + zd5v * (cfav - temp2v));
1088 
1089                         temp1v = epsv + LVFU(delm[indx1]);
1090                         vfloat wtsev = temp1v + LVFU(delm[(indx + m1) >> 1]) + LVFU(delm[(indx + m2) >> 1]); //same as for wtu,wtd,wtl,wtr
1091                         vfloat wtnwv = temp1v + LVFU(delm[(indx - m1) >> 1]) + LVFU(delm[(indx - m2) >> 1]);
1092 
1093                         vfloat rbmv = (wtsev * rbnwv + wtnwv * rbsev) / (wtsev + wtnwv);
1094 
1095                         temp1v = median(rbmv , LC2VFU(cfa[indx - m1]), LC2VFU(cfa[indx + m1]));
1096                         vfloat wtv = vmul2f(cfav - rbmv) / (epsv + rbmv + cfav);
1097                         temp2v = vintpf(wtv, rbmv, temp1v);
1098 
1099                         temp2v = vself(vmaskf_lt(rbmv + rbmv, cfav), temp1v, temp2v);
1100                         temp2v = vself(vmaskf_lt(rbmv, cfav), temp2v, rbmv);
1101                         STVFU(rbm[indx1], vself(vmaskf_gt(temp2v, clip_ptv), median(temp2v , LC2VFU(cfa[indx - m1]), LC2VFU(cfa[indx + m1])), temp2v ));
1102 
1103 
1104                         temp1v = LC2VFU(cfa[indx + p1]);
1105                         temp2v = LC2VFU(cfa[indx + p2]);
1106                         vfloat rbnev = vmul2f(temp1v) / (epsv + cfav + temp2v );
1107                         rbnev = vself(vmaskf_lt(vabsf(onev - rbnev), arthreshv), cfav * rbnev, temp1v + zd5v * (cfav - temp2v));
1108 
1109                         temp1v = LC2VFU(cfa[indx - p1]);
1110                         temp2v = LC2VFU(cfa[indx - p2]);
1111                         vfloat rbswv = vmul2f(temp1v) / (epsv + cfav + temp2v );
1112                         rbswv = vself(vmaskf_lt(vabsf(onev - rbswv), arthreshv), cfav * rbswv, temp1v + zd5v * (cfav - temp2v));
1113 
1114                         temp1v = epsv + LVFU(delp[indx1]);
1115                         vfloat wtnev = temp1v + LVFU(delp[(indx + p1) >> 1]) + LVFU(delp[(indx + p2) >> 1]);
1116                         vfloat wtswv = temp1v + LVFU(delp[(indx - p1) >> 1]) + LVFU(delp[(indx - p2) >> 1]);
1117 
1118                         vfloat rbpv = (wtnev * rbswv + wtswv * rbnev) / (wtnev + wtswv);
1119 
1120                         temp1v = median(rbpv , LC2VFU(cfa[indx - p1]), LC2VFU(cfa[indx + p1]));
1121                         wtv = vmul2f(cfav - rbpv) / (epsv + rbpv + cfav);
1122                         temp2v = vintpf(wtv, rbpv, temp1v);
1123 
1124                         temp2v = vself(vmaskf_lt(rbpv + rbpv, cfav), temp1v, temp2v);
1125                         temp2v = vself(vmaskf_lt(rbpv, cfav), temp2v, rbpv);
1126                         STVFU(rbp[indx1], vself(vmaskf_gt(temp2v, clip_ptv), median(temp2v , LC2VFU(cfa[indx - p1]), LC2VFU(cfa[indx + p1])), temp2v ));
1127 
1128                         vfloat rbvarmv = epssqv + (gausseven0v * (LVFU(Dgrbsq1m[(indx - v1) >> 1]) + LVFU(Dgrbsq1m[(indx - 1) >> 1]) + LVFU(Dgrbsq1m[(indx + 1) >> 1]) + LVFU(Dgrbsq1m[(indx + v1) >> 1])) +
1129                                                    gausseven1v * (LVFU(Dgrbsq1m[(indx - v2 - 1) >> 1]) + LVFU(Dgrbsq1m[(indx - v2 + 1) >> 1]) + LVFU(Dgrbsq1m[(indx - 2 - v1) >> 1]) + LVFU(Dgrbsq1m[(indx + 2 - v1) >> 1]) +
1130                                                            LVFU(Dgrbsq1m[(indx - 2 + v1) >> 1]) + LVFU(Dgrbsq1m[(indx + 2 + v1) >> 1]) + LVFU(Dgrbsq1m[(indx + v2 - 1) >> 1]) + LVFU(Dgrbsq1m[(indx + v2 + 1) >> 1])));
1131                         STVFU(pmwt[indx1] , rbvarmv / ((epssqv + (gausseven0v * (LVFU(Dgrbsq1p[(indx - v1) >> 1]) + LVFU(Dgrbsq1p[(indx - 1) >> 1]) + LVFU(Dgrbsq1p[(indx + 1) >> 1]) + LVFU(Dgrbsq1p[(indx + v1) >> 1])) +
1132                                                         gausseven1v * (LVFU(Dgrbsq1p[(indx - v2 - 1) >> 1]) + LVFU(Dgrbsq1p[(indx - v2 + 1) >> 1]) + LVFU(Dgrbsq1p[(indx - 2 - v1) >> 1]) + LVFU(Dgrbsq1p[(indx + 2 - v1) >> 1]) +
1133                                                                 LVFU(Dgrbsq1p[(indx - 2 + v1) >> 1]) + LVFU(Dgrbsq1p[(indx + 2 + v1) >> 1]) + LVFU(Dgrbsq1p[(indx + v2 - 1) >> 1]) + LVFU(Dgrbsq1p[(indx + v2 + 1) >> 1])))) + rbvarmv));
1134 
1135                     }
1136 
1137 #else
1138 
1139                     for (int cc = 8 + (fc(cfarray, rr, 2) & 1), indx = rr * ts + cc, indx1 = indx >> 1; cc < cc1 - 8; cc += 2, indx += 2, indx1++) {
1140 
1141                         //diagonal colour ratios
1142                         float crse = xmul2f(cfa[indx + m1]) / (eps + cfa[indx] + (cfa[indx + m2]));
1143                         float crnw = xmul2f(cfa[indx - m1]) / (eps + cfa[indx] + (cfa[indx - m2]));
1144                         float crne = xmul2f(cfa[indx + p1]) / (eps + cfa[indx] + (cfa[indx + p2]));
1145                         float crsw = xmul2f(cfa[indx - p1]) / (eps + cfa[indx] + (cfa[indx - p2]));
1146                         //colour differences in diagonal directions
1147                         float rbse, rbnw, rbne, rbsw;
1148 
1149                         //assign B/R at R/B sites
1150                         if (fabsf(1.f - crse) < arthresh) {
1151                             rbse = cfa[indx] * crse;    //use this if more precise diag interp is necessary
1152                         } else {
1153                             rbse = (cfa[indx + m1]) + xdiv2f(cfa[indx] - cfa[indx + m2]);
1154                         }
1155 
1156                         if (fabsf(1.f - crnw) < arthresh) {
1157                             rbnw = cfa[indx] * crnw;
1158                         } else {
1159                             rbnw = (cfa[indx - m1]) + xdiv2f(cfa[indx] - cfa[indx - m2]);
1160                         }
1161 
1162                         if (fabsf(1.f - crne) < arthresh) {
1163                             rbne = cfa[indx] * crne;
1164                         } else {
1165                             rbne = (cfa[indx + p1]) + xdiv2f(cfa[indx] - cfa[indx + p2]);
1166                         }
1167 
1168                         if (fabsf(1.f - crsw) < arthresh) {
1169                             rbsw = cfa[indx] * crsw;
1170                         } else {
1171                             rbsw = (cfa[indx - p1]) + xdiv2f(cfa[indx] - cfa[indx - p2]);
1172                         }
1173 
1174                         float wtse = eps + delm[indx1] + delm[(indx + m1) >> 1] + delm[(indx + m2) >> 1]; //same as for wtu,wtd,wtl,wtr
1175                         float wtnw = eps + delm[indx1] + delm[(indx - m1) >> 1] + delm[(indx - m2) >> 1];
1176                         float wtne = eps + delp[indx1] + delp[(indx + p1) >> 1] + delp[(indx + p2) >> 1];
1177                         float wtsw = eps + delp[indx1] + delp[(indx - p1) >> 1] + delp[(indx - p2) >> 1];
1178 
1179 
1180                         rbm[indx1] = (wtse * rbnw + wtnw * rbse) / (wtse + wtnw);
1181                         rbp[indx1] = (wtne * rbsw + wtsw * rbne) / (wtne + wtsw);
1182 
1183                         //variance of R-B in plus/minus directions
1184                         float rbvarm = epssq + (gausseven[0] * (Dgrbsq1m[(indx - v1) >> 1] + Dgrbsq1m[(indx - 1) >> 1] + Dgrbsq1m[(indx + 1) >> 1] + Dgrbsq1m[(indx + v1) >> 1]) +
1185                                                 gausseven[1] * (Dgrbsq1m[(indx - v2 - 1) >> 1] + Dgrbsq1m[(indx - v2 + 1) >> 1] + Dgrbsq1m[(indx - 2 - v1) >> 1] + Dgrbsq1m[(indx + 2 - v1) >> 1] +
1186                                                                 Dgrbsq1m[(indx - 2 + v1) >> 1] + Dgrbsq1m[(indx + 2 + v1) >> 1] + Dgrbsq1m[(indx + v2 - 1) >> 1] + Dgrbsq1m[(indx + v2 + 1) >> 1]));
1187                         pmwt[indx1] = rbvarm / ((epssq + (gausseven[0] * (Dgrbsq1p[(indx - v1) >> 1] + Dgrbsq1p[(indx - 1) >> 1] + Dgrbsq1p[(indx + 1) >> 1] + Dgrbsq1p[(indx + v1) >> 1]) +
1188                                                           gausseven[1] * (Dgrbsq1p[(indx - v2 - 1) >> 1] + Dgrbsq1p[(indx - v2 + 1) >> 1] + Dgrbsq1p[(indx - 2 - v1) >> 1] + Dgrbsq1p[(indx + 2 - v1) >> 1] +
1189                                                                   Dgrbsq1p[(indx - 2 + v1) >> 1] + Dgrbsq1p[(indx + 2 + v1) >> 1] + Dgrbsq1p[(indx + v2 - 1) >> 1] + Dgrbsq1p[(indx + v2 + 1) >> 1]))) + rbvarm);
1190 
1191                         //bound the interpolation in regions of high saturation
1192 
1193                         if (rbp[indx1] < cfa[indx]) {
1194                             if (xmul2f(rbp[indx1]) < cfa[indx]) {
1195                                 rbp[indx1] = median(rbp[indx1] , cfa[indx - p1], cfa[indx + p1]);
1196                             } else {
1197                                 float pwt = xmul2f(cfa[indx] - rbp[indx1]) / (eps + rbp[indx1] + cfa[indx]);
1198                                 rbp[indx1] = pwt * rbp[indx1] + (1.f - pwt) * median(rbp[indx1], cfa[indx - p1], cfa[indx + p1]);
1199                             }
1200                         }
1201 
1202                         if (rbm[indx1] < cfa[indx]) {
1203                             if (xmul2f(rbm[indx1]) < cfa[indx]) {
1204                                 rbm[indx1] = median(rbm[indx1] , cfa[indx - m1], cfa[indx + m1]);
1205                             } else {
1206                                 float mwt = xmul2f(cfa[indx] - rbm[indx1]) / (eps + rbm[indx1] + cfa[indx]);
1207                                 rbm[indx1] = mwt * rbm[indx1] + (1.f - mwt) * median(rbm[indx1], cfa[indx - m1], cfa[indx + m1]);
1208                             }
1209                         }
1210 
1211                         if (rbp[indx1] > clip_pt) {
1212                             rbp[indx1] = median(rbp[indx1], cfa[indx - p1], cfa[indx + p1]);
1213                         }
1214 
1215                         if (rbm[indx1] > clip_pt) {
1216                             rbm[indx1] = median(rbm[indx1], cfa[indx - m1], cfa[indx + m1]);
1217                         }
1218                     }
1219 
1220 #endif
1221                 }
1222 
1223 #ifdef __SSE2__
1224                 vfloat zd25v = F2V(0.25f);
1225 #endif
1226 
1227                 for (int rr = 10; rr < rr1 - 10; rr++)
1228 #ifdef __SSE2__
1229                     for (int indx = rr * ts + 10 + (fc(cfarray, rr, 2) & 1), indx1 = indx >> 1; indx < rr * ts + cc1 - 10; indx += 8, indx1 += 4) {
1230 
1231                         //first ask if one gets more directional discrimination from nearby B/R sites
1232                         vfloat pmwtaltv = zd25v * (LVFU(pmwt[(indx - m1) >> 1]) + LVFU(pmwt[(indx + p1) >> 1]) + LVFU(pmwt[(indx - p1) >> 1]) + LVFU(pmwt[(indx + m1) >> 1]));
1233                         vfloat tempv = LVFU(pmwt[indx1]);
1234                         tempv = vself(vmaskf_lt(vabsf(zd5v - tempv), vabsf(zd5v - pmwtaltv)), pmwtaltv, tempv);
1235                         STVFU(pmwt[indx1], tempv);
1236                         STVFU(rbint[indx1], zd5v * (LC2VFU(cfa[indx]) + vintpf(tempv, LVFU(rbp[indx1]), LVFU(rbm[indx1]))));
1237                     }
1238 
1239 #else
1240 
1241                     for (int cc = 10 + (fc(cfarray, rr, 2) & 1), indx = rr * ts + cc, indx1 = indx >> 1; cc < cc1 - 10; cc += 2, indx += 2, indx1++) {
1242 
1243                         //first ask if one gets more directional discrimination from nearby B/R sites
1244                         float pmwtalt = xdivf(pmwt[(indx - m1) >> 1] + pmwt[(indx + p1) >> 1] + pmwt[(indx - p1) >> 1] + pmwt[(indx + m1) >> 1], 2);
1245 
1246                         if (fabsf(0.5f - pmwt[indx1]) < fabsf(0.5f - pmwtalt)) {
1247                             pmwt[indx1] = pmwtalt;   //a better result was obtained from the neighbours
1248                         }
1249 
1250                         rbint[indx1] = xdiv2f(cfa[indx] + rbm[indx1] * (1.f - pmwt[indx1]) + rbp[indx1] * pmwt[indx1]); //this is R+B, interpolated
1251                     }
1252 
1253 #endif
1254 
1255                 for (int rr = 12; rr < rr1 - 12; rr++)
1256 #ifdef __SSE2__
1257                     for (int indx = rr * ts + 12 + (fc(cfarray, rr, 2) & 1), indx1 = indx >> 1; indx < rr * ts + cc1 - 12; indx += 8, indx1 += 4) {
1258                         vmask copymask = vmaskf_ge(vabsf(zd5v - LVFU(pmwt[indx1])), vabsf(zd5v - LVFU(hvwt[indx1])));
1259 
1260                         if(_mm_movemask_ps((vfloat)copymask)) { // if for any of the 4 pixels the condition is true, do the maths for all 4 pixels and mask the unused out at the end
1261                             //now interpolate G vertically/horizontally using R+B values
1262                             //unfortunately, since G interpolation cannot be done diagonally this may lead to colour shifts
1263                             //colour ratios for G interpolation
1264                             vfloat rbintv = LVFU(rbint[indx1]);
1265 
1266                             //interpolated G via adaptive ratios or Hamilton-Adams in each cardinal direction
1267                             vfloat cruv = vmul2f(LC2VFU(cfa[indx - v1])) / (epsv + rbintv + LVFU(rbint[(indx1 - v1)]));
1268                             vfloat guv = rbintv * cruv;
1269                             vfloat gu2v = LC2VFU(cfa[indx - v1]) + zd5v * (rbintv - LVFU(rbint[(indx1 - v1)]));
1270                             guv = vself(vmaskf_lt(vabsf(onev - cruv), arthreshv), guv, gu2v);
1271 
1272                             vfloat crdv = vmul2f(LC2VFU(cfa[indx + v1])) / (epsv + rbintv + LVFU(rbint[(indx1 + v1)]));
1273                             vfloat gdv = rbintv * crdv;
1274                             vfloat gd2v = LC2VFU(cfa[indx + v1]) + zd5v * (rbintv - LVFU(rbint[(indx1 + v1)]));
1275                             gdv = vself(vmaskf_lt(vabsf(onev - crdv), arthreshv), gdv, gd2v);
1276 
1277                             vfloat Gintvv = (LC2VFU(dirwts0[indx - v1]) * gdv + LC2VFU(dirwts0[indx + v1]) * guv) / (LC2VFU(dirwts0[indx + v1]) + LC2VFU(dirwts0[indx - v1]));
1278                             vfloat Gint1v = median(Gintvv , LC2VFU(cfa[indx - v1]), LC2VFU(cfa[indx + v1]));
1279                             vfloat vwtv = vmul2f(rbintv - Gintvv) / (epsv + Gintvv + rbintv);
1280                             vfloat Gint2v = vintpf(vwtv, Gintvv, Gint1v);
1281                             Gint1v = vself(vmaskf_lt(vmul2f(Gintvv), rbintv), Gint1v, Gint2v);
1282                             Gintvv = vself(vmaskf_lt(Gintvv, rbintv), Gint1v, Gintvv);
1283                             Gintvv = vself(vmaskf_gt(Gintvv, clip_ptv), median(Gintvv, LC2VFU(cfa[indx - v1]), LC2VFU(cfa[indx + v1])), Gintvv);
1284 
1285                             vfloat crlv = vmul2f(LC2VFU(cfa[indx - 1])) / (epsv + rbintv + LVFU(rbint[(indx1 - 1)]));
1286                             vfloat glv = rbintv * crlv;
1287                             vfloat gl2v = LC2VFU(cfa[indx - 1]) + zd5v * (rbintv - LVFU(rbint[(indx1 - 1)]));
1288                             glv = vself(vmaskf_lt(vabsf(onev - crlv), arthreshv), glv, gl2v);
1289 
1290                             vfloat crrv = vmul2f(LC2VFU(cfa[indx + 1])) / (epsv + rbintv + LVFU(rbint[(indx1 + 1)]));
1291                             vfloat grv = rbintv * crrv;
1292                             vfloat gr2v = LC2VFU(cfa[indx + 1]) + zd5v * (rbintv - LVFU(rbint[(indx1 + 1)]));
1293                             grv = vself(vmaskf_lt(vabsf(onev - crrv), arthreshv), grv, gr2v);
1294 
1295                             vfloat Ginthv = (LC2VFU(dirwts1[indx - 1]) * grv + LC2VFU(dirwts1[indx + 1]) * glv) / (LC2VFU(dirwts1[indx - 1]) + LC2VFU(dirwts1[indx + 1]));
1296                             vfloat Gint1h = median(Ginthv , LC2VFU(cfa[indx - 1]), LC2VFU(cfa[indx + 1]));
1297                             vfloat hwtv = vmul2f(rbintv - Ginthv) / (epsv + Ginthv + rbintv);
1298                             vfloat Gint2h = vintpf(hwtv, Ginthv, Gint1h);
1299                             Gint1h = vself(vmaskf_lt(vmul2f(Ginthv), rbintv), Gint1h, Gint2h);
1300                             Ginthv = vself(vmaskf_lt(Ginthv, rbintv), Gint1h, Ginthv);
1301                             Ginthv = vself(vmaskf_gt(Ginthv, clip_ptv), median(Ginthv, LC2VFU(cfa[indx - 1]), LC2VFU(cfa[indx + 1])), Ginthv);
1302 
1303                             vfloat greenv = vself(copymask, vintpf(LVFU(hvwt[indx1]), Gintvv, Ginthv), LC2VFU(rgbgreen[indx]));
1304                             STC2VFU(rgbgreen[indx], greenv);
1305 
1306                             STVFU(Dgrb[0][indx1], vself(copymask, greenv - LC2VFU(cfa[indx]), LVFU(Dgrb[0][indx1])));
1307                         }
1308                     }
1309 
1310 #else
1311 
1312                     for (int cc = 12 + (fc(cfarray, rr, 2) & 1), indx = rr * ts + cc, indx1 = indx >> 1; cc < cc1 - 12; cc += 2, indx += 2, indx1++) {
1313 
1314                         if (fabsf(0.5f - pmwt[indx >> 1]) < fabsf(0.5f - hvwt[indx >> 1]) ) {
1315                             continue;
1316                         }
1317 
1318                         //now interpolate G vertically/horizontally using R+B values
1319                         //unfortunately, since G interpolation cannot be done diagonally this may lead to colour shifts
1320 
1321                         //colour ratios for G interpolation
1322                         float cru = cfa[indx - v1] * 2.0 / (eps + rbint[indx1] + rbint[(indx1 - v1)]);
1323                         float crd = cfa[indx + v1] * 2.0 / (eps + rbint[indx1] + rbint[(indx1 + v1)]);
1324                         float crl = cfa[indx - 1] * 2.0 / (eps + rbint[indx1] + rbint[(indx1 - 1)]);
1325                         float crr = cfa[indx + 1] * 2.0 / (eps + rbint[indx1] + rbint[(indx1 + 1)]);
1326 
1327                         //interpolation of G in four directions
1328                         float gu, gd, gl, gr;
1329 
1330                         //interpolated G via adaptive ratios or Hamilton-Adams in each cardinal direction
1331                         if (fabsf(1.f - cru) < arthresh) {
1332                             gu = rbint[indx1] * cru;
1333                         } else {
1334                             gu = cfa[indx - v1] + xdiv2f(rbint[indx1] - rbint[(indx1 - v1)]);
1335                         }
1336 
1337                         if (fabsf(1.f - crd) < arthresh) {
1338                             gd = rbint[indx1] * crd;
1339                         } else {
1340                             gd = cfa[indx + v1] + xdiv2f(rbint[indx1] - rbint[(indx1 + v1)]);
1341                         }
1342 
1343                         if (fabsf(1.f - crl) < arthresh) {
1344                             gl = rbint[indx1] * crl;
1345                         } else {
1346                             gl = cfa[indx - 1] + xdiv2f(rbint[indx1] - rbint[(indx1 - 1)]);
1347                         }
1348 
1349                         if (fabsf(1.f - crr) < arthresh) {
1350                             gr = rbint[indx1] * crr;
1351                         } else {
1352                             gr = cfa[indx + 1] + xdiv2f(rbint[indx1] - rbint[(indx1 + 1)]);
1353                         }
1354 
1355                         //interpolated G via adaptive weights of cardinal evaluations
1356                         float Gintv = (dirwts0[indx - v1] * gd + dirwts0[indx + v1] * gu) / (dirwts0[indx + v1] + dirwts0[indx - v1]);
1357                         float Ginth = (dirwts1[indx - 1] * gr + dirwts1[indx + 1] * gl) / (dirwts1[indx - 1] + dirwts1[indx + 1]);
1358 
1359                         //bound the interpolation in regions of high saturation
1360                         if (Gintv < rbint[indx1]) {
1361                             if (2 * Gintv < rbint[indx1]) {
1362                                 Gintv = median(Gintv , cfa[indx - v1], cfa[indx + v1]);
1363                             } else {
1364                                 float vwt = 2.0 * (rbint[indx1] - Gintv) / (eps + Gintv + rbint[indx1]);
1365                                 Gintv = vwt * Gintv + (1.f - vwt) * median(Gintv, cfa[indx - v1], cfa[indx + v1]);
1366                             }
1367                         }
1368 
1369                         if (Ginth < rbint[indx1]) {
1370                             if (2 * Ginth < rbint[indx1]) {
1371                                 Ginth = median(Ginth , cfa[indx - 1], cfa[indx + 1]);
1372                             } else {
1373                                 float hwt = 2.0 * (rbint[indx1] - Ginth) / (eps + Ginth + rbint[indx1]);
1374                                 Ginth = hwt * Ginth + (1.f - hwt) * median(Ginth, cfa[indx - 1], cfa[indx + 1]);
1375                             }
1376                         }
1377 
1378                         if (Ginth > clip_pt) {
1379                             Ginth = median(Ginth, cfa[indx - 1], cfa[indx + 1]);
1380                         }
1381 
1382                         if (Gintv > clip_pt) {
1383                             Gintv = median(Gintv, cfa[indx - v1], cfa[indx + v1]);
1384                         }
1385 
1386                         rgbgreen[indx] = Ginth * (1.f - hvwt[indx1]) + Gintv * hvwt[indx1];
1387                         Dgrb[0][indx >> 1] = rgbgreen[indx] - cfa[indx];
1388                     }
1389 
1390 #endif
1391 
1392                 //end of diagonal interpolation correction
1393 
1394                 //fancy chrominance interpolation
1395                 //(ey,ex) is location of R site
1396                 for (int rr = 13 - ey; rr < rr1 - 12; rr += 2)
1397                     for (int indx1 = (rr * ts + 13 - ex) >> 1; indx1 < (rr * ts + cc1 - 12) >> 1; indx1++) { //B coset
1398                         Dgrb[1][indx1] = Dgrb[0][indx1]; //split out G-B from G-R
1399                         Dgrb[0][indx1] = 0;
1400                     }
1401 
1402 #ifdef __SSE2__
1403                 vfloat oned325v = F2V( 1.325f );
1404                 vfloat zd175v = F2V( 0.175f );
1405                 vfloat zd075v = F2V( 0.075f );
1406 #endif
1407 
1408                 for (int rr = 14; rr < rr1 - 14; rr++)
1409 #ifdef __SSE2__
1410                     for (int cc = 14 + (fc(cfarray, rr, 2) & 1), indx = rr * ts + cc, c = 1 - fc(cfarray, rr, cc) / 2; cc < cc1 - 14; cc += 8, indx += 8) {
1411                         vfloat tempv = epsv + vabsf(LVFU(Dgrb[c][(indx - m1) >> 1]) - LVFU(Dgrb[c][(indx + m1) >> 1]));
1412                         vfloat temp2v = epsv + vabsf(LVFU(Dgrb[c][(indx + p1) >> 1]) - LVFU(Dgrb[c][(indx - p1) >> 1]));
1413                         vfloat wtnwv = onev / (tempv + vabsf(LVFU(Dgrb[c][(indx - m1) >> 1]) - LVFU(Dgrb[c][(indx - m3) >> 1])) + vabsf(LVFU(Dgrb[c][(indx + m1) >> 1]) - LVFU(Dgrb[c][(indx - m3) >> 1])));
1414                         vfloat wtnev = onev / (temp2v + vabsf(LVFU(Dgrb[c][(indx + p1) >> 1]) - LVFU(Dgrb[c][(indx + p3) >> 1])) + vabsf(LVFU(Dgrb[c][(indx - p1) >> 1]) - LVFU(Dgrb[c][(indx + p3) >> 1])));
1415                         vfloat wtswv = onev / (temp2v + vabsf(LVFU(Dgrb[c][(indx - p1) >> 1]) - LVFU(Dgrb[c][(indx + m3) >> 1])) + vabsf(LVFU(Dgrb[c][(indx + p1) >> 1]) - LVFU(Dgrb[c][(indx - p3) >> 1])));
1416                         vfloat wtsev = onev / (tempv + vabsf(LVFU(Dgrb[c][(indx + m1) >> 1]) - LVFU(Dgrb[c][(indx - p3) >> 1])) + vabsf(LVFU(Dgrb[c][(indx - m1) >> 1]) - LVFU(Dgrb[c][(indx + m3) >> 1])));
1417 
1418                         STVFU(Dgrb[c][indx >> 1], (wtnwv * (oned325v * LVFU(Dgrb[c][(indx - m1) >> 1]) - zd175v * LVFU(Dgrb[c][(indx - m3) >> 1]) - zd075v * (LVFU(Dgrb[c][(indx - m1 - 2) >> 1]) + LVFU(Dgrb[c][(indx - m1 - v2) >> 1])) ) +
1419                                                    wtnev * (oned325v * LVFU(Dgrb[c][(indx + p1) >> 1]) - zd175v * LVFU(Dgrb[c][(indx + p3) >> 1]) - zd075v * (LVFU(Dgrb[c][(indx + p1 + 2) >> 1]) + LVFU(Dgrb[c][(indx + p1 + v2) >> 1])) ) +
1420                                                    wtswv * (oned325v * LVFU(Dgrb[c][(indx - p1) >> 1]) - zd175v * LVFU(Dgrb[c][(indx - p3) >> 1]) - zd075v * (LVFU(Dgrb[c][(indx - p1 - 2) >> 1]) + LVFU(Dgrb[c][(indx - p1 - v2) >> 1])) ) +
1421                                                    wtsev * (oned325v * LVFU(Dgrb[c][(indx + m1) >> 1]) - zd175v * LVFU(Dgrb[c][(indx + m3) >> 1]) - zd075v * (LVFU(Dgrb[c][(indx + m1 + 2) >> 1]) + LVFU(Dgrb[c][(indx + m1 + v2) >> 1])) )) / (wtnwv + wtnev + wtswv + wtsev));
1422                     }
1423 
1424 #else
1425 
1426                     for (int cc = 14 + (fc(cfarray, rr, 2) & 1), indx = rr * ts + cc, c = 1 - fc(cfarray, rr, cc) / 2; cc < cc1 - 14; cc += 2, indx += 2) {
1427                         float wtnw = 1.f / (eps + fabsf(Dgrb[c][(indx - m1) >> 1] - Dgrb[c][(indx + m1) >> 1]) + fabsf(Dgrb[c][(indx - m1) >> 1] - Dgrb[c][(indx - m3) >> 1]) + fabsf(Dgrb[c][(indx + m1) >> 1] - Dgrb[c][(indx - m3) >> 1]));
1428                         float wtne = 1.f / (eps + fabsf(Dgrb[c][(indx + p1) >> 1] - Dgrb[c][(indx - p1) >> 1]) + fabsf(Dgrb[c][(indx + p1) >> 1] - Dgrb[c][(indx + p3) >> 1]) + fabsf(Dgrb[c][(indx - p1) >> 1] - Dgrb[c][(indx + p3) >> 1]));
1429                         float wtsw = 1.f / (eps + fabsf(Dgrb[c][(indx - p1) >> 1] - Dgrb[c][(indx + p1) >> 1]) + fabsf(Dgrb[c][(indx - p1) >> 1] - Dgrb[c][(indx + m3) >> 1]) + fabsf(Dgrb[c][(indx + p1) >> 1] - Dgrb[c][(indx - p3) >> 1]));
1430                         float wtse = 1.f / (eps + fabsf(Dgrb[c][(indx + m1) >> 1] - Dgrb[c][(indx - m1) >> 1]) + fabsf(Dgrb[c][(indx + m1) >> 1] - Dgrb[c][(indx - p3) >> 1]) + fabsf(Dgrb[c][(indx - m1) >> 1] - Dgrb[c][(indx + m3) >> 1]));
1431 
1432                         Dgrb[c][indx >> 1] = (wtnw * (1.325f * Dgrb[c][(indx - m1) >> 1] - 0.175f * Dgrb[c][(indx - m3) >> 1] - 0.075f * Dgrb[c][(indx - m1 - 2) >> 1] - 0.075f * Dgrb[c][(indx - m1 - v2) >> 1] ) +
1433                                               wtne * (1.325f * Dgrb[c][(indx + p1) >> 1] - 0.175f * Dgrb[c][(indx + p3) >> 1] - 0.075f * Dgrb[c][(indx + p1 + 2) >> 1] - 0.075f * Dgrb[c][(indx + p1 + v2) >> 1] ) +
1434                                               wtsw * (1.325f * Dgrb[c][(indx - p1) >> 1] - 0.175f * Dgrb[c][(indx - p3) >> 1] - 0.075f * Dgrb[c][(indx - p1 - 2) >> 1] - 0.075f * Dgrb[c][(indx - p1 - v2) >> 1] ) +
1435                                               wtse * (1.325f * Dgrb[c][(indx + m1) >> 1] - 0.175f * Dgrb[c][(indx + m3) >> 1] - 0.075f * Dgrb[c][(indx + m1 + 2) >> 1] - 0.075f * Dgrb[c][(indx + m1 + v2) >> 1] )) / (wtnw + wtne + wtsw + wtse);
1436                     }
1437 
1438 #endif
1439 
1440 #ifdef __SSE2__
1441                 int offset;
1442                 vfloat twov = F2V(2.f);
1443                 vmask selmask;
1444 
1445                 if((fc(cfarray, 16, 2) & 1) == 1) {
1446                     selmask = _mm_set_epi32(0xffffffff, 0, 0xffffffff, 0);
1447                     offset = 1;
1448                 } else {
1449                     selmask = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);
1450                     offset = 0;
1451                 }
1452 
1453 #endif
1454 
1455                 for (int rr = 16; rr < rr1 - 16; rr++) {
1456                     int row = rr + top;
1457                     int col = left + 16;
1458                     int indx = rr * ts + 16;
1459 #ifdef __SSE2__
1460                     offset = 1 - offset;
1461                     selmask = vnotm(selmask);
1462 
1463                     for (; indx < rr * ts + cc1 - 18 - (cc1 & 1); indx += 4, col += 4) {
1464                         vfloat greenv = LVF(rgbgreen[indx]);
1465                         vfloat temp00v = vdup(LVFU(hvwt[(indx - v1) >> 1]));
1466                         vfloat temp01v = vdup(LVFU(hvwt[(indx + v1) >> 1]));
1467                         vfloat tempv =  onev / (temp00v + twov - vdup(LVFU(hvwt[(indx + 1 + offset) >> 1])) - vdup(LVFU(hvwt[(indx - 1 + offset) >> 1])) + temp01v);
1468 
1469                         vfloat redv1  = greenv - (temp00v * vdup(LVFU(Dgrb[0][(indx - v1) >> 1])) + (onev - vdup(LVFU(hvwt[(indx + 1 + offset) >> 1]))) * vdup(LVFU(Dgrb[0][(indx + 1 + offset) >> 1])) + (onev - vdup(LVFU(hvwt[(indx - 1 + offset) >> 1]))) * vdup(LVFU(Dgrb[0][(indx - 1 + offset) >> 1])) + temp01v * vdup(LVFU(Dgrb[0][(indx + v1) >> 1]))) * tempv;
1470                         vfloat bluev1 = greenv - (temp00v * vdup(LVFU(Dgrb[1][(indx - v1) >> 1])) + (onev - vdup(LVFU(hvwt[(indx + 1 + offset) >> 1]))) * vdup(LVFU(Dgrb[1][(indx + 1 + offset) >> 1])) + (onev - vdup(LVFU(hvwt[(indx - 1 + offset) >> 1]))) * vdup(LVFU(Dgrb[1][(indx - 1 + offset) >> 1])) + temp01v * vdup(LVFU(Dgrb[1][(indx + v1) >> 1]))) * tempv;
1471                         vfloat redv2  = greenv - vdup(LVFU(Dgrb[0][indx >> 1]));
1472                         vfloat bluev2 = greenv - vdup(LVFU(Dgrb[1][indx >> 1]));
1473                         STVFU(red[row][col], vmaxf(c65535v * vself(selmask, redv1, redv2), ZEROV));
1474                         STVFU(blue[row][col], vmaxf(c65535v * vself(selmask, bluev1, bluev2), ZEROV));
1475                     }
1476 
1477                     if(offset == 0) {
1478                         for (; indx < rr * ts + cc1 - 16 - (cc1 & 1); indx++, col++) {
1479                             float temp =  1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
1480                             red[row][col] = std::max(0.f, 65535.f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) *
1481                                                         temp));
1482                             blue[row][col] = std::max(0.f, 65535.f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) *
1483                                                          temp));
1484 
1485                             indx++;
1486                             col++;
1487                             red[row][col] = std::max(0.f, 65535.f * (rgbgreen[indx] - Dgrb[0][indx >> 1]));
1488                             blue[row][col] = std::max(0.f, 65535.f * (rgbgreen[indx] - Dgrb[1][indx >> 1]));
1489                         }
1490 
1491                         if(cc1 & 1) { // width of tile is odd
1492                             float temp =  1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
1493                             red[row][col] = std::max(0.f, 65535.f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) *
1494                                                         temp));
1495                             blue[row][col] = std::max(0.f, 65535.f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) *
1496                                                          temp));
1497                         }
1498                     } else {
1499                         for (; indx < rr * ts + cc1 - 16 - (cc1 & 1); indx++, col++) {
1500                             red[row][col] = std::max(0.f, 65535.f * (rgbgreen[indx] - Dgrb[0][indx >> 1]));
1501                             blue[row][col] = std::max(0.f, 65535.f * (rgbgreen[indx] - Dgrb[1][indx >> 1]));
1502 
1503                             indx++;
1504                             col++;
1505                             float temp =  1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
1506                             red[row][col] = std::max(0.f, 65535.f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) *
1507                                                         temp));
1508                             blue[row][col] = std::max(0.f, 65535.f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) *
1509                                                          temp));
1510                         }
1511 
1512                         if(cc1 & 1) { // width of tile is odd
1513                             red[row][col] = std::max(0.f, 65535.f * (rgbgreen[indx] - Dgrb[0][indx >> 1]));
1514                             blue[row][col] = std::max(0.f, 65535.f * (rgbgreen[indx] - Dgrb[1][indx >> 1]));
1515                         }
1516                     }
1517 
1518 #else
1519 
1520                     if((fc(cfarray, rr, 2) & 1) == 1) {
1521                         for (; indx < rr * ts + cc1 - 16 - (cc1 & 1); indx++, col++) {
1522                             float temp =  1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
1523                             red[row][col] = std::max(0.f, 65535.f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) *
1524                                                         temp));
1525                             blue[row][col] = std::max(0.f, 65535.f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) *
1526                                                          temp));
1527 
1528                             indx++;
1529                             col++;
1530                             red[row][col] = std::max(0.f, 65535.f * (rgbgreen[indx] - Dgrb[0][indx >> 1]));
1531                             blue[row][col] = std::max(0.f, 65535.f * (rgbgreen[indx] - Dgrb[1][indx >> 1]));
1532                         }
1533 
1534                         if(cc1 & 1) { // width of tile is odd
1535                             float temp =  1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
1536                             red[row][col] = std::max(0.f, 65535.f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) *
1537                                                         temp));
1538                             blue[row][col] = std::max(0.f, 65535.f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) *
1539                                                          temp));
1540                         }
1541                     } else {
1542                         for (; indx < rr * ts + cc1 - 16 - (cc1 & 1); indx++, col++) {
1543                             red[row][col] = std::max(0.f, 65535.f * (rgbgreen[indx] - Dgrb[0][indx >> 1]));
1544                             blue[row][col] = std::max(0.f, 65535.f * (rgbgreen[indx] - Dgrb[1][indx >> 1]));
1545 
1546                             indx++;
1547                             col++;
1548                             float temp =  1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
1549                             red[row][col] = std::max(0.f, 65535.f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) *
1550                                                         temp));
1551                             blue[row][col] = std::max(0.f, 65535.f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) *
1552                                                          temp));
1553                         }
1554 
1555                         if(cc1 & 1) { // width of tile is odd
1556                             red[row][col] = std::max(0.f, 65535.f * (rgbgreen[indx] - Dgrb[0][indx >> 1]));
1557                             blue[row][col] = std::max(0.f, 65535.f * (rgbgreen[indx] - Dgrb[1][indx >> 1]));
1558                         }
1559                     }
1560 
1561 #endif
1562                 }
1563 
1564                 // copy smoothed results back to image matrix
1565                 for (int rr = 16; rr < rr1 - 16; rr++) {
1566                     int row = rr + top;
1567                     int cc = 16;
1568 #ifdef __SSE2__
1569 
1570                     for (; cc < cc1 - 19; cc += 4) {
1571                         STVFU(green[row][cc + left], vmaxf(LVF(rgbgreen[rr * ts + cc]) * c65535v, ZEROV));
1572                     }
1573 
1574 #endif
1575 
1576                     for (; cc < cc1 - 16; cc++) {
1577                         green[row][cc + left] = std::max(0.f, 65535.f * rgbgreen[rr * ts + cc]);
1578                     }
1579                 }
1580 
1581                 if(plistener) {
1582                     progresscounter++;
1583 
1584                     if(progresscounter % 32 == 0) {
1585 #ifdef _OPENMP
1586                         #pragma omp critical (amazeprogress)
1587 #endif
1588                         {
1589                             progress += (double)32 * ((ts - 32) * (ts - 32)) / (height * width);
1590                             progress = progress > 1.0 ? 1.0 : progress;
1591                             plistener->setProgress(progress);
1592                         }
1593                     }
1594                 }
1595             }
1596         }  //end of main loop
1597 
1598         // clean up
1599         free(buffer);
1600     }
1601     if(border < 4) {
1602         border_interpolate(W, H, 3, rawData, red, green, blue);
1603     }
1604 
1605     if(plistener) {
1606         plistener->setProgress(1.0);
1607     }
1608 
1609 }
1610 }
1611