1 /*
2     This file is part of darktable,
3     Copyright (C) 2011-2020 darktable developers.
4 
5     darktable is free software: you can redistribute it and/or modify
6     it under the terms of the GNU General Public License as published by
7     the Free Software Foundation, either version 3 of the License, or
8     (at your option) any later version.
9 
10     darktable is distributed in the hope that it will be useful,
11     but WITHOUT ANY WARRANTY; without even the implied warranty of
12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13     GNU General Public License for more details.
14 
15     You should have received a copy of the GNU General Public License
16     along with darktable.  If not, see <http://www.gnu.org/licenses/>.
17 */
18 
19 #define __STDC_FORMAT_MACROS
20 
21 #if defined(__SSE__)
22 #include <xmmintrin.h>
23 #endif
24 
25 extern "C" {
26 #include "develop/imageop.h"
27 #include "develop/imageop_math.h"
28 
29 // otherwise the name will be mangled and the linker won't be able to see the function ...
30 void amaze_demosaic_RT(
31     dt_dev_pixelpipe_iop_t *piece,
32     const float *const in,
33     float *out,
34     const dt_iop_roi_t *const roi_in,
35     const dt_iop_roi_t *const roi_out,
36     const int filters);
37 }
38 
39 #include <algorithm>
40 #include <cmath>
41 #include <cstdint>
42 #include <cstdlib>
43 #include <cstring>
44 
clampnan(const float x,const float m,const float M)45 static __inline float clampnan(const float x, const float m, const float M)
46 {
47   float r;
48 
49   // clamp to [m, M] if x is infinite; return average of m and M if x is NaN; else just return x
50 
51   if(std::isinf(x))
52     r = (std::isless(x, m) ? m : (std::isgreater(x, M) ? M : x));
53   else if(std::isnan(x))
54     r = (m + M) / 2.0f;
55   else // normal number
56     r = x;
57 
58   return r;
59 }
60 
61 #ifndef __SSE2__
xmul2f(float d)62 static __inline float xmul2f(float d)
63 {
64   union {
65       float f;
66       uint32_t u;
67   } x;
68   x.f = d;
69   if(x.u & 0x7FFFFFFF) // if f==0 do nothing
70   {
71     x.u += 1 << 23; // add 1 to the exponent
72   }
73   return x.f;
74 }
75 #endif
76 
xdiv2f(float d)77 static __inline float xdiv2f(float d)
78 {
79   union {
80       float f;
81       uint32_t u;
82   } x;
83   x.f = d;
84   if(x.u & 0x7FFFFFFF) // if f==0 do nothing
85   {
86     x.u -= 1 << 23; // sub 1 from the exponent
87   }
88   return x.f;
89 }
90 
xdivf(float d,int n)91 static __inline float xdivf(float d, int n)
92 {
93   union {
94       float f;
95       uint32_t u;
96   } x;
97   x.f = d;
98   if(x.u & 0x7FFFFFFF) // if f==0 do nothing
99   {
100     x.u -= n << 23; // add n to the exponent
101   }
102   return x.f;
103 }
104 
105 
106 /*==================================================================================
107  * begin raw therapee code, hg checkout of march 03, 2016 branch master.
108  *==================================================================================*/
109 
110 
111 #ifdef __SSE2__
112 
113 #ifdef __GNUC__
114 #define INLINE __inline
115 #else
116 #define INLINE inline
117 #endif
118 
119 #ifdef __GNUC__
120 #if((__GNUC__ == 4 && __GNUC_MINOR__ >= 9) || __GNUC__ > 4) && (!defined(WIN32) || defined(__x86_64__))
121 #define LVF(x) _mm_load_ps(&x)
122 #define LVFU(x) _mm_loadu_ps(&x)
123 #define STVF(x, y) _mm_store_ps(&x, y)
124 #define STVFU(x, y) _mm_storeu_ps(&x, y)
125 #else // there is a bug in gcc 4.7.x when using openmp and aligned memory and -O3, also need to map the
126       // aligned functions to unaligned functions for WIN32 builds
127 #define LVF(x) _mm_loadu_ps(&x)
128 #define LVFU(x) _mm_loadu_ps(&x)
129 #define STVF(x, y) _mm_storeu_ps(&x, y)
130 #define STVFU(x, y) _mm_storeu_ps(&x, y)
131 #endif
132 #else
133 #define LVF(x) _mm_load_ps(&x)
134 #define LVFU(x) _mm_loadu_ps(&x)
135 #define STVF(x, y) _mm_store_ps(&x, y)
136 #define STVFU(x, y) _mm_storeu_ps(&x, y)
137 #endif
138 
139 #define STC2VFU(a, v)                                                                                        \
140   {                                                                                                          \
141     __m128 TST1V = _mm_loadu_ps(&a);                                                                         \
142     __m128 TST2V = _mm_unpacklo_ps(v, v);                                                                    \
143     vmask cmask = _mm_set_epi32(0xffffffff, 0, 0xffffffff, 0);                                               \
144     _mm_storeu_ps(&a, vself(cmask, TST1V, TST2V));                                                           \
145     TST1V = _mm_loadu_ps((&a) + 4);                                                                          \
146     TST2V = _mm_unpackhi_ps(v, v);                                                                           \
147     _mm_storeu_ps((&a) + 4, vself(cmask, TST1V, TST2V));                                                     \
148   }
149 
150 #define ZEROV _mm_setzero_ps()
151 #define F2V(a) _mm_set1_ps((a))
152 
153 typedef __m128i vmask;
154 typedef __m128 vfloat;
155 typedef __m128i vint;
156 
LC2VFU(float & a)157 static INLINE vfloat LC2VFU(float &a)
158 {
159   // Load 8 floats from a and combine a[0],a[2],a[4] and a[6] into a vector of 4 floats
160   vfloat a1 = _mm_loadu_ps(&a);
161   vfloat a2 = _mm_loadu_ps((&a) + 4);
162   return _mm_shuffle_ps(a1, a2, _MM_SHUFFLE(2, 0, 2, 0));
163 }
vmaxf(vfloat x,vfloat y)164 static INLINE vfloat vmaxf(vfloat x, vfloat y)
165 {
166   return _mm_max_ps(x, y);
167 }
vminf(vfloat x,vfloat y)168 static INLINE vfloat vminf(vfloat x, vfloat y)
169 {
170   return _mm_min_ps(x, y);
171 }
vcast_vf_f(float f)172 static INLINE vfloat vcast_vf_f(float f)
173 {
174   return _mm_set_ps(f, f, f, f);
175 }
vorm(vmask x,vmask y)176 static INLINE vmask vorm(vmask x, vmask y)
177 {
178   return _mm_or_si128(x, y);
179 }
vandm(vmask x,vmask y)180 static INLINE vmask vandm(vmask x, vmask y)
181 {
182   return _mm_and_si128(x, y);
183 }
vandnotm(vmask x,vmask y)184 static INLINE vmask vandnotm(vmask x, vmask y)
185 {
186   return _mm_andnot_si128(x, y);
187 }
vabsf(vfloat f)188 static INLINE vfloat vabsf(vfloat f)
189 {
190   return (vfloat)vandnotm((vmask)vcast_vf_f(-0.0f), (vmask)f);
191 }
vself(vmask mask,vfloat x,vfloat y)192 static INLINE vfloat vself(vmask mask, vfloat x, vfloat y)
193 {
194   return (vfloat)vorm(vandm(mask, (vmask)x), vandnotm(mask, (vmask)y));
195 }
vmaskf_lt(vfloat x,vfloat y)196 static INLINE vmask vmaskf_lt(vfloat x, vfloat y)
197 {
198   return (__m128i)_mm_cmplt_ps(x, y);
199 }
vmaskf_gt(vfloat x,vfloat y)200 static INLINE vmask vmaskf_gt(vfloat x, vfloat y)
201 {
202   return (__m128i)_mm_cmpgt_ps(x, y);
203 }
ULIMV(vfloat a,vfloat b,vfloat c)204 static INLINE vfloat ULIMV(vfloat a, vfloat b, vfloat c)
205 {
206   // made to clamp a in range [b,c] but in fact it's also the median of a,b,c, which means that the result is
207   // independent on order of arguments
208   // ULIMV(a,b,c) = ULIMV(a,c,b) = ULIMV(b,a,c) = ULIMV(b,c,a) = ULIMV(c,a,b) = ULIMV(c,b,a)
209   return vmaxf(vminf(a, b), vminf(vmaxf(a, b), c));
210 }
SQRV(vfloat a)211 static INLINE vfloat SQRV(vfloat a)
212 {
213   return a * a;
214 }
vintpf(vfloat a,vfloat b,vfloat c)215 static INLINE vfloat vintpf(vfloat a, vfloat b, vfloat c)
216 {
217   // calculate a * b + (1 - a) * c (interpolate two values)
218   // following is valid:
219   // vintpf(a, b+x, c+x) = vintpf(a, b, c) + x
220   // vintpf(a, b*x, c*x) = vintpf(a, b, c) * x
221   return a * (b - c) + c;
222 }
vaddc2vfu(float & a)223 static INLINE vfloat vaddc2vfu(float &a)
224 {
225   // loads a[0]..a[7] and returns { a[0]+a[1], a[2]+a[3], a[4]+a[5], a[6]+a[7] }
226   vfloat a1 = _mm_loadu_ps(&a);
227   vfloat a2 = _mm_loadu_ps((&a) + 4);
228   return _mm_shuffle_ps(a1, a2, _MM_SHUFFLE(2, 0, 2, 0)) + _mm_shuffle_ps(a1, a2, _MM_SHUFFLE(3, 1, 3, 1));
229 }
vadivapb(vfloat a,vfloat b)230 static INLINE vfloat vadivapb(vfloat a, vfloat b)
231 {
232   return a / (a + b);
233 }
vselc(vmask mask,vint x,vint y)234 static INLINE vint vselc(vmask mask, vint x, vint y)
235 {
236   return vorm(vandm(mask, (vmask)x), vandnotm(mask, (vmask)y));
237 }
vselinotzero(vmask mask,vint x)238 static INLINE vint vselinotzero(vmask mask, vint x)
239 {
240   // returns value of x if corresponding mask bits are 0, else returns 0
241   // faster than vselc(mask, ZEROV, x)
242   return _mm_andnot_si128(mask, x);
243 }
vmul2f(vfloat a)244 static INLINE vfloat vmul2f(vfloat a)
245 {
246   // fastest way to multiply by 2
247   return a + a;
248 }
vmaskf_ge(vfloat x,vfloat y)249 static INLINE vmask vmaskf_ge(vfloat x, vfloat y)
250 {
251   return (__m128i)_mm_cmpge_ps(x, y);
252 }
vnotm(vmask x)253 static INLINE vmask vnotm(vmask x)
254 {
255   return _mm_xor_si128(x, _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()));
256 }
vdup(vfloat a)257 static INLINE vfloat vdup(vfloat a)
258 {
259   // returns { a[0],a[0],a[1],a[1] }
260   return _mm_unpacklo_ps(a, a);
261 }
262 
263 #endif // __SSE2__
264 
SQR(_Tp x)265 template <typename _Tp> static inline const _Tp SQR(_Tp x)
266 {
267   //      return std::pow(x,2); Slower than:
268   return (x * x);
269 }
270 
intp(const _Tp a,const _Tp b,const _Tp c)271 template <typename _Tp> static inline const _Tp intp(const _Tp a, const _Tp b, const _Tp c)
272 {
273   // calculate a * b + (1 - a) * c
274   // following is valid:
275   // intp(a, b+x, c+x) = intp(a, b, c) + x
276   // intp(a, b*x, c*x) = intp(a, b, c) * x
277   return a * (b - c) + c;
278 }
279 
LIM(const _Tp a,const _Tp b,const _Tp c)280 template <typename _Tp> static inline const _Tp LIM(const _Tp a, const _Tp b, const _Tp c)
281 {
282   return std::max(b, std::min(a, c));
283 }
284 
ULIM(const _Tp a,const _Tp b,const _Tp c)285 template <typename _Tp> static inline const _Tp ULIM(const _Tp a, const _Tp b, const _Tp c)
286 {
287   return ((b < c) ? LIM(a, b, c) : LIM(a, c, b));
288 }
289 
290 
291 
292 ////////////////////////////////////////////////////////////////
293 //
294 //          AMaZE demosaic algorithm
295 // (Aliasing Minimization and Zipper Elimination)
296 //
297 //  copyright (c) 2008-2010  Emil Martinec <ejmartin@uchicago.edu>
298 //  optimized for speed by Ingo Weyrich
299 //
300 // incorporating ideas of Luis Sanz Rodrigues and Paul Lee
301 //
302 // code dated: May 27, 2010
303 // latest modification: Ingo Weyrich, January 25, 2016
304 //
305 //  amaze_interpolate_RT.cc is free software: you can redistribute it and/or modify
306 //  it under the terms of the GNU General Public License as published by
307 //  the Free Software Foundation, either version 3 of the License, or
308 //  (at your option) any later version.
309 //
310 //  This program is distributed in the hope that it will be useful,
311 //  but WITHOUT ANY WARRANTY; without even the implied warranty of
312 //  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
313 //  GNU General Public License for more details.
314 //
315 //  You should have received a copy of the GNU General Public License
316 //  along with this program.  If not, see <http://www.gnu.org/licenses/>.
317 //
318 ////////////////////////////////////////////////////////////////
319 
320 
amaze_demosaic_RT(dt_dev_pixelpipe_iop_t * piece,const float * const in,float * out,const dt_iop_roi_t * const roi_in,const dt_iop_roi_t * const roi_out,const int filters)321 void amaze_demosaic_RT(dt_dev_pixelpipe_iop_t *piece, const float *const in,
322                        float *out, const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out,
323                        const int filters)
324 {
325   int winx = roi_out->x;
326   int winy = roi_out->y;
327   int winw = roi_in->width;
328   int winh = roi_in->height;
329 
330   const int width = winw, height = winh;
331   const float clip_pt = fminf(piece->pipe->dsc.processed_maximum[0],
332                               fminf(piece->pipe->dsc.processed_maximum[1], piece->pipe->dsc.processed_maximum[2]));
333   const float clip_pt8 = 0.8f * clip_pt;
334 
335 // this allows to pass AMAZETS to the code. On some machines larger AMAZETS is faster
336 // If AMAZETS is undefined it will be set to 160, which is the fastest on modern x86/64 machines
337 #ifndef AMAZETS
338 #define AMAZETS 160
339 #endif
340   // Tile size; the image is processed in square tiles to lower memory requirements and facilitate
341   // multi-threading
342   // We assure that Tile size is a multiple of 32 in the range [96;992]
343   constexpr int ts = (AMAZETS & 992) < 96 ? 96 : (AMAZETS & 992);
344   constexpr int tsh = ts / 2; // half of Tile size
345 
346   // offset of R pixel within a Bayer quartet
347   int ex, ey;
348 
349   // determine GRBG coset; (ey,ex) is the offset of the R subarray
350   if(FC(0, 0, filters) == 1)
351   { // first pixel is G
352     if(FC(0, 1, filters) == 0)
353     {
354       ey = 0;
355       ex = 1;
356     }
357     else
358     {
359       ey = 1;
360       ex = 0;
361     }
362   }
363   else
364   { // first pixel is R or B
365     if(FC(0, 0, filters) == 0)
366     {
367       ey = 0;
368       ex = 0;
369     }
370     else
371     {
372       ey = 1;
373       ex = 1;
374     }
375   }
376 
377   // shifts of pointer value to access pixels in vertical and diagonal directions
378   constexpr int v1 = ts, v2 = 2 * ts, v3 = 3 * ts, p1 = -ts + 1, p2 = -2 * ts + 2, p3 = -3 * ts + 3,
379                 m1 = ts + 1, m2 = 2 * ts + 2, m3 = 3 * ts + 3;
380 
381   // tolerance to avoid dividing by zero
382   constexpr float eps = 1e-5, epssq = 1e-10; // tolerance to avoid dividing by zero
383 
384   // adaptive ratios threshold
385   constexpr float arthresh = 0.75;
386 
387   // gaussian on 5x5 quincunx, sigma=1.2
388   constexpr float gaussodd[4]
389       = { 0.14659727707323927f, 0.103592713382435f, 0.0732036125103057f, 0.0365543548389495f };
390   // nyquist texture test threshold
391   constexpr float nyqthresh = 0.5;
392   // gaussian on 5x5, sigma=1.2, multiplied with nyqthresh to save some time later in loop
393   // Is this really sigma=1.2????, seems more like sigma = 1.672
394   constexpr float gaussgrad[6] = { nyqthresh * 0.07384411893421103f, nyqthresh * 0.06207511968171489f,
395                                    nyqthresh * 0.0521818194747806f,  nyqthresh * 0.03687419286733595f,
396                                    nyqthresh * 0.03099732204057846f, nyqthresh * 0.018413194161458882f };
397   // gaussian on 5x5 alt quincunx, sigma=1.5
398   constexpr float gausseven[2] = { 0.13719494435797422f, 0.05640252782101291f };
399   // gaussian on quincunx grid
400   constexpr float gquinc[4] = { 0.169917f, 0.108947f, 0.069855f, 0.0287182f };
401 
402   typedef struct
403   {
404     float h;
405     float v;
406   } s_hv;
407 
408 #ifdef _OPENMP
409 #pragma omp parallel
410 #endif
411   {
412     //     int progresscounter = 0;
413 
414     constexpr int cldf = 2; // factor to multiply cache line distance. 1 = 64 bytes, 2 = 128 bytes ...
415     // assign working space
416     char *buffer
417         = (char *)calloc(sizeof(float) * 14 * ts * ts + sizeof(char) * ts * tsh + 18 * cldf * 64 + 63, 1);
418     // aligned to 64 byte boundary
419     char *data = (char *)((uintptr_t(buffer) + uintptr_t(63)) / 64 * 64);
420 
421     // green values
422     float *rgbgreen = (float(*))data;
423     // sum of square of horizontal gradient and square of vertical gradient
424     float *delhvsqsum = (float(*))((char *)rgbgreen + sizeof(float) * ts * ts + cldf * 64); // 1
425     // gradient based directional weights for interpolation
426     float *dirwts0 = (float(*))((char *)delhvsqsum + sizeof(float) * ts * ts + cldf * 64); // 1
427     float *dirwts1 = (float(*))((char *)dirwts0 + sizeof(float) * ts * ts + cldf * 64);    // 1
428     // vertically interpolated colour differences G-R, G-B
429     float *vcd = (float(*))((char *)dirwts1 + sizeof(float) * ts * ts + cldf * 64); // 1
430     // horizontally interpolated colour differences
431     float *hcd = (float(*))((char *)vcd + sizeof(float) * ts * ts + cldf * 64); // 1
432     // alternative vertical interpolation
433     float *vcdalt = (float(*))((char *)hcd + sizeof(float) * ts * ts + cldf * 64); // 1
434     // alternative horizontal interpolation
435     float *hcdalt = (float(*))((char *)vcdalt + sizeof(float) * ts * ts + cldf * 64); // 1
436     // square of average colour difference
437     float *cddiffsq = (float(*))((char *)hcdalt + sizeof(float) * ts * ts + cldf * 64); // 1
438     // weight to give horizontal vs vertical interpolation
439     float *hvwt = (float(*))((char *)cddiffsq + sizeof(float) * ts * ts + 2 * cldf * 64); // 1
440     // final interpolated colour difference
441     float(*Dgrb)[ts * tsh] = (float(*)[ts * tsh])vcdalt; // there is no overlap in buffer usage => share
442     // gradient in plus (NE/SW) direction
443     float *delp = (float(*))cddiffsq; // there is no overlap in buffer usage => share
444     // gradient in minus (NW/SE) direction
445     float *delm = (float(*))((char *)delp + sizeof(float) * ts * tsh + cldf * 64);
446     // diagonal interpolation of R+B
447     float *rbint = (float(*))delm; // there is no overlap in buffer usage => share
448     // horizontal and vertical curvature of interpolated G (used to refine interpolation in Nyquist texture
449     // regions)
450     s_hv *Dgrb2 = (s_hv(*))((char *)hvwt + sizeof(float) * ts * tsh + cldf * 64); // 1
451     // difference between up/down interpolations of G
452     float *dgintv = (float(*))Dgrb2; // there is no overlap in buffer usage => share
453     // difference between left/right interpolations of G
454     float *dginth = (float(*))((char *)dgintv + sizeof(float) * ts * ts + cldf * 64); // 1
455     // square of diagonal colour differences
456     float *Dgrbsq1m = (float(*))((char *)dginth + sizeof(float) * ts * ts + cldf * 64);    // 1
457     float *Dgrbsq1p = (float(*))((char *)Dgrbsq1m + sizeof(float) * ts * tsh + cldf * 64); // 1
458     // tile raw data
459     float *cfa = (float(*))((char *)Dgrbsq1p + sizeof(float) * ts * tsh + cldf * 64); // 1
460     // relative weight for combining plus and minus diagonal interpolations
461     float *pmwt = (float(*))delhvsqsum; // there is no overlap in buffer usage => share
462     // interpolated colour difference R-B in minus and plus direction
463     float *rbm = (float(*))vcd; // there is no overlap in buffer usage => share
464     float *rbp = (float(*))((char *)rbm + sizeof(float) * ts * tsh + cldf * 64);
465     // nyquist texture flags 1=nyquist, 0=not nyquist
466     unsigned char *nyquist = (unsigned char(*))((char *)cfa + sizeof(float) * ts * ts + cldf * 64); // 1
467     unsigned char *nyquist2 = (unsigned char(*))cddiffsq;
468     float *nyqutest = (float(*))((char *)nyquist + sizeof(unsigned char) * ts * tsh + cldf * 64); // 1
469 
470 // Main algorithm: Tile loop
471 // use collapse(2) to collapse the 2 loops to one large loop, so there is better scaling
472 #ifdef _OPENMP
473 #pragma omp for SIMD() schedule(static) collapse(2) nowait
474 #endif
475 
476     for(int top = winy - 16; top < winy + height; top += ts - 32)
477     {
478       for(int left = winx - 16; left < winx + width; left += ts - 32)
479       {
480         memset(&nyquist[3 * tsh], 0, sizeof(unsigned char) * (ts - 6) * tsh);
481         // location of tile bottom edge
482         int bottom = MIN(top + ts, winy + height + 16);
483         // location of tile right edge
484         int right = MIN(left + ts, winx + width + 16);
485         // tile width  (=ts except for right edge of image)
486         int rr1 = bottom - top;
487         // tile height (=ts except for bottom edge of image)
488         int cc1 = right - left;
489         // bookkeeping for borders
490         // min and max row/column in the tile
491         int rrmin = top < winy ? 16 : 0;
492         int ccmin = left < winx ? 16 : 0;
493         int rrmax = bottom > (winy + height) ? winy + height - top : rr1;
494         int ccmax = right > (winx + width) ? winx + width - left : cc1;
495 
496 // rgb from input CFA data
497 // rgb values should be floating point number between 0 and 1
498 // after white balance multipliers are applied
499 // a 16 pixel border is added to each side of the image
500 
501 // begin of tile initialization
502 #ifdef __SSE2__
503         // fill upper border
504         if(rrmin > 0)
505         {
506           for(int rr = 0; rr < 16; rr++)
507           {
508             int row = 32 - rr + top;
509 
510             for(int cc = ccmin; cc < ccmax; cc += 4)
511             {
512               int indx1 = rr * ts + cc;
513               vfloat tempv = LVFU(in[row * width + (cc + left)]);
514               STVF(cfa[indx1], tempv);
515               STVF(rgbgreen[indx1], tempv);
516             }
517           }
518         }
519 
520         // fill inner part
521         for(int rr = rrmin; rr < rrmax; rr++)
522         {
523           int row = rr + top;
524 
525           for(int cc = ccmin; cc < ccmax; cc += 4)
526           {
527             int indx1 = rr * ts + cc;
528             vfloat tempv = LVFU(in[row * width + (cc + left)]);
529             STVF(cfa[indx1], tempv);
530             STVF(rgbgreen[indx1], tempv);
531           }
532         }
533 
534         // fill lower border
535         if(rrmax < rr1)
536         {
537           for(int rr = 0; rr < 16; rr++)
538             for(int cc = ccmin; cc < ccmax; cc += 4)
539             {
540               int indx1 = (rrmax + rr) * ts + cc;
541               vfloat tempv = LVFU(in[(winy + height - rr - 2) * width + (left + cc)]);
542               STVF(cfa[indx1], tempv);
543               STVF(rgbgreen[indx1], tempv);
544             }
545         }
546 
547 #else
548 
549         // fill upper border
550         if(rrmin > 0)
551         {
552           for(int rr = 0; rr < 16; rr++)
553             for(int cc = ccmin, row = 32 - rr + top; cc < ccmax; cc++)
554             {
555               cfa[rr * ts + cc] = (in[row * width + (cc + left)]);
556               rgbgreen[rr * ts + cc] = cfa[rr * ts + cc];
557             }
558         }
559 
560         // fill inner part
561         for(int rr = rrmin; rr < rrmax; rr++)
562         {
563           int row = rr + top;
564 
565           for(int cc = ccmin; cc < ccmax; cc++)
566           {
567             int indx1 = rr * ts + cc;
568             cfa[indx1] = (in[row * width + (cc + left)]);
569             rgbgreen[indx1] = cfa[indx1];
570           }
571         }
572 
573         // fill lower border
574         if(rrmax < rr1)
575         {
576           for(int rr = 0; rr < 16; rr++)
577             for(int cc = ccmin; cc < ccmax; cc++)
578             {
579               cfa[(rrmax + rr) * ts + cc] = (in[(winy + height - rr - 2) * width + (left + cc)]);
580               rgbgreen[(rrmax + rr) * ts + cc] = cfa[(rrmax + rr) * ts + cc];
581             }
582         }
583 
584 #endif
585 
586         // fill left border
587         if(ccmin > 0)
588         {
589           for(int rr = rrmin; rr < rrmax; rr++)
590             for(int cc = 0, row = rr + top; cc < 16; cc++)
591             {
592               cfa[rr * ts + cc] = (in[row * width + (32 - cc + left)]);
593               rgbgreen[rr * ts + cc] = cfa[rr * ts + cc];
594             }
595         }
596 
597         // fill right border
598         if(ccmax < cc1)
599         {
600           for(int rr = rrmin; rr < rrmax; rr++)
601             for(int cc = 0; cc < 16; cc++)
602             {
603               cfa[rr * ts + ccmax + cc] = (in[(top + rr) * width + ((winx + width - cc - 2))]);
604               rgbgreen[rr * ts + ccmax + cc] = cfa[rr * ts + ccmax + cc];
605             }
606         }
607 
608         // also, fill the image corners
609         if(rrmin > 0 && ccmin > 0)
610         {
611           for(int rr = 0; rr < 16; rr++)
612             for(int cc = 0; cc < 16; cc++)
613             {
614               cfa[(rr)*ts + cc] = (in[(winy + 32 - rr) * width + (winx + 32 - cc)]);
615               rgbgreen[(rr)*ts + cc] = cfa[(rr)*ts + cc];
616             }
617         }
618 
619         if(rrmax < rr1 && ccmax < cc1)
620         {
621           for(int rr = 0; rr < 16; rr++)
622             for(int cc = 0; cc < 16; cc++)
623             {
624               cfa[(rrmax + rr) * ts + ccmax + cc]
625                   = (in[(winy + height - rr - 2) * width + ((winx + width - cc - 2))]);
626               rgbgreen[(rrmax + rr) * ts + ccmax + cc] = cfa[(rrmax + rr) * ts + ccmax + cc];
627             }
628         }
629 
630         if(rrmin > 0 && ccmax < cc1)
631         {
632           for(int rr = 0; rr < 16; rr++)
633             for(int cc = 0; cc < 16; cc++)
634             {
635               cfa[(rr)*ts + ccmax + cc] = (in[(winy + 32 - rr) * width + ((winx + width - cc - 2))]);
636               rgbgreen[(rr)*ts + ccmax + cc] = cfa[(rr)*ts + ccmax + cc];
637             }
638         }
639 
640         if(rrmax < rr1 && ccmin > 0)
641         {
642           for(int rr = 0; rr < 16; rr++)
643             for(int cc = 0; cc < 16; cc++)
644             {
645               cfa[(rrmax + rr) * ts + cc] = (in[(winy + height - rr - 2) * width + ((winx + 32 - cc))]);
646               rgbgreen[(rrmax + rr) * ts + cc] = cfa[(rrmax + rr) * ts + cc];
647             }
648         }
649 
650 // end of tile initialization
651 
652 // horizontal and vertical gradients
653 #ifdef __SSE2__
654         vfloat epsv = F2V(eps);
655 
656         for(int rr = 2; rr < rr1 - 2; rr++)
657         {
658           for(int indx = rr * ts; indx < rr * ts + cc1; indx += 4)
659           {
660             vfloat delhv = vabsf(LVFU(cfa[indx + 1]) - LVFU(cfa[indx - 1]));
661             vfloat delvv = vabsf(LVF(cfa[indx + v1]) - LVF(cfa[indx - v1]));
662             STVF(dirwts1[indx], epsv + vabsf(LVFU(cfa[indx + 2]) - LVF(cfa[indx]))
663                                     + vabsf(LVF(cfa[indx]) - LVFU(cfa[indx - 2])) + delhv);
664             STVF(dirwts0[indx], epsv + vabsf(LVF(cfa[indx + v2]) - LVF(cfa[indx]))
665                                     + vabsf(LVF(cfa[indx]) - LVF(cfa[indx - v2])) + delvv);
666             STVF(delhvsqsum[indx], SQRV(delhv) + SQRV(delvv));
667           }
668         }
669 
670 #else
671 
672         for(int rr = 2; rr < rr1 - 2; rr++)
673           for(int cc = 2, indx = (rr)*ts + cc; cc < cc1 - 2; cc++, indx++)
674           {
675             float delh = fabsf(cfa[indx + 1] - cfa[indx - 1]);
676             float delv = fabsf(cfa[indx + v1] - cfa[indx - v1]);
677             dirwts0[indx]
678                 = eps + fabsf(cfa[indx + v2] - cfa[indx]) + fabsf(cfa[indx] - cfa[indx - v2]) + delv;
679             dirwts1[indx] = eps + fabsf(cfa[indx + 2] - cfa[indx]) + fabsf(cfa[indx] - cfa[indx - 2]) + delh;
680             delhvsqsum[indx] = SQR(delh) + SQR(delv);
681           }
682 
683 #endif
684 
685 // interpolate vertical and horizontal colour differences
686 #ifdef __SSE2__
687         vfloat sgnv;
688 
689         if(!(FC(4, 4, filters) & 1))
690         {
691           sgnv = _mm_set_ps(1.f, -1.f, 1.f, -1.f);
692         }
693         else
694         {
695           sgnv = _mm_set_ps(-1.f, 1.f, -1.f, 1.f);
696         }
697 
698         vfloat zd5v = F2V(0.5f);
699         vfloat onev = F2V(1.f);
700         vfloat arthreshv = F2V(arthresh);
701         vfloat clip_pt8v = F2V(clip_pt8);
702 
703         for(int rr = 4; rr < rr1 - 4; rr++)
704         {
705           sgnv = -sgnv;
706 
707           for(int indx = rr * ts + 4; indx < rr * ts + cc1 - 7; indx += 4)
708           {
709             // colour ratios in each cardinal direction
710             vfloat cfav = LVF(cfa[indx]);
711             vfloat cruv = LVF(cfa[indx - v1]) * (LVF(dirwts0[indx - v2]) + LVF(dirwts0[indx]))
712                           / (LVF(dirwts0[indx - v2]) * (epsv + cfav)
713                              + LVF(dirwts0[indx]) * (epsv + LVF(cfa[indx - v2])));
714             vfloat crdv = LVF(cfa[indx + v1]) * (LVF(dirwts0[indx + v2]) + LVF(dirwts0[indx]))
715                           / (LVF(dirwts0[indx + v2]) * (epsv + cfav)
716                              + LVF(dirwts0[indx]) * (epsv + LVF(cfa[indx + v2])));
717             vfloat crlv = LVFU(cfa[indx - 1]) * (LVFU(dirwts1[indx - 2]) + LVF(dirwts1[indx]))
718                           / (LVFU(dirwts1[indx - 2]) * (epsv + cfav)
719                              + LVF(dirwts1[indx]) * (epsv + LVFU(cfa[indx - 2])));
720             vfloat crrv = LVFU(cfa[indx + 1]) * (LVFU(dirwts1[indx + 2]) + LVF(dirwts1[indx]))
721                           / (LVFU(dirwts1[indx + 2]) * (epsv + cfav)
722                              + LVF(dirwts1[indx]) * (epsv + LVFU(cfa[indx + 2])));
723 
724             // G interpolated in vert/hor directions using Hamilton-Adams method
725             vfloat guhav = LVF(cfa[indx - v1]) + zd5v * (cfav - LVF(cfa[indx - v2]));
726             vfloat gdhav = LVF(cfa[indx + v1]) + zd5v * (cfav - LVF(cfa[indx + v2]));
727             vfloat glhav = LVFU(cfa[indx - 1]) + zd5v * (cfav - LVFU(cfa[indx - 2]));
728             vfloat grhav = LVFU(cfa[indx + 1]) + zd5v * (cfav - LVFU(cfa[indx + 2]));
729 
730             // G interpolated in vert/hor directions using adaptive ratios
731             vfloat guarv = vself(vmaskf_lt(vabsf(onev - cruv), arthreshv), cfav * cruv, guhav);
732             vfloat gdarv = vself(vmaskf_lt(vabsf(onev - crdv), arthreshv), cfav * crdv, gdhav);
733             vfloat glarv = vself(vmaskf_lt(vabsf(onev - crlv), arthreshv), cfav * crlv, glhav);
734             vfloat grarv = vself(vmaskf_lt(vabsf(onev - crrv), arthreshv), cfav * crrv, grhav);
735 
736             // adaptive weights for vertical/horizontal directions
737             vfloat hwtv = LVFU(dirwts1[indx - 1]) / (LVFU(dirwts1[indx - 1]) + LVFU(dirwts1[indx + 1]));
738             vfloat vwtv = LVF(dirwts0[indx - v1]) / (LVF(dirwts0[indx + v1]) + LVF(dirwts0[indx - v1]));
739 
740             // interpolated G via adaptive weights of cardinal evaluations
741             vfloat Ginthhav = vintpf(hwtv, grhav, glhav);
742             vfloat Gintvhav = vintpf(vwtv, gdhav, guhav);
743 
744             // interpolated colour differences
745             vfloat hcdaltv = sgnv * (Ginthhav - cfav);
746             vfloat vcdaltv = sgnv * (Gintvhav - cfav);
747             STVF(hcdalt[indx], hcdaltv);
748             STVF(vcdalt[indx], vcdaltv);
749 
750             vmask clipmask = vorm(vorm(vmaskf_gt(cfav, clip_pt8v), vmaskf_gt(Gintvhav, clip_pt8v)),
751                                   vmaskf_gt(Ginthhav, clip_pt8v));
752             guarv = vself(clipmask, guhav, guarv);
753             gdarv = vself(clipmask, gdhav, gdarv);
754             glarv = vself(clipmask, glhav, glarv);
755             grarv = vself(clipmask, grhav, grarv);
756 
757             // use HA if highlights are (nearly) clipped
758             STVF(vcd[indx], vself(clipmask, vcdaltv, sgnv * (vintpf(vwtv, gdarv, guarv) - cfav)));
759             STVF(hcd[indx], vself(clipmask, hcdaltv, sgnv * (vintpf(hwtv, grarv, glarv) - cfav)));
760 
761             // differences of interpolations in opposite directions
762             STVF(dgintv[indx], vminf(SQRV(guhav - gdhav), SQRV(guarv - gdarv)));
763             STVF(dginth[indx], vminf(SQRV(glhav - grhav), SQRV(glarv - grarv)));
764           }
765         }
766 
767 #else
768 
769         for(int rr = 4; rr < rr1 - 4; rr++)
770         {
771           bool fcswitch = FC(rr, 4, filters) & 1;
772 
773           for(int cc = 4, indx = rr * ts + cc; cc < cc1 - 4; cc++, indx++)
774           {
775 
776             // colour ratios in each cardinal direction
777             float cru = cfa[indx - v1] * (dirwts0[indx - v2] + dirwts0[indx])
778                         / (dirwts0[indx - v2] * (eps + cfa[indx]) + dirwts0[indx] * (eps + cfa[indx - v2]));
779             float crd = cfa[indx + v1] * (dirwts0[indx + v2] + dirwts0[indx])
780                         / (dirwts0[indx + v2] * (eps + cfa[indx]) + dirwts0[indx] * (eps + cfa[indx + v2]));
781             float crl = cfa[indx - 1] * (dirwts1[indx - 2] + dirwts1[indx])
782                         / (dirwts1[indx - 2] * (eps + cfa[indx]) + dirwts1[indx] * (eps + cfa[indx - 2]));
783             float crr = cfa[indx + 1] * (dirwts1[indx + 2] + dirwts1[indx])
784                         / (dirwts1[indx + 2] * (eps + cfa[indx]) + dirwts1[indx] * (eps + cfa[indx + 2]));
785 
786             // G interpolated in vert/hor directions using Hamilton-Adams method
787             float guha = cfa[indx - v1] + xdiv2f(cfa[indx] - cfa[indx - v2]);
788             float gdha = cfa[indx + v1] + xdiv2f(cfa[indx] - cfa[indx + v2]);
789             float glha = cfa[indx - 1] + xdiv2f(cfa[indx] - cfa[indx - 2]);
790             float grha = cfa[indx + 1] + xdiv2f(cfa[indx] - cfa[indx + 2]);
791 
792             // G interpolated in vert/hor directions using adaptive ratios
793             float guar, gdar, glar, grar;
794 
795             if(fabsf(1.f - cru) < arthresh)
796             {
797               guar = cfa[indx] * cru;
798             }
799             else
800             {
801               guar = guha;
802             }
803 
804             if(fabsf(1.f - crd) < arthresh)
805             {
806               gdar = cfa[indx] * crd;
807             }
808             else
809             {
810               gdar = gdha;
811             }
812 
813             if(fabsf(1.f - crl) < arthresh)
814             {
815               glar = cfa[indx] * crl;
816             }
817             else
818             {
819               glar = glha;
820             }
821 
822             if(fabsf(1.f - crr) < arthresh)
823             {
824               grar = cfa[indx] * crr;
825             }
826             else
827             {
828               grar = grha;
829             }
830 
831             // adaptive weights for vertical/horizontal directions
832             float hwt = dirwts1[indx - 1] / (dirwts1[indx - 1] + dirwts1[indx + 1]);
833             float vwt = dirwts0[indx - v1] / (dirwts0[indx + v1] + dirwts0[indx - v1]);
834 
835             // interpolated G via adaptive weights of cardinal evaluations
836             float Gintvha = vwt * gdha + (1.f - vwt) * guha;
837             float Ginthha = hwt * grha + (1.f - hwt) * glha;
838 
839             // interpolated colour differences
840             if(fcswitch)
841             {
842               vcd[indx] = cfa[indx] - (vwt * gdar + (1.f - vwt) * guar);
843               hcd[indx] = cfa[indx] - (hwt * grar + (1.f - hwt) * glar);
844               vcdalt[indx] = cfa[indx] - Gintvha;
845               hcdalt[indx] = cfa[indx] - Ginthha;
846             }
847             else
848             {
849               // interpolated colour differences
850               vcd[indx] = (vwt * gdar + (1.f - vwt) * guar) - cfa[indx];
851               hcd[indx] = (hwt * grar + (1.f - hwt) * glar) - cfa[indx];
852               vcdalt[indx] = Gintvha - cfa[indx];
853               hcdalt[indx] = Ginthha - cfa[indx];
854             }
855 
856             fcswitch = !fcswitch;
857 
858             if(cfa[indx] > clip_pt8 || Gintvha > clip_pt8 || Ginthha > clip_pt8)
859             {
860               // use HA if highlights are (nearly) clipped
861               guar = guha;
862               gdar = gdha;
863               glar = glha;
864               grar = grha;
865               vcd[indx] = vcdalt[indx];
866               hcd[indx] = hcdalt[indx];
867             }
868 
869             // differences of interpolations in opposite directions
870             dgintv[indx] = MIN(SQR(guha - gdha), SQR(guar - gdar));
871             dginth[indx] = MIN(SQR(glha - grha), SQR(glar - grar));
872           }
873         }
874 
875 #endif
876 
877 
878 
879 #ifdef __SSE2__
880         vfloat clip_ptv = F2V(clip_pt);
881         vfloat sgn3v;
882 
883         if(!(FC(4, 4, filters) & 1))
884         {
885           sgnv = _mm_set_ps(1.f, -1.f, 1.f, -1.f);
886         }
887         else
888         {
889           sgnv = _mm_set_ps(-1.f, 1.f, -1.f, 1.f);
890         }
891 
892         sgn3v = sgnv + sgnv + sgnv;
893 
894         for(int rr = 4; rr < rr1 - 4; rr++)
895         {
896           vfloat nsgnv = sgnv;
897           sgnv = -sgnv;
898           sgn3v = -sgn3v;
899 
900           for(int indx = rr * ts + 4; indx < rr * ts + cc1 - 4; indx += 4)
901           {
902             vfloat hcdv = LVF(hcd[indx]);
903             vfloat hcdvarv = SQRV(LVFU(hcd[indx - 2]) - hcdv)
904                              + SQRV(LVFU(hcd[indx - 2]) - LVFU(hcd[indx + 2]))
905                              + SQRV(hcdv - LVFU(hcd[indx + 2]));
906             vfloat hcdaltv = LVF(hcdalt[indx]);
907             vfloat hcdaltvarv = SQRV(LVFU(hcdalt[indx - 2]) - hcdaltv)
908                                 + SQRV(LVFU(hcdalt[indx - 2]) - LVFU(hcdalt[indx + 2]))
909                                 + SQRV(hcdaltv - LVFU(hcdalt[indx + 2]));
910             vfloat vcdv = LVF(vcd[indx]);
911             vfloat vcdvarv = SQRV(LVF(vcd[indx - v2]) - vcdv)
912                              + SQRV(LVF(vcd[indx - v2]) - LVF(vcd[indx + v2]))
913                              + SQRV(vcdv - LVF(vcd[indx + v2]));
914             vfloat vcdaltv = LVF(vcdalt[indx]);
915             vfloat vcdaltvarv = SQRV(LVF(vcdalt[indx - v2]) - vcdaltv)
916                                 + SQRV(LVF(vcdalt[indx - v2]) - LVF(vcdalt[indx + v2]))
917                                 + SQRV(vcdaltv - LVF(vcdalt[indx + v2]));
918 
919             // choose the smallest variance; this yields a smoother interpolation
920             hcdv = vself(vmaskf_lt(hcdaltvarv, hcdvarv), hcdaltv, hcdv);
921             vcdv = vself(vmaskf_lt(vcdaltvarv, vcdvarv), vcdaltv, vcdv);
922 
923             // bound the interpolation in regions of high saturation
924             // vertical and horizontal G interpolations
925             vfloat Ginthv = sgnv * hcdv + LVF(cfa[indx]);
926             vfloat temp2v = sgn3v * hcdv;
927             vfloat hwtv = onev + temp2v / (epsv + Ginthv + LVF(cfa[indx]));
928             vmask hcdmask = vmaskf_gt(nsgnv * hcdv, ZEROV);
929             vfloat hcdoldv = hcdv;
930             vfloat tempv = nsgnv * (LVF(cfa[indx]) - ULIMV(Ginthv, LVFU(cfa[indx - 1]), LVFU(cfa[indx + 1])));
931             hcdv = vself(vmaskf_lt(temp2v, -(LVF(cfa[indx]) + Ginthv)), tempv, vintpf(hwtv, hcdv, tempv));
932             hcdv = vself(hcdmask, hcdv, hcdoldv);
933             hcdv = vself(vmaskf_gt(Ginthv, clip_ptv), tempv, hcdv);
934             STVF(hcd[indx], hcdv);
935 
936             vfloat Gintvv = sgnv * vcdv + LVF(cfa[indx]);
937             temp2v = sgn3v * vcdv;
938             vfloat vwtv = onev + temp2v / (epsv + Gintvv + LVF(cfa[indx]));
939             vmask vcdmask = vmaskf_gt(nsgnv * vcdv, ZEROV);
940             vfloat vcdoldv = vcdv;
941             tempv = nsgnv * (LVF(cfa[indx]) - ULIMV(Gintvv, LVF(cfa[indx - v1]), LVF(cfa[indx + v1])));
942             vcdv = vself(vmaskf_lt(temp2v, -(LVF(cfa[indx]) + Gintvv)), tempv, vintpf(vwtv, vcdv, tempv));
943             vcdv = vself(vcdmask, vcdv, vcdoldv);
944             vcdv = vself(vmaskf_gt(Gintvv, clip_ptv), tempv, vcdv);
945             STVF(vcd[indx], vcdv);
946             STVFU(cddiffsq[indx], SQRV(vcdv - hcdv));
947           }
948         }
949 
950 #else
951 
952         for(int rr = 4; rr < rr1 - 4; rr++)
953         {
954           for(int cc = 4, indx = rr * ts + cc, c = FC(rr, cc, filters) & 1; cc < cc1 - 4; cc++, indx++)
955           {
956             float hcdvar = 3.f * (SQR(hcd[indx - 2]) + SQR(hcd[indx]) + SQR(hcd[indx + 2]))
957                            - SQR(hcd[indx - 2] + hcd[indx] + hcd[indx + 2]);
958             float hcdaltvar = 3.f * (SQR(hcdalt[indx - 2]) + SQR(hcdalt[indx]) + SQR(hcdalt[indx + 2]))
959                               - SQR(hcdalt[indx - 2] + hcdalt[indx] + hcdalt[indx + 2]);
960             float vcdvar = 3.f * (SQR(vcd[indx - v2]) + SQR(vcd[indx]) + SQR(vcd[indx + v2]))
961                            - SQR(vcd[indx - v2] + vcd[indx] + vcd[indx + v2]);
962             float vcdaltvar = 3.f * (SQR(vcdalt[indx - v2]) + SQR(vcdalt[indx]) + SQR(vcdalt[indx + v2]))
963                               - SQR(vcdalt[indx - v2] + vcdalt[indx] + vcdalt[indx + v2]);
964 
965             // choose the smallest variance; this yields a smoother interpolation
966             if(hcdaltvar < hcdvar)
967             {
968               hcd[indx] = hcdalt[indx];
969             }
970 
971             if(vcdaltvar < vcdvar)
972             {
973               vcd[indx] = vcdalt[indx];
974             }
975 
976             // bound the interpolation in regions of high saturation
977             // vertical and horizontal G interpolations
978             float Gintv, Ginth;
979 
980             if(c)
981             {                                 // G site
982               Ginth = -hcd[indx] + cfa[indx]; // R or B
983               Gintv = -vcd[indx] + cfa[indx]; // B or R
984 
985               if(hcd[indx] > 0)
986               {
987                 if(3.f * hcd[indx] > (Ginth + cfa[indx]))
988                 {
989                   hcd[indx] = -ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]) + cfa[indx];
990                 }
991                 else
992                 {
993                   float hwt = 1.f - 3.f * hcd[indx] / (eps + Ginth + cfa[indx]);
994                   hcd[indx] = hwt * hcd[indx]
995                               + (1.f - hwt) * (-ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]) + cfa[indx]);
996                 }
997               }
998 
999               if(vcd[indx] > 0)
1000               {
1001                 if(3.f * vcd[indx] > (Gintv + cfa[indx]))
1002                 {
1003                   vcd[indx] = -ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]) + cfa[indx];
1004                 }
1005                 else
1006                 {
1007                   float vwt = 1.f - 3.f * vcd[indx] / (eps + Gintv + cfa[indx]);
1008                   vcd[indx] = vwt * vcd[indx]
1009                               + (1.f - vwt) * (-ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]) + cfa[indx]);
1010                 }
1011               }
1012 
1013               if(Ginth > clip_pt)
1014               {
1015                 hcd[indx] = -ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]) + cfa[indx];
1016               }
1017 
1018               if(Gintv > clip_pt)
1019               {
1020                 vcd[indx] = -ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]) + cfa[indx];
1021               }
1022             }
1023             else
1024             { // R or B site
1025 
1026               Ginth = hcd[indx] + cfa[indx]; // interpolated G
1027               Gintv = vcd[indx] + cfa[indx];
1028 
1029               if(hcd[indx] < 0)
1030               {
1031                 if(3.f * hcd[indx] < -(Ginth + cfa[indx]))
1032                 {
1033                   hcd[indx] = ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]) - cfa[indx];
1034                 }
1035                 else
1036                 {
1037                   float hwt = 1.f + 3.f * hcd[indx] / (eps + Ginth + cfa[indx]);
1038                   hcd[indx] = hwt * hcd[indx]
1039                               + (1.f - hwt) * (ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]) - cfa[indx]);
1040                 }
1041               }
1042 
1043               if(vcd[indx] < 0)
1044               {
1045                 if(3.f * vcd[indx] < -(Gintv + cfa[indx]))
1046                 {
1047                   vcd[indx] = ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]) - cfa[indx];
1048                 }
1049                 else
1050                 {
1051                   float vwt = 1.f + 3.f * vcd[indx] / (eps + Gintv + cfa[indx]);
1052                   vcd[indx] = vwt * vcd[indx]
1053                               + (1.f - vwt) * (ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]) - cfa[indx]);
1054                 }
1055               }
1056 
1057               if(Ginth > clip_pt)
1058               {
1059                 hcd[indx] = ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]) - cfa[indx];
1060               }
1061 
1062               if(Gintv > clip_pt)
1063               {
1064                 vcd[indx] = ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]) - cfa[indx];
1065               }
1066 
1067               cddiffsq[indx] = SQR(vcd[indx] - hcd[indx]);
1068             }
1069 
1070             c = !c;
1071           }
1072         }
1073 
1074 #endif
1075 
1076 
1077 
1078 #ifdef __SSE2__
1079         vfloat epssqv = F2V(epssq);
1080 
1081         for(int rr = 6; rr < rr1 - 6; rr++)
1082         {
1083           for(int indx = rr * ts + 6 + (FC(rr, 2, filters) & 1); indx < rr * ts + cc1 - 6; indx += 8)
1084           {
1085             // compute colour difference variances in cardinal directions
1086             vfloat tempv = LC2VFU(vcd[indx]);
1087             vfloat uavev = tempv + LC2VFU(vcd[indx - v1]) + LC2VFU(vcd[indx - v2]) + LC2VFU(vcd[indx - v3]);
1088             vfloat davev = tempv + LC2VFU(vcd[indx + v1]) + LC2VFU(vcd[indx + v2]) + LC2VFU(vcd[indx + v3]);
1089             vfloat Dgrbvvaruv = SQRV(tempv - uavev) + SQRV(LC2VFU(vcd[indx - v1]) - uavev)
1090                                 + SQRV(LC2VFU(vcd[indx - v2]) - uavev) + SQRV(LC2VFU(vcd[indx - v3]) - uavev);
1091             vfloat Dgrbvvardv = SQRV(tempv - davev) + SQRV(LC2VFU(vcd[indx + v1]) - davev)
1092                                 + SQRV(LC2VFU(vcd[indx + v2]) - davev) + SQRV(LC2VFU(vcd[indx + v3]) - davev);
1093 
1094             vfloat hwtv = vadivapb(LC2VFU(dirwts1[indx - 1]), LC2VFU(dirwts1[indx + 1]));
1095             vfloat vwtv = vadivapb(LC2VFU(dirwts0[indx - v1]), LC2VFU(dirwts0[indx + v1]));
1096 
1097             tempv = LC2VFU(hcd[indx]);
1098             vfloat lavev = tempv + vaddc2vfu(hcd[indx - 3]) + LC2VFU(hcd[indx - 1]);
1099             vfloat ravev = tempv + vaddc2vfu(hcd[indx + 1]) + LC2VFU(hcd[indx + 3]);
1100 
1101             vfloat Dgrbhvarlv = SQRV(tempv - lavev) + SQRV(LC2VFU(hcd[indx - 1]) - lavev)
1102                                 + SQRV(LC2VFU(hcd[indx - 2]) - lavev) + SQRV(LC2VFU(hcd[indx - 3]) - lavev);
1103             vfloat Dgrbhvarrv = SQRV(tempv - ravev) + SQRV(LC2VFU(hcd[indx + 1]) - ravev)
1104                                 + SQRV(LC2VFU(hcd[indx + 2]) - ravev) + SQRV(LC2VFU(hcd[indx + 3]) - ravev);
1105 
1106 
1107             vfloat vcdvarv = epssqv + vintpf(vwtv, Dgrbvvardv, Dgrbvvaruv);
1108             vfloat hcdvarv = epssqv + vintpf(hwtv, Dgrbhvarrv, Dgrbhvarlv);
1109 
1110             // compute fluctuations in up/down and left/right interpolations of colours
1111             Dgrbvvaruv = LC2VFU(dgintv[indx - v1]) + LC2VFU(dgintv[indx - v2]);
1112             Dgrbvvardv = LC2VFU(dgintv[indx + v1]) + LC2VFU(dgintv[indx + v2]);
1113 
1114             Dgrbhvarlv = vaddc2vfu(dginth[indx - 2]);
1115             Dgrbhvarrv = vaddc2vfu(dginth[indx + 1]);
1116 
1117             vfloat vcdvar1v = epssqv + LC2VFU(dgintv[indx]) + vintpf(vwtv, Dgrbvvardv, Dgrbvvaruv);
1118             vfloat hcdvar1v = epssqv + LC2VFU(dginth[indx]) + vintpf(hwtv, Dgrbhvarrv, Dgrbhvarlv);
1119 
1120             // determine adaptive weights for G interpolation
1121             vfloat varwtv = hcdvarv / (vcdvarv + hcdvarv);
1122             vfloat diffwtv = hcdvar1v / (vcdvar1v + hcdvar1v);
1123 
1124             // if both agree on interpolation direction, choose the one with strongest directional
1125             // discrimination;
1126             // otherwise, choose the u/d and l/r difference fluctuation weights
1127             vmask decmask = vandm(vmaskf_gt((zd5v - varwtv) * (zd5v - diffwtv), ZEROV),
1128                                   vmaskf_lt(vabsf(zd5v - diffwtv), vabsf(zd5v - varwtv)));
1129             STVFU(hvwt[indx >> 1], vself(decmask, varwtv, diffwtv));
1130           }
1131         }
1132 
1133 #else
1134 
1135         for(int rr = 6; rr < rr1 - 6; rr++)
1136         {
1137           for(int cc = 6 + (FC(rr, 2, filters) & 1), indx = rr * ts + cc; cc < cc1 - 6; cc += 2, indx += 2)
1138           {
1139 
1140             // compute colour difference variances in cardinal directions
1141 
1142             float uave = vcd[indx] + vcd[indx - v1] + vcd[indx - v2] + vcd[indx - v3];
1143             float dave = vcd[indx] + vcd[indx + v1] + vcd[indx + v2] + vcd[indx + v3];
1144             float lave = hcd[indx] + hcd[indx - 1] + hcd[indx - 2] + hcd[indx - 3];
1145             float rave = hcd[indx] + hcd[indx + 1] + hcd[indx + 2] + hcd[indx + 3];
1146 
1147             // colour difference (G-R or G-B) variance in up/down/left/right directions
1148             float Dgrbvvaru = SQR(vcd[indx] - uave) + SQR(vcd[indx - v1] - uave) + SQR(vcd[indx - v2] - uave)
1149                               + SQR(vcd[indx - v3] - uave);
1150             float Dgrbvvard = SQR(vcd[indx] - dave) + SQR(vcd[indx + v1] - dave) + SQR(vcd[indx + v2] - dave)
1151                               + SQR(vcd[indx + v3] - dave);
1152             float Dgrbhvarl = SQR(hcd[indx] - lave) + SQR(hcd[indx - 1] - lave) + SQR(hcd[indx - 2] - lave)
1153                               + SQR(hcd[indx - 3] - lave);
1154             float Dgrbhvarr = SQR(hcd[indx] - rave) + SQR(hcd[indx + 1] - rave) + SQR(hcd[indx + 2] - rave)
1155                               + SQR(hcd[indx + 3] - rave);
1156 
1157             float hwt = dirwts1[indx - 1] / (dirwts1[indx - 1] + dirwts1[indx + 1]);
1158             float vwt = dirwts0[indx - v1] / (dirwts0[indx + v1] + dirwts0[indx - v1]);
1159 
1160             float vcdvar = epssq + vwt * Dgrbvvard + (1.f - vwt) * Dgrbvvaru;
1161             float hcdvar = epssq + hwt * Dgrbhvarr + (1.f - hwt) * Dgrbhvarl;
1162 
1163             // compute fluctuations in up/down and left/right interpolations of colours
1164             Dgrbvvaru = (dgintv[indx]) + (dgintv[indx - v1]) + (dgintv[indx - v2]);
1165             Dgrbvvard = (dgintv[indx]) + (dgintv[indx + v1]) + (dgintv[indx + v2]);
1166             Dgrbhvarl = (dginth[indx]) + (dginth[indx - 1]) + (dginth[indx - 2]);
1167             Dgrbhvarr = (dginth[indx]) + (dginth[indx + 1]) + (dginth[indx + 2]);
1168 
1169             float vcdvar1 = epssq + vwt * Dgrbvvard + (1.f - vwt) * Dgrbvvaru;
1170             float hcdvar1 = epssq + hwt * Dgrbhvarr + (1.f - hwt) * Dgrbhvarl;
1171 
1172             // determine adaptive weights for G interpolation
1173             float varwt = hcdvar / (vcdvar + hcdvar);
1174             float diffwt = hcdvar1 / (vcdvar1 + hcdvar1);
1175 
1176             // if both agree on interpolation direction, choose the one with strongest directional
1177             // discrimination;
1178             // otherwise, choose the u/d and l/r difference fluctuation weights
1179             if((0.5 - varwt) * (0.5 - diffwt) > 0 && fabsf(0.5f - diffwt) < fabsf(0.5f - varwt))
1180             {
1181               hvwt[indx >> 1] = varwt;
1182             }
1183             else
1184             {
1185               hvwt[indx >> 1] = diffwt;
1186             }
1187           }
1188         }
1189 
1190 #endif
1191 
1192 #ifdef __SSE2__
1193         vfloat gaussg0 = F2V(gaussgrad[0]);
1194         vfloat gaussg1 = F2V(gaussgrad[1]);
1195         vfloat gaussg2 = F2V(gaussgrad[2]);
1196         vfloat gaussg3 = F2V(gaussgrad[3]);
1197         vfloat gaussg4 = F2V(gaussgrad[4]);
1198         vfloat gaussg5 = F2V(gaussgrad[5]);
1199         vfloat gausso0 = F2V(gaussodd[0]);
1200         vfloat gausso1 = F2V(gaussodd[1]);
1201         vfloat gausso2 = F2V(gaussodd[2]);
1202         vfloat gausso3 = F2V(gaussodd[3]);
1203 
1204 #endif
1205 
1206         // precompute nyquist
1207         for(int rr = 6; rr < rr1 - 6; rr++)
1208         {
1209           int cc = 6 + (FC(rr, 2, filters) & 1);
1210           int indx = rr * ts + cc;
1211 
1212 #ifdef __SSE2__
1213 
1214           for(; cc < cc1 - 7; cc += 8, indx += 8)
1215           {
1216             vfloat valv
1217                 = (gausso0 * LC2VFU(cddiffsq[indx])
1218                    + gausso1 * (LC2VFU(cddiffsq[(indx - m1)]) + LC2VFU(cddiffsq[(indx + p1)])
1219                                 + LC2VFU(cddiffsq[(indx - p1)]) + LC2VFU(cddiffsq[(indx + m1)]))
1220                    + gausso2 * (LC2VFU(cddiffsq[(indx - v2)]) + LC2VFU(cddiffsq[(indx - 2)])
1221                                 + LC2VFU(cddiffsq[(indx + 2)]) + LC2VFU(cddiffsq[(indx + v2)]))
1222                    + gausso3 * (LC2VFU(cddiffsq[(indx - m2)]) + LC2VFU(cddiffsq[(indx + p2)])
1223                                 + LC2VFU(cddiffsq[(indx - p2)]) + LC2VFU(cddiffsq[(indx + m2)])))
1224                   - (gaussg0 * LC2VFU(delhvsqsum[indx])
1225                      + gaussg1 * (LC2VFU(delhvsqsum[indx - v1]) + LC2VFU(delhvsqsum[indx - 1])
1226                                   + LC2VFU(delhvsqsum[indx + 1]) + LC2VFU(delhvsqsum[indx + v1]))
1227                      + gaussg2 * (LC2VFU(delhvsqsum[indx - m1]) + LC2VFU(delhvsqsum[indx + p1])
1228                                   + LC2VFU(delhvsqsum[indx - p1]) + LC2VFU(delhvsqsum[indx + m1]))
1229                      + gaussg3 * (LC2VFU(delhvsqsum[indx - v2]) + LC2VFU(delhvsqsum[indx - 2])
1230                                   + LC2VFU(delhvsqsum[indx + 2]) + LC2VFU(delhvsqsum[indx + v2]))
1231                      + gaussg4 * (LC2VFU(delhvsqsum[indx - v2 - 1]) + LC2VFU(delhvsqsum[indx - v2 + 1])
1232                                   + LC2VFU(delhvsqsum[indx - ts - 2]) + LC2VFU(delhvsqsum[indx - ts + 2])
1233                                   + LC2VFU(delhvsqsum[indx + ts - 2]) + LC2VFU(delhvsqsum[indx + ts + 2])
1234                                   + LC2VFU(delhvsqsum[indx + v2 - 1]) + LC2VFU(delhvsqsum[indx + v2 + 1]))
1235                      + gaussg5 * (LC2VFU(delhvsqsum[indx - m2]) + LC2VFU(delhvsqsum[indx + p2])
1236                                   + LC2VFU(delhvsqsum[indx - p2]) + LC2VFU(delhvsqsum[indx + m2])));
1237             STVFU(nyqutest[indx >> 1], valv);
1238           }
1239 
1240 #endif
1241 
1242           for(; cc < cc1 - 6; cc += 2, indx += 2)
1243           {
1244             nyqutest[indx >> 1]
1245                 = (gaussodd[0] * cddiffsq[indx]
1246                    + gaussodd[1] * (cddiffsq[(indx - m1)] + cddiffsq[(indx + p1)] + cddiffsq[(indx - p1)]
1247                                     + cddiffsq[(indx + m1)])
1248                    + gaussodd[2] * (cddiffsq[(indx - v2)] + cddiffsq[(indx - 2)] + cddiffsq[(indx + 2)]
1249                                     + cddiffsq[(indx + v2)])
1250                    + gaussodd[3] * (cddiffsq[(indx - m2)] + cddiffsq[(indx + p2)] + cddiffsq[(indx - p2)]
1251                                     + cddiffsq[(indx + m2)]))
1252                   - (gaussgrad[0] * delhvsqsum[indx]
1253                      + gaussgrad[1] * (delhvsqsum[indx - v1] + delhvsqsum[indx + 1] + delhvsqsum[indx - 1]
1254                                        + delhvsqsum[indx + v1])
1255                      + gaussgrad[2] * (delhvsqsum[indx - m1] + delhvsqsum[indx + p1] + delhvsqsum[indx - p1]
1256                                        + delhvsqsum[indx + m1])
1257                      + gaussgrad[3] * (delhvsqsum[indx - v2] + delhvsqsum[indx - 2] + delhvsqsum[indx + 2]
1258                                        + delhvsqsum[indx + v2])
1259                      + gaussgrad[4] * (delhvsqsum[indx - v2 - 1] + delhvsqsum[indx - v2 + 1]
1260                                        + delhvsqsum[indx - ts - 2] + delhvsqsum[indx - ts + 2]
1261                                        + delhvsqsum[indx + ts - 2] + delhvsqsum[indx + ts + 2]
1262                                        + delhvsqsum[indx + v2 - 1] + delhvsqsum[indx + v2 + 1])
1263                      + gaussgrad[5] * (delhvsqsum[indx - m2] + delhvsqsum[indx + p2] + delhvsqsum[indx - p2]
1264                                        + delhvsqsum[indx + m2]));
1265           }
1266         }
1267 
1268         // Nyquist test
1269         int nystartrow = 0;
1270         int nyendrow = 0;
1271         int nystartcol = ts + 1;
1272         int nyendcol = 0;
1273 
1274         for(int rr = 6; rr < rr1 - 6; rr++)
1275         {
1276           for(int cc = 6 + (FC(rr, 2, filters) & 1), indx = rr * ts + cc; cc < cc1 - 6; cc += 2, indx += 2)
1277           {
1278 
1279             // nyquist texture test: ask if difference of vcd compared to hcd is larger or smaller than RGGB
1280             // gradients
1281             if(nyqutest[indx >> 1] > 0.f)
1282             {
1283               nyquist[indx >> 1] = 1; // nyquist=1 for nyquist region
1284               nystartrow = nystartrow ? nystartrow : rr;
1285               nyendrow = rr;
1286               nystartcol = nystartcol > cc ? cc : nystartcol;
1287               nyendcol = nyendcol < cc ? cc : nyendcol;
1288             }
1289           }
1290         }
1291 
1292 
1293         bool doNyquist = nystartrow != nyendrow && nystartcol != nyendcol;
1294 
1295         if(doNyquist)
1296         {
1297           nyendrow++; // because of < condition
1298           nyendcol++; // because of < condition
1299           nystartcol -= (nystartcol & 1);
1300           nystartrow = std::max(8, nystartrow);
1301           nyendrow = std::min(rr1 - 8, nyendrow);
1302           nystartcol = std::max(8, nystartcol);
1303           nyendcol = std::min(cc1 - 8, nyendcol);
1304           memset(&nyquist2[4 * tsh], 0, sizeof(char) * (ts - 8) * tsh);
1305 
1306 #ifdef __SSE2__
1307           vint fourvb = _mm_set1_epi8(4);
1308           vint onevb = _mm_set1_epi8(1);
1309 
1310 #endif
1311 
1312           for(int rr = nystartrow; rr < nyendrow; rr++)
1313           {
1314 #ifdef __SSE2__
1315 
1316             for(int indx = rr * ts; indx < rr * ts + cc1; indx += 32)
1317             {
1318               vint nyquisttemp1v = _mm_adds_epi8(_mm_load_si128((vint *)&nyquist[(indx - v2) >> 1]),
1319                                                  _mm_loadu_si128((vint *)&nyquist[(indx - m1) >> 1]));
1320               vint nyquisttemp2v = _mm_adds_epi8(_mm_loadu_si128((vint *)&nyquist[(indx + p1) >> 1]),
1321                                                  _mm_loadu_si128((vint *)&nyquist[(indx - 2) >> 1]));
1322               vint nyquisttemp3v = _mm_adds_epi8(_mm_loadu_si128((vint *)&nyquist[(indx + 2) >> 1]),
1323                                                  _mm_loadu_si128((vint *)&nyquist[(indx - p1) >> 1]));
1324               vint valv = _mm_load_si128((vint *)&nyquist[indx >> 1]);
1325               vint nyquisttemp4v = _mm_adds_epi8(_mm_loadu_si128((vint *)&nyquist[(indx + m1) >> 1]),
1326                                                  _mm_load_si128((vint *)&nyquist[(indx + v2) >> 1]));
1327               nyquisttemp1v = _mm_adds_epi8(nyquisttemp1v, nyquisttemp3v);
1328               nyquisttemp2v = _mm_adds_epi8(nyquisttemp2v, nyquisttemp4v);
1329               nyquisttemp1v = _mm_adds_epi8(nyquisttemp1v, nyquisttemp2v);
1330               valv = vselc(_mm_cmpgt_epi8(nyquisttemp1v, fourvb), onevb, valv);
1331               valv = vselinotzero(_mm_cmplt_epi8(nyquisttemp1v, fourvb), valv);
1332               _mm_store_si128((vint *)&nyquist2[indx >> 1], valv);
1333             }
1334 
1335 #else
1336 
1337             for(int indx = rr * ts + nystartcol + (FC(rr, 2, filters) & 1); indx < rr * ts + nyendcol;
1338                 indx += 2)
1339             {
1340               unsigned int nyquisttemp
1341                   = (nyquist[(indx - v2) >> 1] + nyquist[(indx - m1) >> 1] + nyquist[(indx + p1) >> 1]
1342                      + nyquist[(indx - 2) >> 1] + nyquist[(indx + 2) >> 1] + nyquist[(indx - p1) >> 1]
1343                      + nyquist[(indx + m1) >> 1] + nyquist[(indx + v2) >> 1]);
1344               // if most of your neighbours are named Nyquist, it's likely that you're one too, or not
1345               nyquist2[indx >> 1] = nyquisttemp > 4 ? 1 : (nyquisttemp < 4 ? 0 : nyquist[indx >> 1]);
1346             }
1347 
1348 #endif
1349           }
1350 
1351           // end of Nyquist test
1352 
1353           // in areas of Nyquist texture, do area interpolation
1354           for(int rr = nystartrow; rr < nyendrow; rr++)
1355             for(int indx = rr * ts + nystartcol + (FC(rr, 2, filters) & 1); indx < rr * ts + nyendcol;
1356                 indx += 2)
1357             {
1358 
1359               if(nyquist2[indx >> 1])
1360               {
1361                 // area interpolation
1362 
1363                 float sumcfa = 0.f, sumh = 0.f, sumv = 0.f, sumsqh = 0.f, sumsqv = 0.f, areawt = 0.f;
1364 
1365                 for(int i = -6; i < 7; i += 2)
1366                 {
1367                   int indx1 = indx + (i * ts) - 6;
1368 
1369                   for(int j = -6; j < 7; j += 2, indx1 += 2)
1370                   {
1371                     if(nyquist2[indx1 >> 1])
1372                     {
1373                       float cfatemp = cfa[indx1];
1374                       sumcfa += cfatemp;
1375                       sumh += (cfa[indx1 - 1] + cfa[indx1 + 1]);
1376                       sumv += (cfa[indx1 - v1] + cfa[indx1 + v1]);
1377                       sumsqh += SQR(cfatemp - cfa[indx1 - 1]) + SQR(cfatemp - cfa[indx1 + 1]);
1378                       sumsqv += SQR(cfatemp - cfa[indx1 - v1]) + SQR(cfatemp - cfa[indx1 + v1]);
1379                       areawt += 1;
1380                     }
1381                   }
1382                 }
1383 
1384                 // horizontal and vertical colour differences, and adaptive weight
1385                 sumh = sumcfa - xdiv2f(sumh);
1386                 sumv = sumcfa - xdiv2f(sumv);
1387                 areawt = xdiv2f(areawt);
1388                 float hcdvar = epssq + fabsf(areawt * sumsqh - sumh * sumh);
1389                 float vcdvar = epssq + fabsf(areawt * sumsqv - sumv * sumv);
1390                 hvwt[indx >> 1] = hcdvar / (vcdvar + hcdvar);
1391 
1392                 // end of area interpolation
1393               }
1394             }
1395         }
1396 
1397 
1398         // populate G at R/B sites
1399         for(int rr = 8; rr < rr1 - 8; rr++)
1400           for(int indx = rr * ts + 8 + (FC(rr, 2, filters) & 1); indx < rr * ts + cc1 - 8; indx += 2)
1401           {
1402 
1403             // first ask if one gets more directional discrimination from nearby B/R sites
1404             float hvwtalt = xdivf(hvwt[(indx - m1) >> 1] + hvwt[(indx + p1) >> 1] + hvwt[(indx - p1) >> 1]
1405                                       + hvwt[(indx + m1) >> 1],
1406                                   2);
1407 
1408             hvwt[indx >> 1]
1409                 = fabsf(0.5f - hvwt[indx >> 1]) < fabsf(0.5f - hvwtalt) ? hvwtalt : hvwt[indx >> 1];
1410             // a better result was obtained from the neighbours
1411 
1412             Dgrb[0][indx >> 1] = intp(hvwt[indx >> 1], vcd[indx], hcd[indx]); // evaluate colour differences
1413 
1414             rgbgreen[indx] = cfa[indx] + Dgrb[0][indx >> 1]; // evaluate G (finally!)
1415 
1416             // local curvature in G (preparation for nyquist refinement step)
1417             Dgrb2[indx >> 1].h = nyquist2[indx >> 1]
1418                                      ? SQR(rgbgreen[indx] - xdiv2f(rgbgreen[indx - 1] + rgbgreen[indx + 1]))
1419                                      : 0.f;
1420             Dgrb2[indx >> 1].v = nyquist2[indx >> 1]
1421                                      ? SQR(rgbgreen[indx] - xdiv2f(rgbgreen[indx - v1] + rgbgreen[indx + v1]))
1422                                      : 0.f;
1423           }
1424 
1425 
1426         // end of standard interpolation
1427 
1428         // refine Nyquist areas using G curvatures
1429         if(doNyquist)
1430         {
1431           for(int rr = nystartrow; rr < nyendrow; rr++)
1432             for(int indx = rr * ts + nystartcol + (FC(rr, 2, filters) & 1); indx < rr * ts + nyendcol;
1433                 indx += 2)
1434             {
1435 
1436               if(nyquist2[indx >> 1])
1437               {
1438                 // local averages (over Nyquist pixels only) of G curvature squared
1439                 float gvarh
1440                     = epssq + (gquinc[0] * Dgrb2[indx >> 1].h
1441                                + gquinc[1] * (Dgrb2[(indx - m1) >> 1].h + Dgrb2[(indx + p1) >> 1].h
1442                                               + Dgrb2[(indx - p1) >> 1].h + Dgrb2[(indx + m1) >> 1].h)
1443                                + gquinc[2] * (Dgrb2[(indx - v2) >> 1].h + Dgrb2[(indx - 2) >> 1].h
1444                                               + Dgrb2[(indx + 2) >> 1].h + Dgrb2[(indx + v2) >> 1].h)
1445                                + gquinc[3] * (Dgrb2[(indx - m2) >> 1].h + Dgrb2[(indx + p2) >> 1].h
1446                                               + Dgrb2[(indx - p2) >> 1].h + Dgrb2[(indx + m2) >> 1].h));
1447                 float gvarv
1448                     = epssq + (gquinc[0] * Dgrb2[indx >> 1].v
1449                                + gquinc[1] * (Dgrb2[(indx - m1) >> 1].v + Dgrb2[(indx + p1) >> 1].v
1450                                               + Dgrb2[(indx - p1) >> 1].v + Dgrb2[(indx + m1) >> 1].v)
1451                                + gquinc[2] * (Dgrb2[(indx - v2) >> 1].v + Dgrb2[(indx - 2) >> 1].v
1452                                               + Dgrb2[(indx + 2) >> 1].v + Dgrb2[(indx + v2) >> 1].v)
1453                                + gquinc[3] * (Dgrb2[(indx - m2) >> 1].v + Dgrb2[(indx + p2) >> 1].v
1454                                               + Dgrb2[(indx - p2) >> 1].v + Dgrb2[(indx + m2) >> 1].v));
1455                 // use the results as weights for refined G interpolation
1456                 Dgrb[0][indx >> 1] = (hcd[indx] * gvarv + vcd[indx] * gvarh) / (gvarv + gvarh);
1457                 rgbgreen[indx] = cfa[indx] + Dgrb[0][indx >> 1];
1458               }
1459             }
1460         }
1461 
1462 
1463 #ifdef __SSE2__
1464 
1465         for(int rr = 6; rr < rr1 - 6; rr++)
1466         {
1467           if((FC(rr, 2, filters) & 1) == 0)
1468           {
1469             for(int cc = 6, indx = rr * ts + cc; cc < cc1 - 6; cc += 8, indx += 8)
1470             {
1471               vfloat tempv = LC2VFU(cfa[indx + 1]);
1472               vfloat Dgrbsq1pv
1473                   = (SQRV(tempv - LC2VFU(cfa[indx + 1 - p1])) + SQRV(tempv - LC2VFU(cfa[indx + 1 + p1])));
1474               STVFU(delp[indx >> 1], vabsf(LC2VFU(cfa[indx + p1]) - LC2VFU(cfa[indx - p1])));
1475               STVFU(delm[indx >> 1], vabsf(LC2VFU(cfa[indx + m1]) - LC2VFU(cfa[indx - m1])));
1476               vfloat Dgrbsq1mv
1477                   = (SQRV(tempv - LC2VFU(cfa[indx + 1 - m1])) + SQRV(tempv - LC2VFU(cfa[indx + 1 + m1])));
1478               STVFU(Dgrbsq1m[indx >> 1], Dgrbsq1mv);
1479               STVFU(Dgrbsq1p[indx >> 1], Dgrbsq1pv);
1480             }
1481           }
1482           else
1483           {
1484             for(int cc = 6, indx = rr * ts + cc; cc < cc1 - 6; cc += 8, indx += 8)
1485             {
1486               vfloat tempv = LC2VFU(cfa[indx]);
1487               vfloat Dgrbsq1pv
1488                   = (SQRV(tempv - LC2VFU(cfa[indx - p1])) + SQRV(tempv - LC2VFU(cfa[indx + p1])));
1489               STVFU(delp[indx >> 1], vabsf(LC2VFU(cfa[indx + 1 + p1]) - LC2VFU(cfa[indx + 1 - p1])));
1490               STVFU(delm[indx >> 1], vabsf(LC2VFU(cfa[indx + 1 + m1]) - LC2VFU(cfa[indx + 1 - m1])));
1491               vfloat Dgrbsq1mv
1492                   = (SQRV(tempv - LC2VFU(cfa[indx - m1])) + SQRV(tempv - LC2VFU(cfa[indx + m1])));
1493               STVFU(Dgrbsq1m[indx >> 1], Dgrbsq1mv);
1494               STVFU(Dgrbsq1p[indx >> 1], Dgrbsq1pv);
1495             }
1496           }
1497         }
1498 
1499 #else
1500 
1501         for(int rr = 6; rr < rr1 - 6; rr++)
1502         {
1503           if((FC(rr, 2, filters) & 1) == 0)
1504           {
1505             for(int cc = 6, indx = rr * ts + cc; cc < cc1 - 6; cc += 2, indx += 2)
1506             {
1507               delp[indx >> 1] = fabsf(cfa[indx + p1] - cfa[indx - p1]);
1508               delm[indx >> 1] = fabsf(cfa[indx + m1] - cfa[indx - m1]);
1509               Dgrbsq1p[indx >> 1]
1510                   = (SQR(cfa[indx + 1] - cfa[indx + 1 - p1]) + SQR(cfa[indx + 1] - cfa[indx + 1 + p1]));
1511               Dgrbsq1m[indx >> 1]
1512                   = (SQR(cfa[indx + 1] - cfa[indx + 1 - m1]) + SQR(cfa[indx + 1] - cfa[indx + 1 + m1]));
1513             }
1514           }
1515           else
1516           {
1517             for(int cc = 6, indx = rr * ts + cc; cc < cc1 - 6; cc += 2, indx += 2)
1518             {
1519               Dgrbsq1p[indx >> 1] = (SQR(cfa[indx] - cfa[indx - p1]) + SQR(cfa[indx] - cfa[indx + p1]));
1520               Dgrbsq1m[indx >> 1] = (SQR(cfa[indx] - cfa[indx - m1]) + SQR(cfa[indx] - cfa[indx + m1]));
1521               delp[indx >> 1] = fabsf(cfa[indx + 1 + p1] - cfa[indx + 1 - p1]);
1522               delm[indx >> 1] = fabsf(cfa[indx + 1 + m1] - cfa[indx + 1 - m1]);
1523             }
1524           }
1525         }
1526 
1527 #endif
1528 
1529 // diagonal interpolation correction
1530 
1531 #ifdef __SSE2__
1532         vfloat gausseven0v = F2V(gausseven[0]);
1533         vfloat gausseven1v = F2V(gausseven[1]);
1534 #endif
1535 
1536         for(int rr = 8; rr < rr1 - 8; rr++)
1537         {
1538 #ifdef __SSE2__
1539 
1540           for(int indx = rr * ts + 8 + (FC(rr, 2, filters) & 1), indx1 = indx >> 1; indx < rr * ts + cc1 - 8;
1541               indx += 8, indx1 += 4)
1542           {
1543 
1544             // diagonal colour ratios
1545             vfloat cfav = LC2VFU(cfa[indx]);
1546 
1547             vfloat temp1v = LC2VFU(cfa[indx + m1]);
1548             vfloat temp2v = LC2VFU(cfa[indx + m2]);
1549             vfloat rbsev = vmul2f(temp1v) / (epsv + cfav + temp2v);
1550             rbsev = vself(vmaskf_lt(vabsf(onev - rbsev), arthreshv), cfav * rbsev,
1551                           temp1v + zd5v * (cfav - temp2v));
1552 
1553             temp1v = LC2VFU(cfa[indx - m1]);
1554             temp2v = LC2VFU(cfa[indx - m2]);
1555             vfloat rbnwv = vmul2f(temp1v) / (epsv + cfav + temp2v);
1556             rbnwv = vself(vmaskf_lt(vabsf(onev - rbnwv), arthreshv), cfav * rbnwv,
1557                           temp1v + zd5v * (cfav - temp2v));
1558 
1559             temp1v = epsv + LVFU(delm[indx1]);
1560             vfloat wtsev = temp1v + LVFU(delm[(indx + m1) >> 1])
1561                            + LVFU(delm[(indx + m2) >> 1]); // same as for wtu,wtd,wtl,wtr
1562             vfloat wtnwv = temp1v + LVFU(delm[(indx - m1) >> 1]) + LVFU(delm[(indx - m2) >> 1]);
1563 
1564             vfloat rbmv = (wtsev * rbnwv + wtnwv * rbsev) / (wtsev + wtnwv);
1565 
1566             temp1v = ULIMV(rbmv, LC2VFU(cfa[indx - m1]), LC2VFU(cfa[indx + m1]));
1567             vfloat wtv = vmul2f(cfav - rbmv) / (epsv + rbmv + cfav);
1568             temp2v = vintpf(wtv, rbmv, temp1v);
1569 
1570             temp2v = vself(vmaskf_lt(rbmv + rbmv, cfav), temp1v, temp2v);
1571             temp2v = vself(vmaskf_lt(rbmv, cfav), temp2v, rbmv);
1572             STVFU(rbm[indx1], vself(vmaskf_gt(temp2v, clip_ptv),
1573                                     ULIMV(temp2v, LC2VFU(cfa[indx - m1]), LC2VFU(cfa[indx + m1])), temp2v));
1574 
1575 
1576             temp1v = LC2VFU(cfa[indx + p1]);
1577             temp2v = LC2VFU(cfa[indx + p2]);
1578             vfloat rbnev = vmul2f(temp1v) / (epsv + cfav + temp2v);
1579             rbnev = vself(vmaskf_lt(vabsf(onev - rbnev), arthreshv), cfav * rbnev,
1580                           temp1v + zd5v * (cfav - temp2v));
1581 
1582             temp1v = LC2VFU(cfa[indx - p1]);
1583             temp2v = LC2VFU(cfa[indx - p2]);
1584             vfloat rbswv = vmul2f(temp1v) / (epsv + cfav + temp2v);
1585             rbswv = vself(vmaskf_lt(vabsf(onev - rbswv), arthreshv), cfav * rbswv,
1586                           temp1v + zd5v * (cfav - temp2v));
1587 
1588             temp1v = epsv + LVFU(delp[indx1]);
1589             vfloat wtnev = temp1v + LVFU(delp[(indx + p1) >> 1]) + LVFU(delp[(indx + p2) >> 1]);
1590             vfloat wtswv = temp1v + LVFU(delp[(indx - p1) >> 1]) + LVFU(delp[(indx - p2) >> 1]);
1591 
1592             vfloat rbpv = (wtnev * rbswv + wtswv * rbnev) / (wtnev + wtswv);
1593 
1594             temp1v = ULIMV(rbpv, LC2VFU(cfa[indx - p1]), LC2VFU(cfa[indx + p1]));
1595             wtv = vmul2f(cfav - rbpv) / (epsv + rbpv + cfav);
1596             temp2v = vintpf(wtv, rbpv, temp1v);
1597 
1598             temp2v = vself(vmaskf_lt(rbpv + rbpv, cfav), temp1v, temp2v);
1599             temp2v = vself(vmaskf_lt(rbpv, cfav), temp2v, rbpv);
1600             STVFU(rbp[indx1], vself(vmaskf_gt(temp2v, clip_ptv),
1601                                     ULIMV(temp2v, LC2VFU(cfa[indx - p1]), LC2VFU(cfa[indx + p1])), temp2v));
1602 
1603             vfloat rbvarmv
1604                 = epssqv
1605                   + (gausseven0v * (LVFU(Dgrbsq1m[(indx - v1) >> 1]) + LVFU(Dgrbsq1m[(indx - 1) >> 1])
1606                                     + LVFU(Dgrbsq1m[(indx + 1) >> 1]) + LVFU(Dgrbsq1m[(indx + v1) >> 1]))
1607                      + gausseven1v
1608                            * (LVFU(Dgrbsq1m[(indx - v2 - 1) >> 1]) + LVFU(Dgrbsq1m[(indx - v2 + 1) >> 1])
1609                               + LVFU(Dgrbsq1m[(indx - 2 - v1) >> 1]) + LVFU(Dgrbsq1m[(indx + 2 - v1) >> 1])
1610                               + LVFU(Dgrbsq1m[(indx - 2 + v1) >> 1]) + LVFU(Dgrbsq1m[(indx + 2 + v1) >> 1])
1611                               + LVFU(Dgrbsq1m[(indx + v2 - 1) >> 1]) + LVFU(Dgrbsq1m[(indx + v2 + 1) >> 1])));
1612             STVFU(pmwt[indx1],
1613                   rbvarmv / ((epssqv
1614                               + (gausseven0v
1615                                      * (LVFU(Dgrbsq1p[(indx - v1) >> 1]) + LVFU(Dgrbsq1p[(indx - 1) >> 1])
1616                                         + LVFU(Dgrbsq1p[(indx + 1) >> 1]) + LVFU(Dgrbsq1p[(indx + v1) >> 1]))
1617                                  + gausseven1v * (LVFU(Dgrbsq1p[(indx - v2 - 1) >> 1])
1618                                                   + LVFU(Dgrbsq1p[(indx - v2 + 1) >> 1])
1619                                                   + LVFU(Dgrbsq1p[(indx - 2 - v1) >> 1])
1620                                                   + LVFU(Dgrbsq1p[(indx + 2 - v1) >> 1])
1621                                                   + LVFU(Dgrbsq1p[(indx - 2 + v1) >> 1])
1622                                                   + LVFU(Dgrbsq1p[(indx + 2 + v1) >> 1])
1623                                                   + LVFU(Dgrbsq1p[(indx + v2 - 1) >> 1])
1624                                                   + LVFU(Dgrbsq1p[(indx + v2 + 1) >> 1]))))
1625                              + rbvarmv));
1626           }
1627 
1628 #else
1629 
1630           for(int cc = 8 + (FC(rr, 2, filters) & 1), indx = rr * ts + cc, indx1 = indx >> 1; cc < cc1 - 8;
1631               cc += 2, indx += 2, indx1++)
1632           {
1633 
1634             // diagonal colour ratios
1635             float crse = xmul2f(cfa[indx + m1]) / (eps + cfa[indx] + (cfa[indx + m2]));
1636             float crnw = xmul2f(cfa[indx - m1]) / (eps + cfa[indx] + (cfa[indx - m2]));
1637             float crne = xmul2f(cfa[indx + p1]) / (eps + cfa[indx] + (cfa[indx + p2]));
1638             float crsw = xmul2f(cfa[indx - p1]) / (eps + cfa[indx] + (cfa[indx - p2]));
1639             // colour differences in diagonal directions
1640             float rbse, rbnw, rbne, rbsw;
1641 
1642             // assign B/R at R/B sites
1643             if(fabsf(1.f - crse) < arthresh)
1644             {
1645               rbse = cfa[indx] * crse; // use this if more precise diag interp is necessary
1646             }
1647             else
1648             {
1649               rbse = (cfa[indx + m1]) + xdiv2f(cfa[indx] - cfa[indx + m2]);
1650             }
1651 
1652             if(fabsf(1.f - crnw) < arthresh)
1653             {
1654               rbnw = cfa[indx] * crnw;
1655             }
1656             else
1657             {
1658               rbnw = (cfa[indx - m1]) + xdiv2f(cfa[indx] - cfa[indx - m2]);
1659             }
1660 
1661             if(fabsf(1.f - crne) < arthresh)
1662             {
1663               rbne = cfa[indx] * crne;
1664             }
1665             else
1666             {
1667               rbne = (cfa[indx + p1]) + xdiv2f(cfa[indx] - cfa[indx + p2]);
1668             }
1669 
1670             if(fabsf(1.f - crsw) < arthresh)
1671             {
1672               rbsw = cfa[indx] * crsw;
1673             }
1674             else
1675             {
1676               rbsw = (cfa[indx - p1]) + xdiv2f(cfa[indx] - cfa[indx - p2]);
1677             }
1678 
1679             float wtse = eps + delm[indx1] + delm[(indx + m1) >> 1]
1680                          + delm[(indx + m2) >> 1]; // same as for wtu,wtd,wtl,wtr
1681             float wtnw = eps + delm[indx1] + delm[(indx - m1) >> 1] + delm[(indx - m2) >> 1];
1682             float wtne = eps + delp[indx1] + delp[(indx + p1) >> 1] + delp[(indx + p2) >> 1];
1683             float wtsw = eps + delp[indx1] + delp[(indx - p1) >> 1] + delp[(indx - p2) >> 1];
1684 
1685 
1686             rbm[indx1] = (wtse * rbnw + wtnw * rbse) / (wtse + wtnw);
1687             rbp[indx1] = (wtne * rbsw + wtsw * rbne) / (wtne + wtsw);
1688 
1689             // variance of R-B in plus/minus directions
1690             float rbvarm
1691                 = epssq
1692                   + (gausseven[0] * (Dgrbsq1m[(indx - v1) >> 1] + Dgrbsq1m[(indx - 1) >> 1]
1693                                      + Dgrbsq1m[(indx + 1) >> 1] + Dgrbsq1m[(indx + v1) >> 1])
1694                      + gausseven[1] * (Dgrbsq1m[(indx - v2 - 1) >> 1] + Dgrbsq1m[(indx - v2 + 1) >> 1]
1695                                        + Dgrbsq1m[(indx - 2 - v1) >> 1] + Dgrbsq1m[(indx + 2 - v1) >> 1]
1696                                        + Dgrbsq1m[(indx - 2 + v1) >> 1] + Dgrbsq1m[(indx + 2 + v1) >> 1]
1697                                        + Dgrbsq1m[(indx + v2 - 1) >> 1] + Dgrbsq1m[(indx + v2 + 1) >> 1]));
1698             pmwt[indx1]
1699                 = rbvarm
1700                   / ((epssq + (gausseven[0] * (Dgrbsq1p[(indx - v1) >> 1] + Dgrbsq1p[(indx - 1) >> 1]
1701                                                + Dgrbsq1p[(indx + 1) >> 1] + Dgrbsq1p[(indx + v1) >> 1])
1702                                + gausseven[1]
1703                                      * (Dgrbsq1p[(indx - v2 - 1) >> 1] + Dgrbsq1p[(indx - v2 + 1) >> 1]
1704                                         + Dgrbsq1p[(indx - 2 - v1) >> 1] + Dgrbsq1p[(indx + 2 - v1) >> 1]
1705                                         + Dgrbsq1p[(indx - 2 + v1) >> 1] + Dgrbsq1p[(indx + 2 + v1) >> 1]
1706                                         + Dgrbsq1p[(indx + v2 - 1) >> 1] + Dgrbsq1p[(indx + v2 + 1) >> 1])))
1707                      + rbvarm);
1708 
1709             // bound the interpolation in regions of high saturation
1710 
1711             if(rbp[indx1] < cfa[indx])
1712             {
1713               if(xmul2f(rbp[indx1]) < cfa[indx])
1714               {
1715                 rbp[indx1] = ULIM(rbp[indx1], cfa[indx - p1], cfa[indx + p1]);
1716               }
1717               else
1718               {
1719                 float pwt = xmul2f(cfa[indx] - rbp[indx1]) / (eps + rbp[indx1] + cfa[indx]);
1720                 rbp[indx1]
1721                     = pwt * rbp[indx1] + (1.f - pwt) * ULIM(rbp[indx1], cfa[indx - p1], cfa[indx + p1]);
1722               }
1723             }
1724 
1725             if(rbm[indx1] < cfa[indx])
1726             {
1727               if(xmul2f(rbm[indx1]) < cfa[indx])
1728               {
1729                 rbm[indx1] = ULIM(rbm[indx1], cfa[indx - m1], cfa[indx + m1]);
1730               }
1731               else
1732               {
1733                 float mwt = xmul2f(cfa[indx] - rbm[indx1]) / (eps + rbm[indx1] + cfa[indx]);
1734                 rbm[indx1]
1735                     = mwt * rbm[indx1] + (1.f - mwt) * ULIM(rbm[indx1], cfa[indx - m1], cfa[indx + m1]);
1736               }
1737             }
1738 
1739             if(rbp[indx1] > clip_pt)
1740             {
1741               rbp[indx1] = ULIM(rbp[indx1], cfa[indx - p1], cfa[indx + p1]);
1742             }
1743 
1744             if(rbm[indx1] > clip_pt)
1745             {
1746               rbm[indx1] = ULIM(rbm[indx1], cfa[indx - m1], cfa[indx + m1]);
1747             }
1748           }
1749 
1750 #endif
1751         }
1752 
1753 #ifdef __SSE2__
1754         vfloat zd25v = F2V(0.25f);
1755 #endif
1756 
1757         for(int rr = 10; rr < rr1 - 10; rr++)
1758 #ifdef __SSE2__
1759           for(int indx = rr * ts + 10 + (FC(rr, 2, filters) & 1), indx1 = indx >> 1;
1760               indx < rr * ts + cc1 - 10; indx += 8, indx1 += 4)
1761           {
1762 
1763             // first ask if one gets more directional discrimination from nearby B/R sites
1764             vfloat pmwtaltv = zd25v * (LVFU(pmwt[(indx - m1) >> 1]) + LVFU(pmwt[(indx + p1) >> 1])
1765                                        + LVFU(pmwt[(indx - p1) >> 1]) + LVFU(pmwt[(indx + m1) >> 1]));
1766             vfloat tempv = LVFU(pmwt[indx1]);
1767             tempv = vself(vmaskf_lt(vabsf(zd5v - tempv), vabsf(zd5v - pmwtaltv)), pmwtaltv, tempv);
1768             STVFU(pmwt[indx1], tempv);
1769             STVFU(rbint[indx1],
1770                   zd5v * (LC2VFU(cfa[indx]) + vintpf(tempv, LVFU(rbp[indx1]), LVFU(rbm[indx1]))));
1771           }
1772 
1773 #else
1774 
1775           for(int cc = 10 + (FC(rr, 2, filters) & 1), indx = rr * ts + cc, indx1 = indx >> 1; cc < cc1 - 10;
1776               cc += 2, indx += 2, indx1++)
1777           {
1778 
1779             // first ask if one gets more directional discrimination from nearby B/R sites
1780             float pmwtalt = xdivf(pmwt[(indx - m1) >> 1] + pmwt[(indx + p1) >> 1] + pmwt[(indx - p1) >> 1]
1781                                       + pmwt[(indx + m1) >> 1],
1782                                   2);
1783 
1784             if(fabsf(0.5f - pmwt[indx1]) < fabsf(0.5f - pmwtalt))
1785             {
1786               pmwt[indx1] = pmwtalt; // a better result was obtained from the neighbours
1787             }
1788 
1789             rbint[indx1] = xdiv2f(cfa[indx] + rbm[indx1] * (1.f - pmwt[indx1])
1790                                   + rbp[indx1] * pmwt[indx1]); // this is R+B, interpolated
1791           }
1792 
1793 #endif
1794 
1795         for(int rr = 12; rr < rr1 - 12; rr++)
1796 #ifdef __SSE2__
1797           for(int indx = rr * ts + 12 + (FC(rr, 2, filters) & 1), indx1 = indx >> 1;
1798               indx < rr * ts + cc1 - 12; indx += 8, indx1 += 4)
1799           {
1800             vmask copymask = vmaskf_ge(vabsf(zd5v - LVFU(pmwt[indx1])), vabsf(zd5v - LVFU(hvwt[indx1])));
1801 
1802             if(_mm_movemask_ps((vfloat)copymask))
1803             { // if for any of the 4 pixels the condition is true, do the maths for all 4 pixels and mask the
1804               // unused out at the end
1805               // now interpolate G vertically/horizontally using R+B values
1806               // unfortunately, since G interpolation cannot be done diagonally this may lead to colour shifts
1807               // colour ratios for G interpolation
1808               vfloat rbintv = LVFU(rbint[indx1]);
1809 
1810               // interpolated G via adaptive ratios or Hamilton-Adams in each cardinal direction
1811               vfloat cruv = vmul2f(LC2VFU(cfa[indx - v1])) / (epsv + rbintv + LVFU(rbint[(indx1 - v1)]));
1812               vfloat guv = rbintv * cruv;
1813               vfloat gu2v = LC2VFU(cfa[indx - v1]) + zd5v * (rbintv - LVFU(rbint[(indx1 - v1)]));
1814               guv = vself(vmaskf_lt(vabsf(onev - cruv), arthreshv), guv, gu2v);
1815 
1816               vfloat crdv = vmul2f(LC2VFU(cfa[indx + v1])) / (epsv + rbintv + LVFU(rbint[(indx1 + v1)]));
1817               vfloat gdv = rbintv * crdv;
1818               vfloat gd2v = LC2VFU(cfa[indx + v1]) + zd5v * (rbintv - LVFU(rbint[(indx1 + v1)]));
1819               gdv = vself(vmaskf_lt(vabsf(onev - crdv), arthreshv), gdv, gd2v);
1820 
1821               vfloat Gintvv = (LC2VFU(dirwts0[indx - v1]) * gdv + LC2VFU(dirwts0[indx + v1]) * guv)
1822                               / (LC2VFU(dirwts0[indx + v1]) + LC2VFU(dirwts0[indx - v1]));
1823               vfloat Gint1v = ULIMV(Gintvv, LC2VFU(cfa[indx - v1]), LC2VFU(cfa[indx + v1]));
1824               vfloat vwtv = vmul2f(rbintv - Gintvv) / (epsv + Gintvv + rbintv);
1825               vfloat Gint2v = vintpf(vwtv, Gintvv, Gint1v);
1826               Gint1v = vself(vmaskf_lt(vmul2f(Gintvv), rbintv), Gint1v, Gint2v);
1827               Gintvv = vself(vmaskf_lt(Gintvv, rbintv), Gint1v, Gintvv);
1828               Gintvv = vself(vmaskf_gt(Gintvv, clip_ptv),
1829                              ULIMV(Gintvv, LC2VFU(cfa[indx - v1]), LC2VFU(cfa[indx + v1])), Gintvv);
1830 
1831               vfloat crlv = vmul2f(LC2VFU(cfa[indx - 1])) / (epsv + rbintv + LVFU(rbint[(indx1 - 1)]));
1832               vfloat glv = rbintv * crlv;
1833               vfloat gl2v = LC2VFU(cfa[indx - 1]) + zd5v * (rbintv - LVFU(rbint[(indx1 - 1)]));
1834               glv = vself(vmaskf_lt(vabsf(onev - crlv), arthreshv), glv, gl2v);
1835 
1836               vfloat crrv = vmul2f(LC2VFU(cfa[indx + 1])) / (epsv + rbintv + LVFU(rbint[(indx1 + 1)]));
1837               vfloat grv = rbintv * crrv;
1838               vfloat gr2v = LC2VFU(cfa[indx + 1]) + zd5v * (rbintv - LVFU(rbint[(indx1 + 1)]));
1839               grv = vself(vmaskf_lt(vabsf(onev - crrv), arthreshv), grv, gr2v);
1840 
1841               vfloat Ginthv = (LC2VFU(dirwts1[indx - 1]) * grv + LC2VFU(dirwts1[indx + 1]) * glv)
1842                               / (LC2VFU(dirwts1[indx - 1]) + LC2VFU(dirwts1[indx + 1]));
1843               vfloat Gint1h = ULIMV(Ginthv, LC2VFU(cfa[indx - 1]), LC2VFU(cfa[indx + 1]));
1844               vfloat hwtv = vmul2f(rbintv - Ginthv) / (epsv + Ginthv + rbintv);
1845               vfloat Gint2h = vintpf(hwtv, Ginthv, Gint1h);
1846               Gint1h = vself(vmaskf_lt(vmul2f(Ginthv), rbintv), Gint1h, Gint2h);
1847               Ginthv = vself(vmaskf_lt(Ginthv, rbintv), Gint1h, Ginthv);
1848               Ginthv = vself(vmaskf_gt(Ginthv, clip_ptv),
1849                              ULIMV(Ginthv, LC2VFU(cfa[indx - 1]), LC2VFU(cfa[indx + 1])), Ginthv);
1850 
1851               vfloat greenv
1852                   = vself(copymask, vintpf(LVFU(hvwt[indx1]), Gintvv, Ginthv), LC2VFU(rgbgreen[indx]));
1853               STC2VFU(rgbgreen[indx], greenv);
1854 
1855               STVFU(Dgrb[0][indx1], vself(copymask, greenv - LC2VFU(cfa[indx]), LVFU(Dgrb[0][indx1])));
1856             }
1857           }
1858 
1859 #else
1860 
1861           for(int cc = 12 + (FC(rr, 2, filters) & 1), indx = rr * ts + cc, indx1 = indx >> 1; cc < cc1 - 12;
1862               cc += 2, indx += 2, indx1++)
1863           {
1864 
1865             if(fabsf(0.5f - pmwt[indx >> 1]) < fabsf(0.5f - hvwt[indx >> 1]))
1866             {
1867               continue;
1868             }
1869 
1870             // now interpolate G vertically/horizontally using R+B values
1871             // unfortunately, since G interpolation cannot be done diagonally this may lead to colour shifts
1872 
1873             // colour ratios for G interpolation
1874             float cru = cfa[indx - v1] * 2.0 / (eps + rbint[indx1] + rbint[(indx1 - v1)]);
1875             float crd = cfa[indx + v1] * 2.0 / (eps + rbint[indx1] + rbint[(indx1 + v1)]);
1876             float crl = cfa[indx - 1] * 2.0 / (eps + rbint[indx1] + rbint[(indx1 - 1)]);
1877             float crr = cfa[indx + 1] * 2.0 / (eps + rbint[indx1] + rbint[(indx1 + 1)]);
1878 
1879             // interpolation of G in four directions
1880             float gu, gd, gl, gr;
1881 
1882             // interpolated G via adaptive ratios or Hamilton-Adams in each cardinal direction
1883             if(fabsf(1.f - cru) < arthresh)
1884             {
1885               gu = rbint[indx1] * cru;
1886             }
1887             else
1888             {
1889               gu = cfa[indx - v1] + xdiv2f(rbint[indx1] - rbint[(indx1 - v1)]);
1890             }
1891 
1892             if(fabsf(1.f - crd) < arthresh)
1893             {
1894               gd = rbint[indx1] * crd;
1895             }
1896             else
1897             {
1898               gd = cfa[indx + v1] + xdiv2f(rbint[indx1] - rbint[(indx1 + v1)]);
1899             }
1900 
1901             if(fabsf(1.f - crl) < arthresh)
1902             {
1903               gl = rbint[indx1] * crl;
1904             }
1905             else
1906             {
1907               gl = cfa[indx - 1] + xdiv2f(rbint[indx1] - rbint[(indx1 - 1)]);
1908             }
1909 
1910             if(fabsf(1.f - crr) < arthresh)
1911             {
1912               gr = rbint[indx1] * crr;
1913             }
1914             else
1915             {
1916               gr = cfa[indx + 1] + xdiv2f(rbint[indx1] - rbint[(indx1 + 1)]);
1917             }
1918 
1919             // interpolated G via adaptive weights of cardinal evaluations
1920             float Gintv = (dirwts0[indx - v1] * gd + dirwts0[indx + v1] * gu)
1921                           / (dirwts0[indx + v1] + dirwts0[indx - v1]);
1922             float Ginth
1923                 = (dirwts1[indx - 1] * gr + dirwts1[indx + 1] * gl) / (dirwts1[indx - 1] + dirwts1[indx + 1]);
1924 
1925             // bound the interpolation in regions of high saturation
1926             if(Gintv < rbint[indx1])
1927             {
1928               if(2 * Gintv < rbint[indx1])
1929               {
1930                 Gintv = ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]);
1931               }
1932               else
1933               {
1934                 float vwt = 2.0 * (rbint[indx1] - Gintv) / (eps + Gintv + rbint[indx1]);
1935                 Gintv = vwt * Gintv + (1.f - vwt) * ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]);
1936               }
1937             }
1938 
1939             if(Ginth < rbint[indx1])
1940             {
1941               if(2 * Ginth < rbint[indx1])
1942               {
1943                 Ginth = ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]);
1944               }
1945               else
1946               {
1947                 float hwt = 2.0 * (rbint[indx1] - Ginth) / (eps + Ginth + rbint[indx1]);
1948                 Ginth = hwt * Ginth + (1.f - hwt) * ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]);
1949               }
1950             }
1951 
1952             if(Ginth > clip_pt)
1953             {
1954               Ginth = ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]);
1955             }
1956 
1957             if(Gintv > clip_pt)
1958             {
1959               Gintv = ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]);
1960             }
1961 
1962             rgbgreen[indx] = Ginth * (1.f - hvwt[indx1]) + Gintv * hvwt[indx1];
1963             Dgrb[0][indx >> 1] = rgbgreen[indx] - cfa[indx];
1964           }
1965 
1966 #endif
1967 
1968         // end of diagonal interpolation correction
1969 
1970         // fancy chrominance interpolation
1971         //(ey,ex) is location of R site
1972         for(int rr = 13 - ey; rr < rr1 - 12; rr += 2)
1973           for(int indx1 = (rr * ts + 13 - ex) >> 1; indx1<(rr * ts + cc1 - 12)>> 1; indx1++)
1974           {                                  // B coset
1975             Dgrb[1][indx1] = Dgrb[0][indx1]; // split out G-B from G-R
1976             Dgrb[0][indx1] = 0;
1977           }
1978 
1979 #ifdef __SSE2__
1980         vfloat oned325v = F2V(1.325f);
1981         vfloat zd175v = F2V(0.175f);
1982         vfloat zd075v = F2V(0.075f);
1983 #endif
1984 
1985         for(int rr = 14; rr < rr1 - 14; rr++)
1986 #ifdef __SSE2__
1987           for(int cc = 14 + (FC(rr, 2, filters) & 1), indx = rr * ts + cc, c = 1 - FC(rr, cc, filters) / 2;
1988               cc < cc1 - 14; cc += 8, indx += 8)
1989           {
1990             vfloat tempv = epsv + vabsf(LVFU(Dgrb[c][(indx - m1) >> 1]) - LVFU(Dgrb[c][(indx + m1) >> 1]));
1991             vfloat temp2v = epsv + vabsf(LVFU(Dgrb[c][(indx + p1) >> 1]) - LVFU(Dgrb[c][(indx - p1) >> 1]));
1992             vfloat wtnwv
1993                 = onev / (tempv + vabsf(LVFU(Dgrb[c][(indx - m1) >> 1]) - LVFU(Dgrb[c][(indx - m3) >> 1]))
1994                           + vabsf(LVFU(Dgrb[c][(indx + m1) >> 1]) - LVFU(Dgrb[c][(indx - m3) >> 1])));
1995             vfloat wtnev
1996                 = onev / (temp2v + vabsf(LVFU(Dgrb[c][(indx + p1) >> 1]) - LVFU(Dgrb[c][(indx + p3) >> 1]))
1997                           + vabsf(LVFU(Dgrb[c][(indx - p1) >> 1]) - LVFU(Dgrb[c][(indx + p3) >> 1])));
1998             vfloat wtswv
1999                 = onev / (temp2v + vabsf(LVFU(Dgrb[c][(indx - p1) >> 1]) - LVFU(Dgrb[c][(indx + m3) >> 1]))
2000                           + vabsf(LVFU(Dgrb[c][(indx + p1) >> 1]) - LVFU(Dgrb[c][(indx - p3) >> 1])));
2001             vfloat wtsev
2002                 = onev / (tempv + vabsf(LVFU(Dgrb[c][(indx + m1) >> 1]) - LVFU(Dgrb[c][(indx - p3) >> 1]))
2003                           + vabsf(LVFU(Dgrb[c][(indx - m1) >> 1]) - LVFU(Dgrb[c][(indx + m3) >> 1])));
2004 
2005             STVFU(Dgrb[c][indx >> 1], (wtnwv * (oned325v * LVFU(Dgrb[c][(indx - m1) >> 1])
2006                                                 - zd175v * LVFU(Dgrb[c][(indx - m3) >> 1])
2007                                                 - zd075v * (LVFU(Dgrb[c][(indx - m1 - 2) >> 1])
2008                                                             + LVFU(Dgrb[c][(indx - m1 - v2) >> 1])))
2009                                        + wtnev * (oned325v * LVFU(Dgrb[c][(indx + p1) >> 1])
2010                                                   - zd175v * LVFU(Dgrb[c][(indx + p3) >> 1])
2011                                                   - zd075v * (LVFU(Dgrb[c][(indx + p1 + 2) >> 1])
2012                                                               + LVFU(Dgrb[c][(indx + p1 + v2) >> 1])))
2013                                        + wtswv * (oned325v * LVFU(Dgrb[c][(indx - p1) >> 1])
2014                                                   - zd175v * LVFU(Dgrb[c][(indx - p3) >> 1])
2015                                                   - zd075v * (LVFU(Dgrb[c][(indx - p1 - 2) >> 1])
2016                                                               + LVFU(Dgrb[c][(indx - p1 - v2) >> 1])))
2017                                        + wtsev * (oned325v * LVFU(Dgrb[c][(indx + m1) >> 1])
2018                                                   - zd175v * LVFU(Dgrb[c][(indx + m3) >> 1])
2019                                                   - zd075v * (LVFU(Dgrb[c][(indx + m1 + 2) >> 1])
2020                                                               + LVFU(Dgrb[c][(indx + m1 + v2) >> 1]))))
2021                                           / (wtnwv + wtnev + wtswv + wtsev));
2022           }
2023 
2024 #else
2025 
2026           for(int cc = 14 + (FC(rr, 2, filters) & 1), indx = rr * ts + cc, c = 1 - FC(rr, cc, filters) / 2;
2027               cc < cc1 - 14; cc += 2, indx += 2)
2028           {
2029             float wtnw = 1.f / (eps + fabsf(Dgrb[c][(indx - m1) >> 1] - Dgrb[c][(indx + m1) >> 1])
2030                                 + fabsf(Dgrb[c][(indx - m1) >> 1] - Dgrb[c][(indx - m3) >> 1])
2031                                 + fabsf(Dgrb[c][(indx + m1) >> 1] - Dgrb[c][(indx - m3) >> 1]));
2032             float wtne = 1.f / (eps + fabsf(Dgrb[c][(indx + p1) >> 1] - Dgrb[c][(indx - p1) >> 1])
2033                                 + fabsf(Dgrb[c][(indx + p1) >> 1] - Dgrb[c][(indx + p3) >> 1])
2034                                 + fabsf(Dgrb[c][(indx - p1) >> 1] - Dgrb[c][(indx + p3) >> 1]));
2035             float wtsw = 1.f / (eps + fabsf(Dgrb[c][(indx - p1) >> 1] - Dgrb[c][(indx + p1) >> 1])
2036                                 + fabsf(Dgrb[c][(indx - p1) >> 1] - Dgrb[c][(indx + m3) >> 1])
2037                                 + fabsf(Dgrb[c][(indx + p1) >> 1] - Dgrb[c][(indx - p3) >> 1]));
2038             float wtse = 1.f / (eps + fabsf(Dgrb[c][(indx + m1) >> 1] - Dgrb[c][(indx - m1) >> 1])
2039                                 + fabsf(Dgrb[c][(indx + m1) >> 1] - Dgrb[c][(indx - p3) >> 1])
2040                                 + fabsf(Dgrb[c][(indx - m1) >> 1] - Dgrb[c][(indx + m3) >> 1]));
2041 
2042             Dgrb[c][indx >> 1]
2043                 = (wtnw * (1.325f * Dgrb[c][(indx - m1) >> 1] - 0.175f * Dgrb[c][(indx - m3) >> 1]
2044                            - 0.075f * Dgrb[c][(indx - m1 - 2) >> 1] - 0.075f * Dgrb[c][(indx - m1 - v2) >> 1])
2045                    + wtne * (1.325f * Dgrb[c][(indx + p1) >> 1] - 0.175f * Dgrb[c][(indx + p3) >> 1]
2046                              - 0.075f * Dgrb[c][(indx + p1 + 2) >> 1]
2047                              - 0.075f * Dgrb[c][(indx + p1 + v2) >> 1])
2048                    + wtsw * (1.325f * Dgrb[c][(indx - p1) >> 1] - 0.175f * Dgrb[c][(indx - p3) >> 1]
2049                              - 0.075f * Dgrb[c][(indx - p1 - 2) >> 1]
2050                              - 0.075f * Dgrb[c][(indx - p1 - v2) >> 1])
2051                    + wtse * (1.325f * Dgrb[c][(indx + m1) >> 1] - 0.175f * Dgrb[c][(indx + m3) >> 1]
2052                              - 0.075f * Dgrb[c][(indx + m1 + 2) >> 1]
2053                              - 0.075f * Dgrb[c][(indx + m1 + v2) >> 1]))
2054                   / (wtnw + wtne + wtsw + wtse);
2055           }
2056 
2057 #endif
2058 
2059 #ifdef __SSE2__
2060         int offset;
2061         vfloat twov = F2V(2.f);
2062         vmask selmask;
2063 
2064         if((FC(16, 2, filters) & 1) == 1)
2065         {
2066           selmask = _mm_set_epi32(0xffffffff, 0, 0xffffffff, 0);
2067           offset = 1;
2068         }
2069         else
2070         {
2071           selmask = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);
2072           offset = 0;
2073         }
2074 
2075 #endif
2076 
2077         for(int rr = 16; rr < rr1 - 16; rr++)
2078         {
2079           int row = rr + top;
2080           int col = left + 16;
2081           int indx = rr * ts + 16;
2082 #ifdef __SSE2__
2083           offset = 1 - offset;
2084           selmask = vnotm(selmask);
2085 
2086           for(; indx < rr * ts + cc1 - 18 - (cc1 & 1); indx += 4, col += 4)
2087           {
2088             if(col < roi_out->width && row < roi_out->height)
2089             {
2090               vfloat greenv = LVF(rgbgreen[indx]);
2091               vfloat temp00v = vdup(LVFU(hvwt[(indx - v1) >> 1]));
2092               vfloat temp01v = vdup(LVFU(hvwt[(indx + v1) >> 1]));
2093               vfloat tempv = onev / (temp00v + twov - vdup(LVFU(hvwt[(indx + 1 + offset) >> 1]))
2094                                      - vdup(LVFU(hvwt[(indx - 1 + offset) >> 1])) + temp01v);
2095 
2096               vfloat redv1 = greenv
2097                              - (temp00v * vdup(LVFU(Dgrb[0][(indx - v1) >> 1]))
2098                                 + (onev - vdup(LVFU(hvwt[(indx + 1 + offset) >> 1])))
2099                                       * vdup(LVFU(Dgrb[0][(indx + 1 + offset) >> 1]))
2100                                 + (onev - vdup(LVFU(hvwt[(indx - 1 + offset) >> 1])))
2101                                       * vdup(LVFU(Dgrb[0][(indx - 1 + offset) >> 1]))
2102                                 + temp01v * vdup(LVFU(Dgrb[0][(indx + v1) >> 1])))
2103                                    * tempv;
2104               vfloat bluev1 = greenv
2105                               - (temp00v * vdup(LVFU(Dgrb[1][(indx - v1) >> 1]))
2106                                  + (onev - vdup(LVFU(hvwt[(indx + 1 + offset) >> 1])))
2107                                        * vdup(LVFU(Dgrb[1][(indx + 1 + offset) >> 1]))
2108                                  + (onev - vdup(LVFU(hvwt[(indx - 1 + offset) >> 1])))
2109                                        * vdup(LVFU(Dgrb[1][(indx - 1 + offset) >> 1]))
2110                                  + temp01v * vdup(LVFU(Dgrb[1][(indx + v1) >> 1])))
2111                                     * tempv;
2112               vfloat redv2 = greenv - vdup(LVFU(Dgrb[0][indx >> 1]));
2113               vfloat bluev2 = greenv - vdup(LVFU(Dgrb[1][indx >> 1]));
2114               __attribute__((aligned(64))) float _r[4];
2115               __attribute__((aligned(64))) float _b[4];
2116               STVF(*_r, vself(selmask, redv1, redv2));
2117               STVF(*_b, vself(selmask, bluev1, bluev2));
2118               for(int c = 0; c < 4; c++)
2119               {
2120                 out[(row * roi_out->width + col + c) * 4] = clampnan(_r[c], 0.0, 1.0);
2121                 out[(row * roi_out->width + col + c) * 4 + 2] = clampnan(_b[c], 0.0, 1.0);
2122               }
2123             }
2124           }
2125 
2126           if(offset == 0)
2127           {
2128             for(; indx < rr * ts + cc1 - 16 - (cc1 & 1); indx++, col++)
2129             {
2130               if(col < roi_out->width && row < roi_out->height)
2131               {
2132                 float temp = 1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1]
2133                                     - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
2134                 out[(row * roi_out->width + col) * 4]
2135                     = clampnan(rgbgreen[indx]
2136                                    - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1]
2137                                       + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1]
2138                                       + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1]
2139                                       + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1])
2140                                          * temp,
2141                                0.0, 1.0);
2142                 out[(row * roi_out->width + col) * 4 + 2]
2143                     = clampnan(rgbgreen[indx]
2144                                    - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1]
2145                                       + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1]
2146                                       + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1]
2147                                       + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1])
2148                                          * temp,
2149                                0.0, 1.0);
2150               }
2151 
2152               indx++;
2153               col++;
2154               if(col < roi_out->width && row < roi_out->height)
2155               {
2156                 out[(row * roi_out->width + col) * 4]
2157                     = clampnan(rgbgreen[indx] - Dgrb[0][indx >> 1], 0.0, 1.0);
2158                 out[(row * roi_out->width + col) * 4 + 2]
2159                     = clampnan(rgbgreen[indx] - Dgrb[1][indx >> 1], 0.0, 1.0);
2160               }
2161             }
2162 
2163             if(cc1 & 1)
2164             { // width of tile is odd
2165               if(col < roi_out->width && row < roi_out->height)
2166               {
2167                 float temp = 1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1]
2168                                     - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
2169                 out[(row * roi_out->width + col) * 4]
2170                     = clampnan(rgbgreen[indx]
2171                                    - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1]
2172                                       + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1]
2173                                       + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1]
2174                                       + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1])
2175                                          * temp,
2176                                0.0, 1.0);
2177                 out[(row * roi_out->width + col) * 4 + 2]
2178                     = clampnan(rgbgreen[indx]
2179                                    - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1]
2180                                       + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1]
2181                                       + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1]
2182                                       + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1])
2183                                          * temp,
2184                                0.0, 1.0);
2185               }
2186             }
2187           }
2188           else
2189           {
2190             for(; indx < rr * ts + cc1 - 16 - (cc1 & 1); indx++, col++)
2191             {
2192               if(col < roi_out->width && row < roi_out->height)
2193               {
2194                 out[(row * roi_out->width + col) * 4]
2195                     = clampnan(rgbgreen[indx] - Dgrb[0][indx >> 1], 0.0, 1.0);
2196                 out[(row * roi_out->width + col) * 4 + 2]
2197                     = clampnan(rgbgreen[indx] - Dgrb[1][indx >> 1], 0.0, 1.0);
2198               }
2199 
2200               indx++;
2201               col++;
2202               if(col < roi_out->width && row < roi_out->height)
2203               {
2204                 float temp = 1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1]
2205                                     - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
2206                 out[(row * roi_out->width + col) * 4]
2207                     = clampnan(rgbgreen[indx]
2208                                    - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1]
2209                                       + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1]
2210                                       + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1]
2211                                       + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1])
2212                                          * temp,
2213                                0.0, 1.0);
2214                 out[(row * roi_out->width + col) * 4 + 2]
2215                     = clampnan(rgbgreen[indx]
2216                                    - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1]
2217                                       + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1]
2218                                       + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1]
2219                                       + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1])
2220                                          * temp,
2221                                0.0, 1.0);
2222               }
2223             }
2224 
2225             if(cc1 & 1)
2226             { // width of tile is odd
2227               if(col < roi_out->width && row < roi_out->height)
2228               {
2229                 out[(row * roi_out->width + col) * 4]
2230                     = clampnan(rgbgreen[indx] - Dgrb[0][indx >> 1], 0.0, 1.0);
2231                 out[(row * roi_out->width + col) * 4 + 2]
2232                     = clampnan(rgbgreen[indx] - Dgrb[1][indx >> 1], 0.0, 1.0);
2233               }
2234             }
2235           }
2236 
2237 #else
2238 
2239           if((FC(rr, 2, filters) & 1) == 1)
2240           {
2241             for(; indx < rr * ts + cc1 - 16 - (cc1 & 1); indx++, col++)
2242             {
2243               if(col < roi_out->width && row < roi_out->height)
2244               {
2245                 float temp = 1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1]
2246                                     - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
2247                 out[(row * roi_out->width + col) * 4]
2248                     = clampnan(rgbgreen[indx]
2249                                    - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1]
2250                                       + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1]
2251                                       + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1]
2252                                       + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1])
2253                                          * temp,
2254                                0.0, 1.0);
2255                 out[(row * roi_out->width + col) * 4 + 2]
2256                     = clampnan(rgbgreen[indx]
2257                                    - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1]
2258                                       + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1]
2259                                       + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1]
2260                                       + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1])
2261                                          * temp,
2262                                0.0, 1.0);
2263               }
2264 
2265               indx++;
2266               col++;
2267               if(col < roi_out->width && row < roi_out->height)
2268               {
2269                 out[(row * roi_out->width + col) * 4]
2270                     = clampnan(rgbgreen[indx] - Dgrb[0][indx >> 1], 0.0, 1.0);
2271                 out[(row * roi_out->width + col) * 4 + 2]
2272                     = clampnan(rgbgreen[indx] - Dgrb[1][indx >> 1], 0.0, 1.0);
2273               }
2274             }
2275 
2276             if(cc1 & 1)
2277             { // width of tile is odd
2278               if(col < roi_out->width && row < roi_out->height)
2279               {
2280                 float temp = 1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1]
2281                                     - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
2282                 out[(row * roi_out->width + col) * 4]
2283                     = clampnan(rgbgreen[indx]
2284                                    - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1]
2285                                       + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1]
2286                                       + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1]
2287                                       + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1])
2288                                          * temp,
2289                                0.0, 1.0);
2290                 out[(row * roi_out->width + col) * 4 + 2]
2291                     = clampnan(rgbgreen[indx]
2292                                    - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1]
2293                                       + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1]
2294                                       + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1]
2295                                       + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1])
2296                                          * temp,
2297                                0.0, 1.0);
2298               }
2299             }
2300           }
2301           else
2302           {
2303             for(; indx < rr * ts + cc1 - 16 - (cc1 & 1); indx++, col++)
2304             {
2305               if(col < roi_out->width && row < roi_out->height)
2306               {
2307                 out[(row * roi_out->width + col) * 4]
2308                     = clampnan(rgbgreen[indx] - Dgrb[0][indx >> 1], 0.0f, 1.0f);
2309                 out[(row * roi_out->width + col) * 4 + 2]
2310                     = clampnan(rgbgreen[indx] - Dgrb[1][indx >> 1], 0.0f, 1.0f);
2311               }
2312 
2313               indx++;
2314               col++;
2315               if(col < roi_out->width && row < roi_out->height)
2316               {
2317                 float temp = 1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1]
2318                                     - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
2319 
2320                 out[(row * roi_out->width + col) * 4]
2321                     = clampnan(rgbgreen[indx]
2322                                    - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1]
2323                                       + (1.0f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1]
2324                                       + (1.0f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1]
2325                                       + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1])
2326                                          * temp,
2327                                0.0f, 1.0f);
2328 
2329                 out[(row * roi_out->width + col) * 4 + 2]
2330                     = clampnan(rgbgreen[indx]
2331                                    - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1]
2332                                       + (1.0f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1]
2333                                       + (1.0f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1]
2334                                       + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1])
2335                                          * temp,
2336                                0.0f, 1.0f);
2337               }
2338             }
2339 
2340             if(cc1 & 1)
2341             { // width of tile is odd
2342               if(col < roi_out->width && row < roi_out->height)
2343               {
2344                 out[(row * roi_out->width + col) * 4]
2345                     = clampnan(rgbgreen[indx] - Dgrb[0][indx >> 1], 0.0f, 1.0f);
2346                 out[(row * roi_out->width + col) * 4 + 2]
2347                     = clampnan(rgbgreen[indx] - Dgrb[1][indx >> 1], 0.0f, 1.0f);
2348               }
2349             }
2350           }
2351 
2352 #endif
2353         }
2354 
2355         // copy smoothed results back to image matrix
2356         for(int rr = 16; rr < rr1 - 16; rr++)
2357         {
2358           int row = rr + top;
2359           int cc = 16;
2360           // TODO (darktable): we have the pixel colors interleaved so writing them in blocks using SSE2 is
2361           // not possible. or is it?
2362           // #ifdef __SSE2__
2363           //
2364           //           for(; cc < cc1 - 19; cc += 4)
2365           //           {
2366           //             STVFU(out[(row * roi_out->width + (cc + left))  * 4 + 1], LVF(rgbgreen[rr * ts +
2367           //             cc]));
2368           //           }
2369           //
2370           // #endif
2371 
2372           for(; cc < cc1 - 16; cc++)
2373           {
2374             int col = cc + left;
2375             int indx = rr * ts + cc;
2376             if(col < roi_out->width && row < roi_out->height)
2377               out[(row * roi_out->width + col) * 4 + 1] = clampnan(rgbgreen[indx], 0.0f, 1.0f);
2378           }
2379         }
2380       }
2381     } // end of main loop
2382 
2383     // clean up
2384     free(buffer);
2385   }
2386 }
2387 
2388 /*==================================================================================
2389  * end of raw therapee code
2390  *==================================================================================*/
2391 
2392 // modelines: These editor modelines have been set for all relevant files by tools/update_modelines.sh
2393 // vim: shiftwidth=2 expandtab tabstop=2 cindent
2394 // kate: tab-indents: off; indent-width 2; replace-tabs on; indent-mode cstyle; remove-trailing-spaces modified;
2395