1 /*
2 This file is part of darktable,
3 Copyright (C) 2011-2020 darktable developers.
4
5 darktable is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3 of the License, or
8 (at your option) any later version.
9
10 darktable is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with darktable. If not, see <http://www.gnu.org/licenses/>.
17 */
18
19 #define __STDC_FORMAT_MACROS
20
21 #if defined(__SSE__)
22 #include <xmmintrin.h>
23 #endif
24
25 extern "C" {
26 #include "develop/imageop.h"
27 #include "develop/imageop_math.h"
28
29 // otherwise the name will be mangled and the linker won't be able to see the function ...
30 void amaze_demosaic_RT(
31 dt_dev_pixelpipe_iop_t *piece,
32 const float *const in,
33 float *out,
34 const dt_iop_roi_t *const roi_in,
35 const dt_iop_roi_t *const roi_out,
36 const int filters);
37 }
38
39 #include <algorithm>
40 #include <cmath>
41 #include <cstdint>
42 #include <cstdlib>
43 #include <cstring>
44
/*
 * Sanitize x against the range [m, M]:
 *  - NaN is replaced by the midpoint of the range,
 *  - infinities are clamped to the nearest bound,
 *  - any other value (even a finite one outside [m, M]) passes through unchanged.
 */
static __inline float clampnan(const float x, const float m, const float M)
{
  if(std::isnan(x)) return (m + M) / 2.0f;

  if(std::isinf(x)) return std::isless(x, m) ? m : (std::isgreater(x, M) ? M : x);

  return x; // normal number
}
60
61 #ifndef __SSE2__
/* Fast multiply by two: bump the IEEE-754 binary32 exponent by one.
   +/-0 is returned unchanged (its exponent field must not be touched).
   NOTE(review): subnormals and the maximum exponent are not handled
   specially — matches the original's behavior. */
static __inline float xmul2f(float d)
{
  uint32_t bits;
  memcpy(&bits, &d, sizeof(bits));
  if(bits & 0x7FFFFFFF) // anything but +/-0
  {
    bits += 1 << 23; // one more power of two
    memcpy(&d, &bits, sizeof(d));
  }
  return d;
}
75 #endif
76
/* Fast divide by two: decrement the IEEE-754 binary32 exponent by one.
   +/-0 is returned unchanged (its exponent field must not be touched). */
static __inline float xdiv2f(float d)
{
  uint32_t bits;
  memcpy(&bits, &d, sizeof(bits));
  if(bits & 0x7FFFFFFF) // anything but +/-0
  {
    bits -= 1 << 23; // one less power of two
    memcpy(&d, &bits, sizeof(d));
  }
  return d;
}
90
/* Fast divide by 2^n: subtract n from the IEEE-754 binary32 exponent.
   (The original comment claimed "add n" — the code subtracts.)
   +/-0 is returned unchanged; exponent underflow is not guarded,
   matching the original's behavior. */
static __inline float xdivf(float d, int n)
{
  uint32_t bits;
  memcpy(&bits, &d, sizeof(bits));
  if(bits & 0x7FFFFFFF) // anything but +/-0
  {
    bits -= n << 23; // n fewer powers of two
    memcpy(&d, &bits, sizeof(d));
  }
  return d;
}
104
105
106 /*==================================================================================
107 * begin raw therapee code, hg checkout of march 03, 2016 branch master.
108 *==================================================================================*/
109
110
111 #ifdef __SSE2__
112
113 #ifdef __GNUC__
114 #define INLINE __inline
115 #else
116 #define INLINE inline
117 #endif
118
119 #ifdef __GNUC__
120 #if((__GNUC__ == 4 && __GNUC_MINOR__ >= 9) || __GNUC__ > 4) && (!defined(WIN32) || defined(__x86_64__))
121 #define LVF(x) _mm_load_ps(&x)
122 #define LVFU(x) _mm_loadu_ps(&x)
123 #define STVF(x, y) _mm_store_ps(&x, y)
124 #define STVFU(x, y) _mm_storeu_ps(&x, y)
125 #else // there is a bug in gcc 4.7.x when using openmp and aligned memory and -O3, also need to map the
126 // aligned functions to unaligned functions for WIN32 builds
127 #define LVF(x) _mm_loadu_ps(&x)
128 #define LVFU(x) _mm_loadu_ps(&x)
129 #define STVF(x, y) _mm_storeu_ps(&x, y)
130 #define STVFU(x, y) _mm_storeu_ps(&x, y)
131 #endif
132 #else
133 #define LVF(x) _mm_load_ps(&x)
134 #define LVFU(x) _mm_loadu_ps(&x)
135 #define STVF(x, y) _mm_store_ps(&x, y)
136 #define STVFU(x, y) _mm_storeu_ps(&x, y)
137 #endif
138
139 #define STC2VFU(a, v) \
140 { \
141 __m128 TST1V = _mm_loadu_ps(&a); \
142 __m128 TST2V = _mm_unpacklo_ps(v, v); \
143 vmask cmask = _mm_set_epi32(0xffffffff, 0, 0xffffffff, 0); \
144 _mm_storeu_ps(&a, vself(cmask, TST1V, TST2V)); \
145 TST1V = _mm_loadu_ps((&a) + 4); \
146 TST2V = _mm_unpackhi_ps(v, v); \
147 _mm_storeu_ps((&a) + 4, vself(cmask, TST1V, TST2V)); \
148 }
149
150 #define ZEROV _mm_setzero_ps()
151 #define F2V(a) _mm_set1_ps((a))
152
153 typedef __m128i vmask;
154 typedef __m128 vfloat;
155 typedef __m128i vint;
156
// Load 8 consecutive floats starting at a and pack the even-indexed ones
// (a[0], a[2], a[4], a[6]) into a single vector of 4 floats.
static INLINE vfloat LC2VFU(float &a)
{
  vfloat a1 = _mm_loadu_ps(&a);
  vfloat a2 = _mm_loadu_ps((&a) + 4);
  return _mm_shuffle_ps(a1, a2, _MM_SHUFFLE(2, 0, 2, 0));
}
// elementwise maximum of two 4-float vectors
static INLINE vfloat vmaxf(vfloat x, vfloat y)
{
  return _mm_max_ps(x, y);
}
// elementwise minimum of two 4-float vectors
static INLINE vfloat vminf(vfloat x, vfloat y)
{
  return _mm_min_ps(x, y);
}
vcast_vf_f(float f)172 static INLINE vfloat vcast_vf_f(float f)
173 {
174 return _mm_set_ps(f, f, f, f);
175 }
// bitwise OR of two lane masks
static INLINE vmask vorm(vmask x, vmask y)
{
  return _mm_or_si128(x, y);
}
// bitwise AND of two lane masks
static INLINE vmask vandm(vmask x, vmask y)
{
  return _mm_and_si128(x, y);
}
// bitwise AND-NOT: returns (~x) & y — note it is the FIRST operand that
// gets complemented, mirroring _mm_andnot_si128's semantics
static INLINE vmask vandnotm(vmask x, vmask y)
{
  return _mm_andnot_si128(x, y);
}
// elementwise absolute value: clear the sign bit of every lane by
// AND-NOT-ing with the sign-bit-only pattern of -0.0f
static INLINE vfloat vabsf(vfloat f)
{
  return (vfloat)vandnotm((vmask)vcast_vf_f(-0.0f), (vmask)f);
}
// per-lane select (blend): where mask bits are set take x, otherwise take y
static INLINE vfloat vself(vmask mask, vfloat x, vfloat y)
{
  return (vfloat)vorm(vandm(mask, (vmask)x), vandnotm(mask, (vmask)y));
}
// per-lane comparison mask: all-ones where x < y, zero elsewhere
static INLINE vmask vmaskf_lt(vfloat x, vfloat y)
{
  return (__m128i)_mm_cmplt_ps(x, y);
}
// per-lane comparison mask: all-ones where x > y, zero elsewhere
static INLINE vmask vmaskf_gt(vfloat x, vfloat y)
{
  return (__m128i)_mm_cmpgt_ps(x, y);
}
// per-lane median of three vectors
static INLINE vfloat ULIMV(vfloat a, vfloat b, vfloat c)
{
  // made to clamp a in range [b,c] but in fact it's also the median of a,b,c, which means that the result is
  // independent on order of arguments
  // ULIMV(a,b,c) = ULIMV(a,c,b) = ULIMV(b,a,c) = ULIMV(b,c,a) = ULIMV(c,a,b) = ULIMV(c,b,a)
  return vmaxf(vminf(a, b), vminf(vmaxf(a, b), c));
}
// elementwise square
static INLINE vfloat SQRV(vfloat a)
{
  return a * a;
}
// per-lane linear interpolation (lerp) with a as the blend weight
static INLINE vfloat vintpf(vfloat a, vfloat b, vfloat c)
{
  // calculate a * b + (1 - a) * c (interpolate two values)
  // following is valid:
  // vintpf(a, b+x, c+x) = vintpf(a, b, c) + x
  // vintpf(a, b*x, c*x) = vintpf(a, b, c) * x
  return a * (b - c) + c;
}
// pairwise horizontal add over 8 consecutive floats
static INLINE vfloat vaddc2vfu(float &a)
{
  // loads a[0]..a[7] and returns { a[0]+a[1], a[2]+a[3], a[4]+a[5], a[6]+a[7] }
  vfloat a1 = _mm_loadu_ps(&a);
  vfloat a2 = _mm_loadu_ps((&a) + 4);
  return _mm_shuffle_ps(a1, a2, _MM_SHUFFLE(2, 0, 2, 0)) + _mm_shuffle_ps(a1, a2, _MM_SHUFFLE(3, 1, 3, 1));
}
// elementwise a / (a + b)
static INLINE vfloat vadivapb(vfloat a, vfloat b)
{
  return a / (a + b);
}
// integer counterpart of vself: where mask bits are set take x, else take y
static INLINE vint vselc(vmask mask, vint x, vint y)
{
  return vorm(vandm(mask, (vmask)x), vandnotm(mask, (vmask)y));
}
static INLINE vint vselinotzero(vmask mask, vint x)
{
  // returns value of x if corresponding mask bits are 0, else returns 0
  // faster than vselc(mask, ZEROV, x)
  return _mm_andnot_si128(mask, x);
}
// elementwise multiply by two
static INLINE vfloat vmul2f(vfloat a)
{
  // fastest way to multiply by 2
  return a + a;
}
// per-lane comparison mask: all-ones where x >= y, zero elsewhere
static INLINE vmask vmaskf_ge(vfloat x, vfloat y)
{
  return (__m128i)_mm_cmpge_ps(x, y);
}
// bitwise NOT: XOR with an all-ones vector (cmpeq of two equal values)
static INLINE vmask vnotm(vmask x)
{
  return _mm_xor_si128(x, _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()));
}
// duplicate the two low lanes
static INLINE vfloat vdup(vfloat a)
{
  // returns { a[0],a[0],a[1],a[1] }
  return _mm_unpacklo_ps(a, a);
}
262
263 #endif // __SSE2__
264
// Square of x. A plain multiply is faster than std::pow(x, 2).
// Dropped the const qualifier on the by-value return (meaningless; triggers
// -Wignored-qualifiers) and renamed the reserved identifier _Tp to T.
template <typename T> static inline T SQR(T x)
{
  return (x * x);
}
270
// Linear interpolation: a * b + (1 - a) * c, with a as the blend weight.
// Useful identities:
//   intp(a, b+x, c+x) = intp(a, b, c) + x
//   intp(a, b*x, c*x) = intp(a, b, c) * x
// Dropped the const qualifier on the by-value return (meaningless) and
// renamed the reserved identifier _Tp to T.
template <typename T> static inline T intp(const T a, const T b, const T c)
{
  return a * (b - c) + c;
}
279
// Clamp a to the range [b, c] (assumes b <= c).
// Dropped the const qualifier on the by-value return (meaningless) and
// renamed the reserved identifier _Tp to T.
template <typename T> static inline T LIM(const T a, const T b, const T c)
{
  return std::max(b, std::min(a, c));
}
284
// Clamp a between b and c without requiring b <= c; equivalently the
// median of the three arguments.
// Dropped the const qualifier on the by-value return (meaningless) and
// renamed the reserved identifier _Tp to T.
template <typename T> static inline T ULIM(const T a, const T b, const T c)
{
  return ((b < c) ? LIM(a, b, c) : LIM(a, c, b));
}
289
290
291
292 ////////////////////////////////////////////////////////////////
293 //
294 // AMaZE demosaic algorithm
295 // (Aliasing Minimization and Zipper Elimination)
296 //
297 // copyright (c) 2008-2010 Emil Martinec <ejmartin@uchicago.edu>
298 // optimized for speed by Ingo Weyrich
299 //
300 // incorporating ideas of Luis Sanz Rodrigues and Paul Lee
301 //
302 // code dated: May 27, 2010
303 // latest modification: Ingo Weyrich, January 25, 2016
304 //
305 // amaze_interpolate_RT.cc is free software: you can redistribute it and/or modify
306 // it under the terms of the GNU General Public License as published by
307 // the Free Software Foundation, either version 3 of the License, or
308 // (at your option) any later version.
309 //
310 // This program is distributed in the hope that it will be useful,
311 // but WITHOUT ANY WARRANTY; without even the implied warranty of
312 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
313 // GNU General Public License for more details.
314 //
315 // You should have received a copy of the GNU General Public License
316 // along with this program. If not, see <http://www.gnu.org/licenses/>.
317 //
318 ////////////////////////////////////////////////////////////////
319
320
amaze_demosaic_RT(dt_dev_pixelpipe_iop_t * piece,const float * const in,float * out,const dt_iop_roi_t * const roi_in,const dt_iop_roi_t * const roi_out,const int filters)321 void amaze_demosaic_RT(dt_dev_pixelpipe_iop_t *piece, const float *const in,
322 float *out, const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out,
323 const int filters)
324 {
325 int winx = roi_out->x;
326 int winy = roi_out->y;
327 int winw = roi_in->width;
328 int winh = roi_in->height;
329
330 const int width = winw, height = winh;
331 const float clip_pt = fminf(piece->pipe->dsc.processed_maximum[0],
332 fminf(piece->pipe->dsc.processed_maximum[1], piece->pipe->dsc.processed_maximum[2]));
333 const float clip_pt8 = 0.8f * clip_pt;
334
335 // this allows to pass AMAZETS to the code. On some machines larger AMAZETS is faster
336 // If AMAZETS is undefined it will be set to 160, which is the fastest on modern x86/64 machines
337 #ifndef AMAZETS
338 #define AMAZETS 160
339 #endif
340 // Tile size; the image is processed in square tiles to lower memory requirements and facilitate
341 // multi-threading
342 // We assure that Tile size is a multiple of 32 in the range [96;992]
343 constexpr int ts = (AMAZETS & 992) < 96 ? 96 : (AMAZETS & 992);
344 constexpr int tsh = ts / 2; // half of Tile size
345
346 // offset of R pixel within a Bayer quartet
347 int ex, ey;
348
349 // determine GRBG coset; (ey,ex) is the offset of the R subarray
350 if(FC(0, 0, filters) == 1)
351 { // first pixel is G
352 if(FC(0, 1, filters) == 0)
353 {
354 ey = 0;
355 ex = 1;
356 }
357 else
358 {
359 ey = 1;
360 ex = 0;
361 }
362 }
363 else
364 { // first pixel is R or B
365 if(FC(0, 0, filters) == 0)
366 {
367 ey = 0;
368 ex = 0;
369 }
370 else
371 {
372 ey = 1;
373 ex = 1;
374 }
375 }
376
377 // shifts of pointer value to access pixels in vertical and diagonal directions
378 constexpr int v1 = ts, v2 = 2 * ts, v3 = 3 * ts, p1 = -ts + 1, p2 = -2 * ts + 2, p3 = -3 * ts + 3,
379 m1 = ts + 1, m2 = 2 * ts + 2, m3 = 3 * ts + 3;
380
381 // tolerance to avoid dividing by zero
382 constexpr float eps = 1e-5, epssq = 1e-10; // tolerance to avoid dividing by zero
383
384 // adaptive ratios threshold
385 constexpr float arthresh = 0.75;
386
387 // gaussian on 5x5 quincunx, sigma=1.2
388 constexpr float gaussodd[4]
389 = { 0.14659727707323927f, 0.103592713382435f, 0.0732036125103057f, 0.0365543548389495f };
390 // nyquist texture test threshold
391 constexpr float nyqthresh = 0.5;
392 // gaussian on 5x5, sigma=1.2, multiplied with nyqthresh to save some time later in loop
393 // Is this really sigma=1.2????, seems more like sigma = 1.672
394 constexpr float gaussgrad[6] = { nyqthresh * 0.07384411893421103f, nyqthresh * 0.06207511968171489f,
395 nyqthresh * 0.0521818194747806f, nyqthresh * 0.03687419286733595f,
396 nyqthresh * 0.03099732204057846f, nyqthresh * 0.018413194161458882f };
397 // gaussian on 5x5 alt quincunx, sigma=1.5
398 constexpr float gausseven[2] = { 0.13719494435797422f, 0.05640252782101291f };
399 // gaussian on quincunx grid
400 constexpr float gquinc[4] = { 0.169917f, 0.108947f, 0.069855f, 0.0287182f };
401
402 typedef struct
403 {
404 float h;
405 float v;
406 } s_hv;
407
408 #ifdef _OPENMP
409 #pragma omp parallel
410 #endif
411 {
412 // int progresscounter = 0;
413
414 constexpr int cldf = 2; // factor to multiply cache line distance. 1 = 64 bytes, 2 = 128 bytes ...
415 // assign working space
416 char *buffer
417 = (char *)calloc(sizeof(float) * 14 * ts * ts + sizeof(char) * ts * tsh + 18 * cldf * 64 + 63, 1);
418 // aligned to 64 byte boundary
419 char *data = (char *)((uintptr_t(buffer) + uintptr_t(63)) / 64 * 64);
420
421 // green values
422 float *rgbgreen = (float(*))data;
423 // sum of square of horizontal gradient and square of vertical gradient
424 float *delhvsqsum = (float(*))((char *)rgbgreen + sizeof(float) * ts * ts + cldf * 64); // 1
425 // gradient based directional weights for interpolation
426 float *dirwts0 = (float(*))((char *)delhvsqsum + sizeof(float) * ts * ts + cldf * 64); // 1
427 float *dirwts1 = (float(*))((char *)dirwts0 + sizeof(float) * ts * ts + cldf * 64); // 1
428 // vertically interpolated colour differences G-R, G-B
429 float *vcd = (float(*))((char *)dirwts1 + sizeof(float) * ts * ts + cldf * 64); // 1
430 // horizontally interpolated colour differences
431 float *hcd = (float(*))((char *)vcd + sizeof(float) * ts * ts + cldf * 64); // 1
432 // alternative vertical interpolation
433 float *vcdalt = (float(*))((char *)hcd + sizeof(float) * ts * ts + cldf * 64); // 1
434 // alternative horizontal interpolation
435 float *hcdalt = (float(*))((char *)vcdalt + sizeof(float) * ts * ts + cldf * 64); // 1
436 // square of average colour difference
437 float *cddiffsq = (float(*))((char *)hcdalt + sizeof(float) * ts * ts + cldf * 64); // 1
438 // weight to give horizontal vs vertical interpolation
439 float *hvwt = (float(*))((char *)cddiffsq + sizeof(float) * ts * ts + 2 * cldf * 64); // 1
440 // final interpolated colour difference
441 float(*Dgrb)[ts * tsh] = (float(*)[ts * tsh])vcdalt; // there is no overlap in buffer usage => share
442 // gradient in plus (NE/SW) direction
443 float *delp = (float(*))cddiffsq; // there is no overlap in buffer usage => share
444 // gradient in minus (NW/SE) direction
445 float *delm = (float(*))((char *)delp + sizeof(float) * ts * tsh + cldf * 64);
446 // diagonal interpolation of R+B
447 float *rbint = (float(*))delm; // there is no overlap in buffer usage => share
448 // horizontal and vertical curvature of interpolated G (used to refine interpolation in Nyquist texture
449 // regions)
450 s_hv *Dgrb2 = (s_hv(*))((char *)hvwt + sizeof(float) * ts * tsh + cldf * 64); // 1
451 // difference between up/down interpolations of G
452 float *dgintv = (float(*))Dgrb2; // there is no overlap in buffer usage => share
453 // difference between left/right interpolations of G
454 float *dginth = (float(*))((char *)dgintv + sizeof(float) * ts * ts + cldf * 64); // 1
455 // square of diagonal colour differences
456 float *Dgrbsq1m = (float(*))((char *)dginth + sizeof(float) * ts * ts + cldf * 64); // 1
457 float *Dgrbsq1p = (float(*))((char *)Dgrbsq1m + sizeof(float) * ts * tsh + cldf * 64); // 1
458 // tile raw data
459 float *cfa = (float(*))((char *)Dgrbsq1p + sizeof(float) * ts * tsh + cldf * 64); // 1
460 // relative weight for combining plus and minus diagonal interpolations
461 float *pmwt = (float(*))delhvsqsum; // there is no overlap in buffer usage => share
462 // interpolated colour difference R-B in minus and plus direction
463 float *rbm = (float(*))vcd; // there is no overlap in buffer usage => share
464 float *rbp = (float(*))((char *)rbm + sizeof(float) * ts * tsh + cldf * 64);
465 // nyquist texture flags 1=nyquist, 0=not nyquist
466 unsigned char *nyquist = (unsigned char(*))((char *)cfa + sizeof(float) * ts * ts + cldf * 64); // 1
467 unsigned char *nyquist2 = (unsigned char(*))cddiffsq;
468 float *nyqutest = (float(*))((char *)nyquist + sizeof(unsigned char) * ts * tsh + cldf * 64); // 1
469
470 // Main algorithm: Tile loop
471 // use collapse(2) to collapse the 2 loops to one large loop, so there is better scaling
472 #ifdef _OPENMP
473 #pragma omp for SIMD() schedule(static) collapse(2) nowait
474 #endif
475
476 for(int top = winy - 16; top < winy + height; top += ts - 32)
477 {
478 for(int left = winx - 16; left < winx + width; left += ts - 32)
479 {
480 memset(&nyquist[3 * tsh], 0, sizeof(unsigned char) * (ts - 6) * tsh);
481 // location of tile bottom edge
482 int bottom = MIN(top + ts, winy + height + 16);
483 // location of tile right edge
484 int right = MIN(left + ts, winx + width + 16);
485 // tile width (=ts except for right edge of image)
486 int rr1 = bottom - top;
487 // tile height (=ts except for bottom edge of image)
488 int cc1 = right - left;
489 // bookkeeping for borders
490 // min and max row/column in the tile
491 int rrmin = top < winy ? 16 : 0;
492 int ccmin = left < winx ? 16 : 0;
493 int rrmax = bottom > (winy + height) ? winy + height - top : rr1;
494 int ccmax = right > (winx + width) ? winx + width - left : cc1;
495
496 // rgb from input CFA data
497 // rgb values should be floating point number between 0 and 1
498 // after white balance multipliers are applied
499 // a 16 pixel border is added to each side of the image
500
501 // begin of tile initialization
502 #ifdef __SSE2__
503 // fill upper border
504 if(rrmin > 0)
505 {
506 for(int rr = 0; rr < 16; rr++)
507 {
508 int row = 32 - rr + top;
509
510 for(int cc = ccmin; cc < ccmax; cc += 4)
511 {
512 int indx1 = rr * ts + cc;
513 vfloat tempv = LVFU(in[row * width + (cc + left)]);
514 STVF(cfa[indx1], tempv);
515 STVF(rgbgreen[indx1], tempv);
516 }
517 }
518 }
519
520 // fill inner part
521 for(int rr = rrmin; rr < rrmax; rr++)
522 {
523 int row = rr + top;
524
525 for(int cc = ccmin; cc < ccmax; cc += 4)
526 {
527 int indx1 = rr * ts + cc;
528 vfloat tempv = LVFU(in[row * width + (cc + left)]);
529 STVF(cfa[indx1], tempv);
530 STVF(rgbgreen[indx1], tempv);
531 }
532 }
533
534 // fill lower border
535 if(rrmax < rr1)
536 {
537 for(int rr = 0; rr < 16; rr++)
538 for(int cc = ccmin; cc < ccmax; cc += 4)
539 {
540 int indx1 = (rrmax + rr) * ts + cc;
541 vfloat tempv = LVFU(in[(winy + height - rr - 2) * width + (left + cc)]);
542 STVF(cfa[indx1], tempv);
543 STVF(rgbgreen[indx1], tempv);
544 }
545 }
546
547 #else
548
549 // fill upper border
550 if(rrmin > 0)
551 {
552 for(int rr = 0; rr < 16; rr++)
553 for(int cc = ccmin, row = 32 - rr + top; cc < ccmax; cc++)
554 {
555 cfa[rr * ts + cc] = (in[row * width + (cc + left)]);
556 rgbgreen[rr * ts + cc] = cfa[rr * ts + cc];
557 }
558 }
559
560 // fill inner part
561 for(int rr = rrmin; rr < rrmax; rr++)
562 {
563 int row = rr + top;
564
565 for(int cc = ccmin; cc < ccmax; cc++)
566 {
567 int indx1 = rr * ts + cc;
568 cfa[indx1] = (in[row * width + (cc + left)]);
569 rgbgreen[indx1] = cfa[indx1];
570 }
571 }
572
573 // fill lower border
574 if(rrmax < rr1)
575 {
576 for(int rr = 0; rr < 16; rr++)
577 for(int cc = ccmin; cc < ccmax; cc++)
578 {
579 cfa[(rrmax + rr) * ts + cc] = (in[(winy + height - rr - 2) * width + (left + cc)]);
580 rgbgreen[(rrmax + rr) * ts + cc] = cfa[(rrmax + rr) * ts + cc];
581 }
582 }
583
584 #endif
585
586 // fill left border
587 if(ccmin > 0)
588 {
589 for(int rr = rrmin; rr < rrmax; rr++)
590 for(int cc = 0, row = rr + top; cc < 16; cc++)
591 {
592 cfa[rr * ts + cc] = (in[row * width + (32 - cc + left)]);
593 rgbgreen[rr * ts + cc] = cfa[rr * ts + cc];
594 }
595 }
596
597 // fill right border
598 if(ccmax < cc1)
599 {
600 for(int rr = rrmin; rr < rrmax; rr++)
601 for(int cc = 0; cc < 16; cc++)
602 {
603 cfa[rr * ts + ccmax + cc] = (in[(top + rr) * width + ((winx + width - cc - 2))]);
604 rgbgreen[rr * ts + ccmax + cc] = cfa[rr * ts + ccmax + cc];
605 }
606 }
607
608 // also, fill the image corners
609 if(rrmin > 0 && ccmin > 0)
610 {
611 for(int rr = 0; rr < 16; rr++)
612 for(int cc = 0; cc < 16; cc++)
613 {
614 cfa[(rr)*ts + cc] = (in[(winy + 32 - rr) * width + (winx + 32 - cc)]);
615 rgbgreen[(rr)*ts + cc] = cfa[(rr)*ts + cc];
616 }
617 }
618
619 if(rrmax < rr1 && ccmax < cc1)
620 {
621 for(int rr = 0; rr < 16; rr++)
622 for(int cc = 0; cc < 16; cc++)
623 {
624 cfa[(rrmax + rr) * ts + ccmax + cc]
625 = (in[(winy + height - rr - 2) * width + ((winx + width - cc - 2))]);
626 rgbgreen[(rrmax + rr) * ts + ccmax + cc] = cfa[(rrmax + rr) * ts + ccmax + cc];
627 }
628 }
629
630 if(rrmin > 0 && ccmax < cc1)
631 {
632 for(int rr = 0; rr < 16; rr++)
633 for(int cc = 0; cc < 16; cc++)
634 {
635 cfa[(rr)*ts + ccmax + cc] = (in[(winy + 32 - rr) * width + ((winx + width - cc - 2))]);
636 rgbgreen[(rr)*ts + ccmax + cc] = cfa[(rr)*ts + ccmax + cc];
637 }
638 }
639
640 if(rrmax < rr1 && ccmin > 0)
641 {
642 for(int rr = 0; rr < 16; rr++)
643 for(int cc = 0; cc < 16; cc++)
644 {
645 cfa[(rrmax + rr) * ts + cc] = (in[(winy + height - rr - 2) * width + ((winx + 32 - cc))]);
646 rgbgreen[(rrmax + rr) * ts + cc] = cfa[(rrmax + rr) * ts + cc];
647 }
648 }
649
650 // end of tile initialization
651
652 // horizontal and vertical gradients
653 #ifdef __SSE2__
654 vfloat epsv = F2V(eps);
655
656 for(int rr = 2; rr < rr1 - 2; rr++)
657 {
658 for(int indx = rr * ts; indx < rr * ts + cc1; indx += 4)
659 {
660 vfloat delhv = vabsf(LVFU(cfa[indx + 1]) - LVFU(cfa[indx - 1]));
661 vfloat delvv = vabsf(LVF(cfa[indx + v1]) - LVF(cfa[indx - v1]));
662 STVF(dirwts1[indx], epsv + vabsf(LVFU(cfa[indx + 2]) - LVF(cfa[indx]))
663 + vabsf(LVF(cfa[indx]) - LVFU(cfa[indx - 2])) + delhv);
664 STVF(dirwts0[indx], epsv + vabsf(LVF(cfa[indx + v2]) - LVF(cfa[indx]))
665 + vabsf(LVF(cfa[indx]) - LVF(cfa[indx - v2])) + delvv);
666 STVF(delhvsqsum[indx], SQRV(delhv) + SQRV(delvv));
667 }
668 }
669
670 #else
671
672 for(int rr = 2; rr < rr1 - 2; rr++)
673 for(int cc = 2, indx = (rr)*ts + cc; cc < cc1 - 2; cc++, indx++)
674 {
675 float delh = fabsf(cfa[indx + 1] - cfa[indx - 1]);
676 float delv = fabsf(cfa[indx + v1] - cfa[indx - v1]);
677 dirwts0[indx]
678 = eps + fabsf(cfa[indx + v2] - cfa[indx]) + fabsf(cfa[indx] - cfa[indx - v2]) + delv;
679 dirwts1[indx] = eps + fabsf(cfa[indx + 2] - cfa[indx]) + fabsf(cfa[indx] - cfa[indx - 2]) + delh;
680 delhvsqsum[indx] = SQR(delh) + SQR(delv);
681 }
682
683 #endif
684
685 // interpolate vertical and horizontal colour differences
686 #ifdef __SSE2__
687 vfloat sgnv;
688
689 if(!(FC(4, 4, filters) & 1))
690 {
691 sgnv = _mm_set_ps(1.f, -1.f, 1.f, -1.f);
692 }
693 else
694 {
695 sgnv = _mm_set_ps(-1.f, 1.f, -1.f, 1.f);
696 }
697
698 vfloat zd5v = F2V(0.5f);
699 vfloat onev = F2V(1.f);
700 vfloat arthreshv = F2V(arthresh);
701 vfloat clip_pt8v = F2V(clip_pt8);
702
703 for(int rr = 4; rr < rr1 - 4; rr++)
704 {
705 sgnv = -sgnv;
706
707 for(int indx = rr * ts + 4; indx < rr * ts + cc1 - 7; indx += 4)
708 {
709 // colour ratios in each cardinal direction
710 vfloat cfav = LVF(cfa[indx]);
711 vfloat cruv = LVF(cfa[indx - v1]) * (LVF(dirwts0[indx - v2]) + LVF(dirwts0[indx]))
712 / (LVF(dirwts0[indx - v2]) * (epsv + cfav)
713 + LVF(dirwts0[indx]) * (epsv + LVF(cfa[indx - v2])));
714 vfloat crdv = LVF(cfa[indx + v1]) * (LVF(dirwts0[indx + v2]) + LVF(dirwts0[indx]))
715 / (LVF(dirwts0[indx + v2]) * (epsv + cfav)
716 + LVF(dirwts0[indx]) * (epsv + LVF(cfa[indx + v2])));
717 vfloat crlv = LVFU(cfa[indx - 1]) * (LVFU(dirwts1[indx - 2]) + LVF(dirwts1[indx]))
718 / (LVFU(dirwts1[indx - 2]) * (epsv + cfav)
719 + LVF(dirwts1[indx]) * (epsv + LVFU(cfa[indx - 2])));
720 vfloat crrv = LVFU(cfa[indx + 1]) * (LVFU(dirwts1[indx + 2]) + LVF(dirwts1[indx]))
721 / (LVFU(dirwts1[indx + 2]) * (epsv + cfav)
722 + LVF(dirwts1[indx]) * (epsv + LVFU(cfa[indx + 2])));
723
724 // G interpolated in vert/hor directions using Hamilton-Adams method
725 vfloat guhav = LVF(cfa[indx - v1]) + zd5v * (cfav - LVF(cfa[indx - v2]));
726 vfloat gdhav = LVF(cfa[indx + v1]) + zd5v * (cfav - LVF(cfa[indx + v2]));
727 vfloat glhav = LVFU(cfa[indx - 1]) + zd5v * (cfav - LVFU(cfa[indx - 2]));
728 vfloat grhav = LVFU(cfa[indx + 1]) + zd5v * (cfav - LVFU(cfa[indx + 2]));
729
730 // G interpolated in vert/hor directions using adaptive ratios
731 vfloat guarv = vself(vmaskf_lt(vabsf(onev - cruv), arthreshv), cfav * cruv, guhav);
732 vfloat gdarv = vself(vmaskf_lt(vabsf(onev - crdv), arthreshv), cfav * crdv, gdhav);
733 vfloat glarv = vself(vmaskf_lt(vabsf(onev - crlv), arthreshv), cfav * crlv, glhav);
734 vfloat grarv = vself(vmaskf_lt(vabsf(onev - crrv), arthreshv), cfav * crrv, grhav);
735
736 // adaptive weights for vertical/horizontal directions
737 vfloat hwtv = LVFU(dirwts1[indx - 1]) / (LVFU(dirwts1[indx - 1]) + LVFU(dirwts1[indx + 1]));
738 vfloat vwtv = LVF(dirwts0[indx - v1]) / (LVF(dirwts0[indx + v1]) + LVF(dirwts0[indx - v1]));
739
740 // interpolated G via adaptive weights of cardinal evaluations
741 vfloat Ginthhav = vintpf(hwtv, grhav, glhav);
742 vfloat Gintvhav = vintpf(vwtv, gdhav, guhav);
743
744 // interpolated colour differences
745 vfloat hcdaltv = sgnv * (Ginthhav - cfav);
746 vfloat vcdaltv = sgnv * (Gintvhav - cfav);
747 STVF(hcdalt[indx], hcdaltv);
748 STVF(vcdalt[indx], vcdaltv);
749
750 vmask clipmask = vorm(vorm(vmaskf_gt(cfav, clip_pt8v), vmaskf_gt(Gintvhav, clip_pt8v)),
751 vmaskf_gt(Ginthhav, clip_pt8v));
752 guarv = vself(clipmask, guhav, guarv);
753 gdarv = vself(clipmask, gdhav, gdarv);
754 glarv = vself(clipmask, glhav, glarv);
755 grarv = vself(clipmask, grhav, grarv);
756
757 // use HA if highlights are (nearly) clipped
758 STVF(vcd[indx], vself(clipmask, vcdaltv, sgnv * (vintpf(vwtv, gdarv, guarv) - cfav)));
759 STVF(hcd[indx], vself(clipmask, hcdaltv, sgnv * (vintpf(hwtv, grarv, glarv) - cfav)));
760
761 // differences of interpolations in opposite directions
762 STVF(dgintv[indx], vminf(SQRV(guhav - gdhav), SQRV(guarv - gdarv)));
763 STVF(dginth[indx], vminf(SQRV(glhav - grhav), SQRV(glarv - grarv)));
764 }
765 }
766
767 #else
768
769 for(int rr = 4; rr < rr1 - 4; rr++)
770 {
771 bool fcswitch = FC(rr, 4, filters) & 1;
772
773 for(int cc = 4, indx = rr * ts + cc; cc < cc1 - 4; cc++, indx++)
774 {
775
776 // colour ratios in each cardinal direction
777 float cru = cfa[indx - v1] * (dirwts0[indx - v2] + dirwts0[indx])
778 / (dirwts0[indx - v2] * (eps + cfa[indx]) + dirwts0[indx] * (eps + cfa[indx - v2]));
779 float crd = cfa[indx + v1] * (dirwts0[indx + v2] + dirwts0[indx])
780 / (dirwts0[indx + v2] * (eps + cfa[indx]) + dirwts0[indx] * (eps + cfa[indx + v2]));
781 float crl = cfa[indx - 1] * (dirwts1[indx - 2] + dirwts1[indx])
782 / (dirwts1[indx - 2] * (eps + cfa[indx]) + dirwts1[indx] * (eps + cfa[indx - 2]));
783 float crr = cfa[indx + 1] * (dirwts1[indx + 2] + dirwts1[indx])
784 / (dirwts1[indx + 2] * (eps + cfa[indx]) + dirwts1[indx] * (eps + cfa[indx + 2]));
785
786 // G interpolated in vert/hor directions using Hamilton-Adams method
787 float guha = cfa[indx - v1] + xdiv2f(cfa[indx] - cfa[indx - v2]);
788 float gdha = cfa[indx + v1] + xdiv2f(cfa[indx] - cfa[indx + v2]);
789 float glha = cfa[indx - 1] + xdiv2f(cfa[indx] - cfa[indx - 2]);
790 float grha = cfa[indx + 1] + xdiv2f(cfa[indx] - cfa[indx + 2]);
791
792 // G interpolated in vert/hor directions using adaptive ratios
793 float guar, gdar, glar, grar;
794
795 if(fabsf(1.f - cru) < arthresh)
796 {
797 guar = cfa[indx] * cru;
798 }
799 else
800 {
801 guar = guha;
802 }
803
804 if(fabsf(1.f - crd) < arthresh)
805 {
806 gdar = cfa[indx] * crd;
807 }
808 else
809 {
810 gdar = gdha;
811 }
812
813 if(fabsf(1.f - crl) < arthresh)
814 {
815 glar = cfa[indx] * crl;
816 }
817 else
818 {
819 glar = glha;
820 }
821
822 if(fabsf(1.f - crr) < arthresh)
823 {
824 grar = cfa[indx] * crr;
825 }
826 else
827 {
828 grar = grha;
829 }
830
831 // adaptive weights for vertical/horizontal directions
832 float hwt = dirwts1[indx - 1] / (dirwts1[indx - 1] + dirwts1[indx + 1]);
833 float vwt = dirwts0[indx - v1] / (dirwts0[indx + v1] + dirwts0[indx - v1]);
834
835 // interpolated G via adaptive weights of cardinal evaluations
836 float Gintvha = vwt * gdha + (1.f - vwt) * guha;
837 float Ginthha = hwt * grha + (1.f - hwt) * glha;
838
839 // interpolated colour differences
840 if(fcswitch)
841 {
842 vcd[indx] = cfa[indx] - (vwt * gdar + (1.f - vwt) * guar);
843 hcd[indx] = cfa[indx] - (hwt * grar + (1.f - hwt) * glar);
844 vcdalt[indx] = cfa[indx] - Gintvha;
845 hcdalt[indx] = cfa[indx] - Ginthha;
846 }
847 else
848 {
849 // interpolated colour differences
850 vcd[indx] = (vwt * gdar + (1.f - vwt) * guar) - cfa[indx];
851 hcd[indx] = (hwt * grar + (1.f - hwt) * glar) - cfa[indx];
852 vcdalt[indx] = Gintvha - cfa[indx];
853 hcdalt[indx] = Ginthha - cfa[indx];
854 }
855
856 fcswitch = !fcswitch;
857
858 if(cfa[indx] > clip_pt8 || Gintvha > clip_pt8 || Ginthha > clip_pt8)
859 {
860 // use HA if highlights are (nearly) clipped
861 guar = guha;
862 gdar = gdha;
863 glar = glha;
864 grar = grha;
865 vcd[indx] = vcdalt[indx];
866 hcd[indx] = hcdalt[indx];
867 }
868
869 // differences of interpolations in opposite directions
870 dgintv[indx] = MIN(SQR(guha - gdha), SQR(guar - gdar));
871 dginth[indx] = MIN(SQR(glha - grha), SQR(glar - grar));
872 }
873 }
874
875 #endif
876
877
878
#ifdef __SSE2__
  // --- bound the horizontal/vertical colour-difference interpolation in regions of
  // --- high saturation (SSE2 path, 4 pixels per iteration)
  vfloat clip_ptv = F2V(clip_pt);
  vfloat sgn3v;

  // the sign pattern alternates with the CFA phase so one code path serves both
  // G sites (colour difference enters negated) and R/B sites
  if(!(FC(4, 4, filters) & 1))
  {
    sgnv = _mm_set_ps(1.f, -1.f, 1.f, -1.f);
  }
  else
  {
    sgnv = _mm_set_ps(-1.f, 1.f, -1.f, 1.f);
  }

  sgn3v = sgnv + sgnv + sgnv; // 3*sign, used by the overshoot test below

  for(int rr = 4; rr < rr1 - 4; rr++)
  {
    vfloat nsgnv = sgnv; // sign pattern of this row before the per-row flip
    sgnv = -sgnv;        // CFA phase alternates row by row
    sgn3v = -sgn3v;

    for(int indx = rr * ts + 4; indx < rr * ts + cc1 - 4; indx += 4)
    {
      // variance of the adaptive horizontal colour difference and of its
      // Hamilton-Adams alternative over a 3-sample neighbourhood
      // (sum of the three pairwise squared differences)
      vfloat hcdv = LVF(hcd[indx]);
      vfloat hcdvarv = SQRV(LVFU(hcd[indx - 2]) - hcdv)
                       + SQRV(LVFU(hcd[indx - 2]) - LVFU(hcd[indx + 2]))
                       + SQRV(hcdv - LVFU(hcd[indx + 2]));
      vfloat hcdaltv = LVF(hcdalt[indx]);
      vfloat hcdaltvarv = SQRV(LVFU(hcdalt[indx - 2]) - hcdaltv)
                          + SQRV(LVFU(hcdalt[indx - 2]) - LVFU(hcdalt[indx + 2]))
                          + SQRV(hcdaltv - LVFU(hcdalt[indx + 2]));
      // same for the vertical direction (v2 = two rows of the tile)
      vfloat vcdv = LVF(vcd[indx]);
      vfloat vcdvarv = SQRV(LVF(vcd[indx - v2]) - vcdv)
                       + SQRV(LVF(vcd[indx - v2]) - LVF(vcd[indx + v2]))
                       + SQRV(vcdv - LVF(vcd[indx + v2]));
      vfloat vcdaltv = LVF(vcdalt[indx]);
      vfloat vcdaltvarv = SQRV(LVF(vcdalt[indx - v2]) - vcdaltv)
                          + SQRV(LVF(vcdalt[indx - v2]) - LVF(vcdalt[indx + v2]))
                          + SQRV(vcdaltv - LVF(vcdalt[indx + v2]));

      // choose the smallest variance; this yields a smoother interpolation
      hcdv = vself(vmaskf_lt(hcdaltvarv, hcdvarv), hcdaltv, hcdv);
      vcdv = vself(vmaskf_lt(vcdaltvarv, vcdvarv), vcdaltv, vcdv);

      // bound the interpolation in regions of high saturation
      // vertical and horizontal G interpolations
      vfloat Ginthv = sgnv * hcdv + LVF(cfa[indx]); // horizontally interpolated G
      vfloat temp2v = sgn3v * hcdv;
      vfloat hwtv = onev + temp2v / (epsv + Ginthv + LVF(cfa[indx])); // blend weight
      // only bound where the signed colour difference overshoots
      vmask hcdmask = vmaskf_gt(nsgnv * hcdv, ZEROV);
      vfloat hcdoldv = hcdv;
      // median-limited fallback value for the bounded interpolation
      vfloat tempv = nsgnv * (LVF(cfa[indx]) - ULIMV(Ginthv, LVFU(cfa[indx - 1]), LVFU(cfa[indx + 1])));
      hcdv = vself(vmaskf_lt(temp2v, -(LVF(cfa[indx]) + Ginthv)), tempv, vintpf(hwtv, hcdv, tempv));
      hcdv = vself(hcdmask, hcdv, hcdoldv);
      hcdv = vself(vmaskf_gt(Ginthv, clip_ptv), tempv, hcdv); // force the bound near clipping
      STVF(hcd[indx], hcdv);

      // identical bounding for the vertical direction
      vfloat Gintvv = sgnv * vcdv + LVF(cfa[indx]);
      temp2v = sgn3v * vcdv;
      vfloat vwtv = onev + temp2v / (epsv + Gintvv + LVF(cfa[indx]));
      vmask vcdmask = vmaskf_gt(nsgnv * vcdv, ZEROV);
      vfloat vcdoldv = vcdv;
      tempv = nsgnv * (LVF(cfa[indx]) - ULIMV(Gintvv, LVF(cfa[indx - v1]), LVF(cfa[indx + v1])));
      vcdv = vself(vmaskf_lt(temp2v, -(LVF(cfa[indx]) + Gintvv)), tempv, vintpf(vwtv, vcdv, tempv));
      vcdv = vself(vcdmask, vcdv, vcdoldv);
      vcdv = vself(vmaskf_gt(Gintvv, clip_ptv), tempv, vcdv);
      STVF(vcd[indx], vcdv);
      // squared v/h disagreement, consumed later by the Nyquist texture test
      STVFU(cddiffsq[indx], SQRV(vcdv - hcdv));
    }
  }

#else

  // --- scalar path; mathematically equivalent to the SSE2 branch above ---
  for(int rr = 4; rr < rr1 - 4; rr++)
  {
    // c toggles between G sites (1) and R/B sites (0) along the row
    for(int cc = 4, indx = rr * ts + cc, c = FC(rr, cc, filters) & 1; cc < cc1 - 4; cc++, indx++)
    {
      // 3-sample variances; 3*sum(x^2) - (sum x)^2 equals the sum of the three
      // pairwise squared differences used in the vector branch
      float hcdvar = 3.f * (SQR(hcd[indx - 2]) + SQR(hcd[indx]) + SQR(hcd[indx + 2]))
                     - SQR(hcd[indx - 2] + hcd[indx] + hcd[indx + 2]);
      float hcdaltvar = 3.f * (SQR(hcdalt[indx - 2]) + SQR(hcdalt[indx]) + SQR(hcdalt[indx + 2]))
                        - SQR(hcdalt[indx - 2] + hcdalt[indx] + hcdalt[indx + 2]);
      float vcdvar = 3.f * (SQR(vcd[indx - v2]) + SQR(vcd[indx]) + SQR(vcd[indx + v2]))
                     - SQR(vcd[indx - v2] + vcd[indx] + vcd[indx + v2]);
      float vcdaltvar = 3.f * (SQR(vcdalt[indx - v2]) + SQR(vcdalt[indx]) + SQR(vcdalt[indx + v2]))
                        - SQR(vcdalt[indx - v2] + vcdalt[indx] + vcdalt[indx + v2]);

      // choose the smallest variance; this yields a smoother interpolation
      if(hcdaltvar < hcdvar)
      {
        hcd[indx] = hcdalt[indx];
      }

      if(vcdaltvar < vcdvar)
      {
        vcd[indx] = vcdalt[indx];
      }

      // bound the interpolation in regions of high saturation
      // vertical and horizontal G interpolations
      float Gintv, Ginth;

      if(c)
      { // G site
        Ginth = -hcd[indx] + cfa[indx]; // R or B
        Gintv = -vcd[indx] + cfa[indx]; // B or R

        if(hcd[indx] > 0)
        {
          if(3.f * hcd[indx] > (Ginth + cfa[indx]))
          {
            // strong overshoot: clamp to the median-limited neighbour value
            hcd[indx] = -ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]) + cfa[indx];
          }
          else
          {
            // mild overshoot: blend between original and limited value
            float hwt = 1.f - 3.f * hcd[indx] / (eps + Ginth + cfa[indx]);
            hcd[indx] = hwt * hcd[indx]
                        + (1.f - hwt) * (-ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]) + cfa[indx]);
          }
        }

        if(vcd[indx] > 0)
        {
          if(3.f * vcd[indx] > (Gintv + cfa[indx]))
          {
            vcd[indx] = -ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]) + cfa[indx];
          }
          else
          {
            float vwt = 1.f - 3.f * vcd[indx] / (eps + Gintv + cfa[indx]);
            vcd[indx] = vwt * vcd[indx]
                        + (1.f - vwt) * (-ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]) + cfa[indx]);
          }
        }

        // near the clipping point always use the bounded value
        if(Ginth > clip_pt)
        {
          hcd[indx] = -ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]) + cfa[indx];
        }

        if(Gintv > clip_pt)
        {
          vcd[indx] = -ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]) + cfa[indx];
        }
      }
      else
      { // R or B site

        Ginth = hcd[indx] + cfa[indx]; // interpolated G
        Gintv = vcd[indx] + cfa[indx];

        if(hcd[indx] < 0)
        {
          if(3.f * hcd[indx] < -(Ginth + cfa[indx]))
          {
            hcd[indx] = ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]) - cfa[indx];
          }
          else
          {
            float hwt = 1.f + 3.f * hcd[indx] / (eps + Ginth + cfa[indx]);
            hcd[indx] = hwt * hcd[indx]
                        + (1.f - hwt) * (ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]) - cfa[indx]);
          }
        }

        if(vcd[indx] < 0)
        {
          if(3.f * vcd[indx] < -(Gintv + cfa[indx]))
          {
            vcd[indx] = ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]) - cfa[indx];
          }
          else
          {
            float vwt = 1.f + 3.f * vcd[indx] / (eps + Gintv + cfa[indx]);
            vcd[indx] = vwt * vcd[indx]
                        + (1.f - vwt) * (ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]) - cfa[indx]);
          }
        }

        if(Ginth > clip_pt)
        {
          hcd[indx] = ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]) - cfa[indx];
        }

        if(Gintv > clip_pt)
        {
          vcd[indx] = ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]) - cfa[indx];
        }

        // squared v/h disagreement, consumed later by the Nyquist texture test
        // NOTE(review): only written at R/B sites here, while the SSE2 branch
        // stores it for every pixel — downstream reads are at R/B sites only
        cddiffsq[indx] = SQR(vcd[indx] - hcd[indx]);
      }

      c = !c;
    }
  }

#endif
1075
1076
1077
#ifdef __SSE2__
  // --- compute adaptive horizontal/vertical weights hvwt[] from colour-difference
  // --- variances and interpolation fluctuations (SSE2 path)
  vfloat epssqv = F2V(epssq);

  for(int rr = 6; rr < rr1 - 6; rr++)
  {
    // R/B sites only; LC2VFU gathers every second element so 8 columns -> 4 lanes
    for(int indx = rr * ts + 6 + (FC(rr, 2, filters) & 1); indx < rr * ts + cc1 - 6; indx += 8)
    {
      // compute colour difference variances in cardinal directions
      vfloat tempv = LC2VFU(vcd[indx]);
      vfloat uavev = tempv + LC2VFU(vcd[indx - v1]) + LC2VFU(vcd[indx - v2]) + LC2VFU(vcd[indx - v3]);
      vfloat davev = tempv + LC2VFU(vcd[indx + v1]) + LC2VFU(vcd[indx + v2]) + LC2VFU(vcd[indx + v3]);
      // NOTE(review): uavev/davev are 4-element sums (not means); the scalar branch
      // below does the same, so both paths agree — values are used only as weights
      vfloat Dgrbvvaruv = SQRV(tempv - uavev) + SQRV(LC2VFU(vcd[indx - v1]) - uavev)
                          + SQRV(LC2VFU(vcd[indx - v2]) - uavev) + SQRV(LC2VFU(vcd[indx - v3]) - uavev);
      vfloat Dgrbvvardv = SQRV(tempv - davev) + SQRV(LC2VFU(vcd[indx + v1]) - davev)
                          + SQRV(LC2VFU(vcd[indx + v2]) - davev) + SQRV(LC2VFU(vcd[indx + v3]) - davev);

      // directional weights from the gradient maps
      vfloat hwtv = vadivapb(LC2VFU(dirwts1[indx - 1]), LC2VFU(dirwts1[indx + 1]));
      vfloat vwtv = vadivapb(LC2VFU(dirwts0[indx - v1]), LC2VFU(dirwts0[indx + v1]));

      tempv = LC2VFU(hcd[indx]);
      vfloat lavev = tempv + vaddc2vfu(hcd[indx - 3]) + LC2VFU(hcd[indx - 1]);
      vfloat ravev = tempv + vaddc2vfu(hcd[indx + 1]) + LC2VFU(hcd[indx + 3]);

      vfloat Dgrbhvarlv = SQRV(tempv - lavev) + SQRV(LC2VFU(hcd[indx - 1]) - lavev)
                          + SQRV(LC2VFU(hcd[indx - 2]) - lavev) + SQRV(LC2VFU(hcd[indx - 3]) - lavev);
      vfloat Dgrbhvarrv = SQRV(tempv - ravev) + SQRV(LC2VFU(hcd[indx + 1]) - ravev)
                          + SQRV(LC2VFU(hcd[indx + 2]) - ravev) + SQRV(LC2VFU(hcd[indx + 3]) - ravev);


      // blended vertical/horizontal colour-difference variances
      vfloat vcdvarv = epssqv + vintpf(vwtv, Dgrbvvardv, Dgrbvvaruv);
      vfloat hcdvarv = epssqv + vintpf(hwtv, Dgrbhvarrv, Dgrbhvarlv);

      // compute fluctuations in up/down and left/right interpolations of colours
      Dgrbvvaruv = LC2VFU(dgintv[indx - v1]) + LC2VFU(dgintv[indx - v2]);
      Dgrbvvardv = LC2VFU(dgintv[indx + v1]) + LC2VFU(dgintv[indx + v2]);

      Dgrbhvarlv = vaddc2vfu(dginth[indx - 2]);
      Dgrbhvarrv = vaddc2vfu(dginth[indx + 1]);

      vfloat vcdvar1v = epssqv + LC2VFU(dgintv[indx]) + vintpf(vwtv, Dgrbvvardv, Dgrbvvaruv);
      vfloat hcdvar1v = epssqv + LC2VFU(dginth[indx]) + vintpf(hwtv, Dgrbhvarrv, Dgrbhvarlv);

      // determine adaptive weights for G interpolation
      vfloat varwtv = hcdvarv / (vcdvarv + hcdvarv);
      vfloat diffwtv = hcdvar1v / (vcdvar1v + hcdvar1v);

      // if both agree on interpolation direction, choose the one with strongest directional
      // discrimination;
      // otherwise, choose the u/d and l/r difference fluctuation weights
      vmask decmask = vandm(vmaskf_gt((zd5v - varwtv) * (zd5v - diffwtv), ZEROV),
                            vmaskf_lt(vabsf(zd5v - diffwtv), vabsf(zd5v - varwtv)));
      STVFU(hvwt[indx >> 1], vself(decmask, varwtv, diffwtv));
    }
  }

#else

  // --- scalar path ---
  for(int rr = 6; rr < rr1 - 6; rr++)
  {
    for(int cc = 6 + (FC(rr, 2, filters) & 1), indx = rr * ts + cc; cc < cc1 - 6; cc += 2, indx += 2)
    {

      // compute colour difference variances in cardinal directions

      float uave = vcd[indx] + vcd[indx - v1] + vcd[indx - v2] + vcd[indx - v3];
      float dave = vcd[indx] + vcd[indx + v1] + vcd[indx + v2] + vcd[indx + v3];
      float lave = hcd[indx] + hcd[indx - 1] + hcd[indx - 2] + hcd[indx - 3];
      float rave = hcd[indx] + hcd[indx + 1] + hcd[indx + 2] + hcd[indx + 3];

      // colour difference (G-R or G-B) variance in up/down/left/right directions
      float Dgrbvvaru = SQR(vcd[indx] - uave) + SQR(vcd[indx - v1] - uave) + SQR(vcd[indx - v2] - uave)
                        + SQR(vcd[indx - v3] - uave);
      float Dgrbvvard = SQR(vcd[indx] - dave) + SQR(vcd[indx + v1] - dave) + SQR(vcd[indx + v2] - dave)
                        + SQR(vcd[indx + v3] - dave);
      float Dgrbhvarl = SQR(hcd[indx] - lave) + SQR(hcd[indx - 1] - lave) + SQR(hcd[indx - 2] - lave)
                        + SQR(hcd[indx - 3] - lave);
      float Dgrbhvarr = SQR(hcd[indx] - rave) + SQR(hcd[indx + 1] - rave) + SQR(hcd[indx + 2] - rave)
                        + SQR(hcd[indx + 3] - rave);

      // directional weights from the gradient maps
      float hwt = dirwts1[indx - 1] / (dirwts1[indx - 1] + dirwts1[indx + 1]);
      float vwt = dirwts0[indx - v1] / (dirwts0[indx + v1] + dirwts0[indx - v1]);

      float vcdvar = epssq + vwt * Dgrbvvard + (1.f - vwt) * Dgrbvvaru;
      float hcdvar = epssq + hwt * Dgrbhvarr + (1.f - hwt) * Dgrbhvarl;

      // compute fluctuations in up/down and left/right interpolations of colours
      Dgrbvvaru = (dgintv[indx]) + (dgintv[indx - v1]) + (dgintv[indx - v2]);
      Dgrbvvard = (dgintv[indx]) + (dgintv[indx + v1]) + (dgintv[indx + v2]);
      Dgrbhvarl = (dginth[indx]) + (dginth[indx - 1]) + (dginth[indx - 2]);
      Dgrbhvarr = (dginth[indx]) + (dginth[indx + 1]) + (dginth[indx + 2]);

      float vcdvar1 = epssq + vwt * Dgrbvvard + (1.f - vwt) * Dgrbvvaru;
      float hcdvar1 = epssq + hwt * Dgrbhvarr + (1.f - hwt) * Dgrbhvarl;

      // determine adaptive weights for G interpolation
      float varwt = hcdvar / (vcdvar + hcdvar);
      float diffwt = hcdvar1 / (vcdvar1 + hcdvar1);

      // if both agree on interpolation direction, choose the one with strongest directional
      // discrimination;
      // otherwise, choose the u/d and l/r difference fluctuation weights
      if((0.5 - varwt) * (0.5 - diffwt) > 0 && fabsf(0.5f - diffwt) < fabsf(0.5f - varwt))
      {
        hvwt[indx >> 1] = varwt;
      }
      else
      {
        hvwt[indx >> 1] = diffwt;
      }
    }
  }

#endif
1191
#ifdef __SSE2__
  // broadcast the gaussian kernel taps once for the vector loop below
  vfloat gaussg0 = F2V(gaussgrad[0]);
  vfloat gaussg1 = F2V(gaussgrad[1]);
  vfloat gaussg2 = F2V(gaussgrad[2]);
  vfloat gaussg3 = F2V(gaussgrad[3]);
  vfloat gaussg4 = F2V(gaussgrad[4]);
  vfloat gaussg5 = F2V(gaussgrad[5]);
  vfloat gausso0 = F2V(gaussodd[0]);
  vfloat gausso1 = F2V(gaussodd[1]);
  vfloat gausso2 = F2V(gaussodd[2]);
  vfloat gausso3 = F2V(gaussodd[3]);

#endif

  // precompute nyquist
  // nyqutest = gaussian-smoothed (vcd-hcd)^2 minus gaussian-smoothed gradient energy;
  // a positive value indicates texture at/near the Nyquist frequency
  for(int rr = 6; rr < rr1 - 6; rr++)
  {
    int cc = 6 + (FC(rr, 2, filters) & 1); // R/B sites only
    int indx = rr * ts + cc;

#ifdef __SSE2__

    // vector main loop (8 columns = 4 R/B sites per iteration);
    // the scalar loop below finishes the row remainder
    for(; cc < cc1 - 7; cc += 8, indx += 8)
    {
      vfloat valv
          = (gausso0 * LC2VFU(cddiffsq[indx])
             + gausso1 * (LC2VFU(cddiffsq[(indx - m1)]) + LC2VFU(cddiffsq[(indx + p1)])
                          + LC2VFU(cddiffsq[(indx - p1)]) + LC2VFU(cddiffsq[(indx + m1)]))
             + gausso2 * (LC2VFU(cddiffsq[(indx - v2)]) + LC2VFU(cddiffsq[(indx - 2)])
                          + LC2VFU(cddiffsq[(indx + 2)]) + LC2VFU(cddiffsq[(indx + v2)]))
             + gausso3 * (LC2VFU(cddiffsq[(indx - m2)]) + LC2VFU(cddiffsq[(indx + p2)])
                          + LC2VFU(cddiffsq[(indx - p2)]) + LC2VFU(cddiffsq[(indx + m2)])))
            - (gaussg0 * LC2VFU(delhvsqsum[indx])
               + gaussg1 * (LC2VFU(delhvsqsum[indx - v1]) + LC2VFU(delhvsqsum[indx - 1])
                            + LC2VFU(delhvsqsum[indx + 1]) + LC2VFU(delhvsqsum[indx + v1]))
               + gaussg2 * (LC2VFU(delhvsqsum[indx - m1]) + LC2VFU(delhvsqsum[indx + p1])
                            + LC2VFU(delhvsqsum[indx - p1]) + LC2VFU(delhvsqsum[indx + m1]))
               + gaussg3 * (LC2VFU(delhvsqsum[indx - v2]) + LC2VFU(delhvsqsum[indx - 2])
                            + LC2VFU(delhvsqsum[indx + 2]) + LC2VFU(delhvsqsum[indx + v2]))
               + gaussg4 * (LC2VFU(delhvsqsum[indx - v2 - 1]) + LC2VFU(delhvsqsum[indx - v2 + 1])
                            + LC2VFU(delhvsqsum[indx - ts - 2]) + LC2VFU(delhvsqsum[indx - ts + 2])
                            + LC2VFU(delhvsqsum[indx + ts - 2]) + LC2VFU(delhvsqsum[indx + ts + 2])
                            + LC2VFU(delhvsqsum[indx + v2 - 1]) + LC2VFU(delhvsqsum[indx + v2 + 1]))
               + gaussg5 * (LC2VFU(delhvsqsum[indx - m2]) + LC2VFU(delhvsqsum[indx + p2])
                            + LC2VFU(delhvsqsum[indx - p2]) + LC2VFU(delhvsqsum[indx + m2])));
      STVFU(nyqutest[indx >> 1], valv);
    }

#endif

    // scalar version (whole row when SSE2 is unavailable, tail otherwise)
    for(; cc < cc1 - 6; cc += 2, indx += 2)
    {
      nyqutest[indx >> 1]
          = (gaussodd[0] * cddiffsq[indx]
             + gaussodd[1] * (cddiffsq[(indx - m1)] + cddiffsq[(indx + p1)] + cddiffsq[(indx - p1)]
                              + cddiffsq[(indx + m1)])
             + gaussodd[2] * (cddiffsq[(indx - v2)] + cddiffsq[(indx - 2)] + cddiffsq[(indx + 2)]
                              + cddiffsq[(indx + v2)])
             + gaussodd[3] * (cddiffsq[(indx - m2)] + cddiffsq[(indx + p2)] + cddiffsq[(indx - p2)]
                              + cddiffsq[(indx + m2)]))
            - (gaussgrad[0] * delhvsqsum[indx]
               + gaussgrad[1] * (delhvsqsum[indx - v1] + delhvsqsum[indx + 1] + delhvsqsum[indx - 1]
                                 + delhvsqsum[indx + v1])
               + gaussgrad[2] * (delhvsqsum[indx - m1] + delhvsqsum[indx + p1] + delhvsqsum[indx - p1]
                                 + delhvsqsum[indx + m1])
               + gaussgrad[3] * (delhvsqsum[indx - v2] + delhvsqsum[indx - 2] + delhvsqsum[indx + 2]
                                 + delhvsqsum[indx + v2])
               + gaussgrad[4] * (delhvsqsum[indx - v2 - 1] + delhvsqsum[indx - v2 + 1]
                                 + delhvsqsum[indx - ts - 2] + delhvsqsum[indx - ts + 2]
                                 + delhvsqsum[indx + ts - 2] + delhvsqsum[indx + ts + 2]
                                 + delhvsqsum[indx + v2 - 1] + delhvsqsum[indx + v2 + 1])
               + gaussgrad[5] * (delhvsqsum[indx - m2] + delhvsqsum[indx + p2] + delhvsqsum[indx - p2]
                                 + delhvsqsum[indx + m2]));
    }
  }
1267
  // Nyquist test
  // Mark pixels where the v/h disagreement dominates the gradient energy and
  // track the bounding box of the marked region so later passes can be restricted.
  int nystartrow = 0;
  int nyendrow = 0;
  int nystartcol = ts + 1; // sentinel larger than any real column
  int nyendcol = 0;

  for(int rr = 6; rr < rr1 - 6; rr++)
  {
    for(int cc = 6 + (FC(rr, 2, filters) & 1), indx = rr * ts + cc; cc < cc1 - 6; cc += 2, indx += 2)
    {

      // nyquist texture test: ask if difference of vcd compared to hcd is larger or smaller than RGGB
      // gradients
      if(nyqutest[indx >> 1] > 0.f)
      {
        nyquist[indx >> 1] = 1; // nyquist=1 for nyquist region
        nystartrow = nystartrow ? nystartrow : rr; // remember first marked row
        nyendrow = rr;
        nystartcol = nystartcol > cc ? cc : nystartcol;
        nyendcol = nyendcol < cc ? cc : nyendcol;
      }
    }
  }


  // only proceed if the marked region spans more than a single row/column
  bool doNyquist = nystartrow != nyendrow && nystartcol != nyendcol;

  if(doNyquist)
  {
    nyendrow++; // because of < condition
    nyendcol++; // because of < condition
    nystartcol -= (nystartcol & 1); // keep column parity even
    // clamp the bounding box to the region where the 8-neighbour vote below is valid
    nystartrow = std::max(8, nystartrow);
    nyendrow = std::min(rr1 - 8, nyendrow);
    nystartcol = std::max(8, nystartcol);
    nyendcol = std::min(cc1 - 8, nyendcol);
    memset(&nyquist2[4 * tsh], 0, sizeof(char) * (ts - 8) * tsh);

#ifdef __SSE2__
    vint fourvb = _mm_set1_epi8(4);
    vint onevb = _mm_set1_epi8(1);

#endif

    for(int rr = nystartrow; rr < nyendrow; rr++)
    {
#ifdef __SSE2__

      // saturating byte adds count the 8 diagonal/cardinal nyquist neighbours
      // for 16 sites (32 columns) at once
      for(int indx = rr * ts; indx < rr * ts + cc1; indx += 32)
      {
        vint nyquisttemp1v = _mm_adds_epi8(_mm_load_si128((vint *)&nyquist[(indx - v2) >> 1]),
                                           _mm_loadu_si128((vint *)&nyquist[(indx - m1) >> 1]));
        vint nyquisttemp2v = _mm_adds_epi8(_mm_loadu_si128((vint *)&nyquist[(indx + p1) >> 1]),
                                           _mm_loadu_si128((vint *)&nyquist[(indx - 2) >> 1]));
        vint nyquisttemp3v = _mm_adds_epi8(_mm_loadu_si128((vint *)&nyquist[(indx + 2) >> 1]),
                                           _mm_loadu_si128((vint *)&nyquist[(indx - p1) >> 1]));
        vint valv = _mm_load_si128((vint *)&nyquist[indx >> 1]);
        vint nyquisttemp4v = _mm_adds_epi8(_mm_loadu_si128((vint *)&nyquist[(indx + m1) >> 1]),
                                           _mm_load_si128((vint *)&nyquist[(indx + v2) >> 1]));
        nyquisttemp1v = _mm_adds_epi8(nyquisttemp1v, nyquisttemp3v);
        nyquisttemp2v = _mm_adds_epi8(nyquisttemp2v, nyquisttemp4v);
        nyquisttemp1v = _mm_adds_epi8(nyquisttemp1v, nyquisttemp2v);
        // majority vote: >4 neighbours -> 1, <4 -> 0, exactly 4 -> keep own flag
        valv = vselc(_mm_cmpgt_epi8(nyquisttemp1v, fourvb), onevb, valv);
        valv = vselinotzero(_mm_cmplt_epi8(nyquisttemp1v, fourvb), valv);
        _mm_store_si128((vint *)&nyquist2[indx >> 1], valv);
      }

#else

      for(int indx = rr * ts + nystartcol + (FC(rr, 2, filters) & 1); indx < rr * ts + nyendcol;
          indx += 2)
      {
        unsigned int nyquisttemp
            = (nyquist[(indx - v2) >> 1] + nyquist[(indx - m1) >> 1] + nyquist[(indx + p1) >> 1]
               + nyquist[(indx - 2) >> 1] + nyquist[(indx + 2) >> 1] + nyquist[(indx - p1) >> 1]
               + nyquist[(indx + m1) >> 1] + nyquist[(indx + v2) >> 1]);
        // if most of your neighbours are named Nyquist, it's likely that you're one too, or not
        nyquist2[indx >> 1] = nyquisttemp > 4 ? 1 : (nyquisttemp < 4 ? 0 : nyquist[indx >> 1]);
      }

#endif
    }

    // end of Nyquist test

    // in areas of Nyquist texture, do area interpolation
    for(int rr = nystartrow; rr < nyendrow; rr++)
      for(int indx = rr * ts + nystartcol + (FC(rr, 2, filters) & 1); indx < rr * ts + nyendcol;
          indx += 2)
      {

        if(nyquist2[indx >> 1])
        {
          // area interpolation: gather statistics over the 13x13 window of
          // same-colour sites that are also flagged as Nyquist

          float sumcfa = 0.f, sumh = 0.f, sumv = 0.f, sumsqh = 0.f, sumsqv = 0.f, areawt = 0.f;

          for(int i = -6; i < 7; i += 2)
          {
            int indx1 = indx + (i * ts) - 6;

            for(int j = -6; j < 7; j += 2, indx1 += 2)
            {
              if(nyquist2[indx1 >> 1])
              {
                float cfatemp = cfa[indx1];
                sumcfa += cfatemp;
                sumh += (cfa[indx1 - 1] + cfa[indx1 + 1]);
                sumv += (cfa[indx1 - v1] + cfa[indx1 + v1]);
                sumsqh += SQR(cfatemp - cfa[indx1 - 1]) + SQR(cfatemp - cfa[indx1 + 1]);
                sumsqv += SQR(cfatemp - cfa[indx1 - v1]) + SQR(cfatemp - cfa[indx1 + v1]);
                areawt += 1;
              }
            }
          }

          // horizontal and vertical colour differences, and adaptive weight
          sumh = sumcfa - xdiv2f(sumh); // each site contributed two h/v neighbours, hence /2
          sumv = sumcfa - xdiv2f(sumv);
          areawt = xdiv2f(areawt);
          float hcdvar = epssq + fabsf(areawt * sumsqh - sumh * sumh);
          float vcdvar = epssq + fabsf(areawt * sumsqv - sumv * sumv);
          hvwt[indx >> 1] = hcdvar / (vcdvar + hcdvar);

          // end of area interpolation
        }
      }
  }
1396
1397
  // populate G at R/B sites
  for(int rr = 8; rr < rr1 - 8; rr++)
    for(int indx = rr * ts + 8 + (FC(rr, 2, filters) & 1); indx < rr * ts + cc1 - 8; indx += 2)
    {

      // first ask if one gets more directional discrimination from nearby B/R sites
      // (average of the four diagonal neighbours' h/v weights; xdivf(.,2) divides by 4)
      float hvwtalt = xdivf(hvwt[(indx - m1) >> 1] + hvwt[(indx + p1) >> 1] + hvwt[(indx - p1) >> 1]
                                + hvwt[(indx + m1) >> 1],
                            2);

      // keep whichever weight is more decisive (further from the neutral 0.5)
      hvwt[indx >> 1]
          = fabsf(0.5f - hvwt[indx >> 1]) < fabsf(0.5f - hvwtalt) ? hvwtalt : hvwt[indx >> 1];
      // a better result was obtained from the neighbours

      Dgrb[0][indx >> 1] = intp(hvwt[indx >> 1], vcd[indx], hcd[indx]); // evaluate colour differences

      rgbgreen[indx] = cfa[indx] + Dgrb[0][indx >> 1]; // evaluate G (finally!)

      // local curvature in G (preparation for nyquist refinement step);
      // only needed inside the Nyquist region, zero elsewhere
      Dgrb2[indx >> 1].h = nyquist2[indx >> 1]
                               ? SQR(rgbgreen[indx] - xdiv2f(rgbgreen[indx - 1] + rgbgreen[indx + 1]))
                               : 0.f;
      Dgrb2[indx >> 1].v = nyquist2[indx >> 1]
                               ? SQR(rgbgreen[indx] - xdiv2f(rgbgreen[indx - v1] + rgbgreen[indx + v1]))
                               : 0.f;
    }
1424
1425
  // end of standard interpolation

  // refine Nyquist areas using G curvatures
  if(doNyquist)
  {
    for(int rr = nystartrow; rr < nyendrow; rr++)
      for(int indx = rr * ts + nystartcol + (FC(rr, 2, filters) & 1); indx < rr * ts + nyendcol;
          indx += 2)
      {

        if(nyquist2[indx >> 1])
        {
          // local averages (over Nyquist pixels only) of G curvature squared,
          // weighted by the gquinc kernel over the quincunx neighbourhood
          float gvarh
              = epssq + (gquinc[0] * Dgrb2[indx >> 1].h
                         + gquinc[1] * (Dgrb2[(indx - m1) >> 1].h + Dgrb2[(indx + p1) >> 1].h
                                        + Dgrb2[(indx - p1) >> 1].h + Dgrb2[(indx + m1) >> 1].h)
                         + gquinc[2] * (Dgrb2[(indx - v2) >> 1].h + Dgrb2[(indx - 2) >> 1].h
                                        + Dgrb2[(indx + 2) >> 1].h + Dgrb2[(indx + v2) >> 1].h)
                         + gquinc[3] * (Dgrb2[(indx - m2) >> 1].h + Dgrb2[(indx + p2) >> 1].h
                                        + Dgrb2[(indx - p2) >> 1].h + Dgrb2[(indx + m2) >> 1].h));
          float gvarv
              = epssq + (gquinc[0] * Dgrb2[indx >> 1].v
                         + gquinc[1] * (Dgrb2[(indx - m1) >> 1].v + Dgrb2[(indx + p1) >> 1].v
                                        + Dgrb2[(indx - p1) >> 1].v + Dgrb2[(indx + m1) >> 1].v)
                         + gquinc[2] * (Dgrb2[(indx - v2) >> 1].v + Dgrb2[(indx - 2) >> 1].v
                                        + Dgrb2[(indx + 2) >> 1].v + Dgrb2[(indx + v2) >> 1].v)
                         + gquinc[3] * (Dgrb2[(indx - m2) >> 1].v + Dgrb2[(indx + p2) >> 1].v
                                        + Dgrb2[(indx - p2) >> 1].v + Dgrb2[(indx + m2) >> 1].v));
          // use the results as weights for refined G interpolation
          // (direction with smaller curvature gets the larger weight)
          Dgrb[0][indx >> 1] = (hcd[indx] * gvarv + vcd[indx] * gvarh) / (gvarv + gvarh);
          rgbgreen[indx] = cfa[indx] + Dgrb[0][indx >> 1];
        }
      }
  }
1461
1462
#ifdef __SSE2__

  // --- precompute diagonal gradients (delp/delm along the p/m diagonals) and
  // --- squared diagonal colour differences (Dgrbsq1p/Dgrbsq1m) for the
  // --- diagonal interpolation correction; SSE2 path, 4 sites per iteration
  for(int rr = 6; rr < rr1 - 6; rr++)
  {
    // the two row phases differ only in which column offset holds the R/B site
    if((FC(rr, 2, filters) & 1) == 0)
    {
      for(int cc = 6, indx = rr * ts + cc; cc < cc1 - 6; cc += 8, indx += 8)
      {
        vfloat tempv = LC2VFU(cfa[indx + 1]);
        vfloat Dgrbsq1pv
            = (SQRV(tempv - LC2VFU(cfa[indx + 1 - p1])) + SQRV(tempv - LC2VFU(cfa[indx + 1 + p1])));
        STVFU(delp[indx >> 1], vabsf(LC2VFU(cfa[indx + p1]) - LC2VFU(cfa[indx - p1])));
        STVFU(delm[indx >> 1], vabsf(LC2VFU(cfa[indx + m1]) - LC2VFU(cfa[indx - m1])));
        vfloat Dgrbsq1mv
            = (SQRV(tempv - LC2VFU(cfa[indx + 1 - m1])) + SQRV(tempv - LC2VFU(cfa[indx + 1 + m1])));
        STVFU(Dgrbsq1m[indx >> 1], Dgrbsq1mv);
        STVFU(Dgrbsq1p[indx >> 1], Dgrbsq1pv);
      }
    }
    else
    {
      for(int cc = 6, indx = rr * ts + cc; cc < cc1 - 6; cc += 8, indx += 8)
      {
        vfloat tempv = LC2VFU(cfa[indx]);
        vfloat Dgrbsq1pv
            = (SQRV(tempv - LC2VFU(cfa[indx - p1])) + SQRV(tempv - LC2VFU(cfa[indx + p1])));
        STVFU(delp[indx >> 1], vabsf(LC2VFU(cfa[indx + 1 + p1]) - LC2VFU(cfa[indx + 1 - p1])));
        STVFU(delm[indx >> 1], vabsf(LC2VFU(cfa[indx + 1 + m1]) - LC2VFU(cfa[indx + 1 - m1])));
        vfloat Dgrbsq1mv
            = (SQRV(tempv - LC2VFU(cfa[indx - m1])) + SQRV(tempv - LC2VFU(cfa[indx + m1])));
        STVFU(Dgrbsq1m[indx >> 1], Dgrbsq1mv);
        STVFU(Dgrbsq1p[indx >> 1], Dgrbsq1pv);
      }
    }
  }

#else

  // --- scalar path, same quantities as above ---
  for(int rr = 6; rr < rr1 - 6; rr++)
  {
    if((FC(rr, 2, filters) & 1) == 0)
    {
      for(int cc = 6, indx = rr * ts + cc; cc < cc1 - 6; cc += 2, indx += 2)
      {
        delp[indx >> 1] = fabsf(cfa[indx + p1] - cfa[indx - p1]);
        delm[indx >> 1] = fabsf(cfa[indx + m1] - cfa[indx - m1]);
        Dgrbsq1p[indx >> 1]
            = (SQR(cfa[indx + 1] - cfa[indx + 1 - p1]) + SQR(cfa[indx + 1] - cfa[indx + 1 + p1]));
        Dgrbsq1m[indx >> 1]
            = (SQR(cfa[indx + 1] - cfa[indx + 1 - m1]) + SQR(cfa[indx + 1] - cfa[indx + 1 + m1]));
      }
    }
    else
    {
      for(int cc = 6, indx = rr * ts + cc; cc < cc1 - 6; cc += 2, indx += 2)
      {
        Dgrbsq1p[indx >> 1] = (SQR(cfa[indx] - cfa[indx - p1]) + SQR(cfa[indx] - cfa[indx + p1]));
        Dgrbsq1m[indx >> 1] = (SQR(cfa[indx] - cfa[indx - m1]) + SQR(cfa[indx] - cfa[indx + m1]));
        delp[indx >> 1] = fabsf(cfa[indx + 1 + p1] - cfa[indx + 1 - p1]);
        delm[indx >> 1] = fabsf(cfa[indx + 1 + m1] - cfa[indx + 1 - m1]);
      }
    }
  }

#endif
1528
  // diagonal interpolation correction
  // interpolate the "other" R/B value at each R/B site along both diagonals
  // (rbm = minus diagonal, rbp = plus diagonal) and derive the weight pmwt
  // that later blends the two

#ifdef __SSE2__
  vfloat gausseven0v = F2V(gausseven[0]);
  vfloat gausseven1v = F2V(gausseven[1]);
#endif

  for(int rr = 8; rr < rr1 - 8; rr++)
  {
#ifdef __SSE2__

    for(int indx = rr * ts + 8 + (FC(rr, 2, filters) & 1), indx1 = indx >> 1; indx < rr * ts + cc1 - 8;
        indx += 8, indx1 += 4)
    {

      // diagonal colour ratios
      vfloat cfav = LC2VFU(cfa[indx]);

      // SE neighbour: use the ratio-based estimate near unity ratio, otherwise
      // the Hamilton-Adams style difference estimate
      vfloat temp1v = LC2VFU(cfa[indx + m1]);
      vfloat temp2v = LC2VFU(cfa[indx + m2]);
      vfloat rbsev = vmul2f(temp1v) / (epsv + cfav + temp2v);
      rbsev = vself(vmaskf_lt(vabsf(onev - rbsev), arthreshv), cfav * rbsev,
                    temp1v + zd5v * (cfav - temp2v));

      // NW neighbour
      temp1v = LC2VFU(cfa[indx - m1]);
      temp2v = LC2VFU(cfa[indx - m2]);
      vfloat rbnwv = vmul2f(temp1v) / (epsv + cfav + temp2v);
      rbnwv = vself(vmaskf_lt(vabsf(onev - rbnwv), arthreshv), cfav * rbnwv,
                    temp1v + zd5v * (cfav - temp2v));

      temp1v = epsv + LVFU(delm[indx1]);
      vfloat wtsev = temp1v + LVFU(delm[(indx + m1) >> 1])
                     + LVFU(delm[(indx + m2) >> 1]); // same as for wtu,wtd,wtl,wtr
      vfloat wtnwv = temp1v + LVFU(delm[(indx - m1) >> 1]) + LVFU(delm[(indx - m2) >> 1]);

      // gradient-weighted combination along the minus diagonal
      vfloat rbmv = (wtsev * rbnwv + wtnwv * rbsev) / (wtsev + wtnwv);

      // bound the interpolation in regions of high saturation
      temp1v = ULIMV(rbmv, LC2VFU(cfa[indx - m1]), LC2VFU(cfa[indx + m1]));
      vfloat wtv = vmul2f(cfav - rbmv) / (epsv + rbmv + cfav);
      temp2v = vintpf(wtv, rbmv, temp1v);

      temp2v = vself(vmaskf_lt(rbmv + rbmv, cfav), temp1v, temp2v);
      temp2v = vself(vmaskf_lt(rbmv, cfav), temp2v, rbmv);
      STVFU(rbm[indx1], vself(vmaskf_gt(temp2v, clip_ptv),
                              ULIMV(temp2v, LC2VFU(cfa[indx - m1]), LC2VFU(cfa[indx + m1])), temp2v));


      // same procedure along the plus diagonal (NE/SW neighbours)
      temp1v = LC2VFU(cfa[indx + p1]);
      temp2v = LC2VFU(cfa[indx + p2]);
      vfloat rbnev = vmul2f(temp1v) / (epsv + cfav + temp2v);
      rbnev = vself(vmaskf_lt(vabsf(onev - rbnev), arthreshv), cfav * rbnev,
                    temp1v + zd5v * (cfav - temp2v));

      temp1v = LC2VFU(cfa[indx - p1]);
      temp2v = LC2VFU(cfa[indx - p2]);
      vfloat rbswv = vmul2f(temp1v) / (epsv + cfav + temp2v);
      rbswv = vself(vmaskf_lt(vabsf(onev - rbswv), arthreshv), cfav * rbswv,
                    temp1v + zd5v * (cfav - temp2v));

      temp1v = epsv + LVFU(delp[indx1]);
      vfloat wtnev = temp1v + LVFU(delp[(indx + p1) >> 1]) + LVFU(delp[(indx + p2) >> 1]);
      vfloat wtswv = temp1v + LVFU(delp[(indx - p1) >> 1]) + LVFU(delp[(indx - p2) >> 1]);

      vfloat rbpv = (wtnev * rbswv + wtswv * rbnev) / (wtnev + wtswv);

      temp1v = ULIMV(rbpv, LC2VFU(cfa[indx - p1]), LC2VFU(cfa[indx + p1]));
      wtv = vmul2f(cfav - rbpv) / (epsv + rbpv + cfav);
      temp2v = vintpf(wtv, rbpv, temp1v);

      temp2v = vself(vmaskf_lt(rbpv + rbpv, cfav), temp1v, temp2v);
      temp2v = vself(vmaskf_lt(rbpv, cfav), temp2v, rbpv);
      STVFU(rbp[indx1], vself(vmaskf_gt(temp2v, clip_ptv),
                              ULIMV(temp2v, LC2VFU(cfa[indx - p1]), LC2VFU(cfa[indx + p1])), temp2v));

      // gaussian-smoothed variance of R-B along the minus diagonal ...
      vfloat rbvarmv
          = epssqv
            + (gausseven0v * (LVFU(Dgrbsq1m[(indx - v1) >> 1]) + LVFU(Dgrbsq1m[(indx - 1) >> 1])
                              + LVFU(Dgrbsq1m[(indx + 1) >> 1]) + LVFU(Dgrbsq1m[(indx + v1) >> 1]))
               + gausseven1v
                     * (LVFU(Dgrbsq1m[(indx - v2 - 1) >> 1]) + LVFU(Dgrbsq1m[(indx - v2 + 1) >> 1])
                        + LVFU(Dgrbsq1m[(indx - 2 - v1) >> 1]) + LVFU(Dgrbsq1m[(indx + 2 - v1) >> 1])
                        + LVFU(Dgrbsq1m[(indx - 2 + v1) >> 1]) + LVFU(Dgrbsq1m[(indx + 2 + v1) >> 1])
                        + LVFU(Dgrbsq1m[(indx + v2 - 1) >> 1]) + LVFU(Dgrbsq1m[(indx + v2 + 1) >> 1])));
      // ... relative to both diagonals gives the plus/minus blend weight
      STVFU(pmwt[indx1],
            rbvarmv / ((epssqv
                        + (gausseven0v
                               * (LVFU(Dgrbsq1p[(indx - v1) >> 1]) + LVFU(Dgrbsq1p[(indx - 1) >> 1])
                                  + LVFU(Dgrbsq1p[(indx + 1) >> 1]) + LVFU(Dgrbsq1p[(indx + v1) >> 1]))
                           + gausseven1v * (LVFU(Dgrbsq1p[(indx - v2 - 1) >> 1])
                                            + LVFU(Dgrbsq1p[(indx - v2 + 1) >> 1])
                                            + LVFU(Dgrbsq1p[(indx - 2 - v1) >> 1])
                                            + LVFU(Dgrbsq1p[(indx + 2 - v1) >> 1])
                                            + LVFU(Dgrbsq1p[(indx - 2 + v1) >> 1])
                                            + LVFU(Dgrbsq1p[(indx + 2 + v1) >> 1])
                                            + LVFU(Dgrbsq1p[(indx + v2 - 1) >> 1])
                                            + LVFU(Dgrbsq1p[(indx + v2 + 1) >> 1]))))
                       + rbvarmv));
    }

#else

    for(int cc = 8 + (FC(rr, 2, filters) & 1), indx = rr * ts + cc, indx1 = indx >> 1; cc < cc1 - 8;
        cc += 2, indx += 2, indx1++)
    {

      // diagonal colour ratios
      float crse = xmul2f(cfa[indx + m1]) / (eps + cfa[indx] + (cfa[indx + m2]));
      float crnw = xmul2f(cfa[indx - m1]) / (eps + cfa[indx] + (cfa[indx - m2]));
      float crne = xmul2f(cfa[indx + p1]) / (eps + cfa[indx] + (cfa[indx + p2]));
      float crsw = xmul2f(cfa[indx - p1]) / (eps + cfa[indx] + (cfa[indx - p2]));
      // colour differences in diagonal directions
      float rbse, rbnw, rbne, rbsw;

      // assign B/R at R/B sites: ratio-based estimate near unity ratio,
      // Hamilton-Adams style difference estimate otherwise
      if(fabsf(1.f - crse) < arthresh)
      {
        rbse = cfa[indx] * crse; // use this if more precise diag interp is necessary
      }
      else
      {
        rbse = (cfa[indx + m1]) + xdiv2f(cfa[indx] - cfa[indx + m2]);
      }

      if(fabsf(1.f - crnw) < arthresh)
      {
        rbnw = cfa[indx] * crnw;
      }
      else
      {
        rbnw = (cfa[indx - m1]) + xdiv2f(cfa[indx] - cfa[indx - m2]);
      }

      if(fabsf(1.f - crne) < arthresh)
      {
        rbne = cfa[indx] * crne;
      }
      else
      {
        rbne = (cfa[indx + p1]) + xdiv2f(cfa[indx] - cfa[indx + p2]);
      }

      if(fabsf(1.f - crsw) < arthresh)
      {
        rbsw = cfa[indx] * crsw;
      }
      else
      {
        rbsw = (cfa[indx - p1]) + xdiv2f(cfa[indx] - cfa[indx - p2]);
      }

      float wtse = eps + delm[indx1] + delm[(indx + m1) >> 1]
                   + delm[(indx + m2) >> 1]; // same as for wtu,wtd,wtl,wtr
      float wtnw = eps + delm[indx1] + delm[(indx - m1) >> 1] + delm[(indx - m2) >> 1];
      float wtne = eps + delp[indx1] + delp[(indx + p1) >> 1] + delp[(indx + p2) >> 1];
      float wtsw = eps + delp[indx1] + delp[(indx - p1) >> 1] + delp[(indx - p2) >> 1];


      // gradient-weighted combination along each diagonal
      rbm[indx1] = (wtse * rbnw + wtnw * rbse) / (wtse + wtnw);
      rbp[indx1] = (wtne * rbsw + wtsw * rbne) / (wtne + wtsw);

      // variance of R-B in plus/minus directions
      float rbvarm
          = epssq
            + (gausseven[0] * (Dgrbsq1m[(indx - v1) >> 1] + Dgrbsq1m[(indx - 1) >> 1]
                               + Dgrbsq1m[(indx + 1) >> 1] + Dgrbsq1m[(indx + v1) >> 1])
               + gausseven[1] * (Dgrbsq1m[(indx - v2 - 1) >> 1] + Dgrbsq1m[(indx - v2 + 1) >> 1]
                                 + Dgrbsq1m[(indx - 2 - v1) >> 1] + Dgrbsq1m[(indx + 2 - v1) >> 1]
                                 + Dgrbsq1m[(indx - 2 + v1) >> 1] + Dgrbsq1m[(indx + 2 + v1) >> 1]
                                 + Dgrbsq1m[(indx + v2 - 1) >> 1] + Dgrbsq1m[(indx + v2 + 1) >> 1]));
      pmwt[indx1]
          = rbvarm
            / ((epssq + (gausseven[0] * (Dgrbsq1p[(indx - v1) >> 1] + Dgrbsq1p[(indx - 1) >> 1]
                                         + Dgrbsq1p[(indx + 1) >> 1] + Dgrbsq1p[(indx + v1) >> 1])
                         + gausseven[1]
                               * (Dgrbsq1p[(indx - v2 - 1) >> 1] + Dgrbsq1p[(indx - v2 + 1) >> 1]
                                  + Dgrbsq1p[(indx - 2 - v1) >> 1] + Dgrbsq1p[(indx + 2 - v1) >> 1]
                                  + Dgrbsq1p[(indx - 2 + v1) >> 1] + Dgrbsq1p[(indx + 2 + v1) >> 1]
                                  + Dgrbsq1p[(indx + v2 - 1) >> 1] + Dgrbsq1p[(indx + v2 + 1) >> 1])))
               + rbvarm);

      // bound the interpolation in regions of high saturation

      if(rbp[indx1] < cfa[indx])
      {
        if(xmul2f(rbp[indx1]) < cfa[indx])
        {
          // strong undershoot: clamp to the median-limited neighbour value
          rbp[indx1] = ULIM(rbp[indx1], cfa[indx - p1], cfa[indx + p1]);
        }
        else
        {
          // mild undershoot: blend between original and limited value
          float pwt = xmul2f(cfa[indx] - rbp[indx1]) / (eps + rbp[indx1] + cfa[indx]);
          rbp[indx1]
              = pwt * rbp[indx1] + (1.f - pwt) * ULIM(rbp[indx1], cfa[indx - p1], cfa[indx + p1]);
        }
      }

      if(rbm[indx1] < cfa[indx])
      {
        if(xmul2f(rbm[indx1]) < cfa[indx])
        {
          rbm[indx1] = ULIM(rbm[indx1], cfa[indx - m1], cfa[indx + m1]);
        }
        else
        {
          float mwt = xmul2f(cfa[indx] - rbm[indx1]) / (eps + rbm[indx1] + cfa[indx]);
          rbm[indx1]
              = mwt * rbm[indx1] + (1.f - mwt) * ULIM(rbm[indx1], cfa[indx - m1], cfa[indx + m1]);
        }
      }

      // near the clipping point always use the bounded value
      if(rbp[indx1] > clip_pt)
      {
        rbp[indx1] = ULIM(rbp[indx1], cfa[indx - p1], cfa[indx + p1]);
      }

      if(rbm[indx1] > clip_pt)
      {
        rbm[indx1] = ULIM(rbm[indx1], cfa[indx - m1], cfa[indx + m1]);
      }
    }

#endif
  }
1752
#ifdef __SSE2__
  vfloat zd25v = F2V(0.25f);
#endif

  // refine the plus/minus weight from the diagonal neighbours and compute
  // rbint = interpolated (R+B)/2 at each R/B site
  for(int rr = 10; rr < rr1 - 10; rr++)
#ifdef __SSE2__
    for(int indx = rr * ts + 10 + (FC(rr, 2, filters) & 1), indx1 = indx >> 1;
        indx < rr * ts + cc1 - 10; indx += 8, indx1 += 4)
    {

      // first ask if one gets more directional discrimination from nearby B/R sites
      vfloat pmwtaltv = zd25v * (LVFU(pmwt[(indx - m1) >> 1]) + LVFU(pmwt[(indx + p1) >> 1])
                                 + LVFU(pmwt[(indx - p1) >> 1]) + LVFU(pmwt[(indx + m1) >> 1]));
      vfloat tempv = LVFU(pmwt[indx1]);
      // keep whichever weight is more decisive (further from the neutral 0.5)
      tempv = vself(vmaskf_lt(vabsf(zd5v - tempv), vabsf(zd5v - pmwtaltv)), pmwtaltv, tempv);
      STVFU(pmwt[indx1], tempv);
      STVFU(rbint[indx1],
            zd5v * (LC2VFU(cfa[indx]) + vintpf(tempv, LVFU(rbp[indx1]), LVFU(rbm[indx1]))));
    }

#else

    for(int cc = 10 + (FC(rr, 2, filters) & 1), indx = rr * ts + cc, indx1 = indx >> 1; cc < cc1 - 10;
        cc += 2, indx += 2, indx1++)
    {

      // first ask if one gets more directional discrimination from nearby B/R sites
      // (average of the four diagonal neighbours; xdivf(.,2) divides by 4)
      float pmwtalt = xdivf(pmwt[(indx - m1) >> 1] + pmwt[(indx + p1) >> 1] + pmwt[(indx - p1) >> 1]
                                + pmwt[(indx + m1) >> 1],
                            2);

      if(fabsf(0.5f - pmwt[indx1]) < fabsf(0.5f - pmwtalt))
      {
        pmwt[indx1] = pmwtalt; // a better result was obtained from the neighbours
      }

      rbint[indx1] = xdiv2f(cfa[indx] + rbm[indx1] * (1.f - pmwt[indx1])
                            + rbp[indx1] * pmwt[indx1]); // this is R+B, interpolated
    }

#endif
1794
1795 for(int rr = 12; rr < rr1 - 12; rr++)
1796 #ifdef __SSE2__
1797 for(int indx = rr * ts + 12 + (FC(rr, 2, filters) & 1), indx1 = indx >> 1;
1798 indx < rr * ts + cc1 - 12; indx += 8, indx1 += 4)
1799 {
1800 vmask copymask = vmaskf_ge(vabsf(zd5v - LVFU(pmwt[indx1])), vabsf(zd5v - LVFU(hvwt[indx1])));
1801
1802 if(_mm_movemask_ps((vfloat)copymask))
1803 { // if for any of the 4 pixels the condition is true, do the maths for all 4 pixels and mask the
1804 // unused out at the end
1805 // now interpolate G vertically/horizontally using R+B values
1806 // unfortunately, since G interpolation cannot be done diagonally this may lead to colour shifts
1807 // colour ratios for G interpolation
1808 vfloat rbintv = LVFU(rbint[indx1]);
1809
1810 // interpolated G via adaptive ratios or Hamilton-Adams in each cardinal direction
1811 vfloat cruv = vmul2f(LC2VFU(cfa[indx - v1])) / (epsv + rbintv + LVFU(rbint[(indx1 - v1)]));
1812 vfloat guv = rbintv * cruv;
1813 vfloat gu2v = LC2VFU(cfa[indx - v1]) + zd5v * (rbintv - LVFU(rbint[(indx1 - v1)]));
1814 guv = vself(vmaskf_lt(vabsf(onev - cruv), arthreshv), guv, gu2v);
1815
1816 vfloat crdv = vmul2f(LC2VFU(cfa[indx + v1])) / (epsv + rbintv + LVFU(rbint[(indx1 + v1)]));
1817 vfloat gdv = rbintv * crdv;
1818 vfloat gd2v = LC2VFU(cfa[indx + v1]) + zd5v * (rbintv - LVFU(rbint[(indx1 + v1)]));
1819 gdv = vself(vmaskf_lt(vabsf(onev - crdv), arthreshv), gdv, gd2v);
1820
1821 vfloat Gintvv = (LC2VFU(dirwts0[indx - v1]) * gdv + LC2VFU(dirwts0[indx + v1]) * guv)
1822 / (LC2VFU(dirwts0[indx + v1]) + LC2VFU(dirwts0[indx - v1]));
1823 vfloat Gint1v = ULIMV(Gintvv, LC2VFU(cfa[indx - v1]), LC2VFU(cfa[indx + v1]));
1824 vfloat vwtv = vmul2f(rbintv - Gintvv) / (epsv + Gintvv + rbintv);
1825 vfloat Gint2v = vintpf(vwtv, Gintvv, Gint1v);
1826 Gint1v = vself(vmaskf_lt(vmul2f(Gintvv), rbintv), Gint1v, Gint2v);
1827 Gintvv = vself(vmaskf_lt(Gintvv, rbintv), Gint1v, Gintvv);
1828 Gintvv = vself(vmaskf_gt(Gintvv, clip_ptv),
1829 ULIMV(Gintvv, LC2VFU(cfa[indx - v1]), LC2VFU(cfa[indx + v1])), Gintvv);
1830
1831 vfloat crlv = vmul2f(LC2VFU(cfa[indx - 1])) / (epsv + rbintv + LVFU(rbint[(indx1 - 1)]));
1832 vfloat glv = rbintv * crlv;
1833 vfloat gl2v = LC2VFU(cfa[indx - 1]) + zd5v * (rbintv - LVFU(rbint[(indx1 - 1)]));
1834 glv = vself(vmaskf_lt(vabsf(onev - crlv), arthreshv), glv, gl2v);
1835
1836 vfloat crrv = vmul2f(LC2VFU(cfa[indx + 1])) / (epsv + rbintv + LVFU(rbint[(indx1 + 1)]));
1837 vfloat grv = rbintv * crrv;
1838 vfloat gr2v = LC2VFU(cfa[indx + 1]) + zd5v * (rbintv - LVFU(rbint[(indx1 + 1)]));
1839 grv = vself(vmaskf_lt(vabsf(onev - crrv), arthreshv), grv, gr2v);
1840
1841 vfloat Ginthv = (LC2VFU(dirwts1[indx - 1]) * grv + LC2VFU(dirwts1[indx + 1]) * glv)
1842 / (LC2VFU(dirwts1[indx - 1]) + LC2VFU(dirwts1[indx + 1]));
1843 vfloat Gint1h = ULIMV(Ginthv, LC2VFU(cfa[indx - 1]), LC2VFU(cfa[indx + 1]));
1844 vfloat hwtv = vmul2f(rbintv - Ginthv) / (epsv + Ginthv + rbintv);
1845 vfloat Gint2h = vintpf(hwtv, Ginthv, Gint1h);
1846 Gint1h = vself(vmaskf_lt(vmul2f(Ginthv), rbintv), Gint1h, Gint2h);
1847 Ginthv = vself(vmaskf_lt(Ginthv, rbintv), Gint1h, Ginthv);
1848 Ginthv = vself(vmaskf_gt(Ginthv, clip_ptv),
1849 ULIMV(Ginthv, LC2VFU(cfa[indx - 1]), LC2VFU(cfa[indx + 1])), Ginthv);
1850
1851 vfloat greenv
1852 = vself(copymask, vintpf(LVFU(hvwt[indx1]), Gintvv, Ginthv), LC2VFU(rgbgreen[indx]));
1853 STC2VFU(rgbgreen[indx], greenv);
1854
1855 STVFU(Dgrb[0][indx1], vself(copymask, greenv - LC2VFU(cfa[indx]), LVFU(Dgrb[0][indx1])));
1856 }
1857 }
1858
1859 #else
1860
1861 for(int cc = 12 + (FC(rr, 2, filters) & 1), indx = rr * ts + cc, indx1 = indx >> 1; cc < cc1 - 12;
1862 cc += 2, indx += 2, indx1++)
1863 {
1864
1865 if(fabsf(0.5f - pmwt[indx >> 1]) < fabsf(0.5f - hvwt[indx >> 1]))
1866 {
1867 continue;
1868 }
1869
1870 // now interpolate G vertically/horizontally using R+B values
1871 // unfortunately, since G interpolation cannot be done diagonally this may lead to colour shifts
1872
1873 // colour ratios for G interpolation
1874 float cru = cfa[indx - v1] * 2.0 / (eps + rbint[indx1] + rbint[(indx1 - v1)]);
1875 float crd = cfa[indx + v1] * 2.0 / (eps + rbint[indx1] + rbint[(indx1 + v1)]);
1876 float crl = cfa[indx - 1] * 2.0 / (eps + rbint[indx1] + rbint[(indx1 - 1)]);
1877 float crr = cfa[indx + 1] * 2.0 / (eps + rbint[indx1] + rbint[(indx1 + 1)]);
1878
1879 // interpolation of G in four directions
1880 float gu, gd, gl, gr;
1881
1882 // interpolated G via adaptive ratios or Hamilton-Adams in each cardinal direction
1883 if(fabsf(1.f - cru) < arthresh)
1884 {
1885 gu = rbint[indx1] * cru;
1886 }
1887 else
1888 {
1889 gu = cfa[indx - v1] + xdiv2f(rbint[indx1] - rbint[(indx1 - v1)]);
1890 }
1891
1892 if(fabsf(1.f - crd) < arthresh)
1893 {
1894 gd = rbint[indx1] * crd;
1895 }
1896 else
1897 {
1898 gd = cfa[indx + v1] + xdiv2f(rbint[indx1] - rbint[(indx1 + v1)]);
1899 }
1900
1901 if(fabsf(1.f - crl) < arthresh)
1902 {
1903 gl = rbint[indx1] * crl;
1904 }
1905 else
1906 {
1907 gl = cfa[indx - 1] + xdiv2f(rbint[indx1] - rbint[(indx1 - 1)]);
1908 }
1909
1910 if(fabsf(1.f - crr) < arthresh)
1911 {
1912 gr = rbint[indx1] * crr;
1913 }
1914 else
1915 {
1916 gr = cfa[indx + 1] + xdiv2f(rbint[indx1] - rbint[(indx1 + 1)]);
1917 }
1918
1919 // interpolated G via adaptive weights of cardinal evaluations
1920 float Gintv = (dirwts0[indx - v1] * gd + dirwts0[indx + v1] * gu)
1921 / (dirwts0[indx + v1] + dirwts0[indx - v1]);
1922 float Ginth
1923 = (dirwts1[indx - 1] * gr + dirwts1[indx + 1] * gl) / (dirwts1[indx - 1] + dirwts1[indx + 1]);
1924
1925 // bound the interpolation in regions of high saturation
1926 if(Gintv < rbint[indx1])
1927 {
1928 if(2 * Gintv < rbint[indx1])
1929 {
1930 Gintv = ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]);
1931 }
1932 else
1933 {
1934 float vwt = 2.0 * (rbint[indx1] - Gintv) / (eps + Gintv + rbint[indx1]);
1935 Gintv = vwt * Gintv + (1.f - vwt) * ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]);
1936 }
1937 }
1938
1939 if(Ginth < rbint[indx1])
1940 {
1941 if(2 * Ginth < rbint[indx1])
1942 {
1943 Ginth = ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]);
1944 }
1945 else
1946 {
1947 float hwt = 2.0 * (rbint[indx1] - Ginth) / (eps + Ginth + rbint[indx1]);
1948 Ginth = hwt * Ginth + (1.f - hwt) * ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]);
1949 }
1950 }
1951
1952 if(Ginth > clip_pt)
1953 {
1954 Ginth = ULIM(Ginth, cfa[indx - 1], cfa[indx + 1]);
1955 }
1956
1957 if(Gintv > clip_pt)
1958 {
1959 Gintv = ULIM(Gintv, cfa[indx - v1], cfa[indx + v1]);
1960 }
1961
1962 rgbgreen[indx] = Ginth * (1.f - hvwt[indx1]) + Gintv * hvwt[indx1];
1963 Dgrb[0][indx >> 1] = rgbgreen[indx] - cfa[indx];
1964 }
1965
1966 #endif
1967
1968 // end of diagonal interpolation correction
1969
1970 // fancy chrominance interpolation
1971 //(ey,ex) is location of R site
1972 for(int rr = 13 - ey; rr < rr1 - 12; rr += 2)
1973 for(int indx1 = (rr * ts + 13 - ex) >> 1; indx1<(rr * ts + cc1 - 12)>> 1; indx1++)
1974 { // B coset
1975 Dgrb[1][indx1] = Dgrb[0][indx1]; // split out G-B from G-R
1976 Dgrb[0][indx1] = 0;
1977 }
1978
1979 #ifdef __SSE2__
1980 vfloat oned325v = F2V(1.325f);
1981 vfloat zd175v = F2V(0.175f);
1982 vfloat zd075v = F2V(0.075f);
1983 #endif
1984
1985 for(int rr = 14; rr < rr1 - 14; rr++)
1986 #ifdef __SSE2__
1987 for(int cc = 14 + (FC(rr, 2, filters) & 1), indx = rr * ts + cc, c = 1 - FC(rr, cc, filters) / 2;
1988 cc < cc1 - 14; cc += 8, indx += 8)
1989 {
1990 vfloat tempv = epsv + vabsf(LVFU(Dgrb[c][(indx - m1) >> 1]) - LVFU(Dgrb[c][(indx + m1) >> 1]));
1991 vfloat temp2v = epsv + vabsf(LVFU(Dgrb[c][(indx + p1) >> 1]) - LVFU(Dgrb[c][(indx - p1) >> 1]));
1992 vfloat wtnwv
1993 = onev / (tempv + vabsf(LVFU(Dgrb[c][(indx - m1) >> 1]) - LVFU(Dgrb[c][(indx - m3) >> 1]))
1994 + vabsf(LVFU(Dgrb[c][(indx + m1) >> 1]) - LVFU(Dgrb[c][(indx - m3) >> 1])));
1995 vfloat wtnev
1996 = onev / (temp2v + vabsf(LVFU(Dgrb[c][(indx + p1) >> 1]) - LVFU(Dgrb[c][(indx + p3) >> 1]))
1997 + vabsf(LVFU(Dgrb[c][(indx - p1) >> 1]) - LVFU(Dgrb[c][(indx + p3) >> 1])));
1998 vfloat wtswv
1999 = onev / (temp2v + vabsf(LVFU(Dgrb[c][(indx - p1) >> 1]) - LVFU(Dgrb[c][(indx + m3) >> 1]))
2000 + vabsf(LVFU(Dgrb[c][(indx + p1) >> 1]) - LVFU(Dgrb[c][(indx - p3) >> 1])));
2001 vfloat wtsev
2002 = onev / (tempv + vabsf(LVFU(Dgrb[c][(indx + m1) >> 1]) - LVFU(Dgrb[c][(indx - p3) >> 1]))
2003 + vabsf(LVFU(Dgrb[c][(indx - m1) >> 1]) - LVFU(Dgrb[c][(indx + m3) >> 1])));
2004
2005 STVFU(Dgrb[c][indx >> 1], (wtnwv * (oned325v * LVFU(Dgrb[c][(indx - m1) >> 1])
2006 - zd175v * LVFU(Dgrb[c][(indx - m3) >> 1])
2007 - zd075v * (LVFU(Dgrb[c][(indx - m1 - 2) >> 1])
2008 + LVFU(Dgrb[c][(indx - m1 - v2) >> 1])))
2009 + wtnev * (oned325v * LVFU(Dgrb[c][(indx + p1) >> 1])
2010 - zd175v * LVFU(Dgrb[c][(indx + p3) >> 1])
2011 - zd075v * (LVFU(Dgrb[c][(indx + p1 + 2) >> 1])
2012 + LVFU(Dgrb[c][(indx + p1 + v2) >> 1])))
2013 + wtswv * (oned325v * LVFU(Dgrb[c][(indx - p1) >> 1])
2014 - zd175v * LVFU(Dgrb[c][(indx - p3) >> 1])
2015 - zd075v * (LVFU(Dgrb[c][(indx - p1 - 2) >> 1])
2016 + LVFU(Dgrb[c][(indx - p1 - v2) >> 1])))
2017 + wtsev * (oned325v * LVFU(Dgrb[c][(indx + m1) >> 1])
2018 - zd175v * LVFU(Dgrb[c][(indx + m3) >> 1])
2019 - zd075v * (LVFU(Dgrb[c][(indx + m1 + 2) >> 1])
2020 + LVFU(Dgrb[c][(indx + m1 + v2) >> 1]))))
2021 / (wtnwv + wtnev + wtswv + wtsev));
2022 }
2023
2024 #else
2025
2026 for(int cc = 14 + (FC(rr, 2, filters) & 1), indx = rr * ts + cc, c = 1 - FC(rr, cc, filters) / 2;
2027 cc < cc1 - 14; cc += 2, indx += 2)
2028 {
2029 float wtnw = 1.f / (eps + fabsf(Dgrb[c][(indx - m1) >> 1] - Dgrb[c][(indx + m1) >> 1])
2030 + fabsf(Dgrb[c][(indx - m1) >> 1] - Dgrb[c][(indx - m3) >> 1])
2031 + fabsf(Dgrb[c][(indx + m1) >> 1] - Dgrb[c][(indx - m3) >> 1]));
2032 float wtne = 1.f / (eps + fabsf(Dgrb[c][(indx + p1) >> 1] - Dgrb[c][(indx - p1) >> 1])
2033 + fabsf(Dgrb[c][(indx + p1) >> 1] - Dgrb[c][(indx + p3) >> 1])
2034 + fabsf(Dgrb[c][(indx - p1) >> 1] - Dgrb[c][(indx + p3) >> 1]));
2035 float wtsw = 1.f / (eps + fabsf(Dgrb[c][(indx - p1) >> 1] - Dgrb[c][(indx + p1) >> 1])
2036 + fabsf(Dgrb[c][(indx - p1) >> 1] - Dgrb[c][(indx + m3) >> 1])
2037 + fabsf(Dgrb[c][(indx + p1) >> 1] - Dgrb[c][(indx - p3) >> 1]));
2038 float wtse = 1.f / (eps + fabsf(Dgrb[c][(indx + m1) >> 1] - Dgrb[c][(indx - m1) >> 1])
2039 + fabsf(Dgrb[c][(indx + m1) >> 1] - Dgrb[c][(indx - p3) >> 1])
2040 + fabsf(Dgrb[c][(indx - m1) >> 1] - Dgrb[c][(indx + m3) >> 1]));
2041
2042 Dgrb[c][indx >> 1]
2043 = (wtnw * (1.325f * Dgrb[c][(indx - m1) >> 1] - 0.175f * Dgrb[c][(indx - m3) >> 1]
2044 - 0.075f * Dgrb[c][(indx - m1 - 2) >> 1] - 0.075f * Dgrb[c][(indx - m1 - v2) >> 1])
2045 + wtne * (1.325f * Dgrb[c][(indx + p1) >> 1] - 0.175f * Dgrb[c][(indx + p3) >> 1]
2046 - 0.075f * Dgrb[c][(indx + p1 + 2) >> 1]
2047 - 0.075f * Dgrb[c][(indx + p1 + v2) >> 1])
2048 + wtsw * (1.325f * Dgrb[c][(indx - p1) >> 1] - 0.175f * Dgrb[c][(indx - p3) >> 1]
2049 - 0.075f * Dgrb[c][(indx - p1 - 2) >> 1]
2050 - 0.075f * Dgrb[c][(indx - p1 - v2) >> 1])
2051 + wtse * (1.325f * Dgrb[c][(indx + m1) >> 1] - 0.175f * Dgrb[c][(indx + m3) >> 1]
2052 - 0.075f * Dgrb[c][(indx + m1 + 2) >> 1]
2053 - 0.075f * Dgrb[c][(indx + m1 + v2) >> 1]))
2054 / (wtnw + wtne + wtsw + wtse);
2055 }
2056
2057 #endif
2058
2059 #ifdef __SSE2__
2060 int offset;
2061 vfloat twov = F2V(2.f);
2062 vmask selmask;
2063
2064 if((FC(16, 2, filters) & 1) == 1)
2065 {
2066 selmask = _mm_set_epi32(0xffffffff, 0, 0xffffffff, 0);
2067 offset = 1;
2068 }
2069 else
2070 {
2071 selmask = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);
2072 offset = 0;
2073 }
2074
2075 #endif
2076
2077 for(int rr = 16; rr < rr1 - 16; rr++)
2078 {
2079 int row = rr + top;
2080 int col = left + 16;
2081 int indx = rr * ts + 16;
2082 #ifdef __SSE2__
2083 offset = 1 - offset;
2084 selmask = vnotm(selmask);
2085
2086 for(; indx < rr * ts + cc1 - 18 - (cc1 & 1); indx += 4, col += 4)
2087 {
2088 if(col < roi_out->width && row < roi_out->height)
2089 {
2090 vfloat greenv = LVF(rgbgreen[indx]);
2091 vfloat temp00v = vdup(LVFU(hvwt[(indx - v1) >> 1]));
2092 vfloat temp01v = vdup(LVFU(hvwt[(indx + v1) >> 1]));
2093 vfloat tempv = onev / (temp00v + twov - vdup(LVFU(hvwt[(indx + 1 + offset) >> 1]))
2094 - vdup(LVFU(hvwt[(indx - 1 + offset) >> 1])) + temp01v);
2095
2096 vfloat redv1 = greenv
2097 - (temp00v * vdup(LVFU(Dgrb[0][(indx - v1) >> 1]))
2098 + (onev - vdup(LVFU(hvwt[(indx + 1 + offset) >> 1])))
2099 * vdup(LVFU(Dgrb[0][(indx + 1 + offset) >> 1]))
2100 + (onev - vdup(LVFU(hvwt[(indx - 1 + offset) >> 1])))
2101 * vdup(LVFU(Dgrb[0][(indx - 1 + offset) >> 1]))
2102 + temp01v * vdup(LVFU(Dgrb[0][(indx + v1) >> 1])))
2103 * tempv;
2104 vfloat bluev1 = greenv
2105 - (temp00v * vdup(LVFU(Dgrb[1][(indx - v1) >> 1]))
2106 + (onev - vdup(LVFU(hvwt[(indx + 1 + offset) >> 1])))
2107 * vdup(LVFU(Dgrb[1][(indx + 1 + offset) >> 1]))
2108 + (onev - vdup(LVFU(hvwt[(indx - 1 + offset) >> 1])))
2109 * vdup(LVFU(Dgrb[1][(indx - 1 + offset) >> 1]))
2110 + temp01v * vdup(LVFU(Dgrb[1][(indx + v1) >> 1])))
2111 * tempv;
2112 vfloat redv2 = greenv - vdup(LVFU(Dgrb[0][indx >> 1]));
2113 vfloat bluev2 = greenv - vdup(LVFU(Dgrb[1][indx >> 1]));
2114 __attribute__((aligned(64))) float _r[4];
2115 __attribute__((aligned(64))) float _b[4];
2116 STVF(*_r, vself(selmask, redv1, redv2));
2117 STVF(*_b, vself(selmask, bluev1, bluev2));
2118 for(int c = 0; c < 4; c++)
2119 {
2120 out[(row * roi_out->width + col + c) * 4] = clampnan(_r[c], 0.0, 1.0);
2121 out[(row * roi_out->width + col + c) * 4 + 2] = clampnan(_b[c], 0.0, 1.0);
2122 }
2123 }
2124 }
2125
2126 if(offset == 0)
2127 {
2128 for(; indx < rr * ts + cc1 - 16 - (cc1 & 1); indx++, col++)
2129 {
2130 if(col < roi_out->width && row < roi_out->height)
2131 {
2132 float temp = 1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1]
2133 - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
2134 out[(row * roi_out->width + col) * 4]
2135 = clampnan(rgbgreen[indx]
2136 - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1]
2137 + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1]
2138 + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1]
2139 + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1])
2140 * temp,
2141 0.0, 1.0);
2142 out[(row * roi_out->width + col) * 4 + 2]
2143 = clampnan(rgbgreen[indx]
2144 - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1]
2145 + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1]
2146 + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1]
2147 + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1])
2148 * temp,
2149 0.0, 1.0);
2150 }
2151
2152 indx++;
2153 col++;
2154 if(col < roi_out->width && row < roi_out->height)
2155 {
2156 out[(row * roi_out->width + col) * 4]
2157 = clampnan(rgbgreen[indx] - Dgrb[0][indx >> 1], 0.0, 1.0);
2158 out[(row * roi_out->width + col) * 4 + 2]
2159 = clampnan(rgbgreen[indx] - Dgrb[1][indx >> 1], 0.0, 1.0);
2160 }
2161 }
2162
2163 if(cc1 & 1)
2164 { // width of tile is odd
2165 if(col < roi_out->width && row < roi_out->height)
2166 {
2167 float temp = 1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1]
2168 - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
2169 out[(row * roi_out->width + col) * 4]
2170 = clampnan(rgbgreen[indx]
2171 - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1]
2172 + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1]
2173 + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1]
2174 + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1])
2175 * temp,
2176 0.0, 1.0);
2177 out[(row * roi_out->width + col) * 4 + 2]
2178 = clampnan(rgbgreen[indx]
2179 - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1]
2180 + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1]
2181 + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1]
2182 + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1])
2183 * temp,
2184 0.0, 1.0);
2185 }
2186 }
2187 }
2188 else
2189 {
2190 for(; indx < rr * ts + cc1 - 16 - (cc1 & 1); indx++, col++)
2191 {
2192 if(col < roi_out->width && row < roi_out->height)
2193 {
2194 out[(row * roi_out->width + col) * 4]
2195 = clampnan(rgbgreen[indx] - Dgrb[0][indx >> 1], 0.0, 1.0);
2196 out[(row * roi_out->width + col) * 4 + 2]
2197 = clampnan(rgbgreen[indx] - Dgrb[1][indx >> 1], 0.0, 1.0);
2198 }
2199
2200 indx++;
2201 col++;
2202 if(col < roi_out->width && row < roi_out->height)
2203 {
2204 float temp = 1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1]
2205 - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
2206 out[(row * roi_out->width + col) * 4]
2207 = clampnan(rgbgreen[indx]
2208 - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1]
2209 + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1]
2210 + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1]
2211 + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1])
2212 * temp,
2213 0.0, 1.0);
2214 out[(row * roi_out->width + col) * 4 + 2]
2215 = clampnan(rgbgreen[indx]
2216 - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1]
2217 + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1]
2218 + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1]
2219 + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1])
2220 * temp,
2221 0.0, 1.0);
2222 }
2223 }
2224
2225 if(cc1 & 1)
2226 { // width of tile is odd
2227 if(col < roi_out->width && row < roi_out->height)
2228 {
2229 out[(row * roi_out->width + col) * 4]
2230 = clampnan(rgbgreen[indx] - Dgrb[0][indx >> 1], 0.0, 1.0);
2231 out[(row * roi_out->width + col) * 4 + 2]
2232 = clampnan(rgbgreen[indx] - Dgrb[1][indx >> 1], 0.0, 1.0);
2233 }
2234 }
2235 }
2236
2237 #else
2238
2239 if((FC(rr, 2, filters) & 1) == 1)
2240 {
2241 for(; indx < rr * ts + cc1 - 16 - (cc1 & 1); indx++, col++)
2242 {
2243 if(col < roi_out->width && row < roi_out->height)
2244 {
2245 float temp = 1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1]
2246 - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
2247 out[(row * roi_out->width + col) * 4]
2248 = clampnan(rgbgreen[indx]
2249 - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1]
2250 + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1]
2251 + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1]
2252 + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1])
2253 * temp,
2254 0.0, 1.0);
2255 out[(row * roi_out->width + col) * 4 + 2]
2256 = clampnan(rgbgreen[indx]
2257 - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1]
2258 + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1]
2259 + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1]
2260 + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1])
2261 * temp,
2262 0.0, 1.0);
2263 }
2264
2265 indx++;
2266 col++;
2267 if(col < roi_out->width && row < roi_out->height)
2268 {
2269 out[(row * roi_out->width + col) * 4]
2270 = clampnan(rgbgreen[indx] - Dgrb[0][indx >> 1], 0.0, 1.0);
2271 out[(row * roi_out->width + col) * 4 + 2]
2272 = clampnan(rgbgreen[indx] - Dgrb[1][indx >> 1], 0.0, 1.0);
2273 }
2274 }
2275
2276 if(cc1 & 1)
2277 { // width of tile is odd
2278 if(col < roi_out->width && row < roi_out->height)
2279 {
2280 float temp = 1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1]
2281 - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
2282 out[(row * roi_out->width + col) * 4]
2283 = clampnan(rgbgreen[indx]
2284 - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1]
2285 + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1]
2286 + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1]
2287 + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1])
2288 * temp,
2289 0.0, 1.0);
2290 out[(row * roi_out->width + col) * 4 + 2]
2291 = clampnan(rgbgreen[indx]
2292 - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1]
2293 + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1]
2294 + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1]
2295 + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1])
2296 * temp,
2297 0.0, 1.0);
2298 }
2299 }
2300 }
2301 else
2302 {
2303 for(; indx < rr * ts + cc1 - 16 - (cc1 & 1); indx++, col++)
2304 {
2305 if(col < roi_out->width && row < roi_out->height)
2306 {
2307 out[(row * roi_out->width + col) * 4]
2308 = clampnan(rgbgreen[indx] - Dgrb[0][indx >> 1], 0.0f, 1.0f);
2309 out[(row * roi_out->width + col) * 4 + 2]
2310 = clampnan(rgbgreen[indx] - Dgrb[1][indx >> 1], 0.0f, 1.0f);
2311 }
2312
2313 indx++;
2314 col++;
2315 if(col < roi_out->width && row < roi_out->height)
2316 {
2317 float temp = 1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1]
2318 - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
2319
2320 out[(row * roi_out->width + col) * 4]
2321 = clampnan(rgbgreen[indx]
2322 - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1]
2323 + (1.0f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1]
2324 + (1.0f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1]
2325 + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1])
2326 * temp,
2327 0.0f, 1.0f);
2328
2329 out[(row * roi_out->width + col) * 4 + 2]
2330 = clampnan(rgbgreen[indx]
2331 - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1]
2332 + (1.0f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1]
2333 + (1.0f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1]
2334 + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1])
2335 * temp,
2336 0.0f, 1.0f);
2337 }
2338 }
2339
2340 if(cc1 & 1)
2341 { // width of tile is odd
2342 if(col < roi_out->width && row < roi_out->height)
2343 {
2344 out[(row * roi_out->width + col) * 4]
2345 = clampnan(rgbgreen[indx] - Dgrb[0][indx >> 1], 0.0f, 1.0f);
2346 out[(row * roi_out->width + col) * 4 + 2]
2347 = clampnan(rgbgreen[indx] - Dgrb[1][indx >> 1], 0.0f, 1.0f);
2348 }
2349 }
2350 }
2351
2352 #endif
2353 }
2354
2355 // copy smoothed results back to image matrix
2356 for(int rr = 16; rr < rr1 - 16; rr++)
2357 {
2358 int row = rr + top;
2359 int cc = 16;
2360 // TODO (darktable): we have the pixel colors interleaved so writing them in blocks using SSE2 is
2361 // not possible. or is it?
2362 // #ifdef __SSE2__
2363 //
2364 // for(; cc < cc1 - 19; cc += 4)
2365 // {
2366 // STVFU(out[(row * roi_out->width + (cc + left)) * 4 + 1], LVF(rgbgreen[rr * ts +
2367 // cc]));
2368 // }
2369 //
2370 // #endif
2371
2372 for(; cc < cc1 - 16; cc++)
2373 {
2374 int col = cc + left;
2375 int indx = rr * ts + cc;
2376 if(col < roi_out->width && row < roi_out->height)
2377 out[(row * roi_out->width + col) * 4 + 1] = clampnan(rgbgreen[indx], 0.0f, 1.0f);
2378 }
2379 }
2380 }
2381 } // end of main loop
2382
2383 // clean up
2384 free(buffer);
2385 }
2386 }
2387
2388 /*==================================================================================
2389 * end of raw therapee code
2390 *==================================================================================*/
2391
2392 // modelines: These editor modelines have been set for all relevant files by tools/update_modelines.sh
2393 // vim: shiftwidth=2 expandtab tabstop=2 cindent
2394 // kate: tab-indents: off; indent-width 2; replace-tabs on; indent-mode cstyle; remove-trailing-spaces modified;
2395