1 ////////////////////////////////////////////////////////////////
2 //
3 // AMaZE demosaic algorithm
4 // (Aliasing Minimization and Zipper Elimination)
5 //
6 // copyright (c) 2008-2010 Emil Martinec <ejmartin@uchicago.edu>
7 // optimized for speed by Ingo Weyrich
8 //
9 // incorporating ideas of Luis Sanz Rodrigues and Paul Lee
10 //
11 // code dated: May 27, 2010
12 // latest modification: Ingo Weyrich, January 25, 2016
13 //
14 // amaze.cc is free software: you can redistribute it and/or modify
15 // it under the terms of the GNU General Public License as published by
16 // the Free Software Foundation, either version 3 of the License, or
17 // (at your option) any later version.
18 //
19 // This program is distributed in the hope that it will be useful,
20 // but WITHOUT ANY WARRANTY; without even the implied warranty of
21 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 // GNU General Public License for more details.
23 //
24 // You should have received a copy of the GNU General Public License
25 // along with this program. If not, see <http://www.gnu.org/licenses/>.
26 //
27 ////////////////////////////////////////////////////////////////
28
29 #include <algorithm>
30 #include <array>
31 #include <cstring>
32 #include <memory>
33
34 #include "bayerhelper.h"
35 #include "librtprocess.h"
36 #include "rt_math.h"
37 #include "sleef.h"
38 #include "opthelper.h"
39 #include "median.h"
40 #include "StopWatch.h"
41
42 using namespace librtprocess;
43
amaze_demosaic(int raw_width,int raw_height,int winx,int winy,int winw,int winh,const float * const * rawData,float ** red,float ** green,float ** blue,const unsigned cfarray[2][2],const std::function<bool (double)> & setProgCancel,double initGain,int border,float inputScale,float outputScale,size_t chunkSize,bool measure)44 rpError amaze_demosaic(int raw_width, int raw_height, int winx, int winy, int winw, int winh, const float * const *rawData, float **red, float **green, float **blue, const unsigned cfarray[2][2], const std::function<bool(double)> &setProgCancel, double initGain, int border, float inputScale, float outputScale, size_t chunkSize, bool measure)
45 {
46 BENCHFUN
47 std::unique_ptr<StopWatch> stop;
48
49 if (measure) {
50 std::cout << "Demosaicing " << winw << "x" << winh << " image using AMaZE with " << chunkSize << " Tiles per Thread" << std::endl;
51 stop.reset(new StopWatch("amaze demosaic"));
52 }
53
54 if (!validateBayerCfa(3, cfarray)) {
55 return RP_WRONG_CFA;
56 }
57 rpError rc = RP_NO_ERROR;
58
59 double progress = 0.0;
60 setProgCancel(progress);
61
62 const int width = winw, height = winh;
63 const float clip_pt = 1.0 / initGain;
64 const float clip_pt8 = 0.8 / initGain;
65
66 // this allows to pass AMAZETS to the code. On some machines larger AMAZETS is faster
67 // If AMAZETS is undefined it will be set to 160, which is the fastest on modern x86/64 machines
68 #ifndef AMAZETS
69 #define AMAZETS 160
70 #endif
71 // Tile size; the image is processed in square tiles to lower memory requirements and facilitate multi-threading
72 // We assure that Tile size is a multiple of 32 in the range [96;992]
73 constexpr int ts = (AMAZETS & 992) < 96 ? 96 : (AMAZETS & 992);
74 constexpr int tsh = ts / 2; // half of Tile size
75
76 //offset of R pixel within a Bayer quartet
77 int ex, ey;
78
79 //determine GRBG coset; (ey,ex) is the offset of the R subarray
80 if (fc(cfarray, 0, 0) == 1) { //first pixel is G
81 if (fc(cfarray, 0, 1) == 0) {
82 ey = 0;
83 ex = 1;
84 } else {
85 ey = 1;
86 ex = 0;
87 }
88 } else {//first pixel is R or B
89 if (fc(cfarray, 0, 0) == 0) {
90 ey = 0;
91 ex = 0;
92 } else {
93 ey = 1;
94 ex = 1;
95 }
96 }
97
98 //shifts of pointer value to access pixels in vertical and diagonal directions
99 constexpr int v1 = ts, v2 = 2 * ts, v3 = 3 * ts, p1 = -ts + 1, p2 = -2 * ts + 2, p3 = -3 * ts + 3, m1 = ts + 1, m2 = 2 * ts + 2, m3 = 3 * ts + 3;
100
101 //tolerance to avoid dividing by zero
102 constexpr float eps = 1e-5, epssq = 1e-10; //tolerance to avoid dividing by zero
103
104 //adaptive ratios threshold
105 constexpr float arthresh = 0.75;
106
107 //gaussian on 5x5 quincunx, sigma=1.2
108 constexpr float gaussodd[4] = {0.14659727707323927f, 0.103592713382435f, 0.0732036125103057f, 0.0365543548389495f};
109 //nyquist texture test threshold
110 constexpr float nyqthresh = 0.5;
111 //gaussian on 5x5, sigma=1.2, multiplied with nyqthresh to save some time later in loop
112 // Is this really sigma=1.2????, seems more like sigma = 1.672
113 constexpr float gaussgrad[6] = {nyqthresh * 0.07384411893421103f, nyqthresh * 0.06207511968171489f, nyqthresh * 0.0521818194747806f,
114 nyqthresh * 0.03687419286733595f, nyqthresh * 0.03099732204057846f, nyqthresh * 0.018413194161458882f
115 };
116 //gaussian on 5x5 alt quincunx, sigma=1.5
117 constexpr float gausseven[2] = {0.13719494435797422f, 0.05640252782101291f};
118 //gaussian on quincunx grid
119 constexpr float gquinc[4] = {0.169917f, 0.108947f, 0.069855f, 0.0287182f};
120
121 typedef struct {
122 float h;
123 float v;
124 } s_hv;
125
126 #ifdef _OPENMP
127 #pragma omp parallel
128 #endif
129 {
130 int progresscounter = 0;
131
132 constexpr int cldf = 2; // factor to multiply cache line distance. 1 = 64 bytes, 2 = 128 bytes ...
133 // assign working space
134 char *buffer = (char *) calloc(14 * sizeof(float) * ts * ts + sizeof(char) * ts * tsh + 18 * cldf * 64 + 63, 1);
135 #ifdef _OPENMP
136 #pragma omp critical
137 #endif
138 {
139 if (!buffer) {
140 rc = RP_MEMORY_ERROR;
141 }
142 }
143 #ifdef _OPENMP
144 #pragma omp barrier
145 #endif
146 if (!rc) {
147 // aligned to 64 byte boundary
148 float *data = (float*)((uintptr_t(buffer) + uintptr_t(63)) / 64 * 64);
149
150 // green values
151 float *rgbgreen = data;
152 // sum of square of horizontal gradient and square of vertical gradient
153 float *delhvsqsum = rgbgreen + ts * ts + cldf * 16;
154 // gradient based directional weights for interpolation
155 float *dirwts0 = delhvsqsum + ts * ts + cldf * 16;
156 float *dirwts1 = dirwts0 + ts * ts + cldf * 16;
157 // vertically interpolated colour differences G-R, G-B
158 float *vcd = dirwts1 + ts * ts + cldf * 16;
159 // horizontally interpolated colour differences
160 float *hcd = vcd + ts * ts + cldf * 16;
161 // alternative vertical interpolation
162 float *vcdalt = hcd + ts * ts + cldf * 16;
163 // alternative horizontal interpolation
164 float *hcdalt = vcdalt + ts * ts + cldf * 16;
165 // square of average colour difference
166 float *cddiffsq = hcdalt + ts * ts + cldf * 16;
167 // weight to give horizontal vs vertical interpolation
168 float *hvwt = cddiffsq + ts * ts + 2 * cldf * 16;
169 // final interpolated colour difference
170 float (*Dgrb)[ts * tsh] = (float (*)[ts * tsh]) vcdalt; // there is no overlap in buffer usage => share
171 // gradient in plus (NE/SW) direction
172 float *delp = cddiffsq; // there is no overlap in buffer usage => share
173 // gradient in minus (NW/SE) direction
174 float *delm = delp + ts * tsh + cldf * 16;
175 // diagonal interpolation of R+B
176 float *rbint = delm; // there is no overlap in buffer usage => share
177 // difference between up/down interpolations of G
178 float *dgintv = hvwt + ts * tsh + cldf * 16;
179 // difference between left/right interpolations of G
180 float *dginth = dgintv + ts * ts + cldf * 16;
181 // horizontal and vertical curvature of interpolated G (used to refine interpolation in Nyquist texture regions)
182 s_hv *Dgrb2 = (s_hv (*)) dgintv; // there is no overlap in buffer usage => share
183 // square of diagonal colour differences
184 float *Dgrbsq1m = dginth + ts * ts + cldf * 16;
185 float *Dgrbsq1p = Dgrbsq1m + ts * tsh + cldf * 16;
186 // tile raw data
187 float *cfa = Dgrbsq1p + ts * tsh + cldf * 16;
188 // relative weight for combining plus and minus diagonal interpolations
189 float *pmwt = delhvsqsum; // there is no overlap in buffer usage => share
190 // interpolated colour difference R-B in minus and plus direction
191 float *rbm = vcd; // there is no overlap in buffer usage => share
192 float *rbp = rbm + ts * tsh + cldf * 16;
193 // nyquist texture flags 1=nyquist, 0=not nyquist
194 unsigned char *nyquist = (unsigned char (*)) (cfa + ts * ts + cldf * 16); // 1
195 unsigned char *nyquist2 = (unsigned char (*)) cddiffsq;
196 float *nyqutest = (float(*)) (nyquist + sizeof(unsigned char) * ts * tsh + cldf * 64); // 1
197
198 // Main algorithm: Tile loop
199 // use collapse(2) to collapse the 2 loops to one large loop, so there is better scaling
200 #ifdef _OPENMP
201 #pragma omp for schedule(dynamic, chunkSize) collapse(2) nowait
202 #endif
203
204 for (int top = winy - 16; top < winy + height; top += ts - 32) {
205 for (int left = winx - 16; left < winx + width; left += ts - 32) {
206 memset(&nyquist[3 * tsh], 0, sizeof(unsigned char) * (ts - 6) * tsh);
207 //location of tile bottom edge
208 int bottom = min(top + ts, winy + height + 16);
209 //location of tile right edge
210 int right = min(left + ts, winx + width + 16);
211 //tile width (=ts except for right edge of image)
212 int rr1 = bottom - top;
213 //tile height (=ts except for bottom edge of image)
214 int cc1 = right - left;
215 // bookkeeping for borders
216 // min and max row/column in the tile
217 int rrmin = top < 0 ? 16 : 0;
218 int ccmin = left < 0 ? 16 : 0;
219 //int rrmax = bottom > (winy + height) ? winy + height - top : rr1;
220 int rrmax = bottom > (raw_height) ? winy + height - top : rr1;
221 //int ccmax = right > (winx + width) ? winx + width - left : cc1;
222 int ccmax = right > (raw_width) ? winx + width - left : cc1;
223
224 // rgb from input CFA data
225 // rgb values should be floating point number between 0 and 1
226 // after white balance multipliers are applied
227 // a 16 pixel border is added to each side of the image
228
229 // begin of tile initialization
230 #ifdef __SSE2__
231 vfloat cinScalev = F2V( inputScale );
232
233 //fill upper border
234 if (rrmin > 0) {
235 for (int rr = 0; rr < 16; rr++) {
236 int row = 32 - rr + top;
237
238 for (int cc = ccmin; cc < ccmax; cc += 4) {
239 int indx1 = rr * ts + cc;
240 vfloat tempv = LVFU(rawData[row][cc + left]) / cinScalev;
241 STVF(cfa[indx1], tempv);
242 STVF(rgbgreen[indx1], tempv );
243 }
244 }
245 }
246
247 // fill inner part
248 for (int rr = rrmin; rr < rrmax; rr++) {
249 int row = rr + top;
250 int cc = ccmin;
251 for (; cc < ccmax - 3; cc += 4) {
252 int indx1 = rr * ts + cc;
253 vfloat tempv = LVFU(rawData[row][cc + left]) / cinScalev;
254 STVF(cfa[indx1], tempv );
255 STVF(rgbgreen[indx1], tempv );
256 }
257 for (; cc < ccmax; ++cc) {
258 int indx1 = rr * ts + cc;
259 float temp = rawData[row][cc + left] / inputScale;
260 cfa[indx1] = temp;
261 rgbgreen[indx1] = temp;
262 }
263 }
264
265 //fill lower border
266 if (rrmax < rr1) {
267 for (int rr = 0; rr < 16; rr++)
268 for (int cc = ccmin; cc < ccmax; cc += 4) {
269 int indx1 = (rrmax + rr) * ts + cc;
270 vfloat tempv = LVFU(rawData[(winy + height - rr - 2)][left + cc]) / cinScalev;
271 STVF(cfa[indx1], tempv );
272 STVF(rgbgreen[indx1], tempv );
273 }
274 }
275
276 #else
277
278 //fill upper border
279 if (rrmin > 0) {
280 for (int rr = 0; rr < 16; rr++)
281 for (int cc = ccmin, row = 32 - rr + top; cc < ccmax; cc++) {
282 cfa[rr * ts + cc] = (rawData[row][cc + left]) / inputScale;
283 rgbgreen[rr * ts + cc] = cfa[rr * ts + cc];
284 }
285 }
286
287 // fill inner part
288 for (int rr = rrmin; rr < rrmax; rr++) {
289 int row = rr + top;
290
291 for (int cc = ccmin; cc < ccmax; cc++) {
292 int indx1 = rr * ts + cc;
293 cfa[indx1] = (rawData[row][cc + left]) / inputScale;
294 rgbgreen[indx1] = cfa[indx1];
295 }
296 }
297
298 //fill lower border
299 if (rrmax < rr1) {
300 for (int rr = 0; rr < 16; rr++)
301 for (int cc = ccmin; cc < ccmax; cc++) {
302 cfa[(rrmax + rr)*ts + cc] = (rawData[(winy + height - rr - 2)][left + cc]) / inputScale;
303 rgbgreen[(rrmax + rr)*ts + cc] = cfa[(rrmax + rr) * ts + cc];
304 }
305 }
306
307 #endif
308
309 //fill left border
310 if (ccmin > 0) {
311 for (int rr = rrmin; rr < rrmax; rr++)
312 for (int cc = 0, row = rr + top; cc < 16; cc++) {
313 cfa[rr * ts + cc] = (rawData[row][32 - cc + left]) / inputScale;
314 rgbgreen[rr * ts + cc] = cfa[rr * ts + cc];
315 }
316 }
317
318 //fill right border
319 if (ccmax < cc1) {
320 for (int rr = rrmin; rr < rrmax; rr++)
321 for (int cc = 0; cc < 16; cc++) {
322 cfa[rr * ts + ccmax + cc] = (rawData[(top + rr)][(winx + width - cc - 2)]) / inputScale;
323 rgbgreen[rr * ts + ccmax + cc] = cfa[rr * ts + ccmax + cc];
324 }
325 }
326
327 //also, fill the image corners
328 if (rrmin > 0 && ccmin > 0) {
329 for (int rr = 0; rr < 16; rr++)
330 for (int cc = 0; cc < 16; cc++) {
331 cfa[(rr)*ts + cc] = (rawData[winy + 32 - rr][winx + 32 - cc]) / inputScale;
332 rgbgreen[(rr)*ts + cc] = cfa[(rr) * ts + cc];
333 }
334 }
335
336 if (rrmax < rr1 && ccmax < cc1) {
337 for (int rr = 0; rr < 16; rr++)
338 for (int cc = 0; cc < 16; cc++) {
339 cfa[(rrmax + rr)*ts + ccmax + cc] = (rawData[(winy + height - rr - 2)][(winx + width - cc - 2)]) / inputScale;
340 rgbgreen[(rrmax + rr)*ts + ccmax + cc] = cfa[(rrmax + rr) * ts + ccmax + cc];
341 }
342 }
343
344 if (rrmin > 0 && ccmax < cc1) {
345 for (int rr = 0; rr < 16; rr++)
346 for (int cc = 0; cc < 16; cc++) {
347 cfa[(rr)*ts + ccmax + cc] = (rawData[(winy + 32 - rr)][(winx + width - cc - 2)]) / inputScale;
348 rgbgreen[(rr)*ts + ccmax + cc] = cfa[(rr) * ts + ccmax + cc];
349 }
350 }
351
352 if (rrmax < rr1 && ccmin > 0) {
353 for (int rr = 0; rr < 16; rr++)
354 for (int cc = 0; cc < 16; cc++) {
355 cfa[(rrmax + rr)*ts + cc] = (rawData[(winy + height - rr - 2)][(winx + 32 - cc)]) / inputScale;
356 rgbgreen[(rrmax + rr)*ts + cc] = cfa[(rrmax + rr) * ts + cc];
357 }
358 }
359
360 // end of tile initialization
361
362 // horizontal and vertical gradients
363 #ifdef __SSE2__
364 vfloat epsv = F2V( eps );
365
366 for (int rr = 2; rr < rr1 - 2; rr++) {
367 for (int indx = rr * ts; indx < rr * ts + cc1; indx += 4) {
368 vfloat delhv = vabsf( LVFU( cfa[indx + 1] ) - LVFU( cfa[indx - 1] ) );
369 vfloat delvv = vabsf( LVF( cfa[indx + v1] ) - LVF( cfa[indx - v1] ) );
370 STVF(dirwts1[indx], epsv + vabsf( LVFU( cfa[indx + 2] ) - LVF( cfa[indx] )) + vabsf( LVF( cfa[indx] ) - LVFU( cfa[indx - 2] )) + delhv );
371 STVF(dirwts0[indx], epsv + vabsf( LVF( cfa[indx + v2] ) - LVF( cfa[indx] )) + vabsf( LVF( cfa[indx] ) - LVF( cfa[indx - v2] )) + delvv );
372 STVF(delhvsqsum[indx], SQRV(delhv) + SQRV(delvv));
373 }
374 }
375
376 #else
377
378 for (int rr = 2; rr < rr1 - 2; rr++)
379 for (int cc = 2, indx = (rr) * ts + cc; cc < cc1 - 2; cc++, indx++) {
380 float delh = fabsf(cfa[indx + 1] - cfa[indx - 1]);
381 float delv = fabsf(cfa[indx + v1] - cfa[indx - v1]);
382 dirwts0[indx] = eps + fabsf(cfa[indx + v2] - cfa[indx]) + fabsf(cfa[indx] - cfa[indx - v2]) + delv;
383 dirwts1[indx] = eps + fabsf(cfa[indx + 2] - cfa[indx]) + fabsf(cfa[indx] - cfa[indx - 2]) + delh;
384 delhvsqsum[indx] = SQR(delh) + SQR(delv);
385 }
386
387 #endif
388
389 //interpolate vertical and horizontal colour differences
390 #ifdef __SSE2__
391 vfloat sgnv;
392
393 if( !(fc(cfarray, 4, 4) & 1) ) {
394 sgnv = _mm_set_ps( 1.f, -1.f, 1.f, -1.f );
395 } else {
396 sgnv = _mm_set_ps( -1.f, 1.f, -1.f, 1.f );
397 }
398
399 vfloat zd5v = F2V( 0.5f );
400 vfloat onev = F2V( 1.f );
401 vfloat arthreshv = F2V( arthresh );
402 vfloat clip_pt8v = F2V( clip_pt8 );
403
404 for (int rr = 4; rr < rr1 - 4; rr++) {
405 sgnv = -sgnv;
406
407 for (int indx = rr * ts + 4; indx < rr * ts + cc1 - 7; indx += 4) {
408 //colour ratios in each cardinal direction
409 vfloat cfav = LVF(cfa[indx]);
410 vfloat cruv = LVF(cfa[indx - v1]) * (LVF(dirwts0[indx - v2]) + LVF(dirwts0[indx])) / (LVF(dirwts0[indx - v2]) * (epsv + cfav) + LVF(dirwts0[indx]) * (epsv + LVF(cfa[indx - v2])));
411 vfloat crdv = LVF(cfa[indx + v1]) * (LVF(dirwts0[indx + v2]) + LVF(dirwts0[indx])) / (LVF(dirwts0[indx + v2]) * (epsv + cfav) + LVF(dirwts0[indx]) * (epsv + LVF(cfa[indx + v2])));
412 vfloat crlv = LVFU(cfa[indx - 1]) * (LVFU(dirwts1[indx - 2]) + LVF(dirwts1[indx])) / (LVFU(dirwts1[indx - 2]) * (epsv + cfav) + LVF(dirwts1[indx]) * (epsv + LVFU(cfa[indx - 2])));
413 vfloat crrv = LVFU(cfa[indx + 1]) * (LVFU(dirwts1[indx + 2]) + LVF(dirwts1[indx])) / (LVFU(dirwts1[indx + 2]) * (epsv + cfav) + LVF(dirwts1[indx]) * (epsv + LVFU(cfa[indx + 2])));
414
415 //G interpolated in vert/hor directions using Hamilton-Adams method
416 vfloat guhav = LVF(cfa[indx - v1]) + zd5v * (cfav - LVF(cfa[indx - v2]));
417 vfloat gdhav = LVF(cfa[indx + v1]) + zd5v * (cfav - LVF(cfa[indx + v2]));
418 vfloat glhav = LVFU(cfa[indx - 1]) + zd5v * (cfav - LVFU(cfa[indx - 2]));
419 vfloat grhav = LVFU(cfa[indx + 1]) + zd5v * (cfav - LVFU(cfa[indx + 2]));
420
421 //G interpolated in vert/hor directions using adaptive ratios
422 vfloat guarv = vself(vmaskf_lt(vabsf(onev - cruv), arthreshv), cfav * cruv, guhav);
423 vfloat gdarv = vself(vmaskf_lt(vabsf(onev - crdv), arthreshv), cfav * crdv, gdhav);
424 vfloat glarv = vself(vmaskf_lt(vabsf(onev - crlv), arthreshv), cfav * crlv, glhav);
425 vfloat grarv = vself(vmaskf_lt(vabsf(onev - crrv), arthreshv), cfav * crrv, grhav);
426
427 //adaptive weights for vertical/horizontal directions
428 vfloat hwtv = LVFU(dirwts1[indx - 1]) / (LVFU(dirwts1[indx - 1]) + LVFU(dirwts1[indx + 1]));
429 vfloat vwtv = LVF(dirwts0[indx - v1]) / (LVF(dirwts0[indx + v1]) + LVF(dirwts0[indx - v1]));
430
431 //interpolated G via adaptive weights of cardinal evaluations
432 vfloat Ginthhav = vintpf(hwtv, grhav, glhav);
433 vfloat Gintvhav = vintpf(vwtv, gdhav, guhav);
434
435 //interpolated colour differences
436 vfloat hcdaltv = sgnv * (Ginthhav - cfav);
437 vfloat vcdaltv = sgnv * (Gintvhav - cfav);
438 STVF(hcdalt[indx], hcdaltv);
439 STVF(vcdalt[indx], vcdaltv);
440
441 vmask clipmask = vorm( vorm( vmaskf_gt( cfav, clip_pt8v ), vmaskf_gt( Gintvhav, clip_pt8v ) ), vmaskf_gt( Ginthhav, clip_pt8v ));
442 guarv = vself( clipmask, guhav, guarv);
443 gdarv = vself( clipmask, gdhav, gdarv);
444 glarv = vself( clipmask, glhav, glarv);
445 grarv = vself( clipmask, grhav, grarv);
446
447 //use HA if highlights are (nearly) clipped
448 STVF(vcd[indx], vself( clipmask, vcdaltv, sgnv * (vintpf(vwtv, gdarv, guarv) - cfav)));
449 STVF(hcd[indx], vself( clipmask, hcdaltv, sgnv * (vintpf(hwtv, grarv, glarv) - cfav)));
450
451 //differences of interpolations in opposite directions
452 STVF(dgintv[indx], vminf(SQRV(guhav - gdhav), SQRV(guarv - gdarv)));
453 STVF(dginth[indx], vminf(SQRV(glhav - grhav), SQRV(glarv - grarv)));
454 }
455 }
456
457 #else
458
459 for (int rr = 4; rr < rr1 - 4; rr++) {
460 bool fcswitch = fc(cfarray, rr, 4) & 1;
461
462 for (int cc = 4, indx = rr * ts + cc; cc < cc1 - 4; cc++, indx++) {
463
464 //colour ratios in each cardinal direction
465 float cru = cfa[indx - v1] * (dirwts0[indx - v2] + dirwts0[indx]) / (dirwts0[indx - v2] * (eps + cfa[indx]) + dirwts0[indx] * (eps + cfa[indx - v2]));
466 float crd = cfa[indx + v1] * (dirwts0[indx + v2] + dirwts0[indx]) / (dirwts0[indx + v2] * (eps + cfa[indx]) + dirwts0[indx] * (eps + cfa[indx + v2]));
467 float crl = cfa[indx - 1] * (dirwts1[indx - 2] + dirwts1[indx]) / (dirwts1[indx - 2] * (eps + cfa[indx]) + dirwts1[indx] * (eps + cfa[indx - 2]));
468 float crr = cfa[indx + 1] * (dirwts1[indx + 2] + dirwts1[indx]) / (dirwts1[indx + 2] * (eps + cfa[indx]) + dirwts1[indx] * (eps + cfa[indx + 2]));
469
470 //G interpolated in vert/hor directions using Hamilton-Adams method
471 float guha = cfa[indx - v1] + xdiv2f(cfa[indx] - cfa[indx - v2]);
472 float gdha = cfa[indx + v1] + xdiv2f(cfa[indx] - cfa[indx + v2]);
473 float glha = cfa[indx - 1] + xdiv2f(cfa[indx] - cfa[indx - 2]);
474 float grha = cfa[indx + 1] + xdiv2f(cfa[indx] - cfa[indx + 2]);
475
476 //G interpolated in vert/hor directions using adaptive ratios
477 float guar, gdar, glar, grar;
478
479 if (fabsf(1.f - cru) < arthresh) {
480 guar = cfa[indx] * cru;
481 } else {
482 guar = guha;
483 }
484
485 if (fabsf(1.f - crd) < arthresh) {
486 gdar = cfa[indx] * crd;
487 } else {
488 gdar = gdha;
489 }
490
491 if (fabsf(1.f - crl) < arthresh) {
492 glar = cfa[indx] * crl;
493 } else {
494 glar = glha;
495 }
496
497 if (fabsf(1.f - crr) < arthresh) {
498 grar = cfa[indx] * crr;
499 } else {
500 grar = grha;
501 }
502
503 //adaptive weights for vertical/horizontal directions
504 float hwt = dirwts1[indx - 1] / (dirwts1[indx - 1] + dirwts1[indx + 1]);
505 float vwt = dirwts0[indx - v1] / (dirwts0[indx + v1] + dirwts0[indx - v1]);
506
507 //interpolated G via adaptive weights of cardinal evaluations
508 float Gintvha = vwt * gdha + (1.f - vwt) * guha;
509 float Ginthha = hwt * grha + (1.f - hwt) * glha;
510
511 //interpolated colour differences
512 if (fcswitch) {
513 vcd[indx] = cfa[indx] - (vwt * gdar + (1.f - vwt) * guar);
514 hcd[indx] = cfa[indx] - (hwt * grar + (1.f - hwt) * glar);
515 vcdalt[indx] = cfa[indx] - Gintvha;
516 hcdalt[indx] = cfa[indx] - Ginthha;
517 } else {
518 //interpolated colour differences
519 vcd[indx] = (vwt * gdar + (1.f - vwt) * guar) - cfa[indx];
520 hcd[indx] = (hwt * grar + (1.f - hwt) * glar) - cfa[indx];
521 vcdalt[indx] = Gintvha - cfa[indx];
522 hcdalt[indx] = Ginthha - cfa[indx];
523 }
524
525 fcswitch = !fcswitch;
526
527 if (cfa[indx] > clip_pt8 || Gintvha > clip_pt8 || Ginthha > clip_pt8) {
528 //use HA if highlights are (nearly) clipped
529 guar = guha;
530 gdar = gdha;
531 glar = glha;
532 grar = grha;
533 vcd[indx] = vcdalt[indx];
534 hcd[indx] = hcdalt[indx];
535 }
536
537 //differences of interpolations in opposite directions
538 dgintv[indx] = min(SQR(guha - gdha), SQR(guar - gdar));
539 dginth[indx] = min(SQR(glha - grha), SQR(glar - grar));
540
541 }
542 }
543
544 #endif
545
546
547
548 #ifdef __SSE2__
549 vfloat clip_ptv = F2V( clip_pt );
550 vfloat sgn3v;
551
552 if( !(fc(cfarray, 4, 4) & 1) ) {
553 sgnv = _mm_set_ps( 1.f, -1.f, 1.f, -1.f );
554 } else {
555 sgnv = _mm_set_ps( -1.f, 1.f, -1.f, 1.f );
556 }
557
558 sgn3v = sgnv + sgnv + sgnv;
559
560 for (int rr = 4; rr < rr1 - 4; rr++) {
561 vfloat nsgnv = sgnv;
562 sgnv = -sgnv;
563 sgn3v = -sgn3v;
564
565 for (int indx = rr * ts + 4; indx < rr * ts + cc1 - 4; indx += 4) {
566 vfloat hcdv = LVF( hcd[indx] );
567 vfloat hcdvarv = SQRV(LVFU(hcd[indx - 2]) - hcdv) + SQRV(LVFU(hcd[indx - 2]) - LVFU(hcd[indx + 2])) + SQRV(hcdv - LVFU(hcd[indx + 2]));
568 vfloat hcdaltv = LVF( hcdalt[indx] );
569 vfloat hcdaltvarv = SQRV(LVFU(hcdalt[indx - 2]) - hcdaltv) + SQRV(LVFU(hcdalt[indx - 2]) - LVFU(hcdalt[indx + 2])) + SQRV(hcdaltv - LVFU(hcdalt[indx + 2]));
570 vfloat vcdv = LVF( vcd[indx] );
571 vfloat vcdvarv = SQRV(LVF(vcd[indx - v2]) - vcdv) + SQRV(LVF(vcd[indx - v2]) - LVF(vcd[indx + v2])) + SQRV(vcdv - LVF(vcd[indx + v2]));
572 vfloat vcdaltv = LVF( vcdalt[indx] );
573 vfloat vcdaltvarv = SQRV(LVF(vcdalt[indx - v2]) - vcdaltv) + SQRV(LVF(vcdalt[indx - v2]) - LVF(vcdalt[indx + v2])) + SQRV(vcdaltv - LVF(vcdalt[indx + v2]));
574
575 //choose the smallest variance; this yields a smoother interpolation
576 hcdv = vself( vmaskf_lt( hcdaltvarv, hcdvarv ), hcdaltv, hcdv);
577 vcdv = vself( vmaskf_lt( vcdaltvarv, vcdvarv ), vcdaltv, vcdv);
578
579 //bound the interpolation in regions of high saturation
580 //vertical and horizontal G interpolations
581 vfloat Ginthv = sgnv * hcdv + LVF( cfa[indx] );
582 vfloat temp2v = sgn3v * hcdv;
583 vfloat hwtv = onev + temp2v / ( epsv + Ginthv + LVF( cfa[indx]));
584 vmask hcdmask = vmaskf_gt( nsgnv * hcdv, ZEROV );
585 vfloat hcdoldv = hcdv;
586 vfloat tempv = nsgnv * (LVF(cfa[indx]) - median( Ginthv, LVFU(cfa[indx - 1]), LVFU(cfa[indx + 1]) ));
587 hcdv = vself( vmaskf_lt( temp2v, -(LVF(cfa[indx]) + Ginthv)), tempv, vintpf(hwtv, hcdv, tempv));
588 hcdv = vself( hcdmask, hcdv, hcdoldv );
589 hcdv = vself( vmaskf_gt( Ginthv, clip_ptv), tempv, hcdv);
590 STVF(hcd[indx], hcdv);
591
592 vfloat Gintvv = sgnv * vcdv + LVF( cfa[indx] );
593 temp2v = sgn3v * vcdv;
594 vfloat vwtv = onev + temp2v / ( epsv + Gintvv + LVF( cfa[indx]));
595 vmask vcdmask = vmaskf_gt( nsgnv * vcdv, ZEROV );
596 vfloat vcdoldv = vcdv;
597 tempv = nsgnv * (LVF(cfa[indx]) - median( Gintvv, LVF(cfa[indx - v1]), LVF(cfa[indx + v1]) ));
598 vcdv = vself( vmaskf_lt( temp2v, -(LVF(cfa[indx]) + Gintvv)), tempv, vintpf(vwtv, vcdv, tempv));
599 vcdv = vself( vcdmask, vcdv, vcdoldv );
600 vcdv = vself( vmaskf_gt( Gintvv, clip_ptv), tempv, vcdv);
601 STVF(vcd[indx], vcdv);
602 STVFU(cddiffsq[indx], SQRV(vcdv - hcdv));
603 }
604
605 }
606
607 #else
608
609 for (int rr = 4; rr < rr1 - 4; rr++) {
610 for (int cc = 4, indx = rr * ts + cc, c = fc(cfarray, rr, cc) & 1; cc < cc1 - 4; cc++, indx++) {
611 float hcdvar = 3.f * (SQR(hcd[indx - 2]) + SQR(hcd[indx]) + SQR(hcd[indx + 2])) - SQR(hcd[indx - 2] + hcd[indx] + hcd[indx + 2]);
612 float hcdaltvar = 3.f * (SQR(hcdalt[indx - 2]) + SQR(hcdalt[indx]) + SQR(hcdalt[indx + 2])) - SQR(hcdalt[indx - 2] + hcdalt[indx] + hcdalt[indx + 2]);
613 float vcdvar = 3.f * (SQR(vcd[indx - v2]) + SQR(vcd[indx]) + SQR(vcd[indx + v2])) - SQR(vcd[indx - v2] + vcd[indx] + vcd[indx + v2]);
614 float vcdaltvar = 3.f * (SQR(vcdalt[indx - v2]) + SQR(vcdalt[indx]) + SQR(vcdalt[indx + v2])) - SQR(vcdalt[indx - v2] + vcdalt[indx] + vcdalt[indx + v2]);
615
616 //choose the smallest variance; this yields a smoother interpolation
617 if (hcdaltvar < hcdvar) {
618 hcd[indx] = hcdalt[indx];
619 }
620
621 if (vcdaltvar < vcdvar) {
622 vcd[indx] = vcdalt[indx];
623 }
624
625 //bound the interpolation in regions of high saturation
626 //vertical and horizontal G interpolations
627 float Gintv, Ginth;
628
629 if (c) {//G site
630 Ginth = -hcd[indx] + cfa[indx]; //R or B
631 Gintv = -vcd[indx] + cfa[indx]; //B or R
632
633 if (hcd[indx] > 0) {
634 if (3.f * hcd[indx] > (Ginth + cfa[indx])) {
635 hcd[indx] = -median(Ginth, cfa[indx - 1], cfa[indx + 1]) + cfa[indx];
636 } else {
637 float hwt = 1.f - 3.f * hcd[indx] / (eps + Ginth + cfa[indx]);
638 hcd[indx] = hwt * hcd[indx] + (1.f - hwt) * (-median(Ginth, cfa[indx - 1], cfa[indx + 1]) + cfa[indx]);
639 }
640 }
641
642 if (vcd[indx] > 0) {
643 if (3.f * vcd[indx] > (Gintv + cfa[indx])) {
644 vcd[indx] = -median(Gintv, cfa[indx - v1], cfa[indx + v1]) + cfa[indx];
645 } else {
646 float vwt = 1.f - 3.f * vcd[indx] / (eps + Gintv + cfa[indx]);
647 vcd[indx] = vwt * vcd[indx] + (1.f - vwt) * (-median(Gintv, cfa[indx - v1], cfa[indx + v1]) + cfa[indx]);
648 }
649 }
650
651 if (Ginth > clip_pt) {
652 hcd[indx] = -median(Ginth, cfa[indx - 1], cfa[indx + 1]) + cfa[indx];
653 }
654
655 if (Gintv > clip_pt) {
656 vcd[indx] = -median(Gintv, cfa[indx - v1], cfa[indx + v1]) + cfa[indx];
657 }
658
659
660 } else {//R or B site
661
662 Ginth = hcd[indx] + cfa[indx]; //interpolated G
663 Gintv = vcd[indx] + cfa[indx];
664
665 if (hcd[indx] < 0) {
666 if (3.f * hcd[indx] < -(Ginth + cfa[indx])) {
667 hcd[indx] = median(Ginth, cfa[indx - 1], cfa[indx + 1]) - cfa[indx];
668 } else {
669 float hwt = 1.f + 3.f * hcd[indx] / (eps + Ginth + cfa[indx]);
670 hcd[indx] = hwt * hcd[indx] + (1.f - hwt) * (median(Ginth, cfa[indx - 1], cfa[indx + 1]) - cfa[indx]);
671 }
672 }
673
674 if (vcd[indx] < 0) {
675 if (3.f * vcd[indx] < -(Gintv + cfa[indx])) {
676 vcd[indx] = median(Gintv, cfa[indx - v1], cfa[indx + v1]) - cfa[indx];
677 } else {
678 float vwt = 1.f + 3.f * vcd[indx] / (eps + Gintv + cfa[indx]);
679 vcd[indx] = vwt * vcd[indx] + (1.f - vwt) * (median(Gintv, cfa[indx - v1], cfa[indx + v1]) - cfa[indx]);
680 }
681 }
682
683 if (Ginth > clip_pt) {
684 hcd[indx] = median(Ginth, cfa[indx - 1], cfa[indx + 1]) - cfa[indx];
685 }
686
687 if (Gintv > clip_pt) {
688 vcd[indx] = median(Gintv, cfa[indx - v1], cfa[indx + v1]) - cfa[indx];
689 }
690
691 cddiffsq[indx] = SQR(vcd[indx] - hcd[indx]);
692 }
693
694 c = !c;
695 }
696 }
697
698 #endif
699
700
701
702 #ifdef __SSE2__
703 vfloat epssqv = F2V( epssq );
704
705 for (int rr = 6; rr < rr1 - 6; rr++) {
706 for (int indx = rr * ts + 6 + (fc(cfarray, rr, 2) & 1); indx < rr * ts + cc1 - 6; indx += 8) {
707 //compute colour difference variances in cardinal directions
708 vfloat tempv = LC2VFU(vcd[indx]);
709 vfloat uavev = tempv + LC2VFU(vcd[indx - v1]) + LC2VFU(vcd[indx - v2]) + LC2VFU(vcd[indx - v3]);
710 vfloat davev = tempv + LC2VFU(vcd[indx + v1]) + LC2VFU(vcd[indx + v2]) + LC2VFU(vcd[indx + v3]);
711 vfloat Dgrbvvaruv = SQRV(tempv - uavev) + SQRV(LC2VFU(vcd[indx - v1]) - uavev) + SQRV(LC2VFU(vcd[indx - v2]) - uavev) + SQRV(LC2VFU(vcd[indx - v3]) - uavev);
712 vfloat Dgrbvvardv = SQRV(tempv - davev) + SQRV(LC2VFU(vcd[indx + v1]) - davev) + SQRV(LC2VFU(vcd[indx + v2]) - davev) + SQRV(LC2VFU(vcd[indx + v3]) - davev);
713
714 vfloat hwtv = vadivapb(LC2VFU(dirwts1[indx - 1]), LC2VFU(dirwts1[indx + 1]));
715 vfloat vwtv = vadivapb(LC2VFU(dirwts0[indx - v1]), LC2VFU(dirwts0[indx + v1]));
716
717 tempv = LC2VFU(hcd[indx]);
718 vfloat lavev = tempv + vaddc2vfu(hcd[indx - 3]) + LC2VFU(hcd[indx - 1]);
719 vfloat ravev = tempv + vaddc2vfu(hcd[indx + 1]) + LC2VFU(hcd[indx + 3]);
720
721 vfloat Dgrbhvarlv = SQRV(tempv - lavev) + SQRV(LC2VFU(hcd[indx - 1]) - lavev) + SQRV(LC2VFU(hcd[indx - 2]) - lavev) + SQRV(LC2VFU(hcd[indx - 3]) - lavev);
722 vfloat Dgrbhvarrv = SQRV(tempv - ravev) + SQRV(LC2VFU(hcd[indx + 1]) - ravev) + SQRV(LC2VFU(hcd[indx + 2]) - ravev) + SQRV(LC2VFU(hcd[indx + 3]) - ravev);
723
724
725 vfloat vcdvarv = epssqv + vintpf(vwtv, Dgrbvvardv, Dgrbvvaruv);
726 vfloat hcdvarv = epssqv + vintpf(hwtv, Dgrbhvarrv, Dgrbhvarlv);
727
728 //compute fluctuations in up/down and left/right interpolations of colours
729 Dgrbvvaruv = LC2VFU(dgintv[indx - v1]) + LC2VFU(dgintv[indx - v2]);
730 Dgrbvvardv = LC2VFU(dgintv[indx + v1]) + LC2VFU(dgintv[indx + v2]);
731
732 Dgrbhvarlv = vaddc2vfu(dginth[indx - 2]);
733 Dgrbhvarrv = vaddc2vfu(dginth[indx + 1]);
734
735 vfloat vcdvar1v = epssqv + LC2VFU(dgintv[indx]) + vintpf(vwtv, Dgrbvvardv, Dgrbvvaruv);
736 vfloat hcdvar1v = epssqv + LC2VFU(dginth[indx]) + vintpf(hwtv, Dgrbhvarrv, Dgrbhvarlv);
737
738 //determine adaptive weights for G interpolation
739 vfloat varwtv = hcdvarv / (vcdvarv + hcdvarv);
740 vfloat diffwtv = hcdvar1v / (vcdvar1v + hcdvar1v);
741
742 //if both agree on interpolation direction, choose the one with strongest directional discrimination;
743 //otherwise, choose the u/d and l/r difference fluctuation weights
744 vmask decmask = vandm( vmaskf_gt( (zd5v - varwtv) * (zd5v - diffwtv), ZEROV ), vmaskf_lt( vabsf( zd5v - diffwtv), vabsf( zd5v - varwtv) ) );
745 STVFU(hvwt[indx >> 1], vself( decmask, varwtv, diffwtv));
746 }
747 }
748
749 #else
750
751 for (int rr = 6; rr < rr1 - 6; rr++) {
752 for (int cc = 6 + (fc(cfarray, rr, 2) & 1), indx = rr * ts + cc; cc < cc1 - 6; cc += 2, indx += 2) {
753
754 //compute colour difference variances in cardinal directions
755
756 float uave = vcd[indx] + vcd[indx - v1] + vcd[indx - v2] + vcd[indx - v3];
757 float dave = vcd[indx] + vcd[indx + v1] + vcd[indx + v2] + vcd[indx + v3];
758 float lave = hcd[indx] + hcd[indx - 1] + hcd[indx - 2] + hcd[indx - 3];
759 float rave = hcd[indx] + hcd[indx + 1] + hcd[indx + 2] + hcd[indx + 3];
760
761 //colour difference (G-R or G-B) variance in up/down/left/right directions
762 float Dgrbvvaru = SQR(vcd[indx] - uave) + SQR(vcd[indx - v1] - uave) + SQR(vcd[indx - v2] - uave) + SQR(vcd[indx - v3] - uave);
763 float Dgrbvvard = SQR(vcd[indx] - dave) + SQR(vcd[indx + v1] - dave) + SQR(vcd[indx + v2] - dave) + SQR(vcd[indx + v3] - dave);
764 float Dgrbhvarl = SQR(hcd[indx] - lave) + SQR(hcd[indx - 1] - lave) + SQR(hcd[indx - 2] - lave) + SQR(hcd[indx - 3] - lave);
765 float Dgrbhvarr = SQR(hcd[indx] - rave) + SQR(hcd[indx + 1] - rave) + SQR(hcd[indx + 2] - rave) + SQR(hcd[indx + 3] - rave);
766
767 float hwt = dirwts1[indx - 1] / (dirwts1[indx - 1] + dirwts1[indx + 1]);
768 float vwt = dirwts0[indx - v1] / (dirwts0[indx + v1] + dirwts0[indx - v1]);
769
770 float vcdvar = epssq + vwt * Dgrbvvard + (1.f - vwt) * Dgrbvvaru;
771 float hcdvar = epssq + hwt * Dgrbhvarr + (1.f - hwt) * Dgrbhvarl;
772
773 //compute fluctuations in up/down and left/right interpolations of colours
774 Dgrbvvaru = (dgintv[indx]) + (dgintv[indx - v1]) + (dgintv[indx - v2]);
775 Dgrbvvard = (dgintv[indx]) + (dgintv[indx + v1]) + (dgintv[indx + v2]);
776 Dgrbhvarl = (dginth[indx]) + (dginth[indx - 1]) + (dginth[indx - 2]);
777 Dgrbhvarr = (dginth[indx]) + (dginth[indx + 1]) + (dginth[indx + 2]);
778
779 float vcdvar1 = epssq + vwt * Dgrbvvard + (1.f - vwt) * Dgrbvvaru;
780 float hcdvar1 = epssq + hwt * Dgrbhvarr + (1.f - hwt) * Dgrbhvarl;
781
782 //determine adaptive weights for G interpolation
783 float varwt = hcdvar / (vcdvar + hcdvar);
784 float diffwt = hcdvar1 / (vcdvar1 + hcdvar1);
785
786 //if both agree on interpolation direction, choose the one with strongest directional discrimination;
787 //otherwise, choose the u/d and l/r difference fluctuation weights
788 if ((0.5 - varwt) * (0.5 - diffwt) > 0 && fabsf(0.5 - diffwt) < fabsf(0.5 - varwt)) {
789 hvwt[indx >> 1] = varwt;
790 } else {
791 hvwt[indx >> 1] = diffwt;
792 }
793 }
794 }
795
796 #endif
797
798 #ifdef __SSE2__
799 vfloat gaussg0 = F2V(gaussgrad[0]);
800 vfloat gaussg1 = F2V(gaussgrad[1]);
801 vfloat gaussg2 = F2V(gaussgrad[2]);
802 vfloat gaussg3 = F2V(gaussgrad[3]);
803 vfloat gaussg4 = F2V(gaussgrad[4]);
804 vfloat gaussg5 = F2V(gaussgrad[5]);
805 vfloat gausso0 = F2V(gaussodd[0]);
806 vfloat gausso1 = F2V(gaussodd[1]);
807 vfloat gausso2 = F2V(gaussodd[2]);
808 vfloat gausso3 = F2V(gaussodd[3]);
809
810 #endif
811
812 // precompute nyquist
813 for (int rr = 6; rr < rr1 - 6; rr++) {
814 int cc = 6 + (fc(cfarray, rr, 2) & 1);
815 int indx = rr * ts + cc;
816
817 #ifdef __SSE2__
818
819 for (; cc < cc1 - 7; cc += 8, indx += 8) {
820 vfloat valv = (gausso0 * LC2VFU(cddiffsq[indx]) +
821 gausso1 * (LC2VFU(cddiffsq[(indx - m1)]) + LC2VFU(cddiffsq[(indx + p1)]) +
822 LC2VFU(cddiffsq[(indx - p1)]) + LC2VFU(cddiffsq[(indx + m1)])) +
823 gausso2 * (LC2VFU(cddiffsq[(indx - v2)]) + LC2VFU(cddiffsq[(indx - 2)]) +
824 LC2VFU(cddiffsq[(indx + 2)]) + LC2VFU(cddiffsq[(indx + v2)])) +
825 gausso3 * (LC2VFU(cddiffsq[(indx - m2)]) + LC2VFU(cddiffsq[(indx + p2)]) +
826 LC2VFU(cddiffsq[(indx - p2)]) + LC2VFU(cddiffsq[(indx + m2)]))) -
827 (gaussg0 * LC2VFU(delhvsqsum[indx]) +
828 gaussg1 * (LC2VFU(delhvsqsum[indx - v1]) + LC2VFU(delhvsqsum[indx - 1]) +
829 LC2VFU(delhvsqsum[indx + 1]) + LC2VFU(delhvsqsum[indx + v1])) +
830 gaussg2 * (LC2VFU(delhvsqsum[indx - m1]) + LC2VFU(delhvsqsum[indx + p1]) +
831 LC2VFU(delhvsqsum[indx - p1]) + LC2VFU(delhvsqsum[indx + m1])) +
832 gaussg3 * (LC2VFU(delhvsqsum[indx - v2]) + LC2VFU(delhvsqsum[indx - 2]) +
833 LC2VFU(delhvsqsum[indx + 2]) + LC2VFU(delhvsqsum[indx + v2])) +
834 gaussg4 * (LC2VFU(delhvsqsum[indx - v2 - 1]) + LC2VFU(delhvsqsum[indx - v2 + 1]) +
835 LC2VFU(delhvsqsum[indx - ts - 2]) + LC2VFU(delhvsqsum[indx - ts + 2]) +
836 LC2VFU(delhvsqsum[indx + ts - 2]) + LC2VFU(delhvsqsum[indx + ts + 2]) +
837 LC2VFU(delhvsqsum[indx + v2 - 1]) + LC2VFU(delhvsqsum[indx + v2 + 1])) +
838 gaussg5 * (LC2VFU(delhvsqsum[indx - m2]) + LC2VFU(delhvsqsum[indx + p2]) +
839 LC2VFU(delhvsqsum[indx - p2]) + LC2VFU(delhvsqsum[indx + m2])));
840 STVFU(nyqutest[indx >> 1], valv);
841
842 }
843
844 #endif
845
846 for (; cc < cc1 - 6; cc += 2, indx += 2) {
847 nyqutest[indx >> 1] = (gaussodd[0] * cddiffsq[indx] +
848 gaussodd[1] * (cddiffsq[(indx - m1)] + cddiffsq[(indx + p1)] +
849 cddiffsq[(indx - p1)] + cddiffsq[(indx + m1)]) +
850 gaussodd[2] * (cddiffsq[(indx - v2)] + cddiffsq[(indx - 2)] +
851 cddiffsq[(indx + 2)] + cddiffsq[(indx + v2)]) +
852 gaussodd[3] * (cddiffsq[(indx - m2)] + cddiffsq[(indx + p2)] +
853 cddiffsq[(indx - p2)] + cddiffsq[(indx + m2)])) -
854 (gaussgrad[0] * delhvsqsum[indx] +
855 gaussgrad[1] * (delhvsqsum[indx - v1] + delhvsqsum[indx + 1] +
856 delhvsqsum[indx - 1] + delhvsqsum[indx + v1]) +
857 gaussgrad[2] * (delhvsqsum[indx - m1] + delhvsqsum[indx + p1] +
858 delhvsqsum[indx - p1] + delhvsqsum[indx + m1]) +
859 gaussgrad[3] * (delhvsqsum[indx - v2] + delhvsqsum[indx - 2] +
860 delhvsqsum[indx + 2] + delhvsqsum[indx + v2]) +
861 gaussgrad[4] * (delhvsqsum[indx - v2 - 1] + delhvsqsum[indx - v2 + 1] +
862 delhvsqsum[indx - ts - 2] + delhvsqsum[indx - ts + 2] +
863 delhvsqsum[indx + ts - 2] + delhvsqsum[indx + ts + 2] +
864 delhvsqsum[indx + v2 - 1] + delhvsqsum[indx + v2 + 1]) +
865 gaussgrad[5] * (delhvsqsum[indx - m2] + delhvsqsum[indx + p2] +
866 delhvsqsum[indx - p2] + delhvsqsum[indx + m2]));
867 }
868 }
869
870 // Nyquist test
871 int nystartrow = 0;
872 int nyendrow = 0;
873 int nystartcol = ts + 1;
874 int nyendcol = 0;
875
876 for (int rr = 6; rr < rr1 - 6; rr++) {
877 for (int cc = 6 + (fc(cfarray, rr, 2) & 1), indx = rr * ts + cc; cc < cc1 - 6; cc += 2, indx += 2) {
878
879 //nyquist texture test: ask if difference of vcd compared to hcd is larger or smaller than RGGB gradients
880 if(nyqutest[indx >> 1] > 0.f) {
881 nyquist[indx >> 1] = 1; //nyquist=1 for nyquist region
882 nystartrow = nystartrow ? nystartrow : rr;
883 nyendrow = rr;
884 nystartcol = nystartcol > cc ? cc : nystartcol;
885 nyendcol = nyendcol < cc ? cc : nyendcol;
886 }
887 }
888 }
889
890
891 bool doNyquist = nystartrow != nyendrow && nystartcol != nyendcol;
892
893 if(doNyquist) {
894 nyendrow ++; // because of < condition
895 nyendcol ++; // because of < condition
896 nystartcol -= (nystartcol & 1);
897 nystartrow = std::max(8, nystartrow);
898 nyendrow = std::min(rr1 - 8, nyendrow);
899 nystartcol = std::max(8, nystartcol);
900 nyendcol = std::min(cc1 - 8, nyendcol);
901 memset(&nyquist2[4 * tsh], 0, sizeof(char) * (ts - 8) * tsh);
902
903 #ifdef __SSE2__
904 vint fourvb = _mm_set1_epi8(4);
905 vint onevb = _mm_set1_epi8(1);
906
907 #endif
908
909 for (int rr = nystartrow; rr < nyendrow; rr++) {
910 #ifdef __SSE2__
911
912 for (int indx = rr * ts; indx < rr * ts + cc1; indx += 32) {
913 vint nyquisttemp1v = _mm_adds_epi8(_mm_load_si128((vint*)&nyquist[(indx - v2) >> 1]), _mm_loadu_si128((vint*)&nyquist[(indx - m1) >> 1]));
914 vint nyquisttemp2v = _mm_adds_epi8(_mm_loadu_si128((vint*)&nyquist[(indx + p1) >> 1]), _mm_loadu_si128((vint*)&nyquist[(indx - 2) >> 1]));
915 vint nyquisttemp3v = _mm_adds_epi8(_mm_loadu_si128((vint*)&nyquist[(indx + 2) >> 1]), _mm_loadu_si128((vint*)&nyquist[(indx - p1) >> 1]));
916 vint valv = _mm_load_si128((vint*)&nyquist[indx >> 1]);
917 vint nyquisttemp4v = _mm_adds_epi8(_mm_loadu_si128((vint*)&nyquist[(indx + m1) >> 1]), _mm_load_si128((vint*)&nyquist[(indx + v2) >> 1]));
918 nyquisttemp1v = _mm_adds_epi8(nyquisttemp1v, nyquisttemp3v);
919 nyquisttemp2v = _mm_adds_epi8(nyquisttemp2v, nyquisttemp4v);
920 nyquisttemp1v = _mm_adds_epi8(nyquisttemp1v, nyquisttemp2v);
921 valv = vselc(_mm_cmpgt_epi8(nyquisttemp1v, fourvb), onevb, valv);
922 valv = vselinotzero(_mm_cmplt_epi8(nyquisttemp1v, fourvb), valv);
923 _mm_store_si128((vint*)&nyquist2[indx >> 1], valv);
924 }
925
926 #else
927
928 for (int indx = rr * ts + nystartcol + (fc(cfarray, rr, 2) & 1); indx < rr * ts + nyendcol; indx += 2) {
929 unsigned int nyquisttemp = (nyquist[(indx - v2) >> 1] + nyquist[(indx - m1) >> 1] + nyquist[(indx + p1) >> 1] +
930 nyquist[(indx - 2) >> 1] + nyquist[(indx + 2) >> 1] +
931 nyquist[(indx - p1) >> 1] + nyquist[(indx + m1) >> 1] + nyquist[(indx + v2) >> 1]);
932 //if most of your neighbours are named Nyquist, it's likely that you're one too, or not
933 nyquist2[indx >> 1] = nyquisttemp > 4 ? 1 : (nyquisttemp < 4 ? 0 : nyquist[indx >> 1]);
934 }
935
936 #endif
937 }
938
939 // end of Nyquist test
940
941 // in areas of Nyquist texture, do area interpolation
942 for (int rr = nystartrow; rr < nyendrow; rr++)
943 for (int indx = rr * ts + nystartcol + (fc(cfarray, rr, 2) & 1); indx < rr * ts + nyendcol; indx += 2) {
944
945 if (nyquist2[indx >> 1]) {
946 // area interpolation
947
948 float sumcfa = 0.f, sumh = 0.f, sumv = 0.f, sumsqh = 0.f, sumsqv = 0.f, areawt = 0.f;
949
950 for (int i = -6; i < 7; i += 2) {
951 int indx1 = indx + (i * ts) - 6;
952
953 for (int j = -6; j < 7; j += 2, indx1 += 2) {
954 if (nyquist2[indx1 >> 1]) {
955 float cfatemp = cfa[indx1];
956 sumcfa += cfatemp;
957 sumh += (cfa[indx1 - 1] + cfa[indx1 + 1]);
958 sumv += (cfa[indx1 - v1] + cfa[indx1 + v1]);
959 sumsqh += SQR(cfatemp - cfa[indx1 - 1]) + SQR(cfatemp - cfa[indx1 + 1]);
960 sumsqv += SQR(cfatemp - cfa[indx1 - v1]) + SQR(cfatemp - cfa[indx1 + v1]);
961 areawt += 1;
962 }
963 }
964 }
965
966 //horizontal and vertical colour differences, and adaptive weight
967 sumh = sumcfa - xdiv2f(sumh);
968 sumv = sumcfa - xdiv2f(sumv);
969 areawt = xdiv2f(areawt);
970 float hcdvar = epssq + fabsf(areawt * sumsqh - sumh * sumh);
971 float vcdvar = epssq + fabsf(areawt * sumsqv - sumv * sumv);
972 hvwt[indx >> 1] = hcdvar / (vcdvar + hcdvar);
973
974 // end of area interpolation
975
976 }
977 }
978 }
979
980
981 //populate G at R/B sites
982 for (int rr = 8; rr < rr1 - 8; rr++)
983 for (int indx = rr * ts + 8 + (fc(cfarray, rr, 2) & 1); indx < rr * ts + cc1 - 8; indx += 2) {
984
985 //first ask if one gets more directional discrimination from nearby B/R sites
986 float hvwtalt = xdivf(hvwt[(indx - m1) >> 1] + hvwt[(indx + p1) >> 1] + hvwt[(indx - p1) >> 1] + hvwt[(indx + m1) >> 1], 2);
987
988 hvwt[indx >> 1] = fabsf(0.5f - hvwt[indx >> 1]) < fabsf(0.5f - hvwtalt) ? hvwtalt : hvwt[indx >> 1];
989 //a better result was obtained from the neighbours
990
991 Dgrb[0][indx >> 1] = intp(hvwt[indx >> 1], vcd[indx], hcd[indx]); //evaluate colour differences
992
993 rgbgreen[indx] = cfa[indx] + Dgrb[0][indx >> 1]; //evaluate G (finally!)
994
995 //local curvature in G (preparation for nyquist refinement step)
996 Dgrb2[indx >> 1].h = nyquist2[indx >> 1] ? SQR(rgbgreen[indx] - xdiv2f(rgbgreen[indx - 1] + rgbgreen[indx + 1])) : 0.f;
997 Dgrb2[indx >> 1].v = nyquist2[indx >> 1] ? SQR(rgbgreen[indx] - xdiv2f(rgbgreen[indx - v1] + rgbgreen[indx + v1])) : 0.f;
998 }
999
1000
1001 //end of standard interpolation
1002
1003 // refine Nyquist areas using G curvatures
1004 if(doNyquist) {
1005 for (int rr = nystartrow; rr < nyendrow; rr++)
1006 for (int indx = rr * ts + nystartcol + (fc(cfarray, rr, 2) & 1); indx < rr * ts + nyendcol; indx += 2) {
1007
1008 if (nyquist2[indx >> 1]) {
1009 //local averages (over Nyquist pixels only) of G curvature squared
1010 float gvarh = epssq + (gquinc[0] * Dgrb2[indx >> 1].h +
1011 gquinc[1] * (Dgrb2[(indx - m1) >> 1].h + Dgrb2[(indx + p1) >> 1].h + Dgrb2[(indx - p1) >> 1].h + Dgrb2[(indx + m1) >> 1].h) +
1012 gquinc[2] * (Dgrb2[(indx - v2) >> 1].h + Dgrb2[(indx - 2) >> 1].h + Dgrb2[(indx + 2) >> 1].h + Dgrb2[(indx + v2) >> 1].h) +
1013 gquinc[3] * (Dgrb2[(indx - m2) >> 1].h + Dgrb2[(indx + p2) >> 1].h + Dgrb2[(indx - p2) >> 1].h + Dgrb2[(indx + m2) >> 1].h));
1014 float gvarv = epssq + (gquinc[0] * Dgrb2[indx >> 1].v +
1015 gquinc[1] * (Dgrb2[(indx - m1) >> 1].v + Dgrb2[(indx + p1) >> 1].v + Dgrb2[(indx - p1) >> 1].v + Dgrb2[(indx + m1) >> 1].v) +
1016 gquinc[2] * (Dgrb2[(indx - v2) >> 1].v + Dgrb2[(indx - 2) >> 1].v + Dgrb2[(indx + 2) >> 1].v + Dgrb2[(indx + v2) >> 1].v) +
1017 gquinc[3] * (Dgrb2[(indx - m2) >> 1].v + Dgrb2[(indx + p2) >> 1].v + Dgrb2[(indx - p2) >> 1].v + Dgrb2[(indx + m2) >> 1].v));
1018 //use the results as weights for refined G interpolation
1019 Dgrb[0][indx >> 1] = (hcd[indx] * gvarv + vcd[indx] * gvarh) / (gvarv + gvarh);
1020 rgbgreen[indx] = cfa[indx] + Dgrb[0][indx >> 1];
1021 }
1022 }
1023 }
1024
1025
1026 #ifdef __SSE2__
1027
1028 for (int rr = 6; rr < rr1 - 6; rr++) {
1029 if((fc(cfarray, rr, 2) & 1) == 0) {
1030 for (int cc = 6, indx = rr * ts + cc; cc < cc1 - 6; cc += 8, indx += 8) {
1031 vfloat tempv = LC2VFU(cfa[indx + 1]);
1032 vfloat Dgrbsq1pv = (SQRV(tempv - LC2VFU(cfa[indx + 1 - p1])) + SQRV(tempv - LC2VFU(cfa[indx + 1 + p1])));
1033 STVFU(delp[indx >> 1], vabsf(LC2VFU(cfa[indx + p1]) - LC2VFU(cfa[indx - p1])));
1034 STVFU(delm[indx >> 1], vabsf(LC2VFU(cfa[indx + m1]) - LC2VFU(cfa[indx - m1])));
1035 vfloat Dgrbsq1mv = (SQRV(tempv - LC2VFU(cfa[indx + 1 - m1])) + SQRV(tempv - LC2VFU(cfa[indx + 1 + m1])));
1036 STVFU(Dgrbsq1m[indx >> 1], Dgrbsq1mv );
1037 STVFU(Dgrbsq1p[indx >> 1], Dgrbsq1pv );
1038 }
1039 } else {
1040 for (int cc = 6, indx = rr * ts + cc; cc < cc1 - 6; cc += 8, indx += 8) {
1041 vfloat tempv = LC2VFU(cfa[indx]);
1042 vfloat Dgrbsq1pv = (SQRV(tempv - LC2VFU(cfa[indx - p1])) + SQRV(tempv - LC2VFU(cfa[indx + p1])));
1043 STVFU(delp[indx >> 1], vabsf(LC2VFU(cfa[indx + 1 + p1]) - LC2VFU(cfa[indx + 1 - p1])));
1044 STVFU(delm[indx >> 1], vabsf(LC2VFU(cfa[indx + 1 + m1]) - LC2VFU(cfa[indx + 1 - m1])));
1045 vfloat Dgrbsq1mv = (SQRV(tempv - LC2VFU(cfa[indx - m1])) + SQRV(tempv - LC2VFU(cfa[indx + m1])));
1046 STVFU(Dgrbsq1m[indx >> 1], Dgrbsq1mv );
1047 STVFU(Dgrbsq1p[indx >> 1], Dgrbsq1pv );
1048 }
1049 }
1050 }
1051
1052 #else
1053
1054 for (int rr = 6; rr < rr1 - 6; rr++) {
1055 if((fc(cfarray, rr, 2) & 1) == 0) {
1056 for (int cc = 6, indx = rr * ts + cc; cc < cc1 - 6; cc += 2, indx += 2) {
1057 delp[indx >> 1] = fabsf(cfa[indx + p1] - cfa[indx - p1]);
1058 delm[indx >> 1] = fabsf(cfa[indx + m1] - cfa[indx - m1]);
1059 Dgrbsq1p[indx >> 1] = (SQR(cfa[indx + 1] - cfa[indx + 1 - p1]) + SQR(cfa[indx + 1] - cfa[indx + 1 + p1]));
1060 Dgrbsq1m[indx >> 1] = (SQR(cfa[indx + 1] - cfa[indx + 1 - m1]) + SQR(cfa[indx + 1] - cfa[indx + 1 + m1]));
1061 }
1062 } else {
1063 for (int cc = 6, indx = rr * ts + cc; cc < cc1 - 6; cc += 2, indx += 2) {
1064 Dgrbsq1p[indx >> 1] = (SQR(cfa[indx] - cfa[indx - p1]) + SQR(cfa[indx] - cfa[indx + p1]));
1065 Dgrbsq1m[indx >> 1] = (SQR(cfa[indx] - cfa[indx - m1]) + SQR(cfa[indx] - cfa[indx + m1]));
1066 delp[indx >> 1] = fabsf(cfa[indx + 1 + p1] - cfa[indx + 1 - p1]);
1067 delm[indx >> 1] = fabsf(cfa[indx + 1 + m1] - cfa[indx + 1 - m1]);
1068 }
1069 }
1070 }
1071
1072 #endif
1073
1074 // diagonal interpolation correction
1075
1076 #ifdef __SSE2__
1077 vfloat gausseven0v = F2V(gausseven[0]);
1078 vfloat gausseven1v = F2V(gausseven[1]);
1079 #endif
1080
1081 for (int rr = 8; rr < rr1 - 8; rr++) {
1082 #ifdef __SSE2__
1083
1084 for (int indx = rr * ts + 8 + (fc(cfarray, rr, 2) & 1), indx1 = indx >> 1; indx < rr * ts + cc1 - 8; indx += 8, indx1 += 4) {
1085
1086 //diagonal colour ratios
1087 vfloat cfav = LC2VFU(cfa[indx]);
1088
1089 vfloat temp1v = LC2VFU(cfa[indx + m1]);
1090 vfloat temp2v = LC2VFU(cfa[indx + m2]);
1091 vfloat rbsev = vmul2f(temp1v) / (epsv + cfav + temp2v );
1092 rbsev = vself(vmaskf_lt(vabsf(onev - rbsev), arthreshv), cfav * rbsev, temp1v + zd5v * (cfav - temp2v));
1093
1094 temp1v = LC2VFU(cfa[indx - m1]);
1095 temp2v = LC2VFU(cfa[indx - m2]);
1096 vfloat rbnwv = vmul2f(temp1v) / (epsv + cfav + temp2v );
1097 rbnwv = vself(vmaskf_lt(vabsf(onev - rbnwv), arthreshv), cfav * rbnwv, temp1v + zd5v * (cfav - temp2v));
1098
1099 temp1v = epsv + LVFU(delm[indx1]);
1100 vfloat wtsev = temp1v + LVFU(delm[(indx + m1) >> 1]) + LVFU(delm[(indx + m2) >> 1]); //same as for wtu,wtd,wtl,wtr
1101 vfloat wtnwv = temp1v + LVFU(delm[(indx - m1) >> 1]) + LVFU(delm[(indx - m2) >> 1]);
1102
1103 vfloat rbmv = (wtsev * rbnwv + wtnwv * rbsev) / (wtsev + wtnwv);
1104
1105 temp1v = median(rbmv , LC2VFU(cfa[indx - m1]), LC2VFU(cfa[indx + m1]));
1106 vfloat wtv = vmul2f(cfav - rbmv) / (epsv + rbmv + cfav);
1107 temp2v = vintpf(wtv, rbmv, temp1v);
1108
1109 temp2v = vself(vmaskf_lt(rbmv + rbmv, cfav), temp1v, temp2v);
1110 temp2v = vself(vmaskf_lt(rbmv, cfav), temp2v, rbmv);
1111 STVFU(rbm[indx1], vself(vmaskf_gt(temp2v, clip_ptv), median(temp2v , LC2VFU(cfa[indx - m1]), LC2VFU(cfa[indx + m1])), temp2v ));
1112
1113
1114 temp1v = LC2VFU(cfa[indx + p1]);
1115 temp2v = LC2VFU(cfa[indx + p2]);
1116 vfloat rbnev = vmul2f(temp1v) / (epsv + cfav + temp2v );
1117 rbnev = vself(vmaskf_lt(vabsf(onev - rbnev), arthreshv), cfav * rbnev, temp1v + zd5v * (cfav - temp2v));
1118
1119 temp1v = LC2VFU(cfa[indx - p1]);
1120 temp2v = LC2VFU(cfa[indx - p2]);
1121 vfloat rbswv = vmul2f(temp1v) / (epsv + cfav + temp2v );
1122 rbswv = vself(vmaskf_lt(vabsf(onev - rbswv), arthreshv), cfav * rbswv, temp1v + zd5v * (cfav - temp2v));
1123
1124 temp1v = epsv + LVFU(delp[indx1]);
1125 vfloat wtnev = temp1v + LVFU(delp[(indx + p1) >> 1]) + LVFU(delp[(indx + p2) >> 1]);
1126 vfloat wtswv = temp1v + LVFU(delp[(indx - p1) >> 1]) + LVFU(delp[(indx - p2) >> 1]);
1127
1128 vfloat rbpv = (wtnev * rbswv + wtswv * rbnev) / (wtnev + wtswv);
1129
1130 temp1v = median(rbpv , LC2VFU(cfa[indx - p1]), LC2VFU(cfa[indx + p1]));
1131 wtv = vmul2f(cfav - rbpv) / (epsv + rbpv + cfav);
1132 temp2v = vintpf(wtv, rbpv, temp1v);
1133
1134 temp2v = vself(vmaskf_lt(rbpv + rbpv, cfav), temp1v, temp2v);
1135 temp2v = vself(vmaskf_lt(rbpv, cfav), temp2v, rbpv);
1136 STVFU(rbp[indx1], vself(vmaskf_gt(temp2v, clip_ptv), median(temp2v , LC2VFU(cfa[indx - p1]), LC2VFU(cfa[indx + p1])), temp2v ));
1137
1138 vfloat rbvarmv = epssqv + (gausseven0v * (LVFU(Dgrbsq1m[(indx - v1) >> 1]) + LVFU(Dgrbsq1m[(indx - 1) >> 1]) + LVFU(Dgrbsq1m[(indx + 1) >> 1]) + LVFU(Dgrbsq1m[(indx + v1) >> 1])) +
1139 gausseven1v * (LVFU(Dgrbsq1m[(indx - v2 - 1) >> 1]) + LVFU(Dgrbsq1m[(indx - v2 + 1) >> 1]) + LVFU(Dgrbsq1m[(indx - 2 - v1) >> 1]) + LVFU(Dgrbsq1m[(indx + 2 - v1) >> 1]) +
1140 LVFU(Dgrbsq1m[(indx - 2 + v1) >> 1]) + LVFU(Dgrbsq1m[(indx + 2 + v1) >> 1]) + LVFU(Dgrbsq1m[(indx + v2 - 1) >> 1]) + LVFU(Dgrbsq1m[(indx + v2 + 1) >> 1])));
1141 STVFU(pmwt[indx1] , rbvarmv / ((epssqv + (gausseven0v * (LVFU(Dgrbsq1p[(indx - v1) >> 1]) + LVFU(Dgrbsq1p[(indx - 1) >> 1]) + LVFU(Dgrbsq1p[(indx + 1) >> 1]) + LVFU(Dgrbsq1p[(indx + v1) >> 1])) +
1142 gausseven1v * (LVFU(Dgrbsq1p[(indx - v2 - 1) >> 1]) + LVFU(Dgrbsq1p[(indx - v2 + 1) >> 1]) + LVFU(Dgrbsq1p[(indx - 2 - v1) >> 1]) + LVFU(Dgrbsq1p[(indx + 2 - v1) >> 1]) +
1143 LVFU(Dgrbsq1p[(indx - 2 + v1) >> 1]) + LVFU(Dgrbsq1p[(indx + 2 + v1) >> 1]) + LVFU(Dgrbsq1p[(indx + v2 - 1) >> 1]) + LVFU(Dgrbsq1p[(indx + v2 + 1) >> 1])))) + rbvarmv));
1144
1145 }
1146
1147 #else
1148
1149 for (int cc = 8 + (fc(cfarray, rr, 2) & 1), indx = rr * ts + cc, indx1 = indx >> 1; cc < cc1 - 8; cc += 2, indx += 2, indx1++) {
1150
1151 //diagonal colour ratios
1152 float crse = xmul2f(cfa[indx + m1]) / (eps + cfa[indx] + (cfa[indx + m2]));
1153 float crnw = xmul2f(cfa[indx - m1]) / (eps + cfa[indx] + (cfa[indx - m2]));
1154 float crne = xmul2f(cfa[indx + p1]) / (eps + cfa[indx] + (cfa[indx + p2]));
1155 float crsw = xmul2f(cfa[indx - p1]) / (eps + cfa[indx] + (cfa[indx - p2]));
1156 //colour differences in diagonal directions
1157 float rbse, rbnw, rbne, rbsw;
1158
1159 //assign B/R at R/B sites
1160 if (fabsf(1.f - crse) < arthresh) {
1161 rbse = cfa[indx] * crse; //use this if more precise diag interp is necessary
1162 } else {
1163 rbse = (cfa[indx + m1]) + xdiv2f(cfa[indx] - cfa[indx + m2]);
1164 }
1165
1166 if (fabsf(1.f - crnw) < arthresh) {
1167 rbnw = cfa[indx] * crnw;
1168 } else {
1169 rbnw = (cfa[indx - m1]) + xdiv2f(cfa[indx] - cfa[indx - m2]);
1170 }
1171
1172 if (fabsf(1.f - crne) < arthresh) {
1173 rbne = cfa[indx] * crne;
1174 } else {
1175 rbne = (cfa[indx + p1]) + xdiv2f(cfa[indx] - cfa[indx + p2]);
1176 }
1177
1178 if (fabsf(1.f - crsw) < arthresh) {
1179 rbsw = cfa[indx] * crsw;
1180 } else {
1181 rbsw = (cfa[indx - p1]) + xdiv2f(cfa[indx] - cfa[indx - p2]);
1182 }
1183
1184 float wtse = eps + delm[indx1] + delm[(indx + m1) >> 1] + delm[(indx + m2) >> 1]; //same as for wtu,wtd,wtl,wtr
1185 float wtnw = eps + delm[indx1] + delm[(indx - m1) >> 1] + delm[(indx - m2) >> 1];
1186 float wtne = eps + delp[indx1] + delp[(indx + p1) >> 1] + delp[(indx + p2) >> 1];
1187 float wtsw = eps + delp[indx1] + delp[(indx - p1) >> 1] + delp[(indx - p2) >> 1];
1188
1189
1190 rbm[indx1] = (wtse * rbnw + wtnw * rbse) / (wtse + wtnw);
1191 rbp[indx1] = (wtne * rbsw + wtsw * rbne) / (wtne + wtsw);
1192
1193 //variance of R-B in plus/minus directions
1194 float rbvarm = epssq + (gausseven[0] * (Dgrbsq1m[(indx - v1) >> 1] + Dgrbsq1m[(indx - 1) >> 1] + Dgrbsq1m[(indx + 1) >> 1] + Dgrbsq1m[(indx + v1) >> 1]) +
1195 gausseven[1] * (Dgrbsq1m[(indx - v2 - 1) >> 1] + Dgrbsq1m[(indx - v2 + 1) >> 1] + Dgrbsq1m[(indx - 2 - v1) >> 1] + Dgrbsq1m[(indx + 2 - v1) >> 1] +
1196 Dgrbsq1m[(indx - 2 + v1) >> 1] + Dgrbsq1m[(indx + 2 + v1) >> 1] + Dgrbsq1m[(indx + v2 - 1) >> 1] + Dgrbsq1m[(indx + v2 + 1) >> 1]));
1197 pmwt[indx1] = rbvarm / ((epssq + (gausseven[0] * (Dgrbsq1p[(indx - v1) >> 1] + Dgrbsq1p[(indx - 1) >> 1] + Dgrbsq1p[(indx + 1) >> 1] + Dgrbsq1p[(indx + v1) >> 1]) +
1198 gausseven[1] * (Dgrbsq1p[(indx - v2 - 1) >> 1] + Dgrbsq1p[(indx - v2 + 1) >> 1] + Dgrbsq1p[(indx - 2 - v1) >> 1] + Dgrbsq1p[(indx + 2 - v1) >> 1] +
1199 Dgrbsq1p[(indx - 2 + v1) >> 1] + Dgrbsq1p[(indx + 2 + v1) >> 1] + Dgrbsq1p[(indx + v2 - 1) >> 1] + Dgrbsq1p[(indx + v2 + 1) >> 1]))) + rbvarm);
1200
1201 //bound the interpolation in regions of high saturation
1202
1203 if (rbp[indx1] < cfa[indx]) {
1204 if (xmul2f(rbp[indx1]) < cfa[indx]) {
1205 rbp[indx1] = median(rbp[indx1] , cfa[indx - p1], cfa[indx + p1]);
1206 } else {
1207 float pwt = xmul2f(cfa[indx] - rbp[indx1]) / (eps + rbp[indx1] + cfa[indx]);
1208 rbp[indx1] = pwt * rbp[indx1] + (1.f - pwt) * median(rbp[indx1], cfa[indx - p1], cfa[indx + p1]);
1209 }
1210 }
1211
1212 if (rbm[indx1] < cfa[indx]) {
1213 if (xmul2f(rbm[indx1]) < cfa[indx]) {
1214 rbm[indx1] = median(rbm[indx1] , cfa[indx - m1], cfa[indx + m1]);
1215 } else {
1216 float mwt = xmul2f(cfa[indx] - rbm[indx1]) / (eps + rbm[indx1] + cfa[indx]);
1217 rbm[indx1] = mwt * rbm[indx1] + (1.f - mwt) * median(rbm[indx1], cfa[indx - m1], cfa[indx + m1]);
1218 }
1219 }
1220
1221 if (rbp[indx1] > clip_pt) {
1222 rbp[indx1] = median(rbp[indx1], cfa[indx - p1], cfa[indx + p1]);
1223 }
1224
1225 if (rbm[indx1] > clip_pt) {
1226 rbm[indx1] = median(rbm[indx1], cfa[indx - m1], cfa[indx + m1]);
1227 }
1228 }
1229
1230 #endif
1231 }
1232
1233 #ifdef __SSE2__
1234 vfloat zd25v = F2V(0.25f);
1235 #endif
1236
1237 for (int rr = 10; rr < rr1 - 10; rr++)
1238 #ifdef __SSE2__
1239 for (int indx = rr * ts + 10 + (fc(cfarray, rr, 2) & 1), indx1 = indx >> 1; indx < rr * ts + cc1 - 10; indx += 8, indx1 += 4) {
1240
1241 //first ask if one gets more directional discrimination from nearby B/R sites
1242 vfloat pmwtaltv = zd25v * (LVFU(pmwt[(indx - m1) >> 1]) + LVFU(pmwt[(indx + p1) >> 1]) + LVFU(pmwt[(indx - p1) >> 1]) + LVFU(pmwt[(indx + m1) >> 1]));
1243 vfloat tempv = LVFU(pmwt[indx1]);
1244 tempv = vself(vmaskf_lt(vabsf(zd5v - tempv), vabsf(zd5v - pmwtaltv)), pmwtaltv, tempv);
1245 STVFU(pmwt[indx1], tempv);
1246 STVFU(rbint[indx1], zd5v * (LC2VFU(cfa[indx]) + vintpf(tempv, LVFU(rbp[indx1]), LVFU(rbm[indx1]))));
1247 }
1248
1249 #else
1250
1251 for (int cc = 10 + (fc(cfarray, rr, 2) & 1), indx = rr * ts + cc, indx1 = indx >> 1; cc < cc1 - 10; cc += 2, indx += 2, indx1++) {
1252
1253 //first ask if one gets more directional discrimination from nearby B/R sites
1254 float pmwtalt = xdivf(pmwt[(indx - m1) >> 1] + pmwt[(indx + p1) >> 1] + pmwt[(indx - p1) >> 1] + pmwt[(indx + m1) >> 1], 2);
1255
1256 if (fabsf(0.5 - pmwt[indx1]) < fabsf(0.5 - pmwtalt)) {
1257 pmwt[indx1] = pmwtalt; //a better result was obtained from the neighbours
1258 }
1259
1260 rbint[indx1] = xdiv2f(cfa[indx] + rbm[indx1] * (1.f - pmwt[indx1]) + rbp[indx1] * pmwt[indx1]); //this is R+B, interpolated
1261 }
1262
1263 #endif
1264
1265 for (int rr = 12; rr < rr1 - 12; rr++)
1266 #ifdef __SSE2__
1267 for (int indx = rr * ts + 12 + (fc(cfarray, rr, 2) & 1), indx1 = indx >> 1; indx < rr * ts + cc1 - 12; indx += 8, indx1 += 4) {
1268 vmask copymask = vmaskf_ge(vabsf(zd5v - LVFU(pmwt[indx1])), vabsf(zd5v - LVFU(hvwt[indx1])));
1269
1270 if(_mm_movemask_ps((vfloat)copymask)) { // if for any of the 4 pixels the condition is true, do the maths for all 4 pixels and mask the unused out at the end
1271 //now interpolate G vertically/horizontally using R+B values
1272 //unfortunately, since G interpolation cannot be done diagonally this may lead to colour shifts
1273 //colour ratios for G interpolation
1274 vfloat rbintv = LVFU(rbint[indx1]);
1275
1276 //interpolated G via adaptive ratios or Hamilton-Adams in each cardinal direction
1277 vfloat cruv = vmul2f(LC2VFU(cfa[indx - v1])) / (epsv + rbintv + LVFU(rbint[(indx1 - v1)]));
1278 vfloat guv = rbintv * cruv;
1279 vfloat gu2v = LC2VFU(cfa[indx - v1]) + zd5v * (rbintv - LVFU(rbint[(indx1 - v1)]));
1280 guv = vself(vmaskf_lt(vabsf(onev - cruv), arthreshv), guv, gu2v);
1281
1282 vfloat crdv = vmul2f(LC2VFU(cfa[indx + v1])) / (epsv + rbintv + LVFU(rbint[(indx1 + v1)]));
1283 vfloat gdv = rbintv * crdv;
1284 vfloat gd2v = LC2VFU(cfa[indx + v1]) + zd5v * (rbintv - LVFU(rbint[(indx1 + v1)]));
1285 gdv = vself(vmaskf_lt(vabsf(onev - crdv), arthreshv), gdv, gd2v);
1286
1287 vfloat Gintvv = (LC2VFU(dirwts0[indx - v1]) * gdv + LC2VFU(dirwts0[indx + v1]) * guv) / (LC2VFU(dirwts0[indx + v1]) + LC2VFU(dirwts0[indx - v1]));
1288 vfloat Gint1v = median(Gintvv , LC2VFU(cfa[indx - v1]), LC2VFU(cfa[indx + v1]));
1289 vfloat vwtv = vmul2f(rbintv - Gintvv) / (epsv + Gintvv + rbintv);
1290 vfloat Gint2v = vintpf(vwtv, Gintvv, Gint1v);
1291 Gint1v = vself(vmaskf_lt(vmul2f(Gintvv), rbintv), Gint1v, Gint2v);
1292 Gintvv = vself(vmaskf_lt(Gintvv, rbintv), Gint1v, Gintvv);
1293 Gintvv = vself(vmaskf_gt(Gintvv, clip_ptv), median(Gintvv, LC2VFU(cfa[indx - v1]), LC2VFU(cfa[indx + v1])), Gintvv);
1294
1295 vfloat crlv = vmul2f(LC2VFU(cfa[indx - 1])) / (epsv + rbintv + LVFU(rbint[(indx1 - 1)]));
1296 vfloat glv = rbintv * crlv;
1297 vfloat gl2v = LC2VFU(cfa[indx - 1]) + zd5v * (rbintv - LVFU(rbint[(indx1 - 1)]));
1298 glv = vself(vmaskf_lt(vabsf(onev - crlv), arthreshv), glv, gl2v);
1299
1300 vfloat crrv = vmul2f(LC2VFU(cfa[indx + 1])) / (epsv + rbintv + LVFU(rbint[(indx1 + 1)]));
1301 vfloat grv = rbintv * crrv;
1302 vfloat gr2v = LC2VFU(cfa[indx + 1]) + zd5v * (rbintv - LVFU(rbint[(indx1 + 1)]));
1303 grv = vself(vmaskf_lt(vabsf(onev - crrv), arthreshv), grv, gr2v);
1304
1305 vfloat Ginthv = (LC2VFU(dirwts1[indx - 1]) * grv + LC2VFU(dirwts1[indx + 1]) * glv) / (LC2VFU(dirwts1[indx - 1]) + LC2VFU(dirwts1[indx + 1]));
1306 vfloat Gint1h = median(Ginthv , LC2VFU(cfa[indx - 1]), LC2VFU(cfa[indx + 1]));
1307 vfloat hwtv = vmul2f(rbintv - Ginthv) / (epsv + Ginthv + rbintv);
1308 vfloat Gint2h = vintpf(hwtv, Ginthv, Gint1h);
1309 Gint1h = vself(vmaskf_lt(vmul2f(Ginthv), rbintv), Gint1h, Gint2h);
1310 Ginthv = vself(vmaskf_lt(Ginthv, rbintv), Gint1h, Ginthv);
1311 Ginthv = vself(vmaskf_gt(Ginthv, clip_ptv), median(Ginthv, LC2VFU(cfa[indx - 1]), LC2VFU(cfa[indx + 1])), Ginthv);
1312
1313 vfloat greenv = vself(copymask, vintpf(LVFU(hvwt[indx1]), Gintvv, Ginthv), LC2VFU(rgbgreen[indx]));
1314 STC2VFU(rgbgreen[indx], greenv);
1315
1316 STVFU(Dgrb[0][indx1], vself(copymask, greenv - LC2VFU(cfa[indx]), LVFU(Dgrb[0][indx1])));
1317 }
1318 }
1319
1320 #else
1321
1322 for (int cc = 12 + (fc(cfarray, rr, 2) & 1), indx = rr * ts + cc, indx1 = indx >> 1; cc < cc1 - 12; cc += 2, indx += 2, indx1++) {
1323
1324 if (fabsf(0.5 - pmwt[indx >> 1]) < fabsf(0.5 - hvwt[indx >> 1]) ) {
1325 continue;
1326 }
1327
1328 //now interpolate G vertically/horizontally using R+B values
1329 //unfortunately, since G interpolation cannot be done diagonally this may lead to colour shifts
1330
1331 //colour ratios for G interpolation
1332 float cru = cfa[indx - v1] * 2.0 / (eps + rbint[indx1] + rbint[(indx1 - v1)]);
1333 float crd = cfa[indx + v1] * 2.0 / (eps + rbint[indx1] + rbint[(indx1 + v1)]);
1334 float crl = cfa[indx - 1] * 2.0 / (eps + rbint[indx1] + rbint[(indx1 - 1)]);
1335 float crr = cfa[indx + 1] * 2.0 / (eps + rbint[indx1] + rbint[(indx1 + 1)]);
1336
1337 //interpolation of G in four directions
1338 float gu, gd, gl, gr;
1339
1340 //interpolated G via adaptive ratios or Hamilton-Adams in each cardinal direction
1341 if (fabsf(1.f - cru) < arthresh) {
1342 gu = rbint[indx1] * cru;
1343 } else {
1344 gu = cfa[indx - v1] + xdiv2f(rbint[indx1] - rbint[(indx1 - v1)]);
1345 }
1346
1347 if (fabsf(1.f - crd) < arthresh) {
1348 gd = rbint[indx1] * crd;
1349 } else {
1350 gd = cfa[indx + v1] + xdiv2f(rbint[indx1] - rbint[(indx1 + v1)]);
1351 }
1352
1353 if (fabsf(1.f - crl) < arthresh) {
1354 gl = rbint[indx1] * crl;
1355 } else {
1356 gl = cfa[indx - 1] + xdiv2f(rbint[indx1] - rbint[(indx1 - 1)]);
1357 }
1358
1359 if (fabsf(1.f - crr) < arthresh) {
1360 gr = rbint[indx1] * crr;
1361 } else {
1362 gr = cfa[indx + 1] + xdiv2f(rbint[indx1] - rbint[(indx1 + 1)]);
1363 }
1364
1365 //interpolated G via adaptive weights of cardinal evaluations
1366 float Gintv = (dirwts0[indx - v1] * gd + dirwts0[indx + v1] * gu) / (dirwts0[indx + v1] + dirwts0[indx - v1]);
1367 float Ginth = (dirwts1[indx - 1] * gr + dirwts1[indx + 1] * gl) / (dirwts1[indx - 1] + dirwts1[indx + 1]);
1368
1369 //bound the interpolation in regions of high saturation
1370 if (Gintv < rbint[indx1]) {
1371 if (2 * Gintv < rbint[indx1]) {
1372 Gintv = median(Gintv , cfa[indx - v1], cfa[indx + v1]);
1373 } else {
1374 float vwt = 2.0 * (rbint[indx1] - Gintv) / (eps + Gintv + rbint[indx1]);
1375 Gintv = vwt * Gintv + (1.f - vwt) * median(Gintv, cfa[indx - v1], cfa[indx + v1]);
1376 }
1377 }
1378
1379 if (Ginth < rbint[indx1]) {
1380 if (2 * Ginth < rbint[indx1]) {
1381 Ginth = median(Ginth , cfa[indx - 1], cfa[indx + 1]);
1382 } else {
1383 float hwt = 2.0 * (rbint[indx1] - Ginth) / (eps + Ginth + rbint[indx1]);
1384 Ginth = hwt * Ginth + (1.f - hwt) * median(Ginth, cfa[indx - 1], cfa[indx + 1]);
1385 }
1386 }
1387
1388 if (Ginth > clip_pt) {
1389 Ginth = median(Ginth, cfa[indx - 1], cfa[indx + 1]);
1390 }
1391
1392 if (Gintv > clip_pt) {
1393 Gintv = median(Gintv, cfa[indx - v1], cfa[indx + v1]);
1394 }
1395
1396 rgbgreen[indx] = Ginth * (1.f - hvwt[indx1]) + Gintv * hvwt[indx1];
1397 Dgrb[0][indx >> 1] = rgbgreen[indx] - cfa[indx];
1398 }
1399
1400 #endif
1401
1402 //end of diagonal interpolation correction
1403
1404 //fancy chrominance interpolation
1405 //(ey,ex) is location of R site
1406 for (int rr = 13 - ey; rr < rr1 - 12; rr += 2)
1407 for (int indx1 = (rr * ts + 13 - ex) >> 1; indx1 < (rr * ts + cc1 - 12) >> 1; indx1++) { //B coset
1408 Dgrb[1][indx1] = Dgrb[0][indx1]; //split out G-B from G-R
1409 Dgrb[0][indx1] = 0;
1410 }
1411
1412 #ifdef __SSE2__
1413 vfloat oned325v = F2V( 1.325f );
1414 vfloat zd175v = F2V( 0.175f );
1415 vfloat zd075v = F2V( 0.075f );
1416 #endif
1417
1418 for (int rr = 14; rr < rr1 - 14; rr++)
1419 #ifdef __SSE2__
1420 for (int cc = 14 + (fc(cfarray, rr, 2) & 1), indx = rr * ts + cc, c = 1 - fc(cfarray, rr, cc) / 2; cc < cc1 - 14; cc += 8, indx += 8) {
1421 vfloat tempv = epsv + vabsf(LVFU(Dgrb[c][(indx - m1) >> 1]) - LVFU(Dgrb[c][(indx + m1) >> 1]));
1422 vfloat temp2v = epsv + vabsf(LVFU(Dgrb[c][(indx + p1) >> 1]) - LVFU(Dgrb[c][(indx - p1) >> 1]));
1423 vfloat wtnwv = onev / (tempv + vabsf(LVFU(Dgrb[c][(indx - m1) >> 1]) - LVFU(Dgrb[c][(indx - m3) >> 1])) + vabsf(LVFU(Dgrb[c][(indx + m1) >> 1]) - LVFU(Dgrb[c][(indx - m3) >> 1])));
1424 vfloat wtnev = onev / (temp2v + vabsf(LVFU(Dgrb[c][(indx + p1) >> 1]) - LVFU(Dgrb[c][(indx + p3) >> 1])) + vabsf(LVFU(Dgrb[c][(indx - p1) >> 1]) - LVFU(Dgrb[c][(indx + p3) >> 1])));
1425 vfloat wtswv = onev / (temp2v + vabsf(LVFU(Dgrb[c][(indx - p1) >> 1]) - LVFU(Dgrb[c][(indx + m3) >> 1])) + vabsf(LVFU(Dgrb[c][(indx + p1) >> 1]) - LVFU(Dgrb[c][(indx - p3) >> 1])));
1426 vfloat wtsev = onev / (tempv + vabsf(LVFU(Dgrb[c][(indx + m1) >> 1]) - LVFU(Dgrb[c][(indx - p3) >> 1])) + vabsf(LVFU(Dgrb[c][(indx - m1) >> 1]) - LVFU(Dgrb[c][(indx + m3) >> 1])));
1427
1428 STVFU(Dgrb[c][indx >> 1], (wtnwv * (oned325v * LVFU(Dgrb[c][(indx - m1) >> 1]) - zd175v * LVFU(Dgrb[c][(indx - m3) >> 1]) - zd075v * (LVFU(Dgrb[c][(indx - m1 - 2) >> 1]) + LVFU(Dgrb[c][(indx - m1 - v2) >> 1])) ) +
1429 wtnev * (oned325v * LVFU(Dgrb[c][(indx + p1) >> 1]) - zd175v * LVFU(Dgrb[c][(indx + p3) >> 1]) - zd075v * (LVFU(Dgrb[c][(indx + p1 + 2) >> 1]) + LVFU(Dgrb[c][(indx + p1 + v2) >> 1])) ) +
1430 wtswv * (oned325v * LVFU(Dgrb[c][(indx - p1) >> 1]) - zd175v * LVFU(Dgrb[c][(indx - p3) >> 1]) - zd075v * (LVFU(Dgrb[c][(indx - p1 - 2) >> 1]) + LVFU(Dgrb[c][(indx - p1 - v2) >> 1])) ) +
1431 wtsev * (oned325v * LVFU(Dgrb[c][(indx + m1) >> 1]) - zd175v * LVFU(Dgrb[c][(indx + m3) >> 1]) - zd075v * (LVFU(Dgrb[c][(indx + m1 + 2) >> 1]) + LVFU(Dgrb[c][(indx + m1 + v2) >> 1])) )) / (wtnwv + wtnev + wtswv + wtsev));
1432 }
1433
1434 #else
1435
1436 for (int cc = 14 + (fc(cfarray, rr, 2) & 1), indx = rr * ts + cc, c = 1 - fc(cfarray, rr, cc) / 2; cc < cc1 - 14; cc += 2, indx += 2) {
1437 float wtnw = 1.f / (eps + fabsf(Dgrb[c][(indx - m1) >> 1] - Dgrb[c][(indx + m1) >> 1]) + fabsf(Dgrb[c][(indx - m1) >> 1] - Dgrb[c][(indx - m3) >> 1]) + fabsf(Dgrb[c][(indx + m1) >> 1] - Dgrb[c][(indx - m3) >> 1]));
1438 float wtne = 1.f / (eps + fabsf(Dgrb[c][(indx + p1) >> 1] - Dgrb[c][(indx - p1) >> 1]) + fabsf(Dgrb[c][(indx + p1) >> 1] - Dgrb[c][(indx + p3) >> 1]) + fabsf(Dgrb[c][(indx - p1) >> 1] - Dgrb[c][(indx + p3) >> 1]));
1439 float wtsw = 1.f / (eps + fabsf(Dgrb[c][(indx - p1) >> 1] - Dgrb[c][(indx + p1) >> 1]) + fabsf(Dgrb[c][(indx - p1) >> 1] - Dgrb[c][(indx + m3) >> 1]) + fabsf(Dgrb[c][(indx + p1) >> 1] - Dgrb[c][(indx - p3) >> 1]));
1440 float wtse = 1.f / (eps + fabsf(Dgrb[c][(indx + m1) >> 1] - Dgrb[c][(indx - m1) >> 1]) + fabsf(Dgrb[c][(indx + m1) >> 1] - Dgrb[c][(indx - p3) >> 1]) + fabsf(Dgrb[c][(indx - m1) >> 1] - Dgrb[c][(indx + m3) >> 1]));
1441
1442 Dgrb[c][indx >> 1] = (wtnw * (1.325f * Dgrb[c][(indx - m1) >> 1] - 0.175f * Dgrb[c][(indx - m3) >> 1] - 0.075f * Dgrb[c][(indx - m1 - 2) >> 1] - 0.075f * Dgrb[c][(indx - m1 - v2) >> 1] ) +
1443 wtne * (1.325f * Dgrb[c][(indx + p1) >> 1] - 0.175f * Dgrb[c][(indx + p3) >> 1] - 0.075f * Dgrb[c][(indx + p1 + 2) >> 1] - 0.075f * Dgrb[c][(indx + p1 + v2) >> 1] ) +
1444 wtsw * (1.325f * Dgrb[c][(indx - p1) >> 1] - 0.175f * Dgrb[c][(indx - p3) >> 1] - 0.075f * Dgrb[c][(indx - p1 - 2) >> 1] - 0.075f * Dgrb[c][(indx - p1 - v2) >> 1] ) +
1445 wtse * (1.325f * Dgrb[c][(indx + m1) >> 1] - 0.175f * Dgrb[c][(indx + m3) >> 1] - 0.075f * Dgrb[c][(indx + m1 + 2) >> 1] - 0.075f * Dgrb[c][(indx + m1 + v2) >> 1] )) / (wtnw + wtne + wtsw + wtse);
1446 }
1447
1448 #endif
1449
1450 #ifdef __SSE2__
1451 int offset;
1452 vfloat twov = F2V(2.f);
1453 vfloat coutscalev = F2V(outputScale);
1454 vmask selmask;
1455
1456 if((fc(cfarray, 16, 2) & 1) == 1) {
1457 selmask = _mm_set_epi32(0xffffffff, 0, 0xffffffff, 0);
1458 offset = 1;
1459 } else {
1460 selmask = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);
1461 offset = 0;
1462 }
1463
1464 #endif
1465
1466 for (int rr = 16; rr < rr1 - 16; rr++) {
1467 int row = rr + top;
1468 int col = left + 16;
1469 int indx = rr * ts + 16;
1470 #ifdef __SSE2__
1471 offset = 1 - offset;
1472 selmask = vnotm(selmask);
1473
1474 for (; indx < rr * ts + cc1 - 18 - (cc1 & 1); indx += 4, col += 4) {
1475 vfloat greenv = LVF(rgbgreen[indx]);
1476 vfloat temp00v = vdup(LVFU(hvwt[(indx - v1) >> 1]));
1477 vfloat temp01v = vdup(LVFU(hvwt[(indx + v1) >> 1]));
1478 vfloat tempv = onev / (temp00v + twov - vdup(LVFU(hvwt[(indx + 1 + offset) >> 1])) - vdup(LVFU(hvwt[(indx - 1 + offset) >> 1])) + temp01v);
1479
1480 vfloat redv1 = greenv - (temp00v * vdup(LVFU(Dgrb[0][(indx - v1) >> 1])) + (onev - vdup(LVFU(hvwt[(indx + 1 + offset) >> 1]))) * vdup(LVFU(Dgrb[0][(indx + 1 + offset) >> 1])) + (onev - vdup(LVFU(hvwt[(indx - 1 + offset) >> 1]))) * vdup(LVFU(Dgrb[0][(indx - 1 + offset) >> 1])) + temp01v * vdup(LVFU(Dgrb[0][(indx + v1) >> 1]))) * tempv;
1481 vfloat bluev1 = greenv - (temp00v * vdup(LVFU(Dgrb[1][(indx - v1) >> 1])) + (onev - vdup(LVFU(hvwt[(indx + 1 + offset) >> 1]))) * vdup(LVFU(Dgrb[1][(indx + 1 + offset) >> 1])) + (onev - vdup(LVFU(hvwt[(indx - 1 + offset) >> 1]))) * vdup(LVFU(Dgrb[1][(indx - 1 + offset) >> 1])) + temp01v * vdup(LVFU(Dgrb[1][(indx + v1) >> 1]))) * tempv;
1482 vfloat redv2 = greenv - vdup(LVFU(Dgrb[0][indx >> 1]));
1483 vfloat bluev2 = greenv - vdup(LVFU(Dgrb[1][indx >> 1]));
1484 STVFU(red[row][col], coutscalev * vself(selmask, redv1, redv2));
1485 STVFU(blue[row][col], coutscalev * vself(selmask, bluev1, bluev2));
1486 }
1487
1488 if(offset == 0) {
1489 for (; indx < rr * ts + cc1 - 16 - (cc1 & 1); indx++, col++) {
1490 float temp = 1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
1491 red[row][col] = outputScale * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) *
1492 temp);
1493 blue[row][col] = outputScale * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) *
1494 temp);
1495
1496 indx++;
1497 col++;
1498 red[row][col] = outputScale * (rgbgreen[indx] - Dgrb[0][indx >> 1]);
1499 blue[row][col] = outputScale * (rgbgreen[indx] - Dgrb[1][indx >> 1]);
1500 }
1501
1502 if(cc1 & 1) { // width of tile is odd
1503 float temp = 1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
1504 red[row][col] = outputScale * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) *
1505 temp);
1506 blue[row][col] = outputScale * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) *
1507 temp);
1508 }
1509 } else {
1510 for (; indx < rr * ts + cc1 - 16 - (cc1 & 1); indx++, col++) {
1511 red[row][col] = outputScale * (rgbgreen[indx] - Dgrb[0][indx >> 1]);
1512 blue[row][col] = outputScale * (rgbgreen[indx] - Dgrb[1][indx >> 1]);
1513
1514 indx++;
1515 col++;
1516 float temp = 1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
1517 red[row][col] = outputScale * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) *
1518 temp);
1519 blue[row][col] = outputScale * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) *
1520 temp);
1521 }
1522
1523 if(cc1 & 1) { // width of tile is odd
1524 red[row][col] = outputScale * (rgbgreen[indx] - Dgrb[0][indx >> 1]);
1525 blue[row][col] = outputScale * (rgbgreen[indx] - Dgrb[1][indx >> 1]);
1526 }
1527 }
1528
1529 #else
1530
1531 if((fc(cfarray, rr, 2) & 1) == 1) {
1532 for (; indx < rr * ts + cc1 - 16 - (cc1 & 1); indx++, col++) {
1533 float temp = 1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
1534 red[row][col] = outputScale * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) *
1535 temp);
1536 blue[row][col] = outputScale * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) *
1537 temp);
1538
1539 indx++;
1540 col++;
1541 red[row][col] = outputScale* (rgbgreen[indx] - Dgrb[0][indx >> 1]);
1542 blue[row][col] = outputScale * (rgbgreen[indx] - Dgrb[1][indx >> 1]);
1543 }
1544
1545 if(cc1 & 1) { // width of tile is odd
1546 float temp = 1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
1547 red[row][col] = outputScale * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) *
1548 temp);
1549 blue[row][col] = outputScale * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) *
1550 temp);
1551 }
1552 } else {
1553 for (; indx < rr * ts + cc1 - 16 - (cc1 & 1); indx++, col++) {
1554 red[row][col] = outputScale * (rgbgreen[indx] - Dgrb[0][indx >> 1]);
1555 blue[row][col] = outputScale * (rgbgreen[indx] - Dgrb[1][indx >> 1]);
1556
1557 indx++;
1558 col++;
1559 float temp = 1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
1560 red[row][col] = outputScale * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) *
1561 temp);
1562 blue[row][col] = outputScale * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) *
1563 temp);
1564 }
1565
1566 if(cc1 & 1) { // width of tile is odd
1567 red[row][col] = outputScale * (rgbgreen[indx] - Dgrb[0][indx >> 1]);
1568 blue[row][col] = outputScale * (rgbgreen[indx] - Dgrb[1][indx >> 1]);
1569 }
1570 }
1571
1572 #endif
1573 }
1574
1575 // copy smoothed results back to image matrix
1576 for (int rr = 16; rr < rr1 - 16; rr++) {
1577 int row = rr + top;
1578 int cc = 16;
1579 #ifdef __SSE2__
1580
1581 for (; cc < cc1 - 19; cc += 4) {
1582 STVFU(green[row][cc + left], LVF(rgbgreen[rr * ts + cc]) * coutscalev);
1583 }
1584
1585 #endif
1586
1587 for (; cc < cc1 - 16; cc++) {
1588 green[row][cc + left] = outputScale * rgbgreen[rr * ts + cc];
1589 }
1590 }
1591
1592 progresscounter++;
1593
1594 if(progresscounter % 32 == 0) {
1595 #ifdef _OPENMP
1596 #pragma omp critical (amazeprogress)
1597 #endif
1598 {
1599 progress += 32.0 * SQR(ts - 32) / (height * width);
1600 progress = std::min(progress, 1.0);
1601 setProgCancel(progress);
1602 }
1603 }
1604 }
1605 } //end of main loop
1606 }
1607 // clean up
1608 if (buffer) {
1609 free(buffer);
1610 }
1611 }
1612 if(border < 4 && rc == RP_NO_ERROR) {
1613 rc = bayerborder_demosaic(width, height, 3, rawData, red, green, blue, cfarray);
1614 }
1615
1616 setProgCancel(1.0);
1617
1618 return rc;
1619 }
1620