1 ////////////////////////////////////////////////////////////////
2 //
3 // AMaZE demosaic algorithm
4 // (Aliasing Minimization and Zipper Elimination)
5 //
6 // copyright (c) 2008-2010 Emil Martinec <ejmartin@uchicago.edu>
7 // optimized for speed by Ingo Weyrich
8 //
9 // incorporating ideas of Luis Sanz Rodrigues and Paul Lee
10 //
11 // code dated: May 27, 2010
12 // latest modification: Ingo Weyrich, January 25, 2016
13 //
14 // amaze_interpolate_RT.cc is free software: you can redistribute it and/or modify
15 // it under the terms of the GNU General Public License as published by
16 // the Free Software Foundation, either version 3 of the License, or
17 // (at your option) any later version.
18 //
19 // This program is distributed in the hope that it will be useful,
20 // but WITHOUT ANY WARRANTY; without even the implied warranty of
21 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 // GNU General Public License for more details.
23 //
24 // You should have received a copy of the GNU General Public License
25 // along with this program. If not, see <https://www.gnu.org/licenses/>.
26 //
27 ////////////////////////////////////////////////////////////////
28
29 #include "rtengine.h"
30 #include "rawimagesource.h"
31 #include "rt_math.h"
32 #include "../rtgui/multilangmgr.h"
33 #include "sleef.h"
34 #include "opthelper.h"
35 #include "median.h"
36 #include "StopWatch.h"
37
38 namespace
39 {
fc(const unsigned int cfa[2][2],int r,int c)40 unsigned fc(const unsigned int cfa[2][2], int r, int c) {
41 return cfa[r & 1][c & 1];
42 }
43 }
44
45 namespace rtengine
46 {
47
amaze_demosaic_RT(int winx,int winy,int winw,int winh,const array2D<float> & rawData,array2D<float> & red,array2D<float> & green,array2D<float> & blue,size_t chunkSize,bool measure)48 void RawImageSource::amaze_demosaic_RT(int winx, int winy, int winw, int winh, const array2D<float> &rawData, array2D<float> &red, array2D<float> &green, array2D<float> &blue, size_t chunkSize, bool measure)
49 {
50
51 std::unique_ptr<StopWatch> stop;
52
53 if (measure) {
54 std::cout << "Demosaicing " << W << "x" << H << " image using AMaZE with " << chunkSize << " Tiles per Thread" << std::endl;
55 stop.reset(new StopWatch("amaze demosaic"));
56 }
57
58 double progress = 0.0;
59
60 if (plistener) {
61 plistener->setProgressStr(Glib::ustring::compose(M("TP_RAW_DMETHOD_PROGRESSBAR"), M("TP_RAW_AMAZE")));
62 plistener->setProgress(progress);
63 }
64
65 const unsigned int cfarray[2][2] = {{FC(0,0), FC(0,1)}, {FC(1,0), FC(1,1)}};
66 const int width = winw, height = winh;
67 const float clip_pt = 1.0 / initialGain;
68 const float clip_pt8 = 0.8 / initialGain;
69
70 // this allows to pass AMAZETS to the code. On some machines larger AMAZETS is faster
71 // If AMAZETS is undefined it will be set to 160, which is the fastest on modern x86/64 machines
72 #ifndef AMAZETS
73 #define AMAZETS 160
74 #endif
75 // Tile size; the image is processed in square tiles to lower memory requirements and facilitate multi-threading
76 // We assure that Tile size is a multiple of 32 in the range [96;992]
77 constexpr int ts = (AMAZETS & 992) < 96 ? 96 : (AMAZETS & 992);
78 constexpr int tsh = ts / 2; // half of Tile size
79
80 //offset of R pixel within a Bayer quartet
81 int ex, ey;
82
83 //determine GRBG coset; (ey,ex) is the offset of the R subarray
84 if (fc(cfarray, 0, 0) == 1) { //first pixel is G
85 if (fc(cfarray, 0, 1) == 0) {
86 ey = 0;
87 ex = 1;
88 } else {
89 ey = 1;
90 ex = 0;
91 }
92 } else {//first pixel is R or B
93 if (fc(cfarray, 0, 0) == 0) {
94 ey = 0;
95 ex = 0;
96 } else {
97 ey = 1;
98 ex = 1;
99 }
100 }
101
102 //shifts of pointer value to access pixels in vertical and diagonal directions
103 constexpr int v1 = ts, v2 = 2 * ts, v3 = 3 * ts, p1 = -ts + 1, p2 = -2 * ts + 2, p3 = -3 * ts + 3, m1 = ts + 1, m2 = 2 * ts + 2, m3 = 3 * ts + 3;
104
105 //tolerance to avoid dividing by zero
106 constexpr float eps = 1e-5, epssq = 1e-10; //tolerance to avoid dividing by zero
107
108 //adaptive ratios threshold
109 constexpr float arthresh = 0.75;
110
111 //gaussian on 5x5 quincunx, sigma=1.2
112 constexpr float gaussodd[4] = {0.14659727707323927f, 0.103592713382435f, 0.0732036125103057f, 0.0365543548389495f};
113 //nyquist texture test threshold
114 constexpr float nyqthresh = 0.5;
115 //gaussian on 5x5, sigma=1.2, multiplied with nyqthresh to save some time later in loop
116 // Is this really sigma=1.2????, seems more like sigma = 1.672
117 constexpr float gaussgrad[6] = {nyqthresh * 0.07384411893421103f, nyqthresh * 0.06207511968171489f, nyqthresh * 0.0521818194747806f,
118 nyqthresh * 0.03687419286733595f, nyqthresh * 0.03099732204057846f, nyqthresh * 0.018413194161458882f
119 };
120 //gaussian on 5x5 alt quincunx, sigma=1.5
121 constexpr float gausseven[2] = {0.13719494435797422f, 0.05640252782101291f};
122 //gaussian on quincunx grid
123 constexpr float gquinc[4] = {0.169917f, 0.108947f, 0.069855f, 0.0287182f};
124
125 typedef struct {
126 float h;
127 float v;
128 } s_hv;
129
130 #ifdef _OPENMP
131 #pragma omp parallel
132 #endif
133 {
134 int progresscounter = 0;
135
136 constexpr int cldf = 2; // factor to multiply cache line distance. 1 = 64 bytes, 2 = 128 bytes ...
137 // assign working space
138 char *buffer = (char *) calloc(14 * sizeof(float) * ts * ts + sizeof(char) * ts * tsh + 18 * cldf * 64 + 63, 1);
139 // aligned to 64 byte boundary
140 char *data = (char*)( ( uintptr_t(buffer) + uintptr_t(63)) / 64 * 64);
141
142 // green values
143 float *rgbgreen = (float (*)) data;
144 // sum of square of horizontal gradient and square of vertical gradient
145 float *delhvsqsum = (float (*)) ((char*)rgbgreen + sizeof(float) * ts * ts + cldf * 64); // 1
146 // gradient based directional weights for interpolation
147 float *dirwts0 = (float (*)) ((char*)delhvsqsum + sizeof(float) * ts * ts + cldf * 64); // 1
148 float *dirwts1 = (float (*)) ((char*)dirwts0 + sizeof(float) * ts * ts + cldf * 64); // 1
149 // vertically interpolated colour differences G-R, G-B
150 float *vcd = (float (*)) ((char*)dirwts1 + sizeof(float) * ts * ts + cldf * 64); // 1
151 // horizontally interpolated colour differences
152 float *hcd = (float (*)) ((char*)vcd + sizeof(float) * ts * ts + cldf * 64); // 1
153 // alternative vertical interpolation
154 float *vcdalt = (float (*)) ((char*)hcd + sizeof(float) * ts * ts + cldf * 64); // 1
155 // alternative horizontal interpolation
156 float *hcdalt = (float (*)) ((char*)vcdalt + sizeof(float) * ts * ts + cldf * 64); // 1
157 // square of average colour difference
158 float *cddiffsq = (float (*)) ((char*)hcdalt + sizeof(float) * ts * ts + cldf * 64); // 1
159 // weight to give horizontal vs vertical interpolation
160 float *hvwt = (float (*)) ((char*)cddiffsq + sizeof(float) * ts * ts + 2 * cldf * 64); // 1
161 // final interpolated colour difference
162 float (*Dgrb)[ts * tsh] = (float (*)[ts * tsh])vcdalt; // there is no overlap in buffer usage => share
163 // gradient in plus (NE/SW) direction
164 float *delp = (float (*))cddiffsq; // there is no overlap in buffer usage => share
165 // gradient in minus (NW/SE) direction
166 float *delm = (float (*)) ((char*)delp + sizeof(float) * ts * tsh + cldf * 64);
167 // diagonal interpolation of R+B
168 float *rbint = (float (*))delm; // there is no overlap in buffer usage => share
169 // horizontal and vertical curvature of interpolated G (used to refine interpolation in Nyquist texture regions)
170 s_hv *Dgrb2 = (s_hv (*)) ((char*)hvwt + sizeof(float) * ts * tsh + cldf * 64); // 1
171 // difference between up/down interpolations of G
172 float *dgintv = (float (*))Dgrb2; // there is no overlap in buffer usage => share
173 // difference between left/right interpolations of G
174 float *dginth = (float (*)) ((char*)dgintv + sizeof(float) * ts * ts + cldf * 64); // 1
175 // square of diagonal colour differences
176 float *Dgrbsq1m = (float (*)) ((char*)dginth + sizeof(float) * ts * ts + cldf * 64); // 1
177 float *Dgrbsq1p = (float (*)) ((char*)Dgrbsq1m + sizeof(float) * ts * tsh + cldf * 64); // 1
178 // tile raw data
179 float *cfa = (float (*)) ((char*)Dgrbsq1p + sizeof(float) * ts * tsh + cldf * 64); // 1
180 // relative weight for combining plus and minus diagonal interpolations
181 float *pmwt = (float (*))delhvsqsum; // there is no overlap in buffer usage => share
182 // interpolated colour difference R-B in minus and plus direction
183 float *rbm = (float (*))vcd; // there is no overlap in buffer usage => share
184 float *rbp = (float (*)) ((char*)rbm + sizeof(float) * ts * tsh + cldf * 64);
185 // nyquist texture flags 1=nyquist, 0=not nyquist
186 unsigned char *nyquist = (unsigned char (*)) ((char*)cfa + sizeof(float) * ts * ts + cldf * 64); // 1
187 unsigned char *nyquist2 = (unsigned char (*))cddiffsq;
188 float *nyqutest = (float(*)) ((char*)nyquist + sizeof(unsigned char) * ts * tsh + cldf * 64); // 1
189
190 // Main algorithm: Tile loop
191 // use collapse(2) to collapse the 2 loops to one large loop, so there is better scaling
192 #ifdef _OPENMP
193 #pragma omp for schedule(dynamic, chunkSize) collapse(2) nowait
194 #endif
195
196 for (int top = winy - 16; top < winy + height; top += ts - 32) {
197 for (int left = winx - 16; left < winx + width; left += ts - 32) {
198 memset(&nyquist[3 * tsh], 0, sizeof(unsigned char) * (ts - 6) * tsh);
199 //location of tile bottom edge
200 int bottom = min(top + ts, winy + height + 16);
201 //location of tile right edge
202 int right = min(left + ts, winx + width + 16);
203 //tile width (=ts except for right edge of image)
204 int rr1 = bottom - top;
205 //tile height (=ts except for bottom edge of image)
206 int cc1 = right - left;
207 // bookkeeping for borders
208 // min and max row/column in the tile
209 int rrmin = top < winy ? 16 : 0;
210 int ccmin = left < winx ? 16 : 0;
211 int rrmax = bottom > (winy + height) ? winy + height - top : rr1;
212 int ccmax = right > (winx + width) ? winx + width - left : cc1;
213
214 // rgb from input CFA data
215 // rgb values should be floating point number between 0 and 1
216 // after white balance multipliers are applied
217 // a 16 pixel border is added to each side of the image
218
219 // begin of tile initialization
220 #ifdef __SSE2__
221 vfloat c65535v = F2V( 65535.f );
222
223 //fill upper border
224 if (rrmin > 0) {
225 for (int rr = 0; rr < 16; rr++) {
226 int row = 32 - rr + top;
227
228 for (int cc = ccmin; cc < ccmax; cc += 4) {
229 int indx1 = rr * ts + cc;
230 vfloat tempv = LVFU(rawData[row][cc + left]) / c65535v;
231 STVF(cfa[indx1], tempv);
232 STVF(rgbgreen[indx1], tempv );
233 }
234 }
235 }
236
237 // fill inner part
238 for (int rr = rrmin; rr < rrmax; rr++) {
239 int row = rr + top;
240 int cc = ccmin;
241 for (; cc < ccmax - 3; cc += 4) {
242 int indx1 = rr * ts + cc;
243 vfloat tempv = LVFU(rawData[row][cc + left]) / c65535v;
244 STVF(cfa[indx1], tempv );
245 STVF(rgbgreen[indx1], tempv );
246 }
247 for (; cc < ccmax; ++cc) {
248 int indx1 = rr * ts + cc;
249 float temp = rawData[row][cc + left] / 65535.f;
250 cfa[indx1] = temp;
251 rgbgreen[indx1] = temp;
252 }
253 }
254
255 //fill lower border
256 if (rrmax < rr1) {
257 for (int rr = 0; rr < 16; rr++)
258 for (int cc = ccmin; cc < ccmax; cc += 4) {
259 int indx1 = (rrmax + rr) * ts + cc;
260 vfloat tempv = LVFU(rawData[(winy + height - rr - 2)][left + cc]) / c65535v;
261 STVF(cfa[indx1], tempv );
262 STVF(rgbgreen[indx1], tempv );
263 }
264 }
265
266 #else
267
268 //fill upper border
269 if (rrmin > 0) {
270 for (int rr = 0; rr < 16; rr++)
271 for (int cc = ccmin, row = 32 - rr + top; cc < ccmax; cc++) {
272 cfa[rr * ts + cc] = (rawData[row][cc + left]) / 65535.f;
273 rgbgreen[rr * ts + cc] = cfa[rr * ts + cc];
274 }
275 }
276
277 // fill inner part
278 for (int rr = rrmin; rr < rrmax; rr++) {
279 int row = rr + top;
280
281 for (int cc = ccmin; cc < ccmax; cc++) {
282 int indx1 = rr * ts + cc;
283 cfa[indx1] = (rawData[row][cc + left]) / 65535.f;
284 rgbgreen[indx1] = cfa[indx1];
285 }
286 }
287
288 //fill lower border
289 if (rrmax < rr1) {
290 for (int rr = 0; rr < 16; rr++)
291 for (int cc = ccmin; cc < ccmax; cc++) {
292 cfa[(rrmax + rr)*ts + cc] = (rawData[(winy + height - rr - 2)][left + cc]) / 65535.f;
293 rgbgreen[(rrmax + rr)*ts + cc] = cfa[(rrmax + rr) * ts + cc];
294 }
295 }
296
297 #endif
298
299 //fill left border
300 if (ccmin > 0) {
301 for (int rr = rrmin; rr < rrmax; rr++)
302 for (int cc = 0, row = rr + top; cc < 16; cc++) {
303 cfa[rr * ts + cc] = (rawData[row][32 - cc + left]) / 65535.f;
304 rgbgreen[rr * ts + cc] = cfa[rr * ts + cc];
305 }
306 }
307
308 //fill right border
309 if (ccmax < cc1) {
310 for (int rr = rrmin; rr < rrmax; rr++)
311 for (int cc = 0; cc < 16; cc++) {
312 cfa[rr * ts + ccmax + cc] = (rawData[(top + rr)][(winx + width - cc - 2)]) / 65535.f;
313 rgbgreen[rr * ts + ccmax + cc] = cfa[rr * ts + ccmax + cc];
314 }
315 }
316
317 //also, fill the image corners
318 if (rrmin > 0 && ccmin > 0) {
319 for (int rr = 0; rr < 16; rr++)
320 for (int cc = 0; cc < 16; cc++) {
321 cfa[(rr)*ts + cc] = (rawData[winy + 32 - rr][winx + 32 - cc]) / 65535.f;
322 rgbgreen[(rr)*ts + cc] = cfa[(rr) * ts + cc];
323 }
324 }
325
326 if (rrmax < rr1 && ccmax < cc1) {
327 for (int rr = 0; rr < 16; rr++)
328 for (int cc = 0; cc < 16; cc++) {
329 cfa[(rrmax + rr)*ts + ccmax + cc] = (rawData[(winy + height - rr - 2)][(winx + width - cc - 2)]) / 65535.f;
330 rgbgreen[(rrmax + rr)*ts + ccmax + cc] = cfa[(rrmax + rr) * ts + ccmax + cc];
331 }
332 }
333
334 if (rrmin > 0 && ccmax < cc1) {
335 for (int rr = 0; rr < 16; rr++)
336 for (int cc = 0; cc < 16; cc++) {
337 cfa[(rr)*ts + ccmax + cc] = (rawData[(winy + 32 - rr)][(winx + width - cc - 2)]) / 65535.f;
338 rgbgreen[(rr)*ts + ccmax + cc] = cfa[(rr) * ts + ccmax + cc];
339 }
340 }
341
342 if (rrmax < rr1 && ccmin > 0) {
343 for (int rr = 0; rr < 16; rr++)
344 for (int cc = 0; cc < 16; cc++) {
345 cfa[(rrmax + rr)*ts + cc] = (rawData[(winy + height - rr - 2)][(winx + 32 - cc)]) / 65535.f;
346 rgbgreen[(rrmax + rr)*ts + cc] = cfa[(rrmax + rr) * ts + cc];
347 }
348 }
349
350 // end of tile initialization
351
352 // horizontal and vertical gradients
353 #ifdef __SSE2__
354 vfloat epsv = F2V( eps );
355
356 for (int rr = 2; rr < rr1 - 2; rr++) {
357 for (int indx = rr * ts; indx < rr * ts + cc1; indx += 4) {
358 vfloat delhv = vabsf( LVFU( cfa[indx + 1] ) - LVFU( cfa[indx - 1] ) );
359 vfloat delvv = vabsf( LVF( cfa[indx + v1] ) - LVF( cfa[indx - v1] ) );
360 STVF(dirwts1[indx], epsv + vabsf( LVFU( cfa[indx + 2] ) - LVF( cfa[indx] )) + vabsf( LVF( cfa[indx] ) - LVFU( cfa[indx - 2] )) + delhv );
361 STVF(dirwts0[indx], epsv + vabsf( LVF( cfa[indx + v2] ) - LVF( cfa[indx] )) + vabsf( LVF( cfa[indx] ) - LVF( cfa[indx - v2] )) + delvv );
362 STVF(delhvsqsum[indx], SQRV(delhv) + SQRV(delvv));
363 }
364 }
365
366 #else
367
368 for (int rr = 2; rr < rr1 - 2; rr++)
369 for (int cc = 2, indx = (rr) * ts + cc; cc < cc1 - 2; cc++, indx++) {
370 float delh = fabsf(cfa[indx + 1] - cfa[indx - 1]);
371 float delv = fabsf(cfa[indx + v1] - cfa[indx - v1]);
372 dirwts0[indx] = eps + fabsf(cfa[indx + v2] - cfa[indx]) + fabsf(cfa[indx] - cfa[indx - v2]) + delv;
373 dirwts1[indx] = eps + fabsf(cfa[indx + 2] - cfa[indx]) + fabsf(cfa[indx] - cfa[indx - 2]) + delh;
374 delhvsqsum[indx] = SQR(delh) + SQR(delv);
375 }
376
377 #endif
378
379 //interpolate vertical and horizontal colour differences
380 #ifdef __SSE2__
381 vfloat sgnv;
382
383 if( !(fc(cfarray, 4, 4) & 1) ) {
384 sgnv = _mm_set_ps( 1.f, -1.f, 1.f, -1.f );
385 } else {
386 sgnv = _mm_set_ps( -1.f, 1.f, -1.f, 1.f );
387 }
388
389 vfloat zd5v = F2V( 0.5f );
390 vfloat onev = F2V( 1.f );
391 vfloat arthreshv = F2V( arthresh );
392 vfloat clip_pt8v = F2V( clip_pt8 );
393
394 for (int rr = 4; rr < rr1 - 4; rr++) {
395 sgnv = -sgnv;
396
397 for (int indx = rr * ts + 4; indx < rr * ts + cc1 - 7; indx += 4) {
398 //colour ratios in each cardinal direction
399 vfloat cfav = LVF(cfa[indx]);
400 vfloat cruv = LVF(cfa[indx - v1]) * (LVF(dirwts0[indx - v2]) + LVF(dirwts0[indx])) / (LVF(dirwts0[indx - v2]) * (epsv + cfav) + LVF(dirwts0[indx]) * (epsv + LVF(cfa[indx - v2])));
401 vfloat crdv = LVF(cfa[indx + v1]) * (LVF(dirwts0[indx + v2]) + LVF(dirwts0[indx])) / (LVF(dirwts0[indx + v2]) * (epsv + cfav) + LVF(dirwts0[indx]) * (epsv + LVF(cfa[indx + v2])));
402 vfloat crlv = LVFU(cfa[indx - 1]) * (LVFU(dirwts1[indx - 2]) + LVF(dirwts1[indx])) / (LVFU(dirwts1[indx - 2]) * (epsv + cfav) + LVF(dirwts1[indx]) * (epsv + LVFU(cfa[indx - 2])));
403 vfloat crrv = LVFU(cfa[indx + 1]) * (LVFU(dirwts1[indx + 2]) + LVF(dirwts1[indx])) / (LVFU(dirwts1[indx + 2]) * (epsv + cfav) + LVF(dirwts1[indx]) * (epsv + LVFU(cfa[indx + 2])));
404
405 //G interpolated in vert/hor directions using Hamilton-Adams method
406 vfloat guhav = LVF(cfa[indx - v1]) + zd5v * (cfav - LVF(cfa[indx - v2]));
407 vfloat gdhav = LVF(cfa[indx + v1]) + zd5v * (cfav - LVF(cfa[indx + v2]));
408 vfloat glhav = LVFU(cfa[indx - 1]) + zd5v * (cfav - LVFU(cfa[indx - 2]));
409 vfloat grhav = LVFU(cfa[indx + 1]) + zd5v * (cfav - LVFU(cfa[indx + 2]));
410
411 //G interpolated in vert/hor directions using adaptive ratios
412 vfloat guarv = vself(vmaskf_lt(vabsf(onev - cruv), arthreshv), cfav * cruv, guhav);
413 vfloat gdarv = vself(vmaskf_lt(vabsf(onev - crdv), arthreshv), cfav * crdv, gdhav);
414 vfloat glarv = vself(vmaskf_lt(vabsf(onev - crlv), arthreshv), cfav * crlv, glhav);
415 vfloat grarv = vself(vmaskf_lt(vabsf(onev - crrv), arthreshv), cfav * crrv, grhav);
416
417 //adaptive weights for vertical/horizontal directions
418 vfloat hwtv = LVFU(dirwts1[indx - 1]) / (LVFU(dirwts1[indx - 1]) + LVFU(dirwts1[indx + 1]));
419 vfloat vwtv = LVF(dirwts0[indx - v1]) / (LVF(dirwts0[indx + v1]) + LVF(dirwts0[indx - v1]));
420
421 //interpolated G via adaptive weights of cardinal evaluations
422 vfloat Ginthhav = vintpf(hwtv, grhav, glhav);
423 vfloat Gintvhav = vintpf(vwtv, gdhav, guhav);
424
425 //interpolated colour differences
426 vfloat hcdaltv = sgnv * (Ginthhav - cfav);
427 vfloat vcdaltv = sgnv * (Gintvhav - cfav);
428 STVF(hcdalt[indx], hcdaltv);
429 STVF(vcdalt[indx], vcdaltv);
430
431 vmask clipmask = vorm( vorm( vmaskf_gt( cfav, clip_pt8v ), vmaskf_gt( Gintvhav, clip_pt8v ) ), vmaskf_gt( Ginthhav, clip_pt8v ));
432 guarv = vself( clipmask, guhav, guarv);
433 gdarv = vself( clipmask, gdhav, gdarv);
434 glarv = vself( clipmask, glhav, glarv);
435 grarv = vself( clipmask, grhav, grarv);
436
437 //use HA if highlights are (nearly) clipped
438 STVF(vcd[indx], vself( clipmask, vcdaltv, sgnv * (vintpf(vwtv, gdarv, guarv) - cfav)));
439 STVF(hcd[indx], vself( clipmask, hcdaltv, sgnv * (vintpf(hwtv, grarv, glarv) - cfav)));
440
441 //differences of interpolations in opposite directions
442 STVF(dgintv[indx], vminf(SQRV(guhav - gdhav), SQRV(guarv - gdarv)));
443 STVF(dginth[indx], vminf(SQRV(glhav - grhav), SQRV(glarv - grarv)));
444 }
445 }
446
447 #else
448
449 for (int rr = 4; rr < rr1 - 4; rr++) {
450 bool fcswitch = fc(cfarray, rr, 4) & 1;
451
452 for (int cc = 4, indx = rr * ts + cc; cc < cc1 - 4; cc++, indx++) {
453
454 //colour ratios in each cardinal direction
455 float cru = cfa[indx - v1] * (dirwts0[indx - v2] + dirwts0[indx]) / (dirwts0[indx - v2] * (eps + cfa[indx]) + dirwts0[indx] * (eps + cfa[indx - v2]));
456 float crd = cfa[indx + v1] * (dirwts0[indx + v2] + dirwts0[indx]) / (dirwts0[indx + v2] * (eps + cfa[indx]) + dirwts0[indx] * (eps + cfa[indx + v2]));
457 float crl = cfa[indx - 1] * (dirwts1[indx - 2] + dirwts1[indx]) / (dirwts1[indx - 2] * (eps + cfa[indx]) + dirwts1[indx] * (eps + cfa[indx - 2]));
458 float crr = cfa[indx + 1] * (dirwts1[indx + 2] + dirwts1[indx]) / (dirwts1[indx + 2] * (eps + cfa[indx]) + dirwts1[indx] * (eps + cfa[indx + 2]));
459
460 //G interpolated in vert/hor directions using Hamilton-Adams method
461 float guha = cfa[indx - v1] + xdiv2f(cfa[indx] - cfa[indx - v2]);
462 float gdha = cfa[indx + v1] + xdiv2f(cfa[indx] - cfa[indx + v2]);
463 float glha = cfa[indx - 1] + xdiv2f(cfa[indx] - cfa[indx - 2]);
464 float grha = cfa[indx + 1] + xdiv2f(cfa[indx] - cfa[indx + 2]);
465
466 //G interpolated in vert/hor directions using adaptive ratios
467 float guar, gdar, glar, grar;
468
469 if (fabsf(1.f - cru) < arthresh) {
470 guar = cfa[indx] * cru;
471 } else {
472 guar = guha;
473 }
474
475 if (fabsf(1.f - crd) < arthresh) {
476 gdar = cfa[indx] * crd;
477 } else {
478 gdar = gdha;
479 }
480
481 if (fabsf(1.f - crl) < arthresh) {
482 glar = cfa[indx] * crl;
483 } else {
484 glar = glha;
485 }
486
487 if (fabsf(1.f - crr) < arthresh) {
488 grar = cfa[indx] * crr;
489 } else {
490 grar = grha;
491 }
492
493 //adaptive weights for vertical/horizontal directions
494 float hwt = dirwts1[indx - 1] / (dirwts1[indx - 1] + dirwts1[indx + 1]);
495 float vwt = dirwts0[indx - v1] / (dirwts0[indx + v1] + dirwts0[indx - v1]);
496
497 //interpolated G via adaptive weights of cardinal evaluations
498 float Gintvha = vwt * gdha + (1.f - vwt) * guha;
499 float Ginthha = hwt * grha + (1.f - hwt) * glha;
500
501 //interpolated colour differences
502 if (fcswitch) {
503 vcd[indx] = cfa[indx] - (vwt * gdar + (1.f - vwt) * guar);
504 hcd[indx] = cfa[indx] - (hwt * grar + (1.f - hwt) * glar);
505 vcdalt[indx] = cfa[indx] - Gintvha;
506 hcdalt[indx] = cfa[indx] - Ginthha;
507 } else {
508 //interpolated colour differences
509 vcd[indx] = (vwt * gdar + (1.f - vwt) * guar) - cfa[indx];
510 hcd[indx] = (hwt * grar + (1.f - hwt) * glar) - cfa[indx];
511 vcdalt[indx] = Gintvha - cfa[indx];
512 hcdalt[indx] = Ginthha - cfa[indx];
513 }
514
515 fcswitch = !fcswitch;
516
517 if (cfa[indx] > clip_pt8 || Gintvha > clip_pt8 || Ginthha > clip_pt8) {
518 //use HA if highlights are (nearly) clipped
519 guar = guha;
520 gdar = gdha;
521 glar = glha;
522 grar = grha;
523 vcd[indx] = vcdalt[indx];
524 hcd[indx] = hcdalt[indx];
525 }
526
527 //differences of interpolations in opposite directions
528 dgintv[indx] = min(SQR(guha - gdha), SQR(guar - gdar));
529 dginth[indx] = min(SQR(glha - grha), SQR(glar - grar));
530
531 }
532 }
533
534 #endif
535
536
537
538 #ifdef __SSE2__
539 vfloat clip_ptv = F2V( clip_pt );
540 vfloat sgn3v;
541
542 if( !(fc(cfarray, 4, 4) & 1) ) {
543 sgnv = _mm_set_ps( 1.f, -1.f, 1.f, -1.f );
544 } else {
545 sgnv = _mm_set_ps( -1.f, 1.f, -1.f, 1.f );
546 }
547
548 sgn3v = sgnv + sgnv + sgnv;
549
550 for (int rr = 4; rr < rr1 - 4; rr++) {
551 vfloat nsgnv = sgnv;
552 sgnv = -sgnv;
553 sgn3v = -sgn3v;
554
555 for (int indx = rr * ts + 4; indx < rr * ts + cc1 - 4; indx += 4) {
556 vfloat hcdv = LVF( hcd[indx] );
557 vfloat hcdvarv = SQRV(LVFU(hcd[indx - 2]) - hcdv) + SQRV(LVFU(hcd[indx - 2]) - LVFU(hcd[indx + 2])) + SQRV(hcdv - LVFU(hcd[indx + 2]));
558 vfloat hcdaltv = LVF( hcdalt[indx] );
559 vfloat hcdaltvarv = SQRV(LVFU(hcdalt[indx - 2]) - hcdaltv) + SQRV(LVFU(hcdalt[indx - 2]) - LVFU(hcdalt[indx + 2])) + SQRV(hcdaltv - LVFU(hcdalt[indx + 2]));
560 vfloat vcdv = LVF( vcd[indx] );
561 vfloat vcdvarv = SQRV(LVF(vcd[indx - v2]) - vcdv) + SQRV(LVF(vcd[indx - v2]) - LVF(vcd[indx + v2])) + SQRV(vcdv - LVF(vcd[indx + v2]));
562 vfloat vcdaltv = LVF( vcdalt[indx] );
563 vfloat vcdaltvarv = SQRV(LVF(vcdalt[indx - v2]) - vcdaltv) + SQRV(LVF(vcdalt[indx - v2]) - LVF(vcdalt[indx + v2])) + SQRV(vcdaltv - LVF(vcdalt[indx + v2]));
564
565 //choose the smallest variance; this yields a smoother interpolation
566 hcdv = vself( vmaskf_lt( hcdaltvarv, hcdvarv ), hcdaltv, hcdv);
567 vcdv = vself( vmaskf_lt( vcdaltvarv, vcdvarv ), vcdaltv, vcdv);
568
569 //bound the interpolation in regions of high saturation
570 //vertical and horizontal G interpolations
571 vfloat Ginthv = sgnv * hcdv + LVF( cfa[indx] );
572 vfloat temp2v = sgn3v * hcdv;
573 vfloat hwtv = onev + temp2v / ( epsv + Ginthv + LVF( cfa[indx]));
574 vmask hcdmask = vmaskf_gt( nsgnv * hcdv, ZEROV );
575 vfloat hcdoldv = hcdv;
576 vfloat tempv = nsgnv * (LVF(cfa[indx]) - median( Ginthv, LVFU(cfa[indx - 1]), LVFU(cfa[indx + 1]) ));
577 hcdv = vself( vmaskf_lt( temp2v, -(LVF(cfa[indx]) + Ginthv)), tempv, vintpf(hwtv, hcdv, tempv));
578 hcdv = vself( hcdmask, hcdv, hcdoldv );
579 hcdv = vself( vmaskf_gt( Ginthv, clip_ptv), tempv, hcdv);
580 STVF(hcd[indx], hcdv);
581
582 vfloat Gintvv = sgnv * vcdv + LVF( cfa[indx] );
583 temp2v = sgn3v * vcdv;
584 vfloat vwtv = onev + temp2v / ( epsv + Gintvv + LVF( cfa[indx]));
585 vmask vcdmask = vmaskf_gt( nsgnv * vcdv, ZEROV );
586 vfloat vcdoldv = vcdv;
587 tempv = nsgnv * (LVF(cfa[indx]) - median( Gintvv, LVF(cfa[indx - v1]), LVF(cfa[indx + v1]) ));
588 vcdv = vself( vmaskf_lt( temp2v, -(LVF(cfa[indx]) + Gintvv)), tempv, vintpf(vwtv, vcdv, tempv));
589 vcdv = vself( vcdmask, vcdv, vcdoldv );
590 vcdv = vself( vmaskf_gt( Gintvv, clip_ptv), tempv, vcdv);
591 STVF(vcd[indx], vcdv);
592 STVFU(cddiffsq[indx], SQRV(vcdv - hcdv));
593 }
594
595 }
596
597 #else
598
599 for (int rr = 4; rr < rr1 - 4; rr++) {
600 for (int cc = 4, indx = rr * ts + cc, c = fc(cfarray, rr, cc) & 1; cc < cc1 - 4; cc++, indx++) {
601 float hcdvar = 3.f * (SQR(hcd[indx - 2]) + SQR(hcd[indx]) + SQR(hcd[indx + 2])) - SQR(hcd[indx - 2] + hcd[indx] + hcd[indx + 2]);
602 float hcdaltvar = 3.f * (SQR(hcdalt[indx - 2]) + SQR(hcdalt[indx]) + SQR(hcdalt[indx + 2])) - SQR(hcdalt[indx - 2] + hcdalt[indx] + hcdalt[indx + 2]);
603 float vcdvar = 3.f * (SQR(vcd[indx - v2]) + SQR(vcd[indx]) + SQR(vcd[indx + v2])) - SQR(vcd[indx - v2] + vcd[indx] + vcd[indx + v2]);
604 float vcdaltvar = 3.f * (SQR(vcdalt[indx - v2]) + SQR(vcdalt[indx]) + SQR(vcdalt[indx + v2])) - SQR(vcdalt[indx - v2] + vcdalt[indx] + vcdalt[indx + v2]);
605
606 //choose the smallest variance; this yields a smoother interpolation
607 if (hcdaltvar < hcdvar) {
608 hcd[indx] = hcdalt[indx];
609 }
610
611 if (vcdaltvar < vcdvar) {
612 vcd[indx] = vcdalt[indx];
613 }
614
615 //bound the interpolation in regions of high saturation
616 //vertical and horizontal G interpolations
617 float Gintv, Ginth;
618
619 if (c) {//G site
620 Ginth = -hcd[indx] + cfa[indx]; //R or B
621 Gintv = -vcd[indx] + cfa[indx]; //B or R
622
623 if (hcd[indx] > 0) {
624 if (3.f * hcd[indx] > (Ginth + cfa[indx])) {
625 hcd[indx] = -median(Ginth, cfa[indx - 1], cfa[indx + 1]) + cfa[indx];
626 } else {
627 float hwt = 1.f - 3.f * hcd[indx] / (eps + Ginth + cfa[indx]);
628 hcd[indx] = hwt * hcd[indx] + (1.f - hwt) * (-median(Ginth, cfa[indx - 1], cfa[indx + 1]) + cfa[indx]);
629 }
630 }
631
632 if (vcd[indx] > 0) {
633 if (3.f * vcd[indx] > (Gintv + cfa[indx])) {
634 vcd[indx] = -median(Gintv, cfa[indx - v1], cfa[indx + v1]) + cfa[indx];
635 } else {
636 float vwt = 1.f - 3.f * vcd[indx] / (eps + Gintv + cfa[indx]);
637 vcd[indx] = vwt * vcd[indx] + (1.f - vwt) * (-median(Gintv, cfa[indx - v1], cfa[indx + v1]) + cfa[indx]);
638 }
639 }
640
641 if (Ginth > clip_pt) {
642 hcd[indx] = -median(Ginth, cfa[indx - 1], cfa[indx + 1]) + cfa[indx];
643 }
644
645 if (Gintv > clip_pt) {
646 vcd[indx] = -median(Gintv, cfa[indx - v1], cfa[indx + v1]) + cfa[indx];
647 }
648
649
650 } else {//R or B site
651
652 Ginth = hcd[indx] + cfa[indx]; //interpolated G
653 Gintv = vcd[indx] + cfa[indx];
654
655 if (hcd[indx] < 0) {
656 if (3.f * hcd[indx] < -(Ginth + cfa[indx])) {
657 hcd[indx] = median(Ginth, cfa[indx - 1], cfa[indx + 1]) - cfa[indx];
658 } else {
659 float hwt = 1.f + 3.f * hcd[indx] / (eps + Ginth + cfa[indx]);
660 hcd[indx] = hwt * hcd[indx] + (1.f - hwt) * (median(Ginth, cfa[indx - 1], cfa[indx + 1]) - cfa[indx]);
661 }
662 }
663
664 if (vcd[indx] < 0) {
665 if (3.f * vcd[indx] < -(Gintv + cfa[indx])) {
666 vcd[indx] = median(Gintv, cfa[indx - v1], cfa[indx + v1]) - cfa[indx];
667 } else {
668 float vwt = 1.f + 3.f * vcd[indx] / (eps + Gintv + cfa[indx]);
669 vcd[indx] = vwt * vcd[indx] + (1.f - vwt) * (median(Gintv, cfa[indx - v1], cfa[indx + v1]) - cfa[indx]);
670 }
671 }
672
673 if (Ginth > clip_pt) {
674 hcd[indx] = median(Ginth, cfa[indx - 1], cfa[indx + 1]) - cfa[indx];
675 }
676
677 if (Gintv > clip_pt) {
678 vcd[indx] = median(Gintv, cfa[indx - v1], cfa[indx + v1]) - cfa[indx];
679 }
680
681 cddiffsq[indx] = SQR(vcd[indx] - hcd[indx]);
682 }
683
684 c = !c;
685 }
686 }
687
688 #endif
689
690
691
692 #ifdef __SSE2__
693 vfloat epssqv = F2V( epssq );
694
695 for (int rr = 6; rr < rr1 - 6; rr++) {
696 for (int indx = rr * ts + 6 + (fc(cfarray, rr, 2) & 1); indx < rr * ts + cc1 - 6; indx += 8) {
697 //compute colour difference variances in cardinal directions
698 vfloat tempv = LC2VFU(vcd[indx]);
699 vfloat uavev = tempv + LC2VFU(vcd[indx - v1]) + LC2VFU(vcd[indx - v2]) + LC2VFU(vcd[indx - v3]);
700 vfloat davev = tempv + LC2VFU(vcd[indx + v1]) + LC2VFU(vcd[indx + v2]) + LC2VFU(vcd[indx + v3]);
701 vfloat Dgrbvvaruv = SQRV(tempv - uavev) + SQRV(LC2VFU(vcd[indx - v1]) - uavev) + SQRV(LC2VFU(vcd[indx - v2]) - uavev) + SQRV(LC2VFU(vcd[indx - v3]) - uavev);
702 vfloat Dgrbvvardv = SQRV(tempv - davev) + SQRV(LC2VFU(vcd[indx + v1]) - davev) + SQRV(LC2VFU(vcd[indx + v2]) - davev) + SQRV(LC2VFU(vcd[indx + v3]) - davev);
703
704 vfloat hwtv = vadivapb(LC2VFU(dirwts1[indx - 1]), LC2VFU(dirwts1[indx + 1]));
705 vfloat vwtv = vadivapb(LC2VFU(dirwts0[indx - v1]), LC2VFU(dirwts0[indx + v1]));
706
707 tempv = LC2VFU(hcd[indx]);
708 vfloat lavev = tempv + vaddc2vfu(hcd[indx - 3]) + LC2VFU(hcd[indx - 1]);
709 vfloat ravev = tempv + vaddc2vfu(hcd[indx + 1]) + LC2VFU(hcd[indx + 3]);
710
711 vfloat Dgrbhvarlv = SQRV(tempv - lavev) + SQRV(LC2VFU(hcd[indx - 1]) - lavev) + SQRV(LC2VFU(hcd[indx - 2]) - lavev) + SQRV(LC2VFU(hcd[indx - 3]) - lavev);
712 vfloat Dgrbhvarrv = SQRV(tempv - ravev) + SQRV(LC2VFU(hcd[indx + 1]) - ravev) + SQRV(LC2VFU(hcd[indx + 2]) - ravev) + SQRV(LC2VFU(hcd[indx + 3]) - ravev);
713
714
715 vfloat vcdvarv = epssqv + vintpf(vwtv, Dgrbvvardv, Dgrbvvaruv);
716 vfloat hcdvarv = epssqv + vintpf(hwtv, Dgrbhvarrv, Dgrbhvarlv);
717
718 //compute fluctuations in up/down and left/right interpolations of colours
719 Dgrbvvaruv = LC2VFU(dgintv[indx - v1]) + LC2VFU(dgintv[indx - v2]);
720 Dgrbvvardv = LC2VFU(dgintv[indx + v1]) + LC2VFU(dgintv[indx + v2]);
721
722 Dgrbhvarlv = vaddc2vfu(dginth[indx - 2]);
723 Dgrbhvarrv = vaddc2vfu(dginth[indx + 1]);
724
725 vfloat vcdvar1v = epssqv + LC2VFU(dgintv[indx]) + vintpf(vwtv, Dgrbvvardv, Dgrbvvaruv);
726 vfloat hcdvar1v = epssqv + LC2VFU(dginth[indx]) + vintpf(hwtv, Dgrbhvarrv, Dgrbhvarlv);
727
728 //determine adaptive weights for G interpolation
729 vfloat varwtv = hcdvarv / (vcdvarv + hcdvarv);
730 vfloat diffwtv = hcdvar1v / (vcdvar1v + hcdvar1v);
731
732 //if both agree on interpolation direction, choose the one with strongest directional discrimination;
733 //otherwise, choose the u/d and l/r difference fluctuation weights
734 vmask decmask = vandm( vmaskf_gt( (zd5v - varwtv) * (zd5v - diffwtv), ZEROV ), vmaskf_lt( vabsf( zd5v - diffwtv), vabsf( zd5v - varwtv) ) );
735 STVFU(hvwt[indx >> 1], vself( decmask, varwtv, diffwtv));
736 }
737 }
738
739 #else
740
741 for (int rr = 6; rr < rr1 - 6; rr++) {
742 for (int cc = 6 + (fc(cfarray, rr, 2) & 1), indx = rr * ts + cc; cc < cc1 - 6; cc += 2, indx += 2) {
743
744 //compute colour difference variances in cardinal directions
745
746 float uave = vcd[indx] + vcd[indx - v1] + vcd[indx - v2] + vcd[indx - v3];
747 float dave = vcd[indx] + vcd[indx + v1] + vcd[indx + v2] + vcd[indx + v3];
748 float lave = hcd[indx] + hcd[indx - 1] + hcd[indx - 2] + hcd[indx - 3];
749 float rave = hcd[indx] + hcd[indx + 1] + hcd[indx + 2] + hcd[indx + 3];
750
751 //colour difference (G-R or G-B) variance in up/down/left/right directions
752 float Dgrbvvaru = SQR(vcd[indx] - uave) + SQR(vcd[indx - v1] - uave) + SQR(vcd[indx - v2] - uave) + SQR(vcd[indx - v3] - uave);
753 float Dgrbvvard = SQR(vcd[indx] - dave) + SQR(vcd[indx + v1] - dave) + SQR(vcd[indx + v2] - dave) + SQR(vcd[indx + v3] - dave);
754 float Dgrbhvarl = SQR(hcd[indx] - lave) + SQR(hcd[indx - 1] - lave) + SQR(hcd[indx - 2] - lave) + SQR(hcd[indx - 3] - lave);
755 float Dgrbhvarr = SQR(hcd[indx] - rave) + SQR(hcd[indx + 1] - rave) + SQR(hcd[indx + 2] - rave) + SQR(hcd[indx + 3] - rave);
756
757 float hwt = dirwts1[indx - 1] / (dirwts1[indx - 1] + dirwts1[indx + 1]);
758 float vwt = dirwts0[indx - v1] / (dirwts0[indx + v1] + dirwts0[indx - v1]);
759
760 float vcdvar = epssq + vwt * Dgrbvvard + (1.f - vwt) * Dgrbvvaru;
761 float hcdvar = epssq + hwt * Dgrbhvarr + (1.f - hwt) * Dgrbhvarl;
762
763 //compute fluctuations in up/down and left/right interpolations of colours
764 Dgrbvvaru = (dgintv[indx]) + (dgintv[indx - v1]) + (dgintv[indx - v2]);
765 Dgrbvvard = (dgintv[indx]) + (dgintv[indx + v1]) + (dgintv[indx + v2]);
766 Dgrbhvarl = (dginth[indx]) + (dginth[indx - 1]) + (dginth[indx - 2]);
767 Dgrbhvarr = (dginth[indx]) + (dginth[indx + 1]) + (dginth[indx + 2]);
768
769 float vcdvar1 = epssq + vwt * Dgrbvvard + (1.f - vwt) * Dgrbvvaru;
770 float hcdvar1 = epssq + hwt * Dgrbhvarr + (1.f - hwt) * Dgrbhvarl;
771
772 //determine adaptive weights for G interpolation
773 float varwt = hcdvar / (vcdvar + hcdvar);
774 float diffwt = hcdvar1 / (vcdvar1 + hcdvar1);
775
776 //if both agree on interpolation direction, choose the one with strongest directional discrimination;
777 //otherwise, choose the u/d and l/r difference fluctuation weights
778 if ((0.5f - varwt) * (0.5f - diffwt) > 0.f && fabsf(0.5f - diffwt) < fabsf(0.5f - varwt)) {
779 hvwt[indx >> 1] = varwt;
780 } else {
781 hvwt[indx >> 1] = diffwt;
782 }
783 }
784 }
785
786 #endif
787
788 #ifdef __SSE2__
789 vfloat gaussg0 = F2V(gaussgrad[0]);
790 vfloat gaussg1 = F2V(gaussgrad[1]);
791 vfloat gaussg2 = F2V(gaussgrad[2]);
792 vfloat gaussg3 = F2V(gaussgrad[3]);
793 vfloat gaussg4 = F2V(gaussgrad[4]);
794 vfloat gaussg5 = F2V(gaussgrad[5]);
795 vfloat gausso0 = F2V(gaussodd[0]);
796 vfloat gausso1 = F2V(gaussodd[1]);
797 vfloat gausso2 = F2V(gaussodd[2]);
798 vfloat gausso3 = F2V(gaussodd[3]);
799
800 #endif
801
802 // precompute nyquist
803 for (int rr = 6; rr < rr1 - 6; rr++) {
804 int cc = 6 + (fc(cfarray, rr, 2) & 1);
805 int indx = rr * ts + cc;
806
807 #ifdef __SSE2__
808
809 for (; cc < cc1 - 7; cc += 8, indx += 8) {
810 vfloat valv = (gausso0 * LC2VFU(cddiffsq[indx]) +
811 gausso1 * (LC2VFU(cddiffsq[(indx - m1)]) + LC2VFU(cddiffsq[(indx + p1)]) +
812 LC2VFU(cddiffsq[(indx - p1)]) + LC2VFU(cddiffsq[(indx + m1)])) +
813 gausso2 * (LC2VFU(cddiffsq[(indx - v2)]) + LC2VFU(cddiffsq[(indx - 2)]) +
814 LC2VFU(cddiffsq[(indx + 2)]) + LC2VFU(cddiffsq[(indx + v2)])) +
815 gausso3 * (LC2VFU(cddiffsq[(indx - m2)]) + LC2VFU(cddiffsq[(indx + p2)]) +
816 LC2VFU(cddiffsq[(indx - p2)]) + LC2VFU(cddiffsq[(indx + m2)]))) -
817 (gaussg0 * LC2VFU(delhvsqsum[indx]) +
818 gaussg1 * (LC2VFU(delhvsqsum[indx - v1]) + LC2VFU(delhvsqsum[indx - 1]) +
819 LC2VFU(delhvsqsum[indx + 1]) + LC2VFU(delhvsqsum[indx + v1])) +
820 gaussg2 * (LC2VFU(delhvsqsum[indx - m1]) + LC2VFU(delhvsqsum[indx + p1]) +
821 LC2VFU(delhvsqsum[indx - p1]) + LC2VFU(delhvsqsum[indx + m1])) +
822 gaussg3 * (LC2VFU(delhvsqsum[indx - v2]) + LC2VFU(delhvsqsum[indx - 2]) +
823 LC2VFU(delhvsqsum[indx + 2]) + LC2VFU(delhvsqsum[indx + v2])) +
824 gaussg4 * (LC2VFU(delhvsqsum[indx - v2 - 1]) + LC2VFU(delhvsqsum[indx - v2 + 1]) +
825 LC2VFU(delhvsqsum[indx - ts - 2]) + LC2VFU(delhvsqsum[indx - ts + 2]) +
826 LC2VFU(delhvsqsum[indx + ts - 2]) + LC2VFU(delhvsqsum[indx + ts + 2]) +
827 LC2VFU(delhvsqsum[indx + v2 - 1]) + LC2VFU(delhvsqsum[indx + v2 + 1])) +
828 gaussg5 * (LC2VFU(delhvsqsum[indx - m2]) + LC2VFU(delhvsqsum[indx + p2]) +
829 LC2VFU(delhvsqsum[indx - p2]) + LC2VFU(delhvsqsum[indx + m2])));
830 STVFU(nyqutest[indx >> 1], valv);
831
832 }
833
834 #endif
835
836 for (; cc < cc1 - 6; cc += 2, indx += 2) {
837 nyqutest[indx >> 1] = (gaussodd[0] * cddiffsq[indx] +
838 gaussodd[1] * (cddiffsq[(indx - m1)] + cddiffsq[(indx + p1)] +
839 cddiffsq[(indx - p1)] + cddiffsq[(indx + m1)]) +
840 gaussodd[2] * (cddiffsq[(indx - v2)] + cddiffsq[(indx - 2)] +
841 cddiffsq[(indx + 2)] + cddiffsq[(indx + v2)]) +
842 gaussodd[3] * (cddiffsq[(indx - m2)] + cddiffsq[(indx + p2)] +
843 cddiffsq[(indx - p2)] + cddiffsq[(indx + m2)])) -
844 (gaussgrad[0] * delhvsqsum[indx] +
845 gaussgrad[1] * (delhvsqsum[indx - v1] + delhvsqsum[indx + 1] +
846 delhvsqsum[indx - 1] + delhvsqsum[indx + v1]) +
847 gaussgrad[2] * (delhvsqsum[indx - m1] + delhvsqsum[indx + p1] +
848 delhvsqsum[indx - p1] + delhvsqsum[indx + m1]) +
849 gaussgrad[3] * (delhvsqsum[indx - v2] + delhvsqsum[indx - 2] +
850 delhvsqsum[indx + 2] + delhvsqsum[indx + v2]) +
851 gaussgrad[4] * (delhvsqsum[indx - v2 - 1] + delhvsqsum[indx - v2 + 1] +
852 delhvsqsum[indx - ts - 2] + delhvsqsum[indx - ts + 2] +
853 delhvsqsum[indx + ts - 2] + delhvsqsum[indx + ts + 2] +
854 delhvsqsum[indx + v2 - 1] + delhvsqsum[indx + v2 + 1]) +
855 gaussgrad[5] * (delhvsqsum[indx - m2] + delhvsqsum[indx + p2] +
856 delhvsqsum[indx - p2] + delhvsqsum[indx + m2]));
857 }
858 }
859
860 // Nyquist test
861 int nystartrow = 0;
862 int nyendrow = 0;
863 int nystartcol = ts + 1;
864 int nyendcol = 0;
865
866 for (int rr = 6; rr < rr1 - 6; rr++) {
867 for (int cc = 6 + (fc(cfarray, rr, 2) & 1), indx = rr * ts + cc; cc < cc1 - 6; cc += 2, indx += 2) {
868
869 //nyquist texture test: ask if difference of vcd compared to hcd is larger or smaller than RGGB gradients
870 if(nyqutest[indx >> 1] > 0.f) {
871 nyquist[indx >> 1] = 1; //nyquist=1 for nyquist region
872 nystartrow = nystartrow ? nystartrow : rr;
873 nyendrow = rr;
874 nystartcol = nystartcol > cc ? cc : nystartcol;
875 nyendcol = nyendcol < cc ? cc : nyendcol;
876 }
877 }
878 }
879
880
881 bool doNyquist = nystartrow != nyendrow && nystartcol != nyendcol;
882
883 if(doNyquist) {
884 nyendrow ++; // because of < condition
885 nyendcol ++; // because of < condition
886 nystartcol -= (nystartcol & 1);
887 nystartrow = std::max(8, nystartrow);
888 nyendrow = std::min(rr1 - 8, nyendrow);
889 nystartcol = std::max(8, nystartcol);
890 nyendcol = std::min(cc1 - 8, nyendcol);
891 memset(&nyquist2[4 * tsh], 0, sizeof(char) * (ts - 8) * tsh);
892
893 #ifdef __SSE2__
894 vint fourvb = _mm_set1_epi8(4);
895 vint onevb = _mm_set1_epi8(1);
896
897 #endif
898
899 for (int rr = nystartrow; rr < nyendrow; rr++) {
900 #ifdef __SSE2__
901
902 for (int indx = rr * ts; indx < rr * ts + cc1; indx += 32) {
903 vint nyquisttemp1v = _mm_adds_epi8(_mm_load_si128((vint*)&nyquist[(indx - v2) >> 1]), _mm_loadu_si128((vint*)&nyquist[(indx - m1) >> 1]));
904 vint nyquisttemp2v = _mm_adds_epi8(_mm_loadu_si128((vint*)&nyquist[(indx + p1) >> 1]), _mm_loadu_si128((vint*)&nyquist[(indx - 2) >> 1]));
905 vint nyquisttemp3v = _mm_adds_epi8(_mm_loadu_si128((vint*)&nyquist[(indx + 2) >> 1]), _mm_loadu_si128((vint*)&nyquist[(indx - p1) >> 1]));
906 vint valv = _mm_load_si128((vint*)&nyquist[indx >> 1]);
907 vint nyquisttemp4v = _mm_adds_epi8(_mm_loadu_si128((vint*)&nyquist[(indx + m1) >> 1]), _mm_load_si128((vint*)&nyquist[(indx + v2) >> 1]));
908 nyquisttemp1v = _mm_adds_epi8(nyquisttemp1v, nyquisttemp3v);
909 nyquisttemp2v = _mm_adds_epi8(nyquisttemp2v, nyquisttemp4v);
910 nyquisttemp1v = _mm_adds_epi8(nyquisttemp1v, nyquisttemp2v);
911 valv = vselc(_mm_cmpgt_epi8(nyquisttemp1v, fourvb), onevb, valv);
912 valv = vselinotzero(_mm_cmplt_epi8(nyquisttemp1v, fourvb), valv);
913 _mm_store_si128((vint*)&nyquist2[indx >> 1], valv);
914 }
915
916 #else
917
918 for (int indx = rr * ts + nystartcol + (fc(cfarray, rr, 2) & 1); indx < rr * ts + nyendcol; indx += 2) {
919 unsigned int nyquisttemp = (nyquist[(indx - v2) >> 1] + nyquist[(indx - m1) >> 1] + nyquist[(indx + p1) >> 1] +
920 nyquist[(indx - 2) >> 1] + nyquist[(indx + 2) >> 1] +
921 nyquist[(indx - p1) >> 1] + nyquist[(indx + m1) >> 1] + nyquist[(indx + v2) >> 1]);
922 //if most of your neighbours are named Nyquist, it's likely that you're one too, or not
923 nyquist2[indx >> 1] = nyquisttemp > 4 ? 1 : (nyquisttemp < 4 ? 0 : nyquist[indx >> 1]);
924 }
925
926 #endif
927 }
928
929 // end of Nyquist test
930
931 // in areas of Nyquist texture, do area interpolation
932 for (int rr = nystartrow; rr < nyendrow; rr++)
933 for (int indx = rr * ts + nystartcol + (fc(cfarray, rr, 2) & 1); indx < rr * ts + nyendcol; indx += 2) {
934
935 if (nyquist2[indx >> 1]) {
936 // area interpolation
937
938 float sumcfa = 0.f, sumh = 0.f, sumv = 0.f, sumsqh = 0.f, sumsqv = 0.f, areawt = 0.f;
939
940 for (int i = -6; i < 7; i += 2) {
941 int indx1 = indx + (i * ts) - 6;
942
943 for (int j = -6; j < 7; j += 2, indx1 += 2) {
944 if (nyquist2[indx1 >> 1]) {
945 float cfatemp = cfa[indx1];
946 sumcfa += cfatemp;
947 sumh += (cfa[indx1 - 1] + cfa[indx1 + 1]);
948 sumv += (cfa[indx1 - v1] + cfa[indx1 + v1]);
949 sumsqh += SQR(cfatemp - cfa[indx1 - 1]) + SQR(cfatemp - cfa[indx1 + 1]);
950 sumsqv += SQR(cfatemp - cfa[indx1 - v1]) + SQR(cfatemp - cfa[indx1 + v1]);
951 areawt += 1;
952 }
953 }
954 }
955
956 //horizontal and vertical colour differences, and adaptive weight
957 sumh = sumcfa - xdiv2f(sumh);
958 sumv = sumcfa - xdiv2f(sumv);
959 areawt = xdiv2f(areawt);
960 float hcdvar = epssq + fabsf(areawt * sumsqh - sumh * sumh);
961 float vcdvar = epssq + fabsf(areawt * sumsqv - sumv * sumv);
962 hvwt[indx >> 1] = hcdvar / (vcdvar + hcdvar);
963
964 // end of area interpolation
965
966 }
967 }
968 }
969
970
971 //populate G at R/B sites
972 for (int rr = 8; rr < rr1 - 8; rr++)
973 for (int indx = rr * ts + 8 + (fc(cfarray, rr, 2) & 1); indx < rr * ts + cc1 - 8; indx += 2) {
974
975 //first ask if one gets more directional discrimination from nearby B/R sites
976 float hvwtalt = xdivf(hvwt[(indx - m1) >> 1] + hvwt[(indx + p1) >> 1] + hvwt[(indx - p1) >> 1] + hvwt[(indx + m1) >> 1], 2);
977
978 hvwt[indx >> 1] = fabsf(0.5f - hvwt[indx >> 1]) < fabsf(0.5f - hvwtalt) ? hvwtalt : hvwt[indx >> 1];
979 //a better result was obtained from the neighbours
980
981 Dgrb[0][indx >> 1] = intp(hvwt[indx >> 1], vcd[indx], hcd[indx]); //evaluate colour differences
982
983 rgbgreen[indx] = cfa[indx] + Dgrb[0][indx >> 1]; //evaluate G (finally!)
984
985 //local curvature in G (preparation for nyquist refinement step)
986 Dgrb2[indx >> 1].h = nyquist2[indx >> 1] ? SQR(rgbgreen[indx] - xdiv2f(rgbgreen[indx - 1] + rgbgreen[indx + 1])) : 0.f;
987 Dgrb2[indx >> 1].v = nyquist2[indx >> 1] ? SQR(rgbgreen[indx] - xdiv2f(rgbgreen[indx - v1] + rgbgreen[indx + v1])) : 0.f;
988 }
989
990
991 //end of standard interpolation
992
993 // refine Nyquist areas using G curvatures
994 if(doNyquist) {
995 for (int rr = nystartrow; rr < nyendrow; rr++)
996 for (int indx = rr * ts + nystartcol + (fc(cfarray, rr, 2) & 1); indx < rr * ts + nyendcol; indx += 2) {
997
998 if (nyquist2[indx >> 1]) {
999 //local averages (over Nyquist pixels only) of G curvature squared
1000 float gvarh = epssq + (gquinc[0] * Dgrb2[indx >> 1].h +
1001 gquinc[1] * (Dgrb2[(indx - m1) >> 1].h + Dgrb2[(indx + p1) >> 1].h + Dgrb2[(indx - p1) >> 1].h + Dgrb2[(indx + m1) >> 1].h) +
1002 gquinc[2] * (Dgrb2[(indx - v2) >> 1].h + Dgrb2[(indx - 2) >> 1].h + Dgrb2[(indx + 2) >> 1].h + Dgrb2[(indx + v2) >> 1].h) +
1003 gquinc[3] * (Dgrb2[(indx - m2) >> 1].h + Dgrb2[(indx + p2) >> 1].h + Dgrb2[(indx - p2) >> 1].h + Dgrb2[(indx + m2) >> 1].h));
1004 float gvarv = epssq + (gquinc[0] * Dgrb2[indx >> 1].v +
1005 gquinc[1] * (Dgrb2[(indx - m1) >> 1].v + Dgrb2[(indx + p1) >> 1].v + Dgrb2[(indx - p1) >> 1].v + Dgrb2[(indx + m1) >> 1].v) +
1006 gquinc[2] * (Dgrb2[(indx - v2) >> 1].v + Dgrb2[(indx - 2) >> 1].v + Dgrb2[(indx + 2) >> 1].v + Dgrb2[(indx + v2) >> 1].v) +
1007 gquinc[3] * (Dgrb2[(indx - m2) >> 1].v + Dgrb2[(indx + p2) >> 1].v + Dgrb2[(indx - p2) >> 1].v + Dgrb2[(indx + m2) >> 1].v));
1008 //use the results as weights for refined G interpolation
1009 Dgrb[0][indx >> 1] = (hcd[indx] * gvarv + vcd[indx] * gvarh) / (gvarv + gvarh);
1010 rgbgreen[indx] = cfa[indx] + Dgrb[0][indx >> 1];
1011 }
1012 }
1013 }
1014
1015
1016 #ifdef __SSE2__
1017
1018 for (int rr = 6; rr < rr1 - 6; rr++) {
1019 if((fc(cfarray, rr, 2) & 1) == 0) {
1020 for (int cc = 6, indx = rr * ts + cc; cc < cc1 - 6; cc += 8, indx += 8) {
1021 vfloat tempv = LC2VFU(cfa[indx + 1]);
1022 vfloat Dgrbsq1pv = (SQRV(tempv - LC2VFU(cfa[indx + 1 - p1])) + SQRV(tempv - LC2VFU(cfa[indx + 1 + p1])));
1023 STVFU(delp[indx >> 1], vabsf(LC2VFU(cfa[indx + p1]) - LC2VFU(cfa[indx - p1])));
1024 STVFU(delm[indx >> 1], vabsf(LC2VFU(cfa[indx + m1]) - LC2VFU(cfa[indx - m1])));
1025 vfloat Dgrbsq1mv = (SQRV(tempv - LC2VFU(cfa[indx + 1 - m1])) + SQRV(tempv - LC2VFU(cfa[indx + 1 + m1])));
1026 STVFU(Dgrbsq1m[indx >> 1], Dgrbsq1mv );
1027 STVFU(Dgrbsq1p[indx >> 1], Dgrbsq1pv );
1028 }
1029 } else {
1030 for (int cc = 6, indx = rr * ts + cc; cc < cc1 - 6; cc += 8, indx += 8) {
1031 vfloat tempv = LC2VFU(cfa[indx]);
1032 vfloat Dgrbsq1pv = (SQRV(tempv - LC2VFU(cfa[indx - p1])) + SQRV(tempv - LC2VFU(cfa[indx + p1])));
1033 STVFU(delp[indx >> 1], vabsf(LC2VFU(cfa[indx + 1 + p1]) - LC2VFU(cfa[indx + 1 - p1])));
1034 STVFU(delm[indx >> 1], vabsf(LC2VFU(cfa[indx + 1 + m1]) - LC2VFU(cfa[indx + 1 - m1])));
1035 vfloat Dgrbsq1mv = (SQRV(tempv - LC2VFU(cfa[indx - m1])) + SQRV(tempv - LC2VFU(cfa[indx + m1])));
1036 STVFU(Dgrbsq1m[indx >> 1], Dgrbsq1mv );
1037 STVFU(Dgrbsq1p[indx >> 1], Dgrbsq1pv );
1038 }
1039 }
1040 }
1041
1042 #else
1043
1044 for (int rr = 6; rr < rr1 - 6; rr++) {
1045 if((fc(cfarray, rr, 2) & 1) == 0) {
1046 for (int cc = 6, indx = rr * ts + cc; cc < cc1 - 6; cc += 2, indx += 2) {
1047 delp[indx >> 1] = fabsf(cfa[indx + p1] - cfa[indx - p1]);
1048 delm[indx >> 1] = fabsf(cfa[indx + m1] - cfa[indx - m1]);
1049 Dgrbsq1p[indx >> 1] = (SQR(cfa[indx + 1] - cfa[indx + 1 - p1]) + SQR(cfa[indx + 1] - cfa[indx + 1 + p1]));
1050 Dgrbsq1m[indx >> 1] = (SQR(cfa[indx + 1] - cfa[indx + 1 - m1]) + SQR(cfa[indx + 1] - cfa[indx + 1 + m1]));
1051 }
1052 } else {
1053 for (int cc = 6, indx = rr * ts + cc; cc < cc1 - 6; cc += 2, indx += 2) {
1054 Dgrbsq1p[indx >> 1] = (SQR(cfa[indx] - cfa[indx - p1]) + SQR(cfa[indx] - cfa[indx + p1]));
1055 Dgrbsq1m[indx >> 1] = (SQR(cfa[indx] - cfa[indx - m1]) + SQR(cfa[indx] - cfa[indx + m1]));
1056 delp[indx >> 1] = fabsf(cfa[indx + 1 + p1] - cfa[indx + 1 - p1]);
1057 delm[indx >> 1] = fabsf(cfa[indx + 1 + m1] - cfa[indx + 1 - m1]);
1058 }
1059 }
1060 }
1061
1062 #endif
1063
1064 // diagonal interpolation correction
1065
1066 #ifdef __SSE2__
1067 vfloat gausseven0v = F2V(gausseven[0]);
1068 vfloat gausseven1v = F2V(gausseven[1]);
1069 #endif
1070
1071 for (int rr = 8; rr < rr1 - 8; rr++) {
1072 #ifdef __SSE2__
1073
1074 for (int indx = rr * ts + 8 + (fc(cfarray, rr, 2) & 1), indx1 = indx >> 1; indx < rr * ts + cc1 - 8; indx += 8, indx1 += 4) {
1075
1076 //diagonal colour ratios
1077 vfloat cfav = LC2VFU(cfa[indx]);
1078
1079 vfloat temp1v = LC2VFU(cfa[indx + m1]);
1080 vfloat temp2v = LC2VFU(cfa[indx + m2]);
1081 vfloat rbsev = vmul2f(temp1v) / (epsv + cfav + temp2v );
1082 rbsev = vself(vmaskf_lt(vabsf(onev - rbsev), arthreshv), cfav * rbsev, temp1v + zd5v * (cfav - temp2v));
1083
1084 temp1v = LC2VFU(cfa[indx - m1]);
1085 temp2v = LC2VFU(cfa[indx - m2]);
1086 vfloat rbnwv = vmul2f(temp1v) / (epsv + cfav + temp2v );
1087 rbnwv = vself(vmaskf_lt(vabsf(onev - rbnwv), arthreshv), cfav * rbnwv, temp1v + zd5v * (cfav - temp2v));
1088
1089 temp1v = epsv + LVFU(delm[indx1]);
1090 vfloat wtsev = temp1v + LVFU(delm[(indx + m1) >> 1]) + LVFU(delm[(indx + m2) >> 1]); //same as for wtu,wtd,wtl,wtr
1091 vfloat wtnwv = temp1v + LVFU(delm[(indx - m1) >> 1]) + LVFU(delm[(indx - m2) >> 1]);
1092
1093 vfloat rbmv = (wtsev * rbnwv + wtnwv * rbsev) / (wtsev + wtnwv);
1094
1095 temp1v = median(rbmv , LC2VFU(cfa[indx - m1]), LC2VFU(cfa[indx + m1]));
1096 vfloat wtv = vmul2f(cfav - rbmv) / (epsv + rbmv + cfav);
1097 temp2v = vintpf(wtv, rbmv, temp1v);
1098
1099 temp2v = vself(vmaskf_lt(rbmv + rbmv, cfav), temp1v, temp2v);
1100 temp2v = vself(vmaskf_lt(rbmv, cfav), temp2v, rbmv);
1101 STVFU(rbm[indx1], vself(vmaskf_gt(temp2v, clip_ptv), median(temp2v , LC2VFU(cfa[indx - m1]), LC2VFU(cfa[indx + m1])), temp2v ));
1102
1103
1104 temp1v = LC2VFU(cfa[indx + p1]);
1105 temp2v = LC2VFU(cfa[indx + p2]);
1106 vfloat rbnev = vmul2f(temp1v) / (epsv + cfav + temp2v );
1107 rbnev = vself(vmaskf_lt(vabsf(onev - rbnev), arthreshv), cfav * rbnev, temp1v + zd5v * (cfav - temp2v));
1108
1109 temp1v = LC2VFU(cfa[indx - p1]);
1110 temp2v = LC2VFU(cfa[indx - p2]);
1111 vfloat rbswv = vmul2f(temp1v) / (epsv + cfav + temp2v );
1112 rbswv = vself(vmaskf_lt(vabsf(onev - rbswv), arthreshv), cfav * rbswv, temp1v + zd5v * (cfav - temp2v));
1113
1114 temp1v = epsv + LVFU(delp[indx1]);
1115 vfloat wtnev = temp1v + LVFU(delp[(indx + p1) >> 1]) + LVFU(delp[(indx + p2) >> 1]);
1116 vfloat wtswv = temp1v + LVFU(delp[(indx - p1) >> 1]) + LVFU(delp[(indx - p2) >> 1]);
1117
1118 vfloat rbpv = (wtnev * rbswv + wtswv * rbnev) / (wtnev + wtswv);
1119
1120 temp1v = median(rbpv , LC2VFU(cfa[indx - p1]), LC2VFU(cfa[indx + p1]));
1121 wtv = vmul2f(cfav - rbpv) / (epsv + rbpv + cfav);
1122 temp2v = vintpf(wtv, rbpv, temp1v);
1123
1124 temp2v = vself(vmaskf_lt(rbpv + rbpv, cfav), temp1v, temp2v);
1125 temp2v = vself(vmaskf_lt(rbpv, cfav), temp2v, rbpv);
1126 STVFU(rbp[indx1], vself(vmaskf_gt(temp2v, clip_ptv), median(temp2v , LC2VFU(cfa[indx - p1]), LC2VFU(cfa[indx + p1])), temp2v ));
1127
1128 vfloat rbvarmv = epssqv + (gausseven0v * (LVFU(Dgrbsq1m[(indx - v1) >> 1]) + LVFU(Dgrbsq1m[(indx - 1) >> 1]) + LVFU(Dgrbsq1m[(indx + 1) >> 1]) + LVFU(Dgrbsq1m[(indx + v1) >> 1])) +
1129 gausseven1v * (LVFU(Dgrbsq1m[(indx - v2 - 1) >> 1]) + LVFU(Dgrbsq1m[(indx - v2 + 1) >> 1]) + LVFU(Dgrbsq1m[(indx - 2 - v1) >> 1]) + LVFU(Dgrbsq1m[(indx + 2 - v1) >> 1]) +
1130 LVFU(Dgrbsq1m[(indx - 2 + v1) >> 1]) + LVFU(Dgrbsq1m[(indx + 2 + v1) >> 1]) + LVFU(Dgrbsq1m[(indx + v2 - 1) >> 1]) + LVFU(Dgrbsq1m[(indx + v2 + 1) >> 1])));
1131 STVFU(pmwt[indx1] , rbvarmv / ((epssqv + (gausseven0v * (LVFU(Dgrbsq1p[(indx - v1) >> 1]) + LVFU(Dgrbsq1p[(indx - 1) >> 1]) + LVFU(Dgrbsq1p[(indx + 1) >> 1]) + LVFU(Dgrbsq1p[(indx + v1) >> 1])) +
1132 gausseven1v * (LVFU(Dgrbsq1p[(indx - v2 - 1) >> 1]) + LVFU(Dgrbsq1p[(indx - v2 + 1) >> 1]) + LVFU(Dgrbsq1p[(indx - 2 - v1) >> 1]) + LVFU(Dgrbsq1p[(indx + 2 - v1) >> 1]) +
1133 LVFU(Dgrbsq1p[(indx - 2 + v1) >> 1]) + LVFU(Dgrbsq1p[(indx + 2 + v1) >> 1]) + LVFU(Dgrbsq1p[(indx + v2 - 1) >> 1]) + LVFU(Dgrbsq1p[(indx + v2 + 1) >> 1])))) + rbvarmv));
1134
1135 }
1136
1137 #else
1138
1139 for (int cc = 8 + (fc(cfarray, rr, 2) & 1), indx = rr * ts + cc, indx1 = indx >> 1; cc < cc1 - 8; cc += 2, indx += 2, indx1++) {
1140
1141 //diagonal colour ratios
1142 float crse = xmul2f(cfa[indx + m1]) / (eps + cfa[indx] + (cfa[indx + m2]));
1143 float crnw = xmul2f(cfa[indx - m1]) / (eps + cfa[indx] + (cfa[indx - m2]));
1144 float crne = xmul2f(cfa[indx + p1]) / (eps + cfa[indx] + (cfa[indx + p2]));
1145 float crsw = xmul2f(cfa[indx - p1]) / (eps + cfa[indx] + (cfa[indx - p2]));
1146 //colour differences in diagonal directions
1147 float rbse, rbnw, rbne, rbsw;
1148
1149 //assign B/R at R/B sites
1150 if (fabsf(1.f - crse) < arthresh) {
1151 rbse = cfa[indx] * crse; //use this if more precise diag interp is necessary
1152 } else {
1153 rbse = (cfa[indx + m1]) + xdiv2f(cfa[indx] - cfa[indx + m2]);
1154 }
1155
1156 if (fabsf(1.f - crnw) < arthresh) {
1157 rbnw = cfa[indx] * crnw;
1158 } else {
1159 rbnw = (cfa[indx - m1]) + xdiv2f(cfa[indx] - cfa[indx - m2]);
1160 }
1161
1162 if (fabsf(1.f - crne) < arthresh) {
1163 rbne = cfa[indx] * crne;
1164 } else {
1165 rbne = (cfa[indx + p1]) + xdiv2f(cfa[indx] - cfa[indx + p2]);
1166 }
1167
1168 if (fabsf(1.f - crsw) < arthresh) {
1169 rbsw = cfa[indx] * crsw;
1170 } else {
1171 rbsw = (cfa[indx - p1]) + xdiv2f(cfa[indx] - cfa[indx - p2]);
1172 }
1173
1174 float wtse = eps + delm[indx1] + delm[(indx + m1) >> 1] + delm[(indx + m2) >> 1]; //same as for wtu,wtd,wtl,wtr
1175 float wtnw = eps + delm[indx1] + delm[(indx - m1) >> 1] + delm[(indx - m2) >> 1];
1176 float wtne = eps + delp[indx1] + delp[(indx + p1) >> 1] + delp[(indx + p2) >> 1];
1177 float wtsw = eps + delp[indx1] + delp[(indx - p1) >> 1] + delp[(indx - p2) >> 1];
1178
1179
1180 rbm[indx1] = (wtse * rbnw + wtnw * rbse) / (wtse + wtnw);
1181 rbp[indx1] = (wtne * rbsw + wtsw * rbne) / (wtne + wtsw);
1182
1183 //variance of R-B in plus/minus directions
1184 float rbvarm = epssq + (gausseven[0] * (Dgrbsq1m[(indx - v1) >> 1] + Dgrbsq1m[(indx - 1) >> 1] + Dgrbsq1m[(indx + 1) >> 1] + Dgrbsq1m[(indx + v1) >> 1]) +
1185 gausseven[1] * (Dgrbsq1m[(indx - v2 - 1) >> 1] + Dgrbsq1m[(indx - v2 + 1) >> 1] + Dgrbsq1m[(indx - 2 - v1) >> 1] + Dgrbsq1m[(indx + 2 - v1) >> 1] +
1186 Dgrbsq1m[(indx - 2 + v1) >> 1] + Dgrbsq1m[(indx + 2 + v1) >> 1] + Dgrbsq1m[(indx + v2 - 1) >> 1] + Dgrbsq1m[(indx + v2 + 1) >> 1]));
1187 pmwt[indx1] = rbvarm / ((epssq + (gausseven[0] * (Dgrbsq1p[(indx - v1) >> 1] + Dgrbsq1p[(indx - 1) >> 1] + Dgrbsq1p[(indx + 1) >> 1] + Dgrbsq1p[(indx + v1) >> 1]) +
1188 gausseven[1] * (Dgrbsq1p[(indx - v2 - 1) >> 1] + Dgrbsq1p[(indx - v2 + 1) >> 1] + Dgrbsq1p[(indx - 2 - v1) >> 1] + Dgrbsq1p[(indx + 2 - v1) >> 1] +
1189 Dgrbsq1p[(indx - 2 + v1) >> 1] + Dgrbsq1p[(indx + 2 + v1) >> 1] + Dgrbsq1p[(indx + v2 - 1) >> 1] + Dgrbsq1p[(indx + v2 + 1) >> 1]))) + rbvarm);
1190
1191 //bound the interpolation in regions of high saturation
1192
1193 if (rbp[indx1] < cfa[indx]) {
1194 if (xmul2f(rbp[indx1]) < cfa[indx]) {
1195 rbp[indx1] = median(rbp[indx1] , cfa[indx - p1], cfa[indx + p1]);
1196 } else {
1197 float pwt = xmul2f(cfa[indx] - rbp[indx1]) / (eps + rbp[indx1] + cfa[indx]);
1198 rbp[indx1] = pwt * rbp[indx1] + (1.f - pwt) * median(rbp[indx1], cfa[indx - p1], cfa[indx + p1]);
1199 }
1200 }
1201
1202 if (rbm[indx1] < cfa[indx]) {
1203 if (xmul2f(rbm[indx1]) < cfa[indx]) {
1204 rbm[indx1] = median(rbm[indx1] , cfa[indx - m1], cfa[indx + m1]);
1205 } else {
1206 float mwt = xmul2f(cfa[indx] - rbm[indx1]) / (eps + rbm[indx1] + cfa[indx]);
1207 rbm[indx1] = mwt * rbm[indx1] + (1.f - mwt) * median(rbm[indx1], cfa[indx - m1], cfa[indx + m1]);
1208 }
1209 }
1210
1211 if (rbp[indx1] > clip_pt) {
1212 rbp[indx1] = median(rbp[indx1], cfa[indx - p1], cfa[indx + p1]);
1213 }
1214
1215 if (rbm[indx1] > clip_pt) {
1216 rbm[indx1] = median(rbm[indx1], cfa[indx - m1], cfa[indx + m1]);
1217 }
1218 }
1219
1220 #endif
1221 }
1222
1223 #ifdef __SSE2__
1224 vfloat zd25v = F2V(0.25f);
1225 #endif
1226
1227 for (int rr = 10; rr < rr1 - 10; rr++)
1228 #ifdef __SSE2__
1229 for (int indx = rr * ts + 10 + (fc(cfarray, rr, 2) & 1), indx1 = indx >> 1; indx < rr * ts + cc1 - 10; indx += 8, indx1 += 4) {
1230
1231 //first ask if one gets more directional discrimination from nearby B/R sites
1232 vfloat pmwtaltv = zd25v * (LVFU(pmwt[(indx - m1) >> 1]) + LVFU(pmwt[(indx + p1) >> 1]) + LVFU(pmwt[(indx - p1) >> 1]) + LVFU(pmwt[(indx + m1) >> 1]));
1233 vfloat tempv = LVFU(pmwt[indx1]);
1234 tempv = vself(vmaskf_lt(vabsf(zd5v - tempv), vabsf(zd5v - pmwtaltv)), pmwtaltv, tempv);
1235 STVFU(pmwt[indx1], tempv);
1236 STVFU(rbint[indx1], zd5v * (LC2VFU(cfa[indx]) + vintpf(tempv, LVFU(rbp[indx1]), LVFU(rbm[indx1]))));
1237 }
1238
1239 #else
1240
1241 for (int cc = 10 + (fc(cfarray, rr, 2) & 1), indx = rr * ts + cc, indx1 = indx >> 1; cc < cc1 - 10; cc += 2, indx += 2, indx1++) {
1242
1243 //first ask if one gets more directional discrimination from nearby B/R sites
1244 float pmwtalt = xdivf(pmwt[(indx - m1) >> 1] + pmwt[(indx + p1) >> 1] + pmwt[(indx - p1) >> 1] + pmwt[(indx + m1) >> 1], 2);
1245
1246 if (fabsf(0.5f - pmwt[indx1]) < fabsf(0.5f - pmwtalt)) {
1247 pmwt[indx1] = pmwtalt; //a better result was obtained from the neighbours
1248 }
1249
1250 rbint[indx1] = xdiv2f(cfa[indx] + rbm[indx1] * (1.f - pmwt[indx1]) + rbp[indx1] * pmwt[indx1]); //this is R+B, interpolated
1251 }
1252
1253 #endif
1254
1255 for (int rr = 12; rr < rr1 - 12; rr++)
1256 #ifdef __SSE2__
1257 for (int indx = rr * ts + 12 + (fc(cfarray, rr, 2) & 1), indx1 = indx >> 1; indx < rr * ts + cc1 - 12; indx += 8, indx1 += 4) {
1258 vmask copymask = vmaskf_ge(vabsf(zd5v - LVFU(pmwt[indx1])), vabsf(zd5v - LVFU(hvwt[indx1])));
1259
1260 if(_mm_movemask_ps((vfloat)copymask)) { // if for any of the 4 pixels the condition is true, do the maths for all 4 pixels and mask the unused out at the end
1261 //now interpolate G vertically/horizontally using R+B values
1262 //unfortunately, since G interpolation cannot be done diagonally this may lead to colour shifts
1263 //colour ratios for G interpolation
1264 vfloat rbintv = LVFU(rbint[indx1]);
1265
1266 //interpolated G via adaptive ratios or Hamilton-Adams in each cardinal direction
1267 vfloat cruv = vmul2f(LC2VFU(cfa[indx - v1])) / (epsv + rbintv + LVFU(rbint[(indx1 - v1)]));
1268 vfloat guv = rbintv * cruv;
1269 vfloat gu2v = LC2VFU(cfa[indx - v1]) + zd5v * (rbintv - LVFU(rbint[(indx1 - v1)]));
1270 guv = vself(vmaskf_lt(vabsf(onev - cruv), arthreshv), guv, gu2v);
1271
1272 vfloat crdv = vmul2f(LC2VFU(cfa[indx + v1])) / (epsv + rbintv + LVFU(rbint[(indx1 + v1)]));
1273 vfloat gdv = rbintv * crdv;
1274 vfloat gd2v = LC2VFU(cfa[indx + v1]) + zd5v * (rbintv - LVFU(rbint[(indx1 + v1)]));
1275 gdv = vself(vmaskf_lt(vabsf(onev - crdv), arthreshv), gdv, gd2v);
1276
1277 vfloat Gintvv = (LC2VFU(dirwts0[indx - v1]) * gdv + LC2VFU(dirwts0[indx + v1]) * guv) / (LC2VFU(dirwts0[indx + v1]) + LC2VFU(dirwts0[indx - v1]));
1278 vfloat Gint1v = median(Gintvv , LC2VFU(cfa[indx - v1]), LC2VFU(cfa[indx + v1]));
1279 vfloat vwtv = vmul2f(rbintv - Gintvv) / (epsv + Gintvv + rbintv);
1280 vfloat Gint2v = vintpf(vwtv, Gintvv, Gint1v);
1281 Gint1v = vself(vmaskf_lt(vmul2f(Gintvv), rbintv), Gint1v, Gint2v);
1282 Gintvv = vself(vmaskf_lt(Gintvv, rbintv), Gint1v, Gintvv);
1283 Gintvv = vself(vmaskf_gt(Gintvv, clip_ptv), median(Gintvv, LC2VFU(cfa[indx - v1]), LC2VFU(cfa[indx + v1])), Gintvv);
1284
1285 vfloat crlv = vmul2f(LC2VFU(cfa[indx - 1])) / (epsv + rbintv + LVFU(rbint[(indx1 - 1)]));
1286 vfloat glv = rbintv * crlv;
1287 vfloat gl2v = LC2VFU(cfa[indx - 1]) + zd5v * (rbintv - LVFU(rbint[(indx1 - 1)]));
1288 glv = vself(vmaskf_lt(vabsf(onev - crlv), arthreshv), glv, gl2v);
1289
1290 vfloat crrv = vmul2f(LC2VFU(cfa[indx + 1])) / (epsv + rbintv + LVFU(rbint[(indx1 + 1)]));
1291 vfloat grv = rbintv * crrv;
1292 vfloat gr2v = LC2VFU(cfa[indx + 1]) + zd5v * (rbintv - LVFU(rbint[(indx1 + 1)]));
1293 grv = vself(vmaskf_lt(vabsf(onev - crrv), arthreshv), grv, gr2v);
1294
1295 vfloat Ginthv = (LC2VFU(dirwts1[indx - 1]) * grv + LC2VFU(dirwts1[indx + 1]) * glv) / (LC2VFU(dirwts1[indx - 1]) + LC2VFU(dirwts1[indx + 1]));
1296 vfloat Gint1h = median(Ginthv , LC2VFU(cfa[indx - 1]), LC2VFU(cfa[indx + 1]));
1297 vfloat hwtv = vmul2f(rbintv - Ginthv) / (epsv + Ginthv + rbintv);
1298 vfloat Gint2h = vintpf(hwtv, Ginthv, Gint1h);
1299 Gint1h = vself(vmaskf_lt(vmul2f(Ginthv), rbintv), Gint1h, Gint2h);
1300 Ginthv = vself(vmaskf_lt(Ginthv, rbintv), Gint1h, Ginthv);
1301 Ginthv = vself(vmaskf_gt(Ginthv, clip_ptv), median(Ginthv, LC2VFU(cfa[indx - 1]), LC2VFU(cfa[indx + 1])), Ginthv);
1302
1303 vfloat greenv = vself(copymask, vintpf(LVFU(hvwt[indx1]), Gintvv, Ginthv), LC2VFU(rgbgreen[indx]));
1304 STC2VFU(rgbgreen[indx], greenv);
1305
1306 STVFU(Dgrb[0][indx1], vself(copymask, greenv - LC2VFU(cfa[indx]), LVFU(Dgrb[0][indx1])));
1307 }
1308 }
1309
1310 #else
1311
1312 for (int cc = 12 + (fc(cfarray, rr, 2) & 1), indx = rr * ts + cc, indx1 = indx >> 1; cc < cc1 - 12; cc += 2, indx += 2, indx1++) {
1313
1314 if (fabsf(0.5f - pmwt[indx >> 1]) < fabsf(0.5f - hvwt[indx >> 1]) ) {
1315 continue;
1316 }
1317
1318 //now interpolate G vertically/horizontally using R+B values
1319 //unfortunately, since G interpolation cannot be done diagonally this may lead to colour shifts
1320
1321 //colour ratios for G interpolation
1322 float cru = cfa[indx - v1] * 2.0 / (eps + rbint[indx1] + rbint[(indx1 - v1)]);
1323 float crd = cfa[indx + v1] * 2.0 / (eps + rbint[indx1] + rbint[(indx1 + v1)]);
1324 float crl = cfa[indx - 1] * 2.0 / (eps + rbint[indx1] + rbint[(indx1 - 1)]);
1325 float crr = cfa[indx + 1] * 2.0 / (eps + rbint[indx1] + rbint[(indx1 + 1)]);
1326
1327 //interpolation of G in four directions
1328 float gu, gd, gl, gr;
1329
1330 //interpolated G via adaptive ratios or Hamilton-Adams in each cardinal direction
1331 if (fabsf(1.f - cru) < arthresh) {
1332 gu = rbint[indx1] * cru;
1333 } else {
1334 gu = cfa[indx - v1] + xdiv2f(rbint[indx1] - rbint[(indx1 - v1)]);
1335 }
1336
1337 if (fabsf(1.f - crd) < arthresh) {
1338 gd = rbint[indx1] * crd;
1339 } else {
1340 gd = cfa[indx + v1] + xdiv2f(rbint[indx1] - rbint[(indx1 + v1)]);
1341 }
1342
1343 if (fabsf(1.f - crl) < arthresh) {
1344 gl = rbint[indx1] * crl;
1345 } else {
1346 gl = cfa[indx - 1] + xdiv2f(rbint[indx1] - rbint[(indx1 - 1)]);
1347 }
1348
1349 if (fabsf(1.f - crr) < arthresh) {
1350 gr = rbint[indx1] * crr;
1351 } else {
1352 gr = cfa[indx + 1] + xdiv2f(rbint[indx1] - rbint[(indx1 + 1)]);
1353 }
1354
1355 //interpolated G via adaptive weights of cardinal evaluations
1356 float Gintv = (dirwts0[indx - v1] * gd + dirwts0[indx + v1] * gu) / (dirwts0[indx + v1] + dirwts0[indx - v1]);
1357 float Ginth = (dirwts1[indx - 1] * gr + dirwts1[indx + 1] * gl) / (dirwts1[indx - 1] + dirwts1[indx + 1]);
1358
1359 //bound the interpolation in regions of high saturation
1360 if (Gintv < rbint[indx1]) {
1361 if (2 * Gintv < rbint[indx1]) {
1362 Gintv = median(Gintv , cfa[indx - v1], cfa[indx + v1]);
1363 } else {
1364 float vwt = 2.0 * (rbint[indx1] - Gintv) / (eps + Gintv + rbint[indx1]);
1365 Gintv = vwt * Gintv + (1.f - vwt) * median(Gintv, cfa[indx - v1], cfa[indx + v1]);
1366 }
1367 }
1368
1369 if (Ginth < rbint[indx1]) {
1370 if (2 * Ginth < rbint[indx1]) {
1371 Ginth = median(Ginth , cfa[indx - 1], cfa[indx + 1]);
1372 } else {
1373 float hwt = 2.0 * (rbint[indx1] - Ginth) / (eps + Ginth + rbint[indx1]);
1374 Ginth = hwt * Ginth + (1.f - hwt) * median(Ginth, cfa[indx - 1], cfa[indx + 1]);
1375 }
1376 }
1377
1378 if (Ginth > clip_pt) {
1379 Ginth = median(Ginth, cfa[indx - 1], cfa[indx + 1]);
1380 }
1381
1382 if (Gintv > clip_pt) {
1383 Gintv = median(Gintv, cfa[indx - v1], cfa[indx + v1]);
1384 }
1385
1386 rgbgreen[indx] = Ginth * (1.f - hvwt[indx1]) + Gintv * hvwt[indx1];
1387 Dgrb[0][indx >> 1] = rgbgreen[indx] - cfa[indx];
1388 }
1389
1390 #endif
1391
1392 //end of diagonal interpolation correction
1393
1394 //fancy chrominance interpolation
1395 //(ey,ex) is location of R site
1396 for (int rr = 13 - ey; rr < rr1 - 12; rr += 2)
1397 for (int indx1 = (rr * ts + 13 - ex) >> 1; indx1 < (rr * ts + cc1 - 12) >> 1; indx1++) { //B coset
1398 Dgrb[1][indx1] = Dgrb[0][indx1]; //split out G-B from G-R
1399 Dgrb[0][indx1] = 0;
1400 }
1401
1402 #ifdef __SSE2__
1403 vfloat oned325v = F2V( 1.325f );
1404 vfloat zd175v = F2V( 0.175f );
1405 vfloat zd075v = F2V( 0.075f );
1406 #endif
1407
1408 for (int rr = 14; rr < rr1 - 14; rr++)
1409 #ifdef __SSE2__
1410 for (int cc = 14 + (fc(cfarray, rr, 2) & 1), indx = rr * ts + cc, c = 1 - fc(cfarray, rr, cc) / 2; cc < cc1 - 14; cc += 8, indx += 8) {
1411 vfloat tempv = epsv + vabsf(LVFU(Dgrb[c][(indx - m1) >> 1]) - LVFU(Dgrb[c][(indx + m1) >> 1]));
1412 vfloat temp2v = epsv + vabsf(LVFU(Dgrb[c][(indx + p1) >> 1]) - LVFU(Dgrb[c][(indx - p1) >> 1]));
1413 vfloat wtnwv = onev / (tempv + vabsf(LVFU(Dgrb[c][(indx - m1) >> 1]) - LVFU(Dgrb[c][(indx - m3) >> 1])) + vabsf(LVFU(Dgrb[c][(indx + m1) >> 1]) - LVFU(Dgrb[c][(indx - m3) >> 1])));
1414 vfloat wtnev = onev / (temp2v + vabsf(LVFU(Dgrb[c][(indx + p1) >> 1]) - LVFU(Dgrb[c][(indx + p3) >> 1])) + vabsf(LVFU(Dgrb[c][(indx - p1) >> 1]) - LVFU(Dgrb[c][(indx + p3) >> 1])));
1415 vfloat wtswv = onev / (temp2v + vabsf(LVFU(Dgrb[c][(indx - p1) >> 1]) - LVFU(Dgrb[c][(indx + m3) >> 1])) + vabsf(LVFU(Dgrb[c][(indx + p1) >> 1]) - LVFU(Dgrb[c][(indx - p3) >> 1])));
1416 vfloat wtsev = onev / (tempv + vabsf(LVFU(Dgrb[c][(indx + m1) >> 1]) - LVFU(Dgrb[c][(indx - p3) >> 1])) + vabsf(LVFU(Dgrb[c][(indx - m1) >> 1]) - LVFU(Dgrb[c][(indx + m3) >> 1])));
1417
1418 STVFU(Dgrb[c][indx >> 1], (wtnwv * (oned325v * LVFU(Dgrb[c][(indx - m1) >> 1]) - zd175v * LVFU(Dgrb[c][(indx - m3) >> 1]) - zd075v * (LVFU(Dgrb[c][(indx - m1 - 2) >> 1]) + LVFU(Dgrb[c][(indx - m1 - v2) >> 1])) ) +
1419 wtnev * (oned325v * LVFU(Dgrb[c][(indx + p1) >> 1]) - zd175v * LVFU(Dgrb[c][(indx + p3) >> 1]) - zd075v * (LVFU(Dgrb[c][(indx + p1 + 2) >> 1]) + LVFU(Dgrb[c][(indx + p1 + v2) >> 1])) ) +
1420 wtswv * (oned325v * LVFU(Dgrb[c][(indx - p1) >> 1]) - zd175v * LVFU(Dgrb[c][(indx - p3) >> 1]) - zd075v * (LVFU(Dgrb[c][(indx - p1 - 2) >> 1]) + LVFU(Dgrb[c][(indx - p1 - v2) >> 1])) ) +
1421 wtsev * (oned325v * LVFU(Dgrb[c][(indx + m1) >> 1]) - zd175v * LVFU(Dgrb[c][(indx + m3) >> 1]) - zd075v * (LVFU(Dgrb[c][(indx + m1 + 2) >> 1]) + LVFU(Dgrb[c][(indx + m1 + v2) >> 1])) )) / (wtnwv + wtnev + wtswv + wtsev));
1422 }
1423
1424 #else
1425
1426 for (int cc = 14 + (fc(cfarray, rr, 2) & 1), indx = rr * ts + cc, c = 1 - fc(cfarray, rr, cc) / 2; cc < cc1 - 14; cc += 2, indx += 2) {
1427 float wtnw = 1.f / (eps + fabsf(Dgrb[c][(indx - m1) >> 1] - Dgrb[c][(indx + m1) >> 1]) + fabsf(Dgrb[c][(indx - m1) >> 1] - Dgrb[c][(indx - m3) >> 1]) + fabsf(Dgrb[c][(indx + m1) >> 1] - Dgrb[c][(indx - m3) >> 1]));
1428 float wtne = 1.f / (eps + fabsf(Dgrb[c][(indx + p1) >> 1] - Dgrb[c][(indx - p1) >> 1]) + fabsf(Dgrb[c][(indx + p1) >> 1] - Dgrb[c][(indx + p3) >> 1]) + fabsf(Dgrb[c][(indx - p1) >> 1] - Dgrb[c][(indx + p3) >> 1]));
1429 float wtsw = 1.f / (eps + fabsf(Dgrb[c][(indx - p1) >> 1] - Dgrb[c][(indx + p1) >> 1]) + fabsf(Dgrb[c][(indx - p1) >> 1] - Dgrb[c][(indx + m3) >> 1]) + fabsf(Dgrb[c][(indx + p1) >> 1] - Dgrb[c][(indx - p3) >> 1]));
1430 float wtse = 1.f / (eps + fabsf(Dgrb[c][(indx + m1) >> 1] - Dgrb[c][(indx - m1) >> 1]) + fabsf(Dgrb[c][(indx + m1) >> 1] - Dgrb[c][(indx - p3) >> 1]) + fabsf(Dgrb[c][(indx - m1) >> 1] - Dgrb[c][(indx + m3) >> 1]));
1431
1432 Dgrb[c][indx >> 1] = (wtnw * (1.325f * Dgrb[c][(indx - m1) >> 1] - 0.175f * Dgrb[c][(indx - m3) >> 1] - 0.075f * Dgrb[c][(indx - m1 - 2) >> 1] - 0.075f * Dgrb[c][(indx - m1 - v2) >> 1] ) +
1433 wtne * (1.325f * Dgrb[c][(indx + p1) >> 1] - 0.175f * Dgrb[c][(indx + p3) >> 1] - 0.075f * Dgrb[c][(indx + p1 + 2) >> 1] - 0.075f * Dgrb[c][(indx + p1 + v2) >> 1] ) +
1434 wtsw * (1.325f * Dgrb[c][(indx - p1) >> 1] - 0.175f * Dgrb[c][(indx - p3) >> 1] - 0.075f * Dgrb[c][(indx - p1 - 2) >> 1] - 0.075f * Dgrb[c][(indx - p1 - v2) >> 1] ) +
1435 wtse * (1.325f * Dgrb[c][(indx + m1) >> 1] - 0.175f * Dgrb[c][(indx + m3) >> 1] - 0.075f * Dgrb[c][(indx + m1 + 2) >> 1] - 0.075f * Dgrb[c][(indx + m1 + v2) >> 1] )) / (wtnw + wtne + wtsw + wtse);
1436 }
1437
1438 #endif
1439
1440 #ifdef __SSE2__
1441 int offset;
1442 vfloat twov = F2V(2.f);
1443 vmask selmask;
1444
1445 if((fc(cfarray, 16, 2) & 1) == 1) {
1446 selmask = _mm_set_epi32(0xffffffff, 0, 0xffffffff, 0);
1447 offset = 1;
1448 } else {
1449 selmask = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);
1450 offset = 0;
1451 }
1452
1453 #endif
1454
1455 for (int rr = 16; rr < rr1 - 16; rr++) {
1456 int row = rr + top;
1457 int col = left + 16;
1458 int indx = rr * ts + 16;
1459 #ifdef __SSE2__
1460 offset = 1 - offset;
1461 selmask = vnotm(selmask);
1462
1463 for (; indx < rr * ts + cc1 - 18 - (cc1 & 1); indx += 4, col += 4) {
1464 vfloat greenv = LVF(rgbgreen[indx]);
1465 vfloat temp00v = vdup(LVFU(hvwt[(indx - v1) >> 1]));
1466 vfloat temp01v = vdup(LVFU(hvwt[(indx + v1) >> 1]));
1467 vfloat tempv = onev / (temp00v + twov - vdup(LVFU(hvwt[(indx + 1 + offset) >> 1])) - vdup(LVFU(hvwt[(indx - 1 + offset) >> 1])) + temp01v);
1468
1469 vfloat redv1 = greenv - (temp00v * vdup(LVFU(Dgrb[0][(indx - v1) >> 1])) + (onev - vdup(LVFU(hvwt[(indx + 1 + offset) >> 1]))) * vdup(LVFU(Dgrb[0][(indx + 1 + offset) >> 1])) + (onev - vdup(LVFU(hvwt[(indx - 1 + offset) >> 1]))) * vdup(LVFU(Dgrb[0][(indx - 1 + offset) >> 1])) + temp01v * vdup(LVFU(Dgrb[0][(indx + v1) >> 1]))) * tempv;
1470 vfloat bluev1 = greenv - (temp00v * vdup(LVFU(Dgrb[1][(indx - v1) >> 1])) + (onev - vdup(LVFU(hvwt[(indx + 1 + offset) >> 1]))) * vdup(LVFU(Dgrb[1][(indx + 1 + offset) >> 1])) + (onev - vdup(LVFU(hvwt[(indx - 1 + offset) >> 1]))) * vdup(LVFU(Dgrb[1][(indx - 1 + offset) >> 1])) + temp01v * vdup(LVFU(Dgrb[1][(indx + v1) >> 1]))) * tempv;
1471 vfloat redv2 = greenv - vdup(LVFU(Dgrb[0][indx >> 1]));
1472 vfloat bluev2 = greenv - vdup(LVFU(Dgrb[1][indx >> 1]));
1473 STVFU(red[row][col], vmaxf(c65535v * vself(selmask, redv1, redv2), ZEROV));
1474 STVFU(blue[row][col], vmaxf(c65535v * vself(selmask, bluev1, bluev2), ZEROV));
1475 }
1476
1477 if(offset == 0) {
1478 for (; indx < rr * ts + cc1 - 16 - (cc1 & 1); indx++, col++) {
1479 float temp = 1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
1480 red[row][col] = std::max(0.f, 65535.f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) *
1481 temp));
1482 blue[row][col] = std::max(0.f, 65535.f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) *
1483 temp));
1484
1485 indx++;
1486 col++;
1487 red[row][col] = std::max(0.f, 65535.f * (rgbgreen[indx] - Dgrb[0][indx >> 1]));
1488 blue[row][col] = std::max(0.f, 65535.f * (rgbgreen[indx] - Dgrb[1][indx >> 1]));
1489 }
1490
1491 if(cc1 & 1) { // width of tile is odd
1492 float temp = 1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
1493 red[row][col] = std::max(0.f, 65535.f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) *
1494 temp));
1495 blue[row][col] = std::max(0.f, 65535.f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) *
1496 temp));
1497 }
1498 } else {
1499 for (; indx < rr * ts + cc1 - 16 - (cc1 & 1); indx++, col++) {
1500 red[row][col] = std::max(0.f, 65535.f * (rgbgreen[indx] - Dgrb[0][indx >> 1]));
1501 blue[row][col] = std::max(0.f, 65535.f * (rgbgreen[indx] - Dgrb[1][indx >> 1]));
1502
1503 indx++;
1504 col++;
1505 float temp = 1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
1506 red[row][col] = std::max(0.f, 65535.f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) *
1507 temp));
1508 blue[row][col] = std::max(0.f, 65535.f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) *
1509 temp));
1510 }
1511
1512 if(cc1 & 1) { // width of tile is odd
1513 red[row][col] = std::max(0.f, 65535.f * (rgbgreen[indx] - Dgrb[0][indx >> 1]));
1514 blue[row][col] = std::max(0.f, 65535.f * (rgbgreen[indx] - Dgrb[1][indx >> 1]));
1515 }
1516 }
1517
1518 #else
1519
1520 if((fc(cfarray, rr, 2) & 1) == 1) {
1521 for (; indx < rr * ts + cc1 - 16 - (cc1 & 1); indx++, col++) {
1522 float temp = 1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
1523 red[row][col] = std::max(0.f, 65535.f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) *
1524 temp));
1525 blue[row][col] = std::max(0.f, 65535.f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) *
1526 temp));
1527
1528 indx++;
1529 col++;
1530 red[row][col] = std::max(0.f, 65535.f * (rgbgreen[indx] - Dgrb[0][indx >> 1]));
1531 blue[row][col] = std::max(0.f, 65535.f * (rgbgreen[indx] - Dgrb[1][indx >> 1]));
1532 }
1533
1534 if(cc1 & 1) { // width of tile is odd
1535 float temp = 1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
1536 red[row][col] = std::max(0.f, 65535.f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) *
1537 temp));
1538 blue[row][col] = std::max(0.f, 65535.f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) *
1539 temp));
1540 }
1541 } else {
1542 for (; indx < rr * ts + cc1 - 16 - (cc1 & 1); indx++, col++) {
1543 red[row][col] = std::max(0.f, 65535.f * (rgbgreen[indx] - Dgrb[0][indx >> 1]));
1544 blue[row][col] = std::max(0.f, 65535.f * (rgbgreen[indx] - Dgrb[1][indx >> 1]));
1545
1546 indx++;
1547 col++;
1548 float temp = 1.f / (hvwt[(indx - v1) >> 1] + 2.f - hvwt[(indx + 1) >> 1] - hvwt[(indx - 1) >> 1] + hvwt[(indx + v1) >> 1]);
1549 red[row][col] = std::max(0.f, 65535.f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[0][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[0][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[0][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[0][(indx + v1) >> 1]) *
1550 temp));
1551 blue[row][col] = std::max(0.f, 65535.f * (rgbgreen[indx] - ((hvwt[(indx - v1) >> 1]) * Dgrb[1][(indx - v1) >> 1] + (1.f - hvwt[(indx + 1) >> 1]) * Dgrb[1][(indx + 1) >> 1] + (1.f - hvwt[(indx - 1) >> 1]) * Dgrb[1][(indx - 1) >> 1] + (hvwt[(indx + v1) >> 1]) * Dgrb[1][(indx + v1) >> 1]) *
1552 temp));
1553 }
1554
1555 if(cc1 & 1) { // width of tile is odd
1556 red[row][col] = std::max(0.f, 65535.f * (rgbgreen[indx] - Dgrb[0][indx >> 1]));
1557 blue[row][col] = std::max(0.f, 65535.f * (rgbgreen[indx] - Dgrb[1][indx >> 1]));
1558 }
1559 }
1560
1561 #endif
1562 }
1563
1564 // copy smoothed results back to image matrix
1565 for (int rr = 16; rr < rr1 - 16; rr++) {
1566 int row = rr + top;
1567 int cc = 16;
1568 #ifdef __SSE2__
1569
1570 for (; cc < cc1 - 19; cc += 4) {
1571 STVFU(green[row][cc + left], vmaxf(LVF(rgbgreen[rr * ts + cc]) * c65535v, ZEROV));
1572 }
1573
1574 #endif
1575
1576 for (; cc < cc1 - 16; cc++) {
1577 green[row][cc + left] = std::max(0.f, 65535.f * rgbgreen[rr * ts + cc]);
1578 }
1579 }
1580
1581 if(plistener) {
1582 progresscounter++;
1583
1584 if(progresscounter % 32 == 0) {
1585 #ifdef _OPENMP
1586 #pragma omp critical (amazeprogress)
1587 #endif
1588 {
1589 progress += (double)32 * ((ts - 32) * (ts - 32)) / (height * width);
1590 progress = progress > 1.0 ? 1.0 : progress;
1591 plistener->setProgress(progress);
1592 }
1593 }
1594 }
1595 }
1596 } //end of main loop
1597
1598 // clean up
1599 free(buffer);
1600 }
1601 if(border < 4) {
1602 border_interpolate(W, H, 3, rawData, red, green, blue);
1603 }
1604
1605 if(plistener) {
1606 plistener->setProgress(1.0);
1607 }
1608
1609 }
1610 }
1611