1 ////////////////////////////////////////////////////////////////
2 //
3 //  Xtrans demosaic algorithm
4 //
5 //  code dated: April 18, 2018
6 //
7 //  xtrans.cc is free software: you can redistribute it and/or modify
8 //  it under the terms of the GNU General Public License as published by
9 //  the Free Software Foundation, either version 3 of the License, or
10 //  (at your option) any later version.
11 //
12 //  This program is distributed in the hope that it will be useful,
13 //  but WITHOUT ANY WARRANTY; without even the implied warranty of
14 //  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 //  GNU General Public License for more details.
16 //
17 //  You should have received a copy of the GNU General Public License
18 //  along with this program.  If not, see <http://www.gnu.org/licenses/>.
19 //
20 ////////////////////////////////////////////////////////////////
21 
22 #include <float.h>
23 #include <memory>
24 
25 #include "librtprocess.h"
26 #include "LUT.h"
27 #include "sleef.h"
28 #include "rt_math.h"
29 #include "opthelper.h"
30 #include "StopWatch.h"
31 #include "xtranshelper.h"
32 
33 namespace
34 {
35 
36 const float xyz_rgb[3][3] = { // XYZ from RGB
37     { 0.412453, 0.357580, 0.180423 },
38     { 0.212671, 0.715160, 0.072169 },
39     { 0.019334, 0.119193, 0.950227 }
40 };
41 const float d65_white[3] = { 0.950456, 1, 1.088754 };
42 
43 
cielab(const float (* rgb)[3],float * l,float * a,float * b,const int width,const int height,const int labWidth,const float xyz_cam[3][3])44 void cielab (const float (*rgb)[3], float* l, float* a, float *b, const int width, const int height, const int labWidth, const float xyz_cam[3][3])
45 {
46     static LUTf cbrt(0x14000);
47 
48     if (!rgb) {
49         static bool cbrtinit = false;
50         if(!cbrtinit) {
51             //sRGB epsilon and kappa
52             constexpr double eps = 216.0 / 24389.0;
53             constexpr double kappa = 24389.0 / 27.0;
54             for (int i = 0; i < 0x14000; i++) {
55                 double r = i / 65535.0;
56                 cbrt[i] = r > eps ? std::cbrt(r) : (kappa * r + 16.0) / 116.0;
57             }
58 
59             cbrtinit = true;
60         }
61 
62         return;
63     }
64 
65 #ifdef __SSE2__
66     vfloat c116v = F2V(116.f);
67     vfloat c16v = F2V(16.f);
68     vfloat c500v = F2V(500.f);
69     vfloat c200v = F2V(200.f);
70     vfloat xyz_camv[3][3];
71 
72     for(int i = 0; i < 3; i++)
73         for(int j = 0; j < 3; j++) {
74             xyz_camv[i][j] = F2V(xyz_cam[i][j]);
75         }
76 
77 #endif // __SSE2__
78 
79     for(int i = 0; i < height; i++) {
80         int j = 0;
81 #ifdef __SSE2__
82 
83         for(; j < labWidth - 3; j += 4) {
84             vfloat redv, greenv, bluev;
85             vconvertrgbrgbrgbrgb2rrrrggggbbbb(rgb[i * width + j], redv, greenv, bluev);
86             vfloat xyz0v = redv * xyz_camv[0][0] + greenv * xyz_camv[0][1] + bluev * xyz_camv[0][2];
87             vfloat xyz1v = redv * xyz_camv[1][0] + greenv * xyz_camv[1][1] + bluev * xyz_camv[1][2];
88             vfloat xyz2v = redv * xyz_camv[2][0] + greenv * xyz_camv[2][1] + bluev * xyz_camv[2][2];
89             xyz0v = cbrt[_mm_cvtps_epi32(xyz0v)];
90             xyz1v = cbrt[_mm_cvtps_epi32(xyz1v)];
91             xyz2v = cbrt[_mm_cvtps_epi32(xyz2v)];
92 
93             STVFU(l[i * labWidth + j], c116v * xyz1v - c16v);
94             STVFU(a[i * labWidth + j], c500v * (xyz0v - xyz1v));
95             STVFU(b[i * labWidth + j], c200v * (xyz1v - xyz2v));
96         }
97 
98 #endif
99 
100         for(; j < labWidth; j++) {
101             float xyz[3] = {0.5f, 0.5f, 0.5f};
102 
103             for(int c = 0; c < 3; c++) {
104                 float val = rgb[i * width + j][c];
105                 xyz[0] += xyz_cam[0][c] * val;
106                 xyz[1] += xyz_cam[1][c] * val;
107                 xyz[2] += xyz_cam[2][c] * val;
108             }
109 
110             xyz[0] = cbrt[(int) xyz[0]];
111             xyz[1] = cbrt[(int) xyz[1]];
112             xyz[2] = cbrt[(int) xyz[2]];
113 
114             l[i * labWidth + j] = 116 * xyz[1] - 16;
115             a[i * labWidth + j] = 500 * (xyz[0] - xyz[1]);
116             b[i * labWidth + j] = 200 * (xyz[1] - xyz[2]);
117         }
118     }
119 }
120 }
121 
122 
123 /*
124    Frank Markesteijn's algorithm for Fuji X-Trans sensors
125    adapted to RT by Ingo Weyrich 2014
126 */
127 
128 using namespace librtprocess;
markesteijn_demosaic(int width,int height,const float * const * rawData,float ** red,float ** green,float ** blue,const unsigned xtrans[6][6],const float rgb_cam[3][4],const std::function<bool (double)> & setProgCancel,const int passes,const bool useCieLab,size_t chunkSize,bool measure)129 rpError markesteijn_demosaic (int width, int height, const float * const *rawData, float **red, float **green, float **blue, const unsigned xtrans[6][6], const float rgb_cam[3][4], const std::function<bool(double)> &setProgCancel, const int passes, const bool useCieLab, size_t chunkSize, bool measure)
130 {
131     BENCHFUN
132     std::unique_ptr<StopWatch> stop;
133 
134     if (measure) {
135         std::cout << passes << "-pass Markesteijn Demosaicing " << width << "x" << height << " image with " << chunkSize << " tiles per thread" << std::endl;
136         stop.reset(new StopWatch("xtrans demosaic"));
137     }
138     if (!validateXtransCfa(xtrans)) {
139         return RP_WRONG_CFA;
140     }
141 
142     rpError rc = RP_NO_ERROR;
143 
144     constexpr int ts = 114;      /* Tile Size */
145     constexpr int tsh = ts / 2;  /* half of Tile Size */
146 
147     double progress = 0.0;
148     setProgCancel(progress);
149 
150     constexpr short  orth[12] = { 1, 0, 0, 1, -1, 0, 0, -1, 1, 0, 0, 1 },
151     patt[2][16] = { { 0, 1, 0, -1, 2, 0, -1, 0, 1, 1, 1, -1, 0, 0, 0, 0 },
152                     { 0, 1, 0, -2, 1, 0, -2, 0, 1, 1, -2, -2, 1, -1, -1, 1 }
153                   },
154     dir[4] = { 1, ts, ts + 1, ts - 1 };
155 
156     // sgrow/sgcol is the offset in the sensor matrix of the solitary
157     // green pixels
158     unsigned short sgrow = 0, sgcol = 0;
159 
160     float xyz_cam[3][3];
161     {
162         int k;
163 
164         for (int i = 0; i < 3; i++)
165             for (int j = 0; j < 3; j++)
166                 for (xyz_cam[i][j] = k = 0; k < 3; k++) {
167                     xyz_cam[i][j] += xyz_rgb[i][k] * rgb_cam[k][j] / d65_white[i];
168                 }
169     }
170 
171     /* Map a green hexagon around each non-green pixel and vice versa:  */
172     short allhex[2][3][3][8];
173     {
174         int gint, d, h, v, ng, row, col;
175 
176         for (row = 0; row < 3; row++)
177             for (col = 0; col < 3; col++) {
178                 gint = isgreen(xtrans, row, col);
179 
180                 for (ng = d = 0; d < 10; d += 2) {
181                     if (isgreen(xtrans, row + orth[d] + 6, col + orth[d + 2] + 6)) {
182                         ng = 0;
183                     } else {
184                         ng++;
185                     }
186 
187                     if (ng == 4) {
188                         // if there are four non-green pixels adjacent in cardinal
189                         // directions, this is the solitary green pixel
190                         sgrow = row;
191                         sgcol = col;
192                     }
193 
194                     if (ng == gint + 1) {
195                         for (int c = 0; c < 8; c++) {
196                             v = orth[d] * patt[gint][c * 2] + orth[d + 1] * patt[gint][c * 2 + 1];
197                             h = orth[d + 2] * patt[gint][c * 2] + orth[d + 3] * patt[gint][c * 2 + 1];
198                             allhex[0][row][col][c ^ (gint * 2 & d)] = h + v * width;
199                             allhex[1][row][col][c ^ (gint * 2 & d)] = h + v * ts;
200                         }
201                     }
202                 }
203             }
204 
205     }
206 
207     progress += 0.05;
208     setProgCancel(progress);
209 
210 
211     double progressInc = 36.0 * (1.0 - progress) / ((height * width) / ((ts - 16) * (ts - 16)));
212     const int ndir = 4 << (passes > 1);
213     cielab (nullptr, nullptr, nullptr, nullptr, 0, 0, 0, nullptr);
214     struct s_minmaxgreen {
215         float min;
216         float max;
217     };
218 
219     int RightShift[3];
220 
221     for(int row = 0; row < 3; row++) {
222         // count number of green pixels in three cols
223         int greencount = 0;
224 
225         for(int col = 0; col < 3; col++) {
226             greencount += isgreen(xtrans, row, col);
227         }
228 
229         RightShift[row] = (greencount == 2);
230     }
231 
232 #ifdef _OPENMP
233     #pragma omp parallel
234 #endif
235     {
236         int progressCounter = 0;
237         float dcolor[3][6];
238 
239         float *buffer = (float *) malloc ((ts * ts * (ndir * 4 + 3) + 128) * sizeof(float));
240 
241 #ifdef _OPENMP
242         #pragma omp critical
243 #endif
244         {
245             if(!buffer) {
246                 rc = RP_MEMORY_ERROR;
247             }
248         }
249 #ifdef _OPENMP
250         #pragma omp barrier
251 #endif
252         if(!rc) {
253             float (*rgb)[ts][ts][3] = (float(*)[ts][ts][3]) buffer;
254             float (*lab)[ts - 8][ts - 8] = (float (*)[ts - 8][ts - 8])(buffer + ts * ts * (ndir * 3));
255             float (*drv)[ts - 10][ts - 10] = (float (*)[ts - 10][ts - 10])   (buffer + ts * ts * (ndir * 3 + 3));
256             uint8_t (*homo)[ts][ts] = (uint8_t  (*)[ts][ts])   (lab); // we can reuse the lab-buffer because they are not used together
257             s_minmaxgreen  (*greenminmaxtile)[tsh] = (s_minmaxgreen(*)[tsh]) (lab); // we can reuse the lab-buffer because they are not used together
258             uint8_t (*homosum)[ts][ts] = (uint8_t (*)[ts][ts]) (drv); // we can reuse the drv-buffer because they are not used together
259             uint8_t (*homosummax)[ts] = (uint8_t (*)[ts]) homo[ndir - 1]; // we can reuse the homo-buffer because they are not used together
260 
261 #ifdef _OPENMP
262             #pragma omp for collapse(2) schedule(dynamic, chunkSize) nowait
263 #endif
264 
265             for (int top = 3; top < height - 19; top += ts - 16)
266                 for (int left = 3; left < width - 19; left += ts - 16) {
267                     int mrow = std::min(top + ts, height - 3);
268                     int mcol = std::min(left + ts, width - 3);
269 
270                     /* Set greenmin and greenmax to the minimum and maximum allowed values: */
271                     for (int row = top; row < mrow; row++) {
272                         // find first non-green pixel
273                         int leftstart = left;
274 
275                         for(; leftstart < mcol; leftstart++)
276                             if(!isgreen(xtrans, row, leftstart)) {
277                                 break;
278                             }
279 
280                         int coloffset = (RightShift[row % 3] == 1 ? 3 : 1 + (fc(xtrans, row, leftstart + 1) & 1));
281 
282                         if(coloffset == 3) {
283                             short *hex = allhex[0][row % 3][leftstart % 3];
284 
285                             for (int col = leftstart; col < mcol; col += coloffset) {
286                                 float minval = FLT_MAX;
287                                 float maxval = 0.f;
288                                 const float *pix = &rawData[row][col];
289 
290                                 for(int c = 0; c < 6; c++) {
291                                     float val = pix[hex[c]];
292 
293                                     minval = minval < val ? minval : val;
294                                     maxval = maxval > val ? maxval : val;
295                                 }
296 
297                                 greenminmaxtile[row - top][(col - left) >> 1].min = minval;
298                                 greenminmaxtile[row - top][(col - left) >> 1].max = maxval;
299                             }
300                         } else {
301                             float minval = FLT_MAX;
302                             float maxval = 0.f;
303                             int col = leftstart;
304 
305                             if(coloffset == 2) {
306                                 minval = FLT_MAX;
307                                 maxval = 0.f;
308                                 const float *pix = &rawData[row][col];
309                                 short *hex = allhex[0][row % 3][col % 3];
310 
311                                 for(int c = 0; c < 6; c++) {
312                                     float val = pix[hex[c]];
313 
314                                     minval = minval < val ? minval : val;
315                                     maxval = maxval > val ? maxval : val;
316                                 }
317 
318                                 greenminmaxtile[row - top][(col - left) >> 1].min = minval;
319                                 greenminmaxtile[row - top][(col - left) >> 1].max = maxval;
320                                 col += 2;
321                             }
322 
323                             short *hex = allhex[0][row % 3][col % 3];
324 
325                             for (; col < mcol - 1; col += 3) {
326                                 minval = FLT_MAX;
327                                 maxval = 0.f;
328                                 const float *pix = &rawData[row][col];
329 
330                                 for(int c = 0; c < 6; c++) {
331                                     float val = pix[hex[c]];
332 
333                                     minval = minval < val ? minval : val;
334                                     maxval = maxval > val ? maxval : val;
335                                 }
336 
337                                 greenminmaxtile[row - top][(col - left) >> 1].min = minval;
338                                 greenminmaxtile[row - top][(col - left) >> 1].max = maxval;
339                                 greenminmaxtile[row - top][(col + 1 - left) >> 1].min = minval;
340                                 greenminmaxtile[row - top][(col + 1 - left) >> 1].max = maxval;
341                             }
342 
343                             if(col < mcol) {
344                                 minval = FLT_MAX;
345                                 maxval = 0.f;
346                                 const float *pix = &rawData[row][col];
347 
348                                 for(int c = 0; c < 6; c++) {
349                                     float val = pix[hex[c]];
350 
351                                     minval = minval < val ? minval : val;
352                                     maxval = maxval > val ? maxval : val;
353                                 }
354 
355                                 greenminmaxtile[row - top][(col - left) >> 1].min = minval;
356                                 greenminmaxtile[row - top][(col - left) >> 1].max = maxval;
357                             }
358                         }
359                     }
360 
361                     memset(rgb, 0, ts * ts * 3 * sizeof(float));
362 
363                     for (int row = top; row < mrow; row++)
364                         for (int col = left; col < mcol; col++) {
365                             rgb[0][row - top][col - left][fc(xtrans, row, col)] = rawData[row][col];
366                         }
367 
368                     for(int c = 0; c < 3; c++) {
369                         memcpy (rgb[c + 1], rgb[0], sizeof * rgb);
370                     }
371 
372                     /* Interpolate green horizontally, vertically, and along both diagonals: */
373                     for (int row = top; row < mrow; row++) {
374                         // find first non-green pixel
375                         int leftstart = left;
376 
377                         for(; leftstart < mcol; leftstart++)
378                             if(!isgreen(xtrans, row, leftstart)) {
379                                 break;
380                             }
381 
382                         int coloffset = (RightShift[row % 3] == 1 ? 3 : 1 + (fc(xtrans, row, leftstart + 1) & 1));
383 
384                         if(coloffset == 3) {
385                             short *hex = allhex[0][row % 3][leftstart % 3];
386 
387                             for (int col = leftstart; col < mcol; col += coloffset) {
388                                 const float *pix = &rawData[row][col];
389                                 float color[4];
390                                 color[0] = 0.6796875f * (pix[hex[1]] + pix[hex[0]]) -
391                                            0.1796875f * (pix[2 * hex[1]] + pix[2 * hex[0]]);
392                                 color[1] = 0.87109375f *  pix[hex[3]] + pix[hex[2]] * 0.12890625f +
393                                            0.359375f * (pix[0] - pix[-hex[2]]);
394 
395                                 for(int c = 0; c < 2; c++)
396                                     color[2 + c] = 0.640625f * pix[hex[4 + c]] + 0.359375f * pix[-2 * hex[4 + c]] + 0.12890625f *
397                                                    (2.f * pix[0] - pix[3 * hex[4 + c]] - pix[-3 * hex[4 + c]]);
398 
399                                 for(int c = 0; c < 4; c++) {
400                                     rgb[c][row - top][col - left][1] = LIM(color[c], greenminmaxtile[row - top][(col - left) >> 1].min, greenminmaxtile[row - top][(col - left) >> 1].max);
401                                 }
402                             }
403                         } else {
404                             short *hexmod[2];
405                             hexmod[0] = allhex[0][row % 3][leftstart % 3];
406                             hexmod[1] = allhex[0][row % 3][(leftstart + coloffset) % 3];
407 
408                             for (int col = leftstart, hexindex = 0; col < mcol; col += coloffset, coloffset ^= 3, hexindex ^= 1) {
409                                 const float *pix = &rawData[row][col];
410                                 short *hex = hexmod[hexindex];
411                                 float color[4];
412                                 color[0] = 0.6796875f * (pix[hex[1]] + pix[hex[0]]) -
413                                            0.1796875f * (pix[2 * hex[1]] + pix[2 * hex[0]]);
414                                 color[1] = 0.87109375f *  pix[hex[3]] + pix[hex[2]] * 0.12890625f +
415                                            0.359375f * (pix[0] - pix[-hex[2]]);
416 
417                                 for(int c = 0; c < 2; c++)
418                                     color[2 + c] = 0.640625f * pix[hex[4 + c]] + 0.359375f * pix[-2 * hex[4 + c]] + 0.12890625f *
419                                                    (2.f * pix[0] - pix[3 * hex[4 + c]] - pix[-3 * hex[4 + c]]);
420 
421                                 for(int c = 0; c < 4; c++) {
422                                     rgb[c ^ 1][row - top][col - left][1] = LIM(color[c], greenminmaxtile[row - top][(col - left) >> 1].min, greenminmaxtile[row - top][(col - left) >> 1].max);
423                                 }
424                             }
425                         }
426                     }
427 
428                     for (int pass = 0; pass < passes; pass++) {
429                         if (pass == 1) {
430                             memcpy (rgb += 4, buffer, 4 * sizeof * rgb);
431                         }
432 
433                         /* Recalculate green from interpolated values of closer pixels: */
434                         if (pass) {
435                             for (int row = top + 2; row < mrow - 2; row++) {
436                                 int leftstart = left + 2;
437 
438                                 for(; leftstart < mcol - 2; leftstart++)
439                                     if(!isgreen(xtrans, row, leftstart)) {
440                                         break;
441                                     }
442 
443                                 int coloffset = (RightShift[row % 3] == 1 ? 3 : 1 + (fc(xtrans, row, leftstart + 1) & 1));
444 
445                                 if(coloffset == 3) {
446                                     int f = fc(xtrans, row, leftstart);
447                                     short *hex = allhex[1][row % 3][leftstart % 3];
448 
449                                     for (int col = leftstart; col < mcol - 2; col += coloffset, f ^= 2) {
450                                         for (int d = 3; d < 6; d++) {
451                                             float (*rix)[3] = &rgb[(d - 2)][row - top][col - left];
452                                             float val = 0.33333333f * (rix[-2 * hex[d]][1] + 2 * (rix[hex[d]][1] - rix[hex[d]][f])
453                                                                        - rix[-2 * hex[d]][f]) + rix[0][f];
454                                             rix[0][1] = LIM(val, greenminmaxtile[row - top][(col - left) >> 1].min, greenminmaxtile[row - top][(col - left) >> 1].max);
455                                         }
456                                     }
457                                 } else {
458                                     int f = fc(xtrans, row, leftstart);
459                                     short *hexmod[2];
460                                     hexmod[0] = allhex[1][row % 3][leftstart % 3];
461                                     hexmod[1] = allhex[1][row % 3][(leftstart + coloffset) % 3];
462 
463                                     for (int col = leftstart, hexindex = 0; col < mcol - 2; col += coloffset, coloffset ^= 3, f = f ^ (coloffset & 2), hexindex ^= 1 ) {
464                                         short *hex = hexmod[hexindex];
465 
466                                         for (int d = 3; d < 6; d++) {
467                                             float (*rix)[3] = &rgb[(d - 2) ^ 1][row - top][col - left];
468                                             float val = 0.33333333f * (rix[-2 * hex[d]][1] + 2 * (rix[hex[d]][1] - rix[hex[d]][f])
469                                                                        - rix[-2 * hex[d]][f]) + rix[0][f];
470                                             rix[0][1] = LIM(val, greenminmaxtile[row - top][(col - left) >> 1].min, greenminmaxtile[row - top][(col - left) >> 1].max);
471                                         }
472                                     }
473                                 }
474                             }
475                         }
476 
477                         /* Interpolate red and blue values for solitary green pixels:   */
478                         int sgstartcol = (left - sgcol + 4) / 3 * 3 + sgcol;
479 
480                         for (int row = (top - sgrow + 4) / 3 * 3 + sgrow; row < mrow - 2; row += 3) {
481                             for (int col = sgstartcol, h = fc(xtrans, row, col + 1); col < mcol - 2; col += 3, h ^= 2) {
482                                 float (*rix)[3] = &rgb[0][row - top][col - left];
483                                 float diff[6] = {0.f};
484 
485                                 for (int i = 1, d = 0; d < 6; d++, i ^= ts ^ 1, h ^= 2) {
486                                     for (int c = 0; c < 2; c++, h ^= 2) {
487                                         float g = rix[0][1] + rix[0][1] - rix[i << c][1] - rix[-i << c][1];
488                                         dcolor[h][d] = g + rix[i << c][h] + rix[-i << c][h];
489 
490                                         if (d > 1)
491                                             diff[d] += SQR (rix[i << c][1] - rix[-i << c][1]
492                                                             - rix[i << c][h] + rix[-i << c][h]) + SQR(g);
493                                     }
494 
495                                     if (d > 2 && (d & 1))    // 3, 5
496                                         if (diff[d - 1] < diff[d])
497                                             for(int c = 0; c < 2; c++) {
498                                                 dcolor[c * 2][d] = dcolor[c * 2][d - 1];
499                                             }
500 
501                                     if ((d & 1) || d < 2) { // d: 0, 1, 3, 5
502                                         for(int c = 0; c < 2; c++) {
503                                             rix[0][c * 2] = 0.5f * dcolor[c * 2][d];
504                                         }
505 
506                                         rix += ts * ts;
507                                     }
508                                 }
509                             }
510                         }
511 
512                         /* Interpolate red for blue pixels and vice versa:      */
513                         for (int row = top + 3; row < mrow - 3; row++) {
514                             int leftstart = left + 3;
515 
516                             for(; leftstart < mcol - 1; leftstart++)
517                                 if(!isgreen(xtrans, row, leftstart)) {
518                                     break;
519                                 }
520 
521                             int coloffset = (RightShift[row % 3] == 1 ? 3 : 1);
522                             int c = ((row - sgrow) % 3) ? ts : 1;
523                             int h = 3 * (c ^ ts ^ 1);
524 
525                             if(coloffset == 3) {
526                                 int f = 2 - fc(xtrans, row, leftstart);
527 
528                                 for (int col = leftstart; col < mcol - 3; col += coloffset, f ^= 2) {
529                                     float (*rix)[3] = &rgb[0][row - top][col - left];
530 
531                                     for (int d = 0; d < 4; d++, rix += ts * ts) {
532                                         int i = d > 1 || ((d ^ c) & 1) ||
533                                                 ((fabsf(rix[0][1] - rix[c][1]) + fabsf(rix[0][1] - rix[-c][1])) < 2.f * (fabsf(rix[0][1] - rix[h][1]) + fabsf(rix[0][1] - rix[-h][1]))) ? c : h;
534 
535                                         rix[0][f] = rix[0][1] + 0.5f * (rix[i][f] + rix[-i][f] - rix[i][1] - rix[-i][1]);
536                                     }
537                                 }
538                             } else {
539                                 coloffset = fc(xtrans, row, leftstart + 1) == 1 ? 2 : 1;
540                                 int f = 2 - fc(xtrans, row, leftstart);
541 
542                                 for (int col = leftstart; col < mcol - 3; col += coloffset, coloffset ^= 3, f = f ^ (coloffset & 2) ) {
543                                     float (*rix)[3] = &rgb[0][row - top][col - left];
544 
545                                     for (int d = 0; d < 4; d++, rix += ts * ts) {
546                                         int i = d > 1 || ((d ^ c) & 1) ||
547                                                 ((fabsf(rix[0][1] - rix[c][1]) + fabsf(rix[0][1] - rix[-c][1])) < 2.f * (fabsf(rix[0][1] - rix[h][1]) + fabsf(rix[0][1] - rix[-h][1]))) ? c : h;
548 
549                                         rix[0][f] = rix[0][1] + 0.5f * (rix[i][f] + rix[-i][f] - rix[i][1] - rix[-i][1]);
550                                     }
551                                 }
552                             }
553                         }
554 
555                         /* Fill in red and blue for 2x2 blocks of green:        */
556                         // Find first row of 2x2 green
557                         int topstart = top + 2;
558 
559                         for(; topstart < mrow - 2; topstart++)
560                             if((topstart - sgrow) % 3) {
561                                 break;
562                             }
563 
564                         int leftstart = left + 2;
565 
566                         for(; leftstart < mcol - 2; leftstart++)
567                             if((leftstart - sgcol) % 3) {
568                                 break;
569                             }
570 
571                         int coloffsetstart = 2 - (fc(xtrans, topstart, leftstart + 1) & 1);
572 
573                         for (int row = topstart; row < mrow - 2; row++) {
574                             if ((row - sgrow) % 3) {
575                                 short *hexmod[2];
576                                 hexmod[0] = allhex[1][row % 3][leftstart % 3];
577                                 hexmod[1] = allhex[1][row % 3][(leftstart + coloffsetstart) % 3];
578 
579                                 for (int col = leftstart, coloffset = coloffsetstart, hexindex = 0; col < mcol - 2; col += coloffset, coloffset ^= 3, hexindex ^= 1) {
580                                     float (*rix)[3] = &rgb[0][row - top][col - left];
581                                     short *hex = hexmod[hexindex];
582 
583                                     for (int d = 0; d < ndir; d += 2, rix += ts * ts) {
584                                         if (hex[d] + hex[d + 1]) {
585                                             float g = 3 * rix[0][1] - 2 * rix[hex[d]][1] - rix[hex[d + 1]][1];
586 
587                                             for (int c = 0; c < 4; c += 2) {
588                                                 rix[0][c] = (g + 2 * rix[hex[d]][c] + rix[hex[d + 1]][c]) * 0.33333333f;
589                                             }
590                                         } else {
591                                             float g = 2 * rix[0][1] - rix[hex[d]][1] - rix[hex[d + 1]][1];
592 
593                                             for (int c = 0; c < 4; c += 2) {
594                                                 rix[0][c] = (g + rix[hex[d]][c] + rix[hex[d + 1]][c]) * 0.5f;
595                                             }
596                                         }
597                                     }
598                                 }
599                             }
600                         }
601                     }
602 
603     // end of multipass part
604                     rgb = (float(*)[ts][ts][3]) buffer;
605                     mrow -= top;
606                     mcol -= left;
607 
608                     if(useCieLab) {
609                         /* Convert to CIELab and differentiate in all directions:   */
610                         // Original dcraw algorithm uses CIELab as perceptual space
611                         // (presumably coming from original AHD) and converts taking
612                         // camera matrix into account.  We use this in RT.
613                         for (int d = 0; d < ndir; d++) {
614                             float *l = &lab[0][0][0];
615                             float *a = &lab[1][0][0];
616                             float *b = &lab[2][0][0];
617                             cielab(&rgb[d][4][4], l, a, b, ts, mrow - 8, ts - 8, xyz_cam);
618                             int f = dir[d & 3];
619                             f = f == 1 ? 1 : f - 8;
620 
621                             for (int row = 5; row < mrow - 5; row++)
622 #ifdef _OPENMP
623                                 #pragma omp simd
624 #endif
625                                 for (int col = 5; col < mcol - 5; col++) {
626                                     float *ll = &lab[0][row - 4][col - 4];
627                                     float *la = &lab[1][row - 4][col - 4];
628                                     float *lb = &lab[2][row - 4][col - 4];
629 
630                                     float g = 2 * ll[0] - ll[f] - ll[-f];
631                                     drv[d][row - 5][col - 5] =  SQR(g)
632                                                                 + SQR((2 * la[0] - la[f] - la[-f] + g * 2.1551724f))
633                                                                 + SQR((2 * lb[0] - lb[f] - lb[-f] - g * 0.86206896f));
634                                 }
635 
636                         }
637                     } else {
638                         // For 1-pass demosaic we use YPbPr which requires much
639                         // less code and is nearly indistinguishable. It assumes the
640                         // camera RGB is roughly linear.
641                         for (int d = 0; d < ndir; d++) {
642                             float (*yuv)[ts - 8][ts - 8] = lab; // we use the lab buffer, which has the same dimensions
643 #ifdef __SSE2__
644                             vfloat zd2627v = F2V(0.2627f);
645                             vfloat zd6780v = F2V(0.6780f);
646                             vfloat zd0593v = F2V(0.0593f);
647                             vfloat zd56433v = F2V(0.56433f);
648                             vfloat zd67815v = F2V(0.67815f);
649 #endif
650 
651                             for (int row = 4; row < mrow - 4; row++) {
652                                 int col = 4;
653 #ifdef __SSE2__
654 
655                                 for (; col < mcol - 7; col += 4) {
656                                     // use ITU-R BT.2020 YPbPr, which is great, but could use
657                                     // a better/simpler choice? note that imageop.h provides
658                                     // dt_iop_RGB_to_YCbCr which uses Rec. 601 conversion,
659                                     // which appears less good with specular highlights
660                                     vfloat redv, greenv, bluev;
661                                     vconvertrgbrgbrgbrgb2rrrrggggbbbb(rgb[d][row][col], redv, greenv, bluev);
662                                     vfloat yv = zd2627v * redv + zd6780v * bluev + zd0593v * greenv;
663                                     STVFU(yuv[0][row - 4][col - 4], yv);
664                                     STVFU(yuv[1][row - 4][col - 4], (bluev - yv) * zd56433v);
665                                     STVFU(yuv[2][row - 4][col - 4], (redv - yv) * zd67815v);
666                                 }
667 
668 #endif
669 
670                                 for (; col < mcol - 4; col++) {
671                                     // use ITU-R BT.2020 YPbPr, which is great, but could use
672                                     // a better/simpler choice? note that imageop.h provides
673                                     // dt_iop_RGB_to_YCbCr which uses Rec. 601 conversion,
674                                     // which appears less good with specular highlights
675                                     float y = 0.2627f * rgb[d][row][col][0] + 0.6780f * rgb[d][row][col][1] + 0.0593f * rgb[d][row][col][2];
676                                     yuv[0][row - 4][col - 4] = y;
677                                     yuv[1][row - 4][col - 4] = (rgb[d][row][col][2] - y) * 0.56433f;
678                                     yuv[2][row - 4][col - 4] = (rgb[d][row][col][0] - y) * 0.67815f;
679                                 }
680                             }
681 
682                             int f = dir[d & 3];
683                             f = f == 1 ? 1 : f - 8;
684 
685                             for (int row = 5; row < mrow - 5; row++)
686                                 for (int col = 5; col < mcol - 5; col++) {
687                                     float *y = &yuv[0][row - 4][col - 4];
688                                     float *u = &yuv[1][row - 4][col - 4];
689                                     float *v = &yuv[2][row - 4][col - 4];
690                                     drv[d][row - 5][col - 5] = SQR(2 * y[0] - y[f] - y[-f])
691                                                                + SQR(2 * u[0] - u[f] - u[-f])
692                                                                + SQR(2 * v[0] - v[f] - v[-f]);
693                                 }
694                         }
695                     }
696 
697                     /* Build homogeneity maps from the derivatives:         */
698 #ifdef __SSE2__
699                     vfloat eightv = F2V(8.f);
700                     vfloat zerov = F2V(0.f);
701                     vfloat onev = F2V(1.f);
702 #endif
703 
704                     for (int row = 6; row < mrow - 6; row++) {
705                         int col = 6;
706 #ifdef __SSE2__
707 
708                         for (; col < mcol - 9; col += 4) {
709                             vfloat tr1v = vminf(LVFU(drv[0][row - 5][col - 5]), LVFU(drv[1][row - 5][col - 5]));
710                             vfloat tr2v = vminf(LVFU(drv[2][row - 5][col - 5]), LVFU(drv[3][row - 5][col - 5]));
711 
712                             if(ndir > 4) {
713                                 vfloat tr3v = vminf(LVFU(drv[4][row - 5][col - 5]), LVFU(drv[5][row - 5][col - 5]));
714                                 vfloat tr4v = vminf(LVFU(drv[6][row - 5][col - 5]), LVFU(drv[7][row - 5][col - 5]));
715                                 tr1v = vminf(tr1v, tr3v);
716                                 tr1v = vminf(tr1v, tr4v);
717                             }
718 
719                             tr1v = vminf(tr1v, tr2v);
720                             tr1v = tr1v * eightv;
721 
722                             for (int d = 0; d < ndir; d++) {
723                                 uint8_t tempstore[16];
724                                 vfloat tempv = zerov;
725 
726                                 for (int v = -1; v <= 1; v++) {
727                                     for (int h = -1; h <= 1; h++) {
728                                         tempv += vselfzero(vmaskf_le(LVFU(drv[d][row + v - 5][col + h - 5]), tr1v), onev);
729                                     }
730                                 }
731 
732                                 _mm_storeu_si128((__m128i*)&tempstore, _mm_cvtps_epi32(tempv));
733                                 homo[d][row][col] = tempstore[0];
734                                 homo[d][row][col + 1] = tempstore[4];
735                                 homo[d][row][col + 2] = tempstore[8];
736                                 homo[d][row][col + 3] = tempstore[12];
737 
738                             }
739                         }
740 
741 #endif
742 
743                         for (; col < mcol - 6; col++) {
744                             float tr = drv[0][row - 5][col - 5] < drv[1][row - 5][col - 5] ? drv[0][row - 5][col - 5] : drv[1][row - 5][col - 5];
745 
746                             for (int d = 2; d < ndir; d++) {
747                                 tr = (drv[d][row - 5][col - 5] < tr ? drv[d][row - 5][col - 5] : tr);
748                             }
749 
750                             tr *= 8;
751 
752                             for (int d = 0; d < ndir; d++) {
753                                 uint8_t temp = 0;
754 
755                                 for (int v = -1; v <= 1; v++) {
756                                     for (int h = -1; h <= 1; h++) {
757                                         temp += (drv[d][row + v - 5][col + h - 5] <= tr ? 1 : 0);
758                                     }
759                                 }
760 
761                                 homo[d][row][col] = temp;
762                             }
763                         }
764                     }
765 
766                     if (height - top < ts + 4) {
767                         mrow = height - top + 2;
768                     }
769 
770                     if (width - left < ts + 4) {
771                         mcol = width - left + 2;
772                     }
773 
774 
775                     /* Build 5x5 sum of homogeneity maps */
776                     const int startcol = std::min(left, 8);
777 
778                     for(int d = 0; d < ndir; d++) {
779                         for (int row = std::min(top, 8); row < mrow - 8; row++) {
780                             int col = startcol;
781 #ifdef __SSE2__
782                             int endcol = row < mrow - 9 ? mcol - 8 : mcol - 23;
783 
784                             // crunching 16 values at once is faster than summing up column sums
785                             for (; col < endcol; col += 16) {
786                                 vint v5sumv = (vint)ZEROV;
787 
788                                 for(int v = -2; v <= 2; v++)
789                                     for(int h = -2; h <= 2; h++) {
790                                         v5sumv = _mm_adds_epu8( _mm_loadu_si128((vint*)&homo[d][row + v][col + h]), v5sumv);
791                                     }
792 
793                                 _mm_storeu_si128((vint*)&homosum[d][row][col], v5sumv);
794                             }
795 
796 #endif
797 
798                             if(col < mcol - 8) {
799                                 int v5sum[5] = {0};
800 
801                                 for(int v = -2; v <= 2; v++)
802                                     for(int h = -2; h <= 2; h++) {
803                                         v5sum[2 + h] += homo[d][row + v][col + h];
804                                     }
805 
806                                 int blocksum = v5sum[0] + v5sum[1] + v5sum[2] + v5sum[3] + v5sum[4];
807                                 homosum[d][row][col] = blocksum;
808                                 col++;
809 
810                                 // now we can subtract a column of five from blocksum and get new colsum of 5
811                                 for (int voffset = 0; col < mcol - 8; col++, voffset++) {
812                                     int colsum = homo[d][row - 2][col + 2] + homo[d][row - 1][col + 2] + homo[d][row][col + 2] + homo[d][row + 1][col + 2] + homo[d][row + 2][col + 2];
813                                     voffset = voffset == 5 ? 0 : voffset;  // faster than voffset %= 5;
814                                     blocksum -= v5sum[voffset];
815                                     blocksum += colsum;
816                                     v5sum[voffset] = colsum;
817                                     homosum[d][row][col] = blocksum;
818                                 }
819                             }
820                         }
821                     }
822 
823                     // calculate maximum of homogeneity maps per pixel. Vectorized calculation is a tiny bit faster than on the fly calculation in next step
824 #ifdef __SSE2__
825                     vint maskv = _mm_set1_epi8(31);
826 #endif
827 
828                     for (int row = std::min(top, 8); row < mrow - 8; row++) {
829                         int col = startcol;
830 #ifdef __SSE2__
831                         int endcol = row < mrow - 9 ? mcol - 8 : mcol - 23;
832 
833                         for (; col < endcol; col += 16) {
834                             vint maxval1 = _mm_max_epu8(_mm_loadu_si128((vint*)&homosum[0][row][col]), _mm_loadu_si128((vint*)&homosum[1][row][col]));
835                             vint maxval2 = _mm_max_epu8(_mm_loadu_si128((vint*)&homosum[2][row][col]), _mm_loadu_si128((vint*)&homosum[3][row][col]));
836 
837                             if(ndir > 4) {
838                                 vint maxval3 = _mm_max_epu8(_mm_loadu_si128((vint*)&homosum[4][row][col]), _mm_loadu_si128((vint*)&homosum[5][row][col]));
839                                 vint maxval4 = _mm_max_epu8(_mm_loadu_si128((vint*)&homosum[6][row][col]), _mm_loadu_si128((vint*)&homosum[7][row][col]));
840                                 maxval1 = _mm_max_epu8(maxval1, maxval3);
841                                 maxval1 = _mm_max_epu8(maxval1, maxval4);
842                             }
843 
844                             maxval1 = _mm_max_epu8(maxval1, maxval2);
845                             // there is no shift intrinsic for epu8. Shift using epi32 and mask the wrong bits out
846                             vint subv = _mm_srli_epi32( maxval1, 3 );
847                             subv = _mm_and_si128(subv, maskv);
848                             maxval1 = _mm_subs_epu8(maxval1, subv);
849                             _mm_storeu_si128((vint*)&homosummax[row][col], maxval1);
850                         }
851 
852 #endif
853 
854                         for (; col < mcol - 8; col ++) {
855                             uint8_t maxval = homosum[0][row][col];
856 
857                             for(int d = 1; d < ndir; d++) {
858                                 maxval = maxval < homosum[d][row][col] ? homosum[d][row][col] : maxval;
859                             }
860 
861                             maxval -= maxval >> 3;
862                             homosummax[row][col] = maxval;
863                         }
864                     }
865 
866 
867                     /* Average the most homogeneous pixels for the final result: */
868                     uint8_t hm[8] = {};
869 
870                     for (int row = std::min(top, 8); row < mrow - 8; row++)
871                         for (int col = std::min(left, 8); col < mcol - 8; col++) {
872 
873                             for (int d = 0; d < 4; d++) {
874                                 hm[d] = homosum[d][row][col];
875                             }
876 
877                             for (int d = 4; d < ndir; d++) {
878                                 hm[d] = homosum[d][row][col];
879 
880                                 if (hm[d - 4] < hm[d]) {
881                                     hm[d - 4] = 0;
882                                 } else if (hm[d - 4] > hm[d]) {
883                                     hm[d] = 0;
884                                 }
885                             }
886 
887                             float avg[4] = {0.f};
888 
889                             uint8_t maxval = homosummax[row][col];
890 
891                             for (int d = 0; d < ndir; d++)
892                                 if (hm[d] >= maxval) {
893                                     for (int c = 0; c < 3; c++) {
894                                         avg[c] += rgb[d][row][col][c];
895                                     }
896                                     avg[3]++;
897                                 }
898 
899                             red[row + top][col + left] = avg[0] / avg[3];
900                             green[row + top][col + left] = avg[1] / avg[3];
901                             blue[row + top][col + left] = avg[2] / avg[3];
902                         }
903 
904                     if((++progressCounter) % 32 == 0) {
905 #ifdef _OPENMP
906                         #pragma omp critical (xtransdemosaic)
907 #endif
908                         {
909                             progress += progressInc;
910                             progress = min(1.0, progress);
911                             setProgCancel(progress);
912                         }
913                     }
914 
915 
916                 }
917         }
918         free(buffer);
919     }
920     xtransborder_demosaic(width, height, 8, rawData, red, green, blue, xtrans);
921     return rc;
922 }
923