1/*
2    This file is part of darktable,
3    copyright (c) 2009--2013 johannes hanika.
4    copyright (c) 2014 Ulrich Pegelow.
5    copyright (c) 2014 LebedevRI.
6
7    darktable is free software: you can redistribute it and/or modify
8    it under the terms of the GNU General Public License as published by
9    the Free Software Foundation, either version 3 of the License, or
10    (at your option) any later version.
11
12    darktable is distributed in the hope that it will be useful,
13    but WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15    GNU General Public License for more details.
16
17    You should have received a copy of the GNU General Public License
18    along with darktable.  If not, see <http://www.gnu.org/licenses/>.
19*/
20
21#include "colorspace.h"
22#include "color_conversion.h"
23#include "common.h"
24#include "rgb_norms.h"
25
26int
27BL(const int row, const int col)
28{
29  return (((row & 1) << 1) + (col & 1));
30}
31
32kernel void
33rawprepare_1f(read_only image2d_t in, write_only image2d_t out,
34              const int width, const int height,
35              const int cx, const int cy,
36              global const float *sub, global const float *div,
37              const int rx, const int ry)
38{
39  const int x = get_global_id(0);
40  const int y = get_global_id(1);
41
42  if(x >= width || y >= height) return;
43
44  const float pixel = read_imageui(in, sampleri, (int2)(x + cx, y + cy)).x;
45
46  const int id = BL(ry+cy+y, rx+cx+x);
47  const float pixel_scaled = (pixel - sub[id]) / div[id];
48
49  write_imagef(out, (int2)(x, y), (float4)(pixel_scaled, 0.0f, 0.0f, 0.0f));
50}
51
52kernel void
53rawprepare_1f_unnormalized(read_only image2d_t in, write_only image2d_t out,
54                           const int width, const int height,
55                           const int cx, const int cy,
56                           global const float *sub, global const float *div,
57                           const int rx, const int ry)
58{
59  const int x = get_global_id(0);
60  const int y = get_global_id(1);
61
62  if(x >= width  || y >= height) return;
63
64  const float pixel = read_imagef(in, sampleri, (int2)(x + cx, y + cy)).x;
65
66  const int id = BL(ry+cy+y, rx+cx+x);
67  const float pixel_scaled = (pixel - sub[id]) / div[id];
68
69  write_imagef(out, (int2)(x, y), (float4)(pixel_scaled, 0.0f, 0.0f, 0.0f));
70}
71
72kernel void
73rawprepare_4f(read_only image2d_t in, write_only image2d_t out,
74              const int width, const int height,
75              const int cx, const int cy,
76              global const float *black, global const float *div)
77{
78  const int x = get_global_id(0);
79  const int y = get_global_id(1);
80
81  if(x >= width || y >= height) return;
82
83  float4 pixel = read_imagef(in, sampleri, (int2)(x + cx, y + cy));
84  pixel.xyz = (pixel.xyz - black[0]) / div[0];
85
86  write_imagef(out, (int2)(x, y), pixel);
87}
88
89kernel void
90invert_1f(read_only image2d_t in, write_only image2d_t out, const int width, const int height, global float *color,
91          const unsigned int filters, const int rx, const int ry)
92{
93  const int x = get_global_id(0);
94  const int y = get_global_id(1);
95  if(x >= width || y >= height) return;
96  const float pixel = read_imagef(in, sampleri, (int2)(x, y)).x;
97  const float inv_pixel = color[FC(ry+y, rx+x, filters)] - pixel;
98
99  write_imagef (out, (int2)(x, y), (float4)(clamp(inv_pixel, 0.0f, 1.0f), 0.0f, 0.0f, 0.0f));
100}
101
102kernel void
103invert_4f(read_only image2d_t in, write_only image2d_t out, const int width, const int height, global float *color,
104                const unsigned int filters, const int rx, const int ry)
105{
106  const int x = get_global_id(0);
107  const int y = get_global_id(1);
108  if(x >= width || y >= height) return;
109  float4 pixel = read_imagef(in, sampleri, (int2)(x, y));
110  pixel.x = color[0] - pixel.x;
111  pixel.y = color[1] - pixel.y;
112  pixel.z = color[2] - pixel.z;
113  pixel.xyz = clamp(pixel.xyz, 0.0f, 1.0f);
114
115  write_imagef (out, (int2)(x, y), pixel);
116}
117
118kernel void
119whitebalance_1f(read_only image2d_t in, write_only image2d_t out, const int width, const int height, global float *coeffs,
120    const unsigned int filters, const int rx, const int ry, global const unsigned char (*const xtrans)[6])
121{
122  const int x = get_global_id(0);
123  const int y = get_global_id(1);
124  if(x >= width || y >= height) return;
125  const float pixel = read_imagef(in, sampleri, (int2)(x, y)).x;
126  write_imagef (out, (int2)(x, y), (float4)(pixel * coeffs[FC(ry+y, rx+x, filters)], 0.0f, 0.0f, 0.0f));
127}
128
129kernel void
130whitebalance_1f_xtrans(read_only image2d_t in, write_only image2d_t out, const int width, const int height, global float *coeffs,
131    const unsigned int filters, const int rx, const int ry, global const unsigned char (*const xtrans)[6])
132{
133  const int x = get_global_id(0);
134  const int y = get_global_id(1);
135  if(x >= width || y >= height) return;
136  const float pixel = read_imagef(in, sampleri, (int2)(x, y)).x;
137  write_imagef (out, (int2)(x, y), (float4)(pixel * coeffs[FCxtrans(ry+y, rx+x, xtrans)], 0.0f, 0.0f, 0.0f));
138}
139
140
141kernel void
142whitebalance_4f(read_only image2d_t in, write_only image2d_t out, const int width, const int height, global float *coeffs,
143    const unsigned int filters, const int rx, const int ry, global const unsigned char (*const xtrans)[6])
144{
145  const int x = get_global_id(0);
146  const int y = get_global_id(1);
147  if(x >= width || y >= height) return;
148  const float4 pixel = read_imagef(in, sampleri, (int2)(x, y));
149  write_imagef (out, (int2)(x, y), (float4)(pixel.x * coeffs[0], pixel.y * coeffs[1], pixel.z * coeffs[2], pixel.w));
150}
151
152/* kernel for the exposure plugin. should work transparently with float4 and float image2d. */
153kernel void
154exposure (read_only image2d_t in, write_only image2d_t out, const int width, const int height, const float black, const float scale)
155{
156  const int x = get_global_id(0);
157  const int y = get_global_id(1);
158
159  if(x >= width || y >= height) return;
160  float4 pixel = read_imagef(in, sampleri, (int2)(x, y));
161  pixel.xyz = ((pixel - black ) * scale).xyz;
162  write_imagef (out, (int2)(x, y), pixel);
163}
164
165/* kernel for the highlights plugin. */
166kernel void
167highlights_4f_clip (read_only image2d_t in, write_only image2d_t out, const int width, const int height,
168                    const int mode, const float clip)
169{
170  const int x = get_global_id(0);
171  const int y = get_global_id(1);
172
173  if(x >= width || y >= height) return;
174
175  // 4f/pixel means that this has been debayered already.
176  // it's thus hopeless to recover highlights here (this code path is just used for preview and non-raw images)
177  float4 pixel = read_imagef(in, sampleri, (int2)(x, y));
178  // default: // 0, DT_IOP_HIGHLIGHTS_CLIP
179  pixel.x = fmin(clip, pixel.x);
180  pixel.y = fmin(clip, pixel.y);
181  pixel.z = fmin(clip, pixel.z);
182  write_imagef (out, (int2)(x, y), pixel);
183}
184
185kernel void
186highlights_1f_clip (read_only image2d_t in, write_only image2d_t out, const int width, const int height,
187                    const float clip, const int rx, const int ry, const int filters)
188{
189  const int x = get_global_id(0);
190  const int y = get_global_id(1);
191
192  if(x >= width || y >= height) return;
193
194  float pixel = read_imagef(in, sampleri, (int2)(x, y)).x;
195
196  pixel = fmin(clip, pixel);
197
198  write_imagef (out, (int2)(x, y), pixel);
199}
200
201#define SQRT3 1.7320508075688772935274463415058723669f
202#define SQRT12 3.4641016151377545870548926830117447339f // 2*SQRT3
203kernel void
204highlights_1f_lch_bayer (read_only image2d_t in, write_only image2d_t out, const int width, const int height,
205                         const float clip, const int rx, const int ry, const int filters)
206{
207  const int x = get_global_id(0);
208  const int y = get_global_id(1);
209
210  if(x >= width || y >= height) return;
211
212  int clipped = 0;
213  float R = 0.0f;
214  float Gmin = FLT_MAX;
215  float Gmax = -FLT_MAX;
216  float B = 0.0f;
217  float pixel = 0.0f;
218
219  // sample 1 bayer block. thus we will have 2 green values.
220  for(int jj = 0; jj <= 1; jj++)
221  {
222    for(int ii = 0; ii <= 1; ii++)
223    {
224      const float val = read_imagef(in, sampleri, (int2)(x+ii, y+jj)).x;
225
226      pixel = (ii == 0 && jj == 0) ? val : pixel;
227
228      clipped = (clipped || (val > clip));
229
230      const int c = FC(y + jj + ry, x + ii + rx, filters);
231
232      switch(c)
233      {
234        case 0:
235          R = val;
236          break;
237        case 1:
238          Gmin = min(Gmin, val);
239          Gmax = max(Gmax, val);
240          break;
241        case 2:
242          B = val;
243          break;
244      }
245    }
246  }
247
248  if(clipped)
249  {
250    const float Ro = min(R, clip);
251    const float Go = min(Gmin, clip);
252    const float Bo = min(B, clip);
253
254    const float L = (R + Gmax + B) / 3.0f;
255
256    float C = SQRT3 * (R - Gmax);
257    float H = 2.0f * B - Gmax - R;
258
259    const float Co = SQRT3 * (Ro - Go);
260    const float Ho = 2.0f * Bo - Go - Ro;
261
262    const float ratio = (R != Gmax && Gmax != B) ? sqrt((Co * Co + Ho * Ho) / (C * C + H * H)) : 1.0f;
263
264    C *= ratio;
265    H *= ratio;
266
267    /*
268     * backtransform proof, sage:
269     *
270     * R,G,B,L,C,H = var('R,G,B,L,C,H')
271     * solve([L==(R+G+B)/3, C==sqrt(3)*(R-G), H==2*B-G-R], R, G, B)
272     *
273     * result:
274     * [[R == 1/6*sqrt(3)*C - 1/6*H + L, G == -1/6*sqrt(3)*C - 1/6*H + L, B == 1/3*H + L]]
275     */
276    const int c = FC(y + ry, x + rx, filters);
277    C = (c == 1) ? -C : C;
278
279    pixel = L;
280    pixel += (c == 2) ? H / 3.0f : -H / 6.0f + C / SQRT12;
281  }
282
283  write_imagef (out, (int2)(x, y), pixel);
284}
285
286
287kernel void
288highlights_1f_lch_xtrans (read_only image2d_t in, write_only image2d_t out, const int width, const int height,
289                         const float clip, const int rx, const int ry, global const unsigned char (*const xtrans)[6],
290                         local float *buffer)
291{
292  const int x = get_global_id(0);
293  const int y = get_global_id(1);
294  const int xlsz = get_local_size(0);
295  const int ylsz = get_local_size(1);
296  const int xlid = get_local_id(0);
297  const int ylid = get_local_id(1);
298  const int xgid = get_group_id(0);
299  const int ygid = get_group_id(1);
300
301  // individual control variable in this work group and the work group size
302  const int l = mad24(ylid, xlsz, xlid);
303  const int lsz = mul24(xlsz, ylsz);
304
305  // stride and maximum capacity of local buffer
306  // cells of 1*float per pixel with a surrounding border of 2 cells
307  const int stride = xlsz + 2*2;
308  const int maxbuf = mul24(stride, ylsz + 2*2);
309
310  // coordinates of top left pixel of buffer
311  // this is 2 pixel left and above of the work group origin
312  const int xul = mul24(xgid, xlsz) - 2;
313  const int yul = mul24(ygid, ylsz) - 2;
314
315  // populate local memory buffer
316  for(int n = 0; n <= maxbuf/lsz; n++)
317  {
318    const int bufidx = mad24(n, lsz, l);
319    if(bufidx >= maxbuf) continue;
320    const int xx = xul + bufidx % stride;
321    const int yy = yul + bufidx / stride;
322    buffer[bufidx] = read_imagef(in, sampleri, (int2)(xx, yy)).x;
323  }
324
325  // center buffer around current x,y-Pixel
326  buffer += mad24(ylid + 2, stride, xlid + 2);
327
328  barrier(CLK_LOCAL_MEM_FENCE);
329
330  if(x >= width || y >= height) return;
331
332  float pixel = 0.0f;
333
334  if(x < 2 || x > width - 3 || y < 2 || y > height - 3)
335  {
336    // fast path for border
337    pixel = min(clip, buffer[0]);
338  }
339  else
340  {
341    // if current pixel is clipped, always reconstruct
342    int clipped = (buffer[0] > clip);
343
344    if(!clipped)
345    {
346      clipped = 1;
347      // check if there is any 3x3 block touching the current
348      // pixel which has no clipping, as then we don't need to
349      // reconstruct the current pixel. This avoids zippering in
350      // edge transitions from clipped to unclipped areas. The
351      // X-Trans sensor seems prone to this, unlike Bayer, due
352      // to its irregular pattern.
353      for(int offset_j = -2; offset_j <= 0; offset_j++)
354      {
355        for(int offset_i = -2; offset_i <= 0; offset_i++)
356        {
357          if(clipped)
358          {
359            clipped = 0;
360            for(int jj = offset_j; jj <= offset_j + 2; jj++)
361            {
362              for(int ii = offset_i; ii <= offset_i + 2; ii++)
363              {
364                const float val = buffer[mad24(jj, stride, ii)];
365                clipped = (clipped || (val > clip));
366              }
367            }
368          }
369        }
370      }
371    }
372
373    if(clipped)
374    {
375      float mean[3] = { 0.0f, 0.0f, 0.0f };
376      int cnt[3] = { 0, 0, 0 };
377      float RGBmax[3] = { -FLT_MAX, -FLT_MAX, -FLT_MAX };
378
379      for(int jj = -1; jj <= 1; jj++)
380      {
381        for(int ii = -1; ii <= 1; ii++)
382        {
383          const float val = buffer[mad24(jj, stride, ii)];
384          const int c = FCxtrans(y + jj + ry, x + ii + rx, xtrans);
385          mean[c] += val;
386          cnt[c]++;
387          RGBmax[c] = max(RGBmax[c], val);
388        }
389      }
390
391      const float Ro = min(mean[0]/cnt[0], clip);
392      const float Go = min(mean[1]/cnt[1], clip);
393      const float Bo = min(mean[2]/cnt[2], clip);
394
395      const float R = RGBmax[0];
396      const float G = RGBmax[1];
397      const float B = RGBmax[2];
398
399      const float L = (R + G + B) / 3.0f;
400      float C = SQRT3 * (R - G);
401      float H = 2.0f * B - G - R;
402
403      const float Co = SQRT3 * (Ro - Go);
404      const float Ho = 2.0f * Bo - Go - Ro;
405
406      if(R != G && G != B)
407      {
408        const float ratio = sqrt((Co * Co + Ho * Ho) / (C * C + H * H));
409        C *= ratio;
410        H *= ratio;
411      }
412
413      float RGB[3] = { 0.0f, 0.0f, 0.0f };
414
415      RGB[0] = L - H / 6.0f + C / SQRT12;
416      RGB[1] = L - H / 6.0f - C / SQRT12;
417      RGB[2] = L + H / 3.0f;
418
419      pixel = RGB[FCxtrans(y + ry, x + rx, xtrans)];
420    }
421    else
422      pixel = buffer[0];
423  }
424
425  write_imagef (out, (int2)(x, y), pixel);
426}
427#undef SQRT3
428#undef SQRT12
429
430float
431lookup_unbounded_twosided(read_only image2d_t lut, const float x, constant float *a)
432{
433  // in case the tone curve is marked as linear, return the fast
434  // path to linear unbounded (does not clip x at 1)
435  if(a[0] >= 0.0f)
436  {
437    const float ar = 1.0f/a[0];
438    const float al = 1.0f - 1.0f/a[3];
439    if(x < ar && x >= al)
440    {
441      // lut lookup
442      const int xi = clamp((int)(x * 0x10000ul), 0, 0xffff);
443      const int2 p = (int2)((xi & 0xff), (xi >> 8));
444      return read_imagef(lut, sampleri, p).x;
445    }
446    else
447    {
448      // two-sided extrapolation (with inverted x-axis for left side)
449      const float xx = (x >= ar) ? x : 1.0f - x;
450      constant float *aa = (x >= ar) ? a : a + 3;
451      return aa[1] * native_powr(xx*aa[0], aa[2]);
452    }
453  }
454  else return x;
455}
456
457float
458lerp_lookup_unbounded0(read_only image2d_t lut, const float x, global const float *a)
459{
460  // in case the tone curve is marked as linear, return the fast
461  // path to linear unbounded (does not clip x at 1)
462  if(a[0] >= 0.0f)
463  {
464    if(x < 1.0f)
465    {
466      const float ft = clamp(x * (float)0xffff, 0.0f, (float)0xffff);
467      const int t = ft < 0xfffe ? ft : 0xfffe;
468      const float f = ft - t;
469      const int2 p1 = (int2)((t & 0xff), (t >> 8));
470      const int2 p2 = (int2)(((t + 1) & 0xff), ((t + 1) >> 8));
471      const float l1 = read_imagef(lut, sampleri, p1).x;
472      const float l2 = read_imagef(lut, sampleri, p2).x;
473      return l1 * (1.0f - f) + l2 * f;
474    }
475    else return a[1] * native_powr(x*a[0], a[2]);
476  }
477  else return x;
478}
479
480
481/* kernel for the plugin colorin: unbound processing */
482kernel void
483colorin_unbound (read_only image2d_t in, write_only image2d_t out, const int width, const int height,
484                 global float *cmat, global float *lmat,
485                 read_only image2d_t lutr, read_only image2d_t lutg, read_only image2d_t lutb,
486                 const int blue_mapping, global const float (*const a)[3])
487{
488  const int x = get_global_id(0);
489  const int y = get_global_id(1);
490
491  if(x >= width || y >= height) return;
492
493  float4 pixel = read_imagef(in, sampleri, (int2)(x, y));
494
495  float cam[3], XYZ[3];
496  cam[0] = lerp_lookup_unbounded0(lutr, pixel.x, a[0]);
497  cam[1] = lerp_lookup_unbounded0(lutg, pixel.y, a[1]);
498  cam[2] = lerp_lookup_unbounded0(lutb, pixel.z, a[2]);
499
500  if(blue_mapping)
501  {
502    const float YY = cam[0] + cam[1] + cam[2];
503    if(YY > 0.0f)
504    {
505      // manual gamut mapping. these values cause trouble when converting back from Lab to sRGB:
506      const float zz = cam[2] / YY;
507      // lower amount and higher bound_z make the effect smaller.
508      // the effect is weakened the darker input values are, saturating at bound_Y
509      const float bound_z = 0.5f, bound_Y = 0.8f;
510      const float amount = 0.11f;
511      if (zz > bound_z)
512      {
513        const float t = (zz - bound_z) / (1.0f - bound_z) * fmin(1.0f, YY / bound_Y);
514        cam[1] += t * amount;
515        cam[2] -= t * amount;
516      }
517    }
518  }
519
520  // now convert camera to XYZ using the color matrix
521  for(int j=0;j<3;j++)
522  {
523    XYZ[j] = 0.0f;
524    for(int i=0;i<3;i++) XYZ[j] += cmat[3*j+i] * cam[i];
525  }
526  float4 xyz = (float4)(XYZ[0], XYZ[1], XYZ[2], 0.0f);
527  pixel.xyz = XYZ_to_Lab(xyz).xyz;
528  write_imagef (out, (int2)(x, y), pixel);
529}
530
531/* kernel for the plugin colorin: with clipping */
532kernel void
533colorin_clipping (read_only image2d_t in, write_only image2d_t out, const int width, const int height,
534                  global float *cmat, global float *lmat,
535                  read_only image2d_t lutr, read_only image2d_t lutg, read_only image2d_t lutb,
536                  const int blue_mapping, global const float (*const a)[3])
537{
538  const int x = get_global_id(0);
539  const int y = get_global_id(1);
540
541  if(x >= width || y >= height) return;
542
543  float4 pixel = read_imagef(in, sampleri, (int2)(x, y));
544
545  float cam[3], RGB[3], XYZ[3];
546  cam[0] = lerp_lookup_unbounded0(lutr, pixel.x, a[0]);
547  cam[1] = lerp_lookup_unbounded0(lutg, pixel.y, a[1]);
548  cam[2] = lerp_lookup_unbounded0(lutb, pixel.z, a[2]);
549
550  if(blue_mapping)
551  {
552    const float YY = cam[0] + cam[1] + cam[2];
553    if(YY > 0.0f)
554    {
555      // manual gamut mapping. these values cause trouble when converting back from Lab to sRGB:
556      const float zz = cam[2] / YY;
557      // lower amount and higher bound_z make the effect smaller.
558      // the effect is weakened the darker input values are, saturating at bound_Y
559      const float bound_z = 0.5f, bound_Y = 0.8f;
560      const float amount = 0.11f;
561      if (zz > bound_z)
562      {
563        const float t = (zz - bound_z) / (1.0f - bound_z) * fmin(1.0f, YY / bound_Y);
564        cam[1] += t * amount;
565        cam[2] -= t * amount;
566      }
567    }
568  }
569
570  // convert camera to RGB using the first color matrix
571  for(int j=0;j<3;j++)
572  {
573    RGB[j] = 0.0f;
574    for(int i=0;i<3;i++) RGB[j] += cmat[3*j+i] * cam[i];
575  }
576
577  // clamp at this stage
578  for(int i=0; i<3; i++) RGB[i] = clamp(RGB[i], 0.0f, 1.0f);
579
580  // convert clipped RGB to XYZ
581  for(int j=0;j<3;j++)
582  {
583    XYZ[j] = 0.0f;
584    for(int i=0;i<3;i++) XYZ[j] += lmat[3*j+i] * RGB[i];
585  }
586
587  float4 xyz = (float4)(XYZ[0], XYZ[1], XYZ[2], 0.0f);
588  pixel.xyz = XYZ_to_Lab(xyz).xyz;
589  write_imagef (out, (int2)(x, y), pixel);
590}
591
592/* kernel for the tonecurve plugin. */
593kernel void
594tonecurve (read_only image2d_t in, write_only image2d_t out, const int width, const int height,
595           read_only image2d_t table_L, read_only image2d_t table_a, read_only image2d_t table_b,
596           const int autoscale_ab, const int unbound_ab, constant float *coeffs_L, constant float *coeffs_ab,
597           const float low_approximation, const int preserve_colors,
598           constant dt_colorspaces_iccprofile_info_cl_t *profile_info, read_only image2d_t lut)
599{
600  const int x = get_global_id(0);
601  const int y = get_global_id(1);
602
603  if(x >= width || y >= height) return;
604
605  float4 pixel = read_imagef(in, sampleri, (int2)(x, y));
606  const float L_in = pixel.x/100.0f;
607  // use lut or extrapolation:
608  const float L = lookup_unbounded(table_L, L_in, coeffs_L);
609
610  if (autoscale_ab == 0)
611  {
612    const float a_in = (pixel.y + 128.0f) / 256.0f;
613    const float b_in = (pixel.z + 128.0f) / 256.0f;
614
615    if (unbound_ab == 0)
616    {
617      pixel.y = lookup(table_a, a_in);
618      pixel.z = lookup(table_b, b_in);
619    }
620    else
621    {
622      // use lut or two-sided extrapolation
623      pixel.y = lookup_unbounded_twosided(table_a, a_in, coeffs_ab);
624      pixel.z = lookup_unbounded_twosided(table_b, b_in, coeffs_ab + 6);
625    }
626    pixel.x = L;
627  }
628  else if(autoscale_ab == 1)
629  {
630    if(L_in > 0.01f)
631    {
632      pixel.y *= L/pixel.x;
633      pixel.z *= L/pixel.x;
634    }
635    else
636    {
637      pixel.y *= low_approximation;
638      pixel.z *= low_approximation;
639    }
640    pixel.x = L;
641  }
642  else if(autoscale_ab == 2)
643  {
644    float4 xyz = Lab_to_XYZ(pixel);
645    xyz.x = lookup_unbounded(table_L, xyz.x, coeffs_L);
646    xyz.y = lookup_unbounded(table_L, xyz.y, coeffs_L);
647    xyz.z = lookup_unbounded(table_L, xyz.z, coeffs_L);
648    pixel.xyz = XYZ_to_Lab(xyz).xyz;
649  }
650  else if(autoscale_ab == 3)
651  {
652    float4 rgb = Lab_to_prophotorgb(pixel);
653
654    if (preserve_colors == DT_RGB_NORM_NONE)
655    {
656      rgb.x = lookup_unbounded(table_L, rgb.x, coeffs_L);
657      rgb.y = lookup_unbounded(table_L, rgb.y, coeffs_L);
658      rgb.z = lookup_unbounded(table_L, rgb.z, coeffs_L);
659    }
660    else
661    {
662      float ratio = 1.f;
663      float lum = dt_rgb_norm(rgb, preserve_colors, 1, profile_info, lut);
664      if(lum > 0.f)
665      {
666        const float curve_lum = lookup_unbounded(table_L, lum, coeffs_L);
667        ratio = curve_lum / lum;
668      }
669      rgb.xyz *= ratio;
670    }
671    pixel.xyz = prophotorgb_to_Lab(rgb).xyz;
672  }
673
674  write_imagef (out, (int2)(x, y), pixel);
675}
676
677
678/* kernel for the colorcorrection plugin. */
679__kernel void
680colorcorrection (read_only image2d_t in, write_only image2d_t out, const int width, const int height,
681                 const float saturation, const float a_scale, const float a_base,
682                 const float b_scale, const float b_base)
683{
684  const int x = get_global_id(0);
685  const int y = get_global_id(1);
686
687  if(x >= width || y >= height) return;
688
689  float4 pixel = read_imagef(in, sampleri, (int2)(x, y));
690  pixel.y = saturation*(pixel.y + pixel.x * a_scale + a_base);
691  pixel.z = saturation*(pixel.z + pixel.x * b_scale + b_base);
692  write_imagef (out, (int2)(x, y), pixel);
693}
694
695
696void
697mul_mat_vec_2(const float4 m, const float2 *p, float2 *o)
698{
699  (*o).x = (*p).x*m.x + (*p).y*m.y;
700  (*o).y = (*p).x*m.z + (*p).y*m.w;
701}
702
703void
704backtransform(float2 *p, float2 *o, const float4 m, const float2 t)
705{
706  (*p).y /= (1.0f + (*p).x*t.x);
707  (*p).x /= (1.0f + (*p).y*t.y);
708  mul_mat_vec_2(m, p, o);
709}
710
711void
712keystone_backtransform(float2 *i, const float4 k_space, const float2 ka, const float4 ma, const float2 mb)
713{
714  float xx = (*i).x - k_space.x;
715  float yy = (*i).y - k_space.y;
716
717  /*float u = ka.x-kb.x+kc.x-kd.x;
718  float v = ka.x-kb.x;
719  float w = ka.x-kd.x;
720  float z = ka.x;
721  //(*i).x = (xx/k_space.z)*(yy/k_space.w)*(ka.x-kb.x+kc.x-kd.x) - (xx/k_space.z)*(ka.x-kb.x) - (yy/k_space.w)*(ka.x-kd.x) + ka.x + k_space.x;
722  (*i).x = (xx/k_space.z)*(yy/k_space.w)*u - (xx/k_space.z)*v - (yy/k_space.w)*w + z + k_space.x;
723  u = ka.y-kb.y+kc.y-kd.y;
724  v = ka.y-kb.y;
725  w = ka.y-kd.y;
726  z = ka.y;
727  //(*i).y = (xx/k_space.z)*(yy/k_space.w)*(ka.y-kb.y+kc.y-kd.y) - (xx/k_space.z)*(ka.y-kb.y) - (yy/k_space.w)*(ka.y-kd.y) + ka.y + k_space.y;
728  (*i).y = (xx/k_space.z)*(yy/k_space.w)*u - (xx/k_space.z)*v - (yy/k_space.w)*w + z + k_space.y;*/
729  float div = ((ma.z*xx-ma.x*yy)*mb.y+(ma.y*yy-ma.w*xx)*mb.x+ma.x*ma.w-ma.y*ma.z);
730
731  (*i).x = (ma.w*xx-ma.y*yy)/div + ka.x;
732  (*i).y =-(ma.z*xx-ma.x*yy)/div + ka.y;
733}
734
735
736float
737interpolation_func_bicubic(float t)
738{
739  float r;
740  t = fabs(t);
741
742  r = (t >= 2.0f) ? 0.0f : ((t > 1.0f) ? (0.5f*(t*(-t*t + 5.0f*t - 8.0f) + 4.0f)) : (0.5f*(t*(3.0f*t*t - 5.0f*t) + 2.0f)));
743
744  return r;
745}
746
747#define DT_LANCZOS_EPSILON (1e-9f)
748
749#if 0
750float
751interpolation_func_lanczos(float width, float t)
752{
753  float ta = fabs(t);
754
755  float r = (ta > width) ? 0.0f : ((ta < DT_LANCZOS_EPSILON) ? 1.0f : width*native_sin(M_PI_F*t)*native_sin(M_PI_F*t/width)/(M_PI_F*M_PI_F*t*t));
756
757  return r;
758}
759#else
760float
761sinf_fast(float t)
762{
763  /***** if you change this function, you must also change the copy in src/common/math.h *****/
764  const float a = 4.0f/(M_PI_F*M_PI_F);
765  const float p = 0.225f;
766
767  t = a*t*(M_PI_F - fabs(t));
768
769  return p*(t*fabs(t) - t) + t;
770}
771
772float
773interpolation_func_lanczos(float width, float t)
774{
775  /* Compute a value for sinf(pi.t) in [-pi pi] for which the value will be
776   * correct */
777  int a = (int)t;
778  float r = t - (float)a;
779
780  // Compute the correct sign for sinf(pi.r)
781  union { float f; unsigned int i; } sign;
782  sign.i = ((a&1)<<31) | 0x3f800000;
783
784  return (DT_LANCZOS_EPSILON + width*sign.f*sinf_fast(M_PI_F*r)*sinf_fast(M_PI_F*t/width))/(DT_LANCZOS_EPSILON + M_PI_F*M_PI_F*t*t);
785}
786#endif
787
788
789/* kernel for clip&rotate: bilinear interpolation */
790__kernel void
791clip_rotate_bilinear(read_only image2d_t in, write_only image2d_t out, const int width, const int height,
792            const int in_width, const int in_height,
793            const int2 roi_in, const float2 roi_out, const float scale_in, const float scale_out,
794            const int flip, const float2 t, const float2 k, const float4 mat,
795            const float4 k_space, const float2 ka, const float4 ma, const float2 mb)
796{
797  const int x = get_global_id(0);
798  const int y = get_global_id(1);
799
800  if(x >= width || y >= height) return;
801
802  float2 pi, po;
803
804  pi.x = roi_out.x + x + 0.5f;
805  pi.y = roi_out.y + y + 0.5f;
806
807  pi.x -= flip ? t.y * scale_out : t.x * scale_out;
808  pi.y -= flip ? t.x * scale_out : t.y * scale_out;
809
810  pi /= scale_out;
811  backtransform(&pi, &po, mat, k);
812  po *= scale_in;
813
814  po.x += t.x * scale_in;
815  po.y += t.y * scale_in;
816
817  if (k_space.z > 0.0f) keystone_backtransform(&po,k_space,ka,ma,mb);
818
819  po.x -= roi_in.x + 0.5f;
820  po.y -= roi_in.y + 0.5f;
821
822  const int ii = (int)po.x;
823  const int jj = (int)po.y;
824
825  float4 o = (ii >=0 && jj >= 0 && ii < in_width && jj < in_height) ? read_imagef(in, samplerf, po) : (float4)0.0f;
826
827  write_imagef (out, (int2)(x, y), o);
828}
829
830
831
832/* kernel for clip&rotate: bicubic interpolation */
833__kernel void
834clip_rotate_bicubic(read_only image2d_t in, write_only image2d_t out, const int width, const int height,
835            const int in_width, const int in_height,
836            const int2 roi_in, const float2 roi_out, const float scale_in, const float scale_out,
837            const int flip, const float2 t, const float2 k, const float4 mat,
838            const float4 k_space, const float2 ka, const float4 ma, const float2 mb)
839{
840  const int x = get_global_id(0);
841  const int y = get_global_id(1);
842
843  const int kwidth = 2;
844
845  if(x >= width || y >= height) return;
846
847  float2 pi, po;
848
849  pi.x = roi_out.x + x + 0.5f;
850  pi.y = roi_out.y + y + 0.5f;
851
852  pi.x -= flip ? t.y * scale_out : t.x * scale_out;
853  pi.y -= flip ? t.x * scale_out : t.y * scale_out;
854
855  pi /= scale_out;
856  backtransform(&pi, &po, mat, k);
857  po *= scale_in;
858
859  po.x += t.x * scale_in;
860  po.y += t.y * scale_in;
861
862  if (k_space.z > 0.0f) keystone_backtransform(&po,k_space,ka,ma,mb);
863
864  po.x -= roi_in.x + 0.5f;
865  po.y -= roi_in.y + 0.5f;
866
867  int tx = po.x;
868  int ty = po.y;
869
870  float4 pixel = (float4)0.0f;
871  float weight = 0.0f;
872
873  for(int jj = 1 - kwidth; jj <= kwidth; jj++)
874    for(int ii= 1 - kwidth; ii <= kwidth; ii++)
875  {
876    const int i = tx + ii;
877    const int j = ty + jj;
878
879    float wx = interpolation_func_bicubic((float)i - po.x);
880    float wy = interpolation_func_bicubic((float)j - po.y);
881    float w = wx * wy;
882
883    pixel += read_imagef(in, sampleri, (int2)(i, j)) * w;
884    weight += w;
885  }
886
887  pixel = (tx >= 0 && ty >= 0 && tx < in_width && ty < in_height) ? pixel / weight : (float4)0.0f;
888
889  write_imagef (out, (int2)(x, y), pixel);
890}
891
892
893/* kernel for clip&rotate: lanczos2 interpolation */
894__kernel void
895clip_rotate_lanczos2(read_only image2d_t in, write_only image2d_t out, const int width, const int height,
896            const int in_width, const int in_height,
897            const int2 roi_in, const float2 roi_out, const float scale_in, const float scale_out,
898            const int flip, const float2 t, const float2 k, const float4 mat,
899            const float4 k_space, const float2 ka, const float4 ma, const float2 mb)
900{
901  const int x = get_global_id(0);
902  const int y = get_global_id(1);
903
904  const int kwidth = 2;
905
906  if(x >= width || y >= height) return;
907
908  float2 pi, po;
909
910  pi.x = roi_out.x + x + 0.5f;
911  pi.y = roi_out.y + y + 0.5f;
912
913  pi.x -= flip ? t.y * scale_out : t.x * scale_out;
914  pi.y -= flip ? t.x * scale_out : t.y * scale_out;
915
916  pi /= scale_out;
917  backtransform(&pi, &po, mat, k);
918  po *= scale_in;
919
920  po.x += t.x * scale_in;
921  po.y += t.y * scale_in;
922
923  if (k_space.z > 0.0f) keystone_backtransform(&po,k_space,ka,ma,mb);
924
925  po.x -= roi_in.x + 0.5f;
926  po.y -= roi_in.y + 0.5f;
927
928  int tx = po.x;
929  int ty = po.y;
930
931  float4 pixel = (float4)0.0f;
932  float weight = 0.0f;
933
934  for(int jj = 1 - kwidth; jj <= kwidth; jj++)
935    for(int ii= 1 - kwidth; ii <= kwidth; ii++)
936  {
937    const int i = tx + ii;
938    const int j = ty + jj;
939
940    float wx = interpolation_func_lanczos(2, (float)i - po.x);
941    float wy = interpolation_func_lanczos(2, (float)j - po.y);
942    float w = wx * wy;
943
944    pixel += read_imagef(in, sampleri, (int2)(i, j)) * w;
945    weight += w;
946  }
947
948  pixel = (tx >= 0 && ty >= 0 && tx < in_width && ty < in_height) ? pixel / weight : (float4)0.0f;
949
950  write_imagef (out, (int2)(x, y), pixel);
951}
952
953
954
955/* kernel for clip&rotate: lanczos3 interpolation */
956__kernel void
957clip_rotate_lanczos3(read_only image2d_t in, write_only image2d_t out, const int width, const int height,
958            const int in_width, const int in_height,
959            const int2 roi_in, const float2 roi_out, const float scale_in, const float scale_out,
960            const int flip, const float2 t, const float2 k, const float4 mat,
961            const float4 k_space, const float2 ka, const float4 ma, const float2 mb)
962{
963  const int x = get_global_id(0);
964  const int y = get_global_id(1);
965
966  const int kwidth = 3;
967
968  if(x >= width || y >= height) return;
969
970  float2 pi, po;
971
972  pi.x = roi_out.x + x + 0.5f;
973  pi.y = roi_out.y + y + 0.5f;
974
975  pi.x -= flip ? t.y * scale_out : t.x * scale_out;
976  pi.y -= flip ? t.x * scale_out : t.y * scale_out;
977
978  pi /= scale_out;
979  backtransform(&pi, &po, mat, k);
980  po *= scale_in;
981
982  po.x += t.x * scale_in;
983  po.y += t.y * scale_in;
984
985  if (k_space.z > 0.0f) keystone_backtransform(&po,k_space,ka,ma,mb);
986
987  po.x -= roi_in.x + 0.5f;
988  po.y -= roi_in.y + 0.5f;
989
990  int tx = (int)po.x;
991  int ty = (int)po.y;
992
993  float4 pixel = (float4)0.0f;
994  float weight = 0.0f;
995
996  for(int jj = 1 - kwidth; jj <= kwidth; jj++)
997    for(int ii= 1 - kwidth; ii <= kwidth; ii++)
998  {
999    const int i = tx + ii;
1000    const int j = ty + jj;
1001
1002    float wx = interpolation_func_lanczos(3, (float)i - po.x);
1003    float wy = interpolation_func_lanczos(3, (float)j - po.y);
1004    float w = wx * wy;
1005
1006    pixel += read_imagef(in, sampleri, (int2)(i, j)) * w;
1007    weight += w;
1008  }
1009
1010  pixel = (tx >= 0 && ty >= 0 && tx < in_width && ty < in_height) ? pixel / weight : (float4)0.0f;
1011
1012  write_imagef (out, (int2)(x, y), pixel);
1013}
1014
1015
1016/* kernels for the lens plugin: bilinear interpolation */
1017kernel void
1018lens_distort_bilinear (read_only image2d_t in, write_only image2d_t out, const int width, const int height,
1019               const int iwidth, const int iheight, const int roi_in_x, const int roi_in_y, global float *pi,
1020               const int do_nan_checks)
1021{
1022  const int x = get_global_id(0);
1023  const int y = get_global_id(1);
1024
1025  if(x >= width || y >= height) return;
1026
1027  float4 pixel;
1028
1029  float rx, ry;
1030  const int piwidth = 2*3*width;
1031  global float *ppi = pi + mad24(y, piwidth, 2*3*x);
1032
1033  if(do_nan_checks)
1034  {
1035    bool valid = true;
1036
1037    for(int i = 0; i < 6; i++) valid = valid && isfinite(ppi[i]);
1038
1039    if(!valid)
1040    {
1041      pixel = (float4)0.0f;
1042      write_imagef (out, (int2)(x, y), pixel);
1043      return;
1044    }
1045  }
1046
1047  rx = ppi[0] - roi_in_x;
1048  ry = ppi[1] - roi_in_y;
1049  rx = (rx >= 0) ? rx : 0;
1050  ry = (ry >= 0) ? ry : 0;
1051  rx = (rx <= iwidth - 1) ? rx : iwidth - 1;
1052  ry = (ry <= iheight - 1) ? ry : iheight - 1;
1053  pixel.x = read_imagef(in, samplerf, (float2)(rx, ry)).x;
1054
1055  rx = ppi[2] - roi_in_x;
1056  ry = ppi[3] - roi_in_y;
1057  rx = (rx >= 0) ? rx : 0;
1058  ry = (ry >= 0) ? ry : 0;
1059  rx = (rx <= iwidth - 1) ? rx : iwidth - 1;
1060  ry = (ry <= iheight - 1) ? ry : iheight - 1;
1061  pixel.yw = read_imagef(in, samplerf, (float2)(rx, ry)).yw;
1062
1063  rx = ppi[4] - roi_in_x;
1064  ry = ppi[5] - roi_in_y;
1065  rx = (rx >= 0) ? rx : 0;
1066  ry = (ry >= 0) ? ry : 0;
1067  rx = (rx <= iwidth - 1) ? rx : iwidth - 1;
1068  ry = (ry <= iheight - 1) ? ry : iheight - 1;
1069  pixel.z = read_imagef(in, samplerf, (float2)(rx, ry)).z;
1070
1071  pixel = all(isfinite(pixel.xyz)) ? pixel : (float4)0.0f;
1072
1073  write_imagef (out, (int2)(x, y), pixel);
1074}
1075
1076/* kernels for the lens plugin: bicubic interpolation */
1077kernel void
1078lens_distort_bicubic (read_only image2d_t in, write_only image2d_t out, const int width, const int height,
1079                      const int iwidth, const int iheight, const int roi_in_x, const int roi_in_y, global float *pi,
1080                      const int do_nan_checks)
1081{
1082  const int x = get_global_id(0);
1083  const int y = get_global_id(1);
1084
1085  const int kwidth = 2;
1086
1087  if(x >= width || y >= height) return;
1088
1089  float4 pixel = (float4)0.0f;
1090
1091  float rx, ry;
1092  int tx, ty;
1093  float sum, weight;
1094  float2 sum2;
1095  const int piwidth = 2*3*width;
1096  global float *ppi = pi + mad24(y, piwidth, 2*3*x);
1097
1098  if(do_nan_checks)
1099  {
1100    bool valid = true;
1101
1102    for(int i = 0; i < 6; i++) valid = valid && isfinite(ppi[i]);
1103
1104    if(!valid)
1105    {
1106      pixel = (float4)0.0f;
1107      write_imagef (out, (int2)(x, y), pixel);
1108      return;
1109    }
1110  }
1111
1112
1113  rx = ppi[0] - (float)roi_in_x;
1114  ry = ppi[1] - (float)roi_in_y;
1115  rx = (rx >= 0) ? rx : 0;
1116  ry = (ry >= 0) ? ry : 0;
1117  rx = (rx <= iwidth - 1) ? rx : iwidth - 1;
1118  ry = (ry <= iheight - 1) ? ry : iheight - 1;
1119
1120  tx = rx;
1121  ty = ry;
1122
1123  sum = 0.0f;
1124  weight = 0.0f;
1125  for(int jj = 1 - kwidth; jj <= kwidth; jj++)
1126    for(int ii= 1 - kwidth; ii <= kwidth; ii++)
1127  {
1128    int i = tx + ii;
1129    int j = ty + jj;
1130    i = (i >= 0) ? i : 0;
1131    j = (j >= 0) ? j : 0;
1132    i = (i <= iwidth - 1) ? i : iwidth - 1;
1133    j = (j <= iheight - 1) ? j : iheight - 1;
1134
1135    float wx = interpolation_func_bicubic((float)i - rx);
1136    float wy = interpolation_func_bicubic((float)j - ry);
1137    float w = wx * wy;
1138
1139    sum += read_imagef(in, samplerc, (int2)(i, j)).x * w;
1140    weight += w;
1141  }
1142  pixel.x = sum/weight;
1143
1144
1145  rx = ppi[2] - (float)roi_in_x;
1146  ry = ppi[3] - (float)roi_in_y;
1147  rx = (rx >= 0) ? rx : 0;
1148  ry = (ry >= 0) ? ry : 0;
1149  rx = (rx <= iwidth - 1) ? rx : iwidth - 1;
1150  ry = (ry <= iheight - 1) ? ry : iheight - 1;
1151
1152  tx = rx;
1153  ty = ry;
1154
1155  sum2 = (float2)0.0f;
1156  weight = 0.0f;
1157  for(int jj = 1 - kwidth; jj <= kwidth; jj++)
1158    for(int ii= 1 - kwidth; ii <= kwidth; ii++)
1159  {
1160    int i = tx + ii;
1161    int j = ty + jj;
1162    i = (i >= 0) ? i : 0;
1163    j = (j >= 0) ? j : 0;
1164    i = (i <= iwidth - 1) ? i : iwidth - 1;
1165    j = (j <= iheight - 1) ? j : iheight - 1;
1166
1167    float wx = interpolation_func_bicubic((float)i - rx);
1168    float wy = interpolation_func_bicubic((float)j - ry);
1169    float w = wx * wy;
1170
1171    sum2 += read_imagef(in, samplerc, (int2)(i, j)).yw * w;
1172    weight += w;
1173  }
1174  pixel.yw = sum2/weight;
1175
1176
1177  rx = ppi[4] - (float)roi_in_x;
1178  ry = ppi[5] - (float)roi_in_y;
1179  rx = (rx >= 0) ? rx : 0;
1180  ry = (ry >= 0) ? ry : 0;
1181  rx = (rx <= iwidth - 1) ? rx : iwidth - 1;
1182  ry = (ry <= iheight - 1) ? ry : iheight - 1;
1183
1184  tx = rx;
1185  ty = ry;
1186
1187  sum = 0.0f;
1188  weight = 0.0f;
1189  for(int jj = 1 - kwidth; jj <= kwidth; jj++)
1190    for(int ii= 1 - kwidth; ii <= kwidth; ii++)
1191  {
1192    int i = tx + ii;
1193    int j = ty + jj;
1194    i = (i >= 0) ? i : 0;
1195    j = (j >= 0) ? j : 0;
1196    i = (i <= iwidth - 1) ? i : iwidth - 1;
1197    j = (j <= iheight - 1) ? j : iheight - 1;
1198
1199    float wx = interpolation_func_bicubic((float)i - rx);
1200    float wy = interpolation_func_bicubic((float)j - ry);
1201    float w = wx * wy;
1202
1203    sum += read_imagef(in, samplerc, (int2)(i, j)).z * w;
1204    weight += w;
1205  }
1206  pixel.z = sum/weight;
1207
1208  pixel = all(isfinite(pixel.xyz)) ? pixel : (float4)0.0f;
1209
1210  write_imagef (out, (int2)(x, y), pixel);
1211}
1212
1213
1214/* kernels for the lens plugin: lanczos2 interpolation */
1215kernel void
1216lens_distort_lanczos2 (read_only image2d_t in, write_only image2d_t out, const int width, const int height,
1217                      const int iwidth, const int iheight, const int roi_in_x, const int roi_in_y, global float *pi,
1218                      const int do_nan_checks)
1219{
1220  const int x = get_global_id(0);
1221  const int y = get_global_id(1);
1222
1223  const int kwidth = 2;
1224
1225  if(x >= width || y >= height) return;
1226
1227  float4 pixel = (float4)0.0f;
1228
1229  float rx, ry;
1230  int tx, ty;
1231  float sum, weight;
1232  float2 sum2;
1233  const int piwidth = 2*3*width;
1234  global float *ppi = pi + mad24(y, piwidth, 2*3*x);
1235
1236  if(do_nan_checks)
1237  {
1238    bool valid = true;
1239
1240    for(int i = 0; i < 6; i++) valid = valid && isfinite(ppi[i]);
1241
1242    if(!valid)
1243    {
1244      pixel = (float4)0.0f;
1245      write_imagef (out, (int2)(x, y), pixel);
1246      return;
1247    }
1248  }
1249
1250
1251  rx = ppi[0] - (float)roi_in_x;
1252  ry = ppi[1] - (float)roi_in_y;
1253  rx = (rx >= 0) ? rx : 0;
1254  ry = (ry >= 0) ? ry : 0;
1255  rx = (rx <= iwidth - 1) ? rx : iwidth - 1;
1256  ry = (ry <= iheight - 1) ? ry : iheight - 1;
1257
1258  tx = rx;
1259  ty = ry;
1260
1261  sum = 0.0f;
1262  weight = 0.0f;
1263  for(int jj = 1 - kwidth; jj <= kwidth; jj++)
1264    for(int ii= 1 - kwidth; ii <= kwidth; ii++)
1265  {
1266    int i = tx + ii;
1267    int j = ty + jj;
1268    i = (i >= 0) ? i : 0;
1269    j = (j >= 0) ? j : 0;
1270    i = (i <= iwidth - 1) ? i : iwidth - 1;
1271    j = (j <= iheight - 1) ? j : iheight - 1;
1272
1273    float wx = interpolation_func_lanczos(2, (float)i - rx);
1274    float wy = interpolation_func_lanczos(2, (float)j - ry);
1275    float w = wx * wy;
1276
1277    sum += read_imagef(in, samplerc, (int2)(i, j)).x * w;
1278    weight += w;
1279  }
1280  pixel.x = sum/weight;
1281
1282
1283  rx = ppi[2] - (float)roi_in_x;
1284  ry = ppi[3] - (float)roi_in_y;
1285  rx = (rx >= 0) ? rx : 0;
1286  ry = (ry >= 0) ? ry : 0;
1287  rx = (rx <= iwidth - 1) ? rx : iwidth - 1;
1288  ry = (ry <= iheight - 1) ? ry : iheight - 1;
1289
1290  tx = rx;
1291  ty = ry;
1292
1293  sum2 = (float2)0.0f;
1294  weight = 0.0f;
1295  for(int jj = 1 - kwidth; jj <= kwidth; jj++)
1296    for(int ii= 1 - kwidth; ii <= kwidth; ii++)
1297  {
1298    int i = tx + ii;
1299    int j = ty + jj;
1300    i = (i >= 0) ? i : 0;
1301    j = (j >= 0) ? j : 0;
1302    i = (i <= iwidth - 1) ? i : iwidth - 1;
1303    j = (j <= iheight - 1) ? j : iheight - 1;
1304
1305    float wx = interpolation_func_lanczos(2, (float)i - rx);
1306    float wy = interpolation_func_lanczos(2, (float)j - ry);
1307    float w = wx * wy;
1308
1309    sum2 += read_imagef(in, samplerc, (int2)(i, j)).yw * w;
1310    weight += w;
1311  }
1312  pixel.yw = sum2/weight;
1313
1314
1315  rx = ppi[4] - (float)roi_in_x;
1316  ry = ppi[5] - (float)roi_in_y;
1317  rx = (rx >= 0) ? rx : 0;
1318  ry = (ry >= 0) ? ry : 0;
1319  rx = (rx <= iwidth - 1) ? rx : iwidth - 1;
1320  ry = (ry <= iheight - 1) ? ry : iheight - 1;
1321
1322  tx = rx;
1323  ty = ry;
1324
1325  sum = 0.0f;
1326  weight = 0.0f;
1327  for(int jj = 1 - kwidth; jj <= kwidth; jj++)
1328    for(int ii= 1 - kwidth; ii <= kwidth; ii++)
1329  {
1330    int i = tx + ii;
1331    int j = ty + jj;
1332    i = (i >= 0) ? i : 0;
1333    j = (j >= 0) ? j : 0;
1334    i = (i <= iwidth - 1) ? i : iwidth - 1;
1335    j = (j <= iheight - 1) ? j : iheight - 1;
1336
1337    float wx = interpolation_func_lanczos(2, (float)i - rx);
1338    float wy = interpolation_func_lanczos(2, (float)j - ry);
1339    float w = wx * wy;
1340
1341    sum += read_imagef(in, samplerc, (int2)(i, j)).z * w;
1342    weight += w;
1343  }
1344  pixel.z = sum/weight;
1345
1346  pixel = all(isfinite(pixel.xyz)) ? pixel : (float4)0.0f;
1347
1348  write_imagef (out, (int2)(x, y), pixel);
1349}
1350
1351
1352/* kernels for the lens plugin: lanczos3 interpolation */
1353kernel void
1354lens_distort_lanczos3 (read_only image2d_t in, write_only image2d_t out, const int width, const int height,
1355                      const int iwidth, const int iheight, const int roi_in_x, const int roi_in_y, global float *pi,
1356                      const int do_nan_checks)
1357{
1358  const int x = get_global_id(0);
1359  const int y = get_global_id(1);
1360
1361  const int kwidth = 3;
1362
1363  if(x >= width || y >= height) return;
1364
1365  float4 pixel = (float4)0.0f;
1366
1367  float rx, ry;
1368  int tx, ty;
1369  float sum, weight;
1370  float2 sum2;
1371  const int piwidth = 2*3*width;
1372  global float *ppi = pi + mad24(y, piwidth, 2*3*x);
1373
1374  if(do_nan_checks)
1375  {
1376    bool valid = true;
1377
1378    for(int i = 0; i < 6; i++) valid = valid && isfinite(ppi[i]);
1379
1380    if(!valid)
1381    {
1382      pixel = (float4)0.0f;
1383      write_imagef (out, (int2)(x, y), pixel);
1384      return;
1385    }
1386  }
1387
1388  rx = ppi[0] - (float)roi_in_x;
1389  ry = ppi[1] - (float)roi_in_y;
1390  rx = (rx >= 0) ? rx : 0;
1391  ry = (ry >= 0) ? ry : 0;
1392  rx = (rx <= iwidth - 1) ? rx : iwidth - 1;
1393  ry = (ry <= iheight - 1) ? ry : iheight - 1;
1394
1395  tx = rx;
1396  ty = ry;
1397
1398  sum = 0.0f;
1399  weight = 0.0f;
1400  for(int jj = 1 - kwidth; jj <= kwidth; jj++)
1401    for(int ii= 1 - kwidth; ii <= kwidth; ii++)
1402  {
1403    int i = tx + ii;
1404    int j = ty + jj;
1405    i = (i >= 0) ? i : 0;
1406    j = (j >= 0) ? j : 0;
1407    i = (i <= iwidth - 1) ? i : iwidth - 1;
1408    j = (j <= iheight - 1) ? j : iheight - 1;
1409
1410    float wx = interpolation_func_lanczos(3, (float)i - rx);
1411    float wy = interpolation_func_lanczos(3, (float)j - ry);
1412    float w = wx * wy;
1413
1414    sum += read_imagef(in, samplerc, (int2)(i, j)).x * w;
1415    weight += w;
1416  }
1417  pixel.x = sum/weight;
1418
1419
1420  rx = ppi[2] - (float)roi_in_x;
1421  ry = ppi[3] - (float)roi_in_y;
1422  rx = (rx >= 0) ? rx : 0;
1423  ry = (ry >= 0) ? ry : 0;
1424  rx = (rx <= iwidth - 1) ? rx : iwidth - 1;
1425  ry = (ry <= iheight - 1) ? ry : iheight - 1;
1426
1427  tx = rx;
1428  ty = ry;
1429
1430  sum2 = (float2)0.0f;
1431  weight = 0.0f;
1432  for(int jj = 1 - kwidth; jj <= kwidth; jj++)
1433    for(int ii= 1 - kwidth; ii <= kwidth; ii++)
1434  {
1435    int i = tx + ii;
1436    int j = ty + jj;
1437    i = (i >= 0) ? i : 0;
1438    j = (j >= 0) ? j : 0;
1439    i = (i <= iwidth - 1) ? i : iwidth - 1;
1440    j = (j <= iheight - 1) ? j : iheight - 1;
1441
1442    float wx = interpolation_func_lanczos(3, (float)i - rx);
1443    float wy = interpolation_func_lanczos(3, (float)j - ry);
1444    float w = wx * wy;
1445
1446    sum2 += read_imagef(in, samplerc, (int2)(i, j)).yw * w;
1447    weight += w;
1448  }
1449  pixel.yw = sum2/weight;
1450
1451
1452  rx = ppi[4] - (float)roi_in_x;
1453  ry = ppi[5] - (float)roi_in_y;
1454  rx = (rx >= 0) ? rx : 0;
1455  ry = (ry >= 0) ? ry : 0;
1456  rx = (rx <= iwidth - 1) ? rx : iwidth - 1;
1457  ry = (ry <= iheight - 1) ? ry : iheight - 1;
1458
1459  tx = rx;
1460  ty = ry;
1461
1462  sum = 0.0f;
1463  weight = 0.0f;
1464  for(int jj = 1 - kwidth; jj <= kwidth; jj++)
1465    for(int ii= 1 - kwidth; ii <= kwidth; ii++)
1466  {
1467    int i = tx + ii;
1468    int j = ty + jj;
1469    i = (i >= 0) ? i : 0;
1470    j = (j >= 0) ? j : 0;
1471    i = (i <= iwidth - 1) ? i : iwidth - 1;
1472    j = (j <= iheight - 1) ? j : iheight - 1;
1473
1474    float wx = interpolation_func_lanczos(3, (float)i - rx);
1475    float wy = interpolation_func_lanczos(3, (float)j - ry);
1476    float w = wx * wy;
1477
1478    sum += read_imagef(in, samplerc, (int2)(i, j)).z * w;
1479    weight += w;
1480  }
1481  pixel.z = sum/weight;
1482
1483  pixel = all(isfinite(pixel.xyz)) ? pixel : (float4)0.0f;
1484
1485  write_imagef (out, (int2)(x, y), pixel);
1486}
1487
1488
1489
1490/* kernel for the ashift module: bilinear interpolation */
1491kernel void
1492ashift_bilinear(read_only image2d_t in, write_only image2d_t out, const int width, const int height,
1493                const int iwidth, const int iheight, const int2 roi_in, const int2 roi_out,
1494                const float in_scale, const float out_scale, const float2 clip, global float *homograph)
1495{
1496  const int x = get_global_id(0);
1497  const int y = get_global_id(1);
1498
1499  if(x >= width || y >= height) return;
1500
1501  float pin[3], pout[3];
1502
1503  // convert output pixel coordinates to original image coordinates
1504  pout[0] = roi_out.x + x + clip.x;
1505  pout[1] = roi_out.y + y + clip.y;
1506  pout[0] /= out_scale;
1507  pout[1] /= out_scale;
1508  pout[2] = 1.0f;
1509
1510  // apply homograph
1511  for(int i = 0; i < 3; i++)
1512  {
1513    pin[i] = 0.0f;
1514    for(int j = 0; j < 3; j++) pin[i] += homograph[3 * i + j] * pout[j];
1515  }
1516
1517  // convert to input pixel coordinates
1518  pin[0] /= pin[2];
1519  pin[1] /= pin[2];
1520  pin[0] *= in_scale;
1521  pin[1] *= in_scale;
1522  pin[0] -= roi_in.x;
1523  pin[1] -= roi_in.y;
1524
1525  // get output values by interpolation from input image using fast hardware bilinear interpolation
1526  float rx = pin[0];
1527  float ry = pin[1];
1528  int tx = rx;
1529  int ty = ry;
1530
1531  float4 pixel = (tx >= 0 && ty >= 0 && tx < iwidth && ty < iheight) ? read_imagef(in, samplerf, (float2)(rx, ry)) : (float4)0.0f;
1532
1533  write_imagef (out, (int2)(x, y), pixel);
1534}
1535
1536/* kernel for the ashift module: bicubic interpolation */
1537kernel void
1538ashift_bicubic (read_only image2d_t in, write_only image2d_t out, const int width, const int height,
1539                const int iwidth, const int iheight, const int2 roi_in, const int2 roi_out,
1540                const float in_scale, const float out_scale, const float2 clip, global float *homograph)
1541{
1542  const int x = get_global_id(0);
1543  const int y = get_global_id(1);
1544
1545  const int kwidth = 2;
1546
1547  if(x >= width || y >= height) return;
1548
1549  float pin[3], pout[3];
1550
1551  // convert output pixel coordinates to original image coordinates
1552  pout[0] = roi_out.x + x + clip.x;
1553  pout[1] = roi_out.y + y + clip.y;
1554  pout[0] /= out_scale;
1555  pout[1] /= out_scale;
1556  pout[2] = 1.0f;
1557
1558  // apply homograph
1559  for(int i = 0; i < 3; i++)
1560  {
1561    pin[i] = 0.0f;
1562    for(int j = 0; j < 3; j++) pin[i] += homograph[3 * i + j] * pout[j];
1563  }
1564
1565  // convert to input pixel coordinates
1566  pin[0] /= pin[2];
1567  pin[1] /= pin[2];
1568  pin[0] *= in_scale;
1569  pin[1] *= in_scale;
1570  pin[0] -= roi_in.x;
1571  pin[1] -= roi_in.y;
1572
1573  // get output values by interpolation from input image
1574  float rx = pin[0];
1575  float ry = pin[1];
1576  int tx = rx;
1577  int ty = ry;
1578
1579  float4 pixel = (float4)0.0f;
1580  float weight = 0.0f;
1581  for(int jj = 1 - kwidth; jj <= kwidth; jj++)
1582    for(int ii= 1 - kwidth; ii <= kwidth; ii++)
1583  {
1584    const int i = tx + ii;
1585    const int j = ty + jj;
1586
1587    float wx = interpolation_func_bicubic((float)i - rx);
1588    float wy = interpolation_func_bicubic((float)j - ry);
1589    float w = wx * wy;
1590
1591    pixel += read_imagef(in, sampleri, (int2)(i, j)) * w;
1592    weight += w;
1593  }
1594
1595  pixel = (tx >= 0 && ty >= 0 && tx < iwidth && ty < iheight) ? pixel/weight : (float4)0.0f;
1596
1597  write_imagef (out, (int2)(x, y), pixel);
1598}
1599
1600
1601/* kernel for the ashift module: lanczos2 interpolation */
1602kernel void
1603ashift_lanczos2(read_only image2d_t in, write_only image2d_t out, const int width, const int height,
1604                const int iwidth, const int iheight, const int2 roi_in, const int2 roi_out,
1605                const float in_scale, const float out_scale, const float2 clip, global float *homograph)
1606{
1607  const int x = get_global_id(0);
1608  const int y = get_global_id(1);
1609
1610  const int kwidth = 2;
1611
1612  if(x >= width || y >= height) return;
1613
1614  float pin[3], pout[3];
1615
1616  // convert output pixel coordinates to original image coordinates
1617  pout[0] = roi_out.x + x + clip.x;
1618  pout[1] = roi_out.y + y + clip.y;
1619  pout[0] /= out_scale;
1620  pout[1] /= out_scale;
1621  pout[2] = 1.0f;
1622
1623  // apply homograph
1624  for(int i = 0; i < 3; i++)
1625  {
1626    pin[i] = 0.0f;
1627    for(int j = 0; j < 3; j++) pin[i] += homograph[3 * i + j] * pout[j];
1628  }
1629
1630  // convert to input pixel coordinates
1631  pin[0] /= pin[2];
1632  pin[1] /= pin[2];
1633  pin[0] *= in_scale;
1634  pin[1] *= in_scale;
1635  pin[0] -= roi_in.x;
1636  pin[1] -= roi_in.y;
1637
1638  // get output values by interpolation from input image
1639  float rx = pin[0];
1640  float ry = pin[1];
1641  int tx = rx;
1642  int ty = ry;
1643
1644  float4 pixel = (float4)0.0f;
1645  float weight = 0.0f;
1646  for(int jj = 1 - kwidth; jj <= kwidth; jj++)
1647    for(int ii= 1 - kwidth; ii <= kwidth; ii++)
1648  {
1649    const int i = tx + ii;
1650    const int j = ty + jj;
1651
1652    float wx = interpolation_func_lanczos(2, (float)i - rx);
1653    float wy = interpolation_func_lanczos(2, (float)j - ry);
1654    float w = wx * wy;
1655
1656    pixel += read_imagef(in, sampleri, (int2)(i, j)) * w;
1657    weight += w;
1658  }
1659
1660  pixel = (tx >= 0 && ty >= 0 && tx < iwidth && ty < iheight) ? pixel/weight : (float4)0.0f;
1661
1662  write_imagef (out, (int2)(x, y), pixel);
1663}
1664
1665
1666/* kernels for the ashift module: lanczos3 interpolation */
1667kernel void
1668ashift_lanczos3(read_only image2d_t in, write_only image2d_t out, const int width, const int height,
1669                const int iwidth, const int iheight, const int2 roi_in, const int2 roi_out,
1670                const float in_scale, const float out_scale, const float2 clip, global float *homograph)
1671{
1672  const int x = get_global_id(0);
1673  const int y = get_global_id(1);
1674
1675  const int kwidth = 3;
1676
1677  if(x >= width || y >= height) return;
1678
1679  float pin[3], pout[3];
1680
1681  // convert output pixel coordinates to original image coordinates
1682  pout[0] = roi_out.x + x + clip.x;
1683  pout[1] = roi_out.y + y + clip.y;
1684  pout[0] /= out_scale;
1685  pout[1] /= out_scale;
1686  pout[2] = 1.0f;
1687
1688  // apply homograph
1689  for(int i = 0; i < 3; i++)
1690  {
1691    pin[i] = 0.0f;
1692    for(int j = 0; j < 3; j++) pin[i] += homograph[3 * i + j] * pout[j];
1693  }
1694
1695  // convert to input pixel coordinates
1696  pin[0] /= pin[2];
1697  pin[1] /= pin[2];
1698  pin[0] *= in_scale;
1699  pin[1] *= in_scale;
1700  pin[0] -= roi_in.x;
1701  pin[1] -= roi_in.y;
1702
1703  // get output values by interpolation from input image
1704  float rx = pin[0];
1705  float ry = pin[1];
1706  int tx = rx;
1707  int ty = ry;
1708
1709  float4 pixel = (float4)0.0f;
1710  float weight = 0.0f;
1711  for(int jj = 1 - kwidth; jj <= kwidth; jj++)
1712    for(int ii= 1 - kwidth; ii <= kwidth; ii++)
1713  {
1714    const int i = tx + ii;
1715    const int j = ty + jj;
1716
1717    float wx = interpolation_func_lanczos(3, (float)i - rx);
1718    float wy = interpolation_func_lanczos(3, (float)j - ry);
1719    float w = wx * wy;
1720
1721    pixel += read_imagef(in, sampleri, (int2)(i, j)) * w;
1722    weight += w;
1723  }
1724
1725  pixel = (tx >= 0 && ty >= 0 && tx < iwidth && ty < iheight) ? pixel/weight : (float4)0.0f;
1726
1727  write_imagef (out, (int2)(x, y), pixel);
1728}
1729
1730
1731kernel void
1732lens_vignette (read_only image2d_t in, write_only image2d_t out, const int width, const int height, global float4 *pi)
1733{
1734  const int x = get_global_id(0);
1735  const int y = get_global_id(1);
1736
1737  if(x >= width || y >= height) return;
1738
1739  float4 pixel = read_imagef(in, sampleri, (int2)(x, y));
1740  float4 scale = pi[mad24(y, width, x)]/(float4)0.5f;
1741
1742  pixel.xyz *= scale.xyz;
1743
1744  write_imagef (out, (int2)(x, y), pixel);
1745}
1746
1747
1748
1749/* kernel for flip */
1750__kernel void
1751flip(read_only image2d_t in, write_only image2d_t out, const int width, const int height, const int orientation)
1752{
1753  const int x = get_global_id(0);
1754  const int y = get_global_id(1);
1755
1756  if(x >= width || y >= height) return;
1757
1758  float4 pixel = read_imagef(in, sampleri, (int2)(x, y));
1759
1760  // ORIENTATION_FLIP_X = 2
1761  int  nx = (orientation & 2) ? width - x - 1 : x;
1762
1763  // ORIENTATION_FLIP_Y = 1
1764  int ny = (orientation & 1) ? height - y - 1 : y;
1765
1766  // ORIENTATION_SWAP_XY = 4
1767  if((orientation & 4) == 4)
1768  {
1769     const int tmp = nx;
1770     nx = ny;
1771     ny = tmp;
1772   }
1773
1774  write_imagef (out, (int2)(nx, ny), pixel);
1775}
1776
1777
1778/* we use this exp approximation to maintain full identity with cpu path */
1779float
1780fast_expf(const float x)
1781{
1782  // meant for the range [-100.0f, 0.0f]. largest error ~ -0.06 at 0.0f.
1783  // will get _a_lot_ worse for x > 0.0f (9000 at 10.0f)..
1784  const int i1 = 0x3f800000u;
1785  // e^x, the comment would be 2^x
1786  const int i2 = 0x402DF854u;//0x40000000u;
1787  // const int k = CLAMPS(i1 + x * (i2 - i1), 0x0u, 0x7fffffffu);
1788  // without max clamping (doesn't work for large x, but is faster):
1789  const int k0 = i1 + x * (i2 - i1);
1790  union {
1791      float f;
1792      int k;
1793  } u;
1794  u.k = k0 > 0 ? k0 : 0;
1795  return u.f;
1796}
1797
1798
1799float
1800envelope(const float L)
1801{
1802  const float x = clamp(L/100.0f, 0.0f, 1.0f);
1803  // const float alpha = 2.0f;
1804  const float beta = 0.6f;
1805  if(x < beta)
1806  {
1807    // return 1.0f-fabsf(x/beta-1.0f)^2
1808    const float tmp = fabs(x/beta-1.0f);
1809    return 1.0f-tmp*tmp;
1810  }
1811  else
1812  {
1813    const float tmp1 = (1.0f-x)/(1.0f-beta);
1814    const float tmp2 = tmp1*tmp1;
1815    const float tmp3 = tmp2*tmp1;
1816    return 3.0f*tmp2 - 2.0f*tmp3;
1817  }
1818}
1819
1820/* kernel for monochrome */
1821kernel void
1822monochrome_filter(
1823    read_only image2d_t in,
1824    write_only image2d_t out,
1825    const int width,
1826    const int height,
1827    const float a,
1828    const float b,
1829    const float size)
1830{
1831  const int x = get_global_id(0);
1832  const int y = get_global_id(1);
1833
1834  if(x >= width || y >= height) return;
1835
1836  float4 pixel = read_imagef (in,   sampleri, (int2)(x, y));
1837  // TODO: this could be a native_expf, or exp2f, need to evaluate comparisons with cpu though:
1838  pixel.x = 100.0f*fast_expf(-clamp(((pixel.y - a)*(pixel.y - a) + (pixel.z - b)*(pixel.z - b))/(2.0f * size), 0.0f, 1.0f));
1839  write_imagef (out, (int2)(x, y), pixel);
1840}
1841
1842kernel void
1843monochrome(
1844    read_only image2d_t in,
1845    read_only image2d_t base,
1846    write_only image2d_t out,
1847    const int width,
1848    const int height,
1849    const float a,
1850    const float b,
1851    const float size,
1852    float highlights)
1853{
1854  const int x = get_global_id(0);
1855  const int y = get_global_id(1);
1856
1857  if(x >= width || y >= height) return;
1858
1859  float4 pixel = read_imagef (in,   sampleri, (int2)(x, y));
1860  float4 basep = read_imagef (base, sampleri, (int2)(x, y));
1861  float filter  = fast_expf(-clamp(((pixel.y - a)*(pixel.y - a) + (pixel.z - b)*(pixel.z - b))/(2.0f * size), 0.0f, 1.0f));
1862  float tt = envelope(pixel.x);
1863  float t  = tt + (1.0f-tt)*(1.0f-highlights);
1864  pixel.x = mix(pixel.x, pixel.x*basep.x/100.0f, t);
1865  pixel.y = pixel.z = 0.0f;
1866  write_imagef (out, (int2)(x, y), pixel);
1867}
1868
1869
1870
1871/* kernel for the plugin colorout, fast matrix + shaper path only */
1872kernel void
1873colorout (read_only image2d_t in, write_only image2d_t out, const int width, const int height,
1874          global float *mat, read_only image2d_t lutr, read_only image2d_t lutg, read_only image2d_t lutb,
1875          global const float (*const a)[3])
1876{
1877  const int x = get_global_id(0);
1878  const int y = get_global_id(1);
1879
1880  if(x >= width || y >= height) return;
1881
1882  float4 pixel = read_imagef(in, sampleri, (int2)(x, y));
1883  float XYZ[3], rgb[3];
1884  float4 xyz = Lab_to_XYZ(pixel);
1885  XYZ[0] = xyz.x;
1886  XYZ[1] = xyz.y;
1887  XYZ[2] = xyz.z;
1888  for(int i=0;i<3;i++)
1889  {
1890    rgb[i] = 0.0f;
1891    for(int j=0;j<3;j++) rgb[i] += mat[3*i+j]*XYZ[j];
1892  }
1893  pixel.x = lerp_lookup_unbounded0(lutr, rgb[0], a[0]);
1894  pixel.y = lerp_lookup_unbounded0(lutg, rgb[1], a[1]);
1895  pixel.z = lerp_lookup_unbounded0(lutb, rgb[2], a[2]);
1896  write_imagef (out, (int2)(x, y), pixel);
1897}
1898
1899
1900/* kernel for the levels plugin */
1901kernel void
1902levels (read_only image2d_t in, write_only image2d_t out, const int width, const int height,
1903        read_only image2d_t lut, const float in_low, const float in_high, const float in_inv_gamma)
1904{
1905  const int x = get_global_id(0);
1906  const int y = get_global_id(1);
1907
1908  if(x >= width || y >= height) return;
1909
1910  float4 pixel = read_imagef(in, sampleri, (int2)(x, y));
1911  const float L = pixel.x;
1912  const float L_in = pixel.x/100.0f;
1913
1914  if(L_in <= in_low)
1915  {
1916    pixel.x = 0.0f;
1917  }
1918  else if(L_in >= in_high)
1919  {
1920    float percentage = (L_in - in_low) / (in_high - in_low);
1921    pixel.x = 100.0f * pow(percentage, in_inv_gamma);
1922  }
1923  else
1924  {
1925    float percentage = (L_in - in_low) / (in_high - in_low);
1926    pixel.x = lookup(lut, percentage);
1927  }
1928
1929  if(L_in > 0.01f)
1930  {
1931    pixel.y *= pixel.x/L;
1932    pixel.z *= pixel.x/L;
1933  }
1934  else
1935  {
1936    pixel.y *= pixel.x;
1937    pixel.z *= pixel.x;
1938  }
1939
1940  write_imagef (out, (int2)(x, y), pixel);
1941}
1942
1943/* kernel for the colorzones plugin */
1944enum
1945{
1946  DT_IOP_COLORZONES_L = 0,
1947  DT_IOP_COLORZONES_C = 1,
1948  DT_IOP_COLORZONES_h = 2
1949};
1950
1951
1952kernel void
1953colorzones_v3 (read_only image2d_t in, write_only image2d_t out, const int width, const int height, const int channel,
1954            read_only image2d_t table_L, read_only image2d_t table_a, read_only image2d_t table_b)
1955{
1956  const int x = get_global_id(0);
1957  const int y = get_global_id(1);
1958
1959  if(x >= width || y >= height) return;
1960
1961  float4 pixel = read_imagef(in, sampleri, (int2)(x, y));
1962
1963  const float a = pixel.y;
1964  const float b = pixel.z;
1965  const float h = fmod(atan2(b, a) + 2.0f*M_PI_F, 2.0f*M_PI_F)/(2.0f*M_PI_F);
1966  const float C = sqrt(b*b + a*a);
1967
1968  float select = 0.0f;
1969  float blend = 0.0f;
1970
1971  switch(channel)
1972  {
1973    case DT_IOP_COLORZONES_L:
1974      select = fmin(1.0f, pixel.x/100.0f);
1975      break;
1976    case DT_IOP_COLORZONES_C:
1977      select = fmin(1.0f, C/128.0f);
1978      break;
1979    default:
1980    case DT_IOP_COLORZONES_h:
1981      select = h;
1982      blend = pow(1.0f - C/128.0f, 2.0f);
1983      break;
1984  }
1985
1986  const float Lm = (blend * 0.5f + (1.0f-blend)*lookup(table_L, select)) - 0.5f;
1987  const float hm = (blend * 0.5f + (1.0f-blend)*lookup(table_b, select)) - 0.5f;
1988  blend *= blend; // saturation isn't as prone to artifacts:
1989  // const float Cm = 2.0f* (blend*0.5f + (1.0f-blend)*lookup(d->lut[1], select));
1990  const float Cm = 2.0f * lookup(table_a, select);
1991  const float L = pixel.x * pow(2.0f, 4.0f*Lm);
1992
1993  pixel.x = L;
1994  pixel.y = cos(2.0f*M_PI_F*(h + hm)) * Cm * C;
1995  pixel.z = sin(2.0f*M_PI_F*(h + hm)) * Cm * C;
1996
1997  write_imagef (out, (int2)(x, y), pixel);
1998}
1999
2000kernel void
2001colorzones (read_only image2d_t in, write_only image2d_t out, const int width, const int height, const int channel,
2002            read_only image2d_t table_L, read_only image2d_t table_C, read_only image2d_t table_h)
2003{
2004  const int x = get_global_id(0);
2005  const int y = get_global_id(1);
2006
2007  if(x >= width || y >= height) return;
2008
2009  float4 pixel = read_imagef(in, sampleri, (int2)(x, y));
2010
2011  float4 LCh;
2012  const float normalize_C = 1.f / (128.0f * sqrt(2.f));
2013
2014  LCh = Lab_2_LCH(pixel);
2015
2016  float select = 0.0f;
2017  switch(channel)
2018  {
2019    case DT_IOP_COLORZONES_L:
2020      select = LCh.x * 0.01f;
2021      break;
2022    case DT_IOP_COLORZONES_C:
2023      select = LCh.y * normalize_C;
2024      break;
2025    case DT_IOP_COLORZONES_h:
2026    default:
2027      select = LCh.z;
2028      break;
2029  }
2030  select = clamp(select, 0.f, 1.f);
2031
2032  LCh.x *= native_powr(2.0f, 4.0f * (lookup(table_L, select) - .5f));
2033  LCh.y *= 2.f * lookup(table_C, select);
2034  LCh.z += lookup(table_h, select) - .5f;
2035
2036  pixel.xyz = LCH_2_Lab(LCh).xyz;
2037
2038  write_imagef (out, (int2)(x, y), pixel);
2039}
2040
2041
2042/* kernel for the zonesystem plugin */
2043kernel void
2044zonesystem (read_only image2d_t in, write_only image2d_t out, const int width, const int height, const int size,
2045            global float *zonemap_offset, global float *zonemap_scale)
2046{
2047  const int x = get_global_id(0);
2048  const int y = get_global_id(1);
2049
2050  if(x >= width || y >= height) return;
2051
2052  float4 pixel = read_imagef(in, sampleri, (int2)(x, y));
2053
2054  const float rzscale = (float)(size-1)/100.0f;
2055  const int rz = clamp((int)(pixel.x*rzscale), 0, size-2);
2056  const float zs = ((rz > 0) ? (zonemap_offset[rz]/pixel.x) : 0) + zonemap_scale[rz];
2057
2058  pixel.xyz *= zs;
2059
2060  write_imagef (out, (int2)(x, y), pixel);
2061}
2062
2063
2064
2065
2066/* kernel to fill an image with a color (for the borders plugin). */
2067kernel void
2068borders_fill (write_only image2d_t out, const int left, const int top, const int width, const int height, const float4 color)
2069{
2070  const int x = get_global_id(0);
2071  const int y = get_global_id(1);
2072
2073  if(x < left || y < top) return;
2074  if(x >= width + left || y >= height + top) return;
2075
2076  write_imagef (out, (int2)(x, y), color);
2077}
2078
2079
2080/* kernel for the overexposed plugin. */
2081typedef enum dt_clipping_preview_mode_t
2082{
2083  DT_CLIPPING_PREVIEW_GAMUT = 0,
2084  DT_CLIPPING_PREVIEW_ANYRGB = 1,
2085  DT_CLIPPING_PREVIEW_LUMINANCE = 2,
2086  DT_CLIPPING_PREVIEW_SATURATION = 3
2087} dt_clipping_preview_mode_t;
2088
2089kernel void
2090overexposed (read_only image2d_t in, write_only image2d_t out, read_only image2d_t tmp, const int width, const int height,
2091             const float lower, const float upper, const float4 lower_color, const float4 upper_color,
2092             constant dt_colorspaces_iccprofile_info_cl_t *profile_info,
2093            read_only image2d_t lut, const int use_work_profile, dt_clipping_preview_mode_t mode)
2094{
2095  const int x = get_global_id(0);
2096  const int y = get_global_id(1);
2097
2098  if(x >= width || y >= height) return;
2099
2100  float4 pixel = read_imagef(in, sampleri, (int2)(x, y));
2101  float4 pixel_tmp = read_imagef(tmp, sampleri, (int2)(x, y));
2102
2103  if(mode == DT_CLIPPING_PREVIEW_ANYRGB)
2104  {
2105    if(pixel_tmp.x >= upper || pixel_tmp.y >= upper || pixel_tmp.z >= upper)
2106      pixel.xyz = upper_color.xyz;
2107
2108    else if(pixel_tmp.x <= lower && pixel_tmp.y <= lower && pixel_tmp.z <= lower)
2109      pixel.xyz = lower_color.xyz;
2110
2111  }
2112  else if(mode == DT_CLIPPING_PREVIEW_GAMUT && use_work_profile)
2113  {
2114    const float luminance = get_rgb_matrix_luminance(pixel, profile_info, profile_info->matrix_in, lut);
2115
2116    if(luminance >= upper)
2117    {
2118      pixel.xyz = upper_color.xyz;
2119    }
2120    else if(luminance <= lower)
2121    {
2122      pixel.xyz = lower_color.xyz;
2123    }
2124    else
2125    {
2126      float4 saturation = { 0.f, 0.f, 0.f, 0.f};
2127      saturation = pixel_tmp - (float4)luminance;
2128      saturation = native_sqrt(saturation * saturation / ((float4)(luminance * luminance) + pixel_tmp * pixel_tmp));
2129
2130      if(saturation.x > upper || saturation.y > upper || saturation.z > upper ||
2131         pixel_tmp.x >= upper || pixel_tmp.y >= upper || pixel_tmp.z >= upper)
2132        pixel.xyz = upper_color.xyz;
2133
2134      else if(pixel_tmp.x <= lower && pixel_tmp.y <= lower && pixel_tmp.z <= lower)
2135        pixel.xyz = lower_color.xyz;
2136    }
2137  }
2138  else if(mode == DT_CLIPPING_PREVIEW_LUMINANCE && use_work_profile)
2139  {
2140    const float luminance = get_rgb_matrix_luminance(pixel, profile_info, profile_info->matrix_in, lut);
2141
2142    if(luminance >= upper)
2143      pixel.xyz = upper_color.xyz;
2144
2145    else if(luminance <= lower)
2146      pixel.xyz = lower_color.xyz;
2147  }
2148  else if(mode == DT_CLIPPING_PREVIEW_SATURATION && use_work_profile)
2149  {
2150    const float luminance = get_rgb_matrix_luminance(pixel, profile_info, profile_info->matrix_in, lut);
2151
2152    if(luminance < upper && luminance > lower)
2153    {
2154      float4 saturation = { 0.f, 0.f, 0.f, 0.f};
2155      saturation = pixel_tmp - (float4)luminance;
2156      saturation = native_sqrt(saturation * saturation / ((float4)(luminance * luminance) + pixel_tmp * pixel_tmp));
2157
2158      if(saturation.x > upper || saturation.y > upper || saturation.z > upper ||
2159         pixel_tmp.x >= upper || pixel_tmp.y >= upper || pixel_tmp.z >= upper)
2160        pixel.xyz = upper_color.xyz;
2161
2162      else if(pixel_tmp.x <= lower && pixel_tmp.y <= lower && pixel_tmp.z <= lower)
2163        pixel.xyz = lower_color.xyz;
2164    }
2165  }
2166
2167  write_imagef (out, (int2)(x, y), pixel);
2168}
2169
2170
2171/* kernel for the rawoverexposed plugin. */
2172kernel void
2173rawoverexposed_mark_cfa (
2174        read_only image2d_t in, write_only image2d_t out, global float *pi,
2175        const int width, const int height,
2176        read_only image2d_t raw, const int raw_width, const int raw_height,
2177        const unsigned int filters, global const unsigned char (*const xtrans)[6],
2178        global unsigned int *threshold, global float *colors)
2179{
2180  const int x = get_global_id(0);
2181  const int y = get_global_id(1);
2182
2183  if(x >= width || y >= height) return;
2184
2185  const int piwidth = 2*width;
2186  global float *ppi = pi + mad24(y, piwidth, 2*x);
2187
2188  const int raw_x = ppi[0];
2189  const int raw_y = ppi[1];
2190
2191  if(raw_x < 0 || raw_y < 0 || raw_x >= raw_width || raw_y >= raw_height) return;
2192
2193  const uint raw_pixel = read_imageui(raw, sampleri, (int2)(raw_x, raw_y)).x;
2194
2195  const int c = (filters == 9u) ? FCxtrans(raw_y, raw_x, xtrans) : FC(raw_y, raw_x, filters);
2196
2197  if(raw_pixel < threshold[c]) return;
2198
2199  float4 pixel = read_imagef(in, sampleri, (int2)(x, y));
2200
2201  global float *color = colors + mad24(4, c, 0);
2202
2203  // cfa color
2204  pixel.x = color[0];
2205  pixel.y = color[1];
2206  pixel.z = color[2];
2207
2208  write_imagef (out, (int2)(x, y), pixel);
2209}
2210
2211kernel void
2212rawoverexposed_mark_solid (
2213        read_only image2d_t in, write_only image2d_t out, global float *pi,
2214        const int width, const int height,
2215        read_only image2d_t raw, const int raw_width, const int raw_height,
2216        const unsigned int filters, global const unsigned char (*const xtrans)[6],
2217        global unsigned int *threshold, const float4 solid_color)
2218{
2219  const int x = get_global_id(0);
2220  const int y = get_global_id(1);
2221
2222  if(x >= width || y >= height) return;
2223
2224  const int piwidth = 2*width;
2225  global float *ppi = pi + mad24(y, piwidth, 2*x);
2226
2227  const int raw_x = ppi[0];
2228  const int raw_y = ppi[1];
2229
2230  if(raw_x < 0 || raw_y < 0 || raw_x >= raw_width || raw_y >= raw_height) return;
2231
2232  const uint raw_pixel = read_imageui(raw, sampleri, (int2)(raw_x, raw_y)).x;
2233
2234  const int c = (filters == 9u) ? FCxtrans(raw_y, raw_x, xtrans) : FC(raw_y, raw_x, filters);
2235
2236  if(raw_pixel < threshold[c]) return;
2237
2238  float4 pixel = read_imagef(in, sampleri, (int2)(x, y));
2239
2240  // solid color
2241  pixel.xyz = solid_color.xyz;
2242
2243  write_imagef (out, (int2)(x, y), pixel);
2244}
2245
2246kernel void
2247rawoverexposed_falsecolor (
2248        read_only image2d_t in, write_only image2d_t out, global float *pi,
2249        const int width, const int height,
2250        read_only image2d_t raw, const int raw_width, const int raw_height,
2251        const unsigned int filters, global const unsigned char (*const xtrans)[6],
2252        global unsigned int *threshold)
2253{
2254  const int x = get_global_id(0);
2255  const int y = get_global_id(1);
2256
2257  if(x >= width || y >= height) return;
2258
2259  const int piwidth = 2*width;
2260  global float *ppi = pi + mad24(y, piwidth, 2*x);
2261
2262  const int raw_x = ppi[0];
2263  const int raw_y = ppi[1];
2264
2265  if(raw_x < 0 || raw_y < 0 || raw_x >= raw_width || raw_y >= raw_height) return;
2266
2267  const uint raw_pixel = read_imageui(raw, sampleri, (int2)(raw_x, raw_y)).x;
2268
2269  const int c = (filters == 9u) ? FCxtrans(raw_y, raw_x, xtrans) : FC(raw_y, raw_x, filters);
2270
2271  if(raw_pixel < threshold[c]) return;
2272
2273  float4 pixel = read_imagef(in, sampleri, (int2)(x, y));
2274
2275  float p[4];
2276  vstore4(pixel, 0, p);
2277  // falsecolor
2278  p[c] = 0.0f;
2279  pixel = vload4(0, p);
2280
2281  write_imagef (out, (int2)(x, y), pixel);
2282}
2283
2284
2285/* kernel for the lowlight plugin. */
2286kernel void
2287lowlight (read_only image2d_t in, write_only image2d_t out, const int width, const int height,
2288          const float4 XYZ_sw, read_only image2d_t lut)
2289{
2290  const int x = get_global_id(0);
2291  const int y = get_global_id(1);
2292
2293  if(x >= width || y >= height) return;
2294
2295  const float c = 0.5f;
2296  const float threshold = 0.01f;
2297
2298  float V;
2299  float w;
2300
2301  float4 pixel = read_imagef(in, sampleri, (int2)(x, y));
2302
2303  float4 XYZ = Lab_to_XYZ(pixel);
2304
2305  // calculate scotopic luminance
2306  if (XYZ.x > threshold)
2307  {
2308    // normal flow
2309    V = XYZ.y * ( 1.33f * ( 1.0f + (XYZ.y+XYZ.z)/XYZ.x) - 1.68f );
2310  }
2311  else
2312  {
2313    // low red flow, avoids "snow" on dark noisy areas
2314    V = XYZ.y * ( 1.33f * ( 1.0f + (XYZ.y+XYZ.z)/threshold) - 1.68f );
2315  }
2316
2317  // scale using empiric coefficient and fit inside limits
2318  V = clamp(c*V, 0.0f, 1.0f);
2319
2320  // blending coefficient from curve
2321  w = lookup(lut, pixel.x/100.0f);
2322
2323  XYZ = w * XYZ + (1.0f - w) * V * XYZ_sw;
2324
2325  pixel = XYZ_to_Lab(XYZ);
2326
2327  write_imagef (out, (int2)(x, y), pixel);
2328}
2329
2330
2331/* kernel for the contrast lightness saturation module */
2332kernel void
2333colisa (read_only image2d_t in, write_only image2d_t out, unsigned int width, unsigned int height, const float saturation,
2334        read_only image2d_t ctable, constant float *ca, read_only image2d_t ltable, constant float *la)
2335{
2336  const unsigned int x = get_global_id(0);
2337  const unsigned int y = get_global_id(1);
2338
2339  if(x >= width || y >= height) return;
2340
2341  float4 i = read_imagef(in, sampleri, (int2)(x, y));
2342  float4 o;
2343
2344  o.x = lookup_unbounded(ctable, i.x/100.0f, ca);
2345  o.x = lookup_unbounded(ltable, o.x/100.0f, la);
2346  o.y = i.y*saturation;
2347  o.z = i.z*saturation;
2348  o.w = i.w;
2349
2350  write_imagef(out, (int2)(x, y), o);
2351}
2352
2353/* kernel for the unbreak input profile module - gamma version */
2354
2355kernel void
2356profilegamma (read_only image2d_t in, write_only image2d_t out, int width, int height,
2357        read_only image2d_t table, constant float *ta)
2358{
2359  const unsigned int x = get_global_id(0);
2360  const unsigned int y = get_global_id(1);
2361
2362  if(x >= width || y >= height) return;
2363
2364  float4 i = read_imagef(in, sampleri, (int2)(x, y));
2365  float4 o;
2366
2367  o.x = lookup_unbounded(table, i.x, ta);
2368  o.y = lookup_unbounded(table, i.y, ta);
2369  o.z = lookup_unbounded(table, i.z, ta);
2370  o.w = i.w;
2371
2372  write_imagef(out, (int2)(x, y), o);
2373}
2374
2375/* kernel for the unbreak input profile module - log version */
2376kernel void
2377profilegamma_log (read_only image2d_t in, write_only image2d_t out, int width, int height, const float dynamic_range, const float shadows_range, const float grey)
2378{
2379  const unsigned int x = get_global_id(0);
2380  const unsigned int y = get_global_id(1);
2381
2382  if(x >= width || y >= height) return;
2383
2384  float4 i = read_imagef(in, sampleri, (int2)(x, y));
2385  const float4 noise = pow((float4)2.0f, (float4)-16.0f);
2386  const float4 dynamic4 = dynamic_range;
2387  const float4 shadows4 = shadows_range;
2388  const float4 grey4 = grey;
2389
2390  float4 o;
2391
2392  o = (i < noise) ? noise : i / grey4;
2393  o = (log2(o) - shadows4) / dynamic4;
2394  o = (o < noise) ? noise : o;
2395  i.xyz = o.xyz;
2396
2397  write_imagef(out, (int2)(x, y), i);
2398}
2399
2400/* kernel for the interpolation resample helper */
2401kernel void
2402interpolation_resample (read_only image2d_t in, write_only image2d_t out, const int width, const int height,
2403                        const global int *hmeta, const global int *vmeta,
2404                        const global int *hlength, const global int *vlength,
2405                        const global int *hindex, const global int *vindex,
2406                        const global float *hkernel, const global float *vkernel,
2407                        const int htaps, const int vtaps,
2408                        local float *lkernel, local int *lindex,
2409                        local float4 *buffer)
2410{
2411  const int x = get_global_id(0);
2412  const int yi = get_global_id(1);
2413  const int ylsz = get_local_size(1);
2414  const int xlid = get_local_id(0);
2415  const int ylid = get_local_id(1);
2416  const int y = yi / vtaps;
2417  const int iy = yi % vtaps;
2418
2419  // Initialize resampling indices
2420  const int xm = min(x, width - 1);
2421  const int ym = min(y, height - 1);
2422  const int hlidx = hmeta[xm*3];   // H(orizontal) L(ength) I(n)d(e)x
2423  const int hkidx = hmeta[xm*3+1]; // H(orizontal) K(ernel) I(n)d(e)x
2424  const int hiidx = hmeta[xm*3+2]; // H(orizontal) I(ndex) I(n)d(e)x
2425  const int vlidx = vmeta[ym*3];   // V(ertical) L(ength) I(n)d(e)x
2426  const int vkidx = vmeta[ym*3+1]; // V(ertical) K(ernel) I(n)d(e)x
2427  const int viidx = vmeta[ym*3+2]; // V(ertical) I(ndex) I(n)d(e)x
2428
2429  const int hl = hlength[hlidx];   // H(orizontal) L(ength)
2430  const int vl = vlength[vlidx];   // V(ertical) L(ength)
2431
2432  // generate local copy of horizontal index field and kernel
2433  for(int n = 0; n <= htaps/ylsz; n++)
2434  {
2435    int k = mad24(n, ylsz, ylid);
2436    if(k >= hl) continue;
2437    lindex[k] = hindex[hiidx+k];
2438    lkernel[k] = hkernel[hkidx+k];
2439  }
2440
2441  barrier(CLK_LOCAL_MEM_FENCE);
2442
2443  // horizontal convolution kernel; store intermediate result in local buffer
2444  if(x < width && y < height)
2445  {
2446    const int yvalid = iy < vl;
2447
2448    const int yy = yvalid ? vindex[viidx+iy] : -1;
2449
2450    float4 vpixel = (float4)0.0f;
2451
2452    for (int ix = 0; ix < hl && yvalid; ix++)
2453    {
2454      const int xx = lindex[ix];
2455      float4 hpixel = read_imagef(in, sampleri,(int2)(xx, yy));
2456      vpixel += hpixel * lkernel[ix];
2457    }
2458
2459    buffer[ylid] = yvalid ? vpixel * vkernel[vkidx+iy] : (float4)0.0f;
2460  }
2461  else
2462    buffer[ylid] = (float4)0.0f;
2463
2464  barrier(CLK_LOCAL_MEM_FENCE);
2465
2466  // recursively reduce local buffer (vertical convolution kernel)
2467  for(int offset = vtaps / 2; offset > 0; offset >>= 1)
2468  {
2469    if (iy < offset)
2470    {
2471      buffer[ylid] += buffer[ylid + offset];
2472    }
2473    barrier(CLK_LOCAL_MEM_FENCE);
2474  }
2475
2476  // store final result
2477  if (iy == 0 && x < width && y < height)
2478  {
2479    // Clip negative RGB that may be produced by Lanczos undershooting
2480    // Negative RGB are invalid values no matter the RGB space (light is positive)
2481    write_imagef (out, (int2)(x, y), fmax(buffer[ylid], 0.f));
2482  }
2483}
2484