1/*
2    This file is part of darktable,
3    copyright (c) 2011 johannes hanika.
4    copyright (c) 2012--2013 Ulrich Pegelow.
5
6    darktable is free software: you can redistribute it and/or modify
7    it under the terms of the GNU General Public License as published by
8    the Free Software Foundation, either version 3 of the License, or
9    (at your option) any later version.
10
11    darktable is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14    GNU General Public License for more details.
15
16    You should have received a copy of the GNU General Public License
17    along with darktable.  If not, see <http://www.gnu.org/licenses/>.
18*/
19
20#include "common.h"
21
22
23
24/*
25    To speed up processing we use an algorithm proposed by B. Goossens, H.Q. Luong, J. Aelterman, A. Pizurica,  and W. Philips,
26    "A GPU-Accelerated Real-Time NLMeans Algorithm for Denoising Color Video Sequences", in Proc. ACIVS (2), 2010, pp.46-57.
27*/
28
29float fast_mexp2f(const float x)
30{
31  const float i1 = (float)0x3f800000u; // 2^0
32  const float i2 = (float)0x3f000000u; // 2^-1
33  const float k0 = i1 + x * (i2 - i1);
34  union { float f; unsigned int i; } k;
35  k.i = (k0 >= (float)0x800000u) ? k0 : 0;
36  return k.f;
37}
38
39float gh(const float f, const float sharpness)
40{
41  // make sharpness bigger: less smoothing
42  return fast_mexp2f(f*sharpness);
43}
44
45float ddirac(const int2 q)
46{
47  return ((q.x || q.y) ? 1.0f : 0.0f);
48}
49
50
51kernel void
52nlmeans_init(global float4* out, const int width, const int height)
53{
54  const int x = get_global_id(0);
55  const int y = get_global_id(1);
56  const int gidx = mad24(y, width, x);
57
58  if(x >= width || y >= height) return;
59
60  out[gidx] = (float4)0.0f;
61}
62
63
64kernel void
65nlmeans_dist(read_only image2d_t in, global float *U4, const int width, const int height,
66             const int2 q, const float nL2, const float nC2)
67{
68  const int x = get_global_id(0);
69  const int y = get_global_id(1);
70  const int gidx = mad24(y, width, x);
71  const float4 norm2 = (float4)(nL2, nC2, nC2, 1.0f);
72
73  if(x >= width || y >= height) return;
74
75  int xpq = x + q.x;
76  int ypq = y + q.y;
77  // Convert out of bounds indexes to 0
78  // Reminder: q.x and q.y can be negative
79  xpq *= (x+q.x < width && x+q.x >= 0) ? 1 : 0;
80  ypq *= (y+q.y < height && y+q.y >= 0) ? 1 : 0;
81
82  float4 p1 = read_imagef(in, sampleri, (int2)(x, y));
83  float4 p2 = read_imagef(in, sampleri, (int2)(xpq, ypq));
84  float4 tmp = (p1 - p2)*(p1 - p2)*norm2;
85  float dist = tmp.x + tmp.y + tmp.z;
86
87  // make dist equal to 0 in case xpq or ypq is out of bounds
88  dist *= (x+q.x < width && x+q.x >= 0 && y+q.y < height && y+q.y >= 0) ? 1.0f : 0.0f;
89
90  U4[gidx] = dist;
91}
92
93kernel void
94nlmeans_horiz(global float *U4_in, global float *U4_out, const int width, const int height,
95              const int2 q, const int P, local float *buffer)
96{
97  const int lid = get_local_id(0);
98  const int lsz = get_local_size(0);
99  const int x = get_global_id(0);
100  const int y = get_global_id(1);
101  const int gidx = mad24(min(y, height-1), width, min(x, width-1));
102
103  if(y < height)
104  {
105    /* fill center part of buffer */
106    buffer[P + lid] = U4_in[gidx];
107
108    /* left wing of buffer */
109    for(int n=0; n <= P/lsz; n++)
110    {
111      const int l = mad24(n, lsz, lid + 1);
112      if(l > P) continue;
113      int xx = mad24((int)get_group_id(0), lsz, -l);
114      xx = max(xx, 0);
115      buffer[P - l] = U4_in[mad24(y, width, xx)];
116    }
117
118    /* right wing of buffer */
119    for(int n=0; n <= P/lsz; n++)
120    {
121      const int r = mad24(n, lsz, lsz - lid);
122      if(r > P) continue;
123      int xx = mad24((int)get_group_id(0), lsz, lsz - 1 + r);
124      xx = min(xx, width-1);
125      buffer[P + lsz - 1 + r] = U4_in[mad24(y, width, xx)];
126    }
127  }
128
129  barrier(CLK_LOCAL_MEM_FENCE);
130
131  if(x >= width || y >= height) return;
132
133  buffer += lid + P;
134
135  float distacc = 0.0f;
136  for(int pi = -P; pi <= P; pi++)
137  {
138    distacc += buffer[pi];
139  }
140
141  U4_out[gidx] = distacc;
142}
143
144
145kernel void
146nlmeans_vert(global float* U4_in, global float* U4_out, const int width, const int height,
147              const int2 q, const int P, const float sharpness, local float *buffer)
148{
149  const int lid = get_local_id(1);
150  const int lsz = get_local_size(1);
151  const int x = get_global_id(0);
152  const int y = get_global_id(1);
153  const int gidx = mad24(min(y, height-1), width, min(x, width-1));
154
155  if(x < width)
156  {
157    /* fill center part of buffer */
158    buffer[P + lid] = U4_in[gidx];
159
160    /* left wing of buffer */
161    for(int n=0; n <= P/lsz; n++)
162    {
163      const int l = mad24(n, lsz, lid + 1);
164      if(l > P) continue;
165      int yy = mad24((int)get_group_id(1), lsz, -l);
166      yy = max(yy, 0);
167      buffer[P - l] = U4_in[mad24(yy, width, x)];
168    }
169
170    /* right wing of buffer */
171    for(int n=0; n <= P/lsz; n++)
172    {
173      const int r = mad24(n, lsz, lsz - lid);
174      if(r > P) continue;
175      int yy = mad24((int)get_group_id(1), lsz, lsz - 1 + r);
176      yy = min(yy, height-1);
177      buffer[P + lsz - 1 + r] = U4_in[mad24(yy, width, x)];
178    }
179  }
180
181  barrier(CLK_LOCAL_MEM_FENCE);
182
183  if(x >= width || y >= height) return;
184
185  buffer += lid + P;
186
187  float distacc = 0.0f;
188  for(int pj = -P; pj <= P; pj++)
189  {
190    distacc += buffer[pj];
191  }
192
193  distacc = gh(distacc, sharpness);
194
195  U4_out[gidx] = distacc;
196}
197
198
199
200kernel void
201nlmeans_accu(read_only image2d_t in, global float4* U2, global float* U4,
202             const int width, const int height, const int2 q)
203{
204  const int x = get_global_id(0);
205  const int y = get_global_id(1);
206  const int gidx = mad24(y, width, x);
207
208  if(x >= width || y >= height) return;
209
210  // wpq and wmq are weights for the image read of
211  // indexes (int2)(x, y) + q and (int2)(x, y) - q)
212  // respectively
213  // we want wpq and wmq equal to 1 only if
214  // their associated index is in bounds
215  int wpq = 1;
216  int wmq = 1;
217
218  // handle bounds for x
219  // Reminder: q.x can be negative
220  wpq *= (x+q.x < width) ? 1 : 0;
221  wmq *= (x-q.x < width) ? 1 : 0;
222  wpq *= (x+q.x >= 0) ? 1 : 0;
223  wmq *= (x-q.x >= 0) ? 1 : 0;
224
225  // handle bounds for y
226  // Reminder: q.y can be negative
227  wpq *= (y+q.y >= 0) ? 1 : 0;
228  wmq *= (y-q.y >= 0) ? 1 : 0;
229  wpq *= (y+q.y < height) ? 1 : 0;
230  wmq *= (y-q.y < height) ? 1 : 0;
231
232  float4 u1_pq = wpq ? read_imagef(in, sampleri, (int2)(x, y) + q) : (float4)0.0f;
233  float4 u1_mq = wmq ? read_imagef(in, sampleri, (int2)(x, y) - q) : (float4)0.0f;
234
235  float  u4    = U4[gidx];
236  float  u4_mq = U4[mad24(clamp(y-q.y, 0, height-1), width, clamp(x-q.x, 0, width-1))];
237
238  float u4_mq_dd = u4_mq * ddirac(q);
239
240  float4 accu = (u4 * u1_pq) + (u4_mq_dd * u1_mq);
241  accu.w = (wpq * u4 + wmq * u4_mq_dd);
242
243  U2[gidx] += accu;
244}
245
246
247kernel void
248nlmeans_finish(read_only image2d_t in, global float4* U2, write_only image2d_t out,
249               const int width, const int height, const float4 weight)
250{
251  const int x = get_global_id(0);
252  const int y = get_global_id(1);
253  const int gidx = mad24(y, width, x);
254
255  if(x >= width || y >= height) return;
256
257  float4 i  = read_imagef(in, sampleri, (int2)(x, y));
258  float4 u2 = U2[gidx];
259  float  u3 = u2.w;
260
261  float4 o = i * ((float4)1.0f - weight) + u2/u3 * weight;
262  o.w = i.w;
263
264  write_imagef(out, (int2)(x, y), o);
265}
266
267
268
269