1/* 2 This file is part of darktable, 3 copyright (c) 2011 johannes hanika. 4 copyright (c) 2012--2013 Ulrich Pegelow. 5 6 darktable is free software: you can redistribute it and/or modify 7 it under the terms of the GNU General Public License as published by 8 the Free Software Foundation, either version 3 of the License, or 9 (at your option) any later version. 10 11 darktable is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 GNU General Public License for more details. 15 16 You should have received a copy of the GNU General Public License 17 along with darktable. If not, see <http://www.gnu.org/licenses/>. 18*/ 19 20#include "common.h" 21 22 23 24/* 25 To speed up processing we use an algorithm proposed by B. Goossens, H.Q. Luong, J. Aelterman, A. Pizurica, and W. Philips, 26 "A GPU-Accelerated Real-Time NLMeans Algorithm for Denoising Color Video Sequences", in Proc. ACIVS (2), 2010, pp.46-57. 27*/ 28 29float fast_mexp2f(const float x) 30{ 31 const float i1 = (float)0x3f800000u; // 2^0 32 const float i2 = (float)0x3f000000u; // 2^-1 33 const float k0 = i1 + x * (i2 - i1); 34 union { float f; unsigned int i; } k; 35 k.i = (k0 >= (float)0x800000u) ? k0 : 0; 36 return k.f; 37} 38 39float gh(const float f, const float sharpness) 40{ 41 // make sharpness bigger: less smoothing 42 return fast_mexp2f(f*sharpness); 43} 44 45float ddirac(const int2 q) 46{ 47 return ((q.x || q.y) ? 1.0f : 0.0f); 48} 49 50 51kernel void 52nlmeans_init(global float4* out, const int width, const int height) 53{ 54 const int x = get_global_id(0); 55 const int y = get_global_id(1); 56 const int gidx = mad24(y, width, x); 57 58 if(x >= width || y >= height) return; 59 60 out[gidx] = (float4)0.0f; 61} 62 63 64kernel void 65nlmeans_dist(read_only image2d_t in, global float *U4, const int width, const int height, 66 const int2 q, const float nL2, const float nC2) 67{ 68 const int x = get_global_id(0); 69 const int y = get_global_id(1); 70 const int gidx = mad24(y, width, x); 71 const float4 norm2 = (float4)(nL2, nC2, nC2, 1.0f); 72 73 if(x >= width || y >= height) return; 74 75 int xpq = x + q.x; 76 int ypq = y + q.y; 77 // Convert out of bounds indexes to 0 78 // Reminder: q.x and q.y can be negative 79 xpq *= (x+q.x < width && x+q.x >= 0) ? 1 : 0; 80 ypq *= (y+q.y < height && y+q.y >= 0) ? 1 : 0; 81 82 float4 p1 = read_imagef(in, sampleri, (int2)(x, y)); 83 float4 p2 = read_imagef(in, sampleri, (int2)(xpq, ypq)); 84 float4 tmp = (p1 - p2)*(p1 - p2)*norm2; 85 float dist = tmp.x + tmp.y + tmp.z; 86 87 // make dist equal to 0 in case xpq or ypq is out of bounds 88 dist *= (x+q.x < width && x+q.x >= 0 && y+q.y < height && y+q.y >= 0) ? 1.0f : 0.0f; 89 90 U4[gidx] = dist; 91} 92 93kernel void 94nlmeans_horiz(global float *U4_in, global float *U4_out, const int width, const int height, 95 const int2 q, const int P, local float *buffer) 96{ 97 const int lid = get_local_id(0); 98 const int lsz = get_local_size(0); 99 const int x = get_global_id(0); 100 const int y = get_global_id(1); 101 const int gidx = mad24(min(y, height-1), width, min(x, width-1)); 102 103 if(y < height) 104 { 105 /* fill center part of buffer */ 106 buffer[P + lid] = U4_in[gidx]; 107 108 /* left wing of buffer */ 109 for(int n=0; n <= P/lsz; n++) 110 { 111 const int l = mad24(n, lsz, lid + 1); 112 if(l > P) continue; 113 int xx = mad24((int)get_group_id(0), lsz, -l); 114 xx = max(xx, 0); 115 buffer[P - l] = U4_in[mad24(y, width, xx)]; 116 } 117 118 /* right wing of buffer */ 119 for(int n=0; n <= P/lsz; n++) 120 { 121 const int r = mad24(n, lsz, lsz - lid); 122 if(r > P) continue; 123 int xx = mad24((int)get_group_id(0), lsz, lsz - 1 + r); 124 xx = min(xx, width-1); 125 buffer[P + lsz - 1 + r] = U4_in[mad24(y, width, xx)]; 126 } 127 } 128 129 barrier(CLK_LOCAL_MEM_FENCE); 130 131 if(x >= width || y >= height) return; 132 133 buffer += lid + P; 134 135 float distacc = 0.0f; 136 for(int pi = -P; pi <= P; pi++) 137 { 138 distacc += buffer[pi]; 139 } 140 141 U4_out[gidx] = distacc; 142} 143 144 145kernel void 146nlmeans_vert(global float* U4_in, global float* U4_out, const int width, const int height, 147 const int2 q, const int P, const float sharpness, local float *buffer) 148{ 149 const int lid = get_local_id(1); 150 const int lsz = get_local_size(1); 151 const int x = get_global_id(0); 152 const int y = get_global_id(1); 153 const int gidx = mad24(min(y, height-1), width, min(x, width-1)); 154 155 if(x < width) 156 { 157 /* fill center part of buffer */ 158 buffer[P + lid] = U4_in[gidx]; 159 160 /* left wing of buffer */ 161 for(int n=0; n <= P/lsz; n++) 162 { 163 const int l = mad24(n, lsz, lid + 1); 164 if(l > P) continue; 165 int yy = mad24((int)get_group_id(1), lsz, -l); 166 yy = max(yy, 0); 167 buffer[P - l] = U4_in[mad24(yy, width, x)]; 168 } 169 170 /* right wing of buffer */ 171 for(int n=0; n <= P/lsz; n++) 172 { 173 const int r = mad24(n, lsz, lsz - lid); 174 if(r > P) continue; 175 int yy = mad24((int)get_group_id(1), lsz, lsz - 1 + r); 176 yy = min(yy, height-1); 177 buffer[P + lsz - 1 + r] = U4_in[mad24(yy, width, x)]; 178 } 179 } 180 181 barrier(CLK_LOCAL_MEM_FENCE); 182 183 if(x >= width || y >= height) return; 184 185 buffer += lid + P; 186 187 float distacc = 0.0f; 188 for(int pj = -P; pj <= P; pj++) 189 { 190 distacc += buffer[pj]; 191 } 192 193 distacc = gh(distacc, sharpness); 194 195 U4_out[gidx] = distacc; 196} 197 198 199 200kernel void 201nlmeans_accu(read_only image2d_t in, global float4* U2, global float* U4, 202 const int width, const int height, const int2 q) 203{ 204 const int x = get_global_id(0); 205 const int y = get_global_id(1); 206 const int gidx = mad24(y, width, x); 207 208 if(x >= width || y >= height) return; 209 210 // wpq and wmq are weights for the image read of 211 // indexes (int2)(x, y) + q and (int2)(x, y) - q) 212 // respectively 213 // we want wpq and wmq equal to 1 only if 214 // their associated index is in bounds 215 int wpq = 1; 216 int wmq = 1; 217 218 // handle bounds for x 219 // Reminder: q.x can be negative 220 wpq *= (x+q.x < width) ? 1 : 0; 221 wmq *= (x-q.x < width) ? 1 : 0; 222 wpq *= (x+q.x >= 0) ? 1 : 0; 223 wmq *= (x-q.x >= 0) ? 1 : 0; 224 225 // handle bounds for y 226 // Reminder: q.y can be negative 227 wpq *= (y+q.y >= 0) ? 1 : 0; 228 wmq *= (y-q.y >= 0) ? 1 : 0; 229 wpq *= (y+q.y < height) ? 1 : 0; 230 wmq *= (y-q.y < height) ? 1 : 0; 231 232 float4 u1_pq = wpq ? read_imagef(in, sampleri, (int2)(x, y) + q) : (float4)0.0f; 233 float4 u1_mq = wmq ? read_imagef(in, sampleri, (int2)(x, y) - q) : (float4)0.0f; 234 235 float u4 = U4[gidx]; 236 float u4_mq = U4[mad24(clamp(y-q.y, 0, height-1), width, clamp(x-q.x, 0, width-1))]; 237 238 float u4_mq_dd = u4_mq * ddirac(q); 239 240 float4 accu = (u4 * u1_pq) + (u4_mq_dd * u1_mq); 241 accu.w = (wpq * u4 + wmq * u4_mq_dd); 242 243 U2[gidx] += accu; 244} 245 246 247kernel void 248nlmeans_finish(read_only image2d_t in, global float4* U2, write_only image2d_t out, 249 const int width, const int height, const float4 weight) 250{ 251 const int x = get_global_id(0); 252 const int y = get_global_id(1); 253 const int gidx = mad24(y, width, x); 254 255 if(x >= width || y >= height) return; 256 257 float4 i = read_imagef(in, sampleri, (int2)(x, y)); 258 float4 u2 = U2[gidx]; 259 float u3 = u2.w; 260 261 float4 o = i * ((float4)1.0f - weight) + u2/u3 * weight; 262 o.w = i.w; 263 264 write_imagef(out, (int2)(x, y), o); 265} 266 267 268 269