1 // Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
2 //
3 // Permission is hereby granted, free of charge, to any person
4 // obtaining a copy of this software and associated documentation
5 // files (the "Software"), to deal in the Software without
6 // restriction, including without limitation the rights to use,
7 // copy, modify, merge, publish, distribute, sublicense, and/or sell
8 // copies of the Software, and to permit persons to whom the
9 // Software is furnished to do so, subject to the following
10 // conditions:
11 //
12 // The above copyright notice and this permission notice shall be
13 // included in all copies or substantial portions of the Software.
14 //
15 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
17 // OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
19 // HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
20 // WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 // OTHER DEALINGS IN THE SOFTWARE.
23
24 #include <stdlib.h>
25 #include <stdio.h>
26 #include <string.h>
27 #include <math.h>
28
29 #include "CudaMath.h"
30
31 #define TW 16
32 #define TH 16
33
34 #define THREAD_COUNT (TW * TH)
35
36 #define MAX_KERNEL_WIDTH 32
37
38 #define KW 4
39
40
41
42 #if __DEVICE_EMULATION__
43 #define __debugsync() __syncthreads()
44 #else
45 #define __debugsync()
46 #endif
47
48 #define TN 256
49 #define WARP_COUNT (TN / 32)
50 #define HWARP_COUNT (TN / 16)
51
52 // Window size
53 #define WS 20
54
55
56
57 struct WrapClamp
58 {
operator ()WrapClamp59 int operator()(int i, int h)
60 {
61 i = min(max(i, 0), h-1);
62 }
63 };
64
65 struct WrapRepeat
66 {
operator ()WrapRepeat67 int operator()(int i, int h)
68 {
69 i = abs(i) % h; // :( Non power of two!
70 }
71 };
72
73 struct WrapMirror
74 {
operator ()WrapMirror75 int operator()(int i, int h)
76 {
77 i = abs(i);
78 while (i >= h) i = 2 * w - i - 2;
79 }
80 };
81
82
83 // Vertical convolution filter that processes vertical strips.
convolveStrip(float * d_channel,float * d_kernel,int width,int height)84 __global__ void convolveStrip(float * d_channel, float * d_kernel, int width, int height)
85 {
86 __shared__ float s_kernel[32 * WS];
87
88 // Preload kernel in shared memory.
89 for (int i = 0; i < 32 * WS / TN; i++)
90 {
91 int idx = i * TN + tid;
92 if (idx < 32 * WS) s_kernel[idx] = d_kernel[idx];
93 }
94
95 __shared__ float s_strip[32 * WS]; // TN/32
96
97 int wid = tid / 32 - WS/2;
98
99 Mirror wrap;
100 int row = wrap(wid);
101
102 // Preload image block.
103 for (int i = 0; i < 32 * WS / TN; i++)
104 {
105 }
106
107 // @@ Apply kernel to TN/32 rows.
108
109 // @@ Load
110
111
112 }
113
114
115
116
117
118
119 __constant__ float inputGamma, outputInverseGamma;
120 __constant__ float kernel[MAX_KERNEL_WIDTH];
121
122 // Use texture to access input?
123 // That's the most simple approach.
124
125 texture<> image;
126
127 ////////////////////////////////////////////////////////////////////////////////
128 // Combined convolution filter
129 ////////////////////////////////////////////////////////////////////////////////
130
convolve(float4 * output)131 __global__ void convolve(float4 * output)
132 {
133 // @@ Use morton order to assing threads.
134 int x = threadIdx.x;
135 int y = threadIdx.y;
136
137 float4 color = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
138
139 // texture coordinate.
140 int2 t;
141 t.x = 2 * (blockIdx.x * TW + x) - HW;
142 t.y = blockIdx.y * TH + y;
143
144 // @@ We might want to loop and process strips, to reuse the results of the horizontal convolutions.
145
146 // Horizontal convolution. @@ Unroll loops.
147 for (int e = HW; e > 0; e--)
148 {
149 t.x++;
150 float w = kernel[e-1];
151 color += w * tex2D(image, tc);
152 }
153
154 for (int e = 0; e < HW; e++)
155 {
156 t.x++;
157 float w = kernel[e];
158 color += w * tex2D(image, tc);
159 }
160
161 // Write color to shared memory.
162 __shared__ float tile[4 * THREAD_COUNT];
163
164 int tileIdx = y * TW + x;
165 tile[tileIdx + 0 * THREAD_COUNT] = color.x;
166 tile[tileIdx + 1 * THREAD_COUNT] = color.y;
167 tile[tileIdx + 2 * THREAD_COUNT] = color.z;
168 tile[tileIdx + 3 * THREAD_COUNT] = color.w;
169
170 __syncthreads();
171
172 // tile coordinate.
173 t.x = x;
174 t.y = y - HW;
175
176 // Vertical convolution. @@ Unroll loops.
177 for (int i = HW; i > 0; i--)
178 {
179 float w = kernel[i-1];
180
181 t.y++;
182 int idx = t.y * TW + t.x;
183
184 color.x += w * tile[idx + 0 * THREAD_COUNT];
185 color.y += w * tile[idx + 1 * THREAD_COUNT];
186 color.z += w * tile[idx + 2 * THREAD_COUNT];
187 color.w += w * tile[idx + 3 * THREAD_COUNT];
188 }
189
190 for (int i = 0; i < HW; i++)
191 {
192 float w = kernel[i];
193
194 t.y++;
195 int idx = t.y * TW + t.x;
196
197 color.x += w * tile[idx + 0 * THREAD_COUNT];
198 color.y += w * tile[idx + 1 * THREAD_COUNT];
199 color.z += w * tile[idx + 2 * THREAD_COUNT];
200 color.w += w * tile[idx + 3 * THREAD_COUNT];
201 }
202
203 it (x < w && y < h)
204 {
205 // @@ Prevent unaligned writes.
206
207 output[y * w + h] = color;
208 }
209 }
210
211
212 ////////////////////////////////////////////////////////////////////////////////
213 // Monophase X convolution filter
214 ////////////////////////////////////////////////////////////////////////////////
215
convolveY()216 __device__ void convolveY()
217 {
218
219 }
220
221
222 ////////////////////////////////////////////////////////////////////////////////
223 // Mipmap convolution filter
224 ////////////////////////////////////////////////////////////////////////////////
225
226
227
228 ////////////////////////////////////////////////////////////////////////////////
229 // Gamma correction
230 ////////////////////////////////////////////////////////////////////////////////
231
232 /*
233 __device__ float toLinear(float f, float gamma = 2.2f)
234 {
235 return __pow(f, gamma);
236 }
237
238 __device__ float toGamma(float f, float gamma = 2.2f)
239 {
240 return pow(f, 1.0f / gamma);
241 }
242 */
243
244
245
246
247 ////////////////////////////////////////////////////////////////////////////////
248 // Setup kernel
249 ////////////////////////////////////////////////////////////////////////////////
250
setupConvolveKernel(const float * k,int w)251 extern "C" void setupConvolveKernel(const float * k, int w)
252 {
253 w = min(w, MAX_KERNEL_WIDTH);
254 cudaMemcpyToSymbol(kernel, k, sizeof(float) * w, 0);
255 }
256
257
258 ////////////////////////////////////////////////////////////////////////////////
259 // Launch kernel
260 ////////////////////////////////////////////////////////////////////////////////
261
262
263
264
265