1 // Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
2 //
3 // Permission is hereby granted, free of charge, to any person
4 // obtaining a copy of this software and associated documentation
5 // files (the "Software"), to deal in the Software without
6 // restriction, including without limitation the rights to use,
7 // copy, modify, merge, publish, distribute, sublicense, and/or sell
8 // copies of the Software, and to permit persons to whom the
9 // Software is furnished to do so, subject to the following
10 // conditions:
11 //
12 // The above copyright notice and this permission notice shall be
13 // included in all copies or substantial portions of the Software.
14 //
15 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
17 // OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
19 // HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
20 // WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 // OTHER DEALINGS IN THE SOFTWARE.
23 
24 #include <stdlib.h>
25 #include <stdio.h>
26 #include <string.h>
27 #include <math.h>
28 
29 #include "CudaMath.h"
30 
31 #define TW 16
32 #define TH 16
33 
34 #define THREAD_COUNT 		(TW * TH)
35 
36 #define MAX_KERNEL_WIDTH	32
37 
38 #define KW 4
39 
40 
41 
42 #if __DEVICE_EMULATION__
43 #define __debugsync() __syncthreads()
44 #else
45 #define __debugsync()
46 #endif
47 
48 #define TN            256
49 #define WARP_COUNT    (TN / 32)
50 #define HWARP_COUNT   (TN / 16)
51 
52 // Window size
53 #define WS            20
54 
55 
56 
57 struct WrapClamp
58 {
operator ()WrapClamp59 	int operator()(int i, int h)
60 	{
61 		i = min(max(i, 0), h-1);
62 	}
63 };
64 
65 struct WrapRepeat
66 {
operator ()WrapRepeat67 	int operator()(int i, int h)
68 	{
69 		i = abs(i) % h;	// :( Non power of two!
70 	}
71 };
72 
73 struct WrapMirror
74 {
operator ()WrapMirror75 	int operator()(int i, int h)
76 	{
77 		i = abs(i);
78 		while (i >= h) i = 2 * w - i - 2;
79 	}
80 };
81 
82 
83 // Vertical convolution filter that processes vertical strips.
convolveStrip(float * d_channel,float * d_kernel,int width,int height)84 __global__ void convolveStrip(float * d_channel, float * d_kernel, int width, int height)
85 {
86 	__shared__ float s_kernel[32 * WS];
87 
88 	// Preload kernel in shared memory.
89 	for (int i = 0; i < 32 * WS / TN; i++)
90 	{
91 		int idx = i * TN + tid;
92 		if (idx < 32 * WS) s_kernel[idx] = d_kernel[idx];
93 	}
94 
95 	__shared__ float s_strip[32 * WS];	// TN/32
96 
97 	int wid = tid / 32 - WS/2;
98 
99 	Mirror wrap;
100 	int row = wrap(wid);
101 
102 	// Preload image block.
103 	for (int i = 0; i < 32 * WS / TN; i++)
104 	{
105 	}
106 
107 	// @@ Apply kernel to TN/32 rows.
108 
109 	// @@ Load
110 
111 
112 }
113 
114 
115 
116 
117 
118 
119 __constant__ float inputGamma, outputInverseGamma;
120 __constant__ float kernel[MAX_KERNEL_WIDTH];
121 
122 // Use texture to access input?
123 // That's the most simple approach.
124 
125 texture<> image;
126 
127 ////////////////////////////////////////////////////////////////////////////////
128 // Combined convolution filter
129 ////////////////////////////////////////////////////////////////////////////////
130 
convolve(float4 * output)131 __global__ void convolve(float4 * output)
132 {
133 	// @@ Use morton order to assing threads.
134 	int x = threadIdx.x;
135 	int y = threadIdx.y;
136 
137 	float4 color = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
138 
139 	// texture coordinate.
140 	int2 t;
141 	t.x = 2 * (blockIdx.x * TW + x) - HW;
142 	t.y = blockIdx.y * TH + y;
143 
144 	// @@ We might want to loop and process strips, to reuse the results of the horizontal convolutions.
145 
146 	// Horizontal convolution. @@ Unroll loops.
147 	for (int e = HW; e > 0; e--)
148 	{
149 		t.x++;
150 		float w = kernel[e-1];
151 		color += w * tex2D(image, tc);
152 	}
153 
154 	for (int e = 0; e < HW; e++)
155 	{
156 		t.x++;
157 		float w = kernel[e];
158 		color += w * tex2D(image, tc);
159 	}
160 
161 	// Write color to shared memory.
162 	__shared__ float tile[4 * THREAD_COUNT];
163 
164 	int tileIdx = y * TW + x;
165 	tile[tileIdx + 0 * THREAD_COUNT] = color.x;
166 	tile[tileIdx + 1 * THREAD_COUNT] = color.y;
167 	tile[tileIdx + 2 * THREAD_COUNT] = color.z;
168 	tile[tileIdx + 3 * THREAD_COUNT] = color.w;
169 
170 	__syncthreads();
171 
172 	// tile coordinate.
173 	t.x = x;
174 	t.y = y - HW;
175 
176 	// Vertical convolution. @@ Unroll loops.
177 	for (int i = HW; i > 0; i--)
178 	{
179 		float w = kernel[i-1];
180 
181 		t.y++;
182 		int idx = t.y * TW + t.x;
183 
184 		color.x += w * tile[idx + 0 * THREAD_COUNT];
185 		color.y += w * tile[idx + 1 * THREAD_COUNT];
186 		color.z += w * tile[idx + 2 * THREAD_COUNT];
187 		color.w += w * tile[idx + 3 * THREAD_COUNT];
188 	}
189 
190 	for (int i = 0; i < HW; i++)
191 	{
192 		float w = kernel[i];
193 
194 		t.y++;
195 		int idx = t.y * TW + t.x;
196 
197 		color.x += w * tile[idx + 0 * THREAD_COUNT];
198 		color.y += w * tile[idx + 1 * THREAD_COUNT];
199 		color.z += w * tile[idx + 2 * THREAD_COUNT];
200 		color.w += w * tile[idx + 3 * THREAD_COUNT];
201 	}
202 
203 	it (x < w && y < h)
204 	{
205 		// @@ Prevent unaligned writes.
206 
207 		output[y * w + h] = color;
208 	}
209 }
210 
211 
212 ////////////////////////////////////////////////////////////////////////////////
213 // Monophase X convolution filter
214 ////////////////////////////////////////////////////////////////////////////////
215 
convolveY()216 __device__ void convolveY()
217 {
218 
219 }
220 
221 
222 ////////////////////////////////////////////////////////////////////////////////
223 // Mipmap convolution filter
224 ////////////////////////////////////////////////////////////////////////////////
225 
226 
227 
228 ////////////////////////////////////////////////////////////////////////////////
229 // Gamma correction
230 ////////////////////////////////////////////////////////////////////////////////
231 
232 /*
233 __device__ float toLinear(float f, float gamma = 2.2f)
234 {
235 	return __pow(f, gamma);
236 }
237 
238 __device__ float toGamma(float f, float gamma = 2.2f)
239 {
240 	return pow(f, 1.0f / gamma);
241 }
242 */
243 
244 
245 
246 
247 ////////////////////////////////////////////////////////////////////////////////
248 // Setup kernel
249 ////////////////////////////////////////////////////////////////////////////////
250 
setupConvolveKernel(const float * k,int w)251 extern "C" void setupConvolveKernel(const float * k, int w)
252 {
253 	w = min(w, MAX_KERNEL_WIDTH);
254 	cudaMemcpyToSymbol(kernel, k, sizeof(float) * w, 0);
255 }
256 
257 
258 ////////////////////////////////////////////////////////////////////////////////
259 // Launch kernel
260 ////////////////////////////////////////////////////////////////////////////////
261 
262 
263 
264 
265