1 ////////////////////////////////////////////////////////////////////////////
2 //	File:		ProgramCU.cu
3 //	Author:		Changchang Wu
4 //	Description : implementation of ProgramCU and all CUDA kernels
5 //
6 //	Copyright (c) 2007 University of North Carolina at Chapel Hill
7 //	All Rights Reserved
8 //
9 //	Permission to use, copy, modify and distribute this software and its
10 //	documentation for educational, research and non-profit purposes, without
11 //	fee, and without a written agreement is hereby granted, provided that the
12 //	above copyright notice and the following paragraph appear in all copies.
13 //
14 //	The University of North Carolina at Chapel Hill make no representations
15 //	about the suitability of this software for any purpose. It is provided
16 //	'as is' without express or implied warranty.
17 //
18 //	Please send BUG REPORTS to ccwu@cs.unc.edu
19 //
20 ////////////////////////////////////////////////////////////////////////////
21 
22 #if defined(CUDA_SIFTGPU_ENABLED)
23 
24 #include "GL/glew.h"
25 #include "stdio.h"
26 
27 #include "CuTexImage.h"
28 #include "ProgramCU.h"
29 #include "GlobalUtil.h"
30 
31 //----------------------------------------------------------------
32 //Begin SiftGPU setting section.
33 //////////////////////////////////////////////////////////
34 #define IMUL(X,Y) __mul24(X,Y)
35 //#define FDIV(X,Y) ((X)/(Y))
36 #define FDIV(X,Y) __fdividef(X,Y)
37 
38 /////////////////////////////////////////////////////////
39 //filter kernel width range (don't change this)
40 #define KERNEL_MAX_WIDTH 33
41 #define KERNEL_MIN_WIDTH 5
42 
43 //////////////////////////////////////////////////////////
44 //horizontal filter block size (32, 64, 128, 256, 512)
45 #define FILTERH_TILE_WIDTH 128
46 //thread block for vertical filter. FILTERV_BLOCK_WIDTH can be (4, 8 or 16)
47 #define FILTERV_BLOCK_WIDTH 16
48 #define FILTERV_BLOCK_HEIGHT 32
49 //The corresponding image patch for a thread block
50 #define FILTERV_PIXEL_PER_THREAD 4
51 #define FILTERV_TILE_WIDTH FILTERV_BLOCK_WIDTH
52 #define FILTERV_TILE_HEIGHT (FILTERV_PIXEL_PER_THREAD * FILTERV_BLOCK_HEIGHT)
53 
54 
55 //////////////////////////////////////////////////////////
56 //thread block size for computing Difference of Gaussian
57 #define DOG_BLOCK_LOG_DIMX 7
58 #define DOG_BLOCK_LOG_DIMY 0
59 #define DOG_BLOCK_DIMX (1 << DOG_BLOCK_LOG_DIMX)
60 #define DOG_BLOCK_DIMY (1 << DOG_BLOCK_LOG_DIMY)
61 
62 //////////////////////////////////////////////////////////
63 //thread block size for keypoint detection
64 #define KEY_BLOCK_LOG_DIMX 3
65 #define KEY_BLOCK_LOG_DIMY 3
66 #define KEY_BLOCK_DIMX (1<<KEY_BLOCK_LOG_DIMX)
67 #define KEY_BLOCK_DIMY (1<<KEY_BLOCK_LOG_DIMY)
68 //#define KEY_OFFSET_ONE
69 //make KEY_BLOCK_LOG_DIMX 4 will make the write coalesced..
70 //but it seems uncoalesced writes don't affect the speed
71 
72 //////////////////////////////////////////////////////////
73 //thread block size for initializing list generation (64, 128, 256, 512 ...)
74 #define HIST_INIT_WIDTH 128
75 //thread block size for generating feature list (32, 64, 128, 256, 512, ...)
76 #define LISTGEN_BLOCK_DIM 128
77 
78 
79 /////////////////////////////////////////////////////////
80 //how many keypoint orientations to compute in a block
81 #define ORIENTATION_COMPUTE_PER_BLOCK 64
82 //how many keypoint descriptor to compute in a block (2, 4, 8, 16, 32)
83 #define DESCRIPTOR_COMPUTE_PER_BLOCK	4
84 #define DESCRIPTOR_COMPUTE_BLOCK_SIZE	(16 * DESCRIPTOR_COMPUTE_PER_BLOCK)
85 //how many keypoint descriptor to normalized in a block (32, ...)
86 #define DESCRIPTOR_NORMALIZ_PER_BLOCK	32
87 
88 
89 
90 ///////////////////////////////////////////
91 //Thread block size for visualization
92 //(This doesn't affect the speed of computation)
93 #define BLOCK_LOG_DIM 4
94 #define BLOCK_DIM (1 << BLOCK_LOG_DIM)
95 
96 //End SiftGPU setting section.
97 //----------------------------------------------------------------
98 
99 
100 __device__ __constant__ float d_kernel[KERNEL_MAX_WIDTH];
101 texture<float, 1, cudaReadModeElementType> texData;
102 texture<unsigned char, 1, cudaReadModeNormalizedFloat> texDataB;
103 texture<float2, 2, cudaReadModeElementType> texDataF2;
104 texture<float4, 1, cudaReadModeElementType> texDataF4;
105 texture<int4, 1, cudaReadModeElementType> texDataI4;
106 texture<int4, 1, cudaReadModeElementType> texDataList;
107 
108 //template<int i>	 __device__ float Conv(float *data)		{    return Conv<i-1>(data) + data[i]*d_kernel[i];}
109 //template<>		__device__ float Conv<0>(float *data)	{    return data[0] * d_kernel[0];					}
110 
111 
112 //////////////////////////////////////////////////////////////
FilterH(float * d_result,int width)113 template<int FW> __global__ void FilterH( float* d_result, int width)
114 {
115 
116 	const int HALF_WIDTH = FW >> 1;
117 	const int CACHE_WIDTH = FILTERH_TILE_WIDTH + FW -1;
118 	const int CACHE_COUNT = 2 + (CACHE_WIDTH - 2)/ FILTERH_TILE_WIDTH;
119 	__shared__ float data[CACHE_WIDTH];
120 	const int bcol = IMUL(blockIdx.x, FILTERH_TILE_WIDTH);
121 	const int col =  bcol + threadIdx.x;
122 	const int index_min = IMUL(blockIdx.y, width);
123 	const int index_max = index_min + width - 1;
124 	int src_index = index_min + bcol - HALF_WIDTH + threadIdx.x;
125 	int cache_index = threadIdx.x;
126 	float value = 0;
127 #pragma unroll
128 	for(int j = 0; j < CACHE_COUNT; ++j)
129 	{
130 		if(cache_index < CACHE_WIDTH)
131 		{
132 			int fetch_index = src_index < index_min? index_min : (src_index > index_max ? index_max : src_index);
133 			data[cache_index] = tex1Dfetch(texData,fetch_index);
134 			src_index += FILTERH_TILE_WIDTH;
135 			cache_index += FILTERH_TILE_WIDTH;
136 		}
137 	}
138 	__syncthreads();
139 	if(col >= width) return;
140 #pragma unroll
141 	for(int i = 0; i < FW; ++i)
142 	{
143 		value += (data[threadIdx.x + i]* d_kernel[i]);
144 	}
145 //	value = Conv<FW-1>(data + threadIdx.x);
146 	d_result[index_min + col] = value;
147 }
148 
149 
150 
151 ////////////////////////////////////////////////////////////////////
FilterV(float * d_result,int width,int height)152 template<int  FW>  __global__ void FilterV(float* d_result, int width, int height)
153 {
154 	const int HALF_WIDTH = FW >> 1;
155 	const int CACHE_WIDTH = FW + FILTERV_TILE_HEIGHT - 1;
156 	const int TEMP = CACHE_WIDTH & 0xf;
157 //add some extra space to avoid bank conflict
158 #if FILTERV_TILE_WIDTH == 16
159 	//make the stride 16 * n +/- 1
160 	const int EXTRA = (TEMP == 1 || TEMP == 0) ? 1 - TEMP : 15 - TEMP;
161 #elif FILTERV_TILE_WIDTH == 8
162 	//make the stride 16 * n +/- 2
163 	const int EXTRA = (TEMP == 2 || TEMP == 1 || TEMP == 0) ? 2 - TEMP : (TEMP == 15? 3 : 14 - TEMP);
164 #elif FILTERV_TILE_WIDTH == 4
165 	//make the stride 16 * n +/- 4
166 	const int EXTRA = (TEMP >=0 && TEMP <=4) ? 4 - TEMP : (TEMP > 12? 20 - TEMP : 12 - TEMP);
167 #else
168 #error
169 #endif
170 	const int CACHE_TRUE_WIDTH = CACHE_WIDTH + EXTRA;
171 	const int CACHE_COUNT = (CACHE_WIDTH + FILTERV_BLOCK_HEIGHT - 1) / FILTERV_BLOCK_HEIGHT;
172 	const int WRITE_COUNT = (FILTERV_TILE_HEIGHT + FILTERV_BLOCK_HEIGHT -1) / FILTERV_BLOCK_HEIGHT;
173 	__shared__ float data[CACHE_TRUE_WIDTH * FILTERV_TILE_WIDTH];
174 	const int row_block_first = IMUL(blockIdx.y, FILTERV_TILE_HEIGHT);
175 	const int col = IMUL(blockIdx.x, FILTERV_TILE_WIDTH) + threadIdx.x;
176 	const int row_first = row_block_first - HALF_WIDTH;
177 	const int data_index_max = IMUL(height - 1, width) + col;
178 	const int cache_col_start = threadIdx.y;
179 	const int cache_row_start = IMUL(threadIdx.x, CACHE_TRUE_WIDTH);
180 	int cache_index = cache_col_start + cache_row_start;
181 	int data_index = IMUL(row_first + cache_col_start, width) + col;
182 
183 	if(col < width)
184 	{
185 #pragma unroll
186 		for(int i = 0; i < CACHE_COUNT; ++i)
187 		{
188 			if(cache_col_start < CACHE_WIDTH - i * FILTERV_BLOCK_HEIGHT)
189 			{
190 				int fetch_index = data_index < col ? col : (data_index > data_index_max? data_index_max : data_index);
191 				data[cache_index + i * FILTERV_BLOCK_HEIGHT] = tex1Dfetch(texData,fetch_index);
192 				data_index += IMUL(FILTERV_BLOCK_HEIGHT, width);
193 			}
194 		}
195 	}
196 	__syncthreads();
197 
198 	if(col >= width) return;
199 
200 	int row = row_block_first + threadIdx.y;
201 	int index_start = cache_row_start + threadIdx.y;
202 #pragma unroll
203 	for(int i = 0; i < WRITE_COUNT;		++i,
204 			row += FILTERV_BLOCK_HEIGHT, index_start += FILTERV_BLOCK_HEIGHT)
205 	{
206 		if(row < height)
207 		{
208 			int index_dest = IMUL(row, width) + col;
209 			float value = 0;
210 #pragma unroll
211 			for(int i = 0; i < FW; ++i)
212 			{
213 				value += (data[index_start + i] * d_kernel[i]);
214 			}
215 			d_result[index_dest] = value;
216 		}
217 	}
218 }
219 
220 
UpsampleKernel(float * d_result,int width)221 template<int LOG_SCALE> __global__ void UpsampleKernel(float* d_result, int width)
222 {
223 	const int SCALE = (1 << LOG_SCALE), SCALE_MASK = (SCALE - 1);
224 	const float INV_SCALE = 1.0f / (float(SCALE));
225 	int col = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x;
226 	if(col >= width) return;
227 
228 	int row = blockIdx.y >> LOG_SCALE;
229 	int index = row * width + col;
230 	int dst_row = blockIdx.y;
231 	int dst_idx= (width * dst_row + col) * SCALE;
232 	int helper = blockIdx.y & SCALE_MASK;
233 	if (helper)
234 	{
235 		float v11 = tex1Dfetch(texData, index);
236 		float v12 = tex1Dfetch(texData, index + 1);
237 		index += width;
238 		float v21 = tex1Dfetch(texData, index);
239 		float v22 = tex1Dfetch(texData, index + 1);
240 		float w1 = INV_SCALE * helper, w2 = 1.0 - w1;
241 		float v1 = (v21 * w1  + w2 * v11);
242 		float v2 = (v22 * w1  + w2 * v12);
243 		d_result[dst_idx] = v1;
244 #pragma unroll
245 		for(int i = 1; i < SCALE; ++i)
246 		{
247 			const float r2 = i * INV_SCALE;
248 			const float r1 = 1.0f - r2;
249 			d_result[dst_idx +i] = v1 * r1 + v2 * r2;
250 		}
251 	}else
252 	{
253 		float v1 = tex1Dfetch(texData, index);
254 		float v2 = tex1Dfetch(texData, index + 1);
255 		d_result[dst_idx] = v1;
256 #pragma unroll
257 		for(int i = 1; i < SCALE; ++i)
258 		{
259 			const float r2 = i * INV_SCALE;
260 			const float r1 = 1.0f - r2;
261 			d_result[dst_idx +i] = v1 * r1 + v2 * r2;
262 		}
263 	}
264 
265 }
266 
267 ////////////////////////////////////////////////////////////////////////////////////////
SampleImageU(CuTexImage * dst,CuTexImage * src,int log_scale)268 void ProgramCU::SampleImageU(CuTexImage *dst, CuTexImage *src, int log_scale)
269 {
270 	int width = src->GetImgWidth(), height = src->GetImgHeight();
271 	src->BindTexture(texData);
272 	dim3 grid((width +  FILTERH_TILE_WIDTH - 1)/ FILTERH_TILE_WIDTH, height << log_scale);
273 	dim3 block(FILTERH_TILE_WIDTH);
274 	switch(log_scale)
275 	{
276 	case 1 : 	UpsampleKernel<1> <<< grid, block>>> ((float*) dst->_cuData, width);	break;
277 	case 2 : 	UpsampleKernel<2> <<< grid, block>>> ((float*) dst->_cuData, width);	break;
278 	case 3 : 	UpsampleKernel<3> <<< grid, block>>> ((float*) dst->_cuData, width);	break;
279 	default:	break;
280 	}
281 }
282 
DownsampleKernel(float * d_result,int src_width,int dst_width)283 template<int LOG_SCALE> __global__ void DownsampleKernel(float* d_result, int src_width, int dst_width)
284 {
285 	const int dst_col = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x;
286 	if(dst_col >= dst_width) return;
287 	const int src_col = min((dst_col << LOG_SCALE), (src_width - 1));
288 	const int dst_row = blockIdx.y;
289 	const int src_row = blockIdx.y << LOG_SCALE;
290 	const int src_idx = IMUL(src_row, src_width) + src_col;
291 	const int dst_idx = IMUL(dst_width, dst_row) + dst_col;
292 	d_result[dst_idx] = tex1Dfetch(texData, src_idx);
293 
294 }
295 
DownsampleKernel(float * d_result,int src_width,int dst_width,const int log_scale)296 __global__ void DownsampleKernel(float* d_result, int src_width, int dst_width, const int log_scale)
297 {
298 	const int dst_col = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x;
299 	if(dst_col >= dst_width) return;
300 	const int src_col = min((dst_col << log_scale), (src_width - 1));
301 	const int dst_row = blockIdx.y;
302 	const int src_row = blockIdx.y << log_scale;
303 	const int src_idx = IMUL(src_row, src_width) + src_col;
304 	const int dst_idx = IMUL(dst_width, dst_row) + dst_col;
305 	d_result[dst_idx] = tex1Dfetch(texData, src_idx);
306 
307 }
308 
SampleImageD(CuTexImage * dst,CuTexImage * src,int log_scale)309 void ProgramCU::SampleImageD(CuTexImage *dst, CuTexImage *src, int log_scale)
310 {
311 	int src_width = src->GetImgWidth(), dst_width = dst->GetImgWidth() ;
312 
313 	src->BindTexture(texData);
314 	dim3 grid((dst_width +  FILTERH_TILE_WIDTH - 1)/ FILTERH_TILE_WIDTH, dst->GetImgHeight());
315 	dim3 block(FILTERH_TILE_WIDTH);
316 	switch(log_scale)
317 	{
318 	case 1 : 	DownsampleKernel<1> <<< grid, block>>> ((float*) dst->_cuData, src_width, dst_width);	break;
319 	case 2 :	DownsampleKernel<2> <<< grid, block>>> ((float*) dst->_cuData, src_width, dst_width);	break;
320 	case 3 : 	DownsampleKernel<3> <<< grid, block>>> ((float*) dst->_cuData, src_width, dst_width);	break;
321 	default:	DownsampleKernel    <<< grid, block>>> ((float*) dst->_cuData, src_width, dst_width, log_scale);
322 	}
323 }
324 
ChannelReduce_Kernel(float * d_result)325 __global__ void ChannelReduce_Kernel(float* d_result)
326 {
327 	int index = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x;
328 	d_result[index] = tex1Dfetch(texData, index*4);
329 }
330 
ChannelReduce_Convert_Kernel(float * d_result)331 __global__ void ChannelReduce_Convert_Kernel(float* d_result)
332 {
333 	int index = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x;
334 	float4 rgba = tex1Dfetch(texDataF4, index);
335 	d_result[index] = 0.299f * rgba.x + 0.587f* rgba.y + 0.114f * rgba.z;
336 }
337 
ReduceToSingleChannel(CuTexImage * dst,CuTexImage * src,int convert_rgb)338 void ProgramCU::ReduceToSingleChannel(CuTexImage* dst, CuTexImage* src, int convert_rgb)
339 {
340 	int width = src->GetImgWidth(), height = dst->GetImgHeight() ;
341 
342 	dim3 grid((width * height +  FILTERH_TILE_WIDTH - 1)/ FILTERH_TILE_WIDTH);
343 	dim3 block(FILTERH_TILE_WIDTH);
344 	if(convert_rgb)
345 	{
346 		src->BindTexture(texDataF4);
347 		ChannelReduce_Convert_Kernel<<<grid, block>>>((float*)dst->_cuData);
348 	}else
349 	{
350 		src->BindTexture(texData);
351 		ChannelReduce_Kernel<<<grid, block>>>((float*)dst->_cuData);
352 	}
353 }
354 
ConvertByteToFloat_Kernel(float * d_result)355 __global__ void ConvertByteToFloat_Kernel(float* d_result)
356 {
357 	int index = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x;
358 	d_result[index] = tex1Dfetch(texDataB, index);
359 }
360 
ConvertByteToFloat(CuTexImage * src,CuTexImage * dst)361 void ProgramCU::ConvertByteToFloat(CuTexImage*src, CuTexImage* dst)
362 {
363 	int width = src->GetImgWidth(), height = dst->GetImgHeight() ;
364 	dim3 grid((width * height +  FILTERH_TILE_WIDTH - 1)/ FILTERH_TILE_WIDTH);
365 	dim3 block(FILTERH_TILE_WIDTH);
366 	src->BindTexture(texDataB);
367 	ConvertByteToFloat_Kernel<<<grid, block>>>((float*)dst->_cuData);
368 }
369 
CreateFilterKernel(float sigma,float * kernel,int & width)370 void ProgramCU::CreateFilterKernel(float sigma, float* kernel, int& width)
371 {
372 	int i, sz = int( ceil( GlobalUtil::_FilterWidthFactor * sigma -0.5) ) ;//
373 	width = 2*sz + 1;
374 
375 	if(width > KERNEL_MAX_WIDTH)
376 	{
377 		//filter size truncation
378 		sz = KERNEL_MAX_WIDTH >> 1;
379 		width =KERNEL_MAX_WIDTH;
380 	}else if(width < KERNEL_MIN_WIDTH)
381 	{
382 		sz = KERNEL_MIN_WIDTH >> 1;
383 		width =KERNEL_MIN_WIDTH;
384 	}
385 
386 	float   rv = 1.0f/(sigma*sigma), v, ksum =0;
387 
388 	// pre-compute filter
389 	for( i = -sz ; i <= sz ; ++i)
390 	{
391 		kernel[i+sz] =  v = exp(-0.5f * i * i *rv) ;
392 		ksum += v;
393 	}
394 
395 	//normalize the kernel
396 	rv = 1.0f/ksum;
397 	for(i = 0; i< width ;i++) kernel[i]*=rv;
398 }
399 
400 
FilterImage(CuTexImage * dst,CuTexImage * src,CuTexImage * buf)401 template<int FW> void ProgramCU::FilterImage(CuTexImage *dst, CuTexImage *src, CuTexImage* buf)
402 {
403 	int width = src->GetImgWidth(), height = src->GetImgHeight();
404 
405 	//horizontal filtering
406 	src->BindTexture(texData);
407 	dim3 gridh((width +  FILTERH_TILE_WIDTH - 1)/ FILTERH_TILE_WIDTH, height);
408 	dim3 blockh(FILTERH_TILE_WIDTH);
409 	FilterH<FW><<<gridh, blockh>>>((float*)buf->_cuData, width);
410 	CheckErrorCUDA("FilterH");
411 
412 	///vertical filtering
413 	buf->BindTexture(texData);
414 	dim3 gridv((width + FILTERV_TILE_WIDTH - 1)/ FILTERV_TILE_WIDTH,  (height + FILTERV_TILE_HEIGHT - 1)/FILTERV_TILE_HEIGHT);
415 	dim3 blockv(FILTERV_TILE_WIDTH, FILTERV_BLOCK_HEIGHT);
416 	FilterV<FW><<<gridv, blockv>>>((float*)dst->_cuData, width, height);
417 	CheckErrorCUDA("FilterV");
418 }
419 
420 //////////////////////////////////////////////////////////////////////
421 // tested on 2048x1500 image, the time on pyramid construction is
422 // OpenGL version : 18ms
423 // CUDA version: 28 ms
FilterImage(CuTexImage * dst,CuTexImage * src,CuTexImage * buf,float sigma)424 void ProgramCU::FilterImage(CuTexImage *dst, CuTexImage *src, CuTexImage* buf, float sigma)
425 {
426 	float filter_kernel[KERNEL_MAX_WIDTH]; int width;
427 	CreateFilterKernel(sigma, filter_kernel, width);
428 	cudaMemcpyToSymbol(d_kernel, filter_kernel, width * sizeof(float), 0, cudaMemcpyHostToDevice);
429 
430 	switch(width)
431 	{
432 		case 5:		FilterImage< 5>(dst, src, buf);	break;
433 		case 7:		FilterImage< 7>(dst, src, buf);	break;
434 		case 9:		FilterImage< 9>(dst, src, buf);	break;
435 		case 11:	FilterImage<11>(dst, src, buf);	break;
436 		case 13:	FilterImage<13>(dst, src, buf);	break;
437 		case 15:	FilterImage<15>(dst, src, buf);	break;
438 		case 17:	FilterImage<17>(dst, src, buf);	break;
439 		case 19:	FilterImage<19>(dst, src, buf);	break;
440 		case 21:	FilterImage<21>(dst, src, buf);	break;
441 		case 23:	FilterImage<23>(dst, src, buf);	break;
442 		case 25:	FilterImage<25>(dst, src, buf);	break;
443 		case 27:	FilterImage<27>(dst, src, buf);	break;
444 		case 29:	FilterImage<29>(dst, src, buf);	break;
445 		case 31:	FilterImage<31>(dst, src, buf);	break;
446 		case 33:	FilterImage<33>(dst, src, buf);	break;
447 		default:	break;
448 	}
449 
450 }
451 
452 
453 texture<float, 1, cudaReadModeElementType> texC;
454 texture<float, 1, cudaReadModeElementType> texP;
455 texture<float, 1, cudaReadModeElementType> texN;
456 
ComputeDOG_Kernel(float * d_dog,float2 * d_got,int width,int height)457 void __global__ ComputeDOG_Kernel(float* d_dog, float2* d_got, int width, int height)
458 {
459 	int row = (blockIdx.y << DOG_BLOCK_LOG_DIMY) + threadIdx.y;
460 	int col = (blockIdx.x << DOG_BLOCK_LOG_DIMX) + threadIdx.x;
461 	if(col < width && row < height)
462 	{
463 		int index = IMUL(row, width) + col;
464 		float vp = tex1Dfetch(texP, index);
465 		float v = tex1Dfetch(texC, index);
466 		d_dog[index] = v - vp;
467 		float vxn = tex1Dfetch(texC, index + 1);
468 		float vxp = tex1Dfetch(texC, index - 1);
469 		float vyp = tex1Dfetch(texC, index - width);
470 		float vyn = tex1Dfetch(texC, index + width);
471 		float dx = vxn - vxp, dy = vyn - vyp;
472 		float grd = 0.5f * sqrt(dx * dx  + dy * dy);
473 		float rot = (grd == 0.0f? 0.0f : atan2(dy, dx));
474 		d_got[index] = make_float2(grd, rot);
475 	}
476 }
477 
ComputeDOG_Kernel(float * d_dog,int width,int height)478 void __global__ ComputeDOG_Kernel(float* d_dog, int width, int height)
479 {
480 	int row = (blockIdx.y << DOG_BLOCK_LOG_DIMY) + threadIdx.y;
481 	int col = (blockIdx.x << DOG_BLOCK_LOG_DIMX) + threadIdx.x;
482 	if(col < width && row < height)
483 	{
484 		int index = IMUL(row, width) + col;
485 		float vp = tex1Dfetch(texP, index);
486 		float v = tex1Dfetch(texC, index);
487 		d_dog[index] = v - vp;
488 	}
489 }
490 
ComputeDOG(CuTexImage * gus,CuTexImage * dog,CuTexImage * got)491 void ProgramCU::ComputeDOG(CuTexImage* gus, CuTexImage* dog, CuTexImage* got)
492 {
493 	int width = gus->GetImgWidth(), height = gus->GetImgHeight();
494 	dim3 grid((width + DOG_BLOCK_DIMX - 1)/ DOG_BLOCK_DIMX,  (height + DOG_BLOCK_DIMY - 1)/DOG_BLOCK_DIMY);
495 	dim3 block(DOG_BLOCK_DIMX, DOG_BLOCK_DIMY);
496 	gus->BindTexture(texC);
497 	(gus -1)->BindTexture(texP);
498 	if(got->_cuData)
499 		ComputeDOG_Kernel<<<grid, block>>>((float*) dog->_cuData, (float2*) got->_cuData, width, height);
500 	else
501 		ComputeDOG_Kernel<<<grid, block>>>((float*) dog->_cuData, width, height);
502 }
503 
504 
505 #define READ_CMP_DOG_DATA(datai, tex, idx) \
506 		datai[0] = tex1Dfetch(tex, idx - 1);\
507 		datai[1] = tex1Dfetch(tex, idx);\
508 		datai[2] = tex1Dfetch(tex, idx + 1);\
509 		if(v > nmax)\
510 		{\
511 			   nmax = max(nmax, datai[0]);\
512 			   nmax = max(nmax, datai[1]);\
513 			   nmax = max(nmax, datai[2]);\
514 			   if(v < nmax) goto key_finish;\
515 		}else\
516 		{\
517 			   nmin = min(nmin, datai[0]);\
518 			   nmin = min(nmin, datai[1]);\
519 			   nmin = min(nmin, datai[2]);\
520 			   if(v > nmin) goto key_finish;\
521 		}
522 
523 
ComputeKEY_Kernel(float4 * d_key,int width,int colmax,int rowmax,float dog_threshold0,float dog_threshold,float edge_threshold,int subpixel_localization)524 void __global__ ComputeKEY_Kernel(float4* d_key, int width, int colmax, int rowmax,
525 					float dog_threshold0,  float dog_threshold, float edge_threshold, int subpixel_localization)
526 {
527        float data[3][3], v;
528        float datap[3][3], datan[3][3];
529 #ifdef KEY_OFFSET_ONE
530        int row = (blockIdx.y << KEY_BLOCK_LOG_DIMY) + threadIdx.y + 1;
531        int col = (blockIdx.x << KEY_BLOCK_LOG_DIMX) + threadIdx.x + 1;
532 #else
533        int row = (blockIdx.y << KEY_BLOCK_LOG_DIMY) + threadIdx.y;
534        int col = (blockIdx.x << KEY_BLOCK_LOG_DIMX) + threadIdx.x;
535 #endif
536        int index = IMUL(row, width) + col;
537 	   int idx[3] ={index - width, index, index + width};
538        int in_image =0;
539        float nmax, nmin, result = 0.0f;
540 	   float dx = 0, dy = 0, ds = 0;
541 	   bool offset_test_passed = true;
542 #ifdef KEY_OFFSET_ONE
543        if(row < rowmax && col < colmax)
544 #else
545        if(row > 0 && col > 0 && row < rowmax && col < colmax)
546 #endif
547        {
548 			in_image = 1;
549 			data[1][1] = v = tex1Dfetch(texC, idx[1]);
550 			if(fabs(v) <= dog_threshold0) goto key_finish;
551 
552 			data[1][0] = tex1Dfetch(texC, idx[1] - 1);
553 			data[1][2] = tex1Dfetch(texC, idx[1] + 1);
554 			nmax = max(data[1][0], data[1][2]);
555 			nmin = min(data[1][0], data[1][2]);
556 
557 			if(v <=nmax && v >= nmin) goto key_finish;
558 			//if((v > nmax && v < 0 )|| (v < nmin && v > 0)) goto key_finish;
559 			READ_CMP_DOG_DATA(data[0], texC, idx[0]);
560 			READ_CMP_DOG_DATA(data[2], texC, idx[2]);
561 
562 			//edge supression
563 			float vx2 = v * 2.0f;
564 			float fxx = data[1][0] + data[1][2] - vx2;
565 			float fyy = data[0][1] + data[2][1] - vx2;
566 			float fxy = 0.25f * (data[2][2] + data[0][0] - data[2][0] - data[0][2]);
567 			float temp1 = fxx * fyy - fxy * fxy;
568 			float temp2 = (fxx + fyy) * (fxx + fyy);
569 			if(temp1 <=0 || temp2 > edge_threshold * temp1) goto key_finish;
570 
571 
572 			//read the previous level
573 			READ_CMP_DOG_DATA(datap[0], texP, idx[0]);
574 			READ_CMP_DOG_DATA(datap[1], texP, idx[1]);
575 			READ_CMP_DOG_DATA(datap[2], texP, idx[2]);
576 
577 
578 			//read the next level
579 			READ_CMP_DOG_DATA(datan[0], texN, idx[0]);
580 			READ_CMP_DOG_DATA(datan[1], texN, idx[1]);
581 			READ_CMP_DOG_DATA(datan[2], texN, idx[2]);
582 
583 			if(subpixel_localization)
584 			{
585 				//subpixel localization
586 				float fx = 0.5f * (data[1][2] - data[1][0]);
587 				float fy = 0.5f * (data[2][1] - data[0][1]);
588 				float fs = 0.5f * (datan[1][1] - datap[1][1]);
589 
590 				float fss = (datan[1][1] + datap[1][1] - vx2);
591 				float fxs = 0.25f* (datan[1][2] + datap[1][0] - datan[1][0] - datap[1][2]);
592 				float fys = 0.25f* (datan[2][1] + datap[0][1] - datan[0][1] - datap[2][1]);
593 
594 				//need to solve dx, dy, ds;
595 				// |-fx|     | fxx fxy fxs |   |dx|
596 				// |-fy|  =  | fxy fyy fys | * |dy|
597 				// |-fs|     | fxs fys fss |   |ds|
598 				float4 A0 = fxx > 0? make_float4(fxx, fxy, fxs, -fx) : make_float4(-fxx, -fxy, -fxs, fx);
599 				float4 A1 = fxy > 0? make_float4(fxy, fyy, fys, -fy) : make_float4(-fxy, -fyy, -fys, fy);
600 				float4 A2 = fxs > 0? make_float4(fxs, fys, fss, -fs) : make_float4(-fxs, -fys, -fss, fs);
601 				float maxa = max(max(A0.x, A1.x), A2.x);
602 				if(maxa >= 1e-10)
603 				{
604 					if(maxa == A1.x)
605 					{
606 						float4 TEMP = A1; A1 = A0; A0 = TEMP;
607 					}else if(maxa == A2.x)
608 					{
609 						float4 TEMP = A2; A2 = A0; A0 = TEMP;
610 					}
611 					A0.y /= A0.x;	A0.z /= A0.x;	A0.w/= A0.x;
612 					A1.y -= A1.x * A0.y;	A1.z -= A1.x * A0.z;	A1.w -= A1.x * A0.w;
613 					A2.y -= A2.x * A0.y;	A2.z -= A2.x * A0.z;	A2.w -= A2.x * A0.w;
614 					if(abs(A2.y) > abs(A1.y))
615 					{
616 						float4 TEMP = A2;	A2 = A1; A1 = TEMP;
617 					}
618 					if(abs(A1.y) >= 1e-10)
619 					{
620 						A1.z /= A1.y;	A1.w /= A1.y;
621 						A2.z -= A2.y * A1.z;	A2.w -= A2.y * A1.w;
622 						if(abs(A2.z) >= 1e-10)
623 						{
624 							ds = A2.w / A2.z;
625 							dy = A1.w - ds * A1.z;
626 							dx = A0.w - ds * A0.z - dy * A0.y;
627 
628 							offset_test_passed =
629 								fabs(data[1][1] + 0.5f * (dx * fx + dy * fy + ds * fs)) > dog_threshold
630 								&&fabs(ds) < 1.0f && fabs(dx) < 1.0f && fabs(dy) < 1.0f;
631 						}
632 					}
633 				}
634 			}
635 			if(offset_test_passed) result = v > nmax ? 1.0 : -1.0;
636        }
637 key_finish:
638        if(in_image) d_key[index] = make_float4(result, dx, dy, ds);
639 }
640 
641 
ComputeKEY(CuTexImage * dog,CuTexImage * key,float Tdog,float Tedge)642 void ProgramCU::ComputeKEY(CuTexImage* dog, CuTexImage* key, float Tdog, float Tedge)
643 {
644 	int width = dog->GetImgWidth(), height = dog->GetImgHeight();
645 	float Tdog1 = (GlobalUtil::_SubpixelLocalization? 0.8f : 1.0f) * Tdog;
646 	CuTexImage* dogp = dog - 1;
647 	CuTexImage* dogn = dog + 1;
648 #ifdef KEY_OFFSET_ONE
649 	dim3 grid((width - 1 + KEY_BLOCK_DIMX - 1)/ KEY_BLOCK_DIMX,  (height - 1 + KEY_BLOCK_DIMY - 1)/KEY_BLOCK_DIMY);
650 #else
651 	dim3 grid((width + KEY_BLOCK_DIMX - 1)/ KEY_BLOCK_DIMX,  (height + KEY_BLOCK_DIMY - 1)/KEY_BLOCK_DIMY);
652 #endif
653 	dim3 block(KEY_BLOCK_DIMX, KEY_BLOCK_DIMY);
654 	dogp->BindTexture(texP);
655 	dog ->BindTexture(texC);
656 	dogn->BindTexture(texN);
657 	Tedge = (Tedge+1)*(Tedge+1)/Tedge;
658 	ComputeKEY_Kernel<<<grid, block>>>((float4*) key->_cuData, width,
659         width -1, height -1, Tdog1, Tdog, Tedge, GlobalUtil::_SubpixelLocalization);
660 
661 }
662 
663 
664 
InitHist_Kernel(int4 * hist,int ws,int wd,int height)665 void __global__ InitHist_Kernel(int4* hist, int ws, int wd, int height)
666 {
667        int row = IMUL(blockIdx.y, blockDim.y) + threadIdx.y;
668        int col = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
669 	   if(row < height && col < wd)
670 	   {
671 			int hidx = IMUL(row, wd) + col;
672 			int scol = col << 2;
673 			int sidx = IMUL(row, ws) + scol;
674 			int v[4] = {0, 0, 0, 0};
675 			if(row > 0 && row < height -1)
676 			{
677 #pragma unroll
678 				for(int i = 0; i < 4 ; ++i, ++scol)
679 				{
680 					float4 temp = tex1Dfetch(texDataF4, sidx +i);
681 					v[i] = (scol < ws -1 && scol > 0 && temp.x!=0) ? 1 : 0;
682 				}
683 			}
684 			hist[hidx] = make_int4(v[0], v[1], v[2], v[3]);
685 
686 	   }
687 }
688 
689 
690 
InitHistogram(CuTexImage * key,CuTexImage * hist)691 void ProgramCU::InitHistogram(CuTexImage* key, CuTexImage* hist)
692 {
693 	int ws = key->GetImgWidth(), hs = key->GetImgHeight();
694 	int wd = hist->GetImgWidth(), hd = hist->GetImgHeight();
695 	dim3 grid((wd  + HIST_INIT_WIDTH - 1)/ HIST_INIT_WIDTH,  hd);
696 	dim3 block(HIST_INIT_WIDTH, 1);
697 	key->BindTexture(texDataF4);
698 	InitHist_Kernel<<<grid, block>>>((int4*) hist->_cuData, ws, wd, hd);
699 }
700 
701 
702 
ReduceHist_Kernel(int4 * d_hist,int ws,int wd,int height)703 void __global__ ReduceHist_Kernel(int4* d_hist, int ws, int wd, int height)
704 {
705        int row = IMUL(blockIdx.y, blockDim.y) + threadIdx.y;
706        int col = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
707 	   if(row < height && col < wd)
708 	   {
709 			int hidx = IMUL(row, wd) + col;
710 			int scol = col << 2;
711 			int sidx = IMUL(row, ws) + scol;
712 			int v[4] = {0, 0, 0, 0};
713 #pragma unroll
714 			for(int i = 0; i < 4 && scol < ws; ++i, ++scol)
715 			{
716 				int4 temp = tex1Dfetch(texDataI4, sidx + i);
717 				v[i] = temp.x + temp.y + temp.z + temp.w;
718 			}
719 			d_hist[hidx] = make_int4(v[0], v[1], v[2], v[3]);
720 	   }
721 }
722 
ReduceHistogram(CuTexImage * hist1,CuTexImage * hist2)723 void ProgramCU::ReduceHistogram(CuTexImage*hist1, CuTexImage* hist2)
724 {
725 	int ws = hist1->GetImgWidth(), hs = hist1->GetImgHeight();
726 	int wd = hist2->GetImgWidth(), hd = hist2->GetImgHeight();
727 	int temp = (int)floor(logf(float(wd * 2/ 3)) / logf(2.0f));
728 	const int wi = min(7, max(temp , 0));
729 	hist1->BindTexture(texDataI4);
730 
731 	const int BW = 1 << wi, BH =  1 << (7 - wi);
732 	dim3 grid((wd  + BW - 1)/ BW,  (hd + BH -1) / BH);
733 	dim3 block(BW, BH);
734 	ReduceHist_Kernel<<<grid, block>>>((int4*)hist2->_cuData, ws, wd, hd);
735 }
736 
737 
ListGen_Kernel(int4 * d_list,int list_len,int width)738 void __global__ ListGen_Kernel(int4* d_list, int list_len, int width)
739 {
740 	int idx1 = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
741     int4 pos = tex1Dfetch(texDataList, idx1);
742 	int idx2 = IMUL(pos.y, width) + pos.x;
743 	int4 temp = tex1Dfetch(texDataI4, idx2);
744 	int  sum1 = temp.x + temp.y;
745 	int  sum2 = sum1 + temp.z;
746 	pos.x <<= 2;
747 	if(pos.z >= sum2)
748 	{
749 		pos.x += 3;
750 		pos.z -= sum2;
751 	}else if(pos.z >= sum1)
752 	{
753 		pos.x += 2;
754 		pos.z -= sum1;
755 	}else if(pos.z >= temp.x)
756 	{
757 		pos.x += 1;
758 		pos.z -= temp.x;
759 	}
760   if (idx1 < list_len) {
761     d_list[idx1] = pos;
762   }
763 }
764 
765 //input list (x, y) (x, y) ....
GenerateList(CuTexImage * list,CuTexImage * hist)766 void ProgramCU::GenerateList(CuTexImage* list, CuTexImage* hist)
767 {
768 	int len = list->GetImgWidth();
769 	list->BindTexture(texDataList);
770 	hist->BindTexture(texDataI4);
771 	dim3  grid((len + LISTGEN_BLOCK_DIM -1) /LISTGEN_BLOCK_DIM);
772 	dim3  block(LISTGEN_BLOCK_DIM);
773 	ListGen_Kernel<<<grid, block>>>((int4*) list->_cuData, len,
774                                   hist->GetImgWidth());
775 }
776 
ComputeOrientation_Kernel(float4 * d_list,int list_len,int width,int height,float sigma,float sigma_step,float gaussian_factor,float sample_factor,int num_orientation,int existing_keypoint,int subpixel,int keepsign)777 void __global__ ComputeOrientation_Kernel(float4* d_list,
778 										  int list_len,
779 										  int width, int height,
780 										  float sigma, float sigma_step,
781 										  float gaussian_factor, float sample_factor,
782 										  int num_orientation,
783 										  int existing_keypoint,
784 										  int subpixel,
785 										  int keepsign)
786 {
787 	const float ten_degree_per_radius = 5.7295779513082320876798154814105;
788 	const float radius_per_ten_degrees = 1.0 / 5.7295779513082320876798154814105;
789 	int idx = IMUL(blockDim.x, blockIdx.x) + threadIdx.x;
790 	if(idx >= list_len) return;
791 	float4 key;
792 	if(existing_keypoint)
793 	{
794 		key = tex1Dfetch(texDataF4, idx);
795 	}else
796 	{
797 		int4 ikey = tex1Dfetch(texDataList, idx);
798 		key.x = ikey.x + 0.5f;
799 		key.y = ikey.y + 0.5f;
800 		key.z = sigma;
801 		if(subpixel || keepsign)
802 		{
803 			float4 offset = tex1Dfetch(texDataF4, IMUL(width, ikey.y) + ikey.x);
804 			if(subpixel)
805 			{
806 				key.x += offset.y;
807 				key.y += offset.z;
808 				key.z *= pow(sigma_step, offset.w);
809 			}
810 			if(keepsign) key.z *= offset.x;
811 		}
812 	}
813 	if(num_orientation == 0)
814 	{
815 		key.w = 0;
816 		d_list[idx] = key;
817 		return;
818 	}
819 	float vote[37];
820 	float gsigma = key.z * gaussian_factor;
821 	float win = fabs(key.z) * sample_factor;
822 	float dist_threshold = win * win + 0.5;
823 	float factor = -0.5f / (gsigma * gsigma);
824 	float xmin = max(1.5f, floor(key.x - win) + 0.5f);
825 	float ymin = max(1.5f, floor(key.y - win) + 0.5f);
826 	float xmax = min(width - 1.5f, floor(key.x + win) + 0.5f);
827 	float ymax = min(height -1.5f, floor(key.y + win) + 0.5f);
828 #pragma unroll
829 	for(int i = 0; i < 36; ++i) vote[i] = 0.0f;
830 	for(float y = ymin; y <= ymax; y += 1.0f)
831 	{
832 		for(float x = xmin; x <= xmax; x += 1.0f)
833 		{
834 			float dx = x - key.x;
835 			float dy = y - key.y;
836 			float sq_dist  = dx * dx + dy * dy;
837 			if(sq_dist >= dist_threshold) continue;
838 			float2 got = tex2D(texDataF2, x, y);
839 			float weight = got.x * exp(sq_dist * factor);
840 			float fidx = floor(got.y * ten_degree_per_radius);
841 			int oidx = fidx;
842 			if(oidx < 0) oidx += 36;
843 			vote[oidx] += weight;
844 		}
845 	}
846 
847 	//filter the vote
848 
849 	const float one_third = 1.0 /3.0;
850 #pragma unroll
851 	for(int i = 0; i < 6; ++i)
852 	{
853 		vote[36] = vote[0];
854 		float pre = vote[35];
855 #pragma unroll
856 		for(int j = 0; j < 36; ++j)
857 		{
858 			float temp = one_third * (pre + vote[j] + vote[j + 1]);
859 			pre = vote[j];			vote[j] = temp;
860 		}
861 	}
862 
863 	vote[36] = vote[0];
864 	if(num_orientation == 1 || existing_keypoint)
865 	{
866 		int index_max = 0;
867 		float max_vote = vote[0];
868 #pragma unroll
869 		for(int i = 1; i < 36; ++i)
870 		{
871 			index_max =  vote[i] > max_vote? i : index_max;
872 			max_vote = max(max_vote, vote[i]);
873 		}
874 		float pre = vote[index_max == 0? 35 : index_max -1];
875 		float next = vote[index_max + 1];
876 		float weight = max_vote;
877 		float off =  0.5f * FDIV(next - pre, weight + weight - next - pre);
878 		key.w = radius_per_ten_degrees * (index_max + 0.5f + off);
879 		d_list[idx] = key;
880 
881 	}else
882 	{
883 		float max_vote = vote[0];
884 #pragma unroll
885 		for(int i = 1; i < 36; ++i)		max_vote = max(max_vote, vote[i]);
886 
887 		float vote_threshold = max_vote * 0.8f;
888 		float pre = vote[35];
889 		float max_rot[2], max_vot[2] = {0, 0};
890 		int  ocount = 0;
891 #pragma unroll
892 		for(int i =0; i < 36; ++i)
893 		{
894 			float next = vote[i + 1];
895 			if(vote[i] > vote_threshold && vote[i] > pre && vote[i] > next)
896 			{
897 				float di = 0.5f * FDIV(next - pre, vote[i] + vote[i] - next - pre);
898 				float rot = i + di + 0.5f;
899 				float weight = vote[i];
900 				///
901 				if(weight > max_vot[1])
902 				{
903 					if(weight > max_vot[0])
904 					{
905 						max_vot[1] = max_vot[0];
906 						max_rot[1] = max_rot[0];
907 						max_vot[0] = weight;
908 						max_rot[0] = rot;
909 					}
910 					else
911 					{
912 						max_vot[1] = weight;
913 						max_rot[1] = rot;
914 					}
915 					ocount ++;
916 				}
917 			}
918 			pre = vote[i];
919 		}
920 		float fr1 = max_rot[0] / 36.0f;
921 		if(fr1 < 0) fr1 += 1.0f;
922 		unsigned short us1 = ocount == 0? 65535 : ((unsigned short )floor(fr1 * 65535.0f));
923 		unsigned short us2 = 65535;
924 		if(ocount > 1)
925 		{
926 			float fr2 = max_rot[1] / 36.0f;
927 			if(fr2 < 0) fr2 += 1.0f;
928 			us2 = (unsigned short ) floor(fr2 * 65535.0f);
929 		}
930 		unsigned int uspack = (us2 << 16) | us1;
931 		key.w = __int_as_float(uspack);
932 		d_list[idx] = key;
933 	}
934 
935 }
936 
937 
938 
939 
ComputeOrientation(CuTexImage * list,CuTexImage * got,CuTexImage * key,float sigma,float sigma_step,int existing_keypoint)940 void ProgramCU::ComputeOrientation(CuTexImage* list, CuTexImage* got, CuTexImage*key,
941 								   float sigma, float sigma_step, int existing_keypoint)
942 {
943 	int len = list->GetImgWidth();
944 	if(len <= 0) return;
945 	int width = got->GetImgWidth(), height = got->GetImgHeight();
946 	if(existing_keypoint)
947 	{
948 		list->BindTexture(texDataF4);
949 	}else
950 	{
951 		list->BindTexture(texDataList);
952 		if(GlobalUtil::_SubpixelLocalization) key->BindTexture(texDataF4);
953 	}
954 	got->BindTexture2D(texDataF2);
955 
956 	const int block_width = len < ORIENTATION_COMPUTE_PER_BLOCK ? 16 : ORIENTATION_COMPUTE_PER_BLOCK;
957 	dim3 grid((len + block_width -1) / block_width);
958 	dim3 block(block_width);
959 
960 	ComputeOrientation_Kernel<<<grid, block>>>((float4*) list->_cuData,
961 		len, width, height, sigma, sigma_step,
962 		GlobalUtil::_OrientationGaussianFactor,
963 		GlobalUtil::_OrientationGaussianFactor * GlobalUtil::_OrientationWindowFactor,
964 		GlobalUtil::_FixedOrientation? 0 : GlobalUtil::_MaxOrientation,
965 		existing_keypoint, GlobalUtil::_SubpixelLocalization, GlobalUtil::_KeepExtremumSign);
966 
967 	ProgramCU::CheckErrorCUDA("ComputeOrientation");
968 }
969 
ComputeDescriptor_Kernel(float4 * d_des,int num,int width,int height,float window_factor)970 template <bool DYNAMIC_INDEXING> void __global__ ComputeDescriptor_Kernel(float4* d_des, int num,
971 											 int width, int height, float window_factor)
972 {
973 	const float rpi = 4.0/ 3.14159265358979323846;
974 	int idx = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
975 	int fidx = idx >> 4;
976 	if(fidx >= num) return;
977 	float4 key = tex1Dfetch(texDataF4, fidx);
978 	int bidx = idx& 0xf, ix = bidx & 0x3, iy = bidx >> 2;
979 	float spt = fabs(key.z * window_factor);
980 	float s, c; __sincosf(key.w, &s, &c);
981 	float anglef = key.w > 3.14159265358979323846? key.w - (2.0 * 3.14159265358979323846) : key.w ;
982 	float cspt = c * spt, sspt = s * spt;
983 	float crspt = c / spt, srspt = s / spt;
984 	float2 offsetpt, pt;
985 	float xmin, ymin, xmax, ymax, bsz;
986 	offsetpt.x = ix - 1.5f;
987 	offsetpt.y = iy - 1.5f;
988 	pt.x = cspt * offsetpt.x - sspt * offsetpt.y + key.x;
989 	pt.y = cspt * offsetpt.y + sspt * offsetpt.x + key.y;
990 	bsz =  fabs(cspt) + fabs(sspt);
991 	xmin = max(1.5f, floor(pt.x - bsz) + 0.5f);
992 	ymin = max(1.5f, floor(pt.y - bsz) + 0.5f);
993 	xmax = min(width - 1.5f, floor(pt.x + bsz) + 0.5f);
994 	ymax = min(height - 1.5f, floor(pt.y + bsz) + 0.5f);
995 	float des[9];
996 #pragma unroll
997 	for(int i =0; i < 9; ++i) des[i] = 0.0f;
998 	for(float y = ymin; y <= ymax; y += 1.0f)
999 	{
1000 		for(float x = xmin; x <= xmax; x += 1.0f)
1001 		{
1002 			float dx = x - pt.x;
1003 			float dy = y - pt.y;
1004 			float nx = crspt * dx + srspt * dy;
1005 			float ny = crspt * dy - srspt * dx;
1006 			float nxn = fabs(nx);
1007 			float nyn = fabs(ny);
1008 			if(nxn < 1.0f && nyn < 1.0f)
1009 			{
1010 				float2 cc = tex2D(texDataF2, x, y);
1011 				float dnx = nx + offsetpt.x;
1012 				float dny = ny + offsetpt.y;
1013 				float ww = exp(-0.125f * (dnx * dnx + dny * dny));
1014 				float wx = 1.0 - nxn;
1015 				float wy = 1.0 - nyn;
1016 				float weight = ww * wx * wy * cc.x;
1017 				float theta = (anglef - cc.y) * rpi;
1018 				if(theta < 0) theta += 8.0f;
1019 				float fo = floor(theta);
1020 				int fidx = fo;
1021 				float weight1 = fo + 1.0f  - theta;
1022 				float weight2 = theta - fo;
1023 				if(DYNAMIC_INDEXING)
1024 				{
1025 					des[fidx] += (weight1 * weight);
1026 					des[fidx + 1] += (weight2 * weight);
1027 					//this dynamic indexing part might be slow
1028 				}else
1029 				{
1030 					#pragma unroll
1031 					for(int k = 0; k < 8; ++k)
1032 					{
1033 						if(k == fidx)
1034 						{
1035 							des[k] += (weight1 * weight);
1036 							des[k+1] += (weight2 * weight);
1037 						}
1038 					}
1039 				}
1040 			}
1041 		}
1042 	}
1043 	des[0] += des[8];
1044 
1045 	int didx = idx << 1;
1046 	d_des[didx] = make_float4(des[0], des[1], des[2], des[3]);
1047 	d_des[didx+1] = make_float4(des[4], des[5], des[6], des[7]);
1048 }
1049 
1050 
ComputeDescriptorRECT_Kernel(float4 * d_des,int num,int width,int height,float window_factor)1051 template <bool DYNAMIC_INDEXING> void __global__ ComputeDescriptorRECT_Kernel(float4* d_des, int num,
1052 											 int width, int height, float window_factor)
1053 {
1054 	const float rpi = 4.0/ 3.14159265358979323846;
1055 	int idx = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
1056 	int fidx = idx >> 4;
1057 	if(fidx >= num) return;
1058 	float4 key = tex1Dfetch(texDataF4, fidx);
1059 	int bidx = idx& 0xf, ix = bidx & 0x3, iy = bidx >> 2;
1060     //float aspect_ratio = key.w / key.z;
1061     //float aspect_sq = aspect_ratio * aspect_ratio;
1062 	float sptx = key.z * 0.25, spty = key.w * 0.25;
1063 	float xmin, ymin, xmax, ymax; float2 pt;
1064 	pt.x = sptx * (ix + 0.5f)  + key.x;
1065 	pt.y = spty * (iy + 0.5f)  + key.y;
1066 	xmin = max(1.5f, floor(pt.x - sptx) + 0.5f);
1067 	ymin = max(1.5f, floor(pt.y - spty) + 0.5f);
1068 	xmax = min(width - 1.5f, floor(pt.x + sptx) + 0.5f);
1069 	ymax = min(height - 1.5f, floor(pt.y + spty) + 0.5f);
1070 	float des[9];
1071 #pragma unroll
1072 	for(int i =0; i < 9; ++i) des[i] = 0.0f;
1073 	for(float y = ymin; y <= ymax; y += 1.0f)
1074 	{
1075 		for(float x = xmin; x <= xmax; x += 1.0f)
1076 		{
1077 			float nx = (x - pt.x) / sptx;
1078 			float ny = (y - pt.y) / spty;
1079 			float nxn = fabs(nx);
1080 			float nyn = fabs(ny);
1081 			if(nxn < 1.0f && nyn < 1.0f)
1082 			{
1083 				float2 cc = tex2D(texDataF2, x, y);
1084 				float wx = 1.0 - nxn;
1085 				float wy = 1.0 - nyn;
1086 				float weight =  wx * wy * cc.x;
1087 				float theta = (- cc.y) * rpi;
1088 				if(theta < 0) theta += 8.0f;
1089 				float fo = floor(theta);
1090 				int fidx = fo;
1091 				float weight1 = fo + 1.0f  - theta;
1092 				float weight2 = theta - fo;
1093 				if(DYNAMIC_INDEXING)
1094 				{
1095 					des[fidx] += (weight1 * weight);
1096 					des[fidx + 1] += (weight2 * weight);
1097 					//this dynamic indexing part might be slow
1098 				}else
1099 				{
1100 					#pragma unroll
1101 					for(int k = 0; k < 8; ++k)
1102 					{
1103 						if(k == fidx)
1104 						{
1105 							des[k] += (weight1 * weight);
1106 							des[k+1] += (weight2 * weight);
1107 						}
1108 					}
1109 				}
1110 			}
1111 		}
1112 	}
1113 	des[0] += des[8];
1114 
1115 	int didx = idx << 1;
1116 	d_des[didx] = make_float4(des[0], des[1], des[2], des[3]);
1117 	d_des[didx+1] = make_float4(des[4], des[5], des[6], des[7]);
1118 }
1119 
NormalizeDescriptor_Kernel(float4 * d_des,int num)1120 void __global__ NormalizeDescriptor_Kernel(float4* d_des, int num)
1121 {
1122 	float4 temp[32];
1123 	int idx = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
1124 	if(idx >= num) return;
1125 	int sidx = idx << 5;
1126 	float norm1 = 0, norm2 = 0;
1127 #pragma unroll
1128 	for(int i = 0; i < 32; ++i)
1129 	{
1130 		temp[i] = tex1Dfetch(texDataF4, sidx +i);
1131 		norm1 += (temp[i].x * temp[i].x + temp[i].y * temp[i].y +
1132 				 temp[i].z * temp[i].z + temp[i].w * temp[i].w);
1133 	}
1134 	norm1 = rsqrt(norm1);
1135 
1136 #pragma unroll
1137 	for(int i = 0; i < 32; ++i)
1138 	{
1139 		temp[i].x = min(0.2f, temp[i].x * norm1);
1140 		temp[i].y = min(0.2f, temp[i].y * norm1);
1141 		temp[i].z = min(0.2f, temp[i].z * norm1);
1142 		temp[i].w = min(0.2f, temp[i].w * norm1);
1143 		norm2 += (temp[i].x * temp[i].x + temp[i].y * temp[i].y +
1144 				 temp[i].z * temp[i].z + temp[i].w * temp[i].w);
1145 	}
1146 
1147 	norm2 = rsqrt(norm2);
1148 #pragma unroll
1149 	for(int i = 0; i < 32; ++i)
1150 	{
1151 		temp[i].x *= norm2;		temp[i].y *= norm2;
1152 		temp[i].z *= norm2;		temp[i].w *= norm2;
1153 		d_des[sidx + i] = temp[i];
1154 	}
1155 }
1156 
ComputeDescriptor(CuTexImage * list,CuTexImage * got,CuTexImage * dtex,int rect,int stream)1157 void ProgramCU::ComputeDescriptor(CuTexImage*list, CuTexImage* got, CuTexImage* dtex, int rect, int stream)
1158 {
1159 	int num = list->GetImgWidth();
1160 	int width = got->GetImgWidth();
1161 	int height = got->GetImgHeight();
1162 
1163     dtex->InitTexture(num * 128, 1, 1);
1164 	got->BindTexture2D(texDataF2);
1165 	list->BindTexture(texDataF4);
1166 	int block_width = DESCRIPTOR_COMPUTE_BLOCK_SIZE;
1167 	dim3 grid((num * 16 + block_width -1) / block_width);
1168 	dim3 block(block_width);
1169 
1170     if(rect)
1171     {
1172 	    if(GlobalUtil::_UseDynamicIndexing)
1173 	    	ComputeDescriptorRECT_Kernel<true><<<grid, block>>>((float4*) dtex->_cuData, num, width, height, GlobalUtil::_DescriptorWindowFactor);
1174 	    else
1175 	    	ComputeDescriptorRECT_Kernel<false><<<grid, block>>>((float4*) dtex->_cuData, num, width, height, GlobalUtil::_DescriptorWindowFactor);
1176 
1177     }else
1178     {
1179 	    if(GlobalUtil::_UseDynamicIndexing)
1180 	    	ComputeDescriptor_Kernel<true><<<grid, block>>>((float4*) dtex->_cuData, num, width, height, GlobalUtil::_DescriptorWindowFactor);
1181 	    else
1182 	    	ComputeDescriptor_Kernel<false><<<grid, block>>>((float4*) dtex->_cuData, num, width, height, GlobalUtil::_DescriptorWindowFactor);
1183     }
1184 	if(GlobalUtil::_NormalizedSIFT)
1185 	{
1186 		dtex->BindTexture(texDataF4);
1187 		const int block_width = DESCRIPTOR_NORMALIZ_PER_BLOCK;
1188 		dim3 grid((num + block_width -1) / block_width);
1189 		dim3 block(block_width);
1190 		NormalizeDescriptor_Kernel<<<grid, block>>>((float4*) dtex->_cuData, num);
1191 	}
1192 	CheckErrorCUDA("ComputeDescriptor");
1193 }
1194 
1195 //////////////////////////////////////////////////////
FinishCUDA()1196 void ProgramCU::FinishCUDA()
1197 {
1198 	cudaThreadSynchronize();
1199 }
1200 
CheckErrorCUDA(const char * location)1201 int ProgramCU::CheckErrorCUDA(const char* location)
1202 {
1203 	cudaError_t e = cudaGetLastError();
1204 	if(e)
1205 	{
1206         if(location) fprintf(stderr, "%s:\t",  location);
1207 		fprintf(stderr, "%s\n",  cudaGetErrorString(e));
1208 		//assert(0);
1209         return 1;
1210 	}else
1211     {
1212         return 0;
1213     }
1214 }
1215 
ConvertDOG_Kernel(float * d_result,int width,int height)1216 void __global__ ConvertDOG_Kernel(float* d_result, int width, int height)
1217 {
1218 	int row = (blockIdx.y << BLOCK_LOG_DIM) + threadIdx.y;
1219 	int col = (blockIdx.x << BLOCK_LOG_DIM) + threadIdx.x;
1220 	if(col < width && row < height)
1221 	{
1222 		int index = row * width  + col;
1223 		float v = tex1Dfetch(texData, index);
1224 		d_result[index] = (col == 0 || row == 0 || col == width -1 || row == height -1)?
1225 			0.5 : saturate(0.5+20.0*v);
1226 	}
1227 }
1228 ///
DisplayConvertDOG(CuTexImage * dog,CuTexImage * out)1229 void ProgramCU::DisplayConvertDOG(CuTexImage* dog, CuTexImage* out)
1230 {
1231 	if(out->_cuData == NULL) return;
1232 	int width = dog->GetImgWidth(), height = dog ->GetImgHeight();
1233 	dog->BindTexture(texData);
1234 	dim3 grid((width + BLOCK_DIM - 1)/ BLOCK_DIM,  (height + BLOCK_DIM - 1)/BLOCK_DIM);
1235 	dim3 block(BLOCK_DIM, BLOCK_DIM);
1236 	ConvertDOG_Kernel<<<grid, block>>>((float*) out->_cuData, width, height);
1237 	ProgramCU::CheckErrorCUDA("DisplayConvertDOG");
1238 }
1239 
ConvertGRD_Kernel(float * d_result,int width,int height)1240 void __global__ ConvertGRD_Kernel(float* d_result, int width, int height)
1241 {
1242 	int row = (blockIdx.y << BLOCK_LOG_DIM) + threadIdx.y;
1243 	int col = (blockIdx.x << BLOCK_LOG_DIM) + threadIdx.x;
1244 	if(col < width && row < height)
1245 	{
1246 		int index = row * width  + col;
1247 		float v = tex1Dfetch(texData, index << 1);
1248 		d_result[index] = (col == 0 || row == 0 || col == width -1 || row == height -1)?
1249 				0 : saturate(5 * v);
1250 
1251 	}
1252 }
1253 
1254 
DisplayConvertGRD(CuTexImage * got,CuTexImage * out)1255 void ProgramCU::DisplayConvertGRD(CuTexImage* got, CuTexImage* out)
1256 {
1257 	if(out->_cuData == NULL) return;
1258 	int width = got->GetImgWidth(), height = got ->GetImgHeight();
1259 	got->BindTexture(texData);
1260 	dim3 grid((width + BLOCK_DIM - 1)/ BLOCK_DIM,  (height + BLOCK_DIM - 1)/BLOCK_DIM);
1261 	dim3 block(BLOCK_DIM, BLOCK_DIM);
1262 	ConvertGRD_Kernel<<<grid, block>>>((float*) out->_cuData, width, height);
1263 	ProgramCU::CheckErrorCUDA("DisplayConvertGRD");
1264 }
1265 
ConvertKEY_Kernel(float4 * d_result,int width,int height)1266 void __global__ ConvertKEY_Kernel(float4* d_result, int width, int height)
1267 {
1268 
1269 	int row = (blockIdx.y << BLOCK_LOG_DIM) + threadIdx.y;
1270 	int col = (blockIdx.x << BLOCK_LOG_DIM) + threadIdx.x;
1271 	if(col < width && row < height)
1272 	{
1273 		int index = row * width + col;
1274 		float4 keyv = tex1Dfetch(texDataF4, index);
1275 		int is_key = (keyv.x == 1.0f || keyv.x == -1.0f);
1276 		int inside = col > 0 && row > 0 && row < height -1 && col < width - 1;
1277 		float v = inside? saturate(0.5 + 20 * tex1Dfetch(texData, index)) : 0.5;
1278 		d_result[index] = is_key && inside ?
1279 			(keyv.x > 0? make_float4(1.0f, 0, 0, 1.0f) : make_float4(0.0f, 1.0f, 0.0f, 1.0f)):
1280 			make_float4(v, v, v, 1.0f) ;
1281 	}
1282 }
DisplayConvertKEY(CuTexImage * key,CuTexImage * dog,CuTexImage * out)1283 void ProgramCU::DisplayConvertKEY(CuTexImage* key, CuTexImage* dog, CuTexImage* out)
1284 {
1285 	if(out->_cuData == NULL) return;
1286 	int width = key->GetImgWidth(), height = key ->GetImgHeight();
1287 	dog->BindTexture(texData);
1288 	key->BindTexture(texDataF4);
1289 	dim3 grid((width + BLOCK_DIM - 1)/ BLOCK_DIM,  (height + BLOCK_DIM - 1)/BLOCK_DIM);
1290 	dim3 block(BLOCK_DIM, BLOCK_DIM);
1291 	ConvertKEY_Kernel<<<grid, block>>>((float4*) out->_cuData, width, height);
1292 }
1293 
1294 
DisplayKeyPoint_Kernel(float4 * d_result,int num)1295 void __global__ DisplayKeyPoint_Kernel(float4 * d_result, int num)
1296 {
1297 	int idx = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
1298 	if(idx >= num) return;
1299 	float4 v = tex1Dfetch(texDataF4, idx);
1300 	d_result[idx] = make_float4(v.x, v.y, 0, 1.0f);
1301 }
1302 
DisplayKeyPoint(CuTexImage * ftex,CuTexImage * out)1303 void ProgramCU::DisplayKeyPoint(CuTexImage* ftex, CuTexImage* out)
1304 {
1305 	int num = ftex->GetImgWidth();
1306 	int block_width = 64;
1307 	dim3 grid((num + block_width -1) /block_width);
1308 	dim3 block(block_width);
1309 	ftex->BindTexture(texDataF4);
1310 	DisplayKeyPoint_Kernel<<<grid, block>>>((float4*) out->_cuData, num);
1311 	ProgramCU::CheckErrorCUDA("DisplayKeyPoint");
1312 }
1313 
DisplayKeyBox_Kernel(float4 * d_result,int num)1314 void __global__ DisplayKeyBox_Kernel(float4* d_result, int num)
1315 {
1316 	int idx = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
1317 	if(idx >= num) return;
1318 	int  kidx = idx / 10, vidx = idx - IMUL(kidx , 10);
1319 	float4 v = tex1Dfetch(texDataF4, kidx);
1320 	float sz = fabs(v.z * 3.0f);
1321 	///////////////////////
1322 	float s, c;	__sincosf(v.w, &s, &c);
1323 	///////////////////////
1324 	float dx = vidx == 0? 0 : ((vidx <= 4 || vidx >= 9)? sz : -sz);
1325 	float dy = vidx <= 1? 0 : ((vidx <= 2 || vidx >= 7)? -sz : sz);
1326 	float4 pos;
1327 	pos.x = v.x + c * dx - s * dy;
1328 	pos.y = v.y + c * dy + s * dx;
1329 	pos.z = 0;	pos.w = 1.0f;
1330 	d_result[idx]  = pos;
1331 }
1332 
DisplayKeyBox(CuTexImage * ftex,CuTexImage * out)1333 void ProgramCU::DisplayKeyBox(CuTexImage* ftex, CuTexImage* out)
1334 {
1335 	int len = ftex->GetImgWidth();
1336 	int block_width = 32;
1337 	dim3 grid((len * 10 + block_width -1) / block_width);
1338 	dim3 block(block_width);
1339 	ftex->BindTexture(texDataF4);
1340 	DisplayKeyBox_Kernel<<<grid, block>>>((float4*) out->_cuData, len * 10);
1341 }
1342 ///////////////////////////////////////////////////////////////////
BindTexture(textureReference & texRef)1343 inline void CuTexImage:: BindTexture(textureReference& texRef)
1344 {
1345 	 cudaBindTexture(NULL, &texRef, _cuData, &texRef.channelDesc, _numBytes);
1346 }
1347 
BindTexture2D(textureReference & texRef)1348 inline void CuTexImage::BindTexture2D(textureReference& texRef)
1349 {
1350 #if defined(SIFTGPU_ENABLE_LINEAR_TEX2D)
1351 	cudaBindTexture2D(0, &texRef, _cuData, &texRef.channelDesc, _imgWidth, _imgHeight, _imgWidth* _numChannel* sizeof(float));
1352 #else
1353 	cudaChannelFormatDesc desc;
1354 	cudaGetChannelDesc(&desc, _cuData2D);
1355 	cudaBindTextureToArray(&texRef, _cuData2D, &desc);
1356 #endif
1357 }
1358 
CheckCudaDevice(int device)1359 int ProgramCU::CheckCudaDevice(int device)
1360 {
1361     int count = 0, device_used;
1362     if(cudaGetDeviceCount(&count) != cudaSuccess  || count <= 0)
1363     {
1364         ProgramCU::CheckErrorCUDA("CheckCudaDevice");
1365         return 0;
1366     }else if(count == 1)
1367     {
1368         cudaDeviceProp deviceProp;
1369         if ( cudaGetDeviceProperties(&deviceProp, 0) != cudaSuccess  ||
1370 		  (deviceProp.major == 9999 && deviceProp.minor == 9999))
1371         {
1372             fprintf(stderr, "CheckCudaDevice: no device supporting CUDA.\n");
1373             return 0;
1374         }else
1375 		{
1376 			GlobalUtil::_MemCapGPU = deviceProp.totalGlobalMem / 1024;
1377 			GlobalUtil::_texMaxDimGL = 32768;
1378 			if(GlobalUtil::_verbose)
1379 				fprintf(stdout, "NOTE: changing maximum texture dimension to %d\n", GlobalUtil::_texMaxDimGL);
1380 
1381 		}
1382     }
1383     if(device >0 && device < count)
1384     {
1385         cudaSetDevice(device);
1386         CheckErrorCUDA("cudaSetDevice\n");
1387     }
1388     cudaGetDevice(&device_used);
1389     if(device != device_used)
1390         fprintf(stderr,  "\nERROR:   Cannot set device to %d\n"
1391         "\nWARNING: Use # %d device instead (out of %d)\n", device, device_used, count);
1392     return 1;
1393 }
1394 
1395 ////////////////////////////////////////////////////////////////////////////////////////
1396 // siftmatch funtions
1397 //////////////////////////////////////////////////////////////////////////////////////////
1398 
1399 #define MULT_TBLOCK_DIMX 128
1400 #define MULT_TBLOCK_DIMY 1
1401 #define MULT_BLOCK_DIMX (MULT_TBLOCK_DIMX)
1402 #define MULT_BLOCK_DIMY (8 * MULT_TBLOCK_DIMY)
1403 
1404 
1405 texture<uint4, 1, cudaReadModeElementType> texDes1;
1406 texture<uint4, 1, cudaReadModeElementType> texDes2;
1407 
MultiplyDescriptor_Kernel(int * d_result,int num1,int num2,int3 * d_temp)1408 void __global__ MultiplyDescriptor_Kernel(int* d_result, int num1, int num2, int3* d_temp)
1409 {
1410 	int idx01 = (blockIdx.y  * MULT_BLOCK_DIMY),  idx02 = (blockIdx.x  * MULT_BLOCK_DIMX);
1411 
1412 	int idx1 = idx01 + threadIdx.y, idx2 = idx02 + threadIdx.x;
1413 	__shared__ int data1[17 * 2 * MULT_BLOCK_DIMY];
1414 	int read_idx1 = idx01 * 8 +  threadIdx.x, read_idx2 = idx2 * 8;
1415 	int col4 = threadIdx.x & 0x3, row4 = threadIdx.x >> 2;
1416 	int cache_idx1 = IMUL(row4, 17) + (col4 << 2);
1417 
1418 	///////////////////////////////////////////////////////////////
1419 	//Load feature descriptors
1420 	///////////////////////////////////////////////////////////////
1421 #if MULT_BLOCK_DIMY == 16
1422 	uint4 v = tex1Dfetch(texDes1, read_idx1);
1423 	data1[cache_idx1]   = v.x;	data1[cache_idx1+1] = v.y;
1424 	data1[cache_idx1+2] = v.z;	data1[cache_idx1+3] = v.w;
1425 #elif MULT_BLOCK_DIMY == 8
1426 	if(threadIdx.x < 64)
1427 	{
1428 		uint4 v = tex1Dfetch(texDes1, read_idx1);
1429 		data1[cache_idx1]   = v.x;		data1[cache_idx1+1] = v.y;
1430 		data1[cache_idx1+2] = v.z;		data1[cache_idx1+3] = v.w;
1431 	}
1432 #else
1433 #error
1434 #endif
1435 	__syncthreads();
1436 
1437 	///
1438 	if(idx2 >= num2) return;
1439 	///////////////////////////////////////////////////////////////////////////
1440 	//compare descriptors
1441 
1442 	int results[MULT_BLOCK_DIMY];
1443 #pragma unroll
1444 	for(int i = 0; i < MULT_BLOCK_DIMY; ++i) results[i] = 0;
1445 
1446 #pragma unroll
1447 	for(int i = 0; i < 8; ++i)
1448 	{
1449 		uint4 v = tex1Dfetch(texDes2, read_idx2 + i);
1450 		unsigned char* p2 = (unsigned char*)(&v);
1451 #pragma unroll
1452 		for(int k = 0; k < MULT_BLOCK_DIMY; ++k)
1453 		{
1454 			unsigned char* p1 = (unsigned char*) (data1 + k * 34 + i *  4 + (i/4));
1455 			results[k] += 	 ( IMUL(p1[0], p2[0])	+ IMUL(p1[1], p2[1])
1456 							 + IMUL(p1[2], p2[2])  	+ IMUL(p1[3], p2[3])
1457 							 + IMUL(p1[4], p2[4])  	+ IMUL(p1[5], p2[5])
1458 							 + IMUL(p1[6], p2[6])  	+ IMUL(p1[7], p2[7])
1459 							 + IMUL(p1[8], p2[8])  	+ IMUL(p1[9], p2[9])
1460 							 + IMUL(p1[10], p2[10])	+ IMUL(p1[11], p2[11])
1461 							 + IMUL(p1[12], p2[12])	+ IMUL(p1[13], p2[13])
1462 							 + IMUL(p1[14], p2[14])	+ IMUL(p1[15], p2[15]));
1463 		}
1464 	}
1465 
1466 	int dst_idx = IMUL(idx1, num2)  + idx2;
1467 	if(d_temp)
1468 	{
1469 		int3 cmp_result = make_int3(0, -1, 0);
1470 
1471 #pragma unroll
1472 		for(int i = 0; i < MULT_BLOCK_DIMY; ++i)
1473 		{
1474 			if(idx1 + i < num1)
1475 			{
1476 				cmp_result = results[i] > cmp_result.x?
1477 				make_int3(results[i], idx1 + i, cmp_result.x) :
1478 				make_int3(cmp_result.x, cmp_result.y, max(cmp_result.z, results[i]));
1479 				d_result[dst_idx + IMUL(i, num2)] = results[i];
1480 			}
1481 		}
1482 		d_temp[ IMUL(blockIdx.y, num2) + idx2] = cmp_result;
1483 	}else
1484 	{
1485 #pragma unroll
1486 		for(int i = 0; i < MULT_BLOCK_DIMY; ++i)
1487 		{
1488 			if(idx1 + i < num1) d_result[dst_idx + IMUL(i, num2)] = results[i];
1489 		}
1490 	}
1491 
1492 }
1493 
1494 
MultiplyDescriptor(CuTexImage * des1,CuTexImage * des2,CuTexImage * texDot,CuTexImage * texCRT)1495 void ProgramCU::MultiplyDescriptor(CuTexImage* des1, CuTexImage* des2, CuTexImage* texDot, CuTexImage* texCRT)
1496 {
1497 	int num1 = des1->GetImgWidth() / 8;
1498 	int num2 = des2->GetImgWidth() / 8;
1499 	dim3 grid(	(num2 + MULT_BLOCK_DIMX - 1)/ MULT_BLOCK_DIMX,
1500 		(num1 + MULT_BLOCK_DIMY - 1)/MULT_BLOCK_DIMY);
1501 	dim3 block(MULT_TBLOCK_DIMX, MULT_TBLOCK_DIMY);
1502 	texDot->InitTexture( num2,num1);
1503 	if(texCRT) texCRT->InitTexture(num2, (num1 + MULT_BLOCK_DIMY - 1)/MULT_BLOCK_DIMY, 32);
1504 	des1->BindTexture(texDes1);
1505 	des2->BindTexture(texDes2);
1506 
1507 	MultiplyDescriptor_Kernel<<<grid, block>>>((int*)texDot->_cuData, num1, num2,
1508 												(texCRT? (int3*)texCRT->_cuData : NULL));
1509 }
1510 
1511 texture<float, 1, cudaReadModeElementType> texLoc1;
1512 texture<float2, 1, cudaReadModeElementType> texLoc2;
1513 struct Matrix33{float mat[3][3];};
1514 
1515 
1516 
MultiplyDescriptorG_Kernel(int * d_result,int num1,int num2,int3 * d_temp,Matrix33 H,float hdistmax,Matrix33 F,float fdistmax)1517 void __global__ MultiplyDescriptorG_Kernel(int* d_result, int num1, int num2, int3* d_temp,
1518 										   Matrix33 H, float hdistmax, Matrix33 F, float fdistmax)
1519 {
1520 	int idx01 = (blockIdx.y  * MULT_BLOCK_DIMY);
1521 	int idx02 = (blockIdx.x  * MULT_BLOCK_DIMX);
1522 
1523 	int idx1 = idx01 + threadIdx.y;
1524 	int idx2 = idx02 + threadIdx.x;
1525 	__shared__ int data1[17 * 2 * MULT_BLOCK_DIMY];
1526 	__shared__ float loc1[MULT_BLOCK_DIMY * 2];
1527 	int read_idx1 = idx01 * 8 +  threadIdx.x ;
1528 	int read_idx2 = idx2 * 8;
1529 	int col4 = threadIdx.x & 0x3, row4 = threadIdx.x >> 2;
1530 	int cache_idx1 = IMUL(row4, 17) + (col4 << 2);
1531 #if MULT_BLOCK_DIMY == 16
1532 	uint4 v = tex1Dfetch(texDes1, read_idx1);
1533 	data1[cache_idx1]   = v.x;
1534 	data1[cache_idx1+1] = v.y;
1535 	data1[cache_idx1+2] = v.z;
1536 	data1[cache_idx1+3] = v.w;
1537 #elif MULT_BLOCK_DIMY == 8
1538 	if(threadIdx.x < 64)
1539 	{
1540 		uint4 v = tex1Dfetch(texDes1, read_idx1);
1541 		data1[cache_idx1]   = v.x;
1542 		data1[cache_idx1+1] = v.y;
1543 		data1[cache_idx1+2] = v.z;
1544 		data1[cache_idx1+3] = v.w;
1545 	}
1546 #else
1547 #error
1548 #endif
1549 	__syncthreads();
1550 	if(threadIdx.x < MULT_BLOCK_DIMY * 2)
1551 	{
1552 		loc1[threadIdx.x] = tex1Dfetch(texLoc1, 2 * idx01 + threadIdx.x);
1553 	}
1554 	__syncthreads();
1555 	if(idx2 >= num2) return;
1556 	int results[MULT_BLOCK_DIMY];
1557 	/////////////////////////////////////////////////////////////////////////////////////////////
1558 	//geometric verification
1559 	/////////////////////////////////////////////////////////////////////////////////////////////
1560 	int good_count = 0;
1561 	float2 loc2 = tex1Dfetch(texLoc2, idx2);
1562 #pragma unroll
1563 	for(int i = 0; i < MULT_BLOCK_DIMY; ++i)
1564 	{
1565 
1566 		if(idx1 + i < num1)
1567 		{
1568 			float* loci = loc1 + i * 2;
1569 			float locx = loci[0], locy = loci[1];
1570 			//homography
1571 			float x[3], diff[2];
1572 			x[0] = H.mat[0][0] * locx + H.mat[0][1] * locy + H.mat[0][2];
1573 			x[1] = H.mat[1][0] * locx + H.mat[1][1] * locy + H.mat[1][2];
1574 			x[2] = H.mat[2][0] * locx + H.mat[2][1] * locy + H.mat[2][2];
1575 			diff[0] = FDIV(x[0], x[2]) - loc2.x;
1576 			diff[1] = FDIV(x[1], x[2]) - loc2.y;
1577       float hdist = diff[0] * diff[0] + diff[1] * diff[1];
1578 			if(hdist < hdistmax)
1579 			{
1580 				//check fundamental matrix
1581 				float fx1[3], ftx2[3], x2fx1, se;
1582 				fx1[0] = F.mat[0][0] * locx + F.mat[0][1] * locy + F.mat[0][2];
1583 				fx1[1] = F.mat[1][0] * locx + F.mat[1][1] * locy + F.mat[1][2];
1584 				fx1[2] = F.mat[2][0] * locx + F.mat[2][1] * locy + F.mat[2][2];
1585 
1586 				ftx2[0] = F.mat[0][0] * loc2.x + F.mat[1][0] * loc2.y + F.mat[2][0];
1587 				ftx2[1] = F.mat[0][1] * loc2.x + F.mat[1][1] * loc2.y + F.mat[2][1];
1588 				//ftx2[2] = F.mat[0][2] * loc2.x + F.mat[1][2] * loc2.y + F.mat[2][2];
1589 
1590 				x2fx1 = loc2.x * fx1[0]  + loc2.y * fx1[1] + fx1[2];
1591 				se = FDIV(x2fx1 * x2fx1, fx1[0] * fx1[0] + fx1[1] * fx1[1] + ftx2[0] * ftx2[0] + ftx2[1] * ftx2[1]);
1592 				results[i] = se < fdistmax? 0: -262144;
1593 			}else
1594 			{
1595 				results[i] = -262144;
1596 			}
1597 		}else
1598 		{
1599 			results[i] = -262144;
1600 		}
1601 		good_count += (results[i] >=0);
1602 	}
1603 	/////////////////////////////////////////////////////////////////////////////////////////////
1604 	///compare feature descriptors anyway
1605 	/////////////////////////////////////////////////////////////////////////////////////////////
1606 	if(good_count > 0)
1607 	{
1608 #pragma unroll
1609 		for(int i = 0; i < 8; ++i)
1610 		{
1611 			uint4 v = tex1Dfetch(texDes2, read_idx2 + i);
1612 			unsigned char* p2 = (unsigned char*)(&v);
1613 #pragma unroll
1614 			for(int k = 0; k < MULT_BLOCK_DIMY; ++k)
1615 			{
1616 				unsigned char* p1 = (unsigned char*) (data1 + k * 34 + i *  4 + (i/4));
1617 				results[k] += 	 ( IMUL(p1[0], p2[0])	+ IMUL(p1[1], p2[1])
1618 								 + IMUL(p1[2], p2[2])  	+ IMUL(p1[3], p2[3])
1619 								 + IMUL(p1[4], p2[4])  	+ IMUL(p1[5], p2[5])
1620 								 + IMUL(p1[6], p2[6])  	+ IMUL(p1[7], p2[7])
1621 								 + IMUL(p1[8], p2[8])  	+ IMUL(p1[9], p2[9])
1622 								 + IMUL(p1[10], p2[10])	+ IMUL(p1[11], p2[11])
1623 								 + IMUL(p1[12], p2[12])	+ IMUL(p1[13], p2[13])
1624 								 + IMUL(p1[14], p2[14])	+ IMUL(p1[15], p2[15]));
1625 			}
1626 		}
1627 	}
1628 	int dst_idx = IMUL(idx1, num2)  + idx2;
1629 	if(d_temp)
1630 	{
1631 		int3 cmp_result = make_int3(0, -1, 0);
1632 #pragma unroll
1633 		for(int i= 0; i < MULT_BLOCK_DIMY; ++i)
1634 		{
1635 			if(idx1 + i < num1)
1636 			{
1637 				cmp_result = results[i] > cmp_result.x?
1638 				make_int3(results[i], idx1 + i, cmp_result.x) :
1639 				make_int3(cmp_result.x, cmp_result.y, max(cmp_result.z, results[i]));
1640 				d_result[dst_idx + IMUL(i, num2)] = max(results[i], 0);
1641 			}else
1642 			{
1643 				break;
1644 			}
1645 		}
1646 		d_temp[ IMUL(blockIdx.y, num2) + idx2] = cmp_result;
1647 	}else
1648 	{
1649 #pragma unroll
1650 		for(int i = 0; i < MULT_BLOCK_DIMY; ++i)
1651 		{
1652 			if(idx1 + i < num1) d_result[dst_idx + IMUL(i, num2)] = max(results[i], 0);
1653 			else break;
1654 		}
1655 	}
1656 
1657 }
1658 
1659 
MultiplyDescriptorG(CuTexImage * des1,CuTexImage * des2,CuTexImage * loc1,CuTexImage * loc2,CuTexImage * texDot,CuTexImage * texCRT,float * H,float hdistmax,float * F,float fdistmax)1660 void ProgramCU::MultiplyDescriptorG(CuTexImage* des1, CuTexImage* des2,
1661 		CuTexImage* loc1, CuTexImage* loc2, CuTexImage* texDot, CuTexImage* texCRT,
1662 		float* H, float hdistmax, float* F, float fdistmax)
1663 {
1664 	int num1 = des1->GetImgWidth() / 8;
1665 	int num2 = des2->GetImgWidth() / 8;
1666 	Matrix33 MatF, MatH;
1667 	//copy the matrix
1668 	memcpy(MatF.mat, F, 9 * sizeof(float));
1669 	memcpy(MatH.mat, H, 9 * sizeof(float));
1670 	//thread blocks
1671 	dim3 grid(	(num2 + MULT_BLOCK_DIMX - 1)/ MULT_BLOCK_DIMX,
1672 		(num1 + MULT_BLOCK_DIMY - 1)/MULT_BLOCK_DIMY);
1673 	dim3 block(MULT_TBLOCK_DIMX, MULT_TBLOCK_DIMY);
1674 	//intermediate results
1675 	texDot->InitTexture( num2,num1);
1676 	if(texCRT) texCRT->InitTexture( num2, (num1 + MULT_BLOCK_DIMY - 1)/MULT_BLOCK_DIMY, 3);
1677 	loc1->BindTexture(texLoc1);
1678 	loc2->BindTexture(texLoc2);
1679 	des1->BindTexture(texDes1);
1680 	des2->BindTexture(texDes2);
1681 	MultiplyDescriptorG_Kernel<<<grid, block>>>((int*)texDot->_cuData, num1, num2,
1682 												(texCRT? (int3*)texCRT->_cuData : NULL),
1683 												MatH, hdistmax, MatF, fdistmax);
1684 }
1685 
1686 
1687 texture<int,  1, cudaReadModeElementType> texDOT;
1688 
1689 #define ROWMATCH_BLOCK_WIDTH 32
1690 #define ROWMATCH_BLOCK_HEIGHT 1
1691 
RowMatch_Kernel(int * d_dot,int * d_result,int num2,float distmax,float ratiomax)1692 void __global__  RowMatch_Kernel(int*d_dot, int* d_result, int num2, float distmax, float ratiomax)
1693 {
1694 #if ROWMATCH_BLOCK_HEIGHT == 1
1695 	__shared__ int dotmax[ROWMATCH_BLOCK_WIDTH];
1696 	__shared__ int dotnxt[ROWMATCH_BLOCK_WIDTH];
1697 	__shared__ int dotidx[ROWMATCH_BLOCK_WIDTH];
1698 	int	row = blockIdx.y;
1699 #else
1700 	__shared__ int x_dotmax[ROWMATCH_BLOCK_HEIGHT][ROWMATCH_BLOCK_WIDTH];
1701 	__shared__ int x_dotnxt[ROWMATCH_BLOCK_HEIGHT][ROWMATCH_BLOCK_WIDTH];
1702 	__shared__ int x_dotidx[ROWMATCH_BLOCK_HEIGHT][ROWMATCH_BLOCK_WIDTH];
1703 	int*	dotmax = x_dotmax[threadIdx.y];
1704 	int*	dotnxt = x_dotnxt[threadIdx.y];
1705 	int*	dotidx = x_dotidx[threadIdx.y];
1706 	int row = IMUL(blockIdx.y, ROWMATCH_BLOCK_HEIGHT) + threadIdx.y;
1707 #endif
1708 
1709 	int base_address = IMUL(row , num2);
1710 	int t_dotmax = 0, t_dotnxt = 0, t_dotidx = -1;
1711 	for(int i = 0; i < num2; i += ROWMATCH_BLOCK_WIDTH)
1712 	{
1713 		if(threadIdx.x + i < num2)
1714 		{
1715 			int v = d_dot[base_address + threadIdx.x + i];  // tex1Dfetch(texDOT, base_address + threadIdx.x + i);
1716 			bool test = v > t_dotmax;
1717 			t_dotnxt = test? t_dotmax : max(t_dotnxt, v);
1718 			t_dotidx = test? (threadIdx.x + i) : t_dotidx;
1719 			t_dotmax = test? v: t_dotmax;
1720 		}
1721 		__syncthreads();
1722 	}
1723 	dotmax[threadIdx.x] = t_dotmax;
1724 	dotnxt[threadIdx.x] = t_dotnxt;
1725 	dotidx[threadIdx.x] = t_dotidx;
1726 	__syncthreads();
1727 
1728 #pragma unroll
1729 	for(int step = ROWMATCH_BLOCK_WIDTH/2; step >0; step /= 2)
1730 	{
1731 		if(threadIdx.x < step)
1732 		{
1733 			int v1 = dotmax[threadIdx.x], v2 = dotmax[threadIdx.x + step];
1734 			bool test =  v2 > v1;
1735 			dotnxt[threadIdx.x] = test? max(v1, dotnxt[threadIdx.x + step]) :max(dotnxt[threadIdx.x], v2);
1736 			dotidx[threadIdx.x] = test? dotidx[threadIdx.x + step] : dotidx[threadIdx.x];
1737 			dotmax[threadIdx.x] = test? v2 : v1;
1738 		}
1739 		__syncthreads();
1740 	}
1741 	if(threadIdx.x == 0)
1742 	{
1743 		float dist =  acos(min(dotmax[0] * 0.000003814697265625f, 1.0));
1744 		float distn = acos(min(dotnxt[0] * 0.000003814697265625f, 1.0));
1745 		//float ratio = dist / distn;
1746 		d_result[row] = (dist < distmax) && (dist < distn * ratiomax) ? dotidx[0] : -1;//?  : -1;
1747 	}
1748 
1749 }
1750 
1751 
GetRowMatch(CuTexImage * texDot,CuTexImage * texMatch,float distmax,float ratiomax)1752 void ProgramCU::GetRowMatch(CuTexImage* texDot, CuTexImage* texMatch, float distmax, float ratiomax)
1753 {
1754 	int num1 = texDot->GetImgHeight();
1755 	int num2 = texDot->GetImgWidth();
1756 	dim3 grid(1, num1/ROWMATCH_BLOCK_HEIGHT);
1757 	dim3 block(ROWMATCH_BLOCK_WIDTH, ROWMATCH_BLOCK_HEIGHT);
1758 	// texDot->BindTexture(texDOT);
1759 	RowMatch_Kernel<<<grid, block>>>((int*)texDot->_cuData,
1760 		(int*)texMatch->_cuData, num2, distmax, ratiomax);
1761 }
1762 
1763 #define COLMATCH_BLOCK_WIDTH 32
1764 
1765 //texture<int3,  1, cudaReadModeElementType> texCT;
1766 
ColMatch_Kernel(int3 * d_crt,int * d_result,int height,int num2,float distmax,float ratiomax)1767 void __global__  ColMatch_Kernel(int3*d_crt, int* d_result, int height, int num2, float distmax, float ratiomax)
1768 {
1769 	int col = COLMATCH_BLOCK_WIDTH * blockIdx.x + threadIdx.x;
1770 	if(col >= num2) return;
1771 	int3 result = d_crt[col];//tex1Dfetch(texCT, col);
1772 	int read_idx = col + num2;
1773 	for(int i = 1; i < height; ++i, read_idx += num2)
1774 	{
1775 		int3 temp = d_crt[read_idx];//tex1Dfetch(texCT, read_idx);
1776 		result = result.x < temp.x?
1777 			make_int3(temp.x, temp.y, max(result.x, temp.z)) :
1778 			make_int3(result.x, result.y, max(result.z, temp.x));
1779 	}
1780 
1781 	float dist =  acos(min(result.x * 0.000003814697265625f, 1.0));
1782 	float distn = acos(min(result.z * 0.000003814697265625f, 1.0));
1783 		//float ratio = dist / distn;
1784 	d_result[col] = (dist < distmax) && (dist < distn * ratiomax) ? result.y : -1;//?  : -1;
1785 
1786 }
1787 
GetColMatch(CuTexImage * texCRT,CuTexImage * texMatch,float distmax,float ratiomax)1788 void ProgramCU::GetColMatch(CuTexImage* texCRT, CuTexImage* texMatch, float distmax, float ratiomax)
1789 {
1790 	int height = texCRT->GetImgHeight();
1791 	int num2 = texCRT->GetImgWidth();
1792 	//texCRT->BindTexture(texCT);
1793     dim3 grid((num2 + COLMATCH_BLOCK_WIDTH -1) / COLMATCH_BLOCK_WIDTH);
1794     dim3 block(COLMATCH_BLOCK_WIDTH);
1795 	ColMatch_Kernel<<<grid, block>>>((int3*)texCRT->_cuData, (int*) texMatch->_cuData, height, num2, distmax, ratiomax);
1796 }
1797 
1798 #endif
1799