1 //////////////////////////////////////////////////////////////////////////////
2 //	File:		ProgramCL.cpp
3 //	Author:		Changchang Wu
4 //	Description :	implementation of CL related class.
5 //		            class ProgramCL			A simple wrapper of Cg programs
6 //
7 //	Copyright (c) 2007 University of North Carolina at Chapel Hill
8 //	All Rights Reserved
9 //
10 //	Permission to use, copy, modify and distribute this software and its
11 //	documentation for educational, research and non-profit purposes, without
12 //	fee, and without a written agreement is hereby granted, provided that the
13 //	above copyright notice and the following paragraph appear in all copies.
14 //
15 //	The University of North Carolina at Chapel Hill make no representations
16 //	about the suitability of this software for any purpose. It is provided
17 //	'as is' without express or implied warranty.
18 //
19 //	Please send BUG REPORTS to ccwu@cs.unc.edu
20 //
21 ////////////////////////////////////////////////////////////////////////////
22 
23 #if defined(CL_SIFTGPU_ENABLED)
24 
25 #include <CL/opencl.h>
26 #include <GL/glew.h>
27 
28 #include <iostream>
29 #include <iomanip>
30 #include <vector>
31 #include <strstream>
32 #include <algorithm>
33 #include <stdlib.h>
34 #include <math.h>
35 #include <string.h>
36 using namespace std;
37 
38 #include "GlobalUtil.h"
39 #include "CLTexImage.h"
40 #include "ProgramCL.h"
41 #include "SiftGPU.h"
42 
43 
44 #if  defined(_WIN32)
45 	#pragma comment (lib, "OpenCL.lib")
46 #endif
47 
48 #ifndef _INC_WINDOWS
49 #ifndef WIN32_LEAN_AND_MEAN
50 	#define WIN32_LEAN_AND_MEAN
51 #endif
52 #include <windows.h>
53 #endif
54 
55 //////////////////////////////////////////////////////////////////////
56 // Construction/Destruction
57 //////////////////////////////////////////////////////////////////////
58 
ProgramCL()59 ProgramCL::ProgramCL()
60 {
61 	_program = NULL;
62     _kernel = NULL;
63     _valid = 0;
64 }
65 
~ProgramCL()66 ProgramCL::~ProgramCL()
67 {
68     if(_kernel)  clReleaseKernel(_kernel);
69     if(_program) clReleaseProgram(_program);
70 }
71 
ProgramCL(const char * name,const char * code,cl_context context,cl_device_id device)72 ProgramCL::ProgramCL(const char* name, const char * code, cl_context context, cl_device_id device) : _valid(1)
73 {
74     const char * src[1] = {code};     cl_int status;
75 
76     _program = clCreateProgramWithSource(context, 1, src, NULL, &status);
77     if(status != CL_SUCCESS) _valid = 0;
78 
79     status = clBuildProgram(_program, 0, NULL,
80         GlobalUtil::_debug ?
81         "-cl-fast-relaxed-math -cl-single-precision-constant -cl-nv-verbose" :
82         "-cl-fast-relaxed-math -cl-single-precision-constant", NULL, NULL);
83 
84     if(status != CL_SUCCESS) {PrintBuildLog(device, 1); _valid = 0;}
85     else if(GlobalUtil::_debug) PrintBuildLog(device, 0);
86 
87     _kernel = clCreateKernel(_program, name, &status);
88     if(status != CL_SUCCESS) _valid = 0;
89 }
90 
PrintBuildLog(cl_device_id device,int all)91 void ProgramCL::PrintBuildLog(cl_device_id device, int all)
92 {
93     char buffer[10240] = "\0";
94     cl_int status = clGetProgramBuildInfo(
95         _program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, NULL);
96     if(all )
97     {
98         std::cerr << buffer  << endl;
99     }else
100     {
101         const char * pos = strstr(buffer, "ptxas");
102         if(pos) std::cerr << pos << endl;
103     }
104 }
105 
106 ///////////////////////////////////////////////////////////////////////////////////
107 /////////////////////////////////PACKED VERSION?///////////////////////////////////
108 
ProgramBagCL()109 ProgramBagCL::ProgramBagCL()
110 {
111     ////////////////////////////////////
112     _context = NULL;   _queue = NULL;
113     s_gray = s_sampling = NULL;
114     s_packup = s_zero_pass = NULL;
115     s_gray_pack = s_unpack = NULL;
116     s_sampling_u = NULL;
117     s_dog_pass   = NULL;
118     s_grad_pass  = NULL;
119     s_grad_pass2 = NULL;
120     s_unpack_dog = NULL;
121     s_unpack_grd = NULL;
122     s_unpack_key = NULL;
123     s_keypoint = NULL;
124     f_gaussian_skip0 = NULL;
125     f_gaussian_skip1 = NULL;
126     f_gaussian_step = 0;
127 
128     ////////////////////////////////
129 	GlobalUtil::StartTimer("Initialize OpenCL");
130     if(!InitializeContext()) return;
131     GlobalUtil::StopTimer();
132 
133 }
134 
135 
136 
~ProgramBagCL()137 ProgramBagCL::~ProgramBagCL()
138 {
139     if(s_gray)      delete s_gray;
140 	if(s_sampling)  delete s_sampling;
141 	if(s_zero_pass) delete s_zero_pass;
142     if(s_packup)    delete s_packup;
143     if(s_unpack)    delete s_unpack;
144     if(s_gray_pack) delete s_gray_pack;
145     if(s_sampling_u)  delete s_sampling_u;
146     if(s_dog_pass)  delete s_dog_pass;
147     if(s_grad_pass)  delete s_grad_pass;
148     if(s_grad_pass2)  delete s_grad_pass2;
149     if(s_unpack_dog) delete s_unpack_dog;
150     if(s_unpack_grd) delete s_unpack_grd;
151     if(s_unpack_key) delete s_unpack_key;
152     if(s_keypoint)   delete s_keypoint;
153 
154     if(f_gaussian_skip1) delete f_gaussian_skip1;
155 
156     for(unsigned int i = 0; i < f_gaussian_skip0_v.size(); i++)
157     {
158 	    if(f_gaussian_skip0_v[i]) delete f_gaussian_skip0_v[i];
159     }
160     if(f_gaussian_step && _gaussian_step_num > 0)
161     {
162 	    for(int i = 0; i< _gaussian_step_num; i++)
163 	    {
164 		    delete f_gaussian_step[i];
165 	    }
166 	    delete[] f_gaussian_step;
167     }
168 
169     //////////////////////////////////////
170     if(_context) clReleaseContext(_context);
171     if(_queue) clReleaseCommandQueue(_queue);
172 }
173 
InitializeContext()174 bool ProgramBagCL::InitializeContext()
175 {
176     cl_uint num_platform, num_device;
177     cl_int status;
178     // Get OpenCL platform count
179     status = clGetPlatformIDs (0, NULL, &num_platform);
180     if (status != CL_SUCCESS || num_platform == 0) return false;
181 
182     cl_platform_id platforms[16];
183     if(num_platform > 16 ) num_platform = 16;
184     status = clGetPlatformIDs (num_platform, platforms, NULL);
185     _platform = platforms[0];
186 
187     ///////////////////////////////
188     status = clGetDeviceIDs(_platform, CL_DEVICE_TYPE_GPU, 0, NULL, &num_device);
189     if(status != CL_SUCCESS || num_device == 0) return false;
190 
191     // Create the device list
192     cl_device_id* devices = new cl_device_id [num_device];
193     status = clGetDeviceIDs(_platform, CL_DEVICE_TYPE_GPU, num_device, devices, NULL);
194     _device = (status == CL_SUCCESS? devices[0] : 0);  delete[] devices;
195     if(status != CL_SUCCESS)  return false;
196 
197 
198     if(GlobalUtil::_verbose)
199     {
200         cl_device_mem_cache_type is_gcache;
201         clGetDeviceInfo(_device, CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, sizeof(is_gcache), &is_gcache, NULL);
202         if(is_gcache == CL_NONE) std::cout << "No cache for global memory\n";
203         //else if(is_gcache == CL_READ_ONLY_CACHE) std::cout << "Read only cache for global memory\n";
204         //else std::cout << "Read/Write cache for global memory\n";
205     }
206 
207     //context;
208     if(GlobalUtil::_UseSiftGPUEX)
209     {
210         cl_context_properties prop[] = {
211         CL_CONTEXT_PLATFORM, (cl_context_properties)_platform,
212         CL_GL_CONTEXT_KHR, (cl_context_properties)wglGetCurrentContext(),
213         CL_WGL_HDC_KHR, (cl_context_properties)wglGetCurrentDC(),  0 };
214         _context = clCreateContext(prop, 1, &_device, NULL, NULL, &status);
215         if(status != CL_SUCCESS) return false;
216     }else
217     {
218         _context = clCreateContext(0, 1, &_device, NULL, NULL, &status);
219         if(status != CL_SUCCESS) return false;
220     }
221 
222     //command queue
223     _queue = clCreateCommandQueue(_context, _device, 0, &status);
224     return status == CL_SUCCESS;
225 }
226 
InitProgramBag(SiftParam & param)227 void ProgramBagCL::InitProgramBag(SiftParam&param)
228 {
229 	GlobalUtil::StartTimer("Load Programs");
230     LoadFixedShaders();
231     LoadDynamicShaders(param);
232     if(GlobalUtil::_UseSiftGPUEX) LoadDisplayShaders();
233     GlobalUtil::StopTimer();
234 }
235 
236 
UnloadProgram()237 void ProgramBagCL::UnloadProgram()
238 {
239 
240 }
241 
FinishCL()242 void ProgramBagCL::FinishCL()
243 {
244     clFinish(_queue);
245 }
246 
LoadFixedShaders()247 void ProgramBagCL::LoadFixedShaders()
248 {
249 
250 
251     s_gray = new ProgramCL( "gray",
252         "__kernel void gray(__read_only  image2d_t input, __write_only image2d_t output) {\n"
253         "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
254         "int2 coord = (int2)(get_global_id(0),  get_global_id(1));\n"
255         "float4 weight = (float4)(0.299, 0.587, 0.114, 0.0);\n"
256         "float intensity = dot(weight, read_imagef(input,sampler, coord ));\n"
257 	    "float4 result= (float4)(intensity, intensity, intensity, 1.0);\n"
258         "write_imagef(output, coord, result); }", _context, _device	);
259 
260 
261 	s_sampling = new ProgramCL("sampling",
262         "__kernel void sampling(__read_only  image2d_t input, __write_only image2d_t output,\n"
263         "                   int width, int height) {\n"
264         "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
265         "int x = get_global_id(0), y =  get_global_id(1); \n"
266         "if( x >= width || y >= height) return;\n"
267         "int xa = x + x,   ya = y + y; \n"
268         "int xb = xa + 1,  yb = ya + 1; \n"
269         "float v1 = read_imagef(input, sampler, (int2) (xa, ya)).x; \n"
270         "float v2 = read_imagef(input, sampler, (int2) (xb, ya)).x; \n"
271         "float v3 = read_imagef(input, sampler, (int2) (xa, yb)).x; \n"
272         "float v4 = read_imagef(input, sampler, (int2) (xb, yb)).x; \n"
273         "float4 result = (float4) (v1, v2, v3, v4);"
274         "write_imagef(output, (int2) (x, y), result); }"  , _context, _device);
275 
276 	s_sampling_k = new ProgramCL("sampling_k",
277         "__kernel void sampling_k(__read_only  image2d_t input, __write_only image2d_t output, "
278         "                   int width, int height,\n"
279         "                   int step,  int halfstep) {\n"
280         "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
281         "int x = get_global_id(0), y =  get_global_id(1); \n"
282         "if( x >= width || y >= height) return;\n"
283         "int xa = x * step,   ya = y *step; \n"
284         "int xb = xa + halfstep,  yb = ya + halfstep; \n"
285         "float v1 = read_imagef(input, sampler, (int2) (xa, ya)).x; \n"
286         "float v2 = read_imagef(input, sampler, (int2) (xb, ya)).x; \n"
287         "float v3 = read_imagef(input, sampler, (int2) (xa, yb)).x; \n"
288         "float v4 = read_imagef(input, sampler, (int2) (xb, yb)).x; \n"
289         "float4 result = (float4) (v1, v2, v3, v4);"
290         "write_imagef(output, (int2) (x, y), result); }"  , _context, _device);
291 
292 
293 	s_sampling_u = new ProgramCL("sampling_u",
294         "__kernel void sampling_u(__read_only  image2d_t input, \n"
295         "                   __write_only image2d_t output,\n"
296         "                   int width, int height,\n"
297         "                   float step, float halfstep) {\n"
298         "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;\n"
299         "int x = get_global_id(0), y =  get_global_id(1); \n"
300         "if( x >= width || y >= height) return;\n"
301         "float xa = x * step,       ya = y *step; \n"
302         "float xb = xa + halfstep,  yb = ya + halfstep; \n"
303         "float v1 = read_imagef(input, sampler, (float2) (xa, ya)).x; \n"
304         "float v2 = read_imagef(input, sampler, (float2) (xb, ya)).x; \n"
305         "float v3 = read_imagef(input, sampler, (float2) (xa, yb)).x; \n"
306         "float v4 = read_imagef(input, sampler, (float2) (xb, yb)).x; \n"
307         "float4 result = (float4) (v1, v2, v3, v4);"
308         "write_imagef(output, (int2) (x, y), result); }"  , _context, _device);
309 
310 
311     s_zero_pass = new ProgramCL("zero_pass",
312         "__kernel void zero_pass(__write_only image2d_t output){\n"
313         "int2 coord = (int2)(get_global_id(0),  get_global_id(1));\n"
314         "write_imagef(output, coord, (float4)(0.0));}", _context, _device);
315 
316     s_packup = new ProgramCL("packup",
317         "__kernel void packup(__global float* input, __write_only image2d_t output,\n"
318         "                   int twidth, int theight, int width){\n"
319         "int2 coord = (int2)(get_global_id(0),  get_global_id(1));\n"
320         "if(coord.x >= twidth || coord.y >= theight) return;\n"
321         "int index0 = (coord.y + coord.y) * width; \n"
322         "int index1 = index0 + coord.x;\n"
323         "int x2 = min(width -1, coord.x); \n"
324         "float v1 = input[index1 + coord.x], v2 = input[index1 + x2]; \n"
325         "int index2 = index1 + width; \n"
326         "float v3 = input[index2 + coord.x], v4 = input[index2 + x2]; \n "
327         "write_imagef(output, coord, (float4) (v1, v2, v3, v4));}", _context, _device);
328 
329    s_dog_pass = new ProgramCL("dog_pass",
330         "__kernel void dog_pass(__read_only image2d_t tex,  __read_only image2d_t texp,\n"
331         "                   __write_only image2d_t dog, int width, int height) {\n"
332         "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |\n"
333         "                    CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
334         "int2 coord = (int2)(get_global_id(0), get_global_id(1)); \n"
335         "if( coord.x >= width || coord.y >= height) return;\n"
336         "float4 cc = read_imagef(tex , sampler, coord); \n"
337         "float4 cp = read_imagef(texp, sampler, coord);\n"
338         "write_imagef(dog, coord, cc - cp); }\n", _context, _device);
339 
340    s_grad_pass = new ProgramCL("grad_pass",
341         "__kernel void grad_pass(__read_only image2d_t tex,  __read_only image2d_t texp,\n"
342         "                   __write_only image2d_t dog, int width, int height,\n"
343         "                    __write_only image2d_t grad, __write_only image2d_t rot) {\n"
344         "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |\n"
345         "                    CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
346         "int x = get_global_id(0), y =  get_global_id(1); \n"
347         "if( x >= width || y >= height) return;\n"
348         "int2 coord = (int2) (x, y);\n"
349         "float4 cc = read_imagef(tex , sampler, coord); \n"
350         "float4 cp = read_imagef(texp, sampler, coord);\n"
351         "float2 cl = read_imagef(tex, sampler, (int2)(x - 1, y)).yw;\n"
352         "float2 cr = read_imagef(tex, sampler, (int2)(x + 1, y)).xz;\n"
353 	    "float2 cd = read_imagef(tex, sampler, (int2)(x, y - 1)).zw;\n"
354         "float2 cu = read_imagef(tex, sampler, (int2)(x, y + 1)).xy;\n"
355         "write_imagef(dog, coord, cc - cp); \n"
356         "float4 dx = (float4)(cc.y - cl.x,  cr.x - cc.x, cc.w - cl.y, cr.y - cc.z);\n"
357         "float4 dy = (float4)(cc.zw - cd.xy, cu.xy - cc.xy);\n"
358         "write_imagef(grad, coord, 0.5 * sqrt(dx*dx + dy * dy));\n"
359         "write_imagef(rot, coord, atan2(dy, dx + (float4) (FLT_MIN)));}\n", _context, _device);
360 
361     s_grad_pass2 = new ProgramCL("grad_pass2",
362         "#define BLOCK_DIMX 32\n"
363         "#define BLOCK_DIMY 14\n"
364         "#define BLOCK_SIZE (BLOCK_DIMX * BLOCK_DIMY)\n"
365         "__kernel void grad_pass2(__read_only image2d_t tex,  __read_only image2d_t texp,\n"
366         "                   __write_only image2d_t dog, int width, int height,\n"
367         "                   __write_only image2d_t grd, __write_only image2d_t rot){\n"//,  __local float* block) {\n"
368         "__local float block[BLOCK_SIZE * 4]; \n"
369         "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |\n"
370         "                    CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
371         "int2 coord = (int2) (  get_global_id(0) - get_group_id(0) * 2 - 1, \n"
372         "                       get_global_id(1) - get_group_id(1) * 2- 1); \n"
373         "int idx =  mad24(get_local_id(1), BLOCK_DIMX, get_local_id(0));\n"
374         "float4 cc = read_imagef(tex, sampler, coord);\n"
375         "block[idx                 ] = cc.x;\n"
376         "block[idx + BLOCK_SIZE    ] = cc.y;\n"
377         "block[idx + BLOCK_SIZE * 2] = cc.z;\n"
378         "block[idx + BLOCK_SIZE * 3] = cc.w;\n"
379         "barrier(CLK_LOCAL_MEM_FENCE);\n"
380         "if( get_local_id(0) == 0 || get_local_id(0) == BLOCK_DIMX - 1) return;\n"
381         "if( get_local_id(1) == 0 || get_local_id(1) == BLOCK_DIMY - 1) return;\n"
382         "if( coord.x >= width) return; \n"
383         "if( coord.y >= height) return;\n"
384         "float4 cp = read_imagef(texp, sampler, coord);\n"
385         "float4 dx = (float4)(  cc.y - block[idx - 1 + BLOCK_SIZE], \n"
386         "                       block[idx + 1] - cc.x, \n"
387         "                       cc.w - block[idx - 1 + 3 * BLOCK_SIZE], \n"
388         "                       block[idx + 1 + 2 * BLOCK_SIZE] - cc.z);\n"
389         "float4 dy = (float4)(  cc.z - block[idx - BLOCK_DIMX + 2 * BLOCK_SIZE], \n"
390         "                       cc.w - block[idx - BLOCK_DIMX + 3 * BLOCK_SIZE],"
391         //"                       cc.zw - block[idx - BLOCK_DIMX].zw, \n"
392         "                       block[idx + BLOCK_DIMX] - cc.x,\n "
393         "                       block[idx + BLOCK_DIMX + BLOCK_SIZE] - cc.y);\n"
394         //"                       block[idx + BLOCK_DIMX].xy - cc.xy);\n"
395         "write_imagef(dog, coord, cc - cp); \n"
396         "write_imagef(grd, coord, 0.5 * sqrt(dx*dx + dy * dy));\n"
397         "write_imagef(rot, coord, atan2(dy, dx + (float4) (FLT_MIN)));}\n", _context, _device);
398 }
399 
LoadDynamicShaders(SiftParam & param)400 void ProgramBagCL::LoadDynamicShaders(SiftParam& param)
401 {
402     LoadKeypointShader();
403     LoadGenListShader(param._dog_level_num, 0);
404     CreateGaussianFilters(param);
405 }
406 
407 
SelectInitialSmoothingFilter(int octave_min,SiftParam & param)408 void ProgramBagCL::SelectInitialSmoothingFilter(int octave_min, SiftParam&param)
409 {
410     float sigma = param.GetInitialSmoothSigma(octave_min);
411     if(sigma == 0)
412     {
413        f_gaussian_skip0 = NULL;
414     }else
415     {
416 	    for(unsigned int i = 0; i < f_gaussian_skip0_v.size(); i++)
417 	    {
418 		    if(f_gaussian_skip0_v[i]->_id == octave_min)
419 		    {
420 			    f_gaussian_skip0 = f_gaussian_skip0_v[i];
421 			    return ;
422 		    }
423 	    }
424 	    FilterCL * filter = CreateGaussianFilter(sigma);
425 	    filter->_id = octave_min;
426 	    f_gaussian_skip0_v.push_back(filter);
427 	    f_gaussian_skip0 = filter;
428     }
429 
430 }
431 
CreateGaussianFilters(SiftParam & param)432 void ProgramBagCL::CreateGaussianFilters(SiftParam&param)
433 {
434 	if(param._sigma_skip0>0.0f)
435 	{
436 		f_gaussian_skip0 = CreateGaussianFilter(param._sigma_skip0);
437 		f_gaussian_skip0->_id = GlobalUtil::_octave_min_default;
438 		f_gaussian_skip0_v.push_back(f_gaussian_skip0);
439 	}
440 	if(param._sigma_skip1>0.0f)
441 	{
442 		f_gaussian_skip1 = CreateGaussianFilter(param._sigma_skip1);
443 	}
444 
445 	f_gaussian_step = new FilterCL*[param._sigma_num];
446 	for(int i = 0; i< param._sigma_num; i++)
447 	{
448 		f_gaussian_step[i] =  CreateGaussianFilter(param._sigma[i]);
449 	}
450     _gaussian_step_num = param._sigma_num;
451 }
452 
453 
CreateGaussianFilter(float sigma)454 FilterCL* ProgramBagCL::CreateGaussianFilter(float sigma)
455 {
456 	//pixel inside 3*sigma box
457 	int sz = int( ceil( GlobalUtil::_FilterWidthFactor * sigma -0.5) ) ;//
458 	int width = 2*sz + 1;
459 
460 	//filter size truncation
461 	if(GlobalUtil::_MaxFilterWidth >0 && width > GlobalUtil::_MaxFilterWidth)
462 	{
463 		std::cout<<"Filter size truncated from "<<width<<" to "<<GlobalUtil::_MaxFilterWidth<<endl;
464 		sz = GlobalUtil::_MaxFilterWidth>>1;
465 		width = 2 * sz + 1;
466 	}
467 
468 	int i;
469 	float * kernel = new float[width];
470 	float   rv = 1.0f/(sigma*sigma);
471 	float   v, ksum =0;
472 
473 	// pre-compute filter
474 	for( i = -sz ; i <= sz ; ++i)
475 	{
476 		kernel[i+sz] =  v = exp(-0.5f * i * i *rv) ;
477 		ksum += v;
478 	}
479 
480 	//normalize the kernel
481 	rv = 1.0f / ksum;
482 	for(i = 0; i< width ;i++) kernel[i]*=rv;
483 
484     FilterCL * filter = CreateFilter(kernel, width);
485     delete [] kernel;
486     if(GlobalUtil::_verbose && GlobalUtil::_timingL) std::cout<<"Filter: sigma = "<<sigma<<", size = "<<width<<"x"<<width<<endl;
487     return filter;
488 }
489 
CreateFilter(float kernel[],int width)490 FilterCL*  ProgramBagCL::CreateFilter(float kernel[], int width)
491 {
492     FilterCL * filter = new FilterCL;
493     filter->s_shader_h = CreateFilterH(kernel, width);
494     filter->s_shader_v = CreateFilterV(kernel, width);
495     filter->_size = width;
496     filter->_id  = 0;
497     return filter;
498 }
499 
CreateFilterH(float kernel[],int width)500 ProgramCL* ProgramBagCL::CreateFilterH(float kernel[], int width)
501 {
502 	int halfwidth  = width >>1;
503 	float * pf = kernel + halfwidth;
504 	int nhpixel = (halfwidth+1)>>1;	//how many neighbour pixels need to be looked up
505 	int npixel  = (nhpixel<<1)+1;//
506 	float weight[3];
507 
508     ////////////////////////////
509 	char buffer[10240];
510 	ostrstream out(buffer, 10240);
511 	out<<setprecision(8);
512 
513 
514     //CL_DEVICE_IMAGE2D_MAX_WIDTH
515 	out<<
516           "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;"
517           "__kernel void filter_h(__read_only  image2d_t input, \n"
518           "          __write_only image2d_t output, int width_, int height_) {\n"
519           "int x = get_global_id(0);\n"
520           "int y = get_global_id(1); \n"
521           "if( x > width_ || y > height_) return; \n"
522           "float4 pc; int2 coord; \n"
523           "float4 result = (float4)(0.0);\n";
524     for(int i = 0 ; i < npixel ; i++)
525 	{
526 		out<<"coord = (int2)(x + ("<< (i - nhpixel) << "), y);\n";
527 		out<<"pc= read_imagef(input, sampler, coord);\n";
528 		if(GlobalUtil::_PreciseBorder)
529         out<<"if(coord.x < 0) pc = pc.xxzz; else if (coord.x > width_) pc = pc.yyww; \n";
530 		//for each sub-pixel j  in center, the weight of sub-pixel k
531 		int xw = (i - nhpixel)*2;
532 		for(int j = 0; j < 3; j++)
533 		{
534 			int xwn = xw  + j  -1;
535 			weight[j] = xwn < -halfwidth || xwn > halfwidth? 0 : pf[xwn];
536 		}
537 		if(weight[1] == 0.0)
538 		{
539 			out<<"result += (float4)("<<weight[2]<<","<<weight[0]<<","<<weight[2]<<","<<weight[0]<<") * pc.yxwz;\n";
540 		}
541 		else
542 		{
543 			out<<"result += (float4)("<<weight[1]<<", "<<weight[0]<<", "<<weight[1]<<", "<<weight[0]<<") * pc.xxzz;\n";
544 			out<<"result += (float4)("<<weight[2]<<", "<<weight[1]<<", "<<weight[2]<<", "<<weight[1]<<") * pc.yyww;\n";
545 		}
546 	}
547     out << "write_imagef(output, (int2)(x, y), result); }\n" << '\0';
548 	return new ProgramCL("filter_h", buffer, _context, _device);
549 }
550 
551 
552 
CreateFilterV(float kernel[],int width)553 ProgramCL* ProgramBagCL::CreateFilterV(float kernel[], int width)
554 {
555 
556 	int halfwidth  = width >>1;
557 	float * pf = kernel + halfwidth;
558 	int nhpixel = (halfwidth+1)>>1;	//how many neighbour pixels need to be looked up
559 	int npixel  = (nhpixel<<1)+1;//
560 	float weight[3];
561 
562     ////////////////////////////
563 	char buffer[10240];
564 	ostrstream out(buffer, 10240);
565 	out<<setprecision(8);
566 
567 
568     //CL_DEVICE_IMAGE2D_MAX_WIDTH
569 	out<<
570           "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;"
571           "__kernel void filter_v(__read_only  image2d_t input, \n"
572           "          __write_only image2d_t output, int width_, int height_) {\n"
573           "int x = get_global_id(0);\n"
574           "int y = get_global_id(1); \n"
575           "if( x > width_ || y >= height_) return; \n"
576           "float4 pc; int2 coord; \n"
577           "float4 result = (float4)(0.0);\n";
578     for(int i = 0 ; i < npixel ; i++)
579 	{
580 		out<<"coord = (int2)(x, y + ("<< (i - nhpixel) << "));\n";
581 		out<<"pc= read_imagef(input, sampler, coord);\n";
582 		if(GlobalUtil::_PreciseBorder)
583         out<<"if(coord.y < 0) pc = pc.xyxy; else if (coord.y > height_) pc = pc.zwzw; \n";
584 		//for each sub-pixel j  in center, the weight of sub-pixel k
585 		int xw = (i - nhpixel)*2;
586 		for(int j = 0; j < 3; j++)
587 		{
588 			int xwn = xw  + j  -1;
589 			weight[j] = xwn < -halfwidth || xwn > halfwidth? 0 : pf[xwn];
590 		}
591 		if(weight[1] == 0.0)
592 		{
593 			out<<"result += (float4)("<<weight[2]<<","<<weight[2]<<","<<weight[0]<<","<<weight[0]<<") * pc.zwxy;\n";
594 		}
595 		else
596 		{
597 			out<<"result += (float4)("<<weight[1]<<", "<<weight[1]<<", "<<weight[0]<<", "<<weight[0]<<") * pc.xyxy;\n";
598 			out<<"result += (float4)("<<weight[2]<<", "<<weight[2]<<", "<<weight[1]<<", "<<weight[1]<<") * pc.zwzw;\n";
599 		}
600 	}
601     out << "write_imagef(output, (int2)(x, y), result); }\n" << '\0';
602 	return new ProgramCL("filter_v", buffer, _context, _device);
603 
604 }
605 
FilterImage(FilterCL * filter,CLTexImage * dst,CLTexImage * src,CLTexImage * tmp)606 void ProgramBagCL::FilterImage(FilterCL* filter, CLTexImage *dst, CLTexImage *src, CLTexImage*tmp)
607 {
608     cl_kernel kernelh = filter->s_shader_h->_kernel;
609     cl_kernel kernelv = filter->s_shader_v->_kernel;
610     //////////////////////////////////////////////////////////////////
611 
612     cl_int status, w = dst->GetImgWidth(), h = dst->GetImgHeight();
613     cl_int w_ = w - 1, h_ = h - 1;
614 
615     size_t dim0 = 16, dim1 = 16;
616     size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1};
617 
618     clSetKernelArg(kernelh, 0, sizeof(cl_mem), &src->_clData);
619     clSetKernelArg(kernelh, 1, sizeof(cl_mem), &tmp->_clData);
620     clSetKernelArg(kernelh, 2, sizeof(cl_int), &w_);
621     clSetKernelArg(kernelh, 3, sizeof(cl_int), &h_);
622     status = clEnqueueNDRangeKernel(_queue, kernelh, 2, NULL, gsz, lsz, 0, NULL, NULL);
623     CheckErrorCL(status, "ProgramBagCL::FilterImageH");
624     if(status != CL_SUCCESS) return;
625 
626     clSetKernelArg(kernelv, 0, sizeof(cl_mem), &tmp->_clData);
627     clSetKernelArg(kernelv, 1, sizeof(cl_mem), &dst->_clData);
628     clSetKernelArg(kernelv, 2, sizeof(cl_int), &w_);
629     clSetKernelArg(kernelv, 3, sizeof(cl_int), &h_);
630     size_t gsz2[2] = {(w + dim1 - 1) / dim1 * dim1, (h + dim0 - 1) / dim0 * dim0}, lsz2[2] = {dim1, dim0};
631     status = clEnqueueNDRangeKernel(_queue, kernelv, 2, NULL, gsz2, lsz2, 0, NULL, NULL);
632     CheckErrorCL(status, "ProgramBagCL::FilterImageV");
633     //clReleaseEvent(event);
634 }
635 
SampleImageU(CLTexImage * dst,CLTexImage * src,int log_scale)636 void ProgramBagCL::SampleImageU(CLTexImage *dst, CLTexImage *src, int log_scale)
637 {
638     cl_kernel  kernel= s_sampling_u->_kernel;
639     float scale  = 1.0f / (1 << log_scale);
640     float offset = scale * 0.5f;
641     cl_int w = dst->GetImgWidth(), h = dst->GetImgHeight();
642     clSetKernelArg(kernel, 0, sizeof(cl_mem), &(src->_clData));
643     clSetKernelArg(kernel, 1, sizeof(cl_mem), &(dst->_clData));
644     clSetKernelArg(kernel, 2, sizeof(cl_int), &(w));
645     clSetKernelArg(kernel, 3, sizeof(cl_int), &(h));
646     clSetKernelArg(kernel, 4, sizeof(cl_float), &(scale));
647     clSetKernelArg(kernel, 5, sizeof(cl_float), &(offset));
648 
649     size_t dim0 = 16, dim1 = 16;
650     //while( w * h / dim0 / dim1 < 8 && dim1 > 1) dim1 /= 2;
651     size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1};
652     cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL);
653     CheckErrorCL(status, "ProgramBagCL::SampleImageU");
654 }
655 
SampleImageD(CLTexImage * dst,CLTexImage * src,int log_scale)656 void ProgramBagCL::SampleImageD(CLTexImage *dst, CLTexImage *src, int log_scale)
657 {
658     cl_kernel  kernel;
659     cl_int w = dst->GetImgWidth(), h = dst->GetImgHeight();
660     if(log_scale == 1)
661     {
662         kernel = s_sampling->_kernel;
663         clSetKernelArg(kernel, 0, sizeof(cl_mem), &(src->_clData));
664         clSetKernelArg(kernel, 1, sizeof(cl_mem), &(dst->_clData));
665         clSetKernelArg(kernel, 2, sizeof(cl_int), &(w));
666         clSetKernelArg(kernel, 3, sizeof(cl_int), &(h));
667     }else
668     {
669         cl_int fullstep = (1 << log_scale);
670         cl_int halfstep = fullstep >> 1;
671         kernel = s_sampling_k->_kernel;
672         clSetKernelArg(kernel, 0, sizeof(cl_mem), &(src->_clData));
673         clSetKernelArg(kernel, 1, sizeof(cl_mem), &(dst->_clData));
674         clSetKernelArg(kernel, 2, sizeof(cl_int), &(w));
675         clSetKernelArg(kernel, 3, sizeof(cl_int), &(h));
676         clSetKernelArg(kernel, 4, sizeof(cl_int), &(fullstep));
677         clSetKernelArg(kernel, 5, sizeof(cl_int), &(halfstep));
678     }
679     size_t dim0 = 128, dim1 = 1;
680     //while( w * h / dim0 / dim1 < 8 && dim1 > 1) dim1 /= 2;
681     size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1};
682     cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL);
683     CheckErrorCL(status, "ProgramBagCL::SampleImageD");
684 }
685 
FilterInitialImage(CLTexImage * tex,CLTexImage * buf)686 void ProgramBagCL::FilterInitialImage(CLTexImage* tex, CLTexImage* buf)
687 {
688     if(f_gaussian_skip0) FilterImage(f_gaussian_skip0, tex, tex, buf);
689 }
690 
FilterSampledImage(CLTexImage * tex,CLTexImage * buf)691 void ProgramBagCL::FilterSampledImage(CLTexImage* tex, CLTexImage* buf)
692 {
693     if(f_gaussian_skip1) FilterImage(f_gaussian_skip1, tex, tex, buf);
694 }
695 
ComputeDOG(CLTexImage * tex,CLTexImage * texp,CLTexImage * dog,CLTexImage * grad,CLTexImage * rot)696 void ProgramBagCL::ComputeDOG(CLTexImage*tex, CLTexImage* texp, CLTexImage* dog, CLTexImage* grad, CLTexImage* rot)
697 {
698     int margin = 0, use_gm2 = 1;
699     bool both_grad_dog = rot->_clData && grad->_clData;
700     cl_int w = tex->GetImgWidth(), h = tex->GetImgHeight();
701     cl_kernel  kernel ; size_t dim0, dim1;
702     if(!both_grad_dog)  {kernel = s_dog_pass->_kernel; dim0 = 16; dim1 = 12; }
703     else if(use_gm2)    {kernel = s_grad_pass2->_kernel; dim0 = 32; dim1 = 14; margin = 2; }
704     else                {kernel = s_grad_pass->_kernel; dim0 = 16; dim1 = 20; }
705     size_t gsz[2] = {   (w + dim0 - 1 - margin) / (dim0 - margin) * dim0,
706                         (h + dim1 - 1 - margin) / (dim1 - margin) * dim1};
707     size_t lsz[2] = {dim0, dim1};
708     clSetKernelArg(kernel, 0, sizeof(cl_mem), &(tex->_clData));
709     clSetKernelArg(kernel, 1, sizeof(cl_mem), &(texp->_clData));
710     clSetKernelArg(kernel, 2, sizeof(cl_mem), &(dog->_clData));
711     clSetKernelArg(kernel, 3, sizeof(cl_int), &(w));
712     clSetKernelArg(kernel, 4, sizeof(cl_int), &(h));
713     if(both_grad_dog)
714     {
715         clSetKernelArg(kernel, 5, sizeof(cl_mem), &(grad->_clData));
716         clSetKernelArg(kernel, 6, sizeof(cl_mem), &(rot->_clData));
717     }
718     ///////////////////////////////////////////////////////
719     cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL);
720     CheckErrorCL(status, "ProgramBagCL::ComputeDOG");
721 }
722 
723 
ComputeKEY(CLTexImage * dog,CLTexImage * key,float Tdog,float Tedge)724 void ProgramBagCL::ComputeKEY(CLTexImage*dog, CLTexImage* key, float Tdog, float Tedge)
725 {
726     cl_kernel  kernel = s_keypoint->_kernel;
727     cl_int w = key->GetImgWidth(), h = key->GetImgHeight();
728 	float threshold0 = Tdog* (GlobalUtil::_SubpixelLocalization?0.8f:1.0f);
729 	float threshold1 = Tdog;
730 	float threshold2 = (Tedge+1)*(Tedge+1)/Tedge;
731 
732     clSetKernelArg(kernel, 0, sizeof(cl_mem), &(dog->_clData));
733     clSetKernelArg(kernel, 1, sizeof(cl_mem), &((dog + 1)->_clData));
734     clSetKernelArg(kernel, 2, sizeof(cl_mem), &((dog - 1)->_clData));
735     clSetKernelArg(kernel, 3, sizeof(cl_mem), &(key->_clData));
736     clSetKernelArg(kernel, 4, sizeof(cl_float), &(threshold0));
737     clSetKernelArg(kernel, 5, sizeof(cl_float), &(threshold1));
738     clSetKernelArg(kernel, 6, sizeof(cl_float), &(threshold2));
739     clSetKernelArg(kernel, 7, sizeof(cl_int), &(w));
740     clSetKernelArg(kernel, 8, sizeof(cl_int), &(h));
741 
742     size_t dim0 = 8, dim1 = 8;
743     //if( w * h / dim0 / dim1 < 16) dim1 /= 2;
744     size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1};
745     cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL);
746     CheckErrorCL(status, "ProgramBagCL::ComputeKEY");
747 }
748 
UnpackImage(CLTexImage * src,CLTexImage * dst)749 void ProgramBagCL::UnpackImage(CLTexImage*src, CLTexImage* dst)
750 {
751     cl_kernel  kernel = s_unpack->_kernel;
752     cl_int w = dst->GetImgWidth(), h = dst->GetImgHeight();
753     clSetKernelArg(kernel, 0, sizeof(cl_mem), &(src->_clData));
754     clSetKernelArg(kernel, 1, sizeof(cl_mem), &(dst->_clData));
755     clSetKernelArg(kernel, 2, sizeof(cl_int), &(w));
756     clSetKernelArg(kernel, 3, sizeof(cl_int), &(h));
757     const size_t dim0 = 16, dim1 = 16;
758     size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1};
759     cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL);
760 
761     CheckErrorCL(status, "ProgramBagCL::UnpackImage");
762     FinishCL();
763 
764 }
765 
UnpackImageDOG(CLTexImage * src,CLTexImage * dst)766 void ProgramBagCL::UnpackImageDOG(CLTexImage*src, CLTexImage* dst)
767 {
768     if(s_unpack_dog == NULL) return;
769     cl_kernel  kernel = s_unpack_dog->_kernel;
770     cl_int w = dst->GetImgWidth(), h = dst->GetImgHeight();
771     clSetKernelArg(kernel, 0, sizeof(cl_mem), &(src->_clData));
772     clSetKernelArg(kernel, 1, sizeof(cl_mem), &(dst->_clData));
773     clSetKernelArg(kernel, 2, sizeof(cl_int), &(w));
774     clSetKernelArg(kernel, 3, sizeof(cl_int), &(h));
775     const size_t dim0 = 16, dim1 = 16;
776     size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1};
777     cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL);
778 
779     CheckErrorCL(status, "ProgramBagCL::UnpackImage");
780     FinishCL();
781 }
782 
UnpackImageGRD(CLTexImage * src,CLTexImage * dst)783 void ProgramBagCL::UnpackImageGRD(CLTexImage*src, CLTexImage* dst)
784 {
785     if(s_unpack_grd == NULL) return;
786     cl_kernel  kernel = s_unpack_grd->_kernel;
787     cl_int w = dst->GetImgWidth(), h = dst->GetImgHeight();
788     clSetKernelArg(kernel, 0, sizeof(cl_mem), &(src->_clData));
789     clSetKernelArg(kernel, 1, sizeof(cl_mem), &(dst->_clData));
790     clSetKernelArg(kernel, 2, sizeof(cl_int), &(w));
791     clSetKernelArg(kernel, 3, sizeof(cl_int), &(h));
792     const size_t dim0 = 16, dim1 = 16;
793     size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1};
794     cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL);
795 
796     CheckErrorCL(status, "ProgramBagCL::UnpackImage");
797     FinishCL();
798 }
UnpackImageKEY(CLTexImage * src,CLTexImage * dog,CLTexImage * dst)799 void ProgramBagCL::UnpackImageKEY(CLTexImage*src, CLTexImage* dog, CLTexImage* dst)
800 {
801     if(s_unpack_key == NULL) return;
802     cl_kernel  kernel = s_unpack_key->_kernel;
803     cl_int w = dst->GetImgWidth(), h = dst->GetImgHeight();
804     clSetKernelArg(kernel, 0, sizeof(cl_mem), &(dog->_clData));
805     clSetKernelArg(kernel, 1, sizeof(cl_mem), &(src->_clData));
806     clSetKernelArg(kernel, 2, sizeof(cl_mem), &(dst->_clData));
807     clSetKernelArg(kernel, 3, sizeof(cl_int), &(w));
808     clSetKernelArg(kernel, 4, sizeof(cl_int), &(h));
809     const size_t dim0 = 16, dim1 = 16;
810     size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1};
811     cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL);
812 
813     CheckErrorCL(status, "ProgramBagCL::UnpackImageKEY");
814     FinishCL();
815 }
LoadDescriptorShader()816 void ProgramBagCL::LoadDescriptorShader()
817 {
818 	GlobalUtil::_DescriptorPPT = 16;
819 	LoadDescriptorShaderF2();
820 }
821 
LoadDescriptorShaderF2()822 void ProgramBagCL::LoadDescriptorShaderF2()
823 {
824 
825 }
826 
LoadOrientationShader(void)827 void ProgramBagCL::LoadOrientationShader(void)
828 {
829 
830 }
831 
LoadGenListShader(int ndoglev,int nlev)832 void ProgramBagCL::LoadGenListShader(int ndoglev,int nlev)
833 {
834 
835 }
836 
LoadKeypointShader()837 void ProgramBagCL::LoadKeypointShader()
838 {
839     int i;    char buffer[20240];
840 	ostrstream out(buffer, 20240);
841 	streampos pos;
842 
843 	//tex(X)(Y)
844 	//X: (CLR) (CENTER 0, LEFT -1, RIGHT +1)
845 	//Y: (CDU) (CENTER 0, DOWN -1, UP    +1)
846 	out<<
847     "__kernel void keypoint(__read_only image2d_t tex, __read_only image2d_t texU,\n"
848     "           __read_only image2d_t texD, __write_only image2d_t texK,\n"
849     "          float THRESHOLD0, float THRESHOLD1, \n"
850     "          float THRESHOLD2, int width, int height)\n"
851 	"{\n"
852     "   sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | \n"
853     "         CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;"
854     "   int x = get_global_id(0), y = get_global_id(1);\n"
855     "   if(x  >= width || y >= height) return; \n"
856     "   int  xp = x - 1, xn = x + 1;\n"
857     "   int  yp = y - 1, yn = y + 1;\n"
858     "   int2 coord0 = (int2) (x, y); \n"
859     "   int2 coord1 = (int2) (xp, y); \n"
860     "   int2 coord2 = (int2) (xn, y); \n"
861     "   int2 coord3 = (int2) (x, yp); \n"
862     "   int2 coord4 = (int2) (x, yn); \n"
863     "   int2 coord5 = (int2) (xp, yp); \n"
864     "   int2 coord6 = (int2) (xp, yn); \n"
865     "   int2 coord7 = (int2) (xn, yp); \n"
866     "   int2 coord8 = (int2) (xn, yn); \n"
867     "	float4 ccc = read_imagef(tex, sampler,coord0);\n"
868 	"	float4 clc = read_imagef(tex, sampler,coord1);\n"
869 	"	float4 crc = read_imagef(tex, sampler,coord2);\n"
870 	"	float4 ccd = read_imagef(tex, sampler,coord3);\n"
871 	"	float4 ccu = read_imagef(tex, sampler,coord4);\n"
872 	"	float4 cld = read_imagef(tex, sampler,coord5);\n"
873 	"	float4 clu = read_imagef(tex, sampler,coord6);\n"
874 	"	float4 crd = read_imagef(tex, sampler,coord7);\n"
875 	"	float4 cru = read_imagef(tex, sampler,coord8);\n"
876     "	float4   cc = ccc;\n"
877 	"	float4  v1[4], v2[4];\n"
878 	"	v1[0] = (float4)(clc.y, ccc.y, ccd.z, ccc.z);\n"
879 	"	v1[1] = (float4)(ccc.x, crc.x, ccd.w, ccc.w);\n"
880 	"	v1[2] = (float4)(clc.w, ccc.w, ccc.x, ccu.x);\n"
881 	"	v1[3] = (float4)(ccc.z, crc.z, ccc.y, ccu.y);\n"
882 	"	v2[0] = (float4)(cld.w, clc.w, ccd.w, ccc.w);\n"
883 	"	v2[1] = (float4)(ccd.z, ccc.z, crd.z, crc.z);\n"
884 	"	v2[2] = (float4)(clc.y, clu.y, ccc.y, ccu.y);\n"
885 	"	v2[3] = (float4)(ccc.x, ccu.x, crc.x, cru.x);\n"
886     "   float4 key4 = (float4)(0); \n";
887 	//test against 8 neighbours
888 	//use variable to identify type of extremum
889 	//1.0 for local maximum and -1.0 for minimum
890     for(i = 0; i < 4; ++i)
891 	out<<
892     "   if(cc.s"<<i<<" > THRESHOLD0){ \n"
893     "           if(all(isgreater((float4)(cc.s"<<i<<"), max(v1["<<i<<"], v2["<<i<<"]))))key4.s"<<i<<" = 1.0;\n"
894     "	}else if(cc.s"<<i<<" < -THRESHOLD0){ \n"
895     "           if(all(isless((float4)(cc.s"<<i<<"), min(v1["<<i<<"], v2["<<i<<"]))))key4.s"<<i<<" = -1.0;\n"
896     "   }";
897 
898 	out<<
899     "   if(x ==0) {key4.x =  key4.z= 0; }\n"
900     "   else if(x + 1 == width) {key4.y =  key4.w = 0;}\n"
901     "   if(y ==0) {key4.x =   key4.y = 0; }\n"
902     "   else if(y + 1 == height) {key4.z = key4.w = 0;}\n"
903     "   float4 ak = fabs(key4); \n"
904     "   float keysum = ak.x + ak.y + ak.z + ak.w; \n"
905 	"	float4 result = (float4)(0.0);\n"
906 	"	if(keysum == 1.0) {\n"
907 	"	float fxx[4], fyy[4], fxy[4], fx[4], fy[4];\n";
908 
909     //do edge supression first..
910 	//vector v1 is < (-1, 0), (1, 0), (0,-1), (0, 1)>
911 	//vector v2 is < (-1,-1), (-1,1), (1,-1), (1, 1)>
912     for(i = 0; i < 4; ++i)
913 	out <<
914 	"	if(key4.s"<<i<<" != 0)\n"
915 	"	{\n"
916 	"		float4 D2 = v1["<<i<<"].xyzw - cc.s"<<i<<";\n"
917 	"		float2 D4 = v2["<<i<<"].xw - v2["<<i<<"].yz;\n"
918 	"		float2 D5 = 0.5*(v1["<<i<<"].yw-v1["<<i<<"].xz); \n"
919 	"		fx["<<i<<"] = D5.x;	fy["<<i<<"] = D5.y ;\n"
920 	"		fxx["<<i<<"] = D2.x + D2.y;\n"
921 	"		fyy["<<i<<"] = D2.z + D2.w;\n"
922 	"		fxy["<<i<<"] = 0.25*(D4.x + D4.y);\n"
923 	"		float fxx_plus_fyy = fxx["<<i<<"] + fyy["<<i<<"];\n"
924 	"		float score_up = fxx_plus_fyy*fxx_plus_fyy; \n"
925 	"		float score_down = (fxx["<<i<<"]*fyy["<<i<<"] - fxy["<<i<<"]*fxy["<<i<<"]);\n"
926 	"		if( score_down <= 0 || score_up > THRESHOLD2 * score_down)keysum = 0;\n"
927 	"	}\n";
928 
929     out <<
930 	"	if(keysum == 1) {\n";
931 	////////////////////////////////////////////////
932 	//read 9 pixels of upper/lower level
933 	out<<
934 	"	float4  v4[4], v5[4], v6[4];\n"
935 	"	ccc = read_imagef(texU, sampler,coord0);\n"
936 	"	clc = read_imagef(texU, sampler,coord1);\n"
937 	"	crc = read_imagef(texU, sampler,coord2);\n"
938 	"	ccd = read_imagef(texU, sampler,coord3);\n"
939 	"	ccu = read_imagef(texU, sampler,coord4);\n"
940 	"	cld = read_imagef(texU, sampler,coord5);\n"
941 	"	clu = read_imagef(texU, sampler,coord6);\n"
942 	"	crd = read_imagef(texU, sampler,coord7);\n"
943 	"	cru = read_imagef(texU, sampler,coord8);\n"
944     "	float4 cu = ccc;\n"
945 	"	v4[0] = (float4)(clc.y, ccc.y, ccd.z, ccc.z);\n"
946 	"	v4[1] = (float4)(ccc.x, crc.x, ccd.w, ccc.w);\n"
947 	"	v4[2] = (float4)(clc.w, ccc.w, ccc.x, ccu.x);\n"
948 	"	v4[3] = (float4)(ccc.z, crc.z, ccc.y, ccu.y);\n"
949 	"	v6[0] = (float4)(cld.w, clc.w, ccd.w, ccc.w);\n"
950 	"	v6[1] = (float4)(ccd.z, ccc.z, crd.z, crc.z);\n"
951 	"	v6[2] = (float4)(clc.y, clu.y, ccc.y, ccu.y);\n"
952 	"	v6[3] = (float4)(ccc.x, ccu.x, crc.x, cru.x);\n";
953 
954     for(i = 0; i < 4; ++i)
955 	out <<
956 	"	if(key4.s"<<i<<" == 1.0)\n"
957 	"	{\n"
958 	"		if(cc.s"<<i<<" < cu.s"<<i<<" || \n"
959     "           any(isless((float4)(cc.s"<<i<<"), max(v4["<<i<<"], v6["<<i<<"]))))keysum = 0; \n"
960 	"	}else if(key4.s"<<i<<" == -1.0)\n"
961 	"	{\n"
962 	"		if(cc.s"<<i<<" > cu.s"<<i<<" || \n"
963     "           any(isgreater((float4)(cc.s"<<i<<"), min(v4["<<i<<"], v6["<<i<<"]))) )keysum = 0; \n"
964 	"	}\n";
965 
966     out <<
967 	"	if(keysum == 1.0) { \n";
968     out <<
969 	"	ccc = read_imagef(texD, sampler,coord0);\n"
970 	"	clc = read_imagef(texD, sampler,coord1);\n"
971 	"	crc = read_imagef(texD, sampler,coord2);\n"
972 	"	ccd = read_imagef(texD, sampler,coord3);\n"
973 	"	ccu = read_imagef(texD, sampler,coord4);\n"
974 	"	cld = read_imagef(texD, sampler,coord5);\n"
975 	"	clu = read_imagef(texD, sampler,coord6);\n"
976 	"	crd = read_imagef(texD, sampler,coord7);\n"
977 	"	cru = read_imagef(texD, sampler,coord8);\n"
978     "	float4 cd = ccc;\n"
979 	"	v5[0] = (float4)(clc.y, ccc.y, ccd.z, ccc.z);\n"
980 	"	v5[1] = (float4)(ccc.x, crc.x, ccd.w, ccc.w);\n"
981 	"	v5[2] = (float4)(clc.w, ccc.w, ccc.x, ccu.x);\n"
982 	"	v5[3] = (float4)(ccc.z, crc.z, ccc.y, ccu.y);\n"
983 	"	v6[0] = (float4)(cld.w, clc.w, ccd.w, ccc.w);\n"
984 	"	v6[1] = (float4)(ccd.z, ccc.z, crd.z, crc.z);\n"
985 	"	v6[2] = (float4)(clc.y, clu.y, ccc.y, ccu.y);\n"
986 	"	v6[3] = (float4)(ccc.x, ccu.x, crc.x, cru.x);\n";
987     for(i = 0; i < 4; ++i)
988     out <<
989 	"	if(key4.s"<<i<<" == 1.0)\n"
990 	"	{\n"
991 	"		if(cc.s"<<i<<" < cd.s"<<i<<" ||\n"
992     "           any(isless((float4)(cc.s"<<i<<"), max(v5["<<i<<"], v6["<<i<<"]))))keysum = 0; \n"
993 	"	}else if(key4.s"<<i<<" == -1.0)\n"
994 	"	{\n"
995 	"		if(cc.s"<<i<<" > cd.s"<<i<<" ||\n"
996     "           any(isgreater((float4)(cc.s"<<i<<"), min(v5["<<i<<"], v6["<<i<<"]))))keysum = 0; \n"
997 	"	}\n";
998 
999     out <<
1000 	"	if(keysum==1.0) {\n";
1001 	//////////////////////////////////////////////////////////////////////
1002 	if(GlobalUtil::_SubpixelLocalization)
1003     {
1004 	    out <<
1005 	    "	float4 offset = (float4)(0); \n";
1006         for(i = 1; i < 4; ++i)
1007         out <<
1008 	    "	if(key4.s"<<i<<" != 0) \n"
1009 	    "	{\n"
1010 	    "		cu.s0 = cu.s"<<i<<";	cd.s0 = cd.s"<<i<<";	cc.s0 = cc.s"<<i<<";	\n"
1011 	    "		v4[0] = v4["<<i<<"];	v5[0] = v5["<<i<<"];						\n"
1012 	    "		fxy[0] = fxy["<<i<<"];	fxx[0] = fxx["<<i<<"];	fyy[0] = fyy["<<i<<"];	\n"
1013 	    "		fx[0] = fx["<<i<<"];	fy[0] = fy["<<i<<"];						\n"
1014 	    "	}\n";
1015 
1016         out <<
1017 	    "	float fs = 0.5*( cu.s0 - cd.s0 );				\n"
1018 	    "	float fss = cu.s0 + cd.s0 - cc.s0 - cc.s0;\n"
1019 	    "	float fxs = 0.25 * (v4[0].y + v5[0].x - v4[0].x - v5[0].y);\n"
1020 	    "	float fys = 0.25 * (v4[0].w + v5[0].z - v4[0].z - v5[0].w);\n"
1021 	    "	float4 A0, A1, A2 ;			\n"
1022 	    "	A0 = (float4)(fxx[0], fxy[0], fxs, -fx[0]);	\n"
1023 	    "	A1 = (float4)(fxy[0], fyy[0], fys, -fy[0]);	\n"
1024 	    "	A2 = (float4)(fxs, fys, fss, -fs);	\n"
1025         "	float4 x3 = fabs((float4)(fxx[0], fxy[0], fxs, 0));		\n"
1026 	    "	float maxa = max(max(x3.x, x3.y), x3.z);	\n"
1027 	    "	if(maxa >= 1e-10 ) \n"
1028 	    "	{												\n"
1029 	    "		if(x3.y ==maxa )							\n"
1030 	    "		{											\n"
1031 	    "			float4 TEMP = A1; A1 = A0; A0 = TEMP;	\n"
1032 	    "		}else if( x3.z == maxa )					\n"
1033 	    "		{											\n"
1034 	    "			float4 TEMP = A2; A2 = A0; A0 = TEMP;	\n"
1035 	    "		}											\n"
1036 	    "		A0 /= A0.x;									\n"
1037 	    "		A1 -= A1.x * A0;							\n"
1038 	    "		A2 -= A2.x * A0;							\n"
1039         "		float2 x2 = fabs((float2)(A1.y, A2.y));		\n"
1040 	    "		if( x2.y > x2.x )							\n"
1041 	    "		{											\n"
1042 	    "			float4 TEMP = A2.yzwx;					\n"
1043 	    "			A2.yzw = A1.yzw;						\n"
1044 	    "			A1.yzw = TEMP.xyz;							\n"
1045 	    "			x2.x = x2.y;							\n"
1046 	    "		}											\n"
1047 	    "		if(x2.x >= 1e-10) {								\n"
1048 	    "			A1.yzw /= A1.y;								\n"
1049 	    "			A2.yzw -= A2.y * A1.yzw;					\n"
1050 	    "			if(fabs(A2.z) >= 1e-10) {\n"
1051 	    "				offset.z = A2.w /A2.z;				    \n"
1052 	    "				offset.y = A1.w - offset.z*A1.z;			    \n"
1053 	    "				offset.x = A0.w - offset.z*A0.z - offset.y*A0.y;	\n"
1054         "				if(fabs(cc.s0 + 0.5*dot((float4)(fx[0], fy[0], fs, 0), offset ))<=THRESHOLD1\n"
1055         "                   || any( isgreater(fabs(offset), (float4)(1.0)))) key4 = (float4)(0.0);\n"
1056 	    "			}\n"
1057 	    "		}\n"
1058 	    "	}\n"
1059 	    <<"\n"
1060         "	float keyv = dot(key4, (float4)(1.0, 2.0, 3.0, 4.0));\n"
1061 	    "	result = (float4)(keyv,  offset.xyz);\n"
1062 	    "	}}}}\n"
1063         "   write_imagef(texK, coord0, result);\n "
1064 	    "}\n"	<<'\0';
1065     }
1066 	else
1067     {
1068         out << "\n"
1069         "	float keyv = dot(key4, (float4)(1.0, 2.0, 3.0, 4.0));\n"
1070         "	result =  (float4)(keyv, 0, 0, 0);\n"
1071         "	}}}}\n"
1072         "   write_imagef(texK, coord0, result);\n "
1073         "}\n"	<<'\0';
1074     }
1075 
1076     s_keypoint = new ProgramCL("keypoint", buffer, _context, _device);
1077 }
1078 
LoadDisplayShaders()1079 void ProgramBagCL::LoadDisplayShaders()
1080 {
1081     //"uniform sampler2DRect tex; void main(){\n"
1082     //"vec4 pc = texture2DRect(tex, gl_TexCoord[0].xy);	bvec2 ff = lessThan(fract(gl_TexCoord[0].xy), vec2(0.5));\n"
1083     //"float v = ff.y?(ff.x? pc.r : pc.g):(ff.x?pc.b:pc.a); gl_FragColor = vec4(vec3(v), 1.0);}");
1084 	s_unpack = new ProgramCL("main",
1085     "__kernel void main(__read_only  image2d_t input, __write_only image2d_t output,\n"
1086     "                   int width, int height) {\n"
1087     "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
1088     "int x = get_global_id(0), y =  get_global_id(1); \n"
1089     "if(x >= width || y >= height) return;\n"
1090     "int xx = x / 2, yy = y / 2; \n"
1091     "float4 vv = read_imagef(input, sampler, (int2) (xx, yy)); \n"
1092     "float v1 = (x & 1 ? vv.w : vv.z); \n"
1093     "float v2 = (x & 1 ? vv.y : vv.x);\n"
1094     "float v = y & 1 ? v1 : v2;\n"
1095     "float4 result = (float4) (v, v, v, 1);"
1096     "write_imagef(output, (int2) (x, y), result); }"  , _context, _device);
1097 
1098 	s_unpack_dog = new ProgramCL("main",
1099     "__kernel void main(__read_only  image2d_t input, __write_only image2d_t output,\n"
1100     "                   int width, int height) {\n"
1101     "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
1102     "int x = get_global_id(0), y =  get_global_id(1); \n"
1103     "if(x >= width || y >= height) return;\n"
1104     "int xx = x / 2, yy = y / 2; \n"
1105     "float4 vv = read_imagef(input, sampler, (int2) (xx, yy)); \n"
1106     "float v1 = (x & 1 ? vv.w : vv.z); \n"
1107     "float v2 = (x & 1 ? vv.y : vv.x);\n"
1108     "float v0 = y & 1 ? v1 : v2;\n"
1109     "float v = 0.5 + 20.0 * v0;\n "
1110     "float4 result = (float4) (v, v, v, 1);"
1111     "write_imagef(output, (int2) (x, y), result); }"  , _context, _device);
1112 
1113     s_unpack_grd = new ProgramCL("main",
1114     "__kernel void main(__read_only  image2d_t input, __write_only image2d_t output,\n"
1115     "                   int width, int height) {\n"
1116     "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
1117     "int x = get_global_id(0), y =  get_global_id(1); \n"
1118     "if(x >= width || y >= height) return;\n"
1119     "int xx = x / 2, yy = y / 2; \n"
1120     "float4 vv = read_imagef(input, sampler, (int2) (xx, yy)); \n"
1121     "float v1 = (x & 1 ? vv.w : vv.z); \n"
1122     "float v2 = (x & 1 ? vv.y : vv.x);\n"
1123     "float v0 = y & 1 ? v1 : v2;\n"
1124     "float v = 5.0 * v0;\n "
1125     "float4 result = (float4) (v, v, v, 1);"
1126     "write_imagef(output, (int2) (x, y), result); }"  , _context, _device);
1127 
1128 	s_unpack_key = new ProgramCL("main",
1129     "__kernel void main(__read_only  image2d_t dog,\n"
1130     "                   __read_only image2d_t key,\n"
1131     "                   __write_only image2d_t output,\n"
1132     "                   int width, int height) {\n"
1133     "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
1134     "int x = get_global_id(0), y =  get_global_id(1); \n"
1135     "if(x >= width || y >= height) return;\n"
1136     "int xx = x / 2, yy = y / 2; \n"
1137     "float4 kk = read_imagef(key, sampler, (int2) (xx, yy));\n"
1138     "int4 cc = isequal(fabs(kk.xxxx), (float4)(1.0, 2.0, 3.0, 4.0));\n"
1139     "int k1 = (x & 1 ? cc.w : cc.z); \n"
1140     "int k2 = (x & 1 ? cc.y : cc.x);\n"
1141     "int k0 = y & 1 ? k1 : k2;\n"
1142     "float4 result;\n"
1143     "if(k0 != 0){\n"
1144     "   //result = kk.x > 0 ? ((float4)(1.0, 0, 0, 1.0)) : ((float4) (0.0, 1.0, 0.0, 1.0)); \n"
1145     "   result = kk.x < 0 ? ((float4)(0, 1.0, 0, 1.0)) : ((float4) (1.0, 0.0,  0.0, 1.0)); \n"
1146     "}else{"
1147         "float4 vv = read_imagef(dog, sampler, (int2) (xx, yy));\n"
1148         "float v1 = (x & 1 ? vv.w : vv.z); \n"
1149         "float v2 = (x & 1 ? vv.y : vv.x);\n"
1150         "float v0 = y & 1 ? v1 : v2;\n"
1151         "float v = 0.5 + 20.0 * v0;\n "
1152         "result = (float4) (v, v, v, 1);"
1153     "}\n"
1154     "write_imagef(output, (int2) (x, y), result); }"  , _context, _device);
1155 }
1156 
1157 
SetMarginCopyParam(int xmax,int ymax)1158 void ProgramBagCL::SetMarginCopyParam(int xmax, int ymax)
1159 {
1160 
1161 }
1162 
SetGradPassParam(int texP)1163 void ProgramBagCL::SetGradPassParam(int texP)
1164 {
1165 
1166 }
1167 
SetGenListEndParam(int ktex)1168 void ProgramBagCL::SetGenListEndParam(int ktex)
1169 {
1170 
1171 }
1172 
SetDogTexParam(int texU,int texD)1173 void ProgramBagCL::SetDogTexParam(int texU, int texD)
1174 {
1175 
1176 }
1177 
SetGenListInitParam(int w,int h)1178 void ProgramBagCL::SetGenListInitParam(int w, int h)
1179 {
1180 	float bbox[4] = {(w -1.0f) * 0.5f +0.25f, (w-1.0f) * 0.5f - 0.25f,  (h - 1.0f) * 0.5f + 0.25f, (h-1.0f) * 0.5f - 0.25f};
1181 
1182 }
1183 
1184 
SetGenListStartParam(float width,int tex0)1185 void ProgramBagCL::SetGenListStartParam(float width, int tex0)
1186 {
1187 
1188 }
1189 
1190 
1191 
SetGenListStepParam(int tex,int tex0)1192 void ProgramBagCL::SetGenListStepParam(int tex, int tex0)
1193 {
1194 
1195 }
1196 
SetGenVBOParam(float width,float fwidth,float size)1197 void ProgramBagCL::SetGenVBOParam(float width, float fwidth, float size)
1198 {
1199 
1200 }
1201 
SetSimpleOrientationInput(int oTex,float sigma,float sigma_step)1202 void ProgramBagCL::SetSimpleOrientationInput(int oTex, float sigma, float sigma_step)
1203 {
1204 
1205 }
1206 
1207 
SetFeatureOrientationParam(int gtex,int width,int height,float sigma,int otex,float step)1208 void ProgramBagCL::SetFeatureOrientationParam(int gtex, int width, int height, float sigma, int otex, float step)
1209 {
1210 
1211 
1212 }
1213 
SetFeatureDescirptorParam(int gtex,int otex,float dwidth,float fwidth,float width,float height,float sigma)1214 void ProgramBagCL::SetFeatureDescirptorParam(int gtex, int otex, float dwidth, float fwidth,  float width, float height, float sigma)
1215 {
1216 
1217 }
1218 
1219 
1220 
GetErrorString(cl_int error)1221 const char* ProgramBagCL::GetErrorString(cl_int error)
1222 {
1223     static const char* errorString[] = {
1224         "CL_SUCCESS",
1225         "CL_DEVICE_NOT_FOUND",
1226         "CL_DEVICE_NOT_AVAILABLE",
1227         "CL_COMPILER_NOT_AVAILABLE",
1228         "CL_MEM_OBJECT_ALLOCATION_FAILURE",
1229         "CL_OUT_OF_RESOURCES",
1230         "CL_OUT_OF_HOST_MEMORY",
1231         "CL_PROFILING_INFO_NOT_AVAILABLE",
1232         "CL_MEM_COPY_OVERLAP",
1233         "CL_IMAGE_FORMAT_MISMATCH",
1234         "CL_IMAGE_FORMAT_NOT_SUPPORTED",
1235         "CL_BUILD_PROGRAM_FAILURE",
1236         "CL_MAP_FAILURE",
1237         "",
1238         "",
1239         "",
1240         "",
1241         "",
1242         "",
1243         "",
1244         "",
1245         "",
1246         "",
1247         "",
1248         "",
1249         "",
1250         "",
1251         "",
1252         "",
1253         "",
1254         "CL_INVALID_VALUE",
1255         "CL_INVALID_DEVICE_TYPE",
1256         "CL_INVALID_PLATFORM",
1257         "CL_INVALID_DEVICE",
1258         "CL_INVALID_CONTEXT",
1259         "CL_INVALID_QUEUE_PROPERTIES",
1260         "CL_INVALID_COMMAND_QUEUE",
1261         "CL_INVALID_HOST_PTR",
1262         "CL_INVALID_MEM_OBJECT",
1263         "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
1264         "CL_INVALID_IMAGE_SIZE",
1265         "CL_INVALID_SAMPLER",
1266         "CL_INVALID_BINARY",
1267         "CL_INVALID_BUILD_OPTIONS",
1268         "CL_INVALID_PROGRAM",
1269         "CL_INVALID_PROGRAM_EXECUTABLE",
1270         "CL_INVALID_KERNEL_NAME",
1271         "CL_INVALID_KERNEL_DEFINITION",
1272         "CL_INVALID_KERNEL",
1273         "CL_INVALID_ARG_INDEX",
1274         "CL_INVALID_ARG_VALUE",
1275         "CL_INVALID_ARG_SIZE",
1276         "CL_INVALID_KERNEL_ARGS",
1277         "CL_INVALID_WORK_DIMENSION",
1278         "CL_INVALID_WORK_GROUP_SIZE",
1279         "CL_INVALID_WORK_ITEM_SIZE",
1280         "CL_INVALID_GLOBAL_OFFSET",
1281         "CL_INVALID_EVENT_WAIT_LIST",
1282         "CL_INVALID_EVENT",
1283         "CL_INVALID_OPERATION",
1284         "CL_INVALID_GL_OBJECT",
1285         "CL_INVALID_BUFFER_SIZE",
1286         "CL_INVALID_MIP_LEVEL",
1287         "CL_INVALID_GLOBAL_WORK_SIZE",
1288     };
1289 
1290     const int errorCount = sizeof(errorString) / sizeof(errorString[0]);
1291 
1292     const int index = -error;
1293 
1294     return (index >= 0 && index < errorCount) ? errorString[index] : "";
1295 }
1296 
CheckErrorCL(cl_int error,const char * location)1297 bool ProgramBagCL::CheckErrorCL(cl_int error, const char* location)
1298 {
1299     if(error == CL_SUCCESS) return true;
1300 	const char *errstr = GetErrorString(error);
1301 	if(errstr && errstr[0]) std::cerr << errstr;
1302 	else std::cerr  << "Error " << error;
1303 	if(location) std::cerr  << " at " << location;
1304 	std::cerr  << "\n";
1305     exit(0);
1306     return false;
1307 
1308 }
1309 
1310 
1311 ////////////////////////////////////////////////////////////////////////
1312 ////////////////////////////////////////////////////////////////////////
1313 
LoadFixedShaders()1314 void ProgramBagCLN::LoadFixedShaders()
1315 {
1316    	s_sampling = new ProgramCL("sampling",
1317         "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
1318         "__kernel void sampling(__read_only  image2d_t input, __write_only image2d_t output, "
1319         "                   int width, int height) {\n"
1320         "int2 coord = (int2)(get_global_id(0), get_global_id(1)); \n"
1321         "if( coord.x >= width || coord.y >= height) return;\n"
1322         "write_imagef(output, coord, read_imagef(input, sampler, coord << 1)); }"  , _context, _device);
1323 
1324     s_sampling_k = new ProgramCL("sampling_k",
1325         "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
1326         "__kernel void sampling_k(__read_only  image2d_t input, __write_only image2d_t output, "
1327         "                   int width, int height, int step) {\n"
1328         "int x = get_global_id(0), y =  get_global_id(1); \n"
1329         "if( x >= width || y >= height) return;\n"
1330         "int xa = x * step,   ya = y *step; \n"
1331         "float4 v1 = read_imagef(input, sampler, (int2) (xa, ya)); \n"
1332         "write_imagef(output, (int2) (x, y), v1); }"  , _context, _device);
1333 
1334 
1335 	s_sampling_u = new ProgramCL("sampling_u",
1336         "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;\n"
1337         "__kernel void sampling_u(__read_only  image2d_t input, \n"
1338         "                   __write_only image2d_t output,\n"
1339         "                   int width, int height, float step) {\n"
1340         "int x = get_global_id(0), y =  get_global_id(1); \n"
1341         "if( x >= width || y >= height) return;\n"
1342         "float xa = x * step,       ya = y *step; \n"
1343         "float v1 = read_imagef(input, sampler, (float2) (xa, ya)).x; \n"
1344         "write_imagef(output, (int2) (x, y), (float4)(v1)); }"  , _context, _device);
1345 
1346    s_dog_pass = new ProgramCL("dog_pass",
1347         "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
1348         "__kernel void dog_pass(__read_only image2d_t tex,  __read_only image2d_t texp,\n"
1349         "                   __write_only image2d_t dog, int width, int height) {\n"
1350         "int2 coord = (int2)(get_global_id(0), get_global_id(1)); \n"
1351         "if( coord.x >= width || coord.y >= height) return;\n"
1352         "float cc = read_imagef(tex , sampler, coord).x; \n"
1353         "float cp = read_imagef(texp, sampler, coord).x;\n"
1354         "write_imagef(dog, coord, (float4)(cc - cp)); }\n", _context, _device);
1355 
1356    s_grad_pass = new ProgramCL("grad_pass",
1357         "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
1358         "__kernel void grad_pass(__read_only image2d_t tex,  __read_only image2d_t texp,\n"
1359         "                   __write_only image2d_t dog, int width, int height, \n"
1360         "                   __write_only image2d_t grad, __write_only image2d_t rot) {\n"
1361         "int x = get_global_id(0), y =  get_global_id(1); \n"
1362         "if( x >= width || y >= height) return;\n"
1363         "int2 coord = (int2) (x, y);\n"
1364         "float cl = read_imagef(tex, sampler, (int2)(x - 1, y)).x;\n"
1365         "float cc = read_imagef(tex , sampler, coord).x; \n"
1366         "float cr = read_imagef(tex, sampler, (int2)(x + 1, y)).x;\n"
1367         "float cp = read_imagef(texp, sampler, coord).x;\n"
1368         "write_imagef(dog, coord, (float4)(cc - cp)); \n"
1369 	    "float cd = read_imagef(tex, sampler, (int2)(x, y - 1)).x;\n"
1370         "float cu = read_imagef(tex, sampler, (int2)(x, y + 1)).x;\n"
1371         "float dx = cr - cl, dy = cu - cd; \n"
1372 	    "float gg = 0.5 * sqrt(dx*dx + dy * dy);\n"
1373         "write_imagef(grad, coord, (float4)(gg));\n"
1374         "float oo = atan2(dy, dx + FLT_MIN);\n"
1375         "write_imagef(rot, coord, (float4)(oo));}\n", _context, _device);
1376 
1377    s_grad_pass2 = new ProgramCL("grad_pass2",
1378         "#define BLOCK_DIMX 32\n"
1379         "#define BLOCK_DIMY 14\n"
1380         "#define BLOCK_SIZE (BLOCK_DIMX * BLOCK_DIMY)\n"
1381         "__kernel void grad_pass2(__read_only image2d_t tex,  __read_only image2d_t texp,\n"
1382         "                   __write_only image2d_t dog, int width, int height,\n"
1383         "                   __write_only image2d_t grd, __write_only image2d_t rot){\n"//,  __local float* block) {\n"
1384         "__local float block[BLOCK_SIZE]; \n"
1385         "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |\n"
1386         "                    CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
1387         "int2 coord = (int2) (  get_global_id(0) - get_group_id(0) * 2 - 1, \n"
1388         "                       get_global_id(1) - get_group_id(1) * 2 - 1); \n"
1389         "int idx =  mad24(get_local_id(1), BLOCK_DIMX, get_local_id(0));\n"
1390         "float cc = read_imagef(tex, sampler, coord).x;\n"
1391         "block[idx] = cc;\n"
1392         "barrier(CLK_LOCAL_MEM_FENCE);\n"
1393         "if( get_local_id(0) == 0 || get_local_id(0) == BLOCK_DIMX - 1) return;\n"
1394         "if( get_local_id(1) == 0 || get_local_id(1) == BLOCK_DIMY - 1) return;\n"
1395         "if( coord.x >= width) return; \n"
1396         "if( coord.y >= height) return;\n"
1397         "float cp = read_imagef(texp, sampler, coord).x;\n"
1398         "float dx = block[idx + 1] - block[idx - 1];\n"
1399         "float dy = block[idx + BLOCK_DIMX ] - block[idx - BLOCK_DIMX];\n"
1400         "write_imagef(dog, coord, (float4)(cc - cp)); \n"
1401         "write_imagef(grd, coord, (float4)(0.5 * sqrt(dx*dx + dy * dy)));\n"
1402         "write_imagef(rot, coord, (float4)(atan2(dy, dx + FLT_MIN)));}\n", _context, _device);
1403 }
1404 
LoadDisplayShaders()1405 void ProgramBagCLN::LoadDisplayShaders()
1406 {
1407 	s_unpack = new ProgramCL("main",
1408     "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
1409     "__kernel void main(__read_only  image2d_t input, __write_only image2d_t output,\n"
1410     "                   int width, int height) {\n"
1411     "int x = get_global_id(0), y =  get_global_id(1); \n"
1412     "if(x >= width || y >= height) return;\n"
1413     "float v = read_imagef(input, sampler, (int2) (x, y)).x; \n"
1414     "float4 result = (float4) (v, v, v, 1);"
1415     "write_imagef(output, (int2) (x, y), result); }"  , _context, _device);
1416 
1417     s_unpack_grd = new ProgramCL("main",
1418     "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |\n"
1419     "           CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
1420     "__kernel void main(__read_only  image2d_t input, __write_only image2d_t output,\n"
1421     "                   int width, int height) {\n"
1422     "int x = get_global_id(0), y =  get_global_id(1); \n"
1423     "if(x >= width || y >= height) return;\n"
1424     "float v0 = read_imagef(input, sampler, (int2) (x, y)).x; \n"
1425     "float v = 5.0 * v0;  float4 result = (float4) (v, v, v, 1);"
1426     "write_imagef(output, (int2) (x, y), result); }"  , _context, _device);
1427 
1428 	s_unpack_dog = new ProgramCL("main",
1429     "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
1430     "__kernel void main(__read_only  image2d_t input, __write_only image2d_t output,\n"
1431     "                   int width, int height) {\n"
1432     "int x = get_global_id(0), y =  get_global_id(1); \n"
1433     "if(x >= width || y >= height) return;\n"
1434     "float v0 = read_imagef(input, sampler, (int2) (x, y)).x; \n"
1435     "float v = 0.5 + 20.0 * v0; float4 result = (float4) (v, v, v, 1);"
1436     "write_imagef(output, (int2) (x, y), result); }"  , _context, _device);
1437 }
1438 
CreateFilterH(float kernel[],int width)1439 ProgramCL* ProgramBagCLN::CreateFilterH(float kernel[], int width)
1440 {
1441     ////////////////////////////
1442 	char buffer[10240];
1443 	ostrstream out(buffer, 10240);
1444     out <<  "#define KERNEL_WIDTH " << width << "\n"
1445         <<  "#define KERNEL_HALF_WIDTH " << (width / 2) << "\n"
1446             "#define BLOCK_WIDTH 128\n"
1447             "#define BLOCK_HEIGHT 1\n"
1448             "#define CACHE_WIDTH (BLOCK_WIDTH + KERNEL_WIDTH - 1)\n"
1449             "#define CACHE_WIDTH_ALIGNED ((CACHE_WIDTH + 15) / 16 * 16)\n"
1450             "#define CACHE_COUNT (2 + (CACHE_WIDTH - 2) / BLOCK_WIDTH)\n"
1451             "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
1452             "__kernel void filter_h(__read_only  image2d_t input, \n"
1453             "          __write_only image2d_t output, int width_, int height_, \n"
1454             "          __constant float* weight) {\n"
1455             "__local float data[CACHE_WIDTH]; \n"
1456             "int x = get_global_id(0), y = get_global_id(1);\n"
1457             "#pragma unroll\n"
1458 	        "for(int j = 0; j < CACHE_COUNT; ++j)\n"
1459 	        "{\n"
1460 		    "    if(get_local_id(0) + j * BLOCK_WIDTH < CACHE_WIDTH)\n"
1461 		    "    {\n"
1462 			"        int fetch_index = min(x + j * BLOCK_WIDTH - KERNEL_HALF_WIDTH, width_);\n"
1463             "        data[get_local_id(0) + j * BLOCK_WIDTH] = read_imagef(input, sampler, (int2)(fetch_index, y)).x;\n"
1464 		    "    }\n"
1465 	        "}\n"
1466             "barrier(CLK_LOCAL_MEM_FENCE); \n"
1467             "if( x > width_ || y > height_) return; \n"
1468             "float result = 0; \n"
1469             "#pragma unroll\n"
1470             "for(int i = 0; i < KERNEL_WIDTH; ++i)\n"
1471             "{\n"
1472             "   result += data[get_local_id(0) + i] * weight[i];\n"
1473             "}\n"
1474          << "write_imagef(output, (int2)(x, y), (float4)(result)); }\n" << '\0';
1475 	return new ProgramCL("filter_h", buffer, _context, _device);
1476 }
1477 
1478 
1479 
CreateFilterV(float kernel[],int width)1480 ProgramCL* ProgramBagCLN::CreateFilterV(float kernel[], int width)
1481 {
1482     ////////////////////////////
1483 	char buffer[10240];
1484 	ostrstream out(buffer, 10240);
1485     out <<  "#define KERNEL_WIDTH " << width << "\n"
1486         <<  "#define KERNEL_HALF_WIDTH " << (width / 2) << "\n"
1487             "#define BLOCK_WIDTH 128\n"
1488             "#define CACHE_WIDTH (BLOCK_WIDTH + KERNEL_WIDTH - 1)\n"
1489             "#define CACHE_WIDTH_ALIGNED ((CACHE_WIDTH + 15) / 16 * 16)\n"
1490             "#define CACHE_COUNT (2 + (CACHE_WIDTH - 2) / BLOCK_WIDTH)\n"
1491             "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | \n"
1492             "           CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
1493             "__kernel void filter_v(__read_only  image2d_t input, \n"
1494             "          __write_only image2d_t output, int width_, int height_, \n"
1495             "          __constant float* weight) {\n"
1496             "__local float data[CACHE_WIDTH]; \n"
1497             "int x = get_global_id(0), y = get_global_id(1);\n"
1498             "#pragma unroll\n"
1499 	        "for(int j = 0; j < CACHE_COUNT; ++j)\n"
1500 	        "{\n"
1501 		    "    if(get_local_id(1) + j * BLOCK_WIDTH  < CACHE_WIDTH)\n"
1502 		    "    {\n"
1503 			"        int fetch_index = min(y + j * BLOCK_WIDTH - KERNEL_HALF_WIDTH, height_);\n"
1504             "        data[get_local_id(1) + j * BLOCK_WIDTH ] = read_imagef(input, sampler, (int2)(x, fetch_index)).x;\n"
1505 		    "    }\n"
1506 	        "}\n"
1507             "barrier(CLK_LOCAL_MEM_FENCE); \n"
1508             "if( x > width_ || y > height_) return; \n"
1509             "float result = 0; \n"
1510             "#pragma unroll\n"
1511             "for(int i = 0; i < KERNEL_WIDTH; ++i)\n"
1512             "{\n"
1513             "   result += data[get_local_id(1) + i] * weight[i];\n"
1514             "}\n"
1515          << "write_imagef(output, (int2)(x, y), (float4)(result)); }\n" << '\0';
1516 
1517 	return new ProgramCL("filter_v", buffer, _context, _device);
1518 }
1519 
CreateFilter(float kernel[],int width)1520 FilterCL*  ProgramBagCLN::CreateFilter(float kernel[], int width)
1521 {
1522     FilterCL * filter = new FilterCL;
1523     filter->s_shader_h = CreateFilterH(kernel, width);
1524     filter->s_shader_v = CreateFilterV(kernel, width);
1525     filter->_weight = new CLTexImage(_context, _queue);
1526     filter->_weight->InitBufferTex(width, 1, 1);
1527     filter->_weight->CopyFromHost(kernel);
1528     filter->_size = width;
1529     return filter;
1530 }
1531 
1532 
FilterImage(FilterCL * filter,CLTexImage * dst,CLTexImage * src,CLTexImage * tmp)1533 void ProgramBagCLN::FilterImage(FilterCL* filter, CLTexImage *dst, CLTexImage *src, CLTexImage*tmp)
1534 {
1535     cl_kernel kernelh = filter->s_shader_h->_kernel;
1536     cl_kernel kernelv = filter->s_shader_v->_kernel;
1537     //////////////////////////////////////////////////////////////////
1538 
1539     cl_int status, w = dst->GetImgWidth(), h = dst->GetImgHeight();
1540     cl_mem weight = (cl_mem) filter->_weight->_clData;
1541     cl_int w_ = w - 1, h_ = h - 1;
1542 
1543 
1544     clSetKernelArg(kernelh, 0, sizeof(cl_mem), &src->_clData);
1545     clSetKernelArg(kernelh, 1, sizeof(cl_mem), &tmp->_clData);
1546     clSetKernelArg(kernelh, 2, sizeof(cl_int), &w_);
1547     clSetKernelArg(kernelh, 3, sizeof(cl_int), &h_);
1548     clSetKernelArg(kernelh, 4, sizeof(cl_mem), &weight);
1549 
1550     size_t dim00 = 128, dim01 = 1;
1551     size_t gsz1[2] = {(w + dim00 - 1) / dim00 * dim00, (h + dim01 - 1) / dim01 * dim01}, lsz1[2] = {dim00, dim01};
1552     status = clEnqueueNDRangeKernel(_queue, kernelh, 2, NULL, gsz1, lsz1, 0, NULL, NULL);
1553     CheckErrorCL(status, "ProgramBagCLN::FilterImageH");
1554     if(status != CL_SUCCESS) return;
1555 
1556 
1557     clSetKernelArg(kernelv, 0, sizeof(cl_mem), &tmp->_clData);
1558     clSetKernelArg(kernelv, 1, sizeof(cl_mem), &dst->_clData);
1559     clSetKernelArg(kernelv, 2, sizeof(cl_int), &w_);
1560     clSetKernelArg(kernelv, 3, sizeof(cl_int), &h_);
1561     clSetKernelArg(kernelv, 4, sizeof(cl_mem), &weight);
1562 
1563     size_t dim10 = 1, dim11 = 128;
1564     size_t gsz2[2] = {(w + dim10 - 1) / dim10 * dim10, (h + dim11 - 1) / dim11 * dim11}, lsz2[2] = {dim10, dim11};
1565     status = clEnqueueNDRangeKernel(_queue, kernelv, 2, NULL, gsz2, lsz2, 0, NULL, NULL);
1566     CheckErrorCL(status, "ProgramBagCLN::FilterImageV");
1567     //clReleaseEvent(event);
1568 }
1569 
SampleImageD(CLTexImage * dst,CLTexImage * src,int log_scale)1570 void ProgramBagCLN::SampleImageD(CLTexImage *dst, CLTexImage *src, int log_scale)
1571 {
1572     cl_kernel  kernel;
1573     cl_int w = dst->GetImgWidth(), h = dst->GetImgHeight();
1574 
1575     cl_int fullstep = (1 << log_scale);
1576     kernel = log_scale == 1? s_sampling->_kernel : s_sampling_k->_kernel;
1577     clSetKernelArg(kernel, 0, sizeof(cl_mem), &(src->_clData));
1578     clSetKernelArg(kernel, 1, sizeof(cl_mem), &(dst->_clData));
1579     clSetKernelArg(kernel, 2, sizeof(cl_int), &(w));
1580     clSetKernelArg(kernel, 3, sizeof(cl_int), &(h));
1581     if(log_scale > 1) clSetKernelArg(kernel, 4, sizeof(cl_int), &(fullstep));
1582 
1583     size_t dim0 = 128, dim1 = 1;
1584     //while( w * h / dim0 / dim1 < 8 && dim1 > 1) dim1 /= 2;
1585     size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1};
1586     cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL);
1587     CheckErrorCL(status, "ProgramBagCLN::SampleImageD");
1588 }
1589 
1590 
1591 #endif
1592 
1593