1 //////////////////////////////////////////////////////////////////////////////
2 // File: ProgramCL.cpp
3 // Author: Changchang Wu
4 // Description : implementation of CL related class.
5 // class ProgramCL A simple wrapper of Cg programs
6 //
7 // Copyright (c) 2007 University of North Carolina at Chapel Hill
8 // All Rights Reserved
9 //
10 // Permission to use, copy, modify and distribute this software and its
11 // documentation for educational, research and non-profit purposes, without
12 // fee, and without a written agreement is hereby granted, provided that the
13 // above copyright notice and the following paragraph appear in all copies.
14 //
15 // The University of North Carolina at Chapel Hill make no representations
16 // about the suitability of this software for any purpose. It is provided
17 // 'as is' without express or implied warranty.
18 //
19 // Please send BUG REPORTS to ccwu@cs.unc.edu
20 //
21 ////////////////////////////////////////////////////////////////////////////
22
23 #if defined(CL_SIFTGPU_ENABLED)
24
25 #include <CL/opencl.h>
26 #include <GL/glew.h>
27
28 #include <iostream>
29 #include <iomanip>
30 #include <vector>
31 #include <strstream>
32 #include <algorithm>
33 #include <stdlib.h>
34 #include <math.h>
35 #include <string.h>
36 using namespace std;
37
38 #include "GlobalUtil.h"
39 #include "CLTexImage.h"
40 #include "ProgramCL.h"
41 #include "SiftGPU.h"
42
43
44 #if defined(_WIN32)
45 #pragma comment (lib, "OpenCL.lib")
46 #endif
47
48 #ifndef _INC_WINDOWS
49 #ifndef WIN32_LEAN_AND_MEAN
50 #define WIN32_LEAN_AND_MEAN
51 #endif
52 #include <windows.h>
53 #endif
54
55 //////////////////////////////////////////////////////////////////////
56 // Construction/Destruction
57 //////////////////////////////////////////////////////////////////////
58
ProgramCL()59 ProgramCL::ProgramCL()
60 {
61 _program = NULL;
62 _kernel = NULL;
63 _valid = 0;
64 }
65
~ProgramCL()66 ProgramCL::~ProgramCL()
67 {
68 if(_kernel) clReleaseKernel(_kernel);
69 if(_program) clReleaseProgram(_program);
70 }
71
ProgramCL(const char * name,const char * code,cl_context context,cl_device_id device)72 ProgramCL::ProgramCL(const char* name, const char * code, cl_context context, cl_device_id device) : _valid(1)
73 {
74 const char * src[1] = {code}; cl_int status;
75
76 _program = clCreateProgramWithSource(context, 1, src, NULL, &status);
77 if(status != CL_SUCCESS) _valid = 0;
78
79 status = clBuildProgram(_program, 0, NULL,
80 GlobalUtil::_debug ?
81 "-cl-fast-relaxed-math -cl-single-precision-constant -cl-nv-verbose" :
82 "-cl-fast-relaxed-math -cl-single-precision-constant", NULL, NULL);
83
84 if(status != CL_SUCCESS) {PrintBuildLog(device, 1); _valid = 0;}
85 else if(GlobalUtil::_debug) PrintBuildLog(device, 0);
86
87 _kernel = clCreateKernel(_program, name, &status);
88 if(status != CL_SUCCESS) _valid = 0;
89 }
90
PrintBuildLog(cl_device_id device,int all)91 void ProgramCL::PrintBuildLog(cl_device_id device, int all)
92 {
93 char buffer[10240] = "\0";
94 cl_int status = clGetProgramBuildInfo(
95 _program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, NULL);
96 if(all )
97 {
98 std::cerr << buffer << endl;
99 }else
100 {
101 const char * pos = strstr(buffer, "ptxas");
102 if(pos) std::cerr << pos << endl;
103 }
104 }
105
106 ///////////////////////////////////////////////////////////////////////////////////
107 /////////////////////////////////PACKED VERSION?///////////////////////////////////
108
ProgramBagCL()109 ProgramBagCL::ProgramBagCL()
110 {
111 ////////////////////////////////////
112 _context = NULL; _queue = NULL;
113 s_gray = s_sampling = NULL;
114 s_packup = s_zero_pass = NULL;
115 s_gray_pack = s_unpack = NULL;
116 s_sampling_u = NULL;
117 s_dog_pass = NULL;
118 s_grad_pass = NULL;
119 s_grad_pass2 = NULL;
120 s_unpack_dog = NULL;
121 s_unpack_grd = NULL;
122 s_unpack_key = NULL;
123 s_keypoint = NULL;
124 f_gaussian_skip0 = NULL;
125 f_gaussian_skip1 = NULL;
126 f_gaussian_step = 0;
127
128 ////////////////////////////////
129 GlobalUtil::StartTimer("Initialize OpenCL");
130 if(!InitializeContext()) return;
131 GlobalUtil::StopTimer();
132
133 }
134
135
136
~ProgramBagCL()137 ProgramBagCL::~ProgramBagCL()
138 {
139 if(s_gray) delete s_gray;
140 if(s_sampling) delete s_sampling;
141 if(s_zero_pass) delete s_zero_pass;
142 if(s_packup) delete s_packup;
143 if(s_unpack) delete s_unpack;
144 if(s_gray_pack) delete s_gray_pack;
145 if(s_sampling_u) delete s_sampling_u;
146 if(s_dog_pass) delete s_dog_pass;
147 if(s_grad_pass) delete s_grad_pass;
148 if(s_grad_pass2) delete s_grad_pass2;
149 if(s_unpack_dog) delete s_unpack_dog;
150 if(s_unpack_grd) delete s_unpack_grd;
151 if(s_unpack_key) delete s_unpack_key;
152 if(s_keypoint) delete s_keypoint;
153
154 if(f_gaussian_skip1) delete f_gaussian_skip1;
155
156 for(unsigned int i = 0; i < f_gaussian_skip0_v.size(); i++)
157 {
158 if(f_gaussian_skip0_v[i]) delete f_gaussian_skip0_v[i];
159 }
160 if(f_gaussian_step && _gaussian_step_num > 0)
161 {
162 for(int i = 0; i< _gaussian_step_num; i++)
163 {
164 delete f_gaussian_step[i];
165 }
166 delete[] f_gaussian_step;
167 }
168
169 //////////////////////////////////////
170 if(_context) clReleaseContext(_context);
171 if(_queue) clReleaseCommandQueue(_queue);
172 }
173
InitializeContext()174 bool ProgramBagCL::InitializeContext()
175 {
176 cl_uint num_platform, num_device;
177 cl_int status;
178 // Get OpenCL platform count
179 status = clGetPlatformIDs (0, NULL, &num_platform);
180 if (status != CL_SUCCESS || num_platform == 0) return false;
181
182 cl_platform_id platforms[16];
183 if(num_platform > 16 ) num_platform = 16;
184 status = clGetPlatformIDs (num_platform, platforms, NULL);
185 _platform = platforms[0];
186
187 ///////////////////////////////
188 status = clGetDeviceIDs(_platform, CL_DEVICE_TYPE_GPU, 0, NULL, &num_device);
189 if(status != CL_SUCCESS || num_device == 0) return false;
190
191 // Create the device list
192 cl_device_id* devices = new cl_device_id [num_device];
193 status = clGetDeviceIDs(_platform, CL_DEVICE_TYPE_GPU, num_device, devices, NULL);
194 _device = (status == CL_SUCCESS? devices[0] : 0); delete[] devices;
195 if(status != CL_SUCCESS) return false;
196
197
198 if(GlobalUtil::_verbose)
199 {
200 cl_device_mem_cache_type is_gcache;
201 clGetDeviceInfo(_device, CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, sizeof(is_gcache), &is_gcache, NULL);
202 if(is_gcache == CL_NONE) std::cout << "No cache for global memory\n";
203 //else if(is_gcache == CL_READ_ONLY_CACHE) std::cout << "Read only cache for global memory\n";
204 //else std::cout << "Read/Write cache for global memory\n";
205 }
206
207 //context;
208 if(GlobalUtil::_UseSiftGPUEX)
209 {
210 cl_context_properties prop[] = {
211 CL_CONTEXT_PLATFORM, (cl_context_properties)_platform,
212 CL_GL_CONTEXT_KHR, (cl_context_properties)wglGetCurrentContext(),
213 CL_WGL_HDC_KHR, (cl_context_properties)wglGetCurrentDC(), 0 };
214 _context = clCreateContext(prop, 1, &_device, NULL, NULL, &status);
215 if(status != CL_SUCCESS) return false;
216 }else
217 {
218 _context = clCreateContext(0, 1, &_device, NULL, NULL, &status);
219 if(status != CL_SUCCESS) return false;
220 }
221
222 //command queue
223 _queue = clCreateCommandQueue(_context, _device, 0, &status);
224 return status == CL_SUCCESS;
225 }
226
InitProgramBag(SiftParam & param)227 void ProgramBagCL::InitProgramBag(SiftParam¶m)
228 {
229 GlobalUtil::StartTimer("Load Programs");
230 LoadFixedShaders();
231 LoadDynamicShaders(param);
232 if(GlobalUtil::_UseSiftGPUEX) LoadDisplayShaders();
233 GlobalUtil::StopTimer();
234 }
235
236
UnloadProgram()237 void ProgramBagCL::UnloadProgram()
238 {
239
240 }
241
FinishCL()242 void ProgramBagCL::FinishCL()
243 {
244 clFinish(_queue);
245 }
246
LoadFixedShaders()247 void ProgramBagCL::LoadFixedShaders()
248 {
249
250
251 s_gray = new ProgramCL( "gray",
252 "__kernel void gray(__read_only image2d_t input, __write_only image2d_t output) {\n"
253 "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
254 "int2 coord = (int2)(get_global_id(0), get_global_id(1));\n"
255 "float4 weight = (float4)(0.299, 0.587, 0.114, 0.0);\n"
256 "float intensity = dot(weight, read_imagef(input,sampler, coord ));\n"
257 "float4 result= (float4)(intensity, intensity, intensity, 1.0);\n"
258 "write_imagef(output, coord, result); }", _context, _device );
259
260
261 s_sampling = new ProgramCL("sampling",
262 "__kernel void sampling(__read_only image2d_t input, __write_only image2d_t output,\n"
263 " int width, int height) {\n"
264 "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
265 "int x = get_global_id(0), y = get_global_id(1); \n"
266 "if( x >= width || y >= height) return;\n"
267 "int xa = x + x, ya = y + y; \n"
268 "int xb = xa + 1, yb = ya + 1; \n"
269 "float v1 = read_imagef(input, sampler, (int2) (xa, ya)).x; \n"
270 "float v2 = read_imagef(input, sampler, (int2) (xb, ya)).x; \n"
271 "float v3 = read_imagef(input, sampler, (int2) (xa, yb)).x; \n"
272 "float v4 = read_imagef(input, sampler, (int2) (xb, yb)).x; \n"
273 "float4 result = (float4) (v1, v2, v3, v4);"
274 "write_imagef(output, (int2) (x, y), result); }" , _context, _device);
275
276 s_sampling_k = new ProgramCL("sampling_k",
277 "__kernel void sampling_k(__read_only image2d_t input, __write_only image2d_t output, "
278 " int width, int height,\n"
279 " int step, int halfstep) {\n"
280 "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
281 "int x = get_global_id(0), y = get_global_id(1); \n"
282 "if( x >= width || y >= height) return;\n"
283 "int xa = x * step, ya = y *step; \n"
284 "int xb = xa + halfstep, yb = ya + halfstep; \n"
285 "float v1 = read_imagef(input, sampler, (int2) (xa, ya)).x; \n"
286 "float v2 = read_imagef(input, sampler, (int2) (xb, ya)).x; \n"
287 "float v3 = read_imagef(input, sampler, (int2) (xa, yb)).x; \n"
288 "float v4 = read_imagef(input, sampler, (int2) (xb, yb)).x; \n"
289 "float4 result = (float4) (v1, v2, v3, v4);"
290 "write_imagef(output, (int2) (x, y), result); }" , _context, _device);
291
292
293 s_sampling_u = new ProgramCL("sampling_u",
294 "__kernel void sampling_u(__read_only image2d_t input, \n"
295 " __write_only image2d_t output,\n"
296 " int width, int height,\n"
297 " float step, float halfstep) {\n"
298 "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;\n"
299 "int x = get_global_id(0), y = get_global_id(1); \n"
300 "if( x >= width || y >= height) return;\n"
301 "float xa = x * step, ya = y *step; \n"
302 "float xb = xa + halfstep, yb = ya + halfstep; \n"
303 "float v1 = read_imagef(input, sampler, (float2) (xa, ya)).x; \n"
304 "float v2 = read_imagef(input, sampler, (float2) (xb, ya)).x; \n"
305 "float v3 = read_imagef(input, sampler, (float2) (xa, yb)).x; \n"
306 "float v4 = read_imagef(input, sampler, (float2) (xb, yb)).x; \n"
307 "float4 result = (float4) (v1, v2, v3, v4);"
308 "write_imagef(output, (int2) (x, y), result); }" , _context, _device);
309
310
311 s_zero_pass = new ProgramCL("zero_pass",
312 "__kernel void zero_pass(__write_only image2d_t output){\n"
313 "int2 coord = (int2)(get_global_id(0), get_global_id(1));\n"
314 "write_imagef(output, coord, (float4)(0.0));}", _context, _device);
315
316 s_packup = new ProgramCL("packup",
317 "__kernel void packup(__global float* input, __write_only image2d_t output,\n"
318 " int twidth, int theight, int width){\n"
319 "int2 coord = (int2)(get_global_id(0), get_global_id(1));\n"
320 "if(coord.x >= twidth || coord.y >= theight) return;\n"
321 "int index0 = (coord.y + coord.y) * width; \n"
322 "int index1 = index0 + coord.x;\n"
323 "int x2 = min(width -1, coord.x); \n"
324 "float v1 = input[index1 + coord.x], v2 = input[index1 + x2]; \n"
325 "int index2 = index1 + width; \n"
326 "float v3 = input[index2 + coord.x], v4 = input[index2 + x2]; \n "
327 "write_imagef(output, coord, (float4) (v1, v2, v3, v4));}", _context, _device);
328
329 s_dog_pass = new ProgramCL("dog_pass",
330 "__kernel void dog_pass(__read_only image2d_t tex, __read_only image2d_t texp,\n"
331 " __write_only image2d_t dog, int width, int height) {\n"
332 "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |\n"
333 " CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
334 "int2 coord = (int2)(get_global_id(0), get_global_id(1)); \n"
335 "if( coord.x >= width || coord.y >= height) return;\n"
336 "float4 cc = read_imagef(tex , sampler, coord); \n"
337 "float4 cp = read_imagef(texp, sampler, coord);\n"
338 "write_imagef(dog, coord, cc - cp); }\n", _context, _device);
339
340 s_grad_pass = new ProgramCL("grad_pass",
341 "__kernel void grad_pass(__read_only image2d_t tex, __read_only image2d_t texp,\n"
342 " __write_only image2d_t dog, int width, int height,\n"
343 " __write_only image2d_t grad, __write_only image2d_t rot) {\n"
344 "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |\n"
345 " CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
346 "int x = get_global_id(0), y = get_global_id(1); \n"
347 "if( x >= width || y >= height) return;\n"
348 "int2 coord = (int2) (x, y);\n"
349 "float4 cc = read_imagef(tex , sampler, coord); \n"
350 "float4 cp = read_imagef(texp, sampler, coord);\n"
351 "float2 cl = read_imagef(tex, sampler, (int2)(x - 1, y)).yw;\n"
352 "float2 cr = read_imagef(tex, sampler, (int2)(x + 1, y)).xz;\n"
353 "float2 cd = read_imagef(tex, sampler, (int2)(x, y - 1)).zw;\n"
354 "float2 cu = read_imagef(tex, sampler, (int2)(x, y + 1)).xy;\n"
355 "write_imagef(dog, coord, cc - cp); \n"
356 "float4 dx = (float4)(cc.y - cl.x, cr.x - cc.x, cc.w - cl.y, cr.y - cc.z);\n"
357 "float4 dy = (float4)(cc.zw - cd.xy, cu.xy - cc.xy);\n"
358 "write_imagef(grad, coord, 0.5 * sqrt(dx*dx + dy * dy));\n"
359 "write_imagef(rot, coord, atan2(dy, dx + (float4) (FLT_MIN)));}\n", _context, _device);
360
361 s_grad_pass2 = new ProgramCL("grad_pass2",
362 "#define BLOCK_DIMX 32\n"
363 "#define BLOCK_DIMY 14\n"
364 "#define BLOCK_SIZE (BLOCK_DIMX * BLOCK_DIMY)\n"
365 "__kernel void grad_pass2(__read_only image2d_t tex, __read_only image2d_t texp,\n"
366 " __write_only image2d_t dog, int width, int height,\n"
367 " __write_only image2d_t grd, __write_only image2d_t rot){\n"//, __local float* block) {\n"
368 "__local float block[BLOCK_SIZE * 4]; \n"
369 "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |\n"
370 " CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
371 "int2 coord = (int2) ( get_global_id(0) - get_group_id(0) * 2 - 1, \n"
372 " get_global_id(1) - get_group_id(1) * 2- 1); \n"
373 "int idx = mad24(get_local_id(1), BLOCK_DIMX, get_local_id(0));\n"
374 "float4 cc = read_imagef(tex, sampler, coord);\n"
375 "block[idx ] = cc.x;\n"
376 "block[idx + BLOCK_SIZE ] = cc.y;\n"
377 "block[idx + BLOCK_SIZE * 2] = cc.z;\n"
378 "block[idx + BLOCK_SIZE * 3] = cc.w;\n"
379 "barrier(CLK_LOCAL_MEM_FENCE);\n"
380 "if( get_local_id(0) == 0 || get_local_id(0) == BLOCK_DIMX - 1) return;\n"
381 "if( get_local_id(1) == 0 || get_local_id(1) == BLOCK_DIMY - 1) return;\n"
382 "if( coord.x >= width) return; \n"
383 "if( coord.y >= height) return;\n"
384 "float4 cp = read_imagef(texp, sampler, coord);\n"
385 "float4 dx = (float4)( cc.y - block[idx - 1 + BLOCK_SIZE], \n"
386 " block[idx + 1] - cc.x, \n"
387 " cc.w - block[idx - 1 + 3 * BLOCK_SIZE], \n"
388 " block[idx + 1 + 2 * BLOCK_SIZE] - cc.z);\n"
389 "float4 dy = (float4)( cc.z - block[idx - BLOCK_DIMX + 2 * BLOCK_SIZE], \n"
390 " cc.w - block[idx - BLOCK_DIMX + 3 * BLOCK_SIZE],"
391 //" cc.zw - block[idx - BLOCK_DIMX].zw, \n"
392 " block[idx + BLOCK_DIMX] - cc.x,\n "
393 " block[idx + BLOCK_DIMX + BLOCK_SIZE] - cc.y);\n"
394 //" block[idx + BLOCK_DIMX].xy - cc.xy);\n"
395 "write_imagef(dog, coord, cc - cp); \n"
396 "write_imagef(grd, coord, 0.5 * sqrt(dx*dx + dy * dy));\n"
397 "write_imagef(rot, coord, atan2(dy, dx + (float4) (FLT_MIN)));}\n", _context, _device);
398 }
399
LoadDynamicShaders(SiftParam & param)400 void ProgramBagCL::LoadDynamicShaders(SiftParam& param)
401 {
402 LoadKeypointShader();
403 LoadGenListShader(param._dog_level_num, 0);
404 CreateGaussianFilters(param);
405 }
406
407
SelectInitialSmoothingFilter(int octave_min,SiftParam & param)408 void ProgramBagCL::SelectInitialSmoothingFilter(int octave_min, SiftParam¶m)
409 {
410 float sigma = param.GetInitialSmoothSigma(octave_min);
411 if(sigma == 0)
412 {
413 f_gaussian_skip0 = NULL;
414 }else
415 {
416 for(unsigned int i = 0; i < f_gaussian_skip0_v.size(); i++)
417 {
418 if(f_gaussian_skip0_v[i]->_id == octave_min)
419 {
420 f_gaussian_skip0 = f_gaussian_skip0_v[i];
421 return ;
422 }
423 }
424 FilterCL * filter = CreateGaussianFilter(sigma);
425 filter->_id = octave_min;
426 f_gaussian_skip0_v.push_back(filter);
427 f_gaussian_skip0 = filter;
428 }
429
430 }
431
CreateGaussianFilters(SiftParam & param)432 void ProgramBagCL::CreateGaussianFilters(SiftParam¶m)
433 {
434 if(param._sigma_skip0>0.0f)
435 {
436 f_gaussian_skip0 = CreateGaussianFilter(param._sigma_skip0);
437 f_gaussian_skip0->_id = GlobalUtil::_octave_min_default;
438 f_gaussian_skip0_v.push_back(f_gaussian_skip0);
439 }
440 if(param._sigma_skip1>0.0f)
441 {
442 f_gaussian_skip1 = CreateGaussianFilter(param._sigma_skip1);
443 }
444
445 f_gaussian_step = new FilterCL*[param._sigma_num];
446 for(int i = 0; i< param._sigma_num; i++)
447 {
448 f_gaussian_step[i] = CreateGaussianFilter(param._sigma[i]);
449 }
450 _gaussian_step_num = param._sigma_num;
451 }
452
453
CreateGaussianFilter(float sigma)454 FilterCL* ProgramBagCL::CreateGaussianFilter(float sigma)
455 {
456 //pixel inside 3*sigma box
457 int sz = int( ceil( GlobalUtil::_FilterWidthFactor * sigma -0.5) ) ;//
458 int width = 2*sz + 1;
459
460 //filter size truncation
461 if(GlobalUtil::_MaxFilterWidth >0 && width > GlobalUtil::_MaxFilterWidth)
462 {
463 std::cout<<"Filter size truncated from "<<width<<" to "<<GlobalUtil::_MaxFilterWidth<<endl;
464 sz = GlobalUtil::_MaxFilterWidth>>1;
465 width = 2 * sz + 1;
466 }
467
468 int i;
469 float * kernel = new float[width];
470 float rv = 1.0f/(sigma*sigma);
471 float v, ksum =0;
472
473 // pre-compute filter
474 for( i = -sz ; i <= sz ; ++i)
475 {
476 kernel[i+sz] = v = exp(-0.5f * i * i *rv) ;
477 ksum += v;
478 }
479
480 //normalize the kernel
481 rv = 1.0f / ksum;
482 for(i = 0; i< width ;i++) kernel[i]*=rv;
483
484 FilterCL * filter = CreateFilter(kernel, width);
485 delete [] kernel;
486 if(GlobalUtil::_verbose && GlobalUtil::_timingL) std::cout<<"Filter: sigma = "<<sigma<<", size = "<<width<<"x"<<width<<endl;
487 return filter;
488 }
489
CreateFilter(float kernel[],int width)490 FilterCL* ProgramBagCL::CreateFilter(float kernel[], int width)
491 {
492 FilterCL * filter = new FilterCL;
493 filter->s_shader_h = CreateFilterH(kernel, width);
494 filter->s_shader_v = CreateFilterV(kernel, width);
495 filter->_size = width;
496 filter->_id = 0;
497 return filter;
498 }
499
CreateFilterH(float kernel[],int width)500 ProgramCL* ProgramBagCL::CreateFilterH(float kernel[], int width)
501 {
502 int halfwidth = width >>1;
503 float * pf = kernel + halfwidth;
504 int nhpixel = (halfwidth+1)>>1; //how many neighbour pixels need to be looked up
505 int npixel = (nhpixel<<1)+1;//
506 float weight[3];
507
508 ////////////////////////////
509 char buffer[10240];
510 ostrstream out(buffer, 10240);
511 out<<setprecision(8);
512
513
514 //CL_DEVICE_IMAGE2D_MAX_WIDTH
515 out<<
516 "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;"
517 "__kernel void filter_h(__read_only image2d_t input, \n"
518 " __write_only image2d_t output, int width_, int height_) {\n"
519 "int x = get_global_id(0);\n"
520 "int y = get_global_id(1); \n"
521 "if( x > width_ || y > height_) return; \n"
522 "float4 pc; int2 coord; \n"
523 "float4 result = (float4)(0.0);\n";
524 for(int i = 0 ; i < npixel ; i++)
525 {
526 out<<"coord = (int2)(x + ("<< (i - nhpixel) << "), y);\n";
527 out<<"pc= read_imagef(input, sampler, coord);\n";
528 if(GlobalUtil::_PreciseBorder)
529 out<<"if(coord.x < 0) pc = pc.xxzz; else if (coord.x > width_) pc = pc.yyww; \n";
530 //for each sub-pixel j in center, the weight of sub-pixel k
531 int xw = (i - nhpixel)*2;
532 for(int j = 0; j < 3; j++)
533 {
534 int xwn = xw + j -1;
535 weight[j] = xwn < -halfwidth || xwn > halfwidth? 0 : pf[xwn];
536 }
537 if(weight[1] == 0.0)
538 {
539 out<<"result += (float4)("<<weight[2]<<","<<weight[0]<<","<<weight[2]<<","<<weight[0]<<") * pc.yxwz;\n";
540 }
541 else
542 {
543 out<<"result += (float4)("<<weight[1]<<", "<<weight[0]<<", "<<weight[1]<<", "<<weight[0]<<") * pc.xxzz;\n";
544 out<<"result += (float4)("<<weight[2]<<", "<<weight[1]<<", "<<weight[2]<<", "<<weight[1]<<") * pc.yyww;\n";
545 }
546 }
547 out << "write_imagef(output, (int2)(x, y), result); }\n" << '\0';
548 return new ProgramCL("filter_h", buffer, _context, _device);
549 }
550
551
552
CreateFilterV(float kernel[],int width)553 ProgramCL* ProgramBagCL::CreateFilterV(float kernel[], int width)
554 {
555
556 int halfwidth = width >>1;
557 float * pf = kernel + halfwidth;
558 int nhpixel = (halfwidth+1)>>1; //how many neighbour pixels need to be looked up
559 int npixel = (nhpixel<<1)+1;//
560 float weight[3];
561
562 ////////////////////////////
563 char buffer[10240];
564 ostrstream out(buffer, 10240);
565 out<<setprecision(8);
566
567
568 //CL_DEVICE_IMAGE2D_MAX_WIDTH
569 out<<
570 "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;"
571 "__kernel void filter_v(__read_only image2d_t input, \n"
572 " __write_only image2d_t output, int width_, int height_) {\n"
573 "int x = get_global_id(0);\n"
574 "int y = get_global_id(1); \n"
575 "if( x > width_ || y >= height_) return; \n"
576 "float4 pc; int2 coord; \n"
577 "float4 result = (float4)(0.0);\n";
578 for(int i = 0 ; i < npixel ; i++)
579 {
580 out<<"coord = (int2)(x, y + ("<< (i - nhpixel) << "));\n";
581 out<<"pc= read_imagef(input, sampler, coord);\n";
582 if(GlobalUtil::_PreciseBorder)
583 out<<"if(coord.y < 0) pc = pc.xyxy; else if (coord.y > height_) pc = pc.zwzw; \n";
584 //for each sub-pixel j in center, the weight of sub-pixel k
585 int xw = (i - nhpixel)*2;
586 for(int j = 0; j < 3; j++)
587 {
588 int xwn = xw + j -1;
589 weight[j] = xwn < -halfwidth || xwn > halfwidth? 0 : pf[xwn];
590 }
591 if(weight[1] == 0.0)
592 {
593 out<<"result += (float4)("<<weight[2]<<","<<weight[2]<<","<<weight[0]<<","<<weight[0]<<") * pc.zwxy;\n";
594 }
595 else
596 {
597 out<<"result += (float4)("<<weight[1]<<", "<<weight[1]<<", "<<weight[0]<<", "<<weight[0]<<") * pc.xyxy;\n";
598 out<<"result += (float4)("<<weight[2]<<", "<<weight[2]<<", "<<weight[1]<<", "<<weight[1]<<") * pc.zwzw;\n";
599 }
600 }
601 out << "write_imagef(output, (int2)(x, y), result); }\n" << '\0';
602 return new ProgramCL("filter_v", buffer, _context, _device);
603
604 }
605
FilterImage(FilterCL * filter,CLTexImage * dst,CLTexImage * src,CLTexImage * tmp)606 void ProgramBagCL::FilterImage(FilterCL* filter, CLTexImage *dst, CLTexImage *src, CLTexImage*tmp)
607 {
608 cl_kernel kernelh = filter->s_shader_h->_kernel;
609 cl_kernel kernelv = filter->s_shader_v->_kernel;
610 //////////////////////////////////////////////////////////////////
611
612 cl_int status, w = dst->GetImgWidth(), h = dst->GetImgHeight();
613 cl_int w_ = w - 1, h_ = h - 1;
614
615 size_t dim0 = 16, dim1 = 16;
616 size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1};
617
618 clSetKernelArg(kernelh, 0, sizeof(cl_mem), &src->_clData);
619 clSetKernelArg(kernelh, 1, sizeof(cl_mem), &tmp->_clData);
620 clSetKernelArg(kernelh, 2, sizeof(cl_int), &w_);
621 clSetKernelArg(kernelh, 3, sizeof(cl_int), &h_);
622 status = clEnqueueNDRangeKernel(_queue, kernelh, 2, NULL, gsz, lsz, 0, NULL, NULL);
623 CheckErrorCL(status, "ProgramBagCL::FilterImageH");
624 if(status != CL_SUCCESS) return;
625
626 clSetKernelArg(kernelv, 0, sizeof(cl_mem), &tmp->_clData);
627 clSetKernelArg(kernelv, 1, sizeof(cl_mem), &dst->_clData);
628 clSetKernelArg(kernelv, 2, sizeof(cl_int), &w_);
629 clSetKernelArg(kernelv, 3, sizeof(cl_int), &h_);
630 size_t gsz2[2] = {(w + dim1 - 1) / dim1 * dim1, (h + dim0 - 1) / dim0 * dim0}, lsz2[2] = {dim1, dim0};
631 status = clEnqueueNDRangeKernel(_queue, kernelv, 2, NULL, gsz2, lsz2, 0, NULL, NULL);
632 CheckErrorCL(status, "ProgramBagCL::FilterImageV");
633 //clReleaseEvent(event);
634 }
635
SampleImageU(CLTexImage * dst,CLTexImage * src,int log_scale)636 void ProgramBagCL::SampleImageU(CLTexImage *dst, CLTexImage *src, int log_scale)
637 {
638 cl_kernel kernel= s_sampling_u->_kernel;
639 float scale = 1.0f / (1 << log_scale);
640 float offset = scale * 0.5f;
641 cl_int w = dst->GetImgWidth(), h = dst->GetImgHeight();
642 clSetKernelArg(kernel, 0, sizeof(cl_mem), &(src->_clData));
643 clSetKernelArg(kernel, 1, sizeof(cl_mem), &(dst->_clData));
644 clSetKernelArg(kernel, 2, sizeof(cl_int), &(w));
645 clSetKernelArg(kernel, 3, sizeof(cl_int), &(h));
646 clSetKernelArg(kernel, 4, sizeof(cl_float), &(scale));
647 clSetKernelArg(kernel, 5, sizeof(cl_float), &(offset));
648
649 size_t dim0 = 16, dim1 = 16;
650 //while( w * h / dim0 / dim1 < 8 && dim1 > 1) dim1 /= 2;
651 size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1};
652 cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL);
653 CheckErrorCL(status, "ProgramBagCL::SampleImageU");
654 }
655
SampleImageD(CLTexImage * dst,CLTexImage * src,int log_scale)656 void ProgramBagCL::SampleImageD(CLTexImage *dst, CLTexImage *src, int log_scale)
657 {
658 cl_kernel kernel;
659 cl_int w = dst->GetImgWidth(), h = dst->GetImgHeight();
660 if(log_scale == 1)
661 {
662 kernel = s_sampling->_kernel;
663 clSetKernelArg(kernel, 0, sizeof(cl_mem), &(src->_clData));
664 clSetKernelArg(kernel, 1, sizeof(cl_mem), &(dst->_clData));
665 clSetKernelArg(kernel, 2, sizeof(cl_int), &(w));
666 clSetKernelArg(kernel, 3, sizeof(cl_int), &(h));
667 }else
668 {
669 cl_int fullstep = (1 << log_scale);
670 cl_int halfstep = fullstep >> 1;
671 kernel = s_sampling_k->_kernel;
672 clSetKernelArg(kernel, 0, sizeof(cl_mem), &(src->_clData));
673 clSetKernelArg(kernel, 1, sizeof(cl_mem), &(dst->_clData));
674 clSetKernelArg(kernel, 2, sizeof(cl_int), &(w));
675 clSetKernelArg(kernel, 3, sizeof(cl_int), &(h));
676 clSetKernelArg(kernel, 4, sizeof(cl_int), &(fullstep));
677 clSetKernelArg(kernel, 5, sizeof(cl_int), &(halfstep));
678 }
679 size_t dim0 = 128, dim1 = 1;
680 //while( w * h / dim0 / dim1 < 8 && dim1 > 1) dim1 /= 2;
681 size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1};
682 cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL);
683 CheckErrorCL(status, "ProgramBagCL::SampleImageD");
684 }
685
FilterInitialImage(CLTexImage * tex,CLTexImage * buf)686 void ProgramBagCL::FilterInitialImage(CLTexImage* tex, CLTexImage* buf)
687 {
688 if(f_gaussian_skip0) FilterImage(f_gaussian_skip0, tex, tex, buf);
689 }
690
FilterSampledImage(CLTexImage * tex,CLTexImage * buf)691 void ProgramBagCL::FilterSampledImage(CLTexImage* tex, CLTexImage* buf)
692 {
693 if(f_gaussian_skip1) FilterImage(f_gaussian_skip1, tex, tex, buf);
694 }
695
ComputeDOG(CLTexImage * tex,CLTexImage * texp,CLTexImage * dog,CLTexImage * grad,CLTexImage * rot)696 void ProgramBagCL::ComputeDOG(CLTexImage*tex, CLTexImage* texp, CLTexImage* dog, CLTexImage* grad, CLTexImage* rot)
697 {
698 int margin = 0, use_gm2 = 1;
699 bool both_grad_dog = rot->_clData && grad->_clData;
700 cl_int w = tex->GetImgWidth(), h = tex->GetImgHeight();
701 cl_kernel kernel ; size_t dim0, dim1;
702 if(!both_grad_dog) {kernel = s_dog_pass->_kernel; dim0 = 16; dim1 = 12; }
703 else if(use_gm2) {kernel = s_grad_pass2->_kernel; dim0 = 32; dim1 = 14; margin = 2; }
704 else {kernel = s_grad_pass->_kernel; dim0 = 16; dim1 = 20; }
705 size_t gsz[2] = { (w + dim0 - 1 - margin) / (dim0 - margin) * dim0,
706 (h + dim1 - 1 - margin) / (dim1 - margin) * dim1};
707 size_t lsz[2] = {dim0, dim1};
708 clSetKernelArg(kernel, 0, sizeof(cl_mem), &(tex->_clData));
709 clSetKernelArg(kernel, 1, sizeof(cl_mem), &(texp->_clData));
710 clSetKernelArg(kernel, 2, sizeof(cl_mem), &(dog->_clData));
711 clSetKernelArg(kernel, 3, sizeof(cl_int), &(w));
712 clSetKernelArg(kernel, 4, sizeof(cl_int), &(h));
713 if(both_grad_dog)
714 {
715 clSetKernelArg(kernel, 5, sizeof(cl_mem), &(grad->_clData));
716 clSetKernelArg(kernel, 6, sizeof(cl_mem), &(rot->_clData));
717 }
718 ///////////////////////////////////////////////////////
719 cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL);
720 CheckErrorCL(status, "ProgramBagCL::ComputeDOG");
721 }
722
723
ComputeKEY(CLTexImage * dog,CLTexImage * key,float Tdog,float Tedge)724 void ProgramBagCL::ComputeKEY(CLTexImage*dog, CLTexImage* key, float Tdog, float Tedge)
725 {
726 cl_kernel kernel = s_keypoint->_kernel;
727 cl_int w = key->GetImgWidth(), h = key->GetImgHeight();
728 float threshold0 = Tdog* (GlobalUtil::_SubpixelLocalization?0.8f:1.0f);
729 float threshold1 = Tdog;
730 float threshold2 = (Tedge+1)*(Tedge+1)/Tedge;
731
732 clSetKernelArg(kernel, 0, sizeof(cl_mem), &(dog->_clData));
733 clSetKernelArg(kernel, 1, sizeof(cl_mem), &((dog + 1)->_clData));
734 clSetKernelArg(kernel, 2, sizeof(cl_mem), &((dog - 1)->_clData));
735 clSetKernelArg(kernel, 3, sizeof(cl_mem), &(key->_clData));
736 clSetKernelArg(kernel, 4, sizeof(cl_float), &(threshold0));
737 clSetKernelArg(kernel, 5, sizeof(cl_float), &(threshold1));
738 clSetKernelArg(kernel, 6, sizeof(cl_float), &(threshold2));
739 clSetKernelArg(kernel, 7, sizeof(cl_int), &(w));
740 clSetKernelArg(kernel, 8, sizeof(cl_int), &(h));
741
742 size_t dim0 = 8, dim1 = 8;
743 //if( w * h / dim0 / dim1 < 16) dim1 /= 2;
744 size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1};
745 cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL);
746 CheckErrorCL(status, "ProgramBagCL::ComputeKEY");
747 }
748
UnpackImage(CLTexImage * src,CLTexImage * dst)749 void ProgramBagCL::UnpackImage(CLTexImage*src, CLTexImage* dst)
750 {
751 cl_kernel kernel = s_unpack->_kernel;
752 cl_int w = dst->GetImgWidth(), h = dst->GetImgHeight();
753 clSetKernelArg(kernel, 0, sizeof(cl_mem), &(src->_clData));
754 clSetKernelArg(kernel, 1, sizeof(cl_mem), &(dst->_clData));
755 clSetKernelArg(kernel, 2, sizeof(cl_int), &(w));
756 clSetKernelArg(kernel, 3, sizeof(cl_int), &(h));
757 const size_t dim0 = 16, dim1 = 16;
758 size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1};
759 cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL);
760
761 CheckErrorCL(status, "ProgramBagCL::UnpackImage");
762 FinishCL();
763
764 }
765
UnpackImageDOG(CLTexImage * src,CLTexImage * dst)766 void ProgramBagCL::UnpackImageDOG(CLTexImage*src, CLTexImage* dst)
767 {
768 if(s_unpack_dog == NULL) return;
769 cl_kernel kernel = s_unpack_dog->_kernel;
770 cl_int w = dst->GetImgWidth(), h = dst->GetImgHeight();
771 clSetKernelArg(kernel, 0, sizeof(cl_mem), &(src->_clData));
772 clSetKernelArg(kernel, 1, sizeof(cl_mem), &(dst->_clData));
773 clSetKernelArg(kernel, 2, sizeof(cl_int), &(w));
774 clSetKernelArg(kernel, 3, sizeof(cl_int), &(h));
775 const size_t dim0 = 16, dim1 = 16;
776 size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1};
777 cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL);
778
779 CheckErrorCL(status, "ProgramBagCL::UnpackImage");
780 FinishCL();
781 }
782
UnpackImageGRD(CLTexImage * src,CLTexImage * dst)783 void ProgramBagCL::UnpackImageGRD(CLTexImage*src, CLTexImage* dst)
784 {
785 if(s_unpack_grd == NULL) return;
786 cl_kernel kernel = s_unpack_grd->_kernel;
787 cl_int w = dst->GetImgWidth(), h = dst->GetImgHeight();
788 clSetKernelArg(kernel, 0, sizeof(cl_mem), &(src->_clData));
789 clSetKernelArg(kernel, 1, sizeof(cl_mem), &(dst->_clData));
790 clSetKernelArg(kernel, 2, sizeof(cl_int), &(w));
791 clSetKernelArg(kernel, 3, sizeof(cl_int), &(h));
792 const size_t dim0 = 16, dim1 = 16;
793 size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1};
794 cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL);
795
796 CheckErrorCL(status, "ProgramBagCL::UnpackImage");
797 FinishCL();
798 }
UnpackImageKEY(CLTexImage * src,CLTexImage * dog,CLTexImage * dst)799 void ProgramBagCL::UnpackImageKEY(CLTexImage*src, CLTexImage* dog, CLTexImage* dst)
800 {
801 if(s_unpack_key == NULL) return;
802 cl_kernel kernel = s_unpack_key->_kernel;
803 cl_int w = dst->GetImgWidth(), h = dst->GetImgHeight();
804 clSetKernelArg(kernel, 0, sizeof(cl_mem), &(dog->_clData));
805 clSetKernelArg(kernel, 1, sizeof(cl_mem), &(src->_clData));
806 clSetKernelArg(kernel, 2, sizeof(cl_mem), &(dst->_clData));
807 clSetKernelArg(kernel, 3, sizeof(cl_int), &(w));
808 clSetKernelArg(kernel, 4, sizeof(cl_int), &(h));
809 const size_t dim0 = 16, dim1 = 16;
810 size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1};
811 cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL);
812
813 CheckErrorCL(status, "ProgramBagCL::UnpackImageKEY");
814 FinishCL();
815 }
LoadDescriptorShader()816 void ProgramBagCL::LoadDescriptorShader()
817 {
818 GlobalUtil::_DescriptorPPT = 16;
819 LoadDescriptorShaderF2();
820 }
821
LoadDescriptorShaderF2()822 void ProgramBagCL::LoadDescriptorShaderF2()
823 {
824
825 }
826
LoadOrientationShader(void)827 void ProgramBagCL::LoadOrientationShader(void)
828 {
829
830 }
831
LoadGenListShader(int ndoglev,int nlev)832 void ProgramBagCL::LoadGenListShader(int ndoglev,int nlev)
833 {
834
835 }
836
LoadKeypointShader()837 void ProgramBagCL::LoadKeypointShader()
838 {
839 int i; char buffer[20240];
840 ostrstream out(buffer, 20240);
841 streampos pos;
842
843 //tex(X)(Y)
844 //X: (CLR) (CENTER 0, LEFT -1, RIGHT +1)
845 //Y: (CDU) (CENTER 0, DOWN -1, UP +1)
846 out<<
847 "__kernel void keypoint(__read_only image2d_t tex, __read_only image2d_t texU,\n"
848 " __read_only image2d_t texD, __write_only image2d_t texK,\n"
849 " float THRESHOLD0, float THRESHOLD1, \n"
850 " float THRESHOLD2, int width, int height)\n"
851 "{\n"
852 " sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | \n"
853 " CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;"
854 " int x = get_global_id(0), y = get_global_id(1);\n"
855 " if(x >= width || y >= height) return; \n"
856 " int xp = x - 1, xn = x + 1;\n"
857 " int yp = y - 1, yn = y + 1;\n"
858 " int2 coord0 = (int2) (x, y); \n"
859 " int2 coord1 = (int2) (xp, y); \n"
860 " int2 coord2 = (int2) (xn, y); \n"
861 " int2 coord3 = (int2) (x, yp); \n"
862 " int2 coord4 = (int2) (x, yn); \n"
863 " int2 coord5 = (int2) (xp, yp); \n"
864 " int2 coord6 = (int2) (xp, yn); \n"
865 " int2 coord7 = (int2) (xn, yp); \n"
866 " int2 coord8 = (int2) (xn, yn); \n"
867 " float4 ccc = read_imagef(tex, sampler,coord0);\n"
868 " float4 clc = read_imagef(tex, sampler,coord1);\n"
869 " float4 crc = read_imagef(tex, sampler,coord2);\n"
870 " float4 ccd = read_imagef(tex, sampler,coord3);\n"
871 " float4 ccu = read_imagef(tex, sampler,coord4);\n"
872 " float4 cld = read_imagef(tex, sampler,coord5);\n"
873 " float4 clu = read_imagef(tex, sampler,coord6);\n"
874 " float4 crd = read_imagef(tex, sampler,coord7);\n"
875 " float4 cru = read_imagef(tex, sampler,coord8);\n"
876 " float4 cc = ccc;\n"
877 " float4 v1[4], v2[4];\n"
878 " v1[0] = (float4)(clc.y, ccc.y, ccd.z, ccc.z);\n"
879 " v1[1] = (float4)(ccc.x, crc.x, ccd.w, ccc.w);\n"
880 " v1[2] = (float4)(clc.w, ccc.w, ccc.x, ccu.x);\n"
881 " v1[3] = (float4)(ccc.z, crc.z, ccc.y, ccu.y);\n"
882 " v2[0] = (float4)(cld.w, clc.w, ccd.w, ccc.w);\n"
883 " v2[1] = (float4)(ccd.z, ccc.z, crd.z, crc.z);\n"
884 " v2[2] = (float4)(clc.y, clu.y, ccc.y, ccu.y);\n"
885 " v2[3] = (float4)(ccc.x, ccu.x, crc.x, cru.x);\n"
886 " float4 key4 = (float4)(0); \n";
887 //test against 8 neighbours
888 //use variable to identify type of extremum
889 //1.0 for local maximum and -1.0 for minimum
890 for(i = 0; i < 4; ++i)
891 out<<
892 " if(cc.s"<<i<<" > THRESHOLD0){ \n"
893 " if(all(isgreater((float4)(cc.s"<<i<<"), max(v1["<<i<<"], v2["<<i<<"]))))key4.s"<<i<<" = 1.0;\n"
894 " }else if(cc.s"<<i<<" < -THRESHOLD0){ \n"
895 " if(all(isless((float4)(cc.s"<<i<<"), min(v1["<<i<<"], v2["<<i<<"]))))key4.s"<<i<<" = -1.0;\n"
896 " }";
897
898 out<<
899 " if(x ==0) {key4.x = key4.z= 0; }\n"
900 " else if(x + 1 == width) {key4.y = key4.w = 0;}\n"
901 " if(y ==0) {key4.x = key4.y = 0; }\n"
902 " else if(y + 1 == height) {key4.z = key4.w = 0;}\n"
903 " float4 ak = fabs(key4); \n"
904 " float keysum = ak.x + ak.y + ak.z + ak.w; \n"
905 " float4 result = (float4)(0.0);\n"
906 " if(keysum == 1.0) {\n"
907 " float fxx[4], fyy[4], fxy[4], fx[4], fy[4];\n";
908
909 //do edge supression first..
910 //vector v1 is < (-1, 0), (1, 0), (0,-1), (0, 1)>
911 //vector v2 is < (-1,-1), (-1,1), (1,-1), (1, 1)>
912 for(i = 0; i < 4; ++i)
913 out <<
914 " if(key4.s"<<i<<" != 0)\n"
915 " {\n"
916 " float4 D2 = v1["<<i<<"].xyzw - cc.s"<<i<<";\n"
917 " float2 D4 = v2["<<i<<"].xw - v2["<<i<<"].yz;\n"
918 " float2 D5 = 0.5*(v1["<<i<<"].yw-v1["<<i<<"].xz); \n"
919 " fx["<<i<<"] = D5.x; fy["<<i<<"] = D5.y ;\n"
920 " fxx["<<i<<"] = D2.x + D2.y;\n"
921 " fyy["<<i<<"] = D2.z + D2.w;\n"
922 " fxy["<<i<<"] = 0.25*(D4.x + D4.y);\n"
923 " float fxx_plus_fyy = fxx["<<i<<"] + fyy["<<i<<"];\n"
924 " float score_up = fxx_plus_fyy*fxx_plus_fyy; \n"
925 " float score_down = (fxx["<<i<<"]*fyy["<<i<<"] - fxy["<<i<<"]*fxy["<<i<<"]);\n"
926 " if( score_down <= 0 || score_up > THRESHOLD2 * score_down)keysum = 0;\n"
927 " }\n";
928
929 out <<
930 " if(keysum == 1) {\n";
931 ////////////////////////////////////////////////
932 //read 9 pixels of upper/lower level
933 out<<
934 " float4 v4[4], v5[4], v6[4];\n"
935 " ccc = read_imagef(texU, sampler,coord0);\n"
936 " clc = read_imagef(texU, sampler,coord1);\n"
937 " crc = read_imagef(texU, sampler,coord2);\n"
938 " ccd = read_imagef(texU, sampler,coord3);\n"
939 " ccu = read_imagef(texU, sampler,coord4);\n"
940 " cld = read_imagef(texU, sampler,coord5);\n"
941 " clu = read_imagef(texU, sampler,coord6);\n"
942 " crd = read_imagef(texU, sampler,coord7);\n"
943 " cru = read_imagef(texU, sampler,coord8);\n"
944 " float4 cu = ccc;\n"
945 " v4[0] = (float4)(clc.y, ccc.y, ccd.z, ccc.z);\n"
946 " v4[1] = (float4)(ccc.x, crc.x, ccd.w, ccc.w);\n"
947 " v4[2] = (float4)(clc.w, ccc.w, ccc.x, ccu.x);\n"
948 " v4[3] = (float4)(ccc.z, crc.z, ccc.y, ccu.y);\n"
949 " v6[0] = (float4)(cld.w, clc.w, ccd.w, ccc.w);\n"
950 " v6[1] = (float4)(ccd.z, ccc.z, crd.z, crc.z);\n"
951 " v6[2] = (float4)(clc.y, clu.y, ccc.y, ccu.y);\n"
952 " v6[3] = (float4)(ccc.x, ccu.x, crc.x, cru.x);\n";
953
954 for(i = 0; i < 4; ++i)
955 out <<
956 " if(key4.s"<<i<<" == 1.0)\n"
957 " {\n"
958 " if(cc.s"<<i<<" < cu.s"<<i<<" || \n"
959 " any(isless((float4)(cc.s"<<i<<"), max(v4["<<i<<"], v6["<<i<<"]))))keysum = 0; \n"
960 " }else if(key4.s"<<i<<" == -1.0)\n"
961 " {\n"
962 " if(cc.s"<<i<<" > cu.s"<<i<<" || \n"
963 " any(isgreater((float4)(cc.s"<<i<<"), min(v4["<<i<<"], v6["<<i<<"]))) )keysum = 0; \n"
964 " }\n";
965
966 out <<
967 " if(keysum == 1.0) { \n";
968 out <<
969 " ccc = read_imagef(texD, sampler,coord0);\n"
970 " clc = read_imagef(texD, sampler,coord1);\n"
971 " crc = read_imagef(texD, sampler,coord2);\n"
972 " ccd = read_imagef(texD, sampler,coord3);\n"
973 " ccu = read_imagef(texD, sampler,coord4);\n"
974 " cld = read_imagef(texD, sampler,coord5);\n"
975 " clu = read_imagef(texD, sampler,coord6);\n"
976 " crd = read_imagef(texD, sampler,coord7);\n"
977 " cru = read_imagef(texD, sampler,coord8);\n"
978 " float4 cd = ccc;\n"
979 " v5[0] = (float4)(clc.y, ccc.y, ccd.z, ccc.z);\n"
980 " v5[1] = (float4)(ccc.x, crc.x, ccd.w, ccc.w);\n"
981 " v5[2] = (float4)(clc.w, ccc.w, ccc.x, ccu.x);\n"
982 " v5[3] = (float4)(ccc.z, crc.z, ccc.y, ccu.y);\n"
983 " v6[0] = (float4)(cld.w, clc.w, ccd.w, ccc.w);\n"
984 " v6[1] = (float4)(ccd.z, ccc.z, crd.z, crc.z);\n"
985 " v6[2] = (float4)(clc.y, clu.y, ccc.y, ccu.y);\n"
986 " v6[3] = (float4)(ccc.x, ccu.x, crc.x, cru.x);\n";
987 for(i = 0; i < 4; ++i)
988 out <<
989 " if(key4.s"<<i<<" == 1.0)\n"
990 " {\n"
991 " if(cc.s"<<i<<" < cd.s"<<i<<" ||\n"
992 " any(isless((float4)(cc.s"<<i<<"), max(v5["<<i<<"], v6["<<i<<"]))))keysum = 0; \n"
993 " }else if(key4.s"<<i<<" == -1.0)\n"
994 " {\n"
995 " if(cc.s"<<i<<" > cd.s"<<i<<" ||\n"
996 " any(isgreater((float4)(cc.s"<<i<<"), min(v5["<<i<<"], v6["<<i<<"]))))keysum = 0; \n"
997 " }\n";
998
999 out <<
1000 " if(keysum==1.0) {\n";
1001 //////////////////////////////////////////////////////////////////////
1002 if(GlobalUtil::_SubpixelLocalization)
1003 {
1004 out <<
1005 " float4 offset = (float4)(0); \n";
1006 for(i = 1; i < 4; ++i)
1007 out <<
1008 " if(key4.s"<<i<<" != 0) \n"
1009 " {\n"
1010 " cu.s0 = cu.s"<<i<<"; cd.s0 = cd.s"<<i<<"; cc.s0 = cc.s"<<i<<"; \n"
1011 " v4[0] = v4["<<i<<"]; v5[0] = v5["<<i<<"]; \n"
1012 " fxy[0] = fxy["<<i<<"]; fxx[0] = fxx["<<i<<"]; fyy[0] = fyy["<<i<<"]; \n"
1013 " fx[0] = fx["<<i<<"]; fy[0] = fy["<<i<<"]; \n"
1014 " }\n";
1015
1016 out <<
1017 " float fs = 0.5*( cu.s0 - cd.s0 ); \n"
1018 " float fss = cu.s0 + cd.s0 - cc.s0 - cc.s0;\n"
1019 " float fxs = 0.25 * (v4[0].y + v5[0].x - v4[0].x - v5[0].y);\n"
1020 " float fys = 0.25 * (v4[0].w + v5[0].z - v4[0].z - v5[0].w);\n"
1021 " float4 A0, A1, A2 ; \n"
1022 " A0 = (float4)(fxx[0], fxy[0], fxs, -fx[0]); \n"
1023 " A1 = (float4)(fxy[0], fyy[0], fys, -fy[0]); \n"
1024 " A2 = (float4)(fxs, fys, fss, -fs); \n"
1025 " float4 x3 = fabs((float4)(fxx[0], fxy[0], fxs, 0)); \n"
1026 " float maxa = max(max(x3.x, x3.y), x3.z); \n"
1027 " if(maxa >= 1e-10 ) \n"
1028 " { \n"
1029 " if(x3.y ==maxa ) \n"
1030 " { \n"
1031 " float4 TEMP = A1; A1 = A0; A0 = TEMP; \n"
1032 " }else if( x3.z == maxa ) \n"
1033 " { \n"
1034 " float4 TEMP = A2; A2 = A0; A0 = TEMP; \n"
1035 " } \n"
1036 " A0 /= A0.x; \n"
1037 " A1 -= A1.x * A0; \n"
1038 " A2 -= A2.x * A0; \n"
1039 " float2 x2 = fabs((float2)(A1.y, A2.y)); \n"
1040 " if( x2.y > x2.x ) \n"
1041 " { \n"
1042 " float4 TEMP = A2.yzwx; \n"
1043 " A2.yzw = A1.yzw; \n"
1044 " A1.yzw = TEMP.xyz; \n"
1045 " x2.x = x2.y; \n"
1046 " } \n"
1047 " if(x2.x >= 1e-10) { \n"
1048 " A1.yzw /= A1.y; \n"
1049 " A2.yzw -= A2.y * A1.yzw; \n"
1050 " if(fabs(A2.z) >= 1e-10) {\n"
1051 " offset.z = A2.w /A2.z; \n"
1052 " offset.y = A1.w - offset.z*A1.z; \n"
1053 " offset.x = A0.w - offset.z*A0.z - offset.y*A0.y; \n"
1054 " if(fabs(cc.s0 + 0.5*dot((float4)(fx[0], fy[0], fs, 0), offset ))<=THRESHOLD1\n"
1055 " || any( isgreater(fabs(offset), (float4)(1.0)))) key4 = (float4)(0.0);\n"
1056 " }\n"
1057 " }\n"
1058 " }\n"
1059 <<"\n"
1060 " float keyv = dot(key4, (float4)(1.0, 2.0, 3.0, 4.0));\n"
1061 " result = (float4)(keyv, offset.xyz);\n"
1062 " }}}}\n"
1063 " write_imagef(texK, coord0, result);\n "
1064 "}\n" <<'\0';
1065 }
1066 else
1067 {
1068 out << "\n"
1069 " float keyv = dot(key4, (float4)(1.0, 2.0, 3.0, 4.0));\n"
1070 " result = (float4)(keyv, 0, 0, 0);\n"
1071 " }}}}\n"
1072 " write_imagef(texK, coord0, result);\n "
1073 "}\n" <<'\0';
1074 }
1075
1076 s_keypoint = new ProgramCL("keypoint", buffer, _context, _device);
1077 }
1078
LoadDisplayShaders()1079 void ProgramBagCL::LoadDisplayShaders()
1080 {
1081 //"uniform sampler2DRect tex; void main(){\n"
1082 //"vec4 pc = texture2DRect(tex, gl_TexCoord[0].xy); bvec2 ff = lessThan(fract(gl_TexCoord[0].xy), vec2(0.5));\n"
1083 //"float v = ff.y?(ff.x? pc.r : pc.g):(ff.x?pc.b:pc.a); gl_FragColor = vec4(vec3(v), 1.0);}");
1084 s_unpack = new ProgramCL("main",
1085 "__kernel void main(__read_only image2d_t input, __write_only image2d_t output,\n"
1086 " int width, int height) {\n"
1087 "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
1088 "int x = get_global_id(0), y = get_global_id(1); \n"
1089 "if(x >= width || y >= height) return;\n"
1090 "int xx = x / 2, yy = y / 2; \n"
1091 "float4 vv = read_imagef(input, sampler, (int2) (xx, yy)); \n"
1092 "float v1 = (x & 1 ? vv.w : vv.z); \n"
1093 "float v2 = (x & 1 ? vv.y : vv.x);\n"
1094 "float v = y & 1 ? v1 : v2;\n"
1095 "float4 result = (float4) (v, v, v, 1);"
1096 "write_imagef(output, (int2) (x, y), result); }" , _context, _device);
1097
1098 s_unpack_dog = new ProgramCL("main",
1099 "__kernel void main(__read_only image2d_t input, __write_only image2d_t output,\n"
1100 " int width, int height) {\n"
1101 "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
1102 "int x = get_global_id(0), y = get_global_id(1); \n"
1103 "if(x >= width || y >= height) return;\n"
1104 "int xx = x / 2, yy = y / 2; \n"
1105 "float4 vv = read_imagef(input, sampler, (int2) (xx, yy)); \n"
1106 "float v1 = (x & 1 ? vv.w : vv.z); \n"
1107 "float v2 = (x & 1 ? vv.y : vv.x);\n"
1108 "float v0 = y & 1 ? v1 : v2;\n"
1109 "float v = 0.5 + 20.0 * v0;\n "
1110 "float4 result = (float4) (v, v, v, 1);"
1111 "write_imagef(output, (int2) (x, y), result); }" , _context, _device);
1112
1113 s_unpack_grd = new ProgramCL("main",
1114 "__kernel void main(__read_only image2d_t input, __write_only image2d_t output,\n"
1115 " int width, int height) {\n"
1116 "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
1117 "int x = get_global_id(0), y = get_global_id(1); \n"
1118 "if(x >= width || y >= height) return;\n"
1119 "int xx = x / 2, yy = y / 2; \n"
1120 "float4 vv = read_imagef(input, sampler, (int2) (xx, yy)); \n"
1121 "float v1 = (x & 1 ? vv.w : vv.z); \n"
1122 "float v2 = (x & 1 ? vv.y : vv.x);\n"
1123 "float v0 = y & 1 ? v1 : v2;\n"
1124 "float v = 5.0 * v0;\n "
1125 "float4 result = (float4) (v, v, v, 1);"
1126 "write_imagef(output, (int2) (x, y), result); }" , _context, _device);
1127
1128 s_unpack_key = new ProgramCL("main",
1129 "__kernel void main(__read_only image2d_t dog,\n"
1130 " __read_only image2d_t key,\n"
1131 " __write_only image2d_t output,\n"
1132 " int width, int height) {\n"
1133 "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
1134 "int x = get_global_id(0), y = get_global_id(1); \n"
1135 "if(x >= width || y >= height) return;\n"
1136 "int xx = x / 2, yy = y / 2; \n"
1137 "float4 kk = read_imagef(key, sampler, (int2) (xx, yy));\n"
1138 "int4 cc = isequal(fabs(kk.xxxx), (float4)(1.0, 2.0, 3.0, 4.0));\n"
1139 "int k1 = (x & 1 ? cc.w : cc.z); \n"
1140 "int k2 = (x & 1 ? cc.y : cc.x);\n"
1141 "int k0 = y & 1 ? k1 : k2;\n"
1142 "float4 result;\n"
1143 "if(k0 != 0){\n"
1144 " //result = kk.x > 0 ? ((float4)(1.0, 0, 0, 1.0)) : ((float4) (0.0, 1.0, 0.0, 1.0)); \n"
1145 " result = kk.x < 0 ? ((float4)(0, 1.0, 0, 1.0)) : ((float4) (1.0, 0.0, 0.0, 1.0)); \n"
1146 "}else{"
1147 "float4 vv = read_imagef(dog, sampler, (int2) (xx, yy));\n"
1148 "float v1 = (x & 1 ? vv.w : vv.z); \n"
1149 "float v2 = (x & 1 ? vv.y : vv.x);\n"
1150 "float v0 = y & 1 ? v1 : v2;\n"
1151 "float v = 0.5 + 20.0 * v0;\n "
1152 "result = (float4) (v, v, v, 1);"
1153 "}\n"
1154 "write_imagef(output, (int2) (x, y), result); }" , _context, _device);
1155 }
1156
1157
SetMarginCopyParam(int xmax,int ymax)1158 void ProgramBagCL::SetMarginCopyParam(int xmax, int ymax)
1159 {
1160
1161 }
1162
SetGradPassParam(int texP)1163 void ProgramBagCL::SetGradPassParam(int texP)
1164 {
1165
1166 }
1167
SetGenListEndParam(int ktex)1168 void ProgramBagCL::SetGenListEndParam(int ktex)
1169 {
1170
1171 }
1172
SetDogTexParam(int texU,int texD)1173 void ProgramBagCL::SetDogTexParam(int texU, int texD)
1174 {
1175
1176 }
1177
SetGenListInitParam(int w,int h)1178 void ProgramBagCL::SetGenListInitParam(int w, int h)
1179 {
1180 float bbox[4] = {(w -1.0f) * 0.5f +0.25f, (w-1.0f) * 0.5f - 0.25f, (h - 1.0f) * 0.5f + 0.25f, (h-1.0f) * 0.5f - 0.25f};
1181
1182 }
1183
1184
SetGenListStartParam(float width,int tex0)1185 void ProgramBagCL::SetGenListStartParam(float width, int tex0)
1186 {
1187
1188 }
1189
1190
1191
SetGenListStepParam(int tex,int tex0)1192 void ProgramBagCL::SetGenListStepParam(int tex, int tex0)
1193 {
1194
1195 }
1196
SetGenVBOParam(float width,float fwidth,float size)1197 void ProgramBagCL::SetGenVBOParam(float width, float fwidth, float size)
1198 {
1199
1200 }
1201
SetSimpleOrientationInput(int oTex,float sigma,float sigma_step)1202 void ProgramBagCL::SetSimpleOrientationInput(int oTex, float sigma, float sigma_step)
1203 {
1204
1205 }
1206
1207
SetFeatureOrientationParam(int gtex,int width,int height,float sigma,int otex,float step)1208 void ProgramBagCL::SetFeatureOrientationParam(int gtex, int width, int height, float sigma, int otex, float step)
1209 {
1210
1211
1212 }
1213
SetFeatureDescirptorParam(int gtex,int otex,float dwidth,float fwidth,float width,float height,float sigma)1214 void ProgramBagCL::SetFeatureDescirptorParam(int gtex, int otex, float dwidth, float fwidth, float width, float height, float sigma)
1215 {
1216
1217 }
1218
1219
1220
GetErrorString(cl_int error)1221 const char* ProgramBagCL::GetErrorString(cl_int error)
1222 {
1223 static const char* errorString[] = {
1224 "CL_SUCCESS",
1225 "CL_DEVICE_NOT_FOUND",
1226 "CL_DEVICE_NOT_AVAILABLE",
1227 "CL_COMPILER_NOT_AVAILABLE",
1228 "CL_MEM_OBJECT_ALLOCATION_FAILURE",
1229 "CL_OUT_OF_RESOURCES",
1230 "CL_OUT_OF_HOST_MEMORY",
1231 "CL_PROFILING_INFO_NOT_AVAILABLE",
1232 "CL_MEM_COPY_OVERLAP",
1233 "CL_IMAGE_FORMAT_MISMATCH",
1234 "CL_IMAGE_FORMAT_NOT_SUPPORTED",
1235 "CL_BUILD_PROGRAM_FAILURE",
1236 "CL_MAP_FAILURE",
1237 "",
1238 "",
1239 "",
1240 "",
1241 "",
1242 "",
1243 "",
1244 "",
1245 "",
1246 "",
1247 "",
1248 "",
1249 "",
1250 "",
1251 "",
1252 "",
1253 "",
1254 "CL_INVALID_VALUE",
1255 "CL_INVALID_DEVICE_TYPE",
1256 "CL_INVALID_PLATFORM",
1257 "CL_INVALID_DEVICE",
1258 "CL_INVALID_CONTEXT",
1259 "CL_INVALID_QUEUE_PROPERTIES",
1260 "CL_INVALID_COMMAND_QUEUE",
1261 "CL_INVALID_HOST_PTR",
1262 "CL_INVALID_MEM_OBJECT",
1263 "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
1264 "CL_INVALID_IMAGE_SIZE",
1265 "CL_INVALID_SAMPLER",
1266 "CL_INVALID_BINARY",
1267 "CL_INVALID_BUILD_OPTIONS",
1268 "CL_INVALID_PROGRAM",
1269 "CL_INVALID_PROGRAM_EXECUTABLE",
1270 "CL_INVALID_KERNEL_NAME",
1271 "CL_INVALID_KERNEL_DEFINITION",
1272 "CL_INVALID_KERNEL",
1273 "CL_INVALID_ARG_INDEX",
1274 "CL_INVALID_ARG_VALUE",
1275 "CL_INVALID_ARG_SIZE",
1276 "CL_INVALID_KERNEL_ARGS",
1277 "CL_INVALID_WORK_DIMENSION",
1278 "CL_INVALID_WORK_GROUP_SIZE",
1279 "CL_INVALID_WORK_ITEM_SIZE",
1280 "CL_INVALID_GLOBAL_OFFSET",
1281 "CL_INVALID_EVENT_WAIT_LIST",
1282 "CL_INVALID_EVENT",
1283 "CL_INVALID_OPERATION",
1284 "CL_INVALID_GL_OBJECT",
1285 "CL_INVALID_BUFFER_SIZE",
1286 "CL_INVALID_MIP_LEVEL",
1287 "CL_INVALID_GLOBAL_WORK_SIZE",
1288 };
1289
1290 const int errorCount = sizeof(errorString) / sizeof(errorString[0]);
1291
1292 const int index = -error;
1293
1294 return (index >= 0 && index < errorCount) ? errorString[index] : "";
1295 }
1296
CheckErrorCL(cl_int error,const char * location)1297 bool ProgramBagCL::CheckErrorCL(cl_int error, const char* location)
1298 {
1299 if(error == CL_SUCCESS) return true;
1300 const char *errstr = GetErrorString(error);
1301 if(errstr && errstr[0]) std::cerr << errstr;
1302 else std::cerr << "Error " << error;
1303 if(location) std::cerr << " at " << location;
1304 std::cerr << "\n";
1305 exit(0);
1306 return false;
1307
1308 }
1309
1310
1311 ////////////////////////////////////////////////////////////////////////
1312 ////////////////////////////////////////////////////////////////////////
1313
LoadFixedShaders()1314 void ProgramBagCLN::LoadFixedShaders()
1315 {
1316 s_sampling = new ProgramCL("sampling",
1317 "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
1318 "__kernel void sampling(__read_only image2d_t input, __write_only image2d_t output, "
1319 " int width, int height) {\n"
1320 "int2 coord = (int2)(get_global_id(0), get_global_id(1)); \n"
1321 "if( coord.x >= width || coord.y >= height) return;\n"
1322 "write_imagef(output, coord, read_imagef(input, sampler, coord << 1)); }" , _context, _device);
1323
1324 s_sampling_k = new ProgramCL("sampling_k",
1325 "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
1326 "__kernel void sampling_k(__read_only image2d_t input, __write_only image2d_t output, "
1327 " int width, int height, int step) {\n"
1328 "int x = get_global_id(0), y = get_global_id(1); \n"
1329 "if( x >= width || y >= height) return;\n"
1330 "int xa = x * step, ya = y *step; \n"
1331 "float4 v1 = read_imagef(input, sampler, (int2) (xa, ya)); \n"
1332 "write_imagef(output, (int2) (x, y), v1); }" , _context, _device);
1333
1334
1335 s_sampling_u = new ProgramCL("sampling_u",
1336 "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;\n"
1337 "__kernel void sampling_u(__read_only image2d_t input, \n"
1338 " __write_only image2d_t output,\n"
1339 " int width, int height, float step) {\n"
1340 "int x = get_global_id(0), y = get_global_id(1); \n"
1341 "if( x >= width || y >= height) return;\n"
1342 "float xa = x * step, ya = y *step; \n"
1343 "float v1 = read_imagef(input, sampler, (float2) (xa, ya)).x; \n"
1344 "write_imagef(output, (int2) (x, y), (float4)(v1)); }" , _context, _device);
1345
1346 s_dog_pass = new ProgramCL("dog_pass",
1347 "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
1348 "__kernel void dog_pass(__read_only image2d_t tex, __read_only image2d_t texp,\n"
1349 " __write_only image2d_t dog, int width, int height) {\n"
1350 "int2 coord = (int2)(get_global_id(0), get_global_id(1)); \n"
1351 "if( coord.x >= width || coord.y >= height) return;\n"
1352 "float cc = read_imagef(tex , sampler, coord).x; \n"
1353 "float cp = read_imagef(texp, sampler, coord).x;\n"
1354 "write_imagef(dog, coord, (float4)(cc - cp)); }\n", _context, _device);
1355
1356 s_grad_pass = new ProgramCL("grad_pass",
1357 "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
1358 "__kernel void grad_pass(__read_only image2d_t tex, __read_only image2d_t texp,\n"
1359 " __write_only image2d_t dog, int width, int height, \n"
1360 " __write_only image2d_t grad, __write_only image2d_t rot) {\n"
1361 "int x = get_global_id(0), y = get_global_id(1); \n"
1362 "if( x >= width || y >= height) return;\n"
1363 "int2 coord = (int2) (x, y);\n"
1364 "float cl = read_imagef(tex, sampler, (int2)(x - 1, y)).x;\n"
1365 "float cc = read_imagef(tex , sampler, coord).x; \n"
1366 "float cr = read_imagef(tex, sampler, (int2)(x + 1, y)).x;\n"
1367 "float cp = read_imagef(texp, sampler, coord).x;\n"
1368 "write_imagef(dog, coord, (float4)(cc - cp)); \n"
1369 "float cd = read_imagef(tex, sampler, (int2)(x, y - 1)).x;\n"
1370 "float cu = read_imagef(tex, sampler, (int2)(x, y + 1)).x;\n"
1371 "float dx = cr - cl, dy = cu - cd; \n"
1372 "float gg = 0.5 * sqrt(dx*dx + dy * dy);\n"
1373 "write_imagef(grad, coord, (float4)(gg));\n"
1374 "float oo = atan2(dy, dx + FLT_MIN);\n"
1375 "write_imagef(rot, coord, (float4)(oo));}\n", _context, _device);
1376
1377 s_grad_pass2 = new ProgramCL("grad_pass2",
1378 "#define BLOCK_DIMX 32\n"
1379 "#define BLOCK_DIMY 14\n"
1380 "#define BLOCK_SIZE (BLOCK_DIMX * BLOCK_DIMY)\n"
1381 "__kernel void grad_pass2(__read_only image2d_t tex, __read_only image2d_t texp,\n"
1382 " __write_only image2d_t dog, int width, int height,\n"
1383 " __write_only image2d_t grd, __write_only image2d_t rot){\n"//, __local float* block) {\n"
1384 "__local float block[BLOCK_SIZE]; \n"
1385 "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |\n"
1386 " CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
1387 "int2 coord = (int2) ( get_global_id(0) - get_group_id(0) * 2 - 1, \n"
1388 " get_global_id(1) - get_group_id(1) * 2 - 1); \n"
1389 "int idx = mad24(get_local_id(1), BLOCK_DIMX, get_local_id(0));\n"
1390 "float cc = read_imagef(tex, sampler, coord).x;\n"
1391 "block[idx] = cc;\n"
1392 "barrier(CLK_LOCAL_MEM_FENCE);\n"
1393 "if( get_local_id(0) == 0 || get_local_id(0) == BLOCK_DIMX - 1) return;\n"
1394 "if( get_local_id(1) == 0 || get_local_id(1) == BLOCK_DIMY - 1) return;\n"
1395 "if( coord.x >= width) return; \n"
1396 "if( coord.y >= height) return;\n"
1397 "float cp = read_imagef(texp, sampler, coord).x;\n"
1398 "float dx = block[idx + 1] - block[idx - 1];\n"
1399 "float dy = block[idx + BLOCK_DIMX ] - block[idx - BLOCK_DIMX];\n"
1400 "write_imagef(dog, coord, (float4)(cc - cp)); \n"
1401 "write_imagef(grd, coord, (float4)(0.5 * sqrt(dx*dx + dy * dy)));\n"
1402 "write_imagef(rot, coord, (float4)(atan2(dy, dx + FLT_MIN)));}\n", _context, _device);
1403 }
1404
LoadDisplayShaders()1405 void ProgramBagCLN::LoadDisplayShaders()
1406 {
1407 s_unpack = new ProgramCL("main",
1408 "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
1409 "__kernel void main(__read_only image2d_t input, __write_only image2d_t output,\n"
1410 " int width, int height) {\n"
1411 "int x = get_global_id(0), y = get_global_id(1); \n"
1412 "if(x >= width || y >= height) return;\n"
1413 "float v = read_imagef(input, sampler, (int2) (x, y)).x; \n"
1414 "float4 result = (float4) (v, v, v, 1);"
1415 "write_imagef(output, (int2) (x, y), result); }" , _context, _device);
1416
1417 s_unpack_grd = new ProgramCL("main",
1418 "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |\n"
1419 " CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
1420 "__kernel void main(__read_only image2d_t input, __write_only image2d_t output,\n"
1421 " int width, int height) {\n"
1422 "int x = get_global_id(0), y = get_global_id(1); \n"
1423 "if(x >= width || y >= height) return;\n"
1424 "float v0 = read_imagef(input, sampler, (int2) (x, y)).x; \n"
1425 "float v = 5.0 * v0; float4 result = (float4) (v, v, v, 1);"
1426 "write_imagef(output, (int2) (x, y), result); }" , _context, _device);
1427
1428 s_unpack_dog = new ProgramCL("main",
1429 "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
1430 "__kernel void main(__read_only image2d_t input, __write_only image2d_t output,\n"
1431 " int width, int height) {\n"
1432 "int x = get_global_id(0), y = get_global_id(1); \n"
1433 "if(x >= width || y >= height) return;\n"
1434 "float v0 = read_imagef(input, sampler, (int2) (x, y)).x; \n"
1435 "float v = 0.5 + 20.0 * v0; float4 result = (float4) (v, v, v, 1);"
1436 "write_imagef(output, (int2) (x, y), result); }" , _context, _device);
1437 }
1438
CreateFilterH(float kernel[],int width)1439 ProgramCL* ProgramBagCLN::CreateFilterH(float kernel[], int width)
1440 {
1441 ////////////////////////////
1442 char buffer[10240];
1443 ostrstream out(buffer, 10240);
1444 out << "#define KERNEL_WIDTH " << width << "\n"
1445 << "#define KERNEL_HALF_WIDTH " << (width / 2) << "\n"
1446 "#define BLOCK_WIDTH 128\n"
1447 "#define BLOCK_HEIGHT 1\n"
1448 "#define CACHE_WIDTH (BLOCK_WIDTH + KERNEL_WIDTH - 1)\n"
1449 "#define CACHE_WIDTH_ALIGNED ((CACHE_WIDTH + 15) / 16 * 16)\n"
1450 "#define CACHE_COUNT (2 + (CACHE_WIDTH - 2) / BLOCK_WIDTH)\n"
1451 "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
1452 "__kernel void filter_h(__read_only image2d_t input, \n"
1453 " __write_only image2d_t output, int width_, int height_, \n"
1454 " __constant float* weight) {\n"
1455 "__local float data[CACHE_WIDTH]; \n"
1456 "int x = get_global_id(0), y = get_global_id(1);\n"
1457 "#pragma unroll\n"
1458 "for(int j = 0; j < CACHE_COUNT; ++j)\n"
1459 "{\n"
1460 " if(get_local_id(0) + j * BLOCK_WIDTH < CACHE_WIDTH)\n"
1461 " {\n"
1462 " int fetch_index = min(x + j * BLOCK_WIDTH - KERNEL_HALF_WIDTH, width_);\n"
1463 " data[get_local_id(0) + j * BLOCK_WIDTH] = read_imagef(input, sampler, (int2)(fetch_index, y)).x;\n"
1464 " }\n"
1465 "}\n"
1466 "barrier(CLK_LOCAL_MEM_FENCE); \n"
1467 "if( x > width_ || y > height_) return; \n"
1468 "float result = 0; \n"
1469 "#pragma unroll\n"
1470 "for(int i = 0; i < KERNEL_WIDTH; ++i)\n"
1471 "{\n"
1472 " result += data[get_local_id(0) + i] * weight[i];\n"
1473 "}\n"
1474 << "write_imagef(output, (int2)(x, y), (float4)(result)); }\n" << '\0';
1475 return new ProgramCL("filter_h", buffer, _context, _device);
1476 }
1477
1478
1479
CreateFilterV(float kernel[],int width)1480 ProgramCL* ProgramBagCLN::CreateFilterV(float kernel[], int width)
1481 {
1482 ////////////////////////////
1483 char buffer[10240];
1484 ostrstream out(buffer, 10240);
1485 out << "#define KERNEL_WIDTH " << width << "\n"
1486 << "#define KERNEL_HALF_WIDTH " << (width / 2) << "\n"
1487 "#define BLOCK_WIDTH 128\n"
1488 "#define CACHE_WIDTH (BLOCK_WIDTH + KERNEL_WIDTH - 1)\n"
1489 "#define CACHE_WIDTH_ALIGNED ((CACHE_WIDTH + 15) / 16 * 16)\n"
1490 "#define CACHE_COUNT (2 + (CACHE_WIDTH - 2) / BLOCK_WIDTH)\n"
1491 "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | \n"
1492 " CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
1493 "__kernel void filter_v(__read_only image2d_t input, \n"
1494 " __write_only image2d_t output, int width_, int height_, \n"
1495 " __constant float* weight) {\n"
1496 "__local float data[CACHE_WIDTH]; \n"
1497 "int x = get_global_id(0), y = get_global_id(1);\n"
1498 "#pragma unroll\n"
1499 "for(int j = 0; j < CACHE_COUNT; ++j)\n"
1500 "{\n"
1501 " if(get_local_id(1) + j * BLOCK_WIDTH < CACHE_WIDTH)\n"
1502 " {\n"
1503 " int fetch_index = min(y + j * BLOCK_WIDTH - KERNEL_HALF_WIDTH, height_);\n"
1504 " data[get_local_id(1) + j * BLOCK_WIDTH ] = read_imagef(input, sampler, (int2)(x, fetch_index)).x;\n"
1505 " }\n"
1506 "}\n"
1507 "barrier(CLK_LOCAL_MEM_FENCE); \n"
1508 "if( x > width_ || y > height_) return; \n"
1509 "float result = 0; \n"
1510 "#pragma unroll\n"
1511 "for(int i = 0; i < KERNEL_WIDTH; ++i)\n"
1512 "{\n"
1513 " result += data[get_local_id(1) + i] * weight[i];\n"
1514 "}\n"
1515 << "write_imagef(output, (int2)(x, y), (float4)(result)); }\n" << '\0';
1516
1517 return new ProgramCL("filter_v", buffer, _context, _device);
1518 }
1519
CreateFilter(float kernel[],int width)1520 FilterCL* ProgramBagCLN::CreateFilter(float kernel[], int width)
1521 {
1522 FilterCL * filter = new FilterCL;
1523 filter->s_shader_h = CreateFilterH(kernel, width);
1524 filter->s_shader_v = CreateFilterV(kernel, width);
1525 filter->_weight = new CLTexImage(_context, _queue);
1526 filter->_weight->InitBufferTex(width, 1, 1);
1527 filter->_weight->CopyFromHost(kernel);
1528 filter->_size = width;
1529 return filter;
1530 }
1531
1532
FilterImage(FilterCL * filter,CLTexImage * dst,CLTexImage * src,CLTexImage * tmp)1533 void ProgramBagCLN::FilterImage(FilterCL* filter, CLTexImage *dst, CLTexImage *src, CLTexImage*tmp)
1534 {
1535 cl_kernel kernelh = filter->s_shader_h->_kernel;
1536 cl_kernel kernelv = filter->s_shader_v->_kernel;
1537 //////////////////////////////////////////////////////////////////
1538
1539 cl_int status, w = dst->GetImgWidth(), h = dst->GetImgHeight();
1540 cl_mem weight = (cl_mem) filter->_weight->_clData;
1541 cl_int w_ = w - 1, h_ = h - 1;
1542
1543
1544 clSetKernelArg(kernelh, 0, sizeof(cl_mem), &src->_clData);
1545 clSetKernelArg(kernelh, 1, sizeof(cl_mem), &tmp->_clData);
1546 clSetKernelArg(kernelh, 2, sizeof(cl_int), &w_);
1547 clSetKernelArg(kernelh, 3, sizeof(cl_int), &h_);
1548 clSetKernelArg(kernelh, 4, sizeof(cl_mem), &weight);
1549
1550 size_t dim00 = 128, dim01 = 1;
1551 size_t gsz1[2] = {(w + dim00 - 1) / dim00 * dim00, (h + dim01 - 1) / dim01 * dim01}, lsz1[2] = {dim00, dim01};
1552 status = clEnqueueNDRangeKernel(_queue, kernelh, 2, NULL, gsz1, lsz1, 0, NULL, NULL);
1553 CheckErrorCL(status, "ProgramBagCLN::FilterImageH");
1554 if(status != CL_SUCCESS) return;
1555
1556
1557 clSetKernelArg(kernelv, 0, sizeof(cl_mem), &tmp->_clData);
1558 clSetKernelArg(kernelv, 1, sizeof(cl_mem), &dst->_clData);
1559 clSetKernelArg(kernelv, 2, sizeof(cl_int), &w_);
1560 clSetKernelArg(kernelv, 3, sizeof(cl_int), &h_);
1561 clSetKernelArg(kernelv, 4, sizeof(cl_mem), &weight);
1562
1563 size_t dim10 = 1, dim11 = 128;
1564 size_t gsz2[2] = {(w + dim10 - 1) / dim10 * dim10, (h + dim11 - 1) / dim11 * dim11}, lsz2[2] = {dim10, dim11};
1565 status = clEnqueueNDRangeKernel(_queue, kernelv, 2, NULL, gsz2, lsz2, 0, NULL, NULL);
1566 CheckErrorCL(status, "ProgramBagCLN::FilterImageV");
1567 //clReleaseEvent(event);
1568 }
1569
SampleImageD(CLTexImage * dst,CLTexImage * src,int log_scale)1570 void ProgramBagCLN::SampleImageD(CLTexImage *dst, CLTexImage *src, int log_scale)
1571 {
1572 cl_kernel kernel;
1573 cl_int w = dst->GetImgWidth(), h = dst->GetImgHeight();
1574
1575 cl_int fullstep = (1 << log_scale);
1576 kernel = log_scale == 1? s_sampling->_kernel : s_sampling_k->_kernel;
1577 clSetKernelArg(kernel, 0, sizeof(cl_mem), &(src->_clData));
1578 clSetKernelArg(kernel, 1, sizeof(cl_mem), &(dst->_clData));
1579 clSetKernelArg(kernel, 2, sizeof(cl_int), &(w));
1580 clSetKernelArg(kernel, 3, sizeof(cl_int), &(h));
1581 if(log_scale > 1) clSetKernelArg(kernel, 4, sizeof(cl_int), &(fullstep));
1582
1583 size_t dim0 = 128, dim1 = 1;
1584 //while( w * h / dim0 / dim1 < 8 && dim1 > 1) dim1 /= 2;
1585 size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1};
1586 cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL);
1587 CheckErrorCL(status, "ProgramBagCLN::SampleImageD");
1588 }
1589
1590
1591 #endif
1592
1593