1/*M/////////////////////////////////////////////////////////////////////////////////////// 2// 3// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 4// 5// By downloading, copying, installing or using the software you agree to this license. 6// If you do not agree to this license, do not download, install, 7// copy or use the software. 8// 9// 10// License Agreement 11// For Open Source Computer Vision Library 12// 13// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. 14// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. 15// Third party copyrights are property of their respective owners. 16// 17// @Authors 18// Dachuan Zhao, dachuan@multicorewareinc.com 19// Yao Wang, bitwangyaoyao@gmail.com 20// Xiaopeng Fu, fuxiaopeng2222@163.com 21// 22// Redistribution and use in source and binary forms, with or without modification, 23// are permitted provided that the following conditions are met: 24// 25// * Redistribution's of source code must retain the above copyright notice, 26// this list of conditions and the following disclaimer. 27// 28// * Redistribution's in binary form must reproduce the above copyright notice, 29// this list of conditions and the following disclaimer in the documentation 30// and/or other materials provided with the distribution. 31// 32// * The name of the copyright holders may not be used to endorse or promote products 33// derived from this software without specific prior written permission. 34// 35// This software is provided by the copyright holders and contributors as is and 36// any express or implied warranties, including, but not limited to, the implied 37// warranties of merchantability and fitness for a particular purpose are disclaimed. 38// In no event shall the Intel Corporation or contributors be liable for any direct, 39// indirect, incidental, special, exemplary, or consequential damages 40// (including, but not limited to, procurement of substitute goods or services; 41// loss of use, data, or profits; or business interruption) however caused 42// and on any theory of liability, whether in contract, strict liability, 43// or tort (including negligence or otherwise) arising in any way out of 44// the use of this software, even if advised of the possibility of such damage. 45// 46//M*/ 47 48#define GRIDSIZE 3 49#define LSx 8 50#define LSy 8 51// define local memory sizes 52#define LM_W (LSx*GRIDSIZE+2) 53#define LM_H (LSy*GRIDSIZE+2) 54#define BUFFER (LSx*LSy) 55#define BUFFER2 BUFFER>>1 56 57#ifdef CPU 58 59inline void reduce3(float val1, float val2, float val3, __local float* smem1, __local float* smem2, __local float* smem3, int tid) 60{ 61 smem1[tid] = val1; 62 smem2[tid] = val2; 63 smem3[tid] = val3; 64 barrier(CLK_LOCAL_MEM_FENCE); 65 66 for(int i = BUFFER2; i > 0; i >>= 1) 67 { 68 if(tid < i) 69 { 70 smem1[tid] += smem1[tid + i]; 71 smem2[tid] += smem2[tid + i]; 72 smem3[tid] += smem3[tid + i]; 73 } 74 barrier(CLK_LOCAL_MEM_FENCE); 75 } 76} 77 78inline void reduce2(float val1, float val2, __local float* smem1, __local float* smem2, int tid) 79{ 80 smem1[tid] = val1; 81 smem2[tid] = val2; 82 barrier(CLK_LOCAL_MEM_FENCE); 83 84 for(int i = BUFFER2; i > 0; i >>= 1) 85 { 86 if(tid < i) 87 { 88 smem1[tid] += smem1[tid + i]; 89 smem2[tid] += smem2[tid + i]; 90 } 91 barrier(CLK_LOCAL_MEM_FENCE); 92 } 93} 94 95inline void reduce1(float val1, __local float* smem1, int tid) 96{ 97 smem1[tid] = val1; 98 barrier(CLK_LOCAL_MEM_FENCE); 99 100 for(int i = BUFFER2; i > 0; i >>= 1) 101 { 102 if(tid < i) 103 { 104 smem1[tid] += smem1[tid + i]; 105 } 106 barrier(CLK_LOCAL_MEM_FENCE); 107 } 108} 109#else 110inline void reduce3(float val1, float val2, float val3, 111 __local float* smem1, __local float* smem2, __local float* smem3, int tid) 112{ 113 smem1[tid] = val1; 114 smem2[tid] = val2; 115 smem3[tid] = val3; 116 barrier(CLK_LOCAL_MEM_FENCE); 117 118 if (tid < 32) 119 { 120 smem1[tid] += smem1[tid + 32]; 121 smem2[tid] += smem2[tid + 32]; 122 smem3[tid] += smem3[tid + 32]; 123 } 124 barrier(CLK_LOCAL_MEM_FENCE); 125 if (tid < 16) 126 { 127 smem1[tid] += smem1[tid + 16]; 128 smem2[tid] += smem2[tid + 16]; 129 smem3[tid] += smem3[tid + 16]; 130 } 131 barrier(CLK_LOCAL_MEM_FENCE); 132 if (tid < 8) 133 { 134 smem1[tid] += smem1[tid + 8]; 135 smem2[tid] += smem2[tid + 8]; 136 smem3[tid] += smem3[tid + 8]; 137 } 138 barrier(CLK_LOCAL_MEM_FENCE); 139 if (tid < 4) 140 { 141 smem1[tid] += smem1[tid + 4]; 142 smem2[tid] += smem2[tid + 4]; 143 smem3[tid] += smem3[tid + 4]; 144 } 145 barrier(CLK_LOCAL_MEM_FENCE); 146 if (tid == 0) 147 { 148 smem1[0] = (smem1[0] + smem1[1]) + (smem1[2] + smem1[3]); 149 smem2[0] = (smem2[0] + smem2[1]) + (smem2[2] + smem2[3]); 150 smem3[0] = (smem3[0] + smem3[1]) + (smem3[2] + smem3[3]); 151 } 152 barrier(CLK_LOCAL_MEM_FENCE); 153} 154 155inline void reduce2(float val1, float val2, __local float* smem1, __local float* smem2, int tid) 156{ 157 smem1[tid] = val1; 158 smem2[tid] = val2; 159 barrier(CLK_LOCAL_MEM_FENCE); 160 161 if (tid < 32) 162 { 163 smem1[tid] += smem1[tid + 32]; 164 smem2[tid] += smem2[tid + 32]; 165 } 166 barrier(CLK_LOCAL_MEM_FENCE); 167 if (tid < 16) 168 { 169 smem1[tid] += smem1[tid + 16]; 170 smem2[tid] += smem2[tid + 16]; 171 } 172 barrier(CLK_LOCAL_MEM_FENCE); 173 if (tid < 8) 174 { 175 smem1[tid] += smem1[tid + 8]; 176 smem2[tid] += smem2[tid + 8]; 177 } 178 barrier(CLK_LOCAL_MEM_FENCE); 179 if (tid < 4) 180 { 181 smem1[tid] += smem1[tid + 4]; 182 smem2[tid] += smem2[tid + 4]; 183 } 184 barrier(CLK_LOCAL_MEM_FENCE); 185 if (tid == 0) 186 { 187 smem1[0] = (smem1[0] + smem1[1]) + (smem1[2] + smem1[3]); 188 smem2[0] = (smem2[0] + smem2[1]) + (smem2[2] + smem2[3]); 189 } 190 barrier(CLK_LOCAL_MEM_FENCE); 191} 192 193inline void reduce1(float val1, __local float* smem1, int tid) 194{ 195 smem1[tid] = val1; 196 barrier(CLK_LOCAL_MEM_FENCE); 197 198 if (tid < 32) 199 { 200 smem1[tid] += smem1[tid + 32]; 201 } 202 barrier(CLK_LOCAL_MEM_FENCE); 203 if (tid < 16) 204 { 205 smem1[tid] += smem1[tid + 16]; 206 } 207 barrier(CLK_LOCAL_MEM_FENCE); 208 if (tid < 8) 209 { 210 smem1[tid] += smem1[tid + 8]; 211 } 212 barrier(CLK_LOCAL_MEM_FENCE); 213 if (tid < 4) 214 { 215 smem1[tid] += smem1[tid + 4]; 216 } 217 barrier(CLK_LOCAL_MEM_FENCE); 218 if (tid == 0) 219 { 220 smem1[0] = (smem1[0] + smem1[1]) + (smem1[2] + smem1[3]); 221 } 222 barrier(CLK_LOCAL_MEM_FENCE); 223} 224#endif 225 226#define SCALE (1.0f / (1 << 20)) 227#define THRESHOLD 0.01f 228 229// Image read mode 230__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR; 231 232// macro to get pixel value from local memory 233 234#define VAL(_y,_x,_yy,_xx) (IPatchLocal[mad24(((_y) + (_yy)), LM_W, ((_x) + (_xx)))]) 235inline void SetPatch(local float* IPatchLocal, int TileY, int TileX, 236 float* Pch, float* Dx, float* Dy, 237 float* A11, float* A12, float* A22, float w) 238{ 239 int xid=get_local_id(0); 240 int yid=get_local_id(1); 241 int xBase = mad24(TileX, LSx, (xid + 1)); 242 int yBase = mad24(TileY, LSy, (yid + 1)); 243 244 *Pch = VAL(yBase,xBase,0,0); 245 246 *Dx = mad((VAL(yBase,xBase,-1,1) + VAL(yBase,xBase,+1,1) - VAL(yBase,xBase,-1,-1) - VAL(yBase,xBase,+1,-1)), 3.0f, (VAL(yBase,xBase,0,1) - VAL(yBase,xBase,0,-1)) * 10.0f) * w; 247 *Dy = mad((VAL(yBase,xBase,1,-1) + VAL(yBase,xBase,1,+1) - VAL(yBase,xBase,-1,-1) - VAL(yBase,xBase,-1,+1)), 3.0f, (VAL(yBase,xBase,1,0) - VAL(yBase,xBase,-1,0)) * 10.0f) * w; 248 249 *A11 = mad(*Dx, *Dx, *A11); 250 *A12 = mad(*Dx, *Dy, *A12); 251 *A22 = mad(*Dy, *Dy, *A22); 252} 253#undef VAL 254 255inline void GetPatch(image2d_t J, float x, float y, 256 float* Pch, float* Dx, float* Dy, 257 float* b1, float* b2) 258{ 259 float diff = read_imagef(J, sampler, (float2)(x,y)).x-*Pch; 260 *b1 = mad(diff, *Dx, *b1); 261 *b2 = mad(diff, *Dy, *b2); 262} 263 264inline void GetError(image2d_t J, const float x, const float y, const float* Pch, float* errval, float w) 265{ 266 float diff = ((((read_imagef(J, sampler, (float2)(x,y)).x * 16384) + 256) / 512) - (((*Pch * 16384) + 256) /512)) * w; 267 *errval += fabs(diff); 268} 269 270 271//macro to read pixel value into local memory. 272#define READI(_y,_x) IPatchLocal[mad24(mad24((_y), LSy, yid), LM_W, mad24((_x), LSx, xid))] = read_imagef(I, sampler, (float2)(mad((float)(_x), (float)LSx, Point.x + xid - 0.5f), mad((float)(_y), (float)LSy, Point.y + yid - 0.5f))).x; 273void ReadPatchIToLocalMem(image2d_t I, float2 Point, local float* IPatchLocal) 274{ 275 int xid=get_local_id(0); 276 int yid=get_local_id(1); 277 //read (3*LSx)*(3*LSy) window. each macro call read LSx*LSy pixels block 278 READI(0,0);READI(0,1);READI(0,2); 279 READI(1,0);READI(1,1);READI(1,2); 280 READI(2,0);READI(2,1);READI(2,2); 281 if(xid<2) 282 {// read last 2 columns border. each macro call reads 2*LSy pixels block 283 READI(0,3); 284 READI(1,3); 285 READI(2,3); 286 } 287 288 if(yid<2) 289 {// read last 2 row. each macro call reads LSx*2 pixels block 290 READI(3,0);READI(3,1);READI(3,2); 291 } 292 293 if(yid<2 && xid<2) 294 {// read right bottom 2x2 corner. one macro call reads 2*2 pixels block 295 READI(3,3); 296 } 297 barrier(CLK_LOCAL_MEM_FENCE); 298} 299#undef READI 300 301__attribute__((reqd_work_group_size(LSx, LSy, 1))) 302__kernel void lkSparse(image2d_t I, image2d_t J, 303 __global const float2* prevPts, __global float2* nextPts, __global uchar* status, __global float* err, 304 const int level, const int rows, const int cols, int PATCH_X, int PATCH_Y, int c_winSize_x, int c_winSize_y, int c_iters, char calcErr) 305{ 306 __local float smem1[BUFFER]; 307 __local float smem2[BUFFER]; 308 __local float smem3[BUFFER]; 309 310 int xid=get_local_id(0); 311 int yid=get_local_id(1); 312 int gid=get_group_id(0); 313 int xsize=get_local_size(0); 314 int ysize=get_local_size(1); 315 int k; 316 317#ifdef CPU 318 float wx0 = 1.0f; 319 float wy0 = 1.0f; 320 int xBase = mad24(xsize, 2, xid); 321 int yBase = mad24(ysize, 2, yid); 322 float wx1 = (xBase < c_winSize_x) ? 1 : 0; 323 float wy1 = (yBase < c_winSize_y) ? 1 : 0; 324#else 325#if WSX == 1 326 float wx0 = 1.0f; 327 int xBase = mad24(xsize, 2, xid); 328 float wx1 = (xBase < c_winSize_x) ? 1 : 0; 329#else 330 int xBase = mad24(xsize, 1, xid); 331 float wx0 = (xBase < c_winSize_x) ? 1 : 0; 332 float wx1 = 0.0f; 333#endif 334#if WSY == 1 335 float wy0 = 1.0f; 336 int yBase = mad24(ysize, 2, yid); 337 float wy1 = (yBase < c_winSize_y) ? 1 : 0; 338#else 339 int yBase = mad24(ysize, 1, yid); 340 float wy0 = (yBase < c_winSize_y) ? 1 : 0; 341 float wy1 = 0.0f; 342#endif 343#endif 344 345 float2 c_halfWin = (float2)((c_winSize_x - 1)>>1, (c_winSize_y - 1)>>1); 346 347 const int tid = mad24(yid, xsize, xid); 348 349 float2 prevPt = prevPts[gid] / (float2)(1 << level); 350 351 if (prevPt.x < 0 || prevPt.x >= cols || prevPt.y < 0 || prevPt.y >= rows) 352 { 353 if (tid == 0 && level == 0) 354 { 355 status[gid] = 0; 356 } 357 358 return; 359 } 360 prevPt -= c_halfWin; 361 362 // extract the patch from the first image, compute covariation matrix of derivatives 363 364 float A11 = 0; 365 float A12 = 0; 366 float A22 = 0; 367 368 float I_patch[GRIDSIZE][GRIDSIZE]; 369 float dIdx_patch[GRIDSIZE][GRIDSIZE]; 370 float dIdy_patch[GRIDSIZE][GRIDSIZE]; 371 372 // local memory to read image with border to calc sobels 373 local float IPatchLocal[LM_W*LM_H]; 374 ReadPatchIToLocalMem(I,prevPt,IPatchLocal); 375 376 { 377 SetPatch(IPatchLocal, 0, 0, 378 &I_patch[0][0], &dIdx_patch[0][0], &dIdy_patch[0][0], 379 &A11, &A12, &A22,1); 380 381 382 SetPatch(IPatchLocal, 0, 1, 383 &I_patch[0][1], &dIdx_patch[0][1], &dIdy_patch[0][1], 384 &A11, &A12, &A22,wx0); 385 386 SetPatch(IPatchLocal, 0, 2, 387 &I_patch[0][2], &dIdx_patch[0][2], &dIdy_patch[0][2], 388 &A11, &A12, &A22,wx1); 389 } 390 { 391 SetPatch(IPatchLocal, 1, 0, 392 &I_patch[1][0], &dIdx_patch[1][0], &dIdy_patch[1][0], 393 &A11, &A12, &A22,wy0); 394 395 396 SetPatch(IPatchLocal, 1,1, 397 &I_patch[1][1], &dIdx_patch[1][1], &dIdy_patch[1][1], 398 &A11, &A12, &A22,wx0*wy0); 399 400 SetPatch(IPatchLocal, 1,2, 401 &I_patch[1][2], &dIdx_patch[1][2], &dIdy_patch[1][2], 402 &A11, &A12, &A22,wx1*wy0); 403 } 404 { 405 SetPatch(IPatchLocal, 2,0, 406 &I_patch[2][0], &dIdx_patch[2][0], &dIdy_patch[2][0], 407 &A11, &A12, &A22,wy1); 408 409 410 SetPatch(IPatchLocal, 2,1, 411 &I_patch[2][1], &dIdx_patch[2][1], &dIdy_patch[2][1], 412 &A11, &A12, &A22,wx0*wy1); 413 414 SetPatch(IPatchLocal, 2,2, 415 &I_patch[2][2], &dIdx_patch[2][2], &dIdy_patch[2][2], 416 &A11, &A12, &A22,wx1*wy1); 417 } 418 419 420 reduce3(A11, A12, A22, smem1, smem2, smem3, tid); 421 422 A11 = smem1[0]; 423 A12 = smem2[0]; 424 A22 = smem3[0]; 425 barrier(CLK_LOCAL_MEM_FENCE); 426 427 float D = mad(A11, A22, - A12 * A12); 428 429 if (D < 1.192092896e-07f) 430 { 431 if (tid == 0 && level == 0) 432 status[gid] = 0; 433 434 return; 435 } 436 437 A11 /= D; 438 A12 /= D; 439 A22 /= D; 440 441 prevPt = mad(nextPts[gid], 2.0f, - c_halfWin); 442 443 float2 offset0 = (float2)(xid + 0.5f, yid + 0.5f); 444 float2 offset1 = (float2)(xsize, ysize); 445 float2 loc0 = prevPt + offset0; 446 float2 loc1 = loc0 + offset1; 447 float2 loc2 = loc1 + offset1; 448 449 for (k = 0; k < c_iters; ++k) 450 { 451 if (prevPt.x < -c_halfWin.x || prevPt.x >= cols || prevPt.y < -c_halfWin.y || prevPt.y >= rows) 452 { 453 if (tid == 0 && level == 0) 454 status[gid] = 0; 455 break; 456 } 457 float b1 = 0; 458 float b2 = 0; 459 460 { 461 GetPatch(J, loc0.x, loc0.y, 462 &I_patch[0][0], &dIdx_patch[0][0], &dIdy_patch[0][0], 463 &b1, &b2); 464 465 466 GetPatch(J, loc1.x, loc0.y, 467 &I_patch[0][1], &dIdx_patch[0][1], &dIdy_patch[0][1], 468 &b1, &b2); 469 470 GetPatch(J, loc2.x, loc0.y, 471 &I_patch[0][2], &dIdx_patch[0][2], &dIdy_patch[0][2], 472 &b1, &b2); 473 } 474 { 475 GetPatch(J, loc0.x, loc1.y, 476 &I_patch[1][0], &dIdx_patch[1][0], &dIdy_patch[1][0], 477 &b1, &b2); 478 479 480 GetPatch(J, loc1.x, loc1.y, 481 &I_patch[1][1], &dIdx_patch[1][1], &dIdy_patch[1][1], 482 &b1, &b2); 483 484 GetPatch(J, loc2.x, loc1.y, 485 &I_patch[1][2], &dIdx_patch[1][2], &dIdy_patch[1][2], 486 &b1, &b2); 487 } 488 { 489 GetPatch(J, loc0.x, loc2.y, 490 &I_patch[2][0], &dIdx_patch[2][0], &dIdy_patch[2][0], 491 &b1, &b2); 492 493 494 GetPatch(J, loc1.x, loc2.y, 495 &I_patch[2][1], &dIdx_patch[2][1], &dIdy_patch[2][1], 496 &b1, &b2); 497 498 GetPatch(J, loc2.x, loc2.y, 499 &I_patch[2][2], &dIdx_patch[2][2], &dIdy_patch[2][2], 500 &b1, &b2); 501 } 502 503 reduce2(b1, b2, smem1, smem2, tid); 504 505 b1 = smem1[0]; 506 b2 = smem2[0]; 507 barrier(CLK_LOCAL_MEM_FENCE); 508 509 float2 delta; 510 delta.x = mad(A12, b2, - A22 * b1) * 32.0f; 511 delta.y = mad(A12, b1, - A11 * b2) * 32.0f; 512 513 prevPt += delta; 514 loc0 += delta; 515 loc1 += delta; 516 loc2 += delta; 517 518 if (fabs(delta.x) < THRESHOLD && fabs(delta.y) < THRESHOLD) 519 break; 520 } 521 522 D = 0.0f; 523 if (calcErr) 524 { 525 { 526 GetError(J, loc0.x, loc0.y, &I_patch[0][0], &D, 1); 527 GetError(J, loc1.x, loc0.y, &I_patch[0][1], &D, wx0); 528 } 529 { 530 GetError(J, loc0.x, loc1.y, &I_patch[1][0], &D, wy0); 531 GetError(J, loc1.x, loc1.y, &I_patch[1][1], &D, wx0*wy0); 532 } 533 if(xBase < c_winSize_x) 534 { 535 GetError(J, loc2.x, loc0.y, &I_patch[0][2], &D, wx1); 536 GetError(J, loc2.x, loc1.y, &I_patch[1][2], &D, wx1*wy0); 537 } 538 if(yBase < c_winSize_y) 539 { 540 GetError(J, loc0.x, loc2.y, &I_patch[2][0], &D, wy1); 541 GetError(J, loc1.x, loc2.y, &I_patch[2][1], &D, wx0*wy1); 542 if(xBase < c_winSize_x) 543 GetError(J, loc2.x, loc2.y, &I_patch[2][2], &D, wx1*wy1); 544 } 545 546 reduce1(D, smem1, tid); 547 } 548 549 if (tid == 0) 550 { 551 prevPt += c_halfWin; 552 553 nextPts[gid] = prevPt; 554 555 if (calcErr) 556 err[gid] = smem1[0] / (float)(32 * c_winSize_x * c_winSize_y); 557 } 558} 559