1// This file is part of OpenCV project. 2// It is subject to the license terms in the LICENSE file found in the top-level directory 3// of this distribution and at http://opencv.org/license.html 4 5// This code is also subject to the license terms in the LICENSE_KinectFusion.md file found in this module's directory 6 7typedef char int8_t; 8typedef int8_t TsdfType; 9typedef uchar WeightType; 10 11struct TsdfVoxel 12{ 13 TsdfType tsdf; 14 WeightType weight; 15}; 16 17static inline TsdfType floatToTsdf(float num) 18{ 19 int8_t res = (int8_t) ( (num * (-128)) ); 20 res = res ? res : (num < 0 ? 1 : -1); 21 return res; 22} 23 24static inline float tsdfToFloat(TsdfType num) 25{ 26 return ( (float) num ) / (-128); 27} 28 29__kernel void integrate(__global const char * depthptr, 30 int depth_step, int depth_offset, 31 int depth_rows, int depth_cols, 32 __global struct TsdfVoxel * volumeptr, 33 const float16 vol2camMatrix, 34 const float voxelSize, 35 const int4 volResolution4, 36 const int4 volDims4, 37 const float2 fxy, 38 const float2 cxy, 39 const float dfac, 40 const float truncDist, 41 const int maxWeight, 42 const __global float * pixNorms) 43{ 44 int x = get_global_id(0); 45 int y = get_global_id(1); 46 47 const int3 volResolution = volResolution4.xyz; 48 49 if(x >= volResolution.x || y >= volResolution.y) 50 return; 51 52 // coord-independent constants 53 const int3 volDims = volDims4.xyz; 54 const float2 limits = (float2)(depth_cols-1, depth_rows-1); 55 56 const float4 vol2cam0 = vol2camMatrix.s0123; 57 const float4 vol2cam1 = vol2camMatrix.s4567; 58 const float4 vol2cam2 = vol2camMatrix.s89ab; 59 60 const float truncDistInv = 1.f/truncDist; 61 62 // optimization of camSpace transformation (vector addition instead of matmul at each z) 63 float4 inPt = (float4)(x*voxelSize, y*voxelSize, 0, 1); 64 float3 basePt = (float3)(dot(vol2cam0, inPt), 65 dot(vol2cam1, inPt), 66 dot(vol2cam2, inPt)); 67 68 float3 camSpacePt = basePt; 69 70 // zStep == vol2cam*(float3(x, y, 1)*voxelSize) - basePt; 71 float3 zStep = ((float3)(vol2cam0.z, vol2cam1.z, vol2cam2.z))*voxelSize; 72 73 int volYidx = x*volDims.x + y*volDims.y; 74 75 int startZ, endZ; 76 if(fabs(zStep.z) > 1e-5f) 77 { 78 int baseZ = convert_int(-basePt.z / zStep.z); 79 if(zStep.z > 0) 80 { 81 startZ = baseZ; 82 endZ = volResolution.z; 83 } 84 else 85 { 86 startZ = 0; 87 endZ = baseZ; 88 } 89 } 90 else 91 { 92 if(basePt.z > 0) 93 { 94 startZ = 0; endZ = volResolution.z; 95 } 96 else 97 { 98 // z loop shouldn't be performed 99 //startZ = endZ = 0; 100 return; 101 } 102 } 103 104 startZ = max(0, startZ); 105 endZ = min(volResolution.z, endZ); 106 107 for(int z = startZ; z < endZ; z++) 108 { 109 // optimization of the following: 110 //float3 camSpacePt = vol2cam * ((float3)(x, y, z)*voxelSize); 111 camSpacePt += zStep; 112 113 if(camSpacePt.z <= 0) 114 continue; 115 116 float3 camPixVec = camSpacePt / camSpacePt.z; 117 float2 projected = mad(camPixVec.xy, fxy, cxy); 118 119 float v; 120 // bilinearly interpolate depth at projected 121 if(all(projected >= 0) && all(projected < limits)) 122 { 123 float2 ip = floor(projected); 124 int xi = ip.x, yi = ip.y; 125 126 __global const float* row0 = (__global const float*)(depthptr + depth_offset + 127 (yi+0)*depth_step); 128 __global const float* row1 = (__global const float*)(depthptr + depth_offset + 129 (yi+1)*depth_step); 130 131 float v00 = row0[xi+0]; 132 float v01 = row0[xi+1]; 133 float v10 = row1[xi+0]; 134 float v11 = row1[xi+1]; 135 float4 vv = (float4)(v00, v01, v10, v11); 136 137 // assume correct depth is positive 138 if(all(vv > 0)) 139 { 140 float2 t = projected - ip; 141 float2 vf = mix(vv.xz, vv.yw, t.x); 142 v = mix(vf.s0, vf.s1, t.y); 143 } 144 else 145 continue; 146 } 147 else 148 continue; 149 150 if(v == 0) 151 continue; 152 153 int idx = projected.y * depth_cols + projected.x; 154 float pixNorm = pixNorms[idx]; 155 //float pixNorm = length(camPixVec); 156 157 // difference between distances of point and of surface to camera 158 float sdf = pixNorm*(v*dfac - camSpacePt.z); 159 // possible alternative is: 160 // float sdf = length(camSpacePt)*(v*dfac/camSpacePt.z - 1.0); 161 162 if(sdf >= -truncDist) 163 { 164 float tsdf = fmin(1.0f, sdf * truncDistInv); 165 int volIdx = volYidx + z*volDims.z; 166 167 struct TsdfVoxel voxel = volumeptr[volIdx]; 168 float value = tsdfToFloat(voxel.tsdf); 169 int weight = voxel.weight; 170 171 // update TSDF 172 value = (value*weight + tsdf) / (weight + 1); 173 weight = min(weight + 1, maxWeight); 174 175 voxel.tsdf = floatToTsdf(value); 176 voxel.weight = weight; 177 volumeptr[volIdx] = voxel; 178 } 179 } 180} 181 182 183inline float interpolateVoxel(float3 p, __global const struct TsdfVoxel* volumePtr, 184 int3 volDims, int8 neighbourCoords) 185{ 186 float3 fip = floor(p); 187 int3 ip = convert_int3(fip); 188 float3 t = p - fip; 189 190 int3 cmul = volDims*ip; 191 int coordBase = cmul.x + cmul.y + cmul.z; 192 int nco[8]; 193 vstore8(neighbourCoords + coordBase, 0, nco); 194 195 float vaz[8]; 196 for(int i = 0; i < 8; i++) 197 vaz[i] = tsdfToFloat(volumePtr[nco[i]].tsdf); 198 199 float8 vz = vload8(0, vaz); 200 201 float4 vy = mix(vz.s0246, vz.s1357, t.z); 202 float2 vx = mix(vy.s02, vy.s13, t.y); 203 return mix(vx.s0, vx.s1, t.x); 204} 205 206inline float3 getNormalVoxel(float3 p, __global const struct TsdfVoxel* volumePtr, 207 int3 volResolution, int3 volDims, int8 neighbourCoords) 208{ 209 if(any(p < 1) || any(p >= convert_float3(volResolution - 2))) 210 return nan((uint)0); 211 212 float3 fip = floor(p); 213 int3 ip = convert_int3(fip); 214 float3 t = p - fip; 215 216 int3 cmul = volDims*ip; 217 int coordBase = cmul.x + cmul.y + cmul.z; 218 int nco[8]; 219 vstore8(neighbourCoords + coordBase, 0, nco); 220 221 int arDims[3]; 222 vstore3(volDims, 0, arDims); 223 float an[3]; 224 for(int c = 0; c < 3; c++) 225 { 226 int dim = arDims[c]; 227 228 float vaz[8]; 229 for(int i = 0; i < 8; i++) 230 vaz[i] = tsdfToFloat(volumePtr[nco[i] + dim].tsdf) - 231 tsdfToFloat(volumePtr[nco[i] - dim].tsdf); 232 233 float8 vz = vload8(0, vaz); 234 235 float4 vy = mix(vz.s0246, vz.s1357, t.z); 236 float2 vx = mix(vy.s02, vy.s13, t.y); 237 238 an[c] = mix(vx.s0, vx.s1, t.x); 239 } 240 241 //gradientDeltaFactor is fixed at 1.0 of voxel size 242 float3 n = vload3(0, an); 243 float Norm = sqrt(n.x*n.x + n.y*n.y + n.z*n.z); 244 return Norm < 0.0001f ? nan((uint)0) : n / Norm; 245 //return fast_normalize(vload3(0, an)); 246} 247 248typedef float4 ptype; 249 250__kernel void raycast(__global char * pointsptr, 251 int points_step, int points_offset, 252 __global char * normalsptr, 253 int normals_step, int normals_offset, 254 const int2 frameSize, 255 __global const struct TsdfVoxel * volumeptr, 256 __global const float * vol2camptr, 257 __global const float * cam2volptr, 258 const float2 fixy, 259 const float2 cxy, 260 const float4 boxDown4, 261 const float4 boxUp4, 262 const float tstep, 263 const float voxelSize, 264 const int4 volResolution4, 265 const int4 volDims4, 266 const int8 neighbourCoords 267 ) 268{ 269 int x = get_global_id(0); 270 int y = get_global_id(1); 271 272 if(x >= frameSize.x || y >= frameSize.y) 273 return; 274 275 // coordinate-independent constants 276 277 __global const float* cm = cam2volptr; 278 const float3 camRot0 = vload4(0, cm).xyz; 279 const float3 camRot1 = vload4(1, cm).xyz; 280 const float3 camRot2 = vload4(2, cm).xyz; 281 const float3 camTrans = (float3)(cm[3], cm[7], cm[11]); 282 283 __global const float* vm = vol2camptr; 284 const float3 volRot0 = vload4(0, vm).xyz; 285 const float3 volRot1 = vload4(1, vm).xyz; 286 const float3 volRot2 = vload4(2, vm).xyz; 287 const float3 volTrans = (float3)(vm[3], vm[7], vm[11]); 288 289 const float3 boxDown = boxDown4.xyz; 290 const float3 boxUp = boxUp4.xyz; 291 const int3 volDims = volDims4.xyz; 292 293 const int3 volResolution = volResolution4.xyz; 294 295 const float invVoxelSize = native_recip(voxelSize); 296 297 // kernel itself 298 299 float3 point = nan((uint)0); 300 float3 normal = nan((uint)0); 301 302 float3 orig = camTrans; 303 304 // get direction through pixel in volume space: 305 // 1. reproject (x, y) on projecting plane where z = 1.f 306 float3 planed = (float3)(((float2)(x, y) - cxy)*fixy, 1.f); 307 308 // 2. rotate to volume space 309 planed = (float3)(dot(planed, camRot0), 310 dot(planed, camRot1), 311 dot(planed, camRot2)); 312 313 // 3. normalize 314 float3 dir = fast_normalize(planed); 315 316 // compute intersection of ray with all six bbox planes 317 float3 rayinv = native_recip(dir); 318 float3 tbottom = rayinv*(boxDown - orig); 319 float3 ttop = rayinv*(boxUp - orig); 320 321 // re-order intersections to find smallest and largest on each axis 322 float3 minAx = min(ttop, tbottom); 323 float3 maxAx = max(ttop, tbottom); 324 325 // near clipping plane 326 const float clip = 0.f; 327 float tmin = max(max(max(minAx.x, minAx.y), max(minAx.x, minAx.z)), clip); 328 float tmax = min(min(maxAx.x, maxAx.y), min(maxAx.x, maxAx.z)); 329 330 // precautions against getting coordinates out of bounds 331 tmin = tmin + tstep; 332 tmax = tmax - tstep; 333 334 if(tmin < tmax) 335 { 336 // interpolation optimized a little 337 orig *= invVoxelSize; 338 dir *= invVoxelSize; 339 340 float3 rayStep = dir*tstep; 341 float3 next = (orig + dir*tmin); 342 float f = interpolateVoxel(next, volumeptr, volDims, neighbourCoords); 343 float fnext = f; 344 345 // raymarch 346 int steps = 0; 347 int nSteps = floor(native_divide(tmax - tmin, tstep)); 348 bool stop = false; 349 for(int i = 0; i < nSteps; i++) 350 { 351 // fix for wrong steps counting 352 if(!stop) 353 { 354 next += rayStep; 355 356 // fetch voxel 357 int3 ip = convert_int3(round(next)); 358 int3 cmul = ip*volDims; 359 int idx = cmul.x + cmul.y + cmul.z; 360 fnext = tsdfToFloat(volumeptr[idx].tsdf); 361 362 if(fnext != f) 363 { 364 fnext = interpolateVoxel(next, volumeptr, volDims, neighbourCoords); 365 366 // when ray crosses a surface 367 if(signbit(f) != signbit(fnext)) 368 { 369 stop = true; continue; 370 } 371 372 f = fnext; 373 } 374 steps++; 375 } 376 } 377 378 // if ray penetrates a surface from outside 379 // linearly interpolate t between two f values 380 if(f > 0 && fnext < 0) 381 { 382 float3 tp = next - rayStep; 383 float ft = interpolateVoxel(tp, volumeptr, volDims, neighbourCoords); 384 float ftdt = interpolateVoxel(next, volumeptr, volDims, neighbourCoords); 385 // float t = tmin + steps*tstep; 386 // float ts = t - tstep*ft/(ftdt - ft); 387 float ts = tmin + tstep*(steps - native_divide(ft, ftdt - ft)); 388 389 // avoid division by zero 390 if(!isnan(ts) && !isinf(ts)) 391 { 392 float3 pv = orig + dir*ts; 393 float3 nv = getNormalVoxel(pv, volumeptr, volResolution, volDims, neighbourCoords); 394 395 if(!any(isnan(nv))) 396 { 397 //convert pv and nv to camera space 398 normal = (float3)(dot(nv, volRot0), 399 dot(nv, volRot1), 400 dot(nv, volRot2)); 401 // interpolation optimized a little 402 pv *= voxelSize; 403 point = (float3)(dot(pv, volRot0), 404 dot(pv, volRot1), 405 dot(pv, volRot2)) + volTrans; 406 } 407 } 408 } 409 } 410 411 __global float* pts = (__global float*)(pointsptr + points_offset + y*points_step + x*sizeof(ptype)); 412 __global float* nrm = (__global float*)(normalsptr + normals_offset + y*normals_step + x*sizeof(ptype)); 413 vstore4((float4)(point, 0), 0, pts); 414 vstore4((float4)(normal, 0), 0, nrm); 415} 416 417 418__kernel void getNormals(__global const char * pointsptr, 419 int points_step, int points_offset, 420 __global char * normalsptr, 421 int normals_step, int normals_offset, 422 const int2 frameSize, 423 __global const struct TsdfVoxel* volumeptr, 424 __global const float * volPoseptr, 425 __global const float * invPoseptr, 426 const float voxelSizeInv, 427 const int4 volResolution4, 428 const int4 volDims4, 429 const int8 neighbourCoords 430 ) 431{ 432 int x = get_global_id(0); 433 int y = get_global_id(1); 434 435 if(x >= frameSize.x || y >= frameSize.y) 436 return; 437 438 // coordinate-independent constants 439 440 __global const float* vp = volPoseptr; 441 const float3 volRot0 = vload4(0, vp).xyz; 442 const float3 volRot1 = vload4(1, vp).xyz; 443 const float3 volRot2 = vload4(2, vp).xyz; 444 const float3 volTrans = (float3)(vp[3], vp[7], vp[11]); 445 446 __global const float* iv = invPoseptr; 447 const float3 invRot0 = vload4(0, iv).xyz; 448 const float3 invRot1 = vload4(1, iv).xyz; 449 const float3 invRot2 = vload4(2, iv).xyz; 450 const float3 invTrans = (float3)(iv[3], iv[7], iv[11]); 451 452 const int3 volResolution = volResolution4.xyz; 453 const int3 volDims = volDims4.xyz; 454 455 // kernel itself 456 457 __global const ptype* ptsRow = (__global const ptype*)(pointsptr + 458 points_offset + 459 y*points_step); 460 float3 p = ptsRow[x].xyz; 461 float3 n = nan((uint)0); 462 if(!any(isnan(p))) 463 { 464 float3 voxPt = (float3)(dot(p, invRot0), 465 dot(p, invRot1), 466 dot(p, invRot2)) + invTrans; 467 voxPt = voxPt * voxelSizeInv; 468 n = getNormalVoxel(voxPt, volumeptr, volResolution, volDims, neighbourCoords); 469 n = (float3)(dot(n, volRot0), 470 dot(n, volRot1), 471 dot(n, volRot2)); 472 } 473 474 __global float* nrm = (__global float*)(normalsptr + 475 normals_offset + 476 y*normals_step + 477 x*sizeof(ptype)); 478 479 vstore4((float4)(n, 0), 0, nrm); 480} 481 482#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable 483 484struct CoordReturn 485{ 486 bool result; 487 float3 point; 488 float3 normal; 489}; 490 491inline struct CoordReturn coord(int x, int y, int z, float3 V, float v0, int axis, 492 __global const struct TsdfVoxel* volumeptr, 493 int3 volResolution, int3 volDims, 494 int8 neighbourCoords, 495 float voxelSize, float voxelSizeInv, 496 const float3 volRot0, 497 const float3 volRot1, 498 const float3 volRot2, 499 const float3 volTrans, 500 bool needNormals, 501 bool scan 502 ) 503{ 504 struct CoordReturn cr; 505 506 // 0 for x, 1 for y, 2 for z 507 bool limits = false; 508 int3 shift; 509 float Vc = 0.f; 510 if(axis == 0) 511 { 512 shift = (int3)(1, 0, 0); 513 limits = (x + 1 < volResolution.x); 514 Vc = V.x; 515 } 516 if(axis == 1) 517 { 518 shift = (int3)(0, 1, 0); 519 limits = (y + 1 < volResolution.y); 520 Vc = V.y; 521 } 522 if(axis == 2) 523 { 524 shift = (int3)(0, 0, 1); 525 limits = (z + 1 < volResolution.z); 526 Vc = V.z; 527 } 528 529 if(limits) 530 { 531 int3 ip = ((int3)(x, y, z)) + shift; 532 int3 cmul = ip*volDims; 533 int idx = cmul.x + cmul.y + cmul.z; 534 535 struct TsdfVoxel voxel = volumeptr[idx]; 536 float vd = tsdfToFloat(voxel.tsdf); 537 int weight = voxel.weight; 538 539 if(weight != 0 && vd != 1.f) 540 { 541 if((v0 > 0 && vd < 0) || (v0 < 0 && vd > 0)) 542 { 543 // calc actual values or estimate amount of space 544 if(!scan) 545 { 546 // linearly interpolate coordinate 547 float Vn = Vc + voxelSize; 548 float dinv = 1.f/(fabs(v0)+fabs(vd)); 549 float inter = (Vc*fabs(vd) + Vn*fabs(v0))*dinv; 550 551 float3 p = (float3)(shift.x ? inter : V.x, 552 shift.y ? inter : V.y, 553 shift.z ? inter : V.z); 554 555 cr.point = (float3)(dot(p, volRot0), 556 dot(p, volRot1), 557 dot(p, volRot2)) + volTrans; 558 559 if(needNormals) 560 { 561 float3 nv = getNormalVoxel(p * voxelSizeInv, 562 volumeptr, volResolution, volDims, neighbourCoords); 563 564 cr.normal = (float3)(dot(nv, volRot0), 565 dot(nv, volRot1), 566 dot(nv, volRot2)); 567 } 568 } 569 570 cr.result = true; 571 return cr; 572 } 573 } 574 } 575 576 cr.result = false; 577 return cr; 578} 579 580 581__kernel void scanSize(__global const struct TsdfVoxel* volumeptr, 582 const int4 volResolution4, 583 const int4 volDims4, 584 const int8 neighbourCoords, 585 __global const float * volPoseptr, 586 const float voxelSize, 587 const float voxelSizeInv, 588 __local int* reducebuf, 589 __global char* groupedSumptr, 590 int groupedSum_slicestep, 591 int groupedSum_step, int groupedSum_offset 592 ) 593{ 594 const int3 volDims = volDims4.xyz; 595 const int3 volResolution = volResolution4.xyz; 596 597 int x = get_global_id(0); 598 int y = get_global_id(1); 599 int z = get_global_id(2); 600 601 bool validVoxel = true; 602 if(x >= volResolution.x || y >= volResolution.y || z >= volResolution.z) 603 validVoxel = false; 604 605 const int gx = get_group_id(0); 606 const int gy = get_group_id(1); 607 const int gz = get_group_id(2); 608 609 const int lx = get_local_id(0); 610 const int ly = get_local_id(1); 611 const int lz = get_local_id(2); 612 const int lw = get_local_size(0); 613 const int lh = get_local_size(1); 614 const int ld = get_local_size(2); 615 const int lsz = lw*lh*ld; 616 const int lid = lx + ly*lw + lz*lw*lh; 617 618 // coordinate-independent constants 619 620 __global const float* vp = volPoseptr; 621 const float3 volRot0 = vload4(0, vp).xyz; 622 const float3 volRot1 = vload4(1, vp).xyz; 623 const float3 volRot2 = vload4(2, vp).xyz; 624 const float3 volTrans = (float3)(vp[3], vp[7], vp[11]); 625 626 // kernel itself 627 int npts = 0; 628 if(validVoxel) 629 { 630 int3 ip = (int3)(x, y, z); 631 int3 cmul = ip*volDims; 632 int idx = cmul.x + cmul.y + cmul.z; 633 struct TsdfVoxel voxel = volumeptr[idx]; 634 float value = tsdfToFloat(voxel.tsdf); 635 int weight = voxel.weight; 636 637 // if voxel is not empty 638 if(weight != 0 && value != 1.f) 639 { 640 float3 V = (((float3)(x, y, z)) + 0.5f)*voxelSize; 641 642 #pragma unroll 643 for(int i = 0; i < 3; i++) 644 { 645 struct CoordReturn cr; 646 cr = coord(x, y, z, V, value, i, 647 volumeptr, volResolution, volDims, 648 neighbourCoords, 649 voxelSize, voxelSizeInv, 650 volRot0, volRot1, volRot2, volTrans, 651 false, true); 652 if(cr.result) 653 { 654 npts++; 655 } 656 } 657 } 658 } 659 660 // reducebuf keeps counters for each thread 661 reducebuf[lid] = npts; 662 663 // reduce counter to local mem 664 665 // maxStep = ctz(lsz), ctz isn't supported on CUDA devices 666 const int c = clz(lsz & -lsz); 667 const int maxStep = c ? 31 - c : c; 668 for(int nstep = 1; nstep <= maxStep; nstep++) 669 { 670 if(lid % (1 << nstep) == 0) 671 { 672 int rto = lid; 673 int rfrom = lid + (1 << (nstep-1)); 674 reducebuf[rto] += reducebuf[rfrom]; 675 } 676 barrier(CLK_LOCAL_MEM_FENCE); 677 } 678 679 if(lid == 0) 680 { 681 __global int* groupedRow = (__global int*)(groupedSumptr + 682 groupedSum_offset + 683 gy*groupedSum_step + 684 gz*groupedSum_slicestep); 685 686 groupedRow[gx] = reducebuf[0]; 687 } 688} 689 690 691__kernel void fillPtsNrm(__global const struct TsdfVoxel* volumeptr, 692 const int4 volResolution4, 693 const int4 volDims4, 694 const int8 neighbourCoords, 695 __global const float * volPoseptr, 696 const float voxelSize, 697 const float voxelSizeInv, 698 const int needNormals, 699 __local float* localbuf, 700 volatile __global int* atomicCtr, 701 __global const char* groupedSumptr, 702 int groupedSum_slicestep, 703 int groupedSum_step, int groupedSum_offset, 704 __global char * pointsptr, 705 int points_step, int points_offset, 706 __global char * normalsptr, 707 int normals_step, int normals_offset 708 ) 709{ 710 const int3 volDims = volDims4.xyz; 711 const int3 volResolution = volResolution4.xyz; 712 713 int x = get_global_id(0); 714 int y = get_global_id(1); 715 int z = get_global_id(2); 716 717 bool validVoxel = true; 718 if(x >= volResolution.x || y >= volResolution.y || z >= volResolution.z) 719 validVoxel = false; 720 721 const int gx = get_group_id(0); 722 const int gy = get_group_id(1); 723 const int gz = get_group_id(2); 724 725 __global int* groupedRow = (__global int*)(groupedSumptr + 726 groupedSum_offset + 727 gy*groupedSum_step + 728 gz*groupedSum_slicestep); 729 730 // this group contains 0 pts, skip it 731 int nptsGroup = groupedRow[gx]; 732 if(nptsGroup == 0) 733 return; 734 735 const int lx = get_local_id(0); 736 const int ly = get_local_id(1); 737 const int lz = get_local_id(2); 738 const int lw = get_local_size(0); 739 const int lh = get_local_size(1); 740 const int ld = get_local_size(2); 741 const int lsz = lw*lh*ld; 742 const int lid = lx + ly*lw + lz*lw*lh; 743 744 // coordinate-independent constants 745 746 __global const float* vp = volPoseptr; 747 const float3 volRot0 = vload4(0, vp).xyz; 748 const float3 volRot1 = vload4(1, vp).xyz; 749 const float3 volRot2 = vload4(2, vp).xyz; 750 const float3 volTrans = (float3)(vp[3], vp[7], vp[11]); 751 752 // kernel itself 753 int npts = 0; 754 float3 parr[3], narr[3]; 755 if(validVoxel) 756 { 757 int3 ip = (int3)(x, y, z); 758 int3 cmul = ip*volDims; 759 int idx = cmul.x + cmul.y + cmul.z; 760 struct TsdfVoxel voxel = volumeptr[idx]; 761 float value = tsdfToFloat(voxel.tsdf); 762 int weight = voxel.weight; 763 764 // if voxel is not empty 765 if(weight != 0 && value != 1.f) 766 { 767 float3 V = (((float3)(x, y, z)) + 0.5f)*voxelSize; 768 769 #pragma unroll 770 for(int i = 0; i < 3; i++) 771 { 772 struct CoordReturn cr; 773 cr = coord(x, y, z, V, value, i, 774 volumeptr, volResolution, volDims, 775 neighbourCoords, 776 voxelSize, voxelSizeInv, 777 volRot0, volRot1, volRot2, volTrans, 778 needNormals, false); 779 780 if(cr.result) 781 { 782 parr[npts] = cr.point; 783 narr[npts] = cr.normal; 784 npts++; 785 } 786 } 787 } 788 } 789 790 // 4 floats per point or normal 791 const int elemStep = 4; 792 793 __local float* normAddr; 794 __local int localCtr; 795 if(lid == 0) 796 localCtr = 0; 797 798 // push all pts (and nrm) from private array to local mem 799 int privateCtr = 0; 800 barrier(CLK_LOCAL_MEM_FENCE); 801 privateCtr = atomic_add(&localCtr, npts); 802 barrier(CLK_LOCAL_MEM_FENCE); 803 804 for(int i = 0; i < npts; i++) 805 { 806 __local float* addr = localbuf + (privateCtr+i)*elemStep; 807 vstore4((float4)(parr[i], 0), 0, addr); 808 } 809 810 if(needNormals) 811 { 812 normAddr = localbuf + localCtr*elemStep; 813 814 for(int i = 0; i < npts; i++) 815 { 816 __local float* addr = normAddr + (privateCtr+i)*elemStep; 817 vstore4((float4)(narr[i], 0), 0, addr); 818 } 819 } 820 821 // debugging purposes 822 if(lid == 0) 823 { 824 if(localCtr != nptsGroup) 825 { 826 printf("!!! fetchPointsNormals result may be incorrect, npts != localCtr at %3d %3d %3d: %3d vs %3d\n", 827 gx, gy, gz, localCtr, nptsGroup); 828 } 829 } 830 831 // copy local buffer to global mem 832 __local int whereToWrite; 833 if(lid == 0) 834 whereToWrite = atomic_add(atomicCtr, localCtr); 835 barrier(CLK_GLOBAL_MEM_FENCE); 836 837 event_t ev[2]; 838 int evn = 0; 839 // points and normals are 1-column matrices 840 __global float* pts = (__global float*)(pointsptr + 841 points_offset + 842 whereToWrite*points_step); 843 ev[evn++] = async_work_group_copy(pts, localbuf, localCtr*elemStep, 0); 844 845 if(needNormals) 846 { 847 __global float* nrm = (__global float*)(normalsptr + 848 normals_offset + 849 whereToWrite*normals_step); 850 ev[evn++] = async_work_group_copy(nrm, normAddr, localCtr*elemStep, 0); 851 } 852 853 wait_group_events(evn, ev); 854} 855