1// This file is part of OpenCV project.
2// It is subject to the license terms in the LICENSE file found in the top-level directory
3// of this distribution and at http://opencv.org/license.html
4
5// This code is also subject to the license terms in the LICENSE_KinectFusion.md file found in this module's directory
6
7typedef char int8_t;
8typedef int8_t TsdfType;
9typedef uchar WeightType;
10
11struct TsdfVoxel
12{
13    TsdfType tsdf;
14    WeightType weight;
15};
16
17static inline TsdfType floatToTsdf(float num)
18{
19    int8_t res = (int8_t) ( (num * (-128)) );
20    res = res ? res : (num < 0 ? 1 : -1);
21    return res;
22}
23
24static inline float tsdfToFloat(TsdfType num)
25{
26    return ( (float) num ) / (-128);
27}
28
29__kernel void integrate(__global const char * depthptr,
30                        int depth_step, int depth_offset,
31                        int depth_rows, int depth_cols,
32                        __global struct TsdfVoxel * volumeptr,
33                        const float16 vol2camMatrix,
34                        const float voxelSize,
35                        const int4 volResolution4,
36                        const int4 volDims4,
37                        const float2 fxy,
38                        const float2 cxy,
39                        const float dfac,
40                        const float truncDist,
41                        const int maxWeight,
42                        const __global float * pixNorms)
43{
44    int x = get_global_id(0);
45    int y = get_global_id(1);
46
47    const int3 volResolution = volResolution4.xyz;
48
49    if(x >= volResolution.x || y >= volResolution.y)
50        return;
51
52    // coord-independent constants
53    const int3 volDims = volDims4.xyz;
54    const float2 limits = (float2)(depth_cols-1, depth_rows-1);
55
56    const float4 vol2cam0 = vol2camMatrix.s0123;
57    const float4 vol2cam1 = vol2camMatrix.s4567;
58    const float4 vol2cam2 = vol2camMatrix.s89ab;
59
60    const float truncDistInv = 1.f/truncDist;
61
62    // optimization of camSpace transformation (vector addition instead of matmul at each z)
63    float4 inPt = (float4)(x*voxelSize, y*voxelSize, 0, 1);
64    float3 basePt = (float3)(dot(vol2cam0, inPt),
65                             dot(vol2cam1, inPt),
66                             dot(vol2cam2, inPt));
67
68    float3 camSpacePt = basePt;
69
70    // zStep == vol2cam*(float3(x, y, 1)*voxelSize) - basePt;
71    float3 zStep = ((float3)(vol2cam0.z, vol2cam1.z, vol2cam2.z))*voxelSize;
72
73    int volYidx = x*volDims.x + y*volDims.y;
74
75    int startZ, endZ;
76    if(fabs(zStep.z) > 1e-5f)
77    {
78        int baseZ = convert_int(-basePt.z / zStep.z);
79        if(zStep.z > 0)
80        {
81            startZ = baseZ;
82            endZ = volResolution.z;
83        }
84        else
85        {
86            startZ = 0;
87            endZ = baseZ;
88        }
89    }
90    else
91    {
92        if(basePt.z > 0)
93        {
94            startZ = 0; endZ = volResolution.z;
95        }
96        else
97        {
98            // z loop shouldn't be performed
99            //startZ = endZ = 0;
100            return;
101        }
102    }
103
104    startZ = max(0, startZ);
105    endZ = min(volResolution.z, endZ);
106
107    for(int z = startZ; z < endZ; z++)
108    {
109        // optimization of the following:
110        //float3 camSpacePt = vol2cam * ((float3)(x, y, z)*voxelSize);
111        camSpacePt += zStep;
112
113        if(camSpacePt.z <= 0)
114            continue;
115
116        float3 camPixVec = camSpacePt / camSpacePt.z;
117        float2 projected = mad(camPixVec.xy, fxy, cxy);
118
119        float v;
120        // bilinearly interpolate depth at projected
121        if(all(projected >= 0) && all(projected < limits))
122        {
123            float2 ip = floor(projected);
124            int xi = ip.x, yi = ip.y;
125
126            __global const float* row0 = (__global const float*)(depthptr + depth_offset +
127                                                                 (yi+0)*depth_step);
128            __global const float* row1 = (__global const float*)(depthptr + depth_offset +
129                                                                 (yi+1)*depth_step);
130
131            float v00 = row0[xi+0];
132            float v01 = row0[xi+1];
133            float v10 = row1[xi+0];
134            float v11 = row1[xi+1];
135            float4 vv = (float4)(v00, v01, v10, v11);
136
137            // assume correct depth is positive
138            if(all(vv > 0))
139            {
140                float2 t = projected - ip;
141                float2 vf = mix(vv.xz, vv.yw, t.x);
142                v = mix(vf.s0, vf.s1, t.y);
143            }
144            else
145                continue;
146        }
147        else
148            continue;
149
150        if(v == 0)
151            continue;
152
153        int idx = projected.y * depth_cols + projected.x;
154        float pixNorm = pixNorms[idx];
155        //float pixNorm = length(camPixVec);
156
157        // difference between distances of point and of surface to camera
158        float sdf = pixNorm*(v*dfac - camSpacePt.z);
159        // possible alternative is:
160        // float sdf = length(camSpacePt)*(v*dfac/camSpacePt.z - 1.0);
161
162        if(sdf >= -truncDist)
163        {
164            float tsdf = fmin(1.0f, sdf * truncDistInv);
165            int volIdx = volYidx + z*volDims.z;
166
167            struct TsdfVoxel voxel = volumeptr[volIdx];
168            float value  = tsdfToFloat(voxel.tsdf);
169            int weight = voxel.weight;
170
171            // update TSDF
172            value = (value*weight + tsdf) / (weight + 1);
173            weight = min(weight + 1, maxWeight);
174
175            voxel.tsdf = floatToTsdf(value);
176            voxel.weight = weight;
177            volumeptr[volIdx] = voxel;
178        }
179    }
180}
181
182
183inline float interpolateVoxel(float3 p, __global const struct TsdfVoxel* volumePtr,
184                              int3 volDims, int8 neighbourCoords)
185{
186    float3 fip = floor(p);
187    int3 ip = convert_int3(fip);
188    float3 t = p - fip;
189
190    int3 cmul = volDims*ip;
191    int coordBase = cmul.x + cmul.y + cmul.z;
192    int nco[8];
193    vstore8(neighbourCoords + coordBase, 0, nco);
194
195    float vaz[8];
196    for(int i = 0; i < 8; i++)
197        vaz[i] = tsdfToFloat(volumePtr[nco[i]].tsdf);
198
199    float8 vz = vload8(0, vaz);
200
201    float4 vy = mix(vz.s0246, vz.s1357, t.z);
202    float2 vx = mix(vy.s02, vy.s13, t.y);
203    return mix(vx.s0, vx.s1, t.x);
204}
205
206inline float3 getNormalVoxel(float3 p, __global const struct TsdfVoxel* volumePtr,
207                             int3 volResolution, int3 volDims, int8 neighbourCoords)
208{
209    if(any(p < 1) || any(p >= convert_float3(volResolution - 2)))
210        return nan((uint)0);
211
212    float3 fip = floor(p);
213    int3 ip = convert_int3(fip);
214    float3 t = p - fip;
215
216    int3 cmul = volDims*ip;
217    int coordBase = cmul.x + cmul.y + cmul.z;
218    int nco[8];
219    vstore8(neighbourCoords + coordBase, 0, nco);
220
221    int arDims[3];
222    vstore3(volDims, 0, arDims);
223    float an[3];
224    for(int c = 0; c < 3; c++)
225    {
226        int dim = arDims[c];
227
228        float vaz[8];
229        for(int i = 0; i < 8; i++)
230            vaz[i] = tsdfToFloat(volumePtr[nco[i] + dim].tsdf) -
231                     tsdfToFloat(volumePtr[nco[i] - dim].tsdf);
232
233        float8 vz = vload8(0, vaz);
234
235        float4 vy = mix(vz.s0246, vz.s1357, t.z);
236        float2 vx = mix(vy.s02, vy.s13, t.y);
237
238        an[c] = mix(vx.s0, vx.s1, t.x);
239    }
240
241    //gradientDeltaFactor is fixed at 1.0 of voxel size
242    float3 n = vload3(0, an);
243    float Norm = sqrt(n.x*n.x + n.y*n.y + n.z*n.z);
244    return Norm < 0.0001f ? nan((uint)0) : n / Norm;
245    //return fast_normalize(vload3(0, an));
246}
247
248typedef float4 ptype;
249
250__kernel void raycast(__global char * pointsptr,
251                      int points_step, int points_offset,
252                      __global char * normalsptr,
253                      int normals_step, int normals_offset,
254                      const int2 frameSize,
255                      __global const struct TsdfVoxel * volumeptr,
256                      __global const float * vol2camptr,
257                      __global const float * cam2volptr,
258                      const float2 fixy,
259                      const float2 cxy,
260                      const float4 boxDown4,
261                      const float4 boxUp4,
262                      const float tstep,
263                      const float voxelSize,
264                      const int4 volResolution4,
265                      const int4 volDims4,
266                      const int8 neighbourCoords
267                      )
268{
269    int x = get_global_id(0);
270    int y = get_global_id(1);
271
272    if(x >= frameSize.x || y >= frameSize.y)
273        return;
274
275    // coordinate-independent constants
276
277    __global const float* cm = cam2volptr;
278    const float3 camRot0  = vload4(0, cm).xyz;
279    const float3 camRot1  = vload4(1, cm).xyz;
280    const float3 camRot2  = vload4(2, cm).xyz;
281    const float3 camTrans = (float3)(cm[3], cm[7], cm[11]);
282
283    __global const float* vm = vol2camptr;
284    const float3 volRot0  = vload4(0, vm).xyz;
285    const float3 volRot1  = vload4(1, vm).xyz;
286    const float3 volRot2  = vload4(2, vm).xyz;
287    const float3 volTrans = (float3)(vm[3], vm[7], vm[11]);
288
289    const float3 boxDown = boxDown4.xyz;
290    const float3 boxUp   = boxUp4.xyz;
291    const int3   volDims = volDims4.xyz;
292
293    const int3 volResolution = volResolution4.xyz;
294
295    const float invVoxelSize = native_recip(voxelSize);
296
297    // kernel itself
298
299    float3 point  = nan((uint)0);
300    float3 normal = nan((uint)0);
301
302    float3 orig = camTrans;
303
304    // get direction through pixel in volume space:
305    // 1. reproject (x, y) on projecting plane where z = 1.f
306    float3 planed = (float3)(((float2)(x, y) - cxy)*fixy, 1.f);
307
308    // 2. rotate to volume space
309    planed = (float3)(dot(planed, camRot0),
310                      dot(planed, camRot1),
311                      dot(planed, camRot2));
312
313    // 3. normalize
314    float3 dir = fast_normalize(planed);
315
316    // compute intersection of ray with all six bbox planes
317    float3 rayinv = native_recip(dir);
318    float3 tbottom = rayinv*(boxDown - orig);
319    float3 ttop    = rayinv*(boxUp   - orig);
320
321    // re-order intersections to find smallest and largest on each axis
322    float3 minAx = min(ttop, tbottom);
323    float3 maxAx = max(ttop, tbottom);
324
325    // near clipping plane
326    const float clip = 0.f;
327    float tmin = max(max(max(minAx.x, minAx.y), max(minAx.x, minAx.z)), clip);
328    float tmax =     min(min(maxAx.x, maxAx.y), min(maxAx.x, maxAx.z));
329
330    // precautions against getting coordinates out of bounds
331    tmin = tmin + tstep;
332    tmax = tmax - tstep;
333
334    if(tmin < tmax)
335    {
336        // interpolation optimized a little
337        orig *= invVoxelSize;
338        dir  *= invVoxelSize;
339
340        float3 rayStep = dir*tstep;
341        float3 next = (orig + dir*tmin);
342        float f = interpolateVoxel(next, volumeptr, volDims, neighbourCoords);
343        float fnext = f;
344
345        // raymarch
346        int steps = 0;
347        int nSteps = floor(native_divide(tmax - tmin, tstep));
348        bool stop = false;
349        for(int i = 0; i < nSteps; i++)
350        {
351            // fix for wrong steps counting
352            if(!stop)
353            {
354                next += rayStep;
355
356                // fetch voxel
357                int3 ip = convert_int3(round(next));
358                int3 cmul = ip*volDims;
359                int idx = cmul.x + cmul.y + cmul.z;
360                fnext = tsdfToFloat(volumeptr[idx].tsdf);
361
362                if(fnext != f)
363                {
364                    fnext = interpolateVoxel(next, volumeptr, volDims, neighbourCoords);
365
366                    // when ray crosses a surface
367                    if(signbit(f) != signbit(fnext))
368                    {
369                        stop = true; continue;
370                    }
371
372                    f = fnext;
373                }
374                steps++;
375            }
376        }
377
378        // if ray penetrates a surface from outside
379        // linearly interpolate t between two f values
380        if(f > 0 && fnext < 0)
381        {
382            float3 tp = next - rayStep;
383            float ft   = interpolateVoxel(tp,   volumeptr, volDims, neighbourCoords);
384            float ftdt = interpolateVoxel(next, volumeptr, volDims, neighbourCoords);
385            // float t = tmin + steps*tstep;
386            // float ts = t - tstep*ft/(ftdt - ft);
387            float ts = tmin + tstep*(steps - native_divide(ft, ftdt - ft));
388
389            // avoid division by zero
390            if(!isnan(ts) && !isinf(ts))
391            {
392                float3 pv = orig + dir*ts;
393                float3 nv = getNormalVoxel(pv, volumeptr, volResolution, volDims, neighbourCoords);
394
395                if(!any(isnan(nv)))
396                {
397                    //convert pv and nv to camera space
398                    normal = (float3)(dot(nv, volRot0),
399                                      dot(nv, volRot1),
400                                      dot(nv, volRot2));
401                    // interpolation optimized a little
402                    pv *= voxelSize;
403                    point = (float3)(dot(pv, volRot0),
404                                     dot(pv, volRot1),
405                                     dot(pv, volRot2)) + volTrans;
406                }
407            }
408        }
409    }
410
411    __global float* pts = (__global float*)(pointsptr  +  points_offset + y*points_step  + x*sizeof(ptype));
412    __global float* nrm = (__global float*)(normalsptr + normals_offset + y*normals_step + x*sizeof(ptype));
413    vstore4((float4)(point,  0), 0, pts);
414    vstore4((float4)(normal, 0), 0, nrm);
415}
416
417
418__kernel void getNormals(__global const char * pointsptr,
419                         int points_step, int points_offset,
420                         __global char * normalsptr,
421                         int normals_step, int normals_offset,
422                         const int2 frameSize,
423                         __global const struct TsdfVoxel* volumeptr,
424                         __global const float * volPoseptr,
425                         __global const float * invPoseptr,
426                         const float voxelSizeInv,
427                         const int4 volResolution4,
428                         const int4 volDims4,
429                         const int8 neighbourCoords
430                         )
431{
432    int x = get_global_id(0);
433    int y = get_global_id(1);
434
435    if(x >= frameSize.x || y >= frameSize.y)
436        return;
437
438    // coordinate-independent constants
439
440    __global const float* vp = volPoseptr;
441    const float3 volRot0  = vload4(0, vp).xyz;
442    const float3 volRot1  = vload4(1, vp).xyz;
443    const float3 volRot2  = vload4(2, vp).xyz;
444    const float3 volTrans = (float3)(vp[3], vp[7], vp[11]);
445
446    __global const float* iv = invPoseptr;
447    const float3 invRot0 = vload4(0, iv).xyz;
448    const float3 invRot1 = vload4(1, iv).xyz;
449    const float3 invRot2 = vload4(2, iv).xyz;
450    const float3 invTrans = (float3)(iv[3], iv[7], iv[11]);
451
452    const int3 volResolution = volResolution4.xyz;
453    const int3 volDims = volDims4.xyz;
454
455    // kernel itself
456
457    __global const ptype* ptsRow = (__global const ptype*)(pointsptr +
458                                                           points_offset +
459                                                           y*points_step);
460    float3 p = ptsRow[x].xyz;
461    float3 n = nan((uint)0);
462    if(!any(isnan(p)))
463    {
464        float3 voxPt = (float3)(dot(p, invRot0),
465                                dot(p, invRot1),
466                                dot(p, invRot2)) + invTrans;
467        voxPt = voxPt * voxelSizeInv;
468        n = getNormalVoxel(voxPt, volumeptr, volResolution, volDims, neighbourCoords);
469        n = (float3)(dot(n, volRot0),
470                     dot(n, volRot1),
471                     dot(n, volRot2));
472    }
473
474    __global float* nrm = (__global float*)(normalsptr +
475                                            normals_offset +
476                                            y*normals_step +
477                                            x*sizeof(ptype));
478
479    vstore4((float4)(n, 0), 0, nrm);
480}
481
482#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable
483
484struct CoordReturn
485{
486    bool result;
487    float3 point;
488    float3 normal;
489};
490
491inline struct CoordReturn coord(int x, int y, int z, float3 V, float v0, int axis,
492                                __global const struct TsdfVoxel* volumeptr,
493                                int3 volResolution, int3 volDims,
494                                int8 neighbourCoords,
495                                float voxelSize, float voxelSizeInv,
496                                const float3 volRot0,
497                                const float3 volRot1,
498                                const float3 volRot2,
499                                const float3 volTrans,
500                                bool needNormals,
501                                bool scan
502                                )
503{
504    struct CoordReturn cr;
505
506    // 0 for x, 1 for y, 2 for z
507    bool limits = false;
508    int3 shift;
509    float Vc = 0.f;
510    if(axis == 0)
511    {
512        shift = (int3)(1, 0, 0);
513        limits = (x + 1 < volResolution.x);
514        Vc = V.x;
515    }
516    if(axis == 1)
517    {
518        shift = (int3)(0, 1, 0);
519        limits = (y + 1 < volResolution.y);
520        Vc = V.y;
521    }
522    if(axis == 2)
523    {
524        shift = (int3)(0, 0, 1);
525        limits = (z + 1 < volResolution.z);
526        Vc = V.z;
527    }
528
529    if(limits)
530    {
531        int3 ip = ((int3)(x, y, z)) + shift;
532        int3 cmul = ip*volDims;
533        int idx = cmul.x + cmul.y + cmul.z;
534
535        struct TsdfVoxel voxel = volumeptr[idx];
536        float vd  = tsdfToFloat(voxel.tsdf);
537        int weight = voxel.weight;
538
539        if(weight != 0 && vd != 1.f)
540        {
541            if((v0 > 0 && vd < 0) || (v0 < 0 && vd > 0))
542            {
543                // calc actual values or estimate amount of space
544                if(!scan)
545                {
546                    // linearly interpolate coordinate
547                    float Vn = Vc + voxelSize;
548                    float dinv = 1.f/(fabs(v0)+fabs(vd));
549                    float inter = (Vc*fabs(vd) + Vn*fabs(v0))*dinv;
550
551                    float3 p = (float3)(shift.x ? inter : V.x,
552                                        shift.y ? inter : V.y,
553                                        shift.z ? inter : V.z);
554
555                    cr.point = (float3)(dot(p, volRot0),
556                                        dot(p, volRot1),
557                                        dot(p, volRot2)) + volTrans;
558
559                    if(needNormals)
560                    {
561                        float3 nv = getNormalVoxel(p * voxelSizeInv,
562                                                   volumeptr, volResolution, volDims, neighbourCoords);
563
564                        cr.normal = (float3)(dot(nv, volRot0),
565                                             dot(nv, volRot1),
566                                             dot(nv, volRot2));
567                    }
568                }
569
570                cr.result = true;
571                return cr;
572            }
573        }
574    }
575
576    cr.result = false;
577    return cr;
578}
579
580
581__kernel void scanSize(__global const struct TsdfVoxel* volumeptr,
582                       const int4 volResolution4,
583                       const int4 volDims4,
584                       const int8 neighbourCoords,
585                       __global const float * volPoseptr,
586                       const float voxelSize,
587                       const float voxelSizeInv,
588                       __local int* reducebuf,
589                       __global char* groupedSumptr,
590                       int groupedSum_slicestep,
591                       int groupedSum_step, int groupedSum_offset
592                       )
593{
594    const int3 volDims = volDims4.xyz;
595    const int3 volResolution = volResolution4.xyz;
596
597    int x = get_global_id(0);
598    int y = get_global_id(1);
599    int z = get_global_id(2);
600
601    bool validVoxel = true;
602    if(x >= volResolution.x || y >= volResolution.y || z >= volResolution.z)
603        validVoxel = false;
604
605    const int gx = get_group_id(0);
606    const int gy = get_group_id(1);
607    const int gz = get_group_id(2);
608
609    const int lx = get_local_id(0);
610    const int ly = get_local_id(1);
611    const int lz = get_local_id(2);
612    const int lw = get_local_size(0);
613    const int lh = get_local_size(1);
614    const int ld = get_local_size(2);
615    const int lsz = lw*lh*ld;
616    const int lid = lx + ly*lw + lz*lw*lh;
617
618    // coordinate-independent constants
619
620    __global const float* vp = volPoseptr;
621    const float3 volRot0  = vload4(0, vp).xyz;
622    const float3 volRot1  = vload4(1, vp).xyz;
623    const float3 volRot2  = vload4(2, vp).xyz;
624    const float3 volTrans = (float3)(vp[3], vp[7], vp[11]);
625
626    // kernel itself
627    int npts = 0;
628    if(validVoxel)
629    {
630        int3 ip = (int3)(x, y, z);
631        int3 cmul = ip*volDims;
632        int idx = cmul.x + cmul.y + cmul.z;
633        struct TsdfVoxel voxel = volumeptr[idx];
634        float value  = tsdfToFloat(voxel.tsdf);
635        int weight = voxel.weight;
636
637        // if voxel is not empty
638        if(weight != 0 && value != 1.f)
639        {
640            float3 V = (((float3)(x, y, z)) + 0.5f)*voxelSize;
641
642            #pragma unroll
643            for(int i = 0; i < 3; i++)
644            {
645                struct CoordReturn cr;
646                cr = coord(x, y, z, V, value, i,
647                           volumeptr, volResolution, volDims,
648                           neighbourCoords,
649                           voxelSize, voxelSizeInv,
650                           volRot0, volRot1, volRot2, volTrans,
651                           false, true);
652                if(cr.result)
653                {
654                    npts++;
655                }
656            }
657        }
658    }
659
660    // reducebuf keeps counters for each thread
661    reducebuf[lid] = npts;
662
663    // reduce counter to local mem
664
665    // maxStep = ctz(lsz), ctz isn't supported on CUDA devices
666    const int c = clz(lsz & -lsz);
667    const int maxStep = c ? 31 - c : c;
668    for(int nstep = 1; nstep <= maxStep; nstep++)
669    {
670        if(lid % (1 << nstep) == 0)
671        {
672            int rto   = lid;
673            int rfrom = lid + (1 << (nstep-1));
674            reducebuf[rto] += reducebuf[rfrom];
675        }
676        barrier(CLK_LOCAL_MEM_FENCE);
677    }
678
679    if(lid == 0)
680    {
681        __global int* groupedRow = (__global int*)(groupedSumptr +
682                                                   groupedSum_offset +
683                                                   gy*groupedSum_step +
684                                                   gz*groupedSum_slicestep);
685
686        groupedRow[gx] = reducebuf[0];
687    }
688}
689
690
691__kernel void fillPtsNrm(__global const struct TsdfVoxel* volumeptr,
692                         const int4 volResolution4,
693                         const int4 volDims4,
694                         const int8 neighbourCoords,
695                         __global const float * volPoseptr,
696                         const float voxelSize,
697                         const float voxelSizeInv,
698                         const int needNormals,
699                         __local float* localbuf,
700                         volatile __global int* atomicCtr,
701                         __global const char* groupedSumptr,
702                         int groupedSum_slicestep,
703                         int groupedSum_step, int groupedSum_offset,
704                         __global char * pointsptr,
705                         int points_step, int points_offset,
706                         __global char * normalsptr,
707                         int normals_step, int normals_offset
708                         )
709{
710    const int3 volDims = volDims4.xyz;
711    const int3 volResolution = volResolution4.xyz;
712
713    int x = get_global_id(0);
714    int y = get_global_id(1);
715    int z = get_global_id(2);
716
717    bool validVoxel = true;
718    if(x >= volResolution.x || y >= volResolution.y || z >= volResolution.z)
719        validVoxel = false;
720
721    const int gx = get_group_id(0);
722    const int gy = get_group_id(1);
723    const int gz = get_group_id(2);
724
725    __global int* groupedRow = (__global int*)(groupedSumptr +
726                                               groupedSum_offset +
727                                               gy*groupedSum_step +
728                                               gz*groupedSum_slicestep);
729
730    // this group contains 0 pts, skip it
731    int nptsGroup = groupedRow[gx];
732    if(nptsGroup == 0)
733        return;
734
735    const int lx = get_local_id(0);
736    const int ly = get_local_id(1);
737    const int lz = get_local_id(2);
738    const int lw = get_local_size(0);
739    const int lh = get_local_size(1);
740    const int ld = get_local_size(2);
741    const int lsz = lw*lh*ld;
742    const int lid = lx + ly*lw + lz*lw*lh;
743
744    // coordinate-independent constants
745
746    __global const float* vp = volPoseptr;
747    const float3 volRot0  = vload4(0, vp).xyz;
748    const float3 volRot1  = vload4(1, vp).xyz;
749    const float3 volRot2  = vload4(2, vp).xyz;
750    const float3 volTrans = (float3)(vp[3], vp[7], vp[11]);
751
752    // kernel itself
753    int npts = 0;
754    float3 parr[3], narr[3];
755    if(validVoxel)
756    {
757        int3 ip = (int3)(x, y, z);
758        int3 cmul = ip*volDims;
759        int idx = cmul.x + cmul.y + cmul.z;
760        struct TsdfVoxel voxel = volumeptr[idx];
761        float value  = tsdfToFloat(voxel.tsdf);
762        int weight = voxel.weight;
763
764        // if voxel is not empty
765        if(weight != 0 && value != 1.f)
766        {
767            float3 V = (((float3)(x, y, z)) + 0.5f)*voxelSize;
768
769            #pragma unroll
770            for(int i = 0; i < 3; i++)
771            {
772                struct CoordReturn cr;
773                cr = coord(x, y, z, V, value, i,
774                           volumeptr, volResolution, volDims,
775                           neighbourCoords,
776                           voxelSize, voxelSizeInv,
777                           volRot0, volRot1, volRot2, volTrans,
778                           needNormals, false);
779
780                if(cr.result)
781                {
782                    parr[npts] = cr.point;
783                    narr[npts] = cr.normal;
784                    npts++;
785                }
786            }
787        }
788    }
789
790    // 4 floats per point or normal
791    const int elemStep = 4;
792
793    __local float* normAddr;
794    __local int localCtr;
795    if(lid == 0)
796        localCtr = 0;
797
798    // push all pts (and nrm) from private array to local mem
799    int privateCtr = 0;
800    barrier(CLK_LOCAL_MEM_FENCE);
801    privateCtr = atomic_add(&localCtr, npts);
802    barrier(CLK_LOCAL_MEM_FENCE);
803
804    for(int i = 0; i < npts; i++)
805    {
806        __local float* addr = localbuf + (privateCtr+i)*elemStep;
807        vstore4((float4)(parr[i], 0), 0, addr);
808    }
809
810    if(needNormals)
811    {
812        normAddr = localbuf + localCtr*elemStep;
813
814        for(int i = 0; i < npts; i++)
815        {
816            __local float* addr = normAddr + (privateCtr+i)*elemStep;
817            vstore4((float4)(narr[i], 0), 0, addr);
818        }
819    }
820
821    // debugging purposes
822    if(lid == 0)
823    {
824        if(localCtr != nptsGroup)
825        {
826            printf("!!! fetchPointsNormals result may be incorrect, npts != localCtr at %3d %3d %3d: %3d vs %3d\n",
827                   gx, gy, gz, localCtr, nptsGroup);
828        }
829    }
830
831    // copy local buffer to global mem
832    __local int whereToWrite;
833    if(lid == 0)
834        whereToWrite = atomic_add(atomicCtr, localCtr);
835    barrier(CLK_GLOBAL_MEM_FENCE);
836
837    event_t ev[2];
838    int evn = 0;
839    // points and normals are 1-column matrices
840    __global float* pts = (__global float*)(pointsptr +
841                                            points_offset +
842                                            whereToWrite*points_step);
843    ev[evn++] = async_work_group_copy(pts, localbuf, localCtr*elemStep, 0);
844
845    if(needNormals)
846    {
847        __global float* nrm = (__global float*)(normalsptr +
848                                                normals_offset +
849                                                whereToWrite*normals_step);
850        ev[evn++] = async_work_group_copy(nrm, normAddr, localCtr*elemStep, 0);
851    }
852
853    wait_group_events(evn, ev);
854}
855