1 /*
2  * Software License Agreement (BSD License)
3  *
4  *  Copyright (c) 2011, Willow Garage, Inc.
5  *  All rights reserved.
6  *
7  *  Redistribution and use in source and binary forms, with or without
8  *  modification, are permitted provided that the following conditions
9  *  are met:
10  *
11  *   * Redistributions of source code must retain the above copyright
12  *     notice, this list of conditions and the following disclaimer.
13  *   * Redistributions in binary form must reproduce the above
14  *     copyright notice, this list of conditions and the following
15  *     disclaimer in the documentation and/or other materials provided
16  *     with the distribution.
17  *   * Neither the name of Willow Garage, Inc. nor the names of its
18  *     contributors may be used to endorse or promote products derived
19  *     from this software without specific prior written permission.
20  *
21  *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  *  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25  *  COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  *  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  *  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
29  *  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  *  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
31  *  ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32  *  POSSIBILITY OF SUCH DAMAGE.
33  *
34  * @author: Koen Buys, Anatoly Baksheev
35  */
36 
37 #include <pcl/gpu/people/bodyparts_detector.h>
38 //#include <pcl/gpu/people/conversions.h>
39 #include <pcl/gpu/people/label_common.h>
40 //#include <pcl/gpu/people/label_segment.h>
41 #include <pcl/gpu/people/label_tree.h>
42 
43 #include <pcl/common/time.h>
44 #include <pcl/console/print.h>
45 #include "cuda.h"
46 #include "emmintrin.h"
47 
48 #include <cassert>
49 #include "internal.h"
50 #include "cuda_async_copy.h"
51 
52 const int MAX_CLUST_SIZE = 25000;
53 const float CLUST_TOL = 0.05f;
54 
RDFBodyPartsDetector(const std::vector<std::string> & tree_files,int rows,int cols)55 pcl::gpu::people::RDFBodyPartsDetector::RDFBodyPartsDetector( const std::vector<std::string>& tree_files, int rows, int cols)
56 : max_cluster_size_(MAX_CLUST_SIZE), cluster_tolerance_(CLUST_TOL)
57 {
58   PCL_DEBUG("[pcl::gpu::people::RDFBodyPartsDetector::RDFBodyPartsDetector] : (D) : Constructor called\n");
59   //TODO replace all asserts with exceptions
60   assert(!tree_files.empty());
61 
62   impl_.reset ( new device::MultiTreeLiveProc(rows, cols) );
63 
64   for(const auto &tree_file : tree_files)
65   {
66     // load the tree file
67     std::vector<trees::Node>  nodes;
68     std::vector<trees::Label> leaves;
69 
70     // this might throw but we haven't done any malloc yet
71     int height = loadTree (tree_file, nodes, leaves );
72     impl_->trees.emplace_back(height, nodes, leaves);
73   }
74 
75   allocate_buffers(rows, cols);
76 }
77 
78 ////////////////////////////////////////////////////////////////////////////////////
79 /// getters
80 
81 size_t
getNumberTrees() const82 pcl::gpu::people::RDFBodyPartsDetector::getNumberTrees() const
83 {
84   return impl_->trees.size();
85 }
86 
87 const pcl::device::Labels&
getLabels() const88 pcl::gpu::people::RDFBodyPartsDetector::getLabels() const
89 {
90   return labels_smoothed_;
91 }
92 
93 const pcl::device::LabelProbability&
getProbability() const94 pcl::gpu::people::RDFBodyPartsDetector::getProbability() const
95 {
96   return P_l_;
97 }
98 
99 const pcl::device::LabelProbability&
getProbability1() const100 pcl::gpu::people::RDFBodyPartsDetector::getProbability1() const
101 {
102   return P_l_1_;
103 }
104 
105 const pcl::device::LabelProbability&
getProbability2() const106 pcl::gpu::people::RDFBodyPartsDetector::getProbability2() const
107 {
108   return P_l_2_;
109 }
110 
111 const pcl::device::LabelProbability&
getPrevProbability1() const112 pcl::gpu::people::RDFBodyPartsDetector::getPrevProbability1() const
113 {
114   return P_l_prev_1_;
115 }
116 
117 const pcl::device::LabelProbability&
getPrevProbability2() const118 pcl::gpu::people::RDFBodyPartsDetector::getPrevProbability2() const
119 {
120   return P_l_prev_2_;
121 }
122 
123 const pcl::gpu::people::RDFBodyPartsDetector::BlobMatrix&
getBlobMatrix() const124 pcl::gpu::people::RDFBodyPartsDetector::getBlobMatrix() const
125 {
126   return blob_matrix_;
127 }
128 
129 ////////////////////////////////////////////////////////////////////////////////////
130 
131 void
allocate_buffers(int rows,int cols)132 pcl::gpu::people::RDFBodyPartsDetector::allocate_buffers(int rows, int cols)
133 {
134   //std::cout << "(I) : RDFBodyPartsDetector::allocate_buffers called with: " << cols << "x" << rows << std::endl;
135 
136   labels_.create(rows, cols);
137   labels_smoothed_.create(rows, cols);
138 
139   // Create all the label probabilities
140   P_l_.create(rows,cols);
141   P_l_Gaus_.create(rows,cols);
142   P_l_Gaus_Temp_.create(rows,cols);
143   P_l_1_.create(rows,cols);
144   P_l_2_.create(rows,cols);
145   P_l_prev_1_.create(rows,cols);
146   P_l_prev_2_.create(rows,cols);
147 
148   lmap_host_.resize(rows * cols);
149 
150   dst_labels_.resize(rows * cols);
151   region_sizes_.resize(rows*cols+1);
152   remap_.resize(rows*cols);
153 
154   comps_.create(rows, cols);
155   device::ConnectedComponents::initEdges(rows, cols, edges_);
156 
157   means_storage_.resize((cols * rows + 1) * 3); // float3 * cols * rows and float3 for cc == -1.
158 
159   blob_matrix_.resize(NUM_PARTS);
160   for(auto &matrix : blob_matrix_)
161   {
162     matrix.clear();
163     matrix.reserve(5000);
164   }
165 }
166 
167 void
process(const pcl::device::Depth & depth,const PointCloud<PointXYZ> & cloud,int min_pts_per_cluster)168 pcl::gpu::people::RDFBodyPartsDetector::process (const pcl::device::Depth& depth, const PointCloud<PointXYZ>& cloud, int min_pts_per_cluster)
169 {
170   //ScopeTime time("ev");
171 
172   int cols = depth.cols();
173   int rows = depth.rows();
174 
175   allocate_buffers(rows, cols);
176 
177   {
178     {
179       //ScopeTime time("--");
180       // Process the depthimage (CUDA)
181       impl_->process(depth, labels_);
182       device::smoothLabelImage(labels_, depth, labels_smoothed_, NUM_PARTS, 5, 300);
183     }
184 
185     //AsyncCopy<unsigned char> async_labels_download(lmap_host_);
186 
187     int c;
188     labels_smoothed_.download(lmap_host_, c);
189     //async_labels_download.download(labels_smoothed_);
190 
191     // cc = generalized floodfill = approximation of euclidian clusterisation
192     device::ConnectedComponents::computeEdges(labels_smoothed_, depth, NUM_PARTS, cluster_tolerance_ * cluster_tolerance_, edges_);
193     device::ConnectedComponents::labelComponents(edges_, comps_);
194 
195     comps_.download(dst_labels_, c);
196 
197     //async_labels_download.waitForCompeltion();
198   }
199 
200   // This was sort indices to blob (sortIndicesToBlob2) method (till line 236)
201   {
202     //ScopeTime time("cvt");
203     std::fill(remap_.begin(), remap_.end(), -1);
204     std::fill(region_sizes_.begin(), region_sizes_.end(), 0);
205 
206     std::fill(means_storage_.begin(), means_storage_.end(), 0);
207     float3* means = (float3*) &means_storage_[3];
208     int *rsizes = &region_sizes_[1];
209 
210     for(auto &matrix : blob_matrix_)
211       matrix.clear();
212 
213     for(std::size_t k = 0; k < dst_labels_.size(); ++k)
214     {
215       const PointXYZ& p = cloud[k];
216       int cc = dst_labels_[k];
217       means[cc].x += p.x;
218       means[cc].y += p.y;
219       means[cc].z += p.z;
220       ++rsizes[cc];
221     }
222 
223     means[-1].z = 0; // cc == -1 means invalid
224 
225     for(std::size_t k = 0; k < dst_labels_.size(); ++k)
226     {
227       int label = lmap_host_[k];
228       int cc    = dst_labels_[k];
229 
230       if (means[cc].z != 0 && min_pts_per_cluster <= rsizes[cc] && rsizes[cc] <= max_cluster_size_)
231       {
232         int ccindex = remap_[cc];
233         if (ccindex == -1)
234         {
235           ccindex = static_cast<int> (blob_matrix_[label].size ());
236           blob_matrix_[label].resize(ccindex + 1);
237           remap_[cc] = ccindex;
238 
239           blob_matrix_[label][ccindex].label = static_cast<part_t> (label);
240           blob_matrix_[label][ccindex].mean.coeffRef(0) = means[cc].x / static_cast<float> (rsizes[cc]);
241           blob_matrix_[label][ccindex].mean.coeffRef(1) = means[cc].y / static_cast<float> (rsizes[cc]);
242           blob_matrix_[label][ccindex].mean.coeffRef(2) = means[cc].z / static_cast<float> (rsizes[cc]);
243           blob_matrix_[label][ccindex].indices.indices.reserve(rsizes[cc]);
244         }
245         blob_matrix_[label][ccindex].indices.indices.push_back(static_cast<int> (k));
246       }
247     }
248 
249     int id = 0;
250     for(auto &matrix : blob_matrix_)
251       for(std::size_t b = 0; b < matrix.size(); ++b)
252       {
253         matrix[b].id = id++;
254         matrix[b].lid = static_cast<int> (b);
255       }
256 
257     buildRelations ( blob_matrix_ );
258   }
259 }
260 
261 void
processProb(const pcl::device::Depth & depth)262 pcl::gpu::people::RDFBodyPartsDetector::processProb (const pcl::device::Depth& depth)
263 {
264   int cols = depth.cols();
265   int rows = depth.rows();
266 
267   allocate_buffers(rows, cols);
268 
269   // Process the depthimage into probabilities (CUDA)
270   //impl_->process(depth, labels_);
271   //impl_->processProb(depth, labels_, P_l_, (int) std::numeric_limits<std::int16_t>::max());
272   impl_->processProb(depth, labels_, P_l_, std::numeric_limits<int>::max());
273 }
274 
275 void
processSmooth(const pcl::device::Depth & depth,const PointCloud<PointXYZ> & cloud,int min_pts_per_cluster)276 pcl::gpu::people::RDFBodyPartsDetector::processSmooth (const pcl::device::Depth& depth, const PointCloud<PointXYZ>& cloud, int min_pts_per_cluster)
277 {
278   device::smoothLabelImage(labels_, depth, labels_smoothed_, NUM_PARTS, 5, 300);
279 
280   //AsyncCopy<unsigned char> async_labels_download(lmap_host_);
281 
282   int c;
283   labels_smoothed_.download(lmap_host_, c);
284   //async_labels_download.download(labels_smoothed_);
285 
286   // cc = generalized floodfill = approximation of euclidian clusterisation
287   device::ConnectedComponents::computeEdges(labels_smoothed_, depth, NUM_PARTS, cluster_tolerance_ * cluster_tolerance_, edges_);
288   device::ConnectedComponents::labelComponents(edges_, comps_);
289 
290   comps_.download(dst_labels_, c);
291 
292   //async_labels_download.waitForCompeltion();
293 
294   // This was sort indices to blob (sortIndicesToBlob2) method (till line 236)
295   {
296     ScopeTime time("[pcl::gpu::people::RDFBodyPartsDetector::processSmooth] : cvt");
297     std::fill(remap_.begin(), remap_.end(), -1);
298     std::fill(region_sizes_.begin(), region_sizes_.end(), 0);
299 
300     std::fill(means_storage_.begin(), means_storage_.end(), 0);
301     float3* means = (float3*) &means_storage_[3];
302     int *rsizes = &region_sizes_[1];
303 
304     for(auto &matrix : blob_matrix_)
305       matrix.clear();
306 
307     for(std::size_t k = 0; k < dst_labels_.size(); ++k)
308     {
309       const PointXYZ& p = cloud[k];
310       int cc = dst_labels_[k];
311       means[cc].x += p.x;
312       means[cc].y += p.y;
313       means[cc].z += p.z;
314       ++rsizes[cc];
315     }
316 
317     means[-1].z = 0; // cc == -1 means invalid
318 
319     for(std::size_t k = 0; k < dst_labels_.size(); ++k)
320     {
321       int label = lmap_host_[k];
322       int cc    = dst_labels_[k];
323 
324       if (means[cc].z != 0 && min_pts_per_cluster <= rsizes[cc] && rsizes[cc] <= max_cluster_size_)
325       {
326         int ccindex = remap_[cc];
327         if (ccindex == -1)
328         {
329           ccindex = static_cast<int> (blob_matrix_[label].size ());
330           blob_matrix_[label].resize(ccindex + 1);
331           remap_[cc] = ccindex;
332 
333           blob_matrix_[label][ccindex].label = static_cast<part_t> (label);
334           blob_matrix_[label][ccindex].mean.coeffRef(0) = means[cc].x / static_cast<float> (rsizes[cc]);
335           blob_matrix_[label][ccindex].mean.coeffRef(1) = means[cc].y / static_cast<float> (rsizes[cc]);
336           blob_matrix_[label][ccindex].mean.coeffRef(2) = means[cc].z / static_cast<float> (rsizes[cc]);
337           blob_matrix_[label][ccindex].indices.indices.reserve(rsizes[cc]);
338         }
339         blob_matrix_[label][ccindex].indices.indices.push_back(static_cast<int> (k));
340       }
341     }
342 
343     int id = 0;
344     for(auto &matrix : blob_matrix_)
345       for(std::size_t b = 0; b < matrix.size(); ++b)
346       {
347         matrix[b].id = id++;
348         matrix[b].lid = static_cast<int> (b);
349       }
350   }
351 }
352 
353 int
processRelations()354 pcl::gpu::people::RDFBodyPartsDetector::processRelations ()
355 {
356   return buildRelations ( blob_matrix_ );
357 }
358 
359 int
processRelations(PersonAttribs::Ptr person_attribs)360 pcl::gpu::people::RDFBodyPartsDetector::processRelations (PersonAttribs::Ptr person_attribs)
361 {
362   return buildRelations ( blob_matrix_, person_attribs );
363 }
364