1 // Copyright (C) 2014  Davis E. King (davis@dlib.net)
2 // License: Boost Software License   See LICENSE.txt for the full license.
3 #ifndef DLIB_SHAPE_PREDICToR_H_
4 #define DLIB_SHAPE_PREDICToR_H_
5 
6 #include "shape_predictor_abstract.h"
7 #include "full_object_detection.h"
8 #include "../algs.h"
9 #include "../matrix.h"
10 #include "../geometry.h"
11 #include "../pixel.h"
12 #include "../statistics.h"
13 #include <utility>
14 
15 namespace dlib
16 {
17 
18 // ----------------------------------------------------------------------------------------
19 
20     namespace impl
21     {
22         struct split_feature
23         {
24             unsigned long idx1;
25             unsigned long idx2;
26             float thresh;
27 
serializesplit_feature28             friend inline void serialize (const split_feature& item, std::ostream& out)
29             {
30                 dlib::serialize(item.idx1, out);
31                 dlib::serialize(item.idx2, out);
32                 dlib::serialize(item.thresh, out);
33             }
deserializesplit_feature34             friend inline void deserialize (split_feature& item, std::istream& in)
35             {
36                 dlib::deserialize(item.idx1, in);
37                 dlib::deserialize(item.idx2, in);
38                 dlib::deserialize(item.thresh, in);
39             }
40         };
41 
42 
43         // a tree is just a std::vector<impl::split_feature>.  We use this function to navigate the
44         // tree nodes
left_child(unsigned long idx)45         inline unsigned long left_child (unsigned long idx) { return 2*idx + 1; }
46         /*!
47             ensures
48                 - returns the index of the left child of the binary tree node idx
49         !*/
right_child(unsigned long idx)50         inline unsigned long right_child (unsigned long idx) { return 2*idx + 2; }
51         /*!
52             ensures
53                 - returns the index of the left child of the binary tree node idx
54         !*/
55 
56         struct regression_tree
57         {
58             std::vector<split_feature> splits;
59             std::vector<matrix<float,0,1> > leaf_values;
60 
num_leavesregression_tree61             unsigned long num_leaves() const { return leaf_values.size(); }
62 
operatorregression_tree63             inline const matrix<float,0,1>& operator()(
64                 const std::vector<float>& feature_pixel_values,
65                 unsigned long& i
66             ) const
67             /*!
68                 requires
69                     - All the index values in splits are less than feature_pixel_values.size()
70                     - leaf_values.size() is a power of 2.
71                       (i.e. we require a tree with all the levels fully filled out.
72                     - leaf_values.size() == splits.size()+1
73                       (i.e. there needs to be the right number of leaves given the number of splits in the tree)
74                 ensures
75                     - runs through the tree and returns the vector at the leaf we end up in.
76                     - #i == the selected leaf node index.
77             !*/
78             {
79                 i = 0;
80                 while (i < splits.size())
81                 {
82                     if ((float)feature_pixel_values[splits[i].idx1] - (float)feature_pixel_values[splits[i].idx2] > splits[i].thresh)
83                         i = left_child(i);
84                     else
85                         i = right_child(i);
86                 }
87                 i = i - splits.size();
88                 return leaf_values[i];
89             }
90 
serializeregression_tree91             friend void serialize (const regression_tree& item, std::ostream& out)
92             {
93                 dlib::serialize(item.splits, out);
94                 dlib::serialize(item.leaf_values, out);
95             }
deserializeregression_tree96             friend void deserialize (regression_tree& item, std::istream& in)
97             {
98                 dlib::deserialize(item.splits, in);
99                 dlib::deserialize(item.leaf_values, in);
100             }
101         };
102 
103     // ------------------------------------------------------------------------------------
104 
location(const matrix<float,0,1> & shape,unsigned long idx)105         inline vector<float,2> location (
106             const matrix<float,0,1>& shape,
107             unsigned long idx
108         )
109         /*!
110             requires
111                 - idx < shape.size()/2
112                 - shape.size()%2 == 0
113             ensures
114                 - returns the idx-th point from the shape vector.
115         !*/
116         {
117             return vector<float,2>(shape(idx*2), shape(idx*2+1));
118         }
119 
120     // ------------------------------------------------------------------------------------
121 
nearest_shape_point(const matrix<float,0,1> & shape,const dlib::vector<float,2> & pt)122         inline unsigned long nearest_shape_point (
123             const matrix<float,0,1>& shape,
124             const dlib::vector<float,2>& pt
125         )
126         {
127             // find the nearest part of the shape to this pixel
128             float best_dist = std::numeric_limits<float>::infinity();
129             const unsigned long num_shape_parts = shape.size()/2;
130             unsigned long best_idx = 0;
131             for (unsigned long j = 0; j < num_shape_parts; ++j)
132             {
133                 const float dist = length_squared(location(shape,j)-pt);
134                 if (dist < best_dist)
135                 {
136                     best_dist = dist;
137                     best_idx = j;
138                 }
139             }
140             return best_idx;
141         }
142 
143     // ------------------------------------------------------------------------------------
144 
create_shape_relative_encoding(const matrix<float,0,1> & shape,const std::vector<dlib::vector<float,2>> & pixel_coordinates,std::vector<unsigned long> & anchor_idx,std::vector<dlib::vector<float,2>> & deltas)145         inline void create_shape_relative_encoding (
146             const matrix<float,0,1>& shape,
147             const std::vector<dlib::vector<float,2> >& pixel_coordinates,
148             std::vector<unsigned long>& anchor_idx,
149             std::vector<dlib::vector<float,2> >& deltas
150         )
151         /*!
152             requires
153                 - shape.size()%2 == 0
154                 - shape.size() > 0
155             ensures
156                 - #anchor_idx.size() == pixel_coordinates.size()
157                 - #deltas.size()     == pixel_coordinates.size()
158                 - for all valid i:
159                     - pixel_coordinates[i] == location(shape,#anchor_idx[i]) + #deltas[i]
160         !*/
161         {
162             anchor_idx.resize(pixel_coordinates.size());
163             deltas.resize(pixel_coordinates.size());
164 
165 
166             for (unsigned long i = 0; i < pixel_coordinates.size(); ++i)
167             {
168                 anchor_idx[i] = nearest_shape_point(shape, pixel_coordinates[i]);
169                 deltas[i] = pixel_coordinates[i] - location(shape,anchor_idx[i]);
170             }
171         }
172 
173     // ------------------------------------------------------------------------------------
174 
find_tform_between_shapes(const matrix<float,0,1> & from_shape,const matrix<float,0,1> & to_shape)175         inline point_transform_affine find_tform_between_shapes (
176             const matrix<float,0,1>& from_shape,
177             const matrix<float,0,1>& to_shape
178         )
179         {
180             DLIB_ASSERT(from_shape.size() == to_shape.size() && (from_shape.size()%2) == 0 && from_shape.size() > 0,"");
181             std::vector<vector<float,2> > from_points, to_points;
182             const unsigned long num = from_shape.size()/2;
183             from_points.reserve(num);
184             to_points.reserve(num);
185             if (num == 1)
186             {
187                 // Just use an identity transform if there is only one landmark.
188                 return point_transform_affine();
189             }
190 
191             for (unsigned long i = 0; i < num; ++i)
192             {
193                 from_points.push_back(location(from_shape,i));
194                 to_points.push_back(location(to_shape,i));
195             }
196             return find_similarity_transform(from_points, to_points);
197         }
198 
199     // ------------------------------------------------------------------------------------
200 
normalizing_tform(const rectangle & rect)201         inline point_transform_affine normalizing_tform (
202             const rectangle& rect
203         )
204         /*!
205             ensures
206                 - returns a transform that maps rect.tl_corner() to (0,0) and rect.br_corner()
207                   to (1,1).
208         !*/
209         {
210             std::vector<vector<float,2> > from_points, to_points;
211             from_points.push_back(rect.tl_corner()); to_points.push_back(point(0,0));
212             from_points.push_back(rect.tr_corner()); to_points.push_back(point(1,0));
213             from_points.push_back(rect.br_corner()); to_points.push_back(point(1,1));
214             return find_affine_transform(from_points, to_points);
215         }
216 
217     // ------------------------------------------------------------------------------------
218 
unnormalizing_tform(const rectangle & rect)219         inline point_transform_affine unnormalizing_tform (
220             const rectangle& rect
221         )
222         /*!
223             ensures
224                 - returns a transform that maps (0,0) to rect.tl_corner() and (1,1) to
225                   rect.br_corner().
226         !*/
227         {
228             std::vector<vector<float,2> > from_points, to_points;
229             to_points.push_back(rect.tl_corner()); from_points.push_back(point(0,0));
230             to_points.push_back(rect.tr_corner()); from_points.push_back(point(1,0));
231             to_points.push_back(rect.br_corner()); from_points.push_back(point(1,1));
232             return find_affine_transform(from_points, to_points);
233         }
234 
235     // ------------------------------------------------------------------------------------
236 
237         template <typename image_type, typename feature_type>
extract_feature_pixel_values(const image_type & img_,const rectangle & rect,const matrix<float,0,1> & current_shape,const matrix<float,0,1> & reference_shape,const std::vector<unsigned long> & reference_pixel_anchor_idx,const std::vector<dlib::vector<float,2>> & reference_pixel_deltas,std::vector<feature_type> & feature_pixel_values)238         void extract_feature_pixel_values (
239             const image_type& img_,
240             const rectangle& rect,
241             const matrix<float,0,1>& current_shape,
242             const matrix<float,0,1>& reference_shape,
243             const std::vector<unsigned long>& reference_pixel_anchor_idx,
244             const std::vector<dlib::vector<float,2> >& reference_pixel_deltas,
245             std::vector<feature_type>& feature_pixel_values
246         )
247         /*!
248             requires
249                 - image_type == an image object that implements the interface defined in
250                   dlib/image_processing/generic_image.h
251                 - reference_pixel_anchor_idx.size() == reference_pixel_deltas.size()
252                 - current_shape.size() == reference_shape.size()
253                 - reference_shape.size()%2 == 0
254                 - max(mat(reference_pixel_anchor_idx)) < reference_shape.size()/2
255             ensures
256                 - #feature_pixel_values.size() == reference_pixel_deltas.size()
257                 - for all valid i:
258                     - #feature_pixel_values[i] == the value of the pixel in img_ that
259                       corresponds to the pixel identified by reference_pixel_anchor_idx[i]
260                       and reference_pixel_deltas[i] when the pixel is located relative to
261                       current_shape rather than reference_shape.
262         !*/
263         {
264             const matrix<float,2,2> tform = matrix_cast<float>(find_tform_between_shapes(reference_shape, current_shape).get_m());
265             const point_transform_affine tform_to_img = unnormalizing_tform(rect);
266 
267             const rectangle area = get_rect(img_);
268 
269             const_image_view<image_type> img(img_);
270             feature_pixel_values.resize(reference_pixel_deltas.size());
271             for (unsigned long i = 0; i < feature_pixel_values.size(); ++i)
272             {
273                 // Compute the point in the current shape corresponding to the i-th pixel and
274                 // then map it from the normalized shape space into pixel space.
275                 point p = tform_to_img(tform*reference_pixel_deltas[i] + location(current_shape, reference_pixel_anchor_idx[i]));
276                 if (area.contains(p))
277                     feature_pixel_values[i] = get_pixel_intensity(img[p.y()][p.x()]);
278                 else
279                     feature_pixel_values[i] = 0;
280             }
281         }
282 
283     } // end namespace impl
284 
285 // ----------------------------------------------------------------------------------------
286 
287     class shape_predictor
288     {
289     public:
290 
291 
shape_predictor()292         shape_predictor (
293         )
294         {}
295 
shape_predictor(const matrix<float,0,1> & initial_shape_,const std::vector<std::vector<impl::regression_tree>> & forests_,const std::vector<std::vector<dlib::vector<float,2>>> & pixel_coordinates)296         shape_predictor (
297             const matrix<float,0,1>& initial_shape_,
298             const std::vector<std::vector<impl::regression_tree> >& forests_,
299             const std::vector<std::vector<dlib::vector<float,2> > >& pixel_coordinates
300         ) : initial_shape(initial_shape_), forests(forests_)
301         /*!
302             requires
303                 - initial_shape.size()%2 == 0
304                 - forests.size() == pixel_coordinates.size() == the number of cascades
305                 - for all valid i:
306                     - all the index values in forests[i] are less than pixel_coordinates[i].size()
307                 - for all valid i and j:
308                     - forests[i][j].leaf_values.size() is a power of 2.
309                       (i.e. we require a tree with all the levels fully filled out.
310                     - forests[i][j].leaf_values.size() == forests[i][j].splits.size()+1
311                       (i.e. there need to be the right number of leaves given the number of splits in the tree)
312         !*/
313         {
314             anchor_idx.resize(pixel_coordinates.size());
315             deltas.resize(pixel_coordinates.size());
316             // Each cascade uses a different set of pixels for its features.  We compute
317             // their representations relative to the initial shape now and save it.
318             for (unsigned long i = 0; i < pixel_coordinates.size(); ++i)
319                 impl::create_shape_relative_encoding(initial_shape, pixel_coordinates[i], anchor_idx[i], deltas[i]);
320         }
321 
num_parts()322         unsigned long num_parts (
323         ) const
324         {
325             return initial_shape.size()/2;
326         }
327 
num_features()328         unsigned long num_features (
329         ) const
330         {
331             unsigned long num = 0;
332             for (unsigned long iter = 0; iter < forests.size(); ++iter)
333                 for (unsigned long i = 0; i < forests[iter].size(); ++i)
334                     num += forests[iter][i].num_leaves();
335             return num;
336         }
337 
338         template <typename image_type>
operator()339         full_object_detection operator()(
340             const image_type& img,
341             const rectangle& rect
342         ) const
343         {
344             using namespace impl;
345             matrix<float,0,1> current_shape = initial_shape;
346             std::vector<float> feature_pixel_values;
347             for (unsigned long iter = 0; iter < forests.size(); ++iter)
348             {
349                 extract_feature_pixel_values(img, rect, current_shape, initial_shape,
350                                              anchor_idx[iter], deltas[iter], feature_pixel_values);
351                 unsigned long leaf_idx;
352                 // evaluate all the trees at this level of the cascade.
353                 for (unsigned long i = 0; i < forests[iter].size(); ++i)
354                     current_shape += forests[iter][i](feature_pixel_values, leaf_idx);
355             }
356 
357             // convert the current_shape into a full_object_detection
358             const point_transform_affine tform_to_img = unnormalizing_tform(rect);
359             std::vector<point> parts(current_shape.size()/2);
360             for (unsigned long i = 0; i < parts.size(); ++i)
361                 parts[i] = tform_to_img(location(current_shape, i));
362             return full_object_detection(rect, parts);
363         }
364 
365         template <typename image_type, typename T, typename U>
operator()366         full_object_detection operator()(
367             const image_type& img,
368             const rectangle& rect,
369             std::vector<std::pair<T,U> >& feats
370         ) const
371         {
372             feats.clear();
373             using namespace impl;
374             matrix<float,0,1> current_shape = initial_shape;
375             std::vector<float> feature_pixel_values;
376             unsigned long feat_offset = 0;
377             for (unsigned long iter = 0; iter < forests.size(); ++iter)
378             {
379                 extract_feature_pixel_values(img, rect, current_shape, initial_shape,
380                                              anchor_idx[iter], deltas[iter], feature_pixel_values);
381                 // evaluate all the trees at this level of the cascade.
382                 for (unsigned long i = 0; i < forests[iter].size(); ++i)
383                 {
384                     unsigned long leaf_idx;
385                     current_shape += forests[iter][i](feature_pixel_values, leaf_idx);
386 
387                     feats.push_back(std::make_pair(feat_offset+leaf_idx, 1));
388                     feat_offset += forests[iter][i].num_leaves();
389                 }
390             }
391 
392             // convert the current_shape into a full_object_detection
393             const point_transform_affine tform_to_img = unnormalizing_tform(rect);
394             std::vector<point> parts(current_shape.size()/2);
395             for (unsigned long i = 0; i < parts.size(); ++i)
396                 parts[i] = tform_to_img(location(current_shape, i));
397             return full_object_detection(rect, parts);
398         }
399 
400         friend void serialize (const shape_predictor& item, std::ostream& out);
401 
402         friend void deserialize (shape_predictor& item, std::istream& in);
403 
404     private:
405         matrix<float,0,1> initial_shape;
406         std::vector<std::vector<impl::regression_tree> > forests;
407         std::vector<std::vector<unsigned long> > anchor_idx;
408         std::vector<std::vector<dlib::vector<float,2> > > deltas;
409     };
410 
serialize(const shape_predictor & item,std::ostream & out)411     inline void serialize (const shape_predictor& item, std::ostream& out)
412     {
413         int version = 1;
414         dlib::serialize(version, out);
415         dlib::serialize(item.initial_shape, out);
416         dlib::serialize(item.forests, out);
417         dlib::serialize(item.anchor_idx, out);
418         dlib::serialize(item.deltas, out);
419     }
420 
deserialize(shape_predictor & item,std::istream & in)421     inline void deserialize (shape_predictor& item, std::istream& in)
422     {
423         int version = 0;
424         dlib::deserialize(version, in);
425         if (version != 1)
426             throw serialization_error("Unexpected version found while deserializing dlib::shape_predictor.");
427         dlib::deserialize(item.initial_shape, in);
428         dlib::deserialize(item.forests, in);
429         dlib::deserialize(item.anchor_idx, in);
430         dlib::deserialize(item.deltas, in);
431     }
432 
433 // ----------------------------------------------------------------------------------------
434 // ----------------------------------------------------------------------------------------
435 // ----------------------------------------------------------------------------------------
436 // ----------------------------------------------------------------------------------------
437 
438     template <
439         typename image_array
440         >
test_shape_predictor(const shape_predictor & sp,const image_array & images,const std::vector<std::vector<full_object_detection>> & objects,const std::vector<std::vector<double>> & scales)441     double test_shape_predictor (
442         const shape_predictor& sp,
443         const image_array& images,
444         const std::vector<std::vector<full_object_detection> >& objects,
445         const std::vector<std::vector<double> >& scales
446     )
447     {
448         // make sure requires clause is not broken
449 #ifdef ENABLE_ASSERTS
450         DLIB_CASSERT( images.size() == objects.size() ,
451             "\t double test_shape_predictor()"
452             << "\n\t Invalid inputs were given to this function. "
453             << "\n\t images.size():  " << images.size()
454             << "\n\t objects.size(): " << objects.size()
455         );
456         for (unsigned long i = 0; i < objects.size(); ++i)
457         {
458             for (unsigned long j = 0; j < objects[i].size(); ++j)
459             {
460                 DLIB_CASSERT(objects[i][j].num_parts() == sp.num_parts(),
461                     "\t double test_shape_predictor()"
462                     << "\n\t Invalid inputs were given to this function. "
463                     << "\n\t objects["<<i<<"]["<<j<<"].num_parts(): " << objects[i][j].num_parts()
464                     << "\n\t sp.num_parts(): " << sp.num_parts()
465                 );
466             }
467             if (scales.size() != 0)
468             {
469                 DLIB_CASSERT(objects[i].size() == scales[i].size(),
470                     "\t double test_shape_predictor()"
471                     << "\n\t Invalid inputs were given to this function. "
472                     << "\n\t objects["<<i<<"].size(): " << objects[i].size()
473                     << "\n\t scales["<<i<<"].size(): " << scales[i].size()
474                 );
475 
476             }
477         }
478 #endif
479 
480         running_stats<double> rs;
481         for (unsigned long i = 0; i < objects.size(); ++i)
482         {
483             for (unsigned long j = 0; j < objects[i].size(); ++j)
484             {
485                 // Just use a scale of 1 (i.e. no scale at all) if the caller didn't supply
486                 // any scales.
487                 const double scale = scales.size()==0 ? 1 : scales[i][j];
488 
489                 full_object_detection det = sp(images[i], objects[i][j].get_rect());
490 
491                 for (unsigned long k = 0; k < det.num_parts(); ++k)
492                 {
493                     if (objects[i][j].part(k) != OBJECT_PART_NOT_PRESENT)
494                     {
495                         double score = length(det.part(k) - objects[i][j].part(k))/scale;
496                         rs.add(score);
497                     }
498                 }
499             }
500         }
501         return rs.mean();
502     }
503 
504 // ----------------------------------------------------------------------------------------
505 
506     template <
507         typename image_array
508         >
test_shape_predictor(const shape_predictor & sp,const image_array & images,const std::vector<std::vector<full_object_detection>> & objects)509     double test_shape_predictor (
510         const shape_predictor& sp,
511         const image_array& images,
512         const std::vector<std::vector<full_object_detection> >& objects
513     )
514     {
515         std::vector<std::vector<double> > no_scales;
516         return test_shape_predictor(sp, images, objects, no_scales);
517     }
518 
519 // ----------------------------------------------------------------------------------------
520 
521 }
522 
523 #endif // DLIB_SHAPE_PREDICToR_H_
524 
525