1 // Copyright (C) 2014 Davis E. King (davis@dlib.net) 2 // License: Boost Software License See LICENSE.txt for the full license. 3 #ifndef DLIB_SHAPE_PREDICToR_H_ 4 #define DLIB_SHAPE_PREDICToR_H_ 5 6 #include "shape_predictor_abstract.h" 7 #include "full_object_detection.h" 8 #include "../algs.h" 9 #include "../matrix.h" 10 #include "../geometry.h" 11 #include "../pixel.h" 12 #include "../statistics.h" 13 #include <utility> 14 15 namespace dlib 16 { 17 18 // ---------------------------------------------------------------------------------------- 19 20 namespace impl 21 { 22 struct split_feature 23 { 24 unsigned long idx1; 25 unsigned long idx2; 26 float thresh; 27 serializesplit_feature28 friend inline void serialize (const split_feature& item, std::ostream& out) 29 { 30 dlib::serialize(item.idx1, out); 31 dlib::serialize(item.idx2, out); 32 dlib::serialize(item.thresh, out); 33 } deserializesplit_feature34 friend inline void deserialize (split_feature& item, std::istream& in) 35 { 36 dlib::deserialize(item.idx1, in); 37 dlib::deserialize(item.idx2, in); 38 dlib::deserialize(item.thresh, in); 39 } 40 }; 41 42 43 // a tree is just a std::vector<impl::split_feature>. We use this function to navigate the 44 // tree nodes left_child(unsigned long idx)45 inline unsigned long left_child (unsigned long idx) { return 2*idx + 1; } 46 /*! 47 ensures 48 - returns the index of the left child of the binary tree node idx 49 !*/ right_child(unsigned long idx)50 inline unsigned long right_child (unsigned long idx) { return 2*idx + 2; } 51 /*! 52 ensures 53 - returns the index of the left child of the binary tree node idx 54 !*/ 55 56 struct regression_tree 57 { 58 std::vector<split_feature> splits; 59 std::vector<matrix<float,0,1> > leaf_values; 60 num_leavesregression_tree61 unsigned long num_leaves() const { return leaf_values.size(); } 62 operatorregression_tree63 inline const matrix<float,0,1>& operator()( 64 const std::vector<float>& feature_pixel_values, 65 unsigned long& i 66 ) const 67 /*! 68 requires 69 - All the index values in splits are less than feature_pixel_values.size() 70 - leaf_values.size() is a power of 2. 71 (i.e. we require a tree with all the levels fully filled out. 72 - leaf_values.size() == splits.size()+1 73 (i.e. there needs to be the right number of leaves given the number of splits in the tree) 74 ensures 75 - runs through the tree and returns the vector at the leaf we end up in. 76 - #i == the selected leaf node index. 77 !*/ 78 { 79 i = 0; 80 while (i < splits.size()) 81 { 82 if ((float)feature_pixel_values[splits[i].idx1] - (float)feature_pixel_values[splits[i].idx2] > splits[i].thresh) 83 i = left_child(i); 84 else 85 i = right_child(i); 86 } 87 i = i - splits.size(); 88 return leaf_values[i]; 89 } 90 serializeregression_tree91 friend void serialize (const regression_tree& item, std::ostream& out) 92 { 93 dlib::serialize(item.splits, out); 94 dlib::serialize(item.leaf_values, out); 95 } deserializeregression_tree96 friend void deserialize (regression_tree& item, std::istream& in) 97 { 98 dlib::deserialize(item.splits, in); 99 dlib::deserialize(item.leaf_values, in); 100 } 101 }; 102 103 // ------------------------------------------------------------------------------------ 104 location(const matrix<float,0,1> & shape,unsigned long idx)105 inline vector<float,2> location ( 106 const matrix<float,0,1>& shape, 107 unsigned long idx 108 ) 109 /*! 110 requires 111 - idx < shape.size()/2 112 - shape.size()%2 == 0 113 ensures 114 - returns the idx-th point from the shape vector. 115 !*/ 116 { 117 return vector<float,2>(shape(idx*2), shape(idx*2+1)); 118 } 119 120 // ------------------------------------------------------------------------------------ 121 nearest_shape_point(const matrix<float,0,1> & shape,const dlib::vector<float,2> & pt)122 inline unsigned long nearest_shape_point ( 123 const matrix<float,0,1>& shape, 124 const dlib::vector<float,2>& pt 125 ) 126 { 127 // find the nearest part of the shape to this pixel 128 float best_dist = std::numeric_limits<float>::infinity(); 129 const unsigned long num_shape_parts = shape.size()/2; 130 unsigned long best_idx = 0; 131 for (unsigned long j = 0; j < num_shape_parts; ++j) 132 { 133 const float dist = length_squared(location(shape,j)-pt); 134 if (dist < best_dist) 135 { 136 best_dist = dist; 137 best_idx = j; 138 } 139 } 140 return best_idx; 141 } 142 143 // ------------------------------------------------------------------------------------ 144 create_shape_relative_encoding(const matrix<float,0,1> & shape,const std::vector<dlib::vector<float,2>> & pixel_coordinates,std::vector<unsigned long> & anchor_idx,std::vector<dlib::vector<float,2>> & deltas)145 inline void create_shape_relative_encoding ( 146 const matrix<float,0,1>& shape, 147 const std::vector<dlib::vector<float,2> >& pixel_coordinates, 148 std::vector<unsigned long>& anchor_idx, 149 std::vector<dlib::vector<float,2> >& deltas 150 ) 151 /*! 152 requires 153 - shape.size()%2 == 0 154 - shape.size() > 0 155 ensures 156 - #anchor_idx.size() == pixel_coordinates.size() 157 - #deltas.size() == pixel_coordinates.size() 158 - for all valid i: 159 - pixel_coordinates[i] == location(shape,#anchor_idx[i]) + #deltas[i] 160 !*/ 161 { 162 anchor_idx.resize(pixel_coordinates.size()); 163 deltas.resize(pixel_coordinates.size()); 164 165 166 for (unsigned long i = 0; i < pixel_coordinates.size(); ++i) 167 { 168 anchor_idx[i] = nearest_shape_point(shape, pixel_coordinates[i]); 169 deltas[i] = pixel_coordinates[i] - location(shape,anchor_idx[i]); 170 } 171 } 172 173 // ------------------------------------------------------------------------------------ 174 find_tform_between_shapes(const matrix<float,0,1> & from_shape,const matrix<float,0,1> & to_shape)175 inline point_transform_affine find_tform_between_shapes ( 176 const matrix<float,0,1>& from_shape, 177 const matrix<float,0,1>& to_shape 178 ) 179 { 180 DLIB_ASSERT(from_shape.size() == to_shape.size() && (from_shape.size()%2) == 0 && from_shape.size() > 0,""); 181 std::vector<vector<float,2> > from_points, to_points; 182 const unsigned long num = from_shape.size()/2; 183 from_points.reserve(num); 184 to_points.reserve(num); 185 if (num == 1) 186 { 187 // Just use an identity transform if there is only one landmark. 188 return point_transform_affine(); 189 } 190 191 for (unsigned long i = 0; i < num; ++i) 192 { 193 from_points.push_back(location(from_shape,i)); 194 to_points.push_back(location(to_shape,i)); 195 } 196 return find_similarity_transform(from_points, to_points); 197 } 198 199 // ------------------------------------------------------------------------------------ 200 normalizing_tform(const rectangle & rect)201 inline point_transform_affine normalizing_tform ( 202 const rectangle& rect 203 ) 204 /*! 205 ensures 206 - returns a transform that maps rect.tl_corner() to (0,0) and rect.br_corner() 207 to (1,1). 208 !*/ 209 { 210 std::vector<vector<float,2> > from_points, to_points; 211 from_points.push_back(rect.tl_corner()); to_points.push_back(point(0,0)); 212 from_points.push_back(rect.tr_corner()); to_points.push_back(point(1,0)); 213 from_points.push_back(rect.br_corner()); to_points.push_back(point(1,1)); 214 return find_affine_transform(from_points, to_points); 215 } 216 217 // ------------------------------------------------------------------------------------ 218 unnormalizing_tform(const rectangle & rect)219 inline point_transform_affine unnormalizing_tform ( 220 const rectangle& rect 221 ) 222 /*! 223 ensures 224 - returns a transform that maps (0,0) to rect.tl_corner() and (1,1) to 225 rect.br_corner(). 226 !*/ 227 { 228 std::vector<vector<float,2> > from_points, to_points; 229 to_points.push_back(rect.tl_corner()); from_points.push_back(point(0,0)); 230 to_points.push_back(rect.tr_corner()); from_points.push_back(point(1,0)); 231 to_points.push_back(rect.br_corner()); from_points.push_back(point(1,1)); 232 return find_affine_transform(from_points, to_points); 233 } 234 235 // ------------------------------------------------------------------------------------ 236 237 template <typename image_type, typename feature_type> extract_feature_pixel_values(const image_type & img_,const rectangle & rect,const matrix<float,0,1> & current_shape,const matrix<float,0,1> & reference_shape,const std::vector<unsigned long> & reference_pixel_anchor_idx,const std::vector<dlib::vector<float,2>> & reference_pixel_deltas,std::vector<feature_type> & feature_pixel_values)238 void extract_feature_pixel_values ( 239 const image_type& img_, 240 const rectangle& rect, 241 const matrix<float,0,1>& current_shape, 242 const matrix<float,0,1>& reference_shape, 243 const std::vector<unsigned long>& reference_pixel_anchor_idx, 244 const std::vector<dlib::vector<float,2> >& reference_pixel_deltas, 245 std::vector<feature_type>& feature_pixel_values 246 ) 247 /*! 248 requires 249 - image_type == an image object that implements the interface defined in 250 dlib/image_processing/generic_image.h 251 - reference_pixel_anchor_idx.size() == reference_pixel_deltas.size() 252 - current_shape.size() == reference_shape.size() 253 - reference_shape.size()%2 == 0 254 - max(mat(reference_pixel_anchor_idx)) < reference_shape.size()/2 255 ensures 256 - #feature_pixel_values.size() == reference_pixel_deltas.size() 257 - for all valid i: 258 - #feature_pixel_values[i] == the value of the pixel in img_ that 259 corresponds to the pixel identified by reference_pixel_anchor_idx[i] 260 and reference_pixel_deltas[i] when the pixel is located relative to 261 current_shape rather than reference_shape. 262 !*/ 263 { 264 const matrix<float,2,2> tform = matrix_cast<float>(find_tform_between_shapes(reference_shape, current_shape).get_m()); 265 const point_transform_affine tform_to_img = unnormalizing_tform(rect); 266 267 const rectangle area = get_rect(img_); 268 269 const_image_view<image_type> img(img_); 270 feature_pixel_values.resize(reference_pixel_deltas.size()); 271 for (unsigned long i = 0; i < feature_pixel_values.size(); ++i) 272 { 273 // Compute the point in the current shape corresponding to the i-th pixel and 274 // then map it from the normalized shape space into pixel space. 275 point p = tform_to_img(tform*reference_pixel_deltas[i] + location(current_shape, reference_pixel_anchor_idx[i])); 276 if (area.contains(p)) 277 feature_pixel_values[i] = get_pixel_intensity(img[p.y()][p.x()]); 278 else 279 feature_pixel_values[i] = 0; 280 } 281 } 282 283 } // end namespace impl 284 285 // ---------------------------------------------------------------------------------------- 286 287 class shape_predictor 288 { 289 public: 290 291 shape_predictor()292 shape_predictor ( 293 ) 294 {} 295 shape_predictor(const matrix<float,0,1> & initial_shape_,const std::vector<std::vector<impl::regression_tree>> & forests_,const std::vector<std::vector<dlib::vector<float,2>>> & pixel_coordinates)296 shape_predictor ( 297 const matrix<float,0,1>& initial_shape_, 298 const std::vector<std::vector<impl::regression_tree> >& forests_, 299 const std::vector<std::vector<dlib::vector<float,2> > >& pixel_coordinates 300 ) : initial_shape(initial_shape_), forests(forests_) 301 /*! 302 requires 303 - initial_shape.size()%2 == 0 304 - forests.size() == pixel_coordinates.size() == the number of cascades 305 - for all valid i: 306 - all the index values in forests[i] are less than pixel_coordinates[i].size() 307 - for all valid i and j: 308 - forests[i][j].leaf_values.size() is a power of 2. 309 (i.e. we require a tree with all the levels fully filled out. 310 - forests[i][j].leaf_values.size() == forests[i][j].splits.size()+1 311 (i.e. there need to be the right number of leaves given the number of splits in the tree) 312 !*/ 313 { 314 anchor_idx.resize(pixel_coordinates.size()); 315 deltas.resize(pixel_coordinates.size()); 316 // Each cascade uses a different set of pixels for its features. We compute 317 // their representations relative to the initial shape now and save it. 318 for (unsigned long i = 0; i < pixel_coordinates.size(); ++i) 319 impl::create_shape_relative_encoding(initial_shape, pixel_coordinates[i], anchor_idx[i], deltas[i]); 320 } 321 num_parts()322 unsigned long num_parts ( 323 ) const 324 { 325 return initial_shape.size()/2; 326 } 327 num_features()328 unsigned long num_features ( 329 ) const 330 { 331 unsigned long num = 0; 332 for (unsigned long iter = 0; iter < forests.size(); ++iter) 333 for (unsigned long i = 0; i < forests[iter].size(); ++i) 334 num += forests[iter][i].num_leaves(); 335 return num; 336 } 337 338 template <typename image_type> operator()339 full_object_detection operator()( 340 const image_type& img, 341 const rectangle& rect 342 ) const 343 { 344 using namespace impl; 345 matrix<float,0,1> current_shape = initial_shape; 346 std::vector<float> feature_pixel_values; 347 for (unsigned long iter = 0; iter < forests.size(); ++iter) 348 { 349 extract_feature_pixel_values(img, rect, current_shape, initial_shape, 350 anchor_idx[iter], deltas[iter], feature_pixel_values); 351 unsigned long leaf_idx; 352 // evaluate all the trees at this level of the cascade. 353 for (unsigned long i = 0; i < forests[iter].size(); ++i) 354 current_shape += forests[iter][i](feature_pixel_values, leaf_idx); 355 } 356 357 // convert the current_shape into a full_object_detection 358 const point_transform_affine tform_to_img = unnormalizing_tform(rect); 359 std::vector<point> parts(current_shape.size()/2); 360 for (unsigned long i = 0; i < parts.size(); ++i) 361 parts[i] = tform_to_img(location(current_shape, i)); 362 return full_object_detection(rect, parts); 363 } 364 365 template <typename image_type, typename T, typename U> operator()366 full_object_detection operator()( 367 const image_type& img, 368 const rectangle& rect, 369 std::vector<std::pair<T,U> >& feats 370 ) const 371 { 372 feats.clear(); 373 using namespace impl; 374 matrix<float,0,1> current_shape = initial_shape; 375 std::vector<float> feature_pixel_values; 376 unsigned long feat_offset = 0; 377 for (unsigned long iter = 0; iter < forests.size(); ++iter) 378 { 379 extract_feature_pixel_values(img, rect, current_shape, initial_shape, 380 anchor_idx[iter], deltas[iter], feature_pixel_values); 381 // evaluate all the trees at this level of the cascade. 382 for (unsigned long i = 0; i < forests[iter].size(); ++i) 383 { 384 unsigned long leaf_idx; 385 current_shape += forests[iter][i](feature_pixel_values, leaf_idx); 386 387 feats.push_back(std::make_pair(feat_offset+leaf_idx, 1)); 388 feat_offset += forests[iter][i].num_leaves(); 389 } 390 } 391 392 // convert the current_shape into a full_object_detection 393 const point_transform_affine tform_to_img = unnormalizing_tform(rect); 394 std::vector<point> parts(current_shape.size()/2); 395 for (unsigned long i = 0; i < parts.size(); ++i) 396 parts[i] = tform_to_img(location(current_shape, i)); 397 return full_object_detection(rect, parts); 398 } 399 400 friend void serialize (const shape_predictor& item, std::ostream& out); 401 402 friend void deserialize (shape_predictor& item, std::istream& in); 403 404 private: 405 matrix<float,0,1> initial_shape; 406 std::vector<std::vector<impl::regression_tree> > forests; 407 std::vector<std::vector<unsigned long> > anchor_idx; 408 std::vector<std::vector<dlib::vector<float,2> > > deltas; 409 }; 410 serialize(const shape_predictor & item,std::ostream & out)411 inline void serialize (const shape_predictor& item, std::ostream& out) 412 { 413 int version = 1; 414 dlib::serialize(version, out); 415 dlib::serialize(item.initial_shape, out); 416 dlib::serialize(item.forests, out); 417 dlib::serialize(item.anchor_idx, out); 418 dlib::serialize(item.deltas, out); 419 } 420 deserialize(shape_predictor & item,std::istream & in)421 inline void deserialize (shape_predictor& item, std::istream& in) 422 { 423 int version = 0; 424 dlib::deserialize(version, in); 425 if (version != 1) 426 throw serialization_error("Unexpected version found while deserializing dlib::shape_predictor."); 427 dlib::deserialize(item.initial_shape, in); 428 dlib::deserialize(item.forests, in); 429 dlib::deserialize(item.anchor_idx, in); 430 dlib::deserialize(item.deltas, in); 431 } 432 433 // ---------------------------------------------------------------------------------------- 434 // ---------------------------------------------------------------------------------------- 435 // ---------------------------------------------------------------------------------------- 436 // ---------------------------------------------------------------------------------------- 437 438 template < 439 typename image_array 440 > test_shape_predictor(const shape_predictor & sp,const image_array & images,const std::vector<std::vector<full_object_detection>> & objects,const std::vector<std::vector<double>> & scales)441 double test_shape_predictor ( 442 const shape_predictor& sp, 443 const image_array& images, 444 const std::vector<std::vector<full_object_detection> >& objects, 445 const std::vector<std::vector<double> >& scales 446 ) 447 { 448 // make sure requires clause is not broken 449 #ifdef ENABLE_ASSERTS 450 DLIB_CASSERT( images.size() == objects.size() , 451 "\t double test_shape_predictor()" 452 << "\n\t Invalid inputs were given to this function. " 453 << "\n\t images.size(): " << images.size() 454 << "\n\t objects.size(): " << objects.size() 455 ); 456 for (unsigned long i = 0; i < objects.size(); ++i) 457 { 458 for (unsigned long j = 0; j < objects[i].size(); ++j) 459 { 460 DLIB_CASSERT(objects[i][j].num_parts() == sp.num_parts(), 461 "\t double test_shape_predictor()" 462 << "\n\t Invalid inputs were given to this function. " 463 << "\n\t objects["<<i<<"]["<<j<<"].num_parts(): " << objects[i][j].num_parts() 464 << "\n\t sp.num_parts(): " << sp.num_parts() 465 ); 466 } 467 if (scales.size() != 0) 468 { 469 DLIB_CASSERT(objects[i].size() == scales[i].size(), 470 "\t double test_shape_predictor()" 471 << "\n\t Invalid inputs were given to this function. " 472 << "\n\t objects["<<i<<"].size(): " << objects[i].size() 473 << "\n\t scales["<<i<<"].size(): " << scales[i].size() 474 ); 475 476 } 477 } 478 #endif 479 480 running_stats<double> rs; 481 for (unsigned long i = 0; i < objects.size(); ++i) 482 { 483 for (unsigned long j = 0; j < objects[i].size(); ++j) 484 { 485 // Just use a scale of 1 (i.e. no scale at all) if the caller didn't supply 486 // any scales. 487 const double scale = scales.size()==0 ? 1 : scales[i][j]; 488 489 full_object_detection det = sp(images[i], objects[i][j].get_rect()); 490 491 for (unsigned long k = 0; k < det.num_parts(); ++k) 492 { 493 if (objects[i][j].part(k) != OBJECT_PART_NOT_PRESENT) 494 { 495 double score = length(det.part(k) - objects[i][j].part(k))/scale; 496 rs.add(score); 497 } 498 } 499 } 500 } 501 return rs.mean(); 502 } 503 504 // ---------------------------------------------------------------------------------------- 505 506 template < 507 typename image_array 508 > test_shape_predictor(const shape_predictor & sp,const image_array & images,const std::vector<std::vector<full_object_detection>> & objects)509 double test_shape_predictor ( 510 const shape_predictor& sp, 511 const image_array& images, 512 const std::vector<std::vector<full_object_detection> >& objects 513 ) 514 { 515 std::vector<std::vector<double> > no_scales; 516 return test_shape_predictor(sp, images, objects, no_scales); 517 } 518 519 // ---------------------------------------------------------------------------------------- 520 521 } 522 523 #endif // DLIB_SHAPE_PREDICToR_H_ 524 525