1 // The contents of this file are in the public domain. See LICENSE_FOR_EXAMPLE_PROGRAMS.txt
2 /*
3     This example shows how to run a CNN based vehicle detector using dlib.  The
4     example loads a pretrained model and uses it to find the rear ends of cars in
5     an image.  We will also visualize some of the detector's processing steps by
6     plotting various intermediate images on the screen.  Viewing these can help
7     you understand how the detector works.
8 
9     The model used by this example was trained by the dnn_mmod_train_find_cars_ex.cpp
10     example.  Also, since this is a CNN, you really should use a GPU to get the
11     best execution speed.  For instance, when run on a NVIDIA 1080ti, this detector
12     runs at 98fps when run on the provided test image.  That's more than an order
13     of magnitude faster than when run on the CPU.
14 
15     Users who are just learning about dlib's deep learning API should read
16     the dnn_introduction_ex.cpp and dnn_introduction2_ex.cpp examples to learn
17     how the API works.  For an introduction to the object detection method you
18     should read dnn_mmod_ex.cpp.
19 
20     You can also see some videos of this vehicle detector running on YouTube:
21         https://www.youtube.com/watch?v=4B3bzmxMAZU
22         https://www.youtube.com/watch?v=bP2SUo5vSlc
23 */
24 
25 
26 #include <iostream>
27 #include <dlib/dnn.h>
28 #include <dlib/image_io.h>
29 #include <dlib/gui_widgets.h>
30 #include <dlib/image_processing.h>
31 
32 using namespace std;
33 using namespace dlib;
34 
35 
36 
// The rear view vehicle detector network.
//
// Building blocks:
//   con5d — a 5x5 convolution with stride 2, so each application halves the
//           spatial resolution of its input.
//   con5  — a 5x5 convolution with stride 1 (resolution preserving).
template <long num_filters, typename SUBNET> using con5d = con<num_filters,5,5,2,2,SUBNET>;
template <long num_filters, typename SUBNET> using con5  = con<num_filters,5,5,1,1,SUBNET>;
// Three strided con5d layers in a row, each followed by affine+relu.  Since
// each con5d halves the resolution, the downsampler as a whole reduces the
// input by 8x (this matches the "8x downsampling layer" note where the CNN
// output is upsampled later in this file).
template <typename SUBNET> using downsampler  = relu<affine<con5d<32, relu<affine<con5d<32, relu<affine<con5d<16,SUBNET>>>>>>>>>;
// A resolution-preserving 5x5 conv block (55 filters) with affine+relu.
template <typename SUBNET> using rcon5  = relu<affine<con5<55,SUBNET>>>;
// The full detector: an image pyramid input feeding the 8x downsampler, three
// rcon5 blocks, and a final 9x9 conv producing the detection score map, all
// wrapped in the MMOD detection loss.
using net_type = loss_mmod<con<1,9,9,1,1,rcon5<rcon5<rcon5<downsampler<input_rgb_image_pyramid<pyramid_down<6>>>>>>>>;
43 
44 // ----------------------------------------------------------------------------------------
45 
main()46 int main() try
47 {
48     net_type net;
49     shape_predictor sp;
50     // You can get this file from http://dlib.net/files/mmod_rear_end_vehicle_detector.dat.bz2
51     // This network was produced by the dnn_mmod_train_find_cars_ex.cpp example program.
52     // As you can see, the file also includes a separately trained shape_predictor.  To see
53     // a generic example of how to train those refer to train_shape_predictor_ex.cpp.
54     deserialize("mmod_rear_end_vehicle_detector.dat") >> net >> sp;
55 
56     matrix<rgb_pixel> img;
57     load_image(img, "../mmod_cars_test_image.jpg");
58 
59     image_window win;
60     win.set_image(img);
61 
62     // Run the detector on the image and show us the output.
63     for (auto&& d : net(img))
64     {
65         // We use a shape_predictor to refine the exact shape and location of the detection
66         // box.  This shape_predictor is trained to simply output the 4 corner points of
67         // the box.  So all we do is make a rectangle that tightly contains those 4 points
68         // and that rectangle is our refined detection position.
69         auto fd = sp(img,d);
70         rectangle rect;
71         for (unsigned long j = 0; j < fd.num_parts(); ++j)
72             rect += fd.part(j);
73         win.add_overlay(rect, rgb_pixel(255,0,0));
74     }
75 
76 
77 
78     cout << "Hit enter to view the intermediate processing steps" << endl;
79     cin.get();
80 
81 
82     // Now let's look at how the detector works.  The high level processing steps look like:
83     //   1. Create an image pyramid and pack the pyramid into one big image.  We call this
84     //      image the "tiled pyramid".
85     //   2. Run the tiled pyramid image through the CNN.  The CNN outputs a new image where
86     //      bright pixels in the output image indicate the presence of cars.
87     //   3. Find pixels in the CNN's output image with a value > 0.  Those locations are your
88     //      preliminary car detections.
89     //   4. Perform non-maximum suppression on the preliminary detections to produce the
90     //      final output.
91     //
92     // We will be plotting the images from steps 1 and 2 so you can visualize what's
93     // happening.  For the CNN's output image, we will use the jet colormap so that "bright"
94     // outputs, i.e. pixels with big values, appear in red and "dim" outputs appear as a
95     // cold blue color.  To do this we pick a range of CNN output values for the color
96     // mapping.  The specific values don't matter.  They are just selected to give a nice
97     // looking output image.
98     const float lower = -2.5;
99     const float upper = 0.0;
100     cout << "jet color mapping range:  lower="<< lower << "  upper="<< upper << endl;
101 
102 
103 
104     // Create a tiled pyramid image and display it on the screen.
105     std::vector<rectangle> rects;
106     matrix<rgb_pixel> tiled_img;
107     // Get the type of pyramid the CNN used
108     using pyramid_type = std::remove_reference<decltype(input_layer(net))>::type::pyramid_type;
109     // And tell create_tiled_pyramid to create the pyramid using that pyramid type.
110     create_tiled_pyramid<pyramid_type>(img, tiled_img, rects,
111                                        input_layer(net).get_pyramid_padding(),
112                                        input_layer(net).get_pyramid_outer_padding());
113     image_window winpyr(tiled_img, "Tiled pyramid");
114 
115 
116 
117     // This CNN detector represents a sliding window detector with 3 sliding windows.  Each
118     // of the 3 windows has a different aspect ratio, allowing it to find vehicles which
119     // are either tall and skinny, squarish, or short and wide.  The aspect ratio of a
120     // detection is determined by which channel in the output image triggers the detection.
121     // Here we are just going to max pool the channels together to get one final image for
122     // our display.  In this image, a pixel will be bright if any of the sliding window
123     // detectors thinks there is a car at that location.
124     cout << "Number of channels in final tensor image: " << net.subnet().get_output().k() << endl;
125     matrix<float> network_output = image_plane(net.subnet().get_output(),0,0);
126     for (long k = 1; k < net.subnet().get_output().k(); ++k)
127         network_output = max_pointwise(network_output, image_plane(net.subnet().get_output(),0,k));
128     // We will also upsample the CNN's output image.  The CNN we defined has an 8x
129     // downsampling layer at the beginning. In the code below we are going to overlay this
130     // CNN output image on top of the raw input image.  To make that look nice it helps to
131     // upsample the CNN output image back to the same resolution as the input image, which
132     // we do here.
133     const double network_output_scale = img.nc()/(double)network_output.nc();
134     resize_image(network_output_scale, network_output);
135 
136 
137     // Display the network's output as a color image.
138     image_window win_output(jet(network_output, upper, lower), "Output tensor from the network");
139 
140 
141     // Also, overlay network_output on top of the tiled image pyramid and display it.
142     for (long r = 0; r < tiled_img.nr(); ++r)
143     {
144         for (long c = 0; c < tiled_img.nc(); ++c)
145         {
146             dpoint tmp(c,r);
147             tmp = input_tensor_to_output_tensor(net, tmp);
148             tmp = point(network_output_scale*tmp);
149             if (get_rect(network_output).contains(tmp))
150             {
151                 float val = network_output(tmp.y(),tmp.x());
152                 // alpha blend the network output pixel with the RGB image to make our
153                 // overlay.
154                 rgb_alpha_pixel p;
155                 assign_pixel(p , colormap_jet(val,lower,upper));
156                 p.alpha = 120;
157                 assign_pixel(tiled_img(r,c), p);
158             }
159         }
160     }
161     // If you look at this image you can see that the vehicles have bright red blobs on
162     // them.  That's the CNN saying "there is a car here!".  You will also notice there is
163     // a certain scale at which it finds cars.  They have to be not too big or too small,
164     // which is why we have an image pyramid.  The pyramid allows us to find cars of all
165     // scales.
166     image_window win_pyr_overlay(tiled_img, "Detection scores on image pyramid");
167 
168 
169 
170 
171     // Finally, we can collapse the pyramid back into the original image.  The CNN doesn't
172     // actually do this step, since it's enough to threshold the tiled pyramid image to get
173     // the detections.  However, it makes a nice visualization and clearly indicates that
174     // the detector is firing for all the cars.
175     matrix<float> collapsed(img.nr(), img.nc());
176     resizable_tensor input_tensor;
177     input_layer(net).to_tensor(&img, &img+1, input_tensor);
178     for (long r = 0; r < collapsed.nr(); ++r)
179     {
180         for (long c = 0; c < collapsed.nc(); ++c)
181         {
182             // Loop over a bunch of scale values and look up what part of network_output
183             // corresponds to the point(c,r) in the original image, then take the max
184             // detection score over all the scales and save it at pixel point(c,r).
185             float max_score = -1e30;
186             for (double scale = 1; scale > 0.2; scale *= 5.0/6.0)
187             {
188                 // Map from input image coordinates to tiled pyramid coordinates.
189                 dpoint tmp = center(input_layer(net).image_space_to_tensor_space(input_tensor,scale, drectangle(dpoint(c,r))));
190                 // Now map from pyramid coordinates to network_output coordinates.
191                 tmp = point(network_output_scale*input_tensor_to_output_tensor(net, tmp));
192 
193                 if (get_rect(network_output).contains(tmp))
194                 {
195                     float val = network_output(tmp.y(),tmp.x());
196                     if (val > max_score)
197                         max_score = val;
198                 }
199             }
200 
201             collapsed(r,c) = max_score;
202 
203             // Also blend the scores into the original input image so we can view it as
204             // an overlay on the cars.
205             rgb_alpha_pixel p;
206             assign_pixel(p , colormap_jet(max_score,lower,upper));
207             p.alpha = 120;
208             assign_pixel(img(r,c), p);
209         }
210     }
211 
212     image_window win_collapsed(jet(collapsed, upper, lower), "Collapsed output tensor from the network");
213     image_window win_img_and_sal(img, "Collapsed detection scores on raw image");
214 
215 
216     cout << "Hit enter to end program" << endl;
217     cin.get();
218 }
219 catch(image_load_error& e)
220 {
221     cout << e.what() << endl;
222     cout << "The test image is located in the examples folder.  So you should run this program from a sub folder so that the relative path is correct." << endl;
223 }
224 catch(serialization_error& e)
225 {
226     cout << e.what() << endl;
227     cout << "The correct model file can be obtained from: http://dlib.net/files/mmod_rear_end_vehicle_detector.dat.bz2" << endl;
228 }
229 catch(std::exception& e)
230 {
231     cout << e.what() << endl;
232 }
233 
234 
235 
236 
237