// The contents of this file are in the public domain. See LICENSE_FOR_EXAMPLE_PROGRAMS.txt
/*

    This is an example illustrating the use of the machine learning
    tools for sequence labeling in the dlib C++ Library.

    The general problem addressed by these tools is the following.
    Suppose you have a set of sequences of some kind and you want to
    learn to predict a label for each element of a sequence.  So for
    example, you might have a set of English sentences where each
    word is labeled with its part of speech and you want to learn a
    model which can predict the part of speech for each word in a new
    sentence.

    Central to these tools is the sequence_labeler object.  It is the
    object which represents the label prediction model.  In particular,
    the model used by this object is the following.  Given an input
    sequence x, predict an output label sequence y such that:
        y == argmax_y dot(weight_vector, PSI(x,y))
    where PSI() is supplied by the user and defines the form of the
    model.  In this example program we will define it such that we
    obtain a simple Hidden Markov Model.  However, it's possible to
    define much more sophisticated models.  You should take a look
    at the following papers for a few examples:
        - Hidden Markov Support Vector Machines by
          Y. Altun, I. Tsochantaridis, T. Hofmann
        - Shallow Parsing with Conditional Random Fields by
          Fei Sha and Fernando Pereira



    In the remainder of this example program we will show how to
    define your own PSI(), as well as how to learn the "weight_vector"
    parameter.  Once you have these two items you will be able to
    use the sequence_labeler to predict the labels of new sequences.
*/


#include <iostream>
#include <dlib/svm_threaded.h>
#include <dlib/rand.h>

using namespace std;
using namespace dlib;


/*
    In this example we will be working with a Hidden Markov Model where
    the hidden nodes and observation nodes both take on 3 different states.
    The task will be to take a sequence of observations and predict the state
    of the corresponding hidden nodes.
*/

const unsigned long num_label_states = 3;
const unsigned long num_sample_states = 3;

// ----------------------------------------------------------------------------------------

class feature_extractor
{
    /*
        This object is where you define your PSI().  To ensure that the argmax_y
        remains a tractable problem, the PSI(x,y) vector is actually a sum of vectors,
        each derived from the entire input sequence x but only part of the label
        sequence y.  This allows the argmax_y to be efficiently solved using the
        well known Viterbi algorithm.
    */
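
    /*
        Concretely, the decomposition is the one implemented by get_features()
        below: PSI(x,y) is the sum, over every position i in the sequence, of
        the sparse per-position vector emitted by get_features() for position i.
        Each per-position vector may look at all of x but only at y(i) and the
        order() labels preceding it, which is what makes Viterbi applicable.
    */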

public:
    // This defines the type used to represent the observed sequence.  You can use
    // any type here so long as it has a .size() which returns the number of things
    // in the sequence.
    typedef std::vector<unsigned long> sequence_type;

    unsigned long num_features() const
    /*!
        ensures
            - returns the dimensionality of the PSI() feature vector.
    !*/
    {
        // Recall that we are defining an HMM.  So in this case the PSI() vector
        // should have the same dimensionality as the number of parameters in the
        // HMM.  With num_label_states == 3 and num_sample_states == 3 that is
        // 3*3 transition parameters plus 3*3 emission parameters, i.e. 18 features.
        return num_label_states*num_label_states + num_label_states*num_sample_states;
    }

    unsigned long order() const
    /*!
        ensures
            - This object represents a Markov model on the output labels.
              This parameter defines the order of the model.  That is, this
              value controls how many previous label values get to be taken
              into consideration when performing feature extraction for a
              particular element of the input sequence.  Note that the runtime
              of the algorithm is exponential in the order.  So don't make order
              very large.
    !*/
    {
        // In this case we are using an HMM model that only looks at the
        // previous label.
        return 1;
    }

    unsigned long num_labels() const
    /*!
        ensures
            - returns the number of possible output labels.
    !*/
    {
        return num_label_states;
    }

    template <typename feature_setter, typename EXP>
    void get_features (
        feature_setter& set_feature,
        const sequence_type& x,
        const matrix_exp<EXP>& y,
        unsigned long position
    ) const
    /*!
        requires
            - EXP::type == unsigned long
              (i.e. y contains unsigned longs)
            - position < x.size()
            - y.size() == min(position, order) + 1
            - is_vector(y) == true
            - max(y) < num_labels()
            - set_feature is a function object which allows expressions of the form:
                - set_feature((unsigned long)feature_index, (double)feature_value);
                - set_feature((unsigned long)feature_index);
        ensures
            - for all valid i:
                - interprets y(i) as the label corresponding to x[position-i]
            - This function computes the part of PSI() corresponding to the x[position]
              element of the input sequence.  Moreover, this part of PSI() is returned as
              a sparse vector by invoking set_feature().  For example, to set the feature
              with an index of 55 to the value of 1 this method would call:
                set_feature(55);
              Or equivalently:
                set_feature(55,1);
              Therefore, the first argument to set_feature is the index of the feature
              to be set while the second argument is the value the feature should take.
              Additionally, note that calling set_feature() multiple times with the same
              feature index does NOT overwrite the old value, it adds to the previous
              value.  For example, if you call set_feature(55) 3 times then it will
              result in feature 55 having a value of 3.
            - This function only calls set_feature() with feature_index values < num_features()
    !*/
    {
        // Again, the features below only define a simple HMM.  But in general, you can
        // use a wide variety of sophisticated feature extraction methods here.

        // Pull out an indicator feature for the type of transition between the
        // previous label and the current label.
        if (y.size() > 1)
            set_feature(y(1)*num_label_states + y(0));

        // Pull out an indicator feature for the type of observed node given
        // the current label.
        set_feature(num_label_states*num_label_states +
                    y(0)*num_sample_states + x[position]);
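
        // As a concrete illustration of the index layout in this 3x3 example:
        // transition features occupy indices 0..8, laid out as
        // previous_label*3 + current_label, and emission features occupy
        // indices 9..17, laid out as 9 + current_label*3 + observation.  So
        // the pair (previous_label=2, current_label=0) sets feature 6, and
        // (current_label=1, observation=2) sets feature 9 + 1*3 + 2 == 14.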
    }
};

// We need to define serialize() and deserialize() for our feature extractor if we want
// to be able to serialize and deserialize our learned models.  In this case the
// implementation is empty since our feature_extractor doesn't have any state.  But you
// might define more complex feature extractors which have state that needs to be saved.
void serialize(const feature_extractor&, std::ostream&) {}
void deserialize(feature_extractor&, std::istream&) {}
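//
// For instance, a hypothetical extractor holding a vocabulary in a
// std::vector<std::string> member named vocab (a sketch only, not part of this
// example) could simply forward to dlib's own serialization routines:
//
//   void serialize(const my_extractor& item, std::ostream& out) { dlib::serialize(item.vocab, out); }
//   void deserialize(my_extractor& item, std::istream& in) { dlib::deserialize(item.vocab, in); }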

// ----------------------------------------------------------------------------------------

void make_dataset (
    const matrix<double>& transition_probabilities,
    const matrix<double>& emission_probabilities,
    std::vector<std::vector<unsigned long> >& samples,
    std::vector<std::vector<unsigned long> >& labels,
    unsigned long dataset_size
);
/*!
    requires
        - transition_probabilities.nr() == transition_probabilities.nc()
        - transition_probabilities.nr() == emission_probabilities.nr()
        - The rows of transition_probabilities and emission_probabilities must sum to 1.
          (i.e. sum_cols(transition_probabilities) and sum_cols(emission_probabilities)
          must evaluate to vectors of all 1s.)
    ensures
        - This function randomly samples a bunch of sequences from the HMM defined by
          transition_probabilities and emission_probabilities.
        - The HMM is defined by:
            - The probability of transitioning from hidden state H1 to H2
              is given by transition_probabilities(H1,H2).
            - The probability of a hidden state H producing an observed state
              O is given by emission_probabilities(H,O).
        - #samples.size() == #labels.size() == dataset_size
        - for all valid i:
            - #labels[i] is a randomly sampled sequence of hidden states from the
              given HMM.  #samples[i] is its corresponding randomly sampled sequence
              of observed states.
!*/

// ----------------------------------------------------------------------------------------

int main()
{
    // We need a dataset to test the machine learning algorithms.  So we are going to
    // define an HMM based on the following two matrices and then randomly sample a
    // set of data from it.  Then we will see if the machine learning method can
    // recover the HMM model from the training data.


    matrix<double> transition_probabilities(num_label_states, num_label_states);
    transition_probabilities = 0.05, 0.90, 0.05,
                               0.05, 0.05, 0.90,
                               0.90, 0.05, 0.05;

    matrix<double> emission_probabilities(num_label_states,num_sample_states);
    emission_probabilities = 0.5, 0.5, 0.0,
                             0.0, 0.5, 0.5,
                             0.5, 0.0, 0.5;

    std::vector<std::vector<unsigned long> > samples;
    std::vector<std::vector<unsigned long> > labels;
    // sample 1000 labeled sequences from the HMM.
    make_dataset(transition_probabilities, emission_probabilities,
                 samples, labels, 1000);

    // print out some of the randomly sampled sequences
    for (int i = 0; i < 10; ++i)
    {
        cout << "hidden states:   " << trans(mat(labels[i]));
        cout << "observed states: " << trans(mat(samples[i]));
        cout << "******************************" << endl;
    }

    // Next we use the structural_sequence_labeling_trainer to learn our
    // prediction model based on just the samples and labels.
    structural_sequence_labeling_trainer<feature_extractor> trainer;
    // This is the common SVM C parameter.  Larger values encourage the
    // trainer to attempt to fit the data exactly but might overfit.
    // In general, you determine this parameter by cross-validation.
    trainer.set_c(4);
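    // (A sketch of that process, not executed here: try a few candidate values
    // such as 0.1, 1, 4, and 10, calling trainer.set_c() with each one, score
    // each with the cross_validate_sequence_labeler() call demonstrated below,
    // and keep whichever value yields the best label accuracy.)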
    // This trainer can use multiple CPU cores to speed up the training.
    // So set this to the number of available CPU cores.
    trainer.set_num_threads(4);


    // Learn to do sequence labeling from the dataset
    sequence_labeler<feature_extractor> labeler = trainer.train(samples, labels);

    // Test the learned labeler on one of the training samples.  In this
    // case it will give the correct sequence of labels.
    std::vector<unsigned long> predicted_labels = labeler(samples[0]);
    cout << "true hidden states:      "<< trans(mat(labels[0]));
    cout << "predicted hidden states: "<< trans(mat(predicted_labels));



    // We can also do cross-validation.  The confusion_matrix is defined as:
    //  - confusion_matrix(T,P) == the number of times a sequence element with label T
    //    was predicted to have a label of P.
    // So if all predictions are perfect then only diagonal elements of this matrix will
    // be non-zero.
    matrix<double> confusion_matrix;
    confusion_matrix = cross_validate_sequence_labeler(trainer, samples, labels, 4);
    cout << "\ncross-validation: " << endl;
    cout << confusion_matrix;
    cout << "label accuracy: "<< sum(diag(confusion_matrix))/sum(confusion_matrix) << endl;

    // In this case, the label accuracy is about 88%.  At this point, we want to know if
    // the machine learning method was able to recover the HMM model from the data.  So
    // to test this, we can load the true HMM model into another sequence_labeler and
    // test it out on the data and compare the results.

    matrix<double,0,1> true_hmm_model_weights = log(join_cols(reshape_to_column_vector(transition_probabilities),
                                                              reshape_to_column_vector(emission_probabilities)));
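    // Why the log?  With the PSI() defined above, dot(true_hmm_model_weights, PSI(x,y))
    // sums one log transition probability per label transition plus one log emission
    // probability per position, i.e. it equals the log joint probability of (x,y)
    // under the HMM (ignoring the distribution over the first hidden state, which
    // our PSI() does not model).  So maximizing this dot product performs the usual
    // Viterbi/MAP decoding.  Note also that the 0 entries in emission_probabilities
    // become -infinity weights, which simply forbids those pairings during decoding.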
    // With this model, labeler_true will predict the most probable set of labels
    // given an input sequence.  That is, it will predict using the equation:
    //    y == argmax_y dot(true_hmm_model_weights, PSI(x,y))
    sequence_labeler<feature_extractor> labeler_true(true_hmm_model_weights);

    confusion_matrix = test_sequence_labeler(labeler_true, samples, labels);
    cout << "\nTrue HMM model: " << endl;
    cout << confusion_matrix;
    cout << "label accuracy: "<< sum(diag(confusion_matrix))/sum(confusion_matrix) << endl;

    // Happily, we observe that the true model also obtains a label accuracy of 88%.



    // Finally, the labeler can be serialized to disk just like most dlib objects.
    serialize("labeler.dat") << labeler;

    // recall from disk
    deserialize("labeler.dat") >> labeler;
}

// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
//              Code for creating a bunch of random samples from our HMM.
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------

void sample_hmm (
    dlib::rand& rnd,
    const matrix<double>& transition_probabilities,
    const matrix<double>& emission_probabilities,
    unsigned long previous_label,
    unsigned long& next_label,
    unsigned long& next_sample
)
/*!
    requires
        - previous_label < transition_probabilities.nr()
        - transition_probabilities.nr() == transition_probabilities.nc()
        - transition_probabilities.nr() == emission_probabilities.nr()
        - The rows of transition_probabilities and emission_probabilities must sum to 1.
          (i.e. sum_cols(transition_probabilities) and sum_cols(emission_probabilities)
          must evaluate to vectors of all 1s.)
    ensures
        - This function randomly samples the HMM defined by transition_probabilities
          and emission_probabilities assuming that the previous hidden state
          was previous_label.
        - The HMM is defined by:
            - P(next_label |previous_label) == transition_probabilities(previous_label, next_label)
            - P(next_sample|next_label)     == emission_probabilities  (next_label,     next_sample)
        - #next_label == the sampled value of the hidden state
        - #next_sample == the sampled value of the observed state
!*/
{
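    // Both loops below sample by inversion: draw p uniformly from [0,1), then
    // walk along one row of the probability table subtracting each entry from
    // p.  The column that drives p below zero is the sampled index.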
    // sample next_label
    double p = rnd.get_random_double();
    for (long c = 0; p >= 0 && c < transition_probabilities.nc(); ++c)
    {
        next_label = c;
        p -= transition_probabilities(previous_label, c);
    }

    // now sample next_sample
    p = rnd.get_random_double();
    for (long c = 0; p >= 0 && c < emission_probabilities.nc(); ++c)
    {
        next_sample = c;
        p -= emission_probabilities(next_label, c);
    }
}

// ----------------------------------------------------------------------------------------

void make_dataset (
    const matrix<double>& transition_probabilities,
    const matrix<double>& emission_probabilities,
    std::vector<std::vector<unsigned long> >& samples,
    std::vector<std::vector<unsigned long> >& labels,
    unsigned long dataset_size
)
{
    samples.clear();
    labels.clear();

    dlib::rand rnd;

    // now randomly sample some labeled sequences from our Hidden Markov Model
    for (unsigned long iter = 0; iter < dataset_size; ++iter)
    {
        // each sequence gets a random length between 3 and 22 elements
        const unsigned long sequence_size = rnd.get_random_32bit_number()%20+3;
        std::vector<unsigned long> sample(sequence_size);
        std::vector<unsigned long> label(sequence_size);

        unsigned long previous_label = rnd.get_random_32bit_number()%num_label_states;
        for (unsigned long i = 0; i < sample.size(); ++i)
        {
            unsigned long next_label = 0, next_sample = 0;
            sample_hmm(rnd, transition_probabilities, emission_probabilities,
                       previous_label, next_label, next_sample);

            label[i] = next_label;
            sample[i] = next_sample;

            previous_label = next_label;
        }

        samples.push_back(sample);
        labels.push_back(label);
    }
}

// ----------------------------------------------------------------------------------------