1 // Copyright (C) 2008  Davis E. King (davis@dlib.net), Steve Taylor
2 // License: Boost Software License   See LICENSE.txt for the full license.
3 #ifndef DLIB_STATISTICs_
4 #define DLIB_STATISTICs_
5 
6 #include "statistics_abstract.h"
7 #include <limits>
8 #include <cmath>
9 #include "../algs.h"
10 #include "../matrix.h"
11 #include "../sparse_vector.h"
12 
13 namespace dlib
14 {
15 
16 // ----------------------------------------------------------------------------------------
17 
18     template <
19         typename T
20         >
21     class running_stats
22     {
23     public:
24 
running_stats()25         running_stats()
26         {
27             clear();
28 
29             COMPILE_TIME_ASSERT ((
30                     is_same_type<float,T>::value ||
31                     is_same_type<double,T>::value ||
32                     is_same_type<long double,T>::value
33             ));
34         }
35 
clear()36         void clear()
37         {
38             sum = 0;
39             sum_sqr  = 0;
40             sum_cub  = 0;
41             sum_four = 0;
42 
43             n = 0;
44             min_value = std::numeric_limits<T>::infinity();
45             max_value = -std::numeric_limits<T>::infinity();
46         }
47 
add(const T & val)48         void add (
49             const T& val
50         )
51         {
52             sum      += val;
53             sum_sqr  += val*val;
54             sum_cub  += cubed(val);
55             sum_four += quaded(val);
56 
57             if (val < min_value)
58                 min_value = val;
59             if (val > max_value)
60                 max_value = val;
61 
62             ++n;
63         }
64 
current_n()65         T current_n (
66         ) const
67         {
68             return n;
69         }
70 
mean()71         T mean (
72         ) const
73         {
74             if (n != 0)
75                 return sum/n;
76             else
77                 return 0;
78         }
79 
max()80         T max (
81         ) const
82         {
83             // make sure requires clause is not broken
84             DLIB_ASSERT(current_n() > 0,
85                 "\tT running_stats::max"
86                 << "\n\tyou have to add some numbers to this object first"
87                 << "\n\tthis: " << this
88                 );
89 
90             return max_value;
91         }
92 
min()93         T min (
94         ) const
95         {
96             // make sure requires clause is not broken
97             DLIB_ASSERT(current_n() > 0,
98                 "\tT running_stats::min"
99                 << "\n\tyou have to add some numbers to this object first"
100                 << "\n\tthis: " << this
101                 );
102 
103             return min_value;
104         }
105 
variance()106         T variance (
107         ) const
108         {
109             // make sure requires clause is not broken
110             DLIB_ASSERT(current_n() > 1,
111                 "\tT running_stats::variance"
112                 << "\n\tyou have to add some numbers to this object first"
113                 << "\n\tthis: " << this
114                 );
115 
116             T temp = 1/(n-1);
117             temp = temp*(sum_sqr - sum*sum/n);
118             // make sure the variance is never negative.  This might
119             // happen due to numerical errors.
120             if (temp >= 0)
121                 return temp;
122             else
123                 return 0;
124         }
125 
stddev()126         T stddev (
127         ) const
128         {
129             // make sure requires clause is not broken
130             DLIB_ASSERT(current_n() > 1,
131                 "\tT running_stats::stddev"
132                 << "\n\tyou have to add some numbers to this object first"
133                 << "\n\tthis: " << this
134                 );
135 
136             return std::sqrt(variance());
137         }
138 
skewness()139         T skewness (
140         ) const
141         {
142             // make sure requires clause is not broken
143             DLIB_ASSERT(current_n() > 2,
144                 "\tT running_stats::skewness"
145                 << "\n\tyou have to add some numbers to this object first"
146                 << "\n\tthis: " << this
147             );
148 
149             T temp  = 1/n;
150             T temp1 = std::sqrt(n*(n-1))/(n-2);
151             temp    = temp1*temp*(sum_cub - 3*sum_sqr*sum*temp + 2*cubed(sum)*temp*temp)/
152                       (std::sqrt(std::pow(temp*(sum_sqr-sum*sum*temp),3)));
153 
154             return temp;
155         }
156 
ex_kurtosis()157         T ex_kurtosis (
158         ) const
159         {
160             // make sure requires clause is not broken
161             DLIB_ASSERT(current_n() > 3,
162                 "\tT running_stats::kurtosis"
163                 << "\n\tyou have to add some numbers to this object first"
164                 << "\n\tthis: " << this
165             );
166 
167             T temp = 1/n;
168             T m4   = temp*(sum_four - 4*sum_cub*sum*temp+6*sum_sqr*sum*sum*temp*temp
169                      -3*quaded(sum)*cubed(temp));
170             T m2   = temp*(sum_sqr-sum*sum*temp);
171             temp   = (n-1)*((n+1)*m4/(m2*m2)-3*(n-1))/((n-2)*(n-3));
172 
173             return temp;
174         }
175 
scale(const T & val)176         T scale (
177             const T& val
178         ) const
179         {
180             // make sure requires clause is not broken
181             DLIB_ASSERT(current_n() > 1,
182                 "\tT running_stats::variance"
183                 << "\n\tyou have to add some numbers to this object first"
184                 << "\n\tthis: " << this
185                 );
186             return (val-mean())/std::sqrt(variance());
187         }
188 
189         running_stats operator+ (
190             const running_stats& rhs
191         ) const
192         {
193             running_stats temp(*this);
194 
195             temp.sum += rhs.sum;
196             temp.sum_sqr += rhs.sum_sqr;
197             temp.sum_cub += rhs.sum_cub;
198             temp.sum_four += rhs.sum_four;
199             temp.n += rhs.n;
200             temp.min_value = std::min(rhs.min_value, min_value);
201             temp.max_value = std::max(rhs.max_value, max_value);
202             return temp;
203         }
204 
205         template <typename U>
206         friend void serialize (
207             const running_stats<U>& item,
208             std::ostream& out
209         );
210 
211         template <typename U>
212         friend void deserialize (
213             running_stats<U>& item,
214             std::istream& in
215         );
216 
217     private:
218         T sum;
219         T sum_sqr;
220         T sum_cub;
221         T sum_four;
222         T n;
223         T min_value;
224         T max_value;
225 
cubed(const T & val)226         T cubed  (const T& val) const {return val*val*val; }
quaded(const T & val)227         T quaded (const T& val) const {return val*val*val*val; }
228     };
229 
230     template <typename T>
serialize(const running_stats<T> & item,std::ostream & out)231     void serialize (
232         const running_stats<T>& item,
233         std::ostream& out
234     )
235     {
236         int version = 2;
237         serialize(version, out);
238 
239         serialize(item.sum, out);
240         serialize(item.sum_sqr, out);
241         serialize(item.sum_cub, out);
242         serialize(item.sum_four, out);
243         serialize(item.n, out);
244         serialize(item.min_value, out);
245         serialize(item.max_value, out);
246     }
247 
248     template <typename T>
deserialize(running_stats<T> & item,std::istream & in)249     void deserialize (
250         running_stats<T>& item,
251         std::istream& in
252     )
253     {
254         int version = 0;
255         deserialize(version, in);
256         if (version != 2)
257             throw dlib::serialization_error("Unexpected version number found while deserializing dlib::running_stats object.");
258 
259         deserialize(item.sum, in);
260         deserialize(item.sum_sqr, in);
261         deserialize(item.sum_cub, in);
262         deserialize(item.sum_four, in);
263         deserialize(item.n, in);
264         deserialize(item.min_value, in);
265         deserialize(item.max_value, in);
266     }
267 
268 // ----------------------------------------------------------------------------------------
269 
270     template <
271         typename T
272         >
273     class running_scalar_covariance
274     {
275     public:
276 
running_scalar_covariance()277         running_scalar_covariance()
278         {
279             clear();
280 
281             COMPILE_TIME_ASSERT ((
282                     is_same_type<float,T>::value ||
283                     is_same_type<double,T>::value ||
284                     is_same_type<long double,T>::value
285             ));
286         }
287 
clear()288         void clear()
289         {
290             sum_xy = 0;
291             sum_x = 0;
292             sum_y = 0;
293             sum_xx = 0;
294             sum_yy = 0;
295             n = 0;
296         }
297 
add(const T & x,const T & y)298         void add (
299             const T& x,
300             const T& y
301         )
302         {
303             sum_xy += x*y;
304 
305             sum_xx += x*x;
306             sum_yy += y*y;
307 
308             sum_x  += x;
309             sum_y  += y;
310 
311             n += 1;
312         }
313 
current_n()314         T current_n (
315         ) const
316         {
317             return n;
318         }
319 
mean_x()320         T mean_x (
321         ) const
322         {
323             if (n != 0)
324                 return sum_x/n;
325             else
326                 return 0;
327         }
328 
mean_y()329         T mean_y (
330         ) const
331         {
332             if (n != 0)
333                 return sum_y/n;
334             else
335                 return 0;
336         }
337 
covariance()338         T covariance (
339         ) const
340         {
341             // make sure requires clause is not broken
342             DLIB_ASSERT(current_n() > 1,
343                 "\tT running_scalar_covariance::covariance()"
344                 << "\n\tyou have to add some numbers to this object first"
345                 << "\n\tthis: " << this
346                 );
347 
348             return 1/(n-1) * (sum_xy - sum_y*sum_x/n);
349         }
350 
correlation()351         T correlation (
352         ) const
353         {
354             // make sure requires clause is not broken
355             DLIB_ASSERT(current_n() > 1,
356                 "\tT running_scalar_covariance::correlation()"
357                 << "\n\tyou have to add some numbers to this object first"
358                 << "\n\tthis: " << this
359                 );
360 
361             return covariance() / std::sqrt(variance_x()*variance_y());
362         }
363 
variance_x()364         T variance_x (
365         ) const
366         {
367             // make sure requires clause is not broken
368             DLIB_ASSERT(current_n() > 1,
369                 "\tT running_scalar_covariance::variance_x()"
370                 << "\n\tyou have to add some numbers to this object first"
371                 << "\n\tthis: " << this
372                 );
373 
374             T temp = 1/(n-1) * (sum_xx - sum_x*sum_x/n);
375             // make sure the variance is never negative.  This might
376             // happen due to numerical errors.
377             if (temp >= 0)
378                 return temp;
379             else
380                 return 0;
381         }
382 
variance_y()383         T variance_y (
384         ) const
385         {
386             // make sure requires clause is not broken
387             DLIB_ASSERT(current_n() > 1,
388                 "\tT running_scalar_covariance::variance_y()"
389                 << "\n\tyou have to add some numbers to this object first"
390                 << "\n\tthis: " << this
391                 );
392 
393             T temp = 1/(n-1) * (sum_yy - sum_y*sum_y/n);
394             // make sure the variance is never negative.  This might
395             // happen due to numerical errors.
396             if (temp >= 0)
397                 return temp;
398             else
399                 return 0;
400         }
401 
stddev_x()402         T stddev_x (
403         ) const
404         {
405             // make sure requires clause is not broken
406             DLIB_ASSERT(current_n() > 1,
407                 "\tT running_scalar_covariance::stddev_x()"
408                 << "\n\tyou have to add some numbers to this object first"
409                 << "\n\tthis: " << this
410                 );
411 
412             return std::sqrt(variance_x());
413         }
414 
stddev_y()415         T stddev_y (
416         ) const
417         {
418             // make sure requires clause is not broken
419             DLIB_ASSERT(current_n() > 1,
420                 "\tT running_scalar_covariance::stddev_y()"
421                 << "\n\tyou have to add some numbers to this object first"
422                 << "\n\tthis: " << this
423                 );
424 
425             return std::sqrt(variance_y());
426         }
427 
428         running_scalar_covariance operator+ (
429             const running_scalar_covariance& rhs
430         ) const
431         {
432             running_scalar_covariance temp(rhs);
433 
434             temp.sum_xy += sum_xy;
435             temp.sum_x  += sum_x;
436             temp.sum_y  += sum_y;
437             temp.sum_xx += sum_xx;
438             temp.sum_yy += sum_yy;
439             temp.n      += n;
440             return temp;
441         }
442 
443     private:
444 
445         T sum_xy;
446         T sum_x;
447         T sum_y;
448         T sum_xx;
449         T sum_yy;
450         T n;
451     };
452 
453 // ----------------------------------------------------------------------------------------
454 
455     template <
456         typename T
457         >
458     class running_scalar_covariance_decayed
459     {
460     public:
461 
462         explicit running_scalar_covariance_decayed(
463             T decay_halflife = 1000
464         )
465         {
466             DLIB_ASSERT(decay_halflife > 0);
467 
468             sum_xy = 0;
469             sum_x = 0;
470             sum_y = 0;
471             sum_xx = 0;
472             sum_yy = 0;
473             forget = std::pow(0.5, 1/decay_halflife);
474             n = 0;
475 
476             COMPILE_TIME_ASSERT ((
477                     is_same_type<float,T>::value ||
478                     is_same_type<double,T>::value ||
479                     is_same_type<long double,T>::value
480             ));
481         }
482 
forget_factor()483         T forget_factor (
484         ) const
485         {
486             return forget;
487         }
488 
add(const T & x,const T & y)489         void add (
490             const T& x,
491             const T& y
492         )
493         {
494             sum_xy = sum_xy*forget + x*y;
495 
496             sum_xx = sum_xx*forget + x*x;
497             sum_yy = sum_yy*forget + y*y;
498 
499             sum_x  = sum_x*forget + x;
500             sum_y  = sum_y*forget + y;
501 
502             n = n*forget + 1;
503         }
504 
current_n()505         T current_n (
506         ) const
507         {
508             return n;
509         }
510 
mean_x()511         T mean_x (
512         ) const
513         {
514             if (n != 0)
515                 return sum_x/n;
516             else
517                 return 0;
518         }
519 
mean_y()520         T mean_y (
521         ) const
522         {
523             if (n != 0)
524                 return sum_y/n;
525             else
526                 return 0;
527         }
528 
covariance()529         T covariance (
530         ) const
531         {
532             // make sure requires clause is not broken
533             DLIB_ASSERT(current_n() > 1,
534                 "\tT running_scalar_covariance_decayed::covariance()"
535                 << "\n\tyou have to add some numbers to this object first"
536                 << "\n\tthis: " << this
537                 );
538 
539             return 1/(n-1) * (sum_xy - sum_y*sum_x/n);
540         }
541 
correlation()542         T correlation (
543         ) const
544         {
545             // make sure requires clause is not broken
546             DLIB_ASSERT(current_n() > 1,
547                 "\tT running_scalar_covariance_decayed::correlation()"
548                 << "\n\tyou have to add some numbers to this object first"
549                 << "\n\tthis: " << this
550                 );
551 
552             T temp = std::sqrt(variance_x()*variance_y());
553             if (temp != 0)
554                 return covariance() / temp;
555             else
556                 return 0; // just say it's zero if there isn't any variance in x or y.
557         }
558 
variance_x()559         T variance_x (
560         ) const
561         {
562             // make sure requires clause is not broken
563             DLIB_ASSERT(current_n() > 1,
564                 "\tT running_scalar_covariance_decayed::variance_x()"
565                 << "\n\tyou have to add some numbers to this object first"
566                 << "\n\tthis: " << this
567                 );
568 
569             T temp = 1/(n-1) * (sum_xx - sum_x*sum_x/n);
570             // make sure the variance is never negative.  This might
571             // happen due to numerical errors.
572             if (temp >= 0)
573                 return temp;
574             else
575                 return 0;
576         }
577 
variance_y()578         T variance_y (
579         ) const
580         {
581             // make sure requires clause is not broken
582             DLIB_ASSERT(current_n() > 1,
583                 "\tT running_scalar_covariance_decayed::variance_y()"
584                 << "\n\tyou have to add some numbers to this object first"
585                 << "\n\tthis: " << this
586                 );
587 
588             T temp = 1/(n-1) * (sum_yy - sum_y*sum_y/n);
589             // make sure the variance is never negative.  This might
590             // happen due to numerical errors.
591             if (temp >= 0)
592                 return temp;
593             else
594                 return 0;
595         }
596 
stddev_x()597         T stddev_x (
598         ) const
599         {
600             // make sure requires clause is not broken
601             DLIB_ASSERT(current_n() > 1,
602                 "\tT running_scalar_covariance_decayed::stddev_x()"
603                 << "\n\tyou have to add some numbers to this object first"
604                 << "\n\tthis: " << this
605                 );
606 
607             return std::sqrt(variance_x());
608         }
609 
stddev_y()610         T stddev_y (
611         ) const
612         {
613             // make sure requires clause is not broken
614             DLIB_ASSERT(current_n() > 1,
615                 "\tT running_scalar_covariance_decayed::stddev_y()"
616                 << "\n\tyou have to add some numbers to this object first"
617                 << "\n\tthis: " << this
618                 );
619 
620             return std::sqrt(variance_y());
621         }
622 
623     private:
624 
625         T sum_xy;
626         T sum_x;
627         T sum_y;
628         T sum_xx;
629         T sum_yy;
630         T n;
631         T forget;
632     };
633 
634 // ----------------------------------------------------------------------------------------
635 
636     template <
637         typename T
638         >
639     class running_stats_decayed
640     {
641     public:
642 
643         explicit running_stats_decayed(
644             T decay_halflife = 1000
645         )
646         {
647             DLIB_ASSERT(decay_halflife > 0);
648 
649             sum_x = 0;
650             sum_xx = 0;
651             forget = std::pow(0.5, 1/decay_halflife);
652             n = 0;
653 
654             COMPILE_TIME_ASSERT ((
655                     is_same_type<float,T>::value ||
656                     is_same_type<double,T>::value ||
657                     is_same_type<long double,T>::value
658             ));
659         }
660 
forget_factor()661         T forget_factor (
662         ) const
663         {
664             return forget;
665         }
666 
add(const T & x)667         void add (
668             const T& x
669         )
670         {
671 
672             sum_xx = sum_xx*forget + x*x;
673 
674             sum_x  = sum_x*forget + x;
675 
676             n = n*forget + 1;
677         }
678 
current_n()679         T current_n (
680         ) const
681         {
682             return n;
683         }
684 
mean()685         T mean (
686         ) const
687         {
688             if (n != 0)
689                 return sum_x/n;
690             else
691                 return 0;
692         }
693 
variance()694         T variance (
695         ) const
696         {
697             // make sure requires clause is not broken
698             DLIB_ASSERT(current_n() > 1,
699                 "\tT running_stats_decayed::variance()"
700                 << "\n\tyou have to add some numbers to this object first"
701                 << "\n\tthis: " << this
702                 );
703 
704             T temp = 1/(n-1) * (sum_xx - sum_x*sum_x/n);
705             // make sure the variance is never negative.  This might
706             // happen due to numerical errors.
707             if (temp >= 0)
708                 return temp;
709             else
710                 return 0;
711         }
712 
stddev()713         T stddev (
714         ) const
715         {
716             // make sure requires clause is not broken
717             DLIB_ASSERT(current_n() > 1,
718                 "\tT running_stats_decayed::stddev()"
719                 << "\n\tyou have to add some numbers to this object first"
720                 << "\n\tthis: " << this
721                 );
722 
723             return std::sqrt(variance());
724         }
725 
726         template <typename U>
727         friend void serialize (
728             const running_stats_decayed<U>& item,
729             std::ostream& out
730         );
731 
732         template <typename U>
733         friend void deserialize (
734             running_stats_decayed<U>& item,
735             std::istream& in
736         );
737 
738     private:
739 
740         T sum_x;
741         T sum_xx;
742         T n;
743         T forget;
744     };
745 
746     template <typename T>
serialize(const running_stats_decayed<T> & item,std::ostream & out)747     void serialize (
748         const running_stats_decayed<T>& item,
749         std::ostream& out
750     )
751     {
752         int version = 1;
753         serialize(version, out);
754 
755         serialize(item.sum_x, out);
756         serialize(item.sum_xx, out);
757         serialize(item.n, out);
758         serialize(item.forget, out);
759     }
760 
761     template <typename T>
deserialize(running_stats_decayed<T> & item,std::istream & in)762     void deserialize (
763         running_stats_decayed<T>& item,
764         std::istream& in
765     )
766     {
767         int version = 0;
768         deserialize(version, in);
769         if (version != 1)
770             throw dlib::serialization_error("Unexpected version number found while deserializing dlib::running_stats_decayed object.");
771 
772         deserialize(item.sum_x, in);
773         deserialize(item.sum_xx, in);
774         deserialize(item.n, in);
775         deserialize(item.forget, in);
776     }
777 
778 // ----------------------------------------------------------------------------------------
779 
780     template <
781         typename T,
782         typename alloc
783         >
mean_sign_agreement(const std::vector<T,alloc> & a,const std::vector<T,alloc> & b)784     double mean_sign_agreement (
785         const std::vector<T,alloc>& a,
786         const std::vector<T,alloc>& b
787     )
788     {
789         // make sure requires clause is not broken
790         DLIB_ASSERT(a.size() == b.size(),
791                     "\t double mean_sign_agreement(a,b)"
792                     << "\n\t a and b must be the same length."
793                     << "\n\t a.size(): " << a.size()
794                     << "\n\t b.size(): " << b.size()
795         );
796 
797 
798         double temp = 0;
799         for (unsigned long i = 0; i < a.size(); ++i)
800         {
801             if ((a[i] >= 0 && b[i] >= 0) ||
802                 (a[i] < 0  && b[i] <  0))
803             {
804                 temp += 1;
805             }
806         }
807 
808         return temp/a.size();
809     }
810 
811 // ----------------------------------------------------------------------------------------
812 
813     template <
814         typename T,
815         typename alloc
816         >
correlation(const std::vector<T,alloc> & a,const std::vector<T,alloc> & b)817     double correlation (
818         const std::vector<T,alloc>& a,
819         const std::vector<T,alloc>& b
820     )
821     {
822         // make sure requires clause is not broken
823         DLIB_ASSERT(a.size() == b.size() && a.size() > 1,
824                     "\t double correlation(a,b)"
825                     << "\n\t a and b must be the same length and have more than one element."
826                     << "\n\t a.size(): " << a.size()
827                     << "\n\t b.size(): " << b.size()
828         );
829 
830         running_scalar_covariance<double> rs;
831         for (unsigned long i = 0; i < a.size(); ++i)
832         {
833             rs.add(a[i], b[i]);
834         }
835         return rs.correlation();
836     }
837 
838 // ----------------------------------------------------------------------------------------
839 
840     template <
841         typename T,
842         typename alloc
843         >
covariance(const std::vector<T,alloc> & a,const std::vector<T,alloc> & b)844     double covariance (
845         const std::vector<T,alloc>& a,
846         const std::vector<T,alloc>& b
847     )
848     {
849         // make sure requires clause is not broken
850         DLIB_ASSERT(a.size() == b.size() && a.size() > 1,
851                     "\t double covariance(a,b)"
852                     << "\n\t a and b must be the same length and have more than one element."
853                     << "\n\t a.size(): " << a.size()
854                     << "\n\t b.size(): " << b.size()
855         );
856 
857         running_scalar_covariance<double> rs;
858         for (unsigned long i = 0; i < a.size(); ++i)
859         {
860             rs.add(a[i], b[i]);
861         }
862         return rs.covariance();
863     }
864 
865 // ----------------------------------------------------------------------------------------
866 
867     template <
868         typename T,
869         typename alloc
870         >
r_squared(const std::vector<T,alloc> & a,const std::vector<T,alloc> & b)871     double r_squared (
872         const std::vector<T,alloc>& a,
873         const std::vector<T,alloc>& b
874     )
875     {
876         // make sure requires clause is not broken
877         DLIB_ASSERT(a.size() == b.size() && a.size() > 1,
878                     "\t double r_squared(a,b)"
879                     << "\n\t a and b must be the same length and have more than one element."
880                     << "\n\t a.size(): " << a.size()
881                     << "\n\t b.size(): " << b.size()
882         );
883 
884         return std::pow(correlation(a,b),2.0);
885     }
886 
887 // ----------------------------------------------------------------------------------------
888 
889     template <
890         typename T,
891         typename alloc
892         >
mean_squared_error(const std::vector<T,alloc> & a,const std::vector<T,alloc> & b)893     double mean_squared_error (
894         const std::vector<T,alloc>& a,
895         const std::vector<T,alloc>& b
896     )
897     {
898         // make sure requires clause is not broken
899         DLIB_ASSERT(a.size() == b.size(),
900                     "\t double mean_squared_error(a,b)"
901                     << "\n\t a and b must be the same length."
902                     << "\n\t a.size(): " << a.size()
903                     << "\n\t b.size(): " << b.size()
904         );
905 
906         return mean(squared(matrix_cast<double>(mat(a))-matrix_cast<double>(mat(b))));
907     }
908 
909 // ----------------------------------------------------------------------------------------
910 
911     template <
912         typename matrix_type
913         >
914     class running_covariance
915     {
916         /*!
917             INITIAL VALUE
918                 - vect_size == 0
919                 - total_count == 0
920 
921             CONVENTION
922                 - vect_size == in_vector_size()
923                 - total_count == current_n()
924 
925                 - if (total_count != 0)
926                     - total_sum == the sum of all vectors given to add()
927                     - the covariance of all the elements given to add() is given
928                       by:
929                         - let avg == total_sum/total_count
930                         - covariance == total_cov/total_count - avg*trans(avg)
931         !*/
932     public:
933 
934         typedef typename matrix_type::mem_manager_type mem_manager_type;
935         typedef typename matrix_type::type scalar_type;
936         typedef typename matrix_type::layout_type layout_type;
937         typedef matrix<scalar_type,0,0,mem_manager_type,layout_type> general_matrix;
938         typedef matrix<scalar_type,0,1,mem_manager_type,layout_type> column_matrix;
939 
running_covariance()940         running_covariance(
941         )
942         {
943             clear();
944         }
945 
clear()946         void clear(
947         )
948         {
949             total_count = 0;
950 
951             vect_size = 0;
952 
953             total_sum.set_size(0);
954             total_cov.set_size(0,0);
955         }
956 
in_vector_size()957         long in_vector_size (
958         ) const
959         {
960             return vect_size;
961         }
962 
current_n()963         long current_n (
964         ) const
965         {
966             return static_cast<long>(total_count);
967         }
968 
set_dimension(long size)969         void set_dimension (
970             long size
971         )
972         {
973             // make sure requires clause is not broken
974             DLIB_ASSERT( size > 0,
975                 "\t void running_covariance::set_dimension()"
976                 << "\n\t Invalid inputs were given to this function"
977                 << "\n\t size: " << size
978                 << "\n\t this: " << this
979                 );
980 
981             clear();
982             vect_size = size;
983             total_sum.set_size(size);
984             total_cov.set_size(size,size);
985             total_sum = 0;
986             total_cov = 0;
987         }
988 
989         template <typename T>
add(const T & val)990         typename disable_if<is_matrix<T> >::type add (
991             const T& val
992         )
993         {
994             // make sure requires clause is not broken
995             DLIB_ASSERT(((long)max_index_plus_one(val) <= in_vector_size() && in_vector_size() > 0),
996                 "\t void running_covariance::add()"
997                 << "\n\t Invalid inputs were given to this function"
998                 << "\n\t max_index_plus_one(val): " << max_index_plus_one(val)
999                 << "\n\t in_vector_size():        " << in_vector_size()
1000                 << "\n\t this:                    " << this
1001                 );
1002 
1003             for (typename T::const_iterator i = val.begin(); i != val.end(); ++i)
1004             {
1005                 total_sum(i->first) += i->second;
1006                 for (typename T::const_iterator j = val.begin(); j != val.end(); ++j)
1007                 {
1008                     total_cov(i->first, j->first) += i->second*j->second;
1009                 }
1010             }
1011 
1012             ++total_count;
1013         }
1014 
1015         template <typename T>
add(const T & val)1016         typename enable_if<is_matrix<T> >::type add (
1017             const T& val
1018         )
1019         {
1020             // make sure requires clause is not broken
1021             DLIB_ASSERT(is_col_vector(val) && (in_vector_size() == 0 || val.size() == in_vector_size()),
1022                 "\t void running_covariance::add()"
1023                 << "\n\t Invalid inputs were given to this function"
1024                 << "\n\t is_col_vector(val): " << is_col_vector(val)
1025                 << "\n\t in_vector_size():   " << in_vector_size()
1026                 << "\n\t val.size():         " << val.size()
1027                 << "\n\t this:               " << this
1028                 );
1029 
1030             vect_size = val.size();
1031             if (total_count == 0)
1032             {
1033                 total_cov = val*trans(val);
1034                 total_sum = val;
1035             }
1036             else
1037             {
1038                 total_cov += val*trans(val);
1039                 total_sum += val;
1040             }
1041             ++total_count;
1042         }
1043 
mean()1044         const column_matrix mean (
1045         ) const
1046         {
1047             // make sure requires clause is not broken
1048             DLIB_ASSERT( in_vector_size() != 0,
1049                 "\t running_covariance::mean()"
1050                 << "\n\t This object can not execute this function in its current state."
1051                 << "\n\t in_vector_size(): " << in_vector_size()
1052                 << "\n\t current_n():      " << current_n()
1053                 << "\n\t this:             " << this
1054                 );
1055 
1056             return total_sum/total_count;
1057         }
1058 
covariance()1059         const general_matrix covariance (
1060         ) const
1061         {
1062             // make sure requires clause is not broken
1063             DLIB_ASSERT( in_vector_size() != 0 && current_n() > 1,
1064                 "\t running_covariance::covariance()"
1065                 << "\n\t This object can not execute this function in its current state."
1066                 << "\n\t in_vector_size(): " << in_vector_size()
1067                 << "\n\t current_n():      " << current_n()
1068                 << "\n\t this:             " << this
1069                 );
1070 
1071             return (total_cov - total_sum*trans(total_sum)/total_count)/(total_count-1);
1072         }
1073 
1074         const running_covariance operator+ (
1075             const running_covariance& item
1076         ) const
1077         {
1078             // make sure requires clause is not broken
1079             DLIB_ASSERT((in_vector_size() == 0 || item.in_vector_size() == 0 || in_vector_size() == item.in_vector_size()),
1080                 "\t running_covariance running_covariance::operator+()"
1081                 << "\n\t The two running_covariance objects being added must have compatible parameters"
1082                 << "\n\t in_vector_size():            " << in_vector_size()
1083                 << "\n\t item.in_vector_size():       " << item.in_vector_size()
1084                 << "\n\t this:                        " << this
1085                 );
1086 
1087             running_covariance temp(item);
1088 
1089             // make sure we ignore empty matrices
1090             if (total_count != 0 && temp.total_count != 0)
1091             {
1092                 temp.total_cov += total_cov;
1093                 temp.total_sum += total_sum;
1094                 temp.total_count += total_count;
1095             }
1096             else if (total_count != 0)
1097             {
1098                 temp.total_cov = total_cov;
1099                 temp.total_sum = total_sum;
1100                 temp.total_count = total_count;
1101             }
1102 
1103             return temp;
1104         }
1105 
1106 
1107     private:
1108 
1109         general_matrix total_cov;
1110         column_matrix total_sum;
1111         scalar_type total_count;
1112 
1113         long vect_size;
1114     };
1115 
1116 // ----------------------------------------------------------------------------------------
1117 
1118     template <
1119         typename matrix_type
1120         >
1121     class running_cross_covariance
1122     {
1123         /*!
1124             INITIAL VALUE
1125                 - x_vect_size == 0
1126                 - y_vect_size == 0
1127                 - total_count == 0
1128 
1129             CONVENTION
1130                 - x_vect_size == x_vector_size()
1131                 - y_vect_size == y_vector_size()
1132                 - total_count == current_n()
1133 
1134                 - if (total_count != 0)
1135                     - sum_x == the sum of all x vectors given to add()
1136                     - sum_y == the sum of all y vectors given to add()
1137                     - total_cov == sum of all x*trans(y) given to add()
1138         !*/
1139 
1140     public:
1141 
1142         typedef typename matrix_type::mem_manager_type mem_manager_type;
1143         typedef typename matrix_type::type scalar_type;
1144         typedef typename matrix_type::layout_type layout_type;
1145         typedef matrix<scalar_type,0,0,mem_manager_type,layout_type> general_matrix;
1146         typedef matrix<scalar_type,0,1,mem_manager_type,layout_type> column_matrix;
1147 
running_cross_covariance()1148         running_cross_covariance(
1149         )
1150         {
1151             clear();
1152         }
1153 
clear()1154         void clear(
1155         )
1156         {
1157             total_count = 0;
1158 
1159             x_vect_size = 0;
1160             y_vect_size = 0;
1161 
1162             sum_x.set_size(0);
1163             sum_y.set_size(0);
1164             total_cov.set_size(0,0);
1165         }
1166 
x_vector_size()1167         long x_vector_size (
1168         ) const
1169         {
1170             return x_vect_size;
1171         }
1172 
y_vector_size()1173         long y_vector_size (
1174         ) const
1175         {
1176             return y_vect_size;
1177         }
1178 
set_dimensions(long x_size,long y_size)1179         void set_dimensions (
1180             long x_size,
1181             long y_size
1182         )
1183         {
1184             // make sure requires clause is not broken
1185             DLIB_ASSERT( x_size > 0 && y_size > 0,
1186                 "\t void running_cross_covariance::set_dimensions()"
1187                 << "\n\t Invalid inputs were given to this function"
1188                 << "\n\t x_size: " << x_size
1189                 << "\n\t y_size: " << y_size
1190                 << "\n\t this:   " << this
1191                 );
1192 
1193             clear();
1194             x_vect_size = x_size;
1195             y_vect_size = y_size;
1196             sum_x.set_size(x_size);
1197             sum_y.set_size(y_size);
1198             total_cov.set_size(x_size,y_size);
1199 
1200             sum_x = 0;
1201             sum_y = 0;
1202             total_cov = 0;
1203         }
1204 
current_n()1205         long current_n (
1206         ) const
1207         {
1208             return static_cast<long>(total_count);
1209         }
1210 
1211         template <typename T, typename U>
add(const T & x,const U & y)1212         typename enable_if_c<!is_matrix<T>::value && !is_matrix<U>::value>::type add (
1213             const T& x,
1214             const U& y
1215         )
1216         {
1217             // make sure requires clause is not broken
1218             DLIB_ASSERT( ((long)max_index_plus_one(x) <= x_vector_size() && x_vector_size() > 0) &&
1219                          ((long)max_index_plus_one(y) <= y_vector_size() && y_vector_size() > 0) ,
1220                 "\t void running_cross_covariance::add()"
1221                 << "\n\t Invalid inputs were given to this function"
1222                 << "\n\t max_index_plus_one(x): " << max_index_plus_one(x)
1223                 << "\n\t max_index_plus_one(y): " << max_index_plus_one(y)
1224                 << "\n\t x_vector_size():       " << x_vector_size()
1225                 << "\n\t y_vector_size():       " << y_vector_size()
1226                 << "\n\t this:                  " << this
1227                 );
1228 
1229             for (typename T::const_iterator i = x.begin(); i != x.end(); ++i)
1230             {
1231                 sum_x(i->first) += i->second;
1232                 for (typename U::const_iterator j = y.begin(); j != y.end(); ++j)
1233                 {
1234                     total_cov(i->first, j->first) += i->second*j->second;
1235                 }
1236             }
1237 
1238             // do sum_y += y
1239             for (typename U::const_iterator j = y.begin(); j != y.end(); ++j)
1240             {
1241                 sum_y(j->first) += j->second;
1242             }
1243 
1244             ++total_count;
1245         }
1246 
1247         template <typename T, typename U>
add(const T & x,const U & y)1248         typename enable_if_c<is_matrix<T>::value && !is_matrix<U>::value>::type add (
1249             const T& x,
1250             const U& y
1251         )
1252         {
1253             // make sure requires clause is not broken
1254             DLIB_ASSERT( (is_col_vector(x) && x.size() == x_vector_size() && x_vector_size() > 0) &&
1255                          ((long)max_index_plus_one(y) <= y_vector_size() && y_vector_size() > 0) ,
1256                 "\t void running_cross_covariance::add()"
1257                 << "\n\t Invalid inputs were given to this function"
1258                 << "\n\t is_col_vector(x):      " << is_col_vector(x)
1259                 << "\n\t x.size():              " << x.size()
1260                 << "\n\t max_index_plus_one(y): " << max_index_plus_one(y)
1261                 << "\n\t x_vector_size():       " << x_vector_size()
1262                 << "\n\t y_vector_size():       " << y_vector_size()
1263                 << "\n\t this:                  " << this
1264                 );
1265 
1266             sum_x += x;
1267 
1268             for (long i = 0; i < x.size(); ++i)
1269             {
1270                 for (typename U::const_iterator j = y.begin(); j != y.end(); ++j)
1271                 {
1272                     total_cov(i, j->first) += x(i)*j->second;
1273                 }
1274             }
1275 
1276             // do sum_y += y
1277             for (typename U::const_iterator j = y.begin(); j != y.end(); ++j)
1278             {
1279                 sum_y(j->first) += j->second;
1280             }
1281 
1282             ++total_count;
1283         }
1284 
1285         template <typename T, typename U>
add(const T & x,const U & y)1286         typename enable_if_c<!is_matrix<T>::value && is_matrix<U>::value>::type add (
1287             const T& x,
1288             const U& y
1289         )
1290         {
1291             // make sure requires clause is not broken
1292             DLIB_ASSERT( ((long)max_index_plus_one(x) <= x_vector_size() && x_vector_size() > 0) &&
1293                          (is_col_vector(y) && y.size() == (long)y_vector_size() && y_vector_size() > 0) ,
1294                 "\t void running_cross_covariance::add()"
1295                 << "\n\t Invalid inputs were given to this function"
1296                 << "\n\t max_index_plus_one(x): " << max_index_plus_one(x)
1297                 << "\n\t is_col_vector(y):      " << is_col_vector(y)
1298                 << "\n\t y.size():              " << y.size()
1299                 << "\n\t x_vector_size():       " << x_vector_size()
1300                 << "\n\t y_vector_size():       " << y_vector_size()
1301                 << "\n\t this:                  " << this
1302                 );
1303 
1304             for (typename T::const_iterator i = x.begin(); i != x.end(); ++i)
1305             {
1306                 sum_x(i->first) += i->second;
1307                 for (long j = 0; j < y.size(); ++j)
1308                 {
1309                     total_cov(i->first, j) += i->second*y(j);
1310                 }
1311             }
1312 
1313             sum_y += y;
1314 
1315             ++total_count;
1316         }
1317 
1318         template <typename T, typename U>
add(const T & x,const U & y)1319         typename enable_if_c<is_matrix<T>::value && is_matrix<U>::value>::type add (
1320             const T& x,
1321             const U& y
1322         )
1323         {
1324             // make sure requires clause is not broken
1325             DLIB_ASSERT(is_col_vector(x) && (x_vector_size() == 0 || x.size() == x_vector_size()) &&
1326                         is_col_vector(y) && (y_vector_size() == 0 || y.size() == y_vector_size()) &&
1327                         x.size() != 0 &&
1328                         y.size() != 0,
1329                 "\t void running_cross_covariance::add()"
1330                 << "\n\t Invalid inputs were given to this function"
1331                 << "\n\t is_col_vector(x): " << is_col_vector(x)
1332                 << "\n\t x_vector_size():  " << x_vector_size()
1333                 << "\n\t x.size():         " << x.size()
1334                 << "\n\t is_col_vector(y): " << is_col_vector(y)
1335                 << "\n\t y_vector_size():  " << y_vector_size()
1336                 << "\n\t y.size():         " << y.size()
1337                 << "\n\t this:             " << this
1338                 );
1339 
1340             x_vect_size = x.size();
1341             y_vect_size = y.size();
1342             if (total_count == 0)
1343             {
1344                 total_cov = x*trans(y);
1345                 sum_x = x;
1346                 sum_y = y;
1347             }
1348             else
1349             {
1350                 total_cov += x*trans(y);
1351                 sum_x += x;
1352                 sum_y += y;
1353             }
1354             ++total_count;
1355         }
1356 
mean_x()1357         const column_matrix mean_x (
1358         ) const
1359         {
1360             // make sure requires clause is not broken
1361             DLIB_ASSERT( current_n() != 0,
1362                 "\t running_cross_covariance::mean()"
1363                 << "\n\t This object can not execute this function in its current state."
1364                 << "\n\t current_n():      " << current_n()
1365                 << "\n\t this:             " << this
1366                 );
1367 
1368             return sum_x/total_count;
1369         }
1370 
mean_y()1371         const column_matrix mean_y (
1372         ) const
1373         {
1374             // make sure requires clause is not broken
1375             DLIB_ASSERT( current_n() != 0,
1376                 "\t running_cross_covariance::mean()"
1377                 << "\n\t This object can not execute this function in its current state."
1378                 << "\n\t current_n():      " << current_n()
1379                 << "\n\t this:             " << this
1380                 );
1381 
1382             return sum_y/total_count;
1383         }
1384 
covariance_xy()1385         const general_matrix covariance_xy (
1386         ) const
1387         {
1388             // make sure requires clause is not broken
1389             DLIB_ASSERT( current_n() > 1,
1390                 "\t running_cross_covariance::covariance()"
1391                 << "\n\t This object can not execute this function in its current state."
1392                 << "\n\t x_vector_size(): " << x_vector_size()
1393                 << "\n\t y_vector_size(): " << y_vector_size()
1394                 << "\n\t current_n():     " << current_n()
1395                 << "\n\t this:            " << this
1396                 );
1397 
1398             return (total_cov - sum_x*trans(sum_y)/total_count)/(total_count-1);
1399         }
1400 
1401         const running_cross_covariance operator+ (
1402             const running_cross_covariance& item
1403         ) const
1404         {
1405             // make sure requires clause is not broken
1406             DLIB_ASSERT((x_vector_size() == 0 || item.x_vector_size() == 0 || x_vector_size() == item.x_vector_size()) &&
1407                         (y_vector_size() == 0 || item.y_vector_size() == 0 || y_vector_size() == item.y_vector_size()),
1408                 "\t running_cross_covariance running_cross_covariance::operator+()"
1409                 << "\n\t The two running_cross_covariance objects being added must have compatible parameters"
1410                 << "\n\t x_vector_size():            " << x_vector_size()
1411                 << "\n\t item.x_vector_size():       " << item.x_vector_size()
1412                 << "\n\t y_vector_size():            " << y_vector_size()
1413                 << "\n\t item.y_vector_size():       " << item.y_vector_size()
1414                 << "\n\t this:                       " << this
1415                 );
1416 
1417             running_cross_covariance temp(item);
1418 
1419             // make sure we ignore empty matrices
1420             if (total_count != 0 && temp.total_count != 0)
1421             {
1422                 temp.total_cov += total_cov;
1423                 temp.sum_x += sum_x;
1424                 temp.sum_y += sum_y;
1425                 temp.total_count += total_count;
1426             }
1427             else if (total_count != 0)
1428             {
1429                 temp.total_cov = total_cov;
1430                 temp.sum_x = sum_x;
1431                 temp.sum_y = sum_y;
1432                 temp.total_count = total_count;
1433             }
1434 
1435             return temp;
1436         }
1437 
1438 
1439     private:
1440 
1441         general_matrix total_cov;
1442         column_matrix sum_x;
1443         column_matrix sum_y;
1444         scalar_type total_count;
1445 
1446         long x_vect_size;
1447         long y_vect_size;
1448     };
1449 
1450 // ----------------------------------------------------------------------------------------
1451 
1452     template <
1453         typename matrix_type
1454         >
1455     class vector_normalizer
1456     {
1457     public:
1458         typedef typename matrix_type::mem_manager_type mem_manager_type;
1459         typedef typename matrix_type::type scalar_type;
1460         typedef matrix_type result_type;
1461 
1462         template <typename vector_type>
train(const vector_type & samples)1463         void train (
1464             const vector_type& samples
1465         )
1466         {
1467             // make sure requires clause is not broken
1468             DLIB_ASSERT(samples.size() > 0,
1469                 "\tvoid vector_normalizer::train()"
1470                 << "\n\tyou have to give a nonempty set of samples to this function"
1471                 << "\n\tthis: " << this
1472                 );
1473 
1474             m = mean(mat(samples));
1475             sd = reciprocal(sqrt(variance(mat(samples))));
1476 
1477             DLIB_ASSERT(is_finite(m), "Some of the input vectors to vector_normalizer::train() have infinite or NaN values");
1478         }
1479 
in_vector_size()1480         long in_vector_size (
1481         ) const
1482         {
1483             return m.nr();
1484         }
1485 
out_vector_size()1486         long out_vector_size (
1487         ) const
1488         {
1489             return m.nr();
1490         }
1491 
means()1492         const matrix_type& means (
1493         ) const
1494         {
1495             return m;
1496         }
1497 
std_devs()1498         const matrix_type& std_devs (
1499         ) const
1500         {
1501             return sd;
1502         }
1503 
operator()1504         const result_type& operator() (
1505             const matrix_type& x
1506         ) const
1507         {
1508             // make sure requires clause is not broken
1509             DLIB_ASSERT(x.nr() == in_vector_size() && x.nc() == 1,
1510                 "\tmatrix vector_normalizer::operator()"
1511                 << "\n\t you have given invalid arguments to this function"
1512                 << "\n\t x.nr():           " << x.nr()
1513                 << "\n\t in_vector_size(): " << in_vector_size()
1514                 << "\n\t x.nc():           " << x.nc()
1515                 << "\n\t this:             " << this
1516                 );
1517 
1518             temp_out = pointwise_multiply(x-m, sd);
1519             return temp_out;
1520         }
1521 
swap(vector_normalizer & item)1522         void swap (
1523             vector_normalizer& item
1524         )
1525         {
1526             m.swap(item.m);
1527             sd.swap(item.sd);
1528             temp_out.swap(item.temp_out);
1529         }
1530 
1531         template <typename mt>
1532         friend void deserialize (
1533             vector_normalizer<mt>& item,
1534             std::istream& in
1535         );
1536 
1537         template <typename mt>
1538         friend void serialize (
1539             const vector_normalizer<mt>& item,
1540             std::ostream& out
1541         );
1542 
1543     private:
1544 
1545         // ------------------- private data members -------------------
1546 
1547         matrix_type m, sd;
1548 
1549         // This is just a temporary variable that doesn't contribute to the
1550         // state of this object.
1551         mutable matrix_type temp_out;
1552     };
1553 
1554 // ----------------------------------------------------------------------------------------
1555 
1556     template <
1557         typename matrix_type
1558         >
swap(vector_normalizer<matrix_type> & a,vector_normalizer<matrix_type> & b)1559     inline void swap (
1560         vector_normalizer<matrix_type>& a,
1561         vector_normalizer<matrix_type>& b
1562     ) { a.swap(b); }
1563 
1564 // ----------------------------------------------------------------------------------------
1565 
1566     template <
1567         typename matrix_type
1568         >
deserialize(vector_normalizer<matrix_type> & item,std::istream & in)1569     void deserialize (
1570         vector_normalizer<matrix_type>& item,
1571         std::istream& in
1572     )
1573     {
1574         deserialize(item.m, in);
1575         deserialize(item.sd, in);
1576         // Keep deserializing the pca matrix for backwards compatibility.
1577         matrix<double> pca;
1578         deserialize(pca, in);
1579 
1580         if (pca.size() != 0)
1581             throw serialization_error("Error deserializing object of type vector_normalizer\n"
1582                                         "It looks like a serialized vector_normalizer_pca was accidentally deserialized into \n"
1583                                         "a vector_normalizer object.");
1584     }
1585 
1586 // ----------------------------------------------------------------------------------------
1587 
1588     template <
1589         typename matrix_type
1590         >
serialize(const vector_normalizer<matrix_type> & item,std::ostream & out)1591     void serialize (
1592         const vector_normalizer<matrix_type>& item,
1593         std::ostream& out
1594     )
1595     {
1596         serialize(item.m, out);
1597         serialize(item.sd, out);
1598         // Keep serializing the pca matrix for backwards compatibility.
1599         matrix<double> pca;
1600         serialize(pca, out);
1601     }
1602 
1603 // ----------------------------------------------------------------------------------------
1604 // ----------------------------------------------------------------------------------------
1605 // ----------------------------------------------------------------------------------------
1606 
1607     template <
1608         typename matrix_type
1609         >
1610     class vector_normalizer_pca
1611     {
1612     public:
1613         typedef typename matrix_type::mem_manager_type mem_manager_type;
1614         typedef typename matrix_type::type scalar_type;
1615         typedef matrix<scalar_type,0,1,mem_manager_type> result_type;
1616 
1617         template <typename vector_type>
1618         void train (
1619             const vector_type& samples,
1620             const double eps = 0.99
1621         )
1622         {
1623             // You are getting an error here because you are trying to apply PCA
1624             // to a vector of fixed length.  But PCA is going to try and do
1625             // dimensionality reduction so you can't use a vector with a fixed dimension.
1626             COMPILE_TIME_ASSERT(matrix_type::NR == 0);
1627 
1628             // make sure requires clause is not broken
1629             DLIB_ASSERT(samples.size() > 0,
1630                 "\tvoid vector_normalizer_pca::train_pca()"
1631                 << "\n\tyou have to give a nonempty set of samples to this function"
1632                 << "\n\tthis: " << this
1633                 );
1634             DLIB_ASSERT(0 < eps && eps <= 1,
1635                 "\tvoid vector_normalizer_pca::train_pca()"
1636                 << "\n\tyou have to give a nonempty set of samples to this function"
1637                 << "\n\tthis: " << this
1638                 );
1639             train_pca_impl(mat(samples),eps);
1640 
1641             DLIB_ASSERT(is_finite(m), "Some of the input vectors to vector_normalizer_pca::train() have infinite or NaN values");
1642         }
1643 
in_vector_size()1644         long in_vector_size (
1645         ) const
1646         {
1647             return m.nr();
1648         }
1649 
out_vector_size()1650         long out_vector_size (
1651         ) const
1652         {
1653             return pca.nr();
1654         }
1655 
means()1656         const matrix<scalar_type,0,1,mem_manager_type>& means (
1657         ) const
1658         {
1659             return m;
1660         }
1661 
std_devs()1662         const matrix<scalar_type,0,1,mem_manager_type>& std_devs (
1663         ) const
1664         {
1665             return sd;
1666         }
1667 
pca_matrix()1668         const matrix<scalar_type,0,0,mem_manager_type>& pca_matrix (
1669         ) const
1670         {
1671             return pca;
1672         }
1673 
operator()1674         const result_type& operator() (
1675             const matrix_type& x
1676         ) const
1677         {
1678             // make sure requires clause is not broken
1679             DLIB_ASSERT(x.nr() == in_vector_size() && x.nc() == 1,
1680                 "\tmatrix vector_normalizer_pca::operator()"
1681                 << "\n\t you have given invalid arguments to this function"
1682                 << "\n\t x.nr():           " << x.nr()
1683                 << "\n\t in_vector_size(): " << in_vector_size()
1684                 << "\n\t x.nc():           " << x.nc()
1685                 << "\n\t this:             " << this
1686                 );
1687 
1688             // If we have a pca transform matrix on hand then
1689             // also apply that.
1690             temp_out = pca*pointwise_multiply(x-m, sd);
1691 
1692             return temp_out;
1693         }
1694 
swap(vector_normalizer_pca & item)1695         void swap (
1696             vector_normalizer_pca& item
1697         )
1698         {
1699             m.swap(item.m);
1700             sd.swap(item.sd);
1701             pca.swap(item.pca);
1702             temp_out.swap(item.temp_out);
1703         }
1704 
1705         template <typename T>
1706         friend void deserialize (
1707             vector_normalizer_pca<T>& item,
1708             std::istream& in
1709         );
1710 
1711         template <typename T>
1712         friend void serialize (
1713             const vector_normalizer_pca<T>& item,
1714             std::ostream& out
1715         );
1716 
1717     private:
1718 
1719         template <typename mat_type>
train_pca_impl(const mat_type & samples,const double eps)1720         void train_pca_impl (
1721             const mat_type& samples,
1722             const double eps
1723         )
1724         {
1725             m = mean(samples);
1726             sd = reciprocal(sqrt(variance(samples)));
1727 
1728             // fill x with the normalized version of the input samples
1729             matrix<typename mat_type::type,0,1,mem_manager_type> x(samples);
1730             for (long r = 0; r < x.size(); ++r)
1731                 x(r) = pointwise_multiply(x(r)-m, sd);
1732 
1733             matrix<scalar_type,0,0,mem_manager_type> temp, eigen;
1734             matrix<scalar_type,0,1,mem_manager_type> eigenvalues;
1735 
1736             // Compute the svd of the covariance matrix of the normalized inputs
1737             svd(covariance(x), temp, eigen, pca);
1738             eigenvalues = diag(eigen);
1739 
1740             rsort_columns(pca, eigenvalues);
1741 
1742             // figure out how many eigenvectors we want in our pca matrix
1743             const double thresh = sum(eigenvalues)*eps;
1744             long num_vectors = 0;
1745             double total = 0;
1746             for (long r = 0; r < eigenvalues.size() && total < thresh; ++r)
1747             {
1748                 ++num_vectors;
1749                 total += eigenvalues(r);
1750             }
1751 
1752             // So now we know we want to use num_vectors of the first eigenvectors.  So
1753             // pull those out and discard the rest.
1754             pca = trans(colm(pca,range(0,num_vectors-1)));
1755 
1756             // Apply the pca transform to the data in x.  Then we will normalize the
1757             // pca matrix below.
1758             for (long r = 0; r < x.nr(); ++r)
1759             {
1760                 x(r) = pca*x(r);
1761             }
1762 
1763             // Here we just scale the output features from the pca transform so
1764             // that the variance of each feature is 1.  So this doesn't really change
1765             // what the pca is doing, it just makes sure the output features are
1766             // normalized.
1767             pca = trans(scale_columns(trans(pca), reciprocal(sqrt(variance(x)))));
1768         }
1769 
1770 
1771         // ------------------- private data members -------------------
1772 
1773         matrix<scalar_type,0,1,mem_manager_type> m, sd;
1774         matrix<scalar_type,0,0,mem_manager_type> pca;
1775 
1776         // This is just a temporary variable that doesn't contribute to the
1777         // state of this object.
1778         mutable result_type temp_out;
1779     };
1780 
1781     template <
1782         typename matrix_type
1783         >
swap(vector_normalizer_pca<matrix_type> & a,vector_normalizer_pca<matrix_type> & b)1784     inline void swap (
1785         vector_normalizer_pca<matrix_type>& a,
1786         vector_normalizer_pca<matrix_type>& b
1787     ) { a.swap(b); }
1788 
1789 // ----------------------------------------------------------------------------------------
1790 
1791     template <
1792         typename matrix_type
1793         >
deserialize(vector_normalizer_pca<matrix_type> & item,std::istream & in)1794     void deserialize (
1795         vector_normalizer_pca<matrix_type>& item,
1796         std::istream& in
1797     )
1798     {
1799         deserialize(item.m, in);
1800         deserialize(item.sd, in);
1801         deserialize(item.pca, in);
1802         if (item.pca.nc() != item.m.nr())
1803             throw serialization_error("Error deserializing object of type vector_normalizer_pca\n"
1804                                         "It looks like a serialized vector_normalizer was accidentally deserialized into \n"
1805                                         "a vector_normalizer_pca object.");
1806     }
1807 
1808     template <
1809         typename matrix_type
1810         >
serialize(const vector_normalizer_pca<matrix_type> & item,std::ostream & out)1811     void serialize (
1812         const vector_normalizer_pca<matrix_type>& item,
1813         std::ostream& out
1814     )
1815     {
1816         serialize(item.m, out);
1817         serialize(item.sd, out);
1818         serialize(item.pca, out);
1819     }
1820 
1821 // ----------------------------------------------------------------------------------------
1822 
binomial_random_vars_are_different(uint64_t k1,uint64_t n1,uint64_t k2,uint64_t n2)1823     inline double binomial_random_vars_are_different (
1824         uint64_t k1,
1825         uint64_t n1,
1826         uint64_t k2,
1827         uint64_t n2
1828     )
1829     {
1830         DLIB_ASSERT(k1 <= n1, "k1: "<< k1 << "  n1: "<< n1);
1831         DLIB_ASSERT(k2 <= n2, "k2: "<< k2 << "  n2: "<< n2);
1832 
1833         const double p1 = k1/(double)n1;
1834         const double p2 = k2/(double)n2;
1835         const double p = (k1+k2)/(double)(n1+n2);
1836 
1837         auto ll = [](double p, uint64_t k, uint64_t n) {
1838             if (p == 0 || p == 1)
1839                 return 0.0;
1840             return k*std::log(p) + (n-k)*std::log(1-p);
1841         };
1842 
1843         auto logll = ll(p1,k1,n1) + ll(p2,k2,n2) - ll(p,k1,n1) - ll(p,k2,n2);
1844 
1845         // The basic statistic only tells you if the random variables are different.  But
1846         // it's nice to know which way they are different, i.e., which one is bigger.  So
1847         // stuff that information into the sign bit of the return value.
1848         if (p1>=p2)
1849             return logll;
1850         else
1851             return -logll;
1852     }
1853 
1854 // ----------------------------------------------------------------------------------------
1855 
event_correlation(uint64_t A_count,uint64_t B_count,uint64_t AB_count,uint64_t total_num_observations)1856     inline double event_correlation (
1857         uint64_t A_count,
1858         uint64_t B_count,
1859         uint64_t AB_count,
1860         uint64_t total_num_observations
1861     )
1862     {
1863         DLIB_ASSERT(AB_count <= A_count && A_count <= total_num_observations,
1864             "AB_count: " << AB_count << ", A_count: "<< A_count << ", total_num_observations: " << total_num_observations);
1865         DLIB_ASSERT(AB_count <= B_count && B_count <= total_num_observations,
1866             "AB_count: " << AB_count << ", B_count: "<< B_count << ", total_num_observations: " << total_num_observations);
1867 
1868         if (total_num_observations == 0)
1869             return 0;
1870 
1871         DLIB_ASSERT(A_count + B_count - AB_count <= total_num_observations,
1872             "AB_count: " << AB_count << " A_count: " << A_count <<  ", B_count: "<< B_count << ", total_num_observations: " << total_num_observations);
1873 
1874 
1875         const auto AnotB_count = A_count - AB_count;
1876         const auto notB_count = total_num_observations - B_count;
1877         // How likely is it that the odds of A happening is different when conditioned on
1878         // whether or not B happened?
1879         return binomial_random_vars_are_different(
1880             AB_count, B_count,      // A conditional on the presence of B
1881             AnotB_count, notB_count // A conditional on the absence of B
1882         );
1883     }
1884 
1885 // ----------------------------------------------------------------------------------------
1886 
1887 }
1888 
1889 #endif // DLIB_STATISTICs_
1890 
1891