1 /**********************************************************************
2  * File:        linlsq.cpp  (Formerly llsq.c)
3  * Description: Linear Least squares fitting code.
4  * Author:      Ray Smith
5  *
6  * (C) Copyright 1991, Hewlett-Packard Ltd.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  *
17  **********************************************************************/
18 
19 #include "linlsq.h"
20 #include <cmath> // for std::sqrt
21 #include <cstdio>
22 #include "errcode.h"
23 
24 namespace tesseract {
25 
26 constexpr ERRCODE EMPTY_LLSQ("Can't delete from an empty LLSQ");
27 
28 /**********************************************************************
29  * LLSQ::clear
30  *
31  * Function to initialize a LLSQ.
32  **********************************************************************/
33 
clear()34 void LLSQ::clear() {  // initialize
35   total_weight = 0.0; // no elements
36   sigx = 0.0;         // update accumulators
37   sigy = 0.0;
38   sigxx = 0.0;
39   sigxy = 0.0;
40   sigyy = 0.0;
41 }
42 
43 /**********************************************************************
44  * LLSQ::add
45  *
46  * Add an element to the accumulator.
47  **********************************************************************/
48 
add(double x,double y)49 void LLSQ::add(double x, double y) { // add an element
50   total_weight++;                    // count elements
51   sigx += x;                         // update accumulators
52   sigy += y;
53   sigxx += x * x;
54   sigxy += x * y;
55   sigyy += y * y;
56 }
57 // Adds an element with a specified weight.
add(double x,double y,double weight)58 void LLSQ::add(double x, double y, double weight) {
59   total_weight += weight;
60   sigx += x * weight; // update accumulators
61   sigy += y * weight;
62   sigxx += x * x * weight;
63   sigxy += x * y * weight;
64   sigyy += y * y * weight;
65 }
66 // Adds a whole LLSQ.
add(const LLSQ & other)67 void LLSQ::add(const LLSQ &other) {
68   total_weight += other.total_weight;
69   sigx += other.sigx; // update accumulators
70   sigy += other.sigy;
71   sigxx += other.sigxx;
72   sigxy += other.sigxy;
73   sigyy += other.sigyy;
74 }
75 
76 /**********************************************************************
77  * LLSQ::remove
78  *
79  * Delete an element from the acculuator.
80  **********************************************************************/
81 
remove(double x,double y)82 void LLSQ::remove(double x, double y) { // delete an element
83   if (total_weight <= 0.0) {            // illegal
84     EMPTY_LLSQ.error("LLSQ::remove", ABORT, nullptr);
85   }
86   total_weight--; // count elements
87   sigx -= x;      // update accumulators
88   sigy -= y;
89   sigxx -= x * x;
90   sigxy -= x * y;
91   sigyy -= y * y;
92 }
93 
94 /**********************************************************************
95  * LLSQ::m
96  *
97  * Return the gradient of the line fit.
98  **********************************************************************/
99 
m() const100 double LLSQ::m() const { // get gradient
101   double covar = covariance();
102   double x_var = x_variance();
103   if (x_var != 0.0) {
104     return covar / x_var;
105   } else {
106     return 0.0; // too little
107   }
108 }
109 
110 /**********************************************************************
111  * LLSQ::c
112  *
113  * Return the constant of the line fit.
114  **********************************************************************/
115 
c(double m) const116 double LLSQ::c(double m) const { // get constant
117   if (total_weight > 0.0) {
118     return (sigy - m * sigx) / total_weight;
119   } else {
120     return 0; // too little
121   }
122 }
123 
124 /**********************************************************************
125  * LLSQ::rms
126  *
127  * Return the rms error of the fit.
128  **********************************************************************/
129 
rms(double m,double c) const130 double LLSQ::rms(double m, double c) const { // get error
131   double error;                              // total error
132 
133   if (total_weight > 0) {
134     error = sigyy + m * (m * sigxx + 2 * (c * sigx - sigxy)) + c * (total_weight * c - 2 * sigy);
135     if (error >= 0) {
136       error = std::sqrt(error / total_weight); // sqrt of mean
137     } else {
138       error = 0;
139     }
140   } else {
141     error = 0; // too little
142   }
143   return error;
144 }
145 
146 /**********************************************************************
147  * LLSQ::pearson
148  *
149  * Return the pearson product moment correlation coefficient.
150  **********************************************************************/
151 
pearson() const152 double LLSQ::pearson() const { // get correlation
153   double r = 0.0;              // Correlation is 0 if insufficient data.
154 
155   double covar = covariance();
156   if (covar != 0.0) {
157     double var_product = x_variance() * y_variance();
158     if (var_product > 0.0) {
159       r = covar / std::sqrt(var_product);
160     }
161   }
162   return r;
163 }
164 
165 // Returns the x,y means as an FCOORD.
mean_point() const166 FCOORD LLSQ::mean_point() const {
167   if (total_weight > 0.0) {
168     return FCOORD(sigx / total_weight, sigy / total_weight);
169   } else {
170     return FCOORD(0.0f, 0.0f);
171   }
172 }
173 
174 // Returns the sqrt of the mean squared error measured perpendicular from the
175 // line through mean_point() in the direction dir.
176 //
177 // Derivation:
178 //   Lemma:  Let v and x_i (i=1..N) be a k-dimensional vectors (1xk matrices).
179 //     Let % be dot product and ' be transpose.  Note that:
180 //      Sum[i=1..N] (v % x_i)^2
181 //         = v * [x_1' x_2' ... x_N'] * [x_1' x_2' .. x_N']' * v'
182 //     If x_i have average 0 we have:
183 //       = v * (N * COVARIANCE_MATRIX(X)) * v'
184 //     Expanded for the case that k = 2, where we treat the dimensions
185 //     as x_i and y_i, this is:
186 //       = v * (N * [VAR(X), COV(X,Y); COV(X,Y) VAR(Y)]) * v'
187 //  Now, we are trying to calculate the mean squared error, where v is
188 //  perpendicular to our line of interest:
189 //    Mean squared error
190 //      = E [ (v % (x_i - x_avg))) ^2 ]
191 //      = Sum (v % (x_i - x_avg))^2 / N
192 //      = v * N * [VAR(X) COV(X,Y); COV(X,Y) VAR(Y)] / N * v'
193 //      = v * [VAR(X) COV(X,Y); COV(X,Y) VAR(Y)] * v'
194 //      = code below
rms_orth(const FCOORD & dir) const195 double LLSQ::rms_orth(const FCOORD &dir) const {
196   FCOORD v = !dir;
197   v.normalise();
198   return std::sqrt(x_variance() * v.x() * v.x() + 2 * covariance() * v.x() * v.y() +
199                    y_variance() * v.y() * v.y());
200 }
201 
202 // Returns the direction of the fitted line as a unit vector, using the
203 // least mean squared perpendicular distance. The line runs through the
204 // mean_point, i.e. a point p on the line is given by:
205 // p = mean_point() + lambda * vector_fit() for some real number lambda.
206 // Note that the result (0<=x<=1, -1<=y<=-1) is directionally ambiguous
207 // and may be negated without changing its meaning.
208 // Fitting a line m + ��v to a set of N points Pi = (xi, yi), where
209 // m is the mean point (��, ��) and
210 // v is the direction vector (cos��, sin��)
211 // The perpendicular distance of each Pi from the line is:
212 // (Pi - m) x v, where x is the scalar cross product.
213 // Total squared error is thus:
214 // E = ∑((xi - ��)sin�� - (yi - ��)cos��)²
215 //   = ∑(xi - ��)²sin²��  - 2∑(xi - ��)(yi - ��)sin�� cos�� + ∑(yi - ��)²cos²��
216 //   = NVar(xi)sin²��  - 2NCovar(xi, yi)sin�� cos��  + NVar(yi)cos²��   (Eq 1)
217 // where Var(xi) is the variance of xi,
218 // and Covar(xi, yi) is the covariance of xi, yi.
219 // Taking the derivative wrt �� and setting to 0 to obtain the min/max:
220 // 0 = 2NVar(xi)sin�� cos�� -2NCovar(xi, yi)(cos²�� - sin²��) -2NVar(yi)sin�� cos��
221 // => Covar(xi, yi)(cos²�� - sin²��) = (Var(xi) - Var(yi))sin�� cos��
222 // Using double angles:
223 // 2Covar(xi, yi)cos2�� = (Var(xi) - Var(yi))sin2��   (Eq 2)
224 // So �� = 0.5 atan2(2Covar(xi, yi), Var(xi) - Var(yi)) (Eq 3)
225 
226 // Because it involves 2�� , Eq 2 has 2 solutions 90 degrees apart, but which
227 // is the min and which is the max? From Eq1:
228 // E/N = Var(xi)sin²��  - 2Covar(xi, yi)sin�� cos��  + Var(yi)cos²��
229 // and 90 degrees away, using sin/cos equivalences:
230 // E'/N = Var(xi)cos²��  + 2Covar(xi, yi)sin�� cos��  + Var(yi)sin²��
231 // The second error is smaller (making it the minimum) iff
232 // E'/N < E/N ie:
233 // (Var(xi) - Var(yi))(cos²�� - sin²��) < -4Covar(xi, yi)sin�� cos��
234 // Using double angles:
235 // (Var(xi) - Var(yi))cos2��  < -2Covar(xi, yi)sin2��  (InEq 1)
236 // But atan2(2Covar(xi, yi), Var(xi) - Var(yi)) picks 2��  such that:
237 // sgn(cos2��) = sgn(Var(xi) - Var(yi)) and sgn(sin2��) = sgn(Covar(xi, yi))
238 // so InEq1 can *never* be true, making the atan2 result *always* the min!
239 // In the degenerate case, where Covar(xi, yi) = 0 AND Var(xi) = Var(yi),
240 // the 2 solutions have equal error and the inequality is still false.
241 // Therefore the solution really is as trivial as Eq 3.
242 
243 // This is equivalent to returning the Principal Component in PCA, or the
244 // eigenvector corresponding to the largest eigenvalue in the covariance
245 // matrix.  However, atan2 is much simpler! The one reference I found that
246 // uses this formula is http://web.mit.edu/18.06/www/Essays/tlsfit.pdf but
247 // that is still a much more complex derivation. It seems Pearson had already
248 // found this simple solution in 1901.
249 // http://books.google.com/books?id=WXwvAQAAIAAJ&pg=PA559
vector_fit() const250 FCOORD LLSQ::vector_fit() const {
251   double x_var = x_variance();
252   double y_var = y_variance();
253   double covar = covariance();
254   double theta = 0.5 * atan2(2.0 * covar, x_var - y_var);
255   FCOORD result(cos(theta), sin(theta));
256   return result;
257 }
258 
259 } // namespace tesseract
260