/**********************************************************************
 * File: linlsq.cpp (Formerly llsq.c)
 * Description: Linear Least squares fitting code.
 * Author: Ray Smith
 *
 * (C) Copyright 1991, Hewlett-Packard Ltd.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/

#include "linlsq.h"
#include <cmath> // for std::sqrt
#include <cstdio>
#include "errcode.h"

namespace tesseract {

constexpr ERRCODE EMPTY_LLSQ("Can't delete from an empty LLSQ");

/**********************************************************************
 * LLSQ::clear
 *
 * Function to initialize an LLSQ.
 **********************************************************************/

void LLSQ::clear() { // initialize
  total_weight = 0.0; // no elements
  sigx = 0.0;         // update accumulators
  sigy = 0.0;
  sigxx = 0.0;
  sigxy = 0.0;
  sigyy = 0.0;
}

/**********************************************************************
 * LLSQ::add
 *
 * Add an element to the accumulator.
 **********************************************************************/

void LLSQ::add(double x, double y) { // add an element
  total_weight++; // count elements
  sigx += x;      // update accumulators
  sigy += y;
  sigxx += x * x;
  sigxy += x * y;
  sigyy += y * y;
}
// Adds an element with a specified weight.
void LLSQ::add(double x, double y, double weight) {
  total_weight += weight;
  sigx += x * weight; // update accumulators
  sigy += y * weight;
  sigxx += x * x * weight;
  sigxy += x * y * weight;
  sigyy += y * y * weight;
}
// Adds a whole LLSQ.
void LLSQ::add(const LLSQ &other) {
  total_weight += other.total_weight;
  sigx += other.sigx; // update accumulators
  sigy += other.sigy;
  sigxx += other.sigxx;
  sigxy += other.sigxy;
  sigyy += other.sigyy;
}
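
// Illustrative usage sketch (not part of the library): accumulate a few
// points and read back the least-squares line y = m*x + c and its rms error.
// The point values below are arbitrary sample data.
//
//   LLSQ llsq;
//   llsq.add(1.0, 2.1);
//   llsq.add(2.0, 3.9);
//   llsq.add(3.0, 6.2);
//   double gradient = llsq.m();           // slope of the best-fit line
//   double intercept = llsq.c(gradient);  // y-intercept for that slope
//   double fit_error = llsq.rms(gradient, intercept);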

/**********************************************************************
 * LLSQ::remove
 *
 * Delete an element from the accumulator.
 **********************************************************************/

void LLSQ::remove(double x, double y) { // delete an element
  if (total_weight <= 0.0) { // illegal
    EMPTY_LLSQ.error("LLSQ::remove", ABORT, nullptr);
  }
  total_weight--; // count elements
  sigx -= x;      // update accumulators
  sigy -= y;
  sigxx -= x * x;
  sigxy -= x * y;
  sigyy -= y * y;
}

/**********************************************************************
 * LLSQ::m
 *
 * Return the gradient of the line fit.
 **********************************************************************/

double LLSQ::m() const { // get gradient
  double covar = covariance();
  double x_var = x_variance();
  if (x_var != 0.0) {
    return covar / x_var;
  } else {
    return 0.0; // too little data
  }
}

/**********************************************************************
 * LLSQ::c
 *
 * Return the constant of the line fit.
 **********************************************************************/

double LLSQ::c(double m) const { // get constant
  if (total_weight > 0.0) {
    return (sigy - m * sigx) / total_weight;
  } else {
    return 0.0; // too little data
  }
}


/**********************************************************************
 * LLSQ::rms
 *
 * Return the rms error of the fit.
 **********************************************************************/

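// Note on the error expression below: it is the weighted residual sum
// sum_i w_i * (y_i - m*x_i - c)^2 expanded in terms of the stored sums:
//   sigyy + m*(m*sigxx + 2*(c*sigx - sigxy)) + c*(total_weight*c - 2*sigy)
// which is then divided by total_weight and square-rooted to give the rms.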
double LLSQ::rms(double m, double c) const { // get error
  double error; // total error

  if (total_weight > 0) {
    error = sigyy + m * (m * sigxx + 2 * (c * sigx - sigxy)) + c * (total_weight * c - 2 * sigy);
    if (error >= 0) {
      error = std::sqrt(error / total_weight); // sqrt of mean
    } else {
      error = 0;
    }
  } else {
    error = 0; // too little data
  }
  return error;
}

/**********************************************************************
 * LLSQ::pearson
 *
 * Return the Pearson product-moment correlation coefficient.
 **********************************************************************/

double LLSQ::pearson() const { // get correlation
  double r = 0.0; // Correlation is 0 if insufficient data.

  double covar = covariance();
  if (covar != 0.0) {
    double var_product = x_variance() * y_variance();
    if (var_product > 0.0) {
      r = covar / std::sqrt(var_product);
    }
  }
  return r;
}

// Returns the x,y means as an FCOORD.
FCOORD LLSQ::mean_point() const {
  if (total_weight > 0.0) {
    return FCOORD(sigx / total_weight, sigy / total_weight);
  } else {
    return FCOORD(0.0f, 0.0f);
  }
}

// Returns the sqrt of the mean squared error measured perpendicular from the
// line through mean_point() in the direction dir.
//
// Derivation:
// Lemma: Let v and x_i (i=1..N) be k-dimensional vectors (1xk matrices).
// Let % be dot product and ' be transpose. Note that:
// Sum[i=1..N] (v % x_i)^2
// = v * [x_1' x_2' ... x_N'] * [x_1' x_2' ... x_N']' * v'
// If x_i have average 0 we have:
// = v * (N * COVARIANCE_MATRIX(X)) * v'
// Expanded for the case that k = 2, where we treat the dimensions
// as x_i and y_i, this is:
// = v * (N * [VAR(X), COV(X,Y); COV(X,Y) VAR(Y)]) * v'
// Now, we are trying to calculate the mean squared error, where v is
// perpendicular to our line of interest:
// Mean squared error
// = E [ (v % (x_i - x_avg))^2 ]
// = Sum (v % (x_i - x_avg))^2 / N
// = v * N * [VAR(X) COV(X,Y); COV(X,Y) VAR(Y)] / N * v'
// = v * [VAR(X) COV(X,Y); COV(X,Y) VAR(Y)] * v'
// = code below
double LLSQ::rms_orth(const FCOORD &dir) const {
  FCOORD v = !dir; // v is perpendicular to dir
  v.normalise();
  return std::sqrt(x_variance() * v.x() * v.x() + 2 * covariance() * v.x() * v.y() +
                   y_variance() * v.y() * v.y());
}
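
// Illustrative usage sketch (not part of the library): measure how tightly
// accumulated points hug a roughly horizontal line through their mean. The
// sample points and the probe direction (1, 0) are arbitrary choices.
//
//   LLSQ llsq;
//   llsq.add(0.0, 1.0);
//   llsq.add(1.0, 1.1);
//   llsq.add(2.0, 0.9);
//   double spread = llsq.rms_orth(FCOORD(1.0f, 0.0f)); // small for near-collinear points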

// Returns the direction of the fitted line as a unit vector, using the
// least mean squared perpendicular distance. The line runs through the
// mean_point, i.e. a point p on the line is given by:
// p = mean_point() + lambda * vector_fit() for some real number lambda.
// Note that the result (0<=x<=1, -1<=y<=1) is directionally ambiguous
// and may be negated without changing its meaning.
// Fitting a line m + λv to a set of N points Pi = (xi, yi), where
// m is the mean point (μ, ν) and
// v is the direction vector (cosθ, sinθ)
// The perpendicular distance of each Pi from the line is:
// (Pi - m) x v, where x is the scalar cross product.
// Total squared error is thus:
// E = ∑((xi - μ)sinθ - (yi - ν)cosθ)²
//   = ∑(xi - μ)²sin²θ - 2∑(xi - μ)(yi - ν)sinθ cosθ + ∑(yi - ν)²cos²θ
//   = NVar(xi)sin²θ - 2NCovar(xi, yi)sinθ cosθ + NVar(yi)cos²θ   (Eq 1)
// where Var(xi) is the variance of xi,
// and Covar(xi, yi) is the covariance of xi, yi.
// Taking the derivative wrt θ and setting to 0 to obtain the min/max:
// 0 = 2NVar(xi)sinθ cosθ - 2NCovar(xi, yi)(cos²θ - sin²θ) - 2NVar(yi)sinθ cosθ
// => Covar(xi, yi)(cos²θ - sin²θ) = (Var(xi) - Var(yi))sinθ cosθ
// Using double angles:
// 2Covar(xi, yi)cos2θ = (Var(xi) - Var(yi))sin2θ   (Eq 2)
// So θ = 0.5 atan2(2Covar(xi, yi), Var(xi) - Var(yi))   (Eq 3)
//
// Because it involves 2θ, Eq 2 has 2 solutions 90 degrees apart, but which
// is the min and which is the max? From Eq 1:
// E/N = Var(xi)sin²θ - 2Covar(xi, yi)sinθ cosθ + Var(yi)cos²θ
// and 90 degrees away, using sin/cos equivalences:
// E'/N = Var(xi)cos²θ + 2Covar(xi, yi)sinθ cosθ + Var(yi)sin²θ
// The second error is smaller (making it the minimum) iff
// E'/N < E/N, ie:
// (Var(xi) - Var(yi))(cos²θ - sin²θ) < -4Covar(xi, yi)sinθ cosθ
// Using double angles:
// (Var(xi) - Var(yi))cos2θ < -2Covar(xi, yi)sin2θ   (InEq 1)
// But atan2(2Covar(xi, yi), Var(xi) - Var(yi)) picks 2θ such that:
// sgn(cos2θ) = sgn(Var(xi) - Var(yi)) and sgn(sin2θ) = sgn(Covar(xi, yi)),
// so InEq 1 can *never* be true, making the atan2 result *always* the min!
// In the degenerate case, where Covar(xi, yi) = 0 AND Var(xi) = Var(yi),
// the 2 solutions have equal error and the inequality is still false.
// Therefore the solution really is as trivial as Eq 3.
//
// This is equivalent to returning the Principal Component in PCA, or the
// eigenvector corresponding to the largest eigenvalue in the covariance
// matrix. However, atan2 is much simpler! The one reference I found that
// uses this formula is http://web.mit.edu/18.06/www/Essays/tlsfit.pdf but
// that is still a much more complex derivation. It seems Pearson had already
// found this simple solution in 1901.
// http://books.google.com/books?id=WXwvAQAAIAAJ&pg=PA559
FCOORD LLSQ::vector_fit() const {
  double x_var = x_variance();
  double y_var = y_variance();
  double covar = covariance();
  double theta = 0.5 * std::atan2(2.0 * covar, x_var - y_var);
  FCOORD result(std::cos(theta), std::sin(theta));
  return result;
}
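
// Illustrative usage sketch (not part of the library): recover the dominant
// direction of a point cloud. For points lying near a 45-degree line the
// result is close to (cos(pi/4), sin(pi/4)), up to sign. Sample data is
// arbitrary.
//
//   LLSQ llsq;
//   llsq.add(0.0, 0.1);
//   llsq.add(1.0, 0.9);
//   llsq.add(2.0, 2.1);
//   FCOORD direction = llsq.vector_fit(); // unit direction of the fitted line
//   FCOORD centre = llsq.mean_point();    // a point on the fitted line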

} // namespace tesseract