1 //==============================================================================
2 //
3 // This file is part of GPSTk, the GPS Toolkit.
4 //
5 // The GPSTk is free software; you can redistribute it and/or modify
6 // it under the terms of the GNU Lesser General Public License as published
7 // by the Free Software Foundation; either version 3.0 of the License, or
8 // any later version.
9 //
10 // The GPSTk is distributed in the hope that it will be useful,
11 // but WITHOUT ANY WARRANTY; without even the implied warranty of
12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 // GNU Lesser General Public License for more details.
14 //
15 // You should have received a copy of the GNU Lesser General Public
16 // License along with GPSTk; if not, write to the Free Software Foundation,
17 // Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110, USA
18 //
19 // This software was developed by Applied Research Laboratories at the
20 // University of Texas at Austin.
21 // Copyright 2004-2020, The Board of Regents of The University of Texas System
22 //
23 //==============================================================================
24
25 //==============================================================================
26 //
27 // This software was developed by Applied Research Laboratories at the
28 // University of Texas at Austin, under contract to an agency or agencies
29 // within the U.S. Department of Defense. The U.S. Government retains all
30 // rights to use, duplicate, distribute, disclose, or release this software.
31 //
32 // Pursuant to DoD Directive 523024
33 //
34 // DISTRIBUTION STATEMENT A: This software has been approved for public
35 // release, distribution is unlimited.
36 //
37 //==============================================================================
38
39 /// @file RobustStats.cpp
40 /// Namespace Robust includes basic robust statistical computations, including median,
41 /// median average deviation, quartiles and m-estimate, as well as implementation of
42 /// of stem-and-leaf plots, quantile plots and robust least squares estimation of a
43 /// polynomial.
44 /// Reference: Mason, Gunst and Hess, "Statistical Design and
45 /// Analysis of Experiments," Wiley, New York, 1989.
46
47 //------------------------------------------------------------------------------------
48 // GPSTk includes
49 #include "Exception.hpp"
50 #include "StringUtils.hpp"
51 #include "Matrix.hpp"
52 #include "SRI.hpp"
53 #include "RobustStats.hpp"
54
55 //------------------------------------------------------------------------------------
56 // moved to RobustStats.hpp as macros
57 //const double gpstk::Robust::TuningT=1.5; // or 1.345; // or 1.5
58 //const double gpstk::Robust::TuningA=0.778; // or 0.67; // or 0.778
59 //const double gpstk::Robust::TuningE=0.6745;
60
61 //------------------------------------------------------------------------------------
62 using namespace std;
63 using namespace gpstk;
64 using namespace StringUtils;
65
66 //------------------------------------------------------------------------------------
Stem(double x,double & scale)67 inline long Stem(double x, double& scale) { return (long(x/scale)); }
68
69 //------------------------------------------------------------------------------------
StemLeafPlot(ostream & os,double * xd,long nd,string msg)70 void Robust::StemLeafPlot(ostream& os, double *xd, long nd, string msg)
71 {
72 long stem, l, nout=0, s, sM, sQ1, sQ3, sOH, sOL;
73 int i, sign, pos, k, leaf;
74 unsigned len=0, kk;
75 char c;
76 double x,scale;
77 string buf;
78
79 if(!xd || nd < 2) GPSTK_THROW(Exception("Invalid input"));
80
81 // find range
82 double range = xd[nd-1] - xd[0]; // max - min
83 if(range < 0.0)
84 GPSTK_THROW(Exception("Array is not sorted"));
85 if(range == 0.0)
86 range = xd[0]; //GPSTK_THROW(Exception("Array has zero range"));
87
88 // find scale
89 scale = 0.0;
90 short nscale=0; // scale = 1*10^nscale
91 if(range >= 10.0)
92 sign = 1;
93 else if(range < 1.0)
94 sign = -1;
95 else
96 scale = 1.0;
97
98 if(!scale) do {
99 nscale += sign;
100 } while(range*::pow(10.0,-nscale) < 1.0 || range*::pow(10.0,-nscale) >= 10.0);
101
102 scale = ::pow(10.0,nscale);
103
104 double M=Robust::Median(xd,nd);
105 double Q1,Q3;
106 Robust::Quartiles(xd,nd,Q1,Q3);
107 // outlier limits
108 double OH=2.5*Q3-1.5*Q1; // outlier high limit
109 double OL=2.5*Q1-1.5*Q3; // outlier low limit ('oh L' not 'zero L')
110
111 // starting stem=stem(min=xd[0]), and stem step==scale
112 i = 1+short((range/scale)+0.5); // number of stems
113 if(xd[0]*xd[nd-1] < 0.0) i++; // add one stem for zero
114 if(nd > 8 && i < 8 && xd[nd-1] != xd[0]) { // fudge so #stems is big enough
115 scale /= 10.0;
116 nscale--;
117 }
118
119 // find length of stem for printing
120 // must look through all data b/c leaf==10 can bump up the stem
121 len = 0;
122 for(l=0; l<nd; l++) {
123 // this code is duplicated below in the main loop
124 sign = 1;
125 if(xd[l] < 0.0) sign=-1;
126 stem = Stem(fabs(xd[l]),scale);
127 x = 10*fabs(xd[l]/scale-sign*stem);
128 leaf = short(x + 0.5);
129 if(leaf == 10) {
130 stem++;
131 leaf=0;
132 }
133 stem = sign*stem;
134 buf = asString<long>(stem);
135 if(len < buf.size()) len=buf.size();
136 }
137
138 // loop through data, adding stems and leaves to plot
139 bool start=true;
140 if(xd[0] < 0.0) pos=-1; else pos=1;
141 sM = Stem(M,scale);
142 sQ1 = Stem(Q1,scale);
143 sQ3 = Stem(Q3,scale);
144 sOH = Stem(OH,scale);
145 sOL = Stem(OL,scale);
146 for(l=0; l<nd; l++) {
147 // current: stem=s,pos; data=stem,sign(xd[l])
148 if(xd[l]>OH || xd[l]<OL) nout++; // count outliers
149 sign = 1;
150 if(xd[l] < 0.0) sign=-1;
151 stem = Stem(fabs(xd[l]),scale);
152 x = 10*fabs(xd[l]/scale-sign*stem);
153 leaf = short(x + 0.5);
154 if(leaf == 10) {
155 stem++;
156 leaf=0;
157 }
158 stem = sign*stem;
159
160 // print it
161 if(start || s!=stem || (s==0 && pos*sign<0.0)) {
162 // Change of stem -> print
163 if(start) { // first time through
164 os << "Stem and Leaf Plot (scale "; // print scale
165 if(nscale < -8 || nscale > 8) {
166 os << "1.0e" << nscale;
167 }
168 else {
169 i=nscale;
170 if(nscale < 0) {
171 os << "0.";
172 i++;
173 k = 1;
174 }
175 else {
176 os << "1";
177 k = -1;
178 }
179 while(i != 0) {
180 os << "0";
181 i += k;
182 }
183 if(nscale < 0)
184 os << "1";
185 else
186 os << ".0";
187 }
188 os << ", " << nd << "pts) : "; // print npts
189 if(msg.size() > 0) os << msg; // and message
190 s = stem - 1; // save for later
191 start = false;
192 }
193
194 while(s < stem || (s==0 && pos*sign<0)) { // also print stems without leaves
195 if(s != 0) s++;
196 else if(pos < 0) pos = 1;
197 else s++;
198
199 // print the new line with stem s
200 os << "\n";
201 buf = asString<long>(s < 0 ? -s : s); // abs(stem)
202
203 if(s<0) c='-'; // sign of stem
204 else if(s>0) c='+';
205 else if(pos>0)c='+';
206 else c='-';
207 os << c;
208
209 for(kk=buf.size(); kk<len; kk++) os << " "; // pad out to length
210 os << buf << " "; // stem/axis space
211
212 // now print either |, M (median), Q (quartiles), or >< (outliers)
213 k=0;
214
215 // if s==sM
216 if(s==sM && (s!=0 || pos*M>0.0)) {
217 os << "M"; // marks the median
218 k++;
219 }
220
221 if((s == sQ3 && (s != 0 || pos*Q3 < 0.0)) ||
222 (s == sQ1 && (s != 0 || pos*Q1 < 0.0) )) {
223 os << "Q"; // marks a quartile
224 k++;
225 }
226
227 if((s < sOL || (s == 0 && sOL == 0 && (pos == -1 && OL > 0.0))) ||
228 (s == sOL && (s != 0 || pos*OL > 0.0))) {
229 os << "<"; // marks an outlier (small)
230 k++;
231 }
232 else if((s > sOH || (s == 0 && sOH == 0 && (pos == 1 && OH < 0.0))) ||
233 (s == sOH && (s != 0 || pos*OH > 0.0))) {
234 os << ">"; // marks an outlier (big)
235 k++;
236 }
237
238 if(k == 0) {
239 os << "|"; // marks a regular point
240 k++;
241 }
242
243 while(k < 3) {
244 os << " ";
245 k++;
246 }
247 }
248 } // end change stem
249
250 // print leaf
251 os << leaf;
252 }
253
254 os << "\nEND Stem and Leaf Plot (there are " << nout << " outliers.)\n";
255
256 } // end StemLeafPlot
257
258
Quantiles(double * xd,long nd)259 void Robust::Quantiles(double *xd, long nd)
260 {
261 if(!xd || nd<2) {
262 Exception e("Invalid input");
263 GPSTK_THROW(e);
264 }
265
266 double f;
267 for(int i=0; i<nd; i++) {
268 f = double(8*i+5)/double(8*nd+2); // f(i) = i-3/8 / n+1/4, i=1,n
269 xd[i] = 4.91*(::pow(f,0.14) - ::pow(1-f,0.14));
270 }
271
272 } // end Quantiles
273
274
275 #define SEQUENTIAL 1 // don't see much difference in timing...
RobustPolyFit(double * xd,const double * td,int nd,int N,double * c,double * w)276 int Robust::RobustPolyFit(double *xd, const double *td, int nd,
277 int N, double *c, double *w)
278 {
279 try {
280 if(!xd || !td || !c || nd < 2) {
281 Exception e("Invalid input");
282 GPSTK_THROW(e);
283 }
284
285 const int maxiter=50;
286 const double conv_limit=::sqrt(double(nd))*1.e-3;
287 int i,j,k,niter;
288 double x0=xd[0],t0=td[0],mad,median,conv;
289 #ifdef SEQUENTIAL
290 double fit,dt;
291 Matrix<double> R,A(1,N+1),invR;
292 Vector<double> Z;
293 #else
294 Matrix<double> PT,P(nd,N,1.0);
295 Vector<double> D(nd);
296 #endif
297 Matrix<double> Cov;
298 Vector<double> Wts(nd,1.0), Coeff(N,0.0), Res(nd), ResCopy(nd);
299
300 #ifndef SEQUENTIAL
301 // build the data vector and the (constant) partials matrix
302 for(i=0; i<nd; i++) {
303 D(i) = xd[i]-x0;
304 for(j=1; j<N; j++)
305 P(i,j) = P(i,j-1)*(td[i]-t0);
306 }
307 #endif
308
309 // iterate until weights don't change
310 niter = 0;
311 while(1) {
312 #ifdef SEQUENTIAL
313 R = Matrix<double>(N,N,0.0);
314 Z = Vector<double>(N,0.0);
315 // loop over the data
316 for(i=0; i<nd; i++) {
317 //if(Wts(i) < 1.e-8) continue; // ignore if weight is very small
318 A(0,N) = (xd[i]-x0)*Wts(i); // data
319 dt = td[i]-t0;
320 A(0,0) = 1.0*Wts(i); // partials
321 for(j=1; j<N; j++) A(0,j) = A(0,j-1)*dt;
322 SrifMU<double>(R,Z,A,1);
323 }
324 #else
325 // compute partials transpose multiplied by 'weight matrix'=diag(squared wts)
326 PT = transpose(P);
327 for(i=0; i<N; i++)
328 for(j=0; j<nd; j++)
329 PT(i,j) *= Wts(j)*Wts(j);
330 Cov = PT * P; // information matrix
331 #endif
332
333 // solve
334 try {
335 #ifdef SEQUENTIAL
336 invR = inverseUT(R,&mad,&median);
337 Cov = UTtimesTranspose(invR);
338 Coeff = invR * Z;
339 #else
340 Cov = inverse(Cov);
341 Coeff = Cov * PT * D;
342 #endif
343 }
344 catch(Exception& e) { return -1; }
345
346 // compute residuals
347 #ifdef SEQUENTIAL
348 // loop over the data
349 for(i=0; i<nd; i++) {
350 fit = Coeff(N-1); // fit = partials * coeff
351 dt = td[i]-t0; // delta time
352 for(j=N-2; j>=0; j--)
353 fit = fit*dt + Coeff(j);
354 ResCopy(i) = Res(i) = xd[i]-x0 - fit; // data - fit
355 }
356 #else
357 ResCopy = Res = D - P*Coeff;
358 #endif
359
360 // compute median and MAD. NB Median() will sort the vector...
361 mad = MedianAbsoluteDeviation(&(ResCopy[0]),ResCopy.size(),median);
362
363 // recompute weights
364 Vector<double> OldWts(Wts);
365 for(i=0; i<nd; i++) {
366 if(Res(i) < -RobustTuningT*mad)
367 Wts(i) = -RobustTuningT*mad/Res(i);
368 else if(Res(i) > RobustTuningT*mad)
369 Wts(i) = RobustTuningT*mad/Res(i);
370 else
371 Wts(i) = 1.0;
372 }
373
374 // test for convergence
375 niter++;
376 conv = RMS(OldWts - Wts);
377 if(niter > maxiter) break; // return -2;
378 if(conv > 1.) break; // return -3;
379 if(niter > 2 && conv < conv_limit) break;
380 }
381
382 // copy out weights, residuals and solution
383 for(i=0; i<N; i++) c[i] = Coeff(i);
384 //c[0] += x0;
385 for(i=0; i<nd; i++) {
386 xd[i] = Res(i);
387 if(w) w[i] = Wts(i);
388 }
389 if(niter > maxiter) return -2;
390 if(conv > 1.) return -3;
391
392 #undef SEQUENTIAL
393 return 0;
394 }
395 catch(Exception& e) { GPSTK_RETHROW(e); }
396 catch(std::exception& e) {
397 Exception E("std except: "+string(e.what()));
398 GPSTK_THROW(E);
399 }
400 catch(...) { Exception e("Unknown exception"); GPSTK_THROW(e); }
401 }
402
403 // Anderson-Darling test statistic, which is a variant of the Kolmogorov-Smirnoff
404 // test, comparing the distribution of data with mean m and standard deviation s
405 // to the normal distribution.
406 // @param xd array of data.
407 // @param nd length of array xd.
408 // @param mean mean of the data.
409 // @param stddev standard deviation of the data.
410 // @param save_flag if true (default) array xd will NOT be changed, otherwise
411 // it will be sorted.
ADtest(double * xd,const int nd,double mean,double stddev,bool save_flag)412 double gpstk::ADtest(double *xd, const int nd,
413 double mean, double stddev, bool save_flag)
414 {
415 if(!xd || nd < 2) {
416 Exception e("Invalid input");
417 GPSTK_THROW(e);
418 }
419
420 try {
421 int i;
422 double med, *save;
423
424 if(save_flag) {
425 save = new double[nd];
426 if(!save) {
427 Exception e("Could not allocate temporary array");
428 GPSTK_THROW(e);
429 }
430 for(i=0; i<nd; i++) save[i]=xd[i];
431 }
432 QSort(xd,nd);
433
434 double tn = double(nd);
435 double AD = -tn, cdf;
436 for(i=0; i<nd; i++) {
437 cdf = normalCDF<double>(mean, stddev, xd[i]);
438 //cout << "CDF " << setw(4) << i
439 // << " " << fixed << setprecision(6) << setw(11) << xd[i]
440 // << " " << setw(11) << cdf << endl;
441 AD -= ((2.0*double(i)-1.0) * ::log(cdf)
442 + (2.0*(tn-double(i))+1.0) * ::log(1.0-cdf))/tn;
443 }
444
445 AD *= 1.0 + (0.75+2.25/tn)/tn;
446
447 if(save_flag) {
448 for(i=0; i<nd; i++) xd[i]=save[i];
449 delete[] save;
450 }
451
452 return AD;
453 }
454 catch(Exception& e) { GPSTK_RETHROW(e); }
455 catch(std::exception& e) {
456 Exception E("std except: "+string(e.what()));
457 GPSTK_THROW(E);
458 }
459 catch(...) { Exception e("Unknown exception"); GPSTK_THROW(e); }
460 }
461
462 //------------------------------------------------------------------------------------
463 //------------------------------------------------------------------------------------
464