1 //==============================================================================
2 //
3 //  This file is part of GPSTk, the GPS Toolkit.
4 //
5 //  The GPSTk is free software; you can redistribute it and/or modify
6 //  it under the terms of the GNU Lesser General Public License as published
7 //  by the Free Software Foundation; either version 3.0 of the License, or
8 //  any later version.
9 //
10 //  The GPSTk is distributed in the hope that it will be useful,
11 //  but WITHOUT ANY WARRANTY; without even the implied warranty of
12 //  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 //  GNU Lesser General Public License for more details.
14 //
15 //  You should have received a copy of the GNU Lesser General Public
16 //  License along with GPSTk; if not, write to the Free Software Foundation,
17 //  Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110, USA
18 //
19 //  This software was developed by Applied Research Laboratories at the
20 //  University of Texas at Austin.
21 //  Copyright 2004-2020, The Board of Regents of The University of Texas System
22 //
23 //==============================================================================
24 
25 //==============================================================================
26 //
27 //  This software was developed by Applied Research Laboratories at the
28 //  University of Texas at Austin, under contract to an agency or agencies
29 //  within the U.S. Department of Defense. The U.S. Government retains all
30 //  rights to use, duplicate, distribute, disclose, or release this software.
31 //
32 //  Pursuant to DoD Directive 523024
33 //
34 //  DISTRIBUTION STATEMENT A: This software has been approved for public
35 //                            release, distribution is unlimited.
36 //
37 //==============================================================================
38 
39 /// @file RobustStats.cpp
40 /// Namespace Robust includes basic robust statistical computations, including median,
41 /// median average deviation, quartiles and m-estimate, as well as implementation of
42 /// of stem-and-leaf plots, quantile plots and robust least squares estimation of a
43 /// polynomial.
44 /// Reference: Mason, Gunst and Hess, "Statistical Design and
45 ///            Analysis of Experiments," Wiley, New York, 1989.
46 
47 //------------------------------------------------------------------------------------
48 // GPSTk includes
49 #include "Exception.hpp"
50 #include "StringUtils.hpp"
51 #include "Matrix.hpp"
52 #include "SRI.hpp"
53 #include "RobustStats.hpp"
54 
55 //------------------------------------------------------------------------------------
56 // moved to RobustStats.hpp as macros
57 //const double gpstk::Robust::TuningT=1.5;      // or 1.345;       // or 1.5
58 //const double gpstk::Robust::TuningA=0.778;    // or 0.67;        // or 0.778
59 //const double gpstk::Robust::TuningE=0.6745;
60 
61 //------------------------------------------------------------------------------------
62 using namespace std;
63 using namespace gpstk;
64 using namespace StringUtils;
65 
66 //------------------------------------------------------------------------------------
Stem(double x,double & scale)67 inline long Stem(double x, double& scale) { return (long(x/scale)); }
68 
69 //------------------------------------------------------------------------------------
StemLeafPlot(ostream & os,double * xd,long nd,string msg)70 void Robust::StemLeafPlot(ostream& os, double *xd, long nd, string msg)
71 {
72    long stem, l, nout=0, s, sM, sQ1, sQ3, sOH, sOL;
73    int i, sign, pos, k, leaf;
74    unsigned len=0, kk;
75    char c;
76    double x,scale;
77    string buf;
78 
79    if(!xd || nd < 2) GPSTK_THROW(Exception("Invalid input"));
80 
81       // find range
82    double range = xd[nd-1] - xd[0];                   // max - min
83    if(range < 0.0)
84       GPSTK_THROW(Exception("Array is not sorted"));
85    if(range == 0.0)
86       range = xd[0]; //GPSTK_THROW(Exception("Array has zero range"));
87 
88       // find scale
89    scale = 0.0;
90    short nscale=0;       // scale = 1*10^nscale
91    if(range >= 10.0)
92       sign = 1;
93    else if(range < 1.0)
94       sign = -1;
95    else
96       scale = 1.0;
97 
98    if(!scale) do {
99       nscale += sign;
100    } while(range*::pow(10.0,-nscale) < 1.0 || range*::pow(10.0,-nscale) >= 10.0);
101 
102    scale = ::pow(10.0,nscale);
103 
104    double M=Robust::Median(xd,nd);
105    double Q1,Q3;
106    Robust::Quartiles(xd,nd,Q1,Q3);
107       // outlier limits
108    double OH=2.5*Q3-1.5*Q1;         // outlier high limit
109    double OL=2.5*Q1-1.5*Q3;         // outlier low limit ('oh L' not 'zero L')
110 
111       // starting stem=stem(min=xd[0]), and stem step==scale
112    i = 1+short((range/scale)+0.5);              // number of stems
113    if(xd[0]*xd[nd-1] < 0.0) i++;                // add one stem for zero
114    if(nd > 8 && i < 8 && xd[nd-1] != xd[0]) {   // fudge so #stems is big enough
115       scale /= 10.0;
116       nscale--;
117    }
118 
119       // find length of stem for printing
120       // must look through all data b/c leaf==10 can bump up the stem
121    len = 0;
122    for(l=0; l<nd; l++) {
123       // this code is duplicated below in the main loop
124       sign = 1;
125       if(xd[l] < 0.0) sign=-1;
126       stem = Stem(fabs(xd[l]),scale);
127       x = 10*fabs(xd[l]/scale-sign*stem);
128       leaf = short(x + 0.5);
129       if(leaf == 10) {
130          stem++;
131          leaf=0;
132       }
133       stem = sign*stem;
134       buf = asString<long>(stem);
135       if(len < buf.size()) len=buf.size();
136    }
137 
138       // loop through data, adding stems and leaves to plot
139    bool start=true;
140    if(xd[0] < 0.0) pos=-1; else pos=1;
141    sM = Stem(M,scale);
142    sQ1 = Stem(Q1,scale);
143    sQ3 = Stem(Q3,scale);
144    sOH = Stem(OH,scale);
145    sOL = Stem(OL,scale);
146    for(l=0; l<nd; l++) {
147          // current: stem=s,pos; data=stem,sign(xd[l])
148       if(xd[l]>OH || xd[l]<OL) nout++;                   // count outliers
149       sign = 1;
150       if(xd[l] < 0.0) sign=-1;
151       stem = Stem(fabs(xd[l]),scale);
152       x = 10*fabs(xd[l]/scale-sign*stem);
153       leaf = short(x + 0.5);
154       if(leaf == 10) {
155          stem++;
156          leaf=0;
157       }
158       stem = sign*stem;
159 
160       // print it
161       if(start || s!=stem || (s==0 && pos*sign<0.0)) {
162          // Change of stem -> print
163          if(start) {                                        // first time through
164             os << "Stem and Leaf Plot (scale ";             // print scale
165             if(nscale < -8 || nscale > 8) {
166                os << "1.0e" << nscale;
167             }
168             else {
169                i=nscale;
170                if(nscale < 0) {
171                   os << "0.";
172                   i++;
173                   k = 1;
174                }
175                else {
176                   os << "1";
177                   k = -1;
178                }
179                while(i != 0) {
180                   os << "0";
181                   i += k;
182                }
183                if(nscale < 0)
184                   os << "1";
185                else
186                   os << ".0";
187             }
188             os << ", " << nd << "pts) : ";                  // print npts
189             if(msg.size() > 0) os << msg;                   // and message
190             s = stem - 1;                                   // save for later
191             start = false;
192          }
193 
194          while(s < stem || (s==0 && pos*sign<0)) { // also print stems without leaves
195             if(s != 0) s++;
196             else if(pos < 0) pos = 1;
197             else s++;
198 
199                // print the new line with stem s
200             os << "\n";
201             buf = asString<long>(s < 0 ? -s : s); // abs(stem)
202 
203             if(s<0) c='-';                                     // sign of stem
204             else if(s>0) c='+';
205             else if(pos>0)c='+';
206             else c='-';
207             os << c;
208 
209             for(kk=buf.size(); kk<len; kk++) os << " ";        // pad out to length
210             os << buf << " ";                                  // stem/axis space
211 
212                // now print either |, M (median), Q (quartiles), or >< (outliers)
213             k=0;
214 
215                // if s==sM
216             if(s==sM && (s!=0 || pos*M>0.0)) {
217                os << "M";                       // marks the median
218                k++;
219             }
220 
221             if((s == sQ3 && (s != 0 || pos*Q3 < 0.0)) ||
222                   (s == sQ1 && (s != 0 || pos*Q1 < 0.0) )) {
223                os << "Q";                       // marks a quartile
224                k++;
225             }
226 
227             if((s < sOL || (s == 0 && sOL == 0 && (pos == -1 && OL > 0.0))) ||
228                (s == sOL && (s != 0 || pos*OL > 0.0))) {
229                os << "<";                       // marks an outlier (small)
230                k++;
231             }
232             else if((s > sOH || (s == 0 && sOH == 0 && (pos == 1 && OH < 0.0))) ||
233                (s == sOH && (s != 0 || pos*OH > 0.0))) {
234                os << ">";                       // marks an outlier (big)
235                k++;
236             }
237 
238             if(k == 0) {
239                os << "|";                       // marks a regular point
240                k++;
241             }
242 
243             while(k < 3) {
244                os << " ";
245                k++;
246             }
247          }
248       }     // end change stem
249 
250          // print leaf
251       os << leaf;
252    }
253 
254    os << "\nEND Stem and Leaf Plot (there are " << nout << " outliers.)\n";
255 
256 }  // end StemLeafPlot
257 
258 
Quantiles(double * xd,long nd)259 void Robust::Quantiles(double *xd, long nd)
260 {
261    if(!xd || nd<2) {
262       Exception e("Invalid input");
263       GPSTK_THROW(e);
264    }
265 
266    double f;
267    for(int i=0; i<nd; i++) {
268       f = double(8*i+5)/double(8*nd+2);         // f(i) = i-3/8 / n+1/4, i=1,n
269       xd[i] = 4.91*(::pow(f,0.14) - ::pow(1-f,0.14));
270    }
271 
272 }  // end Quantiles
273 
274 
275 #define SEQUENTIAL 1     // don't see much difference in timing...
RobustPolyFit(double * xd,const double * td,int nd,int N,double * c,double * w)276 int Robust::RobustPolyFit(double *xd, const double *td, int nd,
277                           int N, double *c, double *w)
278 {
279    try {
280       if(!xd || !td || !c || nd < 2) {
281          Exception e("Invalid input");
282          GPSTK_THROW(e);
283       }
284 
285       const int maxiter=50;
286       const double conv_limit=::sqrt(double(nd))*1.e-3;
287       int i,j,k,niter;
288       double x0=xd[0],t0=td[0],mad,median,conv;
289 #ifdef SEQUENTIAL
290       double fit,dt;
291       Matrix<double> R,A(1,N+1),invR;
292       Vector<double> Z;
293 #else
294       Matrix<double> PT,P(nd,N,1.0);
295       Vector<double> D(nd);
296 #endif
297       Matrix<double> Cov;
298       Vector<double> Wts(nd,1.0), Coeff(N,0.0), Res(nd), ResCopy(nd);
299 
300 #ifndef SEQUENTIAL
301       // build the data vector and the (constant) partials matrix
302       for(i=0; i<nd; i++) {
303          D(i) = xd[i]-x0;
304          for(j=1; j<N; j++)
305             P(i,j) = P(i,j-1)*(td[i]-t0);
306       }
307 #endif
308 
309       // iterate until weights don't change
310       niter = 0;
311       while(1) {
312 #ifdef SEQUENTIAL
313          R = Matrix<double>(N,N,0.0);
314          Z = Vector<double>(N,0.0);
315          // loop over the data
316          for(i=0; i<nd; i++) {
317             //if(Wts(i) < 1.e-8) continue;           // ignore if weight is very small
318             A(0,N) = (xd[i]-x0)*Wts(i);              // data
319             dt = td[i]-t0;
320             A(0,0) = 1.0*Wts(i);                     // partials
321             for(j=1; j<N; j++) A(0,j) = A(0,j-1)*dt;
322             SrifMU<double>(R,Z,A,1);
323          }
324 #else
325          // compute partials transpose multiplied by 'weight matrix'=diag(squared wts)
326          PT = transpose(P);
327          for(i=0; i<N; i++)
328             for(j=0; j<nd; j++)
329                PT(i,j) *= Wts(j)*Wts(j);
330          Cov = PT * P;        // information matrix
331 #endif
332 
333          // solve
334          try {
335 #ifdef SEQUENTIAL
336          invR = inverseUT(R,&mad,&median);
337          Cov = UTtimesTranspose(invR);
338          Coeff = invR * Z;
339 #else
340          Cov = inverse(Cov);
341          Coeff = Cov * PT * D;
342 #endif
343          }
344          catch(Exception& e) { return -1; }
345 
346          // compute residuals
347 #ifdef SEQUENTIAL
348          // loop over the data
349          for(i=0; i<nd; i++) {
350             fit = Coeff(N-1);                      // fit = partials * coeff
351             dt = td[i]-t0;                         // delta time
352             for(j=N-2; j>=0; j--)
353                fit = fit*dt + Coeff(j);
354             ResCopy(i) = Res(i) = xd[i]-x0 - fit;  // data - fit
355          }
356 #else
357          ResCopy = Res = D - P*Coeff;
358 #endif
359 
360          // compute median and MAD. NB Median() will sort the vector...
361          mad = MedianAbsoluteDeviation(&(ResCopy[0]),ResCopy.size(),median);
362 
363          // recompute weights
364          Vector<double> OldWts(Wts);
365          for(i=0; i<nd; i++) {
366             if(Res(i) < -RobustTuningT*mad)
367                Wts(i) = -RobustTuningT*mad/Res(i);
368             else if(Res(i) > RobustTuningT*mad)
369                Wts(i) = RobustTuningT*mad/Res(i);
370             else
371                Wts(i) = 1.0;
372          }
373 
374          // test for convergence
375          niter++;
376          conv = RMS(OldWts - Wts);
377          if(niter > maxiter) break; // return -2;
378          if(conv > 1.) break; // return -3;
379          if(niter > 2 && conv < conv_limit) break;
380       }
381 
382       // copy out weights, residuals and solution
383       for(i=0; i<N; i++) c[i] = Coeff(i);
384       //c[0] += x0;
385       for(i=0; i<nd; i++) {
386          xd[i] = Res(i);
387          if(w) w[i] = Wts(i);
388       }
389       if(niter > maxiter) return -2;
390       if(conv > 1.) return -3;
391 
392 #undef SEQUENTIAL
393       return 0;
394    }
395    catch(Exception& e) { GPSTK_RETHROW(e); }
396    catch(std::exception& e) {
397       Exception E("std except: "+string(e.what()));
398       GPSTK_THROW(E);
399    }
400    catch(...) { Exception e("Unknown exception"); GPSTK_THROW(e); }
401 }
402 
403    // Anderson-Darling test statistic, which is a variant of the Kolmogorov-Smirnoff
404    // test, comparing the distribution of data with mean m and standard deviation s
405    // to the normal distribution.
406    // @param xd         array of data.
407    // @param nd         length of array xd.
408    // @param mean       mean of the data.
409    // @param stddev     standard deviation of the data.
410    // @param save_flag  if true (default) array xd will NOT be changed, otherwise
411    //                      it will be sorted.
ADtest(double * xd,const int nd,double mean,double stddev,bool save_flag)412 double gpstk::ADtest(double *xd, const int nd,
413                      double mean, double stddev, bool save_flag)
414 {
415    if(!xd || nd < 2) {
416       Exception e("Invalid input");
417       GPSTK_THROW(e);
418    }
419 
420    try {
421       int i;
422       double med, *save;
423 
424       if(save_flag) {
425          save = new double[nd];
426          if(!save) {
427             Exception e("Could not allocate temporary array");
428             GPSTK_THROW(e);
429          }
430          for(i=0; i<nd; i++) save[i]=xd[i];
431       }
432       QSort(xd,nd);
433 
434       double tn = double(nd);
435       double AD = -tn, cdf;
436       for(i=0; i<nd; i++) {
437          cdf = normalCDF<double>(mean, stddev, xd[i]);
438          //cout << "CDF " << setw(4) << i
439          //     << " " << fixed << setprecision(6) << setw(11) << xd[i]
440          //     << " " << setw(11) << cdf << endl;
441          AD -= ((2.0*double(i)-1.0) * ::log(cdf)
442                + (2.0*(tn-double(i))+1.0) * ::log(1.0-cdf))/tn;
443       }
444 
445       AD *= 1.0 + (0.75+2.25/tn)/tn;
446 
447       if(save_flag) {
448          for(i=0; i<nd; i++) xd[i]=save[i];
449          delete[] save;
450       }
451 
452       return AD;
453    }
454    catch(Exception& e) { GPSTK_RETHROW(e); }
455    catch(std::exception& e) {
456       Exception E("std except: "+string(e.what()));
457       GPSTK_THROW(E);
458    }
459    catch(...) { Exception e("Unknown exception"); GPSTK_THROW(e); }
460 }
461 
462 //------------------------------------------------------------------------------------
463 //------------------------------------------------------------------------------------
464