1 /*
2   chronyd/chronyc - Programs for keeping computer clocks accurate.
3 
4  **********************************************************************
5  * Copyright (C) Richard P. Curnow  1997-2003
6  * Copyright (C) Miroslav Lichvar  2011, 2016-2017
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of version 2 of the GNU General Public License as
10  * published by the Free Software Foundation.
11  *
12  * This program is distributed in the hope that it will be useful, but
13  * WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License along
18  * with this program; if not, write to the Free Software Foundation, Inc.,
19  * 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
20  *
21  **********************************************************************
22 
23   =======================================================================
24 
25   Regression algorithms.
26 
27   */
28 
29 #include "config.h"
30 
31 #include "sysincl.h"
32 
33 #include "regress.h"
34 #include "logging.h"
35 #include "util.h"
36 
37 #define MAX_POINTS 64
38 
39 void
RGR_WeightedRegression(double * x,double * y,double * w,int n,double * b0,double * b1,double * s2,double * sb0,double * sb1)40 RGR_WeightedRegression
41 (double *x,                     /* independent variable */
42  double *y,                     /* measured data */
43  double *w,                     /* weightings (large => data
44                                    less reliable) */
45 
46  int n,                         /* number of data points */
47 
48  /* And now the results */
49 
50  double *b0,                    /* estimated y axis intercept */
51  double *b1,                    /* estimated slope */
52  double *s2,                    /* estimated variance of data points */
53 
54  double *sb0,                   /* estimated standard deviation of
55                                    intercept */
56  double *sb1                    /* estimated standard deviation of
57                                    slope */
58 
59  /* Could add correlation stuff later if required */
60 )
61 {
62   double P, Q, U, V, W;
63   double diff;
64   double u, ui, aa;
65   int i;
66 
67   assert(n >= 3);
68 
69   W = U = 0;
70   for (i=0; i<n; i++) {
71     U += x[i]        / w[i];
72     W += 1.0         / w[i];
73   }
74 
75   u = U / W;
76 
77   /* Calculate statistics from data */
78   P = Q = V = 0.0;
79   for (i=0; i<n; i++) {
80     ui = x[i] - u;
81     P += y[i]        / w[i];
82     Q += y[i] * ui   / w[i];
83     V += ui   * ui   / w[i];
84   }
85 
86   *b1 = Q / V;
87   *b0 = (P / W) - (*b1) * u;
88 
89   *s2 = 0.0;
90   for (i=0; i<n; i++) {
91     diff = y[i] - *b0 - *b1*x[i];
92     *s2 += diff*diff / w[i];
93   }
94 
95   *s2 /= (double)(n-2);
96 
97   *sb1 = sqrt(*s2 / V);
98   aa = u * (*sb1);
99   *sb0 = sqrt(*s2 / W + aa * aa);
100 
101   *s2 *= (n / W); /* Giving weighted average of variances */
102 }
103 
104 /* ================================================== */
/* Get the coefficient to multiply the standard deviation by, to get a
   particular size of confidence interval (assuming a t-distribution) */

double
RGR_GetTCoef(int dof)
{
  /* Assuming now the 99.95% quantile; table is indexed by
     (degrees of freedom - 1) for dof = 1..40 */
  static const float coefs[] =
  { 636.6, 31.6, 12.92, 8.61, 6.869,
    5.959, 5.408, 5.041, 4.781, 4.587,
    4.437, 4.318, 4.221, 4.140, 4.073,
    4.015, 3.965, 3.922, 3.883, 3.850,
    3.819, 3.792, 3.768, 3.745, 3.725,
    3.707, 3.690, 3.674, 3.659, 3.646,
    3.633, 3.622, 3.611, 3.601, 3.591,
    3.582, 3.574, 3.566, 3.558, 3.551};

  if (dof < 1) {
    /* Guard against an out-of-bounds read (the previous code read
       coefs[-1] for dof <= 0); return the most conservative
       (largest) coefficient */
    return coefs[0];
  } else if (dof <= 40) {
    return coefs[dof-1];
  } else {
    return 3.5; /* Until I can be bothered to do something better */
  }
}
128 
129 /* ================================================== */
/* Get 90% quantile of chi-square distribution */

double
RGR_GetChi2Coef(int dof)
{
  /* Table is indexed by (degrees of freedom - 1) for dof = 1..64 */
  static const float coefs[] = {
    2.706, 4.605, 6.251, 7.779, 9.236, 10.645, 12.017, 13.362,
    14.684, 15.987, 17.275, 18.549, 19.812, 21.064, 22.307, 23.542,
    24.769, 25.989, 27.204, 28.412, 29.615, 30.813, 32.007, 33.196,
    34.382, 35.563, 36.741, 37.916, 39.087, 40.256, 41.422, 42.585,
    43.745, 44.903, 46.059, 47.212, 48.363, 49.513, 50.660, 51.805,
    52.949, 54.090, 55.230, 56.369, 57.505, 58.641, 59.774, 60.907,
    62.038, 63.167, 64.295, 65.422, 66.548, 67.673, 68.796, 69.919,
    71.040, 72.160, 73.279, 74.397, 75.514, 76.630, 77.745, 78.860
  };

  if (dof < 1) {
    /* Guard against an out-of-bounds read (the previous code read
       coefs[-1] for dof <= 0) */
    return coefs[0];
  } else if (dof <= 64) {
    return coefs[dof-1];
  } else {
    return 1.2 * dof; /* Until I can be bothered to do something better */
  }
}
152 
153 /* ================================================== */
/* Critical value for number of runs of residuals with same sign.
   5% critical region for now.  Indexed by the number of residuals;
   a run count at or below the table value means the residuals look
   too systematic for the fit to be accepted.  The table is read-only,
   so declare it const. */

static const char critical_runs[] = {
  0,  0,  0,  0,  0,  0,  0,  0,  2,  3,
  3,  3,  4,  4,  5,  5,  5,  6,  6,  7,
  7,  7,  8,  8,  9,  9,  9, 10, 10, 11,
 11, 11, 12, 12, 13, 13, 14, 14, 14, 15,
 15, 16, 16, 17, 17, 18, 18, 18, 19, 19,
 20, 20, 21, 21, 21, 22, 22, 23, 23, 24,
 24, 25, 25, 26, 26, 26, 27, 27, 28, 28,
 29, 29, 30, 30, 30, 31, 31, 32, 32, 33,
 33, 34, 34, 35, 35, 35, 36, 36, 37, 37,
 38, 38, 39, 39, 40, 40, 40, 41, 41, 42,
 42, 43, 43, 44, 44, 45, 45, 46, 46, 46,
 47, 47, 48, 48, 49, 49, 50, 50, 51, 51,
 52, 52, 52, 53, 53, 54, 54, 55, 55, 56
};
172 
173 /* ================================================== */
174 
/* Count the runs of consecutive residuals with the same strict sign.
   A residual of exactly zero always terminates the current run.  A
   non-empty sequence has at least one run. */
static int
n_runs_from_residuals(double *resid, int n)
{
  int runs, j, same_sign;

  runs = 1;
  for (j = 1; j < n; j++) {
    same_sign = (resid[j-1] < 0.0 && resid[j] < 0.0) ||
                (resid[j-1] > 0.0 && resid[j] > 0.0);
    if (!same_sign)
      runs++;
  }

  return runs;
}
193 
194 /* ================================================== */
/* Return a boolean indicating whether we had enough points for
   regression.  Repeatedly fits a weighted least-squares line and
   drops the oldest sample until the residuals pass a runs test
   (i.e. look random rather than systematic), or until no more
   samples can be dropped. */

int
RGR_FindBestRegression
(double *x,                     /* independent variable */
 double *y,                     /* measured data */
 double *w,                     /* weightings (large => data
                                   less reliable) */

 int n,                         /* number of data points */
 int m,                         /* number of extra samples in x and y arrays
                                   (negative index) which can be used to
                                   extend runs test */
 int min_samples,               /* minimum number of samples to be kept after
                                   changing the starting index to pass the runs
                                   test */

 /* And now the results */

 double *b0,                    /* estimated y axis intercept */
 double *b1,                    /* estimated slope */
 double *s2,                    /* estimated variance of data points */

 double *sb0,                   /* estimated standard deviation of
                                   intercept */
 double *sb1,                   /* estimated standard deviation of
                                   slope */

 int *new_start,                /* the new starting index to make the
                                   residuals pass the two tests */

 int *n_runs,                   /* number of runs amongst the residuals */

 int *dof                       /* degrees of freedom in statistics (needed
                                   to get confidence intervals later) */

)
{
  double P, Q, U, V, W; /* total */
  double resid[MAX_POINTS * REGRESS_RUNS_RATIO];
  double ss;
  double a, b, u, ui, aa;

  int start, resid_start, nruns, npoints;
  int i;

  assert(n <= MAX_POINTS && m >= 0);
  assert(n * REGRESS_RUNS_RATIO < sizeof (critical_runs) / sizeof (critical_runs[0]));

  if (n < MIN_SAMPLES_FOR_REGRESS) {
    return 0;
  }

  start = 0;
  do {

    /* Weighted least-squares fit over samples [start, n), using the
       same method as RGR_WeightedRegression (weights are 1/w[i]) */
    W = U = 0;
    for (i=start; i<n; i++) {
      U += x[i]        / w[i];
      W += 1.0         / w[i];
    }

    u = U / W; /* weighted mean of x */

    P = Q = V = 0.0;
    for (i=start; i<n; i++) {
      ui = x[i] - u;
      P += y[i]        / w[i];
      Q += y[i] * ui   / w[i];
      V += ui   * ui   / w[i];
    }

    b = Q / V;             /* slope */
    a = (P / W) - (b * u); /* intercept */

    /* Get residuals also for the extra samples before start */
    resid_start = n - (n - start) * REGRESS_RUNS_RATIO;
    if (resid_start < -m)
      resid_start = -m;

    /* resid[] is indexed from 0, i.e. resid[i - resid_start]
       corresponds to sample i */
    for (i=resid_start; i<n; i++) {
      resid[i - resid_start] = y[i] - a - b*x[i];
    }

    /* Count number of runs */
    nruns = n_runs_from_residuals(resid, n - resid_start);

    /* Accept the fit if the residuals look sufficiently random (too
       few runs would indicate a systematic error, e.g. a frequency
       change), or if we cannot afford to drop any more samples */
    if (nruns > critical_runs[n - resid_start] ||
        n - start <= MIN_SAMPLES_FOR_REGRESS ||
        n - start <= min_samples) {
      if (start != resid_start) {
        /* Ignore extra samples in returned nruns */
        nruns = n_runs_from_residuals(resid + (start - resid_start), n - start);
      }
      break;
    } else {
      /* Try dropping one sample at a time until the runs test passes. */
      ++start;
    }

  } while (1);

  /* Work out statistics from full dataset */
  *b1 = b;
  *b0 = a;

  /* Weighted sum of squared residuals over the accepted samples */
  ss = 0.0;
  for (i=start; i<n; i++) {
    ss += resid[i - resid_start]*resid[i - resid_start] / w[i];
  }

  npoints = n - start;
  ss /= (double)(npoints - 2);        /* residual variance, npoints-2 DoF */
  *sb1 = sqrt(ss / V);                /* standard deviation of slope */
  aa = u * (*sb1);
  *sb0 = sqrt((ss / W) + (aa * aa));  /* standard deviation of intercept */
  *s2 = ss * (double) npoints / W;    /* weighted average of variances */

  *new_start = start;
  *dof = npoints - 2;
  *n_runs = nruns;

  return 1;

}
321 
322 /* ================================================== */
323 
/* Swap two values in place.  NOTE: relies on a variable named 'temp'
   of a compatible type being in scope at the point of use, and must
   only be used where a bare statement sequence is valid (it is not
   wrapped in do { } while (0)). */
#define EXCH(a,b) temp=(a); (a)=(b); (b)=temp
325 
326 /* ================================================== */
/* Find the index'th biggest element in the array x of n elements.
   flags is an array where a 1 indicates that the corresponding entry
   in x is known to be sorted into its correct position and a 0
   indicates that the corresponding entry is not sorted.  However, if
   flags[m] = flags[n] = 1 with m<n, then x[m] must be <= x[n] and for
   all i with m<i<n, x[m] <= x[i] <= x[n].  In practice, this means
   flags[] has to be the result of a previous call to this routine
   with the same array x, and is used to remember which parts of the
   x[] array we have already sorted.

   The approach used is a cut-down quicksort, where we only bother to
   keep sorting the partition that contains the index we are after.
   The approach comes from Numerical Recipes in C (ISBN
   0-521-43108-5). */

static double
find_ordered_entry_with_flags(double *x, int n, int index, char *flags)
{
  int u, v, l, r;
  double temp;                  /* scratch variable required by EXCH() */
  double piv;
  int pivind;

  assert(index >= 0);

  /* If this bit of the array is already sorted, simple! */
  if (flags[index]) {
    return x[index];
  }

  /* Find subrange to look at: grow [u,v] outwards from 'index' to the
     largest contiguous unsorted span, bounded by sorted sentinels */
  u = v = index;
  while (u > 0 && !flags[u]) u--;
  if (flags[u]) u++;

  while (v < (n-1) && !flags[v]) v++;
  if (flags[v]) v--;

  do {
    if (v - u < 2) {
      /* Down to one or two elements: order them directly and mark
         both as sorted */
      if (x[v] < x[u]) {
        EXCH(x[v], x[u]);
      }
      flags[v] = flags[u] = 1;
      return x[index];
    } else {
      /* Partition [u,v] around a pivot taken from the middle */
      pivind = (u + v) >> 1;
      EXCH(x[u], x[pivind]);
      piv = x[u]; /* New value */
      l = u + 1;
      r = v;
      do {
        while (l < v && x[l] < piv) l++;
        while (x[r] > piv) r--;
        if (r <= l) break;
        EXCH(x[l], x[r]);
        l++;
        r--;
      } while (1);
      EXCH(x[u], x[r]);
      flags[r] = 1; /* Pivot now in correct place */
      /* Iterate into whichever partition contains 'index' */
      if (index == r) {
        return x[r];
      } else if (index < r) {
        v = r - 1;
      } else if (index > r) {
        u = l;
      }
    }
  } while (1);
}
398 
399 /* ================================================== */
400 
#if 0
/* Not used, but this is how it can be done: selection with a
   freshly-zeroed flags array on every call (no memoization between
   calls, unlike find_median below) */
static double
find_ordered_entry(double *x, int n, int index)
{
  char flags[MAX_POINTS];

  memset(flags, 0, n * sizeof (flags[0]));
  return find_ordered_entry_with_flags(x, n, index, flags);
}
#endif
412 
413 /* ================================================== */
414 /* Find the median entry of an array x[] with n elements. */
415 
416 static double
find_median(double * x,int n)417 find_median(double *x, int n)
418 {
419   int k;
420   char flags[MAX_POINTS];
421 
422   memset(flags, 0, n * sizeof (flags[0]));
423   k = n>>1;
424   if (n&1) {
425     return find_ordered_entry_with_flags(x, n, k, flags);
426   } else {
427     return 0.5 * (find_ordered_entry_with_flags(x, n, k, flags) +
428                   find_ordered_entry_with_flags(x, n, k-1, flags));
429   }
430 }
431 
432 /* ================================================== */
433 
434 double
RGR_FindMedian(double * x,int n)435 RGR_FindMedian(double *x, int n)
436 {
437   double tmp[MAX_POINTS];
438 
439   assert(n > 0 && n <= MAX_POINTS);
440   memcpy(tmp, x, n * sizeof (tmp[0]));
441 
442   return find_median(tmp, n);
443 }
444 
445 /* ================================================== */
446 /* This function evaluates the equation
447 
448    \sum_{i=0}^{n-1} x_i sign(y_i - a - b x_i)
449 
450    and chooses the value of a that minimises the absolute value of the
451    result.  (See pp703-704 of Numerical Recipes in C). */
452 
453 static void
eval_robust_residual(double * x,double * y,int n,double b,double * aa,double * rr)454 eval_robust_residual
455 (double *x,                     /* The independent points */
456  double *y,                     /* The dependent points */
457  int n,                         /* Number of points */
458  double b,                      /* Slope */
459  double *aa,                    /* Intercept giving smallest absolute
460                                    value for the above equation */
461  double *rr                     /* Corresponding value of equation */
462 )
463 {
464   int i;
465   double a, res, del;
466   double d[MAX_POINTS];
467 
468   for (i=0; i<n; i++) {
469     d[i] = y[i] - b * x[i];
470   }
471 
472   a = find_median(d, n);
473 
474   res = 0.0;
475   for (i=0; i<n; i++) {
476     del = y[i] - a - b * x[i];
477     if (del > 0.0) {
478       res += x[i];
479     } else if (del < 0.0) {
480       res -= x[i];
481     }
482   }
483 
484   *aa = a;
485   *rr = res;
486 }
487 
488 /* ================================================== */
/* This routine performs a 'robust' regression, i.e. one which has low
   susceptibility to outliers amongst the data.  If one thinks of a
   normal (least squares) linear regression in 2D being analogous to
   the arithmetic mean in 1D, this algorithm in 2D is roughly
   analogous to the median in 1D.  This algorithm seems to work quite
   well until the number of outliers is approximately half the number
   of data points.

   The return value is a status indicating whether there were enough
   data points to run the routine or not. */

int
RGR_FindBestRobustRegression
(double *x,                     /* The independent axis points */
 double *y,                     /* The dependent axis points (which
                                   may contain outliers). */
 int n,                         /* The number of points */
 double tol,                    /* The tolerance required in
                                   determining the value of b1 */
 double *b0,                    /* The estimated Y-axis intercept */
 double *b1,                    /* The estimated slope */
 int *n_runs,                   /* The number of runs of residuals */
 int *best_start                /* The best starting index */
)
{
  int i;
  int start;
  int n_points;
  double a, b;
  double P, U, V, W, X;
  double resid, resids[MAX_POINTS];
  double blo, bhi, bmid, rlo, rhi, rmid;
  double s2, sb, incr;
  double mx, dx, my, dy;
  int nruns = 0;  /* returned as 0 if the loop exits at the minimum
                     number of points without running the runs test */

  assert(n <= MAX_POINTS);

  if (n < 2) {
    return 0;
  } else if (n == 2) {
    /* Just a straight line fit (we need this for the manual mode) */
    *b1 = (y[1] - y[0]) / (x[1] - x[0]);
    *b0 = y[0] - (*b1) * x[0];
    *n_runs = 0;
    *best_start = 0;
    return 1;
  }

  /* else at least 3 points, apply normal algorithm */

  start = 0;

  /* Loop to strip oldest points that cause the regression residuals
     to fail the number of runs test */
  do {

    n_points = n - start;

    /* Use standard least squares regression to get starting estimate */

    P = U = 0.0;
    for (i=start; i<n; i++) {
      P += y[i];
      U += x[i];
    }

    W = (double) n_points;  /* unweighted: each point has weight 1 */

    my = P/W;  /* mean of y */
    mx = U/W;  /* mean of x */

    X = V = 0.0;
    for (i=start; i<n; i++) {
      dy = y[i] - my;
      dx = x[i] - mx;
      X += dy * dx;
      V += dx * dx;
    }

    b = X / V;       /* least-squares slope */
    a = my - b*mx;   /* least-squares intercept */

    /* Sum of squared residuals of the least-squares fit, used only to
       estimate the scale of the bracketing interval below */
    s2 = 0.0;
    for (i=start; i<n; i++) {
      resid = y[i] - a - b * x[i];
      s2 += resid * resid;
    }

    /* Need to expand range of b to get a root in the interval.
       Estimate standard deviation of b and expand range about b based
       on that. */
    sb = sqrt(s2 * W/V);
    incr = MAX(sb, tol);

    do {
      incr *= 2.0;

      /* Give up if the interval is too large */
      if (incr > 100.0)
        return 0;

      blo = b - incr;
      bhi = b + incr;

      /* We don't want 'a' yet */
      eval_robust_residual(x + start, y + start, n_points, blo, &a, &rlo);
      eval_robust_residual(x + start, y + start, n_points, bhi, &a, &rhi);

    } while (rlo * rhi >= 0.0); /* fn vals have same sign or one is zero,
                                   i.e. root not in interval (rlo, rhi). */

    /* OK, so the root for b lies in (blo, bhi). Start bisecting */
    do {
      bmid = 0.5 * (blo + bhi);
      /* Stop if the midpoint is no longer strictly inside the
         interval (floating-point resolution exhausted) */
      if (!(blo < bmid && bmid < bhi))
        break;
      eval_robust_residual(x + start, y + start, n_points, bmid, &a, &rmid);
      if (rmid == 0.0) {
        break;
      } else if (rmid * rlo > 0.0) {
        blo = bmid;
        rlo = rmid;
      } else if (rmid * rhi > 0.0) {
        bhi = bmid;
        rhi = rmid;
      } else {
        assert(0);
      }
    } while (bhi - blo > tol);

    /* 'a' holds the intercept from the last eval_robust_residual call */
    *b0 = a;
    *b1 = bmid;

    /* Number of runs test, but not if we're already down to the
       minimum number of points */
    if (n_points == MIN_SAMPLES_FOR_REGRESS) {
      break;
    }

    for (i=start; i<n; i++) {
      resids[i] = y[i] - a - bmid * x[i];
    }

    nruns = n_runs_from_residuals(resids + start, n_points);

    /* Accept if the residuals look sufficiently random, otherwise
       drop the oldest sample and refit */
    if (nruns > critical_runs[n_points]) {
      break;
    } else {
      start++;
    }

  } while (1);

  *n_runs = nruns;
  *best_start = start;

  return 1;

}
649 
650 /* ================================================== */
/* This routine performs linear regression with two independent variables.
   It returns non-zero status if there were enough data points and there
   was a solution. */

int
RGR_MultipleRegress
(double *x1,                    /* first independent variable */
 double *x2,                    /* second independent variable */
 double *y,                     /* measured data */

 int n,                         /* number of data points */

 /* The results */
 double *b2                     /* estimated second slope */
                                /* other values are not needed yet */
)
{
  double sx1, sx2, sx1x1, sx1x2, sx2x2, sx1y, sx2y, sy;
  double num, den, d1, d2, d3;
  int i;

  /* A fit with an intercept and two slopes needs at least 4 points */
  if (n < 4)
    return 0;

  /* Accumulate the sums needed by the normal equations */
  sx1 = sx2 = sx1x1 = sx1x2 = sx2x2 = sx1y = sx2y = sy = 0.0;
  for (i = 0; i < n; i++) {
    sx1 += x1[i];
    sx2 += x2[i];
    sx1x1 += x1[i] * x1[i];
    sx1x2 += x1[i] * x2[i];
    sx2x2 += x2[i] * x2[i];
    sx1y += x1[i] * y[i];
    sx2y += x2[i] * y[i];
    sy += y[i];
  }

  /* Closed-form expression for the second slope */
  num = n * (sx1x2 * sx1y - sx1x1 * sx2y) +
        sx1 * sx1 * sx2y - sx1 * sx2 * sx1y +
        sy * (sx2 * sx1x1 - sx1 * sx1x2);

  d1 = n * (sx1x2 * sx1x2 - sx1x1 * sx2x2);
  d2 = sx1 * sx1 * sx2x2 + sx2 * sx2 * sx1x1;
  d3 = -2.0 * sx1 * sx2 * sx1x2;
  den = d1 + d2 + d3;

  /* Check if there is a (numerically stable) solution: reject cases
     where the denominator is dominated by cancellation of its terms */
  if (fabs(den) * 1.0e10 <= -d1 + d2 + fabs(d3))
    return 0;

  *b2 = num / den;

  return 1;
}
705