/*
 * $Id: regress.i,v 1.1 2005-09-18 22:06:18 dhmunro Exp $
 * linear regression and related statistical functions
 * after Numerical Recipes (Press et al.) chapter 14
 */
/* Copyright (c) 2005, The Regents of the University of California.
 * All rights reserved.
 * This file is part of yorick (http://yorick.sourceforge.net).
 * Read the accompanying LICENSE file for details.
 */

func regress(y, x, &axes, &vals, &chi2, sigy=, rcond=)
/* DOCUMENT model = regress(y, [x1,x2,x3,...])
 *       or model = regress(y, [x1,x2,x3,...], axes, vals, chi2)
 *
 *   Performs linear regression analysis, that is, a linear least squares
 *   fit of the parametric model
 *      y = model(1)*x1 + model(2)*x2 + model(3)*x3 + ...
 *        = model(+) * [x1,x2,x3,...](..,+)
 *   to the supplied data Y, [X1,X2,X3,...].  Y is an array, and the
 *   Xi arrays should have the same dimensions as Y, or more accurately
 *   [X1,X2,X3,...] must have one trailing dimension beyond the dimensions
 *   of Y; the length of the trailing dimension is the number of parameters
 *   in the model.
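 *
 *   For instance (a sketch; x here stands for a vector of abscissas),
 *   the two-parameter straight line model can be set up explicitly as
 *     xs = [array(1., dimsof(x)), x];   (trailing dimension of length 2)
 *     model = regress(y, xs);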
 *
 *   Use the sigy= keyword to pass in the standard deviations of the Y
 *   values; sigy must be conformable with Y.  The returned model has
 *   minimum chi2, where
 *     chi2 = sum( (y - model(+) * [x1,x2,x3,...](..,+))^2 / sigy^2 )
 *   The default is sigy=1, that is, all points have equal weight in chi2.
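 *
 *   For example (a sketch, assuming measurement errors dy conformable
 *   with y), a weighted straight line fit is
 *     ab = regress(y, [1.,x], sigy=dy);
 *   which weights each point by 1/dy^2 in the minimized chi2.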
 *
 *   The optional AXES, VALS, and CHI2 arguments are outputs.  AXES are
 *   the axes of the error ellipsoid of the fitted parameters, and VALS
 *   are the corresponding singular values.  AXES(i,) is the vector in
 *   the space of model() corresponding to VALS(i).  The VALS are
 *   non-negative and arranged in descending order.  CHI2 is the
 *   chi2 value for the returned model, divided by the number of
 *   degrees of freedom in the fit, numberof(y)-sum(vals>rcond*vals(1)).
 *   AXES(i,) are mutually orthogonal unit vectors; for this to make
 *   sense, the relative scales of X and Y matter.  Usually this means
 *   scaling X and Y so that the magnitudes of the elements of the
 *   returned model do not differ too wildly.
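 *
 *   For instance (a sketch), to retrieve all of the outputs:
 *     ab = regress(y, [1.,x], axes, vals, chi2);
 *   after which vals(1)/vals(0) measures the conditioning of the fit
 *   (vals(0), the last and smallest value, comes out negative if it
 *   was discarded, as described next).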
 *
 *   Often and unexpectedly, the data do not permit a definitive choice
 *   of the model(); there may be entire subspaces of the model space
 *   which produce indistinguishably good fits to the data.  The signature
 *   of this problem is that some of the VALS may be zero or very small.
 *   The rcond= keyword permits you to specify how small a singular
 *   value, relative to the largest, VALS(1), you are willing to consider.
 *   The AXES(i,) corresponding to VALS(i) smaller than rcond*VALS(1)
 *   will be given zero contribution to the returned model().  The default
 *   value is rcond=1.e-9.  If any of the returned VALS is less than
 *   rcond*VALS(1), you should either change the scales of Y or some X,
 *   or remove some of the X (the ones which aren't contributing) to
 *   eliminate the problem.  The regress function reverses the sign of
 *   any VALS which are less than rcond*VALS(1), so you can quickly
 *   identify them.
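 *
 *   A minimal check for this degeneracy (a sketch, with xs standing
 *   for your [x1,x2,...] array):
 *     ab = regress(y, xs, axes, vals);
 *     if (anyof(vals < 0.)) error, "degenerate fit: rescale or prune xs";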
 *
 *   Examples:
 *     ab = regress(y, [1,x]);
 *       ab(1)+ab(2)*x   is best fit line to y(x)
 *     ab = regress(y, [1,x,x^2,x^3]);
 *       poly(x,ab(1),ab(2),ab(3),ab(4))   is best fit cubic to y(x)
 *     ab = regress(y, [cos(x),sin(x)]);
 *       ab(1)*cos(x)+ab(2)*sin(x)   is best fit period 2*pi sine wave to y(x)
 *
 * SEE ALSO: regress_cov
 */
{
  /* flatten y; x becomes 2D with trailing dimension = number of params */
  y = double(y(*));
  x = double(x(*,));
  dims = dimsof(x);
  n = numberof(y);
  if (dims(1)!=2 || dims(2)!=n) error, "bad x dimensions";
  /* convert standard deviations into weights 1/sigy */
  if (is_void(sigy)) sigy = 1. + 0.*y;
  else sigy = 1./sigy(*);
  if (is_void(rcond)) rcond = 1.e-9;
  else rcond = min(max(rcond,1.e-13),0.5);
  /* weight data, then decompose x = u(,+)*(vals*axes)(+,) */
  x *= sigy;
  y *= sigy;
  local u;
  vals = SVdec(x, u, axes);
  u = y(+) * u(+,);               /* project y onto left singular vectors */
  mask = vals > rcond*vals(1);    /* keep only well-conditioned directions */
  np = sum(mask);
  rvals = mask / (vals + !mask);  /* pseudo-inverse of the singular values */
  if (np < numberof(mask)) vals(where(!mask)) *= -1.;  /* flag discards */
  u *= rvals;
  model = u(+) * axes(+,);
  /* chi2 per remaining degree of freedom for the returned model */
  chi2 = model(+)*x(,+) - y;
  chi2 = sum(chi2*chi2) / max(n - np, 1);
  return model;
}
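
/* Quick usage check (a sketch): a noiseless parabola should recover
 * model close to [1., 2., 3.]:
 *   x = span(0, 1, 101);
 *   model = regress(1. + 2.*x + 3.*x^2, [1., x, x^2]);
 */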

func regress_cov(axes, vals)
/* DOCUMENT cov = regress_cov(axes, vals)
 *       or cov = chi2 * regress_cov(axes, vals)
 *
 *   Return the covariance matrix for the model returned by regress,
 *   where AXES and VALS are the values returned by regress.  This
 *   is a symmetric matrix representing
 *     covariance(model(i),model(j))
 *   that is, the variances (on the diagonal i=j) and correlations
 *   of the model parameters.
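 *
 *   In terms of the singular value decomposition (a sketch of what the
 *   code below computes),
 *     cov(j,k) = sum over i of AXES(i,j)*AXES(i,k)/VALS(i)^2
 *   where directions with discarded (sign-flipped) VALS contribute zero.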
 *
 *   If you did not specify the sigy= keyword in the original call to
 *   regress, then the absolute magnitudes of the covariance matrix
 *   elements do not mean much.  In case you wish to use the quality
 *   of the fit itself as an estimate of the errors sigy in the
 *   original Y values, assuming they are all equal, you can multiply
 *   by the CHI2 returned by regress, as in the second form above,
 *   in order to produce an estimated error in the fit.  See Numerical
 *   Recipes for caveats, but this is what commercial statistics
 *   software generally does.
 *
 *   cov(1::numberof(vals)+1) are the variances of the model parameters
 *   themselves.
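 *
 *   For instance (a sketch, with xs standing for the [x1,x2,...] array),
 *   one-sigma error bars for the fitted parameters can be estimated as
 *     ab = regress(y, xs, axes, vals, chi2);
 *     cov = chi2 * regress_cov(axes, vals);
 *     err = sqrt(cov(1::numberof(vals)+1));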
 *
 * SEE ALSO: regress
 */
{
  /* vals flagged negative by regress are discarded; invert the rest */
  mask = vals > 0.;
  axes *= mask / (vals + !mask);
  return axes(+,) * axes(+,);
}