1 /* Foundation and miscellenea for the statistics modules.
2  *
3  * Contents:
4  *   1. Summary statistics (means, variances)
5  *   2. Special functions
6  *   3. Standard statistical tests
7  *   4. Data fitting.
8  *   5. Unit tests.
9  *   6. Test driver.
10  *   7. Examples.
11  *      - driver for linear regression
12  *      - driver for G-test
13  */
14 #include "esl_config.h"
15 
16 #include <math.h>
17 
18 #include "easel.h"
19 #include "esl_stats.h"
20 
21 
22 /*****************************************************************
23  * 1. Summary statistics calculations (means, variances)
24  *****************************************************************/
25 
26 /* Function:  esl_stats_DMean()
27  * Synopsis:  Calculates mean and $\sigma^2$ for samples $x_i$.
28  *
29  * Purpose:   Calculates the sample mean and $s^2$, the unbiased
30  *            estimator of the population variance, for a
31  *            sample of <n> numbers <x[0]..x[n-1]>, and optionally
32  *            returns either or both through <ret_mean> and
33  *            <ret_var>.
34  *
35  *            <esl_stats_FMean()> and <esl_stats_IMean()> do the same,
36  *            for float and integer vectors.
37  *
38  * Args:      x        - samples x[0]..x[n-1]
39  *            n        - number of samples
40  *            opt_mean - optRETURN: mean
41  *            opt_var  - optRETURN: estimate of population variance
42  *
43  * Returns:   <eslOK> on success.
44  */
45 int
esl_stats_DMean(const double * x,int n,double * opt_mean,double * opt_var)46 esl_stats_DMean(const double *x, int n, double *opt_mean, double *opt_var)
47 {
48   double sum   = 0.;
49   double sqsum = 0.;
50   int i;
51 
52   for (i = 0; i < n; i++)
53     {
54       sum   += x[i];
55       sqsum += x[i]*x[i];
56     }
57   if (opt_mean != NULL)  *opt_mean = sum / (double) n;
58   if (opt_var  != NULL)  *opt_var  = (sqsum - sum*sum/(double)n) / ((double)n-1);
59   return eslOK;
60 }
61 int
esl_stats_FMean(const float * x,int n,double * opt_mean,double * opt_var)62 esl_stats_FMean(const float *x, int n, double *opt_mean, double *opt_var)
63 {
64   double sum   = 0.;
65   double sqsum = 0.;
66   int i;
67 
68   for (i = 0; i < n; i++)
69     {
70       sum   += x[i];
71       sqsum += x[i]*x[i];
72     }
73   if (opt_mean != NULL)  *opt_mean = sum / (double) n;
74   if (opt_var  != NULL)  *opt_var  = (sqsum - sum*sum/(double)n) / ((double)n-1);
75   return eslOK;
76 }
77 int
esl_stats_IMean(const int * x,int n,double * opt_mean,double * opt_var)78 esl_stats_IMean(const int *x, int n, double *opt_mean, double *opt_var)
79 {
80   double sum   = 0.;
81   double sqsum = 0.;
82   int i;
83 
84   for (i = 0; i < n; i++)
85     {
86       sum   += x[i];
87       sqsum += x[i]*x[i];
88     }
89   if (opt_mean != NULL)  *opt_mean = sum / (double) n;
90   if (opt_var  != NULL)  *opt_var  = (sqsum - sum*sum/(double)n) / ((double)n-1);
91   return eslOK;
92 }
93 /*--------------- end, summary statistics -----------------------*/
94 
95 
96 
97 /*****************************************************************
98  * 2. Special functions.
99  *****************************************************************/
100 
101 /* Function:  esl_stats_LogGamma()
102  * Synopsis:  Calculates $\log \Gamma(x)$.
103  *
104  * Purpose:   Returns natural log of $\Gamma(x)$, for $x > 0$.
105  *
106  * Credit:    Adapted from a public domain implementation in the
107  *            NCBI core math library. Thanks to John Spouge and
108  *            the NCBI. (According to NCBI, that's Dr. John
109  *            "Gammas Galore" Spouge to you, pal.)
110  *
111  * Args:      x          : argument, x > 0.0
112  *            ret_answer : RETURN: the answer
113  *
114  * Returns:   Put the answer in <ret_answer>; returns <eslOK>.
115  *
116  * Throws:    <eslERANGE> if $x <= 0$.
117  */
118 int
esl_stats_LogGamma(double x,double * ret_answer)119 esl_stats_LogGamma(double x, double *ret_answer)
120 {
121   int i;
122   double xx, tx;
123   double tmp, value;
124   static double cof[11] = {
125      4.694580336184385e+04,
126     -1.560605207784446e+05,
127      2.065049568014106e+05,
128     -1.388934775095388e+05,
129      5.031796415085709e+04,
130     -9.601592329182778e+03,
131      8.785855930895250e+02,
132     -3.155153906098611e+01,
133      2.908143421162229e-01,
134     -2.319827630494973e-04,
135      1.251639670050933e-10
136   };
137 
138   /* Protect against invalid x<=0  */
139   if (x <= 0.0)  ESL_EXCEPTION(eslERANGE, "invalid x <= 0 in esl_stats_LogGamma()");
140 
141   xx       = x - 1.0;
142   tx = tmp = xx + 11.0;
143   value    = 1.0;
144   for (i = 10; i >= 0; i--)	/* sum least significant terms first */
145     {
146       value += cof[i] / tmp;
147       tmp   -= 1.0;
148     }
149   value  = log(value);
150   tx    += 0.5;
151   value += 0.918938533 + (xx+0.5)*log(tx) - tx;
152   *ret_answer = value;
153   return eslOK;
154 }
155 
156 
157 /* Function:  esl_stats_Psi()
158  * Synopsis:  Calculates $\Psi(x)$ (the digamma function).
159  *
160  * Purpose:   Computes $\Psi(x)$ (the "digamma" function), which is
161  *            the derivative of log of the Gamma function:
162  *            $d/dx \log \Gamma(x) = \frac{\Gamma'(x)}{\Gamma(x)} = \Psi(x)$.
163  *            Argument $x$ is $> 0$.
164  *
165  *            This is J.M. Bernardo's "Algorithm AS103",
166  *            Appl. Stat. 25:315-317 (1976).
167  */
168 int
esl_stats_Psi(double x,double * ret_answer)169 esl_stats_Psi(double x, double *ret_answer)
170 {
171   double answer = 0.;
172   double x2;
173 
174   if (x <= 0.0) ESL_EXCEPTION(eslERANGE, "invalid x <= 0 in esl_stats_Psi()");
175 
176   /* For small x, Psi(x) ~= -0.5772 - 1/x + O(x), we're done.
177    */
178   if (x <= 1e-5) {
179     *ret_answer = -eslCONST_EULER - 1./x;
180     return eslOK;
181   }
182 
183   /* For medium x, use Psi(1+x) = \Psi(x) + 1/x to c.o.v. x,
184    * big enough for Stirling approximation to work...
185    */
186   while (x < 8.5) {
187     answer = answer - 1./x;
188     x += 1.;
189   }
190 
191   /* For large X, use Stirling approximation
192    */
193   x2 = 1./x;
194   answer += log(x) - 0.5 * x2;
195   x2 = x2*x2;
196   answer -= (1./12.)*x2;
197   answer += (1./120.)*x2*x2;
198   answer -= (1./252.)*x2*x2*x2;
199 
200   *ret_answer = answer;
201   return eslOK;
202 }
203 
204 
205 
206 /* Function: esl_stats_IncompleteGamma()
207  * Synopsis: Calculates the incomplete Gamma function.
208  *
209  * Purpose:  Returns $P(a,x)$ and $Q(a,x)$ where:
210  *
211  *           \begin{eqnarray*}
212  *             P(a,x) & = & \frac{1}{\Gamma(a)} \int_{0}^{x} t^{a-1} e^{-t} dt \\
213  *                    & = & \frac{\gamma(a,x)}{\Gamma(a)} \\
214  *             Q(a,x) & = & \frac{1}{\Gamma(a)} \int_{x}^{\infty} t^{a-1} e^{-t} dt\\
215  *                    & = & 1 - P(a,x) \\
216  *           \end{eqnarray*}
217  *
218  *           $P(a,x)$ is the CDF of a gamma density with $\lambda = 1$,
219  *           and $Q(a,x)$ is the survival function.
220  *
221  *           For $x \simeq 0$, $P(a,x) \simeq 0$ and $Q(a,x) \simeq 1$; and
222  *           $P(a,x)$ is less prone to roundoff error.
223  *
224  *           The opposite is the case for large $x >> a$, where
225  *           $P(a,x) \simeq 1$ and $Q(a,x) \simeq 0$; there, $Q(a,x)$ is
226  *           less prone to roundoff error.
227  *
228  * Method:   Based on ideas from Numerical Recipes in C, Press et al.,
229  *           Cambridge University Press, 1988.
230  *
231  * Args:     a          - for instance, degrees of freedom / 2     [a > 0]
232  *           x          - for instance, chi-squared statistic / 2  [x >= 0]
233  *           ret_pax    - RETURN: P(a,x)
234  *           ret_qax    - RETURN: Q(a,x)
235  *
236  * Return:   <eslOK> on success.
237  *
238  * Throws:   <eslERANGE> if <a> or <x> is out of accepted range.
239  *           <eslENOHALT> if approximation fails to converge.
240  */
241 int
esl_stats_IncompleteGamma(double a,double x,double * ret_pax,double * ret_qax)242 esl_stats_IncompleteGamma(double a, double x, double *ret_pax, double *ret_qax)
243 {
244   int    iter;			/* iteration counter */
245   double pax;			/* P(a,x) */
246   double qax;			/* Q(a,x) */
247   int    status;
248 
249   if (a <= 0.) ESL_EXCEPTION(eslERANGE, "esl_stats_IncompleteGamma(): a must be > 0");
250   if (x <  0.) ESL_EXCEPTION(eslERANGE, "esl_stats_IncompleteGamma(): x must be >= 0");
251 
252   /* For x > a + 1 the following gives rapid convergence;
253    * calculate Q(a,x) = \frac{\Gamma(a,x)}{\Gamma(a)},
254    * using a continued fraction development for \Gamma(a,x).
255    */
256   if (x > a+1)
257     {
258       double oldp;		/* previous value of p    */
259       double nu0, nu1;		/* numerators for continued fraction calc   */
260       double de0, de1;		/* denominators for continued fraction calc */
261 
262       nu0 = 0.;			/* A_0 = 0       */
263       de0 = 1.;			/* B_0 = 1       */
264       nu1 = 1.;			/* A_1 = 1       */
265       de1 = x;			/* B_1 = x       */
266 
267       oldp = nu1;
268       for (iter = 1; iter < 100; iter++)
269 	{
270 	  /* Continued fraction development:
271 	   * set A_j = b_j A_j-1 + a_j A_j-2
272 	   *     B_j = b_j B_j-1 + a_j B_j-2
273            * We start with A_2, B_2.
274 	   */
275 				/* j = even: a_j = iter-a, b_j = 1 */
276 				/* A,B_j-2 are in nu0, de0; A,B_j-1 are in nu1,de1 */
277 	  nu0 = nu1 + ((double)iter - a) * nu0;
278 	  de0 = de1 + ((double)iter - a) * de0;
279 				/* j = odd: a_j = iter, b_j = x */
280 				/* A,B_j-2 are in nu1, de1; A,B_j-1 in nu0,de0 */
281 	  nu1 = x * nu0 + (double) iter * nu1;
282 	  de1 = x * de0 + (double) iter * de1;
283 				/* rescale */
284 	  if (de1 != 0.)
285 	    {
286 	      nu0 /= de1;
287 	      de0 /= de1;
288 	      nu1 /= de1;
289 	      de1 =  1.;
290 	    }
291 				/* check for convergence */
292 	  if (fabs((nu1-oldp)/nu1) < 1.e-7)
293 	    {
294 	      if ((status = esl_stats_LogGamma(a, &qax)) != eslOK) return status;
295 	      qax = nu1 * exp(a * log(x) - x - qax);
296 
297 	      if (ret_pax != NULL) *ret_pax = 1 - qax;
298 	      if (ret_qax != NULL) *ret_qax = qax;
299 	      return eslOK;
300 	    }
301 
302 	  oldp = nu1;
303 	}
304       ESL_EXCEPTION(eslENOHALT,
305 		"esl_stats_IncompleteGamma(): fraction failed to converge");
306     }
307   else /* x <= a+1 */
308     {
309       double p;			/* current sum               */
310       double val;		/* current value used in sum */
311 
312       /* For x <= a+1 we use a convergent series instead:
313        *   P(a,x) = \frac{\gamma(a,x)}{\Gamma(a)},
314        * where
315        *   \gamma(a,x) = e^{-x}x^a \sum_{n=0}{\infty} \frac{\Gamma{a}}{\Gamma{a+1+n}} x^n
316        * which looks appalling but the sum is in fact rearrangeable to
317        * a simple series without the \Gamma functions:
318        *   = \frac{1}{a} + \frac{x}{a(a+1)} + \frac{x^2}{a(a+1)(a+2)} ...
319        * and it's obvious that this should converge nicely for x <= a+1.
320        */
321       p = val = 1. / a;
322       for (iter = 1; iter < 10000; iter++)
323 	{
324 	  val *= x / (a+(double)iter);
325 	  p   += val;
326 
327 	  if (fabs(val/p) < 1.e-7)
328 	    {
329 	      if ((status = esl_stats_LogGamma(a, &pax)) != eslOK) return status;
330 	      pax = p * exp(a * log(x) - x - pax);
331 
332 	      if (ret_pax != NULL) *ret_pax = pax;
333 	      if (ret_qax != NULL) *ret_qax = 1. - pax;
334 	      return eslOK;
335 	    }
336 	}
337       ESL_EXCEPTION(eslENOHALT,
338 		"esl_stats_IncompleteGamma(): series failed to converge");
339     }
340   /*NOTREACHED*/
341   return eslOK;
342 }
343 
344 
345 /* Function:  esl_stats_erfc()
346  * Synopsis:  Complementary error function.
347  *
348  * Purpose:   Calculate and return the complementary error function,
349  *            erfc(x).
350  *
351  *            erfc(x) is mandated by the ANSI C99 standard but that
352  *            doesn't mean it's available on supposedly modern systems
353  *            (looking at you here, Microsoft).
354  *
355  *            Used for cumulative distribution function calculations
356  *            for the normal (Gaussian) distribution. See <esl_normal>
357  *            module.
358  *
359  *            erfc(-inf) = 2.0
360  *            erfc(0)    = 1.0
361  *            erfc(inf)  = 0.0
362  *            erfc(NaN)  = NaN
363  *
364  * Args:      x : any real-numbered value -inf..inf
365  *
366  * Returns:   erfc(x)
367  *
368  * Throws:    (no abnormal error conditions)
369  *
370  * Source:
371  *    Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
372  *    Developed at SunPro, a Sun Microsystems, Inc. business.
373  *    Permission to use, copy, modify, and distribute this software is
374  *    freely granted, provided that this notice is preserved.
375  *    [as posted by eggcrook at stackexchange.com, 21 Dec 2012]
376  *
377  *    Removed arcane incantations for runtime detection of endianness,
378  *    and for treating IEEE754 doubles as two adjacent uint32_t;
379  *    replaced with ANSI-compliant macros and compile-time detection
380  *    of endianness. [Apr 2015]
381  */
382 double
esl_stats_erfc(double x)383 esl_stats_erfc(double x)
384 {
385   static const double tiny = 1e-300;
386   static const double half = 5.00000000000000000000e-01; /* 0x3FE00000, 0x00000000 */
387   static const double one  = 1.00000000000000000000e+00; /* 0x3FF00000, 0x00000000 */
388   static const double two  = 2.00000000000000000000e+00; /* 0x40000000, 0x00000000 */
389   static const double erx  = 8.45062911510467529297e-01; /* 0x3FEB0AC1, 0x60000000 */
390   /*
391    * Coefficients for approximation to erf on [0,0.84375]
392    */
393   static const double pp0 =  1.28379167095512558561e-01; /* 0x3FC06EBA, 0x8214DB68 */
394   static const double pp1 = -3.25042107247001499370e-01; /* 0xBFD4CD7D, 0x691CB913 */
395   static const double pp2 = -2.84817495755985104766e-02; /* 0xBF9D2A51, 0xDBD7194F */
396   static const double pp3 = -5.77027029648944159157e-03; /* 0xBF77A291, 0x236668E4 */
397   static const double pp4 = -2.37630166566501626084e-05; /* 0xBEF8EAD6, 0x120016AC */
398   static const double qq1 =  3.97917223959155352819e-01; /* 0x3FD97779, 0xCDDADC09 */
399   static const double qq2 =  6.50222499887672944485e-02; /* 0x3FB0A54C, 0x5536CEBA */
400   static const double qq3 =  5.08130628187576562776e-03; /* 0x3F74D022, 0xC4D36B0F */
401   static const double qq4 =  1.32494738004321644526e-04; /* 0x3F215DC9, 0x221C1A10 */
402   static const double qq5 = -3.96022827877536812320e-06; /* 0xBED09C43, 0x42A26120 */
403   /*
404    * Coefficients for approximation to erf in [0.84375,1.25]
405    */
406   static const double pa0 = -2.36211856075265944077e-03; /* 0xBF6359B8, 0xBEF77538 */
407   static const double pa1 =  4.14856118683748331666e-01; /* 0x3FDA8D00, 0xAD92B34D */
408   static const double pa2 = -3.72207876035701323847e-01; /* 0xBFD7D240, 0xFBB8C3F1 */
409   static const double pa3 =  3.18346619901161753674e-01; /* 0x3FD45FCA, 0x805120E4 */
410   static const double pa4 = -1.10894694282396677476e-01; /* 0xBFBC6398, 0x3D3E28EC */
411   static const double pa5 =  3.54783043256182359371e-02; /* 0x3FA22A36, 0x599795EB */
412   static const double pa6 = -2.16637559486879084300e-03; /* 0xBF61BF38, 0x0A96073F */
413   static const double qa1 =  1.06420880400844228286e-01; /* 0x3FBB3E66, 0x18EEE323 */
414   static const double qa2 =  5.40397917702171048937e-01; /* 0x3FE14AF0, 0x92EB6F33 */
415   static const double qa3 =  7.18286544141962662868e-02; /* 0x3FB2635C, 0xD99FE9A7 */
416   static const double qa4 =  1.26171219808761642112e-01; /* 0x3FC02660, 0xE763351F */
417   static const double qa5 =  1.36370839120290507362e-02; /* 0x3F8BEDC2, 0x6B51DD1C */
418   static const double qa6 =  1.19844998467991074170e-02; /* 0x3F888B54, 0x5735151D */
419   /*
420    * Coefficients for approximation to erfc in [1.25,1/0.35]
421    */
422   static const double ra0 = -9.86494403484714822705e-03; /* 0xBF843412, 0x600D6435 */
423   static const double ra1 = -6.93858572707181764372e-01; /* 0xBFE63416, 0xE4BA7360 */
424   static const double ra2 = -1.05586262253232909814e+01; /* 0xC0251E04, 0x41B0E726 */
425   static const double ra3 = -6.23753324503260060396e+01; /* 0xC04F300A, 0xE4CBA38D */
426   static const double ra4 = -1.62396669462573470355e+02; /* 0xC0644CB1, 0x84282266 */
427   static const double ra5 = -1.84605092906711035994e+02; /* 0xC067135C, 0xEBCCABB2 */
428   static const double ra6 = -8.12874355063065934246e+01; /* 0xC0545265, 0x57E4D2F2 */
429   static const double ra7 = -9.81432934416914548592e+00; /* 0xC023A0EF, 0xC69AC25C */
430   static const double sa1 =  1.96512716674392571292e+01; /* 0x4033A6B9, 0xBD707687 */
431   static const double sa2 =  1.37657754143519042600e+02; /* 0x4061350C, 0x526AE721 */
432   static const double sa3 =  4.34565877475229228821e+02; /* 0x407B290D, 0xD58A1A71 */
433   static const double sa4 =  6.45387271733267880336e+02; /* 0x40842B19, 0x21EC2868 */
434   static const double sa5 =  4.29008140027567833386e+02; /* 0x407AD021, 0x57700314 */
435   static const double sa6 =  1.08635005541779435134e+02; /* 0x405B28A3, 0xEE48AE2C */
436   static const double sa7 =  6.57024977031928170135e+00; /* 0x401A47EF, 0x8E484A93 */
437   static const double sa8 = -6.04244152148580987438e-02; /* 0xBFAEEFF2, 0xEE749A62 */
438   /*
439    * Coefficients for approximation to erfc in [1/.35,28]
440    */
441   static const double rb0 = -9.86494292470009928597e-03; /* 0xBF843412, 0x39E86F4A */
442   static const double rb1 = -7.99283237680523006574e-01; /* 0xBFE993BA, 0x70C285DE */
443   static const double rb2 = -1.77579549177547519889e+01; /* 0xC031C209, 0x555F995A */
444   static const double rb3 = -1.60636384855821916062e+02; /* 0xC064145D, 0x43C5ED98 */
445   static const double rb4 = -6.37566443368389627722e+02; /* 0xC083EC88, 0x1375F228 */
446   static const double rb5 = -1.02509513161107724954e+03; /* 0xC0900461, 0x6A2E5992 */
447   static const double rb6 = -4.83519191608651397019e+02; /* 0xC07E384E, 0x9BDC383F */
448   static const double sb1 =  3.03380607434824582924e+01; /* 0x403E568B, 0x261D5190 */
449   static const double sb2 =  3.25792512996573918826e+02; /* 0x40745CAE, 0x221B9F0A */
450   static const double sb3 =  1.53672958608443695994e+03; /* 0x409802EB, 0x189D5118 */
451   static const double sb4 =  3.19985821950859553908e+03; /* 0x40A8FFB7, 0x688C246A */
452   static const double sb5 =  2.55305040643316442583e+03; /* 0x40A3F219, 0xCEDF3BE6 */
453   static const double sb6 =  4.74528541206955367215e+02; /* 0x407DA874, 0xE79FE763 */
454   static const double sb7 = -2.24409524465858183362e+01; /* 0xC03670E2, 0x42712D62 */
455 
456   int hx,ix;
457   double R,S,P,Q,s,y,z,r;
458 
459   ESL_GET_HIGHWORD(hx, x);  // SRE: replaced original Sun incantation here.
460   ix = hx & 0x7fffffff;
461   if (ix>=0x7ff00000) /* erfc(nan)=nan; erfc(+-inf)=0,2 */
462     return (double)(((unsigned)hx>>31)<<1)+one/x;
463 
464   if (ix < 0x3feb0000)  /* |x|<0.84375 */
465     {
466       if (ix < 0x3c700000) return one-x; /* |x|<2**-56 */
467       z = x*x;
468       r = pp0+z*(pp1+z*(pp2+z*(pp3+z*pp4)));
469       s = one+z*(qq1+z*(qq2+z*(qq3+z*(qq4+z*qq5))));
470       y = r/s;
471       if (hx < 0x3fd00000) /* x<1/4 */
472 	{
473 	  return one-(x+x*y);
474 	}
475       else
476 	{
477 	  r = x*y;
478 	  r += (x-half);
479 	  return half - r ;
480 	}
481     }
482 
483   if (ix < 0x3ff40000) /* 0.84375 <= |x| < 1.25 */
484     {
485       s = fabs(x)-one;
486       P = pa0+s*(pa1+s*(pa2+s*(pa3+s*(pa4+s*(pa5+s*pa6)))));
487       Q = one+s*(qa1+s*(qa2+s*(qa3+s*(qa4+s*(qa5+s*qa6)))));
488       if (hx>=0)
489 	{
490 	  z = one-erx;
491 	  return z - P/Q;
492 	}
493       else
494 	{
495 	  z = erx+P/Q;
496 	  return one+z;
497 	}
498     }
499 
500   if (ix < 0x403c0000) /* |x|<28 */
501     {
502       x = fabs(x);
503       s = one/(x*x);
504       if (ix< 0x4006DB6D) /* |x| < 1/.35 ~ 2.857143*/
505 	{
506 	  R = ra0+s*(ra1+s*(ra2+s*(ra3+s*(ra4+s*(ra5+s*(ra6+s*ra7))))));
507 	  S = one+s*(sa1+s*(sa2+s*(sa3+s*(sa4+s*(sa5+s*(sa6+s*(sa7+s*sa8)))))));
508 	}
509       else  /* |x| >= 1/.35 ~ 2.857143 */
510 	{
511 	  if (hx < 0 && ix >= 0x40180000) return two-tiny; /* x < -6 */
512 	  R = rb0+s*(rb1+s*(rb2+s*(rb3+s*(rb4+s*(rb5+s*rb6)))));
513 	  S = one+s*(sb1+s*(sb2+s*(sb3+s*(sb4+s*(sb5+s*(sb6+s*sb7))))));
514 	}
515       z = x;
516       ESL_SET_LOWWORD(z, 0);  // SRE: replaced original Sun incantation here.
517       r = exp(-z*z-0.5625) * exp((z-x)*(z+x)+R/S);
518 
519       if (hx>0) return r/x;
520       else      return two-r/x;
521     }
522   else
523     {
524       if (hx>0) return tiny*tiny;
525       else      return two-tiny;
526     }
527 }
528 /*----------------- end, special functions ----------------------*/
529 
530 
531 /*****************************************************************
532  * 3. Standard statistical tests.
533  *****************************************************************/
534 
535 /* Function:  esl_stats_GTest()
536  * Synopsis:  Calculates a G-test on 2 vs. 1 binomials.
537  *
538  * Purpose:   In experiment a, we've drawn <ca> successes in <na> total
539  *            trials; in experiment b, we've drawn <cb> successes in
540  *            <nb> total trials. Are the counts different enough to
541  *            conclude that the two experiments are different? The
542  *            null hypothesis is that the successes in both experiments
543  *            were drawn from the same binomial distribution with
544  *            per-trial probability $p$. The tested hypothesis is that
545  *            experiments a,b have different binomial probabilities
546  *            $p_a,p_b$. The G-test is a log-likelihood-ratio statistic,
547  *            assuming maximum likelihood values for $p,p_a,p_b$.
548  *            $2G$ is distributed approximately as $X^2(1)$,
549  *            %"X" is "Chi"
550  *            which we use to calculate a P-value for the G statistic.
551  *
552  * Args:      ca    - number of positives in experiment a
553  *            na    - total number in experiment a
554  *            cb    - number of positives in experiment b
555  *            nb    - total number in experiment b
556  *            ret_G - RETURN: G statistic, a log likelihood ratio, in nats
557  *            ret_P - RETURN: P-value for the G-statistic
558  *
559  * Returns:   <eslOK> on success.
560  *
561  * Throws:    (no abnormal error conditions)
562  *
563  * Xref:      Archive1999/0906-sagescore/sagescore.c
564  */
565 int
esl_stats_GTest(int ca,int na,int cb,int nb,double * ret_G,double * ret_P)566 esl_stats_GTest(int ca, int na, int cb, int nb, double *ret_G, double *ret_P)
567 {
568   double a,b,c,d,n;
569   double G = 0.;
570 
571   a = (double) ca;
572   b = (double) (na - ca);
573   c = (double) cb;
574   d = (double) (nb - cb);
575   n = (double) na+nb;
576 
577   /* Yes, the calculation here is correct; algebraic
578    * rearrangement of the log-likelihood-ratio with
579    * p_a = ca/na, p_b = cb/nb, and p = (ca+cb)/(na+nb).
580    * Guard against 0 probabilities; assume 0 log 0 => 0.
581    */
582   if (a   > 0.) G  = a * log(a);
583   if (b   > 0.) G += b * log(b);
584   if (c   > 0.) G += c * log(c);
585   if (d   > 0.) G += d * log(d);
586   if (n   > 0.) G += n * log(n);
587   if (a+b > 0.) G -= (a+b) * log(a+b);
588   if (c+d > 0.) G -= (c+d) * log(c+d);
589   if (a+c > 0.) G -= (a+c) * log(a+c);
590   if (b+d > 0.) G -= (b+d) * log(b+d);
591 
592   *ret_G = G;
593   return esl_stats_IncompleteGamma( 0.5, G, NULL, ret_P);
594 }
595 
596 
597 /* Function:  esl_stats_ChiSquaredTest()
598  * Synopsis:  Calculates a $\chi^2$ P-value.
599  * Incept:    SRE, Tue Jul 19 11:39:32 2005 [St. Louis]
600  *
601  * Purpose:   Calculate the probability that a chi-squared statistic
602  *            with <v> degrees of freedom would exceed the observed
603  *            chi-squared value <x>; return it in <ret_answer>. If
604  *            this probability is less than some small threshold (say,
605  *            0.05 or 0.01), then we may reject the hypothesis we're
606  *            testing.
607  *
608  * Args:      v          - degrees of freedom
609  *            x          - observed chi-squared value
610  *            ret_answer - RETURN: P(\chi^2 > x)
611  *
612  * Returns:   <eslOK> on success.
613  *
614  * Throws:    <eslERANGE> if <v> or <x> are out of valid range.
615  *            <eslENOHALT> if iterative calculation fails.
616  */
617 int
esl_stats_ChiSquaredTest(int v,double x,double * ret_answer)618 esl_stats_ChiSquaredTest(int v, double x, double *ret_answer)
619 {
620   return esl_stats_IncompleteGamma((double)v/2., x/2., NULL, ret_answer);
621 }
622 /*----------------- end, statistical tests  ---------------------*/
623 
624 
625 
626 /*****************************************************************
627  * 4. Data fitting.
628  *****************************************************************/
629 
630 /* Function:  esl_stats_LinearRegression()
631  * Synopsis:  Fit data to a straight line.
632  * Incept:    SRE, Sat May 26 11:33:46 2007 [Janelia]
633  *
634  * Purpose:   Fit <n> points <x[i]>, <y[i]> to a straight line
635  *            $y = a + bx$ by linear regression.
636  *
637  *            The $x_i$ are taken to be known, and the $y_i$ are taken
638  *            to be observed quantities associated with a sampling
639  *            error $\sigma_i$. If known, the standard deviations
640  *            $\sigma_i$ for $y_i$ are provided in the <sigma> array.
641  *            If they are unknown, pass <sigma = NULL>, and the
642  *            routine will proceed with the assumption that $\sigma_i
643  *            = 1$ for all $i$.
644  *
645  *            The maximum likelihood estimates for $a$ and $b$ are
646  *            optionally returned in <opt_a> and <opt_b>.
647  *
648  *            The estimated standard deviations of $a$ and $b$ and
649  *            their estimated covariance are optionally returned in
650  *            <opt_sigma_a>, <opt_sigma_b>, and <opt_cov_ab>.
651  *
652  *            The Pearson correlation coefficient is optionally
653  *            returned in <opt_cc>.
654  *
655  *            The $\chi^2$ P-value for the regression fit is
656  *            optionally returned in <opt_Q>. This P-value may only be
657  *            obtained when the $\sigma_i$ are known. If <sigma> is
658  *            passed as <NULL> and <opt_Q> is requested, <*opt_Q> is
659  *            set to 1.0.
660  *
661  *            This routine follows the description and algorithm in
662  *            \citep[pp.661-666]{Press93}.
663  *
664  *            <n> must be greater than 2; at least two x[i] must
665  *            differ; and if <sigma> is provided, all <sigma[i]> must
666  *            be $>0$. If any of these conditions isn't met, the
667  *            routine throws <eslEINVAL>.
668  *
669  * Args:      x            - x[0..n-1]
670  *            y            - y[0..n-1]
671  *            sigma        - sample error in observed y_i
672  *            n            - number of data points
673  *            opt_a        - optRETURN: intercept estimate
674  *            opt_b        - optRETURN: slope estimate
675  *            opt_sigma_a  - optRETURN: error in estimate of a
676  *            opt_sigma_b  - optRETURN: error in estimate of b
677  *            opt_cov_ab   - optRETURN: covariance of a,b estimates
678  *            opt_cc       - optRETURN: Pearson correlation coefficient for x,y
679  *            opt_Q        - optRETURN: X^2 P-value for linear fit
680  *
681  * Returns:   <eslOK> on success.
682  *
683  * Throws:    <eslEMEM> on allocation error;
684  *            <eslEINVAL> if a contract condition isn't met;
685  *            <eslENORESULT> if the chi-squared test fails.
686  *            In these cases, all optional return values are set to 0.
687  */
688 int
esl_stats_LinearRegression(const double * x,const double * y,const double * sigma,int n,double * opt_a,double * opt_b,double * opt_sigma_a,double * opt_sigma_b,double * opt_cov_ab,double * opt_cc,double * opt_Q)689 esl_stats_LinearRegression(const double *x, const double *y, const double *sigma, int n,
690 			   double *opt_a,       double *opt_b,
691 			   double *opt_sigma_a, double *opt_sigma_b, double *opt_cov_ab,
692 			   double *opt_cc,      double *opt_Q)
693 {
694   int     status;
695   double *t      = NULL;
696   double  S, Sx, Sy, Stt;
697   double  Sxy, Sxx, Syy;
698   double  a, b, sigma_a, sigma_b, cov_ab, cc, X2, Q;
699   double  xdev, ydev;
700   double  tmp;
701   int     i;
702 
703   /* Contract checks. */
704   if (n <= 2) ESL_XEXCEPTION(eslEINVAL, "n must be > 2 for linear regression fitting");
705   if (sigma != NULL)
706     for (i = 0; i < n; i++) if (sigma[i] <= 0.) ESL_XEXCEPTION(eslEINVAL, "sigma[%d] <= 0", i);
707   status = eslEINVAL;
708   for (i = 0; i < n; i++) if (x[i] != 0.) { status = eslOK; break; }
709   if (status != eslOK) ESL_XEXCEPTION(eslEINVAL, "all x[i] are 0.");
710 
711   /* Allocations */
712   ESL_ALLOC(t, sizeof(double) * n);
713 
714   /* S = \sum_{i=1}{n} \frac{1}{\sigma_i^2}.  (S > 0.) */
715   if (sigma != NULL) { for (S = 0., i = 0; i < n; i++) S += 1./ (sigma[i] * sigma[i]);  }
716   else S = (double) n;
717 
718   /* S_x = \sum_{i=1}{n} \frac{x[i]}{ \sigma_i^2}  (Sx real.) */
719   for (Sx = 0., i = 0; i < n; i++) {
720     if (sigma == NULL) Sx += x[i];
721     else               Sx += x[i] / (sigma[i] * sigma[i]);
722   }
723 
724   /* S_y = \sum_{i=1}{n} \frac{y[i]}{\sigma_i^2}  (Sy real.) */
725   for (Sy = 0., i = 0; i < n; i++) {
726     if (sigma == NULL) Sy += y[i];
727     else               Sy += y[i] / (sigma[i] * sigma[i]);
728   }
729 
730   /* t_i = \frac{1}{\sigma_i} \left( x_i - \frac{S_x}{S} \right)   (t_i real) */
731   for (i = 0; i < n; i++) {
732     t[i] = x[i] - Sx/S;
733     if (sigma != NULL) t[i] /= sigma[i];
734   }
735 
736   /* S_{tt} = \sum_{i=1}^n t_i^2  (if at least one x is != 0, Stt > 0) */
737   for (Stt = 0., i = 0; i < n; i++) { Stt += t[i] * t[i]; }
738 
739   /* b = \frac{1}{S_{tt}} \sum_{i=1}^{N} \frac{t_i y_i}{\sigma_i}  */
740   for (b = 0., i = 0; i < n; i++) {
741     if (sigma != NULL) { b += t[i]*y[i] / sigma[i]; }
742     else               { b += t[i]*y[i]; }
743   }
744   b /= Stt;
745 
746   /* a = \frac{ S_y - S_x b } {S}   */
747   a = (Sy - Sx * b) / S;
748 
749   /* \sigma_a^2 = \frac{1}{S} \left( 1 + \frac{ S_x^2 }{S S_{tt}} \right) */
750   sigma_a = sqrt ((1. + (Sx*Sx) / (S*Stt)) / S);
751 
752   /* \sigma_b = \frac{1}{S_{tt}} */
753   sigma_b = sqrt (1. / Stt);
754 
755   /* Cov(a,b) = - \frac{S_x}{S S_{tt}}    */
756   cov_ab = -Sx / (S * Stt);
757 
758   /* Pearson correlation coefficient */
759   Sxy = Sxx = Syy = 0.;
760   for (i = 0; i < n; i++) {
761     if (sigma != NULL) {
762       xdev = (x[i] / (sigma[i] * sigma[i])) - (Sx / n);
763       ydev = (y[i] / (sigma[i] * sigma[i])) - (Sy / n);
764     } else {
765       xdev = x[i] - (Sx / n);
766       ydev = y[i] - (Sy / n);
767     }
768     Sxy += xdev * ydev;
769     Sxx += xdev * xdev;
770     Syy += ydev * ydev;
771   }
772   cc = Sxy / (sqrt(Sxx) * sqrt(Syy));
773 
774   /* \chi^2 */
775   for (X2 = 0., i = 0; i < n; i++) {
776     tmp =  y[i] - a - b*x[i];
777     if (sigma != NULL) tmp /= sigma[i];
778     X2 += tmp*tmp;
779   }
780 
781   /* We can calculate a goodness of fit if we know the \sigma_i */
782   if (sigma != NULL) {
783     if (esl_stats_ChiSquaredTest(n-2, X2, &Q) != eslOK) { status = eslENORESULT; goto ERROR; }
784   } else Q = 1.0;
785 
786   /* If we didn't use \sigma_i, adjust the sigmas for a,b */
787   if (sigma == NULL) {
788     tmp = sqrt(X2 / (double)(n-2));
789     sigma_a *= tmp;
790     sigma_b *= tmp;
791   }
792 
793   /* Done. Set up for normal return.
794    */
795   free(t);
796   if (opt_a       != NULL) *opt_a       = a;
797   if (opt_b       != NULL) *opt_b       = b;
798   if (opt_sigma_a != NULL) *opt_sigma_a = sigma_a;
799   if (opt_sigma_b != NULL) *opt_sigma_b = sigma_b;
800   if (opt_cov_ab  != NULL) *opt_cov_ab  = cov_ab;
801   if (opt_cc      != NULL) *opt_cc      = cc;
802   if (opt_Q       != NULL) *opt_Q       = Q;
803   return eslOK;
804 
805  ERROR:
806   if (t != NULL) free(t);
807   if (opt_a       != NULL) *opt_a       = 0.;
808   if (opt_b       != NULL) *opt_b       = 0.;
809   if (opt_sigma_a != NULL) *opt_sigma_a = 0.;
810   if (opt_sigma_b != NULL) *opt_sigma_b = 0.;
811   if (opt_cov_ab  != NULL) *opt_cov_ab  = 0.;
812   if (opt_cc      != NULL) *opt_cc      = 0.;
813   if (opt_Q       != NULL) *opt_Q       = 0.;
814   return status;
815 }
816 /*------------------- end, data fitting -------------------------*/
817 
818 
819 
820 /*****************************************************************
821  * 5. Unit tests.
822  *****************************************************************/
823 #ifdef eslSTATS_TESTDRIVE
824 #include "esl_random.h"
825 #include "esl_stopwatch.h"
826 #ifdef HAVE_LIBGSL
827 #include <gsl/gsl_sf_gamma.h>
828 #endif
829 
830 
831 /* Macros for treating IEEE754 double as two uint32_t halves, with
832  * compile-time handling of endianness; see esl_stats.h.
833  */
834 static void
utest_doublesplitting(ESL_RANDOMNESS * rng)835 utest_doublesplitting(ESL_RANDOMNESS *rng)
836 {
837   char     msg[] = "esl_stats:: doublesplitting unit test failed";
838   uint32_t ix0, ix1;
839   double   x;
840   double   x2;
841   int      iteration;  // iteration 0 uses x = 2; iteration 1 uses random x = [0,1).
842 
843   for (iteration = 0; iteration < 2; iteration++)
844     {
845       x = (iteration == 0 ? 2.0 : esl_random(rng));
846       ESL_GET_WORDS(ix0, ix1, x);
847       ESL_SET_WORDS(x2, ix0, ix1);
848       if (x2 != x) esl_fatal(msg);
849 
850       ESL_GET_HIGHWORD(ix0, x);
851       ESL_SET_HIGHWORD(x2,  ix0);
852       if (x2 != x) esl_fatal(msg);
853 
854       ESL_GET_LOWWORD(ix0, x);
855       ESL_SET_LOWWORD(x2,  ix0);
856       if (iteration == 0 && ix0 != 0)   esl_fatal(msg);
857       if (x2  != x) esl_fatal(msg);
858     }
859 }
860 
861 /* The LogGamma() function is rate-limiting in hmmbuild, because it is
862  * used so heavily in mixture Dirichlet calculations.
863  *    ./configure --with-gsl; [compile test driver]
864  *    ./stats_utest -v
865  * runs a comparison of time/precision against GSL.
866  * SRE, Sat May 23 10:04:41 2009, on home Mac:
867  *     LogGamma       = 1.29u  / N=1e8  =  13 nsec/call
868  *     gsl_sf_lngamma = 1.43u  / N=1e8  =  14 nsec/call
869  */
870 static void
utest_LogGamma(ESL_RANDOMNESS * r,int N,int be_verbose)871 utest_LogGamma(ESL_RANDOMNESS *r, int N, int be_verbose)
872 {
873   char          *msg = "esl_stats_LogGamma() unit test failed";
874   ESL_STOPWATCH *w   = esl_stopwatch_Create();
875   double        *x   = malloc(sizeof(double) * N);
876   double        *lg  = malloc(sizeof(double) * N);
877   double        *lg2 = malloc(sizeof(double) * N);
878   int            i;
879 
880   for (i = 0; i < N; i++)
881     x[i] = esl_random(r) * 100.;
882 
883   esl_stopwatch_Start(w);
884   for (i = 0; i < N; i++)
885     if (esl_stats_LogGamma(x[i], &(lg[i])) != eslOK) esl_fatal(msg);
886   esl_stopwatch_Stop(w);
887 
888   if (be_verbose) esl_stopwatch_Display(stdout, w, "esl_stats_LogGamma() timing: ");
889 
890 #ifdef HAVE_LIBGSL
891   esl_stopwatch_Start(w);
892   for (i = 0; i < N; i++) lg2[i] = gsl_sf_lngamma(x[i]);
893   esl_stopwatch_Stop(w);
894 
895   if (be_verbose) esl_stopwatch_Display(stdout, w, "gsl_sf_lngamma() timing:     ");
896 
897   for (i = 0; i < N; i++)
898     if (esl_DCompare(lg[i], lg2[i], 1e-2) != eslOK) esl_fatal(msg);
899 #endif
900 
901   free(lg2);
902   free(lg);
903   free(x);
904   esl_stopwatch_Destroy(w);
905 }
906 
907 
908 /* The test of esl_stats_LinearRegression() is a statistical test,
909  * so we can't be too aggressive about testing results.
910  *
911  * Args:
912  *    r          - a source of randomness
913  *    use_sigma  - TRUE to pass sigma to the regression fit.
914  *    be_verbose - TRUE to print results (manual, not automated test mode)
915  */
916 static void
utest_LinearRegression(ESL_RANDOMNESS * r,int use_sigma,int be_verbose)917 utest_LinearRegression(ESL_RANDOMNESS *r, int use_sigma, int be_verbose)
918 {
919   char msg[] = "linear regression unit test failed";
920   double a     = -3.;
921   double b     = 1.;
922   int    n     = 100;
923   double xori  = -20.;
924   double xstep = 1.0;
925   double setsigma = 1.0;		/* sigma on all points */
926   int    i;
927   double *x     = NULL;
928   double *y     = NULL;
929   double *sigma = NULL;
930   double  ae, be, siga, sigb, cov_ab, cc, Q;
931 
932   if ((x     = malloc(sizeof(double) * n)) == NULL) esl_fatal(msg);
933   if ((y     = malloc(sizeof(double) * n)) == NULL) esl_fatal(msg);
934   if ((sigma = malloc(sizeof(double) * n)) == NULL) esl_fatal(msg);
935 
936   /* Simulate some linear data */
937   for (i = 0; i < n; i++)
938     {
939       sigma[i] = setsigma;
940       x[i]     = xori + i*xstep;
941       y[i]     = esl_rnd_Gaussian(r, a + b*x[i], sigma[i]);
942     }
943 
944   if (use_sigma) {
945     if (esl_stats_LinearRegression(x, y, sigma, n, &ae, &be, &siga, &sigb, &cov_ab, &cc, &Q) != eslOK) esl_fatal(msg);
946   } else {
947     if (esl_stats_LinearRegression(x, y,  NULL, n, &ae, &be, &siga, &sigb, &cov_ab, &cc, &Q) != eslOK) esl_fatal(msg);
948   }
949 
950   if (be_verbose) {
951     printf("Linear regression test:\n");
952     printf("estimated intercept a = %8.4f   [true = %8.4f]\n", ae, a);
953     printf("estimated slope b     = %8.4f   [true = %8.4f]\n", be, b);
954     printf("estimated sigma on a  = %8.4f\n",                  siga);
955     printf("estimated sigma on b  = %8.4f\n",                  sigb);
956     printf("estimated cov(a,b)    = %8.4f\n",                  cov_ab);
957     printf("correlation coeff     = %8.4f\n",                  cc);
958     printf("P-value               = %8.4f\n",                  Q);
959   }
960 
961   /* The following tests are statistical.
962    */
963   if ( fabs(ae-a) > 2*siga ) esl_fatal(msg);
964   if ( fabs(be-b) > 2*sigb ) esl_fatal(msg);
965   if ( cc < 0.95)            esl_fatal(msg);
966   if (use_sigma) {
967     if (Q < 0.001)           esl_fatal(msg);
968   } else {
969     if (Q != 1.0)            esl_fatal(msg);
970   }
971 
972   free(x);
973   free(y);
974   free(sigma);
975 }
976 
977 static void
utest_erfc(ESL_RANDOMNESS * rng,int be_verbose)978 utest_erfc(ESL_RANDOMNESS *rng, int be_verbose)
979 {
980   char   msg[] = "esl_stats:: erfc unit test failed";
981   double x;
982   double result;
983   int    i;
984 
985   if (be_verbose) {
986     printf("#--------------------------\n");
987     printf("# erfc unit testing...\n");
988   }
989 
990   result = esl_stats_erfc( eslNaN);
991   if (! isnan(result)) esl_fatal(msg);
992   if (esl_stats_erfc(-eslINFINITY) != 2.0)    esl_fatal(msg);
993   if (esl_stats_erfc( 0.0)         != 1.0)    esl_fatal(msg);
994   if (esl_stats_erfc( eslINFINITY) != 0.0)    esl_fatal(msg);
995 
996   for (i = 0; i < 42; i++)
997     {
998       x      = esl_random(rng) * 10. - 5.;
999       result = esl_stats_erfc(x);
1000       if (!isfinite(result)) esl_fatal(msg);
1001 #ifdef HAVE_ERFC
1002       if (esl_DCompare(result, erfc(x), 1e-6) != eslOK) esl_fatal(msg);
1003       if (be_verbose)
1004 	printf("%15f %15f %15f\n", x, result, erfc(x));
1005 #endif
1006     }
1007 
1008   if (be_verbose)
1009     printf("#--------------------------\n");
1010   return;
1011 }
1012 
1013 #endif /*eslSTATS_TESTDRIVE*/
1014 /*-------------------- end of unit tests ------------------------*/
1015 
1016 
1017 
1018 
1019 /*****************************************************************
1020  * 6. Test driver.
1021  *****************************************************************/
1022 #ifdef eslSTATS_TESTDRIVE
1023 /* gcc -g -Wall -o stats_utest  -L. -I. -DeslSTATS_TESTDRIVE esl_stats.c -leasel -lm
1024  * gcc -DHAVE_LIBGSL -O2 -o stats_utest -L. -I. -DeslSTATS_TESTDRIVE esl_stats.c -leasel -lgsl -lm
1025  */
1026 #include <stdio.h>
1027 #include "easel.h"
1028 #include "esl_getopts.h"
1029 #include "esl_random.h"
1030 #include "esl_stats.h"
1031 
1032 static ESL_OPTIONS options[] = {
1033   /* name  type         default  env   range togs  reqs  incomp  help                docgrp */
1034   {"-h",  eslARG_NONE,    FALSE, NULL, NULL, NULL, NULL, NULL, "show help and usage",                   0},
1035   {"-s",  eslARG_INT,      "42", NULL, NULL, NULL, NULL, NULL, "set random number seed to <n>",         0},
1036   {"-v",  eslARG_NONE,    FALSE, NULL, NULL, NULL, NULL, NULL, "verbose: show verbose output",          0},
1037   {"-N",  eslARG_INT,"10000000", NULL, NULL, NULL, NULL, NULL, "number of trials in LogGamma test",     0},
1038   { 0,0,0,0,0,0,0,0,0,0},
1039 };
1040 static char usage[]  = "[-options]";
1041 static char banner[] = "test driver for stats special functions";
1042 
1043 int
main(int argc,char ** argv)1044 main(int argc, char **argv)
1045 {
1046   ESL_GETOPTS    *go         = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage);
1047   ESL_RANDOMNESS *r          = esl_randomness_Create(esl_opt_GetInteger(go, "-s"));
1048   int             be_verbose = esl_opt_GetBoolean(go, "-v");
1049   int             N          = esl_opt_GetInteger(go, "-N");
1050 
1051   if (be_verbose) printf("seed = %" PRIu32 "\n", esl_randomness_GetSeed(r));
1052 
1053   utest_doublesplitting(r);
1054   utest_erfc(r, be_verbose);
1055   utest_LogGamma(r, N, be_verbose);
1056   utest_LinearRegression(r, TRUE,  be_verbose);
1057   utest_LinearRegression(r, FALSE, be_verbose);
1058 
1059   esl_getopts_Destroy(go);
1060   esl_randomness_Destroy(r);
1061   exit(0);
1062 }
1063 #endif /*eslSTATS_TESTDRIVE*/
1064 /*------------------- end of test driver ------------------------*/
1065 
1066 
1067 
1068 
1069 /*****************************************************************
1070  * 7. Examples.
1071  *****************************************************************/
1072 
1073 /* Compile:  gcc -g -Wall -o example -I. -DeslSTATS_EXAMPLE esl_stats.c esl_random.c easel.c -lm
1074  * or        gcc -g -Wall -o example -I. -L. -DeslSTATS_EXAMPLE esl_stats.c -leasel -lm
1075  */
1076 #ifdef eslSTATS_EXAMPLE
1077 /*::cexcerpt::stats_example::begin::*/
1078 /* gcc -g -Wall -o example -I. -DeslSTATS_EXAMPLE esl_stats.c esl_random.c easel.c -lm  */
1079 #include <stdio.h>
1080 #include "easel.h"
1081 #include "esl_random.h"
1082 #include "esl_stats.h"
1083 
main(void)1084 int main(void)
1085 {
1086   ESL_RANDOMNESS *r   = esl_randomness_Create(0);
1087   double a            = -3.;
1088   double b            = 1.;
1089   double xori         = -20.;
1090   double xstep        = 1.0;
1091   double setsigma     = 1.0;		/* sigma on all points */
1092   int    n            = 100;
1093   double *x           = malloc(sizeof(double) * n);
1094   double *y           = malloc(sizeof(double) * n);
1095   double *sigma       = malloc(sizeof(double) * n);
1096   int    i;
1097   double  ae, be, siga, sigb, cov_ab, cc, Q;
1098 
1099   /* Simulate some linear data, with Gaussian noise added to y_i */
1100   for (i = 0; i < n; i++) {
1101     sigma[i] = setsigma;
1102     x[i]     = xori + i*xstep;
1103     y[i]     = esl_rnd_Gaussian(r, a + b*x[i], sigma[i]);
1104   }
1105 
1106   if (esl_stats_LinearRegression(x, y, sigma, n, &ae, &be, &siga, &sigb, &cov_ab, &cc, &Q) != eslOK)
1107     esl_fatal("linear regression failed");
1108 
1109   printf("estimated intercept a = %8.4f   [true = %8.4f]\n", ae, a);
1110   printf("estimated slope b     = %8.4f   [true = %8.4f]\n", be, b);
1111   printf("estimated sigma on a  = %8.4f\n",                  siga);
1112   printf("estimated sigma on b  = %8.4f\n",                  sigb);
1113   printf("estimated cov(a,b)    = %8.4f\n",                  cov_ab);
1114   printf("correlation coeff     = %8.4f\n",                  cc);
1115   printf("P-value               = %8.4f\n",                  Q);
1116 
1117   free(x);  free(y);  free(sigma);
1118   esl_randomness_Destroy(r);
1119   exit(0);
1120 }
1121 /*::cexcerpt::stats_example::end::*/
1122 #endif /* eslSTATS_EXAMPLE */
1123 
1124 
1125 #ifdef eslSTATS_EXAMPLE2
1126 
1127 #include <stdlib.h>
1128 
1129 #include "easel.h"
1130 #include "esl_getopts.h"
1131 #include "esl_stats.h"
1132 
1133 static ESL_OPTIONS options[] = {
1134   /* name           type      default  env  range toggles reqs incomp  help                                       docgroup*/
1135   { "-h",        eslARG_NONE,   FALSE,  NULL, NULL,  NULL,  NULL, NULL, "show brief help on version and usage",    0 },
1136   {  0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1137 };
1138 static char usage[]  = "[-options] <ca> <na> <cb> <nb>";
1139 static char banner[] = "example from the stats module: using a G-test";
1140 
1141 int
main(int argc,char ** argv)1142 main(int argc, char **argv)
1143 {
1144   ESL_GETOPTS  *go  = esl_getopts_CreateDefaultApp(options, 4, argc, argv, banner, usage);
1145   int           ca  = strtol(esl_opt_GetArg(go, 1), NULL, 10);
1146   int           na  = strtol(esl_opt_GetArg(go, 2), NULL, 10);
1147   int           cb  = strtol(esl_opt_GetArg(go, 3), NULL, 10);
1148   int           nb  = strtol(esl_opt_GetArg(go, 4), NULL, 10);
1149   double        G, P;
1150   int           status;
1151 
1152   if (ca > na || cb > nb) esl_fatal("argument order wrong? expect ca, na, cb, nb for ca/na, cb/nb");
1153 
1154   if ( (status = esl_stats_GTest(ca, na, cb, nb, &G, &P)) != eslOK) esl_fatal("G-test failed?");
1155   printf("%-10.3g %12.2f\n", P, G);
1156   exit(0);
1157 }
1158 #endif /* eslSTATS_EXAMPLE2 */
1159 /*--------------------- end of examples -------------------------*/
1160 
1161