1 /* Foundation and miscellenea for the statistics modules.
2 *
3 * Contents:
4 * 1. Summary statistics (means, variances)
5 * 2. Special functions
6 * 3. Standard statistical tests
7 * 4. Data fitting.
8 * 5. Unit tests.
9 * 6. Test driver.
10 * 7. Examples.
11 * - driver for linear regression
12 * - driver for G-test
13 */
14 #include "esl_config.h"
15
16 #include <math.h>
17
18 #include "easel.h"
19 #include "esl_stats.h"
20
21
22 /*****************************************************************
23 * 1. Summary statistics calculations (means, variances)
24 *****************************************************************/
25
26 /* Function: esl_stats_DMean()
27 * Synopsis: Calculates mean and $\sigma^2$ for samples $x_i$.
28 *
29 * Purpose: Calculates the sample mean and $s^2$, the unbiased
30 * estimator of the population variance, for a
31 * sample of <n> numbers <x[0]..x[n-1]>, and optionally
32 * returns either or both through <ret_mean> and
33 * <ret_var>.
34 *
35 * <esl_stats_FMean()> and <esl_stats_IMean()> do the same,
36 * for float and integer vectors.
37 *
38 * Args: x - samples x[0]..x[n-1]
39 * n - number of samples
40 * opt_mean - optRETURN: mean
41 * opt_var - optRETURN: estimate of population variance
42 *
43 * Returns: <eslOK> on success.
44 */
45 int
esl_stats_DMean(const double * x,int n,double * opt_mean,double * opt_var)46 esl_stats_DMean(const double *x, int n, double *opt_mean, double *opt_var)
47 {
48 double sum = 0.;
49 double sqsum = 0.;
50 int i;
51
52 for (i = 0; i < n; i++)
53 {
54 sum += x[i];
55 sqsum += x[i]*x[i];
56 }
57 if (opt_mean != NULL) *opt_mean = sum / (double) n;
58 if (opt_var != NULL) *opt_var = (sqsum - sum*sum/(double)n) / ((double)n-1);
59 return eslOK;
60 }
61 int
esl_stats_FMean(const float * x,int n,double * opt_mean,double * opt_var)62 esl_stats_FMean(const float *x, int n, double *opt_mean, double *opt_var)
63 {
64 double sum = 0.;
65 double sqsum = 0.;
66 int i;
67
68 for (i = 0; i < n; i++)
69 {
70 sum += x[i];
71 sqsum += x[i]*x[i];
72 }
73 if (opt_mean != NULL) *opt_mean = sum / (double) n;
74 if (opt_var != NULL) *opt_var = (sqsum - sum*sum/(double)n) / ((double)n-1);
75 return eslOK;
76 }
77 int
esl_stats_IMean(const int * x,int n,double * opt_mean,double * opt_var)78 esl_stats_IMean(const int *x, int n, double *opt_mean, double *opt_var)
79 {
80 double sum = 0.;
81 double sqsum = 0.;
82 int i;
83
84 for (i = 0; i < n; i++)
85 {
86 sum += x[i];
87 sqsum += x[i]*x[i];
88 }
89 if (opt_mean != NULL) *opt_mean = sum / (double) n;
90 if (opt_var != NULL) *opt_var = (sqsum - sum*sum/(double)n) / ((double)n-1);
91 return eslOK;
92 }
93 /*--------------- end, summary statistics -----------------------*/
94
95
96
97 /*****************************************************************
98 * 2. Special functions.
99 *****************************************************************/
100
101 /* Function: esl_stats_LogGamma()
102 * Synopsis: Calculates $\log \Gamma(x)$.
103 *
104 * Purpose: Returns natural log of $\Gamma(x)$, for $x > 0$.
105 *
106 * Credit: Adapted from a public domain implementation in the
107 * NCBI core math library. Thanks to John Spouge and
108 * the NCBI. (According to NCBI, that's Dr. John
109 * "Gammas Galore" Spouge to you, pal.)
110 *
111 * Args: x : argument, x > 0.0
112 * ret_answer : RETURN: the answer
113 *
114 * Returns: Put the answer in <ret_answer>; returns <eslOK>.
115 *
116 * Throws: <eslERANGE> if $x <= 0$.
117 */
118 int
esl_stats_LogGamma(double x,double * ret_answer)119 esl_stats_LogGamma(double x, double *ret_answer)
120 {
121 int i;
122 double xx, tx;
123 double tmp, value;
124 static double cof[11] = {
125 4.694580336184385e+04,
126 -1.560605207784446e+05,
127 2.065049568014106e+05,
128 -1.388934775095388e+05,
129 5.031796415085709e+04,
130 -9.601592329182778e+03,
131 8.785855930895250e+02,
132 -3.155153906098611e+01,
133 2.908143421162229e-01,
134 -2.319827630494973e-04,
135 1.251639670050933e-10
136 };
137
138 /* Protect against invalid x<=0 */
139 if (x <= 0.0) ESL_EXCEPTION(eslERANGE, "invalid x <= 0 in esl_stats_LogGamma()");
140
141 xx = x - 1.0;
142 tx = tmp = xx + 11.0;
143 value = 1.0;
144 for (i = 10; i >= 0; i--) /* sum least significant terms first */
145 {
146 value += cof[i] / tmp;
147 tmp -= 1.0;
148 }
149 value = log(value);
150 tx += 0.5;
151 value += 0.918938533 + (xx+0.5)*log(tx) - tx;
152 *ret_answer = value;
153 return eslOK;
154 }
155
156
157 /* Function: esl_stats_Psi()
158 * Synopsis: Calculates $\Psi(x)$ (the digamma function).
159 *
160 * Purpose: Computes $\Psi(x)$ (the "digamma" function), which is
161 * the derivative of log of the Gamma function:
162 * $d/dx \log \Gamma(x) = \frac{\Gamma'(x)}{\Gamma(x)} = \Psi(x)$.
163 * Argument $x$ is $> 0$.
164 *
165 * This is J.M. Bernardo's "Algorithm AS103",
166 * Appl. Stat. 25:315-317 (1976).
167 */
168 int
esl_stats_Psi(double x,double * ret_answer)169 esl_stats_Psi(double x, double *ret_answer)
170 {
171 double answer = 0.;
172 double x2;
173
174 if (x <= 0.0) ESL_EXCEPTION(eslERANGE, "invalid x <= 0 in esl_stats_Psi()");
175
176 /* For small x, Psi(x) ~= -0.5772 - 1/x + O(x), we're done.
177 */
178 if (x <= 1e-5) {
179 *ret_answer = -eslCONST_EULER - 1./x;
180 return eslOK;
181 }
182
183 /* For medium x, use Psi(1+x) = \Psi(x) + 1/x to c.o.v. x,
184 * big enough for Stirling approximation to work...
185 */
186 while (x < 8.5) {
187 answer = answer - 1./x;
188 x += 1.;
189 }
190
191 /* For large X, use Stirling approximation
192 */
193 x2 = 1./x;
194 answer += log(x) - 0.5 * x2;
195 x2 = x2*x2;
196 answer -= (1./12.)*x2;
197 answer += (1./120.)*x2*x2;
198 answer -= (1./252.)*x2*x2*x2;
199
200 *ret_answer = answer;
201 return eslOK;
202 }
203
204
205
206 /* Function: esl_stats_IncompleteGamma()
207 * Synopsis: Calculates the incomplete Gamma function.
208 *
209 * Purpose: Returns $P(a,x)$ and $Q(a,x)$ where:
210 *
211 * \begin{eqnarray*}
212 * P(a,x) & = & \frac{1}{\Gamma(a)} \int_{0}^{x} t^{a-1} e^{-t} dt \\
213 * & = & \frac{\gamma(a,x)}{\Gamma(a)} \\
214 * Q(a,x) & = & \frac{1}{\Gamma(a)} \int_{x}^{\infty} t^{a-1} e^{-t} dt\\
215 * & = & 1 - P(a,x) \\
216 * \end{eqnarray*}
217 *
218 * $P(a,x)$ is the CDF of a gamma density with $\lambda = 1$,
219 * and $Q(a,x)$ is the survival function.
220 *
221 * For $x \simeq 0$, $P(a,x) \simeq 0$ and $Q(a,x) \simeq 1$; and
222 * $P(a,x)$ is less prone to roundoff error.
223 *
224 * The opposite is the case for large $x >> a$, where
225 * $P(a,x) \simeq 1$ and $Q(a,x) \simeq 0$; there, $Q(a,x)$ is
226 * less prone to roundoff error.
227 *
228 * Method: Based on ideas from Numerical Recipes in C, Press et al.,
229 * Cambridge University Press, 1988.
230 *
231 * Args: a - for instance, degrees of freedom / 2 [a > 0]
232 * x - for instance, chi-squared statistic / 2 [x >= 0]
233 * ret_pax - RETURN: P(a,x)
234 * ret_qax - RETURN: Q(a,x)
235 *
236 * Return: <eslOK> on success.
237 *
238 * Throws: <eslERANGE> if <a> or <x> is out of accepted range.
239 * <eslENOHALT> if approximation fails to converge.
240 */
241 int
esl_stats_IncompleteGamma(double a,double x,double * ret_pax,double * ret_qax)242 esl_stats_IncompleteGamma(double a, double x, double *ret_pax, double *ret_qax)
243 {
244 int iter; /* iteration counter */
245 double pax; /* P(a,x) */
246 double qax; /* Q(a,x) */
247 int status;
248
249 if (a <= 0.) ESL_EXCEPTION(eslERANGE, "esl_stats_IncompleteGamma(): a must be > 0");
250 if (x < 0.) ESL_EXCEPTION(eslERANGE, "esl_stats_IncompleteGamma(): x must be >= 0");
251
252 /* For x > a + 1 the following gives rapid convergence;
253 * calculate Q(a,x) = \frac{\Gamma(a,x)}{\Gamma(a)},
254 * using a continued fraction development for \Gamma(a,x).
255 */
256 if (x > a+1)
257 {
258 double oldp; /* previous value of p */
259 double nu0, nu1; /* numerators for continued fraction calc */
260 double de0, de1; /* denominators for continued fraction calc */
261
262 nu0 = 0.; /* A_0 = 0 */
263 de0 = 1.; /* B_0 = 1 */
264 nu1 = 1.; /* A_1 = 1 */
265 de1 = x; /* B_1 = x */
266
267 oldp = nu1;
268 for (iter = 1; iter < 100; iter++)
269 {
270 /* Continued fraction development:
271 * set A_j = b_j A_j-1 + a_j A_j-2
272 * B_j = b_j B_j-1 + a_j B_j-2
273 * We start with A_2, B_2.
274 */
275 /* j = even: a_j = iter-a, b_j = 1 */
276 /* A,B_j-2 are in nu0, de0; A,B_j-1 are in nu1,de1 */
277 nu0 = nu1 + ((double)iter - a) * nu0;
278 de0 = de1 + ((double)iter - a) * de0;
279 /* j = odd: a_j = iter, b_j = x */
280 /* A,B_j-2 are in nu1, de1; A,B_j-1 in nu0,de0 */
281 nu1 = x * nu0 + (double) iter * nu1;
282 de1 = x * de0 + (double) iter * de1;
283 /* rescale */
284 if (de1 != 0.)
285 {
286 nu0 /= de1;
287 de0 /= de1;
288 nu1 /= de1;
289 de1 = 1.;
290 }
291 /* check for convergence */
292 if (fabs((nu1-oldp)/nu1) < 1.e-7)
293 {
294 if ((status = esl_stats_LogGamma(a, &qax)) != eslOK) return status;
295 qax = nu1 * exp(a * log(x) - x - qax);
296
297 if (ret_pax != NULL) *ret_pax = 1 - qax;
298 if (ret_qax != NULL) *ret_qax = qax;
299 return eslOK;
300 }
301
302 oldp = nu1;
303 }
304 ESL_EXCEPTION(eslENOHALT,
305 "esl_stats_IncompleteGamma(): fraction failed to converge");
306 }
307 else /* x <= a+1 */
308 {
309 double p; /* current sum */
310 double val; /* current value used in sum */
311
312 /* For x <= a+1 we use a convergent series instead:
313 * P(a,x) = \frac{\gamma(a,x)}{\Gamma(a)},
314 * where
315 * \gamma(a,x) = e^{-x}x^a \sum_{n=0}{\infty} \frac{\Gamma{a}}{\Gamma{a+1+n}} x^n
316 * which looks appalling but the sum is in fact rearrangeable to
317 * a simple series without the \Gamma functions:
318 * = \frac{1}{a} + \frac{x}{a(a+1)} + \frac{x^2}{a(a+1)(a+2)} ...
319 * and it's obvious that this should converge nicely for x <= a+1.
320 */
321 p = val = 1. / a;
322 for (iter = 1; iter < 10000; iter++)
323 {
324 val *= x / (a+(double)iter);
325 p += val;
326
327 if (fabs(val/p) < 1.e-7)
328 {
329 if ((status = esl_stats_LogGamma(a, &pax)) != eslOK) return status;
330 pax = p * exp(a * log(x) - x - pax);
331
332 if (ret_pax != NULL) *ret_pax = pax;
333 if (ret_qax != NULL) *ret_qax = 1. - pax;
334 return eslOK;
335 }
336 }
337 ESL_EXCEPTION(eslENOHALT,
338 "esl_stats_IncompleteGamma(): series failed to converge");
339 }
340 /*NOTREACHED*/
341 return eslOK;
342 }
343
344
345 /* Function: esl_stats_erfc()
346 * Synopsis: Complementary error function.
347 *
348 * Purpose: Calculate and return the complementary error function,
349 * erfc(x).
350 *
351 * erfc(x) is mandated by the ANSI C99 standard but that
352 * doesn't mean it's available on supposedly modern systems
353 * (looking at you here, Microsoft).
354 *
355 * Used for cumulative distribution function calculations
356 * for the normal (Gaussian) distribution. See <esl_normal>
357 * module.
358 *
359 * erfc(-inf) = 2.0
360 * erfc(0) = 1.0
361 * erfc(inf) = 0.0
362 * erfc(NaN) = NaN
363 *
364 * Args: x : any real-numbered value -inf..inf
365 *
366 * Returns: erfc(x)
367 *
368 * Throws: (no abnormal error conditions)
369 *
370 * Source:
371 * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
372 * Developed at SunPro, a Sun Microsystems, Inc. business.
373 * Permission to use, copy, modify, and distribute this software is
374 * freely granted, provided that this notice is preserved.
375 * [as posted by eggcrook at stackexchange.com, 21 Dec 2012]
376 *
377 * Removed arcane incantations for runtime detection of endianness,
378 * and for treating IEEE754 doubles as two adjacent uint32_t;
379 * replaced with ANSI-compliant macros and compile-time detection
380 * of endianness. [Apr 2015]
381 */
382 double
esl_stats_erfc(double x)383 esl_stats_erfc(double x)
384 {
385 static const double tiny = 1e-300;
386 static const double half = 5.00000000000000000000e-01; /* 0x3FE00000, 0x00000000 */
387 static const double one = 1.00000000000000000000e+00; /* 0x3FF00000, 0x00000000 */
388 static const double two = 2.00000000000000000000e+00; /* 0x40000000, 0x00000000 */
389 static const double erx = 8.45062911510467529297e-01; /* 0x3FEB0AC1, 0x60000000 */
390 /*
391 * Coefficients for approximation to erf on [0,0.84375]
392 */
393 static const double pp0 = 1.28379167095512558561e-01; /* 0x3FC06EBA, 0x8214DB68 */
394 static const double pp1 = -3.25042107247001499370e-01; /* 0xBFD4CD7D, 0x691CB913 */
395 static const double pp2 = -2.84817495755985104766e-02; /* 0xBF9D2A51, 0xDBD7194F */
396 static const double pp3 = -5.77027029648944159157e-03; /* 0xBF77A291, 0x236668E4 */
397 static const double pp4 = -2.37630166566501626084e-05; /* 0xBEF8EAD6, 0x120016AC */
398 static const double qq1 = 3.97917223959155352819e-01; /* 0x3FD97779, 0xCDDADC09 */
399 static const double qq2 = 6.50222499887672944485e-02; /* 0x3FB0A54C, 0x5536CEBA */
400 static const double qq3 = 5.08130628187576562776e-03; /* 0x3F74D022, 0xC4D36B0F */
401 static const double qq4 = 1.32494738004321644526e-04; /* 0x3F215DC9, 0x221C1A10 */
402 static const double qq5 = -3.96022827877536812320e-06; /* 0xBED09C43, 0x42A26120 */
403 /*
404 * Coefficients for approximation to erf in [0.84375,1.25]
405 */
406 static const double pa0 = -2.36211856075265944077e-03; /* 0xBF6359B8, 0xBEF77538 */
407 static const double pa1 = 4.14856118683748331666e-01; /* 0x3FDA8D00, 0xAD92B34D */
408 static const double pa2 = -3.72207876035701323847e-01; /* 0xBFD7D240, 0xFBB8C3F1 */
409 static const double pa3 = 3.18346619901161753674e-01; /* 0x3FD45FCA, 0x805120E4 */
410 static const double pa4 = -1.10894694282396677476e-01; /* 0xBFBC6398, 0x3D3E28EC */
411 static const double pa5 = 3.54783043256182359371e-02; /* 0x3FA22A36, 0x599795EB */
412 static const double pa6 = -2.16637559486879084300e-03; /* 0xBF61BF38, 0x0A96073F */
413 static const double qa1 = 1.06420880400844228286e-01; /* 0x3FBB3E66, 0x18EEE323 */
414 static const double qa2 = 5.40397917702171048937e-01; /* 0x3FE14AF0, 0x92EB6F33 */
415 static const double qa3 = 7.18286544141962662868e-02; /* 0x3FB2635C, 0xD99FE9A7 */
416 static const double qa4 = 1.26171219808761642112e-01; /* 0x3FC02660, 0xE763351F */
417 static const double qa5 = 1.36370839120290507362e-02; /* 0x3F8BEDC2, 0x6B51DD1C */
418 static const double qa6 = 1.19844998467991074170e-02; /* 0x3F888B54, 0x5735151D */
419 /*
420 * Coefficients for approximation to erfc in [1.25,1/0.35]
421 */
422 static const double ra0 = -9.86494403484714822705e-03; /* 0xBF843412, 0x600D6435 */
423 static const double ra1 = -6.93858572707181764372e-01; /* 0xBFE63416, 0xE4BA7360 */
424 static const double ra2 = -1.05586262253232909814e+01; /* 0xC0251E04, 0x41B0E726 */
425 static const double ra3 = -6.23753324503260060396e+01; /* 0xC04F300A, 0xE4CBA38D */
426 static const double ra4 = -1.62396669462573470355e+02; /* 0xC0644CB1, 0x84282266 */
427 static const double ra5 = -1.84605092906711035994e+02; /* 0xC067135C, 0xEBCCABB2 */
428 static const double ra6 = -8.12874355063065934246e+01; /* 0xC0545265, 0x57E4D2F2 */
429 static const double ra7 = -9.81432934416914548592e+00; /* 0xC023A0EF, 0xC69AC25C */
430 static const double sa1 = 1.96512716674392571292e+01; /* 0x4033A6B9, 0xBD707687 */
431 static const double sa2 = 1.37657754143519042600e+02; /* 0x4061350C, 0x526AE721 */
432 static const double sa3 = 4.34565877475229228821e+02; /* 0x407B290D, 0xD58A1A71 */
433 static const double sa4 = 6.45387271733267880336e+02; /* 0x40842B19, 0x21EC2868 */
434 static const double sa5 = 4.29008140027567833386e+02; /* 0x407AD021, 0x57700314 */
435 static const double sa6 = 1.08635005541779435134e+02; /* 0x405B28A3, 0xEE48AE2C */
436 static const double sa7 = 6.57024977031928170135e+00; /* 0x401A47EF, 0x8E484A93 */
437 static const double sa8 = -6.04244152148580987438e-02; /* 0xBFAEEFF2, 0xEE749A62 */
438 /*
439 * Coefficients for approximation to erfc in [1/.35,28]
440 */
441 static const double rb0 = -9.86494292470009928597e-03; /* 0xBF843412, 0x39E86F4A */
442 static const double rb1 = -7.99283237680523006574e-01; /* 0xBFE993BA, 0x70C285DE */
443 static const double rb2 = -1.77579549177547519889e+01; /* 0xC031C209, 0x555F995A */
444 static const double rb3 = -1.60636384855821916062e+02; /* 0xC064145D, 0x43C5ED98 */
445 static const double rb4 = -6.37566443368389627722e+02; /* 0xC083EC88, 0x1375F228 */
446 static const double rb5 = -1.02509513161107724954e+03; /* 0xC0900461, 0x6A2E5992 */
447 static const double rb6 = -4.83519191608651397019e+02; /* 0xC07E384E, 0x9BDC383F */
448 static const double sb1 = 3.03380607434824582924e+01; /* 0x403E568B, 0x261D5190 */
449 static const double sb2 = 3.25792512996573918826e+02; /* 0x40745CAE, 0x221B9F0A */
450 static const double sb3 = 1.53672958608443695994e+03; /* 0x409802EB, 0x189D5118 */
451 static const double sb4 = 3.19985821950859553908e+03; /* 0x40A8FFB7, 0x688C246A */
452 static const double sb5 = 2.55305040643316442583e+03; /* 0x40A3F219, 0xCEDF3BE6 */
453 static const double sb6 = 4.74528541206955367215e+02; /* 0x407DA874, 0xE79FE763 */
454 static const double sb7 = -2.24409524465858183362e+01; /* 0xC03670E2, 0x42712D62 */
455
456 int hx,ix;
457 double R,S,P,Q,s,y,z,r;
458
459 ESL_GET_HIGHWORD(hx, x); // SRE: replaced original Sun incantation here.
460 ix = hx & 0x7fffffff;
461 if (ix>=0x7ff00000) /* erfc(nan)=nan; erfc(+-inf)=0,2 */
462 return (double)(((unsigned)hx>>31)<<1)+one/x;
463
464 if (ix < 0x3feb0000) /* |x|<0.84375 */
465 {
466 if (ix < 0x3c700000) return one-x; /* |x|<2**-56 */
467 z = x*x;
468 r = pp0+z*(pp1+z*(pp2+z*(pp3+z*pp4)));
469 s = one+z*(qq1+z*(qq2+z*(qq3+z*(qq4+z*qq5))));
470 y = r/s;
471 if (hx < 0x3fd00000) /* x<1/4 */
472 {
473 return one-(x+x*y);
474 }
475 else
476 {
477 r = x*y;
478 r += (x-half);
479 return half - r ;
480 }
481 }
482
483 if (ix < 0x3ff40000) /* 0.84375 <= |x| < 1.25 */
484 {
485 s = fabs(x)-one;
486 P = pa0+s*(pa1+s*(pa2+s*(pa3+s*(pa4+s*(pa5+s*pa6)))));
487 Q = one+s*(qa1+s*(qa2+s*(qa3+s*(qa4+s*(qa5+s*qa6)))));
488 if (hx>=0)
489 {
490 z = one-erx;
491 return z - P/Q;
492 }
493 else
494 {
495 z = erx+P/Q;
496 return one+z;
497 }
498 }
499
500 if (ix < 0x403c0000) /* |x|<28 */
501 {
502 x = fabs(x);
503 s = one/(x*x);
504 if (ix< 0x4006DB6D) /* |x| < 1/.35 ~ 2.857143*/
505 {
506 R = ra0+s*(ra1+s*(ra2+s*(ra3+s*(ra4+s*(ra5+s*(ra6+s*ra7))))));
507 S = one+s*(sa1+s*(sa2+s*(sa3+s*(sa4+s*(sa5+s*(sa6+s*(sa7+s*sa8)))))));
508 }
509 else /* |x| >= 1/.35 ~ 2.857143 */
510 {
511 if (hx < 0 && ix >= 0x40180000) return two-tiny; /* x < -6 */
512 R = rb0+s*(rb1+s*(rb2+s*(rb3+s*(rb4+s*(rb5+s*rb6)))));
513 S = one+s*(sb1+s*(sb2+s*(sb3+s*(sb4+s*(sb5+s*(sb6+s*sb7))))));
514 }
515 z = x;
516 ESL_SET_LOWWORD(z, 0); // SRE: replaced original Sun incantation here.
517 r = exp(-z*z-0.5625) * exp((z-x)*(z+x)+R/S);
518
519 if (hx>0) return r/x;
520 else return two-r/x;
521 }
522 else
523 {
524 if (hx>0) return tiny*tiny;
525 else return two-tiny;
526 }
527 }
528 /*----------------- end, special functions ----------------------*/
529
530
531 /*****************************************************************
532 * 3. Standard statistical tests.
533 *****************************************************************/
534
535 /* Function: esl_stats_GTest()
536 * Synopsis: Calculates a G-test on 2 vs. 1 binomials.
537 *
538 * Purpose: In experiment a, we've drawn <ca> successes in <na> total
539 * trials; in experiment b, we've drawn <cb> successes in
540 * <nb> total trials. Are the counts different enough to
541 * conclude that the two experiments are different? The
542 * null hypothesis is that the successes in both experiments
543 * were drawn from the same binomial distribution with
544 * per-trial probability $p$. The tested hypothesis is that
545 * experiments a,b have different binomial probabilities
546 * $p_a,p_b$. The G-test is a log-likelihood-ratio statistic,
547 * assuming maximum likelihood values for $p,p_a,p_b$.
548 * $2G$ is distributed approximately as $X^2(1)$,
549 * %"X" is "Chi"
550 * which we use to calculate a P-value for the G statistic.
551 *
552 * Args: ca - number of positives in experiment a
553 * na - total number in experiment a
554 * cb - number of positives in experiment b
555 * nb - total number in experiment b
556 * ret_G - RETURN: G statistic, a log likelihood ratio, in nats
557 * ret_P - RETURN: P-value for the G-statistic
558 *
559 * Returns: <eslOK> on success.
560 *
561 * Throws: (no abnormal error conditions)
562 *
563 * Xref: Archive1999/0906-sagescore/sagescore.c
564 */
565 int
esl_stats_GTest(int ca,int na,int cb,int nb,double * ret_G,double * ret_P)566 esl_stats_GTest(int ca, int na, int cb, int nb, double *ret_G, double *ret_P)
567 {
568 double a,b,c,d,n;
569 double G = 0.;
570
571 a = (double) ca;
572 b = (double) (na - ca);
573 c = (double) cb;
574 d = (double) (nb - cb);
575 n = (double) na+nb;
576
577 /* Yes, the calculation here is correct; algebraic
578 * rearrangement of the log-likelihood-ratio with
579 * p_a = ca/na, p_b = cb/nb, and p = (ca+cb)/(na+nb).
580 * Guard against 0 probabilities; assume 0 log 0 => 0.
581 */
582 if (a > 0.) G = a * log(a);
583 if (b > 0.) G += b * log(b);
584 if (c > 0.) G += c * log(c);
585 if (d > 0.) G += d * log(d);
586 if (n > 0.) G += n * log(n);
587 if (a+b > 0.) G -= (a+b) * log(a+b);
588 if (c+d > 0.) G -= (c+d) * log(c+d);
589 if (a+c > 0.) G -= (a+c) * log(a+c);
590 if (b+d > 0.) G -= (b+d) * log(b+d);
591
592 *ret_G = G;
593 return esl_stats_IncompleteGamma( 0.5, G, NULL, ret_P);
594 }
595
596
597 /* Function: esl_stats_ChiSquaredTest()
598 * Synopsis: Calculates a $\chi^2$ P-value.
599 * Incept: SRE, Tue Jul 19 11:39:32 2005 [St. Louis]
600 *
601 * Purpose: Calculate the probability that a chi-squared statistic
602 * with <v> degrees of freedom would exceed the observed
603 * chi-squared value <x>; return it in <ret_answer>. If
604 * this probability is less than some small threshold (say,
605 * 0.05 or 0.01), then we may reject the hypothesis we're
606 * testing.
607 *
608 * Args: v - degrees of freedom
609 * x - observed chi-squared value
610 * ret_answer - RETURN: P(\chi^2 > x)
611 *
612 * Returns: <eslOK> on success.
613 *
614 * Throws: <eslERANGE> if <v> or <x> are out of valid range.
615 * <eslENOHALT> if iterative calculation fails.
616 */
617 int
esl_stats_ChiSquaredTest(int v,double x,double * ret_answer)618 esl_stats_ChiSquaredTest(int v, double x, double *ret_answer)
619 {
620 return esl_stats_IncompleteGamma((double)v/2., x/2., NULL, ret_answer);
621 }
622 /*----------------- end, statistical tests ---------------------*/
623
624
625
626 /*****************************************************************
627 * 4. Data fitting.
628 *****************************************************************/
629
630 /* Function: esl_stats_LinearRegression()
631 * Synopsis: Fit data to a straight line.
632 * Incept: SRE, Sat May 26 11:33:46 2007 [Janelia]
633 *
634 * Purpose: Fit <n> points <x[i]>, <y[i]> to a straight line
635 * $y = a + bx$ by linear regression.
636 *
637 * The $x_i$ are taken to be known, and the $y_i$ are taken
638 * to be observed quantities associated with a sampling
639 * error $\sigma_i$. If known, the standard deviations
640 * $\sigma_i$ for $y_i$ are provided in the <sigma> array.
641 * If they are unknown, pass <sigma = NULL>, and the
642 * routine will proceed with the assumption that $\sigma_i
643 * = 1$ for all $i$.
644 *
645 * The maximum likelihood estimates for $a$ and $b$ are
646 * optionally returned in <opt_a> and <opt_b>.
647 *
648 * The estimated standard deviations of $a$ and $b$ and
649 * their estimated covariance are optionally returned in
650 * <opt_sigma_a>, <opt_sigma_b>, and <opt_cov_ab>.
651 *
652 * The Pearson correlation coefficient is optionally
653 * returned in <opt_cc>.
654 *
655 * The $\chi^2$ P-value for the regression fit is
656 * optionally returned in <opt_Q>. This P-value may only be
657 * obtained when the $\sigma_i$ are known. If <sigma> is
658 * passed as <NULL> and <opt_Q> is requested, <*opt_Q> is
659 * set to 1.0.
660 *
661 * This routine follows the description and algorithm in
662 * \citep[pp.661-666]{Press93}.
663 *
664 * <n> must be greater than 2; at least two x[i] must
665 * differ; and if <sigma> is provided, all <sigma[i]> must
666 * be $>0$. If any of these conditions isn't met, the
667 * routine throws <eslEINVAL>.
668 *
669 * Args: x - x[0..n-1]
670 * y - y[0..n-1]
671 * sigma - sample error in observed y_i
672 * n - number of data points
673 * opt_a - optRETURN: intercept estimate
674 * opt_b - optRETURN: slope estimate
675 * opt_sigma_a - optRETURN: error in estimate of a
676 * opt_sigma_b - optRETURN: error in estimate of b
677 * opt_cov_ab - optRETURN: covariance of a,b estimates
678 * opt_cc - optRETURN: Pearson correlation coefficient for x,y
679 * opt_Q - optRETURN: X^2 P-value for linear fit
680 *
681 * Returns: <eslOK> on success.
682 *
683 * Throws: <eslEMEM> on allocation error;
684 * <eslEINVAL> if a contract condition isn't met;
685 * <eslENORESULT> if the chi-squared test fails.
686 * In these cases, all optional return values are set to 0.
687 */
688 int
esl_stats_LinearRegression(const double * x,const double * y,const double * sigma,int n,double * opt_a,double * opt_b,double * opt_sigma_a,double * opt_sigma_b,double * opt_cov_ab,double * opt_cc,double * opt_Q)689 esl_stats_LinearRegression(const double *x, const double *y, const double *sigma, int n,
690 double *opt_a, double *opt_b,
691 double *opt_sigma_a, double *opt_sigma_b, double *opt_cov_ab,
692 double *opt_cc, double *opt_Q)
693 {
694 int status;
695 double *t = NULL;
696 double S, Sx, Sy, Stt;
697 double Sxy, Sxx, Syy;
698 double a, b, sigma_a, sigma_b, cov_ab, cc, X2, Q;
699 double xdev, ydev;
700 double tmp;
701 int i;
702
703 /* Contract checks. */
704 if (n <= 2) ESL_XEXCEPTION(eslEINVAL, "n must be > 2 for linear regression fitting");
705 if (sigma != NULL)
706 for (i = 0; i < n; i++) if (sigma[i] <= 0.) ESL_XEXCEPTION(eslEINVAL, "sigma[%d] <= 0", i);
707 status = eslEINVAL;
708 for (i = 0; i < n; i++) if (x[i] != 0.) { status = eslOK; break; }
709 if (status != eslOK) ESL_XEXCEPTION(eslEINVAL, "all x[i] are 0.");
710
711 /* Allocations */
712 ESL_ALLOC(t, sizeof(double) * n);
713
714 /* S = \sum_{i=1}{n} \frac{1}{\sigma_i^2}. (S > 0.) */
715 if (sigma != NULL) { for (S = 0., i = 0; i < n; i++) S += 1./ (sigma[i] * sigma[i]); }
716 else S = (double) n;
717
718 /* S_x = \sum_{i=1}{n} \frac{x[i]}{ \sigma_i^2} (Sx real.) */
719 for (Sx = 0., i = 0; i < n; i++) {
720 if (sigma == NULL) Sx += x[i];
721 else Sx += x[i] / (sigma[i] * sigma[i]);
722 }
723
724 /* S_y = \sum_{i=1}{n} \frac{y[i]}{\sigma_i^2} (Sy real.) */
725 for (Sy = 0., i = 0; i < n; i++) {
726 if (sigma == NULL) Sy += y[i];
727 else Sy += y[i] / (sigma[i] * sigma[i]);
728 }
729
730 /* t_i = \frac{1}{\sigma_i} \left( x_i - \frac{S_x}{S} \right) (t_i real) */
731 for (i = 0; i < n; i++) {
732 t[i] = x[i] - Sx/S;
733 if (sigma != NULL) t[i] /= sigma[i];
734 }
735
736 /* S_{tt} = \sum_{i=1}^n t_i^2 (if at least one x is != 0, Stt > 0) */
737 for (Stt = 0., i = 0; i < n; i++) { Stt += t[i] * t[i]; }
738
739 /* b = \frac{1}{S_{tt}} \sum_{i=1}^{N} \frac{t_i y_i}{\sigma_i} */
740 for (b = 0., i = 0; i < n; i++) {
741 if (sigma != NULL) { b += t[i]*y[i] / sigma[i]; }
742 else { b += t[i]*y[i]; }
743 }
744 b /= Stt;
745
746 /* a = \frac{ S_y - S_x b } {S} */
747 a = (Sy - Sx * b) / S;
748
749 /* \sigma_a^2 = \frac{1}{S} \left( 1 + \frac{ S_x^2 }{S S_{tt}} \right) */
750 sigma_a = sqrt ((1. + (Sx*Sx) / (S*Stt)) / S);
751
752 /* \sigma_b = \frac{1}{S_{tt}} */
753 sigma_b = sqrt (1. / Stt);
754
755 /* Cov(a,b) = - \frac{S_x}{S S_{tt}} */
756 cov_ab = -Sx / (S * Stt);
757
758 /* Pearson correlation coefficient */
759 Sxy = Sxx = Syy = 0.;
760 for (i = 0; i < n; i++) {
761 if (sigma != NULL) {
762 xdev = (x[i] / (sigma[i] * sigma[i])) - (Sx / n);
763 ydev = (y[i] / (sigma[i] * sigma[i])) - (Sy / n);
764 } else {
765 xdev = x[i] - (Sx / n);
766 ydev = y[i] - (Sy / n);
767 }
768 Sxy += xdev * ydev;
769 Sxx += xdev * xdev;
770 Syy += ydev * ydev;
771 }
772 cc = Sxy / (sqrt(Sxx) * sqrt(Syy));
773
774 /* \chi^2 */
775 for (X2 = 0., i = 0; i < n; i++) {
776 tmp = y[i] - a - b*x[i];
777 if (sigma != NULL) tmp /= sigma[i];
778 X2 += tmp*tmp;
779 }
780
781 /* We can calculate a goodness of fit if we know the \sigma_i */
782 if (sigma != NULL) {
783 if (esl_stats_ChiSquaredTest(n-2, X2, &Q) != eslOK) { status = eslENORESULT; goto ERROR; }
784 } else Q = 1.0;
785
786 /* If we didn't use \sigma_i, adjust the sigmas for a,b */
787 if (sigma == NULL) {
788 tmp = sqrt(X2 / (double)(n-2));
789 sigma_a *= tmp;
790 sigma_b *= tmp;
791 }
792
793 /* Done. Set up for normal return.
794 */
795 free(t);
796 if (opt_a != NULL) *opt_a = a;
797 if (opt_b != NULL) *opt_b = b;
798 if (opt_sigma_a != NULL) *opt_sigma_a = sigma_a;
799 if (opt_sigma_b != NULL) *opt_sigma_b = sigma_b;
800 if (opt_cov_ab != NULL) *opt_cov_ab = cov_ab;
801 if (opt_cc != NULL) *opt_cc = cc;
802 if (opt_Q != NULL) *opt_Q = Q;
803 return eslOK;
804
805 ERROR:
806 if (t != NULL) free(t);
807 if (opt_a != NULL) *opt_a = 0.;
808 if (opt_b != NULL) *opt_b = 0.;
809 if (opt_sigma_a != NULL) *opt_sigma_a = 0.;
810 if (opt_sigma_b != NULL) *opt_sigma_b = 0.;
811 if (opt_cov_ab != NULL) *opt_cov_ab = 0.;
812 if (opt_cc != NULL) *opt_cc = 0.;
813 if (opt_Q != NULL) *opt_Q = 0.;
814 return status;
815 }
816 /*------------------- end, data fitting -------------------------*/
817
818
819
820 /*****************************************************************
821 * 5. Unit tests.
822 *****************************************************************/
823 #ifdef eslSTATS_TESTDRIVE
824 #include "esl_random.h"
825 #include "esl_stopwatch.h"
826 #ifdef HAVE_LIBGSL
827 #include <gsl/gsl_sf_gamma.h>
828 #endif
829
830
831 /* Macros for treating IEEE754 double as two uint32_t halves, with
832 * compile-time handling of endianness; see esl_stats.h.
833 */
834 static void
utest_doublesplitting(ESL_RANDOMNESS * rng)835 utest_doublesplitting(ESL_RANDOMNESS *rng)
836 {
837 char msg[] = "esl_stats:: doublesplitting unit test failed";
838 uint32_t ix0, ix1;
839 double x;
840 double x2;
841 int iteration; // iteration 0 uses x = 2; iteration 1 uses random x = [0,1).
842
843 for (iteration = 0; iteration < 2; iteration++)
844 {
845 x = (iteration == 0 ? 2.0 : esl_random(rng));
846 ESL_GET_WORDS(ix0, ix1, x);
847 ESL_SET_WORDS(x2, ix0, ix1);
848 if (x2 != x) esl_fatal(msg);
849
850 ESL_GET_HIGHWORD(ix0, x);
851 ESL_SET_HIGHWORD(x2, ix0);
852 if (x2 != x) esl_fatal(msg);
853
854 ESL_GET_LOWWORD(ix0, x);
855 ESL_SET_LOWWORD(x2, ix0);
856 if (iteration == 0 && ix0 != 0) esl_fatal(msg);
857 if (x2 != x) esl_fatal(msg);
858 }
859 }
860
861 /* The LogGamma() function is rate-limiting in hmmbuild, because it is
862 * used so heavily in mixture Dirichlet calculations.
863 * ./configure --with-gsl; [compile test driver]
864 * ./stats_utest -v
865 * runs a comparison of time/precision against GSL.
866 * SRE, Sat May 23 10:04:41 2009, on home Mac:
867 * LogGamma = 1.29u / N=1e8 = 13 nsec/call
868 * gsl_sf_lngamma = 1.43u / N=1e8 = 14 nsec/call
869 */
870 static void
utest_LogGamma(ESL_RANDOMNESS * r,int N,int be_verbose)871 utest_LogGamma(ESL_RANDOMNESS *r, int N, int be_verbose)
872 {
873 char *msg = "esl_stats_LogGamma() unit test failed";
874 ESL_STOPWATCH *w = esl_stopwatch_Create();
875 double *x = malloc(sizeof(double) * N);
876 double *lg = malloc(sizeof(double) * N);
877 double *lg2 = malloc(sizeof(double) * N);
878 int i;
879
880 for (i = 0; i < N; i++)
881 x[i] = esl_random(r) * 100.;
882
883 esl_stopwatch_Start(w);
884 for (i = 0; i < N; i++)
885 if (esl_stats_LogGamma(x[i], &(lg[i])) != eslOK) esl_fatal(msg);
886 esl_stopwatch_Stop(w);
887
888 if (be_verbose) esl_stopwatch_Display(stdout, w, "esl_stats_LogGamma() timing: ");
889
890 #ifdef HAVE_LIBGSL
891 esl_stopwatch_Start(w);
892 for (i = 0; i < N; i++) lg2[i] = gsl_sf_lngamma(x[i]);
893 esl_stopwatch_Stop(w);
894
895 if (be_verbose) esl_stopwatch_Display(stdout, w, "gsl_sf_lngamma() timing: ");
896
897 for (i = 0; i < N; i++)
898 if (esl_DCompare(lg[i], lg2[i], 1e-2) != eslOK) esl_fatal(msg);
899 #endif
900
901 free(lg2);
902 free(lg);
903 free(x);
904 esl_stopwatch_Destroy(w);
905 }
906
907
908 /* The test of esl_stats_LinearRegression() is a statistical test,
909 * so we can't be too aggressive about testing results.
910 *
911 * Args:
912 * r - a source of randomness
913 * use_sigma - TRUE to pass sigma to the regression fit.
914 * be_verbose - TRUE to print results (manual, not automated test mode)
915 */
916 static void
utest_LinearRegression(ESL_RANDOMNESS * r,int use_sigma,int be_verbose)917 utest_LinearRegression(ESL_RANDOMNESS *r, int use_sigma, int be_verbose)
918 {
919 char msg[] = "linear regression unit test failed";
920 double a = -3.;
921 double b = 1.;
922 int n = 100;
923 double xori = -20.;
924 double xstep = 1.0;
925 double setsigma = 1.0; /* sigma on all points */
926 int i;
927 double *x = NULL;
928 double *y = NULL;
929 double *sigma = NULL;
930 double ae, be, siga, sigb, cov_ab, cc, Q;
931
932 if ((x = malloc(sizeof(double) * n)) == NULL) esl_fatal(msg);
933 if ((y = malloc(sizeof(double) * n)) == NULL) esl_fatal(msg);
934 if ((sigma = malloc(sizeof(double) * n)) == NULL) esl_fatal(msg);
935
936 /* Simulate some linear data */
937 for (i = 0; i < n; i++)
938 {
939 sigma[i] = setsigma;
940 x[i] = xori + i*xstep;
941 y[i] = esl_rnd_Gaussian(r, a + b*x[i], sigma[i]);
942 }
943
944 if (use_sigma) {
945 if (esl_stats_LinearRegression(x, y, sigma, n, &ae, &be, &siga, &sigb, &cov_ab, &cc, &Q) != eslOK) esl_fatal(msg);
946 } else {
947 if (esl_stats_LinearRegression(x, y, NULL, n, &ae, &be, &siga, &sigb, &cov_ab, &cc, &Q) != eslOK) esl_fatal(msg);
948 }
949
950 if (be_verbose) {
951 printf("Linear regression test:\n");
952 printf("estimated intercept a = %8.4f [true = %8.4f]\n", ae, a);
953 printf("estimated slope b = %8.4f [true = %8.4f]\n", be, b);
954 printf("estimated sigma on a = %8.4f\n", siga);
955 printf("estimated sigma on b = %8.4f\n", sigb);
956 printf("estimated cov(a,b) = %8.4f\n", cov_ab);
957 printf("correlation coeff = %8.4f\n", cc);
958 printf("P-value = %8.4f\n", Q);
959 }
960
961 /* The following tests are statistical.
962 */
963 if ( fabs(ae-a) > 2*siga ) esl_fatal(msg);
964 if ( fabs(be-b) > 2*sigb ) esl_fatal(msg);
965 if ( cc < 0.95) esl_fatal(msg);
966 if (use_sigma) {
967 if (Q < 0.001) esl_fatal(msg);
968 } else {
969 if (Q != 1.0) esl_fatal(msg);
970 }
971
972 free(x);
973 free(y);
974 free(sigma);
975 }
976
977 static void
utest_erfc(ESL_RANDOMNESS * rng,int be_verbose)978 utest_erfc(ESL_RANDOMNESS *rng, int be_verbose)
979 {
980 char msg[] = "esl_stats:: erfc unit test failed";
981 double x;
982 double result;
983 int i;
984
985 if (be_verbose) {
986 printf("#--------------------------\n");
987 printf("# erfc unit testing...\n");
988 }
989
990 result = esl_stats_erfc( eslNaN);
991 if (! isnan(result)) esl_fatal(msg);
992 if (esl_stats_erfc(-eslINFINITY) != 2.0) esl_fatal(msg);
993 if (esl_stats_erfc( 0.0) != 1.0) esl_fatal(msg);
994 if (esl_stats_erfc( eslINFINITY) != 0.0) esl_fatal(msg);
995
996 for (i = 0; i < 42; i++)
997 {
998 x = esl_random(rng) * 10. - 5.;
999 result = esl_stats_erfc(x);
1000 if (!isfinite(result)) esl_fatal(msg);
1001 #ifdef HAVE_ERFC
1002 if (esl_DCompare(result, erfc(x), 1e-6) != eslOK) esl_fatal(msg);
1003 if (be_verbose)
1004 printf("%15f %15f %15f\n", x, result, erfc(x));
1005 #endif
1006 }
1007
1008 if (be_verbose)
1009 printf("#--------------------------\n");
1010 return;
1011 }
1012
1013 #endif /*eslSTATS_TESTDRIVE*/
1014 /*-------------------- end of unit tests ------------------------*/
1015
1016
1017
1018
1019 /*****************************************************************
1020 * 6. Test driver.
1021 *****************************************************************/
1022 #ifdef eslSTATS_TESTDRIVE
1023 /* gcc -g -Wall -o stats_utest -L. -I. -DeslSTATS_TESTDRIVE esl_stats.c -leasel -lm
1024 * gcc -DHAVE_LIBGSL -O2 -o stats_utest -L. -I. -DeslSTATS_TESTDRIVE esl_stats.c -leasel -lgsl -lm
1025 */
1026 #include <stdio.h>
1027 #include "easel.h"
1028 #include "esl_getopts.h"
1029 #include "esl_random.h"
1030 #include "esl_stats.h"
1031
1032 static ESL_OPTIONS options[] = {
1033 /* name type default env range togs reqs incomp help docgrp */
1034 {"-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show help and usage", 0},
1035 {"-s", eslARG_INT, "42", NULL, NULL, NULL, NULL, NULL, "set random number seed to <n>", 0},
1036 {"-v", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "verbose: show verbose output", 0},
1037 {"-N", eslARG_INT,"10000000", NULL, NULL, NULL, NULL, NULL, "number of trials in LogGamma test", 0},
1038 { 0,0,0,0,0,0,0,0,0,0},
1039 };
1040 static char usage[] = "[-options]";
1041 static char banner[] = "test driver for stats special functions";
1042
1043 int
main(int argc,char ** argv)1044 main(int argc, char **argv)
1045 {
1046 ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage);
1047 ESL_RANDOMNESS *r = esl_randomness_Create(esl_opt_GetInteger(go, "-s"));
1048 int be_verbose = esl_opt_GetBoolean(go, "-v");
1049 int N = esl_opt_GetInteger(go, "-N");
1050
1051 if (be_verbose) printf("seed = %" PRIu32 "\n", esl_randomness_GetSeed(r));
1052
1053 utest_doublesplitting(r);
1054 utest_erfc(r, be_verbose);
1055 utest_LogGamma(r, N, be_verbose);
1056 utest_LinearRegression(r, TRUE, be_verbose);
1057 utest_LinearRegression(r, FALSE, be_verbose);
1058
1059 esl_getopts_Destroy(go);
1060 esl_randomness_Destroy(r);
1061 exit(0);
1062 }
1063 #endif /*eslSTATS_TESTDRIVE*/
1064 /*------------------- end of test driver ------------------------*/
1065
1066
1067
1068
1069 /*****************************************************************
1070 * 7. Examples.
1071 *****************************************************************/
1072
1073 /* Compile: gcc -g -Wall -o example -I. -DeslSTATS_EXAMPLE esl_stats.c esl_random.c easel.c -lm
1074 * or gcc -g -Wall -o example -I. -L. -DeslSTATS_EXAMPLE esl_stats.c -leasel -lm
1075 */
1076 #ifdef eslSTATS_EXAMPLE
1077 /*::cexcerpt::stats_example::begin::*/
1078 /* gcc -g -Wall -o example -I. -DeslSTATS_EXAMPLE esl_stats.c esl_random.c easel.c -lm */
1079 #include <stdio.h>
1080 #include "easel.h"
1081 #include "esl_random.h"
1082 #include "esl_stats.h"
1083
main(void)1084 int main(void)
1085 {
1086 ESL_RANDOMNESS *r = esl_randomness_Create(0);
1087 double a = -3.;
1088 double b = 1.;
1089 double xori = -20.;
1090 double xstep = 1.0;
1091 double setsigma = 1.0; /* sigma on all points */
1092 int n = 100;
1093 double *x = malloc(sizeof(double) * n);
1094 double *y = malloc(sizeof(double) * n);
1095 double *sigma = malloc(sizeof(double) * n);
1096 int i;
1097 double ae, be, siga, sigb, cov_ab, cc, Q;
1098
1099 /* Simulate some linear data, with Gaussian noise added to y_i */
1100 for (i = 0; i < n; i++) {
1101 sigma[i] = setsigma;
1102 x[i] = xori + i*xstep;
1103 y[i] = esl_rnd_Gaussian(r, a + b*x[i], sigma[i]);
1104 }
1105
1106 if (esl_stats_LinearRegression(x, y, sigma, n, &ae, &be, &siga, &sigb, &cov_ab, &cc, &Q) != eslOK)
1107 esl_fatal("linear regression failed");
1108
1109 printf("estimated intercept a = %8.4f [true = %8.4f]\n", ae, a);
1110 printf("estimated slope b = %8.4f [true = %8.4f]\n", be, b);
1111 printf("estimated sigma on a = %8.4f\n", siga);
1112 printf("estimated sigma on b = %8.4f\n", sigb);
1113 printf("estimated cov(a,b) = %8.4f\n", cov_ab);
1114 printf("correlation coeff = %8.4f\n", cc);
1115 printf("P-value = %8.4f\n", Q);
1116
1117 free(x); free(y); free(sigma);
1118 esl_randomness_Destroy(r);
1119 exit(0);
1120 }
1121 /*::cexcerpt::stats_example::end::*/
1122 #endif /* eslSTATS_EXAMPLE */
1123
1124
1125 #ifdef eslSTATS_EXAMPLE2
1126
1127 #include <stdlib.h>
1128
1129 #include "easel.h"
1130 #include "esl_getopts.h"
1131 #include "esl_stats.h"
1132
1133 static ESL_OPTIONS options[] = {
1134 /* name type default env range toggles reqs incomp help docgroup*/
1135 { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 0 },
1136 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1137 };
1138 static char usage[] = "[-options] <ca> <na> <cb> <nb>";
1139 static char banner[] = "example from the stats module: using a G-test";
1140
1141 int
main(int argc,char ** argv)1142 main(int argc, char **argv)
1143 {
1144 ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 4, argc, argv, banner, usage);
1145 int ca = strtol(esl_opt_GetArg(go, 1), NULL, 10);
1146 int na = strtol(esl_opt_GetArg(go, 2), NULL, 10);
1147 int cb = strtol(esl_opt_GetArg(go, 3), NULL, 10);
1148 int nb = strtol(esl_opt_GetArg(go, 4), NULL, 10);
1149 double G, P;
1150 int status;
1151
1152 if (ca > na || cb > nb) esl_fatal("argument order wrong? expect ca, na, cb, nb for ca/na, cb/nb");
1153
1154 if ( (status = esl_stats_GTest(ca, na, cb, nb, &G, &P)) != eslOK) esl_fatal("G-test failed?");
1155 printf("%-10.3g %12.2f\n", P, G);
1156 exit(0);
1157 }
1158 #endif /* eslSTATS_EXAMPLE2 */
1159 /*--------------------- end of examples -------------------------*/
1160
1161