1 /* Statistical routines for Gumbel (type I extreme value) distributions.
2 *
3 * Contents:
4 * 1. Routines for evaluating densities and distributions
5 * 2. Generic API routines: for general interface w/ histogram module
6 * 3. Dumping plots to files
7 * 4. Sampling
8 * 5. ML fitting to complete data
9 * 6. ML fitting to censored data (x_i >= phi; z known)
10 * 7. ML fitting to truncated data (x_i >= phi; z unknown)
11 * 8. Stats driver
12 * 9. Unit tests
13 * 10. Test driver
14 * 11. Example
15 *
16 * To-do:
17 * - ML fitting routines will be prone to over/underfitting
18 * problems for scores outside a "normal" range, because
19 * of exp(-lambda * x) calls. The Lawless ML estimation
20 * may eventually need to be recast in log space.
21 * SRE, Mon Aug 6 13:42:09 2007
22 *
23 */
24 #include "esl_config.h"
25
26 #include <stdio.h>
27 #include <math.h>
28 #include <float.h>
29
30 #include "easel.h"
31 #include "esl_minimizer.h"
32 #include "esl_random.h"
33 #include "esl_stats.h"
34 #include "esl_vectorops.h"
35
36 #include "esl_gumbel.h"
37
38 /*****************************************************************
39 * 1. Routines for evaluating densities and distributions
40 *****************************************************************/
41
42 /* Function: esl_gumbel_pdf()
43 * Synopsis: Returns the probability density at $x$, $P(S=x)$.
44 *
45 * Purpose: Calculates the probability density function for the Gumbel,
46 * $P(X=x)$, given quantile <x> and Gumbel location and
47 * scale parameters <mu> and <lambda>.
48 *
49 * Let $y = \lambda(x-\mu)$; for 64-bit doubles,
50 * useful dynamic range is about $-6.5 <= y <= 710$.
51 * Returns 0.0 for smaller $y$, 0.0 for larger $y$.
52 */
53 double
esl_gumbel_pdf(double x,double mu,double lambda)54 esl_gumbel_pdf(double x, double mu, double lambda)
55 {
56 double y;
57 y = lambda * (x - mu);
58 return (lambda * exp(-y - exp(-y)));
59 }
60
61
62 /* Function: esl_gumbel_logpdf()
63 * Synopsis: Returns the log of the pdf at $x$, $\log P(S=x)$.
64 *
65 * Purpose: Calculates the log probability density function for the Gumbel,
66 * $\log P(X=x)$.
67 *
68 * Let $y = \lambda(x-\mu)$; for 64-bit doubles,
69 * useful dynamic range is about $-708 <= y <= \infty$.
70 * Returns $-\infty$ for smaller or larger $y$.
71 */
72 double
esl_gumbel_logpdf(double x,double mu,double lambda)73 esl_gumbel_logpdf(double x, double mu, double lambda)
74 {
75 double y;
76 y = lambda * (x - mu);
77 return (log(lambda) -y - exp(-y));
78 }
79
80
81 /* Function: esl_gumbel_cdf()
82 * Synopsis: Returns the cumulative distribution at $x$, $P(S \leq x)$.
83 *
84 * Purpose: Calculates the cumulative distribution function
85 * for the Gumbel, $P(X \leq x)$.
86 *
87 * Let $y = \lambda(x-\mu)$; for 64-bit doubles,
88 * useful dynamic range for $y$ is about $-6.5 <= y <=36$.
89 * Returns 0.0 for smaller $y$, 1.0 for larger $y$.
90 */
91 double
esl_gumbel_cdf(double x,double mu,double lambda)92 esl_gumbel_cdf(double x, double mu, double lambda)
93 {
94 double y;
95 y = lambda*(x-mu);
96 return exp(-exp(-y));
97 }
98
99 /* Function: esl_gumbel_logcdf()
100 * Synopsis: Returns the log of the cdf at $x$, $\log P(S \leq x)$.
101 *
102 * Purpose: Calculates the log of the cumulative distribution function
103 * for the Gumbel, $\log P(X \leq x)$.
104 *
105 * Let $y = \lambda(x-\mu)$; for 64-bit doubles,
106 * useful dynamic range for $y$ is about $-708 <= y <= 708$.
107 * Returns $-\infty$ for smaller $y$, 0.0 for larger $y$.
108 */
109 double
esl_gumbel_logcdf(double x,double mu,double lambda)110 esl_gumbel_logcdf(double x, double mu, double lambda)
111 {
112 double y;
113 y = lambda*(x-mu);
114 return (-exp(-y));
115 }
116
117 /* Function: esl_gumbel_surv()
118 * Synopsis: Returns right tail mass above $x$, $P(S > x)$.
119 *
120 * Purpose: Calculates the survivor function, $P(X>x)$ for a Gumbel
121 * (that is, 1-cdf), the right tail's probability mass.
122 *
123 * Let $y=\lambda(x-\mu)$; for 64-bit doubles,
124 * useful dynamic range for $y$ is $-3.6 <= y <= 708$.
125 * Returns 1.0 for $y$ below lower limit, and 0.0
126 * for $y$ above upper limit.
127 */
128 double
esl_gumbel_surv(double x,double mu,double lambda)129 esl_gumbel_surv(double x, double mu, double lambda)
130 {
131 double y = lambda*(x-mu);
132 double ey = -exp(-y);
133
134 /* Use 1-e^x ~ -x approximation here when e^-y is small. */
135 if (fabs(ey) < eslSMALLX1) return -ey;
136 else return 1 - exp(ey);
137 }
138
139 /* Function: esl_gumbel_logsurv()
140 * Synopsis: Returns log survival at $x$, $\log P(S > x)$.
141 *
142 * Purpose: Calculates $\log P(X>x)$ for a Gumbel (that is, $\log$(1-cdf)):
143 * the log of the right tail's probability mass.
144 *
145 * Let $y=\lambda(x-\mu)$; for 64-bit doubles,
146 * useful dynamic range for $y$ is $-6.5 <= y <= \infty$.
147 * Returns 0.0 for smaller $y$.
148 */
149 double
esl_gumbel_logsurv(double x,double mu,double lambda)150 esl_gumbel_logsurv(double x, double mu, double lambda)
151 {
152 double y = lambda*(x-mu);
153 double ey = -exp(-y);
154
155 /* The real calculation is log(1-exp(-exp(-y))).
156 * For "large" y, -exp(-y) is small, so 1-exp(-exp(-y) ~ exp(-y),
157 * and log of that gives us -y.
158 * For "small y", exp(-exp(-y) is small, and we can use log(1-x) ~ -x.
159 */
160 if (fabs(ey) < eslSMALLX1) return -y;
161 else if (fabs(exp(ey)) < eslSMALLX1) return -exp(ey);
162 else return log(1-exp(ey));
163 }
164
165 /* Function: esl_gumbel_invcdf()
166 *
167 * Purpose: Calculates the inverse CDF for a Gumbel distribution
168 * with parameters <mu> and <lambda>. That is, returns
169 * the quantile <x> at which the CDF is <p>.
170 */
171 double
esl_gumbel_invcdf(double p,double mu,double lambda)172 esl_gumbel_invcdf(double p, double mu, double lambda)
173 {
174 return mu - ( log(-1. * log(p)) / lambda);
175 }
176
177 /* Function: esl_gumbel_invsurv()
178 *
179 * Purpose: Calculates the score at which the right tail's mass
180 * is p, for a Gumbel distribution
181 * with parameters <mu> and <lambda>. That is, returns
182 * the quantile <x> at which 1-CDF is <p>.
183 */
184 double
esl_gumbel_invsurv(double p,double mu,double lambda)185 esl_gumbel_invsurv(double p, double mu, double lambda)
186 {
187 /* The real calculation is mu - ( log(-1. * log(1-p)) / lambda).
188 * But there's a problem with small p:
189 * for p<1e-15, 1-p will be viewed as 1, so
190 * log ( -log(1-p) ) == log (0) -> inf
191 * Instead, use two approximations;
192 * (1) log( 1-p) ~= -p for small p (e.g. p<0.001)
193 * so log(-1. * log(1-p)) ~= log(p)
194 * (2) log (p) ~= (p^p - 1) / p
195 *
196 * See notes Mar 1, 2010.
197 */
198 double log_part;
199 if (p < eslSMALLX1) {
200 log_part = (pow(p,p) - 1 ) / p;
201 } else {
202 log_part = log(-1. * log(1-p));
203 }
204
205 //test 2
206
207 return mu - ( log_part / lambda);
208 }
209 /*------------------ end of densities and distributions --------------------*/
210
211
212 /*****************************************************************
213 * 2. Generic API routines: for general interface w/ histogram module
214 *****************************************************************/
215
216 /* Function: esl_gumbel_generic_pdf()
217 *
218 * Purpose: Generic-API version of PDF function.
219 */
220 double
esl_gumbel_generic_pdf(double p,void * params)221 esl_gumbel_generic_pdf(double p, void *params)
222 {
223 double *v = (double *) params;
224 return esl_gumbel_pdf(p, v[0], v[1]);
225 }
226
227 /* Function: esl_gumbel_generic_cdf()
228 *
229 * Purpose: Generic-API version of CDF function.
230 */
231 double
esl_gumbel_generic_cdf(double x,void * params)232 esl_gumbel_generic_cdf(double x, void *params)
233 {
234 double *p = (double *) params;
235 return esl_gumbel_cdf(x, p[0], p[1]);
236 }
237
238 /* Function: esl_gumbel_generic_surv()
239 *
240 * Purpose: Generic-API version of survival function.
241 */
242 double
esl_gumbel_generic_surv(double p,void * params)243 esl_gumbel_generic_surv(double p, void *params)
244 {
245 double *v = (double *) params;
246 return esl_gumbel_surv(p, v[0], v[1]);
247 }
248
249 /* Function: esl_gumbel_generic_invcdf()
250 *
251 * Purpose: Generic-API version of inverse CDF.
252 */
253 double
esl_gumbel_generic_invcdf(double p,void * params)254 esl_gumbel_generic_invcdf(double p, void *params)
255 {
256 double *v = (double *) params;
257 return esl_gumbel_invcdf(p, v[0], v[1]);
258 }
259
260
261 /*------------------------- end of generic API --------------------------*/
262
263
264
265 /****************************************************************************
266 * 3. Routines for dumping plots for files
267 ****************************************************************************/
268
269 /* Function: esl_gumbel_Plot()
270 * Synopsis: Plot a Gumbel function in XMGRACE XY format.
271 *
272 * Purpose: Plot a Gumbel function <func> (for instance,
273 * <esl_gumbel_pdf()>) for parameters <mu> and <lambda>, for
274 * a range of quantiles x from <xmin> to <xmax> in steps of <xstep>;
275 * output to an open stream <fp> in xmgrace XY input format.
276 *
277 * Returns: <eslOK> on success.
278 *
279 * Throws: <eslEWRITE> on any system write error, such as filled disk.
280 */
281 int
esl_gumbel_Plot(FILE * fp,double mu,double lambda,double (* func)(double x,double mu,double lambda),double xmin,double xmax,double xstep)282 esl_gumbel_Plot(FILE *fp, double mu, double lambda,
283 double (*func)(double x, double mu, double lambda),
284 double xmin, double xmax, double xstep)
285 {
286 double x;
287 for (x = xmin; x <= xmax; x += xstep)
288 if (fprintf(fp, "%f\t%g\n", x, (*func)(x, mu, lambda)) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "gumbel plot write failed");
289 if (fprintf(fp, "&\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "gumbel plot write failed");
290 return eslOK;
291 }
292 /*-------------------- end plot dumping routines ---------------------------*/
293
294
295
296 /*****************************************************************
297 * 4. Routines for sampling
298 *****************************************************************/
299
300 /* Function: esl_gumbel_Sample()
301 * Synopsis: Return a Gumbel-distributed random sample $x$.
302 *
303 * Purpose: Sample a Gumbel-distributed random variate
304 * by the transformation method.
305 */
306 double
esl_gumbel_Sample(ESL_RANDOMNESS * r,double mu,double lambda)307 esl_gumbel_Sample(ESL_RANDOMNESS *r, double mu, double lambda)
308 {
309 double p;
310 p = esl_rnd_UniformPositive(r);
311 return esl_gumbel_invcdf(p, mu, lambda);
312 }
313 /*------------------------ end of sampling --------------------------------*/
314
315
316
317 /*****************************************************************
318 * 5. Maximum likelihood fitting to complete data
319 *****************************************************************/
320
321 /* lawless416()
322 * SRE, Thu Nov 13 11:48:50 1997 [St. Louis]
323 *
324 * Purpose: Equation 4.1.6 from [Lawless82], pg. 143, and
325 * its first derivative with respect to lambda,
326 * for finding the ML fit to Gumbel lambda parameter.
327 * This equation gives a result of zero for the maximum
328 * likelihood lambda.
329 *
330 * Args: x - array of sample values
331 * n - number of samples
332 * lambda - a lambda to test
333 * ret_f - RETURN: 4.1.6 evaluated at lambda
334 * ret_df - RETURN: first derivative of 4.1.6 evaluated at lambda
335 *
336 * Return: (void)
337 */
338 static void
lawless416(double * x,int n,double lambda,double * ret_f,double * ret_df)339 lawless416(double *x, int n, double lambda, double *ret_f, double *ret_df)
340 {
341 double esum; /* \sum e^(-lambda xi) */
342 double xesum; /* \sum xi e^(-lambda xi) */
343 double xxesum; /* \sum xi^2 e^(-lambda xi) */
344 double xsum; /* \sum xi */
345 int i;
346
347 esum = xesum = xsum = xxesum = 0.;
348 for (i = 0; i < n; i++)
349 {
350 xsum += x[i];
351 xesum += x[i] * exp(-1. * lambda * x[i]);
352 xxesum += x[i] * x[i] * exp(-1. * lambda * x[i]);
353 esum += exp(-1. * lambda * x[i]);
354 }
355 *ret_f = (1./lambda) - (xsum / n) + (xesum / esum);
356 *ret_df = ((xesum / esum) * (xesum / esum))
357 - (xxesum / esum)
358 - (1. / (lambda * lambda));
359 }
360
361 /* Function: esl_gumbel_FitComplete()
362 * Synopsis: Estimates $\mu$, $\lambda$ from complete data.
363 *
364 * Purpose: Given an array of Gumbel-distributed samples
365 * <x[0]..x[n-1]>, find maximum likelihood parameters <mu>
366 * and <lambda>.
367 *
368 * The number of samples <n> must be reasonably large to get
369 * an accurate fit. <n=100> suffices to get an accurate
370 * location parameter $\mu$ (to about 1% error), but
371 * <n~10000> is required to get a similarly accurate
372 * estimate of $\lambda$. It's probably a bad idea to try to
373 * fit a Gumbel to less than about 1000 data points.
374 *
375 * On a very small number of samples, the fit can fail
376 * altogether, in which case the routine will return a
377 * <eslENORESULT> code. Caller must check for this.
378 *
379 * Uses approach described in [Lawless82]. Solves for lambda
380 * using Newton/Raphson iterations, then substitutes lambda
381 * into Lawless' equation 4.1.5 to get mu.
382 *
383 * Args: x - list of Gumbel distributed samples
384 * n - number of samples (n>1)
385 * ret_mu - RETURN: ML estimate of mu
386 * ret_lambda - RETURN: ML estimate of lambda
387 *
388 * Returns: <eslOK> on success.
389 *
390 * <eslEINVAL> if n<=1.
391 * <eslENORESULT> if the fit fails, likely because the
392 * number of samples is too small. On either error,
393 * <*ret_mu> and <*ret_lambda> are 0.0. These are classed
394 * as failures (normal errors) because the data vector may
395 * have been provided by a user.
396 */
397 int
esl_gumbel_FitComplete(double * x,int n,double * ret_mu,double * ret_lambda)398 esl_gumbel_FitComplete(double *x, int n, double *ret_mu, double *ret_lambda)
399 {
400 double variance;
401 double lambda, mu;
402 double fx; /* f(x) */
403 double dfx; /* f'(x) */
404 double esum; /* \sum e^(-lambda xi) */
405 double tol = 1e-5;
406 int i;
407 int status;
408
409 if (n <= 1) { status = eslEINVAL; goto FAILURE; }
410
411 /* 1. Find an initial guess at lambda
412 * (Evans/Hastings/Peacock, Statistical Distributions, 2000, p.86)
413 */
414 esl_stats_DMean(x, n, NULL, &variance);
415 lambda = eslCONST_PI / sqrt(6.*variance);
416
417 /* 2. Use Newton/Raphson to solve Lawless 4.1.6 and find ML lambda
418 */
419 for (i = 0; i < 100; i++)
420 {
421 lawless416(x, n, lambda, &fx, &dfx);
422 if (fabs(fx) < tol) break; /* success */
423 lambda = lambda - fx / dfx; /* Newton/Raphson is simple */
424 if (lambda <= 0.) lambda = 0.001; /* but be a little careful */
425 }
426
427 /* 2.5: If we did 100 iterations but didn't converge, Newton/Raphson failed.
428 * Resort to a bisection search. Worse convergence speed
429 * but guaranteed to converge (unlike Newton/Raphson).
430 * We assume that fx is a monotonically decreasing function of x;
431 * i.e. fx > 0 if we are left of the root, fx < 0 if we
432 * are right of the root.
433 */
434 if (i == 100)
435 {
436 double left, right, mid;
437 ESL_DPRINTF2(("esl_gumbel_FitComplete(): Newton/Raphson failed; switchover to bisection\n"));
438
439 /* First bracket the root */
440 left = 0.; /* for sure */
441 right = eslCONST_PI / sqrt(6.*variance); /* an initial guess */
442 lawless416(x, n, lambda, &fx, &dfx);
443 while (fx > 0.)
444 {
445 right *= 2.; /* arbitrary leap to the right */
446 if (right > 1000.) /* no reasonable lambda should be > 1000, we assert */
447 {
448 ESL_DPRINTF2(("Failed to bracket root in esl_gumbel_FitComplete()."));
449 status = eslENORESULT;
450 goto FAILURE;
451 }
452
453 lawless416(x, n, right, &fx, &dfx);
454 }
455
456 /* Now, bisection search in left/right interval */
457 for (i = 0; i < 100; i++)
458 {
459 mid = (left + right) / 2.;
460 lawless416(x, n, mid, &fx, &dfx);
461 if (fabs(fx) < tol) break; /* success */
462 if (fx > 0.) left = mid;
463 else right = mid;
464 }
465
466 /* Too many iterations? Give up. */
467 if (i == 100)
468 {
469 ESL_DPRINTF2(("Even bisection search failed in esl_gumbel_FitComplete().\n"));
470 status = eslENORESULT;
471 goto FAILURE;
472 }
473
474 lambda = mid;
475 }
476
477 /* 3. Substitute into Lawless 4.1.5 to find mu
478 */
479 esum = 0.;
480 for (i = 0; i < n; i++)
481 esum += exp(-lambda * x[i]);
482 mu = -log(esum / n) / lambda;
483
484 *ret_lambda = lambda;
485 *ret_mu = mu;
486 return eslOK;
487
488 FAILURE:
489 *ret_mu = 0.0;
490 *ret_lambda = 0.0;
491 return status;
492 }
493
494 /* Function: esl_gumbel_FitCompleteLoc()
495 * Synopsis: Estimates $\mu$ from complete data, given $\lambda$.
496 *
497 * Purpose: Given an array of Gumbel-distributed samples
498 * <x[0]..x[n-1]> (complete data), and a known
499 * (or otherwise fixed) <lambda>, find a maximum
500 * likelihood estimate for location parameter <mu>.
501 *
502 * Algorithm is a straightforward simplification of
503 * <esl_gumbel_FitComplete()>.
504 *
505 * Args: x - list of Gumbel distributed samples
506 * n - number of samples
507 * lambda - known lambda (scale) parameter
508 * ret_mu : RETURN: ML estimate of mu
509 *
510 * Returns: <eslOK> on success.
511 *
512 * <eslEINVAL> if n<=1; on this error, <*ret_mu> = 0.
513 *
514 * Throws: (no abnormal error conditions)
515 */
516 int
esl_gumbel_FitCompleteLoc(double * x,int n,double lambda,double * ret_mu)517 esl_gumbel_FitCompleteLoc(double *x, int n, double lambda, double *ret_mu)
518 {
519 double esum;
520 int i;
521 int status;
522
523 if (n <= 1) { status = eslEINVAL; goto FAILURE; }
524
525 /* Substitute into Lawless 4.1.5 to find mu */
526 esum = 0.;
527 for (i = 0; i < n; i++)
528 esum += exp(-lambda * x[i]);
529 *ret_mu = -log(esum / n) / lambda;
530 return eslOK;
531
532 #if 0
533 /* Replace the code above w/ code below to test the direct method. */
534 double mean, variance;
535 esl_stats_DMean(x, n, &mean, &variance);
536 *ret_mu = mean - 0.57722/lambda;
537 return eslOK;
538 #endif
539
540 FAILURE:
541 *ret_mu = 0.;
542 return status;
543 }
544
545
546 #if eslDEBUGLEVEL >=3
547 /* direct_mv_fit()
548 * SRE, Wed Jun 29 08:23:47 2005
549 *
550 * Purely for curiousity: a complete data fit using the
551 * simple direct method, calculating mu and lambda from mean
552 * and variance.
553 */
554 static int
direct_mv_fit(double * x,int n,double * ret_mu,double * ret_lambda)555 direct_mv_fit(double *x, int n, double *ret_mu, double *ret_lambda)
556 {
557 double mean, variance;
558
559 esl_stats_DMean(x, n, &mean, &variance);
560 *ret_lambda = eslCONST_PI / sqrt(6.*variance);
561 *ret_mu = mean - 0.57722/(*ret_lambda);
562 return eslOK;
563 }
564 #endif
565
566 /*------------------- end of complete data fit ---------------------------------*/
567
568
569 /*****************************************************************
570 * 6. Maximum likelihood fitting to censored data (x_i >= phi; z known)
571 *****************************************************************/
572
573 /* lawless422()
574 * SRE, Mon Nov 17 09:42:48 1997 [St. Louis]
575 *
576 * Purpose: Equation 4.2.2 from [Lawless82], pg. 169, and
577 * its first derivative with respect to lambda,
578 * for finding the ML fit to Gumbel lambda parameter
579 * for Type I censored data.
580 * This equation gives a result of zero for the maximum
581 * likelihood lambda.
582 *
583 * Args: x - array of observed sample values
584 * n - number of observed samples
585 * z - number of censored samples = N-n
586 * phi - censoring value; all observed x_i >= phi
587 * lambda - a lambda to test
588 * ret_f - RETURN: 4.2.2 evaluated at lambda
589 * ret_df - RETURN: first derivative of 4.2.2 evaluated at lambda
590 *
591 * Return: (void)
592 */
593 static void
lawless422(double * x,int n,int z,double phi,double lambda,double * ret_f,double * ret_df)594 lawless422(double *x, int n, int z, double phi,
595 double lambda, double *ret_f, double *ret_df)
596 {
597 double esum; /* \sum e^(-lambda xi) + z term */
598 double xesum; /* \sum xi e^(-lambda xi) + z term */
599 double xxesum; /* \sum xi^2 e^(-lambda xi) + z term */
600 double xsum; /* \sum xi (no z term) */
601 int i;
602
603 esum = xesum = xsum = xxesum = 0.;
604 for (i = 0; i < n; i++)
605 {
606 xsum += x[i];
607 esum += exp(-1. * lambda * x[i]);
608 xesum += x[i] * exp(-1. * lambda * x[i]);
609 xxesum += x[i] * x[i] * exp(-1. * lambda * x[i]);
610 }
611
612 /* Add z terms for censored data
613 */
614 esum += (double) z * exp(-1. * lambda * phi);
615 xesum += (double) z * phi * exp(-1. * lambda * phi);
616 xxesum += (double) z * phi * phi * exp(-1. * lambda * phi);
617
618 *ret_f = 1./lambda - xsum / n + xesum / esum;
619 *ret_df = ((xesum / esum) * (xesum / esum))
620 - (xxesum / esum)
621 - (1. / (lambda * lambda));
622
623 return;
624 }
625
626 /* Function: esl_gumbel_FitCensored()
627 * Synopsis: Estimates $\mu$, $\lambda$ from censored data.
628 *
629 * Purpose: Given a left-censored array of Gumbel-distributed samples
630 * <x[0]..x[n-1]>, the number of censored samples <z>, and
631 * the censoring value <phi> (all <x[i]> $\geq$ <phi>). Find
632 * maximum likelihood parameters <mu> and <lambda>.
633 *
634 * Algorithm: Uses approach described in [Lawless82]. Solves
635 * for lambda using Newton/Raphson iterations;
636 * then substitutes lambda into Lawless' equation 4.2.3
637 * to get mu.
638 *
639 * Args: x - array of Gumbel-distributed samples, 0..n-1
640 * n - number of observed samples
641 * z - number of censored samples
642 * phi - censoring value (all x_i >= phi)
643 * ret_mu - RETURN: ML estimate of mu
644 * ret_lambda - RETURN: ML estimate of lambda
645 *
646 * Returns: <eslOK> on success.
647 *
648 * <eslEINVAL> if n<=1.
649 * <eslENORESULT> if the fit fails, likey because the number
650 * of samples is too small.
651 * On either error, <*ret_mu> and <*ret_lambda> are 0.0.
652 * These are classed as failures (normal errors) because the
653 * data vector may have been provided by a user.
654 */
655 int
esl_gumbel_FitCensored(double * x,int n,int z,double phi,double * ret_mu,double * ret_lambda)656 esl_gumbel_FitCensored(double *x, int n, int z, double phi, double *ret_mu, double *ret_lambda)
657 {
658 double variance;
659 double lambda, mu;
660 double fx; /* f(x) */
661 double dfx; /* f'(x) */
662 double esum; /* \sum e^(-lambda xi) */
663 double tol = 1e-5;
664 int i;
665 int status;
666
667 if (n <= 1) { status = eslEINVAL; goto FAILURE; }
668
669 /* 1. Find an initial guess at lambda
670 * (Evans/Hastings/Peacock, Statistical Distributions, 2000, p.86)
671 */
672 esl_stats_DMean(x, n, NULL, &variance);
673 lambda = eslCONST_PI / sqrt(6.*variance);
674
675 /* 2. Use Newton/Raphson to solve Lawless 4.2.2 and find ML lambda
676 */
677 for (i = 0; i < 100; i++)
678 {
679 lawless422(x, n, z, phi, lambda, &fx, &dfx);
680 if (fabs(fx) < tol) break; /* success */
681 lambda = lambda - fx / dfx; /* Newton/Raphson is simple */
682 if (lambda <= 0.) lambda = 0.001; /* but be a little careful */
683 }
684
685 /* 2.5: If we did 100 iterations but didn't converge, Newton/Raphson failed.
686 * Resort to a bisection search. Worse convergence speed
687 * but guaranteed to converge (unlike Newton/Raphson).
688 * We assume (!?) that fx is a monotonically decreasing function of x;
689 * i.e. fx > 0 if we are left of the root, fx < 0 if we
690 * are right of the root.
691 */
692 if (i == 100)
693 {
694 double left, right, mid;
695 ESL_DPRINTF2(("esl_gumbel_FitCensored(): Newton/Raphson failed; switched to bisection\n"));
696
697 /* First bracket the root */
698 left = 0.; /* we know that's the left bound */
699 right = eslCONST_PI / sqrt(6.*variance); /* start from here, move "right"... */
700 lawless422(x, n, z, phi, right, &fx, &dfx);
701 while (fx > 0.)
702 {
703 right *= 2.;
704 if (right > 1000.) /* no reasonable lambda should be > 1000, we assert */
705 {
706 ESL_DPRINTF2(("Failed to bracket root in esl_gumbel_FitCensored()."));
707 status = eslENORESULT;
708 goto FAILURE;
709 }
710 lawless422(x, n, z, phi, right, &fx, &dfx);
711 }
712
713 /* Now we bisection search in left/right interval */
714 for (i = 0; i < 100; i++)
715 {
716 mid = (left + right) / 2.;
717 lawless422(x, n, z, phi, mid, &fx, &dfx);
718 if (fabs(fx) < tol) break; /* success */
719 if (fx > 0.) left = mid;
720 else right = mid;
721 }
722 if (i == 100)
723 {
724 ESL_DPRINTF2(("Even bisection search failed in esl_gumbel_FitCensored().\n"));
725 status = eslENORESULT;
726 goto FAILURE;
727 }
728 lambda = mid;
729 }
730
731 /* 3. Substitute into Lawless 4.2.3 to find mu
732 */
733 esum = 0.;
734 for (i = 0; i < n; i++)
735 esum += exp(-lambda * x[i]);
736 esum += z * exp(-1. * lambda * phi); /* term from censored data */
737 mu = -log(esum / n) / lambda;
738
739 *ret_lambda = lambda;
740 *ret_mu = mu;
741 return eslOK;
742
743 FAILURE:
744 *ret_lambda = 0.0;
745 *ret_mu = 0.0;
746 return status;
747 }
748
749
750 /* Function: esl_gumbel_FitCensoredLoc()
751 * Synopsis: Estimates $\mu$ from censored data, given $\lambda$.
752 *
753 * Purpose: Given a left-censored array of Gumbel distributed samples
754 * <x[0>..x[n-1]>, the number of censored samples <z>, and the censoring
755 * value <phi> (where all <x[i]> $\geq$ <phi>), and a known
756 * (or at least fixed) <lambda>;
757 * find the maximum likelihood estimate of the location
758 * parameter $\mu$ and return it in <ret_mu>.
759 *
760 * Note: A straightforward simplification of FitCensored().
761 *
762 * Args: x - array of Gumbel-distributed samples, 0..n-1
763 * n - number of observed samples
764 * z - number of censored samples
765 * phi - censoring value (all x_i >= phi)
766 * lambda - known scale parameter $\lambda$
767 * ret_mu - RETURN: ML estimate of $\mu$
768 *
769 * Returns: <eslOK> on success.
770 *
771 * <eslEINVAL> if n<=1; on this error, <*ret_mu> = 0.
772 *
773 * Throws: (no abnormal error conditions)
774 */
775 int
esl_gumbel_FitCensoredLoc(double * x,int n,int z,double phi,double lambda,double * ret_mu)776 esl_gumbel_FitCensoredLoc(double *x, int n, int z, double phi, double lambda,
777 double *ret_mu)
778 {
779 double esum;
780 int i;
781 int status;
782
783 if (n <= 1) { status = eslEINVAL; goto FAILURE; }
784
785 /* Immediately substitute into Lawless 4.2.3 to find mu, because
786 * lambda is known.
787 */
788 esum = 0.;
789 for (i = 0; i < n; i++) /* contribution from observed data */
790 esum += exp(-lambda * x[i]);
791 esum += z * exp(-1. * lambda * phi); /* term from censored data */
792
793 *ret_mu = -log(esum / (double) n) / lambda;
794 return eslOK;
795
796 FAILURE:
797 *ret_mu = 0.;
798 return status;
799 }
800
801
802 /*****************************************************************
803 * 7. Maximum likelihood fitting to truncated data (x_i >= phi and z unknown)
804 *****************************************************************/
805
806 /* Easel's conjugate gradient descent code allows a single void ptr to
807 * point to any necessary fixed data, so we'll put everything into one
808 * structure:
809 */
810 struct tevd_data {
811 double *x; /* data: n observed samples from a Gumbel */
812 int n; /* number of observed samples */
813 double phi; /* truncation threshold: all observed x_i >= phi */
814 };
815
816 /* tevd_func()
817 *
818 * Called by the optimizer: evaluate the objective function
819 * for the negative posterior log probability of a particular choice
820 * of parameters mu and lambda, given truncated Gumbel samples.
821 */
822 static double
tevd_func(double * p,int nparam,void * dptr)823 tevd_func(double *p, int nparam, void *dptr)
824 {
825 double mu, w, lambda;
826 struct tevd_data *data;
827 double *x;
828 int n;
829 double phi;
830 double logL;
831 int i;
832
833 /* unpack what the optimizer gave us; nparam==2 always
834 */
835 mu = p[0];
836 w = p[1];
837 lambda = exp(w);
838 data = (struct tevd_data *) dptr;
839 x = data->x;
840 n = data->n;
841 phi = data->phi;
842
843 /* The log likelihood equation
844 */
845 logL = n * log(lambda);
846 for (i = 0; i < n; i++)
847 logL -= lambda * (x[i] - mu);
848 for (i = 0; i < n; i++)
849 logL -= exp(-1. * lambda * (x[i] - mu));
850 logL -= n * esl_gumbel_logsurv(phi, mu, lambda);
851
852 return -1.0 * logL; /* objective: minimize the NLP */
853 }
854
855 /* tevd_grad()
856 *
857 * Called by the optimizer: evaluate the gradient of the objective
858 * function (the negative posterior log probability of the parameters
859 * mu and w, where w = log(lambda), at a particular choice of mu and
860 * lambda.
861 */
862 static void
tevd_grad(double * p,int nparam,void * dptr,double * dp)863 tevd_grad(double *p, int nparam, void *dptr, double *dp)
864 {
865 double mu, lambda, w;
866 struct tevd_data *data;
867 double *x;
868 int n;
869 double phi;
870 double dmu, dw;
871 double coeff;
872 int i;
873
874 /* unpack what the optimizer gave us; nparam==2 always
875 */
876 mu = p[0];
877 w = p[1];
878 lambda = exp(w);
879 data = (struct tevd_data *) dptr;
880 x = data->x;
881 n = data->n;
882 phi = data->phi;
883
884 /* Both partials include a coefficient that
885 * basically looks like P(S=phi) / P(S>=phi); pre-calculate it.
886 * Watch out when phi >> mu, which'll give us 0/0; instead,
887 * recognize that for phi >> mu, coeff converges to \lambda.
888 */
889 if (lambda*(phi-mu) > 50.) /* arbitrary crossover. */
890 coeff = lambda;
891 else
892 coeff = esl_gumbel_pdf(phi, mu, lambda) / esl_gumbel_surv(phi, mu, lambda);
893
894 /* Partial derivative w.r.t. mu.
895 */
896 dmu = n * lambda;
897 for (i = 0; i < n; i++)
898 dmu -= lambda * exp(-1. * lambda * (x[i] - mu));
899 dmu -= n * coeff;
900
901 /* Partial derivative w.r.t. w=log(lambda).
902 */
903 dw = n;
904 for (i = 0; i < n; i++) dw -= (x[i] - mu) * lambda;
905 for (i = 0; i < n; i++) dw += (x[i] - mu) * lambda * exp(-1. * lambda * (x[i] - mu));
906 dw += n * (phi - mu) * coeff;
907
908 /* Return the negative, because we're minimizing NLP, not maximizing.
909 */
910 dp[0] = -1. * dmu; /* negative because we're minimizing NLP, not maximizing */
911 dp[1] = -1. * dw;
912 return;
913 }
914
915 /* Function: esl_gumbel_FitTruncated()
916 * Synopsis: Estimates $\mu$, $\lambda$ from truncated data.
917 *
918 * Purpose: Given a left-truncated array of Gumbel-distributed
919 * samples <x[0]..x[n-1]> and the truncation threshold
920 * <phi> (such that all <x[i]> $\geq$ <phi>).
921 * Find maximum likelihood parameters <mu> and <lambda>.
922 *
923 * <phi> should not be much greater than <mu>, the
924 * mode of the Gumbel, or the fit will become unstable
925 * or may even fail to converge. The problem is
926 * that for <phi> $>$ <mu>, the tail of the Gumbel
927 * becomes a scale-free exponential, and <mu> becomes
928 * undetermined.
929 *
930 * Algorithm: Uses conjugate gradient descent to optimize the
931 * log likelihood of the data. Follows a general
932 * approach to fitting missing data problems outlined
933 * in [Gelman95].
934 *
935 * Args: x - observed data samples [0..n-1]
936 * n - number of samples
937 * phi - truncation threshold; all x[i] >= phi
938 * ret_mu - RETURN: ML estimate of mu
939 * ret_lambda - RETURN: ML estimate of lambda
940 *
941 * Returns: <eslOK> on success.
942 *
943 * <eslEINVAL> if n<=1.
944 * <eslENORESULT> if the fit fails, likely because the
945 * number of samples <n> is too small, or because the
946 * truncation threshold is high enough that the tail
947 * looks like a scale-free exponential and we can't
948 * obtain <mu>.
949 * On either error, <*ret_mu> and <*ret_lambda> are
950 * returned as 0.0.
951 * These are "normal" (returned) errors because
952 * the data might be provided directly by a user.
953 *
954 * Throws: <eslEMEM> on allocation error.
955 */
956 int
esl_gumbel_FitTruncated(double * x,int n,double phi,double * ret_mu,double * ret_lambda)957 esl_gumbel_FitTruncated(double *x, int n, double phi, double *ret_mu, double *ret_lambda)
958 {
959 ESL_MIN_CFG *cfg = NULL; /* customization of the CG optimizer */
960 struct tevd_data data;
961 double p[2]; /* mu, w; lambda = e^w */
962 double mean, variance;
963 double mu, lambda;
964 double fx;
965 int i;
966 int status;
967
968 /* customization of the optimizer */
969 if ((cfg = esl_min_cfg_Create(2)) == NULL) { status = eslEMEM; goto ERROR; }
970 cfg->u[0] = 2.0;
971 cfg->u[1] = 0.1;
972 cfg->cg_rtol = 1e-4;
973
974 /* Can't fit to n<=1 */
975 if (n <= 1) { status = eslEINVAL; goto ERROR; }
976
977 /* Can fail on small <n>. One way is if x_i are all identical, so
978 * ML lambda is undefined.
979 */
980 for (i = 1; i < n; i++) if (x[i] != x[0]) break;
981 if (i == n) { status = eslENORESULT; goto ERROR; }
982
983 data.x = x;
984 data.n = n;
985 data.phi = phi;
986
987 /* The source of the following magic is Evans/Hastings/Peacock,
988 * Statistical Distributions, 3rd edition (2000), p.86, which gives
989 * eq's for the mean and variance of a Gumbel in terms of mu and lambda;
990 * we turn them around to get mu and lambda in terms of the mean and variance.
991 * These would be reasonable estimators if we had a full set of Gumbel
992 * distributed variates. They'll be off for a truncated sample, but
993 * close enough to be a useful starting point.
994 */
995 esl_stats_DMean(x, n, &mean, &variance);
996 lambda = eslCONST_PI / sqrt(6.*variance);
997 mu = mean - 0.57722/lambda;
998
999 p[0] = mu;
1000 p[1] = log(lambda); /* c.o.v. because lambda is constrained to >0 */
1001
1002 /* Pass the problem to the optimizer. The work is done by the
1003 * equations in tevd_func() and tevd_grad().
1004 */
1005 status = esl_min_ConjugateGradientDescent(cfg, p, 2,
1006 &tevd_func, &tevd_grad,(void *)(&data),
1007 &fx, NULL);
1008 if (status == eslENOHALT) { status = eslENORESULT; goto ERROR; }
1009 else if (status != eslOK) goto ERROR;
1010
1011 esl_min_cfg_Destroy(cfg);
1012 *ret_mu = p[0];
1013 *ret_lambda = exp(p[1]); /* reverse the c.o.v. */
1014 return status;
1015
1016 ERROR:
1017 esl_min_cfg_Destroy(cfg);
1018 *ret_mu = 0.0;
1019 *ret_lambda = 0.0;
1020 return status;
1021 }
1022 /*------------------------ end of fitting --------------------------------*/
1023
1024 /*****************************************************************
1025 * 8. Stats driver
1026 *****************************************************************/
1027 #ifdef eslGUMBEL_STATS
1028 /* compile: gcc -g -O2 -Wall -I. -L. -o stats -DeslGUMBEL_STATS esl_gumbel.c -leasel -lm
1029 * run: ./stats > stats.out
1030 * process w/ lines like:
1031 * grep "complete 100" stats.out | awk '{$i = 100*($5-$4)/$4; if ($i < 0) $i = -$i; print $i}' | avg
1032 * grep "complete 100" stats.out | awk '{$i = 100*($7-$6)/$6; if ($i < 0) $i = -$i; print $i}' | avg
1033 * to get accuracy summary (in %) for mu, lambda; first part of the grep pattern may be "complete", "censored", or
1034 * "truncated", second part may be " 100", " 1000", " 10000", or " 100000".
1035 *
1036 * This is the routine that collects the accuracy statistics that appear
1037 * in tables in the Gumbel chapter of the guide, esl_gumbel.tex.
1038 */
1039 #include <stdio.h>
1040
1041 #include "easel.h"
1042 #include "esl_random.h"
1043 #include "esl_minimizer.h"
1044 #include "esl_gumbel.h"
1045
1046 int
main(int argc,char ** argv)1047 main(int argc, char **argv)
1048 {
1049 ESL_RANDOMNESS *r;
1050 int totalN[4] = {100, 1000, 10000, 100000}; /*biggest last; one malloc*/
1051 int nexps = 4;
1052 int exp;
1053 int trial, ntrials;
1054 double phi; /* truncation threshold. */
1055 int i;
1056 int n;
1057 double *x;
1058 double mu, lambda;
1059 double est_mu, est_lambda;
1060 double val;
1061 int do_complete, do_censored, do_truncated, do_location;
1062
1063 ntrials = 500;
1064 mu = -20.0;
1065 lambda = 0.693;
1066 phi = -15.;
1067
1068 do_complete = TRUE; /* Flip these on/off as desired */
1069 do_censored = FALSE;
1070 do_truncated = FALSE;
1071 do_location = FALSE;
1072
1073 r = esl_randomness_Create(0);
1074 x = malloc(sizeof(double) * totalN[nexps-1]);
1075
1076 /* Fitting to simulated complete datasets
1077 */
1078 if (do_complete) {
1079 for (exp = 0; exp < nexps; exp++)
1080 {
1081 for (trial = 0; trial < ntrials; trial++)
1082 {
1083 for (i = 0; i < totalN[exp]; i++)
1084 x[i] = esl_gumbel_Sample(r, mu, lambda);
1085
1086 /*direct_mv_fit(x, totalN[exp], &est_mu, &est_lambda);*/
1087 if (esl_gumbel_FitComplete(x, totalN[exp], &est_mu, &est_lambda) != eslOK)
1088 esl_fatal("gumbel complete fit fit failed");
1089
1090 printf("complete %6d %6d %9.5f %9.5f %8.6f %8.6f\n",
1091 totalN[exp], totalN[exp], mu, est_mu, lambda, est_lambda);
1092 }
1093 printf("\n");
1094 }
1095 }
1096
1097 /* Fitting to simulated censored datasets
1098 */
1099 if (do_censored) {
1100 for (exp = 0; exp < nexps; exp++)
1101 {
1102 for (trial = 0; trial < ntrials; trial++)
1103 {
1104 for (n = 0, i = 0; i < totalN[exp]; i++)
1105 {
1106 val = esl_gumbel_Sample(r, mu, lambda);
1107 if (val >= phi) x[n++] = val;
1108 }
1109 if (esl_gumbel_FitCensored(x, n, totalN[exp]-n, phi, &est_mu, &est_lambda) != eslOK)
1110 esl_fatal("gumbel censored fit failed");
1111
1112 printf("censored %6d %6d %9.5f %9.5f %8.6f %8.6f\n",
1113 totalN[exp], n, mu, est_mu, lambda, est_lambda);
1114 }
1115 printf("\n");
1116 }
1117 }
1118
1119 /* Fitting to simulated truncated datasets
1120 */
1121 if (do_truncated) {
1122 for (exp = 0; exp < nexps; exp++)
1123 {
1124 for (trial = 0; trial < ntrials; trial++)
1125 {
1126 for (n = 0, i = 0; i < totalN[exp]; i++)
1127 {
1128 val = esl_gumbel_Sample(r, mu, lambda);
1129 if (val >= phi) x[n++] = val;
1130 }
1131 if (esl_gumbel_FitTruncated(x, n, phi, &est_mu, &est_lambda) != eslOK)
1132 esl_fatal("gumbel truncated fit failed");
1133
1134 printf("truncated %6d %6d %9.5f %9.5f %8.6f %8.6f\n",
1135 totalN[exp], n, mu, est_mu, lambda, est_lambda);
1136 }
1137 printf("\n");
1138 }
1139 }
1140
1141 /* Fitting mu given lambda
1142 */
1143 if (do_location) {
1144 for (exp = 0; exp < nexps; exp++)
1145 {
1146 for (trial = 0; trial < ntrials; trial++)
1147 {
1148 for (i = 0; i < totalN[exp]; i++)
1149 x[i] = esl_gumbel_Sample(r, mu, lambda);
1150
1151 if (esl_gumbel_FitCompleteLoc(x, totalN[exp], lambda, &est_mu) != eslOK)
1152 esl_fatal("gumbel location-only complete fit failed");
1153
1154 printf("location %6d %6d %9.5f %9.5f\n",
1155 totalN[exp], totalN[exp], mu, est_mu);
1156 }
1157 printf("\n");
1158 }
1159 }
1160
1161 esl_randomness_Destroy(r);
1162 free(x);
1163 return 0;
1164 }
1165 #endif /*eslGUMBEL_STATS*/
1166
1167 /*****************************************************************
1168 * 9. Unit tests.
1169 *****************************************************************/
1170 #ifdef eslGUMBEL_TESTDRIVE
1171
1172 #include "esl_getopts.h"
1173 #include "esl_random.h"
1174
1175 static void
utest_fitting(ESL_RANDOMNESS * rng)1176 utest_fitting(ESL_RANDOMNESS *rng)
1177 {
1178 char msg[] = "esl_gumbel: fitting unit test failed";
1179 int totalN = 10000;
1180 double pmu = -20.;
1181 double plambda = 0.4;
1182 double phi = -20.;
1183 double *x = NULL;
1184 int i;
1185 int n;
1186 double mu;
1187 double lambda;
1188 int status;
1189
1190 /* Simulate a complete Gumbel distributed dataset of <totalN> samples */
1191 ESL_ALLOC(x, sizeof(double) * totalN);
1192 for (i = 0; i < totalN; i++)
1193 x[i] = esl_gumbel_Sample(rng, pmu, plambda);
1194
1195 /* Complete data fitting.
1196 * Don't tolerate more than 1% error in mu, 3% in lambda.
1197 */
1198 if ((status = esl_gumbel_FitComplete(x, totalN, &mu, &lambda)) != eslOK) esl_fatal(msg);
1199 if (fabs((mu -pmu) /pmu) > 0.01) esl_fatal(msg);
1200 if (fabs((lambda-plambda)/plambda) > 0.03) esl_fatal(msg);
1201
1202 /* Complete data, known lambda; fit location <mu>
1203 */
1204 if ((status = esl_gumbel_FitCompleteLoc(x, totalN, plambda, &mu)) != eslOK) esl_fatal(msg);
1205 if (fabs((mu - pmu) / pmu) > 0.01) esl_fatal(msg);
1206
1207
1208 /* Left censor/truncate the data set, to <n> values x_i >= phi;
1209 * <Ntotal-n> are censored
1210 */
1211 for (n=0, i = 0; i < totalN; i++)
1212 if (x[i] >= phi) x[n++] = x[i];
1213
1214 /* Censored fitting.
1215 * Don't tolerate more than 1% error in mu, 4% in lambda.
1216 */
1217 if ((status = esl_gumbel_FitCensored(x, n, totalN-n, phi, &mu, &lambda)) != eslOK) esl_fatal(msg);
1218 if (fabs((mu - pmu) /pmu) > 0.01) esl_fatal(msg);
1219 if (fabs((lambda - plambda)/plambda) > 0.04) esl_fatal(msg);
1220
1221 /* Censored data, known lambda; fit location <mu>
1222 */
1223 if ((status = esl_gumbel_FitCensoredLoc(x, n, totalN-n, phi, plambda, &mu)) != eslOK) esl_fatal(msg);
1224 if (fabs((mu - pmu) / pmu) > 0.01) esl_fatal(msg);
1225
1226 /* Truncated fitting.
1227 * Don't tolerate more than 5% error in mu, 8% in lambda.
1228 */
1229 if ((status = esl_gumbel_FitTruncated(x, n, phi, &mu, &lambda)) != eslOK) esl_fatal(msg);
1230 if (fabs((mu - pmu) /pmu) > 0.05) esl_fatal(msg);
1231 if (fabs((lambda - plambda)/plambda) > 0.08) esl_fatal(msg);
1232
1233 free(x);
1234 return;
1235
1236 ERROR:
1237 if (x) free(x);
1238 esl_fatal("allocation failure in esl_gumbel : fitting unit test");
1239 }
1240
1241
1242 static void
utest_fit_failure(void)1243 utest_fit_failure(void)
1244 {
1245 char msg[] = "esl_gumbel: fit_failure unit test failed";
1246 double x[10];
1247 double mu;
1248 double lambda;
1249 int status;
1250
1251 x[0] = 1.0;
1252 x[1] = 1.0;
1253
1254 /* n=0 or 1 => eslEINVAL. */
1255 status = esl_gumbel_FitComplete(x, 1, &mu, &lambda);
1256 if (status != eslEINVAL) esl_fatal(msg);
1257 if (mu != 0.0) esl_fatal(msg);
1258 if (lambda != 0.0) esl_fatal(msg);
1259
1260 /* Test for failure on small n => eslENORESULT */
1261 status = esl_gumbel_FitComplete(x, 2, &mu, &lambda);
1262 if (status != eslENORESULT) esl_fatal(msg);
1263 if (mu != 0.0) esl_fatal(msg);
1264 if (lambda != 0.0) esl_fatal(msg);
1265
1266 /* FitCompleteLoc() invalid on n=0,1; but always succeeds for n>1 */
1267 status = esl_gumbel_FitCompleteLoc(x, 1, 1.0, &mu);
1268 if (status != eslEINVAL) esl_fatal(msg);
1269 if (mu != 0.0) esl_fatal(msg);
1270
1271 /* FitCensored() is eslEINVAL on n=0,1, like FitComplete().
1272 */
1273 status = esl_gumbel_FitCensored(x, 1, 1, 0.0, &mu, &lambda);
1274 if (status != eslEINVAL) esl_fatal(msg);
1275 if (mu != 0.0) esl_fatal(msg);
1276 if (lambda != 0.0) esl_fatal(msg);
1277
1278 /* FitCensored() can fail on small n, w/ eslENORESULT */
1279 status = esl_gumbel_FitCensored(x, 2, 1, 0.0, &mu, &lambda);
1280 if (status != eslENORESULT) esl_fatal(msg);
1281 if (mu != 0.0) esl_fatal(msg);
1282 if (lambda != 0.0) esl_fatal(msg);
1283
1284 /* FitCensoredLoc()invalid on n=0,1; but always succeeds for n>1 */
1285 status = esl_gumbel_FitCensoredLoc(x, 1, 1, 0.0, 1.0, &mu);
1286 if (status != eslEINVAL) esl_fatal(msg);
1287 if (mu != 0.0) esl_fatal(msg);
1288
1289 /* FitTruncated() w/ n=0,1 => eslEINVAL. */
1290 status = esl_gumbel_FitTruncated(x, 1, 0.0, &mu, &lambda);
1291 if (status != eslEINVAL) esl_fatal(msg);
1292 if (mu != 0.0) esl_fatal(msg);
1293 if (lambda != 0.0) esl_fatal(msg);
1294
1295 /* FitTruncated() can fail on small n, w/ eslENORESULT */
1296 status = esl_gumbel_FitTruncated(x, 2, 0.0, &mu, &lambda);
1297 if (status != eslENORESULT) esl_fatal(msg);
1298 if (mu != 0.0) esl_fatal(msg);
1299 if (lambda != 0.0) esl_fatal(msg);
1300
1301 return;
1302 }
1303 #endif /*eslGUMBEL_TESTDRIVE*/
1304
1305 /*****************************************************************
1306 * 10. Test driver.
1307 *****************************************************************/
1308 #ifdef eslGUMBEL_TESTDRIVE
1309 /* compile: gcc -g -Wall -I. -L. -o esl_gumbel_utest -DeslGUMBEL_TESTDRIVE esl_gumbel.c -leasel -lm
1310 * (gcov): gcc -g -Wall -fprofile-arcs -ftest-coverage -I. -L. -o esl_gumbel_utest -DeslGUMBEL_TESTDRIVE esl_gumbel.c -leasel -lm
1311 * run: ./esl_gumbel_utest
1312 * coverage: ./esl_gumbel_utest; gcov esl_gumbel.c; more esl_gumbel.c.gcov
1313 */
1314 #include <stdio.h>
1315
1316 #include "easel.h"
1317 #include "esl_getopts.h"
1318 #include "esl_random.h"
1319 #include "esl_minimizer.h"
1320 #include "esl_gumbel.h"
1321 #include "esl_stats.h"
1322
1323 static ESL_OPTIONS options[] = {
1324 /* name type default env range togs reqs incomp help docgrp */
1325 { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show help and usage", 0},
1326 { "-s", eslARG_INT, "42", NULL, NULL, NULL, NULL, NULL, "set random number seed to <n>", 0},
1327 { "-v", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "verbose: show verbose output", 0},
1328 { 0,0,0,0,0,0,0,0,0,0},
1329 };
1330 static char usage[] = "[-options]";
1331 static char banner[] = "test driver for Gumbel distribution routines";
1332
1333 int
main(int argc,char ** argv)1334 main(int argc, char **argv)
1335 {
1336 ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage);
1337 ESL_RANDOMNESS *rng = esl_randomness_Create(esl_opt_GetInteger(go, "-s"));
1338 int be_verbose = esl_opt_GetBoolean(go, "-v");
1339
1340 if (be_verbose) printf("seed = %" PRIu32 "\n", esl_randomness_GetSeed(rng));
1341
1342 utest_fitting(rng);
1343 utest_fit_failure();
1344
1345 esl_randomness_Destroy(rng);
1346 esl_getopts_Destroy(go);
1347 return 0;
1348 }
1349 #endif /*eslGUMBEL_TESTDRIVE*/
1350
1351
1352 /*****************************************************************
1353 * 11. Example.
1354 *****************************************************************/
1355 #ifdef eslGUMBEL_EXAMPLE
1356 /*::cexcerpt::gumbel_example::begin::*/
1357 #include <stdio.h>
1358 #include "easel.h"
1359 #include "esl_random.h"
1360 #include "esl_gumbel.h"
1361
1362 int
main(int argc,char ** argv)1363 main(int argc, char **argv)
1364 {
1365 ESL_RANDOMNESS *r = esl_randomness_Create(0);;
1366 int n = 10000; /* simulate 10,000 samples */
1367 double mu = -20.0; /* with mu = -20 */
1368 double lambda = 0.4; /* and lambda = 0.4 */
1369 double min = 9999.;
1370 double max = -9999.;
1371 double *x = malloc(sizeof(double) * n);
1372 double z, est_mu, est_lambda;
1373 int i;
1374
1375 for (i = 0; i < n; i++) /* generate the 10,000 samples */
1376 {
1377 x[i] = esl_gumbel_Sample(r, mu, lambda);
1378 if (x[i] < min) min = x[i];
1379 if (x[i] > max) max = x[i];
1380 }
1381
1382 z = esl_gumbel_surv(max, mu, lambda); /* right tail p~1e-4 >= max */
1383 printf("max = %6.1f P(>max) = %g\n", max, z);
1384 z = esl_gumbel_cdf(min, mu, lambda); /* left tail p~1e-4 < min */
1385 printf("min = %6.1f P(<=min) = %g\n", min, z);
1386
1387 if (esl_gumbel_FitComplete(x, n, &est_mu, &est_lambda) != eslOK) /* fit params to the data */
1388 esl_fatal("gumbel ML complete data fit failed");
1389
1390 z = 100. * fabs((est_mu - mu) / mu);
1391 printf("Parametric mu = %6.1f. Estimated mu = %6.2f. Difference = %.1f%%.\n",
1392 mu, est_mu, z);
1393 z = 100. * fabs((est_lambda - lambda) /lambda);
1394 printf("Parametric lambda = %6.1f. Estimated lambda = %6.2f. Difference = %.1f%%.\n",
1395 lambda, est_lambda, z);
1396
1397 free(x);
1398 esl_randomness_Destroy(r);
1399 return 0;
1400 }
1401 /*::cexcerpt::gumbel_example::end::*/
1402 #endif /*eslGUMBEL_EXAMPLE*/
1403
1404
1405
1406