1 /* Statistical routines for Gumbel (type I extreme value) distributions.
2  *
3  * Contents:
4  *   1. Routines for evaluating densities and distributions
5  *   2. Generic API routines: for general interface w/ histogram module
6  *   3. Dumping plots to files
7  *   4. Sampling
8  *   5. ML fitting to complete data
9  *   6. ML fitting to censored data  (x_i >= phi; z known)
10  *   7. ML fitting to truncated data (x_i >= phi; z unknown)
11  *   8. Stats driver
12  *   9. Unit tests
13  *  10. Test driver
14  *  11. Example
15  *
16  * To-do:
17  *   - ML fitting routines will be prone to over/underfitting
18  *     problems for scores outside a "normal" range, because
19  *     of exp(-lambda * x) calls. The Lawless ML estimation
20  *     may eventually need to be recast in log space.
21  *     SRE, Mon Aug  6 13:42:09 2007
22  *
23  */
24 #include "esl_config.h"
25 
26 #include <stdio.h>
27 #include <math.h>
28 #include <float.h>
29 
30 #include "easel.h"
31 #include "esl_minimizer.h"
32 #include "esl_random.h"
33 #include "esl_stats.h"
34 #include "esl_vectorops.h"
35 
36 #include "esl_gumbel.h"
37 
38 /*****************************************************************
39  * 1. Routines for evaluating densities and distributions
40  *****************************************************************/
41 
42 /* Function:  esl_gumbel_pdf()
43  * Synopsis:  Returns the probability density at $x$, $P(S=x)$.
44  *
45  * Purpose:   Calculates the probability density function for the Gumbel,
46  *            $P(X=x)$, given quantile <x> and Gumbel location and
47  *            scale parameters <mu> and <lambda>.
48  *
49  *            Let $y = \lambda(x-\mu)$; for 64-bit doubles,
50  *            useful dynamic range is about $-6.5 <= y <= 710$.
51  *            Returns 0.0 for smaller $y$, 0.0 for larger $y$.
52  */
53 double
esl_gumbel_pdf(double x,double mu,double lambda)54 esl_gumbel_pdf(double x, double mu, double lambda)
55 {
56   double y;
57   y = lambda * (x - mu);
58   return (lambda * exp(-y - exp(-y)));
59 }
60 
61 
62 /* Function:  esl_gumbel_logpdf()
63  * Synopsis:  Returns the log of the pdf at $x$, $\log P(S=x)$.
64  *
65  * Purpose:   Calculates the log probability density function for the Gumbel,
66  *            $\log P(X=x)$.
67  *
68  *            Let $y = \lambda(x-\mu)$; for 64-bit doubles,
69  *            useful dynamic range is about $-708 <= y <= \infty$.
70  *            Returns $-\infty$ for smaller or larger $y$.
71  */
72 double
esl_gumbel_logpdf(double x,double mu,double lambda)73 esl_gumbel_logpdf(double x, double mu, double lambda)
74 {
75   double y;
76   y = lambda * (x - mu);
77   return (log(lambda) -y - exp(-y));
78 }
79 
80 
81 /* Function:  esl_gumbel_cdf()
82  * Synopsis:  Returns the cumulative distribution at $x$, $P(S \leq x)$.
83  *
84  * Purpose:   Calculates the cumulative distribution function
85  *            for the Gumbel, $P(X \leq x)$.
86  *
87  *            Let $y = \lambda(x-\mu)$; for 64-bit doubles,
88  *            useful dynamic range for $y$ is about $-6.5 <= y <=36$.
89  *            Returns 0.0 for smaller $y$, 1.0 for larger $y$.
90  */
91 double
esl_gumbel_cdf(double x,double mu,double lambda)92 esl_gumbel_cdf(double x, double mu, double lambda)
93 {
94   double y;
95   y = lambda*(x-mu);
96   return exp(-exp(-y));
97 }
98 
99 /* Function:  esl_gumbel_logcdf()
100  * Synopsis:  Returns the log of the cdf at $x$, $\log P(S \leq x)$.
101  *
102  * Purpose:   Calculates the log of the cumulative distribution function
103  *            for the Gumbel, $\log P(X \leq x)$.
104  *
105  *            Let $y = \lambda(x-\mu)$; for 64-bit doubles,
106  *            useful dynamic range for $y$ is about $-708 <= y <= 708$.
107  *            Returns $-\infty$ for smaller $y$, 0.0 for larger $y$.
108  */
109 double
esl_gumbel_logcdf(double x,double mu,double lambda)110 esl_gumbel_logcdf(double x, double mu, double lambda)
111 {
112   double y;
113   y = lambda*(x-mu);
114   return (-exp(-y));
115 }
116 
117 /* Function:  esl_gumbel_surv()
118  * Synopsis:  Returns right tail mass above $x$, $P(S > x)$.
119  *
120  * Purpose:   Calculates the survivor function, $P(X>x)$ for a Gumbel
121  *            (that is, 1-cdf), the right tail's probability mass.
122  *
123  *            Let $y=\lambda(x-\mu)$; for 64-bit doubles,
124  *            useful dynamic range for $y$ is $-3.6 <= y <= 708$.
125  *            Returns 1.0 for $y$ below lower limit, and 0.0
126  *            for $y$ above upper limit.
127  */
128 double
esl_gumbel_surv(double x,double mu,double lambda)129 esl_gumbel_surv(double x, double mu, double lambda)
130 {
131   double y  = lambda*(x-mu);
132   double ey = -exp(-y);
133 
134   /* Use 1-e^x ~ -x approximation here when e^-y is small. */
135   if (fabs(ey) < eslSMALLX1) return -ey;
136   else                       return 1 - exp(ey);
137 }
138 
139 /* Function:  esl_gumbel_logsurv()
140  * Synopsis:  Returns log survival at $x$, $\log P(S > x)$.
141  *
142  * Purpose:   Calculates $\log P(X>x)$ for a Gumbel (that is, $\log$(1-cdf)):
143  *            the log of the right tail's probability mass.
144  *
145  *            Let $y=\lambda(x-\mu)$; for 64-bit doubles,
146  *            useful dynamic range for $y$ is $-6.5 <= y <= \infty$.
147  *            Returns 0.0 for smaller $y$.
148  */
149 double
esl_gumbel_logsurv(double x,double mu,double lambda)150 esl_gumbel_logsurv(double x, double mu, double lambda)
151 {
152   double y  = lambda*(x-mu);
153   double ey = -exp(-y);
154 
155   /* The real calculation is log(1-exp(-exp(-y))).
156    * For "large" y, -exp(-y) is small, so 1-exp(-exp(-y) ~ exp(-y),
157    * and log of that gives us -y.
158    * For "small y", exp(-exp(-y) is small, and we can use log(1-x) ~ -x.
159    */
160   if      (fabs(ey)      < eslSMALLX1) return -y;
161   else if (fabs(exp(ey)) < eslSMALLX1) return -exp(ey);
162   else                                 return log(1-exp(ey));
163 }
164 
165 /* Function:  esl_gumbel_invcdf()
166  *
167  * Purpose:   Calculates the inverse CDF for a Gumbel distribution
168  *            with parameters <mu> and <lambda>. That is, returns
169  *            the quantile <x> at which the CDF is <p>.
170  */
171 double
esl_gumbel_invcdf(double p,double mu,double lambda)172 esl_gumbel_invcdf(double p, double mu, double lambda)
173 {
174     return mu - ( log(-1. * log(p)) / lambda);
175 }
176 
177 /* Function:  esl_gumbel_invsurv()
178  *
179  * Purpose:   Calculates the score at which the right tail's mass
180  *            is p, for a Gumbel distribution
181  *            with parameters <mu> and <lambda>. That is, returns
182  *            the quantile <x> at which 1-CDF is <p>.
183  */
184 double
esl_gumbel_invsurv(double p,double mu,double lambda)185 esl_gumbel_invsurv(double p, double mu, double lambda)
186 {
187 	/* The real calculation is mu - ( log(-1. * log(1-p)) / lambda).
188 	*  But there's a problem with small p:
189 	*     for p<1e-15, 1-p will be viewed as 1, so
190 	*     log ( -log(1-p) ) == log (0) -> inf
191 	*  Instead, use two approximations;
192 	*    (1) log( 1-p) ~= -p   for small p (e.g. p<0.001)
193 	*      so log(-1. * log(1-p)) ~= log(p)
194 	*    (2) log (p) ~= (p^p - 1) / p
195 	*
196 	*    See notes Mar 1, 2010.
197 	*/
198 	double log_part;
199 	if (p < eslSMALLX1) {
200 		log_part = (pow(p,p) - 1 ) / p;
201 	} else {
202 		log_part = log(-1. * log(1-p));
203 	}
204 
205 	//test 2
206 
207     return mu - ( log_part / lambda);
208 }
209 /*------------------ end of densities and distributions --------------------*/
210 
211 
212 /*****************************************************************
213  * 2. Generic API routines: for general interface w/ histogram module
214  *****************************************************************/
215 
216 /* Function:  esl_gumbel_generic_pdf()
217  *
218  * Purpose:   Generic-API version of PDF function.
219  */
220 double
esl_gumbel_generic_pdf(double p,void * params)221 esl_gumbel_generic_pdf(double p, void *params)
222 {
223   double *v = (double *) params;
224   return esl_gumbel_pdf(p, v[0], v[1]);
225 }
226 
227 /* Function:  esl_gumbel_generic_cdf()
228  *
229  * Purpose:   Generic-API version of CDF function.
230  */
231 double
esl_gumbel_generic_cdf(double x,void * params)232 esl_gumbel_generic_cdf(double x, void *params)
233 {
234   double *p = (double *) params;
235   return esl_gumbel_cdf(x, p[0], p[1]);
236 }
237 
238 /* Function:  esl_gumbel_generic_surv()
239  *
240  * Purpose:   Generic-API version of survival function.
241  */
242 double
esl_gumbel_generic_surv(double p,void * params)243 esl_gumbel_generic_surv(double p, void *params)
244 {
245   double *v = (double *) params;
246   return esl_gumbel_surv(p, v[0], v[1]);
247 }
248 
249 /* Function:  esl_gumbel_generic_invcdf()
250  *
251  * Purpose:   Generic-API version of inverse CDF.
252  */
253 double
esl_gumbel_generic_invcdf(double p,void * params)254 esl_gumbel_generic_invcdf(double p, void *params)
255 {
256   double *v = (double *) params;
257   return esl_gumbel_invcdf(p, v[0], v[1]);
258 }
259 
260 
261 /*------------------------- end of generic API --------------------------*/
262 
263 
264 
265 /****************************************************************************
266  * 3. Routines for dumping plots for files
267  ****************************************************************************/
268 
269 /* Function:  esl_gumbel_Plot()
270  * Synopsis:  Plot a Gumbel function in XMGRACE XY format.
271  *
272  * Purpose:   Plot a Gumbel function <func> (for instance,
273  *            <esl_gumbel_pdf()>) for parameters <mu> and <lambda>, for
274  *            a range of quantiles x from <xmin> to <xmax> in steps of <xstep>;
275  *            output to an open stream <fp> in xmgrace XY input format.
276  *
277  * Returns:   <eslOK> on success.
278  *
279  * Throws:    <eslEWRITE> on any system write error, such as filled disk.
280  */
281 int
esl_gumbel_Plot(FILE * fp,double mu,double lambda,double (* func)(double x,double mu,double lambda),double xmin,double xmax,double xstep)282 esl_gumbel_Plot(FILE *fp, double mu, double lambda,
283 		double (*func)(double x, double mu, double lambda),
284 		double xmin, double xmax, double xstep)
285 {
286   double x;
287   for (x = xmin; x <= xmax; x += xstep)
288     if (fprintf(fp, "%f\t%g\n", x, (*func)(x, mu, lambda)) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "gumbel plot write failed");
289   if (fprintf(fp, "&\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "gumbel plot write failed");
290   return eslOK;
291 }
292 /*-------------------- end plot dumping routines ---------------------------*/
293 
294 
295 
296 /*****************************************************************
297  * 4. Routines for sampling
298  *****************************************************************/
299 
300 /* Function:  esl_gumbel_Sample()
301  * Synopsis:  Return a Gumbel-distributed random sample $x$.
302  *
303  * Purpose:   Sample a Gumbel-distributed random variate
304  *            by the transformation method.
305  */
306 double
esl_gumbel_Sample(ESL_RANDOMNESS * r,double mu,double lambda)307 esl_gumbel_Sample(ESL_RANDOMNESS *r, double mu, double lambda)
308 {
309   double p;
310   p = esl_rnd_UniformPositive(r);
311   return esl_gumbel_invcdf(p, mu, lambda);
312 }
313 /*------------------------ end of sampling --------------------------------*/
314 
315 
316 
317 /*****************************************************************
318  * 5. Maximum likelihood fitting to complete data
319  *****************************************************************/
320 
321 /* lawless416()
322  * SRE, Thu Nov 13 11:48:50 1997 [St. Louis]
323  *
324  * Purpose:  Equation 4.1.6 from [Lawless82], pg. 143, and
325  *           its first derivative with respect to lambda,
326  *           for finding the ML fit to Gumbel lambda parameter.
327  *           This equation gives a result of zero for the maximum
328  *           likelihood lambda.
329  *
330  * Args:     x      - array of sample values
331  *           n      - number of samples
332  *           lambda - a lambda to test
333  *           ret_f  - RETURN: 4.1.6 evaluated at lambda
334  *           ret_df - RETURN: first derivative of 4.1.6 evaluated at lambda
335  *
336  * Return:   (void)
337  */
338 static void
lawless416(double * x,int n,double lambda,double * ret_f,double * ret_df)339 lawless416(double *x, int n, double lambda, double *ret_f, double *ret_df)
340 {
341   double esum;			/* \sum e^(-lambda xi)      */
342   double xesum;			/* \sum xi e^(-lambda xi)   */
343   double xxesum;		/* \sum xi^2 e^(-lambda xi) */
344   double xsum;			/* \sum xi                  */
345   int i;
346 
347   esum = xesum = xsum  = xxesum = 0.;
348   for (i = 0; i < n; i++)
349     {
350       xsum   += x[i];
351       xesum  += x[i] * exp(-1. * lambda * x[i]);
352       xxesum += x[i] * x[i] * exp(-1. * lambda * x[i]);
353       esum   += exp(-1. * lambda * x[i]);
354     }
355   *ret_f  = (1./lambda) - (xsum / n)  + (xesum / esum);
356   *ret_df = ((xesum / esum) * (xesum / esum))
357     - (xxesum / esum)
358     - (1. / (lambda * lambda));
359 }
360 
361 /* Function: esl_gumbel_FitComplete()
362  * Synopsis: Estimates $\mu$, $\lambda$ from complete data.
363  *
364  * Purpose:  Given an array of Gumbel-distributed samples
365  *           <x[0]..x[n-1]>, find maximum likelihood parameters <mu>
366  *           and <lambda>.
367  *
368  *           The number of samples <n> must be reasonably large to get
369  *           an accurate fit. <n=100> suffices to get an accurate
370  *           location parameter $\mu$ (to about 1% error), but
371  *           <n~10000> is required to get a similarly accurate
372  *           estimate of $\lambda$. It's probably a bad idea to try to
373  *           fit a Gumbel to less than about 1000 data points.
374  *
375  *           On a very small number of samples, the fit can fail
376  *           altogether, in which case the routine will return a
377  *           <eslENORESULT> code. Caller must check for this.
378  *
379  *           Uses approach described in [Lawless82]. Solves for lambda
380  *           using Newton/Raphson iterations, then substitutes lambda
381  *           into Lawless' equation 4.1.5 to get mu.
382  *
383  * Args:     x          - list of Gumbel distributed samples
384  *           n          - number of samples (n>1)
385  *           ret_mu     - RETURN: ML estimate of mu
386  *           ret_lambda - RETURN: ML estimate of lambda
387  *
388  * Returns:  <eslOK> on success.
389  *
390  *           <eslEINVAL> if n<=1.
391  *           <eslENORESULT> if the fit fails, likely because the
392  *           number of samples is too small. On either error,
393  *           <*ret_mu> and <*ret_lambda> are 0.0.  These are classed
394  *           as failures (normal errors) because the data vector may
395  *           have been provided by a user.
396  */
397 int
esl_gumbel_FitComplete(double * x,int n,double * ret_mu,double * ret_lambda)398 esl_gumbel_FitComplete(double *x, int n, double *ret_mu, double *ret_lambda)
399 {
400   double  variance;
401   double  lambda, mu;
402   double  fx;			/* f(x)  */
403   double  dfx;			/* f'(x) */
404   double  esum;                 /* \sum e^(-lambda xi) */
405   double  tol = 1e-5;
406   int     i;
407   int     status;
408 
409   if (n <= 1) { status = eslEINVAL; goto FAILURE; }
410 
411   /* 1. Find an initial guess at lambda
412    *    (Evans/Hastings/Peacock, Statistical Distributions, 2000, p.86)
413    */
414   esl_stats_DMean(x, n, NULL, &variance);
415   lambda = eslCONST_PI / sqrt(6.*variance);
416 
417   /* 2. Use Newton/Raphson to solve Lawless 4.1.6 and find ML lambda
418    */
419   for (i = 0; i < 100; i++)
420     {
421       lawless416(x, n, lambda, &fx, &dfx);
422       if (fabs(fx) < tol) break;             /* success */
423       lambda = lambda - fx / dfx;	     /* Newton/Raphson is simple */
424       if (lambda <= 0.) lambda = 0.001;      /* but be a little careful  */
425     }
426 
427   /* 2.5: If we did 100 iterations but didn't converge, Newton/Raphson failed.
428    *      Resort to a bisection search. Worse convergence speed
429    *      but guaranteed to converge (unlike Newton/Raphson).
430    *      We assume that fx is a monotonically decreasing function of x;
431    *      i.e. fx > 0 if we are left of the root, fx < 0 if we
432    *      are right of the root.
433    */
434   if (i == 100)
435     {
436       double left, right, mid;
437       ESL_DPRINTF2(("esl_gumbel_FitComplete(): Newton/Raphson failed; switchover to bisection\n"));
438 
439       /* First bracket the root */
440       left  = 0.;	                 	/* for sure */
441       right = eslCONST_PI / sqrt(6.*variance);  /* an initial guess */
442       lawless416(x, n, lambda, &fx, &dfx);
443       while (fx > 0.)
444 	{
445 	  right *= 2.;		/* arbitrary leap to the right */
446 	  if (right > 1000.)    /* no reasonable lambda should be > 1000, we assert */
447 	    {
448 	      ESL_DPRINTF2(("Failed to bracket root in esl_gumbel_FitComplete()."));
449 	      status = eslENORESULT;
450 	      goto FAILURE;
451 	    }
452 
453 	  lawless416(x, n, right, &fx, &dfx);
454 	}
455 
456       /* Now, bisection search in left/right interval */
457       for (i = 0; i < 100; i++)
458 	{
459 	  mid = (left + right) / 2.;
460 	  lawless416(x, n, mid, &fx, &dfx);
461 	  if (fabs(fx) < tol) break;             /* success */
462 	  if (fx > 0.)	left = mid;
463 	  else          right = mid;
464 	}
465 
466       /* Too many iterations? Give up. */
467       if (i == 100)
468 	{
469 	  ESL_DPRINTF2(("Even bisection search failed in esl_gumbel_FitComplete().\n"));
470 	  status = eslENORESULT;
471 	  goto FAILURE;
472 	}
473 
474       lambda = mid;
475     }
476 
477   /* 3. Substitute into Lawless 4.1.5 to find mu
478    */
479   esum = 0.;
480   for (i = 0; i < n; i++)
481     esum  += exp(-lambda * x[i]);
482   mu = -log(esum / n) / lambda;
483 
484   *ret_lambda = lambda;
485   *ret_mu     = mu;
486   return eslOK;
487 
488  FAILURE:
489   *ret_mu     = 0.0;
490   *ret_lambda = 0.0;
491   return status;
492 }
493 
494 /* Function:  esl_gumbel_FitCompleteLoc()
495  * Synopsis:  Estimates $\mu$ from complete data, given $\lambda$.
496  *
497  * Purpose:   Given an array of Gumbel-distributed samples
498  *            <x[0]..x[n-1]> (complete data), and a known
499  *            (or otherwise fixed) <lambda>, find a maximum
500  *            likelihood estimate for location parameter <mu>.
501  *
502  *            Algorithm is a straightforward simplification of
503  *            <esl_gumbel_FitComplete()>.
504  *
505  * Args:     x          - list of Gumbel distributed samples
506  *           n          - number of samples
507  *           lambda     - known lambda (scale) parameter
508  *           ret_mu     : RETURN: ML estimate of mu
509  *
510  * Returns:  <eslOK> on success.
511  *
512  *           <eslEINVAL> if n<=1; on this error, <*ret_mu> = 0.
513  *
514  * Throws:   (no abnormal error conditions)
515  */
516 int
esl_gumbel_FitCompleteLoc(double * x,int n,double lambda,double * ret_mu)517 esl_gumbel_FitCompleteLoc(double *x, int n, double lambda, double *ret_mu)
518 {
519   double esum;
520   int    i;
521   int    status;
522 
523   if (n <= 1) { status = eslEINVAL; goto FAILURE; }
524 
525   /* Substitute into Lawless 4.1.5 to find mu */
526   esum = 0.;
527   for (i = 0; i < n; i++)
528     esum  += exp(-lambda * x[i]);
529   *ret_mu = -log(esum / n) / lambda;
530   return eslOK;
531 
532 #if 0
533   /* Replace the code above w/ code below to test the direct method. */
534   double mean, variance;
535   esl_stats_DMean(x, n, &mean, &variance);
536   *ret_mu     = mean - 0.57722/lambda;
537   return eslOK;
538 #endif
539 
540  FAILURE:
541   *ret_mu = 0.;
542   return status;
543 }
544 
545 
546 #if eslDEBUGLEVEL >=3
547 /* direct_mv_fit()
548  * SRE, Wed Jun 29 08:23:47 2005
549  *
550  * Purely for curiousity: a complete data fit using the
551  * simple direct method, calculating mu and lambda from mean
552  * and variance.
553  */
554 static int
direct_mv_fit(double * x,int n,double * ret_mu,double * ret_lambda)555 direct_mv_fit(double *x, int n, double *ret_mu, double *ret_lambda)
556 {
557   double mean, variance;
558 
559   esl_stats_DMean(x, n, &mean, &variance);
560   *ret_lambda = eslCONST_PI / sqrt(6.*variance);
561   *ret_mu     = mean - 0.57722/(*ret_lambda);
562   return eslOK;
563 }
564 #endif
565 
566 /*------------------- end of complete data fit ---------------------------------*/
567 
568 
569 /*****************************************************************
570  * 6. Maximum likelihood fitting to censored data (x_i >= phi; z known)
571  *****************************************************************/
572 
573 /* lawless422()
574  * SRE, Mon Nov 17 09:42:48 1997 [St. Louis]
575  *
576  * Purpose:  Equation 4.2.2 from [Lawless82], pg. 169, and
577  *           its first derivative with respect to lambda,
578  *           for finding the ML fit to Gumbel lambda parameter
579  *           for Type I censored data.
580  *           This equation gives a result of zero for the maximum
581  *           likelihood lambda.
582  *
583  * Args:     x      - array of observed sample values
584  *           n      - number of observed samples
585  *           z      - number of censored samples = N-n
586  *           phi    - censoring value; all observed x_i >= phi
587  *           lambda - a lambda to test
588  *           ret_f  - RETURN: 4.2.2 evaluated at lambda
589  *           ret_df - RETURN: first derivative of 4.2.2 evaluated at lambda
590  *
591  * Return:   (void)
592  */
593 static void
lawless422(double * x,int n,int z,double phi,double lambda,double * ret_f,double * ret_df)594 lawless422(double *x, int n, int z, double phi,
595 	   double lambda, double *ret_f, double *ret_df)
596 {
597   double esum;			/* \sum e^(-lambda xi)      + z term    */
598   double xesum;			/* \sum xi e^(-lambda xi)   + z term    */
599   double xxesum;		/* \sum xi^2 e^(-lambda xi) + z term    */
600   double xsum;			/* \sum xi                  (no z term) */
601   int i;
602 
603   esum = xesum = xsum  = xxesum = 0.;
604   for (i = 0; i < n; i++)
605     {
606       xsum   += x[i];
607       esum   +=               exp(-1. * lambda * x[i]);
608       xesum  +=        x[i] * exp(-1. * lambda * x[i]);
609       xxesum += x[i] * x[i] * exp(-1. * lambda * x[i]);
610     }
611 
612   /* Add z terms for censored data
613    */
614   esum   += (double) z *             exp(-1. * lambda * phi);
615   xesum  += (double) z * phi *       exp(-1. * lambda * phi);
616   xxesum += (double) z * phi * phi * exp(-1. * lambda * phi);
617 
618   *ret_f  = 1./lambda - xsum / n + xesum / esum;
619   *ret_df = ((xesum / esum) * (xesum / esum))
620     - (xxesum / esum)
621     - (1. / (lambda * lambda));
622 
623   return;
624 }
625 
626 /* Function: esl_gumbel_FitCensored()
627  * Synopsis: Estimates $\mu$, $\lambda$ from censored data.
628  *
629  * Purpose:  Given a left-censored array of Gumbel-distributed samples
630  *           <x[0]..x[n-1]>, the number of censored samples <z>, and
631  *           the censoring value <phi> (all <x[i]> $\geq$ <phi>).  Find
632  *           maximum likelihood parameters <mu> and <lambda>.
633  *
634  * Algorithm: Uses approach described in [Lawless82]. Solves
635  *            for lambda using Newton/Raphson iterations;
636  *            then substitutes lambda into Lawless' equation 4.2.3
637  *            to get mu.
638  *
639  * Args:     x          - array of Gumbel-distributed samples, 0..n-1
640  *           n          - number of observed samples
641  *           z          - number of censored samples
642  *           phi        - censoring value (all x_i >= phi)
643  *           ret_mu     - RETURN: ML estimate of mu
644  *           ret_lambda - RETURN: ML estimate of lambda
645  *
646  * Returns:  <eslOK> on success.
647  *
648  *           <eslEINVAL> if n<=1.
649  *           <eslENORESULT> if the fit fails, likey because the number
650  *           of samples is too small.
651  *           On either error, <*ret_mu> and <*ret_lambda> are 0.0.
652  *           These are classed as failures (normal errors) because the
653  *           data vector may have been provided by a user.
654  */
655 int
esl_gumbel_FitCensored(double * x,int n,int z,double phi,double * ret_mu,double * ret_lambda)656 esl_gumbel_FitCensored(double *x, int n, int z, double phi, double *ret_mu, double *ret_lambda)
657 {
658   double variance;
659   double lambda, mu;
660   double fx;			/* f(x)  */
661   double dfx;			/* f'(x) */
662   double esum;                  /* \sum e^(-lambda xi) */
663   double tol = 1e-5;
664   int    i;
665   int    status;
666 
667   if (n <= 1) { status = eslEINVAL; goto FAILURE; }
668 
669   /* 1. Find an initial guess at lambda
670    *    (Evans/Hastings/Peacock, Statistical Distributions, 2000, p.86)
671    */
672   esl_stats_DMean(x, n, NULL, &variance);
673   lambda = eslCONST_PI / sqrt(6.*variance);
674 
675   /* 2. Use Newton/Raphson to solve Lawless 4.2.2 and find ML lambda
676    */
677   for (i = 0; i < 100; i++)
678     {
679       lawless422(x, n, z, phi, lambda, &fx, &dfx);
680       if (fabs(fx) < tol) break;             /* success */
681       lambda = lambda - fx / dfx;	     /* Newton/Raphson is simple */
682       if (lambda <= 0.) lambda = 0.001;      /* but be a little careful  */
683     }
684 
685  /* 2.5: If we did 100 iterations but didn't converge, Newton/Raphson failed.
686    *      Resort to a bisection search. Worse convergence speed
687    *      but guaranteed to converge (unlike Newton/Raphson).
688    *      We assume (!?) that fx is a monotonically decreasing function of x;
689    *      i.e. fx > 0 if we are left of the root, fx < 0 if we
690    *      are right of the root.
691    */
692   if (i == 100)
693     {
694       double left, right, mid;
695       ESL_DPRINTF2(("esl_gumbel_FitCensored(): Newton/Raphson failed; switched to bisection\n"));
696 
697       /* First bracket the root */
698       left  = 0.;		               /* we know that's the left bound */
699       right = eslCONST_PI / sqrt(6.*variance); /* start from here, move "right"... */
700       lawless422(x, n, z, phi, right, &fx, &dfx);
701       while (fx > 0.)
702 	{
703 	  right *= 2.;
704 	  if (right > 1000.) /* no reasonable lambda should be > 1000, we assert */
705 	    {
706 	      ESL_DPRINTF2(("Failed to bracket root in esl_gumbel_FitCensored()."));
707 	      status = eslENORESULT;
708 	      goto FAILURE;
709 	    }
710 	  lawless422(x, n, z, phi, right, &fx, &dfx);
711 	}
712 
713       /* Now we bisection search in left/right interval */
714       for (i = 0; i < 100; i++)
715 	{
716 	  mid = (left + right) / 2.;
717 	  lawless422(x, n, z, phi, mid, &fx, &dfx);
718 	  if (fabs(fx) < tol) break;             /* success */
719 	  if (fx > 0.)	left = mid;
720 	  else          right = mid;
721 	}
722       if (i == 100)
723 	{
724 	  ESL_DPRINTF2(("Even bisection search failed in esl_gumbel_FitCensored().\n"));
725 	  status = eslENORESULT;
726 	  goto FAILURE;
727 	}
728       lambda = mid;
729     }
730 
731   /* 3. Substitute into Lawless 4.2.3 to find mu
732    */
733   esum = 0.;
734   for (i = 0; i < n; i++)
735     esum  += exp(-lambda * x[i]);
736   esum += z * exp(-1. * lambda * phi);    /* term from censored data */
737   mu = -log(esum / n) / lambda;
738 
739   *ret_lambda = lambda;
740   *ret_mu     = mu;
741   return eslOK;
742 
743  FAILURE:
744   *ret_lambda = 0.0;
745   *ret_mu     = 0.0;
746   return status;
747 }
748 
749 
750 /* Function:  esl_gumbel_FitCensoredLoc()
751  * Synopsis:  Estimates $\mu$ from censored data, given $\lambda$.
752  *
753  * Purpose:   Given a left-censored array of Gumbel distributed samples
754  *            <x[0>..x[n-1]>, the number of censored samples <z>, and the censoring
755  *            value <phi> (where all <x[i]> $\geq$ <phi>), and a known
756  *            (or at least fixed) <lambda>;
757  *            find the maximum likelihood estimate of the location
758  *            parameter $\mu$ and return it in <ret_mu>.
759  *
760  * Note:      A straightforward simplification of FitCensored().
761  *
762  * Args:     x          - array of Gumbel-distributed samples, 0..n-1
763  *           n          - number of observed samples
764  *           z          - number of censored samples
765  *           phi        - censoring value (all x_i >= phi)
766  *           lambda     - known scale parameter $\lambda$
767  *           ret_mu     - RETURN: ML estimate of $\mu$
768  *
769  * Returns:   <eslOK> on success.
770  *
771  *            <eslEINVAL> if n<=1; on this error, <*ret_mu> = 0.
772  *
773  * Throws:    (no abnormal error conditions)
774  */
775 int
esl_gumbel_FitCensoredLoc(double * x,int n,int z,double phi,double lambda,double * ret_mu)776 esl_gumbel_FitCensoredLoc(double *x, int n, int z, double phi, double lambda,
777 			  double *ret_mu)
778 {
779   double esum;
780   int    i;
781   int    status;
782 
783   if (n <= 1) { status = eslEINVAL; goto FAILURE; }
784 
785   /* Immediately substitute into Lawless 4.2.3 to find mu, because
786    * lambda is known.
787    */
788   esum = 0.;
789   for (i = 0; i < n; i++) 	          /* contribution from observed data */
790     esum  += exp(-lambda * x[i]);
791   esum += z * exp(-1. * lambda * phi);    /* term from censored data */
792 
793   *ret_mu = -log(esum / (double) n) / lambda;
794   return eslOK;
795 
796  FAILURE:
797   *ret_mu = 0.;
798   return status;
799 }
800 
801 
802 /*****************************************************************
803  * 7. Maximum likelihood fitting to truncated data (x_i >= phi and z unknown)
804  *****************************************************************/
805 
806 /* Easel's conjugate gradient descent code allows a single void ptr to
807  * point to any necessary fixed data, so we'll put everything into one
808  * structure:
809  */
810 struct tevd_data {
811   double *x;	/* data: n observed samples from a Gumbel */
812   int     n;	/* number of observed samples */
813   double  phi;	/* truncation threshold: all observed x_i >= phi */
814 };
815 
816 /* tevd_func()
817  *
818  * Called by the optimizer: evaluate the objective function
819  * for the negative posterior log probability of a particular choice
820  * of parameters mu and lambda, given truncated Gumbel samples.
821  */
822 static double
tevd_func(double * p,int nparam,void * dptr)823 tevd_func(double *p, int nparam, void *dptr)
824 {
825   double mu, w, lambda;
826   struct tevd_data *data;
827   double *x;
828   int     n;
829   double  phi;
830   double  logL;
831   int     i;
832 
833   /* unpack what the optimizer gave us; nparam==2 always
834    */
835   mu     = p[0];
836   w      = p[1];
837   lambda = exp(w);
838   data   = (struct tevd_data *) dptr;
839   x      = data->x;
840   n      = data->n;
841   phi    = data->phi;
842 
843   /* The log likelihood equation
844    */
845   logL   = n * log(lambda);
846   for (i = 0; i < n; i++)
847     logL -= lambda * (x[i] - mu);
848   for (i = 0; i < n; i++)
849     logL -= exp(-1. * lambda * (x[i] - mu));
850   logL -= n * esl_gumbel_logsurv(phi, mu, lambda);
851 
852   return -1.0 * logL;		/* objective: minimize the NLP */
853 }
854 
855 /* tevd_grad()
856  *
857  * Called by the optimizer: evaluate the gradient of the objective
858  * function (the negative posterior log probability of the parameters
859  * mu and w, where w = log(lambda), at a particular choice of mu and
860  * lambda.
861  */
862 static void
tevd_grad(double * p,int nparam,void * dptr,double * dp)863 tevd_grad(double *p, int nparam, void *dptr, double *dp)
864 {
865   double mu, lambda, w;
866   struct tevd_data *data;
867   double *x;
868   int     n;
869   double  phi;
870   double  dmu, dw;
871   double  coeff;
872   int     i;
873 
874   /* unpack what the optimizer gave us; nparam==2 always
875    */
876   mu     = p[0];
877   w      = p[1];
878   lambda = exp(w);
879   data   = (struct tevd_data *) dptr;
880   x      = data->x;
881   n      = data->n;
882   phi    = data->phi;
883 
884   /* Both partials include a coefficient that
885    * basically looks like P(S=phi) / P(S>=phi); pre-calculate it.
886    * Watch out when phi >> mu, which'll give us 0/0; instead,
887    * recognize that for phi >> mu, coeff converges to \lambda.
888    */
889   if (lambda*(phi-mu) > 50.)	/* arbitrary crossover. */
890     coeff = lambda;
891   else
892     coeff = esl_gumbel_pdf(phi, mu, lambda) / esl_gumbel_surv(phi, mu, lambda);
893 
894   /* Partial derivative w.r.t. mu.
895    */
896   dmu = n * lambda;
897   for (i = 0; i < n; i++)
898     dmu -= lambda * exp(-1. * lambda * (x[i] - mu));
899   dmu -= n * coeff;
900 
901   /* Partial derivative w.r.t. w=log(lambda).
902    */
903   dw = n;
904   for (i = 0; i < n; i++) dw -= (x[i] - mu) * lambda;
905   for (i = 0; i < n; i++) dw += (x[i] - mu) * lambda * exp(-1. * lambda * (x[i] - mu));
906   dw += n * (phi - mu) * coeff;
907 
908   /* Return the negative, because we're minimizing NLP, not maximizing.
909    */
910   dp[0] = -1. * dmu;	/* negative because we're minimizing NLP, not maximizing */
911   dp[1] = -1. * dw;
912   return;
913 }
914 
915 /* Function:  esl_gumbel_FitTruncated()
916  * Synopsis:  Estimates $\mu$, $\lambda$ from truncated data.
917  *
918  * Purpose:   Given a left-truncated array of Gumbel-distributed
919  *            samples <x[0]..x[n-1]> and the truncation threshold
920  *            <phi> (such that all <x[i]> $\geq$ <phi>).
921  *            Find maximum likelihood parameters <mu> and <lambda>.
922  *
923  *            <phi> should not be much greater than <mu>, the
924  *            mode of the Gumbel, or the fit will become unstable
925  *            or may even fail to converge. The problem is
926  *            that for <phi> $>$ <mu>, the tail of the Gumbel
927  *            becomes a scale-free exponential, and <mu> becomes
928  *            undetermined.
929  *
930  * Algorithm: Uses conjugate gradient descent to optimize the
931  *            log likelihood of the data. Follows a general
932  *            approach to fitting missing data problems outlined
933  *            in [Gelman95].
934  *
935  * Args:      x          - observed data samples [0..n-1]
936  *            n          - number of samples
937  *            phi        - truncation threshold; all x[i] >= phi
938  *            ret_mu     - RETURN: ML estimate of mu
939  *            ret_lambda - RETURN: ML estimate of lambda
940  *
941  * Returns:   <eslOK> on success.
942  *
943  *            <eslEINVAL> if n<=1.
944  *            <eslENORESULT> if the fit fails, likely because the
945  *            number of samples <n> is too small, or because the
946  *            truncation threshold is high enough that the tail
947  *            looks like a scale-free exponential and we can't
948  *            obtain <mu>.
949  *            On either error, <*ret_mu> and <*ret_lambda> are
950  *            returned as 0.0.
951  *            These are "normal" (returned) errors because
952  *            the data might be provided directly by a user.
953  *
954  * Throws:    <eslEMEM> on allocation error.
955  */
956 int
esl_gumbel_FitTruncated(double * x,int n,double phi,double * ret_mu,double * ret_lambda)957 esl_gumbel_FitTruncated(double *x, int n, double phi, double *ret_mu, double *ret_lambda)
958 {
959   ESL_MIN_CFG *cfg = NULL;      /* customization of the CG optimizer */
960   struct tevd_data data;
961   double p[2];			/* mu, w;  lambda = e^w */
962   double mean, variance;
963   double mu, lambda;
964   double fx;
965   int    i;
966   int    status;
967 
968   /* customization of the optimizer */
969   if ((cfg = esl_min_cfg_Create(2)) == NULL) { status = eslEMEM; goto ERROR; }
970   cfg->u[0]    = 2.0;
971   cfg->u[1]    = 0.1;
972   cfg->cg_rtol = 1e-4;
973 
974   /* Can't fit to n<=1 */
975   if (n <= 1) { status = eslEINVAL; goto ERROR; }
976 
977   /* Can fail on small <n>. One way is if x_i are all identical, so
978    * ML lambda is undefined.
979    */
980   for (i = 1; i < n; i++) if (x[i] != x[0]) break;
981   if  (i == n) { status = eslENORESULT; goto ERROR; }
982 
983   data.x   = x;
984   data.n   = n;
985   data.phi = phi;
986 
987   /* The source of the following magic is Evans/Hastings/Peacock,
988    * Statistical Distributions, 3rd edition (2000), p.86, which gives
989    * eq's for the mean and variance of a Gumbel in terms of mu and lambda;
990    * we turn them around to get mu and lambda in terms of the mean and variance.
991    * These would be reasonable estimators if we had a full set of Gumbel
992    * distributed variates. They'll be off for a truncated sample, but
993    * close enough to be a useful starting point.
994    */
995   esl_stats_DMean(x, n, &mean, &variance);
996   lambda = eslCONST_PI / sqrt(6.*variance);
997   mu     = mean - 0.57722/lambda;
998 
999   p[0] = mu;
1000   p[1] = log(lambda);		/* c.o.v. because lambda is constrained to >0 */
1001 
1002   /* Pass the problem to the optimizer. The work is done by the
1003    * equations in tevd_func() and tevd_grad().
1004    */
1005   status = esl_min_ConjugateGradientDescent(cfg, p, 2,
1006 					    &tevd_func, &tevd_grad,(void *)(&data),
1007 					    &fx, NULL);
1008   if      (status == eslENOHALT) { status = eslENORESULT; goto ERROR; }
1009   else if (status != eslOK)      goto ERROR;
1010 
1011   esl_min_cfg_Destroy(cfg);
1012   *ret_mu     = p[0];
1013   *ret_lambda = exp(p[1]);	/* reverse the c.o.v. */
1014   return status;
1015 
1016  ERROR:
1017   esl_min_cfg_Destroy(cfg);
1018   *ret_mu     = 0.0;
1019   *ret_lambda = 0.0;
1020   return status;
1021 }
1022 /*------------------------ end of fitting --------------------------------*/
1023 
1024 /*****************************************************************
1025  * 8. Stats driver
1026  *****************************************************************/
1027 #ifdef eslGUMBEL_STATS
1028 /* compile: gcc -g -O2 -Wall -I. -L. -o stats -DeslGUMBEL_STATS esl_gumbel.c -leasel -lm
1029  * run:     ./stats > stats.out
1030  * process w/ lines like:
1031  *    grep "complete    100" stats.out | awk '{$i = 100*($5-$4)/$4; if ($i < 0) $i = -$i; print $i}' | avg
1032  *    grep "complete    100" stats.out | awk '{$i = 100*($7-$6)/$6; if ($i < 0) $i = -$i; print $i}' | avg
1033  * to get accuracy summary (in %) for mu, lambda; first part of the grep pattern may be "complete", "censored", or
1034  * "truncated", second part may be "    100", "   1000", "  10000", or " 100000".
1035  *
1036  * This is the routine that collects the accuracy statistics that appear
1037  * in tables in the Gumbel chapter of the guide, esl_gumbel.tex.
1038  */
1039 #include <stdio.h>
1040 
1041 #include "easel.h"
1042 #include "esl_random.h"
1043 #include "esl_minimizer.h"
1044 #include "esl_gumbel.h"
1045 
1046 int
main(int argc,char ** argv)1047 main(int argc, char **argv)
1048 {
1049   ESL_RANDOMNESS *r;
1050   int    totalN[4] = {100, 1000, 10000, 100000}; /*biggest last; one malloc*/
1051   int    nexps = 4;
1052   int    exp;
1053   int    trial, ntrials;
1054   double phi;		/* truncation threshold. */
1055   int    i;
1056   int    n;
1057   double *x;
1058   double  mu, lambda;
1059   double  est_mu, est_lambda;
1060   double  val;
1061   int     do_complete, do_censored, do_truncated, do_location;
1062 
1063   ntrials = 500;
1064   mu      = -20.0;
1065   lambda  = 0.693;
1066   phi     = -15.;
1067 
1068   do_complete  = TRUE;		/* Flip these on/off as desired */
1069   do_censored  = FALSE;
1070   do_truncated = FALSE;
1071   do_location  = FALSE;
1072 
1073   r = esl_randomness_Create(0);
1074   x = malloc(sizeof(double) * totalN[nexps-1]);
1075 
1076   /* Fitting to simulated complete datasets
1077    */
1078   if (do_complete) {
1079     for (exp = 0; exp < nexps; exp++)
1080       {
1081 	for (trial = 0; trial < ntrials; trial++)
1082 	  {
1083 	    for (i = 0; i < totalN[exp]; i++)
1084 	      x[i] = esl_gumbel_Sample(r, mu, lambda);
1085 
1086 	    /*direct_mv_fit(x, totalN[exp], &est_mu, &est_lambda);*/
1087 	    if (esl_gumbel_FitComplete(x, totalN[exp], &est_mu, &est_lambda) != eslOK)
1088 	      esl_fatal("gumbel complete fit fit failed");
1089 
1090 	    printf("complete %6d %6d %9.5f %9.5f %8.6f %8.6f\n",
1091 		   totalN[exp], totalN[exp], mu, est_mu, lambda, est_lambda);
1092 	  }
1093 	printf("\n");
1094       }
1095   }
1096 
1097   /* Fitting to simulated censored datasets
1098    */
1099   if (do_censored) {
1100     for (exp = 0; exp < nexps; exp++)
1101       {
1102 	for (trial = 0; trial < ntrials; trial++)
1103 	  {
1104 	    for (n = 0, i = 0; i < totalN[exp]; i++)
1105 	      {
1106 		val = esl_gumbel_Sample(r, mu, lambda);
1107 		if (val >= phi) x[n++] = val;
1108 	      }
1109 	    if (esl_gumbel_FitCensored(x, n, totalN[exp]-n, phi, &est_mu, &est_lambda) != eslOK)
1110 	      esl_fatal("gumbel censored fit failed");
1111 
1112 	    printf("censored %6d %6d %9.5f %9.5f %8.6f %8.6f\n",
1113 		   totalN[exp], n, mu, est_mu, lambda, est_lambda);
1114 	  }
1115 	printf("\n");
1116       }
1117   }
1118 
1119   /* Fitting to simulated truncated datasets
1120    */
1121   if (do_truncated) {
1122     for (exp = 0; exp < nexps; exp++)
1123       {
1124 	for (trial = 0; trial < ntrials; trial++)
1125 	  {
1126 	    for (n = 0, i = 0; i < totalN[exp]; i++)
1127 	      {
1128 		val = esl_gumbel_Sample(r, mu, lambda);
1129 		if (val >= phi) x[n++] = val;
1130 	      }
1131 	    if (esl_gumbel_FitTruncated(x, n, phi, &est_mu, &est_lambda) != eslOK)
1132 	      esl_fatal("gumbel truncated fit failed");
1133 
1134 	    printf("truncated %6d %6d %9.5f %9.5f %8.6f %8.6f\n",
1135 		   totalN[exp], n, mu, est_mu, lambda, est_lambda);
1136 	  }
1137 	printf("\n");
1138       }
1139   }
1140 
1141   /* Fitting mu given lambda
1142    */
1143   if (do_location) {
1144     for (exp = 0; exp < nexps; exp++)
1145       {
1146 	for (trial = 0; trial < ntrials; trial++)
1147 	  {
1148 	    for (i = 0; i < totalN[exp]; i++)
1149 	      x[i] = esl_gumbel_Sample(r, mu, lambda);
1150 
1151 	    if (esl_gumbel_FitCompleteLoc(x, totalN[exp], lambda, &est_mu) != eslOK)
1152 	      esl_fatal("gumbel location-only complete fit failed");
1153 
1154 	    printf("location %6d %6d %9.5f %9.5f\n",
1155 		   totalN[exp], totalN[exp], mu, est_mu);
1156 	  }
1157 	printf("\n");
1158       }
1159   }
1160 
1161   esl_randomness_Destroy(r);
1162   free(x);
1163   return 0;
1164 }
1165 #endif /*eslGUMBEL_STATS*/
1166 
1167 /*****************************************************************
1168  * 9. Unit tests.
1169  *****************************************************************/
1170 #ifdef eslGUMBEL_TESTDRIVE
1171 
1172 #include "esl_getopts.h"
1173 #include "esl_random.h"
1174 
1175 static void
utest_fitting(ESL_RANDOMNESS * rng)1176 utest_fitting(ESL_RANDOMNESS *rng)
1177 {
1178   char    msg[]   = "esl_gumbel: fitting unit test failed";
1179   int     totalN  = 10000;
1180   double  pmu     = -20.;
1181   double  plambda = 0.4;
1182   double  phi     = -20.;
1183   double *x       = NULL;
1184   int     i;
1185   int     n;
1186   double  mu;
1187   double  lambda;
1188   int     status;
1189 
1190   /* Simulate a complete Gumbel distributed dataset of <totalN> samples */
1191   ESL_ALLOC(x, sizeof(double) * totalN);
1192   for (i = 0; i < totalN; i++)
1193     x[i] = esl_gumbel_Sample(rng, pmu, plambda);
1194 
1195   /* Complete data fitting.
1196    * Don't tolerate more than 1% error in mu, 3% in lambda.
1197    */
1198   if ((status = esl_gumbel_FitComplete(x, totalN, &mu, &lambda)) != eslOK) esl_fatal(msg);
1199   if (fabs((mu    -pmu)    /pmu)     > 0.01) esl_fatal(msg);
1200   if (fabs((lambda-plambda)/plambda) > 0.03) esl_fatal(msg);
1201 
1202   /* Complete data, known lambda; fit location <mu>
1203    */
1204   if ((status = esl_gumbel_FitCompleteLoc(x, totalN, plambda, &mu)) != eslOK) esl_fatal(msg);
1205   if (fabs((mu   - pmu) / pmu)      > 0.01) esl_fatal(msg);
1206 
1207 
1208   /* Left censor/truncate the data set, to <n> values x_i >= phi;
1209    * <Ntotal-n> are censored
1210    */
1211   for (n=0, i = 0; i < totalN; i++)
1212     if (x[i] >= phi) x[n++] = x[i];
1213 
1214   /* Censored fitting.
1215    * Don't tolerate more than 1% error in mu, 4% in lambda.
1216    */
1217   if ((status = esl_gumbel_FitCensored(x, n, totalN-n, phi, &mu, &lambda)) != eslOK) esl_fatal(msg);
1218   if (fabs((mu     - pmu)    /pmu)     > 0.01) esl_fatal(msg);
1219   if (fabs((lambda - plambda)/plambda) > 0.04) esl_fatal(msg);
1220 
1221   /* Censored data, known lambda; fit location <mu>
1222    */
1223   if ((status = esl_gumbel_FitCensoredLoc(x, n, totalN-n, phi, plambda, &mu)) != eslOK) esl_fatal(msg);
1224   if (fabs((mu   - pmu) / pmu) > 0.01) esl_fatal(msg);
1225 
1226   /* Truncated fitting.
1227    * Don't tolerate more than 5% error in mu, 8% in lambda.
1228    */
1229   if ((status = esl_gumbel_FitTruncated(x, n, phi, &mu, &lambda)) != eslOK) esl_fatal(msg);
1230   if (fabs((mu     - pmu)    /pmu)     > 0.05) esl_fatal(msg);
1231   if (fabs((lambda - plambda)/plambda) > 0.08) esl_fatal(msg);
1232 
1233   free(x);
1234   return;
1235 
1236  ERROR:
1237   if (x) free(x);
1238   esl_fatal("allocation failure in esl_gumbel : fitting unit test");
1239 }
1240 
1241 
1242 static void
utest_fit_failure(void)1243 utest_fit_failure(void)
1244 {
1245   char   msg[] = "esl_gumbel: fit_failure unit test failed";
1246   double x[10];
1247   double mu;
1248   double lambda;
1249   int    status;
1250 
1251   x[0] = 1.0;
1252   x[1] = 1.0;
1253 
1254   /* n=0 or 1 => eslEINVAL. */
1255   status = esl_gumbel_FitComplete(x, 1, &mu, &lambda);
1256   if (status != eslEINVAL)    esl_fatal(msg);
1257   if (mu     != 0.0)          esl_fatal(msg);
1258   if (lambda != 0.0)          esl_fatal(msg);
1259 
1260   /* Test for failure on small n => eslENORESULT */
1261   status = esl_gumbel_FitComplete(x, 2, &mu, &lambda);
1262   if (status != eslENORESULT) esl_fatal(msg);
1263   if (mu     != 0.0)          esl_fatal(msg);
1264   if (lambda != 0.0)          esl_fatal(msg);
1265 
1266   /* FitCompleteLoc() invalid on n=0,1; but always succeeds for n>1 */
1267   status = esl_gumbel_FitCompleteLoc(x, 1, 1.0, &mu);
1268   if (status != eslEINVAL)    esl_fatal(msg);
1269   if (mu     != 0.0)          esl_fatal(msg);
1270 
1271   /* FitCensored() is eslEINVAL on n=0,1, like FitComplete().
1272    */
1273   status = esl_gumbel_FitCensored(x, 1, 1, 0.0, &mu, &lambda);
1274   if (status != eslEINVAL)    esl_fatal(msg);
1275   if (mu     != 0.0)          esl_fatal(msg);
1276   if (lambda != 0.0)          esl_fatal(msg);
1277 
1278   /* FitCensored() can fail on small n, w/ eslENORESULT */
1279   status = esl_gumbel_FitCensored(x, 2, 1, 0.0, &mu, &lambda);
1280   if (status != eslENORESULT) esl_fatal(msg);
1281   if (mu     != 0.0)          esl_fatal(msg);
1282   if (lambda != 0.0)          esl_fatal(msg);
1283 
1284   /* FitCensoredLoc()invalid on n=0,1; but always succeeds for n>1 */
1285   status = esl_gumbel_FitCensoredLoc(x, 1, 1, 0.0, 1.0, &mu);
1286   if (status != eslEINVAL)    esl_fatal(msg);
1287   if (mu     != 0.0)          esl_fatal(msg);
1288 
1289   /* FitTruncated() w/ n=0,1 => eslEINVAL. */
1290   status = esl_gumbel_FitTruncated(x, 1, 0.0, &mu, &lambda);
1291   if (status != eslEINVAL)    esl_fatal(msg);
1292   if (mu     != 0.0)          esl_fatal(msg);
1293   if (lambda != 0.0)          esl_fatal(msg);
1294 
1295   /* FitTruncated() can fail on small n, w/ eslENORESULT */
1296   status = esl_gumbel_FitTruncated(x, 2, 0.0, &mu, &lambda);
1297   if (status != eslENORESULT) esl_fatal(msg);
1298   if (mu     != 0.0)          esl_fatal(msg);
1299   if (lambda != 0.0)          esl_fatal(msg);
1300 
1301   return;
1302 }
1303 #endif /*eslGUMBEL_TESTDRIVE*/
1304 
1305 /*****************************************************************
1306  * 10. Test driver.
1307  *****************************************************************/
1308 #ifdef eslGUMBEL_TESTDRIVE
1309 /* compile: gcc -g -Wall -I. -L. -o esl_gumbel_utest -DeslGUMBEL_TESTDRIVE esl_gumbel.c -leasel -lm
1310  *  (gcov): gcc -g -Wall -fprofile-arcs -ftest-coverage -I. -L. -o esl_gumbel_utest -DeslGUMBEL_TESTDRIVE esl_gumbel.c -leasel -lm
1311  * run:       ./esl_gumbel_utest
1312  * coverage:  ./esl_gumbel_utest; gcov esl_gumbel.c; more esl_gumbel.c.gcov
1313  */
1314 #include <stdio.h>
1315 
1316 #include "easel.h"
1317 #include "esl_getopts.h"
1318 #include "esl_random.h"
1319 #include "esl_minimizer.h"
1320 #include "esl_gumbel.h"
1321 #include "esl_stats.h"
1322 
1323 static ESL_OPTIONS options[] = {
1324    /* name  type         default  env   range togs  reqs incomp  help                        docgrp */
1325   { "-h",  eslARG_NONE,    FALSE, NULL, NULL, NULL, NULL, NULL, "show help and usage",           0},
1326   { "-s",  eslARG_INT,      "42", NULL, NULL, NULL, NULL, NULL, "set random number seed to <n>", 0},
1327   { "-v",  eslARG_NONE,    FALSE, NULL, NULL, NULL, NULL, NULL, "verbose: show verbose output",  0},
1328   { 0,0,0,0,0,0,0,0,0,0},
1329 };
1330 static char usage[]  = "[-options]";
1331 static char banner[] = "test driver for Gumbel distribution routines";
1332 
1333 int
main(int argc,char ** argv)1334 main(int argc, char **argv)
1335 {
1336   ESL_GETOPTS    *go         = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage);
1337   ESL_RANDOMNESS *rng        = esl_randomness_Create(esl_opt_GetInteger(go, "-s"));
1338   int             be_verbose = esl_opt_GetBoolean(go, "-v");
1339 
1340   if (be_verbose) printf("seed = %" PRIu32 "\n", esl_randomness_GetSeed(rng));
1341 
1342   utest_fitting(rng);
1343   utest_fit_failure();
1344 
1345   esl_randomness_Destroy(rng);
1346   esl_getopts_Destroy(go);
1347   return 0;
1348 }
1349 #endif /*eslGUMBEL_TESTDRIVE*/
1350 
1351 
1352 /*****************************************************************
1353  * 11. Example.
1354  *****************************************************************/
1355 #ifdef eslGUMBEL_EXAMPLE
1356 /*::cexcerpt::gumbel_example::begin::*/
1357 #include <stdio.h>
1358 #include "easel.h"
1359 #include "esl_random.h"
1360 #include "esl_gumbel.h"
1361 
1362 int
main(int argc,char ** argv)1363 main(int argc, char **argv)
1364 {
1365   ESL_RANDOMNESS *r = esl_randomness_Create(0);;
1366   int     n         = 10000; 	/* simulate 10,000 samples */
1367   double  mu        = -20.0;       /* with mu = -20 */
1368   double  lambda    = 0.4;         /* and lambda = 0.4 */
1369   double  min       =  9999.;
1370   double  max       = -9999.;
1371   double *x         = malloc(sizeof(double) * n);
1372   double  z, est_mu, est_lambda;
1373   int     i;
1374 
1375   for (i = 0; i < n; i++)	/* generate the 10,000 samples */
1376     {
1377       x[i] = esl_gumbel_Sample(r, mu, lambda);
1378       if (x[i] < min) min = x[i];
1379       if (x[i] > max) max = x[i];
1380     }
1381 
1382   z = esl_gumbel_surv(max, mu, lambda);           /* right tail p~1e-4 >= max */
1383   printf("max = %6.1f  P(>max)  = %g\n", max, z);
1384   z = esl_gumbel_cdf(min, mu, lambda);             /* left tail p~1e-4 < min */
1385   printf("min = %6.1f  P(<=min) = %g\n", min, z);
1386 
1387   if (esl_gumbel_FitComplete(x, n, &est_mu, &est_lambda) != eslOK) /* fit params to the data */
1388     esl_fatal("gumbel ML complete data fit failed");
1389 
1390   z = 100. * fabs((est_mu - mu) / mu);
1391   printf("Parametric mu     = %6.1f.  Estimated mu     = %6.2f.  Difference = %.1f%%.\n",
1392 	 mu, est_mu, z);
1393   z = 100. * fabs((est_lambda - lambda) /lambda);
1394   printf("Parametric lambda = %6.1f.  Estimated lambda = %6.2f.  Difference = %.1f%%.\n",
1395 	 lambda, est_lambda, z);
1396 
1397   free(x);
1398   esl_randomness_Destroy(r);
1399   return 0;
1400 }
1401 /*::cexcerpt::gumbel_example::end::*/
1402 #endif /*eslGUMBEL_EXAMPLE*/
1403 
1404 
1405 
1406