1 /* $Id: $
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's offical duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================*/
25 
26 /*****************************************************************************
27 
28 File name: njn_localmaxstatutil.cpp
29 
30 Author: John Spouge
31 
32 Contents:
33 
34 ******************************************************************************/
35 
36 #include <assert.h>
37 #include "njn_localmaxstatutil.hpp"
38 #include "njn_approx.hpp"
39 #include "njn_dynprogproblim.hpp"
40 #include "njn_integer.hpp"
41 #include "njn_memutil.hpp"
42 #include "njn_root.hpp"
43 
44 #include "sls_basic.hpp"
45 
46 using namespace Njn;
47 
48 
flatten(size_t dimension_,const long int * const * scoreMatrix_,const double * const * prob_,size_t * dim_,long int ** score_,double ** p_,size_t dimension2_)49 void LocalMaxStatUtil::flatten ( // allocates memory for linear probabilities and scores
50 size_t dimension_, // dimension of equilProb_
51 const long int *const *scoreMatrix_, // packed scoring matrix [0...dimension_)[0...dimension2_)
52 const double *const *prob_, // prob_ [0...dimension_)[0...dimension2_) : distribution of scores sum to 1.0
53 size_t *dim_, // dimension of p_
54 long int **score_, // score [0...dim_) in increasing order
55 double **p_, // linear p_ [0...dim_) : distribution of scores
56 size_t dimension2_) // dimension2 of equilProb_
57 {
58     if (dimension2_ == 0) dimension2_ = dimension_;
59 
60     size_t i = 0;
61     size_t j = 0;
62 
63     double sum = 0.0;
64 
65     for (i = 0; i < dimension_; i++)
66     {
67       for (j = 0; j < dimension2_; j++)
68       {
69          sum += prob_ [i][j];
70       }
71     }
72 
73     const double FUDGE = 20.0;
74     assert (Approx::relApprox (sum, 1.0, FUDGE * REL_TOL));
75 
76     long int s = 0;
77     long int min = LONG_MAX;
78     long int max = LONG_MIN;
79 
80     for (i = 0; i < dimension_; i++)
81     {
82       for (j = 0; j < dimension2_; j++)
83       {
84          if (scoreMatrix_ [i][j] < min) min = scoreMatrix_ [i][j];
85          if (max < scoreMatrix_ [i][j]) max = scoreMatrix_ [i][j];
86       }
87     }
88 
89     assert (min <= max);
90 
91     size_t dim = static_cast <size_t> (max - min + 1);
92     double *p = new double [dim];
93     for (i = 0; i < dim; i++) p [i] = 0.0;
94 
95     for (i = 0; i < dimension_; i++)
96     {
97       for (j = 0; j < dimension2_; j++)
98       {
99          p [scoreMatrix_ [i][j] - min] += prob_ [i][j];
100       }
101     }
102 
103     *dim_ = 0;
104 
105     for (s = min; s <= max; s++)
106     {
107       if (0.0 < p [s - min]) ++*dim_;
108     }
109 
110     *p_ = new double [*dim_];
111     *score_ = new long int [*dim_];
112     *dim_ = 0;
113 
114     for (s = min; s <= max; s++)
115     {
116       if (0.0 < p [s - min]) {
117          (*score_) [*dim_] = s;
118          (*p_) [*dim_] = p [s - min];
119          ++*dim_;
120       }
121     }
122 
123     delete [] p; p = 0;
124 }
125 
lambda(size_t dimension_,const long int * const * scoreMatrix_,const double * q_)126 double LocalMaxStatUtil::lambda (
127 size_t dimension_, // dimension of equilProb_
128 const long int *const *scoreMatrix_, // packed scoring matrix [0...dimension_)[0...dimension_)
129 const double *q_) // q_ [0...dimension_) : distribution of independent letters
130 {
131     size_t i = 0;
132     size_t j = 0;
133 
134    double **prob = MemUtil::newMatrix <double> (dimension_, dimension_);
135 
136     for (i = 0; i < dimension_; i++)
137     {
138         for (j = 0; j < dimension_; j++)
139         {
140             prob [i][j] = q_ [i] * q_ [j];
141         }
142     }
143 
144     size_t dim = 0;
145     long int *score = 0;
146     double *p = 0;
147 
148     flatten (dimension_, scoreMatrix_, prob, &dim, &score, &p);
149 
150     for (i = 0; i < dimension_; i++)
151     {
152         delete prob [i];
153     }
154 
155     double lambdaHat = LocalMaxStatUtil::lambda (dim, score, p);
156 
157     delete p; p = 0;
158     delete score; score = 0;
159 
160     return lambdaHat;
161 }
162 
163 
164    size_t n_dimension = 0; // dimension of matrices
165    const long int *n_score = 0; // score_ [0...dimension_ - 1]
166    const double *n_prob = 0; // prob_ [0...dimension_ - 1]
167    long int n_morgue = 0; // score_ [0] - 1
168    long int n_entry = 0; // n_entry = 0 : weak descending ladder epoch ; n_entry = -1 : strict descending ladder epoch
169 
n_setParameters(size_t dimension_,const long int * score_,const double * prob_,long int entry_=0)170    void n_setParameters (
171    size_t dimension_, // #(distinct values) of scores & probabilities (which are paired)
172    const long int *score_, // scores
173    const double *prob_, // probabilities
174    long int entry_ = 0) // entry_ = 0 : weak descending ladder epoch ; entry_ = -1 : strict descending ladder epoch
175    {
176       n_dimension = dimension_;
177       n_score = score_;
178       n_prob = prob_;
179       n_morgue = score_ [0] - 1;
180       n_entry = entry_;
181    }
182 
n_totalProbAssoc(double x_)183    double n_totalProbAssoc (double x_)
184    {
185       double sum = 0.0;
186       for (size_t i = 0; i < n_dimension; i++) {
187          sum += n_prob [i] * exp (x_ * static_cast <double> (n_score [i]));
188       }
189       return sum;
190    }
191 
n_meanPowerAssoc(double x_,long int power_=1L)192    double n_meanPowerAssoc (double x_, long int power_ = 1L)
193    {
194       double sum = 0.0;
195       for (size_t i = 0; i < n_dimension; i++) {
196           sum += Integer::integerPower (static_cast <double> (n_score [i]), power_) *
197             n_prob [i] * exp (x_ * static_cast <double> (n_score [i]));
198       }
199       return sum;
200    }
201 
n_meanAssoc(double x_)202    double n_meanAssoc (double x_)
203    {
204       return n_meanPowerAssoc (x_);
205    }
206 
n_bracket(double * p_,double * q_)207    void n_bracket (double *p_, double *q_)
208    {
209       const double FACTOR = 0.5;
210       *p_ = -log (n_prob [n_dimension - 1]) / static_cast <double> (n_score [n_dimension - 1]);
211       while (1.0 <= n_totalProbAssoc (*p_)) {
212          *p_ *= FACTOR;
213       }
214       *q_ = *p_ / FACTOR;
215    }
216 
217 
mu(size_t dimension_,const long int * score_,const double * prob_)218 double LocalMaxStatUtil::mu (
219 size_t dimension_, // #(distinct values) of scores & probabilities (which are paired)
220 const long int *score_, // scores in increasing order
221 const double *prob_) // corresponding probabilities
222 {
223    double mu = 0.0;
224    for (size_t i = 0; i < dimension_; i++) {
225       mu += static_cast <double> (score_ [i]) * prob_ [i];
226    }
227    return mu;
228 }
229 
lambda(size_t dimension_,const long int * score_,const double * prob_)230 double LocalMaxStatUtil::lambda (
231 size_t dimension_, // #(distinct values)
232 const long int *score_, // values
233 const double *prob_) // probability of corresponding value
234 {
235    n_setParameters (dimension_, score_, prob_);
236 
237    double p = 0.0;
238    double q = 0.0;
239 
240    n_bracket (&p, &q);
241 
242    return Root::bisection (1.0, n_totalProbAssoc, p, q, REL_TOL * fabs (p - q));
243 }
244 
muPowerAssoc(size_t dimension_,const long int * score_,const double * prob_,double lambda_,long int power_)245 double LocalMaxStatUtil::muPowerAssoc (
246 size_t dimension_, // #(distinct values) of scores & probabilities (which are paired)
247 const long int *score_, // scores in increasing order
248 const double *prob_, // corresponding probabilities
249 double lambda_, // lambda
250 long int power_) // power
251 {
252    n_setParameters (dimension_, score_, prob_);
253 
254    if (lambda_ == 0.0) lambda_ = lambda (dimension_, score_, prob_);
255 
256    return n_meanPowerAssoc (lambda_, power_);
257 }
258 
muAssoc(size_t dimension_,const long int * score_,const double * prob_,double lambda_)259 double LocalMaxStatUtil::muAssoc (
260 size_t dimension_, // #(distinct values) of scores & probabilities (which are paired)
261 const long int *score_, // scores in increasing order
262 const double *prob_, // corresponding probabilities
263 double lambda_) // lambda
264 {
265    return muPowerAssoc (dimension_, score_, prob_, lambda_);
266 }
267 
thetaMin(size_t dimension_,const long int * score_,const double * prob_,double lambda_)268 double LocalMaxStatUtil::thetaMin (
269 size_t dimension_, // #(distinct values)
270 const long int *score_, // values
271 const double *prob_, // probability of corresponding value
272 double lambda_) // lambda
273 // assumes logarithmic regime
274 {
275    n_setParameters (dimension_, score_, prob_);
276 
277    if (lambda_ == 0.0) lambda_ = lambda (dimension_, score_, prob_);
278 
279    double p = 0.0;
280    double q = 0.0;
281    n_bracket (&p, &q);
282    return Root::bisection (0.0, n_meanAssoc, 0.0, lambda_, REL_TOL * fabs (p - q));
283 }
284 
rMin(size_t dimension_,const long int * score_,const double * prob_,double lambda_,double thetaMin_)285 double LocalMaxStatUtil::rMin (
286 size_t dimension_, // #(distinct values)
287 const long int *score_, // values
288 const double *prob_, // probability of corresponding value
289 double lambda_, // lambda
290 double thetaMin_) // argument of rate
291 // assumes logarithmic regime
292 {
293    n_setParameters (dimension_, score_, prob_);
294 
295    if (thetaMin_ == 0.0) thetaMin_ = thetaMin (dimension_, score_, prob_, lambda_);
296 
297    return n_totalProbAssoc (thetaMin_);
298 }
299 
r(size_t dimension_,const long int * score_,const double * prob_,double theta_)300 double LocalMaxStatUtil::r ( // r (theta)
301 size_t dimension_, // #(distinct values)
302 const long int *score_, // scores in increasing order
303 const double *prob_, // probability of corresponding value
304 double theta_) // argument of rate
305 // assumes logarithmic regime
306 {
307    double sum = 0.0;
308    for (size_t i = 0; i < dimension_; i++) {
309       sum += prob_ [i] * exp (theta_ * static_cast <double> (score_ [i]));
310    }
311    return sum;
312 }
313 
delta(size_t dimension_,const long int * score_)314 long int LocalMaxStatUtil::delta ( // theta [minus delta] for ungapped sequence comparison
315 size_t dimension_, // #(distinct values) of scores & probabilities (which are paired)
316 const long int *score_) // scores
317 {
318    size_t i = 0;
319 
320    long int delta = 0;
321    for (i = 0; i < dimension_; i++) {
322       delta = Integer::euclidAlgorithm <long int> (delta, score_ [i]);
323    }
324    return delta;
325 }
326 
thetaMinusDelta(double lambda_,size_t dimension_,const long int * score_)327 double LocalMaxStatUtil::thetaMinusDelta ( // theta [minus delta] for ungapped sequence comparison
328 double lambda_, // lambda, the exponential rate for the local maximum
329 size_t dimension_, // #(distinct values) of scores & probabilities (which are paired)
330 const long int *score_) // scores
331 {
332    double del = static_cast <double> (delta (dimension_, score_));
333    return (1.0 - exp (-lambda_ * del)) / del;
334 }
335 
336 
n_step(long int oldValue_,size_t state_)337    long int n_step (long int oldValue_, size_t state_)
338    {
339       assert (state_ < n_dimension);
340       return n_morgue < oldValue_ ? oldValue_ + n_score [state_] : oldValue_;
341    }
342 
n_bury(long int oldValue_,size_t state_)343    long int n_bury (long int oldValue_, size_t state_)
344    {
345       assert (state_ < n_dimension);
346       return n_entry < oldValue_ ? oldValue_ : n_morgue;
347    }
348 
349 
descendingLadderEpochRepeat(size_t dimension_,const long int * score_,const double * prob_,double * eSumAlpha_,double * eOneMinusExpSumAlpha_,bool isStrict_,double lambda_,size_t endW_,double * pAlphaW_,double * eOneMinusExpSumAlphaW_,double lambda0_,double mu0_,double muAssoc0_,double thetaMin0_,double rMin0_,double time_,bool * terminated_)350 void LocalMaxStatUtil::descendingLadderEpochRepeat (
351 size_t dimension_, // #(distinct values)
352 const long int *score_, // values
353 const double *prob_, // probability of corresponding value
354 double *eSumAlpha_, // expectation (sum [alpha])
355 double *eOneMinusExpSumAlpha_, // expectation [1.0 - exp (sum [alpha])]
356 bool isStrict_, // ? is this a strict descending ladder epoch
357 double lambda_, // lambda for repeats : default is lambda0_ below
358 size_t endW_, // maximum w plus 1
359 double *pAlphaW_, // probability {alpha = w} : pAlphaW_ [0, wEnd)
360 double *eOneMinusExpSumAlphaW_, // expectation [1.0 - exp (sum [alpha]); alpha = w] : eOneMinusExpSumAlphaW_ [0, wEnd)
361 double lambda0_, // lambda for flattened distribution (avoid recomputation)
362 double mu0_, // mean of flattened distribution (avoid recomputation)
363 double muAssoc0_, // mean of associated flattened distribution (avoid recomputation)
364 double thetaMin0_, // thetaMin of flattened distribution (avoid recomputation)
365 double rMin0_, // rMin of flattened distribution (avoid recomputation)
366 double time_, // get time for the dynamic programming computation
367 bool *terminated_) // ? Was the dynamic programming computation terminated prematurely ?
368 // assumes logarithmic regime
369 {
370     // Start dynamic programming probability calculation using notation in
371     //
372     // Mott R. and Tribe R. (1999)
373     // J. Computational Biology 6(1):91-112
374     //
375     // Karlin S. and Taylor H.M.(1981)
376     // A Second Course in Stochastic Processes, p. 480
377     //
378     // Note there is an error in Eq (6.19) there, which is corrected in Eq (6.20)
379     //
380     // This program uses departure into (-Inf, 0] not (-Inf, 0)
381 
382     // avoid recomputation
383     double mu0 = 0.0 == mu0_ ? mu (dimension_, score_, prob_) : mu0_;
384     assert (mu0 < 0.0);
385     double lambda0 = 0.0 == lambda0_ ? lambda (dimension_, score_, prob_) : lambda0_;
386     assert (0.0 < lambda0);
387     if (lambda_ == 0.0) lambda_ = lambda0;
388     assert (0.0 < lambda_);
389     double muAssoc0 = 0.0 == muAssoc0_ ? muAssoc (dimension_, score_, prob_, lambda0) : muAssoc0_;
390     assert (0.0 < muAssoc0);
391     double thetaMin0 = 0.0 == thetaMin0_ ? thetaMin (dimension_, score_, prob_, lambda0) : thetaMin0_;
392     assert (0.0 < thetaMin0);
393     double rMin0 = 0.0 == rMin0_ ? rMin (dimension_, score_, prob_, lambda0, thetaMin0) : rMin0_;
394     assert (0.0 < rMin0 && rMin0 < 1.0);
395 
396     const long int ITER_MIN = static_cast <long int> ((log (REL_TOL * (1.0 - rMin0)) / log (rMin0)));
397     assert (0 < ITER_MIN);
398     const long int ITER = static_cast <long int> (endW_) < ITER_MIN ? ITER_MIN : static_cast <long int> (endW_);
399     assert (0 < ITER);
400     const long int Y_MAX = static_cast <long int> (-log (REL_TOL) / lambda0);
401 
402     long int entry = isStrict_ ? -1 : 0;
403     n_setParameters (dimension_, score_, prob_, entry);
404 
405 
406     double time0 = 0.0;
407     double time1 = 0.0;
408 	if (time_ > 0.0) Sls::sls_basic::get_current_time (time0);
409 
410     DynProgProbLim dynProgProb (n_step, dimension_, prob_, score_ [0] - 1, Y_MAX);
411 
412     if (pAlphaW_) pAlphaW_ [0] = 0.0;
413     if (eOneMinusExpSumAlphaW_) eOneMinusExpSumAlphaW_ [0] = 0.0;
414 
415     dynProgProb.update (); // iterate random walk
416 
417     long int value = 0;
418 
419     if (eSumAlpha_) *eSumAlpha_ = 0.0;
420     if (eOneMinusExpSumAlpha_) *eOneMinusExpSumAlpha_ = 0.0;
421 
422     for (size_t w = 1; w < static_cast <size_t> (ITER); w++) {
423 
424         if (w < endW_) { // sum pAlphaW_ [w] and eOneMinusExpSumAlphaW_ [w]
425 
426              if (pAlphaW_) pAlphaW_ [w] = 0.0;
427              if (eOneMinusExpSumAlphaW_) eOneMinusExpSumAlphaW_ [w] = 0.0;
428 
429              for (value = score_ [0]; value <= entry; value++) {
430                 if (pAlphaW_) pAlphaW_ [w] += dynProgProb.getProb (value);
431                 if (eOneMinusExpSumAlphaW_) eOneMinusExpSumAlphaW_ [w] +=
432                                                dynProgProb.getProb (value) *
433                                                (1.0 - exp (lambda_ * static_cast <double> (value)));
434              }
435         }
436 
437         for (value = score_ [0]; value <= entry; value++) {
438          if (eSumAlpha_) *eSumAlpha_ += dynProgProb.getProb (value) * static_cast <double> (value);
439          if (eOneMinusExpSumAlpha_) *eOneMinusExpSumAlpha_ += dynProgProb.getProb (value) *
440                                         (1.0 - exp (lambda_ * static_cast <double> (value)));
441         }
442 
443         dynProgProb.setValueFct (n_bury);
444         dynProgProb.update (); // put probability into the morgue
445 
446         dynProgProb.setValueFct (n_step);
447         dynProgProb.update (); // iterate random walk
448 
449         if (time_ > 0.0)
450         {
451 			Sls::sls_basic::get_current_time (time1);
452             if (time1 - time0 > time_)
453             {
454                 *terminated_ = true;
455                 return;
456             }
457         }
458 
459     }
460 
461     for (value = score_ [0]; value <= entry; value++) {
462       if (eSumAlpha_) *eSumAlpha_ += dynProgProb.getProb (value) * static_cast <double> (value);
463       if (eOneMinusExpSumAlpha_) *eOneMinusExpSumAlpha_ += dynProgProb.getProb (value) *
464                                      (1.0 - exp (lambda_ * static_cast <double> (value)));
465     }
466 
467     // check that not too much probability has been omitted
468     double prob = 0.0;
469     for (value = entry + 1; value < dynProgProb.getValueUpper (); value++) {
470       prob += dynProgProb.getProb (value);
471     }
472     prob += dynProgProb.getProbLost ();
473 
474     const double FUDGE = 100.0;
475     assert (prob <= FUDGE * static_cast <double> (dimension_) * REL_TOL);
476 }
477 
descendingLadderEpoch(size_t dimension_,const long int * score_,const double * prob_,double * eSumAlpha_,double * eOneMinusExpSumAlpha_,bool isStrict_,double lambda0_,double mu0_,double muAssoc0_,double thetaMin0_,double rMin0_,double time_,bool * terminated_)478 void LocalMaxStatUtil::descendingLadderEpoch (
479 size_t dimension_, // #(distinct values)
480 const long int *score_, // values
481 const double *prob_, // probability of corresponding value
482 double *eSumAlpha_, // expectation (sum [alpha])
483 double *eOneMinusExpSumAlpha_, // expectation [1.0 - exp (sum [alpha])]
484 bool isStrict_, // ? is this a strict descending ladder epoch
485 double lambda0_, // lambda for flattened distribution (avoid recomputation)
486 double mu0_, // mean of flattened distribution (avoid recomputation)
487 double muAssoc0_, // mean of associated flattened distribution (avoid recomputation)
488 double thetaMin0_, // thetaMin of flattened distribution (avoid recomputation)
489 double rMin0_, // rMin of flattened distribution (avoid recomputation)
490 double time_, // get time for the dynamic programming computation
491 bool *terminated_) // ? Was the dynamic programming computation terminated prematurely ?
492 {
493    descendingLadderEpochRepeat (dimension_, score_, prob_,
494       eSumAlpha_, eOneMinusExpSumAlpha_, isStrict_, 0.0, 0, 0, 0,
495       lambda0_, mu0_, muAssoc0_, thetaMin0_, rMin0_, time_, terminated_);
496 }
497 
isProbDist(size_t dimension_,const double * prob_)498 bool LocalMaxStatUtil::isProbDist (
499 size_t dimension_, // #(distinct values) of scores & probabilities (which are paired)
500 const double *prob_) // corresponding probabilities
501 {
502    double sum = 0.0;
503    for (size_t i = 0; i < dimension_; i++) {
504       if (prob_ [i] < 0.0 || 1.0 < prob_ [i]) return false;
505       sum += prob_ [i];
506    }
507    return Approx::relApprox (sum, 1.0, REL_TOL);
508 }
509 
isScoreIncreasing(size_t dimension_,const long int * score_)510 bool LocalMaxStatUtil::isScoreIncreasing (
511 size_t dimension_, // #(distinct values)
512 const long int *score_) // scores in increasing order
513 {
514    for (size_t i = 1; i < dimension_; i++) {
515       if (score_ [i] <= score_ [i - 1]) return false;
516    }
517    return true;
518 }
519 
isLogarithmic(size_t dimension_,const long int * score_,const double * prob_)520 bool LocalMaxStatUtil::isLogarithmic (
521 size_t dimension_, // #(distinct values) of scores & probabilities (which are paired)
522 const long int *score_, // scores in increasing order
523 const double *prob_) // corresponding probabilities
524 {
525    assert (score_);
526    assert (prob_);
527    if (! isScoreIncreasing (dimension_, score_)) return false;
528    if (! isProbDist (dimension_, prob_)) return false;
529    if (0.0 <= mu (dimension_, score_, prob_)) return false;
530    if (score_ [dimension_ - 1] <= 0.0) return false;
531    return true;
532 }
533 
534