1 /* $Id: $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's offical duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================*/
25
26 /*****************************************************************************
27
28 File name: njn_localmaxstatutil.cpp
29
30 Author: John Spouge
31
32 Contents:
33
34 ******************************************************************************/
35
36 #include <assert.h>
37 #include "njn_localmaxstatutil.hpp"
38 #include "njn_approx.hpp"
39 #include "njn_dynprogproblim.hpp"
40 #include "njn_integer.hpp"
41 #include "njn_memutil.hpp"
42 #include "njn_root.hpp"
43
44 #include "sls_basic.hpp"
45
46 using namespace Njn;
47
48
flatten(size_t dimension_,const long int * const * scoreMatrix_,const double * const * prob_,size_t * dim_,long int ** score_,double ** p_,size_t dimension2_)49 void LocalMaxStatUtil::flatten ( // allocates memory for linear probabilities and scores
50 size_t dimension_, // dimension of equilProb_
51 const long int *const *scoreMatrix_, // packed scoring matrix [0...dimension_)[0...dimension2_)
52 const double *const *prob_, // prob_ [0...dimension_)[0...dimension2_) : distribution of scores sum to 1.0
53 size_t *dim_, // dimension of p_
54 long int **score_, // score [0...dim_) in increasing order
55 double **p_, // linear p_ [0...dim_) : distribution of scores
56 size_t dimension2_) // dimension2 of equilProb_
57 {
58 if (dimension2_ == 0) dimension2_ = dimension_;
59
60 size_t i = 0;
61 size_t j = 0;
62
63 double sum = 0.0;
64
65 for (i = 0; i < dimension_; i++)
66 {
67 for (j = 0; j < dimension2_; j++)
68 {
69 sum += prob_ [i][j];
70 }
71 }
72
73 const double FUDGE = 20.0;
74 assert (Approx::relApprox (sum, 1.0, FUDGE * REL_TOL));
75
76 long int s = 0;
77 long int min = LONG_MAX;
78 long int max = LONG_MIN;
79
80 for (i = 0; i < dimension_; i++)
81 {
82 for (j = 0; j < dimension2_; j++)
83 {
84 if (scoreMatrix_ [i][j] < min) min = scoreMatrix_ [i][j];
85 if (max < scoreMatrix_ [i][j]) max = scoreMatrix_ [i][j];
86 }
87 }
88
89 assert (min <= max);
90
91 size_t dim = static_cast <size_t> (max - min + 1);
92 double *p = new double [dim];
93 for (i = 0; i < dim; i++) p [i] = 0.0;
94
95 for (i = 0; i < dimension_; i++)
96 {
97 for (j = 0; j < dimension2_; j++)
98 {
99 p [scoreMatrix_ [i][j] - min] += prob_ [i][j];
100 }
101 }
102
103 *dim_ = 0;
104
105 for (s = min; s <= max; s++)
106 {
107 if (0.0 < p [s - min]) ++*dim_;
108 }
109
110 *p_ = new double [*dim_];
111 *score_ = new long int [*dim_];
112 *dim_ = 0;
113
114 for (s = min; s <= max; s++)
115 {
116 if (0.0 < p [s - min]) {
117 (*score_) [*dim_] = s;
118 (*p_) [*dim_] = p [s - min];
119 ++*dim_;
120 }
121 }
122
123 delete [] p; p = 0;
124 }
125
lambda(size_t dimension_,const long int * const * scoreMatrix_,const double * q_)126 double LocalMaxStatUtil::lambda (
127 size_t dimension_, // dimension of equilProb_
128 const long int *const *scoreMatrix_, // packed scoring matrix [0...dimension_)[0...dimension_)
129 const double *q_) // q_ [0...dimension_) : distribution of independent letters
130 {
131 size_t i = 0;
132 size_t j = 0;
133
134 double **prob = MemUtil::newMatrix <double> (dimension_, dimension_);
135
136 for (i = 0; i < dimension_; i++)
137 {
138 for (j = 0; j < dimension_; j++)
139 {
140 prob [i][j] = q_ [i] * q_ [j];
141 }
142 }
143
144 size_t dim = 0;
145 long int *score = 0;
146 double *p = 0;
147
148 flatten (dimension_, scoreMatrix_, prob, &dim, &score, &p);
149
150 for (i = 0; i < dimension_; i++)
151 {
152 delete prob [i];
153 }
154
155 double lambdaHat = LocalMaxStatUtil::lambda (dim, score, p);
156
157 delete p; p = 0;
158 delete score; score = 0;
159
160 return lambdaHat;
161 }
162
163
164 size_t n_dimension = 0; // dimension of matrices
165 const long int *n_score = 0; // score_ [0...dimension_ - 1]
166 const double *n_prob = 0; // prob_ [0...dimension_ - 1]
167 long int n_morgue = 0; // score_ [0] - 1
168 long int n_entry = 0; // n_entry = 0 : weak descending ladder epoch ; n_entry = -1 : strict descending ladder epoch
169
n_setParameters(size_t dimension_,const long int * score_,const double * prob_,long int entry_=0)170 void n_setParameters (
171 size_t dimension_, // #(distinct values) of scores & probabilities (which are paired)
172 const long int *score_, // scores
173 const double *prob_, // probabilities
174 long int entry_ = 0) // entry_ = 0 : weak descending ladder epoch ; entry_ = -1 : strict descending ladder epoch
175 {
176 n_dimension = dimension_;
177 n_score = score_;
178 n_prob = prob_;
179 n_morgue = score_ [0] - 1;
180 n_entry = entry_;
181 }
182
n_totalProbAssoc(double x_)183 double n_totalProbAssoc (double x_)
184 {
185 double sum = 0.0;
186 for (size_t i = 0; i < n_dimension; i++) {
187 sum += n_prob [i] * exp (x_ * static_cast <double> (n_score [i]));
188 }
189 return sum;
190 }
191
n_meanPowerAssoc(double x_,long int power_=1L)192 double n_meanPowerAssoc (double x_, long int power_ = 1L)
193 {
194 double sum = 0.0;
195 for (size_t i = 0; i < n_dimension; i++) {
196 sum += Integer::integerPower (static_cast <double> (n_score [i]), power_) *
197 n_prob [i] * exp (x_ * static_cast <double> (n_score [i]));
198 }
199 return sum;
200 }
201
n_meanAssoc(double x_)202 double n_meanAssoc (double x_)
203 {
204 return n_meanPowerAssoc (x_);
205 }
206
n_bracket(double * p_,double * q_)207 void n_bracket (double *p_, double *q_)
208 {
209 const double FACTOR = 0.5;
210 *p_ = -log (n_prob [n_dimension - 1]) / static_cast <double> (n_score [n_dimension - 1]);
211 while (1.0 <= n_totalProbAssoc (*p_)) {
212 *p_ *= FACTOR;
213 }
214 *q_ = *p_ / FACTOR;
215 }
216
217
mu(size_t dimension_,const long int * score_,const double * prob_)218 double LocalMaxStatUtil::mu (
219 size_t dimension_, // #(distinct values) of scores & probabilities (which are paired)
220 const long int *score_, // scores in increasing order
221 const double *prob_) // corresponding probabilities
222 {
223 double mu = 0.0;
224 for (size_t i = 0; i < dimension_; i++) {
225 mu += static_cast <double> (score_ [i]) * prob_ [i];
226 }
227 return mu;
228 }
229
lambda(size_t dimension_,const long int * score_,const double * prob_)230 double LocalMaxStatUtil::lambda (
231 size_t dimension_, // #(distinct values)
232 const long int *score_, // values
233 const double *prob_) // probability of corresponding value
234 {
235 n_setParameters (dimension_, score_, prob_);
236
237 double p = 0.0;
238 double q = 0.0;
239
240 n_bracket (&p, &q);
241
242 return Root::bisection (1.0, n_totalProbAssoc, p, q, REL_TOL * fabs (p - q));
243 }
244
muPowerAssoc(size_t dimension_,const long int * score_,const double * prob_,double lambda_,long int power_)245 double LocalMaxStatUtil::muPowerAssoc (
246 size_t dimension_, // #(distinct values) of scores & probabilities (which are paired)
247 const long int *score_, // scores in increasing order
248 const double *prob_, // corresponding probabilities
249 double lambda_, // lambda
250 long int power_) // power
251 {
252 n_setParameters (dimension_, score_, prob_);
253
254 if (lambda_ == 0.0) lambda_ = lambda (dimension_, score_, prob_);
255
256 return n_meanPowerAssoc (lambda_, power_);
257 }
258
muAssoc(size_t dimension_,const long int * score_,const double * prob_,double lambda_)259 double LocalMaxStatUtil::muAssoc (
260 size_t dimension_, // #(distinct values) of scores & probabilities (which are paired)
261 const long int *score_, // scores in increasing order
262 const double *prob_, // corresponding probabilities
263 double lambda_) // lambda
264 {
265 return muPowerAssoc (dimension_, score_, prob_, lambda_);
266 }
267
thetaMin(size_t dimension_,const long int * score_,const double * prob_,double lambda_)268 double LocalMaxStatUtil::thetaMin (
269 size_t dimension_, // #(distinct values)
270 const long int *score_, // values
271 const double *prob_, // probability of corresponding value
272 double lambda_) // lambda
273 // assumes logarithmic regime
274 {
275 n_setParameters (dimension_, score_, prob_);
276
277 if (lambda_ == 0.0) lambda_ = lambda (dimension_, score_, prob_);
278
279 double p = 0.0;
280 double q = 0.0;
281 n_bracket (&p, &q);
282 return Root::bisection (0.0, n_meanAssoc, 0.0, lambda_, REL_TOL * fabs (p - q));
283 }
284
rMin(size_t dimension_,const long int * score_,const double * prob_,double lambda_,double thetaMin_)285 double LocalMaxStatUtil::rMin (
286 size_t dimension_, // #(distinct values)
287 const long int *score_, // values
288 const double *prob_, // probability of corresponding value
289 double lambda_, // lambda
290 double thetaMin_) // argument of rate
291 // assumes logarithmic regime
292 {
293 n_setParameters (dimension_, score_, prob_);
294
295 if (thetaMin_ == 0.0) thetaMin_ = thetaMin (dimension_, score_, prob_, lambda_);
296
297 return n_totalProbAssoc (thetaMin_);
298 }
299
r(size_t dimension_,const long int * score_,const double * prob_,double theta_)300 double LocalMaxStatUtil::r ( // r (theta)
301 size_t dimension_, // #(distinct values)
302 const long int *score_, // scores in increasing order
303 const double *prob_, // probability of corresponding value
304 double theta_) // argument of rate
305 // assumes logarithmic regime
306 {
307 double sum = 0.0;
308 for (size_t i = 0; i < dimension_; i++) {
309 sum += prob_ [i] * exp (theta_ * static_cast <double> (score_ [i]));
310 }
311 return sum;
312 }
313
delta(size_t dimension_,const long int * score_)314 long int LocalMaxStatUtil::delta ( // theta [minus delta] for ungapped sequence comparison
315 size_t dimension_, // #(distinct values) of scores & probabilities (which are paired)
316 const long int *score_) // scores
317 {
318 size_t i = 0;
319
320 long int delta = 0;
321 for (i = 0; i < dimension_; i++) {
322 delta = Integer::euclidAlgorithm <long int> (delta, score_ [i]);
323 }
324 return delta;
325 }
326
thetaMinusDelta(double lambda_,size_t dimension_,const long int * score_)327 double LocalMaxStatUtil::thetaMinusDelta ( // theta [minus delta] for ungapped sequence comparison
328 double lambda_, // lambda, the exponential rate for the local maximum
329 size_t dimension_, // #(distinct values) of scores & probabilities (which are paired)
330 const long int *score_) // scores
331 {
332 double del = static_cast <double> (delta (dimension_, score_));
333 return (1.0 - exp (-lambda_ * del)) / del;
334 }
335
336
n_step(long int oldValue_,size_t state_)337 long int n_step (long int oldValue_, size_t state_)
338 {
339 assert (state_ < n_dimension);
340 return n_morgue < oldValue_ ? oldValue_ + n_score [state_] : oldValue_;
341 }
342
n_bury(long int oldValue_,size_t state_)343 long int n_bury (long int oldValue_, size_t state_)
344 {
345 assert (state_ < n_dimension);
346 return n_entry < oldValue_ ? oldValue_ : n_morgue;
347 }
348
349
descendingLadderEpochRepeat(size_t dimension_,const long int * score_,const double * prob_,double * eSumAlpha_,double * eOneMinusExpSumAlpha_,bool isStrict_,double lambda_,size_t endW_,double * pAlphaW_,double * eOneMinusExpSumAlphaW_,double lambda0_,double mu0_,double muAssoc0_,double thetaMin0_,double rMin0_,double time_,bool * terminated_)350 void LocalMaxStatUtil::descendingLadderEpochRepeat (
351 size_t dimension_, // #(distinct values)
352 const long int *score_, // values
353 const double *prob_, // probability of corresponding value
354 double *eSumAlpha_, // expectation (sum [alpha])
355 double *eOneMinusExpSumAlpha_, // expectation [1.0 - exp (sum [alpha])]
356 bool isStrict_, // ? is this a strict descending ladder epoch
357 double lambda_, // lambda for repeats : default is lambda0_ below
358 size_t endW_, // maximum w plus 1
359 double *pAlphaW_, // probability {alpha = w} : pAlphaW_ [0, wEnd)
360 double *eOneMinusExpSumAlphaW_, // expectation [1.0 - exp (sum [alpha]); alpha = w] : eOneMinusExpSumAlphaW_ [0, wEnd)
361 double lambda0_, // lambda for flattened distribution (avoid recomputation)
362 double mu0_, // mean of flattened distribution (avoid recomputation)
363 double muAssoc0_, // mean of associated flattened distribution (avoid recomputation)
364 double thetaMin0_, // thetaMin of flattened distribution (avoid recomputation)
365 double rMin0_, // rMin of flattened distribution (avoid recomputation)
366 double time_, // get time for the dynamic programming computation
367 bool *terminated_) // ? Was the dynamic programming computation terminated prematurely ?
368 // assumes logarithmic regime
369 {
370 // Start dynamic programming probability calculation using notation in
371 //
372 // Mott R. and Tribe R. (1999)
373 // J. Computational Biology 6(1):91-112
374 //
375 // Karlin S. and Taylor H.M.(1981)
376 // A Second Course in Stochastic Processes, p. 480
377 //
378 // Note there is an error in Eq (6.19) there, which is corrected in Eq (6.20)
379 //
380 // This program uses departure into (-Inf, 0] not (-Inf, 0)
381
382 // avoid recomputation
383 double mu0 = 0.0 == mu0_ ? mu (dimension_, score_, prob_) : mu0_;
384 assert (mu0 < 0.0);
385 double lambda0 = 0.0 == lambda0_ ? lambda (dimension_, score_, prob_) : lambda0_;
386 assert (0.0 < lambda0);
387 if (lambda_ == 0.0) lambda_ = lambda0;
388 assert (0.0 < lambda_);
389 double muAssoc0 = 0.0 == muAssoc0_ ? muAssoc (dimension_, score_, prob_, lambda0) : muAssoc0_;
390 assert (0.0 < muAssoc0);
391 double thetaMin0 = 0.0 == thetaMin0_ ? thetaMin (dimension_, score_, prob_, lambda0) : thetaMin0_;
392 assert (0.0 < thetaMin0);
393 double rMin0 = 0.0 == rMin0_ ? rMin (dimension_, score_, prob_, lambda0, thetaMin0) : rMin0_;
394 assert (0.0 < rMin0 && rMin0 < 1.0);
395
396 const long int ITER_MIN = static_cast <long int> ((log (REL_TOL * (1.0 - rMin0)) / log (rMin0)));
397 assert (0 < ITER_MIN);
398 const long int ITER = static_cast <long int> (endW_) < ITER_MIN ? ITER_MIN : static_cast <long int> (endW_);
399 assert (0 < ITER);
400 const long int Y_MAX = static_cast <long int> (-log (REL_TOL) / lambda0);
401
402 long int entry = isStrict_ ? -1 : 0;
403 n_setParameters (dimension_, score_, prob_, entry);
404
405
406 double time0 = 0.0;
407 double time1 = 0.0;
408 if (time_ > 0.0) Sls::sls_basic::get_current_time (time0);
409
410 DynProgProbLim dynProgProb (n_step, dimension_, prob_, score_ [0] - 1, Y_MAX);
411
412 if (pAlphaW_) pAlphaW_ [0] = 0.0;
413 if (eOneMinusExpSumAlphaW_) eOneMinusExpSumAlphaW_ [0] = 0.0;
414
415 dynProgProb.update (); // iterate random walk
416
417 long int value = 0;
418
419 if (eSumAlpha_) *eSumAlpha_ = 0.0;
420 if (eOneMinusExpSumAlpha_) *eOneMinusExpSumAlpha_ = 0.0;
421
422 for (size_t w = 1; w < static_cast <size_t> (ITER); w++) {
423
424 if (w < endW_) { // sum pAlphaW_ [w] and eOneMinusExpSumAlphaW_ [w]
425
426 if (pAlphaW_) pAlphaW_ [w] = 0.0;
427 if (eOneMinusExpSumAlphaW_) eOneMinusExpSumAlphaW_ [w] = 0.0;
428
429 for (value = score_ [0]; value <= entry; value++) {
430 if (pAlphaW_) pAlphaW_ [w] += dynProgProb.getProb (value);
431 if (eOneMinusExpSumAlphaW_) eOneMinusExpSumAlphaW_ [w] +=
432 dynProgProb.getProb (value) *
433 (1.0 - exp (lambda_ * static_cast <double> (value)));
434 }
435 }
436
437 for (value = score_ [0]; value <= entry; value++) {
438 if (eSumAlpha_) *eSumAlpha_ += dynProgProb.getProb (value) * static_cast <double> (value);
439 if (eOneMinusExpSumAlpha_) *eOneMinusExpSumAlpha_ += dynProgProb.getProb (value) *
440 (1.0 - exp (lambda_ * static_cast <double> (value)));
441 }
442
443 dynProgProb.setValueFct (n_bury);
444 dynProgProb.update (); // put probability into the morgue
445
446 dynProgProb.setValueFct (n_step);
447 dynProgProb.update (); // iterate random walk
448
449 if (time_ > 0.0)
450 {
451 Sls::sls_basic::get_current_time (time1);
452 if (time1 - time0 > time_)
453 {
454 *terminated_ = true;
455 return;
456 }
457 }
458
459 }
460
461 for (value = score_ [0]; value <= entry; value++) {
462 if (eSumAlpha_) *eSumAlpha_ += dynProgProb.getProb (value) * static_cast <double> (value);
463 if (eOneMinusExpSumAlpha_) *eOneMinusExpSumAlpha_ += dynProgProb.getProb (value) *
464 (1.0 - exp (lambda_ * static_cast <double> (value)));
465 }
466
467 // check that not too much probability has been omitted
468 double prob = 0.0;
469 for (value = entry + 1; value < dynProgProb.getValueUpper (); value++) {
470 prob += dynProgProb.getProb (value);
471 }
472 prob += dynProgProb.getProbLost ();
473
474 const double FUDGE = 100.0;
475 assert (prob <= FUDGE * static_cast <double> (dimension_) * REL_TOL);
476 }
477
descendingLadderEpoch(size_t dimension_,const long int * score_,const double * prob_,double * eSumAlpha_,double * eOneMinusExpSumAlpha_,bool isStrict_,double lambda0_,double mu0_,double muAssoc0_,double thetaMin0_,double rMin0_,double time_,bool * terminated_)478 void LocalMaxStatUtil::descendingLadderEpoch (
479 size_t dimension_, // #(distinct values)
480 const long int *score_, // values
481 const double *prob_, // probability of corresponding value
482 double *eSumAlpha_, // expectation (sum [alpha])
483 double *eOneMinusExpSumAlpha_, // expectation [1.0 - exp (sum [alpha])]
484 bool isStrict_, // ? is this a strict descending ladder epoch
485 double lambda0_, // lambda for flattened distribution (avoid recomputation)
486 double mu0_, // mean of flattened distribution (avoid recomputation)
487 double muAssoc0_, // mean of associated flattened distribution (avoid recomputation)
488 double thetaMin0_, // thetaMin of flattened distribution (avoid recomputation)
489 double rMin0_, // rMin of flattened distribution (avoid recomputation)
490 double time_, // get time for the dynamic programming computation
491 bool *terminated_) // ? Was the dynamic programming computation terminated prematurely ?
492 {
493 descendingLadderEpochRepeat (dimension_, score_, prob_,
494 eSumAlpha_, eOneMinusExpSumAlpha_, isStrict_, 0.0, 0, 0, 0,
495 lambda0_, mu0_, muAssoc0_, thetaMin0_, rMin0_, time_, terminated_);
496 }
497
isProbDist(size_t dimension_,const double * prob_)498 bool LocalMaxStatUtil::isProbDist (
499 size_t dimension_, // #(distinct values) of scores & probabilities (which are paired)
500 const double *prob_) // corresponding probabilities
501 {
502 double sum = 0.0;
503 for (size_t i = 0; i < dimension_; i++) {
504 if (prob_ [i] < 0.0 || 1.0 < prob_ [i]) return false;
505 sum += prob_ [i];
506 }
507 return Approx::relApprox (sum, 1.0, REL_TOL);
508 }
509
isScoreIncreasing(size_t dimension_,const long int * score_)510 bool LocalMaxStatUtil::isScoreIncreasing (
511 size_t dimension_, // #(distinct values)
512 const long int *score_) // scores in increasing order
513 {
514 for (size_t i = 1; i < dimension_; i++) {
515 if (score_ [i] <= score_ [i - 1]) return false;
516 }
517 return true;
518 }
519
isLogarithmic(size_t dimension_,const long int * score_,const double * prob_)520 bool LocalMaxStatUtil::isLogarithmic (
521 size_t dimension_, // #(distinct values) of scores & probabilities (which are paired)
522 const long int *score_, // scores in increasing order
523 const double *prob_) // corresponding probabilities
524 {
525 assert (score_);
526 assert (prob_);
527 if (! isScoreIncreasing (dimension_, score_)) return false;
528 if (! isProbDist (dimension_, prob_)) return false;
529 if (0.0 <= mu (dimension_, score_, prob_)) return false;
530 if (score_ [dimension_ - 1] <= 0.0) return false;
531 return true;
532 }
533
534