1 /* $Id: gumbel_params.cpp 191513 2010-05-13 14:40:33Z boratyng $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's offical duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================*/
25
26 /*****************************************************************************
27
28 File name: gumbel_params.cpp
29
30 Author: Greg Boratyn, Sergey Sheetlin
31
32 Contents: Implementation of Gumbel parameters computation wrapper
33
34 ******************************************************************************/
35
36 #include <ncbi_pch.hpp>
37
38 #include "sls_alp.hpp"
39 #include "sls_alp_data.hpp"
40 #include "sls_alp_regression.hpp"
41 #include "sls_alp_sim.hpp"
42 #include "sls_pvalues.hpp"
43 #include "njn_localmaxstatmatrix.hpp"
44 #include "njn_localmaxstatutil.hpp"
45
46 #include <algo/blast/gumbel_params/gumbel_params.hpp>
47
48
49 USING_NCBI_SCOPE;
50 USING_SCOPE(blast);
51
52 ///////////////////////////////////////////////////////////////////////////////
53 //CGumbelParamsOptions
54
CGumbelParamsOptions(void)55 CGumbelParamsOptions::CGumbelParamsOptions(void)
56 {
57 x_Init();
58 }
59
CGumbelParamsOptions(const CGeneralScoreMatrix * matrix,const vector<double> & seq1_probs,const vector<double> & seq2_probs)60 CGumbelParamsOptions::CGumbelParamsOptions(
61 const CGeneralScoreMatrix* matrix,
62 const vector<double>& seq1_probs,
63 const vector<double>& seq2_probs)
64 {
65 x_Init();
66 m_ScoreMatrix.Reset(matrix);
67 m_NumResidues = (*m_ScoreMatrix).GetNumResidues();
68 m_Seq1ResidueProbs = seq1_probs;
69 m_Seq2ResidueProbs = seq2_probs;
70 }
71
72
CGumbelParamsOptions(const CGumbelParamsOptions & options)73 CGumbelParamsOptions::CGumbelParamsOptions(
74 const CGumbelParamsOptions& options)
75 : m_GapOpening(options.m_GapOpening),
76 m_GapExtension(options.m_GapExtension),
77 m_LambdaAccuracy(options.m_LambdaAccuracy),
78 m_KAccuracy(options.m_KAccuracy),
79 m_ScoreMatrix(options.m_ScoreMatrix),
80 m_Seq1ResidueProbs(options.m_Seq1ResidueProbs),
81 m_Seq2ResidueProbs(options.m_Seq2ResidueProbs),
82 m_NumResidues(options.m_NumResidues),
83 m_MaxCalcTime(options.m_MaxCalcTime),
84 m_MaxCalcMemory(options.m_MaxCalcMemory)
85 {}
86
87
Validate(void)88 bool CGumbelParamsOptions::Validate(void)
89 {
90 const Int4 kMaxScore = 1000;
91
92 if (m_Seq1ResidueProbs.size() != m_Seq2ResidueProbs.size()) {
93 NCBI_THROW(CGumbelParamsException, eInvalidOptions,
94 "Length of residue probabilities for both sequences do "
95 "not match");
96 }
97
98 if ((unsigned)m_NumResidues != m_Seq1ResidueProbs.size()) {
99 NCBI_THROW(CGumbelParamsException, eInvalidOptions,
100 "Length of residue probabilities does not match number "
101 "of residues");
102 }
103
104 double sum_probs_seq1 = 0.0;
105 ITERATE(vector<double>, it, m_Seq1ResidueProbs) {
106 if (*it < 0.0) {
107 NCBI_THROW(CGumbelParamsException, eInvalidOptions,
108 "Negative probability value for sequence 1");
109 }
110 sum_probs_seq1 += *it;
111 }
112 if (fabs(1.0 - sum_probs_seq1) > 1e-10) {
113 NCBI_THROW(CGumbelParamsException, eInvalidOptions,
114 "The sum of probability values does not equal 1");
115 }
116
117 double sum_probs_seq2 = 0.0;
118 ITERATE(vector<double>, it, m_Seq2ResidueProbs) {
119 if (*it < 0.0) {
120 NCBI_THROW(CGumbelParamsException, eInvalidOptions,
121 "Negative probability value for sequence 2");
122 }
123 sum_probs_seq2 += *it;
124 }
125 if (fabs(1.0 - sum_probs_seq2) > 1e-10) {
126 NCBI_THROW(CGumbelParamsException, eInvalidOptions,
127 "The sum of probability values does not equal 1");
128 }
129
130 if (m_GapOpening < 0) {
131 NCBI_THROW(CGumbelParamsException, eInvalidOptions,
132 "Negative gap opening penalty");
133 }
134
135 if (m_GapExtension <= 0) {
136 NCBI_THROW(CGumbelParamsException, eInvalidOptions,
137 "Negative gap extention penalty");
138 }
139
140 if (m_LambdaAccuracy <= 0.0) {
141 NCBI_THROW(CGumbelParamsException, eInvalidOptions,
142 "Non-positive accuracy for lambda");
143 }
144
145 if (m_KAccuracy <= 0.0) {
146 NCBI_THROW(CGumbelParamsException, eInvalidOptions,
147 "Non-positive accuracy for K");
148 }
149
150 if (m_MaxCalcTime <= 0.0) {
151 NCBI_THROW(CGumbelParamsException, eInvalidOptions,
152 "Non-positive max calc time");
153 }
154
155 if (m_MaxCalcMemory <= 0.0) {
156 NCBI_THROW(CGumbelParamsException, eInvalidOptions,
157 "Non-positive maximum memory");
158 }
159
160 if(abs(m_GapOpening) > kMaxScore || abs(m_GapExtension) > kMaxScore) {
161 NCBI_THROW(CGumbelParamsException, eInvalidOptions,
162 "Gap opening or extension too large");
163 }
164
165 for(Uint4 i=0;i < m_ScoreMatrix->GetNumResidues();i++) {
166 for(Uint4 j=0;j < m_ScoreMatrix->GetNumResidues();j++) {
167 if(abs(m_ScoreMatrix->GetScore(i, j)) > kMaxScore) {
168 NCBI_THROW(CGumbelParamsException, eInvalidOptions,
169 "Score matrix entry too large");
170 }
171 }
172 }
173
174 // Warnings
175 bool result = true;
176
177 if (m_LambdaAccuracy < 0.001 || m_LambdaAccuracy > 0.01) {
178 m_Messages.push_back("The algorithm is designed for lambda accuracy"
179 " between 0.001 -- 0.01. Values outside this range"
180 " may cause increased bias or variance of"
181 " estimated parameter lambda");
182 result = false;
183 }
184
185 if (m_KAccuracy < 0.005 || m_KAccuracy > 0.05) {
186 m_Messages.push_back("The algorithm is designed for K accuracy"
187 " between 0.005 -- 0.05. Values outside this range"
188 " may cause increased bias or variance of"
189 " estimated parameter K");
190 result = false;
191
192 }
193
194 if (m_MaxCalcTime < 1.0) {
195 m_Messages.push_back("The maximum calculation time may be too small"
196 " and lead to inaccurate results. The suggested"
197 " maximum calculation time is at least 1s.");
198 result = false;
199 }
200
201 if (m_MaxCalcMemory < 1.0) {
202 m_Messages.push_back("The maximum allowed memory may be too small"
203 " and lead to inaccurate results. The suggsets"
204 " maximum allowed memory is at least 1Mb");
205 result = false;
206 }
207
208 return result;
209 }
210
211
SetScoreMatrix(const CGeneralScoreMatrix & matrix)212 void CGumbelParamsOptions::SetScoreMatrix(const CGeneralScoreMatrix& matrix)
213 {
214 m_ScoreMatrix.Reset(new CGeneralScoreMatrix(matrix));
215 }
216
SetScoreMatrix(const CRef<CGeneralScoreMatrix> & matrix)217 void CGumbelParamsOptions::SetScoreMatrix(
218 const CRef<CGeneralScoreMatrix>& matrix)
219 {
220 m_ScoreMatrix.Reset(matrix.GetNonNullPointer());
221 m_NumResidues = (*m_ScoreMatrix).GetNumResidues();
222 }
223
x_Init(void)224 void CGumbelParamsOptions::x_Init(void)
225 {
226 m_GapOpening = 0;
227 m_GapExtension = 0;
228 m_LambdaAccuracy = 0;
229 m_KAccuracy = 0;
230 m_IsGapped = true;
231
232 m_NumResidues = 0;
233
234 m_MaxCalcTime = 1.0;
235 m_MaxCalcMemory = 1000.0;
236 }
237
238 ///////////////////////////////////////////////////////////////////////////////
239 //CGumbelParamsOptionsFactory
240
241 CRef<CGumbelParamsOptions>
CreateStandard20AAOptions(CGeneralScoreMatrix::EScoreMatrixName smat)242 CGumbelParamsOptionsFactory::CreateStandard20AAOptions(
243 CGeneralScoreMatrix::EScoreMatrixName smat)
244 {
245 const size_t kNumResidues = 20;
246 CRef<CGeneralScoreMatrix>
247 smatrix(new CGeneralScoreMatrix(smat, kNumResidues));
248
249 CGumbelParamsOptions::TFrequencies probs(kNumResidues);
250
251 probs[ 0 ] = 0.07805 ;
252 probs[ 1 ] = 0.05129 ;
253 probs[ 2 ] = 0.04487 ;
254 probs[ 3 ] = 0.05364 ;
255 probs[ 4 ] = 0.01925 ;
256 probs[ 5 ] = 0.04264 ;
257 probs[ 6 ] = 0.06295 ;
258 probs[ 7 ] = 0.07377 ;
259 probs[ 8 ] = 0.02199 ;
260 probs[ 9 ] = 0.05142 ;
261 probs[ 10 ] = 0.09019 ;
262 probs[ 11 ] = 0.05744 ;
263 probs[ 12 ] = 0.02243 ;
264 probs[ 13 ] = 0.03856 ;
265 probs[ 14 ] = 0.05203 ;
266 probs[ 15 ] = 0.0712 ;
267 probs[ 16 ] = 0.05841 ;
268 probs[ 17 ] = 0.0133 ;
269 probs[ 18 ] = 0.03216 ;
270 probs[ 19 ] = 0.06441 ;
271
272 CGumbelParamsOptions* options = new CGumbelParamsOptions();
273 options->SetScoreMatrix(smatrix);
274 options->SetGapOpening(11);
275 options->SetGapExtension(1);
276 options->SetLambdaAccuracy(0.001);
277 options->SetKAccuracy(0.005);
278 options->SetGapped(true);//gapped calculation
279 options->SetMaxCalcTime(1.0);
280 options->SetMaxCalcMemory(1000.0);
281 options->SetSeq1ResidueProbs(probs);
282 options->SetSeq2ResidueProbs(probs);
283
284 return CRef<CGumbelParamsOptions>(options);
285 }
286
287 CRef<CGumbelParamsOptions>
CreateBasicOptions(void)288 CGumbelParamsOptionsFactory::CreateBasicOptions(void)
289 {
290 CGumbelParamsOptions* options = new CGumbelParamsOptions();
291 options->SetGapOpening(11);
292 options->SetGapExtension(1);
293 options->SetLambdaAccuracy(0.001);
294 options->SetKAccuracy(0.05);
295 options->SetGapped(true);
296 options->SetMaxCalcTime(1.0);
297 options->SetMaxCalcMemory(1000.0);
298
299 return CRef<CGumbelParamsOptions>(options);
300 }
301
302 ///////////////////////////////////////////////////////////////////////////////
303 //CGumbelParamsCalc
304
CGumbelParamsCalc(const CRef<CGumbelParamsOptions> & options)305 CGumbelParamsCalc::CGumbelParamsCalc(const CRef<CGumbelParamsOptions>& options)
306 : m_Options(options)
307 {}
308
CGumbelParamsCalc(const CRef<CGumbelParamsOptions> & options,const CRef<CGumbelParamsRandDiagnostics> & rand)309 CGumbelParamsCalc::CGumbelParamsCalc(const CRef<CGumbelParamsOptions>& options,
310 const CRef<CGumbelParamsRandDiagnostics>& rand)
311 : m_Options(options), m_RandParams(rand)
312 {}
313
314
315 template <typename T>
s_CopyVector(const vector<T> & in,vector<T> & out)316 static void s_CopyVector(const vector<T>& in, vector<T>& out)
317 {
318 out.resize(in.size());
319 copy(in.begin(), in.end(), out.begin());
320 }
321
322
Run(void)323 CRef<CGumbelParamsResult> CGumbelParamsCalc::Run(void)
324 {
325 CGumbelParamsResult* result;
326 Int4 number_of_aa = -1;
327
328 try {
329
330 double current_time1;
331 double current_time2;
332 Sls::alp_data::get_current_time(current_time1);
333
334 number_of_aa = m_Options->GetNumResidues();
335 _ASSERT(number_of_aa > 0);
336
337 result = new CGumbelParamsResult();
338 SGumbelParams& gumbel_params = result->SetGumbelParams();
339 CGumbelParamsResult::SSbsArrays& sbs_arrays = result->SetSbsArrays();
340
341 //Gapless parameters calculation
342
343 double gapless_time_portion = 0.5;
344
345 const Int4** scoring_matrix = m_Options->GetScoreMatrix();
346
347 const double* background_probabilities1
348 = &m_Options->GetSeq1ResidueProbs()[0];
349
350 const double* background_probabilities2
351 = &m_Options->GetSeq2ResidueProbs()[0];
352
353 double gapless_calc_time = m_Options->GetMaxCalcTime();
354
355 if(m_Options->GetGapped()) {
356 // Gapless calculation may take only a portion of maximum allowed
357 // calculation time in the case of gapped calculation
358 gapless_calc_time *= gapless_time_portion;
359 }
360
361 Njn::LocalMaxStatMatrix local_max_stat_matrix(number_of_aa,
362 scoring_matrix,
363 background_probabilities1,
364 background_probabilities2,
365 number_of_aa,
366 gapless_calc_time);
367
368 if(local_max_stat_matrix.getTerminated()) {
369 throw Sls::error("Please increase maximum allowed calculation time.",
370 1);
371 }
372
373 //calculation of a and alpha
374 double calculation_error = 1e-6;
375
376 gumbel_params.gapless_alpha = local_max_stat_matrix.getAlpha();
377 gumbel_params.gapless_alpha_error = calculation_error;
378
379 gumbel_params.gapless_a = local_max_stat_matrix.getA();
380 gumbel_params.gapless_a_error = calculation_error;
381
382 if(!m_Options->GetGapped()) {
383
384 //calculation of all required parameters for a gapless case
385 gumbel_params.G=0;
386
387 gumbel_params.lambda = local_max_stat_matrix.getLambda ();
388 gumbel_params.lambda_error = calculation_error;
389
390 gumbel_params.K = local_max_stat_matrix.getK ();
391 gumbel_params.K_error = calculation_error;
392
393 gumbel_params.C = local_max_stat_matrix.getC ();;
394 gumbel_params.C_error = calculation_error;
395
396 gumbel_params.sigma = gumbel_params.gapless_alpha;
397 gumbel_params.sigma_error = calculation_error;
398
399 gumbel_params.alpha_i = gumbel_params.gapless_alpha;
400 gumbel_params.alpha_i_error = calculation_error;
401
402 gumbel_params.alpha_j = gumbel_params.gapless_alpha;
403 gumbel_params.alpha_j_error = calculation_error;
404
405 gumbel_params.ai = gumbel_params.gapless_a;
406 gumbel_params.ai_error = calculation_error;
407
408 gumbel_params.aj = gumbel_params.gapless_a;
409 gumbel_params.aj_error = calculation_error;
410
411 Sls::alp_data::get_current_time(current_time2);
412 result->SetCalcTime(current_time2 - current_time1);
413
414 sbs_arrays.lambda_sbs.resize(2);
415 sbs_arrays.lambda_sbs[0]=gumbel_params.lambda;
416 sbs_arrays.lambda_sbs[1]=gumbel_params.lambda + calculation_error;
417
418 sbs_arrays.K_sbs.resize(2);
419 sbs_arrays.K_sbs[0]=gumbel_params.K;
420 sbs_arrays.K_sbs[1]=gumbel_params.K+calculation_error;
421
422 sbs_arrays.C_sbs.resize(2);
423 sbs_arrays.C_sbs[0]=gumbel_params.C;
424 sbs_arrays.C_sbs[1]=gumbel_params.C+calculation_error;
425
426 sbs_arrays.sigma_sbs.resize(2);
427 sbs_arrays.sigma_sbs[0]=gumbel_params.sigma;
428 sbs_arrays.sigma_sbs[1]=gumbel_params.sigma + calculation_error;
429
430 sbs_arrays.alpha_i_sbs.resize(2);
431 sbs_arrays.alpha_i_sbs[0]=gumbel_params.alpha_i;
432 sbs_arrays.alpha_i_sbs[1]=gumbel_params.alpha_i + calculation_error;
433
434 sbs_arrays.alpha_j_sbs.resize(2);
435 sbs_arrays.alpha_j_sbs[0]=gumbel_params.alpha_j;
436 sbs_arrays.alpha_j_sbs[1]=gumbel_params.alpha_j + calculation_error;
437
438 sbs_arrays.ai_sbs.resize(2);
439 sbs_arrays.ai_sbs[0]=gumbel_params.ai;
440 sbs_arrays.ai_sbs[1]=gumbel_params.ai + calculation_error;
441
442 sbs_arrays.aj_sbs.resize(2);
443 sbs_arrays.aj_sbs[0]=gumbel_params.aj;
444 sbs_arrays.aj_sbs[1]=gumbel_params.aj + calculation_error;
445
446 // New TRandParams object is created only if m_Options does
447 // not contain one already.
448 if (m_RandParams.Empty()) {
449
450 m_RandParams.Reset(new CGumbelParamsRandDiagnostics());
451 m_RandParams->SetRandomSeed(0);
452 m_RandParams->SetTotalReNumber(0);
453 m_RandParams->SetTotalReNumberKilling(0);
454 }
455 }
456 else {
457 double current_time_gapless_preliminary;
458 Sls::alp_data::get_current_time(current_time_gapless_preliminary);
459 double gapless_preliminary_time =
460 current_time_gapless_preliminary - current_time1;
461
462 // TODO: There should be constructors for two cases with and without
463 // rand params
464 Sls::alp_data data_obj(m_Options, m_RandParams);
465 data_obj.d_max_time =
466 Sls::alp_data::Tmax((1.0 - gapless_time_portion)
467 * data_obj.d_max_time,data_obj.d_max_time
468 - gapless_preliminary_time);
469
470 Sls::alp_sim sim_obj(&data_obj);
471 gumbel_params.G = m_Options->GetGapOpening()
472 + m_Options->GetGapExtension();
473
474 gumbel_params.lambda = sim_obj.m_Lambda;
475 gumbel_params.lambda_error = sim_obj.m_LambdaError;
476
477 gumbel_params.K = sim_obj.m_K;
478 gumbel_params.K_error = sim_obj.m_KError;
479
480 gumbel_params.C = sim_obj.m_C;
481 gumbel_params.C_error = sim_obj.m_CError;
482
483 gumbel_params.sigma = sim_obj.m_Sigma;
484 gumbel_params.sigma_error = sim_obj.m_SigmaError;
485
486 gumbel_params.alpha_i = sim_obj.m_AlphaI;
487 gumbel_params.alpha_i_error = sim_obj.m_AlphaIError;
488
489 gumbel_params.alpha_j = sim_obj.m_AlphaJ;
490 gumbel_params.alpha_j_error = sim_obj.m_AlphaJError;
491
492 gumbel_params.ai = sim_obj.m_AI;
493 gumbel_params.ai_error = sim_obj.m_AIError;
494
495 gumbel_params.aj = sim_obj.m_AJ;
496 gumbel_params.aj_error = sim_obj.m_AJError;
497
498 Sls::alp_data::get_current_time(current_time2);
499 result->SetCalcTime(current_time2 - current_time1);
500
501 s_CopyVector(sim_obj.m_LambdaSbs, sbs_arrays.lambda_sbs);
502 s_CopyVector(sim_obj.m_KSbs, sbs_arrays.K_sbs);
503 s_CopyVector(sim_obj.m_CSbs, sbs_arrays.C_sbs);
504 s_CopyVector(sim_obj.m_SigmaSbs, sbs_arrays.sigma_sbs);
505 s_CopyVector(sim_obj.m_AlphaISbs, sbs_arrays.alpha_i_sbs);
506 s_CopyVector(sim_obj.m_AlphaJSbs, sbs_arrays.alpha_j_sbs);
507 s_CopyVector(sim_obj.m_AISbs, sbs_arrays.ai_sbs);
508 s_CopyVector(sim_obj.m_AJSbs, sbs_arrays.aj_sbs);
509
510
511 // New TRandParams object is created only if m_Options does
512 // not contain one already.
513 if (m_RandParams.Empty()) {
514
515 m_RandParams.Reset(new CGumbelParamsRandDiagnostics());
516 m_RandParams->SetRandomSeed(
517 sim_obj.d_alp_data->d_rand_all->d_random_factor);
518
519 vector<Int4>& fsprelim_numbers
520 = m_RandParams->SetFirstStagePrelimReNumbers();
521
522 s_CopyVector(sim_obj.d_alp_data->d_rand_all
523 ->d_first_stage_preliminary_realizations_numbers_ALP,
524 fsprelim_numbers);
525
526 vector<Int4>& prelim_numbers
527 = m_RandParams->SetPrelimReNumbers();
528
529 s_CopyVector(sim_obj.d_alp_data->d_rand_all
530 ->d_preliminary_realizations_numbers_ALP,
531 prelim_numbers);
532
533 vector<Int4>& prelim_numbers_killing
534 = m_RandParams->SetPrelimReNumbersKilling();
535
536 s_CopyVector(sim_obj.d_alp_data->d_rand_all
537 ->d_preliminary_realizations_numbers_killing,
538 prelim_numbers_killing);
539
540 m_RandParams->SetTotalReNumber(sim_obj.d_alp_data->d_rand_all
541 ->d_total_realizations_number_with_ALP);
542
543 m_RandParams->SetTotalReNumberKilling(
544 sim_obj.d_alp_data->d_rand_all
545 ->d_total_realizations_number_with_killing);
546 }
547 }
548 }
549 catch (Sls::error er) {
550
551 switch(er.error_code) {
552 case 1: NCBI_THROW(CGumbelParamsException, eLimitsExceeded,
553 (string)"Time or memory limit exceeded.");
554
555 case 2: NCBI_THROW(CGumbelParamsException, eRepeatCalc,
556 (string)"The program could not estimate the "
557 "parameters. Please repeat calculation");
558
559 case 3: NCBI_THROW(CGumbelParamsException, eParamsError,
560 (string)"Results cannot be computed for the "
561 "provided parameters. " + er.st);
562
563 case 41: NCBI_THROW(CGumbelParamsException, eMemAllocError,
564 (string)"Memory allocation error");
565
566 case 4:
567 default: NCBI_THROW(CGumbelParamsException, eUnexpectedError,
568 (string)"Unexpected error");
569 }
570 }
571
572 m_Result.Reset(result);
573 return m_Result;
574 }
575
576
577