1 /* $Id: gumbel_params.cpp 191513 2010-05-13 14:40:33Z boratyng $
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's offical duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================*/
25 
26 /*****************************************************************************
27 
28 File name: gumbel_params.cpp
29 
30 Author: Greg Boratyn, Sergey Sheetlin
31 
32 Contents: Implementation of Gumbel parameters computation wrapper
33 
34 ******************************************************************************/
35 
36 #include <ncbi_pch.hpp>
37 
38 #include "sls_alp.hpp"
39 #include "sls_alp_data.hpp"
40 #include "sls_alp_regression.hpp"
41 #include "sls_alp_sim.hpp"
42 #include "sls_pvalues.hpp"
43 #include "njn_localmaxstatmatrix.hpp"
44 #include "njn_localmaxstatutil.hpp"
45 
46 #include <algo/blast/gumbel_params/gumbel_params.hpp>
47 
48 
49 USING_NCBI_SCOPE;
50 USING_SCOPE(blast);
51 
52 ///////////////////////////////////////////////////////////////////////////////
53 //CGumbelParamsOptions
54 
CGumbelParamsOptions(void)55 CGumbelParamsOptions::CGumbelParamsOptions(void)
56 {
57     x_Init();
58 }
59 
CGumbelParamsOptions(const CGeneralScoreMatrix * matrix,const vector<double> & seq1_probs,const vector<double> & seq2_probs)60 CGumbelParamsOptions::CGumbelParamsOptions(
61                                    const CGeneralScoreMatrix* matrix,
62                                    const vector<double>& seq1_probs,
63                                    const vector<double>& seq2_probs)
64 {
65     x_Init();
66     m_ScoreMatrix.Reset(matrix);
67     m_NumResidues = (*m_ScoreMatrix).GetNumResidues();
68     m_Seq1ResidueProbs = seq1_probs;
69     m_Seq2ResidueProbs = seq2_probs;
70 }
71 
72 
CGumbelParamsOptions(const CGumbelParamsOptions & options)73 CGumbelParamsOptions::CGumbelParamsOptions(
74                                        const CGumbelParamsOptions& options)
75     : m_GapOpening(options.m_GapOpening),
76       m_GapExtension(options.m_GapExtension),
77       m_LambdaAccuracy(options.m_LambdaAccuracy),
78       m_KAccuracy(options.m_KAccuracy),
79       m_ScoreMatrix(options.m_ScoreMatrix),
80       m_Seq1ResidueProbs(options.m_Seq1ResidueProbs),
81       m_Seq2ResidueProbs(options.m_Seq2ResidueProbs),
82       m_NumResidues(options.m_NumResidues),
83       m_MaxCalcTime(options.m_MaxCalcTime),
84       m_MaxCalcMemory(options.m_MaxCalcMemory)
85 {}
86 
87 
Validate(void)88 bool CGumbelParamsOptions::Validate(void)
89 {
90     const Int4 kMaxScore = 1000;
91 
92     if (m_Seq1ResidueProbs.size() != m_Seq2ResidueProbs.size()) {
93         NCBI_THROW(CGumbelParamsException, eInvalidOptions,
94                    "Length of residue probabilities for both sequences do "
95                    "not match");
96     }
97 
98     if ((unsigned)m_NumResidues != m_Seq1ResidueProbs.size()) {
99         NCBI_THROW(CGumbelParamsException, eInvalidOptions,
100                    "Length of residue probabilities does not match number "
101                    "of residues");
102     }
103 
104     double sum_probs_seq1 = 0.0;
105     ITERATE(vector<double>, it, m_Seq1ResidueProbs) {
106         if (*it < 0.0) {
107             NCBI_THROW(CGumbelParamsException, eInvalidOptions,
108                        "Negative probability value for sequence 1");
109         }
110         sum_probs_seq1 += *it;
111     }
112     if (fabs(1.0 - sum_probs_seq1) > 1e-10) {
113         NCBI_THROW(CGumbelParamsException, eInvalidOptions,
114                    "The sum of probability values does not equal 1");
115     }
116 
117     double sum_probs_seq2 = 0.0;
118     ITERATE(vector<double>, it, m_Seq2ResidueProbs) {
119         if (*it < 0.0) {
120             NCBI_THROW(CGumbelParamsException, eInvalidOptions,
121                        "Negative probability value for sequence 2");
122         }
123         sum_probs_seq2 += *it;
124     }
125     if (fabs(1.0 - sum_probs_seq2) > 1e-10) {
126         NCBI_THROW(CGumbelParamsException, eInvalidOptions,
127                    "The sum of probability values does not equal 1");
128     }
129 
130     if (m_GapOpening < 0) {
131         NCBI_THROW(CGumbelParamsException, eInvalidOptions,
132                    "Negative gap opening penalty");
133     }
134 
135     if (m_GapExtension <= 0) {
136         NCBI_THROW(CGumbelParamsException, eInvalidOptions,
137                    "Negative gap extention penalty");
138     }
139 
140     if (m_LambdaAccuracy <= 0.0) {
141         NCBI_THROW(CGumbelParamsException, eInvalidOptions,
142                    "Non-positive accuracy for lambda");
143     }
144 
145     if (m_KAccuracy <= 0.0) {
146         NCBI_THROW(CGumbelParamsException, eInvalidOptions,
147                    "Non-positive accuracy for K");
148     }
149 
150     if (m_MaxCalcTime <= 0.0) {
151         NCBI_THROW(CGumbelParamsException, eInvalidOptions,
152                    "Non-positive max calc time");
153     }
154 
155     if (m_MaxCalcMemory <= 0.0) {
156         NCBI_THROW(CGumbelParamsException, eInvalidOptions,
157                    "Non-positive maximum memory");
158     }
159 
160     if(abs(m_GapOpening) > kMaxScore || abs(m_GapExtension) > kMaxScore) {
161         NCBI_THROW(CGumbelParamsException, eInvalidOptions,
162                    "Gap opening or extension too large");
163     }
164 
165     for(Uint4 i=0;i < m_ScoreMatrix->GetNumResidues();i++) {
166         for(Uint4 j=0;j < m_ScoreMatrix->GetNumResidues();j++) {
167             if(abs(m_ScoreMatrix->GetScore(i, j)) > kMaxScore) {
168                 NCBI_THROW(CGumbelParamsException, eInvalidOptions,
169                            "Score matrix entry too large");
170             }
171         }
172     }
173 
174     // Warnings
175     bool result = true;
176 
177     if (m_LambdaAccuracy < 0.001 || m_LambdaAccuracy > 0.01) {
178         m_Messages.push_back("The algorithm is designed for lambda accuracy"
179                             " between 0.001 -- 0.01. Values outside this range"
180                              " may cause increased bias or variance of"
181                              " estimated parameter lambda");
182         result = false;
183     }
184 
185     if (m_KAccuracy < 0.005 || m_KAccuracy > 0.05) {
186         m_Messages.push_back("The algorithm is designed for K accuracy"
187                             " between 0.005 -- 0.05. Values outside this range"
188                              " may cause increased bias or variance of"
189                              " estimated parameter K");
190         result = false;
191 
192     }
193 
194     if (m_MaxCalcTime < 1.0) {
195         m_Messages.push_back("The maximum calculation time may be too small"
196                              " and lead to inaccurate results. The suggested"
197                              " maximum calculation time is at least 1s.");
198         result = false;
199     }
200 
201     if (m_MaxCalcMemory < 1.0) {
202         m_Messages.push_back("The maximum allowed memory may be too small"
203                              " and lead to inaccurate results. The suggsets"
204                              " maximum allowed memory is at least 1Mb");
205         result = false;
206     }
207 
208     return result;
209 }
210 
211 
SetScoreMatrix(const CGeneralScoreMatrix & matrix)212 void CGumbelParamsOptions::SetScoreMatrix(const CGeneralScoreMatrix& matrix)
213 {
214     m_ScoreMatrix.Reset(new CGeneralScoreMatrix(matrix));
215 }
216 
SetScoreMatrix(const CRef<CGeneralScoreMatrix> & matrix)217 void CGumbelParamsOptions::SetScoreMatrix(
218                                           const CRef<CGeneralScoreMatrix>& matrix)
219 {
220     m_ScoreMatrix.Reset(matrix.GetNonNullPointer());
221     m_NumResidues = (*m_ScoreMatrix).GetNumResidues();
222 }
223 
x_Init(void)224 void CGumbelParamsOptions::x_Init(void)
225 {
226     m_GapOpening = 0;
227     m_GapExtension = 0;
228     m_LambdaAccuracy = 0;
229     m_KAccuracy = 0;
230     m_IsGapped = true;
231 
232     m_NumResidues = 0;
233 
234     m_MaxCalcTime = 1.0;
235     m_MaxCalcMemory = 1000.0;
236 }
237 
238 ///////////////////////////////////////////////////////////////////////////////
239 //CGumbelParamsOptionsFactory
240 
241 CRef<CGumbelParamsOptions>
CreateStandard20AAOptions(CGeneralScoreMatrix::EScoreMatrixName smat)242 CGumbelParamsOptionsFactory::CreateStandard20AAOptions(
243                                     CGeneralScoreMatrix::EScoreMatrixName smat)
244 {
245     const size_t kNumResidues = 20;
246     CRef<CGeneralScoreMatrix>
247         smatrix(new CGeneralScoreMatrix(smat, kNumResidues));
248 
249     CGumbelParamsOptions::TFrequencies probs(kNumResidues);
250 
251     probs[ 0 ] = 0.07805 ;
252     probs[ 1 ] = 0.05129 ;
253     probs[ 2 ] = 0.04487 ;
254     probs[ 3 ] = 0.05364 ;
255     probs[ 4 ] = 0.01925 ;
256     probs[ 5 ] = 0.04264 ;
257     probs[ 6 ] = 0.06295 ;
258     probs[ 7 ] = 0.07377 ;
259     probs[ 8 ] = 0.02199 ;
260     probs[ 9 ] = 0.05142 ;
261     probs[ 10 ] = 0.09019 ;
262     probs[ 11 ] = 0.05744 ;
263     probs[ 12 ] = 0.02243 ;
264     probs[ 13 ] = 0.03856 ;
265     probs[ 14 ] = 0.05203 ;
266     probs[ 15 ] = 0.0712 ;
267     probs[ 16 ] = 0.05841 ;
268     probs[ 17 ] = 0.0133 ;
269     probs[ 18 ] = 0.03216 ;
270     probs[ 19 ] = 0.06441 ;
271 
272     CGumbelParamsOptions* options = new CGumbelParamsOptions();
273     options->SetScoreMatrix(smatrix);
274     options->SetGapOpening(11);
275     options->SetGapExtension(1);
276     options->SetLambdaAccuracy(0.001);
277     options->SetKAccuracy(0.005);
278     options->SetGapped(true);//gapped calculation
279     options->SetMaxCalcTime(1.0);
280     options->SetMaxCalcMemory(1000.0);
281     options->SetSeq1ResidueProbs(probs);
282     options->SetSeq2ResidueProbs(probs);
283 
284     return CRef<CGumbelParamsOptions>(options);
285 }
286 
287 CRef<CGumbelParamsOptions>
CreateBasicOptions(void)288 CGumbelParamsOptionsFactory::CreateBasicOptions(void)
289 {
290     CGumbelParamsOptions* options = new CGumbelParamsOptions();
291     options->SetGapOpening(11);
292     options->SetGapExtension(1);
293     options->SetLambdaAccuracy(0.001);
294     options->SetKAccuracy(0.05);
295     options->SetGapped(true);
296     options->SetMaxCalcTime(1.0);
297     options->SetMaxCalcMemory(1000.0);
298 
299     return CRef<CGumbelParamsOptions>(options);
300 }
301 
302 ///////////////////////////////////////////////////////////////////////////////
303 //CGumbelParamsCalc
304 
CGumbelParamsCalc(const CRef<CGumbelParamsOptions> & options)305 CGumbelParamsCalc::CGumbelParamsCalc(const CRef<CGumbelParamsOptions>& options)
306     : m_Options(options)
307 {}
308 
CGumbelParamsCalc(const CRef<CGumbelParamsOptions> & options,const CRef<CGumbelParamsRandDiagnostics> & rand)309 CGumbelParamsCalc::CGumbelParamsCalc(const CRef<CGumbelParamsOptions>& options,
310                                      const CRef<CGumbelParamsRandDiagnostics>& rand)
311     : m_Options(options), m_RandParams(rand)
312 {}
313 
314 
315 template <typename T>
s_CopyVector(const vector<T> & in,vector<T> & out)316 static void s_CopyVector(const vector<T>& in, vector<T>& out)
317 {
318     out.resize(in.size());
319     copy(in.begin(), in.end(), out.begin());
320 }
321 
322 
Run(void)323 CRef<CGumbelParamsResult> CGumbelParamsCalc::Run(void)
324 {
325     CGumbelParamsResult* result;
326     Int4 number_of_aa = -1;
327 
328     try {
329 
330         double current_time1;
331         double current_time2;
332         Sls::alp_data::get_current_time(current_time1);
333 
334         number_of_aa = m_Options->GetNumResidues();
335         _ASSERT(number_of_aa > 0);
336 
337         result = new CGumbelParamsResult();
338         SGumbelParams& gumbel_params = result->SetGumbelParams();
339         CGumbelParamsResult::SSbsArrays& sbs_arrays = result->SetSbsArrays();
340 
341         //Gapless parameters calculation
342 
343         double gapless_time_portion = 0.5;
344 
345         const Int4** scoring_matrix = m_Options->GetScoreMatrix();
346 
347         const double* background_probabilities1
348             = &m_Options->GetSeq1ResidueProbs()[0];
349 
350         const double* background_probabilities2
351             = &m_Options->GetSeq2ResidueProbs()[0];
352 
353         double gapless_calc_time = m_Options->GetMaxCalcTime();
354 
355         if(m_Options->GetGapped()) {
356             // Gapless calculation may take only a portion of maximum allowed
357             // calculation time in the case of gapped calculation
358             gapless_calc_time *= gapless_time_portion;
359         }
360 
361         Njn::LocalMaxStatMatrix local_max_stat_matrix(number_of_aa,
362                                               scoring_matrix,
363                                               background_probabilities1,
364                                               background_probabilities2,
365                                               number_of_aa,
366                                               gapless_calc_time);
367 
368         if(local_max_stat_matrix.getTerminated()) {
369             throw Sls::error("Please increase maximum allowed calculation time.",
370                              1);
371         }
372 
373         //calculation of a and alpha
374         double calculation_error = 1e-6;
375 
376         gumbel_params.gapless_alpha = local_max_stat_matrix.getAlpha();
377         gumbel_params.gapless_alpha_error = calculation_error;
378 
379         gumbel_params.gapless_a = local_max_stat_matrix.getA();
380         gumbel_params.gapless_a_error = calculation_error;
381 
382         if(!m_Options->GetGapped()) {
383 
384             //calculation of all required parameters for a gapless case
385             gumbel_params.G=0;
386 
387             gumbel_params.lambda = local_max_stat_matrix.getLambda ();
388             gumbel_params.lambda_error = calculation_error;
389 
390             gumbel_params.K = local_max_stat_matrix.getK ();
391             gumbel_params.K_error = calculation_error;
392 
393             gumbel_params.C = local_max_stat_matrix.getC ();;
394             gumbel_params.C_error = calculation_error;
395 
396             gumbel_params.sigma = gumbel_params.gapless_alpha;
397             gumbel_params.sigma_error = calculation_error;
398 
399             gumbel_params.alpha_i = gumbel_params.gapless_alpha;
400             gumbel_params.alpha_i_error = calculation_error;
401 
402             gumbel_params.alpha_j = gumbel_params.gapless_alpha;
403             gumbel_params.alpha_j_error = calculation_error;
404 
405             gumbel_params.ai = gumbel_params.gapless_a;
406             gumbel_params.ai_error = calculation_error;
407 
408             gumbel_params.aj = gumbel_params.gapless_a;
409             gumbel_params.aj_error = calculation_error;
410 
411             Sls::alp_data::get_current_time(current_time2);
412             result->SetCalcTime(current_time2 - current_time1);
413 
414             sbs_arrays.lambda_sbs.resize(2);
415             sbs_arrays.lambda_sbs[0]=gumbel_params.lambda;
416             sbs_arrays.lambda_sbs[1]=gumbel_params.lambda + calculation_error;
417 
418             sbs_arrays.K_sbs.resize(2);
419             sbs_arrays.K_sbs[0]=gumbel_params.K;
420             sbs_arrays.K_sbs[1]=gumbel_params.K+calculation_error;
421 
422             sbs_arrays.C_sbs.resize(2);
423             sbs_arrays.C_sbs[0]=gumbel_params.C;
424             sbs_arrays.C_sbs[1]=gumbel_params.C+calculation_error;
425 
426             sbs_arrays.sigma_sbs.resize(2);
427             sbs_arrays.sigma_sbs[0]=gumbel_params.sigma;
428             sbs_arrays.sigma_sbs[1]=gumbel_params.sigma + calculation_error;
429 
430             sbs_arrays.alpha_i_sbs.resize(2);
431             sbs_arrays.alpha_i_sbs[0]=gumbel_params.alpha_i;
432             sbs_arrays.alpha_i_sbs[1]=gumbel_params.alpha_i + calculation_error;
433 
434             sbs_arrays.alpha_j_sbs.resize(2);
435             sbs_arrays.alpha_j_sbs[0]=gumbel_params.alpha_j;
436             sbs_arrays.alpha_j_sbs[1]=gumbel_params.alpha_j + calculation_error;
437 
438             sbs_arrays.ai_sbs.resize(2);
439             sbs_arrays.ai_sbs[0]=gumbel_params.ai;
440             sbs_arrays.ai_sbs[1]=gumbel_params.ai + calculation_error;
441 
442             sbs_arrays.aj_sbs.resize(2);
443             sbs_arrays.aj_sbs[0]=gumbel_params.aj;
444             sbs_arrays.aj_sbs[1]=gumbel_params.aj + calculation_error;
445 
446             // New TRandParams object is created only if m_Options does
447             // not contain one already.
448             if (m_RandParams.Empty()) {
449 
450                 m_RandParams.Reset(new CGumbelParamsRandDiagnostics());
451                 m_RandParams->SetRandomSeed(0);
452                 m_RandParams->SetTotalReNumber(0);
453                 m_RandParams->SetTotalReNumberKilling(0);
454             }
455         }
456         else {
457             double current_time_gapless_preliminary;
458             Sls::alp_data::get_current_time(current_time_gapless_preliminary);
459             double gapless_preliminary_time =
460                 current_time_gapless_preliminary - current_time1;
461 
462             // TODO: There should be constructors for two cases with and without
463             // rand params
464             Sls::alp_data data_obj(m_Options, m_RandParams);
465             data_obj.d_max_time =
466                 Sls::alp_data::Tmax((1.0 - gapless_time_portion)
467                                     * data_obj.d_max_time,data_obj.d_max_time
468                                     - gapless_preliminary_time);
469 
470             Sls::alp_sim sim_obj(&data_obj);
471             gumbel_params.G = m_Options->GetGapOpening()
472                 + m_Options->GetGapExtension();
473 
474             gumbel_params.lambda = sim_obj.m_Lambda;
475             gumbel_params.lambda_error = sim_obj.m_LambdaError;
476 
477             gumbel_params.K = sim_obj.m_K;
478             gumbel_params.K_error = sim_obj.m_KError;
479 
480             gumbel_params.C = sim_obj.m_C;
481             gumbel_params.C_error = sim_obj.m_CError;
482 
483             gumbel_params.sigma = sim_obj.m_Sigma;
484             gumbel_params.sigma_error = sim_obj.m_SigmaError;
485 
486             gumbel_params.alpha_i = sim_obj.m_AlphaI;
487             gumbel_params.alpha_i_error = sim_obj.m_AlphaIError;
488 
489             gumbel_params.alpha_j = sim_obj.m_AlphaJ;
490             gumbel_params.alpha_j_error = sim_obj.m_AlphaJError;
491 
492             gumbel_params.ai = sim_obj.m_AI;
493             gumbel_params.ai_error = sim_obj.m_AIError;
494 
495             gumbel_params.aj = sim_obj.m_AJ;
496             gumbel_params.aj_error = sim_obj.m_AJError;
497 
498             Sls::alp_data::get_current_time(current_time2);
499             result->SetCalcTime(current_time2 - current_time1);
500 
501             s_CopyVector(sim_obj.m_LambdaSbs, sbs_arrays.lambda_sbs);
502             s_CopyVector(sim_obj.m_KSbs, sbs_arrays.K_sbs);
503             s_CopyVector(sim_obj.m_CSbs, sbs_arrays.C_sbs);
504             s_CopyVector(sim_obj.m_SigmaSbs, sbs_arrays.sigma_sbs);
505             s_CopyVector(sim_obj.m_AlphaISbs, sbs_arrays.alpha_i_sbs);
506             s_CopyVector(sim_obj.m_AlphaJSbs, sbs_arrays.alpha_j_sbs);
507             s_CopyVector(sim_obj.m_AISbs, sbs_arrays.ai_sbs);
508             s_CopyVector(sim_obj.m_AJSbs, sbs_arrays.aj_sbs);
509 
510 
511             // New TRandParams object is created only if m_Options does
512             // not contain one already.
513             if (m_RandParams.Empty()) {
514 
515                 m_RandParams.Reset(new CGumbelParamsRandDiagnostics());
516                 m_RandParams->SetRandomSeed(
517                               sim_obj.d_alp_data->d_rand_all->d_random_factor);
518 
519                 vector<Int4>& fsprelim_numbers
520                     = m_RandParams->SetFirstStagePrelimReNumbers();
521 
522                 s_CopyVector(sim_obj.d_alp_data->d_rand_all
523                          ->d_first_stage_preliminary_realizations_numbers_ALP,
524                          fsprelim_numbers);
525 
526                 vector<Int4>& prelim_numbers
527                     = m_RandParams->SetPrelimReNumbers();
528 
529                 s_CopyVector(sim_obj.d_alp_data->d_rand_all
530                              ->d_preliminary_realizations_numbers_ALP,
531                              prelim_numbers);
532 
533                 vector<Int4>& prelim_numbers_killing
534                     = m_RandParams->SetPrelimReNumbersKilling();
535 
536                 s_CopyVector(sim_obj.d_alp_data->d_rand_all
537                              ->d_preliminary_realizations_numbers_killing,
538                              prelim_numbers_killing);
539 
540                 m_RandParams->SetTotalReNumber(sim_obj.d_alp_data->d_rand_all
541                                ->d_total_realizations_number_with_ALP);
542 
543                 m_RandParams->SetTotalReNumberKilling(
544                            sim_obj.d_alp_data->d_rand_all
545                            ->d_total_realizations_number_with_killing);
546                         }
547                 }
548     }
549     catch (Sls::error er) {
550 
551         switch(er.error_code) {
552         case 1: NCBI_THROW(CGumbelParamsException, eLimitsExceeded,
553                        (string)"Time or memory limit exceeded.");
554 
555         case 2: NCBI_THROW(CGumbelParamsException, eRepeatCalc,
556                            (string)"The program could not estimate the "
557                            "parameters. Please repeat calculation");
558 
559         case 3: NCBI_THROW(CGumbelParamsException, eParamsError,
560                            (string)"Results cannot be computed for the "
561                            "provided parameters. " + er.st);
562 
563         case 41: NCBI_THROW(CGumbelParamsException, eMemAllocError,
564                             (string)"Memory allocation error");
565 
566         case 4:
567         default: NCBI_THROW(CGumbelParamsException, eUnexpectedError,
568                             (string)"Unexpected error");
569         }
570     }
571 
572     m_Result.Reset(result);
573     return m_Result;
574 }
575 
576 
577