1 #ifndef INCLUDED_SLS_ALIGNMENT_EVALUER 2 #define INCLUDED_SLS_ALIGNMENT_EVALUER 3 4 /* $Id: $ 5 * =========================================================================== 6 * 7 * PUBLIC DOMAIN NOTICE 8 * National Center for Biotechnology Information 9 * 10 * This software/database is a "United States Government Work" under the 11 * terms of the United States Copyright Act. It was written as part of 12 * the author's offical duties as a United States Government employee and 13 * thus cannot be copyrighted. This software/database is freely available 14 * to the public for use. The National Library of Medicine and the U.S. 15 * Government have not placed any restriction on its use or reproduction. 16 * 17 * Although all reasonable efforts have been taken to ensure the accuracy 18 * and reliability of the software and data, the NLM and the U.S. 19 * Government do not and cannot warrant the performance or results that 20 * may be obtained by using this software or data. The NLM and the U.S. 21 * Government disclaim all warranties, express or implied, including 22 * warranties of performance, merchantability or fitness for any particular 23 * purpose. 24 * 25 * Please cite the author in any work or product based on this material. 26 * 27 * ===========================================================================*/ 28 29 /***************************************************************************** 30 31 File name: sls_alignment_evaluer.hpp 32 33 Authors: Martin Frith, Sergey Sheetlin 34 35 Contents: library functions of main routines 36 37 ******************************************************************************/ 38 39 #include "sls_pvalues.hpp" 40 #include <math.h> 41 42 namespace Sls { 43 const double default_importance_sampling_temperature = 1.07; 44 45 class AlignmentEvaluer { 46 47 //Write the parameters: 48 friend std::ostream &operator<<(std::ostream &s_, 49 const AlignmentEvaluer &g_); 50 51 //Read the parameters: 52 friend std::istream &operator>>(std::istream &s_, 53 AlignmentEvaluer &g_); 54 55 public: 56 57 //randomization parameters 58 struct gapped_computation_parameters_struct 59 { gapped_computation_parameters_structSls::AlignmentEvaluer::gapped_computation_parameters_struct60 gapped_computation_parameters_struct() 61 { 62 d_parameters_flag=false; 63 d_max_time_for_quick_tests=-1; 64 d_max_time_with_computation_parameters=-1; 65 }; 66 67 std::vector<long int> d_first_stage_preliminary_realizations_numbers_ALP; 68 std::vector<long int> d_preliminary_realizations_numbers_ALP; 69 std::vector<long int> d_preliminary_realizations_numbers_killing; 70 long int d_total_realizations_number_with_ALP; 71 long int d_total_realizations_number_with_killing; 72 bool d_parameters_flag;//if false then the parameters are not defined 73 double d_max_time_for_quick_tests;//maximum allowed calculation time in seconds for quick tests 74 double d_max_time_with_computation_parameters;//maximum allowed time in seconds for the whole computation 75 }; 76 77 78 //Computes gapless Gumbel parameters: 79 void initGapless(long alphabetSize_,//a number of letters in the alphabet 80 const long *const *substitutionScoreMatrix_,//scoring matrix 81 const double *letterFreqs1_,//background frequencies of letters in sequence #1 82 const double *letterFreqs2_,//background frequencies of letters in sequence #2 83 double max_time_=60);//maximum allowed calculation time in seconds 84 85 //Computes gapped Gumbel parameters: 86 //The NCBI convention is used for penalizing a gap: 87 //For example, a gap of length k is penalized as gapOpen1_+k*gapEpen1_ for sequence #1 88 //if max_time_<=0 then d_gapped_computation_parameters will be used; 89 //d_gapped_computation_parameters.d_parameters_flag must be true in this case 90 //every execution of initGapped with max_time_>0 rewrites d_gapped_computation_parameters by the actual values 91 void initGapped(long alphabetSize_,//a number of letters in the alphabet 92 const long *const *substitutionScoreMatrix_,//scoring matrix 93 const double *letterFreqs1_,//background frequencies of letters in sequence #1 94 const double *letterFreqs2_,//background frequencies of letters in sequence #2 95 long gapOpen1_,//gap opening penalty for sequence #1 96 long gapEpen1_,//gap extension penalty for sequence #1 97 long gapOpen2_,//gap opening penalty for sequence #2 98 long gapEpen2_,//gap extension penalty for sequence #2 99 bool insertions_after_deletions_,//if true, then insertions after deletions are permitted 100 double eps_lambda_,//relative error for the parameter lambda 101 double eps_K_,//relative error for the parameter K 102 double max_time_,//maximum allowed calculation time in seconds; 103 double max_mem_,//maximum allowed memory usage in Mb 104 long randomSeed_,//randomizaton seed 105 double temperature_=default_importance_sampling_temperature); 106 107 108 //Initializes Gumbel parameters using precalculated values: 109 void initParameters( 110 const AlignmentEvaluerParameters ¶meters_);//parameters_ must be defined by the user 111 112 void initParameters( 113 const AlignmentEvaluerParametersWithErrors ¶meters_);//parameters_ must be defined by the user 114 115 //Computes P-values/E-values 116 //Gumbel parameters must be defined via d_params 117 void calc(double score_,//pairwise alignment score 118 double seqlen1_,//length of sequence #1 119 double seqlen2_,//length of sequence #2 120 double &pvalue_,//resulted P-value 121 double &pvalueErr_,//P-value error 122 double &evalue_,//resulted E-value 123 double &evalueErr_) const;//E-value error 124 125 //The following routines compute P-values/E-values and related 126 //quantities quickly, without calculating errors 127 128 //Gumbel parameters must be defined via d_params 129 void calc(double score_,//pairwise alignment score 130 double seqlen1_,//length of sequence #1 131 double seqlen2_,//length of sequence #2 132 double &pvalue_,//resulted P-value 133 double &evalue_) const;//resulted E-value 134 evalue(double score_,double seqlen1_,double seqlen2_) const135 double evalue(double score_,//pairwise alignment score 136 double seqlen1_,//length of sequence #1 137 double seqlen2_) const//length of sequence #2 138 { 139 return area(score_, seqlen1_, seqlen2_) * evaluePerArea(score_); 140 } 141 142 //Computes P-values from E-values pvalue(double evalue_)143 static double pvalue(double evalue_) 144 { 145 return sls_basic::one_minus_exp_function(-evalue_); 146 } 147 148 //The "area" is approximately seqlen1_*seqlen2_, but it is 149 //modified by a finite size correction 150 double area(double score_,//pairwise alignment score 151 double seqlen1_,//length of sequence #1 152 double seqlen2_) const;//length of sequence #2 153 evaluePerArea(double score_) const154 double evaluePerArea(double score_) const 155 { 156 return d_params.K*exp(-d_params.lambda*score_); 157 } 158 bitScore(double score_,double logK) const159 double bitScore(double score_, double logK) const 160 { 161 return (d_params.lambda*score_-logK)/log(2.0); 162 } 163 bitScore(double score_) const164 double bitScore(double score_) const 165 { 166 return (d_params.lambda*score_-log(d_params.K))/log(2.0); 167 } 168 169 //returns "true" if the set of parameters "d_params" is fully defined for P-value calculation isGood() const170 bool isGood() const 171 { 172 return d_params.d_params_flag; 173 } 174 175 //provides access to the set of Gumbel parameters parameters() const176 const ALP_set_of_parameters ¶meters() const { return d_params; } 177 178 //provides access to the set of randomization parameters gapped_computation_parameters() const179 const gapped_computation_parameters_struct &gapped_computation_parameters() const { return d_gapped_computation_parameters; } 180 181 //simplified setting of the computation parameters 182 void set_gapped_computation_parameters_simplified( 183 double max_time_=-1.0,//maximum allowed time in seconds for the whole computation 184 long number_of_samples_=1000,//number of realization for the main stage 185 long number_of_samples_for_preliminary_stages_=200);//number of realization for the preliminary stages 186 set_gapped_computation_parameters(const gapped_computation_parameters_struct & gapped_computation_parameters_)187 void set_gapped_computation_parameters( 188 const gapped_computation_parameters_struct &gapped_computation_parameters_) 189 { 190 d_gapped_computation_parameters=gapped_computation_parameters_; 191 }; 192 193 //clear the computation parameters 194 void gapped_computation_parameters_clear(); 195 196 private: 197 //check correctness of the input parameters for gapless alignment 198 void assert_Gapless_input_parameters( 199 long alphabetSize_,//a number of letters in the alphabet 200 const double *letterFreqs1_,//background frequencies of letters in sequence #1 201 const double *letterFreqs2_,//background frequencies of letters in sequence #2 202 double *&letterFreqs1_normalized_,//normalized background frequencies of letters in sequence #1 203 double *&letterFreqs2_normalized_,//normalized background frequencies of letters in sequence #2 204 const std::string function_name_);//"assert_Gapless_input_parameters" is called from "function_name_" function 205 206 private: 207 208 ALP_set_of_parameters d_params;//set of Gumbel parameters 209 210 //generalized randomization parameters for the gapped computation 211 gapped_computation_parameters_struct d_gapped_computation_parameters; 212 213 }; 214 } 215 216 #endif //! INCLUDED 217 218