1 #ifndef INCLUDED_SLS_ALIGNMENT_EVALUER
2 #define INCLUDED_SLS_ALIGNMENT_EVALUER
3 
4 /* $Id: $
5 * ===========================================================================
6 *
7 *							PUBLIC DOMAIN NOTICE
8 *			   National Center for Biotechnology Information
9 *
10 *  This software/database is a "United States Government Work" under the
11 *  terms of the United States Copyright Act.  It was written as part of
12 *  the author's offical duties as a United States Government employee and
13 *  thus cannot be copyrighted.  This software/database is freely available
14 *  to the public for use. The National Library of Medicine and the U.S.
15 *  Government have not placed any restriction on its use or reproduction.
16 *
17 *  Although all reasonable efforts have been taken to ensure the accuracy
18 *  and reliability of the software and data, the NLM and the U.S.
19 *  Government do not and cannot warrant the performance or results that
20 *  may be obtained by using this software or data. The NLM and the U.S.
21 *  Government disclaim all warranties, express or implied, including
22 *  warranties of performance, merchantability or fitness for any particular
23 *  purpose.
24 *
25 *  Please cite the author in any work or product based on this material.
26 *
27 * ===========================================================================*/
28 
29 /*****************************************************************************
30 
31 File name: sls_alignment_evaluer.hpp
32 
33 Authors: Martin Frith, Sergey Sheetlin
34 
35 Contents: library functions of main routines
36 
37 ******************************************************************************/
38 
39 #include "sls_pvalues.hpp"
40 #include <math.h>
41 
42 namespace Sls {
43 	const double default_importance_sampling_temperature = 1.07;
44 
45 	class AlignmentEvaluer {
46 
47 	//Write the parameters:
48 	friend std::ostream &operator<<(std::ostream &s_,
49 			const AlignmentEvaluer &g_);
50 
51 	//Read the parameters:
52 	friend std::istream &operator>>(std::istream &s_,
53 			AlignmentEvaluer &g_);
54 
55 	public:
56 
57 	//randomization parameters
58 	struct gapped_computation_parameters_struct
59 	{
gapped_computation_parameters_structSls::AlignmentEvaluer::gapped_computation_parameters_struct60 		gapped_computation_parameters_struct()
61 		{
62 			d_parameters_flag=false;
63 			d_max_time_for_quick_tests=-1;
64 			d_max_time_with_computation_parameters=-1;
65 		};
66 
67 		std::vector<long int> d_first_stage_preliminary_realizations_numbers_ALP;
68 		std::vector<long int> d_preliminary_realizations_numbers_ALP;
69 		std::vector<long int> d_preliminary_realizations_numbers_killing;
70 		long int d_total_realizations_number_with_ALP;
71 		long int d_total_realizations_number_with_killing;
72 		bool d_parameters_flag;//if false then the parameters are not defined
73 		double d_max_time_for_quick_tests;//maximum allowed calculation time in seconds for quick tests
74 		double d_max_time_with_computation_parameters;//maximum allowed time in seconds for the whole computation
75 	};
76 
77 
78 	//Computes gapless Gumbel parameters:
79 	void initGapless(long alphabetSize_,//a number of letters in the alphabet
80 			const long *const *substitutionScoreMatrix_,//scoring matrix
81 			const double *letterFreqs1_,//background frequencies of letters in sequence #1
82 			const double *letterFreqs2_,//background frequencies of letters in sequence #2
83 			double max_time_=60);//maximum allowed calculation time in seconds
84 
85 	//Computes gapped Gumbel parameters:
86 	//The NCBI convention is used for penalizing a gap:
87 	//For example, a gap of length k is penalized as gapOpen1_+k*gapEpen1_ for sequence #1
88 	//if max_time_<=0 then d_gapped_computation_parameters will be used;
89 	//d_gapped_computation_parameters.d_parameters_flag must be true in this case
90 	//every execution of initGapped with max_time_>0 rewrites d_gapped_computation_parameters by the actual values
91 	void initGapped(long alphabetSize_,//a number of letters in the alphabet
92 			const long *const *substitutionScoreMatrix_,//scoring matrix
93 			const double *letterFreqs1_,//background frequencies of letters in sequence #1
94 			const double *letterFreqs2_,//background frequencies of letters in sequence #2
95 			long gapOpen1_,//gap opening penalty for sequence #1
96 			long gapEpen1_,//gap extension penalty for sequence #1
97 			long gapOpen2_,//gap opening penalty for sequence #2
98 			long gapEpen2_,//gap extension penalty for sequence #2
99 			bool insertions_after_deletions_,//if true, then insertions after deletions are permitted
100 			double eps_lambda_,//relative error for the parameter lambda
101 			double eps_K_,//relative error for the parameter K
102 			double max_time_,//maximum allowed calculation time in seconds;
103 			double max_mem_,//maximum allowed memory usage in Mb
104 			long randomSeed_,//randomizaton seed
105 			double temperature_=default_importance_sampling_temperature);
106 
107 
108 	//Initializes Gumbel parameters using precalculated values:
109 	void initParameters(
110 	const AlignmentEvaluerParameters &parameters_);//parameters_ must be defined by the user
111 
112 	void initParameters(
113 	const AlignmentEvaluerParametersWithErrors &parameters_);//parameters_ must be defined by the user
114 
115 	//Computes P-values/E-values
116 	//Gumbel parameters must be defined via d_params
117 	void calc(double score_,//pairwise alignment score
118 			double seqlen1_,//length of sequence #1
119 			double seqlen2_,//length of sequence #2
120 			double &pvalue_,//resulted P-value
121 			double &pvalueErr_,//P-value error
122 			double &evalue_,//resulted E-value
123 			double &evalueErr_) const;//E-value error
124 
125 	//The following routines compute P-values/E-values and related
126 	//quantities quickly, without calculating errors
127 
128 	//Gumbel parameters must be defined via d_params
129 	void calc(double score_,//pairwise alignment score
130 		double seqlen1_,//length of sequence #1
131 		double seqlen2_,//length of sequence #2
132 		double &pvalue_,//resulted P-value
133 		double &evalue_) const;//resulted E-value
134 
evalue(double score_,double seqlen1_,double seqlen2_) const135 	double evalue(double score_,//pairwise alignment score
136 			double seqlen1_,//length of sequence #1
137 			double seqlen2_) const//length of sequence #2
138 	{
139 		return area(score_, seqlen1_, seqlen2_) * evaluePerArea(score_);
140 	}
141 
142 	//Computes P-values from E-values
pvalue(double evalue_)143 	static double pvalue(double evalue_)
144 	{
145 		return sls_basic::one_minus_exp_function(-evalue_);
146 	}
147 
148 	//The "area" is approximately seqlen1_*seqlen2_, but it is
149 	//modified by a finite size correction
150 	double area(double score_,//pairwise alignment score
151 			double seqlen1_,//length of sequence #1
152 			double seqlen2_) const;//length of sequence #2
153 
evaluePerArea(double score_) const154 	double evaluePerArea(double score_) const
155 	{
156 	  return d_params.K*exp(-d_params.lambda*score_);
157 	}
158 
bitScore(double score_,double logK) const159 	double bitScore(double score_, double logK) const
160 	{
161 		return (d_params.lambda*score_-logK)/log(2.0);
162 	}
163 
bitScore(double score_) const164 	double bitScore(double score_) const
165 	{
166 		return (d_params.lambda*score_-log(d_params.K))/log(2.0);
167 	}
168 
169 	//returns "true" if the set of parameters "d_params" is fully defined for P-value calculation
isGood() const170 	bool isGood() const
171 	{
172 		return d_params.d_params_flag;
173 	}
174 
175 	//provides access to the set of Gumbel parameters
parameters() const176 	const ALP_set_of_parameters &parameters() const { return d_params; }
177 
178 	//provides access to the set of randomization parameters
gapped_computation_parameters() const179 	const gapped_computation_parameters_struct &gapped_computation_parameters() const { return d_gapped_computation_parameters; }
180 
181 	//simplified setting of the computation parameters
182 	void set_gapped_computation_parameters_simplified(
183 		double max_time_=-1.0,//maximum allowed time in seconds for the whole computation
184 		long number_of_samples_=1000,//number of realization for the main stage
185 		long number_of_samples_for_preliminary_stages_=200);//number of realization for the preliminary stages
186 
set_gapped_computation_parameters(const gapped_computation_parameters_struct & gapped_computation_parameters_)187 	void set_gapped_computation_parameters(
188 		const gapped_computation_parameters_struct &gapped_computation_parameters_)
189 	{
190 		d_gapped_computation_parameters=gapped_computation_parameters_;
191 	};
192 
193 	//clear the computation parameters
194 	void gapped_computation_parameters_clear();
195 
196 	private:
197 	//check correctness of the input parameters for gapless alignment
198 	void assert_Gapless_input_parameters(
199 		long alphabetSize_,//a number of letters in the alphabet
200 		const double *letterFreqs1_,//background frequencies of letters in sequence #1
201 		const double *letterFreqs2_,//background frequencies of letters in sequence #2
202 		double *&letterFreqs1_normalized_,//normalized background frequencies of letters in sequence #1
203 		double *&letterFreqs2_normalized_,//normalized background frequencies of letters in sequence #2
204 		const std::string function_name_);//"assert_Gapless_input_parameters" is called from "function_name_" function
205 
206 	private:
207 
208 		ALP_set_of_parameters d_params;//set of Gumbel parameters
209 
210 		//generalized randomization parameters for the gapped computation
211 		gapped_computation_parameters_struct d_gapped_computation_parameters;
212 
213 	};
214 }
215 
216 #endif //! INCLUDED
217 
218