1 /*
2  * hhdecl.h
3  *
4  *  Created on: Mar 28, 2014
5  *      Author: meiermark
6  */
7 
8 #ifndef HHDECL_H_
9 #define HHDECL_H_
10 
11 #include <unistd.h>
12 
13 #include "crf.h"
14 #include "aa.h"
15 #include "crf_pseudocounts-inl.h"
16 #include "library_pseudocounts-inl.h"
17 #include "log.h"
18 #include "util.h"
19 
20 /////////////////////////////////////////////////////////////////////////////////////
21 //// Global variable declarations
22 /////////////////////////////////////////////////////////////////////////////////////
23 
24 const char REFERENCE[] = "Steinegger M, Meier M, Mirdita M, Vöhringer H, Haunsberger S J, and Söding J (2019)\nHH-suite3 for fast remote homology detection and deep protein annotation.\nBMC Bioinformatics, doi:10.1186/s12859-019-3019-7\n";
25 const char COPYRIGHT[] = "(c) The HH-suite development team\n";
26 
27 const int LINELEN=524288; //max length of line read in from input files; must be >= MAXCOL
28 const int MAXSEQDIS=10238;//max number of sequences stored in 'hit' objects and displayed in output alignment
29 const int IDLEN=255;     //max length of scop hierarchy id and pdb-id
30 const int DESCLEN=32765;//max length of sequence description (longname)
31 const int NAMELEN=(PATH_MAX>512? PATH_MAX:512); //max length of file names etc., defined in limits.h
32 const int NAA=20;       //number of amino acids (0-19)
33 const int NTRANS=7;     //number of transitions recorded in HMM (M2M,M2I,M2D,I2M,I2I,D2M,D2D)
34 const int NCOLMIN=10;   //min number of cols in subalignment for calculating pos-specific weights w[k][i]
35 const int ANY=20;       //number representing an X (any amino acid) internally
36 const int GAP=21;       //number representing a gap internally
37 const int FWD_BKW_PATHWITDH=40;       //cell off path width around viterbi alignment
38 const int ENDGAP=22;    //Important to distinguish because end gaps do not contribute to tansition counts
39 const int HMMSCALE=1000;//Scaling number for log2-values in HMMs
40 const int MAXPROF=32766;//Maximum number of HMM scores for fitting EVD
41 const float MAXENDGAPFRAC=0.1; //For weighting: include only columns into subalignment i that have a max fraction of seqs with endgap
42 const float LAMDA=0.388; //lamda in score EVD used for -local mode in length correction: S = S-log(Lq*Lt)/LAMDA)
43 const float LAMDA_GLOB=0.42; //lamda in score EVD used for -global mode
44 const int SELFEXCL=3;   // exclude self-alignments with j-i<SELFEXCL
45 const float PLTY_GAPOPEN=6.0f; // for -qsc option (filter for min similarity to query): 6 bits to open gap
46 const float PLTY_GAPEXTD=1.0f; // for -qsc option (filter for min similarity to query): 1 bit to extend gap
47 const int MINCOLS_REALIGN=6; // hits with MAC alignments with fewer matched columns will be deleted in hhsearch hitlist; must be at least 2 to avoid nonsense MAC alignments starting from the left/upper edge
48 const float LOG1000=log(1000.0);
49 const float POSTERIOR_PROBABILITY_THRESHOLD = 0.01;
50 const int VITERBI_PATH_WIDTH=40;
51 
52 // Secondary structure
53 const int NDSSP=8;      //number of different ss states determined by dssp: 0-7 (0: no state available)
54 const int NSSPRED=4;    //number of different ss states predicted by psipred: 0-3 (0: no prediction availabe)
55 const int MAXCF=11;     //number of different confidence values: 0-10 (0: no prediction availabe)
56 
57 // const char aa[]="ARNDCQEGHILKMFPSTWYVX-";
58 //Amino acids Sorted by alphabet     -> internal numbers a
59 //                0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
60 //                A  C  D  E  F  G  H  I  K  L  M  N  P  Q  R  S  T  V  W  Y  X
61 const int s2a[]={ 0, 4, 3, 6,13, 7, 8, 9,11,10,12, 2,14, 5, 1,15,16,19,17,18,20};
62 
63 //Internal numbers a for amino acids -> amino acids Sorted by alphabet:
64 //                0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
65 //                A  R  N  D  C  Q  E  G  H  I  L  K  M  F  P  S  T  W  Y  V  X
66 const int a2s[]={ 0,14,11, 2, 1,13, 3, 5, 6, 7, 9, 8,10, 4,12,15,16,18,19,17,20};
67 
68 enum transitions {M2M,M2I,M2D,I2M,I2I,D2M,D2D}; // index for transitions within a HMM
69 
70 #ifdef __GNUC__
71 #define __DEPRECATED__ __attribute__((deprecated))
72 #elif defined(_MSC_VER)
73 #define __DEPRECATED__ __declspec(deprecated)
74 #else
75 #pragma message("WARNING: No DEPRECATED for this compiler")
76 #define __DEPRECATED__
77 #endif
78 
79 
80 enum pair_states {DEPRECATED_STOP=0,DEPRECATED_MM=2,DEPRECATED_GD=3,DEPRECATED_IM=4,DEPRECATED_DG=5,DEPRECATED_MI=6};
81 //__DEPRECATED__
82 const pair_states STOP = pair_states(DEPRECATED_STOP);
83 //__DEPRECATED__
84 const pair_states MM = pair_states(DEPRECATED_MM);
85 //__DEPRECATED__
86 const pair_states GD = pair_states(DEPRECATED_GD);
87 //__DEPRECATED__
88 const pair_states IM = pair_states(DEPRECATED_IM);
89 //__DEPRECATED__
90 const pair_states DG = pair_states(DEPRECATED_DG);
91 //__DEPRECATED__
92 const pair_states MI = pair_states(DEPRECATED_MI);
93 
94 
95 //states for the interim filter of the query msa during merging
96 enum InterimFilterStates {INTERIM_FILTER_NONE=0, INTERIM_FILTER_FULL=1};
97 
98 // Pseudocounts
99 namespace Pseudocounts {
100 enum Admix {
101   ConstantAdmix = 1,
102   HHsearchAdmix = 2,
103   CSBlastAdmix  = 3
104 };
105 
106 struct Params {
107   Params(
108       Admix m     = ConstantAdmix,
109       double a    = 1.0,
110       double b    = 1.0,
111       double c    = 1.0,
112       double neff = 0.0)
admixParams113     : admix(m), pca(a), pcb(b), pcc(c), target_neff(neff) {}
114 
CreateAdmixParams115   cs::Admix* CreateAdmix() {
116     switch (admix) {
117       case ConstantAdmix:
118         return new cs::ConstantAdmix(pca);
119         break;
120       case HHsearchAdmix:
121         return new cs::HHsearchAdmix(pca, pcb, pcc);
122         break;
123       case CSBlastAdmix:
124         return new cs::CSBlastAdmix(pca, pcb);
125         break;
126       default:
127         return NULL;
128     }
129   }
130 
131   Admix admix;        // admixture mode
132   double pca;         // admixture paramter a
133   double pcb;         // admixture paramter b
134   double pcc;         // admixture parameter c needed for HHsearchAdmix
135   double target_neff; // target diversity adjusted by optimizing a
136 };
137 
138 };
139 
140 class Parameters          // Parameters for gap penalties and pseudocounts
141 {
142 public:
143   Parameters(const int argc, const char** argv);
144 
145   const char** argv;            //command line parameters
146   const char argc;              //dimension of argv
147 
148   LogLevel v;
149 
150   char infile[NAMELEN];   // input filename
151   char outfile[NAMELEN];  // output filename
152   char matrices_output_file[NAMELEN];
153   bool filter_matrices;
154   char pairwisealisfile[NAMELEN]; // output filename with pairwise alignments
155   char alisbasename[NAMELEN];
156   char alnfile[NAMELEN];  // name of output alignment file in A3M format (for iterative search)
157   char hhmfile[NAMELEN];  // name of output HHM file for (iterative search)
158   char psifile[NAMELEN];  // name of output alignmen file in PSI-BLAST format (iterative search)
159   char scorefile[NAMELEN];// table of scores etc for all HMMs in searched database
160   char m8file[NAMELEN];   // blast tab format for all HMMs in searched database
161   char indexfile[NAMELEN];// optional file containing indeices of aligned residues in given alignment
162   std::vector<std::string> tfiles;    // template filenames (in hhalign)
163   char alitabfile[NAMELEN]; // where to write pairs of aligned residues (-atab option)
164   char* exclstr;          // optional string containing list of excluded residues, e.g. '1-33,97-168'
165   char* template_exclstr;
166   int aliwidth;           // number of characters per line in output alignments for HMM search
167   float p;                // minimum probability for inclusion in hit list and alignments
168   double E;               // maximum E-value for inclusion in hit list and alignment list
169   double e;               // maximum E-value for inclusion in output alignment, output HMM, and PSI-BLAST checkpoint model
170   int Z;                  // max number of lines in hit list
171   int z;                  // min number of lines in hit list
172   int B;                  // max number of lines in alignment list
173   int b;                  // min number of lines in alignment list
174   char showcons;           // in query-template alignments  0: don't show consensus sequence   1:show
175   char showdssp;           // in query-template alignments  0: don't show ss_dssp lines        1:show
176   char showpred;           // in query-template alignments  0: don't show ss_pred and ss_conf lines  1:show
177   char showconf;           // in query-template alignments  0: don't show ss_conf lines        1:show
178   char cons;              // if set to 1, include consensus as first representative sequence of HMM
179   int nseqdis;            // maximum number of query or template sequences in output alignments
180   char mark;              // which sequences to mark for display in output alignments? 0: auto; 1:all
181   char append;            // append to output file? (hhmake)
182   char outformat;         // 0: hhr  1: FASTA  2:A2M   3:A3M
183                           //0:MAC alignment, master-slave  1:MAC blending, master-slave  2:MAC alignment, combining
184 
185 
186   int max_seqid;          // Maximum sequence identity with all other sequences in alignment
187   int qid;                // Minimum sequence identity with query sequence (sequence 0)
188   float qsc;              // Minimum score per column with query sequence (sequence 0)
189   int coverage;           // Minimum coverage threshold
190   int Ndiff;              // Pick Ndiff most different sequences that passed the other filter thresholds
191   bool allseqs;           // if true, do not filter in output alignment; show all sequences
192 
193   int Mgaps;              // Maximum percentage of gaps for match states
194   int M;                  // Match state assignment by  1:upper/lower case  2:percentage rule  3:marked sequence
195   int M_template;
196   char matrix;            // Subst.matrix 0: Gonnet, 1: HSDM, 2: BLOSUM50
197 
198   char wg;                // 0: use local sequence weights   1: use global ones
199 
200   Pseudocounts::Params pc_hhm_context_engine;       // Pseudocounts parameters for query hhm if context given
201   Pseudocounts::Params pc_prefilter_context_engine; // Pseudocounts parameters for prefiltering if context given
202 
203   //pseudocount variables if no context is used
204   int pc_hhm_nocontext_mode;              // Admixture method
205   float pc_hhm_nocontext_a;               // Admixture parameter a
206   float pc_hhm_nocontext_b;               // Admixture parameter b
207   float pc_hhm_nocontext_c;               // Admixture parameter c
208 
209   //pseudocount variables for the prefilter if no context is used
210   int pc_prefilter_nocontext_mode;           // Admixture method
211   float pc_prefilter_nocontext_a;            // Admixture parameter a
212   float pc_prefilter_nocontext_b;            // Admixture parameter b
213   float pc_prefilter_nocontext_c;            // Admixture parameter c
214 
215   float gapb;             // Diversity threshold for adding pseudocounts to transitions from M state
216   float gapd;             // Gap open penalty factor for deletions
217   float gape;             // Gap extend penalty: factor to multiply hmmer values (def=1)
218   float gapf;             // factor for increasing/reducing the gap opening penalty for deletes
219   float gapg;             // factor for increasing/reducing the gap opening penalty for inserts
220   float gaph;             // factor for increasing/reducing the gap extension penalty for deletes
221   float gapi;             // factor for increasing/reducing the gap extension penalty for inserts
222 
223   float egq;              // penalty for end gaps when query not fully covered
224   float egt;              // penalty for end gaps when template not fully covered
225 
226   float Neff;
227 
228   char ssm;               // SS comparison mode: 0:no ss scoring  1:ss scoring AFTER alignment  2:ss score in column score
229   float ssw;              // SS weight as compared to column score
230   float ssw_realign;      // SS weight as compared to column score for realign
231   float ssa;              // SS state evolution matrix M1 = (1-ssa)*I + ssa*M0
232 
233   char loc;               // 0: local alignment (wrt. query), 1: global alignement
234   char realign;           // realign database hits to be displayed with MAC algorithm
235   int premerge;           // permerge up to N hits before realign
236   int altali;             // find up to this many possibly overlapping alignments
237   float smin;             //Minimum score of hit needed to search for another repeat of same profile: p=exp(-(4-mu)/lamda)=0.01
238   int columnscore;        // 0: no aa comp corr  1: 1/2(qav+tav) 2: template av freqs 3: query av freqs 4:...
239   int half_window_size_local_aa_bg_freqs; // half-window size to average local aa background frequencies
240   float corr;             // Weight of correlations between scores with |i-j|<=4
241   float shift;            // Score offset for match-match states
242   double mact;            // Probability threshold (negative offset) in MAC alignment determining greediness at ends of alignment
243   int realign_max;        // Realign max ... hits
244   float maxmem;           // maximum available memory in GB for realignment (approximately)
245 
246   int min_overlap;        // all cells of dyn. programming matrix with L_T-j+i or L_Q-i+j < min_overlap will be ignored
247   char notags;            // neutralize His-tags, FLAG tags, C-myc tags?
248 
249   //TODO: a const would be nicer
250   unsigned int maxdbstrlen; // maximum length of database string to be printed in 'Command' line of hhr file
251 
252   int maxcol;             // max number of columns in sequence/MSA input files; must be <= LINELEN and >= maxres
253   int maxres;             // max number of states in HMM; must be <= LINELEN
254   int maxseq;             // max number of sequences in MSA
255   int maxnumdb;           // max number of hits allowed past prefilter
256 
257   bool hmmer_used;        // True, if a HMMER database is used
258 
259   // parameters for context-specific pseudocounts
260   float csb;
261   float csw;
262   std::string clusterfile;
263   bool nocontxt;
264 
265   // HHblits
266   int dbsize;           // number of clusters of input database
267 
268   // HHblits Evalue calculation  (alpha = a + b(Neff(T) - 1)(1 - c(Neff(Q) - 1)) )
269   float alphaa;
270   float alphab;
271   float alphac;
272 
273   // For filtering database alignments in HHsearch and HHblits
274   // JS: What are these used for? They are set to the values without _db anyway.
275   int max_seqid_db;
276   int qid_db;
277   float qsc_db;
278   int coverage_db;
279   int Ndiff_db;
280 
281   // HHblits context state prefilter
282   std::string cs_library;
283 
284   // HHblits prefilter
285   bool prefilter;             // perform prefiltering in HHblits?
286 
287   //early stopping stuff
288   bool early_stopping_filter; // Break HMM search, when the sum of the last N HMM-hit-Evalues is below threshold
289   double filter_thresh;    // Threshold for early stopping
290 
291   // For HHblits prefiltering with SSE2
292   short prefilter_gap_open;
293   short prefilter_gap_extend;
294   int prefilter_score_offset;
295   int prefilter_bit_factor;
296   double prefilter_evalue_thresh;
297   double prefilter_evalue_coarse_thresh;
298   int preprefilter_smax_thresh;
299 
300   int min_prefilter_hits;
301 
302   size_t max_number_matrices;
303 
304   //hhblits specific variables
305   int num_rounds;
306   std::vector<std::string> db_bases;
307   // Perform filtering of already seen HHMs
308   bool already_seen_filter;
309   // Realign old hits in last round or use previous alignments
310   bool realign_old_hits;
311   float neffmax;
312   int threads;
313 
314   InterimFilterStates interim_filter;
315 };
316 
317 
318 #endif /* HHDECL_H_ */
319