1 /* 2 * hhdecl.h 3 * 4 * Created on: Mar 28, 2014 5 * Author: meiermark 6 */ 7 8 #ifndef HHDECL_H_ 9 #define HHDECL_H_ 10 11 #include <unistd.h> 12 13 #include "crf.h" 14 #include "aa.h" 15 #include "crf_pseudocounts-inl.h" 16 #include "library_pseudocounts-inl.h" 17 #include "log.h" 18 #include "util.h" 19 20 ///////////////////////////////////////////////////////////////////////////////////// 21 //// Global variable declarations 22 ///////////////////////////////////////////////////////////////////////////////////// 23 24 const char REFERENCE[] = "Steinegger M, Meier M, Mirdita M, Vöhringer H, Haunsberger S J, and Söding J (2019)\nHH-suite3 for fast remote homology detection and deep protein annotation.\nBMC Bioinformatics, doi:10.1186/s12859-019-3019-7\n"; 25 const char COPYRIGHT[] = "(c) The HH-suite development team\n"; 26 27 const int LINELEN=524288; //max length of line read in from input files; must be >= MAXCOL 28 const int MAXSEQDIS=10238;//max number of sequences stored in 'hit' objects and displayed in output alignment 29 const int IDLEN=255; //max length of scop hierarchy id and pdb-id 30 const int DESCLEN=32765;//max length of sequence description (longname) 31 const int NAMELEN=(PATH_MAX>512? PATH_MAX:512); //max length of file names etc., defined in limits.h 32 const int NAA=20; //number of amino acids (0-19) 33 const int NTRANS=7; //number of transitions recorded in HMM (M2M,M2I,M2D,I2M,I2I,D2M,D2D) 34 const int NCOLMIN=10; //min number of cols in subalignment for calculating pos-specific weights w[k][i] 35 const int ANY=20; //number representing an X (any amino acid) internally 36 const int GAP=21; //number representing a gap internally 37 const int FWD_BKW_PATHWITDH=40; //cell off path width around viterbi alignment 38 const int ENDGAP=22; //Important to distinguish because end gaps do not contribute to tansition counts 39 const int HMMSCALE=1000;//Scaling number for log2-values in HMMs 40 const int MAXPROF=32766;//Maximum number of HMM scores for fitting EVD 41 const float MAXENDGAPFRAC=0.1; //For weighting: include only columns into subalignment i that have a max fraction of seqs with endgap 42 const float LAMDA=0.388; //lamda in score EVD used for -local mode in length correction: S = S-log(Lq*Lt)/LAMDA) 43 const float LAMDA_GLOB=0.42; //lamda in score EVD used for -global mode 44 const int SELFEXCL=3; // exclude self-alignments with j-i<SELFEXCL 45 const float PLTY_GAPOPEN=6.0f; // for -qsc option (filter for min similarity to query): 6 bits to open gap 46 const float PLTY_GAPEXTD=1.0f; // for -qsc option (filter for min similarity to query): 1 bit to extend gap 47 const int MINCOLS_REALIGN=6; // hits with MAC alignments with fewer matched columns will be deleted in hhsearch hitlist; must be at least 2 to avoid nonsense MAC alignments starting from the left/upper edge 48 const float LOG1000=log(1000.0); 49 const float POSTERIOR_PROBABILITY_THRESHOLD = 0.01; 50 const int VITERBI_PATH_WIDTH=40; 51 52 // Secondary structure 53 const int NDSSP=8; //number of different ss states determined by dssp: 0-7 (0: no state available) 54 const int NSSPRED=4; //number of different ss states predicted by psipred: 0-3 (0: no prediction availabe) 55 const int MAXCF=11; //number of different confidence values: 0-10 (0: no prediction availabe) 56 57 // const char aa[]="ARNDCQEGHILKMFPSTWYVX-"; 58 //Amino acids Sorted by alphabet -> internal numbers a 59 // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 60 // A C D E F G H I K L M N P Q R S T V W Y X 61 const int s2a[]={ 0, 4, 3, 6,13, 7, 8, 9,11,10,12, 2,14, 5, 1,15,16,19,17,18,20}; 62 63 //Internal numbers a for amino acids -> amino acids Sorted by alphabet: 64 // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 65 // A R N D C Q E G H I L K M F P S T W Y V X 66 const int a2s[]={ 0,14,11, 2, 1,13, 3, 5, 6, 7, 9, 8,10, 4,12,15,16,18,19,17,20}; 67 68 enum transitions {M2M,M2I,M2D,I2M,I2I,D2M,D2D}; // index for transitions within a HMM 69 70 #ifdef __GNUC__ 71 #define __DEPRECATED__ __attribute__((deprecated)) 72 #elif defined(_MSC_VER) 73 #define __DEPRECATED__ __declspec(deprecated) 74 #else 75 #pragma message("WARNING: No DEPRECATED for this compiler") 76 #define __DEPRECATED__ 77 #endif 78 79 80 enum pair_states {DEPRECATED_STOP=0,DEPRECATED_MM=2,DEPRECATED_GD=3,DEPRECATED_IM=4,DEPRECATED_DG=5,DEPRECATED_MI=6}; 81 //__DEPRECATED__ 82 const pair_states STOP = pair_states(DEPRECATED_STOP); 83 //__DEPRECATED__ 84 const pair_states MM = pair_states(DEPRECATED_MM); 85 //__DEPRECATED__ 86 const pair_states GD = pair_states(DEPRECATED_GD); 87 //__DEPRECATED__ 88 const pair_states IM = pair_states(DEPRECATED_IM); 89 //__DEPRECATED__ 90 const pair_states DG = pair_states(DEPRECATED_DG); 91 //__DEPRECATED__ 92 const pair_states MI = pair_states(DEPRECATED_MI); 93 94 95 //states for the interim filter of the query msa during merging 96 enum InterimFilterStates {INTERIM_FILTER_NONE=0, INTERIM_FILTER_FULL=1}; 97 98 // Pseudocounts 99 namespace Pseudocounts { 100 enum Admix { 101 ConstantAdmix = 1, 102 HHsearchAdmix = 2, 103 CSBlastAdmix = 3 104 }; 105 106 struct Params { 107 Params( 108 Admix m = ConstantAdmix, 109 double a = 1.0, 110 double b = 1.0, 111 double c = 1.0, 112 double neff = 0.0) admixParams113 : admix(m), pca(a), pcb(b), pcc(c), target_neff(neff) {} 114 CreateAdmixParams115 cs::Admix* CreateAdmix() { 116 switch (admix) { 117 case ConstantAdmix: 118 return new cs::ConstantAdmix(pca); 119 break; 120 case HHsearchAdmix: 121 return new cs::HHsearchAdmix(pca, pcb, pcc); 122 break; 123 case CSBlastAdmix: 124 return new cs::CSBlastAdmix(pca, pcb); 125 break; 126 default: 127 return NULL; 128 } 129 } 130 131 Admix admix; // admixture mode 132 double pca; // admixture paramter a 133 double pcb; // admixture paramter b 134 double pcc; // admixture parameter c needed for HHsearchAdmix 135 double target_neff; // target diversity adjusted by optimizing a 136 }; 137 138 }; 139 140 class Parameters // Parameters for gap penalties and pseudocounts 141 { 142 public: 143 Parameters(const int argc, const char** argv); 144 145 const char** argv; //command line parameters 146 const char argc; //dimension of argv 147 148 LogLevel v; 149 150 char infile[NAMELEN]; // input filename 151 char outfile[NAMELEN]; // output filename 152 char matrices_output_file[NAMELEN]; 153 bool filter_matrices; 154 char pairwisealisfile[NAMELEN]; // output filename with pairwise alignments 155 char alisbasename[NAMELEN]; 156 char alnfile[NAMELEN]; // name of output alignment file in A3M format (for iterative search) 157 char hhmfile[NAMELEN]; // name of output HHM file for (iterative search) 158 char psifile[NAMELEN]; // name of output alignmen file in PSI-BLAST format (iterative search) 159 char scorefile[NAMELEN];// table of scores etc for all HMMs in searched database 160 char m8file[NAMELEN]; // blast tab format for all HMMs in searched database 161 char indexfile[NAMELEN];// optional file containing indeices of aligned residues in given alignment 162 std::vector<std::string> tfiles; // template filenames (in hhalign) 163 char alitabfile[NAMELEN]; // where to write pairs of aligned residues (-atab option) 164 char* exclstr; // optional string containing list of excluded residues, e.g. '1-33,97-168' 165 char* template_exclstr; 166 int aliwidth; // number of characters per line in output alignments for HMM search 167 float p; // minimum probability for inclusion in hit list and alignments 168 double E; // maximum E-value for inclusion in hit list and alignment list 169 double e; // maximum E-value for inclusion in output alignment, output HMM, and PSI-BLAST checkpoint model 170 int Z; // max number of lines in hit list 171 int z; // min number of lines in hit list 172 int B; // max number of lines in alignment list 173 int b; // min number of lines in alignment list 174 char showcons; // in query-template alignments 0: don't show consensus sequence 1:show 175 char showdssp; // in query-template alignments 0: don't show ss_dssp lines 1:show 176 char showpred; // in query-template alignments 0: don't show ss_pred and ss_conf lines 1:show 177 char showconf; // in query-template alignments 0: don't show ss_conf lines 1:show 178 char cons; // if set to 1, include consensus as first representative sequence of HMM 179 int nseqdis; // maximum number of query or template sequences in output alignments 180 char mark; // which sequences to mark for display in output alignments? 0: auto; 1:all 181 char append; // append to output file? (hhmake) 182 char outformat; // 0: hhr 1: FASTA 2:A2M 3:A3M 183 //0:MAC alignment, master-slave 1:MAC blending, master-slave 2:MAC alignment, combining 184 185 186 int max_seqid; // Maximum sequence identity with all other sequences in alignment 187 int qid; // Minimum sequence identity with query sequence (sequence 0) 188 float qsc; // Minimum score per column with query sequence (sequence 0) 189 int coverage; // Minimum coverage threshold 190 int Ndiff; // Pick Ndiff most different sequences that passed the other filter thresholds 191 bool allseqs; // if true, do not filter in output alignment; show all sequences 192 193 int Mgaps; // Maximum percentage of gaps for match states 194 int M; // Match state assignment by 1:upper/lower case 2:percentage rule 3:marked sequence 195 int M_template; 196 char matrix; // Subst.matrix 0: Gonnet, 1: HSDM, 2: BLOSUM50 197 198 char wg; // 0: use local sequence weights 1: use global ones 199 200 Pseudocounts::Params pc_hhm_context_engine; // Pseudocounts parameters for query hhm if context given 201 Pseudocounts::Params pc_prefilter_context_engine; // Pseudocounts parameters for prefiltering if context given 202 203 //pseudocount variables if no context is used 204 int pc_hhm_nocontext_mode; // Admixture method 205 float pc_hhm_nocontext_a; // Admixture parameter a 206 float pc_hhm_nocontext_b; // Admixture parameter b 207 float pc_hhm_nocontext_c; // Admixture parameter c 208 209 //pseudocount variables for the prefilter if no context is used 210 int pc_prefilter_nocontext_mode; // Admixture method 211 float pc_prefilter_nocontext_a; // Admixture parameter a 212 float pc_prefilter_nocontext_b; // Admixture parameter b 213 float pc_prefilter_nocontext_c; // Admixture parameter c 214 215 float gapb; // Diversity threshold for adding pseudocounts to transitions from M state 216 float gapd; // Gap open penalty factor for deletions 217 float gape; // Gap extend penalty: factor to multiply hmmer values (def=1) 218 float gapf; // factor for increasing/reducing the gap opening penalty for deletes 219 float gapg; // factor for increasing/reducing the gap opening penalty for inserts 220 float gaph; // factor for increasing/reducing the gap extension penalty for deletes 221 float gapi; // factor for increasing/reducing the gap extension penalty for inserts 222 223 float egq; // penalty for end gaps when query not fully covered 224 float egt; // penalty for end gaps when template not fully covered 225 226 float Neff; 227 228 char ssm; // SS comparison mode: 0:no ss scoring 1:ss scoring AFTER alignment 2:ss score in column score 229 float ssw; // SS weight as compared to column score 230 float ssw_realign; // SS weight as compared to column score for realign 231 float ssa; // SS state evolution matrix M1 = (1-ssa)*I + ssa*M0 232 233 char loc; // 0: local alignment (wrt. query), 1: global alignement 234 char realign; // realign database hits to be displayed with MAC algorithm 235 int premerge; // permerge up to N hits before realign 236 int altali; // find up to this many possibly overlapping alignments 237 float smin; //Minimum score of hit needed to search for another repeat of same profile: p=exp(-(4-mu)/lamda)=0.01 238 int columnscore; // 0: no aa comp corr 1: 1/2(qav+tav) 2: template av freqs 3: query av freqs 4:... 239 int half_window_size_local_aa_bg_freqs; // half-window size to average local aa background frequencies 240 float corr; // Weight of correlations between scores with |i-j|<=4 241 float shift; // Score offset for match-match states 242 double mact; // Probability threshold (negative offset) in MAC alignment determining greediness at ends of alignment 243 int realign_max; // Realign max ... hits 244 float maxmem; // maximum available memory in GB for realignment (approximately) 245 246 int min_overlap; // all cells of dyn. programming matrix with L_T-j+i or L_Q-i+j < min_overlap will be ignored 247 char notags; // neutralize His-tags, FLAG tags, C-myc tags? 248 249 //TODO: a const would be nicer 250 unsigned int maxdbstrlen; // maximum length of database string to be printed in 'Command' line of hhr file 251 252 int maxcol; // max number of columns in sequence/MSA input files; must be <= LINELEN and >= maxres 253 int maxres; // max number of states in HMM; must be <= LINELEN 254 int maxseq; // max number of sequences in MSA 255 int maxnumdb; // max number of hits allowed past prefilter 256 257 bool hmmer_used; // True, if a HMMER database is used 258 259 // parameters for context-specific pseudocounts 260 float csb; 261 float csw; 262 std::string clusterfile; 263 bool nocontxt; 264 265 // HHblits 266 int dbsize; // number of clusters of input database 267 268 // HHblits Evalue calculation (alpha = a + b(Neff(T) - 1)(1 - c(Neff(Q) - 1)) ) 269 float alphaa; 270 float alphab; 271 float alphac; 272 273 // For filtering database alignments in HHsearch and HHblits 274 // JS: What are these used for? They are set to the values without _db anyway. 275 int max_seqid_db; 276 int qid_db; 277 float qsc_db; 278 int coverage_db; 279 int Ndiff_db; 280 281 // HHblits context state prefilter 282 std::string cs_library; 283 284 // HHblits prefilter 285 bool prefilter; // perform prefiltering in HHblits? 286 287 //early stopping stuff 288 bool early_stopping_filter; // Break HMM search, when the sum of the last N HMM-hit-Evalues is below threshold 289 double filter_thresh; // Threshold for early stopping 290 291 // For HHblits prefiltering with SSE2 292 short prefilter_gap_open; 293 short prefilter_gap_extend; 294 int prefilter_score_offset; 295 int prefilter_bit_factor; 296 double prefilter_evalue_thresh; 297 double prefilter_evalue_coarse_thresh; 298 int preprefilter_smax_thresh; 299 300 int min_prefilter_hits; 301 302 size_t max_number_matrices; 303 304 //hhblits specific variables 305 int num_rounds; 306 std::vector<std::string> db_bases; 307 // Perform filtering of already seen HHMs 308 bool already_seen_filter; 309 // Realign old hits in last round or use previous alignments 310 bool realign_old_hits; 311 float neffmax; 312 int threads; 313 314 InterimFilterStates interim_filter; 315 }; 316 317 318 #endif /* HHDECL_H_ */ 319