1 /* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */ 2 3 /********************************************************************* 4 * Clustal Omega - Multiple sequence alignment 5 * 6 * Copyright (C) 2010 University College Dublin 7 * 8 * Clustal-Omega is free software; you can redistribute it and/or 9 * modify it under the terms of the GNU General Public License as 10 * published by the Free Software Foundation; either version 2 of the 11 * License, or (at your option) any later version. 12 * 13 * This file is part of Clustal-Omega. 14 * 15 ********************************************************************/ 16 17 /* 18 * RCS $Id: hhdecl-C.h 227 2011-03-28 17:03:09Z fabian $ 19 */ 20 21 ///////////////////////////////////////////////////////////////////////////////////// 22 //// Constants 23 ///////////////////////////////////////////////////////////////////////////////////// 24 25 const char VERSION_AND_DATE[]="version 1.5.1.3 (November 2008)"; 26 const char REFERENCE[]="Soding, J. Protein homology detection by HMM-HMM comparison. Bioinformatics 2005, 21, 951-960.\n"; 27 const char COPYRIGHT[]="(C) Johannes Soeding (see LICENSE file)\n"; 28 const int MAXSEQ=65535; //max number of sequences in input alignment (must be <~30000 on cluster nodes) 29 #if 0 30 const int MAXCOL=32765; //max number of residues in input files; must be <= LINELEN and >= MAXRES 31 const int MAXRES=15002; //max number of columns in HMM; must be <= LINELEN 32 #else 33 const int MAXCOL=2/*131072*/; //max number of residues in input files; must be <= LINELEN and >= MAXRES 34 const int MAXRES=1/*65536*/; //max number of columns in HMM; must be <= LINELEN 35 #endif 36 const int LINELEN=262144; //max length of line read in from input files; must be >= MAXCOL 37 const int MAXSEQDIS=3; //10238;//max number of sequences stored in 'hit' objects and displayed in output alignment 38 const int IDLEN=255; //max length of scop hierarchy id and pdb-id 39 const int DESCLEN=32765;//max length of sequence description (longname) 40 const int NAMELEN=511; //max length of file names etc. 41 const int MAXOPT=127; //Maximum number of options to be read in from .hhconfig or command line 42 const int NAA=20; //number of amino acids (0-19) 43 const int NTRANS=10; //number of transitions recorded in HMM (M2M,M2I,M2D,I2M,I2I,D2M,D2D,M2M_GAPOPEN,GAPOPEN,GAPEXTD) 44 const int NCOLMIN=10; //min number of cols in subalignment for calculating pos-specific weights w[k][i] 45 const int ANY=20; //number representing an X (any amino acid) internally 46 const int GAP=21; //number representing a gap internally 47 const int ENDGAP=22; //Important to distinguish because end gaps do not contribute to tansition counts 48 const int HMMSCALE=1000;//Scaling number for log2-values in HMMs 49 const int NFAMMAX=5119; //Size of hash for counting number of HMMs in each family 50 const int MAXPROF=32766;//Maximum number of HMM scores for fitting EVD 51 const float MAXENDGAPFRAC=0.1; //For weighting: include only columns into subalignment i that have a max fraction of seqs with endgap 52 const float SMIN= 20.; //Minimum score of hit needed to search for another repeat of same profile: p=exp(-(4-mu)/lamda)=0.01 53 const float LAMDA=0.388; //lamda in score EVD used for -local mode in length correction: S = S-log(Lq*Lt)/LAMDA) 54 const float LAMDA_GLOB=0.42; //lamda in score EVD used for -global mode 55 const float PMAX=1E-2; //Maximum single-repeat p-value that can contribute to whole-protein p-value 56 const float MINEVALEXCL=0.5; //above this E-value from first ML fit hits are not used for final ML fit of EVD 57 const int SELFEXCL=3; // exclude self-alignments with j-i<SELFEXCL 58 const float PLTY_GAPOPEN=6.0f; // for -qsc option (filter for min similarity to query): 6 bits to open gap 59 const float PLTY_GAPEXTD=1.0f; // for -qsc option (filter for min similarity to query): 1 bit to extend gap 60 const int MINCOLS_REALIGN=6; // hits with MAC alignments with fewer matched columns will be deleted in hhsearch hitlist 61 62 enum transitions {M2M,M2I,M2D,I2M,I2I,D2M,D2D,M2M_GAPOPEN,GAPOPEN,GAPEXTD}; // index for transitions within a HMM 63 enum pair_states {STOP=0,SAME=1,GD=2,IM=3,DG=4,MI=5,MS=6,ML=7,SM=8,LM=9,MM=10}; 64 65 // const char aa[]="ARNDCQEGHILKMFPSTWYVX-"; 66 //Amino acids Sorted by alphabet -> internal numbers a 67 // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 68 // A C D E F G H I K L M N P Q R S T V W Y X 69 const int s2a[]={ 0, 4, 3, 6,13, 7, 8, 9,11,10,12, 2,14, 5, 1,15,16,19,17,18,20}; 70 //Internal numbers a for amino acids -> amino acids Sorted by alphabet: 71 // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 72 // A R N D C Q E G H I L K M F P S T W Y V X 73 const int a2s[]={ 0,14,11, 2, 1,13, 3, 5, 6, 7, 9, 8,10, 4,12,15,16,18,19,17,20}; 74 75 // Secondary structure 76 const int NDSSP=8; //number of different ss states determined by dssp: 0-7 (0: no state available) 77 const int NSSPRED=4; //number of different ss states predicted by psipred: 0-3 (0: no prediction availabe) 78 const int MAXCF=11; //number of different confidence values: 0-10 (0: no prediction availabe) 79 const int NSA=7; //number of classes relative solvent accesiblity (0:no coord, 1:<2%, 2:<14%, 3:<33%, 4:<55%, 5:>55%, 6:S-S bridge) 80 81 ///////////////////////////////////////////////////////////////////////////////////// 82 ///////////////////////////////////////////////////////////////////////////////////// 83 84 // Input parameters 85 class Parameters // Parameters for gap penalties and pseudocounts 86 { 87 public: 88 char** argv; //command line parameters 89 char argc; //dimension of argv 90 91 char infile[NAMELEN]; // input filename 92 char outfile[NAMELEN]; // output filename 93 char pairwisealisfile[NAMELEN]; // output filename with pairwise alignments 94 char alnfile[NAMELEN]; // name of output alignment file in A3M format (for iterative search) 95 char hhmfile[NAMELEN]; // name of output HHM file for (iterative search) 96 char psifile[NAMELEN]; // name of output alignmen file in PSI-BLAST format (iterative search) 97 char scorefile[NAMELEN];// table of scores etc for all HMMs in searched database 98 char tfile[NAMELEN]; // template filename (in hhalign) 99 char buffer[NAMELEN]; // buffer to write results for other programs into 100 char pngfile[NAMELEN]; // png image file for dotplot 101 char wfile[NAMELEN]; // weights file generated with hhformat 102 char* blafile; // output of 'blastpgp -m 8' with PSI-BLAST E-values for HHblast 103 char* dbfiles; // database filenames, separated by colons 104 char* exclstr; // optional string containing list of excluded residues, e.g. '1-33,97-168' 105 int aliwidth; // number of characters per line in output alignments for HMM search 106 char append; // append to output file? (hhmake) 107 float p; // minimum probability for inclusion in hit list and alignments 108 float E; // maximum E-value for inclusion in hit list and alignment list 109 float e; // maximum E-value for inclusion in output alignment, output HMM, and PSI-BLAST checkpoint model 110 int Z; // max number of lines in hit list 111 int z; // min number of lines in hit list 112 int B; // max number of lines in alignment list 113 int b; // min number of lines in alignment list 114 int showcons; // 0: don't show consensus sequence in alignments 1:show 115 int showdssp; // 0: don't show consensus sequence in alignments 1:show 116 int showpred; // 0: don't show consensus sequence in alignments 1:show 117 int nseqdis; // maximum number of query or template sequences in output alignments 118 char cons; // if set to 1, include consensus as first representative sequence of HMM 119 char mark; // which sequences to mark for display in output alignments? 0: auto; 1:all 120 char outformat; // 0: hhr 1: FASTA 2:A2M 3:A3M 121 char mode; // 122 //0:MAC alignment, master-slave 1:MAC blending, master-slave 2:MAC alignment, combining 123 124 int max_seqid; // Maximum sequence identity with all other sequences in alignment 125 int qid; // Minimum sequence identity with query sequence (sequence 0) 126 float qsc; // Minimum score per column with query sequence (sequence 0) 127 int coverage; // Minimum coverage threshold 128 int Ndiff; // Pick Ndiff most different sequences that passed the other filter thresholds 129 int coverage_core; // Minimum coverage for sequences in core alignment 130 float qsc_core; // Minimum sequence identity with query for sequences in core alignment 131 float coresc; // Minimum score per column with core alignment (HMM) 132 133 int maxResLen; /* length of longest sequence/profile, FS 2010-11-05 */ 134 int maxColCnt; /* maximum number of columns in HMM, FS 2010-11-05 */ 135 136 int Mgaps; // Maximum percentage of gaps for match states 137 int M; // Match state assignment by 1:upper/lower case 2:percentage rule 3:marked sequence 138 char matrix; // Subst.matrix 0: Gonnet, 1: HSDM, 2: BLOSUM50 139 140 char wg; // 0: use local sequence weights 1: use local ones 141 double *pdWg1; /* seq weights 1st profile, derived from tree */ 142 double *pdWg2; /* seq weights 2nd profile, derived from tree */ 143 144 char pcm; // 0:no pseudocounts, 1:pos-specific pcs, 2:PSIBLAST pcs 145 /* pseudo-count parameters for MAC*/ 146 float pca; // Pseudocount matrix = (1-tau(i))*I + tau(i)*S 147 float pcb; // tau(i) = pca/(1 + ((Neff-1)/pcb)^pcc 148 float pcc; // 149 float pcw; // Decrease pseudocounts for conserved columns 150 151 /* gap parameters for MAC*/ 152 float gapb; // Diversity threshold for adding pseudocounts to transitions from M state 153 float gapd; // Gap open penalty factor for deletions 154 float gape; // Gap extend penalty: factor to multiply hmmer values (def=1) 155 float gapf; // factor for increasing/reducing the gap opening penalty for deletes 156 float gapg; // factor for increasing/reducing the gap opening penalty for inserts 157 float gaph; // factor for increasing/reducing the gap extension penalty for deletes 158 float gapi; // factor for increasing/reducing the gap extension penalty for inserts 159 160 /* pseudo-count parameters for Viterbi, FS, r226->r227 */ 161 float pcaV; // Pseudocount matrix = (1-tau(i))*I + tau(i)*S 162 float pcbV; // tau(i) = pca/(1 + ((Neff-1)/pcb)^pcc 163 float pccV; // 164 float pcwV; // Decrease pseudocounts for conserved columns 165 166 /* gap parameters for Viterbi, FS, r226->r227 */ 167 float gapbV; // Diversity threshold for adding pseudocounts to transitions from M state 168 float gapdV; // Gap open penalty factor for deletions 169 float gapeV; // Gap extend penalty: factor to multiply hmmer values (def=1) 170 float gapfV; // factor for increasing/reducing the gap opening penalty for deletes 171 float gapgV; // factor for increasing/reducing the gap opening penalty for inserts 172 float gaphV; // factor for increasing/reducing the gap extension penalty for deletes 173 float gapiV; // factor for increasing/reducing the gap extension penalty for inserts 174 175 float egq; // penalty for end gaps when query not fully covered 176 float egt; // penalty for end gaps when template not fully covered 177 178 float neffa; // Coefficients to estimate Neff-dependent weights for HMM merging procedure 179 float neffb; // Coefficients to estimate Neff-dependent weights for HMM merging procedure 180 181 char ssgap; // 1: add secondary structure-dependent gap penalties 0:off 182 float ssgapd; // secondary structure-dependent gap-opening penalty (per residue) 183 float ssgape; // secondary structure-dependent gap-extension penalty (per residue) 184 char ssgapi; // max. number of inside-integer(ii); gap-open-penalty= -ii*ssgapd 185 186 char ssm; // SS comparison mode: 0:no ss scoring 1:ss scoring AFTER alignment 2:ss score in column score 187 float ssw; // SS weight as compared to column score 188 float ssa; // SS state evolution matrix M1 = (1-ssa)*I + ssa*M0 189 190 char loc; // 0: local alignment (wrt. query), 1: global alignement 191 char forward; // 0:Viterbi algorithm 1:Forward algorithm 2: MAC 192 char realign; // realign database hits to be displayed with MAC algorithm 193 char altali; // find up to this many possibly overlapping alignments 194 int columnscore; // 0: no aa comp corr 1: 1/2(qav+tav) 2: template av freqs 3: query av freqs 4:... 195 float corr; // Weight of correlations between scores with |i-j|<=4 196 float shift; // Score offset for match-match states 197 float mact; // Score threshold (negative offset) in MAC alignment 198 199 char calibrate; // calibration of query HMM? 0:no, 1:yes (write lamda,mu into query profile) 200 char calm; // derive P-values from: 0:query calibration 1:template calibration 2:both 201 int opt; // for optimization: compare only every opt'th negative; 0: mode off 202 int readdefaultsfile ; // read defaults file ./.hhdefaults or HOME/.hhdefaults? 203 int min_overlap; // all cells of dyn. programming matrix with L_T-j+i or L_Q-i+j < min_overlap will be ignored 204 int hitrank; // rank of hit to be printed as a3m alignment 205 char notags; // neutralize His-tags, FLAG tags, C-myc tags? 206 unsigned int maxdbstrlen; // maximum length of database string to be printed in 'Command' line of hhr file 207 208 char trans; // 0: normal pairwise scoring; 1:transitive scoring 209 float Emax_trans; // max E-value for intermediate HMMs in transitive scoring (i.e. l is intermediate HMM if E_lq, E_lk <Emax_trans) 210 float wtrans; // Ztot[k] = Zq[k] + wtrans * (Zforward[k]+Zreverse[k]) 211 212 213 // SCRAP THE FOLLOWING VARIABLES? 214 215 float wstruc; // weight of structure scores 216 char repmode; // 1:repeat identification: multiple hits not treated as independent 0: repeat mode off 217 218 int idummy; 219 int jdummy; 220 float fdummy; 221 }; 222 223 ///////////////////////////////////////////////////////////////////////////////////// 224 //// Global variable declarations 225 ///////////////////////////////////////////////////////////////////////////////////// 226 227 char v=1; // 1: show only warnings 2:verbose mode 228 Parameters par; 229 char program_name[NAMELEN]; //name of program executed (e.g. hhmake of hhsearch) 230 231 // substitution matrix flavours 232 float P[21][21]; // P[a][b] = combined probability for a aligned to b 233 float R[21][21]; // R[a][b]=P[a][b]/p[b]=P(a|b); precalculated for pseudocounts 234 float Sim[21][21]; // Similarity matrix Sim[a][b]: how similar are a and b? 235 float S[21][21]; // Substitution score matrix S[a][b] = log2(Pab/pa/pb) 236 float pb[21]; // pb[a] = background amino acid probabilities for chosen substitution matrix 237 float qav[21]; // qav[a] = background amino acid probabilities for query HMM (needed for rate matrix rescaling) 238 239 // secondary structure matrices 240 float S73[NDSSP][NSSPRED][MAXCF]; // P[A][B][cf] = log2 P(A,B,cf)/P(A)/P(B,cf) 241 float S33[NSSPRED][MAXCF][NSSPRED][MAXCF]; // P[B][cf][B'][cf'] = log2 sum_B' P(A,B',cf)/P(A)/P(B,cf) * P_b(B'|B) 242 // float S77[NDSSP][DSSP]; // P[A][B] = log2 P(A,B)/P(A)/P(B) 243 244