1 /************************************************************ 2 * HMMER - Biological sequence analysis with profile HMMs 3 * Copyright (C) 1992-2003 Washington University School of Medicine 4 * All Rights Reserved 5 * 6 * This source code is distributed under the terms of the 7 * GNU General Public License. See the files COPYING and LICENSE 8 * for details. 9 ************************************************************/ 10 11 /* structs.h 12 * 13 * Data structures used in HMMER. 14 * Also, a few miscellaneous macros and global variable declarations. 15 * 16 * RCS $Id: structs.h,v 1.28 2003/10/01 13:06:05 eddy Exp $ 17 */ 18 19 #ifndef STRUCTSH_INCLUDED 20 #define STRUCTSH_INCLUDED 21 22 #include "config.h" 23 24 25 /********************************************************************** 26 * 27 * Plan7 28 * Implementation of the new Plan7 HMM architecture. 29 * Fully probabilistic even for hmmsw, hmmls, and hmmfs; 30 * No insert->delete or delete->insert transitions; 31 * Improved structure layout. 32 * 33 * The strategy is to infiltrate plan7 code into HMMER in 34 * an evolutionary rather than revolutionary manner. 35 * 36 **********************************************************************/ 37 38 /* Plan 7 construction strategies. 39 */ 40 enum p7_construction { 41 P7_MAP_CONSTRUCTION, /* maximum a posteriori architecture */ 42 P7_HAND_CONSTRUCTION, /* hand specified architecture */ 43 P7_FAST_CONSTRUCTION /* fast ad hoc architecture */ 44 }; 45 46 /* Plan 7 parameter optimization strategies 47 */ 48 enum p7_param { 49 P7_MAP_PARAM, /* standard maximum a posteriori */ 50 P7_MD_PARAM, /* maximum discrimination */ 51 P7_MRE_PARAM, /* maximum relative entropy */ 52 P7_WMAP_PARAM /* ad hoc weighted MAP */ 53 }; 54 55 /* Structure: plan7_s 56 * 57 * Declaration of a Plan 7 profile-HMM. 58 */ 59 struct plan7_s { 60 /* Annotation on the model. A name is mandatory. 61 * Other fields are optional; whether they are present is 62 * flagged in the stateflags bit array. 63 * 64 * desc is only valid if PLAN7_DESC is set in flags. 65 * acc is only valid if PLAN7_ACC is set in flags. 66 * rf is only valid if PLAN7_RF is set in flags. 67 * cs is only valid if PLAN7_CS is set in flags. 68 * ca is only valid if PLAN7_CA is set in flags. 69 * map is only valid if PLAN7_MAP is set in flags. 70 */ 71 char *name; /* name of the model +*/ 72 char *acc; /* accession number of model (Pfam) +*/ 73 char *desc; /* brief description of model +*/ 74 char *rf; /* reference line from alignment 0..M +*/ 75 char *cs; /* consensus structure line 0..M +*/ 76 char *ca; /* consensus accessibility line 0..M */ 77 char *comlog; /* command line(s) that built model +*/ 78 int nseq; /* number of training sequences +*/ 79 char *ctime; /* creation date +*/ 80 int *map; /* map of alignment cols onto model 1..M+*/ 81 int checksum; /* checksum of training sequences +*/ 82 83 /* The following are annotations added to support work by Michael Asman, 84 * CGR Stockholm. They are not stored in model files; they are only 85 * used in model construction. 86 * 87 * #=GC X-PRM (PRT,PRI) annotation is picked up by hmmbuild and interpreted 88 * as specifying which mixture Dirichlet component to use. If these flags 89 * are non-NULL, the normal mixture Dirichlet code is bypassed, and a 90 * single specific Dirichlet is used at each position. 91 */ 92 int *tpri; /* which transition mixture prior to use */ 93 int *mpri; /* which match mixture prior to use */ 94 int *ipri; /* which insert mixture prior to use */ 95 96 /* Pfam-specific score cutoffs. 97 * 98 * ga1, ga2 are valid if PLAN7_GA is set in flags. 99 * tc1, tc2 are valid if PLAN7_TC is set in flags. 100 * nc1, nc2 are valid if PLAN7_NC is set in flags. 101 */ 102 float ga1, ga2; /* per-seq/per-domain gathering thresholds (bits) +*/ 103 float tc1, tc2; /* per-seq/per-domain trusted cutoff (bits) +*/ 104 float nc1, nc2; /* per-seq/per-domain noise cutoff (bits) +*/ 105 106 /* The main model in probability form: data-dependent probabilities. 107 * This is the core Krogh/Haussler model. 108 * Transition probabilities are usually accessed as a 109 * two-D array: hmm->t[k][TMM], for instance. They are allocated 110 * such that they can also be stepped through in 1D by pointer 111 * manipulations, for efficiency in DP algorithms. 112 */ 113 int M; /* length of the model (# nodes) +*/ 114 float **t; /* transition prob's. t[1..M-1][0..6] +*/ 115 float **mat; /* match emissions. mat[1..M][0..19] +*/ 116 float **ins; /* insert emissions. ins[1..M-1][0..19] +*/ 117 float tbd1; /* B->D1 prob (data dependent) +*/ 118 119 /* The unique states of Plan 7 in probability form. 120 * These are the algorithm-dependent, data-independent probabilities. 121 * Some parts of the code may briefly use a trick of copying tbd1 122 * into begin[0]; this makes it easy to call FChoose() or FNorm() 123 * on the resulting vector. However, in general begin[0] is not 124 * a valid number. 125 */ 126 float xt[4][2]; /* N,E,C,J extra states: 2 transitions +*/ 127 float *begin; /* 1..M B->M state transitions +*/ 128 float *end; /* 1..M M->E state transitions (!= a dist!) +*/ 129 130 /* The null model probabilities. 131 */ 132 float null[MAXABET]; /* "random sequence" emission prob's +*/ 133 float p1; /* null model loop probability +*/ 134 135 /* The model in log-odds score form. 136 * These are created from the probabilities by LogoddsifyHMM(). 137 * By definition, null[] emission scores are all zero. 138 * Note that emission distributions are over 26 upper-case letters, 139 * not just the unambiguous protein or DNA alphabet: we 140 * precalculate the scores for all IUPAC degenerate symbols we 141 * may see. Non-IUPAC symbols simply have a -INFTY score. 142 * 143 * Note the reversed indexing on msc, isc, tsc -- for efficiency reasons. 144 * They're not probability vectors any more so we can reorder them 145 * without wildly complicating our life. 146 * 147 * The _mem ptrs are where the real memory is alloc'ed and free'd, 148 * as opposed to where it is accessed. 149 * This came in with Erik Lindahl's altivec port; it allows alignment on 150 * 16-byte boundaries. In the non-altivec code, this is just a little 151 * redundancy; tsc and tsc_mem point to the same thing, for example. 152 * 153 * Only valid if PLAN7_HASBITS is set. 154 */ 155 int **tsc; /* transition scores [0.6][1.M-1] -*/ 156 int **msc; /* match emission scores [0.MAXCODE-1][1.M] -*/ 157 int **isc; /* ins emission scores [0.MAXCODE-1][1.M-1] -*/ 158 int xsc[4][2]; /* N,E,C,J transitions -*/ 159 int *bsc; /* begin transitions [1.M] -*/ 160 int *esc; /* end transitions [1.M] -*/ 161 int *tsc_mem, *msc_mem, *isc_mem, *bsc_mem, *esc_mem; 162 163 /* DNA translation scoring parameters 164 * For aligning protein Plan7 models to DNA sequence. 165 * Lookup value for a codon is calculated by pos1 * 16 + pos2 * 4 + pos3, 166 * where 'pos1' is the digitized value of the first nucleotide position; 167 * if any of the positions are ambiguous codes, lookup value 64 is used 168 * (which will generally have a score of zero) 169 * 170 * Only valid if PLAN7_HASDNA is set. 171 */ 172 int **dnam; /* triplet match scores [0.64][1.M] -*/ 173 int **dnai; /* triplet insert scores [0.64][1.M] -*/ 174 int dna2; /* -1 frameshift, doublet emission, M or I -*/ 175 int dna4; /* +1 frameshift, doublet emission, M or I -*/ 176 177 /* P-value and E-value statistical parameters 178 * Only valid if PLAN7_STATS is set. 179 */ 180 float mu; /* EVD mu +*/ 181 float lambda; /* EVD lambda +*/ 182 183 int flags; // bit flags indicating state of HMM, valid data 184 int atype; // alphabet type 185 }; 186 187 /* Flags for plan7->flags. 188 * Note: Some models have scores but no probabilities (for instance, 189 * after reading from an HMM save file). Other models have 190 * probabilities but no scores (for instance, during training 191 * or building). Since it costs time to convert either way, 192 * I use PLAN7_HASBITS and PLAN7_HASPROB flags to defer conversion 193 * until absolutely necessary. This means I have to be careful 194 * about keeping these flags set properly when I fiddle a model. 195 */ 196 #define PLAN7_HASBITS (1<<0) /* raised if model has log-odds scores */ 197 #define PLAN7_DESC (1<<1) /* raised if description exists */ 198 #define PLAN7_RF (1<<2) /* raised if #RF annotation available */ 199 #define PLAN7_CS (1<<3) /* raised if #CS annotation available */ 200 #define PLAN7_XRAY (1<<4) /* raised if structural data available */ 201 #define PLAN7_HASPROB (1<<5) /* raised if model has probabilities */ 202 #define PLAN7_HASDNA (1<<6) /* raised if protein HMM->DNA seq params set*/ 203 #define PLAN7_STATS (1<<7) /* raised if EVD parameters are available */ 204 #define PLAN7_MAP (1<<8) /* raised if alignment map is available */ 205 #define PLAN7_ACC (1<<9) /* raised if accession number is available */ 206 #define PLAN7_GA (1<<10) /* raised if gathering thresholds available */ 207 #define PLAN7_TC (1<<11) /* raised if trusted cutoffs available */ 208 #define PLAN7_NC (1<<12) /* raised if noise cutoffs available */ 209 #define PLAN7_CA (1<<13) /* raised if surface accessibility avail. */ 210 211 /* Indices for special state types, I: used for dynamic programming xmx[][] 212 * mnemonic: eXtra Matrix for B state = XMB 213 */ 214 #define XMB 0 215 #define XME 1 216 #define XMC 2 217 #define XMJ 3 218 #define XMN 4 219 220 /* Indices for special state types, II: used for hmm->xt[] indexing 221 * mnemonic: eXtra Transition for N state = XTN 222 */ 223 #define XTN 0 224 #define XTE 1 225 #define XTC 2 226 #define XTJ 3 227 228 /* Indices for Plan7 main model state transitions. 229 * Used for indexing hmm->t[k][] 230 * mnemonic: Transition from Match to Match = TMM 231 */ 232 #define TMM 0 233 #define TMI 1 234 #define TMD 2 235 #define TIM 3 236 #define TII 4 237 #define TDM 5 238 #define TDD 6 239 240 /* Indices for extra state transitions 241 * Used for indexing hmm->xt[][]. 242 */ 243 #define MOVE 0 /* trNB, trEC, trCT, trJB */ 244 #define LOOP 1 /* trNN, trEJ, trCC, trJJ */ 245 246 /* Declaration of Plan7 dynamic programming matrix structure. 247 */ 248 struct dpmatrix_s { 249 int **xmx; /* special scores [0.1..N][BECJN] */ 250 int **mmx; /* match scores [0.1..N][0.1..M] */ 251 int **imx; /* insert scores [0.1..N][0.1..M-1.M] */ 252 int **dmx; /* delete scores [0.1..N][0.1..M-1.M] */ 253 254 /* Hidden ptrs where the real memory is kept; this trick was 255 * introduced by Erik Lindahl with the Altivec port; it's used to 256 * align xmx, etc. on 16-byte boundaries for cache optimization. 257 */ 258 void *xmx_mem, *mmx_mem, *imx_mem, *dmx_mem; 259 260 /* The other trick brought in w/ the Lindahl Altivec port; dp matrix 261 * is retained and grown, rather than ReallocOrDieated for every HMM or sequence. 262 * Keep track of current allocated-for size in rows (sequence length N) 263 * and columns (HMM length M). Also keep track of pad sizes: how much 264 * we should overallocate rows or columns when we ReallocOrDieate. If pad = 0, 265 * then we're not growable in this dimension. 266 */ 267 int maxN; /* alloc'ed for seq of length N; N+1 rows */ 268 int maxM; /* alloc'ed for HMM of length M; M+1 cols */ 269 270 int padN; /* extra pad in sequence length/rows */ 271 int padM; /* extra pad in HMM length/columns */ 272 }; 273 274 /* Declaration of Plan7 shadow matrix structure. 275 * In general, allowed values are STM, STI, etc. 276 * However, E state has M possible sources, from 1..M match states; 277 * hence the esrc array. 278 */ 279 struct dpshadow_s { 280 char **xtb; /* special state traces [0.1..N][BECJN] */ 281 char **mtb; /* match state traces [0.1..N][0.1..M] */ 282 char **itb; /* insert state traces [0.1..N][0.1..M-1.M] */ 283 char **dtb; /* delete state traces [0.1..N][0.1..M-1.M] */ 284 int *esrc; /* E trace is special; must store a M state number 1..M */ 285 }; 286 287 288 /* Plan 7 model state types 289 * used in traceback structure 290 */ 291 #define STBOGUS 0 292 #define STM 1 293 #define STD 2 294 #define STI 3 295 #define STS 4 296 #define STN 5 297 #define STB 6 298 #define STE 7 299 #define STC 8 300 #define STT 9 301 #define STJ 10 302 303 /* Structure: p7trace_s 304 * 305 * Traceback structure for alignments of model to sequence. 306 * Each array in a trace_s is 0..tlen-1. 307 * Element 0 is always to STATE_S. Element tlen-1 is always to STATE_T. 308 */ 309 struct p7trace_s { 310 int tlen; /* length of traceback */ 311 char *statetype; /* state type used for alignment */ 312 int *nodeidx; /* index of aligned node, 1..M (if M,D,I), or 0 */ 313 int *pos; /* position in dsq, 1..L, or 0 if none */ 314 }; 315 316 317 /********************************************************************** 318 * Other structures, not having to do with HMMs. 319 **********************************************************************/ 320 321 /* Structure: histogram_s 322 * 323 * Keep a score histogram. 324 * 325 * The main implementation issue here is that the range of 326 * scores is unknown, and will go negative. histogram is 327 * a 0..max-min array that represents the range min..max. 328 * A given score is indexed in histogram array as score-min. 329 * The AddToHistogram() function deals with dynamically 330 * resizing the histogram array when necessary. 331 */ 332 struct histogram_s { 333 int *histogram; /* counts of hits */ 334 int min; /* elem 0 of histogram == min */ 335 int max; /* last elem of histogram == max */ 336 int highscore; /* highest active elem has this score */ 337 int lowscore; /* lowest active elem has this score */ 338 int lumpsize; /* when resizing, overalloc by this */ 339 int total; /* total # of hits counted */ 340 341 float *expect; /* expected counts of hits */ 342 int fit_type; /* flag indicating distribution type */ 343 float param[3]; /* parameters used for fits */ 344 float chisq; /* chi-squared val for goodness of fit*/ 345 float chip; /* P value for chisquared */ 346 }; 347 #define HISTFIT_NONE 0 /* no fit done yet */ 348 #define HISTFIT_EVD 1 /* fit type = extreme value dist */ 349 #define HISTFIT_GAUSSIAN 2 /* fit type = Gaussian */ 350 #define EVD_MU 0 /* EVD fit parameter mu */ 351 #define EVD_LAMBDA 1 /* EVD fit parameter lambda */ 352 #define EVD_WONKA 2 /* EVD fit fudge factor */ 353 #define GAUSS_MEAN 0 /* Gaussian parameter mean */ 354 #define GAUSS_SD 1 /* Gaussian parameter std. dev. */ 355 356 /* Structure: fancyali_s 357 * 358 * Alignment of a hit to an HMM, for printing. 359 */ 360 struct fancyali_s { 361 char *rfline; /* reference coord info */ 362 char *csline; /* consensus structure info */ 363 char *model; /* aligned query consensus sequence */ 364 char *mline; /* "identities", conservation +'s, etc. */ 365 char *aseq; /* aligned target sequence */ 366 int len; /* length of strings */ 367 char *query; /* name of query HMM */ 368 char *target; /* name of target sequence */ 369 int sqfrom; /* start position on sequence (1..L) */ 370 int sqto; /* end position on sequence (1..L) */ 371 }; 372 373 /* Structure: hit_s 374 * 375 * Info about a high-scoring database hit. 376 * We keep this info in memory, so we can output a 377 * sorted list of high hits at the end. 378 * 379 * sqfrom and sqto are the coordinates that will be shown 380 * in the results, not coords in arrays... therefore, reverse 381 * complements have sqfrom > sqto 382 */ 383 struct hit_s { 384 double sortkey; /* number to sort by; big is better */ 385 float score; /* score of the hit */ 386 double pvalue; /* P-value of the hit */ 387 float mothersc; /* score of whole sequence */ 388 double motherp; /* P-value of whole sequence */ 389 char *name; /* name of the target */ 390 char *acc; /* accession of the target */ 391 char *desc; /* description of the target */ 392 int sqfrom; /* start position in seq (1..N) */ 393 int sqto; /* end position in seq (1..N) */ 394 int sqlen; /* length of sequence (N) */ 395 int hmmfrom; /* start position in HMM (1..M) */ 396 int hmmto; /* end position in HMM (1..M) */ 397 int hmmlen; /* length of HMM (M) */ 398 int domidx; /* index of this domain */ 399 int ndom; /* total # of domains in this seq */ 400 struct fancyali_s *ali; /* ptr to optional alignment info */ 401 }; 402 403 404 /* Structure: tophit_s 405 * 406 * Array of high scoring hits, suitable for efficient sorting 407 * when we prepare to output results. "hit" list is NULL and 408 * unavailable until after we do a sort. 409 */ 410 struct tophit_s { 411 struct hit_s **hit; /* array of ptrs to top scoring hits */ 412 struct hit_s *unsrt; /* unsorted array */ 413 int alloc; /* current allocation size */ 414 int num; /* number of hits in list now */ 415 int lump; /* allocation lumpsize */ 416 }; 417 418 /* struct threshold_s 419 * Contains score/evalue threshold settings. 420 * 421 * made first for hmmpfam: 422 * Since we're going to loop over all HMMs in a Pfam (or pfam-like) 423 * database in main_loop_{serial,pvm}, and we're going to 424 * allow autocutoffs using Pfam GA, NC, TC lines, we will need 425 * to reset those cutoffs with each HMM in turn. Therefore the 426 * main loops need to know whether they're supposed to be 427 * doing autocutoff. This amount of info was unwieldy enough 428 * to pass through the argument list that I put it 429 * in a structure. 430 */ 431 enum CUT{ CUT_NONE, CUT_GA, CUT_NC, CUT_TC }; 432 struct threshold_s { 433 float globT; /* T parameter: keep only hits > globT bits */ 434 double globE; /* E parameter: keep hits < globE E-value */ 435 float domT; /* T parameter for individual domains */ 436 double domE; /* E parameter for individual domains */ 437 /* autosetting of cutoffs using Pfam annot: */ 438 CUT autocut; 439 int Z; /* nseq to base E value calculation on */ 440 }; 441 442 443 /**************************************************** 444 * Single sequence information 445 ****************************************************/ 446 #define SQINFO_NAMELEN 64 447 #define SQINFO_DESCLEN 128 448 449 struct seqinfo_s { 450 int flags; /* what extra data are available */ 451 char name[SQINFO_NAMELEN];/* up to 63 characters of name */ 452 char id[SQINFO_NAMELEN]; /* up to 63 char of database identifier */ 453 char acc[SQINFO_NAMELEN]; /* up to 63 char of database accession # */ 454 char desc[SQINFO_DESCLEN];/* up to 127 char of description */ 455 int len; /* length of this seq */ 456 int start; /* (1..len) start position on source seq */ 457 int stop; /* (1..len) end position on source seq */ 458 int olen; /* original length of source seq */ 459 int type; /* kRNA, kDNA, kAmino, or kOther */ 460 char *ss; /* 0..len-1 secondary structure string */ 461 char *sa; /* 0..len-1 % side chain surface access. */ 462 }; 463 typedef struct seqinfo_s SQINFO; 464 465 #define SQINFO_NAME (1 << 0) 466 #define SQINFO_ID (1 << 1) 467 #define SQINFO_ACC (1 << 2) 468 #define SQINFO_DESC (1 << 3) 469 #define SQINFO_START (1 << 4) 470 #define SQINFO_STOP (1 << 5) 471 #define SQINFO_LEN (1 << 6) 472 #define SQINFO_TYPE (1 << 7) 473 #define SQINFO_OLEN (1 << 8) 474 #define SQINFO_SS (1 << 9) 475 #define SQINFO_SA (1 << 10) 476 477 478 //alphabet 479 #define NUCLEOTIDES "ACGTUNRYMKSWHBVDacgtunrymkswhbvd" 480 #define AMINO_ALPHABET "ACDEFGHIKLMNPQRSTVWY" 481 #define DNA_ALPHABET "ACGT" 482 #define RNA_ALPHABET "ACGU" 483 #define WHITESPACE " \t\n" 484 485 486 /* an idiom for determining a symbol's position in the array 487 * by pointer arithmetic. 488 * does no error checking, so caller must already be damned sure x is 489 * valid in the alphabet! 490 */ 491 #define SYMIDX(al, x) (strchr(al->Alphabet, (x)) - al->Alphabet) 492 493 /* The symbol alphabet. 494 * Must deal with IUPAC degeneracies. Nondegenerate symbols 495 * come first in Alphabet[], followed by degenerate symbols. 496 * Nucleic alphabet also must deal with other common symbols 497 * like U (in RNA) and X (often misused for N). 498 * Example: 499 * Nucleic: "ACGTUNRYMKSWHBVDX" size=4 iupac=17 500 * Amino: "ACDEFGHIKLMNPQRSTVWYBZX" size=20 iupac=23 501 * 502 * Parts of the code assume that the last symbol is a 503 * symbol for an unknown residue, i.e. 'X'. 504 * 505 * MAXCODE and MAXABET constants are defined in config.h 506 */ 507 #define hmmNOTSETYET 0 508 #define hmmNUCLEIC 2 /* compatibility with squid's kRNA */ 509 #define hmmAMINO 3 /* compatibility with squid's kAmino */ 510 511 512 struct alphabet_s { 513 int Alphabet_type; /* hmmNUCLEIC or hmmAMINO */ 514 int Alphabet_size; /* uniq alphabet size: 4 or 20 */ 515 int Alphabet_iupac; /* total size of alphabet + IUPAC degen. */ 516 517 char Alphabet[MAXCODE+1]; /* ACGT, for instance */ 518 char Degenerate[MAXCODE][MAXABET]; /* 1/0 arrays, for whether IUPAC code includes a residue */ 519 int DegenCount[MAXCODE]; 520 521 alphabet_s(); 522 }; 523 524 525 /**************************************************** 526 * Cluster analysis and phylogenetic tree support 527 ****************************************************/ 528 529 /* struct phylo_s - a phylogenetic tree 530 * 531 * For N sequences, there will generally be an array of 0..N-2 532 * phylo_s structures representing the nodes of a tree. 533 * [0] is the root. The indexes of left and 534 * right children are somewhat confusing so be careful. The 535 * indexes can have values of 0..2N-2. If they are 0..N-1, they 536 * represent pointers to individual sequences. If they are 537 * >= N, they represent pointers to a phylo_s structure 538 * at (index - N). 539 */ 540 struct phylo_s { 541 int parent; /* index of parent, N..2N-2, or -1 for root */ 542 int left; /* index of one of the branches, 0..2N-2 */ 543 int right; /* index of other branch, 0..2N-2 */ 544 float diff; /* difference score between seqs */ 545 float lblen; /* left branch length */ 546 float rblen; /* right branch length */ 547 char *is_in; /* 0..N-1 flag array, 1 if seq included */ 548 int incnum; /* number of seqs included at this node */ 549 }; 550 551 enum clust_strategy { CLUSTER_MEAN, CLUSTER_MAX, CLUSTER_MIN }; 552 553 /* Strategies for cluster analysis; cluster by mean distance, 554 * minimum distance, or maximum distance. 555 */ 556 557 558 559 /* Binary encoding of the IUPAC code for nucleotides 560 * 561 * four-bit "word", permitting rapid degenerate matching 562 * A C G T/U 563 * 0 0 1 0 564 */ 565 #define NTA 8 566 #define NTC 4 567 #define NTG 2 568 #define NTT 1 569 #define NTU 1 570 #define NTN 15 /* A|C|G|T */ 571 #define NTR 10 /* A|G */ 572 #define NTY 5 /* C|T */ 573 #define NTM 12 /* A|C */ 574 #define NTK 3 /* G|T */ 575 #define NTS 6 /* C|G */ 576 #define NTW 9 /* A|T */ 577 #define NTH 13 /* A|C|T */ 578 #define NTB 7 /* C|G|T */ 579 #define NTV 14 /* A|C|G */ 580 #define NTD 11 /* A|G|T */ 581 #define NTGAP 16 /* GAP */ 582 #define NTEND 0 /* null string terminator */ 583 584 585 /**************************************************** 586 * Sequence alphabet: see also iupac.c 587 ****************************************************/ 588 /* IUPAC symbols defined globally in iupac.c */ 589 struct iupactype { iupactypeiupactype590 iupactype(char _sym, char _symcomp, char _code, char _comp): sym(_sym), symcomp(_symcomp), code(_code), comp(_comp) {} 591 char sym; /* character representation */ 592 char symcomp; /* complement (regular char */ 593 char code; /* my binary rep */ 594 char comp; /* binary encoded complement */ 595 }; 596 597 extern struct iupactype iupac[]; 598 extern float dnafq[]; /* nucleotide occurrence frequencies */ 599 extern float aafq[]; /* amino acid occurrence frequencies */ 600 601 602 603 #define SQDCONST_E 2.71828182845904523536028747135 604 #define SQDCONST_PI 3.14159265358979323846264338328 605 606 /* must declare swapfoo to use SWAP() */ 607 #define SWAP(a,b) {swapfoo = b; b = a; a = swapfoo;} 608 #define ScalarsEqual(a,b) (fabs((a)-(b)) < 1e-7) 609 610 #ifndef MIN 611 #define MIN(a,b) (((a)<(b))?(a):(b)) 612 #endif 613 #ifndef MAX 614 #define MAX(a,b) (((a)>(b))?(a):(b)) 615 #endif 616 617 #define isgap(c) ((c) == ' ' || (c) == '.' || (c) == '_' || (c) == '-' || (c) == '~') 618 619 /* For convenience and (one hopes) clarity in boolean tests: 620 */ 621 #ifndef TRUE 622 #define TRUE 1 623 #endif 624 #ifndef FALSE 625 #define FALSE 0 626 #endif 627 628 629 630 /* The following constants define the Pfam/Rfam cutoff set we'll propagate 631 * from msa's into HMMER and Infernal models. 632 */ 633 #define MSA_CUTOFF_TC1 0 634 #define MSA_CUTOFF_TC2 1 635 #define MSA_CUTOFF_GA1 2 636 #define MSA_CUTOFF_GA2 3 637 #define MSA_CUTOFF_NC1 4 638 #define MSA_CUTOFF_NC2 5 639 #define MSA_MAXCUTOFFS 6 640 641 /* Structure: MSA 642 * SRE, Tue May 18 11:33:08 1999 643 * 644 * Our object for a multiple sequence alignment. 645 */ 646 typedef struct msa_struct { 647 /* Mandatory information associated with the alignment. 648 */ 649 char **aseq; /* the alignment itself, [0..nseq-1][0..alen-1] */ 650 char **sqname; /* names of sequences, [0..nseq-1][0..alen-1] */ 651 float *wgt; /* sequence weights [0..nseq-1] */ 652 int alen; /* length of alignment (columns) */ 653 int nseq; /* number of seqs in alignment */ 654 655 /* Optional information that we understand, and might have. 656 */ 657 int flags; /* flags for what optional info is valid */ 658 int type; /* kOtherSeq, kRNA/hmmNUCLEIC, or kAmino/hmmAMINO */ 659 char *name; /* name of alignment, or NULL */ 660 char *desc; /* description of alignment, or NULL */ 661 char *acc; /* accession of alignment, or NULL */ 662 char *au; /* "author" information, or NULL */ 663 char *ss_cons; /* consensus secondary structure string, or NULL */ 664 char *sa_cons; /* consensus surface accessibility string, or NULL */ 665 char *rf; /* reference coordinate system, or NULL */ 666 char **sqacc; /* accession numbers for individual sequences */ 667 char **sqdesc; /* description lines for individual sequences */ 668 char **ss; /* per-seq secondary structure annotation, or NULL */ 669 char **sa; /* per-seq surface accessibility annotation, or NULL */ 670 float cutoff[MSA_MAXCUTOFFS]; /* NC, TC, GA cutoffs propagated to Pfam/Rfam */ 671 int cutoff_is_set[MSA_MAXCUTOFFS];/* TRUE if a cutoff is set; else FALSE */ 672 } MSA; 673 674 #define MSA_SET_WGT (1 << 0) /* track whether wgts were set, or left at default 1.0 */ 675 676 677 678 /* Structure: p7prior_s 679 * 680 * Dirichlet priors on HMM parameters. 681 */ 682 struct p7prior_s { 683 int strategy; /* PRI_DCHLET, etc. */ 684 685 int tnum; /* number of transition Dirichlet mixtures */ 686 float tq[MAXDCHLET]; /* probabilities of tnum components */ 687 float t[MAXDCHLET][7]; /* transition terms per mix component */ 688 689 int mnum; /* number of mat emission Dirichlet mixtures */ 690 float mq[MAXDCHLET]; /* probabilities of mnum components */ 691 float m[MAXDCHLET][MAXABET]; /* match emission terms per mix component */ 692 693 int inum; /* number of insert emission Dirichlet mixes */ 694 float iq[MAXDCHLET]; /* probabilities of inum components */ 695 float i[MAXDCHLET][MAXABET]; /* insert emission terms */ 696 }; 697 #define PRI_DCHLET 0 /* simple or mixture Dirichlets */ 698 #define PRI_PAM 1 /* PAM prior hack */ 699 700 701 struct HMMException { 702 HMMException(const char *err); 703 char error[1024]; 704 }; 705 706 struct HMMERTaskLocalData { 707 // 708 alphabet_s al; 709 //sre_random 710 int sre_randseed; 711 long rnd1; /* random number from LCG1 */ 712 long rnd2; /* random number from LCG2 */ 713 long rnd; /* random number we return */ 714 long tbl[64]; /* table for Bays/Durham shuffle */ 715 //prob2ascii 716 char buffer[8]; 717 //Gaussrandom 718 long i; 719 double snorm,u,s,ustar,aa,w,y,tt; 720 // 721 HMMERTaskLocalData(); 722 }; 723 724 #endif /* STRUCTSH_INCLUDED */ 725