1 /************************************************************
2  * HMMER - Biological sequence analysis with profile HMMs
3  * Copyright (C) 1992-2003 Washington University School of Medicine
4  * All Rights Reserved
5  *
6  *     This source code is distributed under the terms of the
7  *     GNU General Public License. See the files COPYING and LICENSE
8  *     for details.
9  ************************************************************/
10 
11 /* structs.h
12  *
13  * Data structures used in HMMER.
14  * Also, a few miscellaneous macros and global variable declarations.
15  *
16  * RCS $Id: structs.h,v 1.28 2003/10/01 13:06:05 eddy Exp $
17  */
18 
19 #ifndef STRUCTSH_INCLUDED
20 #define STRUCTSH_INCLUDED
21 
22 #include "config.h"
23 
24 
25 /**********************************************************************
26  *
27  * Plan7
28  * Implementation of the new Plan7 HMM architecture.
29  * Fully probabilistic even for hmmsw, hmmls, and hmmfs;
30  * No insert->delete or delete->insert transitions;
31  * Improved structure layout.
32  *
33  * The strategy is to infiltrate plan7 code into HMMER in
34  * an evolutionary rather than revolutionary manner.
35  *
36  **********************************************************************/
37 
38 /* Plan 7 construction strategies.
39  */
40 enum p7_construction {
41   P7_MAP_CONSTRUCTION,      /* maximum a posteriori architecture */
42   P7_HAND_CONSTRUCTION,     /* hand specified architecture       */
43   P7_FAST_CONSTRUCTION      /* fast ad hoc architecture          */
44 };
45 
46 /* Plan 7 parameter optimization strategies
47  */
48 enum p7_param {
49   P7_MAP_PARAM,         /* standard maximum a posteriori    */
50   P7_MD_PARAM,          /* maximum discrimination           */
51   P7_MRE_PARAM,         /* maximum relative entropy         */
52   P7_WMAP_PARAM         /* ad hoc weighted MAP              */
53 };
54 
55 /* Structure: plan7_s
56  *
57  * Declaration of a Plan 7 profile-HMM.
58  */
59 struct plan7_s {
60   /* Annotation on the model. A name is mandatory.
61    * Other fields are optional; whether they are present is
62    * flagged in the stateflags bit array.
63    *
64    * desc is only valid if PLAN7_DESC is set in flags.
65    *  acc is only valid if PLAN7_ACC is set in flags.
66    *   rf is only valid if PLAN7_RF is set in flags.
67    *   cs is only valid if PLAN7_CS is set in flags.
68    *   ca is only valid if PLAN7_CA is set in flags.
69    *  map is only valid if PLAN7_MAP is set in flags.
70    */
71   char  *name;                  /* name of the model                    +*/
72   char  *acc;           /* accession number of model (Pfam)     +*/
73   char  *desc;                  /* brief description of model           +*/
74   char  *rf;                    /* reference line from alignment 0..M   +*/
75   char  *cs;                    /* consensus structure line      0..M   +*/
76   char  *ca;            /* consensus accessibility line  0..M    */
77   char  *comlog;        /* command line(s) that built model     +*/
78   int    nseq;          /* number of training sequences         +*/
79   char  *ctime;         /* creation date                        +*/
80   int   *map;           /* map of alignment cols onto model 1..M+*/
81   int    checksum;              /* checksum of training sequences       +*/
82 
83   /* The following are annotations added to support work by Michael Asman,
84    * CGR Stockholm. They are not stored in model files; they are only
85    * used in model construction.
86    *
87    * #=GC X-PRM (PRT,PRI) annotation is picked up by hmmbuild and interpreted
88    * as specifying which mixture Dirichlet component to use. If these flags
89    * are non-NULL, the normal mixture Dirichlet code is bypassed, and a
90    * single specific Dirichlet is used at each position.
91    */
92   int   *tpri;                  /* which transition mixture prior to use */
93   int   *mpri;                  /* which match mixture prior to use */
94   int   *ipri;                  /* which insert mixture prior to use */
95 
96   /* Pfam-specific score cutoffs.
97    *
98    * ga1, ga2 are valid if PLAN7_GA is set in flags.
99    * tc1, tc2 are valid if PLAN7_TC is set in flags.
100    * nc1, nc2 are valid if PLAN7_NC is set in flags.
101    */
102   float  ga1, ga2;      /* per-seq/per-domain gathering thresholds (bits) +*/
103   float  tc1, tc2;      /* per-seq/per-domain trusted cutoff (bits)       +*/
104   float  nc1, nc2;      /* per-seq/per-domain noise cutoff (bits)         +*/
105 
106   /* The main model in probability form: data-dependent probabilities.
107    * This is the core Krogh/Haussler model.
108    * Transition probabilities are usually accessed as a
109    *   two-D array: hmm->t[k][TMM], for instance. They are allocated
110    *   such that they can also be stepped through in 1D by pointer
111    *   manipulations, for efficiency in DP algorithms.
112    */
113   int     M;                    /* length of the model (# nodes)        +*/
114   float **t;                    /* transition prob's. t[1..M-1][0..6]   +*/
115   float **mat;                  /* match emissions.  mat[1..M][0..19]   +*/
116   float **ins;                  /* insert emissions. ins[1..M-1][0..19] +*/
117   float   tbd1;         /* B->D1 prob (data dependent)          +*/
118 
119   /* The unique states of Plan 7 in probability form.
120    * These are the algorithm-dependent, data-independent probabilities.
121    * Some parts of the code may briefly use a trick of copying tbd1
122    * into begin[0]; this makes it easy to call FChoose() or FNorm()
123    * on the resulting vector. However, in general begin[0] is not
124    * a valid number.
125    */
126   float  xt[4][2];              /* N,E,C,J extra states: 2 transitions      +*/
127   float *begin;                 /* 1..M B->M state transitions              +*/
128   float *end;                   /* 1..M M->E state transitions (!= a dist!) +*/
129 
130   /* The null model probabilities.
131    */
132   float  null[MAXABET];         /* "random sequence" emission prob's     +*/
133   float  p1;                    /* null model loop probability           +*/
134 
135   /* The model in log-odds score form.
136    * These are created from the probabilities by LogoddsifyHMM().
137    * By definition, null[] emission scores are all zero.
138    * Note that emission distributions are over 26 upper-case letters,
139    * not just the unambiguous protein or DNA alphabet: we
140    * precalculate the scores for all IUPAC degenerate symbols we
141    * may see. Non-IUPAC symbols simply have a -INFTY score.
142    *
143    * Note the reversed indexing on msc, isc, tsc -- for efficiency reasons.
144    * They're not probability vectors any more so we can reorder them
145    * without wildly complicating our life.
146    *
147    * The _mem ptrs are where the real memory is alloc'ed and free'd,
148    * as opposed to where it is accessed.
149    * This came in with Erik Lindahl's altivec port; it allows alignment on
150    * 16-byte boundaries. In the non-altivec code, this is just a little
151    * redundancy; tsc and tsc_mem point to the same thing, for example.
152    *
153    * Only valid if PLAN7_HASBITS is set.
154    */
155   int  **tsc;                   /* transition scores     [0.6][1.M-1]       -*/
156   int  **msc;                   /* match emission scores [0.MAXCODE-1][1.M] -*/
157   int  **isc;                   /* ins emission scores [0.MAXCODE-1][1.M-1] -*/
158   int    xsc[4][2];             /* N,E,C,J transitions                      -*/
159   int   *bsc;                   /* begin transitions     [1.M]              -*/
160   int   *esc;           /* end transitions       [1.M]              -*/
161   int  *tsc_mem, *msc_mem, *isc_mem, *bsc_mem, *esc_mem;
162 
163   /* DNA translation scoring parameters
164    * For aligning protein Plan7 models to DNA sequence.
165    * Lookup value for a codon is calculated by pos1 * 16 + pos2 * 4 + pos3,
166    * where 'pos1' is the digitized value of the first nucleotide position;
167    * if any of the positions are ambiguous codes, lookup value 64 is used
168    * (which will generally have a score of zero)
169    *
170    * Only valid if PLAN7_HASDNA is set.
171    */
172   int  **dnam;                  /* triplet match scores  [0.64][1.M]       -*/
173   int  **dnai;                  /* triplet insert scores [0.64][1.M]       -*/
174   int    dna2;          /* -1 frameshift, doublet emission, M or I -*/
175   int    dna4;          /* +1 frameshift, doublet emission, M or I -*/
176 
177   /* P-value and E-value statistical parameters
178    * Only valid if PLAN7_STATS is set.
179    */
180   float  mu;            /* EVD mu       +*/
181   float  lambda;        /* EVD lambda   +*/
182 
183   int flags;            // bit flags indicating state of HMM, valid data
184   int atype;            // alphabet type
185 };
186 
187 /* Flags for plan7->flags.
188  * Note: Some models have scores but no probabilities (for instance,
189  *       after reading from an HMM save file). Other models have
190  *       probabilities but no scores (for instance, during training
191  *       or building). Since it costs time to convert either way,
192  *       I use PLAN7_HASBITS and PLAN7_HASPROB flags to defer conversion
193  *       until absolutely necessary. This means I have to be careful
194  *       about keeping these flags set properly when I fiddle a model.
195  */
196 #define PLAN7_HASBITS (1<<0)    /* raised if model has log-odds scores      */
197 #define PLAN7_DESC    (1<<1)    /* raised if description exists             */
198 #define PLAN7_RF      (1<<2)    /* raised if #RF annotation available       */
199 #define PLAN7_CS      (1<<3)    /* raised if #CS annotation available       */
200 #define PLAN7_XRAY    (1<<4)    /* raised if structural data available      */
201 #define PLAN7_HASPROB (1<<5)    /* raised if model has probabilities        */
202 #define PLAN7_HASDNA  (1<<6)    /* raised if protein HMM->DNA seq params set*/
203 #define PLAN7_STATS   (1<<7)    /* raised if EVD parameters are available   */
204 #define PLAN7_MAP     (1<<8)    /* raised if alignment map is available     */
205 #define PLAN7_ACC     (1<<9)    /* raised if accession number is available  */
206 #define PLAN7_GA      (1<<10)   /* raised if gathering thresholds available */
207 #define PLAN7_TC      (1<<11)   /* raised if trusted cutoffs available      */
208 #define PLAN7_NC      (1<<12)   /* raised if noise cutoffs available        */
209 #define PLAN7_CA      (1<<13)   /* raised if surface accessibility avail.   */
210 
211 /* Indices for special state types, I: used for dynamic programming xmx[][]
212  * mnemonic: eXtra Matrix for B state = XMB
213  */
214 #define XMB 0
215 #define XME 1
216 #define XMC 2
217 #define XMJ 3
218 #define XMN 4
219 
220 /* Indices for special state types, II: used for hmm->xt[] indexing
221  * mnemonic: eXtra Transition for N state = XTN
222  */
223 #define XTN  0
224 #define XTE  1
225 #define XTC  2
226 #define XTJ  3
227 
228 /* Indices for Plan7 main model state transitions.
229  * Used for indexing hmm->t[k][]
230  * mnemonic: Transition from Match to Match = TMM
231  */
232 #define TMM  0
233 #define TMI  1
234 #define TMD  2
235 #define TIM  3
236 #define TII  4
237 #define TDM  5
238 #define TDD  6
239 
240 /* Indices for extra state transitions
241  * Used for indexing hmm->xt[][].
242  */
243 #define MOVE 0          /* trNB, trEC, trCT, trJB */
244 #define LOOP 1          /* trNN, trEJ, trCC, trJJ */
245 
246 /* Declaration of Plan7 dynamic programming matrix structure.
247  */
248 struct dpmatrix_s {
249   int **xmx;            /* special scores [0.1..N][BECJN]     */
250   int **mmx;            /* match scores [0.1..N][0.1..M]      */
251   int **imx;            /* insert scores [0.1..N][0.1..M-1.M] */
252   int **dmx;            /* delete scores [0.1..N][0.1..M-1.M] */
253 
254   /* Hidden ptrs where the real memory is kept; this trick was
255    * introduced by Erik Lindahl with the Altivec port; it's used to
256    * align xmx, etc. on 16-byte boundaries for cache optimization.
257    */
258   void *xmx_mem, *mmx_mem, *imx_mem, *dmx_mem;
259 
260   /* The other trick brought in w/ the Lindahl Altivec port; dp matrix
261    * is retained and grown, rather than ReallocOrDieated for every HMM or sequence.
262    * Keep track of current allocated-for size in rows (sequence length N)
263    * and columns (HMM length M). Also keep track of pad sizes: how much
264    * we should overallocate rows or columns when we ReallocOrDieate. If pad = 0,
265    * then we're not growable in this dimension.
266    */
267   int maxN;         /* alloc'ed for seq of length N; N+1 rows */
268   int maxM;         /* alloc'ed for HMM of length M; M+1 cols */
269 
270   int padN;         /* extra pad in sequence length/rows */
271   int padM;         /* extra pad in HMM length/columns   */
272 };
273 
274 /* Declaration of Plan7 shadow matrix structure.
275  * In general, allowed values are STM, STI, etc.
276  * However, E state has M possible sources, from 1..M match states;
277  * hence the esrc array.
278  */
279 struct dpshadow_s {
280   char **xtb;           /* special state traces [0.1..N][BECJN]     */
281   char **mtb;           /* match state traces [0.1..N][0.1..M]      */
282   char **itb;           /* insert state traces [0.1..N][0.1..M-1.M] */
283   char **dtb;           /* delete state traces [0.1..N][0.1..M-1.M] */
284   int   *esrc;                  /* E trace is special; must store a M state number 1..M */
285 };
286 
287 
288 /* Plan 7 model state types
289  * used in traceback structure
290  */
291 #define STBOGUS 0
292 #define STM     1
293 #define STD     2
294 #define STI     3
295 #define STS     4
296 #define STN     5
297 #define STB     6
298 #define STE     7
299 #define STC     8
300 #define STT     9
301 #define STJ     10
302 
303 /* Structure: p7trace_s
304  *
305  * Traceback structure for alignments of model to sequence.
306  * Each array in a trace_s is 0..tlen-1.
307  * Element 0 is always to STATE_S. Element tlen-1 is always to STATE_T.
308  */
309 struct p7trace_s {
310   int   tlen;                   /* length of traceback                          */
311   char *statetype;              /* state type used for alignment                */
312   int  *nodeidx;                /* index of aligned node, 1..M (if M,D,I), or 0 */
313   int  *pos;                    /* position in dsq, 1..L, or 0 if none          */
314 };
315 
316 
317 /**********************************************************************
318 * Other structures, not having to do with HMMs.
319 **********************************************************************/
320 
321 /* Structure: histogram_s
322 *
323 * Keep a score histogram.
324 *
325 * The main implementation issue here is that the range of
326 * scores is unknown, and will go negative. histogram is
327 * a 0..max-min array that represents the range min..max.
328 * A given score is indexed in histogram array as score-min.
329 * The AddToHistogram() function deals with dynamically
330 * resizing the histogram array when necessary.
331 */
332 struct histogram_s {
333     int *histogram;       /* counts of hits                     */
334     int  min;         /* elem 0 of histogram == min         */
335     int  max;                     /* last elem of histogram == max      */
336     int  highscore;       /* highest active elem has this score */
337     int  lowscore;        /* lowest active elem has this score  */
338     int  lumpsize;        /* when resizing, overalloc by this   */
339     int  total;           /* total # of hits counted            */
340 
341     float *expect;        /* expected counts of hits            */
342     int    fit_type;      /* flag indicating distribution type  */
343     float  param[3];      /* parameters used for fits           */
344     float  chisq;         /* chi-squared val for goodness of fit*/
345     float  chip;          /* P value for chisquared             */
346 };
347 #define HISTFIT_NONE     0  /* no fit done yet               */
348 #define HISTFIT_EVD      1  /* fit type = extreme value dist */
349 #define HISTFIT_GAUSSIAN 2  /* fit type = Gaussian           */
350 #define EVD_MU       0  /* EVD fit parameter mu          */
351 #define EVD_LAMBDA       1  /* EVD fit parameter lambda      */
352 #define EVD_WONKA        2      /* EVD fit fudge factor          */
353 #define GAUSS_MEAN       0  /* Gaussian parameter mean       */
354 #define GAUSS_SD         1  /* Gaussian parameter std. dev.  */
355 
356 /* Structure: fancyali_s
357 *
358 * Alignment of a hit to an HMM, for printing.
359 */
360 struct fancyali_s {
361     char *rfline;                 /* reference coord info                 */
362     char *csline;                 /* consensus structure info             */
363     char *model;                  /* aligned query consensus sequence     */
364     char *mline;                  /* "identities", conservation +'s, etc. */
365     char *aseq;                   /* aligned target sequence              */
366     int   len;            /* length of strings                    */
367     char *query;          /* name of query HMM                    */
368     char *target;         /* name of target sequence              */
369     int   sqfrom;         /* start position on sequence (1..L)    */
370     int   sqto;               /* end position on sequence   (1..L)    */
371 };
372 
373 /* Structure: hit_s
374 *
375 * Info about a high-scoring database hit.
376 * We keep this info in memory, so we can output a
377 * sorted list of high hits at the end.
378 *
379 * sqfrom and sqto are the coordinates that will be shown
380 * in the results, not coords in arrays... therefore, reverse
381 * complements have sqfrom > sqto
382 */
383 struct hit_s {
384     double sortkey;       /* number to sort by; big is better */
385     float  score;         /* score of the hit                 */
386     double pvalue;        /* P-value of the hit               */
387     float  mothersc;      /* score of whole sequence          */
388     double motherp;       /* P-value of whole sequence        */
389     char   *name;         /* name of the target               */
390     char   *acc;          /* accession of the target          */
391     char   *desc;         /* description of the target        */
392     int    sqfrom;        /* start position in seq (1..N)     */
393     int    sqto;          /* end position in seq (1..N)       */
394     int    sqlen;         /* length of sequence (N)           */
395     int    hmmfrom;       /* start position in HMM (1..M)     */
396     int    hmmto;         /* end position in HMM (1..M)       */
397     int    hmmlen;        /* length of HMM (M)                */
398     int    domidx;        /* index of this domain             */
399     int    ndom;          /* total # of domains in this seq   */
400     struct fancyali_s  *ali;  /* ptr to optional alignment info   */
401 };
402 
403 
404 /* Structure: tophit_s
405 *
406 * Array of high scoring hits, suitable for efficient sorting
407 * when we prepare to output results. "hit" list is NULL and
408 * unavailable until after we do a sort.
409 */
410 struct tophit_s {
411     struct hit_s **hit;           /* array of ptrs to top scoring hits        */
412     struct hit_s  *unsrt;         /* unsorted array                           */
413     int            alloc;     /* current allocation size                  */
414     int            num;       /* number of hits in list now               */
415     int            lump;          /* allocation lumpsize                      */
416 };
417 
418 /* struct threshold_s
419 * Contains score/evalue threshold settings.
420 *
421 * made first for hmmpfam:
422 * Since we're going to loop over all HMMs in a Pfam (or pfam-like)
423 * database in main_loop_{serial,pvm}, and we're going to
424 * allow autocutoffs using Pfam GA, NC, TC lines, we will need
425 * to reset those cutoffs with each HMM in turn. Therefore the
426 * main loops need to know whether they're supposed to be
427 * doing autocutoff. This amount of info was unwieldy enough
428 * to pass through the argument list that I put it
429 * in a structure.
430 */
431 enum CUT{ CUT_NONE, CUT_GA, CUT_NC, CUT_TC };
432 struct threshold_s {
433     float  globT;         /* T parameter: keep only hits > globT bits */
434     double globE;         /* E parameter: keep hits < globE E-value   */
435     float  domT;          /* T parameter for individual domains       */
436     double domE;          /* E parameter for individual domains       */
437     /* autosetting of cutoffs using Pfam annot: */
438     CUT autocut;
439     int   Z;          /* nseq to base E value calculation on      */
440 };
441 
442 
443 /****************************************************
444  * Single sequence information
445  ****************************************************/
446 #define SQINFO_NAMELEN 64
447 #define SQINFO_DESCLEN 128
448 
449 struct seqinfo_s {
450   int      flags;               /* what extra data are available         */
451   char     name[SQINFO_NAMELEN];/* up to 63 characters of name           */
452   char     id[SQINFO_NAMELEN];  /* up to 63 char of database identifier  */
453   char     acc[SQINFO_NAMELEN]; /* up to 63 char of database accession # */
454   char     desc[SQINFO_DESCLEN];/* up to 127 char of description         */
455   int      len;                 /* length of this seq                    */
456   int      start;       /* (1..len) start position on source seq */
457   int      stop;                /* (1..len) end position on source seq   */
458   int      olen;                /* original length of source seq         */
459   int      type;                /* kRNA, kDNA, kAmino, or kOther         */
460   char    *ss;                  /* 0..len-1 secondary structure string   */
461   char    *sa;          /* 0..len-1 % side chain surface access. */
462 };
463 typedef struct seqinfo_s SQINFO;
464 
465 #define SQINFO_NAME  (1 << 0)
466 #define SQINFO_ID    (1 << 1)
467 #define SQINFO_ACC   (1 << 2)
468 #define SQINFO_DESC  (1 << 3)
469 #define SQINFO_START (1 << 4)
470 #define SQINFO_STOP  (1 << 5)
471 #define SQINFO_LEN   (1 << 6)
472 #define SQINFO_TYPE  (1 << 7)
473 #define SQINFO_OLEN  (1 << 8)
474 #define SQINFO_SS    (1 << 9)
475 #define SQINFO_SA    (1 << 10)
476 
477 
478 //alphabet
479 #define NUCLEOTIDES    "ACGTUNRYMKSWHBVDacgtunrymkswhbvd"
480 #define AMINO_ALPHABET "ACDEFGHIKLMNPQRSTVWY"
481 #define DNA_ALPHABET   "ACGT"
482 #define RNA_ALPHABET   "ACGU"
483 #define WHITESPACE     " \t\n"
484 
485 
486 /* an idiom for determining a symbol's position in the array
487 * by pointer arithmetic.
488 * does no error checking, so caller must already be damned sure x is
489 * valid in the alphabet!
490 */
491 #define SYMIDX(al, x)   (strchr(al->Alphabet, (x)) - al->Alphabet)
492 
493 /* The symbol alphabet.
494 * Must deal with IUPAC degeneracies. Nondegenerate symbols
495 * come first in Alphabet[], followed by degenerate symbols.
496 * Nucleic alphabet also must deal with other common symbols
497 * like U (in RNA) and X (often misused for N).
498 * Example:
499 *   Nucleic: "ACGTUNRYMKSWHBVDX"          size=4  iupac=17
500 *   Amino:   "ACDEFGHIKLMNPQRSTVWYBZX"    size=20 iupac=23
501 *
502 * Parts of the code assume that the last symbol is a
503 * symbol for an unknown residue, i.e. 'X'.
504 *
505 * MAXCODE and MAXABET constants are defined in config.h
506 */
507 #define hmmNOTSETYET 0
508 #define hmmNUCLEIC   2      /* compatibility with squid's kRNA   */
509 #define hmmAMINO     3      /* compatibility with squid's kAmino */
510 
511 
512 struct alphabet_s {
513     int   Alphabet_type;       /* hmmNUCLEIC or hmmAMINO                */
514     int   Alphabet_size;       /* uniq alphabet size: 4 or 20           */
515     int   Alphabet_iupac;      /* total size of alphabet + IUPAC degen. */
516 
517     char  Alphabet[MAXCODE+1]; /* ACGT, for instance                    */
518     char  Degenerate[MAXCODE][MAXABET]; /* 1/0 arrays, for whether IUPAC code includes a residue */
519     int   DegenCount[MAXCODE];
520 
521     alphabet_s();
522 };
523 
524 
525 /****************************************************
526 * Cluster analysis and phylogenetic tree support
527 ****************************************************/
528 
529 /* struct phylo_s - a phylogenetic tree
530 *
531 * For N sequences, there will generally be an array of 0..N-2
532 * phylo_s structures representing the nodes of a tree.
533 * [0] is the root. The indexes of left and
534 * right children are somewhat confusing so be careful. The
535 * indexes can have values of 0..2N-2. If they are 0..N-1, they
536 * represent pointers to individual sequences. If they are
537 * >= N, they represent pointers to a phylo_s structure
538 * at (index - N).
539 */
540 struct phylo_s {
541     int    parent;                /* index of parent, N..2N-2, or -1 for root */
542     int    left;          /* index of one of the branches, 0..2N-2 */
543     int    right;         /* index of other branch, 0..2N-2        */
544     float  diff;          /* difference score between seqs         */
545     float  lblen;             /* left branch length                    */
546     float  rblen;                 /* right branch length                   */
547     char  *is_in;                 /* 0..N-1 flag array, 1 if seq included  */
548     int    incnum;                /* number of seqs included at this node  */
549 };
550 
551 enum clust_strategy { CLUSTER_MEAN, CLUSTER_MAX, CLUSTER_MIN };
552 
553 /* Strategies for cluster analysis; cluster by mean distance,
554 * minimum distance, or maximum distance.
555 */
556 
557 
558 
559 /* Binary encoding of the IUPAC code for nucleotides
560 *
561 *    four-bit "word", permitting rapid degenerate matching
562 *         A  C  G  T/U
563 *         0  0  1  0
564 */
565 #define NTA 8
566 #define NTC 4
567 #define NTG 2
568 #define NTT 1
569 #define NTU 1
570 #define NTN 15          /* A|C|G|T */
571 #define NTR 10          /* A|G */
572 #define NTY 5           /* C|T */
573 #define NTM 12          /* A|C */
574 #define NTK 3           /* G|T */
575 #define NTS 6           /* C|G */
576 #define NTW 9           /* A|T */
577 #define NTH 13          /* A|C|T */
578 #define NTB 7           /* C|G|T */
579 #define NTV 14          /* A|C|G */
580 #define NTD 11          /* A|G|T */
581 #define NTGAP 16        /* GAP */
582 #define NTEND 0         /* null string terminator */
583 
584 
585 /****************************************************
586 * Sequence alphabet: see also iupac.c
587 ****************************************************/
588 /* IUPAC symbols defined globally in iupac.c */
589 struct iupactype {
iupactypeiupactype590     iupactype(char _sym, char _symcomp, char _code, char _comp): sym(_sym), symcomp(_symcomp), code(_code), comp(_comp) {}
591     char       sym;       /* character representation */
592     char       symcomp;           /* complement (regular char */
593     char       code;      /* my binary rep */
594     char       comp;              /* binary encoded complement */
595 };
596 
597 extern struct iupactype iupac[];
598 extern float    dnafq[];        /* nucleotide occurrence frequencies    */
599 extern float    aafq[];     /* amino acid occurrence frequencies    */
600 
601 
602 
603 #define SQDCONST_E    2.71828182845904523536028747135
604 #define SQDCONST_PI   3.14159265358979323846264338328
605 
606                 /* must declare swapfoo to use SWAP() */
607 #define SWAP(a,b) {swapfoo = b; b = a; a = swapfoo;}
608 #define ScalarsEqual(a,b) (fabs((a)-(b)) < 1e-7)
609 
610 #ifndef MIN
611 #define MIN(a,b)         (((a)<(b))?(a):(b))
612 #endif
613 #ifndef MAX
614 #define MAX(a,b)         (((a)>(b))?(a):(b))
615 #endif
616 
617 #define isgap(c) ((c) == ' ' || (c) == '.' || (c) == '_' || (c) == '-' || (c) == '~')
618 
619 /* For convenience and (one hopes) clarity in boolean tests:
620 */
621 #ifndef TRUE
622 #define TRUE 1
623 #endif
624 #ifndef FALSE
625 #define FALSE 0
626 #endif
627 
628 
629 
630 /* The following constants define the Pfam/Rfam cutoff set we'll propagate
631  * from msa's into HMMER and Infernal models.
632  */
633 #define MSA_CUTOFF_TC1 0
634 #define MSA_CUTOFF_TC2 1
635 #define MSA_CUTOFF_GA1 2
636 #define MSA_CUTOFF_GA2 3
637 #define MSA_CUTOFF_NC1 4
638 #define MSA_CUTOFF_NC2 5
639 #define MSA_MAXCUTOFFS 6
640 
641 /* Structure: MSA
642  * SRE, Tue May 18 11:33:08 1999
643  *
644  * Our object for a multiple sequence alignment.
645 */
646 typedef struct msa_struct {
647   /* Mandatory information associated with the alignment.
648    */
649   char **aseq;                  /* the alignment itself, [0..nseq-1][0..alen-1] */
650   char **sqname;                /* names of sequences, [0..nseq-1][0..alen-1]   */
651   float *wgt;                   /* sequence weights [0..nseq-1]                 */
652   int    alen;          /* length of alignment (columns)                */
653   int    nseq;          /* number of seqs in alignment                  */
654 
655   /* Optional information that we understand, and might have.
656    */
657   int    flags;         /* flags for what optional info is valid    */
658   int    type;          /* kOtherSeq, kRNA/hmmNUCLEIC, or kAmino/hmmAMINO */
659   char  *name;              /* name of alignment, or NULL */
660   char  *desc;                  /* description of alignment, or NULL */
661   char  *acc;                   /* accession of alignment, or NULL */
662   char  *au;                /* "author" information, or NULL */
663   char  *ss_cons;       /* consensus secondary structure string, or NULL */
664   char  *sa_cons;               /* consensus surface accessibility string, or NULL */
665   char  *rf;                    /* reference coordinate system, or NULL */
666   char **sqacc;         /* accession numbers for individual sequences */
667   char **sqdesc;        /* description lines for individual sequences */
668   char **ss;                    /* per-seq secondary structure annotation, or NULL */
669   char **sa;                    /* per-seq surface accessibility annotation, or NULL */
670   float  cutoff[MSA_MAXCUTOFFS];       /* NC, TC, GA cutoffs propagated to Pfam/Rfam */
671   int    cutoff_is_set[MSA_MAXCUTOFFS];/* TRUE if a cutoff is set; else FALSE */
672 } MSA;
673 
674 #define MSA_SET_WGT     (1 << 0)  /* track whether wgts were set, or left at default 1.0 */
675 
676 
677 
678 /* Structure: p7prior_s
679 *
680 * Dirichlet priors on HMM parameters.
681 */
682 struct p7prior_s {
683     int   strategy;               /* PRI_DCHLET, etc.                          */
684 
685     int   tnum;                   /* number of transition Dirichlet mixtures   */
686     float tq[MAXDCHLET];          /* probabilities of tnum components          */
687     float t[MAXDCHLET][7];        /* transition terms per mix component        */
688 
689     int   mnum;                   /* number of mat emission Dirichlet mixtures */
690     float mq[MAXDCHLET];          /* probabilities of mnum components          */
691     float m[MAXDCHLET][MAXABET];  /* match emission terms per mix component    */
692 
693     int   inum;           /* number of insert emission Dirichlet mixes */
694     float iq[MAXDCHLET];      /* probabilities of inum components          */
695     float i[MAXDCHLET][MAXABET];  /* insert emission terms                     */
696 };
697 #define PRI_DCHLET 0            /* simple or mixture Dirichlets   */
698 #define PRI_PAM    1        /* PAM prior hack                 */
699 
700 
701 struct HMMException {
702     HMMException(const char *err);
703     char error[1024];
704 };
705 
706 struct HMMERTaskLocalData {
707     //
708 	alphabet_s	al;
709     //sre_random
710 	int			sre_randseed;
711 	long		rnd1;		/* random number from LCG1 */
712     long		rnd2;            /* random number from LCG2 */
713     long		rnd;             /* random number we return */
714     long		tbl[64];		/* table for Bays/Durham shuffle */
715     //prob2ascii
716 	char        buffer[8];
717     //Gaussrandom
718     long i;
719     double snorm,u,s,ustar,aa,w,y,tt;
720     //
721 	HMMERTaskLocalData();
722 };
723 
724 #endif /* STRUCTSH_INCLUDED */
725