1 /* The all-encompassing include file for HMMER.
2  * All-encompassing because there's a lot of crossdependency.
3  * There's some opportunity for modularity, but not a lot.
4  *
5  *    1. P7_HMM:         a core model.
6  *    2. P7_PROFILE:     a scoring profile, and its implicit model.
7  *    3. P7_BG:          a null (background) model.
8  *    4. P7_TRACE:       a traceback path (alignment of seq to profile).
9  *    5. P7_HMMFILE:     an HMM save file or database, open for reading.
10  *    6. P7_GMX:         a "generic" dynamic programming matrix
11  *    7. P7_PRIOR:       mixture Dirichlet prior for profile HMMs
12  *    8. P7_SPENSEMBLE:  segment pair ensembles for domain locations
13  *    9. P7_ALIDISPLAY:  an alignment formatted for printing
14  *   10. P7_DOMAINDEF:   reusably managing workflow in annotating domains
15  *   11. P7_TOPHITS:     ranking lists of top-scoring hits
16  *   12. P7_SCOREDATA:     data used in diagonal recovery and extension
17  *   13. P7_HMM_WINDOW:  data used to track lists of sequence windows
18  *   14. Inclusion of the architecture-specific optimized implementation.
19  *   16. P7_PIPELINE:    H3's accelerated seq/profile comparison pipeline
20  *   17. P7_BUILDER:     configuration options for new HMM construction.
21  *   18. Declaration of functions in HMMER's exposed API.
22  *
23  * Also, see impl_{sse,vmx}/impl_{sse,vmx}.h for additional API
24  * specific to the acceleration layer; in particular, the P7_OPROFILE
25  * structure for an optimized profile.
26  */
27 #ifndef P7_HMMERH_INCLUDED
28 #define P7_HMMERH_INCLUDED
29 #include "p7_config.h"
30 
31 #include <stdio.h>		/* FILE */
32 
33 #ifdef HMMER_MPI
34 #include "mpi.h"
35 #endif
36 
37 #ifdef HMMER_THREADS
38 #include <pthread.h>
39 #endif
40 
41 #include "easel.h"
42 #include "esl_alphabet.h"	/* ESL_DSQ, ESL_ALPHABET */
43 #include "esl_dmatrix.h"	/* ESL_DMATRIX           */
44 #include "esl_getopts.h"	/* ESL_GETOPTS           */
45 #include "esl_histogram.h"      /* ESL_HISTOGRAM         */
46 #include "esl_hmm.h"	        /* ESL_HMM               */
47 #include "esl_keyhash.h"        /* ESL_KEYHASH           */
48 #include "esl_mixdchlet.h"	/* ESL_MIXDCHLET         */
49 #include "esl_msa.h"		/* ESL_MSA               */
50 #include "esl_random.h"		/* ESL_RANDOMNESS        */
51 #include "esl_rand64.h" /* ESL_RAND64 */
52 #include "esl_sq.h"		/* ESL_SQ                */
53 #include "esl_scorematrix.h"    /* ESL_SCOREMATRIX       */
54 #include "esl_stopwatch.h"      /* ESL_STOPWATCH         */
55 
56 
57 
58 /* Search modes. */
59 #define p7_NO_MODE   0
60 #define p7_LOCAL     1		/* multihit local:  "fs" mode   */
61 #define p7_GLOCAL    2		/* multihit glocal: "ls" mode   */
62 #define p7_UNILOCAL  3		/* unihit local: "sw" mode      */
63 #define p7_UNIGLOCAL 4		/* unihit glocal: "s" mode      */
64 
65 #define p7_IsLocal(mode)  (mode == p7_LOCAL || mode == p7_UNILOCAL)
66 #define p7_IsMulti(mode)  (mode == p7_LOCAL || mode == p7_GLOCAL)
67 
68 #define p7_NEVPARAM 6	/* number of statistical parameters stored in models                      */
69 #define p7_NCUTOFFS 6	/* number of Pfam score cutoffs stored in models                          */
70 #define p7_NOFFSETS 3	/* number of disk offsets stored in models for hmmscan's fast model input */
71 enum p7_evparams_e {    p7_MMU  = 0, p7_MLAMBDA = 1,     p7_VMU = 2,  p7_VLAMBDA = 3, p7_FTAU = 4, p7_FLAMBDA = 5 };
72 enum p7_cutoffs_e  {     p7_GA1 = 0,     p7_GA2 = 1,     p7_TC1 = 2,      p7_TC2 = 3,  p7_NC1 = 4,     p7_NC2 = 5 };
73 enum p7_offsets_e  { p7_MOFFSET = 0, p7_FOFFSET = 1, p7_POFFSET = 2 };
74 
75 #define p7_EVPARAM_UNSET -99999.0f  /* if evparam[0] is unset, then all unset                         */
76 #define p7_CUTOFF_UNSET  -99999.0f  /* if cutoff[XX1] is unset, then cutoff[XX2] unset, XX={GA,TC,NC} */
77 #define p7_COMPO_UNSET   -1.0f      /* if compo[0] is unset, then all unset                           */
78 
79 /* Option flags when creating multiple alignments with p7_tracealign_*() */
80 #define p7_DEFAULT             0
81 #define p7_DIGITIZE            (1<<0)
82 #define p7_ALL_CONSENSUS_COLS  (1<<1)
83 #define p7_TRIM                (1<<2)
84 
85 /* Option flags when creating faux traces with p7_trace_FauxFromMSA() */
86 #define p7_MSA_COORDS	       (1<<0) /* default: i = unaligned seq residue coords     */
87 
88 /* Which strand(s) should be searched */
89 enum p7_strands_e {    p7_STRAND_TOPONLY  = 0, p7_STRAND_BOTTOMONLY = 1,  p7_STRAND_BOTH = 2};
90 
91 /*****************************************************************
92  * 1. P7_HMM: a core model.
93  *****************************************************************/
94 
95 /* Bit flags used in <hmm->flags>: optional annotation in an HMM
96  *
97  * Flags marked with ! may not be changed nor used for other meanings,
98  * because they're codes used by HMMER2 (and earlier) that must be
99  * preserved for reverse compatibility with old HMMER files.
100  *
101  * Why use flags? (So I don't ask this question of myself again:)
102  *   1. The way we allocate an HMM, we need to know if we're allocating
103  *      M-width annotation fields (RF, CS, CA, MAP) before we read the
104  *      annotation from the file.
105  *   2. Historically, H2 used flags, so we still need to read H2 flags
106  *      for backwards compatibility; so we may as well keep using them.
107  */
108 #define p7H_HASBITS (1<<0)    /* obsolete (was: model has log-odds scores)       !*/
109 #define p7H_DESC    (1<<1)    /* description exists (legacy; xref SRE:J5/114)    !*/
110 #define p7H_RF      (1<<2)    /* #RF annotation available                        !*/
111 #define p7H_CS      (1<<3)    /* #CS annotation available                        !*/
112 #define p7H_XRAY    (1<<4)    /* obsolete (was: structural data available)       !*/
113 #define p7H_HASPROB (1<<5)    /* obsolete (was: model in probability form)       !*/
114 #define p7H_HASDNA  (1<<6)    /* obsolete (was: protein HMM->DNA seq params set) !*/
115 #define p7H_STATS   (1<<7)    /* model has E-value statistics calibrated         !*/
116 #define p7H_MAP     (1<<8)    /* alignment map is available                      !*/
117 #define p7H_ACC     (1<<9)    /* accession is available (legacy; xref SRE:J5/114)!*/
118 #define p7H_GA      (1<<10)   /* gathering thresholds available                  !*/
119 #define p7H_TC      (1<<11)   /* trusted cutoffs available                       !*/
120 #define p7H_NC      (1<<12)   /* noise cutoffs available                         !*/
121 #define p7H_CA      (1<<13)   /* surface accessibilities available               !*/
122 #define p7H_COMPO   (1<<14)   /* model-specific residue composition available     */
123 #define p7H_CHKSUM  (1<<15)   /* model has an alignment checksum                  */
124 #define p7H_CONS    (1<<16)   /* consensus residue line available                 */
125 #define p7H_MMASK   (1<<17)   /* #MM annotation available                        !*/
126 
127 /* Indices of Plan7 main model state transitions, hmm->t[k][] */
128 enum p7h_transitions_e {
129   p7H_MM = 0,
130   p7H_MI = 1,
131   p7H_MD = 2,
132   p7H_IM = 3,
133   p7H_II = 4,
134   p7H_DM = 5,
135   p7H_DD = 6
136 };
137 #define p7H_NTRANSITIONS 7
138 
139 /* How the hmm->t[k] vector is interpreted as separate probability vectors. */
140 #define P7H_TMAT(hmm, k) ((hmm)->t[k])
141 #define P7H_TINS(hmm, k) ((hmm)->t[k]+3)
142 #define P7H_TDEL(hmm, k) ((hmm)->t[k]+5)
143 #define p7H_NTMAT 3
144 #define p7H_NTDEL 2
145 #define p7H_NTINS 2
146 
147 /* Some notes:
148  *   0. The model might be either in counts or probability form.
149  *   1. t[0] is special: t[0][TMM,TMI,TMD] are the begin->M_1,I_0,D_1 entry probabilities,
150  *      t[0][TIM,TII] are the I_0 transitions, and delete state 0 doesn't
151  *      exist. Therefore D[0] transitions and mat[0] emissions are unused.
152  *      To simplify some normalization code, we adopt a convention that these are set
153  *      to valid probability distributions: 1.0 for t[0][TDM] and mat[0][0],
154  *      and 0 for the rest.
155  *   2. t[M] is also special: TMD and TDD are 0 because there is no next delete state;
156  *      TDM is therefore 1.0 by definition. TMM and TDM are interpreted as the
157  *      M->E and D->E end transitions. t[M][TDM] must be 1.0, therefore.
158  */
159 typedef struct p7_hmm_s {
160   /*::cexcerpt::plan7_core::begin::*/
161   int     M;                    /* length of the model (# nodes)                           */
162   float **t;                    /* transition prob's. t[(0),1..M][0..p7H_NTRANSITIONS-1]   */
163   float **mat;                  /* match emissions.  mat[1..M][0..K-1]                     */
164   float **ins;                  /* insert emissions. ins[1..M][0..K-1]                     */
165   /*::cexcerpt::plan7_core::end::*/
166 
167   /* Annotation. Everything but <name> is optional. Flags are set when
168    * optional values are set. All the char *'s are proper nul-terminated
169    * strings, not just arrays. (hmm->map is an int array).
170    */
171   char    *name;                 /* name of the model                     (mandatory)      */ /* String, \0-terminated   */
172   char    *acc;	                 /* accession number of model (Pfam)      (optional: NULL) */ /* String, \0-terminated   */
173   char    *desc;                 /* brief (1-line) description of model   (optional: NULL) */ /* String, \0-terminated   */
174   char    *rf;                   /* reference line from alignment 1..M    (p7H_RF)         */ /* String; 0=' ', M+1='\0' */
175   char    *mm;                   /* model mask line from alignment 1..M   (p7H_MM)         */ /* String; 0=' ', M+1='\0' */
176   char    *consensus;		         /* consensus residue line        1..M    (p7H_CONS)       */ /* String; 0=' ', M+1='\0' */
177   char    *cs;                   /* consensus structure line      1..M    (p7H_CS)         */ /* String; 0=' ', M+1='\0' */
178   char    *ca;	                 /* consensus accessibility line  1..M    (p7H_CA)         */ /* String; 0=' ', M+1='\0' */
179 
180   char    *comlog;               /* command line(s) that built model      (optional: NULL) */ /* String, \0-terminated   */
181   int      nseq;	         /* number of training sequences          (optional: -1)   */
182   float    eff_nseq;             /* effective number of seqs (<= nseq)    (optional: -1)   */
183   int	   max_length;           /* upper bound length, all but 1e-7 prob (optional: -1)   */
184   char    *ctime;	         /* creation date                         (optional: NULL) */
185   int     *map;	                 /* map of alignment cols onto model 1..M (p7H_MAP)        */ /* Array; map[0]=0 */
186   uint32_t checksum;             /* checksum of training sequences        (p7H_CHKSUM)     */
187   float    evparam[p7_NEVPARAM]; /* E-value params                        (p7H_STATS)      */
188   float    cutoff[p7_NCUTOFFS];  /* Pfam score cutoffs                    (p7H_{GA,TC,NC}) */
189   float    compo[p7_MAXABET];    /* model bg residue comp                 (p7H_COMPO)      */
190 
191   off_t    offset;               /* HMM record offset on disk                              */
192   const ESL_ALPHABET *abc;       /* ptr to alphabet info (hmm->abc->K is alphabet size)    */
193   int      flags;                /* status flags                                           */
194 } P7_HMM;
195 
196 
197 /*****************************************************************
198  * 2. P7_PROFILE: a scoring profile, and its implicit model.
199  *****************************************************************/
200 
201 /* Indices for special state types in the length model, gm->xsc[x][]
202  */
203 enum p7p_xstates_e {
204   p7P_E = 0,
205   p7P_N = 1,
206   p7P_J = 2,
207   p7P_C = 3
208 };
209 #define p7P_NXSTATES 4
210 
211 /* Indices for transitions from the length modeling scores gm->xsc[][x]
212  */
213 enum p7p_xtransitions_e {
214   p7P_LOOP = 0,
215   p7P_MOVE = 1
216 };
217 #define p7P_NXTRANS 2
218 
219 /* Indices for transition scores gm->tsc[k][] */
220 /* order is optimized for dynamic programming */
221 enum p7p_tsc_e {
222   p7P_MM = 0,
223   p7P_IM = 1,
224   p7P_DM = 2,
225   p7P_BM = 3,
226   p7P_MD = 4,
227   p7P_DD = 5,
228   p7P_MI = 6,
229   p7P_II = 7,
230 };
231 #define p7P_NTRANS 8
232 
233 /* Indices for residue emission score vectors
234  */
235 enum p7p_rsc_e {
236   p7P_MSC = 0,
237   p7P_ISC = 1
238 };
239 #define p7P_NR 2
240 
241 /* Accessing transition, emission scores */
242 /* _BM is specially stored off-by-one: [k-1][p7P_BM] is score for entering at Mk */
243 #define p7P_TSC(gm, k, s) ((gm)->tsc[(k) * p7P_NTRANS + (s)])
244 #define p7P_MSC(gm, k, x) ((gm)->rsc[x][(k) * p7P_NR + p7P_MSC])
245 #define p7P_ISC(gm, k, x) ((gm)->rsc[x][(k) * p7P_NR + p7P_ISC])
246 
247 
248 typedef struct p7_profile_s {
249   float  *tsc;          /* transitions  [0.1..M-1][0..p7P_NTRANS-1], hand-indexed  */
250   float **rsc;          /* emissions [0..Kp-1][0.1..M][p7P_NR], hand-indexed       */
251   float   xsc[p7P_NXSTATES][p7P_NXTRANS]; /* special transitions [NECJ][LOOP,MOVE] */
252 
253   int     mode;        	/* configured algorithm mode (e.g. p7_LOCAL)               */
254   int     L;		/* current configured target seq length                    */
255   int     allocM;	/* max # of nodes allocated in this structure              */
256   int     M;		/* number of nodes in the model                            */
257   int     max_length;	/* calculated upper bound on emitted seq length            */
258   float   nj;		/* expected # of uses of J; precalculated from loop config */
259 
260   /* Info, most of which is a copy from parent HMM:                                       */
261   char  *name;			/* unique name of model                                   */
262   char  *acc;			/* unique accession of model, or NULL                     */
263   char  *desc;                  /* brief (1-line) description of model, or NULL           */
264   char  *rf;                    /* reference line from alignment 1..M; *rf=0 means unused */
265   char  *mm;                    /* modelmask line           1..M; *ref=0: unused     */
266   char  *cs;                    /* consensus structure line      1..M, *cs=0 means unused */
267   char  *consensus;		/* consensus residues to display in alignments, 1..M      */
268   float  evparam[p7_NEVPARAM]; 	/* parameters for determining E-values, or UNSET          */
269   float  cutoff[p7_NCUTOFFS]; 	/* per-seq/per-domain bit score cutoffs, or UNSET         */
270   float  compo[p7_MAXABET];	/* per-model HMM filter composition, or UNSET             */
271 
272   /* Disk offset information for hmmpfam's fast model retrieval                           */
273   off_t  offs[p7_NOFFSETS];     /* p7_{MFP}OFFSET, or -1                                  */
274 
275   off_t  roff;                  /* record offset (start of record); -1 if none            */
276   off_t  eoff;                  /* offset to last byte of record; -1 if unknown           */
277 
278   const ESL_ALPHABET *abc;	/* copy of pointer to appropriate alphabet                */
279 } P7_PROFILE;
280 
281 
282 
283 /*****************************************************************
284  * 3. P7_BG: a null (background) model.
285  *****************************************************************/
286 
287 /* This really contains three different things:
288  *
289  *   - the "null1" model, a one-state HMM consisting of background
290  *     frequencies <f> and a parameter <p1> for a target-length
291  *     dependent geometric;
292  *
293  *   - the "bias filter" <fhmm> a two-state HMM composed from null1's
294  *     background <f> and the model's mean composition <compo>. This
295  *     model is constructed dynamically, every time a new profile is
296  *     considered;
297  *
298  *   - a single term <omega> that's needed by the "null2" model to set
299  *     a balance between the null1 and null2 scoring terms.  The null2
300  *     model is otherwise defined by construction, in p7_domaindef.c.
301  *
302  * Someday we might pull this apart into two or three separate
303  * objects.
304  */
305 typedef struct p7_bg_s {
306   float   *f;		/* null1 background residue frequencies [0..K-1]: set at initialization    */
307   float    p1;		/* null1's transition prob: p7_bg_SetLength() sets this from target seq L  */
308 
309   ESL_HMM *fhmm;	/* bias filter: p7_bg_SetFilter() sets this, from model's mean composition */
310 
311   float    omega;	/* the "prior" on null2/null3: set at initialization (one omega for both null types)  */
312 
313   const ESL_ALPHABET *abc;	/* reference to alphabet in use: set at initialization             */
314 } P7_BG;
315 
316 /*****************************************************************
317  * 4. P7_TRACE:  a traceback (alignment of seq to profile).
318  *****************************************************************/
319 
320 /* Traceback structure for alignment of a model to a sequence.
321  *
322  * A traceback only makes sense in a triplet (tr, gm, dsq), for a
323  * given profile or HMM (with nodes 1..M) and a given digital sequence
324  * (with positions 1..L).
325  *
326  * A traceback may be relative to a profile (usually) or to a core
327  * model (as a special case in model construction; see build.c). You
328  * can tell the difference by looking at the first statetype,
329  * tr->st[0]; if it's a p7T_S, it's for a profile, and if it's p7T_B,
330  * it's for a core model.
331  *
332  * A "profile" trace uniquely has S,N,C,T,J states and their
333  * transitions; it also can have B->Mk and Mk->E internal entry/exit
334  * transitions for local alignments. It may not contain X states.
335  *
336  * A "core" trace may contain I0, IM, and D1 states and their
337  * transitions. A "core" trace can also have B->X->{MDI}k and
338  * {MDI}k->X->E transitions as a special hack in a build procedure, to
339  * deal with the case of a local alignment fragment implied by an
340  * input alignment, which is "impossible" for a core model.
341  * X "states" only appear in core traces, and only at these
342  * entry/exit places; some code depends on this.
343  *
344  * A profile's N,C,J states emit on transition, not on state, so a
345  * path of N emits 0 residues, NN emits 1 residue, NNN emits 2
346  * residues, and so on. By convention, the trace always associates an
347  * emission-on-transition with the trailing (destination) state, so
348  * the first N, C, or J is stored in a trace as a nonemitter (i=0).
349  *
350  * A i coords in a traceback are usually 1..L with respect to an
351  * unaligned digital target sequence, but in the special case of
352  * traces faked from existing MSAs (as in hmmbuild), the coords may
353  * be 1..alen relative to an MSA's columns.
354  */
355 
356 /* State types */
357 enum p7t_statetype_e {
358   p7T_BOGUS =  0,
359   p7T_M     =  1,
360   p7T_D     =  2,
361   p7T_I     =  3,
362   p7T_S     =  4,
363   p7T_N     =  5,
364   p7T_B     =  6,
365   p7T_E     =  7,
366   p7T_C     =  8,
367   p7T_T     =  9,
368   p7T_J     = 10,
369   p7T_X     = 11, 	/* missing data: used esp. for local entry/exits */
370 };
371 #define p7T_NSTATETYPES 12
372 
373 typedef struct p7_trace_s {
374   int    N;		/* length of traceback                       */
375   int    nalloc;        /* allocated length of traceback             */
376   char  *st;		/* state type code                   [0..N-1]*/
377   int   *k;		/* node index; 1..M if M,D,I; else 0 [0..N-1]*/
378   int   *i;		/* pos emitted in dsq, 1..L; else 0  [0..N-1]*/
379   float *pp;		/* posterior prob of x_i; else 0     [0..N-1]*/
380   int    M;		/* model length M (maximum k)                */
381   int    L;		/* sequence length L (maximum i)             */
382 
383   /* The following section is data generated by "indexing" a trace's domains */
384   int   ndom;		/* number of domains in trace (= # of B or E states) */
385   int  *tfrom,   *tto;	/* locations of B/E states in trace (0..tr->N-1)     */
386   int  *sqfrom,  *sqto;	/* first/last M-emitted residue on sequence (1..L)   */
387   int  *hmmfrom, *hmmto;/* first/last M state on model (1..M)                */
388   int   ndomalloc;	/* current allocated size of these stacks            */
389 
390 } P7_TRACE;
391 
392 
393 
394 /*****************************************************************
395  * 5. P7_HMMFILE:  an HMM save file or database, open for reading.
396  *****************************************************************/
397 
398 /* These tags need to be in temporal order, so we can do tests
399  * like "if (format >= p7_HMMFILE_3b) ..."
400  */
401 enum p7_hmmfile_formats_e {
402   p7_HMMFILE_20 = 0,
403   p7_HMMFILE_3a = 1,
404   p7_HMMFILE_3b = 2,
405   p7_HMMFILE_3c = 3,
406   p7_HMMFILE_3d = 4,
407   p7_HMMFILE_3e = 5,
408   p7_HMMFILE_3f = 6,
409 };
410 
411 typedef struct p7_hmmfile_s {
412   FILE         *f;		 /* pointer to stream for reading models                 */
413   char         *fname;	         /* (fully qualified) name of the HMM file; [STDIN] if - */
414   ESL_SSI      *ssi;		 /* open SSI index for model file <f>; NULL if none.     */
415 
416   int           do_gzip;	/* TRUE if f is "gzip -dc |" (will pclose(f))           */
417   int           do_stdin;       /* TRUE if f is stdin (won't close f)                   */
418   int           newly_opened;	/* TRUE if we just opened the stream (and parsed magic) */
419   int           is_pressed;	/* TRUE if a pressed HMM database file (Pfam or equiv)  */
420 
421   int            format;	/* HMM file format code */
422   int           (*parser)(struct p7_hmmfile_s *, ESL_ALPHABET **, P7_HMM **);
423   ESL_FILEPARSER *efp;
424 
425   /* If <is_pressed>, we can read optimized profiles directly, via:  */
426   FILE         *ffp;		/* MSV part of the optimized profile */
427   FILE         *pfp;		/* rest of the optimized profile     */
428 
429 #ifdef HMMER_THREADS
430   int              syncRead;
431   pthread_mutex_t  readMutex;
432 #endif
433 
434   char          errbuf[eslERRBUFSIZE];
435 } P7_HMMFILE;
436 
437 /* note on <fname>, above:
438  * this is the actual name of the HMM file being read.
439  *
440  * The way p7_hmmfile_Open() works, it will preferentially look for
441  * hmmpress'ed binary files. If you open "foo", it will first try to
442  * open "foo.h3m" and <fname> will be "foo.h3m". "foo" does not even
443  * have to exist. If a parsing error occurs, you want <fname> to
444  * be "foo.h3m", so error messages report blame correctly.
445  * In the special case of reading from stdin, <fname> is "[STDIN]".
446  */
447 
448 
449 /*****************************************************************
450  * 6. P7_GMX: a "generic" dynamic programming matrix
451  *****************************************************************/
452 
453 enum p7g_scells_e {
454   p7G_M = 0,
455   p7G_I = 1,
456   p7G_D = 2,
457 };
458 #define p7G_NSCELLS 3
459 
460 enum p7g_xcells_e {
461   p7G_E  = 0,
462   p7G_N  = 1,
463   p7G_J  = 2,
464   p7G_B  = 3,
465   p7G_C  = 4
466 };
467 #define p7G_NXCELLS 5
468 
469 
470 typedef struct p7_gmx_s {
471   int  M;		/* actual model dimension (model 1..M)    */
472   int  L;		/* actual sequence dimension (seq 1..L)   */
473 
474   int      allocR;      /* current allocated # of rows : L+1 <= validR <= allocR                */
475   int      validR;	/* # of rows actually pointing at DP memory                             */
476   int      allocW;	/* current set row width :  M+1 <= allocW                               */
477   uint64_t ncells;	/* total # of allocated cells in 2D matrix : ncells >= (validR)(allocW) */
478 
479   float **dp;           /* logically [0.1..L][0.1..M][0..p7G_NSCELLS-1]; indexed [i][k*p7G_NSCELLS+s] */
480   float  *xmx;          /* logically [0.1..L][0..p7G_NXCELLS-1]; indexed [i*p7G_NXCELLS+s]            */
481 
482   float  *dp_mem;
483 } P7_GMX;
484 
485 /* Macros below implement indexing idioms for generic DP routines.
486  * They require the following setup, for profile <gm> and matrix <gx>:
487  *   float const *tsc = gm->tsc;
488  *   float      **dp  = gx->dp;
489  *   float       *xmx = gx->xmx;
490  * and for each row i (target residue x_i in digital seq <dsq>):
491  *   float const *rsc = gm->rsc[dsq[i]];
492  */
493 #define MMX(i,k) (dp[(i)][(k) * p7G_NSCELLS + p7G_M])
494 #define IMX(i,k) (dp[(i)][(k) * p7G_NSCELLS + p7G_I])
495 #define DMX(i,k) (dp[(i)][(k) * p7G_NSCELLS + p7G_D])
496 #define XMX(i,s) (xmx[(i) * p7G_NXCELLS + (s)])
497 
498 #define TSC(s,k) (tsc[(k) * p7P_NTRANS + (s)])
499 #define MSC(k)   (rsc[(k) * p7P_NR     + p7P_MSC])
500 #define ISC(k)   (rsc[(k) * p7P_NR     + p7P_ISC])
501 
502 /* Flags that control P7_GMX debugging dumps */
503 #define p7_HIDE_SPECIALS (1<<0)
504 #define p7_SHOW_LOG      (1<<1)
505 
506 
507 /*****************************************************************
508  * 7. P7_PRIOR: mixture Dirichlet prior for profile HMMs
509  *****************************************************************/
510 
511 typedef struct p7_prior_s {
512   ESL_MIXDCHLET *tm;		/*  match transitions */
513   ESL_MIXDCHLET *ti;		/* insert transitions */
514   ESL_MIXDCHLET *td;		/* delete transitions */
515   ESL_MIXDCHLET *em;		/*  match emissions   */
516   ESL_MIXDCHLET *ei;		/* insert emissions   */
517 } P7_PRIOR;
518 
519 
520 
521 
522 /*****************************************************************
523  * 8. P7_SPENSEMBLE: segment pair ensembles for domain locations
524  *****************************************************************/
525 
526 /* struct p7_spcoord_s:
527  *    a coord quad defining a segment pair.
528  */
529 struct p7_spcoord_s {
530   int idx; 	/* backreference index: which trace a seg came from, or which cluster a domain came from */
531   int i, j;	/* start,end in a target sequence (1..L)  */
532   int k, m;     /* start,end in a query model (1..M)      */
533   float prob;	/* posterior probability of segment       */
534 };
535 
536 /* Structure: P7_SPENSEMBLE
537  *
538  * Collection and clustering of an ensemble of sampled segment pairs,
539  * in order to define domain locations using their posterior
540  * probability distribution (as opposed to Viterbi MAP tracebacks).
541  */
542 typedef struct p7_spensemble_s {
543   /* Section 1: a collected ensemble of segment pairs                                       */
544   int                  nsamples;    /* number of sampled traces                             */
545   struct p7_spcoord_s *sp;	    /* array of sampled seg pairs; [0..n-1]                 */
546   int                  nalloc;	    /* allocated size of <sp>                               */
547   int                  n;	    /* number of seg pairs in <sp>                          */
548 
549   /* Section 2: then the ensemble is clustered by single-linkage clustering                 */
550   int *workspace;                   /* temp space for Easel SLC algorithm: 2*n              */
551   int *assignment;                  /* each seg pair's cluster index: [0..n-1] = (0..nc-1)  */
552   int  nc;	                    /* number of different clusters                         */
553 
554   /* Section 3: then endpoint distribution is examined within each large cluster            */
555   int *epc;	                    /* array counting frequency of each endpoint            */
556   int  epc_alloc;	            /* allocated width of <epc>                             */
557 
558   /* Section 4: finally each large cluster is resolved into domain coords                   */
559   struct p7_spcoord_s *sigc;	    /* array of coords for each domain, [0..nsigc-1]        */
560   int                  nsigc;	    /* number of "significant" clusters, domains            */
561   int                  nsigc_alloc; /* current allocated max for nsigc                      */
562 } P7_SPENSEMBLE;
563 
564 
565 
566 /*****************************************************************
567  * 9. P7_ALIDISPLAY: an alignment formatted for printing
568  *****************************************************************/
569 
570 /* Structure: P7_ALIDISPLAY
571  *
572  * Alignment of a sequence domain to an HMM, formatted for printing.
573  *
574  * For an alignment of L residues and names C chars long, requires
575  * 6L + 2C + 30 bytes; for typical case of L=100,C=10, that's
576  * <0.7 Kb.
577  */
578 typedef struct p7_alidisplay_s {
579   char *rfline;                 /* reference coord info; or NULL        */
580   char *mmline;                 /* modelmask coord info; or NULL        */
581   char *csline;                 /* consensus structure info; or NULL    */
582   char *model;                  /* aligned query consensus sequence     */
583   char *mline;                  /* "identities", conservation +'s, etc. */
584   char *aseq;                   /* aligned target sequence              */
585   char *ntseq;                  /* nucleotide target sequence if nhmmscant */
586   char *ppline;		        /* posterior prob annotation; or NULL   */
587   int   N;		        /* length of strings                    */
588 
589   char *hmmname;		/* name of HMM                          */
590   char *hmmacc;			/* accession of HMM; or [0]='\0'        */
591   char *hmmdesc;		/* description of HMM; or [0]='\0'      */
592   int   hmmfrom;		/* start position on HMM (1..M, or -1)  */
593   int   hmmto;			/* end position on HMM (1..M, or -1)    */
594   int   M;			/* length of model                      */
595 
596   char *sqname;			/* name of target sequence              */
597   char *sqacc;			/* accession of target seq; or [0]='\0' */
598   char *sqdesc;			/* description of targ seq; or [0]='\0' */
599   int64_t  sqfrom;		/* start position on sequence (1..L)    */
600   int64_t  sqto;  	        /* end position on sequence   (1..L)    */
601   int64_t  L;			/* length of sequence                   */
602 
603   int   memsize;                /* size of allocated block of memory    */
604   char *mem;			/* memory used for the char data above  */
605 } P7_ALIDISPLAY;
606 
607 
608 /*****************************************************************
609  * 10. P7_DOMAINDEF: reusably managing workflow in defining domains
610  *****************************************************************/
611 
612 typedef struct p7_dom_s {
613   int64_t        ienv, jenv;
614   int64_t        iali, jali;
615   int64_t        iorf, jorf; /*Used in translated search to capture the range in the DNA sequence of the ORF containing the match to a protein query */
616   float          envsc;  	/* Forward score in envelope ienv..jenv; NATS; without null2 correction       */
617   float          domcorrection;	/* null2 score when calculating a per-domain score; NATS                      */
618   float          dombias;	/* FLogsum(0, log(bg->omega) + domcorrection): null2 score contribution; NATS */
619   float          oasc;		/* optimal accuracy score (units: expected # residues correctly aligned)      */
620   float          bitscore;	/* overall score in BITS, null corrected, if this were the only domain in seq */
621   double         lnP;	        /* log(P-value) of the bitscore                                               */
622   int            is_reported;	/* TRUE if domain meets reporting thresholds                                  */
623   int            is_included;	/* TRUE if domain meets inclusion thresholds                                  */
624   float         *scores_per_pos; /* score in BITS that each position in the alignment contributes to an overall viterbi score */
625   P7_ALIDISPLAY *ad;
626 } P7_DOMAIN;
627 
628 /* Structure: P7_DOMAINDEF
629  *
630  * This is a container for all the necessary information for domain
631  * definition procedures in <p7_domaindef.c>, including a bunch of
632  * heuristic thresholds. The structure is reusable to minimize the
633  * number of allocation/free cycles that need to be done when
634  * processing a large number of sequences. You create the structure
635  * with <p7_domaindef_Create()>; after you're done with defining
636  * domains on a sequence, you call <p7_domaindef_Reuse()> before using
637  * it on the next sequence; and when you're completely done, you free
638  * it with <p7_domaindef_Destroy()>. All memory management is handled
639  * internally; you don't need to reallocate anything yourself.
640  */
641 typedef struct p7_domaindef_s {
642   /* for posteriors of being in a domain, B, E */
643   float *mocc;			/* mocc[i=1..L] = prob that i is emitted by core model (is in a domain)       */
644   float *btot; 			/* btot[i=1..L] = cumulative expected times that domain starts at or before i */
645   float *etot;			/* etot[i=1..L] = cumulative expected times that domain ends at or before i   */
646   int    L;
647   int    Lalloc;
648 
649   /* the ad hoc null2 model: 1..L nat scores for each residue, log f'(x_i) / f(x_i) */
650   float *n2sc;
651 
652   /* rng and reusable memory for stochastic tracebacks */
653   ESL_RANDOMNESS *r;		/* random number generator                                 */
654   int             do_reseeding;	/* TRUE to reset the RNG, make results reproducible        */
655   P7_SPENSEMBLE  *sp;		/* an ensemble of sampled segment pairs (domain endpoints) */
656   P7_TRACE       *tr;		/* reusable space for a trace of a domain                  */
657   P7_TRACE       *gtr;		/* reusable space for a traceback of the entire target seq */
658 
659   /* Heuristic thresholds that control the region definition process */
660   /* "rt" = "region threshold", for lack of better term  */
661   float  rt1;   	/* controls when regions are called. mocc[i] post prob >= dt1 : triggers a region around i */
662   float  rt2;		/* controls extent of regions. regions extended until mocc[i]-{b,e}occ[i] < dt2            */
663   float  rt3;		/* controls when regions are flagged for split: if expected # of E preceding B is >= dt3   */
664 
665   /* Heuristic thresholds that control the stochastic traceback/clustering process */
666   int    nsamples;	/* collect ensemble of this many stochastic traces */
667   float  min_overlap;	/* 0.8 means >= 80% overlap of (smaller/larger) segment to link, both in seq and hmm            */
668   int    of_smaller;	/* see above; TRUE means overlap denom is calc'ed wrt smaller segment; FALSE means larger       */
669   int    max_diagdiff;	/* 4 means either start or endpoints of two segments must be within <=4 diagonals of each other */
670   float  min_posterior;	/* 0.25 means a cluster must have >= 25% posterior prob in the sample to be reported            */
671   float  min_endpointp;	/* 0.02 means choose widest endpoint with post prob of at least 2%                              */
672 
673   /* storage of the results; domain locations, scores, alignments          */
674   P7_DOMAIN *dcl;
675   int        ndom;	 /* number of domains defined, in the end.         */
676   int        nalloc;     /* number of domain structures allocated in <dcl> */
677 
678   /* Additional results storage */
679   float  nexpected;     /* posterior expected number of domains in the sequence (from posterior arrays) */
680   int    nregions;	/* number of regions evaluated */
681   int    nclustered;	/* number of regions evaluated by clustering ensemble of tracebacks */
682   int    noverlaps;	/* number of envelopes defined in ensemble clustering that overlap w/ prev envelope */
683   int    nenvelopes;	/* number of envelopes handed over for domain definition, null2, alignment, and scoring. */
684 
685 } P7_DOMAINDEF;
686 
687 
688 /*****************************************************************
689  * 11. P7_TOPHITS: ranking lists of top-scoring hits
690  *****************************************************************/
691 
692 #define p7_HITFLAGS_DEFAULT 0
693 #define p7_IS_INCLUDED      (1<<0)
694 #define p7_IS_REPORTED      (1<<1)
695 #define p7_IS_NEW           (1<<2)
696 #define p7_IS_DROPPED       (1<<3)
697 #define p7_IS_DUPLICATE     (1<<4)
698 
699 
700 /* Structure: P7_HIT
701  *
702  * Info about a high-scoring database hit, kept so we can output a
703  * sorted list of high hits at the end.
704  *
705  * sqfrom and sqto are the coordinates that will be shown
706  * in the results, not coords in arrays... therefore, reverse
707  * complements have sqfrom > sqto
708  */
709 typedef struct p7_hit_s {
710   char   *name;			/* name of the target               (mandatory)           */
711   char   *acc;			/* accession of the target          (optional; else NULL) */
712   char   *desc;			/* description of the target        (optional; else NULL) */
713   int    window_length;         /* for later use in e-value computation, when splitting long sequences */
714   double sortkey;		/* number to sort by; big is better                       */
715 
716   float  score;			/* bit score of the sequence (all domains, w/ correction) */
717   float  pre_score;		/* bit score of sequence before null2 correction          */
718   float  sum_score;		/* bit score reconstructed from sum of domain envelopes   */
719 
720   double lnP;		        /* log(P-value) of the score               */
721   double pre_lnP;		/* log(P-value) of the pre_score           */
722   double sum_lnP;		/* log(P-value) of the sum_score           */
723 
724   float  nexpected;     /* posterior expected number of domains in the sequence (from posterior arrays) */
725   int    nregions;	/* number of regions evaluated */
726   int    nclustered;	/* number of regions evaluated by clustering ensemble of tracebacks */
727   int    noverlaps;	/* number of envelopes defined in ensemble clustering that overlap w/ prev envelope */
728   int    nenvelopes;	/* number of envelopes handed over for domain definition, null2, alignment, and scoring. */
729   int    ndom;		/* total # of domains identified in this seq   */
730 
731   uint32_t flags;      	/* p7_IS_REPORTED | p7_IS_INCLUDED | p7_IS_NEW | p7_IS_DROPPED */
732   int      nreported;	/* # of domains satisfying reporting thresholding  */
733   int      nincluded;	/* # of domains satisfying inclusion thresholding */
734   int      best_domain;	/* index of best-scoring domain in dcl */
735 
736   int64_t  seqidx;          /*unique identifier to track the database sequence from which this hit came*/
737   int64_t  subseq_start; /*used to track which subsequence of a full_length target this hit came from, for purposes of removing duplicates */
738 
739   P7_DOMAIN *dcl;	/* domain coordinate list and alignment display */
740   esl_pos_t  offset;	/* used in socket communications, in serialized communication: offset of P7_DOMAIN msg for this P7_HIT */
741 } P7_HIT;
742 
743 
744 /* Structure: P7_TOPHITS
745  * merging when we prepare to output results. "hit" list is NULL and
746  * unavailable until after we do a sort.
747  */
748 typedef struct p7_tophits_s {
749   P7_HIT **hit;         /* sorted pointer array                     */
750   P7_HIT  *unsrt;	/* unsorted data storage                    */
751   uint64_t Nalloc;	/* current allocation size                  */
752   uint64_t N;		/* number of hits in list now               */
753   uint64_t nreported;	/* number of hits that are reportable       */
754   uint64_t nincluded;	/* number of hits that are includable       */
755   int      is_sorted_by_sortkey; /* TRUE when hits sorted by sortkey and th->hit valid for all N hits */
756   int      is_sorted_by_seqidx; /* TRUE when hits sorted by seq_idx, position, and th->hit valid for all N hits */
757 } P7_TOPHITS;
758 
759 
760 
761 
762 
763 /*****************************************************************
764  * 12. P7_SCOREDATA: data used in diagonal recovery and extension
765  *****************************************************************/
766 
767 enum p7_scoredatatype_e {
768   p7_sd_std  = 0,
769   p7_sd_fm   = 1,
770 };
771 
772 
773 /* This contains a compact representation of 8-bit bias-shifted scores for use in
774  * diagonal recovery (standard SSV) and extension (standard and FM-SSV),
775  * along with MAXL-associated prefix- and suffix-lengths, and optimal extensions
776  * for FM-SSV.
777  */
778 typedef struct p7_scoredata_s {
779   int         type;
780   int         M;
781   union {//implicit (M+1)*K matrix, where M = # states, and K = # characters in alphabet
782     uint8_t  *ssv_scores;    // this 2D array is used in the default nhmmer pipeline
783     float    *ssv_scores_f;  // this 2D array is used in the FM-index based pipeline
784   };
785   float      *prefix_lengths;
786   float      *suffix_lengths;
787   float      *fwd_scores;
788   float     **fwd_transitions;
789   float     **opt_ext_fwd; // Used only for FM-index based pipeline
790   float     **opt_ext_rev; // Used only for FM-index based pipeline
791 } P7_SCOREDATA;
792 
793 
794 /*****************************************************************
795  * 13. P7_HMM_WINDOW: data used to track lists of sequence windows
796  *****************************************************************/
797 
798 
799 typedef struct p7_hmm_window_s {
800   float      score;
801   float      null_sc;
802   int32_t    id;          //sequence id of the database sequence hit
803   int64_t    n;           //position in database sequence at which the diagonal/window starts
804   int64_t    fm_n;        //position in the concatenated fm-index sequence at which the diagonal starts
805   int32_t    length;      // length of the diagonal/window
806   int16_t    k;           //position of the model at which the diagonal ends
807   int64_t    target_len;  //length of the target sequence
808   int8_t     complementarity;
809   int8_t     used_to_extend;
810 } P7_HMM_WINDOW;
811 
812 typedef struct p7_hmm_window_list_s {
813   P7_HMM_WINDOW *windows;
814   int       count;
815   int       size;
816 } P7_HMM_WINDOWLIST;
817 
818 
819 
820 /*****************************************************************
821  * 14. Choice of vector implementation.
822  *****************************************************************/
823 #if   defined (eslENABLE_SSE)
824 #include "impl_sse/impl_sse.h"
825 #elif defined (eslENABLE_VMX)
826 #include "impl_vmx/impl_vmx.h"
827 #else
828 #error "No vector implementation enabled"
829 #endif
830 
831 
832 /*****************************************************************
833  * 15. The FM-index acceleration to the SSV filter.  Only works for SSE
834  *****************************************************************/
835 #define FM_MAX_LINE 256
836 
837 /* Structure the 2D occ array into a single array.  "type" is either b or sb.
838  * Note that one extra count value is required by RLE, one 4-byte int for
839  * each superblock count vector, and one 2-byte short for each block count
840  * vector. This is small overhead, even for a small alphabet like dna.
841  */
842 #define FM_OCC_CNT( type, i, c)  ( occCnts_##type[(meta->alph_size)*(i) + (c)])
843 
844 enum fm_alphabettypes_e {
845   fm_DNA        = 0,  //acgt,  2 bit
846   //fm_DNA_full   = 1,  //includes ambiguity codes, 4 bit.
847   fm_AMINO      = 4,  // 5 bit
848 };
849 /*TODO: fm_DNA_full has currently been disabled because of problems with how the
850  * FM index handles very long runs of the same character (in this case, Ns).
851  * See wheelert/notebook/2013/12-11-FM-alphabet-speed notes on 12/12.
852  */
853 
854 enum fm_direction_e {
855   fm_forward    = 0,
856   fm_backward   = 1,
857 };
858 
859 
860 typedef struct fm_interval_s {
861   int   lower;
862   int   upper;
863 } FM_INTERVAL;
864 
865 typedef struct fm_hit_s {
866   uint64_t  start;
867   uint32_t  block;
868   int       direction;
869   int       length;
870   int       sortkey;
871 } FM_HIT;
872 
873 
874 
875 typedef struct fm_ambiglist_s {
876   FM_INTERVAL *ranges;
877   uint32_t     count;
878   uint32_t     size;
879 } FM_AMBIGLIST;
880 
881 
882 typedef struct fm_seqdata_s {
883 
884   uint32_t target_id;      // Which sequence in the target database did this segment come from (can be multiple segment per sequence, if a sequence has Ns)
885   uint64_t target_start;   // The position in sequence {id} in the target database at which this sequence-block starts (usually 1, unless its a long sequence split out over multiple FMs)
886   uint32_t fm_start;       // The position in the FM block at which this sequence begins
887   uint32_t length;         // Length of this sequence segment  (usually the length of the target sequence, unless its a long sequence split out over multiple FMs)
888 
889 
890   //meta data taken from the sequence this segment was taken from
891   uint16_t name_length;
892   uint16_t source_length;
893   uint16_t acc_length;
894   uint16_t desc_length;
895   char     *name;
896   char     *source;
897   char     *acc;
898   char     *desc;
899 } FM_SEQDATA;
900 
901 
902 typedef struct fm_metadata_s {
903   uint8_t  fwd_only;
904   uint8_t  alph_type;
905   uint8_t  alph_size;
906   uint8_t  charBits;
907   uint32_t freq_SA; //frequency with which SA is sampled
908   uint32_t freq_cnt_sb; //frequency with which full cumulative counts are captured
909   uint32_t freq_cnt_b; //frequency with which intermittent counts are captured
910   uint16_t block_count;
911   uint32_t seq_count;
912   uint64_t char_count; //total count of characters including those in and out of the alphabet
913   char     *alph;
914   char     *inv_alph;
915   int      *compl_alph;
916   FILE         *fp;
917   FM_SEQDATA   *seq_data;
918   FM_AMBIGLIST *ambig_list;
919 } FM_METADATA;
920 
921 
922 typedef struct fm_data_s {
923   uint64_t N; //length of text
924   uint32_t term_loc; // location in the BWT at which the '$' char is found (replaced in the sequence with 'a')
925   uint32_t seq_offset;
926   uint32_t ambig_offset;
927   uint32_t seq_cnt;
928   uint32_t ambig_cnt;
929   uint32_t overlap; // number of bases at the beginning that overlap the FM-index for the preceding block
930   uint8_t  *T;  //text corresponding to the BWT
931   uint8_t  *BWT_mem;
932   uint8_t  *BWT;
933   uint32_t *SA; // sampled suffix array
934   int64_t  *C; //the first position of each letter of the alphabet if all of T is sorted.  (signed, as I use that to keep tract of presence/absence)
935   uint32_t *occCnts_sb;
936   uint16_t *occCnts_b;
937 } FM_DATA;
938 
939 typedef struct fm_dp_pair_s {
940   uint16_t    pos;  // position of the diagonal in the model.
941   float       score;
942   float       max_score;
943   uint8_t     score_peak_len; // how long was the diagonal when the most recent peak (within fm_drop_lim of the max score) was seen?
944   uint8_t     consec_pos;
945   uint8_t     max_consec_pos;
946   uint8_t     consec_consensus;
947   uint8_t     model_direction;
948   uint8_t     complementarity;
949 } FM_DP_PAIR;
950 
951 
952 typedef struct fm_diag_s {
953   uint64_t    n;  //position of the database sequence at which the diagonal starts
954   union {
955     double    sortkey;
956     double    score;
957   };
958   uint16_t    k;  //position of the model at which the diagonal starts
959   uint16_t    length;
960   uint8_t     complementarity;
961 } FM_DIAG;
962 
963 typedef struct fm_diaglist_s {
964   FM_DIAG   *diags;
965   int       count;
966   int       size;
967 } FM_DIAGLIST;
968 
969 /* Effectively global variables, to be initialized once in fm_initConfig(),
970  * then passed around among threads to avoid recomputing them
971  *
972  * When allocated, must be 16-byte aligned, and all _m128i elements
973  * must precede other types
974  */
975 typedef struct {
976 #if   defined (eslENABLE_SSE)
977   /* mask arrays, and 16-byte-offsets into them */
978   __m128i *fm_masks_mem;
979   __m128i *fm_masks_v;
980   __m128i *fm_reverse_masks_mem;
981   __m128i *fm_reverse_masks_v;
982   __m128i *fm_chars_mem;
983   __m128i *fm_chars_v;
984 
985   /*various precomputed vectors*/
986   __m128i fm_allones_v;
987   __m128i fm_zeros_v;
988   __m128i fm_neg128_v;
989   __m128i fm_m0f;  //00 00 11 11
990   __m128i fm_m01;  //01 01 01 01
991   __m128i fm_m11;  //00 00 00 11
992 
993   /* no non-__m128i- elements above this line */
994 #endif //#if   defined (eslENABLE_SSE)
995 
996   /*counter, to compute FM-index speed*/
997   int occCallCnt;
998 
999   /*bounding cutoffs*/
1000   int max_depth;
1001   float drop_lim;  // 0.2 ; in seed, max drop in a run of length [fm_drop_max_len]
1002   int drop_max_len; // 4 ; maximum run length with score under (max - [fm_drop_lim])
1003   int consec_pos_req; //5
1004   int consensus_match_req; //10
1005   float score_density_req; //.5
1006   int ssv_length;
1007   float scthreshFM;
1008   float sc_thresh_ratio; //information content deficit,  actual_relent/target_relent
1009 
1010   /*pointer to FM-index metadata*/
1011   FM_METADATA *meta;
1012 
1013 } FM_CFG;
1014 
1015 
1016 #if   defined (eslENABLE_SSE)
1017 //used to convert from a byte array to an __m128i
1018 typedef union {
1019         uint8_t bytes[16];
1020         __m128i m128;
1021         } byte_m128;
1022 
1023 
1024 /* Gather the sum of all counts in a 16x8-bit element into a single 16-bit
1025  *  element of the register (the 0th element)
1026  *
1027  *  the _mm_sad_epu8  accumulates 8-bit counts into 16-bit counts:
1028  *      left 8 counts (64-bits) accumulate in counts_v[0],
1029  *      right 8 counts in counts_v[4]  (the other 6 16-bit ints are 0)
1030  *  the _mm_shuffle_epi32  flips the 4th int into the 0th slot
1031  */
1032 #define FM_GATHER_8BIT_COUNTS( in_v, mid_v, out_v  ) do {\
1033     mid_v = _mm_sad_epu8 (in_v, cfg->fm_zeros_v);\
1034     tmp_v = _mm_shuffle_epi32(mid_v, _MM_SHUFFLE(1, 1, 1, 2));\
1035     out_v = _mm_add_epi16(mid_v, tmp_v);\
1036   } while (0)
1037 
1038 
1039 /* Macro for SSE operations to turn 2-bit character values into 2-bit binary
1040  * (00 or 01) match/mismatch values representing occurrences of a character in a
1041  * 4-char-per-byte packed BWT.
1042  *
1043  * Typically followed by a call to FM_COUNT_SSE_4PACKED, possibly with a
1044  * mask in between to handle the case where we don't want to add over all
1045  * positions in the vector
1046  *
1047  * tmp_v and tmp2_v are used as temporary vectors throughout, and hold meaningless values
1048  * at the end
1049  *
1050  * xor(in_v, c_v)        : each 2-bit value will be 00 if a match, and non-0 if a mismatch
1051  * and(in_v, 01010101)   : look at the right bit of each 2-bit value,
1052  * srli(1)+and()         : look at the left bit of each 2-bit value,
1053  * or()                  : if either left bit or right bit is non-0, 01, else 00 (match is 00)
1054  *
1055  * subs()                : invert, so match is 01, mismatch is 00
1056  *
1057  */
1058 #define FM_MATCH_2BIT(in_v, c_v, a_v, b_v, out_v) do {\
1059     a_v = _mm_xor_si128(in_v, c_v);\
1060     \
1061     b_v  = _mm_srli_epi16(a_v, 1);\
1062     a_v  = _mm_or_si128(a_v, b_v);\
1063     b_v  = _mm_and_si128(a_v, cfg->fm_m01);\
1064     \
1065     out_v  = _mm_subs_epi8(cfg->fm_m01,b_v);\
1066   } while (0)
1067 
1068 
1069 /*Macro for SSE operations to count bits produced by FM_MATCH_SSE_4PACKED
1070  *
1071  * tmp_v and tmp2_v are used as temporary vectors throughout, and hold meaningless values
1072  * at the end
1073  *
1074  * then add up the 2-bit values:
1075  * srli(4)+add()         : left 4 bits shifted right, added to right 4 bits
1076  *
1077  * srli(2)+and(00000011) : left 2 bits (value 0..2) shifted right, masked, so no other bits active
1078  * and(00000011)         : right 2 bits (value 0..2) masked so no other bits active
1079  *
1080  * final 2 add()s        : tack current counts on to already-tabulated counts.
1081  */
1082 #define FM_COUNT_2BIT(a_v, b_v, cnts_v) do {\
1083         b_v = _mm_srli_epi16(a_v, 4);\
1084         a_v  = _mm_add_epi16(a_v, b_v);\
1085         \
1086         b_v = _mm_srli_epi16(a_v, 2);\
1087         a_v  = _mm_and_si128(a_v,cfg->fm_m11);\
1088         b_v = _mm_and_si128(b_v,cfg->fm_m11);\
1089         \
1090         cnts_v = _mm_add_epi16(cnts_v, a_v);\
1091         cnts_v = _mm_add_epi16(cnts_v, b_v);\
1092   } while (0)
1093 
1094 
1095 
1096 /* Macro for SSE operations that turns a vector of 4-bit character values into
1097  * 2 vectors representing matches. Each byte in the input vector consists of
1098  * a left half (4 bits) and a right half (4 bits). The 16 left-halves produce
1099  * one vector, which contains all-1s for bytes in which the left half matches
1100  * the c_v character (and 0s if it doesn't), while the 16 right-halves produce
1101  * the other vector, again with each byte either all-1s or all-0s.
1102  *
1103  * The expectation is that FM_COUNT_4BIT will be called after this, to
1104  * turn these binary values into sums over a series of vectors. The macros
1105  * are split up to allow one end or other to be trimmed in the case that
1106  * counting is not expected to include the full vector.
1107  *
1108  * srli(4)+and() : capture the left 4-bit value   (need the mask because 16-bit shift leaves garbage in left-4-bit chunks)
1109  * and()         : capture the right 4-bit value
1110  *
1111  * cmpeq()x2     : test if both left and right == c.  For each, if ==c , value = 11111111 (-1)
1112  */
1113 #define FM_MATCH_4BIT(in_v, c_v, out1_v, out2_v) do {\
1114     out1_v    = _mm_srli_epi16(in_v, 4);\
1115     out2_v    = _mm_and_si128(in_v, cfg->fm_m0f);\
1116     out1_v    = _mm_and_si128(out1_v, cfg->fm_m0f);\
1117     \
1118     out1_v    = _mm_cmpeq_epi8(out1_v, c_v);\
1119     out2_v    = _mm_cmpeq_epi8(out2_v, c_v);\
1120   } while (0)
1121 
1122 
1123 /* Macro for SSE operations that turns a vector of 4-bit character values into
1124  * 2 vectors representing matches. Each byte in the input vector consists of
1125  * a left half (4 bits) and a right half (4 bits). The 16 left-halves produce
1126  * one vector, which contains all-1s for bytes in which the left half is less than
1127  * the c_v character (and 0s if it doesn't), while the 16 right-halves produce
1128  * the other vector, again with each byte either all-1s or all-0s.
1129  *
1130  * The expectation is that FM_COUNT_4BIT will be called after this, to
1131  * turn these binary values into sums over a series of vectors. The macros
1132  * are split up to allow one end or other to be trimmed in the case that
1133  * counting is not expected to include the full vector.
1134  *
1135  * srli(4)+and() : capture the left 4-bit value   (need the mask because 16-bit shift leaves garbage in left-4-bit chunks)
1136  * and()         : capture the right 4-bit value
1137  *
1138  * cmplt()x2     : test if both left and right < c.  For each, if <c , value = 11111111 (-1)
1139  */
1140 #define FM_LT_4BIT(in_v, c_v, out1_v, out2_v) do {\
1141     out1_v    = _mm_srli_epi16(in_v, 4);\
1142     out2_v    = _mm_and_si128(in_v, cfg->fm_m0f);\
1143     out1_v    = _mm_and_si128(out1_v, cfg->fm_m0f);\
1144     \
1145     out1_v    = _mm_cmplt_epi8(out1_v, c_v);\
1146     out2_v    = _mm_cmplt_epi8(out2_v, c_v);\
1147   } while (0)
1148 
1149 
1150 
1151 /* Macro for SSE operations to add occurrence counts to the tally vector counts_v,
1152  * in the 4-bits-per-character case
1153  *
1154  * The expectation is that in[12]_v will contain bytes that are either
1155  *   00000000  =  0
1156  *  or
1157  *   11111111  = -1
1158  * so subtracting the value of the byte is the same as adding 0 or 1.
1159  */
1160 #define FM_COUNT_4BIT(in1_v, in2_v, cnts_v) do {\
1161     cnts_v = _mm_subs_epi8(cnts_v, in1_v);\
1162     cnts_v = _mm_subs_epi8(cnts_v, in2_v);\
1163   } while (0)
1164 
1165 
1166 #endif  // if  defined (eslENABLE_SSE)
1167 
1168 /*****************************************************************
1169  * 16. P7_PIPELINE: H3's accelerated seq/profile comparison pipeline
1170  *****************************************************************/
1171 
1172 enum p7_pipemodes_e { p7_SEARCH_SEQS = 0, p7_SCAN_MODELS = 1 };
1173 enum p7_zsetby_e    { p7_ZSETBY_NTARGETS = 0, p7_ZSETBY_OPTION = 1, p7_ZSETBY_FILEINFO = 2 };
1174 enum p7_complementarity_e { p7_NOCOMPLEMENT    = 0, p7_COMPLEMENT   = 1 };
1175 
1176 typedef struct p7_pipeline_s {
1177   /* Dynamic programming matrices                                           */
1178   P7_OMX     *oxf;		/* one-row Forward matrix, accel pipe       */
1179   P7_OMX     *oxb;		/* one-row Backward matrix, accel pipe      */
1180   P7_OMX     *fwd;		/* full Fwd matrix for domain envelopes     */
1181   P7_OMX     *bck;		/* full Bck matrix for domain envelopes     */
1182 
1183   /* Domain postprocessing                                                  */
1184   ESL_RANDOMNESS *r;		/* random number generator                  */
1185   int             do_reseeding; /* TRUE: reseed for reproducible results    */
1186   int             do_alignment_score_calc;
1187   P7_DOMAINDEF   *ddef;		/* domain definition workflow               */
1188 
1189 
1190   /* Reporting threshold settings                                           */
1191   int     by_E;		        /* TRUE to cut per-target report off by E   */
1192   double  E;	                /* per-target E-value threshold             */
1193   double  T;	                /* per-target bit score threshold           */
1194   int     dom_by_E;             /* TRUE to cut domain reporting off by E    */
1195   double  domE;	                /* domain E-value threshold                 */
1196   double  domT;	                /* domain bit score threshold               */
1197   int     use_bit_cutoffs;      /* (FALSE | p7H_GA | p7H_TC | p7H_NC)       */
1198 
1199   /* Inclusion threshold settings                                           */
1200   int     inc_by_E;		/* TRUE to threshold inclusion by E-values  */
1201   double  incE;			/* per-target inclusion E-value threshold   */
1202   double  incT;			/* per-target inclusion score threshold     */
1203   int     incdom_by_E;		/* TRUE to threshold domain inclusion by E  */
1204   double  incdomE;		/* per-domain inclusion E-value threshold   */
1205   double  incdomT;		/* per-domain inclusion E-value threshold   */
1206 
1207   /* Tracking search space sizes for E value calculations                   */
1208   double  Z;			/* eff # targs searched (per-target E-val)  */
1209   double  domZ;			/* eff # signific targs (per-domain E-val)  */
1210   enum p7_zsetby_e Z_setby;   	/* how Z was set                            */
1211   enum p7_zsetby_e domZ_setby;	/* how domZ was set                         */
1212 
1213   /* Threshold settings for pipeline                                        */
1214   int     do_max;	        /* TRUE to run in slow/max mode             */
1215   double  F1;		        /* MSV filter threshold                     */
1216   double  F2;		        /* Viterbi filter threshold                 */
1217   double  F3;		        /* uncorrected Forward filter threshold     */
1218   int     B1;               /* window length for biased-composition modifier - MSV*/
1219   int     B2;               /* window length for biased-composition modifier - Viterbi*/
1220   int     B3;               /* window length for biased-composition modifier - Forward*/
1221   int     do_biasfilter;	/* TRUE to use biased comp HMM filter       */
1222   int     do_null2;		/* TRUE to use null2 score corrections      */
1223 
1224   /* Accounting. (reduceable in threaded/MPI parallel version)              */
1225   uint64_t      nmodels;        /* # of HMMs searched                       */
1226   uint64_t      nseqs;	        /* # of sequences searched                  */
1227   uint64_t      nres;	        /* # of residues searched                   */
1228   uint64_t      nnodes;	        /* # of model nodes searched                */
1229   uint64_t      n_past_msv;	/* # comparisons that pass MSVFilter()      */
1230   uint64_t      n_past_bias;	/* # comparisons that pass bias filter      */
1231   uint64_t      n_past_vit;	/* # comparisons that pass ViterbiFilter()  */
1232   uint64_t      n_past_fwd;	/* # comparisons that pass ForwardFilter()  */
1233   uint64_t      n_output;	    /* # alignments that make it to the final output (used for nhmmer) */
1234   uint64_t      pos_past_msv;	/* # positions that pass MSVFilter()  (used for nhmmer) */
1235   uint64_t      pos_past_bias;	/* # positions that pass bias filter  (used for nhmmer) */
1236   uint64_t      pos_past_vit;	/* # positions that pass ViterbiFilter()  (used for nhmmer) */
1237   uint64_t      pos_past_fwd;	/* # positions that pass ForwardFilter()  (used for nhmmer) */
1238   uint64_t      pos_output;	    /* # positions that make it to the final output (used for nhmmer) */
1239 
1240   enum p7_pipemodes_e mode;    	/* p7_SCAN_MODELS | p7_SEARCH_SEQS          */
1241   int           long_targets;   /* TRUE if the target sequences are expected to be very long (e.g. dna chromosome search in nhmmer) */
1242   int           strands;         /*  p7_STRAND_TOPONLY  | p7_STRAND_BOTTOMONLY |  p7_STRAND_BOTH */
1243   int 		    	W;              /* window length for nhmmer scan - essentially maximum length of model that we expect to find*/
1244   int           block_length;   /* length of overlapping blocks read in the multi-threaded variant (default MAX_RESIDUE_COUNT) */
1245 
1246   int           show_accessions;/* TRUE to output accessions not names      */
1247   int           show_alignments;/* TRUE to output alignments (default)      */
1248 
1249   P7_HMMFILE   *hfp;		/* COPY of open HMM database (if scan mode) */
1250   char          errbuf[eslERRBUFSIZE];
1251 } P7_PIPELINE;
1252 
1253 
1254 
1255 /*****************************************************************
1256  * 17. P7_BUILDER: pipeline for new HMM construction
1257  *****************************************************************/
1258 
1259 #define p7_DEFAULT_WINDOW_BETA  1e-7
1260 
1261 enum p7_archchoice_e { p7_ARCH_FAST = 0, p7_ARCH_HAND = 1 };
1262 enum p7_wgtchoice_e  { p7_WGT_NONE  = 0, p7_WGT_GIVEN = 1, p7_WGT_GSC    = 2, p7_WGT_PB       = 3, p7_WGT_BLOSUM = 4 };
1263 enum p7_effnchoice_e { p7_EFFN_NONE = 0, p7_EFFN_SET  = 1, p7_EFFN_CLUST = 2, p7_EFFN_ENTROPY = 3, p7_EFFN_ENTROPY_EXP = 4 };
1264 
1265 typedef struct p7_builder_s {
1266   /* Model architecture                                                                            */
1267   enum p7_archchoice_e arch_strategy;    /* choice of model architecture determination algorithm   */
1268   float                symfrac;	         /* residue occ thresh for fast architecture determination */
1269   float                fragthresh;	 /* if L <= fragthresh*alen, seq is called a fragment      */
1270 
1271   /* Relative sequence weights                                                                     */
1272   enum p7_wgtchoice_e  wgt_strategy;     /* choice of relative sequence weighting algorithm        */
1273   double               wid;		 /* %id threshold for BLOSUM relative weighting            */
1274 
1275   /* Effective sequence number                                                                     */
1276   enum p7_effnchoice_e effn_strategy;    /* choice of effective seq # determination algorithm      */
1277   double               re_target;	 /* rel entropy target for effn eweighting, if set; or -1.0*/
1278   double               esigma;		 /* min total rel ent parameter for effn entropy weights   */
1279   double               eid;		 /* %id threshold for effn clustering                      */
1280   double               eset;		 /* effective sequence number, if --eset; or -1.0          */
1281 
1282   /* Run-to-run variation due to random number generation                                          */
1283   ESL_RANDOMNESS      *r;	         /* RNG for E-value calibration simulations                */
1284   int                  do_reseeding;	 /* TRUE to reseed, making results reproducible            */
1285 
1286   /* E-value parameter calibration                                                                 */
1287   int                  EmL;            	 /* length of sequences generated for MSV fitting          */
1288   int                  EmN;	         /* # of sequences generated for MSV fitting               */
1289   int                  EvL;            	 /* length of sequences generated for Viterbi fitting      */
1290   int                  EvN;	         /* # of sequences generated for Viterbi fitting           */
1291   int                  EfL;	         /* length of sequences generated for Forward fitting      */
1292   int                  EfN;	         /* # of sequences generated for Forward fitting           */
1293   double               Eft;	         /* tail mass used for Forward fitting                     */
1294 
1295   /* Choice of prior                                                                               */
1296   P7_PRIOR            *prior;	         /* choice of prior when parameterizing from counts        */
1297   int                  max_insert_len;
1298 
1299   /* Optional: information used for parameterizing single sequence queries                         */
1300   ESL_SCOREMATRIX     *S;		 /* residue score matrix                                   */
1301   ESL_DMATRIX         *Q;	         /* Q->mx[a][b] = P(b|a) residue probabilities             */
1302   double               popen;         	 /* gap open probability                                   */
1303   double               pextend;          /* gap extend probability                                 */
1304 
1305   double               w_beta;    /*beta value used to compute W (window length)   */
1306   int                  w_len;     /*W (window length)  explicitly set */
1307 
1308   const ESL_ALPHABET  *abc;		 /* COPY of alphabet                                       */
1309   char errbuf[eslERRBUFSIZE];            /* informative message on model construction failure      */
1310 } P7_BUILDER;
1311 
1312 
1313 
1314 /*****************************************************************
1315  * 18. Routines in HMMER's exposed API.
1316  *****************************************************************/
1317 
1318 /* build.c */
1319 extern int p7_Handmodelmaker(ESL_MSA *msa,                P7_BUILDER *bld, P7_HMM **ret_hmm, P7_TRACE ***ret_tr);
1320 extern int p7_Fastmodelmaker(ESL_MSA *msa, float symfrac, P7_BUILDER *bld, P7_HMM **ret_hmm, P7_TRACE ***ret_tr);
1321 
1322 /* emit.c */
1323 extern int p7_CoreEmit   (ESL_RANDOMNESS *r, const P7_HMM *hmm,                                        ESL_SQ *sq, P7_TRACE *tr);
1324 extern int p7_ProfileEmit(ESL_RANDOMNESS *r, const P7_HMM *hmm, const P7_PROFILE *gm, const P7_BG *bg, ESL_SQ *sq, P7_TRACE *tr);
1325 extern int p7_emit_SimpleConsensus(const P7_HMM *hmm, ESL_SQ *sq);
1326 extern int p7_emit_FancyConsensus (const P7_HMM *hmm, float min_lower, float min_upper, ESL_SQ *sq);
1327 
1328 /* errors.c */
1329 extern void p7_Die (char *format, ...);
1330 extern void p7_Fail(char *format, ...);
1331 
1332 /* evalues.c */
1333 extern int p7_Calibrate(P7_HMM *hmm, P7_BUILDER *cfg_b, ESL_RANDOMNESS **byp_rng, P7_BG **byp_bg, P7_PROFILE **byp_gm, P7_OPROFILE **byp_om);
1334 extern int p7_Lambda(P7_HMM *hmm, P7_BG *bg, double *ret_lambda);
1335 extern int p7_MSVMu     (ESL_RANDOMNESS *r, P7_OPROFILE *om, P7_BG *bg, int L, int N, double lambda,               double *ret_mmu);
1336 extern int p7_ViterbiMu (ESL_RANDOMNESS *r, P7_OPROFILE *om, P7_BG *bg, int L, int N, double lambda,               double *ret_vmu);
1337 extern int p7_Tau       (ESL_RANDOMNESS *r, P7_OPROFILE *om, P7_BG *bg, int L, int N, double lambda, double tailp, double *ret_tau);
1338 
1339 /* eweight.c */
1340 extern int p7_EntropyWeight(const P7_HMM *hmm, const P7_BG *bg, const P7_PRIOR *pri, double infotarget, double *ret_Neff);
1341 
1342 extern int p7_EntropyWeight_exp(const P7_HMM *hmm, const P7_BG *bg, const P7_PRIOR *pri, double etarget, double *ret_exp);
1343 /* generic_decoding.c */
1344 extern int p7_GDecoding      (const P7_PROFILE *gm, const P7_GMX *fwd,       P7_GMX *bck, P7_GMX *pp);
1345 extern int p7_GDomainDecoding(const P7_PROFILE *gm, const P7_GMX *fwd, const P7_GMX *bck, P7_DOMAINDEF *ddef);
1346 
1347 /* generic_fwdback.c */
1348 extern int p7_GForward     (const ESL_DSQ *dsq, int L, const P7_PROFILE *gm,       P7_GMX *gx, float *ret_sc);
1349 extern int p7_GBackward    (const ESL_DSQ *dsq, int L, const P7_PROFILE *gm,       P7_GMX *gx, float *ret_sc);
1350 extern int p7_GHybrid      (const ESL_DSQ *dsq, int L, const P7_PROFILE *gm,       P7_GMX *gx, float *opt_fwdscore, float *opt_hybscore);
1351 
1352 /* generic_msv.c */
1353 extern int p7_GMSV           (const ESL_DSQ *dsq, int L, const P7_PROFILE *gm, P7_GMX *gx, float nu, float *ret_sc);
1354 extern int p7_GMSV_longtarget(const ESL_DSQ *dsq, int L, P7_PROFILE *gm, P7_GMX *gx, float nu,  P7_BG *bg, double P, P7_HMM_WINDOWLIST *windowlist);
1355 
1356 /* generic_null2.c */
1357 extern int p7_GNull2_ByExpectation(const P7_PROFILE *gm, P7_GMX *pp, float *null2);
1358 extern int p7_GNull2_ByTrace      (const P7_PROFILE *gm, const P7_TRACE *tr, int zstart, int zend, P7_GMX *wrk, float *null2);
1359 
1360 /* generic_optacc.c */
1361 extern int p7_GOptimalAccuracy(const P7_PROFILE *gm, const P7_GMX *pp,       P7_GMX *gx, float *ret_e);
1362 extern int p7_GOATrace        (const P7_PROFILE *gm, const P7_GMX *pp, const P7_GMX *gx, P7_TRACE *tr);
1363 
1364 /* generic_stotrace.c */
1365 extern int p7_GStochasticTrace(ESL_RANDOMNESS *r, const ESL_DSQ *dsq, int L, const P7_PROFILE *gm, const P7_GMX *gx, P7_TRACE *tr);
1366 
1367 /* generic_viterbi.c */
1368 extern int p7_GViterbi     (const ESL_DSQ *dsq, int L, const P7_PROFILE *gm,       P7_GMX *gx, float *ret_sc);
1369 
1370 /* generic_vtrace.c */
1371 extern int p7_GTrace       (const ESL_DSQ *dsq, int L, const P7_PROFILE *gm, const P7_GMX *gx, P7_TRACE *tr);
1372 extern int p7_GViterbi_longtarget(const ESL_DSQ *dsq, int L, const P7_PROFILE *gm, P7_GMX *gx,
1373                        float filtersc, double P, P7_HMM_WINDOWLIST *windowlist);
1374 
1375 
1376 /* heatmap.c (evolving now, intend to move this to Easel in the future) */
1377 extern double dmx_upper_max(ESL_DMATRIX *D);
1378 extern double dmx_upper_min(ESL_DMATRIX *D);
1379 extern double dmx_upper_element_sum(ESL_DMATRIX *D);
1380 extern double dmx_upper_norm(ESL_DMATRIX *D);
1381 extern int    dmx_Visualize(FILE *fp, ESL_DMATRIX *D, double min, double max);
1382 
1383 /* hmmdutils.c */
1384 extern void p7_openlog(const char *ident, int option, int facility);
1385 extern void p7_syslog(int priority, const char *format, ...);
1386 extern void p7_closelog(void);
1387 
1388 /* hmmlogo.c */
1389 extern float hmmlogo_maxHeight (P7_BG *bg);
1390 extern int hmmlogo_RelativeEntropy_all      (P7_HMM *hmm, P7_BG *bg, float *rel_ents, float **probs, float **heights );
1391 extern int hmmlogo_RelativeEntropy_above_bg (P7_HMM *hmm, P7_BG *bg, float *rel_ents, float **probs, float **heights );
1392 extern int hmmlogo_ScoreHeights (P7_HMM *hmm, P7_BG *bg, float **heights );
1393 extern int hmmlogo_IndelValues (P7_HMM *hmm, float *insert_P, float *insert_expL, float *delete_P );
1394 
1395 
1396 
1397 /* hmmpgmd2msa.c */
1398 extern int hmmpgmd2msa(void *data, P7_HMM *hmm, ESL_SQ *qsq, int *incl, int incl_size, int *excl, int excl_size, int excl_all, ESL_MSA **ret_msa);
1399 
1400 
1401 
1402 /* island.c */
1403 extern int   p7_island_Viterbi(ESL_DSQ *dsq, int L, P7_PROFILE *gm, P7_GMX *mx, ESL_HISTOGRAM *h);
1404 
1405 /* h2_io.c */
1406 extern int   p7_h2io_WriteASCII(FILE *fp, P7_HMM *hmm);
1407 
1408 /* hmmer.c */
1409 extern void         p7_banner(FILE *fp, const char *progname, char *banner);
1410 extern ESL_GETOPTS *p7_CreateDefaultApp(ESL_OPTIONS *options, int nargs, int argc, char **argv, char *banner, char *usage);
1411 extern int          p7_AminoFrequencies(float *f);
1412 
1413 /* logsum.c */
1414 extern int   p7_FLogsumInit(void);
1415 extern float p7_FLogsum(float a, float b);
1416 extern float p7_FLogsumError(float a, float b);
1417 extern int   p7_ILogsumInit(void);
1418 extern int   p7_ILogsum(int s1, int s2);
1419 
1420 
1421 /* modelconfig.c */
1422 extern int p7_ProfileConfig(const P7_HMM *hmm, const P7_BG *bg, P7_PROFILE *gm, int L, int mode);
1423 extern int p7_ReconfigLength  (P7_PROFILE *gm, int L);
1424 extern int p7_ReconfigMultihit(P7_PROFILE *gm, int L);
1425 extern int p7_ReconfigUnihit  (P7_PROFILE *gm, int L);
1426 
1427 /* modelstats.c */
1428 extern double p7_MeanMatchInfo           (const P7_HMM *hmm, const P7_BG *bg);
1429 extern double p7_MeanMatchEntropy        (const P7_HMM *hmm);
1430 extern double p7_MeanMatchRelativeEntropy(const P7_HMM *hmm, const P7_BG *bg);
1431 extern double p7_MeanForwardScore        (const P7_HMM *hmm, const P7_BG *bg);
1432 extern int    p7_MeanPositionRelativeEntropy(const P7_HMM *hmm, const P7_BG *bg, double *ret_entropy);
1433 extern int    p7_hmm_CompositionKLD(const P7_HMM *hmm, const P7_BG *bg, float *ret_KL, float **opt_avp);
1434 
1435 
1436 /* mpisupport.c */
1437 #ifdef HMMER_MPI
1438 extern int p7_hmm_MPISend(P7_HMM *hmm, int dest, int tag, MPI_Comm comm, char **buf, int *nalloc);
1439 extern int p7_hmm_MPIPackSize(P7_HMM *hmm, MPI_Comm comm, int *ret_n);
1440 extern int p7_hmm_MPIPack(P7_HMM *hmm, char *buf, int n, int *position, MPI_Comm comm);
1441 extern int p7_hmm_MPIUnpack(char *buf, int n, int *pos, MPI_Comm comm, ESL_ALPHABET **abc, P7_HMM **ret_hmm);
1442 extern int p7_hmm_MPIRecv(int source, int tag, MPI_Comm comm, char **buf, int *nalloc, ESL_ALPHABET **abc, P7_HMM **ret_hmm);
1443 
1444 extern int p7_profile_MPISend(P7_PROFILE *gm, int dest, int tag, MPI_Comm comm, char **buf, int *nalloc);
1445 extern int p7_profile_MPIRecv(int source, int tag, MPI_Comm comm, const ESL_ALPHABET *abc, const P7_BG *bg,
1446 			      char **buf, int *nalloc,  P7_PROFILE **ret_gm);
1447 
1448 extern int p7_pipeline_MPISend(P7_PIPELINE *pli, int dest, int tag, MPI_Comm comm, char **buf, int *nalloc);
1449 extern int p7_pipeline_MPIRecv(int source, int tag, MPI_Comm comm, char **buf, int *nalloc, ESL_GETOPTS *go, P7_PIPELINE **ret_pli);
1450 
1451 extern int p7_tophits_MPISend(P7_TOPHITS *th, int dest, int tag, MPI_Comm comm, char **buf, int *nalloc);
1452 extern int p7_tophits_MPIRecv(int source, int tag, MPI_Comm comm, char **buf, int *nalloc, P7_TOPHITS **ret_th);
1453 
1454 extern int p7_oprofile_MPISend(P7_OPROFILE *om, int dest, int tag, MPI_Comm comm, char **buf, int *nalloc);
1455 extern int p7_oprofile_MPIPackSize(P7_OPROFILE *om, MPI_Comm comm, int *ret_n);
1456 extern int p7_oprofile_MPIPack(P7_OPROFILE *om, char *buf, int n, int *pos, MPI_Comm comm);
1457 extern int p7_oprofile_MPIUnpack(char *buf, int n, int *pos, MPI_Comm comm, ESL_ALPHABET **abc, P7_OPROFILE **ret_om);
1458 extern int p7_oprofile_MPIRecv(int source, int tag, MPI_Comm comm, char **buf, int *nalloc, ESL_ALPHABET **abc, P7_OPROFILE **ret_om);
1459 #endif /*HMMER_MPI*/
1460 
1461 /* tracealign.c */
1462 extern int p7_tracealign_Seqs(ESL_SQ **sq,           P7_TRACE **tr, int nseq, int M,  int optflags, P7_HMM *hmm, ESL_MSA **ret_msa);
1463 extern int p7_tracealign_MSA (const ESL_MSA *premsa, P7_TRACE **tr,           int M,  int optflags, ESL_MSA **ret_postmsa);
1464 extern int p7_tracealign_computeTraces(P7_HMM *hmm, ESL_SQ  **sq, int offset, int N, P7_TRACE  **tr);
1465 extern int p7_tracealign_getMSAandStats(P7_HMM *hmm, ESL_SQ  **sq, int N, ESL_MSA **ret_msa, float **ret_pp, float **ret_relent, float **ret_scores );
1466 
1467 /* p7_alidisplay.c */
1468 extern P7_ALIDISPLAY *p7_alidisplay_Create(const P7_TRACE *tr, int which, const P7_OPROFILE *om, const ESL_SQ *sq, const ESL_SQ *ntsq);
1469 extern P7_ALIDISPLAY *p7_alidisplay_Create_empty();
1470 extern P7_ALIDISPLAY *p7_alidisplay_Clone(const P7_ALIDISPLAY *ad);
1471 extern size_t         p7_alidisplay_Sizeof(const P7_ALIDISPLAY *ad);
1472 extern int            p7_alidisplay_Serialize(const P7_ALIDISPLAY *obj, uint8_t **buf, uint32_t *n, uint32_t *nalloc);
1473 extern int            p7_alidisplay_Deserialize(const uint8_t *buf, uint32_t *n, P7_ALIDISPLAY *ret_obj);
1474 extern int            p7_alidisplay_Serialize_old(P7_ALIDISPLAY *ad);
1475 extern int            p7_alidisplay_Deserialize_old(P7_ALIDISPLAY *ad);
1476 extern void           p7_alidisplay_Destroy(P7_ALIDISPLAY *ad);
1477 extern char           p7_alidisplay_EncodePostProb(float p);
1478 extern float          p7_alidisplay_DecodePostProb(char pc);
1479 extern char           p7_alidisplay_EncodeAliPostProb(float p, float hi, float med, float lo);
1480 
1481 extern int            p7_alidisplay_Print(FILE *fp, P7_ALIDISPLAY *ad, int min_aliwidth, int linewidth, P7_PIPELINE *pli);
1482 extern int            p7_translated_alidisplay_Print(FILE *fp, P7_ALIDISPLAY *ad, int min_aliwidth, int linewidth, P7_PIPELINE *pli);
1483 extern int            p7_nontranslated_alidisplay_Print(FILE *fp, P7_ALIDISPLAY *ad, int min_aliwidth, int linewidth, int show_accessions);
1484 
1485 extern int            p7_alidisplay_Backconvert(const P7_ALIDISPLAY *ad, const ESL_ALPHABET *abc, ESL_SQ **ret_sq, P7_TRACE **ret_tr);
1486 extern int            p7_alidisplay_Sample(ESL_RANDOMNESS *rng, int N, P7_ALIDISPLAY **ret_ad);
1487 extern int            p7_alidisplay_Dump(FILE *fp, const P7_ALIDISPLAY *ad);
1488 extern int            p7_alidisplay_Compare(const P7_ALIDISPLAY *ad1, const P7_ALIDISPLAY *ad2);
1489 
1490 /* p7_bg.c */
1491 extern P7_BG *p7_bg_Create(const ESL_ALPHABET *abc);
1492 extern P7_BG *p7_bg_CreateUniform(const ESL_ALPHABET *abc);
1493 extern P7_BG *p7_bg_Clone(const P7_BG *bg);
1494 extern int    p7_bg_Dump(FILE *ofp, const P7_BG *bg);
1495 extern void   p7_bg_Destroy(P7_BG *bg);
1496 extern int    p7_bg_SetLength(P7_BG *bg, int L);
1497 extern int    p7_bg_NullOne(const P7_BG *bg, const ESL_DSQ *dsq, int L, float *ret_sc);
1498 
1499 extern int    p7_bg_Read(char *bgfile, P7_BG *bg, char *errbuf);
1500 extern int    p7_bg_Write(FILE *fp, P7_BG *bg);
1501 
1502 extern int    p7_bg_SetFilter  (P7_BG *bg, int M, const float *compo);
1503 extern int    p7_bg_FilterScore(P7_BG *bg, const ESL_DSQ *dsq, int L, float *ret_sc);
1504 
1505 /* p7_builder.c */
1506 extern P7_BUILDER *p7_builder_Create(const ESL_GETOPTS *go, const ESL_ALPHABET *abc);
1507 extern int         p7_builder_LoadScoreSystem(P7_BUILDER *bld, const char *matrix,                  double popen, double pextend, P7_BG *bg);
1508 extern int         p7_builder_SetScoreSystem (P7_BUILDER *bld, const char *mxfile, const char *env, double popen, double pextend, P7_BG *bg);
1509 extern void        p7_builder_Destroy(P7_BUILDER *bld);
1510 
1511 extern int p7_Builder      (P7_BUILDER *bld, ESL_MSA *msa, P7_BG *bg, P7_HMM **opt_hmm, P7_TRACE ***opt_trarr, P7_PROFILE **opt_gm, P7_OPROFILE **opt_om, ESL_MSA **opt_postmsa);
1512 extern int p7_SingleBuilder(P7_BUILDER *bld, ESL_SQ *sq,   P7_BG *bg, P7_HMM **opt_hmm, P7_TRACE  **opt_tr,    P7_PROFILE **opt_gm, P7_OPROFILE **opt_om);
1513 extern int p7_Builder_MaxLength      (P7_HMM *hmm, double emit_thresh);
1514 
1515 /* p7_domain.c */
1516 extern P7_DOMAIN *p7_domain_Create_empty();
1517 extern void p7_domain_Destroy(P7_DOMAIN *obj);
1518 extern int p7_domain_Serialize(const P7_DOMAIN *obj, uint8_t **buf, uint32_t *n, uint32_t *nalloc);
1519 extern int p7_domain_Deserialize(const uint8_t *buf, uint32_t *n, P7_DOMAIN *ret_obj);
1520 extern int p7_domain_TestSample(ESL_RAND64 *rng, P7_DOMAIN **ret_obj);
1521 extern int p7_domain_Compare(P7_DOMAIN *first, P7_DOMAIN *second, double atol, double rtol);
1522 
1523 /* p7_domaindef.c */
1524 extern P7_DOMAINDEF *p7_domaindef_Create (ESL_RANDOMNESS *r);
1525 extern int           p7_domaindef_Fetch  (P7_DOMAINDEF *ddef, int which, int *opt_i, int *opt_j, float *opt_sc, P7_ALIDISPLAY **opt_ad);
1526 extern int           p7_domaindef_GrowTo (P7_DOMAINDEF *ddef, int L);
1527 extern int           p7_domaindef_Reuse  (P7_DOMAINDEF *ddef);
1528 extern int           p7_domaindef_DumpPosteriors(FILE *ofp, P7_DOMAINDEF *ddef);
1529 extern void          p7_domaindef_Destroy(P7_DOMAINDEF *ddef);
1530 
1531 extern int p7_domaindef_ByViterbi            (P7_PROFILE *gm, const ESL_SQ *sq, const ESL_SQ *ntsq, P7_GMX *gx1, P7_GMX *gx2, P7_DOMAINDEF *ddef);
1532 extern int p7_domaindef_ByPosteriorHeuristics(const ESL_SQ *sq, const ESL_SQ *ntsq, P7_OPROFILE *om, P7_OMX *oxf, P7_OMX *oxb, P7_OMX *fwd, P7_OMX *bck,
1533 				                                  P7_DOMAINDEF *ddef, P7_BG *bg, int long_target,
1534 				                                  P7_BG *bg_tmp, float *scores_arr, float *fwd_emissions_arr);
1535 
1536 
1537 /* p7_gmx.c */
1538 extern P7_GMX *p7_gmx_Create (int allocM, int allocL);
1539 extern int     p7_gmx_GrowTo (P7_GMX *gx, int allocM, int allocL);
1540 extern size_t  p7_gmx_Sizeof (P7_GMX *gx);
1541 extern int     p7_gmx_Reuse  (P7_GMX *gx);
1542 extern void    p7_gmx_Destroy(P7_GMX *gx);
1543 extern int     p7_gmx_Compare(P7_GMX *gx1, P7_GMX *gx2, float tolerance);
1544 extern int     p7_gmx_Dump(FILE *fp, P7_GMX *gx, int flags);
1545 extern int     p7_gmx_DumpWindow(FILE *fp, P7_GMX *gx, int istart, int iend, int kstart, int kend, int show_specials);
1546 
1547 /* p7_hit.c */
1548 extern P7_HIT *p7_hit_Create_empty();
1549 extern void p7_hit_Destroy(P7_HIT *the_hit);
1550 extern int p7_hit_Serialize(const P7_HIT *obj, uint8_t **buf, uint32_t *n, uint32_t *nalloc);
1551 extern int p7_hit_Deserialize(const uint8_t *buf, uint32_t *n, P7_HIT *ret_obj);
1552 extern int p7_hit_TestSample(ESL_RAND64 *rng, P7_HIT **ret_obj);
1553 extern int p7_hit_Compare(P7_HIT *first, P7_HIT *second, double atol, double rtol);
1554 
1555 /* p7_hmm.c */
1556 /*      1. The P7_HMM object: allocation, initialization, destruction. */
1557 extern P7_HMM *p7_hmm_Create(int M, const ESL_ALPHABET *abc);
1558 extern P7_HMM *p7_hmm_CreateShell(void);
1559 extern int     p7_hmm_CreateBody(P7_HMM *hmm, int M, const ESL_ALPHABET *abc);
1560 extern void    p7_hmm_Destroy(P7_HMM *hmm);
1561 extern int     p7_hmm_CopyParameters(const P7_HMM *src, P7_HMM *dest);
1562 extern P7_HMM *p7_hmm_Clone(const P7_HMM *hmm);
1563 extern int     p7_hmm_Zero(P7_HMM *hmm);
1564 extern char    p7_hmm_EncodeStatetype(char *typestring);
1565 extern char   *p7_hmm_DecodeStatetype(char st);
1566 /*      2. Convenience routines for setting fields in an HMM. */
1567 extern int     p7_hmm_SetName       (P7_HMM *hmm, char *name);
1568 extern int     p7_hmm_SetAccession  (P7_HMM *hmm, char *acc);
1569 extern int     p7_hmm_SetDescription(P7_HMM *hmm, char *desc);
1570 extern int     p7_hmm_AppendComlog  (P7_HMM *hmm, int argc, char **argv);
1571 extern int     p7_hmm_SetCtime      (P7_HMM *hmm);
1572 extern int     p7_hmm_SetComposition(P7_HMM *hmm);
1573 extern int     p7_hmm_SetConsensus  (P7_HMM *hmm, ESL_SQ *sq);
1574 /*      3. Renormalization and rescaling counts in core HMMs. */
1575 extern int     p7_hmm_Scale      (P7_HMM *hmm, double scale);
1576 extern int     p7_hmm_ScaleExponential(P7_HMM *hmm, double exp);
1577 extern int     p7_hmm_Renormalize(P7_HMM *hmm);
1578 /*      4. Debugging and development code. */
1579 extern int     p7_hmm_Dump(FILE *fp, P7_HMM *hmm);
1580 extern int     p7_hmm_Sample          (ESL_RANDOMNESS *r, int M, const ESL_ALPHABET *abc, P7_HMM **ret_hmm);
1581 extern int     p7_hmm_SampleUngapped  (ESL_RANDOMNESS *r, int M, const ESL_ALPHABET *abc, P7_HMM **ret_hmm);
1582 extern int     p7_hmm_SampleEnumerable(ESL_RANDOMNESS *r, int M, const ESL_ALPHABET *abc, P7_HMM **ret_hmm);
1583 extern int     p7_hmm_SampleUniform   (ESL_RANDOMNESS *r, int M, const ESL_ALPHABET *abc,
1584 				     float tmi, float tii, float tmd, float tdd,  P7_HMM **ret_hmm);
1585 extern int     p7_hmm_Compare(P7_HMM *h1, P7_HMM *h2, float tol);
1586 extern int     p7_hmm_Validate(P7_HMM *hmm, char *errbuf, float tol);
1587 /*      5. Other routines in the API */
1588 extern int     p7_hmm_CalculateOccupancy(const P7_HMM *hmm, float *mocc, float *iocc);
1589 
1590 
1591 
1592 /* p7_hmmfile.c */
1593 extern int  p7_hmmfile_OpenE    (const char *filename, char *env, P7_HMMFILE **ret_hfp, char *errbuf);
1594 extern int  p7_hmmfile_OpenENoDB(const char *filename, char *env, P7_HMMFILE **ret_hfp, char *errbuf);
1595 extern int  p7_hmmfile_Open     (const char *filename, char *env, P7_HMMFILE **ret_hfp); /* deprecated */
1596 extern int  p7_hmmfile_OpenNoDB (const char *filename, char *env, P7_HMMFILE **ret_hfp); /* deprecated */
1597 extern int  p7_hmmfile_OpenBuffer(const char *buffer, int size, P7_HMMFILE **ret_hfp);
1598 extern void p7_hmmfile_Close(P7_HMMFILE *hfp);
1599 #ifdef HMMER_THREADS
1600 extern int  p7_hmmfile_CreateLock(P7_HMMFILE *hfp);
1601 #endif
1602 extern int  p7_hmmfile_WriteBinary(FILE *fp, int format, P7_HMM *hmm);
1603 extern int  p7_hmmfile_WriteASCII (FILE *fp, int format, P7_HMM *hmm);
1604 extern int  p7_hmmfile_WriteToString (char **s, int format, P7_HMM *hmm);
1605 extern int  p7_hmmfile_Read(P7_HMMFILE *hfp, ESL_ALPHABET **ret_abc,  P7_HMM **opt_hmm);
1606 extern int  p7_hmmfile_PositionByKey(P7_HMMFILE *hfp, const char *key);
1607 extern int  p7_hmmfile_Position(P7_HMMFILE *hfp, const off_t offset);
1608 
1609 
1610 /* p7_hmmwindow.c */
1611 int p7_hmmwindow_init (P7_HMM_WINDOWLIST *list);
1612 P7_HMM_WINDOW *p7_hmmwindow_new (P7_HMM_WINDOWLIST *list, uint32_t id, uint32_t pos, uint32_t fm_pos, uint16_t k, uint32_t length, float score, uint8_t complementarity, uint32_t target_len);
1613 
1614 
1615 
1616 /* p7_msvdata.c */
1617 extern P7_SCOREDATA   *p7_hmm_ScoreDataCreate(P7_OPROFILE *om, P7_PROFILE *gm );
1618 extern P7_SCOREDATA   *p7_hmm_ScoreDataClone(P7_SCOREDATA *src, int K);
1619 extern int            p7_hmm_ScoreDataComputeRest(P7_OPROFILE *om, P7_SCOREDATA *data );
1620 extern void           p7_hmm_ScoreDataDestroy( P7_SCOREDATA *data );
1621 extern int            p7_hmm_initWindows (P7_HMM_WINDOWLIST *list);
1622 extern P7_HMM_WINDOW *p7_hmm_newWindow (P7_HMM_WINDOWLIST *list, uint32_t id, uint32_t pos, uint32_t fm_pos, uint16_t k, uint32_t length, float score, uint8_t complementarity);
1623 
1624 
1625 
1626 /* p7_null3.c */
1627 extern void p7_null3_score(const ESL_ALPHABET *abc, const ESL_DSQ *dsq, P7_TRACE *tr, int start, int stop, P7_BG *bg, float *ret_sc);
1628 extern void p7_null3_windowed_score(const ESL_ALPHABET *abc, const ESL_DSQ *dsq, int start, int stop, P7_BG *bg, float *ret_sc);
1629 
1630 /* p7_pipeline.c */
1631 extern P7_PIPELINE *p7_pipeline_Create(const ESL_GETOPTS *go, int M_hint, int L_hint, int long_targets, enum p7_pipemodes_e mode);
1632 extern int          p7_pipeline_Reuse  (P7_PIPELINE *pli);
1633 extern void         p7_pipeline_Destroy(P7_PIPELINE *pli);
1634 extern int          p7_pipeline_Merge  (P7_PIPELINE *p1, P7_PIPELINE *p2);
1635 
1636 extern int p7_pli_ExtendAndMergeWindows (P7_OPROFILE *om, const P7_SCOREDATA *msvdata, P7_HMM_WINDOWLIST *windowlist, float pct_overlap);
1637 extern int p7_pli_TargetReportable  (P7_PIPELINE *pli, float score,     double lnP);
1638 extern int p7_pli_DomainReportable  (P7_PIPELINE *pli, float dom_score, double lnP);
1639 
1640 extern int p7_pli_TargetIncludable  (P7_PIPELINE *pli, float score,     double lnP);
1641 extern int p7_pli_DomainIncludable  (P7_PIPELINE *pli, float dom_score, double lnP);
1642 extern int p7_pli_NewModel          (P7_PIPELINE *pli, const P7_OPROFILE *om, P7_BG *bg);
1643 extern int p7_pli_NewModelThresholds(P7_PIPELINE *pli, const P7_OPROFILE *om);
1644 extern int p7_pli_NewSeq            (P7_PIPELINE *pli, const ESL_SQ *sq);
1645 extern int p7_Pipeline              (P7_PIPELINE *pli, P7_OPROFILE *om, P7_BG *bg, const ESL_SQ *sq, const ESL_SQ *ntsq, P7_TOPHITS *th);
1646 extern int p7_Pipeline_LongTarget   (P7_PIPELINE *pli, P7_OPROFILE *om, P7_SCOREDATA *data,
1647                                      P7_BG *bg, P7_TOPHITS *hitlist, int64_t seqidx,
1648                                      const ESL_SQ *sq, int complementarity,
1649                                      const FM_DATA *fmf, const FM_DATA *fmb, FM_CFG *fm_cfg
1650                                      );
1651 
1652 
1653 
1654 extern int p7_pli_Statistics(FILE *ofp, P7_PIPELINE *pli, ESL_STOPWATCH *w);
1655 
1656 
1657 /* p7_prior.c */
1658 extern P7_PRIOR  *p7_prior_CreateAmino(void);
1659 extern P7_PRIOR  *p7_prior_CreateNucleic(void);
1660 extern P7_PRIOR  *p7_prior_CreateLaplace(const ESL_ALPHABET *abc);
1661 extern void       p7_prior_Destroy(P7_PRIOR *pri);
1662 
1663 extern int        p7_ParameterEstimation(P7_HMM *hmm, const P7_PRIOR *pri);
1664 
1665 /* p7_profile.c */
1666 extern P7_PROFILE *p7_profile_Create(int M, const ESL_ALPHABET *abc);
1667 extern P7_PROFILE *p7_profile_Clone(const P7_PROFILE *gm);
1668 extern int         p7_profile_Copy(const P7_PROFILE *src, P7_PROFILE *dst);
1669 extern int         p7_profile_SetNullEmissions(P7_PROFILE *gm);
1670 extern int         p7_profile_Reuse(P7_PROFILE *gm);
1671 extern size_t      p7_profile_Sizeof(P7_PROFILE *gm);
1672 extern void        p7_profile_Destroy(P7_PROFILE *gm);
1673 extern int         p7_profile_IsLocal(const P7_PROFILE *gm);
1674 extern int         p7_profile_IsMultihit(const P7_PROFILE *gm);
1675 extern int         p7_profile_GetT(const P7_PROFILE *gm, char st1, int k1,
1676 				   char st2, int k2, float *ret_tsc);
1677 extern int         p7_profile_Validate(const P7_PROFILE *gm, char *errbuf, float tol);
1678 extern int         p7_profile_Compare(P7_PROFILE *gm1, P7_PROFILE *gm2, float tol);
1679 
1680 /* p7_spensemble.c */
1681 P7_SPENSEMBLE *p7_spensemble_Create(int init_n, int init_epc, int init_sigc);
1682 extern int     p7_spensemble_Reuse(P7_SPENSEMBLE *sp);
1683 extern int     p7_spensemble_Add(P7_SPENSEMBLE *sp, int sampleidx, int i, int j, int k, int m);
1684 extern int     p7_spensemble_Cluster(P7_SPENSEMBLE *sp,
1685 				     float min_overlap, int of_smaller, int max_diagdiff,
1686 				     float min_posterior, float min_endpointp,
1687 				     int *ret_nclusters);
1688 extern int     p7_spensemble_GetClusterCoords(P7_SPENSEMBLE *sp, int which,
1689 					      int *ret_i, int *ret_j, int *ret_k, int *ret_m, float *ret_p);
1690 extern void    p7_spensemble_Destroy(P7_SPENSEMBLE *sp);
1691 
1692 /* p7_tophits.c */
1693 extern P7_TOPHITS *p7_tophits_Create(void);
1694 extern int         p7_tophits_Grow(P7_TOPHITS *h);
1695 extern int         p7_tophits_CreateNextHit(P7_TOPHITS *h, P7_HIT **ret_hit);
1696 extern int         p7_tophits_Add(P7_TOPHITS *h,
1697 				  char *name, char *acc, char *desc,
1698 				  double sortkey,
1699 				  float score,    double lnP,
1700 				  float mothersc, double mother_lnP,
1701 				  int sqfrom, int sqto, int sqlen,
1702 				  int hmmfrom, int hmmto, int hmmlen,
1703 				  int domidx, int ndom,
1704 				  P7_ALIDISPLAY *ali);
1705 extern int         p7_tophits_SortBySortkey(P7_TOPHITS *h);
1706 extern int         p7_tophits_SortBySeqidxAndAlipos(P7_TOPHITS *h);
1707 extern int         p7_tophits_SortByModelnameAndAlipos(P7_TOPHITS *h);
1708 
1709 extern int         p7_tophits_Merge(P7_TOPHITS *h1, P7_TOPHITS *h2);
1710 extern int         p7_tophits_GetMaxPositionLength(P7_TOPHITS *h);
1711 extern int         p7_tophits_GetMaxNameLength(P7_TOPHITS *h);
1712 extern int         p7_tophits_GetMaxAccessionLength(P7_TOPHITS *h);
1713 extern int         p7_tophits_GetMaxShownLength(P7_TOPHITS *h);
1714 extern void        p7_tophits_Destroy(P7_TOPHITS *h);
1715 
1716 extern int p7_tophits_ComputeNhmmerEvalues(P7_TOPHITS *th, double N, int W);
1717 extern int p7_tophits_RemoveDuplicates(P7_TOPHITS *th, int using_bit_cutoffs);
1718 extern int p7_tophits_Threshold(P7_TOPHITS *th, P7_PIPELINE *pli);
1719 extern int p7_tophits_CompareRanking(P7_TOPHITS *th, ESL_KEYHASH *kh, int *opt_nnew);
1720 extern int p7_tophits_Targets(FILE *ofp, P7_TOPHITS *th, P7_PIPELINE *pli, int textw);
1721 extern int p7_tophits_Domains(FILE *ofp, P7_TOPHITS *th, P7_PIPELINE *pli, int textw);
1722 
1723 
1724 extern int p7_tophits_Alignment(const P7_TOPHITS *th, const ESL_ALPHABET *abc,
1725 				ESL_SQ **inc_sqarr, P7_TRACE **inc_trarr, int inc_n, int optflags,
1726 				ESL_MSA **ret_msa);
1727 extern int p7_tophits_TabularTargets(FILE *ofp, char *qname, char *qacc, P7_TOPHITS *th, P7_PIPELINE *pli, int show_header);
1728 extern int p7_tophits_TabularDomains(FILE *ofp, char *qname, char *qacc, P7_TOPHITS *th, P7_PIPELINE *pli, int show_header);
1729 extern int p7_tophits_TabularXfam(FILE *ofp, char *qname, char *qacc, P7_TOPHITS *th, P7_PIPELINE *pli);
1730 extern int p7_tophits_TabularTail(FILE *ofp, const char *progname, enum p7_pipemodes_e pipemode,
1731 				  const char *qfile, const char *tfile, const ESL_GETOPTS *go);
1732 extern int p7_tophits_AliScores(FILE *ofp, char *qname, P7_TOPHITS *th );
1733 
1734 /* p7_trace.c */
1735 extern P7_TRACE *p7_trace_Create(void);
1736 extern P7_TRACE *p7_trace_CreateWithPP(void);
1737 extern int  p7_trace_Reuse(P7_TRACE *tr);
1738 extern int  p7_trace_Grow(P7_TRACE *tr);
1739 extern int  p7_trace_GrowIndex(P7_TRACE *tr);
1740 extern int  p7_trace_GrowTo(P7_TRACE *tr, int N);
1741 extern int  p7_trace_GrowIndexTo(P7_TRACE *tr, int ndom);
1742 extern void p7_trace_Destroy(P7_TRACE *tr);
1743 extern void p7_trace_DestroyArray(P7_TRACE **tr, int N);
1744 
1745 extern int  p7_trace_GetDomainCount   (const P7_TRACE *tr, int *ret_ndom);
1746 extern int  p7_trace_GetStateUseCounts(const P7_TRACE *tr, int *counts);
1747 extern int  p7_trace_GetDomainCoords  (const P7_TRACE *tr, int which, int *ret_i1, int *ret_i2,
1748 				       int *ret_k1, int *ret_k2);
1749 
1750 extern int   p7_trace_Validate(const P7_TRACE *tr, const ESL_ALPHABET *abc, const ESL_DSQ *dsq, char *errbuf);
1751 extern int   p7_trace_Dump(FILE *fp, const P7_TRACE *tr, const P7_PROFILE *gm, const ESL_DSQ *dsq);
1752 extern int   p7_trace_Compare(P7_TRACE *tr1, P7_TRACE *tr2, float pptol);
1753 extern int   p7_trace_Score(P7_TRACE *tr, ESL_DSQ *dsq, P7_PROFILE *gm, float *ret_sc);
1754 extern int   p7_trace_SetPP(P7_TRACE *tr, const P7_GMX *pp);
1755 extern float p7_trace_GetExpectedAccuracy(const P7_TRACE *tr);
1756 
1757 extern int  p7_trace_Append(P7_TRACE *tr, char st, int k, int i);
1758 extern int  p7_trace_AppendWithPP(P7_TRACE *tr, char st, int k, int i, float pp);
1759 extern int  p7_trace_Reverse(P7_TRACE *tr);
1760 extern int  p7_trace_Index(P7_TRACE *tr);
1761 
1762 extern int  p7_trace_FauxFromMSA(ESL_MSA *msa, int *matassign, int optflags, P7_TRACE **tr);
1763 extern int  p7_trace_Doctor(P7_TRACE *tr, int *opt_ndi, int *opt_nid);
1764 
1765 extern int  p7_trace_Count(P7_HMM *hmm, ESL_DSQ *dsq, float wt, P7_TRACE *tr);
1766 
1767 
1768 /* seqmodel.c */
1769 extern int p7_Seqmodel(const ESL_ALPHABET *abc, ESL_DSQ *dsq, int M, char *name,
1770 		       ESL_DMATRIX *P, float *f, double popen, double pextend,
1771 		       P7_HMM **ret_hmm);
1772 
1773 /* fm_alphabet.c */
1774 extern int fm_alphabetCreate (FM_METADATA *meta, uint8_t *alph_bits);
1775 extern int fm_alphabetDestroy (FM_METADATA *meta);
1776 extern int fm_reverseString (char *str, int N);
1777 extern int fm_getComplement (char c, uint8_t alph_type);
1778 
1779 
1780 /* fm_general.c */
1781 extern uint64_t fm_computeSequenceOffset (const FM_DATA *fms, const FM_METADATA *meta, int block, uint64_t pos);
1782 extern int fm_getOriginalPosition (const FM_DATA *fms, const FM_METADATA *meta, int fm_id, int length, int direction, uint64_t fm_pos,
1783                                     uint32_t *segment_id, uint64_t *seg_pos);
1784 extern int fm_readFMmeta( FM_METADATA *meta);
1785 extern int fm_FM_read( FM_DATA *fm, FM_METADATA *meta, int getAll );
1786 extern void fm_FM_destroy ( FM_DATA *fm, int isMainFM);
1787 extern uint8_t fm_getChar(uint8_t alph_type, int j, const uint8_t *B );
1788 extern int fm_getSARangeReverse( const FM_DATA *fm, FM_CFG *cfg, char *query, char *inv_alph, FM_INTERVAL *interval);
1789 extern int fm_getSARangeForward( const FM_DATA *fm, FM_CFG *cfg, char *query, char *inv_alph, FM_INTERVAL *interval);
1790 extern int fm_configAlloc(FM_CFG **cfg);
1791 extern int fm_configDestroy(FM_CFG *cfg);
1792 extern int fm_metaDestroy(FM_METADATA *meta );
1793 extern int fm_updateIntervalForward( const FM_DATA *fm, const FM_CFG *cfg, char c, FM_INTERVAL *interval_f, FM_INTERVAL *interval_bk);
1794 extern int fm_updateIntervalReverse( const FM_DATA *fm, const FM_CFG *cfg, char c, FM_INTERVAL *interval);
1795 extern int fm_initSeeds (FM_DIAGLIST *list) ;
1796 extern FM_DIAG * fm_newSeed (FM_DIAGLIST *list);
1797 extern int fm_initAmbiguityList (FM_AMBIGLIST *list);
1798 extern int fm_addAmbiguityRange (FM_AMBIGLIST *list, uint32_t start, uint32_t stop);
1799 extern int fm_convertRange2DSQ(const FM_DATA *fm, const FM_METADATA *meta, uint64_t first, int length, int complementarity, ESL_SQ *sq, int fix_ambiguities );
1800 extern int fm_initConfigGeneric( FM_CFG *cfg, ESL_GETOPTS *go);
1801 
1802 /* fm_ssv.c */
1803 extern int p7_SSVFM_longlarget( P7_OPROFILE *om, float nu, P7_BG *bg, double F1,
1804                       const FM_DATA *fmf, const FM_DATA *fmb, FM_CFG *fm_cfg, const P7_SCOREDATA *ssvdata,
1805                       int strands, P7_HMM_WINDOWLIST *windowlist);
1806 
1807 
1808 /* fm_sse.c */
1809 extern int fm_configInit      (FM_CFG *cfg, ESL_GETOPTS *go);
1810 extern int fm_getOccCount     (const FM_DATA *fm, const FM_CFG *cfg, int pos, uint8_t c);
1811 extern int fm_getOccCountLT   (const FM_DATA *fm, const FM_CFG *cfg, int pos, uint8_t c, uint32_t *cnteq, uint32_t *cntlt);
1812 
1813 #endif /*P7_HMMERH_INCLUDED*/
1814 
1815 
1816