1 /* The all-encompassing include file for HMMER. 2 * All-encompassing because there's a lot of crossdependency. 3 * There's some opportunity for modularity, but not a lot. 4 * 5 * 1. P7_HMM: a core model. 6 * 2. P7_PROFILE: a scoring profile, and its implicit model. 7 * 3. P7_BG: a null (background) model. 8 * 4. P7_TRACE: a traceback path (alignment of seq to profile). 9 * 5. P7_HMMFILE: an HMM save file or database, open for reading. 10 * 6. P7_GMX: a "generic" dynamic programming matrix 11 * 7. P7_PRIOR: mixture Dirichlet prior for profile HMMs 12 * 8. P7_SPENSEMBLE: segment pair ensembles for domain locations 13 * 9. P7_ALIDISPLAY: an alignment formatted for printing 14 * 10. P7_DOMAINDEF: reusably managing workflow in annotating domains 15 * 11. P7_TOPHITS: ranking lists of top-scoring hits 16 * 12. P7_SCOREDATA: data used in diagonal recovery and extension 17 * 13. P7_HMM_WINDOW: data used to track lists of sequence windows 18 * 14. Inclusion of the architecture-specific optimized implementation. 19 * 16. P7_PIPELINE: H3's accelerated seq/profile comparison pipeline 20 * 17. P7_BUILDER: configuration options for new HMM construction. 21 * 18. Declaration of functions in HMMER's exposed API. 22 * 23 * Also, see impl_{sse,vmx}/impl_{sse,vmx}.h for additional API 24 * specific to the acceleration layer; in particular, the P7_OPROFILE 25 * structure for an optimized profile. 26 */ 27 #ifndef P7_HMMERH_INCLUDED 28 #define P7_HMMERH_INCLUDED 29 #include "p7_config.h" 30 31 #include <stdio.h> /* FILE */ 32 33 #ifdef HMMER_MPI 34 #include "mpi.h" 35 #endif 36 37 #ifdef HMMER_THREADS 38 #include <pthread.h> 39 #endif 40 41 #include "easel.h" 42 #include "esl_alphabet.h" /* ESL_DSQ, ESL_ALPHABET */ 43 #include "esl_dmatrix.h" /* ESL_DMATRIX */ 44 #include "esl_getopts.h" /* ESL_GETOPTS */ 45 #include "esl_histogram.h" /* ESL_HISTOGRAM */ 46 #include "esl_hmm.h" /* ESL_HMM */ 47 #include "esl_keyhash.h" /* ESL_KEYHASH */ 48 #include "esl_mixdchlet.h" /* ESL_MIXDCHLET */ 49 #include "esl_msa.h" /* ESL_MSA */ 50 #include "esl_random.h" /* ESL_RANDOMNESS */ 51 #include "esl_rand64.h" /* ESL_RAND64 */ 52 #include "esl_sq.h" /* ESL_SQ */ 53 #include "esl_scorematrix.h" /* ESL_SCOREMATRIX */ 54 #include "esl_stopwatch.h" /* ESL_STOPWATCH */ 55 56 57 58 /* Search modes. */ 59 #define p7_NO_MODE 0 60 #define p7_LOCAL 1 /* multihit local: "fs" mode */ 61 #define p7_GLOCAL 2 /* multihit glocal: "ls" mode */ 62 #define p7_UNILOCAL 3 /* unihit local: "sw" mode */ 63 #define p7_UNIGLOCAL 4 /* unihit glocal: "s" mode */ 64 65 #define p7_IsLocal(mode) (mode == p7_LOCAL || mode == p7_UNILOCAL) 66 #define p7_IsMulti(mode) (mode == p7_LOCAL || mode == p7_GLOCAL) 67 68 #define p7_NEVPARAM 6 /* number of statistical parameters stored in models */ 69 #define p7_NCUTOFFS 6 /* number of Pfam score cutoffs stored in models */ 70 #define p7_NOFFSETS 3 /* number of disk offsets stored in models for hmmscan's fast model input */ 71 enum p7_evparams_e { p7_MMU = 0, p7_MLAMBDA = 1, p7_VMU = 2, p7_VLAMBDA = 3, p7_FTAU = 4, p7_FLAMBDA = 5 }; 72 enum p7_cutoffs_e { p7_GA1 = 0, p7_GA2 = 1, p7_TC1 = 2, p7_TC2 = 3, p7_NC1 = 4, p7_NC2 = 5 }; 73 enum p7_offsets_e { p7_MOFFSET = 0, p7_FOFFSET = 1, p7_POFFSET = 2 }; 74 75 #define p7_EVPARAM_UNSET -99999.0f /* if evparam[0] is unset, then all unset */ 76 #define p7_CUTOFF_UNSET -99999.0f /* if cutoff[XX1] is unset, then cutoff[XX2] unset, XX={GA,TC,NC} */ 77 #define p7_COMPO_UNSET -1.0f /* if compo[0] is unset, then all unset */ 78 79 /* Option flags when creating multiple alignments with p7_tracealign_*() */ 80 #define p7_DEFAULT 0 81 #define p7_DIGITIZE (1<<0) 82 #define p7_ALL_CONSENSUS_COLS (1<<1) 83 #define p7_TRIM (1<<2) 84 85 /* Option flags when creating faux traces with p7_trace_FauxFromMSA() */ 86 #define p7_MSA_COORDS (1<<0) /* default: i = unaligned seq residue coords */ 87 88 /* Which strand(s) should be searched */ 89 enum p7_strands_e { p7_STRAND_TOPONLY = 0, p7_STRAND_BOTTOMONLY = 1, p7_STRAND_BOTH = 2}; 90 91 /***************************************************************** 92 * 1. P7_HMM: a core model. 93 *****************************************************************/ 94 95 /* Bit flags used in <hmm->flags>: optional annotation in an HMM 96 * 97 * Flags marked with ! may not be changed nor used for other meanings, 98 * because they're codes used by HMMER2 (and earlier) that must be 99 * preserved for reverse compatibility with old HMMER files. 100 * 101 * Why use flags? (So I don't ask this question of myself again:) 102 * 1. The way we allocate an HMM, we need to know if we're allocating 103 * M-width annotation fields (RF, CS, CA, MAP) before we read the 104 * annotation from the file. 105 * 2. Historically, H2 used flags, so we still need to read H2 flags 106 * for backwards compatibility; so we may as well keep using them. 107 */ 108 #define p7H_HASBITS (1<<0) /* obsolete (was: model has log-odds scores) !*/ 109 #define p7H_DESC (1<<1) /* description exists (legacy; xref SRE:J5/114) !*/ 110 #define p7H_RF (1<<2) /* #RF annotation available !*/ 111 #define p7H_CS (1<<3) /* #CS annotation available !*/ 112 #define p7H_XRAY (1<<4) /* obsolete (was: structural data available) !*/ 113 #define p7H_HASPROB (1<<5) /* obsolete (was: model in probability form) !*/ 114 #define p7H_HASDNA (1<<6) /* obsolete (was: protein HMM->DNA seq params set) !*/ 115 #define p7H_STATS (1<<7) /* model has E-value statistics calibrated !*/ 116 #define p7H_MAP (1<<8) /* alignment map is available !*/ 117 #define p7H_ACC (1<<9) /* accession is available (legacy; xref SRE:J5/114)!*/ 118 #define p7H_GA (1<<10) /* gathering thresholds available !*/ 119 #define p7H_TC (1<<11) /* trusted cutoffs available !*/ 120 #define p7H_NC (1<<12) /* noise cutoffs available !*/ 121 #define p7H_CA (1<<13) /* surface accessibilities available !*/ 122 #define p7H_COMPO (1<<14) /* model-specific residue composition available */ 123 #define p7H_CHKSUM (1<<15) /* model has an alignment checksum */ 124 #define p7H_CONS (1<<16) /* consensus residue line available */ 125 #define p7H_MMASK (1<<17) /* #MM annotation available !*/ 126 127 /* Indices of Plan7 main model state transitions, hmm->t[k][] */ 128 enum p7h_transitions_e { 129 p7H_MM = 0, 130 p7H_MI = 1, 131 p7H_MD = 2, 132 p7H_IM = 3, 133 p7H_II = 4, 134 p7H_DM = 5, 135 p7H_DD = 6 136 }; 137 #define p7H_NTRANSITIONS 7 138 139 /* How the hmm->t[k] vector is interpreted as separate probability vectors. */ 140 #define P7H_TMAT(hmm, k) ((hmm)->t[k]) 141 #define P7H_TINS(hmm, k) ((hmm)->t[k]+3) 142 #define P7H_TDEL(hmm, k) ((hmm)->t[k]+5) 143 #define p7H_NTMAT 3 144 #define p7H_NTDEL 2 145 #define p7H_NTINS 2 146 147 /* Some notes: 148 * 0. The model might be either in counts or probability form. 149 * 1. t[0] is special: t[0][TMM,TMI,TMD] are the begin->M_1,I_0,D_1 entry probabilities, 150 * t[0][TIM,TII] are the I_0 transitions, and delete state 0 doesn't 151 * exist. Therefore D[0] transitions and mat[0] emissions are unused. 152 * To simplify some normalization code, we adopt a convention that these are set 153 * to valid probability distributions: 1.0 for t[0][TDM] and mat[0][0], 154 * and 0 for the rest. 155 * 2. t[M] is also special: TMD and TDD are 0 because there is no next delete state; 156 * TDM is therefore 1.0 by definition. TMM and TDM are interpreted as the 157 * M->E and D->E end transitions. t[M][TDM] must be 1.0, therefore. 158 */ 159 typedef struct p7_hmm_s { 160 /*::cexcerpt::plan7_core::begin::*/ 161 int M; /* length of the model (# nodes) */ 162 float **t; /* transition prob's. t[(0),1..M][0..p7H_NTRANSITIONS-1] */ 163 float **mat; /* match emissions. mat[1..M][0..K-1] */ 164 float **ins; /* insert emissions. ins[1..M][0..K-1] */ 165 /*::cexcerpt::plan7_core::end::*/ 166 167 /* Annotation. Everything but <name> is optional. Flags are set when 168 * optional values are set. All the char *'s are proper nul-terminated 169 * strings, not just arrays. (hmm->map is an int array). 170 */ 171 char *name; /* name of the model (mandatory) */ /* String, \0-terminated */ 172 char *acc; /* accession number of model (Pfam) (optional: NULL) */ /* String, \0-terminated */ 173 char *desc; /* brief (1-line) description of model (optional: NULL) */ /* String, \0-terminated */ 174 char *rf; /* reference line from alignment 1..M (p7H_RF) */ /* String; 0=' ', M+1='\0' */ 175 char *mm; /* model mask line from alignment 1..M (p7H_MM) */ /* String; 0=' ', M+1='\0' */ 176 char *consensus; /* consensus residue line 1..M (p7H_CONS) */ /* String; 0=' ', M+1='\0' */ 177 char *cs; /* consensus structure line 1..M (p7H_CS) */ /* String; 0=' ', M+1='\0' */ 178 char *ca; /* consensus accessibility line 1..M (p7H_CA) */ /* String; 0=' ', M+1='\0' */ 179 180 char *comlog; /* command line(s) that built model (optional: NULL) */ /* String, \0-terminated */ 181 int nseq; /* number of training sequences (optional: -1) */ 182 float eff_nseq; /* effective number of seqs (<= nseq) (optional: -1) */ 183 int max_length; /* upper bound length, all but 1e-7 prob (optional: -1) */ 184 char *ctime; /* creation date (optional: NULL) */ 185 int *map; /* map of alignment cols onto model 1..M (p7H_MAP) */ /* Array; map[0]=0 */ 186 uint32_t checksum; /* checksum of training sequences (p7H_CHKSUM) */ 187 float evparam[p7_NEVPARAM]; /* E-value params (p7H_STATS) */ 188 float cutoff[p7_NCUTOFFS]; /* Pfam score cutoffs (p7H_{GA,TC,NC}) */ 189 float compo[p7_MAXABET]; /* model bg residue comp (p7H_COMPO) */ 190 191 off_t offset; /* HMM record offset on disk */ 192 const ESL_ALPHABET *abc; /* ptr to alphabet info (hmm->abc->K is alphabet size) */ 193 int flags; /* status flags */ 194 } P7_HMM; 195 196 197 /***************************************************************** 198 * 2. P7_PROFILE: a scoring profile, and its implicit model. 199 *****************************************************************/ 200 201 /* Indices for special state types in the length model, gm->xsc[x][] 202 */ 203 enum p7p_xstates_e { 204 p7P_E = 0, 205 p7P_N = 1, 206 p7P_J = 2, 207 p7P_C = 3 208 }; 209 #define p7P_NXSTATES 4 210 211 /* Indices for transitions from the length modeling scores gm->xsc[][x] 212 */ 213 enum p7p_xtransitions_e { 214 p7P_LOOP = 0, 215 p7P_MOVE = 1 216 }; 217 #define p7P_NXTRANS 2 218 219 /* Indices for transition scores gm->tsc[k][] */ 220 /* order is optimized for dynamic programming */ 221 enum p7p_tsc_e { 222 p7P_MM = 0, 223 p7P_IM = 1, 224 p7P_DM = 2, 225 p7P_BM = 3, 226 p7P_MD = 4, 227 p7P_DD = 5, 228 p7P_MI = 6, 229 p7P_II = 7, 230 }; 231 #define p7P_NTRANS 8 232 233 /* Indices for residue emission score vectors 234 */ 235 enum p7p_rsc_e { 236 p7P_MSC = 0, 237 p7P_ISC = 1 238 }; 239 #define p7P_NR 2 240 241 /* Accessing transition, emission scores */ 242 /* _BM is specially stored off-by-one: [k-1][p7P_BM] is score for entering at Mk */ 243 #define p7P_TSC(gm, k, s) ((gm)->tsc[(k) * p7P_NTRANS + (s)]) 244 #define p7P_MSC(gm, k, x) ((gm)->rsc[x][(k) * p7P_NR + p7P_MSC]) 245 #define p7P_ISC(gm, k, x) ((gm)->rsc[x][(k) * p7P_NR + p7P_ISC]) 246 247 248 typedef struct p7_profile_s { 249 float *tsc; /* transitions [0.1..M-1][0..p7P_NTRANS-1], hand-indexed */ 250 float **rsc; /* emissions [0..Kp-1][0.1..M][p7P_NR], hand-indexed */ 251 float xsc[p7P_NXSTATES][p7P_NXTRANS]; /* special transitions [NECJ][LOOP,MOVE] */ 252 253 int mode; /* configured algorithm mode (e.g. p7_LOCAL) */ 254 int L; /* current configured target seq length */ 255 int allocM; /* max # of nodes allocated in this structure */ 256 int M; /* number of nodes in the model */ 257 int max_length; /* calculated upper bound on emitted seq length */ 258 float nj; /* expected # of uses of J; precalculated from loop config */ 259 260 /* Info, most of which is a copy from parent HMM: */ 261 char *name; /* unique name of model */ 262 char *acc; /* unique accession of model, or NULL */ 263 char *desc; /* brief (1-line) description of model, or NULL */ 264 char *rf; /* reference line from alignment 1..M; *rf=0 means unused */ 265 char *mm; /* modelmask line 1..M; *ref=0: unused */ 266 char *cs; /* consensus structure line 1..M, *cs=0 means unused */ 267 char *consensus; /* consensus residues to display in alignments, 1..M */ 268 float evparam[p7_NEVPARAM]; /* parameters for determining E-values, or UNSET */ 269 float cutoff[p7_NCUTOFFS]; /* per-seq/per-domain bit score cutoffs, or UNSET */ 270 float compo[p7_MAXABET]; /* per-model HMM filter composition, or UNSET */ 271 272 /* Disk offset information for hmmpfam's fast model retrieval */ 273 off_t offs[p7_NOFFSETS]; /* p7_{MFP}OFFSET, or -1 */ 274 275 off_t roff; /* record offset (start of record); -1 if none */ 276 off_t eoff; /* offset to last byte of record; -1 if unknown */ 277 278 const ESL_ALPHABET *abc; /* copy of pointer to appropriate alphabet */ 279 } P7_PROFILE; 280 281 282 283 /***************************************************************** 284 * 3. P7_BG: a null (background) model. 285 *****************************************************************/ 286 287 /* This really contains three different things: 288 * 289 * - the "null1" model, a one-state HMM consisting of background 290 * frequencies <f> and a parameter <p1> for a target-length 291 * dependent geometric; 292 * 293 * - the "bias filter" <fhmm> a two-state HMM composed from null1's 294 * background <f> and the model's mean composition <compo>. This 295 * model is constructed dynamically, every time a new profile is 296 * considered; 297 * 298 * - a single term <omega> that's needed by the "null2" model to set 299 * a balance between the null1 and null2 scoring terms. The null2 300 * model is otherwise defined by construction, in p7_domaindef.c. 301 * 302 * Someday we might pull this apart into two or three separate 303 * objects. 304 */ 305 typedef struct p7_bg_s { 306 float *f; /* null1 background residue frequencies [0..K-1]: set at initialization */ 307 float p1; /* null1's transition prob: p7_bg_SetLength() sets this from target seq L */ 308 309 ESL_HMM *fhmm; /* bias filter: p7_bg_SetFilter() sets this, from model's mean composition */ 310 311 float omega; /* the "prior" on null2/null3: set at initialization (one omega for both null types) */ 312 313 const ESL_ALPHABET *abc; /* reference to alphabet in use: set at initialization */ 314 } P7_BG; 315 316 /***************************************************************** 317 * 4. P7_TRACE: a traceback (alignment of seq to profile). 318 *****************************************************************/ 319 320 /* Traceback structure for alignment of a model to a sequence. 321 * 322 * A traceback only makes sense in a triplet (tr, gm, dsq), for a 323 * given profile or HMM (with nodes 1..M) and a given digital sequence 324 * (with positions 1..L). 325 * 326 * A traceback may be relative to a profile (usually) or to a core 327 * model (as a special case in model construction; see build.c). You 328 * can tell the difference by looking at the first statetype, 329 * tr->st[0]; if it's a p7T_S, it's for a profile, and if it's p7T_B, 330 * it's for a core model. 331 * 332 * A "profile" trace uniquely has S,N,C,T,J states and their 333 * transitions; it also can have B->Mk and Mk->E internal entry/exit 334 * transitions for local alignments. It may not contain X states. 335 * 336 * A "core" trace may contain I0, IM, and D1 states and their 337 * transitions. A "core" trace can also have B->X->{MDI}k and 338 * {MDI}k->X->E transitions as a special hack in a build procedure, to 339 * deal with the case of a local alignment fragment implied by an 340 * input alignment, which is "impossible" for a core model. 341 * X "states" only appear in core traces, and only at these 342 * entry/exit places; some code depends on this. 343 * 344 * A profile's N,C,J states emit on transition, not on state, so a 345 * path of N emits 0 residues, NN emits 1 residue, NNN emits 2 346 * residues, and so on. By convention, the trace always associates an 347 * emission-on-transition with the trailing (destination) state, so 348 * the first N, C, or J is stored in a trace as a nonemitter (i=0). 349 * 350 * A i coords in a traceback are usually 1..L with respect to an 351 * unaligned digital target sequence, but in the special case of 352 * traces faked from existing MSAs (as in hmmbuild), the coords may 353 * be 1..alen relative to an MSA's columns. 354 */ 355 356 /* State types */ 357 enum p7t_statetype_e { 358 p7T_BOGUS = 0, 359 p7T_M = 1, 360 p7T_D = 2, 361 p7T_I = 3, 362 p7T_S = 4, 363 p7T_N = 5, 364 p7T_B = 6, 365 p7T_E = 7, 366 p7T_C = 8, 367 p7T_T = 9, 368 p7T_J = 10, 369 p7T_X = 11, /* missing data: used esp. for local entry/exits */ 370 }; 371 #define p7T_NSTATETYPES 12 372 373 typedef struct p7_trace_s { 374 int N; /* length of traceback */ 375 int nalloc; /* allocated length of traceback */ 376 char *st; /* state type code [0..N-1]*/ 377 int *k; /* node index; 1..M if M,D,I; else 0 [0..N-1]*/ 378 int *i; /* pos emitted in dsq, 1..L; else 0 [0..N-1]*/ 379 float *pp; /* posterior prob of x_i; else 0 [0..N-1]*/ 380 int M; /* model length M (maximum k) */ 381 int L; /* sequence length L (maximum i) */ 382 383 /* The following section is data generated by "indexing" a trace's domains */ 384 int ndom; /* number of domains in trace (= # of B or E states) */ 385 int *tfrom, *tto; /* locations of B/E states in trace (0..tr->N-1) */ 386 int *sqfrom, *sqto; /* first/last M-emitted residue on sequence (1..L) */ 387 int *hmmfrom, *hmmto;/* first/last M state on model (1..M) */ 388 int ndomalloc; /* current allocated size of these stacks */ 389 390 } P7_TRACE; 391 392 393 394 /***************************************************************** 395 * 5. P7_HMMFILE: an HMM save file or database, open for reading. 396 *****************************************************************/ 397 398 /* These tags need to be in temporal order, so we can do tests 399 * like "if (format >= p7_HMMFILE_3b) ..." 400 */ 401 enum p7_hmmfile_formats_e { 402 p7_HMMFILE_20 = 0, 403 p7_HMMFILE_3a = 1, 404 p7_HMMFILE_3b = 2, 405 p7_HMMFILE_3c = 3, 406 p7_HMMFILE_3d = 4, 407 p7_HMMFILE_3e = 5, 408 p7_HMMFILE_3f = 6, 409 }; 410 411 typedef struct p7_hmmfile_s { 412 FILE *f; /* pointer to stream for reading models */ 413 char *fname; /* (fully qualified) name of the HMM file; [STDIN] if - */ 414 ESL_SSI *ssi; /* open SSI index for model file <f>; NULL if none. */ 415 416 int do_gzip; /* TRUE if f is "gzip -dc |" (will pclose(f)) */ 417 int do_stdin; /* TRUE if f is stdin (won't close f) */ 418 int newly_opened; /* TRUE if we just opened the stream (and parsed magic) */ 419 int is_pressed; /* TRUE if a pressed HMM database file (Pfam or equiv) */ 420 421 int format; /* HMM file format code */ 422 int (*parser)(struct p7_hmmfile_s *, ESL_ALPHABET **, P7_HMM **); 423 ESL_FILEPARSER *efp; 424 425 /* If <is_pressed>, we can read optimized profiles directly, via: */ 426 FILE *ffp; /* MSV part of the optimized profile */ 427 FILE *pfp; /* rest of the optimized profile */ 428 429 #ifdef HMMER_THREADS 430 int syncRead; 431 pthread_mutex_t readMutex; 432 #endif 433 434 char errbuf[eslERRBUFSIZE]; 435 } P7_HMMFILE; 436 437 /* note on <fname>, above: 438 * this is the actual name of the HMM file being read. 439 * 440 * The way p7_hmmfile_Open() works, it will preferentially look for 441 * hmmpress'ed binary files. If you open "foo", it will first try to 442 * open "foo.h3m" and <fname> will be "foo.h3m". "foo" does not even 443 * have to exist. If a parsing error occurs, you want <fname> to 444 * be "foo.h3m", so error messages report blame correctly. 445 * In the special case of reading from stdin, <fname> is "[STDIN]". 446 */ 447 448 449 /***************************************************************** 450 * 6. P7_GMX: a "generic" dynamic programming matrix 451 *****************************************************************/ 452 453 enum p7g_scells_e { 454 p7G_M = 0, 455 p7G_I = 1, 456 p7G_D = 2, 457 }; 458 #define p7G_NSCELLS 3 459 460 enum p7g_xcells_e { 461 p7G_E = 0, 462 p7G_N = 1, 463 p7G_J = 2, 464 p7G_B = 3, 465 p7G_C = 4 466 }; 467 #define p7G_NXCELLS 5 468 469 470 typedef struct p7_gmx_s { 471 int M; /* actual model dimension (model 1..M) */ 472 int L; /* actual sequence dimension (seq 1..L) */ 473 474 int allocR; /* current allocated # of rows : L+1 <= validR <= allocR */ 475 int validR; /* # of rows actually pointing at DP memory */ 476 int allocW; /* current set row width : M+1 <= allocW */ 477 uint64_t ncells; /* total # of allocated cells in 2D matrix : ncells >= (validR)(allocW) */ 478 479 float **dp; /* logically [0.1..L][0.1..M][0..p7G_NSCELLS-1]; indexed [i][k*p7G_NSCELLS+s] */ 480 float *xmx; /* logically [0.1..L][0..p7G_NXCELLS-1]; indexed [i*p7G_NXCELLS+s] */ 481 482 float *dp_mem; 483 } P7_GMX; 484 485 /* Macros below implement indexing idioms for generic DP routines. 486 * They require the following setup, for profile <gm> and matrix <gx>: 487 * float const *tsc = gm->tsc; 488 * float **dp = gx->dp; 489 * float *xmx = gx->xmx; 490 * and for each row i (target residue x_i in digital seq <dsq>): 491 * float const *rsc = gm->rsc[dsq[i]]; 492 */ 493 #define MMX(i,k) (dp[(i)][(k) * p7G_NSCELLS + p7G_M]) 494 #define IMX(i,k) (dp[(i)][(k) * p7G_NSCELLS + p7G_I]) 495 #define DMX(i,k) (dp[(i)][(k) * p7G_NSCELLS + p7G_D]) 496 #define XMX(i,s) (xmx[(i) * p7G_NXCELLS + (s)]) 497 498 #define TSC(s,k) (tsc[(k) * p7P_NTRANS + (s)]) 499 #define MSC(k) (rsc[(k) * p7P_NR + p7P_MSC]) 500 #define ISC(k) (rsc[(k) * p7P_NR + p7P_ISC]) 501 502 /* Flags that control P7_GMX debugging dumps */ 503 #define p7_HIDE_SPECIALS (1<<0) 504 #define p7_SHOW_LOG (1<<1) 505 506 507 /***************************************************************** 508 * 7. P7_PRIOR: mixture Dirichlet prior for profile HMMs 509 *****************************************************************/ 510 511 typedef struct p7_prior_s { 512 ESL_MIXDCHLET *tm; /* match transitions */ 513 ESL_MIXDCHLET *ti; /* insert transitions */ 514 ESL_MIXDCHLET *td; /* delete transitions */ 515 ESL_MIXDCHLET *em; /* match emissions */ 516 ESL_MIXDCHLET *ei; /* insert emissions */ 517 } P7_PRIOR; 518 519 520 521 522 /***************************************************************** 523 * 8. P7_SPENSEMBLE: segment pair ensembles for domain locations 524 *****************************************************************/ 525 526 /* struct p7_spcoord_s: 527 * a coord quad defining a segment pair. 528 */ 529 struct p7_spcoord_s { 530 int idx; /* backreference index: which trace a seg came from, or which cluster a domain came from */ 531 int i, j; /* start,end in a target sequence (1..L) */ 532 int k, m; /* start,end in a query model (1..M) */ 533 float prob; /* posterior probability of segment */ 534 }; 535 536 /* Structure: P7_SPENSEMBLE 537 * 538 * Collection and clustering of an ensemble of sampled segment pairs, 539 * in order to define domain locations using their posterior 540 * probability distribution (as opposed to Viterbi MAP tracebacks). 541 */ 542 typedef struct p7_spensemble_s { 543 /* Section 1: a collected ensemble of segment pairs */ 544 int nsamples; /* number of sampled traces */ 545 struct p7_spcoord_s *sp; /* array of sampled seg pairs; [0..n-1] */ 546 int nalloc; /* allocated size of <sp> */ 547 int n; /* number of seg pairs in <sp> */ 548 549 /* Section 2: then the ensemble is clustered by single-linkage clustering */ 550 int *workspace; /* temp space for Easel SLC algorithm: 2*n */ 551 int *assignment; /* each seg pair's cluster index: [0..n-1] = (0..nc-1) */ 552 int nc; /* number of different clusters */ 553 554 /* Section 3: then endpoint distribution is examined within each large cluster */ 555 int *epc; /* array counting frequency of each endpoint */ 556 int epc_alloc; /* allocated width of <epc> */ 557 558 /* Section 4: finally each large cluster is resolved into domain coords */ 559 struct p7_spcoord_s *sigc; /* array of coords for each domain, [0..nsigc-1] */ 560 int nsigc; /* number of "significant" clusters, domains */ 561 int nsigc_alloc; /* current allocated max for nsigc */ 562 } P7_SPENSEMBLE; 563 564 565 566 /***************************************************************** 567 * 9. P7_ALIDISPLAY: an alignment formatted for printing 568 *****************************************************************/ 569 570 /* Structure: P7_ALIDISPLAY 571 * 572 * Alignment of a sequence domain to an HMM, formatted for printing. 573 * 574 * For an alignment of L residues and names C chars long, requires 575 * 6L + 2C + 30 bytes; for typical case of L=100,C=10, that's 576 * <0.7 Kb. 577 */ 578 typedef struct p7_alidisplay_s { 579 char *rfline; /* reference coord info; or NULL */ 580 char *mmline; /* modelmask coord info; or NULL */ 581 char *csline; /* consensus structure info; or NULL */ 582 char *model; /* aligned query consensus sequence */ 583 char *mline; /* "identities", conservation +'s, etc. */ 584 char *aseq; /* aligned target sequence */ 585 char *ntseq; /* nucleotide target sequence if nhmmscant */ 586 char *ppline; /* posterior prob annotation; or NULL */ 587 int N; /* length of strings */ 588 589 char *hmmname; /* name of HMM */ 590 char *hmmacc; /* accession of HMM; or [0]='\0' */ 591 char *hmmdesc; /* description of HMM; or [0]='\0' */ 592 int hmmfrom; /* start position on HMM (1..M, or -1) */ 593 int hmmto; /* end position on HMM (1..M, or -1) */ 594 int M; /* length of model */ 595 596 char *sqname; /* name of target sequence */ 597 char *sqacc; /* accession of target seq; or [0]='\0' */ 598 char *sqdesc; /* description of targ seq; or [0]='\0' */ 599 int64_t sqfrom; /* start position on sequence (1..L) */ 600 int64_t sqto; /* end position on sequence (1..L) */ 601 int64_t L; /* length of sequence */ 602 603 int memsize; /* size of allocated block of memory */ 604 char *mem; /* memory used for the char data above */ 605 } P7_ALIDISPLAY; 606 607 608 /***************************************************************** 609 * 10. P7_DOMAINDEF: reusably managing workflow in defining domains 610 *****************************************************************/ 611 612 typedef struct p7_dom_s { 613 int64_t ienv, jenv; 614 int64_t iali, jali; 615 int64_t iorf, jorf; /*Used in translated search to capture the range in the DNA sequence of the ORF containing the match to a protein query */ 616 float envsc; /* Forward score in envelope ienv..jenv; NATS; without null2 correction */ 617 float domcorrection; /* null2 score when calculating a per-domain score; NATS */ 618 float dombias; /* FLogsum(0, log(bg->omega) + domcorrection): null2 score contribution; NATS */ 619 float oasc; /* optimal accuracy score (units: expected # residues correctly aligned) */ 620 float bitscore; /* overall score in BITS, null corrected, if this were the only domain in seq */ 621 double lnP; /* log(P-value) of the bitscore */ 622 int is_reported; /* TRUE if domain meets reporting thresholds */ 623 int is_included; /* TRUE if domain meets inclusion thresholds */ 624 float *scores_per_pos; /* score in BITS that each position in the alignment contributes to an overall viterbi score */ 625 P7_ALIDISPLAY *ad; 626 } P7_DOMAIN; 627 628 /* Structure: P7_DOMAINDEF 629 * 630 * This is a container for all the necessary information for domain 631 * definition procedures in <p7_domaindef.c>, including a bunch of 632 * heuristic thresholds. The structure is reusable to minimize the 633 * number of allocation/free cycles that need to be done when 634 * processing a large number of sequences. You create the structure 635 * with <p7_domaindef_Create()>; after you're done with defining 636 * domains on a sequence, you call <p7_domaindef_Reuse()> before using 637 * it on the next sequence; and when you're completely done, you free 638 * it with <p7_domaindef_Destroy()>. All memory management is handled 639 * internally; you don't need to reallocate anything yourself. 640 */ 641 typedef struct p7_domaindef_s { 642 /* for posteriors of being in a domain, B, E */ 643 float *mocc; /* mocc[i=1..L] = prob that i is emitted by core model (is in a domain) */ 644 float *btot; /* btot[i=1..L] = cumulative expected times that domain starts at or before i */ 645 float *etot; /* etot[i=1..L] = cumulative expected times that domain ends at or before i */ 646 int L; 647 int Lalloc; 648 649 /* the ad hoc null2 model: 1..L nat scores for each residue, log f'(x_i) / f(x_i) */ 650 float *n2sc; 651 652 /* rng and reusable memory for stochastic tracebacks */ 653 ESL_RANDOMNESS *r; /* random number generator */ 654 int do_reseeding; /* TRUE to reset the RNG, make results reproducible */ 655 P7_SPENSEMBLE *sp; /* an ensemble of sampled segment pairs (domain endpoints) */ 656 P7_TRACE *tr; /* reusable space for a trace of a domain */ 657 P7_TRACE *gtr; /* reusable space for a traceback of the entire target seq */ 658 659 /* Heuristic thresholds that control the region definition process */ 660 /* "rt" = "region threshold", for lack of better term */ 661 float rt1; /* controls when regions are called. mocc[i] post prob >= dt1 : triggers a region around i */ 662 float rt2; /* controls extent of regions. regions extended until mocc[i]-{b,e}occ[i] < dt2 */ 663 float rt3; /* controls when regions are flagged for split: if expected # of E preceding B is >= dt3 */ 664 665 /* Heuristic thresholds that control the stochastic traceback/clustering process */ 666 int nsamples; /* collect ensemble of this many stochastic traces */ 667 float min_overlap; /* 0.8 means >= 80% overlap of (smaller/larger) segment to link, both in seq and hmm */ 668 int of_smaller; /* see above; TRUE means overlap denom is calc'ed wrt smaller segment; FALSE means larger */ 669 int max_diagdiff; /* 4 means either start or endpoints of two segments must be within <=4 diagonals of each other */ 670 float min_posterior; /* 0.25 means a cluster must have >= 25% posterior prob in the sample to be reported */ 671 float min_endpointp; /* 0.02 means choose widest endpoint with post prob of at least 2% */ 672 673 /* storage of the results; domain locations, scores, alignments */ 674 P7_DOMAIN *dcl; 675 int ndom; /* number of domains defined, in the end. */ 676 int nalloc; /* number of domain structures allocated in <dcl> */ 677 678 /* Additional results storage */ 679 float nexpected; /* posterior expected number of domains in the sequence (from posterior arrays) */ 680 int nregions; /* number of regions evaluated */ 681 int nclustered; /* number of regions evaluated by clustering ensemble of tracebacks */ 682 int noverlaps; /* number of envelopes defined in ensemble clustering that overlap w/ prev envelope */ 683 int nenvelopes; /* number of envelopes handed over for domain definition, null2, alignment, and scoring. */ 684 685 } P7_DOMAINDEF; 686 687 688 /***************************************************************** 689 * 11. P7_TOPHITS: ranking lists of top-scoring hits 690 *****************************************************************/ 691 692 #define p7_HITFLAGS_DEFAULT 0 693 #define p7_IS_INCLUDED (1<<0) 694 #define p7_IS_REPORTED (1<<1) 695 #define p7_IS_NEW (1<<2) 696 #define p7_IS_DROPPED (1<<3) 697 #define p7_IS_DUPLICATE (1<<4) 698 699 700 /* Structure: P7_HIT 701 * 702 * Info about a high-scoring database hit, kept so we can output a 703 * sorted list of high hits at the end. 704 * 705 * sqfrom and sqto are the coordinates that will be shown 706 * in the results, not coords in arrays... therefore, reverse 707 * complements have sqfrom > sqto 708 */ 709 typedef struct p7_hit_s { 710 char *name; /* name of the target (mandatory) */ 711 char *acc; /* accession of the target (optional; else NULL) */ 712 char *desc; /* description of the target (optional; else NULL) */ 713 int window_length; /* for later use in e-value computation, when splitting long sequences */ 714 double sortkey; /* number to sort by; big is better */ 715 716 float score; /* bit score of the sequence (all domains, w/ correction) */ 717 float pre_score; /* bit score of sequence before null2 correction */ 718 float sum_score; /* bit score reconstructed from sum of domain envelopes */ 719 720 double lnP; /* log(P-value) of the score */ 721 double pre_lnP; /* log(P-value) of the pre_score */ 722 double sum_lnP; /* log(P-value) of the sum_score */ 723 724 float nexpected; /* posterior expected number of domains in the sequence (from posterior arrays) */ 725 int nregions; /* number of regions evaluated */ 726 int nclustered; /* number of regions evaluated by clustering ensemble of tracebacks */ 727 int noverlaps; /* number of envelopes defined in ensemble clustering that overlap w/ prev envelope */ 728 int nenvelopes; /* number of envelopes handed over for domain definition, null2, alignment, and scoring. */ 729 int ndom; /* total # of domains identified in this seq */ 730 731 uint32_t flags; /* p7_IS_REPORTED | p7_IS_INCLUDED | p7_IS_NEW | p7_IS_DROPPED */ 732 int nreported; /* # of domains satisfying reporting thresholding */ 733 int nincluded; /* # of domains satisfying inclusion thresholding */ 734 int best_domain; /* index of best-scoring domain in dcl */ 735 736 int64_t seqidx; /*unique identifier to track the database sequence from which this hit came*/ 737 int64_t subseq_start; /*used to track which subsequence of a full_length target this hit came from, for purposes of removing duplicates */ 738 739 P7_DOMAIN *dcl; /* domain coordinate list and alignment display */ 740 esl_pos_t offset; /* used in socket communications, in serialized communication: offset of P7_DOMAIN msg for this P7_HIT */ 741 } P7_HIT; 742 743 744 /* Structure: P7_TOPHITS 745 * merging when we prepare to output results. "hit" list is NULL and 746 * unavailable until after we do a sort. 747 */ 748 typedef struct p7_tophits_s { 749 P7_HIT **hit; /* sorted pointer array */ 750 P7_HIT *unsrt; /* unsorted data storage */ 751 uint64_t Nalloc; /* current allocation size */ 752 uint64_t N; /* number of hits in list now */ 753 uint64_t nreported; /* number of hits that are reportable */ 754 uint64_t nincluded; /* number of hits that are includable */ 755 int is_sorted_by_sortkey; /* TRUE when hits sorted by sortkey and th->hit valid for all N hits */ 756 int is_sorted_by_seqidx; /* TRUE when hits sorted by seq_idx, position, and th->hit valid for all N hits */ 757 } P7_TOPHITS; 758 759 760 761 762 763 /***************************************************************** 764 * 12. P7_SCOREDATA: data used in diagonal recovery and extension 765 *****************************************************************/ 766 767 enum p7_scoredatatype_e { 768 p7_sd_std = 0, 769 p7_sd_fm = 1, 770 }; 771 772 773 /* This contains a compact representation of 8-bit bias-shifted scores for use in 774 * diagonal recovery (standard SSV) and extension (standard and FM-SSV), 775 * along with MAXL-associated prefix- and suffix-lengths, and optimal extensions 776 * for FM-SSV. 777 */ 778 typedef struct p7_scoredata_s { 779 int type; 780 int M; 781 union {//implicit (M+1)*K matrix, where M = # states, and K = # characters in alphabet 782 uint8_t *ssv_scores; // this 2D array is used in the default nhmmer pipeline 783 float *ssv_scores_f; // this 2D array is used in the FM-index based pipeline 784 }; 785 float *prefix_lengths; 786 float *suffix_lengths; 787 float *fwd_scores; 788 float **fwd_transitions; 789 float **opt_ext_fwd; // Used only for FM-index based pipeline 790 float **opt_ext_rev; // Used only for FM-index based pipeline 791 } P7_SCOREDATA; 792 793 794 /***************************************************************** 795 * 13. P7_HMM_WINDOW: data used to track lists of sequence windows 796 *****************************************************************/ 797 798 799 typedef struct p7_hmm_window_s { 800 float score; 801 float null_sc; 802 int32_t id; //sequence id of the database sequence hit 803 int64_t n; //position in database sequence at which the diagonal/window starts 804 int64_t fm_n; //position in the concatenated fm-index sequence at which the diagonal starts 805 int32_t length; // length of the diagonal/window 806 int16_t k; //position of the model at which the diagonal ends 807 int64_t target_len; //length of the target sequence 808 int8_t complementarity; 809 int8_t used_to_extend; 810 } P7_HMM_WINDOW; 811 812 typedef struct p7_hmm_window_list_s { 813 P7_HMM_WINDOW *windows; 814 int count; 815 int size; 816 } P7_HMM_WINDOWLIST; 817 818 819 820 /***************************************************************** 821 * 14. Choice of vector implementation. 822 *****************************************************************/ 823 #if defined (eslENABLE_SSE) 824 #include "impl_sse/impl_sse.h" 825 #elif defined (eslENABLE_VMX) 826 #include "impl_vmx/impl_vmx.h" 827 #else 828 #error "No vector implementation enabled" 829 #endif 830 831 832 /***************************************************************** 833 * 15. The FM-index acceleration to the SSV filter. Only works for SSE 834 *****************************************************************/ 835 #define FM_MAX_LINE 256 836 837 /* Structure the 2D occ array into a single array. "type" is either b or sb. 838 * Note that one extra count value is required by RLE, one 4-byte int for 839 * each superblock count vector, and one 2-byte short for each block count 840 * vector. This is small overhead, even for a small alphabet like dna. 841 */ 842 #define FM_OCC_CNT( type, i, c) ( occCnts_##type[(meta->alph_size)*(i) + (c)]) 843 844 enum fm_alphabettypes_e { 845 fm_DNA = 0, //acgt, 2 bit 846 //fm_DNA_full = 1, //includes ambiguity codes, 4 bit. 847 fm_AMINO = 4, // 5 bit 848 }; 849 /*TODO: fm_DNA_full has currently been disabled because of problems with how the 850 * FM index handles very long runs of the same character (in this case, Ns). 851 * See wheelert/notebook/2013/12-11-FM-alphabet-speed notes on 12/12. 852 */ 853 854 enum fm_direction_e { 855 fm_forward = 0, 856 fm_backward = 1, 857 }; 858 859 860 typedef struct fm_interval_s { 861 int lower; 862 int upper; 863 } FM_INTERVAL; 864 865 typedef struct fm_hit_s { 866 uint64_t start; 867 uint32_t block; 868 int direction; 869 int length; 870 int sortkey; 871 } FM_HIT; 872 873 874 875 typedef struct fm_ambiglist_s { 876 FM_INTERVAL *ranges; 877 uint32_t count; 878 uint32_t size; 879 } FM_AMBIGLIST; 880 881 882 typedef struct fm_seqdata_s { 883 884 uint32_t target_id; // Which sequence in the target database did this segment come from (can be multiple segment per sequence, if a sequence has Ns) 885 uint64_t target_start; // The position in sequence {id} in the target database at which this sequence-block starts (usually 1, unless its a long sequence split out over multiple FMs) 886 uint32_t fm_start; // The position in the FM block at which this sequence begins 887 uint32_t length; // Length of this sequence segment (usually the length of the target sequence, unless its a long sequence split out over multiple FMs) 888 889 890 //meta data taken from the sequence this segment was taken from 891 uint16_t name_length; 892 uint16_t source_length; 893 uint16_t acc_length; 894 uint16_t desc_length; 895 char *name; 896 char *source; 897 char *acc; 898 char *desc; 899 } FM_SEQDATA; 900 901 902 typedef struct fm_metadata_s { 903 uint8_t fwd_only; 904 uint8_t alph_type; 905 uint8_t alph_size; 906 uint8_t charBits; 907 uint32_t freq_SA; //frequency with which SA is sampled 908 uint32_t freq_cnt_sb; //frequency with which full cumulative counts are captured 909 uint32_t freq_cnt_b; //frequency with which intermittent counts are captured 910 uint16_t block_count; 911 uint32_t seq_count; 912 uint64_t char_count; //total count of characters including those in and out of the alphabet 913 char *alph; 914 char *inv_alph; 915 int *compl_alph; 916 FILE *fp; 917 FM_SEQDATA *seq_data; 918 FM_AMBIGLIST *ambig_list; 919 } FM_METADATA; 920 921 922 typedef struct fm_data_s { 923 uint64_t N; //length of text 924 uint32_t term_loc; // location in the BWT at which the '$' char is found (replaced in the sequence with 'a') 925 uint32_t seq_offset; 926 uint32_t ambig_offset; 927 uint32_t seq_cnt; 928 uint32_t ambig_cnt; 929 uint32_t overlap; // number of bases at the beginning that overlap the FM-index for the preceding block 930 uint8_t *T; //text corresponding to the BWT 931 uint8_t *BWT_mem; 932 uint8_t *BWT; 933 uint32_t *SA; // sampled suffix array 934 int64_t *C; //the first position of each letter of the alphabet if all of T is sorted. (signed, as I use that to keep tract of presence/absence) 935 uint32_t *occCnts_sb; 936 uint16_t *occCnts_b; 937 } FM_DATA; 938 939 typedef struct fm_dp_pair_s { 940 uint16_t pos; // position of the diagonal in the model. 941 float score; 942 float max_score; 943 uint8_t score_peak_len; // how long was the diagonal when the most recent peak (within fm_drop_lim of the max score) was seen? 944 uint8_t consec_pos; 945 uint8_t max_consec_pos; 946 uint8_t consec_consensus; 947 uint8_t model_direction; 948 uint8_t complementarity; 949 } FM_DP_PAIR; 950 951 952 typedef struct fm_diag_s { 953 uint64_t n; //position of the database sequence at which the diagonal starts 954 union { 955 double sortkey; 956 double score; 957 }; 958 uint16_t k; //position of the model at which the diagonal starts 959 uint16_t length; 960 uint8_t complementarity; 961 } FM_DIAG; 962 963 typedef struct fm_diaglist_s { 964 FM_DIAG *diags; 965 int count; 966 int size; 967 } FM_DIAGLIST; 968 969 /* Effectively global variables, to be initialized once in fm_initConfig(), 970 * then passed around among threads to avoid recomputing them 971 * 972 * When allocated, must be 16-byte aligned, and all _m128i elements 973 * must precede other types 974 */ 975 typedef struct { 976 #if defined (eslENABLE_SSE) 977 /* mask arrays, and 16-byte-offsets into them */ 978 __m128i *fm_masks_mem; 979 __m128i *fm_masks_v; 980 __m128i *fm_reverse_masks_mem; 981 __m128i *fm_reverse_masks_v; 982 __m128i *fm_chars_mem; 983 __m128i *fm_chars_v; 984 985 /*various precomputed vectors*/ 986 __m128i fm_allones_v; 987 __m128i fm_zeros_v; 988 __m128i fm_neg128_v; 989 __m128i fm_m0f; //00 00 11 11 990 __m128i fm_m01; //01 01 01 01 991 __m128i fm_m11; //00 00 00 11 992 993 /* no non-__m128i- elements above this line */ 994 #endif //#if defined (eslENABLE_SSE) 995 996 /*counter, to compute FM-index speed*/ 997 int occCallCnt; 998 999 /*bounding cutoffs*/ 1000 int max_depth; 1001 float drop_lim; // 0.2 ; in seed, max drop in a run of length [fm_drop_max_len] 1002 int drop_max_len; // 4 ; maximum run length with score under (max - [fm_drop_lim]) 1003 int consec_pos_req; //5 1004 int consensus_match_req; //10 1005 float score_density_req; //.5 1006 int ssv_length; 1007 float scthreshFM; 1008 float sc_thresh_ratio; //information content deficit, actual_relent/target_relent 1009 1010 /*pointer to FM-index metadata*/ 1011 FM_METADATA *meta; 1012 1013 } FM_CFG; 1014 1015 1016 #if defined (eslENABLE_SSE) 1017 //used to convert from a byte array to an __m128i 1018 typedef union { 1019 uint8_t bytes[16]; 1020 __m128i m128; 1021 } byte_m128; 1022 1023 1024 /* Gather the sum of all counts in a 16x8-bit element into a single 16-bit 1025 * element of the register (the 0th element) 1026 * 1027 * the _mm_sad_epu8 accumulates 8-bit counts into 16-bit counts: 1028 * left 8 counts (64-bits) accumulate in counts_v[0], 1029 * right 8 counts in counts_v[4] (the other 6 16-bit ints are 0) 1030 * the _mm_shuffle_epi32 flips the 4th int into the 0th slot 1031 */ 1032 #define FM_GATHER_8BIT_COUNTS( in_v, mid_v, out_v ) do {\ 1033 mid_v = _mm_sad_epu8 (in_v, cfg->fm_zeros_v);\ 1034 tmp_v = _mm_shuffle_epi32(mid_v, _MM_SHUFFLE(1, 1, 1, 2));\ 1035 out_v = _mm_add_epi16(mid_v, tmp_v);\ 1036 } while (0) 1037 1038 1039 /* Macro for SSE operations to turn 2-bit character values into 2-bit binary 1040 * (00 or 01) match/mismatch values representing occurrences of a character in a 1041 * 4-char-per-byte packed BWT. 1042 * 1043 * Typically followed by a call to FM_COUNT_SSE_4PACKED, possibly with a 1044 * mask in between to handle the case where we don't want to add over all 1045 * positions in the vector 1046 * 1047 * tmp_v and tmp2_v are used as temporary vectors throughout, and hold meaningless values 1048 * at the end 1049 * 1050 * xor(in_v, c_v) : each 2-bit value will be 00 if a match, and non-0 if a mismatch 1051 * and(in_v, 01010101) : look at the right bit of each 2-bit value, 1052 * srli(1)+and() : look at the left bit of each 2-bit value, 1053 * or() : if either left bit or right bit is non-0, 01, else 00 (match is 00) 1054 * 1055 * subs() : invert, so match is 01, mismatch is 00 1056 * 1057 */ 1058 #define FM_MATCH_2BIT(in_v, c_v, a_v, b_v, out_v) do {\ 1059 a_v = _mm_xor_si128(in_v, c_v);\ 1060 \ 1061 b_v = _mm_srli_epi16(a_v, 1);\ 1062 a_v = _mm_or_si128(a_v, b_v);\ 1063 b_v = _mm_and_si128(a_v, cfg->fm_m01);\ 1064 \ 1065 out_v = _mm_subs_epi8(cfg->fm_m01,b_v);\ 1066 } while (0) 1067 1068 1069 /*Macro for SSE operations to count bits produced by FM_MATCH_SSE_4PACKED 1070 * 1071 * tmp_v and tmp2_v are used as temporary vectors throughout, and hold meaningless values 1072 * at the end 1073 * 1074 * then add up the 2-bit values: 1075 * srli(4)+add() : left 4 bits shifted right, added to right 4 bits 1076 * 1077 * srli(2)+and(00000011) : left 2 bits (value 0..2) shifted right, masked, so no other bits active 1078 * and(00000011) : right 2 bits (value 0..2) masked so no other bits active 1079 * 1080 * final 2 add()s : tack current counts on to already-tabulated counts. 1081 */ 1082 #define FM_COUNT_2BIT(a_v, b_v, cnts_v) do {\ 1083 b_v = _mm_srli_epi16(a_v, 4);\ 1084 a_v = _mm_add_epi16(a_v, b_v);\ 1085 \ 1086 b_v = _mm_srli_epi16(a_v, 2);\ 1087 a_v = _mm_and_si128(a_v,cfg->fm_m11);\ 1088 b_v = _mm_and_si128(b_v,cfg->fm_m11);\ 1089 \ 1090 cnts_v = _mm_add_epi16(cnts_v, a_v);\ 1091 cnts_v = _mm_add_epi16(cnts_v, b_v);\ 1092 } while (0) 1093 1094 1095 1096 /* Macro for SSE operations that turns a vector of 4-bit character values into 1097 * 2 vectors representing matches. Each byte in the input vector consists of 1098 * a left half (4 bits) and a right half (4 bits). The 16 left-halves produce 1099 * one vector, which contains all-1s for bytes in which the left half matches 1100 * the c_v character (and 0s if it doesn't), while the 16 right-halves produce 1101 * the other vector, again with each byte either all-1s or all-0s. 1102 * 1103 * The expectation is that FM_COUNT_4BIT will be called after this, to 1104 * turn these binary values into sums over a series of vectors. The macros 1105 * are split up to allow one end or other to be trimmed in the case that 1106 * counting is not expected to include the full vector. 1107 * 1108 * srli(4)+and() : capture the left 4-bit value (need the mask because 16-bit shift leaves garbage in left-4-bit chunks) 1109 * and() : capture the right 4-bit value 1110 * 1111 * cmpeq()x2 : test if both left and right == c. For each, if ==c , value = 11111111 (-1) 1112 */ 1113 #define FM_MATCH_4BIT(in_v, c_v, out1_v, out2_v) do {\ 1114 out1_v = _mm_srli_epi16(in_v, 4);\ 1115 out2_v = _mm_and_si128(in_v, cfg->fm_m0f);\ 1116 out1_v = _mm_and_si128(out1_v, cfg->fm_m0f);\ 1117 \ 1118 out1_v = _mm_cmpeq_epi8(out1_v, c_v);\ 1119 out2_v = _mm_cmpeq_epi8(out2_v, c_v);\ 1120 } while (0) 1121 1122 1123 /* Macro for SSE operations that turns a vector of 4-bit character values into 1124 * 2 vectors representing matches. Each byte in the input vector consists of 1125 * a left half (4 bits) and a right half (4 bits). The 16 left-halves produce 1126 * one vector, which contains all-1s for bytes in which the left half is less than 1127 * the c_v character (and 0s if it doesn't), while the 16 right-halves produce 1128 * the other vector, again with each byte either all-1s or all-0s. 1129 * 1130 * The expectation is that FM_COUNT_4BIT will be called after this, to 1131 * turn these binary values into sums over a series of vectors. The macros 1132 * are split up to allow one end or other to be trimmed in the case that 1133 * counting is not expected to include the full vector. 1134 * 1135 * srli(4)+and() : capture the left 4-bit value (need the mask because 16-bit shift leaves garbage in left-4-bit chunks) 1136 * and() : capture the right 4-bit value 1137 * 1138 * cmplt()x2 : test if both left and right < c. For each, if <c , value = 11111111 (-1) 1139 */ 1140 #define FM_LT_4BIT(in_v, c_v, out1_v, out2_v) do {\ 1141 out1_v = _mm_srli_epi16(in_v, 4);\ 1142 out2_v = _mm_and_si128(in_v, cfg->fm_m0f);\ 1143 out1_v = _mm_and_si128(out1_v, cfg->fm_m0f);\ 1144 \ 1145 out1_v = _mm_cmplt_epi8(out1_v, c_v);\ 1146 out2_v = _mm_cmplt_epi8(out2_v, c_v);\ 1147 } while (0) 1148 1149 1150 1151 /* Macro for SSE operations to add occurrence counts to the tally vector counts_v, 1152 * in the 4-bits-per-character case 1153 * 1154 * The expectation is that in[12]_v will contain bytes that are either 1155 * 00000000 = 0 1156 * or 1157 * 11111111 = -1 1158 * so subtracting the value of the byte is the same as adding 0 or 1. 1159 */ 1160 #define FM_COUNT_4BIT(in1_v, in2_v, cnts_v) do {\ 1161 cnts_v = _mm_subs_epi8(cnts_v, in1_v);\ 1162 cnts_v = _mm_subs_epi8(cnts_v, in2_v);\ 1163 } while (0) 1164 1165 1166 #endif // if defined (eslENABLE_SSE) 1167 1168 /***************************************************************** 1169 * 16. P7_PIPELINE: H3's accelerated seq/profile comparison pipeline 1170 *****************************************************************/ 1171 1172 enum p7_pipemodes_e { p7_SEARCH_SEQS = 0, p7_SCAN_MODELS = 1 }; 1173 enum p7_zsetby_e { p7_ZSETBY_NTARGETS = 0, p7_ZSETBY_OPTION = 1, p7_ZSETBY_FILEINFO = 2 }; 1174 enum p7_complementarity_e { p7_NOCOMPLEMENT = 0, p7_COMPLEMENT = 1 }; 1175 1176 typedef struct p7_pipeline_s { 1177 /* Dynamic programming matrices */ 1178 P7_OMX *oxf; /* one-row Forward matrix, accel pipe */ 1179 P7_OMX *oxb; /* one-row Backward matrix, accel pipe */ 1180 P7_OMX *fwd; /* full Fwd matrix for domain envelopes */ 1181 P7_OMX *bck; /* full Bck matrix for domain envelopes */ 1182 1183 /* Domain postprocessing */ 1184 ESL_RANDOMNESS *r; /* random number generator */ 1185 int do_reseeding; /* TRUE: reseed for reproducible results */ 1186 int do_alignment_score_calc; 1187 P7_DOMAINDEF *ddef; /* domain definition workflow */ 1188 1189 1190 /* Reporting threshold settings */ 1191 int by_E; /* TRUE to cut per-target report off by E */ 1192 double E; /* per-target E-value threshold */ 1193 double T; /* per-target bit score threshold */ 1194 int dom_by_E; /* TRUE to cut domain reporting off by E */ 1195 double domE; /* domain E-value threshold */ 1196 double domT; /* domain bit score threshold */ 1197 int use_bit_cutoffs; /* (FALSE | p7H_GA | p7H_TC | p7H_NC) */ 1198 1199 /* Inclusion threshold settings */ 1200 int inc_by_E; /* TRUE to threshold inclusion by E-values */ 1201 double incE; /* per-target inclusion E-value threshold */ 1202 double incT; /* per-target inclusion score threshold */ 1203 int incdom_by_E; /* TRUE to threshold domain inclusion by E */ 1204 double incdomE; /* per-domain inclusion E-value threshold */ 1205 double incdomT; /* per-domain inclusion E-value threshold */ 1206 1207 /* Tracking search space sizes for E value calculations */ 1208 double Z; /* eff # targs searched (per-target E-val) */ 1209 double domZ; /* eff # signific targs (per-domain E-val) */ 1210 enum p7_zsetby_e Z_setby; /* how Z was set */ 1211 enum p7_zsetby_e domZ_setby; /* how domZ was set */ 1212 1213 /* Threshold settings for pipeline */ 1214 int do_max; /* TRUE to run in slow/max mode */ 1215 double F1; /* MSV filter threshold */ 1216 double F2; /* Viterbi filter threshold */ 1217 double F3; /* uncorrected Forward filter threshold */ 1218 int B1; /* window length for biased-composition modifier - MSV*/ 1219 int B2; /* window length for biased-composition modifier - Viterbi*/ 1220 int B3; /* window length for biased-composition modifier - Forward*/ 1221 int do_biasfilter; /* TRUE to use biased comp HMM filter */ 1222 int do_null2; /* TRUE to use null2 score corrections */ 1223 1224 /* Accounting. (reduceable in threaded/MPI parallel version) */ 1225 uint64_t nmodels; /* # of HMMs searched */ 1226 uint64_t nseqs; /* # of sequences searched */ 1227 uint64_t nres; /* # of residues searched */ 1228 uint64_t nnodes; /* # of model nodes searched */ 1229 uint64_t n_past_msv; /* # comparisons that pass MSVFilter() */ 1230 uint64_t n_past_bias; /* # comparisons that pass bias filter */ 1231 uint64_t n_past_vit; /* # comparisons that pass ViterbiFilter() */ 1232 uint64_t n_past_fwd; /* # comparisons that pass ForwardFilter() */ 1233 uint64_t n_output; /* # alignments that make it to the final output (used for nhmmer) */ 1234 uint64_t pos_past_msv; /* # positions that pass MSVFilter() (used for nhmmer) */ 1235 uint64_t pos_past_bias; /* # positions that pass bias filter (used for nhmmer) */ 1236 uint64_t pos_past_vit; /* # positions that pass ViterbiFilter() (used for nhmmer) */ 1237 uint64_t pos_past_fwd; /* # positions that pass ForwardFilter() (used for nhmmer) */ 1238 uint64_t pos_output; /* # positions that make it to the final output (used for nhmmer) */ 1239 1240 enum p7_pipemodes_e mode; /* p7_SCAN_MODELS | p7_SEARCH_SEQS */ 1241 int long_targets; /* TRUE if the target sequences are expected to be very long (e.g. dna chromosome search in nhmmer) */ 1242 int strands; /* p7_STRAND_TOPONLY | p7_STRAND_BOTTOMONLY | p7_STRAND_BOTH */ 1243 int W; /* window length for nhmmer scan - essentially maximum length of model that we expect to find*/ 1244 int block_length; /* length of overlapping blocks read in the multi-threaded variant (default MAX_RESIDUE_COUNT) */ 1245 1246 int show_accessions;/* TRUE to output accessions not names */ 1247 int show_alignments;/* TRUE to output alignments (default) */ 1248 1249 P7_HMMFILE *hfp; /* COPY of open HMM database (if scan mode) */ 1250 char errbuf[eslERRBUFSIZE]; 1251 } P7_PIPELINE; 1252 1253 1254 1255 /***************************************************************** 1256 * 17. P7_BUILDER: pipeline for new HMM construction 1257 *****************************************************************/ 1258 1259 #define p7_DEFAULT_WINDOW_BETA 1e-7 1260 1261 enum p7_archchoice_e { p7_ARCH_FAST = 0, p7_ARCH_HAND = 1 }; 1262 enum p7_wgtchoice_e { p7_WGT_NONE = 0, p7_WGT_GIVEN = 1, p7_WGT_GSC = 2, p7_WGT_PB = 3, p7_WGT_BLOSUM = 4 }; 1263 enum p7_effnchoice_e { p7_EFFN_NONE = 0, p7_EFFN_SET = 1, p7_EFFN_CLUST = 2, p7_EFFN_ENTROPY = 3, p7_EFFN_ENTROPY_EXP = 4 }; 1264 1265 typedef struct p7_builder_s { 1266 /* Model architecture */ 1267 enum p7_archchoice_e arch_strategy; /* choice of model architecture determination algorithm */ 1268 float symfrac; /* residue occ thresh for fast architecture determination */ 1269 float fragthresh; /* if L <= fragthresh*alen, seq is called a fragment */ 1270 1271 /* Relative sequence weights */ 1272 enum p7_wgtchoice_e wgt_strategy; /* choice of relative sequence weighting algorithm */ 1273 double wid; /* %id threshold for BLOSUM relative weighting */ 1274 1275 /* Effective sequence number */ 1276 enum p7_effnchoice_e effn_strategy; /* choice of effective seq # determination algorithm */ 1277 double re_target; /* rel entropy target for effn eweighting, if set; or -1.0*/ 1278 double esigma; /* min total rel ent parameter for effn entropy weights */ 1279 double eid; /* %id threshold for effn clustering */ 1280 double eset; /* effective sequence number, if --eset; or -1.0 */ 1281 1282 /* Run-to-run variation due to random number generation */ 1283 ESL_RANDOMNESS *r; /* RNG for E-value calibration simulations */ 1284 int do_reseeding; /* TRUE to reseed, making results reproducible */ 1285 1286 /* E-value parameter calibration */ 1287 int EmL; /* length of sequences generated for MSV fitting */ 1288 int EmN; /* # of sequences generated for MSV fitting */ 1289 int EvL; /* length of sequences generated for Viterbi fitting */ 1290 int EvN; /* # of sequences generated for Viterbi fitting */ 1291 int EfL; /* length of sequences generated for Forward fitting */ 1292 int EfN; /* # of sequences generated for Forward fitting */ 1293 double Eft; /* tail mass used for Forward fitting */ 1294 1295 /* Choice of prior */ 1296 P7_PRIOR *prior; /* choice of prior when parameterizing from counts */ 1297 int max_insert_len; 1298 1299 /* Optional: information used for parameterizing single sequence queries */ 1300 ESL_SCOREMATRIX *S; /* residue score matrix */ 1301 ESL_DMATRIX *Q; /* Q->mx[a][b] = P(b|a) residue probabilities */ 1302 double popen; /* gap open probability */ 1303 double pextend; /* gap extend probability */ 1304 1305 double w_beta; /*beta value used to compute W (window length) */ 1306 int w_len; /*W (window length) explicitly set */ 1307 1308 const ESL_ALPHABET *abc; /* COPY of alphabet */ 1309 char errbuf[eslERRBUFSIZE]; /* informative message on model construction failure */ 1310 } P7_BUILDER; 1311 1312 1313 1314 /***************************************************************** 1315 * 18. Routines in HMMER's exposed API. 1316 *****************************************************************/ 1317 1318 /* build.c */ 1319 extern int p7_Handmodelmaker(ESL_MSA *msa, P7_BUILDER *bld, P7_HMM **ret_hmm, P7_TRACE ***ret_tr); 1320 extern int p7_Fastmodelmaker(ESL_MSA *msa, float symfrac, P7_BUILDER *bld, P7_HMM **ret_hmm, P7_TRACE ***ret_tr); 1321 1322 /* emit.c */ 1323 extern int p7_CoreEmit (ESL_RANDOMNESS *r, const P7_HMM *hmm, ESL_SQ *sq, P7_TRACE *tr); 1324 extern int p7_ProfileEmit(ESL_RANDOMNESS *r, const P7_HMM *hmm, const P7_PROFILE *gm, const P7_BG *bg, ESL_SQ *sq, P7_TRACE *tr); 1325 extern int p7_emit_SimpleConsensus(const P7_HMM *hmm, ESL_SQ *sq); 1326 extern int p7_emit_FancyConsensus (const P7_HMM *hmm, float min_lower, float min_upper, ESL_SQ *sq); 1327 1328 /* errors.c */ 1329 extern void p7_Die (char *format, ...); 1330 extern void p7_Fail(char *format, ...); 1331 1332 /* evalues.c */ 1333 extern int p7_Calibrate(P7_HMM *hmm, P7_BUILDER *cfg_b, ESL_RANDOMNESS **byp_rng, P7_BG **byp_bg, P7_PROFILE **byp_gm, P7_OPROFILE **byp_om); 1334 extern int p7_Lambda(P7_HMM *hmm, P7_BG *bg, double *ret_lambda); 1335 extern int p7_MSVMu (ESL_RANDOMNESS *r, P7_OPROFILE *om, P7_BG *bg, int L, int N, double lambda, double *ret_mmu); 1336 extern int p7_ViterbiMu (ESL_RANDOMNESS *r, P7_OPROFILE *om, P7_BG *bg, int L, int N, double lambda, double *ret_vmu); 1337 extern int p7_Tau (ESL_RANDOMNESS *r, P7_OPROFILE *om, P7_BG *bg, int L, int N, double lambda, double tailp, double *ret_tau); 1338 1339 /* eweight.c */ 1340 extern int p7_EntropyWeight(const P7_HMM *hmm, const P7_BG *bg, const P7_PRIOR *pri, double infotarget, double *ret_Neff); 1341 1342 extern int p7_EntropyWeight_exp(const P7_HMM *hmm, const P7_BG *bg, const P7_PRIOR *pri, double etarget, double *ret_exp); 1343 /* generic_decoding.c */ 1344 extern int p7_GDecoding (const P7_PROFILE *gm, const P7_GMX *fwd, P7_GMX *bck, P7_GMX *pp); 1345 extern int p7_GDomainDecoding(const P7_PROFILE *gm, const P7_GMX *fwd, const P7_GMX *bck, P7_DOMAINDEF *ddef); 1346 1347 /* generic_fwdback.c */ 1348 extern int p7_GForward (const ESL_DSQ *dsq, int L, const P7_PROFILE *gm, P7_GMX *gx, float *ret_sc); 1349 extern int p7_GBackward (const ESL_DSQ *dsq, int L, const P7_PROFILE *gm, P7_GMX *gx, float *ret_sc); 1350 extern int p7_GHybrid (const ESL_DSQ *dsq, int L, const P7_PROFILE *gm, P7_GMX *gx, float *opt_fwdscore, float *opt_hybscore); 1351 1352 /* generic_msv.c */ 1353 extern int p7_GMSV (const ESL_DSQ *dsq, int L, const P7_PROFILE *gm, P7_GMX *gx, float nu, float *ret_sc); 1354 extern int p7_GMSV_longtarget(const ESL_DSQ *dsq, int L, P7_PROFILE *gm, P7_GMX *gx, float nu, P7_BG *bg, double P, P7_HMM_WINDOWLIST *windowlist); 1355 1356 /* generic_null2.c */ 1357 extern int p7_GNull2_ByExpectation(const P7_PROFILE *gm, P7_GMX *pp, float *null2); 1358 extern int p7_GNull2_ByTrace (const P7_PROFILE *gm, const P7_TRACE *tr, int zstart, int zend, P7_GMX *wrk, float *null2); 1359 1360 /* generic_optacc.c */ 1361 extern int p7_GOptimalAccuracy(const P7_PROFILE *gm, const P7_GMX *pp, P7_GMX *gx, float *ret_e); 1362 extern int p7_GOATrace (const P7_PROFILE *gm, const P7_GMX *pp, const P7_GMX *gx, P7_TRACE *tr); 1363 1364 /* generic_stotrace.c */ 1365 extern int p7_GStochasticTrace(ESL_RANDOMNESS *r, const ESL_DSQ *dsq, int L, const P7_PROFILE *gm, const P7_GMX *gx, P7_TRACE *tr); 1366 1367 /* generic_viterbi.c */ 1368 extern int p7_GViterbi (const ESL_DSQ *dsq, int L, const P7_PROFILE *gm, P7_GMX *gx, float *ret_sc); 1369 1370 /* generic_vtrace.c */ 1371 extern int p7_GTrace (const ESL_DSQ *dsq, int L, const P7_PROFILE *gm, const P7_GMX *gx, P7_TRACE *tr); 1372 extern int p7_GViterbi_longtarget(const ESL_DSQ *dsq, int L, const P7_PROFILE *gm, P7_GMX *gx, 1373 float filtersc, double P, P7_HMM_WINDOWLIST *windowlist); 1374 1375 1376 /* heatmap.c (evolving now, intend to move this to Easel in the future) */ 1377 extern double dmx_upper_max(ESL_DMATRIX *D); 1378 extern double dmx_upper_min(ESL_DMATRIX *D); 1379 extern double dmx_upper_element_sum(ESL_DMATRIX *D); 1380 extern double dmx_upper_norm(ESL_DMATRIX *D); 1381 extern int dmx_Visualize(FILE *fp, ESL_DMATRIX *D, double min, double max); 1382 1383 /* hmmdutils.c */ 1384 extern void p7_openlog(const char *ident, int option, int facility); 1385 extern void p7_syslog(int priority, const char *format, ...); 1386 extern void p7_closelog(void); 1387 1388 /* hmmlogo.c */ 1389 extern float hmmlogo_maxHeight (P7_BG *bg); 1390 extern int hmmlogo_RelativeEntropy_all (P7_HMM *hmm, P7_BG *bg, float *rel_ents, float **probs, float **heights ); 1391 extern int hmmlogo_RelativeEntropy_above_bg (P7_HMM *hmm, P7_BG *bg, float *rel_ents, float **probs, float **heights ); 1392 extern int hmmlogo_ScoreHeights (P7_HMM *hmm, P7_BG *bg, float **heights ); 1393 extern int hmmlogo_IndelValues (P7_HMM *hmm, float *insert_P, float *insert_expL, float *delete_P ); 1394 1395 1396 1397 /* hmmpgmd2msa.c */ 1398 extern int hmmpgmd2msa(void *data, P7_HMM *hmm, ESL_SQ *qsq, int *incl, int incl_size, int *excl, int excl_size, int excl_all, ESL_MSA **ret_msa); 1399 1400 1401 1402 /* island.c */ 1403 extern int p7_island_Viterbi(ESL_DSQ *dsq, int L, P7_PROFILE *gm, P7_GMX *mx, ESL_HISTOGRAM *h); 1404 1405 /* h2_io.c */ 1406 extern int p7_h2io_WriteASCII(FILE *fp, P7_HMM *hmm); 1407 1408 /* hmmer.c */ 1409 extern void p7_banner(FILE *fp, const char *progname, char *banner); 1410 extern ESL_GETOPTS *p7_CreateDefaultApp(ESL_OPTIONS *options, int nargs, int argc, char **argv, char *banner, char *usage); 1411 extern int p7_AminoFrequencies(float *f); 1412 1413 /* logsum.c */ 1414 extern int p7_FLogsumInit(void); 1415 extern float p7_FLogsum(float a, float b); 1416 extern float p7_FLogsumError(float a, float b); 1417 extern int p7_ILogsumInit(void); 1418 extern int p7_ILogsum(int s1, int s2); 1419 1420 1421 /* modelconfig.c */ 1422 extern int p7_ProfileConfig(const P7_HMM *hmm, const P7_BG *bg, P7_PROFILE *gm, int L, int mode); 1423 extern int p7_ReconfigLength (P7_PROFILE *gm, int L); 1424 extern int p7_ReconfigMultihit(P7_PROFILE *gm, int L); 1425 extern int p7_ReconfigUnihit (P7_PROFILE *gm, int L); 1426 1427 /* modelstats.c */ 1428 extern double p7_MeanMatchInfo (const P7_HMM *hmm, const P7_BG *bg); 1429 extern double p7_MeanMatchEntropy (const P7_HMM *hmm); 1430 extern double p7_MeanMatchRelativeEntropy(const P7_HMM *hmm, const P7_BG *bg); 1431 extern double p7_MeanForwardScore (const P7_HMM *hmm, const P7_BG *bg); 1432 extern int p7_MeanPositionRelativeEntropy(const P7_HMM *hmm, const P7_BG *bg, double *ret_entropy); 1433 extern int p7_hmm_CompositionKLD(const P7_HMM *hmm, const P7_BG *bg, float *ret_KL, float **opt_avp); 1434 1435 1436 /* mpisupport.c */ 1437 #ifdef HMMER_MPI 1438 extern int p7_hmm_MPISend(P7_HMM *hmm, int dest, int tag, MPI_Comm comm, char **buf, int *nalloc); 1439 extern int p7_hmm_MPIPackSize(P7_HMM *hmm, MPI_Comm comm, int *ret_n); 1440 extern int p7_hmm_MPIPack(P7_HMM *hmm, char *buf, int n, int *position, MPI_Comm comm); 1441 extern int p7_hmm_MPIUnpack(char *buf, int n, int *pos, MPI_Comm comm, ESL_ALPHABET **abc, P7_HMM **ret_hmm); 1442 extern int p7_hmm_MPIRecv(int source, int tag, MPI_Comm comm, char **buf, int *nalloc, ESL_ALPHABET **abc, P7_HMM **ret_hmm); 1443 1444 extern int p7_profile_MPISend(P7_PROFILE *gm, int dest, int tag, MPI_Comm comm, char **buf, int *nalloc); 1445 extern int p7_profile_MPIRecv(int source, int tag, MPI_Comm comm, const ESL_ALPHABET *abc, const P7_BG *bg, 1446 char **buf, int *nalloc, P7_PROFILE **ret_gm); 1447 1448 extern int p7_pipeline_MPISend(P7_PIPELINE *pli, int dest, int tag, MPI_Comm comm, char **buf, int *nalloc); 1449 extern int p7_pipeline_MPIRecv(int source, int tag, MPI_Comm comm, char **buf, int *nalloc, ESL_GETOPTS *go, P7_PIPELINE **ret_pli); 1450 1451 extern int p7_tophits_MPISend(P7_TOPHITS *th, int dest, int tag, MPI_Comm comm, char **buf, int *nalloc); 1452 extern int p7_tophits_MPIRecv(int source, int tag, MPI_Comm comm, char **buf, int *nalloc, P7_TOPHITS **ret_th); 1453 1454 extern int p7_oprofile_MPISend(P7_OPROFILE *om, int dest, int tag, MPI_Comm comm, char **buf, int *nalloc); 1455 extern int p7_oprofile_MPIPackSize(P7_OPROFILE *om, MPI_Comm comm, int *ret_n); 1456 extern int p7_oprofile_MPIPack(P7_OPROFILE *om, char *buf, int n, int *pos, MPI_Comm comm); 1457 extern int p7_oprofile_MPIUnpack(char *buf, int n, int *pos, MPI_Comm comm, ESL_ALPHABET **abc, P7_OPROFILE **ret_om); 1458 extern int p7_oprofile_MPIRecv(int source, int tag, MPI_Comm comm, char **buf, int *nalloc, ESL_ALPHABET **abc, P7_OPROFILE **ret_om); 1459 #endif /*HMMER_MPI*/ 1460 1461 /* tracealign.c */ 1462 extern int p7_tracealign_Seqs(ESL_SQ **sq, P7_TRACE **tr, int nseq, int M, int optflags, P7_HMM *hmm, ESL_MSA **ret_msa); 1463 extern int p7_tracealign_MSA (const ESL_MSA *premsa, P7_TRACE **tr, int M, int optflags, ESL_MSA **ret_postmsa); 1464 extern int p7_tracealign_computeTraces(P7_HMM *hmm, ESL_SQ **sq, int offset, int N, P7_TRACE **tr); 1465 extern int p7_tracealign_getMSAandStats(P7_HMM *hmm, ESL_SQ **sq, int N, ESL_MSA **ret_msa, float **ret_pp, float **ret_relent, float **ret_scores ); 1466 1467 /* p7_alidisplay.c */ 1468 extern P7_ALIDISPLAY *p7_alidisplay_Create(const P7_TRACE *tr, int which, const P7_OPROFILE *om, const ESL_SQ *sq, const ESL_SQ *ntsq); 1469 extern P7_ALIDISPLAY *p7_alidisplay_Create_empty(); 1470 extern P7_ALIDISPLAY *p7_alidisplay_Clone(const P7_ALIDISPLAY *ad); 1471 extern size_t p7_alidisplay_Sizeof(const P7_ALIDISPLAY *ad); 1472 extern int p7_alidisplay_Serialize(const P7_ALIDISPLAY *obj, uint8_t **buf, uint32_t *n, uint32_t *nalloc); 1473 extern int p7_alidisplay_Deserialize(const uint8_t *buf, uint32_t *n, P7_ALIDISPLAY *ret_obj); 1474 extern int p7_alidisplay_Serialize_old(P7_ALIDISPLAY *ad); 1475 extern int p7_alidisplay_Deserialize_old(P7_ALIDISPLAY *ad); 1476 extern void p7_alidisplay_Destroy(P7_ALIDISPLAY *ad); 1477 extern char p7_alidisplay_EncodePostProb(float p); 1478 extern float p7_alidisplay_DecodePostProb(char pc); 1479 extern char p7_alidisplay_EncodeAliPostProb(float p, float hi, float med, float lo); 1480 1481 extern int p7_alidisplay_Print(FILE *fp, P7_ALIDISPLAY *ad, int min_aliwidth, int linewidth, P7_PIPELINE *pli); 1482 extern int p7_translated_alidisplay_Print(FILE *fp, P7_ALIDISPLAY *ad, int min_aliwidth, int linewidth, P7_PIPELINE *pli); 1483 extern int p7_nontranslated_alidisplay_Print(FILE *fp, P7_ALIDISPLAY *ad, int min_aliwidth, int linewidth, int show_accessions); 1484 1485 extern int p7_alidisplay_Backconvert(const P7_ALIDISPLAY *ad, const ESL_ALPHABET *abc, ESL_SQ **ret_sq, P7_TRACE **ret_tr); 1486 extern int p7_alidisplay_Sample(ESL_RANDOMNESS *rng, int N, P7_ALIDISPLAY **ret_ad); 1487 extern int p7_alidisplay_Dump(FILE *fp, const P7_ALIDISPLAY *ad); 1488 extern int p7_alidisplay_Compare(const P7_ALIDISPLAY *ad1, const P7_ALIDISPLAY *ad2); 1489 1490 /* p7_bg.c */ 1491 extern P7_BG *p7_bg_Create(const ESL_ALPHABET *abc); 1492 extern P7_BG *p7_bg_CreateUniform(const ESL_ALPHABET *abc); 1493 extern P7_BG *p7_bg_Clone(const P7_BG *bg); 1494 extern int p7_bg_Dump(FILE *ofp, const P7_BG *bg); 1495 extern void p7_bg_Destroy(P7_BG *bg); 1496 extern int p7_bg_SetLength(P7_BG *bg, int L); 1497 extern int p7_bg_NullOne(const P7_BG *bg, const ESL_DSQ *dsq, int L, float *ret_sc); 1498 1499 extern int p7_bg_Read(char *bgfile, P7_BG *bg, char *errbuf); 1500 extern int p7_bg_Write(FILE *fp, P7_BG *bg); 1501 1502 extern int p7_bg_SetFilter (P7_BG *bg, int M, const float *compo); 1503 extern int p7_bg_FilterScore(P7_BG *bg, const ESL_DSQ *dsq, int L, float *ret_sc); 1504 1505 /* p7_builder.c */ 1506 extern P7_BUILDER *p7_builder_Create(const ESL_GETOPTS *go, const ESL_ALPHABET *abc); 1507 extern int p7_builder_LoadScoreSystem(P7_BUILDER *bld, const char *matrix, double popen, double pextend, P7_BG *bg); 1508 extern int p7_builder_SetScoreSystem (P7_BUILDER *bld, const char *mxfile, const char *env, double popen, double pextend, P7_BG *bg); 1509 extern void p7_builder_Destroy(P7_BUILDER *bld); 1510 1511 extern int p7_Builder (P7_BUILDER *bld, ESL_MSA *msa, P7_BG *bg, P7_HMM **opt_hmm, P7_TRACE ***opt_trarr, P7_PROFILE **opt_gm, P7_OPROFILE **opt_om, ESL_MSA **opt_postmsa); 1512 extern int p7_SingleBuilder(P7_BUILDER *bld, ESL_SQ *sq, P7_BG *bg, P7_HMM **opt_hmm, P7_TRACE **opt_tr, P7_PROFILE **opt_gm, P7_OPROFILE **opt_om); 1513 extern int p7_Builder_MaxLength (P7_HMM *hmm, double emit_thresh); 1514 1515 /* p7_domain.c */ 1516 extern P7_DOMAIN *p7_domain_Create_empty(); 1517 extern void p7_domain_Destroy(P7_DOMAIN *obj); 1518 extern int p7_domain_Serialize(const P7_DOMAIN *obj, uint8_t **buf, uint32_t *n, uint32_t *nalloc); 1519 extern int p7_domain_Deserialize(const uint8_t *buf, uint32_t *n, P7_DOMAIN *ret_obj); 1520 extern int p7_domain_TestSample(ESL_RAND64 *rng, P7_DOMAIN **ret_obj); 1521 extern int p7_domain_Compare(P7_DOMAIN *first, P7_DOMAIN *second, double atol, double rtol); 1522 1523 /* p7_domaindef.c */ 1524 extern P7_DOMAINDEF *p7_domaindef_Create (ESL_RANDOMNESS *r); 1525 extern int p7_domaindef_Fetch (P7_DOMAINDEF *ddef, int which, int *opt_i, int *opt_j, float *opt_sc, P7_ALIDISPLAY **opt_ad); 1526 extern int p7_domaindef_GrowTo (P7_DOMAINDEF *ddef, int L); 1527 extern int p7_domaindef_Reuse (P7_DOMAINDEF *ddef); 1528 extern int p7_domaindef_DumpPosteriors(FILE *ofp, P7_DOMAINDEF *ddef); 1529 extern void p7_domaindef_Destroy(P7_DOMAINDEF *ddef); 1530 1531 extern int p7_domaindef_ByViterbi (P7_PROFILE *gm, const ESL_SQ *sq, const ESL_SQ *ntsq, P7_GMX *gx1, P7_GMX *gx2, P7_DOMAINDEF *ddef); 1532 extern int p7_domaindef_ByPosteriorHeuristics(const ESL_SQ *sq, const ESL_SQ *ntsq, P7_OPROFILE *om, P7_OMX *oxf, P7_OMX *oxb, P7_OMX *fwd, P7_OMX *bck, 1533 P7_DOMAINDEF *ddef, P7_BG *bg, int long_target, 1534 P7_BG *bg_tmp, float *scores_arr, float *fwd_emissions_arr); 1535 1536 1537 /* p7_gmx.c */ 1538 extern P7_GMX *p7_gmx_Create (int allocM, int allocL); 1539 extern int p7_gmx_GrowTo (P7_GMX *gx, int allocM, int allocL); 1540 extern size_t p7_gmx_Sizeof (P7_GMX *gx); 1541 extern int p7_gmx_Reuse (P7_GMX *gx); 1542 extern void p7_gmx_Destroy(P7_GMX *gx); 1543 extern int p7_gmx_Compare(P7_GMX *gx1, P7_GMX *gx2, float tolerance); 1544 extern int p7_gmx_Dump(FILE *fp, P7_GMX *gx, int flags); 1545 extern int p7_gmx_DumpWindow(FILE *fp, P7_GMX *gx, int istart, int iend, int kstart, int kend, int show_specials); 1546 1547 /* p7_hit.c */ 1548 extern P7_HIT *p7_hit_Create_empty(); 1549 extern void p7_hit_Destroy(P7_HIT *the_hit); 1550 extern int p7_hit_Serialize(const P7_HIT *obj, uint8_t **buf, uint32_t *n, uint32_t *nalloc); 1551 extern int p7_hit_Deserialize(const uint8_t *buf, uint32_t *n, P7_HIT *ret_obj); 1552 extern int p7_hit_TestSample(ESL_RAND64 *rng, P7_HIT **ret_obj); 1553 extern int p7_hit_Compare(P7_HIT *first, P7_HIT *second, double atol, double rtol); 1554 1555 /* p7_hmm.c */ 1556 /* 1. The P7_HMM object: allocation, initialization, destruction. */ 1557 extern P7_HMM *p7_hmm_Create(int M, const ESL_ALPHABET *abc); 1558 extern P7_HMM *p7_hmm_CreateShell(void); 1559 extern int p7_hmm_CreateBody(P7_HMM *hmm, int M, const ESL_ALPHABET *abc); 1560 extern void p7_hmm_Destroy(P7_HMM *hmm); 1561 extern int p7_hmm_CopyParameters(const P7_HMM *src, P7_HMM *dest); 1562 extern P7_HMM *p7_hmm_Clone(const P7_HMM *hmm); 1563 extern int p7_hmm_Zero(P7_HMM *hmm); 1564 extern char p7_hmm_EncodeStatetype(char *typestring); 1565 extern char *p7_hmm_DecodeStatetype(char st); 1566 /* 2. Convenience routines for setting fields in an HMM. */ 1567 extern int p7_hmm_SetName (P7_HMM *hmm, char *name); 1568 extern int p7_hmm_SetAccession (P7_HMM *hmm, char *acc); 1569 extern int p7_hmm_SetDescription(P7_HMM *hmm, char *desc); 1570 extern int p7_hmm_AppendComlog (P7_HMM *hmm, int argc, char **argv); 1571 extern int p7_hmm_SetCtime (P7_HMM *hmm); 1572 extern int p7_hmm_SetComposition(P7_HMM *hmm); 1573 extern int p7_hmm_SetConsensus (P7_HMM *hmm, ESL_SQ *sq); 1574 /* 3. Renormalization and rescaling counts in core HMMs. */ 1575 extern int p7_hmm_Scale (P7_HMM *hmm, double scale); 1576 extern int p7_hmm_ScaleExponential(P7_HMM *hmm, double exp); 1577 extern int p7_hmm_Renormalize(P7_HMM *hmm); 1578 /* 4. Debugging and development code. */ 1579 extern int p7_hmm_Dump(FILE *fp, P7_HMM *hmm); 1580 extern int p7_hmm_Sample (ESL_RANDOMNESS *r, int M, const ESL_ALPHABET *abc, P7_HMM **ret_hmm); 1581 extern int p7_hmm_SampleUngapped (ESL_RANDOMNESS *r, int M, const ESL_ALPHABET *abc, P7_HMM **ret_hmm); 1582 extern int p7_hmm_SampleEnumerable(ESL_RANDOMNESS *r, int M, const ESL_ALPHABET *abc, P7_HMM **ret_hmm); 1583 extern int p7_hmm_SampleUniform (ESL_RANDOMNESS *r, int M, const ESL_ALPHABET *abc, 1584 float tmi, float tii, float tmd, float tdd, P7_HMM **ret_hmm); 1585 extern int p7_hmm_Compare(P7_HMM *h1, P7_HMM *h2, float tol); 1586 extern int p7_hmm_Validate(P7_HMM *hmm, char *errbuf, float tol); 1587 /* 5. Other routines in the API */ 1588 extern int p7_hmm_CalculateOccupancy(const P7_HMM *hmm, float *mocc, float *iocc); 1589 1590 1591 1592 /* p7_hmmfile.c */ 1593 extern int p7_hmmfile_OpenE (const char *filename, char *env, P7_HMMFILE **ret_hfp, char *errbuf); 1594 extern int p7_hmmfile_OpenENoDB(const char *filename, char *env, P7_HMMFILE **ret_hfp, char *errbuf); 1595 extern int p7_hmmfile_Open (const char *filename, char *env, P7_HMMFILE **ret_hfp); /* deprecated */ 1596 extern int p7_hmmfile_OpenNoDB (const char *filename, char *env, P7_HMMFILE **ret_hfp); /* deprecated */ 1597 extern int p7_hmmfile_OpenBuffer(const char *buffer, int size, P7_HMMFILE **ret_hfp); 1598 extern void p7_hmmfile_Close(P7_HMMFILE *hfp); 1599 #ifdef HMMER_THREADS 1600 extern int p7_hmmfile_CreateLock(P7_HMMFILE *hfp); 1601 #endif 1602 extern int p7_hmmfile_WriteBinary(FILE *fp, int format, P7_HMM *hmm); 1603 extern int p7_hmmfile_WriteASCII (FILE *fp, int format, P7_HMM *hmm); 1604 extern int p7_hmmfile_WriteToString (char **s, int format, P7_HMM *hmm); 1605 extern int p7_hmmfile_Read(P7_HMMFILE *hfp, ESL_ALPHABET **ret_abc, P7_HMM **opt_hmm); 1606 extern int p7_hmmfile_PositionByKey(P7_HMMFILE *hfp, const char *key); 1607 extern int p7_hmmfile_Position(P7_HMMFILE *hfp, const off_t offset); 1608 1609 1610 /* p7_hmmwindow.c */ 1611 int p7_hmmwindow_init (P7_HMM_WINDOWLIST *list); 1612 P7_HMM_WINDOW *p7_hmmwindow_new (P7_HMM_WINDOWLIST *list, uint32_t id, uint32_t pos, uint32_t fm_pos, uint16_t k, uint32_t length, float score, uint8_t complementarity, uint32_t target_len); 1613 1614 1615 1616 /* p7_msvdata.c */ 1617 extern P7_SCOREDATA *p7_hmm_ScoreDataCreate(P7_OPROFILE *om, P7_PROFILE *gm ); 1618 extern P7_SCOREDATA *p7_hmm_ScoreDataClone(P7_SCOREDATA *src, int K); 1619 extern int p7_hmm_ScoreDataComputeRest(P7_OPROFILE *om, P7_SCOREDATA *data ); 1620 extern void p7_hmm_ScoreDataDestroy( P7_SCOREDATA *data ); 1621 extern int p7_hmm_initWindows (P7_HMM_WINDOWLIST *list); 1622 extern P7_HMM_WINDOW *p7_hmm_newWindow (P7_HMM_WINDOWLIST *list, uint32_t id, uint32_t pos, uint32_t fm_pos, uint16_t k, uint32_t length, float score, uint8_t complementarity); 1623 1624 1625 1626 /* p7_null3.c */ 1627 extern void p7_null3_score(const ESL_ALPHABET *abc, const ESL_DSQ *dsq, P7_TRACE *tr, int start, int stop, P7_BG *bg, float *ret_sc); 1628 extern void p7_null3_windowed_score(const ESL_ALPHABET *abc, const ESL_DSQ *dsq, int start, int stop, P7_BG *bg, float *ret_sc); 1629 1630 /* p7_pipeline.c */ 1631 extern P7_PIPELINE *p7_pipeline_Create(const ESL_GETOPTS *go, int M_hint, int L_hint, int long_targets, enum p7_pipemodes_e mode); 1632 extern int p7_pipeline_Reuse (P7_PIPELINE *pli); 1633 extern void p7_pipeline_Destroy(P7_PIPELINE *pli); 1634 extern int p7_pipeline_Merge (P7_PIPELINE *p1, P7_PIPELINE *p2); 1635 1636 extern int p7_pli_ExtendAndMergeWindows (P7_OPROFILE *om, const P7_SCOREDATA *msvdata, P7_HMM_WINDOWLIST *windowlist, float pct_overlap); 1637 extern int p7_pli_TargetReportable (P7_PIPELINE *pli, float score, double lnP); 1638 extern int p7_pli_DomainReportable (P7_PIPELINE *pli, float dom_score, double lnP); 1639 1640 extern int p7_pli_TargetIncludable (P7_PIPELINE *pli, float score, double lnP); 1641 extern int p7_pli_DomainIncludable (P7_PIPELINE *pli, float dom_score, double lnP); 1642 extern int p7_pli_NewModel (P7_PIPELINE *pli, const P7_OPROFILE *om, P7_BG *bg); 1643 extern int p7_pli_NewModelThresholds(P7_PIPELINE *pli, const P7_OPROFILE *om); 1644 extern int p7_pli_NewSeq (P7_PIPELINE *pli, const ESL_SQ *sq); 1645 extern int p7_Pipeline (P7_PIPELINE *pli, P7_OPROFILE *om, P7_BG *bg, const ESL_SQ *sq, const ESL_SQ *ntsq, P7_TOPHITS *th); 1646 extern int p7_Pipeline_LongTarget (P7_PIPELINE *pli, P7_OPROFILE *om, P7_SCOREDATA *data, 1647 P7_BG *bg, P7_TOPHITS *hitlist, int64_t seqidx, 1648 const ESL_SQ *sq, int complementarity, 1649 const FM_DATA *fmf, const FM_DATA *fmb, FM_CFG *fm_cfg 1650 ); 1651 1652 1653 1654 extern int p7_pli_Statistics(FILE *ofp, P7_PIPELINE *pli, ESL_STOPWATCH *w); 1655 1656 1657 /* p7_prior.c */ 1658 extern P7_PRIOR *p7_prior_CreateAmino(void); 1659 extern P7_PRIOR *p7_prior_CreateNucleic(void); 1660 extern P7_PRIOR *p7_prior_CreateLaplace(const ESL_ALPHABET *abc); 1661 extern void p7_prior_Destroy(P7_PRIOR *pri); 1662 1663 extern int p7_ParameterEstimation(P7_HMM *hmm, const P7_PRIOR *pri); 1664 1665 /* p7_profile.c */ 1666 extern P7_PROFILE *p7_profile_Create(int M, const ESL_ALPHABET *abc); 1667 extern P7_PROFILE *p7_profile_Clone(const P7_PROFILE *gm); 1668 extern int p7_profile_Copy(const P7_PROFILE *src, P7_PROFILE *dst); 1669 extern int p7_profile_SetNullEmissions(P7_PROFILE *gm); 1670 extern int p7_profile_Reuse(P7_PROFILE *gm); 1671 extern size_t p7_profile_Sizeof(P7_PROFILE *gm); 1672 extern void p7_profile_Destroy(P7_PROFILE *gm); 1673 extern int p7_profile_IsLocal(const P7_PROFILE *gm); 1674 extern int p7_profile_IsMultihit(const P7_PROFILE *gm); 1675 extern int p7_profile_GetT(const P7_PROFILE *gm, char st1, int k1, 1676 char st2, int k2, float *ret_tsc); 1677 extern int p7_profile_Validate(const P7_PROFILE *gm, char *errbuf, float tol); 1678 extern int p7_profile_Compare(P7_PROFILE *gm1, P7_PROFILE *gm2, float tol); 1679 1680 /* p7_spensemble.c */ 1681 P7_SPENSEMBLE *p7_spensemble_Create(int init_n, int init_epc, int init_sigc); 1682 extern int p7_spensemble_Reuse(P7_SPENSEMBLE *sp); 1683 extern int p7_spensemble_Add(P7_SPENSEMBLE *sp, int sampleidx, int i, int j, int k, int m); 1684 extern int p7_spensemble_Cluster(P7_SPENSEMBLE *sp, 1685 float min_overlap, int of_smaller, int max_diagdiff, 1686 float min_posterior, float min_endpointp, 1687 int *ret_nclusters); 1688 extern int p7_spensemble_GetClusterCoords(P7_SPENSEMBLE *sp, int which, 1689 int *ret_i, int *ret_j, int *ret_k, int *ret_m, float *ret_p); 1690 extern void p7_spensemble_Destroy(P7_SPENSEMBLE *sp); 1691 1692 /* p7_tophits.c */ 1693 extern P7_TOPHITS *p7_tophits_Create(void); 1694 extern int p7_tophits_Grow(P7_TOPHITS *h); 1695 extern int p7_tophits_CreateNextHit(P7_TOPHITS *h, P7_HIT **ret_hit); 1696 extern int p7_tophits_Add(P7_TOPHITS *h, 1697 char *name, char *acc, char *desc, 1698 double sortkey, 1699 float score, double lnP, 1700 float mothersc, double mother_lnP, 1701 int sqfrom, int sqto, int sqlen, 1702 int hmmfrom, int hmmto, int hmmlen, 1703 int domidx, int ndom, 1704 P7_ALIDISPLAY *ali); 1705 extern int p7_tophits_SortBySortkey(P7_TOPHITS *h); 1706 extern int p7_tophits_SortBySeqidxAndAlipos(P7_TOPHITS *h); 1707 extern int p7_tophits_SortByModelnameAndAlipos(P7_TOPHITS *h); 1708 1709 extern int p7_tophits_Merge(P7_TOPHITS *h1, P7_TOPHITS *h2); 1710 extern int p7_tophits_GetMaxPositionLength(P7_TOPHITS *h); 1711 extern int p7_tophits_GetMaxNameLength(P7_TOPHITS *h); 1712 extern int p7_tophits_GetMaxAccessionLength(P7_TOPHITS *h); 1713 extern int p7_tophits_GetMaxShownLength(P7_TOPHITS *h); 1714 extern void p7_tophits_Destroy(P7_TOPHITS *h); 1715 1716 extern int p7_tophits_ComputeNhmmerEvalues(P7_TOPHITS *th, double N, int W); 1717 extern int p7_tophits_RemoveDuplicates(P7_TOPHITS *th, int using_bit_cutoffs); 1718 extern int p7_tophits_Threshold(P7_TOPHITS *th, P7_PIPELINE *pli); 1719 extern int p7_tophits_CompareRanking(P7_TOPHITS *th, ESL_KEYHASH *kh, int *opt_nnew); 1720 extern int p7_tophits_Targets(FILE *ofp, P7_TOPHITS *th, P7_PIPELINE *pli, int textw); 1721 extern int p7_tophits_Domains(FILE *ofp, P7_TOPHITS *th, P7_PIPELINE *pli, int textw); 1722 1723 1724 extern int p7_tophits_Alignment(const P7_TOPHITS *th, const ESL_ALPHABET *abc, 1725 ESL_SQ **inc_sqarr, P7_TRACE **inc_trarr, int inc_n, int optflags, 1726 ESL_MSA **ret_msa); 1727 extern int p7_tophits_TabularTargets(FILE *ofp, char *qname, char *qacc, P7_TOPHITS *th, P7_PIPELINE *pli, int show_header); 1728 extern int p7_tophits_TabularDomains(FILE *ofp, char *qname, char *qacc, P7_TOPHITS *th, P7_PIPELINE *pli, int show_header); 1729 extern int p7_tophits_TabularXfam(FILE *ofp, char *qname, char *qacc, P7_TOPHITS *th, P7_PIPELINE *pli); 1730 extern int p7_tophits_TabularTail(FILE *ofp, const char *progname, enum p7_pipemodes_e pipemode, 1731 const char *qfile, const char *tfile, const ESL_GETOPTS *go); 1732 extern int p7_tophits_AliScores(FILE *ofp, char *qname, P7_TOPHITS *th ); 1733 1734 /* p7_trace.c */ 1735 extern P7_TRACE *p7_trace_Create(void); 1736 extern P7_TRACE *p7_trace_CreateWithPP(void); 1737 extern int p7_trace_Reuse(P7_TRACE *tr); 1738 extern int p7_trace_Grow(P7_TRACE *tr); 1739 extern int p7_trace_GrowIndex(P7_TRACE *tr); 1740 extern int p7_trace_GrowTo(P7_TRACE *tr, int N); 1741 extern int p7_trace_GrowIndexTo(P7_TRACE *tr, int ndom); 1742 extern void p7_trace_Destroy(P7_TRACE *tr); 1743 extern void p7_trace_DestroyArray(P7_TRACE **tr, int N); 1744 1745 extern int p7_trace_GetDomainCount (const P7_TRACE *tr, int *ret_ndom); 1746 extern int p7_trace_GetStateUseCounts(const P7_TRACE *tr, int *counts); 1747 extern int p7_trace_GetDomainCoords (const P7_TRACE *tr, int which, int *ret_i1, int *ret_i2, 1748 int *ret_k1, int *ret_k2); 1749 1750 extern int p7_trace_Validate(const P7_TRACE *tr, const ESL_ALPHABET *abc, const ESL_DSQ *dsq, char *errbuf); 1751 extern int p7_trace_Dump(FILE *fp, const P7_TRACE *tr, const P7_PROFILE *gm, const ESL_DSQ *dsq); 1752 extern int p7_trace_Compare(P7_TRACE *tr1, P7_TRACE *tr2, float pptol); 1753 extern int p7_trace_Score(P7_TRACE *tr, ESL_DSQ *dsq, P7_PROFILE *gm, float *ret_sc); 1754 extern int p7_trace_SetPP(P7_TRACE *tr, const P7_GMX *pp); 1755 extern float p7_trace_GetExpectedAccuracy(const P7_TRACE *tr); 1756 1757 extern int p7_trace_Append(P7_TRACE *tr, char st, int k, int i); 1758 extern int p7_trace_AppendWithPP(P7_TRACE *tr, char st, int k, int i, float pp); 1759 extern int p7_trace_Reverse(P7_TRACE *tr); 1760 extern int p7_trace_Index(P7_TRACE *tr); 1761 1762 extern int p7_trace_FauxFromMSA(ESL_MSA *msa, int *matassign, int optflags, P7_TRACE **tr); 1763 extern int p7_trace_Doctor(P7_TRACE *tr, int *opt_ndi, int *opt_nid); 1764 1765 extern int p7_trace_Count(P7_HMM *hmm, ESL_DSQ *dsq, float wt, P7_TRACE *tr); 1766 1767 1768 /* seqmodel.c */ 1769 extern int p7_Seqmodel(const ESL_ALPHABET *abc, ESL_DSQ *dsq, int M, char *name, 1770 ESL_DMATRIX *P, float *f, double popen, double pextend, 1771 P7_HMM **ret_hmm); 1772 1773 /* fm_alphabet.c */ 1774 extern int fm_alphabetCreate (FM_METADATA *meta, uint8_t *alph_bits); 1775 extern int fm_alphabetDestroy (FM_METADATA *meta); 1776 extern int fm_reverseString (char *str, int N); 1777 extern int fm_getComplement (char c, uint8_t alph_type); 1778 1779 1780 /* fm_general.c */ 1781 extern uint64_t fm_computeSequenceOffset (const FM_DATA *fms, const FM_METADATA *meta, int block, uint64_t pos); 1782 extern int fm_getOriginalPosition (const FM_DATA *fms, const FM_METADATA *meta, int fm_id, int length, int direction, uint64_t fm_pos, 1783 uint32_t *segment_id, uint64_t *seg_pos); 1784 extern int fm_readFMmeta( FM_METADATA *meta); 1785 extern int fm_FM_read( FM_DATA *fm, FM_METADATA *meta, int getAll ); 1786 extern void fm_FM_destroy ( FM_DATA *fm, int isMainFM); 1787 extern uint8_t fm_getChar(uint8_t alph_type, int j, const uint8_t *B ); 1788 extern int fm_getSARangeReverse( const FM_DATA *fm, FM_CFG *cfg, char *query, char *inv_alph, FM_INTERVAL *interval); 1789 extern int fm_getSARangeForward( const FM_DATA *fm, FM_CFG *cfg, char *query, char *inv_alph, FM_INTERVAL *interval); 1790 extern int fm_configAlloc(FM_CFG **cfg); 1791 extern int fm_configDestroy(FM_CFG *cfg); 1792 extern int fm_metaDestroy(FM_METADATA *meta ); 1793 extern int fm_updateIntervalForward( const FM_DATA *fm, const FM_CFG *cfg, char c, FM_INTERVAL *interval_f, FM_INTERVAL *interval_bk); 1794 extern int fm_updateIntervalReverse( const FM_DATA *fm, const FM_CFG *cfg, char c, FM_INTERVAL *interval); 1795 extern int fm_initSeeds (FM_DIAGLIST *list) ; 1796 extern FM_DIAG * fm_newSeed (FM_DIAGLIST *list); 1797 extern int fm_initAmbiguityList (FM_AMBIGLIST *list); 1798 extern int fm_addAmbiguityRange (FM_AMBIGLIST *list, uint32_t start, uint32_t stop); 1799 extern int fm_convertRange2DSQ(const FM_DATA *fm, const FM_METADATA *meta, uint64_t first, int length, int complementarity, ESL_SQ *sq, int fix_ambiguities ); 1800 extern int fm_initConfigGeneric( FM_CFG *cfg, ESL_GETOPTS *go); 1801 1802 /* fm_ssv.c */ 1803 extern int p7_SSVFM_longlarget( P7_OPROFILE *om, float nu, P7_BG *bg, double F1, 1804 const FM_DATA *fmf, const FM_DATA *fmb, FM_CFG *fm_cfg, const P7_SCOREDATA *ssvdata, 1805 int strands, P7_HMM_WINDOWLIST *windowlist); 1806 1807 1808 /* fm_sse.c */ 1809 extern int fm_configInit (FM_CFG *cfg, ESL_GETOPTS *go); 1810 extern int fm_getOccCount (const FM_DATA *fm, const FM_CFG *cfg, int pos, uint8_t c); 1811 extern int fm_getOccCountLT (const FM_DATA *fm, const FM_CFG *cfg, int pos, uint8_t c, uint32_t *cnteq, uint32_t *cntlt); 1812 1813 #endif /*P7_HMMERH_INCLUDED*/ 1814 1815 1816