/* Sequence weighting algorithms.
 *
 * SRE, Sun Nov  5 09:11:13 2006 [Janelia]
 */
5 #ifndef eslMSAWEIGHT_INCLUDED
6 #define eslMSAWEIGHT_INCLUDED
7 #include "esl_config.h"
8 
9 #include "esl_msa.h"
10 #include "esl_rand64.h"
11 
/* ESL_MSAWEIGHT_CFG
 * Optional configuration/customization of PB weighting and %id filtering.
 *
 * Default values for these fields are given by the eslMSAWEIGHT_* macros
 * in this header. Field order and types are part of the public layout.
 */
typedef struct {
  /* Fragment definition and consensus column determination: */
  float fragthresh;     // seq is a fragment if (length from 1st to last aligned residue)/alen < fragthresh (i.e. span < minspan)
  float symfrac;        // col is consensus if nres / (nres+ngap) >= symfrac
  int   ignore_rf;      // TRUE to ignore RF line (if present), and always determine our own consensus

  /* Consensus determination by subsampling: */
  int   allow_samp;     // TRUE to allow consensus determination by subsampling (if nseq > sampthresh)
  int   sampthresh;     // if nseq > sampthresh, try to determine consensus on a sample, not all nseq
  int   nsamp;          // number of seqs in sample, if determining consensus by sample
  int   maxfrag;        // if sample has > maxfrag fragments in it, abort determining consensus by sample; use all nseq instead
  uint64_t seed;        // RNG seed (per the ESL_MSAWEIGHT_DAT notes, 0 means a random seed is chosen)

  /* Only affects %id filtering: */
  int   filterpref;     // one of eslMSAWEIGHT_FILT_CONSCOVER | eslMSAWEIGHT_FILT_RANDOM | eslMSAWEIGHT_FILT_ORIGORDER
} ESL_MSAWEIGHT_CFG;
28 
/* Default parameters for ESL_MSAWEIGHT_CFG.
 * Each macro corresponds to the like-named field of the config structure.
 */
#define  eslMSAWEIGHT_FRAGTHRESH  0.5      // default <fragthresh>
#define  eslMSAWEIGHT_SYMFRAC     0.5      // default <symfrac>
#define  eslMSAWEIGHT_IGNORE_RF   FALSE    // default <ignore_rf>
#define  eslMSAWEIGHT_ALLOW_SAMP  TRUE     // default <allow_samp>
#define  eslMSAWEIGHT_SAMPTHRESH  50000    // default <sampthresh>
#define  eslMSAWEIGHT_NSAMP       10000    // default <nsamp>
#define  eslMSAWEIGHT_MAXFRAG     5000     // default <maxfrag>
#define  eslMSAWEIGHT_RNGSEED     42       // default <seed>

/* Exclusive settings for the sequence preference rule in the %id filter
 * (values for the <filterpref> field of ESL_MSAWEIGHT_CFG).
 */
#define  eslMSAWEIGHT_FILT_CONSCOVER 1
#define  eslMSAWEIGHT_FILT_RANDOM    2
#define  eslMSAWEIGHT_FILT_ORIGORDER 3
43 
44 
/* ESL_MSAWEIGHT_DAT
 * Optional data collected from PB weighting.
 *
 * Records which seed was used, how the consensus columns were determined,
 * which columns they are, and how many sequence fragments were seen.
 */
typedef struct {
  uint64_t seed;         // RNG seed used. (if cfg->seed is 0, random seed is chosen, and this is it.)

  /* How consensus columns were determined: */
  int  cons_by_rf;       // TRUE if consensus columns were determined using RF annotation
  int  cons_by_sample;   //   ... or by using a subsample of sequences
  int  cons_by_all;      //   ... or by using all sequences
  int  cons_allcols;     //   ... or (if all else fails) by using all columns
  int  rejected_sample;  // TRUE if we tried sampling but rejected it (too many fragments)

  /* The consensus columns themselves: */
  int  ncons;            // number of consensus columns
  int *conscols;         // list of column indices (1..alen) defined as consensus

  /* Fragment counts: */
  int  all_nfrag;        // number of fragments defined when counting all sequences
  int  samp_nfrag;       // if <cons_by_sample>, number of fragments defined in subsample
} ESL_MSAWEIGHT_DAT;
63 
64 
65 extern int esl_msaweight_PB(ESL_MSA *msa);
66 extern int esl_msaweight_PB_adv(const ESL_MSAWEIGHT_CFG *cfg, ESL_MSA *msa, ESL_MSAWEIGHT_DAT *dat);
67 
68 extern ESL_MSAWEIGHT_CFG *esl_msaweight_cfg_Create(void);
69 extern void               esl_msaweight_cfg_Destroy(ESL_MSAWEIGHT_CFG *cfg);
70 extern ESL_MSAWEIGHT_DAT *esl_msaweight_dat_Create(void);
71 extern int                esl_msaweight_dat_Reuse  (ESL_MSAWEIGHT_DAT *dat);
72 extern void               esl_msaweight_dat_Destroy(ESL_MSAWEIGHT_DAT *dat);
73 
74 extern int esl_msaweight_GSC(ESL_MSA *msa);
75 extern int esl_msaweight_BLOSUM(ESL_MSA *msa, double maxid);
76 
77 extern int esl_msaweight_IDFilter(const ESL_MSA *msa, double maxid, ESL_MSA **ret_newmsa);
78 extern int esl_msaweight_IDFilter_adv(const ESL_MSAWEIGHT_CFG *cfg, const ESL_MSA *msa, double maxid, ESL_MSA **ret_newmsa);
79 
80 
81 #endif /*eslMSAWEIGHT_INCLUDED*/
82