1 /* Sequence weighting algorithms. 2 * 3 * SRE, Sun Nov 5 09:11:13 2006 [Janelia] 4 */ 5 #ifndef eslMSAWEIGHT_INCLUDED 6 #define eslMSAWEIGHT_INCLUDED 7 #include "esl_config.h" 8 9 #include "esl_msa.h" 10 #include "esl_rand64.h" 11 12 /* ESL_MSAWEIGHT_CFG 13 * optional configuration/customization of PB weighting and %id filtering. 14 */ 15 typedef struct { 16 float fragthresh; // seq is a fragment if (length from 1st to last aligned residue)/alen < fragthresh (i.e. span < minspan) 17 float symfrac; // col is consensus if nres / (nres+ngap) >= symfrac 18 int ignore_rf; // TRUE to ignore RF line (if present), always determine our own consensus 19 int allow_samp; // TRUE to allow consensus determination by subsampling (if nseq > sampthresh) 20 int sampthresh; // if nseq > sampthresh, try to determine consensus on a sample, not all nseq 21 int nsamp; // # of seqs in sample, if determining consensus by sample 22 int maxfrag; // if sample has > maxfrag fragments in it, abort determining consensus by sample; use all nseq instead 23 uint64_t seed; // RNG seed 24 25 /* Only affects %id filtering: */ 26 int filterpref; // eslMSAWEIGHT_FILT_CONSCOVER | eslMSAWEIGHT_FILT_RANDOM | eslMSAWEIGHT_FILT_ORIGORDER 27 } ESL_MSAWEIGHT_CFG; 28 29 /* Default parameters for ESL_MSAWEIGHT_CFG */ 30 #define eslMSAWEIGHT_FRAGTHRESH 0.5 31 #define eslMSAWEIGHT_SYMFRAC 0.5 32 #define eslMSAWEIGHT_IGNORE_RF FALSE 33 #define eslMSAWEIGHT_ALLOW_SAMP TRUE 34 #define eslMSAWEIGHT_SAMPTHRESH 50000 35 #define eslMSAWEIGHT_NSAMP 10000 36 #define eslMSAWEIGHT_MAXFRAG 5000 37 #define eslMSAWEIGHT_RNGSEED 42 38 39 /* Exclusive settings for seq preference rule in %id filter */ 40 #define eslMSAWEIGHT_FILT_CONSCOVER 1 41 #define eslMSAWEIGHT_FILT_RANDOM 2 42 #define eslMSAWEIGHT_FILT_ORIGORDER 3 43 44 45 /* ESL_MSAWEIGHT_DAT 46 * optional data collected from PB weighting 47 */ 48 typedef struct { 49 uint64_t seed; // RNG seed used. (if cfg->seed is 0, random seed is chosen, and this is it.) 50 51 int cons_by_rf; // TRUE if consensus columns were determined using RF annotation 52 int cons_by_sample; // ... or by using a subsample of sequences 53 int cons_by_all; // ... or by using all sequences 54 int cons_allcols; // ... or (if all else fails) by using all columns 55 int rejected_sample; // TRUE if we tried sampling but rejected it (too many fragments) 56 57 int ncons; // number of consensus columns 58 int *conscols; // list of column indices (1..alen) defined as consensus 59 60 int all_nfrag; // number of fragments defined when counting all sequences 61 int samp_nfrag; // if <cons_by_sample>, number of fragments defined in subsample 62 } ESL_MSAWEIGHT_DAT; 63 64 65 extern int esl_msaweight_PB(ESL_MSA *msa); 66 extern int esl_msaweight_PB_adv(const ESL_MSAWEIGHT_CFG *cfg, ESL_MSA *msa, ESL_MSAWEIGHT_DAT *dat); 67 68 extern ESL_MSAWEIGHT_CFG *esl_msaweight_cfg_Create(void); 69 extern void esl_msaweight_cfg_Destroy(ESL_MSAWEIGHT_CFG *cfg); 70 extern ESL_MSAWEIGHT_DAT *esl_msaweight_dat_Create(void); 71 extern int esl_msaweight_dat_Reuse (ESL_MSAWEIGHT_DAT *dat); 72 extern void esl_msaweight_dat_Destroy(ESL_MSAWEIGHT_DAT *dat); 73 74 extern int esl_msaweight_GSC(ESL_MSA *msa); 75 extern int esl_msaweight_BLOSUM(ESL_MSA *msa, double maxid); 76 77 extern int esl_msaweight_IDFilter(const ESL_MSA *msa, double maxid, ESL_MSA **ret_newmsa); 78 extern int esl_msaweight_IDFilter_adv(const ESL_MSAWEIGHT_CFG *cfg, const ESL_MSA *msa, double maxid, ESL_MSA **ret_newmsa); 79 80 81 #endif /*eslMSAWEIGHT_INCLUDED*/ 82