1 // 2 // Created by mad on 2/3/16. 3 // 4 5 #ifndef MMSEQS_MSAFILTER_H 6 #define MMSEQS_MSAFILTER_H 7 8 9 #include <SubstitutionMatrix.h> 10 #include "MultipleAlignment.h" 11 12 class MsaFilter { 13 14 public: 15 16 MsaFilter(int maxSeqLen, int maxSetSize, SubstitutionMatrix *m, int gapOpen, int gapExtend); 17 18 ~MsaFilter(); 19 ///////////////////////////////////////////////////////////////////////////////////// 20 // Select set of representative sequences in the multiple sequence alignment 21 // Filter criteria: 22 // * Remove sequences with coverage of query less than "coverage" percent 23 // * Remove sequences with sequence identity to query of less than "qid" percent 24 // * If Ndiff==0, remove sequences with seq. identity larger than seqid2(=max_seqid) percent 25 // * If Ndiff>0, remove sequences with minimum-sequence-identity filter of between seqid1 26 // and seqid2 (%), where the minimum seqid threshold is determined such that, 27 // in all column blocks of at least WMIN=25 residues, at least Ndiff sequences are left. 28 // This ensures that in multi-domain proteins sequences covering one domain are not 29 // removed completely because sequences covering other domains are more diverse. 30 // 31 // Allways the shorter of two compared sequences is removed (=> sort sequences by length first). 32 // Please note: sequence identity of sequence x with y when filtering x is calculated as 33 // number of residues in sequence x that are identical to an aligned residue in y / number of residues in x 34 // Example: two sequences x and y are 100% identical in their overlapping region but one overlaps by 10% of its 35 // length on the left and the other by 20% on the right. Then x has 10% seq.id with y and y has 20% seq.id. with x. 36 ///////////////////////////////////////////////////////////////////////////////////// 37 size_t filter(MultipleAlignment::MSAResult& msa, std::vector<Matcher::result_t> &alnResults, int coverage, int qid, float qsc, int max_seqid, int Ndiff); 38 size_t filter(const int N_in, const int L, const int coverage, const int qid, 39 const float qsc, const int max_seqid, int Ndiff, const char **X, const bool shuffleMsa); 40 41 void getKept(bool *offsets, size_t setSize); 42 43 const float PLTY_GAPOPEN; // for -qsc option (filter for min similarity to query): 6 bits to open gap 44 const float PLTY_GAPEXTD; // for -qsc option (filter for min similarity to query): 1 bit to extend gap 45 46 void pruneAlignment(char ** msaSequence, int N_in, int L); 47 48 49 private: 50 // shuffles the filtered sequences to the back of the array, the unfiltered ones remain in the front 51 void shuffleSequences(const char ** X, size_t setSize); 52 53 // prune sequence based on score 54 int prune(int start, int end, float b, char * query, char *target); 55 56 void increaseSetSize(int newSetSize); 57 58 BaseMatrix *m; 59 60 int maxSeqLen; 61 int maxSetSize; 62 int gapOpen; 63 int gapExtend; 64 65 // position-dependent maximum-sequence-identity threshold for filtering? (variable used in former version was idmax) 66 int *Nmax; 67 // minimum value of idmax[i-WFIL,i+WFIL] 68 int *idmaxwin; 69 // N[i] number of already accepted sequences at position i 70 int *N; 71 // in[k]=1: seq k has been accepted; in[k]=0: seq k has not yet been accepted at current seqid 72 char *in; 73 // inkk[k]=1 iff in[ksort[k]]=1 else 0; 74 char *inkk; 75 // maximum-sequence-identity threshold used in previous round of filtering (with lower seqid) 76 int *seqid_prev; 77 int *nres; 78 // first residue in sequence k 79 int* first; 80 // last residue in sequence k 81 int* last; 82 // index for sorting sequences: X[ksort[k]] 83 int* ksort; 84 // display[k]=1 if sequence will be displayed in output alignments; 0 otherwise (first=0) 85 char* display; 86 // keep[k]=1 if sequence is included in amino acid frequencies; 0 otherwise (first=0) 87 char *keep; 88 }; 89 90 91 #endif //MMSEQS_MSAFILTER_H 92