1 //
2 // Created by mad on 2/3/16.
3 //
4 
5 #ifndef MMSEQS_MSAFILTER_H
6 #define MMSEQS_MSAFILTER_H
7 
8 
9 #include <SubstitutionMatrix.h>
10 #include "MultipleAlignment.h"
11 
12 class MsaFilter {
13 
14 public:
15 
16     MsaFilter(int maxSeqLen, int maxSetSize, SubstitutionMatrix *m, int gapOpen, int gapExtend);
17 
18     ~MsaFilter();
19     /////////////////////////////////////////////////////////////////////////////////////
20     // Select set of representative sequences in the multiple sequence alignment
21     // Filter criteria:
22     //   * Remove sequences with coverage of query less than "coverage" percent
23     //   * Remove sequences with sequence identity to query of less than "qid" percent
24     //   * If Ndiff==0, remove sequences with seq. identity larger than seqid2(=max_seqid) percent
25     //   * If Ndiff>0, remove sequences with minimum-sequence-identity filter of between seqid1
26     //     and seqid2 (%), where the minimum seqid threshold is determined such that,
27     //     in all column blocks of at least WMIN=25 residues, at least Ndiff sequences are left.
28     //     This ensures that in multi-domain proteins sequences covering one domain are not
29     //     removed completely because sequences covering other domains are more diverse.
30     //
31     // Allways the shorter of two compared sequences is removed (=> sort sequences by length first).
32     // Please note: sequence identity of sequence x with y when filtering x is calculated as
33     // number of residues in sequence x that are identical to an aligned residue in y / number of residues in x
34     // Example: two sequences x and y are 100% identical in their overlapping region but one overlaps by 10% of its
35     // length on the left and the other by 20% on the right. Then x has 10% seq.id with y and y has 20% seq.id. with x.
36     /////////////////////////////////////////////////////////////////////////////////////
37     size_t filter(MultipleAlignment::MSAResult& msa, std::vector<Matcher::result_t> &alnResults, int coverage, int qid, float qsc, int max_seqid, int Ndiff);
38     size_t filter(const int N_in, const int L, const int coverage, const int qid,
39                   const float qsc, const int max_seqid, int Ndiff, const char **X, const bool shuffleMsa);
40 
41     void getKept(bool *offsets, size_t setSize);
42 
43     const float PLTY_GAPOPEN; // for -qsc option (filter for min similarity to query): 6 bits to open gap
44     const float PLTY_GAPEXTD; // for -qsc option (filter for min similarity to query): 1 bit to extend gap
45 
46     void pruneAlignment(char ** msaSequence, int N_in, int L);
47 
48 
49 private:
50     // shuffles the filtered sequences to the back of the array, the unfiltered ones remain in the front
51     void shuffleSequences(const char ** X, size_t setSize);
52 
53     // prune sequence based on score
54     int prune(int start, int end, float b, char * query, char *target);
55 
56     void increaseSetSize(int newSetSize);
57 
58     BaseMatrix *m;
59 
60     int maxSeqLen;
61     int maxSetSize;
62     int gapOpen;
63     int gapExtend;
64 
65     // position-dependent maximum-sequence-identity threshold for filtering? (variable used in former version was idmax)
66     int *Nmax;
67     // minimum value of idmax[i-WFIL,i+WFIL]
68     int *idmaxwin;
69     // N[i] number of already accepted sequences at position i
70     int *N;
71     // in[k]=1: seq k has been accepted; in[k]=0: seq k has not yet been accepted at current seqid
72     char *in;
73     // inkk[k]=1 iff in[ksort[k]]=1 else 0;
74     char *inkk;
75     // maximum-sequence-identity threshold used in previous round of filtering (with lower seqid)
76     int *seqid_prev;
77     int *nres;
78     // first residue in sequence k
79     int* first;
80     // last  residue in sequence k
81     int* last;
82     // index for sorting sequences: X[ksort[k]]
83     int* ksort;
84     // display[k]=1 if sequence will be displayed in output alignments; 0 otherwise (first=0)
85     char* display;
86     // keep[k]=1 if sequence is included in amino acid frequencies; 0 otherwise (first=0)
87     char *keep;
88 };
89 
90 
91 #endif //MMSEQS_MSAFILTER_H
92