1 /* The MIT License
2 
3    Copyright (c) 2013 Adrian Tan <atks@umich.edu>
4 
5    Permission is hereby granted, free of charge, to any person obtaining a copy
6    of this software and associated documentation files (the "Software"), to deal
7    in the Software without restriction, including without limitation the rights
8    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9    copies of the Software, and to permit persons to whom the Software is
10    furnished to do so, subject to the following conditions:
11 
12    The above copyright notice and this permission notice shall be included in
13    all copies or substantial portions of the Software.
14 
15    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21    THE SOFTWARE.
22 */
23 
24 #ifndef FILTER_H
25 #define FILTER_H
26 
27 #include <algorithm>
28 #include <cctype>
29 #include "hts_utils.h"
30 #include "variant.h"
31 #include "pregex.h"
32 
33 //TYPES
34 //this is defined from the 7th to 13th bit by setting the bit
35 #define VT_LOGIC_OP  2048   //0x1000
36 #define VT_MATH_CMP  4096   //0x2000
37 #define VT_MATH_OP   8192   //0x4000
38 #define VT_BCF_OP    16384  //0x8000
39 
40 #define VT_BOOL     64      //0x0040
41 #define VT_INT      128     //0x0080
42 #define VT_FLT      256     //0x0100
43 #define VT_STR      512     //0x0200
44 #define VT_FLG      1024    //0x0400
45 
46 //common unary and binary ops (ordered by precedence level)
47 //this is identified over the first 6 bits which gives 64 possible operations
48 #define VT_NOT      (0|VT_LOGIC_OP|VT_BOOL)
49 #define VT_AND      (1|VT_LOGIC_OP|VT_BOOL)
50 #define VT_OR       (2|VT_LOGIC_OP|VT_BOOL)
51 
52 #define VT_EQ       (3|VT_MATH_CMP|VT_BOOL)
53 #define VT_NE       (4|VT_MATH_CMP|VT_BOOL)
54 #define VT_LT       (5|VT_MATH_CMP|VT_BOOL)
55 #define VT_LE       (6|VT_MATH_CMP|VT_BOOL)
56 #define VT_GT       (7|VT_MATH_CMP|VT_BOOL)
57 #define VT_GE       (8|VT_MATH_CMP|VT_BOOL)
58 #define VT_MATCH    (9|VT_MATH_CMP|VT_BOOL)
59 #define VT_NO_MATCH (10|VT_MATH_CMP|VT_BOOL)
60 #define VT_ADD      (11|VT_MATH_OP|VT_FLT)
61 #define VT_SUB      (12|VT_MATH_OP|VT_FLT)
62 #define VT_MUL      (13|VT_MATH_OP|VT_FLT)
63 #define VT_DIV      (14|VT_MATH_OP|VT_FLT)
64 #define VT_BIT_AND  (15|VT_INT|VT_MATH_OP|VT_BOOL)
65 #define VT_BIT_OR   (16|VT_INT|VT_MATH_OP|VT_BOOL)
66 
67 //unary ops (data getters for vcf)
68 #define VT_VARIANT_TYPE        (33|VT_INT|VT_BCF_OP|VT_BOOL)
69 #define VT_VARIANT_DLEN        (34|VT_INT|VT_BCF_OP)
70 #define VT_VARIANT_LEN         (35|VT_INT|VT_BCF_OP)
71 #define VT_VARIANT_CONTAINS_N  (36|VT_BCF_OP|VT_BOOL)
72 #define VT_N_ALLELE            (37|VT_INT|VT_BCF_OP)
73 #define VT_QUAL                (38|VT_BCF_OP|VT_FLT)
74 #define VT_FILTER              (39|VT_BCF_OP|VT_BOOL)
75 #define VT_N_FILTER            (40|VT_INT|VT_BCF_OP)
76 #define VT_INFO                (41|VT_BCF_OP)
77 #define VT_REF_COL             (42|VT_BCF_OP|VT_STR)
78 #define VT_ALT                 (43|VT_BCF_OP|VT_STR)
79 
80 //problems will arise once you pass 63.
81 #define VT_UNKNOWN -1
82 
83 /**
84  * Class for filtering VCF records.
85  */
86 class Node
87 {
88     public:
89 
90     Node* parent;
91     Node* left;
92     Node* right;
93 
94 //VT_LOGIC_OP  2048
95 //VT_MATH_CMP  4096
96 //VT_MATH_OP   8192
97 //VT_BCF_OP    16384
98 //
99 //VT_BOOL     64
100 //VT_INT      128
101 //VT_FLT      256
102 //VT_STR      512
103 //#define VT_FLG      1024
104     int32_t type;  // data type
105 //BCF_VL_FIXED 0  - implemented this
106 //BCF_VL_VAR   1  todo: implement?
107 //BCF_VL_A     2  todo: implement
108 //BCF_VL_G     3  todo: implement
109 //BCF_VL_R     4  todo: implement
110     int32_t var_length;     //variable length
111     int32_t number; //actual length
112     kstring_t tag;  //store the INFO tag of a BCF type
113     int32_t index;  //store index value of interest
114 
115     bool value_exists; // if value exists
116 
117     kstring_t s;   // string value
118     bool b;        // boolean value
119     int32_t i;     // integer value
120     float f;       // float value
121 
122     //for cases of fix multiple values
123     std::vector<std::string> svec;
124     std::vector<bool> bvec;
125     std::vector<int32_t> ivec;
126     std::vector<float> fvec;
127 
128     PERLregex pregex;
129     bool regex_set;
130 
131     /**
132      * Constructor.
133      */
134     Node();
135 
136     /**
137      * Constructor with type initlialization.
138      */
139     Node(int32_t type);
140 
141     /**
142      * Evaluates the actions for this node.
143      */
144     void evaluate(bcf_hdr_t *h, bcf1_t *v, Variant *variant, bool debug=false);
145 
146     /**
147      * Converts type to string.
148      */
149     std::string type2string(int32_t type);
150 };
151 
152 /**
153  * Filter for VCF records.
154  */
155 class Filter
156 {
157     public:
158 
159     //filter expression
160     Node* tree;
161 
162     //useful pointers for applying the filter to a vcf record
163     bcf_hdr_t *h;
164     bcf1_t *v;
165     Variant *variant;
166 
167     /**
168      * Constructor.
169      */
170     Filter();
171 
172     /**
173      * Constructor with expression initialization.
174      */
175     Filter(std::string exp);
176 
177     /**
178      * Parses filter expression.
179      */
180     void parse(const char* exp, bool debug=false);
181 
182     /**
183      * Applies filter to vcf record.
184      */
185     bool apply(bcf_hdr_t *h, bcf1_t *v, Variant *variant, bool debug=false);
186 
187     /**
188      * Attempts to simplify the expression tree by collapsing nodes that can be precomputed.
189      */
190     void simplify();
191 
192     /**
193      * Resets filter.
194      */
195     void reset();
196 
197     private:
198 
199     /**
200      * Recursive call for parse.
201      */
202     void parse(const char* exp, int32_t len, Node * node, bool debug=false);
203 
204     /**
205      * Checks if exp is a literal.
206      */
207     bool is_literal(const char* exp, int32_t len, bool debug=false);
208 
209     /**
210      * Checks if exp is a unary op.
211      */
212     bool is_unary_op(const char* exp, int32_t len, bool debug=false);
213 
214     /**
215      * Checks is expression is bracketed.
216      */
217     bool is_bracketed_expression(const char* exp, int32_t len, bool debug);
218 
219     /**
220      * Detect index width.
221      * e.g. AC[1] => 3
222      * e.g. EVIDENCE[12] => 4
223      *
224      * Populated index with the index queried.
225      */
226     int32_t get_index_width(const char *exp, int32_t n, int32_t *index);
227 
228     /**
229      * Parse literals.
230      */
231     void parse_literal(const char* exp, int32_t len, Node * node, bool debug=false);
232 
233     /**
234      * Trim brackets from an expression.
235      */
236     void trim_brackets(const char* &exp, int32_t &len, bool debug=false);
237 
238     /**
239      * Moves r to the closing bracket if this expression starts with an open bracket.
240      * Returns -1 if end of r else 0.
241      */
242     int32_t fwd_to_closing_bracket(const char* &r, int32_t &len);
243 
244     /**
245      * Returns -1 if no operator found. Updates oplen to be the length of the operator observed.
246      */
247     int32_t peek_op(const char* &r, int32_t len, int32_t &oplen, bool debug);
248 
249     /**
250      * Recursive call for apply.
251      */
252     void apply(Node* node, bool debug=false);
253 
254     /**
255      * Help message on filter expressions.
256      */
257     void print_filter_help();
258 };
259 
260 #endif