1 /* The MIT License 2 3 Copyright (c) 2013 Adrian Tan <atks@umich.edu> 4 5 Permission is hereby granted, free of charge, to any person obtaining a copy 6 of this software and associated documentation files (the "Software"), to deal 7 in the Software without restriction, including without limitation the rights 8 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 copies of the Software, and to permit persons to whom the Software is 10 furnished to do so, subject to the following conditions: 11 12 The above copyright notice and this permission notice shall be included in 13 all copies or substantial portions of the Software. 14 15 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 THE SOFTWARE. 22 */ 23 24 #ifndef FILTER_H 25 #define FILTER_H 26 27 #include <algorithm> 28 #include <cctype> 29 #include "hts_utils.h" 30 #include "variant.h" 31 #include "pregex.h" 32 33 //TYPES 34 //this is defined from the 7th to 13th bit by setting the bit 35 #define VT_LOGIC_OP 2048 //0x1000 36 #define VT_MATH_CMP 4096 //0x2000 37 #define VT_MATH_OP 8192 //0x4000 38 #define VT_BCF_OP 16384 //0x8000 39 40 #define VT_BOOL 64 //0x0040 41 #define VT_INT 128 //0x0080 42 #define VT_FLT 256 //0x0100 43 #define VT_STR 512 //0x0200 44 #define VT_FLG 1024 //0x0400 45 46 //common unary and binary ops (ordered by precedence level) 47 //this is identified over the first 6 bits which gives 64 possible operations 48 #define VT_NOT (0|VT_LOGIC_OP|VT_BOOL) 49 #define VT_AND (1|VT_LOGIC_OP|VT_BOOL) 50 #define VT_OR (2|VT_LOGIC_OP|VT_BOOL) 51 52 #define VT_EQ (3|VT_MATH_CMP|VT_BOOL) 53 #define VT_NE (4|VT_MATH_CMP|VT_BOOL) 54 #define VT_LT (5|VT_MATH_CMP|VT_BOOL) 55 #define VT_LE (6|VT_MATH_CMP|VT_BOOL) 56 #define VT_GT (7|VT_MATH_CMP|VT_BOOL) 57 #define VT_GE (8|VT_MATH_CMP|VT_BOOL) 58 #define VT_MATCH (9|VT_MATH_CMP|VT_BOOL) 59 #define VT_NO_MATCH (10|VT_MATH_CMP|VT_BOOL) 60 #define VT_ADD (11|VT_MATH_OP|VT_FLT) 61 #define VT_SUB (12|VT_MATH_OP|VT_FLT) 62 #define VT_MUL (13|VT_MATH_OP|VT_FLT) 63 #define VT_DIV (14|VT_MATH_OP|VT_FLT) 64 #define VT_BIT_AND (15|VT_INT|VT_MATH_OP|VT_BOOL) 65 #define VT_BIT_OR (16|VT_INT|VT_MATH_OP|VT_BOOL) 66 67 //unary ops (data getters for vcf) 68 #define VT_VARIANT_TYPE (33|VT_INT|VT_BCF_OP|VT_BOOL) 69 #define VT_VARIANT_DLEN (34|VT_INT|VT_BCF_OP) 70 #define VT_VARIANT_LEN (35|VT_INT|VT_BCF_OP) 71 #define VT_VARIANT_CONTAINS_N (36|VT_BCF_OP|VT_BOOL) 72 #define VT_N_ALLELE (37|VT_INT|VT_BCF_OP) 73 #define VT_QUAL (38|VT_BCF_OP|VT_FLT) 74 #define VT_FILTER (39|VT_BCF_OP|VT_BOOL) 75 #define VT_N_FILTER (40|VT_INT|VT_BCF_OP) 76 #define VT_INFO (41|VT_BCF_OP) 77 #define VT_REF_COL (42|VT_BCF_OP|VT_STR) 78 #define VT_ALT (43|VT_BCF_OP|VT_STR) 79 80 //problems will arise once you pass 63. 81 #define VT_UNKNOWN -1 82 83 /** 84 * Class for filtering VCF records. 85 */ 86 class Node 87 { 88 public: 89 90 Node* parent; 91 Node* left; 92 Node* right; 93 94 //VT_LOGIC_OP 2048 95 //VT_MATH_CMP 4096 96 //VT_MATH_OP 8192 97 //VT_BCF_OP 16384 98 // 99 //VT_BOOL 64 100 //VT_INT 128 101 //VT_FLT 256 102 //VT_STR 512 103 //#define VT_FLG 1024 104 int32_t type; // data type 105 //BCF_VL_FIXED 0 - implemented this 106 //BCF_VL_VAR 1 todo: implement? 107 //BCF_VL_A 2 todo: implement 108 //BCF_VL_G 3 todo: implement 109 //BCF_VL_R 4 todo: implement 110 int32_t var_length; //variable length 111 int32_t number; //actual length 112 kstring_t tag; //store the INFO tag of a BCF type 113 int32_t index; //store index value of interest 114 115 bool value_exists; // if value exists 116 117 kstring_t s; // string value 118 bool b; // boolean value 119 int32_t i; // integer value 120 float f; // float value 121 122 //for cases of fix multiple values 123 std::vector<std::string> svec; 124 std::vector<bool> bvec; 125 std::vector<int32_t> ivec; 126 std::vector<float> fvec; 127 128 PERLregex pregex; 129 bool regex_set; 130 131 /** 132 * Constructor. 133 */ 134 Node(); 135 136 /** 137 * Constructor with type initlialization. 138 */ 139 Node(int32_t type); 140 141 /** 142 * Evaluates the actions for this node. 143 */ 144 void evaluate(bcf_hdr_t *h, bcf1_t *v, Variant *variant, bool debug=false); 145 146 /** 147 * Converts type to string. 148 */ 149 std::string type2string(int32_t type); 150 }; 151 152 /** 153 * Filter for VCF records. 154 */ 155 class Filter 156 { 157 public: 158 159 //filter expression 160 Node* tree; 161 162 //useful pointers for applying the filter to a vcf record 163 bcf_hdr_t *h; 164 bcf1_t *v; 165 Variant *variant; 166 167 /** 168 * Constructor. 169 */ 170 Filter(); 171 172 /** 173 * Constructor with expression initialization. 174 */ 175 Filter(std::string exp); 176 177 /** 178 * Parses filter expression. 179 */ 180 void parse(const char* exp, bool debug=false); 181 182 /** 183 * Applies filter to vcf record. 184 */ 185 bool apply(bcf_hdr_t *h, bcf1_t *v, Variant *variant, bool debug=false); 186 187 /** 188 * Attempts to simplify the expression tree by collapsing nodes that can be precomputed. 189 */ 190 void simplify(); 191 192 /** 193 * Resets filter. 194 */ 195 void reset(); 196 197 private: 198 199 /** 200 * Recursive call for parse. 201 */ 202 void parse(const char* exp, int32_t len, Node * node, bool debug=false); 203 204 /** 205 * Checks if exp is a literal. 206 */ 207 bool is_literal(const char* exp, int32_t len, bool debug=false); 208 209 /** 210 * Checks if exp is a unary op. 211 */ 212 bool is_unary_op(const char* exp, int32_t len, bool debug=false); 213 214 /** 215 * Checks is expression is bracketed. 216 */ 217 bool is_bracketed_expression(const char* exp, int32_t len, bool debug); 218 219 /** 220 * Detect index width. 221 * e.g. AC[1] => 3 222 * e.g. EVIDENCE[12] => 4 223 * 224 * Populated index with the index queried. 225 */ 226 int32_t get_index_width(const char *exp, int32_t n, int32_t *index); 227 228 /** 229 * Parse literals. 230 */ 231 void parse_literal(const char* exp, int32_t len, Node * node, bool debug=false); 232 233 /** 234 * Trim brackets from an expression. 235 */ 236 void trim_brackets(const char* &exp, int32_t &len, bool debug=false); 237 238 /** 239 * Moves r to the closing bracket if this expression starts with an open bracket. 240 * Returns -1 if end of r else 0. 241 */ 242 int32_t fwd_to_closing_bracket(const char* &r, int32_t &len); 243 244 /** 245 * Returns -1 if no operator found. Updates oplen to be the length of the operator observed. 246 */ 247 int32_t peek_op(const char* &r, int32_t len, int32_t &oplen, bool debug); 248 249 /** 250 * Recursive call for apply. 251 */ 252 void apply(Node* node, bool debug=false); 253 254 /** 255 * Help message on filter expressions. 256 */ 257 void print_filter_help(); 258 }; 259 260 #endif