1 /* 2 * entry.h 3 * 4 * Created on: Dec 12, 2012 5 * Author: amarcketta 6 */ 7 #ifndef ENTRY_H_ 8 #define ENTRY_H_ 9 10 #include <set> 11 #include <deque> 12 #include <vector> 13 #include <cassert> 14 15 #include "header.h" 16 #include "bgzf.h" 17 #include "output_log.h" 18 #include "parameters.h" 19 20 using namespace std; 21 extern output_log LOG; 22 23 class entry 24 { 25 public: ~entry()26 virtual ~entry() {}; 27 unsigned int N_indv; 28 bool passed_filters; 29 header entry_header; 30 vector<bool> include_indv; 31 vector<bool> include_genotype; 32 33 virtual void parse_basic_entry(bool parse_ALT=false, bool parse_FILTER=false, bool parse_INFO=false) = 0; 34 virtual void parse_full_entry(bool parse_FORMAT=true) = 0; 35 virtual void parse_genotype_entry(unsigned int indv, bool GT=false, bool GQ=false, bool DP=false, bool FT=false) = 0; 36 virtual void parse_genotype_entries(bool GT=false, bool GQ=false, bool DP=false, bool FT=false) = 0; 37 38 virtual void reset(const vector<char> &data_line) = 0; 39 int apply_filters(const parameters ¶ms); 40 41 void filter_sites(const set<string> &snps_to_keep, const string &snps_to_keep_file, const string &snps_to_exclude_file, bool keep_then_exclude = false); 42 void filter_sites_to_keep(const set<string> &snps_to_keep, const string &snps_to_keep_file); 43 void filter_sites_to_exclude(const string &snps_to_exclude_file); 44 void filter_sites_by_position(const string &chr, int start_pos, int end_pos); 45 void filter_sites_by_positions(const string &positions_file, const string &exclude_positions_file); 46 void filter_sites_by_overlap_positions(const string &positions_overlap_file, const string &exclude_positions_overlap_file); 47 void filter_sites_by_chromosome(const set<string> &chrs_to_keep, const set<string> &chrs_to_exclude); 48 void filter_sites_by_quality(double min_quality); 49 void filter_sites_by_mean_depth(double min_mean_depth, double max_mean_depth); 50 void filter_sites_by_frequency_and_call_rate(double min_maf, double max_maf, double min_non_ref_af, double max_non_ref_af, double min_non_ref_af_any, double max_non_ref_af_any, double min_site_call_rate); 51 void filter_sites_by_allele_type(bool keep_only_indels, bool remove_indels); 52 void filter_sites_by_allele_count(double min_mac, double max_mac, double min_non_ref_ac, double max_non_ref_ac, double min_non_ref_ac_any, double max_non_ref_ac_any, double max_missing_call_count); 53 void filter_sites_by_number_of_alleles(int min_alleles, int max_alleles); 54 void filter_sites_by_HWE_pvalue(double min_HWE_pvalue); 55 void filter_sites_by_BED_file(const string &bed_file, bool BED_exclude = false); 56 void filter_sites_by_mask(const string &mask_file, bool invert_mask = false, int min_kept_mask_value=0); 57 void filter_sites_by_filter_status(const set<string> &filter_flags_to_remove, const set<string> &filter_flags_to_keep, bool remove_all = false); 58 void filter_sites_by_phase(); 59 void filter_sites_by_thinning(int min_SNP_distance); 60 void filter_sites_by_INFO(const set<string> &flags_to_remove, const set<string> &flags_to_keep); 61 62 void filter_genotypes_by_quality_value(double min_genotype_quality); 63 void filter_genotypes_by_depth_range(int min_depth, int max_depth); 64 void filter_genotypes_by_filter_flag(const set<string> &filter_flags_to_remove, bool remove_all = false); 65 66 string get_CHROM() const; 67 void get_CHROM(string &out) const; 68 int get_POS() const; 69 string get_ID() const; 70 string get_REF() const; 71 string get_ALT() const; 72 string get_ALT_allele(int allele_num) const; 73 void get_allele(int allele_num, string &out) const; 74 string get_allele(int allele_num) const; 75 void get_alleles_vector(vector<string> &out) const; 76 string get_FILTER() const; 77 void get_FILTER_vector(vector<string> &out) const; 78 double get_QUAL() const; 79 string get_INFO(const set<string> &INFO_to_keep, bool keep_all_INFO=false) const; 80 string get_INFO_value(const string &key) const; 81 vector<string> get_INFO_values(const string &key) const; 82 string get_FORMAT() const; 83 84 void get_indv_GENOTYPE_ids(unsigned int indv, pair<int, int> &out) const; 85 void get_indv_GENOTYPE_strings(unsigned int indv, pair<string, string> &out) const; 86 char get_indv_PHASE(unsigned int indv) const; 87 double get_indv_GQUALITY(unsigned int indv) const; 88 int get_indv_DEPTH(unsigned int indv) const; 89 void get_indv_GFILTER(unsigned int indv, string &out) const; 90 void get_indv_GFILTER_vector(unsigned int indv, vector<string> &out) const; 91 int get_indv_ploidy(unsigned int indv) const; 92 93 bool is_SNP() const; 94 bool is_biallelic_SNP() const; 95 bool is_diploid() const; 96 virtual void read_indv_generic_entry(unsigned int indv, const string &FORMAT_id, string &out) = 0; 97 bool FORMAT_id_exists(const string &FORMAT_id); 98 99 void get_allele_counts(vector<int> &out, unsigned int &N_non_missing_chr_out) const; 100 void get_allele_counts(vector<int> &out, unsigned int &N_non_missing_chr_out, const vector<bool> &include_indv, const vector<bool> &include_genotype) const; 101 void get_genotype_counts(unsigned int &out_N_hom1, unsigned int &out_N_het, unsigned int &out_N_hom2) const; 102 void get_genotype_counts(const vector<bool> &include_indv, const vector<bool> &include_genotype, unsigned int &out_N_hom1, unsigned int &out_N_het, unsigned int &out_N_hom2) const; 103 void get_multiple_genotype_counts(const vector<bool> &include_indv, const vector<bool> &include_genotype, vector<unsigned int> &out_N_hom, vector<unsigned int> &out_N_het) const; 104 unsigned int get_N_alleles() const; 105 unsigned int get_N_chr() const; 106 107 void get_POS_binary(vector<char> &out) const; 108 void get_ID_binary(vector<char> &out); 109 void get_rlen(vector<char> &out) const; 110 void get_QUAL_binary(vector<char> &out) const; 111 void get_n_allele_info(vector<char> &out) const; 112 void get_n_fmt_sample(vector<char> &out) const; 113 void get_ALLELES_binary(vector<char> &out); 114 vector<pair<string, string> > get_INFO_vector(const set<string> &INFO_to_keep, bool keep_all_INFO=false); 115 void get_FORMAT_binary(vector<char> &out) const; 116 117 string get_typed_string( unsigned int * line_position, const vector<char>& line ); 118 void get_type(unsigned int * line_position, const vector<char>& line, unsigned int &type, unsigned int &size); 119 vector<int> get_int_vector(unsigned int * line_position, const vector<char>& line); 120 int get_typed_int(unsigned int * line_position, const vector<char>& line, unsigned int &type, unsigned int &size); 121 void get_number(uint32_t &out, unsigned int * line_position, const vector<char>& line); 122 123 void make_typed_string(vector<char> &out, const string &in, bool typed); 124 void make_typed_int(vector<char> &out, const int &in, bool typed); 125 void make_int(vector<char> &out, const int &in, int type); 126 void make_typed_int_vector(vector<char> &out, const vector<string> &in, int number = -1); 127 void make_typed_int_vector(vector<char> &out, const string &in, int number = -1); 128 void make_typed_int_vector(vector<char> &out, const vector<int> &in); 129 void make_typed_float_vector(vector<char> &out, const string &in, int number = -1); 130 void make_typed_float_vector(vector<char> &out, const vector<string> &in, int number = -1); 131 void make_typed_string_vector(vector<char> &out, const vector<string> &in, int number = -1); 132 void make_typed_GT_vector(vector<char> &out, vector<string> &in); 133 void make_type_size(vector<char> &out, const unsigned int &type, const unsigned int &size); 134 void encode_genotype(vector<char> &out, string &in, int exp_size); 135 136 void skip_section(unsigned int *line_position, const vector<char> &line); 137 bool check_missing(unsigned int line_position, const unsigned int type, const vector<char> &line); 138 bool check_end(unsigned int line_position, const unsigned int type, const vector<char> &line); 139 void add_ALT_allele(const string &in); 140 141 virtual void print(ostream &out, const set<string> &INFO_to_keep, bool keep_all_INFO=false) = 0; 142 virtual void print_bcf(BGZF* out, const set<string> &INFO_to_keep, bool keep_all_INFO=false) = 0; 143 144 virtual void filter_genotypes_by_depth(int min_depth, int max_depth) = 0; 145 virtual void filter_genotypes_by_quality(double min_genotype_quality) = 0; 146 virtual void filter_genotypes_by_filter_status(const set<string> &filter_flags_to_remove, bool remove_all = false) = 0; 147 148 static void SNPHWE(int obs_hets, int obs_hom1, int obs_hom2, double &p_hwe, double &p_lo, double &p_hi); 149 150 static set<string> local_snps_to_keep; 151 static set<string> snps_to_exclude; 152 static vector< set<int > > keep_positions; 153 static vector< set<int > > exclude_positions; 154 static map<string,int> chr_to_idx; 155 static vector< deque<pair<int,int> > > lims; 156 static ifstream mask; 157 static string mask_chr; 158 static string mask_line; 159 static int mask_pos; 160 static string thin_chrom; 161 static int thin_pos; 162 163 protected: 164 istringstream data_stream; 165 vector<char> line; 166 167 bool basic_parsed; 168 bool fully_parsed; 169 bool parsed_ALT; 170 bool parsed_FILTER; 171 bool parsed_INFO; 172 bool parsed_FORMAT; 173 bool parsed_FORMAT_binary; 174 175 string CHROM; 176 int POS; 177 string ID; 178 string REF; 179 vector<string> ALT; 180 double QUAL; 181 vector<string> FILTER; 182 vector<pair<string, string> > INFO; 183 vector<string> FORMAT; 184 vector<char> FORMAT_binary; 185 int N_INFO_removed; 186 int N_FORMAT_removed; 187 188 vector< pair<int,int> > GENOTYPE; 189 vector<int> ploidy; 190 vector<char> PHASE; 191 vector<double> GQUALITY; 192 vector<int> DEPTH; 193 vector< vector<string> > GFILTER; 194 195 vector<bool> parsed_GT; 196 vector<bool> parsed_GQ; 197 vector<bool> parsed_DP; 198 vector<bool> parsed_FT; 199 200 map<string, unsigned int> FORMAT_to_idx; 201 int GT_idx; 202 int GQ_idx; 203 int DP_idx; 204 int FT_idx; 205 206 void set_indv_DEPTH(unsigned int indv, int in); 207 vector<unsigned int> FORMAT_positions, FORMAT_types, FORMAT_sizes, FORMAT_skip, FORMAT_keys; 208 }; 209 210 #endif /* ENTRY_H_ */ 211