1 /*
2  * entry.h
3  *
4  *  Created on: Dec 12, 2012
5  *      Author: amarcketta
6  */
7 #ifndef ENTRY_H_
8 #define ENTRY_H_
9 
10 #include <set>
11 #include <deque>
12 #include <vector>
13 #include <cassert>
14 
15 #include "header.h"
16 #include "bgzf.h"
17 #include "output_log.h"
18 #include "parameters.h"
19 
20 using namespace std;
21 extern output_log LOG;
22 
23 class entry
24 {
25 public:
~entry()26 	virtual ~entry() {};
27 	unsigned int N_indv;
28 	bool passed_filters;
29 	header entry_header;
30 	vector<bool> include_indv;
31 	vector<bool> include_genotype;
32 
33 	virtual void parse_basic_entry(bool parse_ALT=false, bool parse_FILTER=false, bool parse_INFO=false) = 0;
34 	virtual void parse_full_entry(bool parse_FORMAT=true) = 0;
35 	virtual void parse_genotype_entry(unsigned int indv, bool GT=false, bool GQ=false, bool DP=false, bool FT=false) = 0;
36 	virtual void parse_genotype_entries(bool GT=false, bool GQ=false, bool DP=false, bool FT=false) = 0;
37 
38 	virtual void reset(const vector<char> &data_line) = 0;
39 	int apply_filters(const parameters &params);
40 
41 	void filter_sites(const set<string> &snps_to_keep, const string &snps_to_keep_file, const string &snps_to_exclude_file, bool keep_then_exclude = false);
42 	void filter_sites_to_keep(const set<string> &snps_to_keep, const string &snps_to_keep_file);
43 	void filter_sites_to_exclude(const string &snps_to_exclude_file);
44 	void filter_sites_by_position(const string &chr, int start_pos, int end_pos);
45 	void filter_sites_by_positions(const string &positions_file, const string &exclude_positions_file);
46 	void filter_sites_by_overlap_positions(const string &positions_overlap_file, const string &exclude_positions_overlap_file);
47 	void filter_sites_by_chromosome(const set<string> &chrs_to_keep, const set<string> &chrs_to_exclude);
48 	void filter_sites_by_quality(double min_quality);
49 	void filter_sites_by_mean_depth(double min_mean_depth, double max_mean_depth);
50 	void filter_sites_by_frequency_and_call_rate(double min_maf, double max_maf, double min_non_ref_af, double max_non_ref_af, double min_non_ref_af_any, double max_non_ref_af_any, double min_site_call_rate);
51 	void filter_sites_by_allele_type(bool keep_only_indels, bool remove_indels);
52 	void filter_sites_by_allele_count(double min_mac, double max_mac, double min_non_ref_ac, double max_non_ref_ac, double min_non_ref_ac_any, double max_non_ref_ac_any, double max_missing_call_count);
53 	void filter_sites_by_number_of_alleles(int min_alleles, int max_alleles);
54 	void filter_sites_by_HWE_pvalue(double min_HWE_pvalue);
55 	void filter_sites_by_BED_file(const string &bed_file, bool BED_exclude = false);
56 	void filter_sites_by_mask(const string &mask_file, bool invert_mask = false, int min_kept_mask_value=0);
57 	void filter_sites_by_filter_status(const set<string> &filter_flags_to_remove, const set<string> &filter_flags_to_keep, bool remove_all = false);
58 	void filter_sites_by_phase();
59 	void filter_sites_by_thinning(int min_SNP_distance);
60 	void filter_sites_by_INFO(const set<string> &flags_to_remove, const set<string> &flags_to_keep);
61 
62 	void filter_genotypes_by_quality_value(double min_genotype_quality);
63 	void filter_genotypes_by_depth_range(int min_depth, int max_depth);
64 	void filter_genotypes_by_filter_flag(const set<string> &filter_flags_to_remove, bool remove_all = false);
65 
66 	string get_CHROM() const;
67 	void get_CHROM(string &out) const;
68 	int get_POS() const;
69 	string get_ID() const;
70 	string get_REF() const;
71 	string get_ALT() const;
72 	string get_ALT_allele(int allele_num) const;
73 	void get_allele(int allele_num, string &out) const;
74 	string get_allele(int allele_num) const;
75 	void get_alleles_vector(vector<string> &out) const;
76 	string get_FILTER() const;
77 	void get_FILTER_vector(vector<string> &out) const;
78 	double get_QUAL() const;
79 	string get_INFO(const set<string> &INFO_to_keep, bool keep_all_INFO=false) const;
80 	string get_INFO_value(const string &key) const;
81 	vector<string> get_INFO_values(const string &key) const;
82 	string get_FORMAT() const;
83 
84 	void get_indv_GENOTYPE_ids(unsigned int indv, pair<int, int> &out) const;
85 	void get_indv_GENOTYPE_strings(unsigned int indv, pair<string, string> &out) const;
86 	char get_indv_PHASE(unsigned int indv) const;
87 	double get_indv_GQUALITY(unsigned int indv) const;
88 	int get_indv_DEPTH(unsigned int indv) const;
89 	void get_indv_GFILTER(unsigned int indv, string &out) const;
90 	void get_indv_GFILTER_vector(unsigned int indv, vector<string> &out) const;
91 	int get_indv_ploidy(unsigned int indv) const;
92 
93 	bool is_SNP() const;
94 	bool is_biallelic_SNP() const;
95 	bool is_diploid() const;
96 	virtual void read_indv_generic_entry(unsigned int indv, const string &FORMAT_id, string &out) = 0;
97 	bool FORMAT_id_exists(const string &FORMAT_id);
98 
99 	void get_allele_counts(vector<int> &out, unsigned int &N_non_missing_chr_out) const;
100 	void get_allele_counts(vector<int> &out, unsigned int &N_non_missing_chr_out, const vector<bool> &include_indv, const vector<bool> &include_genotype) const;
101 	void get_genotype_counts(unsigned int &out_N_hom1, unsigned int &out_N_het, unsigned int &out_N_hom2) const;
102 	void get_genotype_counts(const vector<bool> &include_indv, const vector<bool> &include_genotype, unsigned int &out_N_hom1, unsigned int &out_N_het, unsigned int &out_N_hom2) const;
103 	void get_multiple_genotype_counts(const vector<bool> &include_indv, const vector<bool> &include_genotype, vector<unsigned int> &out_N_hom, vector<unsigned int> &out_N_het) const;
104 	unsigned int get_N_alleles() const;
105 	unsigned int get_N_chr() const;
106 
107 	void get_POS_binary(vector<char> &out) const;
108 	void get_ID_binary(vector<char> &out);
109 	void get_rlen(vector<char> &out) const;
110 	void get_QUAL_binary(vector<char> &out) const;
111 	void get_n_allele_info(vector<char> &out) const;
112 	void get_n_fmt_sample(vector<char> &out) const;
113 	void get_ALLELES_binary(vector<char> &out);
114 	vector<pair<string, string> > get_INFO_vector(const set<string> &INFO_to_keep, bool keep_all_INFO=false);
115 	void get_FORMAT_binary(vector<char> &out) const;
116 
117 	string get_typed_string( unsigned int * line_position, const vector<char>& line );
118 	void get_type(unsigned int * line_position, const vector<char>& line, unsigned int &type, unsigned int &size);
119 	vector<int> get_int_vector(unsigned int * line_position, const vector<char>& line);
120 	int get_typed_int(unsigned int * line_position, const vector<char>& line, unsigned int &type, unsigned int &size);
121 	void get_number(uint32_t &out, unsigned int * line_position, const vector<char>& line);
122 
123 	void make_typed_string(vector<char> &out, const string &in, bool typed);
124 	void make_typed_int(vector<char> &out, const int &in, bool typed);
125 	void make_int(vector<char> &out, const int &in, int type);
126 	void make_typed_int_vector(vector<char> &out, const vector<string> &in, int number = -1);
127 	void make_typed_int_vector(vector<char> &out, const string &in, int number = -1);
128 	void make_typed_int_vector(vector<char> &out, const vector<int> &in);
129 	void make_typed_float_vector(vector<char> &out, const string &in, int number = -1);
130 	void make_typed_float_vector(vector<char> &out, const vector<string> &in, int number = -1);
131 	void make_typed_string_vector(vector<char> &out, const vector<string> &in, int number = -1);
132 	void make_typed_GT_vector(vector<char> &out, vector<string> &in);
133 	void make_type_size(vector<char> &out, const unsigned int &type, const unsigned int &size);
134 	void encode_genotype(vector<char> &out, string &in, int exp_size);
135 
136 	void skip_section(unsigned int *line_position, const vector<char> &line);
137 	bool check_missing(unsigned int line_position, const unsigned int type, const vector<char> &line);
138 	bool check_end(unsigned int line_position, const unsigned int type, const vector<char> &line);
139 	void add_ALT_allele(const string &in);
140 
141 	virtual void print(ostream &out, const set<string> &INFO_to_keep, bool keep_all_INFO=false) = 0;
142 	virtual void print_bcf(BGZF* out, const set<string> &INFO_to_keep, bool keep_all_INFO=false) = 0;
143 
144 	virtual void filter_genotypes_by_depth(int min_depth, int max_depth) = 0;
145 	virtual void filter_genotypes_by_quality(double min_genotype_quality) = 0;
146 	virtual void filter_genotypes_by_filter_status(const set<string> &filter_flags_to_remove, bool remove_all = false) = 0;
147 
148 	static void SNPHWE(int obs_hets, int obs_hom1, int obs_hom2, double &p_hwe, double &p_lo, double &p_hi);
149 
150 	static set<string> local_snps_to_keep;
151 	static set<string> snps_to_exclude;
152 	static vector< set<int > > keep_positions;
153 	static vector< set<int > > exclude_positions;
154 	static map<string,int> chr_to_idx;
155 	static vector< deque<pair<int,int> > > lims;
156 	static ifstream mask;
157 	static string mask_chr;
158 	static string mask_line;
159 	static int mask_pos;
160 	static string thin_chrom;
161 	static int thin_pos;
162 
163 protected:
164 	istringstream data_stream;
165 	vector<char> line;
166 
167 	bool basic_parsed;
168 	bool fully_parsed;
169 	bool parsed_ALT;
170 	bool parsed_FILTER;
171 	bool parsed_INFO;
172 	bool parsed_FORMAT;
173 	bool parsed_FORMAT_binary;
174 
175 	string CHROM;
176 	int POS;
177 	string ID;
178 	string REF;
179 	vector<string> ALT;
180 	double QUAL;
181 	vector<string> FILTER;
182 	vector<pair<string, string> > INFO;
183 	vector<string> FORMAT;
184 	vector<char> FORMAT_binary;
185 	int N_INFO_removed;
186 	int N_FORMAT_removed;
187 
188 	vector< pair<int,int> > GENOTYPE;
189 	vector<int> ploidy;
190 	vector<char>  PHASE;
191 	vector<double> GQUALITY;
192 	vector<int>   DEPTH;
193 	vector< vector<string> > GFILTER;
194 
195 	vector<bool> parsed_GT;
196 	vector<bool> parsed_GQ;
197 	vector<bool> parsed_DP;
198 	vector<bool> parsed_FT;
199 
200 	map<string, unsigned int> FORMAT_to_idx;
201 	int GT_idx;
202 	int GQ_idx;
203 	int DP_idx;
204 	int FT_idx;
205 
206 	void set_indv_DEPTH(unsigned int indv, int in);
207 	vector<unsigned int> FORMAT_positions, FORMAT_types, FORMAT_sizes, FORMAT_skip, FORMAT_keys;
208 };
209 
210 #endif /* ENTRY_H_ */
211