1 /*
2 * variant_file_filters.cpp
3 *
4 * Author: amarcketta
5 */
6
7 #include "variant_file.h"
8
apply_filters(const parameters & params)9 void variant_file::apply_filters(const parameters ¶ms)
10 {
11 filter_individuals(params.indv_to_keep, params.indv_to_exclude, params.indv_keep_files, params.indv_exclude_files);
12 filter_individuals_randomly(params.max_N_indv);
13 }
14
filter_individuals(const set<string> & indv_to_keep,const set<string> & indv_to_exclude,const vector<string> & indv_to_keep_filenames,const vector<string> & indv_to_exclude_filenames,bool keep_then_exclude)15 void variant_file::filter_individuals(const set<string> &indv_to_keep, const set<string> &indv_to_exclude, const vector<string> &indv_to_keep_filenames, const vector<string> &indv_to_exclude_filenames, bool keep_then_exclude)
16 {
17 // Filter individuals by user provided lists
18 if (keep_then_exclude)
19 {
20 filter_individuals_by_keep_list(indv_to_keep, indv_to_keep_filenames);
21 filter_individuals_by_exclude_list(indv_to_exclude, indv_to_exclude_filenames);
22 }
23 else
24 {
25 filter_individuals_by_exclude_list(indv_to_exclude, indv_to_exclude_filenames);
26 filter_individuals_by_keep_list(indv_to_keep, indv_to_keep_filenames);
27 }
28 }
29
filter_individuals_by_keep_list(const set<string> & indv_to_keep,const vector<string> & indv_to_keep_filenames)30 void variant_file::filter_individuals_by_keep_list(const set<string> &indv_to_keep, const vector<string> &indv_to_keep_filenames)
31 {
32 // Filter individuals by user provided list
33 if ((indv_to_keep_filenames.size() == 0) && (indv_to_keep.size() == 0))
34 return;
35
36 LOG.printLOG("Keeping individuals in 'keep' list\n");
37 set<string> indv_to_keep_copy = indv_to_keep;
38 if (indv_to_keep_filenames.size() != 0)
39 {
40 for (unsigned int ui=0; ui<indv_to_keep_filenames.size(); ui++)
41 {
42 ifstream infile(indv_to_keep_filenames[ui].c_str());
43 if (!infile.is_open())
44 LOG.error("Could not open Individual file:" + indv_to_keep_filenames[ui], 1);
45 string line;
46 string tmp_indv;
47 stringstream ss;
48 while (!infile.eof())
49 {
50 getline(infile, line);
51 ss.str(line);
52 ss >> tmp_indv;
53 indv_to_keep_copy.insert(tmp_indv);
54 ss.clear();
55 }
56 infile.close();
57 }
58 }
59 for (unsigned int ui=0; ui<include_indv.size(); ui++)
60 {
61 if (include_indv[ui] == false)
62 continue;
63 if (indv_to_keep_copy.find(meta_data.indv[ui]) == indv_to_keep_copy.end())
64 include_indv[ui] = false;
65 }
66 }
67
filter_individuals_by_exclude_list(const set<string> & indv_to_exclude,const vector<string> & indv_to_exclude_filenames)68 void variant_file::filter_individuals_by_exclude_list(const set<string> &indv_to_exclude, const vector<string> &indv_to_exclude_filenames)
69 {
70 // Filter individuals by user provided list
71 if ((indv_to_exclude_filenames.size() == 0) && (indv_to_exclude.size() == 0))
72 return;
73 LOG.printLOG("Excluding individuals in 'exclude' list\n");
74 set<string> indv_to_exclude_copy = indv_to_exclude;
75 if (indv_to_exclude_filenames.size() != 0)
76 {
77 for (unsigned int ui=0; ui<indv_to_exclude_filenames.size(); ui++)
78 {
79 ifstream infile(indv_to_exclude_filenames[ui].c_str());
80 if (!infile.is_open())
81 LOG.error("Could not open Individual file:" + indv_to_exclude_filenames[ui], 1);
82 string line;
83 string tmp_indv;
84 stringstream ss;
85 while (!infile.eof())
86 {
87 getline(infile, line);
88 ss.str(line);
89 ss >> tmp_indv;
90 indv_to_exclude_copy.insert(tmp_indv);
91 ss.clear();
92 }
93 infile.close();
94 }
95 }
96 for (unsigned int ui=0; ui<include_indv.size(); ui++)
97 {
98 if (include_indv[ui] == false)
99 continue;
100 if (indv_to_exclude_copy.find(meta_data.indv[ui]) != indv_to_exclude_copy.end())
101 include_indv[ui] = false;
102 }
103 }
104
filter_individuals_randomly(int max_N_indv)105 void variant_file::filter_individuals_randomly(int max_N_indv)
106 {
107 // Filter individuals randomly until have a random subset
108 if (max_N_indv < 0)
109 return;
110 LOG.printLOG("Filtering Individuals Randomly\n");
111
112 if (meta_data.has_genotypes == false)
113 LOG.error("Require Genotypes in variant file filter individuals.");
114
115 unsigned int N_kept_indv = N_kept_individuals();
116
117 srand ( time(NULL) );
118 vector<unsigned int> keep_index(N_kept_indv);
119 int count = 0;
120 for (unsigned int ui=0; ui<meta_data.N_indv; ui++)
121 {
122 if (include_indv[ui] == true)
123 {
124 keep_index[count] = ui;
125 count++;
126 }
127 }
128
129 random_shuffle(keep_index.begin(), keep_index.end()); // Get a random order
130 keep_index.resize(min(max_N_indv, (signed)keep_index.size())); // Only keep a subset
131
132 for (unsigned int ui=0; ui<meta_data.N_indv; ui++)
133 {
134 if (include_indv[ui] == false)
135 continue;
136 bool found = false;
137 for (unsigned int uj=0; uj<keep_index.size(); uj++)
138 {
139 if (keep_index[uj] == ui)
140 {
141 found = true;
142 }
143 }
144 if (found == false)
145 include_indv[ui] = false;
146 }
147 }
148