1 #include <fstream>
2 #include <dirent.h>
3 #include <iostream>
4 #include <algorithm>
5 #include <cstring>
6 #include <vector>
7
8 #include "config.h"
9 #include "constants.h"
10 #include "input.h"
11
12 #include "MetaPopInfo.h"
13
14 using namespace std;
15
16 const string Pop::default_name = "defaultpop";
17 const string Group::default_name = "defaultgrp";
18
reset_sample_map()19 void MetaPopInfo::reset_sample_map() {
20 sample_indexes_.clear();
21 for (size_t i = 0; i < samples_.size(); ++i) {
22 bool rv = sample_indexes_.insert( {samples_[i].name, i} ).second;
23 if (!rv) {
24 cerr << "Error: Two or more samples have the same name '" << samples_[i].name << "'.\n";
25 throw exception();
26 }
27 }
28 }
29
reset_pop_map()30 void MetaPopInfo::reset_pop_map() {
31 pop_indexes_.clear();
32 for (size_t i = 0; i < pops_.size(); ++i) {
33 bool rv = pop_indexes_.insert( {pops_[i].name, i} ).second;
34 if (!rv) {
35 cerr << "Error: Two or more populations have the same name '" << pops_[i].name << "'.\n";
36 throw exception();
37 }
38 }
39 }
40
reset_group_map()41 void MetaPopInfo::reset_group_map() {
42 group_indexes_.clear();
43 for (size_t i = 0; i < groups_.size(); ++i) {
44 bool rv = group_indexes_.insert( {groups_[i].name, i} ).second;
45 if (!rv) {
46 cerr << "Error: Two or more groups have the same name '" << groups_[i].name << "'.\n";
47 throw exception();
48 }
49 }
50 }
51
reset_orig_order()52 void MetaPopInfo::reset_orig_order() {
53 sample_indexes_orig_order_.clear();
54 for (const string& name : orig_sample_order_) {
55 auto itr = sample_indexes_.find(name);
56 if (itr != sample_indexes_.end())
57 // This sample is still there (n.b. delete_samples/intersect_with).
58 sample_indexes_orig_order_.push_back(itr->second);
59 }
60 assert(sample_indexes_orig_order_.size() == samples_.size());
61 }
62
init_popmap(const string & pmap_path)63 void MetaPopInfo::init_popmap(const string& pmap_path) {
64 assert(samples_.empty());
65
66 ifstream fh(pmap_path.c_str(), ifstream::in);
67 if (fh.fail()) {
68 cerr << "Error: Failed to open population map file '" << pmap_path << "'.\n";
69 throw exception();
70 }
71
72 size_t p = 0; // pop index counter
73 size_t g = 0; // group index counter
74
75 char line[max_len];
76 memset(line, '\0', max_len);
77 vector<string> parts;
78 while (fh.getline(line, max_len)) {
79 size_t len = strlen(line);
80
81 // Skip empty lines and comments
82 if (len == 0 || line[0] == '#')
83 continue;
84
85 // Check for Windows line endings
86 if (line[len - 1] == '\r') {
87 line[len - 1] = '\0';
88 len -= 1;
89 }
90
91 //
92 // Parse the contents, we expect:
93 // <file name> tab <population string> [tab <group string>]
94 //
95
96 parse_tsv(line, parts);
97 for (string& part : parts)
98 strip_whitespace(part);
99
100 if (parts.size() < 2
101 || parts.size() > 3
102 || parts[0].empty()
103 || parts[1].empty()) {
104 cerr << "Error: Malformed population map -- expected 'SAMPLE \\t POP [\\t GROUP]'. In file '" << pmap_path << "', line :\n" << line << "\n";
105 throw exception();
106 }
107
108 //
109 // Process the sample name.
110 //
111
112 samples_.push_back(Sample(parts[0]));
113 orig_sample_order_.push_back(parts[0]);
114
115 //
116 // Process the population field.
117 //
118
119 pair<map<string,size_t>::iterator, bool> pop_ins = pop_indexes_.insert( {parts[1], p} );
120 size_t pop_index = pop_ins.first->second;
121
122 samples_.back().pop = pop_index; // Set the sample's population index.
123 if (pop_ins.second) {
124 // Unknown pop
125 pops_.push_back(Pop(parts[1]));
126 ++p;
127 }
128
129 //
130 // Process the group field, if any
131 //
132
133 if (parts.size() == 3 && ! parts[2].empty()) {
134 // Get the index of this group -- create it if necessary.
135 pair<map<string,size_t>::iterator, bool> grp_ins = group_indexes_.insert( {parts[2], g} );
136 if (grp_ins.second) {
137 groups_.push_back(Group(parts[2]));
138 ++g;
139 }
140 size_t grp_index = grp_ins.first->second;
141
142 if (pops_[pop_index].group != size_t(-1)) {
143 // The current pop already has a group, check that it is the same.
144 if (pops_[pop_index].group != grp_index) {
145 cerr << "Error: In population map file '"
146 << pmap_path << "': population '"
147 << pops_[pop_index].name << "' belongs to two groups, '"
148 << groups_[pops_[pop_index].group].name << "' and '"
149 << groups_[grp_index].name << "'\n.";
150 throw exception();
151 }
152 } else {
153 pops_[pop_index].group = grp_index;
154 groups_[grp_index].pops.push_back(pop_index);
155 }
156 }
157 }
158 if (samples_.empty()) {
159 cerr << "Error: Population map '" << pmap_path << "' appears to be empty.\n";
160 throw exception();
161 }
162
163 //
164 // Check that all the populations are in a group. Put
165 // populations that do not have a group in a default
166 // one.
167 //
168
169 bool missing_group = false;
170 if (not groups_.empty()) {
171 for (vector<Pop>::iterator p = pops_.begin(); p != pops_.end(); ++p) {
172 if (p->group == size_t(-1)) {
173 cerr << "Warning: Population '" << p->name
174 << "' did not have a group, adding it to '"
175 << Group::default_name << "'.\n";
176 missing_group = true;
177 }
178 }
179 } else {
180 missing_group = true;
181 }
182 if (missing_group) {
183 groups_.push_back(Group(Group::default_name));
184 g = groups_.size()-1;
185 group_indexes_.insert( {Group::default_name, g} );
186 for (size_t p = 0; p < pops_.size(); ++p) {
187 if (pops_[p].group == size_t(-1)) {
188 pops_[p].group = g;
189 groups_[g].pops.push_back(p);
190 }
191 }
192 }
193
194 //
195 // Sort the samples. Determine the first/last indexes for each population.
196 //
197
198 sort(samples_.begin(), samples_.end());
199 reset_sample_map();
200 reset_orig_order();
201
202 size_t curr_pop = 0;
203 pops_[curr_pop].first_sample = 0;
204 for (size_t s = 1; s < samples_.size(); ++s) {
205 if (samples_[s].pop != curr_pop) {
206 pops_[curr_pop].last_sample = s-1;
207 ++curr_pop;
208 pops_[curr_pop].first_sample = s;
209 }
210 }
211 pops_[curr_pop].last_sample = samples_.size()-1;
212 }
213
init_names(const vector<string> & sample_names)214 void MetaPopInfo::init_names(const vector<string>& sample_names) {
215 if(sample_names.empty()) {
216 cerr << "Error: No samples.\n";
217 throw exception();
218 }
219 assert(samples_.empty());
220
221 orig_sample_order_ = sample_names;
222
223 // Create the samples
224 for (vector<string>::const_iterator s=sample_names.begin(); s!= sample_names.end(); ++s) {
225 samples_.push_back(Sample(*s));
226 samples_.back().pop = 0;
227 }
228 sort(samples_.begin(), samples_.end());
229
230 // Create a default population
231 pops_.push_back(Pop(Pop::default_name));
232 pops_[0].first_sample = 0;
233 pops_[0].last_sample = samples_.size()-1;
234 pops_[0].group = 0;
235
236 // Create a default group
237 groups_.push_back(Group(Group::default_name));
238 groups_[0].pops.push_back(0);
239
240 // Set the support members.
241 reset_sample_map();
242 reset_pop_map();
243 reset_group_map();
244 reset_orig_order();
245 }
246
init_directory(const string & dir_path)247 void MetaPopInfo::init_directory(const string& dir_path) {
248
249 //
250 // Find all sample names.
251 //
252
253 vector<string> sample_names;
254 DIR* dir = opendir(dir_path.c_str());
255 if (dir == NULL) {
256 cerr << "Unable to open directory '" << dir_path << "' for reading.\n";
257 throw exception();
258 }
259 dirent *direntry;
260 while ((direntry = readdir(dir)) != NULL) {
261 string filename = direntry->d_name;
262
263 if (filename == "." || filename == ".." || filename.substr(0, 6) == "batch_")
264 continue;
265
266 size_t pos = filename.rfind(".tags.tsv");
267 if (pos == string::npos)
268 pos = filename.rfind(".tags.tsv.gz");
269
270 if (pos != string::npos)
271 sample_names.push_back(filename.substr(0, pos));
272 }
273 closedir(dir);
274
275 if (sample_names.empty()) {
276 cerr << "Error: Failed to find sample files in directory '" << dir_path << "'.\n";
277 throw exception();
278 }
279
280 //
281 // Initialize the MetaPopInfo
282 //
283 init_names(sample_names);
284 }
285
delete_samples(const vector<size_t> & rm_samples)286 void MetaPopInfo::delete_samples(const vector<size_t>& rm_samples) {
287
288 if (rm_samples.empty())
289 return;
290
291 // Remove these samples from [samples_].
292 for (size_t s : rm_samples)
293 samples_.at(s).name.clear(); // Mark the sample for removal.
294 samples_.erase(remove_if(
295 samples_.begin(),
296 samples_.end(),
297 [] (Sample& s) {return s.name.empty();}
298 ), samples_.end());
299
300 // Update the indexes of the populations.
301 for (Pop& p : pops_) {
302 for (vector<size_t>::const_reverse_iterator rm_sample = rm_samples.rbegin(); rm_sample != rm_samples.rend(); ++rm_sample) {
303 if (p.first_sample > *rm_sample) // n.b. ">"
304 --p.first_sample;
305 if (p.last_sample >= *rm_sample) // n.b. ">=". Thus if the population becomes
306 // empty, [first_sample] will be past [last_sample].
307 // n.b. If removing the first pop, last_sample=size_t(-1).
308 --p.last_sample;
309 }
310 }
311
312 // Remove empty populations.
313 auto pop_is_empty = [] (Pop& p) {return (p.first_sample > p.last_sample || p.last_sample == size_t(-1));};
314 for(Group& g : groups_)
315 g.pops.erase(remove_if(
316 g.pops.begin(),
317 g.pops.end(),
318 [this,&pop_is_empty] (size_t p) {return pop_is_empty(pops_[p]);}
319 ), g.pops.end());
320 pops_.erase(remove_if(
321 pops_.begin(),
322 pops_.end(),
323 pop_is_empty
324 ), pops_.end());
325
326 // Remove empty groups from [groups_].
327 groups_.erase(remove_if(
328 groups_.begin(),
329 groups_.end(),
330 [] (Group& g) {return g.pops.empty();}
331 ), groups_.end());
332
333 // Update the support members.
334 reset_sample_map();
335 reset_pop_map();
336 reset_group_map();
337 reset_sample_id_map();
338 reset_orig_order();
339 }
340
intersect_with(const vector<string> & samples)341 void MetaPopInfo::intersect_with(const vector<string>& samples) {
342 vector<size_t> common_samples;
343 for (const string& s : samples) {
344 auto itr = sample_indexes_.find(s);
345 if (itr != sample_indexes_.end())
346 common_samples.push_back(itr->second);
347 }
348 sort(common_samples.begin(), common_samples.end());
349
350 vector<size_t> rm_samples;
351 auto next_common = common_samples.begin();
352 for (size_t i=0; i< samples_.size(); ++i)
353 if (next_common != common_samples.end() && i == *next_common)
354 ++next_common;
355 else
356 rm_samples.push_back(i);
357
358 delete_samples(rm_samples);
359 }
360
361 void
status(ostream & fh)362 MetaPopInfo::status(ostream &fh)
363 {
364 fh << "Working on " << this->samples().size() << " samples.\n";
365 fh << "Working on " << this->pops().size() << " population(s):\n";
366 for (vector<Pop>::const_iterator p = this->pops().begin(); p != this->pops().end(); p++) {
367 size_t indent = p->name.length() + 6;
368 size_t line_lim = 120;
369 size_t line_len = 0;
370
371 fh << " " << p->name << ": ";
372 for (size_t s = p->first_sample; s < p->last_sample; ++s) {
373 fh << this->samples()[s].name << ", ";
374 line_len += this->samples()[s].name.length() + 2;
375 if (line_len > line_lim) {
376 fh << "\n" << string(indent, ' ');
377 line_len = 0;
378 }
379 }
380 fh << this->samples()[p->last_sample].name << "\n";
381 }
382 fh << "Working on " << this->groups().size() << " group(s) of populations:\n";
383 for (vector<Group>::const_iterator g = this->groups().begin(); g != this->groups().end(); g++) {
384 fh << " " << g->name << ": ";
385 for (vector<size_t>::const_iterator p = g->pops.begin(); p != g->pops.end() -1; ++p) {
386 //rem. end()-1 and back() are safe, there's always at least one pop
387 fh << this->pops()[*p].name << ", ";
388 }
389 fh << this->pops()[g->pops.back()].name << "\n";
390 }
391 fh << "\n";
392 }
393
reset_sample_id_map()394 void MetaPopInfo::reset_sample_id_map() {
395 sample_indexes_by_id_.clear();
396 for (size_t i = 0; i < samples_.size(); ++i)
397 sample_indexes_by_id_[samples_[i].id] = i;
398 }
399
fill_files(vector<pair<int,string>> & files) const400 void MetaPopInfo::fill_files(vector<pair<int, string> >& files) const {
401 files.clear();
402 for (vector<Sample>::const_iterator sample = samples_.begin(); sample != samples_.end(); ++sample)
403 files.push_back( {sample->pop, sample->name} );
404 }
405
fill_sample_ids(vector<int> & sample_ids) const406 void MetaPopInfo::fill_sample_ids(vector<int>& sample_ids) const {
407 sample_ids.clear();
408 for (vector<Sample>::const_iterator sample = samples_.begin(); sample != samples_.end(); ++sample)
409 sample_ids.push_back(sample->id);
410 }
411
fill_samples(map<int,string> & samples) const412 void MetaPopInfo::fill_samples(map<int, string>& samples) const {
413 samples.clear();
414 for (vector<Sample>::const_iterator sample = samples_.begin(); sample != samples_.end(); ++sample)
415 samples.insert({sample->id, sample->name});
416 }
417
fill_pop_key(map<int,string> & pop_key) const418 void MetaPopInfo::fill_pop_key(map<int, string>& pop_key) const {
419 pop_key.clear();
420 for (size_t i = 0; i < pops_.size(); ++i)
421 pop_key.insert( {i, pops_[i].name} );
422 }
423
fill_pop_indexes(map<int,pair<int,int>> & pop_indexes) const424 void MetaPopInfo::fill_pop_indexes(map<int, pair<int, int> >& pop_indexes) const {
425 pop_indexes.clear();
426 for (size_t i = 0; i < pops_.size(); ++i)
427 pop_indexes.insert( {i, {pops_[i].first_sample, pops_[i].last_sample}} );
428 }
429
fill_grp_key(map<int,string> & grp_key) const430 void MetaPopInfo::fill_grp_key(map<int, string>& grp_key) const {
431 grp_key.clear();
432 for (size_t i = 0; i < groups_.size(); ++i)
433 grp_key.insert({i, groups_[i].name});
434 }
435
fill_grp_members(map<int,vector<int>> & grp_members) const436 void MetaPopInfo::fill_grp_members(map<int, vector<int> >& grp_members) const {
437 grp_members.clear();
438 for (size_t i = 0; i < groups_.size(); ++i) {
439 vector<int>& pop_ids = grp_members.insert( {i, vector<int>()} ).first->second;
440 for(vector<size_t>::const_iterator p = groups_[i].pops.begin(); p != groups_[i].pops.end(); ++p)
441 pop_ids.push_back(*p);
442 }
443 }
444