1 #include <fstream>
2 #include <dirent.h>
3 #include <iostream>
4 #include <algorithm>
5 #include <cstring>
6 #include <vector>
7 
8 #include "config.h"
9 #include "constants.h"
10 #include "input.h"
11 
12 #include "MetaPopInfo.h"
13 
14 using namespace std;
15 
16 const string Pop::default_name = "defaultpop";
17 const string Group::default_name = "defaultgrp";
18 
reset_sample_map()19 void MetaPopInfo::reset_sample_map() {
20     sample_indexes_.clear();
21     for (size_t i = 0; i < samples_.size(); ++i) {
22         bool rv = sample_indexes_.insert( {samples_[i].name, i} ).second;
23         if (!rv) {
24             cerr << "Error: Two or more samples have the same name '" << samples_[i].name << "'.\n";
25             throw exception();
26         }
27     }
28 }
29 
reset_pop_map()30 void MetaPopInfo::reset_pop_map() {
31     pop_indexes_.clear();
32     for (size_t i = 0; i < pops_.size(); ++i) {
33         bool rv = pop_indexes_.insert( {pops_[i].name, i} ).second;
34         if (!rv) {
35             cerr << "Error: Two or more populations have the same name '" << pops_[i].name << "'.\n";
36             throw exception();
37         }
38     }
39 }
40 
reset_group_map()41 void MetaPopInfo::reset_group_map() {
42     group_indexes_.clear();
43     for (size_t i = 0; i < groups_.size(); ++i) {
44         bool rv = group_indexes_.insert( {groups_[i].name, i} ).second;
45         if (!rv) {
46             cerr << "Error: Two or more groups have the same name '" << groups_[i].name << "'.\n";
47             throw exception();
48         }
49     }
50 }
51 
reset_orig_order()52 void MetaPopInfo::reset_orig_order() {
53     sample_indexes_orig_order_.clear();
54     for (const string& name : orig_sample_order_) {
55         auto itr = sample_indexes_.find(name);
56         if (itr != sample_indexes_.end())
57             // This sample is still there (n.b. delete_samples/intersect_with).
58             sample_indexes_orig_order_.push_back(itr->second);
59     }
60     assert(sample_indexes_orig_order_.size() == samples_.size());
61 }
62 
init_popmap(const string & pmap_path)63 void MetaPopInfo::init_popmap(const string& pmap_path) {
64     assert(samples_.empty());
65 
66     ifstream fh(pmap_path.c_str(), ifstream::in);
67     if (fh.fail()) {
68         cerr << "Error: Failed to open population map file '" << pmap_path << "'.\n";
69         throw exception();
70     }
71 
72     size_t p = 0; // pop index counter
73     size_t g = 0; // group index counter
74 
75     char line[max_len];
76     memset(line, '\0', max_len);
77     vector<string> parts;
78     while (fh.getline(line, max_len)) {
79         size_t len = strlen(line);
80 
81         // Skip empty lines and comments
82         if (len == 0 || line[0] == '#')
83             continue;
84 
85         // Check for Windows line endings
86         if (line[len - 1] == '\r') {
87             line[len - 1] = '\0';
88             len -= 1;
89         }
90 
91         //
92         // Parse the contents, we expect:
93         // <file name> tab <population string> [tab <group string>]
94         //
95 
96         parse_tsv(line, parts);
97         for (string& part : parts)
98             strip_whitespace(part);
99 
100         if (parts.size() < 2
101             || parts.size() > 3
102             || parts[0].empty()
103             || parts[1].empty()) {
104             cerr << "Error: Malformed population map -- expected 'SAMPLE \\t POP [\\t GROUP]'. In file '" << pmap_path << "', line :\n" << line << "\n";
105             throw exception();
106         }
107 
108         //
109         // Process the sample name.
110         //
111 
112         samples_.push_back(Sample(parts[0]));
113         orig_sample_order_.push_back(parts[0]);
114 
115         //
116         // Process the population field.
117         //
118 
119         pair<map<string,size_t>::iterator, bool> pop_ins = pop_indexes_.insert( {parts[1], p} );
120         size_t pop_index = pop_ins.first->second;
121 
122         samples_.back().pop = pop_index; // Set the sample's population index.
123         if (pop_ins.second) {
124             // Unknown pop
125             pops_.push_back(Pop(parts[1]));
126             ++p;
127         }
128 
129         //
130         // Process the group field, if any
131         //
132 
133         if (parts.size() == 3 && ! parts[2].empty()) {
134             // Get the index of this group -- create it if necessary.
135             pair<map<string,size_t>::iterator, bool> grp_ins = group_indexes_.insert( {parts[2], g} );
136             if (grp_ins.second) {
137                 groups_.push_back(Group(parts[2]));
138                 ++g;
139             }
140             size_t grp_index = grp_ins.first->second;
141 
142             if (pops_[pop_index].group != size_t(-1)) {
143                 // The current pop already has a group, check that it is the same.
144                 if (pops_[pop_index].group != grp_index) {
145                     cerr << "Error: In population map file '"
146                          << pmap_path << "': population '"
147                          << pops_[pop_index].name << "' belongs to two groups, '"
148                          << groups_[pops_[pop_index].group].name << "' and '"
149                          << groups_[grp_index].name << "'\n.";
150                     throw exception();
151                 }
152             } else {
153                 pops_[pop_index].group = grp_index;
154                 groups_[grp_index].pops.push_back(pop_index);
155             }
156         }
157     }
158     if (samples_.empty()) {
159         cerr << "Error: Population map '" << pmap_path << "' appears to be empty.\n";
160         throw exception();
161     }
162 
163     //
164     // Check that all the populations are in a group. Put
165     // populations that do not have a group in a default
166     // one.
167     //
168 
169     bool missing_group = false;
170     if (not groups_.empty()) {
171         for (vector<Pop>::iterator p = pops_.begin(); p != pops_.end(); ++p) {
172             if (p->group == size_t(-1)) {
173                 cerr << "Warning: Population '" << p->name
174                 << "' did not have a group, adding it to '"
175                 << Group::default_name << "'.\n";
176                 missing_group = true;
177             }
178         }
179     } else {
180         missing_group = true;
181     }
182     if (missing_group) {
183         groups_.push_back(Group(Group::default_name));
184         g = groups_.size()-1;
185         group_indexes_.insert( {Group::default_name, g} );
186         for (size_t p = 0; p < pops_.size(); ++p) {
187             if (pops_[p].group == size_t(-1)) {
188                 pops_[p].group = g;
189                 groups_[g].pops.push_back(p);
190             }
191         }
192     }
193 
194     //
195     // Sort the samples. Determine the first/last indexes for each population.
196     //
197 
198     sort(samples_.begin(), samples_.end());
199     reset_sample_map();
200     reset_orig_order();
201 
202     size_t curr_pop = 0;
203     pops_[curr_pop].first_sample = 0;
204     for (size_t s = 1; s < samples_.size(); ++s) {
205         if (samples_[s].pop != curr_pop) {
206             pops_[curr_pop].last_sample = s-1;
207             ++curr_pop;
208             pops_[curr_pop].first_sample = s;
209         }
210     }
211     pops_[curr_pop].last_sample = samples_.size()-1;
212 }
213 
init_names(const vector<string> & sample_names)214 void MetaPopInfo::init_names(const vector<string>& sample_names) {
215     if(sample_names.empty()) {
216         cerr << "Error: No samples.\n";
217         throw exception();
218     }
219     assert(samples_.empty());
220 
221     orig_sample_order_ = sample_names;
222 
223     // Create the samples
224     for (vector<string>::const_iterator s=sample_names.begin(); s!= sample_names.end(); ++s) {
225         samples_.push_back(Sample(*s));
226         samples_.back().pop = 0;
227     }
228     sort(samples_.begin(), samples_.end());
229 
230     // Create a default population
231     pops_.push_back(Pop(Pop::default_name));
232     pops_[0].first_sample = 0;
233     pops_[0].last_sample = samples_.size()-1;
234     pops_[0].group = 0;
235 
236     // Create a default group
237     groups_.push_back(Group(Group::default_name));
238     groups_[0].pops.push_back(0);
239 
240     // Set the support members.
241     reset_sample_map();
242     reset_pop_map();
243     reset_group_map();
244     reset_orig_order();
245 }
246 
init_directory(const string & dir_path)247 void MetaPopInfo::init_directory(const string& dir_path) {
248 
249     //
250     // Find all sample names.
251     //
252 
253     vector<string> sample_names;
254     DIR* dir = opendir(dir_path.c_str());
255     if (dir == NULL) {
256         cerr << "Unable to open directory '" << dir_path << "' for reading.\n";
257         throw exception();
258     }
259     dirent *direntry;
260     while ((direntry = readdir(dir)) != NULL) {
261         string filename = direntry->d_name;
262 
263         if (filename == "." || filename == ".." || filename.substr(0, 6) == "batch_")
264             continue;
265 
266         size_t pos = filename.rfind(".tags.tsv");
267         if (pos == string::npos)
268             pos = filename.rfind(".tags.tsv.gz");
269 
270         if (pos != string::npos)
271             sample_names.push_back(filename.substr(0, pos));
272     }
273     closedir(dir);
274 
275     if (sample_names.empty()) {
276         cerr << "Error: Failed to find sample files in directory '" << dir_path << "'.\n";
277         throw exception();
278     }
279 
280     //
281     // Initialize the MetaPopInfo
282     //
283     init_names(sample_names);
284 }
285 
delete_samples(const vector<size_t> & rm_samples)286 void MetaPopInfo::delete_samples(const vector<size_t>& rm_samples) {
287 
288     if (rm_samples.empty())
289         return;
290 
291     // Remove these samples from [samples_].
292     for (size_t s : rm_samples)
293         samples_.at(s).name.clear(); // Mark the sample for removal.
294     samples_.erase(remove_if(
295             samples_.begin(),
296             samples_.end(),
297             [] (Sample& s) {return s.name.empty();}
298             ), samples_.end());
299 
300     // Update the indexes of the populations.
301     for (Pop& p : pops_) {
302         for (vector<size_t>::const_reverse_iterator rm_sample = rm_samples.rbegin(); rm_sample != rm_samples.rend(); ++rm_sample) {
303             if (p.first_sample > *rm_sample) // n.b. ">"
304                 --p.first_sample;
305             if (p.last_sample >= *rm_sample) // n.b. ">=". Thus if the population becomes
306                                               // empty, [first_sample] will be past [last_sample].
307                                               // n.b. If removing the first pop, last_sample=size_t(-1).
308                 --p.last_sample;
309         }
310     }
311 
312     // Remove empty populations.
313     auto pop_is_empty = [] (Pop& p) {return (p.first_sample > p.last_sample || p.last_sample == size_t(-1));};
314     for(Group& g : groups_)
315         g.pops.erase(remove_if(
316                 g.pops.begin(),
317                 g.pops.end(),
318                 [this,&pop_is_empty] (size_t p) {return pop_is_empty(pops_[p]);}
319                 ), g.pops.end());
320     pops_.erase(remove_if(
321             pops_.begin(),
322             pops_.end(),
323             pop_is_empty
324             ),  pops_.end());
325 
326     // Remove empty groups from [groups_].
327     groups_.erase(remove_if(
328             groups_.begin(),
329             groups_.end(),
330             [] (Group& g) {return g.pops.empty();}
331             ), groups_.end());
332 
333     // Update the support members.
334     reset_sample_map();
335     reset_pop_map();
336     reset_group_map();
337     reset_sample_id_map();
338     reset_orig_order();
339 }
340 
intersect_with(const vector<string> & samples)341 void MetaPopInfo::intersect_with(const vector<string>& samples) {
342     vector<size_t> common_samples;
343     for (const string& s : samples) {
344         auto itr = sample_indexes_.find(s);
345         if (itr != sample_indexes_.end())
346             common_samples.push_back(itr->second);
347     }
348     sort(common_samples.begin(), common_samples.end());
349 
350     vector<size_t> rm_samples;
351     auto next_common = common_samples.begin();
352     for (size_t i=0; i< samples_.size(); ++i)
353         if (next_common != common_samples.end() && i == *next_common)
354             ++next_common;
355         else
356             rm_samples.push_back(i);
357 
358     delete_samples(rm_samples);
359 }
360 
361 void
status(ostream & fh)362 MetaPopInfo::status(ostream &fh)
363 {
364     fh << "Working on " << this->samples().size() << " samples.\n";
365     fh << "Working on " << this->pops().size() << " population(s):\n";
366     for (vector<Pop>::const_iterator p = this->pops().begin(); p != this->pops().end(); p++) {
367         size_t indent   = p->name.length() + 6;
368         size_t line_lim = 120;
369         size_t line_len = 0;
370 
371         fh << "    " << p->name << ": ";
372         for (size_t s = p->first_sample; s < p->last_sample; ++s) {
373             fh << this->samples()[s].name << ", ";
374             line_len += this->samples()[s].name.length() + 2;
375             if (line_len > line_lim) {
376                 fh << "\n" << string(indent, ' ');
377                 line_len = 0;
378             }
379         }
380         fh << this->samples()[p->last_sample].name << "\n";
381     }
382     fh << "Working on " << this->groups().size() << " group(s) of populations:\n";
383     for (vector<Group>::const_iterator g = this->groups().begin(); g != this->groups().end(); g++) {
384         fh << "    " << g->name << ": ";
385         for (vector<size_t>::const_iterator p = g->pops.begin(); p != g->pops.end() -1; ++p) {
386             //rem. end()-1 and back() are safe, there's always at least one pop
387             fh << this->pops()[*p].name << ", ";
388         }
389         fh << this->pops()[g->pops.back()].name << "\n";
390     }
391     fh << "\n";
392 }
393 
reset_sample_id_map()394 void MetaPopInfo::reset_sample_id_map() {
395     sample_indexes_by_id_.clear();
396     for (size_t i = 0; i < samples_.size(); ++i)
397         sample_indexes_by_id_[samples_[i].id] = i;
398 }
399 
fill_files(vector<pair<int,string>> & files) const400 void MetaPopInfo::fill_files(vector<pair<int, string> >& files) const {
401     files.clear();
402     for (vector<Sample>::const_iterator sample = samples_.begin(); sample != samples_.end(); ++sample)
403         files.push_back( {sample->pop, sample->name} );
404 }
405 
fill_sample_ids(vector<int> & sample_ids) const406 void MetaPopInfo::fill_sample_ids(vector<int>& sample_ids) const {
407     sample_ids.clear();
408     for (vector<Sample>::const_iterator sample = samples_.begin(); sample != samples_.end(); ++sample)
409         sample_ids.push_back(sample->id);
410 }
411 
fill_samples(map<int,string> & samples) const412 void MetaPopInfo::fill_samples(map<int, string>& samples) const {
413     samples.clear();
414     for (vector<Sample>::const_iterator sample = samples_.begin(); sample != samples_.end(); ++sample)
415         samples.insert({sample->id, sample->name});
416 }
417 
fill_pop_key(map<int,string> & pop_key) const418 void MetaPopInfo::fill_pop_key(map<int, string>& pop_key) const {
419     pop_key.clear();
420     for (size_t i = 0; i < pops_.size(); ++i)
421         pop_key.insert( {i, pops_[i].name} );
422 }
423 
fill_pop_indexes(map<int,pair<int,int>> & pop_indexes) const424 void MetaPopInfo::fill_pop_indexes(map<int, pair<int, int> >& pop_indexes) const {
425     pop_indexes.clear();
426     for (size_t i = 0; i < pops_.size(); ++i)
427         pop_indexes.insert( {i, {pops_[i].first_sample, pops_[i].last_sample}} );
428 }
429 
fill_grp_key(map<int,string> & grp_key) const430 void MetaPopInfo::fill_grp_key(map<int, string>& grp_key) const {
431     grp_key.clear();
432     for (size_t i = 0; i < groups_.size(); ++i)
433         grp_key.insert({i, groups_[i].name});
434 }
435 
fill_grp_members(map<int,vector<int>> & grp_members) const436 void MetaPopInfo::fill_grp_members(map<int, vector<int> >& grp_members) const {
437     grp_members.clear();
438     for (size_t i = 0; i < groups_.size(); ++i) {
439         vector<int>& pop_ids = grp_members.insert( {i, vector<int>()} ).first->second;
440         for(vector<size_t>::const_iterator p = groups_[i].pops.begin(); p != groups_[i].pops.end(); ++p)
441             pop_ids.push_back(*p);
442     }
443 }
444