1 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
2 //
3 // Copyright 2011-2016, Julian Catchen <jcatchen@illinois.edu>
4 //
5 // This file is part of Stacks.
6 //
7 // Stacks is free software: you can redistribute it and/or modify
8 // it under the terms of the GNU General Public License as published by
9 // the Free Software Foundation, either version 3 of the License, or
10 // (at your option) any later version.
11 //
12 // Stacks is distributed in the hope that it will be useful,
13 // but WITHOUT ANY WARRANTY; without even the implied warranty of
14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 // GNU General Public License for more details.
16 //
17 // You should have received a copy of the GNU General Public License
18 // along with Stacks.  If not, see <http://www.gnu.org/licenses/>.
19 //
20 
21 #ifndef __POPMAP_H__
22 #define __POPMAP_H__
23 
24 #include <exception>
25 #include <cstring>
26 #include <string>
27 #include <vector>
28 #include <map>
29 #include <set>
30 #include <numeric>
31 using std::accumulate;
32 #include <algorithm>
33 #include <utility>
34 
35 #include "stacks.h"
36 #include "locus.h"
37 #include "aln_utils.h"
38 #include "MetaPopInfo.h"
39 #include "Vcf.h"
40 
41 class Datum {
42 public:
43     struct SNPData {
44         size_t tot_depth;
45         Counts<Nt2> nt_depths;
46         uint8_t gq;
47         GtLiks gtliks;
SNPDataSNPData48         SNPData() : tot_depth(0), gq(UINT8_MAX) {}
49     };
50 
51     int            id;            // Stack ID
52     int            merge_partner; // Stack ID of merged datum, if this datum was merged/phased from two, overlapping datums.
53     int            len;           // Length of locus
54     bool           corrected;     // Has this genotype call been corrected
55     char          *model;         // String representing SNP model output for each nucleotide at this locus.
56     char          *gtype;         // Genotype
57     char          *trans_gtype;   // Translated Genotype
58     char          *cigar;         // CIGAR string describing how the datum aligns to the catalog locus.
59     vector<char *> obshap;        // Observed Haplotypes
60     vector<SNPData> snpdata;
61 
Datum()62     Datum()  {
63         this->id            = -1;
64         this->corrected     = false;
65         this->gtype         = NULL;
66         this->trans_gtype   = NULL;
67         this->model         = NULL;
68         this->cigar         = NULL;
69         this->len           = 0;
70         this->merge_partner = 0;
71     }
~Datum()72     ~Datum() {
73         for (uint i = 0; i < this->obshap.size(); i++)
74             delete [] this->obshap[i];
75         delete [] this->gtype;
76         delete [] this->trans_gtype;
77         delete [] this->model;
78         delete [] this->cigar;
79     }
80 };
81 
82 template<class LocusT=Locus>
83 class PopMap {
84     const MetaPopInfo& metapopinfo;
85     int      num_loci;
86     set<pair<int, int> > blacklist;
87     Datum ***data;
88     map<int, int> locus_order;  // LocusID => ArrayIndex; map catalog IDs to their first dimension
89                                 // position in the Datum array.
90     map<int, int> rev_locus_order;
91     map<string, vector<LocusT *> > ordered_loci_; // Loci ordered by genomic position
92 
93 public:
94     PopMap(const MetaPopInfo& mpopi, int n_loci);
95     ~PopMap();
96 
97     // Populates the Locus & PopMap based on Stacks (v2) files.
98     static void populate_internal(
99         LocusT* cloc,
100         Datum** locdata,
101         const Seq& fasta_record,
102         const vector<VcfRecord>& vcf_records,
103         const VcfHeader& vcf_header,
104         const MetaPopInfo* mpopi,
105         const vector<size_t>& sample_vcf_i_to_mpopi_i
106     );
107     // Populates the Locus & PopMap based on external VCF files. Returns false
108     // and prints warnings if the parsing of the record fails.
109     static bool populate_external(
110         LocusT* cloc,
111         Datum** locdata,
112         int cloc_id,
113         const VcfRecord& vcf_record,
114         const VcfHeader& vcf_header,
115         const MetaPopInfo* mpopi,
116         const vector<size_t>& sample_vcf_i_to_mpopi_i
117     );
118 
119     //
120     // Obtain indexes, etc.
121     //
loci_cnt()122     int loci_cnt() const { return this->num_loci; }
locus_index(int id)123     int locus_index(int id) const {return locus_order.at(id);}
rev_locus_index(int index)124     int rev_locus_index(int index) const {try {return rev_locus_order.at(index);} catch(exception&) {return -1;}}
125 
sample_cnt()126     int sample_cnt() const { return metapopinfo.samples().size(); }
sample_index(int id)127     int sample_index(int id) const {try {return metapopinfo.get_sample_index(id);} catch (exception&) {return -1;}}
rev_sample_index(int index)128     int rev_sample_index(int index) const {return metapopinfo.samples().at(index).id;}
129 
blacklisted(int loc_id,int sample_id)130     bool blacklisted(int loc_id, int sample_id) const {return blacklist.count({sample_id, loc_id});}
131 
132     //
133     // Access the Datums.
134     //
locus(int id)135     Datum **locus(int id) { return this->data[this->locus_order[id]]; }
datum(int loc_id,int sample_id)136     Datum  *datum(int loc_id, int sample_id) { return this->locus(loc_id)[metapopinfo.get_sample_index(sample_id)]; }
137 };
138 
139 template<class LocusT>
PopMap(const MetaPopInfo & mpopi,int num_loci)140 PopMap<LocusT>::PopMap(const MetaPopInfo& mpopi, int num_loci): metapopinfo(mpopi)
141 {
142     this->data = new Datum **[num_loci];
143 
144     for (int i = 0; i < num_loci; i++) {
145         this->data[i] = new Datum *[metapopinfo.samples().size()];
146 
147         for (size_t j = 0; j < metapopinfo.samples().size(); j++)
148             this->data[i][j] = NULL;
149     }
150 
151     this->num_loci    = num_loci;
152 }
153 
154 template<class LocusT>
~PopMap()155 PopMap<LocusT>::~PopMap() {
156     for (int i = 0; i < this->num_loci; i++) {
157         for (size_t j = 0; j < metapopinfo.samples().size(); j++)
158             delete this->data[i][j];
159         delete [] this->data[i];
160     }
161     delete [] this->data;
162 }
163 
164 template<class LocusT>
populate_internal(LocusT * cloc,Datum ** locdata,const Seq & fasta_record,const vector<VcfRecord> & vcf_records,const VcfHeader & vcf_header,const MetaPopInfo * mpopi,const vector<size_t> & sample_vcf_i_to_mpopi_i)165 void PopMap<LocusT>::populate_internal(
166     LocusT* cloc,
167     Datum** locdata,
168     const Seq& fasta_record,
169     const vector<VcfRecord>& vcf_records,
170     const VcfHeader& vcf_header,
171     const MetaPopInfo* mpopi,
172     const vector<size_t>& sample_vcf_i_to_mpopi_i
173 ) { try {
174     assert(fasta_record.id != NULL);
175     assert(fasta_record.seq != NULL);
176     assert(!fasta_record.comment.empty());
177     assert(!vcf_records.empty());
178     assert(sample_vcf_i_to_mpopi_i.size() == vcf_header.samples().size());
179 
180     // Parse the FASTA record.
181     // ==========
182     // Locus ID.
183     cloc->id = is_integer(fasta_record.id);
184     if (cloc->id < 0) {
185         cerr << "Error: Unable to parse locus ID.\n";
186         throw exception();
187     }
188     // Consensus sequence.
189     cloc->add_consensus(fasta_record.seq);
190     if (cloc->len < vcf_records.back().pos() + 1) {
191         cerr << "Error: Bad consensus length.\n";
192         throw exception();
193     }
194     // Locus position:
195     // If the analysis is reference-based, there will be a ' pos=...' field on
196     // the fasta ID line, placed as a comment adjacent to the locus ID.
197     // Overlap: if this data is de novo, there will potentially be an overlap
198     // length between the single and paired-end contigs.
199     assert(cloc->loc.empty());
200     const char *p, *q;
201     p = fasta_record.comment.c_str();
202     do {
203         q = p;
204         while (*q != '\0' && *q != ' ' && *q != '\t')
205             ++q;
206         if (strncmp(p, "pos=", 4) == 0) {
207             p += 4;
208             cloc->loc = PhyLoc(string(p, q));
209         } else if (strncmp(p, "contig=overlapped:", 18) == 0) {
210             p += 18;
211             cloc->overlap = is_integer(p);
212             cloc->pe_ctg = true;
213         } else if (strncmp(p, "contig=separate", 15) == 0) {
214             cloc->pe_ctg = true;
215         }
216         p = q;
217         while(*p != '\0' && (*p == ' ' || *p == '\t'))
218             ++p;
219     } while (*p != '\0');
220     if (cloc->loc.empty())
221         cloc->loc = PhyLoc("", 0, strand_plus);
222 
223     // Parse the VCF records.
224     // ==========
225 
226     // CSLocus.
227     // ----------
228     for (auto& rec : vcf_records) {
229         if (!rec.is_monomorphic()) {
230             SNP* snp = new SNP();
231             cloc->snps.push_back(snp);
232             snp->type = snp_type_het;
233             snp->col = rec.pos();
234             auto a = rec.begin_alleles();
235             snp->rank_1 = **a;
236             ++a;
237             assert(a != rec.end_alleles());
238             snp->rank_2 = **a;
239             ++a;
240             if (a != rec.end_alleles()) {
241                 snp->rank_3 = **a;
242                 ++a;
243                 if (a != rec.end_alleles())
244                     snp->rank_4 = **a;
245             }
246         }
247     }
248 
249     // Genotypes.
250     // ----------
251     size_t snp_i = 0;
252     --snp_i;
253     vector<Nt2> rec_alleles;
254     size_t dp_index = SIZE_MAX;
255     size_t ad_index = SIZE_MAX;
256     size_t gq_index = SIZE_MAX;
257     size_t gl_index = SIZE_MAX;
258     for (auto& rec : vcf_records)
259     { try {
260         size_t col = rec.pos();
261         bool snp_rec = !rec.is_monomorphic();
262         if (snp_rec) {
263             if (!rec.is_snp()) {
264                 cerr << "Error: Not a SNP.\n";
265                 throw exception();
266             }
267             ++snp_i;
268             rec_alleles.clear();
269             for(auto a = rec.begin_alleles(); a != rec.end_alleles(); ++a)
270                 rec_alleles.push_back(Nt2(**a));
271             if (rec.index_of_gt_subfield("PS") != 1)
272                 throw exception();
273             dp_index = rec.index_of_gt_subfield("DP");
274             ad_index = rec.index_of_gt_subfield("AD");
275             gq_index = rec.index_of_gt_subfield("GQ");
276             gl_index = rec.index_of_gt_subfield("GL");
277         } else {
278             assert(rec.count_formats() == 1 && strcmp(rec.format0(),"DP")==0);
279         }
280         VcfRecord::iterator gt_itr = rec.begin_samples();
281         for (size_t sample_vcf_i = 0;
282             sample_vcf_i < vcf_header.samples().size();
283             ++gt_itr, ++sample_vcf_i
284         ) { try {
285             assert(gt_itr != rec.end_samples());
286             // Check that the sample is present in the population map.
287             size_t sample_mpopi_i = sample_vcf_i_to_mpopi_i[sample_vcf_i];
288             if (sample_mpopi_i == SIZE_MAX)
289                 continue;
290             else
291                 assert(sample_mpopi_i < mpopi->samples().size());
292             // Check if the sample has data.
293             const char* gt_str = *gt_itr;
294             if (gt_str[0] == '.')
295                 continue;
296             Datum* d = locdata[sample_mpopi_i];
297             if (snp_rec) {
298                 // Check that this isn't an unphased HET.
299                 // N.B.: GStacks writes a single phase set per sample.
300                 pair<long,long> gt = VcfRecord::util::parse_gt_gt(gt_str);
301                 assert(gt.first >= 0);
302                 if (gt.first != gt.second) {
303                     const char* ps = strchr(gt_str, ':');
304                     if (ps == NULL)
305                         throw exception();
306                     ++ps;
307                     if (ps[0] == '.')
308                         continue;
309                 }
310             }
311             // This sample has data for this locus for this column.
312             if (d == NULL) {
313                 ++cloc->cnt;
314                 d = new Datum();
315                 locdata[sample_mpopi_i] = d;
316                 d->id = cloc->id;
317                 d->len = cloc->len;
318                 d->model = new char[cloc->len+1];
319                 memset(d->model, 'U', cloc->len);
320                 d->model[cloc->len] = '\0';
321                 if (!cloc->snps.empty()) {
322                     d->snpdata = vector<Datum::SNPData>(cloc->snps.size());
323                     for (size_t i=0; i<2; ++i) {
324                         char* hap = new char[cloc->snps.size()+1];
325                         d->obshap.push_back(hap);
326                         memset(hap, 'N', cloc->snps.size());
327                         hap[cloc->snps.size()] = '\0';
328                     }
329                 } else {
330                     d->obshap.push_back(new char[10]);
331                     strncpy(d->obshap[0], "consensus", 10);
332                 }
333             }
334             // Handle the non-SNP-record case.
335             if (!snp_rec) {
336                 d->model[col] = 'O';
337                 continue;
338             }
339             // SNP column.
340             pair<long,long> gt = VcfRecord::util::parse_gt_gt(gt_str);
341             d->model[col] = (gt.first == gt.second ? 'O' : 'E');
342             d->obshap[0][snp_i] = char(rec_alleles.at(gt.first));
343             d->obshap[1][snp_i] = char(rec_alleles.at(gt.second));
344             // Record additional information on the call.
345             Datum::SNPData& s = d->snpdata[snp_i];
346             s.tot_depth = VcfRecord::util::parse_gt_dp(gt_str, dp_index);
347             s.nt_depths = VcfRecord::util::parse_gt_ad(gt_str, ad_index, rec_alleles);
348             s.gq = VcfRecord::util::parse_gt_gq(gt_str, gq_index);
349             s.gtliks = VcfRecord::util::parse_gt_gl(gt_str, gl_index, rec_alleles);
350         } catch (exception&) {
351             cerr << "Error: At the " << (sample_vcf_i+1) << "th sample.\n";
352             throw;
353         }}
354     } catch (exception&) {
355         cerr << "Error: In record '" << rec.chrom() << ":" << rec.pos()+1 << "'.\n";
356         throw;
357     }}
358 
359     // Finalize the CSLocus (??).
360     // ==========
361     if (!cloc->snps.empty()) {
362         string hap;
363         for (size_t i = 0; i < mpopi->samples().size(); ++i) {
364             Datum* d = locdata[i];
365             if (d == NULL)
366                 continue;
367             assert(d->obshap.size() == 2);
368             if (strchr(d->obshap[0], 'N') == NULL) {
369                 assert(strchr(d->obshap[1], 'N') == NULL);
370                 hap = d->obshap[0];
371                 ++cloc->alleles[move(hap)];
372                 hap = d->obshap[1];
373                 ++cloc->alleles[move(hap)];
374             }
375         }
376         cloc->populate_alleles();
377     }
378 
379 } catch (exception&) {
380     cerr << "Error: Locus " << fasta_record.id << "\n"
381          << "Error: Bad GStacks files.\n";
382     throw;
383 }}
384 
385 template<class LocusT> bool
populate_external(LocusT * cloc,Datum ** locdata,int cloc_id,const VcfRecord & vcf_record,const VcfHeader & vcf_header,const MetaPopInfo * mpopi,const vector<size_t> & sample_vcf_i_to_mpopi_i)386 PopMap<LocusT>::populate_external(
387     LocusT* cloc,
388     Datum** locdata,
389     int cloc_id,
390     const VcfRecord& vcf_record,
391     const VcfHeader& vcf_header,
392     const MetaPopInfo* mpopi,
393     const vector<size_t>& sample_vcf_i_to_mpopi_i
394 ) { try {
395     assert(vcf_record.is_snp());
396     assert(sample_vcf_i_to_mpopi_i.size() == vcf_header.samples().size());
397     // We ignore the '*' allele & treat samples that have it as missing.
398     long upstream_del_allele = -1;
399     vector<Nt2> rec_alleles;
400     long allele_i = 0;
401 try {
402     for (auto a = vcf_record.begin_alleles();
403         a != vcf_record.end_alleles();
404         ++a, ++allele_i
405     ) {
406         if (**a == '*') {
407             if (upstream_del_allele != -1)
408                 throw exception();
409             upstream_del_allele = allele_i;
410             rec_alleles.push_back(Nt2::a);
411         } else {
412             if (!Nt4(**a).is_acgt())
413                 throw exception();
414             rec_alleles.push_back(Nt2(**a));
415         }
416     }
417     if (rec_alleles.size() > 4 + (upstream_del_allele != -1 ? 1 : 0))
418         throw exception();
419 } catch (exception&) {
420     cerr << "Warning: Illegal alleles.\n";
421     throw;
422 }
423     // CSLocus.
424     // ==========
425     cloc->id = cloc_id;
426     cloc->len = 1;
427     cloc->con = new char[2];
428     cloc->con[0] = *vcf_record.allele0();
429     cloc->con[1] = '\0';
430     cloc->loc.set(vcf_record.chrom(), vcf_record.pos(), strand_plus);
431     // Locus::snps
432     SNP* snp = new SNP();
433     cloc->snps.push_back(snp);
434     snp->type = snp_type_het;
435     snp->col = 0;
436     // Locus::rankX
437     allele_i = 0;
438     if (allele_i == upstream_del_allele)
439         ++allele_i;
440     snp->rank_1 = char(rec_alleles[allele_i]);
441     ++allele_i;
442     if (allele_i == upstream_del_allele)
443         ++allele_i;
444     assert(allele_i != long(rec_alleles.size()));
445     snp->rank_2 = char(rec_alleles[allele_i]);
446     ++allele_i;
447     if (allele_i == upstream_del_allele)
448         ++allele_i;
449     if (allele_i != long(rec_alleles.size())) {
450         snp->rank_3 = char(rec_alleles[allele_i]);
451         ++allele_i;
452         if (allele_i == upstream_del_allele)
453             ++allele_i;
454         if (allele_i != long(rec_alleles.size()))
455             snp->rank_4 = char(rec_alleles[allele_i]);
456     }
457 
458     // Genotypes.
459     // ==========
460     size_t dp_index = vcf_record.index_of_gt_subfield("DP");
461     size_t ad_index = vcf_record.index_of_gt_subfield("AD");
462     size_t gq_index = vcf_record.index_of_gt_subfield("GQ");
463     size_t gl_index = vcf_record.index_of_gt_subfield("GL");
464     VcfRecord::iterator gt_itr = vcf_record.begin_samples();
465     for (size_t sample_vcf_i = 0;
466         sample_vcf_i < vcf_header.samples().size();
467         ++gt_itr, ++sample_vcf_i
468     ) { try {
469         assert(gt_itr != vcf_record.end_samples());
470         // Check that the sample is present in the population map.
471         size_t sample_mpopi_i = sample_vcf_i_to_mpopi_i[sample_vcf_i];
472         if (sample_mpopi_i == SIZE_MAX)
473             continue;
474         else
475             assert(sample_mpopi_i < mpopi->samples().size());
476         // Check if the sample has data.
477         const char* gt_str = *gt_itr;
478         pair<long,long> gt = VcfRecord::util::parse_gt_gt(gt_str);
479         if (gt.first == -1)
480             continue;
481         else if (gt.first == upstream_del_allele || gt.second == upstream_del_allele)
482             continue;
483         // Fill the datum.
484         assert(locdata[sample_mpopi_i] == NULL);
485         ++cloc->cnt;
486         Datum* d = new Datum();
487         locdata[sample_mpopi_i] = d;
488         d->id = cloc->id;
489         d->len = 1;
490         d->model = new char[2];
491         d->model[0] = (gt.first == gt.second ? 'O' : 'E');
492         d->model[1] = '\0';
493         for (size_t i=0; i<2; ++i) {
494             d->obshap.push_back(new char[2]);
495             d->obshap.back()[0] = 'N';
496             d->obshap.back()[1] = '\0';
497         }
498         d->obshap[0][0] = char(rec_alleles.at(gt.first));
499         d->obshap[1][0] = char(rec_alleles.at(gt.second));
500         d->snpdata = vector<Datum::SNPData>(1);
501         Datum::SNPData& s = d->snpdata[0];
502         if (dp_index != SIZE_MAX)
503             s.tot_depth = VcfRecord::util::parse_gt_dp(gt_str, dp_index);
504         if (ad_index != SIZE_MAX)
505             s.nt_depths = VcfRecord::util::parse_gt_ad(gt_str, ad_index, rec_alleles);
506         if (gq_index != SIZE_MAX)
507             s.gq = VcfRecord::util::parse_gt_gq(gt_str, gq_index);
508         if (gl_index != SIZE_MAX)
509             s.gtliks = VcfRecord::util::parse_gt_gl(gt_str, gl_index, rec_alleles);
510     } catch (exception&) {
511         cerr << "Warning: Malformed sample field '" << *gt_itr << "'.\n";
512         throw;
513     }}
514 
515     // Finally, "populate alleles".
516     for (size_t i = 0; i < mpopi->samples().size(); ++i) {
517         Datum* d = locdata[i];
518         if (d == NULL)
519             continue;
520         assert(d->obshap.size() == 2);
521         assert(strchr(d->obshap[0], 'N') == NULL);
522         ++cloc->alleles[d->obshap[0]];
523         ++cloc->alleles[d->obshap[1]];
524     }
525     cloc->populate_alleles();
526     return true;
527 } catch (exception&) {
528     cerr << "Warning: Discarding VCF SNP record '"
529          << vcf_record.chrom() << ":" << (vcf_record.pos()+1) << "'.\n";
530     return false;
531 }}
532 
533 #endif // __POPMAP_H__
534