1 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
2 //
3 // Copyright 2011-2016, Julian Catchen <jcatchen@illinois.edu>
4 //
5 // This file is part of Stacks.
6 //
7 // Stacks is free software: you can redistribute it and/or modify
8 // it under the terms of the GNU General Public License as published by
9 // the Free Software Foundation, either version 3 of the License, or
10 // (at your option) any later version.
11 //
12 // Stacks is distributed in the hope that it will be useful,
13 // but WITHOUT ANY WARRANTY; without even the implied warranty of
14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 // GNU General Public License for more details.
16 //
17 // You should have received a copy of the GNU General Public License
18 // along with Stacks. If not, see <http://www.gnu.org/licenses/>.
19 //
20
21 #ifndef __POPMAP_H__
22 #define __POPMAP_H__
23
24 #include <exception>
25 #include <cstring>
26 #include <string>
27 #include <vector>
28 #include <map>
29 #include <set>
30 #include <numeric>
31 using std::accumulate;
32 #include <algorithm>
33 #include <utility>
34
35 #include "stacks.h"
36 #include "locus.h"
37 #include "aln_utils.h"
38 #include "MetaPopInfo.h"
39 #include "Vcf.h"
40
41 class Datum {
42 public:
43 struct SNPData {
44 size_t tot_depth;
45 Counts<Nt2> nt_depths;
46 uint8_t gq;
47 GtLiks gtliks;
SNPDataSNPData48 SNPData() : tot_depth(0), gq(UINT8_MAX) {}
49 };
50
51 int id; // Stack ID
52 int merge_partner; // Stack ID of merged datum, if this datum was merged/phased from two, overlapping datums.
53 int len; // Length of locus
54 bool corrected; // Has this genotype call been corrected
55 char *model; // String representing SNP model output for each nucleotide at this locus.
56 char *gtype; // Genotype
57 char *trans_gtype; // Translated Genotype
58 char *cigar; // CIGAR string describing how the datum aligns to the catalog locus.
59 vector<char *> obshap; // Observed Haplotypes
60 vector<SNPData> snpdata;
61
Datum()62 Datum() {
63 this->id = -1;
64 this->corrected = false;
65 this->gtype = NULL;
66 this->trans_gtype = NULL;
67 this->model = NULL;
68 this->cigar = NULL;
69 this->len = 0;
70 this->merge_partner = 0;
71 }
~Datum()72 ~Datum() {
73 for (uint i = 0; i < this->obshap.size(); i++)
74 delete [] this->obshap[i];
75 delete [] this->gtype;
76 delete [] this->trans_gtype;
77 delete [] this->model;
78 delete [] this->cigar;
79 }
80 };
81
82 template<class LocusT=Locus>
83 class PopMap {
84 const MetaPopInfo& metapopinfo;
85 int num_loci;
86 set<pair<int, int> > blacklist;
87 Datum ***data;
88 map<int, int> locus_order; // LocusID => ArrayIndex; map catalog IDs to their first dimension
89 // position in the Datum array.
90 map<int, int> rev_locus_order;
91 map<string, vector<LocusT *> > ordered_loci_; // Loci ordered by genomic position
92
93 public:
94 PopMap(const MetaPopInfo& mpopi, int n_loci);
95 ~PopMap();
96
97 // Populates the Locus & PopMap based on Stacks (v2) files.
98 static void populate_internal(
99 LocusT* cloc,
100 Datum** locdata,
101 const Seq& fasta_record,
102 const vector<VcfRecord>& vcf_records,
103 const VcfHeader& vcf_header,
104 const MetaPopInfo* mpopi,
105 const vector<size_t>& sample_vcf_i_to_mpopi_i
106 );
107 // Populates the Locus & PopMap based on external VCF files. Returns false
108 // and prints warnings if the parsing of the record fails.
109 static bool populate_external(
110 LocusT* cloc,
111 Datum** locdata,
112 int cloc_id,
113 const VcfRecord& vcf_record,
114 const VcfHeader& vcf_header,
115 const MetaPopInfo* mpopi,
116 const vector<size_t>& sample_vcf_i_to_mpopi_i
117 );
118
119 //
120 // Obtain indexes, etc.
121 //
loci_cnt()122 int loci_cnt() const { return this->num_loci; }
locus_index(int id)123 int locus_index(int id) const {return locus_order.at(id);}
rev_locus_index(int index)124 int rev_locus_index(int index) const {try {return rev_locus_order.at(index);} catch(exception&) {return -1;}}
125
sample_cnt()126 int sample_cnt() const { return metapopinfo.samples().size(); }
sample_index(int id)127 int sample_index(int id) const {try {return metapopinfo.get_sample_index(id);} catch (exception&) {return -1;}}
rev_sample_index(int index)128 int rev_sample_index(int index) const {return metapopinfo.samples().at(index).id;}
129
blacklisted(int loc_id,int sample_id)130 bool blacklisted(int loc_id, int sample_id) const {return blacklist.count({sample_id, loc_id});}
131
132 //
133 // Access the Datums.
134 //
locus(int id)135 Datum **locus(int id) { return this->data[this->locus_order[id]]; }
datum(int loc_id,int sample_id)136 Datum *datum(int loc_id, int sample_id) { return this->locus(loc_id)[metapopinfo.get_sample_index(sample_id)]; }
137 };
138
139 template<class LocusT>
PopMap(const MetaPopInfo & mpopi,int num_loci)140 PopMap<LocusT>::PopMap(const MetaPopInfo& mpopi, int num_loci): metapopinfo(mpopi)
141 {
142 this->data = new Datum **[num_loci];
143
144 for (int i = 0; i < num_loci; i++) {
145 this->data[i] = new Datum *[metapopinfo.samples().size()];
146
147 for (size_t j = 0; j < metapopinfo.samples().size(); j++)
148 this->data[i][j] = NULL;
149 }
150
151 this->num_loci = num_loci;
152 }
153
154 template<class LocusT>
~PopMap()155 PopMap<LocusT>::~PopMap() {
156 for (int i = 0; i < this->num_loci; i++) {
157 for (size_t j = 0; j < metapopinfo.samples().size(); j++)
158 delete this->data[i][j];
159 delete [] this->data[i];
160 }
161 delete [] this->data;
162 }
163
164 template<class LocusT>
populate_internal(LocusT * cloc,Datum ** locdata,const Seq & fasta_record,const vector<VcfRecord> & vcf_records,const VcfHeader & vcf_header,const MetaPopInfo * mpopi,const vector<size_t> & sample_vcf_i_to_mpopi_i)165 void PopMap<LocusT>::populate_internal(
166 LocusT* cloc,
167 Datum** locdata,
168 const Seq& fasta_record,
169 const vector<VcfRecord>& vcf_records,
170 const VcfHeader& vcf_header,
171 const MetaPopInfo* mpopi,
172 const vector<size_t>& sample_vcf_i_to_mpopi_i
173 ) { try {
174 assert(fasta_record.id != NULL);
175 assert(fasta_record.seq != NULL);
176 assert(!fasta_record.comment.empty());
177 assert(!vcf_records.empty());
178 assert(sample_vcf_i_to_mpopi_i.size() == vcf_header.samples().size());
179
180 // Parse the FASTA record.
181 // ==========
182 // Locus ID.
183 cloc->id = is_integer(fasta_record.id);
184 if (cloc->id < 0) {
185 cerr << "Error: Unable to parse locus ID.\n";
186 throw exception();
187 }
188 // Consensus sequence.
189 cloc->add_consensus(fasta_record.seq);
190 if (cloc->len < vcf_records.back().pos() + 1) {
191 cerr << "Error: Bad consensus length.\n";
192 throw exception();
193 }
194 // Locus position:
195 // If the analysis is reference-based, there will be a ' pos=...' field on
196 // the fasta ID line, placed as a comment adjacent to the locus ID.
197 // Overlap: if this data is de novo, there will potentially be an overlap
198 // length between the single and paired-end contigs.
199 assert(cloc->loc.empty());
200 const char *p, *q;
201 p = fasta_record.comment.c_str();
202 do {
203 q = p;
204 while (*q != '\0' && *q != ' ' && *q != '\t')
205 ++q;
206 if (strncmp(p, "pos=", 4) == 0) {
207 p += 4;
208 cloc->loc = PhyLoc(string(p, q));
209 } else if (strncmp(p, "contig=overlapped:", 18) == 0) {
210 p += 18;
211 cloc->overlap = is_integer(p);
212 cloc->pe_ctg = true;
213 } else if (strncmp(p, "contig=separate", 15) == 0) {
214 cloc->pe_ctg = true;
215 }
216 p = q;
217 while(*p != '\0' && (*p == ' ' || *p == '\t'))
218 ++p;
219 } while (*p != '\0');
220 if (cloc->loc.empty())
221 cloc->loc = PhyLoc("", 0, strand_plus);
222
223 // Parse the VCF records.
224 // ==========
225
226 // CSLocus.
227 // ----------
228 for (auto& rec : vcf_records) {
229 if (!rec.is_monomorphic()) {
230 SNP* snp = new SNP();
231 cloc->snps.push_back(snp);
232 snp->type = snp_type_het;
233 snp->col = rec.pos();
234 auto a = rec.begin_alleles();
235 snp->rank_1 = **a;
236 ++a;
237 assert(a != rec.end_alleles());
238 snp->rank_2 = **a;
239 ++a;
240 if (a != rec.end_alleles()) {
241 snp->rank_3 = **a;
242 ++a;
243 if (a != rec.end_alleles())
244 snp->rank_4 = **a;
245 }
246 }
247 }
248
249 // Genotypes.
250 // ----------
251 size_t snp_i = 0;
252 --snp_i;
253 vector<Nt2> rec_alleles;
254 size_t dp_index = SIZE_MAX;
255 size_t ad_index = SIZE_MAX;
256 size_t gq_index = SIZE_MAX;
257 size_t gl_index = SIZE_MAX;
258 for (auto& rec : vcf_records)
259 { try {
260 size_t col = rec.pos();
261 bool snp_rec = !rec.is_monomorphic();
262 if (snp_rec) {
263 if (!rec.is_snp()) {
264 cerr << "Error: Not a SNP.\n";
265 throw exception();
266 }
267 ++snp_i;
268 rec_alleles.clear();
269 for(auto a = rec.begin_alleles(); a != rec.end_alleles(); ++a)
270 rec_alleles.push_back(Nt2(**a));
271 if (rec.index_of_gt_subfield("PS") != 1)
272 throw exception();
273 dp_index = rec.index_of_gt_subfield("DP");
274 ad_index = rec.index_of_gt_subfield("AD");
275 gq_index = rec.index_of_gt_subfield("GQ");
276 gl_index = rec.index_of_gt_subfield("GL");
277 } else {
278 assert(rec.count_formats() == 1 && strcmp(rec.format0(),"DP")==0);
279 }
280 VcfRecord::iterator gt_itr = rec.begin_samples();
281 for (size_t sample_vcf_i = 0;
282 sample_vcf_i < vcf_header.samples().size();
283 ++gt_itr, ++sample_vcf_i
284 ) { try {
285 assert(gt_itr != rec.end_samples());
286 // Check that the sample is present in the population map.
287 size_t sample_mpopi_i = sample_vcf_i_to_mpopi_i[sample_vcf_i];
288 if (sample_mpopi_i == SIZE_MAX)
289 continue;
290 else
291 assert(sample_mpopi_i < mpopi->samples().size());
292 // Check if the sample has data.
293 const char* gt_str = *gt_itr;
294 if (gt_str[0] == '.')
295 continue;
296 Datum* d = locdata[sample_mpopi_i];
297 if (snp_rec) {
298 // Check that this isn't an unphased HET.
299 // N.B.: GStacks writes a single phase set per sample.
300 pair<long,long> gt = VcfRecord::util::parse_gt_gt(gt_str);
301 assert(gt.first >= 0);
302 if (gt.first != gt.second) {
303 const char* ps = strchr(gt_str, ':');
304 if (ps == NULL)
305 throw exception();
306 ++ps;
307 if (ps[0] == '.')
308 continue;
309 }
310 }
311 // This sample has data for this locus for this column.
312 if (d == NULL) {
313 ++cloc->cnt;
314 d = new Datum();
315 locdata[sample_mpopi_i] = d;
316 d->id = cloc->id;
317 d->len = cloc->len;
318 d->model = new char[cloc->len+1];
319 memset(d->model, 'U', cloc->len);
320 d->model[cloc->len] = '\0';
321 if (!cloc->snps.empty()) {
322 d->snpdata = vector<Datum::SNPData>(cloc->snps.size());
323 for (size_t i=0; i<2; ++i) {
324 char* hap = new char[cloc->snps.size()+1];
325 d->obshap.push_back(hap);
326 memset(hap, 'N', cloc->snps.size());
327 hap[cloc->snps.size()] = '\0';
328 }
329 } else {
330 d->obshap.push_back(new char[10]);
331 strncpy(d->obshap[0], "consensus", 10);
332 }
333 }
334 // Handle the non-SNP-record case.
335 if (!snp_rec) {
336 d->model[col] = 'O';
337 continue;
338 }
339 // SNP column.
340 pair<long,long> gt = VcfRecord::util::parse_gt_gt(gt_str);
341 d->model[col] = (gt.first == gt.second ? 'O' : 'E');
342 d->obshap[0][snp_i] = char(rec_alleles.at(gt.first));
343 d->obshap[1][snp_i] = char(rec_alleles.at(gt.second));
344 // Record additional information on the call.
345 Datum::SNPData& s = d->snpdata[snp_i];
346 s.tot_depth = VcfRecord::util::parse_gt_dp(gt_str, dp_index);
347 s.nt_depths = VcfRecord::util::parse_gt_ad(gt_str, ad_index, rec_alleles);
348 s.gq = VcfRecord::util::parse_gt_gq(gt_str, gq_index);
349 s.gtliks = VcfRecord::util::parse_gt_gl(gt_str, gl_index, rec_alleles);
350 } catch (exception&) {
351 cerr << "Error: At the " << (sample_vcf_i+1) << "th sample.\n";
352 throw;
353 }}
354 } catch (exception&) {
355 cerr << "Error: In record '" << rec.chrom() << ":" << rec.pos()+1 << "'.\n";
356 throw;
357 }}
358
359 // Finalize the CSLocus (??).
360 // ==========
361 if (!cloc->snps.empty()) {
362 string hap;
363 for (size_t i = 0; i < mpopi->samples().size(); ++i) {
364 Datum* d = locdata[i];
365 if (d == NULL)
366 continue;
367 assert(d->obshap.size() == 2);
368 if (strchr(d->obshap[0], 'N') == NULL) {
369 assert(strchr(d->obshap[1], 'N') == NULL);
370 hap = d->obshap[0];
371 ++cloc->alleles[move(hap)];
372 hap = d->obshap[1];
373 ++cloc->alleles[move(hap)];
374 }
375 }
376 cloc->populate_alleles();
377 }
378
379 } catch (exception&) {
380 cerr << "Error: Locus " << fasta_record.id << "\n"
381 << "Error: Bad GStacks files.\n";
382 throw;
383 }}
384
385 template<class LocusT> bool
populate_external(LocusT * cloc,Datum ** locdata,int cloc_id,const VcfRecord & vcf_record,const VcfHeader & vcf_header,const MetaPopInfo * mpopi,const vector<size_t> & sample_vcf_i_to_mpopi_i)386 PopMap<LocusT>::populate_external(
387 LocusT* cloc,
388 Datum** locdata,
389 int cloc_id,
390 const VcfRecord& vcf_record,
391 const VcfHeader& vcf_header,
392 const MetaPopInfo* mpopi,
393 const vector<size_t>& sample_vcf_i_to_mpopi_i
394 ) { try {
395 assert(vcf_record.is_snp());
396 assert(sample_vcf_i_to_mpopi_i.size() == vcf_header.samples().size());
397 // We ignore the '*' allele & treat samples that have it as missing.
398 long upstream_del_allele = -1;
399 vector<Nt2> rec_alleles;
400 long allele_i = 0;
401 try {
402 for (auto a = vcf_record.begin_alleles();
403 a != vcf_record.end_alleles();
404 ++a, ++allele_i
405 ) {
406 if (**a == '*') {
407 if (upstream_del_allele != -1)
408 throw exception();
409 upstream_del_allele = allele_i;
410 rec_alleles.push_back(Nt2::a);
411 } else {
412 if (!Nt4(**a).is_acgt())
413 throw exception();
414 rec_alleles.push_back(Nt2(**a));
415 }
416 }
417 if (rec_alleles.size() > 4 + (upstream_del_allele != -1 ? 1 : 0))
418 throw exception();
419 } catch (exception&) {
420 cerr << "Warning: Illegal alleles.\n";
421 throw;
422 }
423 // CSLocus.
424 // ==========
425 cloc->id = cloc_id;
426 cloc->len = 1;
427 cloc->con = new char[2];
428 cloc->con[0] = *vcf_record.allele0();
429 cloc->con[1] = '\0';
430 cloc->loc.set(vcf_record.chrom(), vcf_record.pos(), strand_plus);
431 // Locus::snps
432 SNP* snp = new SNP();
433 cloc->snps.push_back(snp);
434 snp->type = snp_type_het;
435 snp->col = 0;
436 // Locus::rankX
437 allele_i = 0;
438 if (allele_i == upstream_del_allele)
439 ++allele_i;
440 snp->rank_1 = char(rec_alleles[allele_i]);
441 ++allele_i;
442 if (allele_i == upstream_del_allele)
443 ++allele_i;
444 assert(allele_i != long(rec_alleles.size()));
445 snp->rank_2 = char(rec_alleles[allele_i]);
446 ++allele_i;
447 if (allele_i == upstream_del_allele)
448 ++allele_i;
449 if (allele_i != long(rec_alleles.size())) {
450 snp->rank_3 = char(rec_alleles[allele_i]);
451 ++allele_i;
452 if (allele_i == upstream_del_allele)
453 ++allele_i;
454 if (allele_i != long(rec_alleles.size()))
455 snp->rank_4 = char(rec_alleles[allele_i]);
456 }
457
458 // Genotypes.
459 // ==========
460 size_t dp_index = vcf_record.index_of_gt_subfield("DP");
461 size_t ad_index = vcf_record.index_of_gt_subfield("AD");
462 size_t gq_index = vcf_record.index_of_gt_subfield("GQ");
463 size_t gl_index = vcf_record.index_of_gt_subfield("GL");
464 VcfRecord::iterator gt_itr = vcf_record.begin_samples();
465 for (size_t sample_vcf_i = 0;
466 sample_vcf_i < vcf_header.samples().size();
467 ++gt_itr, ++sample_vcf_i
468 ) { try {
469 assert(gt_itr != vcf_record.end_samples());
470 // Check that the sample is present in the population map.
471 size_t sample_mpopi_i = sample_vcf_i_to_mpopi_i[sample_vcf_i];
472 if (sample_mpopi_i == SIZE_MAX)
473 continue;
474 else
475 assert(sample_mpopi_i < mpopi->samples().size());
476 // Check if the sample has data.
477 const char* gt_str = *gt_itr;
478 pair<long,long> gt = VcfRecord::util::parse_gt_gt(gt_str);
479 if (gt.first == -1)
480 continue;
481 else if (gt.first == upstream_del_allele || gt.second == upstream_del_allele)
482 continue;
483 // Fill the datum.
484 assert(locdata[sample_mpopi_i] == NULL);
485 ++cloc->cnt;
486 Datum* d = new Datum();
487 locdata[sample_mpopi_i] = d;
488 d->id = cloc->id;
489 d->len = 1;
490 d->model = new char[2];
491 d->model[0] = (gt.first == gt.second ? 'O' : 'E');
492 d->model[1] = '\0';
493 for (size_t i=0; i<2; ++i) {
494 d->obshap.push_back(new char[2]);
495 d->obshap.back()[0] = 'N';
496 d->obshap.back()[1] = '\0';
497 }
498 d->obshap[0][0] = char(rec_alleles.at(gt.first));
499 d->obshap[1][0] = char(rec_alleles.at(gt.second));
500 d->snpdata = vector<Datum::SNPData>(1);
501 Datum::SNPData& s = d->snpdata[0];
502 if (dp_index != SIZE_MAX)
503 s.tot_depth = VcfRecord::util::parse_gt_dp(gt_str, dp_index);
504 if (ad_index != SIZE_MAX)
505 s.nt_depths = VcfRecord::util::parse_gt_ad(gt_str, ad_index, rec_alleles);
506 if (gq_index != SIZE_MAX)
507 s.gq = VcfRecord::util::parse_gt_gq(gt_str, gq_index);
508 if (gl_index != SIZE_MAX)
509 s.gtliks = VcfRecord::util::parse_gt_gl(gt_str, gl_index, rec_alleles);
510 } catch (exception&) {
511 cerr << "Warning: Malformed sample field '" << *gt_itr << "'.\n";
512 throw;
513 }}
514
515 // Finally, "populate alleles".
516 for (size_t i = 0; i < mpopi->samples().size(); ++i) {
517 Datum* d = locdata[i];
518 if (d == NULL)
519 continue;
520 assert(d->obshap.size() == 2);
521 assert(strchr(d->obshap[0], 'N') == NULL);
522 ++cloc->alleles[d->obshap[0]];
523 ++cloc->alleles[d->obshap[1]];
524 }
525 cloc->populate_alleles();
526 return true;
527 } catch (exception&) {
528 cerr << "Warning: Discarding VCF SNP record '"
529 << vcf_record.chrom() << ":" << (vcf_record.pos()+1) << "'.\n";
530 return false;
531 }}
532
533 #endif // __POPMAP_H__
534