1 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
2 //
3 // Copyright 2013, Julian Catchen <jcatchen@uoregon.edu>
4 //
5 // This file is part of Stacks.
6 //
7 // Stacks is free software: you can redistribute it and/or modify
8 // it under the terms of the GNU General Public License as published by
9 // the Free Software Foundation, either version 3 of the License, or
10 // (at your option) any later version.
11 //
12 // Stacks is distributed in the hope that it will be useful,
13 // but WITHOUT ANY WARRANTY; without even the implied warranty of
14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 // GNU General Public License for more details.
16 //
17 // You should have received a copy of the GNU General Public License
18 // along with Stacks. If not, see <http://www.gnu.org/licenses/>.
19 //
20
21 #ifndef __LOCUS_H__
22 #define __LOCUS_H__
23
24 #include <cstring>
25 #include <string>
26 #include <vector>
27 #include <map>
28 #include <algorithm>
29 #include <utility>
30
31 #include "constants.h"
32 #include "stacks.h"
33 #include "MetaPopInfo.h"
34 #include "Alignment.h"
35 #include "aln_utils.h"
36
37 class Match {
38 public:
39 uint cat_id;
40 allele_type cat_type;
41 allele_type query_type;
42 string cigar;
43 uint dist;
44
Match()45 Match() {
46 this->cat_id = 0;
47 this->dist = 0;
48 };
Match(const Match & other)49 Match(const Match &other) {
50 this->cat_id = other.cat_id;
51 this->cat_type = other.cat_type;
52 this->query_type = other.query_type;
53 this->cigar = other.cigar;
54 this->dist = other.dist;
55 };
56 };
57
58 class Locus;
59 int adjust_snps_for_gaps(Cigar&, Locus*);
60 int adjust_and_add_snps_for_gaps(Cigar&, Locus*);
61 int remove_snps_from_gaps(Cigar&, Locus*);
62
63 class Locus {
64 public:
65 int id; // Locus ID
66 int sample_id; // Sample ID
67 int depth; // Stack depth
68 char *con; // Consensus sequence
69 char *model; // Model calls for each nucleotide
70 uint len; // Sequence length
71
72 //
73 // Flags
74 //
75 bool blacklisted;
76 bool deleveraged;
77 bool lumberjackstack;
78
79 vector<char *> comp; // Raw components in this stack.
80 vector<char *> reads; // Sequence reads contributing to this locus.
81 vector<uint> comp_cnt; // Counter for internal stacks merged into this locus.
82 vector<read_type> comp_type; // Read types for reads contributing to this locus.
83 PhyLoc loc; // Physical genome location of this stack.
84 vector<SNP *> snps; // Single Nucleotide Polymorphisms in this stack.
85 map<string, int> alleles; // Map of the allelic configuration of SNPs in this stack along with the count of each
86 vector<pair<allele_type, string>> strings; // Strings for matching (representing the various allele combinations)
87
Locus()88 Locus() {
89 id = 0;
90 sample_id = 0;
91 depth = 0;
92 model = NULL;
93 con = NULL;
94 len = 0;
95 blacklisted = false;
96 deleveraged = false;
97 lumberjackstack = false;
98 }
99 Locus(const Locus &other);
~Locus()100 virtual ~Locus() {
101 delete [] con;
102 delete [] model;
103 for (uint i = 0; i < snps.size(); i++)
104 delete snps[i];
105 for (uint i = 0; i < comp.size(); i++)
106 delete [] comp[i];
107 for (uint i = 0; i < reads.size(); i++)
108 delete [] reads[i];
109 }
sort_bp()110 uint sort_bp() const {return this->loc.bp;}
sort_bp(uint col0)111 uint sort_bp(uint col0) const {return this->loc.bp + (this->loc.strand==strand_plus ? col0 : -col0);}
112 int snp_index(uint) const;
113 int add_consensus(const char *);
114 int add_model(const char *);
115 virtual int populate_alleles();
116 };
117
118 //
119 // Query Locus Class
120 //
121 class QLocus : public Locus {
122 public:
123 vector<Match *> matches; // Matching tags found for the catalog.
124
QLocus()125 QLocus(): Locus() {}
126 QLocus(const QLocus &other);
127 ~QLocus();
128
129 int add_match(int, allele_type, allele_type, int, string);
130 int add_match(int, allele_type, allele_type, int);
131 int add_match(int, allele_type);
132 int clear_matches();
133 };
134
135 //
136 // Catalog Locus Class, for use in cstacks, contains catalog loci and records the
137 // constiuent tags this locus was built from.
138 //
139 class CLocus : public Locus {
140 public:
141 vector<pair<int, int> > sources; // Sample/ID pairs for the sources contributing to this catalog entry
142 uint match_cnt;
143
CLocus()144 CLocus() : Locus() {
145 this->match_cnt = 0;
146 };
147
148 int merge_snps(Locus *);
149 int reduce_alleles(set<string> &);
150 };
151
152 //
153 // Catalog Summary Locus Class; used in genotypes and populations, records a catalog
154 // locus with summary information derived from individuals in the population.
155 //
156 class CSLocus : public Locus {
157 public:
CSLocus()158 CSLocus() : Locus() {
159 this->f = 0.0;
160 this->chisq = 1.0;
161 this->cnt = 0;
162 this->gcnt = 0;
163 this->trans_gcnt = 0;
164 this->pe_ctg = false;
165 this->overlap = 0;
166 };
167 string annotation;
168 string marker;
169 string uncor_marker;
170 map<string, int> hap_cnts; // Counts of each observed haplotype for this locus in the population.
171 map<string, string> gmap; // Observed haplotype to genotype map for this locus.
172 bool pe_ctg; // Was a paired-end contig constructed for this locus?
173 uint16_t overlap; // Size of overlap between single and paired-end contigs for this locus.
174 uint16_t cnt; // Number of samples containing data for this locus.
175 uint16_t gcnt; // Number of progeny containing a valid genotype.
176 uint16_t trans_gcnt; // Number of progeny containing a valid
177 // genotype, translated for a particular map type.
178 double f; // Inbreeder's coefficient
179 double chisq; // Chi squared p-value testing the null hypothesis of no segregation distortion.
180 };
181
182 // SRead: a Read belonging to a Sample.
183 struct SRead : Read {
184 size_t sample; // index in MetaPopInfo::samples_
SReadSRead185 SRead() : Read(), sample(SIZE_MAX) {}
SReadSRead186 SRead(Read&& r, size_t spl) : Read(move(r)), sample(spl) {}
187 };
188
189 struct SAlnRead : AlnRead {
190 size_t sample; // index in MetaPopInfo::samples_
SAlnReadSAlnRead191 SAlnRead() : AlnRead(), sample(SIZE_MAX) {}
SAlnReadSAlnRead192 SAlnRead(AlnRead&& r, size_t spl) : AlnRead(move(r)), sample(spl) {}
SAlnReadSAlnRead193 SAlnRead(Read&& r, Cigar&& c, size_t spl) : AlnRead(move(r), move(c)), sample(spl) {}
194 };
195
196 class CLocReadSet {
197 const MetaPopInfo* mpopi_;
198 size_t bam_i_; // BAM target index (SIZE_MAX if unavailable).
199 int id_; // Catalog locus ID
200 PhyLoc aln_pos_;
201 vector<SRead> reads_; // Forward reads. Order is arbitrary.
202 vector<SRead> pe_reads_; // Paired-end reads. Order and size are arbitrary.
203
204 public:
CLocReadSet(const MetaPopInfo & mpopi)205 CLocReadSet(const MetaPopInfo& mpopi)
206 : mpopi_(&mpopi), bam_i_(SIZE_MAX), id_(-1), aln_pos_(), reads_(), pe_reads_()
207 {}
208
mpopi()209 const MetaPopInfo& mpopi() const {return *mpopi_;}
bam_i()210 size_t bam_i() const {return bam_i_;}
id()211 int id() const {return id_;}
pos()212 const PhyLoc& pos() const {return aln_pos_;}
reads()213 const vector<SRead>& reads() const {return reads_;}
reads()214 vector<SRead>& reads() {return reads_;}
pe_reads()215 const vector<SRead>& pe_reads() const {return pe_reads_;}
pe_reads()216 vector<SRead>& pe_reads() {return pe_reads_;}
217
clear()218 void clear() {bam_i_=SIZE_MAX; id_=-1; reads_.clear(); pe_reads_.clear();}
bam_i(size_t i)219 void bam_i(size_t i) {bam_i_ = i;}
id(int id)220 void id(int id) {id_ = id;}
pos(const PhyLoc & p)221 void pos(const PhyLoc& p) {aln_pos_ = p;}
add(SRead && r)222 void add(SRead&& r) {reads_.push_back(move(r));}
add_pe(SRead && r)223 void add_pe(SRead&& r) {pe_reads_.push_back(move(r));}
224 size_t n_samples() const;
225 };
226
227 class CLocAlnSet {
228 int id_; // Catalog locus ID
229 PhyLoc aln_pos_;
230 DNASeq4 ref_;
231 const MetaPopInfo* mpopi_;
232 vector<SAlnRead> reads_;
233 vector<vector<size_t>> reads_per_sample_; // `at(sample)` is a vector of indexes in `reads_`.
234
235 public:
CLocAlnSet()236 CLocAlnSet()
237 : id_(-1), mpopi_(NULL)
238 {}
239 CLocAlnSet(CLocAlnSet&&) = default;
240 CLocAlnSet& operator= (CLocAlnSet&&) = default;
241
mpopi()242 const MetaPopInfo& mpopi() const {return *mpopi_;}
id()243 int id() const {return id_;}
pos()244 const PhyLoc& pos() const {return aln_pos_;}
ref()245 const DNASeq4& ref() const {return ref_;}
reads()246 const vector<SAlnRead>& reads() const {return reads_;}
sample_reads(size_t sample)247 const vector<size_t>& sample_reads(size_t sample) const {return reads_per_sample_.at(sample);}
248
249 void clear();
250 void reinit(int id, const PhyLoc& aln_pos, const MetaPopInfo* mpopi);
ref(DNASeq4 && s)251 void ref(DNASeq4&& s) {ref_ = move(s);}
252 void add(SAlnRead&& r);
253
254 void recompute_consensus();
255 void sort_by_read_name();
256 void sort_by_alignment_offset();
257 void hard_clip_right_Ns();
258 void merge_paired_reads();
259 void remove_unmerged_reads(ostream* log = NULL);
260 void remove_pcr_duplicates(vector<size_t>* clone_size_distrib = NULL, ostream* log = NULL);
n_samples()261 size_t n_samples() const {size_t n=0; for(auto& reads : reads_per_sample_) if (!reads.empty()) ++n; return n;}
262
263 friend ostream& operator<< (ostream& os, const CLocAlnSet& loc);
264
265 static CLocAlnSet juxtapose(CLocAlnSet&& left, CLocAlnSet&& right, long offset);
266
267 //
268 // Class to iterate over sites.
269 //
270 class site_iterator {
271
272 const CLocAlnSet& loc_aln_;
273 DNASeq4::iterator ref_it_;
274 DNASeq4::iterator ref_past_;
275 vector<Alignment::iterator> its_;
276 size_t col_;
277
278 public:
279 // Iteration methods.
site_iterator(const CLocAlnSet & loc_aln)280 site_iterator(const CLocAlnSet& loc_aln)
281 : loc_aln_(loc_aln),
282 ref_it_(loc_aln.ref().begin()),
283 ref_past_(loc_aln.ref().end()),
284 col_(0)
285 {
286 its_.reserve(loc_aln.reads().size());
287 for (const SAlnRead& r: loc_aln.reads())
288 its_.push_back(Alignment::iterator(r.aln()));
289 }
290 operator bool () const {return ref_it_ != ref_past_;}
291 site_iterator& operator++ ();
292
293 // Site interface.
ref_nt()294 Nt4 ref_nt() const {return *ref_it_;} // Get the contig nt4.
295 void counts(Counts<Nt4>& counts) const; // Get the nt counts across all samples.
296 void counts(Counts<Nt4>& counts, size_t sample) const; // Get the nt counts for a given sample.
297 SiteCounts counts() const;
col()298 size_t col() const {return col_;}
299
mpopi()300 const MetaPopInfo& mpopi() const {return loc_aln_.mpopi();}
301 };
302 };
303
304 //
305 // ==================
306 // Inline definitions
307 // ==================
308 //
309
310 inline
add(SAlnRead && r)311 void CLocAlnSet::add(SAlnRead&& r) {
312 assert(cigar_length_ref(r.cigar) == ref_.length());
313 reads_per_sample_.at(r.sample).push_back(reads_.size());
314 reads_.push_back(move(r));
315 }
316
317 inline
318 CLocAlnSet::site_iterator& CLocAlnSet::site_iterator::operator++ () {
319 ++ref_it_;
320 for (auto& it: its_)
321 ++it;
322
323 #ifdef DEBUG
324 if (! (ref_it_ != ref_past_)) {
325 // Make sure we've reached the end of the alignment for every read.
326 for (auto& it: its_)
327 assert(!bool(it));
328 }
329 #endif
330
331 ++col_;
332 return *this;
333 }
334
335 inline
counts(Counts<Nt4> & counts)336 void CLocAlnSet::site_iterator::counts(Counts<Nt4>& counts) const {
337 counts.clear();
338 for (auto& read: its_)
339 counts.increment(*read);
340 }
341
342 inline
counts(Counts<Nt4> & counts,size_t sample)343 void CLocAlnSet::site_iterator::counts(Counts<Nt4>& counts, size_t sample) const {
344 counts.clear();
345 for (size_t read_i : loc_aln_.sample_reads(sample))
346 counts.increment(*its_[read_i]);
347 }
348
349 inline
counts()350 SiteCounts CLocAlnSet::site_iterator::counts() const {
351 SiteCounts cnts;
352 cnts.mpopi = &mpopi();
353
354 cnts.samples.reserve(mpopi().samples().size());
355 Counts<Nt4> tmp;
356 for (size_t sample=0; sample<mpopi().samples().size(); ++sample) {
357 counts(tmp, sample); //this->counts()
358 cnts.samples.push_back(Counts<Nt2>(tmp));
359 cnts.tot += cnts.samples.back();
360 }
361 return cnts;
362 }
363
364 #endif // __LOCUS_H__
365