1 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
2 //
3 // Copyright 2013, Julian Catchen <jcatchen@uoregon.edu>
4 //
5 // This file is part of Stacks.
6 //
7 // Stacks is free software: you can redistribute it and/or modify
8 // it under the terms of the GNU General Public License as published by
9 // the Free Software Foundation, either version 3 of the License, or
10 // (at your option) any later version.
11 //
12 // Stacks is distributed in the hope that it will be useful,
13 // but WITHOUT ANY WARRANTY; without even the implied warranty of
14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 // GNU General Public License for more details.
16 //
17 // You should have received a copy of the GNU General Public License
18 // along with Stacks.  If not, see <http://www.gnu.org/licenses/>.
19 //
20 
21 #ifndef __LOCUS_H__
22 #define __LOCUS_H__
23 
24 #include <cstring>
25 #include <string>
26 #include <vector>
27 #include <map>
28 #include <algorithm>
29 #include <utility>
30 
31 #include "constants.h"
32 #include "stacks.h"
33 #include "MetaPopInfo.h"
34 #include "Alignment.h"
35 #include "aln_utils.h"
36 
37 class Match {
38  public:
39     uint        cat_id;
40     allele_type cat_type;
41     allele_type query_type;
42     string      cigar;
43     uint        dist;
44 
Match()45     Match() {
46         this->cat_id = 0;
47         this->dist   = 0;
48     };
Match(const Match & other)49     Match(const Match &other) {
50         this->cat_id     = other.cat_id;
51         this->cat_type   = other.cat_type;
52         this->query_type = other.query_type;
53         this->cigar      = other.cigar;
54         this->dist       = other.dist;
55     };
56 };
57 
58 class Locus;
59 int  adjust_snps_for_gaps(Cigar&, Locus*);
60 int  adjust_and_add_snps_for_gaps(Cigar&, Locus*);
61 int  remove_snps_from_gaps(Cigar&, Locus*);
62 
63 class Locus {
64  public:
65     int          id; // Locus ID
66     int   sample_id; // Sample ID
67     int       depth; // Stack depth
68     char       *con; // Consensus sequence
69     char     *model; // Model calls for each nucleotide
70     uint        len; // Sequence length
71 
72     //
73     // Flags
74     //
75     bool blacklisted;
76     bool deleveraged;
77     bool lumberjackstack;
78 
79     vector<char *>          comp;   // Raw components in this stack.
80     vector<char *>         reads;   // Sequence reads contributing to this locus.
81     vector<uint>        comp_cnt;   // Counter for internal stacks merged into this locus.
82     vector<read_type>  comp_type;   // Read types for reads contributing to this locus.
83     PhyLoc                   loc;   // Physical genome location of this stack.
84     vector<SNP *>           snps;   // Single Nucleotide Polymorphisms in this stack.
85     map<string, int>     alleles;   // Map of the allelic configuration of SNPs in this stack along with the count of each
86     vector<pair<allele_type, string>> strings; // Strings for matching (representing the various allele combinations)
87 
Locus()88     Locus()  {
89         id              = 0;
90         sample_id       = 0;
91         depth           = 0;
92         model           = NULL;
93         con             = NULL;
94         len             = 0;
95         blacklisted     = false;
96         deleveraged     = false;
97         lumberjackstack = false;
98     }
99     Locus(const Locus &other);
~Locus()100     virtual ~Locus() {
101         delete [] con;
102         delete [] model;
103         for (uint i = 0; i < snps.size(); i++)
104             delete snps[i];
105         for (uint i = 0; i < comp.size(); i++)
106             delete [] comp[i];
107         for (uint i = 0; i < reads.size(); i++)
108             delete [] reads[i];
109     }
sort_bp()110     uint sort_bp() const {return this->loc.bp;}
sort_bp(uint col0)111     uint sort_bp(uint col0) const {return this->loc.bp + (this->loc.strand==strand_plus ? col0 : -col0);}
112     int snp_index(uint) const;
113     int add_consensus(const char *);
114     int add_model(const char *);
115     virtual int populate_alleles();
116 };
117 
118 //
119 // Query Locus Class
120 //
121 class QLocus : public Locus {
122  public:
123     vector<Match *> matches;   // Matching tags found for the catalog.
124 
QLocus()125     QLocus(): Locus() {}
126     QLocus(const QLocus &other);
127     ~QLocus();
128 
129     int add_match(int, allele_type, allele_type, int, string);
130     int add_match(int, allele_type, allele_type, int);
131     int add_match(int, allele_type);
132     int clear_matches();
133 };
134 
135 //
136 // Catalog Locus Class, for use in cstacks, contains catalog loci and records the
137 // constiuent tags this locus was built from.
138 //
139 class CLocus : public Locus {
140  public:
141     vector<pair<int, int> > sources;   // Sample/ID pairs for the sources contributing to this catalog entry
142     uint match_cnt;
143 
CLocus()144     CLocus() : Locus() {
145         this->match_cnt = 0;
146     };
147 
148     int merge_snps(Locus *);
149     int reduce_alleles(set<string> &);
150 };
151 
152 //
153 // Catalog Summary Locus Class; used in genotypes and populations, records a catalog
154 // locus with summary information derived from individuals in the population.
155 //
156 class CSLocus : public Locus {
157 public:
CSLocus()158     CSLocus() : Locus() {
159         this->f              = 0.0;
160         this->chisq          = 1.0;
161         this->cnt            = 0;
162         this->gcnt           = 0;
163         this->trans_gcnt     = 0;
164         this->pe_ctg         = false;
165         this->overlap        = 0;
166     };
167     string annotation;
168     string marker;
169     string uncor_marker;
170     map<string, int>    hap_cnts; // Counts of each observed haplotype for this locus in the population.
171     map<string, string> gmap;     // Observed haplotype to genotype map for this locus.
172     bool     pe_ctg;              // Was a paired-end contig constructed for this locus?
173     uint16_t overlap;             // Size of overlap between single and paired-end contigs for this locus.
174     uint16_t cnt;                 // Number of samples containing data for this locus.
175     uint16_t gcnt;                // Number of progeny containing a valid genotype.
176     uint16_t trans_gcnt;          // Number of progeny containing a valid
177                                   //   genotype, translated for a particular map type.
178     double f;                     // Inbreeder's coefficient
179     double chisq;                 // Chi squared p-value testing the null hypothesis of no segregation distortion.
180 };
181 
182 // SRead: a Read belonging to a Sample.
183 struct SRead : Read {
184     size_t sample; // index in MetaPopInfo::samples_
SReadSRead185     SRead() : Read(), sample(SIZE_MAX) {}
SReadSRead186     SRead(Read&& r, size_t spl) : Read(move(r)), sample(spl) {}
187 };
188 
189 struct SAlnRead : AlnRead {
190     size_t sample; // index in MetaPopInfo::samples_
SAlnReadSAlnRead191     SAlnRead() : AlnRead(), sample(SIZE_MAX) {}
SAlnReadSAlnRead192     SAlnRead(AlnRead&& r, size_t spl) : AlnRead(move(r)), sample(spl) {}
SAlnReadSAlnRead193     SAlnRead(Read&& r, Cigar&& c, size_t spl) : AlnRead(move(r), move(c)), sample(spl) {}
194 };
195 
196 class CLocReadSet {
197     const MetaPopInfo* mpopi_;
198     size_t bam_i_; // BAM target index (SIZE_MAX if unavailable).
199     int id_; // Catalog locus ID
200     PhyLoc aln_pos_;
201     vector<SRead> reads_; // Forward reads. Order is arbitrary.
202     vector<SRead> pe_reads_; // Paired-end reads. Order and size are arbitrary.
203 
204 public:
CLocReadSet(const MetaPopInfo & mpopi)205     CLocReadSet(const MetaPopInfo& mpopi)
206         : mpopi_(&mpopi), bam_i_(SIZE_MAX), id_(-1), aln_pos_(), reads_(), pe_reads_()
207         {}
208 
mpopi()209     const MetaPopInfo& mpopi() const {return *mpopi_;}
bam_i()210     size_t bam_i() const {return bam_i_;}
id()211     int id() const {return id_;}
pos()212     const PhyLoc& pos() const {return aln_pos_;}
reads()213     const vector<SRead>& reads() const {return reads_;}
reads()214           vector<SRead>& reads()       {return reads_;}
pe_reads()215     const vector<SRead>& pe_reads() const {return pe_reads_;}
pe_reads()216           vector<SRead>& pe_reads()       {return pe_reads_;}
217 
clear()218     void clear() {bam_i_=SIZE_MAX; id_=-1; reads_.clear(); pe_reads_.clear();}
bam_i(size_t i)219     void bam_i(size_t i) {bam_i_ = i;}
id(int id)220     void id(int id) {id_ = id;}
pos(const PhyLoc & p)221     void pos(const PhyLoc& p) {aln_pos_ = p;}
add(SRead && r)222     void add(SRead&& r) {reads_.push_back(move(r));}
add_pe(SRead && r)223     void add_pe(SRead&& r) {pe_reads_.push_back(move(r));}
224     size_t n_samples() const;
225 };
226 
227 class CLocAlnSet {
228     int id_; // Catalog locus ID
229     PhyLoc aln_pos_;
230     DNASeq4 ref_;
231     const MetaPopInfo* mpopi_;
232     vector<SAlnRead> reads_;
233     vector<vector<size_t>> reads_per_sample_; // `at(sample)` is a vector of indexes in `reads_`.
234 
235 public:
CLocAlnSet()236     CLocAlnSet()
237         : id_(-1), mpopi_(NULL)
238         {}
239     CLocAlnSet(CLocAlnSet&&) = default;
240     CLocAlnSet& operator= (CLocAlnSet&&) = default;
241 
mpopi()242     const MetaPopInfo& mpopi() const {return *mpopi_;}
id()243     int id() const {return id_;}
pos()244     const PhyLoc& pos() const {return aln_pos_;}
ref()245     const DNASeq4& ref() const {return ref_;}
reads()246     const vector<SAlnRead>& reads() const {return reads_;}
sample_reads(size_t sample)247     const vector<size_t>& sample_reads(size_t sample) const {return reads_per_sample_.at(sample);}
248 
249     void clear();
250     void reinit(int id, const PhyLoc& aln_pos, const MetaPopInfo* mpopi);
ref(DNASeq4 && s)251     void ref(DNASeq4&& s) {ref_ = move(s);}
252     void add(SAlnRead&& r);
253 
254     void recompute_consensus();
255     void sort_by_read_name();
256     void sort_by_alignment_offset();
257     void hard_clip_right_Ns();
258     void merge_paired_reads();
259     void remove_unmerged_reads(ostream* log = NULL);
260     void remove_pcr_duplicates(vector<size_t>* clone_size_distrib = NULL, ostream* log = NULL);
n_samples()261     size_t n_samples() const {size_t n=0; for(auto& reads : reads_per_sample_) if (!reads.empty()) ++n; return n;}
262 
263     friend ostream& operator<< (ostream& os, const CLocAlnSet& loc);
264 
265     static CLocAlnSet juxtapose(CLocAlnSet&& left, CLocAlnSet&& right, long offset);
266 
267     //
268     // Class to iterate over sites.
269     //
270     class site_iterator {
271 
272         const CLocAlnSet& loc_aln_;
273         DNASeq4::iterator ref_it_;
274         DNASeq4::iterator ref_past_;
275         vector<Alignment::iterator> its_;
276         size_t col_;
277 
278     public:
279         // Iteration methods.
site_iterator(const CLocAlnSet & loc_aln)280         site_iterator(const CLocAlnSet& loc_aln)
281                 : loc_aln_(loc_aln),
282                 ref_it_(loc_aln.ref().begin()),
283                 ref_past_(loc_aln.ref().end()),
284                 col_(0)
285                 {
286             its_.reserve(loc_aln.reads().size());
287             for (const SAlnRead& r: loc_aln.reads())
288                 its_.push_back(Alignment::iterator(r.aln()));
289         }
290         operator bool () const {return ref_it_ != ref_past_;}
291         site_iterator& operator++ ();
292 
293         // Site interface.
ref_nt()294         Nt4 ref_nt() const {return *ref_it_;} // Get the contig nt4.
295         void counts(Counts<Nt4>& counts) const; // Get the nt counts across all samples.
296         void counts(Counts<Nt4>& counts, size_t sample) const; // Get the nt counts for a given sample.
297         SiteCounts counts() const;
col()298         size_t col() const {return col_;}
299 
mpopi()300         const MetaPopInfo& mpopi() const {return loc_aln_.mpopi();}
301     };
302 };
303 
304 //
305 // ==================
306 // Inline definitions
307 // ==================
308 //
309 
310 inline
add(SAlnRead && r)311 void CLocAlnSet::add(SAlnRead&& r) {
312     assert(cigar_length_ref(r.cigar) == ref_.length());
313     reads_per_sample_.at(r.sample).push_back(reads_.size());
314     reads_.push_back(move(r));
315 }
316 
317 inline
318 CLocAlnSet::site_iterator& CLocAlnSet::site_iterator::operator++ () {
319     ++ref_it_;
320     for (auto& it: its_)
321         ++it;
322 
323     #ifdef DEBUG
324     if (! (ref_it_ != ref_past_)) {
325         // Make sure we've reached the end of the alignment for every read.
326         for (auto& it: its_)
327             assert(!bool(it));
328     }
329     #endif
330 
331     ++col_;
332     return *this;
333 }
334 
335 inline
counts(Counts<Nt4> & counts)336 void CLocAlnSet::site_iterator::counts(Counts<Nt4>& counts) const {
337     counts.clear();
338     for (auto& read: its_)
339         counts.increment(*read);
340 }
341 
342 inline
counts(Counts<Nt4> & counts,size_t sample)343 void CLocAlnSet::site_iterator::counts(Counts<Nt4>& counts, size_t sample) const {
344     counts.clear();
345     for (size_t read_i : loc_aln_.sample_reads(sample))
346         counts.increment(*its_[read_i]);
347 }
348 
349 inline
counts()350 SiteCounts CLocAlnSet::site_iterator::counts() const {
351     SiteCounts cnts;
352     cnts.mpopi = &mpopi();
353 
354     cnts.samples.reserve(mpopi().samples().size());
355     Counts<Nt4> tmp;
356     for (size_t sample=0; sample<mpopi().samples().size(); ++sample) {
357         counts(tmp, sample); //this->counts()
358         cnts.samples.push_back(Counts<Nt2>(tmp));
359         cnts.tot += cnts.samples.back();
360     }
361     return cnts;
362 }
363 
364 #endif // __LOCUS_H__
365