1 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
2 //
3 // Copyright 2017, Julian Catchen <jcatchen@illinois.edu>
4 //
5 // This file is part of Stacks.
6 //
7 // Stacks is free software: you can redistribute it and/or modify
8 // it under the terms of the GNU General Public License as published by
9 // the Free Software Foundation, either version 3 of the License, or
10 // (at your option) any later version.
11 //
12 // Stacks is distributed in the hope that it will be useful,
13 // but WITHOUT ANY WARRANTY; without even the implied warranty of
14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 // GNU General Public License for more details.
16 //
17 // You should have received a copy of the GNU General Public License
18 // along with Stacks.  If not, see <http://www.gnu.org/licenses/>.
19 //
20 #ifndef GSTACKS_H
21 #define GSTACKS_H
22 
23 #include "constants.h"
24 #include "nucleotides.h"
25 #include "models.h"
26 #include "SuffixTree.h"
27 #include "GappedAln.h"
28 #include "BamI.h"
29 
30 string parse_command_line(int argc, char* argv[]);
31 
32 enum class GStacksInputT {unknown, denovo_popmap, denovo_merger, refbased_popmap, refbased_list};
33 
34 //
35 // PhasedHet & PhaseSet
36 // ----------
37 //
38 struct PhasedHet {
39     size_t phase_set; // N.B. The convention in VCF is to use the position of the first phased SNP for this.
40     Nt4 left_allele;
41     Nt4 right_allele;
42 
43     bool operator== (const PhasedHet& other) const
44         {return phase_set == other.phase_set && left_allele == other.left_allele && right_allele == other.right_allele;}
45 };
46 class PhaseSet {
47     vector<PhasedHet> phased_hets_;
48 public:
PhaseSet()49     PhaseSet() : phased_hets_{} {}
clear()50     void clear() {phased_hets_.clear();}
empty()51     bool empty() const {return phased_hets_.empty();}
52 
PhaseSet(size_t n_hets)53     PhaseSet(size_t n_hets) : phased_hets_(n_hets, {SIZE_MAX, Nt4::n, Nt4::n}) {}
size()54     size_t size() const {return phased_hets_.size();}
het(size_t het_i)55     const PhasedHet& het(size_t het_i) const {return phased_hets_[het_i];}
56 
57     // Add the first node to the phase set.
58     void add_het(size_t het_i, array<Nt2,2> alleles);
59     // Add a singleton node to the phase set, according to the given edge.
60     void add_het(size_t het_i, array<Nt2,2> alleles, Nt2 nt_i, size_t het_j, Nt2 nt_j);
61     // Merge with another phase set, according to the given edge.
62     void merge_with(const PhaseSet& other, size_t het_i, Nt2 nt_i, size_t het_j, Nt2 nt_j);
63     // Check that a new edge between two nodes within the phase set is consistent.
64     bool is_edge_consistent(size_t het_i, Nt2 nt_i, size_t het_j, Nt2 nt_j) const;
65 };
66 
67 //
68 // SnpAlleleCooccurrenceCounter
69 // ----------
70 // Convenience matrix class to record SNP allele co-occurences in
71 // reads/read pairs.
72 //
73 // The matrix is n_snps*n_snps but we only use the i<j half.
74 //
75 class SnpAlleleCooccurrenceCounter {
76     size_t n_snps_;
77     vector<array<array<size_t,4>,4>> cooccurrences_;
78 public:
SnpAlleleCooccurrenceCounter(size_t n_snps)79     SnpAlleleCooccurrenceCounter(size_t n_snps) : n_snps_(n_snps), cooccurrences_(n_snps_*n_snps_) {};
80     void clear();
81 
82     const size_t& at(size_t snp_i1, Nt2 snp1_allele, size_t snp_i2, Nt2 snp2_allele) const;
at(size_t snp_i1,Nt2 snp1_allele,size_t snp_i2,Nt2 snp2_allele)83           size_t& at(size_t snp_i1, Nt2 snp1_allele, size_t snp_i2, Nt2 snp2_allele)
84               {return (size_t&)((const SnpAlleleCooccurrenceCounter&)*this).at(snp_i1, snp1_allele, snp_i2, snp2_allele);}
85 };
86 
87 //
88 // GenotypeStats
89 // ----------
90 // Statistics produced by LocusProcessor regarding genotyping.
91 //
92 class GenotypeStats {
93 public:
94     size_t n_genotyped_loci;
95     size_t n_sites_tot;
mean_n_sites_per_loc()96     double mean_n_sites_per_loc() const {return (double) n_sites_tot / n_genotyped_loci;}
97 
98     // Statistics for --rm-unpaired-reads, --rm-pcr-duplicates.
99     struct PerSampleStats {
100         size_t n_unpaired_reads;
101         size_t n_read_pairs_pcr_dupl;
102         size_t n_read_pairs_used;
103         size_t n_loci_with_sample;
104         size_t ns_cumsum;
105         size_t ns_weighted_n_read_pairs_used;
106     };
107     vector<PerSampleStats> per_sample_stats;
108     vector<size_t> pcr_clone_size_distrib;
n_unpaired_reads_rm()109     size_t n_unpaired_reads_rm() const {size_t n=0; for(auto& s: per_sample_stats) n+=s.n_unpaired_reads; return n;}
n_read_pairs_pcr_dupl()110     size_t n_read_pairs_pcr_dupl() const {size_t n=0; for(auto& s: per_sample_stats) n+=s.n_read_pairs_pcr_dupl; return n;}
n_read_pairs_used()111     size_t n_read_pairs_used() const {size_t n=0; for(auto& s: per_sample_stats) n+=s.n_read_pairs_used; return n;}
112 
GenotypeStats(size_t n_samples)113     GenotypeStats(size_t n_samples) : n_genotyped_loci(0), n_sites_tot(0), per_sample_stats(n_samples, PerSampleStats()) {}
114     GenotypeStats& operator+= (const GenotypeStats& other);
115 };
116 
117 //
118 // HaplotypeStats
119 // ----------
120 // Statistics produced by LocusProcessor regarding haplotyping.
121 //
122 class HaplotypeStats {
123 public:
124     map<pair<size_t,size_t>,size_t> n_badly_phased_samples; // { {n_consistent_hets, n_hets_2snps} : count }
125 
126     struct PerSampleStats {
127         size_t n_diploid_loci;
128         size_t n_hets_2snps;
129         size_t n_phased;
130         size_t n_phased_2ndpass;
131     };
132     vector<PerSampleStats> per_sample_stats;
133 
HaplotypeStats(size_t n_samples)134     HaplotypeStats(size_t n_samples) : per_sample_stats(n_samples, PerSampleStats()) {}
135     HaplotypeStats& operator+= (const HaplotypeStats& other);
136 
137     size_t n_halplotyping_attempts() const;
138     size_t n_consistent_hap_pairs() const;
139 };
140 
141 //
142 // ContigStats
143 // ----------
144 // Statistics produced by `LocusProcessor::process()` when doing de novo assembly/alignment.
145 //
146 class ContigStats {
147 public:
148     size_t n_nonempty_loci;
149     size_t n_loci_w_pe_reads;
150     size_t n_loci_almost_no_pe_reads;
151     size_t n_loci_pe_graph_not_dag;
152     size_t n_loci_pe_graph_fixed_to_dag;
153     size_t length_ctg_tot;
154 
155     size_t n_aln_reads;
156     size_t n_tot_reads;
157 
158     size_t n_overlaps;
159     size_t length_overlap_tot;
160     size_t length_olapd_loci_tot;
161 
162     OnlineMeanVar insert_length_olap_mv;
163 
n_loci_no_pe_reads()164     size_t n_loci_no_pe_reads() const { return n_nonempty_loci - n_loci_w_pe_reads; }
n_loci_ctg()165     size_t n_loci_ctg() const { return n_loci_w_pe_reads - n_loci_almost_no_pe_reads - n_loci_pe_graph_not_dag; }
ctg_avg_length()166     double ctg_avg_length() const {return (double) length_ctg_tot / n_loci_ctg();}
mean_olap_length()167     double mean_olap_length() const {return (double) length_overlap_tot / n_overlaps;}
mean_olapd_locus_length()168     double mean_olapd_locus_length() const {return (double) length_olapd_loci_tot / n_overlaps;}
169 
170     ContigStats& operator+= (const ContigStats& other);
171 };
172 
173 //
174 // LocData
175 // ----------
176 // Data regarding the locus LocusProcessor currently works on.
177 //
178 class LocData {
179 public:
180     int id;
181     PhyLoc pos;
182     const MetaPopInfo* mpopi;
183 
184     enum ContigStatus {unknown, separate, overlapped};
185     ContigStatus ctg_status = unknown;
186     size_t olap_length;
187 
188     string o_vcf;
189     string o_fa;
190     string o_details;
191     vector<vector<BamRecord>> o_bam;
192     stringstream details_ss;
193 
194     void clear();
195 };
196 
197 //
198 // Clocks
199 // ----------
200 // For Benchmarking. Thread specific.
201 //
202 struct Timers {
203     Timer reading;
204     Timer processing;
205     Timer writing_fa;
206     Timer writing_vcf;
207     Timer writing_details;
208     Timer writing_bams;
209 
210     Timer processing_pre_alns;
211     Timer rm_Ns;
212     Timer assembling;
213     Timer init_alignments;
214     Timer aligning;
215     Timer building_bams;
216     Timer merge_paired_reads;
217     Timer processing_post_alns;
218     Timer rm_reads;
219     Timer genotyping;
220     Timer haplotyping;
221     Timer building_vcf;
222     Timer building_fa;
223     Timer cpt_consensus;
224     Timer counting_nts;
225 
226     Timers& operator+= (const Timers& other);
227 };
228 
229 //
230 // LocusProcessor
231 // ----------
232 // Functor for processing loci. Thread-specific.
233 //
234 class LocusProcessor {
235 public:
LocusProcessor(size_t n_samples)236     LocusProcessor(size_t n_samples) : gt_stats_(n_samples), hap_stats_(n_samples), ctg_stats_(), loc_() {}
237 
238     // Process a locus.
239     void process(CLocReadSet& loc);
240     void process(CLocAlnSet& aln_loc);
241 
242     // Access the output. Statistics & movable fasta/vcf per-locus text outputs.
gt_stats()243     const GenotypeStats& gt_stats() const {return gt_stats_;}
hap_stats()244     const HaplotypeStats& hap_stats() const {return hap_stats_;}
ctg_stats()245     const ContigStats& ctg_stats() const {return ctg_stats_;}
timers()246     Timers& timers() {return timers_;}
247 
vcf_out()248     string& vcf_out() {return loc_.o_vcf;}
fasta_out()249     string& fasta_out() {return loc_.o_fa;}
details_out()250     string& details_out() {return loc_.o_details;}
bam_out()251     vector<vector<BamRecord>>& bam_out() {return loc_.o_bam;}
252 
253 private:
254     GenotypeStats gt_stats_;
255     HaplotypeStats hap_stats_;
256     ContigStats ctg_stats_;
257     Timers timers_;
258     mutable LocData loc_;
259 
260     DNASeq4 assemble_locus_contig(
261         const vector<SRead>& fw_reads,
262         const vector<SRead>& pe_reads);
263     bool add_read_to_aln(
264             CLocAlnSet& aln_loc,
265             AlignRes& aln_res,
266             SRead&& read,
267             GappedAln* aligner,
268             SuffixTree* stree
269             );
270 
271     bool align_reads_to_contig(SuffixTree *st, GappedAln *g_aln, DNASeq4 &query, AlignRes &aln_res) const;
272     int suffix_tree_hits_to_dag(size_t query_len, vector<STAln> &alns, vector<STAln> &final_alns) const;
273     size_t find_locus_overlap(SuffixTree *st, GappedAln *g_aln, const DNASeq4 &se_consensus, string &overlap_cigar) const;
274 
275     // Phase all samples.
276     void phase_hets (
277             vector<map<size_t,PhasedHet>>& phase_data,
278             vector<SiteCall>& calls,
279             const CLocAlnSet& aln_loc,
280             HaplotypeStats& hap_stats
281             ) const;
282 
283     // Phase (heterozygous) positions for one sample.
284     bool phase_sample_hets(
285             map<size_t,PhasedHet>& phased_sample,
286             const vector<SiteCall>& calls,
287             const CLocAlnSet& aln_loc,
288             const vector<size_t>& snp_cols,
289             size_t sample,
290             HaplotypeStats& hap_stats,
291             size_t& n_hets_needing_phasing,
292             size_t& n_consistent_hets,
293             ostream& o_hapgraph_ss,
294             bool& has_subgraphs,
295             bool first_pass=true) const;
296 
297     void count_pairwise_cooccurrences(
298             SnpAlleleCooccurrenceCounter& cooccurrences,
299             const CLocAlnSet& aln_loc,
300             size_t sample,
301             const vector<size_t>& snp_cols,
302             const vector<size_t>& het_snps,
303             const vector<const SampleCall*>& sample_het_calls
304             ) const;
305 
306     // Assemble phase sets.
307     // This is based on the graph of cooccurrences between the het alleles.
308     // We build a graph in which the nodes are phased pairs of hets alleles
309     // (`PhasedHet`s) and phase sets subgraphs of such nodes. The edges between
310     // pairs of alleles are based on those in the graph of cooccurrences, but
311     // in a phased context they appear as one of two types:
312     // * parallel (+): left-allele-to-right-allele (left-to-left) or right-to-right.
313     // * crossed (-): left-to-right, or right-to-left.
314     // Allele linkage is transitive (through composition of +/- edges).
315     // The assembly fails if there are inconsistent edges, i.e. edges that link
316     // (transitively) one allele of one het SNP to both alleles of another het
317     // SNP.
318     // In practice, we start with an empty graph and iterate on edges, adding
319     // nodes/PhasedHets when we first see them and checking that new edges are
320     // consistent with the growing graph.
321     bool assemble_phase_sets(
322             vector<PhaseSet>& phase_sets,
323             const vector<size_t>& het_snps,
324             const vector<const SampleCall*>& sample_het_calls,
325             const SnpAlleleCooccurrenceCounter& cooccurrences,
326             const size_t min_n_cooccurrences
327             ) const;
328 
329     // Create the fasta/vcf text outputs.
330     void write_one_locus (
331             const CLocAlnSet& aln_loc,
332             const vector<SiteCounts>& depths,
333             const vector<SiteCall>& calls,
334             const vector<map<size_t,PhasedHet>>& phase_data // {col : phasedhet} maps, for all samples
335     );
336 
337     // (debug) Write a sample's haplotype graph.
338     void write_sample_hapgraph(
339             ostream& os,
340             size_t sample,
341             const vector<size_t>& het_snps,
342             const vector<size_t>& snp_cols,
343             const vector<const SampleCall*>& sample_het_calls,
344             const SnpAlleleCooccurrenceCounter& cooccurences
345             ) const;
346 
347     //
348     // (debug) Creates a CLocAlnSet from a CLocReadSet, using the true
349     // reference but computing paired-end reads alignments. Same requirements as
350     // `from_true_alignments()`.
351     //
352     void using_true_reference(CLocAlnSet& aln_loc, CLocReadSet&& loc);
353 };
354 
355 #endif
356