1 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*- 2 // 3 // Copyright 2017, Julian Catchen <jcatchen@illinois.edu> 4 // 5 // This file is part of Stacks. 6 // 7 // Stacks is free software: you can redistribute it and/or modify 8 // it under the terms of the GNU General Public License as published by 9 // the Free Software Foundation, either version 3 of the License, or 10 // (at your option) any later version. 11 // 12 // Stacks is distributed in the hope that it will be useful, 13 // but WITHOUT ANY WARRANTY; without even the implied warranty of 14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 // GNU General Public License for more details. 16 // 17 // You should have received a copy of the GNU General Public License 18 // along with Stacks. If not, see <http://www.gnu.org/licenses/>. 19 // 20 #ifndef GSTACKS_H 21 #define GSTACKS_H 22 23 #include "constants.h" 24 #include "nucleotides.h" 25 #include "models.h" 26 #include "SuffixTree.h" 27 #include "GappedAln.h" 28 #include "BamI.h" 29 30 string parse_command_line(int argc, char* argv[]); 31 32 enum class GStacksInputT {unknown, denovo_popmap, denovo_merger, refbased_popmap, refbased_list}; 33 34 // 35 // PhasedHet & PhaseSet 36 // ---------- 37 // 38 struct PhasedHet { 39 size_t phase_set; // N.B. The convention in VCF is to use the position of the first phased SNP for this. 40 Nt4 left_allele; 41 Nt4 right_allele; 42 43 bool operator== (const PhasedHet& other) const 44 {return phase_set == other.phase_set && left_allele == other.left_allele && right_allele == other.right_allele;} 45 }; 46 class PhaseSet { 47 vector<PhasedHet> phased_hets_; 48 public: PhaseSet()49 PhaseSet() : phased_hets_{} {} clear()50 void clear() {phased_hets_.clear();} empty()51 bool empty() const {return phased_hets_.empty();} 52 PhaseSet(size_t n_hets)53 PhaseSet(size_t n_hets) : phased_hets_(n_hets, {SIZE_MAX, Nt4::n, Nt4::n}) {} size()54 size_t size() const {return phased_hets_.size();} het(size_t het_i)55 const PhasedHet& het(size_t het_i) const {return phased_hets_[het_i];} 56 57 // Add the first node to the phase set. 58 void add_het(size_t het_i, array<Nt2,2> alleles); 59 // Add a singleton node to the phase set, according to the given edge. 60 void add_het(size_t het_i, array<Nt2,2> alleles, Nt2 nt_i, size_t het_j, Nt2 nt_j); 61 // Merge with another phase set, according to the given edge. 62 void merge_with(const PhaseSet& other, size_t het_i, Nt2 nt_i, size_t het_j, Nt2 nt_j); 63 // Check that a new edge between two nodes within the phase set is consistent. 64 bool is_edge_consistent(size_t het_i, Nt2 nt_i, size_t het_j, Nt2 nt_j) const; 65 }; 66 67 // 68 // SnpAlleleCooccurrenceCounter 69 // ---------- 70 // Convenience matrix class to record SNP allele co-occurences in 71 // reads/read pairs. 72 // 73 // The matrix is n_snps*n_snps but we only use the i<j half. 74 // 75 class SnpAlleleCooccurrenceCounter { 76 size_t n_snps_; 77 vector<array<array<size_t,4>,4>> cooccurrences_; 78 public: SnpAlleleCooccurrenceCounter(size_t n_snps)79 SnpAlleleCooccurrenceCounter(size_t n_snps) : n_snps_(n_snps), cooccurrences_(n_snps_*n_snps_) {}; 80 void clear(); 81 82 const size_t& at(size_t snp_i1, Nt2 snp1_allele, size_t snp_i2, Nt2 snp2_allele) const; at(size_t snp_i1,Nt2 snp1_allele,size_t snp_i2,Nt2 snp2_allele)83 size_t& at(size_t snp_i1, Nt2 snp1_allele, size_t snp_i2, Nt2 snp2_allele) 84 {return (size_t&)((const SnpAlleleCooccurrenceCounter&)*this).at(snp_i1, snp1_allele, snp_i2, snp2_allele);} 85 }; 86 87 // 88 // GenotypeStats 89 // ---------- 90 // Statistics produced by LocusProcessor regarding genotyping. 91 // 92 class GenotypeStats { 93 public: 94 size_t n_genotyped_loci; 95 size_t n_sites_tot; mean_n_sites_per_loc()96 double mean_n_sites_per_loc() const {return (double) n_sites_tot / n_genotyped_loci;} 97 98 // Statistics for --rm-unpaired-reads, --rm-pcr-duplicates. 99 struct PerSampleStats { 100 size_t n_unpaired_reads; 101 size_t n_read_pairs_pcr_dupl; 102 size_t n_read_pairs_used; 103 size_t n_loci_with_sample; 104 size_t ns_cumsum; 105 size_t ns_weighted_n_read_pairs_used; 106 }; 107 vector<PerSampleStats> per_sample_stats; 108 vector<size_t> pcr_clone_size_distrib; n_unpaired_reads_rm()109 size_t n_unpaired_reads_rm() const {size_t n=0; for(auto& s: per_sample_stats) n+=s.n_unpaired_reads; return n;} n_read_pairs_pcr_dupl()110 size_t n_read_pairs_pcr_dupl() const {size_t n=0; for(auto& s: per_sample_stats) n+=s.n_read_pairs_pcr_dupl; return n;} n_read_pairs_used()111 size_t n_read_pairs_used() const {size_t n=0; for(auto& s: per_sample_stats) n+=s.n_read_pairs_used; return n;} 112 GenotypeStats(size_t n_samples)113 GenotypeStats(size_t n_samples) : n_genotyped_loci(0), n_sites_tot(0), per_sample_stats(n_samples, PerSampleStats()) {} 114 GenotypeStats& operator+= (const GenotypeStats& other); 115 }; 116 117 // 118 // HaplotypeStats 119 // ---------- 120 // Statistics produced by LocusProcessor regarding haplotyping. 121 // 122 class HaplotypeStats { 123 public: 124 map<pair<size_t,size_t>,size_t> n_badly_phased_samples; // { {n_consistent_hets, n_hets_2snps} : count } 125 126 struct PerSampleStats { 127 size_t n_diploid_loci; 128 size_t n_hets_2snps; 129 size_t n_phased; 130 size_t n_phased_2ndpass; 131 }; 132 vector<PerSampleStats> per_sample_stats; 133 HaplotypeStats(size_t n_samples)134 HaplotypeStats(size_t n_samples) : per_sample_stats(n_samples, PerSampleStats()) {} 135 HaplotypeStats& operator+= (const HaplotypeStats& other); 136 137 size_t n_halplotyping_attempts() const; 138 size_t n_consistent_hap_pairs() const; 139 }; 140 141 // 142 // ContigStats 143 // ---------- 144 // Statistics produced by `LocusProcessor::process()` when doing de novo assembly/alignment. 145 // 146 class ContigStats { 147 public: 148 size_t n_nonempty_loci; 149 size_t n_loci_w_pe_reads; 150 size_t n_loci_almost_no_pe_reads; 151 size_t n_loci_pe_graph_not_dag; 152 size_t n_loci_pe_graph_fixed_to_dag; 153 size_t length_ctg_tot; 154 155 size_t n_aln_reads; 156 size_t n_tot_reads; 157 158 size_t n_overlaps; 159 size_t length_overlap_tot; 160 size_t length_olapd_loci_tot; 161 162 OnlineMeanVar insert_length_olap_mv; 163 n_loci_no_pe_reads()164 size_t n_loci_no_pe_reads() const { return n_nonempty_loci - n_loci_w_pe_reads; } n_loci_ctg()165 size_t n_loci_ctg() const { return n_loci_w_pe_reads - n_loci_almost_no_pe_reads - n_loci_pe_graph_not_dag; } ctg_avg_length()166 double ctg_avg_length() const {return (double) length_ctg_tot / n_loci_ctg();} mean_olap_length()167 double mean_olap_length() const {return (double) length_overlap_tot / n_overlaps;} mean_olapd_locus_length()168 double mean_olapd_locus_length() const {return (double) length_olapd_loci_tot / n_overlaps;} 169 170 ContigStats& operator+= (const ContigStats& other); 171 }; 172 173 // 174 // LocData 175 // ---------- 176 // Data regarding the locus LocusProcessor currently works on. 177 // 178 class LocData { 179 public: 180 int id; 181 PhyLoc pos; 182 const MetaPopInfo* mpopi; 183 184 enum ContigStatus {unknown, separate, overlapped}; 185 ContigStatus ctg_status = unknown; 186 size_t olap_length; 187 188 string o_vcf; 189 string o_fa; 190 string o_details; 191 vector<vector<BamRecord>> o_bam; 192 stringstream details_ss; 193 194 void clear(); 195 }; 196 197 // 198 // Clocks 199 // ---------- 200 // For Benchmarking. Thread specific. 201 // 202 struct Timers { 203 Timer reading; 204 Timer processing; 205 Timer writing_fa; 206 Timer writing_vcf; 207 Timer writing_details; 208 Timer writing_bams; 209 210 Timer processing_pre_alns; 211 Timer rm_Ns; 212 Timer assembling; 213 Timer init_alignments; 214 Timer aligning; 215 Timer building_bams; 216 Timer merge_paired_reads; 217 Timer processing_post_alns; 218 Timer rm_reads; 219 Timer genotyping; 220 Timer haplotyping; 221 Timer building_vcf; 222 Timer building_fa; 223 Timer cpt_consensus; 224 Timer counting_nts; 225 226 Timers& operator+= (const Timers& other); 227 }; 228 229 // 230 // LocusProcessor 231 // ---------- 232 // Functor for processing loci. Thread-specific. 233 // 234 class LocusProcessor { 235 public: LocusProcessor(size_t n_samples)236 LocusProcessor(size_t n_samples) : gt_stats_(n_samples), hap_stats_(n_samples), ctg_stats_(), loc_() {} 237 238 // Process a locus. 239 void process(CLocReadSet& loc); 240 void process(CLocAlnSet& aln_loc); 241 242 // Access the output. Statistics & movable fasta/vcf per-locus text outputs. gt_stats()243 const GenotypeStats& gt_stats() const {return gt_stats_;} hap_stats()244 const HaplotypeStats& hap_stats() const {return hap_stats_;} ctg_stats()245 const ContigStats& ctg_stats() const {return ctg_stats_;} timers()246 Timers& timers() {return timers_;} 247 vcf_out()248 string& vcf_out() {return loc_.o_vcf;} fasta_out()249 string& fasta_out() {return loc_.o_fa;} details_out()250 string& details_out() {return loc_.o_details;} bam_out()251 vector<vector<BamRecord>>& bam_out() {return loc_.o_bam;} 252 253 private: 254 GenotypeStats gt_stats_; 255 HaplotypeStats hap_stats_; 256 ContigStats ctg_stats_; 257 Timers timers_; 258 mutable LocData loc_; 259 260 DNASeq4 assemble_locus_contig( 261 const vector<SRead>& fw_reads, 262 const vector<SRead>& pe_reads); 263 bool add_read_to_aln( 264 CLocAlnSet& aln_loc, 265 AlignRes& aln_res, 266 SRead&& read, 267 GappedAln* aligner, 268 SuffixTree* stree 269 ); 270 271 bool align_reads_to_contig(SuffixTree *st, GappedAln *g_aln, DNASeq4 &query, AlignRes &aln_res) const; 272 int suffix_tree_hits_to_dag(size_t query_len, vector<STAln> &alns, vector<STAln> &final_alns) const; 273 size_t find_locus_overlap(SuffixTree *st, GappedAln *g_aln, const DNASeq4 &se_consensus, string &overlap_cigar) const; 274 275 // Phase all samples. 276 void phase_hets ( 277 vector<map<size_t,PhasedHet>>& phase_data, 278 vector<SiteCall>& calls, 279 const CLocAlnSet& aln_loc, 280 HaplotypeStats& hap_stats 281 ) const; 282 283 // Phase (heterozygous) positions for one sample. 284 bool phase_sample_hets( 285 map<size_t,PhasedHet>& phased_sample, 286 const vector<SiteCall>& calls, 287 const CLocAlnSet& aln_loc, 288 const vector<size_t>& snp_cols, 289 size_t sample, 290 HaplotypeStats& hap_stats, 291 size_t& n_hets_needing_phasing, 292 size_t& n_consistent_hets, 293 ostream& o_hapgraph_ss, 294 bool& has_subgraphs, 295 bool first_pass=true) const; 296 297 void count_pairwise_cooccurrences( 298 SnpAlleleCooccurrenceCounter& cooccurrences, 299 const CLocAlnSet& aln_loc, 300 size_t sample, 301 const vector<size_t>& snp_cols, 302 const vector<size_t>& het_snps, 303 const vector<const SampleCall*>& sample_het_calls 304 ) const; 305 306 // Assemble phase sets. 307 // This is based on the graph of cooccurrences between the het alleles. 308 // We build a graph in which the nodes are phased pairs of hets alleles 309 // (`PhasedHet`s) and phase sets subgraphs of such nodes. The edges between 310 // pairs of alleles are based on those in the graph of cooccurrences, but 311 // in a phased context they appear as one of two types: 312 // * parallel (+): left-allele-to-right-allele (left-to-left) or right-to-right. 313 // * crossed (-): left-to-right, or right-to-left. 314 // Allele linkage is transitive (through composition of +/- edges). 315 // The assembly fails if there are inconsistent edges, i.e. edges that link 316 // (transitively) one allele of one het SNP to both alleles of another het 317 // SNP. 318 // In practice, we start with an empty graph and iterate on edges, adding 319 // nodes/PhasedHets when we first see them and checking that new edges are 320 // consistent with the growing graph. 321 bool assemble_phase_sets( 322 vector<PhaseSet>& phase_sets, 323 const vector<size_t>& het_snps, 324 const vector<const SampleCall*>& sample_het_calls, 325 const SnpAlleleCooccurrenceCounter& cooccurrences, 326 const size_t min_n_cooccurrences 327 ) const; 328 329 // Create the fasta/vcf text outputs. 330 void write_one_locus ( 331 const CLocAlnSet& aln_loc, 332 const vector<SiteCounts>& depths, 333 const vector<SiteCall>& calls, 334 const vector<map<size_t,PhasedHet>>& phase_data // {col : phasedhet} maps, for all samples 335 ); 336 337 // (debug) Write a sample's haplotype graph. 338 void write_sample_hapgraph( 339 ostream& os, 340 size_t sample, 341 const vector<size_t>& het_snps, 342 const vector<size_t>& snp_cols, 343 const vector<const SampleCall*>& sample_het_calls, 344 const SnpAlleleCooccurrenceCounter& cooccurences 345 ) const; 346 347 // 348 // (debug) Creates a CLocAlnSet from a CLocReadSet, using the true 349 // reference but computing paired-end reads alignments. Same requirements as 350 // `from_true_alignments()`. 351 // 352 void using_true_reference(CLocAlnSet& aln_loc, CLocReadSet&& loc); 353 }; 354 355 #endif 356