1 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
2 //
3 // Copyright 2017-2018, Julian Catchen <jcatchen@illinois.edu>
4 //
5 // This file is part of Stacks.
6 //
7 // Stacks is free software: you can redistribute it and/or modify
8 // it under the terms of the GNU General Public License as published by
9 // the Free Software Foundation, either version 3 of the License, or
10 // (at your option) any later version.
11 //
12 // Stacks is distributed in the hope that it will be useful,
13 // but WITHOUT ANY WARRANTY; without even the implied warranty of
14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 // GNU General Public License for more details.
16 //
17 // You should have received a copy of the GNU General Public License
18 // along with Stacks. If not, see <http://www.gnu.org/licenses/>.
19 //
20 #include <getopt.h>
21 #include <zlib.h>
22
23 #include "gstacks.h"
24
25 #include "constants.h"
26 #include "utils.h"
27 #include "log_utils.h"
28 #include "catalog_utils.h"
29 #include "locus.h"
30 #include "locus_readers.h"
31 #include "debruijn.h"
32 #include "aln_utils.h"
33 #include "Alignment.h"
34 #include "models.h"
35
36 //
37 // Argument globals.
38 //
39 GStacksInputT input_type = GStacksInputT::unknown;
40 string popmap_path; // Set if --popmap is given. For logging purposes.
41 vector<string> sample_names; // Set if --popmap is given.
42 vector<string> in_bams;
43 vector<string> out_bams;
44
45 string out_dir;
46
47 BamCLocBuilder::Config refbased_cfg {true, 1000, 10, 0.20, 1, false, 1};
48
49 int num_threads = 1;
50 bool quiet = false;
51 bool ignore_pe_reads = false;
52 double min_aln_cov = 0.75;
53 int min_se_pe_overlap = 5;
54 double overlap_min_pct_id = 0.80;
55 bool bam_output = false;
56 bool detailed_output = false;
57 bool rm_unpaired_reads = false;
58 bool rm_pcr_duplicates = false;
59
60 modelt model_type = marukilow;
61 unique_ptr<const Model> model;
62
63 size_t km_length = 31;
64 size_t max_debruijn_reads = 1000;
65 size_t min_km_count = 2;
66 size_t max_fragment_alns = 2;
67
68 pair<size_t,size_t> phasing_cooccurrences_thr_range = {2, 2};
69 bool phasing_dont_prune_hets = false;
70 long phasing_min_mac = 1;
71
72 bool dbg_no_overlaps = false;
73 bool dbg_no_haplotypes = false;
74 bool dbg_print_cloc_ids = false;
75 bool dbg_write_gfa = false;
76 bool dbg_write_gfa_notdag = false;
77 bool dbg_write_alns = false;
78 bool dbg_write_hapgraphs = false;
79 bool dbg_write_nt_depths = false;
80 bool dbg_log_stats_phasing = false;
81 bool dbg_phasing_no_2ndpass = false;
82 size_t dbg_denovo_min_loc_samples = 0;
83
84 //
85 // Additional globals.
86 //
87 const string prog_name = "gstacks";
88 unique_ptr<LogAlterator> logger;
89 gzFile o_gzfasta_f = NULL;
90 unique_ptr<VcfWriter> o_vcf_f;
91 unique_ptr<VersatileWriter> o_details_f;
92
93 ofstream o_aln_f;
94 ofstream o_hapgraphs_f;
95 const char o_aln_header[] =
96 "# This prints observed read haplotypes:\n"
97 "# show_loc() { loc=$1; cat ./gstacks.alns | sed -n \"/^END $loc\\b/ q; /^BEGIN $loc\\b/,$ p\" | tail -n+2; }\n"
98 "# snp_cols() { loc=$1; zcat ./gstacks.calls | awk \"\\$1==$loc; \\$1>$loc {exit}\" | awk '$5!=\".\"' | cut -f2 | paste -sd ','; }\n"
99 "# show_haps() { loc=$1; cols=$2; spl=$3; show_loc $loc | grep \"\\b$spl\\b\" | cut -f3 | cut -c \"${cols//./,}\" | sort; }\n"
100 "# format_haps() { loc=$1; cols=$2; spl=$3; show_haps $loc $cols $spl | transpose-ch | paste <(echo $cols | tr ,. '\\n' | xargs printf '% 4s\\n') - | sed 's/\\t/ /' | seq.color | { echo; echo $loc/$spl; echo; cat; echo; }; }\n"
101 "# true_loci() { loc=$1; spl=$2; show_loc $loc | grep \"\\b$spl\\b\" | grep -v ref | cut -d: -f1 | sort -u; }\n"
102 ;
103 const char o_hapgraphs_header[] =
104 "# dot -Tpdf -O gstacks.hapgraphs.dot\n"
105 "# loc=371\n"
106 "# { g=gstacks.hapgraphs.dot; sed -n '0,/^subgraph/p' $g | head -n-1; sed -n \"/^subgraph cluster_loc$loc\\b/,/^}/p\" $g; echo \\}; } | dot -Tpdf -o haps.$loc.pdf\n"
107 "graph {\n"
108 "edge[color=\"grey60\",fontsize=12,labeljust=\"l\"];\n";
109 ;
110
111 //
112 // main
113 // ==========
114 //
115
116 int
main(int argc,char ** argv)117 main(int argc, char** argv)
118 {
119 try {
120 //
121 // Parse arguments.
122 //
123 string opts_report = parse_command_line(argc, argv);
124
125 //
126 // Open the BAM file(s).
127 //
128 vector<Bam*> bam_f_ptrs;
129 try {
130 for (const string& in_bam : in_bams)
131 bam_f_ptrs.push_back(new Bam(in_bam));
132 } catch(exception& e) {
133 if (bam_f_ptrs.size() >= 250)
134 cerr << "Error: You might need to increase your system's max-open-files limit,"
135 " see https://groups.google.com/d/msg/stacks-users/GZqJM_WkMhI/m9Hreg4oBgAJ\n";
136 throw;
137 }
138
139 //
140 // Open the log.
141 //
142 logger.reset(new LogAlterator(out_dir + "gstacks", true, quiet, argc, argv));
143 cout << opts_report << flush;
144
145 //
146 // Initialize the locus readers.
147 //
148 cout << "\nReading BAM headers...\n" << flush;
149 unique_ptr<BamCLocReader> bam_cloc_reader;
150 unique_ptr<BamCLocBuilder> bam_cloc_builder;
151 const MetaPopInfo* bam_mpopi;
152 if (input_type == GStacksInputT::denovo_popmap || input_type == GStacksInputT::denovo_merger) {
153 bam_cloc_reader.reset(new BamCLocReader(move(bam_f_ptrs)));
154 // (Note: For de novo (Stacks) input we use read groups, even when a
155 // popmap was specified, since the input files ought to have them and
156 // this allows to detect corruption.)
157 bam_mpopi = &bam_cloc_reader->mpopi();
158 } else if (input_type == GStacksInputT::refbased_popmap || input_type == GStacksInputT::refbased_list) {
159 bam_cloc_builder.reset(new BamCLocBuilder(move(bam_f_ptrs), refbased_cfg, sample_names));
160 bam_mpopi = &bam_cloc_builder->mpopi();
161 } else {
162 DOES_NOT_HAPPEN;
163 }
164
165 //
166 // Open the output files.
167 //
168 string o_gzfasta_path = out_dir + "catalog.fa.gz";
169 o_gzfasta_f = gzopen(o_gzfasta_path.c_str(), "wb");
170 check_open(o_gzfasta_f, o_gzfasta_path);
171
172 VcfHeader vcf_header;
173 vcf_header.add_std_meta();
174 for(size_t s : bam_mpopi->sample_indexes_orig_order())
175 vcf_header.add_sample(bam_mpopi->samples()[s].name);
176 o_vcf_f.reset(new VcfWriter(out_dir + "catalog.calls", move(vcf_header)));
177
178 if (detailed_output) {
179 o_details_f.reset(new VersatileWriter(out_dir + "gstacks.details.gz"));
180 *o_details_f << "# show_loc() { loc=$1; zcat ./gstacks.details.gz | sed -rn \"/^BEGIN locus $loc\\b/,\\$ p; /^END locus $loc\\b/ q;\"; }\n";
181 }
182
183 vector<unique_ptr<Bam>> bam_of_ptrs;
184 if (bam_output) {
185 try {
186 stringstream header_ss;
187 header_ss << "@HD\tVN:1.5\tSO:coordinate\n";
188 for (const Sample& s : bam_cloc_reader->mpopi().samples())
189 header_ss << "@RG\tID:" << s.id << "\tSM:" << s.name << "\tid:" << s.id << '\n';
190 for (size_t target_i=0; target_i<bam_cloc_reader->n_loci(); ++target_i)
191 header_ss << "@SQ\tSN:" << bam_cloc_reader->target2id(target_i) << "\tLN:10000\n";
192 BamHeader header = BamHeader(header_ss.str());
193 if (input_type == GStacksInputT::denovo_merger) {
194 assert(out_bams.size() == 1);
195 bam_of_ptrs.emplace_back(new Bam(out_bams[0], BamHeader(header)));
196 } else if (input_type == GStacksInputT::denovo_popmap) {
197 assert(out_bams.size() == bam_cloc_reader->bam_fs().size());
198 for(size_t i=0; i<out_bams.size(); ++i) {
199 bam_of_ptrs.emplace_back(new Bam(out_bams[i], BamHeader(header)));
200 }
201 } else {
202 DOES_NOT_HAPPEN;
203 }
204 } catch(exception& e) {
205 if (bam_f_ptrs.size() + bam_of_ptrs.size() >= 250)
206 cerr << "Error: You might need to increase your system's max-open-files limit,"
207 " see https://groups.google.com/d/msg/stacks-users/GZqJM_WkMhI/m9Hreg4oBgAJ\n";
208 throw;
209 }}
210
211 if (dbg_write_alns) {
212 string o_aln_path = out_dir + "gstacks.alns";
213 o_aln_f.open(o_aln_path);
214 check_open(o_aln_f, o_aln_path);
215 o_aln_f << o_aln_header;
216 }
217
218 if (dbg_write_hapgraphs) {
219 string o_hapgraphs_path = out_dir + "gstacks.hapgraphs.dot";
220 o_hapgraphs_f.open(o_hapgraphs_path);
221 check_open(o_hapgraphs_f, o_hapgraphs_path);
222 o_hapgraphs_f << o_hapgraphs_header;
223 }
224
225 //
226 // Process every locus
227 //
228 GenotypeStats gt_stats (bam_mpopi->samples().size());
229 HaplotypeStats hap_stats (bam_mpopi->samples().size());
230 ContigStats denovo_ctg_stats {};
231
232 // For clocking.
233 Timers t_threads_totals;
234 Timer t_parallel;
235 Timer t_writing_vcf;
236 size_t n_writes = 0;
237 size_t max_size_before_write = 0;
238
239 // For parallelization.
240 int omp_return = 0;
241 std::deque<pair<bool,string>> fa_outputs;
242 std::deque<pair<bool,string>> vcf_outputs;
243 std::deque<pair<bool,string>> det_outputs; // (details)
244 std::deque<pair<bool,vector<vector<BamRecord>>>> bam_outputs;
245 size_t next_fa_to_write = 0; // locus index, in the input BAM file.
246 size_t next_vcf_to_write = 0;
247 size_t next_det_to_write = 0;
248 size_t next_bam_to_write = 0;
249
250 cout << "Processing all loci...\n" << flush;
251 ProgressMeter progress =
252 (input_type == GStacksInputT::denovo_popmap || input_type == GStacksInputT::denovo_merger) ?
253 ProgressMeter(cout, true, bam_cloc_reader->tally_n_components())
254 : ProgressMeter(cout, false, 1000);
255
256 bool eof = false;
257 #pragma omp parallel num_threads(num_threads)
258 { try {
259 LocusProcessor loc_proc (bam_mpopi->samples().size());
260 Timers& t = loc_proc.timers();
261
262 CLocReadSet loc (*bam_mpopi); // For denovo.
263 CLocAlnSet aln_loc; // For ref-based.
264 bool thread_eof = false;
265 while(omp_return == 0) {
266 t.reading.restart();
267 #pragma omp critical(read)
268 { try {
269 if (!eof && omp_return == 0) {
270 if (input_type == GStacksInputT::denovo_popmap || input_type == GStacksInputT::denovo_merger)
271 eof = !bam_cloc_reader->read_one_locus(loc);
272 else
273 eof = !bam_cloc_builder->build_one_locus(aln_loc);
274 }
275 thread_eof = eof;
276 } catch (exception& e) {
277 omp_return = stacks_handle_exceptions(e);
278 }}
279 t.reading.update();
280 if (thread_eof || omp_return != 0)
281 break;
282
283 // Process it.
284 t.processing.restart();
285 size_t loc_i;
286 if (input_type == GStacksInputT::denovo_popmap || input_type == GStacksInputT::denovo_merger) {
287 loc_i = loc.bam_i();
288 if (dbg_print_cloc_ids)
289 cerr << (to_string(loc.id()) + '\n') << flush;
290 if (ignore_pe_reads)
291 loc.pe_reads().clear();
292 loc_proc.process(loc);
293 } else {
294 assert(aln_loc.id() >= 1);
295 loc_i = aln_loc.id() - 1;
296 if (dbg_print_cloc_ids)
297 cerr << (to_string(aln_loc.id()) + '\n') << flush;
298 if (refbased_cfg.paired)
299 aln_loc.merge_paired_reads();
300 loc_proc.process(aln_loc);
301 }
302 t.processing.update();
303
304 // Write the FASTA output.
305 t.writing_fa.restart();
306 #pragma omp critical(write_fa)
307 {
308 for (size_t i=next_fa_to_write+fa_outputs.size(); i<=loc_i; ++i)
309 fa_outputs.push_back( {false, string()} );
310 fa_outputs[loc_i - next_fa_to_write] = {true, move(loc_proc.fasta_out())};
311
312 while (!fa_outputs.empty() && fa_outputs.front().first) {
313 const string& fa = fa_outputs.front().second;
314 if (!fa.empty() && gzwrite(o_gzfasta_f, fa.c_str(), fa.length()) <= 0)
315 throw std::ios::failure("gzwrite");
316 fa_outputs.pop_front();
317 ++next_fa_to_write;
318 }
319 }
320 t.writing_fa.update();
321
322 // Write the VCF output.
323 t.writing_vcf.restart();
324 #pragma omp critical(write_vcf)
325 {
326 for (size_t i=next_vcf_to_write+vcf_outputs.size(); i<=loc_i; ++i)
327 vcf_outputs.push_back( {false, string()} );
328 vcf_outputs[loc_i - next_vcf_to_write] = {true, move(loc_proc.vcf_out()) };
329
330 if (vcf_outputs.front().first) {
331 ++n_writes;
332 if (vcf_outputs.size() > max_size_before_write)
333 max_size_before_write = vcf_outputs.size();
334 t_writing_vcf.restart();
335 do {
336 o_vcf_f->file() << vcf_outputs.front().second;
337 vcf_outputs.pop_front();
338 if (input_type == GStacksInputT::denovo_popmap || input_type == GStacksInputT::denovo_merger)
339 progress += bam_cloc_reader->n_catalog_components_of(next_vcf_to_write);
340 else
341 ++progress;
342 ++next_vcf_to_write;
343 } while (!vcf_outputs.empty() && vcf_outputs.front().first);
344 t_writing_vcf.update();
345 }
346 }
347 t.writing_vcf.update();
348
349 // Write the alignments.
350 if (bam_output) {
351 t.writing_bams.restart();
352 #pragma omp critical(write_bams)
353 {
354 for (size_t i=next_bam_to_write+bam_outputs.size(); i<=loc_i; ++i)
355 bam_outputs.push_back( make_pair(false, vector<vector<BamRecord>>()) );
356 bam_outputs[loc_i - next_bam_to_write] = make_pair(true, move(loc_proc.bam_out()));
357
358 while (!bam_outputs.empty() && bam_outputs.front().first) {
359 vector<vector<BamRecord>>& recs = bam_outputs.front().second;
360 if (!recs.empty()) {
361 assert(recs.size() == bam_of_ptrs.size());
362 for (size_t i=0; i<recs.size(); ++i)
363 for (const BamRecord& r : recs[i])
364 bam_of_ptrs[i]->write(r);
365 }
366 bam_outputs.pop_front();
367 ++next_bam_to_write;
368 }
369 }
370 t.writing_bams.update();
371 }
372
373 // Write the detailed output.
374 if (detailed_output) {
375 t.writing_details.restart();
376 #pragma omp critical(write_details)
377 {
378 for (size_t i=next_det_to_write+det_outputs.size(); i<=loc_i; ++i)
379 det_outputs.push_back( {false, string()} );
380 det_outputs[loc_i - next_det_to_write] = {true, move(loc_proc.details_out())};
381
382 while (!det_outputs.empty() && det_outputs.front().first) {
383 *o_details_f << det_outputs.front().second;
384 det_outputs.pop_front();
385 ++next_det_to_write;
386 }
387 }
388 t.writing_details.update();
389 }
390 }
391
392 // Tally the per-thread statistics.
393 #pragma omp critical(stats)
394 if (omp_return == 0) {
395 denovo_ctg_stats += loc_proc.ctg_stats();
396 gt_stats += loc_proc.gt_stats();
397 hap_stats += loc_proc.hap_stats();
398 t_threads_totals += loc_proc.timers();
399 }
400 } catch (exception& e) {
401 #pragma omp critical(exc)
402 omp_return = stacks_handle_exceptions(e);
403 }}
404 if (omp_return != 0)
405 return omp_return;
406 t_parallel.update();
407 progress.done();
408
409 //
410 // Report statistics.
411 //
412 ostream o_fp1 (cout.rdbuf());
413 o_fp1 << std::fixed << std::setprecision(1);
414 ostream o_fp3 (cout.rdbuf());
415 o_fp3 << std::fixed << std::setprecision(3);
416 ostream x_fp1 (logger->x.rdbuf());
417 x_fp1 << std::fixed << std::setprecision(1);
418 ostream x_fp3 (logger->x.rdbuf());
419 x_fp3 << std::fixed << std::setprecision(3);
420
421 if (input_type == GStacksInputT::denovo_popmap || input_type == GStacksInputT::denovo_merger) {
422 if (denovo_ctg_stats.n_loci_w_pe_reads == 0) {
423 cout << "Input appears to be single-end (no paired-end reads were seen).\n";
424 } else {
425 // Report assembly statistics.
426 assert(!ignore_pe_reads);
427 const ContigStats& cs = denovo_ctg_stats;
428 size_t no_pe = cs.n_loci_no_pe_reads() + cs.n_loci_almost_no_pe_reads;
429 size_t pe_ndag = cs.n_loci_pe_graph_not_dag;
430 size_t pe_ctg = cs.n_loci_ctg();
431 auto pct = [&cs](size_t n) { return as_percentage((double) n / cs.n_nonempty_loci); };
432
433 o_fp1 << "\n"
434 << "Attempted to assemble and align paired-end reads for " << cs.n_nonempty_loci << " loci:\n"
435 << " " << no_pe << " loci had no or almost no paired-end reads (" << pct(no_pe) << ");\n"
436 << " " << pe_ndag << " loci had paired-end reads that couldn't be assembled into a contig ("
437 << pct(pe_ndag) << ");\n"
438 << " For the remaining " << pe_ctg << " loci (" << pct(pe_ctg) << "), a paired-end contig was assembled;\n"
439 << " Average contig size was " << cs.ctg_avg_length() << " bp;\n"
440 << " " << cs.n_overlaps << " paired-end contigs overlapped the forward region ("
441 << as_percentage((double) cs.n_overlaps / cs.n_loci_ctg()) << ")\n"
442 << " Mean overlap: " << cs.mean_olap_length() << "bp; mean size of overlapped loci after merging: "
443 << cs.mean_olapd_locus_length() << ";\n"
444 << " Out of " << cs.n_tot_reads << " paired-end reads in these loci (mean "
445 << (double) cs.n_aln_reads / pe_ctg << " reads per locus),\n"
446 << " " << cs.n_aln_reads << " were successfuly aligned ("
447 << as_percentage((double) cs.n_aln_reads / cs.n_tot_reads) << ");\n"
448 << " Mean insert length was " << cs.insert_length_olap_mv.mean() << ", stdev: "
449 << cs.insert_length_olap_mv.sd_p() << " (based on aligned reads in overlapped loci).\n";
450 }
451
452 } else if (input_type == GStacksInputT::refbased_popmap || input_type == GStacksInputT::refbased_list) {
453 // Report statistics on the input BAM(s).
454 BamCLocBuilder::BamStats bam_stats {};
455 const vector<BamCLocBuilder::BamStats>& bam_stats_s = bam_cloc_builder->bam_stats_per_sample();
456 for (auto& bstats : bam_stats_s)
457 bam_stats += bstats;
458 size_t tot = bam_stats.n_primary + bam_stats.n_unmapped;
459 cout << "\n"
460 << "Read " << bam_stats.n_records << " BAM records:\n"
461 << " kept " << bam_stats.n_primary_kept() << " primary alignments ("
462 << as_percentage(bam_stats.n_primary_kept(), tot) << ")";
463 if (refbased_cfg.paired)
464 cout << ", of which " << bam_stats.n_primary_kept_read2 << " reverse reads";
465 cout << "\n"
466 << " skipped " << bam_stats.n_primary_mapq << " primary alignments with insufficient mapping qualities ("
467 << as_percentage((double) bam_stats.n_primary_mapq / tot) << ")\n"
468 << " skipped " << bam_stats.n_primary_softclipped << " excessively soft-clipped primary alignments ("
469 << as_percentage((double) bam_stats.n_primary_softclipped / tot) << ")\n"
470 << " skipped " << bam_stats.n_unmapped << " unmapped reads ("
471 << as_percentage((double) bam_stats.n_unmapped / tot) << ")\n";
472 if (bam_stats.n_secondary > 0 || bam_stats.n_supplementary > 0)
473 cout << " skipped some suboptimal (secondary/supplementary) alignment records\n";
474 if (refbased_cfg.ign_pe_reads)
475 cout << " ignored " << bam_stats.n_ignored_read2_recs << " READ2 records\n";
476
477 // Per sample BAM stats.
478 logger->x << "\n"
479 << "BEGIN bam_stats_per_sample\n"
480 << "sample"
481 "\trecords"
482 "\tprimary_kept"
483 "\tkept_frac"
484 "\tprimary_kept_read2"
485 "\tprimary_disc_mapq"
486 "\tprimary_disc_sclip"
487 "\tunmapped"
488 "\tsecondary"
489 "\tsupplementary\n";
490 size_t min_recs = SIZE_MAX;
491 size_t max_recs = 0;
492 double mean_recs = 0.0;
493 double min_keep_rate = DBL_MAX;
494 double max_keep_rate = 0.0;
495 assert(bam_stats_s.size() == bam_mpopi->samples().size());
496 for (size_t sample=0; sample<bam_stats_s.size(); ++sample) {
497 auto& bstats = bam_stats_s[sample];
498 x_fp3 << bam_mpopi->samples()[sample].name << '\t'
499 << bstats.n_records << '\t'
500 << bstats.n_primary_kept() << '\t'
501 << (double) bstats.n_primary_kept() / bstats.n_records << '\t'
502 << bstats.n_primary_kept_read2 << '\t'
503 << bstats.n_primary_mapq << '\t'
504 << bstats.n_primary_softclipped << '\t'
505 << bstats.n_unmapped << '\t'
506 << bstats.n_secondary << '\t'
507 << bstats.n_supplementary << '\n';
508 mean_recs += bstats.n_records;
509 if (bstats.n_records < min_recs)
510 min_recs = bstats.n_records;
511 if (bstats.n_records > max_recs)
512 max_recs = bstats.n_records;
513 double keep_rate = (double) bstats.n_primary_kept() / bstats.n_records;
514 if (keep_rate < min_keep_rate)
515 min_keep_rate = keep_rate;
516 if (keep_rate > max_keep_rate)
517 max_keep_rate = keep_rate;
518 }
519 logger->x << "END bam_stats_per_sample\n";
520 mean_recs /= bam_mpopi->samples().size();
521 o_fp1 << "\n"
522 << " Per-sample stats (details in '"
523 << logger->distribs_path.substr(logger->distribs_path.rfind('/') + 1)
524 << "'):\n"
525 << " read " << mean_recs << " records/sample (" << min_recs
526 << "-" << max_recs << ")\n"
527 << " kept " << as_percentage(min_keep_rate) << "-"
528 << as_percentage(max_keep_rate) << " of these\n";
529
530 // Report statistics on the loci that were built.
531 const BamCLocBuilder::LocStats& loc_stats = bam_cloc_builder->loc_stats();
532 cout << "\n"
533 << "Built " << loc_stats.n_loci_built << " loci comprising "
534 << loc_stats.n_fw_reads;
535 if (refbased_cfg.paired) {
536 o_fp1 << " forward reads and "
537 << loc_stats.n_read_pairs() << " matching paired-end reads;"
538 << " mean insert length was " << loc_stats.insert_lengths_mv.mean()
539 << " (sd: " << loc_stats.insert_lengths_mv.sd_p() << ").\n";
540 } else {
541 cout << " reads.\n";
542 }
543 }
544
545 if (rm_unpaired_reads) {
546 // Report statistics on pairs.
547 size_t n_unpaired = gt_stats.n_unpaired_reads_rm();
548 size_t n_pcr_dupl = gt_stats.n_read_pairs_pcr_dupl();
549 size_t n_used = gt_stats.n_read_pairs_used();
550 size_t n_remaining_loci = gt_stats.n_genotyped_loci;
551 cout << "Removed " << n_unpaired << " unpaired (forward) reads ("
552 << as_percentage((double) n_unpaired / (n_unpaired + n_used + n_pcr_dupl)) << "); kept "
553 << n_used + n_pcr_dupl << " read pairs in "
554 << n_remaining_loci << " loci.\n";
555 if (rm_pcr_duplicates) {
556 cout << "Removed " << n_pcr_dupl
557 << " read pairs whose insert length had already been seen in the same sample as putative PCR duplicates ("
558 << as_percentage((double) n_pcr_dupl / (n_pcr_dupl + n_used))
559 << "); kept " << n_used << " read pairs.\n";
560 } else {
561 assert(n_pcr_dupl == 0);
562 }
563 } else {
564 assert(gt_stats.n_unpaired_reads_rm() == 0 && gt_stats.n_read_pairs_pcr_dupl() == 0);
565 }
566
567 // Report statistics on genotyping and haplotyping.
568 size_t n_hap_attempts = hap_stats.n_halplotyping_attempts();
569 size_t n_hap_pairs = hap_stats.n_consistent_hap_pairs();
570 OnlineMeanVar ecov_mean;
571 double ecov_min = DBL_MAX;
572 double ecov_max = 0.0;
573 for (const GenotypeStats::PerSampleStats& sample : gt_stats.per_sample_stats) {
574 double cov = (double) sample.ns_weighted_n_read_pairs_used / sample.ns_cumsum;
575 ecov_mean.increment(cov);
576 if (cov < ecov_min)
577 ecov_min = cov;
578 if (cov > ecov_max)
579 ecov_max = cov;
580 }
581 o_fp1 << "\n"
582 << "Genotyped " << gt_stats.n_genotyped_loci << " loci:\n";
583 if (gt_stats.n_genotyped_loci == 0) {
584 cerr << "Error: There wasn't any locus to genotype; check input/arguments.\n";
585 throw exception();
586 }
587 o_fp1 << " effective per-sample coverage: mean=" << ecov_mean.mean() << "x, stdev="
588 << ecov_mean.sd_p() << "x, min=" << ecov_min << "x, max=" << ecov_max << "x\n"
589 << " mean number of sites per locus: " << gt_stats.mean_n_sites_per_loc() << "\n"
590 << " a consistent phasing was found for " << n_hap_pairs << " of out " << n_hap_attempts
591 << " (" << as_percentage((double) n_hap_pairs / n_hap_attempts)
592 << ") diploid loci needing phasing\n";
593
594 // effective_coverages_per_sample
595 logger->x << "\n"
596 "BEGIN effective_coverages_per_sample\n"
597 "# For mean_cov_ns, the coverage at each locus is weighted by the number of\n"
598 "# samples present at that locus (i.e. coverage at shared loci counts more).\n";
599 logger->x << "sample"
600 "\tn_loci"
601 "\tn_used_fw_reads"
602 "\tmean_cov"
603 "\tmean_cov_ns";
604 if (rm_unpaired_reads) {
605 logger->x << "\tn_unpaired_reads";
606 if (rm_pcr_duplicates) {
607 logger->x << "\tn_pcr_dupl_pairs"
608 "\tpcr_dupl_rate";
609 }
610 }
611 logger->x << '\n';
612 assert(gt_stats.per_sample_stats.size() == bam_mpopi->samples().size());
613 for (size_t sample=0; sample<bam_mpopi->samples().size(); ++sample) {
614 const GenotypeStats::PerSampleStats& stats = gt_stats.per_sample_stats[sample];
615 x_fp3 << bam_mpopi->samples()[sample].name
616 << '\t' << stats.n_loci_with_sample
617 << '\t' << stats.n_read_pairs_used
618 << '\t' << (double) stats.n_read_pairs_used / stats.n_loci_with_sample
619 << '\t' << (double) stats.ns_weighted_n_read_pairs_used / stats.ns_cumsum;
620 if (rm_unpaired_reads) {
621 x_fp3 << '\t' << stats.n_unpaired_reads;
622 if (rm_pcr_duplicates) {
623 x_fp3 << '\t' << stats.n_read_pairs_pcr_dupl
624 << '\t' << (double) stats.n_read_pairs_pcr_dupl / (stats.n_read_pairs_pcr_dupl + stats.n_read_pairs_used);
625 }
626 }
627 x_fp3 << '\n';
628 }
629 logger->x << "END effective_coverages_per_sample\n";
630
631 // pcr_clone_size_distrib
632 if(rm_pcr_duplicates) {
633 const vector<size_t>& cz_d = gt_stats.pcr_clone_size_distrib;
634 assert(!cz_d.empty());
635 assert(cz_d[0] == 0); // (We can't observe a clone of 0 reads.)
636 logger->x << "\n"
637 << "BEGIN pcr_clone_size_distrib\n"
638 <<"clone_size\tn_clones\tn_reads\n";
639 for(size_t clone_size=1; clone_size<cz_d.size(); ++clone_size)
640 logger->x << clone_size
641 << '\t' << cz_d[clone_size]
642 << '\t' << cz_d[clone_size] * clone_size
643 << '\n';
644 logger->x << "END pcr_clone_size_distrib\n";
645 }
646
647 // phasing_rates_samples
648 logger->x << "\n"
649 << "BEGIN phasing_rates_per_sample\n"
650 << "sample\tn_gts\tn_multisnp_hets\tn_phased\tmisphasing_rate"
651 #ifdef DEBUG
652 << "\tn_phased_2ndpass"
653 #endif
654 << "\n";
655
656 assert(hap_stats.per_sample_stats.size() == bam_mpopi->samples().size());
657 for (size_t sample=0; sample<bam_mpopi->samples().size(); ++sample) {
658 const HaplotypeStats::PerSampleStats& stats = hap_stats.per_sample_stats[sample];
659
660 x_fp3 << bam_mpopi->samples()[sample].name
661 << '\t' << stats.n_diploid_loci
662 << '\t' << stats.n_hets_2snps
663 << '\t' << stats.n_phased
664 << '\t' << 1.0 - (double) stats.n_phased / stats.n_hets_2snps
665 #ifdef DEBUG
666 << '\t' << stats.n_phased_2ndpass
667 #endif
668 << '\n';
669 }
670 logger->x << "END phasing_rates_per_sample\n";
671
672 // phasing_rates_loci
673 #ifdef DEBUG
674 logger->x << "\n"
675 << "BEGIN phasing_rates_loci\n"
676 << "n_hets_2snps\tn_bad_hets\tn_loci\n";
677 for (auto& elem : hap_stats.n_badly_phased_samples)
678 logger->x << elem.first.second
679 << '\t' << elem.first.second - elem.first.first
680 << '\t' << elem.second
681 << '\n';
682 logger->x << "END phasing_rates_loci\n";
683 #endif
684
685 // Report clockings.
686 {
687 double ll = t_parallel.elapsed();
688 double v = t_writing_vcf.elapsed();
689
690 double r = t_threads_totals.reading.elapsed() / num_threads;
691 double p = t_threads_totals.processing.elapsed() / num_threads;
692 double w_f = t_threads_totals.writing_fa.elapsed() / num_threads;
693 double w_v = t_threads_totals.writing_vcf.elapsed() / num_threads;
694 double w_d = t_threads_totals.writing_details.elapsed() / num_threads;
695 double w_b = t_threads_totals.writing_bams.elapsed() / num_threads;
696
697 double ppr = t_threads_totals.processing_pre_alns.elapsed() / num_threads;
698 double rn = t_threads_totals.rm_Ns.elapsed() / num_threads;
699 double as = t_threads_totals.assembling.elapsed() / num_threads;
700 double ia = t_threads_totals.init_alignments.elapsed() / num_threads;
701 double al = t_threads_totals.aligning.elapsed() / num_threads;
702 double me = t_threads_totals.merge_paired_reads.elapsed() / num_threads;
703 double ppo = t_threads_totals.processing_post_alns.elapsed() / num_threads;
704 double rr = t_threads_totals.rm_reads.elapsed() / num_threads;
705 double cnt = t_threads_totals.counting_nts.elapsed() / num_threads;
706 double g = t_threads_totals.genotyping.elapsed() / num_threads;
707 double h = t_threads_totals.haplotyping.elapsed() / num_threads;
708 double u = t_threads_totals.cpt_consensus.elapsed() / num_threads;
709 double b_v = t_threads_totals.building_vcf.elapsed() / num_threads;
710 double b_f = t_threads_totals.building_fa.elapsed() / num_threads;
711 double b_b = t_threads_totals.building_bams.elapsed() / num_threads;
712
713 double c = t_parallel.consumed()
714 + t_threads_totals.reading.consumed() / num_threads
715 + t_threads_totals.processing.consumed() / num_threads
716 + t_threads_totals.writing_fa.consumed() / num_threads
717 + t_threads_totals.writing_vcf.consumed() / num_threads
718 + t_threads_totals.writing_details.consumed() / num_threads
719 + t_threads_totals.writing_bams.consumed() / num_threads
720 + t_threads_totals.processing_pre_alns.consumed() / num_threads
721 + t_threads_totals.rm_Ns.consumed() / num_threads
722 + t_threads_totals.assembling.consumed() / num_threads
723 + t_threads_totals.init_alignments.consumed() / num_threads
724 + t_threads_totals.aligning.consumed() / num_threads
725 + t_threads_totals.merge_paired_reads.consumed() / num_threads
726 + t_threads_totals.processing_post_alns.consumed() / num_threads
727 + t_threads_totals.rm_reads.consumed() / num_threads
728 + t_threads_totals.counting_nts.consumed() / num_threads
729 + t_threads_totals.genotyping.consumed() / num_threads
730 + t_threads_totals.haplotyping.consumed() / num_threads
731 + t_threads_totals.cpt_consensus.consumed() / num_threads
732 + t_threads_totals.building_vcf.consumed() / num_threads
733 + t_threads_totals.building_fa.consumed() / num_threads
734 + t_threads_totals.building_bams.consumed() / num_threads
735 + t_writing_vcf.consumed()
736 ;
737
738 x_fp1 << "\n"
739 << "BEGIN clockings\n"
740 << "Num. threads: " << num_threads << "\n"
741 << "Parallel time: " << ll << "\n"
742 << "Average thread time spent:\n"
743 << std::setw(8) << r << " reading (" << as_percentage(r / ll) << ")\n"
744 << std::setw(8) << p << " processing (" << as_percentage(p / ll) << ")\n";
745 if (as != 0.0)
746 // De novo mode & paired-ends.
747 x_fp1
748 << std::setw(16) << ppr << " pre-alignments block (" << as_percentage(ppr / ll) << ")\n"
749 << std::setw(16) << rn << " reformatting fw-reads (" << as_percentage(rn / ll) << ")\n"
750 << std::setw(16) << as << " assembling (" << as_percentage(as / ll) << ")\n"
751 << std::setw(16) << ia << " initializing alignments (" << as_percentage(ia / ll) << ")\n"
752 << std::setw(16) << al << " aligning (" << as_percentage(al / ll) << ")\n"
753 << std::setw(16) << me << " merging read pairs (" << as_percentage(me / ll) << ")\n"
754 ;
755 x_fp1
756 << std::setw(16) << ppo << " post-alignments block (" << as_percentage(ppo / ll) << ")\n"
757 << std::setw(16) << rr << " filtering reads (" << as_percentage(rr / ll) << ")\n"
758 << std::setw(16) << cnt << " counting nucleotides (" << as_percentage(cnt / ll) << ")\n"
759 << std::setw(16) << g << " genotyping (" << as_percentage(g / ll) << ")\n"
760 << std::setw(16) << h << " haplotyping (" << as_percentage(h / ll) << ")\n"
761 << std::setw(16) << u << " computing consensus (" << as_percentage(u / ll) << ")\n"
762 << std::setw(16) << b_f << " building_fa (" << as_percentage(b_f / ll) << ")\n"
763 << std::setw(16) << b_v << " building_vcf (" << as_percentage(b_v / ll) << ")\n";
764 if (bam_output)
765 x_fp1 << std::setw(16) << b_b << " building_bam (" << as_percentage(b_b / ll) << ")\n"
766 << std::setw(8) << w_b << " writing_bam (" << as_percentage(w_b / ll) << ")\n";
767 x_fp1 << std::setw(8) << w_f << " writing_fa (" << as_percentage(w_f / ll) << ")\n"
768 << std::setw(8) << w_v << " writing_vcf (" << as_percentage(w_v / ll) << ")\n";
769 if (detailed_output)
770 x_fp1 << std::setw(8) << w_d << " writing_details (" << as_percentage(w_d / ll) << ")\n";
771 x_fp1 << std::setw(8) << c << " clocking (" << as_percentage(c / ll) << ")\n"
772 << "Total time spent writing vcf: " << v << " (" << as_percentage(v / ll) << ")\n"
773 << "VCFwrite block size: mean=" << (double) gt_stats.n_genotyped_loci / n_writes
774 << "(n=" << n_writes << "); max=" << max_size_before_write << "\n"
775 << "END clockings\n";
776 }
777
778 #ifdef DEBUG
779 const MarukiLowModel* m = dynamic_cast<const MarukiLowModel*>(model.get());
780 if (m != NULL) {
781 // Report how often the "underflow" likelihood equations were used
782 cout << "\n"
783 << "DEBUG: marukilow: calc_ln_weighted_sums: "
784 << m->n_wsum_tot() << " calls, "
785 << m->n_wsum_underflows() << " underflows ("
786 << as_percentage(m->n_wsum_underflows(), m->n_wsum_tot())
787 << ").\n";
788 cout << "DEBUG: marukilow: mean_err_rate: "
789 << m->mean_err_rate() << '\n';
790 }
791 #endif
792
793 //
794 // Cleanup & return.
795 //
796
797 gzclose(o_gzfasta_f);
798 o_vcf_f->file().close();
799 o_vcf_f.reset();
800 for (unique_ptr<Bam>& b : bam_of_ptrs) {
801 b->close();
802 b.reset();
803 }
804 if (o_details_f) {
805 o_details_f->close();
806 o_details_f.reset();
807 }
808 model.reset();
809 if (dbg_write_hapgraphs)
810 o_hapgraphs_f << "}\n";
811 cout << "\ngstacks is done.\n";
812 return 0;
813
814 } catch (exception& e) {
815 return stacks_handle_exceptions(e);
816 }
817 }
818
819
820 //
821 // SnpAlleleCooccurrenceCounter
822 // ============================
823 //
824
at(size_t snp_i1,Nt2 snp1_allele,size_t snp_i2,Nt2 snp2_allele) const825 const size_t& SnpAlleleCooccurrenceCounter::at(size_t snp_i1, Nt2 snp1_allele, size_t snp_i2, Nt2 snp2_allele) const {
826 assert(snp_i1 < snp_i2);
827 return cooccurrences_[snp_i1*n_snps_+snp_i2][size_t(snp1_allele)][size_t(snp2_allele)];
828 }
829
clear()830 void SnpAlleleCooccurrenceCounter::clear() {
831 for(size_t i=0; i<n_snps_; ++i)
832 for(size_t j=i+1; j<n_snps_; ++j)
833 cooccurrences_[i*n_snps_+j] = array<array<size_t,4>,4>();
834 }
835
836 //
837 // GenotypeStats, HaplotypeStats & ContigStats
838 // ===============
839 //
840
operator +=(const GenotypeStats & other)841 GenotypeStats& GenotypeStats::operator+= (const GenotypeStats& other) {
842 this->n_genotyped_loci += other.n_genotyped_loci;
843 this->n_sites_tot += other.n_sites_tot;
844 if (pcr_clone_size_distrib.size() < other.pcr_clone_size_distrib.size())
845 pcr_clone_size_distrib.resize(other.pcr_clone_size_distrib.size());
846 for (size_t i=0; i<other.pcr_clone_size_distrib.size(); ++i)
847 pcr_clone_size_distrib[i] += other.pcr_clone_size_distrib[i];
848 for (size_t sample=0; sample<per_sample_stats.size(); ++sample) {
849 per_sample_stats[sample].n_unpaired_reads += other.per_sample_stats[sample].n_unpaired_reads;
850 per_sample_stats[sample].n_read_pairs_pcr_dupl += other.per_sample_stats[sample].n_read_pairs_pcr_dupl;
851 per_sample_stats[sample].n_read_pairs_used += other.per_sample_stats[sample].n_read_pairs_used;
852 per_sample_stats[sample].n_loci_with_sample += other.per_sample_stats[sample].n_loci_with_sample;
853 per_sample_stats[sample].ns_cumsum += other.per_sample_stats[sample].ns_cumsum;
854 per_sample_stats[sample].ns_weighted_n_read_pairs_used += other.per_sample_stats[sample].ns_weighted_n_read_pairs_used;
855 }
856 return *this;
857 }
858
operator +=(const HaplotypeStats & other)859 HaplotypeStats& HaplotypeStats::operator+= (const HaplotypeStats& other) {
860 for (auto elem : other.n_badly_phased_samples)
861 this->n_badly_phased_samples[elem.first] += elem.second;
862 assert(per_sample_stats.size() == other.per_sample_stats.size());
863 for (size_t sample=0; sample<per_sample_stats.size(); ++sample) {
864 per_sample_stats[sample].n_diploid_loci += other.per_sample_stats[sample].n_diploid_loci;
865 per_sample_stats[sample].n_hets_2snps += other.per_sample_stats[sample].n_hets_2snps;
866 per_sample_stats[sample].n_phased += other.per_sample_stats[sample].n_phased;
867 per_sample_stats[sample].n_phased_2ndpass += other.per_sample_stats[sample].n_phased_2ndpass;
868 }
869 return *this;
870 }
871
n_halplotyping_attempts() const872 size_t HaplotypeStats::n_halplotyping_attempts() const {
873 size_t n1 = 0;
874 for (auto& s: per_sample_stats)
875 n1 += s.n_hets_2snps;
876 size_t n2 = 0;
877 for (auto& e: n_badly_phased_samples)
878 n2+= e.second * e.first.second;
879 assert(n1==n2);
880 return n1;
881 }
882
n_consistent_hap_pairs() const883 size_t HaplotypeStats::n_consistent_hap_pairs() const {
884 size_t n1 = 0;
885 for (auto& s: per_sample_stats)
886 n1 += s.n_phased;
887 size_t n2 = 0;
888 for (auto& e: n_badly_phased_samples)
889 n2 += e.second * e.first.first;
890 assert(n1==n2);
891 return n1;
892 }
893
operator +=(const ContigStats & other)894 ContigStats& ContigStats::operator+= (const ContigStats& other) {
895 this->n_nonempty_loci += other.n_nonempty_loci;
896 this->n_loci_w_pe_reads += other.n_loci_w_pe_reads;
897 this->n_loci_almost_no_pe_reads += other.n_loci_almost_no_pe_reads;
898 this->n_loci_pe_graph_not_dag += other.n_loci_pe_graph_not_dag;
899 this->n_loci_pe_graph_fixed_to_dag += other.n_loci_pe_graph_fixed_to_dag;
900 this->length_ctg_tot += other.length_ctg_tot;
901 this->n_aln_reads += other.n_aln_reads;
902 this->n_tot_reads += other.n_tot_reads;
903 this->n_overlaps += other.n_overlaps;
904 this->length_overlap_tot += other.length_overlap_tot;
905 this->length_olapd_loci_tot += other.length_olapd_loci_tot;
906 this->insert_length_olap_mv += other.insert_length_olap_mv;
907
908 return *this;
909 }
910
911 //
912 // LocData
913 // =======
914 //
clear()915 void LocData::clear() {
916 id = -1;
917 pos.clear();
918 mpopi = NULL;
919 ctg_status = unknown;
920 olap_length = SIZE_MAX;
921 o_vcf.clear();
922 o_fa.clear();
923 o_details.clear();
924 details_ss.clear();
925 details_ss.str(string());
926 }
927
928 //
929 // LocusProcessor
930 // ==============
931 //
932
933 void
process(CLocReadSet & loc)934 LocusProcessor::process(CLocReadSet& loc)
935 {
936 timers_.processing_pre_alns.restart();
937 loc_.clear();
938 if (dbg_denovo_min_loc_samples > 0
939 && loc.n_samples() < dbg_denovo_min_loc_samples)
940 loc.clear();
941 if (loc.reads().empty()) {
942 timers_.processing_pre_alns.update();
943 return;
944 }
945 if (detailed_output)
946 loc_.details_ss << "BEGIN locus " << loc.id() << "\n";
947
948 ++ctg_stats_.n_nonempty_loci;
949 if (!loc.pe_reads().empty())
950 ++ctg_stats_.n_loci_w_pe_reads;
951 loc_.id = loc.id();
952 loc_.pos = loc.pos();
953 loc_.mpopi = &loc.mpopi();
954
955 CLocAlnSet aln_loc;
956 aln_loc.reinit(loc_.id, loc_.pos, loc_.mpopi);
957
958 //
959 // Remove N's in the forward reads.
960 //
961 timers_.rm_Ns.restart();
962 for (SRead& r : loc.reads())
963 r.seq.remove_Ns();
964 timers_.rm_Ns.update();
965
966 //
967 // Assemble a contig.
968 //
969 timers_.assembling.restart();
970 if (detailed_output)
971 loc_.details_ss << "BEGIN contig\n";
972 DNASeq4 ctg = assemble_locus_contig(loc.reads(), loc.pe_reads());
973 timers_.assembling.update();
974 timers_.processing_pre_alns.update();
975 if (ctg.empty()) {
976 if (detailed_output) {
977 loc_.details_ss << "contig_assembly_failed\n"
978 << "END contig\n"
979 << "END locus " << loc_.id << '\n';
980 loc_.o_details = loc_.details_ss.str();
981 }
982 return;
983 }
984 aln_loc.ref(move(ctg));
985 timers_.assembling.update();
986
987 timers_.init_alignments.restart();
988 SuffixTree* stree = new SuffixTree(aln_loc.ref());
989 stree->build_tree();
990 GappedAln aligner;
991 AlignRes aln_res;
992 if (!loc.pe_reads().empty()) {
993 //
994 // Check that the contig starts at the cutsite (and not in the
995 // adapter), by aligning one FW read.
996 //
997 for (SRead& r : loc.reads()) {
998 if (!align_reads_to_contig(stree, &aligner, r.seq, aln_res))
999 continue;
1000 // Read did align. Check start position & break.
1001 if (aln_res.subj_pos > 0) {
1002 DNASeq4 new_ctg;
1003 DNASeq4::iterator itr = aln_loc.ref().begin();
1004 for (size_t i=0; i<aln_res.subj_pos; ++i) {
1005 assert(itr != aln_loc.ref().end());
1006 ++itr;
1007 }
1008 new_ctg.append(itr, aln_loc.ref().end());
1009 if (detailed_output)
1010 loc_.details_ss
1011 << "orig_contig\t" << aln_loc.ref() << '\n'
1012 << "first_fw_read_aln\t" << r.name << "\tpos:" << aln_res.subj_pos << '\n';
1013 aln_loc.ref(move(new_ctg));
1014 delete stree;
1015 stree = new SuffixTree(aln_loc.ref());
1016 stree->build_tree();
1017 }
1018 break;
1019 }
1020 }
1021 ctg_stats_.length_ctg_tot += aln_loc.ref().length();
1022 if (detailed_output)
1023 loc_.details_ss << "contig\t" << aln_loc.ref() << '\n'
1024 << "END contig\n";
1025 timers_.init_alignments.update();
1026
1027 //
1028 // Align the reads.
1029 //
1030 timers_.aligning.restart();
1031 this->ctg_stats_.n_tot_reads += loc.reads().size();
1032 this->ctg_stats_.n_tot_reads += loc.pe_reads().size();
1033 if (detailed_output)
1034 loc_.details_ss << "BEGIN pe_alns\n";
1035 for (SRead& r : loc.reads()) { // FORWARD READS
1036 if(add_read_to_aln(aln_loc, aln_res, move(r), &aligner, stree)) {
1037 this->ctg_stats_.n_aln_reads++;
1038 if (detailed_output)
1039 loc_.details_ss << "fw_aln_local"
1040 << '\t' << aln_loc.reads().back().name
1041 << '\t' << aln_res.subj_pos + 1 << ':' << aln_res.cigar
1042 << '\n';
1043 } else {
1044 if (detailed_output)
1045 // `r` hasn't been moved.
1046 loc_.details_ss << "fw_aln_fail\t" << r.name << '\n';
1047 }
1048 }
1049 for (SRead& r : loc.pe_reads()) {
1050 if (add_read_to_aln(aln_loc, aln_res, move(r), &aligner, stree)) {
1051 this->ctg_stats_.n_aln_reads++;
1052 if (loc_.ctg_status == LocData::overlapped) {
1053 // Record the insert length. (Insert lengths are just based on
1054 // reverse reads but they should all have matching foward reads
1055 // starting at the restriction site so it makes sense.)
1056 const Cigar& cigar = aln_loc.reads().back().cigar;
1057 size_t insert_length = aln_loc.ref().length();
1058 assert(!cigar.empty());
1059 if (cigar.back().first == 'D') {
1060 insert_length -= cigar.back().second;
1061 if (cigar.size() >=2 && (*++cigar.rbegin()).first == 'I')
1062 // Soft clipping on the far end.
1063 insert_length += (*++cigar.rbegin()).second;
1064 }
1065 ctg_stats_.insert_length_olap_mv.increment(insert_length);
1066 }
1067 if (detailed_output)
1068 loc_.details_ss << "pe_aln_local"
1069 << '\t' << aln_loc.reads().back().name
1070 << '\t' << aln_res.subj_pos + 1 << ':' << aln_res.cigar
1071 << '\n';
1072 } else {
1073 if (detailed_output)
1074 // `r` hasn't been moved.
1075 loc_.details_ss << "pe_aln_fail\t" << r.name << '\n';
1076 }
1077 }
1078 if (detailed_output)
1079 loc_.details_ss << "END pe_alns\n";
1080 delete stree;
1081 timers_.aligning.update();
1082
1083 if (bam_output) {
1084 timers_.building_bams.restart();
1085 assert(input_type == GStacksInputT::denovo_popmap || input_type == GStacksInputT::denovo_merger);
1086 const bool merged_input = input_type == GStacksInputT::denovo_merger;
1087 if (loc_.o_bam.empty())
1088 loc_.o_bam.resize(merged_input ? 1 : loc_.mpopi->n_samples());
1089 aln_loc.sort_by_alignment_offset();
1090 for (const SAlnRead& read : aln_loc.reads()) {
1091 BamRecord rec;
1092 assert(read.name.length() >= 2 && *++read.name.rbegin() == '/');
1093 Cigar c = read.cigar;
1094 size_t offset = 0;
1095 assert(!c.empty());
1096 if (c.back().first == 'D')
1097 c.pop_back();
1098 else if (c.back().first == 'I' && c.size() >= 2 && (*----c.end()).first == 'D')
1099 c.erase(----c.end());
1100 if (c[0].first == 'D') {
1101 offset = c[0].second;
1102 c.erase(c.begin());
1103 } else if (c[0].first == 'I' && c.size() >= 2 && c[1].first == 'D') {
1104 offset = c[1].second;
1105 c.erase(++c.begin());
1106 }
1107 rec.assign(
1108 read.name.substr(0, read.name.length() - 2),
1109 *read.name.rbegin() == '1' ? BAM_FREAD1 : BAM_FREAD2,
1110 aln_loc.id(),
1111 offset,
1112 c,
1113 read.seq,
1114 read.sample
1115 );
1116 loc_.o_bam.at(merged_input ? 0 : read.sample).push_back(move(rec));
1117 }
1118 timers_.building_bams.update();
1119 }
1120
1121 timers_.merge_paired_reads.restart();
1122 aln_loc.merge_paired_reads();
1123 timers_.merge_paired_reads.update();
1124 loc.clear();
1125 timers_.processing_pre_alns.update();
1126 if (aln_loc.reads().empty()) {
1127 if (detailed_output)
1128 loc_.details_ss << "no_reads_were_aligned\n";
1129 return;
1130 }
1131 process(aln_loc);
1132 }
1133
1134 void
process(CLocAlnSet & aln_loc)1135 LocusProcessor::process(CLocAlnSet& aln_loc)
1136 {
1137 assert(!aln_loc.reads().empty());
1138 timers_.processing_post_alns.restart();
1139 if (input_type == GStacksInputT::denovo_popmap || input_type == GStacksInputT::denovo_merger) {
1140 // Called from process(CLocReadSet&).
1141 assert(this->loc_.id == aln_loc.id());
1142 } else {
1143 this->loc_.clear();
1144 this->loc_.id = aln_loc.id();
1145 this->loc_.pos = aln_loc.pos();
1146 this->loc_.mpopi = &aln_loc.mpopi();
1147 if (detailed_output)
1148 this->loc_.details_ss << "BEGIN locus " << loc_.id << "\n";
1149 }
1150
1151 //
1152 // Remove unpaired reads/PCR duplicates.
1153 //
1154 timers_.rm_reads.restart();
1155 if (rm_unpaired_reads) {
1156 for (size_t sample=0; sample<gt_stats_.per_sample_stats.size(); ++sample)
1157 gt_stats_.per_sample_stats[sample].n_unpaired_reads += aln_loc.sample_reads(sample).size();
1158 aln_loc.remove_unmerged_reads(detailed_output ? &this->loc_.details_ss : NULL);
1159 for (size_t sample=0; sample<gt_stats_.per_sample_stats.size(); ++sample)
1160 gt_stats_.per_sample_stats[sample].n_unpaired_reads -= aln_loc.sample_reads(sample).size();
1161 if(aln_loc.reads().empty()) {
1162 loc_.clear();
1163 timers_.processing_post_alns.update();
1164 return;
1165 }
1166 }
1167 if (rm_pcr_duplicates) {
1168 assert(rm_unpaired_reads);
1169 for (size_t sample=0; sample<gt_stats_.per_sample_stats.size(); ++sample)
1170 gt_stats_.per_sample_stats[sample].n_read_pairs_pcr_dupl += aln_loc.sample_reads(sample).size();
1171 aln_loc.remove_pcr_duplicates(>_stats_.pcr_clone_size_distrib,
1172 detailed_output ? &this->loc_.details_ss : NULL);
1173 for (size_t sample=0; sample<gt_stats_.per_sample_stats.size(); ++sample)
1174 gt_stats_.per_sample_stats[sample].n_read_pairs_pcr_dupl -= aln_loc.sample_reads(sample).size();
1175 }
1176 timers_.rm_reads.update();
1177 assert(!aln_loc.reads().empty());
1178 ++gt_stats_.n_genotyped_loci;
1179 size_t n_samples = aln_loc.n_samples();
1180 for (size_t sample=0; sample<gt_stats_.per_sample_stats.size(); ++sample) {
1181 if (!aln_loc.sample_reads(sample).empty()) {
1182 auto& s = gt_stats_.per_sample_stats[sample];
1183 ++s.n_loci_with_sample;
1184 s.n_read_pairs_used += aln_loc.sample_reads(sample).size();
1185 s.ns_cumsum += n_samples;
1186 s.ns_weighted_n_read_pairs_used += n_samples * aln_loc.sample_reads(sample).size();
1187 }
1188 }
1189 if (detailed_output) {
1190 loc_.details_ss << "BEGIN aln_matrix\n";
1191 for (const SAlnRead& read : aln_loc.reads())
1192 loc_.details_ss << read.name << '\t' << loc_.mpopi->samples()[read.sample].name
1193 << '\t' << read.cigar << '\n';
1194 loc_.details_ss << "END aln_matrix\n";
1195 }
1196
1197 //
1198 // Get the nucleotide counts at each position.
1199 //
1200 timers_.counting_nts.restart();
1201 vector<SiteCounts> depths;
1202 depths.reserve(aln_loc.ref().length());
1203 for (CLocAlnSet::site_iterator site (aln_loc); bool(site); ++site) {
1204 depths.push_back(site.counts());
1205 if (depths.back().tot.sum() > 0)
1206 ++gt_stats_.n_sites_tot; // Sites "with data".
1207 }
1208 assert(depths.size() == aln_loc.ref().length());
1209 timers_.counting_nts.update();
1210
1211 //
1212 // Call SNPs.
1213 //
1214 timers_.genotyping.restart();
1215 vector<SiteCall> calls;
1216 calls.reserve(aln_loc.ref().length());
1217 for (const SiteCounts& site_depths : depths) {
1218 calls.push_back(model->call(site_depths));
1219 // Make sure our SNP and genotype calls are consistent (discard SNPs with
1220 // a MAC of 0, or without calls at all).
1221 if (calls.back().alleles().size() > 1)
1222 calls.back().filter_mac(1);
1223 }
1224 timers_.genotyping.update();
1225
1226 //
1227 // Call haplotypes; amend SNP/genotype calls.
1228 //
1229 vector<map<size_t,PhasedHet>> phase_data;
1230 if (!dbg_no_haplotypes) {
1231 timers_.haplotyping.restart();
1232 phase_hets(phase_data, calls, aln_loc, hap_stats_);
1233 timers_.haplotyping.update();
1234 }
1235
1236 //
1237 // Determine the consensus sequence.
1238 //
1239 timers_.cpt_consensus.restart();
1240 DNASeq4 consensus (aln_loc.ref().length());
1241 assert(calls.size() == aln_loc.ref().length());
1242 assert(depths.size() == aln_loc.ref().length());
1243 for (size_t i = 0; i < aln_loc.ref().length(); ++i) {
1244 if (!calls[i].alleles().empty()) {
1245 consensus.set(i, calls[i].most_frequent_allele());
1246 } else {
1247 // Use the majority-rule nucleotide.
1248 // (For the high/low Maruki" models this actually only happens when
1249 // there is no coverage; for the Hohenlohe model it may also happen
1250 // when there isn't a single significant call.)
1251 pair<size_t,Nt2> nt = depths[i].tot.sorted()[0];
1252 if (nt.first > 0)
1253 consensus.set(i, nt.second);
1254 }
1255 }
1256 aln_loc.ref(move(consensus));
1257 timers_.cpt_consensus.update();
1258
1259 //
1260 // Create the outputs.
1261 //
1262 write_one_locus(aln_loc, depths, calls, phase_data);
1263
1264 if (detailed_output) {
1265 loc_.details_ss << "END locus " << loc_.id << "\n";
1266 loc_.o_details = loc_.details_ss.str();
1267 }
1268 if (dbg_write_alns) {
1269 #pragma omp critical(dbg_alns)
1270 o_aln_f << "BEGIN " << loc_.id << "\n"
1271 << aln_loc
1272 << "\nEND " << loc_.id << "\n";
1273 }
1274 timers_.processing_post_alns.update();
1275 }
1276
1277 bool
align_reads_to_contig(SuffixTree * st,GappedAln * g_aln,DNASeq4 & enc_query,AlignRes & aln_res) const1278 LocusProcessor::align_reads_to_contig(SuffixTree *st, GappedAln *g_aln, DNASeq4 &enc_query, AlignRes &aln_res) const
1279 {
1280 vector<STAln> alns, final_alns;
1281 vector<pair<size_t, size_t> > step_alns;
1282 string query = enc_query.str();
1283 const char *q = query.c_str();
1284 const char *q_stop = q + query.length();
1285 size_t q_pos = 0;
1286 size_t id = 0;
1287 char c[id_len];
1288
1289 do {
1290 step_alns.clear();
1291
1292 q_pos = q - query.c_str();
1293
1294 st->align(q, step_alns);
1295
1296 if (step_alns.size() == 0 || step_alns.size() > max_fragment_alns) {
1297 q++;
1298 } else {
1299 for (uint i = 0; i < step_alns.size(); i++)
1300 alns.push_back(STAln(id, q_pos, step_alns[i].first, step_alns[i].second));
1301 q += step_alns[0].second + 1;
1302 id++;
1303 }
1304 } while (q < q_stop);
1305
1306 //
1307 // No alignments to the suffix tree were found.
1308 //
1309 if (alns.size() == 0)
1310 return false;
1311
1312 //
1313 // Perfect alignmnet to the suffix tree. Return result.
1314 //
1315 if (alns.size() == 1 && alns[0].aln_len == query.length()) {
1316 snprintf(c, id_len, "%luM", query.length());
1317 aln_res.cigar = c;
1318 aln_res.subj_pos = alns[0].subj_pos;
1319 return true;
1320 }
1321
1322 //
1323 // Find a consistent set of suffix tree alignments, ordered in a directed, acyclic grapgh (DAG).
1324 //
1325 this->suffix_tree_hits_to_dag(query.length(), alns, final_alns);
1326
1327 g_aln->init(query.length(), st->seq_len(), true);
1328 if (g_aln->align_constrained(query, st->seq_str(), final_alns)) {
1329 aln_res = g_aln->result();
1330 }
1331
1332 if (aln_res.pct_id < min_aln_cov)
1333 return false;
1334
1335 return true;
1336 }
1337
1338 int
suffix_tree_hits_to_dag(size_t query_len,vector<STAln> & alns,vector<STAln> & final_alns) const1339 LocusProcessor::suffix_tree_hits_to_dag(size_t query_len, vector<STAln> &alns, vector<STAln> &final_alns) const
1340 {
1341 //
1342 // 1. Sort the alignment fragments so they are primarily ordered by subject position, secondarily by query position.
1343 //
1344 sort(alns.begin(), alns.end(), [](STAln a, STAln b)
1345 {
1346 if (a.subj_pos == b.subj_pos)
1347 return a.query_pos < b.query_pos;
1348 else
1349 return a.subj_pos < b.subj_pos;
1350 });
1351
1352 int gap_len, end_pos;
1353 double score, scale;
1354 bool term;
1355 vector<size_t> term_nodes;
1356 //
1357 // 2. Traverse the list of fragments and add links to reachable nodes, recording the score.
1358 // 2.1. A successor node is reachable from the predecessor node if both the query and subject
1359 // positions are advanced relative to the predecessor, and there is no sequence overlap
1360 // between the two nodes.
1361 // 2.2. We weight the score by the number of 'gap' nucleotides between the two fragments to
1362 // penalize fragments that are further away.
1363 // 2.3 Record the maximal link in the graph.
1364 // 2.4. Determine if a node is a terminal node.
1365 //
1366 for (uint i = 0; i < alns.size(); i++) {
1367 end_pos = alns[i].subj_pos + alns[i].aln_len - 1;
1368 term = true;
1369
1370 assert(end_pos > 0);
1371
1372 for (uint j = i + 1; j < alns.size(); j++) {
1373 if (alns[j].query_pos > alns[i].query_pos &&
1374 alns[j].subj_pos > alns[i].subj_pos &&
1375 alns[j].subj_pos > (uint) end_pos) {
1376
1377 term = false;
1378 gap_len = alns[j].subj_pos - end_pos - 1;
1379 scale = gap_len > (int) query_len ? 1 : alns[i].aln_len * ((double) gap_len / (double) query_len);
1380 score = alns[i].aln_len - scale; // Raw score.
1381 score += alns[i].max._score; // Score adjusted for the highest incoming node.
1382
1383 assert(gap_len >= 0);
1384 assert(score >= 0);
1385
1386 alns[j].links.push_back(STLink(i, score));
1387 if (alns[j].max._index == j || score > alns[j].max._score)
1388 alns[j].max = STLink(i, score);
1389 }
1390 }
1391 if (term)
1392 term_nodes.push_back(i);
1393 }
1394
1395 //
1396 // 3. Find the terminal node with the highest score.
1397 //
1398 double max_score = alns[term_nodes.front()].max._score;
1399 size_t max_index = term_nodes.front();
1400 for (uint i = 1; i < term_nodes.size(); i++) {
1401 if (alns[term_nodes[i]].max._score > max_score) {
1402 max_index = term_nodes[i];
1403 max_score = alns[term_nodes[i]].max._score;
1404 }
1405 }
1406
1407 //
1408 // 4. Backtrack to get the optimal path.
1409 //
1410 vector<size_t> optimal;
1411
1412 if (max_score == 0 && max_index == 0) {
1413 //
1414 // None of the fragments were connected in the DAG, select the largest fragment.
1415 //
1416 max_score = alns[term_nodes.front()].aln_len;
1417 max_index = term_nodes.front();
1418 for (uint i = 1; i < term_nodes.size(); i++)
1419 if (alns[i].aln_len > max_score) {
1420 max_index = term_nodes[i];
1421 max_score = alns[term_nodes[i]].aln_len;
1422 }
1423 optimal.push_back(max_index);
1424
1425 } else {
1426
1427 uint n = max_index;
1428 while (alns[n].links.size() > 0) {
1429 optimal.push_back(n);
1430 n = alns[n].max._index;
1431 }
1432 optimal.push_back(n);
1433 }
1434 assert(optimal.size() > 0);
1435
1436 final_alns.clear();
1437 for (int i = optimal.size() - 1; i >= 0; i--)
1438 final_alns.push_back(alns[optimal[i]]);
1439
1440 return 0;
1441 }
1442
1443 size_t
find_locus_overlap(SuffixTree * stree,GappedAln * g_aln,const DNASeq4 & se_consensus,string & overlap_cigar) const1444 LocusProcessor::find_locus_overlap(SuffixTree *stree, GappedAln *g_aln, const DNASeq4 &se_consensus, string &overlap_cigar) const
1445 {
1446 vector<STAln> alns, final_alns;
1447 vector<pair<size_t, size_t> > step_alns;
1448 AlignRes aln_res;
1449
1450 string query = se_consensus.str();
1451 const char *q = query.c_str();
1452 const char *q_stop = q + query.length();
1453 size_t q_pos = 0;
1454 size_t id = 1;
1455
1456 do {
1457 step_alns.clear();
1458
1459 q_pos = q - query.c_str();
1460
1461 stree->align(q, step_alns);
1462
1463 if (step_alns.size() == 0 || step_alns.size() > max_fragment_alns) {
1464 q++;
1465 } else {
1466 for (uint i = 0; i < step_alns.size(); i++)
1467 alns.push_back(STAln(id, q_pos, step_alns[i].first, step_alns[i].second));
1468 q += step_alns[0].second + 1;
1469 id++;
1470 }
1471 } while (q < q_stop);
1472
1473
1474 if (alns.size() == 0) {
1475 //
1476 // If no alignments have been found, search the tails of the query and subject for any overlap
1477 // that is too small to be picked up by the SuffixTree using the gapped aligner.
1478 //
1479 uint olap_bound = stree->min_aln() * 2;
1480
1481 //
1482 // Create a gapped alignment, looking only at the tail of the two sequences, to determine the exact overlap.
1483 //
1484 g_aln->init(query.length(), stree->seq_len(), true);
1485 if (g_aln->align_region(query, stree->seq_str(),
1486 query.length() - olap_bound, query.length() - 1, 0, olap_bound - 1)) {
1487 aln_res = g_aln->result();
1488 overlap_cigar = aln_res.cigar;
1489 }
1490 // g_aln->dump_alignment(query, stree->seq_str());
1491
1492 } else {
1493 size_t query_stop = alns.front().query_pos + alns.front().aln_len - 1;
1494
1495 if ( alns.size() == 1 &&
1496 (query_stop == (query.length() - 1) && alns.front().subj_pos == 0)) {
1497 //
1498 // If a single alignment has been found, check for a perfect alignmnet to the suffix tree
1499 // that occupies the end of the query and the beginning of the subject.
1500 //
1501 char buf[id_len];
1502 snprintf(buf, id_len, "%luM", alns.front().aln_len);
1503 overlap_cigar = buf;
1504 aln_res.pct_id = 1;
1505
1506 } else {
1507 //
1508 // Otherwise, find a consistent set of suffix tree alignments, ordered in a directed, acyclic grapgh (DAG).
1509 //
1510 this->suffix_tree_hits_to_dag(query.length(), alns, final_alns);
1511
1512 //
1513 // Create a gapped alignment, prefilling the suffix tree hits, to determine the exact overlap.
1514 //
1515 g_aln->init(query.length(), stree->seq_len(), true);
1516 if (g_aln->align_constrained(query, stree->seq_str(), final_alns)) {
1517 aln_res = g_aln->result();
1518 overlap_cigar = aln_res.cigar;
1519 }
1520 }
1521 }
1522
1523 Cigar cigar;
1524 parse_cigar(overlap_cigar.c_str(), cigar, true);
1525
1526 if (cigar.size() == 0)
1527 return 0;
1528
1529 // cerr << "Cigar: " << aln_res.cigar.c_str() << "; cigar_length_ref aka overlap: " << cigar_length_ref(cigar) << "\n";
1530
1531 //
1532 // Assess the gapped alignment and make sure it is reasonable.
1533 //
1534 size_t overlap = 0;
1535 size_t cigar_index = 0;
1536 double total_len = 0.0;
1537 double align_len = 0.0;
1538
1539 if (cigar[cigar_index].first == 'S') {
1540 //
1541 // Here is an example where we want to ignore the initial softmasking repored for the SE contig,
1542 // but we must count the offset to the start of the PE contig as a set of mismatches.
1543 // CIGAR: 138S4M2S
1544 // SE CTG: TGCAGGAGATTTTCCTCTCTGCTGT...GGAGCTCGTTGGTGTTGGTGGCACTGCCCTGAGGTTTTCCTCATNNTCCCCAGGNCCCAGCGTCAGC
1545 // PE CTG: GGCACAGATGTGTCACAGCTGC...CTTCTT
1546 // ****
1547 cigar_index++;
1548 total_len += aln_res.subj_pos;
1549
1550 } else {
1551 //
1552 // If the PE contig overlaps the entire SE contig, and there is sequence remaining on the PE contig, adjust the
1553 // overlap to include it, but it does not count as unaligned total length.
1554 // e.g.:
1555 // CIGAR: 68M1D6M
1556 // SE CTG: TGCAGGCGCACCTGTAGGCCCCCGGACAGGAGGGGGTGGCTTCAAGCAGGGGCCAGCCCGAGGCCCCC-ACCCAC
1557 // PE CTG: TAGTTGCAGGCGCACCTGTAGGCCCCCGGACAGGAGGGGGTGGCTTCAAGCAGGGGCCAGCCCGAGGCCCCCCACCCACACCCAGCACACACTGGCC...
1558 //
1559 overlap += aln_res.subj_pos;
1560 }
1561
1562 for (; cigar_index < cigar.size(); cigar_index++) {
1563 total_len += cigar[cigar_index].second;
1564 if (cigar[cigar_index].first == 'M')
1565 align_len += cigar[cigar_index].second;
1566 }
1567
1568 //
1569 // Test that 80% of the overlapping sequence is aligned (includes gaps) AND
1570 // make sure that of those sequences aligned, 80% are identities.
1571 //
1572 if (align_len / total_len < overlap_min_pct_id ||
1573 aln_res.pct_id < overlap_min_pct_id)
1574 return 0;
1575
1576 overlap += align_len;
1577
1578 //
1579 // Return overlap.
1580 //
1581 return overlap;
1582 }
1583
1584 DNASeq4
assemble_locus_contig(const vector<SRead> & fw_reads,const vector<SRead> & pe_reads)1585 LocusProcessor::assemble_locus_contig(
1586 const vector<SRead>& fw_reads,
1587 const vector<SRead>& pe_reads
1588 ) {
1589 //
1590 // List the sequences that will enter the graph.
1591 //
1592 array<vector<const DNASeq4*>, 3> seqs_to_assemble;
1593 array<const vector<SRead>*, 2> reads = {{&fw_reads, &pe_reads}};
1594 size_t fw = 0, pe = 1, both = 2;
1595 for (size_t i : {fw, pe}) {
1596 if (reads[i]->size() <= max_debruijn_reads) {
1597 for (const Read& r : *reads[i])
1598 seqs_to_assemble[i].push_back(&r.seq);
1599 } else {
1600 double seq_i = 0.0;
1601 double step = (double) reads[i]->size() / max_debruijn_reads;
1602 for (size_t j=0; j<max_debruijn_reads; ++j) {
1603 seqs_to_assemble[i].push_back(&(*reads[i])[(size_t)seq_i].seq);
1604 seq_i += step;
1605 }
1606 }
1607 }
1608 seqs_to_assemble[both] = seqs_to_assemble[fw];
1609 seqs_to_assemble[both].insert(
1610 seqs_to_assemble[both].end(),
1611 seqs_to_assemble[pe].begin(), seqs_to_assemble[pe].end());
1612
1613 //
1614 // Build the graph.
1615 //
1616 Graph graph (km_length);
1617 graph.rebuild(seqs_to_assemble[both], min_km_count);
1618 if (graph.empty())
1619 return DNASeq4();
1620 if (dbg_write_gfa) {
1621 graph.dump_gfa(out_dir + "gstacks." + to_string(loc_.id) + ".spaths.gfa");
1622 graph.dump_gfa(out_dir + "gstacks." + to_string(loc_.id) + ".nodes.gfa", true);
1623 }
1624
1625 //
1626 // Find the best path in the graph.
1627 //
1628 vector<const SPath*> best_path;
1629 do {
1630 if (graph.find_best_path(best_path))
1631 break;
1632
1633 // The graph wasn't a DAG, try to fix it.
1634 if(graph.remove_cycles() && graph.find_best_path(best_path)) {
1635 ++ctg_stats_.n_loci_pe_graph_fixed_to_dag;
1636 if (detailed_output)
1637 loc_.details_ss << "fixed_to_dag\n";
1638 if (dbg_write_gfa) {
1639 graph.dump_gfa(out_dir + "gstacks." + to_string(loc_.id) + ".fixed.spaths.gfa");
1640 graph.dump_gfa(out_dir + "gstacks." + to_string(loc_.id) + ".fixed.nodes.gfa", true);
1641 }
1642 break;
1643 }
1644
1645 // Contig contruction failed.
1646 ++ctg_stats_.n_loci_pe_graph_not_dag;
1647 if (detailed_output)
1648 loc_.details_ss << "not_dag\n";
1649 if (dbg_write_gfa_notdag) {
1650 graph.dump_gfa(out_dir + "gstacks." + to_string(loc_.id) + ".spaths.gfa");
1651 graph.dump_gfa(out_dir + "gstacks." + to_string(loc_.id) + ".nodes.gfa", true);
1652 }
1653 return DNASeq4();
1654 } while (false);
1655 if (detailed_output)
1656 loc_.details_ss << "is_dag\n";
1657 DNASeq4 contig = DNASeq4(SPath::contig_str(best_path.begin(), best_path.end(), km_length));
1658
1659 //
1660 // Handle the case where there are paired-end reads and the main
1661 // corresponding subgraph is distinct from that of the forward reads.
1662 //
1663 if (!pe_reads.empty()) {
1664 //
1665 // Extract the components and map the fw & pe reads to them.
1666 //
1667 vector<vector<const SPath*>> components = graph.components();
1668 unordered_map<Kmer,size_t> component_kmers;
1669 for (size_t i=0; i<components.size(); ++i) {
1670 size_t n_nodes = 0;
1671 for (const SPath* sp : components[i]) {
1672 for (const Node* n=sp->first(); ; n=n->first_succ()) {
1673 component_kmers[n->km()] = i;
1674 ++n_nodes;
1675 if (n == sp->last())
1676 break;
1677 }
1678 }
1679 }
1680 array<vector<size_t>, 2> comp_depths;
1681 for (size_t i : {fw, pe}) {
1682 comp_depths[i] = vector<size_t>(components.size(), 0);
1683 for (const DNASeq4* s : seqs_to_assemble[i]) {
1684 Kmerizer kmers {km_length, *s};
1685 Kmer km;
1686 decltype(component_kmers.begin()) itr;
1687 while ((km = kmers.next()))
1688 if ((itr = component_kmers.find(km)) != component_kmers.end())
1689 ++comp_depths[i][itr->second];
1690 }
1691 }
1692 //
1693 // Check that there actually exist kmers for the forward and paired-end reads.
1694 //
1695 if (accumulate(comp_depths[fw].begin(), comp_depths[fw].end(), 0) == 0) {
1696 if (detailed_output)
1697 loc_.details_ss << "fw_reads_absent_from_graph\n";
1698 return DNASeq4();
1699 }
1700 if (accumulate(comp_depths[pe].begin(), comp_depths[pe].end(), 0) == 0) {
1701 if (detailed_output)
1702 loc_.details_ss << "pe_reads_absent_from_graph\n";
1703 } else {
1704 //
1705 // Check that the main forward and paired-end components are the same one.
1706 //
1707 array<size_t, 2> best_comp;
1708 for (size_t i : {fw, pe}) {
1709 size_t best_depth = 0;
1710 for (size_t c=0; c<components.size(); ++c) {
1711 if (comp_depths[i][c] > best_depth) {
1712 best_comp[i] = c;
1713 best_depth = comp_depths[i][c];
1714 }
1715 }
1716 }
1717 if (best_comp[fw] == best_comp[pe]) {
1718 loc_.ctg_status = LocData::overlapped;
1719 loc_.olap_length = km_length;
1720 ctg_stats_.n_overlaps++;
1721 ctg_stats_.length_overlap_tot += loc_.olap_length;
1722 ctg_stats_.length_olapd_loci_tot += contig.length();
1723 if (detailed_output)
1724 loc_.details_ss << "one_debruijn_subgraph\n"
1725 << "overlap\t" << km_length << '\n';
1726 } else {
1727 if (detailed_output)
1728 loc_.details_ss << "two_debruijn_subgraphs\n";
1729 //
1730 // Determine the component the best path corresponds to.
1731 //
1732 Kmer km = Kmerizer(km_length, contig).next();
1733 assert(km);
1734 auto comp_itr = component_kmers.find(km);
1735 assert(comp_itr != component_kmers.end());
1736 size_t comp = comp_itr->second;
1737 //
1738 // Get two contigs, one for forward and one for paired-end.
1739 //
1740 DNASeq4 fw_contig, pe_contig;
1741 DNASeq4* todo_contig;
1742 const vector<const DNASeq4*>* todo_seqs;
1743 if (comp == best_comp[fw]) {
1744 if (detailed_output)
1745 loc_.details_ss << "best_path_is_best_fw_compo\n";
1746 fw_contig = move(contig);
1747 todo_contig = &pe_contig;
1748 todo_seqs = &seqs_to_assemble[pe];
1749 } else if (comp == best_comp[pe]) {
1750 if (detailed_output)
1751 loc_.details_ss << "best_path_is_best_pe_compo\n";
1752 pe_contig = move(contig);
1753 todo_contig = &fw_contig;
1754 todo_seqs = &seqs_to_assemble[fw];
1755 } else {
1756 if (detailed_output)
1757 loc_.details_ss << "best_path_is_neither_best_fw_compo_nor_best_pe_compo\n";
1758 ++ctg_stats_.n_loci_pe_graph_not_dag;
1759 return DNASeq4();
1760 }
1761 graph.rebuild(*todo_seqs, min_km_count);
1762 if (graph.empty()) {
1763 // DOES_NOT_HAPPEN; // May fail, c.f. mailing list 2019-01-18.
1764 // We can arrive here in some limit cases. Especially, if the forward and reverse
1765 // regions overlap but the graph is still disconnected, which may happen e.g. if
1766 // there are very few reads and some sequencing errors/SNPs (e.g. two 150bp forward
1767 // reads and one 150bp reverse read with a 50bp overlap, and the two forward reads
1768 // differ at position 100 and min_km_count is 2; with enough errors in the forward
1769 // reads it's probably also possible to make this happen with the graph's best path
1770 // being in the reverse region.)
1771 loc_.details_ss << "limit_case_the_other_compo_disappeared\n";
1772 return DNASeq4();
1773 }
1774 if (!graph.find_best_path(best_path)
1775 && !(graph.remove_cycles() && graph.find_best_path(best_path)))
1776 DOES_NOT_HAPPEN;
1777 *todo_contig = DNASeq4(SPath::contig_str(best_path.begin(), best_path.end(), km_length));
1778 if(detailed_output)
1779 loc_.details_ss << "contig_fw\t" << fw_contig << '\n'
1780 << "contig_pe\t" << pe_contig << '\n';
1781 //
1782 // Try to overlap the two contigs.
1783 //
1784 SuffixTree stree {pe_contig};
1785 stree.build_tree();
1786 GappedAln aligner;
1787 AlignRes aln_res;
1788 string overlap_cigar;
1789 int overlap;
1790 if (dbg_no_overlaps)
1791 overlap = 0;
1792 else
1793 overlap = this->find_locus_overlap(&stree, &aligner, fw_contig, overlap_cigar);
1794 if (overlap > 0) {
1795 if(detailed_output)
1796 loc_.details_ss << "overlap\t" << overlap << '\t' << overlap_cigar << "\n";
1797 this->loc_.ctg_status = LocData::overlapped;
1798 this->loc_.olap_length = overlap;
1799 this->ctg_stats_.n_overlaps++;
1800 this->ctg_stats_.length_overlap_tot += overlap;
1801 this->ctg_stats_.length_olapd_loci_tot += fw_contig.length() + pe_contig.length() - overlap;
1802 contig = move(fw_contig);
1803 auto seq_itr = pe_contig.begin();
1804 for(size_t i=0; i<size_t(overlap); ++i) {
1805 assert(seq_itr != pe_contig.end());
1806 ++seq_itr;
1807 }
1808 contig.append(seq_itr, pe_contig.end());
1809 } else {
1810 if(detailed_output)
1811 loc_.details_ss << "no_overlap\n";
1812 this->loc_.ctg_status = LocData::separate;
1813 contig = move(fw_contig);
1814 for(size_t i=0; i<10; ++i)
1815 contig.push_back(Nt4::n);
1816 contig.append(pe_contig.begin(), pe_contig.end());
1817 }
1818 }
1819 }
1820 }
1821 return contig;
1822 }
1823
add_read_to_aln(CLocAlnSet & aln_loc,AlignRes & aln_res,SRead && r,GappedAln * aligner,SuffixTree * stree)1824 bool LocusProcessor::add_read_to_aln(
1825 CLocAlnSet& aln_loc,
1826 AlignRes& aln_res,
1827 SRead&& r,
1828 GappedAln* aligner,
1829 SuffixTree* stree
1830 ) {
1831 if (!this->align_reads_to_contig(stree, aligner, r.seq, aln_res))
1832 return false;
1833
1834 Cigar cigar;
1835 parse_cigar(aln_res.cigar.c_str(), cigar);
1836 assert(!cigar.empty());
1837 simplify_cigar_to_MDI(cigar);
1838 cigar_extend_left(cigar, aln_res.subj_pos);
1839 assert(cigar_length_ref(cigar) <= aln_loc.ref().length());
1840 cigar_extend_right(cigar, aln_loc.ref().length() - cigar_length_ref(cigar));
1841
1842 aln_loc.add(SAlnRead(move((Read&)r), move(cigar), r.sample));
1843 return true;
1844 }
1845
phase_hets(vector<map<size_t,PhasedHet>> & phased_samples,vector<SiteCall> & calls,const CLocAlnSet & aln_loc,HaplotypeStats & hap_stats) const1846 void LocusProcessor::phase_hets(
1847 vector<map<size_t,PhasedHet>>& phased_samples,
1848 vector<SiteCall>& calls,
1849 const CLocAlnSet& aln_loc,
1850 HaplotypeStats& hap_stats
1851 ) const {
1852 phased_samples.clear();
1853 phased_samples.resize(loc_.mpopi->samples().size());
1854
1855 //
1856 // Apply a first MAC filter, if requested.
1857 //
1858 if (phasing_min_mac >= 2)
1859 for (SiteCall& call : calls)
1860 if (call.alleles().size() > 1)
1861 call.filter_mac(phasing_min_mac);
1862
1863 //
1864 // Find SNPs.
1865 //
1866 vector<size_t> snp_cols;
1867 for (size_t i=0; i<aln_loc.ref().length(); ++i)
1868 if (calls[i].alleles().size() > 1)
1869 snp_cols.push_back(i);
1870
1871 //
1872 // Check that the locus is polymorphic.
1873 //
1874 if (snp_cols.empty()) {
1875 // No SNPs, there is no phasing to do.
1876 ++hap_stats.n_badly_phased_samples[ {0, 0} ];
1877 return;
1878 }
1879
1880 //
1881 // Initialize counters, streams.
1882 //
1883 size_t n_hets_needing_phasing = 0;
1884 size_t n_consistent_hets = 0;
1885 assert(hap_stats.per_sample_stats.size() == loc_.mpopi->samples().size());
1886 if (detailed_output)
1887 loc_.details_ss << "BEGIN phasing\n";
1888 stringstream o_hapgraph_ss;
1889 bool has_subgraphs = false;
1890 if (dbg_write_hapgraphs) {
1891 o_hapgraph_ss << "\n"
1892 << "subgraph cluster_loc" << loc_.id << " {\n"
1893 << "\tlabel=\"locus " << loc_.id << "\";\n"
1894 << "\t# snp columns: ";
1895 join(snp_cols, ',', o_hapgraph_ss);
1896 o_hapgraph_ss << "\n";
1897 }
1898
1899 //
1900 // Phase each sample.
1901 //
1902 vector<size_t> samples_failed;
1903 vector<size_t> samples_passed;
1904 for (size_t sample=0; sample<loc_.mpopi->samples().size(); ++sample) {
1905 if (phase_sample_hets(
1906 phased_samples[sample],
1907 calls, aln_loc, snp_cols, sample,
1908 hap_stats,
1909 n_hets_needing_phasing, n_consistent_hets,
1910 o_hapgraph_ss, has_subgraphs)
1911 ) {
1912 samples_passed.push_back(sample);
1913 } else {
1914 samples_failed.push_back(sample);
1915 }
1916 }
1917
1918 //
1919 // Retry to phase failed samples after removing SNPs for which the minor
1920 // allele was only seen in hets and was never successfully phased.
1921 //
1922 if (!dbg_phasing_no_2ndpass) {
1923 vector<size_t> snp_cols_reduced;
1924 for(size_t col : snp_cols) {
1925 const vector<SampleCall>& c = calls[col].sample_calls();
1926 Counts<Nt2> counts;
1927 for (size_t sample : samples_passed) {
1928 switch(c[sample].call()) {
1929 case snp_type_hom:
1930 counts.increment(c[sample].nt0(), 2);
1931 break;
1932 case snp_type_het:
1933 if (phased_samples[sample].count(col)) {
1934 // This het was phased.
1935 counts.increment(c[sample].nt0());
1936 counts.increment(c[sample].nt1());
1937 }
1938 break;
1939 default:
1940 break;
1941 }
1942 }
1943 size_t mac = counts.sorted()[1].first;
1944 if (mac >= size_t(phasing_min_mac))
1945 // Keep it.
1946 snp_cols_reduced.push_back(col);
1947 }
1948 if (snp_cols_reduced.size() < snp_cols.size())
1949 for (size_t sample : samples_failed)
1950 if (phase_sample_hets(
1951 phased_samples[sample],
1952 calls, aln_loc, snp_cols_reduced, sample,
1953 hap_stats,
1954 n_hets_needing_phasing, n_consistent_hets,
1955 o_hapgraph_ss, has_subgraphs, false))
1956 ++hap_stats.per_sample_stats[sample].n_phased_2ndpass;
1957 }
1958
1959 //
1960 // Update the SiteCalls according to the haplotype calls, and reapply
1961 // the MAC filter on the updated data.
1962 //
1963 for (size_t col : snp_cols) {
1964 SiteCall& c = calls[col];
1965 for (size_t sample=0; sample<loc_.mpopi->samples().size(); ++sample)
1966 if (c.sample_calls()[sample].call() == snp_type_het
1967 && !phased_samples[sample].count(col))
1968 c.discard_sample(sample);
1969 c.filter_mac(phasing_min_mac);
1970 }
1971
1972 //
1973 // Clean up.
1974 //
1975 ++hap_stats.n_badly_phased_samples[ {n_consistent_hets, n_hets_needing_phasing} ];
1976 if (dbg_write_hapgraphs && has_subgraphs) {
1977 o_hapgraph_ss << "}\n";
1978 #pragma omp critical
1979 o_hapgraphs_f << o_hapgraph_ss.rdbuf();
1980 }
1981 if (detailed_output)
1982 loc_.details_ss << "END phasing\n";
1983 }
1984
phase_sample_hets(map<size_t,PhasedHet> & phased_sample,const vector<SiteCall> & calls,const CLocAlnSet & aln_loc,const vector<size_t> & snp_cols,size_t sample,HaplotypeStats & hap_stats,size_t & n_hets_needing_phasing,size_t & n_consistent_hets,ostream & o_hapgraph_ss,bool & has_subgraphs,bool first_pass) const1985 bool LocusProcessor::phase_sample_hets(
1986 map<size_t,PhasedHet>& phased_sample,
1987 const vector<SiteCall>& calls,
1988 const CLocAlnSet& aln_loc,
1989 const vector<size_t>& snp_cols,
1990 size_t sample,
1991 HaplotypeStats& hap_stats,
1992 size_t& n_hets_needing_phasing, size_t& n_consistent_hets,
1993 ostream& o_hapgraph_ss, bool& has_subgraphs,
1994 bool first_pass
1995 ) const {
1996 phased_sample.clear();
1997 if (aln_loc.sample_reads(sample).empty())
1998 return true;
1999 if (first_pass)
2000 ++hap_stats.per_sample_stats[sample].n_diploid_loci;
2001
2002 //
2003 // Find heterozygous positions.
2004 //
2005 vector<size_t> het_snps;
2006 for(size_t snp_i=0; snp_i<snp_cols.size(); ++snp_i)
2007 if (calls[snp_cols[snp_i]].sample_calls()[sample].call() == snp_type_het)
2008 het_snps.push_back(snp_i);
2009 if (!first_pass) {
2010 --hap_stats.per_sample_stats[sample].n_hets_2snps;
2011 --n_hets_needing_phasing;
2012 }
2013 if (het_snps.size() == 0) {
2014 // Sample is homozygous.
2015 return true;
2016 } else if (het_snps.size() == 1) {
2017 // One heterozygous SNP; sample has trivial 1nt-long haplotypes.
2018 size_t col = snp_cols[het_snps[0]];
2019 const SampleCall& c = calls[col].sample_calls()[sample];
2020 phased_sample.insert({col, {col, c.nt0(), c.nt1()}});
2021 return true;
2022 }
2023 ++hap_stats.per_sample_stats[sample].n_hets_2snps;
2024 ++n_hets_needing_phasing;
2025
2026 vector<const SampleCall*> sample_het_calls; //**********
2027 for (size_t het_i=0; het_i<het_snps.size(); ++het_i)
2028 sample_het_calls.push_back(&calls[snp_cols[het_snps[het_i]]].sample_calls()[sample]);
2029
2030 //
2031 // Iterate over reads, record as pairwise cooccurrences of alleles.
2032 //
2033 SnpAlleleCooccurrenceCounter cooccurrences (snp_cols.size());
2034 count_pairwise_cooccurrences(cooccurrences, aln_loc,
2035 sample, snp_cols, het_snps, sample_het_calls);
2036 if (dbg_write_hapgraphs) {
2037 has_subgraphs = true;
2038 write_sample_hapgraph(o_hapgraph_ss, sample, het_snps, snp_cols,
2039 sample_het_calls, cooccurrences);
2040 }
2041
2042 //
2043 // Assemble phase sets.
2044 //
2045 vector<PhaseSet> phase_sets;
2046 bool phased = false;
2047 for (size_t min_n_cooccurrences = phasing_cooccurrences_thr_range.first;
2048 min_n_cooccurrences <= phasing_cooccurrences_thr_range.second;
2049 ++min_n_cooccurrences
2050 ) {
2051 if (assemble_phase_sets(
2052 phase_sets, het_snps, sample_het_calls,
2053 cooccurrences, min_n_cooccurrences
2054 )) {
2055 // Phasing succeeded.
2056 phased = true;
2057 if (detailed_output)
2058 loc_.details_ss << "phasing_ok\t"
2059 << loc_.mpopi->samples()[sample].name
2060 << "\tn_hets=" << het_snps.size()
2061 << "\tcooc_thr=" << min_n_cooccurrences << '\n';
2062 break;
2063 }
2064
2065 // Phasing failed. find alleles that are not connected to any
2066 // others, discard the het calls they are part of, and retry.
2067 if (phasing_dont_prune_hets)
2068 continue;
2069 vector<array<bool,2>> alleles_with_edges;
2070 alleles_with_edges.resize(het_snps.size(), {{false, false}});
2071 for (size_t het_i=0; het_i<het_snps.size(); ++het_i) {
2072 array<Nt2,2> allelesi = sample_het_calls[het_i]->nts();
2073 for (size_t het_j=het_i+1; het_j<het_snps.size(); ++het_j) {
2074 array<Nt2,2> allelesj = sample_het_calls[het_j]->nts();
2075 for (size_t allelei=0; allelei<2; ++allelei) {
2076 for (size_t allelej=0; allelej<2; ++allelej) {
2077 if (cooccurrences.at(het_snps[het_i],
2078 allelesi[allelei],
2079 het_snps[het_j],
2080 allelesj[allelej])
2081 >= min_n_cooccurrences
2082 ) {
2083 alleles_with_edges[het_i][allelei] = true;
2084 alleles_with_edges[het_j][allelej] = true;
2085 }
2086 }
2087 }
2088 }
2089 }
2090 vector<size_t> het_snps_reduced;
2091 vector<const SampleCall*> sample_het_calls_reduced;
2092 for (size_t het_i=0; het_i<het_snps.size(); ++het_i) {
2093 if (alleles_with_edges[het_i] == array<bool,2>({{true, true}})) {
2094 // Keep this het.
2095 het_snps_reduced.push_back(het_snps[het_i]);
2096 sample_het_calls_reduced.push_back(sample_het_calls[het_i]);
2097 }
2098 }
2099 if (het_snps_reduced.size() == het_snps.size())
2100 // No hets were removed; the situation is unchanged and phasing will fail again.
2101 continue;
2102 assert(!het_snps_reduced.empty()); // Otherwise the phasing wouldn't have previously failed.
2103 if (het_snps_reduced.size() == 1
2104 || assemble_phase_sets(phase_sets,
2105 het_snps_reduced,
2106 sample_het_calls_reduced,
2107 cooccurrences,
2108 min_n_cooccurrences)
2109 ) {
2110 // Phasing succeeded or became trivial.
2111 // We just downsize `het_snps`. We can do this because `phased_samples`
2112 // doesn't keep track of all the hets; only those of the best phase set.
2113 // In addition, unphased HETs are eventually discarded (as of Feb. 2018,
2114 // v2.0Beta8; c.f. PopMap::populate_locus) so we don't need to do anything
2115 // more to blacklist those genotypes.
2116 phased = true;
2117 if (detailed_output)
2118 loc_.details_ss << "phasing_ok_rmhets\t"
2119 << loc_.mpopi->samples()[sample].name
2120 << "\tn_hets=" << het_snps.size()
2121 << "\tcooc_thr=" << min_n_cooccurrences
2122 << "\tn_kept_hets=" << het_snps_reduced.size() << '\n';
2123 het_snps = het_snps_reduced;
2124 sample_het_calls = sample_het_calls_reduced;
2125 break;
2126 }
2127 }
2128 if (!phased) {
2129 if (detailed_output)
2130 loc_.details_ss << "phasing_failed\t"
2131 << loc_.mpopi->samples()[sample].name
2132 << "\tn_hets=" << het_snps.size() << '\n';
2133 return false;
2134 }
2135 ++hap_stats.per_sample_stats[sample].n_phased; // (if !first_pass, this wasn't reach before; no guard.)
2136 ++n_consistent_hets;
2137 if (het_snps.size() == 1) {
2138 // After pruning hets, sample has trivial 1nt-long haplotypes.
2139 // TODO Record that we've just removed the problem...
2140 size_t col = snp_cols[het_snps[0]];
2141 const SampleCall& c = *sample_het_calls[0];
2142 phased_sample.insert({col, {col, c.nt0(), c.nt1()}});
2143 return true;
2144 }
2145
2146 //
2147 // Record phase sets.
2148 //
2149 for (const PhaseSet& ps : phase_sets) {
2150 assert(ps.size() == het_snps.size());
2151 size_t phase_set_id = SIZE_MAX;
2152 for (size_t het_i=0; het_i<het_snps.size(); ++het_i) {
2153 if (ps.het(het_i).left_allele == Nt4::n)
2154 continue;
2155
2156 size_t col = snp_cols[het_snps[het_i]];
2157 if (phase_set_id == SIZE_MAX)
2158 phase_set_id = col; // Recommended value, c.f. VCF specification.
2159 PhasedHet ph = ps.het(het_i);
2160 ph.phase_set = phase_set_id;
2161 phased_sample[col] = ph;
2162 }
2163 }
2164
2165 // Record singleton nodes (that are implicit in our representation).
2166 for (size_t het_i=0; het_i<het_snps.size(); ++het_i) {
2167 size_t col = snp_cols[het_snps[het_i]];
2168 if (!phased_sample.count(col)) {
2169 array<Nt2,2> alleles = sample_het_calls[het_i]->nts();
2170 phased_sample[col] = PhasedHet({col, alleles[0], alleles[1]});
2171 }
2172 }
2173 assert(!phased_sample.empty());
2174
2175 //
2176 // Remove all phase sets but the largest one.
2177 //
2178 map<size_t,size_t> phase_set_sizes;
2179 for (auto& phasedhet : phased_sample)
2180 ++phase_set_sizes[phasedhet.second.phase_set];
2181 auto best_ps = phase_set_sizes.begin();
2182 for (auto ps=++phase_set_sizes.begin(); ps!=phase_set_sizes.end(); ++ps)
2183 if (ps->second > best_ps->second)
2184 best_ps = ps;
2185 for (auto phasedhet=phased_sample.begin(); phasedhet!=phased_sample.end();) {
2186 if (phasedhet->second.phase_set == best_ps->first)
2187 ++phasedhet;
2188 else
2189 phased_sample.erase(phasedhet++);
2190 }
2191 return true;
2192 }
2193
count_pairwise_cooccurrences(SnpAlleleCooccurrenceCounter & cooccurrences,const CLocAlnSet & aln_loc,size_t sample,const vector<size_t> & snp_cols,const vector<size_t> & het_snps,const vector<const SampleCall * > & sample_het_calls) const2194 void LocusProcessor::count_pairwise_cooccurrences(
2195 SnpAlleleCooccurrenceCounter& cooccurrences,
2196 const CLocAlnSet& aln_loc,
2197 size_t sample,
2198 const vector<size_t>& snp_cols,
2199 const vector<size_t>& het_snps,
2200 const vector<const SampleCall*>& sample_het_calls
2201 ) const {
2202
2203 cooccurrences.clear();
2204 vector<Nt4> read_hap (het_snps.size());
2205 for (size_t read_i : aln_loc.sample_reads(sample)) {
2206 // Build the haplotype.
2207 size_t curr_col = 0;
2208 Alignment::iterator read_itr (aln_loc.reads()[read_i].aln());
2209 for (size_t het_i=0; het_i<het_snps.size(); ++het_i) {
2210 size_t col = snp_cols[het_snps[het_i]];
2211 read_itr += col - curr_col;
2212 curr_col = col;
2213 Nt4 nt = *read_itr;
2214 if (nt == Nt4::n) {
2215 read_hap[het_i] = Nt4::n;
2216 } else {
2217 const SampleCall& c = *sample_het_calls[het_i];
2218 Nt2 nt2 = Nt2(nt);
2219 if (nt2 == c.nt0() || nt2 == c.nt1())
2220 read_hap[het_i] = nt;
2221 else
2222 read_hap[het_i] = Nt4::n;
2223 }
2224 }
2225
2226 // Record the pairwise cooccurrences.
2227 for (size_t i=0; i<het_snps.size(); ++i) {
2228 Nt4 nti = read_hap[i];
2229 if (nti == Nt4::n)
2230 continue;
2231 for (size_t j=i+1; j<het_snps.size(); ++j) {
2232 Nt4 ntj = read_hap[j];
2233 if (ntj == Nt4::n)
2234 continue;
2235 ++cooccurrences.at(het_snps[i], Nt2(nti), het_snps[j], Nt2(ntj));
2236 }
2237 }
2238 }
2239 }
2240
assemble_phase_sets(vector<PhaseSet> & phase_sets,const vector<size_t> & het_snps,const vector<const SampleCall * > & sample_het_calls,const SnpAlleleCooccurrenceCounter & cooccurrences,const size_t min_n_cooccurrences) const2241 bool LocusProcessor::assemble_phase_sets(
2242 vector<PhaseSet>& phase_sets,
2243 const vector<size_t>& het_snps,
2244 const vector<const SampleCall*>& sample_het_calls,
2245 const SnpAlleleCooccurrenceCounter& cooccurrences,
2246 const size_t min_n_cooccurrences
2247 ) const {
2248 // xxx 2018 Feb. @Nick: Actually all of this would be more simple and efficient
2249 // if we used a single vector of PhasedHet's, with each het starting in its own
2250 // phase set.
2251 phase_sets.clear();
2252
2253 // We keep track of which phase set each het is currently part of.
2254 vector<size_t> allele_to_ps (het_snps.size(), SIZE_MAX);
2255
2256 assert(het_snps.size() == sample_het_calls.size());
2257 for (size_t het_i=0; het_i<het_snps.size(); ++het_i) {
2258 size_t snp_i = het_snps[het_i];
2259 size_t& ps_i = allele_to_ps[het_i]; //(by reference; always up to date)
2260 array<Nt2,2> allelesi = sample_het_calls[het_i]->nts();
2261 for (size_t het_j=het_i+1; het_j<het_snps.size(); ++het_j) {
2262 size_t snp_j = het_snps[het_j];
2263 size_t& ps_j = allele_to_ps[het_j];
2264 array<Nt2,2> allelesj = sample_het_calls[het_j]->nts();
2265
2266 for (Nt2 nti : allelesi) {
2267 for (Nt2 ntj : allelesj) {
2268 size_t n = cooccurrences.at(snp_i, nti, snp_j, ntj);
2269 if (n < min_n_cooccurrences)
2270 // Inexistent (n==0) or low-coverage edge.
2271 continue;
2272
2273 if (ps_i == SIZE_MAX && ps_j == SIZE_MAX) {
2274 // Both nodes are singletons. Start a new phase set.
2275 ps_i = phase_sets.size();
2276 ps_j = phase_sets.size();
2277 phase_sets.push_back(PhaseSet(het_snps.size()));
2278 phase_sets.back().add_het(het_i, allelesi);
2279 phase_sets.back().add_het(het_j, allelesj, ntj, het_i, nti);
2280 } else if (ps_i == SIZE_MAX) {
2281 // Add `het_i` to `ps_j`.
2282 phase_sets[ps_j].add_het(het_i, allelesi, nti, het_j, ntj);
2283 ps_i = ps_j;
2284 } else if (ps_j == SIZE_MAX) {
2285 // Add `het_j` to `ps_i`.
2286 phase_sets[ps_i].add_het(het_j, allelesj, ntj, het_i, nti);
2287 ps_j = ps_i;
2288 } else if (ps_i != ps_j) {
2289 // Merge `ps_j` into `ps_i`.
2290 phase_sets[ps_i].merge_with(phase_sets[ps_j], het_i, nti, het_j, ntj);
2291 phase_sets[ps_j].clear();
2292 size_t merged_ps = ps_j;
2293 for (size_t& allele_ps : allele_to_ps)
2294 if (allele_ps == merged_ps)
2295 allele_ps = ps_i;
2296 } else {
2297 assert(ps_i == ps_j);
2298 // Check that the edge is consistent.
2299 if (!phase_sets[ps_i].is_edge_consistent(het_i, nti, het_j, ntj))
2300 return false;
2301 }
2302 }
2303 }
2304 }
2305 }
2306
2307 // Purge empty phase sets.
2308 phase_sets.erase(std::remove_if(
2309 phase_sets.begin(), phase_sets.end(),
2310 [] (const PhaseSet& ps) {return ps.empty();}
2311 ),phase_sets.end());
2312 return true;
2313 }
2314
add_het(size_t het_i,array<Nt2,2> alleles)2315 void PhaseSet::add_het(size_t het_i, array<Nt2,2> alleles) {
2316 assert(phased_hets_.size() > het_i);
2317 assert(phased_hets_ == PhaseSet(phased_hets_.size()).phased_hets_); // Uninitialized.
2318
2319 phased_hets_[het_i].left_allele = Nt4(alleles[0]);
2320 phased_hets_[het_i].right_allele = Nt4(alleles[1]);
2321 }
2322
add_het(size_t het_i,array<Nt2,2> alleles,Nt2 nt_i,size_t het_j,Nt2 nt_j)2323 void PhaseSet::add_het(size_t het_i, array<Nt2,2> alleles, Nt2 nt_i, size_t het_j, Nt2 nt_j) {
2324 assert(phased_hets_.size() > std::max(het_i, het_j));
2325 assert(phased_hets_[het_i].left_allele == Nt4::n); // `i` node shouldn't exist yet.
2326 assert(phased_hets_[het_j].left_allele != Nt4::n); //
2327
2328 bool crossed_edge =
2329 (nt_j == phased_hets_[het_j].left_allele)
2330 ^ (nt_i == alleles[0]);
2331
2332 if (crossed_edge) {
2333 // Flip the alleles so that the all the edges within the phase set are parallel.
2334 phased_hets_[het_i].left_allele = Nt4(alleles[1]);
2335 phased_hets_[het_i].right_allele = Nt4(alleles[0]);
2336 } else {
2337 phased_hets_[het_i].left_allele = Nt4(alleles[0]);
2338 phased_hets_[het_i].right_allele = Nt4(alleles[1]);
2339 }
2340 }
2341
merge_with(const PhaseSet & other,size_t het_i,Nt2 nt_i,size_t het_j,Nt2 nt_j)2342 void PhaseSet::merge_with(const PhaseSet& other, size_t het_i, Nt2 nt_i, size_t het_j, Nt2 nt_j) {
2343 assert(phased_hets_.size() > std::max(het_i, het_j));
2344 assert(phased_hets_.size() == other.phased_hets_.size());
2345 assert(phased_hets_[het_i].left_allele != Nt4::n);
2346 assert(other.phased_hets_[het_j].left_allele != Nt4::n);
2347
2348 bool crossed_edge =
2349 (nt_i == phased_hets_[het_i].left_allele)
2350 ^ (nt_j == other.phased_hets_[het_j].left_allele);
2351
2352 for (size_t i=0; i<phased_hets_.size(); ++i) {
2353 if (other.phased_hets_[i].left_allele != Nt4::n) {
2354 assert(other.phased_hets_[i].right_allele != Nt4::n); // Both alleles should be set/not set together.
2355 assert(phased_hets_[i].left_allele == Nt4::n); // Phase sets should be non-overlapping.
2356 if (crossed_edge) {
2357 phased_hets_[i].left_allele = other.phased_hets_[i].right_allele;
2358 phased_hets_[i].right_allele = other.phased_hets_[i].left_allele;
2359 } else {
2360 phased_hets_[i].left_allele = other.phased_hets_[i].left_allele;
2361 phased_hets_[i].right_allele = other.phased_hets_[i].right_allele;
2362 }
2363 }
2364 }
2365 }
2366
is_edge_consistent(size_t het_i,Nt2 nt_i,size_t het_j,Nt2 nt_j) const2367 bool PhaseSet::is_edge_consistent(size_t het_i, Nt2 nt_i, size_t het_j, Nt2 nt_j) const {
2368 assert(phased_hets_[het_i].left_allele != Nt4::n);
2369 assert(phased_hets_[het_j].left_allele != Nt4::n);
2370
2371 bool crossed_edge =
2372 (nt_i == phased_hets_[het_i].left_allele)
2373 ^ (nt_j == phased_hets_[het_j].left_allele);
2374
2375 // We build phase sets so that all inside edges are parallel
2376 // (by flipping the alleles when necessary), so the proposed
2377 // edge is inconsistent if it is crossed.
2378 return !crossed_edge;
2379 }
2380
write_one_locus(const CLocAlnSet & aln_loc,const vector<SiteCounts> & depths,const vector<SiteCall> & calls,const vector<map<size_t,PhasedHet>> & phase_data)2381 void LocusProcessor::write_one_locus (
2382 const CLocAlnSet& aln_loc,
2383 const vector<SiteCounts>& depths,
2384 const vector<SiteCall>& calls,
2385 const vector<map<size_t,PhasedHet>>& phase_data
2386 ) {
2387 char loc_id[16];
2388 sprintf(loc_id, "%d", loc_.id);
2389
2390 const DNASeq4& ref = aln_loc.ref();
2391 const MetaPopInfo& mpopi = aln_loc.mpopi();
2392
2393 //
2394 // Vcf output.
2395 //
2396 timers_.building_vcf.restart();
2397 assert(depths.size() == ref.length());
2398 assert(calls.size() == ref.length());
2399 vector<size_t> sample_sites_w_data (mpopi.samples().size(), 0);
2400 stringstream vcf_records;
2401 VcfRecord rec;
2402 for (size_t i=0; i<ref.length(); ++i) {
2403 const SiteCounts& sitedepths = depths[i];
2404 const SiteCall& sitecall = calls[i];
2405 if (sitecall.alleles().empty())
2406 // No useful data at this site.
2407 continue;
2408
2409 if (!ref[i].is_acgt())
2410 continue;
2411
2412 // Determine which alleles exist, and their order.
2413 // (n.b. As of Apr4,2017 the ref allele might not be the most frequent one.)
2414 vector<Nt2> vcf_alleles;
2415 map<Nt2, size_t> vcf_allele_indexes;
2416 {
2417 vcf_alleles.push_back(ref[i]);
2418 vcf_allele_indexes.insert({Nt2(ref[i]), 0});
2419
2420 // Sort the alleles by frequency.
2421 vector<pair<double, Nt2>> sorted_alleles;
2422 for (auto& a : sitecall.alleles())
2423 sorted_alleles.push_back({a.second, a.first});
2424 sort(sorted_alleles.rbegin(), sorted_alleles.rend()); // (decreasing)
2425
2426 // The reference allele has already been added to vcf_alleles; exclude it.
2427 for (auto iter=sorted_alleles.begin(); iter!=sorted_alleles.end(); ++iter) {
2428 if (iter->second == vcf_alleles[0]) {
2429 sorted_alleles.erase(iter);
2430 break;
2431 }
2432 }
2433
2434 // Record the alternative alleles.
2435 for (auto& alt_allele : sorted_alleles) {
2436 vcf_allele_indexes.insert({alt_allele.second, vcf_alleles.size()});
2437 vcf_alleles.push_back(alt_allele.second);
2438 }
2439 }
2440
2441 //
2442 // Create the VCF record.
2443 //
2444
2445 // Chrom & pos.
2446 rec.clear();
2447 rec.append_chrom(loc_id);
2448 rec.append_pos(i);
2449 rec.append_id();
2450
2451 // Alleles.
2452 for (Nt2 nt : vcf_alleles)
2453 rec.append_allele(nt);
2454
2455 // SNP quality.
2456 rec.append_qual(sitecall.snp_qual());
2457 rec.append_filters();
2458
2459 if(vcf_alleles.size() == 1) {
2460 // Fixed site.
2461
2462 // Info/DP.
2463 rec.append_info(string("DP=") + to_string(sitedepths.tot.sum()));
2464 // Info/AD.
2465 Nt2 ref_nt = sitecall.alleles().begin()->first;
2466 rec.append_info(string("AD=") + to_string(sitedepths.tot[ref_nt]));
2467 if (dbg_write_nt_depths) {
2468 // Info/cnts.
2469 stringstream cnts;
2470 join(sitedepths.tot.arr(), ',', cnts);
2471 rec.append_info(string("cnts=") + cnts.str());
2472 }
2473 // Format.
2474 rec.append_format("DP");
2475 // Genotypes.
2476 for (size_t sample=0; sample<mpopi.samples().size(); ++sample) {
2477 size_t dp = sitedepths.samples[sample].sum();
2478 if (dp == 0) {
2479 rec.append_sample(".");
2480 continue;
2481 }
2482 ++sample_sites_w_data[sample];
2483 rec.append_sample(to_string(dp));
2484 }
2485
2486 } else {
2487 // Polymorphic site.
2488
2489 // Info/DP.
2490 rec.append_info(string("DP=") + to_string(sitedepths.tot.sum()));
2491 // Info/AD.
2492 vector<size_t> ad;
2493 for (Nt2 nt : vcf_alleles)
2494 ad.push_back(sitedepths.tot[nt]);
2495 stringstream ss;
2496 join(ad, ',', ss);
2497 rec.append_info(string("AD=") + ss.str());
2498 // Info/AF.
2499 vector<double> alt_freqs;
2500 for (auto nt=++vcf_alleles.begin(); nt!=vcf_alleles.end(); ++nt) // rem. always >1 alleles.
2501 alt_freqs.push_back(sitecall.alleles().at(*nt));
2502 rec.append_info(VcfRecord::util::fmt_info_af(alt_freqs));
2503 if (dbg_write_nt_depths) {
2504 // Info/cnts.
2505 stringstream cnts;
2506 join(sitedepths.tot.arr(), ',', cnts);
2507 rec.append_info(string("cnts=") + cnts.str());
2508 }
2509
2510 // Format.
2511 rec.append_format("GT");
2512 if (!dbg_no_haplotypes) {
2513 rec.append_format("PS"); // Phase set.
2514 rec.append_format("FT"); // Filter.
2515 }
2516 rec.append_format("GQ");
2517 rec.append_format("DP");
2518 rec.append_format("AD");
2519 rec.append_format("GL");
2520
2521 // Genotypes.
2522 for (size_t sample : mpopi.sample_indexes_orig_order()) {
2523 const Counts<Nt2>& sdepths = sitedepths.samples[sample];
2524 const SampleCall& scall = sitecall.sample_calls()[sample];
2525 if (sdepths.sum() == 0) {
2526 // No data for this sample.
2527 rec.append_sample("./.");
2528 continue;
2529 }
2530 ++sample_sites_w_data[sample];
2531
2532 stringstream genotype;
2533 // GT field.
2534 array<size_t,2> gt;
2535 switch (scall.call()) {
2536 case snp_type_hom:
2537 gt[0] = vcf_allele_indexes.at(scall.nt0());
2538 genotype << gt[0] << '/' << gt[0];
2539 if (!dbg_no_haplotypes)
2540 genotype << ":.:.";
2541 genotype << ':' << scall.gq();
2542 break;
2543 case snp_type_het:
2544 if (!dbg_no_haplotypes) {
2545 assert(!phase_data.empty());
2546 const PhasedHet& p = phase_data[sample].at(i);
2547 genotype << vcf_allele_indexes.at(p.left_allele)
2548 << '|'
2549 << vcf_allele_indexes.at(p.right_allele)
2550 << ':' << (p.phase_set + 1)
2551 << ':' << '.';
2552 } else {
2553 //dbg_no_haplotypes
2554 gt[0] = vcf_allele_indexes.at(scall.nt0());
2555 gt[1] = vcf_allele_indexes.at(scall.nt1());
2556 std::sort(gt.begin(), gt.end()); // (Prevents '1/0'.)
2557 genotype << gt[0] << '/' << gt[1];
2558 }
2559 genotype << ':' << scall.gq();
2560 break;
2561 default:
2562 genotype << "./.";
2563 if (!dbg_no_haplotypes) {
2564 if (scall.call() == snp_type_discarded)
2565 genotype << ":.:disc";
2566 else
2567 genotype << ":.:.";
2568 }
2569 genotype << ":.";
2570 break;
2571 }
2572 // DP field.
2573 genotype << ':' << sdepths.sum();
2574 // AD field.
2575 vector<size_t> ad;
2576 ad.reserve(vcf_alleles.size());
2577 for (Nt2 nt : vcf_alleles)
2578 ad.push_back(sdepths[nt]);
2579 genotype << ':';
2580 join(ad, ',', genotype);
2581 // GL field.
2582 genotype << ':' << VcfRecord::util::fmt_gt_gl(vcf_alleles, scall.lnls());
2583 // cnts field.
2584 if (dbg_write_nt_depths) {
2585 genotype << ":";
2586 join(sdepths.arr(), ',', genotype);
2587 }
2588 // Push it.
2589 rec.append_sample(genotype.str());
2590 }
2591 }
2592 assert(rec.count_samples() == o_vcf_f->header().samples().size());
2593
2594 vcf_records << rec;
2595 }
2596 loc_.o_vcf = vcf_records.str();
2597 timers_.building_vcf.update();
2598
2599 //
2600 // Fasta output.
2601 //
2602 timers_.building_fa.restart();
2603
2604 // Determine the number of samples for this locus. Some samples may have
2605 // been discarded (as of May 26, 2017, this would be because their haplotypes
2606 // were inconsistent).
2607 set<size_t> samples_w_reads;
2608 for (const SAlnRead& r : aln_loc.reads())
2609 samples_w_reads.insert(r.sample);
2610 size_t n_remaining_samples = 0;
2611 for (size_t sample_n_sites : sample_sites_w_data)
2612 if (sample_n_sites > 0)
2613 ++n_remaining_samples;
2614
2615
2616 // Write the record.
2617 string& fa = loc_.o_fa;
2618 assert(fa.empty());
2619 char cstr_buf[32];
2620 fa += '>';
2621 fa += loc_id;
2622 // Genomic position.
2623 if (!aln_loc.pos().empty()) {
2624 const PhyLoc& p = aln_loc.pos();
2625 sprintf(cstr_buf, "%u", p.bp+1);
2626 fa += " pos=";
2627 fa += p.chr();
2628 fa += ':';
2629 fa += cstr_buf;
2630 fa += ':';
2631 fa += (p.strand == strand_plus ? '+' : '-');
2632 }
2633 // Number of samples
2634 sprintf(cstr_buf, "%zu", n_remaining_samples);
2635 fa += " NS=";
2636 fa += cstr_buf;
2637 if (n_remaining_samples != samples_w_reads.size()) {
2638 assert(n_remaining_samples < samples_w_reads.size());
2639 sprintf(cstr_buf, "%zu", samples_w_reads.size() - n_remaining_samples);
2640 fa += " n_discarded_samples=";
2641 fa += cstr_buf;
2642 }
2643 // Contig status.
2644 switch (loc_.ctg_status) {
2645 case LocData::overlapped:
2646 sprintf(cstr_buf, "%zu", loc_.olap_length);
2647 fa += " contig=overlapped:";
2648 fa += cstr_buf;
2649 break;
2650 case LocData::separate:
2651 fa += " contig=separate";
2652 break;
2653 default:
2654 break;
2655 }
2656 fa += '\n';
2657 fa += ref.str();
2658 fa += '\n';
2659 timers_.building_fa.update();
2660 }
2661
write_sample_hapgraph(ostream & os,size_t sample,const vector<size_t> & het_snps,const vector<size_t> & snp_cols,const vector<const SampleCall * > & sample_het_calls,const SnpAlleleCooccurrenceCounter & cooccurrences) const2662 void LocusProcessor::write_sample_hapgraph(
2663 ostream& os,
2664 size_t sample,
2665 const vector<size_t>& het_snps,
2666 const vector<size_t>& snp_cols,
2667 const vector<const SampleCall*>& sample_het_calls,
2668 const SnpAlleleCooccurrenceCounter& cooccurrences
2669 ) const {
2670
2671 auto nodeid = [this,&sample](size_t col, Nt2 allele)
2672 {return string("l")+to_string(loc_.id)+"s"+to_string(sample)+"c"+to_string(col)+char(allele);};
2673
2674 // Initialize the subgraph.
2675 os << "\tsubgraph cluster_sample" << sample << " {\n"
2676 << "\t\tlabel=\""
2677 << "i" << sample << " '" << loc_.mpopi->samples()[sample].name << "'\\n"
2678 << "\";\n"
2679 << "\t\tstyle=dashed;\n"
2680 << "\t\t# heterozygous columns: ";
2681 vector<size_t> het_cols;
2682 for (size_t snp_i : het_snps)
2683 het_cols.push_back(snp_cols[snp_i]);
2684 join(het_cols, ',', os);
2685 os << "\n";
2686
2687 // Write the node labels.
2688 for (size_t i=0; i<het_snps.size(); ++i) {
2689 array<Nt2,2> alleles = sample_het_calls[i]->nts();
2690 size_t col = snp_cols[het_snps[i]];
2691 for (Nt2 allele : alleles)
2692 os << "\t\t" << nodeid(col, allele)
2693 << " [label=<"
2694 << "<sup><font point-size=\"10\">" << col+1 << "</font></sup>" << allele
2695 << ">];\n";
2696 }
2697
2698 // Write the edges.
2699 for (size_t het_i=0; het_i<het_snps.size(); ++het_i) {
2700 array<Nt2,2> alleles_i = sample_het_calls[het_i]->nts();
2701 size_t snp_i = het_snps[het_i];
2702 for (size_t het_j=het_i+1; het_j<het_snps.size(); ++het_j) {
2703 array<Nt2,2> alleles_j = sample_het_calls[het_j]->nts();
2704 size_t snp_j = het_snps[het_j];
2705 for (Nt2 nti : alleles_i) {
2706 for (Nt2 ntj : alleles_j) {
2707 size_t n = cooccurrences.at(snp_i, nti, snp_j, ntj);
2708 if (n == 0)
2709 continue;
2710 os << "\t\t" << nodeid(snp_cols[snp_i],nti)
2711 << " -- " << nodeid(snp_cols[snp_j],ntj) << " [";
2712 if (n==1)
2713 os << "style=dotted";
2714 else
2715 os << "label=\"" << n << "\",penwidth=" << n;
2716 os << "];\n";
2717 }
2718 }
2719 }
2720 }
2721 os << "\t}\n";
2722 }
2723
operator +=(const Timers & other)2724 Timers& Timers::operator+= (const Timers& other) {
2725 reading += other.reading;
2726 processing += other.processing;
2727 writing_fa += other.writing_fa;
2728 writing_vcf += other.writing_vcf;
2729 writing_details += other.writing_details;
2730 writing_bams += other.writing_bams;
2731 // Within processing:
2732 processing_pre_alns += other.processing_pre_alns;
2733 rm_Ns += other.rm_Ns;
2734 assembling += other.assembling;
2735 init_alignments += other.init_alignments;
2736 aligning += other.aligning;
2737 building_bams += other.building_bams;
2738 merge_paired_reads += other.merge_paired_reads;
2739 processing_post_alns += other.processing_post_alns;
2740 rm_reads += other.rm_reads;
2741 genotyping += other.genotyping;
2742 haplotyping += other.haplotyping;
2743 building_vcf += other.building_vcf;
2744 building_fa += other.building_fa;
2745 cpt_consensus += other.cpt_consensus;
2746 counting_nts += other.counting_nts;
2747 return *this;
2748 }
2749
2750 //
2751 // Arguments
2752 // ==========
2753 //
2754
2755 const string help_string = string() +
2756 "gstacks " + VERSION + "\n" +
2757 "\n"
2758 "De novo mode:\n"
2759 " gstacks -P stacks_dir -M popmap\n"
2760 "\n"
2761 " -P: input directory containg '*.matches.bam' files created by the\n"
2762 " de novo Stacks pipeline, ustacks-cstacks-sstacks-tsv2bam\n"
2763 "\n"
2764 "Reference-based mode:\n"
2765 " gstacks -I bam_dir -M popmap [-S suffix] -O out_dir\n"
2766 " gstacks -B bam_file [-B ...] -O out_dir\n"
2767 "\n"
2768 " -I: input directory containing BAM files\n"
2769 " -S: with -I/-M, suffix to use to build BAM file names: by default this\n"
2770 " is just '.bam', i.e. the program expects 'SAMPLE_NAME.bam'\n"
2771 " -B: input BAM file(s)\n"
2772 "\n"
2773 " The input BAM file(s) must be sorted by coordinate.\n"
2774 " With -B, records must be assigned to samples using BAM \"reads groups\"\n"
2775 " (gstacks uses the ID/identifier and SM/sample name fields). Read groups\n"
2776 " must be consistent if repeated different files. Note that with -I, read\n"
2777 " groups are unneeded and ignored.\n"
2778 "\n"
2779 "For both modes:\n"
2780 " -M: path to a population map giving the list of samples\n"
2781 " -O: output directory (default: none with -B; with -P same as the input\n"
2782 " directory)\n"
2783 " -t,--threads: number of threads to use (default: 1)\n"
2784 "\n"
2785 "SNP Model options:\n"
2786 " --model: model to use to call variants and genotypes; one of\n"
2787 " marukilow (default), marukihigh, or snp\n"
2788 " --var-alpha: alpha threshold for discovering SNPs (default: 0.01 for marukilow)\n"
2789 " --gt-alpha: alpha threshold for calling genotypes (default: 0.05)\n"
2790 "\n"
2791 "Paired-end options:\n"
2792 " --rm-pcr-duplicates: remove all but one set ofread pairs of the same sample that \n"
2793 " have the same insert length (implies --rm-unpaired-reads)\n"
2794 " --rm-unpaired-reads: discard unpaired reads\n"
2795 " --ignore-pe-reads: ignore paired-end reads even if present in the input\n"
2796 " --unpaired: ignore read pairing (only for paired-end GBS; treat READ2's as if they were READ1's)\n"
2797 "\n"
2798 "Advanced options:\n"
2799 " (De novo mode)\n"
2800 " --kmer-length: kmer length for the de Bruijn graph (default: 31, max. 31)\n"
2801 " --max-debruijn-reads: maximum number of reads to use in the de Bruijn graph (default: 1000)\n"
2802 " --min-kmer-cov: minimum coverage to consider a kmer (default: 2)\n"
2803 " --write-alignments: save read alignments (heavy BAM files)\n"
2804 "\n"
2805 " (Reference-based mode)\n"
2806 " --min-mapq: minimum PHRED-scaled mapping quality to consider a read (default: 10)\n"
2807 " --max-clipped: maximum soft-clipping level, in fraction of read length (default: 0.20)\n"
2808 " --max-insert-len: maximum allowed sequencing insert length (default: 1000)\n"
2809 "\n"
2810 " --details: write a heavier output\n"
2811 " --phasing-cooccurrences-thr-range: range of edge coverage thresholds to\n"
2812 " iterate over when building the graph of allele cooccurrences for\n"
2813 " SNP phasing (default: 1,2)\n"
2814 " --phasing-dont-prune-hets: don't try to ignore dubious heterozygote\n"
2815 " genotypes during phasing\n"
2816 "\n"
2817 #ifdef DEBUG
2818 "Debug options:\n"
2819 " --dbg-no-overlaps: disable overlapping\n"
2820 " --dbg-no-haps: disable phasing\n"
2821 " --dbg-gfa: output a GFA file for each locus\n"
2822 " --dbg-gfa-not-dag: output a GFA file for failed assemblies\n"
2823 " --dbg-alns: output a file showing the contigs & alignments\n"
2824 " --dbg-phasing-min-mac: minimum SNP MAC.\n"
2825 " --dbg-phasing-no-2ndpass: don't try a second pass.\n"
2826 " --dbg-hapgraphs: output a dot graph file showing phasing information\n"
2827 " --dbg-depths: write detailed depth data in the output VCF\n"
2828 " --dbg-log-stats-phasing: log detailed phasing statistics\n"
2829 " --dbg-min-spl-reads: discard samples with less than this many reads (ref-based)\n"
2830 " --dbg-min-loc-spls: discard loci with less than this many samples\n"
2831 " --dbg-max-debruijn-reads\n"
2832 "\n"
2833 #endif
2834 ;
2835
parse_command_line(int argc,char * argv[])2836 string parse_command_line(int argc, char* argv[]) {
2837 auto bad_args = [](){
2838 cerr << help_string;
2839 exit(1);
2840 };
2841 try {
2842 static const option long_options[] = {
2843 {"version", no_argument, NULL, 1000},
2844 {"help", no_argument, NULL, 'h'},
2845 {"quiet", no_argument, NULL, 'q'},
2846 {"stacks-dir", required_argument, NULL, 'P'},
2847 {"in-dir", required_argument, NULL, 'I'},
2848 {"in-bam", required_argument, NULL, 'B'},
2849 {"suffix", required_argument, NULL, 'S'},
2850 {"popmap", required_argument, NULL, 'M'},
2851 {"out-dir", required_argument, NULL, 'O'},
2852 {"unpaired", no_argument, NULL, 1007},
2853 {"threads", required_argument, NULL, 't'},
2854 {"model", required_argument, NULL, 1006},
2855 {"gt-alpha", required_argument, NULL, 1005},
2856 {"var-alpha", required_argument, NULL, 1008},
2857 {"kmer-length", required_argument, NULL, 1001},
2858 {"max-debruijn-reads", required_argument, NULL, 1024},
2859 {"min-kmer-cov", required_argument, NULL, 1002},
2860 {"ignore-pe-reads", no_argument, NULL, 1012},
2861 {"write-alignments", no_argument, NULL, 1021},
2862 {"details", no_argument, NULL, 1013},
2863 {"min-mapq", required_argument, NULL, 1014},
2864 {"max-clipped", required_argument, NULL, 1015},
2865 {"max-insert-len", required_argument, NULL, 1016},
2866 {"rm-unpaired-reads", no_argument, NULL, 1017},
2867 {"rm-pcr-duplicates", no_argument, NULL, 1018},
2868 {"phasing-cooccurrences-thr-range", required_argument, NULL, 1019},
2869 {"phasing-dont-prune-hets", no_argument, NULL, 1020},
2870 //debug options
2871 {"min-kmer-freq", required_argument, NULL, 3021},
2872 {"dbg-phasing-min-mac", required_argument, NULL, 2018},
2873 {"dbg-phasing-no-2ndpass", no_argument, NULL, 2019},
2874 {"dbg-print-cloc-ids", no_argument, NULL, 2000},
2875 {"dbg-gfa", no_argument, NULL, 2003},
2876 {"dbg-gfa-not-dag", no_argument, NULL, 2015},
2877 {"dbg-alns", no_argument, NULL, 2004}, {"alns", no_argument, NULL, 2004},
2878 {"dbg-depths", no_argument, NULL, 2007},
2879 {"dbg-hapgraphs", no_argument, NULL, 2010},
2880 {"dbg-no-overlaps", no_argument, NULL, 2008},
2881 {"dbg-no-haps", no_argument, NULL, 2009},
2882 {"dbg-min-spl-reads", required_argument, NULL, 2014},
2883 {"dbg-min-loc-spls", required_argument, NULL, 2017},
2884 {0, 0, 0, 0}
2885 };
2886
2887 string stacks_dir;
2888 string in_dir;
2889 string suffix = ".bam";
2890
2891 double gt_alpha = 0.05;
2892 double var_alpha = 0.0;
2893
2894 // bool pcr_dupl_measures = true;
2895 // auto pcr_duplicates_measures = [&](){
2896 // pcr_dupl_measures = true;
2897 // phasing_min_mac = 3;
2898 // //TODO
2899 // };
2900 // auto no_pcr_duplicates_measures = [&](){
2901 // pcr_dupl_measures = false;
2902 // phasing_min_mac = 1;
2903 // //TODO
2904 // };
2905 // pcr_duplicates_measures();
2906
2907 int c;
2908 int long_options_i;
2909 std::cmatch tmp_cmatch;
2910 long tmp_long;
2911 while (true) {
2912
2913 c = getopt_long(argc, argv, "hqP:I:B:S:s:M:O:W:t:m:", long_options, &long_options_i);
2914
2915 if (c == -1)
2916 break;
2917
2918 switch (c) {
2919 case 1000: //version
2920 cout << prog_name << " " << VERSION << "\n";
2921 exit(0);
2922 break;
2923 case 'h':
2924 cout << help_string;
2925 exit(0);
2926 break;
2927 case 'q':
2928 quiet = true;
2929 break;
2930 case 'P':
2931 stacks_dir = optarg;
2932 break;
2933 case 'I':
2934 in_dir = optarg;
2935 break;
2936 case 'B':
2937 in_bams.push_back(optarg);
2938 break;
2939 case 'S':
2940 suffix = optarg;
2941 break;
2942 case 'M':
2943 popmap_path = optarg;
2944 break;
2945 case 'O':
2946 out_dir = optarg;
2947 break;
2948 case 1007: //unpaired
2949 refbased_cfg.paired = false;
2950 break;
2951 case 1006: //model
2952 model_type = parse_model_type(optarg);
2953 break;
2954 case 1005: //gt-alpha
2955 gt_alpha = atof(optarg);
2956 break;
2957 case 1008: //var-alpha
2958 var_alpha = atof(optarg);
2959 break;
2960 case 't':
2961 num_threads = is_integer(optarg);
2962 if (num_threads < 0) {
2963 cerr << "Error: Illegal -t option value '" << optarg << "'.\n";
2964 bad_args();
2965 }
2966 break;
2967 case 1012: //ignore-pe-reads
2968 ignore_pe_reads = true;
2969 refbased_cfg.ign_pe_reads = true;
2970 break;
2971 case 1001://kmer-length
2972 km_length = atoi(optarg);
2973 if (km_length < 2 || km_length > 31) {
2974 cerr << "Error: Illegal -t option value '" << optarg << "' (valid range is 2-31).\n";
2975 bad_args();
2976 }
2977 break;
2978 case 1024: // max-debruijn-reads
2979 try {
2980 tmp_long = std::stol(optarg);
2981 if (tmp_long < 1)
2982 throw exception();
2983 } catch(std::exception) {
2984 cerr << "Error: Illegal max-debruijn-reads option value '" << optarg << "'.\n";
2985 bad_args();
2986 }
2987 max_debruijn_reads = tmp_long;
2988 break;
2989 case 1002://min-kmer-cov
2990 min_km_count = atoi(optarg);
2991 break;
2992 case 1021://write-alignments
2993 bam_output = true;
2994 break;
2995 case 1013://details
2996 detailed_output = true;
2997 break;
2998 case 1014://min-mapq
2999 refbased_cfg.min_mapq = stoi(optarg);
3000 if (refbased_cfg.min_mapq > 255) {
3001 cerr << "Error: Illegal --min-mapq value '" << optarg << "'.\n";
3002 bad_args();
3003 }
3004 break;
3005 case 1015://max-clipped
3006 refbased_cfg.max_clipped = atof(optarg);
3007 if (refbased_cfg.max_clipped < 0.0 || refbased_cfg.max_clipped > 1.0) {
3008 cerr << "Error: Illegal --max-clipped value '" << optarg << "'.\n";
3009 bad_args();
3010 }
3011 break;
3012 case 1016://max-insert
3013 refbased_cfg.max_insert_refsize = stoi(optarg);
3014 break;
3015 case 1017://rm-unpaired-reads
3016 rm_unpaired_reads = true;
3017 break;
3018 case 1018://rm-pcr-duplicates
3019 rm_pcr_duplicates = true;
3020 rm_unpaired_reads = true;
3021 // no_pcr_duplicates_measures();
3022 break;
3023 // case 1022://pcr-duplicates-measures
3024 // pcr_duplicates_measures();
3025 // break;
3026 // case 1023://no-pcr-duplicates-measures
3027 // no_pcr_duplicates_measures();
3028 // break;
3029 case 1019: //phasing-cooccurrences-thr-range
3030 std::regex_search(optarg, tmp_cmatch, std::regex("^[0-9]+,[0-9]+$"));
3031 if (!tmp_cmatch.empty()) {
3032 phasing_cooccurrences_thr_range.first = atoi(optarg);
3033 phasing_cooccurrences_thr_range.second = atoi(strchr(optarg, ',') + 1);
3034 }
3035 if (tmp_cmatch.empty() || phasing_cooccurrences_thr_range.first > phasing_cooccurrences_thr_range.second) {
3036 cerr << "Error: Illegal --phasing-cooccurrences-thr-range value '"
3037 << optarg << "'; expected 'INT_MIN,INT_MAX'.\n";
3038 bad_args();
3039 }
3040 break;
3041 case 1020: //phasing-dont-prune-hets
3042 phasing_dont_prune_hets = true;
3043 break;
3044
3045 //
3046 // Debug options
3047 //
3048 case 3021://min-kmer-freq
3049 cerr << "WARNING: Ignored option --min-kmer-freq that does not exist anymore. See --max-debruijn-reads.\n";
3050 break;
3051 case 2018: //dbg-phasing-min-mac
3052 phasing_min_mac = is_integer(optarg);
3053 if (phasing_min_mac < 0) {
3054 cerr << "Error: Illegal --dbg-phasing-min-mac value '" << optarg << "'.\n";
3055 bad_args();
3056 }
3057 break;
3058 case 2019: //dbg-phasing-no-2ndpass
3059 dbg_phasing_no_2ndpass = true;
3060 break;
3061 case 2000://dbg-print-cloc-ids
3062 dbg_print_cloc_ids = true;
3063 break;
3064 case 2003://dbg-gfa
3065 dbg_write_gfa = true;
3066 break;
3067 case 2015://dbg-gfa-not-dag
3068 dbg_write_gfa_notdag = true;
3069 break;
3070 case 2004://dbg-alns
3071 dbg_write_alns = true;
3072 break;
3073 case 2010://dbg-hapgraphs
3074 dbg_write_hapgraphs = true;
3075 break;
3076 case 2007://dbg-depths
3077 dbg_write_nt_depths = true;
3078 break;
3079 case 2008://dbg-no-haps
3080 dbg_no_overlaps = true;
3081 break;
3082 case 2009://dbg-no-haps
3083 dbg_no_haplotypes = true;
3084 break;
3085 case 2014://dbg-min-spl-reads
3086 refbased_cfg.min_reads_per_sample = stoi(optarg);
3087 break;
3088 case 2017://dbg-min-loc-spls
3089 refbased_cfg.min_samples_per_locus = stoi(optarg);
3090 dbg_denovo_min_loc_samples = stoi(optarg);
3091 break;
3092 case '?':
3093 bad_args();
3094 break;
3095 default:
3096 DOES_NOT_HAPPEN;
3097 break;
3098 }
3099 }
3100
3101 //
3102 // Check command consistency.
3103 //
3104
3105 if (optind < argc) {
3106 cerr << "Error: Failed to parse command line: '" << argv[optind]
3107 << "' is seen as a positional argument. Expected no positional arguments.\n";
3108 bad_args();
3109 }
3110
3111 size_t n_modes_given = !stacks_dir.empty() + !in_dir.empty() + !in_bams.empty();
3112 if (n_modes_given != 1) {
3113 cerr << "Error: Please specify exactly one of -P, -I or -B.\n";
3114 bad_args();
3115 }
3116
3117 typedef GStacksInputT In;
3118 if (!stacks_dir.empty())
3119 input_type = popmap_path.empty()
3120 ? In::denovo_merger
3121 : In::denovo_popmap;
3122 else if (!in_dir.empty())
3123 input_type = In::refbased_popmap;
3124 else if (!in_bams.empty())
3125 input_type = In::refbased_list;
3126 else
3127 DOES_NOT_HAPPEN;
3128
3129 if (input_type == In::refbased_popmap
3130 && popmap_path.empty()
3131 ) {
3132 cerr << "Error: Please specify a population map (-M).\n";
3133 bad_args();
3134 } else if (input_type == In::refbased_list
3135 && !popmap_path.empty()
3136 ) {
3137 cerr << "Error: Please specify -I/-M or -B, not both.\n";
3138 bad_args();
3139 }
3140
3141 if (input_type == In::denovo_popmap || input_type == In::denovo_merger) {
3142 if (!refbased_cfg.paired) {
3143 cerr << "Error: --unpaired is for the reference-based mode.\n";
3144 bad_args();
3145 }
3146 if (suffix != ".bam") {
3147 cerr << "Error: --suffix is for the reference-based mode.\n";
3148 bad_args();
3149 }
3150 } else if (input_type == In::refbased_popmap || input_type == In::refbased_list) {
3151 if (!refbased_cfg.paired && rm_unpaired_reads) {
3152 cerr << "Error: --unpaired and --rm-unpaired-reads/--rm-pcr-duplicates are not compatible.\n";
3153 bad_args();
3154 }
3155 if (out_dir.empty()) {
3156 cerr << "Error: Please specify an output directory (-O).\n";
3157 bad_args();
3158 }
3159 if (min_km_count != 2 || max_debruijn_reads != 1000) {
3160 cerr << "Error: --min-kmer-count, --max-debruijn-reads are for the denovo mode.\n";
3161 bad_args();
3162 }
3163 if (bam_output) {
3164 cerr << "Error: --write-alignments is for the denovo mode.\n";
3165 bad_args();
3166 }
3167 } else {
3168 DOES_NOT_HAPPEN;
3169 }
3170
3171 //
3172 // Process arguments.
3173 //
3174
3175 for (string* dir : {&stacks_dir, &in_dir, &out_dir})
3176 if (!dir->empty() && dir->back() != '/')
3177 *dir += '/';
3178
3179 if (var_alpha == 0.0) {
3180 if (model_type == marukilow) {
3181 var_alpha = 0.01;
3182 } else {
3183 cerr << "Error: No value was provided for --var-alpha"
3184 << " (and there is no default for this model).\n";
3185 bad_args();
3186 }
3187 }
3188 switch (model_type) {
3189 case snp: model.reset(new MultinomialModel(gt_alpha)); break;
3190 case marukihigh: model.reset(new MarukiHighModel(gt_alpha, var_alpha)); break;
3191 case marukilow: model.reset(new MarukiLowModel(gt_alpha, var_alpha)); break;
3192 default:
3193 cerr << "Error: Model choice '" << to_string(model_type) << "' is not supported.\n";
3194 bad_args();
3195 break;
3196 }
3197
3198 if (!popmap_path.empty()) {
3199 MetaPopInfo m;
3200 m.init_popmap(popmap_path);
3201 for (auto& s : m.samples())
3202 sample_names.push_back(s.name);
3203 }
3204
3205 if (input_type == In::denovo_popmap) {
3206 for (const string& s : sample_names) {
3207 in_bams.push_back(stacks_dir + s + ".matches.bam");
3208 if (bam_output)
3209 out_bams.push_back(stacks_dir + s + ".alns.bam");
3210 }
3211 if (out_dir.empty())
3212 out_dir = stacks_dir;
3213 } else if (input_type == In::denovo_merger) {
3214 in_bams.push_back(stacks_dir + "catalog.bam");
3215 if (bam_output)
3216 out_bams.push_back(stacks_dir + "alignments.bam");
3217 if (out_dir.empty())
3218 out_dir = stacks_dir;
3219 } else if (input_type == In::refbased_popmap) {
3220 for (const string& sample : sample_names)
3221 in_bams.push_back(in_dir + sample + suffix);
3222 } else {
3223 assert(input_type == In::refbased_list);
3224 }
3225
3226 check_or_mk_dir(out_dir);
3227
3228 if (in_bams.empty())
3229 DOES_NOT_HAPPEN;
3230
3231 //
3232 // Write the report.
3233 //
3234 stringstream os;
3235 os << "\n"
3236 << "Configuration for this run:\n";
3237 switch (input_type) {
3238 case GStacksInputT::denovo_popmap:
3239 case GStacksInputT::denovo_merger:
3240 os << " Input mode: denovo\n";
3241 break;
3242 case GStacksInputT::refbased_popmap:
3243 case GStacksInputT::refbased_list:
3244 os << " Input mode: reference-based";
3245 if (!refbased_cfg.paired)
3246 os << ", unpaired";
3247 os << "\n";
3248 break;
3249 default:
3250 DOES_NOT_HAPPEN;
3251 break;
3252 }
3253 if (!popmap_path.empty())
3254 os << " Population map: '" << popmap_path << "'\n";
3255 os << " Input files: " << in_bams.size() << ", e.g. '" << in_bams.front()<< "'\n"
3256 << " Output to: '" << out_dir << "'\n"
3257 << " Model: " << *model << "\n";
3258 if (ignore_pe_reads)
3259 os << " Ignoring paired-end reads.\n";
3260 if (!refbased_cfg.paired)
3261 os << " Ignoring pairing information.\n";
3262 if (km_length != 31)
3263 os << " Kmer length: " << km_length << "\n";
3264 if (max_debruijn_reads != 1000)
3265 os << " Maximum reads used for contig assembly: " << max_debruijn_reads << "\n";
3266 if (min_km_count != 2)
3267 os << " Minimum kmer count: " << min_km_count << "\n";
3268 if (rm_unpaired_reads)
3269 os << " Discarding unpaired reads.\n";
3270 if (rm_pcr_duplicates)
3271 os << " Removing PCR duplicates.\n";
3272 // if (pcr_dupl_measures)
3273 // os << " PCR duplicates mitigation measures enabled.\n";
3274
3275 return os.str();
3276
3277 } catch (std::invalid_argument&) {
3278 bad_args();
3279 exit(1);
3280 }
3281 }
3282