1 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
2 //
3 // Copyright 2017-2018, Julian Catchen <jcatchen@illinois.edu>
4 //
5 // This file is part of Stacks.
6 //
7 // Stacks is free software: you can redistribute it and/or modify
8 // it under the terms of the GNU General Public License as published by
9 // the Free Software Foundation, either version 3 of the License, or
10 // (at your option) any later version.
11 //
12 // Stacks is distributed in the hope that it will be useful,
13 // but WITHOUT ANY WARRANTY; without even the implied warranty of
14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 // GNU General Public License for more details.
16 //
17 // You should have received a copy of the GNU General Public License
18 // along with Stacks.  If not, see <http://www.gnu.org/licenses/>.
19 //
20 #include <getopt.h>
21 #include <zlib.h>
22 
23 #include "gstacks.h"
24 
25 #include "constants.h"
26 #include "utils.h"
27 #include "log_utils.h"
28 #include "catalog_utils.h"
29 #include "locus.h"
30 #include "locus_readers.h"
31 #include "debruijn.h"
32 #include "aln_utils.h"
33 #include "Alignment.h"
34 #include "models.h"
35 
36 //
37 // Argument globals.
38 //
39 GStacksInputT  input_type = GStacksInputT::unknown;
40 string         popmap_path;  // Set if --popmap is given. For logging purposes.
41 vector<string> sample_names; // Set if --popmap is given.
42 vector<string> in_bams;
43 vector<string> out_bams;
44 
45 string out_dir;
46 
47 BamCLocBuilder::Config refbased_cfg {true, 1000, 10, 0.20, 1, false, 1};
48 
49 int    num_threads        = 1;
50 bool   quiet              = false;
51 bool   ignore_pe_reads    = false;
52 double min_aln_cov        = 0.75;
53 int    min_se_pe_overlap  = 5;
54 double overlap_min_pct_id = 0.80;
55 bool   bam_output         = false;
56 bool   detailed_output    = false;
57 bool   rm_unpaired_reads  = false;
58 bool   rm_pcr_duplicates  = false;
59 
60 modelt model_type = marukilow;
61 unique_ptr<const Model> model;
62 
63 size_t km_length         = 31;
64 size_t max_debruijn_reads = 1000;
65 size_t min_km_count      = 2;
66 size_t max_fragment_alns = 2;
67 
68 pair<size_t,size_t> phasing_cooccurrences_thr_range = {2, 2};
69 bool phasing_dont_prune_hets = false;
70 long phasing_min_mac = 1;
71 
72 bool   dbg_no_overlaps     = false;
73 bool   dbg_no_haplotypes   = false;
74 bool   dbg_print_cloc_ids  = false;
75 bool   dbg_write_gfa       = false;
76 bool   dbg_write_gfa_notdag = false;
77 bool   dbg_write_alns      = false;
78 bool   dbg_write_hapgraphs = false;
79 bool   dbg_write_nt_depths = false;
80 bool   dbg_log_stats_phasing = false;
81 bool   dbg_phasing_no_2ndpass = false;
82 size_t dbg_denovo_min_loc_samples = 0;
83 
84 //
85 // Additional globals.
86 //
87 const string prog_name = "gstacks";
88 unique_ptr<LogAlterator> logger;
89 gzFile o_gzfasta_f = NULL;
90 unique_ptr<VcfWriter> o_vcf_f;
91 unique_ptr<VersatileWriter> o_details_f;
92 
93 ofstream o_aln_f;
94 ofstream o_hapgraphs_f;
95 const char o_aln_header[] =
96     "# This prints observed read haplotypes:\n"
97     "# show_loc() { loc=$1; cat ./gstacks.alns | sed -n \"/^END $loc\\b/ q; /^BEGIN $loc\\b/,$ p\" | tail -n+2; }\n"
98     "# snp_cols() { loc=$1; zcat ./gstacks.calls | awk \"\\$1==$loc; \\$1>$loc {exit}\" | awk '$5!=\".\"' | cut -f2 | paste -sd ','; }\n"
99     "# show_haps() { loc=$1; cols=$2; spl=$3; show_loc $loc | grep \"\\b$spl\\b\" | cut -f3 | cut -c \"${cols//./,}\" | sort; }\n"
100     "# format_haps() { loc=$1; cols=$2; spl=$3; show_haps $loc $cols $spl | transpose-ch | paste <(echo $cols | tr ,. '\\n' | xargs printf '% 4s\\n') - | sed 's/\\t/  /' | seq.color | { echo; echo $loc/$spl; echo; cat; echo; }; }\n"
101     "# true_loci() { loc=$1; spl=$2; show_loc $loc | grep \"\\b$spl\\b\" | grep -v ref | cut -d: -f1 | sort -u; }\n"
102     ;
103 const char o_hapgraphs_header[] =
104     "# dot -Tpdf -O gstacks.hapgraphs.dot\n"
105     "# loc=371\n"
106     "# { g=gstacks.hapgraphs.dot; sed -n '0,/^subgraph/p' $g | head -n-1; sed -n \"/^subgraph cluster_loc$loc\\b/,/^}/p\" $g; echo \\}; } | dot -Tpdf -o haps.$loc.pdf\n"
107     "graph {\n"
108     "edge[color=\"grey60\",fontsize=12,labeljust=\"l\"];\n";
109     ;
110 
111 //
112 // main
113 // ==========
114 //
115 
116 int
main(int argc,char ** argv)117 main(int argc, char** argv)
118 {
119 try {
120     //
121     // Parse arguments.
122     //
123     string opts_report = parse_command_line(argc, argv);
124 
125     //
126     // Open the BAM file(s).
127     //
128     vector<Bam*> bam_f_ptrs;
129 try {
130     for (const string& in_bam : in_bams)
131         bam_f_ptrs.push_back(new Bam(in_bam));
132 } catch(exception& e) {
133     if (bam_f_ptrs.size() >= 250)
134         cerr << "Error: You might need to increase your system's max-open-files limit,"
135                 " see https://groups.google.com/d/msg/stacks-users/GZqJM_WkMhI/m9Hreg4oBgAJ\n";
136     throw;
137 }
138 
139     //
140     // Open the log.
141     //
142     logger.reset(new LogAlterator(out_dir + "gstacks", true, quiet, argc, argv));
143     cout << opts_report << flush;
144 
145     //
146     // Initialize the locus readers.
147     //
148     cout << "\nReading BAM headers...\n" << flush;
149     unique_ptr<BamCLocReader> bam_cloc_reader;
150     unique_ptr<BamCLocBuilder> bam_cloc_builder;
151     const MetaPopInfo* bam_mpopi;
152     if (input_type == GStacksInputT::denovo_popmap || input_type == GStacksInputT::denovo_merger) {
153         bam_cloc_reader.reset(new BamCLocReader(move(bam_f_ptrs)));
154         // (Note: For de novo (Stacks) input we use read groups, even when a
155         // popmap was specified, since the input files ought to have them and
156         // this allows to detect corruption.)
157         bam_mpopi = &bam_cloc_reader->mpopi();
158     } else if (input_type == GStacksInputT::refbased_popmap || input_type == GStacksInputT::refbased_list) {
159         bam_cloc_builder.reset(new BamCLocBuilder(move(bam_f_ptrs), refbased_cfg, sample_names));
160         bam_mpopi = &bam_cloc_builder->mpopi();
161     } else {
162         DOES_NOT_HAPPEN;
163     }
164 
165     //
166     // Open the output files.
167     //
168     string o_gzfasta_path = out_dir + "catalog.fa.gz";
169     o_gzfasta_f = gzopen(o_gzfasta_path.c_str(), "wb");
170     check_open(o_gzfasta_f, o_gzfasta_path);
171 
172     VcfHeader vcf_header;
173     vcf_header.add_std_meta();
174     for(size_t s : bam_mpopi->sample_indexes_orig_order())
175         vcf_header.add_sample(bam_mpopi->samples()[s].name);
176     o_vcf_f.reset(new VcfWriter(out_dir + "catalog.calls", move(vcf_header)));
177 
178     if (detailed_output) {
179         o_details_f.reset(new VersatileWriter(out_dir + "gstacks.details.gz"));
180         *o_details_f << "# show_loc() { loc=$1; zcat ./gstacks.details.gz | sed -rn \"/^BEGIN locus $loc\\b/,\\$ p; /^END locus $loc\\b/ q;\"; }\n";
181     }
182 
183     vector<unique_ptr<Bam>> bam_of_ptrs;
184     if (bam_output) {
185     try {
186         stringstream header_ss;
187         header_ss << "@HD\tVN:1.5\tSO:coordinate\n";
188         for (const Sample& s : bam_cloc_reader->mpopi().samples())
189             header_ss << "@RG\tID:" << s.id << "\tSM:" << s.name << "\tid:" << s.id << '\n';
190         for (size_t target_i=0; target_i<bam_cloc_reader->n_loci(); ++target_i)
191             header_ss << "@SQ\tSN:" << bam_cloc_reader->target2id(target_i) << "\tLN:10000\n";
192         BamHeader header = BamHeader(header_ss.str());
193         if (input_type == GStacksInputT::denovo_merger) {
194             assert(out_bams.size() == 1);
195             bam_of_ptrs.emplace_back(new Bam(out_bams[0], BamHeader(header)));
196         } else if (input_type == GStacksInputT::denovo_popmap) {
197             assert(out_bams.size() == bam_cloc_reader->bam_fs().size());
198             for(size_t i=0; i<out_bams.size(); ++i) {
199                 bam_of_ptrs.emplace_back(new Bam(out_bams[i], BamHeader(header)));
200             }
201         } else {
202             DOES_NOT_HAPPEN;
203         }
204     } catch(exception& e) {
205         if (bam_f_ptrs.size() + bam_of_ptrs.size() >= 250)
206             cerr << "Error: You might need to increase your system's max-open-files limit,"
207                     " see https://groups.google.com/d/msg/stacks-users/GZqJM_WkMhI/m9Hreg4oBgAJ\n";
208         throw;
209     }}
210 
211     if (dbg_write_alns) {
212         string o_aln_path = out_dir + "gstacks.alns";
213         o_aln_f.open(o_aln_path);
214         check_open(o_aln_f, o_aln_path);
215         o_aln_f << o_aln_header;
216     }
217 
218     if (dbg_write_hapgraphs) {
219         string o_hapgraphs_path = out_dir + "gstacks.hapgraphs.dot";
220         o_hapgraphs_f.open(o_hapgraphs_path);
221         check_open(o_hapgraphs_f, o_hapgraphs_path);
222         o_hapgraphs_f << o_hapgraphs_header;
223     }
224 
225     //
226     // Process every locus
227     //
228     GenotypeStats gt_stats (bam_mpopi->samples().size());
229     HaplotypeStats hap_stats (bam_mpopi->samples().size());
230     ContigStats denovo_ctg_stats {};
231 
232     // For clocking.
233     Timers t_threads_totals;
234     Timer t_parallel;
235     Timer t_writing_vcf;
236     size_t n_writes = 0;
237     size_t max_size_before_write = 0;
238 
239     // For parallelization.
240     int omp_return = 0;
241     std::deque<pair<bool,string>> fa_outputs;
242     std::deque<pair<bool,string>> vcf_outputs;
243     std::deque<pair<bool,string>> det_outputs; // (details)
244     std::deque<pair<bool,vector<vector<BamRecord>>>> bam_outputs;
245     size_t next_fa_to_write = 0; // locus index, in the input BAM file.
246     size_t next_vcf_to_write = 0;
247     size_t next_det_to_write = 0;
248     size_t next_bam_to_write = 0;
249 
250     cout << "Processing all loci...\n" << flush;
251     ProgressMeter progress =
252             (input_type == GStacksInputT::denovo_popmap || input_type == GStacksInputT::denovo_merger) ?
253             ProgressMeter(cout, true, bam_cloc_reader->tally_n_components())
254             : ProgressMeter(cout, false, 1000);
255 
256     bool eof = false;
257     #pragma omp parallel num_threads(num_threads)
258     { try {
259         LocusProcessor loc_proc (bam_mpopi->samples().size());
260         Timers& t = loc_proc.timers();
261 
262         CLocReadSet loc (*bam_mpopi); // For denovo.
263         CLocAlnSet aln_loc; // For ref-based.
264         bool thread_eof = false;
265         while(omp_return == 0) {
266             t.reading.restart();
267             #pragma omp critical(read)
268             { try {
269                 if (!eof && omp_return == 0) {
270                     if (input_type == GStacksInputT::denovo_popmap || input_type == GStacksInputT::denovo_merger)
271                         eof = !bam_cloc_reader->read_one_locus(loc);
272                     else
273                         eof = !bam_cloc_builder->build_one_locus(aln_loc);
274                 }
275                 thread_eof = eof;
276             } catch (exception& e) {
277                 omp_return = stacks_handle_exceptions(e);
278             }}
279             t.reading.update();
280             if (thread_eof || omp_return != 0)
281                 break;
282 
283             // Process it.
284             t.processing.restart();
285             size_t loc_i;
286             if (input_type == GStacksInputT::denovo_popmap || input_type == GStacksInputT::denovo_merger) {
287                 loc_i = loc.bam_i();
288                 if (dbg_print_cloc_ids)
289                     cerr << (to_string(loc.id()) + '\n') << flush;
290                 if (ignore_pe_reads)
291                     loc.pe_reads().clear();
292                 loc_proc.process(loc);
293             } else {
294                 assert(aln_loc.id() >= 1);
295                 loc_i = aln_loc.id() - 1;
296                 if (dbg_print_cloc_ids)
297                     cerr << (to_string(aln_loc.id()) + '\n') << flush;
298                 if (refbased_cfg.paired)
299                     aln_loc.merge_paired_reads();
300                 loc_proc.process(aln_loc);
301             }
302             t.processing.update();
303 
304             // Write the FASTA output.
305             t.writing_fa.restart();
306             #pragma omp critical(write_fa)
307             {
308                 for (size_t i=next_fa_to_write+fa_outputs.size(); i<=loc_i; ++i)
309                     fa_outputs.push_back( {false, string()} );
310                 fa_outputs[loc_i - next_fa_to_write] = {true, move(loc_proc.fasta_out())};
311 
312                 while (!fa_outputs.empty() && fa_outputs.front().first) {
313                     const string& fa = fa_outputs.front().second;
314                     if (!fa.empty() && gzwrite(o_gzfasta_f, fa.c_str(), fa.length()) <= 0)
315                         throw std::ios::failure("gzwrite");
316                     fa_outputs.pop_front();
317                     ++next_fa_to_write;
318                 }
319             }
320             t.writing_fa.update();
321 
322             // Write the VCF output.
323             t.writing_vcf.restart();
324             #pragma omp critical(write_vcf)
325             {
326                 for (size_t i=next_vcf_to_write+vcf_outputs.size(); i<=loc_i; ++i)
327                     vcf_outputs.push_back( {false, string()} );
328                 vcf_outputs[loc_i - next_vcf_to_write] = {true, move(loc_proc.vcf_out()) };
329 
330                 if (vcf_outputs.front().first) {
331                     ++n_writes;
332                     if (vcf_outputs.size() > max_size_before_write)
333                         max_size_before_write = vcf_outputs.size();
334                     t_writing_vcf.restart();
335                     do {
336                         o_vcf_f->file() << vcf_outputs.front().second;
337                         vcf_outputs.pop_front();
338                         if (input_type == GStacksInputT::denovo_popmap || input_type == GStacksInputT::denovo_merger)
339                             progress += bam_cloc_reader->n_catalog_components_of(next_vcf_to_write);
340                         else
341                             ++progress;
342                         ++next_vcf_to_write;
343                     } while (!vcf_outputs.empty() && vcf_outputs.front().first);
344                     t_writing_vcf.update();
345                 }
346             }
347             t.writing_vcf.update();
348 
349             // Write the alignments.
350             if (bam_output) {
351                 t.writing_bams.restart();
352                 #pragma omp critical(write_bams)
353                 {
354                     for (size_t i=next_bam_to_write+bam_outputs.size(); i<=loc_i; ++i)
355                         bam_outputs.push_back( make_pair(false, vector<vector<BamRecord>>()) );
356                     bam_outputs[loc_i - next_bam_to_write] = make_pair(true, move(loc_proc.bam_out()));
357 
358                     while (!bam_outputs.empty() && bam_outputs.front().first) {
359                         vector<vector<BamRecord>>& recs = bam_outputs.front().second;
360                         if (!recs.empty()) {
361                             assert(recs.size() == bam_of_ptrs.size());
362                             for (size_t i=0; i<recs.size(); ++i)
363                                 for (const BamRecord& r : recs[i])
364                                     bam_of_ptrs[i]->write(r);
365                         }
366                         bam_outputs.pop_front();
367                         ++next_bam_to_write;
368                     }
369                 }
370                 t.writing_bams.update();
371             }
372 
373             // Write the detailed output.
374             if (detailed_output) {
375                 t.writing_details.restart();
376                 #pragma omp critical(write_details)
377                 {
378                     for (size_t i=next_det_to_write+det_outputs.size(); i<=loc_i; ++i)
379                         det_outputs.push_back( {false, string()} );
380                     det_outputs[loc_i - next_det_to_write] = {true, move(loc_proc.details_out())};
381 
382                     while (!det_outputs.empty() && det_outputs.front().first) {
383                         *o_details_f << det_outputs.front().second;
384                         det_outputs.pop_front();
385                         ++next_det_to_write;
386                     }
387                 }
388                 t.writing_details.update();
389             }
390         }
391 
392         // Tally the per-thread statistics.
393         #pragma omp critical(stats)
394         if (omp_return == 0) {
395             denovo_ctg_stats += loc_proc.ctg_stats();
396             gt_stats  += loc_proc.gt_stats();
397             hap_stats  += loc_proc.hap_stats();
398             t_threads_totals += loc_proc.timers();
399         }
400     } catch (exception& e) {
401         #pragma omp critical(exc)
402         omp_return = stacks_handle_exceptions(e);
403     }}
404     if (omp_return != 0)
405         return omp_return;
406     t_parallel.update();
407     progress.done();
408 
409     //
410     // Report statistics.
411     //
412     ostream o_fp1 (cout.rdbuf());
413     o_fp1 << std::fixed << std::setprecision(1);
414     ostream o_fp3 (cout.rdbuf());
415     o_fp3 << std::fixed << std::setprecision(3);
416     ostream x_fp1 (logger->x.rdbuf());
417     x_fp1 << std::fixed << std::setprecision(1);
418     ostream x_fp3 (logger->x.rdbuf());
419     x_fp3 << std::fixed << std::setprecision(3);
420 
421     if (input_type == GStacksInputT::denovo_popmap || input_type == GStacksInputT::denovo_merger) {
422         if (denovo_ctg_stats.n_loci_w_pe_reads == 0) {
423             cout << "Input appears to be single-end (no paired-end reads were seen).\n";
424         } else {
425             // Report assembly statistics.
426             assert(!ignore_pe_reads);
427             const ContigStats& cs = denovo_ctg_stats;
428             size_t no_pe   = cs.n_loci_no_pe_reads() + cs.n_loci_almost_no_pe_reads;
429             size_t pe_ndag  = cs.n_loci_pe_graph_not_dag;
430             size_t pe_ctg = cs.n_loci_ctg();
431             auto pct = [&cs](size_t n) { return as_percentage((double) n / cs.n_nonempty_loci); };
432 
433             o_fp1 << "\n"
434                << "Attempted to assemble and align paired-end reads for " << cs.n_nonempty_loci << " loci:\n"
435                << "  " << no_pe << " loci had no or almost no paired-end reads (" << pct(no_pe) << ");\n"
436                << "  " << pe_ndag << " loci had paired-end reads that couldn't be assembled into a contig ("
437                << pct(pe_ndag) << ");\n"
438                << "  For the remaining " << pe_ctg << " loci (" << pct(pe_ctg) << "), a paired-end contig was assembled;\n"
439                << "    Average contig size was " << cs.ctg_avg_length() << " bp;\n"
440                << "  " << cs.n_overlaps << " paired-end contigs overlapped the forward region ("
441                << as_percentage((double) cs.n_overlaps / cs.n_loci_ctg()) << ")\n"
442                << "    Mean overlap: " << cs.mean_olap_length() << "bp; mean size of overlapped loci after merging: "
443                << cs.mean_olapd_locus_length() << ";\n"
444                << "  Out of " << cs.n_tot_reads << " paired-end reads in these loci (mean "
445                << (double) cs.n_aln_reads / pe_ctg << " reads per locus),\n"
446                << "    " << cs.n_aln_reads << " were successfuly aligned ("
447                << as_percentage((double) cs.n_aln_reads / cs.n_tot_reads) << ");\n"
448                << "  Mean insert length was " << cs.insert_length_olap_mv.mean() << ", stdev: "
449                << cs.insert_length_olap_mv.sd_p() << " (based on aligned reads in overlapped loci).\n";
450         }
451 
452     } else if (input_type == GStacksInputT::refbased_popmap || input_type == GStacksInputT::refbased_list) {
453         // Report statistics on the input BAM(s).
454         BamCLocBuilder::BamStats bam_stats {};
455         const vector<BamCLocBuilder::BamStats>& bam_stats_s = bam_cloc_builder->bam_stats_per_sample();
456         for (auto& bstats : bam_stats_s)
457             bam_stats += bstats;
458         size_t tot = bam_stats.n_primary + bam_stats.n_unmapped;
459         cout << "\n"
460              << "Read " << bam_stats.n_records << " BAM records:\n"
461              << "  kept " << bam_stats.n_primary_kept() << " primary alignments ("
462              << as_percentage(bam_stats.n_primary_kept(), tot) << ")";
463         if (refbased_cfg.paired)
464             cout << ", of which " << bam_stats.n_primary_kept_read2 << " reverse reads";
465         cout << "\n"
466              << "  skipped " << bam_stats.n_primary_mapq << " primary alignments with insufficient mapping qualities ("
467              << as_percentage((double) bam_stats.n_primary_mapq / tot) << ")\n"
468              << "  skipped " << bam_stats.n_primary_softclipped << " excessively soft-clipped primary alignments ("
469              << as_percentage((double) bam_stats.n_primary_softclipped / tot) << ")\n"
470              << "  skipped " << bam_stats.n_unmapped << " unmapped reads ("
471              << as_percentage((double) bam_stats.n_unmapped / tot) << ")\n";
472         if (bam_stats.n_secondary > 0 || bam_stats.n_supplementary > 0)
473             cout << "  skipped some suboptimal (secondary/supplementary) alignment records\n";
474         if (refbased_cfg.ign_pe_reads)
475             cout << "  ignored " << bam_stats.n_ignored_read2_recs << " READ2 records\n";
476 
477         // Per sample BAM stats.
478         logger->x << "\n"
479                   << "BEGIN bam_stats_per_sample\n"
480                   << "sample"
481                      "\trecords"
482                      "\tprimary_kept"
483                      "\tkept_frac"
484                      "\tprimary_kept_read2"
485                      "\tprimary_disc_mapq"
486                      "\tprimary_disc_sclip"
487                      "\tunmapped"
488                      "\tsecondary"
489                      "\tsupplementary\n";
490         size_t min_recs = SIZE_MAX;
491         size_t max_recs = 0;
492         double mean_recs = 0.0;
493         double min_keep_rate = DBL_MAX;
494         double max_keep_rate = 0.0;
495         assert(bam_stats_s.size() == bam_mpopi->samples().size());
496         for (size_t sample=0; sample<bam_stats_s.size(); ++sample) {
497             auto& bstats = bam_stats_s[sample];
498             x_fp3 << bam_mpopi->samples()[sample].name << '\t'
499                   << bstats.n_records << '\t'
500                   << bstats.n_primary_kept() << '\t'
501                   << (double) bstats.n_primary_kept() / bstats.n_records << '\t'
502                   << bstats.n_primary_kept_read2 << '\t'
503                   << bstats.n_primary_mapq << '\t'
504                   << bstats.n_primary_softclipped << '\t'
505                   << bstats.n_unmapped << '\t'
506                   << bstats.n_secondary << '\t'
507                   << bstats.n_supplementary << '\n';
508             mean_recs += bstats.n_records;
509             if (bstats.n_records < min_recs)
510                 min_recs = bstats.n_records;
511             if (bstats.n_records > max_recs)
512                 max_recs = bstats.n_records;
513             double keep_rate = (double) bstats.n_primary_kept() / bstats.n_records;
514             if (keep_rate < min_keep_rate)
515                 min_keep_rate = keep_rate;
516             if (keep_rate > max_keep_rate)
517                 max_keep_rate = keep_rate;
518         }
519         logger->x << "END bam_stats_per_sample\n";
520         mean_recs /= bam_mpopi->samples().size();
521         o_fp1 << "\n"
522               << "  Per-sample stats (details in '"
523               << logger->distribs_path.substr(logger->distribs_path.rfind('/') + 1)
524               << "'):\n"
525               << "    read " << mean_recs << " records/sample (" << min_recs
526               << "-" << max_recs << ")\n"
527               << "    kept " << as_percentage(min_keep_rate) << "-"
528               << as_percentage(max_keep_rate) << " of these\n";
529 
530         // Report statistics on the loci that were built.
531         const BamCLocBuilder::LocStats& loc_stats = bam_cloc_builder->loc_stats();
532         cout << "\n"
533              << "Built " << loc_stats.n_loci_built << " loci comprising "
534              << loc_stats.n_fw_reads;
535         if (refbased_cfg.paired) {
536             o_fp1 << " forward reads and "
537                   << loc_stats.n_read_pairs() << " matching paired-end reads;"
538                   << " mean insert length was " << loc_stats.insert_lengths_mv.mean()
539                   << " (sd: " << loc_stats.insert_lengths_mv.sd_p() << ").\n";
540         } else {
541             cout << " reads.\n";
542         }
543     }
544 
545     if (rm_unpaired_reads) {
546         // Report statistics on pairs.
547         size_t n_unpaired = gt_stats.n_unpaired_reads_rm();
548         size_t n_pcr_dupl = gt_stats.n_read_pairs_pcr_dupl();
549         size_t n_used = gt_stats.n_read_pairs_used();
550         size_t n_remaining_loci = gt_stats.n_genotyped_loci;
551         cout << "Removed " << n_unpaired << " unpaired (forward) reads ("
552              << as_percentage((double) n_unpaired / (n_unpaired + n_used + n_pcr_dupl)) << "); kept "
553              << n_used + n_pcr_dupl << " read pairs in "
554              << n_remaining_loci << " loci.\n";
555         if (rm_pcr_duplicates) {
556             cout << "Removed " << n_pcr_dupl
557                  << " read pairs whose insert length had already been seen in the same sample as putative PCR duplicates ("
558                  << as_percentage((double) n_pcr_dupl / (n_pcr_dupl + n_used))
559                  << "); kept " << n_used << " read pairs.\n";
560         } else {
561             assert(n_pcr_dupl == 0);
562         }
563     } else {
564         assert(gt_stats.n_unpaired_reads_rm() == 0 && gt_stats.n_read_pairs_pcr_dupl() == 0);
565     }
566 
567     // Report statistics on genotyping and haplotyping.
568     size_t n_hap_attempts = hap_stats.n_halplotyping_attempts();
569     size_t n_hap_pairs = hap_stats.n_consistent_hap_pairs();
570     OnlineMeanVar ecov_mean;
571     double ecov_min = DBL_MAX;
572     double ecov_max = 0.0;
573     for (const GenotypeStats::PerSampleStats& sample : gt_stats.per_sample_stats) {
574         double cov = (double) sample.ns_weighted_n_read_pairs_used / sample.ns_cumsum;
575         ecov_mean.increment(cov);
576         if (cov < ecov_min)
577             ecov_min = cov;
578         if (cov > ecov_max)
579             ecov_max = cov;
580     }
581     o_fp1 << "\n"
582          << "Genotyped " << gt_stats.n_genotyped_loci << " loci:\n";
583     if (gt_stats.n_genotyped_loci == 0) {
584         cerr << "Error: There wasn't any locus to genotype; check input/arguments.\n";
585         throw exception();
586     }
587     o_fp1 << "  effective per-sample coverage: mean=" << ecov_mean.mean() << "x, stdev="
588          << ecov_mean.sd_p() << "x, min=" << ecov_min << "x, max=" << ecov_max << "x\n"
589          << "  mean number of sites per locus: " << gt_stats.mean_n_sites_per_loc() << "\n"
590          << "  a consistent phasing was found for " << n_hap_pairs << " of out " << n_hap_attempts
591          << " (" << as_percentage((double) n_hap_pairs / n_hap_attempts)
592          << ") diploid loci needing phasing\n";
593 
594     // effective_coverages_per_sample
595     logger->x << "\n"
596                  "BEGIN effective_coverages_per_sample\n"
597                  "# For mean_cov_ns, the coverage at each locus is weighted by the number of\n"
598                  "# samples present at that locus (i.e. coverage at shared loci counts more).\n";
599     logger->x << "sample"
600                  "\tn_loci"
601                  "\tn_used_fw_reads"
602                  "\tmean_cov"
603                  "\tmean_cov_ns";
604     if (rm_unpaired_reads) {
605         logger->x << "\tn_unpaired_reads";
606         if (rm_pcr_duplicates) {
607             logger->x << "\tn_pcr_dupl_pairs"
608                          "\tpcr_dupl_rate";
609         }
610     }
611     logger->x << '\n';
612     assert(gt_stats.per_sample_stats.size() == bam_mpopi->samples().size());
613     for (size_t sample=0; sample<bam_mpopi->samples().size(); ++sample) {
614         const GenotypeStats::PerSampleStats& stats = gt_stats.per_sample_stats[sample];
615         x_fp3 << bam_mpopi->samples()[sample].name
616            << '\t' << stats.n_loci_with_sample
617            << '\t' << stats.n_read_pairs_used
618            << '\t' << (double) stats.n_read_pairs_used / stats.n_loci_with_sample
619            << '\t' << (double) stats.ns_weighted_n_read_pairs_used / stats.ns_cumsum;
620         if (rm_unpaired_reads) {
621             x_fp3 << '\t' << stats.n_unpaired_reads;
622             if (rm_pcr_duplicates) {
623                 x_fp3 << '\t' << stats.n_read_pairs_pcr_dupl
624                       << '\t' << (double) stats.n_read_pairs_pcr_dupl / (stats.n_read_pairs_pcr_dupl + stats.n_read_pairs_used);
625             }
626         }
627         x_fp3 << '\n';
628     }
629     logger->x << "END effective_coverages_per_sample\n";
630 
631     // pcr_clone_size_distrib
632     if(rm_pcr_duplicates) {
633         const vector<size_t>& cz_d = gt_stats.pcr_clone_size_distrib;
634         assert(!cz_d.empty());
635         assert(cz_d[0] == 0); // (We can't observe a clone of 0 reads.)
636         logger->x << "\n"
637                   << "BEGIN pcr_clone_size_distrib\n"
638                   <<"clone_size\tn_clones\tn_reads\n";
639         for(size_t clone_size=1; clone_size<cz_d.size(); ++clone_size)
640             logger->x << clone_size
641                       << '\t' << cz_d[clone_size]
642                       << '\t' << cz_d[clone_size] * clone_size
643                       << '\n';
644         logger->x << "END pcr_clone_size_distrib\n";
645     }
646 
647     // phasing_rates_samples
648     logger->x << "\n"
649               << "BEGIN phasing_rates_per_sample\n"
650               << "sample\tn_gts\tn_multisnp_hets\tn_phased\tmisphasing_rate"
651               #ifdef DEBUG
652               << "\tn_phased_2ndpass"
653               #endif
654               << "\n";
655 
656     assert(hap_stats.per_sample_stats.size() == bam_mpopi->samples().size());
657     for (size_t sample=0; sample<bam_mpopi->samples().size(); ++sample) {
658         const HaplotypeStats::PerSampleStats& stats = hap_stats.per_sample_stats[sample];
659 
660         x_fp3 << bam_mpopi->samples()[sample].name
661            << '\t' << stats.n_diploid_loci
662            << '\t' << stats.n_hets_2snps
663            << '\t' << stats.n_phased
664            << '\t' << 1.0 - (double) stats.n_phased / stats.n_hets_2snps
665            #ifdef DEBUG
666            << '\t' << stats.n_phased_2ndpass
667            #endif
668            << '\n';
669     }
670     logger->x << "END phasing_rates_per_sample\n";
671 
672     // phasing_rates_loci
673     #ifdef DEBUG
674     logger->x << "\n"
675               << "BEGIN phasing_rates_loci\n"
676               << "n_hets_2snps\tn_bad_hets\tn_loci\n";
677     for (auto& elem : hap_stats.n_badly_phased_samples)
678         logger->x << elem.first.second
679                   << '\t' << elem.first.second - elem.first.first
680                   << '\t' << elem.second
681                   << '\n';
682     logger->x << "END phasing_rates_loci\n";
683     #endif
684 
685     // Report clockings.
686     {
687         double ll  = t_parallel.elapsed();
688         double v   = t_writing_vcf.elapsed();
689 
690         double r   = t_threads_totals.reading.elapsed() / num_threads;
691         double p   = t_threads_totals.processing.elapsed() / num_threads;
692         double w_f = t_threads_totals.writing_fa.elapsed() / num_threads;
693         double w_v = t_threads_totals.writing_vcf.elapsed() / num_threads;
694         double w_d = t_threads_totals.writing_details.elapsed() / num_threads;
695         double w_b = t_threads_totals.writing_bams.elapsed() / num_threads;
696 
697         double ppr = t_threads_totals.processing_pre_alns.elapsed() / num_threads;
698         double rn   = t_threads_totals.rm_Ns.elapsed() / num_threads;
699         double as   = t_threads_totals.assembling.elapsed() / num_threads;
700         double ia   = t_threads_totals.init_alignments.elapsed() / num_threads;
701         double al   = t_threads_totals.aligning.elapsed() / num_threads;
702         double me   = t_threads_totals.merge_paired_reads.elapsed() / num_threads;
703         double ppo = t_threads_totals.processing_post_alns.elapsed() / num_threads;
704         double rr = t_threads_totals.rm_reads.elapsed() / num_threads;
705         double cnt = t_threads_totals.counting_nts.elapsed() / num_threads;
706         double g   = t_threads_totals.genotyping.elapsed() / num_threads;
707         double h   = t_threads_totals.haplotyping.elapsed() / num_threads;
708         double u   = t_threads_totals.cpt_consensus.elapsed() / num_threads;
709         double b_v = t_threads_totals.building_vcf.elapsed() / num_threads;
710         double b_f = t_threads_totals.building_fa.elapsed() / num_threads;
711         double b_b = t_threads_totals.building_bams.elapsed() / num_threads;
712 
713         double c = t_parallel.consumed()
714                  + t_threads_totals.reading.consumed() / num_threads
715                  + t_threads_totals.processing.consumed() / num_threads
716                  + t_threads_totals.writing_fa.consumed() / num_threads
717                  + t_threads_totals.writing_vcf.consumed() / num_threads
718                  + t_threads_totals.writing_details.consumed() / num_threads
719                  + t_threads_totals.writing_bams.consumed() / num_threads
720                  + t_threads_totals.processing_pre_alns.consumed() / num_threads
721                  + t_threads_totals.rm_Ns.consumed() / num_threads
722                  + t_threads_totals.assembling.consumed() / num_threads
723                  + t_threads_totals.init_alignments.consumed() / num_threads
724                  + t_threads_totals.aligning.consumed() / num_threads
725                  + t_threads_totals.merge_paired_reads.consumed() / num_threads
726                  + t_threads_totals.processing_post_alns.consumed() / num_threads
727                  + t_threads_totals.rm_reads.consumed() / num_threads
728                  + t_threads_totals.counting_nts.consumed() / num_threads
729                  + t_threads_totals.genotyping.consumed() / num_threads
730                  + t_threads_totals.haplotyping.consumed() / num_threads
731                  + t_threads_totals.cpt_consensus.consumed() / num_threads
732                  + t_threads_totals.building_vcf.consumed() / num_threads
733                  + t_threads_totals.building_fa.consumed() / num_threads
734                  + t_threads_totals.building_bams.consumed() / num_threads
735                  + t_writing_vcf.consumed()
736                  ;
737 
738         x_fp1 << "\n"
739            << "BEGIN clockings\n"
740            << "Num. threads: " << num_threads << "\n"
741            << "Parallel time: " << ll << "\n"
742            << "Average thread time spent:\n"
743            << std::setw(8) << r  << "  reading (" << as_percentage(r / ll) << ")\n"
744            << std::setw(8) << p << "  processing (" << as_percentage(p / ll) << ")\n";
745         if (as != 0.0)
746             // De novo mode & paired-ends.
747             x_fp1
748                << std::setw(16) << ppr << " pre-alignments block (" << as_percentage(ppr / ll) << ")\n"
749                << std::setw(16) << rn << "  reformatting fw-reads (" << as_percentage(rn / ll) << ")\n"
750                << std::setw(16) << as << "  assembling (" << as_percentage(as / ll) << ")\n"
751                << std::setw(16) << ia << "  initializing alignments (" << as_percentage(ia / ll) << ")\n"
752                << std::setw(16) << al << "  aligning (" << as_percentage(al / ll) << ")\n"
753                << std::setw(16) << me << "  merging read pairs (" << as_percentage(me / ll) << ")\n"
754                ;
755         x_fp1
756            << std::setw(16) << ppo << " post-alignments block (" << as_percentage(ppo / ll) << ")\n"
757            << std::setw(16) << rr << "  filtering reads (" << as_percentage(rr / ll) << ")\n"
758            << std::setw(16) << cnt << "  counting nucleotides (" << as_percentage(cnt / ll) << ")\n"
759            << std::setw(16) << g << "  genotyping (" << as_percentage(g / ll) << ")\n"
760            << std::setw(16) << h << "  haplotyping (" << as_percentage(h / ll) << ")\n"
761            << std::setw(16) << u << "  computing consensus (" << as_percentage(u / ll) << ")\n"
762            << std::setw(16) << b_f << "  building_fa (" << as_percentage(b_f / ll) << ")\n"
763            << std::setw(16) << b_v << "  building_vcf (" << as_percentage(b_v / ll) << ")\n";
764         if (bam_output)
765             x_fp1 << std::setw(16) << b_b << "  building_bam (" << as_percentage(b_b / ll) << ")\n"
766                 << std::setw(8) << w_b << "  writing_bam (" << as_percentage(w_b / ll) << ")\n";
767         x_fp1 << std::setw(8) << w_f << "  writing_fa (" << as_percentage(w_f / ll) << ")\n"
768            << std::setw(8) << w_v << "  writing_vcf (" << as_percentage(w_v / ll) << ")\n";
769         if (detailed_output)
770             x_fp1 << std::setw(8) << w_d << "  writing_details (" << as_percentage(w_d / ll) << ")\n";
771         x_fp1 << std::setw(8) << c << "  clocking (" << as_percentage(c / ll) << ")\n"
772            << "Total time spent writing vcf: " << v << " (" << as_percentage(v / ll) << ")\n"
773            << "VCFwrite block size: mean=" << (double) gt_stats.n_genotyped_loci / n_writes
774                << "(n=" << n_writes << "); max=" << max_size_before_write << "\n"
775            << "END clockings\n";
776     }
777 
778     #ifdef DEBUG
779     const MarukiLowModel* m = dynamic_cast<const MarukiLowModel*>(model.get());
780     if (m != NULL) {
781         // Report how often the "underflow" likelihood equations were used
782         cout << "\n"
783              << "DEBUG: marukilow: calc_ln_weighted_sums: "
784              << m->n_wsum_tot() << " calls, "
785              << m->n_wsum_underflows() << " underflows ("
786              << as_percentage(m->n_wsum_underflows(), m->n_wsum_tot())
787              << ").\n";
788         cout << "DEBUG: marukilow: mean_err_rate: "
789              << m->mean_err_rate() << '\n';
790     }
791     #endif
792 
793     //
794     // Cleanup & return.
795     //
796 
797     gzclose(o_gzfasta_f);
798     o_vcf_f->file().close();
799     o_vcf_f.reset();
800     for (unique_ptr<Bam>& b : bam_of_ptrs) {
801         b->close();
802         b.reset();
803     }
804     if (o_details_f) {
805         o_details_f->close();
806         o_details_f.reset();
807     }
808     model.reset();
809     if (dbg_write_hapgraphs)
810         o_hapgraphs_f << "}\n";
811     cout << "\ngstacks is done.\n";
812     return 0;
813 
814 } catch (exception& e) {
815     return stacks_handle_exceptions(e);
816 }
817 }
818 
819 
820 //
821 // SnpAlleleCooccurrenceCounter
822 // ============================
823 //
824 
at(size_t snp_i1,Nt2 snp1_allele,size_t snp_i2,Nt2 snp2_allele) const825 const size_t& SnpAlleleCooccurrenceCounter::at(size_t snp_i1, Nt2 snp1_allele, size_t snp_i2, Nt2 snp2_allele) const {
826     assert(snp_i1 < snp_i2);
827     return cooccurrences_[snp_i1*n_snps_+snp_i2][size_t(snp1_allele)][size_t(snp2_allele)];
828 }
829 
clear()830 void SnpAlleleCooccurrenceCounter::clear() {
831     for(size_t i=0; i<n_snps_; ++i)
832         for(size_t j=i+1; j<n_snps_; ++j)
833             cooccurrences_[i*n_snps_+j] = array<array<size_t,4>,4>();
834 }
835 
836 //
837 // GenotypeStats, HaplotypeStats & ContigStats
838 // ===============
839 //
840 
operator +=(const GenotypeStats & other)841 GenotypeStats& GenotypeStats::operator+= (const GenotypeStats& other) {
842     this->n_genotyped_loci += other.n_genotyped_loci;
843     this->n_sites_tot      += other.n_sites_tot;
844     if (pcr_clone_size_distrib.size() < other.pcr_clone_size_distrib.size())
845         pcr_clone_size_distrib.resize(other.pcr_clone_size_distrib.size());
846     for (size_t i=0; i<other.pcr_clone_size_distrib.size(); ++i)
847         pcr_clone_size_distrib[i] += other.pcr_clone_size_distrib[i];
848     for (size_t sample=0; sample<per_sample_stats.size(); ++sample) {
849         per_sample_stats[sample].n_unpaired_reads += other.per_sample_stats[sample].n_unpaired_reads;
850         per_sample_stats[sample].n_read_pairs_pcr_dupl += other.per_sample_stats[sample].n_read_pairs_pcr_dupl;
851         per_sample_stats[sample].n_read_pairs_used += other.per_sample_stats[sample].n_read_pairs_used;
852         per_sample_stats[sample].n_loci_with_sample += other.per_sample_stats[sample].n_loci_with_sample;
853         per_sample_stats[sample].ns_cumsum += other.per_sample_stats[sample].ns_cumsum;
854         per_sample_stats[sample].ns_weighted_n_read_pairs_used += other.per_sample_stats[sample].ns_weighted_n_read_pairs_used;
855     }
856     return *this;
857 }
858 
operator +=(const HaplotypeStats & other)859 HaplotypeStats& HaplotypeStats::operator+= (const HaplotypeStats& other) {
860     for (auto elem : other.n_badly_phased_samples)
861         this->n_badly_phased_samples[elem.first] += elem.second;
862     assert(per_sample_stats.size() == other.per_sample_stats.size());
863     for (size_t sample=0; sample<per_sample_stats.size(); ++sample) {
864         per_sample_stats[sample].n_diploid_loci += other.per_sample_stats[sample].n_diploid_loci;
865         per_sample_stats[sample].n_hets_2snps += other.per_sample_stats[sample].n_hets_2snps;
866         per_sample_stats[sample].n_phased += other.per_sample_stats[sample].n_phased;
867         per_sample_stats[sample].n_phased_2ndpass += other.per_sample_stats[sample].n_phased_2ndpass;
868     }
869     return *this;
870 }
871 
n_halplotyping_attempts() const872 size_t HaplotypeStats::n_halplotyping_attempts() const {
873     size_t n1 = 0;
874     for (auto& s: per_sample_stats)
875         n1 += s.n_hets_2snps;
876     size_t n2 = 0;
877     for (auto& e: n_badly_phased_samples)
878         n2+= e.second * e.first.second;
879     assert(n1==n2);
880     return n1;
881 }
882 
n_consistent_hap_pairs() const883 size_t HaplotypeStats::n_consistent_hap_pairs() const {
884     size_t n1 = 0;
885     for (auto& s: per_sample_stats)
886         n1 += s.n_phased;
887     size_t n2 = 0;
888     for (auto& e: n_badly_phased_samples)
889         n2 += e.second * e.first.first;
890     assert(n1==n2);
891     return n1;
892 }
893 
operator +=(const ContigStats & other)894 ContigStats& ContigStats::operator+= (const ContigStats& other) {
895     this->n_nonempty_loci           += other.n_nonempty_loci;
896     this->n_loci_w_pe_reads         += other.n_loci_w_pe_reads;
897     this->n_loci_almost_no_pe_reads += other.n_loci_almost_no_pe_reads;
898     this->n_loci_pe_graph_not_dag   += other.n_loci_pe_graph_not_dag;
899     this->n_loci_pe_graph_fixed_to_dag += other.n_loci_pe_graph_fixed_to_dag;
900     this->length_ctg_tot            += other.length_ctg_tot;
901     this->n_aln_reads               += other.n_aln_reads;
902     this->n_tot_reads               += other.n_tot_reads;
903     this->n_overlaps                += other.n_overlaps;
904     this->length_overlap_tot        += other.length_overlap_tot;
905     this->length_olapd_loci_tot     += other.length_olapd_loci_tot;
906     this->insert_length_olap_mv    += other.insert_length_olap_mv;
907 
908     return *this;
909 }
910 
911 //
912 // LocData
913 // =======
914 //
clear()915 void LocData::clear() {
916     id = -1;
917     pos.clear();
918     mpopi = NULL;
919     ctg_status = unknown;
920     olap_length = SIZE_MAX;
921     o_vcf.clear();
922     o_fa.clear();
923     o_details.clear();
924     details_ss.clear();
925     details_ss.str(string());
926 }
927 
928 //
929 // LocusProcessor
930 // ==============
931 //
932 
933 void
process(CLocReadSet & loc)934 LocusProcessor::process(CLocReadSet& loc)
935 {
936     timers_.processing_pre_alns.restart();
937     loc_.clear();
938     if (dbg_denovo_min_loc_samples > 0
939             && loc.n_samples() < dbg_denovo_min_loc_samples)
940         loc.clear();
941     if (loc.reads().empty()) {
942         timers_.processing_pre_alns.update();
943         return;
944     }
945     if (detailed_output)
946         loc_.details_ss << "BEGIN locus " << loc.id() << "\n";
947 
948     ++ctg_stats_.n_nonempty_loci;
949     if (!loc.pe_reads().empty())
950         ++ctg_stats_.n_loci_w_pe_reads;
951     loc_.id = loc.id();
952     loc_.pos = loc.pos();
953     loc_.mpopi = &loc.mpopi();
954 
955     CLocAlnSet aln_loc;
956     aln_loc.reinit(loc_.id, loc_.pos, loc_.mpopi);
957 
958     //
959     // Remove N's in the forward reads.
960     //
961     timers_.rm_Ns.restart();
962     for (SRead& r : loc.reads())
963         r.seq.remove_Ns();
964     timers_.rm_Ns.update();
965 
966     //
967     // Assemble a contig.
968     //
969     timers_.assembling.restart();
970     if (detailed_output)
971         loc_.details_ss << "BEGIN contig\n";
972     DNASeq4 ctg = assemble_locus_contig(loc.reads(), loc.pe_reads());
973     timers_.assembling.update();
974     timers_.processing_pre_alns.update();
975     if (ctg.empty()) {
976         if (detailed_output) {
977             loc_.details_ss << "contig_assembly_failed\n"
978                 << "END contig\n"
979                 << "END locus " << loc_.id << '\n';
980             loc_.o_details = loc_.details_ss.str();
981         }
982         return;
983     }
984     aln_loc.ref(move(ctg));
985     timers_.assembling.update();
986 
987     timers_.init_alignments.restart();
988     SuffixTree* stree = new SuffixTree(aln_loc.ref());
989     stree->build_tree();
990     GappedAln aligner;
991     AlignRes aln_res;
992     if (!loc.pe_reads().empty()) {
993         //
994         // Check that the contig starts at the cutsite (and not in the
995         // adapter), by aligning one FW read.
996         //
997         for (SRead& r : loc.reads()) {
998             if (!align_reads_to_contig(stree, &aligner, r.seq, aln_res))
999                 continue;
1000             // Read did align. Check start position & break.
1001             if (aln_res.subj_pos > 0) {
1002                 DNASeq4 new_ctg;
1003                 DNASeq4::iterator itr = aln_loc.ref().begin();
1004                 for (size_t i=0; i<aln_res.subj_pos; ++i) {
1005                     assert(itr != aln_loc.ref().end());
1006                     ++itr;
1007                 }
1008                 new_ctg.append(itr, aln_loc.ref().end());
1009                 if (detailed_output)
1010                     loc_.details_ss
1011                         << "orig_contig\t" << aln_loc.ref() << '\n'
1012                         << "first_fw_read_aln\t" << r.name << "\tpos:" << aln_res.subj_pos << '\n';
1013                 aln_loc.ref(move(new_ctg));
1014                 delete stree;
1015                 stree = new SuffixTree(aln_loc.ref());
1016                 stree->build_tree();
1017             }
1018             break;
1019         }
1020     }
1021     ctg_stats_.length_ctg_tot += aln_loc.ref().length();
1022     if (detailed_output)
1023         loc_.details_ss << "contig\t" << aln_loc.ref() << '\n'
1024             << "END contig\n";
1025     timers_.init_alignments.update();
1026 
1027     //
1028     // Align the reads.
1029     //
1030     timers_.aligning.restart();
1031     this->ctg_stats_.n_tot_reads += loc.reads().size();
1032     this->ctg_stats_.n_tot_reads += loc.pe_reads().size();
1033     if (detailed_output)
1034         loc_.details_ss << "BEGIN pe_alns\n";
1035     for (SRead& r : loc.reads()) { // FORWARD READS
1036         if(add_read_to_aln(aln_loc, aln_res, move(r), &aligner, stree)) {
1037             this->ctg_stats_.n_aln_reads++;
1038             if (detailed_output)
1039                 loc_.details_ss << "fw_aln_local"
1040                                 << '\t' << aln_loc.reads().back().name
1041                                 << '\t' << aln_res.subj_pos + 1 << ':' << aln_res.cigar
1042                                 << '\n';
1043         } else {
1044             if (detailed_output)
1045                 // `r` hasn't been moved.
1046                 loc_.details_ss << "fw_aln_fail\t" << r.name << '\n';
1047         }
1048     }
1049     for (SRead& r : loc.pe_reads()) {
1050         if (add_read_to_aln(aln_loc, aln_res, move(r), &aligner, stree)) {
1051             this->ctg_stats_.n_aln_reads++;
1052             if (loc_.ctg_status == LocData::overlapped) {
1053                 // Record the insert length. (Insert lengths are just based on
1054                 // reverse reads but they should all have matching foward reads
1055                 // starting at the restriction site so it makes sense.)
1056                 const Cigar& cigar = aln_loc.reads().back().cigar;
1057                 size_t insert_length = aln_loc.ref().length();
1058                 assert(!cigar.empty());
1059                 if (cigar.back().first == 'D') {
1060                     insert_length -= cigar.back().second;
1061                     if (cigar.size() >=2 && (*++cigar.rbegin()).first == 'I')
1062                         // Soft clipping on the far end.
1063                         insert_length += (*++cigar.rbegin()).second;
1064                 }
1065                 ctg_stats_.insert_length_olap_mv.increment(insert_length);
1066             }
1067             if (detailed_output)
1068                 loc_.details_ss << "pe_aln_local"
1069                                 << '\t' << aln_loc.reads().back().name
1070                                 << '\t' << aln_res.subj_pos + 1 << ':' << aln_res.cigar
1071                                 << '\n';
1072         } else {
1073             if (detailed_output)
1074                 // `r` hasn't been moved.
1075                 loc_.details_ss << "pe_aln_fail\t" << r.name << '\n';
1076         }
1077     }
1078     if (detailed_output)
1079         loc_.details_ss << "END pe_alns\n";
1080     delete stree;
1081     timers_.aligning.update();
1082 
1083     if (bam_output) {
1084         timers_.building_bams.restart();
1085         assert(input_type == GStacksInputT::denovo_popmap || input_type == GStacksInputT::denovo_merger);
1086         const bool merged_input = input_type == GStacksInputT::denovo_merger;
1087         if (loc_.o_bam.empty())
1088             loc_.o_bam.resize(merged_input ? 1 : loc_.mpopi->n_samples());
1089         aln_loc.sort_by_alignment_offset();
1090         for (const SAlnRead& read : aln_loc.reads()) {
1091             BamRecord rec;
1092             assert(read.name.length() >= 2 && *++read.name.rbegin() == '/');
1093             Cigar c = read.cigar;
1094             size_t offset = 0;
1095             assert(!c.empty());
1096             if (c.back().first == 'D')
1097                 c.pop_back();
1098             else if (c.back().first == 'I' && c.size() >= 2 && (*----c.end()).first == 'D')
1099                 c.erase(----c.end());
1100             if (c[0].first == 'D') {
1101                 offset = c[0].second;
1102                 c.erase(c.begin());
1103             } else if (c[0].first == 'I' && c.size() >= 2 && c[1].first == 'D') {
1104                 offset = c[1].second;
1105                 c.erase(++c.begin());
1106             }
1107             rec.assign(
1108                 read.name.substr(0, read.name.length() - 2),
1109                 *read.name.rbegin() == '1' ? BAM_FREAD1 : BAM_FREAD2,
1110                 aln_loc.id(),
1111                 offset,
1112                 c,
1113                 read.seq,
1114                 read.sample
1115                 );
1116             loc_.o_bam.at(merged_input ? 0 : read.sample).push_back(move(rec));
1117         }
1118         timers_.building_bams.update();
1119     }
1120 
1121     timers_.merge_paired_reads.restart();
1122     aln_loc.merge_paired_reads();
1123     timers_.merge_paired_reads.update();
1124     loc.clear();
1125     timers_.processing_pre_alns.update();
1126     if (aln_loc.reads().empty()) {
1127        if (detailed_output)
1128            loc_.details_ss << "no_reads_were_aligned\n";
1129         return;
1130     }
1131     process(aln_loc);
1132 }
1133 
1134 void
process(CLocAlnSet & aln_loc)1135 LocusProcessor::process(CLocAlnSet& aln_loc)
1136 {
1137     assert(!aln_loc.reads().empty());
1138     timers_.processing_post_alns.restart();
1139     if (input_type == GStacksInputT::denovo_popmap || input_type == GStacksInputT::denovo_merger) {
1140         // Called from process(CLocReadSet&).
1141         assert(this->loc_.id == aln_loc.id());
1142     } else {
1143         this->loc_.clear();
1144         this->loc_.id = aln_loc.id();
1145         this->loc_.pos = aln_loc.pos();
1146         this->loc_.mpopi = &aln_loc.mpopi();
1147         if (detailed_output)
1148             this->loc_.details_ss << "BEGIN locus " << loc_.id << "\n";
1149     }
1150 
1151     //
1152     // Remove unpaired reads/PCR duplicates.
1153     //
1154     timers_.rm_reads.restart();
1155     if (rm_unpaired_reads) {
1156         for (size_t sample=0; sample<gt_stats_.per_sample_stats.size(); ++sample)
1157             gt_stats_.per_sample_stats[sample].n_unpaired_reads += aln_loc.sample_reads(sample).size();
1158         aln_loc.remove_unmerged_reads(detailed_output ? &this->loc_.details_ss : NULL);
1159         for (size_t sample=0; sample<gt_stats_.per_sample_stats.size(); ++sample)
1160             gt_stats_.per_sample_stats[sample].n_unpaired_reads -= aln_loc.sample_reads(sample).size();
1161         if(aln_loc.reads().empty()) {
1162             loc_.clear();
1163             timers_.processing_post_alns.update();
1164             return;
1165         }
1166     }
1167     if (rm_pcr_duplicates) {
1168         assert(rm_unpaired_reads);
1169         for (size_t sample=0; sample<gt_stats_.per_sample_stats.size(); ++sample)
1170             gt_stats_.per_sample_stats[sample].n_read_pairs_pcr_dupl += aln_loc.sample_reads(sample).size();
1171         aln_loc.remove_pcr_duplicates(&gt_stats_.pcr_clone_size_distrib,
1172                                       detailed_output ? &this->loc_.details_ss : NULL);
1173         for (size_t sample=0; sample<gt_stats_.per_sample_stats.size(); ++sample)
1174             gt_stats_.per_sample_stats[sample].n_read_pairs_pcr_dupl -= aln_loc.sample_reads(sample).size();
1175     }
1176     timers_.rm_reads.update();
1177     assert(!aln_loc.reads().empty());
1178     ++gt_stats_.n_genotyped_loci;
1179     size_t n_samples = aln_loc.n_samples();
1180     for (size_t sample=0; sample<gt_stats_.per_sample_stats.size(); ++sample) {
1181         if (!aln_loc.sample_reads(sample).empty()) {
1182             auto& s = gt_stats_.per_sample_stats[sample];
1183             ++s.n_loci_with_sample;
1184             s.n_read_pairs_used += aln_loc.sample_reads(sample).size();
1185             s.ns_cumsum += n_samples;
1186             s.ns_weighted_n_read_pairs_used += n_samples * aln_loc.sample_reads(sample).size();
1187         }
1188     }
1189     if (detailed_output) {
1190         loc_.details_ss << "BEGIN aln_matrix\n";
1191         for (const SAlnRead& read : aln_loc.reads())
1192             loc_.details_ss << read.name << '\t' << loc_.mpopi->samples()[read.sample].name
1193                             << '\t' << read.cigar << '\n';
1194         loc_.details_ss << "END aln_matrix\n";
1195     }
1196 
1197     //
1198     // Get the nucleotide counts at each position.
1199     //
1200     timers_.counting_nts.restart();
1201     vector<SiteCounts> depths;
1202     depths.reserve(aln_loc.ref().length());
1203     for (CLocAlnSet::site_iterator site (aln_loc); bool(site); ++site) {
1204         depths.push_back(site.counts());
1205         if (depths.back().tot.sum() > 0)
1206             ++gt_stats_.n_sites_tot; // Sites "with data".
1207     }
1208     assert(depths.size() == aln_loc.ref().length());
1209     timers_.counting_nts.update();
1210 
1211     //
1212     // Call SNPs.
1213     //
1214     timers_.genotyping.restart();
1215     vector<SiteCall> calls;
1216     calls.reserve(aln_loc.ref().length());
1217     for (const SiteCounts& site_depths : depths) {
1218         calls.push_back(model->call(site_depths));
1219         // Make sure our SNP and genotype calls are consistent (discard SNPs with
1220         // a MAC of 0, or without calls at all).
1221         if (calls.back().alleles().size() > 1)
1222             calls.back().filter_mac(1);
1223     }
1224     timers_.genotyping.update();
1225 
1226     //
1227     // Call haplotypes; amend SNP/genotype calls.
1228     //
1229     vector<map<size_t,PhasedHet>> phase_data;
1230     if (!dbg_no_haplotypes) {
1231         timers_.haplotyping.restart();
1232         phase_hets(phase_data, calls, aln_loc, hap_stats_);
1233         timers_.haplotyping.update();
1234     }
1235 
1236     //
1237     // Determine the consensus sequence.
1238     //
1239     timers_.cpt_consensus.restart();
1240     DNASeq4 consensus (aln_loc.ref().length());
1241     assert(calls.size() == aln_loc.ref().length());
1242     assert(depths.size() == aln_loc.ref().length());
1243     for (size_t i = 0; i < aln_loc.ref().length(); ++i) {
1244         if (!calls[i].alleles().empty()) {
1245             consensus.set(i, calls[i].most_frequent_allele());
1246         } else {
1247             // Use the majority-rule nucleotide.
1248             // (For the high/low Maruki" models this actually only happens when
1249             // there is no coverage; for the Hohenlohe model it may also happen
1250             // when there isn't a single significant call.)
1251             pair<size_t,Nt2> nt = depths[i].tot.sorted()[0];
1252             if (nt.first > 0)
1253                 consensus.set(i, nt.second);
1254         }
1255     }
1256     aln_loc.ref(move(consensus));
1257     timers_.cpt_consensus.update();
1258 
1259     //
1260     // Create the outputs.
1261     //
1262     write_one_locus(aln_loc, depths, calls, phase_data);
1263 
1264     if (detailed_output) {
1265         loc_.details_ss << "END locus " << loc_.id << "\n";
1266         loc_.o_details = loc_.details_ss.str();
1267     }
1268     if (dbg_write_alns) {
1269         #pragma omp critical(dbg_alns)
1270         o_aln_f << "BEGIN " << loc_.id << "\n"
1271                 << aln_loc
1272                 << "\nEND " << loc_.id << "\n";
1273     }
1274     timers_.processing_post_alns.update();
1275 }
1276 
1277 bool
align_reads_to_contig(SuffixTree * st,GappedAln * g_aln,DNASeq4 & enc_query,AlignRes & aln_res) const1278 LocusProcessor::align_reads_to_contig(SuffixTree *st, GappedAln *g_aln, DNASeq4 &enc_query, AlignRes &aln_res) const
1279 {
1280     vector<STAln> alns, final_alns;
1281     vector<pair<size_t, size_t> > step_alns;
1282     string      query  = enc_query.str();
1283     const char *q      = query.c_str();
1284     const char *q_stop = q + query.length();
1285     size_t      q_pos  = 0;
1286     size_t      id     = 0;
1287     char c[id_len];
1288 
1289     do {
1290         step_alns.clear();
1291 
1292         q_pos = q - query.c_str();
1293 
1294         st->align(q, step_alns);
1295 
1296         if (step_alns.size() == 0 || step_alns.size() > max_fragment_alns) {
1297             q++;
1298         } else {
1299             for (uint i = 0; i < step_alns.size(); i++)
1300                 alns.push_back(STAln(id, q_pos, step_alns[i].first, step_alns[i].second));
1301             q += step_alns[0].second + 1;
1302             id++;
1303         }
1304     } while (q < q_stop);
1305 
1306     //
1307     // No alignments to the suffix tree were found.
1308     //
1309     if (alns.size() == 0)
1310         return false;
1311 
1312     //
1313     // Perfect alignmnet to the suffix tree. Return result.
1314     //
1315     if (alns.size() == 1 && alns[0].aln_len == query.length()) {
1316         snprintf(c, id_len, "%luM", query.length());
1317         aln_res.cigar    = c;
1318         aln_res.subj_pos = alns[0].subj_pos;
1319         return true;
1320     }
1321 
1322     //
1323     // Find a consistent set of suffix tree alignments, ordered in a directed, acyclic grapgh (DAG).
1324     //
1325     this->suffix_tree_hits_to_dag(query.length(), alns, final_alns);
1326 
1327     g_aln->init(query.length(), st->seq_len(), true);
1328     if (g_aln->align_constrained(query, st->seq_str(), final_alns)) {
1329         aln_res = g_aln->result();
1330     }
1331 
1332     if (aln_res.pct_id < min_aln_cov)
1333         return false;
1334 
1335     return true;
1336 }
1337 
1338 int
suffix_tree_hits_to_dag(size_t query_len,vector<STAln> & alns,vector<STAln> & final_alns) const1339 LocusProcessor::suffix_tree_hits_to_dag(size_t query_len, vector<STAln> &alns, vector<STAln> &final_alns) const
1340 {
1341     //
1342     // 1. Sort the alignment fragments so they are primarily ordered by subject position, secondarily by query position.
1343     //
1344     sort(alns.begin(), alns.end(), [](STAln a, STAln b)
1345          {
1346             if (a.subj_pos == b.subj_pos)
1347                 return a.query_pos < b.query_pos;
1348             else
1349                 return a.subj_pos < b.subj_pos;
1350          });
1351 
1352     int    gap_len, end_pos;
1353     double score, scale;
1354     bool   term;
1355     vector<size_t> term_nodes;
1356     //
1357     // 2. Traverse the list of fragments and add links to reachable nodes, recording the score.
1358     //    2.1. A successor node is reachable from the predecessor node if both the query and subject
1359     //         positions are advanced relative to the predecessor, and there is no sequence overlap
1360     //         between the two nodes.
1361     //    2.2. We weight the score by the number of 'gap' nucleotides between the two fragments to
1362     //         penalize fragments that are further away.
1363     //    2.3  Record the maximal link in the graph.
1364     //    2.4. Determine if a node is a terminal node.
1365     //
1366     for (uint i = 0; i < alns.size(); i++) {
1367         end_pos = alns[i].subj_pos + alns[i].aln_len - 1;
1368         term    = true;
1369 
1370         assert(end_pos > 0);
1371 
1372         for (uint j = i + 1; j < alns.size(); j++) {
1373             if (alns[j].query_pos > alns[i].query_pos &&
1374                 alns[j].subj_pos  > alns[i].subj_pos  &&
1375                 alns[j].subj_pos  > (uint) end_pos) {
1376 
1377                 term    = false;
1378                 gap_len = alns[j].subj_pos - end_pos - 1;
1379                 scale   = gap_len > (int) query_len ? 1 : alns[i].aln_len * ((double) gap_len / (double) query_len);
1380                 score   = alns[i].aln_len - scale; // Raw score.
1381                 score  += alns[i].max._score;      // Score adjusted for the highest incoming node.
1382 
1383                 assert(gap_len >= 0);
1384                 assert(score   >= 0);
1385 
1386                 alns[j].links.push_back(STLink(i, score));
1387                 if (alns[j].max._index == j || score > alns[j].max._score)
1388                     alns[j].max = STLink(i, score);
1389             }
1390         }
1391         if (term)
1392             term_nodes.push_back(i);
1393     }
1394 
1395     //
1396     // 3. Find the terminal node with the highest score.
1397     //
1398     double max_score = alns[term_nodes.front()].max._score;
1399     size_t max_index = term_nodes.front();
1400     for (uint i = 1; i < term_nodes.size(); i++) {
1401         if (alns[term_nodes[i]].max._score > max_score) {
1402             max_index = term_nodes[i];
1403             max_score = alns[term_nodes[i]].max._score;
1404         }
1405     }
1406 
1407     //
1408     // 4. Backtrack to get the optimal path.
1409     //
1410     vector<size_t> optimal;
1411 
1412     if (max_score == 0 && max_index == 0) {
1413         //
1414         // None of the fragments were connected in the DAG, select the largest fragment.
1415         //
1416         max_score = alns[term_nodes.front()].aln_len;
1417         max_index = term_nodes.front();
1418         for (uint i = 1; i < term_nodes.size(); i++)
1419             if (alns[i].aln_len > max_score) {
1420                 max_index = term_nodes[i];
1421                 max_score = alns[term_nodes[i]].aln_len;
1422             }
1423         optimal.push_back(max_index);
1424 
1425     } else {
1426 
1427         uint n = max_index;
1428         while (alns[n].links.size() > 0) {
1429             optimal.push_back(n);
1430             n = alns[n].max._index;
1431         }
1432         optimal.push_back(n);
1433     }
1434     assert(optimal.size() > 0);
1435 
1436     final_alns.clear();
1437     for (int i = optimal.size() - 1; i >= 0; i--)
1438         final_alns.push_back(alns[optimal[i]]);
1439 
1440     return 0;
1441 }
1442 
1443 size_t
find_locus_overlap(SuffixTree * stree,GappedAln * g_aln,const DNASeq4 & se_consensus,string & overlap_cigar) const1444 LocusProcessor::find_locus_overlap(SuffixTree *stree, GappedAln *g_aln, const DNASeq4 &se_consensus, string &overlap_cigar) const
1445 {
1446     vector<STAln> alns, final_alns;
1447     vector<pair<size_t, size_t> > step_alns;
1448     AlignRes aln_res;
1449 
1450     string      query  = se_consensus.str();
1451     const char *q      = query.c_str();
1452     const char *q_stop = q + query.length();
1453     size_t      q_pos  = 0;
1454     size_t      id     = 1;
1455 
1456     do {
1457         step_alns.clear();
1458 
1459         q_pos = q - query.c_str();
1460 
1461         stree->align(q, step_alns);
1462 
1463         if (step_alns.size() == 0 || step_alns.size() > max_fragment_alns) {
1464             q++;
1465         } else {
1466             for (uint i = 0; i < step_alns.size(); i++)
1467                 alns.push_back(STAln(id, q_pos, step_alns[i].first, step_alns[i].second));
1468             q += step_alns[0].second + 1;
1469             id++;
1470         }
1471     } while (q < q_stop);
1472 
1473 
1474     if (alns.size() == 0) {
1475         //
1476         // If no alignments have been found, search the tails of the query and subject for any overlap
1477         // that is too small to be picked up by the SuffixTree using the gapped aligner.
1478         //
1479         uint olap_bound = stree->min_aln() * 2;
1480 
1481         //
1482         // Create a gapped alignment, looking only at the tail of the two sequences, to determine the exact overlap.
1483         //
1484         g_aln->init(query.length(), stree->seq_len(), true);
1485         if (g_aln->align_region(query, stree->seq_str(),
1486                                 query.length() - olap_bound, query.length() - 1, 0, olap_bound - 1)) {
1487             aln_res       = g_aln->result();
1488             overlap_cigar = aln_res.cigar;
1489         }
1490         // g_aln->dump_alignment(query, stree->seq_str());
1491 
1492     } else {
1493         size_t query_stop = alns.front().query_pos + alns.front().aln_len - 1;
1494 
1495         if ( alns.size() == 1 &&
1496             (query_stop == (query.length() - 1) && alns.front().subj_pos == 0)) {
1497             //
1498             // If a single alignment has been found, check for a perfect alignmnet to the suffix tree
1499             // that occupies the end of the query and the beginning of the subject.
1500             //
1501             char buf[id_len];
1502             snprintf(buf, id_len, "%luM", alns.front().aln_len);
1503             overlap_cigar  = buf;
1504             aln_res.pct_id = 1;
1505 
1506         } else {
1507             //
1508             // Otherwise, find a consistent set of suffix tree alignments, ordered in a directed, acyclic grapgh (DAG).
1509             //
1510             this->suffix_tree_hits_to_dag(query.length(), alns, final_alns);
1511 
1512             //
1513             // Create a gapped alignment, prefilling the suffix tree hits, to determine the exact overlap.
1514             //
1515             g_aln->init(query.length(), stree->seq_len(), true);
1516             if (g_aln->align_constrained(query, stree->seq_str(), final_alns)) {
1517                 aln_res       = g_aln->result();
1518                 overlap_cigar = aln_res.cigar;
1519             }
1520         }
1521     }
1522 
1523     Cigar cigar;
1524     parse_cigar(overlap_cigar.c_str(), cigar, true);
1525 
1526     if (cigar.size() == 0)
1527         return 0;
1528 
1529     // cerr << "Cigar: " << aln_res.cigar.c_str() << "; cigar_length_ref aka overlap: " << cigar_length_ref(cigar) << "\n";
1530 
1531     //
1532     // Assess the gapped alignment and make sure it is reasonable.
1533     //
1534     size_t overlap     = 0;
1535     size_t cigar_index = 0;
1536     double total_len   = 0.0;
1537     double align_len   = 0.0;
1538 
1539     if (cigar[cigar_index].first == 'S') {
1540         //
1541         // Here is an example where we want to ignore the initial softmasking repored for the SE contig,
1542         // but we must count the offset to the start of the PE contig as a set of mismatches.
1543         // CIGAR:  138S4M2S
1544         // SE CTG: TGCAGGAGATTTTCCTCTCTGCTGT...GGAGCTCGTTGGTGTTGGTGGCACTGCCCTGAGGTTTTCCTCATNNTCCCCAGGNCCCAGCGTCAGC
1545         // PE CTG:                                                                               GGCACAGATGTGTCACAGCTGC...CTTCTT
1546         //                                                                                                  ****
1547         cigar_index++;
1548         total_len += aln_res.subj_pos;
1549 
1550     } else {
1551         //
1552         // If the PE contig overlaps the entire SE contig, and there is sequence remaining on the PE contig, adjust the
1553         // overlap to include it, but it does not count as unaligned total length.
1554         // e.g.:
1555         // CIGAR:  68M1D6M
1556         // SE CTG:     TGCAGGCGCACCTGTAGGCCCCCGGACAGGAGGGGGTGGCTTCAAGCAGGGGCCAGCCCGAGGCCCCC-ACCCAC
1557         // PE CTG: TAGTTGCAGGCGCACCTGTAGGCCCCCGGACAGGAGGGGGTGGCTTCAAGCAGGGGCCAGCCCGAGGCCCCCCACCCACACCCAGCACACACTGGCC...
1558         //
1559         overlap += aln_res.subj_pos;
1560     }
1561 
1562     for (; cigar_index < cigar.size(); cigar_index++) {
1563         total_len += cigar[cigar_index].second;
1564         if (cigar[cigar_index].first == 'M')
1565             align_len += cigar[cigar_index].second;
1566     }
1567 
1568     //
1569     // Test that 80% of the overlapping sequence is aligned (includes gaps) AND
1570     // make sure that of those sequences aligned, 80% are identities.
1571     //
1572     if (align_len / total_len < overlap_min_pct_id ||
1573         aln_res.pct_id        < overlap_min_pct_id)
1574         return 0;
1575 
1576     overlap += align_len;
1577 
1578     //
1579     // Return overlap.
1580     //
1581     return overlap;
1582 }
1583 
1584 DNASeq4
assemble_locus_contig(const vector<SRead> & fw_reads,const vector<SRead> & pe_reads)1585 LocusProcessor::assemble_locus_contig(
1586     const vector<SRead>& fw_reads,
1587     const vector<SRead>& pe_reads
1588 ) {
1589     //
1590     // List the sequences that will enter the graph.
1591     //
1592     array<vector<const DNASeq4*>, 3> seqs_to_assemble;
1593     array<const vector<SRead>*, 2> reads = {{&fw_reads, &pe_reads}};
1594     size_t fw = 0, pe = 1, both = 2;
1595     for (size_t i : {fw, pe}) {
1596         if (reads[i]->size() <= max_debruijn_reads) {
1597             for (const Read& r : *reads[i])
1598                 seqs_to_assemble[i].push_back(&r.seq);
1599         } else {
1600             double seq_i = 0.0;
1601             double step = (double) reads[i]->size() / max_debruijn_reads;
1602             for (size_t j=0; j<max_debruijn_reads; ++j) {
1603                 seqs_to_assemble[i].push_back(&(*reads[i])[(size_t)seq_i].seq);
1604                 seq_i += step;
1605             }
1606         }
1607     }
1608     seqs_to_assemble[both] = seqs_to_assemble[fw];
1609     seqs_to_assemble[both].insert(
1610         seqs_to_assemble[both].end(),
1611         seqs_to_assemble[pe].begin(), seqs_to_assemble[pe].end());
1612 
1613     //
1614     // Build the graph.
1615     //
1616     Graph graph (km_length);
1617     graph.rebuild(seqs_to_assemble[both], min_km_count);
1618     if (graph.empty())
1619         return DNASeq4();
1620     if (dbg_write_gfa) {
1621         graph.dump_gfa(out_dir + "gstacks." + to_string(loc_.id) + ".spaths.gfa");
1622         graph.dump_gfa(out_dir + "gstacks." + to_string(loc_.id) + ".nodes.gfa", true);
1623     }
1624 
1625     //
1626     // Find the best path in the graph.
1627     //
1628     vector<const SPath*> best_path;
1629     do {
1630         if (graph.find_best_path(best_path))
1631             break;
1632 
1633         // The graph wasn't a DAG, try to fix it.
1634         if(graph.remove_cycles() && graph.find_best_path(best_path)) {
1635             ++ctg_stats_.n_loci_pe_graph_fixed_to_dag;
1636             if (detailed_output)
1637                 loc_.details_ss << "fixed_to_dag\n";
1638             if (dbg_write_gfa) {
1639                 graph.dump_gfa(out_dir + "gstacks." + to_string(loc_.id) + ".fixed.spaths.gfa");
1640                 graph.dump_gfa(out_dir + "gstacks." + to_string(loc_.id) + ".fixed.nodes.gfa", true);
1641             }
1642             break;
1643         }
1644 
1645         // Contig contruction failed.
1646         ++ctg_stats_.n_loci_pe_graph_not_dag;
1647         if (detailed_output)
1648             loc_.details_ss << "not_dag\n";
1649         if (dbg_write_gfa_notdag) {
1650             graph.dump_gfa(out_dir + "gstacks." + to_string(loc_.id) + ".spaths.gfa");
1651             graph.dump_gfa(out_dir + "gstacks." + to_string(loc_.id) + ".nodes.gfa", true);
1652         }
1653         return DNASeq4();
1654     } while (false);
1655     if (detailed_output)
1656         loc_.details_ss << "is_dag\n";
1657     DNASeq4 contig = DNASeq4(SPath::contig_str(best_path.begin(), best_path.end(), km_length));
1658 
1659     //
1660     // Handle the case where there are paired-end reads and the main
1661     // corresponding subgraph is distinct from that of the forward reads.
1662     //
1663     if (!pe_reads.empty()) {
1664         //
1665         // Extract the components and map the fw & pe reads to them.
1666         //
1667         vector<vector<const SPath*>> components = graph.components();
1668         unordered_map<Kmer,size_t> component_kmers;
1669         for (size_t i=0; i<components.size(); ++i) {
1670             size_t n_nodes = 0;
1671             for (const SPath* sp : components[i]) {
1672                 for (const Node* n=sp->first(); ; n=n->first_succ()) {
1673                     component_kmers[n->km()] = i;
1674                     ++n_nodes;
1675                     if (n == sp->last())
1676                         break;
1677                 }
1678             }
1679         }
1680         array<vector<size_t>, 2> comp_depths;
1681         for (size_t i : {fw, pe}) {
1682             comp_depths[i] = vector<size_t>(components.size(), 0);
1683             for (const DNASeq4* s : seqs_to_assemble[i]) {
1684                 Kmerizer kmers {km_length, *s};
1685                 Kmer km;
1686                 decltype(component_kmers.begin()) itr;
1687                 while ((km = kmers.next()))
1688                     if ((itr = component_kmers.find(km)) != component_kmers.end())
1689                         ++comp_depths[i][itr->second];
1690             }
1691         }
1692         //
1693         // Check that there actually exist kmers for the forward and paired-end reads.
1694         //
1695         if (accumulate(comp_depths[fw].begin(), comp_depths[fw].end(), 0) == 0) {
1696             if (detailed_output)
1697                 loc_.details_ss << "fw_reads_absent_from_graph\n";
1698             return DNASeq4();
1699         }
1700         if (accumulate(comp_depths[pe].begin(), comp_depths[pe].end(), 0) == 0) {
1701             if (detailed_output)
1702                 loc_.details_ss << "pe_reads_absent_from_graph\n";
1703         } else {
1704             //
1705             // Check that the main forward and paired-end components are the same one.
1706             //
1707             array<size_t, 2> best_comp;
1708             for (size_t i : {fw, pe}) {
1709                 size_t best_depth = 0;
1710                 for (size_t c=0; c<components.size(); ++c) {
1711                     if (comp_depths[i][c] > best_depth) {
1712                         best_comp[i] = c;
1713                         best_depth = comp_depths[i][c];
1714                     }
1715                 }
1716             }
1717             if (best_comp[fw] == best_comp[pe]) {
1718                 loc_.ctg_status = LocData::overlapped;
1719                 loc_.olap_length = km_length;
1720                 ctg_stats_.n_overlaps++;
1721                 ctg_stats_.length_overlap_tot += loc_.olap_length;
1722                 ctg_stats_.length_olapd_loci_tot += contig.length();
1723                 if (detailed_output)
1724                     loc_.details_ss << "one_debruijn_subgraph\n"
1725                         << "overlap\t" << km_length << '\n';
1726             } else {
1727                 if (detailed_output)
1728                     loc_.details_ss << "two_debruijn_subgraphs\n";
1729                 //
1730                 // Determine the component the best path corresponds to.
1731                 //
1732                 Kmer km = Kmerizer(km_length, contig).next();
1733                 assert(km);
1734                 auto comp_itr = component_kmers.find(km);
1735                 assert(comp_itr != component_kmers.end());
1736                 size_t comp = comp_itr->second;
1737                 //
1738                 // Get two contigs, one for forward and one for paired-end.
1739                 //
1740                 DNASeq4 fw_contig, pe_contig;
1741                 DNASeq4* todo_contig;
1742                 const vector<const DNASeq4*>* todo_seqs;
1743                 if (comp == best_comp[fw]) {
1744                     if (detailed_output)
1745                         loc_.details_ss << "best_path_is_best_fw_compo\n";
1746                     fw_contig = move(contig);
1747                     todo_contig = &pe_contig;
1748                     todo_seqs = &seqs_to_assemble[pe];
1749                 } else if (comp == best_comp[pe]) {
1750                     if (detailed_output)
1751                         loc_.details_ss << "best_path_is_best_pe_compo\n";
1752                     pe_contig = move(contig);
1753                     todo_contig = &fw_contig;
1754                     todo_seqs = &seqs_to_assemble[fw];
1755                 } else {
1756                     if (detailed_output)
1757                         loc_.details_ss << "best_path_is_neither_best_fw_compo_nor_best_pe_compo\n";
1758                     ++ctg_stats_.n_loci_pe_graph_not_dag;
1759                     return DNASeq4();
1760                 }
1761                 graph.rebuild(*todo_seqs, min_km_count);
1762                 if (graph.empty()) {
1763                     // DOES_NOT_HAPPEN; // May fail, c.f. mailing list 2019-01-18.
1764                     // We can arrive here in some limit cases. Especially, if the forward and reverse
1765                     // regions overlap but the graph is still disconnected, which may happen e.g. if
1766                     // there are very few reads and some sequencing errors/SNPs (e.g. two 150bp forward
1767                     // reads and one 150bp reverse read with a 50bp overlap, and the two forward reads
1768                     // differ at position 100 and min_km_count is 2; with enough errors in the forward
1769                     // reads it's probably also possible to make this happen with the graph's best path
1770                     // being in the reverse region.)
1771                     loc_.details_ss << "limit_case_the_other_compo_disappeared\n";
1772                     return DNASeq4();
1773                 }
1774                 if (!graph.find_best_path(best_path)
1775                         && !(graph.remove_cycles() && graph.find_best_path(best_path)))
1776                     DOES_NOT_HAPPEN;
1777                 *todo_contig = DNASeq4(SPath::contig_str(best_path.begin(), best_path.end(), km_length));
1778                 if(detailed_output)
1779                     loc_.details_ss << "contig_fw\t" << fw_contig << '\n'
1780                                     << "contig_pe\t" << pe_contig << '\n';
1781                 //
1782                 // Try to overlap the two contigs.
1783                 //
1784                 SuffixTree stree {pe_contig};
1785                 stree.build_tree();
1786                 GappedAln aligner;
1787                 AlignRes  aln_res;
1788                 string overlap_cigar;
1789                 int overlap;
1790                 if (dbg_no_overlaps)
1791                     overlap = 0;
1792                 else
1793                     overlap = this->find_locus_overlap(&stree, &aligner, fw_contig, overlap_cigar);
1794                 if (overlap > 0) {
1795                     if(detailed_output)
1796                         loc_.details_ss << "overlap\t" << overlap << '\t' << overlap_cigar << "\n";
1797                     this->loc_.ctg_status = LocData::overlapped;
1798                     this->loc_.olap_length = overlap;
1799                     this->ctg_stats_.n_overlaps++;
1800                     this->ctg_stats_.length_overlap_tot += overlap;
1801                     this->ctg_stats_.length_olapd_loci_tot += fw_contig.length() + pe_contig.length() - overlap;
1802                     contig = move(fw_contig);
1803                     auto seq_itr = pe_contig.begin();
1804                     for(size_t i=0; i<size_t(overlap); ++i) {
1805                         assert(seq_itr != pe_contig.end());
1806                         ++seq_itr;
1807                     }
1808                     contig.append(seq_itr, pe_contig.end());
1809                 } else {
1810                     if(detailed_output)
1811                         loc_.details_ss << "no_overlap\n";
1812                     this->loc_.ctg_status = LocData::separate;
1813                     contig = move(fw_contig);
1814                     for(size_t i=0; i<10; ++i)
1815                         contig.push_back(Nt4::n);
1816                     contig.append(pe_contig.begin(), pe_contig.end());
1817                 }
1818             }
1819         }
1820     }
1821     return contig;
1822 }
1823 
add_read_to_aln(CLocAlnSet & aln_loc,AlignRes & aln_res,SRead && r,GappedAln * aligner,SuffixTree * stree)1824 bool LocusProcessor::add_read_to_aln(
1825         CLocAlnSet& aln_loc,
1826         AlignRes& aln_res,
1827         SRead&& r,
1828         GappedAln* aligner,
1829         SuffixTree* stree
1830 ) {
1831     if (!this->align_reads_to_contig(stree, aligner, r.seq, aln_res))
1832         return false;
1833 
1834     Cigar cigar;
1835     parse_cigar(aln_res.cigar.c_str(), cigar);
1836     assert(!cigar.empty());
1837     simplify_cigar_to_MDI(cigar);
1838     cigar_extend_left(cigar, aln_res.subj_pos);
1839     assert(cigar_length_ref(cigar) <= aln_loc.ref().length());
1840     cigar_extend_right(cigar, aln_loc.ref().length() - cigar_length_ref(cigar));
1841 
1842     aln_loc.add(SAlnRead(move((Read&)r), move(cigar), r.sample));
1843     return true;
1844 }
1845 
phase_hets(vector<map<size_t,PhasedHet>> & phased_samples,vector<SiteCall> & calls,const CLocAlnSet & aln_loc,HaplotypeStats & hap_stats) const1846 void LocusProcessor::phase_hets(
1847         vector<map<size_t,PhasedHet>>& phased_samples,
1848         vector<SiteCall>& calls,
1849         const CLocAlnSet& aln_loc,
1850         HaplotypeStats& hap_stats
1851 ) const {
1852     phased_samples.clear();
1853     phased_samples.resize(loc_.mpopi->samples().size());
1854 
1855     //
1856     // Apply a first MAC filter, if requested.
1857     //
1858     if (phasing_min_mac >= 2)
1859         for (SiteCall& call : calls)
1860             if (call.alleles().size() > 1)
1861                 call.filter_mac(phasing_min_mac);
1862 
1863     //
1864     // Find SNPs.
1865     //
1866     vector<size_t> snp_cols;
1867     for (size_t i=0; i<aln_loc.ref().length(); ++i)
1868         if (calls[i].alleles().size() > 1)
1869             snp_cols.push_back(i);
1870 
1871     //
1872     // Check that the locus is polymorphic.
1873     //
1874     if (snp_cols.empty()) {
1875         // No SNPs, there is no phasing to do.
1876         ++hap_stats.n_badly_phased_samples[ {0, 0} ];
1877         return;
1878     }
1879 
1880     //
1881     // Initialize counters, streams.
1882     //
1883     size_t n_hets_needing_phasing = 0;
1884     size_t n_consistent_hets = 0;
1885     assert(hap_stats.per_sample_stats.size() == loc_.mpopi->samples().size());
1886     if (detailed_output)
1887         loc_.details_ss << "BEGIN phasing\n";
1888     stringstream o_hapgraph_ss;
1889     bool has_subgraphs = false;
1890     if (dbg_write_hapgraphs) {
1891         o_hapgraph_ss << "\n"
1892                       << "subgraph cluster_loc" << loc_.id << " {\n"
1893                       << "\tlabel=\"locus " << loc_.id << "\";\n"
1894                       << "\t# snp columns: ";
1895         join(snp_cols, ',', o_hapgraph_ss);
1896         o_hapgraph_ss << "\n";
1897     }
1898 
1899     //
1900     // Phase each sample.
1901     //
1902     vector<size_t> samples_failed;
1903     vector<size_t> samples_passed;
1904     for (size_t sample=0; sample<loc_.mpopi->samples().size(); ++sample) {
1905         if (phase_sample_hets(
1906                 phased_samples[sample],
1907                 calls, aln_loc, snp_cols, sample,
1908                 hap_stats,
1909                 n_hets_needing_phasing, n_consistent_hets,
1910                 o_hapgraph_ss, has_subgraphs)
1911         ) {
1912             samples_passed.push_back(sample);
1913         } else {
1914             samples_failed.push_back(sample);
1915         }
1916     }
1917 
1918     //
1919     // Retry to phase failed samples after removing SNPs for which the minor
1920     // allele was only seen in hets and was never successfully phased.
1921     //
1922     if (!dbg_phasing_no_2ndpass) {
1923         vector<size_t> snp_cols_reduced;
1924         for(size_t col : snp_cols) {
1925             const vector<SampleCall>& c = calls[col].sample_calls();
1926             Counts<Nt2> counts;
1927             for (size_t sample : samples_passed) {
1928                 switch(c[sample].call()) {
1929                 case snp_type_hom:
1930                     counts.increment(c[sample].nt0(), 2);
1931                     break;
1932                 case snp_type_het:
1933                     if (phased_samples[sample].count(col)) {
1934                         // This het was phased.
1935                         counts.increment(c[sample].nt0());
1936                         counts.increment(c[sample].nt1());
1937                     }
1938                     break;
1939                 default:
1940                     break;
1941                 }
1942             }
1943             size_t mac = counts.sorted()[1].first;
1944             if (mac >= size_t(phasing_min_mac))
1945                 // Keep it.
1946                 snp_cols_reduced.push_back(col);
1947         }
1948         if (snp_cols_reduced.size() < snp_cols.size())
1949             for (size_t sample : samples_failed)
1950                 if (phase_sample_hets(
1951                         phased_samples[sample],
1952                         calls, aln_loc, snp_cols_reduced, sample,
1953                         hap_stats,
1954                         n_hets_needing_phasing, n_consistent_hets,
1955                         o_hapgraph_ss, has_subgraphs, false))
1956                     ++hap_stats.per_sample_stats[sample].n_phased_2ndpass;
1957     }
1958 
1959     //
1960     // Update the SiteCalls according to the haplotype calls, and reapply
1961     // the MAC filter on the updated data.
1962     //
1963     for (size_t col : snp_cols) {
1964         SiteCall& c = calls[col];
1965         for (size_t sample=0; sample<loc_.mpopi->samples().size(); ++sample)
1966             if (c.sample_calls()[sample].call() == snp_type_het
1967                     && !phased_samples[sample].count(col))
1968                 c.discard_sample(sample);
1969         c.filter_mac(phasing_min_mac);
1970     }
1971 
1972     //
1973     // Clean up.
1974     //
1975     ++hap_stats.n_badly_phased_samples[ {n_consistent_hets, n_hets_needing_phasing} ];
1976     if (dbg_write_hapgraphs && has_subgraphs) {
1977         o_hapgraph_ss << "}\n";
1978         #pragma omp critical
1979         o_hapgraphs_f << o_hapgraph_ss.rdbuf();
1980     }
1981     if (detailed_output)
1982         loc_.details_ss << "END phasing\n";
1983 }
1984 
phase_sample_hets(map<size_t,PhasedHet> & phased_sample,const vector<SiteCall> & calls,const CLocAlnSet & aln_loc,const vector<size_t> & snp_cols,size_t sample,HaplotypeStats & hap_stats,size_t & n_hets_needing_phasing,size_t & n_consistent_hets,ostream & o_hapgraph_ss,bool & has_subgraphs,bool first_pass) const1985 bool LocusProcessor::phase_sample_hets(
1986         map<size_t,PhasedHet>& phased_sample,
1987         const vector<SiteCall>& calls,
1988         const CLocAlnSet& aln_loc,
1989         const vector<size_t>& snp_cols,
1990         size_t sample,
1991         HaplotypeStats& hap_stats,
1992         size_t& n_hets_needing_phasing, size_t& n_consistent_hets,
1993         ostream& o_hapgraph_ss, bool& has_subgraphs,
1994         bool first_pass
1995 ) const {
1996     phased_sample.clear();
1997     if (aln_loc.sample_reads(sample).empty())
1998         return true;
1999     if (first_pass)
2000         ++hap_stats.per_sample_stats[sample].n_diploid_loci;
2001 
2002     //
2003     // Find heterozygous positions.
2004     //
2005     vector<size_t> het_snps;
2006     for(size_t snp_i=0; snp_i<snp_cols.size(); ++snp_i)
2007         if (calls[snp_cols[snp_i]].sample_calls()[sample].call() == snp_type_het)
2008             het_snps.push_back(snp_i);
2009     if (!first_pass) {
2010         --hap_stats.per_sample_stats[sample].n_hets_2snps;
2011         --n_hets_needing_phasing;
2012     }
2013     if (het_snps.size() == 0) {
2014         // Sample is homozygous.
2015         return true;
2016     } else if (het_snps.size() == 1) {
2017         // One heterozygous SNP; sample has trivial 1nt-long haplotypes.
2018         size_t col = snp_cols[het_snps[0]];
2019         const SampleCall& c = calls[col].sample_calls()[sample];
2020         phased_sample.insert({col, {col, c.nt0(), c.nt1()}});
2021         return true;
2022     }
2023     ++hap_stats.per_sample_stats[sample].n_hets_2snps;
2024     ++n_hets_needing_phasing;
2025 
2026     vector<const SampleCall*> sample_het_calls; //**********
2027     for (size_t het_i=0; het_i<het_snps.size(); ++het_i)
2028         sample_het_calls.push_back(&calls[snp_cols[het_snps[het_i]]].sample_calls()[sample]);
2029 
2030     //
2031     // Iterate over reads, record as pairwise cooccurrences of  alleles.
2032     //
2033     SnpAlleleCooccurrenceCounter cooccurrences (snp_cols.size());
2034     count_pairwise_cooccurrences(cooccurrences, aln_loc,
2035                                     sample, snp_cols, het_snps, sample_het_calls);
2036     if (dbg_write_hapgraphs) {
2037         has_subgraphs = true;
2038         write_sample_hapgraph(o_hapgraph_ss, sample, het_snps, snp_cols,
2039                                 sample_het_calls, cooccurrences);
2040     }
2041 
2042     //
2043     // Assemble phase sets.
2044     //
2045     vector<PhaseSet> phase_sets;
2046     bool phased = false;
2047     for (size_t min_n_cooccurrences = phasing_cooccurrences_thr_range.first;
2048             min_n_cooccurrences <= phasing_cooccurrences_thr_range.second;
2049             ++min_n_cooccurrences
2050     ) {
2051         if (assemble_phase_sets(
2052                 phase_sets, het_snps, sample_het_calls,
2053                 cooccurrences, min_n_cooccurrences
2054         )) {
2055             // Phasing succeeded.
2056             phased = true;
2057             if (detailed_output)
2058                 loc_.details_ss << "phasing_ok\t"
2059                     << loc_.mpopi->samples()[sample].name
2060                     << "\tn_hets=" << het_snps.size()
2061                     << "\tcooc_thr=" << min_n_cooccurrences << '\n';
2062             break;
2063         }
2064 
2065         // Phasing failed. find alleles that are not connected to any
2066         // others, discard the het calls they are part of, and retry.
2067         if (phasing_dont_prune_hets)
2068             continue;
2069         vector<array<bool,2>> alleles_with_edges;
2070         alleles_with_edges.resize(het_snps.size(), {{false, false}});
2071         for (size_t het_i=0; het_i<het_snps.size(); ++het_i) {
2072             array<Nt2,2> allelesi = sample_het_calls[het_i]->nts();
2073             for (size_t het_j=het_i+1; het_j<het_snps.size(); ++het_j) {
2074                 array<Nt2,2> allelesj = sample_het_calls[het_j]->nts();
2075                 for (size_t allelei=0; allelei<2; ++allelei) {
2076                     for (size_t allelej=0; allelej<2; ++allelej) {
2077                         if (cooccurrences.at(het_snps[het_i],
2078                                                 allelesi[allelei],
2079                                                 het_snps[het_j],
2080                                                 allelesj[allelej])
2081                                 >= min_n_cooccurrences
2082                         ) {
2083                             alleles_with_edges[het_i][allelei] = true;
2084                             alleles_with_edges[het_j][allelej] = true;
2085                         }
2086                     }
2087                 }
2088             }
2089         }
2090         vector<size_t> het_snps_reduced;
2091         vector<const SampleCall*> sample_het_calls_reduced;
2092         for (size_t het_i=0; het_i<het_snps.size(); ++het_i) {
2093             if (alleles_with_edges[het_i] == array<bool,2>({{true, true}})) {
2094                 // Keep this het.
2095                 het_snps_reduced.push_back(het_snps[het_i]);
2096                 sample_het_calls_reduced.push_back(sample_het_calls[het_i]);
2097             }
2098         }
2099         if (het_snps_reduced.size() == het_snps.size())
2100             // No hets were removed; the situation is unchanged and phasing will fail again.
2101             continue;
2102         assert(!het_snps_reduced.empty()); // Otherwise the phasing wouldn't have previously failed.
2103         if (het_snps_reduced.size() == 1
2104             || assemble_phase_sets(phase_sets,
2105                                     het_snps_reduced,
2106                                     sample_het_calls_reduced,
2107                                     cooccurrences,
2108                                     min_n_cooccurrences)
2109         ) {
2110             // Phasing succeeded or became trivial.
2111             // We just downsize `het_snps`. We can do this because `phased_samples`
2112             // doesn't keep track of all the hets; only those of the best phase set.
2113             // In addition, unphased HETs are eventually discarded (as of Feb. 2018,
2114             // v2.0Beta8; c.f. PopMap::populate_locus) so we don't need to do anything
2115             // more to blacklist those genotypes.
2116             phased = true;
2117             if (detailed_output)
2118                 loc_.details_ss << "phasing_ok_rmhets\t"
2119                     << loc_.mpopi->samples()[sample].name
2120                     << "\tn_hets=" << het_snps.size()
2121                     << "\tcooc_thr=" << min_n_cooccurrences
2122                     << "\tn_kept_hets=" << het_snps_reduced.size() << '\n';
2123             het_snps = het_snps_reduced;
2124             sample_het_calls = sample_het_calls_reduced;
2125             break;
2126         }
2127     }
2128     if (!phased) {
2129         if (detailed_output)
2130             loc_.details_ss << "phasing_failed\t"
2131                 << loc_.mpopi->samples()[sample].name
2132                 << "\tn_hets=" << het_snps.size() << '\n';
2133         return false;
2134     }
2135     ++hap_stats.per_sample_stats[sample].n_phased; // (if !first_pass, this wasn't reach before; no guard.)
2136     ++n_consistent_hets;
2137     if (het_snps.size() == 1) {
2138         // After pruning hets, sample has trivial 1nt-long haplotypes.
2139         // TODO Record that we've just removed the problem...
2140         size_t col = snp_cols[het_snps[0]];
2141         const SampleCall& c = *sample_het_calls[0];
2142         phased_sample.insert({col, {col, c.nt0(), c.nt1()}});
2143         return true;
2144     }
2145 
2146     //
2147     // Record phase sets.
2148     //
2149     for (const PhaseSet& ps : phase_sets) {
2150         assert(ps.size() == het_snps.size());
2151         size_t phase_set_id = SIZE_MAX;
2152         for (size_t het_i=0; het_i<het_snps.size(); ++het_i) {
2153             if (ps.het(het_i).left_allele == Nt4::n)
2154                 continue;
2155 
2156             size_t col = snp_cols[het_snps[het_i]];
2157             if (phase_set_id == SIZE_MAX)
2158                 phase_set_id = col; // Recommended value, c.f. VCF specification.
2159             PhasedHet ph = ps.het(het_i);
2160             ph.phase_set = phase_set_id;
2161             phased_sample[col] = ph;
2162         }
2163     }
2164 
2165     // Record singleton nodes (that are implicit in our representation).
2166     for (size_t het_i=0; het_i<het_snps.size(); ++het_i) {
2167         size_t col = snp_cols[het_snps[het_i]];
2168         if (!phased_sample.count(col)) {
2169             array<Nt2,2> alleles = sample_het_calls[het_i]->nts();
2170             phased_sample[col] = PhasedHet({col, alleles[0], alleles[1]});
2171         }
2172     }
2173     assert(!phased_sample.empty());
2174 
2175     //
2176     // Remove all phase sets but the largest one.
2177     //
2178     map<size_t,size_t> phase_set_sizes;
2179     for (auto& phasedhet : phased_sample)
2180         ++phase_set_sizes[phasedhet.second.phase_set];
2181     auto best_ps = phase_set_sizes.begin();
2182     for (auto ps=++phase_set_sizes.begin(); ps!=phase_set_sizes.end(); ++ps)
2183         if (ps->second > best_ps->second)
2184             best_ps = ps;
2185     for (auto phasedhet=phased_sample.begin(); phasedhet!=phased_sample.end();) {
2186         if (phasedhet->second.phase_set == best_ps->first)
2187             ++phasedhet;
2188         else
2189             phased_sample.erase(phasedhet++);
2190     }
2191     return true;
2192 }
2193 
count_pairwise_cooccurrences(SnpAlleleCooccurrenceCounter & cooccurrences,const CLocAlnSet & aln_loc,size_t sample,const vector<size_t> & snp_cols,const vector<size_t> & het_snps,const vector<const SampleCall * > & sample_het_calls) const2194 void LocusProcessor::count_pairwise_cooccurrences(
2195         SnpAlleleCooccurrenceCounter& cooccurrences,
2196         const CLocAlnSet& aln_loc,
2197         size_t sample,
2198         const vector<size_t>& snp_cols,
2199         const vector<size_t>& het_snps,
2200         const vector<const SampleCall*>& sample_het_calls
2201         ) const {
2202 
2203     cooccurrences.clear();
2204     vector<Nt4> read_hap (het_snps.size());
2205     for (size_t read_i : aln_loc.sample_reads(sample)) {
2206         // Build the haplotype.
2207         size_t curr_col = 0;
2208         Alignment::iterator read_itr (aln_loc.reads()[read_i].aln());
2209         for (size_t het_i=0; het_i<het_snps.size(); ++het_i) {
2210             size_t col = snp_cols[het_snps[het_i]];
2211             read_itr += col - curr_col;
2212             curr_col = col;
2213             Nt4 nt = *read_itr;
2214             if (nt == Nt4::n) {
2215                 read_hap[het_i] = Nt4::n;
2216             } else {
2217                 const SampleCall& c = *sample_het_calls[het_i];
2218                 Nt2 nt2 = Nt2(nt);
2219                 if (nt2 == c.nt0() || nt2 == c.nt1())
2220                     read_hap[het_i] = nt;
2221                 else
2222                     read_hap[het_i] = Nt4::n;
2223             }
2224         }
2225 
2226         // Record the pairwise cooccurrences.
2227         for (size_t i=0; i<het_snps.size(); ++i) {
2228             Nt4 nti = read_hap[i];
2229             if (nti == Nt4::n)
2230                 continue;
2231             for (size_t j=i+1; j<het_snps.size(); ++j) {
2232                 Nt4 ntj = read_hap[j];
2233                 if (ntj == Nt4::n)
2234                     continue;
2235                 ++cooccurrences.at(het_snps[i], Nt2(nti), het_snps[j], Nt2(ntj));
2236             }
2237         }
2238     }
2239 }
2240 
assemble_phase_sets(vector<PhaseSet> & phase_sets,const vector<size_t> & het_snps,const vector<const SampleCall * > & sample_het_calls,const SnpAlleleCooccurrenceCounter & cooccurrences,const size_t min_n_cooccurrences) const2241 bool LocusProcessor::assemble_phase_sets(
2242         vector<PhaseSet>& phase_sets,
2243         const vector<size_t>& het_snps,
2244         const vector<const SampleCall*>& sample_het_calls,
2245         const SnpAlleleCooccurrenceCounter& cooccurrences,
2246         const size_t min_n_cooccurrences
2247 ) const {
2248     // xxx 2018 Feb. @Nick: Actually all of this would be more simple and efficient
2249     // if we used a single vector of PhasedHet's, with each het starting in its own
2250     // phase set.
2251     phase_sets.clear();
2252 
2253     // We keep track of which phase set each het is currently part of.
2254     vector<size_t> allele_to_ps (het_snps.size(), SIZE_MAX);
2255 
2256     assert(het_snps.size() == sample_het_calls.size());
2257     for (size_t het_i=0; het_i<het_snps.size(); ++het_i) {
2258         size_t snp_i = het_snps[het_i];
2259         size_t& ps_i = allele_to_ps[het_i]; //(by reference; always up to date)
2260         array<Nt2,2> allelesi = sample_het_calls[het_i]->nts();
2261         for (size_t het_j=het_i+1; het_j<het_snps.size(); ++het_j) {
2262             size_t snp_j = het_snps[het_j];
2263             size_t& ps_j = allele_to_ps[het_j];
2264             array<Nt2,2> allelesj = sample_het_calls[het_j]->nts();
2265 
2266             for (Nt2 nti : allelesi) {
2267                 for (Nt2 ntj : allelesj) {
2268                     size_t n = cooccurrences.at(snp_i, nti, snp_j, ntj);
2269                     if (n < min_n_cooccurrences)
2270                         // Inexistent (n==0) or low-coverage edge.
2271                         continue;
2272 
2273                     if (ps_i == SIZE_MAX && ps_j == SIZE_MAX) {
2274                         // Both nodes are singletons. Start a new phase set.
2275                         ps_i = phase_sets.size();
2276                         ps_j = phase_sets.size();
2277                         phase_sets.push_back(PhaseSet(het_snps.size()));
2278                         phase_sets.back().add_het(het_i, allelesi);
2279                         phase_sets.back().add_het(het_j, allelesj, ntj, het_i, nti);
2280                     } else if (ps_i == SIZE_MAX) {
2281                         // Add `het_i` to `ps_j`.
2282                         phase_sets[ps_j].add_het(het_i, allelesi, nti, het_j, ntj);
2283                         ps_i = ps_j;
2284                     } else if (ps_j == SIZE_MAX) {
2285                         // Add `het_j` to `ps_i`.
2286                         phase_sets[ps_i].add_het(het_j, allelesj, ntj, het_i, nti);
2287                         ps_j = ps_i;
2288                     } else if (ps_i != ps_j) {
2289                         // Merge `ps_j` into `ps_i`.
2290                         phase_sets[ps_i].merge_with(phase_sets[ps_j], het_i, nti, het_j, ntj);
2291                         phase_sets[ps_j].clear();
2292                         size_t merged_ps = ps_j;
2293                         for (size_t& allele_ps : allele_to_ps)
2294                             if (allele_ps == merged_ps)
2295                                 allele_ps = ps_i;
2296                     } else {
2297                         assert(ps_i == ps_j);
2298                         // Check that the edge is consistent.
2299                         if (!phase_sets[ps_i].is_edge_consistent(het_i, nti, het_j, ntj))
2300                             return false;
2301                     }
2302                 }
2303             }
2304         }
2305     }
2306 
2307     // Purge empty phase sets.
2308     phase_sets.erase(std::remove_if(
2309             phase_sets.begin(), phase_sets.end(),
2310             [] (const PhaseSet& ps) {return ps.empty();}
2311             ),phase_sets.end());
2312     return true;
2313 }
2314 
add_het(size_t het_i,array<Nt2,2> alleles)2315 void PhaseSet::add_het(size_t het_i, array<Nt2,2> alleles) {
2316     assert(phased_hets_.size() > het_i);
2317     assert(phased_hets_ == PhaseSet(phased_hets_.size()).phased_hets_); // Uninitialized.
2318 
2319     phased_hets_[het_i].left_allele  = Nt4(alleles[0]);
2320     phased_hets_[het_i].right_allele = Nt4(alleles[1]);
2321 }
2322 
add_het(size_t het_i,array<Nt2,2> alleles,Nt2 nt_i,size_t het_j,Nt2 nt_j)2323 void PhaseSet::add_het(size_t het_i, array<Nt2,2> alleles, Nt2 nt_i, size_t het_j, Nt2 nt_j) {
2324     assert(phased_hets_.size() > std::max(het_i, het_j));
2325     assert(phased_hets_[het_i].left_allele == Nt4::n); // `i` node shouldn't exist yet.
2326     assert(phased_hets_[het_j].left_allele != Nt4::n); //
2327 
2328     bool crossed_edge =
2329             (nt_j == phased_hets_[het_j].left_allele)
2330             ^ (nt_i == alleles[0]);
2331 
2332     if (crossed_edge) {
2333         // Flip the alleles so that the all the edges within the phase set are parallel.
2334         phased_hets_[het_i].left_allele  = Nt4(alleles[1]);
2335         phased_hets_[het_i].right_allele = Nt4(alleles[0]);
2336     } else {
2337         phased_hets_[het_i].left_allele  = Nt4(alleles[0]);
2338         phased_hets_[het_i].right_allele = Nt4(alleles[1]);
2339     }
2340 }
2341 
merge_with(const PhaseSet & other,size_t het_i,Nt2 nt_i,size_t het_j,Nt2 nt_j)2342 void PhaseSet::merge_with(const PhaseSet& other, size_t het_i, Nt2 nt_i, size_t het_j, Nt2 nt_j) {
2343     assert(phased_hets_.size() > std::max(het_i, het_j));
2344     assert(phased_hets_.size() == other.phased_hets_.size());
2345     assert(phased_hets_[het_i].left_allele != Nt4::n);
2346     assert(other.phased_hets_[het_j].left_allele != Nt4::n);
2347 
2348     bool crossed_edge =
2349             (nt_i == phased_hets_[het_i].left_allele)
2350             ^ (nt_j == other.phased_hets_[het_j].left_allele);
2351 
2352     for (size_t i=0; i<phased_hets_.size(); ++i) {
2353         if (other.phased_hets_[i].left_allele != Nt4::n) {
2354             assert(other.phased_hets_[i].right_allele != Nt4::n); // Both alleles should be set/not set together.
2355             assert(phased_hets_[i].left_allele == Nt4::n); // Phase sets should be non-overlapping.
2356             if (crossed_edge) {
2357                 phased_hets_[i].left_allele  = other.phased_hets_[i].right_allele;
2358                 phased_hets_[i].right_allele = other.phased_hets_[i].left_allele;
2359             } else {
2360                 phased_hets_[i].left_allele  = other.phased_hets_[i].left_allele;
2361                 phased_hets_[i].right_allele = other.phased_hets_[i].right_allele;
2362             }
2363         }
2364     }
2365 }
2366 
is_edge_consistent(size_t het_i,Nt2 nt_i,size_t het_j,Nt2 nt_j) const2367 bool PhaseSet::is_edge_consistent(size_t het_i, Nt2 nt_i, size_t het_j, Nt2 nt_j) const {
2368     assert(phased_hets_[het_i].left_allele != Nt4::n);
2369     assert(phased_hets_[het_j].left_allele != Nt4::n);
2370 
2371     bool crossed_edge =
2372             (nt_i == phased_hets_[het_i].left_allele)
2373             ^ (nt_j == phased_hets_[het_j].left_allele);
2374 
2375     // We build phase sets so that all inside edges are parallel
2376     // (by flipping the alleles when necessary), so the proposed
2377     // edge is inconsistent if it is crossed.
2378     return !crossed_edge;
2379 }
2380 
write_one_locus(const CLocAlnSet & aln_loc,const vector<SiteCounts> & depths,const vector<SiteCall> & calls,const vector<map<size_t,PhasedHet>> & phase_data)2381 void LocusProcessor::write_one_locus (
2382         const CLocAlnSet& aln_loc,
2383         const vector<SiteCounts>& depths,
2384         const vector<SiteCall>& calls,
2385         const vector<map<size_t,PhasedHet>>& phase_data
2386 ) {
2387     char loc_id[16];
2388     sprintf(loc_id, "%d", loc_.id);
2389 
2390     const DNASeq4& ref = aln_loc.ref();
2391     const MetaPopInfo& mpopi = aln_loc.mpopi();
2392 
2393     //
2394     // Vcf output.
2395     //
2396     timers_.building_vcf.restart();
2397     assert(depths.size() == ref.length());
2398     assert(calls.size() == ref.length());
2399     vector<size_t> sample_sites_w_data (mpopi.samples().size(), 0);
2400     stringstream vcf_records;
2401     VcfRecord rec;
2402     for (size_t i=0; i<ref.length(); ++i) {
2403         const SiteCounts& sitedepths = depths[i];
2404         const SiteCall& sitecall = calls[i];
2405         if (sitecall.alleles().empty())
2406             // No useful data at this site.
2407             continue;
2408 
2409         if (!ref[i].is_acgt())
2410             continue;
2411 
2412         // Determine which alleles exist, and their order.
2413         // (n.b. As of Apr4,2017 the ref allele might not be the most frequent one.)
2414         vector<Nt2> vcf_alleles;
2415         map<Nt2, size_t> vcf_allele_indexes;
2416         {
2417             vcf_alleles.push_back(ref[i]);
2418             vcf_allele_indexes.insert({Nt2(ref[i]), 0});
2419 
2420             // Sort the alleles by frequency.
2421             vector<pair<double, Nt2>> sorted_alleles;
2422             for (auto& a : sitecall.alleles())
2423                 sorted_alleles.push_back({a.second, a.first});
2424             sort(sorted_alleles.rbegin(), sorted_alleles.rend()); // (decreasing)
2425 
2426             // The reference allele has already been added to vcf_alleles; exclude it.
2427             for (auto iter=sorted_alleles.begin(); iter!=sorted_alleles.end(); ++iter) {
2428                 if (iter->second == vcf_alleles[0]) {
2429                     sorted_alleles.erase(iter);
2430                     break;
2431                 }
2432             }
2433 
2434             // Record the alternative alleles.
2435             for (auto& alt_allele : sorted_alleles) {
2436                 vcf_allele_indexes.insert({alt_allele.second, vcf_alleles.size()});
2437                 vcf_alleles.push_back(alt_allele.second);
2438             }
2439         }
2440 
2441         //
2442         // Create the VCF record.
2443         //
2444 
2445         // Chrom & pos.
2446         rec.clear();
2447         rec.append_chrom(loc_id);
2448         rec.append_pos(i);
2449         rec.append_id();
2450 
2451         // Alleles.
2452         for (Nt2 nt : vcf_alleles)
2453             rec.append_allele(nt);
2454 
2455         // SNP quality.
2456         rec.append_qual(sitecall.snp_qual());
2457         rec.append_filters();
2458 
2459         if(vcf_alleles.size() == 1) {
2460             // Fixed site.
2461 
2462             // Info/DP.
2463             rec.append_info(string("DP=") + to_string(sitedepths.tot.sum()));
2464             // Info/AD.
2465             Nt2 ref_nt = sitecall.alleles().begin()->first;
2466             rec.append_info(string("AD=") + to_string(sitedepths.tot[ref_nt]));
2467             if (dbg_write_nt_depths) {
2468                 // Info/cnts.
2469                 stringstream cnts;
2470                 join(sitedepths.tot.arr(), ',', cnts);
2471                 rec.append_info(string("cnts=") + cnts.str());
2472             }
2473             // Format.
2474             rec.append_format("DP");
2475             // Genotypes.
2476             for (size_t sample=0; sample<mpopi.samples().size(); ++sample) {
2477                 size_t dp = sitedepths.samples[sample].sum();
2478                 if (dp == 0) {
2479                     rec.append_sample(".");
2480                     continue;
2481                 }
2482                 ++sample_sites_w_data[sample];
2483                 rec.append_sample(to_string(dp));
2484             }
2485 
2486         } else {
2487             // Polymorphic site.
2488 
2489             // Info/DP.
2490             rec.append_info(string("DP=") + to_string(sitedepths.tot.sum()));
2491             // Info/AD.
2492             vector<size_t> ad;
2493             for (Nt2 nt : vcf_alleles)
2494                 ad.push_back(sitedepths.tot[nt]);
2495             stringstream ss;
2496             join(ad, ',', ss);
2497             rec.append_info(string("AD=") + ss.str());
2498             // Info/AF.
2499             vector<double> alt_freqs;
2500             for (auto nt=++vcf_alleles.begin(); nt!=vcf_alleles.end(); ++nt) // rem. always >1 alleles.
2501                 alt_freqs.push_back(sitecall.alleles().at(*nt));
2502             rec.append_info(VcfRecord::util::fmt_info_af(alt_freqs));
2503             if (dbg_write_nt_depths) {
2504                 // Info/cnts.
2505                 stringstream cnts;
2506                 join(sitedepths.tot.arr(), ',', cnts);
2507                 rec.append_info(string("cnts=") + cnts.str());
2508             }
2509 
2510             // Format.
2511             rec.append_format("GT");
2512             if (!dbg_no_haplotypes) {
2513                 rec.append_format("PS"); // Phase set.
2514                 rec.append_format("FT"); // Filter.
2515             }
2516             rec.append_format("GQ");
2517             rec.append_format("DP");
2518             rec.append_format("AD");
2519             rec.append_format("GL");
2520 
2521             // Genotypes.
2522             for (size_t sample : mpopi.sample_indexes_orig_order()) {
2523                 const Counts<Nt2>& sdepths = sitedepths.samples[sample];
2524                 const SampleCall& scall = sitecall.sample_calls()[sample];
2525                 if (sdepths.sum() == 0) {
2526                     // No data for this sample.
2527                     rec.append_sample("./.");
2528                     continue;
2529                 }
2530                 ++sample_sites_w_data[sample];
2531 
2532                 stringstream genotype;
2533                 // GT field.
2534                 array<size_t,2> gt;
2535                 switch (scall.call()) {
2536                 case snp_type_hom:
2537                     gt[0] = vcf_allele_indexes.at(scall.nt0());
2538                     genotype << gt[0] << '/' << gt[0];
2539                     if (!dbg_no_haplotypes)
2540                         genotype << ":.:.";
2541                     genotype << ':' << scall.gq();
2542                     break;
2543                 case snp_type_het:
2544                     if (!dbg_no_haplotypes) {
2545                         assert(!phase_data.empty());
2546                         const PhasedHet& p = phase_data[sample].at(i);
2547                         genotype << vcf_allele_indexes.at(p.left_allele)
2548                                     << '|'
2549                                     << vcf_allele_indexes.at(p.right_allele)
2550                                     << ':' << (p.phase_set + 1)
2551                                     << ':' << '.';
2552                     } else {
2553                         //dbg_no_haplotypes
2554                         gt[0] = vcf_allele_indexes.at(scall.nt0());
2555                         gt[1] = vcf_allele_indexes.at(scall.nt1());
2556                         std::sort(gt.begin(), gt.end()); // (Prevents '1/0'.)
2557                         genotype << gt[0] << '/' << gt[1];
2558                     }
2559                     genotype << ':' << scall.gq();
2560                     break;
2561                 default:
2562                     genotype << "./.";
2563                     if (!dbg_no_haplotypes) {
2564                         if (scall.call() == snp_type_discarded)
2565                             genotype << ":.:disc";
2566                         else
2567                             genotype << ":.:.";
2568                     }
2569                     genotype << ":.";
2570                     break;
2571                 }
2572                 // DP field.
2573                 genotype << ':' << sdepths.sum();
2574                 // AD field.
2575                 vector<size_t> ad;
2576                 ad.reserve(vcf_alleles.size());
2577                 for (Nt2 nt : vcf_alleles)
2578                     ad.push_back(sdepths[nt]);
2579                 genotype << ':';
2580                 join(ad, ',', genotype);
2581                 // GL field.
2582                 genotype << ':' << VcfRecord::util::fmt_gt_gl(vcf_alleles, scall.lnls());
2583                 // cnts field.
2584                 if (dbg_write_nt_depths) {
2585                     genotype << ":";
2586                     join(sdepths.arr(), ',', genotype);
2587                 }
2588                 // Push it.
2589                 rec.append_sample(genotype.str());
2590             }
2591         }
2592         assert(rec.count_samples() == o_vcf_f->header().samples().size());
2593 
2594         vcf_records << rec;
2595     }
2596     loc_.o_vcf = vcf_records.str();
2597     timers_.building_vcf.update();
2598 
2599     //
2600     // Fasta output.
2601     //
2602     timers_.building_fa.restart();
2603 
2604     // Determine the number of samples for this locus. Some samples may have
2605     // been discarded (as of May 26, 2017, this would be because their haplotypes
2606     // were inconsistent).
2607     set<size_t> samples_w_reads;
2608     for (const SAlnRead& r : aln_loc.reads())
2609         samples_w_reads.insert(r.sample);
2610     size_t n_remaining_samples = 0;
2611     for (size_t sample_n_sites : sample_sites_w_data)
2612         if (sample_n_sites > 0)
2613             ++n_remaining_samples;
2614 
2615 
2616     // Write the record.
2617     string& fa = loc_.o_fa;
2618     assert(fa.empty());
2619     char cstr_buf[32];
2620     fa += '>';
2621     fa += loc_id;
2622     // Genomic position.
2623     if (!aln_loc.pos().empty()) {
2624         const PhyLoc& p = aln_loc.pos();
2625         sprintf(cstr_buf, "%u", p.bp+1);
2626         fa += " pos=";
2627         fa += p.chr();
2628         fa += ':';
2629         fa += cstr_buf;
2630         fa += ':';
2631         fa += (p.strand == strand_plus ? '+' : '-');
2632     }
2633     // Number of samples
2634     sprintf(cstr_buf, "%zu", n_remaining_samples);
2635     fa += " NS=";
2636     fa += cstr_buf;
2637     if (n_remaining_samples != samples_w_reads.size()) {
2638         assert(n_remaining_samples < samples_w_reads.size());
2639         sprintf(cstr_buf, "%zu", samples_w_reads.size() - n_remaining_samples);
2640         fa += " n_discarded_samples=";
2641         fa += cstr_buf;
2642     }
2643     // Contig status.
2644     switch (loc_.ctg_status) {
2645     case LocData::overlapped:
2646         sprintf(cstr_buf, "%zu", loc_.olap_length);
2647         fa += " contig=overlapped:";
2648         fa += cstr_buf;
2649         break;
2650     case LocData::separate:
2651         fa += " contig=separate";
2652         break;
2653     default:
2654         break;
2655     }
2656     fa += '\n';
2657     fa += ref.str();
2658     fa += '\n';
2659     timers_.building_fa.update();
2660 }
2661 
write_sample_hapgraph(ostream & os,size_t sample,const vector<size_t> & het_snps,const vector<size_t> & snp_cols,const vector<const SampleCall * > & sample_het_calls,const SnpAlleleCooccurrenceCounter & cooccurrences) const2662 void LocusProcessor::write_sample_hapgraph(
2663         ostream& os,
2664         size_t sample,
2665         const vector<size_t>& het_snps,
2666         const vector<size_t>& snp_cols,
2667         const vector<const SampleCall*>& sample_het_calls,
2668         const SnpAlleleCooccurrenceCounter& cooccurrences
2669         ) const {
2670 
2671     auto nodeid = [this,&sample](size_t col, Nt2 allele)
2672             {return string("l")+to_string(loc_.id)+"s"+to_string(sample)+"c"+to_string(col)+char(allele);};
2673 
2674     // Initialize the subgraph.
2675     os << "\tsubgraph cluster_sample" << sample << " {\n"
2676                   << "\t\tlabel=\""
2677                   << "i" << sample << " '" << loc_.mpopi->samples()[sample].name << "'\\n"
2678                   << "\";\n"
2679                   << "\t\tstyle=dashed;\n"
2680                   << "\t\t# heterozygous columns: ";
2681     vector<size_t> het_cols;
2682     for (size_t snp_i : het_snps)
2683         het_cols.push_back(snp_cols[snp_i]);
2684     join(het_cols, ',', os);
2685     os << "\n";
2686 
2687     // Write the node labels.
2688     for (size_t i=0; i<het_snps.size(); ++i) {
2689         array<Nt2,2> alleles = sample_het_calls[i]->nts();
2690         size_t col = snp_cols[het_snps[i]];
2691         for (Nt2 allele : alleles)
2692             os << "\t\t" << nodeid(col, allele)
2693                           << " [label=<"
2694                           << "<sup><font point-size=\"10\">" << col+1 << "</font></sup>" << allele
2695                           << ">];\n";
2696     }
2697 
2698     // Write the edges.
2699     for (size_t het_i=0; het_i<het_snps.size(); ++het_i) {
2700         array<Nt2,2> alleles_i = sample_het_calls[het_i]->nts();
2701         size_t snp_i = het_snps[het_i];
2702         for (size_t het_j=het_i+1; het_j<het_snps.size(); ++het_j) {
2703             array<Nt2,2> alleles_j = sample_het_calls[het_j]->nts();
2704             size_t snp_j = het_snps[het_j];
2705             for (Nt2 nti : alleles_i) {
2706                 for (Nt2 ntj : alleles_j) {
2707                     size_t n = cooccurrences.at(snp_i, nti, snp_j, ntj);
2708                     if (n == 0)
2709                         continue;
2710                     os << "\t\t" << nodeid(snp_cols[snp_i],nti)
2711                                   << " -- " << nodeid(snp_cols[snp_j],ntj) << " [";
2712                     if (n==1)
2713                         os << "style=dotted";
2714                     else
2715                         os << "label=\"" << n << "\",penwidth=" << n;
2716                     os << "];\n";
2717                 }
2718             }
2719         }
2720     }
2721     os << "\t}\n";
2722 }
2723 
operator +=(const Timers & other)2724 Timers& Timers::operator+= (const Timers& other) {
2725     reading += other.reading;
2726     processing += other.processing;
2727     writing_fa += other.writing_fa;
2728     writing_vcf += other.writing_vcf;
2729     writing_details += other.writing_details;
2730     writing_bams += other.writing_bams;
2731     // Within processing:
2732     processing_pre_alns += other.processing_pre_alns;
2733     rm_Ns += other.rm_Ns;
2734     assembling += other.assembling;
2735     init_alignments += other.init_alignments;
2736     aligning += other.aligning;
2737     building_bams += other.building_bams;
2738     merge_paired_reads += other.merge_paired_reads;
2739     processing_post_alns += other.processing_post_alns;
2740     rm_reads += other.rm_reads;
2741     genotyping += other.genotyping;
2742     haplotyping += other.haplotyping;
2743     building_vcf += other.building_vcf;
2744     building_fa += other.building_fa;
2745     cpt_consensus += other.cpt_consensus;
2746     counting_nts += other.counting_nts;
2747     return *this;
2748 }
2749 
2750 //
2751 // Arguments
2752 // ==========
2753 //
2754 
2755 const string help_string = string() +
2756         "gstacks " + VERSION  + "\n" +
2757         "\n"
2758         "De novo mode:\n"
2759         "  gstacks -P stacks_dir -M popmap\n"
2760         "\n"
2761         "  -P: input directory containg '*.matches.bam' files created by the\n"
2762         "      de novo Stacks pipeline, ustacks-cstacks-sstacks-tsv2bam\n"
2763         "\n"
2764         "Reference-based mode:\n"
2765         "  gstacks -I bam_dir -M popmap [-S suffix] -O out_dir\n"
2766         "  gstacks -B bam_file [-B ...] -O out_dir\n"
2767         "\n"
2768         "  -I: input directory containing BAM files\n"
2769         "  -S: with -I/-M, suffix to use to build BAM file names: by default this\n"
2770         "      is just '.bam', i.e. the program expects 'SAMPLE_NAME.bam'\n"
2771         "  -B: input BAM file(s)\n"
2772         "\n"
2773         "  The input BAM file(s) must be sorted by coordinate.\n"
2774         "  With -B, records must be assigned to samples using BAM \"reads groups\"\n"
2775         "  (gstacks uses the ID/identifier and SM/sample name fields). Read groups\n"
2776         "  must be consistent if repeated different files. Note that with -I, read\n"
2777         "  groups are unneeded and ignored.\n"
2778         "\n"
2779         "For both modes:\n"
2780         "  -M: path to a population map giving the list of samples\n"
2781         "  -O: output directory (default: none with -B; with -P same as the input\n"
2782         "      directory)\n"
2783         "  -t,--threads: number of threads to use (default: 1)\n"
2784         "\n"
2785         "SNP Model options:\n"
2786         "  --model: model to use to call variants and genotypes; one of\n"
2787         "           marukilow (default), marukihigh, or snp\n"
2788         "  --var-alpha: alpha threshold for discovering SNPs (default: 0.01 for marukilow)\n"
2789         "  --gt-alpha: alpha threshold for calling genotypes (default: 0.05)\n"
2790         "\n"
2791         "Paired-end options:\n"
2792         "  --rm-pcr-duplicates: remove all but one set ofread pairs of the same sample that \n"
2793         "                       have the same insert length (implies --rm-unpaired-reads)\n"
2794         "  --rm-unpaired-reads: discard unpaired reads\n"
2795         "  --ignore-pe-reads: ignore paired-end reads even if present in the input\n"
2796         "  --unpaired: ignore read pairing (only for paired-end GBS; treat READ2's as if they were READ1's)\n"
2797         "\n"
2798         "Advanced options:\n"
2799         "  (De novo mode)\n"
2800         "  --kmer-length: kmer length for the de Bruijn graph (default: 31, max. 31)\n"
2801         "  --max-debruijn-reads: maximum number of reads to use in the de Bruijn graph (default: 1000)\n"
2802         "  --min-kmer-cov: minimum coverage to consider a kmer (default: 2)\n"
2803         "  --write-alignments: save read alignments (heavy BAM files)\n"
2804         "\n"
2805         "  (Reference-based mode)\n"
2806         "  --min-mapq: minimum PHRED-scaled mapping quality to consider a read (default: 10)\n"
2807         "  --max-clipped: maximum soft-clipping level, in fraction of read length (default: 0.20)\n"
2808         "  --max-insert-len: maximum allowed sequencing insert length (default: 1000)\n"
2809         "\n"
2810         "  --details: write a heavier output\n"
2811         "  --phasing-cooccurrences-thr-range: range of edge coverage thresholds to\n"
2812         "        iterate over when building the graph of allele cooccurrences for\n"
2813         "        SNP phasing (default: 1,2)\n"
2814         "  --phasing-dont-prune-hets: don't try to ignore dubious heterozygote\n"
2815         "        genotypes during phasing\n"
2816         "\n"
2817 #ifdef DEBUG
2818         "Debug options:\n"
2819         "  --dbg-no-overlaps: disable overlapping\n"
2820         "  --dbg-no-haps: disable phasing\n"
2821         "  --dbg-gfa: output a GFA file for each locus\n"
2822         "  --dbg-gfa-not-dag: output a GFA file for failed assemblies\n"
2823         "  --dbg-alns: output a file showing the contigs & alignments\n"
2824         "  --dbg-phasing-min-mac: minimum SNP MAC.\n"
2825         "  --dbg-phasing-no-2ndpass: don't try a second pass.\n"
2826         "  --dbg-hapgraphs: output a dot graph file showing phasing information\n"
2827         "  --dbg-depths: write detailed depth data in the output VCF\n"
2828         "  --dbg-log-stats-phasing: log detailed phasing statistics\n"
2829         "  --dbg-min-spl-reads: discard samples with less than this many reads (ref-based)\n"
2830         "  --dbg-min-loc-spls: discard loci with less than this many samples\n"
2831         "  --dbg-max-debruijn-reads\n"
2832         "\n"
2833 #endif
2834         ;
2835 
parse_command_line(int argc,char * argv[])2836 string parse_command_line(int argc, char* argv[]) {
2837     auto bad_args = [](){
2838         cerr << help_string;
2839         exit(1);
2840     };
2841 try {
2842     static const option long_options[] = {
2843         {"version",      no_argument,       NULL,  1000},
2844         {"help",         no_argument,       NULL,  'h'},
2845         {"quiet",        no_argument,       NULL,  'q'},
2846         {"stacks-dir",   required_argument, NULL,  'P'},
2847         {"in-dir",       required_argument, NULL,  'I'},
2848         {"in-bam",       required_argument, NULL,  'B'},
2849         {"suffix",       required_argument, NULL,  'S'},
2850         {"popmap",       required_argument, NULL,  'M'},
2851         {"out-dir",      required_argument, NULL,  'O'},
2852         {"unpaired",     no_argument,       NULL,  1007},
2853         {"threads",      required_argument, NULL,  't'},
2854         {"model",        required_argument, NULL,  1006},
2855         {"gt-alpha",     required_argument, NULL,  1005},
2856         {"var-alpha",    required_argument, NULL,  1008},
2857         {"kmer-length",  required_argument, NULL,  1001},
2858         {"max-debruijn-reads", required_argument, NULL, 1024},
2859         {"min-kmer-cov", required_argument, NULL,  1002},
2860         {"ignore-pe-reads", no_argument,    NULL,  1012},
2861         {"write-alignments", no_argument,   NULL,  1021},
2862         {"details",      no_argument,       NULL,  1013},
2863         {"min-mapq",     required_argument, NULL,  1014},
2864         {"max-clipped",  required_argument, NULL,  1015},
2865         {"max-insert-len", required_argument, NULL,  1016},
2866         {"rm-unpaired-reads", no_argument,  NULL,  1017},
2867         {"rm-pcr-duplicates", no_argument,  NULL,  1018},
2868         {"phasing-cooccurrences-thr-range", required_argument, NULL, 1019},
2869         {"phasing-dont-prune-hets", no_argument, NULL, 1020},
2870         //debug options
2871         {"min-kmer-freq", required_argument, NULL, 3021},
2872         {"dbg-phasing-min-mac", required_argument, NULL,  2018},
2873         {"dbg-phasing-no-2ndpass", no_argument, NULL, 2019},
2874         {"dbg-print-cloc-ids", no_argument, NULL,  2000},
2875         {"dbg-gfa",      no_argument,       NULL,  2003},
2876         {"dbg-gfa-not-dag",  no_argument,   NULL,  2015},
2877         {"dbg-alns",     no_argument,       NULL,  2004}, {"alns", no_argument, NULL, 2004},
2878         {"dbg-depths",   no_argument,       NULL,  2007},
2879         {"dbg-hapgraphs", no_argument,      NULL,  2010},
2880         {"dbg-no-overlaps", no_argument,    NULL,  2008},
2881         {"dbg-no-haps",  no_argument,       NULL,  2009},
2882         {"dbg-min-spl-reads", required_argument, NULL, 2014},
2883         {"dbg-min-loc-spls", required_argument, NULL, 2017},
2884         {0, 0, 0, 0}
2885     };
2886 
2887     string stacks_dir;
2888     string in_dir;
2889     string suffix = ".bam";
2890 
2891     double gt_alpha = 0.05;
2892     double var_alpha = 0.0;
2893 
2894     // bool pcr_dupl_measures = true;
2895     // auto pcr_duplicates_measures = [&](){
2896     //     pcr_dupl_measures = true;
2897     //     phasing_min_mac = 3;
2898     //     //TODO
2899     // };
2900     // auto no_pcr_duplicates_measures = [&](){
2901     //     pcr_dupl_measures = false;
2902     //     phasing_min_mac = 1;
2903     //     //TODO
2904     // };
2905     // pcr_duplicates_measures();
2906 
2907     int c;
2908     int long_options_i;
2909     std::cmatch tmp_cmatch;
2910     long tmp_long;
2911     while (true) {
2912 
2913         c = getopt_long(argc, argv, "hqP:I:B:S:s:M:O:W:t:m:", long_options, &long_options_i);
2914 
2915         if (c == -1)
2916             break;
2917 
2918         switch (c) {
2919         case 1000: //version
2920             cout << prog_name << " " << VERSION << "\n";
2921             exit(0);
2922             break;
2923         case 'h':
2924             cout << help_string;
2925             exit(0);
2926             break;
2927         case 'q':
2928             quiet = true;
2929             break;
2930         case 'P':
2931             stacks_dir = optarg;
2932             break;
2933         case 'I':
2934             in_dir = optarg;
2935             break;
2936         case 'B':
2937             in_bams.push_back(optarg);
2938             break;
2939         case 'S':
2940             suffix = optarg;
2941             break;
2942         case 'M':
2943             popmap_path = optarg;
2944             break;
2945         case 'O':
2946             out_dir = optarg;
2947             break;
2948         case 1007: //unpaired
2949             refbased_cfg.paired = false;
2950             break;
2951         case 1006: //model
2952             model_type = parse_model_type(optarg);
2953             break;
2954         case 1005: //gt-alpha
2955             gt_alpha = atof(optarg);
2956             break;
2957         case 1008: //var-alpha
2958             var_alpha = atof(optarg);
2959             break;
2960         case 't':
2961             num_threads = is_integer(optarg);
2962             if (num_threads < 0) {
2963                 cerr << "Error: Illegal -t option value '" << optarg << "'.\n";
2964                 bad_args();
2965             }
2966             break;
2967         case 1012: //ignore-pe-reads
2968             ignore_pe_reads = true;
2969             refbased_cfg.ign_pe_reads = true;
2970             break;
2971         case 1001://kmer-length
2972             km_length = atoi(optarg);
2973             if (km_length < 2 || km_length > 31) {
2974                 cerr << "Error: Illegal -t option value '" << optarg << "' (valid range is 2-31).\n";
2975                 bad_args();
2976             }
2977             break;
2978         case 1024: // max-debruijn-reads
2979             try {
2980                 tmp_long = std::stol(optarg);
2981                 if (tmp_long < 1)
2982                     throw exception();
2983             } catch(std::exception) {
2984                 cerr << "Error: Illegal max-debruijn-reads option value '" << optarg << "'.\n";
2985                 bad_args();
2986             }
2987             max_debruijn_reads = tmp_long;
2988             break;
2989         case 1002://min-kmer-cov
2990             min_km_count = atoi(optarg);
2991             break;
2992         case 1021://write-alignments
2993             bam_output = true;
2994             break;
2995         case 1013://details
2996             detailed_output = true;
2997             break;
2998         case 1014://min-mapq
2999             refbased_cfg.min_mapq = stoi(optarg);
3000             if (refbased_cfg.min_mapq > 255) {
3001                 cerr << "Error: Illegal --min-mapq value '" << optarg << "'.\n";
3002                 bad_args();
3003             }
3004             break;
3005         case 1015://max-clipped
3006             refbased_cfg.max_clipped = atof(optarg);
3007             if (refbased_cfg.max_clipped < 0.0 || refbased_cfg.max_clipped > 1.0) {
3008                 cerr << "Error: Illegal --max-clipped value '" << optarg << "'.\n";
3009                 bad_args();
3010             }
3011             break;
3012         case 1016://max-insert
3013             refbased_cfg.max_insert_refsize = stoi(optarg);
3014             break;
3015         case 1017://rm-unpaired-reads
3016             rm_unpaired_reads = true;
3017             break;
3018         case 1018://rm-pcr-duplicates
3019             rm_pcr_duplicates = true;
3020             rm_unpaired_reads = true;
3021             // no_pcr_duplicates_measures();
3022             break;
3023         // case 1022://pcr-duplicates-measures
3024         //     pcr_duplicates_measures();
3025         //     break;
3026         // case 1023://no-pcr-duplicates-measures
3027         //     no_pcr_duplicates_measures();
3028         //     break;
3029         case 1019: //phasing-cooccurrences-thr-range
3030             std::regex_search(optarg, tmp_cmatch, std::regex("^[0-9]+,[0-9]+$"));
3031             if (!tmp_cmatch.empty()) {
3032                 phasing_cooccurrences_thr_range.first = atoi(optarg);
3033                 phasing_cooccurrences_thr_range.second = atoi(strchr(optarg, ',') + 1);
3034             }
3035             if (tmp_cmatch.empty() || phasing_cooccurrences_thr_range.first > phasing_cooccurrences_thr_range.second) {
3036                 cerr << "Error: Illegal --phasing-cooccurrences-thr-range value '"
3037                      << optarg << "'; expected 'INT_MIN,INT_MAX'.\n";
3038                 bad_args();
3039             }
3040             break;
3041         case 1020: //phasing-dont-prune-hets
3042             phasing_dont_prune_hets = true;
3043             break;
3044 
3045         //
3046         // Debug options
3047         //
3048         case 3021://min-kmer-freq
3049             cerr << "WARNING: Ignored option --min-kmer-freq that does not exist anymore. See --max-debruijn-reads.\n";
3050             break;
3051         case 2018: //dbg-phasing-min-mac
3052             phasing_min_mac = is_integer(optarg);
3053             if (phasing_min_mac < 0) {
3054                 cerr << "Error: Illegal --dbg-phasing-min-mac value '" << optarg << "'.\n";
3055                 bad_args();
3056             }
3057             break;
3058         case 2019: //dbg-phasing-no-2ndpass
3059             dbg_phasing_no_2ndpass = true;
3060             break;
3061         case 2000://dbg-print-cloc-ids
3062             dbg_print_cloc_ids = true;
3063             break;
3064         case 2003://dbg-gfa
3065             dbg_write_gfa = true;
3066             break;
3067         case 2015://dbg-gfa-not-dag
3068             dbg_write_gfa_notdag = true;
3069             break;
3070         case 2004://dbg-alns
3071             dbg_write_alns = true;
3072             break;
3073         case 2010://dbg-hapgraphs
3074             dbg_write_hapgraphs = true;
3075             break;
3076         case 2007://dbg-depths
3077             dbg_write_nt_depths = true;
3078             break;
3079         case 2008://dbg-no-haps
3080             dbg_no_overlaps = true;
3081             break;
3082         case 2009://dbg-no-haps
3083             dbg_no_haplotypes = true;
3084             break;
3085         case 2014://dbg-min-spl-reads
3086             refbased_cfg.min_reads_per_sample = stoi(optarg);
3087             break;
3088         case 2017://dbg-min-loc-spls
3089             refbased_cfg.min_samples_per_locus = stoi(optarg);
3090             dbg_denovo_min_loc_samples = stoi(optarg);
3091             break;
3092         case '?':
3093             bad_args();
3094             break;
3095         default:
3096             DOES_NOT_HAPPEN;
3097             break;
3098         }
3099     }
3100 
3101     //
3102     // Check command consistency.
3103     //
3104 
3105     if (optind < argc) {
3106         cerr << "Error: Failed to parse command line: '" << argv[optind]
3107              << "' is seen as a positional argument. Expected no positional arguments.\n";
3108         bad_args();
3109     }
3110 
3111     size_t n_modes_given = !stacks_dir.empty() + !in_dir.empty() + !in_bams.empty();
3112     if (n_modes_given != 1) {
3113         cerr << "Error: Please specify exactly one of -P, -I or -B.\n";
3114         bad_args();
3115     }
3116 
3117     typedef GStacksInputT In;
3118     if (!stacks_dir.empty())
3119         input_type = popmap_path.empty()
3120             ? In::denovo_merger
3121             : In::denovo_popmap;
3122     else if (!in_dir.empty())
3123         input_type = In::refbased_popmap;
3124     else if (!in_bams.empty())
3125         input_type = In::refbased_list;
3126     else
3127         DOES_NOT_HAPPEN;
3128 
3129     if (input_type == In::refbased_popmap
3130         && popmap_path.empty()
3131     ) {
3132         cerr << "Error: Please specify a population map (-M).\n";
3133         bad_args();
3134     } else if (input_type == In::refbased_list
3135         && !popmap_path.empty()
3136     ) {
3137         cerr << "Error: Please specify -I/-M or -B, not both.\n";
3138         bad_args();
3139     }
3140 
3141     if (input_type == In::denovo_popmap || input_type == In::denovo_merger) {
3142         if (!refbased_cfg.paired) {
3143             cerr << "Error: --unpaired is for the reference-based mode.\n";
3144             bad_args();
3145         }
3146         if (suffix != ".bam") {
3147             cerr << "Error: --suffix is for the reference-based mode.\n";
3148             bad_args();
3149         }
3150     } else if (input_type == In::refbased_popmap || input_type == In::refbased_list) {
3151         if (!refbased_cfg.paired && rm_unpaired_reads) {
3152             cerr << "Error: --unpaired and --rm-unpaired-reads/--rm-pcr-duplicates are not compatible.\n";
3153             bad_args();
3154         }
3155         if (out_dir.empty()) {
3156             cerr << "Error: Please specify an output directory (-O).\n";
3157             bad_args();
3158         }
3159         if (min_km_count != 2 || max_debruijn_reads != 1000) {
3160             cerr << "Error: --min-kmer-count, --max-debruijn-reads are for the denovo mode.\n";
3161             bad_args();
3162         }
3163         if (bam_output) {
3164             cerr << "Error: --write-alignments is for the denovo mode.\n";
3165             bad_args();
3166         }
3167     } else {
3168         DOES_NOT_HAPPEN;
3169     }
3170 
3171     //
3172     // Process arguments.
3173     //
3174 
3175     for (string* dir : {&stacks_dir, &in_dir, &out_dir})
3176         if (!dir->empty() && dir->back() != '/')
3177             *dir += '/';
3178 
3179     if (var_alpha == 0.0) {
3180         if (model_type == marukilow) {
3181             var_alpha = 0.01;
3182         } else {
3183             cerr << "Error: No value was provided for --var-alpha"
3184                  << " (and there is no default for this model).\n";
3185             bad_args();
3186         }
3187     }
3188     switch (model_type) {
3189     case snp:        model.reset(new MultinomialModel(gt_alpha)); break;
3190     case marukihigh: model.reset(new MarukiHighModel(gt_alpha, var_alpha));  break;
3191     case marukilow:  model.reset(new MarukiLowModel(gt_alpha, var_alpha));   break;
3192     default:
3193         cerr << "Error: Model choice '" << to_string(model_type) << "' is not supported.\n";
3194         bad_args();
3195         break;
3196     }
3197 
3198     if (!popmap_path.empty()) {
3199         MetaPopInfo m;
3200         m.init_popmap(popmap_path);
3201         for (auto& s : m.samples())
3202             sample_names.push_back(s.name);
3203     }
3204 
3205     if (input_type == In::denovo_popmap) {
3206         for (const string& s : sample_names) {
3207             in_bams.push_back(stacks_dir + s + ".matches.bam");
3208             if (bam_output)
3209                 out_bams.push_back(stacks_dir + s + ".alns.bam");
3210         }
3211         if (out_dir.empty())
3212             out_dir = stacks_dir;
3213     } else if (input_type == In::denovo_merger) {
3214         in_bams.push_back(stacks_dir + "catalog.bam");
3215         if (bam_output)
3216             out_bams.push_back(stacks_dir + "alignments.bam");
3217         if (out_dir.empty())
3218             out_dir = stacks_dir;
3219     } else if (input_type == In::refbased_popmap) {
3220         for (const string& sample : sample_names)
3221             in_bams.push_back(in_dir + sample + suffix);
3222     } else {
3223         assert(input_type == In::refbased_list);
3224     }
3225 
3226     check_or_mk_dir(out_dir);
3227 
3228     if (in_bams.empty())
3229         DOES_NOT_HAPPEN;
3230 
3231     //
3232     // Write the report.
3233     //
3234     stringstream os;
3235     os << "\n"
3236        << "Configuration for this run:\n";
3237     switch (input_type) {
3238     case GStacksInputT::denovo_popmap:
3239     case GStacksInputT::denovo_merger:
3240         os << "  Input mode: denovo\n";
3241         break;
3242     case GStacksInputT::refbased_popmap:
3243     case GStacksInputT::refbased_list:
3244         os << "  Input mode: reference-based";
3245         if (!refbased_cfg.paired)
3246             os << ", unpaired";
3247         os << "\n";
3248         break;
3249     default:
3250         DOES_NOT_HAPPEN;
3251         break;
3252     }
3253     if (!popmap_path.empty())
3254         os << "  Population map: '" << popmap_path << "'\n";
3255     os << "  Input files: " << in_bams.size() << ", e.g. '" << in_bams.front()<< "'\n"
3256        << "  Output to: '" << out_dir << "'\n"
3257        << "  Model: " << *model << "\n";
3258     if (ignore_pe_reads)
3259         os << "  Ignoring paired-end reads.\n";
3260     if (!refbased_cfg.paired)
3261         os << "  Ignoring pairing information.\n";
3262     if (km_length != 31)
3263         os << "  Kmer length: " << km_length << "\n";
3264     if (max_debruijn_reads != 1000)
3265         os << "  Maximum reads used for contig assembly: " << max_debruijn_reads << "\n";
3266     if (min_km_count != 2)
3267         os << "  Minimum kmer count: " << min_km_count << "\n";
3268     if (rm_unpaired_reads)
3269         os << "  Discarding unpaired reads.\n";
3270     if (rm_pcr_duplicates)
3271         os << "  Removing PCR duplicates.\n";
3272     // if (pcr_dupl_measures)
3273     //     os << "  PCR duplicates mitigation measures enabled.\n";
3274 
3275     return os.str();
3276 
3277 } catch (std::invalid_argument&) {
3278     bad_args();
3279     exit(1);
3280 }
3281 }
3282