1 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
2 //
3 // Copyright 2010-2018, Julian Catchen <jcatchen@illinois.edu>
4 //
5 // This file is part of Stacks.
6 //
7 // Stacks is free software: you can redistribute it and/or modify
8 // it under the terms of the GNU General Public License as published by
9 // the Free Software Foundation, either version 3 of the License, or
10 // (at your option) any later version.
11 //
12 // Stacks is distributed in the hope that it will be useful,
13 // but WITHOUT ANY WARRANTY; without even the implied warranty of
14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 // GNU General Public License for more details.
16 //
17 // You should have received a copy of the GNU General Public License
18 // along with Stacks.  If not, see <http://www.gnu.org/licenses/>.
19 //
20 
21 //
22 // ustacks -- build denovo stacks
23 //
24 
25 #include "ustacks.h"
26 #include "log_utils.h"
27 
28 using namespace std;
29 
//
// Global variables to hold command-line options.
//
FileT   in_file_type;                   // Format of the input reads file (set by parse_command_line).
string  in_file;                        // Path to the input reads file.
string  prefix_path;                    // Output path prefix — presumably where result files are written; verify against write_results().
int     num_threads       = 1;          // Number of OpenMP threads to run.
int     sample_id         = -1;         // Numeric ID of this sample (-1 = unset).
bool    call_sec_hapl     = true;       // Presumably: call haplotypes using secondary reads — not referenced in this chunk.
bool    set_kmer_len      = true;       // If true, the k-mer length is chosen automatically.
int     global_kmer_len   = 0;          // User-supplied k-mer length (used when set_kmer_len is false).
int     min_merge_cov     = 3;          // Minimum depth of coverage to create a stack (-m).
uint    max_subgraph      = 3;          // Maximum number of stacks allowed per de novo locus.
int     dump_graph        = 0;          // Presumably a debugging flag to dump the stack graph — not referenced in this chunk.
int     retain_rem_reads  = false;      // Presumably: keep unmerged remainder reads in the output — not referenced in this chunk.
int     deleverage_stacks = 0;          // Enable the deleveraging algorithm (reported in main's banner).
bool    remove_rep_stacks = true;       // Enable removal of highly repetitive (over-covered) stacks.
double  removal_threshold = 3.0;        // Repeat-removal cutoff, in standard deviations above mean coverage.
int     max_utag_dist     = 2;          // Max nucleotide distance allowed between primary stacks (-M).
int     max_rem_dist      = -1;         // Max distance to align secondary reads (-N); -1 means default to M+2.
bool    force_diff_len    = false;      // Allow input sequences of differing lengths.
bool    gapped_alignments = true;       // Perform gapped (indel-aware) assembly passes.
double  min_match_len     = 0.80;       // Minimum fraction of aligned length for a gapped match.
double  max_gaps          = 2.0;        // Maximum number of gaps tolerated in a gapped alignment.
int     removal_trigger;                // Computed coverage cutoff for repeat removal (set in main).

//
// For use with the multinomial model to call fixed nucleotides.
//
modelt model_type         = snp;        // SNP-calling model: snp, fixed, or bounded.
double alpha              = 0.05;       // Significance level for the model's chi-square thresholds.
61 
//
// Program entry point: the ustacks de novo assembly pipeline.
// Stages: load reads -> build primary/secondary stacks -> remove repetitive
// stacks -> assemble loci -> merge secondary reads -> gapped assembly ->
// call consensus/SNPs -> write output files.
//
int main (int argc, char* argv[]) {
try{
    parse_command_line(argc, argv);

    //
    // Set the max remainder distance to be greater than the max_utag_dist, if it is not
    // specified on the command line.
    //
    if (max_rem_dist == -1) max_rem_dist = max_utag_dist + 2;

    cerr << "ustacks parameters selected:\n"
         << "  Input file: '" << in_file   << "'\n"
         << "  Sample ID: "   << sample_id << "\n"
         << "  Min depth of coverage to create a stack (m): " << min_merge_cov << "\n"
         << "  Repeat removal algorithm: " << (remove_rep_stacks ? "enabled" : "disabled") << "\n"
         << "  Max distance allowed between stacks (M): " << max_utag_dist << "\n"
         << "  Max distance allowed to align secondary reads: " << max_rem_dist << "\n"
         << "  Max number of stacks allowed per de novo locus: " << max_subgraph << "\n"
         << "  Deleveraging algorithm: " << (deleverage_stacks ? "enabled" : "disabled") << "\n";
    if (gapped_alignments) {
        cerr << "  Gapped assembly: enabled\n"
             << "  Minimum alignment length: " << min_match_len << "\n";
    } else {
        cerr << "  Gapped assembly: disabled\n";
    }
    cerr << "  Model type: ";
    switch (model_type) {
    case snp:
        cerr << "SNP\n";
        break;
    case ::fixed:   // '::' disambiguates the enumerator from std::fixed ('using namespace std' is in effect).
        cerr << "Fixed\n";
        break;
    case bounded:
        cerr << "Bounded; lower epsilon bound: " << bound_low << "; upper bound: " << bound_high << "\n";
        break;
    default:
        DOES_NOT_HAPPEN;
        break;
    }
    cerr << "  Alpha significance level for model: " << alpha << "\n";
    if (force_diff_len)
        cerr << "  Forcing the allowance of sequences of different length.\n";
    cerr << flush;

    //
    // Set limits to call het or homozygote according to chi-square distribution with one
    // degree of freedom:
    //   http://en.wikipedia.org/wiki/Chi-squared_distribution#Table_of_.CF.872_value_vs_p-value
    //
    set_model_thresholds(alpha);

    //
    // Set the number of OpenMP parallel threads to execute.
    //
    #ifdef _OPENMP
    omp_set_num_threads(num_threads);
    #endif

    //
    // Load the reads and build primary & secondary stacks.
    //

    cerr << "\n" << "Loading RAD-Tags...";

    size_t n_reads, n_u_reads, n_r_reads, n_rr_reads;
    DNASeqHashMap radtags;
    map<int, Stack *> unique;
    map<int, Rem *>   remainders;
    load_radtags(in_file, radtags, n_reads);
    reduce_radtags(radtags, unique, remainders, n_u_reads, n_r_reads);
    // Release the read hash map's memory now that stacks have been formed.
    radtags = DNASeqHashMap();

    cerr << "\n"
         << "Loaded " << n_reads << " reads; formed:\n"
         << "  " << unique.size() << " stacks representing "
         << n_u_reads << " primary reads (" << as_percentage((double)n_u_reads/n_reads) << ")\n"
         << "  " << remainders.size() << " secondary stacks representing "
         << n_r_reads << " secondary reads (" << as_percentage((double)n_r_reads/n_reads) << ")\n"
         << flush;

    //
    // Initialize the MergedStack object.
    //
    map<int, MergedStack *> merged;
    populate_merged_tags(unique, merged);

    double cov_mean, cov_stdev, cov_max, n_used_reads;
    // cerr_copy shares cerr's buffer but carries its own formatting flags, so
    // the fixed/precision settings below don't leak into other cerr output.
    ostream cerr_copy (cerr.rdbuf());
    cerr_copy << std::fixed << std::setprecision(2);
    // Prints the most recently computed coverage statistics (captured by reference).
    auto report_cov = [&]() {
        cerr_copy << "mean=" << cov_mean << "; stdev=" << cov_stdev << "; max=" << size_t(cov_max)
                  << "; n_reads=" << size_t(n_used_reads) << "(" << as_percentage(n_used_reads/n_reads) << ")";
    };

    calc_coverage_distribution(merged, cov_mean, cov_stdev, cov_max, n_used_reads);
    cerr << "\n";
    cerr << "Stack coverage: ";
    report_cov();
    cerr << "\n";

    //
    // Remove highly repetitive stacks.
    //
    size_t n_high_cov = 0;
    if (remove_rep_stacks) {
        // Coverage cutoff: anything above mean + threshold*stdev is repetitive.
        removal_trigger = (int) floor(cov_mean + cov_stdev * removal_threshold + 1);
        cerr << "Removing repetitive stacks: cov > " << removal_trigger << " (mean+" << removal_threshold << "*stdev)...\n";
        calc_kmer_distance(merged, 1);
        assert(merged.size() == unique.size());
        n_high_cov = remove_repetitive_stacks(merged);
        cerr << "  Blacklisted " << n_high_cov << " stacks.\n";

        calc_coverage_distribution(merged, cov_mean, cov_stdev, cov_max, n_used_reads);
        cerr << "Coverage after repeat removal: ";
        report_cov();
        cerr << "\n\n";
    }

    //
    // Assemble loci (merge primary stacks).
    //
    cerr << "Assembling stacks (max. dist. M=" << max_utag_dist << ")...\n";
    calc_kmer_distance(merged, max_utag_dist);
    size_t n_blacklisted;
    merge_stacks(merged, n_blacklisted);
    call_consensus(merged, unique, remainders, false);
    cerr << "  Assembled " << unique.size()-n_high_cov << " stacks into " << merged.size()
         << "; blacklisted " << n_blacklisted << " stacks.\n";
    calc_coverage_distribution(merged, cov_mean, cov_stdev, cov_max, n_used_reads);
    cerr << "Coverage after assembling stacks: ";
    report_cov();
    cerr << "\n\n";

    //
    // Merge secondary stacks.
    //
    cerr << "Merging secondary stacks (max. dist. N=" << max_rem_dist << " from consensus)...\n";
    size_t utilized    = merge_remainders(merged, unique, remainders);
    size_t util_gapped = merge_gapped_remainders(merged, unique, remainders);
    call_consensus(merged, unique, remainders, false);
    cerr << "  Merged " << utilized+util_gapped << " out of " << n_r_reads << " secondary reads ("
         << as_percentage( (double) (utilized+util_gapped) / (double)n_r_reads) << "), "
         << util_gapped << " merged with gapped alignments.\n";
    // Secondary reads still unmerged after the ungapped/frameshift passes.
    n_rr_reads = n_r_reads - utilized - util_gapped;

    calc_coverage_distribution(merged, cov_mean, cov_stdev, cov_max, n_used_reads);
    cerr << "Coverage after merging secondary stacks: ";
    report_cov();
    cerr << "\n\n";

    //
    // Merge loci based on alignments.
    //
    if (gapped_alignments) {
        cerr << "Assembling stacks, allowing for gaps (min. match length " << as_percentage(min_match_len) << ")...\n";
        const size_t n_ungapped_loci = merged.size();
        search_for_gaps(merged);
        merge_gapped_alns(unique, remainders, merged);
        call_consensus(merged, unique, remainders, false);
        cerr << "  Assembled " << n_ungapped_loci << " stacks into " << merged.size() << " stacks.\n";

        calc_coverage_distribution(merged, cov_mean, cov_stdev, cov_max, n_used_reads);
        cerr << "Coverage after gapped assembly: ";
        report_cov();
        cerr << "\n\n";

        cerr << "Merging secondary stacks, allowing for gaps (min. match length " << as_percentage(min_match_len) << ")...\n";
        search_for_gapped_remainders(merged, unique, remainders);
        util_gapped = merge_gapped_remainders(merged, unique, remainders);
        cerr << "  Merged " << util_gapped << " out of " << n_rr_reads << " secondary reads ("
             << as_percentage( (double)util_gapped / (double)n_rr_reads) << ").\n";

        calc_coverage_distribution(merged, cov_mean, cov_stdev, cov_max, n_used_reads);
        cerr << "Coverage after merging gapped secondary stacks: ";
        report_cov();
        cerr << "\n\n";
    }

    //
    // Report how many reads were used.
    //
    cerr << "Final coverage: ";
    report_cov();
    cerr << "\n";

    //
    // Call the final consensus sequence and invoke the SNP model.
    //
    cerr << "Calling consensus sequences and haplotypes for catalog assembly...\n";
    call_consensus(merged, unique, remainders, true);

    //
    // Write output files.
    //
    cerr << "Writing tags, SNPs, and alleles files...\n";
    write_results(merged, unique, remainders);
    cerr << "done.\n";

    cerr << "ustacks is done.\n";
    return 0;
} catch(std::exception& e) {
    return stacks_handle_exceptions(e);
}}
266 
//
// Merge pairs of loci whose consensus sequences align to one another with gaps.
// A pair is merged only when each locus's single best gapped alignment points at
// the other (reciprocal best hits with mirror-image CIGARs) and the alignment
// satisfies the mismatch (max_utag_dist), gap-count (max_gaps), and length
// (min_match_len) thresholds. All loci — merged or not — are copied into a new
// map under fresh sequential IDs; the old MergedStack objects are freed.
//
int
merge_gapped_alns(map<int, Stack *> &unique, map<int, Rem *> &rem, map<int, MergedStack *> &merged)
{
    map<int, MergedStack *> new_merged;
    MergedStack *tag_1, *tag_2, *merged_tag;

    int  id        = 1;   // Sequential IDs for the rebuilt map of loci.
    uint merge_cnt = 0;   // Count of merged pairs (tallied but not reported here).

    set<int> processed;   // IDs of loci already consumed by a merge.
    string   cigar_1, cigar_2;
    vector<pair<char, uint> > cigar;

    for (auto it = merged.begin(); it != merged.end(); it++) {
        if (processed.count(it->first) > 0)
            continue;

        tag_1 = it->second;
        // Order alignments so alns[0] holds the highest percent identity.
        sort(tag_1->alns.begin(), tag_1->alns.end(), rank_alignments);

        //
        // No gapped alignments, or no optimal alignment for this stack, or
        // this stack has already been set aside.
        //
        if (tag_1->masked || tag_1->alns.size() == 0)
            continue;
        // A tie for best alignment means there is no unique partner; skip.
        if (tag_1->alns.size() > 1 && tag_1->alns[0].pct_id == tag_1->alns[1].pct_id)
            continue;

        //
        // Found one or more gapped alignments. Make sure the best alignment for each of the aligned pairs
        // of tags are reciprocal to each other.
        //
        tag_2 = merged[tag_1->alns[0].id];
        sort(tag_2->alns.begin(), tag_2->alns.end(), rank_alignments);

        if (tag_2->masked || tag_2->alns.size() == 0)
            continue;
        if (tag_2->alns.size() > 1 && tag_2->alns[0].pct_id == tag_2->alns[1].pct_id)
            continue;

        // The partner's best alignment must point back at tag_1.
        if (tag_1->id != (int)tag_2->alns[0].id)
            continue;

        // The two CIGARs must describe the same alignment seen from opposite sides.
        cigar_1 = invert_cigar(tag_1->alns[0].cigar);
        cigar_2 = tag_2->alns[0].cigar;

        if (cigar_1 == cigar_2) {
            parse_cigar(tag_1->alns[0].cigar.c_str(), cigar);

            //
            // Check that the alignment still contains fewer than
            // max_utag_dist mismatches.
            //
            if (dist(tag_1->con, tag_2->con, cigar) > max_utag_dist)
                continue;

            //
            // If the alignment has too many gaps, skip it.
            // NOTE(review): this permits up to max_gaps + 1 gaps — confirm the
            // '+ 1' is intentional and not an off-by-one.
            //
            if (tag_1->alns[0].gap_cnt > (max_gaps + 1))
                continue;

            //
            // If the alignment doesn't span enough of the two sequences, skip it.
            //
            if (tag_1->alns[0].pct_id < min_match_len)
                continue;

            //
            // Edit the sequences to accommodate any added gaps.
            //
            edit_gapped_seqs(unique, rem, tag_1, cigar);

            // 'cigar' is re-parsed here, so the gap-recording loop below uses
            // tag_2's CIGAR (the inverted perspective of tag_1's).
            parse_cigar(tag_2->alns[0].cigar.c_str(), cigar);
            edit_gapped_seqs(unique, rem, tag_2, cigar);

            //
            // Merge the tags.
            //
            merged_tag     = merge_tags(tag_1, tag_2, id);
            new_merged[id] = merged_tag;
            id++;

            //
            // Record the gaps.
            //
            uint pos = 0;
            for (uint j = 0; j < cigar.size(); j++) {
                if (cigar[j].first == 'I' || cigar[j].first == 'D')
                    merged_tag->gaps.push_back(Gap(pos, pos + cigar[j].second));
                pos += cigar[j].second;
            }

            processed.insert(tag_1->id);
            processed.insert(tag_2->id);

            merge_cnt++;
        }
    }

    //
    // Copy each remaining (unmerged) locus into the new map under a fresh ID,
    // one singleton merge_set at a time.
    //
    set<int> merge_set;
    for (auto it = merged.begin(); it != merged.end(); it++) {
        if (processed.count(it->first))
            continue;
        tag_1          = it->second;
        merge_set.insert(tag_1->id);
        tag_2          = merge_tags(merged, merge_set, id);
        new_merged[id] = tag_2;
        merge_set.clear();
        id++;
    }

    //
    // Free the memory from the old map of merged tags.
    //
    for (auto it = merged.begin(); it != merged.end(); it++)
        delete it->second;

    merged = new_merged;
    return 0;
}
389 
390 bool
rank_alignments(Aln a,Aln b)391 rank_alignments(Aln a, Aln b)
392 {
393     return a.pct_id > b.pct_id;
394 }
395 
396 int
edit_gapped_seqs(map<int,Stack * > & unique,map<int,Rem * > & rem,MergedStack * tag,vector<pair<char,uint>> & cigar)397 edit_gapped_seqs(map<int, Stack *> &unique, map<int, Rem *> &rem, MergedStack *tag, vector<pair<char, uint> > &cigar)
398 {
399     int    stack_id;
400     Stack *s;
401     Rem   *r;
402     char  *buf = new char[cigar_length_padded(cigar) + 1];
403     string seq;
404 
405     for (uint i = 0; i < tag->utags.size(); i++) {
406         stack_id = tag->utags[i];
407         s = unique[stack_id];
408 
409         s->seq->seq(buf);
410         seq = apply_cigar_to_seq(buf, cigar);
411 
412         delete s->seq;
413         s->seq = new DNANSeq(seq.length(), seq.c_str());
414     }
415 
416     for (uint i = 0; i < tag->remtags.size(); i++) {
417         stack_id = tag->remtags[i];
418         r = rem[stack_id];
419 
420         r->seq->seq(buf);
421         seq = apply_cigar_to_seq(buf, cigar);
422 
423         delete r->seq;
424         r->seq = new DNANSeq(seq.length(), seq.c_str());
425     }
426 
427     delete [] buf;
428 
429     return 0;
430 }
431 
//
// Rewrite `seq` in place according to the CIGAR operations:
//   S   - soft clip: overwrite the clipped bases with Ns.
//   D   - deletion in the read relative to the reference: insert Ns at the
//         deletion point, shifting the remaining bases right and truncating
//         at the original length so the read length stays constant.
//   I/M - insertion/match: advance the cursor without editing.
// Unknown operations are ignored. Always returns 0.
//
int
edit_gaps(vector<pair<char, uint> > &cigar, char *seq)
{
    const uint len = strlen(seq);
    uint       pos = 0;     // current cursor into seq
    string     tail;        // saved downstream bases for 'D' operations

    for (uint i = 0; i < cigar.size(); i++) {
        const char op = cigar[i].first;
        const uint n  = cigar[i].second;

        switch (op) {
        case 'S': {
            // Mask the soft-clipped region, never running past the end.
            uint stop = pos + n > len ? len : pos + n;
            while (pos < stop)
                seq[pos++] = 'N';
            break;
        }
        case 'D': {
            // Save everything downstream of the cursor, pad the deletion
            // with Ns, then restore the saved bases after the padding,
            // truncating at the original read length.
            tail.assign(seq + (pos >= len ? len : pos));

            uint stop = pos + n > len ? len : pos + n;
            while (pos < stop)
                seq[pos++] = 'N';

            for (uint t = 0, j = pos; j < len && t < tail.length(); t++, j++)
                seq[j] = tail[t];
            break;
        }
        case 'I':
        case 'M':
            pos += n;
            break;
        default:
            break;
        }
    }

    return 0;
}
499 
//
// For each locus, find other loci whose consensus may align to it with gaps.
// Candidate partners are selected by counting shared k-mers, then verified by a
// full gapped alignment; successful alignments are appended to tag_1->alns for
// merge_gapped_alns() to act on. Runs in parallel over loci with OpenMP.
//
int
search_for_gaps(map<int, MergedStack *> &merged)
{
    //
    // Search for loci that can be merged with a gapped alignment.
    //
    KmerHashMap    kmer_map;
    vector<char *> kmer_map_keys;
    MergedStack   *tag_1, *tag_2;

    //
    // OpenMP can't parallelize random access iterators, so we convert
    // our map to a vector of integer keys.
    //
    vector<int> keys;
    for (auto it = merged.begin(); it != merged.end(); it++)
        keys.push_back(it->first);

    //
    // Calculate the number of k-mers we will generate.
    // con_len is the length of the longest consensus sequence present.
    //
    size_t con_len = 0;
    for (auto miter = merged.begin(); miter != merged.end(); miter++)
        con_len = miter->second->len > con_len ? miter->second->len : con_len;
    size_t kmer_len  = set_kmer_len ? 19 : global_kmer_len;

    //
    // Calculate the minimum number of matching k-mers required for a possible sequence match.
    // NOTE(review): min_hits is unsigned — if this expression goes negative
    // (short consensus, large max_gaps) it wraps to a huge value and no
    // candidate would ever qualify; confirm con_len is always large enough.
    //
    uint min_hits = (round((double) con_len * min_match_len) - (kmer_len * max_gaps)) - kmer_len + 1;

    populate_kmer_hash(merged, kmer_map, kmer_map_keys, kmer_len);

    #pragma omp parallel private(tag_1, tag_2)
    {
        KmerHashMap::iterator h;
        vector<char *> query_kmers;
        set<string>    uniq_kmers;
        GappedAln     *aln = new GappedAln(con_len);   // per-thread aligner, freed below
        AlignRes       a;

        size_t num_kmers = con_len - kmer_len + 1;
        initialize_kmers(kmer_len, num_kmers, query_kmers);

        #pragma omp for schedule(dynamic)
        for (uint i = 0; i < keys.size(); i++) {
            tag_1 = merged[keys[i]];

            //
            // Don't compute distances for masked tags.
            //
            if (tag_1->masked) continue;

            //
            // Don't compare tags that are already at or above max_locus_stacks.
            //
            if (tag_1->utags.size() >= max_subgraph)
                continue;

            if (tag_1->len < kmer_len) continue;
            num_kmers = tag_1->len - kmer_len + 1;
            generate_kmers_lazily(tag_1->con, kmer_len, num_kmers, query_kmers);

            assert(num_kmers > 0);

            // De-duplicate the query k-mers so a repeated k-mer counts once.
            uniq_kmers.clear();
            for (uint j = 0; j < num_kmers; j++)
                uniq_kmers.insert(query_kmers[j]);

            vector<int> hits;
            //
            // Lookup the occurances of each k-mer in the kmer_map
            //
            for (auto j = uniq_kmers.begin(); j != uniq_kmers.end(); j++) {
                h = kmer_map.find(j->c_str());

                if (h != kmer_map.end())
                    for (uint k = 0; k <  h->second.size(); k++)
                        hits.push_back(h->second[k]);
            }

            //
            // Sort the vector of indexes; provides the number of hits to each allele/locus
            // and orders them largest to smallest.
            //
            sort(hits.begin(), hits.end());

            //
            // Iterate through the list of hits and collapse them down by number of kmer hits per allele.
            //
            uint hit_cnt, index, prev_id, allele_id, hits_size, stop;
            vector<pair<uint, uint>> ordered_hits;

            hits_size = hits.size();

            if (hits_size == 0)
                continue;

            prev_id = hits[0];
            index   = 0;

            do {
                hit_cnt   = 0;
                allele_id = prev_id;

                // Count the run of identical locus IDs in the sorted hits.
                while (index < hits_size && (uint) hits[index] == prev_id) {
                    hit_cnt++;
                    index++;
                }

                if (index < hits_size)
                    prev_id = hits[index];

                // Don't compare tag_1 against itself.
                // ('continue' in a do-while jumps to the loop condition below.)
                if (tag_1->id == (int) allele_id) continue;

                ordered_hits.push_back(make_pair(allele_id, hit_cnt));

            } while (index < hits_size);

            if (ordered_hits.size() == 0)
                continue;

            //
            // Process the hits from most kmer hits to least kmer hits.
            //
            sort(ordered_hits.begin(), ordered_hits.end(), compare_pair_intint);

            //
            // Align at least one locus, regardless of kmer count, or more than one locus if
            // there are sufficient k-mer hits supporting it.
            // NOTE(review): if every candidate clears min_hits, stop stays 1 and
            // only the single best candidate is aligned — confirm that is intended.
            //
            stop = 1;
            for (uint j = 1; j < ordered_hits.size(); j++)
                if (ordered_hits[j].second < min_hits) {
                    stop = j;
                    break;
                }

            //
            // Iterate through the list of hits. For each hit that has more than min_hits
            // check its full length to verify a match.
            //
            for (uint j = 0; j < stop; j++) {
                tag_2 = merged.at(ordered_hits[j].first);

                // Don't compute distances for masked tags
                if (tag_2->masked) continue;

                //
                // Don't compare tags that are already at or above max_locus_stacks.
                //
                if (tag_2->utags.size() >= max_subgraph)
                    continue;

                aln->init(tag_1->len, tag_2->len);

                if (aln->align(tag_1->con, tag_2->con)) {
                    a = aln->result();
                    // Record the alignment on tag_1 only; reciprocity is
                    // checked later in merge_gapped_alns().
                    tag_1->alns.push_back(Aln(tag_2->id, a.cigar, a.pct_id, a.gap_cnt));
                }
            }
        }

        //
        // Free the k-mers we generated for this query.
        //
        for (uint j = 0; j < query_kmers.size(); j++)
            delete [] query_kmers[j];
        query_kmers.clear();

        delete aln;
    }

    free_kmer_hash(kmer_map, kmer_map_keys);

    return 0;
}
678 
//
// Attempt to merge each remainder (secondary) read into the single locus whose
// consensus it matches best. Candidate loci are found by shared k-mers; a read
// is merged only when exactly one locus lies at the minimum distance and that
// distance is <= max_rem_dist. Reads that appear frameshifted are queued on the
// locus for the later gapped-alignment pass instead. Returns the number of
// secondary reads merged.
//
size_t
merge_remainders(map<int, MergedStack *> &merged, map<int, Stack *> &unique, map<int, Rem *> &rem)
{
    //
    // OpenMP can't parallelize random access iterators, so we convert
    // our map to a vector of integer keys.
    //
    vector<int> keys;
    size_t max_rem_len = 0;   // longest remainder sequence, used to size the per-thread buffer
    for (auto it = rem.begin(); it != rem.end(); it++) {
        keys.push_back(it->first);
        max_rem_len = it->second->seq->size() > max_rem_len ? it->second->seq->size() : max_rem_len;
    }

    //
    // Calculate the number of k-mers we will generate based on the longest sequence present.
    // If kmer_len == 0, determine the optimal length for k-mers.
    //
    size_t con_len  = 0;
    for (auto miter = merged.begin(); miter != merged.end(); miter++)
        con_len = miter->second->len > con_len ? miter->second->len : con_len;
    size_t kmer_len = set_kmer_len ? determine_kmer_length(con_len, max_rem_dist) : global_kmer_len;

    //
    // Calculate the minimum number of matching k-mers required for a possible sequence match.
    //
    int min_hits = calc_min_kmer_matches(kmer_len, max_rem_dist, con_len, set_kmer_len ? true : false);

    //cerr << "  Distance allowed between stacks: " << max_rem_dist
    //     << "; searching with a k-mer length of " << kmer_len << " (" << num_kmers << " k-mers per read); "
    //     << min_hits << " k-mer hits required.\n";

    KmerHashMap    kmer_map;
    vector<char *> kmer_map_keys;
    populate_kmer_hash(merged, kmer_map, kmer_map_keys, kmer_len);
    size_t utilized = 0;

    // Each thread accumulates its own 'utilized' count; OpenMP sums them on exit.
    #pragma omp parallel reduction(+: utilized)
    {
        KmerHashMap::iterator h;
        vector<char *> rem_kmers;
        Cigar      cigar;
        string     seq;
        char      *buf = new char[max_rem_len + 1];
        size_t     num_kmers = 0;

        #pragma omp for schedule(dynamic)
        for (uint j = 0; j < keys.size(); j++) {
            // Each key is visited by exactly one thread, so writing to *r below
            // does not race; only the shared MergedStack updates need 'critical'.
            auto it = rem.find(keys[j]);
            Rem  *r = it->second;

            //
            // Generate the k-mers for this remainder sequence
            //
            r->seq->seq(buf);
            if (r->seq->size() < kmer_len) continue;
            num_kmers = r->seq->size() - kmer_len + 1;
            generate_kmers_lazily(buf, kmer_len, num_kmers, rem_kmers);

            assert(num_kmers > 0);

            map<int, int> hits;   // locus ID -> number of shared k-mers
            //
            // Lookup the occurances of each remainder k-mer in the MergedStack k-mer map
            //
            for (uint k = 0; k < num_kmers; k++) {
                h = kmer_map.find(rem_kmers[k]);

                if (h != kmer_map.end())
                    for (uint n = 0; n < h->second.size(); n++)
                        hits[h->second[n]]++;
            }

            //
            // Iterate through the list of hits. For each hit that has more than min_hits
            // check its full length to verify a match.
            //
            map<int, int> dists;   // locus ID -> nucleotide distance to the remainder
            for (auto hit_it = hits.begin(); hit_it != hits.end(); hit_it++) {

                if (hit_it->second < min_hits)
                    continue;

                MergedStack *tag_1 = merged[hit_it->first];

                int d = dist(tag_1, buf);

                //
                // Store the distance between these two sequences if it is
                // below the maximum distance
                //
                if (d <= max_rem_dist) {
                    dists[hit_it->first] = d;
                }
            }

            //
            // Check to see if there is a uniquely low distance, if so,
            // merge this remainder tag. If not, discard it, since we
            // can't locate a single best-fitting Stack to merge it into.
            //
            map<int, int>::iterator s;
            int min_id = -1;
            int count  =  0;
            // Local 'dist' shadows the dist() helper used above.
            int dist   =  max_rem_dist + 1;

            for (s = dists.begin(); s != dists.end(); s++) {
                if ((*s).second < dist) {
                    min_id = (*s).first;
                    count  = 1;
                    dist   = (*s).second;
                } else if ((*s).second == dist) {
                    count++;
                }
            }

            //
            // Found a merge partner.
            //
            if (min_id >= 0 && count == 1) {
                MergedStack *tag_1 = merged.at(min_id);

                //
                // The max_rem_dist distance allows for the possibility of a frameshift smaller
                // or equal to max_rem_dist at the 3' end of the read. If we detect a possible
                // frameshift, queue the remainder read to re-align the read using the gapped
                // alignment algorithm.
                //
                if (check_frameshift(tag_1, buf, max_rem_dist)) {
                    #pragma omp critical
                    tag_1->rem_queue.push_back(r->id);
                    continue;
                }

                r->utilized = true;
                utilized += r->count();

                // tag_1 may be the merge target of several threads at once;
                // serialize the shared-locus updates.
                #pragma omp critical
                {
                    tag_1->remtags.push_back(r->id);
                    tag_1->count += r->count();
                }
            }
        }

        //
        // Free the k-mers we generated for this remainder read
        //
        for (uint k = 0; k < rem_kmers.size(); k++)
            delete [] rem_kmers[k];

        delete [] buf;
    }

    free_kmer_hash(kmer_map, kmer_map_keys);

    return utilized;
}
837 
838 int
search_for_gapped_remainders(map<int,MergedStack * > & merged,map<int,Stack * > & unique,map<int,Rem * > & rem)839 search_for_gapped_remainders(map<int, MergedStack *> &merged, map<int, Stack *> &unique, map<int, Rem *> &rem)
840 {
841     //
842     // Search for remainders that can be merged with a gapped alignment.
843     //
844 
845     //
846     // OpenMP can't parallelize random access iterators, so we convert
847     // our map to a vector of integer keys.
848     //
849     vector<int> keys;
850     size_t max_rem_len = 0;
851     for (auto it = rem.begin(); it != rem.end(); it++) {
852         keys.push_back(it->first);
853         max_rem_len = it->second->seq->size() > max_rem_len ? it->second->seq->size() : max_rem_len;
854     }
855 
856     //
857     // Calculate the number of k-mers we will generate.
858     //
859     size_t con_len = 0;
860     for (auto miter = merged.begin(); miter != merged.end(); miter++)
861         con_len = miter->second->len > con_len ? miter->second->len : con_len;
862     size_t kmer_len  = set_kmer_len ? 19 : global_kmer_len;
863 
864     //
865     // Calculate the minimum number of matching k-mers required for a possible sequence match.
866     //
867     int min_hits = (round((double) con_len * min_match_len) - (kmer_len * max_gaps)) - kmer_len + 1;
868 
869     //cerr << "  Searching with a k-mer length of " << kmer_len << " (" << num_kmers << " k-mers per read); " << min_hits << " k-mer hits required.\n";
870 
871     KmerHashMap    kmer_map;
872     vector<char *> kmer_map_keys;
873     populate_kmer_hash(merged, kmer_map, kmer_map_keys, kmer_len);
874 
875     #pragma omp parallel
876     {
877         KmerHashMap::iterator h;
878         vector<char *> rem_kmers;
879         set<string>    uniq_kmers;
880         GappedAln     *aln = new GappedAln(con_len);
881         AlignRes       a;
882         string         seq, buf;
883         char          *rem_buf   = new char[max_rem_len + 1];
884         size_t         num_kmers = 0;
885 
886         #pragma omp for schedule(dynamic)
887         for (uint i = 0; i < keys.size(); i++) {
888             auto it = rem.find(keys[i]);
889             Rem  *r = it->second;
890 
891             if (r->utilized) continue;
892 
893             //
894             // Generate the k-mers for this remainder sequence
895             //
896             r->seq->seq(rem_buf);
897             if (r->seq->size() < kmer_len) continue;
898             num_kmers = r->seq->size() - kmer_len + 1;
899             generate_kmers_lazily(rem_buf, kmer_len, num_kmers, rem_kmers);
900 
901             uniq_kmers.clear();
902             for (uint j = 0; j < num_kmers; j++)
903                 uniq_kmers.insert(rem_kmers[j]);
904 
905             assert(num_kmers > 0);
906 
907             map<int, int> hits;
908             //
909             // Lookup the occurances of each remainder k-mer in the MergedStack k-mer map
910             //
911             for (set<string>::iterator j = uniq_kmers.begin(); j != uniq_kmers.end(); j++) {
912                 h = kmer_map.find(j->c_str());
913 
914                 if (h != kmer_map.end())
915                     for (uint k = 0; k <  h->second.size(); k++)
916                         hits[h->second[k]]++;
917             }
918 
919             //
920             // Iterate through the list of hits. For each hit that has more than min_hits
921             // check its full length to verify a match.
922             //
923             vector<Aln> rem_alns;
924             Cigar       cigar;
925 
926             for (auto hit_it = hits.begin(); hit_it != hits.end(); hit_it++) {
927 
928                 if (hit_it->second < min_hits) continue;
929 
930                 MergedStack *tag_1 = merged[hit_it->first];
931 
932                 // Don't compute distances for masked tags
933                 if (tag_1->masked) continue;
934 
935                 //
936                 // Calculate the alignment.
937                 //
938                 aln->init(r->seq->size(), tag_1->len);
939 
940                 if (aln->align(r->seq->seq().c_str(), tag_1->con)) {
941                     a = aln->result();
942                     rem_alns.push_back(Aln(tag_1->id, a.cigar, a.pct_id, a.gap_cnt));
943                 }
944             }
945 
946             if (rem_alns.size() == 0) continue;
947 
948             //
949             // Rank the alignments for this remainder read.
950             //
951             sort(rem_alns.begin(), rem_alns.end(), rank_alignments);
952 
953             MergedStack *tag_1 = merged[rem_alns[0].id];
954             parse_cigar(rem_alns[0].cigar.c_str(), cigar);
955 
956             //
957             // Check that the alignment still contains fewer than
958             // max_utag_dist mismatches.
959             //
960             if (dist(r->seq->seq().c_str(), tag_1->con, cigar) > max_utag_dist)
961                 continue;
962 
963             //
964             // If the alignment has too many gaps, skip it.
965             //
966             if (rem_alns[0].gap_cnt > (max_gaps + 1))
967                 continue;
968 
969             //
970             // If the alignment doesn't span enough of the two sequences, skip it.
971             //
972             if (rem_alns[0].pct_id < min_match_len)
973                 continue;
974 
975             #pragma omp critical
976             tag_1->rem_queue.push_back(r->id);
977         }
978 
979         //
980         // Free the k-mers we generated for this query.
981         //
982         for (uint j = 0; j < rem_kmers.size(); j++)
983             delete [] rem_kmers[j];
984         rem_kmers.clear();
985 
986         delete [] rem_buf;
987         delete aln;
988     }
989 
990     free_kmer_hash(kmer_map, kmer_map_keys);
991 
992     return 0;
993 }
994 
995 size_t
merge_gapped_remainders(map<int,MergedStack * > & merged,map<int,Stack * > & unique,map<int,Rem * > & rem)996 merge_gapped_remainders(map<int, MergedStack *> &merged, map<int, Stack *> &unique, map<int, Rem *> &rem)
997 {
998     //
999     // OpenMP can't parallelize random access iterators, so we convert
1000     // our map to a vector of integer keys.
1001     //
1002     vector<int> keys;
1003     for (auto it = merged.begin(); it != merged.end(); it++)
1004         keys.push_back(it->first);
1005 
1006     size_t utilized = 0;
1007 
1008     #pragma omp parallel reduction(+: utilized)
1009     {
1010         Cigar      cigar;
1011         GappedAln *aln = new GappedAln(merged[keys[0]]->len);
1012         AlignRes   a;
1013         string     buf, seq;
1014 
1015         #pragma omp for schedule(dynamic)
1016         for (uint i = 0; i < keys.size(); i++) {
1017             MergedStack *tag_1 = merged.at(keys[i]);
1018 
1019             //
1020             // Don't compute distances for masked tags.
1021             //
1022             if (tag_1->masked) continue;
1023 
1024             for (uint i = 0; i < tag_1->rem_queue.size(); i++) {
1025                 Rem *r = rem.at(tag_1->rem_queue[i]);
1026 
1027                 // //
1028                 // // We want to align the last max_rem_dist * 2 nucleotides. If sequences are already the wrong length,
1029                 // // adjust so the 3' ends are lined up properly in the alignment matrix.
1030                 // //
1031                 // diff    = tag_1->len - r->seq->size();
1032                 // q_start = r->seq->size() - (max_rem_dist * 2) - 1;
1033                 // q_end   = r->seq->size() - 1;
1034                 // s_start = tag_1->len - (max_rem_dist * 2) - 1;
1035                 // s_end   = tag_1->len - 1;
1036 
1037                 // q_start = diff > 0 ? q_start + diff : q_start;
1038                 // s_start = diff < 0 ? s_start + abs(diff) : s_start;
1039 
1040                 // cerr << "Consensus size: " << tag_1->len << "; remainder size: " << r->seq->size() << "\n"
1041                 //      << "Con seq: " << tag_1->con << " (" << strlen(tag_1->con) << ")\n"
1042                 //      << "Rem seq: " << r->seq->seq() << "\n"
1043                 //      << "q_start: " << q_start << "; q_end: " << q_end << "; s_start: " << s_start << "; s_end: " << s_end << "\n";
1044 
1045                 aln->init(r->seq->size(), tag_1->len);
1046 
1047                 if (aln->align(r->seq->seq().c_str(), tag_1->con)) {
1048                     a = aln->result();
1049                     parse_cigar(a.cigar.c_str(), cigar);
1050 
1051                     // cerr << "Consensus size: " << tag_1->len << "; remainder size: " << r->seq->size() << "\n"
1052                     //      << "Con seq: " << tag_1->con << " (" << strlen(tag_1->con) << ")\n"
1053                     //      << "Rem seq: " << r->seq->seq() << "\n"
1054                     //      << "Cigar:   " << a.cigar << "\n";
1055 
1056                     //
1057                     // Check that the alignment still contains fewer than
1058                     // max_utag_dist mismatches.
1059                     //
1060                     if (dist(r->seq->seq().c_str(), tag_1->con, cigar) > max_utag_dist)
1061                         continue;
1062 
1063                     //
1064                     // If the alignment has too many gaps, skip it.
1065                     //
1066 	            if (a.gap_cnt > (max_gaps + 1))
1067                         continue;
1068 
1069                     //
1070                     // If the alignment doesn't span enough of the two sequences, skip it.
1071                     //
1072                     if (a.pct_id < min_match_len)
1073                         continue;
1074 
1075                     r->utilized   = true;
1076                     tag_1->count += r->count();
1077                     utilized     += r->count();
1078 
1079                     buf = r->seq->seq();
1080                     seq = apply_cigar_to_seq(buf.c_str(), cigar);
1081                     r->add_seq(seq.c_str());
1082                     invert_cigar(cigar);
1083 
1084                     //
1085                     // Record the gaps (but not soft-masked 3' regions.
1086                     //
1087                     uint pos = 0;
1088                     for (uint j = 0; j < cigar.size(); j++) {
1089                         if (cigar[j].first == 'I' || cigar[j].first == 'D') {
1090                             tag_1->gaps.push_back(Gap(pos, pos + cigar[j].second - 1));
1091                         }
1092                         pos += cigar[j].second;
1093                     }
1094                     edit_gapped_seqs(unique, rem, tag_1, cigar);
1095 
1096                     //
1097                     // Add this aligned remainder tag to the locus after adjusting the other reads but before adjusting the consensus.
1098                     //
1099                     tag_1->remtags.push_back(r->id);
1100                     update_consensus(tag_1, unique, rem);
1101                 }
1102             }
1103 
1104             tag_1->rem_queue.clear();
1105         }
1106     }
1107 
1108     return utilized;
1109 }
1110 
1111 int
call_alleles(MergedStack * mtag,vector<DNANSeq * > & reads,vector<read_type> & read_types)1112 call_alleles(MergedStack *mtag, vector<DNANSeq *> &reads, vector<read_type> &read_types)
1113 {
1114     int     row;
1115     int     height = reads.size();
1116     string  allele;
1117     DNANSeq *d;
1118     char    base;
1119     vector<SNP *>::iterator snp;
1120 
1121     for (row = 0; row < height; row++) {
1122         allele.clear();
1123 
1124         uint snp_cnt = 0;
1125 
1126         //
1127         // Only call a haplotype from primary reads.
1128         //
1129         if (!call_sec_hapl && read_types[row] == secondary) continue;
1130 
1131         for (snp = mtag->snps.begin(); snp != mtag->snps.end(); snp++) {
1132             if ((*snp)->type != snp_type_het) continue;
1133 
1134             snp_cnt++;
1135 
1136             d    = reads[row];
1137             base = (*d)[(*snp)->col];
1138 
1139             //
1140             // Check to make sure the nucleotide at the location of this SNP is
1141             // of one of the two possible states the multinomial model called.
1142             //
1143             if (base == (*snp)->rank_1 || base == (*snp)->rank_2)
1144                 allele += base;
1145             else
1146                 break;
1147         }
1148 
1149         if (snp_cnt > 0 && allele.length() == snp_cnt)
1150             mtag->alleles[allele]++;
1151     }
1152 
1153     return 0;
1154 }
1155 
int
call_consensus(map<int, MergedStack *> &merged, map<int, Stack *> &unique, map<int, Rem *> &rem, bool invoke_model)
{
    //
    // For every locus: stack up all of its reads (primary stacks plus merged
    // remainder reads), call a plurality consensus base for each column and,
    // when invoke_model is set, run the configured SNP model on each column
    // and call per-read haplotypes from the heterozygous sites.
    //

    //
    // OpenMP can't parallelize random access iterators, so we convert
    // our map to a vector of integer keys.
    //
    map<int, MergedStack *>::iterator it;
    vector<int> keys;
    for (it = merged.begin(); it != merged.end(); it++)
        keys.push_back(it->first);

    int i;
    #pragma omp parallel private(i)
    {
        MergedStack *mtag;
        Stack       *utag;
        Rem         *r;

        #pragma omp for schedule(dynamic)
        for (i = 0; i < (int) keys.size(); i++) {
            mtag = merged[keys[i]];

            //
            // Create a two-dimensional array, each row containing one read. For
            // each unique tag that has been merged together, add the sequence for
            // that tag into our array as many times as it originally occurred.
            //
            vector<int>::iterator j;
            vector<DNANSeq *> reads;
            vector<read_type> read_types;
            uint length = unique[mtag->utags.front()]->seq->size();

            for (j = mtag->utags.begin(); j != mtag->utags.end(); j++) {
                utag = unique[*j];

                for (uint k = 0; k < utag->count(); k++) {
                    reads.push_back(utag->seq);
                    read_types.push_back(primary);

                    // Every read in the locus must already be normalized to the same length.
                    assert(utag->seq->size() == length);
                }
            }

            // For each remainder tag that has been merged into this Stack, add the sequence.
            for (j = mtag->remtags.begin(); j != mtag->remtags.end(); j++) {
                r = rem[*j];

                for (uint k = 0; k < r->count(); k++) {
                    reads.push_back(r->seq);
                    read_types.push_back(secondary);

                    assert(r->seq->size() == length);
                }
            }

            //
            // Iterate over each column of the array and call the consensus base.
            //
            uint row, col;
            uint height = reads.size();
            string con;
            map<char, int> nuc;
            map<char, int>::iterator max;
            DNANSeq *d;

            // Index of the next gap region to skip; set past-the-end (1 > 0) when there are no gaps.
            uint cur_gap = mtag->gaps.size() > 0 ? 0 : 1;

            for (col = 0; col < length; col++) {
                nuc['A'] = 0;
                nuc['G'] = 0;
                nuc['C'] = 0;
                nuc['T'] = 0;

                for (row = 0; row < height; row++) {
                    d = reads[row];
                    // Only A/C/G/T are tallied; Ns and gap characters are ignored.
                    if (nuc.count((*d)[col]))
                        nuc[(*d)[col]]++;
                }

                //
                // Find the base with a plurality of occurances and call it.
                //
                max = nuc.end();

                for (auto n = nuc.begin(); n != nuc.end(); n++) {
                    if (max == nuc.end() || n->second > max->second)
                        max = n;
                }
                con += max->first;

                //
                // Search this column for the presence of a SNP
                //
                if (invoke_model) {
                    //
                    // Don't invoke the model within gaps. A placeholder 'unknown' SNP
                    // is recorded instead so mtag->snps stays aligned one-per-column.
                    //
                    if (cur_gap < mtag->gaps.size() &&
                        col >= mtag->gaps[cur_gap].start && col <= mtag->gaps[cur_gap].end) {
                        SNP *snp    = new SNP;
                        snp->type   = snp_type_unk;
                        snp->col    = col;
                        snp->rank_1 = '-';
                        snp->rank_2 = '-';
                        mtag->snps.push_back(snp);

                        if (col == mtag->gaps[cur_gap].end)
                            cur_gap++;
                        continue;
                    }

                    // Dispatch on the model selected on the command line.
                    switch(model_type) {
                    case snp:
                        call_multinomial_snp(mtag, col, nuc, true);
                        break;
                    case bounded:
                        call_bounded_multinomial_snp(mtag, col, nuc, true);
                        break;
                    case ::fixed:
                        call_multinomial_fixed(mtag, col, nuc);
                        break;
                    default:
                        DOES_NOT_HAPPEN;
                        break;
                    }
                }
            }

            assert(con.length() == length);

            if (invoke_model) {
                // The model must have recorded exactly one SNP object per column.
                assert(mtag->snps.size() == length);

                call_alleles(mtag, reads, read_types);

                if (model_type == ::fixed) {
                    //
                    // Mask nucleotides that are not fixed.
                    //
                    vector<SNP *>::iterator s;
                    for (s = mtag->snps.begin(); s != mtag->snps.end(); s++) {
                        if ((*s)->type == snp_type_unk)
                            con.replace((*s)->col, 1, "N");
                    }
                }
            }

            mtag->add_consensus(con.c_str());
        }
    }

    return 0;
}
1310 
1311 int
update_consensus(MergedStack * mtag,map<int,Stack * > & unique,map<int,Rem * > & rem)1312 update_consensus(MergedStack *mtag, map<int, Stack *> &unique, map<int, Rem *> &rem)
1313 {
1314     Stack *utag;
1315     Rem   *r;
1316     //
1317     // Create a two-dimensional array, each row containing one read. For
1318     // each unique tag that has been merged together, add the sequence for
1319     // that tag into our array as many times as it originally occurred.
1320     //
1321     vector<DNANSeq *> reads;
1322 
1323     for (auto j = mtag->utags.begin(); j != mtag->utags.end(); j++) {
1324         utag = unique[*j];
1325 
1326         for (uint k = 0; k < utag->count(); k++)
1327             reads.push_back(utag->seq);
1328     }
1329 
1330     // For each remainder tag that has been merged into this Stack, add the sequence.
1331     for (auto j = mtag->remtags.begin(); j != mtag->remtags.end(); j++) {
1332         r = rem[*j];
1333 
1334         for (uint k = 0; k < r->count(); k++)
1335             reads.push_back(r->seq);
1336     }
1337 
1338     //
1339     // Iterate over each column of the array and call the consensus base.
1340     //
1341     string   con;
1342     uint     row, col;
1343     uint     length = reads[0]->size();
1344     uint     height = reads.size();
1345     DNANSeq *d;
1346 
1347     vector<uint> nucs = {0,  // A
1348                          0,  // C
1349                          0,  // G
1350                          0,  // T
1351                          0}; // N
1352 
1353     con.reserve(length);
1354 
1355     for (col = 0; col < length; col++) {
1356         nucs.assign(5, 0);
1357 
1358         for (row = 0; row < height; row++) {
1359             d = reads[row];
1360             switch ((*d)[col]) {
1361             case 'A':
1362                 nucs[0]++;
1363                 break;
1364             case 'C':
1365                 nucs[1]++;
1366                 break;
1367             case 'G':
1368                 nucs[2]++;
1369                 break;
1370             case 'T':
1371                 nucs[3]++;
1372                 break;
1373             case 'N':
1374                 nucs[4]++;
1375                 break;
1376             }
1377         }
1378 
1379         //
1380         // Call the base with a plurality of occurances. Only call 'N' if there is no other information.
1381         //
1382         uint max = 0;
1383         char nuc = 'N';
1384 
1385         for (uint i = 0; i < 4; i++) {
1386             switch (i) {
1387             case 0:
1388                 if (nucs[i] > max) {
1389                     max = nucs[i];
1390                     nuc = 'A';
1391                 }
1392                 break;
1393             case 1:
1394                 if (nucs[i] > max) {
1395                     max = nucs[i];
1396                     nuc = 'C';
1397                 }
1398                 break;
1399             case 2:
1400                 if (nucs[i] > max) {
1401                     max = nucs[i];
1402                     nuc = 'G';
1403                 }
1404                 break;
1405             case 3:
1406                 if (nucs[i] > max) {
1407                     max = nucs[i];
1408                     nuc = 'T';
1409                 }
1410                 break;
1411             }
1412         }
1413         con += nuc;
1414     }
1415 
1416     mtag->add_consensus(con.c_str());
1417 
1418     return 0;
1419 }
1420 
1421 int
populate_merged_tags(map<int,Stack * > & unique,map<int,MergedStack * > & merged)1422 populate_merged_tags(map<int, Stack *> &unique, map<int, MergedStack *> &merged)
1423 {
1424     map<int, Stack *>::iterator i;
1425     map<int, MergedStack *>::iterator it_new, it_old;
1426     Stack       *utag;
1427     MergedStack *mtag;
1428     int          k = 0;
1429 
1430     it_old = merged.begin();
1431 
1432     for (i = unique.begin(); i != unique.end(); i++) {
1433         utag = (*i).second;
1434         mtag = new MergedStack;
1435 
1436         mtag->id    = k;
1437         mtag->count = utag->count();
1438         mtag->utags.push_back(utag->id);
1439         mtag->add_consensus(utag->seq);
1440 
1441         // Insert the new MergedStack giving a hint as to which position
1442         // to insert it at.
1443         it_new = merged.insert(it_old, pair<int, MergedStack *>(k, mtag));
1444         it_old = it_new;
1445         k++;
1446     }
1447 
1448     return 0;
1449 }
1450 
1451 void
merge_stacks(map<int,MergedStack * > & merged,size_t & blist_cnt)1452 merge_stacks(map<int, MergedStack *> &merged, size_t& blist_cnt)
1453 {
1454     map<int, MergedStack *> new_merged;
1455     map<int, MergedStack *>::const_iterator it;
1456     map<int, MergedStack *>::iterator it_old, it_new;
1457     MergedStack *tag_1, *tag_2;
1458     set<int> merge_map;
1459     vector<set<int> > merge_lists;
1460 
1461     uint index     = 0;
1462     int  cohort_id  = 0;
1463     int  id         = 1;
1464     uint delev_cnt = 0;
1465     blist_cnt = 0;
1466 
1467     for (it = merged.begin(); it != merged.end(); it++) {
1468         tag_1 = it->second;
1469 
1470         //
1471         // This tag may already have been merged by an earlier operation.
1472         //
1473         if (merge_map.count(tag_1->id) > 0)
1474             continue;
1475 
1476         queue<int>                        merge_list;
1477         pair<set<int>::iterator,bool>     ret;
1478         vector<pair<int, int> >::iterator k;
1479 
1480         merge_lists.push_back(set<int>());
1481 
1482         if (tag_1->masked) {
1483             merge_lists[index].insert(tag_1->id);
1484             index++;
1485             continue;
1486         }
1487 
1488         //
1489         // Construct a list of MergedStacks that are within a particular distance
1490         // of this tag.
1491         //
1492         merge_lists[index].insert(tag_1->id);
1493         merge_list.push(tag_1->id);
1494 
1495         while (!merge_list.empty()) {
1496             tag_2 = merged[merge_list.front()];
1497             merge_list.pop();
1498 
1499             for (k = tag_2->dist.begin(); k != tag_2->dist.end(); k++) {
1500                 ret = merge_lists[index].insert(k->first);
1501 
1502                 //
1503                 // If this Tag has not already been added to the merge list (i.e. we were able
1504                 // to insert it in to our unique_merge_list, which is a set), add it for consideration
1505                 // later in the loop.
1506                 //
1507                 if (ret.second == true)
1508                     merge_list.push((*k).first);
1509             }
1510         }
1511 
1512         //
1513         // Record the nodes that have been merged in this round.
1514         //
1515         set<int>::iterator j;
1516         for (j = merge_lists[index].begin(); j != merge_lists[index].end(); j++)
1517             merge_map.insert(*j);
1518 
1519         index++;
1520     }
1521 
1522     #pragma omp parallel private(tag_1, tag_2)
1523     {
1524         vector<MergedStack *> merged_tags;
1525 
1526         #pragma omp for reduction(+:delev_cnt) reduction(+:blist_cnt)
1527         for (uint index = 0; index < merge_lists.size(); index++) {
1528             //
1529             // Deal with the simple case of a single locus that does not need to be merged.
1530             //
1531             if (merge_lists[index].size() == 1) {
1532                 tag_1 = merged[*(merge_lists[index].begin())];
1533                 tag_2 = merge_tags(merged, merge_lists[index], 0);
1534 
1535                 //
1536                 // If this tag is masked, keep the old cohort_id.
1537                 //
1538                 if (tag_1->masked) {
1539                     tag_2->cohort_id = tag_1->cohort_id;
1540                 } else {
1541                     tag_2->cohort_id = cohort_id;
1542                     #pragma omp atomic
1543                     cohort_id++;
1544                 }
1545                 merged_tags.push_back(tag_2);
1546                 continue;
1547             }
1548 
1549             //
1550             // Break large loci down by constructing a minimum
1551             // spanning tree and severing long distance edges.
1552             //
1553             if (deleverage_stacks) {
1554                 vector<MergedStack *> tags;
1555                 bool delev;
1556 
1557                 deleverage(merged, merge_lists[index], cohort_id, tags);
1558 
1559                 if (tags.size() == 1) {
1560                     delev = false;
1561                 } else {
1562                     delev_cnt++;
1563                     delev = true;
1564                 }
1565 
1566                 for (uint t = 0; t < tags.size(); t++) {
1567                     //tags[t]->id = id;
1568                     tags[t]->deleveraged = delev;
1569 
1570                     if (tags[t]->utags.size() > max_subgraph) {
1571                         tags[t]->masked      = true;
1572                         tags[t]->blacklisted = true;
1573                         blist_cnt++;
1574                     }
1575 
1576                     //new_merged.insert(pair<int, MergedStack *>(id, tags[t]));
1577                     merged_tags.push_back(tags[t]);
1578                     //id++;
1579                 }
1580 
1581                 #pragma omp atomic
1582                 cohort_id++;
1583 
1584             } else {
1585                 //
1586                 // If not deleveraging, merge these tags together into a new MergedStack object.
1587                 //
1588                 tag_2 = merge_tags(merged, merge_lists[index], 0);
1589                 tag_2->cohort_id = cohort_id;
1590 
1591                 if (tag_2->utags.size() > max_subgraph) {
1592                     tag_2->masked      = true;
1593                     tag_2->blacklisted = true;
1594                     blist_cnt++;
1595                 }
1596 
1597                 //new_merged.insert(pair<int, MergedStack *>(id, tag_2));
1598                 merged_tags.push_back(tag_2);
1599 
1600                 #pragma omp atomic
1601                 cohort_id++;
1602                 //id++;
1603             }
1604         }
1605 
1606         //
1607         // Merge the accumulated tags into the new_merged map.
1608         //
1609         #pragma omp critical
1610         {
1611             it_old = merged.begin();
1612             for (uint j = 0; j < merged_tags.size(); j++) {
1613                 merged_tags[j]->id = id;
1614                 it_new = new_merged.insert(it_old, pair<int, MergedStack *>(id, merged_tags[j]));
1615                 it_old = it_new;
1616                 id++;
1617             }
1618         }
1619     }
1620 
1621     if (new_merged.empty()) {
1622         cerr << "Error: Couldn't assemble any loci.\n";
1623         throw exception();
1624     }
1625 
1626     //
1627     // Free the memory from the old map of merged tags.
1628     //
1629     for (it = merged.begin(); it != merged.end(); it++)
1630         delete it->second;
1631 
1632     swap(merged, new_merged);
1633 }
1634 
1635 MergedStack *
merge_tags(MergedStack * tag_1,MergedStack * tag_2,int id)1636 merge_tags(MergedStack *tag_1, MergedStack *tag_2, int id)
1637 {
1638     MergedStack *new_tag;
1639 
1640     new_tag     = new MergedStack;
1641     new_tag->id = id;
1642 
1643     new_tag->deleveraged      = tag_2->deleveraged      || tag_1->deleveraged;
1644     new_tag->masked           = tag_2->masked           || tag_1->masked;
1645     new_tag->blacklisted      = tag_2->blacklisted      || tag_1->blacklisted;
1646     new_tag->gappedlumberjack = tag_2->gappedlumberjack || tag_1->gappedlumberjack;
1647     new_tag->lumberjackstack  = tag_2->lumberjackstack  || tag_1->lumberjackstack;
1648 
1649     for (uint i = 0; i < tag_1->utags.size(); i++)
1650         new_tag->utags.push_back(tag_1->utags[i]);
1651 
1652     for (uint i = 0; i < tag_1->remtags.size(); i++)
1653         new_tag->remtags.push_back(tag_1->remtags[i]);
1654 
1655     for (uint i = 0; i < tag_1->gaps.size(); i++)
1656         new_tag->gaps.push_back(tag_1->gaps[i]);
1657 
1658     new_tag->count = tag_1->count;
1659 
1660     for (uint i = 0; i < tag_2->utags.size(); i++)
1661         new_tag->utags.push_back(tag_2->utags[i]);
1662 
1663     for (uint i = 0; i < tag_2->remtags.size(); i++)
1664         new_tag->remtags.push_back(tag_2->remtags[i]);
1665 
1666     for (uint i = 0; i < tag_2->gaps.size(); i++)
1667         new_tag->gaps.push_back(tag_2->gaps[i]);
1668 
1669     new_tag->count += tag_2->count;
1670 
1671     return new_tag;
1672 }
1673 
merge_tags(map<int,MergedStack * > & merged,set<int> & merge_list,int id)1674 MergedStack *merge_tags(map<int, MergedStack *> &merged, set<int> &merge_list, int id) {
1675     MergedStack *tag_1, *tag_2;
1676 
1677     tag_1     = new MergedStack;
1678     tag_1->id = id;
1679 
1680     for (auto i = merge_list.begin(); i != merge_list.end(); i++) {
1681         tag_2 = merged[(*i)];
1682 
1683         tag_1->deleveraged     = tag_2->deleveraged     ? true : tag_1->deleveraged;
1684         tag_1->masked          = tag_2->masked          ? true : tag_1->masked;
1685         tag_1->blacklisted     = tag_2->blacklisted     ? true : tag_1->blacklisted;
1686         tag_1->lumberjackstack = tag_2->lumberjackstack ? true : tag_1->lumberjackstack;
1687 
1688         for (auto j = tag_2->utags.begin(); j != tag_2->utags.end(); j++)
1689             tag_1->utags.push_back(*j);
1690 
1691         for (auto j = tag_2->remtags.begin(); j != tag_2->remtags.end(); j++)
1692             tag_1->remtags.push_back(*j);
1693 
1694         for (auto j = tag_2->gaps.begin(); j != tag_2->gaps.end(); j++)
1695             tag_1->gaps.push_back(*j);
1696 
1697         tag_1->count += tag_2->count;
1698     }
1699 
1700     return tag_1;
1701 }
1702 
merge_tags(map<int,MergedStack * > & merged,int * merge_list,int merge_list_size,int id)1703 MergedStack *merge_tags(map<int, MergedStack *> &merged, int *merge_list, int merge_list_size, int id) {
1704     int                   i;
1705     vector<int>::iterator j;
1706     MergedStack *tag_1, *tag_2;
1707 
1708     tag_1     = new MergedStack;
1709     tag_1->id = id;
1710 
1711     for (i = 0; i < merge_list_size; i++) {
1712         tag_2 = merged[merge_list[i]];
1713 
1714         tag_1->deleveraged     = tag_2->deleveraged     ? true : tag_1->deleveraged;
1715         tag_1->masked          = tag_2->masked          ? true : tag_1->masked;
1716         tag_1->blacklisted     = tag_2->blacklisted     ? true : tag_1->blacklisted;
1717         tag_1->lumberjackstack = tag_2->lumberjackstack ? true : tag_1->lumberjackstack;
1718 
1719         for (j = tag_2->utags.begin(); j != tag_2->utags.end(); j++)
1720             tag_1->utags.push_back(*j);
1721 
1722         for (j = tag_2->remtags.begin(); j != tag_2->remtags.end(); j++)
1723             tag_1->remtags.push_back(*j);
1724 
1725         for (auto j = tag_2->gaps.begin(); j != tag_2->gaps.end(); j++)
1726             tag_1->gaps.push_back(*j);
1727 
1728         tag_1->count += tag_2->count;
1729     }
1730 
1731     return tag_1;
1732 }
1733 
size_t
remove_repetitive_stacks(map<int, MergedStack *> &merged)
{
    //
    // If enabled, check the depth of coverage of each unique tag, and remove
    // from consideration any tags with depths greater than removal_trigger. These tags
    // are likely to be multiple repetitive sites that have been merged together.
    // Because large stacks of unique tags are likely to also generate many one-off
    // sequencing error reads, remove all sequences that are a distance of one from
    // the RAD-Tag with high depth of coverage.
    //
    // Rebuilds 'merged' in place (IDs are renumbered from 0) and returns the
    // number of stacks removed, i.e. absorbed into masked/blacklisted
    // "lumberjack" stacks.
    //
    // NOTE(review): assumes each MergedStack's 'dist' neighbor list has
    // already been populated (e.g. by calc_kmer_distance) -- confirm the
    // call order in the caller.
    //
    map<int, MergedStack *>::iterator i;
    vector<pair<int, int> >::iterator k;
    map<int, MergedStack *> new_merged;
    MergedStack *tag_1, *tag_2;
    set<int> already_merged;

    const size_t initial_stacks = merged.size();

    //
    // First, iterate through the stacks and populate a list of tags that will be removed
    // (those above the removal_trigger and those 1 nucleotide away). Sort the list of
    // stacks so that we process them from largest depth to shortest so the same stacks
    // are always merged/removed.
    //
    vector<pair<int, int> > ordered_tags;
    for (i = merged.begin(); i != merged.end(); i++) {
        if (i->second->count > removal_trigger)
            ordered_tags.push_back(make_pair(i->second->id, i->second->count));
    }
    sort(ordered_tags.begin(), ordered_tags.end(), compare_pair_intint);

    pair<set<int>::iterator,bool> ret;
    int id = 0;

    //
    // Merge all stacks that are over the removal trigger with their nearest neighbors and
    // mask them so they are not further processed by the program.
    //
    for (uint j = 0; j < ordered_tags.size(); j++) {
        tag_1 = merged[ordered_tags[j].first];

        //
        // Don't process a tag that has already been merged.
        //
        if (already_merged.count(tag_1->id) > 0)
            continue;

        //
        // Construct a list of MergedStacks that are either:
        //   within a distance of 1 nucleotide of this tag, or
        //   are they themselves above the lumberjack stacks limit.
        //
        // Breadth-first traversal over the precomputed neighbor lists.
        //
        queue<int> merge_queue;
        set<int>   merge_list;
        merge_queue.push(tag_1->id);
        merge_list.insert(tag_1->id);
        already_merged.insert(tag_1->id);

        while (!merge_queue.empty()) {
            tag_2 = merged[merge_queue.front()];
            merge_queue.pop();

            //
            // Only expand the search from stacks at or above the trigger;
            // neighbors below it are absorbed but not traversed further.
            // NOTE(review): seed selection above uses '> removal_trigger'
            // while this test uses '< removal_trigger', so stacks exactly at
            // the trigger are never seeds but do propagate merges -- confirm
            // this asymmetry is intended.
            //
            if (tag_2->count < removal_trigger)
                continue;

            for (k = tag_2->dist.begin(); k != tag_2->dist.end(); k++) {
                ret = already_merged.insert(k->first);

                // insert() returns true only for unseen tags, preventing cycles.
                if (ret.second == true) {
                    merge_queue.push(k->first);
                    merge_list.insert(k->first);
                }
            }
        }

        //
        // Merge these tags together into a new MergedStack object, marked so
        // it is excluded from downstream processing.
        //
        tag_2 = merge_tags(merged, merge_list, id);
        tag_2->add_consensus(tag_1->con);

        tag_2->lumberjackstack = true;
        tag_2->masked          = true;
        tag_2->blacklisted     = true;

        new_merged.insert(make_pair(id, tag_2));
        id++;
    }

    //
    // Move the non-lumberjack stacks, unmodified, into the new merged map.
    //
    size_t remaining_stacks = 0;
    for (i = merged.begin(); i != merged.end(); i++) {
        tag_1 = i->second;

        if (already_merged.count(tag_1->id) > 0)
            continue;
        ++remaining_stacks;

        // Re-wrap the surviving stack under its new sequential ID.
        set<int> merge_list;
        merge_list.insert(tag_1->id);

        tag_2 = merge_tags(merged, merge_list, id);
        tag_2->add_consensus(tag_1->con);

        new_merged.insert(make_pair(id, tag_2));
        id++;
    }

    //
    // Free the memory from the old map of merged tags.
    //
    map<int, MergedStack *>::iterator it;
    for (it = merged.begin(); it != merged.end(); it++)
        delete it->second;

    merged.swap(new_merged);
    return initial_stacks - remaining_stacks;
}
1855 
int deleverage(map<int, MergedStack *> &merged,
               set<int> &merge_list,
               int cohort_id,
               vector<MergedStack *> &deleveraged_tags) {
    //
    // Attempt to split an over-merged group of stacks (merge_list) back into
    // separate loci. Builds a fully-connected graph of pairwise distances
    // between the stacks, computes a minimum spanning tree, then cuts the
    // tree at any edge longer than the minimum observed edge distance. Each
    // resulting subtree is merged into its own MergedStack (tagged with
    // cohort_id) and appended to deleveraged_tags. Always returns 0.
    //
    set<int>::iterator it;
    vector<pair<int, int> >::iterator j;
    MergedStack *tag_1, *tag_2;
    uint k, l;

    //
    // Create a minimum spanning tree in order to determine the minimum distance
    // between each node in the list.
    //
    mst::MinSpanTree *mst = new mst::MinSpanTree();
    vector<int>  keys;

    for (it = merge_list.begin(); it != merge_list.end(); it++) {
        keys.push_back(*it);
        mst->add_node(*it);
        tag_1 = merged[*it];

        // cerr << "  " << *it << " -> " << tag_1->utags[0] << "\n";
    }

    //
    // Measure the distance between each pair of nodes and add edges to our
    // minimum spanning tree (each edge is added in both directions).
    //
    mst::Node *n_1, *n_2;
    for (k = 0; k < keys.size(); k++) {
        tag_1 = merged[keys[k]];
        n_1   = mst->node(keys[k]);

        for (l = k+1; l < keys.size(); l++) {
            tag_2 = merged[keys[l]];
            n_2   = mst->node(keys[l]);

            int d = dist(tag_1, tag_2);

            n_1->add_edge(mst->node(keys[l]), d);
            n_2->add_edge(mst->node(keys[k]), d);
        }
    }

    //
    // Build the minimum spanning tree.
    //
    mst->build_tree();

    //
    // Visualize the MST
    //
    if (dump_graph) {
        stringstream gout_file;
        gout_file << prefix_path << "_" << keys[0] << ".dot";
        string vis = mst->vis(true);
        ofstream gvis(gout_file.str().c_str());
        gvis << vis;
        gvis.close();
    }

    //
    // Breadth-first walk of the MST to collect the set of distinct edge
    // distances used by the tree.
    //
    set<int> visited;
    set<int, int_increasing> dists;
    queue<mst::Node *> q;

    mst::Node *n = mst->head();
    q.push(n);

    while (!q.empty()) {
        n = q.front();
        q.pop();
        visited.insert(n->id);

        for (uint i = 0; i < n->min_adj_list.size(); i++) {
            if (visited.count(n->min_adj_list[i]->id) == 0) {
                q.push(n->min_adj_list[i]);
                // cerr << n->id << " -> " << n->min_adj_list[i]->id << ": ";

                //
                // Find the edge distance.
                //
                for (uint j = 0; j < n->edges.size(); j++)
                    if (n->edges[j]->child == n->min_adj_list[i]) {
                        // cerr << n->edges[j]->dist << "\n";
                        dists.insert(n->edges[j]->dist);
                    }
            }
        }
    }

    //
    // This set is sorted by definition. Check if there is more than a single
    // distance separating stacks. If not, the group cannot be subdivided and
    // is merged into one locus as-is.
    //
    if (dists.size() == 1) {
        tag_1 = merge_tags(merged, merge_list, 0);
        deleveraged_tags.push_back(tag_1);
        delete mst;
        return 0;
    }

    uint min_dist = *(dists.begin());

    //
    // If there is more than a single distance, split the minimum spanning tree
    // into separate loci, by cutting the tree at the larger distances. A second
    // BFS accumulates node IDs into uniq_merge_list; whenever an edge longer
    // than min_dist is crossed, the accumulated nodes are flushed into a new
    // locus and the accumulator cleared.
    //
    set<int> uniq_merge_list;
    visited.clear();
    n = mst->head();
    q.push(n);
    int id = 0;

    uniq_merge_list.insert(n->id);
    while (!q.empty()) {
        n = q.front();
        q.pop();
        visited.insert(n->id);

        for (uint i = 0; i < n->min_adj_list.size(); i++) {
            if (visited.count(n->min_adj_list[i]->id) == 0) {
                q.push(n->min_adj_list[i]);

                for (uint j = 0; j < n->edges.size(); j++) {
                    if (n->edges[j]->child == n->min_adj_list[i])
                        if (n->edges[j]->dist > min_dist) {

                            // cerr << "Merging the following stacks into a locus:\n";
                            for (it = uniq_merge_list.begin(); it != uniq_merge_list.end(); it++) {
                                tag_1 = merged[*it];
                                // cerr << "  " << *it << " -> " << tag_1->utags[0] << "\n";
                            }

                            tag_1 = merge_tags(merged, uniq_merge_list, id);
                            tag_1->cohort_id = cohort_id;
                            deleveraged_tags.push_back(tag_1);
                            uniq_merge_list.clear();
                            id++;
                        }
                }

                // The child node starts (or joins) the next accumulated locus.
                uniq_merge_list.insert(n->min_adj_list[i]->id);
            }
        }
    }

    //
    // Flush the final accumulated set of nodes into the last locus.
    //
    // cerr << "Merging the following stacks into a locus:\n";
    for (it = uniq_merge_list.begin(); it != uniq_merge_list.end(); it++) {
        tag_1 = merged[*it];
        // cerr << "  " << *it << " -> " << tag_1->utags[0] << "\n";
    }

    tag_1 = merge_tags(merged, uniq_merge_list, id);
    tag_1->cohort_id = cohort_id;
    deleveraged_tags.push_back(tag_1);
    uniq_merge_list.clear();

    delete mst;

    return 0;
}
2017 
int
calc_kmer_distance(map<int, MergedStack *> &merged, int utag_dist)
{
    //
    // Calculate the distance (number of mismatches) between each pair
    // of RAD-Tags, using a k-mer filter to avoid a full all-vs-all
    // comparison: only candidate pairs sharing at least min_hits k-mers
    // are measured. Distances <= utag_dist are recorded in each tag's
    // 'dist' list (sorted), to be consumed by the merging step.
    //
    KmerHashMap    kmer_map;
    vector<char *> kmer_map_keys;
    MergedStack   *tag_1, *tag_2;
    map<int, MergedStack *>::iterator it;

    // OpenMP can't parallelize random access iterators, so we convert
    // our map to a vector of integer keys.
    vector<int> keys;
    for (it = merged.begin(); it != merged.end(); it++)
        keys.push_back(it->first);

    //
    // Calculate the number of k-mers we will generate. If kmer_len == 0,
    // determine the optimal length for k-mers. con_len is the longest
    // consensus sequence present.
    //
    size_t con_len   = 0;
    for (auto miter = merged.begin(); miter != merged.end(); miter++)
        con_len = miter->second->len > con_len ? miter->second->len : con_len;
    size_t kmer_len  = set_kmer_len ? determine_kmer_length(con_len, utag_dist) : global_kmer_len;

    //
    // Calculate the minimum number of matching k-mers required for a possible sequence match.
    //
    int min_hits = calc_min_kmer_matches(kmer_len, utag_dist, con_len, set_kmer_len ? true : false);

    //cerr << "  Distance allowed between stacks: " << utag_dist
    //     << "; searching with a k-mer length of " << kmer_len << " (" << num_kmers << " k-mers per read); "
    //     << min_hits << " k-mer hits required.\n";

    populate_kmer_hash(merged, kmer_map, kmer_map_keys, kmer_len);

    // Each thread owns its query k-mer buffers; kmer_map is shared read-only.
    #pragma omp parallel private(tag_1, tag_2)
    {
        KmerHashMap::iterator h;
        vector<char *>        query_kmers;

        size_t num_kmers = con_len - kmer_len + 1;
        initialize_kmers(kmer_len, num_kmers, query_kmers);

        #pragma omp for schedule(dynamic)
        for (uint i = 0; i < keys.size(); i++) {
            tag_1 = merged[keys[i]];

            // Don't compute distances for masked tags
            if (tag_1->masked) continue;

            // Skip tags shorter than a single k-mer.
            if (tag_1->len < kmer_len) continue;
            num_kmers = tag_1->len - kmer_len + 1;
            generate_kmers_lazily(tag_1->con, kmer_len, num_kmers, query_kmers);

            // hits: candidate tag ID -> number of shared k-mers.
            map<int, int> hits;
            int d;
            //
            // Lookup the occurrences of each k-mer in the kmer_map
            //
            for (uint j = 0; j < num_kmers; j++) {
                h = kmer_map.find(query_kmers[j]);

                if (h != kmer_map.end())
                    for (uint k = 0; k <  h->second.size(); k++)
                        hits[h->second[k]]++;
            }

            //
            // Iterate through the list of hits. For each hit that has more than min_hits
            // check its full length to verify a match.
            //
            map<int, int>::iterator hit_it;
            for (hit_it = hits.begin(); hit_it != hits.end(); hit_it++) {

                if (hit_it->second < min_hits) continue;

                tag_2 = merged[hit_it->first];

                // Don't compute distances for masked tags
                if (tag_2->masked) continue;

                // Don't compare tag_1 against itself.
                if (tag_1 == tag_2) continue;

                d = dist(tag_1, tag_2);

                //
                // Check if any of the mismatches occur at the 3' end of the read. If they
                // do, they may indicate a frameshift is present at the 3' end of the read,
                // which will cause problems when we try to merge loci across samples.
                // If found, do not merge these tags, leave them for the gapped alignment
                // algorithm.
                //
                if (d <= utag_dist && check_frameshift(tag_1, tag_2, (size_t) utag_dist))
                    continue;

                //
                // Store the distance between these two sequences if it is
                // below the maximum distance. These sequences will then be
                // merged in the following step of the algorithm.
                //
                if (d <= utag_dist)
                    tag_1->add_dist(tag_2->id, d);
            }
            // Sort the vector of distances.
            sort(tag_1->dist.begin(), tag_1->dist.end(), compare_dist);
        }

        //
        // Free the k-mers we generated for this query
        //
        for (uint j = 0; j < query_kmers.size(); j++)
            delete [] query_kmers[j];
    }

    free_kmer_hash(kmer_map, kmer_map_keys);

    return 0;
}
2140 
2141 //int calc_distance(map<int, MergedStack *> &merged, int utag_dist) {
2142 //    //
2143 //    // Calculate the distance (number of mismatches) between each pair
2144 //    // of Radtags. We expect all radtags to be the same length;
2145 //    //
2146 //    map<int, MergedStack *>::iterator it;
2147 //    MergedStack *tag_1, *tag_2;
2148 //    int i, j;
2149 //
2150 //    // OpenMP can't parallelize random access iterators, so we convert
2151 //    // our map to a vector of integer keys.
2152 //    vector<int> keys;
2153 //    for (it = merged.begin(); it != merged.end(); it++)
2154 //        keys.push_back(it->first);
2155 //
2156 //    #pragma omp parallel private(i, j, tag_1, tag_2)
2157 //    {
2158 //        #pragma omp for schedule(dynamic)
2159 //        for (i = 0; i < (int) keys.size(); i++) {
2160 //
2161 //            tag_1 = merged[keys[i]];
2162 //
2163 //            // Don't compute distances for masked tags
2164 //            if (tag_1->masked) continue;
2165 //
2166 //            int d;
2167 //
2168 //            for (j = 0; j < (int) keys.size(); j++) {
2169 //                tag_2 = merged[keys[j]];
2170 //
2171 //                // Don't compute distances for masked tags
2172 //                if (tag_2->masked) continue;
2173 //
2174 //                // Don't compare tag_1 against itself.
2175 //                if (tag_1 == tag_2) continue;
2176 //
2177 //                d = dist(tag_1, tag_2);
2178 //                //cerr << "  Distance: " << d << "\n";
2179 //
2180 //                //
2181 //                // Store the distance between these two sequences if it is
2182 //                // below the maximum distance (which governs those
2183 //                // sequences to be merged in the following step of the
2184 //                // algorithm.)
2185 //                //
2186 //                if (d == utag_dist) {
2187 //                    tag_1->add_dist(tag_2->id, d);
2188 //                    //cerr << "  HIT.\n";
2189 //                }
2190 //            }
2191 //
2192 //            // Sort the vector of distances.
2193 //            sort(tag_1->dist.begin(), tag_1->dist.end(), compare_dist);
2194 //        }
2195 //    }
2196 //
2197 //    return 0;
2198 //}
2199 
reduce_radtags(DNASeqHashMap & radtags,map<int,Stack * > & unique,map<int,Rem * > & rem,size_t & n_u_reads,size_t & n_r_reads)2200 void reduce_radtags(
2201         DNASeqHashMap &radtags, map<int, Stack *> &unique,
2202         map<int, Rem *> &rem,
2203         size_t& n_u_reads,
2204         size_t& n_r_reads
2205         ) {
2206     n_u_reads = 0;
2207     n_r_reads = 0;
2208     DNASeqHashMap::iterator it;
2209 
2210     Rem   *r;
2211     Stack *u;
2212     int   global_id = 1;
2213 
2214     for (it = radtags.begin(); it != radtags.end(); it++) {
2215         if (it->second.count() < min_merge_cov) {
2216             //
2217             // Don't record this unique RAD-Tag if its coverage is below
2218             // the specified cutoff. However, add the reads to the remainder
2219             // vector for later processing.
2220             //
2221             r     = new Rem;
2222             r->id = global_id;
2223             r->add_seq(&it->first);
2224 
2225             for (uint i = 0; i < it->second.ids.size(); i++)
2226                 r->add_id(it->second.ids[i]);
2227             n_r_reads += r->map.size();
2228 
2229             rem[r->id] = r;
2230 
2231         } else {
2232             //
2233             // Populate a Stack object for this unique radtag. Create a
2234             // map of the IDs for the sequences that have been
2235             // collapsed into this radtag.
2236             //
2237             u     = new Stack;
2238             u->id = global_id;
2239             u->add_seq(&it->first);
2240 
2241             // Copy the original Fastq IDs from which this unique radtag was built.
2242             for (uint i = 0; i < it->second.ids.size(); i++)
2243                 u->add_id(it->second.ids[i]);
2244             n_u_reads += u->map.size();
2245 
2246             unique[u->id] = u;
2247         }
2248         if (global_id == INT_MAX)
2249             throw std::overflow_error("overflow in reduce_radtags()");
2250         ++global_id;
2251     }
2252 
2253     if (unique.size() == 0) {
2254         cerr << "Error: Unable to form any primary stacks.\n";
2255         exit(1);
2256     }
2257 }
2258 
2259 void
calc_coverage_distribution(map<int,MergedStack * > & merged,double & mean,double & stdev,double & max,double & tot_depth)2260 calc_coverage_distribution(map<int, MergedStack *> &merged,
2261                            double &mean, double &stdev, double &max, double &tot_depth)
2262 {
2263     max   = 0.0;
2264     mean = 0.0;
2265     stdev = 0.0;
2266     tot_depth = 0.0;
2267 
2268     size_t not_blacklisted = 0;
2269     for (const pair<int, MergedStack*>& mtag : merged) {
2270         if (mtag.second->blacklisted)
2271             continue;
2272         ++not_blacklisted;
2273 
2274         double depth = mtag.second->count;
2275         tot_depth += depth;
2276         if (depth > max)
2277             max = depth;
2278     }
2279     mean = tot_depth / not_blacklisted;
2280 
2281     for (const pair<int, MergedStack*>& mtag : merged)
2282         if (!mtag.second->blacklisted)
2283             stdev += pow(mtag.second->count - mean, 2);
2284     stdev /= not_blacklisted;
2285     stdev = sqrt(stdev);
2286 }
2287 
2288 int
write_results(map<int,MergedStack * > & m,map<int,Stack * > & u,map<int,Rem * > & r)2289 write_results(map<int, MergedStack *> &m, map<int, Stack *> &u, map<int, Rem *> &r)
2290 {
2291     map<int, MergedStack *>::iterator i;
2292     vector<int>::iterator      k;
2293     vector<SNP *>::iterator    s;
2294     map<string, int>::iterator t;
2295     MergedStack  *tag_1;
2296     Stack        *tag_2;
2297     Rem          *rem;
2298     char strbuf32[32];
2299 
2300     bool gzip = (in_file_type == FileT::gzfastq || in_file_type == FileT::gzfasta) ? true : false;
2301 
2302     //
2303     // Read in the set of sequencing IDs so they can be included in the output.
2304     //
2305     vector<char *> seq_ids;
2306     load_seq_ids(seq_ids);
2307 
2308     //
2309     // Create the output file names.
2310     //
2311     string tag_file = prefix_path + ".tags.tsv";
2312     string snp_file = prefix_path + ".snps.tsv";
2313     string all_file = prefix_path + ".alleles.tsv";
2314     if (gzip) {
2315         tag_file += ".gz";
2316         snp_file += ".gz";
2317         all_file += ".gz";
2318     }
2319 
2320     //
2321     // Open the output files for writing.
2322     //
2323     VersatileWriter tags{tag_file}, snps{snp_file}, alle{all_file};
2324 
2325     //
2326     // Record the version of Stacks used and the date generated as a comment in the catalog.
2327     //
2328     // Obtain the current date.
2329     //
2330     stringstream log;
2331     time_t       rawtime;
2332     struct tm   *timeinfo;
2333     char         date[32];
2334     time(&rawtime);
2335     timeinfo = localtime(&rawtime);
2336     strftime(date, 32, "%F %T", timeinfo);
2337     log << "# ustacks version " << VERSION << "; generated on " << date << "\n";
2338     tags << log.str();
2339     snps << log.str();
2340     alle << log.str();
2341 
2342     for (i = m.begin(); i != m.end(); i++) {
2343         float total = 0;
2344         tag_1 = i->second;
2345 
2346         //
2347         // Calculate the log likelihood of this merged stack.
2348         //
2349         tag_1->gen_matrix(u, r);
2350         tag_1->calc_likelihood();
2351 
2352         // First write the consensus sequence
2353         tags << sample_id          << "\t"
2354              << tag_1->id          << "\t"
2355              << "consensus\t"      << "\t"
2356              << "\t"
2357              << tag_1->con         << "\t"
2358              << tag_1->deleveraged << "\t"
2359              << tag_1->blacklisted << "\t"
2360              << tag_1->lumberjackstack << "\n";
2361 
2362         //
2363         // Write a sequence recording the output of the SNP model for each nucleotide.
2364         //
2365         tags << sample_id << "\t"
2366              << tag_1->id << "\t"
2367              << "model\t" << "\t"
2368              << "\t";
2369         for (s = tag_1->snps.begin(); s != tag_1->snps.end(); s++) {
2370             switch((*s)->type) {
2371             case snp_type_het:
2372                 tags << "E";
2373                 break;
2374             case snp_type_hom:
2375                 tags << "O";
2376                 break;
2377             default:
2378                 tags << "U";
2379                 break;
2380             }
2381         }
2382         tags << "\t\t\t\n";
2383 
2384         //
2385         // Now write out the components of each unique tag merged into this locus.
2386         //
2387         int id = 0;
2388         for (k = tag_1->utags.begin(); k != tag_1->utags.end(); k++) {
2389             tag_2  = u[*k];
2390             total += tag_2->count();
2391 
2392             for (uint j = 0; j < tag_2->map.size(); j++) {
2393                 tags << sample_id << "\t"
2394                      << tag_1->id << "\t"
2395                      << "primary\t"
2396                      << id << "\t"
2397                      << seq_ids[tag_2->map[j]] << "\t"
2398                      << tag_2->seq->seq()
2399                      << "\t\t\t\n";
2400             }
2401             id++;
2402         }
2403 
2404         //
2405         // Write out the remainder tags merged into this unique tag.
2406         //
2407         for (k = tag_1->remtags.begin(); k != tag_1->remtags.end(); k++) {
2408             rem    = r[*k];
2409             total += rem->map.size();
2410             for (uint j = 0; j < rem->map.size(); j++)
2411                 tags << sample_id << "\t"
2412                      << tag_1->id << "\t"
2413                      << "secondary\t"
2414                      << "\t"
2415                      << seq_ids[rem->map[j]] << "\t"
2416                      << rem->seq->seq()
2417                      << "\t\t\t\n";
2418         }
2419 
2420         //
2421         // Write out the model calls for each nucleotide in this locus.
2422         //
2423         for (s = tag_1->snps.begin(); s != tag_1->snps.end(); s++) {
2424             snps << sample_id << "\t"
2425                  << tag_1->id << "\t"
2426                  << (size_t) (*s)->col << "\t";
2427             switch((*s)->type) {
2428             case snp_type_het:
2429                 snps << "E\t";
2430                 break;
2431             case snp_type_hom:
2432                 snps << "O\t";
2433                 break;
2434             default:
2435                 snps << "U\t";
2436                 break;
2437             }
2438             sprintf(strbuf32, "%.2f", (*s)->lratio);
2439             snps << strbuf32 << "\t"
2440                  << (*s)->rank_1 << "\t"
2441                  << (*s)->rank_2 << "\t\t\n";
2442         }
2443 
2444         //
2445         // Write the expressed alleles seen for the recorded SNPs and
2446         // the percentage of tags a particular allele occupies.
2447         //
2448         for (t = tag_1->alleles.begin(); t != tag_1->alleles.end(); t++) {
2449             alle << sample_id   << "\t"
2450                  << tag_1->id   << "\t"
2451                  << (*t).first  << "\t";
2452             sprintf(strbuf32, "%.2f", ((*t).second/total) * 100);
2453             alle << strbuf32 << "\t"
2454                  << (*t).second << "\n";
2455         }
2456     }
2457     tags.close();
2458     snps.close();
2459     alle.close();
2460 
2461     //
2462     // Free sequence IDs.
2463     //
2464     for (uint i = 0; i < seq_ids.size(); i++)
2465         delete [] seq_ids[i];
2466 
2467     //
2468     // If specified, output reads not utilized in any stacks.
2469     //
2470     if (retain_rem_reads) {
2471         string unused_file = prefix_path + ".unused.fa";
2472         VersatileWriter unused{unused_file};
2473 
2474         map<int, Rem *>::iterator r_it;
2475         for (r_it = r.begin(); r_it != r.end(); r_it++)
2476             if (r_it->second->utilized == false)
2477                 unused << ">" << (long) r_it->second->id << "\n" << r_it->second->seq->seq() << "\n";
2478         unused.close();
2479     }
2480 
2481     return 0;
2482 }
2483 
int dump_stack_graph(string data_file,
                     map<int, Stack *> &unique,
                     map<int, MergedStack *> &merged,
                     vector<int> &keys,
                     map<int, map<int, double> > &dist_map,
                     map<int, set<int> > &cluster_map) {
    //
    // Debugging aid: write the stack graph for the stacks named in 'keys' to
    // data_file in Graphviz DOT format, with cluster subgraphs (from
    // cluster_map) and pairwise distances (from dist_map) as edge labels.
    // Always returns 0.
    //
    uint s, t;
    double d, scale, scaled_d;
    char label[32];
    vector<string> colors;
    ofstream data(data_file.c_str());

    // Derive the graph title from the file's basename, without extension.
    // NOTE(review): when data_file contains no '/', find_last_of returns
    // string::npos and pos_1 + 1 wraps to 0, which is the intended result --
    // but it relies on unsigned overflow of npos; confirm before refactoring.
    size_t pos_1 = data_file.find_last_of("/");
    size_t pos_2 = data_file.find_last_of(".");
    string title = data_file.substr(pos_1 + 1, pos_2 - pos_1 - 1);

    //
    // Output a list of IDs so we can locate these stacks in the final results.
    //
    for (s = 0; s < keys.size(); s++)
        data << "/* " << keys[s] << ": " << unique[merged[keys[s]]->utags[0]]->map[0] << "; depth: " << merged[keys[s]]->count << " */\n";

    //
    // Output a specification to visualize the stack graph using graphviz:
    //   http://www.graphviz.org/
    //
    data << "graph " << title.c_str() << " {\n"
         << "rankdir=LR\n"
         << "size=\"20!\"\n"
         << "overlap=false\n"
         << "node [shape=circle style=filled fillcolor=\"#3875d7\" fontname=\"Arial\"];\n"
         << "edge [fontsize=8.0 fontname=\"Arial\" color=\"#aaaaaa\"];\n";

    colors.push_back("red");
    colors.push_back("blue");
    colors.push_back("green");
    colors.push_back("brown");
    colors.push_back("purple");

    map<int, set<int> >::iterator c;
    set<int>::iterator it;
    int color_index = 0;
    string color;

    // Write out the clusters created by R, prior to writing all the nodes and connections.
    s = 0;
    for (c = cluster_map.begin(); c != cluster_map.end(); c++) {
        data << "subgraph " << s << " {\n"
             << "    edge [penwidth=5 fontsize=12.0 fontcolor=\"black\" color=\"black\"]\n";

        // Singleton clusters are drawn white; multi-member clusters cycle
        // through the color palette.
        if ((*c).second.size() == 1) {
            color = "white";
            data << "    node [fillcolor=" << color.c_str() << " fontcolor=\"black\"]\n";
        } else {
            color = colors[color_index % colors.size()];
            data << "    node [fillcolor=" << color.c_str() << " fontcolor=\"white\"]\n";
            color_index++;
        }

        for (it = (*c).second.begin(); it != (*c).second.end(); it++) {
            data << "    " << *it << "\n";
        }

        // Chain the cluster's members with bold edges: a -- b -- c ...
        if ((*c).second.size() > 1) {
            uint j = 0;
            for (it = (*c).second.begin(); it != (*c).second.end(); it++) {
                data << *it;
                if (j < (*c).second.size() - 1)
                    data << " -- ";
                j++;
            }
        }

        data << "}\n";
        s++;
    }

    //
    // Scale the graph to display on a 10 inch canvas. Find the largest edge weight
    // and scale the edge lengths to fit the canvas.
    //
    scale = 0.0;
    for (s = 0; s < keys.size(); s++)
        for (t = s+1; t < keys.size(); t++)
            scale = dist_map[keys[s]][keys[t]] > scale ? dist_map[keys[s]][keys[t]] : scale;
    scale = scale / 20;

    // Emit every pairwise edge with its scaled length and raw distance label;
    // lengths are floored at 0.75 so nodes never fully overlap.
    for (s = 0; s < keys.size(); s++) {
        for (t = s+1; t < keys.size(); t++) {
            d = dist_map[keys[s]][keys[t]];
            scaled_d = d / scale;
            scaled_d = scaled_d < 0.75 ? 0.75 : scaled_d;
            sprintf(label, "%.1f", d);
            data << keys[s] << " -- " << keys[t] << " [len=" << scaled_d << ", label=" << label << "];\n";
        }
    }

    data << "}\n";

    data.close();

    return 0;
}
2587 
dump_unique_tags(map<int,Stack * > & u)2588 int dump_unique_tags(map<int, Stack *> &u) {
2589     map<int, Stack *>::iterator it;
2590     vector<pair<int, int> >::iterator pit;
2591     vector<int>::iterator mit;
2592 
2593     for (it = u.begin(); it != u.end(); it++) {
2594         cerr << "UniqueTag UID: " << (*it).second->id << "\n"
2595              << "  Seq:       "   << it->second->seq->seq() << "\n"
2596              << "  IDs:       ";
2597 
2598         for (uint j = 0; j < it->second->map.size(); j++)
2599             cerr << it->second->map[j] << " ";
2600 
2601         cerr << "\n\n";
2602     }
2603 
2604     return 0;
2605 }
2606 
dump_merged_tags(map<int,MergedStack * > & m)2607 int dump_merged_tags(map<int, MergedStack *> &m) {
2608     map<int, MergedStack *>::iterator it;
2609     vector<pair<int, int> >::iterator pit;
2610     vector<int>::iterator fit;
2611 
2612     for (it = m.begin(); it != m.end(); it++) {
2613 
2614         cerr << "MergedStack ID: " << it->second->id << "\n"
2615              << "  Consensus:  ";
2616         if (it->second->con != NULL)
2617             cerr << it->second->con << "\n";
2618         else
2619             cerr << "\n";
2620         cerr << "  IDs:        ";
2621 
2622         for (fit = it->second->utags.begin(); fit != it->second->utags.end(); fit++)
2623             cerr << (*fit) << " ";
2624 
2625         cerr << "\n"
2626              << "  Distances: ";
2627 
2628         for (pit = it->second->dist.begin(); pit != it->second->dist.end(); pit++)
2629             cerr << (*pit).first << ": " << (*pit).second << ", ";
2630 
2631         cerr << "\n\n";
2632     }
2633 
2634     return 0;
2635 }
2636 
load_radtags(string in_file,DNASeqHashMap & radtags,size_t & n_reads)2637 void load_radtags(string in_file, DNASeqHashMap &radtags, size_t& n_reads) {
2638     n_reads = 0;
2639 
2640     Input *fh = NULL;
2641 
2642     if (in_file_type == FileT::fasta)
2643         fh = new Fasta(in_file.c_str());
2644     else if (in_file_type == FileT::fastq)
2645         fh = new Fastq(in_file.c_str());
2646     else if (in_file_type == FileT::gzfasta)
2647         fh = new GzFasta(in_file.c_str());
2648     else if (in_file_type == FileT::gzfastq)
2649         fh = new GzFastq(in_file.c_str());
2650 
2651     long  int corrected = 0;
2652     size_t i            = 0;
2653     short int seql      = 0;
2654     short int prev_seql = 0;
2655     bool  len_mismatch  = false;
2656 
2657     Seq c;
2658     c.id   = new char[id_len];
2659     c.seq  = new char[max_len];
2660     c.qual = new char[max_len];
2661 
2662     while ((fh->next_seq(c)) != 0) {
2663         if (i % 1000000 == 0 && i>0)
2664             cerr << i/1000000 << "M..." << flush;
2665 
2666         prev_seql = seql;
2667         seql      = 0;
2668 
2669         for (char *p = c.seq; *p != '\0'; p++, seql++)
2670             switch (*p) {
2671             case 'N':
2672             case 'n':
2673             case '.':
2674                 *p = 'A';
2675                 corrected++;
2676             }
2677 
2678         if (seql != prev_seql && prev_seql > 0)
2679             len_mismatch = true;
2680 
2681         DNASeqHashMap::iterator element = radtags.insert({DNANSeq(seql, c.seq), HVal()}).first;
2682         element->second.add_id(i);
2683         i++;
2684     }
2685     cerr << "\n";
2686 
2687     if (i == 0) {
2688         cerr << "Error: Unable to load data from '" << in_file.c_str() << "'.\n";
2689         exit(1);
2690     }
2691     if (len_mismatch && !force_diff_len) {
2692         cerr << "Error: different sequence lengths detected, this will interfere with Stacks "
2693              << "algorithms, trim reads to uniform length (override this check with --force-diff-len).\n";
2694         exit(1);
2695     } else if (force_diff_len) {
2696         cerr << "Warning: different sequence lengths detected, this could interfere with Stacks algorithms.\n";
2697     }
2698     if (corrected > 0)
2699         cerr << "Warning: Input reads contained " << corrected << " uncalled nucleotides.\n";
2700 
2701     delete fh;
2702     n_reads = i;
2703 }
2704 
2705 int
load_seq_ids(vector<char * > & seq_ids)2706 load_seq_ids(vector<char *> &seq_ids)
2707 {
2708     Input *fh = NULL;
2709 
2710     if (in_file_type == FileT::fasta)
2711         fh = new Fasta(in_file.c_str());
2712     else if (in_file_type == FileT::fastq)
2713         fh = new Fastq(in_file.c_str());
2714     else if (in_file_type == FileT::gzfasta)
2715         fh = new GzFasta(in_file.c_str());
2716     else if (in_file_type == FileT::gzfastq)
2717         fh = new GzFastq(in_file.c_str());
2718 
2719     cerr << "Refetching read IDs...";
2720 
2721     char *id;
2722     Seq c;
2723     c.id   = new char[id_len];
2724     c.seq  = new char[max_len];
2725     c.qual = new char[max_len];
2726 
2727     while ((fh->next_seq(c)) != 0) {
2728         id = new char[strlen(c.id) + 1];
2729         strcpy(id, c.id);
2730         seq_ids.push_back(id);
2731     }
2732 
2733     delete fh;
2734     return 0;
2735 }
2736 
long double factorial(int i) {
    //
    // Compute i! as a long double (factorials overflow integer types
    // quickly). Returns 1 for i <= 0; the previous do/while executed its
    // body once even for negative i, returning a negative "factorial".
    //
    long double f = 1.0L;

    for (; i > 1; i--)
        f *= i;

    return f;
}
2749 
parse_command_line(int argc,char * argv[])2750 int parse_command_line(int argc, char* argv[]) {
2751     string out_path;
2752     string sample_name;
2753 
2754     int c;
2755     while (1) {
2756         static struct option long_options[] = {
2757             {"help",             no_argument,       NULL, 'h'},
2758             {"version",          no_argument,       NULL, 'v'},
2759             {"infile-type",      required_argument, NULL, 't'}, {"infile_type",      required_argument, NULL, 't'},
2760             {"file",             required_argument, NULL, 'f'},
2761             {"outpath",          required_argument, NULL, 'o'},
2762             {"name",             required_argument, NULL, 1002},
2763             {"id",               required_argument, NULL, 'i'},
2764             {"min-cov",          required_argument, NULL, 'm'}, {"min_cov",          required_argument, NULL, 'm'},
2765             {"max-dist",         required_argument, NULL, 'M'}, {"max_dist",         required_argument, NULL, 'M'},
2766             {"max-sec-dist",     required_argument, NULL, 'N'}, {"max_sec_dist",     required_argument, NULL, 'N'},
2767             {"max-locus-stacks", required_argument, NULL, 'K'}, {"max_locus_stacks", required_argument, NULL, 'K'},
2768             {"k-len",            required_argument, NULL, 'k'}, {"k_len",            required_argument, NULL, 'k'},
2769             {"num-threads",      required_argument, NULL, 'p'}, {"num_threads",      required_argument, NULL, 'p'},
2770             {"deleverage",       no_argument,       NULL, 'd'},
2771             {"keep-high-cov",    no_argument,       NULL, 1000}, {"keep_high_cov",    no_argument,       NULL, 1000},
2772             {"high-cov-thres",   required_argument, NULL, 1001}, {"high_cov_thres",   required_argument, NULL, 1001},
2773             {"retain-rem",       no_argument,       NULL, 'R'}, {"retain_rem",       no_argument,       NULL, 'R'},
2774             {"graph",            no_argument,       NULL, 'g'},
2775             {"sec-hapl",         no_argument,       NULL, 'H'}, {"sec_hapl",         no_argument,       NULL, 'H'},
2776             {"disable-gapped",   no_argument,       NULL, 'G'},
2777             {"max-gaps",         required_argument, NULL, 'X'}, {"max_gaps",         required_argument, NULL, 'X'},
2778             {"min-aln-len",      required_argument, NULL, 'x'}, {"min_aln_len",      required_argument, NULL, 'x'},
2779             {"model-type",       required_argument, NULL, 'T'}, {"model_type",       required_argument, NULL, 'T'},
2780             {"bc-err-freq",      required_argument, NULL, 'e'}, {"bc_err_freq",      required_argument, NULL, 'e'},
2781             {"bound-low",        required_argument, NULL, 'L'}, {"bound_low",        required_argument, NULL, 'L'},
2782             {"bound-high",       required_argument, NULL, 'U'}, {"bound_high",       required_argument, NULL, 'U'},
2783             {"alpha",            required_argument, NULL, 'A'},
2784             {"r-deprecated",     no_argument,       NULL, 'r'},
2785             {"force-diff-len",   no_argument,       NULL, 1003}, {"force_diff_len",   no_argument,       NULL, 1003},
2786             {0, 0, 0, 0}
2787         };
2788 
2789         // getopt_long stores the option index here.
2790         int option_index = 0;
2791 
2792         c = getopt_long(argc, argv, "GhHvdrgRA:L:U:f:o:s:i:m:e:p:t:M:N:K:k:T:X:x:", long_options, &option_index);
2793 
2794         // Detect the end of the options.
2795         if (c == -1)
2796             break;
2797 
2798         switch (c) {
2799         case 'h':
2800             help();
2801             break;
2802         case 't':
2803             if (strcmp(optarg, "tsv") == 0)
2804                 in_file_type = FileT::tsv;
2805             else if (strcmp(optarg, "fasta") == 0)
2806                 in_file_type = FileT::fasta;
2807             else if (strcmp(optarg, "fastq") == 0)
2808                 in_file_type = FileT::fastq;
2809             else if (strcasecmp(optarg, "gzfasta") == 0)
2810                 in_file_type = FileT::gzfasta;
2811             else if (strcasecmp(optarg, "gzfastq") == 0)
2812                 in_file_type = FileT::gzfastq;
2813             else
2814                 in_file_type = FileT::unknown;
2815             break;
2816         case 'f':
2817             in_file = optarg;
2818             break;
2819         case 'o':
2820             out_path = optarg;
2821             break;
2822         case 1002:
2823             sample_name = optarg;
2824             break;
2825         case 'i':
2826             sample_id = is_integer(optarg);
2827             break;
2828         case 'm':
2829             min_merge_cov = is_integer(optarg);
2830             break;
2831         case 'M':
2832             max_utag_dist = is_integer(optarg);
2833             break;
2834         case 'N':
2835             max_rem_dist = is_integer(optarg);
2836             break;
2837         case 'd':
2838             deleverage_stacks++;
2839             break;
2840         case 1000: // keep_high_cov
2841             remove_rep_stacks = false;
2842             break;
2843         case 1001: // high_cov_thres
2844             removal_threshold = is_double(optarg);
2845             if (removal_threshold <= 0.0) {
2846                 cerr << "Error: Bad threshold: '" << optarg << "'\n";
2847                 help();
2848             }
2849             break;
2850         case 'K':
2851             max_subgraph = is_integer(optarg);
2852             break;
2853         case 'k':
2854             set_kmer_len    = false;
2855             global_kmer_len = is_integer(optarg);
2856             break;
2857         case 'R':
2858             retain_rem_reads = true;
2859             break;
2860         case 'g':
2861             dump_graph++;
2862             break;
2863         case 'G':
2864             gapped_alignments = false;
2865             break;
2866         case 'X':
2867             max_gaps = is_double(optarg);
2868             break;
2869         case 'x':
2870             min_match_len = is_double(optarg);
2871             break;
2872         case 'T':
2873             if (strcmp(optarg, "snp") == 0) {
2874                 model_type = snp;
2875             } else if (strcmp(optarg, "fixed") == 0) {
2876                 model_type = ::fixed;
2877             } else if (strcmp(optarg, "bounded") == 0) {
2878                 model_type = bounded;
2879             } else {
2880                 cerr << "Unknown model type specified '" << optarg << "'\n";
2881                 help();
2882             }
2883             break;;
2884         case 'e':
2885             barcode_err_freq = is_double(optarg);
2886             break;
2887         case 'L':
2888             bound_low  = is_double(optarg);
2889             break;
2890         case 'U':
2891             bound_high = is_double(optarg);
2892             break;
2893         case 'A':
2894             alpha = is_double(optarg);
2895             break;
2896         case 'H':
2897             call_sec_hapl = false;
2898             break;
2899         case 'p':
2900             num_threads = is_integer(optarg);
2901             break;
2902         case 'v':
2903             version();
2904             break;
2905         case 1003:
2906             force_diff_len = true;
2907             break;
2908         case 'r': // deprecated Dec 2016, v1.45
2909             cerr << "Warning: Ignoring deprecated option -r (this has become the default).\n";
2910             break;
2911         case '?':
2912             // getopt_long already printed an error message.
2913             help();
2914             break;
2915 
2916         default:
2917             cerr << "Unknown command line option '" << (char) c << "'\n";
2918             help();
2919             exit(1);
2920         }
2921     }
2922 
2923     if (optind < argc) {
2924         cerr << "Error: Failed to parse command line: '" << argv[optind] << "' is seen as a positional argument. Expected no positional arguments.\n";
2925         help();
2926     }
2927 
2928     if (set_kmer_len == false && (global_kmer_len < 7 || global_kmer_len > 31)) {
2929         cerr << "Kmer length must be between 7 and 31bp.\n";
2930         help();
2931     }
2932 
2933     if (alpha != 0.1 && alpha != 0.05 && alpha != 0.01 && alpha != 0.001) {
2934         cerr << "SNP model alpha significance level must be either 0.1, 0.05, 0.01, or 0.001.\n";
2935         help();
2936     }
2937 
2938     if (bound_low != 0 && (bound_low < 0 || bound_low >= 1.0)) {
2939         cerr << "SNP model lower bound must be between 0.0 and 1.0.\n";
2940         help();
2941     }
2942 
2943     if (bound_high != 1 && (bound_high <= 0 || bound_high > 1.0)) {
2944         cerr << "SNP model upper bound must be between 0.0 and 1.0.\n";
2945         help();
2946     }
2947 
2948     if (bound_low > 0 || bound_high < 1.0) {
2949         model_type = bounded;
2950     }
2951 
2952     if (in_file.empty()) {
2953         cerr << "You must specify an input file.\n";
2954         help();
2955     }
2956 
2957     if (in_file_type == FileT::unknown) {
2958         in_file_type = guess_file_type(in_file);
2959         if (in_file_type == FileT::unknown) {
2960             cerr << "Unable to recongnize the extention of file '" << in_file << "'.\n";
2961             help();
2962         }
2963     }
2964 
2965     // Set `prefix_path`.
2966     if (out_path.length() == 0)
2967         out_path = ".";
2968     if (out_path.at(out_path.length() - 1) != '/')
2969         out_path += "/";
2970     if (sample_name.empty()) {
2971         size_t pos_1 = in_file.find_last_of("/");
2972         size_t pos_2 = in_file.find_last_of(".");
2973         if (in_file.substr(pos_2) == ".gz")
2974             pos_2 = in_file.substr(0, pos_2).find_last_of(".");
2975         if (pos_2 == string::npos || pos_2 <= pos_1 + 1) {
2976             cerr << "Failed to guess the sample's name.\n";
2977             help();
2978         }
2979         sample_name = in_file.substr(pos_1 + 1, (pos_2 - pos_1 - 1));
2980     }
2981     prefix_path = out_path + sample_name;
2982 
2983     if (sample_id < 0) {
2984         cerr << "A unique sample ID must be provided (a positive integer).\n";
2985         help();
2986     }
2987 
2988     if (model_type == ::fixed && barcode_err_freq == 0) {
2989         cerr << "You must specify the barcode error frequency.\n";
2990         help();
2991     }
2992 
2993     return 0;
2994 }
2995 
version()2996 void version() {
2997     cerr << "ustacks " << VERSION << "\n\n";
2998 
2999     exit(1);
3000 }
3001 
help()3002 void help() {
3003     cerr << "ustacks " << VERSION << "\n"
3004          << "ustacks -f file_path -i id -o path [-M max_dist] [-m min_cov] [-p num_threads]" << "\n"
3005          << "  f: input file path.\n"
3006          << "  i: a unique integer ID for this sample.\n"
3007          << "  o: output path to write results.\n"
3008          << "  M: Maximum distance (in nucleotides) allowed between stacks (default 2).\n"
3009          << "  m: Minimum depth of coverage required to create a stack (default 3).\n"
3010          << "  N: Maximum distance allowed to align secondary reads to primary stacks (default: M + 2).\n"
3011          << "  p: enable parallel execution with num_threads threads.\n"
3012          << "  t: input file type. Supported types: fasta, fastq, gzfasta, or gzfastq (default: guess).\n"
3013          << "  --name: a name for the sample (default: input file name minus the suffix).\n"
3014          << "  R: retain unused reads.\n"
3015          << "  H: disable calling haplotypes from secondary reads.\n"
3016          << "\n"
3017          << "  Stack assembly options:\n"
3018          << "    d,--deleverage: enable the Deleveraging algorithm, used for resolving over merged tags.\n"
3019          << "    --keep-high-cov: disable the algorithm that removes highly-repetitive stacks and nearby errors.\n"
3020          << "    --high-cov-thres: highly-repetitive stacks threshold, in standard deviation units (default: 3.0).\n"
3021          << "    --max-locus-stacks <num>: maximum number of stacks at a single de novo locus (default 3).\n"
3022          << "     --k-len <len>: specify k-mer size for matching between alleles and loci (automatically calculated by default).\n\n"
3023          << "  Gapped assembly options:\n"
3024          << "    --max-gaps: number of gaps allowed between stacks before merging (default: 2).\n"
3025          << "    --min-aln-len: minimum length of aligned sequence in a gapped alignment (default: 0.80).\n\n"
3026          << "    --disable-gapped: do not preform gapped alignments between stacks (default: gapped alignements enabled).\n"
3027          << "  Model options:\n"
3028          << "    --model-type: either 'snp' (default), 'bounded', or 'fixed'\n"
3029          << "    For the SNP or Bounded SNP model:\n"
3030          << "      --alpha <num>: chi square significance level required to call a heterozygote or homozygote, either 0.1, 0.05 (default), 0.01, or 0.001.\n"
3031          << "    For the Bounded SNP model:\n"
3032          << "      --bound-low <num>: lower bound for epsilon, the error rate, between 0 and 1.0 (default 0).\n"
3033          << "      --bound-high <num>: upper bound for epsilon, the error rate, between 0 and 1.0 (default 1).\n"
3034          << "    For the Fixed model:\n"
3035          << "      --bc-err-freq <num>: specify the barcode error frequency, between 0 and 1.0.\n"
3036          << "\n"
3037          << "  h: display this help messsage.\n";
3038 
3039     exit(1);
3040 }
3041