1 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
2 //
3 // Copyright 2010-2018, Julian Catchen <jcatchen@illinois.edu>
4 //
5 // This file is part of Stacks.
6 //
7 // Stacks is free software: you can redistribute it and/or modify
8 // it under the terms of the GNU General Public License as published by
9 // the Free Software Foundation, either version 3 of the License, or
10 // (at your option) any later version.
11 //
12 // Stacks is distributed in the hope that it will be useful,
13 // but WITHOUT ANY WARRANTY; without even the implied warranty of
14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 // GNU General Public License for more details.
16 //
17 // You should have received a copy of the GNU General Public License
18 // along with Stacks. If not, see <http://www.gnu.org/licenses/>.
19 //
20
21 //
22 // ustacks -- build denovo stacks
23 //
24
25 #include "ustacks.h"
26 #include "log_utils.h"
27
28 using namespace std;
29
30 //
31 // Global variables to hold command-line options.
32 //
33 FileT in_file_type;
34 string in_file;
35 string prefix_path;
36 int num_threads = 1;
37 int sample_id = -1;
38 bool call_sec_hapl = true;
39 bool set_kmer_len = true;
40 int global_kmer_len = 0;
41 int min_merge_cov = 3;
42 uint max_subgraph = 3;
43 int dump_graph = 0;
44 int retain_rem_reads = false;
45 int deleverage_stacks = 0;
46 bool remove_rep_stacks = true;
47 double removal_threshold = 3.0;
48 int max_utag_dist = 2;
49 int max_rem_dist = -1;
50 bool force_diff_len = false;
51 bool gapped_alignments = true;
52 double min_match_len = 0.80;
53 double max_gaps = 2.0;
54 int removal_trigger;
55
56 //
57 // For use with the multinomial model to call fixed nucleotides.
58 //
59 modelt model_type = snp;
60 double alpha = 0.05;
61
//
// Entry point: parse options, load reads, build and merge stacks, call
// consensus/SNPs/haplotypes, and write the tags/snps/alleles output files.
// Returns 0 on success; exceptions are routed through stacks_handle_exceptions().
//
int main (int argc, char* argv[]) {
try{
    parse_command_line(argc, argv);

    //
    // Set the max remainder distance to be greater than the max_utag_dist, if it is not
    // specified on the command line.
    //
    if (max_rem_dist == -1) max_rem_dist = max_utag_dist + 2;

    //
    // Echo the selected parameters to the log so a run is reproducible.
    //
    cerr << "ustacks parameters selected:\n"
         << " Input file: '" << in_file << "'\n"
         << " Sample ID: " << sample_id << "\n"
         << " Min depth of coverage to create a stack (m): " << min_merge_cov << "\n"
         << " Repeat removal algorithm: " << (remove_rep_stacks ? "enabled" : "disabled") << "\n"
         << " Max distance allowed between stacks (M): " << max_utag_dist << "\n"
         << " Max distance allowed to align secondary reads: " << max_rem_dist << "\n"
         << " Max number of stacks allowed per de novo locus: " << max_subgraph << "\n"
         << " Deleveraging algorithm: " << (deleverage_stacks ? "enabled" : "disabled") << "\n";
    if (gapped_alignments) {
        cerr << " Gapped assembly: enabled\n"
             << " Minimum alignment length: " << min_match_len << "\n";
    } else {
        cerr << " Gapped assembly: disabled\n";
    }
    cerr << " Model type: ";
    switch (model_type) {
    case snp:
        cerr << "SNP\n";
        break;
    case ::fixed:
        // '::fixed' avoids colliding with std::fixed pulled in via <iomanip>/using namespace std.
        cerr << "Fixed\n";
        break;
    case bounded:
        cerr << "Bounded; lower epsilon bound: " << bound_low << "; upper bound: " << bound_high << "\n";
        break;
    default:
        DOES_NOT_HAPPEN;
        break;
    }
    cerr << " Alpha significance level for model: " << alpha << "\n";
    if (force_diff_len)
        cerr << " Forcing the allowance of sequences of different length.\n";
    cerr << flush;

    //
    // Set limits to call het or homozygote according to chi-square distribution with one
    // degree of freedom:
    // http://en.wikipedia.org/wiki/Chi-squared_distribution#Table_of_.CF.872_value_vs_p-value
    //
    set_model_thresholds(alpha);

    //
    // Set the number of OpenMP parallel threads to execute.
    //
    #ifdef _OPENMP
    omp_set_num_threads(num_threads);
    #endif

    //
    // Load the reads and build primary & secondary stacks.
    //

    cerr << "\n" << "Loading RAD-Tags...";

    size_t n_reads, n_u_reads, n_r_reads, n_rr_reads;
    DNASeqHashMap radtags;
    map<int, Stack *> unique;
    map<int, Rem *> remainders;
    load_radtags(in_file, radtags, n_reads);
    reduce_radtags(radtags, unique, remainders, n_u_reads, n_r_reads);
    // Release the hash map's memory now that reads live in unique/remainders.
    radtags = DNASeqHashMap();

    cerr << "\n"
         << "Loaded " << n_reads << " reads; formed:\n"
         << " " << unique.size() << " stacks representing "
         << n_u_reads << " primary reads (" << as_percentage((double)n_u_reads/n_reads) << ")\n"
         << " " << remainders.size() << " secondary stacks representing "
         << n_r_reads << " secondary reads (" << as_percentage((double)n_r_reads/n_reads) << ")\n"
         << flush;

    //
    // Initialize the MergedStack object.
    //
    map<int, MergedStack *> merged;
    populate_merged_tags(unique, merged);

    //
    // report_cov() prints the coverage stats most recently written into
    // cov_mean/cov_stdev/cov_max/n_used_reads by calc_coverage_distribution().
    // cerr_copy shares cerr's buffer but carries fixed/precision formatting
    // without disturbing cerr's own state.
    //
    double cov_mean, cov_stdev, cov_max, n_used_reads;
    ostream cerr_copy (cerr.rdbuf());
    cerr_copy << std::fixed << std::setprecision(2);
    auto report_cov = [&]() {
        cerr_copy << "mean=" << cov_mean << "; stdev=" << cov_stdev << "; max=" << size_t(cov_max)
                  << "; n_reads=" << size_t(n_used_reads) << "(" << as_percentage(n_used_reads/n_reads) << ")";
    };

    calc_coverage_distribution(merged, cov_mean, cov_stdev, cov_max, n_used_reads);
    cerr << "\n";
    cerr << "Stack coverage: ";
    report_cov();
    cerr << "\n";

    //
    // Remove highly repetitive stacks.
    //
    size_t n_high_cov = 0;
    if (remove_rep_stacks) {
        // Cutoff: mean + removal_threshold * stdev, rounded up.
        removal_trigger = (int) floor(cov_mean + cov_stdev * removal_threshold + 1);
        cerr << "Removing repetitive stacks: cov > " << removal_trigger << " (mean+" << removal_threshold << "*stdev)...\n";
        calc_kmer_distance(merged, 1);
        assert(merged.size() == unique.size());
        n_high_cov = remove_repetitive_stacks(merged);
        cerr << " Blacklisted " << n_high_cov << " stacks.\n";

        calc_coverage_distribution(merged, cov_mean, cov_stdev, cov_max, n_used_reads);
        cerr << "Coverage after repeat removal: ";
        report_cov();
        cerr << "\n\n";
    }

    //
    // Assemble loci (merge primary stacks).
    //
    cerr << "Assembling stacks (max. dist. M=" << max_utag_dist << ")...\n";
    calc_kmer_distance(merged, max_utag_dist);
    size_t n_blacklisted;
    merge_stacks(merged, n_blacklisted);
    call_consensus(merged, unique, remainders, false);
    cerr << " Assembled " << unique.size()-n_high_cov << " stacks into " << merged.size()
         << "; blacklisted " << n_blacklisted << " stacks.\n";
    calc_coverage_distribution(merged, cov_mean, cov_stdev, cov_max, n_used_reads);
    cerr << "Coverage after assembling stacks: ";
    report_cov();
    cerr << "\n\n";

    //
    // Merge secondary stacks.
    //
    cerr << "Merging secondary stacks (max. dist. N=" << max_rem_dist << " from consensus)...\n";
    size_t utilized = merge_remainders(merged, unique, remainders);
    size_t util_gapped = merge_gapped_remainders(merged, unique, remainders);
    call_consensus(merged, unique, remainders, false);
    cerr << " Merged " << utilized+util_gapped << " out of " << n_r_reads << " secondary reads ("
         << as_percentage( (double) (utilized+util_gapped) / (double)n_r_reads) << "), "
         << util_gapped << " merged with gapped alignments.\n";
    // Secondary reads still unplaced; the gapped pass below gets another try at them.
    n_rr_reads = n_r_reads - utilized - util_gapped;

    calc_coverage_distribution(merged, cov_mean, cov_stdev, cov_max, n_used_reads);
    cerr << "Coverage after merging secondary stacks: ";
    report_cov();
    cerr << "\n\n";

    //
    // Merge loci based on alignments.
    //
    if (gapped_alignments) {
        cerr << "Assembling stacks, allowing for gaps (min. match length " << as_percentage(min_match_len) << ")...\n";
        const size_t n_ungapped_loci = merged.size();
        search_for_gaps(merged);
        merge_gapped_alns(unique, remainders, merged);
        call_consensus(merged, unique, remainders, false);
        cerr << " Assembled " << n_ungapped_loci << " stacks into " << merged.size() << " stacks.\n";

        calc_coverage_distribution(merged, cov_mean, cov_stdev, cov_max, n_used_reads);
        cerr << "Coverage after gapped assembly: ";
        report_cov();
        cerr << "\n\n";

        cerr << "Merging secondary stacks, allowing for gaps (min. match length " << as_percentage(min_match_len) << ")...\n";
        search_for_gapped_remainders(merged, unique, remainders);
        util_gapped = merge_gapped_remainders(merged, unique, remainders);
        cerr << " Merged " << util_gapped << " out of " << n_rr_reads << " secondary reads ("
             << as_percentage( (double)util_gapped / (double)n_rr_reads) << ").\n";

        calc_coverage_distribution(merged, cov_mean, cov_stdev, cov_max, n_used_reads);
        cerr << "Coverage after merging gapped secondary stacks: ";
        report_cov();
        cerr << "\n\n";
    }

    //
    // Report how many reads were used.
    //
    cerr << "Final coverage: ";
    report_cov();
    cerr << "\n";

    //
    // Call the final consensus sequence and invoke the SNP model.
    //
    cerr << "Calling consensus sequences and haplotypes for catalog assembly...\n";
    call_consensus(merged, unique, remainders, true);

    //
    // Write output files.
    //
    cerr << "Writing tags, SNPs, and alleles files...\n";
    write_results(merged, unique, remainders);
    cerr << "done.\n";

    cerr << "ustacks is done.\n";
    return 0;
} catch(std::exception& e) {
    return stacks_handle_exceptions(e);
}}
266
//
// Merge pairs of loci whose consensus sequences align to each other with gaps.
// A pair is merged only when the two alignments are unambiguous (no tied best
// hit on either side) and reciprocal (each locus' best hit is the other).
// Rebuilds `merged` in place; unmerged loci are carried over unchanged.
// Always returns 0.
//
int
merge_gapped_alns(map<int, Stack *> &unique, map<int, Rem *> &rem, map<int, MergedStack *> &merged)
{
    map<int, MergedStack *> new_merged;
    MergedStack *tag_1, *tag_2, *merged_tag;

    int id = 1;                  // IDs for the rebuilt map start over from 1.
    uint merge_cnt = 0;

    set<int> processed;          // Loci already consumed by a merge.
    string cigar_1, cigar_2;
    vector<pair<char, uint> > cigar;

    for (auto it = merged.begin(); it != merged.end(); it++) {
        if (processed.count(it->first) > 0)
            continue;

        tag_1 = it->second;
        // Best alignment (highest pct_id) first.
        sort(tag_1->alns.begin(), tag_1->alns.end(), rank_alignments);

        //
        // No gapped alignments, or no optimal alignment for this stack, or
        // this stack has already been set aside.
        //
        if (tag_1->masked || tag_1->alns.size() == 0)
            continue;
        // A tie for best alignment makes the choice ambiguous; skip.
        if (tag_1->alns.size() > 1 && tag_1->alns[0].pct_id == tag_1->alns[1].pct_id)
            continue;

        //
        // Found one or more gapped alignments. Make sure the best alignment for each of the aligned pairs
        // of tags are reciprocal to each other.
        //
        tag_2 = merged[tag_1->alns[0].id];
        sort(tag_2->alns.begin(), tag_2->alns.end(), rank_alignments);

        if (tag_2->masked || tag_2->alns.size() == 0)
            continue;
        if (tag_2->alns.size() > 1 && tag_2->alns[0].pct_id == tag_2->alns[1].pct_id)
            continue;

        // Reciprocity: tag_2's best hit must point back at tag_1.
        if (tag_1->id != (int)tag_2->alns[0].id)
            continue;

        // The two cigars describe the same alignment from opposite sides;
        // inverting one must reproduce the other.
        cigar_1 = invert_cigar(tag_1->alns[0].cigar);
        cigar_2 = tag_2->alns[0].cigar;

        if (cigar_1 == cigar_2) {
            parse_cigar(tag_1->alns[0].cigar.c_str(), cigar);

            //
            // Check that the alignment still contains fewer than
            // max_utag_dist mismatches.
            //
            if (dist(tag_1->con, tag_2->con, cigar) > max_utag_dist)
                continue;

            //
            // If the alignment has too many gaps, skip it.
            //
            if (tag_1->alns[0].gap_cnt > (max_gaps + 1))
                continue;

            //
            // If the alignment doesn't span enough of the two sequences, skip it.
            //
            if (tag_1->alns[0].pct_id < min_match_len)
                continue;

            //
            // Edit the sequences to accommodate any added gaps.
            //
            edit_gapped_seqs(unique, rem, tag_1, cigar);

            parse_cigar(tag_2->alns[0].cigar.c_str(), cigar);
            edit_gapped_seqs(unique, rem, tag_2, cigar);

            //
            // Merge the tags.
            //
            merged_tag = merge_tags(tag_1, tag_2, id);
            new_merged[id] = merged_tag;
            id++;

            //
            // Record the gaps.
            // NOTE(review): `cigar` holds tag_2's parsed cigar at this point
            // (reparsed above) — presumably intentional since both cigars were
            // verified to be mirror images; confirm before reordering.
            //
            uint pos = 0;
            for (uint j = 0; j < cigar.size(); j++) {
                if (cigar[j].first == 'I' || cigar[j].first == 'D')
                    merged_tag->gaps.push_back(Gap(pos, pos + cigar[j].second));
                pos += cigar[j].second;
            }

            processed.insert(tag_1->id);
            processed.insert(tag_2->id);

            merge_cnt++;
        }
    }

    //
    // Carry over every locus that was not merged, each as its own set of one.
    //
    set<int> merge_set;
    for (auto it = merged.begin(); it != merged.end(); it++) {
        if (processed.count(it->first))
            continue;
        tag_1 = it->second;
        merge_set.insert(tag_1->id);
        tag_2 = merge_tags(merged, merge_set, id);
        new_merged[id] = tag_2;
        merge_set.clear();
        id++;
    }

    //
    // Free the memory from the old map of merged tags.
    //
    for (auto it = merged.begin(); it != merged.end(); it++)
        delete it->second;

    merged = new_merged;
    return 0;
}
389
390 bool
rank_alignments(Aln a,Aln b)391 rank_alignments(Aln a, Aln b)
392 {
393 return a.pct_id > b.pct_id;
394 }
395
396 int
edit_gapped_seqs(map<int,Stack * > & unique,map<int,Rem * > & rem,MergedStack * tag,vector<pair<char,uint>> & cigar)397 edit_gapped_seqs(map<int, Stack *> &unique, map<int, Rem *> &rem, MergedStack *tag, vector<pair<char, uint> > &cigar)
398 {
399 int stack_id;
400 Stack *s;
401 Rem *r;
402 char *buf = new char[cigar_length_padded(cigar) + 1];
403 string seq;
404
405 for (uint i = 0; i < tag->utags.size(); i++) {
406 stack_id = tag->utags[i];
407 s = unique[stack_id];
408
409 s->seq->seq(buf);
410 seq = apply_cigar_to_seq(buf, cigar);
411
412 delete s->seq;
413 s->seq = new DNANSeq(seq.length(), seq.c_str());
414 }
415
416 for (uint i = 0; i < tag->remtags.size(); i++) {
417 stack_id = tag->remtags[i];
418 r = rem[stack_id];
419
420 r->seq->seq(buf);
421 seq = apply_cigar_to_seq(buf, cigar);
422
423 delete r->seq;
424 r->seq = new DNANSeq(seq.length(), seq.c_str());
425 }
426
427 delete [] buf;
428
429 return 0;
430 }
431
//
// Apply a parsed cigar to `seq` in place, keeping the overall read length
// constant:
//   'S' — soft-masked region: overwrite with Ns.
//   'D' — deletion relative to the reference: insert Ns at the current
//         position, shift the remaining sequence right, and trim the tail.
//   'I'/'M' — advance the cursor without modifying the sequence.
// `seq` must be NUL-terminated and writable. Always returns 0.
//
int
edit_gaps(vector<pair<char, uint> > &cigar, char *seq)
{
    const uint  len = strlen(seq);
    uint        bp  = 0;       // Cursor into seq.
    std::string buf;           // Suffix displaced by a deletion (RAII; replaces raw new[]).

    for (uint i = 0; i < cigar.size(); i++) {
        const char op   = cigar[i].first;
        const uint dist = cigar[i].second;
        uint stop, j, k;

        switch (op) {
        case 'S':
            // Mask `dist` bases (clamped to the end of the read) with Ns.
            stop = bp + dist;
            stop = stop > len ? len : stop;
            while (bp < stop) {
                seq[bp] = 'N';
                bp++;
            }
            break;
        case 'D':
            //
            // A deletion has occured in the read relative to the reference genome.
            // Pad the read with sufficent Ns to match the deletion, shifting the existing
            // sequence down. Trim the final length to keep the read length consistent.
            //
            k = bp >= len ? len : bp;

            // Save the tail before it is overwritten with Ns.
            buf.assign(seq + k);

            stop = bp + dist;
            stop = stop > len ? len : stop;
            while (bp < stop) {
                seq[bp] = 'N';
                bp++;
            }

            // Copy the saved tail back after the Ns, truncating at `len`.
            j = bp;
            k = 0;
            while (j < len && k < buf.length()) {
                seq[j] = buf[k];
                k++;
                j++;
            }
            break;
        case 'I':
        case 'M':
            bp += dist;
            break;
        default:
            break;
        }
    }

    return 0;
}
499
//
// Identify candidate locus pairs for gapped merging. For every unmasked
// locus, k-mers from its consensus are looked up in a hash of all loci;
// loci sharing enough k-mers are aligned with a full gapped aligner and
// any successful alignment is appended to tag_1->alns for
// merge_gapped_alns() to evaluate. Runs in parallel over loci.
// Always returns 0.
//
int
search_for_gaps(map<int, MergedStack *> &merged)
{
    //
    // Search for loci that can be merged with a gapped alignment.
    //
    KmerHashMap kmer_map;
    vector<char *> kmer_map_keys;
    MergedStack *tag_1, *tag_2;

    //
    // OpenMP can't parallelize random access iterators, so we convert
    // our map to a vector of integer keys.
    //
    vector<int> keys;
    for (auto it = merged.begin(); it != merged.end(); it++)
        keys.push_back(it->first);

    //
    // Calculate the number of k-mers we will generate.
    // con_len is the longest consensus sequence present.
    //
    size_t con_len = 0;
    for (auto miter = merged.begin(); miter != merged.end(); miter++)
        con_len = miter->second->len > con_len ? miter->second->len : con_len;
    size_t kmer_len = set_kmer_len ? 19 : global_kmer_len;

    //
    // Calculate the minimum number of matching k-mers required for a possible sequence match.
    // NOTE(review): this expression can go negative for short consensus sequences and
    // wraps to a huge value when stored in a uint (merge_remainders stores the analogous
    // value in an int) — confirm short-read inputs behave as intended.
    //
    uint min_hits = (round((double) con_len * min_match_len) - (kmer_len * max_gaps)) - kmer_len + 1;

    populate_kmer_hash(merged, kmer_map, kmer_map_keys, kmer_len);

    #pragma omp parallel private(tag_1, tag_2)
    {
        // Per-thread state: reusable query k-mer buffers and one aligner.
        KmerHashMap::iterator h;
        vector<char *> query_kmers;
        set<string> uniq_kmers;
        GappedAln *aln = new GappedAln(con_len);
        AlignRes a;

        size_t num_kmers = con_len - kmer_len + 1;
        initialize_kmers(kmer_len, num_kmers, query_kmers);

        #pragma omp for schedule(dynamic)
        for (uint i = 0; i < keys.size(); i++) {
            tag_1 = merged[keys[i]];

            //
            // Don't compute distances for masked tags.
            //
            if (tag_1->masked) continue;

            //
            // Don't compare tags that are already at or above max_locus_stacks.
            //
            if (tag_1->utags.size() >= max_subgraph)
                continue;

            if (tag_1->len < kmer_len) continue;
            num_kmers = tag_1->len - kmer_len + 1;
            generate_kmers_lazily(tag_1->con, kmer_len, num_kmers, query_kmers);

            assert(num_kmers > 0);

            // Deduplicate the query k-mers so repeats don't inflate hit counts.
            uniq_kmers.clear();
            for (uint j = 0; j < num_kmers; j++)
                uniq_kmers.insert(query_kmers[j]);

            vector<int> hits;
            //
            // Lookup the occurances of each k-mer in the kmer_map
            //
            for (auto j = uniq_kmers.begin(); j != uniq_kmers.end(); j++) {
                h = kmer_map.find(j->c_str());

                if (h != kmer_map.end())
                    for (uint k = 0; k < h->second.size(); k++)
                        hits.push_back(h->second[k]);
            }

            //
            // Sort the vector of indexes; provides the number of hits to each allele/locus
            // and orders them largest to smallest.
            //
            sort(hits.begin(), hits.end());

            //
            // Iterate through the list of hits and collapse them down by number of kmer hits per allele.
            //
            uint hit_cnt, index, prev_id, allele_id, hits_size, stop;
            vector<pair<uint, uint>> ordered_hits;

            hits_size = hits.size();

            if (hits_size == 0)
                continue;

            prev_id = hits[0];
            index   = 0;

            do {
                hit_cnt   = 0;
                allele_id = prev_id;

                // Count the run of identical ids (hits is sorted).
                while (index < hits_size && (uint) hits[index] == prev_id) {
                    hit_cnt++;
                    index++;
                }

                if (index < hits_size)
                    prev_id = hits[index];

                // Don't compare tag_1 against itself.
                // (continue in a do/while jumps to the loop condition, so iteration proceeds.)
                if (tag_1->id == (int) allele_id) continue;

                ordered_hits.push_back(make_pair(allele_id, hit_cnt));

            } while (index < hits_size);

            if (ordered_hits.size() == 0)
                continue;

            //
            // Process the hits from most kmer hits to least kmer hits.
            //
            sort(ordered_hits.begin(), ordered_hits.end(), compare_pair_intint);

            //
            // Align at least one locus, regardless of kmer count, or more than one locus if
            // there are sufficient k-mer hits supporting it.
            // NOTE(review): if every entry clears min_hits the loop never reassigns stop,
            // so only the single best hit is aligned — matches the "at least one" wording
            // but verify this is the intended cap.
            //
            stop = 1;
            for (uint j = 1; j < ordered_hits.size(); j++)
                if (ordered_hits[j].second < min_hits) {
                    stop = j;
                    break;
                }

            //
            // Iterate through the list of hits. For each hit that has more than min_hits
            // check its full length to verify a match.
            //
            for (uint j = 0; j < stop; j++) {
                tag_2 = merged.at(ordered_hits[j].first);

                // Don't compute distances for masked tags
                if (tag_2->masked) continue;

                //
                // Don't compare tags that are already at or above max_locus_stacks.
                //
                if (tag_2->utags.size() >= max_subgraph)
                    continue;

                aln->init(tag_1->len, tag_2->len);

                if (aln->align(tag_1->con, tag_2->con)) {
                    a = aln->result();
                    // Record the candidate; merge_gapped_alns() makes the final call.
                    tag_1->alns.push_back(Aln(tag_2->id, a.cigar, a.pct_id, a.gap_cnt));
                }
            }
        }

        //
        // Free the k-mers we generated for this query.
        //
        for (uint j = 0; j < query_kmers.size(); j++)
            delete [] query_kmers[j];
        query_kmers.clear();

        delete aln;
    }

    free_kmer_hash(kmer_map, kmer_map_keys);

    return 0;
}
678
//
// Assign secondary (remainder) reads to assembled loci by ungapped distance.
// Each remainder is k-mer-matched against the locus consensus hash; a read is
// merged only when exactly one locus is at the uniquely smallest distance
// <= max_rem_dist. Reads that look frame-shifted near the 3' end are queued
// on the locus (rem_queue) for the gapped pass instead.
// Returns the number of reads merged (weighted by Rem::count()).
//
size_t
merge_remainders(map<int, MergedStack *> &merged, map<int, Stack *> &unique, map<int, Rem *> &rem)
{
    //
    // OpenMP can't parallelize random access iterators, so we convert
    // our map to a vector of integer keys.
    //
    vector<int> keys;
    size_t max_rem_len = 0;
    for (auto it = rem.begin(); it != rem.end(); it++) {
        keys.push_back(it->first);
        max_rem_len = it->second->seq->size() > max_rem_len ? it->second->seq->size() : max_rem_len;
    }

    //
    // Calculate the number of k-mers we will generate based on the longest sequence present.
    // If kmer_len == 0, determine the optimal length for k-mers.
    //
    size_t con_len = 0;
    for (auto miter = merged.begin(); miter != merged.end(); miter++)
        con_len = miter->second->len > con_len ? miter->second->len : con_len;
    size_t kmer_len = set_kmer_len ? determine_kmer_length(con_len, max_rem_dist) : global_kmer_len;

    //
    // Calculate the minimum number of matching k-mers required for a possible sequence match.
    //
    int min_hits = calc_min_kmer_matches(kmer_len, max_rem_dist, con_len, set_kmer_len ? true : false);

    //cerr << " Distance allowed between stacks: " << max_rem_dist
    // << "; searching with a k-mer length of " << kmer_len << " (" << num_kmers << " k-mers per read); "
    // << min_hits << " k-mer hits required.\n";

    KmerHashMap kmer_map;
    vector<char *> kmer_map_keys;
    populate_kmer_hash(merged, kmer_map, kmer_map_keys, kmer_len);
    size_t utilized = 0;

    // Each thread accumulates its own `utilized`; OpenMP sums them at the end.
    #pragma omp parallel reduction(+: utilized)
    {
        KmerHashMap::iterator h;
        vector<char *> rem_kmers;
        Cigar cigar;
        string seq;
        char *buf = new char[max_rem_len + 1];   // Per-thread scratch for the read sequence.
        size_t num_kmers = 0;

        #pragma omp for schedule(dynamic)
        for (uint j = 0; j < keys.size(); j++) {
            auto it = rem.find(keys[j]);
            Rem *r = it->second;

            //
            // Generate the k-mers for this remainder sequence
            //
            r->seq->seq(buf);
            if (r->seq->size() < kmer_len) continue;
            num_kmers = r->seq->size() - kmer_len + 1;
            generate_kmers_lazily(buf, kmer_len, num_kmers, rem_kmers);

            assert(num_kmers > 0);

            map<int, int> hits;   // locus id -> number of shared k-mers
            //
            // Lookup the occurances of each remainder k-mer in the MergedStack k-mer map
            //
            for (uint k = 0; k < num_kmers; k++) {
                h = kmer_map.find(rem_kmers[k]);

                if (h != kmer_map.end())
                    for (uint n = 0; n < h->second.size(); n++)
                        hits[h->second[n]]++;
            }

            //
            // Iterate through the list of hits. For each hit that has more than min_hits
            // check its full length to verify a match.
            //
            map<int, int> dists;   // locus id -> ungapped distance
            for (auto hit_it = hits.begin(); hit_it != hits.end(); hit_it++) {

                if (hit_it->second < min_hits)
                    continue;

                MergedStack *tag_1 = merged[hit_it->first];

                int d = dist(tag_1, buf);

                //
                // Store the distance between these two sequences if it is
                // below the maximum distance
                //
                if (d <= max_rem_dist) {
                    dists[hit_it->first] = d;
                }
            }

            //
            // Check to see if there is a uniquely low distance, if so,
            // merge this remainder tag. If not, discard it, since we
            // can't locate a single best-fitting Stack to merge it into.
            //
            map<int, int>::iterator s;
            int min_id = -1;
            int count  =  0;
            // Note: this local `dist` shadows the dist() function used above.
            int dist   = max_rem_dist + 1;

            for (s = dists.begin(); s != dists.end(); s++) {
                if ((*s).second < dist) {
                    min_id = (*s).first;
                    count  = 1;
                    dist   = (*s).second;
                } else if ((*s).second == dist) {
                    count++;
                }
            }

            //
            // Found a merge partner.
            //
            if (min_id >= 0 && count == 1) {
                MergedStack *tag_1 = merged.at(min_id);

                //
                // The max_rem_dist distance allows for the possibility of a frameshift smaller
                // or equal to max_rem_dist at the 3' end of the read. If we detect a possible
                // frameshift, queue the remainder read to re-align the read using the gapped
                // alignment algorithm.
                //
                if (check_frameshift(tag_1, buf, max_rem_dist)) {
                    // rem_queue is shared across threads; serialize the push.
                    #pragma omp critical
                    tag_1->rem_queue.push_back(r->id);
                    continue;
                }

                // r is owned by this iteration only, so these writes are race-free.
                r->utilized = true;
                utilized += r->count();

                // tag_1 may be targeted by several threads; serialize the update.
                #pragma omp critical
                {
                    tag_1->remtags.push_back(r->id);
                    tag_1->count += r->count();
                }
            }
        }

        //
        // Free the k-mers we generated for this remainder read
        //
        for (uint k = 0; k < rem_kmers.size(); k++)
            delete [] rem_kmers[k];

        delete [] buf;
    }

    free_kmer_hash(kmer_map, kmer_map_keys);

    return utilized;
}
837
//
// Queue still-unplaced secondary reads for gapped merging. Each unutilized
// remainder is k-mer-matched against the locus consensus hash, gap-aligned
// against every sufficiently-hit locus, and — if its best alignment passes
// the mismatch/gap/span thresholds — pushed onto that locus' rem_queue for
// merge_gapped_remainders() to process. (`unique` is currently unused here.)
// Always returns 0.
//
int
search_for_gapped_remainders(map<int, MergedStack *> &merged, map<int, Stack *> &unique, map<int, Rem *> &rem)
{
    //
    // Search for remainders that can be merged with a gapped alignment.
    //

    //
    // OpenMP can't parallelize random access iterators, so we convert
    // our map to a vector of integer keys.
    //
    vector<int> keys;
    size_t max_rem_len = 0;
    for (auto it = rem.begin(); it != rem.end(); it++) {
        keys.push_back(it->first);
        max_rem_len = it->second->seq->size() > max_rem_len ? it->second->seq->size() : max_rem_len;
    }

    //
    // Calculate the number of k-mers we will generate.
    //
    size_t con_len = 0;
    for (auto miter = merged.begin(); miter != merged.end(); miter++)
        con_len = miter->second->len > con_len ? miter->second->len : con_len;
    size_t kmer_len = set_kmer_len ? 19 : global_kmer_len;

    //
    // Calculate the minimum number of matching k-mers required for a possible sequence match.
    //
    int min_hits = (round((double) con_len * min_match_len) - (kmer_len * max_gaps)) - kmer_len + 1;

    //cerr << " Searching with a k-mer length of " << kmer_len << " (" << num_kmers << " k-mers per read); " << min_hits << " k-mer hits required.\n";

    KmerHashMap kmer_map;
    vector<char *> kmer_map_keys;
    populate_kmer_hash(merged, kmer_map, kmer_map_keys, kmer_len);

    #pragma omp parallel
    {
        // Per-thread state: k-mer buffers, one aligner, and a read buffer.
        KmerHashMap::iterator h;
        vector<char *> rem_kmers;
        set<string> uniq_kmers;
        GappedAln *aln = new GappedAln(con_len);
        AlignRes a;
        string seq, buf;
        char *rem_buf = new char[max_rem_len + 1];
        size_t num_kmers = 0;

        #pragma omp for schedule(dynamic)
        for (uint i = 0; i < keys.size(); i++) {
            auto it = rem.find(keys[i]);
            Rem *r = it->second;

            // Skip reads already merged by the ungapped pass.
            if (r->utilized) continue;

            //
            // Generate the k-mers for this remainder sequence
            //
            r->seq->seq(rem_buf);
            if (r->seq->size() < kmer_len) continue;
            num_kmers = r->seq->size() - kmer_len + 1;
            generate_kmers_lazily(rem_buf, kmer_len, num_kmers, rem_kmers);

            // Deduplicate so repeated k-mers don't inflate hit counts.
            uniq_kmers.clear();
            for (uint j = 0; j < num_kmers; j++)
                uniq_kmers.insert(rem_kmers[j]);

            assert(num_kmers > 0);

            map<int, int> hits;   // locus id -> number of shared k-mers
            //
            // Lookup the occurances of each remainder k-mer in the MergedStack k-mer map
            //
            for (set<string>::iterator j = uniq_kmers.begin(); j != uniq_kmers.end(); j++) {
                h = kmer_map.find(j->c_str());

                if (h != kmer_map.end())
                    for (uint k = 0; k < h->second.size(); k++)
                        hits[h->second[k]]++;
            }

            //
            // Iterate through the list of hits. For each hit that has more than min_hits
            // check its full length to verify a match.
            //
            vector<Aln> rem_alns;
            Cigar cigar;

            for (auto hit_it = hits.begin(); hit_it != hits.end(); hit_it++) {

                if (hit_it->second < min_hits) continue;

                MergedStack *tag_1 = merged[hit_it->first];

                // Don't compute distances for masked tags
                if (tag_1->masked) continue;

                //
                // Calculate the alignment.
                //
                aln->init(r->seq->size(), tag_1->len);

                if (aln->align(r->seq->seq().c_str(), tag_1->con)) {
                    a = aln->result();
                    rem_alns.push_back(Aln(tag_1->id, a.cigar, a.pct_id, a.gap_cnt));
                }
            }

            if (rem_alns.size() == 0) continue;

            //
            // Rank the alignments for this remainder read.
            //
            sort(rem_alns.begin(), rem_alns.end(), rank_alignments);

            // Only the single best-ranked alignment is considered.
            MergedStack *tag_1 = merged[rem_alns[0].id];
            parse_cigar(rem_alns[0].cigar.c_str(), cigar);

            //
            // Check that the alignment still contains fewer than
            // max_utag_dist mismatches.
            //
            if (dist(r->seq->seq().c_str(), tag_1->con, cigar) > max_utag_dist)
                continue;

            //
            // If the alignment has too many gaps, skip it.
            //
            if (rem_alns[0].gap_cnt > (max_gaps + 1))
                continue;

            //
            // If the alignment doesn't span enough of the two sequences, skip it.
            //
            if (rem_alns[0].pct_id < min_match_len)
                continue;

            // rem_queue is shared across threads; serialize the push.
            #pragma omp critical
            tag_1->rem_queue.push_back(r->id);
        }

        //
        // Free the k-mers we generated for this query.
        //
        for (uint j = 0; j < rem_kmers.size(); j++)
            delete [] rem_kmers[j];
        rem_kmers.clear();

        delete [] rem_buf;
        delete aln;
    }

    free_kmer_hash(kmer_map, kmer_map_keys);

    return 0;
}
994
995 size_t
merge_gapped_remainders(map<int,MergedStack * > & merged,map<int,Stack * > & unique,map<int,Rem * > & rem)996 merge_gapped_remainders(map<int, MergedStack *> &merged, map<int, Stack *> &unique, map<int, Rem *> &rem)
997 {
998 //
999 // OpenMP can't parallelize random access iterators, so we convert
1000 // our map to a vector of integer keys.
1001 //
1002 vector<int> keys;
1003 for (auto it = merged.begin(); it != merged.end(); it++)
1004 keys.push_back(it->first);
1005
1006 size_t utilized = 0;
1007
1008 #pragma omp parallel reduction(+: utilized)
1009 {
1010 Cigar cigar;
1011 GappedAln *aln = new GappedAln(merged[keys[0]]->len);
1012 AlignRes a;
1013 string buf, seq;
1014
1015 #pragma omp for schedule(dynamic)
1016 for (uint i = 0; i < keys.size(); i++) {
1017 MergedStack *tag_1 = merged.at(keys[i]);
1018
1019 //
1020 // Don't compute distances for masked tags.
1021 //
1022 if (tag_1->masked) continue;
1023
1024 for (uint i = 0; i < tag_1->rem_queue.size(); i++) {
1025 Rem *r = rem.at(tag_1->rem_queue[i]);
1026
1027 // //
1028 // // We want to align the last max_rem_dist * 2 nucleotides. If sequences are already the wrong length,
1029 // // adjust so the 3' ends are lined up properly in the alignment matrix.
1030 // //
1031 // diff = tag_1->len - r->seq->size();
1032 // q_start = r->seq->size() - (max_rem_dist * 2) - 1;
1033 // q_end = r->seq->size() - 1;
1034 // s_start = tag_1->len - (max_rem_dist * 2) - 1;
1035 // s_end = tag_1->len - 1;
1036
1037 // q_start = diff > 0 ? q_start + diff : q_start;
1038 // s_start = diff < 0 ? s_start + abs(diff) : s_start;
1039
1040 // cerr << "Consensus size: " << tag_1->len << "; remainder size: " << r->seq->size() << "\n"
1041 // << "Con seq: " << tag_1->con << " (" << strlen(tag_1->con) << ")\n"
1042 // << "Rem seq: " << r->seq->seq() << "\n"
1043 // << "q_start: " << q_start << "; q_end: " << q_end << "; s_start: " << s_start << "; s_end: " << s_end << "\n";
1044
1045 aln->init(r->seq->size(), tag_1->len);
1046
1047 if (aln->align(r->seq->seq().c_str(), tag_1->con)) {
1048 a = aln->result();
1049 parse_cigar(a.cigar.c_str(), cigar);
1050
1051 // cerr << "Consensus size: " << tag_1->len << "; remainder size: " << r->seq->size() << "\n"
1052 // << "Con seq: " << tag_1->con << " (" << strlen(tag_1->con) << ")\n"
1053 // << "Rem seq: " << r->seq->seq() << "\n"
1054 // << "Cigar: " << a.cigar << "\n";
1055
1056 //
1057 // Check that the alignment still contains fewer than
1058 // max_utag_dist mismatches.
1059 //
1060 if (dist(r->seq->seq().c_str(), tag_1->con, cigar) > max_utag_dist)
1061 continue;
1062
1063 //
1064 // If the alignment has too many gaps, skip it.
1065 //
1066 if (a.gap_cnt > (max_gaps + 1))
1067 continue;
1068
1069 //
1070 // If the alignment doesn't span enough of the two sequences, skip it.
1071 //
1072 if (a.pct_id < min_match_len)
1073 continue;
1074
1075 r->utilized = true;
1076 tag_1->count += r->count();
1077 utilized += r->count();
1078
1079 buf = r->seq->seq();
1080 seq = apply_cigar_to_seq(buf.c_str(), cigar);
1081 r->add_seq(seq.c_str());
1082 invert_cigar(cigar);
1083
1084 //
1085 // Record the gaps (but not soft-masked 3' regions.
1086 //
1087 uint pos = 0;
1088 for (uint j = 0; j < cigar.size(); j++) {
1089 if (cigar[j].first == 'I' || cigar[j].first == 'D') {
1090 tag_1->gaps.push_back(Gap(pos, pos + cigar[j].second - 1));
1091 }
1092 pos += cigar[j].second;
1093 }
1094 edit_gapped_seqs(unique, rem, tag_1, cigar);
1095
1096 //
1097 // Add this aligned remainder tag to the locus after adjusting the other reads but before adjusting the consensus.
1098 //
1099 tag_1->remtags.push_back(r->id);
1100 update_consensus(tag_1, unique, rem);
1101 }
1102 }
1103
1104 tag_1->rem_queue.clear();
1105 }
1106 }
1107
1108 return utilized;
1109 }
1110
1111 int
call_alleles(MergedStack * mtag,vector<DNANSeq * > & reads,vector<read_type> & read_types)1112 call_alleles(MergedStack *mtag, vector<DNANSeq *> &reads, vector<read_type> &read_types)
1113 {
1114 int row;
1115 int height = reads.size();
1116 string allele;
1117 DNANSeq *d;
1118 char base;
1119 vector<SNP *>::iterator snp;
1120
1121 for (row = 0; row < height; row++) {
1122 allele.clear();
1123
1124 uint snp_cnt = 0;
1125
1126 //
1127 // Only call a haplotype from primary reads.
1128 //
1129 if (!call_sec_hapl && read_types[row] == secondary) continue;
1130
1131 for (snp = mtag->snps.begin(); snp != mtag->snps.end(); snp++) {
1132 if ((*snp)->type != snp_type_het) continue;
1133
1134 snp_cnt++;
1135
1136 d = reads[row];
1137 base = (*d)[(*snp)->col];
1138
1139 //
1140 // Check to make sure the nucleotide at the location of this SNP is
1141 // of one of the two possible states the multinomial model called.
1142 //
1143 if (base == (*snp)->rank_1 || base == (*snp)->rank_2)
1144 allele += base;
1145 else
1146 break;
1147 }
1148
1149 if (snp_cnt > 0 && allele.length() == snp_cnt)
1150 mtag->alleles[allele]++;
1151 }
1152
1153 return 0;
1154 }
1155
//
// Call the consensus sequence for every locus in 'merged' and, when
// invoke_model is true, run the configured statistical model at each column
// and tally per-read haplotypes via call_alleles().
//
// For each MergedStack, the member reads (primary stacks from 'unique',
// remainder reads from 'rem') are stacked into a matrix; the consensus base
// at each column is the plurality nucleotide among {A,C,G,T}.
//
int
call_consensus(map<int, MergedStack *> &merged, map<int, Stack *> &unique, map<int, Rem *> &rem, bool invoke_model)
{
    //
    // OpenMP can't parallelize random access iterators, so we convert
    // our map to a vector of integer keys.
    //
    map<int, MergedStack *>::iterator it;
    vector<int> keys;
    for (it = merged.begin(); it != merged.end(); it++)
        keys.push_back(it->first);

    int i;
    #pragma omp parallel private(i)
    {
        MergedStack *mtag;
        Stack *utag;
        Rem *r;

        #pragma omp for schedule(dynamic)
        for (i = 0; i < (int) keys.size(); i++) {
            mtag = merged[keys[i]];

            //
            // Create a two-dimensional array, each row containing one read. For
            // each unique tag that has been merged together, add the sequence for
            // that tag into our array as many times as it originally occurred.
            //
            vector<int>::iterator j;
            vector<DNANSeq *> reads;
            vector<read_type> read_types;
            uint length = unique[mtag->utags.front()]->seq->size();

            for (j = mtag->utags.begin(); j != mtag->utags.end(); j++) {
                utag = unique[*j];

                for (uint k = 0; k < utag->count(); k++) {
                    reads.push_back(utag->seq);
                    read_types.push_back(primary);

                    // All member reads are expected to share the locus length.
                    assert(utag->seq->size() == length);
                }
            }

            // For each remainder tag that has been merged into this Stack, add the sequence.
            for (j = mtag->remtags.begin(); j != mtag->remtags.end(); j++) {
                r = rem[*j];

                for (uint k = 0; k < r->count(); k++) {
                    reads.push_back(r->seq);
                    read_types.push_back(secondary);

                    assert(r->seq->size() == length);
                }
            }

            //
            // Iterate over each column of the array and call the consensus base.
            //
            uint row, col;
            uint height = reads.size();
            string con;
            map<char, int> nuc;
            map<char, int>::iterator max;
            DNANSeq *d;

            // Index of the next gap to watch for; when there are no gaps the
            // sentinel value 1 (> gaps.size()) makes the in-gap test below
            // always false.
            uint cur_gap = mtag->gaps.size() > 0 ? 0 : 1;

            for (col = 0; col < length; col++) {
                nuc['A'] = 0;
                nuc['G'] = 0;
                nuc['C'] = 0;
                nuc['T'] = 0;

                for (row = 0; row < height; row++) {
                    d = reads[row];
                    // Only A/C/G/T are tallied; any other character ('N', '-')
                    // is simply ignored at this column.
                    if (nuc.count((*d)[col]))
                        nuc[(*d)[col]]++;
                }

                //
                // Find the base with a plurality of occurrences and call it.
                //
                max = nuc.end();

                for (auto n = nuc.begin(); n != nuc.end(); n++) {
                    if (max == nuc.end() || n->second > max->second)
                        max = n;
                }
                con += max->first;

                //
                // Search this column for the presence of a SNP
                //
                if (invoke_model) {
                    //
                    // Don't invoke the model within gaps: record an 'unknown'
                    // SNP placeholder so the SNP vector stays column-aligned.
                    //
                    if (cur_gap < mtag->gaps.size() &&
                        col >= mtag->gaps[cur_gap].start && col <= mtag->gaps[cur_gap].end) {
                        SNP *snp = new SNP;
                        snp->type = snp_type_unk;
                        snp->col = col;
                        snp->rank_1 = '-';
                        snp->rank_2 = '-';
                        mtag->snps.push_back(snp);

                        if (col == mtag->gaps[cur_gap].end)
                            cur_gap++;
                        continue;
                    }

                    // Dispatch to the model selected on the command line.
                    switch(model_type) {
                    case snp:
                        call_multinomial_snp(mtag, col, nuc, true);
                        break;
                    case bounded:
                        call_bounded_multinomial_snp(mtag, col, nuc, true);
                        break;
                    case ::fixed:
                        call_multinomial_fixed(mtag, col, nuc);
                        break;
                    default:
                        DOES_NOT_HAPPEN;
                        break;
                    }
                }
            }

            assert(con.length() == length);

            if (invoke_model) {
                // Each model call is expected to append exactly one SNP per column.
                assert(mtag->snps.size() == length);

                call_alleles(mtag, reads, read_types);

                if (model_type == ::fixed) {
                    //
                    // Mask nucleotides that are not fixed.
                    //
                    vector<SNP *>::iterator s;
                    for (s = mtag->snps.begin(); s != mtag->snps.end(); s++) {
                        if ((*s)->type == snp_type_unk)
                            con.replace((*s)->col, 1, "N");
                    }
                }
            }

            mtag->add_consensus(con.c_str());
        }
    }

    return 0;
}
1310
1311 int
update_consensus(MergedStack * mtag,map<int,Stack * > & unique,map<int,Rem * > & rem)1312 update_consensus(MergedStack *mtag, map<int, Stack *> &unique, map<int, Rem *> &rem)
1313 {
1314 Stack *utag;
1315 Rem *r;
1316 //
1317 // Create a two-dimensional array, each row containing one read. For
1318 // each unique tag that has been merged together, add the sequence for
1319 // that tag into our array as many times as it originally occurred.
1320 //
1321 vector<DNANSeq *> reads;
1322
1323 for (auto j = mtag->utags.begin(); j != mtag->utags.end(); j++) {
1324 utag = unique[*j];
1325
1326 for (uint k = 0; k < utag->count(); k++)
1327 reads.push_back(utag->seq);
1328 }
1329
1330 // For each remainder tag that has been merged into this Stack, add the sequence.
1331 for (auto j = mtag->remtags.begin(); j != mtag->remtags.end(); j++) {
1332 r = rem[*j];
1333
1334 for (uint k = 0; k < r->count(); k++)
1335 reads.push_back(r->seq);
1336 }
1337
1338 //
1339 // Iterate over each column of the array and call the consensus base.
1340 //
1341 string con;
1342 uint row, col;
1343 uint length = reads[0]->size();
1344 uint height = reads.size();
1345 DNANSeq *d;
1346
1347 vector<uint> nucs = {0, // A
1348 0, // C
1349 0, // G
1350 0, // T
1351 0}; // N
1352
1353 con.reserve(length);
1354
1355 for (col = 0; col < length; col++) {
1356 nucs.assign(5, 0);
1357
1358 for (row = 0; row < height; row++) {
1359 d = reads[row];
1360 switch ((*d)[col]) {
1361 case 'A':
1362 nucs[0]++;
1363 break;
1364 case 'C':
1365 nucs[1]++;
1366 break;
1367 case 'G':
1368 nucs[2]++;
1369 break;
1370 case 'T':
1371 nucs[3]++;
1372 break;
1373 case 'N':
1374 nucs[4]++;
1375 break;
1376 }
1377 }
1378
1379 //
1380 // Call the base with a plurality of occurances. Only call 'N' if there is no other information.
1381 //
1382 uint max = 0;
1383 char nuc = 'N';
1384
1385 for (uint i = 0; i < 4; i++) {
1386 switch (i) {
1387 case 0:
1388 if (nucs[i] > max) {
1389 max = nucs[i];
1390 nuc = 'A';
1391 }
1392 break;
1393 case 1:
1394 if (nucs[i] > max) {
1395 max = nucs[i];
1396 nuc = 'C';
1397 }
1398 break;
1399 case 2:
1400 if (nucs[i] > max) {
1401 max = nucs[i];
1402 nuc = 'G';
1403 }
1404 break;
1405 case 3:
1406 if (nucs[i] > max) {
1407 max = nucs[i];
1408 nuc = 'T';
1409 }
1410 break;
1411 }
1412 }
1413 con += nuc;
1414 }
1415
1416 mtag->add_consensus(con.c_str());
1417
1418 return 0;
1419 }
1420
1421 int
populate_merged_tags(map<int,Stack * > & unique,map<int,MergedStack * > & merged)1422 populate_merged_tags(map<int, Stack *> &unique, map<int, MergedStack *> &merged)
1423 {
1424 map<int, Stack *>::iterator i;
1425 map<int, MergedStack *>::iterator it_new, it_old;
1426 Stack *utag;
1427 MergedStack *mtag;
1428 int k = 0;
1429
1430 it_old = merged.begin();
1431
1432 for (i = unique.begin(); i != unique.end(); i++) {
1433 utag = (*i).second;
1434 mtag = new MergedStack;
1435
1436 mtag->id = k;
1437 mtag->count = utag->count();
1438 mtag->utags.push_back(utag->id);
1439 mtag->add_consensus(utag->seq);
1440
1441 // Insert the new MergedStack giving a hint as to which position
1442 // to insert it at.
1443 it_new = merged.insert(it_old, pair<int, MergedStack *>(k, mtag));
1444 it_old = it_new;
1445 k++;
1446 }
1447
1448 return 0;
1449 }
1450
1451 void
merge_stacks(map<int,MergedStack * > & merged,size_t & blist_cnt)1452 merge_stacks(map<int, MergedStack *> &merged, size_t& blist_cnt)
1453 {
1454 map<int, MergedStack *> new_merged;
1455 map<int, MergedStack *>::const_iterator it;
1456 map<int, MergedStack *>::iterator it_old, it_new;
1457 MergedStack *tag_1, *tag_2;
1458 set<int> merge_map;
1459 vector<set<int> > merge_lists;
1460
1461 uint index = 0;
1462 int cohort_id = 0;
1463 int id = 1;
1464 uint delev_cnt = 0;
1465 blist_cnt = 0;
1466
1467 for (it = merged.begin(); it != merged.end(); it++) {
1468 tag_1 = it->second;
1469
1470 //
1471 // This tag may already have been merged by an earlier operation.
1472 //
1473 if (merge_map.count(tag_1->id) > 0)
1474 continue;
1475
1476 queue<int> merge_list;
1477 pair<set<int>::iterator,bool> ret;
1478 vector<pair<int, int> >::iterator k;
1479
1480 merge_lists.push_back(set<int>());
1481
1482 if (tag_1->masked) {
1483 merge_lists[index].insert(tag_1->id);
1484 index++;
1485 continue;
1486 }
1487
1488 //
1489 // Construct a list of MergedStacks that are within a particular distance
1490 // of this tag.
1491 //
1492 merge_lists[index].insert(tag_1->id);
1493 merge_list.push(tag_1->id);
1494
1495 while (!merge_list.empty()) {
1496 tag_2 = merged[merge_list.front()];
1497 merge_list.pop();
1498
1499 for (k = tag_2->dist.begin(); k != tag_2->dist.end(); k++) {
1500 ret = merge_lists[index].insert(k->first);
1501
1502 //
1503 // If this Tag has not already been added to the merge list (i.e. we were able
1504 // to insert it in to our unique_merge_list, which is a set), add it for consideration
1505 // later in the loop.
1506 //
1507 if (ret.second == true)
1508 merge_list.push((*k).first);
1509 }
1510 }
1511
1512 //
1513 // Record the nodes that have been merged in this round.
1514 //
1515 set<int>::iterator j;
1516 for (j = merge_lists[index].begin(); j != merge_lists[index].end(); j++)
1517 merge_map.insert(*j);
1518
1519 index++;
1520 }
1521
1522 #pragma omp parallel private(tag_1, tag_2)
1523 {
1524 vector<MergedStack *> merged_tags;
1525
1526 #pragma omp for reduction(+:delev_cnt) reduction(+:blist_cnt)
1527 for (uint index = 0; index < merge_lists.size(); index++) {
1528 //
1529 // Deal with the simple case of a single locus that does not need to be merged.
1530 //
1531 if (merge_lists[index].size() == 1) {
1532 tag_1 = merged[*(merge_lists[index].begin())];
1533 tag_2 = merge_tags(merged, merge_lists[index], 0);
1534
1535 //
1536 // If this tag is masked, keep the old cohort_id.
1537 //
1538 if (tag_1->masked) {
1539 tag_2->cohort_id = tag_1->cohort_id;
1540 } else {
1541 tag_2->cohort_id = cohort_id;
1542 #pragma omp atomic
1543 cohort_id++;
1544 }
1545 merged_tags.push_back(tag_2);
1546 continue;
1547 }
1548
1549 //
1550 // Break large loci down by constructing a minimum
1551 // spanning tree and severing long distance edges.
1552 //
1553 if (deleverage_stacks) {
1554 vector<MergedStack *> tags;
1555 bool delev;
1556
1557 deleverage(merged, merge_lists[index], cohort_id, tags);
1558
1559 if (tags.size() == 1) {
1560 delev = false;
1561 } else {
1562 delev_cnt++;
1563 delev = true;
1564 }
1565
1566 for (uint t = 0; t < tags.size(); t++) {
1567 //tags[t]->id = id;
1568 tags[t]->deleveraged = delev;
1569
1570 if (tags[t]->utags.size() > max_subgraph) {
1571 tags[t]->masked = true;
1572 tags[t]->blacklisted = true;
1573 blist_cnt++;
1574 }
1575
1576 //new_merged.insert(pair<int, MergedStack *>(id, tags[t]));
1577 merged_tags.push_back(tags[t]);
1578 //id++;
1579 }
1580
1581 #pragma omp atomic
1582 cohort_id++;
1583
1584 } else {
1585 //
1586 // If not deleveraging, merge these tags together into a new MergedStack object.
1587 //
1588 tag_2 = merge_tags(merged, merge_lists[index], 0);
1589 tag_2->cohort_id = cohort_id;
1590
1591 if (tag_2->utags.size() > max_subgraph) {
1592 tag_2->masked = true;
1593 tag_2->blacklisted = true;
1594 blist_cnt++;
1595 }
1596
1597 //new_merged.insert(pair<int, MergedStack *>(id, tag_2));
1598 merged_tags.push_back(tag_2);
1599
1600 #pragma omp atomic
1601 cohort_id++;
1602 //id++;
1603 }
1604 }
1605
1606 //
1607 // Merge the accumulated tags into the new_merged map.
1608 //
1609 #pragma omp critical
1610 {
1611 it_old = merged.begin();
1612 for (uint j = 0; j < merged_tags.size(); j++) {
1613 merged_tags[j]->id = id;
1614 it_new = new_merged.insert(it_old, pair<int, MergedStack *>(id, merged_tags[j]));
1615 it_old = it_new;
1616 id++;
1617 }
1618 }
1619 }
1620
1621 if (new_merged.empty()) {
1622 cerr << "Error: Couldn't assemble any loci.\n";
1623 throw exception();
1624 }
1625
1626 //
1627 // Free the memory from the old map of merged tags.
1628 //
1629 for (it = merged.begin(); it != merged.end(); it++)
1630 delete it->second;
1631
1632 swap(merged, new_merged);
1633 }
1634
1635 MergedStack *
merge_tags(MergedStack * tag_1,MergedStack * tag_2,int id)1636 merge_tags(MergedStack *tag_1, MergedStack *tag_2, int id)
1637 {
1638 MergedStack *new_tag;
1639
1640 new_tag = new MergedStack;
1641 new_tag->id = id;
1642
1643 new_tag->deleveraged = tag_2->deleveraged || tag_1->deleveraged;
1644 new_tag->masked = tag_2->masked || tag_1->masked;
1645 new_tag->blacklisted = tag_2->blacklisted || tag_1->blacklisted;
1646 new_tag->gappedlumberjack = tag_2->gappedlumberjack || tag_1->gappedlumberjack;
1647 new_tag->lumberjackstack = tag_2->lumberjackstack || tag_1->lumberjackstack;
1648
1649 for (uint i = 0; i < tag_1->utags.size(); i++)
1650 new_tag->utags.push_back(tag_1->utags[i]);
1651
1652 for (uint i = 0; i < tag_1->remtags.size(); i++)
1653 new_tag->remtags.push_back(tag_1->remtags[i]);
1654
1655 for (uint i = 0; i < tag_1->gaps.size(); i++)
1656 new_tag->gaps.push_back(tag_1->gaps[i]);
1657
1658 new_tag->count = tag_1->count;
1659
1660 for (uint i = 0; i < tag_2->utags.size(); i++)
1661 new_tag->utags.push_back(tag_2->utags[i]);
1662
1663 for (uint i = 0; i < tag_2->remtags.size(); i++)
1664 new_tag->remtags.push_back(tag_2->remtags[i]);
1665
1666 for (uint i = 0; i < tag_2->gaps.size(); i++)
1667 new_tag->gaps.push_back(tag_2->gaps[i]);
1668
1669 new_tag->count += tag_2->count;
1670
1671 return new_tag;
1672 }
1673
merge_tags(map<int,MergedStack * > & merged,set<int> & merge_list,int id)1674 MergedStack *merge_tags(map<int, MergedStack *> &merged, set<int> &merge_list, int id) {
1675 MergedStack *tag_1, *tag_2;
1676
1677 tag_1 = new MergedStack;
1678 tag_1->id = id;
1679
1680 for (auto i = merge_list.begin(); i != merge_list.end(); i++) {
1681 tag_2 = merged[(*i)];
1682
1683 tag_1->deleveraged = tag_2->deleveraged ? true : tag_1->deleveraged;
1684 tag_1->masked = tag_2->masked ? true : tag_1->masked;
1685 tag_1->blacklisted = tag_2->blacklisted ? true : tag_1->blacklisted;
1686 tag_1->lumberjackstack = tag_2->lumberjackstack ? true : tag_1->lumberjackstack;
1687
1688 for (auto j = tag_2->utags.begin(); j != tag_2->utags.end(); j++)
1689 tag_1->utags.push_back(*j);
1690
1691 for (auto j = tag_2->remtags.begin(); j != tag_2->remtags.end(); j++)
1692 tag_1->remtags.push_back(*j);
1693
1694 for (auto j = tag_2->gaps.begin(); j != tag_2->gaps.end(); j++)
1695 tag_1->gaps.push_back(*j);
1696
1697 tag_1->count += tag_2->count;
1698 }
1699
1700 return tag_1;
1701 }
1702
merge_tags(map<int,MergedStack * > & merged,int * merge_list,int merge_list_size,int id)1703 MergedStack *merge_tags(map<int, MergedStack *> &merged, int *merge_list, int merge_list_size, int id) {
1704 int i;
1705 vector<int>::iterator j;
1706 MergedStack *tag_1, *tag_2;
1707
1708 tag_1 = new MergedStack;
1709 tag_1->id = id;
1710
1711 for (i = 0; i < merge_list_size; i++) {
1712 tag_2 = merged[merge_list[i]];
1713
1714 tag_1->deleveraged = tag_2->deleveraged ? true : tag_1->deleveraged;
1715 tag_1->masked = tag_2->masked ? true : tag_1->masked;
1716 tag_1->blacklisted = tag_2->blacklisted ? true : tag_1->blacklisted;
1717 tag_1->lumberjackstack = tag_2->lumberjackstack ? true : tag_1->lumberjackstack;
1718
1719 for (j = tag_2->utags.begin(); j != tag_2->utags.end(); j++)
1720 tag_1->utags.push_back(*j);
1721
1722 for (j = tag_2->remtags.begin(); j != tag_2->remtags.end(); j++)
1723 tag_1->remtags.push_back(*j);
1724
1725 for (auto j = tag_2->gaps.begin(); j != tag_2->gaps.end(); j++)
1726 tag_1->gaps.push_back(*j);
1727
1728 tag_1->count += tag_2->count;
1729 }
1730
1731 return tag_1;
1732 }
1733
size_t
remove_repetitive_stacks(map<int, MergedStack *> &merged)
{
    //
    // If enabled, check the depth of coverage of each unique tag, and remove
    // from consideration any tags with depths greater than removal_trigger. These tags
    // are likely to be multiple repetitive sites that have been merged together.
    // Because large stacks of unique tags are likely to also generate many one-off
    // sequencing error reads, remove all sequences that are a distance of one from
    // the RAD-Tag with high depth of coverage.
    //
    // Returns the number of stacks that were absorbed into lumberjack
    // (masked/blacklisted) loci, i.e. initial stack count minus the stacks
    // that passed through unmodified.
    //
    map<int, MergedStack *>::iterator i;
    vector<pair<int, int> >::iterator k;
    map<int, MergedStack *> new_merged;
    MergedStack *tag_1, *tag_2;
    set<int> already_merged;

    const size_t initial_stacks = merged.size();

    //
    // First, iterate through the stacks and populate a list of tags that will be removed
    // (those above the removal_trigger and those 1 nucleotide away). Sort the list of
    // stacks so that we process them from largest depth to shortest so the same stacks
    // are always merged/removed..
    //
    vector<pair<int, int> > ordered_tags;
    for (i = merged.begin(); i != merged.end(); i++) {
        if (i->second->count > removal_trigger)
            ordered_tags.push_back(make_pair(i->second->id, i->second->count));
    }
    sort(ordered_tags.begin(), ordered_tags.end(), compare_pair_intint);

    pair<set<int>::iterator,bool> ret;
    int id = 0;

    //
    // Merge all stacks that are over the removal trigger with their nearest neighbors and
    // mask them so they are not further processed by the program.
    //
    for (uint j = 0; j < ordered_tags.size(); j++) {
        tag_1 = merged[ordered_tags[j].first];

        //
        // Don't process a tag that has already been merged.
        //
        if (already_merged.count(tag_1->id) > 0)
            continue;

        //
        // Construct a list of MergedStacks that are either:
        //   within a distance of 1 nucleotide of this tag, or
        //   are they themselves above the lumberjack stacks limit.
        //
        queue<int> merge_queue;
        set<int> merge_list;
        merge_queue.push(tag_1->id);
        merge_list.insert(tag_1->id);
        already_merged.insert(tag_1->id);

        while (!merge_queue.empty()) {
            tag_2 = merged[merge_queue.front()];
            merge_queue.pop();

            // Only expand the neighborhood of tags that are themselves over
            // the trigger; their one-off neighbors are absorbed but not walked.
            if (tag_2->count < removal_trigger)
                continue;

            for (k = tag_2->dist.begin(); k != tag_2->dist.end(); k++) {
                ret = already_merged.insert(k->first);

                // Newly seen neighbor: absorb it and queue it for expansion.
                if (ret.second == true) {
                    merge_queue.push(k->first);
                    merge_list.insert(k->first);
                }
            }
        }

        //
        // Merge these tags together into a new MergedStack object.
        //
        tag_2 = merge_tags(merged, merge_list, id);
        tag_2->add_consensus(tag_1->con);

        tag_2->lumberjackstack = true;
        tag_2->masked          = true;
        tag_2->blacklisted     = true;

        new_merged.insert(make_pair(id, tag_2));
        id++;
    }

    //
    // Move the non-lumberjack stacks, unmodified, into the new merged map.
    //
    size_t remaining_stacks = 0;
    for (i = merged.begin(); i != merged.end(); i++) {
        tag_1 = i->second;

        if (already_merged.count(tag_1->id) > 0)
            continue;
        ++remaining_stacks;

        // Rebuild each surviving stack via a singleton merge so it gets a
        // fresh object keyed by the new sequential id.
        set<int> merge_list;
        merge_list.insert(tag_1->id);

        tag_2 = merge_tags(merged, merge_list, id);
        tag_2->add_consensus(tag_1->con);

        new_merged.insert(make_pair(id, tag_2));
        id++;
    }

    //
    // Free the memory from the old map of merged tags.
    //
    map<int, MergedStack *>::iterator it;
    for (it = merged.begin(); it != merged.end(); it++)
        delete it->second;

    merged.swap(new_merged);
    return initial_stacks - remaining_stacks;
}
1855
//
// Attempt to break an over-merged component apart into separate loci.
//
// A complete graph is built over the stacks in merge_list with pairwise
// sequence distances as edge weights, reduced to a minimum spanning tree, and
// then cut at every edge longer than the minimum edge length observed in the
// tree. Each resulting subtree is merged into one MergedStack and appended to
// deleveraged_tags (all receiving the supplied cohort_id).
//
int deleverage(map<int, MergedStack *> &merged,
               set<int> &merge_list,
               int cohort_id,
               vector<MergedStack *> &deleveraged_tags) {
    set<int>::iterator it;
    vector<pair<int, int> >::iterator j;
    MergedStack *tag_1, *tag_2;
    uint k, l;

    //
    // Create a minimum spanning tree in order to determine the minimum distance
    // between each node in the list.
    //
    mst::MinSpanTree *mst = new mst::MinSpanTree();
    vector<int> keys;

    for (it = merge_list.begin(); it != merge_list.end(); it++) {
        keys.push_back(*it);
        mst->add_node(*it);
        tag_1 = merged[*it];

        // cerr << "  " << *it << " -> " << tag_1->utags[0] << "\n";
    }

    //
    // Measure the distance between each pair of nodes and add edges to our
    // minimum spanning tree.
    //
    mst::Node *n_1, *n_2;
    for (k = 0; k < keys.size(); k++) {
        tag_1 = merged[keys[k]];
        n_1   = mst->node(keys[k]);

        for (l = k+1; l < keys.size(); l++) {
            tag_2 = merged[keys[l]];
            n_2   = mst->node(keys[l]);

            int d = dist(tag_1, tag_2);

            // Add the edge in both directions to keep the graph undirected.
            n_1->add_edge(mst->node(keys[l]), d);
            n_2->add_edge(mst->node(keys[k]), d);
        }
    }

    //
    // Build the minimum spanning tree.
    //
    mst->build_tree();

    //
    // Visualize the MST
    //
    if (dump_graph) {
        stringstream gout_file;
        gout_file << prefix_path << "_" << keys[0] << ".dot";
        string vis = mst->vis(true);
        ofstream gvis(gout_file.str().c_str());
        gvis << vis;
        gvis.close();
    }

    set<int> visited;
    set<int, int_increasing> dists;
    queue<mst::Node *> q;

    //
    // First pass: breadth-first walk of the MST collecting the set of distinct
    // edge lengths it contains.
    //
    mst::Node *n = mst->head();
    q.push(n);

    while (!q.empty()) {
        n = q.front();
        q.pop();
        visited.insert(n->id);

        for (uint i = 0; i < n->min_adj_list.size(); i++) {
            if (visited.count(n->min_adj_list[i]->id) == 0) {
                q.push(n->min_adj_list[i]);
                // cerr << n->id << " -> " << n->min_adj_list[i]->id << ": ";

                //
                // Find the edge distance.
                //
                for (uint j = 0; j < n->edges.size(); j++)
                    if (n->edges[j]->child == n->min_adj_list[i]) {
                        // cerr << n->edges[j]->dist << "\n";
                        dists.insert(n->edges[j]->dist);
                    }
            }
        }
    }

    //
    // This set is sorted by definition. Check if there is more than a single
    // distance separating stacks. If not, there is nothing to cut: merge the
    // whole component into one locus and return.
    //
    if (dists.size() == 1) {
        tag_1 = merge_tags(merged, merge_list, 0);
        deleveraged_tags.push_back(tag_1);
        delete mst;
        return 0;
    }

    uint min_dist = *(dists.begin());

    //
    // If there is more than a single distance, split the minimum spanning tree
    // into separate loci, by cutting the tree at the larger distances.
    // Second pass: re-walk the tree, flushing the accumulated uniq_merge_list
    // into a new locus every time an edge longer than min_dist is crossed.
    //
    set<int> uniq_merge_list;
    visited.clear();
    n = mst->head();
    q.push(n);
    int id = 0;

    uniq_merge_list.insert(n->id);
    while (!q.empty()) {
        n = q.front();
        q.pop();
        visited.insert(n->id);

        for (uint i = 0; i < n->min_adj_list.size(); i++) {
            if (visited.count(n->min_adj_list[i]->id) == 0) {
                q.push(n->min_adj_list[i]);

                for (uint j = 0; j < n->edges.size(); j++) {
                    if (n->edges[j]->child == n->min_adj_list[i])
                        if (n->edges[j]->dist > min_dist) {

                            // cerr << "Merging the following stacks into a locus:\n";
                            for (it = uniq_merge_list.begin(); it != uniq_merge_list.end(); it++) {
                                tag_1 = merged[*it];
                                // cerr << "  " << *it << " -> " << tag_1->utags[0] << "\n";
                            }

                            tag_1 = merge_tags(merged, uniq_merge_list, id);
                            tag_1->cohort_id = cohort_id;
                            deleveraged_tags.push_back(tag_1);
                            uniq_merge_list.clear();
                            id++;
                        }
                }

                uniq_merge_list.insert(n->min_adj_list[i]->id);
            }
        }
    }

    //
    // Flush the final group of stacks into the last locus.
    //
    // cerr << "Merging the following stacks into a locus:\n";
    for (it = uniq_merge_list.begin(); it != uniq_merge_list.end(); it++) {
        tag_1 = merged[*it];
        // cerr << "  " << *it << " -> " << tag_1->utags[0] << "\n";
    }

    tag_1 = merge_tags(merged, uniq_merge_list, id);
    tag_1->cohort_id = cohort_id;
    deleveraged_tags.push_back(tag_1);
    uniq_merge_list.clear();

    delete mst;

    return 0;
}
2017
//
// For every pair of loci within utag_dist mismatches of one another, record
// the distance in MergedStack::dist. Candidate pairs are found with a k-mer
// index rather than all-vs-all comparison: two sequences sharing at least
// min_hits k-mers are checked with a full-length distance computation.
//
int
calc_kmer_distance(map<int, MergedStack *> &merged, int utag_dist)
{
    //
    // Calculate the distance (number of mismatches) between each pair
    // of Radtags. We expect all radtags to be the same length;
    //
    KmerHashMap    kmer_map;
    vector<char *> kmer_map_keys;
    MergedStack   *tag_1, *tag_2;
    map<int, MergedStack *>::iterator it;

    // OpenMP can't parallelize random access iterators, so we convert
    // our map to a vector of integer keys.
    vector<int> keys;
    for (it = merged.begin(); it != merged.end(); it++)
        keys.push_back(it->first);

    //
    // Calculate the number of k-mers we will generate. If kmer_len == 0,
    // determine the optimal length for k-mers. con_len is the longest
    // consensus length present in the data set.
    //
    size_t con_len = 0;
    for (auto miter = merged.begin(); miter != merged.end(); miter++)
        con_len = miter->second->len > con_len ? miter->second->len : con_len;
    size_t kmer_len = set_kmer_len ? determine_kmer_length(con_len, utag_dist) : global_kmer_len;

    //
    // Calculate the minimum number of matching k-mers required for a possible sequence match.
    //
    int min_hits = calc_min_kmer_matches(kmer_len, utag_dist, con_len, set_kmer_len ? true : false);

    //cerr << "  Distance allowed between stacks: " << utag_dist
    //     << "; searching with a k-mer length of " << kmer_len << " (" << num_kmers << " k-mers per read); "
    //     << min_hits << " k-mer hits required.\n";

    populate_kmer_hash(merged, kmer_map, kmer_map_keys, kmer_len);

    #pragma omp parallel private(tag_1, tag_2)
    {
        KmerHashMap::iterator h;
        vector<char *> query_kmers;

        // Per-thread scratch buffers for the query k-mers, sized for the
        // longest consensus and reused (lazily regenerated) for each tag.
        size_t num_kmers = con_len - kmer_len + 1;
        initialize_kmers(kmer_len, num_kmers, query_kmers);

        #pragma omp for schedule(dynamic)
        for (uint i = 0; i < keys.size(); i++) {
            tag_1 = merged[keys[i]];

            // Don't compute distances for masked tags
            if (tag_1->masked) continue;

            // A consensus shorter than the k-mer length cannot be indexed.
            if (tag_1->len < kmer_len) continue;
            num_kmers = tag_1->len - kmer_len + 1;
            generate_kmers_lazily(tag_1->con, kmer_len, num_kmers, query_kmers);

            map<int, int> hits;
            int d;
            //
            // Lookup the occurances of each k-mer in the kmer_map; tally, per
            // candidate locus, how many of this tag's k-mers it shares.
            //
            for (uint j = 0; j < num_kmers; j++) {
                h = kmer_map.find(query_kmers[j]);

                if (h != kmer_map.end())
                    for (uint k = 0; k < h->second.size(); k++)
                        hits[h->second[k]]++;
            }

            //
            // Iterate through the list of hits. For each hit that has more than min_hits
            // check its full length to verify a match.
            //
            map<int, int>::iterator hit_it;
            for (hit_it = hits.begin(); hit_it != hits.end(); hit_it++) {

                if (hit_it->second < min_hits) continue;

                tag_2 = merged[hit_it->first];

                // Don't compute distances for masked tags
                if (tag_2->masked) continue;

                // Don't compare tag_1 against itself.
                if (tag_1 == tag_2) continue;

                d = dist(tag_1, tag_2);

                //
                // Check if any of the mismatches occur at the 3' end of the read. If they
                // do, they may indicate a frameshift is present at the 3' end of the read,
                // which will cause problems when we try to merge loci across samples.
                // If found, do not merge these tags, leave them for the gapped alignmnet
                // algorithm.
                //
                if (d <= utag_dist && check_frameshift(tag_1, tag_2, (size_t) utag_dist))
                    continue;

                //
                // Store the distance between these two sequences if it is
                // below the maximum distance. These sequences will then be
                // merged in the following step of the algorithm.
                //
                if (d <= utag_dist)
                    tag_1->add_dist(tag_2->id, d);
            }
            // Sort the vector of distances.
            sort(tag_1->dist.begin(), tag_1->dist.end(), compare_dist);
        }

        //
        // Free the k-mers we generated for this query
        //
        for (uint j = 0; j < query_kmers.size(); j++)
            delete [] query_kmers[j];
    }

    free_kmer_hash(kmer_map, kmer_map_keys);

    return 0;
}
2140
2141 //int calc_distance(map<int, MergedStack *> &merged, int utag_dist) {
2142 // //
2143 // // Calculate the distance (number of mismatches) between each pair
2144 // // of Radtags. We expect all radtags to be the same length;
2145 // //
2146 // map<int, MergedStack *>::iterator it;
2147 // MergedStack *tag_1, *tag_2;
2148 // int i, j;
2149 //
2150 // // OpenMP can't parallelize random access iterators, so we convert
2151 // // our map to a vector of integer keys.
2152 // vector<int> keys;
2153 // for (it = merged.begin(); it != merged.end(); it++)
2154 // keys.push_back(it->first);
2155 //
2156 // #pragma omp parallel private(i, j, tag_1, tag_2)
2157 // {
2158 // #pragma omp for schedule(dynamic)
2159 // for (i = 0; i < (int) keys.size(); i++) {
2160 //
2161 // tag_1 = merged[keys[i]];
2162 //
2163 // // Don't compute distances for masked tags
2164 // if (tag_1->masked) continue;
2165 //
2166 // int d;
2167 //
2168 // for (j = 0; j < (int) keys.size(); j++) {
2169 // tag_2 = merged[keys[j]];
2170 //
2171 // // Don't compute distances for masked tags
2172 // if (tag_2->masked) continue;
2173 //
2174 // // Don't compare tag_1 against itself.
2175 // if (tag_1 == tag_2) continue;
2176 //
2177 // d = dist(tag_1, tag_2);
2178 // //cerr << " Distance: " << d << "\n";
2179 //
2180 // //
2181 // // Store the distance between these two sequences if it is
2182 // // below the maximum distance (which governs those
2183 // // sequences to be merged in the following step of the
2184 // // algorithm.)
2185 // //
2186 // if (d == utag_dist) {
2187 // tag_1->add_dist(tag_2->id, d);
2188 // //cerr << " HIT.\n";
2189 // }
2190 // }
2191 //
2192 // // Sort the vector of distances.
2193 // sort(tag_1->dist.begin(), tag_1->dist.end(), compare_dist);
2194 // }
2195 // }
2196 //
2197 // return 0;
2198 //}
2199
reduce_radtags(DNASeqHashMap & radtags,map<int,Stack * > & unique,map<int,Rem * > & rem,size_t & n_u_reads,size_t & n_r_reads)2200 void reduce_radtags(
2201 DNASeqHashMap &radtags, map<int, Stack *> &unique,
2202 map<int, Rem *> &rem,
2203 size_t& n_u_reads,
2204 size_t& n_r_reads
2205 ) {
2206 n_u_reads = 0;
2207 n_r_reads = 0;
2208 DNASeqHashMap::iterator it;
2209
2210 Rem *r;
2211 Stack *u;
2212 int global_id = 1;
2213
2214 for (it = radtags.begin(); it != radtags.end(); it++) {
2215 if (it->second.count() < min_merge_cov) {
2216 //
2217 // Don't record this unique RAD-Tag if its coverage is below
2218 // the specified cutoff. However, add the reads to the remainder
2219 // vector for later processing.
2220 //
2221 r = new Rem;
2222 r->id = global_id;
2223 r->add_seq(&it->first);
2224
2225 for (uint i = 0; i < it->second.ids.size(); i++)
2226 r->add_id(it->second.ids[i]);
2227 n_r_reads += r->map.size();
2228
2229 rem[r->id] = r;
2230
2231 } else {
2232 //
2233 // Populate a Stack object for this unique radtag. Create a
2234 // map of the IDs for the sequences that have been
2235 // collapsed into this radtag.
2236 //
2237 u = new Stack;
2238 u->id = global_id;
2239 u->add_seq(&it->first);
2240
2241 // Copy the original Fastq IDs from which this unique radtag was built.
2242 for (uint i = 0; i < it->second.ids.size(); i++)
2243 u->add_id(it->second.ids[i]);
2244 n_u_reads += u->map.size();
2245
2246 unique[u->id] = u;
2247 }
2248 if (global_id == INT_MAX)
2249 throw std::overflow_error("overflow in reduce_radtags()");
2250 ++global_id;
2251 }
2252
2253 if (unique.size() == 0) {
2254 cerr << "Error: Unable to form any primary stacks.\n";
2255 exit(1);
2256 }
2257 }
2258
2259 void
calc_coverage_distribution(map<int,MergedStack * > & merged,double & mean,double & stdev,double & max,double & tot_depth)2260 calc_coverage_distribution(map<int, MergedStack *> &merged,
2261 double &mean, double &stdev, double &max, double &tot_depth)
2262 {
2263 max = 0.0;
2264 mean = 0.0;
2265 stdev = 0.0;
2266 tot_depth = 0.0;
2267
2268 size_t not_blacklisted = 0;
2269 for (const pair<int, MergedStack*>& mtag : merged) {
2270 if (mtag.second->blacklisted)
2271 continue;
2272 ++not_blacklisted;
2273
2274 double depth = mtag.second->count;
2275 tot_depth += depth;
2276 if (depth > max)
2277 max = depth;
2278 }
2279 mean = tot_depth / not_blacklisted;
2280
2281 for (const pair<int, MergedStack*>& mtag : merged)
2282 if (!mtag.second->blacklisted)
2283 stdev += pow(mtag.second->count - mean, 2);
2284 stdev /= not_blacklisted;
2285 stdev = sqrt(stdev);
2286 }
2287
//
// Write the assembled loci to the output files:
//   <prefix>.tags.tsv    -- consensus/model/primary/secondary rows per locus
//   <prefix>.snps.tsv    -- per-nucleotide model calls
//   <prefix>.alleles.tsv -- haplotypes and their frequencies
// Files are gzip-compressed when the input was gzipped. Always returns 0.
//   m -- merged stacks (loci), keyed by locus ID.
//   u -- primary (unique) stacks, keyed by stack ID.
//   r -- remainder (secondary) reads, keyed by ID.
//
int
write_results(map<int, MergedStack *> &m, map<int, Stack *> &u, map<int, Rem *> &r)
{
    map<int, MergedStack *>::iterator i;
    vector<int>::iterator k;
    vector<SNP *>::iterator s;
    map<string, int>::iterator t;
    MergedStack *tag_1;
    Stack *tag_2;
    Rem *rem;
    char strbuf32[32];

    // Mirror the input compression in the output.
    bool gzip = (in_file_type == FileT::gzfastq || in_file_type == FileT::gzfasta) ? true : false;

    //
    // Read in the set of sequencing IDs so they can be included in the output.
    //
    vector<char *> seq_ids;
    load_seq_ids(seq_ids);

    //
    // Create the output file names.
    //
    string tag_file = prefix_path + ".tags.tsv";
    string snp_file = prefix_path + ".snps.tsv";
    string all_file = prefix_path + ".alleles.tsv";
    if (gzip) {
        tag_file += ".gz";
        snp_file += ".gz";
        all_file += ".gz";
    }

    //
    // Open the output files for writing.
    //
    VersatileWriter tags{tag_file}, snps{snp_file}, alle{all_file};

    //
    // Record the version of Stacks used and the date generated as a comment in the catalog.
    //
    // Obtain the current date.
    //
    stringstream log;
    time_t rawtime;
    struct tm *timeinfo;
    char date[32];
    time(&rawtime);
    timeinfo = localtime(&rawtime);
    strftime(date, 32, "%F %T", timeinfo);
    log << "# ustacks version " << VERSION << "; generated on " << date << "\n";
    tags << log.str();
    snps << log.str();
    alle << log.str();

    for (i = m.begin(); i != m.end(); i++) {
        float total = 0;
        tag_1 = i->second;

        //
        // Calculate the log likelihood of this merged stack.
        //
        tag_1->gen_matrix(u, r);
        tag_1->calc_likelihood();

        // First write the consensus sequence
        tags << sample_id << "\t"
             << tag_1->id << "\t"
             << "consensus\t" << "\t"
             << "\t"
             << tag_1->con << "\t"
             << tag_1->deleveraged << "\t"
             << tag_1->blacklisted << "\t"
             << tag_1->lumberjackstack << "\n";

        //
        // Write a sequence recording the output of the SNP model for each nucleotide.
        // Legend: E = heterozygous, O = homozygous, U = unknown/uncalled.
        //
        tags << sample_id << "\t"
             << tag_1->id << "\t"
             << "model\t" << "\t"
             << "\t";
        for (s = tag_1->snps.begin(); s != tag_1->snps.end(); s++) {
            switch((*s)->type) {
            case snp_type_het:
                tags << "E";
                break;
            case snp_type_hom:
                tags << "O";
                break;
            default:
                tags << "U";
                break;
            }
        }
        tags << "\t\t\t\n";

        //
        // Now write out the components of each unique tag merged into this locus.
        //
        int id = 0;
        for (k = tag_1->utags.begin(); k != tag_1->utags.end(); k++) {
            tag_2 = u[*k];
            total += tag_2->count();

            for (uint j = 0; j < tag_2->map.size(); j++) {
                tags << sample_id << "\t"
                     << tag_1->id << "\t"
                     << "primary\t"
                     << id << "\t"
                     << seq_ids[tag_2->map[j]] << "\t"
                     << tag_2->seq->seq()
                     << "\t\t\t\n";
            }
            id++;
        }

        //
        // Write out the remainder tags merged into this unique tag.
        //
        for (k = tag_1->remtags.begin(); k != tag_1->remtags.end(); k++) {
            rem = r[*k];
            total += rem->map.size();
            for (uint j = 0; j < rem->map.size(); j++)
                tags << sample_id << "\t"
                     << tag_1->id << "\t"
                     << "secondary\t"
                     << "\t"
                     << seq_ids[rem->map[j]] << "\t"
                     << rem->seq->seq()
                     << "\t\t\t\n";
        }

        //
        // Write out the model calls for each nucleotide in this locus.
        //
        for (s = tag_1->snps.begin(); s != tag_1->snps.end(); s++) {
            snps << sample_id << "\t"
                 << tag_1->id << "\t"
                 << (size_t) (*s)->col << "\t";
            switch((*s)->type) {
            case snp_type_het:
                snps << "E\t";
                break;
            case snp_type_hom:
                snps << "O\t";
                break;
            default:
                snps << "U\t";
                break;
            }
            // NOTE(review): fixed 32-byte buffer assumes lratio stays small
            // enough for "%.2f" -- confirm the model bounds the ratio.
            sprintf(strbuf32, "%.2f", (*s)->lratio);
            snps << strbuf32 << "\t"
                 << (*s)->rank_1 << "\t"
                 << (*s)->rank_2 << "\t\t\n";
        }

        //
        // Write the expressed alleles seen for the recorded SNPs and
        // the percentage of tags a particular allele occupies.
        //
        for (t = tag_1->alleles.begin(); t != tag_1->alleles.end(); t++) {
            alle << sample_id << "\t"
                 << tag_1->id << "\t"
                 << (*t).first << "\t";
            sprintf(strbuf32, "%.2f", ((*t).second/total) * 100);
            alle << strbuf32 << "\t"
                 << (*t).second << "\n";
        }
    }
    tags.close();
    snps.close();
    alle.close();

    //
    // Free sequence IDs. (This loop's `i` shadows the map iterator declared
    // above; the iterator is no longer used at this point.)
    //
    for (uint i = 0; i < seq_ids.size(); i++)
        delete [] seq_ids[i];

    //
    // If specified, output reads not utilized in any stacks.
    //
    if (retain_rem_reads) {
        string unused_file = prefix_path + ".unused.fa";
        VersatileWriter unused{unused_file};

        map<int, Rem *>::iterator r_it;
        for (r_it = r.begin(); r_it != r.end(); r_it++)
            if (r_it->second->utilized == false)
                unused << ">" << (long) r_it->second->id << "\n" << r_it->second->seq->seq() << "\n";
        unused.close();
    }

    return 0;
}
2483
//
// Write a Graphviz DOT description of the stack graph to `data_file` so the
// clustering of stacks can be visualized (http://www.graphviz.org/).
//   unique      -- primary stacks, used to label nodes with an example read ID.
//   merged      -- merged stacks providing depth counts.
//   keys        -- IDs of the merged stacks to include in the graph.
//   dist_map    -- pairwise distances between the keyed stacks (edge weights).
//   cluster_map -- cluster assignments; each cluster becomes a colored subgraph.
// Always returns 0.
//
int dump_stack_graph(string data_file,
                     map<int, Stack *> &unique,
                     map<int, MergedStack *> &merged,
                     vector<int> &keys,
                     map<int, map<int, double> > &dist_map,
                     map<int, set<int> > &cluster_map) {
    uint s, t;
    double d, scale, scaled_d;
    char label[32];
    vector<string> colors;
    ofstream data(data_file.c_str());

    // Derive the graph title from the file name, stripped of path and extension.
    size_t pos_1 = data_file.find_last_of("/");
    size_t pos_2 = data_file.find_last_of(".");
    string title = data_file.substr(pos_1 + 1, pos_2 - pos_1 - 1);

    //
    // Output a list of IDs so we can locate these stacks in the final results.
    //
    for (s = 0; s < keys.size(); s++)
        data << "/* " << keys[s] << ": " << unique[merged[keys[s]]->utags[0]]->map[0] << "; depth: " << merged[keys[s]]->count << " */\n";

    //
    // Output a specification to visualize the stack graph using graphviz:
    // http://www.graphviz.org/
    //
    data << "graph " << title.c_str() << " {\n"
         << "rankdir=LR\n"
         << "size=\"20!\"\n"
         << "overlap=false\n"
         << "node [shape=circle style=filled fillcolor=\"#3875d7\" fontname=\"Arial\"];\n"
         << "edge [fontsize=8.0 fontname=\"Arial\" color=\"#aaaaaa\"];\n";

    // Node fill colors cycled across multi-member clusters.
    colors.push_back("red");
    colors.push_back("blue");
    colors.push_back("green");
    colors.push_back("brown");
    colors.push_back("purple");

    map<int, set<int> >::iterator c;
    set<int>::iterator it;
    int color_index = 0;
    string color;

    // Write out the clusters created by R, prior to writing all the nodes and connections.
    s = 0;
    for (c = cluster_map.begin(); c != cluster_map.end(); c++) {
        data << "subgraph " << s << " {\n"
             << "    edge [penwidth=5 fontsize=12.0 fontcolor=\"black\" color=\"black\"]\n";

        // Singleton clusters are drawn white; larger clusters cycle through
        // the color palette above.
        if ((*c).second.size() == 1) {
            color = "white";
            data << "    node [fillcolor=" << color.c_str() << " fontcolor=\"black\"]\n";
        } else {
            color = colors[color_index % colors.size()];
            data << "    node [fillcolor=" << color.c_str() << " fontcolor=\"white\"]\n";
            color_index++;
        }

        for (it = (*c).second.begin(); it != (*c).second.end(); it++) {
            data << "    " << *it << "\n";
        }

        // Chain the cluster members together with bold edges: a -- b -- c ...
        if ((*c).second.size() > 1) {
            uint j = 0;
            for (it = (*c).second.begin(); it != (*c).second.end(); it++) {
                data << *it;
                if (j < (*c).second.size() - 1)
                    data << " -- ";
                j++;
            }
        }

        data << "}\n";
        s++;
    }

    //
    // Scale the graph to display on a 10 inch canvas. Find the largest edge weight
    // and scale the edge lengths to fit the canvas.
    //
    // NOTE(review): if every pairwise distance is zero, `scale` is 0 and the
    // division below yields inf/nan edge lengths -- confirm callers guarantee
    // at least one positive distance.
    //
    scale = 0.0;
    for (s = 0; s < keys.size(); s++)
        for (t = s+1; t < keys.size(); t++)
            scale = dist_map[keys[s]][keys[t]] > scale ? dist_map[keys[s]][keys[t]] : scale;
    scale = scale / 20;

    for (s = 0; s < keys.size(); s++) {
        for (t = s+1; t < keys.size(); t++) {
            d = dist_map[keys[s]][keys[t]];
            scaled_d = d / scale;
            // Enforce a minimum edge length so nodes do not overlap.
            scaled_d = scaled_d < 0.75 ? 0.75 : scaled_d;
            sprintf(label, "%.1f", d);
            data << keys[s] << " -- " << keys[t] << " [len=" << scaled_d << ", label=" << label << "];\n";
        }
    }

    data << "}\n";

    data.close();

    return 0;
}
2587
dump_unique_tags(map<int,Stack * > & u)2588 int dump_unique_tags(map<int, Stack *> &u) {
2589 map<int, Stack *>::iterator it;
2590 vector<pair<int, int> >::iterator pit;
2591 vector<int>::iterator mit;
2592
2593 for (it = u.begin(); it != u.end(); it++) {
2594 cerr << "UniqueTag UID: " << (*it).second->id << "\n"
2595 << " Seq: " << it->second->seq->seq() << "\n"
2596 << " IDs: ";
2597
2598 for (uint j = 0; j < it->second->map.size(); j++)
2599 cerr << it->second->map[j] << " ";
2600
2601 cerr << "\n\n";
2602 }
2603
2604 return 0;
2605 }
2606
dump_merged_tags(map<int,MergedStack * > & m)2607 int dump_merged_tags(map<int, MergedStack *> &m) {
2608 map<int, MergedStack *>::iterator it;
2609 vector<pair<int, int> >::iterator pit;
2610 vector<int>::iterator fit;
2611
2612 for (it = m.begin(); it != m.end(); it++) {
2613
2614 cerr << "MergedStack ID: " << it->second->id << "\n"
2615 << " Consensus: ";
2616 if (it->second->con != NULL)
2617 cerr << it->second->con << "\n";
2618 else
2619 cerr << "\n";
2620 cerr << " IDs: ";
2621
2622 for (fit = it->second->utags.begin(); fit != it->second->utags.end(); fit++)
2623 cerr << (*fit) << " ";
2624
2625 cerr << "\n"
2626 << " Distances: ";
2627
2628 for (pit = it->second->dist.begin(); pit != it->second->dist.end(); pit++)
2629 cerr << (*pit).first << ": " << (*pit).second << ", ";
2630
2631 cerr << "\n\n";
2632 }
2633
2634 return 0;
2635 }
2636
//
// Read all sequences from `in_file` (fasta/fastq, optionally gzipped) into
// the `radtags` hash, collapsing identical reads into one entry and recording
// each read's index. Uncalled bases are converted to 'A'. Reports the total
// number of reads via `n_reads`. Exits on empty input or (unless
// --force-diff-len) on reads of differing lengths.
//
void load_radtags(string in_file, DNASeqHashMap &radtags, size_t& n_reads) {
    n_reads = 0;

    // Instantiate the parser matching the detected input format.
    Input *fh = NULL;

    if (in_file_type == FileT::fasta)
        fh = new Fasta(in_file.c_str());
    else if (in_file_type == FileT::fastq)
        fh = new Fastq(in_file.c_str());
    else if (in_file_type == FileT::gzfasta)
        fh = new GzFasta(in_file.c_str());
    else if (in_file_type == FileT::gzfastq)
        fh = new GzFastq(in_file.c_str());

    long int corrected = 0;   // count of uncalled bases converted to 'A'
    size_t i = 0;             // reads processed so far
    // NOTE(review): seql is a short int, so reads longer than SHRT_MAX bases
    // would overflow -- presumably fine for short-read data; confirm.
    short int seql = 0;
    short int prev_seql = 0;
    bool len_mismatch = false;

    // Reusable record buffers for the parser.
    Seq c;
    c.id = new char[id_len];
    c.seq = new char[max_len];
    c.qual = new char[max_len];

    while ((fh->next_seq(c)) != 0) {
        // Progress indicator, one tick per million reads.
        if (i % 1000000 == 0 && i>0)
            cerr << i/1000000 << "M..." << flush;

        prev_seql = seql;
        seql = 0;

        // Measure the read length and convert uncalled bases ('N', 'n', '.')
        // to 'A'; the three cases intentionally share one body.
        for (char *p = c.seq; *p != '\0'; p++, seql++)
            switch (*p) {
            case 'N':
            case 'n':
            case '.':
                *p = 'A';
                corrected++;
            }

        // Flag any change in read length versus the previous read.
        if (seql != prev_seql && prev_seql > 0)
            len_mismatch = true;

        // Insert (or find) the collapsed sequence and record this read's index.
        DNASeqHashMap::iterator element = radtags.insert({DNANSeq(seql, c.seq), HVal()}).first;
        element->second.add_id(i);
        i++;
    }
    cerr << "\n";

    if (i == 0) {
        cerr << "Error: Unable to load data from '" << in_file.c_str() << "'.\n";
        exit(1);
    }
    if (len_mismatch && !force_diff_len) {
        cerr << "Error: different sequence lengths detected, this will interfere with Stacks "
             << "algorithms, trim reads to uniform length (override this check with --force-diff-len).\n";
        exit(1);
    } else if (force_diff_len) {
        cerr << "Warning: different sequence lengths detected, this could interfere with Stacks algorithms.\n";
    }
    if (corrected > 0)
        cerr << "Warning: Input reads contained " << corrected << " uncalled nucleotides.\n";

    delete fh;
    n_reads = i;
}
2704
2705 int
load_seq_ids(vector<char * > & seq_ids)2706 load_seq_ids(vector<char *> &seq_ids)
2707 {
2708 Input *fh = NULL;
2709
2710 if (in_file_type == FileT::fasta)
2711 fh = new Fasta(in_file.c_str());
2712 else if (in_file_type == FileT::fastq)
2713 fh = new Fastq(in_file.c_str());
2714 else if (in_file_type == FileT::gzfasta)
2715 fh = new GzFasta(in_file.c_str());
2716 else if (in_file_type == FileT::gzfastq)
2717 fh = new GzFastq(in_file.c_str());
2718
2719 cerr << "Refetching read IDs...";
2720
2721 char *id;
2722 Seq c;
2723 c.id = new char[id_len];
2724 c.seq = new char[max_len];
2725 c.qual = new char[max_len];
2726
2727 while ((fh->next_seq(c)) != 0) {
2728 id = new char[strlen(c.id) + 1];
2729 strcpy(id, c.id);
2730 seq_ids.push_back(id);
2731 }
2732
2733 delete fh;
2734 return 0;
2735 }
2736
//
// Compute i! as a long double (useful as input to likelihood calculations;
// exact only up to the long double mantissa).
//
// All non-positive inputs yield the empty product, 1. The previous do-while
// implementation multiplied once even for negative i, returning a
// meaningless negative value.
//
long double factorial(int i) {
    long double f = 1.0L;

    for (int k = 2; k <= i; k++)
        f *= k;

    return f;
}
2749
parse_command_line(int argc,char * argv[])2750 int parse_command_line(int argc, char* argv[]) {
2751 string out_path;
2752 string sample_name;
2753
2754 int c;
2755 while (1) {
2756 static struct option long_options[] = {
2757 {"help", no_argument, NULL, 'h'},
2758 {"version", no_argument, NULL, 'v'},
2759 {"infile-type", required_argument, NULL, 't'}, {"infile_type", required_argument, NULL, 't'},
2760 {"file", required_argument, NULL, 'f'},
2761 {"outpath", required_argument, NULL, 'o'},
2762 {"name", required_argument, NULL, 1002},
2763 {"id", required_argument, NULL, 'i'},
2764 {"min-cov", required_argument, NULL, 'm'}, {"min_cov", required_argument, NULL, 'm'},
2765 {"max-dist", required_argument, NULL, 'M'}, {"max_dist", required_argument, NULL, 'M'},
2766 {"max-sec-dist", required_argument, NULL, 'N'}, {"max_sec_dist", required_argument, NULL, 'N'},
2767 {"max-locus-stacks", required_argument, NULL, 'K'}, {"max_locus_stacks", required_argument, NULL, 'K'},
2768 {"k-len", required_argument, NULL, 'k'}, {"k_len", required_argument, NULL, 'k'},
2769 {"num-threads", required_argument, NULL, 'p'}, {"num_threads", required_argument, NULL, 'p'},
2770 {"deleverage", no_argument, NULL, 'd'},
2771 {"keep-high-cov", no_argument, NULL, 1000}, {"keep_high_cov", no_argument, NULL, 1000},
2772 {"high-cov-thres", required_argument, NULL, 1001}, {"high_cov_thres", required_argument, NULL, 1001},
2773 {"retain-rem", no_argument, NULL, 'R'}, {"retain_rem", no_argument, NULL, 'R'},
2774 {"graph", no_argument, NULL, 'g'},
2775 {"sec-hapl", no_argument, NULL, 'H'}, {"sec_hapl", no_argument, NULL, 'H'},
2776 {"disable-gapped", no_argument, NULL, 'G'},
2777 {"max-gaps", required_argument, NULL, 'X'}, {"max_gaps", required_argument, NULL, 'X'},
2778 {"min-aln-len", required_argument, NULL, 'x'}, {"min_aln_len", required_argument, NULL, 'x'},
2779 {"model-type", required_argument, NULL, 'T'}, {"model_type", required_argument, NULL, 'T'},
2780 {"bc-err-freq", required_argument, NULL, 'e'}, {"bc_err_freq", required_argument, NULL, 'e'},
2781 {"bound-low", required_argument, NULL, 'L'}, {"bound_low", required_argument, NULL, 'L'},
2782 {"bound-high", required_argument, NULL, 'U'}, {"bound_high", required_argument, NULL, 'U'},
2783 {"alpha", required_argument, NULL, 'A'},
2784 {"r-deprecated", no_argument, NULL, 'r'},
2785 {"force-diff-len", no_argument, NULL, 1003}, {"force_diff_len", no_argument, NULL, 1003},
2786 {0, 0, 0, 0}
2787 };
2788
2789 // getopt_long stores the option index here.
2790 int option_index = 0;
2791
2792 c = getopt_long(argc, argv, "GhHvdrgRA:L:U:f:o:s:i:m:e:p:t:M:N:K:k:T:X:x:", long_options, &option_index);
2793
2794 // Detect the end of the options.
2795 if (c == -1)
2796 break;
2797
2798 switch (c) {
2799 case 'h':
2800 help();
2801 break;
2802 case 't':
2803 if (strcmp(optarg, "tsv") == 0)
2804 in_file_type = FileT::tsv;
2805 else if (strcmp(optarg, "fasta") == 0)
2806 in_file_type = FileT::fasta;
2807 else if (strcmp(optarg, "fastq") == 0)
2808 in_file_type = FileT::fastq;
2809 else if (strcasecmp(optarg, "gzfasta") == 0)
2810 in_file_type = FileT::gzfasta;
2811 else if (strcasecmp(optarg, "gzfastq") == 0)
2812 in_file_type = FileT::gzfastq;
2813 else
2814 in_file_type = FileT::unknown;
2815 break;
2816 case 'f':
2817 in_file = optarg;
2818 break;
2819 case 'o':
2820 out_path = optarg;
2821 break;
2822 case 1002:
2823 sample_name = optarg;
2824 break;
2825 case 'i':
2826 sample_id = is_integer(optarg);
2827 break;
2828 case 'm':
2829 min_merge_cov = is_integer(optarg);
2830 break;
2831 case 'M':
2832 max_utag_dist = is_integer(optarg);
2833 break;
2834 case 'N':
2835 max_rem_dist = is_integer(optarg);
2836 break;
2837 case 'd':
2838 deleverage_stacks++;
2839 break;
2840 case 1000: // keep_high_cov
2841 remove_rep_stacks = false;
2842 break;
2843 case 1001: // high_cov_thres
2844 removal_threshold = is_double(optarg);
2845 if (removal_threshold <= 0.0) {
2846 cerr << "Error: Bad threshold: '" << optarg << "'\n";
2847 help();
2848 }
2849 break;
2850 case 'K':
2851 max_subgraph = is_integer(optarg);
2852 break;
2853 case 'k':
2854 set_kmer_len = false;
2855 global_kmer_len = is_integer(optarg);
2856 break;
2857 case 'R':
2858 retain_rem_reads = true;
2859 break;
2860 case 'g':
2861 dump_graph++;
2862 break;
2863 case 'G':
2864 gapped_alignments = false;
2865 break;
2866 case 'X':
2867 max_gaps = is_double(optarg);
2868 break;
2869 case 'x':
2870 min_match_len = is_double(optarg);
2871 break;
2872 case 'T':
2873 if (strcmp(optarg, "snp") == 0) {
2874 model_type = snp;
2875 } else if (strcmp(optarg, "fixed") == 0) {
2876 model_type = ::fixed;
2877 } else if (strcmp(optarg, "bounded") == 0) {
2878 model_type = bounded;
2879 } else {
2880 cerr << "Unknown model type specified '" << optarg << "'\n";
2881 help();
2882 }
2883 break;;
2884 case 'e':
2885 barcode_err_freq = is_double(optarg);
2886 break;
2887 case 'L':
2888 bound_low = is_double(optarg);
2889 break;
2890 case 'U':
2891 bound_high = is_double(optarg);
2892 break;
2893 case 'A':
2894 alpha = is_double(optarg);
2895 break;
2896 case 'H':
2897 call_sec_hapl = false;
2898 break;
2899 case 'p':
2900 num_threads = is_integer(optarg);
2901 break;
2902 case 'v':
2903 version();
2904 break;
2905 case 1003:
2906 force_diff_len = true;
2907 break;
2908 case 'r': // deprecated Dec 2016, v1.45
2909 cerr << "Warning: Ignoring deprecated option -r (this has become the default).\n";
2910 break;
2911 case '?':
2912 // getopt_long already printed an error message.
2913 help();
2914 break;
2915
2916 default:
2917 cerr << "Unknown command line option '" << (char) c << "'\n";
2918 help();
2919 exit(1);
2920 }
2921 }
2922
2923 if (optind < argc) {
2924 cerr << "Error: Failed to parse command line: '" << argv[optind] << "' is seen as a positional argument. Expected no positional arguments.\n";
2925 help();
2926 }
2927
2928 if (set_kmer_len == false && (global_kmer_len < 7 || global_kmer_len > 31)) {
2929 cerr << "Kmer length must be between 7 and 31bp.\n";
2930 help();
2931 }
2932
2933 if (alpha != 0.1 && alpha != 0.05 && alpha != 0.01 && alpha != 0.001) {
2934 cerr << "SNP model alpha significance level must be either 0.1, 0.05, 0.01, or 0.001.\n";
2935 help();
2936 }
2937
2938 if (bound_low != 0 && (bound_low < 0 || bound_low >= 1.0)) {
2939 cerr << "SNP model lower bound must be between 0.0 and 1.0.\n";
2940 help();
2941 }
2942
2943 if (bound_high != 1 && (bound_high <= 0 || bound_high > 1.0)) {
2944 cerr << "SNP model upper bound must be between 0.0 and 1.0.\n";
2945 help();
2946 }
2947
2948 if (bound_low > 0 || bound_high < 1.0) {
2949 model_type = bounded;
2950 }
2951
2952 if (in_file.empty()) {
2953 cerr << "You must specify an input file.\n";
2954 help();
2955 }
2956
2957 if (in_file_type == FileT::unknown) {
2958 in_file_type = guess_file_type(in_file);
2959 if (in_file_type == FileT::unknown) {
2960 cerr << "Unable to recongnize the extention of file '" << in_file << "'.\n";
2961 help();
2962 }
2963 }
2964
2965 // Set `prefix_path`.
2966 if (out_path.length() == 0)
2967 out_path = ".";
2968 if (out_path.at(out_path.length() - 1) != '/')
2969 out_path += "/";
2970 if (sample_name.empty()) {
2971 size_t pos_1 = in_file.find_last_of("/");
2972 size_t pos_2 = in_file.find_last_of(".");
2973 if (in_file.substr(pos_2) == ".gz")
2974 pos_2 = in_file.substr(0, pos_2).find_last_of(".");
2975 if (pos_2 == string::npos || pos_2 <= pos_1 + 1) {
2976 cerr << "Failed to guess the sample's name.\n";
2977 help();
2978 }
2979 sample_name = in_file.substr(pos_1 + 1, (pos_2 - pos_1 - 1));
2980 }
2981 prefix_path = out_path + sample_name;
2982
2983 if (sample_id < 0) {
2984 cerr << "A unique sample ID must be provided (a positive integer).\n";
2985 help();
2986 }
2987
2988 if (model_type == ::fixed && barcode_err_freq == 0) {
2989 cerr << "You must specify the barcode error frequency.\n";
2990 help();
2991 }
2992
2993 return 0;
2994 }
2995
version()2996 void version() {
2997 cerr << "ustacks " << VERSION << "\n\n";
2998
2999 exit(1);
3000 }
3001
help()3002 void help() {
3003 cerr << "ustacks " << VERSION << "\n"
3004 << "ustacks -f file_path -i id -o path [-M max_dist] [-m min_cov] [-p num_threads]" << "\n"
3005 << " f: input file path.\n"
3006 << " i: a unique integer ID for this sample.\n"
3007 << " o: output path to write results.\n"
3008 << " M: Maximum distance (in nucleotides) allowed between stacks (default 2).\n"
3009 << " m: Minimum depth of coverage required to create a stack (default 3).\n"
3010 << " N: Maximum distance allowed to align secondary reads to primary stacks (default: M + 2).\n"
3011 << " p: enable parallel execution with num_threads threads.\n"
3012 << " t: input file type. Supported types: fasta, fastq, gzfasta, or gzfastq (default: guess).\n"
3013 << " --name: a name for the sample (default: input file name minus the suffix).\n"
3014 << " R: retain unused reads.\n"
3015 << " H: disable calling haplotypes from secondary reads.\n"
3016 << "\n"
3017 << " Stack assembly options:\n"
3018 << " d,--deleverage: enable the Deleveraging algorithm, used for resolving over merged tags.\n"
3019 << " --keep-high-cov: disable the algorithm that removes highly-repetitive stacks and nearby errors.\n"
3020 << " --high-cov-thres: highly-repetitive stacks threshold, in standard deviation units (default: 3.0).\n"
3021 << " --max-locus-stacks <num>: maximum number of stacks at a single de novo locus (default 3).\n"
3022 << " --k-len <len>: specify k-mer size for matching between alleles and loci (automatically calculated by default).\n\n"
3023 << " Gapped assembly options:\n"
3024 << " --max-gaps: number of gaps allowed between stacks before merging (default: 2).\n"
3025 << " --min-aln-len: minimum length of aligned sequence in a gapped alignment (default: 0.80).\n\n"
3026 << " --disable-gapped: do not preform gapped alignments between stacks (default: gapped alignements enabled).\n"
3027 << " Model options:\n"
3028 << " --model-type: either 'snp' (default), 'bounded', or 'fixed'\n"
3029 << " For the SNP or Bounded SNP model:\n"
3030 << " --alpha <num>: chi square significance level required to call a heterozygote or homozygote, either 0.1, 0.05 (default), 0.01, or 0.001.\n"
3031 << " For the Bounded SNP model:\n"
3032 << " --bound-low <num>: lower bound for epsilon, the error rate, between 0 and 1.0 (default 0).\n"
3033 << " --bound-high <num>: upper bound for epsilon, the error rate, between 0 and 1.0 (default 1).\n"
3034 << " For the Fixed model:\n"
3035 << " --bc-err-freq <num>: specify the barcode error frequency, between 0 and 1.0.\n"
3036 << "\n"
3037 << " h: display this help messsage.\n";
3038
3039 exit(1);
3040 }
3041