1 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
2 //
3 // Copyright 2010-2018, Julian Catchen <jcatchen@illinois.edu>
4 //
5 // This file is part of Stacks.
6 //
7 // Stacks is free software: you can redistribute it and/or modify
8 // it under the terms of the GNU General Public License as published by
9 // the Free Software Foundation, either version 3 of the License, or
10 // (at your option) any later version.
11 //
12 // Stacks is distributed in the hope that it will be useful,
13 // but WITHOUT ANY WARRANTY; without even the implied warranty of
14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 // GNU General Public License for more details.
16 //
17 // You should have received a copy of the GNU General Public License
18 // along with Stacks. If not, see <http://www.gnu.org/licenses/>.
19 //
20
21 //
22 // cstacks -- Create a catalog of Stacks.
23 //
24
25 #include "MetaPopInfo.h"
26 #include "cstacks.h"
27
28 // Global variables to hold command-line options.
// Global variables to hold command-line options.
queue<pair<int, string>> samples;        // Work queue of samples: (sample ID, input file path).
string out_path;                         // Directory in which the catalog files are written.
string catalog_path;                     // Path to an existing catalog to extend; empty = build a new one.
FileT in_file_type = FileT::sql;         // Input file format (plain or gzipped SQL-style TSV).
int ctag_dist = 1;                       // Mismatches allowed between a sample locus and a catalog locus.
bool set_kmer_len = true;                // If true, the k-mer length is chosen automatically from read length.
int kmer_len = 0;                        // K-mer length for the matching hash (0 until determined).
searcht search_type = sequence;          // Matching strategy; 'sequence' = match by sequence identity.
int num_threads = 1;                     // Number of OpenMP worker threads.
bool report_mmatches = false;            // Presumably controls reporting of multi-locus matches — set by the command line; not read in this file.
bool require_uniq_haplotypes = false;    // Presumably enforces unique haplotypes when merging — set by the command line; not read in this file.
bool gapped_alignments = true;           // Enable the gapped (indel-aware) alignment pass.
double min_match_len = 0.80;             // Minimum percent identity required to accept a gapped alignment.
double max_gaps = 2.0;                   // Maximum number of gaps tolerated in a gapped alignment.

size_t next_catalog_id = 1;              // ID assigned to the next newly created catalog locus.

using namespace std;
47
//
// main() -- Build (or extend) a catalog of loci.
//
// Initializes the catalog either from an existing catalog on disk (when
// catalog_path is set) or from the first sample in the queue, then folds
// each remaining sample into it: k-mer based sequence matching, an
// optional gapped-alignment pass, then merging of matched loci.
//
int main (int argc, char* argv[]) {
IF_NDEBUG_TRY

    parse_command_line(argc, argv);

    uint sample_cnt = samples.size();

    cerr << "cstacks parameters selected:\n"
         << " Loci matched based on sequence identity.\n"
         << " Number of mismatches allowed between stacks: " << ctag_dist << "\n"
         << " Gapped alignments: " << (gapped_alignments ? "enabled" : "disabled") << "\n"
         << "Constructing catalog from " << sample_cnt << " samples.\n";

    //
    // Set the number of OpenMP parallel threads to execute.
    //
    #ifdef _OPENMP
    omp_set_num_threads(num_threads);
    #endif

    map<int, CLocus *> catalog;   // The catalog under construction, keyed by catalog locus ID.
    pair<int, string> s;          // Current sample: (sample ID, file path).
    bool compressed = false;
    int i;                        // 1-based progress counter for log messages.

    set<int> seen_sample_ids; // For checking sample ID unicity.

    if (catalog_path.length() > 0) {
        //
        // Extend a catalog produced by a previous run.
        //
        cerr << "\nInitializing existing catalog...\n";
        if (!initialize_existing_catalog(catalog_path, catalog)) {
            cerr << "Error: Failed to initialize the catalog.\n";
            throw exception();
        }
        i = 1;

    } else {
        //
        // Seed a brand-new catalog from the first sample in the queue.
        //
        s = samples.front();
        samples.pop();

        cerr << "\nInitializing new catalog...\n";
        if (!initialize_new_catalog(s, catalog)) {
            cerr << "Error: Failed to initialize the catalog.\n";
            throw exception();
        }
        i = 2;
        seen_sample_ids.insert(catalog.begin()->second->sample_id);
    }

    //
    // Merge each remaining sample into the catalog, one at a time.
    //
    while (!samples.empty()) {
        map<int, QLocus *> sample;

        s = samples.front();
        samples.pop();

        cerr << "\nProcessing sample " << s.second << " [" << i << " of " << sample_cnt << "]\n";

        if (!load_loci(s.second, sample, 0, false, compressed)) {
            cerr << "Failed to load sample " << i << "\n";
            continue;
        }
        //
        // Assign the ID for this sample data.
        //
        s.first = sample.begin()->second->sample_id;

        // Check for unique sample IDs.
        if (!seen_sample_ids.insert(s.first).second) {
            // Insert failed.
            cerr << "Error: Sample ID '" << s.first << "' occurs more than once. Sample IDs must be unique." << endl;
            throw exception();
        }

        //dump_loci(sample);

        cerr << "Searching for sequence matches...\n";
        find_kmer_matches_by_sequence(catalog, sample, ctag_dist);

        if (gapped_alignments) {
            cerr << "Searching for gapped alignments...\n";
            search_for_gaps(catalog, sample, min_match_len, ctag_dist);
        }

        cerr << "Merging matches into catalog...\n";
        uint mmatches     = 0; // Sample loci that matched more than one catalog locus.
        uint gmatches     = 0; // Sample loci matched via gapped alignment.
        uint umatches     = 0; // Sample loci matched via ungapped alignment.
        uint nmatches     = 0; // Sample loci added to the catalog as new loci.
        uint merge_cloci  = 0; // Catalog loci linked together by sample haplotypes.
        uint reduced_cloci = 0; // Loci the linked catalog loci were collapsed into.
        merge_matches(catalog, sample, s, ctag_dist, nmatches, umatches, gmatches, mmatches, merge_cloci, reduced_cloci);
        cerr << " " << umatches << " loci were matched to a catalog locus.\n"
             << " " << gmatches << " loci were matched to a catalog locus using gapped alignments.\n"
             << " " << nmatches << " loci were newly added to the catalog.\n"
             << " " << mmatches << " loci matched more than one catalog locus, linking them.\n"
             << " " << merge_cloci << " linked catalog loci were merged into " << reduced_cloci << " loci.\n";

        //
        // Regenerate the alleles for the catalog tags after merging the new sample into the catalog.
        //
        for (auto cat_it = catalog.begin(); cat_it != catalog.end(); cat_it++) {
            cat_it->second->populate_alleles();
            cat_it->second->match_cnt = 0;
        }

        i++;

        // Free this sample's loci before loading the next sample.
        for (auto query_it = sample.begin(); query_it != sample.end(); query_it++)
            delete query_it->second;
        sample.clear();
    }

    cerr << "\nWriting catalog in directory '" << out_path << "'.\n"
         << "Final catalog contains " << catalog.size() << " loci.\n";

    write_catalog(catalog);
    cerr << "cstacks is done.\n";

    //
    // Free memory associated with the catalog.
    //
    for (auto cat_it = catalog.begin(); cat_it != catalog.end(); cat_it++)
        delete cat_it->second;
    catalog.clear();

    return 0;
IF_NDEBUG_CATCH_ALL_EXCEPTIONS
}
175
update_catalog_index(map<int,CLocus * > & catalog,map<string,int> & cat_index)176 int update_catalog_index(map<int, CLocus *> &catalog, map<string, int> &cat_index) {
177 map<int, CLocus *>::iterator j;
178 char id[id_len];
179
180 for (j = catalog.begin(); j != catalog.end(); j++) {
181 snprintf(id, id_len - 1, "%s|%d|%c",
182 j->second->loc.chr(),
183 j->second->loc.bp,
184 j->second->loc.strand == strand_plus ? '+' : '-');
185
186 if (cat_index.count(id) == 0) {
187 cat_index[id] = j->first;
188 } else {
189 if (cat_index[id] != j->first)
190 cerr << "Error: Catalog index mismatch, key: '" << id << "'.\n";
191 }
192 }
193
194 return 0;
195 }
196
197 int
characterize_mismatch_snps(CLocus * catalog_tag,QLocus * query_tag)198 characterize_mismatch_snps(CLocus *catalog_tag, QLocus *query_tag)
199 {
200 set<int> snp_cols;
201 uint i;
202 for (i = 0; i < catalog_tag->snps.size(); i++)
203 snp_cols.insert(catalog_tag->snps[i]->col);
204 for (i = 0; i < query_tag->snps.size(); i++)
205 snp_cols.insert(query_tag->snps[i]->col);
206
207 //
208 // For each mismatch found, create a SNP object
209 //
210 const char *c = catalog_tag->con;
211 const char *c_beg = c;
212 const char *c_end = c + strlen(c);
213 const char *q = query_tag->con;
214 const char *q_beg = q;
215 const char *q_end = q + strlen(q);
216
217 i = 0;
218 while (c < c_end && q < q_end) {
219 if (snp_cols.count(i) == 0 &&
220 (*c != *q) && (*c != 'N' && *q != 'N')) {
221
222 // cerr << "Adding a new SNP at position " << c - c_beg << ", " << *c << "/" << *q << "\n";
223 SNP *s = new SNP;
224 s->type = snp_type_het;
225 s->col = c - c_beg;
226 s->lratio = 0;
227 s->rank_1 = *c;
228 s->rank_2 = *q;
229
230 merge_allele(catalog_tag, s);
231 merge_allele(query_tag, s);
232
233 catalog_tag->snps.push_back(s);
234
235 s = new SNP;
236 s->type = snp_type_het;
237 s->col = q - q_beg;
238 s->lratio = 0;
239 s->rank_1 = *q;
240 s->rank_2 = *c;
241
242 query_tag->snps.push_back(s);
243 }
244 c++;
245 q++;
246 i++;
247 }
248
249 return 1;
250 }
251
//
// merge_matches() -- Fold one sample's loci into the catalog.
//
// For each query locus: if it matched nothing, it becomes a new catalog
// locus. Otherwise its consensus, SNPs, and alleles are merged into the
// minimum-distance catalog locus for each of its alleles. A query locus
// whose alleles hit several catalog loci links those loci together; the
// linked groups are collapsed at the end via merge_catalog_loci().
//
// The uint& out-parameters accumulate per-category counts for the
// caller's log output.
//
int
merge_matches(map<int, CLocus *> &catalog, map<int, QLocus *> &sample, pair<int, string> &sample_file, int ctag_dist,
              uint &new_matches, uint &unique_matches, uint &gapped_matches, uint &multiple_matches,
              uint &merge_cloci, uint &reduced_cloci)
{
    map<int, QLocus *>::iterator i;
    CLocus *ctag;
    QLocus *qtag;
    string cseq, qseq, cigar_str;
    int cseq_len, match_index;

    GappedAln *aln = new GappedAln();

    //
    // Variables to control merging of catalog loci that are linked by haplotypes from one or more sample loci.
    //
    vector<vector<int>> cloc_merge_list; // Groups of catalog locus IDs to collapse.
    map<int, int> cloc_merge_key;        // Catalog locus ID -> index of its group in cloc_merge_list.

    for (i = sample.begin(); i != sample.end(); i++) {
        qtag = i->second;

        //
        // If this stack didn't match an existing catalog stack, add this stack to the
        // catalog as a new stack.
        //
        if (qtag->matches.size() == 0) {
            add_unique_tag(sample_file, catalog, qtag);
            new_matches++;
            continue;
        }

        //
        // Reduce the set of matches to the minimum distance per query allele.
        //
        map<allele_type, size_t> min_allele_dist;  // Best (smallest) distance seen per allele.
        map<allele_type, size_t> per_allele_match; // Catalog ID achieving that best distance.
        for (uint k = 0; k < qtag->matches.size(); k++)
            if (min_allele_dist.count(qtag->matches[k]->query_type) == 0) {
                min_allele_dist[qtag->matches[k]->query_type] = qtag->matches[k]->dist;
                per_allele_match[qtag->matches[k]->query_type] = qtag->matches[k]->cat_id;
            } else if (qtag->matches[k]->dist < min_allele_dist[qtag->matches[k]->query_type]) {
                min_allele_dist[qtag->matches[k]->query_type] = qtag->matches[k]->dist;
                per_allele_match[qtag->matches[k]->query_type] = qtag->matches[k]->cat_id;
            }

        int cat_id = -1;
        set<int> catalog_ids; // Distinct catalog loci this query locus was merged into.
        //
        // Iterate over all of the matches of minimal distance and merge this query locus into the catalog locus.
        //
        for (auto j = per_allele_match.begin(); j != per_allele_match.end(); j++) {

            cat_id = j->second;
            //
            // If we already handled this catalog locus, skip the allele.
            //
            if (catalog_ids.count(cat_id) > 0)
                continue;

            //
            // Make a copy of the query tag so that if we are merging it into multiple catalog
            // loci (which will subsequently be collapsed), we will have an unmodified copy for
            // each merge.
            //
            QLocus *qtag_merge = new QLocus(*qtag);

            ctag = catalog.at(cat_id);
            cigar_str = "";
            match_index = -1;

            assert(ctag != NULL);
            catalog_ids.insert(cat_id);

            // Locate the Match record for this catalog locus; a non-empty
            // CIGAR there indicates the match came from a gapped alignment.
            for (uint k = 0; k < qtag_merge->matches.size(); k++)
                if ((int) qtag_merge->matches[k]->cat_id == cat_id) {
                    cigar_str = qtag_merge->matches[k]->cigar;
                    match_index = k;
                    break;
                }

            assert(match_index >= 0 && match_index < (int) qtag_merge->matches.size());

            string query_allele, query_seq;
            if (gapped_alignments) {
                //
                // Find the proper query allele that was aligned against the catalog.
                //
                query_allele = qtag_merge->matches[match_index]->query_type;
                for (uint k = 0; k < qtag_merge->strings.size(); k++)
                    if (qtag_merge->strings[k].first == query_allele) {
                        query_seq = qtag_merge->strings[k].second;
                        break;
                    }

                //
                // If we have already matched a query locus to this catalog locus for the current
                // sample, we must re-align the sequences in case changes have been made to the
                // sequence by the previous matching sequence.
                //
                if (ctag->match_cnt > 0) {
                    aln->init(ctag->len, query_seq.length());
                    aln->align(ctag->con, query_seq);
                    cigar_str = invert_cigar(aln->result().cigar);
                }
            }

            bool gapped_aln = false;
            if (cigar_str.length() > 0)
                gapped_aln = true;

            if (gapped_aln) {
                Cigar cigar;
                //
                // Since the match was a gapped alignment, adjust the lengths of the consensus sequences.
                // Adjust the postition of any SNPs that were shifted down sequence due to a gap.
                //
                // First apply the CIGAR to the query side...
                cseq_len = parse_cigar(cigar_str.c_str(), cigar);
                qseq = apply_cigar_to_seq(query_seq.c_str(), cigar);
                adjust_snps_for_gaps(cigar, qtag_merge);

                // ...then invert it and apply it to the catalog side.
                cigar_str = invert_cigar(cigar_str);
                cseq_len = parse_cigar(cigar_str.c_str(), cigar);
                cseq = apply_cigar_to_seq(ctag->con, cigar);
                adjust_snps_for_gaps(cigar, ctag);

                assert(qseq.length() == cseq.length());

                //
                // If the alignment modified the catalog locus, record it so we can re-align
                // any other matching sequences from this sample.
                //
                if ((uint)cseq_len > ctag->len)
                    ctag->match_cnt++;

                //
                // Adjust the consensus sequences for both loci.
                //
                ctag->add_consensus(cseq.c_str());
                qtag_merge->add_consensus(qseq.c_str());

                gapped_matches++;

            } else {
                unique_matches++;
            }

            //
            // If mismatches are allowed between query and catalog tags, identify the
            // mismatches and convert them into SNP objects to be merged into the catalog tag.
            //
            if (ctag_dist > 0 && !characterize_mismatch_snps(ctag, qtag_merge))
                cerr
                    << " Error characterizing mismatch SNPs "
                    << sample_file.second << ", tag " << qtag->id
                    << " with catalog tag " << ctag->id << "\n";

            //
            // Merge the SNPs and alleles from the sample into the catalog tag.
            //
            if (!ctag->merge_snps(qtag_merge)) {
                cerr << "Error merging " << sample_file.second << ", tag " << qtag_merge->id
                     << " with catalog tag " << ctag->id << "\n";
            }

            //
            // Add any new sequence information into the catalog consensus.
            //
            if (gapped_aln) {
                // Fill uncalled ('N') catalog positions with bases called in the query.
                for (uint k = 0; k < ctag->len && k < qtag_merge->len; k++)
                    if (qtag_merge->con[k] != 'N' && ctag->con[k] == 'N')
                        ctag->con[k] = qtag_merge->con[k];

            }

            assert(strlen(ctag->con) == strlen(qtag_merge->con));

            ctag->sources.push_back(make_pair(sample_file.first, qtag_merge->id));

            delete qtag_merge;
        }

        //
        // If the query tag matches more than one tag in the catalog, mark the catalog loci for merging.
        //
        if (catalog_ids.size() > 1) {
            multiple_matches++;

            // If any of these catalog loci already belongs to a pending merge
            // group, absorb that whole group into this one and empty it.
            for (auto n = catalog_ids.begin(); n != catalog_ids.end(); n++) {
                cat_id = *n;

                if (cloc_merge_key.count(cat_id) > 0) {
                    uint index = cloc_merge_key[cat_id];
                    for (uint k = 0; k < cloc_merge_list[index].size(); k++)
                        catalog_ids.insert(cloc_merge_list[index][k]);
                    cloc_merge_list[index].clear();
                }
            }

            // Insert the list of catalog loci to merge.
            cloc_merge_list.push_back(vector<int>());
            uint index = cloc_merge_list.size() - 1;

            // Record where to find each catalog locus.
            for (auto n = catalog_ids.begin(); n != catalog_ids.end(); n++) {
                cloc_merge_key[*n] = index;
                cloc_merge_list[index].push_back(*n);
            }
        }
    }

    delete aln;

    //
    // Merge catalog loci, that were linked by a sample locus, together.
    //
    for (uint j = 0; j < cloc_merge_list.size(); j++) {

        // Groups emptied above (absorbed into a later group) are skipped.
        if (cloc_merge_list[j].size() == 0) continue;

        merge_cloci += cloc_merge_list[j].size();
        reduced_cloci++;
        merge_catalog_loci(catalog, cloc_merge_list[j]);
    }
    return 0;
}
478
479 int
add_unique_tag(pair<int,string> & sample_file,map<int,CLocus * > & catalog,QLocus * qloc)480 add_unique_tag(pair<int, string> &sample_file, map<int, CLocus *> &catalog, QLocus *qloc)
481 {
482 CLocus *c = new CLocus;
483
484 c->id = next_catalog_id;
485 next_catalog_id++;
486
487 c->add_consensus(qloc->con);
488 //
489 // Record the source of this catalog tag.
490 //
491 c->sources.push_back(make_pair(sample_file.first, qloc->id));
492 //
493 // Add the physical genome location of this locus.
494 //
495 c->loc.set(qloc->loc.chr(), qloc->loc.bp, qloc->loc.strand);
496
497 assert(catalog.count(c->id) == 0);
498
499 catalog[c->id] = c;
500
501 // cerr << "Adding sample: " << qloc->id << " to the catalog as ID: " << c->id << "\n";
502
503 for (auto i = qloc->snps.begin(); i != qloc->snps.end(); i++) {
504 SNP *snp = new SNP;
505 snp->col = (*i)->col;
506 snp->type = (*i)->type;
507 snp->lratio = (*i)->lratio;
508 snp->rank_1 = (*i)->rank_1;
509 snp->rank_2 = (*i)->rank_2;
510
511 c->snps.push_back(snp);
512 }
513
514 for (auto j = qloc->alleles.begin(); j != qloc->alleles.end(); j++) {
515 c->alleles[j->first] = j->second;
516 }
517
518 c->populate_alleles();
519
520 return 0;
521 }
522
523 int
merge_catalog_loci(map<int,CLocus * > & catalog,vector<int> & merge_list)524 merge_catalog_loci(map<int, CLocus *> &catalog, vector<int> &merge_list)
525 {
526 GappedAln *aln = new GappedAln();
527
528 sort(merge_list.begin(), merge_list.end());
529
530 //
531 // Initialize the new locus from the first locus in merge_list.
532 //
533 CLocus *c = catalog[merge_list[0]];
534
535 for (uint i = 1; i < merge_list.size(); i++) {
536 CLocus *merge_tag = catalog[merge_list[i]];
537
538 //
539 // Align the consensus sequences together if they are not the same length.
540 //
541 if (c->len != merge_tag->len) {
542 Cigar cigar;
543 string cigar_str, qseq, cseq;
544
545 aln->init(c->len, merge_tag->len);
546 aln->align(c->con, merge_tag->con);
547 cigar_str = invert_cigar(aln->result().cigar);
548
549 parse_cigar(cigar_str.c_str(), cigar);
550 qseq = apply_cigar_to_seq(merge_tag->con, cigar);
551 adjust_snps_for_gaps(cigar, merge_tag);
552
553 cigar_str = invert_cigar(cigar_str);
554 parse_cigar(cigar_str.c_str(), cigar);
555 cseq = apply_cigar_to_seq(c->con, cigar);
556 adjust_snps_for_gaps(cigar, c);
557
558 assert(qseq.length() == cseq.length());
559
560 for (uint k = 0; k < cseq.length() && k < qseq.length(); k++)
561 if (qseq[k] != 'N' && cseq[k] == 'N')
562 cseq[k] = qseq[k];
563
564 c->add_consensus(cseq.c_str());
565 }
566
567 //
568 // Record the source of this catalog tag.
569 //
570 set<pair<int, int>> sources;
571 for (uint j = 0; j < c->sources.size(); j++)
572 sources.insert(c->sources[j]);
573 for (uint j = 0; j < merge_tag->sources.size(); j++)
574 sources.insert(merge_tag->sources[j]);
575 c->sources.clear();
576 for (auto j = sources.begin(); j != sources.end(); j++)
577 c->sources.push_back(*j);
578
579 //
580 // Merge SNPs and alleles.
581 //
582 c->merge_snps(merge_tag);
583
584 catalog.erase(merge_tag->id);
585 delete merge_tag;
586 }
587
588 c->populate_alleles();
589
590 delete aln;
591
592 return 0;
593 }
594
//
// find_kmer_matches_by_sequence() -- Match each sample locus against the
// catalog using a shared-k-mer screen followed by an exact distance check.
//
// The catalog alleles are hashed into a k-mer map; each query allele's
// unique k-mers are looked up, hits per catalog allele are tallied, and
// alleles clearing the minimum-hit threshold are verified with a full
// mismatch count (dist). Matches within ctag_dist are recorded on the
// query locus. Loci of unequal length are left for the gapped pass.
//
int find_kmer_matches_by_sequence(map<int, CLocus *> &catalog, map<int, QLocus *> &sample, int ctag_dist) {
    //
    // Calculate the distance (number of mismatches) between each pair
    // of Radtags. We expect all radtags to be the same length;
    //
    KmerHashMap kmer_map;
    map<int, pair<allele_type, int>> allele_map;
    vector<char *> kmer_map_keys;
    QLocus *tag_1;
    CLocus *tag_2;

    // OpenMP can't parallelize random access iterators, so we convert
    // our map to a vector of integer keys.
    vector<int> keys;
    for (auto it = sample.begin(); it != sample.end(); it++)
        keys.push_back(it->first);

    //
    // Calculate the number of k-mers we will generate. If specified,
    // determine the optimal length for k-mers. Find the minimal sequence length
    // to use as a base to set the kmer size.
    //
    uint con_len = UINT_MAX;
    for (uint i = 0; i < keys.size(); i++) {
        tag_1 = sample[keys[i]];
        con_len = sample[keys[i]]->len < con_len ? sample[keys[i]]->len : con_len;
    }
    if (set_kmer_len) kmer_len = determine_kmer_length(con_len, ctag_dist);

    //
    // Calculate the minimum number of matching k-mers required for a possible sequence match.
    //
    int min_hits = calc_min_kmer_matches(kmer_len, ctag_dist, con_len, set_kmer_len ? true : false);

    populate_kmer_hash(catalog, kmer_map, kmer_map_keys, allele_map, kmer_len);

    cerr << " " << catalog.size() << " loci in the catalog, " << kmer_map.size() << " kmers in the catalog hash.\n";

    #pragma omp parallel private(tag_1, tag_2)
    {
        KmerHashMap::iterator h;
        vector<char *> kmers;
        set<string> uniq_kmers;
        vector<int> hits;                    // Raw catalog-allele IDs, one per k-mer hit.
        vector<pair<int, int>> ordered_hits; // (allele ID, hit count) pairs passing min_hits.
        uint hit_cnt, index, prev_id, allele_id, hits_size;
        int d;
        pair<allele_type, int> cat_hit;
        int num_kmers = con_len - kmer_len + 1;

        initialize_kmers(kmer_len, num_kmers, kmers);

        #pragma omp for
        for (uint i = 0; i < keys.size(); i++) {
            tag_1 = sample[keys[i]];

            for (auto allele = tag_1->strings.begin(); allele != tag_1->strings.end(); allele++) {
                assert(size_t(kmer_len) <= allele->second.length());

                num_kmers = allele->second.length() - kmer_len + 1;
                generate_kmers_lazily(allele->second.c_str(), kmer_len, num_kmers, kmers);

                //
                // We want to create a list of unique kmers to search with; otherwise, repetitive kmers will
                // generate, multiple, spurious hits in sequences with multiple copies of the same kmer.
                //
                uniq_kmers.clear();
                for (int j = 0; j < num_kmers; j++)
                    uniq_kmers.insert(kmers[j]);

                hits.clear();
                ordered_hits.clear();

                //
                // Lookup the occurances of each k-mer in the kmer_map
                //
                for (auto j = uniq_kmers.begin(); j != uniq_kmers.end(); j++) {

                    h = kmer_map.find(j->c_str());

                    if (h != kmer_map.end())
                        for (uint k = 0; k < h->second.size(); k++)
                            hits.push_back(h->second[k]);
                }

                //
                // Sort the vector of indexes; provides the number of hits to each allele/locus
                // and orders them largest to smallest.
                //
                sort(hits.begin(), hits.end());

                //
                // Iterate through the list of hits and collapse them down by number of kmer hits per allele.
                //
                hits_size = hits.size();

                if (hits_size == 0)
                    continue;

                prev_id = hits[0];
                index = 0;

                // Run-length encode the sorted hit list: each run of equal
                // IDs is one catalog allele with hit_cnt shared k-mers.
                do {
                    hit_cnt = 0;
                    allele_id = prev_id;

                    while (index < hits_size && (uint) hits[index] == prev_id) {
                        hit_cnt++;
                        index++;
                    }

                    if (index < hits_size)
                        prev_id = hits[index];

                    if (hit_cnt >= (uint) min_hits)
                        ordered_hits.push_back(make_pair(allele_id, hit_cnt));

                } while (index < hits_size);

                // Verify each candidate with a full pairwise distance check.
                for (uint j = 0; j < ordered_hits.size(); j++) {
                    cat_hit = allele_map.at(ordered_hits[j].first);
                    hit_cnt = ordered_hits[j].second;

                    tag_2 = catalog[cat_hit.second];

                    //
                    // If the sequences are not the same length then they have to be reconciled by
                    // the gapped alignment algorithm.
                    //
                    if (tag_1->len != tag_2->len)
                        continue;

                    d = dist(allele->second.c_str(), tag_2, cat_hit.first);

                    assert(d >= 0);

                    //
                    // Check if any of the mismatches occur at the 3' end of the read. If they
                    // do, they may indicate a frameshift is present at the 3' end of the read.
                    // If found, do not merge these tags, leave them for the gapped alignmnet
                    // algorithm.
                    //
                    if (d <= ctag_dist && check_frameshift(allele->second.c_str(), tag_2, cat_hit.first, (size_t) ctag_dist))
                        continue;

                    //
                    // Add a match to the query sequence: catalog ID, catalog allele, query allele, distance
                    //
                    if (d <= ctag_dist)
                        tag_1->add_match(tag_2->id, cat_hit.first, allele->first, d);
                }
            }

            // Sort the vector of distances.
            sort(tag_1->matches.begin(), tag_1->matches.end(), compare_matches);
        }

        //
        // Free the allocated k-mers.
        //
        for (uint j = 0; j < kmers.size(); j++)
            delete [] kmers[j];
        kmers.clear();
    }

    free_kmer_hash(kmer_map, kmer_map_keys);

    return 0;
}
764
//
// search_for_gaps() -- Second matching pass: attempt gapped (indel-aware)
// alignments between catalog loci and the sample loci that found no
// ungapped match.
//
// Candidates are screened with a fixed-length (19bp) k-mer hash; only
// the catalog alleles sharing the most k-mers with a query allele are
// aligned. Alignments are accepted when they stay within max_gaps gaps,
// reach at least min_match_len percent identity, and are within
// ctag_dist mismatches; accepted matches carry the (inverted) CIGAR.
//
int
search_for_gaps(map<int, CLocus *> &catalog, map<int, QLocus *> &sample, double min_match_len, double ctag_dist)
{
    //
    // Search for loci that can be merged with a gapped alignment.
    //
    KmerHashMap kmer_map;
    map<int, pair<allele_type, int>> allele_map;
    vector<char *> kmer_map_keys;
    QLocus *tag_1;
    CLocus *tag_2;

    //
    // OpenMP can't parallelize random access iterators, so we convert
    // our map to a vector of integer keys.
    //
    vector<int> keys;
    for (auto it = sample.begin(); it != sample.end(); it++)
        keys.push_back(it->first);

    //
    // Calculate the number of k-mers we will generate. If kmer_len == 0,
    // determine the optimal length for k-mers.
    //
    int con_len = strlen(sample[keys[0]]->con); // Length of the first sample locus; used only for the initial k-mer buffer size.
    int kmer_len = 19;                          // Fixed k-mer size for this pass (shadows the global kmer_len).

    populate_kmer_hash(catalog, kmer_map, kmer_map_keys, allele_map, kmer_len);

    #pragma omp parallel private(tag_1, tag_2)
    {
        KmerHashMap::iterator h;
        AlignRes aln_res;
        vector<char *> kmers;
        set<string> uniq_kmers;
        vector<int> hits;                    // Raw catalog-allele IDs, one per k-mer hit.
        vector<pair<int, int>> ordered_hits; // (allele ID, hit count) pairs.
        uint hit_cnt, index, prev_id, allele_id, hits_size, stop, top_hit;
        int d;
        vector<pair<char, uint>> cigar;
        pair<allele_type, int> cat_hit;
        string cat_seq;
        int num_kmers = con_len - kmer_len + 1;

        GappedAln *aln = new GappedAln();

        initialize_kmers(kmer_len, num_kmers, kmers);

        #pragma omp for schedule(dynamic)
        for (uint i = 0; i < keys.size(); i++) {
            tag_1 = sample[keys[i]];

            //
            // If we already matched this locus to the catalog without using gapped alignments, skip it now.
            //
            if (tag_1->matches.size() > 0)
                continue;

            for (auto allele = tag_1->strings.begin(); allele != tag_1->strings.end(); allele++) {

                num_kmers = allele->second.length() - kmer_len + 1;
                generate_kmers_lazily(allele->second.c_str(), kmer_len, num_kmers, kmers);

                //
                // We want to create a list of unique kmers to search with; otherwise, repetitive kmers will
                // generate, multiple, spurious hits in sequences with multiple copies of the same kmer.
                //
                uniq_kmers.clear();
                for (int j = 0; j < num_kmers; j++)
                    uniq_kmers.insert(kmers[j]);

                hits.clear();
                ordered_hits.clear();

                //
                // Lookup the occurances of each k-mer in the kmer_map
                //
                for (auto j = uniq_kmers.begin(); j != uniq_kmers.end(); j++) {

                    h = kmer_map.find(j->c_str());

                    if (h != kmer_map.end())
                        for (uint k = 0; k < h->second.size(); k++)
                            hits.push_back(h->second[k]);
                }

                //
                // Sort the vector of indexes; provides the number of hits to each allele/locus
                // and orders them largest to smallest.
                //
                sort(hits.begin(), hits.end());

                //
                // Iterate through the list of hits and collapse them down by number of kmer hits per allele.
                //
                hits_size = hits.size();

                if (hits_size == 0)
                    continue;

                prev_id = hits[0];
                index = 0;

                // Run-length encode the sorted hit list: each run of equal
                // IDs is one catalog allele with hit_cnt shared k-mers.
                do {
                    hit_cnt = 0;
                    allele_id = prev_id;

                    while (index < hits_size && (uint) hits[index] == prev_id) {
                        hit_cnt++;
                        index++;
                    }

                    if (index < hits_size)
                        prev_id = hits[index];

                    ordered_hits.push_back(make_pair(allele_id, hit_cnt));

                } while (index < hits_size);

                if (ordered_hits.size() == 0)
                    continue;

                //
                // Process the hits from most kmer hits to least kmer hits.
                //
                sort(ordered_hits.begin(), ordered_hits.end(), compare_pair_intint);

                //
                // Only try to align the sequences with the most kmers in common.
                //
                top_hit = ordered_hits[0].second;
                stop = 1;
                for (uint j = 1; j < ordered_hits.size(); j++)
                    if ((uint) ordered_hits[j].second < top_hit) {
                        stop = j;
                        break;
                    }

                for (uint j = 0; j < stop; j++) {
                    cat_hit = allele_map.at(ordered_hits[j].first);
                    tag_2 = catalog[cat_hit.second];
                    cat_seq = "";

                    // Fetch the sequence of the catalog allele that was hit.
                    for (uint k = 0; k < tag_2->strings.size(); k++)
                        if (tag_2->strings[k].first == cat_hit.first) {
                            cat_seq = tag_2->strings[k].second;
                            break;
                        }

                    aln->init(tag_2->len, tag_1->len);

                    if (aln->align(cat_seq, allele->second)) {
                        cigar.clear();
                        aln->parse_cigar(cigar);
                        aln_res = aln->result();
                        d = dist(cat_seq.c_str(), allele->second.c_str(), cigar);

                        //
                        // If the alignment has too many gaps, skip it.
                        //
                        if (aln_res.gap_cnt <= (max_gaps + 1)) {
                            //
                            // If the alignment doesn't span enough of the two sequences, skip it.
                            //
                            if (aln_res.pct_id >= min_match_len) {

                                if (d <= ctag_dist)
                                    tag_1->add_match(tag_2->id, cat_hit.first, allele->first, d, invert_cigar(aln_res.cigar));
                            }
                        }
                    }
                }
            }
        }

        //
        // Free the k-mers we generated for this query
        //
        for (uint j = 0; j < kmers.size(); j++)
            delete [] kmers[j];
        kmers.clear();

        delete aln;
    }

    free_kmer_hash(kmer_map, kmer_map_keys);

    return 0;
}
954
compare_matches(Match * a,Match * b)955 bool compare_matches(Match *a, Match *b) {
956 return (a->dist < b->dist);
957 }
958
find_matches_by_sequence(map<int,CLocus * > & catalog,map<int,QLocus * > & sample)959 int find_matches_by_sequence(map<int, CLocus *> &catalog, map<int, QLocus *> &sample) {
960 //
961 // Calculate the distance (number of mismatches) between each pair
962 // of Radtags. We expect all radtags to be the same length;
963 //
964 map<int, QLocus *>::iterator i;
965 map<int, CLocus *>::iterator j;
966 int k;
967
968 // OpenMP can't parallelize random access iterators, so we convert
969 // our map to a vector of integer keys.
970 vector<int> keys;
971 for (i = sample.begin(); i != sample.end(); i++)
972 keys.push_back(i->first);
973
974 #pragma omp parallel private(i, j, k)
975 {
976 #pragma omp for schedule(dynamic)
977 for (k = 0; k < (int) keys.size(); k++) {
978
979 i = sample.find(keys[k]);
980
981 vector<pair<allele_type, string>>::iterator r, s;
982
983 //
984 // Iterate through the possible SAMPLE alleles
985 //
986 for (r = i->second->strings.begin(); r != i->second->strings.end(); r++) {
987
988 for (j = catalog.begin(); j != catalog.end(); j++) {
989 //
990 // Iterate through the possible CATALOG alleles
991 //
992 for (s = j->second->strings.begin(); s != j->second->strings.end(); s++) {
993 if (r->second == s->second) {
994 //cerr << "Found a match between " << i->first << " (" << r->first << ") and " << j->first << " (" << s->first << ")\n";
995
996 i->second->add_match(j->second->id, s->first, r->first, 0);
997 }
998 }
999 }
1000 }
1001 }
1002 }
1003
1004 return 0;
1005 }
1006
find_matches_by_genomic_loc(map<string,int> & cat_index,map<int,QLocus * > & sample)1007 int find_matches_by_genomic_loc(map<string, int> &cat_index, map<int, QLocus *> &sample) {
1008 map<int, QLocus *>::iterator i;
1009 map<int, CLocus *>::iterator j;
1010
1011 //
1012 // OpenMP can't parallelize random access iterators, so we convert
1013 // our map to a vector of integer keys.
1014 //
1015 vector<int> keys;
1016 for (i = sample.begin(); i != sample.end(); i++)
1017 keys.push_back(i->first);
1018
1019 #pragma omp parallel private(i, j)
1020 {
1021 char id[id_len];
1022
1023 #pragma omp for
1024 for (int k = 0; k < (int) keys.size(); k++) {
1025
1026 i = sample.find(keys[k]);
1027
1028 snprintf(id, id_len - 1, "%s|%d|%c",
1029 i->second->loc.chr(),
1030 i->second->loc.bp,
1031 i->second->loc.strand == strand_plus ? '+' : '-');
1032
1033 if (cat_index.count(id) > 0)
1034 i->second->add_match(cat_index[id], "", "", 0);
1035 }
1036 }
1037
1038 return 0;
1039 }
1040
//
// write_catalog() -- Write the finished catalog to disk as three TSV
// files (tags, snps, alleles) in out_path, gzip-compressed when the
// input was gzipped. Each file is prefixed with a comment recording the
// Stacks version and generation date.
//
int write_catalog(map<int, CLocus *> &catalog) {

    map<int, CLocus *>::iterator i;
    CLocus *tag;
    set<int> matches;

    // Mirror the compression of the input files on the output.
    bool gzip = (in_file_type == FileT::gzsql) ? true : false;

    string tag_file = out_path + "catalog.tags.tsv";
    string snp_file = out_path + "catalog.snps.tsv";
    string all_file = out_path + "catalog.alleles.tsv";

    if (gzip) {
        tag_file += ".gz";
        snp_file += ".gz";
        all_file += ".gz";
    }

    //
    // Open the output files for writing.
    //
    gzFile gz_tags, gz_snps, gz_alle;
    ofstream tags, snps, alle;
    if (gzip) {
        gz_tags = gzopen(tag_file.c_str(), "wb");
        if (!gz_tags) {
            cerr << "Error: Unable to open gzipped catalog tag file '" << tag_file << "': " << strerror(errno) << ".\n";
            exit(1);
        }
        // gzbuffer() only exists in zlib >= 1.2.4.
        #if ZLIB_VERNUM >= 0x1240
        gzbuffer(gz_tags, libz_buffer_size);
        #endif
        gz_snps = gzopen(snp_file.c_str(), "wb");
        if (!gz_snps) {
            cerr << "Error: Unable to open gzipped catalog snps file '" << snp_file << "': " << strerror(errno) << ".\n";
            exit(1);
        }
        #if ZLIB_VERNUM >= 0x1240
        gzbuffer(gz_snps, libz_buffer_size);
        #endif
        gz_alle = gzopen(all_file.c_str(), "wb");
        if (!gz_alle) {
            cerr << "Error: Unable to open gzipped catalog alleles file '" << all_file << "': " << strerror(errno) << ".\n";
            exit(1);
        }
        #if ZLIB_VERNUM >= 0x1240
        gzbuffer(gz_alle, libz_buffer_size);
        #endif
    } else {
        tags.open(tag_file.c_str());
        snps.open(snp_file.c_str());
        alle.open(all_file.c_str());
        check_open(tags, tag_file);
        check_open(snps, snp_file);
        check_open(alle, all_file);
    }

    //
    // Record the version of Stacks used and the date generated as a comment in the catalog.
    //
    // Obtain the current date.
    //
    stringstream log;
    time_t rawtime;
    struct tm *timeinfo;
    char date[32];
    time(&rawtime);
    timeinfo = localtime(&rawtime);
    strftime(date, 32, "%F %T", timeinfo);
    log << "# cstacks version " << VERSION << "; catalog generated on " << date << "\n";
    if (gzip) {
        gzputs_throwing(gz_tags, log.str().c_str());
        gzputs_throwing(gz_snps, log.str().c_str());
        gzputs_throwing(gz_alle, log.str().c_str());
    } else {
        tags << log.str();
        snps << log.str();
        alle << log.str();
    }

    // Write every catalog locus through the format-appropriate writer.
    for (i = catalog.begin(); i != catalog.end(); i++) {
        tag = i->second;

        if (gzip)
            write_gzip_output(tag, gz_tags, gz_snps, gz_alle);
        else
            write_simple_output(tag, tags, snps, alle);
    }

    if (gzip) {
        gzclose_throwing(gz_tags);
        gzclose_throwing(gz_snps);
        gzclose_throwing(gz_alle);
    } else {
        tags.close();
        snps.close();
        alle.close();
    }

    return 0;
}
1142
//
// Merge a single SNP (found in a sample locus that matched this catalog
// locus) into the catalog locus: record any novel nucleotides on the
// existing SNP at that column, or insert the SNP as a new column, then
// rebuild the locus's allele (haplotype) strings to span the merged set
// of SNP columns. Returns 1.
//
int merge_allele(Locus *locus, SNP *snp) {
    map<int, pair<string, SNP *>> columns;
    map<int, pair<string, SNP *>>::iterator c;
    vector<SNP *>::iterator i;
    SNP *lsnp;

    // Index the locus's existing SNPs by column position. The string tag
    // records where each column came from and drives the allele rebuild.
    for (i = locus->snps.begin(); i != locus->snps.end(); i++)
        columns[(*i)->col] = make_pair("sample", *i);

    if (columns.count(snp->col)) {
        lsnp = columns[snp->col].second;

        //
        // If this is a new allele for this nucleotide, add it to the catalog SNP.
        //
        bool rank_1_exists = false;
        bool rank_2_exists = false;

        if (snp->rank_1 == lsnp->rank_1 ||
            snp->rank_1 == lsnp->rank_2 ||
            snp->rank_1 == lsnp->rank_3 ||
            snp->rank_1 == lsnp->rank_4) {
            rank_1_exists = true;
        }
        if (snp->rank_2 == lsnp->rank_1 ||
            snp->rank_2 == lsnp->rank_2 ||
            snp->rank_2 == lsnp->rank_3 ||
            snp->rank_2 == lsnp->rank_4) {
            rank_2_exists = true;
        }

        // Unseen nucleotides go into the first free rank slot: rank_3,
        // then rank_4.
        if (rank_1_exists == false) {
            if (lsnp->rank_3 == 0)
                lsnp->rank_3 = snp->rank_1;
            else
                lsnp->rank_4 = snp->rank_1;
        }
        if (rank_2_exists == false) {
            if (lsnp->rank_3 == 0)
                lsnp->rank_3 = snp->rank_2;
            else
                lsnp->rank_4 = snp->rank_2;
        }

        columns[snp->col] = make_pair("both", lsnp);
    } else {
        // New column: tagged "merge" so existing alleles get padded with the
        // consensus nucleotide at this position in the rebuild below.
        columns[snp->col] = make_pair("merge", snp);
    }

    vector<pair<string, SNP *>> merged_snps;

    for (c = columns.begin(); c != columns.end(); c++)
        merged_snps.push_back((*c).second);

    //
    // Sort the SNPs by column
    //
    sort(merged_snps.begin(), merged_snps.end(), compare_pair_snp);

    //
    // Modify any existing alleles to account for this new SNP. If there are not any alleles,
    // create new ones.
    //
    stringstream sallele;
    set<string> merged_alleles;
    string allele, new_allele;
    int pos;

    if (locus->alleles.size() == 0) {
        // NOTE(review): this seeds a single-nucleotide allele (the consensus
        // base at the new SNP's column) even when merged_snps spans several
        // columns — presumably this path is only reached when the locus had
        // no prior SNPs; confirm against callers.
        sallele << locus->con[snp->col];
        merged_alleles.insert(sallele.str());
    }

    map<string, int>::iterator j;
    vector<pair<string, SNP *>>::iterator k;

    // Rebuild every existing allele string, one character per merged SNP
    // column, in column order.
    for (j = locus->alleles.begin(); j != locus->alleles.end(); j++) {
        allele = j->first;
        new_allele = "";
        pos = 0;

        // cerr << "Allele length: " << allele.size() << "\n";

        for (k = merged_snps.begin(); k != merged_snps.end(); k++) {
            //
            // If we inserted a SNP from the sample, add the proper nucleotide from the consensus
            // sequence to account for it in the allele string.
            //
            if ((*k).first == "merge") {
                new_allele += locus->con[(*k).second->col];
                // cerr << "  Adding char '" << locus->con[k->second->col] << "' from consensus position " << (*k).second->col << "\n";
            } else {
                // Pre-existing column: consume the next character of the
                // old allele string.
                new_allele += allele[pos];
                // cerr << "  Adding char '" << allele[pos] << "' from allele position " << pos << "\n";
                pos++;
            }
        }

        merged_alleles.insert(new_allele);
    }

    set<string>::iterator s;

    // Replace the locus's allele set with the rebuilt haplotypes; depth
    // counts are reset to zero (the catalog does not track allele depth).
    locus->alleles.clear();
    for (s = merged_alleles.begin(); s != merged_alleles.end(); s++) {
        locus->alleles[*s] = 0;
    }

    return 1;
}
1253
//
// Merge the SNPs and alleles of a matching sample locus into this catalog
// locus. Columns present in both get their nucleotide ranks unioned;
// columns unique to either side are kept and the haplotype strings of both
// loci are rebuilt over the merged, column-sorted set of SNPs. The catalog
// locus's snps and alleles are replaced with the merged results. Returns 1.
//
int CLocus::merge_snps(Locus *matched_tag) {
    map<int, pair<string, SNP *>> columns;
    vector<pair<string, SNP *>> merged_snps;
    set<string> merged_alleles;
    SNP *csnp;

    // Index this (catalog) locus's SNPs by column.
    for (auto i = this->snps.begin(); i != this->snps.end(); i++)
        columns[(*i)->col] = make_pair("catalog", *i);

    for (auto i = matched_tag->snps.begin(); i != matched_tag->snps.end(); i++) {
        //
        // Is this column already represented from the previous sample?
        //
        if (columns.count((*i)->col)) {
            csnp = columns[(*i)->col].second;

            //
            // If this is a new allele for this nucleotide, add it to the catalog SNP.
            //
            bool rank_1_exists = false;
            bool rank_2_exists = false;

            if ((*i)->rank_1 == csnp->rank_1 ||
                (*i)->rank_1 == csnp->rank_2 ||
                (*i)->rank_1 == csnp->rank_3 ||
                (*i)->rank_1 == csnp->rank_4) {
                rank_1_exists = true;
            }
            if ((*i)->rank_2 == csnp->rank_1 ||
                (*i)->rank_2 == csnp->rank_2 ||
                (*i)->rank_2 == csnp->rank_3 ||
                (*i)->rank_2 == csnp->rank_4) {
                rank_2_exists = true;
            }

            // Unseen nucleotides fill the first free rank slot (rank_3,
            // then rank_4). Note this mutates the catalog SNP in place.
            if (rank_1_exists == false) {
                if (csnp->rank_3 == 0)
                    csnp->rank_3 = (*i)->rank_1;
                else
                    csnp->rank_4 = (*i)->rank_1;
            }
            if (rank_2_exists == false) {
                if (csnp->rank_3 == 0)
                    csnp->rank_3 = (*i)->rank_2;
                else
                    csnp->rank_4 = (*i)->rank_2;
            }

            columns[(*i)->col] = make_pair("both", csnp);
        } else {
            columns[(*i)->col] = make_pair("sample", *i);
        }
    }

    for (auto c = columns.begin(); c != columns.end(); c++)
        merged_snps.push_back((*c).second);

    //
    // Sort the SNPs by column
    //
    sort(merged_snps.begin(), merged_snps.end(), compare_pair_snp);

    //
    // If the catalog tag has no defined alleles, create a matching haplotype
    // from the consensus sequence before merging in the new alleles.
    //
    string allele, new_allele;
    int pos;

    if (this->alleles.size() == 0) {
        char c;
        new_allele = "";
        for (auto k = merged_snps.begin(); k != merged_snps.end(); k++) {
            csnp = k->second;
            // Columns beyond this consensus sequence become 'N'.
            c = csnp->col < this->len ? this->con[k->second->col] : 'N';

            new_allele += c;

            if (csnp->col > this->len - 1)
                continue;

            // Make sure the consensus nucleotide is represented in the
            // SNP's rank slots.
            if (c != csnp->rank_1 &&
                c != csnp->rank_2 &&
                c != csnp->rank_3 &&
                c != csnp->rank_4) {

                if (csnp->rank_3 == 0)
                    csnp->rank_3 = c;
                else
                    csnp->rank_4 = c;
            }
        }

        if (new_allele.length() > 0)
            merged_alleles.insert(new_allele);
    }

    //
    // Merge the alleles accounting for any SNPs added from either of the two samples.
    //
    for (auto j = this->alleles.begin(); j != this->alleles.end(); j++) {
        allele = j->first;
        new_allele = "";
        pos = 0;

        for (auto k = merged_snps.begin(); k != merged_snps.end(); k++) {
            //
            // If we inserted a SNP from the sample, add the proper nucleotide from the consensus
            // sequence to account for it in the allele string.
            //
            if (k->first == "sample") {
                new_allele += k->second->col > this->len - 1 ? 'N' : this->con[k->second->col];
            } else {
                new_allele += allele[pos];
                pos++;
            }
        }

        merged_alleles.insert(new_allele);
    }

    // Rebuild the matched tag's alleles over the same merged column set,
    // padding columns that only exist in the catalog with its consensus.
    for (auto j = matched_tag->alleles.begin(); j != matched_tag->alleles.end(); j++) {
        allele = j->first;
        new_allele = "";
        pos = 0;

        for (auto k = merged_snps.begin(); k != merged_snps.end(); k++) {
            if (k->first == "catalog") {
                new_allele += k->second->col > matched_tag->len - 1 ? 'N' : matched_tag->con[k->second->col];
            } else {
                new_allele += allele[pos];
                pos++;
            }
        }

        merged_alleles.insert(new_allele);
    }

    //
    // If the matching tag being merged into the catalog had no called SNPs
    // create alleles from the consensus sequence and check that catalog SNP
    // objects contain all the nucleoties.
    //
    if (matched_tag->alleles.size() == 0) {
        char c;
        new_allele = "";
        for (auto k = merged_snps.begin(); k != merged_snps.end(); k++) {
            csnp = k->second;
            c = csnp->col < matched_tag->len ? matched_tag->con[k->second->col] : 'N';
            new_allele += c;

            if (csnp->col > matched_tag->len - 1)
                continue;

            if (c != csnp->rank_1 &&
                c != csnp->rank_2 &&
                c != csnp->rank_3 &&
                c != csnp->rank_4) {

                if (csnp->rank_3 == 0)
                    csnp->rank_3 = c;
                else
                    csnp->rank_4 = c;
            }
        }

        if (new_allele.length() > 0)
            merged_alleles.insert(new_allele);
    }

    //
    // Update the catalog entry's list of SNPs and alleles
    //
    // Fresh SNP objects are allocated so the catalog does not share SNP
    // pointers with the sample locus; lratio is reset for merged SNPs.
    this->snps.clear();

    for (auto k = merged_snps.begin(); k != merged_snps.end(); k++) {
        SNP *snp = new SNP;
        snp->col = (*k).second->col;
        snp->type = (*k).second->type;
        snp->lratio = 0.0;
        snp->rank_1 = (*k).second->rank_1;
        snp->rank_2 = (*k).second->rank_2;
        snp->rank_3 = (*k).second->rank_3;
        snp->rank_4 = (*k).second->rank_4;

        this->snps.push_back(snp);
    }

    this->alleles.clear();
    for (auto s = merged_alleles.begin(); s != merged_alleles.end(); s++) {
        this->alleles[*s] = 0;
    }

    return 1;
}
1449
1450 int
reduce_alleles(set<string> & alleles)1451 CLocus::reduce_alleles(set<string> &alleles)
1452 {
1453 set<string>::iterator it;
1454 uint len, max_len, match, ncnt;
1455 vector<string> haplotypes, cur, next;
1456
1457 max_len = 0;
1458 for (it = alleles.begin(); it != alleles.end(); it++) {
1459 max_len = it->length() > max_len ? it->length() : max_len;
1460 haplotypes.push_back(*it);
1461 }
1462
1463 len = alleles.size();
1464 alleles.clear();
1465
1466 for (uint i = 0; i < len; i++) {
1467 //cerr << "Looking at haplotype[" << i << "]: " << haplotypes[i] << "\n";
1468 //
1469 // We will only look at strings that contain Ns.
1470 //
1471 if (haplotypes[i].find('N') == string::npos) {
1472 alleles.insert(haplotypes[i]);
1473 //cerr << " No Ns, skipping...\n";
1474 continue;
1475 }
1476
1477 uint k = 0;
1478 uint j = i + 1;
1479 while (k < len - 1) {
1480 cur.push_back(haplotypes[j % len]);
1481 k++;
1482 j++;
1483 }
1484
1485 //
1486 // Examine the haplotype alleles one SNP at a time. If we are able to uniquely
1487 // determine a second haplotype that encompasses the first
1488 // to, return it.
1489 //
1490 j = 0;
1491 while (cur.size() > 1 && j < max_len) {
1492
1493 for (k = 0; k < cur.size(); k++) {
1494 cerr << "Comparing haplotypes[" << i << "]: '" << haplotypes[i] << "' to '" << cur[k] << " at position " << j << "'\n";
1495 if (haplotypes[i][j] == cur[k][j] || haplotypes[i][j] == 'N') {
1496 cerr << " Keeping this haplotype.\n";
1497 next.push_back(cur[k]);
1498 } else {
1499 cerr << " Discarding this haplotype.\n";
1500 }
1501 }
1502 cur = next;
1503 next.clear();
1504 j++;
1505 }
1506
1507 //
1508 // If there is only one left, make sure what we have of the haplotype does match
1509 // and its not simply an erroneously called haplotype. If so, then this haplotype
1510 // is encompassed by another, longer haplotype and we do not need to keep it.
1511 //
1512 ncnt = 0;
1513 match = 0;
1514 if (cur.size() > 1) {
1515 cerr << "Discarding " << haplotypes[i] << "\n";
1516 continue;
1517 } else if (cur.size() == 1) {
1518 for (k = 0; k < max_len; k++)
1519 if (haplotypes[i][k] != 'N') ncnt++;
1520 for (k = 0; k < max_len; k++)
1521 if (cur[0][k] == haplotypes[i][k]) match++;
1522 if (match == ncnt) {
1523 cerr << "Discarding " << haplotypes[i] << "\n";
1524 continue;
1525 }
1526 }
1527
1528 cerr << "Keeping " << haplotypes[i] << "\n";
1529 alleles.insert(haplotypes[i]);
1530 }
1531
1532 return 0;
1533 }
1534
1535 int
populate_kmer_hash(map<int,CLocus * > & catalog,CatKmerHashMap & kmer_map,vector<char * > & kmer_map_keys,int kmer_len)1536 populate_kmer_hash(map<int, CLocus *> &catalog, CatKmerHashMap &kmer_map, vector<char *> &kmer_map_keys, int kmer_len)
1537 {
1538 map<int, CLocus *>::iterator it;
1539 vector<pair<allele_type, string>>::iterator allele;
1540 vector<char *> kmers;
1541 CLocus *tag;
1542 char *hash_key;
1543 bool exists;
1544 int j;
1545
1546 //
1547 // Break each stack down into k-mers and create a hash map of those k-mers
1548 // recording in which sequences they occur.
1549 //
1550 int num_kmers = strlen(catalog.begin()->second->con) - kmer_len + 1;
1551
1552 for (it = catalog.begin(); it != catalog.end(); it++) {
1553 tag = it->second;
1554
1555 //
1556 // Iterate through the possible Catalog alleles
1557 //
1558 for (allele = tag->strings.begin(); allele != tag->strings.end(); allele++) {
1559 //
1560 // Generate and hash the kmers for this allele string
1561 //
1562 generate_kmers(allele->second.c_str(), kmer_len, num_kmers, kmers);
1563
1564 for (j = 0; j < num_kmers; j++) {
1565 hash_key = kmers[j];
1566 exists = kmer_map.count(hash_key) == 0 ? false : true;
1567
1568 kmer_map[hash_key].push_back(make_pair(allele->first, tag->id));
1569
1570 if (exists)
1571 delete [] kmers[j];
1572 else
1573 kmer_map_keys.push_back(hash_key);
1574 }
1575
1576 kmers.clear();
1577 }
1578 }
1579
1580 //dump_kmer_map(kmer_map);
1581
1582 return 0;
1583 }
1584
1585 int
write_simple_output(CLocus * tag,ofstream & cat_file,ofstream & snp_file,ofstream & all_file)1586 write_simple_output(CLocus *tag, ofstream &cat_file, ofstream &snp_file, ofstream &all_file)
1587 {
1588 string sources;
1589
1590 for (auto src_it = tag->sources.begin(); src_it != tag->sources.end(); src_it++) {
1591 stringstream s;
1592 s << (*src_it).first << "_" << (*src_it).second << ",";
1593 sources += s.str();
1594 }
1595 sources = sources.substr(0, sources.length() - 1);
1596
1597 cat_file << 0 << "\t" // Catalog has no sample ID.
1598 << tag->id << "\t"
1599 << "consensus" << "\t"
1600 << "0" << "\t" // Catalog has no component number for stacks since only consensus sequences are stored.
1601 << sources << "\t"
1602 << tag->con << "\t"
1603 << 0 << "\t" // These flags are unused in cstacks, but important in ustacks
1604 << 0 << "\t"
1605 << 0 << "\n";
1606
1607 //
1608 // Output the SNPs associated with the catalog tag
1609 //
1610 for (auto snp_it = tag->snps.begin(); snp_it != tag->snps.end(); snp_it++) {
1611 snp_file << "0" << "\t"
1612 << tag->id << "\t"
1613 << (*snp_it)->col << "\t";
1614
1615 switch((*snp_it)->type) {
1616 case snp_type_het:
1617 snp_file << "E\t";
1618 break;
1619 case snp_type_hom:
1620 snp_file << "O\t";
1621 break;
1622 default:
1623 snp_file << "U\t";
1624 break;
1625 }
1626
1627 snp_file <<
1628 (*snp_it)->lratio << "\t" <<
1629 (*snp_it)->rank_1 << "\t" <<
1630 (*snp_it)->rank_2 << "\t" <<
1631 ((*snp_it)->rank_3 == 0 ? '-' : (*snp_it)->rank_3) << "\t" <<
1632 ((*snp_it)->rank_4 == 0 ? '-' : (*snp_it)->rank_4) << "\n";
1633 }
1634
1635 //
1636 // Output the alleles associated with the two matched tags
1637 //
1638 for (auto all_it = tag->alleles.begin(); all_it != tag->alleles.end(); all_it++)
1639 all_file << "0" << "\t"
1640 << tag->id << "\t"
1641 << all_it->first << "\t"
1642 << "0" << "\t" // These two fields are used in the
1643 << "0" << "\n"; // ustacks/pstacks output, not in cstacks.
1644
1645 return 0;
1646 }
1647
1648 int
write_gzip_output(CLocus * tag,gzFile & cat_file,gzFile & snp_file,gzFile & all_file)1649 write_gzip_output(CLocus *tag, gzFile &cat_file, gzFile &snp_file, gzFile &all_file)
1650 {
1651 string sources;
1652 stringstream sstr;
1653
1654 for (auto src_it = tag->sources.begin(); src_it != tag->sources.end(); src_it++) {
1655 sstr << (*src_it).first << "_" << (*src_it).second << ",";
1656 }
1657 sources = sstr.str();
1658 sources = sources.substr(0, sources.length() - 1);
1659
1660 sstr.str("");
1661
1662 sstr << 0 << "\t" // Catalog has no sample ID.
1663 << tag->id << "\t"
1664 << "consensus" << "\t"
1665 << "0" << "\t" // Catalog has no component number for stacks since only consensus sequences are stored.
1666 << sources << "\t"
1667 << tag->con << "\t"
1668 << 0 << "\t" // These flags are unused in cstacks, but important in ustacks
1669 << 0 << "\t"
1670 << 0 << "\n";
1671
1672 gzputs(cat_file, sstr.str().c_str());
1673 sstr.str("");
1674
1675 //
1676 // Output the SNPs associated with the catalog tag
1677 //
1678 for (auto snp_it = tag->snps.begin(); snp_it != tag->snps.end(); snp_it++) {
1679 sstr << "0" << "\t"
1680 << tag->id << "\t"
1681 << (*snp_it)->col << "\t";
1682
1683 switch((*snp_it)->type) {
1684 case snp_type_het:
1685 sstr << "E\t";
1686 break;
1687 case snp_type_hom:
1688 sstr << "O\t";
1689 break;
1690 default:
1691 sstr << "U\t";
1692 break;
1693 }
1694
1695 sstr <<
1696 (*snp_it)->lratio << "\t" <<
1697 (*snp_it)->rank_1 << "\t" <<
1698 (*snp_it)->rank_2 << "\t" <<
1699 ((*snp_it)->rank_3 == 0 ? '-' : (*snp_it)->rank_3) << "\t" <<
1700 ((*snp_it)->rank_4 == 0 ? '-' : (*snp_it)->rank_4) << "\n";
1701 }
1702
1703 gzputs(snp_file, sstr.str().c_str());
1704 sstr.str("");
1705
1706 //
1707 // Output the alleles associated with the two matched tags
1708 //
1709 for (auto all_it = tag->alleles.begin(); all_it != tag->alleles.end(); all_it++)
1710 sstr << "0" << "\t"
1711 << tag->id << "\t"
1712 << all_it->first << "\t"
1713 << 0 << "\t"
1714 << 0 << "\n";
1715
1716 gzputs(all_file, sstr.str().c_str());
1717
1718 return 0;
1719 }
1720
1721 int
initialize_new_catalog(pair<int,string> & sample,map<int,CLocus * > & catalog)1722 initialize_new_catalog(pair<int, string> &sample, map<int, CLocus *> &catalog)
1723 {
1724 map<int, CLocus *> tmp_catalog;
1725 bool compressed = false;
1726
1727 //
1728 // Parse the input files.
1729 //
1730 if (!load_loci(sample.second, tmp_catalog, 0, false, compressed))
1731 return 0;
1732
1733 in_file_type = compressed == true ? FileT::gzsql : FileT::sql;
1734
1735 sample.first = tmp_catalog.begin()->second->sample_id;
1736
1737 //
1738 // Iterate over the catalog entires and renumber them after recording the source of
1739 // locus.
1740 //
1741 for (auto j = tmp_catalog.begin(); j != tmp_catalog.end(); j++) {
1742 j->second->sources.push_back(make_pair(sample.first, j->second->id));
1743 j->second->id = next_catalog_id;
1744
1745 catalog[next_catalog_id] = j->second;
1746
1747 next_catalog_id++;
1748 }
1749
1750 cerr << " " << catalog.size() << " loci were newly added to the catalog.\n";
1751
1752 return 1;
1753 }
1754
1755 int
initialize_existing_catalog(string catalog_path,map<int,CLocus * > & catalog)1756 initialize_existing_catalog(string catalog_path, map<int, CLocus *> &catalog)
1757 {
1758 bool compressed;
1759
1760 //
1761 // Parse the input files.
1762 //
1763 if (!load_loci(catalog_path, catalog, 0, false, compressed))
1764 return 0;
1765
1766 in_file_type = compressed == true ? FileT::gzsql : FileT::sql;
1767
1768 //
1769 // Iterate over the catalog entires and convert the stack components
1770 // into source objects, to record what samples each locus came from.
1771 //
1772 CLocus *loc;
1773 char *p, *q;
1774 int sample_id, locus_id;
1775 size_t max_catalog_id = 0;
1776
1777 for (auto j = catalog.begin(); j != catalog.end(); j++) {
1778 loc = j->second;
1779
1780 if ((size_t) loc->id > max_catalog_id)
1781 max_catalog_id = (size_t) loc->id;
1782
1783 for (uint i = 0; i < loc->comp.size(); i++) {
1784 //
1785 // Parse the ID into sample ID / locus ID, given 43_1356, parse into
1786 // sample ID 43 and locus ID 1356.
1787 //
1788 for (p = loc->comp[i]; *p != '_' && *p != '\0'; p++);
1789 if (*p != '_')
1790 return 0;
1791 p++;
1792 sample_id = strtol(loc->comp[i], &q, 10);
1793 if (*q != '_')
1794 return 0;
1795
1796 locus_id = strtol(p, &q, 10);
1797
1798 if (*q != '\0')
1799 return 0;
1800
1801 loc->sources.push_back(make_pair(sample_id, locus_id));
1802 }
1803 }
1804
1805 next_catalog_id = max_catalog_id + 1;
1806
1807 return 1;
1808 }
1809
parse_command_line(int argc,char * argv[])1810 int parse_command_line(int argc, char* argv[]) {
1811 string in_dir;
1812 string popmap_path;
1813
1814 while (1) {
1815 static struct option long_options[] = {
1816 {"help", no_argument, NULL, 'h'},
1817 {"version", no_argument, NULL, 1000},
1818 {"uniq-haplotypes", no_argument, NULL, 'u'}, {"uniq_haplotypes", no_argument, NULL, 'u'},
1819 {"report-mmatches", no_argument, NULL, 'R'}, {"report_mmatches", no_argument, NULL, 'R'},
1820 {"disable-gapped", no_argument, NULL, 'G'}, {"disable_gapped", no_argument, NULL, 'G'},
1821 {"max-gaps", required_argument, NULL, 'X'}, {"max_gaps", required_argument, NULL, 'X'},
1822 {"min-aln-len", required_argument, NULL, 'x'}, {"min_aln_len", required_argument, NULL, 'x'},
1823 {"ctag-dist", required_argument, NULL, 'n'}, {"ctag_dist", required_argument, NULL, 'n'},
1824 {"k-len", required_argument, NULL, 'k'}, {"k_len", required_argument, NULL, 'k'},
1825 {"in-path", required_argument, NULL, 'P'}, {"in_path", required_argument, NULL, 'P'},
1826 {"popmap", required_argument, NULL, 'M'},
1827 {"catalog", required_argument, NULL, 'c'},
1828 {"sample", required_argument, NULL, 's'},
1829 {"out-path", required_argument, NULL, 'o'}, {"out_path", required_argument, NULL, 'o'},
1830 {"threads", required_argument, NULL, 'p'},
1831 {0, 0, 0, 0}
1832 };
1833
1834 int option_index = 0;
1835 int c = getopt_long(argc, argv, "hvuRGX:x:o:s:c:p:n:k:P:M:", long_options, &option_index);
1836
1837 // Detect the end of the options.
1838 if (c == -1)
1839 break;
1840
1841 switch (c) {
1842 case 'h':
1843 help();
1844 break;
1845 case 'n':
1846 ctag_dist = is_integer(optarg);
1847 break;
1848 case 'k':
1849 set_kmer_len = false;
1850 kmer_len = is_integer(optarg);
1851 break;
1852 case 'R':
1853 report_mmatches = true;
1854 break;
1855 case 's':
1856 samples.push(make_pair(0, optarg));
1857 break;
1858 case 'c':
1859 catalog_path = optarg;
1860 break;
1861 case 'o':
1862 out_path = optarg;
1863 break;
1864 case 'u':
1865 require_uniq_haplotypes = true;
1866 break;
1867 case 'G':
1868 gapped_alignments = false;
1869 break;
1870 case 'X':
1871 max_gaps = is_double(optarg);
1872 break;
1873 case 'x':
1874 min_match_len = is_double(optarg);
1875 break;
1876 case 1000:
1877 version();
1878 break;
1879 case 'p':
1880 num_threads = is_integer(optarg);
1881 break;
1882 case 'P':
1883 in_dir = optarg;
1884 break;
1885 case 'M':
1886 popmap_path = optarg;
1887 break;
1888 case '?':
1889 // getopt_long already printed an error message.
1890 help();
1891 break;
1892 default:
1893 help();
1894 exit(1);
1895 }
1896 }
1897
1898 if (optind < argc) {
1899 cerr << "Error: Failed to parse command line: '" << argv[optind] << "' is seen as a positional argument. Expected no positional arguments.\n";
1900 help();
1901 }
1902
1903 if (in_dir.empty() && samples.empty()) {
1904 cerr << "Error: You must specify one of -P or -s.\n";
1905 help();
1906 } else if ((!in_dir.empty() || !popmap_path.empty())
1907 && (!samples.empty() || !out_path.empty())) {
1908 cerr << "Error: Please use options -P/-M or -s/-o, not both.\n";
1909 help();
1910 }
1911
1912 if (!in_dir.empty()) {
1913 if (popmap_path.empty()) {
1914 cerr << "Error: Please specify a population map (-M).\n";
1915 help();
1916 }
1917
1918 if (in_dir.back() != '/')
1919 in_dir += "/";
1920
1921 // Set `samples`.
1922 MetaPopInfo popmap;
1923 popmap.init_popmap(popmap_path);
1924 for (const Sample& s : popmap.samples())
1925 samples.push({0, in_dir + s.name});
1926
1927 // Set `out_path`.
1928 out_path = in_dir;
1929
1930 } else if (!samples.empty()) {
1931 if (out_path.empty())
1932 out_path = ".";
1933
1934 if (out_path.back() != '/')
1935 out_path += "/";
1936 }
1937
1938 if (set_kmer_len == false && (kmer_len < 5 || kmer_len > 31)) {
1939 cerr << "Kmer length must be between 5 and 31bp.\n";
1940 help();
1941 }
1942
1943 return 0;
1944 }
1945
version()1946 void version() {
1947 cerr << "cstacks " << VERSION << "\n";
1948
1949 exit(1);
1950 }
1951
help()1952 void help() {
1953 cerr << "cstacks " << VERSION << "\n"
1954 << "cstacks -P in_dir -M popmap [-n num_mismatches] [-p num_threads]" << "\n"
1955 << "cstacks -s sample1_path [-s sample2_path ...] -o path [-n num_mismatches] [-p num_threads]" << "\n"
1956 << "\n"
1957 << " -P,--in-path: path to the directory containing Stacks files.\n"
1958 << " -M,--popmap: path to a population map file.\n"
1959 << " -n: number of mismatches allowed between sample loci when build the catalog (default 1; suggested: set to ustacks -M)." << "\n"
1960 << " -p,--threads: enable parallel execution with num_threads threads.\n"
1961 << " -s: sample prefix from which to load loci into the catalog." << "\n"
1962 << " -o,--outpath: output path to write results." << "\n"
1963 << " -c,--catalog <path>: add to an existing catalog.\n"
1964 << "\n"
1965 << "Gapped assembly options:\n"
1966 << " --max-gaps: number of gaps allowed between stacks before merging (default: 2).\n"
1967 << " --min-aln-len: minimum length of aligned sequence in a gapped alignment (default: 0.80).\n"
1968 << " --disable-gapped: disable gapped alignments between stacks (default: use gapped alignments).\n"
1969 << "\n"
1970 << "Advanced options:\n"
1971 << " --k-len <len>: specify k-mer size for matching between between catalog loci (automatically calculated by default).\n"
1972 << " --report-mmatches: report query loci that match more than one catalog locus.\n";
1973
1974 exit(1);
1975 }
1976