1 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
2 //
3 // Copyright 2010-2018, Julian Catchen <jcatchen@illinois.edu>
4 //
5 // This file is part of Stacks.
6 //
7 // Stacks is free software: you can redistribute it and/or modify
8 // it under the terms of the GNU General Public License as published by
9 // the Free Software Foundation, either version 3 of the License, or
10 // (at your option) any later version.
11 //
12 // Stacks is distributed in the hope that it will be useful,
13 // but WITHOUT ANY WARRANTY; without even the implied warranty of
14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 // GNU General Public License for more details.
16 //
17 // You should have received a copy of the GNU General Public License
18 // along with Stacks.  If not, see <http://www.gnu.org/licenses/>.
19 //
20 
21 //
22 // cstacks -- Create a catalog of Stacks.
23 //
24 
25 #include "MetaPopInfo.h"
26 #include "cstacks.h"
27 
// Global variables to hold command-line options.
queue<pair<int, string>> samples;  // (sample_id, file_path) pairs queued in command-line order; id is filled in after loading.
string  out_path;                  // Output directory for the catalog files.
string  catalog_path;              // Path to an existing catalog to extend; empty means build a new catalog.
FileT   in_file_type      = FileT::sql;
int     ctag_dist         = 1;     // Max mismatches allowed between a sample locus and a catalog locus.
bool    set_kmer_len      = true;  // If true, derive kmer_len automatically from read length and ctag_dist.
int     kmer_len          = 0;     // K-mer length for the matching hash; 0 until computed or set by the user.
searcht search_type       = sequence;  // Loci are matched by sequence identity (see startup banner in main()).
int     num_threads       = 1;     // Number of OpenMP threads requested.
bool    report_mmatches   = false;
bool    require_uniq_haplotypes = false;
bool    gapped_alignments = true;  // After exact k-mer matching, also try gapped (indel-aware) alignments.
double  min_match_len     = 0.80;  // Minimum alignment-length fraction passed to search_for_gaps().
double  max_gaps          = 2.0;   // Gap limit option; not referenced in this chunk — presumably used elsewhere.

size_t  next_catalog_id   = 1;     // Next catalog locus ID handed out by add_unique_tag(); catalog IDs start at 1.

// NOTE(review): unqualified std names (queue, string, pair) are used above this
// directive, so a header pulled in earlier must already open namespace std — confirm.
using namespace std;
47 
//
// Program driver: build (or extend) the catalog by folding each sample's loci
// into it one sample at a time, then write the finished catalog to out_path.
//
int main (int argc, char* argv[]) {
    IF_NDEBUG_TRY

    parse_command_line(argc, argv);

    // Total number of samples given on the command line; used for progress output only.
    uint sample_cnt = samples.size();

    cerr << "cstacks parameters selected:\n"
         << "  Loci matched based on sequence identity.\n"
         << "  Number of mismatches allowed between stacks: " << ctag_dist << "\n"
         << "  Gapped alignments: " << (gapped_alignments ? "enabled" : "disabled") << "\n"
         << "Constructing catalog from " << sample_cnt << " samples.\n";

    //
    // Set the number of OpenMP parallel threads to execute.
    //
    #ifdef _OPENMP
    omp_set_num_threads(num_threads);
    #endif

    map<int, CLocus *> catalog;
    pair<int, string>  s;
    bool compressed = false;  // Passed by value to load_loci below; flag for compressed input — TODO confirm semantics.
    int  i;                   // 1-based index of the next sample to process (for progress messages).

    set<int> seen_sample_ids; // For checking sample ID unicity.

    if (catalog_path.length() > 0) {
        // Extend a previously written catalog; all queued samples remain to be processed.
        cerr << "\nInitializing existing catalog...\n";
        if (!initialize_existing_catalog(catalog_path, catalog)) {
            cerr << "Error: Failed to initialize the catalog.\n";
            throw exception();
        }
        i = 1;

    } else {
        // Seed a new catalog from the first queued sample, which is consumed here.
        s = samples.front();
        samples.pop();

        cerr << "\nInitializing new catalog...\n";
        if (!initialize_new_catalog(s, catalog)) {
            cerr << "Error: Failed to initialize the catalog.\n";
            throw exception();
        }
        i = 2;
        seen_sample_ids.insert(catalog.begin()->second->sample_id);
    }

    while (!samples.empty()) {
        map<int, QLocus *> sample;

        s = samples.front();
        samples.pop();

        cerr << "\nProcessing sample " << s.second << " [" << i << " of " << sample_cnt << "]\n";

        if (!load_loci(s.second, sample, 0, false, compressed)) {
            // NOTE(review): on failure the (possibly partially filled) sample map is
            // abandoned without deleting its QLocus pointers — confirm load_loci
            // cleans up after itself on error.
            cerr << "Failed to load sample " << i << "\n";
            continue;
        }
        //
        // Assign the ID for this sample data.
        //
        s.first = sample.begin()->second->sample_id;

        // Check for unique sample IDs.
        if (!seen_sample_ids.insert(s.first).second) {
            // Insert failed.
            cerr << "Error: Sample ID '" << s.first << "' occurs more than once. Sample IDs must be unique." << endl;
            throw exception();
        }

        //dump_loci(sample);

        // Pass 1: exact (k-mer seeded) sequence matching against the catalog.
        cerr << "Searching for sequence matches...\n";
        find_kmer_matches_by_sequence(catalog, sample, ctag_dist);

        // Pass 2: gapped alignment for loci that found no ungapped match.
        if (gapped_alignments) {
            cerr << "Searching for gapped alignments...\n";
            search_for_gaps(catalog, sample, min_match_len, ctag_dist);
        }

        cerr << "Merging matches into catalog...\n";
        uint mmatches = 0;
        uint gmatches = 0;
        uint umatches = 0;
        uint nmatches = 0;
        uint merge_cloci   = 0;
        uint reduced_cloci = 0;
        merge_matches(catalog, sample, s, ctag_dist, nmatches, umatches, gmatches, mmatches, merge_cloci, reduced_cloci);
        cerr << "  " << umatches << " loci were matched to a catalog locus.\n"
             << "  " << gmatches << " loci were matched to a catalog locus using gapped alignments.\n"
             << "  " << nmatches << " loci were newly added to the catalog.\n"
             << "  " << mmatches << " loci matched more than one catalog locus, linking them.\n"
             << "    " << merge_cloci << " linked catalog loci were merged into " << reduced_cloci << " loci.\n";

        //
        // Regenerate the alleles for the catalog tags after merging the new sample into the catalog.
        //
        for (auto cat_it = catalog.begin(); cat_it != catalog.end(); cat_it++) {
            cat_it->second->populate_alleles();
            cat_it->second->match_cnt = 0;  // Reset the per-sample re-alignment counter used by merge_matches().
        }

        i++;

        // Free this sample's loci before moving on to the next one.
        for (auto query_it = sample.begin(); query_it != sample.end(); query_it++)
            delete query_it->second;
        sample.clear();
    }

    cerr << "\nWriting catalog in directory '" << out_path << "'.\n"
         << "Final catalog contains " << catalog.size() << " loci.\n";

    write_catalog(catalog);
    cerr << "cstacks is done.\n";

    //
    // Free memory associated with the catalog.
    //
    for (auto cat_it = catalog.begin(); cat_it != catalog.end(); cat_it++)
        delete cat_it->second;
    catalog.clear();

    return 0;
    IF_NDEBUG_CATCH_ALL_EXCEPTIONS
}
175 
update_catalog_index(map<int,CLocus * > & catalog,map<string,int> & cat_index)176 int update_catalog_index(map<int, CLocus *> &catalog, map<string, int> &cat_index) {
177     map<int, CLocus *>::iterator j;
178     char id[id_len];
179 
180     for (j = catalog.begin(); j != catalog.end(); j++) {
181         snprintf(id, id_len - 1, "%s|%d|%c",
182                  j->second->loc.chr(),
183                  j->second->loc.bp,
184                  j->second->loc.strand == strand_plus ? '+' : '-');
185 
186         if (cat_index.count(id) == 0) {
187             cat_index[id] = j->first;
188         } else {
189             if (cat_index[id] != j->first)
190                 cerr << "Error: Catalog index mismatch, key: '" << id << "'.\n";
191         }
192     }
193 
194     return 0;
195 }
196 
197 int
characterize_mismatch_snps(CLocus * catalog_tag,QLocus * query_tag)198 characterize_mismatch_snps(CLocus *catalog_tag, QLocus *query_tag)
199 {
200     set<int> snp_cols;
201     uint i;
202     for (i = 0; i < catalog_tag->snps.size(); i++)
203         snp_cols.insert(catalog_tag->snps[i]->col);
204     for (i = 0; i < query_tag->snps.size(); i++)
205         snp_cols.insert(query_tag->snps[i]->col);
206 
207     //
208     // For each mismatch found, create a SNP object
209     //
210     const char *c        = catalog_tag->con;
211     const char *c_beg    = c;
212     const char *c_end    = c + strlen(c);
213     const char *q        = query_tag->con;
214     const char *q_beg    = q;
215     const char *q_end    = q + strlen(q);
216 
217     i = 0;
218     while (c < c_end && q < q_end) {
219         if (snp_cols.count(i) == 0 &&
220             (*c != *q) && (*c != 'N' && *q != 'N')) {
221 
222             // cerr << "Adding a new SNP at position " << c - c_beg << ", " << *c << "/" << *q << "\n";
223             SNP *s = new SNP;
224             s->type   = snp_type_het;
225             s->col    = c - c_beg;
226             s->lratio = 0;
227             s->rank_1 = *c;
228             s->rank_2 = *q;
229 
230             merge_allele(catalog_tag, s);
231             merge_allele(query_tag, s);
232 
233             catalog_tag->snps.push_back(s);
234 
235             s = new SNP;
236             s->type   = snp_type_het;
237             s->col    = q - q_beg;
238             s->lratio = 0;
239             s->rank_1 = *q;
240             s->rank_2 = *c;
241 
242             query_tag->snps.push_back(s);
243         }
244         c++;
245         q++;
246         i++;
247     }
248 
249     return 1;
250 }
251 
//
// Fold one sample's loci into the catalog. For each query locus: if it matched
// nothing, add it as a new catalog locus; otherwise merge it (possibly via a
// gapped alignment) into each catalog locus it matched at minimal distance.
// Query loci that link multiple catalog loci cause those loci to be merged at
// the end. All the uint reference parameters are output counters for the
// caller's summary report.
//
int
merge_matches(map<int, CLocus *> &catalog, map<int, QLocus *> &sample, pair<int, string> &sample_file, int ctag_dist,
              uint &new_matches, uint &unique_matches, uint &gapped_matches, uint &multiple_matches,
              uint &merge_cloci, uint &reduced_cloci)
{
    map<int, QLocus *>::iterator i;
    CLocus *ctag;
    QLocus *qtag;
    string  cseq, qseq, cigar_str;
    int     cseq_len, match_index;

    GappedAln *aln = new GappedAln();

    //
    // Variables to control merging of catalog loci that are linked by haplotypes from one or more sample loci.
    // cloc_merge_list holds groups of catalog IDs to merge; cloc_merge_key maps a catalog ID to its group index.
    //
    vector<vector<int>> cloc_merge_list;
    map<int, int>       cloc_merge_key;

    for (i = sample.begin(); i != sample.end(); i++) {
        qtag = i->second;

        //
        // If this stack didn't match an existing catalog stack, add this stack to the
        // catalog as a new stack.
        //
        if (qtag->matches.size() == 0) {
            add_unique_tag(sample_file, catalog, qtag);
            new_matches++;
            continue;
        }

        //
        // Reduce the set of matches to the minimum distance per query allele.
        // min_allele_dist: best (smallest) distance seen per allele;
        // per_allele_match: the catalog ID that achieved it.
        //
        map<allele_type, size_t> min_allele_dist;
        map<allele_type, size_t> per_allele_match;
        for (uint k = 0; k < qtag->matches.size(); k++)
            if (min_allele_dist.count(qtag->matches[k]->query_type) == 0) {
                min_allele_dist[qtag->matches[k]->query_type]  = qtag->matches[k]->dist;
                per_allele_match[qtag->matches[k]->query_type] = qtag->matches[k]->cat_id;
            } else if (qtag->matches[k]->dist < min_allele_dist[qtag->matches[k]->query_type]) {
                min_allele_dist[qtag->matches[k]->query_type]  = qtag->matches[k]->dist;
                per_allele_match[qtag->matches[k]->query_type] = qtag->matches[k]->cat_id;
            }

        int cat_id = -1;
        set<int> catalog_ids;
        //
        // Iterate over all of the matches of minimal distance and merge this query locus into the catalog locus.
        //
        for (auto j = per_allele_match.begin(); j != per_allele_match.end(); j++) {

            cat_id = j->second;
            //
            // If we already handled this catalog locus, skip the allele.
            //
            if (catalog_ids.count(cat_id) > 0)
                continue;

            //
            // Make a copy of the query tag so that if we are merging it into multiple catalog
            // loci (which will subsequently be collapsed), we will have an unmodified copy for
            // each merge.
            //
            QLocus *qtag_merge = new QLocus(*qtag);

            ctag        = catalog.at(cat_id);
            cigar_str   = "";
            match_index = -1;

            assert(ctag != NULL);
            catalog_ids.insert(cat_id);

            // Locate the match record for this catalog locus to recover its CIGAR string.
            for (uint k = 0; k < qtag_merge->matches.size(); k++)
                if ((int) qtag_merge->matches[k]->cat_id == cat_id) {
                    cigar_str   = qtag_merge->matches[k]->cigar;
                    match_index = k;
                    break;
                }

            assert(match_index >= 0 && match_index < (int) qtag_merge->matches.size());

            string query_allele, query_seq;
            if (gapped_alignments) {
                //
                // Find the proper query allele that was aligned against the catalog.
                //
                query_allele = qtag_merge->matches[match_index]->query_type;
                for (uint k = 0; k < qtag_merge->strings.size(); k++)
                    if (qtag_merge->strings[k].first == query_allele) {
                        query_seq = qtag_merge->strings[k].second;
                        break;
                    }

                //
                // If we have already matched a query locus to this catalog locus for the current
                // sample, we must re-align the sequences in case changes have been made to the
                // sequence by the previous matching sequence.
                //
                if (ctag->match_cnt > 0) {
                    aln->init(ctag->len, query_seq.length());
                    aln->align(ctag->con, query_seq);
                    cigar_str = invert_cigar(aln->result().cigar);
                }
            }

            // A non-empty CIGAR string means this match came from (or required) a gapped alignment.
            bool gapped_aln = false;
            if (cigar_str.length() > 0)
                gapped_aln = true;

            if (gapped_aln) {
                Cigar cigar;
                //
                // Since the match was a gapped alignment, adjust the lengths of the consensus sequences.
                // Adjust the postition of any SNPs that were shifted down sequence due to a gap.
                //
                cseq_len  = parse_cigar(cigar_str.c_str(), cigar);
                qseq      = apply_cigar_to_seq(query_seq.c_str(), cigar);
                adjust_snps_for_gaps(cigar, qtag_merge);

                // Invert the CIGAR to apply the complementary gaps to the catalog side.
                cigar_str = invert_cigar(cigar_str);
                cseq_len  = parse_cigar(cigar_str.c_str(), cigar);
                cseq      = apply_cigar_to_seq(ctag->con, cigar);
                adjust_snps_for_gaps(cigar, ctag);

                assert(qseq.length() == cseq.length());

                //
                // If the alignment modified the catalog locus, record it so we can re-align
                // any other matching sequences from this sample.
                //
                if ((uint)cseq_len > ctag->len)
                    ctag->match_cnt++;

                //
                // Adjust the consensus sequences for both loci.
                //
                ctag->add_consensus(cseq.c_str());
                qtag_merge->add_consensus(qseq.c_str());

                gapped_matches++;

            } else {
                unique_matches++;
            }

            //
            // If mismatches are allowed between query and catalog tags, identify the
            // mismatches and convert them into SNP objects to be merged into the catalog tag.
            //
            if (ctag_dist > 0 && !characterize_mismatch_snps(ctag, qtag_merge))
                cerr
                    << "  Error characterizing mismatch SNPs "
                    << sample_file.second << ", tag " << qtag->id
                    << " with catalog tag " << ctag->id << "\n";

            //
            // Merge the SNPs and alleles from the sample into the catalog tag.
            //
            if (!ctag->merge_snps(qtag_merge)) {
                cerr << "Error merging " << sample_file.second << ", tag " << qtag_merge->id
                     << " with catalog tag " << ctag->id << "\n";
            }

            //
            // Add any new sequence information into the catalog consensus.
            // (Only needed after a gapped alignment, which may have introduced Ns.)
            //
            if (gapped_aln) {
                for (uint k = 0; k < ctag->len && k < qtag_merge->len; k++)
                    if (qtag_merge->con[k] != 'N' && ctag->con[k] == 'N')
                        ctag->con[k] = qtag_merge->con[k];

            }

            assert(strlen(ctag->con) == strlen(qtag_merge->con));

            // Record that this sample locus contributed to this catalog locus.
            ctag->sources.push_back(make_pair(sample_file.first, qtag_merge->id));

            delete qtag_merge;
        }

        //
        // If the query tag matches more than one tag in the catalog, mark the catalog loci for merging.
        //
        if (catalog_ids.size() > 1) {
            multiple_matches++;

            // Absorb any previously recorded merge groups that overlap this one.
            // NOTE(review): catalog_ids is inserted into while being iterated below;
            // std::set iteration is not invalidated by insert, but whether the newly
            // inserted IDs are visited in this loop depends on their ordering —
            // confirm transitive group absorption is intended to rely on the final
            // re-collection loop instead.
            for (auto n = catalog_ids.begin(); n != catalog_ids.end(); n++) {
                cat_id = *n;

                if (cloc_merge_key.count(cat_id) > 0) {
                    uint index = cloc_merge_key[cat_id];
                    for (uint k = 0; k < cloc_merge_list[index].size(); k++)
                        catalog_ids.insert(cloc_merge_list[index][k]);
                    cloc_merge_list[index].clear();
                }
            }

            // Insert the list of catalog loci to merge.
            cloc_merge_list.push_back(vector<int>());
            uint index = cloc_merge_list.size() - 1;

            // Record where to find each catalog locus.
            for (auto n = catalog_ids.begin(); n != catalog_ids.end(); n++) {
                cloc_merge_key[*n] = index;
                cloc_merge_list[index].push_back(*n);
            }
        }
    }

    delete aln;

    //
    // Merge catalog loci, that were linked by a sample locus, together.
    // (Groups emptied by absorption above are skipped.)
    //
    for (uint j = 0; j < cloc_merge_list.size(); j++) {

        if (cloc_merge_list[j].size() == 0) continue;

        merge_cloci  += cloc_merge_list[j].size();
	reduced_cloci++;
        merge_catalog_loci(catalog, cloc_merge_list[j]);
    }
    return 0;
}
478 
479 int
add_unique_tag(pair<int,string> & sample_file,map<int,CLocus * > & catalog,QLocus * qloc)480 add_unique_tag(pair<int, string> &sample_file, map<int, CLocus *> &catalog, QLocus *qloc)
481 {
482     CLocus *c = new CLocus;
483 
484     c->id = next_catalog_id;
485     next_catalog_id++;
486 
487     c->add_consensus(qloc->con);
488     //
489     // Record the source of this catalog tag.
490     //
491     c->sources.push_back(make_pair(sample_file.first, qloc->id));
492     //
493     // Add the physical genome location of this locus.
494     //
495     c->loc.set(qloc->loc.chr(), qloc->loc.bp, qloc->loc.strand);
496 
497     assert(catalog.count(c->id) == 0);
498 
499     catalog[c->id] = c;
500 
501     // cerr << "Adding sample: " << qloc->id << " to the catalog as ID: " << c->id << "\n";
502 
503     for (auto i = qloc->snps.begin(); i != qloc->snps.end(); i++) {
504         SNP *snp    = new SNP;
505         snp->col    = (*i)->col;
506         snp->type   = (*i)->type;
507         snp->lratio = (*i)->lratio;
508         snp->rank_1 = (*i)->rank_1;
509         snp->rank_2 = (*i)->rank_2;
510 
511         c->snps.push_back(snp);
512     }
513 
514     for (auto j = qloc->alleles.begin(); j != qloc->alleles.end(); j++) {
515         c->alleles[j->first] = j->second;
516     }
517 
518     c->populate_alleles();
519 
520     return 0;
521 }
522 
523 int
merge_catalog_loci(map<int,CLocus * > & catalog,vector<int> & merge_list)524 merge_catalog_loci(map<int, CLocus *> &catalog, vector<int> &merge_list)
525 {
526     GappedAln *aln = new GappedAln();
527 
528     sort(merge_list.begin(), merge_list.end());
529 
530     //
531     // Initialize the new locus from the first locus in merge_list.
532     //
533     CLocus *c = catalog[merge_list[0]];
534 
535     for (uint i = 1; i < merge_list.size(); i++) {
536 	CLocus *merge_tag = catalog[merge_list[i]];
537 
538 	//
539 	// Align the consensus sequences together if they are not the same length.
540 	//
541 	if (c->len != merge_tag->len) {
542 	    Cigar  cigar;
543 	    string cigar_str, qseq, cseq;
544 
545 	    aln->init(c->len,  merge_tag->len);
546 	    aln->align(c->con, merge_tag->con);
547 	    cigar_str = invert_cigar(aln->result().cigar);
548 
549 	    parse_cigar(cigar_str.c_str(), cigar);
550 	    qseq = apply_cigar_to_seq(merge_tag->con, cigar);
551 	    adjust_snps_for_gaps(cigar, merge_tag);
552 
553 	    cigar_str = invert_cigar(cigar_str);
554 	    parse_cigar(cigar_str.c_str(), cigar);
555 	    cseq = apply_cigar_to_seq(c->con, cigar);
556 	    adjust_snps_for_gaps(cigar, c);
557 
558 	    assert(qseq.length() == cseq.length());
559 
560 	    for (uint k = 0; k < cseq.length() && k < qseq.length(); k++)
561 		if (qseq[k] != 'N' && cseq[k] == 'N')
562 		    cseq[k] = qseq[k];
563 
564 	    c->add_consensus(cseq.c_str());
565 	}
566 
567 	//
568 	// Record the source of this catalog tag.
569 	//
570 	set<pair<int, int>> sources;
571 	for (uint j = 0; j < c->sources.size(); j++)
572 	    sources.insert(c->sources[j]);
573 	for (uint j = 0; j < merge_tag->sources.size(); j++)
574 	    sources.insert(merge_tag->sources[j]);
575 	c->sources.clear();
576 	for (auto j = sources.begin(); j != sources.end(); j++)
577 	    c->sources.push_back(*j);
578 
579 	//
580 	// Merge SNPs and alleles.
581 	//
582 	c->merge_snps(merge_tag);
583 
584 	catalog.erase(merge_tag->id);
585         delete merge_tag;
586     }
587 
588     c->populate_alleles();
589 
590     delete aln;
591 
592     return 0;
593 }
594 
//
// Match every sample locus against the catalog by exact (ungapped) sequence
// comparison, seeded by a shared-k-mer filter: only catalog alleles sharing at
// least min_hits k-mers with a query allele are fully compared. Matches within
// ctag_dist mismatches are recorded on the query locus. Runs the per-locus
// work in parallel with OpenMP.
//
int find_kmer_matches_by_sequence(map<int, CLocus *> &catalog, map<int, QLocus *> &sample, int ctag_dist) {
    //
    // Calculate the distance (number of mismatches) between each pair
    // of Radtags. We expect all radtags to be the same length;
    //
    KmerHashMap                      kmer_map;
    map<int, pair<allele_type, int>> allele_map;
    vector<char *>                   kmer_map_keys;
    QLocus *tag_1;
    CLocus *tag_2;

    // OpenMP can't parallelize random access iterators, so we convert
    // our map to a vector of integer keys.
    vector<int> keys;
    for (auto it = sample.begin(); it != sample.end(); it++)
        keys.push_back(it->first);

    //
    // Calculate the number of k-mers we will generate. If specified,
    // determine the optimal length for k-mers. Find the minimal sequence length
    // to use as a base to set the kmer size.
    //
    uint con_len = UINT_MAX;
    for (uint i = 0; i < keys.size(); i++) {
        // NOTE(review): tag_1 is assigned here but the expression below looks the
        // locus up again; the assignment appears redundant.
        tag_1 = sample[keys[i]];
        con_len = sample[keys[i]]->len < con_len ? sample[keys[i]]->len : con_len;
    }
    if (set_kmer_len) kmer_len = determine_kmer_length(con_len, ctag_dist);

    //
    // Calculate the minimum number of matching k-mers required for a possible sequence match.
    //
    int min_hits = calc_min_kmer_matches(kmer_len, ctag_dist, con_len, set_kmer_len ? true : false);

    populate_kmer_hash(catalog, kmer_map, kmer_map_keys, allele_map, kmer_len);

    cerr << "  " << catalog.size() << " loci in the catalog, " << kmer_map.size() << " kmers in the catalog hash.\n";

    // tag_1/tag_2 are loop-carried scratch pointers, so each thread needs its own copy.
    #pragma omp parallel private(tag_1, tag_2)
    {
        KmerHashMap::iterator  h;
        vector<char *>         kmers;        // Reusable per-thread k-mer buffers.
        set<string>            uniq_kmers;
        vector<int>            hits;         // Allele-map indexes hit by any query k-mer.
        vector<pair<int, int>> ordered_hits; // (allele_id, hit count) pairs that pass min_hits.
        uint                   hit_cnt, index, prev_id, allele_id, hits_size;
        int                    d;
        pair<allele_type, int> cat_hit;
        int                    num_kmers = con_len - kmer_len + 1;

        initialize_kmers(kmer_len, num_kmers, kmers);

        #pragma omp for
        for (uint i = 0; i < keys.size(); i++) {
            tag_1 = sample[keys[i]];

            for (auto allele = tag_1->strings.begin(); allele != tag_1->strings.end(); allele++) {
                assert(size_t(kmer_len) <= allele->second.length());

                num_kmers = allele->second.length() - kmer_len + 1;
                generate_kmers_lazily(allele->second.c_str(), kmer_len, num_kmers, kmers);

                //
                // We want to create a list of unique kmers to search with; otherwise, repetitive kmers will
                // generate, multiple, spurious hits in sequences with multiple copies of the same kmer.
                //
                uniq_kmers.clear();
                for (int j = 0; j < num_kmers; j++)
                    uniq_kmers.insert(kmers[j]);

                hits.clear();
                ordered_hits.clear();

                //
                // Lookup the occurances of each k-mer in the kmer_map
                //
                for (auto j = uniq_kmers.begin(); j != uniq_kmers.end(); j++) {

                    h = kmer_map.find(j->c_str());

                    if (h != kmer_map.end())
                        for (uint k = 0; k <  h->second.size(); k++)
                            hits.push_back(h->second[k]);
                }

                //
                // Sort the vector of indexes; provides the number of hits to each allele/locus
                // and orders them largest to smallest.
                //
                sort(hits.begin(), hits.end());

                //
                // Iterate through the list of hits and collapse them down by number of kmer hits per allele.
                // (Run-length count over the sorted vector: each run is one allele.)
                //
                hits_size = hits.size();

                if (hits_size == 0)
                    continue;

                prev_id   = hits[0];
                index     = 0;

                do {
                    hit_cnt   = 0;
                    allele_id = prev_id;

                    while (index < hits_size && (uint) hits[index] == prev_id) {
                        hit_cnt++;
                        index++;
                    }

                    if (index < hits_size)
                        prev_id = hits[index];

                    // Keep only alleles with enough shared k-mers to possibly match.
                    if (hit_cnt >= (uint) min_hits)
                        ordered_hits.push_back(make_pair(allele_id, hit_cnt));

                } while (index < hits_size);

                // Full mismatch comparison against each surviving candidate allele.
                for (uint j = 0; j < ordered_hits.size(); j++) {
                    cat_hit = allele_map.at(ordered_hits[j].first);
                    hit_cnt = ordered_hits[j].second;

                    tag_2 = catalog[cat_hit.second];

                    //
                    // If the sequences are not the same length then they have to be reconciled by
                    // the gapped alignment algorithm.
                    //
                    if (tag_1->len != tag_2->len)
                        continue;

                    d = dist(allele->second.c_str(), tag_2, cat_hit.first);

                    assert(d >= 0);

                    //
                    // Check if any of the mismatches occur at the 3' end of the read. If they
                    // do, they may indicate a frameshift is present at the 3' end of the read.
                    // If found, do not merge these tags, leave them for the gapped alignmnet
                    // algorithm.
                    //
                    if (d <= ctag_dist && check_frameshift(allele->second.c_str(), tag_2, cat_hit.first, (size_t) ctag_dist))
                        continue;

                    //
                    // Add a match to the query sequence: catalog ID, catalog allele, query allele, distance
                    //
                    if (d <= ctag_dist)
                        tag_1->add_match(tag_2->id, cat_hit.first, allele->first, d);
                }
            }

            // Sort the vector of distances.
            sort(tag_1->matches.begin(), tag_1->matches.end(), compare_matches);
        }

        //
        // Free the allocated k-mers.
        //
        for (uint j = 0; j < kmers.size(); j++)
            delete [] kmers[j];
        kmers.clear();
    }

    free_kmer_hash(kmer_map, kmer_map_keys);

    return 0;
}
764 
765 int
search_for_gaps(map<int,CLocus * > & catalog,map<int,QLocus * > & sample,double min_match_len,double ctag_dist)766 search_for_gaps(map<int, CLocus *> &catalog, map<int, QLocus *> &sample, double min_match_len, double ctag_dist)
767 {
768     //
769     // Search for loci that can be merged with a gapped alignment.
770     //
771     KmerHashMap                      kmer_map;
772     map<int, pair<allele_type, int>> allele_map;
773     vector<char *>                   kmer_map_keys;
774     QLocus *tag_1;
775     CLocus *tag_2;
776 
777     //
778     // OpenMP can't parallelize random access iterators, so we convert
779     // our map to a vector of integer keys.
780     //
781     vector<int> keys;
782     for (auto it = sample.begin(); it != sample.end(); it++)
783         keys.push_back(it->first);
784 
785     //
786     // Calculate the number of k-mers we will generate. If kmer_len == 0,
787     // determine the optimal length for k-mers.
788     //
789     int con_len   = strlen(sample[keys[0]]->con);
790     int kmer_len  = 19;
791 
792     populate_kmer_hash(catalog, kmer_map, kmer_map_keys, allele_map, kmer_len);
793 
794     #pragma omp parallel private(tag_1, tag_2)
795     {
796         KmerHashMap::iterator    h;
797         AlignRes                 aln_res;
798         vector<char *>           kmers;
799         set<string>              uniq_kmers;
800         vector<int>              hits;
801         vector<pair<int, int>>   ordered_hits;
802         uint                     hit_cnt, index, prev_id, allele_id, hits_size, stop, top_hit;
803         int                      d;
804         vector<pair<char, uint>> cigar;
805         pair<allele_type, int>   cat_hit;
806         string                   cat_seq;
807         int                      num_kmers = con_len - kmer_len + 1;
808 
809         GappedAln *aln = new GappedAln();
810 
811         initialize_kmers(kmer_len, num_kmers, kmers);
812 
813         #pragma omp for schedule(dynamic)
814         for (uint i = 0; i < keys.size(); i++) {
815             tag_1 = sample[keys[i]];
816 
817             //
818             // If we already matched this locus to the catalog without using gapped alignments, skip it now.
819             //
820             if (tag_1->matches.size() > 0)
821                 continue;
822 
823             for (auto allele = tag_1->strings.begin(); allele != tag_1->strings.end(); allele++) {
824 
825                 num_kmers = allele->second.length() - kmer_len + 1;
826                 generate_kmers_lazily(allele->second.c_str(), kmer_len, num_kmers, kmers);
827 
828                 //
829                 // We want to create a list of unique kmers to search with; otherwise, repetitive kmers will
830                 // generate, multiple, spurious hits in sequences with multiple copies of the same kmer.
831                 //
832                 uniq_kmers.clear();
833                 for (int j = 0; j < num_kmers; j++)
834                     uniq_kmers.insert(kmers[j]);
835 
836                 hits.clear();
837                 ordered_hits.clear();
838 
839                 //
840                 // Lookup the occurances of each k-mer in the kmer_map
841                 //
842                 for (auto j = uniq_kmers.begin(); j != uniq_kmers.end(); j++) {
843 
844                     h = kmer_map.find(j->c_str());
845 
846                     if (h != kmer_map.end())
847                         for (uint k = 0; k <  h->second.size(); k++)
848                             hits.push_back(h->second[k]);
849                 }
850 
851                 //
852                 // Sort the vector of indexes; provides the number of hits to each allele/locus
853                 // and orders them largest to smallest.
854                 //
855                 sort(hits.begin(), hits.end());
856 
857                 //
858                 // Iterate through the list of hits and collapse them down by number of kmer hits per allele.
859                 //
860                 hits_size = hits.size();
861 
862                 if (hits_size == 0)
863                     continue;
864 
865                 prev_id = hits[0];
866                 index   = 0;
867 
868                 do {
869                     hit_cnt   = 0;
870                     allele_id = prev_id;
871 
872                     while (index < hits_size && (uint) hits[index] == prev_id) {
873                         hit_cnt++;
874                         index++;
875                     }
876 
877                     if (index < hits_size)
878                         prev_id = hits[index];
879 
880                     ordered_hits.push_back(make_pair(allele_id, hit_cnt));
881 
882                 } while (index < hits_size);
883 
884                 if (ordered_hits.size() == 0)
885                     continue;
886 
887                 //
888                 // Process the hits from most kmer hits to least kmer hits.
889                 //
890                 sort(ordered_hits.begin(), ordered_hits.end(), compare_pair_intint);
891 
892                 //
893                 // Only try to align the sequences with the most kmers in common.
894                 //
895                 top_hit = ordered_hits[0].second;
896                 stop    = 1;
897                 for (uint j = 1; j < ordered_hits.size(); j++)
898                     if ((uint) ordered_hits[j].second < top_hit) {
899                         stop = j;
900                         break;
901                     }
902 
903                 for (uint j = 0; j < stop; j++) {
904                     cat_hit = allele_map.at(ordered_hits[j].first);
905                     tag_2   = catalog[cat_hit.second];
906                     cat_seq = "";
907 
908                     for (uint k = 0; k < tag_2->strings.size(); k++)
909                         if (tag_2->strings[k].first == cat_hit.first) {
910                             cat_seq = tag_2->strings[k].second;
911                             break;
912                         }
913 
914                     aln->init(tag_2->len, tag_1->len);
915 
916                     if (aln->align(cat_seq, allele->second)) {
917                         cigar.clear();
918                         aln->parse_cigar(cigar);
919                         aln_res = aln->result();
920                         d       = dist(cat_seq.c_str(), allele->second.c_str(), cigar);
921 
922                         //
923                         // If the alignment has too many gaps, skip it.
924                         //
925                         if (aln_res.gap_cnt <= (max_gaps + 1)) {
926                             //
927                             // If the alignment doesn't span enough of the two sequences, skip it.
928                             //
929                             if (aln_res.pct_id >= min_match_len) {
930 
931                                 if (d <= ctag_dist)
932                                     tag_1->add_match(tag_2->id, cat_hit.first, allele->first, d, invert_cigar(aln_res.cigar));
933                             }
934                         }
935                     }
936                 }
937             }
938         }
939 
940         //
941         // Free the k-mers we generated for this query
942         //
943         for (uint j = 0; j < kmers.size(); j++)
944             delete [] kmers[j];
945         kmers.clear();
946 
947         delete aln;
948     }
949 
950     free_kmer_hash(kmer_map, kmer_map_keys);
951 
952     return 0;
953 }
954 
compare_matches(Match * a,Match * b)955 bool compare_matches(Match *a, Match *b) {
956     return (a->dist < b->dist);
957 }
958 
find_matches_by_sequence(map<int,CLocus * > & catalog,map<int,QLocus * > & sample)959 int find_matches_by_sequence(map<int, CLocus *> &catalog, map<int, QLocus *> &sample) {
960     //
961     // Calculate the distance (number of mismatches) between each pair
962     // of Radtags. We expect all radtags to be the same length;
963     //
964     map<int, QLocus *>::iterator i;
965     map<int, CLocus *>::iterator j;
966     int k;
967 
968     // OpenMP can't parallelize random access iterators, so we convert
969     // our map to a vector of integer keys.
970     vector<int> keys;
971     for (i = sample.begin(); i != sample.end(); i++)
972         keys.push_back(i->first);
973 
974     #pragma omp parallel private(i, j, k)
975     {
976         #pragma omp for schedule(dynamic)
977         for (k = 0; k < (int) keys.size(); k++) {
978 
979             i = sample.find(keys[k]);
980 
981             vector<pair<allele_type, string>>::iterator r, s;
982 
983             //
984             // Iterate through the possible SAMPLE alleles
985             //
986             for (r = i->second->strings.begin(); r != i->second->strings.end(); r++) {
987 
988                 for (j = catalog.begin(); j != catalog.end(); j++) {
989                     //
990                     // Iterate through the possible CATALOG alleles
991                     //
992                     for (s = j->second->strings.begin(); s != j->second->strings.end(); s++) {
993                         if (r->second == s->second) {
994                             //cerr << "Found a match between " << i->first << " (" << r->first << ") and " << j->first << " (" << s->first << ")\n";
995 
996                             i->second->add_match(j->second->id, s->first, r->first, 0);
997                         }
998                     }
999                 }
1000             }
1001         }
1002     }
1003 
1004     return 0;
1005 }
1006 
find_matches_by_genomic_loc(map<string,int> & cat_index,map<int,QLocus * > & sample)1007 int find_matches_by_genomic_loc(map<string, int> &cat_index, map<int, QLocus *> &sample) {
1008     map<int, QLocus *>::iterator i;
1009     map<int, CLocus *>::iterator j;
1010 
1011     //
1012     // OpenMP can't parallelize random access iterators, so we convert
1013     // our map to a vector of integer keys.
1014     //
1015     vector<int> keys;
1016     for (i = sample.begin(); i != sample.end(); i++)
1017         keys.push_back(i->first);
1018 
1019     #pragma omp parallel private(i, j)
1020     {
1021         char id[id_len];
1022 
1023         #pragma omp for
1024         for (int k = 0; k < (int) keys.size(); k++) {
1025 
1026             i = sample.find(keys[k]);
1027 
1028             snprintf(id, id_len - 1, "%s|%d|%c",
1029                      i->second->loc.chr(),
1030                      i->second->loc.bp,
1031                      i->second->loc.strand == strand_plus ? '+' : '-');
1032 
1033             if (cat_index.count(id) > 0)
1034                 i->second->add_match(cat_index[id], "", "", 0);
1035         }
1036     }
1037 
1038     return 0;
1039 }
1040 
//
// Write the assembled catalog to disk as three tab-separated files --
// tags (consensus sequences), SNPs, and alleles (haplotypes) -- gzipped
// when the input sample files were gzipped. Exits the process if an
// output file cannot be opened; otherwise returns 0.
//
int write_catalog(map<int, CLocus *> &catalog) {

    map<int, CLocus *>::iterator i;
    CLocus  *tag;
    set<int> matches; // (unused in this function)

    // Mirror the compression of the input files in the output.
    bool gzip = (in_file_type == FileT::gzsql) ? true : false;

    string tag_file = out_path + "catalog.tags.tsv";
    string snp_file = out_path + "catalog.snps.tsv";
    string all_file = out_path + "catalog.alleles.tsv";

    if (gzip) {
        tag_file += ".gz";
        snp_file += ".gz";
        all_file += ".gz";
    }

    //
    // Open the output files for writing.
    //
    gzFile   gz_tags, gz_snps, gz_alle;
    ofstream tags, snps, alle;
    if (gzip) {
        gz_tags = gzopen(tag_file.c_str(), "wb");
        if (!gz_tags) {
            cerr << "Error: Unable to open gzipped catalog tag file '" << tag_file << "': " << strerror(errno) << ".\n";
            exit(1);
        }
        // gzbuffer() only exists in zlib >= 1.2.4.
        #if ZLIB_VERNUM >= 0x1240
        gzbuffer(gz_tags, libz_buffer_size);
        #endif
        gz_snps = gzopen(snp_file.c_str(), "wb");
        if (!gz_snps) {
            cerr << "Error: Unable to open gzipped catalog snps file '" << snp_file << "': " << strerror(errno) << ".\n";
            exit(1);
        }
        #if ZLIB_VERNUM >= 0x1240
        gzbuffer(gz_snps, libz_buffer_size);
        #endif
        gz_alle = gzopen(all_file.c_str(), "wb");
        if (!gz_alle) {
            cerr << "Error: Unable to open gzipped catalog alleles file '" << all_file << "': " << strerror(errno) << ".\n";
            exit(1);
        }
        #if ZLIB_VERNUM >= 0x1240
        gzbuffer(gz_alle, libz_buffer_size);
        #endif
    } else {
        tags.open(tag_file.c_str());
        snps.open(snp_file.c_str());
        alle.open(all_file.c_str());
        check_open(tags, tag_file);
        check_open(snps, snp_file);
        check_open(alle, all_file);
    }

    //
    // Record the version of Stacks used and the date generated as a comment in the catalog.
    //
    // Obtain the current date.
    //
    stringstream log;
    time_t       rawtime;
    struct tm   *timeinfo;
    char         date[32];
    time(&rawtime);
    timeinfo = localtime(&rawtime);
    strftime(date, 32, "%F %T", timeinfo);
    log << "# cstacks version " << VERSION << "; catalog generated on " << date << "\n";
    if (gzip) {
        gzputs_throwing(gz_tags, log.str().c_str());
        gzputs_throwing(gz_snps, log.str().c_str());
        gzputs_throwing(gz_alle, log.str().c_str());
    } else {
        tags << log.str();
        snps << log.str();
        alle << log.str();
    }

    // Write each catalog locus to the three output files.
    for (i = catalog.begin(); i != catalog.end(); i++) {
        tag = i->second;

        if (gzip)
            write_gzip_output(tag, gz_tags, gz_snps, gz_alle);
        else
            write_simple_output(tag, tags, snps, alle);
    }

    if (gzip) {
        gzclose_throwing(gz_tags);
        gzclose_throwing(gz_snps);
        gzclose_throwing(gz_alle);
    } else {
        tags.close();
        snps.close();
        alle.close();
    }

    return 0;
}
1142 
//
// Merge a single SNP (called in a matching sample locus) into the SNPs and
// alleles already recorded for a catalog locus. Existing haplotype strings
// are rewritten to account for any newly inserted SNP column.
// Returns 1.
//
int merge_allele(Locus *locus, SNP *snp) {
    // Map of SNP column -> (origin label, SNP), used to merge the incoming
    // SNP with the locus's SNPs in column order. Labels: "sample" = already
    // on the locus, "merge" = new column from the incoming SNP, "both" =
    // column shared by locus and incoming SNP.
    map<int, pair<string, SNP *>> columns;
    map<int, pair<string, SNP *>>::iterator c;
    vector<SNP *>::iterator i;
    SNP *lsnp;

    for (i = locus->snps.begin(); i != locus->snps.end(); i++)
        columns[(*i)->col] = make_pair("sample", *i);

    if (columns.count(snp->col)) {
        // The locus already has a SNP at this column; extend its known
        // nucleotides with any new alleles carried by the incoming SNP.
        lsnp = columns[snp->col].second;

        //
        // If this is a new allele for this nucleotide, add it to the catalog SNP.
        //
        bool rank_1_exists = false;
        bool rank_2_exists = false;

        if (snp->rank_1 == lsnp->rank_1 ||
            snp->rank_1 == lsnp->rank_2 ||
            snp->rank_1 == lsnp->rank_3 ||
            snp->rank_1 == lsnp->rank_4) {
            rank_1_exists = true;
        }
        if (snp->rank_2 == lsnp->rank_1 ||
            snp->rank_2 == lsnp->rank_2 ||
            snp->rank_2 == lsnp->rank_3 ||
            snp->rank_2 == lsnp->rank_4) {
            rank_2_exists = true;
        }

        // Unseen nucleotides fill the first free rank slot (rank_3, then rank_4).
        if (rank_1_exists == false) {
            if (lsnp->rank_3 == 0)
                lsnp->rank_3 = snp->rank_1;
            else
                lsnp->rank_4 = snp->rank_1;
        }
        if (rank_2_exists == false) {
            if (lsnp->rank_3 == 0)
                lsnp->rank_3 = snp->rank_2;
            else
                lsnp->rank_4 = snp->rank_2;
        }

        columns[snp->col] = make_pair("both", lsnp);
    } else {
        // Brand new SNP column contributed by the sample.
        columns[snp->col] = make_pair("merge", snp);
    }

    vector<pair<string, SNP *>> merged_snps;

    for (c = columns.begin(); c != columns.end(); c++)
        merged_snps.push_back((*c).second);

    //
    // Sort the SNPs by column
    //
    sort(merged_snps.begin(), merged_snps.end(), compare_pair_snp);

    //
    // Modify any existing alleles to account for this new SNP. If there are not any alleles,
    // create new ones.
    //
    stringstream sallele;
    set<string> merged_alleles;
    string allele, new_allele;
    int pos;

    if (locus->alleles.size() == 0) {
        // No haplotypes defined yet: seed one from the consensus nucleotide
        // at the new SNP's column.
        sallele << locus->con[snp->col];
        merged_alleles.insert(sallele.str());
    }

    map<string, int>::iterator j;
    vector<pair<string, SNP *>>::iterator k;

    // Rewrite each existing haplotype so it spans the merged SNP columns.
    for (j = locus->alleles.begin(); j != locus->alleles.end(); j++) {
        allele     = j->first;
        new_allele = "";
        pos        = 0;

        // cerr << "Allele length: " << allele.size() << "\n";

        for (k = merged_snps.begin(); k != merged_snps.end(); k++) {
            //
            // If we inserted a SNP from the sample, add the proper nucleotide from the consensus
            // sequence to account for it in the allele string.
            //
            if ((*k).first == "merge") {
                new_allele += locus->con[(*k).second->col];
                // cerr << "  Adding char '" << locus->con[k->second->col] << "' from consensus position " << (*k).second->col << "\n";
            } else {
                new_allele += allele[pos];
                // cerr << "  Adding char '" << allele[pos] << "' from allele position " << pos << "\n";
                pos++;
            }
        }

        merged_alleles.insert(new_allele);
    }

    set<string>::iterator s;

    // Replace the locus's alleles with the merged set; counts reset to zero.
    locus->alleles.clear();
    for (s = merged_alleles.begin(); s != merged_alleles.end(); s++) {
        locus->alleles[*s] = 0;
    }

    return 1;
}
1253 
//
// Merge the SNPs and alleles (haplotypes) of a matching sample locus into
// this catalog locus: take the union of the SNP columns from both loci,
// extend each shared SNP with any nucleotides newly observed in the sample,
// and rewrite the haplotype strings of both loci so they span the merged
// set of SNP columns. Returns 1.
//
int CLocus::merge_snps(Locus *matched_tag) {
    // Map of SNP column -> (origin label, SNP). Labels record whether a
    // column came from the "catalog", the "sample", or "both".
    map<int, pair<string, SNP *>> columns;
    vector<pair<string, SNP *>>   merged_snps;
    set<string> merged_alleles;
    SNP *csnp;

    for (auto i = this->snps.begin(); i != this->snps.end(); i++)
        columns[(*i)->col] = make_pair("catalog", *i);

    for (auto i = matched_tag->snps.begin(); i != matched_tag->snps.end(); i++) {
        //
        // Is this column already represented from the previous sample?
        //
        if (columns.count((*i)->col)) {
            csnp = columns[(*i)->col].second;

            //
            // If this is a new allele for this nucleotide, add it to the catalog SNP.
            //
            bool rank_1_exists = false;
            bool rank_2_exists = false;

            if ((*i)->rank_1 == csnp->rank_1 ||
                (*i)->rank_1 == csnp->rank_2 ||
                (*i)->rank_1 == csnp->rank_3 ||
                (*i)->rank_1 == csnp->rank_4) {
                rank_1_exists = true;
            }
            if ((*i)->rank_2 == csnp->rank_1 ||
                (*i)->rank_2 == csnp->rank_2 ||
                (*i)->rank_2 == csnp->rank_3 ||
                (*i)->rank_2 == csnp->rank_4) {
                rank_2_exists = true;
            }

            // Unseen nucleotides fill the first free rank slot (rank_3, then rank_4).
            if (rank_1_exists == false) {
                if (csnp->rank_3 == 0)
                    csnp->rank_3 = (*i)->rank_1;
                else
                    csnp->rank_4 = (*i)->rank_1;
            }
            if (rank_2_exists == false) {
                if (csnp->rank_3 == 0)
                    csnp->rank_3 = (*i)->rank_2;
                else
                    csnp->rank_4 = (*i)->rank_2;
            }

            columns[(*i)->col] = make_pair("both", csnp);
        } else {
            columns[(*i)->col] = make_pair("sample", *i);
        }
    }

    for (auto c = columns.begin(); c != columns.end(); c++)
        merged_snps.push_back((*c).second);

    //
    // Sort the SNPs by column
    //
    sort(merged_snps.begin(), merged_snps.end(), compare_pair_snp);

    //
    // If the catalog tag has no defined alleles, create a matching haplotype
    // from the consensus sequence before merging in the new alleles.
    //
    string allele, new_allele;
    int    pos;

    if (this->alleles.size() == 0) {
        char c;
        new_allele = "";
        for (auto k = merged_snps.begin(); k != merged_snps.end(); k++) {
            csnp = k->second;
            // Columns past the end of this consensus contribute an 'N'.
            c    = csnp->col < this->len ? this->con[k->second->col] : 'N';

            new_allele += c;

            if (csnp->col > this->len - 1)
                continue;

            // Ensure the consensus nucleotide is recorded in the SNP's rank slots.
            if (c != csnp->rank_1 &&
                c != csnp->rank_2 &&
                c != csnp->rank_3 &&
                c != csnp->rank_4) {

                if (csnp->rank_3 == 0)
                    csnp->rank_3 = c;
                else
                    csnp->rank_4 = c;
            }
        }

        if (new_allele.length() > 0)
            merged_alleles.insert(new_allele);
    }

    //
    // Merge the alleles accounting for any SNPs added from either of the two samples.
    //
    for (auto j = this->alleles.begin(); j != this->alleles.end(); j++) {
        allele     = j->first;
        new_allele = "";
        pos        = 0;

        for (auto k = merged_snps.begin(); k != merged_snps.end(); k++) {
            //
            // If we inserted a SNP from the sample, add the proper nucleotide from the consensus
            // sequence to account for it in the allele string.
            //
            if (k->first == "sample") {
                new_allele += k->second->col > this->len - 1 ? 'N' : this->con[k->second->col];
            } else {
                new_allele += allele[pos];
                pos++;
            }
        }

        merged_alleles.insert(new_allele);
    }

    // Rewrite the sample's haplotypes the same way, filling catalog-only
    // columns from the sample's consensus sequence.
    for (auto j = matched_tag->alleles.begin(); j != matched_tag->alleles.end(); j++) {
        allele     = j->first;
        new_allele = "";
        pos        = 0;

        for (auto k = merged_snps.begin(); k != merged_snps.end(); k++) {
            if (k->first == "catalog") {
                new_allele += k->second->col > matched_tag->len - 1 ? 'N' : matched_tag->con[k->second->col];
            } else {
                new_allele += allele[pos];
                pos++;
            }
        }

        merged_alleles.insert(new_allele);
    }

    //
    // If the matching tag being merged into the catalog had no called SNPs
    // create alleles from the consensus sequence and check that catalog SNP
    // objects contain all the nucleoties.
    //
    if (matched_tag->alleles.size() == 0) {
        char c;
        new_allele = "";
        for (auto k = merged_snps.begin(); k != merged_snps.end(); k++) {
            csnp = k->second;
            c    = csnp->col < matched_tag->len ? matched_tag->con[k->second->col] : 'N';
            new_allele += c;

            if (csnp->col > matched_tag->len - 1)
                continue;

            if (c != csnp->rank_1 &&
                c != csnp->rank_2 &&
                c != csnp->rank_3 &&
                c != csnp->rank_4) {

                if (csnp->rank_3 == 0)
                    csnp->rank_3 = c;
                else
                    csnp->rank_4 = c;
            }
        }

        if (new_allele.length() > 0)
            merged_alleles.insert(new_allele);
    }

    //
    // Update the catalog entry's list of SNPs and alleles
    //
    // NOTE(review): the SNP objects previously held in this->snps are not
    // freed here; presumably ownership/cleanup is handled elsewhere -- confirm.
    this->snps.clear();

    // Store fresh copies of the merged SNPs (lratio reset to 0.0).
    for (auto k = merged_snps.begin(); k != merged_snps.end(); k++) {
        SNP *snp    = new SNP;
        snp->col    = (*k).second->col;
        snp->type   = (*k).second->type;
        snp->lratio = 0.0;
        snp->rank_1 = (*k).second->rank_1;
        snp->rank_2 = (*k).second->rank_2;
        snp->rank_3 = (*k).second->rank_3;
        snp->rank_4 = (*k).second->rank_4;

        this->snps.push_back(snp);
    }

    // Replace the alleles with the merged set; counts reset to zero.
    this->alleles.clear();
    for (auto s = merged_alleles.begin(); s != merged_alleles.end(); s++) {
        this->alleles[*s] = 0;
    }

    return 1;
}
1449 
1450 int
reduce_alleles(set<string> & alleles)1451 CLocus::reduce_alleles(set<string> &alleles)
1452 {
1453     set<string>::iterator it;
1454     uint len, max_len, match, ncnt;
1455     vector<string> haplotypes, cur, next;
1456 
1457     max_len = 0;
1458     for (it = alleles.begin(); it != alleles.end(); it++) {
1459         max_len = it->length() > max_len ? it->length() : max_len;
1460         haplotypes.push_back(*it);
1461     }
1462 
1463     len = alleles.size();
1464     alleles.clear();
1465 
1466     for (uint i = 0; i < len; i++) {
1467         //cerr << "Looking at haplotype[" << i << "]: " << haplotypes[i] << "\n";
1468         //
1469         // We will only look at strings that contain Ns.
1470         //
1471         if (haplotypes[i].find('N') == string::npos) {
1472             alleles.insert(haplotypes[i]);
1473             //cerr << "  No Ns, skipping...\n";
1474             continue;
1475         }
1476 
1477         uint k = 0;
1478         uint j = i + 1;
1479         while (k < len - 1) {
1480             cur.push_back(haplotypes[j % len]);
1481             k++;
1482             j++;
1483         }
1484 
1485         //
1486         // Examine the haplotype alleles one SNP at a time. If we are able to uniquely
1487         // determine a second haplotype that encompasses the first
1488         // to, return it.
1489         //
1490         j = 0;
1491         while (cur.size() > 1 && j < max_len) {
1492 
1493             for (k = 0; k < cur.size(); k++) {
1494                 cerr << "Comparing haplotypes[" << i << "]: '" << haplotypes[i] << "' to '" << cur[k] << " at position " << j << "'\n";
1495                 if (haplotypes[i][j] == cur[k][j] || haplotypes[i][j] == 'N') {
1496                     cerr << "  Keeping this haplotype.\n";
1497                     next.push_back(cur[k]);
1498                 } else {
1499                     cerr << "  Discarding this haplotype.\n";
1500                 }
1501             }
1502             cur = next;
1503             next.clear();
1504             j++;
1505         }
1506 
1507         //
1508         // If there is only one left, make sure what we have of the haplotype does match
1509         // and its not simply an erroneously called haplotype. If so, then this haplotype
1510         // is encompassed by another, longer haplotype and we do not need to keep it.
1511         //
1512         ncnt  = 0;
1513         match = 0;
1514         if (cur.size() > 1) {
1515             cerr << "Discarding " << haplotypes[i] << "\n";
1516             continue;
1517         } else if (cur.size() == 1) {
1518             for (k = 0; k < max_len; k++)
1519                 if (haplotypes[i][k] != 'N') ncnt++;
1520             for (k = 0; k < max_len; k++)
1521                 if (cur[0][k] == haplotypes[i][k]) match++;
1522             if (match == ncnt) {
1523                 cerr << "Discarding " << haplotypes[i] << "\n";
1524                 continue;
1525             }
1526         }
1527 
1528         cerr << "Keeping " << haplotypes[i] << "\n";
1529         alleles.insert(haplotypes[i]);
1530     }
1531 
1532     return 0;
1533 }
1534 
1535 int
populate_kmer_hash(map<int,CLocus * > & catalog,CatKmerHashMap & kmer_map,vector<char * > & kmer_map_keys,int kmer_len)1536 populate_kmer_hash(map<int, CLocus *> &catalog, CatKmerHashMap &kmer_map, vector<char *> &kmer_map_keys, int kmer_len)
1537 {
1538     map<int, CLocus *>::iterator it;
1539     vector<pair<allele_type, string>>::iterator allele;
1540     vector<char *> kmers;
1541     CLocus        *tag;
1542     char          *hash_key;
1543     bool           exists;
1544     int            j;
1545 
1546     //
1547     // Break each stack down into k-mers and create a hash map of those k-mers
1548     // recording in which sequences they occur.
1549     //
1550     int num_kmers = strlen(catalog.begin()->second->con) - kmer_len + 1;
1551 
1552     for (it = catalog.begin(); it != catalog.end(); it++) {
1553         tag = it->second;
1554 
1555         //
1556         // Iterate through the possible Catalog alleles
1557         //
1558         for (allele = tag->strings.begin(); allele != tag->strings.end(); allele++) {
1559             //
1560             // Generate and hash the kmers for this allele string
1561             //
1562             generate_kmers(allele->second.c_str(), kmer_len, num_kmers, kmers);
1563 
1564             for (j = 0; j < num_kmers; j++) {
1565                 hash_key = kmers[j];
1566                 exists   = kmer_map.count(hash_key) == 0 ? false : true;
1567 
1568                 kmer_map[hash_key].push_back(make_pair(allele->first, tag->id));
1569 
1570                 if (exists)
1571                     delete [] kmers[j];
1572                 else
1573                     kmer_map_keys.push_back(hash_key);
1574             }
1575 
1576             kmers.clear();
1577         }
1578     }
1579 
1580     //dump_kmer_map(kmer_map);
1581 
1582     return 0;
1583 }
1584 
1585 int
write_simple_output(CLocus * tag,ofstream & cat_file,ofstream & snp_file,ofstream & all_file)1586 write_simple_output(CLocus *tag, ofstream &cat_file, ofstream &snp_file, ofstream &all_file)
1587 {
1588     string sources;
1589 
1590     for (auto src_it = tag->sources.begin(); src_it != tag->sources.end(); src_it++) {
1591         stringstream s;
1592         s << (*src_it).first << "_" << (*src_it).second << ",";
1593         sources += s.str();
1594     }
1595     sources = sources.substr(0, sources.length() - 1);
1596 
1597     cat_file << 0           << "\t" // Catalog has no sample ID.
1598              << tag->id     << "\t"
1599              << "consensus" << "\t"
1600              << "0"         << "\t" // Catalog has no component number for stacks since only consensus sequences are stored.
1601              << sources     << "\t"
1602              << tag->con    << "\t"
1603              << 0           << "\t" // These flags are unused in cstacks, but important in ustacks
1604              << 0           << "\t"
1605              << 0           << "\n";
1606 
1607     //
1608     // Output the SNPs associated with the catalog tag
1609     //
1610     for (auto snp_it = tag->snps.begin(); snp_it != tag->snps.end(); snp_it++) {
1611         snp_file << "0"            << "\t"
1612                  << tag->id        << "\t"
1613                  << (*snp_it)->col << "\t";
1614 
1615         switch((*snp_it)->type) {
1616         case snp_type_het:
1617             snp_file << "E\t";
1618             break;
1619         case snp_type_hom:
1620             snp_file << "O\t";
1621             break;
1622         default:
1623             snp_file << "U\t";
1624             break;
1625         }
1626 
1627         snp_file <<
1628             (*snp_it)->lratio << "\t" <<
1629             (*snp_it)->rank_1 << "\t" <<
1630             (*snp_it)->rank_2 << "\t" <<
1631             ((*snp_it)->rank_3 == 0 ? '-' : (*snp_it)->rank_3) << "\t" <<
1632             ((*snp_it)->rank_4 == 0 ? '-' : (*snp_it)->rank_4) << "\n";
1633     }
1634 
1635     //
1636     // Output the alleles associated with the two matched tags
1637     //
1638     for (auto all_it = tag->alleles.begin(); all_it != tag->alleles.end(); all_it++)
1639         all_file << "0"           << "\t"
1640                  << tag->id       << "\t"
1641                  << all_it->first << "\t"
1642                  << "0"           << "\t"    // These two fields are used in the
1643                  << "0"           << "\n";   // ustacks/pstacks output, not in cstacks.
1644 
1645     return 0;
1646 }
1647 
1648 int
write_gzip_output(CLocus * tag,gzFile & cat_file,gzFile & snp_file,gzFile & all_file)1649 write_gzip_output(CLocus *tag, gzFile &cat_file, gzFile &snp_file, gzFile &all_file)
1650 {
1651     string       sources;
1652     stringstream sstr;
1653 
1654     for (auto src_it = tag->sources.begin(); src_it != tag->sources.end(); src_it++) {
1655         sstr << (*src_it).first << "_" << (*src_it).second << ",";
1656     }
1657     sources = sstr.str();
1658     sources = sources.substr(0, sources.length() - 1);
1659 
1660     sstr.str("");
1661 
1662     sstr << 0           << "\t" // Catalog has no sample ID.
1663          << tag->id     << "\t"
1664          << "consensus" << "\t"
1665          << "0"         << "\t" // Catalog has no component number for stacks since only consensus sequences are stored.
1666          << sources     << "\t"
1667          << tag->con    << "\t"
1668          << 0           << "\t" // These flags are unused in cstacks, but important in ustacks
1669          << 0           << "\t"
1670          << 0           << "\n";
1671 
1672     gzputs(cat_file, sstr.str().c_str());
1673     sstr.str("");
1674 
1675     //
1676     // Output the SNPs associated with the catalog tag
1677     //
1678     for (auto snp_it = tag->snps.begin(); snp_it != tag->snps.end(); snp_it++) {
1679         sstr << "0"            << "\t"
1680              <<   tag->id      << "\t"
1681              << (*snp_it)->col << "\t";
1682 
1683         switch((*snp_it)->type) {
1684         case snp_type_het:
1685             sstr << "E\t";
1686             break;
1687         case snp_type_hom:
1688             sstr << "O\t";
1689             break;
1690         default:
1691             sstr << "U\t";
1692             break;
1693         }
1694 
1695         sstr <<
1696             (*snp_it)->lratio << "\t" <<
1697             (*snp_it)->rank_1 << "\t" <<
1698             (*snp_it)->rank_2 << "\t" <<
1699             ((*snp_it)->rank_3 == 0 ? '-' : (*snp_it)->rank_3) << "\t" <<
1700             ((*snp_it)->rank_4 == 0 ? '-' : (*snp_it)->rank_4) << "\n";
1701     }
1702 
1703     gzputs(snp_file, sstr.str().c_str());
1704     sstr.str("");
1705 
1706     //
1707     // Output the alleles associated with the two matched tags
1708     //
1709     for (auto all_it = tag->alleles.begin(); all_it != tag->alleles.end(); all_it++)
1710         sstr << "0"     << "\t"
1711              << tag->id << "\t"
1712              << all_it->first << "\t"
1713              << 0       << "\t"
1714              << 0       << "\n";
1715 
1716     gzputs(all_file, sstr.str().c_str());
1717 
1718     return 0;
1719 }
1720 
1721 int
initialize_new_catalog(pair<int,string> & sample,map<int,CLocus * > & catalog)1722 initialize_new_catalog(pair<int, string> &sample, map<int, CLocus *> &catalog)
1723 {
1724     map<int, CLocus *> tmp_catalog;
1725     bool compressed = false;
1726 
1727     //
1728     // Parse the input files.
1729     //
1730     if (!load_loci(sample.second, tmp_catalog, 0, false, compressed))
1731         return 0;
1732 
1733     in_file_type = compressed == true ? FileT::gzsql : FileT::sql;
1734 
1735     sample.first = tmp_catalog.begin()->second->sample_id;
1736 
1737     //
1738     // Iterate over the catalog entires and renumber them after recording the source of
1739     // locus.
1740     //
1741     for (auto j = tmp_catalog.begin(); j != tmp_catalog.end(); j++) {
1742         j->second->sources.push_back(make_pair(sample.first, j->second->id));
1743         j->second->id = next_catalog_id;
1744 
1745         catalog[next_catalog_id] = j->second;
1746 
1747         next_catalog_id++;
1748     }
1749 
1750     cerr << "  " << catalog.size() << " loci were newly added to the catalog.\n";
1751 
1752     return 1;
1753 }
1754 
1755 int
initialize_existing_catalog(string catalog_path,map<int,CLocus * > & catalog)1756 initialize_existing_catalog(string catalog_path, map<int, CLocus *> &catalog)
1757 {
1758     bool compressed;
1759 
1760     //
1761     // Parse the input files.
1762     //
1763     if (!load_loci(catalog_path, catalog, 0, false, compressed))
1764         return 0;
1765 
1766     in_file_type = compressed == true ? FileT::gzsql : FileT::sql;
1767 
1768     //
1769     // Iterate over the catalog entires and convert the stack components
1770     // into source objects, to record what samples each locus came from.
1771     //
1772     CLocus *loc;
1773     char   *p, *q;
1774     int     sample_id, locus_id;
1775     size_t  max_catalog_id = 0;
1776 
1777     for (auto j = catalog.begin(); j != catalog.end(); j++) {
1778         loc = j->second;
1779 
1780         if ((size_t) loc->id > max_catalog_id)
1781             max_catalog_id = (size_t) loc->id;
1782 
1783         for (uint i = 0; i < loc->comp.size(); i++) {
1784             //
1785             // Parse the ID into sample ID / locus ID, given 43_1356, parse into
1786             // sample ID 43 and locus ID 1356.
1787             //
1788             for (p = loc->comp[i]; *p != '_' && *p != '\0'; p++);
1789             if (*p != '_')
1790                 return 0;
1791             p++;
1792             sample_id = strtol(loc->comp[i], &q, 10);
1793             if (*q != '_')
1794                 return 0;
1795 
1796             locus_id = strtol(p, &q, 10);
1797 
1798             if (*q != '\0')
1799                 return 0;
1800 
1801             loc->sources.push_back(make_pair(sample_id, locus_id));
1802         }
1803     }
1804 
1805     next_catalog_id = max_catalog_id + 1;
1806 
1807     return 1;
1808 }
1809 
parse_command_line(int argc,char * argv[])1810 int parse_command_line(int argc, char* argv[]) {
1811     string in_dir;
1812     string popmap_path;
1813 
1814     while (1) {
1815         static struct option long_options[] = {
1816             {"help",            no_argument, NULL, 'h'},
1817             {"version",         no_argument, NULL, 1000},
1818             {"uniq-haplotypes", no_argument, NULL, 'u'}, {"uniq_haplotypes", no_argument, NULL, 'u'},
1819             {"report-mmatches", no_argument, NULL, 'R'}, {"report_mmatches", no_argument, NULL, 'R'},
1820             {"disable-gapped",  no_argument, NULL, 'G'}, {"disable_gapped",  no_argument, NULL, 'G'},
1821             {"max-gaps",        required_argument, NULL, 'X'}, {"max_gaps",        required_argument, NULL, 'X'},
1822             {"min-aln-len",     required_argument, NULL, 'x'}, {"min_aln_len",     required_argument, NULL, 'x'},
1823             {"ctag-dist",       required_argument, NULL, 'n'}, {"ctag_dist",       required_argument, NULL, 'n'},
1824             {"k-len",           required_argument, NULL, 'k'}, {"k_len",           required_argument, NULL, 'k'},
1825             {"in-path",         required_argument, NULL, 'P'}, {"in_path",         required_argument, NULL, 'P'},
1826             {"popmap",          required_argument, NULL, 'M'},
1827             {"catalog",         required_argument, NULL, 'c'},
1828             {"sample",          required_argument, NULL, 's'},
1829             {"out-path",        required_argument, NULL, 'o'}, {"out_path",        required_argument, NULL, 'o'},
1830             {"threads",         required_argument, NULL, 'p'},
1831             {0, 0, 0, 0}
1832         };
1833 
1834         int option_index = 0;
1835         int c = getopt_long(argc, argv, "hvuRGX:x:o:s:c:p:n:k:P:M:", long_options, &option_index);
1836 
1837         // Detect the end of the options.
1838         if (c == -1)
1839             break;
1840 
1841         switch (c) {
1842         case 'h':
1843             help();
1844             break;
1845         case 'n':
1846             ctag_dist = is_integer(optarg);
1847             break;
1848         case 'k':
1849             set_kmer_len = false;
1850             kmer_len     = is_integer(optarg);
1851             break;
1852         case 'R':
1853             report_mmatches = true;
1854             break;
1855         case 's':
1856             samples.push(make_pair(0, optarg));
1857             break;
1858         case 'c':
1859             catalog_path = optarg;
1860             break;
1861         case 'o':
1862             out_path = optarg;
1863             break;
1864         case 'u':
1865             require_uniq_haplotypes = true;
1866             break;
1867         case 'G':
1868             gapped_alignments = false;
1869             break;
1870         case 'X':
1871             max_gaps = is_double(optarg);
1872             break;
1873         case 'x':
1874             min_match_len = is_double(optarg);
1875             break;
1876         case 1000:
1877             version();
1878             break;
1879         case 'p':
1880             num_threads = is_integer(optarg);
1881             break;
1882         case 'P':
1883             in_dir = optarg;
1884             break;
1885         case 'M':
1886             popmap_path = optarg;
1887             break;
1888         case '?':
1889             // getopt_long already printed an error message.
1890             help();
1891             break;
1892         default:
1893             help();
1894             exit(1);
1895         }
1896     }
1897 
1898     if (optind < argc) {
1899         cerr << "Error: Failed to parse command line: '" << argv[optind] << "' is seen as a positional argument. Expected no positional arguments.\n";
1900         help();
1901     }
1902 
1903     if (in_dir.empty() && samples.empty()) {
1904         cerr << "Error: You must specify one of -P or -s.\n";
1905         help();
1906     } else if ((!in_dir.empty() || !popmap_path.empty())
1907             && (!samples.empty() || !out_path.empty())) {
1908         cerr << "Error: Please use options -P/-M or -s/-o, not both.\n";
1909         help();
1910     }
1911 
1912     if (!in_dir.empty()) {
1913         if (popmap_path.empty()) {
1914             cerr << "Error: Please specify a population map (-M).\n";
1915             help();
1916         }
1917 
1918         if (in_dir.back() != '/')
1919             in_dir += "/";
1920 
1921         // Set `samples`.
1922         MetaPopInfo popmap;
1923         popmap.init_popmap(popmap_path);
1924         for (const Sample& s : popmap.samples())
1925             samples.push({0, in_dir + s.name});
1926 
1927         // Set `out_path`.
1928         out_path = in_dir;
1929 
1930     } else if (!samples.empty()) {
1931         if (out_path.empty())
1932             out_path = ".";
1933 
1934         if (out_path.back() != '/')
1935             out_path += "/";
1936     }
1937 
1938     if (set_kmer_len == false && (kmer_len < 5 || kmer_len > 31)) {
1939         cerr << "Kmer length must be between 5 and 31bp.\n";
1940         help();
1941     }
1942 
1943     return 0;
1944 }
1945 
//
// Print the program name and version string to stderr, then terminate.
// Note: exits with status 1, matching the convention used by help().
//
void version() {
    cerr << "cstacks " << VERSION << "\n";

    exit(1);
}
1951 
help()1952 void help() {
1953     cerr << "cstacks " << VERSION << "\n"
1954               << "cstacks -P in_dir -M popmap [-n num_mismatches] [-p num_threads]" << "\n"
1955               << "cstacks -s sample1_path [-s sample2_path ...] -o path [-n num_mismatches] [-p num_threads]" << "\n"
1956               << "\n"
1957               << "  -P,--in-path: path to the directory containing Stacks files.\n"
1958               << "  -M,--popmap: path to a population map file.\n"
1959               << "  -n: number of mismatches allowed between sample loci when build the catalog (default 1; suggested: set to ustacks -M)." << "\n"
1960               << "  -p,--threads: enable parallel execution with num_threads threads.\n"
1961               << "  -s: sample prefix from which to load loci into the catalog." << "\n"
1962               << "  -o,--outpath: output path to write results." << "\n"
1963               << "  -c,--catalog <path>: add to an existing catalog.\n"
1964               << "\n"
1965               << "Gapped assembly options:\n"
1966               << "  --max-gaps: number of gaps allowed between stacks before merging (default: 2).\n"
1967               << "  --min-aln-len: minimum length of aligned sequence in a gapped alignment (default: 0.80).\n"
1968               << "  --disable-gapped: disable gapped alignments between stacks (default: use gapped alignments).\n"
1969               << "\n"
1970               << "Advanced options:\n"
1971               << "  --k-len <len>: specify k-mer size for matching between between catalog loci (automatically calculated by default).\n"
1972               << "  --report-mmatches: report query loci that match more than one catalog locus.\n";
1973 
1974     exit(1);
1975 }
1976