1 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
2 //
3 // Copyright 2010-2015, Julian Catchen <jcatchen@illinois.edu>
4 //
5 // This file is part of Stacks.
6 //
7 // Stacks is free software: you can redistribute it and/or modify
8 // it under the terms of the GNU General Public License as published by
9 // the Free Software Foundation, either version 3 of the License, or
10 // (at your option) any later version.
11 //
12 // Stacks is distributed in the hope that it will be useful,
13 // but WITHOUT ANY WARRANTY; without even the implied warranty of
14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 // GNU General Public License for more details.
16 //
17 // You should have received a copy of the GNU General Public License
18 // along with Stacks.  If not, see <http://www.gnu.org/licenses/>.
19 //
20 
21 //
22 // kmers.cc -- routines to generate and hash K-mers
23 //
24 #include "kmers.h"
25 
determine_kmer_length(int read_len,int dist)26 int determine_kmer_length(int read_len, int dist) {
27     int kmer_len, span, min_matches;
28 
29     //
30     // If distance allowed between sequences is 0, then k-mer length equals read length.
31     //
32     if (dist == 0)
33         return read_len;
34 
35     //
36     // Longer k-mer lengths will provide a smaller hash, with better key placement.
37     // Increase the kmer_len until we start to miss hits at the given distance. Then
38     // back the kmer_len off one unit to get the final value.
39     //
40     for (kmer_len = 7; kmer_len < read_len; kmer_len += 2) {
41         span = (kmer_len * (dist + 1)) - 1;
42 
43         min_matches = read_len - span;
44 
45         if (min_matches <= 0) break;
46     }
47 
48     if (kmer_len >= read_len) {
49         cerr << "Unable to find a suitable k-mer length for matching.\n";
50         exit(1);
51     }
52 
53     kmer_len -= 2;
54 
55     return kmer_len;
56 }
57 
calc_min_kmer_matches(int kmer_len,int dist,int read_len,bool exit_err)58 int calc_min_kmer_matches(int kmer_len, int dist, int read_len, bool exit_err) {
59     int span, min_matches;
60 
61     span = (kmer_len * (dist + 1)) - 1;
62 
63     min_matches = read_len - span;
64 
65     if (min_matches <= 0) {
66         cerr <<
67             "Warning: combination of k-mer length (" << kmer_len << ") and edit distance (" << dist << ") allows for " <<
68             "sequences to be missed by the matching algorithm.\n";
69     }
70 
71     if (min_matches <= 0 && exit_err)
72         exit(1);
73     else if (min_matches <= 0)
74         min_matches = 1;
75 
76     return min_matches;
77 }
78 
79 int
initialize_kmers(int kmer_len,int num_kmers,vector<char * > & kmers)80 initialize_kmers(int kmer_len, int num_kmers, vector<char *> &kmers)
81 {
82     char *kmer;
83 
84     for (int i = 0; i < num_kmers; i++) {
85         kmer = new char[kmer_len + 1];
86         kmers.push_back(kmer);
87     }
88 
89     return 0;
90 }
91 
92 int
generate_kmers_lazily(const char * seq,uint kmer_len,uint num_kmers,vector<char * > & kmers)93 generate_kmers_lazily(const char *seq, uint kmer_len, uint num_kmers, vector<char *> &kmers)
94 {
95     char *kmer;
96     const char *k = seq;
97 
98     if (num_kmers > kmers.size()) {
99         int new_kmers = num_kmers - kmers.size();
100 
101         for (int i = 0; i < new_kmers; i++) {
102             kmer = new char[kmer_len + 1];
103             kmers.push_back(kmer);
104         }
105     }
106 
107     for (uint i = 0; i < num_kmers; i++) {
108         kmer = kmers.at(i);
109         strncpy(kmer, k, kmer_len);
110         kmer[kmer_len] = '\0';
111         k++;
112     }
113 
114     return 0;
115 }
116 
117 int
generate_kmers(const char * seq,int kmer_len,int num_kmers,vector<char * > & kmers)118 generate_kmers(const char *seq, int kmer_len, int num_kmers, vector<char *> &kmers)
119 {
120     char *kmer;
121     const char *k = seq;
122 
123     for (int i = 0; i < num_kmers; i++) {
124         kmer = new char[kmer_len + 1];
125         strncpy(kmer, k, kmer_len);
126         kmer[kmer_len] = '\0';
127         kmers.push_back(kmer);
128         k++;
129     }
130 
131     return 0;
132 }
133 
generate_permutations(map<int,char ** > & pstrings,int width)134 int generate_permutations(map<int, char **> &pstrings, int width) {
135     int   i, j, rem, div, num;
136     char *p;
137     //
138     // Given a k-mer that allows wildcards -- 'N' characters, we need to generate all
139     // possible k-mers. To do so, we will generate a range of numbers that we convert to
140     // base 4, assuming that 0 = 'A', 1 = 'C', 2 = 'G', 3 = 'T'.
141     //
142     const int base = 4;
143     int range      = (int) pow(4, width);
144 
145     //
146     // Create an array of strings to hold the permuted nucleotides.
147     //
148     char **strings = new char * [range];
149     for (i = 0; i < range; i++)
150         strings[i] = new char[width + 1];
151 
152     for (i = 0; i < range; i++) {
153         for (j = 0; j < width; j++)
154             strings[i][j] = 'A';
155         strings[i][width] = '\0';
156     }
157 
158     for (i = 0; i < range; i++) {
159         //
160         // Convert this number to base 4
161         //
162         p   = strings[i]; p += width - 1;
163         num = i;
164         do {
165             div = (int) floor(num / base);
166             rem = num % base;
167 
168             switch(rem) {
169             case 0:
170                 *p = 'A';
171                 break;
172             case 1:
173                 *p = 'C';
174                 break;
175             case 2:
176                 *p = 'G';
177                 break;
178             case 3:
179                 *p = 'T';
180                 break;
181             }
182             num = div;
183             p--;
184         } while (div > 0);
185     }
186 
187     pstrings[width] = strings;
188 
189     return 0;
190 }
191 
192 int
populate_kmer_hash(map<int,MergedStack * > & merged,KmerHashMap & kmer_map,vector<char * > & kmer_map_keys,int kmer_len)193 populate_kmer_hash(map<int, MergedStack *> &merged, KmerHashMap &kmer_map, vector<char *> &kmer_map_keys, int kmer_len)
194 {
195     map<int, MergedStack *>::iterator it;
196     MergedStack    *tag;
197     vector<char *>  kmers;
198     bool            exists;
199 
200     //
201     // Break each stack down into k-mers and create a hash map of those k-mers
202     // recording in which sequences they occur.
203     //
204     int num_kmers = strlen(merged.begin()->second->con) - kmer_len + 1;
205 
206     for (it = merged.begin(); it != merged.end(); it++) {
207         tag = it->second;
208 
209         // Don't compute distances for masked tags
210         if (tag->masked) continue;
211 
212         generate_kmers(tag->con, kmer_len, num_kmers, kmers);
213 
214         // Hash the kmers
215         for (int j = 0; j < num_kmers; j++) {
216             exists = kmer_map.count(kmers[j]) == 0 ? false : true;
217 
218             kmer_map[kmers[j]].push_back(tag->id);
219 
220             if (exists)
221                 delete [] kmers[j];
222             else
223                 kmer_map_keys.push_back(kmers[j]);
224         }
225         kmers.clear();
226     }
227 
228     //dump_kmer_map(kmer_map);
229 
230     return 0;
231 }
232 
233 int
populate_kmer_hash(map<int,Locus * > & catalog,CatKmerHashMap & kmer_map,vector<char * > & kmer_map_keys,int kmer_len)234 populate_kmer_hash(map<int, Locus *> &catalog, CatKmerHashMap &kmer_map, vector<char *> &kmer_map_keys, int kmer_len)
235 {
236     map<int, Locus *>::iterator it;
237     vector<pair<allele_type, string> >::iterator allele;
238     vector<char *> kmers;
239     Locus         *tag;
240     char          *hash_key;
241     bool           exists;
242     int            num_kmers;
243 
244     //
245     // Break each stack down into k-mers and create a hash map of those k-mers
246     // recording in which sequences they occur.
247     //
248     for (it = catalog.begin(); it != catalog.end(); it++) {
249         tag = it->second;
250 
251         num_kmers = strlen(tag->con) - kmer_len + 1;
252 
253         //
254         // Iterate through the possible Catalog alleles
255         //
256         for (allele = tag->strings.begin(); allele != tag->strings.end(); allele++) {
257             //
258             // Generate and hash the kmers for this allele string
259             //
260             generate_kmers(allele->second.c_str(), kmer_len, num_kmers, kmers);
261 
262             for (int j = 0; j < num_kmers; j++) {
263                 hash_key = kmers[j];
264                 exists   = kmer_map.count(hash_key) == 0 ? false : true;
265 
266                 kmer_map[hash_key].push_back(make_pair(allele->first, tag->id));
267 
268                 if (exists)
269                     delete [] kmers[j];
270                 else
271                     kmer_map_keys.push_back(hash_key);
272             }
273             kmers.clear();
274         }
275     }
276 
277     //dump_kmer_map(kmer_map);
278 
279     return 0;
280 }
281 
282 int
populate_kmer_hash(map<int,Locus * > & catalog,KmerHashMap & kmer_map,vector<char * > & kmer_map_keys,map<int,pair<allele_type,int>> & allele_map,int kmer_len)283 populate_kmer_hash(map<int, Locus *> &catalog, KmerHashMap &kmer_map, vector<char *> &kmer_map_keys, map<int, pair<allele_type, int> > &allele_map, int kmer_len)
284 {
285     map<int, Locus *>::iterator it;
286     KmerHashMap::iterator   map_it;
287     vector<pair<allele_type, string> >::iterator allele;
288     map<int, pair<allele_type, int> >::iterator  allele_it;
289     vector<char *> kmers;
290     Locus         *tag;
291     char          *hash_key;
292 
293     //
294     // Break each stack down into k-mers and create a hash map of those k-mers
295     // recording in which sequences they occur.
296     //
297     int num_kmers;
298     int allele_index = 0;
299 
300     allele_it = allele_map.begin();
301 
302     for (it = catalog.begin(); it != catalog.end(); it++) {
303         tag = it->second;
304 
305         num_kmers = strlen(tag->con) - kmer_len + 1;
306 
307         //
308         // Iterate through the possible Catalog alleles
309         //
310         for (allele = tag->strings.begin(); allele != tag->strings.end(); allele++) {
311             //
312             // Generate and hash the kmers for this allele string
313             //
314             generate_kmers(allele->second.c_str(), kmer_len, num_kmers, kmers);
315 
316             allele_it = allele_map.insert(allele_it, make_pair(allele_index, make_pair(allele->first, tag->id)));
317 
318             for (int j = 0; j < num_kmers; j++) {
319                 hash_key = kmers[j];
320 
321                 map_it = kmer_map.find(hash_key);
322 
323                 if (map_it != kmer_map.end()) {
324                     map_it->second.push_back(allele_index);
325                     delete [] kmers[j];
326                 } else {
327                     kmer_map[hash_key].push_back(allele_index);
328                     kmer_map_keys.push_back(hash_key);
329                 }
330             }
331             kmers.clear();
332 
333             allele_index++;
334         }
335     }
336 
337     //dump_kmer_map(kmer_map);
338 
339     return 0;
340 }
341 
342 int
populate_kmer_hash(map<int,CLocus * > & catalog,KmerHashMap & kmer_map,vector<char * > & kmer_map_keys,map<int,pair<allele_type,int>> & allele_map,int kmer_len)343 populate_kmer_hash(map<int, CLocus *> &catalog, KmerHashMap &kmer_map, vector<char *> &kmer_map_keys, map<int, pair<allele_type, int> > &allele_map, int kmer_len)
344 {
345     map<int, CLocus *>::iterator it;
346     KmerHashMap::iterator   map_it;
347     vector<pair<allele_type, string> >::iterator allele;
348     map<int, pair<allele_type, int> >::iterator  allele_it;
349     vector<char *> kmers;
350     Locus         *tag;
351     char          *hash_key;
352 
353     //
354     // Break each stack down into k-mers and create a hash map of those k-mers
355     // recording in which sequences they occur.
356     //
357     int num_kmers;
358     int allele_index = 0;
359 
360     allele_it = allele_map.begin();
361 
362     for (it = catalog.begin(); it != catalog.end(); it++) {
363         tag = it->second;
364 
365         num_kmers = strlen(tag->con) - kmer_len + 1;
366 
367         //
368         // Iterate through the possible Catalog alleles
369         //
370         for (allele = tag->strings.begin(); allele != tag->strings.end(); allele++) {
371             //
372             // Generate and hash the kmers for this allele string
373             //
374             generate_kmers(allele->second.c_str(), kmer_len, num_kmers, kmers);
375 
376             allele_it = allele_map.insert(allele_it, make_pair(allele_index, make_pair(allele->first, tag->id)));
377 
378             for (int j = 0; j < num_kmers; j++) {
379                 hash_key = kmers[j];
380 
381                 map_it = kmer_map.find(hash_key);
382 
383                 if (map_it != kmer_map.end()) {
384                     map_it->second.push_back(allele_index);
385                     delete [] kmers[j];
386                 } else {
387                     kmer_map[hash_key].push_back(allele_index);
388                     kmer_map_keys.push_back(hash_key);
389                 }
390             }
391             kmers.clear();
392 
393             allele_index++;
394         }
395     }
396 
397     //dump_kmer_map(kmer_map);
398 
399     return 0;
400 }
401 
402 int
free_kmer_hash(CatKmerHashMap & kmer_map,vector<char * > & kmer_map_keys)403 free_kmer_hash(CatKmerHashMap &kmer_map, vector<char *> &kmer_map_keys)
404 {
405     for (uint i = 0; i < kmer_map_keys.size(); i++) {
406         kmer_map[kmer_map_keys[i]].clear();
407     }
408     kmer_map.clear();
409 
410     for (uint i = 0; i < kmer_map_keys.size(); i++) {
411         delete [] kmer_map_keys[i];
412     }
413     kmer_map_keys.clear();
414 
415     return 0;
416 }
417 
418 int
free_kmer_hash(KmerHashMap & kmer_map,vector<char * > & kmer_map_keys)419 free_kmer_hash(KmerHashMap &kmer_map, vector<char *> &kmer_map_keys)
420 {
421     for (uint i = 0; i < kmer_map_keys.size(); i++) {
422         kmer_map[kmer_map_keys[i]].clear();
423     }
424     kmer_map.clear();
425 
426     for (uint i = 0; i < kmer_map_keys.size(); i++) {
427         delete [] kmer_map_keys[i];
428     }
429     kmer_map_keys.clear();
430 
431     return 0;
432 }
433 
dist(const char * tag_1,Locus * tag_2,allele_type allele)434 int dist(const char *tag_1, Locus *tag_2, allele_type allele) {
435     int   dist = 0;
436     const char *p     = tag_1;
437     const char *p_end = p + strlen(p);
438     const char *q     = NULL;
439     //
440     // Identify which matching string has the proper allele
441     //
442     vector<pair<allele_type, string> >::iterator it;
443 
444     for (it = tag_2->strings.begin(); it != tag_2->strings.end(); it++)
445         if (it->first == allele)
446             q = it->second.c_str();
447     if (q == NULL) return -1;
448 
449     const char *q_end = q + strlen(q);
450 
451     // Count the number of characters that are different
452     // between the two sequences.
453     while (p < p_end && q < q_end) {
454         dist += (*p == *q) ? 0 : 1;
455         p++;
456         q++;
457     }
458 
459     return dist;
460 }
461 
462 int
dist(const char * tag_1,const char * tag_2,vector<pair<char,uint>> & cigar)463 dist(const char *tag_1, const char *tag_2, vector<pair<char, uint> > &cigar)
464 {
465     uint  size = cigar.size();
466     char  op;
467     uint  dist, len_1, len_2, pos_1, pos_2, stop;
468     int   mismatches = 0;
469 
470     len_1 = strlen(tag_1);
471     len_2 = strlen(tag_2);
472     pos_1 = 0;
473     pos_2 = 0;
474 
475     for (uint i = 0; i < size; i++)  {
476         op   = cigar[i].first;
477         dist = cigar[i].second;
478 
479         switch(op) {
480         case 'D':
481             //
482             // A deletion has occured in tag_1 relative to tag_2.
483             //
484             pos_2 += dist;
485             break;
486         case 'I':
487             //
488             // An insertion has occured in tag_1 relative to tag_2.
489             //
490             pos_1 += dist;
491             break;
492         case 'M':
493             stop = pos_1 + dist;
494             while (pos_1 < stop && pos_1 < len_1 && pos_2 < len_2) {
495                 if (tag_1[pos_1] != 'N' && tag_2[pos_2] != 'N' && tag_1[pos_1] != tag_2[pos_2])
496                     mismatches++;
497                 pos_1++;
498                 pos_2++;
499             }
500             break;
501         default:
502             break;
503         }
504     }
505 
506     return mismatches;
507 }
508 
509 int
dist(Locus * tag_1,Locus * tag_2)510 dist(Locus *tag_1, Locus *tag_2)
511 {
512     int   dist  = 0;
513     char *p     = tag_1->con;
514     char *q     = tag_2->con;
515     char *p_end = p + tag_1->len;
516     char *q_end = q + tag_2->len;
517 
518     if (tag_1->len != tag_2->len) {
519         if (tag_1->len < tag_2->len)
520             dist += tag_2->len - tag_1->len;
521         else if (tag_1->len > tag_2->len)
522             dist += tag_1->len - tag_2->len;
523     }
524 
525     //
526     // Count the number of characters that are different
527     // between the two sequences.
528     //
529     while (p < p_end && q < q_end) {
530         dist += (*p == *q) ? 0 : 1;
531         p++;
532         q++;
533     }
534 
535     return dist;
536 }
537 
538 int
dist(MergedStack * tag_1,MergedStack * tag_2)539 dist(MergedStack *tag_1, MergedStack *tag_2)
540 {
541     int   dist  = 0;
542     char *p     = tag_1->con;
543     char *q     = tag_2->con;
544     char *p_end = p + tag_1->len;
545     char *q_end = q + tag_2->len;
546 
547     //
548     // If the sequences are of different lengths, count the missing
549     // nucleotides as mismatches.
550     //
551     if (tag_1->len != tag_2->len) {
552         if (tag_1->len < tag_2->len)
553             dist += tag_2->len - tag_1->len;
554         else if (tag_1->len > tag_2->len)
555             dist += tag_1->len - tag_2->len;
556     }
557 
558     //
559     // Count the number of characters that are different
560     // between the two sequences.
561     //
562     while (p < p_end && q < q_end) {
563         dist += (*p == *q) ? 0 : 1;
564         p++;
565         q++;
566     }
567 
568     return dist;
569 }
570 
571 int
dist(MergedStack * tag_1,char * seq)572 dist(MergedStack *tag_1, char *seq)
573 {
574     int   dist  = 0;
575     char *p     = tag_1->con;
576     char *q     = seq;
577     uint  q_len = strlen(q);
578     char *p_end = p + tag_1->len;
579     char *q_end = q + q_len;
580 
581     //
582     // If the sequences are of different lengths, count the missing
583     // nucleotides as mismatches.
584     //
585     if (tag_1->len != q_len) {
586         if (tag_1->len < q_len)
587             dist += q_len - tag_1->len;
588         else if (tag_1->len > q_len)
589             dist += tag_1->len - q_len;
590     }
591 
592     //
593     // Count the number of characters that are different
594     // between the two sequences.
595     //
596     while (p < p_end && q < q_end) {
597         dist += (*p == *q) ? 0 : 1;
598         p++;
599         q++;
600     }
601 
602     return dist;
603 }
604 
compare_dist(pair<int,int> a,pair<int,int> b)605 bool compare_dist(pair<int, int> a, pair<int, int> b) {
606     return (a.second < b.second);
607 }
608 
609 int
check_frameshift(MergedStack * tag_1,MergedStack * tag_2,size_t mismatches)610 check_frameshift(MergedStack *tag_1, MergedStack *tag_2, size_t mismatches)
611 {
612     size_t cnt  = 0;
613     size_t diff = 0;
614     char const* p     = tag_1->con;
615     char const* q     = tag_2->con;
616     char const* p_end = p + tag_1->len - 1;
617     char const* q_end = q + tag_2->len - 1;
618 
619     //
620     // Set pointers to the common end of the sequences.
621     //
622     if (tag_1->len != tag_2->len) {
623         if (tag_1->len < tag_2->len) {
624             diff = tag_2->len - tag_1->len;
625             p_end -= diff;
626         } else if (tag_1->len > tag_2->len) {
627             diff   = tag_1->len - tag_2->len;
628             q_end -= diff;
629         }
630     }
631 
632     //
633     // Count the number of characters that are different
634     // at the 3' end of the sequence to test for possible frameshifts.
635     //
636     size_t i = diff;
637     while (p_end >= p && q_end >= q && i < mismatches) {
638         cnt += (*p_end != *q_end) ? 1 : 0;
639         p_end--;
640         q_end--;
641         i++;
642     }
643 
644     return cnt;
645 }
646 
647 int
check_frameshift(const char * tag_1,Locus * tag_2,allele_type allele,size_t mismatches)648 check_frameshift(const char *tag_1, Locus *tag_2, allele_type allele, size_t mismatches)
649 {
650     size_t cnt = 0;
651     const char *p     = tag_1;
652     const char *q     = NULL;
653     //
654     // Identify which matching string has the proper allele
655     //
656     vector<pair<allele_type, string> >::iterator it;
657 
658     for (it = tag_2->strings.begin(); it != tag_2->strings.end(); it++)
659         if (it->first == allele)
660             q = it->second.c_str();
661     if (q == NULL) return -1;
662 
663     const char *p_end = p + strlen(p) - 1;
664     const char *q_end = q + strlen(q) - 1;
665 
666     //
667     // Count the number of characters that are different
668     // at the 3' end of the sequence to test for possible frameshifts.
669     //
670     size_t i = 0;
671     while (p_end >= p && q_end >= q && i < mismatches) {
672         cnt += (*p_end != *q_end) ? 1 : 0;
673         p_end--;
674         q_end--;
675         i++;
676     }
677 
678     return cnt;
679 }
680 
681 int
check_frameshift(MergedStack * tag_1,const char * seq,size_t mismatches)682 check_frameshift(MergedStack *tag_1, const char *seq, size_t mismatches)
683 {
684     int   cnt     = 0;
685     const char *p = tag_1->con;
686     const char *q = seq;
687     uint  q_len   = strlen(q);
688     const char *p_end = p + tag_1->len - 1;
689     const char *q_end = q + q_len - 1;
690 
691     //
692     // If the sequences are of different lengths, count the missing
693     // nucleotides as mismatches.
694     //
695     if (tag_1->len != q_len) {
696         if (tag_1->len < q_len)
697             q_end -= q_len - tag_1->len;
698         else if (tag_1->len > q_len)
699             p_end -= tag_1->len - q_len;
700     }
701 
702     //
703     // Count the number of characters that are different
704     // at the 3' end of the sequence to test for possible frameshifts.
705     //
706     size_t i = 0;
707     while (p_end >= p && q_end >= q && i < mismatches) {
708         cnt += (*p_end != *q_end) ? 1 : 0;
709         p_end--;
710         q_end--;
711         i++;
712     }
713 
714     return cnt;
715 }
716 
dump_kmer_map(KmerHashMap & kmer_map)717 int dump_kmer_map(KmerHashMap &kmer_map) {
718     KmerHashMap::iterator kit;
719     vector<int>::iterator vit;
720 
721     cerr << kmer_map.size() << " keys in the map.\n";
722 
723     int i = 1;
724     for (kit = kmer_map.begin(); kit != kmer_map.end(); kit++) {
725         cerr << "Key #" << i << " " << kit->first << ": ";
726         for (vit = (kit->second).begin(); vit != (kit->second).end(); vit++)
727             cerr << " " << *vit;
728         cerr << "\n";
729         i++;
730 
731         if (i > 1000) break;
732     }
733 
734     return 0;
735 }
736