1 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
2 //
3 // Copyright 2010-2015, Julian Catchen <jcatchen@illinois.edu>
4 //
5 // This file is part of Stacks.
6 //
7 // Stacks is free software: you can redistribute it and/or modify
8 // it under the terms of the GNU General Public License as published by
9 // the Free Software Foundation, either version 3 of the License, or
10 // (at your option) any later version.
11 //
12 // Stacks is distributed in the hope that it will be useful,
13 // but WITHOUT ANY WARRANTY; without even the implied warranty of
14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 // GNU General Public License for more details.
16 //
17 // You should have received a copy of the GNU General Public License
18 // along with Stacks. If not, see <http://www.gnu.org/licenses/>.
19 //
20
21 //
22 // kmers.cc -- routines to generate and hash K-mers
23 //
24 #include "kmers.h"
25
determine_kmer_length(int read_len,int dist)26 int determine_kmer_length(int read_len, int dist) {
27 int kmer_len, span, min_matches;
28
29 //
30 // If distance allowed between sequences is 0, then k-mer length equals read length.
31 //
32 if (dist == 0)
33 return read_len;
34
35 //
36 // Longer k-mer lengths will provide a smaller hash, with better key placement.
37 // Increase the kmer_len until we start to miss hits at the given distance. Then
38 // back the kmer_len off one unit to get the final value.
39 //
40 for (kmer_len = 7; kmer_len < read_len; kmer_len += 2) {
41 span = (kmer_len * (dist + 1)) - 1;
42
43 min_matches = read_len - span;
44
45 if (min_matches <= 0) break;
46 }
47
48 if (kmer_len >= read_len) {
49 cerr << "Unable to find a suitable k-mer length for matching.\n";
50 exit(1);
51 }
52
53 kmer_len -= 2;
54
55 return kmer_len;
56 }
57
calc_min_kmer_matches(int kmer_len,int dist,int read_len,bool exit_err)58 int calc_min_kmer_matches(int kmer_len, int dist, int read_len, bool exit_err) {
59 int span, min_matches;
60
61 span = (kmer_len * (dist + 1)) - 1;
62
63 min_matches = read_len - span;
64
65 if (min_matches <= 0) {
66 cerr <<
67 "Warning: combination of k-mer length (" << kmer_len << ") and edit distance (" << dist << ") allows for " <<
68 "sequences to be missed by the matching algorithm.\n";
69 }
70
71 if (min_matches <= 0 && exit_err)
72 exit(1);
73 else if (min_matches <= 0)
74 min_matches = 1;
75
76 return min_matches;
77 }
78
79 int
initialize_kmers(int kmer_len,int num_kmers,vector<char * > & kmers)80 initialize_kmers(int kmer_len, int num_kmers, vector<char *> &kmers)
81 {
82 char *kmer;
83
84 for (int i = 0; i < num_kmers; i++) {
85 kmer = new char[kmer_len + 1];
86 kmers.push_back(kmer);
87 }
88
89 return 0;
90 }
91
92 int
generate_kmers_lazily(const char * seq,uint kmer_len,uint num_kmers,vector<char * > & kmers)93 generate_kmers_lazily(const char *seq, uint kmer_len, uint num_kmers, vector<char *> &kmers)
94 {
95 char *kmer;
96 const char *k = seq;
97
98 if (num_kmers > kmers.size()) {
99 int new_kmers = num_kmers - kmers.size();
100
101 for (int i = 0; i < new_kmers; i++) {
102 kmer = new char[kmer_len + 1];
103 kmers.push_back(kmer);
104 }
105 }
106
107 for (uint i = 0; i < num_kmers; i++) {
108 kmer = kmers.at(i);
109 strncpy(kmer, k, kmer_len);
110 kmer[kmer_len] = '\0';
111 k++;
112 }
113
114 return 0;
115 }
116
117 int
generate_kmers(const char * seq,int kmer_len,int num_kmers,vector<char * > & kmers)118 generate_kmers(const char *seq, int kmer_len, int num_kmers, vector<char *> &kmers)
119 {
120 char *kmer;
121 const char *k = seq;
122
123 for (int i = 0; i < num_kmers; i++) {
124 kmer = new char[kmer_len + 1];
125 strncpy(kmer, k, kmer_len);
126 kmer[kmer_len] = '\0';
127 kmers.push_back(kmer);
128 k++;
129 }
130
131 return 0;
132 }
133
generate_permutations(map<int,char ** > & pstrings,int width)134 int generate_permutations(map<int, char **> &pstrings, int width) {
135 int i, j, rem, div, num;
136 char *p;
137 //
138 // Given a k-mer that allows wildcards -- 'N' characters, we need to generate all
139 // possible k-mers. To do so, we will generate a range of numbers that we convert to
140 // base 4, assuming that 0 = 'A', 1 = 'C', 2 = 'G', 3 = 'T'.
141 //
142 const int base = 4;
143 int range = (int) pow(4, width);
144
145 //
146 // Create an array of strings to hold the permuted nucleotides.
147 //
148 char **strings = new char * [range];
149 for (i = 0; i < range; i++)
150 strings[i] = new char[width + 1];
151
152 for (i = 0; i < range; i++) {
153 for (j = 0; j < width; j++)
154 strings[i][j] = 'A';
155 strings[i][width] = '\0';
156 }
157
158 for (i = 0; i < range; i++) {
159 //
160 // Convert this number to base 4
161 //
162 p = strings[i]; p += width - 1;
163 num = i;
164 do {
165 div = (int) floor(num / base);
166 rem = num % base;
167
168 switch(rem) {
169 case 0:
170 *p = 'A';
171 break;
172 case 1:
173 *p = 'C';
174 break;
175 case 2:
176 *p = 'G';
177 break;
178 case 3:
179 *p = 'T';
180 break;
181 }
182 num = div;
183 p--;
184 } while (div > 0);
185 }
186
187 pstrings[width] = strings;
188
189 return 0;
190 }
191
192 int
populate_kmer_hash(map<int,MergedStack * > & merged,KmerHashMap & kmer_map,vector<char * > & kmer_map_keys,int kmer_len)193 populate_kmer_hash(map<int, MergedStack *> &merged, KmerHashMap &kmer_map, vector<char *> &kmer_map_keys, int kmer_len)
194 {
195 map<int, MergedStack *>::iterator it;
196 MergedStack *tag;
197 vector<char *> kmers;
198 bool exists;
199
200 //
201 // Break each stack down into k-mers and create a hash map of those k-mers
202 // recording in which sequences they occur.
203 //
204 int num_kmers = strlen(merged.begin()->second->con) - kmer_len + 1;
205
206 for (it = merged.begin(); it != merged.end(); it++) {
207 tag = it->second;
208
209 // Don't compute distances for masked tags
210 if (tag->masked) continue;
211
212 generate_kmers(tag->con, kmer_len, num_kmers, kmers);
213
214 // Hash the kmers
215 for (int j = 0; j < num_kmers; j++) {
216 exists = kmer_map.count(kmers[j]) == 0 ? false : true;
217
218 kmer_map[kmers[j]].push_back(tag->id);
219
220 if (exists)
221 delete [] kmers[j];
222 else
223 kmer_map_keys.push_back(kmers[j]);
224 }
225 kmers.clear();
226 }
227
228 //dump_kmer_map(kmer_map);
229
230 return 0;
231 }
232
233 int
populate_kmer_hash(map<int,Locus * > & catalog,CatKmerHashMap & kmer_map,vector<char * > & kmer_map_keys,int kmer_len)234 populate_kmer_hash(map<int, Locus *> &catalog, CatKmerHashMap &kmer_map, vector<char *> &kmer_map_keys, int kmer_len)
235 {
236 map<int, Locus *>::iterator it;
237 vector<pair<allele_type, string> >::iterator allele;
238 vector<char *> kmers;
239 Locus *tag;
240 char *hash_key;
241 bool exists;
242 int num_kmers;
243
244 //
245 // Break each stack down into k-mers and create a hash map of those k-mers
246 // recording in which sequences they occur.
247 //
248 for (it = catalog.begin(); it != catalog.end(); it++) {
249 tag = it->second;
250
251 num_kmers = strlen(tag->con) - kmer_len + 1;
252
253 //
254 // Iterate through the possible Catalog alleles
255 //
256 for (allele = tag->strings.begin(); allele != tag->strings.end(); allele++) {
257 //
258 // Generate and hash the kmers for this allele string
259 //
260 generate_kmers(allele->second.c_str(), kmer_len, num_kmers, kmers);
261
262 for (int j = 0; j < num_kmers; j++) {
263 hash_key = kmers[j];
264 exists = kmer_map.count(hash_key) == 0 ? false : true;
265
266 kmer_map[hash_key].push_back(make_pair(allele->first, tag->id));
267
268 if (exists)
269 delete [] kmers[j];
270 else
271 kmer_map_keys.push_back(hash_key);
272 }
273 kmers.clear();
274 }
275 }
276
277 //dump_kmer_map(kmer_map);
278
279 return 0;
280 }
281
282 int
populate_kmer_hash(map<int,Locus * > & catalog,KmerHashMap & kmer_map,vector<char * > & kmer_map_keys,map<int,pair<allele_type,int>> & allele_map,int kmer_len)283 populate_kmer_hash(map<int, Locus *> &catalog, KmerHashMap &kmer_map, vector<char *> &kmer_map_keys, map<int, pair<allele_type, int> > &allele_map, int kmer_len)
284 {
285 map<int, Locus *>::iterator it;
286 KmerHashMap::iterator map_it;
287 vector<pair<allele_type, string> >::iterator allele;
288 map<int, pair<allele_type, int> >::iterator allele_it;
289 vector<char *> kmers;
290 Locus *tag;
291 char *hash_key;
292
293 //
294 // Break each stack down into k-mers and create a hash map of those k-mers
295 // recording in which sequences they occur.
296 //
297 int num_kmers;
298 int allele_index = 0;
299
300 allele_it = allele_map.begin();
301
302 for (it = catalog.begin(); it != catalog.end(); it++) {
303 tag = it->second;
304
305 num_kmers = strlen(tag->con) - kmer_len + 1;
306
307 //
308 // Iterate through the possible Catalog alleles
309 //
310 for (allele = tag->strings.begin(); allele != tag->strings.end(); allele++) {
311 //
312 // Generate and hash the kmers for this allele string
313 //
314 generate_kmers(allele->second.c_str(), kmer_len, num_kmers, kmers);
315
316 allele_it = allele_map.insert(allele_it, make_pair(allele_index, make_pair(allele->first, tag->id)));
317
318 for (int j = 0; j < num_kmers; j++) {
319 hash_key = kmers[j];
320
321 map_it = kmer_map.find(hash_key);
322
323 if (map_it != kmer_map.end()) {
324 map_it->second.push_back(allele_index);
325 delete [] kmers[j];
326 } else {
327 kmer_map[hash_key].push_back(allele_index);
328 kmer_map_keys.push_back(hash_key);
329 }
330 }
331 kmers.clear();
332
333 allele_index++;
334 }
335 }
336
337 //dump_kmer_map(kmer_map);
338
339 return 0;
340 }
341
342 int
populate_kmer_hash(map<int,CLocus * > & catalog,KmerHashMap & kmer_map,vector<char * > & kmer_map_keys,map<int,pair<allele_type,int>> & allele_map,int kmer_len)343 populate_kmer_hash(map<int, CLocus *> &catalog, KmerHashMap &kmer_map, vector<char *> &kmer_map_keys, map<int, pair<allele_type, int> > &allele_map, int kmer_len)
344 {
345 map<int, CLocus *>::iterator it;
346 KmerHashMap::iterator map_it;
347 vector<pair<allele_type, string> >::iterator allele;
348 map<int, pair<allele_type, int> >::iterator allele_it;
349 vector<char *> kmers;
350 Locus *tag;
351 char *hash_key;
352
353 //
354 // Break each stack down into k-mers and create a hash map of those k-mers
355 // recording in which sequences they occur.
356 //
357 int num_kmers;
358 int allele_index = 0;
359
360 allele_it = allele_map.begin();
361
362 for (it = catalog.begin(); it != catalog.end(); it++) {
363 tag = it->second;
364
365 num_kmers = strlen(tag->con) - kmer_len + 1;
366
367 //
368 // Iterate through the possible Catalog alleles
369 //
370 for (allele = tag->strings.begin(); allele != tag->strings.end(); allele++) {
371 //
372 // Generate and hash the kmers for this allele string
373 //
374 generate_kmers(allele->second.c_str(), kmer_len, num_kmers, kmers);
375
376 allele_it = allele_map.insert(allele_it, make_pair(allele_index, make_pair(allele->first, tag->id)));
377
378 for (int j = 0; j < num_kmers; j++) {
379 hash_key = kmers[j];
380
381 map_it = kmer_map.find(hash_key);
382
383 if (map_it != kmer_map.end()) {
384 map_it->second.push_back(allele_index);
385 delete [] kmers[j];
386 } else {
387 kmer_map[hash_key].push_back(allele_index);
388 kmer_map_keys.push_back(hash_key);
389 }
390 }
391 kmers.clear();
392
393 allele_index++;
394 }
395 }
396
397 //dump_kmer_map(kmer_map);
398
399 return 0;
400 }
401
402 int
free_kmer_hash(CatKmerHashMap & kmer_map,vector<char * > & kmer_map_keys)403 free_kmer_hash(CatKmerHashMap &kmer_map, vector<char *> &kmer_map_keys)
404 {
405 for (uint i = 0; i < kmer_map_keys.size(); i++) {
406 kmer_map[kmer_map_keys[i]].clear();
407 }
408 kmer_map.clear();
409
410 for (uint i = 0; i < kmer_map_keys.size(); i++) {
411 delete [] kmer_map_keys[i];
412 }
413 kmer_map_keys.clear();
414
415 return 0;
416 }
417
418 int
free_kmer_hash(KmerHashMap & kmer_map,vector<char * > & kmer_map_keys)419 free_kmer_hash(KmerHashMap &kmer_map, vector<char *> &kmer_map_keys)
420 {
421 for (uint i = 0; i < kmer_map_keys.size(); i++) {
422 kmer_map[kmer_map_keys[i]].clear();
423 }
424 kmer_map.clear();
425
426 for (uint i = 0; i < kmer_map_keys.size(); i++) {
427 delete [] kmer_map_keys[i];
428 }
429 kmer_map_keys.clear();
430
431 return 0;
432 }
433
dist(const char * tag_1,Locus * tag_2,allele_type allele)434 int dist(const char *tag_1, Locus *tag_2, allele_type allele) {
435 int dist = 0;
436 const char *p = tag_1;
437 const char *p_end = p + strlen(p);
438 const char *q = NULL;
439 //
440 // Identify which matching string has the proper allele
441 //
442 vector<pair<allele_type, string> >::iterator it;
443
444 for (it = tag_2->strings.begin(); it != tag_2->strings.end(); it++)
445 if (it->first == allele)
446 q = it->second.c_str();
447 if (q == NULL) return -1;
448
449 const char *q_end = q + strlen(q);
450
451 // Count the number of characters that are different
452 // between the two sequences.
453 while (p < p_end && q < q_end) {
454 dist += (*p == *q) ? 0 : 1;
455 p++;
456 q++;
457 }
458
459 return dist;
460 }
461
462 int
dist(const char * tag_1,const char * tag_2,vector<pair<char,uint>> & cigar)463 dist(const char *tag_1, const char *tag_2, vector<pair<char, uint> > &cigar)
464 {
465 uint size = cigar.size();
466 char op;
467 uint dist, len_1, len_2, pos_1, pos_2, stop;
468 int mismatches = 0;
469
470 len_1 = strlen(tag_1);
471 len_2 = strlen(tag_2);
472 pos_1 = 0;
473 pos_2 = 0;
474
475 for (uint i = 0; i < size; i++) {
476 op = cigar[i].first;
477 dist = cigar[i].second;
478
479 switch(op) {
480 case 'D':
481 //
482 // A deletion has occured in tag_1 relative to tag_2.
483 //
484 pos_2 += dist;
485 break;
486 case 'I':
487 //
488 // An insertion has occured in tag_1 relative to tag_2.
489 //
490 pos_1 += dist;
491 break;
492 case 'M':
493 stop = pos_1 + dist;
494 while (pos_1 < stop && pos_1 < len_1 && pos_2 < len_2) {
495 if (tag_1[pos_1] != 'N' && tag_2[pos_2] != 'N' && tag_1[pos_1] != tag_2[pos_2])
496 mismatches++;
497 pos_1++;
498 pos_2++;
499 }
500 break;
501 default:
502 break;
503 }
504 }
505
506 return mismatches;
507 }
508
509 int
dist(Locus * tag_1,Locus * tag_2)510 dist(Locus *tag_1, Locus *tag_2)
511 {
512 int dist = 0;
513 char *p = tag_1->con;
514 char *q = tag_2->con;
515 char *p_end = p + tag_1->len;
516 char *q_end = q + tag_2->len;
517
518 if (tag_1->len != tag_2->len) {
519 if (tag_1->len < tag_2->len)
520 dist += tag_2->len - tag_1->len;
521 else if (tag_1->len > tag_2->len)
522 dist += tag_1->len - tag_2->len;
523 }
524
525 //
526 // Count the number of characters that are different
527 // between the two sequences.
528 //
529 while (p < p_end && q < q_end) {
530 dist += (*p == *q) ? 0 : 1;
531 p++;
532 q++;
533 }
534
535 return dist;
536 }
537
538 int
dist(MergedStack * tag_1,MergedStack * tag_2)539 dist(MergedStack *tag_1, MergedStack *tag_2)
540 {
541 int dist = 0;
542 char *p = tag_1->con;
543 char *q = tag_2->con;
544 char *p_end = p + tag_1->len;
545 char *q_end = q + tag_2->len;
546
547 //
548 // If the sequences are of different lengths, count the missing
549 // nucleotides as mismatches.
550 //
551 if (tag_1->len != tag_2->len) {
552 if (tag_1->len < tag_2->len)
553 dist += tag_2->len - tag_1->len;
554 else if (tag_1->len > tag_2->len)
555 dist += tag_1->len - tag_2->len;
556 }
557
558 //
559 // Count the number of characters that are different
560 // between the two sequences.
561 //
562 while (p < p_end && q < q_end) {
563 dist += (*p == *q) ? 0 : 1;
564 p++;
565 q++;
566 }
567
568 return dist;
569 }
570
571 int
dist(MergedStack * tag_1,char * seq)572 dist(MergedStack *tag_1, char *seq)
573 {
574 int dist = 0;
575 char *p = tag_1->con;
576 char *q = seq;
577 uint q_len = strlen(q);
578 char *p_end = p + tag_1->len;
579 char *q_end = q + q_len;
580
581 //
582 // If the sequences are of different lengths, count the missing
583 // nucleotides as mismatches.
584 //
585 if (tag_1->len != q_len) {
586 if (tag_1->len < q_len)
587 dist += q_len - tag_1->len;
588 else if (tag_1->len > q_len)
589 dist += tag_1->len - q_len;
590 }
591
592 //
593 // Count the number of characters that are different
594 // between the two sequences.
595 //
596 while (p < p_end && q < q_end) {
597 dist += (*p == *q) ? 0 : 1;
598 p++;
599 q++;
600 }
601
602 return dist;
603 }
604
compare_dist(pair<int,int> a,pair<int,int> b)605 bool compare_dist(pair<int, int> a, pair<int, int> b) {
606 return (a.second < b.second);
607 }
608
609 int
check_frameshift(MergedStack * tag_1,MergedStack * tag_2,size_t mismatches)610 check_frameshift(MergedStack *tag_1, MergedStack *tag_2, size_t mismatches)
611 {
612 size_t cnt = 0;
613 size_t diff = 0;
614 char const* p = tag_1->con;
615 char const* q = tag_2->con;
616 char const* p_end = p + tag_1->len - 1;
617 char const* q_end = q + tag_2->len - 1;
618
619 //
620 // Set pointers to the common end of the sequences.
621 //
622 if (tag_1->len != tag_2->len) {
623 if (tag_1->len < tag_2->len) {
624 diff = tag_2->len - tag_1->len;
625 p_end -= diff;
626 } else if (tag_1->len > tag_2->len) {
627 diff = tag_1->len - tag_2->len;
628 q_end -= diff;
629 }
630 }
631
632 //
633 // Count the number of characters that are different
634 // at the 3' end of the sequence to test for possible frameshifts.
635 //
636 size_t i = diff;
637 while (p_end >= p && q_end >= q && i < mismatches) {
638 cnt += (*p_end != *q_end) ? 1 : 0;
639 p_end--;
640 q_end--;
641 i++;
642 }
643
644 return cnt;
645 }
646
647 int
check_frameshift(const char * tag_1,Locus * tag_2,allele_type allele,size_t mismatches)648 check_frameshift(const char *tag_1, Locus *tag_2, allele_type allele, size_t mismatches)
649 {
650 size_t cnt = 0;
651 const char *p = tag_1;
652 const char *q = NULL;
653 //
654 // Identify which matching string has the proper allele
655 //
656 vector<pair<allele_type, string> >::iterator it;
657
658 for (it = tag_2->strings.begin(); it != tag_2->strings.end(); it++)
659 if (it->first == allele)
660 q = it->second.c_str();
661 if (q == NULL) return -1;
662
663 const char *p_end = p + strlen(p) - 1;
664 const char *q_end = q + strlen(q) - 1;
665
666 //
667 // Count the number of characters that are different
668 // at the 3' end of the sequence to test for possible frameshifts.
669 //
670 size_t i = 0;
671 while (p_end >= p && q_end >= q && i < mismatches) {
672 cnt += (*p_end != *q_end) ? 1 : 0;
673 p_end--;
674 q_end--;
675 i++;
676 }
677
678 return cnt;
679 }
680
681 int
check_frameshift(MergedStack * tag_1,const char * seq,size_t mismatches)682 check_frameshift(MergedStack *tag_1, const char *seq, size_t mismatches)
683 {
684 int cnt = 0;
685 const char *p = tag_1->con;
686 const char *q = seq;
687 uint q_len = strlen(q);
688 const char *p_end = p + tag_1->len - 1;
689 const char *q_end = q + q_len - 1;
690
691 //
692 // If the sequences are of different lengths, count the missing
693 // nucleotides as mismatches.
694 //
695 if (tag_1->len != q_len) {
696 if (tag_1->len < q_len)
697 q_end -= q_len - tag_1->len;
698 else if (tag_1->len > q_len)
699 p_end -= tag_1->len - q_len;
700 }
701
702 //
703 // Count the number of characters that are different
704 // at the 3' end of the sequence to test for possible frameshifts.
705 //
706 size_t i = 0;
707 while (p_end >= p && q_end >= q && i < mismatches) {
708 cnt += (*p_end != *q_end) ? 1 : 0;
709 p_end--;
710 q_end--;
711 i++;
712 }
713
714 return cnt;
715 }
716
dump_kmer_map(KmerHashMap & kmer_map)717 int dump_kmer_map(KmerHashMap &kmer_map) {
718 KmerHashMap::iterator kit;
719 vector<int>::iterator vit;
720
721 cerr << kmer_map.size() << " keys in the map.\n";
722
723 int i = 1;
724 for (kit = kmer_map.begin(); kit != kmer_map.end(); kit++) {
725 cerr << "Key #" << i << " " << kit->first << ": ";
726 for (vit = (kit->second).begin(); vit != (kit->second).end(); vit++)
727 cerr << " " << *vit;
728 cerr << "\n";
729 i++;
730
731 if (i > 1000) break;
732 }
733
734 return 0;
735 }
736