1 /*
2
3 VSEARCH: a versatile open source tool for metagenomics
4
5 Copyright (C) 2014-2021, Torbjorn Rognes, Frederic Mahe and Tomas Flouri
6 All rights reserved.
7
8 Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
9 Department of Informatics, University of Oslo,
10 PO Box 1080 Blindern, NO-0316 Oslo, Norway
11
12 This software is dual-licensed and available under a choice
13 of one of two licenses, either under the terms of the GNU
14 General Public License version 3 or the BSD 2-Clause License.
15
16
17 GNU General Public License version 3
18
19 This program is free software: you can redistribute it and/or modify
20 it under the terms of the GNU General Public License as published by
21 the Free Software Foundation, either version 3 of the License, or
22 (at your option) any later version.
23
24 This program is distributed in the hope that it will be useful,
25 but WITHOUT ANY WARRANTY; without even the implied warranty of
26 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27 GNU General Public License for more details.
28
29 You should have received a copy of the GNU General Public License
30 along with this program. If not, see <http://www.gnu.org/licenses/>.
31
32
33 The BSD 2-Clause License
34
35 Redistribution and use in source and binary forms, with or without
36 modification, are permitted provided that the following conditions
37 are met:
38
39 1. Redistributions of source code must retain the above copyright
40 notice, this list of conditions and the following disclaimer.
41
42 2. Redistributions in binary form must reproduce the above copyright
43 notice, this list of conditions and the following disclaimer in the
44 documentation and/or other materials provided with the distribution.
45
46 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
47 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
48 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
49 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
50 COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
51 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
52 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
53 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
54 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
56 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
57 POSSIBILITY OF SUCH DAMAGE.
58
59 */
60
61 #include "vsearch.h"
62
63 #define HASH CityHash64
64
65 struct kh_bucket_s
66 {
67 unsigned int kmer;
68 unsigned int pos; /* 1-based position, 0 = empty */
69 };
70
71 struct kh_handle_s
72 {
73 struct kh_bucket_s * hash;
74 unsigned int hash_mask;
75 int size;
76 int alloc;
77 int maxpos;
78 };
79
kh_init()80 struct kh_handle_s * kh_init()
81 {
82 auto * kh =
83 (struct kh_handle_s *) xmalloc(sizeof(struct kh_handle_s));
84
85 kh->maxpos = 0;
86 kh->alloc = 256;
87 kh->size = 0;
88 kh->hash_mask = kh->alloc - 1;
89 kh->hash =
90 (struct kh_bucket_s *) xmalloc(kh->alloc * sizeof(struct kh_bucket_s));
91
92 return kh;
93 }
94
kh_exit(struct kh_handle_s * kh)95 void kh_exit(struct kh_handle_s * kh)
96 {
97 if (kh->hash) {
98 xfree(kh->hash);
99
100 }
101 xfree(kh);
102 }
103
kh_insert_kmer(struct kh_handle_s * kh,int k,unsigned int kmer,unsigned int pos)104 inline void kh_insert_kmer(struct kh_handle_s * kh,
105 int k,
106 unsigned int kmer,
107 unsigned int pos)
108 {
109 /* find free bucket in hash */
110 unsigned int j = HASH((char*)&kmer, (k+3)/4) & kh->hash_mask;
111 while(kh->hash[j].pos) {
112 j = (j + 1) & kh->hash_mask;
113
114 }
115
116 kh->hash[j].kmer = kmer;
117 kh->hash[j].pos = pos;
118 }
119
kh_insert_kmers(struct kh_handle_s * kh,int k,char * seq,int len)120 void kh_insert_kmers(struct kh_handle_s * kh, int k, char * seq, int len)
121 {
122 int kmers = 1 << (2 * k);
123 unsigned int kmer_mask = kmers - 1;
124
125 /* reallocate hash table if necessary */
126
127 if (kh->alloc < 2 * len)
128 {
129 while (kh->alloc < 2 * len) {
130 kh->alloc *= 2;
131
132 }
133 kh->hash = (struct kh_bucket_s *)
134 xrealloc(kh->hash, kh->alloc * sizeof(struct kh_bucket_s));
135 }
136
137 kh->size = 1;
138 while(kh->size < 2 * len) {
139 kh->size *= 2;
140
141 }
142 kh->hash_mask = kh->size - 1;
143
144 kh->maxpos = len;
145
146 memset(kh->hash, 0, kh->size * sizeof(struct kh_bucket_s));
147
148 unsigned int bad = kmer_mask;
149 unsigned int kmer = 0;
150 char * s = seq;
151
152 unsigned int * maskmap = chrmap_mask_ambig;
153
154 for (int pos = 0; pos < len; pos++)
155 {
156 int c = *s++;
157
158 bad <<= 2ULL;
159 bad |= maskmap[c];
160 bad &= kmer_mask;
161
162 kmer <<= 2ULL;
163 kmer |= chrmap_2bit[c];
164 kmer &= kmer_mask;
165
166 if (!bad)
167 {
168 /* 1-based pos of start of kmer */
169 kh_insert_kmer(kh, k, kmer, pos - k + 1 + 1);
170 }
171 }
172 }
173
kh_find_best_diagonal(struct kh_handle_s * kh,int k,char * seq,int len)174 int kh_find_best_diagonal(struct kh_handle_s * kh, int k, char * seq, int len)
175 {
176 int diag_counts[kh->maxpos];
177
178 memset(diag_counts, 0, kh->maxpos * sizeof(int));
179
180 int kmers = 1 << (2 * k);
181 unsigned int kmer_mask = kmers - 1;
182
183 unsigned int bad = kmer_mask;
184 unsigned int kmer = 0;
185 char * s = seq + len - 1;
186
187 unsigned int * maskmap = chrmap_mask_ambig;
188
189 for (int pos = 0; pos < len; pos++)
190 {
191 int c = *s--;
192
193 bad <<= 2ULL;
194 bad |= maskmap[c];
195 bad &= kmer_mask;
196
197 kmer <<= 2ULL;
198 kmer |= chrmap_2bit[chrmap_complement[c]];
199 kmer &= kmer_mask;
200
201 if (!bad)
202 {
203 /* find matching buckets in hash */
204 unsigned int j = HASH((char*)&kmer, (k+3)/4) & kh->hash_mask;
205 while(kh->hash[j].pos)
206 {
207 if (kh->hash[j].kmer == kmer)
208 {
209 int fpos = kh->hash[j].pos - 1;
210 int diag = fpos - (pos - k + 1);
211 if (diag >= 0) {
212 diag_counts[diag]++;
213
214 }
215 }
216 j = (j + 1) & kh->hash_mask;
217 }
218 }
219 }
220
221 int best_diag_count = -1;
222 int best_diag = -1;
223 int good_diags = 0;
224
225 for(int d = 0; d < kh->maxpos - k + 1; d++)
226 {
227 int diag_len = kh->maxpos - d;
228 int minmatch = MAX(1, diag_len - k + 1 - k * MAX(diag_len / 20, 0));
229 int c = diag_counts[d];
230
231 if (c >= minmatch) {
232 good_diags++;
233
234 }
235
236 if (c > best_diag_count)
237 {
238 best_diag_count = c;
239 best_diag = d;
240 }
241 }
242
243 if (good_diags == 1) {
244 return best_diag;
245 } else {
246 return -1;
247
248 }
249 }
250
kh_find_diagonals(struct kh_handle_s * kh,int k,char * seq,int len,int * diags)251 void kh_find_diagonals(struct kh_handle_s * kh,
252 int k,
253 char * seq,
254 int len,
255 int * diags)
256 {
257 memset(diags, 0, (kh->maxpos+len) * sizeof(int));
258
259 int kmers = 1 << (2 * k);
260 unsigned int kmer_mask = kmers - 1;
261
262 unsigned int bad = kmer_mask;
263 unsigned int kmer = 0;
264 char * s = seq + len - 1;
265
266 for (int pos = 0; pos < len; pos++)
267 {
268 int c = *s--;
269
270 bad <<= 2ULL;
271 bad |= chrmap_mask_ambig[c];
272 bad &= kmer_mask;
273
274 kmer <<= 2ULL;
275 kmer |= chrmap_2bit[chrmap_complement[c]];
276 kmer &= kmer_mask;
277
278 if (!bad)
279 {
280 /* find matching buckets in hash */
281 unsigned int j = HASH((char*)&kmer, (k+3)/4) & kh->hash_mask;
282 while(kh->hash[j].pos)
283 {
284 if (kh->hash[j].kmer == kmer)
285 {
286 int fpos = kh->hash[j].pos - 1;
287 int diag = len + fpos - (pos - k + 1);
288 if (diag >= 0) {
289 diags[diag]++;
290
291 }
292 }
293 j = (j + 1) & kh->hash_mask;
294 }
295 }
296 }
297 }
298