1 /*
2 
3   VSEARCH: a versatile open source tool for metagenomics
4 
5   Copyright (C) 2014-2021, Torbjorn Rognes, Frederic Mahe and Tomas Flouri
6   All rights reserved.
7 
8   Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
9   Department of Informatics, University of Oslo,
10   PO Box 1080 Blindern, NO-0316 Oslo, Norway
11 
12   This software is dual-licensed and available under a choice
13   of one of two licenses, either under the terms of the GNU
14   General Public License version 3 or the BSD 2-Clause License.
15 
16 
17   GNU General Public License version 3
18 
19   This program is free software: you can redistribute it and/or modify
20   it under the terms of the GNU General Public License as published by
21   the Free Software Foundation, either version 3 of the License, or
22   (at your option) any later version.
23 
24   This program is distributed in the hope that it will be useful,
25   but WITHOUT ANY WARRANTY; without even the implied warranty of
26   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
27   GNU General Public License for more details.
28 
29   You should have received a copy of the GNU General Public License
30   along with this program.  If not, see <http://www.gnu.org/licenses/>.
31 
32 
33   The BSD 2-Clause License
34 
35   Redistribution and use in source and binary forms, with or without
36   modification, are permitted provided that the following conditions
37   are met:
38 
39   1. Redistributions of source code must retain the above copyright
40   notice, this list of conditions and the following disclaimer.
41 
42   2. Redistributions in binary form must reproduce the above copyright
43   notice, this list of conditions and the following disclaimer in the
44   documentation and/or other materials provided with the distribution.
45 
46   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
47   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
48   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
49   FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
50   COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
51   INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
52   BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
53   LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
54   CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55   LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
56   ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
57   POSSIBILITY OF SUCH DAMAGE.
58 
59 */
60 
61 #include "vsearch.h"
62 
63 #define HASH CityHash64
64 
65 struct kh_bucket_s
66 {
67   unsigned int kmer;
68   unsigned int pos; /* 1-based position, 0 = empty */
69 };
70 
71 struct kh_handle_s
72 {
73   struct kh_bucket_s * hash;
74   unsigned int hash_mask;
75   int size;
76   int alloc;
77   int maxpos;
78 };
79 
kh_init()80 struct kh_handle_s * kh_init()
81 {
82   auto * kh =
83     (struct kh_handle_s *) xmalloc(sizeof(struct kh_handle_s));
84 
85   kh->maxpos = 0;
86   kh->alloc = 256;
87   kh->size = 0;
88   kh->hash_mask = kh->alloc - 1;
89   kh->hash =
90     (struct kh_bucket_s *) xmalloc(kh->alloc * sizeof(struct kh_bucket_s));
91 
92   return kh;
93 }
94 
kh_exit(struct kh_handle_s * kh)95 void kh_exit(struct kh_handle_s * kh)
96 {
97   if (kh->hash) {
98     xfree(kh->hash);
99 
100         }
101   xfree(kh);
102 }
103 
kh_insert_kmer(struct kh_handle_s * kh,int k,unsigned int kmer,unsigned int pos)104 inline void kh_insert_kmer(struct kh_handle_s * kh,
105                            int k,
106                            unsigned int kmer,
107                            unsigned int pos)
108 {
109   /* find free bucket in hash */
110   unsigned int j = HASH((char*)&kmer, (k+3)/4) & kh->hash_mask;
111   while(kh->hash[j].pos) {
112     j = (j + 1) & kh->hash_mask;
113 
114         }
115 
116   kh->hash[j].kmer = kmer;
117   kh->hash[j].pos = pos;
118 }
119 
kh_insert_kmers(struct kh_handle_s * kh,int k,char * seq,int len)120 void kh_insert_kmers(struct kh_handle_s * kh, int k, char * seq, int len)
121 {
122   int kmers = 1 << (2 * k);
123   unsigned int kmer_mask = kmers - 1;
124 
125   /* reallocate hash table if necessary */
126 
127   if (kh->alloc < 2 * len)
128     {
129       while (kh->alloc < 2 * len) {
130         kh->alloc *= 2;
131 
132         }
133       kh->hash = (struct kh_bucket_s *)
134         xrealloc(kh->hash, kh->alloc * sizeof(struct kh_bucket_s));
135     }
136 
137   kh->size = 1;
138   while(kh->size < 2 * len) {
139     kh->size *= 2;
140 
141         }
142   kh->hash_mask = kh->size - 1;
143 
144   kh->maxpos = len;
145 
146   memset(kh->hash, 0, kh->size * sizeof(struct kh_bucket_s));
147 
148   unsigned int bad = kmer_mask;
149   unsigned int kmer = 0;
150   char * s = seq;
151 
152   unsigned int * maskmap = chrmap_mask_ambig;
153 
154   for (int pos = 0; pos < len; pos++)
155     {
156       int c = *s++;
157 
158       bad <<= 2ULL;
159       bad |= maskmap[c];
160       bad &= kmer_mask;
161 
162       kmer <<= 2ULL;
163       kmer |= chrmap_2bit[c];
164       kmer &= kmer_mask;
165 
166       if (!bad)
167         {
168           /* 1-based pos of start of kmer */
169           kh_insert_kmer(kh, k, kmer, pos - k + 1 + 1);
170         }
171     }
172 }
173 
kh_find_best_diagonal(struct kh_handle_s * kh,int k,char * seq,int len)174 int kh_find_best_diagonal(struct kh_handle_s * kh, int k, char * seq, int len)
175 {
176   int diag_counts[kh->maxpos];
177 
178   memset(diag_counts, 0, kh->maxpos * sizeof(int));
179 
180   int kmers = 1 << (2 * k);
181   unsigned int kmer_mask = kmers - 1;
182 
183   unsigned int bad = kmer_mask;
184   unsigned int kmer = 0;
185   char * s = seq + len - 1;
186 
187   unsigned int * maskmap = chrmap_mask_ambig;
188 
189   for (int pos = 0; pos < len; pos++)
190     {
191       int c = *s--;
192 
193       bad <<= 2ULL;
194       bad |= maskmap[c];
195       bad &= kmer_mask;
196 
197       kmer <<= 2ULL;
198       kmer |= chrmap_2bit[chrmap_complement[c]];
199       kmer &= kmer_mask;
200 
201       if (!bad)
202         {
203           /* find matching buckets in hash */
204           unsigned int j = HASH((char*)&kmer, (k+3)/4) & kh->hash_mask;
205           while(kh->hash[j].pos)
206             {
207               if (kh->hash[j].kmer == kmer)
208                 {
209                   int fpos = kh->hash[j].pos - 1;
210                   int diag = fpos - (pos - k + 1);
211                   if (diag >= 0) {
212                     diag_counts[diag]++;
213 
214         }
215                 }
216               j = (j + 1) & kh->hash_mask;
217             }
218         }
219     }
220 
221   int best_diag_count = -1;
222   int best_diag = -1;
223   int good_diags = 0;
224 
225   for(int d = 0; d < kh->maxpos - k + 1; d++)
226     {
227       int diag_len = kh->maxpos - d;
228       int minmatch = MAX(1, diag_len - k + 1 - k * MAX(diag_len / 20, 0));
229       int c = diag_counts[d];
230 
231       if (c >= minmatch) {
232         good_diags++;
233 
234         }
235 
236       if (c > best_diag_count)
237         {
238           best_diag_count = c;
239           best_diag = d;
240         }
241     }
242 
243   if (good_diags == 1) {
244     return best_diag;
245   } else {
246     return -1;
247 
248         }
249 }
250 
kh_find_diagonals(struct kh_handle_s * kh,int k,char * seq,int len,int * diags)251 void kh_find_diagonals(struct kh_handle_s * kh,
252                        int k,
253                        char * seq,
254                        int len,
255                        int * diags)
256 {
257   memset(diags, 0, (kh->maxpos+len) * sizeof(int));
258 
259   int kmers = 1 << (2 * k);
260   unsigned int kmer_mask = kmers - 1;
261 
262   unsigned int bad = kmer_mask;
263   unsigned int kmer = 0;
264   char * s = seq + len - 1;
265 
266   for (int pos = 0; pos < len; pos++)
267     {
268       int c = *s--;
269 
270       bad <<= 2ULL;
271       bad |= chrmap_mask_ambig[c];
272       bad &= kmer_mask;
273 
274       kmer <<= 2ULL;
275       kmer |= chrmap_2bit[chrmap_complement[c]];
276       kmer &= kmer_mask;
277 
278       if (!bad)
279         {
280           /* find matching buckets in hash */
281           unsigned int j = HASH((char*)&kmer, (k+3)/4) & kh->hash_mask;
282           while(kh->hash[j].pos)
283             {
284               if (kh->hash[j].kmer == kmer)
285                 {
286                   int fpos = kh->hash[j].pos - 1;
287                   int diag = len + fpos - (pos - k + 1);
288                   if (diag >= 0) {
289                     diags[diag]++;
290 
291         }
292                 }
293               j = (j + 1) & kh->hash_mask;
294             }
295         }
296     }
297 }
298