1 /*
2
3 VSEARCH: a versatile open source tool for metagenomics
4
5 Copyright (C) 2014-2021, Torbjorn Rognes, Frederic Mahe and Tomas Flouri
6 All rights reserved.
7
8 Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
9 Department of Informatics, University of Oslo,
10 PO Box 1080 Blindern, NO-0316 Oslo, Norway
11
12 This software is dual-licensed and available under a choice
13 of one of two licenses, either under the terms of the GNU
14 General Public License version 3 or the BSD 2-Clause License.
15
16
17 GNU General Public License version 3
18
19 This program is free software: you can redistribute it and/or modify
20 it under the terms of the GNU General Public License as published by
21 the Free Software Foundation, either version 3 of the License, or
22 (at your option) any later version.
23
24 This program is distributed in the hope that it will be useful,
25 but WITHOUT ANY WARRANTY; without even the implied warranty of
26 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27 GNU General Public License for more details.
28
29 You should have received a copy of the GNU General Public License
30 along with this program. If not, see <http://www.gnu.org/licenses/>.
31
32
33 The BSD 2-Clause License
34
35 Redistribution and use in source and binary forms, with or without
36 modification, are permitted provided that the following conditions
37 are met:
38
39 1. Redistributions of source code must retain the above copyright
40 notice, this list of conditions and the following disclaimer.
41
42 2. Redistributions in binary form must reproduce the above copyright
43 notice, this list of conditions and the following disclaimer in the
44 documentation and/or other materials provided with the distribution.
45
46 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
47 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
48 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
49 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
50 COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
51 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
52 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
53 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
54 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
56 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
57 POSSIBILITY OF SUCH DAMAGE.
58
59 */
60
61 #include "vsearch.h"
62
63 static bitmap_t * dbhash_bitmap;
64 static uint64_t dbhash_size;
65 static unsigned int dbhash_shift;
66 static uint64_t dbhash_mask;
67 static struct dbhash_bucket_s * dbhash_table;
68
dbhash_seqcmp(char * a,char * b,uint64_t n)69 int dbhash_seqcmp(char * a, char * b, uint64_t n)
70 {
71 char * p = a;
72 char * q = b;
73
74 if (n <= 0) {
75 return 0;
76
77 }
78
79 while ((n-- > 0) && (chrmap_4bit[(int)(*p)] == chrmap_4bit[(int)(*q)]))
80 {
81 if ((n == 0) || (*p == 0) || (*q == 0)) {
82 break;
83
84 }
85 p++;
86 q++;
87 }
88
89 return chrmap_4bit[(int)(*p)] - chrmap_4bit[(int)(*q)];
90 }
91
dbhash_open(uint64_t maxelements)92 void dbhash_open(uint64_t maxelements)
93 {
94 /* adjust size of hash table for 2/3 fill rate */
95 /* and use a multiple of 2 */
96
97 dbhash_size = 1;
98 dbhash_shift = 0;
99 while (3 * maxelements > 2 * dbhash_size)
100 {
101 dbhash_size <<= 1;
102 dbhash_shift++;
103 }
104 dbhash_mask = dbhash_size - 1;
105
106 dbhash_table = (struct dbhash_bucket_s *)
107 xmalloc(sizeof(dbhash_bucket_s) * dbhash_size);
108 memset(dbhash_table, 0, sizeof(dbhash_bucket_s) * dbhash_size);
109
110 dbhash_bitmap = bitmap_init(dbhash_size);
111 bitmap_reset_all(dbhash_bitmap);
112 }
113
dbhash_close()114 void dbhash_close()
115 {
116 bitmap_free(dbhash_bitmap);
117 dbhash_bitmap = nullptr;
118 xfree(dbhash_table);
119 dbhash_table = nullptr;
120 }
121
dbhash_search_first(char * seq,uint64_t seqlen,struct dbhash_search_info_s * info)122 int64_t dbhash_search_first(char * seq,
123 uint64_t seqlen,
124 struct dbhash_search_info_s * info)
125 {
126
127 uint64_t hash = hash_cityhash64(seq, seqlen);
128 info->hash = hash;
129 info->seq = seq;
130 info->seqlen = seqlen;
131 uint64_t index = hash & dbhash_mask;
132 struct dbhash_bucket_s * bp = dbhash_table + index;
133
134 while (bitmap_get(dbhash_bitmap, index)
135 &&
136 ((bp->hash != hash) ||
137 (seqlen != db_getsequencelen(bp->seqno)) ||
138 (dbhash_seqcmp(seq, db_getsequence(bp->seqno), seqlen))))
139 {
140 index = (index + 1) & dbhash_mask;
141 bp = dbhash_table + index;
142 }
143
144 info->index = index;
145
146 if (bitmap_get(dbhash_bitmap, index)) {
147 return bp->seqno;
148 } else {
149 return -1;
150
151 }
152 }
153
dbhash_search_next(struct dbhash_search_info_s * info)154 int64_t dbhash_search_next(struct dbhash_search_info_s * info)
155 {
156 uint64_t hash = info->hash;
157 char * seq = info->seq;
158 uint64_t seqlen = info->seqlen;
159 uint64_t index = (info->index + 1) & dbhash_mask;
160 struct dbhash_bucket_s * bp = dbhash_table + index;
161
162 while (bitmap_get(dbhash_bitmap, index)
163 &&
164 ((bp->hash != hash) ||
165 (seqlen != db_getsequencelen(bp->seqno)) ||
166 (dbhash_seqcmp(seq, db_getsequence(bp->seqno), seqlen))))
167 {
168 index = (index + 1) & dbhash_mask;
169 bp = dbhash_table + index;
170 }
171
172 info->index = index;
173
174 if (bitmap_get(dbhash_bitmap, index)) {
175 return bp->seqno;
176 } else {
177 return -1;
178
179 }
180 }
181
dbhash_add(char * seq,uint64_t seqlen,uint64_t seqno)182 void dbhash_add(char * seq, uint64_t seqlen, uint64_t seqno)
183 {
184 struct dbhash_search_info_s info;
185
186 int64_t ret = dbhash_search_first(seq, seqlen, & info);
187 while (ret >= 0) {
188 ret = dbhash_search_next(&info);
189
190 }
191
192 bitmap_set(dbhash_bitmap, info.index);
193 struct dbhash_bucket_s * bp = dbhash_table + info.index;
194 bp->hash = info.hash;
195 bp->seqno = seqno;
196 }
197
dbhash_add_one(uint64_t seqno)198 void dbhash_add_one(uint64_t seqno)
199 {
200 char * seq = db_getsequence(seqno);
201 uint64_t seqlen = db_getsequencelen(seqno);
202 char * normalized = (char*) xmalloc(seqlen+1);
203 string_normalize(normalized, seq, seqlen);
204 dbhash_add(normalized, seqlen, seqno);
205 }
206
dbhash_add_all()207 void dbhash_add_all()
208 {
209 progress_init("Hashing database sequences", db_getsequencecount());
210 char * normalized = (char*) xmalloc(db_getlongestsequence()+1);
211 for(uint64_t seqno=0; seqno < db_getsequencecount(); seqno++)
212 {
213 char * seq = db_getsequence(seqno);
214 uint64_t seqlen = db_getsequencelen(seqno);
215 string_normalize(normalized, seq, seqlen);
216 dbhash_add(normalized, seqlen, seqno);
217 progress_update(seqno+1);
218 }
219 xfree(normalized);
220 progress_done();
221 }
222