1 /*
2 
3   VSEARCH: a versatile open source tool for metagenomics
4 
5   Copyright (C) 2014-2021, Torbjorn Rognes, Frederic Mahe and Tomas Flouri
6   All rights reserved.
7 
8   Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
9   Department of Informatics, University of Oslo,
10   PO Box 1080 Blindern, NO-0316 Oslo, Norway
11 
12   This software is dual-licensed and available under a choice
13   of one of two licenses, either under the terms of the GNU
14   General Public License version 3 or the BSD 2-Clause License.
15 
16 
17   GNU General Public License version 3
18 
19   This program is free software: you can redistribute it and/or modify
20   it under the terms of the GNU General Public License as published by
21   the Free Software Foundation, either version 3 of the License, or
22   (at your option) any later version.
23 
24   This program is distributed in the hope that it will be useful,
25   but WITHOUT ANY WARRANTY; without even the implied warranty of
26   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
27   GNU General Public License for more details.
28 
29   You should have received a copy of the GNU General Public License
30   along with this program.  If not, see <http://www.gnu.org/licenses/>.
31 
32 
33   The BSD 2-Clause License
34 
35   Redistribution and use in source and binary forms, with or without
36   modification, are permitted provided that the following conditions
37   are met:
38 
39   1. Redistributions of source code must retain the above copyright
40   notice, this list of conditions and the following disclaimer.
41 
42   2. Redistributions in binary form must reproduce the above copyright
43   notice, this list of conditions and the following disclaimer in the
44   documentation and/or other materials provided with the distribution.
45 
46   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
47   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
48   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
49   FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
50   COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
51   INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
52   BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
53   LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
54   CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55   LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
56   ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
57   POSSIBILITY OF SUCH DAMAGE.
58 
59 */
60 
61 #include "vsearch.h"
62 
63 static bitmap_t * dbhash_bitmap;
64 static uint64_t dbhash_size;
65 static unsigned int dbhash_shift;
66 static uint64_t dbhash_mask;
67 static struct dbhash_bucket_s * dbhash_table;
68 
dbhash_seqcmp(char * a,char * b,uint64_t n)69 int dbhash_seqcmp(char * a, char * b, uint64_t n)
70 {
71   char * p = a;
72   char * q = b;
73 
74   if (n <= 0) {
75     return 0;
76 
77         }
78 
79   while ((n-- > 0) && (chrmap_4bit[(int)(*p)] == chrmap_4bit[(int)(*q)]))
80     {
81       if ((n == 0) || (*p == 0) || (*q == 0)) {
82         break;
83 
84         }
85       p++;
86       q++;
87     }
88 
89   return chrmap_4bit[(int)(*p)] - chrmap_4bit[(int)(*q)];
90 }
91 
dbhash_open(uint64_t maxelements)92 void dbhash_open(uint64_t maxelements)
93 {
94   /* adjust size of hash table for 2/3 fill rate */
95   /* and use a multiple of 2 */
96 
97   dbhash_size = 1;
98   dbhash_shift = 0;
99   while (3 * maxelements > 2 * dbhash_size)
100     {
101       dbhash_size <<= 1;
102       dbhash_shift++;
103     }
104   dbhash_mask = dbhash_size - 1;
105 
106   dbhash_table = (struct dbhash_bucket_s *)
107     xmalloc(sizeof(dbhash_bucket_s) * dbhash_size);
108   memset(dbhash_table, 0, sizeof(dbhash_bucket_s) * dbhash_size);
109 
110   dbhash_bitmap = bitmap_init(dbhash_size);
111   bitmap_reset_all(dbhash_bitmap);
112 }
113 
dbhash_close()114 void dbhash_close()
115 {
116   bitmap_free(dbhash_bitmap);
117   dbhash_bitmap = nullptr;
118   xfree(dbhash_table);
119   dbhash_table = nullptr;
120 }
121 
dbhash_search_first(char * seq,uint64_t seqlen,struct dbhash_search_info_s * info)122 int64_t dbhash_search_first(char * seq,
123                         uint64_t seqlen,
124                         struct dbhash_search_info_s * info)
125 {
126 
127   uint64_t hash = hash_cityhash64(seq, seqlen);
128   info->hash = hash;
129   info->seq = seq;
130   info->seqlen = seqlen;
131   uint64_t index = hash & dbhash_mask;
132   struct dbhash_bucket_s * bp = dbhash_table + index;
133 
134   while (bitmap_get(dbhash_bitmap, index)
135          &&
136          ((bp->hash != hash) ||
137           (seqlen != db_getsequencelen(bp->seqno)) ||
138           (dbhash_seqcmp(seq, db_getsequence(bp->seqno), seqlen))))
139     {
140       index = (index + 1) & dbhash_mask;
141       bp = dbhash_table + index;
142     }
143 
144   info->index = index;
145 
146   if (bitmap_get(dbhash_bitmap, index)) {
147     return bp->seqno;
148   } else {
149     return -1;
150 
151         }
152 }
153 
dbhash_search_next(struct dbhash_search_info_s * info)154 int64_t dbhash_search_next(struct dbhash_search_info_s * info)
155 {
156   uint64_t hash = info->hash;
157   char * seq = info->seq;
158   uint64_t seqlen = info->seqlen;
159   uint64_t index = (info->index + 1) & dbhash_mask;
160   struct dbhash_bucket_s * bp = dbhash_table + index;
161 
162   while (bitmap_get(dbhash_bitmap, index)
163          &&
164          ((bp->hash != hash) ||
165           (seqlen != db_getsequencelen(bp->seqno)) ||
166           (dbhash_seqcmp(seq, db_getsequence(bp->seqno), seqlen))))
167     {
168       index = (index + 1) & dbhash_mask;
169       bp = dbhash_table + index;
170     }
171 
172   info->index = index;
173 
174   if (bitmap_get(dbhash_bitmap, index)) {
175     return bp->seqno;
176   } else {
177     return -1;
178 
179         }
180 }
181 
dbhash_add(char * seq,uint64_t seqlen,uint64_t seqno)182 void dbhash_add(char * seq, uint64_t seqlen, uint64_t seqno)
183 {
184   struct dbhash_search_info_s info;
185 
186   int64_t ret = dbhash_search_first(seq, seqlen, & info);
187   while (ret >= 0) {
188     ret = dbhash_search_next(&info);
189 
190         }
191 
192   bitmap_set(dbhash_bitmap, info.index);
193   struct dbhash_bucket_s * bp = dbhash_table + info.index;
194   bp->hash = info.hash;
195   bp->seqno = seqno;
196 }
197 
dbhash_add_one(uint64_t seqno)198 void dbhash_add_one(uint64_t seqno)
199 {
200   char * seq = db_getsequence(seqno);
201   uint64_t seqlen = db_getsequencelen(seqno);
202   char * normalized = (char*) xmalloc(seqlen+1);
203   string_normalize(normalized, seq, seqlen);
204   dbhash_add(normalized, seqlen, seqno);
205 }
206 
dbhash_add_all()207 void dbhash_add_all()
208 {
209   progress_init("Hashing database sequences", db_getsequencecount());
210   char * normalized = (char*) xmalloc(db_getlongestsequence()+1);
211   for(uint64_t seqno=0; seqno < db_getsequencecount(); seqno++)
212     {
213       char * seq = db_getsequence(seqno);
214       uint64_t seqlen = db_getsequencelen(seqno);
215       string_normalize(normalized, seq, seqlen);
216       dbhash_add(normalized, seqlen, seqno);
217       progress_update(seqno+1);
218     }
219   xfree(normalized);
220   progress_done();
221 }
222