1 /*
2
3 VSEARCH: a versatile open source tool for metagenomics
4
5 Copyright (C) 2014-2021, Torbjorn Rognes, Frederic Mahe and Tomas Flouri
6 All rights reserved.
7
8 Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
9 Department of Informatics, University of Oslo,
10 PO Box 1080 Blindern, NO-0316 Oslo, Norway
11
12 This software is dual-licensed and available under a choice
13 of one of two licenses, either under the terms of the GNU
14 General Public License version 3 or the BSD 2-Clause License.
15
16
17 GNU General Public License version 3
18
19 This program is free software: you can redistribute it and/or modify
20 it under the terms of the GNU General Public License as published by
21 the Free Software Foundation, either version 3 of the License, or
22 (at your option) any later version.
23
24 This program is distributed in the hope that it will be useful,
25 but WITHOUT ANY WARRANTY; without even the implied warranty of
26 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27 GNU General Public License for more details.
28
29 You should have received a copy of the GNU General Public License
30 along with this program. If not, see <http://www.gnu.org/licenses/>.
31
32
33 The BSD 2-Clause License
34
35 Redistribution and use in source and binary forms, with or without
36 modification, are permitted provided that the following conditions
37 are met:
38
39 1. Redistributions of source code must retain the above copyright
40 notice, this list of conditions and the following disclaimer.
41
42 2. Redistributions in binary form must reproduce the above copyright
43 notice, this list of conditions and the following disclaimer in the
44 documentation and/or other materials provided with the distribution.
45
46 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
47 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
48 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
49 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
50 COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
51 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
52 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
53 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
54 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
56 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
57 POSSIBILITY OF SUCH DAMAGE.
58
59 */
60
61 #include "vsearch.h"
62
63 unsigned int * kmercount;
64 uint64_t * kmerhash;
65 unsigned int * kmerindex;
66 bitmap_t * * kmerbitmap;
67 unsigned int * dbindex_map;
68 unsigned int kmerhashsize;
69 uint64_t kmerindexsize;
70 unsigned int dbindex_count;
71 uhandle_s * dbindex_uh;
72
73 #define BITMAP_THRESHOLD 8
74
75 static unsigned int bitmap_mincount;
76
fprint_kmer(FILE * f,unsigned int kk,uint64_t kmer)77 void fprint_kmer(FILE * f, unsigned int kk, uint64_t kmer)
78 {
79 uint64_t x = kmer;
80 for(unsigned int i=0; i<kk; i++) {
81 fprintf(f, "%c", sym_nt_2bit[(x >> (2*(kk-i-1))) & 3]);
82
83 }
84 }
85
dbindex_addsequence(unsigned int seqno,int seqmask)86 void dbindex_addsequence(unsigned int seqno, int seqmask)
87 {
88 #if 0
89 printf("Adding seqno %d as index element no %d\n", seqno, dbindex_count);
90 #endif
91
92 unsigned int uniquecount;
93 unsigned int * uniquelist;
94 unique_count(dbindex_uh, opt_wordlength,
95 db_getsequencelen(seqno), db_getsequence(seqno),
96 & uniquecount, & uniquelist, seqmask);
97 dbindex_map[dbindex_count] = seqno;
98 for(unsigned int i=0; i<uniquecount; i++)
99 {
100 unsigned int kmer = uniquelist[i];
101 if (kmerbitmap[kmer])
102 {
103 kmercount[kmer]++;
104 bitmap_set(kmerbitmap[kmer], dbindex_count);
105 }
106 else {
107 kmerindex[kmerhash[kmer]+(kmercount[kmer]++)] = dbindex_count;
108
109 }
110 }
111 dbindex_count++;
112 }
113
dbindex_addallsequences(int seqmask)114 void dbindex_addallsequences(int seqmask)
115 {
116 unsigned int seqcount = db_getsequencecount();
117 progress_init("Creating k-mer index", seqcount);
118 for(unsigned int seqno = 0; seqno < seqcount ; seqno++)
119 {
120 dbindex_addsequence(seqno, seqmask);
121 progress_update(seqno);
122 }
123 progress_done();
124 }
125
dbindex_prepare(int use_bitmap,int seqmask)126 void dbindex_prepare(int use_bitmap, int seqmask)
127 {
128 dbindex_uh = unique_init();
129
130 unsigned int seqcount = db_getsequencecount();
131 kmerhashsize = 1 << (2 * opt_wordlength);
132
133 /* allocate memory for kmer count array */
134 kmercount = (unsigned int *) xmalloc(kmerhashsize * sizeof(unsigned int));
135 memset(kmercount, 0, kmerhashsize * sizeof(unsigned int));
136
137 /* first scan, just count occurences */
138 progress_init("Counting k-mers", seqcount);
139 for(unsigned int seqno = 0; seqno < seqcount ; seqno++)
140 {
141 unsigned int uniquecount;
142 unsigned int * uniquelist;
143 unique_count(dbindex_uh, opt_wordlength,
144 db_getsequencelen(seqno), db_getsequence(seqno),
145 & uniquecount, & uniquelist, seqmask);
146 for(unsigned int i=0; i<uniquecount; i++) {
147 kmercount[uniquelist[i]]++;
148
149 }
150 progress_update(seqno);
151 }
152 progress_done();
153
154 #if 0
155 /* dump kmer counts */
156 FILE * f = fopen_output("kmercounts.txt");
157 for(unsigned int kmer=0; kmer < kmerhashsize; kmer++)
158 {
159 fprint_kmer(f, 8, kmer);
160 fprintf(f, "\t%d\t%d\n", kmer, kmercount[kmer]);
161 }
162 fclose(f);
163 #endif
164
165 /* determine minimum kmer count for bitmap usage */
166 if (use_bitmap) {
167 bitmap_mincount = seqcount / BITMAP_THRESHOLD;
168 } else {
169 bitmap_mincount = seqcount + 1;
170
171 }
172
173 /* allocate and zero bitmap pointers */
174 kmerbitmap = (bitmap_t **) xmalloc(kmerhashsize * sizeof(bitmap_t *));
175 memset(kmerbitmap, 0, kmerhashsize * sizeof(bitmap_t *));
176
177 /* hash / bitmap setup */
178 /* convert hash counts to position in index */
179 kmerhash = (uint64_t *) xmalloc((kmerhashsize+1) * sizeof(uint64_t));
180 uint64_t sum = 0;
181 for(unsigned int i = 0; i < kmerhashsize; i++)
182 {
183 kmerhash[i] = sum;
184 if (kmercount[i] >= bitmap_mincount)
185 {
186 kmerbitmap[i] = bitmap_init(seqcount+127); // pad for xmm
187 bitmap_reset_all(kmerbitmap[i]);
188 }
189 else {
190 sum += kmercount[i];
191
192 }
193 }
194 kmerindexsize = sum;
195 kmerhash[kmerhashsize] = sum;
196
197 #if 0
198 if (!opt_quiet)
199 fprintf(stderr, "Unique %ld-mers: %u\n", opt_wordlength, kmerindexsize);
200 #endif
201
202 /* reset counts */
203 memset(kmercount, 0, kmerhashsize * sizeof(unsigned int));
204
205 /* allocate space for actual data */
206 kmerindex = (unsigned int *) xmalloc(kmerindexsize * sizeof(unsigned int));
207
208 /* allocate space for mapping from indexno to seqno */
209 dbindex_map = (unsigned int *) xmalloc(seqcount * sizeof(unsigned int));
210
211 dbindex_count = 0;
212
213 show_rusage();
214 }
215
dbindex_free()216 void dbindex_free()
217 {
218 xfree(kmerhash);
219 xfree(kmerindex);
220 xfree(kmercount);
221 xfree(dbindex_map);
222
223 for(unsigned int kmer=0; kmer<kmerhashsize; kmer++) {
224 if (kmerbitmap[kmer]) {
225 bitmap_free(kmerbitmap[kmer]);
226
227 }
228
229 }
230 xfree(kmerbitmap);
231 unique_exit(dbindex_uh);
232 }
233