1 /*
2 
3   VSEARCH: a versatile open source tool for metagenomics
4 
5   Copyright (C) 2014-2021, Torbjorn Rognes, Frederic Mahe and Tomas Flouri
6   All rights reserved.
7 
8   Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
9   Department of Informatics, University of Oslo,
10   PO Box 1080 Blindern, NO-0316 Oslo, Norway
11 
12   This software is dual-licensed and available under a choice
13   of one of two licenses, either under the terms of the GNU
14   General Public License version 3 or the BSD 2-Clause License.
15 
16 
17   GNU General Public License version 3
18 
19   This program is free software: you can redistribute it and/or modify
20   it under the terms of the GNU General Public License as published by
21   the Free Software Foundation, either version 3 of the License, or
22   (at your option) any later version.
23 
24   This program is distributed in the hope that it will be useful,
25   but WITHOUT ANY WARRANTY; without even the implied warranty of
26   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
27   GNU General Public License for more details.
28 
29   You should have received a copy of the GNU General Public License
30   along with this program.  If not, see <http://www.gnu.org/licenses/>.
31 
32 
33   The BSD 2-Clause License
34 
35   Redistribution and use in source and binary forms, with or without
36   modification, are permitted provided that the following conditions
37   are met:
38 
39   1. Redistributions of source code must retain the above copyright
40   notice, this list of conditions and the following disclaimer.
41 
42   2. Redistributions in binary form must reproduce the above copyright
43   notice, this list of conditions and the following disclaimer in the
44   documentation and/or other materials provided with the distribution.
45 
46   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
47   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
48   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
49   FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
50   COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
51   INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
52   BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
53   LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
54   CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55   LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
56   ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
57   POSSIBILITY OF SUCH DAMAGE.
58 
59 */
60 
61 #include "vsearch.h"
62 
63 unsigned int * kmercount;
64 uint64_t * kmerhash;
65 unsigned int * kmerindex;
66 bitmap_t * * kmerbitmap;
67 unsigned int * dbindex_map;
68 unsigned int kmerhashsize;
69 uint64_t kmerindexsize;
70 unsigned int dbindex_count;
71 uhandle_s * dbindex_uh;
72 
73 #define BITMAP_THRESHOLD 8
74 
75 static unsigned int bitmap_mincount;
76 
fprint_kmer(FILE * f,unsigned int kk,uint64_t kmer)77 void fprint_kmer(FILE * f, unsigned int kk, uint64_t kmer)
78 {
79   uint64_t x = kmer;
80   for(unsigned int i=0; i<kk; i++) {
81     fprintf(f, "%c", sym_nt_2bit[(x >> (2*(kk-i-1))) & 3]);
82 
83         }
84 }
85 
dbindex_addsequence(unsigned int seqno,int seqmask)86 void dbindex_addsequence(unsigned int seqno, int seqmask)
87 {
88 #if 0
89   printf("Adding seqno %d as index element no %d\n", seqno, dbindex_count);
90 #endif
91 
92   unsigned int uniquecount;
93   unsigned int * uniquelist;
94   unique_count(dbindex_uh, opt_wordlength,
95                db_getsequencelen(seqno), db_getsequence(seqno),
96                & uniquecount, & uniquelist, seqmask);
97   dbindex_map[dbindex_count] = seqno;
98   for(unsigned int i=0; i<uniquecount; i++)
99     {
100       unsigned int kmer = uniquelist[i];
101       if (kmerbitmap[kmer])
102         {
103           kmercount[kmer]++;
104           bitmap_set(kmerbitmap[kmer], dbindex_count);
105         }
106       else {
107         kmerindex[kmerhash[kmer]+(kmercount[kmer]++)] = dbindex_count;
108 
109         }
110     }
111   dbindex_count++;
112 }
113 
dbindex_addallsequences(int seqmask)114 void dbindex_addallsequences(int seqmask)
115 {
116   unsigned int seqcount = db_getsequencecount();
117   progress_init("Creating k-mer index", seqcount);
118   for(unsigned int seqno = 0; seqno < seqcount ; seqno++)
119     {
120       dbindex_addsequence(seqno, seqmask);
121       progress_update(seqno);
122     }
123   progress_done();
124 }
125 
dbindex_prepare(int use_bitmap,int seqmask)126 void dbindex_prepare(int use_bitmap, int seqmask)
127 {
128   dbindex_uh = unique_init();
129 
130   unsigned int seqcount = db_getsequencecount();
131   kmerhashsize = 1 << (2 * opt_wordlength);
132 
133   /* allocate memory for kmer count array */
134   kmercount = (unsigned int *) xmalloc(kmerhashsize * sizeof(unsigned int));
135   memset(kmercount, 0, kmerhashsize * sizeof(unsigned int));
136 
137   /* first scan, just count occurences */
138   progress_init("Counting k-mers", seqcount);
139   for(unsigned int seqno = 0; seqno < seqcount ; seqno++)
140     {
141       unsigned int uniquecount;
142       unsigned int * uniquelist;
143       unique_count(dbindex_uh, opt_wordlength,
144                    db_getsequencelen(seqno), db_getsequence(seqno),
145                    & uniquecount, & uniquelist, seqmask);
146       for(unsigned int i=0; i<uniquecount; i++) {
147         kmercount[uniquelist[i]]++;
148 
149         }
150       progress_update(seqno);
151     }
152   progress_done();
153 
154 #if 0
155   /* dump kmer counts */
156   FILE * f = fopen_output("kmercounts.txt");
157   for(unsigned int kmer=0; kmer < kmerhashsize; kmer++)
158     {
159       fprint_kmer(f, 8, kmer);
160       fprintf(f, "\t%d\t%d\n", kmer, kmercount[kmer]);
161     }
162   fclose(f);
163 #endif
164 
165   /* determine minimum kmer count for bitmap usage */
166   if (use_bitmap) {
167     bitmap_mincount = seqcount / BITMAP_THRESHOLD;
168   } else {
169     bitmap_mincount = seqcount + 1;
170 
171         }
172 
173   /* allocate and zero bitmap pointers */
174   kmerbitmap = (bitmap_t **) xmalloc(kmerhashsize * sizeof(bitmap_t *));
175   memset(kmerbitmap, 0, kmerhashsize * sizeof(bitmap_t *));
176 
177   /* hash / bitmap setup */
178   /* convert hash counts to position in index */
179   kmerhash = (uint64_t *) xmalloc((kmerhashsize+1) * sizeof(uint64_t));
180   uint64_t sum = 0;
181   for(unsigned int i = 0; i < kmerhashsize; i++)
182     {
183       kmerhash[i] = sum;
184       if (kmercount[i] >= bitmap_mincount)
185         {
186           kmerbitmap[i] = bitmap_init(seqcount+127); // pad for xmm
187           bitmap_reset_all(kmerbitmap[i]);
188         }
189       else {
190         sum += kmercount[i];
191 
192         }
193     }
194   kmerindexsize = sum;
195   kmerhash[kmerhashsize] = sum;
196 
197 #if 0
198   if (!opt_quiet)
199     fprintf(stderr, "Unique %ld-mers: %u\n", opt_wordlength, kmerindexsize);
200 #endif
201 
202   /* reset counts */
203   memset(kmercount, 0, kmerhashsize * sizeof(unsigned int));
204 
205   /* allocate space for actual data */
206   kmerindex = (unsigned int *) xmalloc(kmerindexsize * sizeof(unsigned int));
207 
208   /* allocate space for mapping from indexno to seqno */
209   dbindex_map = (unsigned int *) xmalloc(seqcount * sizeof(unsigned int));
210 
211   dbindex_count = 0;
212 
213   show_rusage();
214 }
215 
dbindex_free()216 void dbindex_free()
217 {
218   xfree(kmerhash);
219   xfree(kmerindex);
220   xfree(kmercount);
221   xfree(dbindex_map);
222 
223   for(unsigned int kmer=0; kmer<kmerhashsize; kmer++) {
224     if (kmerbitmap[kmer]) {
225       bitmap_free(kmerbitmap[kmer]);
226 
227         }
228 
229         }
230   xfree(kmerbitmap);
231   unique_exit(dbindex_uh);
232 }
233