1 /*
2
3 VSEARCH: a versatile open source tool for metagenomics
4
5 Copyright (C) 2014-2021, Torbjorn Rognes, Frederic Mahe and Tomas Flouri
6 All rights reserved.
7
8 Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
9 Department of Informatics, University of Oslo,
10 PO Box 1080 Blindern, NO-0316 Oslo, Norway
11
12 This software is dual-licensed and available under a choice
13 of one of two licenses, either under the terms of the GNU
14 General Public License version 3 or the BSD 2-Clause License.
15
16
17 GNU General Public License version 3
18
19 This program is free software: you can redistribute it and/or modify
20 it under the terms of the GNU General Public License as published by
21 the Free Software Foundation, either version 3 of the License, or
22 (at your option) any later version.
23
24 This program is distributed in the hope that it will be useful,
25 but WITHOUT ANY WARRANTY; without even the implied warranty of
26 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27 GNU General Public License for more details.
28
29 You should have received a copy of the GNU General Public License
30 along with this program. If not, see <http://www.gnu.org/licenses/>.
31
32
33 The BSD 2-Clause License
34
35 Redistribution and use in source and binary forms, with or without
36 modification, are permitted provided that the following conditions
37 are met:
38
39 1. Redistributions of source code must retain the above copyright
40 notice, this list of conditions and the following disclaimer.
41
42 2. Redistributions in binary form must reproduce the above copyright
43 notice, this list of conditions and the following disclaimer in the
44 documentation and/or other materials provided with the distribution.
45
46 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
47 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
48 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
49 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
50 COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
51 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
52 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
53 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
54 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
56 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
57 POSSIBILITY OF SUCH DAMAGE.
58
59 */
60
61 #include "vsearch.h"
62
63 static const int dust_word = 3;
64 static const int dust_level = 20;
65 static const int dust_window = 64;
66 static const int dust_window2 = dust_window / 2;
67 static const int word_count = 1 << (2 * dust_word);
68 static const int bitmask = word_count - 1;
69
wo(int len,const char * s,int * beg,int * end)70 int wo(int len, const char *s, int *beg, int *end)
71 {
72 int l1 = len - dust_word + 1 - 5; /* smallest possible region is 8 */
73 if (l1 < 0) {
74 return 0;
75
76 }
77
78 int bestv = 0;
79 int besti = 0;
80 int bestj = 0;
81 int counts[word_count];
82 int words[dust_window];
83 int word = 0;
84
85 for (int j = 0; j < len; j++)
86 {
87 word <<= 2;
88 word |= chrmap_2bit[(int)(s[j])];
89 words[j] = word & bitmask;
90 }
91
92 for (int i=0; i < l1; i++)
93 {
94 memset(counts, 0, sizeof(counts));
95
96 int sum = 0;
97
98 for (int j = dust_word-1; j<len-i; j++)
99 {
100 word = words[i+j];
101 int c = counts[word];
102 if (c)
103 {
104 sum += c;
105 int v = 10 * sum / j;
106
107 if (v > bestv)
108 {
109 bestv = v;
110 besti = i;
111 bestj = j;
112 }
113 }
114 counts[word]++;
115 }
116 }
117
118 *beg = besti;
119 *end = besti + bestj;
120
121 return bestv;
122 }
123
dust(char * m,int len)124 void dust(char * m, int len)
125 {
126 int a, b;
127
128 /* make a local copy of the original sequence */
129 char * s = (char*) xmalloc(len+1);
130 strcpy(s, m);
131
132 if (! opt_hardmask)
133 {
134 /* convert sequence to upper case unless hardmask in effect */
135 for(int i=0; i < len; i++) {
136 m[i] = toupper(m[i]);
137
138 }
139 m[len] = 0;
140 }
141
142 for (int i=0; i < len; i += dust_window2)
143 {
144 int l = (len > i + dust_window) ? dust_window : len-i;
145 int v = wo(l, s+i, &a, &b);
146
147 if (v > dust_level)
148 {
149 if (opt_hardmask) {
150 for(int j=a+i; j<=b+i; j++) {
151 m[j] = 'N';
152
153 }
154 } else {
155 for(int j=a+i; j<=b+i; j++) {
156 m[j] = s[j] | 0x20;
157
158 }
159
160 }
161
162 if (b < dust_window2) {
163 i += dust_window2 - b;
164
165 }
166 }
167 }
168
169 xfree(s);
170 }
171
172 static pthread_t * pthread;
173 static pthread_attr_t attr;
174 static pthread_mutex_t mutex;
175 static int nextseq = 0;
176 static int seqcount = 0;
177
dust_all_worker(void * vp)178 void * dust_all_worker(void * vp)
179 {
180 while(true)
181 {
182 xpthread_mutex_lock(&mutex);
183 int seqno = nextseq;
184 if (seqno < seqcount)
185 {
186 nextseq++;
187 progress_update(seqno);
188 xpthread_mutex_unlock(&mutex);
189 dust(db_getsequence(seqno), db_getsequencelen(seqno));
190 }
191 else
192 {
193 xpthread_mutex_unlock(&mutex);
194 break;
195 }
196 }
197 return nullptr;
198 }
199
dust_all()200 void dust_all()
201 {
202 nextseq = 0;
203 seqcount = db_getsequencecount();
204 progress_init("Masking", seqcount);
205
206 xpthread_mutex_init(&mutex, nullptr);
207
208 xpthread_attr_init(&attr);
209 xpthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
210
211 pthread = (pthread_t *) xmalloc(opt_threads * sizeof(pthread_t));
212
213 for(int t=0; t<opt_threads; t++) {
214 xpthread_create(pthread+t, &attr, dust_all_worker, (void*)(int64_t)t);
215
216 }
217
218 for(int t=0; t<opt_threads; t++) {
219 xpthread_join(pthread[t], nullptr);
220
221 }
222
223 xfree(pthread);
224
225 xpthread_attr_destroy(&attr);
226
227 xpthread_mutex_destroy(&mutex);
228
229 progress_done();
230 }
231
hardmask(char * seq,int len)232 void hardmask(char * seq, int len)
233 {
234 /* convert all lower case letters in seq to N */
235
236 for(int j=0; j<len; j++) {
237 if (seq[j] & 0x20) {
238 seq[j] = 'N';
239
240 }
241
242 }
243 }
244
hardmask_all()245 void hardmask_all()
246 {
247 for(uint64_t i=0; i<db_getsequencecount(); i++) {
248 hardmask(db_getsequence(i), db_getsequencelen(i));
249
250 }
251 }
252
maskfasta()253 void maskfasta()
254 {
255 FILE * fp_output = fopen_output(opt_output);
256 if (!fp_output) {
257 fatal("Unable to open mask output file for writing");
258
259 }
260
261 db_read(opt_maskfasta, 0);
262 show_rusage();
263
264 seqcount = db_getsequencecount();
265
266 if (opt_qmask == MASK_DUST) {
267 dust_all();
268 } else if ((opt_qmask == MASK_SOFT) && (opt_hardmask)) {
269 hardmask_all();
270
271 }
272 show_rusage();
273
274 progress_init("Writing output", seqcount);
275 for(int i=0; i<seqcount; i++)
276 {
277 fasta_print_db_relabel(fp_output, i, i+1);
278 progress_update(i);
279 }
280 progress_done();
281 show_rusage();
282
283 db_free();
284 fclose(fp_output);
285 }
286
fastx_mask()287 void fastx_mask()
288 {
289 FILE * fp_fastaout = nullptr;
290 FILE * fp_fastqout = nullptr;
291
292 if (opt_fastaout)
293 {
294 fp_fastaout = fopen_output(opt_fastaout);
295 if (!fp_fastaout) {
296 fatal("Unable to open mask output FASTA file for writing");
297
298 }
299 }
300
301 if (opt_fastqout)
302 {
303 fp_fastqout = fopen_output(opt_fastqout);
304 if (!fp_fastqout) {
305 fatal("Unable to open mask output FASTQ file for writing");
306
307 }
308 }
309
310 db_read(opt_fastx_mask, 0);
311 show_rusage();
312
313 if (fp_fastqout && ! db_is_fastq()) {
314 fatal("Cannot write FASTQ output with a FASTA input file, lacking quality scores");
315
316 }
317
318 seqcount = db_getsequencecount();
319
320 if (opt_qmask == MASK_DUST) {
321 dust_all();
322 } else if ((opt_qmask == MASK_SOFT) && (opt_hardmask)) {
323 hardmask_all();
324
325 }
326 show_rusage();
327
328 int kept = 0;
329 int discarded_less = 0;
330 int discarded_more = 0;
331 progress_init("Writing output", seqcount);
332 for(int i=0; i<seqcount; i++)
333 {
334 int unmasked = 0;
335 char * seq = db_getsequence(i);
336 int len = db_getsequencelen(i);
337 if (opt_qmask == MASK_NONE)
338 {
339 unmasked = len;
340 }
341 else if (opt_hardmask)
342 {
343 for(int j=0; j<len; j++) {
344 if (seq[j] != 'N') {
345 unmasked++;
346
347 }
348
349 }
350 }
351 else
352 {
353 for(int j=0; j<len; j++) {
354 if (isupper(seq[j])) {
355 unmasked++;
356
357 }
358
359 }
360 }
361 double unmasked_pct = 100.0 * unmasked / len;
362
363 if (unmasked_pct < opt_min_unmasked_pct) {
364 discarded_less++;
365 } else if (unmasked_pct > opt_max_unmasked_pct) {
366 discarded_more++;
367 } else
368 {
369 kept++;
370
371 if (opt_fastaout) {
372 fasta_print_general(fp_fastaout,
373 nullptr,
374 seq,
375 len,
376 db_getheader(i),
377 db_getheaderlen(i),
378 db_getabundance(i),
379 kept,
380 -1.0,
381 -1, -1, nullptr, 0.0);
382
383 }
384
385 if (opt_fastqout) {
386 fastq_print_general(fp_fastqout,
387 seq,
388 len,
389 db_getheader(i),
390 db_getheaderlen(i),
391 db_getquality(i),
392 db_getabundance(i),
393 kept,
394 -1.0);
395
396 }
397 }
398
399 progress_update(i);
400 }
401 progress_done();
402
403 if (!opt_quiet)
404 {
405 if (opt_min_unmasked_pct > 0.0) {
406 fprintf(stderr, "%d sequences with less than %.1lf%% unmasked residues discarded\n", discarded_less, opt_min_unmasked_pct);
407
408 }
409 if (opt_max_unmasked_pct < 100.0) {
410 fprintf(stderr, "%d sequences with more than %.1lf%% unmasked residues discarded\n", discarded_more, opt_max_unmasked_pct);
411
412 }
413 fprintf(stderr, "%d sequences kept\n", kept);
414 }
415
416 if (opt_log)
417 {
418 if (opt_min_unmasked_pct > 0.0) {
419 fprintf(fp_log, "%d sequences with less than %.1lf%% unmasked residues discarded\n", discarded_less, opt_min_unmasked_pct);
420
421 }
422 if (opt_max_unmasked_pct < 100.0) {
423 fprintf(fp_log, "%d sequences with more than %.1lf%% unmasked residues discarded\n", discarded_more, opt_max_unmasked_pct);
424
425 }
426 fprintf(fp_log, "%d sequences kept\n", kept);
427 }
428
429 show_rusage();
430 db_free();
431
432 if (fp_fastaout) {
433 fclose(fp_fastaout);
434
435 }
436 if (fp_fastqout) {
437 fclose(fp_fastqout);
438
439 }
440 }
441