1 /*
2 
3   VSEARCH: a versatile open source tool for metagenomics
4 
5   Copyright (C) 2014-2021, Torbjorn Rognes, Frederic Mahe and Tomas Flouri
6   All rights reserved.
7 
8   Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
9   Department of Informatics, University of Oslo,
10   PO Box 1080 Blindern, NO-0316 Oslo, Norway
11 
12   This software is dual-licensed and available under a choice
13   of one of two licenses, either under the terms of the GNU
14   General Public License version 3 or the BSD 2-Clause License.
15 
16 
17   GNU General Public License version 3
18 
19   This program is free software: you can redistribute it and/or modify
20   it under the terms of the GNU General Public License as published by
21   the Free Software Foundation, either version 3 of the License, or
22   (at your option) any later version.
23 
24   This program is distributed in the hope that it will be useful,
25   but WITHOUT ANY WARRANTY; without even the implied warranty of
26   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
27   GNU General Public License for more details.
28 
29   You should have received a copy of the GNU General Public License
30   along with this program.  If not, see <http://www.gnu.org/licenses/>.
31 
32 
33   The BSD 2-Clause License
34 
35   Redistribution and use in source and binary forms, with or without
36   modification, are permitted provided that the following conditions
37   are met:
38 
39   1. Redistributions of source code must retain the above copyright
40   notice, this list of conditions and the following disclaimer.
41 
42   2. Redistributions in binary form must reproduce the above copyright
43   notice, this list of conditions and the following disclaimer in the
44   documentation and/or other materials provided with the distribution.
45 
46   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
47   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
48   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
49   FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
50   COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
51   INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
52   BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
53   LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
54   CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55   LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
56   ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
57   POSSIBILITY OF SUCH DAMAGE.
58 
59 */
60 
61 #include "vsearch.h"
62 
63 static const int dust_word = 3;
64 static const int dust_level = 20;
65 static const int dust_window = 64;
66 static const int dust_window2 = dust_window / 2;
67 static const int word_count = 1 << (2 * dust_word);
68 static const int bitmask = word_count - 1;
69 
wo(int len,const char * s,int * beg,int * end)70 int wo(int len, const char *s, int *beg, int *end)
71 {
72   int l1 = len - dust_word + 1 - 5; /* smallest possible region is 8 */
73   if (l1 < 0) {
74     return 0;
75 
76         }
77 
78   int bestv = 0;
79   int besti = 0;
80   int bestj = 0;
81   int counts[word_count];
82   int words[dust_window];
83   int word = 0;
84 
85   for (int j = 0; j < len; j++)
86     {
87       word <<= 2;
88       word |= chrmap_2bit[(int)(s[j])];
89       words[j] = word & bitmask;
90     }
91 
92   for (int i=0; i < l1; i++)
93     {
94       memset(counts, 0, sizeof(counts));
95 
96       int sum = 0;
97 
98       for (int j = dust_word-1; j<len-i; j++)
99         {
100           word = words[i+j];
101           int c = counts[word];
102           if (c)
103             {
104               sum += c;
105               int v = 10 * sum / j;
106 
107               if (v > bestv)
108                 {
109                   bestv = v;
110                   besti = i;
111                   bestj = j;
112                 }
113             }
114           counts[word]++;
115         }
116     }
117 
118   *beg = besti;
119   *end = besti + bestj;
120 
121   return bestv;
122 }
123 
dust(char * m,int len)124 void dust(char * m, int len)
125 {
126   int a, b;
127 
128   /* make a local copy of the original sequence */
129   char * s = (char*) xmalloc(len+1);
130   strcpy(s, m);
131 
132   if (! opt_hardmask)
133     {
134       /* convert sequence to upper case unless hardmask in effect */
135       for(int i=0; i < len; i++) {
136         m[i] = toupper(m[i]);
137 
138         }
139       m[len] = 0;
140     }
141 
142   for (int i=0; i < len; i += dust_window2)
143     {
144       int l = (len > i + dust_window) ? dust_window : len-i;
145       int v = wo(l, s+i, &a, &b);
146 
147       if (v > dust_level)
148         {
149           if (opt_hardmask) {
150             for(int j=a+i; j<=b+i; j++) {
151               m[j] = 'N';
152 
153         }
154           } else {
155             for(int j=a+i; j<=b+i; j++) {
156               m[j] = s[j] | 0x20;
157 
158         }
159 
160         }
161 
162           if (b < dust_window2) {
163             i += dust_window2 - b;
164 
165         }
166         }
167     }
168 
169   xfree(s);
170 }
171 
172 static pthread_t * pthread;
173 static pthread_attr_t attr;
174 static pthread_mutex_t mutex;
175 static int nextseq = 0;
176 static int seqcount = 0;
177 
dust_all_worker(void * vp)178 void * dust_all_worker(void * vp)
179 {
180   while(true)
181     {
182       xpthread_mutex_lock(&mutex);
183       int seqno = nextseq;
184       if (seqno < seqcount)
185         {
186           nextseq++;
187           progress_update(seqno);
188           xpthread_mutex_unlock(&mutex);
189           dust(db_getsequence(seqno), db_getsequencelen(seqno));
190         }
191       else
192         {
193           xpthread_mutex_unlock(&mutex);
194           break;
195         }
196     }
197   return nullptr;
198 }
199 
dust_all()200 void dust_all()
201 {
202   nextseq = 0;
203   seqcount = db_getsequencecount();
204   progress_init("Masking", seqcount);
205 
206   xpthread_mutex_init(&mutex, nullptr);
207 
208   xpthread_attr_init(&attr);
209   xpthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
210 
211   pthread = (pthread_t *) xmalloc(opt_threads * sizeof(pthread_t));
212 
213   for(int t=0; t<opt_threads; t++) {
214     xpthread_create(pthread+t, &attr, dust_all_worker, (void*)(int64_t)t);
215 
216         }
217 
218   for(int t=0; t<opt_threads; t++) {
219     xpthread_join(pthread[t], nullptr);
220 
221         }
222 
223   xfree(pthread);
224 
225   xpthread_attr_destroy(&attr);
226 
227   xpthread_mutex_destroy(&mutex);
228 
229   progress_done();
230 }
231 
hardmask(char * seq,int len)232 void hardmask(char * seq, int len)
233 {
234   /* convert all lower case letters in seq to N */
235 
236   for(int j=0; j<len; j++) {
237     if (seq[j] & 0x20) {
238       seq[j] = 'N';
239 
240         }
241 
242         }
243 }
244 
hardmask_all()245 void hardmask_all()
246 {
247   for(uint64_t i=0; i<db_getsequencecount(); i++) {
248     hardmask(db_getsequence(i), db_getsequencelen(i));
249 
250         }
251 }
252 
maskfasta()253 void maskfasta()
254 {
255   FILE * fp_output = fopen_output(opt_output);
256   if (!fp_output) {
257     fatal("Unable to open mask output file for writing");
258 
259         }
260 
261   db_read(opt_maskfasta, 0);
262   show_rusage();
263 
264   seqcount = db_getsequencecount();
265 
266   if (opt_qmask == MASK_DUST) {
267     dust_all();
268   } else if ((opt_qmask == MASK_SOFT) && (opt_hardmask)) {
269     hardmask_all();
270 
271         }
272   show_rusage();
273 
274   progress_init("Writing output", seqcount);
275   for(int i=0; i<seqcount; i++)
276     {
277       fasta_print_db_relabel(fp_output, i, i+1);
278       progress_update(i);
279     }
280   progress_done();
281   show_rusage();
282 
283   db_free();
284   fclose(fp_output);
285 }
286 
fastx_mask()287 void fastx_mask()
288 {
289   FILE * fp_fastaout = nullptr;
290   FILE * fp_fastqout = nullptr;
291 
292   if (opt_fastaout)
293     {
294       fp_fastaout = fopen_output(opt_fastaout);
295       if (!fp_fastaout) {
296         fatal("Unable to open mask output FASTA file for writing");
297 
298         }
299     }
300 
301   if (opt_fastqout)
302     {
303       fp_fastqout = fopen_output(opt_fastqout);
304       if (!fp_fastqout) {
305         fatal("Unable to open mask output FASTQ file for writing");
306 
307         }
308     }
309 
310   db_read(opt_fastx_mask, 0);
311   show_rusage();
312 
313   if (fp_fastqout && ! db_is_fastq()) {
314     fatal("Cannot write FASTQ output with a FASTA input file, lacking quality scores");
315 
316         }
317 
318   seqcount = db_getsequencecount();
319 
320   if (opt_qmask == MASK_DUST) {
321     dust_all();
322   } else if ((opt_qmask == MASK_SOFT) && (opt_hardmask)) {
323     hardmask_all();
324 
325         }
326   show_rusage();
327 
328   int kept = 0;
329   int discarded_less = 0;
330   int discarded_more = 0;
331   progress_init("Writing output", seqcount);
332   for(int i=0; i<seqcount; i++)
333     {
334       int unmasked = 0;
335       char * seq = db_getsequence(i);
336       int len = db_getsequencelen(i);
337       if (opt_qmask == MASK_NONE)
338         {
339           unmasked = len;
340         }
341       else if (opt_hardmask)
342         {
343           for(int j=0; j<len; j++) {
344             if (seq[j] != 'N') {
345               unmasked++;
346 
347         }
348 
349         }
350         }
351       else
352         {
353           for(int j=0; j<len; j++) {
354             if (isupper(seq[j])) {
355               unmasked++;
356 
357         }
358 
359         }
360         }
361       double unmasked_pct = 100.0 * unmasked / len;
362 
363       if (unmasked_pct < opt_min_unmasked_pct) {
364         discarded_less++;
365       } else if (unmasked_pct >  opt_max_unmasked_pct) {
366         discarded_more++;
367       } else
368         {
369           kept++;
370 
371           if (opt_fastaout) {
372             fasta_print_general(fp_fastaout,
373                                 nullptr,
374                                 seq,
375                                 len,
376                                 db_getheader(i),
377                                 db_getheaderlen(i),
378                                 db_getabundance(i),
379                                 kept,
380                                 -1.0,
381                                 -1, -1, nullptr, 0.0);
382 
383         }
384 
385           if (opt_fastqout) {
386             fastq_print_general(fp_fastqout,
387                                 seq,
388                                 len,
389                                 db_getheader(i),
390                                 db_getheaderlen(i),
391                                 db_getquality(i),
392                                 db_getabundance(i),
393                                 kept,
394                                 -1.0);
395 
396         }
397         }
398 
399       progress_update(i);
400     }
401   progress_done();
402 
403   if (!opt_quiet)
404     {
405       if (opt_min_unmasked_pct > 0.0) {
406         fprintf(stderr, "%d sequences with less than %.1lf%% unmasked residues discarded\n", discarded_less, opt_min_unmasked_pct);
407 
408         }
409       if (opt_max_unmasked_pct < 100.0) {
410         fprintf(stderr, "%d sequences with more than %.1lf%% unmasked residues discarded\n", discarded_more, opt_max_unmasked_pct);
411 
412         }
413       fprintf(stderr, "%d sequences kept\n", kept);
414     }
415 
416   if (opt_log)
417     {
418       if (opt_min_unmasked_pct > 0.0) {
419         fprintf(fp_log, "%d sequences with less than %.1lf%% unmasked residues discarded\n", discarded_less, opt_min_unmasked_pct);
420 
421         }
422       if (opt_max_unmasked_pct < 100.0) {
423         fprintf(fp_log, "%d sequences with more than %.1lf%% unmasked residues discarded\n", discarded_more, opt_max_unmasked_pct);
424 
425         }
426       fprintf(fp_log, "%d sequences kept\n", kept);
427     }
428 
429   show_rusage();
430   db_free();
431 
432   if (fp_fastaout) {
433     fclose(fp_fastaout);
434 
435         }
436   if (fp_fastqout) {
437     fclose(fp_fastqout);
438 
439         }
440 }
441