1 /*
2
3 VSEARCH: a versatile open source tool for metagenomics
4
5 Copyright (C) 2014-2021, Torbjorn Rognes, Frederic Mahe and Tomas Flouri
6 All rights reserved.
7
8 Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
9 Department of Informatics, University of Oslo,
10 PO Box 1080 Blindern, NO-0316 Oslo, Norway
11
12 This software is dual-licensed and available under a choice
13 of one of two licenses, either under the terms of the GNU
14 General Public License version 3 or the BSD 2-Clause License.
15
16
17 GNU General Public License version 3
18
19 This program is free software: you can redistribute it and/or modify
20 it under the terms of the GNU General Public License as published by
21 the Free Software Foundation, either version 3 of the License, or
22 (at your option) any later version.
23
24 This program is distributed in the hope that it will be useful,
25 but WITHOUT ANY WARRANTY; without even the implied warranty of
26 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27 GNU General Public License for more details.
28
29 You should have received a copy of the GNU General Public License
30 along with this program. If not, see <http://www.gnu.org/licenses/>.
31
32
33 The BSD 2-Clause License
34
35 Redistribution and use in source and binary forms, with or without
36 modification, are permitted provided that the following conditions
37 are met:
38
39 1. Redistributions of source code must retain the above copyright
40 notice, this list of conditions and the following disclaimer.
41
42 2. Redistributions in binary form must reproduce the above copyright
43 notice, this list of conditions and the following disclaimer in the
44 documentation and/or other materials provided with the distribution.
45
46 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
47 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
48 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
49 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
50 COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
51 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
52 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
53 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
54 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
56 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
57 POSSIBILITY OF SUCH DAMAGE.
58
59 */
60
61 #include "vsearch.h"
62
fasta_open(const char * filename)63 fastx_handle fasta_open(const char * filename)
64 {
65 fastx_handle h = fastx_open(filename);
66
67 if (fastx_is_fastq(h) && ! h->is_empty) {
68 fatal("FASTA file expected, FASTQ file found (%s)", filename);
69
70 }
71
72 return h;
73 }
74
fasta_close(fastx_handle h)75 void fasta_close(fastx_handle h)
76 {
77 fastx_close(h);
78 }
79
fasta_filter_sequence(fastx_handle h,unsigned int * char_action,const unsigned char * char_mapping)80 void fasta_filter_sequence(fastx_handle h,
81 unsigned int * char_action,
82 const unsigned char * char_mapping)
83 {
84 /* Strip unwanted characters from the sequence and raise warnings or
85 errors on certain characters. */
86
87 char * p = h->sequence_buffer.data;
88 char * q = p;
89 char c;
90 char msg[200];
91
92 while ((c = *p++))
93 {
94 char m = char_action[(unsigned char)c];
95
96 switch(m)
97 {
98 case 0:
99 /* stripped */
100 h->stripped_all++;
101 h->stripped[(unsigned char)c]++;
102 break;
103
104 case 1:
105 /* legal character */
106 *q++ = char_mapping[(unsigned char)(c)];
107 break;
108
109 case 2:
110 /* fatal character */
111 if ((c>=32) && (c<127)) {
112 snprintf(msg,
113 200,
114 "Illegal character '%c' in sequence on line %" PRIu64 " of FASTA file",
115 (unsigned char)c,
116 h->lineno);
117 } else {
118 snprintf(msg,
119 200,
120 "Illegal unprintable ASCII character no %d in sequence on line %" PRIu64 " of FASTA file",
121 (unsigned char) c,
122 h->lineno);
123
124 }
125 fatal(msg);
126 break;
127
128 case 3:
129 /* silently stripped chars (whitespace) */
130 break;
131
132 case 4:
133 /* newline (silently stripped) */
134 h->lineno++;
135 break;
136 }
137 }
138
139 /* add zero after sequence */
140 *q = 0;
141 h->sequence_buffer.length = q - h->sequence_buffer.data;
142 }
143
fasta_next(fastx_handle h,bool truncateatspace,const unsigned char * char_mapping)144 bool fasta_next(fastx_handle h,
145 bool truncateatspace,
146 const unsigned char * char_mapping)
147 {
148 h->lineno_start = h->lineno;
149
150 h->header_buffer.length = 0;
151 h->header_buffer.data[0] = 0;
152 h->sequence_buffer.length = 0;
153 h->sequence_buffer.data[0] = 0;
154
155 uint64_t rest = fastx_file_fill_buffer(h);
156
157 if (rest == 0) {
158 return false;
159
160 }
161
162 /* read header */
163
164 /* check initial > character */
165
166 if (h->file_buffer.data[h->file_buffer.position] != '>')
167 {
168 fprintf(stderr, "Found character %02x\n", (unsigned char)(h->file_buffer.data[h->file_buffer.position]));
169 fatal("Invalid FASTA - header must start with > character");
170 }
171 h->file_buffer.position++;
172 rest--;
173
174 char * lf = nullptr;
175 while (lf == nullptr)
176 {
177 /* get more data if buffer empty*/
178 rest = fastx_file_fill_buffer(h);
179 if (rest == 0) {
180 fatal("Invalid FASTA - header must be terminated with newline");
181
182 }
183
184 /* find LF */
185 lf = (char *) memchr(h->file_buffer.data + h->file_buffer.position,
186 '\n',
187 rest);
188
189 /* copy to header buffer */
190 uint64_t len = rest;
191 if (lf)
192 {
193 /* LF found, copy up to and including LF */
194 len = lf - (h->file_buffer.data + h->file_buffer.position) + 1;
195 h->lineno++;
196 }
197 buffer_extend(& h->header_buffer,
198 h->file_buffer.data + h->file_buffer.position,
199 len);
200 h->file_buffer.position += len;
201 rest -= len;
202 }
203
204 /* read one or more sequence lines */
205
206 while (true)
207 {
208 /* get more data, if necessary */
209 rest = fastx_file_fill_buffer(h);
210
211 /* end if no more data */
212 if (rest == 0) {
213 break;
214
215 }
216
217 /* end if new sequence starts */
218 if (lf && (h->file_buffer.data[h->file_buffer.position] == '>')) {
219 break;
220
221 }
222
223 /* find LF */
224 lf = (char *) memchr(h->file_buffer.data + h->file_buffer.position,
225 '\n', rest);
226
227 uint64_t len = rest;
228 if (lf)
229 {
230 /* LF found, copy up to and including LF */
231 len = lf - (h->file_buffer.data + h->file_buffer.position) + 1;
232 }
233 buffer_extend(& h->sequence_buffer,
234 h->file_buffer.data + h->file_buffer.position,
235 len);
236 h->file_buffer.position += len;
237 rest -= len;
238 }
239
240 h->seqno++;
241
242 fastx_filter_header(h, truncateatspace);
243 fasta_filter_sequence(h, char_fasta_action, char_mapping);
244
245 return true;
246 }
247
fasta_get_abundance(fastx_handle h)248 int64_t fasta_get_abundance(fastx_handle h)
249 {
250 // return 1 if not present
251 int64_t size = header_get_size(h->header_buffer.data,
252 h->header_buffer.length);
253 if (size > 0) {
254 return size;
255 } else {
256 return 1;
257
258 }
259 }
260
fasta_get_abundance_and_presence(fastx_handle h)261 int64_t fasta_get_abundance_and_presence(fastx_handle h)
262 {
263 // return 0 if not present
264 return header_get_size(h->header_buffer.data, h->header_buffer.length);
265 }
266
fasta_get_position(fastx_handle h)267 uint64_t fasta_get_position(fastx_handle h)
268 {
269 return h->file_position;
270 }
271
fasta_get_size(fastx_handle h)272 uint64_t fasta_get_size(fastx_handle h)
273 {
274 return h->file_size;
275 }
276
fasta_get_lineno(fastx_handle h)277 uint64_t fasta_get_lineno(fastx_handle h)
278 {
279 return h->lineno_start;
280 }
281
fasta_get_seqno(fastx_handle h)282 uint64_t fasta_get_seqno(fastx_handle h)
283 {
284 return h->seqno;
285 }
286
fasta_get_header_length(fastx_handle h)287 uint64_t fasta_get_header_length(fastx_handle h)
288 {
289 return h->header_buffer.length;
290 }
291
fasta_get_sequence_length(fastx_handle h)292 uint64_t fasta_get_sequence_length(fastx_handle h)
293 {
294 return h->sequence_buffer.length;
295 }
296
fasta_get_header(fastx_handle h)297 char * fasta_get_header(fastx_handle h)
298 {
299 return h->header_buffer.data;
300 }
301
fasta_get_sequence(fastx_handle h)302 char * fasta_get_sequence(fastx_handle h)
303 {
304 return h->sequence_buffer.data;
305 }
306
307
308 /* fasta output */
309
fasta_print_sequence(FILE * fp,char * seq,uint64_t len,int width)310 void fasta_print_sequence(FILE * fp, char * seq, uint64_t len, int width)
311 {
312 /*
313 The actual length of the sequence may be longer than "len", but only
314 "len" characters are printed.
315
316 Specify width of lines - zero (or <1) means linearize (all on one line).
317 */
318
319 if (width < 1) {
320 fprintf(fp, "%.*s\n", (int)(len), seq);
321 } else
322 {
323 int64_t rest = len;
324 for(uint64_t i=0; i<len; i += width)
325 {
326 fprintf(fp, "%.*s\n", (int)(MIN(rest,width)), seq+i);
327 rest -= width;
328 }
329 }
330 }
331
fasta_print(FILE * fp,const char * hdr,char * seq,uint64_t len)332 void fasta_print(FILE * fp, const char * hdr,
333 char * seq, uint64_t len)
334 {
335 fprintf(fp, ">%s\n", hdr);
336 fasta_print_sequence(fp, seq, len, opt_fasta_width);
337 }
338
fprint_seq_label(FILE * fp,char * seq,int len)339 inline void fprint_seq_label(FILE * fp, char * seq, int len)
340 {
341 /* normalize first? */
342 fprintf(fp, "%.*s", len, seq);
343 }
344
fasta_print_general(FILE * fp,const char * prefix,char * seq,int len,char * header,int header_len,unsigned int abundance,int ordinal,double ee,int clustersize,int clusterid,const char * score_name,double score)345 void fasta_print_general(FILE * fp,
346 const char * prefix,
347 char * seq,
348 int len,
349 char * header,
350 int header_len,
351 unsigned int abundance,
352 int ordinal,
353 double ee,
354 int clustersize,
355 int clusterid,
356 const char * score_name,
357 double score)
358 {
359 fprintf(fp, ">");
360
361 if (prefix) {
362 fprintf(fp, "%s", prefix);
363
364 }
365
366 if (opt_relabel_self) {
367 fprint_seq_label(fp, seq, len);
368 } else if (opt_relabel_sha1) {
369 fprint_seq_digest_sha1(fp, seq, len);
370 } else if (opt_relabel_md5) {
371 fprint_seq_digest_md5(fp, seq, len);
372 } else if (opt_relabel && (ordinal > 0)) {
373 fprintf(fp, "%s%d", opt_relabel, ordinal);
374 } else
375 {
376 bool xsize = opt_xsize || (opt_sizeout && (abundance > 0));
377 bool xee = opt_xee || ((opt_eeout || opt_fastq_eeout) && (ee >= 0.0));
378 header_fprint_strip_size_ee(fp,
379 header,
380 header_len,
381 xsize,
382 xee);
383 }
384
385 if (opt_label_suffix) {
386 fprintf(fp, "%s", opt_label_suffix);
387
388 }
389
390 if (clustersize > 0) {
391 fprintf(fp, ";seqs=%d", clustersize);
392
393 }
394
395 if (clusterid >= 0) {
396 fprintf(fp, ";clusterid=%d", clusterid);
397
398 }
399
400 if (opt_sizeout && (abundance > 0)) {
401 fprintf(fp, ";size=%u", abundance);
402
403 }
404
405 if ((opt_eeout || opt_fastq_eeout) && (ee >= 0.0)) {
406 fprintf(fp, ";ee=%.4lf", ee);
407
408 }
409
410 if (score_name) {
411 fprintf(fp, ";%s=%.4lf", score_name, score);
412
413 }
414
415 if (opt_relabel_keep &&
416 ((opt_relabel && (ordinal > 0)) || opt_relabel_sha1 || opt_relabel_md5 || opt_relabel_self)) {
417 fprintf(fp, " %s", header);
418
419 }
420
421 fprintf(fp, "\n");
422
423 if (seq) {
424 fasta_print_sequence(fp, seq, len, opt_fasta_width);
425
426 }
427 }
428
fasta_print_db_relabel(FILE * fp,uint64_t seqno,int ordinal)429 void fasta_print_db_relabel(FILE * fp,
430 uint64_t seqno,
431 int ordinal)
432 {
433 fasta_print_general(fp,
434 nullptr,
435 db_getsequence(seqno),
436 db_getsequencelen(seqno),
437 db_getheader(seqno),
438 db_getheaderlen(seqno),
439 db_getabundance(seqno),
440 ordinal,
441 -1.0,
442 -1, -1,
443 nullptr, 0.0);
444 }
445
fasta_print_db(FILE * fp,uint64_t seqno)446 void fasta_print_db(FILE * fp, uint64_t seqno)
447 {
448 fasta_print_general(fp,
449 nullptr,
450 db_getsequence(seqno),
451 db_getsequencelen(seqno),
452 db_getheader(seqno),
453 db_getheaderlen(seqno),
454 db_getabundance(seqno),
455 0,
456 -1.0,
457 -1, -1,
458 nullptr, 0.0);
459 }
460