1 /*
2 
3   VSEARCH: a versatile open source tool for metagenomics
4 
5   Copyright (C) 2014-2021, Torbjorn Rognes, Frederic Mahe and Tomas Flouri
6   All rights reserved.
7 
8   Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
9   Department of Informatics, University of Oslo,
10   PO Box 1080 Blindern, NO-0316 Oslo, Norway
11 
12   This software is dual-licensed and available under a choice
13   of one of two licenses, either under the terms of the GNU
14   General Public License version 3 or the BSD 2-Clause License.
15 
16 
17   GNU General Public License version 3
18 
19   This program is free software: you can redistribute it and/or modify
20   it under the terms of the GNU General Public License as published by
21   the Free Software Foundation, either version 3 of the License, or
22   (at your option) any later version.
23 
24   This program is distributed in the hope that it will be useful,
25   but WITHOUT ANY WARRANTY; without even the implied warranty of
26   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
27   GNU General Public License for more details.
28 
29   You should have received a copy of the GNU General Public License
30   along with this program.  If not, see <http://www.gnu.org/licenses/>.
31 
32 
33   The BSD 2-Clause License
34 
35   Redistribution and use in source and binary forms, with or without
36   modification, are permitted provided that the following conditions
37   are met:
38 
39   1. Redistributions of source code must retain the above copyright
40   notice, this list of conditions and the following disclaimer.
41 
42   2. Redistributions in binary form must reproduce the above copyright
43   notice, this list of conditions and the following disclaimer in the
44   documentation and/or other materials provided with the distribution.
45 
46   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
47   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
48   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
49   FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
50   COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
51   INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
52   BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
53   LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
54   CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55   LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
56   ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
57   POSSIBILITY OF SUCH DAMAGE.
58 
59 */
60 
61 #include "vsearch.h"
62 
fasta_open(const char * filename)63 fastx_handle fasta_open(const char * filename)
64 {
65   fastx_handle h = fastx_open(filename);
66 
67   if (fastx_is_fastq(h) && ! h->is_empty) {
68     fatal("FASTA file expected, FASTQ file found (%s)", filename);
69 
70         }
71 
72   return h;
73 }
74 
fasta_close(fastx_handle h)75 void fasta_close(fastx_handle h)
76 {
77   fastx_close(h);
78 }
79 
fasta_filter_sequence(fastx_handle h,unsigned int * char_action,const unsigned char * char_mapping)80 void fasta_filter_sequence(fastx_handle h,
81                            unsigned int * char_action,
82                            const unsigned char * char_mapping)
83 {
84   /* Strip unwanted characters from the sequence and raise warnings or
85      errors on certain characters. */
86 
87   char * p = h->sequence_buffer.data;
88   char * q = p;
89   char c;
90   char msg[200];
91 
92   while ((c = *p++))
93     {
94       char m = char_action[(unsigned char)c];
95 
96       switch(m)
97         {
98         case 0:
99           /* stripped */
100           h->stripped_all++;
101           h->stripped[(unsigned char)c]++;
102           break;
103 
104         case 1:
105           /* legal character */
106           *q++ = char_mapping[(unsigned char)(c)];
107           break;
108 
109         case 2:
110           /* fatal character */
111           if ((c>=32) && (c<127)) {
112             snprintf(msg,
113                      200,
114                      "Illegal character '%c' in sequence on line %" PRIu64 " of FASTA file",
115                      (unsigned char)c,
116                      h->lineno);
117           } else {
118             snprintf(msg,
119                      200,
120                      "Illegal unprintable ASCII character no %d in sequence on line %" PRIu64 " of FASTA file",
121                      (unsigned char) c,
122                      h->lineno);
123 
124         }
125           fatal(msg);
126           break;
127 
128         case 3:
129           /* silently stripped chars (whitespace) */
130           break;
131 
132         case 4:
133           /* newline (silently stripped) */
134           h->lineno++;
135           break;
136         }
137     }
138 
139   /* add zero after sequence */
140   *q = 0;
141   h->sequence_buffer.length = q - h->sequence_buffer.data;
142 }
143 
fasta_next(fastx_handle h,bool truncateatspace,const unsigned char * char_mapping)144 bool fasta_next(fastx_handle h,
145                 bool truncateatspace,
146                 const unsigned char * char_mapping)
147 {
148   h->lineno_start = h->lineno;
149 
150   h->header_buffer.length = 0;
151   h->header_buffer.data[0] = 0;
152   h->sequence_buffer.length = 0;
153   h->sequence_buffer.data[0] = 0;
154 
155   uint64_t rest = fastx_file_fill_buffer(h);
156 
157   if (rest == 0) {
158     return false;
159 
160         }
161 
162   /* read header */
163 
164   /* check initial > character */
165 
166   if (h->file_buffer.data[h->file_buffer.position] != '>')
167     {
168       fprintf(stderr, "Found character %02x\n", (unsigned char)(h->file_buffer.data[h->file_buffer.position]));
169       fatal("Invalid FASTA - header must start with > character");
170     }
171   h->file_buffer.position++;
172   rest--;
173 
174   char * lf = nullptr;
175   while (lf == nullptr)
176     {
177       /* get more data if buffer empty*/
178       rest = fastx_file_fill_buffer(h);
179       if (rest == 0) {
180         fatal("Invalid FASTA - header must be terminated with newline");
181 
182         }
183 
184       /* find LF */
185       lf = (char *) memchr(h->file_buffer.data + h->file_buffer.position,
186                            '\n',
187                            rest);
188 
189       /* copy to header buffer */
190       uint64_t len = rest;
191       if (lf)
192         {
193           /* LF found, copy up to and including LF */
194           len = lf - (h->file_buffer.data + h->file_buffer.position) + 1;
195           h->lineno++;
196         }
197       buffer_extend(& h->header_buffer,
198                     h->file_buffer.data + h->file_buffer.position,
199                     len);
200       h->file_buffer.position += len;
201       rest -= len;
202     }
203 
204   /* read one or more sequence lines */
205 
206   while (true)
207     {
208       /* get more data, if necessary */
209       rest = fastx_file_fill_buffer(h);
210 
211       /* end if no more data */
212       if (rest == 0) {
213         break;
214 
215         }
216 
217       /* end if new sequence starts */
218       if (lf && (h->file_buffer.data[h->file_buffer.position] == '>')) {
219         break;
220 
221         }
222 
223       /* find LF */
224       lf = (char *) memchr(h->file_buffer.data + h->file_buffer.position,
225                            '\n', rest);
226 
227       uint64_t len = rest;
228       if (lf)
229         {
230           /* LF found, copy up to and including LF */
231           len = lf - (h->file_buffer.data + h->file_buffer.position) + 1;
232         }
233       buffer_extend(& h->sequence_buffer,
234                     h->file_buffer.data + h->file_buffer.position,
235                     len);
236       h->file_buffer.position += len;
237       rest -= len;
238     }
239 
240   h->seqno++;
241 
242   fastx_filter_header(h, truncateatspace);
243   fasta_filter_sequence(h, char_fasta_action, char_mapping);
244 
245   return true;
246 }
247 
fasta_get_abundance(fastx_handle h)248 int64_t fasta_get_abundance(fastx_handle h)
249 {
250   // return 1 if not present
251   int64_t size = header_get_size(h->header_buffer.data,
252                                  h->header_buffer.length);
253   if (size > 0) {
254     return size;
255   } else {
256     return 1;
257 
258         }
259 }
260 
fasta_get_abundance_and_presence(fastx_handle h)261 int64_t fasta_get_abundance_and_presence(fastx_handle h)
262 {
263   // return 0 if not present
264   return header_get_size(h->header_buffer.data, h->header_buffer.length);
265 }
266 
fasta_get_position(fastx_handle h)267 uint64_t fasta_get_position(fastx_handle h)
268 {
269   return h->file_position;
270 }
271 
fasta_get_size(fastx_handle h)272 uint64_t fasta_get_size(fastx_handle h)
273 {
274   return h->file_size;
275 }
276 
fasta_get_lineno(fastx_handle h)277 uint64_t fasta_get_lineno(fastx_handle h)
278 {
279   return h->lineno_start;
280 }
281 
fasta_get_seqno(fastx_handle h)282 uint64_t fasta_get_seqno(fastx_handle h)
283 {
284   return h->seqno;
285 }
286 
fasta_get_header_length(fastx_handle h)287 uint64_t fasta_get_header_length(fastx_handle h)
288 {
289   return h->header_buffer.length;
290 }
291 
fasta_get_sequence_length(fastx_handle h)292 uint64_t fasta_get_sequence_length(fastx_handle h)
293 {
294   return h->sequence_buffer.length;
295 }
296 
fasta_get_header(fastx_handle h)297 char * fasta_get_header(fastx_handle h)
298 {
299   return h->header_buffer.data;
300 }
301 
fasta_get_sequence(fastx_handle h)302 char * fasta_get_sequence(fastx_handle h)
303 {
304   return h->sequence_buffer.data;
305 }
306 
307 
308 /* fasta output */
309 
fasta_print_sequence(FILE * fp,char * seq,uint64_t len,int width)310 void fasta_print_sequence(FILE * fp, char * seq, uint64_t len, int width)
311 {
312   /*
313      The actual length of the sequence may be longer than "len", but only
314      "len" characters are printed.
315 
316      Specify width of lines - zero (or <1) means linearize (all on one line).
317   */
318 
319   if (width < 1) {
320     fprintf(fp, "%.*s\n", (int)(len), seq);
321   } else
322     {
323       int64_t rest = len;
324       for(uint64_t i=0; i<len; i += width)
325         {
326           fprintf(fp, "%.*s\n", (int)(MIN(rest,width)), seq+i);
327           rest -= width;
328         }
329     }
330 }
331 
fasta_print(FILE * fp,const char * hdr,char * seq,uint64_t len)332 void fasta_print(FILE * fp, const char * hdr,
333                  char * seq, uint64_t len)
334 {
335   fprintf(fp, ">%s\n", hdr);
336   fasta_print_sequence(fp, seq, len, opt_fasta_width);
337 }
338 
fprint_seq_label(FILE * fp,char * seq,int len)339 inline void fprint_seq_label(FILE * fp, char * seq, int len)
340 {
341   /* normalize first? */
342   fprintf(fp, "%.*s", len, seq);
343 }
344 
fasta_print_general(FILE * fp,const char * prefix,char * seq,int len,char * header,int header_len,unsigned int abundance,int ordinal,double ee,int clustersize,int clusterid,const char * score_name,double score)345 void fasta_print_general(FILE * fp,
346                          const char * prefix,
347                          char * seq,
348                          int len,
349                          char * header,
350                          int header_len,
351                          unsigned int abundance,
352                          int ordinal,
353                          double ee,
354                          int clustersize,
355                          int clusterid,
356                          const char * score_name,
357                          double score)
358 {
359   fprintf(fp, ">");
360 
361   if (prefix) {
362     fprintf(fp, "%s", prefix);
363 
364         }
365 
366   if (opt_relabel_self) {
367     fprint_seq_label(fp, seq, len);
368   } else if (opt_relabel_sha1) {
369     fprint_seq_digest_sha1(fp, seq, len);
370   } else if (opt_relabel_md5) {
371     fprint_seq_digest_md5(fp, seq, len);
372   } else if (opt_relabel && (ordinal > 0)) {
373     fprintf(fp, "%s%d", opt_relabel, ordinal);
374   } else
375     {
376       bool xsize = opt_xsize || (opt_sizeout && (abundance > 0));
377       bool xee = opt_xee || ((opt_eeout || opt_fastq_eeout) && (ee >= 0.0));
378       header_fprint_strip_size_ee(fp,
379                                   header,
380                                   header_len,
381                                   xsize,
382                                   xee);
383     }
384 
385   if (opt_label_suffix) {
386     fprintf(fp, "%s", opt_label_suffix);
387 
388         }
389 
390   if (clustersize > 0) {
391     fprintf(fp, ";seqs=%d", clustersize);
392 
393         }
394 
395   if (clusterid >= 0) {
396     fprintf(fp, ";clusterid=%d", clusterid);
397 
398         }
399 
400   if (opt_sizeout && (abundance > 0)) {
401     fprintf(fp, ";size=%u", abundance);
402 
403         }
404 
405   if ((opt_eeout || opt_fastq_eeout) && (ee >= 0.0)) {
406     fprintf(fp, ";ee=%.4lf", ee);
407 
408         }
409 
410   if (score_name) {
411     fprintf(fp, ";%s=%.4lf", score_name, score);
412 
413         }
414 
415   if (opt_relabel_keep &&
416       ((opt_relabel && (ordinal > 0)) || opt_relabel_sha1 || opt_relabel_md5 || opt_relabel_self)) {
417     fprintf(fp, " %s", header);
418 
419         }
420 
421   fprintf(fp, "\n");
422 
423   if (seq) {
424     fasta_print_sequence(fp, seq, len, opt_fasta_width);
425 
426         }
427 }
428 
fasta_print_db_relabel(FILE * fp,uint64_t seqno,int ordinal)429 void fasta_print_db_relabel(FILE * fp,
430                             uint64_t seqno,
431                             int ordinal)
432 {
433   fasta_print_general(fp,
434                       nullptr,
435                       db_getsequence(seqno),
436                       db_getsequencelen(seqno),
437                       db_getheader(seqno),
438                       db_getheaderlen(seqno),
439                       db_getabundance(seqno),
440                       ordinal,
441                       -1.0,
442                       -1, -1,
443                       nullptr, 0.0);
444 }
445 
fasta_print_db(FILE * fp,uint64_t seqno)446 void fasta_print_db(FILE * fp, uint64_t seqno)
447 {
448   fasta_print_general(fp,
449                       nullptr,
450                       db_getsequence(seqno),
451                       db_getsequencelen(seqno),
452                       db_getheader(seqno),
453                       db_getheaderlen(seqno),
454                       db_getabundance(seqno),
455                       0,
456                       -1.0,
457                       -1, -1,
458                       nullptr, 0.0);
459 }
460