1 /*
2 
3   VSEARCH: a versatile open source tool for metagenomics
4 
5   Copyright (C) 2014-2021, Torbjorn Rognes, Frederic Mahe and Tomas Flouri
6   All rights reserved.
7 
8   Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
9   Department of Informatics, University of Oslo,
10   PO Box 1080 Blindern, NO-0316 Oslo, Norway
11 
12   This software is dual-licensed and available under a choice
13   of one of two licenses, either under the terms of the GNU
14   General Public License version 3 or the BSD 2-Clause License.
15 
16 
17   GNU General Public License version 3
18 
19   This program is free software: you can redistribute it and/or modify
20   it under the terms of the GNU General Public License as published by
21   the Free Software Foundation, either version 3 of the License, or
22   (at your option) any later version.
23 
24   This program is distributed in the hope that it will be useful,
25   but WITHOUT ANY WARRANTY; without even the implied warranty of
26   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
27   GNU General Public License for more details.
28 
29   You should have received a copy of the GNU General Public License
30   along with this program.  If not, see <http://www.gnu.org/licenses/>.
31 
32 
33   The BSD 2-Clause License
34 
35   Redistribution and use in source and binary forms, with or without
36   modification, are permitted provided that the following conditions
37   are met:
38 
39   1. Redistributions of source code must retain the above copyright
40   notice, this list of conditions and the following disclaimer.
41 
42   2. Redistributions in binary form must reproduce the above copyright
43   notice, this list of conditions and the following disclaimer in the
44   documentation and/or other materials provided with the distribution.
45 
46   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
47   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
48   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
49   FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
50   COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
51   INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
52   BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
53   LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
54   CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55   LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
56   ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
57   POSSIBILITY OF SUCH DAMAGE.
58 
59 */
60 
61 #include "vsearch.h"
62 
63 uint32_t sff_magic = 0x2e736666;
64 
65 struct sff_header_s
66 {
67   uint32_t magic_number; /* .sff */
68   uint32_t version;
69   uint64_t index_offset;
70   uint32_t index_length;
71   uint32_t number_of_reads;
72   uint16_t header_length;
73   uint16_t key_length;
74   uint16_t flows_per_read;
75   uint8_t  flowgram_format_code;
76 } sff_header;
77 
78 struct sff_read_header_s
79 {
80   uint16_t read_header_length;
81   uint16_t name_length;
82   uint32_t number_of_bases;
83   uint16_t clip_qual_left;
84   uint16_t clip_qual_right;
85   uint16_t clip_adapter_left;
86   uint16_t clip_adapter_right;
87 } read_header;
88 
fskip(FILE * fp,uint64_t length)89 uint64_t fskip(FILE * fp, uint64_t length)
90 {
91   /* read given amount of data from a stream and ignore it */
92   /* used instead of seeking in order to work with pipes   */
93 #define BLOCKSIZE 4096
94   char buffer[BLOCKSIZE];
95 
96   uint64_t skipped = 0;
97   uint64_t rest = length;
98 
99   while (rest > 0)
100     {
101       uint64_t want = ((rest > BLOCKSIZE) ? BLOCKSIZE : rest);
102       uint64_t got = fread(buffer, 1, want, fp);
103       skipped += got;
104       rest -= got;
105       if (got < want) {
106         break;
107 
108         }
109     }
110   return skipped;
111 }
112 
sff_convert()113 void sff_convert()
114 {
115   if (! opt_fastqout) {
116     fatal("No output file for sff_convert specified with --fastqout.");
117 
118         }
119 
120   FILE * fp_fastqout = fopen_output(opt_fastqout);
121   if (!fp_fastqout) {
122     fatal("Unable to open FASTQ output file for writing.");
123 
124         }
125 
126   FILE * fp_sff = fopen_input(opt_sff_convert);
127   if (!fp_sff) {
128     fatal("Unable to open SFF input file for reading.");
129 
130         }
131 
132   /* read and check header */
133 
134   uint64_t filepos = 0;
135 
136   if (fread(&sff_header, 1, 31, fp_sff) < 31) {
137     fatal("Unable to read from SFF file. File may be truncated.");
138 
139         }
140   filepos += 31;
141 
142   sff_header.magic_number    = bswap_32(sff_header.magic_number);
143   sff_header.version         = bswap_32(sff_header.version);
144   sff_header.index_offset    = bswap_64(sff_header.index_offset);
145   sff_header.index_length    = bswap_32(sff_header.index_length);
146   sff_header.number_of_reads = bswap_32(sff_header.number_of_reads);
147   sff_header.header_length   = bswap_16(sff_header.header_length);
148   sff_header.key_length      = bswap_16(sff_header.key_length);
149   sff_header.flows_per_read  = bswap_16(sff_header.flows_per_read);
150 
151   if (sff_header.magic_number != sff_magic) {
152     fatal("Invalid SFF file. Incorrect magic number. Must be 0x2e736666 (.sff).");
153 
154         }
155 
156   if (sff_header.version != 1) {
157     fatal("Invalid SFF file. Incorrect version. Must be 1.");
158 
159         }
160 
161   if (sff_header.flowgram_format_code != 1) {
162     fatal("Invalid SFF file. Incorrect flowgram format code. Must be 1.");
163 
164         }
165 
166   if (sff_header.header_length != 8 * ((31 + sff_header.flows_per_read + sff_header.key_length + 7) / 8)) {
167     fatal("Invalid SFF file. Incorrect header length.");
168 
169         }
170 
171   if (sff_header.key_length != 4) {
172     fatal("Invalid SFF file. Incorrect key length. Must be 4.");
173 
174         }
175 
176   if ((sff_header.index_length > 0) && (sff_header.index_length < 8)) {
177     fatal("Invalid SFF file. Incorrect index size. Must be at least 8.");
178 
179         }
180 
181   /* read and check flow chars, key and padding */
182 
183   if (fskip(fp_sff, sff_header.flows_per_read) < sff_header.flows_per_read) {
184     fatal("Invalid SFF file. Unable to read flow characters. File may be truncated.");
185 
186         }
187   filepos += sff_header.flows_per_read;
188 
189   char * key_sequence = (char *) xmalloc(sff_header.key_length + 1);
190   if (fread(key_sequence, 1, sff_header.key_length, fp_sff) < sff_header.key_length) {
191     fatal("Invalid SFF file. Unable to read key sequence. File may be truncated.");
192 
193         }
194   key_sequence[sff_header.key_length] = 0;
195   filepos += sff_header.key_length;
196 
197   uint32_t padding_length = sff_header.header_length - sff_header.flows_per_read - sff_header.key_length - 31;
198   if (fskip(fp_sff, padding_length) < padding_length) {
199     fatal("Invalid SFF file. Unable to read padding. File may be truncated.");
200 
201         }
202   filepos += padding_length;
203 
204   double totallength = 0.0;
205   uint32_t minimum = UINT_MAX;
206   uint32_t maximum = 0;
207 
208   bool index_done = (sff_header.index_offset == 0) || (sff_header.index_length == 0);
209   bool index_odd = false;
210   char index_kind[9];
211 
212   uint32_t index_padding = 0;
213   if ((sff_header.index_length & 7) > 0) {
214     index_padding = 8 - (sff_header.index_length & 7);
215 
216         }
217 
218   if (! opt_quiet)
219     {
220       fprintf(stderr, "Number of reads: %d\n", sff_header.number_of_reads);
221       fprintf(stderr, "Flows per read:  %d\n", sff_header.flows_per_read);
222       fprintf(stderr, "Key sequence:    %s\n", key_sequence);
223     }
224 
225   if (opt_log)
226     {
227       fprintf(fp_log, "Number of reads: %d\n", sff_header.number_of_reads);
228       fprintf(fp_log, "Flows per read:  %d\n", sff_header.flows_per_read);
229       fprintf(fp_log, "Key sequence:    %s\n", key_sequence);
230     }
231 
232   progress_init("Converting SFF: ", sff_header.number_of_reads);
233 
234   for(uint32_t read_no = 0; read_no < sff_header.number_of_reads; read_no++)
235     {
236       /* check if the index block is here */
237 
238       if (! index_done)
239         {
240           if (filepos == sff_header.index_offset)
241             {
242               if (fread(index_kind, 1, 8, fp_sff) < 8) {
243                 fatal("Invalid SFF file. Unable to read index header. File may be truncated.");
244 
245         }
246               filepos += 8;
247               index_kind[8] = 0;
248 
249               uint64 index_size = sff_header.index_length - 8 + index_padding;
250               if (fskip(fp_sff, index_size) != index_size) {
251                 fatal("Invalid SFF file. Unable to read entire index. File may be truncated.");
252 
253         }
254 
255               filepos += index_size;
256               index_done = true;
257               index_odd = true;
258             }
259         }
260 
261       /* read and check each read header */
262 
263       if (fread(&read_header, 1, 16, fp_sff) < 16) {
264         fatal("Invalid SFF file. Unable to read read header. File may be truncated.");
265 
266         }
267       filepos += 16;
268 
269       read_header.read_header_length = bswap_16(read_header.read_header_length);
270       read_header.name_length = bswap_16(read_header.name_length);
271       read_header.number_of_bases = bswap_32(read_header.number_of_bases);
272       read_header.clip_qual_left = bswap_16(read_header.clip_qual_left);
273       read_header.clip_qual_right = bswap_16(read_header.clip_qual_right);
274       read_header.clip_adapter_left = bswap_16(read_header.clip_adapter_left);
275       read_header.clip_adapter_right = bswap_16(read_header.clip_adapter_right);
276 
277       if (read_header.read_header_length != 8 * ((16 + read_header.name_length + 7) / 8)) {
278         fatal("Invalid SFF file. Incorrect read header length.");
279 
280         }
281       if (read_header.clip_qual_left > read_header.number_of_bases) {
282         fatal("Invalid SFF file. Incorrect clip_qual_left value.");
283 
284         }
285       if (read_header.clip_adapter_left > read_header.number_of_bases) {
286         fatal("Invalid SFF file. Incorrect clip_adapter_left value.");
287 
288         }
289       if (read_header.clip_qual_right > read_header.number_of_bases) {
290         fatal("Invalid SFF file. Incorrect clip_qual_right value.");
291 
292         }
293       if (read_header.clip_adapter_right > read_header.number_of_bases) {
294         fatal("Invalid SFF file. Incorrect clip_adapter_right value.");
295 
296         }
297 
298       char * read_name = (char *) xmalloc(read_header.name_length + 1);
299       if (fread(read_name, 1, read_header.name_length, fp_sff) < read_header.name_length) {
300         fatal("Invalid SFF file. Unable to read read name. File may be truncated.");
301 
302         }
303       filepos += read_header.name_length;
304       read_name[read_header.name_length] = 0;
305 
306       uint32_t read_header_padding_length = read_header.read_header_length - read_header.name_length - 16;
307       if (fskip(fp_sff, read_header_padding_length) < read_header_padding_length) {
308         fatal("Invalid SFF file. Unable to read read header padding. File may be truncated.");
309 
310         }
311       filepos += read_header_padding_length;
312 
313       /* read and check the flowgram and sequence */
314 
315       if (fskip(fp_sff, 2 * sff_header.flows_per_read) < sff_header.flows_per_read) {
316         fatal("Invalid SFF file. Unable to read flowgram values. File may be truncated.");
317 
318         }
319       filepos += 2 * sff_header.flows_per_read;
320 
321       if (fskip(fp_sff, read_header.number_of_bases) < read_header.number_of_bases) {
322         fatal("Invalid SFF file. Unable to read flow indices. File may be truncated.");
323 
324         }
325       filepos += read_header.number_of_bases;
326 
327       char * bases = (char *) xmalloc(read_header.number_of_bases + 1);
328       if (fread(bases, 1, read_header.number_of_bases, fp_sff) < read_header.number_of_bases) {
329         fatal("Invalid SFF file. Unable to read read length. File may be truncated.");
330 
331         }
332       bases[read_header.number_of_bases] = 0;
333       filepos += read_header.number_of_bases;
334 
335       char * qual = (char *) xmalloc(read_header.number_of_bases + 1);
336       if (fread(qual, 1, read_header.number_of_bases, fp_sff) < read_header.number_of_bases) {
337         fatal("Invalid SFF file. Unable to read quality scores. File may be truncated.");
338 
339         }
340       filepos += read_header.number_of_bases;
341 
342       /* convert quality scores to ascii characters */
343       for(uint32_t base_no = 0; base_no < read_header.number_of_bases; base_no++)
344         {
345           int q = qual[base_no];
346           if (q < opt_fastq_qminout) {
347             q = opt_fastq_qminout;
348 
349         }
350           if (q > opt_fastq_qmaxout) {
351             q = opt_fastq_qmaxout;
352 
353         }
354           qual[base_no] = opt_fastq_asciiout + q;
355         }
356       qual[read_header.number_of_bases] = 0;
357 
358       uint32_t read_data_length = (2 * sff_header.flows_per_read + 3 * read_header.number_of_bases);
359       uint32_t read_data_padded_length = 8 * ((read_data_length + 7) / 8);
360       uint32_t read_data_padding_length = read_data_padded_length - read_data_length;
361 
362       if (fskip(fp_sff, read_data_padding_length) < read_data_padding_length) {
363         fatal("Invalid SFF file. Unable to read read data padding. File may be truncated.");
364 
365         }
366       filepos += read_data_padding_length;
367 
368       uint32_t clip_start = 0;
369       clip_start = MAX(1, MAX(read_header.clip_qual_left, read_header.clip_adapter_left)) - 1;
370 
371       uint32_t clip_end = read_header.number_of_bases;
372       clip_end = MIN((read_header.clip_qual_right == 0 ? read_header.number_of_bases : read_header.clip_qual_right), (read_header.clip_adapter_right == 0 ? read_header.number_of_bases : read_header.clip_adapter_right));
373 
374       /* make the clipped bases lowercase and the rest uppercase */
375       for (uint32_t i = 0; i < read_header.number_of_bases; i++)
376         {
377           if ((i < clip_start) || (i >= clip_end)) {
378             bases[i] = tolower(bases[i]);
379           } else {
380             bases[i] = toupper(bases[i]);
381 
382         }
383         }
384 
385       if (opt_sff_clip)
386         {
387           bases[clip_end] = 0;
388           qual[clip_end] = 0;
389         }
390       else
391         {
392           clip_start = 0;
393           clip_end = read_header.number_of_bases;
394         }
395 
396       uint32_t length = clip_end - clip_start;
397 
398       fastq_print_general(fp_fastqout,
399                           bases + clip_start,
400                           length,
401                           read_name,
402                           strlen(read_name),
403                           qual + clip_start,
404                           1, read_no + 1, -1.0);
405 
406       xfree(read_name);
407       xfree(bases);
408       xfree(qual);
409 
410       totallength += length;
411       if (length < minimum) {
412         minimum = length;
413 
414         }
415       if (length > maximum) {
416         maximum = length;
417 
418         }
419 
420       progress_update(read_no + 1);
421     }
422   progress_done();
423 
424   /* check if the index block is here */
425 
426   if (! index_done)
427     {
428       if (filepos == sff_header.index_offset)
429         {
430           if (fread(index_kind, 1, 8, fp_sff) < 8) {
431             fatal("Invalid SFF file. Unable to read index header. File may be truncated.");
432 
433         }
434           filepos += 8;
435           index_kind[8] = 0;
436 
437           uint64 index_size = sff_header.index_length - 8;
438           if (fskip(fp_sff, index_size) != index_size) {
439             fatal("Invalid SFF file. Unable to read entire index. File may be truncated.");
440 
441         }
442 
443           filepos += index_size;
444           index_done = true;
445 
446           /* try to skip padding, if any */
447 
448           if (index_padding > 0)
449             {
450               uint64_t got = fskip(fp_sff, index_padding);
451               if ((got < index_padding) && (got != 0)) {
452                 fprintf(stderr, "WARNING: Additional data at end of SFF file ignored\n");
453 
454         }
455             }
456         }
457     }
458 
459   if (! index_done)
460     {
461       fprintf(stderr, "WARNING: SFF index missing\n");
462       if (opt_log) {
463         fprintf(fp_log, "WARNING: SFF index missing\n");
464 
465         }
466     }
467 
468   if (index_odd)
469     {
470       fprintf(stderr, "WARNING: Index at unusual position in file\n");
471       if (opt_log) {
472         fprintf(fp_log, "WARNING: Index at unusual position in file\n");
473 
474         }
475     }
476 
477   /* ignore the rest of file */
478 
479   /* try reading just another byte */
480 
481   if (fskip(fp_sff, 1) > 0)
482     {
483       fprintf(stderr, "WARNING: Additional data at end of SFF file ignored\n");
484       if (opt_log) {
485         fprintf(fp_log, "WARNING: Additional data at end of SFF file ignored\n");
486 
487         }
488     }
489 
490   fclose(fp_sff);
491   fclose(fp_fastqout);
492 
493   double average = totallength / sff_header.number_of_reads;
494 
495   if (! opt_quiet)
496     {
497       if (sff_header.index_length > 0) {
498         fprintf(stderr, "Index type:      %s\n", index_kind);
499 
500         }
501       fprintf(stderr, "\nSFF file read successfully.\n");
502       if (sff_header.number_of_reads > 0) {
503         fprintf(stderr, "Sequence length: minimum %d, average %.1f, maximum %d\n",
504                 minimum,
505                 average,
506                 maximum);
507 
508         }
509     }
510 
511   if (opt_log)
512     {
513       if (sff_header.index_length > 0) {
514         fprintf(fp_log, "Index type:      %s\n", index_kind);
515 
516         }
517       fprintf(fp_log, "\nSFF file read successfully.\n");
518       if (sff_header.number_of_reads > 0) {
519         fprintf(fp_log, "Sequence length: minimum %d, average %.1f, maximum %d\n",
520                 minimum,
521                 average,
522                 maximum);
523 
524         }
525     }
526 
527   xfree(key_sequence);
528 }
529