1 /*
2
3 VSEARCH: a versatile open source tool for metagenomics
4
5 Copyright (C) 2014-2021, Torbjorn Rognes, Frederic Mahe and Tomas Flouri
6 All rights reserved.
7
8 Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
9 Department of Informatics, University of Oslo,
10 PO Box 1080 Blindern, NO-0316 Oslo, Norway
11
12 This software is dual-licensed and available under a choice
13 of one of two licenses, either under the terms of the GNU
14 General Public License version 3 or the BSD 2-Clause License.
15
16
17 GNU General Public License version 3
18
19 This program is free software: you can redistribute it and/or modify
20 it under the terms of the GNU General Public License as published by
21 the Free Software Foundation, either version 3 of the License, or
22 (at your option) any later version.
23
24 This program is distributed in the hope that it will be useful,
25 but WITHOUT ANY WARRANTY; without even the implied warranty of
26 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27 GNU General Public License for more details.
28
29 You should have received a copy of the GNU General Public License
30 along with this program. If not, see <http://www.gnu.org/licenses/>.
31
32
33 The BSD 2-Clause License
34
35 Redistribution and use in source and binary forms, with or without
36 modification, are permitted provided that the following conditions
37 are met:
38
39 1. Redistributions of source code must retain the above copyright
40 notice, this list of conditions and the following disclaimer.
41
42 2. Redistributions in binary form must reproduce the above copyright
43 notice, this list of conditions and the following disclaimer in the
44 documentation and/or other materials provided with the distribution.
45
46 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
47 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
48 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
49 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
50 COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
51 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
52 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
53 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
54 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
56 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
57 POSSIBILITY OF SUCH DAMAGE.
58
59 */
60
61 #include "vsearch.h"
62
63 uint32_t sff_magic = 0x2e736666;
64
65 struct sff_header_s
66 {
67 uint32_t magic_number; /* .sff */
68 uint32_t version;
69 uint64_t index_offset;
70 uint32_t index_length;
71 uint32_t number_of_reads;
72 uint16_t header_length;
73 uint16_t key_length;
74 uint16_t flows_per_read;
75 uint8_t flowgram_format_code;
76 } sff_header;
77
78 struct sff_read_header_s
79 {
80 uint16_t read_header_length;
81 uint16_t name_length;
82 uint32_t number_of_bases;
83 uint16_t clip_qual_left;
84 uint16_t clip_qual_right;
85 uint16_t clip_adapter_left;
86 uint16_t clip_adapter_right;
87 } read_header;
88
fskip(FILE * fp,uint64_t length)89 uint64_t fskip(FILE * fp, uint64_t length)
90 {
91 /* read given amount of data from a stream and ignore it */
92 /* used instead of seeking in order to work with pipes */
93 #define BLOCKSIZE 4096
94 char buffer[BLOCKSIZE];
95
96 uint64_t skipped = 0;
97 uint64_t rest = length;
98
99 while (rest > 0)
100 {
101 uint64_t want = ((rest > BLOCKSIZE) ? BLOCKSIZE : rest);
102 uint64_t got = fread(buffer, 1, want, fp);
103 skipped += got;
104 rest -= got;
105 if (got < want) {
106 break;
107
108 }
109 }
110 return skipped;
111 }
112
sff_convert()113 void sff_convert()
114 {
115 if (! opt_fastqout) {
116 fatal("No output file for sff_convert specified with --fastqout.");
117
118 }
119
120 FILE * fp_fastqout = fopen_output(opt_fastqout);
121 if (!fp_fastqout) {
122 fatal("Unable to open FASTQ output file for writing.");
123
124 }
125
126 FILE * fp_sff = fopen_input(opt_sff_convert);
127 if (!fp_sff) {
128 fatal("Unable to open SFF input file for reading.");
129
130 }
131
132 /* read and check header */
133
134 uint64_t filepos = 0;
135
136 if (fread(&sff_header, 1, 31, fp_sff) < 31) {
137 fatal("Unable to read from SFF file. File may be truncated.");
138
139 }
140 filepos += 31;
141
142 sff_header.magic_number = bswap_32(sff_header.magic_number);
143 sff_header.version = bswap_32(sff_header.version);
144 sff_header.index_offset = bswap_64(sff_header.index_offset);
145 sff_header.index_length = bswap_32(sff_header.index_length);
146 sff_header.number_of_reads = bswap_32(sff_header.number_of_reads);
147 sff_header.header_length = bswap_16(sff_header.header_length);
148 sff_header.key_length = bswap_16(sff_header.key_length);
149 sff_header.flows_per_read = bswap_16(sff_header.flows_per_read);
150
151 if (sff_header.magic_number != sff_magic) {
152 fatal("Invalid SFF file. Incorrect magic number. Must be 0x2e736666 (.sff).");
153
154 }
155
156 if (sff_header.version != 1) {
157 fatal("Invalid SFF file. Incorrect version. Must be 1.");
158
159 }
160
161 if (sff_header.flowgram_format_code != 1) {
162 fatal("Invalid SFF file. Incorrect flowgram format code. Must be 1.");
163
164 }
165
166 if (sff_header.header_length != 8 * ((31 + sff_header.flows_per_read + sff_header.key_length + 7) / 8)) {
167 fatal("Invalid SFF file. Incorrect header length.");
168
169 }
170
171 if (sff_header.key_length != 4) {
172 fatal("Invalid SFF file. Incorrect key length. Must be 4.");
173
174 }
175
176 if ((sff_header.index_length > 0) && (sff_header.index_length < 8)) {
177 fatal("Invalid SFF file. Incorrect index size. Must be at least 8.");
178
179 }
180
181 /* read and check flow chars, key and padding */
182
183 if (fskip(fp_sff, sff_header.flows_per_read) < sff_header.flows_per_read) {
184 fatal("Invalid SFF file. Unable to read flow characters. File may be truncated.");
185
186 }
187 filepos += sff_header.flows_per_read;
188
189 char * key_sequence = (char *) xmalloc(sff_header.key_length + 1);
190 if (fread(key_sequence, 1, sff_header.key_length, fp_sff) < sff_header.key_length) {
191 fatal("Invalid SFF file. Unable to read key sequence. File may be truncated.");
192
193 }
194 key_sequence[sff_header.key_length] = 0;
195 filepos += sff_header.key_length;
196
197 uint32_t padding_length = sff_header.header_length - sff_header.flows_per_read - sff_header.key_length - 31;
198 if (fskip(fp_sff, padding_length) < padding_length) {
199 fatal("Invalid SFF file. Unable to read padding. File may be truncated.");
200
201 }
202 filepos += padding_length;
203
204 double totallength = 0.0;
205 uint32_t minimum = UINT_MAX;
206 uint32_t maximum = 0;
207
208 bool index_done = (sff_header.index_offset == 0) || (sff_header.index_length == 0);
209 bool index_odd = false;
210 char index_kind[9];
211
212 uint32_t index_padding = 0;
213 if ((sff_header.index_length & 7) > 0) {
214 index_padding = 8 - (sff_header.index_length & 7);
215
216 }
217
218 if (! opt_quiet)
219 {
220 fprintf(stderr, "Number of reads: %d\n", sff_header.number_of_reads);
221 fprintf(stderr, "Flows per read: %d\n", sff_header.flows_per_read);
222 fprintf(stderr, "Key sequence: %s\n", key_sequence);
223 }
224
225 if (opt_log)
226 {
227 fprintf(fp_log, "Number of reads: %d\n", sff_header.number_of_reads);
228 fprintf(fp_log, "Flows per read: %d\n", sff_header.flows_per_read);
229 fprintf(fp_log, "Key sequence: %s\n", key_sequence);
230 }
231
232 progress_init("Converting SFF: ", sff_header.number_of_reads);
233
234 for(uint32_t read_no = 0; read_no < sff_header.number_of_reads; read_no++)
235 {
236 /* check if the index block is here */
237
238 if (! index_done)
239 {
240 if (filepos == sff_header.index_offset)
241 {
242 if (fread(index_kind, 1, 8, fp_sff) < 8) {
243 fatal("Invalid SFF file. Unable to read index header. File may be truncated.");
244
245 }
246 filepos += 8;
247 index_kind[8] = 0;
248
249 uint64 index_size = sff_header.index_length - 8 + index_padding;
250 if (fskip(fp_sff, index_size) != index_size) {
251 fatal("Invalid SFF file. Unable to read entire index. File may be truncated.");
252
253 }
254
255 filepos += index_size;
256 index_done = true;
257 index_odd = true;
258 }
259 }
260
261 /* read and check each read header */
262
263 if (fread(&read_header, 1, 16, fp_sff) < 16) {
264 fatal("Invalid SFF file. Unable to read read header. File may be truncated.");
265
266 }
267 filepos += 16;
268
269 read_header.read_header_length = bswap_16(read_header.read_header_length);
270 read_header.name_length = bswap_16(read_header.name_length);
271 read_header.number_of_bases = bswap_32(read_header.number_of_bases);
272 read_header.clip_qual_left = bswap_16(read_header.clip_qual_left);
273 read_header.clip_qual_right = bswap_16(read_header.clip_qual_right);
274 read_header.clip_adapter_left = bswap_16(read_header.clip_adapter_left);
275 read_header.clip_adapter_right = bswap_16(read_header.clip_adapter_right);
276
277 if (read_header.read_header_length != 8 * ((16 + read_header.name_length + 7) / 8)) {
278 fatal("Invalid SFF file. Incorrect read header length.");
279
280 }
281 if (read_header.clip_qual_left > read_header.number_of_bases) {
282 fatal("Invalid SFF file. Incorrect clip_qual_left value.");
283
284 }
285 if (read_header.clip_adapter_left > read_header.number_of_bases) {
286 fatal("Invalid SFF file. Incorrect clip_adapter_left value.");
287
288 }
289 if (read_header.clip_qual_right > read_header.number_of_bases) {
290 fatal("Invalid SFF file. Incorrect clip_qual_right value.");
291
292 }
293 if (read_header.clip_adapter_right > read_header.number_of_bases) {
294 fatal("Invalid SFF file. Incorrect clip_adapter_right value.");
295
296 }
297
298 char * read_name = (char *) xmalloc(read_header.name_length + 1);
299 if (fread(read_name, 1, read_header.name_length, fp_sff) < read_header.name_length) {
300 fatal("Invalid SFF file. Unable to read read name. File may be truncated.");
301
302 }
303 filepos += read_header.name_length;
304 read_name[read_header.name_length] = 0;
305
306 uint32_t read_header_padding_length = read_header.read_header_length - read_header.name_length - 16;
307 if (fskip(fp_sff, read_header_padding_length) < read_header_padding_length) {
308 fatal("Invalid SFF file. Unable to read read header padding. File may be truncated.");
309
310 }
311 filepos += read_header_padding_length;
312
313 /* read and check the flowgram and sequence */
314
315 if (fskip(fp_sff, 2 * sff_header.flows_per_read) < sff_header.flows_per_read) {
316 fatal("Invalid SFF file. Unable to read flowgram values. File may be truncated.");
317
318 }
319 filepos += 2 * sff_header.flows_per_read;
320
321 if (fskip(fp_sff, read_header.number_of_bases) < read_header.number_of_bases) {
322 fatal("Invalid SFF file. Unable to read flow indices. File may be truncated.");
323
324 }
325 filepos += read_header.number_of_bases;
326
327 char * bases = (char *) xmalloc(read_header.number_of_bases + 1);
328 if (fread(bases, 1, read_header.number_of_bases, fp_sff) < read_header.number_of_bases) {
329 fatal("Invalid SFF file. Unable to read read length. File may be truncated.");
330
331 }
332 bases[read_header.number_of_bases] = 0;
333 filepos += read_header.number_of_bases;
334
335 char * qual = (char *) xmalloc(read_header.number_of_bases + 1);
336 if (fread(qual, 1, read_header.number_of_bases, fp_sff) < read_header.number_of_bases) {
337 fatal("Invalid SFF file. Unable to read quality scores. File may be truncated.");
338
339 }
340 filepos += read_header.number_of_bases;
341
342 /* convert quality scores to ascii characters */
343 for(uint32_t base_no = 0; base_no < read_header.number_of_bases; base_no++)
344 {
345 int q = qual[base_no];
346 if (q < opt_fastq_qminout) {
347 q = opt_fastq_qminout;
348
349 }
350 if (q > opt_fastq_qmaxout) {
351 q = opt_fastq_qmaxout;
352
353 }
354 qual[base_no] = opt_fastq_asciiout + q;
355 }
356 qual[read_header.number_of_bases] = 0;
357
358 uint32_t read_data_length = (2 * sff_header.flows_per_read + 3 * read_header.number_of_bases);
359 uint32_t read_data_padded_length = 8 * ((read_data_length + 7) / 8);
360 uint32_t read_data_padding_length = read_data_padded_length - read_data_length;
361
362 if (fskip(fp_sff, read_data_padding_length) < read_data_padding_length) {
363 fatal("Invalid SFF file. Unable to read read data padding. File may be truncated.");
364
365 }
366 filepos += read_data_padding_length;
367
368 uint32_t clip_start = 0;
369 clip_start = MAX(1, MAX(read_header.clip_qual_left, read_header.clip_adapter_left)) - 1;
370
371 uint32_t clip_end = read_header.number_of_bases;
372 clip_end = MIN((read_header.clip_qual_right == 0 ? read_header.number_of_bases : read_header.clip_qual_right), (read_header.clip_adapter_right == 0 ? read_header.number_of_bases : read_header.clip_adapter_right));
373
374 /* make the clipped bases lowercase and the rest uppercase */
375 for (uint32_t i = 0; i < read_header.number_of_bases; i++)
376 {
377 if ((i < clip_start) || (i >= clip_end)) {
378 bases[i] = tolower(bases[i]);
379 } else {
380 bases[i] = toupper(bases[i]);
381
382 }
383 }
384
385 if (opt_sff_clip)
386 {
387 bases[clip_end] = 0;
388 qual[clip_end] = 0;
389 }
390 else
391 {
392 clip_start = 0;
393 clip_end = read_header.number_of_bases;
394 }
395
396 uint32_t length = clip_end - clip_start;
397
398 fastq_print_general(fp_fastqout,
399 bases + clip_start,
400 length,
401 read_name,
402 strlen(read_name),
403 qual + clip_start,
404 1, read_no + 1, -1.0);
405
406 xfree(read_name);
407 xfree(bases);
408 xfree(qual);
409
410 totallength += length;
411 if (length < minimum) {
412 minimum = length;
413
414 }
415 if (length > maximum) {
416 maximum = length;
417
418 }
419
420 progress_update(read_no + 1);
421 }
422 progress_done();
423
424 /* check if the index block is here */
425
426 if (! index_done)
427 {
428 if (filepos == sff_header.index_offset)
429 {
430 if (fread(index_kind, 1, 8, fp_sff) < 8) {
431 fatal("Invalid SFF file. Unable to read index header. File may be truncated.");
432
433 }
434 filepos += 8;
435 index_kind[8] = 0;
436
437 uint64 index_size = sff_header.index_length - 8;
438 if (fskip(fp_sff, index_size) != index_size) {
439 fatal("Invalid SFF file. Unable to read entire index. File may be truncated.");
440
441 }
442
443 filepos += index_size;
444 index_done = true;
445
446 /* try to skip padding, if any */
447
448 if (index_padding > 0)
449 {
450 uint64_t got = fskip(fp_sff, index_padding);
451 if ((got < index_padding) && (got != 0)) {
452 fprintf(stderr, "WARNING: Additional data at end of SFF file ignored\n");
453
454 }
455 }
456 }
457 }
458
459 if (! index_done)
460 {
461 fprintf(stderr, "WARNING: SFF index missing\n");
462 if (opt_log) {
463 fprintf(fp_log, "WARNING: SFF index missing\n");
464
465 }
466 }
467
468 if (index_odd)
469 {
470 fprintf(stderr, "WARNING: Index at unusual position in file\n");
471 if (opt_log) {
472 fprintf(fp_log, "WARNING: Index at unusual position in file\n");
473
474 }
475 }
476
477 /* ignore the rest of file */
478
479 /* try reading just another byte */
480
481 if (fskip(fp_sff, 1) > 0)
482 {
483 fprintf(stderr, "WARNING: Additional data at end of SFF file ignored\n");
484 if (opt_log) {
485 fprintf(fp_log, "WARNING: Additional data at end of SFF file ignored\n");
486
487 }
488 }
489
490 fclose(fp_sff);
491 fclose(fp_fastqout);
492
493 double average = totallength / sff_header.number_of_reads;
494
495 if (! opt_quiet)
496 {
497 if (sff_header.index_length > 0) {
498 fprintf(stderr, "Index type: %s\n", index_kind);
499
500 }
501 fprintf(stderr, "\nSFF file read successfully.\n");
502 if (sff_header.number_of_reads > 0) {
503 fprintf(stderr, "Sequence length: minimum %d, average %.1f, maximum %d\n",
504 minimum,
505 average,
506 maximum);
507
508 }
509 }
510
511 if (opt_log)
512 {
513 if (sff_header.index_length > 0) {
514 fprintf(fp_log, "Index type: %s\n", index_kind);
515
516 }
517 fprintf(fp_log, "\nSFF file read successfully.\n");
518 if (sff_header.number_of_reads > 0) {
519 fprintf(fp_log, "Sequence length: minimum %d, average %.1f, maximum %d\n",
520 minimum,
521 average,
522 maximum);
523
524 }
525 }
526
527 xfree(key_sequence);
528 }
529