1 /*
2 * Copyright (c) 2013 Genome Research Ltd.
3 * Author(s): James Bonfield
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above
12 * copyright notice, this list of conditions and the following
13 * disclaimer in the documentation and/or other materials provided
14 * with the distribution.
15 *
16 * 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
17 * Institute nor the names of its contributors may be used to endorse
18 * or promote products derived from this software without specific
19 * prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
22 * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
24 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
25 * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33
34 /*
35 * Author: James Bonfield, Wellcome Trust Sanger Institute. 2013
36 */
37
38 #ifdef HAVE_CONFIG_H
39 #include "io_lib_config.h"
40 #endif
41
42 // Enable if we want V3.1 support. TODO: add a configure param for this
43 #define HAVE_FQZ
44
45 #include <unistd.h>
46 #include <stdio.h>
47 #include <stdlib.h>
48 #include <string.h>
49 #include <inttypes.h>
50 #include <fcntl.h>
51 #include <zlib.h>
52 #include <assert.h>
53 #include <ctype.h>
54 #include <errno.h>
55 #include <unistd.h>
56
57 #if defined(__MINGW32__) || defined(__FreeBSD__) || defined(__APPLE__)
58 # include <getopt.h>
59 #endif
60
61 #include <io_lib/scram.h>
62 #include <io_lib/os.h>
63
parse_format(char * str)64 static char *parse_format(char *str) {
65 if (strcmp(str, "sam") == 0 || strcmp(str, "SAM") == 0)
66 return "s";
67
68 if (strcmp(str, "bam") == 0 || strcmp(str, "BAM") == 0)
69 return "b";
70
71 if (strcmp(str, "cram") == 0 || strcmp(str, "CRAM") == 0)
72 return "c";
73
74 fprintf(stderr, "Unrecognised file format '%s'\n", str);
75 exit(1);
76 }
77
detect_format(char * fn)78 static char *detect_format(char *fn) {
79 char *cp = strrchr(fn, '.');
80
81 if (!cp)
82 return "";
83
84 if (strcmp(cp, ".sam") == 0 || strcmp(cp, ".SAM") == 0)
85 return "s";
86 if (strcmp(cp, ".bam") == 0 || strcmp(cp, ".BAM") == 0)
87 return "b";
88 if (strcmp(cp, ".cram") == 0 || strcmp(cp, ".CRAM") == 0)
89 return "c";
90
91 return "";
92 }
93
usage(FILE * fp)94 static void usage(FILE *fp) {
95 fprintf(fp, " -=- sCRAMble -=- version %s\n", PACKAGE_VERSION);
96 fprintf(fp, "Author: James Bonfield, Wellcome Trust Sanger Institute. 2013-2015\n\n");
97
98 fprintf(fp, "Usage: scramble [options] [input_file [output_file]]\n");
99
100 fprintf(fp, "Options:\n");
101 fprintf(fp, " -I format Set input format: \"bam\", \"sam\" or \"cram\".\n");
102 fprintf(fp, " -O format Set output format: \"bam\", \"sam\" or \"cram\".\n");
103 fprintf(fp, " -1 to -9 Set compression level.\n");
104 fprintf(fp, " -0 or -u No compression.\n");
105 //fprintf(fp, " -v Verbose output.\n");
106 fprintf(fp, " -H [SAM] Do not print header\n");
107 fprintf(fp, " -R range [Cram] Specifies the refseq:start-end range\n");
108 fprintf(fp, " -r ref.fa [Cram] Specifies the reference file.\n");
109 fprintf(fp, " -b integer [Cram] Max. bases per slice, default %d.\n",
110 BASES_PER_SLICE);
111 fprintf(fp, " -s integer [Cram] Sequences per slice, default %d.\n",
112 SEQS_PER_SLICE);
113 fprintf(fp, " -S integer [Cram] Slices per container, default %d.\n",
114 SLICE_PER_CNT);
115 fprintf(fp, " -V version [Cram] Specify the file format version to write (eg 1.1, 2.0)\n");
116 fprintf(fp, " -e [Cram] Embed reference sequence.\n");
117 fprintf(fp, " -x [Cram] Non-reference based encoding.\n");
118 fprintf(fp, " -M [Cram] Use multiple references per slice.\n");
119 fprintf(fp, " -m [Cram] Generate MD and NM tags.\n");
120 #ifdef HAVE_LIBBZ2
121 fprintf(fp, " -j [Cram] Also compress using bzip2.\n");
122 #endif
123 #ifdef HAVE_LIBLZMA
124 fprintf(fp, " -Z [Cram] Also compress using lzma.\n");
125 #endif
126 #ifdef HAVE_LIBBSC
127 fprintf(fp, " -J [Cram] Also compression using libbsc (V3.1+)\n");
128 #endif
129 fprintf(fp, " -f [Cram] Also compression using fqzcomp (V3.1+)\n");
130 fprintf(fp, " -n [Cram] Discard read names where possible.\n");
131 fprintf(fp, " -P Preserve all aux tags (incl RG,NM,MD)\n");
132 fprintf(fp, " -p Preserve aux tag sizes ('i', 's', 'c')\n");
133 fprintf(fp, " -q Don't add scramble @PG header line\n");
134 fprintf(fp, " -N integer Stop decoding after 'integer' sequences\n");
135 fprintf(fp, " -t N Use N threads (availability varies by format)\n");
136 fprintf(fp, " -B Enable Illumina 8 quality-binning system (lossy)\n");
137 fprintf(fp, " -! Disable all checking of checksums\n");
138 fprintf(fp, " -g FILE Convert to Bam using index (file.gzi)\n");
139 fprintf(fp, " -G FILE Output Bam index when bam input(file.gzi)\n");
140 }
141
main(int argc,char ** argv)142 int main(int argc, char **argv) {
143 scram_fd *in, *out;
144 bam_seq_t *s;
145 char imode[10], *in_f = "", omode[10], *out_f = "", *index_fn = NULL, *index_out_fn = NULL;
146 int level = '\0'; // nul terminate string => auto level
147 int c, verbose = 0;
148 int s_opt = 0, S_opt = 0, embed_ref = 0, ignore_md5 = 0, decode_md = 0;
149 char *ref_fn = NULL;
150 int start, end, multi_seq = -1, no_ref = 0;
151 int use_bz2 = 0, use_bsc = 0, use_lzma = 0, use_fqz = 0;
152 char ref_name[1024] = {0};
153 refs_t *refs;
154 int nthreads = 1;
155 t_pool *p = NULL;
156 gzi *idx =NULL;
157 int max_reads = -1;
158 enum quality_binning binning = BINNING_NONE;
159 int sam_fields = 0; // all
160 int header = 1;
161 int bases_per_slice = 0;
162 int lossy_read_names = 0;
163 int preserve_aux_order = 0;
164 int preserve_aux_size = 0;
165 int add_pg = 1;
166
167 scram_init();
168
169 /* Parse command line arguments */
170 while ((c = getopt(argc, argv, "u0123456789hvs:S:V:r:xXeI:O:R:!MmjJZt:BN:F:Hb:nPpqg:G:f")) != -1) {
171 switch (c) {
172 case 'F':
173 sam_fields = strtol(optarg, NULL, 0); // undocumented for testing
174 break;
175
176 case '0': case '1': case '2': case '3': case '4':
177 case '5': case '6': case '7': case '8': case '9':
178 level = c;
179 break;
180
181 case 'u':
182 level = '0';
183 break;
184
185 case 'h':
186 usage(stdout);
187 return 0;
188
189 case 'H':
190 header = 0;
191 break;
192
193 case 'v':
194 verbose++;
195 break;
196
197 case 's':
198 s_opt = atoi(optarg);
199 bases_per_slice = s_opt * 500; // guesswork...
200 break;
201
202 case 'b':
203 bases_per_slice = atoi(optarg);
204 break;
205
206 case 'S':
207 S_opt = atoi(optarg);
208 break;
209
210 case 'm':
211 decode_md = 1;
212 break;
213
214 case 'V':
215 if (cram_set_option(NULL, CRAM_OPT_VERSION, optarg))
216 return 1;
217 break;
218
219 case 'r':
220 ref_fn = optarg;
221 break;
222
223 case 'X':
224 fprintf(stderr, "-X is deprecated in favour of -e.\n");
225 case 'e':
226 embed_ref = 1;
227 break;
228
229 case 'x':
230 no_ref = 1;
231 break;
232
233 case 'I':
234 in_f = parse_format(optarg);
235 break;
236
237 case 'O':
238 out_f = parse_format(optarg);
239 break;
240
241 case 'R': {
242 char *cp = strchr(optarg, ':');
243 if (cp) {
244 *cp = 0;
245 switch (sscanf(cp+1, "%d-%d", &start, &end)) {
246 case 1:
247 end = start;
248 break;
249 case 2:
250 break;
251 default:
252 fprintf(stderr, "Malformed range format\n");
253 return 1;
254 }
255 } else {
256 start = INT_MIN;
257 end = INT_MAX;
258 }
259 strncpy(ref_name, optarg, 1023);
260 break;
261 }
262
263 case '!':
264 ignore_md5 = 1;
265 break;
266
267 case 'n':
268 lossy_read_names = 1;
269 break;
270
271 case 'M':
272 multi_seq = 1;
273 break;
274
275 case 'j':
276 #ifdef HAVE_LIBBZ2
277 use_bz2 = 1;
278 #else
279 fprintf(stderr, "Warning: bzip2 support is not compiled into this"
280 " version.\nPlease recompile.\n");
281 #endif
282 break;
283
284 #ifdef HAVE_LIBBSC
285 case 'J':
286 use_bsc = 1;
287 break;
288 #else
289 fprintf(stderr, "Warning: bsc support is not compiled into this"
290 " version.\nPlease recompile.\n");
291 #endif
292
293 case 'Z':
294 #ifdef HAVE_LIBLZMA
295 use_lzma = 1;
296 #else
297 fprintf(stderr, "Warning: lzma support is not compiled into this"
298 " version.\nPlease recompile.\n");
299 #endif
300 break;
301
302 case 'f':
303 #ifdef HAVE_FQZ
304 use_fqz = 1;
305 #else
306 fprintf(stderr, "Warning: FQZ support is not compiled into this"
307 " version.\nPlease recompile.\n");
308 #endif
309 break;
310
311 case 't':
312 nthreads = atoi(optarg);
313 if (nthreads < 1) {
314 fprintf(stderr, "Number of threads needs to be >= 1\n");
315 return 1;
316 }
317 break;
318
319 case 'B':
320 binning = BINNING_ILLUMINA;
321 break;
322
323 case 'P':
324 preserve_aux_order = 1;
325 break;
326
327 case 'p':
328 preserve_aux_size = 1;
329 break;
330
331 case 'q':
332 add_pg = 0;
333 break;
334
335 case 'N':
336 max_reads = atoi(optarg);
337 break;
338
339 case 'g':
340 index_fn = optarg;
341 break;
342
343 case 'G':
344 index_out_fn = optarg;
345 break;
346
347 case '?':
348 fprintf(stderr, "Unrecognised option: -%c\n", optopt);
349 usage(stderr);
350 return 1;
351 }
352 }
353
354 if (cram_default_version() <= 300 && (use_bsc || use_fqz)) {
355 fprintf(stderr, "Libbsc and/or fqzcomp codecs are only permitted in CRAM v3.1 and 4.0.\n"
356 "Note these CRAM versions are a technology demonstration only.\n"
357 "Future versions of Scramble may not be able to read these files.\n");
358 return 1;
359 }
360
361 if (cram_default_version() > 300) {
362 fprintf(stderr, "\nWARNING: this version of CRAM is not a recognised GA4GH standard.\n"
363 "Note this CRAM version is a technology demonstration only.\n"
364 "Future versions of Scramble may not be able to read these files.\n\n");
365 }
366
367 if (argc - optind > 2) {
368 fprintf(stderr, "Usage: scramble [input_file [output_file]]\n");
369 return 1;
370 }
371
372
373 /* Open up input and output files */
374 sprintf(imode, "r%s%c", in_f, level);
375 if (argc - optind > 0) {
376 if (*in_f == 0)
377 sprintf(imode, "r%s%c", detect_format(argv[optind]), level);
378 if (!(in = scram_open(argv[optind], imode))) {
379 fprintf(stderr, "Failed to open file %s\n", argv[optind]);
380 return 1;
381 }
382 } else {
383 if (isatty(0)) {
384 usage(stdout);
385 return 0;
386 }
387
388 if (!(in = scram_open("-", imode))) {
389 fprintf(stderr, "Failed to open file %s\n", argv[optind]);
390 return 1;
391 }
392 }
393 if (!in->is_bam && ref_fn) {
394 cram_load_reference(in->c, ref_fn);
395 if (!in->c->refs && !embed_ref) {
396 fprintf(stderr, "Unable to find an appropriate reference.\n"
397 "Please specify a valid reference with "
398 "-r ref.fa option.\n");
399 return 1;
400 }
401 }
402
403 sprintf(omode, "w%s%c", out_f, level);
404 if (argc - optind > 1) {
405 if (*out_f == 0)
406 sprintf(omode, "w%s%c", detect_format(argv[optind+1]), level);
407 if (!(out = scram_open(argv[optind+1], omode))) {
408 fprintf(stderr, "Failed to open file %s\n", argv[optind+1]);
409 return 1;
410 }
411 } else {
412 if (!(out = scram_open("-", omode))) {
413 fprintf(stderr, "Failed to open file %s\n", argv[optind+1]);
414 return 1;
415 }
416 }
417
418
419 /* Set any format specific options */
420 scram_set_refs(out, refs = scram_get_refs(in));
421
422 scram_set_option(out, CRAM_OPT_VERBOSITY, verbose);
423 if (s_opt)
424 if (scram_set_option(out, CRAM_OPT_SEQS_PER_SLICE, s_opt))
425 return 1;
426
427 if (S_opt)
428 if (scram_set_option(out, CRAM_OPT_SLICES_PER_CONTAINER, S_opt))
429 return 1;
430
431 if (bases_per_slice)
432 if (scram_set_option(out, CRAM_OPT_BASES_PER_SLICE, bases_per_slice))
433 return 1;
434
435 if (embed_ref) {
436 if (scram_get_header(in)->sort_order == ORDER_NAME ||
437 scram_get_header(in)->sort_order == ORDER_UNSORTED) {
438 fprintf(stderr, "Embeded reference with non-coordinate sorted data is "
439 "not supported.\nUsing -x for no-ref instead.\n");
440 if (scram_set_option(out, CRAM_OPT_NO_REF, 1))
441 return 1;
442 } else {
443 if (scram_set_option(out, CRAM_OPT_EMBED_REF, embed_ref))
444 return 1;
445 }
446 }
447
448 if (use_bz2)
449 if (scram_set_option(out, CRAM_OPT_USE_BZIP2, use_bz2))
450 return 1;
451
452 if (use_bsc)
453 if (scram_set_option(out, CRAM_OPT_USE_BSC, use_bsc))
454 return 1;
455
456 if (use_lzma)
457 if (scram_set_option(out, CRAM_OPT_USE_LZMA, use_lzma))
458 return 1;
459
460 if (use_fqz)
461 if (scram_set_option(out, CRAM_OPT_USE_FQZ, use_fqz))
462 return 1;
463
464 if (binning != BINNING_NONE)
465 if (scram_set_option(out, CRAM_OPT_BINNING, binning))
466 return 1;
467
468 if (no_ref)
469 if (scram_set_option(out, CRAM_OPT_NO_REF, no_ref))
470 return 1;
471
472 if (multi_seq)
473 if (scram_set_option(out, CRAM_OPT_MULTI_SEQ_PER_SLICE, multi_seq))
474 return 1;
475
476 if (decode_md) {
477 if (no_ref) {
478 fprintf(stderr, "Cannot use -m in conjunction with -x.\n");
479 return 1;
480 }
481 if (scram_set_option(in, CRAM_OPT_DECODE_MD, decode_md))
482 return 1;
483 }
484
485 if (index_fn) {
486 if (NULL == (idx = gzi_index_load(index_fn))) {
487 fprintf(stderr, "Cannot open index file.\n");
488 return 1;
489 }
490 if (scram_set_option(out, CRAM_OPT_WITH_BGZIP_INDEX, idx))
491 return 1;
492 }
493
494 if (index_out_fn) {
495 if (scram_set_option(in, CRAM_OPT_OUTPUT_BGZIP_IDX, index_out_fn))
496 return 1;
497 }
498
499 if (nthreads > 1) {
500 if (NULL == (p = t_pool_init(nthreads*2, nthreads)))
501 return 1;
502
503 if (scram_set_option(in, CRAM_OPT_THREAD_POOL, p))
504 return 1;
505 if (scram_set_option(out, CRAM_OPT_THREAD_POOL, p))
506 return 1;
507 }
508
509 if (ignore_md5) {
510 if (scram_set_option(in, CRAM_OPT_IGNORE_MD5, ignore_md5))
511 return 1;
512 if (scram_set_option(in, CRAM_OPT_IGNORE_CHKSUM, ignore_md5))
513 return 1;
514 if (scram_set_option(out, CRAM_OPT_IGNORE_CHKSUM, ignore_md5))
515 return 1;
516 }
517
518 if (lossy_read_names) {
519 if (scram_set_option(out, CRAM_OPT_LOSSY_READ_NAMES, lossy_read_names))
520 return 1;
521 }
522
523 if (preserve_aux_order)
524 if (scram_set_option(out, CRAM_OPT_PRESERVE_AUX_ORDER, preserve_aux_order))
525 return 1;
526
527 if (preserve_aux_size)
528 if (scram_set_option(out, CRAM_OPT_PRESERVE_AUX_SIZE, preserve_aux_size))
529 return 1;
530
531 if (sam_fields)
532 scram_set_option(in, CRAM_OPT_REQUIRED_FIELDS, sam_fields);
533
534 /* Copy header and refs from in to out, for writing purposes */
535 scram_set_header(out, scram_get_header(in));
536
537 // Needs doing after loading the header.
538 if (ref_fn) {
539 if (scram_set_option(out, CRAM_OPT_REFERENCE, ref_fn))
540 return 1;
541 } else {
542 // Attempt to fill out a cram->refs[] array from @SQ headers
543 scram_set_option(out, CRAM_OPT_REFERENCE, NULL);
544 }
545
546 if (scram_get_header(out)) {
547 if (add_pg) {
548 char *arg_list = stringify_argv(argc, argv);
549
550 if (!arg_list)
551 return 1;
552
553
554 if (sam_hdr_add_PG(scram_get_header(out), "scramble",
555 "VN", PACKAGE_VERSION,
556 "CL", arg_list, NULL))
557 return 1;
558
559 free(arg_list);
560 }
561
562 if ((header || (omode[1] != 's' && omode[1] != '\0')) && scram_write_header(out) != 0)
563 return 1;
564 }
565
566
567 /* Support for sub-range queries, currently implemented for CRAM only */
568 if (*ref_name != 0) {
569 cram_range r;
570 int refid;
571
572 if (in->is_bam) {
573 fprintf(stderr, "Currently the -R option is only implemented for CRAM indices\n");
574 return 1;
575 }
576
577 cram_index_load(in->c, argv[optind]);
578
579 refid = sam_hdr_name2ref(in->c->header, ref_name);
580
581 if (refid == -1 && *ref_name != '*') {
582 fprintf(stderr, "Unknown reference name '%s'\n", ref_name);
583 return 1;
584 }
585 r.refid = refid;
586 r.start = start;
587 r.end = end;
588 if (scram_set_option(in, CRAM_OPT_RANGE, &r))
589 return 1;
590 }
591
592 /* Do the actual file format conversion */
593 s = NULL;
594
595 while (scram_get_seq(in, &s) >= 0) {
596 if (-1 == scram_put_seq(out, s)) {
597 fprintf(stderr, "Failed to encode sequence\n");
598 return 1;
599 }
600 if (max_reads >= 0)
601 if (--max_reads == 0)
602 break;
603 }
604
605 switch(scram_eof(in)) {
606 case -1:
607 fprintf(stderr, "Failed to decode sequence\n");
608 return 1;
609 case 0:
610 if (max_reads == -1) {
611 fprintf(stderr, "Failed to decode sequence\n");
612 return 1;
613 } else {
614 break;
615 }
616 case 2:
617 fprintf(stderr, "Warning: no end-of-file block identified. "
618 "File may be truncated.\n");
619 break;
620 case 1: default:
621 // expected case
622 break;
623 }
624
625 /* Finally tidy up and close files */
626 if (scram_close(in)) {
627 fprintf(stderr, "Failed in scram_close(in)\n");
628 return 1;
629 }
630 if (scram_close(out)) {
631 fprintf(stderr, "Failed in scram_close(out)\n");
632 return 1;
633 }
634
635 if (p)
636 t_pool_destroy(p, 0);
637
638 if (s)
639 free(s);
640
641 return 0;
642 }
643