1 /*
2  * Copyright (c) 2013 Genome Research Ltd.
3  * Author(s): James Bonfield
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  *    1. Redistributions of source code must retain the above copyright notice,
9  *       this list of conditions and the following disclaimer.
10  *
11  *    2. Redistributions in binary form must reproduce the above
12  *       copyright notice, this list of conditions and the following
13  *       disclaimer in the documentation and/or other materials provided
14  *       with the distribution.
15  *
16  *    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
17  *    Institute nor the names of its contributors may be used to endorse
18  *    or promote products derived from this software without specific
19  *    prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
22  * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
24  * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
25  * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 /*
35  * Author: James Bonfield, Wellcome Trust Sanger Institute. 2013
36  */
37 
38 #ifdef HAVE_CONFIG_H
39 #include "io_lib_config.h"
40 #endif
41 
42 // Enable if we want V3.1 support.  TODO: add a configure param for this
43 #define HAVE_FQZ
44 
45 #include <unistd.h>
46 #include <stdio.h>
47 #include <stdlib.h>
48 #include <string.h>
49 #include <inttypes.h>
50 #include <fcntl.h>
51 #include <zlib.h>
52 #include <assert.h>
53 #include <ctype.h>
54 #include <errno.h>
55 #include <unistd.h>
56 
57 #if defined(__MINGW32__) || defined(__FreeBSD__) || defined(__APPLE__)
58 #   include <getopt.h>
59 #endif
60 
61 #include <io_lib/scram.h>
62 #include <io_lib/os.h>
63 
parse_format(char * str)64 static char *parse_format(char *str) {
65     if (strcmp(str, "sam") == 0 || strcmp(str, "SAM") == 0)
66 	return "s";
67 
68     if (strcmp(str, "bam") == 0 || strcmp(str, "BAM") == 0)
69 	return "b";
70 
71     if (strcmp(str, "cram") == 0 || strcmp(str, "CRAM") == 0)
72 	return "c";
73 
74     fprintf(stderr, "Unrecognised file format '%s'\n", str);
75     exit(1);
76 }
77 
detect_format(char * fn)78 static char *detect_format(char *fn) {
79     char *cp = strrchr(fn, '.');
80 
81     if (!cp)
82 	return "";
83 
84     if (strcmp(cp, ".sam") == 0 || strcmp(cp, ".SAM") == 0)
85 	return "s";
86     if (strcmp(cp, ".bam") == 0 || strcmp(cp, ".BAM") == 0)
87 	return "b";
88     if (strcmp(cp, ".cram") == 0 || strcmp(cp, ".CRAM") == 0)
89 	return "c";
90 
91     return "";
92 }
93 
usage(FILE * fp)94 static void usage(FILE *fp) {
95     fprintf(fp, "  -=- sCRAMble -=-     version %s\n", PACKAGE_VERSION);
96     fprintf(fp, "Author: James Bonfield, Wellcome Trust Sanger Institute. 2013-2015\n\n");
97 
98     fprintf(fp, "Usage:    scramble [options] [input_file [output_file]]\n");
99 
100     fprintf(fp, "Options:\n");
101     fprintf(fp, "    -I format      Set input format:  \"bam\", \"sam\" or \"cram\".\n");
102     fprintf(fp, "    -O format      Set output format: \"bam\", \"sam\" or \"cram\".\n");
103     fprintf(fp, "    -1 to -9       Set compression level.\n");
104     fprintf(fp, "    -0 or -u       No compression.\n");
105     //fprintf(fp, "    -v             Verbose output.\n");
106     fprintf(fp, "    -H             [SAM] Do not print header\n");
107     fprintf(fp, "    -R range       [Cram] Specifies the refseq:start-end range\n");
108     fprintf(fp, "    -r ref.fa      [Cram] Specifies the reference file.\n");
109     fprintf(fp, "    -b integer     [Cram] Max. bases per slice, default %d.\n",
110 	    BASES_PER_SLICE);
111     fprintf(fp, "    -s integer     [Cram] Sequences per slice, default %d.\n",
112 	    SEQS_PER_SLICE);
113     fprintf(fp, "    -S integer     [Cram] Slices per container, default %d.\n",
114 	    SLICE_PER_CNT);
115     fprintf(fp, "    -V version     [Cram] Specify the file format version to write (eg 1.1, 2.0)\n");
116     fprintf(fp, "    -e             [Cram] Embed reference sequence.\n");
117     fprintf(fp, "    -x             [Cram] Non-reference based encoding.\n");
118     fprintf(fp, "    -M             [Cram] Use multiple references per slice.\n");
119     fprintf(fp, "    -m             [Cram] Generate MD and NM tags.\n");
120 #ifdef HAVE_LIBBZ2
121     fprintf(fp, "    -j             [Cram] Also compress using bzip2.\n");
122 #endif
123 #ifdef HAVE_LIBLZMA
124     fprintf(fp, "    -Z             [Cram] Also compress using lzma.\n");
125 #endif
126 #ifdef HAVE_LIBBSC
127     fprintf(fp, "    -J             [Cram] Also compression using libbsc (V3.1+)\n");
128 #endif
129     fprintf(fp, "    -f             [Cram] Also compression using fqzcomp (V3.1+)\n");
130     fprintf(fp, "    -n             [Cram] Discard read names where possible.\n");
131     fprintf(fp, "    -P             Preserve all aux tags (incl RG,NM,MD)\n");
132     fprintf(fp, "    -p             Preserve aux tag sizes ('i', 's', 'c')\n");
133     fprintf(fp, "    -q             Don't add scramble @PG header line\n");
134     fprintf(fp, "    -N integer     Stop decoding after 'integer' sequences\n");
135     fprintf(fp, "    -t N           Use N threads (availability varies by format)\n");
136     fprintf(fp, "    -B             Enable Illumina 8 quality-binning system (lossy)\n");
137     fprintf(fp, "    -!             Disable all checking of checksums\n");
138     fprintf(fp, "    -g FILE        Convert to Bam using index (file.gzi)\n");
139     fprintf(fp, "    -G FILE        Output Bam index when bam input(file.gzi)\n");
140 }
141 
main(int argc,char ** argv)142 int main(int argc, char **argv) {
143     scram_fd *in, *out;
144     bam_seq_t *s;
145     char imode[10], *in_f = "", omode[10], *out_f = "", *index_fn = NULL, *index_out_fn = NULL;
146     int level = '\0'; // nul terminate string => auto level
147     int c, verbose = 0;
148     int s_opt = 0, S_opt = 0, embed_ref = 0, ignore_md5 = 0, decode_md = 0;
149     char *ref_fn = NULL;
150     int start, end, multi_seq = -1, no_ref = 0;
151     int use_bz2 = 0, use_bsc = 0, use_lzma = 0, use_fqz = 0;
152     char ref_name[1024] = {0};
153     refs_t *refs;
154     int nthreads = 1;
155     t_pool *p = NULL;
156     gzi *idx =NULL;
157     int max_reads = -1;
158     enum quality_binning binning = BINNING_NONE;
159     int sam_fields = 0; // all
160     int header = 1;
161     int bases_per_slice = 0;
162     int lossy_read_names = 0;
163     int preserve_aux_order = 0;
164     int preserve_aux_size = 0;
165     int add_pg = 1;
166 
167     scram_init();
168 
169     /* Parse command line arguments */
170     while ((c = getopt(argc, argv, "u0123456789hvs:S:V:r:xXeI:O:R:!MmjJZt:BN:F:Hb:nPpqg:G:f")) != -1) {
171 	switch (c) {
172 	case 'F':
173 	    sam_fields = strtol(optarg, NULL, 0); // undocumented for testing
174 	    break;
175 
176 	case '0': case '1': case '2': case '3': case '4':
177 	case '5': case '6': case '7': case '8': case '9':
178 	    level = c;
179 	    break;
180 
181 	case 'u':
182 	    level = '0';
183 	    break;
184 
185 	case 'h':
186 	    usage(stdout);
187 	    return 0;
188 
189 	case 'H':
190 	    header = 0;
191 	    break;
192 
193 	case 'v':
194 	    verbose++;
195 	    break;
196 
197 	case 's':
198 	    s_opt = atoi(optarg);
199 	    bases_per_slice = s_opt * 500; // guesswork...
200 	    break;
201 
202 	case 'b':
203 	    bases_per_slice = atoi(optarg);
204 	    break;
205 
206 	case 'S':
207 	    S_opt = atoi(optarg);
208 	    break;
209 
210 	case 'm':
211 	    decode_md = 1;
212 	    break;
213 
214 	case 'V':
215 	    if (cram_set_option(NULL, CRAM_OPT_VERSION, optarg))
216 		return 1;
217 	    break;
218 
219 	case 'r':
220 	    ref_fn = optarg;
221 	    break;
222 
223 	case 'X':
224 	    fprintf(stderr, "-X is deprecated in favour of -e.\n");
225 	case 'e':
226 	    embed_ref = 1;
227 	    break;
228 
229 	case 'x':
230 	    no_ref = 1;
231 	    break;
232 
233 	case 'I':
234 	    in_f = parse_format(optarg);
235 	    break;
236 
237 	case 'O':
238 	    out_f = parse_format(optarg);
239 	    break;
240 
241 	case 'R': {
242 	    char *cp = strchr(optarg, ':');
243 	    if (cp) {
244 		*cp = 0;
245 		switch (sscanf(cp+1, "%d-%d", &start, &end)) {
246 		case 1:
247 		    end = start;
248 		    break;
249 		case 2:
250 		    break;
251 		default:
252 		    fprintf(stderr, "Malformed range format\n");
253 		    return 1;
254 		}
255 	    } else {
256 		start = INT_MIN;
257 		end   = INT_MAX;
258 	    }
259 	    strncpy(ref_name, optarg, 1023);
260 	    break;
261 	}
262 
263 	case '!':
264 	    ignore_md5 = 1;
265 	    break;
266 
267 	case 'n':
268 	    lossy_read_names = 1;
269 	    break;
270 
271 	case 'M':
272 	    multi_seq = 1;
273 	    break;
274 
275 	case 'j':
276 #ifdef HAVE_LIBBZ2
277 	    use_bz2 = 1;
278 #else
279 	    fprintf(stderr, "Warning: bzip2 support is not compiled into this"
280 		    " version.\nPlease recompile.\n");
281 #endif
282 	    break;
283 
284 #ifdef HAVE_LIBBSC
285 	case 'J':
286 	    use_bsc = 1;
287 	    break;
288 #else
289 	    fprintf(stderr, "Warning: bsc support is not compiled into this"
290 		    " version.\nPlease recompile.\n");
291 #endif
292 
293 	case 'Z':
294 #ifdef HAVE_LIBLZMA
295 	    use_lzma = 1;
296 #else
297 	    fprintf(stderr, "Warning: lzma support is not compiled into this"
298 		    " version.\nPlease recompile.\n");
299 #endif
300 	    break;
301 
302 	case 'f':
303 #ifdef HAVE_FQZ
304 	    use_fqz = 1;
305 #else
306 	    fprintf(stderr, "Warning: FQZ support is not compiled into this"
307 		    " version.\nPlease recompile.\n");
308 #endif
309 	    break;
310 
311 	case 't':
312 	    nthreads = atoi(optarg);
313 	    if (nthreads < 1) {
314 		fprintf(stderr, "Number of threads needs to be >= 1\n");
315 		return 1;
316 	    }
317 	    break;
318 
319 	case 'B':
320 	    binning = BINNING_ILLUMINA;
321 	    break;
322 
323 	case 'P':
324 	    preserve_aux_order = 1;
325 	    break;
326 
327 	case 'p':
328 	    preserve_aux_size = 1;
329 	    break;
330 
331 	case 'q':
332 	    add_pg = 0;
333 	    break;
334 
335 	case 'N':
336 	    max_reads = atoi(optarg);
337 	    break;
338 
339 	case 'g':
340 	    index_fn = optarg;
341 	    break;
342 
343 	case 'G':
344 	    index_out_fn = optarg;
345 	    break;
346 
347 	case '?':
348 	    fprintf(stderr, "Unrecognised option: -%c\n", optopt);
349 	    usage(stderr);
350 	    return 1;
351 	}
352     }
353 
354     if (cram_default_version() <= 300 && (use_bsc || use_fqz)) {
355 	fprintf(stderr, "Libbsc and/or fqzcomp codecs are only permitted in CRAM v3.1 and 4.0.\n"
356 		"Note these CRAM versions are a technology demonstration only.\n"
357 		"Future versions of Scramble may not be able to read these files.\n");
358 	return 1;
359     }
360 
361     if (cram_default_version() > 300) {
362 	fprintf(stderr, "\nWARNING: this version of CRAM is not a recognised GA4GH standard.\n"
363 		"Note this CRAM version is a technology demonstration only.\n"
364 		"Future versions of Scramble may not be able to read these files.\n\n");
365     }
366 
367     if (argc - optind > 2) {
368 	fprintf(stderr, "Usage: scramble [input_file [output_file]]\n");
369 	return 1;
370     }
371 
372 
373     /* Open up input and output files */
374     sprintf(imode, "r%s%c", in_f, level);
375     if (argc - optind > 0) {
376 	if (*in_f == 0)
377 	    sprintf(imode, "r%s%c", detect_format(argv[optind]), level);
378 	if (!(in = scram_open(argv[optind], imode))) {
379 	    fprintf(stderr, "Failed to open file %s\n", argv[optind]);
380 	    return 1;
381 	}
382     } else {
383         if (isatty(0)) {
384 	    usage(stdout);
385 	    return 0;
386 	}
387 
388 	if (!(in = scram_open("-", imode))) {
389 	    fprintf(stderr, "Failed to open file %s\n", argv[optind]);
390 	    return 1;
391 	}
392     }
393     if (!in->is_bam && ref_fn) {
394 	cram_load_reference(in->c, ref_fn);
395 	if (!in->c->refs && !embed_ref) {
396 	    fprintf(stderr, "Unable to find an appropriate reference.\n"
397 		    "Please specify a valid reference with "
398 		    "-r ref.fa option.\n");
399 	    return 1;
400 	}
401     }
402 
403     sprintf(omode, "w%s%c", out_f, level);
404     if (argc - optind > 1) {
405 	if (*out_f == 0)
406 	    sprintf(omode, "w%s%c", detect_format(argv[optind+1]), level);
407 	if (!(out = scram_open(argv[optind+1], omode))) {
408 	    fprintf(stderr, "Failed to open file %s\n", argv[optind+1]);
409 	    return 1;
410 	}
411     } else {
412 	if (!(out = scram_open("-", omode))) {
413 	    fprintf(stderr, "Failed to open file %s\n", argv[optind+1]);
414 	    return 1;
415 	}
416     }
417 
418 
419     /* Set any format specific options */
420     scram_set_refs(out, refs = scram_get_refs(in));
421 
422     scram_set_option(out, CRAM_OPT_VERBOSITY, verbose);
423     if (s_opt)
424 	if (scram_set_option(out, CRAM_OPT_SEQS_PER_SLICE, s_opt))
425 	    return 1;
426 
427     if (S_opt)
428 	if (scram_set_option(out, CRAM_OPT_SLICES_PER_CONTAINER, S_opt))
429 	    return 1;
430 
431     if (bases_per_slice)
432 	if (scram_set_option(out, CRAM_OPT_BASES_PER_SLICE, bases_per_slice))
433 	    return 1;
434 
435     if (embed_ref) {
436 	if (scram_get_header(in)->sort_order == ORDER_NAME ||
437 	    scram_get_header(in)->sort_order == ORDER_UNSORTED) {
438 	    fprintf(stderr, "Embeded reference with non-coordinate sorted data is "
439 		    "not supported.\nUsing -x for no-ref instead.\n");
440 	    if (scram_set_option(out, CRAM_OPT_NO_REF, 1))
441 		return 1;
442 	} else {
443 	    if (scram_set_option(out, CRAM_OPT_EMBED_REF, embed_ref))
444 		return 1;
445 	}
446     }
447 
448     if (use_bz2)
449 	if (scram_set_option(out, CRAM_OPT_USE_BZIP2, use_bz2))
450 	    return 1;
451 
452     if (use_bsc)
453 	if (scram_set_option(out, CRAM_OPT_USE_BSC, use_bsc))
454 	    return 1;
455 
456     if (use_lzma)
457 	if (scram_set_option(out, CRAM_OPT_USE_LZMA, use_lzma))
458 	    return 1;
459 
460     if (use_fqz)
461 	if (scram_set_option(out, CRAM_OPT_USE_FQZ, use_fqz))
462 	    return 1;
463 
464     if (binning != BINNING_NONE)
465 	if (scram_set_option(out, CRAM_OPT_BINNING, binning))
466 	    return 1;
467 
468     if (no_ref)
469 	if (scram_set_option(out, CRAM_OPT_NO_REF, no_ref))
470 	    return 1;
471 
472     if (multi_seq)
473 	if (scram_set_option(out, CRAM_OPT_MULTI_SEQ_PER_SLICE, multi_seq))
474 	    return 1;
475 
476     if (decode_md) {
477 	if (no_ref) {
478 	    fprintf(stderr, "Cannot use -m in conjunction with -x.\n");
479 	    return 1;
480 	}
481 	if (scram_set_option(in, CRAM_OPT_DECODE_MD, decode_md))
482 	    return 1;
483     }
484 
485     if (index_fn) {
486 	if (NULL == (idx = gzi_index_load(index_fn))) {
487 	    fprintf(stderr, "Cannot open index file.\n");
488 	    return 1;
489 	}
490 	if (scram_set_option(out, CRAM_OPT_WITH_BGZIP_INDEX, idx))
491 	    return 1;
492     }
493 
494     if (index_out_fn) {
495 	if (scram_set_option(in, CRAM_OPT_OUTPUT_BGZIP_IDX, index_out_fn))
496 	    return 1;
497     }
498 
499     if (nthreads > 1) {
500 	if (NULL == (p = t_pool_init(nthreads*2, nthreads)))
501 	    return 1;
502 
503 	if (scram_set_option(in,  CRAM_OPT_THREAD_POOL, p))
504 	    return 1;
505 	if (scram_set_option(out, CRAM_OPT_THREAD_POOL, p))
506 	    return 1;
507     }
508 
509     if (ignore_md5) {
510 	if (scram_set_option(in, CRAM_OPT_IGNORE_MD5, ignore_md5))
511 	    return 1;
512 	if (scram_set_option(in, CRAM_OPT_IGNORE_CHKSUM, ignore_md5))
513 	    return 1;
514 	if (scram_set_option(out, CRAM_OPT_IGNORE_CHKSUM, ignore_md5))
515 	    return 1;
516     }
517 
518     if (lossy_read_names) {
519 	if (scram_set_option(out, CRAM_OPT_LOSSY_READ_NAMES, lossy_read_names))
520 	    return 1;
521     }
522 
523     if (preserve_aux_order)
524 	if (scram_set_option(out, CRAM_OPT_PRESERVE_AUX_ORDER, preserve_aux_order))
525 	    return 1;
526 
527     if (preserve_aux_size)
528 	if (scram_set_option(out, CRAM_OPT_PRESERVE_AUX_SIZE, preserve_aux_size))
529 	    return 1;
530 
531     if (sam_fields)
532 	scram_set_option(in, CRAM_OPT_REQUIRED_FIELDS, sam_fields);
533 
534     /* Copy header and refs from in to out, for writing purposes */
535     scram_set_header(out, scram_get_header(in));
536 
537     // Needs doing after loading the header.
538     if (ref_fn) {
539 	if (scram_set_option(out, CRAM_OPT_REFERENCE, ref_fn))
540 	    return 1;
541     } else {
542 	// Attempt to fill out a cram->refs[] array from @SQ headers
543 	scram_set_option(out, CRAM_OPT_REFERENCE, NULL);
544     }
545 
546     if (scram_get_header(out)) {
547         if (add_pg) {
548 	    char *arg_list = stringify_argv(argc, argv);
549 
550 	    if (!arg_list)
551 		return 1;
552 
553 
554 	    if (sam_hdr_add_PG(scram_get_header(out), "scramble",
555 			       "VN", PACKAGE_VERSION,
556 			       "CL", arg_list, NULL))
557 	        return 1;
558 
559 	    free(arg_list);
560 	}
561 
562 	if ((header || (omode[1] != 's' && omode[1] != '\0')) && scram_write_header(out) != 0)
563 	    return 1;
564     }
565 
566 
567     /* Support for sub-range queries, currently implemented for CRAM only */
568     if (*ref_name != 0) {
569 	cram_range r;
570 	int refid;
571 
572 	if (in->is_bam) {
573 	    fprintf(stderr, "Currently the -R option is only implemented for CRAM indices\n");
574 	    return 1;
575 	}
576 
577 	cram_index_load(in->c, argv[optind]);
578 
579 	refid = sam_hdr_name2ref(in->c->header, ref_name);
580 
581 	if (refid == -1 && *ref_name != '*') {
582 	    fprintf(stderr, "Unknown reference name '%s'\n", ref_name);
583 	    return 1;
584 	}
585 	r.refid = refid;
586 	r.start = start;
587 	r.end = end;
588 	if (scram_set_option(in, CRAM_OPT_RANGE, &r))
589 	    return 1;
590     }
591 
592     /* Do the actual file format conversion */
593     s = NULL;
594 
595     while (scram_get_seq(in, &s) >= 0) {
596 	if (-1 == scram_put_seq(out, s)) {
597 	    fprintf(stderr, "Failed to encode sequence\n");
598 	    return 1;
599 	}
600 	if (max_reads >= 0)
601 	    if (--max_reads == 0)
602 		break;
603     }
604 
605     switch(scram_eof(in)) {
606     case -1:
607 	fprintf(stderr, "Failed to decode sequence\n");
608 	return 1;
609     case 0:
610 	if (max_reads == -1) {
611 	    fprintf(stderr, "Failed to decode sequence\n");
612 	    return 1;
613 	} else {
614 	    break;
615 	}
616     case 2:
617 	fprintf(stderr, "Warning: no end-of-file block identified. "
618 		"File may be truncated.\n");
619 	break;
620     case 1: default:
621 	// expected case
622 	break;
623     }
624 
625     /* Finally tidy up and close files */
626     if (scram_close(in)) {
627 	fprintf(stderr, "Failed in scram_close(in)\n");
628 	return 1;
629     }
630     if (scram_close(out)) {
631 	fprintf(stderr, "Failed in scram_close(out)\n");
632 	return 1;
633     }
634 
635     if (p)
636 	t_pool_destroy(p, 0);
637 
638     if (s)
639 	free(s);
640 
641     return 0;
642 }
643