1 /*
2  * Copyright (c) 2013 Genome Research Ltd.
3  * Author(s): James Bonfield
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  *    1. Redistributions of source code must retain the above copyright notice,
9  *       this list of conditions and the following disclaimer.
10  *
11  *    2. Redistributions in binary form must reproduce the above
12  *       copyright notice, this list of conditions and the following
13  *       disclaimer in the documentation and/or other materials provided
14  *       with the distribution.
15  *
16  *    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
17  *    Institute nor the names of its contributors may be used to endorse
18  *    or promote products derived from this software without specific
19  *    prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
22  * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
24  * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
25  * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 /*
35  * Author: James Bonfield, Sanger Institute, 2013.
36  *
37  * Converts a SAM or BAM file into a CRAM file.
38  *
39  * Usage:
 *     sam_to_cram [-r ref.fa] [-0..9] [-u] [-v] [-s int] [-S int]
 *                 [-V version] [-X] in.sam/bam [output.cram]
41  */
42 
#include "io_lib_config.h"

#include <assert.h>
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <io_lib/cram.h>
51 
usage(FILE * fp)52 void usage(FILE *fp) {
53     fprintf(fp, "Usage: sam_to_cram [-r ref.fa] [-0..9] [-u] [-v] [-s int] "
54 	    "[-S int] in.sam/bam [output.cram]\n\n");
55     fprintf(fp, "Options:\n");
56     fprintf(fp, "    -r ref.fa      Specifies the reference file.\n");
57     fprintf(fp, "    -1 to -9       Set zlib compression level for CRAM\n");
58     fprintf(fp, "    -0 or -u       No zlib compression.\n");
59     fprintf(fp, "    -v             Verbose output.\n");
60     fprintf(fp, "    -s integer     Sequences per slice, default %d.\n",
61 	    SEQS_PER_SLICE);
62     fprintf(fp, "    -S integer     Slices per container, default %d.\n",
63 	    SLICE_PER_CNT);
64     fprintf(fp, "    -V version     Specify the CRAM format version to write (eg 1.1, 2.0)\n");
65     fprintf(fp, "    -X             Embed reference sequence.\n");
66 }
67 
main(int argc,char ** argv)68 int main(int argc, char **argv) {
69     cram_fd *out;
70     bam_file_t *in;
71     bam_seq_t *s = NULL;
72     char *out_fn;
73     int level = '\0'; // nul terminate string => auto level
74     char out_mode[4];
75     int c, verbose = 0;
76     int s_opt = 0, S_opt = 0, embed_ref = 0;
77     char *arg_list, *ref_fn = NULL;
78 
79     while ((c = getopt(argc, argv, "u0123456789hvs:S:V:r:X")) != -1) {
80 	switch (c) {
81 	case '0': case '1': case '2': case '3': case '4':
82 	case '5': case '6': case '7': case '8': case '9':
83 	    level = c;
84 	    break;
85 
86 	case 'u':
87 	    level = '0';
88 	    break;
89 
90 	case 'h':
91 	    usage(stdout);
92 	    return 0;
93 
94 	case 'v':
95 	    verbose++;
96 	    break;
97 
98 	case 's':
99 	    s_opt = atoi(optarg);
100 	    break;
101 
102 	case 'S':
103 	    S_opt = atoi(optarg);
104 	    break;
105 
106 	case 'V':
107 	    cram_set_option(NULL, CRAM_OPT_VERSION, optarg);
108 	    break;
109 
110 	case 'r':
111 	    ref_fn = optarg;
112 	    break;
113 
114 	case 'X':
115 	    embed_ref = 1;
116 	    break;
117 
118 	case '?':
119 	    fprintf(stderr, "Unrecognised option: -%c\n", optopt);
120 	    usage(stderr);
121 	    return 1;
122 	}
123     }
124 
125     if (argc - optind != 1 && argc - optind != 2) {
126 	usage(stderr);
127 	return 1;
128     }
129 
130     /* opening */
131     if (NULL == (in = bam_open(argv[optind], "rb"))) {
132 	perror(argv[optind]);
133 	return 1;
134     }
135 
136     out_fn = argc - optind == 2 ? argv[optind+1] : "-";
137     sprintf(out_mode, "wb%c", level);
138     if (NULL == (out = cram_open(out_fn, out_mode))) {
139 	fprintf(stderr, "Error opening CRAM file '%s'.\n", out_fn);
140 	return 1;
141     }
142 
143     /* SAM Header */
144     if (!(arg_list = stringify_argv(argc, argv)))
145 	return 1;
146     sam_hdr_add_PG(in->header, "sam_to_cram",
147 		   "VN", PACKAGE_VERSION,
148 		   "CL", arg_list, NULL);
149     free(arg_list);
150 
151     /* Find and load reference */
152     if (!ref_fn) {
153 	SAM_hdr_type *ty = sam_hdr_find(in->header, "SQ", NULL, NULL);
154 	if (ty) {
155 	    SAM_hdr_tag *tag;
156 
157 	    if ((tag = sam_hdr_find_key(in->header, ty, "UR", NULL))) {
158 		ref_fn  = tag->str + 3;
159 		if (strncmp(ref_fn, "file:", 5) == 0)
160 		    ref_fn += 5;
161 	    }
162 	}
163     }
164 
165     out->header = in->header;
166     if (ref_fn)
167 	cram_load_reference(out, ref_fn);
168 
169     if (!out->refs) {
170 	fprintf(stderr, "Unable to open reference.\n"
171 		"Please specify a valid reference with -r ref.fa option.\n");
172 	return 1;
173     }
174     refs2id(out->refs, out->header);
175 
176     if (-1 == cram_write_SAM_hdr(out, in->header))
177 	return 1;
178 
179     cram_set_option(out, CRAM_OPT_VERBOSITY, verbose);
180     if (s_opt)
181 	cram_set_option(out, CRAM_OPT_SEQS_PER_SLICE, s_opt);
182 
183     if (S_opt)
184 	cram_set_option(out, CRAM_OPT_SLICES_PER_CONTAINER, S_opt);
185 
186     if (embed_ref)
187 	cram_set_option(out, CRAM_OPT_EMBED_REF, embed_ref);
188 
189     /* Sequence iterators */
190     while (bam_get_seq(in, &s) > 0) {
191 	if (-1 == cram_put_bam_seq(out, s)) {
192 	    fprintf(stderr, "Failed in cram_put_bam_seq()\n");
193 	    return 1;
194 	}
195     }
196 
197     bam_close(in);
198     out->header = NULL; // freed by bam_close()
199     if (-1 == cram_close(out)) {
200 	fprintf(stderr, "Failed in cram_close()\n");
201 	return 1;
202     }
203 
204     if (s)
205 	free(s);
206 
207     return 0;
208 }
209