1 /*
2 * Copyright (c) 2013 Genome Research Ltd.
3 * Author(s): James Bonfield
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above
12 * copyright notice, this list of conditions and the following
13 * disclaimer in the documentation and/or other materials provided
14 * with the distribution.
15 *
16 * 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
17 * Institute nor the names of its contributors may be used to endorse
18 * or promote products derived from this software without specific
19 * prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
22 * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
24 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
25 * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33
34 /*
35 * Author: James Bonfield, Sanger Institute, 2013.
36 *
37 * Converts a SAM or BAM file into a CRAM file.
38 *
39 * Usage:
 * sam_to_cram [-r reference.fasta] [-level] input.sam [output.cram]
41 */
42
#include "io_lib_config.h"

#include <assert.h>
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <io_lib/cram.h>
51
usage(FILE * fp)52 void usage(FILE *fp) {
53 fprintf(fp, "Usage: sam_to_cram [-r ref.fa] [-0..9] [-u] [-v] [-s int] "
54 "[-S int] in.sam/bam [output.cram]\n\n");
55 fprintf(fp, "Options:\n");
56 fprintf(fp, " -r ref.fa Specifies the reference file.\n");
57 fprintf(fp, " -1 to -9 Set zlib compression level for CRAM\n");
58 fprintf(fp, " -0 or -u No zlib compression.\n");
59 fprintf(fp, " -v Verbose output.\n");
60 fprintf(fp, " -s integer Sequences per slice, default %d.\n",
61 SEQS_PER_SLICE);
62 fprintf(fp, " -S integer Slices per container, default %d.\n",
63 SLICE_PER_CNT);
64 fprintf(fp, " -V version Specify the CRAM format version to write (eg 1.1, 2.0)\n");
65 fprintf(fp, " -X Embed reference sequence.\n");
66 }
67
main(int argc,char ** argv)68 int main(int argc, char **argv) {
69 cram_fd *out;
70 bam_file_t *in;
71 bam_seq_t *s = NULL;
72 char *out_fn;
73 int level = '\0'; // nul terminate string => auto level
74 char out_mode[4];
75 int c, verbose = 0;
76 int s_opt = 0, S_opt = 0, embed_ref = 0;
77 char *arg_list, *ref_fn = NULL;
78
79 while ((c = getopt(argc, argv, "u0123456789hvs:S:V:r:X")) != -1) {
80 switch (c) {
81 case '0': case '1': case '2': case '3': case '4':
82 case '5': case '6': case '7': case '8': case '9':
83 level = c;
84 break;
85
86 case 'u':
87 level = '0';
88 break;
89
90 case 'h':
91 usage(stdout);
92 return 0;
93
94 case 'v':
95 verbose++;
96 break;
97
98 case 's':
99 s_opt = atoi(optarg);
100 break;
101
102 case 'S':
103 S_opt = atoi(optarg);
104 break;
105
106 case 'V':
107 cram_set_option(NULL, CRAM_OPT_VERSION, optarg);
108 break;
109
110 case 'r':
111 ref_fn = optarg;
112 break;
113
114 case 'X':
115 embed_ref = 1;
116 break;
117
118 case '?':
119 fprintf(stderr, "Unrecognised option: -%c\n", optopt);
120 usage(stderr);
121 return 1;
122 }
123 }
124
125 if (argc - optind != 1 && argc - optind != 2) {
126 usage(stderr);
127 return 1;
128 }
129
130 /* opening */
131 if (NULL == (in = bam_open(argv[optind], "rb"))) {
132 perror(argv[optind]);
133 return 1;
134 }
135
136 out_fn = argc - optind == 2 ? argv[optind+1] : "-";
137 sprintf(out_mode, "wb%c", level);
138 if (NULL == (out = cram_open(out_fn, out_mode))) {
139 fprintf(stderr, "Error opening CRAM file '%s'.\n", out_fn);
140 return 1;
141 }
142
143 /* SAM Header */
144 if (!(arg_list = stringify_argv(argc, argv)))
145 return 1;
146 sam_hdr_add_PG(in->header, "sam_to_cram",
147 "VN", PACKAGE_VERSION,
148 "CL", arg_list, NULL);
149 free(arg_list);
150
151 /* Find and load reference */
152 if (!ref_fn) {
153 SAM_hdr_type *ty = sam_hdr_find(in->header, "SQ", NULL, NULL);
154 if (ty) {
155 SAM_hdr_tag *tag;
156
157 if ((tag = sam_hdr_find_key(in->header, ty, "UR", NULL))) {
158 ref_fn = tag->str + 3;
159 if (strncmp(ref_fn, "file:", 5) == 0)
160 ref_fn += 5;
161 }
162 }
163 }
164
165 out->header = in->header;
166 if (ref_fn)
167 cram_load_reference(out, ref_fn);
168
169 if (!out->refs) {
170 fprintf(stderr, "Unable to open reference.\n"
171 "Please specify a valid reference with -r ref.fa option.\n");
172 return 1;
173 }
174 refs2id(out->refs, out->header);
175
176 if (-1 == cram_write_SAM_hdr(out, in->header))
177 return 1;
178
179 cram_set_option(out, CRAM_OPT_VERBOSITY, verbose);
180 if (s_opt)
181 cram_set_option(out, CRAM_OPT_SEQS_PER_SLICE, s_opt);
182
183 if (S_opt)
184 cram_set_option(out, CRAM_OPT_SLICES_PER_CONTAINER, S_opt);
185
186 if (embed_ref)
187 cram_set_option(out, CRAM_OPT_EMBED_REF, embed_ref);
188
189 /* Sequence iterators */
190 while (bam_get_seq(in, &s) > 0) {
191 if (-1 == cram_put_bam_seq(out, s)) {
192 fprintf(stderr, "Failed in cram_put_bam_seq()\n");
193 return 1;
194 }
195 }
196
197 bam_close(in);
198 out->header = NULL; // freed by bam_close()
199 if (-1 == cram_close(out)) {
200 fprintf(stderr, "Failed in cram_close()\n");
201 return 1;
202 }
203
204 if (s)
205 free(s);
206
207 return 0;
208 }
209