1 /*
2 Copyright (c) 2015 Genome Research Ltd.
3 Author: James Bonfield <jkb@sanger.ac.uk>
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7
8 1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
10
11 2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
14
15 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
18
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31 /*! \file
32 * External CRAM interface.
33 *
34 * Internally we're happy to use macros and to grub around in the cram
35 * structures. This isn't very sustainable for an externally usable
36 * ABI though, so we have anonymous structs and accessor functions too
37 * to permit software such as samtools reheader to manipulate cram
38 * containers and blocks in a robust manner.
39 */
40
41 #include <config.h>
42
43 #include "htslib/hfile.h"
44 #include "cram/cram.h"
45
46 /*
47 *-----------------------------------------------------------------------------
48 * cram_fd
49 */
cram_fd_get_header(cram_fd * fd)50 SAM_hdr *cram_fd_get_header(cram_fd *fd) { return fd->header; }
cram_fd_set_header(cram_fd * fd,SAM_hdr * hdr)51 void cram_fd_set_header(cram_fd *fd, SAM_hdr *hdr) { fd->header = hdr; }
52
cram_fd_get_version(cram_fd * fd)53 int cram_fd_get_version(cram_fd *fd) { return fd->version; }
cram_fd_set_version(cram_fd * fd,int vers)54 void cram_fd_set_version(cram_fd *fd, int vers) { fd->version = vers; }
55
cram_major_vers(cram_fd * fd)56 int cram_major_vers(cram_fd *fd) { return CRAM_MAJOR_VERS(fd->version); }
cram_minor_vers(cram_fd * fd)57 int cram_minor_vers(cram_fd *fd) { return CRAM_MINOR_VERS(fd->version); }
58
cram_fd_get_fp(cram_fd * fd)59 hFILE *cram_fd_get_fp(cram_fd *fd) { return fd->fp; }
cram_fd_set_fp(cram_fd * fd,hFILE * fp)60 void cram_fd_set_fp(cram_fd *fd, hFILE *fp) { fd->fp = fp; }
61
62
63 /*
64 *-----------------------------------------------------------------------------
65 * cram_container
66 */
cram_container_get_length(cram_container * c)67 int32_t cram_container_get_length(cram_container *c) {
68 return c->length;
69 }
70
cram_container_set_length(cram_container * c,int32_t length)71 void cram_container_set_length(cram_container *c, int32_t length) {
72 c->length = length;
73 }
74
75
cram_container_get_num_blocks(cram_container * c)76 int32_t cram_container_get_num_blocks(cram_container *c) {
77 return c->num_blocks;
78 }
79
cram_container_set_num_blocks(cram_container * c,int32_t num_blocks)80 void cram_container_set_num_blocks(cram_container *c, int32_t num_blocks) {
81 c->num_blocks = num_blocks;
82 }
83
84
85 /* Returns the landmarks[] array and the number of elements
86 * in num_landmarks.
87 */
cram_container_get_landmarks(cram_container * c,int32_t * num_landmarks)88 int32_t *cram_container_get_landmarks(cram_container *c, int32_t *num_landmarks) {
89 *num_landmarks = c->num_landmarks;
90 return c->landmark;
91 }
92
93 /* Sets the landmarks[] array (pointer copy, not a memory dup) and
94 * num_landmarks value.
95 */
cram_container_set_landmarks(cram_container * c,int32_t num_landmarks,int32_t * landmarks)96 void cram_container_set_landmarks(cram_container *c, int32_t num_landmarks,
97 int32_t *landmarks) {
98 c->num_landmarks = num_landmarks;
99 c->landmark = landmarks;
100 }
101
102
103 /* Returns true if the container is empty (EOF marker) */
cram_container_is_empty(cram_fd * fd)104 int cram_container_is_empty(cram_fd *fd) {
105 return fd->empty_container;
106 }
107
108
109 /*
110 *-----------------------------------------------------------------------------
111 * cram_block_compression_hdr
112 */
113
114 /*
115 * Utility function to edit an RG id.
116 * This is only possible if there is one single RG value used and it
117 * is in the container compression header using HUFFMAN or BETA
118 * codec. In this case it is essentially hard coded and needs no
119 * editing of external (or worse, CORE) blocks.
120 *
121 * Returns 0 on success
122 * -1 on failure
123 */
124 // Or arbitrary set compression header constant?
125
cram_block_compression_hdr_set_DS(cram_block_compression_hdr * ch,int ds,int new_rg)126 static int cram_block_compression_hdr_set_DS(cram_block_compression_hdr *ch,
127 int ds, int new_rg) {
128 if (!ch || !ch->codecs[ds])
129 return -1;
130
131 switch (ch->codecs[ds]->codec) {
132 case E_HUFFMAN:
133 if (ch->codecs[ds]->huffman.ncodes != 1)
134 return -1;
135 ch->codecs[ds]->huffman.codes[0].symbol = new_rg;
136 return 0;
137
138 case E_BETA:
139 if (ch->codecs[ds]->beta.nbits != 0)
140 return -1;
141 ch->codecs[ds]->beta.offset = -new_rg;
142 return 0;
143
144 default:
145 return -1;
146 }
147
148 return 0;
149 }
150
cram_block_compression_hdr_set_rg(cram_block_compression_hdr * ch,int new_rg)151 int cram_block_compression_hdr_set_rg(cram_block_compression_hdr *ch, int new_rg) {
152 return cram_block_compression_hdr_set_DS(ch, DS_RG, new_rg);
153 }
154
155 /*
156 * Converts a cram_block_compression_hdr struct used for decoding to
157 * one used for encoding. Maybe this should be a transparent
158 * operation applied on-demand.
159 *
160 * Returns 0 on success
161 * -1 on failure
162 */
cram_block_compression_hdr_decoder2encoder(cram_fd * fd,cram_block_compression_hdr * ch)163 int cram_block_compression_hdr_decoder2encoder(cram_fd *fd,
164 cram_block_compression_hdr *ch) {
165 int i;
166
167 if (!ch)
168 return -1;
169
170 for (i = 0; i < DS_END; i++) {
171 cram_codec *co = ch->codecs[i];
172 if (!co)
173 continue;
174
175 if (-1 == cram_codec_decoder2encoder(fd, co))
176 return -1;
177 }
178
179 return 0;
180 }
181
182 /*
183 *-----------------------------------------------------------------------------
184 * cram_slice
185 */
cram_slice_hdr_get_num_blocks(cram_block_slice_hdr * hdr)186 int32_t cram_slice_hdr_get_num_blocks(cram_block_slice_hdr *hdr) {
187 return hdr->num_blocks;
188 }
189
190
191 /*
192 *-----------------------------------------------------------------------------
193 * cram_block
194 */
cram_block_get_content_id(cram_block * b)195 int32_t cram_block_get_content_id(cram_block *b) { return b->content_id; }
cram_block_get_comp_size(cram_block * b)196 int32_t cram_block_get_comp_size(cram_block *b) { return b->comp_size; }
cram_block_get_uncomp_size(cram_block * b)197 int32_t cram_block_get_uncomp_size(cram_block *b) { return b->uncomp_size; }
cram_block_get_crc32(cram_block * b)198 int32_t cram_block_get_crc32(cram_block *b) { return b->crc32; }
cram_block_get_data(cram_block * b)199 void * cram_block_get_data(cram_block *b) { return BLOCK_DATA(b); }
cram_block_get_size(cram_block * b)200 int32_t cram_block_get_size(cram_block *b) { return BLOCK_SIZE(b); }
cram_block_get_content_type(cram_block * b)201 enum cram_content_type cram_block_get_content_type(cram_block *b) {
202 return b->content_type;
203 }
204
cram_block_set_content_id(cram_block * b,int32_t id)205 void cram_block_set_content_id(cram_block *b, int32_t id) { b->content_id = id; }
cram_block_set_comp_size(cram_block * b,int32_t size)206 void cram_block_set_comp_size(cram_block *b, int32_t size) { b->comp_size = size; }
cram_block_set_uncomp_size(cram_block * b,int32_t size)207 void cram_block_set_uncomp_size(cram_block *b, int32_t size) { b->uncomp_size = size; }
cram_block_set_crc32(cram_block * b,int32_t crc)208 void cram_block_set_crc32(cram_block *b, int32_t crc) { b->crc32 = crc; }
cram_block_set_data(cram_block * b,void * data)209 void cram_block_set_data(cram_block *b, void *data) { BLOCK_DATA(b) = data; }
cram_block_set_size(cram_block * b,int32_t size)210 void cram_block_set_size(cram_block *b, int32_t size) { BLOCK_SIZE(b) = size; }
211
cram_block_append(cram_block * b,void * data,int size)212 int cram_block_append(cram_block *b, void *data, int size) {
213 BLOCK_APPEND(b, data, size);
214 return BLOCK_DATA(b) ? 0 : -1; // It'll do for now...
215 }
cram_block_update_size(cram_block * b)216 void cram_block_update_size(cram_block *b) { BLOCK_UPLEN(b); }
217
218 // Offset is known as "size" internally, but it can be confusing.
cram_block_get_offset(cram_block * b)219 size_t cram_block_get_offset(cram_block *b) { return BLOCK_SIZE(b); }
cram_block_set_offset(cram_block * b,size_t offset)220 void cram_block_set_offset(cram_block *b, size_t offset) { BLOCK_SIZE(b) = offset; }
221
222
223 /*
224 * Copies the blocks representing the next num_slice slices from a
225 * container from 'in' to 'out'. It is expected that the file pointer
226 * is just after the read of the cram_container and cram compression
227 * header.
228 *
229 * Returns 0 on success
230 * -1 on failure
231 */
cram_copy_slice(cram_fd * in,cram_fd * out,int32_t num_slice)232 int cram_copy_slice(cram_fd *in, cram_fd *out, int32_t num_slice) {
233 int32_t i, j;
234
235 for (i = 0; i < num_slice; i++) {
236 cram_block *blk;
237 cram_block_slice_hdr *hdr;
238
239 if (!(blk = cram_read_block(in)))
240 return -1;
241 if (!(hdr = cram_decode_slice_header(in, blk))) {
242 cram_free_block(blk);
243 return -1;
244 }
245 if (cram_write_block(out, blk) != 0) {
246 cram_free_block(blk);
247 return -1;
248 }
249 cram_free_block(blk);
250
251 int num_blocks = cram_slice_hdr_get_num_blocks(hdr);
252 for (j = 0; j < num_blocks; j++) {
253 blk = cram_read_block(in);
254 if (!blk || cram_write_block(out, blk) != 0) {
255 if (blk) cram_free_block(blk);
256 return -1;
257 }
258 cram_free_block(blk);
259 }
260 cram_free_slice_header(hdr);
261 }
262
263 return 0;
264 }
265
266 /*
267 * Renumbers RG numbers in a cram compression header.
268 *
269 * CRAM stores RG as the Nth number in the header, rather than a
270 * string holding the ID: tag. This is smaller in space, but means
271 * "samtools cat" to join files together that contain single but
272 * different RG lines needs a way of renumbering them.
273 *
274 * The file descriptor is expected to be immediately after the
275 * cram_container structure (ie before the cram compression header).
276 * Due to the nature of the CRAM format, this needs to read and write
277 * the blocks itself. Note that there may be multiple slices within
278 * the container, meaning multiple compression headers to manipulate.
279 * Changing RG may change the size of the compression header and
280 * therefore the length field in the container. Hence we rewrite all
281 * blocks just incase and also emit the adjusted container.
282 *
283 * The current implementation can only cope with renumbering a single
284 * RG (and only then if it is using HUFFMAN or BETA codecs). In
285 * theory it *may* be possible to renumber multiple RGs if they use
286 * HUFFMAN to the CORE block or use an external block unshared by any
287 * other data series. So we have an API that can be upgraded to
288 * support this, but do not implement it for now. An example
289 * implementation of RG as an EXTERNAL block would be to find that
290 * block and rewrite it, returning the number of blocks consumed.
291 *
292 * Returns 0 on success;
293 * -1 if unable to edit;
294 * -2 on other errors (eg I/O).
295 */
cram_transcode_rg(cram_fd * in,cram_fd * out,cram_container * c,int nrg,int * in_rg,int * out_rg)296 int cram_transcode_rg(cram_fd *in, cram_fd *out,
297 cram_container *c,
298 int nrg, int *in_rg, int *out_rg) {
299 int new_rg = *out_rg, old_size, new_size;
300 cram_block *o_blk, *n_blk;
301 cram_block_compression_hdr *ch;
302
303 if (nrg != 1) {
304 hts_log_error("CRAM transcode supports only a single RG");
305 return -2;
306 }
307
308 // Produce a new block holding the updated compression header,
309 // with RG transcoded to a new value. (Single only supported.)
310 o_blk = cram_read_block(in);
311 old_size = cram_block_size(o_blk);
312 ch = cram_decode_compression_header(in, o_blk);
313 if (cram_block_compression_hdr_set_rg(ch, new_rg) != 0)
314 return -1;
315 cram_block_compression_hdr_decoder2encoder(in, ch);
316 n_blk = cram_encode_compression_header(in, c, ch);
317 cram_free_compression_header(ch);
318
319 /*
320 * Warning: this has internal knowledge of the cram compression
321 * header format.
322 *
323 * The decoder doesn't set c->tags_used, so the encoder puts a two
324 * byte blank segment. This means n_blk is too short. We skip
325 * through the decoded old block (o_blk) and copy from there.
326 */
327 char *cp = cram_block_get_data(o_blk);
328 char *op = cp;
329 char *endp = cp + cram_block_get_uncomp_size(o_blk);
330 //fprintf(stderr, "sz = %d\n", (int)(endp-cp));
331 int32_t i32;
332
333 cp += safe_itf8_get(cp, endp, &i32);
334 cp += i32;
335 cp += safe_itf8_get(cp, endp, &i32);
336 cp += i32;
337 op = cp;
338 cp += safe_itf8_get(cp, endp, &i32);
339 i32 += (cp-op);
340
341 //fprintf(stderr, "remaining %d bytes\n", i32);
342 cram_block_set_size(n_blk, cram_block_get_size(n_blk)-2);
343 cram_block_append(n_blk, op, i32);
344 cram_block_update_size(n_blk);
345
346 new_size = cram_block_size(n_blk);
347
348 //fprintf(stderr, "size %d -> %d\n", old_size, new_size);
349
350 // Now we've constructedthe updated compression header,
351 // amend the container too (it may have changed size).
352 int32_t *landmarks, num_landmarks;
353 landmarks = cram_container_get_landmarks(c, &num_landmarks);
354
355 if (old_size != new_size) {
356 int diff = new_size - old_size, j;
357
358 for (j = 0; j < num_landmarks; j++)
359 landmarks[j] += diff;
360 //cram_container_set_landmarks(c, num_landmarks, landmarks);
361 cram_container_set_length(c, cram_container_get_length(c) + diff);
362 }
363
364 // Finally write it all out; container, compression header,
365 // and then all the remaining slice blocks.
366 if (cram_write_container(out, c) != 0)
367 return -2;
368
369 cram_write_block(out, n_blk);
370 cram_free_block(o_blk);
371 cram_free_block(n_blk);
372
373 // Container num_blocks can be invalid, due to a bug.
374 // Instead we iterate in slice context instead.
375 return cram_copy_slice(in, out, num_landmarks);
376 }
377
378
379 /*!
380 * Returns the refs_t structure used by a cram file handle.
381 *
382 * This may be used in conjunction with option CRAM_OPT_SHARED_REF to
383 * share reference memory between multiple file handles.
384 *
385 * @return
386 * Returns NULL if none exists or the file handle is not a CRAM file.
387 */
cram_get_refs(htsFile * fd)388 refs_t *cram_get_refs(htsFile *fd) {
389 return fd->format.format == cram
390 ? fd->fp.cram->refs
391 : NULL;
392 }
393