1 /*
2 Copyright (c) 2015, 2018-2020 Genome Research Ltd.
3 Author: James Bonfield <jkb@sanger.ac.uk>
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7
8 1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
10
11 2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
14
15 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
18
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31 /*! \file
32 * External CRAM interface.
33 *
34 * Internally we're happy to use macros and to grub around in the cram
35 * structures. This isn't very sustainable for an externally usable
36 * ABI though, so we have anonymous structs and accessor functions too
37 * to permit software such as samtools reheader to manipulate cram
38 * containers and blocks in a robust manner.
39 */
40
41 #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
42 #include <config.h>
43
44 #include "../htslib/hfile.h"
45 #include "cram.h"
46
47 /*
48 *-----------------------------------------------------------------------------
49 * cram_fd
50 */
cram_fd_get_header(cram_fd * fd)51 sam_hdr_t *cram_fd_get_header(cram_fd *fd) { return fd->header; }
cram_fd_set_header(cram_fd * fd,sam_hdr_t * hdr)52 void cram_fd_set_header(cram_fd *fd, sam_hdr_t *hdr) { fd->header = hdr; }
53
cram_fd_get_version(cram_fd * fd)54 int cram_fd_get_version(cram_fd *fd) { return fd->version; }
cram_fd_set_version(cram_fd * fd,int vers)55 void cram_fd_set_version(cram_fd *fd, int vers) { fd->version = vers; }
56
cram_major_vers(cram_fd * fd)57 int cram_major_vers(cram_fd *fd) { return CRAM_MAJOR_VERS(fd->version); }
cram_minor_vers(cram_fd * fd)58 int cram_minor_vers(cram_fd *fd) { return CRAM_MINOR_VERS(fd->version); }
59
cram_fd_get_fp(cram_fd * fd)60 hFILE *cram_fd_get_fp(cram_fd *fd) { return fd->fp; }
cram_fd_set_fp(cram_fd * fd,hFILE * fp)61 void cram_fd_set_fp(cram_fd *fd, hFILE *fp) { fd->fp = fp; }
62
63
64 /*
65 *-----------------------------------------------------------------------------
66 * cram_container
67 */
cram_container_get_length(cram_container * c)68 int32_t cram_container_get_length(cram_container *c) {
69 return c->length;
70 }
71
cram_container_set_length(cram_container * c,int32_t length)72 void cram_container_set_length(cram_container *c, int32_t length) {
73 c->length = length;
74 }
75
76
cram_container_get_num_blocks(cram_container * c)77 int32_t cram_container_get_num_blocks(cram_container *c) {
78 return c->num_blocks;
79 }
80
cram_container_set_num_blocks(cram_container * c,int32_t num_blocks)81 void cram_container_set_num_blocks(cram_container *c, int32_t num_blocks) {
82 c->num_blocks = num_blocks;
83 }
84
85
86 /* Returns the landmarks[] array and the number of elements
87 * in num_landmarks.
88 */
cram_container_get_landmarks(cram_container * c,int32_t * num_landmarks)89 int32_t *cram_container_get_landmarks(cram_container *c, int32_t *num_landmarks) {
90 *num_landmarks = c->num_landmarks;
91 return c->landmark;
92 }
93
94 /* Sets the landmarks[] array (pointer copy, not a memory dup) and
95 * num_landmarks value.
96 */
cram_container_set_landmarks(cram_container * c,int32_t num_landmarks,int32_t * landmarks)97 void cram_container_set_landmarks(cram_container *c, int32_t num_landmarks,
98 int32_t *landmarks) {
99 c->num_landmarks = num_landmarks;
100 c->landmark = landmarks;
101 }
102
103
104 /* Returns true if the container is empty (EOF marker) */
cram_container_is_empty(cram_fd * fd)105 int cram_container_is_empty(cram_fd *fd) {
106 return fd->empty_container;
107 }
108
109
110 /*
111 *-----------------------------------------------------------------------------
112 * cram_block_compression_hdr
113 */
114
115 /*
116 * Utility function to edit an RG id.
117 * This is only possible if there is one single RG value used and it
118 * is in the container compression header using HUFFMAN or BETA
119 * codec. In this case it is essentially hard coded and needs no
120 * editing of external (or worse, CORE) blocks.
121 *
122 * Returns 0 on success
123 * -1 on failure
124 */
125 // Or arbitrary set compression header constant?
126
cram_block_compression_hdr_set_DS(cram_block_compression_hdr * ch,int ds,int new_rg)127 static int cram_block_compression_hdr_set_DS(cram_block_compression_hdr *ch,
128 int ds, int new_rg) {
129 if (!ch || !ch->codecs[ds])
130 return -1;
131
132 switch (ch->codecs[ds]->codec) {
133 case E_HUFFMAN:
134 if (ch->codecs[ds]->u.huffman.ncodes != 1)
135 return -1;
136 ch->codecs[ds]->u.huffman.codes[0].symbol = new_rg;
137 return 0;
138
139 case E_BETA:
140 if (ch->codecs[ds]->u.beta.nbits != 0)
141 return -1;
142 ch->codecs[ds]->u.beta.offset = -new_rg;
143 return 0;
144
145 default:
146 break;
147 }
148
149 return -1;
150 }
151
cram_block_compression_hdr_set_rg(cram_block_compression_hdr * ch,int new_rg)152 int cram_block_compression_hdr_set_rg(cram_block_compression_hdr *ch, int new_rg) {
153 return cram_block_compression_hdr_set_DS(ch, DS_RG, new_rg);
154 }
155
156 /*
157 * Converts a cram_block_compression_hdr struct used for decoding to
158 * one used for encoding. Maybe this should be a transparent
159 * operation applied on-demand.
160 *
161 * Returns 0 on success
162 * -1 on failure
163 */
cram_block_compression_hdr_decoder2encoder(cram_fd * fd,cram_block_compression_hdr * ch)164 int cram_block_compression_hdr_decoder2encoder(cram_fd *fd,
165 cram_block_compression_hdr *ch) {
166 int i;
167
168 if (!ch)
169 return -1;
170
171 for (i = 0; i < DS_END; i++) {
172 cram_codec *co = ch->codecs[i];
173 if (!co)
174 continue;
175
176 if (-1 == cram_codec_decoder2encoder(fd, co))
177 return -1;
178 }
179
180 return 0;
181 }
182
183 /*
184 *-----------------------------------------------------------------------------
185 * cram_slice
186 */
cram_slice_hdr_get_num_blocks(cram_block_slice_hdr * hdr)187 int32_t cram_slice_hdr_get_num_blocks(cram_block_slice_hdr *hdr) {
188 return hdr->num_blocks;
189 }
190
191
192 /*
193 *-----------------------------------------------------------------------------
194 * cram_block
195 */
cram_block_get_content_id(cram_block * b)196 int32_t cram_block_get_content_id(cram_block *b) { return b->content_id; }
cram_block_get_comp_size(cram_block * b)197 int32_t cram_block_get_comp_size(cram_block *b) { return b->comp_size; }
cram_block_get_uncomp_size(cram_block * b)198 int32_t cram_block_get_uncomp_size(cram_block *b) { return b->uncomp_size; }
cram_block_get_crc32(cram_block * b)199 int32_t cram_block_get_crc32(cram_block *b) { return b->crc32; }
cram_block_get_data(cram_block * b)200 void * cram_block_get_data(cram_block *b) { return BLOCK_DATA(b); }
cram_block_get_size(cram_block * b)201 int32_t cram_block_get_size(cram_block *b) { return BLOCK_SIZE(b); }
cram_block_get_content_type(cram_block * b)202 enum cram_content_type cram_block_get_content_type(cram_block *b) {
203 return b->content_type;
204 }
205
cram_block_set_content_id(cram_block * b,int32_t id)206 void cram_block_set_content_id(cram_block *b, int32_t id) { b->content_id = id; }
cram_block_set_comp_size(cram_block * b,int32_t size)207 void cram_block_set_comp_size(cram_block *b, int32_t size) { b->comp_size = size; }
cram_block_set_uncomp_size(cram_block * b,int32_t size)208 void cram_block_set_uncomp_size(cram_block *b, int32_t size) { b->uncomp_size = size; }
cram_block_set_crc32(cram_block * b,int32_t crc)209 void cram_block_set_crc32(cram_block *b, int32_t crc) { b->crc32 = crc; }
cram_block_set_data(cram_block * b,void * data)210 void cram_block_set_data(cram_block *b, void *data) { BLOCK_DATA(b) = data; }
cram_block_set_size(cram_block * b,int32_t size)211 void cram_block_set_size(cram_block *b, int32_t size) { BLOCK_SIZE(b) = size; }
212
cram_block_append(cram_block * b,const void * data,int size)213 int cram_block_append(cram_block *b, const void *data, int size) {
214 BLOCK_APPEND(b, data, size);
215 return 0;
216
217 block_err:
218 return -1;
219 }
cram_block_update_size(cram_block * b)220 void cram_block_update_size(cram_block *b) { BLOCK_UPLEN(b); }
221
222 // Offset is known as "size" internally, but it can be confusing.
cram_block_get_offset(cram_block * b)223 size_t cram_block_get_offset(cram_block *b) { return BLOCK_SIZE(b); }
cram_block_set_offset(cram_block * b,size_t offset)224 void cram_block_set_offset(cram_block *b, size_t offset) { BLOCK_SIZE(b) = offset; }
225
226
227 /*
228 * Copies the blocks representing the next num_slice slices from a
229 * container from 'in' to 'out'. It is expected that the file pointer
230 * is just after the read of the cram_container and cram compression
231 * header.
232 *
233 * Returns 0 on success
234 * -1 on failure
235 */
cram_copy_slice(cram_fd * in,cram_fd * out,int32_t num_slice)236 int cram_copy_slice(cram_fd *in, cram_fd *out, int32_t num_slice) {
237 int32_t i, j;
238
239 for (i = 0; i < num_slice; i++) {
240 cram_block *blk;
241 cram_block_slice_hdr *hdr;
242
243 if (!(blk = cram_read_block(in)))
244 return -1;
245 if (!(hdr = cram_decode_slice_header(in, blk))) {
246 cram_free_block(blk);
247 return -1;
248 }
249 if (cram_write_block(out, blk) != 0) {
250 cram_free_block(blk);
251 return -1;
252 }
253 cram_free_block(blk);
254
255 int num_blocks = cram_slice_hdr_get_num_blocks(hdr);
256 for (j = 0; j < num_blocks; j++) {
257 blk = cram_read_block(in);
258 if (!blk || cram_write_block(out, blk) != 0) {
259 if (blk) cram_free_block(blk);
260 return -1;
261 }
262 cram_free_block(blk);
263 }
264 cram_free_slice_header(hdr);
265 }
266
267 return 0;
268 }
269
270 /*
271 * Renumbers RG numbers in a cram compression header.
272 *
273 * CRAM stores RG as the Nth number in the header, rather than a
274 * string holding the ID: tag. This is smaller in space, but means
275 * "samtools cat" to join files together that contain single but
276 * different RG lines needs a way of renumbering them.
277 *
278 * The file descriptor is expected to be immediately after the
279 * cram_container structure (ie before the cram compression header).
280 * Due to the nature of the CRAM format, this needs to read and write
281 * the blocks itself. Note that there may be multiple slices within
282 * the container, meaning multiple compression headers to manipulate.
283 * Changing RG may change the size of the compression header and
284 * therefore the length field in the container. Hence we rewrite all
285 * blocks just in case and also emit the adjusted container.
286 *
287 * The current implementation can only cope with renumbering a single
288 * RG (and only then if it is using HUFFMAN or BETA codecs). In
289 * theory it *may* be possible to renumber multiple RGs if they use
290 * HUFFMAN to the CORE block or use an external block unshared by any
291 * other data series. So we have an API that can be upgraded to
292 * support this, but do not implement it for now. An example
293 * implementation of RG as an EXTERNAL block would be to find that
294 * block and rewrite it, returning the number of blocks consumed.
295 *
296 * Returns 0 on success;
297 * -1 if unable to edit;
298 * -2 on other errors (eg I/O).
299 */
cram_transcode_rg(cram_fd * in,cram_fd * out,cram_container * c,int nrg,int * in_rg,int * out_rg)300 int cram_transcode_rg(cram_fd *in, cram_fd *out,
301 cram_container *c,
302 int nrg, int *in_rg, int *out_rg) {
303 int new_rg = *out_rg, old_size, new_size;
304 cram_block *o_blk, *n_blk;
305 cram_block_compression_hdr *ch;
306
307 if (nrg != 1) {
308 hts_log_error("CRAM transcode supports only a single RG");
309 return -2;
310 }
311
312 // Produce a new block holding the updated compression header,
313 // with RG transcoded to a new value. (Single only supported.)
314 o_blk = cram_read_block(in);
315 old_size = cram_block_size(o_blk);
316 ch = cram_decode_compression_header(in, o_blk);
317 if (cram_block_compression_hdr_set_rg(ch, new_rg) != 0)
318 return -1;
319 if (cram_block_compression_hdr_decoder2encoder(in, ch) != 0)
320 return -1;
321 n_blk = cram_encode_compression_header(in, c, ch);
322 cram_free_compression_header(ch);
323
324 /*
325 * Warning: this has internal knowledge of the cram compression
326 * header format.
327 *
328 * The decoder doesn't set c->tags_used, so the encoder puts a two
329 * byte blank segment. This means n_blk is too short. We skip
330 * through the decoded old block (o_blk) and copy from there.
331 */
332 char *cp = cram_block_get_data(o_blk);
333 char *op = cp;
334 char *endp = cp + cram_block_get_uncomp_size(o_blk);
335 //fprintf(stderr, "sz = %d\n", (int)(endp-cp));
336 int32_t i32, err = 0;
337
338 i32 = in->vv.varint_get32(&cp, endp, &err);
339 cp += i32;
340 i32 = in->vv.varint_get32(&cp, endp, &err);
341 cp += i32;
342 op = cp;
343 i32 = in->vv.varint_get32(&cp, endp, &err);
344 i32 += (cp-op);
345 if (err)
346 return -2;
347
348 //fprintf(stderr, "remaining %d bytes\n", i32);
349 cram_block_set_size(n_blk, cram_block_get_size(n_blk)-2);
350 cram_block_append(n_blk, op, i32);
351 cram_block_update_size(n_blk);
352
353 new_size = cram_block_size(n_blk);
354
355 //fprintf(stderr, "size %d -> %d\n", old_size, new_size);
356
357 // Now we've constructedthe updated compression header,
358 // amend the container too (it may have changed size).
359 int32_t *landmarks, num_landmarks;
360 landmarks = cram_container_get_landmarks(c, &num_landmarks);
361
362 if (old_size != new_size) {
363 int diff = new_size - old_size, j;
364
365 for (j = 0; j < num_landmarks; j++)
366 landmarks[j] += diff;
367 //cram_container_set_landmarks(c, num_landmarks, landmarks);
368 cram_container_set_length(c, cram_container_get_length(c) + diff);
369 }
370
371 // Finally write it all out; container, compression header,
372 // and then all the remaining slice blocks.
373 if (cram_write_container(out, c) != 0)
374 return -2;
375
376 cram_write_block(out, n_blk);
377 cram_free_block(o_blk);
378 cram_free_block(n_blk);
379
380 // Container num_blocks can be invalid, due to a bug.
381 // Instead we iterate in slice context instead.
382 return cram_copy_slice(in, out, num_landmarks);
383 }
384
385
386 /*!
387 * Returns the refs_t structure used by a cram file handle.
388 *
389 * This may be used in conjunction with option CRAM_OPT_SHARED_REF to
390 * share reference memory between multiple file handles.
391 *
392 * @return
393 * Returns NULL if none exists or the file handle is not a CRAM file.
394 */
cram_get_refs(htsFile * fd)395 refs_t *cram_get_refs(htsFile *fd) {
396 return fd->format.format == cram
397 ? fd->fp.cram->refs
398 : NULL;
399 }
400