1 /*
2 Copyright (c) 2015 Genome Research Ltd.
3 Author: James Bonfield <jkb@sanger.ac.uk>
4 
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7 
8    1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
10 
11    2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
14 
15    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
18 
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30 
31 /*! \file
32  * External CRAM interface.
33  *
34  * Internally we're happy to use macros and to grub around in the cram
35  * structures.  This isn't very sustainable for an externally usable
36  * ABI though, so we have anonymous structs and accessor functions too
37  * to permit software such as samtools reheader to manipulate cram
38  * containers and blocks in a robust manner.
39  */
40 
41 #include <config.h>
42 
43 #include "htslib/hfile.h"
44 #include "cram/cram.h"
45 
46 /*
47  *-----------------------------------------------------------------------------
48  * cram_fd
49  */
cram_fd_get_header(cram_fd * fd)50 SAM_hdr *cram_fd_get_header(cram_fd *fd) { return fd->header; }
cram_fd_set_header(cram_fd * fd,SAM_hdr * hdr)51 void cram_fd_set_header(cram_fd *fd, SAM_hdr *hdr) { fd->header = hdr; }
52 
cram_fd_get_version(cram_fd * fd)53 int cram_fd_get_version(cram_fd *fd) { return fd->version; }
cram_fd_set_version(cram_fd * fd,int vers)54 void cram_fd_set_version(cram_fd *fd, int vers) { fd->version = vers; }
55 
cram_major_vers(cram_fd * fd)56 int cram_major_vers(cram_fd *fd) { return CRAM_MAJOR_VERS(fd->version); }
cram_minor_vers(cram_fd * fd)57 int cram_minor_vers(cram_fd *fd) { return CRAM_MINOR_VERS(fd->version); }
58 
cram_fd_get_fp(cram_fd * fd)59 hFILE *cram_fd_get_fp(cram_fd *fd) { return fd->fp; }
cram_fd_set_fp(cram_fd * fd,hFILE * fp)60 void cram_fd_set_fp(cram_fd *fd, hFILE *fp) { fd->fp = fp; }
61 
62 
63 /*
64  *-----------------------------------------------------------------------------
65  * cram_container
66  */
cram_container_get_length(cram_container * c)67 int32_t cram_container_get_length(cram_container *c) {
68     return c->length;
69 }
70 
cram_container_set_length(cram_container * c,int32_t length)71 void cram_container_set_length(cram_container *c, int32_t length) {
72     c->length = length;
73 }
74 
75 
cram_container_get_num_blocks(cram_container * c)76 int32_t cram_container_get_num_blocks(cram_container *c) {
77     return c->num_blocks;
78 }
79 
cram_container_set_num_blocks(cram_container * c,int32_t num_blocks)80 void cram_container_set_num_blocks(cram_container *c, int32_t num_blocks) {
81     c->num_blocks = num_blocks;
82 }
83 
84 
85 /* Returns the landmarks[] array and the number of elements
86  * in num_landmarks.
87  */
cram_container_get_landmarks(cram_container * c,int32_t * num_landmarks)88 int32_t *cram_container_get_landmarks(cram_container *c, int32_t *num_landmarks) {
89     *num_landmarks = c->num_landmarks;
90     return c->landmark;
91 }
92 
93 /* Sets the landmarks[] array (pointer copy, not a memory dup) and
94  * num_landmarks value.
95  */
cram_container_set_landmarks(cram_container * c,int32_t num_landmarks,int32_t * landmarks)96 void cram_container_set_landmarks(cram_container *c, int32_t num_landmarks,
97 				  int32_t *landmarks) {
98     c->num_landmarks = num_landmarks;
99     c->landmark = landmarks;
100 }
101 
102 
103 /* Returns true if the container is empty (EOF marker) */
cram_container_is_empty(cram_fd * fd)104 int cram_container_is_empty(cram_fd *fd) {
105     return fd->empty_container;
106 }
107 
108 
109 /*
110  *-----------------------------------------------------------------------------
111  * cram_block_compression_hdr
112  */
113 
114 /*
115  * Utility function to edit an RG id.
116  * This is only possible if there is one single RG value used and it
117  * is in the container compression header using HUFFMAN or BETA
118  * codec.  In this case it is essentially hard coded and needs no
119  * editing of external (or worse, CORE) blocks.
120  *
121  * Returns 0 on success
122  *        -1 on failure
123  */
124 // Or arbitrary set compression header constant?
125 
cram_block_compression_hdr_set_DS(cram_block_compression_hdr * ch,int ds,int new_rg)126 static int cram_block_compression_hdr_set_DS(cram_block_compression_hdr *ch,
127 					     int ds, int new_rg) {
128     if (!ch || !ch->codecs[ds])
129 	return -1;
130 
131     switch (ch->codecs[ds]->codec) {
132     case E_HUFFMAN:
133 	if (ch->codecs[ds]->huffman.ncodes != 1)
134 	    return -1;
135 	ch->codecs[ds]->huffman.codes[0].symbol = new_rg;
136 	return 0;
137 
138     case E_BETA:
139 	if (ch->codecs[ds]->beta.nbits != 0)
140 	    return -1;
141 	ch->codecs[ds]->beta.offset = -new_rg;
142 	return 0;
143 
144     default:
145 	return -1;
146     }
147 
148     return 0;
149 }
150 
cram_block_compression_hdr_set_rg(cram_block_compression_hdr * ch,int new_rg)151 int cram_block_compression_hdr_set_rg(cram_block_compression_hdr *ch, int new_rg) {
152     return cram_block_compression_hdr_set_DS(ch, DS_RG, new_rg);
153 }
154 
155 /*
156  * Converts a cram_block_compression_hdr struct used for decoding to
157  * one used for encoding.  Maybe this should be a transparent
158  * operation applied on-demand.
159  *
160  * Returns 0 on success
161  *        -1 on failure
162  */
cram_block_compression_hdr_decoder2encoder(cram_fd * fd,cram_block_compression_hdr * ch)163 int cram_block_compression_hdr_decoder2encoder(cram_fd *fd,
164 					       cram_block_compression_hdr *ch) {
165     int i;
166 
167     if (!ch)
168 	return -1;
169 
170     for (i = 0; i < DS_END; i++) {
171 	cram_codec *co = ch->codecs[i];
172 	if (!co)
173 	    continue;
174 
175 	if (-1 == cram_codec_decoder2encoder(fd, co))
176 	    return -1;
177     }
178 
179     return 0;
180 }
181 
182 /*
183  *-----------------------------------------------------------------------------
184  * cram_slice
185  */
cram_slice_hdr_get_num_blocks(cram_block_slice_hdr * hdr)186 int32_t cram_slice_hdr_get_num_blocks(cram_block_slice_hdr *hdr) {
187     return hdr->num_blocks;
188 }
189 
190 
191 /*
192  *-----------------------------------------------------------------------------
193  * cram_block
194  */
cram_block_get_content_id(cram_block * b)195 int32_t cram_block_get_content_id(cram_block *b)  { return b->content_id; }
cram_block_get_comp_size(cram_block * b)196 int32_t cram_block_get_comp_size(cram_block *b)   { return b->comp_size; }
cram_block_get_uncomp_size(cram_block * b)197 int32_t cram_block_get_uncomp_size(cram_block *b) { return b->uncomp_size; }
cram_block_get_crc32(cram_block * b)198 int32_t cram_block_get_crc32(cram_block *b)       { return b->crc32; }
cram_block_get_data(cram_block * b)199 void *  cram_block_get_data(cram_block *b)        { return BLOCK_DATA(b); }
cram_block_get_size(cram_block * b)200 int32_t cram_block_get_size(cram_block *b)        { return BLOCK_SIZE(b); }
cram_block_get_content_type(cram_block * b)201 enum cram_content_type cram_block_get_content_type(cram_block *b) {
202     return b->content_type;
203 }
204 
cram_block_set_content_id(cram_block * b,int32_t id)205 void cram_block_set_content_id(cram_block *b, int32_t id) { b->content_id = id; }
cram_block_set_comp_size(cram_block * b,int32_t size)206 void cram_block_set_comp_size(cram_block *b, int32_t size) { b->comp_size = size; }
cram_block_set_uncomp_size(cram_block * b,int32_t size)207 void cram_block_set_uncomp_size(cram_block *b, int32_t size) { b->uncomp_size = size; }
cram_block_set_crc32(cram_block * b,int32_t crc)208 void cram_block_set_crc32(cram_block *b, int32_t crc) { b->crc32 = crc; }
cram_block_set_data(cram_block * b,void * data)209 void cram_block_set_data(cram_block *b, void *data) { BLOCK_DATA(b) = data; }
cram_block_set_size(cram_block * b,int32_t size)210 void cram_block_set_size(cram_block *b, int32_t size) { BLOCK_SIZE(b) = size; }
211 
cram_block_append(cram_block * b,void * data,int size)212 int cram_block_append(cram_block *b, void *data, int size) {
213     BLOCK_APPEND(b, data, size);
214     return BLOCK_DATA(b) ? 0 : -1; // It'll do for now...
215 }
cram_block_update_size(cram_block * b)216 void cram_block_update_size(cram_block *b) { BLOCK_UPLEN(b); }
217 
218 // Offset is known as "size" internally, but it can be confusing.
cram_block_get_offset(cram_block * b)219 size_t cram_block_get_offset(cram_block *b) { return BLOCK_SIZE(b); }
cram_block_set_offset(cram_block * b,size_t offset)220 void cram_block_set_offset(cram_block *b, size_t offset) { BLOCK_SIZE(b) = offset; }
221 
222 
223 /*
224  * Copies the blocks representing the next num_slice slices from a
225  * container from 'in' to 'out'.  It is expected that the file pointer
226  * is just after the read of the cram_container and cram compression
227  * header.
228  *
229  * Returns 0 on success
230  *        -1 on failure
231  */
cram_copy_slice(cram_fd * in,cram_fd * out,int32_t num_slice)232 int cram_copy_slice(cram_fd *in, cram_fd *out, int32_t num_slice) {
233     int32_t i, j;
234 
235     for (i = 0; i < num_slice; i++) {
236 	cram_block *blk;
237 	cram_block_slice_hdr *hdr;
238 
239 	if (!(blk = cram_read_block(in)))
240 	    return -1;
241 	if (!(hdr = cram_decode_slice_header(in, blk))) {
242 	    cram_free_block(blk);
243 	    return -1;
244 	}
245 	if (cram_write_block(out, blk) != 0) {
246 	    cram_free_block(blk);
247 	    return -1;
248 	}
249 	cram_free_block(blk);
250 
251 	int num_blocks = cram_slice_hdr_get_num_blocks(hdr);
252 	for (j = 0; j < num_blocks; j++) {
253 	    blk = cram_read_block(in);
254 	    if (!blk || cram_write_block(out, blk) != 0) {
255 		if (blk) cram_free_block(blk);
256 		return -1;
257 	    }
258 	    cram_free_block(blk);
259 	}
260 	cram_free_slice_header(hdr);
261     }
262 
263     return 0;
264 }
265 
266 /*
267  * Renumbers RG numbers in a cram compression header.
268  *
269  * CRAM stores RG as the Nth number in the header, rather than a
270  * string holding the ID: tag.  This is smaller in space, but means
271  * "samtools cat" to join files together that contain single but
272  * different RG lines needs a way of renumbering them.
273  *
274  * The file descriptor is expected to be immediately after the
275  * cram_container structure (ie before the cram compression header).
276  * Due to the nature of the CRAM format, this needs to read and write
277  * the blocks itself.  Note that there may be multiple slices within
278  * the container, meaning multiple compression headers to manipulate.
279  * Changing RG may change the size of the compression header and
280  * therefore the length field in the container.  Hence we rewrite all
281  * blocks just incase and also emit the adjusted container.
282  *
283  * The current implementation can only cope with renumbering a single
284  * RG (and only then if it is using HUFFMAN or BETA codecs).  In
285  * theory it *may* be possible to renumber multiple RGs if they use
286  * HUFFMAN to the CORE block or use an external block unshared by any
287  * other data series.  So we have an API that can be upgraded to
288  * support this, but do not implement it for now.  An example
289  * implementation of RG as an EXTERNAL block would be to find that
290  * block and rewrite it, returning the number of blocks consumed.
291  *
292  * Returns 0 on success;
293  *        -1 if unable to edit;
294  *        -2 on other errors (eg I/O).
295  */
cram_transcode_rg(cram_fd * in,cram_fd * out,cram_container * c,int nrg,int * in_rg,int * out_rg)296 int cram_transcode_rg(cram_fd *in, cram_fd *out,
297 		      cram_container *c,
298 		      int nrg, int *in_rg, int *out_rg) {
299     int new_rg = *out_rg, old_size, new_size;
300     cram_block *o_blk, *n_blk;
301     cram_block_compression_hdr *ch;
302 
303     if (nrg != 1) {
304 	hts_log_error("CRAM transcode supports only a single RG");
305 	return -2;
306     }
307 
308     // Produce a new block holding the updated compression header,
309     // with RG transcoded to a new value. (Single only supported.)
310     o_blk = cram_read_block(in);
311     old_size = cram_block_size(o_blk);
312     ch = cram_decode_compression_header(in, o_blk);
313     if (cram_block_compression_hdr_set_rg(ch, new_rg) != 0)
314 	return -1;
315     cram_block_compression_hdr_decoder2encoder(in, ch);
316     n_blk = cram_encode_compression_header(in, c, ch);
317     cram_free_compression_header(ch);
318 
319     /*
320      * Warning: this has internal knowledge of the cram compression
321      * header format.
322      *
323      * The decoder doesn't set c->tags_used, so the encoder puts a two
324      * byte blank segment.  This means n_blk is too short.  We skip
325      * through the decoded old block (o_blk) and copy from there.
326      */
327     char *cp = cram_block_get_data(o_blk);
328     char *op = cp;
329     char *endp = cp + cram_block_get_uncomp_size(o_blk);
330     //fprintf(stderr, "sz = %d\n", (int)(endp-cp));
331     int32_t i32;
332 
333     cp += safe_itf8_get(cp, endp, &i32);
334     cp += i32;
335     cp += safe_itf8_get(cp, endp, &i32);
336     cp += i32;
337     op = cp;
338     cp += safe_itf8_get(cp, endp, &i32);
339     i32 += (cp-op);
340 
341     //fprintf(stderr, "remaining %d bytes\n", i32);
342     cram_block_set_size(n_blk, cram_block_get_size(n_blk)-2);
343     cram_block_append(n_blk, op, i32);
344     cram_block_update_size(n_blk);
345 
346     new_size = cram_block_size(n_blk);
347 
348     //fprintf(stderr, "size %d -> %d\n", old_size, new_size);
349 
350     // Now we've constructedthe updated compression header,
351     // amend the container too (it may have changed size).
352     int32_t *landmarks, num_landmarks;
353     landmarks = cram_container_get_landmarks(c, &num_landmarks);
354 
355     if (old_size != new_size) {
356 	int diff = new_size - old_size, j;
357 
358 	for (j = 0; j < num_landmarks; j++)
359 	    landmarks[j] += diff;
360 	//cram_container_set_landmarks(c, num_landmarks, landmarks);
361 	cram_container_set_length(c, cram_container_get_length(c) + diff);
362     }
363 
364     // Finally write it all out; container, compression header,
365     // and then all the remaining slice blocks.
366     if (cram_write_container(out, c) != 0)
367 	return -2;
368 
369     cram_write_block(out, n_blk);
370     cram_free_block(o_blk);
371     cram_free_block(n_blk);
372 
373     // Container num_blocks can be invalid, due to a bug.
374     // Instead we iterate in slice context instead.
375     return cram_copy_slice(in, out, num_landmarks);
376 }
377 
378 
379 /*!
380  * Returns the refs_t structure used by a cram file handle.
381  *
382  * This may be used in conjunction with option CRAM_OPT_SHARED_REF to
383  * share reference memory between multiple file handles.
384  *
385  * @return
386  * Returns NULL if none exists or the file handle is not a CRAM file.
387  */
cram_get_refs(htsFile * fd)388 refs_t *cram_get_refs(htsFile *fd) {
389     return fd->format.format == cram
390         ? fd->fp.cram->refs
391         : NULL;
392 }
393