1 /*
2 Copyright (c) 2015, 2018-2020 Genome Research Ltd.
3 Author: James Bonfield <jkb@sanger.ac.uk>
4 
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7 
8    1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
10 
11    2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
14 
15    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
18 
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30 
31 /*! \file
32  * External CRAM interface.
33  *
34  * Internally we're happy to use macros and to grub around in the cram
35  * structures.  This isn't very sustainable for an externally usable
36  * ABI though, so we have anonymous structs and accessor functions too
37  * to permit software such as samtools reheader to manipulate cram
38  * containers and blocks in a robust manner.
39  */
40 
41 #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
42 #include <config.h>
43 
44 #include "../htslib/hfile.h"
45 #include "cram.h"
46 
47 /*
48  *-----------------------------------------------------------------------------
49  * cram_fd
50  */
cram_fd_get_header(cram_fd * fd)51 sam_hdr_t *cram_fd_get_header(cram_fd *fd) { return fd->header; }
cram_fd_set_header(cram_fd * fd,sam_hdr_t * hdr)52 void cram_fd_set_header(cram_fd *fd, sam_hdr_t *hdr) { fd->header = hdr; }
53 
cram_fd_get_version(cram_fd * fd)54 int cram_fd_get_version(cram_fd *fd) { return fd->version; }
cram_fd_set_version(cram_fd * fd,int vers)55 void cram_fd_set_version(cram_fd *fd, int vers) { fd->version = vers; }
56 
cram_major_vers(cram_fd * fd)57 int cram_major_vers(cram_fd *fd) { return CRAM_MAJOR_VERS(fd->version); }
cram_minor_vers(cram_fd * fd)58 int cram_minor_vers(cram_fd *fd) { return CRAM_MINOR_VERS(fd->version); }
59 
cram_fd_get_fp(cram_fd * fd)60 hFILE *cram_fd_get_fp(cram_fd *fd) { return fd->fp; }
cram_fd_set_fp(cram_fd * fd,hFILE * fp)61 void cram_fd_set_fp(cram_fd *fd, hFILE *fp) { fd->fp = fp; }
62 
63 
64 /*
65  *-----------------------------------------------------------------------------
66  * cram_container
67  */
cram_container_get_length(cram_container * c)68 int32_t cram_container_get_length(cram_container *c) {
69     return c->length;
70 }
71 
cram_container_set_length(cram_container * c,int32_t length)72 void cram_container_set_length(cram_container *c, int32_t length) {
73     c->length = length;
74 }
75 
76 
cram_container_get_num_blocks(cram_container * c)77 int32_t cram_container_get_num_blocks(cram_container *c) {
78     return c->num_blocks;
79 }
80 
cram_container_set_num_blocks(cram_container * c,int32_t num_blocks)81 void cram_container_set_num_blocks(cram_container *c, int32_t num_blocks) {
82     c->num_blocks = num_blocks;
83 }
84 
85 
86 /* Returns the landmarks[] array and the number of elements
87  * in num_landmarks.
88  */
cram_container_get_landmarks(cram_container * c,int32_t * num_landmarks)89 int32_t *cram_container_get_landmarks(cram_container *c, int32_t *num_landmarks) {
90     *num_landmarks = c->num_landmarks;
91     return c->landmark;
92 }
93 
94 /* Sets the landmarks[] array (pointer copy, not a memory dup) and
95  * num_landmarks value.
96  */
cram_container_set_landmarks(cram_container * c,int32_t num_landmarks,int32_t * landmarks)97 void cram_container_set_landmarks(cram_container *c, int32_t num_landmarks,
98                                   int32_t *landmarks) {
99     c->num_landmarks = num_landmarks;
100     c->landmark = landmarks;
101 }
102 
103 
104 /* Returns true if the container is empty (EOF marker) */
cram_container_is_empty(cram_fd * fd)105 int cram_container_is_empty(cram_fd *fd) {
106     return fd->empty_container;
107 }
108 
109 
110 /*
111  *-----------------------------------------------------------------------------
112  * cram_block_compression_hdr
113  */
114 
115 /*
116  * Utility function to edit an RG id.
117  * This is only possible if there is one single RG value used and it
118  * is in the container compression header using HUFFMAN or BETA
119  * codec.  In this case it is essentially hard coded and needs no
120  * editing of external (or worse, CORE) blocks.
121  *
122  * Returns 0 on success
123  *        -1 on failure
124  */
125 // Or arbitrary set compression header constant?
126 
cram_block_compression_hdr_set_DS(cram_block_compression_hdr * ch,int ds,int new_rg)127 static int cram_block_compression_hdr_set_DS(cram_block_compression_hdr *ch,
128                                              int ds, int new_rg) {
129     if (!ch || !ch->codecs[ds])
130         return -1;
131 
132     switch (ch->codecs[ds]->codec) {
133     case E_HUFFMAN:
134         if (ch->codecs[ds]->u.huffman.ncodes != 1)
135             return -1;
136         ch->codecs[ds]->u.huffman.codes[0].symbol = new_rg;
137         return 0;
138 
139     case E_BETA:
140         if (ch->codecs[ds]->u.beta.nbits != 0)
141             return -1;
142         ch->codecs[ds]->u.beta.offset = -new_rg;
143         return 0;
144 
145     default:
146         break;
147     }
148 
149     return -1;
150 }
151 
cram_block_compression_hdr_set_rg(cram_block_compression_hdr * ch,int new_rg)152 int cram_block_compression_hdr_set_rg(cram_block_compression_hdr *ch, int new_rg) {
153     return cram_block_compression_hdr_set_DS(ch, DS_RG, new_rg);
154 }
155 
156 /*
157  * Converts a cram_block_compression_hdr struct used for decoding to
158  * one used for encoding.  Maybe this should be a transparent
159  * operation applied on-demand.
160  *
161  * Returns 0 on success
162  *        -1 on failure
163  */
cram_block_compression_hdr_decoder2encoder(cram_fd * fd,cram_block_compression_hdr * ch)164 int cram_block_compression_hdr_decoder2encoder(cram_fd *fd,
165                                                cram_block_compression_hdr *ch) {
166     int i;
167 
168     if (!ch)
169         return -1;
170 
171     for (i = 0; i < DS_END; i++) {
172         cram_codec *co = ch->codecs[i];
173         if (!co)
174             continue;
175 
176         if (-1 == cram_codec_decoder2encoder(fd, co))
177             return -1;
178     }
179 
180     return 0;
181 }
182 
183 /*
184  *-----------------------------------------------------------------------------
185  * cram_slice
186  */
cram_slice_hdr_get_num_blocks(cram_block_slice_hdr * hdr)187 int32_t cram_slice_hdr_get_num_blocks(cram_block_slice_hdr *hdr) {
188     return hdr->num_blocks;
189 }
190 
191 
192 /*
193  *-----------------------------------------------------------------------------
194  * cram_block
195  */
cram_block_get_content_id(cram_block * b)196 int32_t cram_block_get_content_id(cram_block *b)  { return b->content_id; }
cram_block_get_comp_size(cram_block * b)197 int32_t cram_block_get_comp_size(cram_block *b)   { return b->comp_size; }
cram_block_get_uncomp_size(cram_block * b)198 int32_t cram_block_get_uncomp_size(cram_block *b) { return b->uncomp_size; }
cram_block_get_crc32(cram_block * b)199 int32_t cram_block_get_crc32(cram_block *b)       { return b->crc32; }
cram_block_get_data(cram_block * b)200 void *  cram_block_get_data(cram_block *b)        { return BLOCK_DATA(b); }
cram_block_get_size(cram_block * b)201 int32_t cram_block_get_size(cram_block *b)        { return BLOCK_SIZE(b); }
cram_block_get_content_type(cram_block * b)202 enum cram_content_type cram_block_get_content_type(cram_block *b) {
203     return b->content_type;
204 }
205 
cram_block_set_content_id(cram_block * b,int32_t id)206 void cram_block_set_content_id(cram_block *b, int32_t id) { b->content_id = id; }
cram_block_set_comp_size(cram_block * b,int32_t size)207 void cram_block_set_comp_size(cram_block *b, int32_t size) { b->comp_size = size; }
cram_block_set_uncomp_size(cram_block * b,int32_t size)208 void cram_block_set_uncomp_size(cram_block *b, int32_t size) { b->uncomp_size = size; }
cram_block_set_crc32(cram_block * b,int32_t crc)209 void cram_block_set_crc32(cram_block *b, int32_t crc) { b->crc32 = crc; }
cram_block_set_data(cram_block * b,void * data)210 void cram_block_set_data(cram_block *b, void *data) { BLOCK_DATA(b) = data; }
cram_block_set_size(cram_block * b,int32_t size)211 void cram_block_set_size(cram_block *b, int32_t size) { BLOCK_SIZE(b) = size; }
212 
cram_block_append(cram_block * b,const void * data,int size)213 int cram_block_append(cram_block *b, const void *data, int size) {
214     BLOCK_APPEND(b, data, size);
215     return 0;
216 
217  block_err:
218     return -1;
219 }
cram_block_update_size(cram_block * b)220 void cram_block_update_size(cram_block *b) { BLOCK_UPLEN(b); }
221 
222 // Offset is known as "size" internally, but it can be confusing.
cram_block_get_offset(cram_block * b)223 size_t cram_block_get_offset(cram_block *b) { return BLOCK_SIZE(b); }
cram_block_set_offset(cram_block * b,size_t offset)224 void cram_block_set_offset(cram_block *b, size_t offset) { BLOCK_SIZE(b) = offset; }
225 
226 
227 /*
228  * Copies the blocks representing the next num_slice slices from a
229  * container from 'in' to 'out'.  It is expected that the file pointer
230  * is just after the read of the cram_container and cram compression
231  * header.
232  *
233  * Returns 0 on success
234  *        -1 on failure
235  */
cram_copy_slice(cram_fd * in,cram_fd * out,int32_t num_slice)236 int cram_copy_slice(cram_fd *in, cram_fd *out, int32_t num_slice) {
237     int32_t i, j;
238 
239     for (i = 0; i < num_slice; i++) {
240         cram_block *blk;
241         cram_block_slice_hdr *hdr;
242 
243         if (!(blk = cram_read_block(in)))
244             return -1;
245         if (!(hdr = cram_decode_slice_header(in, blk))) {
246             cram_free_block(blk);
247             return -1;
248         }
249         if (cram_write_block(out, blk) != 0) {
250             cram_free_block(blk);
251             return -1;
252         }
253         cram_free_block(blk);
254 
255         int num_blocks = cram_slice_hdr_get_num_blocks(hdr);
256         for (j = 0; j < num_blocks; j++) {
257             blk = cram_read_block(in);
258             if (!blk || cram_write_block(out, blk) != 0) {
259                 if (blk) cram_free_block(blk);
260                 return -1;
261             }
262             cram_free_block(blk);
263         }
264         cram_free_slice_header(hdr);
265     }
266 
267     return 0;
268 }
269 
270 /*
271  * Renumbers RG numbers in a cram compression header.
272  *
273  * CRAM stores RG as the Nth number in the header, rather than a
274  * string holding the ID: tag.  This is smaller in space, but means
275  * "samtools cat" to join files together that contain single but
276  * different RG lines needs a way of renumbering them.
277  *
278  * The file descriptor is expected to be immediately after the
279  * cram_container structure (ie before the cram compression header).
280  * Due to the nature of the CRAM format, this needs to read and write
281  * the blocks itself.  Note that there may be multiple slices within
282  * the container, meaning multiple compression headers to manipulate.
283  * Changing RG may change the size of the compression header and
284  * therefore the length field in the container.  Hence we rewrite all
285  * blocks just in case and also emit the adjusted container.
286  *
287  * The current implementation can only cope with renumbering a single
288  * RG (and only then if it is using HUFFMAN or BETA codecs).  In
289  * theory it *may* be possible to renumber multiple RGs if they use
290  * HUFFMAN to the CORE block or use an external block unshared by any
291  * other data series.  So we have an API that can be upgraded to
292  * support this, but do not implement it for now.  An example
293  * implementation of RG as an EXTERNAL block would be to find that
294  * block and rewrite it, returning the number of blocks consumed.
295  *
296  * Returns 0 on success;
297  *        -1 if unable to edit;
298  *        -2 on other errors (eg I/O).
299  */
cram_transcode_rg(cram_fd * in,cram_fd * out,cram_container * c,int nrg,int * in_rg,int * out_rg)300 int cram_transcode_rg(cram_fd *in, cram_fd *out,
301                       cram_container *c,
302                       int nrg, int *in_rg, int *out_rg) {
303     int new_rg = *out_rg, old_size, new_size;
304     cram_block *o_blk, *n_blk;
305     cram_block_compression_hdr *ch;
306 
307     if (nrg != 1) {
308         hts_log_error("CRAM transcode supports only a single RG");
309         return -2;
310     }
311 
312     // Produce a new block holding the updated compression header,
313     // with RG transcoded to a new value. (Single only supported.)
314     o_blk = cram_read_block(in);
315     old_size = cram_block_size(o_blk);
316     ch = cram_decode_compression_header(in, o_blk);
317     if (cram_block_compression_hdr_set_rg(ch, new_rg) != 0)
318         return -1;
319     if (cram_block_compression_hdr_decoder2encoder(in, ch) != 0)
320         return -1;
321     n_blk = cram_encode_compression_header(in, c, ch);
322     cram_free_compression_header(ch);
323 
324     /*
325      * Warning: this has internal knowledge of the cram compression
326      * header format.
327      *
328      * The decoder doesn't set c->tags_used, so the encoder puts a two
329      * byte blank segment.  This means n_blk is too short.  We skip
330      * through the decoded old block (o_blk) and copy from there.
331      */
332     char *cp = cram_block_get_data(o_blk);
333     char *op = cp;
334     char *endp = cp + cram_block_get_uncomp_size(o_blk);
335     //fprintf(stderr, "sz = %d\n", (int)(endp-cp));
336     int32_t i32, err = 0;
337 
338     i32 = in->vv.varint_get32(&cp, endp, &err);
339     cp += i32;
340     i32 = in->vv.varint_get32(&cp, endp, &err);
341     cp += i32;
342     op = cp;
343     i32 = in->vv.varint_get32(&cp, endp, &err);
344     i32 += (cp-op);
345     if (err)
346         return -2;
347 
348     //fprintf(stderr, "remaining %d bytes\n", i32);
349     cram_block_set_size(n_blk, cram_block_get_size(n_blk)-2);
350     cram_block_append(n_blk, op, i32);
351     cram_block_update_size(n_blk);
352 
353     new_size = cram_block_size(n_blk);
354 
355     //fprintf(stderr, "size %d -> %d\n", old_size, new_size);
356 
357     // Now we've constructedthe updated compression header,
358     // amend the container too (it may have changed size).
359     int32_t *landmarks, num_landmarks;
360     landmarks = cram_container_get_landmarks(c, &num_landmarks);
361 
362     if (old_size != new_size) {
363         int diff = new_size - old_size, j;
364 
365         for (j = 0; j < num_landmarks; j++)
366             landmarks[j] += diff;
367         //cram_container_set_landmarks(c, num_landmarks, landmarks);
368         cram_container_set_length(c, cram_container_get_length(c) + diff);
369     }
370 
371     // Finally write it all out; container, compression header,
372     // and then all the remaining slice blocks.
373     if (cram_write_container(out, c) != 0)
374         return -2;
375 
376     cram_write_block(out, n_blk);
377     cram_free_block(o_blk);
378     cram_free_block(n_blk);
379 
380     // Container num_blocks can be invalid, due to a bug.
381     // Instead we iterate in slice context instead.
382     return cram_copy_slice(in, out, num_landmarks);
383 }
384 
385 
386 /*!
387  * Returns the refs_t structure used by a cram file handle.
388  *
389  * This may be used in conjunction with option CRAM_OPT_SHARED_REF to
390  * share reference memory between multiple file handles.
391  *
392  * @return
393  * Returns NULL if none exists or the file handle is not a CRAM file.
394  */
cram_get_refs(htsFile * fd)395 refs_t *cram_get_refs(htsFile *fd) {
396     return fd->format.format == cram
397         ? fd->fp.cram->refs
398         : NULL;
399 }
400