1 /*
2  * Copyright (c) 2013, 2014, 2015 Genome Research Ltd.
3  * Author(s): James Bonfield, Rob Davies
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  *    1. Redistributions of source code must retain the above copyright notice,
9  *       this list of conditions and the following disclaimer.
10  *
11  *    2. Redistributions in binary form must reproduce the above
12  *       copyright notice, this list of conditions and the following
13  *       disclaimer in the documentation and/or other materials provided
14  *       with the distribution.
15  *
16  *    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
17  *    Institute nor the names of its contributors may be used to endorse
18  *    or promote products derived from this software without specific
19  *    prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
22  * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
24  * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
25  * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 /*
35  * Author: James Bonfield, Wellcome Trust Sanger Institute. 2013
36  *
37  * - In-memory decoding of CRAM data structures.
38  * - Iterator for reading CRAM record by record.
39  */
40 
41 #ifdef HAVE_CONFIG_H
42 #include "io_lib_config.h"
43 #endif
44 
45 #include <stdio.h>
46 #include <errno.h>
47 #include <assert.h>
48 #include <stdlib.h>
49 #include <string.h>
50 #include <zlib.h>
51 #include <sys/types.h>
52 #include <sys/stat.h>
53 #include <math.h>
54 #include <ctype.h>
55 
56 #include "io_lib/cram.h"
57 #include "io_lib/os.h"
58 #include "io_lib/md5.h"
59 #include "io_lib/crc32.h"
60 
61 //Whether CIGAR has just M or uses = and X to indicate match and mismatch
62 //#define USE_X
63 
64 
65 /* ----------------------------------------------------------------------
66  * CRAM compression headers
67  */
68 
69 /*
70  * Decodes the Tag Dictionary record in the preservation map
71  * Updates the cram compression header.
72  *
73  * Returns number of bytes decoded on success
74  *        -1 on failure
75  */
cram_decode_TD(char * cp,const char * endp,cram_block_compression_hdr * h)76 int cram_decode_TD(char *cp, const char *endp, cram_block_compression_hdr *h) {
77     char *op = cp;
78     unsigned char *dat;
79     cram_block *b;
80     int32_t blk_size = 0;
81     int nTL, i, sz;
82 
83     if (!(b = cram_new_block(0, 0)))
84 	return -1;
85 
86     /* Decode */
87     cp += safe_itf8_get(cp, endp, &blk_size);
88     if (!blk_size) {
89 	h->nTL = 0;
90 	h->TL = NULL;
91 	cram_free_block(b);
92 	return cp - op;
93     }
94 
95     if (blk_size < 0 || endp - cp < blk_size) {
96         cram_free_block(b);
97 	return -1;
98     }
99 
100     BLOCK_APPEND(b, cp, blk_size);
101     cp += blk_size;
102     sz = cp - op;
103 
104     // Force nul termination if missing
105     if (BLOCK_DATA(b)[BLOCK_SIZE(b)-1])
106 	BLOCK_APPEND_CHAR(b, '\0');
107 
108     /* Set up TL lookup table */
109     dat = BLOCK_DATA(b);
110 
111     // Count
112     for (nTL = i = 0; i < BLOCK_SIZE(b); i++) {
113 	nTL++;
114 	while (dat[i])
115 	    i++;
116     }
117 
118     // Copy
119     h->nTL = nTL;
120     if (!(h->TL = calloc(h->nTL, sizeof(unsigned char *)))) {
121         cram_free_block(b);
122         return -1;
123     }
124     for (nTL = i = 0; i < BLOCK_SIZE(b); i++) {
125 	h->TL[nTL++] = &dat[i];
126 	while (dat[i])
127 	    i++;
128     }
129     h->TD_blk = b;
130 
131     return sz;
132 }
133 
134 /*
135  * Decodes a CRAM block compression header.
136  * Returns header ptr on success
137  *         NULL on failure
138  */
cram_decode_compression_header(cram_fd * fd,cram_block * b)139 cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd,
140 							   cram_block *b) {
141     char *cp, *endp, *cp_copy;
142     cram_block_compression_hdr *hdr = calloc(1, sizeof(*hdr));
143     int i;
144     int32_t map_size = 0, map_count = 0;
145 
146     if (!hdr)
147 	return NULL;
148 
149     if (b->method != RAW) {
150 	if (cram_uncompress_block(b)) {
151 	    free(hdr);
152 	    return NULL;
153 	}
154     }
155 
156     cp = (char *)b->data;
157     endp = cp + b->uncomp_size;
158 
159     if (IS_CRAM_1_VERS(fd)) {
160 	int32_t i32;
161 	cp += safe_itf8_get(cp, endp, &hdr->ref_seq_id);
162 	if (CRAM_MAJOR_VERS(fd->version) >= 4) {
163 	    int64_t i64;
164 	    cp += safe_ltf8_get(cp, endp, &i64); hdr->ref_seq_start=i64;
165 	    cp += safe_ltf8_get(cp, endp, &i64); hdr->ref_seq_span=i64;
166 	} else {
167 	    cp += safe_itf8_get(cp, endp, &i32); hdr->ref_seq_start=i32;
168 	    cp += safe_itf8_get(cp, endp, &i32); hdr->ref_seq_span=i32;
169 	}
170 	cp += safe_itf8_get(cp, endp, &hdr->num_records);
171 	cp += safe_itf8_get(cp, endp, &hdr->num_landmarks);
172 	if (!(hdr->landmark = malloc(hdr->num_landmarks * sizeof(int32_t)))) {
173 	    free(hdr);
174 	    return NULL;
175 	}
176 	for (i = 0; i < hdr->num_landmarks; i++) {
177 	    cp += safe_itf8_get(cp, endp, &hdr->landmark[i]);
178 	}
179     }
180 
181     hdr->preservation_map = HashTableCreate(4, HASH_NONVOLATILE_KEYS |
182 					    HASH_FUNC_TCL);
183     memset(hdr->rec_encoding_map, 0,
184 	   CRAM_MAP_HASH * sizeof(hdr->rec_encoding_map[0]));
185     memset(hdr->tag_encoding_map, 0,
186 	   CRAM_MAP_HASH * sizeof(hdr->tag_encoding_map[0]));
187 
188     if (!hdr->preservation_map) {
189 	cram_free_compression_header(hdr);
190 	return NULL;
191     }
192 
193     /* Initialise defaults for preservation map */
194     hdr->mapped_qs_included = 0;
195     hdr->unmapped_qs_included = 0;
196     hdr->unmapped_placed = 0;
197     hdr->qs_included = 0;
198     hdr->read_names_included = 0;
199     hdr->AP_delta = 1;
200     memcpy(hdr->substitution_matrix, "CGTNAGTNACTNACGNACGT", 20);
201 
202     /* Preservation map */
203     cp += safe_itf8_get(cp, endp, &map_size); cp_copy = cp;
204     cp += safe_itf8_get(cp, endp, &map_count);
205     for (i = 0; i < map_count; i++) {
206 	HashData hd;
207 	if (endp - cp < 2) {
208 	    cram_free_compression_header(hdr);
209 	    return NULL;
210 	}
211 	cp += 2;
212 	switch(CRAM_KEY(cp[-2],cp[-1])) {
213 	case CRAM_KEY('M','I'):
214 	    hd.i = *cp++;
215 	    if (!HashTableAdd(hdr->preservation_map, "MI", 2, hd, NULL)) {
216 		cram_free_compression_header(hdr);
217 		return NULL;
218 	    }
219 	    hdr->mapped_qs_included = hd.i;
220 	    break;
221 
222 	case CRAM_KEY('U','I'):
223 	    hd.i = *cp++;
224 	    if (!HashTableAdd(hdr->preservation_map, "UI", 2, hd, NULL)) {
225 		cram_free_compression_header(hdr);
226 		return NULL;
227 	    }
228 	    hdr->unmapped_qs_included = hd.i;
229 	    break;
230 
231 	case CRAM_KEY('P','I'):
232 	    hd.i = *cp++;
233 	    if (!HashTableAdd(hdr->preservation_map, "PI", 2, hd, NULL)) {
234 		cram_free_compression_header(hdr);
235 		return NULL;
236 	    }
237 	    hdr->unmapped_placed = hd.i;
238 	    break;
239 
240 	case CRAM_KEY('R','N'):
241 	    hd.i = *cp++;
242 	    if (!HashTableAdd(hdr->preservation_map, "RN", 2, hd, NULL)) {
243 		cram_free_compression_header(hdr);
244 		return NULL;
245 	    }
246 	    hdr->read_names_included = hd.i;
247 	    break;
248 
249 	case CRAM_KEY('A','P'):
250 	    hd.i = *cp++;
251 	    if (!HashTableAdd(hdr->preservation_map, "AP", 2, hd, NULL)) {
252 		cram_free_compression_header(hdr);
253 		return NULL;
254 	    }
255 	    hdr->AP_delta = hd.i;
256 	    break;
257 
258 	case CRAM_KEY('R','R'):
259 	    hd.i = *cp++;
260 	    if (!HashTableAdd(hdr->preservation_map, "RR", 2, hd, NULL)) {
261 		cram_free_compression_header(hdr);
262 		return NULL;
263 	    }
264 	    hdr->no_ref = !hd.i;
265 	    break;
266 
267 	case CRAM_KEY('S','M'):
268 	    if (endp - cp < 5) {
269 	        cram_free_compression_header(hdr);
270 		return NULL;
271 	    }
272 	    hdr->substitution_matrix[0][(cp[0]>>6)&3] = 'C';
273 	    hdr->substitution_matrix[0][(cp[0]>>4)&3] = 'G';
274 	    hdr->substitution_matrix[0][(cp[0]>>2)&3] = 'T';
275 	    hdr->substitution_matrix[0][(cp[0]>>0)&3] = 'N';
276 
277 	    hdr->substitution_matrix[1][(cp[1]>>6)&3] = 'A';
278 	    hdr->substitution_matrix[1][(cp[1]>>4)&3] = 'G';
279 	    hdr->substitution_matrix[1][(cp[1]>>2)&3] = 'T';
280 	    hdr->substitution_matrix[1][(cp[1]>>0)&3] = 'N';
281 
282 	    hdr->substitution_matrix[2][(cp[2]>>6)&3] = 'A';
283 	    hdr->substitution_matrix[2][(cp[2]>>4)&3] = 'C';
284 	    hdr->substitution_matrix[2][(cp[2]>>2)&3] = 'T';
285 	    hdr->substitution_matrix[2][(cp[2]>>0)&3] = 'N';
286 
287 	    hdr->substitution_matrix[3][(cp[3]>>6)&3] = 'A';
288 	    hdr->substitution_matrix[3][(cp[3]>>4)&3] = 'C';
289 	    hdr->substitution_matrix[3][(cp[3]>>2)&3] = 'G';
290 	    hdr->substitution_matrix[3][(cp[3]>>0)&3] = 'N';
291 
292 	    hdr->substitution_matrix[4][(cp[4]>>6)&3] = 'A';
293 	    hdr->substitution_matrix[4][(cp[4]>>4)&3] = 'C';
294 	    hdr->substitution_matrix[4][(cp[4]>>2)&3] = 'G';
295 	    hdr->substitution_matrix[4][(cp[4]>>0)&3] = 'T';
296 	    hd.p = cp;
297 	    cp += 5;
298 	    if (!HashTableAdd(hdr->preservation_map, "SM", 2, hd, NULL)) {
299 		cram_free_compression_header(hdr);
300 		return NULL;
301 	    }
302 	    break;
303 
304 	case CRAM_KEY('T','D'): {
305 	    int sz = cram_decode_TD(cp, endp, hdr); // tag dictionary
306 	    if (sz < 0) {
307 		cram_free_compression_header(hdr);
308 		return NULL;
309 	    }
310 	    hd.p = cp;
311 	    cp += sz;
312 	    if (!HashTableAdd(hdr->preservation_map, "TD", 2, hd, NULL)) {
313 		cram_free_compression_header(hdr);
314 		return NULL;
315 	    }
316 	    break;
317 	}
318 
319 	default:
320 	    fprintf(stderr, "Unrecognised preservation map key %c%c\n",
321 		    cp[-2], cp[-1]);
322 	    // guess byte;
323 	    cp++;
324 	    break;
325 	}
326     }
327     if (cp - cp_copy != map_size) {
328 	cram_free_compression_header(hdr);
329 	return NULL;
330     }
331 
332     /* Record encoding map */
333     cp += safe_itf8_get(cp, endp, &map_size); cp_copy = cp;
334     cp += safe_itf8_get(cp, endp, &map_count);
335     int is_v4 = CRAM_MAJOR_VERS(fd->version) >= 4 ? 1 : 0;
336     for (i = 0; i < map_count; i++) {
337 	char *key = cp;
338 	int32_t encoding = E_NULL;
339 	int32_t size = 0;
340 	cram_map *m = malloc(sizeof(*m)); // FIXME: use pooled_alloc
341 
342 	if (!m || endp - cp < 4) {
343 	    free(m);
344 	    cram_free_compression_header(hdr);
345 	    return NULL;
346 	}
347 
348 	cp += 2;
349 	cp += safe_itf8_get(cp, endp, &encoding);
350 	cp += safe_itf8_get(cp, endp, &size);
351 
352 	// Fill out cram_map purely for cram_dump to dump out.
353 	m->key = (key[0]<<8)|key[1];
354 	m->encoding = encoding;
355 	m->size     = size;
356 	m->offset   = cp - (char *)b->data;
357 	m->codec = NULL;
358 
359 	if (m->encoding == E_NULL)
360 	    continue;
361 
362 	if (size < 0 || endp - cp < size) {
363 	    free(m);
364 	    cram_free_compression_header(hdr);
365 	    return NULL;
366 	}
367 
368 	//printf("%s codes for %.2s\n", cram_encoding2str(encoding), key);
369 
370 	/*
371 	 * For CRAM1.0 CF and BF are Byte and not Int.
372 	 * Practically speaking it makes no difference unless we have a
373 	 * 1.0 format file that stores these in EXTERNAL as only then
374 	 * does Byte vs Int matter.
375 	 *
376 	 * Neither this C code nor Java reference implementations did this,
377 	 * so we gloss over it and treat them as int.
378 	 */
379 
380 	if (key[0] == 'B' && key[1] == 'F') {
381 	    if (!(hdr->codecs[DS_BF] = cram_decoder_init(encoding, cp, size, E_INT,
382 						    fd->version))) {
383 		cram_free_compression_header(hdr);
384 		return NULL;
385 	    }
386 	} else if (key[0] == 'C' && key[1] == 'F') {
387 	    if (!(hdr->codecs[DS_CF] = cram_decoder_init(encoding, cp, size, E_INT,
388 						    fd->version))) {
389 		cram_free_compression_header(hdr);
390 		return NULL;
391 	    }
392 	} else if (key[0] == 'R' && key[1] == 'I') {
393 	    if (!(hdr->codecs[DS_RI] = cram_decoder_init(encoding, cp, size, E_INT,
394 						    fd->version))) {
395 		cram_free_compression_header(hdr);
396 		return NULL;
397 	    }
398 	} else if (key[0] == 'R' && key[1] == 'L') {
399 	    if (!(hdr->codecs[DS_RL] = cram_decoder_init(encoding, cp, size, E_INT,
400 						    fd->version))) {
401 		cram_free_compression_header(hdr);
402 		return NULL;
403 	    }
404 	} else if (key[0] == 'A' && key[1] == 'P') {
405 	    if (!(hdr->codecs[DS_AP] = cram_decoder_init(encoding, cp, size,
406 							 is_v4 ? E_LONG : E_INT,
407 							 fd->version))) {
408 		cram_free_compression_header(hdr);
409 		return NULL;
410 	    }
411 	} else if (key[0] == 'R' && key[1] == 'G') {
412 	    if (!(hdr->codecs[DS_RG] = cram_decoder_init(encoding, cp, size, E_INT,
413 						    fd->version))) {
414 		cram_free_compression_header(hdr);
415 		return NULL;
416 	    }
417 	} else if (key[0] == 'M' && key[1] == 'F') {
418 	    if (!(hdr->codecs[DS_MF] = cram_decoder_init(encoding, cp, size, E_INT,
419 						    fd->version))) {
420 		cram_free_compression_header(hdr);
421 		return NULL;
422 	    }
423 	} else if (key[0] == 'N' && key[1] == 'S') {
424 	    if (!(hdr->codecs[DS_NS] = cram_decoder_init(encoding, cp, size, E_INT,
425 						    fd->version))) {
426 		cram_free_compression_header(hdr);
427 		return NULL;
428 	    }
429 	} else if (key[0] == 'N' && key[1] == 'P') {
430 	    if (!(hdr->codecs[DS_NP] = cram_decoder_init(encoding, cp, size,
431 							 is_v4 ? E_LONG : E_INT,
432 							 fd->version))) {
433 		cram_free_compression_header(hdr);
434 		return NULL;
435 	    }
436 	} else if (key[0] == 'T' && key[1] == 'S') {
437 	    if (!(hdr->codecs[DS_TS] = cram_decoder_init(encoding, cp, size,
438 							 is_v4 ? E_LONG : E_INT,
439 							 fd->version))) {
440 		cram_free_compression_header(hdr);
441 		return NULL;
442 	    }
443 	} else if (key[0] == 'N' && key[1] == 'F') {
444 	    if (!(hdr->codecs[DS_NF] = cram_decoder_init(encoding, cp, size, E_INT,
445 						    fd->version))) {
446 		cram_free_compression_header(hdr);
447 		return NULL;
448 	    }
449 	} else if (key[0] == 'T' && key[1] == 'C') {
450 	    if (!(hdr->codecs[DS_TC] = cram_decoder_init(encoding, cp, size, E_BYTE,
451 						    fd->version))) {
452 		cram_free_compression_header(hdr);
453 		return NULL;
454 	    }
455 	} else if (key[0] == 'T' && key[1] == 'N') {
456 	    if (!(hdr->codecs[DS_TN] = cram_decoder_init(encoding, cp, size, E_INT,
457 						    fd->version))) {
458 		cram_free_compression_header(hdr);
459 		return NULL;
460 	    }
461 	} else if (key[0] == 'F' && key[1] == 'N') {
462 	    if (!(hdr->codecs[DS_FN] = cram_decoder_init(encoding, cp, size, E_INT,
463 						    fd->version))) {
464 		cram_free_compression_header(hdr);
465 		return NULL;
466 	    }
467 	} else if (key[0] == 'F' && key[1] == 'C') {
468 	    if (!(hdr->codecs[DS_FC] = cram_decoder_init(encoding, cp, size, E_BYTE,
469 						    fd->version))) {
470 		cram_free_compression_header(hdr);
471 		return NULL;
472 	    }
473 	} else if (key[0] == 'F' && key[1] == 'P') {
474 	    if (!(hdr->codecs[DS_FP] = cram_decoder_init(encoding, cp, size, E_INT,
475 						    fd->version))) {
476 		cram_free_compression_header(hdr);
477 		return NULL;
478 	    }
479 	} else if (key[0] == 'B' && key[1] == 'S') {
480 	    if (!(hdr->codecs[DS_BS] = cram_decoder_init(encoding, cp, size, E_BYTE,
481 						    fd->version))) {
482 		cram_free_compression_header(hdr);
483 		return NULL;
484 	    }
485 	} else if (key[0] == 'I' && key[1] == 'N') {
486 	    if (!(hdr->codecs[DS_IN] = cram_decoder_init(encoding, cp, size,
487 						    E_BYTE_ARRAY,
488 						    fd->version))) {
489 		cram_free_compression_header(hdr);
490 		return NULL;
491 	    }
492 	} else if (key[0] == 'S' && key[1] == 'C') {
493 	    if (!(hdr->codecs[DS_SC] = cram_decoder_init(encoding, cp, size,
494 						    E_BYTE_ARRAY,
495 						    fd->version))) {
496 		cram_free_compression_header(hdr);
497 		return NULL;
498 	    }
499 	} else if (key[0] == 'D' && key[1] == 'L') {
500 	    if (!(hdr->codecs[DS_DL] = cram_decoder_init(encoding, cp, size, E_INT,
501 						    fd->version))) {
502 		cram_free_compression_header(hdr);
503 		return NULL;
504 	    }
505 	} else if (key[0] == 'B' && key[1] == 'A') {
506 	    if (!(hdr->codecs[DS_BA] = cram_decoder_init(encoding, cp, size, E_BYTE,
507 						    fd->version))) {
508 		cram_free_compression_header(hdr);
509 		return NULL;
510 	    }
511 	} else if (key[0] == 'B' && key[1] == 'B') {
512 	    if (!(hdr->codecs[DS_BB] = cram_decoder_init(encoding, cp, size, E_BYTE_ARRAY,
513 						    fd->version))) {
514 		cram_free_compression_header(hdr);
515 		return NULL;
516 	    }
517 	} else if (key[0] == 'R' && key[1] == 'S') {
518 	    if (!(hdr->codecs[DS_RS] = cram_decoder_init(encoding, cp, size, E_INT,
519 						    fd->version))) {
520 		cram_free_compression_header(hdr);
521 		return NULL;
522 	    }
523 	} else if (key[0] == 'P' && key[1] == 'D') {
524 	    if (!(hdr->codecs[DS_PD] = cram_decoder_init(encoding, cp, size, E_INT,
525 						    fd->version))) {
526 		cram_free_compression_header(hdr);
527 		return NULL;
528 	    }
529 	} else if (key[0] == 'H' && key[1] == 'C') {
530 	    if (!(hdr->codecs[DS_HC] = cram_decoder_init(encoding, cp, size, E_INT,
531 						    fd->version))) {
532 		cram_free_compression_header(hdr);
533 		return NULL;
534 	    }
535 	} else if (key[0] == 'M' && key[1] == 'Q') {
536 	    if (!(hdr->codecs[DS_MQ] = cram_decoder_init(encoding, cp, size, E_INT,
537 						    fd->version))) {
538 		cram_free_compression_header(hdr);
539 		return NULL;
540 	    }
541 	} else if (key[0] == 'R' && key[1] == 'N') {
542 	    if (!(hdr->codecs[DS_RN] = cram_decoder_init(encoding, cp, size,
543 							 E_BYTE_ARRAY_BLOCK,
544 							 fd->version))) {
545 		cram_free_compression_header(hdr);
546 		return NULL;
547 	    }
548 	} else if (key[0] == 'Q' && key[1] == 'S') {
549 	    if (!(hdr->codecs[DS_QS] = cram_decoder_init(encoding, cp, size, E_BYTE,
550 						    fd->version))) {
551 		cram_free_compression_header(hdr);
552 		return NULL;
553 	    }
554 	} else if (key[0] == 'Q' && key[1] == 'Q') {
555 	    if (!(hdr->codecs[DS_QQ] = cram_decoder_init(encoding, cp, size, E_BYTE_ARRAY,
556 						    fd->version))) {
557 		cram_free_compression_header(hdr);
558 		return NULL;
559 	    }
560 	} else if (key[0] == 'T' && key[1] == 'L') {
561 	    if (!(hdr->codecs[DS_TL] = cram_decoder_init(encoding, cp, size, E_INT,
562 						    fd->version))) {
563 		cram_free_compression_header(hdr);
564 		return NULL;
565 	    }
566 	} else if (key[0] == 'T' && key[1] == 'M') {
567 	} else if (key[0] == 'T' && key[1] == 'V') {
568 	} else
569 	    fprintf(stderr, "Unrecognised key: %.2s\n", key);
570 
571 	cp += size;
572 
573 	m->next = hdr->rec_encoding_map[CRAM_MAP(key[0], key[1])];
574 	hdr->rec_encoding_map[CRAM_MAP(key[0], key[1])] = m;
575     }
576     if (cp - cp_copy != map_size) {
577 	cram_free_compression_header(hdr);
578 	return NULL;
579     }
580 
581     /* Tag encoding map */
582     cp += safe_itf8_get(cp, endp, &map_size); cp_copy = cp;
583     cp += safe_itf8_get(cp, endp, &map_count);
584     for (i = 0; i < map_count; i++) {
585 	int32_t encoding = E_NULL;
586 	int32_t size = 0;
587 	cram_map *m = malloc(sizeof(*m)); // FIXME: use pooled_alloc
588 	uint8_t key[3];
589 
590 	if (!m || endp - cp < 6) {
591 	    free(m);
592 	    cram_free_compression_header(hdr);
593 	    return NULL;
594 	}
595 
596 	cp += safe_itf8_get(cp, endp, &m->key);
597 	key[0] = m->key>>16;
598 	key[1] = m->key>>8;
599 	key[2] = m->key;
600 	cp += safe_itf8_get(cp, endp, &encoding);
601 	cp += safe_itf8_get(cp, endp, &size);
602 
603 	m->encoding = encoding;
604 	m->size     = size;
605 	m->offset   = cp - (char *)b->data;
606 	if (size < 0 || endp - cp < size ||
607 	    !(m->codec = cram_decoder_init(encoding, cp, size,
608 					   E_BYTE_ARRAY_BLOCK, fd->version))) {
609 	    cram_free_compression_header(hdr);
610 	    free(m);
611 	    return NULL;
612 	}
613 
614 	cp += size;
615 
616 	m->next = hdr->tag_encoding_map[CRAM_MAP(key[0],key[1])];
617 	hdr->tag_encoding_map[CRAM_MAP(key[0],key[1])] = m;
618     }
619     if (cp - cp_copy != map_size) {
620 	cram_free_compression_header(hdr);
621 	return NULL;
622     }
623 
624     return hdr;
625 }
626 
627 
628 /*
629  * Note we also need to scan through the record encoding map to
630  * see which data series share the same block, either external or
631  * CORE. For example if we need the BF data series but MQ and CF
632  * are also encoded in the same block then we need to add those in
633  * as a dependency in order to correctly decode BF.
634  *
635  * Returns 0 on success
636  *        -1 on failure
637  */
cram_dependent_data_series(cram_fd * fd,cram_block_compression_hdr * hdr,cram_slice * s)638 int cram_dependent_data_series(cram_fd *fd,
639 			       cram_block_compression_hdr *hdr,
640 			       cram_slice *s) {
641     int *block_used;
642     int core_used = 0;
643     int i;
644     static int i_to_id[] = {
645 	DS_BF, DS_AP, DS_FP, DS_RL, DS_DL, DS_NF, DS_BA, DS_QS,
646 	DS_FC, DS_FN, DS_BS, DS_IN, DS_RG, DS_MQ, DS_TL, DS_RN,
647 	DS_NS, DS_NP, DS_TS, DS_MF, DS_CF, DS_RI, DS_RS, DS_PD,
648 	DS_HC, DS_SC, DS_BB, DS_QQ,
649     };
650     uint32_t orig_ds;
651 
652     /*
653      * Set the data_series bit field based on fd->required_fields
654      * contents.
655      */
656     if (fd->required_fields && fd->required_fields != INT_MAX) {
657 	s->data_series = 0;
658 
659 	if (fd->required_fields & SAM_QNAME)
660 	    s->data_series |= CRAM_RN;
661 
662 	if (fd->required_fields & SAM_FLAG)
663 	    s->data_series |= CRAM_BF;
664 
665 	if (fd->required_fields & SAM_RNAME)
666 	    s->data_series |= CRAM_RI | CRAM_BF;
667 
668 	if (fd->required_fields & SAM_POS)
669 	    s->data_series |= CRAM_AP | CRAM_BF;
670 
671 	if (fd->required_fields & SAM_MAPQ)
672 	    s->data_series |= CRAM_MQ;
673 
674 	if (fd->required_fields & SAM_CIGAR)
675 	    s->data_series |= CRAM_CIGAR;
676 
677 	if (fd->required_fields & SAM_RNEXT)
678 	    s->data_series |= CRAM_CF | CRAM_NF | CRAM_RI | CRAM_NS |CRAM_BF;
679 
680 	if (fd->required_fields & SAM_PNEXT)
681 	    s->data_series |= CRAM_CF | CRAM_NF | CRAM_AP | CRAM_NP | CRAM_BF;
682 
683 	if (fd->required_fields & SAM_TLEN)
684 	    s->data_series |= CRAM_CF | CRAM_NF | CRAM_AP | CRAM_TS |
685 		CRAM_BF | CRAM_MF | CRAM_RI | CRAM_CIGAR;
686 
687 	if (fd->required_fields & SAM_SEQ)
688 	    s->data_series |= CRAM_SEQ;
689 
690 	if (!(fd->required_fields & SAM_AUX))
691 	    // No easy way to get MD/NM without other tags at present
692 	    s->decode_md = 0;
693 
694 	if (fd->required_fields & SAM_QUAL) {
695 	    s->data_series |= CRAM_QUAL;
696 	    if (CRAM_MAJOR_VERS(fd->version) >= 4)
697 		s->data_series |= CRAM_BF;
698 	}
699 
700 	if (fd->required_fields & SAM_AUX)
701 	    s->data_series |= CRAM_RG | CRAM_TL | CRAM_aux;
702 
703 	if (fd->required_fields & SAM_RGAUX)
704 	    s->data_series |= CRAM_RG | CRAM_BF;
705 
706 	// Always uncompress CORE block
707 	if (cram_uncompress_block(s->block[0]))
708 	    return -1;
709     } else {
710 	s->data_series = CRAM_ALL;
711 
712 	for (i = 0; i < s->hdr->num_blocks; i++) {
713 	    if (cram_uncompress_block(s->block[i]))
714 		return -1;
715 	}
716 
717 	return 0;
718     }
719 
720     block_used = calloc(s->hdr->num_blocks+1, sizeof(int));
721     if (!block_used)
722 	return -1;
723 
724     do {
725 	/*
726 	 * Also set data_series based on code prerequisites. Eg if we need
727 	 * CRAM_QS then we also need to know CRAM_RL so we know how long it
728 	 * is, or if we need FC/FP then we also need FN (number of features).
729 	 *
730 	 * It's not reciprocal though. We may be needing to decode FN
731 	 * but have no need to decode FC, FP and cigar ops.
732 	 */
733 	if (s->data_series & CRAM_RS)    s->data_series |= CRAM_FC|CRAM_FP;
734 	if (s->data_series & CRAM_PD)    s->data_series |= CRAM_FC|CRAM_FP;
735 	if (s->data_series & CRAM_HC)    s->data_series |= CRAM_FC|CRAM_FP;
736 	if (s->data_series & CRAM_QS)    s->data_series |= CRAM_FC|CRAM_FP;
737 	if (s->data_series & CRAM_IN)    s->data_series |= CRAM_FC|CRAM_FP;
738 	if (s->data_series & CRAM_SC)    s->data_series |= CRAM_FC|CRAM_FP;
739 	if (s->data_series & CRAM_BS)    s->data_series |= CRAM_FC|CRAM_FP;
740 	if (s->data_series & CRAM_DL)    s->data_series |= CRAM_FC|CRAM_FP;
741 	if (s->data_series & CRAM_BA)    s->data_series |= CRAM_FC|CRAM_FP;
742 	if (s->data_series & CRAM_BB)    s->data_series |= CRAM_FC|CRAM_FP;
743 	if (s->data_series & CRAM_QQ)    s->data_series |= CRAM_FC|CRAM_FP;
744 
745 	// cram_decode_seq() needs seq[] array
746 	if (s->data_series & (CRAM_SEQ|CRAM_CIGAR)) s->data_series |= CRAM_RL;
747 
748 	if (s->data_series & CRAM_FP)    s->data_series |= CRAM_FC;
749 	if (s->data_series & CRAM_FC)    s->data_series |= CRAM_FN;
750 	if (s->data_series & CRAM_aux)   s->data_series |= CRAM_TL;
751 	if (s->data_series & CRAM_MF)    s->data_series |= CRAM_CF;
752 	if (s->data_series & CRAM_MQ)    s->data_series |= CRAM_BF;
753 	if (s->data_series & CRAM_BS)    s->data_series |= CRAM_RI;
754 	if (s->data_series & (CRAM_MF |CRAM_NS |CRAM_NP |CRAM_TS |CRAM_NF))
755 	    s->data_series |= CRAM_CF;
756 	if (!hdr->read_names_included && s->data_series & CRAM_RN)
757 	    s->data_series |= CRAM_CF | CRAM_NF;
758 	if (s->data_series & (CRAM_BA | CRAM_QS | CRAM_BB | CRAM_QQ))
759 	    s->data_series |= CRAM_BF | CRAM_CF | CRAM_RL;
760 
761 	orig_ds = s->data_series;
762 
763 	// Find which blocks are in use.
764 	for (i = 0; i < sizeof(i_to_id)/sizeof(*i_to_id); i++) {
765 	    int bnum1, bnum2, j;
766 	    cram_codec *c = hdr->codecs[i_to_id[i]];
767 
768 	    if (!(s->data_series & (1<<i)))
769 		continue;
770 
771 	    if (!c)
772 		continue;
773 
774 	    bnum1 = cram_codec_to_id(c, &bnum2);
775 
776 	    for (;;) {
777 		switch (bnum1) {
778 		case -2:
779 		    break;
780 
781 		case -1:
782 		    core_used = 1;
783 		    break;
784 
785 		default:
786 		    for (j = 0; j < s->hdr->num_blocks; j++) {
787 			if (s->block[j]->content_type == EXTERNAL &&
788 			    s->block[j]->content_id == bnum1) {
789 			    block_used[j] = 1;
790 			    if (cram_uncompress_block(s->block[j])) {
791 				free(block_used);
792 				return -1;
793 			    }
794 			}
795 		    }
796 		    break;
797 		}
798 
799 		if (bnum2 == -2 || bnum1 == bnum2)
800 		    break;
801 
802 		bnum1 = bnum2; // 2nd pass
803 	    }
804 	}
805 
806 	// Tags too
807 	if ((fd->required_fields & SAM_AUX) ||
808 	    (s->data_series & CRAM_aux)) {
809 	    for (i = 0; i < CRAM_MAP_HASH; i++) {
810 		int bnum1, bnum2, j;
811 		cram_map *m = hdr->tag_encoding_map[i];
812 
813 		while (m) {
814 		    cram_codec *c = m->codec;
815 		    if (!c) {
816 			m = m->next;
817 			continue;
818 		    }
819 
820 		    bnum1 = cram_codec_to_id(c, &bnum2);
821 
822 		    for (;;) {
823 			switch (bnum1) {
824 			case -2:
825 			    break;
826 
827 			case -1:
828 			    core_used = 1;
829 			    break;
830 
831 			default:
832 			    for (j = 0; j < s->hdr->num_blocks; j++) {
833 				if (s->block[j]->content_type == EXTERNAL &&
834 				    s->block[j]->content_id == bnum1) {
835 				    block_used[j] = 1;
836 				    if (cram_uncompress_block(s->block[j])) {
837 					free(block_used);
838 					return -1;
839 				    }
840 				}
841 			    }
842 			    break;
843 			}
844 
845 			if (bnum2 == -2 || bnum1 == bnum2)
846 			    break;
847 
848 			bnum1 = bnum2; // 2nd pass
849 		    }
850 
851 		    m = m->next;
852 		}
853 	    }
854 	}
855 
856 	// We now know which blocks are in used, so repeat and find
857 	// which other data series need to be added.
858 	for (i = 0; i < sizeof(i_to_id)/sizeof(*i_to_id); i++) {
859 	    int bnum1, bnum2, j;
860 	    cram_codec *c = hdr->codecs[i_to_id[i]];
861 
862 	    if (!c)
863 		continue;
864 
865 	    bnum1 = cram_codec_to_id(c, &bnum2);
866 
867 	    for (;;) {
868 		switch (bnum1) {
869 		case -2:
870 		    break;
871 
872 		case -1:
873 		    if (core_used) {
874 			//printf(" + data series %08x:\n", 1<<i);
875 			s->data_series |= 1<<i;
876 		    }
877 		    break;
878 
879 		default:
880 		    for (j = 0; j < s->hdr->num_blocks; j++) {
881 			if (s->block[j]->content_type == EXTERNAL &&
882 			    s->block[j]->content_id == bnum1) {
883 			    if (block_used[j]) {
884 				//printf(" + data series %08x:\n", 1<<i);
885 				s->data_series |= 1<<i;
886 			    }
887 			}
888 		    }
889 		    break;
890 		}
891 
892 		if (bnum2 == -2 || bnum1 == bnum2)
893 		    break;
894 
895 		bnum1 = bnum2; // 2nd pass
896 	    }
897 	}
898 
899 	// Tags too
900 	for (i = 0; i < CRAM_MAP_HASH; i++) {
901 	    int bnum1, bnum2, j;
902 	    cram_map *m = hdr->tag_encoding_map[i];
903 
904 	    while (m) {
905 		cram_codec *c = m->codec;
906 		if (!c) {
907 		    m = m->next;
908 		    continue;
909 		}
910 
911 		bnum1 = cram_codec_to_id(c, &bnum2);
912 
913 		for (;;) {
914 		    switch (bnum1) {
915 		    case -2:
916 			break;
917 
918 		    case -1:
919 			//printf(" + data series %08x:\n", CRAM_aux);
920 			s->data_series |= CRAM_aux;
921 			break;
922 
923 		    default:
924 			for (j = 0; j < s->hdr->num_blocks; j++) {
925 			    if (s->block[j]->content_type == EXTERNAL &&
926 				s->block[j]->content_id == bnum1) {
927 				if (block_used[j]) {
928 				    //printf(" + data series %08x:\n",
929 				    //       CRAM_aux);
930 				    s->data_series |= CRAM_aux;
931 				}
932 			    }
933 			}
934 			break;
935 		    }
936 
937 		    if (bnum2 == -2 || bnum1 == bnum2)
938 			break;
939 
940 		    bnum1 = bnum2; // 2nd pass
941 		}
942 
943 		m = m->next;
944 	    }
945 	}
946     } while (orig_ds != s->data_series);
947 
948     free(block_used);
949     return 0;
950 }
951 
952 /*
953  * Checks whether an external block is used solely by a single data series.
954  * Returns the codec type if so (EXTERNAL, BYTE_ARRAY_LEN, BYTE_ARRAY_STOP)
955  *         or 0 if not (E_NULL).
956  */
cram_ds_unique(cram_block_compression_hdr * hdr,cram_codec * c,int id)957 static int cram_ds_unique(cram_block_compression_hdr *hdr, cram_codec *c,
958 			  int id) {
959     int i, n_id = 0;
960     enum cram_encoding e_type = 0;
961 
962     for (i = 0; i < DS_END; i++) {
963 	cram_codec *c;
964 	int bnum1, bnum2, old_n_id;
965 
966 	if (!(c = hdr->codecs[i]))
967 	    continue;
968 
969 	bnum1 = cram_codec_to_id(c, &bnum2);
970 
971 	old_n_id = n_id;
972 	if (bnum1 == id) {
973 	    n_id++;
974 	    e_type = c->codec;
975 	}
976 	if (bnum2 == id) {
977 	    n_id++;
978 	    e_type = c->codec;
979 	}
980 
981 	if (n_id == old_n_id+2)
982 	    n_id--; // len/val in same place counts once only.
983     }
984 
985     return n_id == 1 ? e_type : 0;
986 }
987 
988 /*
989  * Attempts to estimate the size of some blocks so we can preallocate them
990  * before decoding.  Although decoding will automatically grow the blocks,
991  * it is typically more efficient to preallocate.
992  */
cram_decode_estimate_sizes(cram_block_compression_hdr * hdr,cram_slice * s,int * qual_size,int * name_size,int * q_id)993 void cram_decode_estimate_sizes(cram_block_compression_hdr *hdr, cram_slice *s,
994 				int *qual_size, int *name_size,
995 				int *q_id) {
996     int bnum1, bnum2;
997     cram_codec *cd;
998 
999     *qual_size = 0;
1000     *name_size = 0;
1001 
1002     /* Qual */
1003     if (!(cd = hdr->codecs[DS_QS]))
1004 	return;
1005 
1006     bnum1 = cram_codec_to_id(cd, &bnum2);
1007     if (bnum1 < 0 && bnum2 >= 0) bnum1 = bnum2;
1008     if (cram_ds_unique(hdr, cd, bnum1)) {
1009 	cram_block *b = cram_get_block_by_id(s, bnum1);
1010 	if (b) *qual_size = b->uncomp_size;
1011 	if (q_id && cd->codec == E_EXTERNAL)
1012 	    *q_id = bnum1;
1013     }
1014 
1015     /* Name */
1016     cd = hdr->codecs[DS_RN];
1017     bnum1 = cram_codec_to_id(cd, &bnum2);
1018     if (bnum1 < 0 && bnum2 >= 0) bnum1 = bnum2;
1019     if (cram_ds_unique(hdr, cd, bnum1)) {
1020 	cram_block *b = cram_get_block_by_id(s, bnum1);
1021 	if (b) *name_size = b->uncomp_size;
1022     }
1023 }
1024 
1025 
1026 /* ----------------------------------------------------------------------
1027  * CRAM slices
1028  */
1029 
1030 /*
1031  * Decodes a CRAM (un)mapped slice header block.
1032  * Returns slice header ptr on success
1033  *         NULL on failure
1034  */
cram_decode_slice_header(cram_fd * fd,cram_block * b)1035 cram_block_slice_hdr *cram_decode_slice_header(cram_fd *fd, cram_block *b) {
1036     cram_block_slice_hdr *hdr;
1037     unsigned char *cp;
1038     unsigned char *cp_end;
1039     int i;
1040 
1041     if (b->method != RAW) {
1042         /* Spec. says slice header should be RAW, but we can future-proof
1043 	   by trying to decode it if it isn't. */
1044         if (cram_uncompress_block(b) < 0)
1045             return NULL;
1046     }
1047 
1048     cp = (unsigned char *)BLOCK_DATA(b);
1049     cp_end = cp + b->uncomp_size;
1050 
1051     if (b->content_type != MAPPED_SLICE &&
1052 	b->content_type != UNMAPPED_SLICE)
1053 	return NULL;
1054 
1055     if (!(hdr  = calloc(1, sizeof(*hdr))))
1056 	return NULL;
1057 
1058     hdr->content_type = b->content_type;
1059 
1060     if (b->content_type == MAPPED_SLICE) {
1061 	int32_t i32;
1062         cp += safe_itf8_get((char *)cp, (char *)cp_end, &hdr->ref_seq_id);
1063 	if (CRAM_MAJOR_VERS(fd->version) >= 4) {
1064 	    int64_t i64;
1065 	    cp += safe_ltf8_get((char *)cp, (char *)cp_end, &i64);
1066 	    hdr->ref_seq_start = i64;
1067 	    cp += safe_ltf8_get((char *)cp, (char *)cp_end, &i64);
1068 	    hdr->ref_seq_span = i64;
1069 	} else {
1070 	    cp += safe_itf8_get((char *)cp, (char *)cp_end, &i32);
1071 	    hdr->ref_seq_start = i32;
1072 	    cp += safe_itf8_get((char *)cp, (char *)cp_end, &i32);
1073 	    hdr->ref_seq_span = i32;
1074 	}
1075     }
1076     cp += safe_itf8_get((char *)cp, (char *) cp_end, &hdr->num_records);
1077     hdr->record_counter = 0;
1078     if (CRAM_MAJOR_VERS(fd->version) == 2) {
1079 	int32_t i32 = 0;
1080 	cp += safe_itf8_get((char *)cp, (char *)cp_end, &i32);
1081 	hdr->record_counter = i32;
1082     } else if (CRAM_MAJOR_VERS(fd->version) >= 3) {
1083 	cp += safe_ltf8_get((char *)cp, (char *)cp_end, &hdr->record_counter);
1084     }
1085     cp += safe_itf8_get((char *)cp, (char *)cp_end, &hdr->num_blocks);
1086     cp += safe_itf8_get((char *)cp, (char *)cp_end, &hdr->num_content_ids);
1087     if (hdr->num_content_ids < 1 ||
1088 	hdr->num_content_ids >= SIZE_MAX / sizeof(int32_t)) {
1089         /* Slice must have at least one data block,
1090 	   and malloc'd size shouldn't wrap. */
1091         free(hdr);
1092         return NULL;
1093     }
1094     hdr->block_content_ids = malloc(hdr->num_content_ids * sizeof(int32_t));
1095     if (!hdr->block_content_ids) {
1096 	free(hdr);
1097 	return NULL;
1098     }
1099 
1100     for (i = 0; i < hdr->num_content_ids; i++) {
1101 	int l = safe_itf8_get((char *)cp, (char *) cp_end,
1102 			      &hdr->block_content_ids[i]);
1103 	if (l <= 0) {
1104 	    free(hdr->block_content_ids);
1105 	    free(hdr);
1106 	    return NULL;
1107 	}
1108 	cp += l;
1109     }
1110 
1111     if (b->content_type == MAPPED_SLICE) {
1112         cp += safe_itf8_get((char *)cp, (char *) cp_end, &hdr->ref_base_id);
1113     }
1114 
1115     if (!IS_CRAM_1_VERS(fd)) {
1116         if (cp_end - cp < 16) {
1117             free(hdr->block_content_ids);
1118             free(hdr);
1119             return NULL;
1120         }
1121 	memcpy(hdr->md5, cp, 16);
1122 	cp += 16;
1123     } else {
1124 	memset(hdr->md5, 0, 16);
1125     }
1126 
1127     // Decode any optional tag:type:value fields
1128     if (cp == cp_end)
1129 	return hdr;
1130 
1131     hdr->tags = HashTableCreate(4, HASH_FUNC_TCL);
1132     while (cp <= cp_end) {
1133 	unsigned int sub_len;
1134 	unsigned char id[3];
1135 	HashData hd;
1136 
1137 	if (cp_end - cp < 4)
1138 	    return hdr;
1139 
1140 	id[0] = cp[0];
1141 	id[1] = cp[1];
1142 	id[2] = '\0';
1143 
1144 	switch (cp[2]) {
1145 	case 'c':
1146 	    id[2] = 'i';
1147 	    hd.i = (int8_t)(cp[3]);
1148 	    cp += 4;
1149 	    break;
1150 
1151 	case 'C':
1152 	    id[2] = 'i';
1153 	    hd.i = (uint8_t)(cp[3]);
1154 	    cp += 4;
1155 	    break;
1156 
1157 	case 's':
1158 	    if (cp_end - cp < 5) break;
1159 	    id[2] = 'i';
1160 	    hd.i = (int16_t)(cp[3] + (cp[4]<<8));
1161 	    cp += 5;
1162 	    break;
1163 
1164 	case 'S':
1165 	    if (cp_end - cp < 5) break;
1166 	    id[2] = 'i';
1167 	    hd.i = (uint16_t)(cp[3] + (cp[4]<<8));
1168 	    cp += 5;
1169 	    break;
1170 
1171 	case 'i':
1172 	    if (cp_end - cp < 7) break;
1173 	    id[2] = 'i';
1174 	    hd.i = (int32_t)(cp[3] + (cp[4]<<8) + (cp[5]<<16) + (cp[6]<<24));
1175 	    cp += 7;
1176 	    break;
1177 
1178 	case 'I':
1179 	    if (cp_end - cp < 7) break;
1180 	    id[2] = 'i';
1181 	    hd.i = (uint32_t)(cp[3] + (cp[4]<<8) + (cp[5]<<16) + (cp[6]<<24));
1182 	    cp += 7;
1183 	    break;
1184 
1185 	case 'f':
1186 	    if (cp_end - cp < 7) break;
1187 	    id[2] = cp[2];
1188 	    hd.f = bam_aux_f(cp+2);
1189 	    cp += 7;
1190 	    break;
1191 
1192 	case 'A':
1193 	    id[2] = cp[2];
1194 	    hd.i = cp[3];
1195 	    cp += 4;
1196 	    break;
1197 
1198 	case 'Z': case 'H':
1199 	    id[2] = cp[2];
1200 	    hd.p = &cp[3];
1201 	    cp += 3;
1202 	    while (cp < cp_end && *cp != '\0') cp++;
1203 	    if (cp < cp_end) {
1204 	        /* Skip NUL */
1205 	        cp++;
1206 	    } else {
1207 	        /* Add missing NUL termination */
1208 	        assert(cp == BLOCK_DATA(b) + b->uncomp_size);
1209 	        BLOCK_RESIZE(b, b->uncomp_size + 1);
1210 		cp = cp_end = BLOCK_DATA(b) + b->uncomp_size;
1211 		*cp = '\0';
1212 	    }
1213 	    break;
1214 
1215 	case 'B':
1216 	    // <type>B<code><4-len><...>
1217 	    if (cp_end - cp < 8) break;
1218 	    hd.p = &cp[3];
1219 	    sub_len = cp[4] + (cp[5]<<8) + (cp[6]<<16) + (cp[7]<<24);
1220 	    switch (cp[3]) {
1221 	    case 'c': case 'C':
1222 	        if (cp_end - cp < 8 + sub_len) break;
1223 	        id[2] = cp[2];
1224 	        cp += 8 + sub_len;
1225 	        break;
1226 	    case 's': case 'S':
1227 	        if (cp_end - cp < 8 + 2 * sub_len) break;
1228 	        id[2] = cp[2];
1229 	        cp += 8 + 2*sub_len;
1230 	        break;
1231 	    case 'i': case 'I': case 'f':
1232 	        if (cp_end - cp < 8 + 4 * sub_len) break;
1233 	        id[2] = cp[2];
1234 	        cp += 8 + 4*sub_len;
1235 	        break;
1236 	    default:
1237 	        fprintf(stderr, "Unknown aux type 'B' sub-code.\n");
1238 	        cp = cp_end;
1239 	    }
1240 	    break;
1241 
1242         default:
1243 	    fprintf(stderr, "Unknown aux type.\n");
1244 	    cp = cp_end;
1245 	}
1246 
1247 	if (id[2] != '\0') {
1248 	    HashTableAdd(hdr->tags, (char *)id, 3, hd, NULL);
1249 	} else {
1250 	    cp = cp_end;
1251 	    break;
1252 	}
1253 
1254 	if (id[0] == 'B' && id[1] == 'D' && id[2] == 'B') {
1255 	    unsigned char *p = hd.p;
1256 	    hdr->BD_crc =  p[5] | (p[6]<<8) | (p[7]<<16) | (p[8]<<24);
1257 	}
1258 
1259 	if (id[0] == 'S' && id[1] == 'D' && id[2] == 'B') {
1260 	    unsigned char *p = hd.p;
1261 	    hdr->SD_crc =  p[5] | (p[6]<<8) | (p[7]<<16) | (p[8]<<24);
1262 	}
1263    }
1264 
1265     return hdr;
1266 }
1267 
1268 
1269 #if 0
1270 /* Returns the number of bits set in val; it the highest bit used */
1271 static int nbits(int v) {
1272     static const int MultiplyDeBruijnBitPosition[32] = {
1273 	1, 10, 2, 11, 14, 22, 3, 30, 12, 15, 17, 19, 23, 26, 4, 31,
1274 	9, 13, 21, 29, 16, 18, 25, 8, 20, 28, 24, 7, 27, 6, 5, 32
1275     };
1276 
1277     v |= v >> 1; // first up to set all bits 1 after the first 1 */
1278     v |= v >> 2;
1279     v |= v >> 4;
1280     v |= v >> 8;
1281     v |= v >> 16;
1282 
1283     // DeBruijn magic to find top bit
1284     return MultiplyDeBruijnBitPosition[(uint32_t)(v * 0x07C4ACDDU) >> 27];
1285 }
1286 #endif
1287 
1288 #if 0
1289 static int sort_freqs(const void *vp1, const void *vp2) {
1290     const int i1 = *(const int *)vp1;
1291     const int i2 = *(const int *)vp2;
1292     return i1-i2;
1293 }
1294 #endif
1295 
1296 /* ----------------------------------------------------------------------
1297  * Primary CRAM sequence decoder
1298  */
1299 
1300 /*
1301  * Internal part of cram_decode_slice().
1302  * Generates the sequence, quality and cigar components.
1303  */
cram_decode_seq(cram_fd * fd,cram_container * c,cram_slice * s,cram_block * blk,cram_record * cr,SAM_hdr * bfd,int cf,char * seq,char * qual,int has_MD,int has_NM)1304 static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
1305 			   cram_block *blk, cram_record *cr, SAM_hdr *bfd,
1306 			   int cf, char *seq, char *qual,
1307 			   int has_MD, int has_NM) {
1308     int prev_pos = 0, f, r = 0, out_sz = 1;
1309     int seq_pos = 1;
1310     int cig_len = 0;
1311     int64_t ref_pos = cr->apos;
1312     int32_t fn, i32;
1313     enum cigar_op cig_op = BAM_CMATCH;
1314     uint32_t *cigar = s->cigar;
1315     uint32_t ncigar = s->ncigar;
1316     uint32_t cigar_alloc = s->cigar_alloc;
1317     uint32_t nm = 0;
1318     int32_t md_dist = 0;
1319     int orig_aux = 0;
1320     int decode_md = s->decode_md && s->ref && !has_MD;
1321     int decode_nm = s->decode_md && s->ref && !has_NM;
1322     uint32_t ds = s->data_series;
1323 
1324     if ((ds & CRAM_QS) && !(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
1325 	memset(qual, 255, cr->len);
1326     }
1327 
1328     if (cr->cram_flags & CRAM_FLAG_NO_SEQ)
1329         decode_md = decode_nm = 0;
1330 
1331     if (decode_md) {
1332 	orig_aux = BLOCK_SIZE(s->aux_blk);
1333 	BLOCK_APPEND(s->aux_blk, "MDZ", 3);
1334     }
1335 
1336     if (ds & CRAM_FN) {
1337 	if (!c->comp_hdr->codecs[DS_FN]) return -1;
1338 	r |= c->comp_hdr->codecs[DS_FN]->decode(s,c->comp_hdr->codecs[DS_FN],
1339 						blk, (char *)&fn, &out_sz);
1340         if (r) return r;
1341     } else {
1342 	fn = 0;
1343     }
1344 
1345     ref_pos--; // count from 0
1346     cr->cigar = ncigar;
1347 
1348     if (!(ds & (CRAM_FC | CRAM_FP)))
1349 	goto skip_cigar;
1350 
1351     for (f = 0; f < fn; f++) {
1352 	int32_t pos = 0;
1353 	char op;
1354 
1355 	if (ncigar+2 >= cigar_alloc) {
1356 	    cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024;
1357 	    s->cigar = cigar;
1358 	    if (!(cigar = realloc(cigar, cigar_alloc * sizeof(*cigar))))
1359 		return -1;
1360 	}
1361 
1362 	if (ds & CRAM_FC) {
1363 	    if (!c->comp_hdr->codecs[DS_FC]) return -1;
1364 	    r |= c->comp_hdr->codecs[DS_FC]->decode(s,
1365 						    c->comp_hdr->codecs[DS_FC],
1366 						    blk,
1367 						    &op,  &out_sz);
1368 	    if (r) return r;
1369 	}
1370 
1371 	if (!(ds & CRAM_FP))
1372 	    continue;
1373 
1374 	if (!c->comp_hdr->codecs[DS_FP]) return -1;
1375 	r |= c->comp_hdr->codecs[DS_FP]->decode(s,
1376 						c->comp_hdr->codecs[DS_FP],
1377 						blk,
1378 						(char *)&pos, &out_sz);
1379 	if (r) return r;
1380 	pos += prev_pos;
1381 
1382 	if (pos <= 0) {
1383 	    fprintf(stderr, "Error: feature position %d before start of read.\n",
1384 		    pos);
1385 	    return -1;
1386 	}
1387 
1388 	if (pos > seq_pos) {
1389 	    if (pos > cr->len+1)
1390 		return -1;
1391 
1392 	    if (s->ref && cr->ref_id >= 0) {
1393 		if (ref_pos + pos - seq_pos > bfd->ref[cr->ref_id].len) {
1394 		    static int whinged = 0;
1395 		    int rlen;
1396 		    if (!whinged)
1397 			fprintf(stderr, "Ref pos outside of ref "
1398 				"sequence boundary\n");
1399 		    whinged = 1;
1400 		    rlen = bfd->ref[cr->ref_id].len - ref_pos;
1401 		    if (rlen > 0) {
1402 			memcpy(&seq[seq_pos-1],
1403 			       &s->ref[ref_pos - s->ref_start +1], rlen);
1404 			if ((pos - seq_pos) - rlen > 0)
1405 			    memset(&seq[seq_pos-1+rlen], 'N',
1406 				   (pos - seq_pos) - rlen);
1407 		    } else {
1408 		        memset(&seq[seq_pos-1], 'N', cr->len - seq_pos + 1);
1409 		    }
1410 		} else {
1411 		    memcpy(&seq[seq_pos-1], &s->ref[ref_pos - s->ref_start +1],
1412 			   pos - seq_pos);
1413 		}
1414 	    }
1415 #ifdef USE_X
1416 	    if (cig_len && cig_op != BAM_CBASE_MATCH) {
1417 		cigar[ncigar++] = (cig_len<<4) + cig_op;
1418 		cig_len = 0;
1419 	    }
1420 	    cig_op = BAM_CBASE_MATCH;
1421 #else
1422 	    if (cig_len && cig_op != BAM_CMATCH) {
1423 		cigar[ncigar++] = (cig_len<<4) + cig_op;
1424 		cig_len = 0;
1425 	    }
1426 	    cig_op = BAM_CMATCH;
1427 #endif
1428 	    cig_len += pos - seq_pos;
1429 	    ref_pos += pos - seq_pos;
1430 	    if (md_dist >= 0)
1431 		md_dist += pos - seq_pos;
1432 	    seq_pos = pos;
1433 	}
1434 
1435 	prev_pos = pos;
1436 
1437 	if (!(ds & CRAM_FC))
1438 	    goto skip_cigar;
1439 
1440 	switch(op) {
1441 	case 'S': { // soft clip: IN
1442 	    int32_t out_sz2 = 1;
1443 	    int have_sc = 0;
1444 
1445 	    if (cig_len) {
1446 		cigar[ncigar++] = (cig_len<<4) + cig_op;
1447 		cig_len = 0;
1448 	    }
1449 	    switch (CRAM_MAJOR_VERS(fd->version)) {
1450 	    case 1:
1451 	        if (ds & CRAM_IN) {
1452 		    r |= c->comp_hdr->codecs[DS_IN]
1453 			? c->comp_hdr->codecs[DS_IN]
1454 			             ->decode(s, c->comp_hdr->codecs[DS_IN],
1455 					      blk,
1456 					      cr->len ? &seq[pos-1] : NULL,
1457 					      &out_sz2)
1458 			: (seq[pos-1] = 'N', out_sz2 = 1, 0);
1459 		    have_sc = 1;
1460 		}
1461 		break;
1462 	    case 2:
1463 	    default:
1464 	        if (ds & CRAM_SC) {
1465 		    r |= c->comp_hdr->codecs[DS_SC]
1466 			? c->comp_hdr->codecs[DS_SC]
1467 			             ->decode(s, c->comp_hdr->codecs[DS_SC],
1468 					      blk,
1469 					      cr->len ? &seq[pos-1] : NULL,
1470 					      &out_sz2)
1471 			: (seq[pos-1] = 'N', out_sz2 = 1, 0);
1472 		    have_sc = 1;
1473 		}
1474 		break;
1475 
1476 //		default:
1477 //		    r |= c->comp_hdr->codecs[DS_BB]
1478 //			? c->comp_hdr->codecs[DS_BB]
1479 //			             ->decode(s, c->comp_hdr->codecs[DS_BB],
1480 //					      blk, &seq[pos-1], &out_sz2)
1481 //			: (seq[pos-1] = 'N', out_sz2 = 1, 0);
1482 	    }
1483 	    if (have_sc) {
1484 		if (r) return r;
1485 		cigar[ncigar++] = (out_sz2<<4) + BAM_CSOFT_CLIP;
1486 		cig_op = BAM_CSOFT_CLIP;
1487 		seq_pos += out_sz2;
1488 	    }
1489 	    break;
1490 	}
1491 
1492 	case 'X': { // Substitution; BS
1493 	    unsigned char base;
1494 #ifdef USE_X
1495 	    if (cig_len && cig_op != BAM_CBASE_MISMATCH) {
1496 		cigar[ncigar++] = (cig_len<<4) + cig_op;
1497 		cig_len = 0;
1498 	    }
1499 	    if (ds & CRAM_BS) {
1500 		if (!c->comp_hdr->codecs[DS_BS]) return -1;
1501 		r |= c->comp_hdr->codecs[DS_BS]
1502 		                ->decode(s, c->comp_hdr->codecs[DS_BS], blk,
1503 					 (char *)&base, &out_sz);
1504 		if (pos-1 < cr->len)
1505 		    seq[pos-1] = 'N'; // FIXME look up BS=base value
1506 	    }
1507 	    cig_op = BAM_CBASE_MISMATCH;
1508 #else
1509 	    int ref_base;
1510 	    if (cig_len && cig_op != BAM_CMATCH) {
1511 		cigar[ncigar++] = (cig_len<<4) + cig_op;
1512 		cig_len = 0;
1513 	    }
1514 	    if (ds & CRAM_BS) {
1515 		if (!c->comp_hdr->codecs[DS_BS]) return -1;
1516 		r |= c->comp_hdr->codecs[DS_BS]
1517 		                ->decode(s, c->comp_hdr->codecs[DS_BS], blk,
1518 					 (char *)&base, &out_sz);
1519 		if (r) return -1;
1520 		if (ref_pos >= bfd->ref[cr->ref_id].len || !s->ref) {
1521 		    if (pos-1 < cr->len)
1522 			seq[pos-1] = c->comp_hdr->
1523 			    substitution_matrix[fd->L1['N']][base];
1524 		    if (decode_md || decode_nm) {
1525 			if (md_dist >= 0 && decode_md)
1526 			    BLOCK_APPEND_UINT(s->aux_blk, md_dist);
1527 			md_dist = -1;
1528 			nm--;
1529 		    }
1530 		} else {
1531                     unsigned char ref_call = ref_pos <= s->ref_end
1532                         ? (uc)s->ref[ref_pos - s->ref_start +1]
1533                         : 'N';
1534                     ref_base = fd->L1[ref_call];
1535 		    if (pos-1 < cr->len)
1536 			seq[pos-1] = c->comp_hdr->
1537 			    substitution_matrix[ref_base][base];
1538 		    if (decode_md) {
1539                         BLOCK_APPEND_UINT(s->aux_blk, md_dist);
1540                         BLOCK_APPEND_CHAR(s->aux_blk, ref_call);
1541 			md_dist = 0;
1542 		    }
1543 		}
1544 	    }
1545 	    cig_op = BAM_CMATCH;
1546 #endif
1547 	    nm++;
1548 	    cig_len++;
1549 	    seq_pos++;
1550 	    ref_pos++;
1551 	    break;
1552 	}
1553 
1554 	case 'D': { // Deletion; DL
1555 	    if (cig_len && cig_op != BAM_CDEL) {
1556 		cigar[ncigar++] = (cig_len<<4) + cig_op;
1557 		cig_len = 0;
1558 	    }
1559 	    if (ds & CRAM_DL) {
1560 		if (!c->comp_hdr->codecs[DS_DL]) return -1;
1561 		r |= c->comp_hdr->codecs[DS_DL]
1562 		                ->decode(s, c->comp_hdr->codecs[DS_DL], blk,
1563 					 (char *)&i32, &out_sz);
1564 		if (r) return r;
1565 		if (decode_md || decode_nm) {
1566 		    if (md_dist >= 0 && decode_md)
1567 			BLOCK_APPEND_UINT(s->aux_blk, md_dist);
1568 		    if (ref_pos + i32 <= bfd->ref[cr->ref_id].len) {
1569 			if (decode_md) {
1570 			    BLOCK_APPEND_CHAR(s->aux_blk, '^');
1571 			    BLOCK_APPEND(s->aux_blk,
1572 					 &s->ref[ref_pos - s->ref_start +1],
1573 					 i32);
1574 			    md_dist = 0;
1575 			}
1576 			nm += i32;
1577 		    } else {
1578 			uint32_t dlen;
1579 			if (bfd->ref[cr->ref_id].len >= ref_pos) {
1580 			    if (decode_md) {
1581 				BLOCK_APPEND_CHAR(s->aux_blk, '^');
1582 				BLOCK_APPEND(s->aux_blk,
1583 					     &s->ref[ref_pos - s->ref_start+1],
1584 					     bfd->ref[cr->ref_id].len-ref_pos);
1585 				BLOCK_APPEND_UINT(s->aux_blk, 0);
1586 			    }
1587 			    dlen = i32 - (bfd->ref[cr->ref_id].len - ref_pos);
1588 			    nm += i32 - dlen;
1589 			} else {
1590 			    dlen = i32;
1591 			}
1592 
1593 			md_dist = -1;
1594 		    }
1595 		}
1596 		cig_op = BAM_CDEL;
1597 		cig_len += i32;
1598 		ref_pos += i32;
1599 		//printf("  %d: DL = %d (ret %d)\n", f, i32, r);
1600 	    }
1601 	    break;
1602 	}
1603 
1604 	case 'I': { // Insertion (several bases); IN
1605 	    int32_t out_sz2 = 1;
1606 
1607 	    if (cig_len && cig_op != BAM_CINS) {
1608 		cigar[ncigar++] = (cig_len<<4) + cig_op;
1609 		cig_len = 0;
1610 	    }
1611 
1612 	    if (ds & CRAM_IN) {
1613 		if (!c->comp_hdr->codecs[DS_IN]) return -1;
1614 		r |= c->comp_hdr->codecs[DS_IN]
1615 		                ->decode(s, c->comp_hdr->codecs[DS_IN], blk,
1616 					 cr->len ? &seq[pos-1] : NULL,
1617 					 &out_sz2);
1618 		if (r) return r;
1619 		cig_op = BAM_CINS;
1620 		cig_len += out_sz2;
1621 		seq_pos += out_sz2;
1622 		nm      += out_sz2;
1623 		//printf("  %d: IN(I) = %.*s (ret %d, out_sz %d)\n", f, out_sz2, dat, r, out_sz2);
1624 	    }
1625 	    break;
1626 	}
1627 
1628 	case 'i': { // Insertion (single base); BA
1629 	    if (cig_len && cig_op != BAM_CINS) {
1630 		cigar[ncigar++] = (cig_len<<4) + cig_op;
1631 		cig_len = 0;
1632 	    }
1633 	    if (ds & CRAM_BA) {
1634 		if (!c->comp_hdr->codecs[DS_BA]) return -1;
1635 		r |= c->comp_hdr->codecs[DS_BA]
1636 		                ->decode(s, c->comp_hdr->codecs[DS_BA], blk,
1637 					 cr->len ? &seq[pos-1] : NULL,
1638 					 &out_sz);
1639 		if (r) return r;
1640 		//printf("  %d: BA = %c (ret %d)\n", f, seq[pos-1], r);
1641 	    }
1642 	    cig_op = BAM_CINS;
1643 	    cig_len++;
1644 	    seq_pos++;
1645 	    nm++;
1646 	    break;
1647 	}
1648 
1649 	case 'b': { // Several bases
1650 	    int32_t len = 1;
1651 
1652 	    if (cig_len && cig_op != BAM_CMATCH) {
1653 		cigar[ncigar++] = (cig_len<<4) + cig_op;
1654 		cig_len = 0;
1655 	    }
1656 
1657 	    if (ds & CRAM_BB) {
1658 		if (!c->comp_hdr->codecs[DS_BB]) return -1;
1659 		r |= c->comp_hdr->codecs[DS_BB]
1660 		    ->decode(s, c->comp_hdr->codecs[DS_BB], blk,
1661 			     cr->len ? &seq[pos-1] : NULL,
1662 			     &len);
1663 		if (r) return r;
1664 
1665 		if (decode_md || decode_nm) {
1666 		    int x;
1667 		    if (md_dist >= 0 && decode_md)
1668 			BLOCK_APPEND_UINT(s->aux_blk, md_dist);
1669 
1670 		    for (x = 0; x < len; x++) {
1671 			if (x && decode_md)
1672 			    BLOCK_APPEND_UINT(s->aux_blk, 0);
1673 			if (ref_pos+x >= bfd->ref[cr->ref_id].len || !s->ref) {
1674 			    md_dist = -1;
1675 			    break;
1676 			} else {
1677 			    if (decode_md) {
1678 				char r = s->ref[ref_pos+x-s->ref_start +1];
1679 				BLOCK_APPEND_CHAR(s->aux_blk, r);
1680 			    }
1681 			}
1682 		    }
1683 
1684 		    nm += x;
1685 		}
1686 	    }
1687 
1688 	    cig_op = BAM_CMATCH;
1689 
1690 	    cig_len+=len;
1691 	    seq_pos+=len;
1692 	    ref_pos+=len;
1693 	    //prev_pos+=len;
1694 	    break;
1695 	}
1696 
1697 	case 'q': { // Several quality values
1698 	    int32_t len = 1;
1699 
1700 	    if (cig_len && cig_op != BAM_CMATCH) {
1701 		cigar[ncigar++] = (cig_len<<4) + cig_op;
1702 		cig_len = 0;
1703 	    }
1704 
1705 	    if (ds & CRAM_QQ) {
1706 		if (!c->comp_hdr->codecs[DS_QQ]) return -1;
1707 		r |= c->comp_hdr->codecs[DS_QQ]
1708 		    ->decode(s, c->comp_hdr->codecs[DS_QQ], blk,
1709 			     (char *)&qual[pos-1], &len);
1710 		if (r) return r;
1711 	    }
1712 
1713 	    cig_op = BAM_CMATCH;
1714 
1715 	    cig_len+=len;
1716 	    seq_pos+=len;
1717 	    ref_pos+=len;
1718 	    //prev_pos+=len;
1719 	    break;
1720 	}
1721 
1722 	case 'B': { // Read base; BA, QS
1723 #ifdef USE_X
1724 	    if (cig_len && cig_op != BAM_CBASE_MISMATCH) {
1725 		cigar[ncigar++] = (cig_len<<4) + cig_op;
1726 		cig_len = 0;
1727 	    }
1728 #else
1729 	    if (cig_len && cig_op != BAM_CMATCH) {
1730 		cigar[ncigar++] = (cig_len<<4) + cig_op;
1731 		cig_len = 0;
1732 	    }
1733 #endif
1734 	    if (ds & CRAM_BA) {
1735 		if (!c->comp_hdr->codecs[DS_BA]) return -1;
1736 		r |= c->comp_hdr->codecs[DS_BA]
1737 		                ->decode(s, c->comp_hdr->codecs[DS_BA], blk,
1738 					 cr->len ? &seq[pos-1] : NULL,
1739 					 &out_sz);
1740 
1741 		if (decode_md || decode_nm) {
1742 		    if (md_dist >= 0 && decode_md)
1743 			BLOCK_APPEND_UINT(s->aux_blk, md_dist);
1744 		    if (ref_pos >= bfd->ref[cr->ref_id].len || !s->ref) {
1745 			md_dist = -1;
1746 		    } else {
1747 			if (decode_md)
1748 			    BLOCK_APPEND_CHAR(s->aux_blk,
1749 					      s->ref[ref_pos-s->ref_start +1]);
1750 			nm++;
1751 			md_dist = 0;
1752 		    }
1753 		}
1754 	    }
1755 	    if (ds & CRAM_QS) {
1756 		if (!c->comp_hdr->codecs[DS_QS]) return -1;
1757 		r |= c->comp_hdr->codecs[DS_QS]
1758 		                ->decode(s, c->comp_hdr->codecs[DS_QS], blk,
1759 					 (char *)&qual[pos-1], &out_sz);
1760 	    }
1761 #ifdef USE_X
1762 	    cig_op = BAM_CBASE_MISMATCH;
1763 #else
1764 	    cig_op = BAM_CMATCH;
1765 #endif
1766 	    cig_len++;
1767 	    seq_pos++;
1768 	    ref_pos++;
1769 	    //printf("  %d: BA/QS(B) = %c/%d (ret %d)\n", f, i32, qc, r);
1770 	    break;
1771 	}
1772 
1773 	case 'Q': { // Quality score; QS
1774 	    if (ds & CRAM_QS) {
1775 		if (!c->comp_hdr->codecs[DS_QS]) return -1;
1776 		r |= c->comp_hdr->codecs[DS_QS]
1777 		                ->decode(s, c->comp_hdr->codecs[DS_QS], blk,
1778 					 (char *)&qual[pos-1], &out_sz);
1779 		//printf("  %d: QS = %d (ret %d)\n", f, qc, r);
1780 	    }
1781 	    break;
1782 	}
1783 
1784 	case 'H': { // hard clip; HC
1785 	    if (cig_len && cig_op != BAM_CHARD_CLIP) {
1786 		cigar[ncigar++] = (cig_len<<4) + cig_op;
1787 		cig_len = 0;
1788 	    }
1789 	    if (ds & CRAM_HC) {
1790 		if (!c->comp_hdr->codecs[DS_HC]) return -1;
1791 		r |= c->comp_hdr->codecs[DS_HC]
1792 		                ->decode(s, c->comp_hdr->codecs[DS_HC], blk,
1793 					 (char *)&i32, &out_sz);
1794 		if (r) return r;
1795 		cig_op = BAM_CHARD_CLIP;
1796 		cig_len += i32;
1797 	    }
1798 	    break;
1799 	}
1800 
1801 	case 'P': { // padding; PD
1802 	    if (cig_len && cig_op != BAM_CPAD) {
1803 		cigar[ncigar++] = (cig_len<<4) + cig_op;
1804 		cig_len = 0;
1805 	    }
1806 	    if (ds & CRAM_PD) {
1807 		if (!c->comp_hdr->codecs[DS_PD]) return -1;
1808 		r |= c->comp_hdr->codecs[DS_PD]
1809 		                ->decode(s, c->comp_hdr->codecs[DS_PD], blk,
1810 					 (char *)&i32, &out_sz);
1811 		if (r) return r;
1812 		cig_op = BAM_CPAD;
1813 		cig_len += i32;
1814 		nm      += i32;
1815 	    }
1816 	    break;
1817 	}
1818 
1819 	case 'N': { // Ref skip; RS
1820 	    if (cig_len && cig_op != BAM_CREF_SKIP) {
1821 		cigar[ncigar++] = (cig_len<<4) + cig_op;
1822 		cig_len = 0;
1823 	    }
1824 	    if (ds & CRAM_RS) {
1825 		if (!c->comp_hdr->codecs[DS_RS]) return -1;
1826 		r |= c->comp_hdr->codecs[DS_RS]
1827 		                ->decode(s, c->comp_hdr->codecs[DS_RS], blk,
1828 					 (char *)&i32, &out_sz);
1829 		if (r) return r;
1830 		cig_op = BAM_CREF_SKIP;
1831 		cig_len += i32;
1832 		ref_pos += i32;
1833 		nm      += i32;
1834 	    }
1835 	    break;
1836 	}
1837 
1838 	default:
1839             fprintf(stderr, "Error: Unknown feature code '%c'\n", op);
1840 	    return -1;
1841 	}
1842     }
1843 
1844     if (!(ds & CRAM_FC))
1845 	goto skip_cigar;
1846 
1847     /* An implicit match op for any unaccounted for bases */
1848     if ((ds & CRAM_FN) && cr->len >= seq_pos) {
1849 	if (s->ref) {
1850 	    if (ref_pos + cr->len - seq_pos + 1 > bfd->ref[cr->ref_id].len) {
1851 		static int whinged = 0;
1852 		int rlen;
1853 		if (!whinged)
1854 		    fprintf(stderr, "Ref pos outside of ref sequence boundary\n");
1855 		whinged = 1;
1856 		rlen = bfd->ref[cr->ref_id].len - ref_pos;
1857 		if (rlen > 0) {
1858 		    if (seq_pos-1 + rlen < cr->len)
1859 			memcpy(&seq[seq_pos-1],
1860 			       &s->ref[ref_pos - s->ref_start +1], rlen);
1861 		    if ((cr->len - seq_pos + 1) - rlen > 0)
1862 		        memset(&seq[seq_pos-1+rlen], 'N',
1863                                (cr->len - seq_pos + 1) - rlen);
1864 		} else {
1865 		    if (cr->len - seq_pos + 1 > 0)
1866 			memset(&seq[seq_pos-1], 'N', cr->len - seq_pos + 1);
1867 		}
1868 	    } else {
1869 		if (cr->len - seq_pos + 1 > 0)
1870 		    memcpy(&seq[seq_pos-1], &s->ref[ref_pos - s->ref_start +1],
1871 			   cr->len - seq_pos + 1);
1872 		ref_pos += cr->len - seq_pos + 1;
1873 		if (md_dist >= 0)
1874 		    md_dist += cr->len - seq_pos + 1;
1875 	    }
1876 	} else {
1877 	    // So alignment end can be computed even when not decoding sequence
1878 	    ref_pos += cr->len - seq_pos + 1;
1879 	}
1880 
1881 	if (ncigar+1 >= cigar_alloc) {
1882 	    cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024;
1883 	    s->cigar = cigar;
1884 	    if (!(cigar = realloc(cigar, cigar_alloc * sizeof(*cigar))))
1885 		return -1;
1886 	}
1887 #ifdef USE_X
1888 	if (cig_len && cig_op != BAM_CBASE_MATCH) {
1889 	    cigar[ncigar++] = (cig_len<<4) + cig_op;
1890 	    cig_len = 0;
1891 	}
1892 	cig_op = BAM_CBASE_MATCH;
1893 #else
1894 	if (cig_len && cig_op != BAM_CMATCH) {
1895 	    cigar[ncigar++] = (cig_len<<4) + cig_op;
1896 	    cig_len = 0;
1897 	}
1898 	cig_op = BAM_CMATCH;
1899 #endif
1900 	cig_len += cr->len - seq_pos+1;
1901     }
1902 
1903  skip_cigar:
1904 
1905     if ((ds & CRAM_FN) && decode_md) {
1906 	if (md_dist >= 0)
1907 	    BLOCK_APPEND_UINT(s->aux_blk, md_dist);
1908     }
1909 
1910     if (cig_len) {
1911 	if (ncigar >= cigar_alloc) {
1912 	    cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024;
1913 	    s->cigar = cigar;
1914 	    if (!(cigar = realloc(cigar, cigar_alloc * sizeof(*cigar))))
1915 		return -1;
1916 	}
1917 
1918 	cigar[ncigar++] = (cig_len<<4) + cig_op;
1919     }
1920 
1921     cr->ncigar = ncigar - cr->cigar;
1922     cr->aend = ref_pos;
1923 
1924     //printf("2: %.*s %d .. %d %d\n", cr->name_len, (char *)BLOCK_DATA(s->name_blk) + cr->name, cr->apos, ref_pos, seq_pos);
1925 
1926     if (ds & CRAM_MQ) {
1927 	if (!c->comp_hdr->codecs[DS_MQ]) return -1;
1928 	r |= c->comp_hdr->codecs[DS_MQ]
1929 	                ->decode(s, c->comp_hdr->codecs[DS_MQ], blk,
1930 				 (char *)&cr->mqual, &out_sz);
1931     } else {
1932 	cr->mqual = 40;
1933     }
1934 
1935     if ((ds & CRAM_QS) && (cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
1936 	int32_t out_sz2 = cr->len;
1937 
1938 	if (ds & CRAM_QS) {
1939 	    if (!c->comp_hdr->codecs[DS_QS]) return -1;
1940 	    r |= c->comp_hdr->codecs[DS_QS]
1941 		            ->decode(s, c->comp_hdr->codecs[DS_QS], blk,
1942 				     qual, &out_sz2);
1943 	}
1944     }
1945 
1946     s->cigar = cigar;
1947     s->cigar_alloc = cigar_alloc;
1948     s->ncigar = ncigar;
1949 
1950     if (cr->cram_flags & CRAM_FLAG_NO_SEQ)
1951 	cr->len = 0;
1952 
1953     if (decode_md) {
1954 	BLOCK_APPEND_CHAR(s->aux_blk, '\0'); // null terminate MD:Z:
1955 	cr->aux_size += BLOCK_SIZE(s->aux_blk) - orig_aux;
1956     }
1957 
1958     if (decode_nm) {
1959 	char buf[7];
1960 	buf[0] = 'N'; buf[1] = 'M'; buf[2] = 'I';
1961 	buf[3] = (nm>> 0) & 0xff;
1962 	buf[4] = (nm>> 8) & 0xff;
1963 	buf[5] = (nm>>16) & 0xff;
1964 	buf[6] = (nm>>24) & 0xff;
1965 	BLOCK_APPEND(s->aux_blk, buf, 7);
1966 	cr->aux_size += 7;
1967     }
1968 
1969     return r;
1970 }
1971 
1972 /*
1973  * Quick and simple hash lookup for cram_map arrays
1974  */
map_find(cram_map ** map,unsigned char * key,int id)1975 static cram_map *map_find(cram_map **map, unsigned char *key, int id) {
1976     cram_map *m;
1977 
1978     m = map[CRAM_MAP(key[0],key[1])];
1979     while (m && m->key != id)
1980 	m= m->next;
1981 
1982     return m;
1983 }
1984 
1985 //#define map_find(M,K,I) M[CRAM_MAP(K[0],K[1])];while (m && m->key != I);m= m->next
1986 
1987 
cram_decode_aux_1_0(cram_container * c,cram_slice * s,cram_block * blk,cram_record * cr)1988 static int cram_decode_aux_1_0(cram_container *c, cram_slice *s,
1989 			       cram_block *blk, cram_record *cr) {
1990     int i, r = 0, out_sz = 1;
1991     unsigned char ntags;
1992 
1993     if (!c->comp_hdr->codecs[DS_TC]) return -1;
1994     r |= c->comp_hdr->codecs[DS_TC]->decode(s, c->comp_hdr->codecs[DS_TC], blk,
1995 				       (char *)&ntags, &out_sz);
1996     cr->ntags = ntags;
1997 
1998     //printf("TC=%d\n", cr->ntags);
1999     cr->aux_size = 0;
2000     cr->aux = BLOCK_SIZE(s->aux_blk);
2001 
2002     for (i = 0; i < cr->ntags; i++) {
2003 	int32_t id, out_sz = 1;
2004 	unsigned char tag_data[3];
2005 	cram_map *m;
2006 
2007 	//printf("Tag %d/%d\n", i+1, cr->ntags);
2008 	if (!c->comp_hdr->codecs[DS_TN]) return -1;
2009 	r |= c->comp_hdr->codecs[DS_TN]->decode(s, c->comp_hdr->codecs[DS_TN],
2010 					   blk, (char *)&id, &out_sz);
2011 	if (out_sz == 3) {
2012 	    tag_data[0] = ((char *)&id)[0];
2013 	    tag_data[1] = ((char *)&id)[1];
2014 	    tag_data[2] = ((char *)&id)[2];
2015 	} else {
2016 	    tag_data[0] = (id>>16) & 0xff;
2017 	    tag_data[1] = (id>>8)  & 0xff;
2018 	    tag_data[2] = id       & 0xff;
2019 	}
2020 
2021 	m = map_find(c->comp_hdr->tag_encoding_map, tag_data, id);
2022 	if (!m)
2023 	    return -1;
2024 	BLOCK_APPEND(s->aux_blk, (char *)tag_data, 3);
2025 
2026 	if (!m->codec) return -1;
2027 	r |= m->codec->decode(s, m->codec, blk, (char *)s->aux_blk, &out_sz);
2028 
2029 	cr->aux_size += out_sz + 3;
2030     }
2031 
2032     return r;
2033 }
2034 
cram_decode_aux(cram_container * c,cram_slice * s,cram_block * blk,cram_record * cr,int * has_MD,int * has_NM)2035 static int cram_decode_aux(cram_container *c, cram_slice *s,
2036 			   cram_block *blk, cram_record *cr,
2037 			   int *has_MD, int *has_NM) {
2038     int i, r = 0, out_sz = 1;
2039     int32_t TL = 0;
2040     unsigned char *TN;
2041     uint32_t ds = s->data_series;
2042 
2043     if (!(ds & (CRAM_TL|CRAM_aux))) {
2044 	cr->aux = 0;
2045 	cr->aux_size = 0;
2046 	return 0;
2047     }
2048 
2049     if (!c->comp_hdr->codecs[DS_TL]) return -1;
2050     r |= c->comp_hdr->codecs[DS_TL]->decode(s, c->comp_hdr->codecs[DS_TL], blk,
2051 					    (char *)&TL, &out_sz);
2052     if (r || TL < 0 || TL >= c->comp_hdr->nTL)
2053 	return -1;
2054 
2055     TN = c->comp_hdr->TL[TL];
2056     cr->ntags = strlen((char *)TN)/3; // optimise to remove strlen
2057 
2058     //printf("TC=%d\n", cr->ntags);
2059     cr->aux_size = 0;
2060     cr->aux = BLOCK_SIZE(s->aux_blk);
2061 
2062     if (!(ds & CRAM_aux))
2063 	return 0;
2064 
2065     for (i = 0; i < cr->ntags; i++) {
2066 	int32_t id, out_sz = 1;
2067 	unsigned char tag_data[3];
2068 	cram_map *m;
2069 
2070 	if (TN[0] == 'M' && TN[1] == 'D' && has_MD)
2071 	    *has_MD = 1;
2072 	if (TN[0] == 'N' && TN[1] == 'M' && has_NM)
2073 	    *has_NM = 1;
2074 
2075 	//printf("Tag %d/%d\n", i+1, cr->ntags);
2076 	tag_data[0] = *TN++;
2077 	tag_data[1] = *TN++;
2078 	tag_data[2] = *TN++;
2079 	id = (tag_data[0]<<16) | (tag_data[1]<<8) | tag_data[2];
2080 
2081 	m = map_find(c->comp_hdr->tag_encoding_map, tag_data, id);
2082 	if (!m)
2083 	    return -1;
2084 	BLOCK_APPEND(s->aux_blk, (char *)tag_data, 3);
2085 
2086 	if (!m->codec) return -1;
2087 	r |= m->codec->decode(s, m->codec, blk, (char *)s->aux_blk, &out_sz);
2088 	if (r) break;
2089 	cr->aux_size += out_sz + 3;
2090     }
2091 
2092     return r;
2093 }
2094 
2095 /* Resolve mate pair cross-references between recs within this slice */
cram_decode_slice_xref(cram_slice * s,int required_fields)2096 static int cram_decode_slice_xref(cram_slice *s, int required_fields) {
2097     int rec;
2098 
2099     if (!(required_fields & (SAM_RNEXT | SAM_PNEXT | SAM_TLEN))) {
2100 	for (rec = 0; rec < s->hdr->num_records; rec++) {
2101 	    cram_record *cr = &s->crecs[rec];
2102 
2103 	    cr->tlen = 0;
2104 	    cr->mate_pos = 0;
2105 	    cr->mate_ref_id = -1;
2106 	}
2107 
2108 	return 0;
2109     }
2110 
2111     for (rec = 0; rec < s->hdr->num_records; rec++) {
2112 	cram_record *cr = &s->crecs[rec];
2113 
2114 	if (cr->mate_line >= 0) {
2115 	    if (cr->mate_line < s->hdr->num_records) {
2116 		/*
2117 		 * On the first read, loop through computing lengths.
2118 		 * It's not perfect as we have one slice per reference so we
2119 		 * cannot detect when TLEN should be zero due to seqs that
2120 		 * map to multiple references.
2121 		 *
2122 		 * We also cannot set tlen correct when it spans a slice for
2123 		 * other reasons. This may make tlen too small. Should we
2124 		 * fix this by forcing TLEN to be stored verbatim in such cases?
2125 		 *
2126 		 * Or do we just admit defeat and output 0 for tlen? It's the
2127 		 * safe option...
2128 		 */
2129 		if (cr->tlen == INT_MIN) {
2130 		    int id1 = rec, id2 = rec;
2131 		    int aleft = cr->apos, aright = cr->aend;
2132 		    int tlen;
2133 		    int ref = cr->ref_id;
2134 
2135 		    // number of segments starting at the same point.
2136 		    int left_cnt = 0;
2137 
2138 		    do {
2139 			if (aleft > s->crecs[id2].apos)
2140 			    aleft = s->crecs[id2].apos, left_cnt = 1;
2141 			else if (aleft == s->crecs[id2].apos)
2142 			    left_cnt++;
2143 			if (aright < s->crecs[id2].aend)
2144 			    aright = s->crecs[id2].aend;
2145 			if (s->crecs[id2].mate_line == -1) {
2146 			    s->crecs[id2].mate_line = rec;
2147 			    break;
2148 			}
2149 			if (s->crecs[id2].mate_line <= id2 ||
2150                             s->crecs[id2].mate_line >= s->hdr->num_records)
2151 			    return -1;
2152 			id2 = s->crecs[id2].mate_line;
2153 
2154 			if (s->crecs[id2].ref_id != ref)
2155 			    ref = -1;
2156 		    } while (id2 != id1);
2157 
2158 		    if (ref != -1) {
2159 			tlen = aright - aleft + 1;
2160 			id1 = id2 = rec;
2161 
2162 			/*
2163 			 * When we have two seqs with identical start and
2164 			 * end coordinates, set +/- tlen based on 1st/last
2165 			 * bit flags instead, as a tie breaker.
2166 			 */
2167 			if (s->crecs[id2].apos == aleft) {
2168 			    if (left_cnt == 1 ||
2169 				(s->crecs[id2].flags & BAM_FREAD1))
2170 				s->crecs[id2].tlen = tlen;
2171 			    else
2172 				s->crecs[id2].tlen = -tlen;
2173 			} else {
2174 			    s->crecs[id2].tlen = -tlen;
2175 			}
2176 
2177 			id2 = s->crecs[id2].mate_line;
2178 			while (id2 != id1) {
2179 			    if (s->crecs[id2].apos == aleft) {
2180 				if (left_cnt == 1 ||
2181 				    (s->crecs[id2].flags & BAM_FREAD1))
2182 				    s->crecs[id2].tlen = tlen;
2183 				else
2184 				    s->crecs[id2].tlen = -tlen;
2185 			    } else {
2186 				s->crecs[id2].tlen = -tlen;
2187 			    }
2188 			    id2 = s->crecs[id2].mate_line;
2189 			}
2190 		    } else {
2191 			id1 = id2 = rec;
2192 
2193 			s->crecs[id2].tlen = 0;
2194 			id2 = s->crecs[id2].mate_line;
2195 			while (id2 != id1) {
2196 			    s->crecs[id2].tlen = 0;
2197 			    id2 = s->crecs[id2].mate_line;
2198 			}
2199 		    }
2200 		}
2201 
2202 		cr->mate_pos = s->crecs[cr->mate_line].apos;
2203 		cr->mate_ref_id = s->crecs[cr->mate_line].ref_id;
2204 
2205 		// paired
2206 		cr->flags |= BAM_FPAIRED;
2207 
2208 		// set mate unmapped if needed
2209 		if (s->crecs[cr->mate_line].flags & BAM_FUNMAP) {
2210 		    cr->flags |= BAM_FMUNMAP;
2211 		    cr->tlen = 0;
2212 		}
2213 		if (cr->flags & BAM_FUNMAP) {
2214 		    cr->tlen = 0;
2215 		}
2216 
2217 		// set mate reversed if needed
2218 		if (s->crecs[cr->mate_line].flags & BAM_FREVERSE)
2219 		    cr->flags |= BAM_FMREVERSE;
2220 	    } else {
2221 		fprintf(stderr, "Mate line out of bounds: %d vs [0, %d]\n",
2222 			cr->mate_line, s->hdr->num_records-1);
2223 	    }
2224 
2225 	    /* FIXME: construct read names here too if needed */
2226 	} else {
2227 	    if (cr->mate_flags & CRAM_M_REVERSE) {
2228 		cr->flags |= BAM_FPAIRED | BAM_FMREVERSE;
2229 	    }
2230 	    if (cr->mate_flags & CRAM_M_UNMAP) {
2231 		cr->flags |= BAM_FMUNMAP;
2232 		//cr->mate_ref_id = -1;
2233 	    }
2234 	    if (!(cr->flags & BAM_FPAIRED))
2235 		cr->mate_ref_id = -1;
2236 	}
2237 
2238 	if (cr->tlen == INT_MIN)
2239 	    cr->tlen = 0; // Just incase
2240     }
2241     return 0;
2242 }
2243 
md5_print(unsigned char * md5,char * out)2244 static char *md5_print(unsigned char *md5, char *out) {
2245     int i;
2246     for (i = 0; i < 16; i++) {
2247 	out[i*2+0] = "0123456789abcdef"[md5[i]>>4];
2248 	out[i*2+1] = "0123456789abcdef"[md5[i]&15];
2249     }
2250     out[32] = 0;
2251 
2252     return out;
2253 }
2254 
2255 static int cram_to_bam(SAM_hdr *bfd, cram_fd *fd, cram_slice *s,
2256 		       cram_record *cr, int rec, bam_seq_t **bam);
2257 
2258 /*
2259  * Bulk conversion of an entire cram slice to an array of bam objects.
2260  * (Assumption that fd->required_fields will not change from one
2261  * cram_get_bam_seq() call to the  next.)
2262  *
2263  * Returns 0 on success
2264  *        -1 on failure
2265  */
2266 
bam_size(SAM_hdr * bfd,cram_fd * fd,cram_record * cr)2267 static int bam_size(SAM_hdr *bfd, cram_fd *fd, cram_record *cr) {
2268     int name_len, rg_len;
2269 
2270     if (fd->required_fields & SAM_QNAME) {
2271 	if (cr->name_len)
2272 	    name_len = cr->name_len;
2273 	else
2274 	    name_len = strlen(fd->prefix) + 20; // overestimate
2275     } else {
2276 	name_len = 1;
2277     }
2278 
2279     rg_len = (cr->rg != -1) ? bfd->rg[cr->rg].name_len + 4 : 0;
2280 
2281     return sizeof(bam_seq_t)
2282 	+ name_len + 1
2283 	+ round4(name_len+1)
2284 	+ 4 * cr->ncigar
2285 	+ (cr->len+1)/2
2286 	+ cr->len
2287 	+ cr->aux_size + rg_len + 1;
2288 }
2289 
bulk_cram_to_bam(SAM_hdr * bfd,cram_fd * fd,cram_slice * s)2290 static int bulk_cram_to_bam(SAM_hdr *bfd, cram_fd *fd, cram_slice *s) {
2291     int i;
2292     int r = 0;
2293     int sizes[10000];
2294 
2295     size_t len = 0;
2296     for (i = 0; i < s->hdr->num_records; i++) {
2297 	int sz = bam_size(bfd, fd, &s->crecs[i]);
2298 	if (i < 10000)
2299 	    sizes[i] = sz;
2300 	len += sz;
2301     }
2302 
2303     s->bl = (bam_seq_t **)malloc(s->hdr->num_records * sizeof(*s->bl) + len);
2304     if (!s->bl)
2305 	return -1;
2306 
2307     char *x = ((char *)s->bl) + s->hdr->num_records * sizeof(*s->bl);
2308     for (i = 0; i < s->hdr->num_records; i++) {
2309 	bam_seq_t *b = (bam_seq_t *)x, *o = b;
2310 	int bsize = i < 10000 ? sizes[i] : bam_size(bfd, fd, &s->crecs[i]);
2311 	b->alloc = bsize;
2312 	r |= (cram_to_bam(fd->header, fd, s, &s->crecs[i], i, &b) < 0);
2313 	// if we allocated enough, the above won't have resized b
2314 	assert(o == b && o->alloc == bsize);
2315 	x += bsize;
2316 	s->bl[i] = b;
2317     }
2318 
2319     return 0;
2320 }
2321 
2322 /*
2323  * Decode an entire slice from container blocks. Fills out s->crecs[] array.
2324  * Returns 0 on success
2325  *        -1 on failure
2326  */
cram_decode_slice(cram_fd * fd,cram_container * c,cram_slice * s,SAM_hdr * bfd)2327 int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s,
2328 		      SAM_hdr *bfd) {
2329     cram_block *blk = s->block[0];
2330     int32_t bf, ref_id;
2331     unsigned char cf;
2332     int out_sz, r = 0;
2333     int rec;
2334     char *seq = NULL, *qual = NULL;
2335     int unknown_rg = -1;
2336     int embed_ref;
2337     char **refs = NULL;
2338     uint32_t ds;
2339 
2340     if (cram_dependent_data_series(fd, c->comp_hdr, s) != 0)
2341 	return -1;
2342 
2343     ds = s->data_series;
2344     //printf("%08x\n", ds);
2345 
2346     blk->bit = 7; // MSB first
2347 
2348     // Study the blocks and estimate approx sizes to preallocate.
2349     // This looks to speed up decoding by around 8-9%.
2350     // We can always shrink back down at the end if we overestimated.
2351     // However it's likely that this also saves memory as own growth
2352     // factor (*=1.5) is never applied.
2353     {
2354 	int qsize, nsize, q_id;
2355 	cram_decode_estimate_sizes(c->comp_hdr, s, &qsize, &nsize, &q_id);
2356 	//fprintf(stderr, "qsize=%d nsize=%d\n", qsize, nsize);
2357 
2358 	if (qsize && (ds & CRAM_RL)) BLOCK_RESIZE_EXACT(s->seqs_blk, qsize+1);
2359 	if (qsize && (ds & CRAM_RL)) BLOCK_RESIZE_EXACT(s->qual_blk, qsize+1);
2360 	if (nsize && (ds & CRAM_NS)) BLOCK_RESIZE_EXACT(s->name_blk, nsize+1);
2361 
2362 	// To do - consider using q_id here to usurp the quality block and
2363 	// avoid a memcpy during decode.
2364 	// Specifically when quality is an external block uniquely used by
2365 	// DS_QS only, then we can set s->qual_blk directly to this
2366 	// block and save the codec->decode() calls. (Approx 3% cpu saving)
2367     }
2368 
2369     /* Look for unknown RG, added as last by Java CRAM? */
2370     if (bfd->nrg > 0 &&
2371 	bfd->rg[bfd->nrg-1].name != NULL &&
2372 	!strcmp(bfd->rg[bfd->nrg-1].name, "UNKNOWN"))
2373 	unknown_rg = bfd->nrg-1;
2374 
2375     if (blk->content_type != CORE)
2376 	return -1;
2377 
2378     if (s->crecs)
2379 	free(s->crecs);
2380     if (!(s->crecs = malloc(s->hdr->num_records * sizeof(*s->crecs))))
2381 	return -1;
2382 
2383     ref_id = s->hdr->ref_seq_id;
2384     embed_ref = s->hdr->ref_base_id >= 0 ? 1 : 0;
2385 
2386     if (ref_id >= 0) {
2387 	if (embed_ref) {
2388 	    cram_block *b;
2389 	    if (s->hdr->ref_base_id < 0) {
2390 		fprintf(stderr, "No reference specified and "
2391 			"no embedded reference is available.\n");
2392 		return -1;
2393 	    }
2394 	    b = cram_get_block_by_id(s, s->hdr->ref_base_id);
2395 	    if (!b)
2396 		return -1;
2397             if (cram_uncompress_block(b) != 0)
2398                 return -1;
2399 	    s->ref = (char *)BLOCK_DATA(b);
2400 	    s->ref_start = s->hdr->ref_seq_start;
2401 	    s->ref_end   = s->hdr->ref_seq_start + s->hdr->ref_seq_span-1;
2402 	    if (s->ref_end - s->ref_start > b->uncomp_size) {
2403 		fprintf(stderr, "Embedded reference is too small.\n");
2404 		return -1;
2405 	    }
2406 	} else if (!c->comp_hdr->no_ref) {
2407 	    //// Avoid Java cramtools bug by loading entire reference seq
2408 	    //s->ref = cram_get_ref(fd, s->hdr->ref_seq_id, 1, 0);
2409 	    //s->ref_start = 1;
2410 
2411 	    if (fd->required_fields & SAM_SEQ)
2412 		s->ref =
2413 		cram_get_ref(fd, s->hdr->ref_seq_id,
2414 			     s->hdr->ref_seq_start,
2415 			     s->hdr->ref_seq_start + s->hdr->ref_seq_span -1);
2416 	    s->ref_start = s->hdr->ref_seq_start;
2417 	    s->ref_end   = s->hdr->ref_seq_start + s->hdr->ref_seq_span-1;
2418 
2419 	    /* Sanity check */
2420 	    if (s->ref_start < 0) {
2421 		fprintf(stderr, "Slice starts before base 1.\n");
2422 		s->ref_start = 0;
2423 	    }
2424 	    if (fd->ref_lock) pthread_mutex_lock(fd->ref_lock);
2425 	    pthread_mutex_lock(&fd->refs->lock);
2426 	    if ((fd->required_fields & SAM_SEQ) &&
2427 		s->ref_end > fd->refs->ref_id[ref_id]->length) {
2428 		s->ref_end = fd->refs->ref_id[ref_id]->length;
2429 	    }
2430 	    pthread_mutex_unlock(&fd->refs->lock);
2431 	    if (fd->ref_lock) pthread_mutex_unlock(fd->ref_lock);
2432 	}
2433     }
2434 
2435     if ((fd->required_fields & SAM_SEQ) &&
2436 	s->ref == NULL && s->hdr->ref_seq_id >= 0 && !c->comp_hdr->no_ref) {
2437 	fprintf(stderr, "Unable to fetch reference #%d %"PRId64"..%"PRId64"\n",
2438 		s->hdr->ref_seq_id, s->hdr->ref_seq_start,
2439 		s->hdr->ref_seq_start + s->hdr->ref_seq_span-1);
2440 	return -1;
2441     }
2442 
2443     if (!IS_CRAM_1_VERS(fd)
2444 	&& (fd->required_fields & SAM_SEQ)
2445 	&& s->hdr->ref_seq_id >= 0
2446 	&& !fd->ignore_md5
2447 	&& memcmp(s->hdr->md5, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0", 16)) {
2448 	MD5_CTX md5;
2449 	unsigned char digest[16];
2450 
2451 	if (s->ref && s->hdr->ref_seq_id >= 0) {
2452 	    int start, len;
2453 
2454 	    if (s->hdr->ref_seq_start >= s->ref_start) {
2455 		start = s->hdr->ref_seq_start - s->ref_start;
2456 	    } else {
2457 		fprintf(stderr, "Slice starts before base 1.\n");
2458 		start = 0;
2459 	    }
2460 
2461 	    if (s->hdr->ref_seq_span <= s->ref_end - s->ref_start + 1) {
2462 		len = s->hdr->ref_seq_span;
2463 	    } else {
2464 		fprintf(stderr, "Slice ends beyond reference end.\n");
2465 		len = s->ref_end - s->ref_start + 1;
2466 	    }
2467 
2468 	    MD5_Init(&md5);
2469 	    if (start + len > s->ref_end - s->ref_start + 1)
2470 		len = s->ref_end - s->ref_start + 1 - start;
2471 	    if (len >= 0)
2472 		MD5_Update(&md5, s->ref + start, len);
2473 	    MD5_Final(digest, &md5);
2474 	} else if (!s->ref && s->hdr->ref_base_id >= 0) {
2475 	    cram_block *b = cram_get_block_by_id(s, s->hdr->ref_base_id);
2476 	    if (b) {
2477 		MD5_Init(&md5);
2478 		MD5_Update(&md5, b->data, b->uncomp_size);
2479 		MD5_Final(digest, &md5);
2480 	    }
2481 	}
2482 
2483 	if ((!s->ref && s->hdr->ref_base_id < 0)
2484 	    || memcmp(digest, s->hdr->md5, 16) != 0) {
2485 	    char M[33];
2486 	    fprintf(stderr, "ERROR: md5sum reference mismatch for ref "
2487 		    "%d pos %d..%d\n", ref_id, s->ref_start, s->ref_end);
2488 	    fprintf(stderr, "CRAM: %s\n", md5_print(s->hdr->md5, M));
2489 	    fprintf(stderr, "Ref : %s\n", md5_print(digest, M));
2490 	    return -1;
2491 	}
2492     }
2493 
2494     if (ref_id == -2) {
2495 	if (fd->ref_lock) pthread_mutex_lock(fd->ref_lock);
2496 	pthread_mutex_lock(&fd->refs->lock);
2497 	refs = calloc(fd->refs->nref, sizeof(char *));
2498 	pthread_mutex_unlock(&fd->refs->lock);
2499 	if (fd->ref_lock) pthread_mutex_unlock(fd->ref_lock);
2500 	if (!refs)
2501 	    return -1;
2502     }
2503 
2504     int last_ref_id = -9; // Arbitrary -ve marker for not-yet-set
2505     for (rec = 0; rec < s->hdr->num_records; rec++) {
2506 	cram_record *cr = &s->crecs[rec];
2507 	int has_MD, has_NM;
2508 
2509 	//fprintf(stderr, "Decode seq %d, %d/%d\n", rec, blk->byte, blk->bit);
2510 
2511 	cr->s = s;
2512 
2513 	out_sz = 1; /* decode 1 item */
2514 	if (ds & CRAM_BF) {
2515 	    if (!c->comp_hdr->codecs[DS_BF]) return -1;
2516 	    r |= c->comp_hdr->codecs[DS_BF]
2517 		            ->decode(s, c->comp_hdr->codecs[DS_BF], blk,
2518 				     (char *)&bf, &out_sz);
2519 	    if (r || bf < 0 ||
2520 		bf >= sizeof(fd->bam_flag_swap)/sizeof(*fd->bam_flag_swap))
2521 		return -1;
2522 	    bf = fd->bam_flag_swap[bf];
2523 	    cr->flags = bf;
2524 	} else {
2525 	    cr->flags = bf = 0x4; // unmapped
2526 	}
2527 
2528 	if (ds & CRAM_CF) {
2529 	    if (IS_CRAM_1_VERS(fd)) {
2530 		/* CF is byte in 1.0, int32 in 2.0 */
2531 		if (!c->comp_hdr->codecs[DS_CF]) return -1;
2532 		r |= c->comp_hdr->codecs[DS_CF]
2533 		                ->decode(s, c->comp_hdr->codecs[DS_CF], blk,
2534 				 	 (char *)&cf, &out_sz);
2535 		if (r) return -1;
2536 		cr->cram_flags = cf;
2537 	    } else {
2538 		if (!c->comp_hdr->codecs[DS_CF]) return -1;
2539 		r |= c->comp_hdr->codecs[DS_CF]
2540 		                ->decode(s, c->comp_hdr->codecs[DS_CF], blk,
2541 					 (char *)&cr->cram_flags, &out_sz);
2542 		if (r) return -1;
2543 		cf = cr->cram_flags;
2544 	    }
2545 	} else {
2546 	    cf = cr->cram_flags = 0;
2547 	}
2548 
2549 	if (!IS_CRAM_1_VERS(fd) && ref_id == -2) {
2550 	    if (ds & CRAM_RI) {
2551 		if (!c->comp_hdr->codecs[DS_RI]) return -1;
2552 		r |= c->comp_hdr->codecs[DS_RI]
2553 		                ->decode(s, c->comp_hdr->codecs[DS_RI], blk,
2554 					 (char *)&cr->ref_id, &out_sz);
2555 		if (r) return -1;
2556 		if ((fd->required_fields & (SAM_SEQ|SAM_TLEN))
2557 		    && cr->ref_id >= 0
2558 		    && cr->ref_id != last_ref_id) {
2559 		    if (!c->comp_hdr->no_ref) {
2560 			if (!refs[cr->ref_id])
2561 			    refs[cr->ref_id] = cram_get_ref(fd, cr->ref_id,
2562 							    1, 0);
2563 			s->ref = refs[cr->ref_id];
2564 
2565 			if (!fd->unsorted && last_ref_id >= 0 && refs[last_ref_id]) {
2566 			    cram_ref_decr(fd->refs, last_ref_id);
2567 			    refs[last_ref_id] = NULL;
2568 			}
2569 		    }
2570 		    s->ref_start = 1;
2571 		    if (fd->ref_lock) pthread_mutex_lock(fd->ref_lock);
2572 		    pthread_mutex_lock(&fd->refs->lock);
2573 		    s->ref_end = fd->refs->ref_id[cr->ref_id]->length;
2574 		    pthread_mutex_unlock(&fd->refs->lock);
2575 		    if (fd->ref_lock) pthread_mutex_unlock(fd->ref_lock);
2576 
2577 		    last_ref_id = cr->ref_id;
2578 		}
2579 	    } else {
2580 		cr->ref_id = -1;
2581 	    }
2582 	} else {
2583 	    cr->ref_id = ref_id; // Forced constant in CRAM 1.0
2584 	}
2585 	if (cr->ref_id >= bfd->nref) {
2586 	    fprintf(stderr, "Requested unknown reference ID %d\n", cr->ref_id);
2587             return -1;
2588 	}
2589 
2590 	if (ds & CRAM_RL) {
2591 	    if (!c->comp_hdr->codecs[DS_RL]) return -1;
2592 	    r |= c->comp_hdr->codecs[DS_RL]
2593 		            ->decode(s, c->comp_hdr->codecs[DS_RL], blk,
2594 				     (char *)&cr->len, &out_sz);
2595 	    if (r) return r;
2596 	    if (cr->len < 0) {
2597 	        fprintf(stderr, "Read has negative length\n");
2598 		return -1;
2599 	    }
2600 	}
2601 
2602 	if (ds & CRAM_AP) {
2603 	    if (!c->comp_hdr->codecs[DS_AP]) return -1;
2604 	    if (CRAM_MAJOR_VERS(fd->version) < 4) {
2605 		int32_t i32;
2606 		r |= c->comp_hdr->codecs[DS_AP]
2607 		                ->decode(s, c->comp_hdr->codecs[DS_AP], blk,
2608 					 (char *)&i32, &out_sz);
2609 		cr->apos = i32;
2610 	    } else {
2611 		r |= c->comp_hdr->codecs[DS_AP]
2612 		                ->decode(s, c->comp_hdr->codecs[DS_AP], blk,
2613 					 (char *)&cr->apos, &out_sz);
2614 	    }
2615 	    if (r) return r;
2616 	    if (c->comp_hdr->AP_delta)
2617 		cr->apos += s->last_apos;
2618 	    s->last_apos=  cr->apos;
2619 	} else {
2620 	    cr->apos = c->ref_seq_start;
2621 	}
2622 
2623 	if (ds & CRAM_RG) {
2624 	    if (!c->comp_hdr->codecs[DS_RG]) return -1;
2625 	    r |= c->comp_hdr->codecs[DS_RG]
2626 		           ->decode(s, c->comp_hdr->codecs[DS_RG], blk,
2627 				    (char *)&cr->rg, &out_sz);
2628 	    if (r) return r;
2629 	    if (cr->rg == unknown_rg)
2630 		cr->rg = -1;
2631 	} else {
2632 	    cr->rg = -1;
2633 	}
2634 
2635 	cr->name_len = 0;
2636 
2637 	if (c->comp_hdr->read_names_included) {
2638 	    int32_t out_sz2 = 1;
2639 
2640 	    // Read directly into name cram_block
2641 	    cr->name = BLOCK_SIZE(s->name_blk);
2642 	    if (ds & CRAM_RN) {
2643 		if (!c->comp_hdr->codecs[DS_RN]) return -1;
2644 		r |= c->comp_hdr->codecs[DS_RN]
2645 		                ->decode(s, c->comp_hdr->codecs[DS_RN], blk,
2646 					 (char *)s->name_blk, &out_sz2);
2647 		if (r) return r;
2648 		cr->name_len = out_sz2;
2649 	    }
2650 	}
2651 
2652 	cr->mate_pos = 0;
2653 	cr->mate_line = -1;
2654 	cr->mate_ref_id = -1;
2655 	if ((ds & CRAM_CF) && (cf & CRAM_FLAG_DETACHED)) {
2656 	    if (ds & CRAM_MF) {
2657 		if (IS_CRAM_1_VERS(fd)) {
2658 		    /* MF is byte in 1.0, int32 in 2.0 */
2659 		    unsigned char mf;
2660 		    if (!c->comp_hdr->codecs[DS_MF]) return -1;
2661 		    r |= c->comp_hdr->codecs[DS_MF]
2662 			            ->decode(s, c->comp_hdr->codecs[DS_MF],
2663 					     blk, (char *)&mf, &out_sz);
2664 		    if (r) return r;
2665 		    cr->mate_flags = mf;
2666 		} else {
2667 		    if (!c->comp_hdr->codecs[DS_MF]) return -1;
2668 		    r |= c->comp_hdr->codecs[DS_MF]
2669 			            ->decode(s, c->comp_hdr->codecs[DS_MF],
2670 					     blk,
2671 					     (char *)&cr->mate_flags,
2672 					     &out_sz);
2673 		    if (r) return r;
2674 		}
2675 	    } else {
2676 		cr->mate_flags = 0;
2677 	    }
2678 
2679 	    if (!c->comp_hdr->read_names_included) {
2680 		int32_t out_sz2 = 1;
2681 
2682 		// Read directly into name cram_block
2683 		cr->name = BLOCK_SIZE(s->name_blk);
2684 		if (ds & CRAM_RN) {
2685 		    if (!c->comp_hdr->codecs[DS_RN]) return -1;
2686 		    r |= c->comp_hdr->codecs[DS_RN]
2687 			            ->decode(s, c->comp_hdr->codecs[DS_RN],
2688 					     blk, (char *)s->name_blk,
2689 					     &out_sz2);
2690 		    if (r) return r;
2691 		    cr->name_len = out_sz2;
2692 		}
2693 	    }
2694 
2695 	    if (ds & CRAM_NS) {
2696 		if (!c->comp_hdr->codecs[DS_NS]) return -1;
2697 		r |= c->comp_hdr->codecs[DS_NS]
2698 		                ->decode(s, c->comp_hdr->codecs[DS_NS], blk,
2699 					 (char *)&cr->mate_ref_id, &out_sz);
2700 		if (r) return r;
2701 	    }
2702 
2703 // Skip as mate_ref of "*" is legit. It doesn't mean unmapped, just unknown.
2704 //	    if (cr->mate_ref_id == -1 && cr->flags & 0x01) {
2705 //		/* Paired, but unmapped */
2706 //		cr->flags |= BAM_FMUNMAP;
2707 //	    }
2708 
2709 	    if (ds & CRAM_NP) {
2710 		if (!c->comp_hdr->codecs[DS_NP]) return -1;
2711 		if (CRAM_MAJOR_VERS(fd->version) < 4) {
2712 		    int32_t i32;
2713 		    r |= c->comp_hdr->codecs[DS_NP]
2714 			            ->decode(s, c->comp_hdr->codecs[DS_NP], blk,
2715 					     (char *)&i32, &out_sz);
2716 		    cr->mate_pos = i32;
2717 		} else {
2718 		    r |= c->comp_hdr->codecs[DS_NP]
2719 			->decode(s, c->comp_hdr->codecs[DS_NP], blk,
2720 				 (char *)&cr->mate_pos, &out_sz);
2721 		}
2722 		if (r) return r;
2723 	    }
2724 
2725 	    if (ds & CRAM_TS) {
2726 		if (!c->comp_hdr->codecs[DS_TS]) return -1;
2727 		if (CRAM_MAJOR_VERS(fd->version) < 4) {
2728 		    int32_t i32;
2729 		    r |= c->comp_hdr->codecs[DS_TS]
2730 			->decode(s, c->comp_hdr->codecs[DS_TS], blk,
2731 				 (char *)&i32, &out_sz);
2732 		    cr->tlen = i32;
2733 		} else {
2734 		    r |= c->comp_hdr->codecs[DS_TS]
2735 		                    ->decode(s, c->comp_hdr->codecs[DS_TS], blk,
2736 					     (char *)&cr->tlen, &out_sz);
2737 		}
2738 		if (r) return r;
2739 	    } else {
2740 		cr->tlen = INT_MIN;
2741 	    }
2742 	} else if ((ds & CRAM_CF) && (cf & CRAM_FLAG_MATE_DOWNSTREAM)) {
2743 	    if (ds & CRAM_NF) {
2744 		if (!c->comp_hdr->codecs[DS_NF]) return -1;
2745 		r |= c->comp_hdr->codecs[DS_NF]
2746 		                ->decode(s, c->comp_hdr->codecs[DS_NF], blk,
2747 					 (char *)&cr->mate_line, &out_sz);
2748 		if (r) return r;
2749 		cr->mate_line += rec + 1;
2750 
2751 		//cr->name_len = sprintf(name, "%d", name_id++);
2752 		//cr->name = DSTRING_LEN(name_ds);
2753 		//dstring_nappend(name_ds, name, cr->name_len);
2754 
2755 		cr->mate_ref_id = -1;
2756 		cr->tlen = INT_MIN;
2757 		cr->mate_pos = 0;
2758 	    } else  {
2759 		cr->mate_flags = 0;
2760 		cr->tlen = INT_MIN;
2761 	    }
2762 	} else {
2763 	    cr->mate_flags = 0;
2764 	    cr->tlen = INT_MIN;
2765 	}
2766 	/*
2767 	else if (!name[0]) {
2768 	    //name[0] = '?'; name[1] = 0;
2769 	    //cr->name_len = 1;
2770 	    //cr->name=  DSTRING_LEN(s->name_ds);
2771 	    //dstring_nappend(s->name_ds, "?", 1);
2772 
2773 	    cr->mate_ref_id = -1;
2774 	    cr->tlen = 0;
2775 	    cr->mate_pos = 0;
2776 	}
2777 	*/
2778 
2779 	/* Auxiliary tags */
2780 	has_MD = has_NM = 0;
2781 	if (IS_CRAM_1_VERS(fd))
2782 	    r |= cram_decode_aux_1_0(c, s, blk, cr);
2783 	else
2784 	    r |= cram_decode_aux(c, s, blk, cr, &has_MD, &has_NM);
2785 	if (r) return r;
2786 
2787 	/* Fake up dynamic string growth and appending */
2788 	if (ds & CRAM_RL) {
2789 	    cr->seq = BLOCK_SIZE(s->seqs_blk);
2790 	    BLOCK_GROW(s->seqs_blk, cr->len);
2791 	    seq = (char *)BLOCK_END(s->seqs_blk);
2792 	    BLOCK_SIZE(s->seqs_blk) += cr->len;
2793 
2794 	    if (!seq)
2795 		return -1;
2796 
2797 	    cr->qual = BLOCK_SIZE(s->qual_blk);
2798 	    BLOCK_GROW(s->qual_blk, cr->len);
2799 	    qual = (char *)BLOCK_END(s->qual_blk);
2800 	    BLOCK_SIZE(s->qual_blk) += cr->len;
2801 
2802 	    if (!s->ref)
2803 		memset(seq, '=', cr->len);
2804 	}
2805 
2806 	if (!(bf & BAM_FUNMAP)) {
2807             if ((ds & CRAM_AP) && cr->apos <= 0) {
2808                 fprintf(stderr,
2809 			"Read has alignment position %"PRId64
2810 			" but no unmapped flag\n",
2811 			cr->apos);
2812 		return -1;
2813 	    }
2814 	    /* Decode sequence and generate CIGAR */
2815 	    if (ds & (CRAM_SEQ | CRAM_MQ)) {
2816 		r |= cram_decode_seq(fd, c, s, blk, cr, bfd, cf, seq, qual,
2817 				     has_MD, has_NM);
2818 		if (r) return r;
2819 	    } else {
2820 		cr->cigar = 0;
2821 		cr->ncigar = 0;
2822 		cr->aend = cr->apos;
2823 		cr->mqual = 0;
2824 	    }
2825 	} else {
2826 	    int out_sz2 = cr->len;
2827 
2828 	    //puts("Unmapped");
2829 	    cr->cigar = 0;
2830 	    cr->ncigar = 0;
2831 	    cr->aend = cr->apos;
2832 	    cr->mqual = 0;
2833 
2834 	    if (ds & CRAM_BA && cr->len) {
2835 		if (!c->comp_hdr->codecs[DS_BA]) return -1;
2836 		r |= c->comp_hdr->codecs[DS_BA]
2837 		                ->decode(s, c->comp_hdr->codecs[DS_BA], blk,
2838 					 (char *)seq, &out_sz2);
2839 		if (r) return r;
2840 	    }
2841 
2842 	    if ((ds & CRAM_CF) && (cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
2843 		out_sz2 = cr->len;
2844 		if (ds & CRAM_QS && cr->len >= 0) {
2845 		    if (!c->comp_hdr->codecs[DS_QS]) return -1;
2846 		    r |= c->comp_hdr->codecs[DS_QS]
2847 			            ->decode(s, c->comp_hdr->codecs[DS_QS],
2848 					     blk, qual, &out_sz2);
2849 		    if (r) return r;
2850 		}
2851 	    } else {
2852 		if (ds & CRAM_RL)
2853 		    memset(qual, 255, cr->len);
2854 	    }
2855 	}
2856 
2857 	if (!fd->ignore_chksum) {
2858 	    if (s->hdr->BD_crc && ds & CRAM_BA && s->ref)
2859 		s->BD_crc += iolib_crc32(0L, (Bytef *) seq, cr->len);
2860 
2861 	    if (s->hdr->SD_crc &&
2862 		(ds & CRAM_QS) &&
2863 		(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
2864 		s->SD_crc += iolib_crc32(0L, (Bytef *) qual, cr->len);
2865 	    }
2866 	}
2867     }
2868 
2869     if (fd->ref_lock) pthread_mutex_lock(fd->ref_lock);
2870     if (refs) {
2871 	int i;
2872 	for (i = 0; i < fd->refs->nref; i++) {
2873 	    if (refs[i])
2874 		cram_ref_decr(fd->refs, i);
2875 	}
2876 	free(refs);
2877     } else if (ref_id >= 0 && s->ref != fd->ref_free && !embed_ref) {
2878 	cram_ref_decr(fd->refs, ref_id);
2879     }
2880     if (fd->ref_lock) pthread_mutex_unlock(fd->ref_lock);
2881 
2882     /* Resolve mate pair cross-references between recs within this slice */
2883     r |= cram_decode_slice_xref(s, fd->required_fields);
2884 
2885     // Free the original blocks as we no longer need these.
2886     {
2887 	int i;
2888 	for (i = 0; i < s->hdr->num_blocks; i++) {
2889 	    cram_block *b = s->block[i];
2890 	    cram_free_block(b);
2891 	    s->block[i] = NULL;
2892 	}
2893     }
2894 
2895     // Also see initial BLOCK_RESIZE_EXACT at top of function.
2896     // As we grow blocks we overallocate by up to 50%. So shrink
2897     // back to their final sizes here.
2898     //
2899 //    fprintf(stderr, "%d %d // %d %d // %d %d // %d %d\n",
2900 //    	    (int)s->seqs_blk->byte, (int)s->seqs_blk->alloc,
2901 //    	    (int)s->qual_blk->byte, (int)s->qual_blk->alloc,
2902 //    	    (int)s->name_blk->byte, (int)s->name_blk->alloc,
2903 //    	    (int)s->aux_blk->byte,  (int)s->aux_blk->alloc);
2904     BLOCK_RESIZE_EXACT(s->seqs_blk, BLOCK_SIZE(s->seqs_blk)+1);
2905     BLOCK_RESIZE_EXACT(s->qual_blk, BLOCK_SIZE(s->qual_blk)+1);
2906     BLOCK_RESIZE_EXACT(s->name_blk, BLOCK_SIZE(s->name_blk)+1);
2907     BLOCK_RESIZE_EXACT(s->aux_blk,  BLOCK_SIZE(s->aux_blk)+1);
2908 
2909     /* Checksum */
2910     if (CRAM_MAJOR_VERS(fd->version) >= 3) {
2911 	if (ds & CRAM_BA && s->BD_crc && s->hdr->BD_crc) {
2912 	    if (s->BD_crc != s->hdr->BD_crc) {
2913 		fprintf(stderr, "BD checksum failure: %08x vs %08x\n",
2914 			(uint32_t)s->BD_crc, (uint32_t)s->hdr->BD_crc);
2915 		r |= -1;
2916 	    }
2917 	}
2918 
2919 	if (ds & CRAM_QS && s->SD_crc && s->hdr->SD_crc) {
2920 	    if (s->SD_crc != s->hdr->SD_crc) {
2921 		fprintf(stderr, "SD checksum failure: %08x vs %08x\n",
2922 			(uint32_t)s->SD_crc, (uint32_t)s->hdr->SD_crc);
2923 		r |= -1;
2924 	    }
2925 	}
2926     }
2927 
2928     // If we're wanting BAM records, convert these up-front too.
2929     // This is useful when we're streaming lots of data in a
2930     // multi-threaded environment as the cram to bam conversion is
2931     // then threaded too.
2932     //
2933     // Possible future optimisation - check range query and don't
2934     // convert all reads to BAM.
2935 
2936     if (fd->pool)
2937 	r |= bulk_cram_to_bam(bfd, fd, s);
2938 
2939     return r;
2940 }
2941 
2942 typedef struct {
2943     cram_fd *fd;
2944     cram_container *c;
2945     cram_slice *s;
2946     SAM_hdr *h;
2947     int exit_code;
2948 } cram_decode_job;
2949 
cram_decode_slice_thread(void * arg)2950 void *cram_decode_slice_thread(void *arg) {
2951     cram_decode_job *j = (cram_decode_job *)arg;
2952 
2953     j->exit_code = cram_decode_slice(j->fd, j->c, j->s, j->h);
2954 
2955     return j;
2956 }
2957 
2958 /*
2959  * Spawn a multi-threaded version of cram_decode_slice().
2960  */
cram_decode_slice_mt(cram_fd * fd,cram_container * c,cram_slice * s,SAM_hdr * bfd)2961 int cram_decode_slice_mt(cram_fd *fd, cram_container *c, cram_slice *s,
2962 			 SAM_hdr *bfd) {
2963     cram_decode_job *j;
2964     int nonblock;
2965 
2966     if (!fd->pool)
2967 	return cram_decode_slice(fd, c, s, bfd);
2968 
2969     if (!(j = malloc(sizeof(*j))))
2970 	return -1;
2971 
2972     j->fd = fd;
2973     j->c  = c;
2974     j->s  = s;
2975     j->h  = bfd;
2976 
2977     nonblock = t_pool_results_queue_sz(fd->rqueue) ? 1 : 0;
2978 
2979     if (-1 == t_pool_dispatch2(fd->pool, fd->rqueue, cram_decode_slice_thread,
2980 			       j, nonblock)) {
2981 	/* Would block */
2982 	fd->job_pending = j;
2983     } else {
2984 	fd->job_pending = NULL;
2985     }
2986 
2987     // flush too
2988     return 0;
2989 }
2990 
2991 
2992 /* ----------------------------------------------------------------------
2993  * CRAM sequence iterators.
2994  */
2995 
2996 /*
2997  * Converts a cram in-memory record into a bam in-memory record. We
2998  * pass a pointer to a bam_seq_t pointer along with the a pointer to
2999  * the allocated size. These can initially be pointers to NULL and zero.
3000  *
3001  * This function will reallocate the bam buffer as required and update
3002  * (*bam)->alloc accordingly, allowing it to be used within a loop
3003  * efficiently without needing to allocate new bam objects over and
3004  * over again.
3005  *
3006  * Returns the used size of the bam record on success
3007  *         -1 on failure.
3008  */
cram_to_bam(SAM_hdr * bfd,cram_fd * fd,cram_slice * s,cram_record * cr,int rec,bam_seq_t ** bam)3009 static int cram_to_bam(SAM_hdr *bfd, cram_fd *fd, cram_slice *s,
3010 		       cram_record *cr, int rec, bam_seq_t **bam) {
3011     int bam_idx, rg_len;
3012     char name_a[1024], *name;
3013     int name_len;
3014     char *aux, *aux_orig;
3015     char *seq, *qual;
3016 
3017     /* Assign names if not explicitly set */
3018     if (fd->required_fields & SAM_QNAME) {
3019 	if (cr->name_len) {
3020 	    name = (char *)BLOCK_DATA(s->name_blk) + cr->name;
3021 	    name_len = cr->name_len;
3022 	} else {
3023 	    name = name_a;
3024 	    name_len = strlen(fd->prefix);
3025 	    memcpy(name, fd->prefix, name_len);
3026 	    name += name_len;
3027 	    *name++ = ':';
3028 	    if (cr->mate_line >= 0 && cr->mate_line < rec)
3029 		name = (char *)append_uint64((unsigned char *)name,
3030 					     s->hdr->record_counter +
3031 					     cr->mate_line + 1);
3032 	    else
3033 		name = (char *)append_uint64((unsigned char *)name,
3034 					     s->hdr->record_counter +
3035 					     rec + 1);
3036 	    name_len = name - name_a;
3037 	    name = name_a;
3038 	}
3039     } else {
3040 	name = "?";
3041 	name_len = 1;
3042     }
3043 
3044     /* Generate BAM record */
3045     if (cr->rg < -1 || cr->rg >= bfd->nrg)
3046 	return -1;
3047     rg_len = (cr->rg != -1) ? bfd->rg[cr->rg].name_len + 4 : 0;
3048 
3049     if (fd->required_fields & (SAM_SEQ | SAM_QUAL)) {
3050 	if (!BLOCK_DATA(s->seqs_blk))
3051 	    return -1;
3052 	seq = (char *)BLOCK_DATA(s->seqs_blk) + cr->seq;
3053     } else {
3054 	seq = "*";
3055 	cr->len = 0;
3056     }
3057 
3058     if (fd->required_fields & SAM_QUAL) {
3059 	if (!BLOCK_DATA(s->qual_blk))
3060 	    return -1;
3061 	qual = (char *)BLOCK_DATA(s->qual_blk) + cr->qual;
3062 	if (CRAM_MAJOR_VERS(fd->version) >= 4) {
3063 	    if (cr->flags & BAM_FREVERSE) {
3064 		int i, j;
3065 		for (i = 0, j = cr->len-1; i < j; i++, j--) {
3066 		    unsigned char c;
3067 		    c = qual[i];
3068 		    qual[i] = qual[j];
3069 		    qual[j] = c;
3070 		}
3071 	    }
3072 	}
3073     } else {
3074 	qual = NULL;
3075     }
3076 
3077     bam_idx = bam_construct_seq(bam, cr->aux_size + rg_len,
3078 				name, name_len,
3079 				cr->flags,
3080 				cr->ref_id,
3081 				cr->apos,
3082 				cr->aend,
3083 				cr->mqual,
3084 				cr->ncigar, &s->cigar[cr->cigar],
3085 				cr->mate_ref_id,
3086 				cr->mate_pos,
3087 				cr->tlen,
3088 				cr->len,
3089 				seq,
3090 				qual);
3091     if (bam_idx == -1)
3092 	return -1;
3093 
3094     aux = aux_orig = (char *)bam_aux(*bam);
3095 
3096     /* Auxiliary strings */
3097     if (cr->aux_size != 0) {
3098 	memcpy(aux, BLOCK_DATA(s->aux_blk) + cr->aux, cr->aux_size);
3099 	aux += cr->aux_size;
3100     }
3101 
3102     /* RG:Z: */
3103     if (cr->rg != -1) {
3104 	int len = bfd->rg[cr->rg].name_len;
3105 	*aux++ = 'R'; *aux++ = 'G'; *aux++ = 'Z';
3106 	memcpy(aux, bfd->rg[cr->rg].name, len);
3107 	aux += len;
3108 	*aux++ = 0;
3109     }
3110 
3111 #ifndef SAMTOOLS
3112     bam_set_blk_size(*bam, bam_blk_size(*bam) + (aux - aux_orig));
3113 #endif
3114 
3115     *aux++ = 0;
3116 
3117     return bam_idx + (aux - aux_orig);
3118 }
3119 
3120 /*
3121  * Here be dragons! The multi-threading code in this is crufty beyond belief.
3122  */
3123 /*
3124  * Load first container.
3125  * Called when fd->ctr is NULL>
3126  *
3127  * Returns container on success
3128  *        NULL on failure.
3129  */
cram_first_slice(cram_fd * fd)3130 static cram_container *cram_first_slice(cram_fd *fd) {
3131     cram_container *c;
3132 
3133     do {
3134 	if (!(c = fd->ctr = cram_read_container(fd)))
3135 	    return NULL;
3136 	c->curr_slice_mt = c->curr_slice;
3137     } while (c->length == 0);
3138 
3139     /*
3140      * The first container may be a result of a sub-range query.
3141      * In which case it may still not be the optimal starting point
3142      * due to skipped containers/slices in the index.
3143      */
3144     // No need for locks here as we're in the main thread.
3145     if (fd->range.refid != -2) {
3146 	while (c->ref_seq_id != -2 &&
3147 	       (c->ref_seq_id < fd->range.refid ||
3148 		(fd->range.refid >= 0 && c->ref_seq_id == fd->range.refid
3149 		 && c->ref_seq_start + c->ref_seq_span-1 < fd->range.start))) {
3150 	    if (0 != cram_seek(fd, c->length, SEEK_CUR))
3151 		return NULL;
3152 	    cram_free_container(fd->ctr);
3153 	    do {
3154 		if (!(c = fd->ctr = cram_read_container(fd)))
3155 		    return NULL;
3156 	    } while (c->length == 0);
3157 	}
3158 
3159 	if (c->ref_seq_id != -2 && c->ref_seq_id != fd->range.refid) {
3160 	    fd->eof = 1;
3161 	    return NULL;
3162 	}
3163     }
3164 
3165     if (!(c->comp_hdr_block = cram_read_block(fd)))
3166 	return NULL;
3167     if (c->comp_hdr_block->content_type != COMPRESSION_HEADER)
3168 	return NULL;
3169 
3170     c->comp_hdr = cram_decode_compression_header(fd, c->comp_hdr_block);
3171     if (!c->comp_hdr)
3172 	return NULL;
3173     if (!c->comp_hdr->AP_delta &&
3174 	sam_hdr_sort_order(fd->header) != ORDER_COORD) {
3175 	if (fd->ref_lock) pthread_mutex_lock(fd->ref_lock);
3176 	fd->unsorted = 1;
3177 	if (fd->ref_lock) pthread_mutex_unlock(fd->ref_lock);
3178     }
3179 
3180     return c;
3181 }
3182 
cram_next_slice(cram_fd * fd,cram_container ** cp)3183 static cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp) {
3184     cram_container *c_curr;  // container being consumed via cram_get_seq()
3185     cram_slice *s_curr = NULL;
3186 
3187     // Populate the first container if unknown.
3188     if (!(c_curr = fd->ctr)) {
3189 	if (!(c_curr = cram_first_slice(fd)))
3190 	    return NULL;
3191     }
3192 
3193     // Discard previous slice
3194     if ((s_curr = c_curr->slice)) {
3195 	c_curr->slice = NULL;
3196 	cram_free_slice(s_curr);
3197 	s_curr = NULL;
3198     }
3199 
3200     // If we've consumed all slices in this container, also discard
3201     // the container too.
3202     if (c_curr->curr_slice == c_curr->max_slice) {
3203 	if (fd->ctr == c_curr)
3204 	    fd->ctr = NULL;
3205 	if (fd->ctr_mt == c_curr)
3206 	    fd->ctr_mt = NULL;
3207 	cram_free_container(c_curr);
3208 	c_curr = NULL;
3209     }
3210 
3211     if (!fd->ctr_mt)
3212 	fd->ctr_mt = c_curr;
3213 
3214     // Fetch the next slice (and the container if necessary).
3215     //
3216     // If single threaded this loop bails out as soon as it finds
3217     // a slice in range.  In this case c_next and c_curr end up being
3218     // the same thing.
3219     //
3220     // If multi-threaded, we loop until we have filled out
3221     // thread pool input queue.  Here c_next and c_curr *may* differ, as
3222     // can fd->ctr and fd->ctr_mt.
3223     for (;;) {
3224 	cram_container *c_next = fd->ctr_mt;
3225 	cram_slice *s_next = NULL;
3226 
3227 	// Next slice; either from the last job we failed to push
3228 	// to the input queue or via more I/O.
3229 	if (fd->job_pending) {
3230 	    cram_decode_job *j = (cram_decode_job *)fd->job_pending;
3231 	    c_next = j->c;
3232 	    s_next = j->s;
3233 	    free(fd->job_pending);
3234 	    fd->job_pending = NULL;
3235 	} else if (!fd->ooc) {
3236 	empty_container:
3237 	    if (!c_next || c_next->curr_slice_mt == c_next->max_slice) {
3238 		// new container
3239 		for(;;) {
3240 		    if (!(c_next = cram_read_container(fd))) {
3241 			if (fd->pool) {
3242 			    fd->ooc = 1;
3243 			    break;
3244 			}
3245 
3246 			return NULL;
3247 		    }
3248 		    c_next->curr_slice_mt = c_next->curr_slice;
3249 
3250 		    if (c_next->length != 0)
3251 			break;
3252 
3253 		    cram_free_container(c_next);
3254 		}
3255 		if (fd->ooc)
3256 		    break;
3257 
3258 		/* Skip containers not yet spanning our range */
3259 		if (fd->range.refid != -2 && c_next->ref_seq_id != -2) {
3260 		    // ref_id beyond end of range; bail out
3261 		    if (c_next->ref_seq_id != fd->range.refid) {
3262 			cram_free_container(c_next);
3263 			fd->ctr_mt = NULL;
3264 			fd->ooc = 1;
3265 			break;
3266 		    }
3267 
3268 		    // position beyond end of range; bail out
3269 		    if (c_next->ref_seq_start > fd->range.end) {
3270 			cram_free_container(c_next);
3271 			fd->ctr_mt = NULL;
3272 			fd->ooc = 1;
3273 			break;
3274 		    }
3275 
3276 		    // before start of range; skip to next container
3277 		    if (c_next->ref_seq_start + c_next->ref_seq_span-1 <
3278 			fd->range.start) {
3279 			c_next->curr_slice_mt = c_next->max_slice;
3280 			cram_seek(fd, c_next->length, SEEK_CUR);
3281 			cram_free_container(c_next);
3282 			c_next = NULL;
3283 			continue;
3284 		    }
3285 		}
3286 
3287 		// Container is valid range, so remember it for restarting
3288 		// this function.
3289 		fd->ctr_mt = c_next;
3290 
3291 		if (!(c_next->comp_hdr_block = cram_read_block(fd)))
3292 		    return NULL;
3293 		if (c_next->comp_hdr_block->content_type != COMPRESSION_HEADER)
3294 		    return NULL;
3295 
3296 		c_next->comp_hdr =
3297 		    cram_decode_compression_header(fd, c_next->comp_hdr_block);
3298 		if (!c_next->comp_hdr)
3299 		    return NULL;
3300 
3301 		if (!c_next->comp_hdr->AP_delta &&
3302 		    sam_hdr_sort_order(fd->header) != ORDER_COORD) {
3303 		    if (fd->ref_lock) pthread_mutex_lock(fd->ref_lock);
3304 		    fd->unsorted = 1;
3305 		    if (fd->ref_lock) pthread_mutex_unlock(fd->ref_lock);
3306 		}
3307 	    }
3308 
3309 	    if (c_next->num_records == 0) {
3310 		cram_free_container(c_next);
3311 		if (fd->ctr_mt == c_next)
3312 		    fd->ctr_mt = NULL;
3313 		c_next = NULL;
3314 		goto empty_container;
3315 	    }
3316 
3317 	    if (!(s_next = c_next->slice = cram_read_slice(fd)))
3318 		return NULL;
3319 
3320 	    s_next->slice_num = ++c_next->curr_slice_mt;
3321 	    s_next->curr_rec = 0;
3322 	    s_next->max_rec = s_next->hdr->num_records;
3323 
3324 	    s_next->last_apos = s_next->hdr->ref_seq_start;
3325 
3326 	    // We know the container overlaps our range, but with multi-slice
3327 	    // containers we may have slices that do not.  Skip these also.
3328 	    if (fd->range.refid != -2 && s_next->hdr->ref_seq_id != -2) {
3329 		// ref_id beyond end of range; bail out
3330 		if (s_next->hdr->ref_seq_id != fd->range.refid) {
3331 		    fd->ooc = 1;
3332 		    cram_free_slice(s_next);
3333 		    s_next = NULL;
3334 		    break;
3335 		}
3336 
3337 		// position beyond end of range; bail out
3338 		if (s_next->hdr->ref_seq_start > fd->range.end) {
3339 		    fd->ooc = 1;
3340 		    cram_free_slice(s_next);
3341 		    s_next = NULL;
3342 		    break;
3343 		}
3344 
3345 		// before start of range; skip to next slice
3346 		if (s_next->hdr->ref_seq_start + s_next->hdr->ref_seq_span-1 <
3347 		    fd->range.start) {
3348 		    cram_free_slice(s_next);
3349 		    continue;
3350 		}
3351 	    }
3352 	} // end: if (!fd->ooc)
3353 
3354 	if (!c_next || !s_next)
3355 	    break;
3356 
3357 	// Decode the slice, either right now (non-threaded) or by pushing
3358 	// it to the a decode queue (threaded).
3359 	if (cram_decode_slice_mt(fd, c_next, s_next, fd->header) != 0) {
3360 	    fprintf(stderr, "Failure to decode slice\n");
3361 	    cram_free_slice(s_next);
3362 	    c_next->slice = NULL;
3363 	    return NULL;
3364 	}
3365 
3366 	// No thread pool, so don't loop again
3367 	if (!fd->pool) {
3368 	    c_curr = c_next;
3369 	    s_curr = s_next;
3370 	    break;
3371 	}
3372 
3373 	// With thread pool, but we have a job pending so our decode queue
3374 	// is full.
3375 	if (fd->job_pending)
3376 	    break;
3377 
3378 	// Otherwise we're threaded with room in the decode input queue, so
3379 	// keep reading slices for decode.
3380 	// Push it a bit far, to qsize in queue rather than pending arrival,
3381 	// as cram tends to be a bit bursty in decode timings.
3382 	if (t_pool_results_queue_len(fd->rqueue) > fd->pool->qsize)
3383 	    break;
3384     } // end of for(;;)
3385 
3386 
3387     // When not threaded we've already have c_curr and s_curr.
3388     // Otherwise we need get them by pulling off the decode output queue.
3389     if (fd->pool) {
3390 	t_pool_result *res;
3391 	cram_decode_job *j;
3392 
3393 	if (fd->ooc && t_pool_results_queue_empty(fd->rqueue))
3394 	    return NULL;
3395 
3396 	res = t_pool_next_result_wait(fd->rqueue);
3397 
3398 	if (!res || !res->data) {
3399 	    fprintf(stderr, "Call to t_pool_next_result failed\n");
3400 	    return NULL;
3401 	}
3402 
3403 	j = (cram_decode_job *)res->data;
3404 	c_curr = j->c;
3405 	s_curr = j->s;
3406 
3407 	if (j->exit_code != 0) {
3408 	    fprintf(stderr, "Slice decode failure\n");
3409 	    fd->eof = 0;
3410 	    t_pool_delete_result(res, 1);
3411 	    return NULL;
3412 	}
3413 
3414 	t_pool_delete_result(res, 1);
3415     }
3416 
3417     *cp = c_curr;
3418 
3419     // Update current slice being processed (as opposed to current
3420     // slice in the multi-threaded reahead.
3421     fd->ctr = c_curr;
3422     if (c_curr) {
3423 	c_curr->slice = s_curr;
3424 	if (s_curr)
3425 	    c_curr->curr_slice = s_curr->slice_num;
3426     }
3427     if (s_curr)
3428 	s_curr->curr_rec = 0;
3429     else
3430 	fd->eof = 1;
3431 
3432     return s_curr;
3433 }
3434 
3435 /*
3436  * Read the next cram record and return it.
3437  * Note that to decode cram_record the caller will need to look up some data
3438  * in the current slice, pointed to by fd->ctr->slice. This is valid until
3439  * the next call to cram_get_seq (which may invalidate it).
3440  *
3441  * Returns record pointer on success (do not free)
3442  *        NULL on failure
3443  */
cram_get_seq(cram_fd * fd)3444 cram_record *cram_get_seq(cram_fd *fd) {
3445     cram_container *c;
3446     cram_slice *s;
3447 
3448     for (;;) {
3449 	c = fd->ctr;
3450 	if (c && c->slice && c->slice->curr_rec < c->slice->max_rec) {
3451 	    s = c->slice;
3452 	} else {
3453 	    if (!(s = cram_next_slice(fd, &c)))
3454 		return NULL;
3455 	    continue; /* In case slice contains no records */
3456 	}
3457 
3458 	if (fd->range.refid != -2) {
3459 	    if (fd->range.refid == -1 && s->crecs[s->curr_rec].ref_id != -1) {
3460 		// Special case when looking for unmapped blocks at end.
3461 		// If these are mixed in with mapped data (c->ref_id == -2)
3462 		// then we need skip until we find the unmapped data, if at all
3463 		s->curr_rec++;
3464 		continue;
3465 	    }
3466 	    if (s->crecs[s->curr_rec].ref_id < fd->range.refid &&
3467 		s->crecs[s->curr_rec].ref_id != -1) {
3468 		// Looking for a mapped read, but not there yet.  Special case
3469 		// as -1 (unmapped) shouldn't be considered < refid.
3470 		s->curr_rec++;
3471 		continue;
3472 	    }
3473 
3474 	    if (s->crecs[s->curr_rec].ref_id != fd->range.refid) {
3475 		fd->eof = 1;
3476 		cram_free_slice(s);
3477 		c->slice = NULL;
3478 		return NULL;
3479 	    }
3480 
3481 	    if (fd->range.refid != -1 && s->crecs[s->curr_rec].apos > fd->range.end) {
3482 		fd->eof = 1;
3483 		cram_free_slice(s);
3484 		c->slice = NULL;
3485 		return NULL;
3486 	    }
3487 
3488 	    if (fd->range.refid != -1 && s->crecs[s->curr_rec].aend < fd->range.start) {
3489 		s->curr_rec++;
3490 		continue;
3491 	    }
3492 	}
3493 
3494 	break;
3495     }
3496 
3497     fd->ctr = c;
3498     c->slice = s;
3499     return &s->crecs[s->curr_rec++];
3500 }
3501 
3502 /*
3503  * Read the next cram record and convert it to a bam_seq_t struct.
3504  *
3505  * Returns 0 on success
3506  *        -1 on EOF or failure (check fd->err)
3507  */
cram_get_bam_seq(cram_fd * fd,bam_seq_t ** bam)3508 int cram_get_bam_seq(cram_fd *fd, bam_seq_t **bam) {
3509     cram_record *cr;
3510     cram_container *c;
3511     cram_slice *s;
3512 
3513     if (!(cr = cram_get_seq(fd))) {
3514 	//*bam=0;
3515 	return -1;
3516     }
3517 
3518     c = fd->ctr;
3519     s = c->slice;
3520 
3521     if (s->bl) {
3522 	//*bam = s->bl[s->curr_rec-1]; return 0;
3523 
3524 	// Ideally we'd just do: *bam = s->bl[s->curr_rec-1];
3525 	// That works, but it changes the API as the bam object is
3526 	// no longer a malloced block of memory and cannot be
3527 	// freed by the caller.  (Possibly we can do *bam=0
3528 	// in the case where cram_get_seq hits EOF, but this is
3529 	// also assuming that the *bam object was ours and not a
3530 	// result of, say, a merge with a bam file.)
3531 	//
3532 	// Hence instead we laboriously manage the memory and do a
3533 	// memcpy each time.  (This is around an extra 40% time taken
3534 	// in main to decode a CRAM file, harming parallel execution.)
3535 	int sz = s->bl[s->curr_rec-1]->alloc;
3536 	if (!*bam) {
3537 	    if (!(*bam = malloc(sz)))
3538 		return -1;
3539 	    (*bam)->alloc = sz;
3540 	} else if ((*bam)->alloc < sz) {
3541 	    if (!(*bam = realloc(*bam, sz)))
3542 		return -1;
3543 	    (*bam)->alloc = sz;
3544 	}
3545 	memcpy(*bam, s->bl[s->curr_rec-1], sz);
3546 	return 0;
3547     }
3548 
3549     return cram_to_bam(fd->header, fd, s, cr, s->curr_rec-1, bam) >= 0 ? 0 : -1;
3550 }
3551