1 /*
2 Copyright (c) 2012-2014 Genome Research Ltd.
3 Author: James Bonfield <jkb@sanger.ac.uk>
4 
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7 
8    1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
10 
11    2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
14 
15    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
18 
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30 
31 /*
32  * - In-memory decoding of CRAM data structures.
33  * - Iterator for reading CRAM record by record.
34  */
35 
36 #include <config.h>
37 
38 #include <stdio.h>
39 #include <errno.h>
40 #include <assert.h>
41 #include <stdlib.h>
42 #include <string.h>
43 #include <zlib.h>
44 #include <sys/types.h>
45 #include <sys/stat.h>
46 #include <math.h>
47 #include <stdint.h>
48 
49 #include "cram/cram.h"
50 #include "cram/os.h"
51 #include "htslib/hts.h"
52 
53 //Whether CIGAR has just M or uses = and X to indicate match and mismatch
54 //#define USE_X
55 
56 /* ----------------------------------------------------------------------
57  * CRAM compression headers
58  */
59 
60 /*
61  * Decodes the Tag Dictionary record in the preservation map
62  * Updates the cram compression header.
63  *
64  * Returns number of bytes decoded on success
65  *        -1 on failure
66  */
cram_decode_TD(char * cp,const char * endp,cram_block_compression_hdr * h)67 int cram_decode_TD(char *cp, const char *endp, cram_block_compression_hdr *h) {
68     char *op = cp;
69     unsigned char *dat;
70     cram_block *b;
71     int32_t blk_size = 0;
72     int nTL, i, sz;
73 
74     if (!(b = cram_new_block(0, 0)))
75 	return -1;
76 
77     /* Decode */
78     cp += safe_itf8_get(cp, endp, &blk_size);
79     if (!blk_size) {
80 	h->nTL = 0;
81 	h->TL = NULL;
82 	cram_free_block(b);
83 	return cp - op;
84     }
85 
86     if (blk_size < 0 || endp - cp < blk_size) {
87         cram_free_block(b);
88 	return -1;
89     }
90 
91     BLOCK_APPEND(b, cp, blk_size);
92     cp += blk_size;
93     sz = cp - op;
94     // Force nul termination if missing
95     if (BLOCK_DATA(b)[BLOCK_SIZE(b)-1])
96 	BLOCK_APPEND_CHAR(b, '\0');
97 
98     /* Set up TL lookup table */
99     dat = BLOCK_DATA(b);
100 
101     // Count
102     for (nTL = i = 0; i < BLOCK_SIZE(b); i++) {
103 	nTL++;
104 	while (dat[i])
105 	    i++;
106     }
107 
108     // Copy
109     h->nTL = nTL;
110     if (!(h->TL = calloc(h->nTL, sizeof(unsigned char *)))) {
111         cram_free_block(b);
112         return -1;
113     }
114     for (nTL = i = 0; i < BLOCK_SIZE(b); i++) {
115 	h->TL[nTL++] = &dat[i];
116 	while (dat[i])
117 	    i++;
118     }
119     h->TD_blk = b;
120 
121     return sz;
122 }
123 
124 /*
125  * Decodes a CRAM block compression header.
126  * Returns header ptr on success
127  *         NULL on failure
128  */
cram_decode_compression_header(cram_fd * fd,cram_block * b)129 cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd,
130 							   cram_block *b) {
131     char *cp, *endp, *cp_copy;
132     cram_block_compression_hdr *hdr = calloc(1, sizeof(*hdr));
133     int i;
134     int32_t map_size = 0, map_count = 0;
135 
136     if (!hdr)
137 	return NULL;
138 
139     if (b->method != RAW) {
140 	if (cram_uncompress_block(b)) {
141 	    free(hdr);
142 	    return NULL;
143 	}
144     }
145 
146     cp = (char *)b->data;
147     endp = cp + b->uncomp_size;
148 
149     if (CRAM_MAJOR_VERS(fd->version) == 1) {
150 	cp += safe_itf8_get(cp, endp, &hdr->ref_seq_id);
151 	cp += safe_itf8_get(cp, endp, &hdr->ref_seq_start);
152 	cp += safe_itf8_get(cp, endp, &hdr->ref_seq_span);
153 	cp += safe_itf8_get(cp, endp, &hdr->num_records);
154 	cp += safe_itf8_get(cp, endp, &hdr->num_landmarks);
155         if ((hdr->num_landmarks < 0 ||
156              hdr->num_landmarks >= SIZE_MAX / sizeof(int32_t))) {
157             free(hdr);
158 	    return NULL;
159         }
160 	if (!(hdr->landmark = malloc(hdr->num_landmarks * sizeof(int32_t)))) {
161 	    free(hdr);
162 	    return NULL;
163 	}
164 	for (i = 0; i < hdr->num_landmarks; i++) {
165 	    cp += safe_itf8_get(cp, endp, &hdr->landmark[i]);
166 	}
167     }
168 
169     hdr->preservation_map = kh_init(map);
170 
171     memset(hdr->rec_encoding_map, 0,
172 	   CRAM_MAP_HASH * sizeof(hdr->rec_encoding_map[0]));
173     memset(hdr->tag_encoding_map, 0,
174 	   CRAM_MAP_HASH * sizeof(hdr->tag_encoding_map[0]));
175 
176     if (!hdr->preservation_map) {
177 	cram_free_compression_header(hdr);
178 	return NULL;
179     }
180 
181     /* Initialise defaults for preservation map */
182     hdr->mapped_qs_included = 0;
183     hdr->unmapped_qs_included = 0;
184     hdr->unmapped_placed = 0;
185     hdr->qs_included = 0;
186     hdr->read_names_included = 0;
187     hdr->AP_delta = 1;
188     memcpy(hdr->substitution_matrix, "CGTNAGTNACTNACGNACGT", 20);
189 
190     /* Preservation map */
191     cp += safe_itf8_get(cp, endp, &map_size); cp_copy = cp;
192     cp += safe_itf8_get(cp, endp, &map_count);
193     for (i = 0; i < map_count; i++) {
194 	pmap_t hd;
195 	khint_t k;
196 	int r;
197 
198 	if (endp - cp < 3) {
199 	    cram_free_compression_header(hdr);
200 	    return NULL;
201 	}
202 	cp += 2;
203 	switch(CRAM_KEY(cp[-2],cp[-1])) {
204 	case CRAM_KEY('M','I'):
205 	    hd.i = *cp++;
206 	    k = kh_put(map, hdr->preservation_map, "MI", &r);
207 	    if (-1 == r) {
208 		cram_free_compression_header(hdr);
209                 return NULL;
210             }
211 
212 	    kh_val(hdr->preservation_map, k) = hd;
213 	    hdr->mapped_qs_included = hd.i;
214 	    break;
215 
216 	case CRAM_KEY('U','I'):
217 	    hd.i = *cp++;
218 	    k = kh_put(map, hdr->preservation_map, "UI", &r);
219 	    if (-1 == r) {
220 		cram_free_compression_header(hdr);
221                 return NULL;
222             }
223 
224 	    kh_val(hdr->preservation_map, k) = hd;
225 	    hdr->unmapped_qs_included = hd.i;
226 	    break;
227 
228 	case CRAM_KEY('P','I'):
229 	    hd.i = *cp++;
230 	    k = kh_put(map, hdr->preservation_map, "PI", &r);
231 	    if (-1 == r) {
232 		cram_free_compression_header(hdr);
233                 return NULL;
234             }
235 
236 	    kh_val(hdr->preservation_map, k) = hd;
237 	    hdr->unmapped_placed = hd.i;
238 	    break;
239 
240 	case CRAM_KEY('R','N'):
241 	    hd.i = *cp++;
242 	    k = kh_put(map, hdr->preservation_map, "RN", &r);
243 	    if (-1 == r) {
244 		cram_free_compression_header(hdr);
245                 return NULL;
246             }
247 
248 	    kh_val(hdr->preservation_map, k) = hd;
249 	    hdr->read_names_included = hd.i;
250 	    break;
251 
252 	case CRAM_KEY('A','P'):
253 	    hd.i = *cp++;
254 	    k = kh_put(map, hdr->preservation_map, "AP", &r);
255 	    if (-1 == r) {
256 		cram_free_compression_header(hdr);
257                 return NULL;
258             }
259 
260 	    kh_val(hdr->preservation_map, k) = hd;
261 	    hdr->AP_delta = hd.i;
262 	    break;
263 
264 	case CRAM_KEY('R','R'):
265 	    hd.i = *cp++;
266 	    k = kh_put(map, hdr->preservation_map, "RR", &r);
267 	    if (-1 == r) {
268 		cram_free_compression_header(hdr);
269                 return NULL;
270             }
271 
272 	    kh_val(hdr->preservation_map, k) = hd;
273 	    fd->no_ref = !hd.i;
274 	    break;
275 
276 	case CRAM_KEY('S','M'):
277 	    if (endp - cp < 5) {
278 	        cram_free_compression_header(hdr);
279 		return NULL;
280 	    }
281 	    hdr->substitution_matrix[0][(cp[0]>>6)&3] = 'C';
282 	    hdr->substitution_matrix[0][(cp[0]>>4)&3] = 'G';
283 	    hdr->substitution_matrix[0][(cp[0]>>2)&3] = 'T';
284 	    hdr->substitution_matrix[0][(cp[0]>>0)&3] = 'N';
285 
286 	    hdr->substitution_matrix[1][(cp[1]>>6)&3] = 'A';
287 	    hdr->substitution_matrix[1][(cp[1]>>4)&3] = 'G';
288 	    hdr->substitution_matrix[1][(cp[1]>>2)&3] = 'T';
289 	    hdr->substitution_matrix[1][(cp[1]>>0)&3] = 'N';
290 
291 	    hdr->substitution_matrix[2][(cp[2]>>6)&3] = 'A';
292 	    hdr->substitution_matrix[2][(cp[2]>>4)&3] = 'C';
293 	    hdr->substitution_matrix[2][(cp[2]>>2)&3] = 'T';
294 	    hdr->substitution_matrix[2][(cp[2]>>0)&3] = 'N';
295 
296 	    hdr->substitution_matrix[3][(cp[3]>>6)&3] = 'A';
297 	    hdr->substitution_matrix[3][(cp[3]>>4)&3] = 'C';
298 	    hdr->substitution_matrix[3][(cp[3]>>2)&3] = 'G';
299 	    hdr->substitution_matrix[3][(cp[3]>>0)&3] = 'N';
300 
301 	    hdr->substitution_matrix[4][(cp[4]>>6)&3] = 'A';
302 	    hdr->substitution_matrix[4][(cp[4]>>4)&3] = 'C';
303 	    hdr->substitution_matrix[4][(cp[4]>>2)&3] = 'G';
304 	    hdr->substitution_matrix[4][(cp[4]>>0)&3] = 'T';
305 
306 	    hd.p = cp;
307 	    cp += 5;
308 
309 	    k = kh_put(map, hdr->preservation_map, "SM", &r);
310 	    if (-1 == r) {
311 		cram_free_compression_header(hdr);
312 		return NULL;
313 	    }
314 	    kh_val(hdr->preservation_map, k) = hd;
315 	    break;
316 
317 	case CRAM_KEY('T','D'): {
318 	    int sz = cram_decode_TD(cp, endp, hdr); // tag dictionary
319 	    if (sz < 0) {
320 		cram_free_compression_header(hdr);
321 		return NULL;
322 	    }
323 
324 	    hd.p = cp;
325 	    cp += sz;
326 
327 	    k = kh_put(map, hdr->preservation_map, "TD", &r);
328 	    if (-1 == r) {
329 		cram_free_compression_header(hdr);
330 		return NULL;
331 	    }
332 	    kh_val(hdr->preservation_map, k) = hd;
333 	    break;
334 	}
335 
336 	default:
337 	    fprintf(stderr, "Unrecognised preservation map key %c%c\n",
338 		    cp[-2], cp[-1]);
339 	    // guess byte;
340 	    cp++;
341 	    break;
342 	}
343     }
344     if (cp - cp_copy != map_size) {
345 	cram_free_compression_header(hdr);
346 	return NULL;
347     }
348 
349     /* Record encoding map */
350     cp += safe_itf8_get(cp, endp, &map_size); cp_copy = cp;
351     cp += safe_itf8_get(cp, endp, &map_count);
352     for (i = 0; i < map_count; i++) {
353 	char *key = cp;
354 	int32_t encoding = E_NULL;
355 	int32_t size = 0;
356 	cram_map *m = malloc(sizeof(*m)); // FIXME: use pooled_alloc
357 
358 	if (!m || endp - cp < 4) {
359 	    free(m);
360 	    cram_free_compression_header(hdr);
361 	    return NULL;
362 	}
363 
364 	cp += 2;
365 	cp += safe_itf8_get(cp, endp, &encoding);
366 	cp += safe_itf8_get(cp, endp, &size);
367 
368 	// Fill out cram_map purely for cram_dump to dump out.
369 	m->key = (key[0]<<8)|key[1];
370 	m->encoding = encoding;
371 	m->size     = size;
372 	m->offset   = cp - (char *)b->data;
373 	m->codec = NULL;
374 
375 	if (m->encoding == E_NULL)
376 	    continue;
377 
378 	if (size < 0 || endp - cp < size) {
379 	    free(m);
380 	    cram_free_compression_header(hdr);
381 	    return NULL;
382 	}
383 
384 	//printf("%s codes for %.2s\n", cram_encoding2str(encoding), key);
385 
386 	/*
387 	 * For CRAM1.0 CF and BF are Byte and not Int.
388 	 * Practically speaking it makes no difference unless we have a
389 	 * 1.0 format file that stores these in EXTERNAL as only then
390 	 * does Byte vs Int matter.
391 	 *
392 	 * Neither this C code nor Java reference implementations did this,
393 	 * so we gloss over it and treat them as int.
394 	 */
395 
396 	if (key[0] == 'B' && key[1] == 'F') {
397 	    if (!(hdr->codecs[DS_BF] = cram_decoder_init(encoding, cp, size,
398 							 E_INT,
399 							 fd->version))) {
400 		cram_free_compression_header(hdr);
401 		return NULL;
402 	    }
403 	} else if (key[0] == 'C' && key[1] == 'F') {
404 	    if (!(hdr->codecs[DS_CF] = cram_decoder_init(encoding, cp, size,
405 							 E_INT,
406 							 fd->version))) {
407 		cram_free_compression_header(hdr);
408 		return NULL;
409 	    }
410 	} else if (key[0] == 'R' && key[1] == 'I') {
411 	    if (!(hdr->codecs[DS_RI] = cram_decoder_init(encoding, cp, size,
412 							 E_INT,
413 							 fd->version))) {
414 		cram_free_compression_header(hdr);
415 		return NULL;
416 	    }
417 	} else if (key[0] == 'R' && key[1] == 'L') {
418 	    if (!(hdr->codecs[DS_RL] = cram_decoder_init(encoding, cp, size,
419 							 E_INT,
420 							 fd->version))) {
421 		cram_free_compression_header(hdr);
422 		return NULL;
423 	    }
424 	} else if (key[0] == 'A' && key[1] == 'P') {
425 	    if (!(hdr->codecs[DS_AP] = cram_decoder_init(encoding, cp, size,
426 							 E_INT,
427 							 fd->version))) {
428 		cram_free_compression_header(hdr);
429 		return NULL;
430 	    }
431 	} else if (key[0] == 'R' && key[1] == 'G') {
432 	    if (!(hdr->codecs[DS_RG] = cram_decoder_init(encoding, cp, size,
433 							 E_INT,
434 							 fd->version))) {
435 		cram_free_compression_header(hdr);
436 		return NULL;
437 	    }
438 	} else if (key[0] == 'M' && key[1] == 'F') {
439 	    if (!(hdr->codecs[DS_MF] = cram_decoder_init(encoding, cp, size,
440 							 E_INT,
441 							 fd->version))) {
442 		cram_free_compression_header(hdr);
443 		return NULL;
444 	    }
445 	} else if (key[0] == 'N' && key[1] == 'S') {
446 	    if (!(hdr->codecs[DS_NS] = cram_decoder_init(encoding, cp, size,
447 							 E_INT,
448 							 fd->version))) {
449 		cram_free_compression_header(hdr);
450 		return NULL;
451 	    }
452 	} else if (key[0] == 'N' && key[1] == 'P') {
453 	    if (!(hdr->codecs[DS_NP] = cram_decoder_init(encoding, cp, size,
454 							 E_INT,
455 							 fd->version))) {
456 		cram_free_compression_header(hdr);
457 		return NULL;
458 	    }
459 	} else if (key[0] == 'T' && key[1] == 'S') {
460 	    if (!(hdr->codecs[DS_TS] = cram_decoder_init(encoding, cp, size,
461 							 E_INT,
462 							 fd->version))) {
463 		cram_free_compression_header(hdr);
464 		return NULL;
465 	    }
466 	} else if (key[0] == 'N' && key[1] == 'F') {
467 	    if (!(hdr->codecs[DS_NF] = cram_decoder_init(encoding, cp, size,
468 							 E_INT,
469 							 fd->version))) {
470 		cram_free_compression_header(hdr);
471 		return NULL;
472 	    }
473 	} else if (key[0] == 'T' && key[1] == 'C') {
474 	    if (!(hdr->codecs[DS_TC] = cram_decoder_init(encoding, cp, size,
475 							 E_BYTE,
476 							 fd->version))) {
477 		cram_free_compression_header(hdr);
478 		return NULL;
479 	    }
480 	} else if (key[0] == 'T' && key[1] == 'N') {
481 	    if (!(hdr->codecs[DS_TN] = cram_decoder_init(encoding, cp, size,
482 							 E_INT,
483 							 fd->version))) {
484 		cram_free_compression_header(hdr);
485 		return NULL;
486 	    }
487 	} else if (key[0] == 'F' && key[1] == 'N') {
488 	    if (!(hdr->codecs[DS_FN] = cram_decoder_init(encoding, cp, size,
489 							 E_INT,
490 							 fd->version))) {
491 		cram_free_compression_header(hdr);
492 		return NULL;
493 	    }
494 	} else if (key[0] == 'F' && key[1] == 'C') {
495 	    if (!(hdr->codecs[DS_FC] = cram_decoder_init(encoding, cp, size,
496 							 E_BYTE,
497 							 fd->version))) {
498 		cram_free_compression_header(hdr);
499 		return NULL;
500 	    }
501 	} else if (key[0] == 'F' && key[1] == 'P') {
502 	    if (!(hdr->codecs[DS_FP] = cram_decoder_init(encoding, cp, size,
503 							 E_INT,
504 							 fd->version))) {
505 		cram_free_compression_header(hdr);
506 		return NULL;
507 	    }
508 	} else if (key[0] == 'B' && key[1] == 'S') {
509 	    if (!(hdr->codecs[DS_BS] = cram_decoder_init(encoding, cp, size,
510 							 E_BYTE,
511 							 fd->version))) {
512 		cram_free_compression_header(hdr);
513 		return NULL;
514 	    }
515 	} else if (key[0] == 'I' && key[1] == 'N') {
516 	    if (!(hdr->codecs[DS_IN] = cram_decoder_init(encoding, cp, size,
517 							 E_BYTE_ARRAY,
518 							 fd->version))) {
519 		cram_free_compression_header(hdr);
520 		return NULL;
521 	    }
522 	} else if (key[0] == 'S' && key[1] == 'C') {
523 	    if (!(hdr->codecs[DS_SC] = cram_decoder_init(encoding, cp, size,
524 							 E_BYTE_ARRAY,
525 							 fd->version))) {
526 		cram_free_compression_header(hdr);
527 		return NULL;
528 	    }
529 	} else if (key[0] == 'D' && key[1] == 'L') {
530 	    if (!(hdr->codecs[DS_DL] = cram_decoder_init(encoding, cp, size,
531 							 E_INT,
532 							 fd->version))) {
533 		cram_free_compression_header(hdr);
534 		return NULL;
535 	    }
536 	} else if (key[0] == 'B' && key[1] == 'A') {
537 	    if (!(hdr->codecs[DS_BA] = cram_decoder_init(encoding, cp, size,
538 							 E_BYTE,
539 							 fd->version))) {
540 		cram_free_compression_header(hdr);
541 		return NULL;
542 	    }
543 	} else if (key[0] == 'B' && key[1] == 'B') {
544 	    if (!(hdr->codecs[DS_BB] = cram_decoder_init(encoding, cp, size,
545 							 E_BYTE_ARRAY,
546 							 fd->version))) {
547 		cram_free_compression_header(hdr);
548 		return NULL;
549 	    }
550 	} else if (key[0] == 'R' && key[1] == 'S') {
551 	    if (!(hdr->codecs[DS_RS] = cram_decoder_init(encoding, cp, size,
552 							 E_INT,
553 							 fd->version))) {
554 		cram_free_compression_header(hdr);
555 		return NULL;
556 	    }
557 	} else if (key[0] == 'P' && key[1] == 'D') {
558 	    if (!(hdr->codecs[DS_PD] = cram_decoder_init(encoding, cp, size,
559 							 E_INT,
560 							 fd->version))) {
561 		cram_free_compression_header(hdr);
562 		return NULL;
563 	    }
564 	} else if (key[0] == 'H' && key[1] == 'C') {
565 	    if (!(hdr->codecs[DS_HC] = cram_decoder_init(encoding, cp, size,
566 							 E_INT,
567 							 fd->version))) {
568 		cram_free_compression_header(hdr);
569 		return NULL;
570 	    }
571 	} else if (key[0] == 'M' && key[1] == 'Q') {
572 	    if (!(hdr->codecs[DS_MQ] = cram_decoder_init(encoding, cp, size,
573 							 E_INT,
574 							 fd->version))) {
575 		cram_free_compression_header(hdr);
576 		return NULL;
577 	    }
578 	} else if (key[0] == 'R' && key[1] == 'N') {
579 	    if (!(hdr->codecs[DS_RN] = cram_decoder_init(encoding, cp, size,
580 							 E_BYTE_ARRAY_BLOCK,
581 							 fd->version))) {
582 		cram_free_compression_header(hdr);
583 		return NULL;
584 	    }
585 	} else if (key[0] == 'Q' && key[1] == 'S') {
586 	    if (!(hdr->codecs[DS_QS] = cram_decoder_init(encoding, cp, size,
587 							 E_BYTE,
588 							 fd->version))) {
589 		cram_free_compression_header(hdr);
590 		return NULL;
591 	    }
592 	} else if (key[0] == 'Q' && key[1] == 'Q') {
593 	    if (!(hdr->codecs[DS_QQ] = cram_decoder_init(encoding, cp, size,
594 							 E_BYTE_ARRAY,
595 							 fd->version))) {
596 		cram_free_compression_header(hdr);
597 		return NULL;
598 	    }
599 	} else if (key[0] == 'T' && key[1] == 'L') {
600 	    if (!(hdr->codecs[DS_TL] = cram_decoder_init(encoding, cp, size,
601 							 E_INT,
602 							 fd->version))) {
603 		cram_free_compression_header(hdr);
604 		return NULL;
605 	    }
606 	} else if (key[0] == 'T' && key[1] == 'M') {
607 	} else if (key[0] == 'T' && key[1] == 'V') {
608 	} else
609 	    fprintf(stderr, "Unrecognised key: %.2s\n", key);
610 
611 	cp += size;
612 
613 	m->next = hdr->rec_encoding_map[CRAM_MAP(key[0], key[1])];
614 	hdr->rec_encoding_map[CRAM_MAP(key[0], key[1])] = m;
615     }
616     if (cp - cp_copy != map_size) {
617 	cram_free_compression_header(hdr);
618 	return NULL;
619     }
620 
621     /* Tag encoding map */
622     cp += safe_itf8_get(cp, endp, &map_size); cp_copy = cp;
623     cp += safe_itf8_get(cp, endp, &map_count);
624     for (i = 0; i < map_count; i++) {
625 	int32_t encoding = E_NULL;
626 	int32_t size = 0;
627 	cram_map *m = malloc(sizeof(*m)); // FIXME: use pooled_alloc
628 	char *key;
629 
630 	if (!m || endp - cp < 6) {
631 	    free(m);
632 	    cram_free_compression_header(hdr);
633 	    return NULL;
634 	}
635 
636 	key = cp + 1;
637 	m->key = (key[0]<<16)|(key[1]<<8)|key[2];
638 
639 	cp += 4; // Strictly ITF8, but this suffices
640 	cp += safe_itf8_get(cp, endp, &encoding);
641 	cp += safe_itf8_get(cp, endp, &size);
642 
643 	m->encoding = encoding;
644 	m->size     = size;
645 	m->offset   = cp - (char *)b->data;
646 	if (size < 0 || endp - cp < size ||
647 	    !(m->codec = cram_decoder_init(encoding, cp, size,
648 					   E_BYTE_ARRAY_BLOCK, fd->version))) {
649 	    cram_free_compression_header(hdr);
650 	    free(m);
651 	    return NULL;
652 	}
653 
654 	cp += size;
655 
656 	m->next = hdr->tag_encoding_map[CRAM_MAP(key[0],key[1])];
657 	hdr->tag_encoding_map[CRAM_MAP(key[0],key[1])] = m;
658     }
659     if (cp - cp_copy != map_size) {
660 	cram_free_compression_header(hdr);
661 	return NULL;
662     }
663 
664     return hdr;
665 }
666 
667 /*
668  * Note we also need to scan through the record encoding map to
669  * see which data series share the same block, either external or
670  * CORE. For example if we need the BF data series but MQ and CF
671  * are also encoded in the same block then we need to add those in
672  * as a dependency in order to correctly decode BF.
673  *
674  * Returns 0 on success
675  *        -1 on failure
676  */
cram_dependent_data_series(cram_fd * fd,cram_block_compression_hdr * hdr,cram_slice * s)677 int cram_dependent_data_series(cram_fd *fd,
678 			       cram_block_compression_hdr *hdr,
679 			       cram_slice *s) {
680     int *block_used;
681     int core_used = 0;
682     int i;
683     static int i_to_id[] = {
684 	DS_BF, DS_AP, DS_FP, DS_RL, DS_DL, DS_NF, DS_BA, DS_QS,
685 	DS_FC, DS_FN, DS_BS, DS_IN, DS_RG, DS_MQ, DS_TL, DS_RN,
686 	DS_NS, DS_NP, DS_TS, DS_MF, DS_CF, DS_RI, DS_RS, DS_PD,
687 	DS_HC, DS_SC, DS_BB, DS_QQ,
688     };
689     uint32_t orig_ds;
690 
691     /*
692      * Set the data_series bit field based on fd->required_fields
693      * contents.
694      */
695     if (fd->required_fields && fd->required_fields != INT_MAX) {
696 	hdr->data_series = 0;
697 
698 	if (fd->required_fields & SAM_QNAME)
699 	    hdr->data_series |= CRAM_RN;
700 
701 	if (fd->required_fields & SAM_FLAG)
702 	    hdr->data_series |= CRAM_BF;
703 
704 	if (fd->required_fields & SAM_RNAME)
705 	    hdr->data_series |= CRAM_RI | CRAM_BF;
706 
707 	if (fd->required_fields & SAM_POS)
708 	    hdr->data_series |= CRAM_AP | CRAM_BF;
709 
710 	if (fd->required_fields & SAM_MAPQ)
711 	    hdr->data_series |= CRAM_MQ;
712 
713 	if (fd->required_fields & SAM_CIGAR)
714 	    hdr->data_series |= CRAM_CIGAR;
715 
716 	if (fd->required_fields & SAM_RNEXT)
717 	    hdr->data_series |= CRAM_CF | CRAM_NF | CRAM_RI | CRAM_NS |CRAM_BF;
718 
719 	if (fd->required_fields & SAM_PNEXT)
720 	    hdr->data_series |= CRAM_CF | CRAM_NF | CRAM_AP | CRAM_NP | CRAM_BF;
721 
722 	if (fd->required_fields & SAM_TLEN)
723 	    hdr->data_series |= CRAM_CF | CRAM_NF | CRAM_AP | CRAM_TS |
724 		CRAM_BF | CRAM_MF | CRAM_RI | CRAM_CIGAR;
725 
726 	if (fd->required_fields & SAM_SEQ)
727 	    hdr->data_series |= CRAM_SEQ;
728 
729 	if (!(fd->required_fields & SAM_AUX))
730 	    // No easy way to get MD/NM without other tags at present
731 	    fd->decode_md = 0;
732 
733 	if (fd->required_fields & SAM_QUAL)
734 	    hdr->data_series |= CRAM_QUAL;
735 
736 	if (fd->required_fields & SAM_AUX)
737 	    hdr->data_series |= CRAM_RG | CRAM_TL | CRAM_aux;
738 
739 	if (fd->required_fields & SAM_RGAUX)
740 	    hdr->data_series |= CRAM_RG | CRAM_BF;
741 
742 	// Always uncompress CORE block
743 	if (cram_uncompress_block(s->block[0]))
744 	    return -1;
745     } else {
746 	hdr->data_series = CRAM_ALL;
747 
748 	for (i = 0; i < s->hdr->num_blocks; i++) {
749 	    if (cram_uncompress_block(s->block[i]))
750 		return -1;
751 	}
752 
753 	return 0;
754     }
755 
756     block_used = calloc(s->hdr->num_blocks+1, sizeof(int));
757     if (!block_used)
758 	return -1;
759 
760     do {
761 	/*
762 	 * Also set data_series based on code prerequisites. Eg if we need
763 	 * CRAM_QS then we also need to know CRAM_RL so we know how long it
764 	 * is, or if we need FC/FP then we also need FN (number of features).
765 	 *
766 	 * It's not reciprocal though. We may be needing to decode FN
767 	 * but have no need to decode FC, FP and cigar ops.
768 	 */
769 	if (hdr->data_series & CRAM_RS)    hdr->data_series |= CRAM_FC|CRAM_FP;
770 	if (hdr->data_series & CRAM_PD)    hdr->data_series |= CRAM_FC|CRAM_FP;
771 	if (hdr->data_series & CRAM_HC)    hdr->data_series |= CRAM_FC|CRAM_FP;
772 	if (hdr->data_series & CRAM_QS)    hdr->data_series |= CRAM_FC|CRAM_FP;
773 	if (hdr->data_series & CRAM_IN)    hdr->data_series |= CRAM_FC|CRAM_FP;
774 	if (hdr->data_series & CRAM_SC)    hdr->data_series |= CRAM_FC|CRAM_FP;
775 	if (hdr->data_series & CRAM_BS)    hdr->data_series |= CRAM_FC|CRAM_FP;
776 	if (hdr->data_series & CRAM_DL)    hdr->data_series |= CRAM_FC|CRAM_FP;
777 	if (hdr->data_series & CRAM_BA)    hdr->data_series |= CRAM_FC|CRAM_FP;
778 	if (hdr->data_series & CRAM_BB)    hdr->data_series |= CRAM_FC|CRAM_FP;
779 	if (hdr->data_series & CRAM_QQ)    hdr->data_series |= CRAM_FC|CRAM_FP;
780 
781 	// cram_decode_seq() needs seq[] array
782 	if (hdr->data_series & (CRAM_SEQ|CRAM_CIGAR)) hdr->data_series |= CRAM_RL;
783 
784 	if (hdr->data_series & CRAM_FP)    hdr->data_series |= CRAM_FC;
785 	if (hdr->data_series & CRAM_FC)    hdr->data_series |= CRAM_FN;
786 	if (hdr->data_series & CRAM_aux)   hdr->data_series |= CRAM_TL;
787 	if (hdr->data_series & CRAM_MF)    hdr->data_series |= CRAM_CF;
788 	if (hdr->data_series & CRAM_MQ)    hdr->data_series |= CRAM_BF;
789 	if (hdr->data_series & CRAM_BS)    hdr->data_series |= CRAM_RI;
790 	if (hdr->data_series & (CRAM_MF |CRAM_NS |CRAM_NP |CRAM_TS |CRAM_NF))
791 	    hdr->data_series |= CRAM_CF;
792 	if (!hdr->read_names_included && hdr->data_series & CRAM_RN)
793 	    hdr->data_series |= CRAM_CF | CRAM_NF;
794 	if (hdr->data_series & (CRAM_BA | CRAM_QS | CRAM_BB | CRAM_QQ))
795 	    hdr->data_series |= CRAM_BF | CRAM_CF | CRAM_RL;
796 
797 	orig_ds = hdr->data_series;
798 
799 	// Find which blocks are in use.
800 	for (i = 0; i < sizeof(i_to_id)/sizeof(*i_to_id); i++) {
801 	    int bnum1, bnum2, j;
802 	    cram_codec *c = hdr->codecs[i_to_id[i]];
803 
804 	    if (!(hdr->data_series & (1<<i)))
805 		continue;
806 
807 	    if (!c)
808 		continue;
809 
810 	    bnum1 = cram_codec_to_id(c, &bnum2);
811 
812 	    for (;;) {
813 		switch (bnum1) {
814 		case -2:
815 		    break;
816 
817 		case -1:
818 		    core_used = 1;
819 		    break;
820 
821 		default:
822 		    for (j = 0; j < s->hdr->num_blocks; j++) {
823 			if (s->block[j]->content_type == EXTERNAL &&
824 			    s->block[j]->content_id == bnum1) {
825 			    block_used[j] = 1;
826 			    if (cram_uncompress_block(s->block[j])) {
827 				free(block_used);
828 				return -1;
829 			    }
830 			}
831 		    }
832 		    break;
833 		}
834 
835 		if (bnum2 == -2 || bnum1 == bnum2)
836 		    break;
837 
838 		bnum1 = bnum2; // 2nd pass
839 	    }
840 	}
841 
842 	// Tags too
843 	if ((fd->required_fields & SAM_AUX) ||
844 	    (hdr->data_series & CRAM_aux)) {
845 	    for (i = 0; i < CRAM_MAP_HASH; i++) {
846 		int bnum1, bnum2, j;
847 		cram_map *m = hdr->tag_encoding_map[i];
848 
849 		while (m) {
850 		    cram_codec *c = m->codec;
851 		    if (!c) {
852 			m = m->next;
853 			continue;
854 		    }
855 
856 		    bnum1 = cram_codec_to_id(c, &bnum2);
857 
858 		    for (;;) {
859 			switch (bnum1) {
860 			case -2:
861 			    break;
862 
863 			case -1:
864 			    core_used = 1;
865 			    break;
866 
867 			default:
868 			    for (j = 0; j < s->hdr->num_blocks; j++) {
869 				if (s->block[j]->content_type == EXTERNAL &&
870 				    s->block[j]->content_id == bnum1) {
871 				    block_used[j] = 1;
872 				    if (cram_uncompress_block(s->block[j])) {
873 					free(block_used);
874 					return -1;
875 				    }
876 				}
877 			    }
878 			    break;
879 			}
880 
881 			if (bnum2 == -2 || bnum1 == bnum2)
882 			    break;
883 
884 			bnum1 = bnum2; // 2nd pass
885 		    }
886 
887 		    m = m->next;
888 		}
889 	    }
890 	}
891 
892 	// We now know which blocks are in used, so repeat and find
893 	// which other data series need to be added.
894 	for (i = 0; i < sizeof(i_to_id)/sizeof(*i_to_id); i++) {
895 	    int bnum1, bnum2, j;
896 	    cram_codec *c = hdr->codecs[i_to_id[i]];
897 
898 	    if (!c)
899 		continue;
900 
901 	    bnum1 = cram_codec_to_id(c, &bnum2);
902 
903 	    for (;;) {
904 		switch (bnum1) {
905 		case -2:
906 		    break;
907 
908 		case -1:
909 		    if (core_used) {
910 			//printf(" + data series %08x:\n", 1<<i);
911 			hdr->data_series |= 1<<i;
912 		    }
913 		    break;
914 
915 		default:
916 		    for (j = 0; j < s->hdr->num_blocks; j++) {
917 			if (s->block[j]->content_type == EXTERNAL &&
918 			    s->block[j]->content_id == bnum1) {
919 			    if (block_used[j]) {
920 				//printf(" + data series %08x:\n", 1<<i);
921 				hdr->data_series |= 1<<i;
922 			    }
923 			}
924 		    }
925 		    break;
926 		}
927 
928 		if (bnum2 == -2 || bnum1 == bnum2)
929 		    break;
930 
931 		bnum1 = bnum2; // 2nd pass
932 	    }
933 	}
934 
935 	// Tags too
936 	for (i = 0; i < CRAM_MAP_HASH; i++) {
937 	    int bnum1, bnum2, j;
938 	    cram_map *m = hdr->tag_encoding_map[i];
939 
940 	    while (m) {
941 		cram_codec *c = m->codec;
942 		if (!c) {
943 		    m = m->next;
944 		    continue;
945 		}
946 
947 		bnum1 = cram_codec_to_id(c, &bnum2);
948 
949 		for (;;) {
950 		    switch (bnum1) {
951 		    case -2:
952 			break;
953 
954 		    case -1:
955 			//printf(" + data series %08x:\n", CRAM_aux);
956 			hdr->data_series |= CRAM_aux;
957 			break;
958 
959 		    default:
960 			for (j = 0; j < s->hdr->num_blocks; j++) {
961 			    if (s->block[j]->content_type == EXTERNAL &&
962 				s->block[j]->content_id == bnum1) {
963 				if (block_used[j]) {
964 				    //printf(" + data series %08x:\n",
965 				    //       CRAM_aux);
966 				    hdr->data_series |= CRAM_aux;
967 				}
968 			    }
969 			}
970 			break;
971 		    }
972 
973 		    if (bnum2 == -2 || bnum1 == bnum2)
974 			break;
975 
976 		    bnum1 = bnum2; // 2nd pass
977 		}
978 
979 		m = m->next;
980 	    }
981 	}
982     } while (orig_ds != hdr->data_series);
983 
984     free(block_used);
985     return 0;
986 }
987 
988 /*
989  * Checks whether an external block is used solely by a single data series.
990  * Returns the codec type if so (EXTERNAL, BYTE_ARRAY_LEN, BYTE_ARRAY_STOP)
991  *         or 0 if not (E_NULL).
992  */
cram_ds_unique(cram_block_compression_hdr * hdr,cram_codec * c,int id)993 static int cram_ds_unique(cram_block_compression_hdr *hdr, cram_codec *c,
994 			  int id) {
995     int i, n_id = 0;
996     enum cram_encoding e_type = 0;
997 
998     for (i = 0; i < DS_END; i++) {
999 	cram_codec *c;
1000 	int bnum1, bnum2, old_n_id;
1001 
1002 	if (!(c = hdr->codecs[i]))
1003 	    continue;
1004 
1005 	bnum1 = cram_codec_to_id(c, &bnum2);
1006 
1007 	old_n_id = n_id;
1008 	if (bnum1 == id) {
1009 	    n_id++;
1010 	    e_type = c->codec;
1011 	}
1012 	if (bnum2 == id) {
1013 	    n_id++;
1014 	    e_type = c->codec;
1015 	}
1016 
1017 	if (n_id == old_n_id+2)
1018 	    n_id--; // len/val in same place counts once only.
1019     }
1020 
1021     return n_id == 1 ? e_type : 0;
1022 }
1023 
1024 /*
1025  * Attempts to estimate the size of some blocks so we can preallocate them
1026  * before decoding.  Although decoding will automatically grow the blocks,
1027  * it is typically more efficient to preallocate.
1028  */
cram_decode_estimate_sizes(cram_block_compression_hdr * hdr,cram_slice * s,int * qual_size,int * name_size,int * q_id)1029 void cram_decode_estimate_sizes(cram_block_compression_hdr *hdr, cram_slice *s,
1030 				int *qual_size, int *name_size,
1031 				int *q_id) {
1032     int bnum1, bnum2;
1033     cram_codec *cd;
1034 
1035     *qual_size = 0;
1036     *name_size = 0;
1037 
1038     /* Qual */
1039     cd = hdr->codecs[DS_QS];
1040     if (cd == NULL) return;
1041     bnum1 = cram_codec_to_id(cd, &bnum2);
1042     if (bnum1 < 0 && bnum2 >= 0) bnum1 = bnum2;
1043     if (cram_ds_unique(hdr, cd, bnum1)) {
1044 	cram_block *b = cram_get_block_by_id(s, bnum1);
1045 	if (b) *qual_size = b->uncomp_size;
1046 	if (q_id && cd->codec == E_EXTERNAL)
1047 	    *q_id = bnum1;
1048     }
1049 
1050     /* Name */
1051     cd = hdr->codecs[DS_RN];
1052     if (cd == NULL) return;
1053     bnum1 = cram_codec_to_id(cd, &bnum2);
1054     if (bnum1 < 0 && bnum2 >= 0) bnum1 = bnum2;
1055     if (cram_ds_unique(hdr, cd, bnum1)) {
1056 	cram_block *b = cram_get_block_by_id(s, bnum1);
1057 	if (b) *name_size = b->uncomp_size;
1058     }
1059 }
1060 
1061 
1062 /* ----------------------------------------------------------------------
1063  * CRAM slices
1064  */
1065 
1066 /*
1067  * Decodes a CRAM (un)mapped slice header block.
1068  * Returns slice header ptr on success
1069  *         NULL on failure
1070  */
cram_decode_slice_header(cram_fd * fd,cram_block * b)1071 cram_block_slice_hdr *cram_decode_slice_header(cram_fd *fd, cram_block *b) {
1072     cram_block_slice_hdr *hdr;
1073     unsigned char *cp;
1074     unsigned char *cp_end;
1075     int i;
1076 
1077     if (b->method != RAW) {
1078         /* Spec. says slice header should be RAW, but we can future-proof
1079 	   by trying to decode it if it isn't. */
1080         if (cram_uncompress_block(b) < 0)
1081             return NULL;
1082     }
1083     cp =  (unsigned char *)BLOCK_DATA(b);
1084     cp_end = cp + b->uncomp_size;
1085 
1086     if (b->content_type != MAPPED_SLICE &&
1087 	b->content_type != UNMAPPED_SLICE)
1088 	return NULL;
1089 
1090     if (!(hdr  = calloc(1, sizeof(*hdr))))
1091 	return NULL;
1092 
1093     hdr->content_type = b->content_type;
1094 
1095     if (b->content_type == MAPPED_SLICE) {
1096         cp += safe_itf8_get((char *)cp,  (char *)cp_end, &hdr->ref_seq_id);
1097         cp += safe_itf8_get((char *)cp,  (char *)cp_end, &hdr->ref_seq_start);
1098         cp += safe_itf8_get((char *)cp,  (char *)cp_end, &hdr->ref_seq_span);
1099     }
1100     cp += safe_itf8_get((char *)cp,  (char *)cp_end, &hdr->num_records);
1101     hdr->record_counter = 0;
1102     if (CRAM_MAJOR_VERS(fd->version) == 2) {
1103 	int32_t i32 = 0;
1104 	cp += safe_itf8_get((char *)cp, (char *)cp_end, &i32);
1105 	hdr->record_counter = i32;
1106     } else if (CRAM_MAJOR_VERS(fd->version) >= 3) {
1107 	cp += safe_ltf8_get((char *)cp, (char *)cp_end, &hdr->record_counter);
1108     }
1109 
1110     cp += safe_itf8_get((char *)cp, (char *)cp_end, &hdr->num_blocks);
1111 
1112     cp += safe_itf8_get((char *)cp, (char *)cp_end, &hdr->num_content_ids);
1113     if (hdr->num_content_ids < 1 ||
1114 	hdr->num_content_ids >= SIZE_MAX / sizeof(int32_t)) {
1115         /* Slice must have at least one data block,
1116 	   and malloc'd size shouldn't wrap. */
1117         free(hdr);
1118         return NULL;
1119     }
1120     hdr->block_content_ids = malloc(hdr->num_content_ids * sizeof(int32_t));
1121     if (!hdr->block_content_ids) {
1122 	free(hdr);
1123 	return NULL;
1124     }
1125 
1126     for (i = 0; i < hdr->num_content_ids; i++) {
1127 	int l = safe_itf8_get((char *)cp, (char *)cp_end,
1128 			      &hdr->block_content_ids[i]);
1129 	if (l <= 0) {
1130 	    free(hdr->block_content_ids);
1131 	    free(hdr);
1132 	    return NULL;
1133 	}
1134 	cp += l;
1135     }
1136 
1137     if (b->content_type == MAPPED_SLICE) {
1138         cp += safe_itf8_get((char *)cp, (char *) cp_end, &hdr->ref_base_id);
1139     }
1140 
1141     if (CRAM_MAJOR_VERS(fd->version) != 1) {
1142         if (cp_end - cp < 16) {
1143             free(hdr->block_content_ids);
1144             free(hdr);
1145             return NULL;
1146         }
1147 	memcpy(hdr->md5, cp, 16);
1148     } else {
1149 	memset(hdr->md5, 0, 16);
1150     }
1151 
1152     return hdr;
1153 }
1154 
1155 
1156 #if 0
1157 /* Returns the number of bits set in val; it the highest bit used */
1158 static int nbits(int v) {
1159     static const int MultiplyDeBruijnBitPosition[32] = {
1160 	1, 10, 2, 11, 14, 22, 3, 30, 12, 15, 17, 19, 23, 26, 4, 31,
1161 	9, 13, 21, 29, 16, 18, 25, 8, 20, 28, 24, 7, 27, 6, 5, 32
1162     };
1163 
1164     v |= v >> 1; // first up to set all bits 1 after the first 1 */
1165     v |= v >> 2;
1166     v |= v >> 4;
1167     v |= v >> 8;
1168     v |= v >> 16;
1169 
1170     // DeBruijn magic to find top bit
1171     return MultiplyDeBruijnBitPosition[(uint32_t)(v * 0x07C4ACDDU) >> 27];
1172 }
1173 #endif
1174 
1175 #if 0
1176 static int sort_freqs(const void *vp1, const void *vp2) {
1177     const int i1 = *(const int *)vp1;
1178     const int i2 = *(const int *)vp2;
1179     return i1-i2;
1180 }
1181 #endif
1182 
1183 /* ----------------------------------------------------------------------
1184  * Primary CRAM sequence decoder
1185  */
1186 
add_md_char(cram_slice * s,int decode_md,char c,int32_t * md_dist)1187 static inline void add_md_char(cram_slice *s, int decode_md, char c, int32_t *md_dist) {
1188     if (decode_md) {
1189 	BLOCK_APPEND_UINT(s->aux_blk, *md_dist);
1190 	BLOCK_APPEND_CHAR(s->aux_blk, c);
1191 	*md_dist = 0;
1192     }
1193 }
1194 
1195 /*
1196  * Internal part of cram_decode_slice().
1197  * Generates the sequence, quality and cigar components.
1198  */
cram_decode_seq(cram_fd * fd,cram_container * c,cram_slice * s,cram_block * blk,cram_record * cr,SAM_hdr * bfd,int cf,char * seq,char * qual,int has_MD,int has_NM)1199 static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
1200 			   cram_block *blk, cram_record *cr, SAM_hdr *bfd,
1201 			   int cf, char *seq, char *qual,
1202 			   int has_MD, int has_NM) {
1203     int prev_pos = 0, f, r = 0, out_sz = 1;
1204     int seq_pos = 1;
1205     int cig_len = 0, ref_pos = cr->apos;
1206     int32_t fn, i32;
1207     enum cigar_op cig_op = BAM_CMATCH;
1208     uint32_t *cigar = s->cigar;
1209     uint32_t ncigar = s->ncigar;
1210     uint32_t cigar_alloc = s->cigar_alloc;
1211     uint32_t nm = 0;
1212     int32_t md_dist = 0;
1213     int orig_aux = 0;
1214     int decode_md = fd->decode_md && s->ref && !has_MD;
1215     int decode_nm = fd->decode_md && s->ref && !has_NM;
1216     uint32_t ds = c->comp_hdr->data_series;
1217 
1218     if ((ds & CRAM_QS) && !(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
1219 	memset(qual, 255, cr->len);
1220     }
1221 
1222     if (cr->cram_flags & CRAM_FLAG_NO_SEQ)
1223 	decode_md = decode_nm = 0;
1224 
1225     if (decode_md) {
1226 	orig_aux = BLOCK_SIZE(s->aux_blk);
1227 	BLOCK_APPEND(s->aux_blk, "MDZ", 3);
1228     }
1229 
1230     if (ds & CRAM_FN) {
1231 	if (!c->comp_hdr->codecs[DS_FN]) return -1;
1232 	r |= c->comp_hdr->codecs[DS_FN]->decode(s,c->comp_hdr->codecs[DS_FN],
1233 						blk, (char *)&fn, &out_sz);
1234         if (r) return r;
1235     } else {
1236 	fn = 0;
1237     }
1238 
1239     ref_pos--; // count from 0
1240     cr->cigar = ncigar;
1241 
1242     if (!(ds & (CRAM_FC | CRAM_FP)))
1243 	goto skip_cigar;
1244 
1245     for (f = 0; f < fn; f++) {
1246 	int32_t pos = 0;
1247 	char op;
1248 
1249 	if (ncigar+2 >= cigar_alloc) {
1250 	    cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024;
1251 	    s->cigar = cigar;
1252 	    if (!(cigar = realloc(cigar, cigar_alloc * sizeof(*cigar))))
1253 		return -1;
1254 	}
1255 
1256 	if (ds & CRAM_FC) {
1257 	    if (!c->comp_hdr->codecs[DS_FC]) return -1;
1258 	    r |= c->comp_hdr->codecs[DS_FC]->decode(s,
1259 						    c->comp_hdr->codecs[DS_FC],
1260 						    blk,
1261 						    &op,  &out_sz);
1262 	    if (r) return r;
1263 	}
1264 
1265 	if (!(ds & CRAM_FP))
1266 	    continue;
1267 
1268 	if (!c->comp_hdr->codecs[DS_FP]) return -1;
1269 	r |= c->comp_hdr->codecs[DS_FP]->decode(s,
1270 						c->comp_hdr->codecs[DS_FP],
1271 						blk,
1272 						(char *)&pos, &out_sz);
1273 	if (r) return r;
1274 	pos += prev_pos;
1275 
1276 	if (pos <= 0) {
1277 	    fprintf(stderr, "Error: feature position %d before start of read.\n",
1278 		    pos);
1279 	    return -1;
1280 	}
1281 
1282 	if (pos > seq_pos) {
1283 	    if (pos > cr->len+1)
1284 		return -1;
1285 
1286 	    if (s->ref && cr->ref_id >= 0) {
1287 		if (ref_pos + pos - seq_pos > bfd->ref[cr->ref_id].len) {
1288 		    static int whinged = 0;
1289 		    int rlen;
1290 		    if (!whinged)
1291 			fprintf(stderr, "Ref pos outside of ref "
1292 				"sequence boundary\n");
1293 		    whinged = 1;
1294 		    rlen = bfd->ref[cr->ref_id].len - ref_pos;
1295 		    // May miss MD/NM cases where both seq/ref are N, but this is a
1296 		    // malformed cram file anyway.
1297 		    if (rlen > 0) {
1298 			memcpy(&seq[seq_pos-1],
1299 			       &s->ref[ref_pos - s->ref_start +1], rlen);
1300 			if ((pos - seq_pos) - rlen > 0)
1301 			    memset(&seq[seq_pos-1+rlen], 'N',
1302 				   (pos - seq_pos) - rlen);
1303 		    } else {
1304 		        memset(&seq[seq_pos-1], 'N', cr->len - seq_pos + 1);
1305 		    }
1306 		    if (md_dist >= 0)
1307 			md_dist += pos - seq_pos;
1308 		} else {
1309 		    // 'N' in both ref and seq is also mismatch for NM/MD
1310 		    if (decode_md || decode_nm) {
1311 			int i;
1312 			for (i = 0; i < pos - seq_pos; i++) {
1313 			    // FIXME: not N, but nt16 lookup == 15?
1314 			    char base = s->ref[ref_pos - s->ref_start + 1 + i];
1315 			    if (base == 'N') {
1316 				add_md_char(s, decode_md,
1317 					    s->ref[ref_pos - s->ref_start + 1 + i],
1318 					    &md_dist);
1319 				nm++;
1320 			    } else {
1321 				md_dist++;
1322 			    }
1323 			    seq[seq_pos-1+i] = base;
1324 			}
1325 		    } else {
1326 			memcpy(&seq[seq_pos-1], &s->ref[ref_pos - s->ref_start +1],
1327 			       pos - seq_pos);
1328 		    }
1329 		}
1330 	    }
1331 #ifdef USE_X
1332 	    if (cig_len && cig_op != BAM_CBASE_MATCH) {
1333 		cigar[ncigar++] = (cig_len<<4) + cig_op;
1334 		cig_len = 0;
1335 	    }
1336 	    cig_op = BAM_CBASE_MATCH;
1337 #else
1338 	    if (cig_len && cig_op != BAM_CMATCH) {
1339 		cigar[ncigar++] = (cig_len<<4) + cig_op;
1340 		cig_len = 0;
1341 	    }
1342 	    cig_op = BAM_CMATCH;
1343 #endif
1344 	    cig_len += pos - seq_pos;
1345 	    ref_pos += pos - seq_pos;
1346 	    seq_pos = pos;
1347 	}
1348 
1349 	prev_pos = pos;
1350 
1351 	if (!(ds & CRAM_FC))
1352 	    goto skip_cigar;
1353 
1354 	switch(op) {
1355 	case 'S': { // soft clip: IN
1356 	    int32_t out_sz2 = 1;
1357 	    int have_sc = 0;
1358 
1359 	    if (cig_len) {
1360 		cigar[ncigar++] = (cig_len<<4) + cig_op;
1361 		cig_len = 0;
1362 	    }
1363 	    switch (CRAM_MAJOR_VERS(fd->version)) {
1364 	    case 1:
1365 	        if (ds & CRAM_IN) {
1366 		    r |= c->comp_hdr->codecs[DS_IN]
1367 			? c->comp_hdr->codecs[DS_IN]
1368 			             ->decode(s, c->comp_hdr->codecs[DS_IN],
1369 					      blk,
1370 					      cr->len ? &seq[pos-1] : NULL,
1371 					      &out_sz2)
1372 			: (seq[pos-1] = 'N', out_sz2 = 1, 0);
1373 		    have_sc = 1;
1374 		}
1375 		break;
1376 	    case 2:
1377 	    default:
1378 	        if (ds & CRAM_SC) {
1379 		    r |= c->comp_hdr->codecs[DS_SC]
1380 			? c->comp_hdr->codecs[DS_SC]
1381 			             ->decode(s, c->comp_hdr->codecs[DS_SC],
1382 					      blk,
1383 					      cr->len ? &seq[pos-1] : NULL,
1384 					      &out_sz2)
1385 			: (seq[pos-1] = 'N', out_sz2 = 1, 0);
1386 		    have_sc = 1;
1387 		}
1388 		break;
1389 
1390 //		default:
1391 //		    r |= c->comp_hdr->codecs[DS_BB]
1392 //			? c->comp_hdr->codecs[DS_BB]
1393 //			             ->decode(s, c->comp_hdr->codecs[DS_BB],
1394 //					      blk, &seq[pos-1], &out_sz2)
1395 //			: (seq[pos-1] = 'N', out_sz2 = 1, 0);
1396 	    }
1397 	    if (have_sc) {
1398 		if (r) return r;
1399 		cigar[ncigar++] = (out_sz2<<4) + BAM_CSOFT_CLIP;
1400 		cig_op = BAM_CSOFT_CLIP;
1401 		seq_pos += out_sz2;
1402 	    }
1403 	    break;
1404 	}
1405 
1406 	case 'X': { // Substitution; BS
1407 	    unsigned char base;
1408 #ifdef USE_X
1409 	    if (cig_len && cig_op != BAM_CBASE_MISMATCH) {
1410 		cigar[ncigar++] = (cig_len<<4) + cig_op;
1411 		cig_len = 0;
1412 	    }
1413 	    if (ds & CRAM_BS) {
1414 		if (!c->comp_hdr->codecs[DS_BS]) return -1;
1415 		r |= c->comp_hdr->codecs[DS_BS]
1416 		                ->decode(s, c->comp_hdr->codecs[DS_BS], blk,
1417 					 (char *)&base, &out_sz);
1418 		if (pos-1 < cr->len)
1419 		    seq[pos-1] = 'N'; // FIXME look up BS=base value
1420 	    }
1421 	    cig_op = BAM_CBASE_MISMATCH;
1422 #else
1423 	    int ref_base;
1424 	    if (cig_len && cig_op != BAM_CMATCH) {
1425 		cigar[ncigar++] = (cig_len<<4) + cig_op;
1426 		cig_len = 0;
1427 	    }
1428 	    if (ds & CRAM_BS) {
1429 		if (!c->comp_hdr->codecs[DS_BS]) return -1;
1430 		r |= c->comp_hdr->codecs[DS_BS]
1431 		                ->decode(s, c->comp_hdr->codecs[DS_BS], blk,
1432 					 (char *)&base, &out_sz);
1433 		if (r) return -1;
1434 		if (ref_pos >= bfd->ref[cr->ref_id].len || !s->ref) {
1435 		    if (pos-1 < cr->len)
1436 			seq[pos-1] = c->comp_hdr->
1437 			    substitution_matrix[fd->L1['N']][base];
1438 		    if (decode_md || decode_nm) {
1439 			if (md_dist >= 0 && decode_md)
1440 			    BLOCK_APPEND_UINT(s->aux_blk, md_dist);
1441 			md_dist = -1;
1442 			nm--;
1443 		    }
1444 		} else {
1445 		    unsigned char ref_call = ref_pos <= s->ref_end
1446 			? (uc)s->ref[ref_pos - s->ref_start +1]
1447 			: 'N';
1448 		    ref_base = fd->L1[ref_call];
1449 		    if (pos-1 < cr->len)
1450 			seq[pos-1] = c->comp_hdr->
1451 			    substitution_matrix[ref_base][base];
1452 		    add_md_char(s, decode_md, ref_call, &md_dist);
1453 		}
1454 	    }
1455 	    cig_op = BAM_CMATCH;
1456 #endif
1457 	    nm++;
1458 	    cig_len++;
1459 	    seq_pos++;
1460 	    ref_pos++;
1461 	    break;
1462 	}
1463 
1464 	case 'D': { // Deletion; DL
1465 	    if (cig_len && cig_op != BAM_CDEL) {
1466 		cigar[ncigar++] = (cig_len<<4) + cig_op;
1467 		cig_len = 0;
1468 	    }
1469 	    if (ds & CRAM_DL) {
1470 		if (!c->comp_hdr->codecs[DS_DL]) return -1;
1471 		r |= c->comp_hdr->codecs[DS_DL]
1472 		                ->decode(s, c->comp_hdr->codecs[DS_DL], blk,
1473 					 (char *)&i32, &out_sz);
1474 		if (r) return r;
1475 		if (decode_md || decode_nm) {
1476 		    if (md_dist >= 0 && decode_md)
1477 			BLOCK_APPEND_UINT(s->aux_blk, md_dist);
1478 		    if (ref_pos + i32 <= bfd->ref[cr->ref_id].len) {
1479 			if (decode_md) {
1480 			    BLOCK_APPEND_CHAR(s->aux_blk, '^');
1481 			    BLOCK_APPEND(s->aux_blk,
1482 					 &s->ref[ref_pos - s->ref_start +1],
1483 					 i32);
1484 			    md_dist = 0;
1485 			}
1486 			nm += i32;
1487 		    } else {
1488 			uint32_t dlen;
1489 			if (bfd->ref[cr->ref_id].len >= ref_pos) {
1490 			    if (decode_md) {
1491 				BLOCK_APPEND_CHAR(s->aux_blk, '^');
1492 				BLOCK_APPEND(s->aux_blk,
1493 					     &s->ref[ref_pos - s->ref_start+1],
1494 					     bfd->ref[cr->ref_id].len-ref_pos);
1495 				BLOCK_APPEND_UINT(s->aux_blk, 0);
1496 			    }
1497 			    dlen = i32 - (bfd->ref[cr->ref_id].len - ref_pos);
1498 			    nm += i32 - dlen;
1499 			} else {
1500 			    dlen = i32;
1501 			}
1502 
1503 			md_dist = -1;
1504 		    }
1505 		}
1506 		cig_op = BAM_CDEL;
1507 		cig_len += i32;
1508 		ref_pos += i32;
1509 		//printf("  %d: DL = %d (ret %d)\n", f, i32, r);
1510 	    }
1511 	    break;
1512 	}
1513 
1514 	case 'I': { // Insertion (several bases); IN
1515 	    int32_t out_sz2 = 1;
1516 
1517 	    if (cig_len && cig_op != BAM_CINS) {
1518 		cigar[ncigar++] = (cig_len<<4) + cig_op;
1519 		cig_len = 0;
1520 	    }
1521 
1522 	    if (ds & CRAM_IN) {
1523 		if (!c->comp_hdr->codecs[DS_IN]) return -1;
1524 		r |= c->comp_hdr->codecs[DS_IN]
1525 		                ->decode(s, c->comp_hdr->codecs[DS_IN], blk,
1526 					 cr->len ? &seq[pos-1] : NULL,
1527 					 &out_sz2);
1528 		if (r) return r;
1529 		cig_op = BAM_CINS;
1530 		cig_len += out_sz2;
1531 		seq_pos += out_sz2;
1532 		nm      += out_sz2;
1533 		//printf("  %d: IN(I) = %.*s (ret %d, out_sz %d)\n", f, out_sz2, dat, r, out_sz2);
1534 	    }
1535 	    break;
1536 	}
1537 
1538 	case 'i': { // Insertion (single base); BA
1539 	    if (cig_len && cig_op != BAM_CINS) {
1540 		cigar[ncigar++] = (cig_len<<4) + cig_op;
1541 		cig_len = 0;
1542 	    }
1543 	    if (ds & CRAM_BA) {
1544 		if (!c->comp_hdr->codecs[DS_BA]) return -1;
1545 		r |= c->comp_hdr->codecs[DS_BA]
1546 		                ->decode(s, c->comp_hdr->codecs[DS_BA], blk,
1547 					 cr->len ? &seq[pos-1] : NULL,
1548 					 &out_sz);
1549 		if (r) return r;
1550 	    }
1551 	    cig_op = BAM_CINS;
1552 	    cig_len++;
1553 	    seq_pos++;
1554 	    nm++;
1555 	    break;
1556 	}
1557 
1558 	case 'b': { // Several bases
1559 	    int32_t len = 1;
1560 
1561 	    if (cig_len && cig_op != BAM_CMATCH) {
1562 		cigar[ncigar++] = (cig_len<<4) + cig_op;
1563 		cig_len = 0;
1564 	    }
1565 
1566 	    if (ds & CRAM_BB) {
1567 		if (!c->comp_hdr->codecs[DS_BB]) return -1;
1568 		r |= c->comp_hdr->codecs[DS_BB]
1569 		    ->decode(s, c->comp_hdr->codecs[DS_BB], blk,
1570 			     cr->len ? &seq[pos-1] : NULL,
1571 			     &len);
1572 		if (r) return r;
1573 
1574 		if (decode_md || decode_nm) {
1575 		    int x;
1576 		    if (md_dist >= 0 && decode_md)
1577 			BLOCK_APPEND_UINT(s->aux_blk, md_dist);
1578 
1579 		    for (x = 0; x < len; x++) {
1580 			if (x && decode_md)
1581 			    BLOCK_APPEND_UINT(s->aux_blk, 0);
1582 			if (ref_pos+x >= bfd->ref[cr->ref_id].len || !s->ref) {
1583 			    md_dist = -1;
1584 			    break;
1585 			} else {
1586 			    if (decode_md) {
1587 				char r = s->ref[ref_pos+x-s->ref_start +1];
1588 				BLOCK_APPEND_CHAR(s->aux_blk, r);
1589 			    }
1590 			}
1591 		    }
1592 
1593 		    nm += x;
1594 		}
1595 	    }
1596 
1597 	    cig_op = BAM_CMATCH;
1598 
1599 	    cig_len+=len;
1600 	    seq_pos+=len;
1601 	    ref_pos+=len;
1602 	    //prev_pos+=len;
1603 	    break;
1604 	}
1605 
1606 	case 'q': { // Several quality values
1607 	    int32_t len = 1;
1608 
1609 	    if (cig_len && cig_op != BAM_CMATCH) {
1610 		cigar[ncigar++] = (cig_len<<4) + cig_op;
1611 		cig_len = 0;
1612 	    }
1613 
1614 	    if (ds & CRAM_QQ) {
1615 		if (!c->comp_hdr->codecs[DS_QQ]) return -1;
1616 		r |= c->comp_hdr->codecs[DS_QQ]
1617 		    ->decode(s, c->comp_hdr->codecs[DS_QQ], blk,
1618 			     (char *)&qual[pos-1], &len);
1619 		if (r) return r;
1620 	    }
1621 
1622 	    cig_op = BAM_CMATCH;
1623 
1624 	    cig_len+=len;
1625 	    seq_pos+=len;
1626 	    ref_pos+=len;
1627 	    //prev_pos+=len;
1628 	    break;
1629 	}
1630 
1631 	case 'B': { // Read base; BA, QS
1632 #ifdef USE_X
1633 	    if (cig_len && cig_op != BAM_CBASE_MISMATCH) {
1634 		cigar[ncigar++] = (cig_len<<4) + cig_op;
1635 		cig_len = 0;
1636 	    }
1637 #else
1638 	    if (cig_len && cig_op != BAM_CMATCH) {
1639 		cigar[ncigar++] = (cig_len<<4) + cig_op;
1640 		cig_len = 0;
1641 	    }
1642 #endif
1643 	    if (ds & CRAM_BA) {
1644 		if (!c->comp_hdr->codecs[DS_BA]) return -1;
1645 		r |= c->comp_hdr->codecs[DS_BA]
1646 		                ->decode(s, c->comp_hdr->codecs[DS_BA], blk,
1647 					 cr->len ? &seq[pos-1] : NULL,
1648 					 &out_sz);
1649 
1650 		if (decode_md || decode_nm) {
1651 		    if (md_dist >= 0 && decode_md)
1652 			BLOCK_APPEND_UINT(s->aux_blk, md_dist);
1653 		    if (ref_pos >= bfd->ref[cr->ref_id].len || !s->ref) {
1654 			md_dist = -1;
1655 		    } else {
1656 			if (decode_md)
1657 			    BLOCK_APPEND_CHAR(s->aux_blk,
1658 					      s->ref[ref_pos-s->ref_start +1]);
1659 			nm++;
1660 			md_dist = 0;
1661 		    }
1662 		}
1663 	    }
1664 	    if (ds & CRAM_QS) {
1665 		if (!c->comp_hdr->codecs[DS_QS]) return -1;
1666 		r |= c->comp_hdr->codecs[DS_QS]
1667 		                ->decode(s, c->comp_hdr->codecs[DS_QS], blk,
1668 					 (char *)&qual[pos-1], &out_sz);
1669 	    }
1670 #ifdef USE_X
1671 	    cig_op = BAM_CBASE_MISMATCH;
1672 #else
1673 	    cig_op = BAM_CMATCH;
1674 #endif
1675 	    cig_len++;
1676 	    seq_pos++;
1677 	    ref_pos++;
1678 	    //printf("  %d: BA/QS(B) = %c/%d (ret %d)\n", f, i32, qc, r);
1679 	    break;
1680 	}
1681 
1682 	case 'Q': { // Quality score; QS
1683 	    if (ds & CRAM_QS) {
1684 		if (!c->comp_hdr->codecs[DS_QS]) return -1;
1685 		r |= c->comp_hdr->codecs[DS_QS]
1686 		                ->decode(s, c->comp_hdr->codecs[DS_QS], blk,
1687 					 (char *)&qual[pos-1], &out_sz);
1688 		//printf("  %d: QS = %d (ret %d)\n", f, qc, r);
1689 	    }
1690 	    break;
1691 	}
1692 
1693 	case 'H': { // hard clip; HC
1694 	    if (cig_len && cig_op != BAM_CHARD_CLIP) {
1695 		cigar[ncigar++] = (cig_len<<4) + cig_op;
1696 		cig_len = 0;
1697 	    }
1698 	    if (ds & CRAM_HC) {
1699 		if (!c->comp_hdr->codecs[DS_HC]) return -1;
1700 		r |= c->comp_hdr->codecs[DS_HC]
1701 		                ->decode(s, c->comp_hdr->codecs[DS_HC], blk,
1702 					 (char *)&i32, &out_sz);
1703 		if (r) return r;
1704 		cig_op = BAM_CHARD_CLIP;
1705 		cig_len += i32;
1706 	    }
1707 	    break;
1708 	}
1709 
1710 	case 'P': { // padding; PD
1711 	    if (cig_len && cig_op != BAM_CPAD) {
1712 		cigar[ncigar++] = (cig_len<<4) + cig_op;
1713 		cig_len = 0;
1714 	    }
1715 	    if (ds & CRAM_PD) {
1716 		if (!c->comp_hdr->codecs[DS_PD]) return -1;
1717 		r |= c->comp_hdr->codecs[DS_PD]
1718 		                ->decode(s, c->comp_hdr->codecs[DS_PD], blk,
1719 					 (char *)&i32, &out_sz);
1720 		if (r) return r;
1721 		cig_op = BAM_CPAD;
1722 		cig_len += i32;
1723 	    }
1724 	    break;
1725 	}
1726 
1727 	case 'N': { // Ref skip; RS
1728 	    if (cig_len && cig_op != BAM_CREF_SKIP) {
1729 		cigar[ncigar++] = (cig_len<<4) + cig_op;
1730 		cig_len = 0;
1731 	    }
1732 	    if (ds & CRAM_RS) {
1733 		if (!c->comp_hdr->codecs[DS_RS]) return -1;
1734 		r |= c->comp_hdr->codecs[DS_RS]
1735 		                ->decode(s, c->comp_hdr->codecs[DS_RS], blk,
1736 					 (char *)&i32, &out_sz);
1737 		if (r) return r;
1738 		cig_op = BAM_CREF_SKIP;
1739 		cig_len += i32;
1740 		ref_pos += i32;
1741 	    }
1742 	    break;
1743 	}
1744 
1745 	default:
1746             fprintf(stderr, "Error: Unknown feature code '%c'\n", op);
1747 	    return -1;
1748 	}
1749     }
1750 
1751     if (!(ds & CRAM_FC))
1752 	goto skip_cigar;
1753 
1754     /* An implicit match op for any unaccounted for bases */
1755     if ((ds & CRAM_FN) && cr->len >= seq_pos) {
1756 	if (s->ref) {
1757 	    if (ref_pos + cr->len - seq_pos + 1 > bfd->ref[cr->ref_id].len) {
1758 		static int whinged = 0;
1759 		int rlen;
1760 		if (!whinged)
1761 		    fprintf(stderr, "Ref pos outside of ref sequence boundary\n");
1762 		whinged = 1;
1763 		rlen = bfd->ref[cr->ref_id].len - ref_pos;
1764 		// May miss MD/NM cases where both seq/ref are N, but this is a
1765 		// malformed cram file anyway.
1766 		if (rlen > 0) {
1767 		    if (seq_pos-1 + rlen < cr->len)
1768 			memcpy(&seq[seq_pos-1],
1769 			       &s->ref[ref_pos - s->ref_start +1], rlen);
1770 		    if ((cr->len - seq_pos + 1) - rlen > 0)
1771 		        memset(&seq[seq_pos-1+rlen], 'N',
1772                                (cr->len - seq_pos + 1) - rlen);
1773 		} else {
1774 		    if (cr->len - seq_pos + 1 > 0)
1775 			memset(&seq[seq_pos-1], 'N', cr->len - seq_pos + 1);
1776 		}
1777 		if (md_dist >= 0)
1778 		    md_dist += cr->len - seq_pos + 1;
1779 	    } else {
1780 		if (cr->len - seq_pos + 1 > 0) {
1781 		    if (decode_md || decode_nm) {
1782 			int i;
1783 			for (i = 0; i < cr->len - seq_pos + 1; i++) {
1784 			    // FIXME: not N, but nt16 lookup == 15?
1785 			    char base = s->ref[ref_pos - s->ref_start + 1 + i];
1786 			    if (base == 'N') {
1787 				add_md_char(s, decode_md,
1788 					    s->ref[ref_pos - s->ref_start + 1 + i],
1789 					    &md_dist);
1790 				nm++;
1791 			    } else {
1792 				md_dist++;
1793 			    }
1794 			    seq[seq_pos-1+i] = base;
1795 			}
1796 		    } else {
1797 			memcpy(&seq[seq_pos-1], &s->ref[ref_pos - s->ref_start +1],
1798 			       cr->len - seq_pos + 1);
1799 		    }
1800 		}
1801 		ref_pos += cr->len - seq_pos + 1;
1802 	    }
1803 	}
1804 
1805 	if (ncigar+1 >= cigar_alloc) {
1806 	    cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024;
1807 	    s->cigar = cigar;
1808 	    if (!(cigar = realloc(cigar, cigar_alloc * sizeof(*cigar))))
1809 		return -1;
1810 	}
1811 #ifdef USE_X
1812 	if (cig_len && cig_op != BAM_CBASE_MATCH) {
1813 	    cigar[ncigar++] = (cig_len<<4) + cig_op;
1814 	    cig_len = 0;
1815 	}
1816 	cig_op = BAM_CBASE_MATCH;
1817 #else
1818 	if (cig_len && cig_op != BAM_CMATCH) {
1819 	    cigar[ncigar++] = (cig_len<<4) + cig_op;
1820 	    cig_len = 0;
1821 	}
1822 	cig_op = BAM_CMATCH;
1823 #endif
1824 	cig_len += cr->len - seq_pos+1;
1825     }
1826 
1827  skip_cigar:
1828 
1829     if ((ds & CRAM_FN) && decode_md) {
1830 	if (md_dist >= 0)
1831 	    BLOCK_APPEND_UINT(s->aux_blk, md_dist);
1832     }
1833 
1834     if (cig_len) {
1835 	if (ncigar >= cigar_alloc) {
1836 	    cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024;
1837 	    s->cigar = cigar;
1838 	    if (!(cigar = realloc(cigar, cigar_alloc * sizeof(*cigar))))
1839 		return -1;
1840 	}
1841 
1842 	cigar[ncigar++] = (cig_len<<4) + cig_op;
1843     }
1844 
1845     cr->ncigar = ncigar - cr->cigar;
1846     cr->aend = ref_pos;
1847 
1848     //printf("2: %.*s %d .. %d\n", cr->name_len, DSTRING_STR(name_ds) + cr->name, cr->apos, ref_pos);
1849 
1850     if (ds & CRAM_MQ) {
1851 	if (!c->comp_hdr->codecs[DS_MQ]) return -1;
1852 	r |= c->comp_hdr->codecs[DS_MQ]
1853 	                ->decode(s, c->comp_hdr->codecs[DS_MQ], blk,
1854 				 (char *)&cr->mqual, &out_sz);
1855     } else {
1856 	cr->mqual = 40;
1857     }
1858 
1859     if ((ds & CRAM_QS) && (cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
1860 	int32_t out_sz2 = cr->len;
1861 
1862 	if (ds & CRAM_QS) {
1863 	    if (!c->comp_hdr->codecs[DS_QS]) return -1;
1864 	    r |= c->comp_hdr->codecs[DS_QS]
1865 		            ->decode(s, c->comp_hdr->codecs[DS_QS], blk,
1866 				     qual, &out_sz2);
1867 	}
1868     }
1869 
1870     s->cigar = cigar;
1871     s->cigar_alloc = cigar_alloc;
1872     s->ncigar = ncigar;
1873 
1874     if (cr->cram_flags & CRAM_FLAG_NO_SEQ)
1875 	cr->len = 0;
1876 
1877     if (decode_md) {
1878 	BLOCK_APPEND_CHAR(s->aux_blk, '\0'); // null terminate MD:Z:
1879 	cr->aux_size += BLOCK_SIZE(s->aux_blk) - orig_aux;
1880     }
1881 
1882     if (decode_nm) {
1883 	char buf[7];
1884 	buf[0] = 'N'; buf[1] = 'M'; buf[2] = 'I';
1885 	buf[3] = (nm>> 0) & 0xff;
1886 	buf[4] = (nm>> 8) & 0xff;
1887 	buf[5] = (nm>>16) & 0xff;
1888 	buf[6] = (nm>>24) & 0xff;
1889 	BLOCK_APPEND(s->aux_blk, buf, 7);
1890 	cr->aux_size += 7;
1891     }
1892 
1893     return r;
1894 }
1895 
1896 /*
1897  * Quick and simple hash lookup for cram_map arrays
1898  */
map_find(cram_map ** map,unsigned char * key,int id)1899 static cram_map *map_find(cram_map **map, unsigned char *key, int id) {
1900     cram_map *m;
1901 
1902     m = map[CRAM_MAP(key[0],key[1])];
1903     while (m && m->key != id)
1904 	m= m->next;
1905 
1906     return m;
1907 }
1908 
1909 //#define map_find(M,K,I) M[CRAM_MAP(K[0],K[1])];while (m && m->key != I);m= m->next
1910 
1911 
cram_decode_aux_1_0(cram_container * c,cram_slice * s,cram_block * blk,cram_record * cr)1912 static int cram_decode_aux_1_0(cram_container *c, cram_slice *s,
1913 			       cram_block *blk, cram_record *cr) {
1914     int i, r = 0, out_sz = 1;
1915     unsigned char ntags;
1916 
1917     if (!c->comp_hdr->codecs[DS_TC]) return -1;
1918     r |= c->comp_hdr->codecs[DS_TC]->decode(s, c->comp_hdr->codecs[DS_TC], blk,
1919 					    (char *)&ntags, &out_sz);
1920     cr->ntags = ntags;
1921 
1922     //printf("TC=%d\n", cr->ntags);
1923     cr->aux_size = 0;
1924     cr->aux = BLOCK_SIZE(s->aux_blk);
1925 
1926     for (i = 0; i < cr->ntags; i++) {
1927 	int32_t id, out_sz = 1;
1928 	unsigned char tag_data[3];
1929 	cram_map *m;
1930 
1931 	//printf("Tag %d/%d\n", i+1, cr->ntags);
1932 	if (!c->comp_hdr->codecs[DS_TN]) return -1;
1933 	r |= c->comp_hdr->codecs[DS_TN]->decode(s, c->comp_hdr->codecs[DS_TN],
1934 						blk, (char *)&id, &out_sz);
1935 	if (out_sz == 3) {
1936 	    tag_data[0] = ((char *)&id)[0];
1937 	    tag_data[1] = ((char *)&id)[1];
1938 	    tag_data[2] = ((char *)&id)[2];
1939 	} else {
1940 	    tag_data[0] = (id>>16) & 0xff;
1941 	    tag_data[1] = (id>>8)  & 0xff;
1942 	    tag_data[2] = id       & 0xff;
1943 	}
1944 
1945 	m = map_find(c->comp_hdr->tag_encoding_map, tag_data, id);
1946 	if (!m)
1947 	    return -1;
1948 	BLOCK_APPEND(s->aux_blk, (char *)tag_data, 3);
1949 
1950 	if (!m->codec) return -1;
1951 	r |= m->codec->decode(s, m->codec, blk, (char *)s->aux_blk, &out_sz);
1952 
1953 	cr->aux_size += out_sz + 3;
1954     }
1955 
1956     return r;
1957 }
1958 
cram_decode_aux(cram_container * c,cram_slice * s,cram_block * blk,cram_record * cr,int * has_MD,int * has_NM)1959 static int cram_decode_aux(cram_container *c, cram_slice *s,
1960 			   cram_block *blk, cram_record *cr,
1961 			   int *has_MD, int *has_NM) {
1962     int i, r = 0, out_sz = 1;
1963     int32_t TL = 0;
1964     unsigned char *TN;
1965     uint32_t ds = c->comp_hdr->data_series;
1966 
1967     if (!(ds & (CRAM_TL|CRAM_aux))) {
1968 	cr->aux = 0;
1969 	cr->aux_size = 0;
1970 	return 0;
1971     }
1972 
1973     if (!c->comp_hdr->codecs[DS_TL]) return -1;
1974     r |= c->comp_hdr->codecs[DS_TL]->decode(s, c->comp_hdr->codecs[DS_TL], blk,
1975 					    (char *)&TL, &out_sz);
1976     if (r || TL < 0 || TL >= c->comp_hdr->nTL)
1977 	return -1;
1978 
1979     TN = c->comp_hdr->TL[TL];
1980     cr->ntags = strlen((char *)TN)/3; // optimise to remove strlen
1981 
1982     //printf("TC=%d\n", cr->ntags);
1983     cr->aux_size = 0;
1984     cr->aux = BLOCK_SIZE(s->aux_blk);
1985 
1986     if (!(ds & CRAM_aux))
1987 	return 0;
1988 
1989     for (i = 0; i < cr->ntags; i++) {
1990 	int32_t id, out_sz = 1;
1991 	unsigned char tag_data[3];
1992 	cram_map *m;
1993 
1994 	if (TN[0] == 'M' && TN[1] == 'D' && has_MD)
1995 	    *has_MD = 1;
1996 	if (TN[0] == 'N' && TN[1] == 'M' && has_NM)
1997 	    *has_NM = 1;
1998 
1999 	//printf("Tag %d/%d\n", i+1, cr->ntags);
2000 	tag_data[0] = *TN++;
2001 	tag_data[1] = *TN++;
2002 	tag_data[2] = *TN++;
2003 	id = (tag_data[0]<<16) | (tag_data[1]<<8) | tag_data[2];
2004 
2005 	m = map_find(c->comp_hdr->tag_encoding_map, tag_data, id);
2006 	if (!m)
2007 	    return -1;
2008 	BLOCK_APPEND(s->aux_blk, (char *)tag_data, 3);
2009 
2010 	if (!m->codec) return -1;
2011 	r |= m->codec->decode(s, m->codec, blk, (char *)s->aux_blk, &out_sz);
2012 	if (r) break;
2013 	cr->aux_size += out_sz + 3;
2014     }
2015 
2016     return r;
2017 }
2018 
2019 /* Resolve mate pair cross-references between recs within this slice */
cram_decode_slice_xref(cram_slice * s,int required_fields)2020 static int cram_decode_slice_xref(cram_slice *s, int required_fields) {
2021     int rec;
2022 
2023     if (!(required_fields & (SAM_RNEXT | SAM_PNEXT | SAM_TLEN))) {
2024 	for (rec = 0; rec < s->hdr->num_records; rec++) {
2025 	    cram_record *cr = &s->crecs[rec];
2026 
2027 	    cr->tlen = 0;
2028 	    cr->mate_pos = 0;
2029 	    cr->mate_ref_id = -1;
2030 	}
2031 
2032 	return 0;
2033     }
2034 
2035     for (rec = 0; rec < s->hdr->num_records; rec++) {
2036 	cram_record *cr = &s->crecs[rec];
2037 
2038 	if (cr->mate_line >= 0) {
2039 	    if (cr->mate_line < s->hdr->num_records) {
2040 		/*
2041 		 * On the first read, loop through computing lengths.
2042 		 * It's not perfect as we have one slice per reference so we
2043 		 * cannot detect when TLEN should be zero due to seqs that
2044 		 * map to multiple references.
2045 		 *
2046 		 * We also cannot set tlen correct when it spans a slice for
2047 		 * other reasons. This may make tlen too small. Should we
2048 		 * fix this by forcing TLEN to be stored verbatim in such cases?
2049 		 *
2050 		 * Or do we just admit defeat and output 0 for tlen? It's the
2051 		 * safe option...
2052 		 */
2053 		if (cr->tlen == INT_MIN) {
2054 		    int id1 = rec, id2 = rec;
2055 		    int aleft = cr->apos, aright = cr->aend;
2056 		    int tlen;
2057 		    int ref = cr->ref_id;
2058 
2059 		    // number of segments starting at the same point.
2060 		    int left_cnt = 0;
2061 
2062 		    do {
2063 			if (aleft > s->crecs[id2].apos)
2064 			    aleft = s->crecs[id2].apos, left_cnt = 1;
2065 			else if (aleft == s->crecs[id2].apos)
2066 			    left_cnt++;
2067 			if (aright < s->crecs[id2].aend)
2068 			    aright = s->crecs[id2].aend;
2069 			if (s->crecs[id2].mate_line == -1) {
2070 			    s->crecs[id2].mate_line = rec;
2071 			    break;
2072 			}
2073 			if (s->crecs[id2].mate_line <= id2 ||
2074                             s->crecs[id2].mate_line >= s->hdr->num_records)
2075 			    return -1;
2076 			id2 = s->crecs[id2].mate_line;
2077 
2078 			if (s->crecs[id2].ref_id != ref)
2079 			    ref = -1;
2080 		    } while (id2 != id1);
2081 
2082 		    if (ref != -1) {
2083 			tlen = aright - aleft + 1;
2084 			id1 = id2 = rec;
2085 
2086 			/*
2087 			 * When we have two seqs with identical start and
2088 			 * end coordinates, set +/- tlen based on 1st/last
2089 			 * bit flags instead, as a tie breaker.
2090 			 */
2091 			if (s->crecs[id2].apos == aleft) {
2092 			    if (left_cnt == 1 ||
2093 				(s->crecs[id2].flags & BAM_FREAD1))
2094 				s->crecs[id2].tlen = tlen;
2095 			    else
2096 				s->crecs[id2].tlen = -tlen;
2097 			} else {
2098 			    s->crecs[id2].tlen = -tlen;
2099 			}
2100 
2101 			id2 = s->crecs[id2].mate_line;
2102 			while (id2 != id1) {
2103 			    if (s->crecs[id2].apos == aleft) {
2104 				if (left_cnt == 1 ||
2105 				    (s->crecs[id2].flags & BAM_FREAD1))
2106 				    s->crecs[id2].tlen = tlen;
2107 				else
2108 				    s->crecs[id2].tlen = -tlen;
2109 			    } else {
2110 				s->crecs[id2].tlen = -tlen;
2111 			    }
2112 			    id2 = s->crecs[id2].mate_line;
2113 			}
2114 		    } else {
2115 			id1 = id2 = rec;
2116 
2117 			s->crecs[id2].tlen = 0;
2118 			id2 = s->crecs[id2].mate_line;
2119 			while (id2 != id1) {
2120 			    s->crecs[id2].tlen = 0;
2121 			    id2 = s->crecs[id2].mate_line;
2122 			}
2123 		    }
2124 		}
2125 
2126 		cr->mate_pos = s->crecs[cr->mate_line].apos;
2127 		cr->mate_ref_id = s->crecs[cr->mate_line].ref_id;
2128 
2129 		// paired
2130 		cr->flags |= BAM_FPAIRED;
2131 
2132 		// set mate unmapped if needed
2133 		if (s->crecs[cr->mate_line].flags & BAM_FUNMAP) {
2134 		    cr->flags |= BAM_FMUNMAP;
2135 		    cr->tlen = 0;
2136 		}
2137 		if (cr->flags & BAM_FUNMAP) {
2138 		    cr->tlen = 0;
2139 		}
2140 
2141 		// set mate reversed if needed
2142 		if (s->crecs[cr->mate_line].flags & BAM_FREVERSE)
2143 		    cr->flags |= BAM_FMREVERSE;
2144 	    } else {
2145 		fprintf(stderr, "Mate line out of bounds: %d vs [0, %d]\n",
2146 			cr->mate_line, s->hdr->num_records-1);
2147 	    }
2148 
2149 	    /* FIXME: construct read names here too if needed */
2150 	} else {
2151 	    if (cr->mate_flags & CRAM_M_REVERSE) {
2152 		cr->flags |= BAM_FPAIRED | BAM_FMREVERSE;
2153 	    }
2154 	    if (cr->mate_flags & CRAM_M_UNMAP) {
2155 		cr->flags |= BAM_FMUNMAP;
2156 		//cr->mate_ref_id = -1;
2157 	    }
2158 	    if (!(cr->flags & BAM_FPAIRED))
2159 		cr->mate_ref_id = -1;
2160 	}
2161 
2162 	if (cr->tlen == INT_MIN)
2163 	    cr->tlen = 0; // Just incase
2164     }
2165     return 0;
2166 }
2167 
md5_print(unsigned char * md5,char * out)2168 static char *md5_print(unsigned char *md5, char *out) {
2169     int i;
2170     for (i = 0; i < 16; i++) {
2171 	out[i*2+0] = "0123456789abcdef"[md5[i]>>4];
2172 	out[i*2+1] = "0123456789abcdef"[md5[i]&15];
2173     }
2174     out[32] = 0;
2175 
2176     return out;
2177 }
2178 
reset_all_codecs(cram_block_compression_hdr * hdr)2179 static void reset_all_codecs(cram_block_compression_hdr *hdr) {
2180     int i;
2181 
2182     for (i = 0; i < DS_END; i++) {
2183 	if (hdr->codecs[i] && hdr->codecs[i]->reset)
2184 	    hdr->codecs[i]->reset(hdr->codecs[i]);
2185     }
2186 
2187     for (i = 0; i < CRAM_MAP_HASH; i++) {
2188 	cram_map *m;
2189 	for (m = hdr->rec_encoding_map[i]; m; m = m->next)
2190 	    if (m->codec) m->codec->reset(m->codec);
2191     }
2192 
2193     for (i = 0; i < CRAM_MAP_HASH; i++) {
2194 	cram_map *m;
2195 	for (m = hdr->tag_encoding_map[i]; m; m = m->next)
2196 	    if (m->codec) m->codec->reset(m->codec);
2197     }
2198 }
2199 
2200 /*
2201  * Decode an entire slice from container blocks. Fills out s->crecs[] array.
2202  * Returns 0 on success
2203  *        -1 on failure
2204  */
cram_decode_slice(cram_fd * fd,cram_container * c,cram_slice * s,SAM_hdr * bfd)2205 int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s,
2206 		      SAM_hdr *bfd) {
2207     cram_block *blk = s->block[0];
2208     int32_t bf, ref_id;
2209     unsigned char cf;
2210     int out_sz, r = 0;
2211     int rec;
2212     char *seq = NULL, *qual = NULL;
2213     int unknown_rg = -1;
2214     int embed_ref;
2215     char **refs = NULL;
2216     uint32_t ds;
2217 
2218     /* Clear any cached blocks in the external / B.A.S. codecs */
2219     reset_all_codecs(c->comp_hdr);
2220 
2221     if (cram_dependent_data_series(fd, c->comp_hdr, s) != 0)
2222 	return -1;
2223 
2224     ds = c->comp_hdr->data_series;
2225 
2226     blk->bit = 7; // MSB first
2227 
2228     // Study the blocks and estimate approx sizes to preallocate.
2229     // This looks to speed up decoding by around 8-9%.
2230     // We can always shrink back down at the end if we overestimated.
2231     // However it's likely that this also saves memory as own growth
2232     // factor (*=1.5) is never applied.
2233     {
2234 	int qsize, nsize, q_id;
2235 	cram_decode_estimate_sizes(c->comp_hdr, s, &qsize, &nsize, &q_id);
2236 	//fprintf(stderr, "qsize=%d nsize=%d\n", qsize, nsize);
2237 
2238 	if (qsize && (ds & CRAM_RL)) BLOCK_RESIZE_EXACT(s->seqs_blk, qsize+1);
2239 	if (qsize && (ds & CRAM_RL)) BLOCK_RESIZE_EXACT(s->qual_blk, qsize+1);
2240 	if (nsize && (ds & CRAM_NS)) BLOCK_RESIZE_EXACT(s->name_blk, nsize+1);
2241 
2242 	// To do - consider using q_id here to usurp the quality block and
2243 	// avoid a memcpy during decode.
2244 	// Specifically when quality is an external block uniquely used by
2245 	// DS_QS only, then we can set s->qual_blk directly to this
2246 	// block and save the codec->decode() calls. (Approx 3% cpu saving)
2247     }
2248 
2249     /* Look for unknown RG, added as last by Java CRAM? */
2250     if (bfd->nrg > 0 &&
2251 	bfd->rg[bfd->nrg-1].name != NULL &&
2252 	!strcmp(bfd->rg[bfd->nrg-1].name, "UNKNOWN"))
2253 	unknown_rg = bfd->nrg-1;
2254 
2255     if (blk->content_type != CORE)
2256 	return -1;
2257 
2258     if (s->crecs)
2259 	free(s->crecs);
2260     if (!(s->crecs = malloc(s->hdr->num_records * sizeof(*s->crecs))))
2261 	return -1;
2262 
2263     ref_id = s->hdr->ref_seq_id;
2264     embed_ref = s->hdr->ref_base_id >= 0 ? 1 : 0;
2265 
2266     if (ref_id >= 0) {
2267 	if (embed_ref) {
2268 	    cram_block *b;
2269 	    if (s->hdr->ref_base_id < 0) {
2270 		fprintf(stderr, "No reference specified and "
2271 			"no embedded reference is available.\n");
2272 		return -1;
2273 	    }
2274 	    b = cram_get_block_by_id(s, s->hdr->ref_base_id);
2275 	    if (!b)
2276 		return -1;
2277 	    if (cram_uncompress_block(b) != 0)
2278 		return -1;
2279 	    s->ref = (char *)BLOCK_DATA(b);
2280 	    s->ref_start = s->hdr->ref_seq_start;
2281 	    s->ref_end   = s->hdr->ref_seq_start + s->hdr->ref_seq_span-1;
2282 	    if (s->ref_end - s->ref_start > b->uncomp_size) {
2283 		fprintf(stderr, "Embedded reference is too small.\n");
2284 		return -1;
2285 	    }
2286 	} else if (!fd->no_ref) {
2287 	    //// Avoid Java cramtools bug by loading entire reference seq
2288 	    //s->ref = cram_get_ref(fd, s->hdr->ref_seq_id, 1, 0);
2289 	    //s->ref_start = 1;
2290 
2291 	    if (fd->required_fields & SAM_SEQ)
2292 		s->ref =
2293 		cram_get_ref(fd, s->hdr->ref_seq_id,
2294 			     s->hdr->ref_seq_start,
2295 			     s->hdr->ref_seq_start + s->hdr->ref_seq_span -1);
2296 	    s->ref_start = s->hdr->ref_seq_start;
2297 	    s->ref_end   = s->hdr->ref_seq_start + s->hdr->ref_seq_span-1;
2298 
2299 	    /* Sanity check */
2300 	    if (s->ref_start < 0) {
2301 		fprintf(stderr, "Slice starts before base 1.\n");
2302 		s->ref_start = 0;
2303 	    }
2304 	    pthread_mutex_lock(&fd->ref_lock);
2305 	    pthread_mutex_lock(&fd->refs->lock);
2306 	    if ((fd->required_fields & SAM_SEQ) &&
2307                 ref_id < fd->refs->nref &&
2308 		s->ref_end > fd->refs->ref_id[ref_id]->length) {
2309 		s->ref_end = fd->refs->ref_id[ref_id]->length;
2310 	    }
2311 	    pthread_mutex_unlock(&fd->refs->lock);
2312 	    pthread_mutex_unlock(&fd->ref_lock);
2313 	}
2314     }
2315 
2316     if ((fd->required_fields & SAM_SEQ) &&
2317 	s->ref == NULL && s->hdr->ref_seq_id >= 0 && !fd->no_ref) {
2318 	fprintf(stderr, "Unable to fetch reference #%d %d..%d\n",
2319 		s->hdr->ref_seq_id, s->hdr->ref_seq_start,
2320 		s->hdr->ref_seq_start + s->hdr->ref_seq_span-1);
2321 	return -1;
2322     }
2323 
2324     if (CRAM_MAJOR_VERS(fd->version) != 1
2325 	&& (fd->required_fields & SAM_SEQ)
2326 	&& s->hdr->ref_seq_id >= 0
2327 	&& !fd->ignore_md5
2328 	&& memcmp(s->hdr->md5, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0", 16)) {
2329 	hts_md5_context *md5;
2330 	unsigned char digest[16];
2331 
2332 	if (s->ref && s->hdr->ref_seq_id >= 0) {
2333 	    int start, len;
2334 
2335 	    if (s->hdr->ref_seq_start >= s->ref_start) {
2336 		start = s->hdr->ref_seq_start - s->ref_start;
2337 	    } else {
2338 		fprintf(stderr, "Slice starts before base 1.\n");
2339 		start = 0;
2340 	    }
2341 
2342 	    if (s->hdr->ref_seq_span <= s->ref_end - s->ref_start + 1) {
2343 		len = s->hdr->ref_seq_span;
2344 	    } else {
2345 		fprintf(stderr, "Slice ends beyond reference end.\n");
2346 		len = s->ref_end - s->ref_start + 1;
2347 	    }
2348 
2349 	    if (!(md5 = hts_md5_init()))
2350 		return -1;
2351 	    if (start + len > s->ref_end - s->ref_start + 1)
2352 		len = s->ref_end - s->ref_start + 1 - start;
2353 	    if (len >= 0)
2354 		hts_md5_update(md5, s->ref + start, len);
2355 	    hts_md5_final(digest, md5);
2356 	    hts_md5_destroy(md5);
2357 	} else if (!s->ref && s->hdr->ref_base_id >= 0) {
2358 	    cram_block *b = cram_get_block_by_id(s, s->hdr->ref_base_id);
2359 	    if (b) {
2360 		if (!(md5 = hts_md5_init()))
2361 		    return -1;
2362 		hts_md5_update(md5, b->data, b->uncomp_size);
2363 		hts_md5_final(digest, md5);
2364 		hts_md5_destroy(md5);
2365 	    }
2366 	}
2367 
2368 	if ((!s->ref && s->hdr->ref_base_id < 0)
2369 	    || memcmp(digest, s->hdr->md5, 16) != 0) {
2370 	    char M[33];
2371 	    fprintf(stderr, "ERROR: md5sum reference mismatch for ref "
2372 		    "%d pos %d..%d\n", ref_id, s->ref_start, s->ref_end);
2373 	    fprintf(stderr, "CRAM: %s\n", md5_print(s->hdr->md5, M));
2374 	    fprintf(stderr, "Ref : %s\n", md5_print(digest, M));
2375 	    return -1;
2376 	}
2377     }
2378 
2379     if (ref_id == -2) {
2380 	pthread_mutex_lock(&fd->ref_lock);
2381 	pthread_mutex_lock(&fd->refs->lock);
2382 	refs = calloc(fd->refs->nref, sizeof(char *));
2383 	pthread_mutex_unlock(&fd->refs->lock);
2384 	pthread_mutex_unlock(&fd->ref_lock);
2385 	if (!refs)
2386 	    return -1;
2387     }
2388 
2389     int last_ref_id = -9; // Arbitrary -ve marker for not-yet-set
2390     for (rec = 0; rec < s->hdr->num_records; rec++) {
2391 	cram_record *cr = &s->crecs[rec];
2392 	int has_MD, has_NM;
2393 
2394 	//fprintf(stderr, "Decode seq %d, %d/%d\n", rec, blk->byte, blk->bit);
2395 
2396 	cr->s = s;
2397 
2398 	out_sz = 1; /* decode 1 item */
2399 	if (ds & CRAM_BF) {
2400 	    if (!c->comp_hdr->codecs[DS_BF]) return -1;
2401 	    r |= c->comp_hdr->codecs[DS_BF]
2402 		            ->decode(s, c->comp_hdr->codecs[DS_BF], blk,
2403 				     (char *)&bf, &out_sz);
2404 	    if (r || bf < 0 ||
2405 		bf >= sizeof(fd->bam_flag_swap)/sizeof(*fd->bam_flag_swap))
2406 		return -1;
2407 	    bf = fd->bam_flag_swap[bf];
2408 	    cr->flags = bf;
2409 	} else {
2410 	    cr->flags = bf = 0x4; // unmapped
2411 	}
2412 
2413 	if (ds & CRAM_CF) {
2414 	    if (CRAM_MAJOR_VERS(fd->version) == 1) {
2415 		/* CF is byte in 1.0, int32 in 2.0 */
2416 		if (!c->comp_hdr->codecs[DS_CF]) return -1;
2417 		r |= c->comp_hdr->codecs[DS_CF]
2418 		                ->decode(s, c->comp_hdr->codecs[DS_CF], blk,
2419 				 	 (char *)&cf, &out_sz);
2420 		if (r) return -1;
2421 		cr->cram_flags = cf;
2422 	    } else {
2423 		if (!c->comp_hdr->codecs[DS_CF]) return -1;
2424 		r |= c->comp_hdr->codecs[DS_CF]
2425 		                ->decode(s, c->comp_hdr->codecs[DS_CF], blk,
2426 					 (char *)&cr->cram_flags, &out_sz);
2427 		if (r) return -1;
2428 		cf = cr->cram_flags;
2429 	    }
2430 	} else {
2431 	    cf = cr->cram_flags = 0;
2432 	}
2433 
2434 	if (CRAM_MAJOR_VERS(fd->version) != 1 && ref_id == -2) {
2435 	    if (ds & CRAM_RI) {
2436 		if (!c->comp_hdr->codecs[DS_RI]) return -1;
2437 		r |= c->comp_hdr->codecs[DS_RI]
2438 		                ->decode(s, c->comp_hdr->codecs[DS_RI], blk,
2439 					 (char *)&cr->ref_id, &out_sz);
2440 		if (r) return -1;
2441 		if ((fd->required_fields & (SAM_SEQ|SAM_TLEN))
2442 		    && cr->ref_id >= 0
2443 		    && cr->ref_id != last_ref_id) {
2444 		    if (!fd->no_ref) {
2445 			if (!refs[cr->ref_id])
2446 			    refs[cr->ref_id] = cram_get_ref(fd, cr->ref_id,
2447 							    1, 0);
2448 			s->ref = refs[cr->ref_id];
2449 
2450 			if (!fd->unsorted && last_ref_id >= 0 && refs[last_ref_id]) {
2451 			    cram_ref_decr(fd->refs, last_ref_id);
2452 			    refs[last_ref_id] = NULL;
2453 			}
2454 		    }
2455 		    s->ref_start = 1;
2456 		    pthread_mutex_lock(&fd->ref_lock);
2457 		    pthread_mutex_lock(&fd->refs->lock);
2458 		    s->ref_end = fd->refs->ref_id[cr->ref_id]->length;
2459 		    pthread_mutex_unlock(&fd->refs->lock);
2460 		    pthread_mutex_unlock(&fd->ref_lock);
2461 
2462 		    last_ref_id = cr->ref_id;
2463 		}
2464 	    } else {
2465 		cr->ref_id = -1;
2466 	    }
2467 	} else {
2468 	    cr->ref_id = ref_id; // Forced constant in CRAM 1.0
2469 	}
2470 	if (cr->ref_id >= bfd->nref) {
2471 	    fprintf(stderr, "Requested unknown reference ID %d\n", cr->ref_id);
2472             return -1;
2473 	}
2474 
2475 	if (ds & CRAM_RL) {
2476 	    if (!c->comp_hdr->codecs[DS_RL]) return -1;
2477 	    r |= c->comp_hdr->codecs[DS_RL]
2478 		            ->decode(s, c->comp_hdr->codecs[DS_RL], blk,
2479 				     (char *)&cr->len, &out_sz);
2480 	    if (r) return r;
2481 	    if (cr->len < 0) {
2482 	        fprintf(stderr, "Read has negative length\n");
2483 		return -1;
2484 	    }
2485 	}
2486 
2487 	if (ds & CRAM_AP) {
2488 	    if (!c->comp_hdr->codecs[DS_AP]) return -1;
2489 	    r |= c->comp_hdr->codecs[DS_AP]
2490 		            ->decode(s, c->comp_hdr->codecs[DS_AP], blk,
2491 				     (char *)&cr->apos, &out_sz);
2492 	    if (r) return r;
2493 	    if (c->comp_hdr->AP_delta)
2494 		cr->apos += s->last_apos;
2495 	    s->last_apos=  cr->apos;
2496 	} else {
2497 	    cr->apos = c->ref_seq_start;
2498 	}
2499 
2500 	if (ds & CRAM_RG) {
2501 	    if (!c->comp_hdr->codecs[DS_RG]) return -1;
2502 	    r |= c->comp_hdr->codecs[DS_RG]
2503 		           ->decode(s, c->comp_hdr->codecs[DS_RG], blk,
2504 				    (char *)&cr->rg, &out_sz);
2505 	    if (r) return r;
2506 	    if (cr->rg == unknown_rg)
2507 		cr->rg = -1;
2508 	} else {
2509 	    cr->rg = -1;
2510 	}
2511 
2512 	cr->name_len = 0;
2513 
2514 	if (c->comp_hdr->read_names_included) {
2515 	    int32_t out_sz2 = 1;
2516 
2517 	    // Read directly into name cram_block
2518 	    cr->name = BLOCK_SIZE(s->name_blk);
2519 	    if (ds & CRAM_RN) {
2520 		if (!c->comp_hdr->codecs[DS_RN]) return -1;
2521 		r |= c->comp_hdr->codecs[DS_RN]
2522 		                ->decode(s, c->comp_hdr->codecs[DS_RN], blk,
2523 					 (char *)s->name_blk, &out_sz2);
2524 		if (r) return r;
2525 		cr->name_len = out_sz2;
2526 	    }
2527 	}
2528 
2529 	cr->mate_pos = 0;
2530 	cr->mate_line = -1;
2531 	cr->mate_ref_id = -1;
2532 	if ((ds & CRAM_CF) && (cf & CRAM_FLAG_DETACHED)) {
2533 	    if (ds & CRAM_MF) {
2534 		if (CRAM_MAJOR_VERS(fd->version) == 1) {
2535 		    /* MF is byte in 1.0, int32 in 2.0 */
2536 		    unsigned char mf;
2537 		    if (!c->comp_hdr->codecs[DS_MF]) return -1;
2538 		    r |= c->comp_hdr->codecs[DS_MF]
2539 			            ->decode(s, c->comp_hdr->codecs[DS_MF],
2540 					     blk, (char *)&mf, &out_sz);
2541 		    if (r) return r;
2542 		    cr->mate_flags = mf;
2543 		} else {
2544 		    if (!c->comp_hdr->codecs[DS_MF]) return -1;
2545 		    r |= c->comp_hdr->codecs[DS_MF]
2546 			            ->decode(s, c->comp_hdr->codecs[DS_MF],
2547 					     blk,
2548 					     (char *)&cr->mate_flags,
2549 					     &out_sz);
2550 		    if (r) return r;
2551 		}
2552 	    } else {
2553 		cr->mate_flags = 0;
2554 	    }
2555 
2556 	    if (!c->comp_hdr->read_names_included) {
2557 		int32_t out_sz2 = 1;
2558 
2559 		// Read directly into name cram_block
2560 		cr->name = BLOCK_SIZE(s->name_blk);
2561 		if (ds & CRAM_RN) {
2562 		    if (!c->comp_hdr->codecs[DS_RN]) return -1;
2563 		    r |= c->comp_hdr->codecs[DS_RN]
2564 			            ->decode(s, c->comp_hdr->codecs[DS_RN],
2565 					     blk, (char *)s->name_blk,
2566 					     &out_sz2);
2567 		    if (r) return r;
2568 		    cr->name_len = out_sz2;
2569 		}
2570 	    }
2571 
2572 	    if (ds & CRAM_NS) {
2573 		if (!c->comp_hdr->codecs[DS_NS]) return -1;
2574 		r |= c->comp_hdr->codecs[DS_NS]
2575 		                ->decode(s, c->comp_hdr->codecs[DS_NS], blk,
2576 					 (char *)&cr->mate_ref_id, &out_sz);
2577 		if (r) return r;
2578 	    }
2579 
2580 // Skip as mate_ref of "*" is legit. It doesn't mean unmapped, just unknown.
2581 //	    if (cr->mate_ref_id == -1 && cr->flags & 0x01) {
2582 //		/* Paired, but unmapped */
2583 //		cr->flags |= BAM_FMUNMAP;
2584 //	    }
2585 
2586 	    if (ds & CRAM_NP) {
2587 		if (!c->comp_hdr->codecs[DS_NP]) return -1;
2588 		r |= c->comp_hdr->codecs[DS_NP]
2589 		                ->decode(s, c->comp_hdr->codecs[DS_NP], blk,
2590 					 (char *)&cr->mate_pos, &out_sz);
2591 		if (r) return r;
2592 	    }
2593 
2594 	    if (ds & CRAM_TS) {
2595 		if (!c->comp_hdr->codecs[DS_TS]) return -1;
2596 		r |= c->comp_hdr->codecs[DS_TS]
2597 		                ->decode(s, c->comp_hdr->codecs[DS_TS], blk,
2598 					 (char *)&cr->tlen, &out_sz);
2599 		if (r) return r;
2600 	    } else {
2601 		cr->tlen = INT_MIN;
2602 	    }
2603 	} else if ((ds & CRAM_CF) && (cf & CRAM_FLAG_MATE_DOWNSTREAM)) {
2604 	    if (ds & CRAM_NF) {
2605 		if (!c->comp_hdr->codecs[DS_NF]) return -1;
2606 		r |= c->comp_hdr->codecs[DS_NF]
2607 		                ->decode(s, c->comp_hdr->codecs[DS_NF], blk,
2608 					 (char *)&cr->mate_line, &out_sz);
2609 		if (r) return r;
2610 		cr->mate_line += rec + 1;
2611 
2612 		//cr->name_len = sprintf(name, "%d", name_id++);
2613 		//cr->name = DSTRING_LEN(name_ds);
2614 		//dstring_nappend(name_ds, name, cr->name_len);
2615 
2616 		cr->mate_ref_id = -1;
2617 		cr->tlen = INT_MIN;
2618 		cr->mate_pos = 0;
2619 	    } else  {
2620 		cr->mate_flags = 0;
2621 		cr->tlen = INT_MIN;
2622 	    }
2623 	} else {
2624 	    cr->mate_flags = 0;
2625 	    cr->tlen = INT_MIN;
2626 	}
2627 	/*
2628 	else if (!name[0]) {
2629 	    //name[0] = '?'; name[1] = 0;
2630 	    //cr->name_len = 1;
2631 	    //cr->name=  DSTRING_LEN(s->name_ds);
2632 	    //dstring_nappend(s->name_ds, "?", 1);
2633 
2634 	    cr->mate_ref_id = -1;
2635 	    cr->tlen = 0;
2636 	    cr->mate_pos = 0;
2637 	}
2638 	*/
2639 
2640 	/* Auxiliary tags */
2641 	has_MD = has_NM = 0;
2642 	if (CRAM_MAJOR_VERS(fd->version) == 1)
2643 	    r |= cram_decode_aux_1_0(c, s, blk, cr);
2644 	else
2645 	    r |= cram_decode_aux(c, s, blk, cr, &has_MD, &has_NM);
2646 	if (r) return r;
2647 
2648 	/* Fake up dynamic string growth and appending */
2649 	if (ds & CRAM_RL) {
2650 	    cr->seq = BLOCK_SIZE(s->seqs_blk);
2651 	    BLOCK_GROW(s->seqs_blk, cr->len);
2652 	    seq = (char *)BLOCK_END(s->seqs_blk);
2653 	    BLOCK_SIZE(s->seqs_blk) += cr->len;
2654 
2655 	    if (!seq)
2656 		return -1;
2657 
2658 	    cr->qual = BLOCK_SIZE(s->qual_blk);
2659 	    BLOCK_GROW(s->qual_blk, cr->len);
2660 	    qual = (char *)BLOCK_END(s->qual_blk);
2661 	    BLOCK_SIZE(s->qual_blk) += cr->len;
2662 
2663 	    if (!s->ref)
2664 		memset(seq, '=', cr->len);
2665 	}
2666 
2667 	if (!(bf & BAM_FUNMAP)) {
2668             if ((ds & CRAM_AP) && cr->apos <= 0) {
2669                 fprintf(stderr,
2670 			"Read has alignment position %d but no unmapped flag\n",
2671 			cr->apos);
2672 		return -1;
2673 	    }
2674 	    /* Decode sequence and generate CIGAR */
2675 	    if (ds & (CRAM_SEQ | CRAM_MQ)) {
2676 		r |= cram_decode_seq(fd, c, s, blk, cr, bfd, cf, seq, qual,
2677 				     has_MD, has_NM);
2678 		if (r) return r;
2679 	    } else {
2680 		cr->cigar = 0;
2681 		cr->ncigar = 0;
2682 		cr->aend = cr->apos;
2683 		cr->mqual = 0;
2684 	    }
2685 	} else {
2686 	    int out_sz2 = cr->len;
2687 
2688 	    //puts("Unmapped");
2689 	    cr->cigar = 0;
2690 	    cr->ncigar = 0;
2691 	    cr->aend = cr->apos;
2692 	    cr->mqual = 0;
2693 
2694 	    if (ds & CRAM_BA && cr->len) {
2695 		if (!c->comp_hdr->codecs[DS_BA]) return -1;
2696 		r |= c->comp_hdr->codecs[DS_BA]
2697 		                ->decode(s, c->comp_hdr->codecs[DS_BA], blk,
2698 					 (char *)seq, &out_sz2);
2699 		if (r) return r;
2700 	    }
2701 
2702 	    if ((ds & CRAM_CF) && (cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
2703 		out_sz2 = cr->len;
2704 		if (ds & CRAM_QS && cr->len >= 0) {
2705 		    if (!c->comp_hdr->codecs[DS_QS]) return -1;
2706 		    r |= c->comp_hdr->codecs[DS_QS]
2707 			            ->decode(s, c->comp_hdr->codecs[DS_QS],
2708 					     blk, qual, &out_sz2);
2709 		    if (r) return r;
2710 		}
2711 	    } else {
2712 		if (ds & CRAM_RL)
2713 		    memset(qual, 255, cr->len);
2714 	    }
2715 	}
2716     }
2717 
2718     pthread_mutex_lock(&fd->ref_lock);
2719     if (refs) {
2720 	int i;
2721 	for (i = 0; i < fd->refs->nref; i++) {
2722 	    if (refs[i])
2723 		cram_ref_decr(fd->refs, i);
2724 	}
2725 	free(refs);
2726     } else if (ref_id >= 0 && s->ref != fd->ref_free && !embed_ref) {
2727 	cram_ref_decr(fd->refs, ref_id);
2728     }
2729     pthread_mutex_unlock(&fd->ref_lock);
2730 
2731     /* Resolve mate pair cross-references between recs within this slice */
2732     r |= cram_decode_slice_xref(s, fd->required_fields);
2733 
2734     // Free the original blocks as we no longer need these.
2735     {
2736 	int i;
2737 	for (i = 0; i < s->hdr->num_blocks; i++) {
2738 	    cram_block *b = s->block[i];
2739 	    cram_free_block(b);
2740 	    s->block[i] = NULL;
2741 	}
2742     }
2743 
2744     // Also see initial BLOCK_RESIZE_EXACT at top of function.
2745     // As we grow blocks we overallocate by up to 50%. So shrink
2746     // back to their final sizes here.
2747     //
2748 //    fprintf(stderr, "%d %d // %d %d // %d %d // %d %d\n",
2749 //    	    (int)s->seqs_blk->byte, (int)s->seqs_blk->alloc,
2750 //    	    (int)s->qual_blk->byte, (int)s->qual_blk->alloc,
2751 //    	    (int)s->name_blk->byte, (int)s->name_blk->alloc,
2752 //    	    (int)s->aux_blk->byte,  (int)s->aux_blk->alloc);
2753     BLOCK_RESIZE_EXACT(s->seqs_blk, BLOCK_SIZE(s->seqs_blk)+1);
2754     BLOCK_RESIZE_EXACT(s->qual_blk, BLOCK_SIZE(s->qual_blk)+1);
2755     BLOCK_RESIZE_EXACT(s->name_blk, BLOCK_SIZE(s->name_blk)+1);
2756     BLOCK_RESIZE_EXACT(s->aux_blk,  BLOCK_SIZE(s->aux_blk)+1);
2757 
2758     return r;
2759 }
2760 
2761 typedef struct {
2762     cram_fd *fd;
2763     cram_container *c;
2764     cram_slice *s;
2765     SAM_hdr *h;
2766     int exit_code;
2767 } cram_decode_job;
2768 
cram_decode_slice_thread(void * arg)2769 void *cram_decode_slice_thread(void *arg) {
2770     cram_decode_job *j = (cram_decode_job *)arg;
2771 
2772     j->exit_code = cram_decode_slice(j->fd, j->c, j->s, j->h);
2773 
2774     return j;
2775 }
2776 
2777 /*
2778  * Spawn a multi-threaded version of cram_decode_slice().
2779  */
cram_decode_slice_mt(cram_fd * fd,cram_container * c,cram_slice * s,SAM_hdr * bfd)2780 int cram_decode_slice_mt(cram_fd *fd, cram_container *c, cram_slice *s,
2781 			 SAM_hdr *bfd) {
2782     cram_decode_job *j;
2783     int nonblock;
2784 
2785     if (!fd->pool)
2786 	return cram_decode_slice(fd, c, s, bfd);
2787 
2788     if (!(j = malloc(sizeof(*j))))
2789 	return -1;
2790 
2791     j->fd = fd;
2792     j->c  = c;
2793     j->s  = s;
2794     j->h  = bfd;
2795 
2796     nonblock = hts_tpool_process_sz(fd->rqueue) ? 1 : 0;
2797 
2798     if (-1 == hts_tpool_dispatch2(fd->pool, fd->rqueue, cram_decode_slice_thread,
2799 			       j, nonblock)) {
2800 	/* Would block */
2801 	fd->job_pending = j;
2802     } else {
2803 	fd->job_pending = NULL;
2804     }
2805 
2806     // flush too
2807     return 0;
2808 }
2809 
2810 
2811 /* ----------------------------------------------------------------------
2812  * CRAM sequence iterators.
2813  */
2814 
2815 /*
2816  * Converts a cram in-memory record into a bam in-memory record. We
2817  * pass a pointer to a bam_seq_t pointer along with the a pointer to
2818  * the allocated size. These can initially be pointers to NULL and zero.
2819  *
2820  * This function will reallocate the bam buffer as required and update
2821  * (*bam)->alloc accordingly, allowing it to be used within a loop
2822  * efficiently without needing to allocate new bam objects over and
2823  * over again.
2824  *
2825  * Returns the used size of the bam record on success
2826  *         -1 on failure.
2827  */
cram_to_bam(SAM_hdr * bfd,cram_fd * fd,cram_slice * s,cram_record * cr,int rec,bam_seq_t ** bam)2828 static int cram_to_bam(SAM_hdr *bfd, cram_fd *fd, cram_slice *s,
2829 		       cram_record *cr, int rec, bam_seq_t **bam) {
2830     int bam_idx, rg_len;
2831     char name_a[1024], *name;
2832     int name_len;
2833     char *aux, *aux_orig;
2834     char *seq, *qual;
2835 
2836     /* Assign names if not explicitly set */
2837     if (fd->required_fields & SAM_QNAME) {
2838 	if (cr->name_len) {
2839 	    name = (char *)BLOCK_DATA(s->name_blk) + cr->name;
2840 	    name_len = cr->name_len;
2841 	} else {
2842 	    name = name_a;
2843 	    name_len = strlen(fd->prefix);
2844 	    memcpy(name, fd->prefix, name_len);
2845 	    name += name_len;
2846 	    *name++ = ':';
2847 	    if (cr->mate_line >= 0 && cr->mate_line < rec)
2848 		name = (char *)append_uint64((unsigned char *)name,
2849 					     s->hdr->record_counter +
2850 					     cr->mate_line + 1);
2851 	    else
2852 		name = (char *)append_uint64((unsigned char *)name,
2853 					     s->hdr->record_counter +
2854 					     rec + 1);
2855 	    name_len = name - name_a;
2856 	    name = name_a;
2857 	}
2858     } else {
2859 	name = "?";
2860 	name_len = 1;
2861     }
2862 
2863     /* Generate BAM record */
2864     if (cr->rg < -1 || cr->rg >= bfd->nrg)
2865 	return -1;
2866     rg_len = (cr->rg != -1) ? bfd->rg[cr->rg].name_len + 4 : 0;
2867 
2868     if (fd->required_fields & (SAM_SEQ | SAM_QUAL)) {
2869 	if (!BLOCK_DATA(s->seqs_blk))
2870 	    return -1;
2871 	seq = (char *)BLOCK_DATA(s->seqs_blk) + cr->seq;
2872     } else {
2873 	seq = "*";
2874 	cr->len = 0;
2875     }
2876 
2877 
2878     if (fd->required_fields & SAM_QUAL) {
2879 	if (!BLOCK_DATA(s->qual_blk))
2880 	    return -1;
2881 	qual = (char *)BLOCK_DATA(s->qual_blk) + cr->qual;
2882     } else {
2883 	qual = NULL;
2884     }
2885 
2886     bam_idx = bam_construct_seq(bam, cr->aux_size + rg_len,
2887 				name, name_len,
2888 				cr->flags,
2889 				cr->ref_id,
2890 				cr->apos,
2891 				cr->aend,
2892 				cr->mqual,
2893 				cr->ncigar, &s->cigar[cr->cigar],
2894 				cr->mate_ref_id,
2895 				cr->mate_pos,
2896 				cr->tlen,
2897 				cr->len,
2898 				seq,
2899 				qual);
2900     if (bam_idx == -1)
2901 	return -1;
2902 
2903     aux = aux_orig = (char *)bam_aux(*bam);
2904 
2905     /* Auxiliary strings */
2906     if (cr->aux_size != 0) {
2907 	memcpy(aux, BLOCK_DATA(s->aux_blk) + cr->aux, cr->aux_size);
2908 	aux += cr->aux_size;
2909     }
2910 
2911     /* RG:Z: */
2912     if (cr->rg != -1) {
2913 	int len = bfd->rg[cr->rg].name_len;
2914 	*aux++ = 'R'; *aux++ = 'G'; *aux++ = 'Z';
2915 	memcpy(aux, bfd->rg[cr->rg].name, len);
2916 	aux += len;
2917 	*aux++ = 0;
2918     }
2919 
2920     return bam_idx + (aux - aux_orig);
2921 }
2922 
2923 /*
2924  * Here be dragons! The multi-threading code in this is crufty beyond belief.
2925  */
cram_next_slice(cram_fd * fd,cram_container ** cp)2926 static cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp) {
2927     cram_container *c;
2928     cram_slice *s = NULL;
2929 
2930     if (!(c = fd->ctr)) {
2931 	// Load first container.
2932 	do {
2933 	    if (!(c = fd->ctr = cram_read_container(fd)))
2934 		return NULL;
2935 	} while (c->length == 0);
2936 
2937 	/*
2938 	 * The first container may be a result of a sub-range query.
2939 	 * In which case it may still not be the optimal starting point
2940 	 * due to skipped containers/slices in the index.
2941 	 */
2942 	if (fd->range.refid != -2) {
2943 	    while (c->ref_seq_id != -2 &&
2944 		   (c->ref_seq_id < fd->range.refid ||
2945 		    c->ref_seq_start + c->ref_seq_span-1 < fd->range.start)) {
2946 		if (0 != cram_seek(fd, c->length, SEEK_CUR))
2947 		    return NULL;
2948 		cram_free_container(fd->ctr);
2949 		do {
2950 		    if (!(c = fd->ctr = cram_read_container(fd)))
2951 			return NULL;
2952 		} while (c->length == 0);
2953 	    }
2954 
2955 	    if (c->ref_seq_id != -2 && c->ref_seq_id != fd->range.refid)
2956 		return NULL;
2957 	}
2958 
2959 	if (!(c->comp_hdr_block = cram_read_block(fd)))
2960 	    return NULL;
2961 	if (c->comp_hdr_block->content_type != COMPRESSION_HEADER)
2962 	    return NULL;
2963 
2964 	c->comp_hdr = cram_decode_compression_header(fd, c->comp_hdr_block);
2965 	if (!c->comp_hdr)
2966 	    return NULL;
2967 	if (!c->comp_hdr->AP_delta &&
2968 	    sam_hdr_sort_order(fd->header) != ORDER_COORD) {
2969 	    pthread_mutex_lock(&fd->ref_lock);
2970 	    fd->unsorted = 1;
2971 	    pthread_mutex_unlock(&fd->ref_lock);
2972 	}
2973     }
2974 
2975     if ((s = c->slice)) {
2976 	c->slice = NULL;
2977 	cram_free_slice(s);
2978 	s = NULL;
2979     }
2980 
2981     if (c->curr_slice == c->max_slice) {
2982 	if (fd->ctr == c)
2983 	    fd->ctr = NULL;
2984 	cram_free_container(c);
2985 	c = NULL;
2986     }
2987 
2988     /* Sorry this is so contorted! */
2989     for (;;) {
2990 	if (fd->job_pending) {
2991 	    cram_decode_job *j = (cram_decode_job *)fd->job_pending;
2992 	    c = j->c;
2993 	    s = j->s;
2994 	    free(fd->job_pending);
2995 	    fd->job_pending = NULL;
2996 	} else if (!fd->ooc) {
2997 	empty_container:
2998 	    if (!c || c->curr_slice == c->max_slice) {
2999 		// new container
3000 		do {
3001 		    if (!(c = fd->ctr = cram_read_container(fd))) {
3002 			if (fd->pool) {
3003 			    fd->ooc = 1;
3004 			    break;
3005 			}
3006 
3007 			return NULL;
3008 		    }
3009 		} while (c->length == 0);
3010 		if (fd->ooc)
3011 		    break;
3012 
3013 		/* Skip containers not yet spanning our range */
3014 		if (fd->range.refid != -2 && c->ref_seq_id != -2) {
3015 		    fd->required_fields |= SAM_POS;
3016 
3017 		    if (c->ref_seq_id != fd->range.refid) {
3018 			cram_free_container(c);
3019 			fd->ctr = NULL;
3020 			fd->ooc = 1;
3021 			fd->eof = 1;
3022 			break;
3023 		    }
3024 
3025 		    if (c->ref_seq_start > fd->range.end) {
3026 			cram_free_container(c);
3027 			fd->ctr = NULL;
3028 			fd->ooc = 1;
3029 			fd->eof = 1;
3030 			break;
3031 		    }
3032 
3033 		    if (c->ref_seq_start + c->ref_seq_span-1 <
3034 			fd->range.start) {
3035 			c->curr_rec = c->max_rec;
3036 			c->curr_slice = c->max_slice;
3037 			cram_seek(fd, c->length, SEEK_CUR);
3038 			cram_free_container(c);
3039 			c = NULL;
3040 			continue;
3041 		    }
3042 		}
3043 
3044 		if (!(c->comp_hdr_block = cram_read_block(fd)))
3045 		    return NULL;
3046 		if (c->comp_hdr_block->content_type != COMPRESSION_HEADER)
3047 		    return NULL;
3048 
3049 		c->comp_hdr =
3050 		    cram_decode_compression_header(fd, c->comp_hdr_block);
3051 		if (!c->comp_hdr)
3052 		    return NULL;
3053 
3054 		if (!c->comp_hdr->AP_delta &&
3055 		    sam_hdr_sort_order(fd->header) != ORDER_COORD) {
3056 		    pthread_mutex_lock(&fd->ref_lock);
3057 		    fd->unsorted = 1;
3058 		    pthread_mutex_unlock(&fd->ref_lock);
3059 		}
3060 	    }
3061 
3062 	    if (c->num_records == 0) {
3063 		cram_free_container(c); c = NULL;
3064 		goto empty_container;
3065 	    }
3066 
3067 
3068 	    if (!(s = c->slice = cram_read_slice(fd)))
3069 		return NULL;
3070 	    c->curr_slice++;
3071 	    c->curr_rec = 0;
3072 	    c->max_rec = s->hdr->num_records;
3073 
3074 	    s->last_apos = s->hdr->ref_seq_start;
3075 
3076 	    /* Skip slices not yet spanning our range */
3077 	    if (fd->range.refid != -2 && s->hdr->ref_seq_id != -2) {
3078 		if (s->hdr->ref_seq_id != fd->range.refid) {
3079 		    fd->eof = 1;
3080 		    cram_free_slice(s);
3081 		    c->slice = NULL;
3082 		    return NULL;
3083 		}
3084 
3085 		if (s->hdr->ref_seq_start > fd->range.end) {
3086 		    fd->eof = 1;
3087 		    cram_free_slice(s);
3088 		    c->slice = NULL;
3089 		    return NULL;
3090 		}
3091 
3092 		if (s->hdr->ref_seq_start + s->hdr->ref_seq_span-1 <
3093 		    fd->range.start) {
3094 		    cram_free_slice(s);
3095 		    c->slice = NULL;
3096 		    cram_free_container(c);
3097 		    c = NULL;
3098 		    continue;
3099 		}
3100 	    }
3101 	}
3102 
3103 	/* Test decoding of 1st seq */
3104 	if (!c || !s)
3105 	    break;
3106 
3107 	if (cram_decode_slice_mt(fd, c, s, fd->header) != 0) {
3108 	    //	if (cram_decode_slice(fd, c, s, fd->header) != 0) {
3109 	    fprintf(stderr, "Failure to decode slice\n");
3110 	    cram_free_slice(s);
3111 	    c->slice = NULL;
3112 	    return NULL;
3113 	}
3114 
3115 	if (!fd->pool || fd->job_pending)
3116 	    break;
3117 
3118 	// Push it a bit far, to qsize in queue rather than pending arrival,
3119 	// as cram tends to be a bit bursty in decode timings.
3120 	if (hts_tpool_process_len(fd->rqueue) > hts_tpool_process_qsize(fd->rqueue))
3121 	    break;
3122     }
3123 
3124     if (fd->pool) {
3125 	hts_tpool_result *res;
3126 	cram_decode_job *j;
3127 
3128 //	fprintf(stderr, "Thread pool len = %d, %d\n",
3129 //		hts_tpool_results_queue_len(fd->rqueue),
3130 //		hts_tpool_results_queue_sz(fd->rqueue));
3131 
3132 	if (fd->ooc && hts_tpool_process_empty(fd->rqueue))
3133 	    return NULL;
3134 
3135 	res = hts_tpool_next_result_wait(fd->rqueue);
3136 
3137 	if (!res || !hts_tpool_result_data(res)) {
3138 	    fprintf(stderr, "hts_tpool_next_result failure\n");
3139 	    return NULL;
3140 	}
3141 
3142 	j = (cram_decode_job *)hts_tpool_result_data(res);
3143 	c = j->c;
3144 	s = j->s;
3145 
3146 	if (j->exit_code != 0) {
3147 	    fprintf(stderr, "Slice decode failure\n");
3148 	    fd->eof = 0;
3149 	    hts_tpool_delete_result(res, 1);
3150 	    return NULL;
3151 	}
3152 
3153 	fd->ctr = c;
3154 
3155 	hts_tpool_delete_result(res, 1);
3156     }
3157 
3158     *cp = c;
3159     return s;
3160 }
3161 
3162 /*
3163  * Read the next cram record and return it.
3164  * Note that to decode cram_record the caller will need to look up some data
3165  * in the current slice, pointed to by fd->ctr->slice. This is valid until
3166  * the next call to cram_get_seq (which may invalidate it).
3167  *
3168  * Returns record pointer on success (do not free)
3169  *        NULL on failure
3170  */
cram_get_seq(cram_fd * fd)3171 cram_record *cram_get_seq(cram_fd *fd) {
3172     cram_container *c;
3173     cram_slice *s;
3174 
3175     for (;;) {
3176 	c = fd->ctr;
3177 	if (c && c->slice && c->curr_rec < c->max_rec) {
3178 	    s = c->slice;
3179 	} else {
3180 	    if (!(s = cram_next_slice(fd, &c)))
3181 		return NULL;
3182 	    continue; /* In case slice contains no records */
3183 	}
3184 
3185 	if (fd->range.refid != -2) {
3186 	    if (fd->range.refid == -1 && s->crecs[c->curr_rec].ref_id != -1) {
3187 		// Special case when looking for unmapped blocks at end.
3188 		// If these are mixed in with mapped data (c->ref_id == -2)
3189 		// then we need skip until we find the unmapped data, if at all
3190 		c->curr_rec++;
3191 		continue;
3192 	    }
3193 	    if (s->crecs[c->curr_rec].ref_id < fd->range.refid &&
3194 		s->crecs[c->curr_rec].ref_id != -1) {
3195 		// Looking for a mapped read, but not there yet.  Special case
3196 		// as -1 (unmapped) shouldn't be considered < refid.
3197 		c->curr_rec++;
3198 		continue;
3199 	    }
3200 
3201 	    if (s->crecs[c->curr_rec].ref_id != fd->range.refid) {
3202 		fd->eof = 1;
3203 		cram_free_slice(s);
3204 		c->slice = NULL;
3205 		return NULL;
3206 	    }
3207 
3208 	    if (fd->range.refid != -1 && s->crecs[c->curr_rec].apos > fd->range.end) {
3209 		fd->eof = 1;
3210 		cram_free_slice(s);
3211 		c->slice = NULL;
3212 		return NULL;
3213 	    }
3214 
3215 	    if (fd->range.refid != -1 && s->crecs[c->curr_rec].aend < fd->range.start) {
3216 		c->curr_rec++;
3217 		continue;
3218 	    }
3219 	}
3220 
3221 	break;
3222     }
3223 
3224     fd->ctr = c;
3225     c->slice = s;
3226     return &s->crecs[c->curr_rec++];
3227 }
3228 
3229 /*
3230  * Read the next cram record and convert it to a bam_seq_t struct.
3231  *
3232  * Returns 0 on success
3233  *        -1 on EOF or failure (check fd->err)
3234  */
cram_get_bam_seq(cram_fd * fd,bam_seq_t ** bam)3235 int cram_get_bam_seq(cram_fd *fd, bam_seq_t **bam) {
3236     cram_record *cr;
3237     cram_container *c;
3238     cram_slice *s;
3239 
3240     if (!(cr = cram_get_seq(fd)))
3241 	return -1;
3242 
3243     c = fd->ctr;
3244     s = c->slice;
3245 
3246     return cram_to_bam(fd->header, fd, s, cr, c->curr_rec-1, bam);
3247 }
3248