1 /*
2 Copyright (c) 2012-2014 Genome Research Ltd.
3 Author: James Bonfield <jkb@sanger.ac.uk>
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7
8 1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
10
11 2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
14
15 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
18
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31 /*
32 * - In-memory decoding of CRAM data structures.
33 * - Iterator for reading CRAM record by record.
34 */
35
36 #include <config.h>
37
38 #include <stdio.h>
39 #include <errno.h>
40 #include <assert.h>
41 #include <stdlib.h>
42 #include <string.h>
43 #include <zlib.h>
44 #include <sys/types.h>
45 #include <sys/stat.h>
46 #include <math.h>
47 #include <stdint.h>
48
49 #include "cram/cram.h"
50 #include "cram/os.h"
51 #include "htslib/hts.h"
52
53 //Whether CIGAR has just M or uses = and X to indicate match and mismatch
54 //#define USE_X
55
56 /* ----------------------------------------------------------------------
57 * CRAM compression headers
58 */
59
60 /*
61 * Decodes the Tag Dictionary record in the preservation map
62 * Updates the cram compression header.
63 *
64 * Returns number of bytes decoded on success
65 * -1 on failure
66 */
cram_decode_TD(char * cp,const char * endp,cram_block_compression_hdr * h)67 int cram_decode_TD(char *cp, const char *endp, cram_block_compression_hdr *h) {
68 char *op = cp;
69 unsigned char *dat;
70 cram_block *b;
71 int32_t blk_size = 0;
72 int nTL, i, sz;
73
74 if (!(b = cram_new_block(0, 0)))
75 return -1;
76
77 /* Decode */
78 cp += safe_itf8_get(cp, endp, &blk_size);
79 if (!blk_size) {
80 h->nTL = 0;
81 h->TL = NULL;
82 cram_free_block(b);
83 return cp - op;
84 }
85
86 if (blk_size < 0 || endp - cp < blk_size) {
87 cram_free_block(b);
88 return -1;
89 }
90
91 BLOCK_APPEND(b, cp, blk_size);
92 cp += blk_size;
93 sz = cp - op;
94 // Force nul termination if missing
95 if (BLOCK_DATA(b)[BLOCK_SIZE(b)-1])
96 BLOCK_APPEND_CHAR(b, '\0');
97
98 /* Set up TL lookup table */
99 dat = BLOCK_DATA(b);
100
101 // Count
102 for (nTL = i = 0; i < BLOCK_SIZE(b); i++) {
103 nTL++;
104 while (dat[i])
105 i++;
106 }
107
108 // Copy
109 h->nTL = nTL;
110 if (!(h->TL = calloc(h->nTL, sizeof(unsigned char *)))) {
111 cram_free_block(b);
112 return -1;
113 }
114 for (nTL = i = 0; i < BLOCK_SIZE(b); i++) {
115 h->TL[nTL++] = &dat[i];
116 while (dat[i])
117 i++;
118 }
119 h->TD_blk = b;
120
121 return sz;
122 }
123
124 /*
125 * Decodes a CRAM block compression header.
126 * Returns header ptr on success
127 * NULL on failure
128 */
cram_decode_compression_header(cram_fd * fd,cram_block * b)129 cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd,
130 cram_block *b) {
131 char *cp, *endp, *cp_copy;
132 cram_block_compression_hdr *hdr = calloc(1, sizeof(*hdr));
133 int i;
134 int32_t map_size = 0, map_count = 0;
135
136 if (!hdr)
137 return NULL;
138
139 if (b->method != RAW) {
140 if (cram_uncompress_block(b)) {
141 free(hdr);
142 return NULL;
143 }
144 }
145
146 cp = (char *)b->data;
147 endp = cp + b->uncomp_size;
148
149 if (CRAM_MAJOR_VERS(fd->version) == 1) {
150 cp += safe_itf8_get(cp, endp, &hdr->ref_seq_id);
151 cp += safe_itf8_get(cp, endp, &hdr->ref_seq_start);
152 cp += safe_itf8_get(cp, endp, &hdr->ref_seq_span);
153 cp += safe_itf8_get(cp, endp, &hdr->num_records);
154 cp += safe_itf8_get(cp, endp, &hdr->num_landmarks);
155 if ((hdr->num_landmarks < 0 ||
156 hdr->num_landmarks >= SIZE_MAX / sizeof(int32_t))) {
157 free(hdr);
158 return NULL;
159 }
160 if (!(hdr->landmark = malloc(hdr->num_landmarks * sizeof(int32_t)))) {
161 free(hdr);
162 return NULL;
163 }
164 for (i = 0; i < hdr->num_landmarks; i++) {
165 cp += safe_itf8_get(cp, endp, &hdr->landmark[i]);
166 }
167 }
168
169 hdr->preservation_map = kh_init(map);
170
171 memset(hdr->rec_encoding_map, 0,
172 CRAM_MAP_HASH * sizeof(hdr->rec_encoding_map[0]));
173 memset(hdr->tag_encoding_map, 0,
174 CRAM_MAP_HASH * sizeof(hdr->tag_encoding_map[0]));
175
176 if (!hdr->preservation_map) {
177 cram_free_compression_header(hdr);
178 return NULL;
179 }
180
181 /* Initialise defaults for preservation map */
182 hdr->mapped_qs_included = 0;
183 hdr->unmapped_qs_included = 0;
184 hdr->unmapped_placed = 0;
185 hdr->qs_included = 0;
186 hdr->read_names_included = 0;
187 hdr->AP_delta = 1;
188 memcpy(hdr->substitution_matrix, "CGTNAGTNACTNACGNACGT", 20);
189
190 /* Preservation map */
191 cp += safe_itf8_get(cp, endp, &map_size); cp_copy = cp;
192 cp += safe_itf8_get(cp, endp, &map_count);
193 for (i = 0; i < map_count; i++) {
194 pmap_t hd;
195 khint_t k;
196 int r;
197
198 if (endp - cp < 3) {
199 cram_free_compression_header(hdr);
200 return NULL;
201 }
202 cp += 2;
203 switch(CRAM_KEY(cp[-2],cp[-1])) {
204 case CRAM_KEY('M','I'):
205 hd.i = *cp++;
206 k = kh_put(map, hdr->preservation_map, "MI", &r);
207 if (-1 == r) {
208 cram_free_compression_header(hdr);
209 return NULL;
210 }
211
212 kh_val(hdr->preservation_map, k) = hd;
213 hdr->mapped_qs_included = hd.i;
214 break;
215
216 case CRAM_KEY('U','I'):
217 hd.i = *cp++;
218 k = kh_put(map, hdr->preservation_map, "UI", &r);
219 if (-1 == r) {
220 cram_free_compression_header(hdr);
221 return NULL;
222 }
223
224 kh_val(hdr->preservation_map, k) = hd;
225 hdr->unmapped_qs_included = hd.i;
226 break;
227
228 case CRAM_KEY('P','I'):
229 hd.i = *cp++;
230 k = kh_put(map, hdr->preservation_map, "PI", &r);
231 if (-1 == r) {
232 cram_free_compression_header(hdr);
233 return NULL;
234 }
235
236 kh_val(hdr->preservation_map, k) = hd;
237 hdr->unmapped_placed = hd.i;
238 break;
239
240 case CRAM_KEY('R','N'):
241 hd.i = *cp++;
242 k = kh_put(map, hdr->preservation_map, "RN", &r);
243 if (-1 == r) {
244 cram_free_compression_header(hdr);
245 return NULL;
246 }
247
248 kh_val(hdr->preservation_map, k) = hd;
249 hdr->read_names_included = hd.i;
250 break;
251
252 case CRAM_KEY('A','P'):
253 hd.i = *cp++;
254 k = kh_put(map, hdr->preservation_map, "AP", &r);
255 if (-1 == r) {
256 cram_free_compression_header(hdr);
257 return NULL;
258 }
259
260 kh_val(hdr->preservation_map, k) = hd;
261 hdr->AP_delta = hd.i;
262 break;
263
264 case CRAM_KEY('R','R'):
265 hd.i = *cp++;
266 k = kh_put(map, hdr->preservation_map, "RR", &r);
267 if (-1 == r) {
268 cram_free_compression_header(hdr);
269 return NULL;
270 }
271
272 kh_val(hdr->preservation_map, k) = hd;
273 fd->no_ref = !hd.i;
274 break;
275
276 case CRAM_KEY('S','M'):
277 if (endp - cp < 5) {
278 cram_free_compression_header(hdr);
279 return NULL;
280 }
281 hdr->substitution_matrix[0][(cp[0]>>6)&3] = 'C';
282 hdr->substitution_matrix[0][(cp[0]>>4)&3] = 'G';
283 hdr->substitution_matrix[0][(cp[0]>>2)&3] = 'T';
284 hdr->substitution_matrix[0][(cp[0]>>0)&3] = 'N';
285
286 hdr->substitution_matrix[1][(cp[1]>>6)&3] = 'A';
287 hdr->substitution_matrix[1][(cp[1]>>4)&3] = 'G';
288 hdr->substitution_matrix[1][(cp[1]>>2)&3] = 'T';
289 hdr->substitution_matrix[1][(cp[1]>>0)&3] = 'N';
290
291 hdr->substitution_matrix[2][(cp[2]>>6)&3] = 'A';
292 hdr->substitution_matrix[2][(cp[2]>>4)&3] = 'C';
293 hdr->substitution_matrix[2][(cp[2]>>2)&3] = 'T';
294 hdr->substitution_matrix[2][(cp[2]>>0)&3] = 'N';
295
296 hdr->substitution_matrix[3][(cp[3]>>6)&3] = 'A';
297 hdr->substitution_matrix[3][(cp[3]>>4)&3] = 'C';
298 hdr->substitution_matrix[3][(cp[3]>>2)&3] = 'G';
299 hdr->substitution_matrix[3][(cp[3]>>0)&3] = 'N';
300
301 hdr->substitution_matrix[4][(cp[4]>>6)&3] = 'A';
302 hdr->substitution_matrix[4][(cp[4]>>4)&3] = 'C';
303 hdr->substitution_matrix[4][(cp[4]>>2)&3] = 'G';
304 hdr->substitution_matrix[4][(cp[4]>>0)&3] = 'T';
305
306 hd.p = cp;
307 cp += 5;
308
309 k = kh_put(map, hdr->preservation_map, "SM", &r);
310 if (-1 == r) {
311 cram_free_compression_header(hdr);
312 return NULL;
313 }
314 kh_val(hdr->preservation_map, k) = hd;
315 break;
316
317 case CRAM_KEY('T','D'): {
318 int sz = cram_decode_TD(cp, endp, hdr); // tag dictionary
319 if (sz < 0) {
320 cram_free_compression_header(hdr);
321 return NULL;
322 }
323
324 hd.p = cp;
325 cp += sz;
326
327 k = kh_put(map, hdr->preservation_map, "TD", &r);
328 if (-1 == r) {
329 cram_free_compression_header(hdr);
330 return NULL;
331 }
332 kh_val(hdr->preservation_map, k) = hd;
333 break;
334 }
335
336 default:
337 fprintf(stderr, "Unrecognised preservation map key %c%c\n",
338 cp[-2], cp[-1]);
339 // guess byte;
340 cp++;
341 break;
342 }
343 }
344 if (cp - cp_copy != map_size) {
345 cram_free_compression_header(hdr);
346 return NULL;
347 }
348
349 /* Record encoding map */
350 cp += safe_itf8_get(cp, endp, &map_size); cp_copy = cp;
351 cp += safe_itf8_get(cp, endp, &map_count);
352 for (i = 0; i < map_count; i++) {
353 char *key = cp;
354 int32_t encoding = E_NULL;
355 int32_t size = 0;
356 cram_map *m = malloc(sizeof(*m)); // FIXME: use pooled_alloc
357
358 if (!m || endp - cp < 4) {
359 free(m);
360 cram_free_compression_header(hdr);
361 return NULL;
362 }
363
364 cp += 2;
365 cp += safe_itf8_get(cp, endp, &encoding);
366 cp += safe_itf8_get(cp, endp, &size);
367
368 // Fill out cram_map purely for cram_dump to dump out.
369 m->key = (key[0]<<8)|key[1];
370 m->encoding = encoding;
371 m->size = size;
372 m->offset = cp - (char *)b->data;
373 m->codec = NULL;
374
375 if (m->encoding == E_NULL)
376 continue;
377
378 if (size < 0 || endp - cp < size) {
379 free(m);
380 cram_free_compression_header(hdr);
381 return NULL;
382 }
383
384 //printf("%s codes for %.2s\n", cram_encoding2str(encoding), key);
385
386 /*
387 * For CRAM1.0 CF and BF are Byte and not Int.
388 * Practically speaking it makes no difference unless we have a
389 * 1.0 format file that stores these in EXTERNAL as only then
390 * does Byte vs Int matter.
391 *
392 * Neither this C code nor Java reference implementations did this,
393 * so we gloss over it and treat them as int.
394 */
395
396 if (key[0] == 'B' && key[1] == 'F') {
397 if (!(hdr->codecs[DS_BF] = cram_decoder_init(encoding, cp, size,
398 E_INT,
399 fd->version))) {
400 cram_free_compression_header(hdr);
401 return NULL;
402 }
403 } else if (key[0] == 'C' && key[1] == 'F') {
404 if (!(hdr->codecs[DS_CF] = cram_decoder_init(encoding, cp, size,
405 E_INT,
406 fd->version))) {
407 cram_free_compression_header(hdr);
408 return NULL;
409 }
410 } else if (key[0] == 'R' && key[1] == 'I') {
411 if (!(hdr->codecs[DS_RI] = cram_decoder_init(encoding, cp, size,
412 E_INT,
413 fd->version))) {
414 cram_free_compression_header(hdr);
415 return NULL;
416 }
417 } else if (key[0] == 'R' && key[1] == 'L') {
418 if (!(hdr->codecs[DS_RL] = cram_decoder_init(encoding, cp, size,
419 E_INT,
420 fd->version))) {
421 cram_free_compression_header(hdr);
422 return NULL;
423 }
424 } else if (key[0] == 'A' && key[1] == 'P') {
425 if (!(hdr->codecs[DS_AP] = cram_decoder_init(encoding, cp, size,
426 E_INT,
427 fd->version))) {
428 cram_free_compression_header(hdr);
429 return NULL;
430 }
431 } else if (key[0] == 'R' && key[1] == 'G') {
432 if (!(hdr->codecs[DS_RG] = cram_decoder_init(encoding, cp, size,
433 E_INT,
434 fd->version))) {
435 cram_free_compression_header(hdr);
436 return NULL;
437 }
438 } else if (key[0] == 'M' && key[1] == 'F') {
439 if (!(hdr->codecs[DS_MF] = cram_decoder_init(encoding, cp, size,
440 E_INT,
441 fd->version))) {
442 cram_free_compression_header(hdr);
443 return NULL;
444 }
445 } else if (key[0] == 'N' && key[1] == 'S') {
446 if (!(hdr->codecs[DS_NS] = cram_decoder_init(encoding, cp, size,
447 E_INT,
448 fd->version))) {
449 cram_free_compression_header(hdr);
450 return NULL;
451 }
452 } else if (key[0] == 'N' && key[1] == 'P') {
453 if (!(hdr->codecs[DS_NP] = cram_decoder_init(encoding, cp, size,
454 E_INT,
455 fd->version))) {
456 cram_free_compression_header(hdr);
457 return NULL;
458 }
459 } else if (key[0] == 'T' && key[1] == 'S') {
460 if (!(hdr->codecs[DS_TS] = cram_decoder_init(encoding, cp, size,
461 E_INT,
462 fd->version))) {
463 cram_free_compression_header(hdr);
464 return NULL;
465 }
466 } else if (key[0] == 'N' && key[1] == 'F') {
467 if (!(hdr->codecs[DS_NF] = cram_decoder_init(encoding, cp, size,
468 E_INT,
469 fd->version))) {
470 cram_free_compression_header(hdr);
471 return NULL;
472 }
473 } else if (key[0] == 'T' && key[1] == 'C') {
474 if (!(hdr->codecs[DS_TC] = cram_decoder_init(encoding, cp, size,
475 E_BYTE,
476 fd->version))) {
477 cram_free_compression_header(hdr);
478 return NULL;
479 }
480 } else if (key[0] == 'T' && key[1] == 'N') {
481 if (!(hdr->codecs[DS_TN] = cram_decoder_init(encoding, cp, size,
482 E_INT,
483 fd->version))) {
484 cram_free_compression_header(hdr);
485 return NULL;
486 }
487 } else if (key[0] == 'F' && key[1] == 'N') {
488 if (!(hdr->codecs[DS_FN] = cram_decoder_init(encoding, cp, size,
489 E_INT,
490 fd->version))) {
491 cram_free_compression_header(hdr);
492 return NULL;
493 }
494 } else if (key[0] == 'F' && key[1] == 'C') {
495 if (!(hdr->codecs[DS_FC] = cram_decoder_init(encoding, cp, size,
496 E_BYTE,
497 fd->version))) {
498 cram_free_compression_header(hdr);
499 return NULL;
500 }
501 } else if (key[0] == 'F' && key[1] == 'P') {
502 if (!(hdr->codecs[DS_FP] = cram_decoder_init(encoding, cp, size,
503 E_INT,
504 fd->version))) {
505 cram_free_compression_header(hdr);
506 return NULL;
507 }
508 } else if (key[0] == 'B' && key[1] == 'S') {
509 if (!(hdr->codecs[DS_BS] = cram_decoder_init(encoding, cp, size,
510 E_BYTE,
511 fd->version))) {
512 cram_free_compression_header(hdr);
513 return NULL;
514 }
515 } else if (key[0] == 'I' && key[1] == 'N') {
516 if (!(hdr->codecs[DS_IN] = cram_decoder_init(encoding, cp, size,
517 E_BYTE_ARRAY,
518 fd->version))) {
519 cram_free_compression_header(hdr);
520 return NULL;
521 }
522 } else if (key[0] == 'S' && key[1] == 'C') {
523 if (!(hdr->codecs[DS_SC] = cram_decoder_init(encoding, cp, size,
524 E_BYTE_ARRAY,
525 fd->version))) {
526 cram_free_compression_header(hdr);
527 return NULL;
528 }
529 } else if (key[0] == 'D' && key[1] == 'L') {
530 if (!(hdr->codecs[DS_DL] = cram_decoder_init(encoding, cp, size,
531 E_INT,
532 fd->version))) {
533 cram_free_compression_header(hdr);
534 return NULL;
535 }
536 } else if (key[0] == 'B' && key[1] == 'A') {
537 if (!(hdr->codecs[DS_BA] = cram_decoder_init(encoding, cp, size,
538 E_BYTE,
539 fd->version))) {
540 cram_free_compression_header(hdr);
541 return NULL;
542 }
543 } else if (key[0] == 'B' && key[1] == 'B') {
544 if (!(hdr->codecs[DS_BB] = cram_decoder_init(encoding, cp, size,
545 E_BYTE_ARRAY,
546 fd->version))) {
547 cram_free_compression_header(hdr);
548 return NULL;
549 }
550 } else if (key[0] == 'R' && key[1] == 'S') {
551 if (!(hdr->codecs[DS_RS] = cram_decoder_init(encoding, cp, size,
552 E_INT,
553 fd->version))) {
554 cram_free_compression_header(hdr);
555 return NULL;
556 }
557 } else if (key[0] == 'P' && key[1] == 'D') {
558 if (!(hdr->codecs[DS_PD] = cram_decoder_init(encoding, cp, size,
559 E_INT,
560 fd->version))) {
561 cram_free_compression_header(hdr);
562 return NULL;
563 }
564 } else if (key[0] == 'H' && key[1] == 'C') {
565 if (!(hdr->codecs[DS_HC] = cram_decoder_init(encoding, cp, size,
566 E_INT,
567 fd->version))) {
568 cram_free_compression_header(hdr);
569 return NULL;
570 }
571 } else if (key[0] == 'M' && key[1] == 'Q') {
572 if (!(hdr->codecs[DS_MQ] = cram_decoder_init(encoding, cp, size,
573 E_INT,
574 fd->version))) {
575 cram_free_compression_header(hdr);
576 return NULL;
577 }
578 } else if (key[0] == 'R' && key[1] == 'N') {
579 if (!(hdr->codecs[DS_RN] = cram_decoder_init(encoding, cp, size,
580 E_BYTE_ARRAY_BLOCK,
581 fd->version))) {
582 cram_free_compression_header(hdr);
583 return NULL;
584 }
585 } else if (key[0] == 'Q' && key[1] == 'S') {
586 if (!(hdr->codecs[DS_QS] = cram_decoder_init(encoding, cp, size,
587 E_BYTE,
588 fd->version))) {
589 cram_free_compression_header(hdr);
590 return NULL;
591 }
592 } else if (key[0] == 'Q' && key[1] == 'Q') {
593 if (!(hdr->codecs[DS_QQ] = cram_decoder_init(encoding, cp, size,
594 E_BYTE_ARRAY,
595 fd->version))) {
596 cram_free_compression_header(hdr);
597 return NULL;
598 }
599 } else if (key[0] == 'T' && key[1] == 'L') {
600 if (!(hdr->codecs[DS_TL] = cram_decoder_init(encoding, cp, size,
601 E_INT,
602 fd->version))) {
603 cram_free_compression_header(hdr);
604 return NULL;
605 }
606 } else if (key[0] == 'T' && key[1] == 'M') {
607 } else if (key[0] == 'T' && key[1] == 'V') {
608 } else
609 fprintf(stderr, "Unrecognised key: %.2s\n", key);
610
611 cp += size;
612
613 m->next = hdr->rec_encoding_map[CRAM_MAP(key[0], key[1])];
614 hdr->rec_encoding_map[CRAM_MAP(key[0], key[1])] = m;
615 }
616 if (cp - cp_copy != map_size) {
617 cram_free_compression_header(hdr);
618 return NULL;
619 }
620
621 /* Tag encoding map */
622 cp += safe_itf8_get(cp, endp, &map_size); cp_copy = cp;
623 cp += safe_itf8_get(cp, endp, &map_count);
624 for (i = 0; i < map_count; i++) {
625 int32_t encoding = E_NULL;
626 int32_t size = 0;
627 cram_map *m = malloc(sizeof(*m)); // FIXME: use pooled_alloc
628 char *key;
629
630 if (!m || endp - cp < 6) {
631 free(m);
632 cram_free_compression_header(hdr);
633 return NULL;
634 }
635
636 key = cp + 1;
637 m->key = (key[0]<<16)|(key[1]<<8)|key[2];
638
639 cp += 4; // Strictly ITF8, but this suffices
640 cp += safe_itf8_get(cp, endp, &encoding);
641 cp += safe_itf8_get(cp, endp, &size);
642
643 m->encoding = encoding;
644 m->size = size;
645 m->offset = cp - (char *)b->data;
646 if (size < 0 || endp - cp < size ||
647 !(m->codec = cram_decoder_init(encoding, cp, size,
648 E_BYTE_ARRAY_BLOCK, fd->version))) {
649 cram_free_compression_header(hdr);
650 free(m);
651 return NULL;
652 }
653
654 cp += size;
655
656 m->next = hdr->tag_encoding_map[CRAM_MAP(key[0],key[1])];
657 hdr->tag_encoding_map[CRAM_MAP(key[0],key[1])] = m;
658 }
659 if (cp - cp_copy != map_size) {
660 cram_free_compression_header(hdr);
661 return NULL;
662 }
663
664 return hdr;
665 }
666
667 /*
668 * Note we also need to scan through the record encoding map to
669 * see which data series share the same block, either external or
670 * CORE. For example if we need the BF data series but MQ and CF
671 * are also encoded in the same block then we need to add those in
672 * as a dependency in order to correctly decode BF.
673 *
674 * Returns 0 on success
675 * -1 on failure
676 */
cram_dependent_data_series(cram_fd * fd,cram_block_compression_hdr * hdr,cram_slice * s)677 int cram_dependent_data_series(cram_fd *fd,
678 cram_block_compression_hdr *hdr,
679 cram_slice *s) {
680 int *block_used;
681 int core_used = 0;
682 int i;
683 static int i_to_id[] = {
684 DS_BF, DS_AP, DS_FP, DS_RL, DS_DL, DS_NF, DS_BA, DS_QS,
685 DS_FC, DS_FN, DS_BS, DS_IN, DS_RG, DS_MQ, DS_TL, DS_RN,
686 DS_NS, DS_NP, DS_TS, DS_MF, DS_CF, DS_RI, DS_RS, DS_PD,
687 DS_HC, DS_SC, DS_BB, DS_QQ,
688 };
689 uint32_t orig_ds;
690
691 /*
692 * Set the data_series bit field based on fd->required_fields
693 * contents.
694 */
695 if (fd->required_fields && fd->required_fields != INT_MAX) {
696 hdr->data_series = 0;
697
698 if (fd->required_fields & SAM_QNAME)
699 hdr->data_series |= CRAM_RN;
700
701 if (fd->required_fields & SAM_FLAG)
702 hdr->data_series |= CRAM_BF;
703
704 if (fd->required_fields & SAM_RNAME)
705 hdr->data_series |= CRAM_RI | CRAM_BF;
706
707 if (fd->required_fields & SAM_POS)
708 hdr->data_series |= CRAM_AP | CRAM_BF;
709
710 if (fd->required_fields & SAM_MAPQ)
711 hdr->data_series |= CRAM_MQ;
712
713 if (fd->required_fields & SAM_CIGAR)
714 hdr->data_series |= CRAM_CIGAR;
715
716 if (fd->required_fields & SAM_RNEXT)
717 hdr->data_series |= CRAM_CF | CRAM_NF | CRAM_RI | CRAM_NS |CRAM_BF;
718
719 if (fd->required_fields & SAM_PNEXT)
720 hdr->data_series |= CRAM_CF | CRAM_NF | CRAM_AP | CRAM_NP | CRAM_BF;
721
722 if (fd->required_fields & SAM_TLEN)
723 hdr->data_series |= CRAM_CF | CRAM_NF | CRAM_AP | CRAM_TS |
724 CRAM_BF | CRAM_MF | CRAM_RI | CRAM_CIGAR;
725
726 if (fd->required_fields & SAM_SEQ)
727 hdr->data_series |= CRAM_SEQ;
728
729 if (!(fd->required_fields & SAM_AUX))
730 // No easy way to get MD/NM without other tags at present
731 fd->decode_md = 0;
732
733 if (fd->required_fields & SAM_QUAL)
734 hdr->data_series |= CRAM_QUAL;
735
736 if (fd->required_fields & SAM_AUX)
737 hdr->data_series |= CRAM_RG | CRAM_TL | CRAM_aux;
738
739 if (fd->required_fields & SAM_RGAUX)
740 hdr->data_series |= CRAM_RG | CRAM_BF;
741
742 // Always uncompress CORE block
743 if (cram_uncompress_block(s->block[0]))
744 return -1;
745 } else {
746 hdr->data_series = CRAM_ALL;
747
748 for (i = 0; i < s->hdr->num_blocks; i++) {
749 if (cram_uncompress_block(s->block[i]))
750 return -1;
751 }
752
753 return 0;
754 }
755
756 block_used = calloc(s->hdr->num_blocks+1, sizeof(int));
757 if (!block_used)
758 return -1;
759
760 do {
761 /*
762 * Also set data_series based on code prerequisites. Eg if we need
763 * CRAM_QS then we also need to know CRAM_RL so we know how long it
764 * is, or if we need FC/FP then we also need FN (number of features).
765 *
766 * It's not reciprocal though. We may be needing to decode FN
767 * but have no need to decode FC, FP and cigar ops.
768 */
769 if (hdr->data_series & CRAM_RS) hdr->data_series |= CRAM_FC|CRAM_FP;
770 if (hdr->data_series & CRAM_PD) hdr->data_series |= CRAM_FC|CRAM_FP;
771 if (hdr->data_series & CRAM_HC) hdr->data_series |= CRAM_FC|CRAM_FP;
772 if (hdr->data_series & CRAM_QS) hdr->data_series |= CRAM_FC|CRAM_FP;
773 if (hdr->data_series & CRAM_IN) hdr->data_series |= CRAM_FC|CRAM_FP;
774 if (hdr->data_series & CRAM_SC) hdr->data_series |= CRAM_FC|CRAM_FP;
775 if (hdr->data_series & CRAM_BS) hdr->data_series |= CRAM_FC|CRAM_FP;
776 if (hdr->data_series & CRAM_DL) hdr->data_series |= CRAM_FC|CRAM_FP;
777 if (hdr->data_series & CRAM_BA) hdr->data_series |= CRAM_FC|CRAM_FP;
778 if (hdr->data_series & CRAM_BB) hdr->data_series |= CRAM_FC|CRAM_FP;
779 if (hdr->data_series & CRAM_QQ) hdr->data_series |= CRAM_FC|CRAM_FP;
780
781 // cram_decode_seq() needs seq[] array
782 if (hdr->data_series & (CRAM_SEQ|CRAM_CIGAR)) hdr->data_series |= CRAM_RL;
783
784 if (hdr->data_series & CRAM_FP) hdr->data_series |= CRAM_FC;
785 if (hdr->data_series & CRAM_FC) hdr->data_series |= CRAM_FN;
786 if (hdr->data_series & CRAM_aux) hdr->data_series |= CRAM_TL;
787 if (hdr->data_series & CRAM_MF) hdr->data_series |= CRAM_CF;
788 if (hdr->data_series & CRAM_MQ) hdr->data_series |= CRAM_BF;
789 if (hdr->data_series & CRAM_BS) hdr->data_series |= CRAM_RI;
790 if (hdr->data_series & (CRAM_MF |CRAM_NS |CRAM_NP |CRAM_TS |CRAM_NF))
791 hdr->data_series |= CRAM_CF;
792 if (!hdr->read_names_included && hdr->data_series & CRAM_RN)
793 hdr->data_series |= CRAM_CF | CRAM_NF;
794 if (hdr->data_series & (CRAM_BA | CRAM_QS | CRAM_BB | CRAM_QQ))
795 hdr->data_series |= CRAM_BF | CRAM_CF | CRAM_RL;
796
797 orig_ds = hdr->data_series;
798
799 // Find which blocks are in use.
800 for (i = 0; i < sizeof(i_to_id)/sizeof(*i_to_id); i++) {
801 int bnum1, bnum2, j;
802 cram_codec *c = hdr->codecs[i_to_id[i]];
803
804 if (!(hdr->data_series & (1<<i)))
805 continue;
806
807 if (!c)
808 continue;
809
810 bnum1 = cram_codec_to_id(c, &bnum2);
811
812 for (;;) {
813 switch (bnum1) {
814 case -2:
815 break;
816
817 case -1:
818 core_used = 1;
819 break;
820
821 default:
822 for (j = 0; j < s->hdr->num_blocks; j++) {
823 if (s->block[j]->content_type == EXTERNAL &&
824 s->block[j]->content_id == bnum1) {
825 block_used[j] = 1;
826 if (cram_uncompress_block(s->block[j])) {
827 free(block_used);
828 return -1;
829 }
830 }
831 }
832 break;
833 }
834
835 if (bnum2 == -2 || bnum1 == bnum2)
836 break;
837
838 bnum1 = bnum2; // 2nd pass
839 }
840 }
841
842 // Tags too
843 if ((fd->required_fields & SAM_AUX) ||
844 (hdr->data_series & CRAM_aux)) {
845 for (i = 0; i < CRAM_MAP_HASH; i++) {
846 int bnum1, bnum2, j;
847 cram_map *m = hdr->tag_encoding_map[i];
848
849 while (m) {
850 cram_codec *c = m->codec;
851 if (!c) {
852 m = m->next;
853 continue;
854 }
855
856 bnum1 = cram_codec_to_id(c, &bnum2);
857
858 for (;;) {
859 switch (bnum1) {
860 case -2:
861 break;
862
863 case -1:
864 core_used = 1;
865 break;
866
867 default:
868 for (j = 0; j < s->hdr->num_blocks; j++) {
869 if (s->block[j]->content_type == EXTERNAL &&
870 s->block[j]->content_id == bnum1) {
871 block_used[j] = 1;
872 if (cram_uncompress_block(s->block[j])) {
873 free(block_used);
874 return -1;
875 }
876 }
877 }
878 break;
879 }
880
881 if (bnum2 == -2 || bnum1 == bnum2)
882 break;
883
884 bnum1 = bnum2; // 2nd pass
885 }
886
887 m = m->next;
888 }
889 }
890 }
891
892 // We now know which blocks are in used, so repeat and find
893 // which other data series need to be added.
894 for (i = 0; i < sizeof(i_to_id)/sizeof(*i_to_id); i++) {
895 int bnum1, bnum2, j;
896 cram_codec *c = hdr->codecs[i_to_id[i]];
897
898 if (!c)
899 continue;
900
901 bnum1 = cram_codec_to_id(c, &bnum2);
902
903 for (;;) {
904 switch (bnum1) {
905 case -2:
906 break;
907
908 case -1:
909 if (core_used) {
910 //printf(" + data series %08x:\n", 1<<i);
911 hdr->data_series |= 1<<i;
912 }
913 break;
914
915 default:
916 for (j = 0; j < s->hdr->num_blocks; j++) {
917 if (s->block[j]->content_type == EXTERNAL &&
918 s->block[j]->content_id == bnum1) {
919 if (block_used[j]) {
920 //printf(" + data series %08x:\n", 1<<i);
921 hdr->data_series |= 1<<i;
922 }
923 }
924 }
925 break;
926 }
927
928 if (bnum2 == -2 || bnum1 == bnum2)
929 break;
930
931 bnum1 = bnum2; // 2nd pass
932 }
933 }
934
935 // Tags too
936 for (i = 0; i < CRAM_MAP_HASH; i++) {
937 int bnum1, bnum2, j;
938 cram_map *m = hdr->tag_encoding_map[i];
939
940 while (m) {
941 cram_codec *c = m->codec;
942 if (!c) {
943 m = m->next;
944 continue;
945 }
946
947 bnum1 = cram_codec_to_id(c, &bnum2);
948
949 for (;;) {
950 switch (bnum1) {
951 case -2:
952 break;
953
954 case -1:
955 //printf(" + data series %08x:\n", CRAM_aux);
956 hdr->data_series |= CRAM_aux;
957 break;
958
959 default:
960 for (j = 0; j < s->hdr->num_blocks; j++) {
961 if (s->block[j]->content_type == EXTERNAL &&
962 s->block[j]->content_id == bnum1) {
963 if (block_used[j]) {
964 //printf(" + data series %08x:\n",
965 // CRAM_aux);
966 hdr->data_series |= CRAM_aux;
967 }
968 }
969 }
970 break;
971 }
972
973 if (bnum2 == -2 || bnum1 == bnum2)
974 break;
975
976 bnum1 = bnum2; // 2nd pass
977 }
978
979 m = m->next;
980 }
981 }
982 } while (orig_ds != hdr->data_series);
983
984 free(block_used);
985 return 0;
986 }
987
988 /*
989 * Checks whether an external block is used solely by a single data series.
990 * Returns the codec type if so (EXTERNAL, BYTE_ARRAY_LEN, BYTE_ARRAY_STOP)
991 * or 0 if not (E_NULL).
992 */
cram_ds_unique(cram_block_compression_hdr * hdr,cram_codec * c,int id)993 static int cram_ds_unique(cram_block_compression_hdr *hdr, cram_codec *c,
994 int id) {
995 int i, n_id = 0;
996 enum cram_encoding e_type = 0;
997
998 for (i = 0; i < DS_END; i++) {
999 cram_codec *c;
1000 int bnum1, bnum2, old_n_id;
1001
1002 if (!(c = hdr->codecs[i]))
1003 continue;
1004
1005 bnum1 = cram_codec_to_id(c, &bnum2);
1006
1007 old_n_id = n_id;
1008 if (bnum1 == id) {
1009 n_id++;
1010 e_type = c->codec;
1011 }
1012 if (bnum2 == id) {
1013 n_id++;
1014 e_type = c->codec;
1015 }
1016
1017 if (n_id == old_n_id+2)
1018 n_id--; // len/val in same place counts once only.
1019 }
1020
1021 return n_id == 1 ? e_type : 0;
1022 }
1023
1024 /*
1025 * Attempts to estimate the size of some blocks so we can preallocate them
1026 * before decoding. Although decoding will automatically grow the blocks,
1027 * it is typically more efficient to preallocate.
1028 */
cram_decode_estimate_sizes(cram_block_compression_hdr * hdr,cram_slice * s,int * qual_size,int * name_size,int * q_id)1029 void cram_decode_estimate_sizes(cram_block_compression_hdr *hdr, cram_slice *s,
1030 int *qual_size, int *name_size,
1031 int *q_id) {
1032 int bnum1, bnum2;
1033 cram_codec *cd;
1034
1035 *qual_size = 0;
1036 *name_size = 0;
1037
1038 /* Qual */
1039 cd = hdr->codecs[DS_QS];
1040 if (cd == NULL) return;
1041 bnum1 = cram_codec_to_id(cd, &bnum2);
1042 if (bnum1 < 0 && bnum2 >= 0) bnum1 = bnum2;
1043 if (cram_ds_unique(hdr, cd, bnum1)) {
1044 cram_block *b = cram_get_block_by_id(s, bnum1);
1045 if (b) *qual_size = b->uncomp_size;
1046 if (q_id && cd->codec == E_EXTERNAL)
1047 *q_id = bnum1;
1048 }
1049
1050 /* Name */
1051 cd = hdr->codecs[DS_RN];
1052 if (cd == NULL) return;
1053 bnum1 = cram_codec_to_id(cd, &bnum2);
1054 if (bnum1 < 0 && bnum2 >= 0) bnum1 = bnum2;
1055 if (cram_ds_unique(hdr, cd, bnum1)) {
1056 cram_block *b = cram_get_block_by_id(s, bnum1);
1057 if (b) *name_size = b->uncomp_size;
1058 }
1059 }
1060
1061
1062 /* ----------------------------------------------------------------------
1063 * CRAM slices
1064 */
1065
1066 /*
1067 * Decodes a CRAM (un)mapped slice header block.
1068 * Returns slice header ptr on success
1069 * NULL on failure
1070 */
cram_decode_slice_header(cram_fd * fd,cram_block * b)1071 cram_block_slice_hdr *cram_decode_slice_header(cram_fd *fd, cram_block *b) {
1072 cram_block_slice_hdr *hdr;
1073 unsigned char *cp;
1074 unsigned char *cp_end;
1075 int i;
1076
1077 if (b->method != RAW) {
1078 /* Spec. says slice header should be RAW, but we can future-proof
1079 by trying to decode it if it isn't. */
1080 if (cram_uncompress_block(b) < 0)
1081 return NULL;
1082 }
1083 cp = (unsigned char *)BLOCK_DATA(b);
1084 cp_end = cp + b->uncomp_size;
1085
1086 if (b->content_type != MAPPED_SLICE &&
1087 b->content_type != UNMAPPED_SLICE)
1088 return NULL;
1089
1090 if (!(hdr = calloc(1, sizeof(*hdr))))
1091 return NULL;
1092
1093 hdr->content_type = b->content_type;
1094
1095 if (b->content_type == MAPPED_SLICE) {
1096 cp += safe_itf8_get((char *)cp, (char *)cp_end, &hdr->ref_seq_id);
1097 cp += safe_itf8_get((char *)cp, (char *)cp_end, &hdr->ref_seq_start);
1098 cp += safe_itf8_get((char *)cp, (char *)cp_end, &hdr->ref_seq_span);
1099 }
1100 cp += safe_itf8_get((char *)cp, (char *)cp_end, &hdr->num_records);
1101 hdr->record_counter = 0;
1102 if (CRAM_MAJOR_VERS(fd->version) == 2) {
1103 int32_t i32 = 0;
1104 cp += safe_itf8_get((char *)cp, (char *)cp_end, &i32);
1105 hdr->record_counter = i32;
1106 } else if (CRAM_MAJOR_VERS(fd->version) >= 3) {
1107 cp += safe_ltf8_get((char *)cp, (char *)cp_end, &hdr->record_counter);
1108 }
1109
1110 cp += safe_itf8_get((char *)cp, (char *)cp_end, &hdr->num_blocks);
1111
1112 cp += safe_itf8_get((char *)cp, (char *)cp_end, &hdr->num_content_ids);
1113 if (hdr->num_content_ids < 1 ||
1114 hdr->num_content_ids >= SIZE_MAX / sizeof(int32_t)) {
1115 /* Slice must have at least one data block,
1116 and malloc'd size shouldn't wrap. */
1117 free(hdr);
1118 return NULL;
1119 }
1120 hdr->block_content_ids = malloc(hdr->num_content_ids * sizeof(int32_t));
1121 if (!hdr->block_content_ids) {
1122 free(hdr);
1123 return NULL;
1124 }
1125
1126 for (i = 0; i < hdr->num_content_ids; i++) {
1127 int l = safe_itf8_get((char *)cp, (char *)cp_end,
1128 &hdr->block_content_ids[i]);
1129 if (l <= 0) {
1130 free(hdr->block_content_ids);
1131 free(hdr);
1132 return NULL;
1133 }
1134 cp += l;
1135 }
1136
1137 if (b->content_type == MAPPED_SLICE) {
1138 cp += safe_itf8_get((char *)cp, (char *) cp_end, &hdr->ref_base_id);
1139 }
1140
1141 if (CRAM_MAJOR_VERS(fd->version) != 1) {
1142 if (cp_end - cp < 16) {
1143 free(hdr->block_content_ids);
1144 free(hdr);
1145 return NULL;
1146 }
1147 memcpy(hdr->md5, cp, 16);
1148 } else {
1149 memset(hdr->md5, 0, 16);
1150 }
1151
1152 return hdr;
1153 }
1154
1155
1156 #if 0
1157 /* Returns the number of bits set in val; it the highest bit used */
1158 static int nbits(int v) {
1159 static const int MultiplyDeBruijnBitPosition[32] = {
1160 1, 10, 2, 11, 14, 22, 3, 30, 12, 15, 17, 19, 23, 26, 4, 31,
1161 9, 13, 21, 29, 16, 18, 25, 8, 20, 28, 24, 7, 27, 6, 5, 32
1162 };
1163
1164 v |= v >> 1; // first up to set all bits 1 after the first 1 */
1165 v |= v >> 2;
1166 v |= v >> 4;
1167 v |= v >> 8;
1168 v |= v >> 16;
1169
1170 // DeBruijn magic to find top bit
1171 return MultiplyDeBruijnBitPosition[(uint32_t)(v * 0x07C4ACDDU) >> 27];
1172 }
1173 #endif
1174
1175 #if 0
1176 static int sort_freqs(const void *vp1, const void *vp2) {
1177 const int i1 = *(const int *)vp1;
1178 const int i2 = *(const int *)vp2;
1179 return i1-i2;
1180 }
1181 #endif
1182
1183 /* ----------------------------------------------------------------------
1184 * Primary CRAM sequence decoder
1185 */
1186
add_md_char(cram_slice * s,int decode_md,char c,int32_t * md_dist)1187 static inline void add_md_char(cram_slice *s, int decode_md, char c, int32_t *md_dist) {
1188 if (decode_md) {
1189 BLOCK_APPEND_UINT(s->aux_blk, *md_dist);
1190 BLOCK_APPEND_CHAR(s->aux_blk, c);
1191 *md_dist = 0;
1192 }
1193 }
1194
1195 /*
1196 * Internal part of cram_decode_slice().
1197 * Generates the sequence, quality and cigar components.
1198 */
cram_decode_seq(cram_fd * fd,cram_container * c,cram_slice * s,cram_block * blk,cram_record * cr,SAM_hdr * bfd,int cf,char * seq,char * qual,int has_MD,int has_NM)1199 static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
1200 cram_block *blk, cram_record *cr, SAM_hdr *bfd,
1201 int cf, char *seq, char *qual,
1202 int has_MD, int has_NM) {
1203 int prev_pos = 0, f, r = 0, out_sz = 1;
1204 int seq_pos = 1;
1205 int cig_len = 0, ref_pos = cr->apos;
1206 int32_t fn, i32;
1207 enum cigar_op cig_op = BAM_CMATCH;
1208 uint32_t *cigar = s->cigar;
1209 uint32_t ncigar = s->ncigar;
1210 uint32_t cigar_alloc = s->cigar_alloc;
1211 uint32_t nm = 0;
1212 int32_t md_dist = 0;
1213 int orig_aux = 0;
1214 int decode_md = fd->decode_md && s->ref && !has_MD;
1215 int decode_nm = fd->decode_md && s->ref && !has_NM;
1216 uint32_t ds = c->comp_hdr->data_series;
1217
1218 if ((ds & CRAM_QS) && !(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
1219 memset(qual, 255, cr->len);
1220 }
1221
1222 if (cr->cram_flags & CRAM_FLAG_NO_SEQ)
1223 decode_md = decode_nm = 0;
1224
1225 if (decode_md) {
1226 orig_aux = BLOCK_SIZE(s->aux_blk);
1227 BLOCK_APPEND(s->aux_blk, "MDZ", 3);
1228 }
1229
1230 if (ds & CRAM_FN) {
1231 if (!c->comp_hdr->codecs[DS_FN]) return -1;
1232 r |= c->comp_hdr->codecs[DS_FN]->decode(s,c->comp_hdr->codecs[DS_FN],
1233 blk, (char *)&fn, &out_sz);
1234 if (r) return r;
1235 } else {
1236 fn = 0;
1237 }
1238
1239 ref_pos--; // count from 0
1240 cr->cigar = ncigar;
1241
1242 if (!(ds & (CRAM_FC | CRAM_FP)))
1243 goto skip_cigar;
1244
1245 for (f = 0; f < fn; f++) {
1246 int32_t pos = 0;
1247 char op;
1248
1249 if (ncigar+2 >= cigar_alloc) {
1250 cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024;
1251 s->cigar = cigar;
1252 if (!(cigar = realloc(cigar, cigar_alloc * sizeof(*cigar))))
1253 return -1;
1254 }
1255
1256 if (ds & CRAM_FC) {
1257 if (!c->comp_hdr->codecs[DS_FC]) return -1;
1258 r |= c->comp_hdr->codecs[DS_FC]->decode(s,
1259 c->comp_hdr->codecs[DS_FC],
1260 blk,
1261 &op, &out_sz);
1262 if (r) return r;
1263 }
1264
1265 if (!(ds & CRAM_FP))
1266 continue;
1267
1268 if (!c->comp_hdr->codecs[DS_FP]) return -1;
1269 r |= c->comp_hdr->codecs[DS_FP]->decode(s,
1270 c->comp_hdr->codecs[DS_FP],
1271 blk,
1272 (char *)&pos, &out_sz);
1273 if (r) return r;
1274 pos += prev_pos;
1275
1276 if (pos <= 0) {
1277 fprintf(stderr, "Error: feature position %d before start of read.\n",
1278 pos);
1279 return -1;
1280 }
1281
1282 if (pos > seq_pos) {
1283 if (pos > cr->len+1)
1284 return -1;
1285
1286 if (s->ref && cr->ref_id >= 0) {
1287 if (ref_pos + pos - seq_pos > bfd->ref[cr->ref_id].len) {
1288 static int whinged = 0;
1289 int rlen;
1290 if (!whinged)
1291 fprintf(stderr, "Ref pos outside of ref "
1292 "sequence boundary\n");
1293 whinged = 1;
1294 rlen = bfd->ref[cr->ref_id].len - ref_pos;
1295 // May miss MD/NM cases where both seq/ref are N, but this is a
1296 // malformed cram file anyway.
1297 if (rlen > 0) {
1298 memcpy(&seq[seq_pos-1],
1299 &s->ref[ref_pos - s->ref_start +1], rlen);
1300 if ((pos - seq_pos) - rlen > 0)
1301 memset(&seq[seq_pos-1+rlen], 'N',
1302 (pos - seq_pos) - rlen);
1303 } else {
1304 memset(&seq[seq_pos-1], 'N', cr->len - seq_pos + 1);
1305 }
1306 if (md_dist >= 0)
1307 md_dist += pos - seq_pos;
1308 } else {
1309 // 'N' in both ref and seq is also mismatch for NM/MD
1310 if (decode_md || decode_nm) {
1311 int i;
1312 for (i = 0; i < pos - seq_pos; i++) {
1313 // FIXME: not N, but nt16 lookup == 15?
1314 char base = s->ref[ref_pos - s->ref_start + 1 + i];
1315 if (base == 'N') {
1316 add_md_char(s, decode_md,
1317 s->ref[ref_pos - s->ref_start + 1 + i],
1318 &md_dist);
1319 nm++;
1320 } else {
1321 md_dist++;
1322 }
1323 seq[seq_pos-1+i] = base;
1324 }
1325 } else {
1326 memcpy(&seq[seq_pos-1], &s->ref[ref_pos - s->ref_start +1],
1327 pos - seq_pos);
1328 }
1329 }
1330 }
1331 #ifdef USE_X
1332 if (cig_len && cig_op != BAM_CBASE_MATCH) {
1333 cigar[ncigar++] = (cig_len<<4) + cig_op;
1334 cig_len = 0;
1335 }
1336 cig_op = BAM_CBASE_MATCH;
1337 #else
1338 if (cig_len && cig_op != BAM_CMATCH) {
1339 cigar[ncigar++] = (cig_len<<4) + cig_op;
1340 cig_len = 0;
1341 }
1342 cig_op = BAM_CMATCH;
1343 #endif
1344 cig_len += pos - seq_pos;
1345 ref_pos += pos - seq_pos;
1346 seq_pos = pos;
1347 }
1348
1349 prev_pos = pos;
1350
1351 if (!(ds & CRAM_FC))
1352 goto skip_cigar;
1353
1354 switch(op) {
1355 case 'S': { // soft clip: IN
1356 int32_t out_sz2 = 1;
1357 int have_sc = 0;
1358
1359 if (cig_len) {
1360 cigar[ncigar++] = (cig_len<<4) + cig_op;
1361 cig_len = 0;
1362 }
1363 switch (CRAM_MAJOR_VERS(fd->version)) {
1364 case 1:
1365 if (ds & CRAM_IN) {
1366 r |= c->comp_hdr->codecs[DS_IN]
1367 ? c->comp_hdr->codecs[DS_IN]
1368 ->decode(s, c->comp_hdr->codecs[DS_IN],
1369 blk,
1370 cr->len ? &seq[pos-1] : NULL,
1371 &out_sz2)
1372 : (seq[pos-1] = 'N', out_sz2 = 1, 0);
1373 have_sc = 1;
1374 }
1375 break;
1376 case 2:
1377 default:
1378 if (ds & CRAM_SC) {
1379 r |= c->comp_hdr->codecs[DS_SC]
1380 ? c->comp_hdr->codecs[DS_SC]
1381 ->decode(s, c->comp_hdr->codecs[DS_SC],
1382 blk,
1383 cr->len ? &seq[pos-1] : NULL,
1384 &out_sz2)
1385 : (seq[pos-1] = 'N', out_sz2 = 1, 0);
1386 have_sc = 1;
1387 }
1388 break;
1389
1390 // default:
1391 // r |= c->comp_hdr->codecs[DS_BB]
1392 // ? c->comp_hdr->codecs[DS_BB]
1393 // ->decode(s, c->comp_hdr->codecs[DS_BB],
1394 // blk, &seq[pos-1], &out_sz2)
1395 // : (seq[pos-1] = 'N', out_sz2 = 1, 0);
1396 }
1397 if (have_sc) {
1398 if (r) return r;
1399 cigar[ncigar++] = (out_sz2<<4) + BAM_CSOFT_CLIP;
1400 cig_op = BAM_CSOFT_CLIP;
1401 seq_pos += out_sz2;
1402 }
1403 break;
1404 }
1405
1406 case 'X': { // Substitution; BS
1407 unsigned char base;
1408 #ifdef USE_X
1409 if (cig_len && cig_op != BAM_CBASE_MISMATCH) {
1410 cigar[ncigar++] = (cig_len<<4) + cig_op;
1411 cig_len = 0;
1412 }
1413 if (ds & CRAM_BS) {
1414 if (!c->comp_hdr->codecs[DS_BS]) return -1;
1415 r |= c->comp_hdr->codecs[DS_BS]
1416 ->decode(s, c->comp_hdr->codecs[DS_BS], blk,
1417 (char *)&base, &out_sz);
1418 if (pos-1 < cr->len)
1419 seq[pos-1] = 'N'; // FIXME look up BS=base value
1420 }
1421 cig_op = BAM_CBASE_MISMATCH;
1422 #else
1423 int ref_base;
1424 if (cig_len && cig_op != BAM_CMATCH) {
1425 cigar[ncigar++] = (cig_len<<4) + cig_op;
1426 cig_len = 0;
1427 }
1428 if (ds & CRAM_BS) {
1429 if (!c->comp_hdr->codecs[DS_BS]) return -1;
1430 r |= c->comp_hdr->codecs[DS_BS]
1431 ->decode(s, c->comp_hdr->codecs[DS_BS], blk,
1432 (char *)&base, &out_sz);
1433 if (r) return -1;
1434 if (ref_pos >= bfd->ref[cr->ref_id].len || !s->ref) {
1435 if (pos-1 < cr->len)
1436 seq[pos-1] = c->comp_hdr->
1437 substitution_matrix[fd->L1['N']][base];
1438 if (decode_md || decode_nm) {
1439 if (md_dist >= 0 && decode_md)
1440 BLOCK_APPEND_UINT(s->aux_blk, md_dist);
1441 md_dist = -1;
1442 nm--;
1443 }
1444 } else {
1445 unsigned char ref_call = ref_pos <= s->ref_end
1446 ? (uc)s->ref[ref_pos - s->ref_start +1]
1447 : 'N';
1448 ref_base = fd->L1[ref_call];
1449 if (pos-1 < cr->len)
1450 seq[pos-1] = c->comp_hdr->
1451 substitution_matrix[ref_base][base];
1452 add_md_char(s, decode_md, ref_call, &md_dist);
1453 }
1454 }
1455 cig_op = BAM_CMATCH;
1456 #endif
1457 nm++;
1458 cig_len++;
1459 seq_pos++;
1460 ref_pos++;
1461 break;
1462 }
1463
1464 case 'D': { // Deletion; DL
1465 if (cig_len && cig_op != BAM_CDEL) {
1466 cigar[ncigar++] = (cig_len<<4) + cig_op;
1467 cig_len = 0;
1468 }
1469 if (ds & CRAM_DL) {
1470 if (!c->comp_hdr->codecs[DS_DL]) return -1;
1471 r |= c->comp_hdr->codecs[DS_DL]
1472 ->decode(s, c->comp_hdr->codecs[DS_DL], blk,
1473 (char *)&i32, &out_sz);
1474 if (r) return r;
1475 if (decode_md || decode_nm) {
1476 if (md_dist >= 0 && decode_md)
1477 BLOCK_APPEND_UINT(s->aux_blk, md_dist);
1478 if (ref_pos + i32 <= bfd->ref[cr->ref_id].len) {
1479 if (decode_md) {
1480 BLOCK_APPEND_CHAR(s->aux_blk, '^');
1481 BLOCK_APPEND(s->aux_blk,
1482 &s->ref[ref_pos - s->ref_start +1],
1483 i32);
1484 md_dist = 0;
1485 }
1486 nm += i32;
1487 } else {
1488 uint32_t dlen;
1489 if (bfd->ref[cr->ref_id].len >= ref_pos) {
1490 if (decode_md) {
1491 BLOCK_APPEND_CHAR(s->aux_blk, '^');
1492 BLOCK_APPEND(s->aux_blk,
1493 &s->ref[ref_pos - s->ref_start+1],
1494 bfd->ref[cr->ref_id].len-ref_pos);
1495 BLOCK_APPEND_UINT(s->aux_blk, 0);
1496 }
1497 dlen = i32 - (bfd->ref[cr->ref_id].len - ref_pos);
1498 nm += i32 - dlen;
1499 } else {
1500 dlen = i32;
1501 }
1502
1503 md_dist = -1;
1504 }
1505 }
1506 cig_op = BAM_CDEL;
1507 cig_len += i32;
1508 ref_pos += i32;
1509 //printf(" %d: DL = %d (ret %d)\n", f, i32, r);
1510 }
1511 break;
1512 }
1513
1514 case 'I': { // Insertion (several bases); IN
1515 int32_t out_sz2 = 1;
1516
1517 if (cig_len && cig_op != BAM_CINS) {
1518 cigar[ncigar++] = (cig_len<<4) + cig_op;
1519 cig_len = 0;
1520 }
1521
1522 if (ds & CRAM_IN) {
1523 if (!c->comp_hdr->codecs[DS_IN]) return -1;
1524 r |= c->comp_hdr->codecs[DS_IN]
1525 ->decode(s, c->comp_hdr->codecs[DS_IN], blk,
1526 cr->len ? &seq[pos-1] : NULL,
1527 &out_sz2);
1528 if (r) return r;
1529 cig_op = BAM_CINS;
1530 cig_len += out_sz2;
1531 seq_pos += out_sz2;
1532 nm += out_sz2;
1533 //printf(" %d: IN(I) = %.*s (ret %d, out_sz %d)\n", f, out_sz2, dat, r, out_sz2);
1534 }
1535 break;
1536 }
1537
1538 case 'i': { // Insertion (single base); BA
1539 if (cig_len && cig_op != BAM_CINS) {
1540 cigar[ncigar++] = (cig_len<<4) + cig_op;
1541 cig_len = 0;
1542 }
1543 if (ds & CRAM_BA) {
1544 if (!c->comp_hdr->codecs[DS_BA]) return -1;
1545 r |= c->comp_hdr->codecs[DS_BA]
1546 ->decode(s, c->comp_hdr->codecs[DS_BA], blk,
1547 cr->len ? &seq[pos-1] : NULL,
1548 &out_sz);
1549 if (r) return r;
1550 }
1551 cig_op = BAM_CINS;
1552 cig_len++;
1553 seq_pos++;
1554 nm++;
1555 break;
1556 }
1557
1558 case 'b': { // Several bases
1559 int32_t len = 1;
1560
1561 if (cig_len && cig_op != BAM_CMATCH) {
1562 cigar[ncigar++] = (cig_len<<4) + cig_op;
1563 cig_len = 0;
1564 }
1565
1566 if (ds & CRAM_BB) {
1567 if (!c->comp_hdr->codecs[DS_BB]) return -1;
1568 r |= c->comp_hdr->codecs[DS_BB]
1569 ->decode(s, c->comp_hdr->codecs[DS_BB], blk,
1570 cr->len ? &seq[pos-1] : NULL,
1571 &len);
1572 if (r) return r;
1573
1574 if (decode_md || decode_nm) {
1575 int x;
1576 if (md_dist >= 0 && decode_md)
1577 BLOCK_APPEND_UINT(s->aux_blk, md_dist);
1578
1579 for (x = 0; x < len; x++) {
1580 if (x && decode_md)
1581 BLOCK_APPEND_UINT(s->aux_blk, 0);
1582 if (ref_pos+x >= bfd->ref[cr->ref_id].len || !s->ref) {
1583 md_dist = -1;
1584 break;
1585 } else {
1586 if (decode_md) {
1587 char r = s->ref[ref_pos+x-s->ref_start +1];
1588 BLOCK_APPEND_CHAR(s->aux_blk, r);
1589 }
1590 }
1591 }
1592
1593 nm += x;
1594 }
1595 }
1596
1597 cig_op = BAM_CMATCH;
1598
1599 cig_len+=len;
1600 seq_pos+=len;
1601 ref_pos+=len;
1602 //prev_pos+=len;
1603 break;
1604 }
1605
1606 case 'q': { // Several quality values
1607 int32_t len = 1;
1608
1609 if (cig_len && cig_op != BAM_CMATCH) {
1610 cigar[ncigar++] = (cig_len<<4) + cig_op;
1611 cig_len = 0;
1612 }
1613
1614 if (ds & CRAM_QQ) {
1615 if (!c->comp_hdr->codecs[DS_QQ]) return -1;
1616 r |= c->comp_hdr->codecs[DS_QQ]
1617 ->decode(s, c->comp_hdr->codecs[DS_QQ], blk,
1618 (char *)&qual[pos-1], &len);
1619 if (r) return r;
1620 }
1621
1622 cig_op = BAM_CMATCH;
1623
1624 cig_len+=len;
1625 seq_pos+=len;
1626 ref_pos+=len;
1627 //prev_pos+=len;
1628 break;
1629 }
1630
1631 case 'B': { // Read base; BA, QS
1632 #ifdef USE_X
1633 if (cig_len && cig_op != BAM_CBASE_MISMATCH) {
1634 cigar[ncigar++] = (cig_len<<4) + cig_op;
1635 cig_len = 0;
1636 }
1637 #else
1638 if (cig_len && cig_op != BAM_CMATCH) {
1639 cigar[ncigar++] = (cig_len<<4) + cig_op;
1640 cig_len = 0;
1641 }
1642 #endif
1643 if (ds & CRAM_BA) {
1644 if (!c->comp_hdr->codecs[DS_BA]) return -1;
1645 r |= c->comp_hdr->codecs[DS_BA]
1646 ->decode(s, c->comp_hdr->codecs[DS_BA], blk,
1647 cr->len ? &seq[pos-1] : NULL,
1648 &out_sz);
1649
1650 if (decode_md || decode_nm) {
1651 if (md_dist >= 0 && decode_md)
1652 BLOCK_APPEND_UINT(s->aux_blk, md_dist);
1653 if (ref_pos >= bfd->ref[cr->ref_id].len || !s->ref) {
1654 md_dist = -1;
1655 } else {
1656 if (decode_md)
1657 BLOCK_APPEND_CHAR(s->aux_blk,
1658 s->ref[ref_pos-s->ref_start +1]);
1659 nm++;
1660 md_dist = 0;
1661 }
1662 }
1663 }
1664 if (ds & CRAM_QS) {
1665 if (!c->comp_hdr->codecs[DS_QS]) return -1;
1666 r |= c->comp_hdr->codecs[DS_QS]
1667 ->decode(s, c->comp_hdr->codecs[DS_QS], blk,
1668 (char *)&qual[pos-1], &out_sz);
1669 }
1670 #ifdef USE_X
1671 cig_op = BAM_CBASE_MISMATCH;
1672 #else
1673 cig_op = BAM_CMATCH;
1674 #endif
1675 cig_len++;
1676 seq_pos++;
1677 ref_pos++;
1678 //printf(" %d: BA/QS(B) = %c/%d (ret %d)\n", f, i32, qc, r);
1679 break;
1680 }
1681
1682 case 'Q': { // Quality score; QS
1683 if (ds & CRAM_QS) {
1684 if (!c->comp_hdr->codecs[DS_QS]) return -1;
1685 r |= c->comp_hdr->codecs[DS_QS]
1686 ->decode(s, c->comp_hdr->codecs[DS_QS], blk,
1687 (char *)&qual[pos-1], &out_sz);
1688 //printf(" %d: QS = %d (ret %d)\n", f, qc, r);
1689 }
1690 break;
1691 }
1692
1693 case 'H': { // hard clip; HC
1694 if (cig_len && cig_op != BAM_CHARD_CLIP) {
1695 cigar[ncigar++] = (cig_len<<4) + cig_op;
1696 cig_len = 0;
1697 }
1698 if (ds & CRAM_HC) {
1699 if (!c->comp_hdr->codecs[DS_HC]) return -1;
1700 r |= c->comp_hdr->codecs[DS_HC]
1701 ->decode(s, c->comp_hdr->codecs[DS_HC], blk,
1702 (char *)&i32, &out_sz);
1703 if (r) return r;
1704 cig_op = BAM_CHARD_CLIP;
1705 cig_len += i32;
1706 }
1707 break;
1708 }
1709
1710 case 'P': { // padding; PD
1711 if (cig_len && cig_op != BAM_CPAD) {
1712 cigar[ncigar++] = (cig_len<<4) + cig_op;
1713 cig_len = 0;
1714 }
1715 if (ds & CRAM_PD) {
1716 if (!c->comp_hdr->codecs[DS_PD]) return -1;
1717 r |= c->comp_hdr->codecs[DS_PD]
1718 ->decode(s, c->comp_hdr->codecs[DS_PD], blk,
1719 (char *)&i32, &out_sz);
1720 if (r) return r;
1721 cig_op = BAM_CPAD;
1722 cig_len += i32;
1723 }
1724 break;
1725 }
1726
1727 case 'N': { // Ref skip; RS
1728 if (cig_len && cig_op != BAM_CREF_SKIP) {
1729 cigar[ncigar++] = (cig_len<<4) + cig_op;
1730 cig_len = 0;
1731 }
1732 if (ds & CRAM_RS) {
1733 if (!c->comp_hdr->codecs[DS_RS]) return -1;
1734 r |= c->comp_hdr->codecs[DS_RS]
1735 ->decode(s, c->comp_hdr->codecs[DS_RS], blk,
1736 (char *)&i32, &out_sz);
1737 if (r) return r;
1738 cig_op = BAM_CREF_SKIP;
1739 cig_len += i32;
1740 ref_pos += i32;
1741 }
1742 break;
1743 }
1744
1745 default:
1746 fprintf(stderr, "Error: Unknown feature code '%c'\n", op);
1747 return -1;
1748 }
1749 }
1750
1751 if (!(ds & CRAM_FC))
1752 goto skip_cigar;
1753
1754 /* An implicit match op for any unaccounted for bases */
1755 if ((ds & CRAM_FN) && cr->len >= seq_pos) {
1756 if (s->ref) {
1757 if (ref_pos + cr->len - seq_pos + 1 > bfd->ref[cr->ref_id].len) {
1758 static int whinged = 0;
1759 int rlen;
1760 if (!whinged)
1761 fprintf(stderr, "Ref pos outside of ref sequence boundary\n");
1762 whinged = 1;
1763 rlen = bfd->ref[cr->ref_id].len - ref_pos;
1764 // May miss MD/NM cases where both seq/ref are N, but this is a
1765 // malformed cram file anyway.
1766 if (rlen > 0) {
1767 if (seq_pos-1 + rlen < cr->len)
1768 memcpy(&seq[seq_pos-1],
1769 &s->ref[ref_pos - s->ref_start +1], rlen);
1770 if ((cr->len - seq_pos + 1) - rlen > 0)
1771 memset(&seq[seq_pos-1+rlen], 'N',
1772 (cr->len - seq_pos + 1) - rlen);
1773 } else {
1774 if (cr->len - seq_pos + 1 > 0)
1775 memset(&seq[seq_pos-1], 'N', cr->len - seq_pos + 1);
1776 }
1777 if (md_dist >= 0)
1778 md_dist += cr->len - seq_pos + 1;
1779 } else {
1780 if (cr->len - seq_pos + 1 > 0) {
1781 if (decode_md || decode_nm) {
1782 int i;
1783 for (i = 0; i < cr->len - seq_pos + 1; i++) {
1784 // FIXME: not N, but nt16 lookup == 15?
1785 char base = s->ref[ref_pos - s->ref_start + 1 + i];
1786 if (base == 'N') {
1787 add_md_char(s, decode_md,
1788 s->ref[ref_pos - s->ref_start + 1 + i],
1789 &md_dist);
1790 nm++;
1791 } else {
1792 md_dist++;
1793 }
1794 seq[seq_pos-1+i] = base;
1795 }
1796 } else {
1797 memcpy(&seq[seq_pos-1], &s->ref[ref_pos - s->ref_start +1],
1798 cr->len - seq_pos + 1);
1799 }
1800 }
1801 ref_pos += cr->len - seq_pos + 1;
1802 }
1803 }
1804
1805 if (ncigar+1 >= cigar_alloc) {
1806 cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024;
1807 s->cigar = cigar;
1808 if (!(cigar = realloc(cigar, cigar_alloc * sizeof(*cigar))))
1809 return -1;
1810 }
1811 #ifdef USE_X
1812 if (cig_len && cig_op != BAM_CBASE_MATCH) {
1813 cigar[ncigar++] = (cig_len<<4) + cig_op;
1814 cig_len = 0;
1815 }
1816 cig_op = BAM_CBASE_MATCH;
1817 #else
1818 if (cig_len && cig_op != BAM_CMATCH) {
1819 cigar[ncigar++] = (cig_len<<4) + cig_op;
1820 cig_len = 0;
1821 }
1822 cig_op = BAM_CMATCH;
1823 #endif
1824 cig_len += cr->len - seq_pos+1;
1825 }
1826
1827 skip_cigar:
1828
1829 if ((ds & CRAM_FN) && decode_md) {
1830 if (md_dist >= 0)
1831 BLOCK_APPEND_UINT(s->aux_blk, md_dist);
1832 }
1833
1834 if (cig_len) {
1835 if (ncigar >= cigar_alloc) {
1836 cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024;
1837 s->cigar = cigar;
1838 if (!(cigar = realloc(cigar, cigar_alloc * sizeof(*cigar))))
1839 return -1;
1840 }
1841
1842 cigar[ncigar++] = (cig_len<<4) + cig_op;
1843 }
1844
1845 cr->ncigar = ncigar - cr->cigar;
1846 cr->aend = ref_pos;
1847
1848 //printf("2: %.*s %d .. %d\n", cr->name_len, DSTRING_STR(name_ds) + cr->name, cr->apos, ref_pos);
1849
1850 if (ds & CRAM_MQ) {
1851 if (!c->comp_hdr->codecs[DS_MQ]) return -1;
1852 r |= c->comp_hdr->codecs[DS_MQ]
1853 ->decode(s, c->comp_hdr->codecs[DS_MQ], blk,
1854 (char *)&cr->mqual, &out_sz);
1855 } else {
1856 cr->mqual = 40;
1857 }
1858
1859 if ((ds & CRAM_QS) && (cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
1860 int32_t out_sz2 = cr->len;
1861
1862 if (ds & CRAM_QS) {
1863 if (!c->comp_hdr->codecs[DS_QS]) return -1;
1864 r |= c->comp_hdr->codecs[DS_QS]
1865 ->decode(s, c->comp_hdr->codecs[DS_QS], blk,
1866 qual, &out_sz2);
1867 }
1868 }
1869
1870 s->cigar = cigar;
1871 s->cigar_alloc = cigar_alloc;
1872 s->ncigar = ncigar;
1873
1874 if (cr->cram_flags & CRAM_FLAG_NO_SEQ)
1875 cr->len = 0;
1876
1877 if (decode_md) {
1878 BLOCK_APPEND_CHAR(s->aux_blk, '\0'); // null terminate MD:Z:
1879 cr->aux_size += BLOCK_SIZE(s->aux_blk) - orig_aux;
1880 }
1881
1882 if (decode_nm) {
1883 char buf[7];
1884 buf[0] = 'N'; buf[1] = 'M'; buf[2] = 'I';
1885 buf[3] = (nm>> 0) & 0xff;
1886 buf[4] = (nm>> 8) & 0xff;
1887 buf[5] = (nm>>16) & 0xff;
1888 buf[6] = (nm>>24) & 0xff;
1889 BLOCK_APPEND(s->aux_blk, buf, 7);
1890 cr->aux_size += 7;
1891 }
1892
1893 return r;
1894 }
1895
1896 /*
1897 * Quick and simple hash lookup for cram_map arrays
1898 */
map_find(cram_map ** map,unsigned char * key,int id)1899 static cram_map *map_find(cram_map **map, unsigned char *key, int id) {
1900 cram_map *m;
1901
1902 m = map[CRAM_MAP(key[0],key[1])];
1903 while (m && m->key != id)
1904 m= m->next;
1905
1906 return m;
1907 }
1908
1909 //#define map_find(M,K,I) M[CRAM_MAP(K[0],K[1])];while (m && m->key != I);m= m->next
1910
1911
cram_decode_aux_1_0(cram_container * c,cram_slice * s,cram_block * blk,cram_record * cr)1912 static int cram_decode_aux_1_0(cram_container *c, cram_slice *s,
1913 cram_block *blk, cram_record *cr) {
1914 int i, r = 0, out_sz = 1;
1915 unsigned char ntags;
1916
1917 if (!c->comp_hdr->codecs[DS_TC]) return -1;
1918 r |= c->comp_hdr->codecs[DS_TC]->decode(s, c->comp_hdr->codecs[DS_TC], blk,
1919 (char *)&ntags, &out_sz);
1920 cr->ntags = ntags;
1921
1922 //printf("TC=%d\n", cr->ntags);
1923 cr->aux_size = 0;
1924 cr->aux = BLOCK_SIZE(s->aux_blk);
1925
1926 for (i = 0; i < cr->ntags; i++) {
1927 int32_t id, out_sz = 1;
1928 unsigned char tag_data[3];
1929 cram_map *m;
1930
1931 //printf("Tag %d/%d\n", i+1, cr->ntags);
1932 if (!c->comp_hdr->codecs[DS_TN]) return -1;
1933 r |= c->comp_hdr->codecs[DS_TN]->decode(s, c->comp_hdr->codecs[DS_TN],
1934 blk, (char *)&id, &out_sz);
1935 if (out_sz == 3) {
1936 tag_data[0] = ((char *)&id)[0];
1937 tag_data[1] = ((char *)&id)[1];
1938 tag_data[2] = ((char *)&id)[2];
1939 } else {
1940 tag_data[0] = (id>>16) & 0xff;
1941 tag_data[1] = (id>>8) & 0xff;
1942 tag_data[2] = id & 0xff;
1943 }
1944
1945 m = map_find(c->comp_hdr->tag_encoding_map, tag_data, id);
1946 if (!m)
1947 return -1;
1948 BLOCK_APPEND(s->aux_blk, (char *)tag_data, 3);
1949
1950 if (!m->codec) return -1;
1951 r |= m->codec->decode(s, m->codec, blk, (char *)s->aux_blk, &out_sz);
1952
1953 cr->aux_size += out_sz + 3;
1954 }
1955
1956 return r;
1957 }
1958
cram_decode_aux(cram_container * c,cram_slice * s,cram_block * blk,cram_record * cr,int * has_MD,int * has_NM)1959 static int cram_decode_aux(cram_container *c, cram_slice *s,
1960 cram_block *blk, cram_record *cr,
1961 int *has_MD, int *has_NM) {
1962 int i, r = 0, out_sz = 1;
1963 int32_t TL = 0;
1964 unsigned char *TN;
1965 uint32_t ds = c->comp_hdr->data_series;
1966
1967 if (!(ds & (CRAM_TL|CRAM_aux))) {
1968 cr->aux = 0;
1969 cr->aux_size = 0;
1970 return 0;
1971 }
1972
1973 if (!c->comp_hdr->codecs[DS_TL]) return -1;
1974 r |= c->comp_hdr->codecs[DS_TL]->decode(s, c->comp_hdr->codecs[DS_TL], blk,
1975 (char *)&TL, &out_sz);
1976 if (r || TL < 0 || TL >= c->comp_hdr->nTL)
1977 return -1;
1978
1979 TN = c->comp_hdr->TL[TL];
1980 cr->ntags = strlen((char *)TN)/3; // optimise to remove strlen
1981
1982 //printf("TC=%d\n", cr->ntags);
1983 cr->aux_size = 0;
1984 cr->aux = BLOCK_SIZE(s->aux_blk);
1985
1986 if (!(ds & CRAM_aux))
1987 return 0;
1988
1989 for (i = 0; i < cr->ntags; i++) {
1990 int32_t id, out_sz = 1;
1991 unsigned char tag_data[3];
1992 cram_map *m;
1993
1994 if (TN[0] == 'M' && TN[1] == 'D' && has_MD)
1995 *has_MD = 1;
1996 if (TN[0] == 'N' && TN[1] == 'M' && has_NM)
1997 *has_NM = 1;
1998
1999 //printf("Tag %d/%d\n", i+1, cr->ntags);
2000 tag_data[0] = *TN++;
2001 tag_data[1] = *TN++;
2002 tag_data[2] = *TN++;
2003 id = (tag_data[0]<<16) | (tag_data[1]<<8) | tag_data[2];
2004
2005 m = map_find(c->comp_hdr->tag_encoding_map, tag_data, id);
2006 if (!m)
2007 return -1;
2008 BLOCK_APPEND(s->aux_blk, (char *)tag_data, 3);
2009
2010 if (!m->codec) return -1;
2011 r |= m->codec->decode(s, m->codec, blk, (char *)s->aux_blk, &out_sz);
2012 if (r) break;
2013 cr->aux_size += out_sz + 3;
2014 }
2015
2016 return r;
2017 }
2018
2019 /* Resolve mate pair cross-references between recs within this slice */
cram_decode_slice_xref(cram_slice * s,int required_fields)2020 static int cram_decode_slice_xref(cram_slice *s, int required_fields) {
2021 int rec;
2022
2023 if (!(required_fields & (SAM_RNEXT | SAM_PNEXT | SAM_TLEN))) {
2024 for (rec = 0; rec < s->hdr->num_records; rec++) {
2025 cram_record *cr = &s->crecs[rec];
2026
2027 cr->tlen = 0;
2028 cr->mate_pos = 0;
2029 cr->mate_ref_id = -1;
2030 }
2031
2032 return 0;
2033 }
2034
2035 for (rec = 0; rec < s->hdr->num_records; rec++) {
2036 cram_record *cr = &s->crecs[rec];
2037
2038 if (cr->mate_line >= 0) {
2039 if (cr->mate_line < s->hdr->num_records) {
2040 /*
2041 * On the first read, loop through computing lengths.
2042 * It's not perfect as we have one slice per reference so we
2043 * cannot detect when TLEN should be zero due to seqs that
2044 * map to multiple references.
2045 *
2046 * We also cannot set tlen correct when it spans a slice for
2047 * other reasons. This may make tlen too small. Should we
2048 * fix this by forcing TLEN to be stored verbatim in such cases?
2049 *
2050 * Or do we just admit defeat and output 0 for tlen? It's the
2051 * safe option...
2052 */
2053 if (cr->tlen == INT_MIN) {
2054 int id1 = rec, id2 = rec;
2055 int aleft = cr->apos, aright = cr->aend;
2056 int tlen;
2057 int ref = cr->ref_id;
2058
2059 // number of segments starting at the same point.
2060 int left_cnt = 0;
2061
2062 do {
2063 if (aleft > s->crecs[id2].apos)
2064 aleft = s->crecs[id2].apos, left_cnt = 1;
2065 else if (aleft == s->crecs[id2].apos)
2066 left_cnt++;
2067 if (aright < s->crecs[id2].aend)
2068 aright = s->crecs[id2].aend;
2069 if (s->crecs[id2].mate_line == -1) {
2070 s->crecs[id2].mate_line = rec;
2071 break;
2072 }
2073 if (s->crecs[id2].mate_line <= id2 ||
2074 s->crecs[id2].mate_line >= s->hdr->num_records)
2075 return -1;
2076 id2 = s->crecs[id2].mate_line;
2077
2078 if (s->crecs[id2].ref_id != ref)
2079 ref = -1;
2080 } while (id2 != id1);
2081
2082 if (ref != -1) {
2083 tlen = aright - aleft + 1;
2084 id1 = id2 = rec;
2085
2086 /*
2087 * When we have two seqs with identical start and
2088 * end coordinates, set +/- tlen based on 1st/last
2089 * bit flags instead, as a tie breaker.
2090 */
2091 if (s->crecs[id2].apos == aleft) {
2092 if (left_cnt == 1 ||
2093 (s->crecs[id2].flags & BAM_FREAD1))
2094 s->crecs[id2].tlen = tlen;
2095 else
2096 s->crecs[id2].tlen = -tlen;
2097 } else {
2098 s->crecs[id2].tlen = -tlen;
2099 }
2100
2101 id2 = s->crecs[id2].mate_line;
2102 while (id2 != id1) {
2103 if (s->crecs[id2].apos == aleft) {
2104 if (left_cnt == 1 ||
2105 (s->crecs[id2].flags & BAM_FREAD1))
2106 s->crecs[id2].tlen = tlen;
2107 else
2108 s->crecs[id2].tlen = -tlen;
2109 } else {
2110 s->crecs[id2].tlen = -tlen;
2111 }
2112 id2 = s->crecs[id2].mate_line;
2113 }
2114 } else {
2115 id1 = id2 = rec;
2116
2117 s->crecs[id2].tlen = 0;
2118 id2 = s->crecs[id2].mate_line;
2119 while (id2 != id1) {
2120 s->crecs[id2].tlen = 0;
2121 id2 = s->crecs[id2].mate_line;
2122 }
2123 }
2124 }
2125
2126 cr->mate_pos = s->crecs[cr->mate_line].apos;
2127 cr->mate_ref_id = s->crecs[cr->mate_line].ref_id;
2128
2129 // paired
2130 cr->flags |= BAM_FPAIRED;
2131
2132 // set mate unmapped if needed
2133 if (s->crecs[cr->mate_line].flags & BAM_FUNMAP) {
2134 cr->flags |= BAM_FMUNMAP;
2135 cr->tlen = 0;
2136 }
2137 if (cr->flags & BAM_FUNMAP) {
2138 cr->tlen = 0;
2139 }
2140
2141 // set mate reversed if needed
2142 if (s->crecs[cr->mate_line].flags & BAM_FREVERSE)
2143 cr->flags |= BAM_FMREVERSE;
2144 } else {
2145 fprintf(stderr, "Mate line out of bounds: %d vs [0, %d]\n",
2146 cr->mate_line, s->hdr->num_records-1);
2147 }
2148
2149 /* FIXME: construct read names here too if needed */
2150 } else {
2151 if (cr->mate_flags & CRAM_M_REVERSE) {
2152 cr->flags |= BAM_FPAIRED | BAM_FMREVERSE;
2153 }
2154 if (cr->mate_flags & CRAM_M_UNMAP) {
2155 cr->flags |= BAM_FMUNMAP;
2156 //cr->mate_ref_id = -1;
2157 }
2158 if (!(cr->flags & BAM_FPAIRED))
2159 cr->mate_ref_id = -1;
2160 }
2161
2162 if (cr->tlen == INT_MIN)
2163 cr->tlen = 0; // Just incase
2164 }
2165 return 0;
2166 }
2167
md5_print(unsigned char * md5,char * out)2168 static char *md5_print(unsigned char *md5, char *out) {
2169 int i;
2170 for (i = 0; i < 16; i++) {
2171 out[i*2+0] = "0123456789abcdef"[md5[i]>>4];
2172 out[i*2+1] = "0123456789abcdef"[md5[i]&15];
2173 }
2174 out[32] = 0;
2175
2176 return out;
2177 }
2178
reset_all_codecs(cram_block_compression_hdr * hdr)2179 static void reset_all_codecs(cram_block_compression_hdr *hdr) {
2180 int i;
2181
2182 for (i = 0; i < DS_END; i++) {
2183 if (hdr->codecs[i] && hdr->codecs[i]->reset)
2184 hdr->codecs[i]->reset(hdr->codecs[i]);
2185 }
2186
2187 for (i = 0; i < CRAM_MAP_HASH; i++) {
2188 cram_map *m;
2189 for (m = hdr->rec_encoding_map[i]; m; m = m->next)
2190 if (m->codec) m->codec->reset(m->codec);
2191 }
2192
2193 for (i = 0; i < CRAM_MAP_HASH; i++) {
2194 cram_map *m;
2195 for (m = hdr->tag_encoding_map[i]; m; m = m->next)
2196 if (m->codec) m->codec->reset(m->codec);
2197 }
2198 }
2199
2200 /*
2201 * Decode an entire slice from container blocks. Fills out s->crecs[] array.
2202 * Returns 0 on success
2203 * -1 on failure
2204 */
cram_decode_slice(cram_fd * fd,cram_container * c,cram_slice * s,SAM_hdr * bfd)2205 int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s,
2206 SAM_hdr *bfd) {
2207 cram_block *blk = s->block[0];
2208 int32_t bf, ref_id;
2209 unsigned char cf;
2210 int out_sz, r = 0;
2211 int rec;
2212 char *seq = NULL, *qual = NULL;
2213 int unknown_rg = -1;
2214 int embed_ref;
2215 char **refs = NULL;
2216 uint32_t ds;
2217
2218 /* Clear any cached blocks in the external / B.A.S. codecs */
2219 reset_all_codecs(c->comp_hdr);
2220
2221 if (cram_dependent_data_series(fd, c->comp_hdr, s) != 0)
2222 return -1;
2223
2224 ds = c->comp_hdr->data_series;
2225
2226 blk->bit = 7; // MSB first
2227
2228 // Study the blocks and estimate approx sizes to preallocate.
2229 // This looks to speed up decoding by around 8-9%.
2230 // We can always shrink back down at the end if we overestimated.
2231 // However it's likely that this also saves memory as own growth
2232 // factor (*=1.5) is never applied.
2233 {
2234 int qsize, nsize, q_id;
2235 cram_decode_estimate_sizes(c->comp_hdr, s, &qsize, &nsize, &q_id);
2236 //fprintf(stderr, "qsize=%d nsize=%d\n", qsize, nsize);
2237
2238 if (qsize && (ds & CRAM_RL)) BLOCK_RESIZE_EXACT(s->seqs_blk, qsize+1);
2239 if (qsize && (ds & CRAM_RL)) BLOCK_RESIZE_EXACT(s->qual_blk, qsize+1);
2240 if (nsize && (ds & CRAM_NS)) BLOCK_RESIZE_EXACT(s->name_blk, nsize+1);
2241
2242 // To do - consider using q_id here to usurp the quality block and
2243 // avoid a memcpy during decode.
2244 // Specifically when quality is an external block uniquely used by
2245 // DS_QS only, then we can set s->qual_blk directly to this
2246 // block and save the codec->decode() calls. (Approx 3% cpu saving)
2247 }
2248
2249 /* Look for unknown RG, added as last by Java CRAM? */
2250 if (bfd->nrg > 0 &&
2251 bfd->rg[bfd->nrg-1].name != NULL &&
2252 !strcmp(bfd->rg[bfd->nrg-1].name, "UNKNOWN"))
2253 unknown_rg = bfd->nrg-1;
2254
2255 if (blk->content_type != CORE)
2256 return -1;
2257
2258 if (s->crecs)
2259 free(s->crecs);
2260 if (!(s->crecs = malloc(s->hdr->num_records * sizeof(*s->crecs))))
2261 return -1;
2262
2263 ref_id = s->hdr->ref_seq_id;
2264 embed_ref = s->hdr->ref_base_id >= 0 ? 1 : 0;
2265
2266 if (ref_id >= 0) {
2267 if (embed_ref) {
2268 cram_block *b;
2269 if (s->hdr->ref_base_id < 0) {
2270 fprintf(stderr, "No reference specified and "
2271 "no embedded reference is available.\n");
2272 return -1;
2273 }
2274 b = cram_get_block_by_id(s, s->hdr->ref_base_id);
2275 if (!b)
2276 return -1;
2277 if (cram_uncompress_block(b) != 0)
2278 return -1;
2279 s->ref = (char *)BLOCK_DATA(b);
2280 s->ref_start = s->hdr->ref_seq_start;
2281 s->ref_end = s->hdr->ref_seq_start + s->hdr->ref_seq_span-1;
2282 if (s->ref_end - s->ref_start > b->uncomp_size) {
2283 fprintf(stderr, "Embedded reference is too small.\n");
2284 return -1;
2285 }
2286 } else if (!fd->no_ref) {
2287 //// Avoid Java cramtools bug by loading entire reference seq
2288 //s->ref = cram_get_ref(fd, s->hdr->ref_seq_id, 1, 0);
2289 //s->ref_start = 1;
2290
2291 if (fd->required_fields & SAM_SEQ)
2292 s->ref =
2293 cram_get_ref(fd, s->hdr->ref_seq_id,
2294 s->hdr->ref_seq_start,
2295 s->hdr->ref_seq_start + s->hdr->ref_seq_span -1);
2296 s->ref_start = s->hdr->ref_seq_start;
2297 s->ref_end = s->hdr->ref_seq_start + s->hdr->ref_seq_span-1;
2298
2299 /* Sanity check */
2300 if (s->ref_start < 0) {
2301 fprintf(stderr, "Slice starts before base 1.\n");
2302 s->ref_start = 0;
2303 }
2304 pthread_mutex_lock(&fd->ref_lock);
2305 pthread_mutex_lock(&fd->refs->lock);
2306 if ((fd->required_fields & SAM_SEQ) &&
2307 ref_id < fd->refs->nref &&
2308 s->ref_end > fd->refs->ref_id[ref_id]->length) {
2309 s->ref_end = fd->refs->ref_id[ref_id]->length;
2310 }
2311 pthread_mutex_unlock(&fd->refs->lock);
2312 pthread_mutex_unlock(&fd->ref_lock);
2313 }
2314 }
2315
2316 if ((fd->required_fields & SAM_SEQ) &&
2317 s->ref == NULL && s->hdr->ref_seq_id >= 0 && !fd->no_ref) {
2318 fprintf(stderr, "Unable to fetch reference #%d %d..%d\n",
2319 s->hdr->ref_seq_id, s->hdr->ref_seq_start,
2320 s->hdr->ref_seq_start + s->hdr->ref_seq_span-1);
2321 return -1;
2322 }
2323
2324 if (CRAM_MAJOR_VERS(fd->version) != 1
2325 && (fd->required_fields & SAM_SEQ)
2326 && s->hdr->ref_seq_id >= 0
2327 && !fd->ignore_md5
2328 && memcmp(s->hdr->md5, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0", 16)) {
2329 hts_md5_context *md5;
2330 unsigned char digest[16];
2331
2332 if (s->ref && s->hdr->ref_seq_id >= 0) {
2333 int start, len;
2334
2335 if (s->hdr->ref_seq_start >= s->ref_start) {
2336 start = s->hdr->ref_seq_start - s->ref_start;
2337 } else {
2338 fprintf(stderr, "Slice starts before base 1.\n");
2339 start = 0;
2340 }
2341
2342 if (s->hdr->ref_seq_span <= s->ref_end - s->ref_start + 1) {
2343 len = s->hdr->ref_seq_span;
2344 } else {
2345 fprintf(stderr, "Slice ends beyond reference end.\n");
2346 len = s->ref_end - s->ref_start + 1;
2347 }
2348
2349 if (!(md5 = hts_md5_init()))
2350 return -1;
2351 if (start + len > s->ref_end - s->ref_start + 1)
2352 len = s->ref_end - s->ref_start + 1 - start;
2353 if (len >= 0)
2354 hts_md5_update(md5, s->ref + start, len);
2355 hts_md5_final(digest, md5);
2356 hts_md5_destroy(md5);
2357 } else if (!s->ref && s->hdr->ref_base_id >= 0) {
2358 cram_block *b = cram_get_block_by_id(s, s->hdr->ref_base_id);
2359 if (b) {
2360 if (!(md5 = hts_md5_init()))
2361 return -1;
2362 hts_md5_update(md5, b->data, b->uncomp_size);
2363 hts_md5_final(digest, md5);
2364 hts_md5_destroy(md5);
2365 }
2366 }
2367
2368 if ((!s->ref && s->hdr->ref_base_id < 0)
2369 || memcmp(digest, s->hdr->md5, 16) != 0) {
2370 char M[33];
2371 fprintf(stderr, "ERROR: md5sum reference mismatch for ref "
2372 "%d pos %d..%d\n", ref_id, s->ref_start, s->ref_end);
2373 fprintf(stderr, "CRAM: %s\n", md5_print(s->hdr->md5, M));
2374 fprintf(stderr, "Ref : %s\n", md5_print(digest, M));
2375 return -1;
2376 }
2377 }
2378
2379 if (ref_id == -2) {
2380 pthread_mutex_lock(&fd->ref_lock);
2381 pthread_mutex_lock(&fd->refs->lock);
2382 refs = calloc(fd->refs->nref, sizeof(char *));
2383 pthread_mutex_unlock(&fd->refs->lock);
2384 pthread_mutex_unlock(&fd->ref_lock);
2385 if (!refs)
2386 return -1;
2387 }
2388
2389 int last_ref_id = -9; // Arbitrary -ve marker for not-yet-set
2390 for (rec = 0; rec < s->hdr->num_records; rec++) {
2391 cram_record *cr = &s->crecs[rec];
2392 int has_MD, has_NM;
2393
2394 //fprintf(stderr, "Decode seq %d, %d/%d\n", rec, blk->byte, blk->bit);
2395
2396 cr->s = s;
2397
2398 out_sz = 1; /* decode 1 item */
2399 if (ds & CRAM_BF) {
2400 if (!c->comp_hdr->codecs[DS_BF]) return -1;
2401 r |= c->comp_hdr->codecs[DS_BF]
2402 ->decode(s, c->comp_hdr->codecs[DS_BF], blk,
2403 (char *)&bf, &out_sz);
2404 if (r || bf < 0 ||
2405 bf >= sizeof(fd->bam_flag_swap)/sizeof(*fd->bam_flag_swap))
2406 return -1;
2407 bf = fd->bam_flag_swap[bf];
2408 cr->flags = bf;
2409 } else {
2410 cr->flags = bf = 0x4; // unmapped
2411 }
2412
2413 if (ds & CRAM_CF) {
2414 if (CRAM_MAJOR_VERS(fd->version) == 1) {
2415 /* CF is byte in 1.0, int32 in 2.0 */
2416 if (!c->comp_hdr->codecs[DS_CF]) return -1;
2417 r |= c->comp_hdr->codecs[DS_CF]
2418 ->decode(s, c->comp_hdr->codecs[DS_CF], blk,
2419 (char *)&cf, &out_sz);
2420 if (r) return -1;
2421 cr->cram_flags = cf;
2422 } else {
2423 if (!c->comp_hdr->codecs[DS_CF]) return -1;
2424 r |= c->comp_hdr->codecs[DS_CF]
2425 ->decode(s, c->comp_hdr->codecs[DS_CF], blk,
2426 (char *)&cr->cram_flags, &out_sz);
2427 if (r) return -1;
2428 cf = cr->cram_flags;
2429 }
2430 } else {
2431 cf = cr->cram_flags = 0;
2432 }
2433
2434 if (CRAM_MAJOR_VERS(fd->version) != 1 && ref_id == -2) {
2435 if (ds & CRAM_RI) {
2436 if (!c->comp_hdr->codecs[DS_RI]) return -1;
2437 r |= c->comp_hdr->codecs[DS_RI]
2438 ->decode(s, c->comp_hdr->codecs[DS_RI], blk,
2439 (char *)&cr->ref_id, &out_sz);
2440 if (r) return -1;
2441 if ((fd->required_fields & (SAM_SEQ|SAM_TLEN))
2442 && cr->ref_id >= 0
2443 && cr->ref_id != last_ref_id) {
2444 if (!fd->no_ref) {
2445 if (!refs[cr->ref_id])
2446 refs[cr->ref_id] = cram_get_ref(fd, cr->ref_id,
2447 1, 0);
2448 s->ref = refs[cr->ref_id];
2449
2450 if (!fd->unsorted && last_ref_id >= 0 && refs[last_ref_id]) {
2451 cram_ref_decr(fd->refs, last_ref_id);
2452 refs[last_ref_id] = NULL;
2453 }
2454 }
2455 s->ref_start = 1;
2456 pthread_mutex_lock(&fd->ref_lock);
2457 pthread_mutex_lock(&fd->refs->lock);
2458 s->ref_end = fd->refs->ref_id[cr->ref_id]->length;
2459 pthread_mutex_unlock(&fd->refs->lock);
2460 pthread_mutex_unlock(&fd->ref_lock);
2461
2462 last_ref_id = cr->ref_id;
2463 }
2464 } else {
2465 cr->ref_id = -1;
2466 }
2467 } else {
2468 cr->ref_id = ref_id; // Forced constant in CRAM 1.0
2469 }
2470 if (cr->ref_id >= bfd->nref) {
2471 fprintf(stderr, "Requested unknown reference ID %d\n", cr->ref_id);
2472 return -1;
2473 }
2474
2475 if (ds & CRAM_RL) {
2476 if (!c->comp_hdr->codecs[DS_RL]) return -1;
2477 r |= c->comp_hdr->codecs[DS_RL]
2478 ->decode(s, c->comp_hdr->codecs[DS_RL], blk,
2479 (char *)&cr->len, &out_sz);
2480 if (r) return r;
2481 if (cr->len < 0) {
2482 fprintf(stderr, "Read has negative length\n");
2483 return -1;
2484 }
2485 }
2486
2487 if (ds & CRAM_AP) {
2488 if (!c->comp_hdr->codecs[DS_AP]) return -1;
2489 r |= c->comp_hdr->codecs[DS_AP]
2490 ->decode(s, c->comp_hdr->codecs[DS_AP], blk,
2491 (char *)&cr->apos, &out_sz);
2492 if (r) return r;
2493 if (c->comp_hdr->AP_delta)
2494 cr->apos += s->last_apos;
2495 s->last_apos= cr->apos;
2496 } else {
2497 cr->apos = c->ref_seq_start;
2498 }
2499
2500 if (ds & CRAM_RG) {
2501 if (!c->comp_hdr->codecs[DS_RG]) return -1;
2502 r |= c->comp_hdr->codecs[DS_RG]
2503 ->decode(s, c->comp_hdr->codecs[DS_RG], blk,
2504 (char *)&cr->rg, &out_sz);
2505 if (r) return r;
2506 if (cr->rg == unknown_rg)
2507 cr->rg = -1;
2508 } else {
2509 cr->rg = -1;
2510 }
2511
2512 cr->name_len = 0;
2513
2514 if (c->comp_hdr->read_names_included) {
2515 int32_t out_sz2 = 1;
2516
2517 // Read directly into name cram_block
2518 cr->name = BLOCK_SIZE(s->name_blk);
2519 if (ds & CRAM_RN) {
2520 if (!c->comp_hdr->codecs[DS_RN]) return -1;
2521 r |= c->comp_hdr->codecs[DS_RN]
2522 ->decode(s, c->comp_hdr->codecs[DS_RN], blk,
2523 (char *)s->name_blk, &out_sz2);
2524 if (r) return r;
2525 cr->name_len = out_sz2;
2526 }
2527 }
2528
2529 cr->mate_pos = 0;
2530 cr->mate_line = -1;
2531 cr->mate_ref_id = -1;
2532 if ((ds & CRAM_CF) && (cf & CRAM_FLAG_DETACHED)) {
2533 if (ds & CRAM_MF) {
2534 if (CRAM_MAJOR_VERS(fd->version) == 1) {
2535 /* MF is byte in 1.0, int32 in 2.0 */
2536 unsigned char mf;
2537 if (!c->comp_hdr->codecs[DS_MF]) return -1;
2538 r |= c->comp_hdr->codecs[DS_MF]
2539 ->decode(s, c->comp_hdr->codecs[DS_MF],
2540 blk, (char *)&mf, &out_sz);
2541 if (r) return r;
2542 cr->mate_flags = mf;
2543 } else {
2544 if (!c->comp_hdr->codecs[DS_MF]) return -1;
2545 r |= c->comp_hdr->codecs[DS_MF]
2546 ->decode(s, c->comp_hdr->codecs[DS_MF],
2547 blk,
2548 (char *)&cr->mate_flags,
2549 &out_sz);
2550 if (r) return r;
2551 }
2552 } else {
2553 cr->mate_flags = 0;
2554 }
2555
2556 if (!c->comp_hdr->read_names_included) {
2557 int32_t out_sz2 = 1;
2558
2559 // Read directly into name cram_block
2560 cr->name = BLOCK_SIZE(s->name_blk);
2561 if (ds & CRAM_RN) {
2562 if (!c->comp_hdr->codecs[DS_RN]) return -1;
2563 r |= c->comp_hdr->codecs[DS_RN]
2564 ->decode(s, c->comp_hdr->codecs[DS_RN],
2565 blk, (char *)s->name_blk,
2566 &out_sz2);
2567 if (r) return r;
2568 cr->name_len = out_sz2;
2569 }
2570 }
2571
2572 if (ds & CRAM_NS) {
2573 if (!c->comp_hdr->codecs[DS_NS]) return -1;
2574 r |= c->comp_hdr->codecs[DS_NS]
2575 ->decode(s, c->comp_hdr->codecs[DS_NS], blk,
2576 (char *)&cr->mate_ref_id, &out_sz);
2577 if (r) return r;
2578 }
2579
2580 // Skip as mate_ref of "*" is legit. It doesn't mean unmapped, just unknown.
2581 // if (cr->mate_ref_id == -1 && cr->flags & 0x01) {
2582 // /* Paired, but unmapped */
2583 // cr->flags |= BAM_FMUNMAP;
2584 // }
2585
2586 if (ds & CRAM_NP) {
2587 if (!c->comp_hdr->codecs[DS_NP]) return -1;
2588 r |= c->comp_hdr->codecs[DS_NP]
2589 ->decode(s, c->comp_hdr->codecs[DS_NP], blk,
2590 (char *)&cr->mate_pos, &out_sz);
2591 if (r) return r;
2592 }
2593
2594 if (ds & CRAM_TS) {
2595 if (!c->comp_hdr->codecs[DS_TS]) return -1;
2596 r |= c->comp_hdr->codecs[DS_TS]
2597 ->decode(s, c->comp_hdr->codecs[DS_TS], blk,
2598 (char *)&cr->tlen, &out_sz);
2599 if (r) return r;
2600 } else {
2601 cr->tlen = INT_MIN;
2602 }
2603 } else if ((ds & CRAM_CF) && (cf & CRAM_FLAG_MATE_DOWNSTREAM)) {
2604 if (ds & CRAM_NF) {
2605 if (!c->comp_hdr->codecs[DS_NF]) return -1;
2606 r |= c->comp_hdr->codecs[DS_NF]
2607 ->decode(s, c->comp_hdr->codecs[DS_NF], blk,
2608 (char *)&cr->mate_line, &out_sz);
2609 if (r) return r;
2610 cr->mate_line += rec + 1;
2611
2612 //cr->name_len = sprintf(name, "%d", name_id++);
2613 //cr->name = DSTRING_LEN(name_ds);
2614 //dstring_nappend(name_ds, name, cr->name_len);
2615
2616 cr->mate_ref_id = -1;
2617 cr->tlen = INT_MIN;
2618 cr->mate_pos = 0;
2619 } else {
2620 cr->mate_flags = 0;
2621 cr->tlen = INT_MIN;
2622 }
2623 } else {
2624 cr->mate_flags = 0;
2625 cr->tlen = INT_MIN;
2626 }
2627 /*
2628 else if (!name[0]) {
2629 //name[0] = '?'; name[1] = 0;
2630 //cr->name_len = 1;
2631 //cr->name= DSTRING_LEN(s->name_ds);
2632 //dstring_nappend(s->name_ds, "?", 1);
2633
2634 cr->mate_ref_id = -1;
2635 cr->tlen = 0;
2636 cr->mate_pos = 0;
2637 }
2638 */
2639
2640 /* Auxiliary tags */
2641 has_MD = has_NM = 0;
2642 if (CRAM_MAJOR_VERS(fd->version) == 1)
2643 r |= cram_decode_aux_1_0(c, s, blk, cr);
2644 else
2645 r |= cram_decode_aux(c, s, blk, cr, &has_MD, &has_NM);
2646 if (r) return r;
2647
2648 /* Fake up dynamic string growth and appending */
2649 if (ds & CRAM_RL) {
2650 cr->seq = BLOCK_SIZE(s->seqs_blk);
2651 BLOCK_GROW(s->seqs_blk, cr->len);
2652 seq = (char *)BLOCK_END(s->seqs_blk);
2653 BLOCK_SIZE(s->seqs_blk) += cr->len;
2654
2655 if (!seq)
2656 return -1;
2657
2658 cr->qual = BLOCK_SIZE(s->qual_blk);
2659 BLOCK_GROW(s->qual_blk, cr->len);
2660 qual = (char *)BLOCK_END(s->qual_blk);
2661 BLOCK_SIZE(s->qual_blk) += cr->len;
2662
2663 if (!s->ref)
2664 memset(seq, '=', cr->len);
2665 }
2666
2667 if (!(bf & BAM_FUNMAP)) {
2668 if ((ds & CRAM_AP) && cr->apos <= 0) {
2669 fprintf(stderr,
2670 "Read has alignment position %d but no unmapped flag\n",
2671 cr->apos);
2672 return -1;
2673 }
2674 /* Decode sequence and generate CIGAR */
2675 if (ds & (CRAM_SEQ | CRAM_MQ)) {
2676 r |= cram_decode_seq(fd, c, s, blk, cr, bfd, cf, seq, qual,
2677 has_MD, has_NM);
2678 if (r) return r;
2679 } else {
2680 cr->cigar = 0;
2681 cr->ncigar = 0;
2682 cr->aend = cr->apos;
2683 cr->mqual = 0;
2684 }
2685 } else {
2686 int out_sz2 = cr->len;
2687
2688 //puts("Unmapped");
2689 cr->cigar = 0;
2690 cr->ncigar = 0;
2691 cr->aend = cr->apos;
2692 cr->mqual = 0;
2693
2694 if (ds & CRAM_BA && cr->len) {
2695 if (!c->comp_hdr->codecs[DS_BA]) return -1;
2696 r |= c->comp_hdr->codecs[DS_BA]
2697 ->decode(s, c->comp_hdr->codecs[DS_BA], blk,
2698 (char *)seq, &out_sz2);
2699 if (r) return r;
2700 }
2701
2702 if ((ds & CRAM_CF) && (cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
2703 out_sz2 = cr->len;
2704 if (ds & CRAM_QS && cr->len >= 0) {
2705 if (!c->comp_hdr->codecs[DS_QS]) return -1;
2706 r |= c->comp_hdr->codecs[DS_QS]
2707 ->decode(s, c->comp_hdr->codecs[DS_QS],
2708 blk, qual, &out_sz2);
2709 if (r) return r;
2710 }
2711 } else {
2712 if (ds & CRAM_RL)
2713 memset(qual, 255, cr->len);
2714 }
2715 }
2716 }
2717
2718 pthread_mutex_lock(&fd->ref_lock);
2719 if (refs) {
2720 int i;
2721 for (i = 0; i < fd->refs->nref; i++) {
2722 if (refs[i])
2723 cram_ref_decr(fd->refs, i);
2724 }
2725 free(refs);
2726 } else if (ref_id >= 0 && s->ref != fd->ref_free && !embed_ref) {
2727 cram_ref_decr(fd->refs, ref_id);
2728 }
2729 pthread_mutex_unlock(&fd->ref_lock);
2730
2731 /* Resolve mate pair cross-references between recs within this slice */
2732 r |= cram_decode_slice_xref(s, fd->required_fields);
2733
2734 // Free the original blocks as we no longer need these.
2735 {
2736 int i;
2737 for (i = 0; i < s->hdr->num_blocks; i++) {
2738 cram_block *b = s->block[i];
2739 cram_free_block(b);
2740 s->block[i] = NULL;
2741 }
2742 }
2743
2744 // Also see initial BLOCK_RESIZE_EXACT at top of function.
2745 // As we grow blocks we overallocate by up to 50%. So shrink
2746 // back to their final sizes here.
2747 //
2748 // fprintf(stderr, "%d %d // %d %d // %d %d // %d %d\n",
2749 // (int)s->seqs_blk->byte, (int)s->seqs_blk->alloc,
2750 // (int)s->qual_blk->byte, (int)s->qual_blk->alloc,
2751 // (int)s->name_blk->byte, (int)s->name_blk->alloc,
2752 // (int)s->aux_blk->byte, (int)s->aux_blk->alloc);
2753 BLOCK_RESIZE_EXACT(s->seqs_blk, BLOCK_SIZE(s->seqs_blk)+1);
2754 BLOCK_RESIZE_EXACT(s->qual_blk, BLOCK_SIZE(s->qual_blk)+1);
2755 BLOCK_RESIZE_EXACT(s->name_blk, BLOCK_SIZE(s->name_blk)+1);
2756 BLOCK_RESIZE_EXACT(s->aux_blk, BLOCK_SIZE(s->aux_blk)+1);
2757
2758 return r;
2759 }
2760
2761 typedef struct {
2762 cram_fd *fd;
2763 cram_container *c;
2764 cram_slice *s;
2765 SAM_hdr *h;
2766 int exit_code;
2767 } cram_decode_job;
2768
cram_decode_slice_thread(void * arg)2769 void *cram_decode_slice_thread(void *arg) {
2770 cram_decode_job *j = (cram_decode_job *)arg;
2771
2772 j->exit_code = cram_decode_slice(j->fd, j->c, j->s, j->h);
2773
2774 return j;
2775 }
2776
2777 /*
2778 * Spawn a multi-threaded version of cram_decode_slice().
2779 */
cram_decode_slice_mt(cram_fd * fd,cram_container * c,cram_slice * s,SAM_hdr * bfd)2780 int cram_decode_slice_mt(cram_fd *fd, cram_container *c, cram_slice *s,
2781 SAM_hdr *bfd) {
2782 cram_decode_job *j;
2783 int nonblock;
2784
2785 if (!fd->pool)
2786 return cram_decode_slice(fd, c, s, bfd);
2787
2788 if (!(j = malloc(sizeof(*j))))
2789 return -1;
2790
2791 j->fd = fd;
2792 j->c = c;
2793 j->s = s;
2794 j->h = bfd;
2795
2796 nonblock = hts_tpool_process_sz(fd->rqueue) ? 1 : 0;
2797
2798 if (-1 == hts_tpool_dispatch2(fd->pool, fd->rqueue, cram_decode_slice_thread,
2799 j, nonblock)) {
2800 /* Would block */
2801 fd->job_pending = j;
2802 } else {
2803 fd->job_pending = NULL;
2804 }
2805
2806 // flush too
2807 return 0;
2808 }
2809
2810
2811 /* ----------------------------------------------------------------------
2812 * CRAM sequence iterators.
2813 */
2814
2815 /*
2816 * Converts a cram in-memory record into a bam in-memory record. We
2817 * pass a pointer to a bam_seq_t pointer along with the a pointer to
2818 * the allocated size. These can initially be pointers to NULL and zero.
2819 *
2820 * This function will reallocate the bam buffer as required and update
2821 * (*bam)->alloc accordingly, allowing it to be used within a loop
2822 * efficiently without needing to allocate new bam objects over and
2823 * over again.
2824 *
2825 * Returns the used size of the bam record on success
2826 * -1 on failure.
2827 */
cram_to_bam(SAM_hdr * bfd,cram_fd * fd,cram_slice * s,cram_record * cr,int rec,bam_seq_t ** bam)2828 static int cram_to_bam(SAM_hdr *bfd, cram_fd *fd, cram_slice *s,
2829 cram_record *cr, int rec, bam_seq_t **bam) {
2830 int bam_idx, rg_len;
2831 char name_a[1024], *name;
2832 int name_len;
2833 char *aux, *aux_orig;
2834 char *seq, *qual;
2835
2836 /* Assign names if not explicitly set */
2837 if (fd->required_fields & SAM_QNAME) {
2838 if (cr->name_len) {
2839 name = (char *)BLOCK_DATA(s->name_blk) + cr->name;
2840 name_len = cr->name_len;
2841 } else {
2842 name = name_a;
2843 name_len = strlen(fd->prefix);
2844 memcpy(name, fd->prefix, name_len);
2845 name += name_len;
2846 *name++ = ':';
2847 if (cr->mate_line >= 0 && cr->mate_line < rec)
2848 name = (char *)append_uint64((unsigned char *)name,
2849 s->hdr->record_counter +
2850 cr->mate_line + 1);
2851 else
2852 name = (char *)append_uint64((unsigned char *)name,
2853 s->hdr->record_counter +
2854 rec + 1);
2855 name_len = name - name_a;
2856 name = name_a;
2857 }
2858 } else {
2859 name = "?";
2860 name_len = 1;
2861 }
2862
2863 /* Generate BAM record */
2864 if (cr->rg < -1 || cr->rg >= bfd->nrg)
2865 return -1;
2866 rg_len = (cr->rg != -1) ? bfd->rg[cr->rg].name_len + 4 : 0;
2867
2868 if (fd->required_fields & (SAM_SEQ | SAM_QUAL)) {
2869 if (!BLOCK_DATA(s->seqs_blk))
2870 return -1;
2871 seq = (char *)BLOCK_DATA(s->seqs_blk) + cr->seq;
2872 } else {
2873 seq = "*";
2874 cr->len = 0;
2875 }
2876
2877
2878 if (fd->required_fields & SAM_QUAL) {
2879 if (!BLOCK_DATA(s->qual_blk))
2880 return -1;
2881 qual = (char *)BLOCK_DATA(s->qual_blk) + cr->qual;
2882 } else {
2883 qual = NULL;
2884 }
2885
2886 bam_idx = bam_construct_seq(bam, cr->aux_size + rg_len,
2887 name, name_len,
2888 cr->flags,
2889 cr->ref_id,
2890 cr->apos,
2891 cr->aend,
2892 cr->mqual,
2893 cr->ncigar, &s->cigar[cr->cigar],
2894 cr->mate_ref_id,
2895 cr->mate_pos,
2896 cr->tlen,
2897 cr->len,
2898 seq,
2899 qual);
2900 if (bam_idx == -1)
2901 return -1;
2902
2903 aux = aux_orig = (char *)bam_aux(*bam);
2904
2905 /* Auxiliary strings */
2906 if (cr->aux_size != 0) {
2907 memcpy(aux, BLOCK_DATA(s->aux_blk) + cr->aux, cr->aux_size);
2908 aux += cr->aux_size;
2909 }
2910
2911 /* RG:Z: */
2912 if (cr->rg != -1) {
2913 int len = bfd->rg[cr->rg].name_len;
2914 *aux++ = 'R'; *aux++ = 'G'; *aux++ = 'Z';
2915 memcpy(aux, bfd->rg[cr->rg].name, len);
2916 aux += len;
2917 *aux++ = 0;
2918 }
2919
2920 return bam_idx + (aux - aux_orig);
2921 }
2922
2923 /*
2924 * Here be dragons! The multi-threading code in this is crufty beyond belief.
2925 */
cram_next_slice(cram_fd * fd,cram_container ** cp)2926 static cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp) {
2927 cram_container *c;
2928 cram_slice *s = NULL;
2929
2930 if (!(c = fd->ctr)) {
2931 // Load first container.
2932 do {
2933 if (!(c = fd->ctr = cram_read_container(fd)))
2934 return NULL;
2935 } while (c->length == 0);
2936
2937 /*
2938 * The first container may be a result of a sub-range query.
2939 * In which case it may still not be the optimal starting point
2940 * due to skipped containers/slices in the index.
2941 */
2942 if (fd->range.refid != -2) {
2943 while (c->ref_seq_id != -2 &&
2944 (c->ref_seq_id < fd->range.refid ||
2945 c->ref_seq_start + c->ref_seq_span-1 < fd->range.start)) {
2946 if (0 != cram_seek(fd, c->length, SEEK_CUR))
2947 return NULL;
2948 cram_free_container(fd->ctr);
2949 do {
2950 if (!(c = fd->ctr = cram_read_container(fd)))
2951 return NULL;
2952 } while (c->length == 0);
2953 }
2954
2955 if (c->ref_seq_id != -2 && c->ref_seq_id != fd->range.refid)
2956 return NULL;
2957 }
2958
2959 if (!(c->comp_hdr_block = cram_read_block(fd)))
2960 return NULL;
2961 if (c->comp_hdr_block->content_type != COMPRESSION_HEADER)
2962 return NULL;
2963
2964 c->comp_hdr = cram_decode_compression_header(fd, c->comp_hdr_block);
2965 if (!c->comp_hdr)
2966 return NULL;
2967 if (!c->comp_hdr->AP_delta &&
2968 sam_hdr_sort_order(fd->header) != ORDER_COORD) {
2969 pthread_mutex_lock(&fd->ref_lock);
2970 fd->unsorted = 1;
2971 pthread_mutex_unlock(&fd->ref_lock);
2972 }
2973 }
2974
2975 if ((s = c->slice)) {
2976 c->slice = NULL;
2977 cram_free_slice(s);
2978 s = NULL;
2979 }
2980
2981 if (c->curr_slice == c->max_slice) {
2982 if (fd->ctr == c)
2983 fd->ctr = NULL;
2984 cram_free_container(c);
2985 c = NULL;
2986 }
2987
2988 /* Sorry this is so contorted! */
2989 for (;;) {
2990 if (fd->job_pending) {
2991 cram_decode_job *j = (cram_decode_job *)fd->job_pending;
2992 c = j->c;
2993 s = j->s;
2994 free(fd->job_pending);
2995 fd->job_pending = NULL;
2996 } else if (!fd->ooc) {
2997 empty_container:
2998 if (!c || c->curr_slice == c->max_slice) {
2999 // new container
3000 do {
3001 if (!(c = fd->ctr = cram_read_container(fd))) {
3002 if (fd->pool) {
3003 fd->ooc = 1;
3004 break;
3005 }
3006
3007 return NULL;
3008 }
3009 } while (c->length == 0);
3010 if (fd->ooc)
3011 break;
3012
3013 /* Skip containers not yet spanning our range */
3014 if (fd->range.refid != -2 && c->ref_seq_id != -2) {
3015 fd->required_fields |= SAM_POS;
3016
3017 if (c->ref_seq_id != fd->range.refid) {
3018 cram_free_container(c);
3019 fd->ctr = NULL;
3020 fd->ooc = 1;
3021 fd->eof = 1;
3022 break;
3023 }
3024
3025 if (c->ref_seq_start > fd->range.end) {
3026 cram_free_container(c);
3027 fd->ctr = NULL;
3028 fd->ooc = 1;
3029 fd->eof = 1;
3030 break;
3031 }
3032
3033 if (c->ref_seq_start + c->ref_seq_span-1 <
3034 fd->range.start) {
3035 c->curr_rec = c->max_rec;
3036 c->curr_slice = c->max_slice;
3037 cram_seek(fd, c->length, SEEK_CUR);
3038 cram_free_container(c);
3039 c = NULL;
3040 continue;
3041 }
3042 }
3043
3044 if (!(c->comp_hdr_block = cram_read_block(fd)))
3045 return NULL;
3046 if (c->comp_hdr_block->content_type != COMPRESSION_HEADER)
3047 return NULL;
3048
3049 c->comp_hdr =
3050 cram_decode_compression_header(fd, c->comp_hdr_block);
3051 if (!c->comp_hdr)
3052 return NULL;
3053
3054 if (!c->comp_hdr->AP_delta &&
3055 sam_hdr_sort_order(fd->header) != ORDER_COORD) {
3056 pthread_mutex_lock(&fd->ref_lock);
3057 fd->unsorted = 1;
3058 pthread_mutex_unlock(&fd->ref_lock);
3059 }
3060 }
3061
3062 if (c->num_records == 0) {
3063 cram_free_container(c); c = NULL;
3064 goto empty_container;
3065 }
3066
3067
3068 if (!(s = c->slice = cram_read_slice(fd)))
3069 return NULL;
3070 c->curr_slice++;
3071 c->curr_rec = 0;
3072 c->max_rec = s->hdr->num_records;
3073
3074 s->last_apos = s->hdr->ref_seq_start;
3075
3076 /* Skip slices not yet spanning our range */
3077 if (fd->range.refid != -2 && s->hdr->ref_seq_id != -2) {
3078 if (s->hdr->ref_seq_id != fd->range.refid) {
3079 fd->eof = 1;
3080 cram_free_slice(s);
3081 c->slice = NULL;
3082 return NULL;
3083 }
3084
3085 if (s->hdr->ref_seq_start > fd->range.end) {
3086 fd->eof = 1;
3087 cram_free_slice(s);
3088 c->slice = NULL;
3089 return NULL;
3090 }
3091
3092 if (s->hdr->ref_seq_start + s->hdr->ref_seq_span-1 <
3093 fd->range.start) {
3094 cram_free_slice(s);
3095 c->slice = NULL;
3096 cram_free_container(c);
3097 c = NULL;
3098 continue;
3099 }
3100 }
3101 }
3102
3103 /* Test decoding of 1st seq */
3104 if (!c || !s)
3105 break;
3106
3107 if (cram_decode_slice_mt(fd, c, s, fd->header) != 0) {
3108 // if (cram_decode_slice(fd, c, s, fd->header) != 0) {
3109 fprintf(stderr, "Failure to decode slice\n");
3110 cram_free_slice(s);
3111 c->slice = NULL;
3112 return NULL;
3113 }
3114
3115 if (!fd->pool || fd->job_pending)
3116 break;
3117
3118 // Push it a bit far, to qsize in queue rather than pending arrival,
3119 // as cram tends to be a bit bursty in decode timings.
3120 if (hts_tpool_process_len(fd->rqueue) > hts_tpool_process_qsize(fd->rqueue))
3121 break;
3122 }
3123
3124 if (fd->pool) {
3125 hts_tpool_result *res;
3126 cram_decode_job *j;
3127
3128 // fprintf(stderr, "Thread pool len = %d, %d\n",
3129 // hts_tpool_results_queue_len(fd->rqueue),
3130 // hts_tpool_results_queue_sz(fd->rqueue));
3131
3132 if (fd->ooc && hts_tpool_process_empty(fd->rqueue))
3133 return NULL;
3134
3135 res = hts_tpool_next_result_wait(fd->rqueue);
3136
3137 if (!res || !hts_tpool_result_data(res)) {
3138 fprintf(stderr, "hts_tpool_next_result failure\n");
3139 return NULL;
3140 }
3141
3142 j = (cram_decode_job *)hts_tpool_result_data(res);
3143 c = j->c;
3144 s = j->s;
3145
3146 if (j->exit_code != 0) {
3147 fprintf(stderr, "Slice decode failure\n");
3148 fd->eof = 0;
3149 hts_tpool_delete_result(res, 1);
3150 return NULL;
3151 }
3152
3153 fd->ctr = c;
3154
3155 hts_tpool_delete_result(res, 1);
3156 }
3157
3158 *cp = c;
3159 return s;
3160 }
3161
3162 /*
3163 * Read the next cram record and return it.
3164 * Note that to decode cram_record the caller will need to look up some data
3165 * in the current slice, pointed to by fd->ctr->slice. This is valid until
3166 * the next call to cram_get_seq (which may invalidate it).
3167 *
3168 * Returns record pointer on success (do not free)
3169 * NULL on failure
3170 */
cram_get_seq(cram_fd * fd)3171 cram_record *cram_get_seq(cram_fd *fd) {
3172 cram_container *c;
3173 cram_slice *s;
3174
3175 for (;;) {
3176 c = fd->ctr;
3177 if (c && c->slice && c->curr_rec < c->max_rec) {
3178 s = c->slice;
3179 } else {
3180 if (!(s = cram_next_slice(fd, &c)))
3181 return NULL;
3182 continue; /* In case slice contains no records */
3183 }
3184
3185 if (fd->range.refid != -2) {
3186 if (fd->range.refid == -1 && s->crecs[c->curr_rec].ref_id != -1) {
3187 // Special case when looking for unmapped blocks at end.
3188 // If these are mixed in with mapped data (c->ref_id == -2)
3189 // then we need skip until we find the unmapped data, if at all
3190 c->curr_rec++;
3191 continue;
3192 }
3193 if (s->crecs[c->curr_rec].ref_id < fd->range.refid &&
3194 s->crecs[c->curr_rec].ref_id != -1) {
3195 // Looking for a mapped read, but not there yet. Special case
3196 // as -1 (unmapped) shouldn't be considered < refid.
3197 c->curr_rec++;
3198 continue;
3199 }
3200
3201 if (s->crecs[c->curr_rec].ref_id != fd->range.refid) {
3202 fd->eof = 1;
3203 cram_free_slice(s);
3204 c->slice = NULL;
3205 return NULL;
3206 }
3207
3208 if (fd->range.refid != -1 && s->crecs[c->curr_rec].apos > fd->range.end) {
3209 fd->eof = 1;
3210 cram_free_slice(s);
3211 c->slice = NULL;
3212 return NULL;
3213 }
3214
3215 if (fd->range.refid != -1 && s->crecs[c->curr_rec].aend < fd->range.start) {
3216 c->curr_rec++;
3217 continue;
3218 }
3219 }
3220
3221 break;
3222 }
3223
3224 fd->ctr = c;
3225 c->slice = s;
3226 return &s->crecs[c->curr_rec++];
3227 }
3228
3229 /*
3230 * Read the next cram record and convert it to a bam_seq_t struct.
3231 *
3232 * Returns 0 on success
3233 * -1 on EOF or failure (check fd->err)
3234 */
cram_get_bam_seq(cram_fd * fd,bam_seq_t ** bam)3235 int cram_get_bam_seq(cram_fd *fd, bam_seq_t **bam) {
3236 cram_record *cr;
3237 cram_container *c;
3238 cram_slice *s;
3239
3240 if (!(cr = cram_get_seq(fd)))
3241 return -1;
3242
3243 c = fd->ctr;
3244 s = c->slice;
3245
3246 return cram_to_bam(fd->header, fd, s, cr, c->curr_rec-1, bam);
3247 }
3248