1 /*
2 * Copyright (c) 2013, 2014, 2015 Genome Research Ltd.
3 * Author(s): James Bonfield, Rob Davies
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above
12 * copyright notice, this list of conditions and the following
13 * disclaimer in the documentation and/or other materials provided
14 * with the distribution.
15 *
16 * 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
17 * Institute nor the names of its contributors may be used to endorse
18 * or promote products derived from this software without specific
19 * prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
22 * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
24 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
25 * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33
34 /*
35 * Author: James Bonfield, Wellcome Trust Sanger Institute. 2013
36 *
37 * - In-memory decoding of CRAM data structures.
38 * - Iterator for reading CRAM record by record.
39 */
40
41 #ifdef HAVE_CONFIG_H
42 #include "io_lib_config.h"
43 #endif
44
45 #include <stdio.h>
46 #include <errno.h>
47 #include <assert.h>
48 #include <stdlib.h>
49 #include <string.h>
50 #include <zlib.h>
51 #include <sys/types.h>
52 #include <sys/stat.h>
53 #include <math.h>
54 #include <ctype.h>
55
56 #include "io_lib/cram.h"
57 #include "io_lib/os.h"
58 #include "io_lib/md5.h"
59 #include "io_lib/crc32.h"
60
61 //Whether CIGAR has just M or uses = and X to indicate match and mismatch
62 //#define USE_X
63
64
65 /* ----------------------------------------------------------------------
66 * CRAM compression headers
67 */
68
69 /*
70 * Decodes the Tag Dictionary record in the preservation map
71 * Updates the cram compression header.
72 *
73 * Returns number of bytes decoded on success
74 * -1 on failure
75 */
cram_decode_TD(char * cp,const char * endp,cram_block_compression_hdr * h)76 int cram_decode_TD(char *cp, const char *endp, cram_block_compression_hdr *h) {
77 char *op = cp;
78 unsigned char *dat;
79 cram_block *b;
80 int32_t blk_size = 0;
81 int nTL, i, sz;
82
83 if (!(b = cram_new_block(0, 0)))
84 return -1;
85
86 /* Decode */
87 cp += safe_itf8_get(cp, endp, &blk_size);
88 if (!blk_size) {
89 h->nTL = 0;
90 h->TL = NULL;
91 cram_free_block(b);
92 return cp - op;
93 }
94
95 if (blk_size < 0 || endp - cp < blk_size) {
96 cram_free_block(b);
97 return -1;
98 }
99
100 BLOCK_APPEND(b, cp, blk_size);
101 cp += blk_size;
102 sz = cp - op;
103
104 // Force nul termination if missing
105 if (BLOCK_DATA(b)[BLOCK_SIZE(b)-1])
106 BLOCK_APPEND_CHAR(b, '\0');
107
108 /* Set up TL lookup table */
109 dat = BLOCK_DATA(b);
110
111 // Count
112 for (nTL = i = 0; i < BLOCK_SIZE(b); i++) {
113 nTL++;
114 while (dat[i])
115 i++;
116 }
117
118 // Copy
119 h->nTL = nTL;
120 if (!(h->TL = calloc(h->nTL, sizeof(unsigned char *)))) {
121 cram_free_block(b);
122 return -1;
123 }
124 for (nTL = i = 0; i < BLOCK_SIZE(b); i++) {
125 h->TL[nTL++] = &dat[i];
126 while (dat[i])
127 i++;
128 }
129 h->TD_blk = b;
130
131 return sz;
132 }
133
134 /*
135 * Decodes a CRAM block compression header.
136 * Returns header ptr on success
137 * NULL on failure
138 */
cram_decode_compression_header(cram_fd * fd,cram_block * b)139 cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd,
140 cram_block *b) {
141 char *cp, *endp, *cp_copy;
142 cram_block_compression_hdr *hdr = calloc(1, sizeof(*hdr));
143 int i;
144 int32_t map_size = 0, map_count = 0;
145
146 if (!hdr)
147 return NULL;
148
149 if (b->method != RAW) {
150 if (cram_uncompress_block(b)) {
151 free(hdr);
152 return NULL;
153 }
154 }
155
156 cp = (char *)b->data;
157 endp = cp + b->uncomp_size;
158
159 if (IS_CRAM_1_VERS(fd)) {
160 int32_t i32;
161 cp += safe_itf8_get(cp, endp, &hdr->ref_seq_id);
162 if (CRAM_MAJOR_VERS(fd->version) >= 4) {
163 int64_t i64;
164 cp += safe_ltf8_get(cp, endp, &i64); hdr->ref_seq_start=i64;
165 cp += safe_ltf8_get(cp, endp, &i64); hdr->ref_seq_span=i64;
166 } else {
167 cp += safe_itf8_get(cp, endp, &i32); hdr->ref_seq_start=i32;
168 cp += safe_itf8_get(cp, endp, &i32); hdr->ref_seq_span=i32;
169 }
170 cp += safe_itf8_get(cp, endp, &hdr->num_records);
171 cp += safe_itf8_get(cp, endp, &hdr->num_landmarks);
172 if (!(hdr->landmark = malloc(hdr->num_landmarks * sizeof(int32_t)))) {
173 free(hdr);
174 return NULL;
175 }
176 for (i = 0; i < hdr->num_landmarks; i++) {
177 cp += safe_itf8_get(cp, endp, &hdr->landmark[i]);
178 }
179 }
180
181 hdr->preservation_map = HashTableCreate(4, HASH_NONVOLATILE_KEYS |
182 HASH_FUNC_TCL);
183 memset(hdr->rec_encoding_map, 0,
184 CRAM_MAP_HASH * sizeof(hdr->rec_encoding_map[0]));
185 memset(hdr->tag_encoding_map, 0,
186 CRAM_MAP_HASH * sizeof(hdr->tag_encoding_map[0]));
187
188 if (!hdr->preservation_map) {
189 cram_free_compression_header(hdr);
190 return NULL;
191 }
192
193 /* Initialise defaults for preservation map */
194 hdr->mapped_qs_included = 0;
195 hdr->unmapped_qs_included = 0;
196 hdr->unmapped_placed = 0;
197 hdr->qs_included = 0;
198 hdr->read_names_included = 0;
199 hdr->AP_delta = 1;
200 memcpy(hdr->substitution_matrix, "CGTNAGTNACTNACGNACGT", 20);
201
202 /* Preservation map */
203 cp += safe_itf8_get(cp, endp, &map_size); cp_copy = cp;
204 cp += safe_itf8_get(cp, endp, &map_count);
205 for (i = 0; i < map_count; i++) {
206 HashData hd;
207 if (endp - cp < 2) {
208 cram_free_compression_header(hdr);
209 return NULL;
210 }
211 cp += 2;
212 switch(CRAM_KEY(cp[-2],cp[-1])) {
213 case CRAM_KEY('M','I'):
214 hd.i = *cp++;
215 if (!HashTableAdd(hdr->preservation_map, "MI", 2, hd, NULL)) {
216 cram_free_compression_header(hdr);
217 return NULL;
218 }
219 hdr->mapped_qs_included = hd.i;
220 break;
221
222 case CRAM_KEY('U','I'):
223 hd.i = *cp++;
224 if (!HashTableAdd(hdr->preservation_map, "UI", 2, hd, NULL)) {
225 cram_free_compression_header(hdr);
226 return NULL;
227 }
228 hdr->unmapped_qs_included = hd.i;
229 break;
230
231 case CRAM_KEY('P','I'):
232 hd.i = *cp++;
233 if (!HashTableAdd(hdr->preservation_map, "PI", 2, hd, NULL)) {
234 cram_free_compression_header(hdr);
235 return NULL;
236 }
237 hdr->unmapped_placed = hd.i;
238 break;
239
240 case CRAM_KEY('R','N'):
241 hd.i = *cp++;
242 if (!HashTableAdd(hdr->preservation_map, "RN", 2, hd, NULL)) {
243 cram_free_compression_header(hdr);
244 return NULL;
245 }
246 hdr->read_names_included = hd.i;
247 break;
248
249 case CRAM_KEY('A','P'):
250 hd.i = *cp++;
251 if (!HashTableAdd(hdr->preservation_map, "AP", 2, hd, NULL)) {
252 cram_free_compression_header(hdr);
253 return NULL;
254 }
255 hdr->AP_delta = hd.i;
256 break;
257
258 case CRAM_KEY('R','R'):
259 hd.i = *cp++;
260 if (!HashTableAdd(hdr->preservation_map, "RR", 2, hd, NULL)) {
261 cram_free_compression_header(hdr);
262 return NULL;
263 }
264 hdr->no_ref = !hd.i;
265 break;
266
267 case CRAM_KEY('S','M'):
268 if (endp - cp < 5) {
269 cram_free_compression_header(hdr);
270 return NULL;
271 }
272 hdr->substitution_matrix[0][(cp[0]>>6)&3] = 'C';
273 hdr->substitution_matrix[0][(cp[0]>>4)&3] = 'G';
274 hdr->substitution_matrix[0][(cp[0]>>2)&3] = 'T';
275 hdr->substitution_matrix[0][(cp[0]>>0)&3] = 'N';
276
277 hdr->substitution_matrix[1][(cp[1]>>6)&3] = 'A';
278 hdr->substitution_matrix[1][(cp[1]>>4)&3] = 'G';
279 hdr->substitution_matrix[1][(cp[1]>>2)&3] = 'T';
280 hdr->substitution_matrix[1][(cp[1]>>0)&3] = 'N';
281
282 hdr->substitution_matrix[2][(cp[2]>>6)&3] = 'A';
283 hdr->substitution_matrix[2][(cp[2]>>4)&3] = 'C';
284 hdr->substitution_matrix[2][(cp[2]>>2)&3] = 'T';
285 hdr->substitution_matrix[2][(cp[2]>>0)&3] = 'N';
286
287 hdr->substitution_matrix[3][(cp[3]>>6)&3] = 'A';
288 hdr->substitution_matrix[3][(cp[3]>>4)&3] = 'C';
289 hdr->substitution_matrix[3][(cp[3]>>2)&3] = 'G';
290 hdr->substitution_matrix[3][(cp[3]>>0)&3] = 'N';
291
292 hdr->substitution_matrix[4][(cp[4]>>6)&3] = 'A';
293 hdr->substitution_matrix[4][(cp[4]>>4)&3] = 'C';
294 hdr->substitution_matrix[4][(cp[4]>>2)&3] = 'G';
295 hdr->substitution_matrix[4][(cp[4]>>0)&3] = 'T';
296 hd.p = cp;
297 cp += 5;
298 if (!HashTableAdd(hdr->preservation_map, "SM", 2, hd, NULL)) {
299 cram_free_compression_header(hdr);
300 return NULL;
301 }
302 break;
303
304 case CRAM_KEY('T','D'): {
305 int sz = cram_decode_TD(cp, endp, hdr); // tag dictionary
306 if (sz < 0) {
307 cram_free_compression_header(hdr);
308 return NULL;
309 }
310 hd.p = cp;
311 cp += sz;
312 if (!HashTableAdd(hdr->preservation_map, "TD", 2, hd, NULL)) {
313 cram_free_compression_header(hdr);
314 return NULL;
315 }
316 break;
317 }
318
319 default:
320 fprintf(stderr, "Unrecognised preservation map key %c%c\n",
321 cp[-2], cp[-1]);
322 // guess byte;
323 cp++;
324 break;
325 }
326 }
327 if (cp - cp_copy != map_size) {
328 cram_free_compression_header(hdr);
329 return NULL;
330 }
331
332 /* Record encoding map */
333 cp += safe_itf8_get(cp, endp, &map_size); cp_copy = cp;
334 cp += safe_itf8_get(cp, endp, &map_count);
335 int is_v4 = CRAM_MAJOR_VERS(fd->version) >= 4 ? 1 : 0;
336 for (i = 0; i < map_count; i++) {
337 char *key = cp;
338 int32_t encoding = E_NULL;
339 int32_t size = 0;
340 cram_map *m = malloc(sizeof(*m)); // FIXME: use pooled_alloc
341
342 if (!m || endp - cp < 4) {
343 free(m);
344 cram_free_compression_header(hdr);
345 return NULL;
346 }
347
348 cp += 2;
349 cp += safe_itf8_get(cp, endp, &encoding);
350 cp += safe_itf8_get(cp, endp, &size);
351
352 // Fill out cram_map purely for cram_dump to dump out.
353 m->key = (key[0]<<8)|key[1];
354 m->encoding = encoding;
355 m->size = size;
356 m->offset = cp - (char *)b->data;
357 m->codec = NULL;
358
359 if (m->encoding == E_NULL)
360 continue;
361
362 if (size < 0 || endp - cp < size) {
363 free(m);
364 cram_free_compression_header(hdr);
365 return NULL;
366 }
367
368 //printf("%s codes for %.2s\n", cram_encoding2str(encoding), key);
369
370 /*
371 * For CRAM1.0 CF and BF are Byte and not Int.
372 * Practically speaking it makes no difference unless we have a
373 * 1.0 format file that stores these in EXTERNAL as only then
374 * does Byte vs Int matter.
375 *
376 * Neither this C code nor Java reference implementations did this,
377 * so we gloss over it and treat them as int.
378 */
379
380 if (key[0] == 'B' && key[1] == 'F') {
381 if (!(hdr->codecs[DS_BF] = cram_decoder_init(encoding, cp, size, E_INT,
382 fd->version))) {
383 cram_free_compression_header(hdr);
384 return NULL;
385 }
386 } else if (key[0] == 'C' && key[1] == 'F') {
387 if (!(hdr->codecs[DS_CF] = cram_decoder_init(encoding, cp, size, E_INT,
388 fd->version))) {
389 cram_free_compression_header(hdr);
390 return NULL;
391 }
392 } else if (key[0] == 'R' && key[1] == 'I') {
393 if (!(hdr->codecs[DS_RI] = cram_decoder_init(encoding, cp, size, E_INT,
394 fd->version))) {
395 cram_free_compression_header(hdr);
396 return NULL;
397 }
398 } else if (key[0] == 'R' && key[1] == 'L') {
399 if (!(hdr->codecs[DS_RL] = cram_decoder_init(encoding, cp, size, E_INT,
400 fd->version))) {
401 cram_free_compression_header(hdr);
402 return NULL;
403 }
404 } else if (key[0] == 'A' && key[1] == 'P') {
405 if (!(hdr->codecs[DS_AP] = cram_decoder_init(encoding, cp, size,
406 is_v4 ? E_LONG : E_INT,
407 fd->version))) {
408 cram_free_compression_header(hdr);
409 return NULL;
410 }
411 } else if (key[0] == 'R' && key[1] == 'G') {
412 if (!(hdr->codecs[DS_RG] = cram_decoder_init(encoding, cp, size, E_INT,
413 fd->version))) {
414 cram_free_compression_header(hdr);
415 return NULL;
416 }
417 } else if (key[0] == 'M' && key[1] == 'F') {
418 if (!(hdr->codecs[DS_MF] = cram_decoder_init(encoding, cp, size, E_INT,
419 fd->version))) {
420 cram_free_compression_header(hdr);
421 return NULL;
422 }
423 } else if (key[0] == 'N' && key[1] == 'S') {
424 if (!(hdr->codecs[DS_NS] = cram_decoder_init(encoding, cp, size, E_INT,
425 fd->version))) {
426 cram_free_compression_header(hdr);
427 return NULL;
428 }
429 } else if (key[0] == 'N' && key[1] == 'P') {
430 if (!(hdr->codecs[DS_NP] = cram_decoder_init(encoding, cp, size,
431 is_v4 ? E_LONG : E_INT,
432 fd->version))) {
433 cram_free_compression_header(hdr);
434 return NULL;
435 }
436 } else if (key[0] == 'T' && key[1] == 'S') {
437 if (!(hdr->codecs[DS_TS] = cram_decoder_init(encoding, cp, size,
438 is_v4 ? E_LONG : E_INT,
439 fd->version))) {
440 cram_free_compression_header(hdr);
441 return NULL;
442 }
443 } else if (key[0] == 'N' && key[1] == 'F') {
444 if (!(hdr->codecs[DS_NF] = cram_decoder_init(encoding, cp, size, E_INT,
445 fd->version))) {
446 cram_free_compression_header(hdr);
447 return NULL;
448 }
449 } else if (key[0] == 'T' && key[1] == 'C') {
450 if (!(hdr->codecs[DS_TC] = cram_decoder_init(encoding, cp, size, E_BYTE,
451 fd->version))) {
452 cram_free_compression_header(hdr);
453 return NULL;
454 }
455 } else if (key[0] == 'T' && key[1] == 'N') {
456 if (!(hdr->codecs[DS_TN] = cram_decoder_init(encoding, cp, size, E_INT,
457 fd->version))) {
458 cram_free_compression_header(hdr);
459 return NULL;
460 }
461 } else if (key[0] == 'F' && key[1] == 'N') {
462 if (!(hdr->codecs[DS_FN] = cram_decoder_init(encoding, cp, size, E_INT,
463 fd->version))) {
464 cram_free_compression_header(hdr);
465 return NULL;
466 }
467 } else if (key[0] == 'F' && key[1] == 'C') {
468 if (!(hdr->codecs[DS_FC] = cram_decoder_init(encoding, cp, size, E_BYTE,
469 fd->version))) {
470 cram_free_compression_header(hdr);
471 return NULL;
472 }
473 } else if (key[0] == 'F' && key[1] == 'P') {
474 if (!(hdr->codecs[DS_FP] = cram_decoder_init(encoding, cp, size, E_INT,
475 fd->version))) {
476 cram_free_compression_header(hdr);
477 return NULL;
478 }
479 } else if (key[0] == 'B' && key[1] == 'S') {
480 if (!(hdr->codecs[DS_BS] = cram_decoder_init(encoding, cp, size, E_BYTE,
481 fd->version))) {
482 cram_free_compression_header(hdr);
483 return NULL;
484 }
485 } else if (key[0] == 'I' && key[1] == 'N') {
486 if (!(hdr->codecs[DS_IN] = cram_decoder_init(encoding, cp, size,
487 E_BYTE_ARRAY,
488 fd->version))) {
489 cram_free_compression_header(hdr);
490 return NULL;
491 }
492 } else if (key[0] == 'S' && key[1] == 'C') {
493 if (!(hdr->codecs[DS_SC] = cram_decoder_init(encoding, cp, size,
494 E_BYTE_ARRAY,
495 fd->version))) {
496 cram_free_compression_header(hdr);
497 return NULL;
498 }
499 } else if (key[0] == 'D' && key[1] == 'L') {
500 if (!(hdr->codecs[DS_DL] = cram_decoder_init(encoding, cp, size, E_INT,
501 fd->version))) {
502 cram_free_compression_header(hdr);
503 return NULL;
504 }
505 } else if (key[0] == 'B' && key[1] == 'A') {
506 if (!(hdr->codecs[DS_BA] = cram_decoder_init(encoding, cp, size, E_BYTE,
507 fd->version))) {
508 cram_free_compression_header(hdr);
509 return NULL;
510 }
511 } else if (key[0] == 'B' && key[1] == 'B') {
512 if (!(hdr->codecs[DS_BB] = cram_decoder_init(encoding, cp, size, E_BYTE_ARRAY,
513 fd->version))) {
514 cram_free_compression_header(hdr);
515 return NULL;
516 }
517 } else if (key[0] == 'R' && key[1] == 'S') {
518 if (!(hdr->codecs[DS_RS] = cram_decoder_init(encoding, cp, size, E_INT,
519 fd->version))) {
520 cram_free_compression_header(hdr);
521 return NULL;
522 }
523 } else if (key[0] == 'P' && key[1] == 'D') {
524 if (!(hdr->codecs[DS_PD] = cram_decoder_init(encoding, cp, size, E_INT,
525 fd->version))) {
526 cram_free_compression_header(hdr);
527 return NULL;
528 }
529 } else if (key[0] == 'H' && key[1] == 'C') {
530 if (!(hdr->codecs[DS_HC] = cram_decoder_init(encoding, cp, size, E_INT,
531 fd->version))) {
532 cram_free_compression_header(hdr);
533 return NULL;
534 }
535 } else if (key[0] == 'M' && key[1] == 'Q') {
536 if (!(hdr->codecs[DS_MQ] = cram_decoder_init(encoding, cp, size, E_INT,
537 fd->version))) {
538 cram_free_compression_header(hdr);
539 return NULL;
540 }
541 } else if (key[0] == 'R' && key[1] == 'N') {
542 if (!(hdr->codecs[DS_RN] = cram_decoder_init(encoding, cp, size,
543 E_BYTE_ARRAY_BLOCK,
544 fd->version))) {
545 cram_free_compression_header(hdr);
546 return NULL;
547 }
548 } else if (key[0] == 'Q' && key[1] == 'S') {
549 if (!(hdr->codecs[DS_QS] = cram_decoder_init(encoding, cp, size, E_BYTE,
550 fd->version))) {
551 cram_free_compression_header(hdr);
552 return NULL;
553 }
554 } else if (key[0] == 'Q' && key[1] == 'Q') {
555 if (!(hdr->codecs[DS_QQ] = cram_decoder_init(encoding, cp, size, E_BYTE_ARRAY,
556 fd->version))) {
557 cram_free_compression_header(hdr);
558 return NULL;
559 }
560 } else if (key[0] == 'T' && key[1] == 'L') {
561 if (!(hdr->codecs[DS_TL] = cram_decoder_init(encoding, cp, size, E_INT,
562 fd->version))) {
563 cram_free_compression_header(hdr);
564 return NULL;
565 }
566 } else if (key[0] == 'T' && key[1] == 'M') {
567 } else if (key[0] == 'T' && key[1] == 'V') {
568 } else
569 fprintf(stderr, "Unrecognised key: %.2s\n", key);
570
571 cp += size;
572
573 m->next = hdr->rec_encoding_map[CRAM_MAP(key[0], key[1])];
574 hdr->rec_encoding_map[CRAM_MAP(key[0], key[1])] = m;
575 }
576 if (cp - cp_copy != map_size) {
577 cram_free_compression_header(hdr);
578 return NULL;
579 }
580
581 /* Tag encoding map */
582 cp += safe_itf8_get(cp, endp, &map_size); cp_copy = cp;
583 cp += safe_itf8_get(cp, endp, &map_count);
584 for (i = 0; i < map_count; i++) {
585 int32_t encoding = E_NULL;
586 int32_t size = 0;
587 cram_map *m = malloc(sizeof(*m)); // FIXME: use pooled_alloc
588 uint8_t key[3];
589
590 if (!m || endp - cp < 6) {
591 free(m);
592 cram_free_compression_header(hdr);
593 return NULL;
594 }
595
596 cp += safe_itf8_get(cp, endp, &m->key);
597 key[0] = m->key>>16;
598 key[1] = m->key>>8;
599 key[2] = m->key;
600 cp += safe_itf8_get(cp, endp, &encoding);
601 cp += safe_itf8_get(cp, endp, &size);
602
603 m->encoding = encoding;
604 m->size = size;
605 m->offset = cp - (char *)b->data;
606 if (size < 0 || endp - cp < size ||
607 !(m->codec = cram_decoder_init(encoding, cp, size,
608 E_BYTE_ARRAY_BLOCK, fd->version))) {
609 cram_free_compression_header(hdr);
610 free(m);
611 return NULL;
612 }
613
614 cp += size;
615
616 m->next = hdr->tag_encoding_map[CRAM_MAP(key[0],key[1])];
617 hdr->tag_encoding_map[CRAM_MAP(key[0],key[1])] = m;
618 }
619 if (cp - cp_copy != map_size) {
620 cram_free_compression_header(hdr);
621 return NULL;
622 }
623
624 return hdr;
625 }
626
627
628 /*
629 * Note we also need to scan through the record encoding map to
630 * see which data series share the same block, either external or
631 * CORE. For example if we need the BF data series but MQ and CF
632 * are also encoded in the same block then we need to add those in
633 * as a dependency in order to correctly decode BF.
634 *
635 * Returns 0 on success
636 * -1 on failure
637 */
cram_dependent_data_series(cram_fd * fd,cram_block_compression_hdr * hdr,cram_slice * s)638 int cram_dependent_data_series(cram_fd *fd,
639 cram_block_compression_hdr *hdr,
640 cram_slice *s) {
641 int *block_used;
642 int core_used = 0;
643 int i;
644 static int i_to_id[] = {
645 DS_BF, DS_AP, DS_FP, DS_RL, DS_DL, DS_NF, DS_BA, DS_QS,
646 DS_FC, DS_FN, DS_BS, DS_IN, DS_RG, DS_MQ, DS_TL, DS_RN,
647 DS_NS, DS_NP, DS_TS, DS_MF, DS_CF, DS_RI, DS_RS, DS_PD,
648 DS_HC, DS_SC, DS_BB, DS_QQ,
649 };
650 uint32_t orig_ds;
651
652 /*
653 * Set the data_series bit field based on fd->required_fields
654 * contents.
655 */
656 if (fd->required_fields && fd->required_fields != INT_MAX) {
657 s->data_series = 0;
658
659 if (fd->required_fields & SAM_QNAME)
660 s->data_series |= CRAM_RN;
661
662 if (fd->required_fields & SAM_FLAG)
663 s->data_series |= CRAM_BF;
664
665 if (fd->required_fields & SAM_RNAME)
666 s->data_series |= CRAM_RI | CRAM_BF;
667
668 if (fd->required_fields & SAM_POS)
669 s->data_series |= CRAM_AP | CRAM_BF;
670
671 if (fd->required_fields & SAM_MAPQ)
672 s->data_series |= CRAM_MQ;
673
674 if (fd->required_fields & SAM_CIGAR)
675 s->data_series |= CRAM_CIGAR;
676
677 if (fd->required_fields & SAM_RNEXT)
678 s->data_series |= CRAM_CF | CRAM_NF | CRAM_RI | CRAM_NS |CRAM_BF;
679
680 if (fd->required_fields & SAM_PNEXT)
681 s->data_series |= CRAM_CF | CRAM_NF | CRAM_AP | CRAM_NP | CRAM_BF;
682
683 if (fd->required_fields & SAM_TLEN)
684 s->data_series |= CRAM_CF | CRAM_NF | CRAM_AP | CRAM_TS |
685 CRAM_BF | CRAM_MF | CRAM_RI | CRAM_CIGAR;
686
687 if (fd->required_fields & SAM_SEQ)
688 s->data_series |= CRAM_SEQ;
689
690 if (!(fd->required_fields & SAM_AUX))
691 // No easy way to get MD/NM without other tags at present
692 s->decode_md = 0;
693
694 if (fd->required_fields & SAM_QUAL) {
695 s->data_series |= CRAM_QUAL;
696 if (CRAM_MAJOR_VERS(fd->version) >= 4)
697 s->data_series |= CRAM_BF;
698 }
699
700 if (fd->required_fields & SAM_AUX)
701 s->data_series |= CRAM_RG | CRAM_TL | CRAM_aux;
702
703 if (fd->required_fields & SAM_RGAUX)
704 s->data_series |= CRAM_RG | CRAM_BF;
705
706 // Always uncompress CORE block
707 if (cram_uncompress_block(s->block[0]))
708 return -1;
709 } else {
710 s->data_series = CRAM_ALL;
711
712 for (i = 0; i < s->hdr->num_blocks; i++) {
713 if (cram_uncompress_block(s->block[i]))
714 return -1;
715 }
716
717 return 0;
718 }
719
720 block_used = calloc(s->hdr->num_blocks+1, sizeof(int));
721 if (!block_used)
722 return -1;
723
724 do {
725 /*
726 * Also set data_series based on code prerequisites. Eg if we need
727 * CRAM_QS then we also need to know CRAM_RL so we know how long it
728 * is, or if we need FC/FP then we also need FN (number of features).
729 *
730 * It's not reciprocal though. We may be needing to decode FN
731 * but have no need to decode FC, FP and cigar ops.
732 */
733 if (s->data_series & CRAM_RS) s->data_series |= CRAM_FC|CRAM_FP;
734 if (s->data_series & CRAM_PD) s->data_series |= CRAM_FC|CRAM_FP;
735 if (s->data_series & CRAM_HC) s->data_series |= CRAM_FC|CRAM_FP;
736 if (s->data_series & CRAM_QS) s->data_series |= CRAM_FC|CRAM_FP;
737 if (s->data_series & CRAM_IN) s->data_series |= CRAM_FC|CRAM_FP;
738 if (s->data_series & CRAM_SC) s->data_series |= CRAM_FC|CRAM_FP;
739 if (s->data_series & CRAM_BS) s->data_series |= CRAM_FC|CRAM_FP;
740 if (s->data_series & CRAM_DL) s->data_series |= CRAM_FC|CRAM_FP;
741 if (s->data_series & CRAM_BA) s->data_series |= CRAM_FC|CRAM_FP;
742 if (s->data_series & CRAM_BB) s->data_series |= CRAM_FC|CRAM_FP;
743 if (s->data_series & CRAM_QQ) s->data_series |= CRAM_FC|CRAM_FP;
744
745 // cram_decode_seq() needs seq[] array
746 if (s->data_series & (CRAM_SEQ|CRAM_CIGAR)) s->data_series |= CRAM_RL;
747
748 if (s->data_series & CRAM_FP) s->data_series |= CRAM_FC;
749 if (s->data_series & CRAM_FC) s->data_series |= CRAM_FN;
750 if (s->data_series & CRAM_aux) s->data_series |= CRAM_TL;
751 if (s->data_series & CRAM_MF) s->data_series |= CRAM_CF;
752 if (s->data_series & CRAM_MQ) s->data_series |= CRAM_BF;
753 if (s->data_series & CRAM_BS) s->data_series |= CRAM_RI;
754 if (s->data_series & (CRAM_MF |CRAM_NS |CRAM_NP |CRAM_TS |CRAM_NF))
755 s->data_series |= CRAM_CF;
756 if (!hdr->read_names_included && s->data_series & CRAM_RN)
757 s->data_series |= CRAM_CF | CRAM_NF;
758 if (s->data_series & (CRAM_BA | CRAM_QS | CRAM_BB | CRAM_QQ))
759 s->data_series |= CRAM_BF | CRAM_CF | CRAM_RL;
760
761 orig_ds = s->data_series;
762
763 // Find which blocks are in use.
764 for (i = 0; i < sizeof(i_to_id)/sizeof(*i_to_id); i++) {
765 int bnum1, bnum2, j;
766 cram_codec *c = hdr->codecs[i_to_id[i]];
767
768 if (!(s->data_series & (1<<i)))
769 continue;
770
771 if (!c)
772 continue;
773
774 bnum1 = cram_codec_to_id(c, &bnum2);
775
776 for (;;) {
777 switch (bnum1) {
778 case -2:
779 break;
780
781 case -1:
782 core_used = 1;
783 break;
784
785 default:
786 for (j = 0; j < s->hdr->num_blocks; j++) {
787 if (s->block[j]->content_type == EXTERNAL &&
788 s->block[j]->content_id == bnum1) {
789 block_used[j] = 1;
790 if (cram_uncompress_block(s->block[j])) {
791 free(block_used);
792 return -1;
793 }
794 }
795 }
796 break;
797 }
798
799 if (bnum2 == -2 || bnum1 == bnum2)
800 break;
801
802 bnum1 = bnum2; // 2nd pass
803 }
804 }
805
806 // Tags too
807 if ((fd->required_fields & SAM_AUX) ||
808 (s->data_series & CRAM_aux)) {
809 for (i = 0; i < CRAM_MAP_HASH; i++) {
810 int bnum1, bnum2, j;
811 cram_map *m = hdr->tag_encoding_map[i];
812
813 while (m) {
814 cram_codec *c = m->codec;
815 if (!c) {
816 m = m->next;
817 continue;
818 }
819
820 bnum1 = cram_codec_to_id(c, &bnum2);
821
822 for (;;) {
823 switch (bnum1) {
824 case -2:
825 break;
826
827 case -1:
828 core_used = 1;
829 break;
830
831 default:
832 for (j = 0; j < s->hdr->num_blocks; j++) {
833 if (s->block[j]->content_type == EXTERNAL &&
834 s->block[j]->content_id == bnum1) {
835 block_used[j] = 1;
836 if (cram_uncompress_block(s->block[j])) {
837 free(block_used);
838 return -1;
839 }
840 }
841 }
842 break;
843 }
844
845 if (bnum2 == -2 || bnum1 == bnum2)
846 break;
847
848 bnum1 = bnum2; // 2nd pass
849 }
850
851 m = m->next;
852 }
853 }
854 }
855
856 // We now know which blocks are in used, so repeat and find
857 // which other data series need to be added.
858 for (i = 0; i < sizeof(i_to_id)/sizeof(*i_to_id); i++) {
859 int bnum1, bnum2, j;
860 cram_codec *c = hdr->codecs[i_to_id[i]];
861
862 if (!c)
863 continue;
864
865 bnum1 = cram_codec_to_id(c, &bnum2);
866
867 for (;;) {
868 switch (bnum1) {
869 case -2:
870 break;
871
872 case -1:
873 if (core_used) {
874 //printf(" + data series %08x:\n", 1<<i);
875 s->data_series |= 1<<i;
876 }
877 break;
878
879 default:
880 for (j = 0; j < s->hdr->num_blocks; j++) {
881 if (s->block[j]->content_type == EXTERNAL &&
882 s->block[j]->content_id == bnum1) {
883 if (block_used[j]) {
884 //printf(" + data series %08x:\n", 1<<i);
885 s->data_series |= 1<<i;
886 }
887 }
888 }
889 break;
890 }
891
892 if (bnum2 == -2 || bnum1 == bnum2)
893 break;
894
895 bnum1 = bnum2; // 2nd pass
896 }
897 }
898
899 // Tags too
900 for (i = 0; i < CRAM_MAP_HASH; i++) {
901 int bnum1, bnum2, j;
902 cram_map *m = hdr->tag_encoding_map[i];
903
904 while (m) {
905 cram_codec *c = m->codec;
906 if (!c) {
907 m = m->next;
908 continue;
909 }
910
911 bnum1 = cram_codec_to_id(c, &bnum2);
912
913 for (;;) {
914 switch (bnum1) {
915 case -2:
916 break;
917
918 case -1:
919 //printf(" + data series %08x:\n", CRAM_aux);
920 s->data_series |= CRAM_aux;
921 break;
922
923 default:
924 for (j = 0; j < s->hdr->num_blocks; j++) {
925 if (s->block[j]->content_type == EXTERNAL &&
926 s->block[j]->content_id == bnum1) {
927 if (block_used[j]) {
928 //printf(" + data series %08x:\n",
929 // CRAM_aux);
930 s->data_series |= CRAM_aux;
931 }
932 }
933 }
934 break;
935 }
936
937 if (bnum2 == -2 || bnum1 == bnum2)
938 break;
939
940 bnum1 = bnum2; // 2nd pass
941 }
942
943 m = m->next;
944 }
945 }
946 } while (orig_ds != s->data_series);
947
948 free(block_used);
949 return 0;
950 }
951
952 /*
953 * Checks whether an external block is used solely by a single data series.
954 * Returns the codec type if so (EXTERNAL, BYTE_ARRAY_LEN, BYTE_ARRAY_STOP)
955 * or 0 if not (E_NULL).
956 */
cram_ds_unique(cram_block_compression_hdr * hdr,cram_codec * c,int id)957 static int cram_ds_unique(cram_block_compression_hdr *hdr, cram_codec *c,
958 int id) {
959 int i, n_id = 0;
960 enum cram_encoding e_type = 0;
961
962 for (i = 0; i < DS_END; i++) {
963 cram_codec *c;
964 int bnum1, bnum2, old_n_id;
965
966 if (!(c = hdr->codecs[i]))
967 continue;
968
969 bnum1 = cram_codec_to_id(c, &bnum2);
970
971 old_n_id = n_id;
972 if (bnum1 == id) {
973 n_id++;
974 e_type = c->codec;
975 }
976 if (bnum2 == id) {
977 n_id++;
978 e_type = c->codec;
979 }
980
981 if (n_id == old_n_id+2)
982 n_id--; // len/val in same place counts once only.
983 }
984
985 return n_id == 1 ? e_type : 0;
986 }
987
988 /*
989 * Attempts to estimate the size of some blocks so we can preallocate them
990 * before decoding. Although decoding will automatically grow the blocks,
991 * it is typically more efficient to preallocate.
992 */
cram_decode_estimate_sizes(cram_block_compression_hdr * hdr,cram_slice * s,int * qual_size,int * name_size,int * q_id)993 void cram_decode_estimate_sizes(cram_block_compression_hdr *hdr, cram_slice *s,
994 int *qual_size, int *name_size,
995 int *q_id) {
996 int bnum1, bnum2;
997 cram_codec *cd;
998
999 *qual_size = 0;
1000 *name_size = 0;
1001
1002 /* Qual */
1003 if (!(cd = hdr->codecs[DS_QS]))
1004 return;
1005
1006 bnum1 = cram_codec_to_id(cd, &bnum2);
1007 if (bnum1 < 0 && bnum2 >= 0) bnum1 = bnum2;
1008 if (cram_ds_unique(hdr, cd, bnum1)) {
1009 cram_block *b = cram_get_block_by_id(s, bnum1);
1010 if (b) *qual_size = b->uncomp_size;
1011 if (q_id && cd->codec == E_EXTERNAL)
1012 *q_id = bnum1;
1013 }
1014
1015 /* Name */
1016 cd = hdr->codecs[DS_RN];
1017 bnum1 = cram_codec_to_id(cd, &bnum2);
1018 if (bnum1 < 0 && bnum2 >= 0) bnum1 = bnum2;
1019 if (cram_ds_unique(hdr, cd, bnum1)) {
1020 cram_block *b = cram_get_block_by_id(s, bnum1);
1021 if (b) *name_size = b->uncomp_size;
1022 }
1023 }
1024
1025
1026 /* ----------------------------------------------------------------------
1027 * CRAM slices
1028 */
1029
1030 /*
1031 * Decodes a CRAM (un)mapped slice header block.
1032 * Returns slice header ptr on success
1033 * NULL on failure
1034 */
cram_decode_slice_header(cram_fd * fd,cram_block * b)1035 cram_block_slice_hdr *cram_decode_slice_header(cram_fd *fd, cram_block *b) {
1036 cram_block_slice_hdr *hdr;
1037 unsigned char *cp;
1038 unsigned char *cp_end;
1039 int i;
1040
1041 if (b->method != RAW) {
1042 /* Spec. says slice header should be RAW, but we can future-proof
1043 by trying to decode it if it isn't. */
1044 if (cram_uncompress_block(b) < 0)
1045 return NULL;
1046 }
1047
1048 cp = (unsigned char *)BLOCK_DATA(b);
1049 cp_end = cp + b->uncomp_size;
1050
1051 if (b->content_type != MAPPED_SLICE &&
1052 b->content_type != UNMAPPED_SLICE)
1053 return NULL;
1054
1055 if (!(hdr = calloc(1, sizeof(*hdr))))
1056 return NULL;
1057
1058 hdr->content_type = b->content_type;
1059
1060 if (b->content_type == MAPPED_SLICE) {
1061 int32_t i32;
1062 cp += safe_itf8_get((char *)cp, (char *)cp_end, &hdr->ref_seq_id);
1063 if (CRAM_MAJOR_VERS(fd->version) >= 4) {
1064 int64_t i64;
1065 cp += safe_ltf8_get((char *)cp, (char *)cp_end, &i64);
1066 hdr->ref_seq_start = i64;
1067 cp += safe_ltf8_get((char *)cp, (char *)cp_end, &i64);
1068 hdr->ref_seq_span = i64;
1069 } else {
1070 cp += safe_itf8_get((char *)cp, (char *)cp_end, &i32);
1071 hdr->ref_seq_start = i32;
1072 cp += safe_itf8_get((char *)cp, (char *)cp_end, &i32);
1073 hdr->ref_seq_span = i32;
1074 }
1075 }
1076 cp += safe_itf8_get((char *)cp, (char *) cp_end, &hdr->num_records);
1077 hdr->record_counter = 0;
1078 if (CRAM_MAJOR_VERS(fd->version) == 2) {
1079 int32_t i32 = 0;
1080 cp += safe_itf8_get((char *)cp, (char *)cp_end, &i32);
1081 hdr->record_counter = i32;
1082 } else if (CRAM_MAJOR_VERS(fd->version) >= 3) {
1083 cp += safe_ltf8_get((char *)cp, (char *)cp_end, &hdr->record_counter);
1084 }
1085 cp += safe_itf8_get((char *)cp, (char *)cp_end, &hdr->num_blocks);
1086 cp += safe_itf8_get((char *)cp, (char *)cp_end, &hdr->num_content_ids);
1087 if (hdr->num_content_ids < 1 ||
1088 hdr->num_content_ids >= SIZE_MAX / sizeof(int32_t)) {
1089 /* Slice must have at least one data block,
1090 and malloc'd size shouldn't wrap. */
1091 free(hdr);
1092 return NULL;
1093 }
1094 hdr->block_content_ids = malloc(hdr->num_content_ids * sizeof(int32_t));
1095 if (!hdr->block_content_ids) {
1096 free(hdr);
1097 return NULL;
1098 }
1099
1100 for (i = 0; i < hdr->num_content_ids; i++) {
1101 int l = safe_itf8_get((char *)cp, (char *) cp_end,
1102 &hdr->block_content_ids[i]);
1103 if (l <= 0) {
1104 free(hdr->block_content_ids);
1105 free(hdr);
1106 return NULL;
1107 }
1108 cp += l;
1109 }
1110
1111 if (b->content_type == MAPPED_SLICE) {
1112 cp += safe_itf8_get((char *)cp, (char *) cp_end, &hdr->ref_base_id);
1113 }
1114
1115 if (!IS_CRAM_1_VERS(fd)) {
1116 if (cp_end - cp < 16) {
1117 free(hdr->block_content_ids);
1118 free(hdr);
1119 return NULL;
1120 }
1121 memcpy(hdr->md5, cp, 16);
1122 cp += 16;
1123 } else {
1124 memset(hdr->md5, 0, 16);
1125 }
1126
1127 // Decode any optional tag:type:value fields
1128 if (cp == cp_end)
1129 return hdr;
1130
1131 hdr->tags = HashTableCreate(4, HASH_FUNC_TCL);
1132 while (cp <= cp_end) {
1133 unsigned int sub_len;
1134 unsigned char id[3];
1135 HashData hd;
1136
1137 if (cp_end - cp < 4)
1138 return hdr;
1139
1140 id[0] = cp[0];
1141 id[1] = cp[1];
1142 id[2] = '\0';
1143
1144 switch (cp[2]) {
1145 case 'c':
1146 id[2] = 'i';
1147 hd.i = (int8_t)(cp[3]);
1148 cp += 4;
1149 break;
1150
1151 case 'C':
1152 id[2] = 'i';
1153 hd.i = (uint8_t)(cp[3]);
1154 cp += 4;
1155 break;
1156
1157 case 's':
1158 if (cp_end - cp < 5) break;
1159 id[2] = 'i';
1160 hd.i = (int16_t)(cp[3] + (cp[4]<<8));
1161 cp += 5;
1162 break;
1163
1164 case 'S':
1165 if (cp_end - cp < 5) break;
1166 id[2] = 'i';
1167 hd.i = (uint16_t)(cp[3] + (cp[4]<<8));
1168 cp += 5;
1169 break;
1170
1171 case 'i':
1172 if (cp_end - cp < 7) break;
1173 id[2] = 'i';
1174 hd.i = (int32_t)(cp[3] + (cp[4]<<8) + (cp[5]<<16) + (cp[6]<<24));
1175 cp += 7;
1176 break;
1177
1178 case 'I':
1179 if (cp_end - cp < 7) break;
1180 id[2] = 'i';
1181 hd.i = (uint32_t)(cp[3] + (cp[4]<<8) + (cp[5]<<16) + (cp[6]<<24));
1182 cp += 7;
1183 break;
1184
1185 case 'f':
1186 if (cp_end - cp < 7) break;
1187 id[2] = cp[2];
1188 hd.f = bam_aux_f(cp+2);
1189 cp += 7;
1190 break;
1191
1192 case 'A':
1193 id[2] = cp[2];
1194 hd.i = cp[3];
1195 cp += 4;
1196 break;
1197
1198 case 'Z': case 'H':
1199 id[2] = cp[2];
1200 hd.p = &cp[3];
1201 cp += 3;
1202 while (cp < cp_end && *cp != '\0') cp++;
1203 if (cp < cp_end) {
1204 /* Skip NUL */
1205 cp++;
1206 } else {
1207 /* Add missing NUL termination */
1208 assert(cp == BLOCK_DATA(b) + b->uncomp_size);
1209 BLOCK_RESIZE(b, b->uncomp_size + 1);
1210 cp = cp_end = BLOCK_DATA(b) + b->uncomp_size;
1211 *cp = '\0';
1212 }
1213 break;
1214
1215 case 'B':
1216 // <type>B<code><4-len><...>
1217 if (cp_end - cp < 8) break;
1218 hd.p = &cp[3];
1219 sub_len = cp[4] + (cp[5]<<8) + (cp[6]<<16) + (cp[7]<<24);
1220 switch (cp[3]) {
1221 case 'c': case 'C':
1222 if (cp_end - cp < 8 + sub_len) break;
1223 id[2] = cp[2];
1224 cp += 8 + sub_len;
1225 break;
1226 case 's': case 'S':
1227 if (cp_end - cp < 8 + 2 * sub_len) break;
1228 id[2] = cp[2];
1229 cp += 8 + 2*sub_len;
1230 break;
1231 case 'i': case 'I': case 'f':
1232 if (cp_end - cp < 8 + 4 * sub_len) break;
1233 id[2] = cp[2];
1234 cp += 8 + 4*sub_len;
1235 break;
1236 default:
1237 fprintf(stderr, "Unknown aux type 'B' sub-code.\n");
1238 cp = cp_end;
1239 }
1240 break;
1241
1242 default:
1243 fprintf(stderr, "Unknown aux type.\n");
1244 cp = cp_end;
1245 }
1246
1247 if (id[2] != '\0') {
1248 HashTableAdd(hdr->tags, (char *)id, 3, hd, NULL);
1249 } else {
1250 cp = cp_end;
1251 break;
1252 }
1253
1254 if (id[0] == 'B' && id[1] == 'D' && id[2] == 'B') {
1255 unsigned char *p = hd.p;
1256 hdr->BD_crc = p[5] | (p[6]<<8) | (p[7]<<16) | (p[8]<<24);
1257 }
1258
1259 if (id[0] == 'S' && id[1] == 'D' && id[2] == 'B') {
1260 unsigned char *p = hd.p;
1261 hdr->SD_crc = p[5] | (p[6]<<8) | (p[7]<<16) | (p[8]<<24);
1262 }
1263 }
1264
1265 return hdr;
1266 }
1267
1268
1269 #if 0
1270 /* Returns the number of bits set in val; it the highest bit used */
1271 static int nbits(int v) {
1272 static const int MultiplyDeBruijnBitPosition[32] = {
1273 1, 10, 2, 11, 14, 22, 3, 30, 12, 15, 17, 19, 23, 26, 4, 31,
1274 9, 13, 21, 29, 16, 18, 25, 8, 20, 28, 24, 7, 27, 6, 5, 32
1275 };
1276
1277 v |= v >> 1; // first up to set all bits 1 after the first 1 */
1278 v |= v >> 2;
1279 v |= v >> 4;
1280 v |= v >> 8;
1281 v |= v >> 16;
1282
1283 // DeBruijn magic to find top bit
1284 return MultiplyDeBruijnBitPosition[(uint32_t)(v * 0x07C4ACDDU) >> 27];
1285 }
1286 #endif
1287
1288 #if 0
1289 static int sort_freqs(const void *vp1, const void *vp2) {
1290 const int i1 = *(const int *)vp1;
1291 const int i2 = *(const int *)vp2;
1292 return i1-i2;
1293 }
1294 #endif
1295
1296 /* ----------------------------------------------------------------------
1297 * Primary CRAM sequence decoder
1298 */
1299
1300 /*
1301 * Internal part of cram_decode_slice().
1302 * Generates the sequence, quality and cigar components.
1303 */
cram_decode_seq(cram_fd * fd,cram_container * c,cram_slice * s,cram_block * blk,cram_record * cr,SAM_hdr * bfd,int cf,char * seq,char * qual,int has_MD,int has_NM)1304 static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
1305 cram_block *blk, cram_record *cr, SAM_hdr *bfd,
1306 int cf, char *seq, char *qual,
1307 int has_MD, int has_NM) {
1308 int prev_pos = 0, f, r = 0, out_sz = 1;
1309 int seq_pos = 1;
1310 int cig_len = 0;
1311 int64_t ref_pos = cr->apos;
1312 int32_t fn, i32;
1313 enum cigar_op cig_op = BAM_CMATCH;
1314 uint32_t *cigar = s->cigar;
1315 uint32_t ncigar = s->ncigar;
1316 uint32_t cigar_alloc = s->cigar_alloc;
1317 uint32_t nm = 0;
1318 int32_t md_dist = 0;
1319 int orig_aux = 0;
1320 int decode_md = s->decode_md && s->ref && !has_MD;
1321 int decode_nm = s->decode_md && s->ref && !has_NM;
1322 uint32_t ds = s->data_series;
1323
1324 if ((ds & CRAM_QS) && !(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
1325 memset(qual, 255, cr->len);
1326 }
1327
1328 if (cr->cram_flags & CRAM_FLAG_NO_SEQ)
1329 decode_md = decode_nm = 0;
1330
1331 if (decode_md) {
1332 orig_aux = BLOCK_SIZE(s->aux_blk);
1333 BLOCK_APPEND(s->aux_blk, "MDZ", 3);
1334 }
1335
1336 if (ds & CRAM_FN) {
1337 if (!c->comp_hdr->codecs[DS_FN]) return -1;
1338 r |= c->comp_hdr->codecs[DS_FN]->decode(s,c->comp_hdr->codecs[DS_FN],
1339 blk, (char *)&fn, &out_sz);
1340 if (r) return r;
1341 } else {
1342 fn = 0;
1343 }
1344
1345 ref_pos--; // count from 0
1346 cr->cigar = ncigar;
1347
1348 if (!(ds & (CRAM_FC | CRAM_FP)))
1349 goto skip_cigar;
1350
1351 for (f = 0; f < fn; f++) {
1352 int32_t pos = 0;
1353 char op;
1354
1355 if (ncigar+2 >= cigar_alloc) {
1356 cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024;
1357 s->cigar = cigar;
1358 if (!(cigar = realloc(cigar, cigar_alloc * sizeof(*cigar))))
1359 return -1;
1360 }
1361
1362 if (ds & CRAM_FC) {
1363 if (!c->comp_hdr->codecs[DS_FC]) return -1;
1364 r |= c->comp_hdr->codecs[DS_FC]->decode(s,
1365 c->comp_hdr->codecs[DS_FC],
1366 blk,
1367 &op, &out_sz);
1368 if (r) return r;
1369 }
1370
1371 if (!(ds & CRAM_FP))
1372 continue;
1373
1374 if (!c->comp_hdr->codecs[DS_FP]) return -1;
1375 r |= c->comp_hdr->codecs[DS_FP]->decode(s,
1376 c->comp_hdr->codecs[DS_FP],
1377 blk,
1378 (char *)&pos, &out_sz);
1379 if (r) return r;
1380 pos += prev_pos;
1381
1382 if (pos <= 0) {
1383 fprintf(stderr, "Error: feature position %d before start of read.\n",
1384 pos);
1385 return -1;
1386 }
1387
1388 if (pos > seq_pos) {
1389 if (pos > cr->len+1)
1390 return -1;
1391
1392 if (s->ref && cr->ref_id >= 0) {
1393 if (ref_pos + pos - seq_pos > bfd->ref[cr->ref_id].len) {
1394 static int whinged = 0;
1395 int rlen;
1396 if (!whinged)
1397 fprintf(stderr, "Ref pos outside of ref "
1398 "sequence boundary\n");
1399 whinged = 1;
1400 rlen = bfd->ref[cr->ref_id].len - ref_pos;
1401 if (rlen > 0) {
1402 memcpy(&seq[seq_pos-1],
1403 &s->ref[ref_pos - s->ref_start +1], rlen);
1404 if ((pos - seq_pos) - rlen > 0)
1405 memset(&seq[seq_pos-1+rlen], 'N',
1406 (pos - seq_pos) - rlen);
1407 } else {
1408 memset(&seq[seq_pos-1], 'N', cr->len - seq_pos + 1);
1409 }
1410 } else {
1411 memcpy(&seq[seq_pos-1], &s->ref[ref_pos - s->ref_start +1],
1412 pos - seq_pos);
1413 }
1414 }
1415 #ifdef USE_X
1416 if (cig_len && cig_op != BAM_CBASE_MATCH) {
1417 cigar[ncigar++] = (cig_len<<4) + cig_op;
1418 cig_len = 0;
1419 }
1420 cig_op = BAM_CBASE_MATCH;
1421 #else
1422 if (cig_len && cig_op != BAM_CMATCH) {
1423 cigar[ncigar++] = (cig_len<<4) + cig_op;
1424 cig_len = 0;
1425 }
1426 cig_op = BAM_CMATCH;
1427 #endif
1428 cig_len += pos - seq_pos;
1429 ref_pos += pos - seq_pos;
1430 if (md_dist >= 0)
1431 md_dist += pos - seq_pos;
1432 seq_pos = pos;
1433 }
1434
1435 prev_pos = pos;
1436
1437 if (!(ds & CRAM_FC))
1438 goto skip_cigar;
1439
1440 switch(op) {
1441 case 'S': { // soft clip: IN
1442 int32_t out_sz2 = 1;
1443 int have_sc = 0;
1444
1445 if (cig_len) {
1446 cigar[ncigar++] = (cig_len<<4) + cig_op;
1447 cig_len = 0;
1448 }
1449 switch (CRAM_MAJOR_VERS(fd->version)) {
1450 case 1:
1451 if (ds & CRAM_IN) {
1452 r |= c->comp_hdr->codecs[DS_IN]
1453 ? c->comp_hdr->codecs[DS_IN]
1454 ->decode(s, c->comp_hdr->codecs[DS_IN],
1455 blk,
1456 cr->len ? &seq[pos-1] : NULL,
1457 &out_sz2)
1458 : (seq[pos-1] = 'N', out_sz2 = 1, 0);
1459 have_sc = 1;
1460 }
1461 break;
1462 case 2:
1463 default:
1464 if (ds & CRAM_SC) {
1465 r |= c->comp_hdr->codecs[DS_SC]
1466 ? c->comp_hdr->codecs[DS_SC]
1467 ->decode(s, c->comp_hdr->codecs[DS_SC],
1468 blk,
1469 cr->len ? &seq[pos-1] : NULL,
1470 &out_sz2)
1471 : (seq[pos-1] = 'N', out_sz2 = 1, 0);
1472 have_sc = 1;
1473 }
1474 break;
1475
1476 // default:
1477 // r |= c->comp_hdr->codecs[DS_BB]
1478 // ? c->comp_hdr->codecs[DS_BB]
1479 // ->decode(s, c->comp_hdr->codecs[DS_BB],
1480 // blk, &seq[pos-1], &out_sz2)
1481 // : (seq[pos-1] = 'N', out_sz2 = 1, 0);
1482 }
1483 if (have_sc) {
1484 if (r) return r;
1485 cigar[ncigar++] = (out_sz2<<4) + BAM_CSOFT_CLIP;
1486 cig_op = BAM_CSOFT_CLIP;
1487 seq_pos += out_sz2;
1488 }
1489 break;
1490 }
1491
1492 case 'X': { // Substitution; BS
1493 unsigned char base;
1494 #ifdef USE_X
1495 if (cig_len && cig_op != BAM_CBASE_MISMATCH) {
1496 cigar[ncigar++] = (cig_len<<4) + cig_op;
1497 cig_len = 0;
1498 }
1499 if (ds & CRAM_BS) {
1500 if (!c->comp_hdr->codecs[DS_BS]) return -1;
1501 r |= c->comp_hdr->codecs[DS_BS]
1502 ->decode(s, c->comp_hdr->codecs[DS_BS], blk,
1503 (char *)&base, &out_sz);
1504 if (pos-1 < cr->len)
1505 seq[pos-1] = 'N'; // FIXME look up BS=base value
1506 }
1507 cig_op = BAM_CBASE_MISMATCH;
1508 #else
1509 int ref_base;
1510 if (cig_len && cig_op != BAM_CMATCH) {
1511 cigar[ncigar++] = (cig_len<<4) + cig_op;
1512 cig_len = 0;
1513 }
1514 if (ds & CRAM_BS) {
1515 if (!c->comp_hdr->codecs[DS_BS]) return -1;
1516 r |= c->comp_hdr->codecs[DS_BS]
1517 ->decode(s, c->comp_hdr->codecs[DS_BS], blk,
1518 (char *)&base, &out_sz);
1519 if (r) return -1;
1520 if (ref_pos >= bfd->ref[cr->ref_id].len || !s->ref) {
1521 if (pos-1 < cr->len)
1522 seq[pos-1] = c->comp_hdr->
1523 substitution_matrix[fd->L1['N']][base];
1524 if (decode_md || decode_nm) {
1525 if (md_dist >= 0 && decode_md)
1526 BLOCK_APPEND_UINT(s->aux_blk, md_dist);
1527 md_dist = -1;
1528 nm--;
1529 }
1530 } else {
1531 unsigned char ref_call = ref_pos <= s->ref_end
1532 ? (uc)s->ref[ref_pos - s->ref_start +1]
1533 : 'N';
1534 ref_base = fd->L1[ref_call];
1535 if (pos-1 < cr->len)
1536 seq[pos-1] = c->comp_hdr->
1537 substitution_matrix[ref_base][base];
1538 if (decode_md) {
1539 BLOCK_APPEND_UINT(s->aux_blk, md_dist);
1540 BLOCK_APPEND_CHAR(s->aux_blk, ref_call);
1541 md_dist = 0;
1542 }
1543 }
1544 }
1545 cig_op = BAM_CMATCH;
1546 #endif
1547 nm++;
1548 cig_len++;
1549 seq_pos++;
1550 ref_pos++;
1551 break;
1552 }
1553
1554 case 'D': { // Deletion; DL
1555 if (cig_len && cig_op != BAM_CDEL) {
1556 cigar[ncigar++] = (cig_len<<4) + cig_op;
1557 cig_len = 0;
1558 }
1559 if (ds & CRAM_DL) {
1560 if (!c->comp_hdr->codecs[DS_DL]) return -1;
1561 r |= c->comp_hdr->codecs[DS_DL]
1562 ->decode(s, c->comp_hdr->codecs[DS_DL], blk,
1563 (char *)&i32, &out_sz);
1564 if (r) return r;
1565 if (decode_md || decode_nm) {
1566 if (md_dist >= 0 && decode_md)
1567 BLOCK_APPEND_UINT(s->aux_blk, md_dist);
1568 if (ref_pos + i32 <= bfd->ref[cr->ref_id].len) {
1569 if (decode_md) {
1570 BLOCK_APPEND_CHAR(s->aux_blk, '^');
1571 BLOCK_APPEND(s->aux_blk,
1572 &s->ref[ref_pos - s->ref_start +1],
1573 i32);
1574 md_dist = 0;
1575 }
1576 nm += i32;
1577 } else {
1578 uint32_t dlen;
1579 if (bfd->ref[cr->ref_id].len >= ref_pos) {
1580 if (decode_md) {
1581 BLOCK_APPEND_CHAR(s->aux_blk, '^');
1582 BLOCK_APPEND(s->aux_blk,
1583 &s->ref[ref_pos - s->ref_start+1],
1584 bfd->ref[cr->ref_id].len-ref_pos);
1585 BLOCK_APPEND_UINT(s->aux_blk, 0);
1586 }
1587 dlen = i32 - (bfd->ref[cr->ref_id].len - ref_pos);
1588 nm += i32 - dlen;
1589 } else {
1590 dlen = i32;
1591 }
1592
1593 md_dist = -1;
1594 }
1595 }
1596 cig_op = BAM_CDEL;
1597 cig_len += i32;
1598 ref_pos += i32;
1599 //printf(" %d: DL = %d (ret %d)\n", f, i32, r);
1600 }
1601 break;
1602 }
1603
1604 case 'I': { // Insertion (several bases); IN
1605 int32_t out_sz2 = 1;
1606
1607 if (cig_len && cig_op != BAM_CINS) {
1608 cigar[ncigar++] = (cig_len<<4) + cig_op;
1609 cig_len = 0;
1610 }
1611
1612 if (ds & CRAM_IN) {
1613 if (!c->comp_hdr->codecs[DS_IN]) return -1;
1614 r |= c->comp_hdr->codecs[DS_IN]
1615 ->decode(s, c->comp_hdr->codecs[DS_IN], blk,
1616 cr->len ? &seq[pos-1] : NULL,
1617 &out_sz2);
1618 if (r) return r;
1619 cig_op = BAM_CINS;
1620 cig_len += out_sz2;
1621 seq_pos += out_sz2;
1622 nm += out_sz2;
1623 //printf(" %d: IN(I) = %.*s (ret %d, out_sz %d)\n", f, out_sz2, dat, r, out_sz2);
1624 }
1625 break;
1626 }
1627
1628 case 'i': { // Insertion (single base); BA
1629 if (cig_len && cig_op != BAM_CINS) {
1630 cigar[ncigar++] = (cig_len<<4) + cig_op;
1631 cig_len = 0;
1632 }
1633 if (ds & CRAM_BA) {
1634 if (!c->comp_hdr->codecs[DS_BA]) return -1;
1635 r |= c->comp_hdr->codecs[DS_BA]
1636 ->decode(s, c->comp_hdr->codecs[DS_BA], blk,
1637 cr->len ? &seq[pos-1] : NULL,
1638 &out_sz);
1639 if (r) return r;
1640 //printf(" %d: BA = %c (ret %d)\n", f, seq[pos-1], r);
1641 }
1642 cig_op = BAM_CINS;
1643 cig_len++;
1644 seq_pos++;
1645 nm++;
1646 break;
1647 }
1648
1649 case 'b': { // Several bases
1650 int32_t len = 1;
1651
1652 if (cig_len && cig_op != BAM_CMATCH) {
1653 cigar[ncigar++] = (cig_len<<4) + cig_op;
1654 cig_len = 0;
1655 }
1656
1657 if (ds & CRAM_BB) {
1658 if (!c->comp_hdr->codecs[DS_BB]) return -1;
1659 r |= c->comp_hdr->codecs[DS_BB]
1660 ->decode(s, c->comp_hdr->codecs[DS_BB], blk,
1661 cr->len ? &seq[pos-1] : NULL,
1662 &len);
1663 if (r) return r;
1664
1665 if (decode_md || decode_nm) {
1666 int x;
1667 if (md_dist >= 0 && decode_md)
1668 BLOCK_APPEND_UINT(s->aux_blk, md_dist);
1669
1670 for (x = 0; x < len; x++) {
1671 if (x && decode_md)
1672 BLOCK_APPEND_UINT(s->aux_blk, 0);
1673 if (ref_pos+x >= bfd->ref[cr->ref_id].len || !s->ref) {
1674 md_dist = -1;
1675 break;
1676 } else {
1677 if (decode_md) {
1678 char r = s->ref[ref_pos+x-s->ref_start +1];
1679 BLOCK_APPEND_CHAR(s->aux_blk, r);
1680 }
1681 }
1682 }
1683
1684 nm += x;
1685 }
1686 }
1687
1688 cig_op = BAM_CMATCH;
1689
1690 cig_len+=len;
1691 seq_pos+=len;
1692 ref_pos+=len;
1693 //prev_pos+=len;
1694 break;
1695 }
1696
1697 case 'q': { // Several quality values
1698 int32_t len = 1;
1699
1700 if (cig_len && cig_op != BAM_CMATCH) {
1701 cigar[ncigar++] = (cig_len<<4) + cig_op;
1702 cig_len = 0;
1703 }
1704
1705 if (ds & CRAM_QQ) {
1706 if (!c->comp_hdr->codecs[DS_QQ]) return -1;
1707 r |= c->comp_hdr->codecs[DS_QQ]
1708 ->decode(s, c->comp_hdr->codecs[DS_QQ], blk,
1709 (char *)&qual[pos-1], &len);
1710 if (r) return r;
1711 }
1712
1713 cig_op = BAM_CMATCH;
1714
1715 cig_len+=len;
1716 seq_pos+=len;
1717 ref_pos+=len;
1718 //prev_pos+=len;
1719 break;
1720 }
1721
1722 case 'B': { // Read base; BA, QS
1723 #ifdef USE_X
1724 if (cig_len && cig_op != BAM_CBASE_MISMATCH) {
1725 cigar[ncigar++] = (cig_len<<4) + cig_op;
1726 cig_len = 0;
1727 }
1728 #else
1729 if (cig_len && cig_op != BAM_CMATCH) {
1730 cigar[ncigar++] = (cig_len<<4) + cig_op;
1731 cig_len = 0;
1732 }
1733 #endif
1734 if (ds & CRAM_BA) {
1735 if (!c->comp_hdr->codecs[DS_BA]) return -1;
1736 r |= c->comp_hdr->codecs[DS_BA]
1737 ->decode(s, c->comp_hdr->codecs[DS_BA], blk,
1738 cr->len ? &seq[pos-1] : NULL,
1739 &out_sz);
1740
1741 if (decode_md || decode_nm) {
1742 if (md_dist >= 0 && decode_md)
1743 BLOCK_APPEND_UINT(s->aux_blk, md_dist);
1744 if (ref_pos >= bfd->ref[cr->ref_id].len || !s->ref) {
1745 md_dist = -1;
1746 } else {
1747 if (decode_md)
1748 BLOCK_APPEND_CHAR(s->aux_blk,
1749 s->ref[ref_pos-s->ref_start +1]);
1750 nm++;
1751 md_dist = 0;
1752 }
1753 }
1754 }
1755 if (ds & CRAM_QS) {
1756 if (!c->comp_hdr->codecs[DS_QS]) return -1;
1757 r |= c->comp_hdr->codecs[DS_QS]
1758 ->decode(s, c->comp_hdr->codecs[DS_QS], blk,
1759 (char *)&qual[pos-1], &out_sz);
1760 }
1761 #ifdef USE_X
1762 cig_op = BAM_CBASE_MISMATCH;
1763 #else
1764 cig_op = BAM_CMATCH;
1765 #endif
1766 cig_len++;
1767 seq_pos++;
1768 ref_pos++;
1769 //printf(" %d: BA/QS(B) = %c/%d (ret %d)\n", f, i32, qc, r);
1770 break;
1771 }
1772
1773 case 'Q': { // Quality score; QS
1774 if (ds & CRAM_QS) {
1775 if (!c->comp_hdr->codecs[DS_QS]) return -1;
1776 r |= c->comp_hdr->codecs[DS_QS]
1777 ->decode(s, c->comp_hdr->codecs[DS_QS], blk,
1778 (char *)&qual[pos-1], &out_sz);
1779 //printf(" %d: QS = %d (ret %d)\n", f, qc, r);
1780 }
1781 break;
1782 }
1783
1784 case 'H': { // hard clip; HC
1785 if (cig_len && cig_op != BAM_CHARD_CLIP) {
1786 cigar[ncigar++] = (cig_len<<4) + cig_op;
1787 cig_len = 0;
1788 }
1789 if (ds & CRAM_HC) {
1790 if (!c->comp_hdr->codecs[DS_HC]) return -1;
1791 r |= c->comp_hdr->codecs[DS_HC]
1792 ->decode(s, c->comp_hdr->codecs[DS_HC], blk,
1793 (char *)&i32, &out_sz);
1794 if (r) return r;
1795 cig_op = BAM_CHARD_CLIP;
1796 cig_len += i32;
1797 }
1798 break;
1799 }
1800
1801 case 'P': { // padding; PD
1802 if (cig_len && cig_op != BAM_CPAD) {
1803 cigar[ncigar++] = (cig_len<<4) + cig_op;
1804 cig_len = 0;
1805 }
1806 if (ds & CRAM_PD) {
1807 if (!c->comp_hdr->codecs[DS_PD]) return -1;
1808 r |= c->comp_hdr->codecs[DS_PD]
1809 ->decode(s, c->comp_hdr->codecs[DS_PD], blk,
1810 (char *)&i32, &out_sz);
1811 if (r) return r;
1812 cig_op = BAM_CPAD;
1813 cig_len += i32;
1814 nm += i32;
1815 }
1816 break;
1817 }
1818
1819 case 'N': { // Ref skip; RS
1820 if (cig_len && cig_op != BAM_CREF_SKIP) {
1821 cigar[ncigar++] = (cig_len<<4) + cig_op;
1822 cig_len = 0;
1823 }
1824 if (ds & CRAM_RS) {
1825 if (!c->comp_hdr->codecs[DS_RS]) return -1;
1826 r |= c->comp_hdr->codecs[DS_RS]
1827 ->decode(s, c->comp_hdr->codecs[DS_RS], blk,
1828 (char *)&i32, &out_sz);
1829 if (r) return r;
1830 cig_op = BAM_CREF_SKIP;
1831 cig_len += i32;
1832 ref_pos += i32;
1833 nm += i32;
1834 }
1835 break;
1836 }
1837
1838 default:
1839 fprintf(stderr, "Error: Unknown feature code '%c'\n", op);
1840 return -1;
1841 }
1842 }
1843
1844 if (!(ds & CRAM_FC))
1845 goto skip_cigar;
1846
1847 /* An implicit match op for any unaccounted for bases */
1848 if ((ds & CRAM_FN) && cr->len >= seq_pos) {
1849 if (s->ref) {
1850 if (ref_pos + cr->len - seq_pos + 1 > bfd->ref[cr->ref_id].len) {
1851 static int whinged = 0;
1852 int rlen;
1853 if (!whinged)
1854 fprintf(stderr, "Ref pos outside of ref sequence boundary\n");
1855 whinged = 1;
1856 rlen = bfd->ref[cr->ref_id].len - ref_pos;
1857 if (rlen > 0) {
1858 if (seq_pos-1 + rlen < cr->len)
1859 memcpy(&seq[seq_pos-1],
1860 &s->ref[ref_pos - s->ref_start +1], rlen);
1861 if ((cr->len - seq_pos + 1) - rlen > 0)
1862 memset(&seq[seq_pos-1+rlen], 'N',
1863 (cr->len - seq_pos + 1) - rlen);
1864 } else {
1865 if (cr->len - seq_pos + 1 > 0)
1866 memset(&seq[seq_pos-1], 'N', cr->len - seq_pos + 1);
1867 }
1868 } else {
1869 if (cr->len - seq_pos + 1 > 0)
1870 memcpy(&seq[seq_pos-1], &s->ref[ref_pos - s->ref_start +1],
1871 cr->len - seq_pos + 1);
1872 ref_pos += cr->len - seq_pos + 1;
1873 if (md_dist >= 0)
1874 md_dist += cr->len - seq_pos + 1;
1875 }
1876 } else {
1877 // So alignment end can be computed even when not decoding sequence
1878 ref_pos += cr->len - seq_pos + 1;
1879 }
1880
1881 if (ncigar+1 >= cigar_alloc) {
1882 cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024;
1883 s->cigar = cigar;
1884 if (!(cigar = realloc(cigar, cigar_alloc * sizeof(*cigar))))
1885 return -1;
1886 }
1887 #ifdef USE_X
1888 if (cig_len && cig_op != BAM_CBASE_MATCH) {
1889 cigar[ncigar++] = (cig_len<<4) + cig_op;
1890 cig_len = 0;
1891 }
1892 cig_op = BAM_CBASE_MATCH;
1893 #else
1894 if (cig_len && cig_op != BAM_CMATCH) {
1895 cigar[ncigar++] = (cig_len<<4) + cig_op;
1896 cig_len = 0;
1897 }
1898 cig_op = BAM_CMATCH;
1899 #endif
1900 cig_len += cr->len - seq_pos+1;
1901 }
1902
1903 skip_cigar:
1904
1905 if ((ds & CRAM_FN) && decode_md) {
1906 if (md_dist >= 0)
1907 BLOCK_APPEND_UINT(s->aux_blk, md_dist);
1908 }
1909
1910 if (cig_len) {
1911 if (ncigar >= cigar_alloc) {
1912 cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024;
1913 s->cigar = cigar;
1914 if (!(cigar = realloc(cigar, cigar_alloc * sizeof(*cigar))))
1915 return -1;
1916 }
1917
1918 cigar[ncigar++] = (cig_len<<4) + cig_op;
1919 }
1920
1921 cr->ncigar = ncigar - cr->cigar;
1922 cr->aend = ref_pos;
1923
1924 //printf("2: %.*s %d .. %d %d\n", cr->name_len, (char *)BLOCK_DATA(s->name_blk) + cr->name, cr->apos, ref_pos, seq_pos);
1925
1926 if (ds & CRAM_MQ) {
1927 if (!c->comp_hdr->codecs[DS_MQ]) return -1;
1928 r |= c->comp_hdr->codecs[DS_MQ]
1929 ->decode(s, c->comp_hdr->codecs[DS_MQ], blk,
1930 (char *)&cr->mqual, &out_sz);
1931 } else {
1932 cr->mqual = 40;
1933 }
1934
1935 if ((ds & CRAM_QS) && (cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
1936 int32_t out_sz2 = cr->len;
1937
1938 if (ds & CRAM_QS) {
1939 if (!c->comp_hdr->codecs[DS_QS]) return -1;
1940 r |= c->comp_hdr->codecs[DS_QS]
1941 ->decode(s, c->comp_hdr->codecs[DS_QS], blk,
1942 qual, &out_sz2);
1943 }
1944 }
1945
1946 s->cigar = cigar;
1947 s->cigar_alloc = cigar_alloc;
1948 s->ncigar = ncigar;
1949
1950 if (cr->cram_flags & CRAM_FLAG_NO_SEQ)
1951 cr->len = 0;
1952
1953 if (decode_md) {
1954 BLOCK_APPEND_CHAR(s->aux_blk, '\0'); // null terminate MD:Z:
1955 cr->aux_size += BLOCK_SIZE(s->aux_blk) - orig_aux;
1956 }
1957
1958 if (decode_nm) {
1959 char buf[7];
1960 buf[0] = 'N'; buf[1] = 'M'; buf[2] = 'I';
1961 buf[3] = (nm>> 0) & 0xff;
1962 buf[4] = (nm>> 8) & 0xff;
1963 buf[5] = (nm>>16) & 0xff;
1964 buf[6] = (nm>>24) & 0xff;
1965 BLOCK_APPEND(s->aux_blk, buf, 7);
1966 cr->aux_size += 7;
1967 }
1968
1969 return r;
1970 }
1971
1972 /*
1973 * Quick and simple hash lookup for cram_map arrays
1974 */
map_find(cram_map ** map,unsigned char * key,int id)1975 static cram_map *map_find(cram_map **map, unsigned char *key, int id) {
1976 cram_map *m;
1977
1978 m = map[CRAM_MAP(key[0],key[1])];
1979 while (m && m->key != id)
1980 m= m->next;
1981
1982 return m;
1983 }
1984
1985 //#define map_find(M,K,I) M[CRAM_MAP(K[0],K[1])];while (m && m->key != I);m= m->next
1986
1987
cram_decode_aux_1_0(cram_container * c,cram_slice * s,cram_block * blk,cram_record * cr)1988 static int cram_decode_aux_1_0(cram_container *c, cram_slice *s,
1989 cram_block *blk, cram_record *cr) {
1990 int i, r = 0, out_sz = 1;
1991 unsigned char ntags;
1992
1993 if (!c->comp_hdr->codecs[DS_TC]) return -1;
1994 r |= c->comp_hdr->codecs[DS_TC]->decode(s, c->comp_hdr->codecs[DS_TC], blk,
1995 (char *)&ntags, &out_sz);
1996 cr->ntags = ntags;
1997
1998 //printf("TC=%d\n", cr->ntags);
1999 cr->aux_size = 0;
2000 cr->aux = BLOCK_SIZE(s->aux_blk);
2001
2002 for (i = 0; i < cr->ntags; i++) {
2003 int32_t id, out_sz = 1;
2004 unsigned char tag_data[3];
2005 cram_map *m;
2006
2007 //printf("Tag %d/%d\n", i+1, cr->ntags);
2008 if (!c->comp_hdr->codecs[DS_TN]) return -1;
2009 r |= c->comp_hdr->codecs[DS_TN]->decode(s, c->comp_hdr->codecs[DS_TN],
2010 blk, (char *)&id, &out_sz);
2011 if (out_sz == 3) {
2012 tag_data[0] = ((char *)&id)[0];
2013 tag_data[1] = ((char *)&id)[1];
2014 tag_data[2] = ((char *)&id)[2];
2015 } else {
2016 tag_data[0] = (id>>16) & 0xff;
2017 tag_data[1] = (id>>8) & 0xff;
2018 tag_data[2] = id & 0xff;
2019 }
2020
2021 m = map_find(c->comp_hdr->tag_encoding_map, tag_data, id);
2022 if (!m)
2023 return -1;
2024 BLOCK_APPEND(s->aux_blk, (char *)tag_data, 3);
2025
2026 if (!m->codec) return -1;
2027 r |= m->codec->decode(s, m->codec, blk, (char *)s->aux_blk, &out_sz);
2028
2029 cr->aux_size += out_sz + 3;
2030 }
2031
2032 return r;
2033 }
2034
cram_decode_aux(cram_container * c,cram_slice * s,cram_block * blk,cram_record * cr,int * has_MD,int * has_NM)2035 static int cram_decode_aux(cram_container *c, cram_slice *s,
2036 cram_block *blk, cram_record *cr,
2037 int *has_MD, int *has_NM) {
2038 int i, r = 0, out_sz = 1;
2039 int32_t TL = 0;
2040 unsigned char *TN;
2041 uint32_t ds = s->data_series;
2042
2043 if (!(ds & (CRAM_TL|CRAM_aux))) {
2044 cr->aux = 0;
2045 cr->aux_size = 0;
2046 return 0;
2047 }
2048
2049 if (!c->comp_hdr->codecs[DS_TL]) return -1;
2050 r |= c->comp_hdr->codecs[DS_TL]->decode(s, c->comp_hdr->codecs[DS_TL], blk,
2051 (char *)&TL, &out_sz);
2052 if (r || TL < 0 || TL >= c->comp_hdr->nTL)
2053 return -1;
2054
2055 TN = c->comp_hdr->TL[TL];
2056 cr->ntags = strlen((char *)TN)/3; // optimise to remove strlen
2057
2058 //printf("TC=%d\n", cr->ntags);
2059 cr->aux_size = 0;
2060 cr->aux = BLOCK_SIZE(s->aux_blk);
2061
2062 if (!(ds & CRAM_aux))
2063 return 0;
2064
2065 for (i = 0; i < cr->ntags; i++) {
2066 int32_t id, out_sz = 1;
2067 unsigned char tag_data[3];
2068 cram_map *m;
2069
2070 if (TN[0] == 'M' && TN[1] == 'D' && has_MD)
2071 *has_MD = 1;
2072 if (TN[0] == 'N' && TN[1] == 'M' && has_NM)
2073 *has_NM = 1;
2074
2075 //printf("Tag %d/%d\n", i+1, cr->ntags);
2076 tag_data[0] = *TN++;
2077 tag_data[1] = *TN++;
2078 tag_data[2] = *TN++;
2079 id = (tag_data[0]<<16) | (tag_data[1]<<8) | tag_data[2];
2080
2081 m = map_find(c->comp_hdr->tag_encoding_map, tag_data, id);
2082 if (!m)
2083 return -1;
2084 BLOCK_APPEND(s->aux_blk, (char *)tag_data, 3);
2085
2086 if (!m->codec) return -1;
2087 r |= m->codec->decode(s, m->codec, blk, (char *)s->aux_blk, &out_sz);
2088 if (r) break;
2089 cr->aux_size += out_sz + 3;
2090 }
2091
2092 return r;
2093 }
2094
2095 /* Resolve mate pair cross-references between recs within this slice */
cram_decode_slice_xref(cram_slice * s,int required_fields)2096 static int cram_decode_slice_xref(cram_slice *s, int required_fields) {
2097 int rec;
2098
2099 if (!(required_fields & (SAM_RNEXT | SAM_PNEXT | SAM_TLEN))) {
2100 for (rec = 0; rec < s->hdr->num_records; rec++) {
2101 cram_record *cr = &s->crecs[rec];
2102
2103 cr->tlen = 0;
2104 cr->mate_pos = 0;
2105 cr->mate_ref_id = -1;
2106 }
2107
2108 return 0;
2109 }
2110
2111 for (rec = 0; rec < s->hdr->num_records; rec++) {
2112 cram_record *cr = &s->crecs[rec];
2113
2114 if (cr->mate_line >= 0) {
2115 if (cr->mate_line < s->hdr->num_records) {
2116 /*
2117 * On the first read, loop through computing lengths.
2118 * It's not perfect as we have one slice per reference so we
2119 * cannot detect when TLEN should be zero due to seqs that
2120 * map to multiple references.
2121 *
2122 * We also cannot set tlen correct when it spans a slice for
2123 * other reasons. This may make tlen too small. Should we
2124 * fix this by forcing TLEN to be stored verbatim in such cases?
2125 *
2126 * Or do we just admit defeat and output 0 for tlen? It's the
2127 * safe option...
2128 */
2129 if (cr->tlen == INT_MIN) {
2130 int id1 = rec, id2 = rec;
2131 int aleft = cr->apos, aright = cr->aend;
2132 int tlen;
2133 int ref = cr->ref_id;
2134
2135 // number of segments starting at the same point.
2136 int left_cnt = 0;
2137
2138 do {
2139 if (aleft > s->crecs[id2].apos)
2140 aleft = s->crecs[id2].apos, left_cnt = 1;
2141 else if (aleft == s->crecs[id2].apos)
2142 left_cnt++;
2143 if (aright < s->crecs[id2].aend)
2144 aright = s->crecs[id2].aend;
2145 if (s->crecs[id2].mate_line == -1) {
2146 s->crecs[id2].mate_line = rec;
2147 break;
2148 }
2149 if (s->crecs[id2].mate_line <= id2 ||
2150 s->crecs[id2].mate_line >= s->hdr->num_records)
2151 return -1;
2152 id2 = s->crecs[id2].mate_line;
2153
2154 if (s->crecs[id2].ref_id != ref)
2155 ref = -1;
2156 } while (id2 != id1);
2157
2158 if (ref != -1) {
2159 tlen = aright - aleft + 1;
2160 id1 = id2 = rec;
2161
2162 /*
2163 * When we have two seqs with identical start and
2164 * end coordinates, set +/- tlen based on 1st/last
2165 * bit flags instead, as a tie breaker.
2166 */
2167 if (s->crecs[id2].apos == aleft) {
2168 if (left_cnt == 1 ||
2169 (s->crecs[id2].flags & BAM_FREAD1))
2170 s->crecs[id2].tlen = tlen;
2171 else
2172 s->crecs[id2].tlen = -tlen;
2173 } else {
2174 s->crecs[id2].tlen = -tlen;
2175 }
2176
2177 id2 = s->crecs[id2].mate_line;
2178 while (id2 != id1) {
2179 if (s->crecs[id2].apos == aleft) {
2180 if (left_cnt == 1 ||
2181 (s->crecs[id2].flags & BAM_FREAD1))
2182 s->crecs[id2].tlen = tlen;
2183 else
2184 s->crecs[id2].tlen = -tlen;
2185 } else {
2186 s->crecs[id2].tlen = -tlen;
2187 }
2188 id2 = s->crecs[id2].mate_line;
2189 }
2190 } else {
2191 id1 = id2 = rec;
2192
2193 s->crecs[id2].tlen = 0;
2194 id2 = s->crecs[id2].mate_line;
2195 while (id2 != id1) {
2196 s->crecs[id2].tlen = 0;
2197 id2 = s->crecs[id2].mate_line;
2198 }
2199 }
2200 }
2201
2202 cr->mate_pos = s->crecs[cr->mate_line].apos;
2203 cr->mate_ref_id = s->crecs[cr->mate_line].ref_id;
2204
2205 // paired
2206 cr->flags |= BAM_FPAIRED;
2207
2208 // set mate unmapped if needed
2209 if (s->crecs[cr->mate_line].flags & BAM_FUNMAP) {
2210 cr->flags |= BAM_FMUNMAP;
2211 cr->tlen = 0;
2212 }
2213 if (cr->flags & BAM_FUNMAP) {
2214 cr->tlen = 0;
2215 }
2216
2217 // set mate reversed if needed
2218 if (s->crecs[cr->mate_line].flags & BAM_FREVERSE)
2219 cr->flags |= BAM_FMREVERSE;
2220 } else {
2221 fprintf(stderr, "Mate line out of bounds: %d vs [0, %d]\n",
2222 cr->mate_line, s->hdr->num_records-1);
2223 }
2224
2225 /* FIXME: construct read names here too if needed */
2226 } else {
2227 if (cr->mate_flags & CRAM_M_REVERSE) {
2228 cr->flags |= BAM_FPAIRED | BAM_FMREVERSE;
2229 }
2230 if (cr->mate_flags & CRAM_M_UNMAP) {
2231 cr->flags |= BAM_FMUNMAP;
2232 //cr->mate_ref_id = -1;
2233 }
2234 if (!(cr->flags & BAM_FPAIRED))
2235 cr->mate_ref_id = -1;
2236 }
2237
2238 if (cr->tlen == INT_MIN)
2239 cr->tlen = 0; // Just incase
2240 }
2241 return 0;
2242 }
2243
md5_print(unsigned char * md5,char * out)2244 static char *md5_print(unsigned char *md5, char *out) {
2245 int i;
2246 for (i = 0; i < 16; i++) {
2247 out[i*2+0] = "0123456789abcdef"[md5[i]>>4];
2248 out[i*2+1] = "0123456789abcdef"[md5[i]&15];
2249 }
2250 out[32] = 0;
2251
2252 return out;
2253 }
2254
2255 static int cram_to_bam(SAM_hdr *bfd, cram_fd *fd, cram_slice *s,
2256 cram_record *cr, int rec, bam_seq_t **bam);
2257
2258 /*
2259 * Bulk conversion of an entire cram slice to an array of bam objects.
2260 * (Assumption that fd->required_fields will not change from one
2261 * cram_get_bam_seq() call to the next.)
2262 *
2263 * Returns 0 on success
2264 * -1 on failure
2265 */
2266
bam_size(SAM_hdr * bfd,cram_fd * fd,cram_record * cr)2267 static int bam_size(SAM_hdr *bfd, cram_fd *fd, cram_record *cr) {
2268 int name_len, rg_len;
2269
2270 if (fd->required_fields & SAM_QNAME) {
2271 if (cr->name_len)
2272 name_len = cr->name_len;
2273 else
2274 name_len = strlen(fd->prefix) + 20; // overestimate
2275 } else {
2276 name_len = 1;
2277 }
2278
2279 rg_len = (cr->rg != -1) ? bfd->rg[cr->rg].name_len + 4 : 0;
2280
2281 return sizeof(bam_seq_t)
2282 + name_len + 1
2283 + round4(name_len+1)
2284 + 4 * cr->ncigar
2285 + (cr->len+1)/2
2286 + cr->len
2287 + cr->aux_size + rg_len + 1;
2288 }
2289
bulk_cram_to_bam(SAM_hdr * bfd,cram_fd * fd,cram_slice * s)2290 static int bulk_cram_to_bam(SAM_hdr *bfd, cram_fd *fd, cram_slice *s) {
2291 int i;
2292 int r = 0;
2293 int sizes[10000];
2294
2295 size_t len = 0;
2296 for (i = 0; i < s->hdr->num_records; i++) {
2297 int sz = bam_size(bfd, fd, &s->crecs[i]);
2298 if (i < 10000)
2299 sizes[i] = sz;
2300 len += sz;
2301 }
2302
2303 s->bl = (bam_seq_t **)malloc(s->hdr->num_records * sizeof(*s->bl) + len);
2304 if (!s->bl)
2305 return -1;
2306
2307 char *x = ((char *)s->bl) + s->hdr->num_records * sizeof(*s->bl);
2308 for (i = 0; i < s->hdr->num_records; i++) {
2309 bam_seq_t *b = (bam_seq_t *)x, *o = b;
2310 int bsize = i < 10000 ? sizes[i] : bam_size(bfd, fd, &s->crecs[i]);
2311 b->alloc = bsize;
2312 r |= (cram_to_bam(fd->header, fd, s, &s->crecs[i], i, &b) < 0);
2313 // if we allocated enough, the above won't have resized b
2314 assert(o == b && o->alloc == bsize);
2315 x += bsize;
2316 s->bl[i] = b;
2317 }
2318
2319 return 0;
2320 }
2321
2322 /*
2323 * Decode an entire slice from container blocks. Fills out s->crecs[] array.
2324 * Returns 0 on success
2325 * -1 on failure
2326 */
cram_decode_slice(cram_fd * fd,cram_container * c,cram_slice * s,SAM_hdr * bfd)2327 int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s,
2328 SAM_hdr *bfd) {
2329 cram_block *blk = s->block[0];
2330 int32_t bf, ref_id;
2331 unsigned char cf;
2332 int out_sz, r = 0;
2333 int rec;
2334 char *seq = NULL, *qual = NULL;
2335 int unknown_rg = -1;
2336 int embed_ref;
2337 char **refs = NULL;
2338 uint32_t ds;
2339
2340 if (cram_dependent_data_series(fd, c->comp_hdr, s) != 0)
2341 return -1;
2342
2343 ds = s->data_series;
2344 //printf("%08x\n", ds);
2345
2346 blk->bit = 7; // MSB first
2347
2348 // Study the blocks and estimate approx sizes to preallocate.
2349 // This looks to speed up decoding by around 8-9%.
2350 // We can always shrink back down at the end if we overestimated.
2351 // However it's likely that this also saves memory as own growth
2352 // factor (*=1.5) is never applied.
2353 {
2354 int qsize, nsize, q_id;
2355 cram_decode_estimate_sizes(c->comp_hdr, s, &qsize, &nsize, &q_id);
2356 //fprintf(stderr, "qsize=%d nsize=%d\n", qsize, nsize);
2357
2358 if (qsize && (ds & CRAM_RL)) BLOCK_RESIZE_EXACT(s->seqs_blk, qsize+1);
2359 if (qsize && (ds & CRAM_RL)) BLOCK_RESIZE_EXACT(s->qual_blk, qsize+1);
2360 if (nsize && (ds & CRAM_NS)) BLOCK_RESIZE_EXACT(s->name_blk, nsize+1);
2361
2362 // To do - consider using q_id here to usurp the quality block and
2363 // avoid a memcpy during decode.
2364 // Specifically when quality is an external block uniquely used by
2365 // DS_QS only, then we can set s->qual_blk directly to this
2366 // block and save the codec->decode() calls. (Approx 3% cpu saving)
2367 }
2368
2369 /* Look for unknown RG, added as last by Java CRAM? */
2370 if (bfd->nrg > 0 &&
2371 bfd->rg[bfd->nrg-1].name != NULL &&
2372 !strcmp(bfd->rg[bfd->nrg-1].name, "UNKNOWN"))
2373 unknown_rg = bfd->nrg-1;
2374
2375 if (blk->content_type != CORE)
2376 return -1;
2377
2378 if (s->crecs)
2379 free(s->crecs);
2380 if (!(s->crecs = malloc(s->hdr->num_records * sizeof(*s->crecs))))
2381 return -1;
2382
2383 ref_id = s->hdr->ref_seq_id;
2384 embed_ref = s->hdr->ref_base_id >= 0 ? 1 : 0;
2385
2386 if (ref_id >= 0) {
2387 if (embed_ref) {
2388 cram_block *b;
2389 if (s->hdr->ref_base_id < 0) {
2390 fprintf(stderr, "No reference specified and "
2391 "no embedded reference is available.\n");
2392 return -1;
2393 }
2394 b = cram_get_block_by_id(s, s->hdr->ref_base_id);
2395 if (!b)
2396 return -1;
2397 if (cram_uncompress_block(b) != 0)
2398 return -1;
2399 s->ref = (char *)BLOCK_DATA(b);
2400 s->ref_start = s->hdr->ref_seq_start;
2401 s->ref_end = s->hdr->ref_seq_start + s->hdr->ref_seq_span-1;
2402 if (s->ref_end - s->ref_start > b->uncomp_size) {
2403 fprintf(stderr, "Embedded reference is too small.\n");
2404 return -1;
2405 }
2406 } else if (!c->comp_hdr->no_ref) {
2407 //// Avoid Java cramtools bug by loading entire reference seq
2408 //s->ref = cram_get_ref(fd, s->hdr->ref_seq_id, 1, 0);
2409 //s->ref_start = 1;
2410
2411 if (fd->required_fields & SAM_SEQ)
2412 s->ref =
2413 cram_get_ref(fd, s->hdr->ref_seq_id,
2414 s->hdr->ref_seq_start,
2415 s->hdr->ref_seq_start + s->hdr->ref_seq_span -1);
2416 s->ref_start = s->hdr->ref_seq_start;
2417 s->ref_end = s->hdr->ref_seq_start + s->hdr->ref_seq_span-1;
2418
2419 /* Sanity check */
2420 if (s->ref_start < 0) {
2421 fprintf(stderr, "Slice starts before base 1.\n");
2422 s->ref_start = 0;
2423 }
2424 if (fd->ref_lock) pthread_mutex_lock(fd->ref_lock);
2425 pthread_mutex_lock(&fd->refs->lock);
2426 if ((fd->required_fields & SAM_SEQ) &&
2427 s->ref_end > fd->refs->ref_id[ref_id]->length) {
2428 s->ref_end = fd->refs->ref_id[ref_id]->length;
2429 }
2430 pthread_mutex_unlock(&fd->refs->lock);
2431 if (fd->ref_lock) pthread_mutex_unlock(fd->ref_lock);
2432 }
2433 }
2434
2435 if ((fd->required_fields & SAM_SEQ) &&
2436 s->ref == NULL && s->hdr->ref_seq_id >= 0 && !c->comp_hdr->no_ref) {
2437 fprintf(stderr, "Unable to fetch reference #%d %"PRId64"..%"PRId64"\n",
2438 s->hdr->ref_seq_id, s->hdr->ref_seq_start,
2439 s->hdr->ref_seq_start + s->hdr->ref_seq_span-1);
2440 return -1;
2441 }
2442
2443 if (!IS_CRAM_1_VERS(fd)
2444 && (fd->required_fields & SAM_SEQ)
2445 && s->hdr->ref_seq_id >= 0
2446 && !fd->ignore_md5
2447 && memcmp(s->hdr->md5, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0", 16)) {
2448 MD5_CTX md5;
2449 unsigned char digest[16];
2450
2451 if (s->ref && s->hdr->ref_seq_id >= 0) {
2452 int start, len;
2453
2454 if (s->hdr->ref_seq_start >= s->ref_start) {
2455 start = s->hdr->ref_seq_start - s->ref_start;
2456 } else {
2457 fprintf(stderr, "Slice starts before base 1.\n");
2458 start = 0;
2459 }
2460
2461 if (s->hdr->ref_seq_span <= s->ref_end - s->ref_start + 1) {
2462 len = s->hdr->ref_seq_span;
2463 } else {
2464 fprintf(stderr, "Slice ends beyond reference end.\n");
2465 len = s->ref_end - s->ref_start + 1;
2466 }
2467
2468 MD5_Init(&md5);
2469 if (start + len > s->ref_end - s->ref_start + 1)
2470 len = s->ref_end - s->ref_start + 1 - start;
2471 if (len >= 0)
2472 MD5_Update(&md5, s->ref + start, len);
2473 MD5_Final(digest, &md5);
2474 } else if (!s->ref && s->hdr->ref_base_id >= 0) {
2475 cram_block *b = cram_get_block_by_id(s, s->hdr->ref_base_id);
2476 if (b) {
2477 MD5_Init(&md5);
2478 MD5_Update(&md5, b->data, b->uncomp_size);
2479 MD5_Final(digest, &md5);
2480 }
2481 }
2482
2483 if ((!s->ref && s->hdr->ref_base_id < 0)
2484 || memcmp(digest, s->hdr->md5, 16) != 0) {
2485 char M[33];
2486 fprintf(stderr, "ERROR: md5sum reference mismatch for ref "
2487 "%d pos %d..%d\n", ref_id, s->ref_start, s->ref_end);
2488 fprintf(stderr, "CRAM: %s\n", md5_print(s->hdr->md5, M));
2489 fprintf(stderr, "Ref : %s\n", md5_print(digest, M));
2490 return -1;
2491 }
2492 }
2493
2494 if (ref_id == -2) {
2495 if (fd->ref_lock) pthread_mutex_lock(fd->ref_lock);
2496 pthread_mutex_lock(&fd->refs->lock);
2497 refs = calloc(fd->refs->nref, sizeof(char *));
2498 pthread_mutex_unlock(&fd->refs->lock);
2499 if (fd->ref_lock) pthread_mutex_unlock(fd->ref_lock);
2500 if (!refs)
2501 return -1;
2502 }
2503
2504 int last_ref_id = -9; // Arbitrary -ve marker for not-yet-set
2505 for (rec = 0; rec < s->hdr->num_records; rec++) {
2506 cram_record *cr = &s->crecs[rec];
2507 int has_MD, has_NM;
2508
2509 //fprintf(stderr, "Decode seq %d, %d/%d\n", rec, blk->byte, blk->bit);
2510
2511 cr->s = s;
2512
2513 out_sz = 1; /* decode 1 item */
2514 if (ds & CRAM_BF) {
2515 if (!c->comp_hdr->codecs[DS_BF]) return -1;
2516 r |= c->comp_hdr->codecs[DS_BF]
2517 ->decode(s, c->comp_hdr->codecs[DS_BF], blk,
2518 (char *)&bf, &out_sz);
2519 if (r || bf < 0 ||
2520 bf >= sizeof(fd->bam_flag_swap)/sizeof(*fd->bam_flag_swap))
2521 return -1;
2522 bf = fd->bam_flag_swap[bf];
2523 cr->flags = bf;
2524 } else {
2525 cr->flags = bf = 0x4; // unmapped
2526 }
2527
2528 if (ds & CRAM_CF) {
2529 if (IS_CRAM_1_VERS(fd)) {
2530 /* CF is byte in 1.0, int32 in 2.0 */
2531 if (!c->comp_hdr->codecs[DS_CF]) return -1;
2532 r |= c->comp_hdr->codecs[DS_CF]
2533 ->decode(s, c->comp_hdr->codecs[DS_CF], blk,
2534 (char *)&cf, &out_sz);
2535 if (r) return -1;
2536 cr->cram_flags = cf;
2537 } else {
2538 if (!c->comp_hdr->codecs[DS_CF]) return -1;
2539 r |= c->comp_hdr->codecs[DS_CF]
2540 ->decode(s, c->comp_hdr->codecs[DS_CF], blk,
2541 (char *)&cr->cram_flags, &out_sz);
2542 if (r) return -1;
2543 cf = cr->cram_flags;
2544 }
2545 } else {
2546 cf = cr->cram_flags = 0;
2547 }
2548
2549 if (!IS_CRAM_1_VERS(fd) && ref_id == -2) {
2550 if (ds & CRAM_RI) {
2551 if (!c->comp_hdr->codecs[DS_RI]) return -1;
2552 r |= c->comp_hdr->codecs[DS_RI]
2553 ->decode(s, c->comp_hdr->codecs[DS_RI], blk,
2554 (char *)&cr->ref_id, &out_sz);
2555 if (r) return -1;
2556 if ((fd->required_fields & (SAM_SEQ|SAM_TLEN))
2557 && cr->ref_id >= 0
2558 && cr->ref_id != last_ref_id) {
2559 if (!c->comp_hdr->no_ref) {
2560 if (!refs[cr->ref_id])
2561 refs[cr->ref_id] = cram_get_ref(fd, cr->ref_id,
2562 1, 0);
2563 s->ref = refs[cr->ref_id];
2564
2565 if (!fd->unsorted && last_ref_id >= 0 && refs[last_ref_id]) {
2566 cram_ref_decr(fd->refs, last_ref_id);
2567 refs[last_ref_id] = NULL;
2568 }
2569 }
2570 s->ref_start = 1;
2571 if (fd->ref_lock) pthread_mutex_lock(fd->ref_lock);
2572 pthread_mutex_lock(&fd->refs->lock);
2573 s->ref_end = fd->refs->ref_id[cr->ref_id]->length;
2574 pthread_mutex_unlock(&fd->refs->lock);
2575 if (fd->ref_lock) pthread_mutex_unlock(fd->ref_lock);
2576
2577 last_ref_id = cr->ref_id;
2578 }
2579 } else {
2580 cr->ref_id = -1;
2581 }
2582 } else {
2583 cr->ref_id = ref_id; // Forced constant in CRAM 1.0
2584 }
2585 if (cr->ref_id >= bfd->nref) {
2586 fprintf(stderr, "Requested unknown reference ID %d\n", cr->ref_id);
2587 return -1;
2588 }
2589
2590 if (ds & CRAM_RL) {
2591 if (!c->comp_hdr->codecs[DS_RL]) return -1;
2592 r |= c->comp_hdr->codecs[DS_RL]
2593 ->decode(s, c->comp_hdr->codecs[DS_RL], blk,
2594 (char *)&cr->len, &out_sz);
2595 if (r) return r;
2596 if (cr->len < 0) {
2597 fprintf(stderr, "Read has negative length\n");
2598 return -1;
2599 }
2600 }
2601
2602 if (ds & CRAM_AP) {
2603 if (!c->comp_hdr->codecs[DS_AP]) return -1;
2604 if (CRAM_MAJOR_VERS(fd->version) < 4) {
2605 int32_t i32;
2606 r |= c->comp_hdr->codecs[DS_AP]
2607 ->decode(s, c->comp_hdr->codecs[DS_AP], blk,
2608 (char *)&i32, &out_sz);
2609 cr->apos = i32;
2610 } else {
2611 r |= c->comp_hdr->codecs[DS_AP]
2612 ->decode(s, c->comp_hdr->codecs[DS_AP], blk,
2613 (char *)&cr->apos, &out_sz);
2614 }
2615 if (r) return r;
2616 if (c->comp_hdr->AP_delta)
2617 cr->apos += s->last_apos;
2618 s->last_apos= cr->apos;
2619 } else {
2620 cr->apos = c->ref_seq_start;
2621 }
2622
2623 if (ds & CRAM_RG) {
2624 if (!c->comp_hdr->codecs[DS_RG]) return -1;
2625 r |= c->comp_hdr->codecs[DS_RG]
2626 ->decode(s, c->comp_hdr->codecs[DS_RG], blk,
2627 (char *)&cr->rg, &out_sz);
2628 if (r) return r;
2629 if (cr->rg == unknown_rg)
2630 cr->rg = -1;
2631 } else {
2632 cr->rg = -1;
2633 }
2634
2635 cr->name_len = 0;
2636
2637 if (c->comp_hdr->read_names_included) {
2638 int32_t out_sz2 = 1;
2639
2640 // Read directly into name cram_block
2641 cr->name = BLOCK_SIZE(s->name_blk);
2642 if (ds & CRAM_RN) {
2643 if (!c->comp_hdr->codecs[DS_RN]) return -1;
2644 r |= c->comp_hdr->codecs[DS_RN]
2645 ->decode(s, c->comp_hdr->codecs[DS_RN], blk,
2646 (char *)s->name_blk, &out_sz2);
2647 if (r) return r;
2648 cr->name_len = out_sz2;
2649 }
2650 }
2651
2652 cr->mate_pos = 0;
2653 cr->mate_line = -1;
2654 cr->mate_ref_id = -1;
2655 if ((ds & CRAM_CF) && (cf & CRAM_FLAG_DETACHED)) {
2656 if (ds & CRAM_MF) {
2657 if (IS_CRAM_1_VERS(fd)) {
2658 /* MF is byte in 1.0, int32 in 2.0 */
2659 unsigned char mf;
2660 if (!c->comp_hdr->codecs[DS_MF]) return -1;
2661 r |= c->comp_hdr->codecs[DS_MF]
2662 ->decode(s, c->comp_hdr->codecs[DS_MF],
2663 blk, (char *)&mf, &out_sz);
2664 if (r) return r;
2665 cr->mate_flags = mf;
2666 } else {
2667 if (!c->comp_hdr->codecs[DS_MF]) return -1;
2668 r |= c->comp_hdr->codecs[DS_MF]
2669 ->decode(s, c->comp_hdr->codecs[DS_MF],
2670 blk,
2671 (char *)&cr->mate_flags,
2672 &out_sz);
2673 if (r) return r;
2674 }
2675 } else {
2676 cr->mate_flags = 0;
2677 }
2678
2679 if (!c->comp_hdr->read_names_included) {
2680 int32_t out_sz2 = 1;
2681
2682 // Read directly into name cram_block
2683 cr->name = BLOCK_SIZE(s->name_blk);
2684 if (ds & CRAM_RN) {
2685 if (!c->comp_hdr->codecs[DS_RN]) return -1;
2686 r |= c->comp_hdr->codecs[DS_RN]
2687 ->decode(s, c->comp_hdr->codecs[DS_RN],
2688 blk, (char *)s->name_blk,
2689 &out_sz2);
2690 if (r) return r;
2691 cr->name_len = out_sz2;
2692 }
2693 }
2694
2695 if (ds & CRAM_NS) {
2696 if (!c->comp_hdr->codecs[DS_NS]) return -1;
2697 r |= c->comp_hdr->codecs[DS_NS]
2698 ->decode(s, c->comp_hdr->codecs[DS_NS], blk,
2699 (char *)&cr->mate_ref_id, &out_sz);
2700 if (r) return r;
2701 }
2702
2703 // Skip as mate_ref of "*" is legit. It doesn't mean unmapped, just unknown.
2704 // if (cr->mate_ref_id == -1 && cr->flags & 0x01) {
2705 // /* Paired, but unmapped */
2706 // cr->flags |= BAM_FMUNMAP;
2707 // }
2708
2709 if (ds & CRAM_NP) {
2710 if (!c->comp_hdr->codecs[DS_NP]) return -1;
2711 if (CRAM_MAJOR_VERS(fd->version) < 4) {
2712 int32_t i32;
2713 r |= c->comp_hdr->codecs[DS_NP]
2714 ->decode(s, c->comp_hdr->codecs[DS_NP], blk,
2715 (char *)&i32, &out_sz);
2716 cr->mate_pos = i32;
2717 } else {
2718 r |= c->comp_hdr->codecs[DS_NP]
2719 ->decode(s, c->comp_hdr->codecs[DS_NP], blk,
2720 (char *)&cr->mate_pos, &out_sz);
2721 }
2722 if (r) return r;
2723 }
2724
2725 if (ds & CRAM_TS) {
2726 if (!c->comp_hdr->codecs[DS_TS]) return -1;
2727 if (CRAM_MAJOR_VERS(fd->version) < 4) {
2728 int32_t i32;
2729 r |= c->comp_hdr->codecs[DS_TS]
2730 ->decode(s, c->comp_hdr->codecs[DS_TS], blk,
2731 (char *)&i32, &out_sz);
2732 cr->tlen = i32;
2733 } else {
2734 r |= c->comp_hdr->codecs[DS_TS]
2735 ->decode(s, c->comp_hdr->codecs[DS_TS], blk,
2736 (char *)&cr->tlen, &out_sz);
2737 }
2738 if (r) return r;
2739 } else {
2740 cr->tlen = INT_MIN;
2741 }
2742 } else if ((ds & CRAM_CF) && (cf & CRAM_FLAG_MATE_DOWNSTREAM)) {
2743 if (ds & CRAM_NF) {
2744 if (!c->comp_hdr->codecs[DS_NF]) return -1;
2745 r |= c->comp_hdr->codecs[DS_NF]
2746 ->decode(s, c->comp_hdr->codecs[DS_NF], blk,
2747 (char *)&cr->mate_line, &out_sz);
2748 if (r) return r;
2749 cr->mate_line += rec + 1;
2750
2751 //cr->name_len = sprintf(name, "%d", name_id++);
2752 //cr->name = DSTRING_LEN(name_ds);
2753 //dstring_nappend(name_ds, name, cr->name_len);
2754
2755 cr->mate_ref_id = -1;
2756 cr->tlen = INT_MIN;
2757 cr->mate_pos = 0;
2758 } else {
2759 cr->mate_flags = 0;
2760 cr->tlen = INT_MIN;
2761 }
2762 } else {
2763 cr->mate_flags = 0;
2764 cr->tlen = INT_MIN;
2765 }
2766 /*
2767 else if (!name[0]) {
2768 //name[0] = '?'; name[1] = 0;
2769 //cr->name_len = 1;
2770 //cr->name= DSTRING_LEN(s->name_ds);
2771 //dstring_nappend(s->name_ds, "?", 1);
2772
2773 cr->mate_ref_id = -1;
2774 cr->tlen = 0;
2775 cr->mate_pos = 0;
2776 }
2777 */
2778
2779 /* Auxiliary tags */
2780 has_MD = has_NM = 0;
2781 if (IS_CRAM_1_VERS(fd))
2782 r |= cram_decode_aux_1_0(c, s, blk, cr);
2783 else
2784 r |= cram_decode_aux(c, s, blk, cr, &has_MD, &has_NM);
2785 if (r) return r;
2786
2787 /* Fake up dynamic string growth and appending */
2788 if (ds & CRAM_RL) {
2789 cr->seq = BLOCK_SIZE(s->seqs_blk);
2790 BLOCK_GROW(s->seqs_blk, cr->len);
2791 seq = (char *)BLOCK_END(s->seqs_blk);
2792 BLOCK_SIZE(s->seqs_blk) += cr->len;
2793
2794 if (!seq)
2795 return -1;
2796
2797 cr->qual = BLOCK_SIZE(s->qual_blk);
2798 BLOCK_GROW(s->qual_blk, cr->len);
2799 qual = (char *)BLOCK_END(s->qual_blk);
2800 BLOCK_SIZE(s->qual_blk) += cr->len;
2801
2802 if (!s->ref)
2803 memset(seq, '=', cr->len);
2804 }
2805
2806 if (!(bf & BAM_FUNMAP)) {
2807 if ((ds & CRAM_AP) && cr->apos <= 0) {
2808 fprintf(stderr,
2809 "Read has alignment position %"PRId64
2810 " but no unmapped flag\n",
2811 cr->apos);
2812 return -1;
2813 }
2814 /* Decode sequence and generate CIGAR */
2815 if (ds & (CRAM_SEQ | CRAM_MQ)) {
2816 r |= cram_decode_seq(fd, c, s, blk, cr, bfd, cf, seq, qual,
2817 has_MD, has_NM);
2818 if (r) return r;
2819 } else {
2820 cr->cigar = 0;
2821 cr->ncigar = 0;
2822 cr->aend = cr->apos;
2823 cr->mqual = 0;
2824 }
2825 } else {
2826 int out_sz2 = cr->len;
2827
2828 //puts("Unmapped");
2829 cr->cigar = 0;
2830 cr->ncigar = 0;
2831 cr->aend = cr->apos;
2832 cr->mqual = 0;
2833
2834 if (ds & CRAM_BA && cr->len) {
2835 if (!c->comp_hdr->codecs[DS_BA]) return -1;
2836 r |= c->comp_hdr->codecs[DS_BA]
2837 ->decode(s, c->comp_hdr->codecs[DS_BA], blk,
2838 (char *)seq, &out_sz2);
2839 if (r) return r;
2840 }
2841
2842 if ((ds & CRAM_CF) && (cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
2843 out_sz2 = cr->len;
2844 if (ds & CRAM_QS && cr->len >= 0) {
2845 if (!c->comp_hdr->codecs[DS_QS]) return -1;
2846 r |= c->comp_hdr->codecs[DS_QS]
2847 ->decode(s, c->comp_hdr->codecs[DS_QS],
2848 blk, qual, &out_sz2);
2849 if (r) return r;
2850 }
2851 } else {
2852 if (ds & CRAM_RL)
2853 memset(qual, 255, cr->len);
2854 }
2855 }
2856
2857 if (!fd->ignore_chksum) {
2858 if (s->hdr->BD_crc && ds & CRAM_BA && s->ref)
2859 s->BD_crc += iolib_crc32(0L, (Bytef *) seq, cr->len);
2860
2861 if (s->hdr->SD_crc &&
2862 (ds & CRAM_QS) &&
2863 (cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
2864 s->SD_crc += iolib_crc32(0L, (Bytef *) qual, cr->len);
2865 }
2866 }
2867 }
2868
2869 if (fd->ref_lock) pthread_mutex_lock(fd->ref_lock);
2870 if (refs) {
2871 int i;
2872 for (i = 0; i < fd->refs->nref; i++) {
2873 if (refs[i])
2874 cram_ref_decr(fd->refs, i);
2875 }
2876 free(refs);
2877 } else if (ref_id >= 0 && s->ref != fd->ref_free && !embed_ref) {
2878 cram_ref_decr(fd->refs, ref_id);
2879 }
2880 if (fd->ref_lock) pthread_mutex_unlock(fd->ref_lock);
2881
2882 /* Resolve mate pair cross-references between recs within this slice */
2883 r |= cram_decode_slice_xref(s, fd->required_fields);
2884
2885 // Free the original blocks as we no longer need these.
2886 {
2887 int i;
2888 for (i = 0; i < s->hdr->num_blocks; i++) {
2889 cram_block *b = s->block[i];
2890 cram_free_block(b);
2891 s->block[i] = NULL;
2892 }
2893 }
2894
2895 // Also see initial BLOCK_RESIZE_EXACT at top of function.
2896 // As we grow blocks we overallocate by up to 50%. So shrink
2897 // back to their final sizes here.
2898 //
2899 // fprintf(stderr, "%d %d // %d %d // %d %d // %d %d\n",
2900 // (int)s->seqs_blk->byte, (int)s->seqs_blk->alloc,
2901 // (int)s->qual_blk->byte, (int)s->qual_blk->alloc,
2902 // (int)s->name_blk->byte, (int)s->name_blk->alloc,
2903 // (int)s->aux_blk->byte, (int)s->aux_blk->alloc);
2904 BLOCK_RESIZE_EXACT(s->seqs_blk, BLOCK_SIZE(s->seqs_blk)+1);
2905 BLOCK_RESIZE_EXACT(s->qual_blk, BLOCK_SIZE(s->qual_blk)+1);
2906 BLOCK_RESIZE_EXACT(s->name_blk, BLOCK_SIZE(s->name_blk)+1);
2907 BLOCK_RESIZE_EXACT(s->aux_blk, BLOCK_SIZE(s->aux_blk)+1);
2908
2909 /* Checksum */
2910 if (CRAM_MAJOR_VERS(fd->version) >= 3) {
2911 if (ds & CRAM_BA && s->BD_crc && s->hdr->BD_crc) {
2912 if (s->BD_crc != s->hdr->BD_crc) {
2913 fprintf(stderr, "BD checksum failure: %08x vs %08x\n",
2914 (uint32_t)s->BD_crc, (uint32_t)s->hdr->BD_crc);
2915 r |= -1;
2916 }
2917 }
2918
2919 if (ds & CRAM_QS && s->SD_crc && s->hdr->SD_crc) {
2920 if (s->SD_crc != s->hdr->SD_crc) {
2921 fprintf(stderr, "SD checksum failure: %08x vs %08x\n",
2922 (uint32_t)s->SD_crc, (uint32_t)s->hdr->SD_crc);
2923 r |= -1;
2924 }
2925 }
2926 }
2927
2928 // If we're wanting BAM records, convert these up-front too.
2929 // This is useful when we're streaming lots of data in a
2930 // multi-threaded environment as the cram to bam conversion is
2931 // then threaded too.
2932 //
2933 // Possible future optimisation - check range query and don't
2934 // convert all reads to BAM.
2935
2936 if (fd->pool)
2937 r |= bulk_cram_to_bam(bfd, fd, s);
2938
2939 return r;
2940 }
2941
2942 typedef struct {
2943 cram_fd *fd;
2944 cram_container *c;
2945 cram_slice *s;
2946 SAM_hdr *h;
2947 int exit_code;
2948 } cram_decode_job;
2949
cram_decode_slice_thread(void * arg)2950 void *cram_decode_slice_thread(void *arg) {
2951 cram_decode_job *j = (cram_decode_job *)arg;
2952
2953 j->exit_code = cram_decode_slice(j->fd, j->c, j->s, j->h);
2954
2955 return j;
2956 }
2957
2958 /*
2959 * Spawn a multi-threaded version of cram_decode_slice().
2960 */
cram_decode_slice_mt(cram_fd * fd,cram_container * c,cram_slice * s,SAM_hdr * bfd)2961 int cram_decode_slice_mt(cram_fd *fd, cram_container *c, cram_slice *s,
2962 SAM_hdr *bfd) {
2963 cram_decode_job *j;
2964 int nonblock;
2965
2966 if (!fd->pool)
2967 return cram_decode_slice(fd, c, s, bfd);
2968
2969 if (!(j = malloc(sizeof(*j))))
2970 return -1;
2971
2972 j->fd = fd;
2973 j->c = c;
2974 j->s = s;
2975 j->h = bfd;
2976
2977 nonblock = t_pool_results_queue_sz(fd->rqueue) ? 1 : 0;
2978
2979 if (-1 == t_pool_dispatch2(fd->pool, fd->rqueue, cram_decode_slice_thread,
2980 j, nonblock)) {
2981 /* Would block */
2982 fd->job_pending = j;
2983 } else {
2984 fd->job_pending = NULL;
2985 }
2986
2987 // flush too
2988 return 0;
2989 }
2990
2991
2992 /* ----------------------------------------------------------------------
2993 * CRAM sequence iterators.
2994 */
2995
2996 /*
2997 * Converts a cram in-memory record into a bam in-memory record. We
2998 * pass a pointer to a bam_seq_t pointer along with the a pointer to
2999 * the allocated size. These can initially be pointers to NULL and zero.
3000 *
3001 * This function will reallocate the bam buffer as required and update
3002 * (*bam)->alloc accordingly, allowing it to be used within a loop
3003 * efficiently without needing to allocate new bam objects over and
3004 * over again.
3005 *
3006 * Returns the used size of the bam record on success
3007 * -1 on failure.
3008 */
cram_to_bam(SAM_hdr * bfd,cram_fd * fd,cram_slice * s,cram_record * cr,int rec,bam_seq_t ** bam)3009 static int cram_to_bam(SAM_hdr *bfd, cram_fd *fd, cram_slice *s,
3010 cram_record *cr, int rec, bam_seq_t **bam) {
3011 int bam_idx, rg_len;
3012 char name_a[1024], *name;
3013 int name_len;
3014 char *aux, *aux_orig;
3015 char *seq, *qual;
3016
3017 /* Assign names if not explicitly set */
3018 if (fd->required_fields & SAM_QNAME) {
3019 if (cr->name_len) {
3020 name = (char *)BLOCK_DATA(s->name_blk) + cr->name;
3021 name_len = cr->name_len;
3022 } else {
3023 name = name_a;
3024 name_len = strlen(fd->prefix);
3025 memcpy(name, fd->prefix, name_len);
3026 name += name_len;
3027 *name++ = ':';
3028 if (cr->mate_line >= 0 && cr->mate_line < rec)
3029 name = (char *)append_uint64((unsigned char *)name,
3030 s->hdr->record_counter +
3031 cr->mate_line + 1);
3032 else
3033 name = (char *)append_uint64((unsigned char *)name,
3034 s->hdr->record_counter +
3035 rec + 1);
3036 name_len = name - name_a;
3037 name = name_a;
3038 }
3039 } else {
3040 name = "?";
3041 name_len = 1;
3042 }
3043
3044 /* Generate BAM record */
3045 if (cr->rg < -1 || cr->rg >= bfd->nrg)
3046 return -1;
3047 rg_len = (cr->rg != -1) ? bfd->rg[cr->rg].name_len + 4 : 0;
3048
3049 if (fd->required_fields & (SAM_SEQ | SAM_QUAL)) {
3050 if (!BLOCK_DATA(s->seqs_blk))
3051 return -1;
3052 seq = (char *)BLOCK_DATA(s->seqs_blk) + cr->seq;
3053 } else {
3054 seq = "*";
3055 cr->len = 0;
3056 }
3057
3058 if (fd->required_fields & SAM_QUAL) {
3059 if (!BLOCK_DATA(s->qual_blk))
3060 return -1;
3061 qual = (char *)BLOCK_DATA(s->qual_blk) + cr->qual;
3062 if (CRAM_MAJOR_VERS(fd->version) >= 4) {
3063 if (cr->flags & BAM_FREVERSE) {
3064 int i, j;
3065 for (i = 0, j = cr->len-1; i < j; i++, j--) {
3066 unsigned char c;
3067 c = qual[i];
3068 qual[i] = qual[j];
3069 qual[j] = c;
3070 }
3071 }
3072 }
3073 } else {
3074 qual = NULL;
3075 }
3076
3077 bam_idx = bam_construct_seq(bam, cr->aux_size + rg_len,
3078 name, name_len,
3079 cr->flags,
3080 cr->ref_id,
3081 cr->apos,
3082 cr->aend,
3083 cr->mqual,
3084 cr->ncigar, &s->cigar[cr->cigar],
3085 cr->mate_ref_id,
3086 cr->mate_pos,
3087 cr->tlen,
3088 cr->len,
3089 seq,
3090 qual);
3091 if (bam_idx == -1)
3092 return -1;
3093
3094 aux = aux_orig = (char *)bam_aux(*bam);
3095
3096 /* Auxiliary strings */
3097 if (cr->aux_size != 0) {
3098 memcpy(aux, BLOCK_DATA(s->aux_blk) + cr->aux, cr->aux_size);
3099 aux += cr->aux_size;
3100 }
3101
3102 /* RG:Z: */
3103 if (cr->rg != -1) {
3104 int len = bfd->rg[cr->rg].name_len;
3105 *aux++ = 'R'; *aux++ = 'G'; *aux++ = 'Z';
3106 memcpy(aux, bfd->rg[cr->rg].name, len);
3107 aux += len;
3108 *aux++ = 0;
3109 }
3110
3111 #ifndef SAMTOOLS
3112 bam_set_blk_size(*bam, bam_blk_size(*bam) + (aux - aux_orig));
3113 #endif
3114
3115 *aux++ = 0;
3116
3117 return bam_idx + (aux - aux_orig);
3118 }
3119
3120 /*
3121 * Here be dragons! The multi-threading code in this is crufty beyond belief.
3122 */
3123 /*
3124 * Load first container.
3125 * Called when fd->ctr is NULL>
3126 *
3127 * Returns container on success
3128 * NULL on failure.
3129 */
cram_first_slice(cram_fd * fd)3130 static cram_container *cram_first_slice(cram_fd *fd) {
3131 cram_container *c;
3132
3133 do {
3134 if (!(c = fd->ctr = cram_read_container(fd)))
3135 return NULL;
3136 c->curr_slice_mt = c->curr_slice;
3137 } while (c->length == 0);
3138
3139 /*
3140 * The first container may be a result of a sub-range query.
3141 * In which case it may still not be the optimal starting point
3142 * due to skipped containers/slices in the index.
3143 */
3144 // No need for locks here as we're in the main thread.
3145 if (fd->range.refid != -2) {
3146 while (c->ref_seq_id != -2 &&
3147 (c->ref_seq_id < fd->range.refid ||
3148 (fd->range.refid >= 0 && c->ref_seq_id == fd->range.refid
3149 && c->ref_seq_start + c->ref_seq_span-1 < fd->range.start))) {
3150 if (0 != cram_seek(fd, c->length, SEEK_CUR))
3151 return NULL;
3152 cram_free_container(fd->ctr);
3153 do {
3154 if (!(c = fd->ctr = cram_read_container(fd)))
3155 return NULL;
3156 } while (c->length == 0);
3157 }
3158
3159 if (c->ref_seq_id != -2 && c->ref_seq_id != fd->range.refid) {
3160 fd->eof = 1;
3161 return NULL;
3162 }
3163 }
3164
3165 if (!(c->comp_hdr_block = cram_read_block(fd)))
3166 return NULL;
3167 if (c->comp_hdr_block->content_type != COMPRESSION_HEADER)
3168 return NULL;
3169
3170 c->comp_hdr = cram_decode_compression_header(fd, c->comp_hdr_block);
3171 if (!c->comp_hdr)
3172 return NULL;
3173 if (!c->comp_hdr->AP_delta &&
3174 sam_hdr_sort_order(fd->header) != ORDER_COORD) {
3175 if (fd->ref_lock) pthread_mutex_lock(fd->ref_lock);
3176 fd->unsorted = 1;
3177 if (fd->ref_lock) pthread_mutex_unlock(fd->ref_lock);
3178 }
3179
3180 return c;
3181 }
3182
cram_next_slice(cram_fd * fd,cram_container ** cp)3183 static cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp) {
3184 cram_container *c_curr; // container being consumed via cram_get_seq()
3185 cram_slice *s_curr = NULL;
3186
3187 // Populate the first container if unknown.
3188 if (!(c_curr = fd->ctr)) {
3189 if (!(c_curr = cram_first_slice(fd)))
3190 return NULL;
3191 }
3192
3193 // Discard previous slice
3194 if ((s_curr = c_curr->slice)) {
3195 c_curr->slice = NULL;
3196 cram_free_slice(s_curr);
3197 s_curr = NULL;
3198 }
3199
3200 // If we've consumed all slices in this container, also discard
3201 // the container too.
3202 if (c_curr->curr_slice == c_curr->max_slice) {
3203 if (fd->ctr == c_curr)
3204 fd->ctr = NULL;
3205 if (fd->ctr_mt == c_curr)
3206 fd->ctr_mt = NULL;
3207 cram_free_container(c_curr);
3208 c_curr = NULL;
3209 }
3210
3211 if (!fd->ctr_mt)
3212 fd->ctr_mt = c_curr;
3213
3214 // Fetch the next slice (and the container if necessary).
3215 //
3216 // If single threaded this loop bails out as soon as it finds
3217 // a slice in range. In this case c_next and c_curr end up being
3218 // the same thing.
3219 //
3220 // If multi-threaded, we loop until we have filled out
3221 // thread pool input queue. Here c_next and c_curr *may* differ, as
3222 // can fd->ctr and fd->ctr_mt.
3223 for (;;) {
3224 cram_container *c_next = fd->ctr_mt;
3225 cram_slice *s_next = NULL;
3226
3227 // Next slice; either from the last job we failed to push
3228 // to the input queue or via more I/O.
3229 if (fd->job_pending) {
3230 cram_decode_job *j = (cram_decode_job *)fd->job_pending;
3231 c_next = j->c;
3232 s_next = j->s;
3233 free(fd->job_pending);
3234 fd->job_pending = NULL;
3235 } else if (!fd->ooc) {
3236 empty_container:
3237 if (!c_next || c_next->curr_slice_mt == c_next->max_slice) {
3238 // new container
3239 for(;;) {
3240 if (!(c_next = cram_read_container(fd))) {
3241 if (fd->pool) {
3242 fd->ooc = 1;
3243 break;
3244 }
3245
3246 return NULL;
3247 }
3248 c_next->curr_slice_mt = c_next->curr_slice;
3249
3250 if (c_next->length != 0)
3251 break;
3252
3253 cram_free_container(c_next);
3254 }
3255 if (fd->ooc)
3256 break;
3257
3258 /* Skip containers not yet spanning our range */
3259 if (fd->range.refid != -2 && c_next->ref_seq_id != -2) {
3260 // ref_id beyond end of range; bail out
3261 if (c_next->ref_seq_id != fd->range.refid) {
3262 cram_free_container(c_next);
3263 fd->ctr_mt = NULL;
3264 fd->ooc = 1;
3265 break;
3266 }
3267
3268 // position beyond end of range; bail out
3269 if (c_next->ref_seq_start > fd->range.end) {
3270 cram_free_container(c_next);
3271 fd->ctr_mt = NULL;
3272 fd->ooc = 1;
3273 break;
3274 }
3275
3276 // before start of range; skip to next container
3277 if (c_next->ref_seq_start + c_next->ref_seq_span-1 <
3278 fd->range.start) {
3279 c_next->curr_slice_mt = c_next->max_slice;
3280 cram_seek(fd, c_next->length, SEEK_CUR);
3281 cram_free_container(c_next);
3282 c_next = NULL;
3283 continue;
3284 }
3285 }
3286
3287 // Container is valid range, so remember it for restarting
3288 // this function.
3289 fd->ctr_mt = c_next;
3290
3291 if (!(c_next->comp_hdr_block = cram_read_block(fd)))
3292 return NULL;
3293 if (c_next->comp_hdr_block->content_type != COMPRESSION_HEADER)
3294 return NULL;
3295
3296 c_next->comp_hdr =
3297 cram_decode_compression_header(fd, c_next->comp_hdr_block);
3298 if (!c_next->comp_hdr)
3299 return NULL;
3300
3301 if (!c_next->comp_hdr->AP_delta &&
3302 sam_hdr_sort_order(fd->header) != ORDER_COORD) {
3303 if (fd->ref_lock) pthread_mutex_lock(fd->ref_lock);
3304 fd->unsorted = 1;
3305 if (fd->ref_lock) pthread_mutex_unlock(fd->ref_lock);
3306 }
3307 }
3308
3309 if (c_next->num_records == 0) {
3310 cram_free_container(c_next);
3311 if (fd->ctr_mt == c_next)
3312 fd->ctr_mt = NULL;
3313 c_next = NULL;
3314 goto empty_container;
3315 }
3316
3317 if (!(s_next = c_next->slice = cram_read_slice(fd)))
3318 return NULL;
3319
3320 s_next->slice_num = ++c_next->curr_slice_mt;
3321 s_next->curr_rec = 0;
3322 s_next->max_rec = s_next->hdr->num_records;
3323
3324 s_next->last_apos = s_next->hdr->ref_seq_start;
3325
3326 // We know the container overlaps our range, but with multi-slice
3327 // containers we may have slices that do not. Skip these also.
3328 if (fd->range.refid != -2 && s_next->hdr->ref_seq_id != -2) {
3329 // ref_id beyond end of range; bail out
3330 if (s_next->hdr->ref_seq_id != fd->range.refid) {
3331 fd->ooc = 1;
3332 cram_free_slice(s_next);
3333 s_next = NULL;
3334 break;
3335 }
3336
3337 // position beyond end of range; bail out
3338 if (s_next->hdr->ref_seq_start > fd->range.end) {
3339 fd->ooc = 1;
3340 cram_free_slice(s_next);
3341 s_next = NULL;
3342 break;
3343 }
3344
3345 // before start of range; skip to next slice
3346 if (s_next->hdr->ref_seq_start + s_next->hdr->ref_seq_span-1 <
3347 fd->range.start) {
3348 cram_free_slice(s_next);
3349 continue;
3350 }
3351 }
3352 } // end: if (!fd->ooc)
3353
3354 if (!c_next || !s_next)
3355 break;
3356
3357 // Decode the slice, either right now (non-threaded) or by pushing
3358 // it to the a decode queue (threaded).
3359 if (cram_decode_slice_mt(fd, c_next, s_next, fd->header) != 0) {
3360 fprintf(stderr, "Failure to decode slice\n");
3361 cram_free_slice(s_next);
3362 c_next->slice = NULL;
3363 return NULL;
3364 }
3365
3366 // No thread pool, so don't loop again
3367 if (!fd->pool) {
3368 c_curr = c_next;
3369 s_curr = s_next;
3370 break;
3371 }
3372
3373 // With thread pool, but we have a job pending so our decode queue
3374 // is full.
3375 if (fd->job_pending)
3376 break;
3377
3378 // Otherwise we're threaded with room in the decode input queue, so
3379 // keep reading slices for decode.
3380 // Push it a bit far, to qsize in queue rather than pending arrival,
3381 // as cram tends to be a bit bursty in decode timings.
3382 if (t_pool_results_queue_len(fd->rqueue) > fd->pool->qsize)
3383 break;
3384 } // end of for(;;)
3385
3386
3387 // When not threaded we've already have c_curr and s_curr.
3388 // Otherwise we need get them by pulling off the decode output queue.
3389 if (fd->pool) {
3390 t_pool_result *res;
3391 cram_decode_job *j;
3392
3393 if (fd->ooc && t_pool_results_queue_empty(fd->rqueue))
3394 return NULL;
3395
3396 res = t_pool_next_result_wait(fd->rqueue);
3397
3398 if (!res || !res->data) {
3399 fprintf(stderr, "Call to t_pool_next_result failed\n");
3400 return NULL;
3401 }
3402
3403 j = (cram_decode_job *)res->data;
3404 c_curr = j->c;
3405 s_curr = j->s;
3406
3407 if (j->exit_code != 0) {
3408 fprintf(stderr, "Slice decode failure\n");
3409 fd->eof = 0;
3410 t_pool_delete_result(res, 1);
3411 return NULL;
3412 }
3413
3414 t_pool_delete_result(res, 1);
3415 }
3416
3417 *cp = c_curr;
3418
3419 // Update current slice being processed (as opposed to current
3420 // slice in the multi-threaded reahead.
3421 fd->ctr = c_curr;
3422 if (c_curr) {
3423 c_curr->slice = s_curr;
3424 if (s_curr)
3425 c_curr->curr_slice = s_curr->slice_num;
3426 }
3427 if (s_curr)
3428 s_curr->curr_rec = 0;
3429 else
3430 fd->eof = 1;
3431
3432 return s_curr;
3433 }
3434
3435 /*
3436 * Read the next cram record and return it.
3437 * Note that to decode cram_record the caller will need to look up some data
3438 * in the current slice, pointed to by fd->ctr->slice. This is valid until
3439 * the next call to cram_get_seq (which may invalidate it).
3440 *
3441 * Returns record pointer on success (do not free)
3442 * NULL on failure
3443 */
cram_get_seq(cram_fd * fd)3444 cram_record *cram_get_seq(cram_fd *fd) {
3445 cram_container *c;
3446 cram_slice *s;
3447
3448 for (;;) {
3449 c = fd->ctr;
3450 if (c && c->slice && c->slice->curr_rec < c->slice->max_rec) {
3451 s = c->slice;
3452 } else {
3453 if (!(s = cram_next_slice(fd, &c)))
3454 return NULL;
3455 continue; /* In case slice contains no records */
3456 }
3457
3458 if (fd->range.refid != -2) {
3459 if (fd->range.refid == -1 && s->crecs[s->curr_rec].ref_id != -1) {
3460 // Special case when looking for unmapped blocks at end.
3461 // If these are mixed in with mapped data (c->ref_id == -2)
3462 // then we need skip until we find the unmapped data, if at all
3463 s->curr_rec++;
3464 continue;
3465 }
3466 if (s->crecs[s->curr_rec].ref_id < fd->range.refid &&
3467 s->crecs[s->curr_rec].ref_id != -1) {
3468 // Looking for a mapped read, but not there yet. Special case
3469 // as -1 (unmapped) shouldn't be considered < refid.
3470 s->curr_rec++;
3471 continue;
3472 }
3473
3474 if (s->crecs[s->curr_rec].ref_id != fd->range.refid) {
3475 fd->eof = 1;
3476 cram_free_slice(s);
3477 c->slice = NULL;
3478 return NULL;
3479 }
3480
3481 if (fd->range.refid != -1 && s->crecs[s->curr_rec].apos > fd->range.end) {
3482 fd->eof = 1;
3483 cram_free_slice(s);
3484 c->slice = NULL;
3485 return NULL;
3486 }
3487
3488 if (fd->range.refid != -1 && s->crecs[s->curr_rec].aend < fd->range.start) {
3489 s->curr_rec++;
3490 continue;
3491 }
3492 }
3493
3494 break;
3495 }
3496
3497 fd->ctr = c;
3498 c->slice = s;
3499 return &s->crecs[s->curr_rec++];
3500 }
3501
3502 /*
3503 * Read the next cram record and convert it to a bam_seq_t struct.
3504 *
3505 * Returns 0 on success
3506 * -1 on EOF or failure (check fd->err)
3507 */
cram_get_bam_seq(cram_fd * fd,bam_seq_t ** bam)3508 int cram_get_bam_seq(cram_fd *fd, bam_seq_t **bam) {
3509 cram_record *cr;
3510 cram_container *c;
3511 cram_slice *s;
3512
3513 if (!(cr = cram_get_seq(fd))) {
3514 //*bam=0;
3515 return -1;
3516 }
3517
3518 c = fd->ctr;
3519 s = c->slice;
3520
3521 if (s->bl) {
3522 //*bam = s->bl[s->curr_rec-1]; return 0;
3523
3524 // Ideally we'd just do: *bam = s->bl[s->curr_rec-1];
3525 // That works, but it changes the API as the bam object is
3526 // no longer a malloced block of memory and cannot be
3527 // freed by the caller. (Possibly we can do *bam=0
3528 // in the case where cram_get_seq hits EOF, but this is
3529 // also assuming that the *bam object was ours and not a
3530 // result of, say, a merge with a bam file.)
3531 //
3532 // Hence instead we laboriously manage the memory and do a
3533 // memcpy each time. (This is around an extra 40% time taken
3534 // in main to decode a CRAM file, harming parallel execution.)
3535 int sz = s->bl[s->curr_rec-1]->alloc;
3536 if (!*bam) {
3537 if (!(*bam = malloc(sz)))
3538 return -1;
3539 (*bam)->alloc = sz;
3540 } else if ((*bam)->alloc < sz) {
3541 if (!(*bam = realloc(*bam, sz)))
3542 return -1;
3543 (*bam)->alloc = sz;
3544 }
3545 memcpy(*bam, s->bl[s->curr_rec-1], sz);
3546 return 0;
3547 }
3548
3549 return cram_to_bam(fd->header, fd, s, cr, s->curr_rec-1, bam) >= 0 ? 0 : -1;
3550 }
3551