1 /*
2 Copyright (c) 2012-2013 Genome Research Ltd.
3 Author: James Bonfield <jkb@sanger.ac.uk>
4 
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7 
8    1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
10 
11    2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
14 
15    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
18 
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30 
31 #include <config.h>
32 
33 #include <stdio.h>
34 #include <errno.h>
35 #include <assert.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <zlib.h>
39 #include <sys/types.h>
40 #include <sys/stat.h>
41 #include <math.h>
42 
43 #include "cram/cram.h"
44 #include "cram/os.h"
45 #include "htslib/hts.h"
46 #include "htslib/hts_endian.h"
47 
48 KHASH_MAP_INIT_STR(m_s2u64, uint64_t)
49 
50 #define Z_CRAM_STRAT Z_FILTERED
51 //#define Z_CRAM_STRAT Z_RLE
52 //#define Z_CRAM_STRAT Z_HUFFMAN_ONLY
53 //#define Z_CRAM_STRAT Z_DEFAULT_STRATEGY
54 
55 static int process_one_read(cram_fd *fd, cram_container *c,
56 			    cram_slice *s, cram_record *cr,
57 			    bam_seq_t *b, int rnum);
58 
59 /*
60  * Returns index of val into key.
61  * Basically strchr(key, val)-key;
62  */
sub_idx(char * key,char val)63 static int sub_idx(char *key, char val) {
64     int i;
65 
66     for (i = 0; *key && *key++ != val; i++);
67     return i;
68 }
69 
70 /*
71  * Encodes a compression header block into a generic cram_block structure.
72  *
73  * Returns cram_block ptr on success
74  *         NULL on failure
75  */
cram_encode_compression_header(cram_fd * fd,cram_container * c,cram_block_compression_hdr * h)76 cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c,
77 					   cram_block_compression_hdr *h) {
78     cram_block *cb  = cram_new_block(COMPRESSION_HEADER, 0);
79     cram_block *map = cram_new_block(COMPRESSION_HEADER, 0);
80     int i, mc;
81 
82     if (!cb || !map)
83 	return NULL;
84 
85     /*
86      * This is a concatenation of several blocks of data:
87      * header + landmarks, preservation map, read encoding map, and the tag
88      * encoding map.
89      * All 4 are variable sized and we need to know how large these are
90      * before creating the compression header itself as this starts with
91      * the total size (stored as a variable length string).
92      */
93 
94     // Duplicated from container itself, and removed in 1.1
95     if (CRAM_MAJOR_VERS(fd->version) == 1) {
96 	itf8_put_blk(cb, h->ref_seq_id);
97 	itf8_put_blk(cb, h->ref_seq_start);
98 	itf8_put_blk(cb, h->ref_seq_span);
99 	itf8_put_blk(cb, h->num_records);
100 	itf8_put_blk(cb, h->num_landmarks);
101 	for (i = 0; i < h->num_landmarks; i++) {
102 	    itf8_put_blk(cb, h->landmark[i]);
103 	}
104     }
105 
106     if (h->preservation_map)
107 	kh_destroy(map, h->preservation_map);
108 
109     /* Create in-memory preservation map */
110     /* FIXME: should create this when we create the container */
111     {
112 	khint_t k;
113 	int r;
114 
115 	if (!(h->preservation_map = kh_init(map)))
116 	    return NULL;
117 
118 	k = kh_put(map, h->preservation_map, "RN", &r);
119 	if (-1 == r) return NULL;
120 	kh_val(h->preservation_map, k).i = !fd->lossy_read_names;
121 
122 	if (CRAM_MAJOR_VERS(fd->version) == 1) {
123 	    k = kh_put(map, h->preservation_map, "PI", &r);
124 	    if (-1 == r) return NULL;
125 	    kh_val(h->preservation_map, k).i = 0;
126 
127 	    k = kh_put(map, h->preservation_map, "UI", &r);
128 	    if (-1 == r) return NULL;
129 	    kh_val(h->preservation_map, k).i = 1;
130 
131 	    k = kh_put(map, h->preservation_map, "MI", &r);
132 	    if (-1 == r) return NULL;
133 	    kh_val(h->preservation_map, k).i = 1;
134 
135 	} else {
136 	    // Technically SM was in 1.0, but wasn't in Java impl.
137 	    k = kh_put(map, h->preservation_map, "SM", &r);
138 	    if (-1 == r) return NULL;
139 	    kh_val(h->preservation_map, k).i = 0;
140 
141 	    k = kh_put(map, h->preservation_map, "TD", &r);
142 	    if (-1 == r) return NULL;
143 	    kh_val(h->preservation_map, k).i = 0;
144 
145 	    k = kh_put(map, h->preservation_map, "AP", &r);
146 	    if (-1 == r) return NULL;
147 	    kh_val(h->preservation_map, k).i = h->AP_delta;
148 
149 	    if (fd->no_ref || fd->embed_ref) {
150 		// Reference Required == No
151 		k = kh_put(map, h->preservation_map, "RR", &r);
152 		if (-1 == r) return NULL;
153 		kh_val(h->preservation_map, k).i = 0;
154 	    }
155 	}
156     }
157 
158     /* Encode preservation map; could collapse this and above into one */
159     mc = 0;
160     BLOCK_SIZE(map) = 0;
161     if (h->preservation_map) {
162 	khint_t k;
163 
164 	for (k = kh_begin(h->preservation_map);
165 	     k != kh_end(h->preservation_map);
166 	     k++) {
167 	    const char *key;
168 	    khash_t(map) *pmap = h->preservation_map;
169 
170 
171 	    if (!kh_exist(pmap, k))
172 		continue;
173 
174 	    key = kh_key(pmap, k);
175 	    BLOCK_APPEND(map, key, 2);
176 
177 	    switch(CRAM_KEY(key[0], key[1])) {
178 	    case CRAM_KEY('M','I'):
179 		BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i);
180 		break;
181 
182 	    case CRAM_KEY('U','I'):
183 		BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i);
184 		break;
185 
186 	    case CRAM_KEY('P','I'):
187 		BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i);
188 		break;
189 
190 	    case CRAM_KEY('A','P'):
191 		BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i);
192 		break;
193 
194 	    case CRAM_KEY('R','N'):
195 		BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i);
196 		break;
197 
198 	    case CRAM_KEY('R','R'):
199 		BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i);
200 		break;
201 
202 	    case CRAM_KEY('S','M'): {
203 		char smat[5], *mp = smat;
204 		*mp++ =
205 		    (sub_idx("CGTN", h->substitution_matrix[0][0]) << 6) |
206 		    (sub_idx("CGTN", h->substitution_matrix[0][1]) << 4) |
207 		    (sub_idx("CGTN", h->substitution_matrix[0][2]) << 2) |
208 		    (sub_idx("CGTN", h->substitution_matrix[0][3]) << 0);
209 		*mp++ =
210 		    (sub_idx("AGTN", h->substitution_matrix[1][0]) << 6) |
211 		    (sub_idx("AGTN", h->substitution_matrix[1][1]) << 4) |
212 		    (sub_idx("AGTN", h->substitution_matrix[1][2]) << 2) |
213 		    (sub_idx("AGTN", h->substitution_matrix[1][3]) << 0);
214 		*mp++ =
215 		    (sub_idx("ACTN", h->substitution_matrix[2][0]) << 6) |
216 		    (sub_idx("ACTN", h->substitution_matrix[2][1]) << 4) |
217 		    (sub_idx("ACTN", h->substitution_matrix[2][2]) << 2) |
218 		    (sub_idx("ACTN", h->substitution_matrix[2][3]) << 0);
219 		*mp++ =
220 		    (sub_idx("ACGN", h->substitution_matrix[3][0]) << 6) |
221 		    (sub_idx("ACGN", h->substitution_matrix[3][1]) << 4) |
222 		    (sub_idx("ACGN", h->substitution_matrix[3][2]) << 2) |
223 		    (sub_idx("ACGN", h->substitution_matrix[3][3]) << 0);
224 		*mp++ =
225 		    (sub_idx("ACGT", h->substitution_matrix[4][0]) << 6) |
226 		    (sub_idx("ACGT", h->substitution_matrix[4][1]) << 4) |
227 		    (sub_idx("ACGT", h->substitution_matrix[4][2]) << 2) |
228 		    (sub_idx("ACGT", h->substitution_matrix[4][3]) << 0);
229 		BLOCK_APPEND(map, smat, 5);
230 		break;
231 	    }
232 
233 	    case CRAM_KEY('T','D'): {
234 		itf8_put_blk(map, BLOCK_SIZE(h->TD_blk));
235 		BLOCK_APPEND(map,
236 			     BLOCK_DATA(h->TD_blk),
237 			     BLOCK_SIZE(h->TD_blk));
238 		break;
239 	    }
240 
241 	    default:
242 		fprintf(stderr, "Unknown preservation key '%.2s'\n", key);
243 		break;
244 	    }
245 
246 	    mc++;
247         }
248     }
249     itf8_put_blk(cb, BLOCK_SIZE(map) + itf8_size(mc));
250     itf8_put_blk(cb, mc);
251     BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map));
252 
253     /* rec encoding map */
254     mc = 0;
255     BLOCK_SIZE(map) = 0;
256     if (h->codecs[DS_BF]) {
257 	if (-1 == h->codecs[DS_BF]->store(h->codecs[DS_BF], map, "BF",
258 					  fd->version))
259 	    return NULL;
260 	mc++;
261     }
262     if (h->codecs[DS_CF]) {
263 	if (-1 == h->codecs[DS_CF]->store(h->codecs[DS_CF], map, "CF",
264 					  fd->version))
265 	    return NULL;
266 	mc++;
267     }
268     if (h->codecs[DS_RL]) {
269 	if (-1 == h->codecs[DS_RL]->store(h->codecs[DS_RL], map, "RL",
270 					  fd->version))
271 	    return NULL;
272 	mc++;
273     }
274     if (h->codecs[DS_AP]) {
275 	if (-1 == h->codecs[DS_AP]->store(h->codecs[DS_AP], map, "AP",
276 					  fd->version))
277 	    return NULL;
278 	mc++;
279     }
280     if (h->codecs[DS_RG]) {
281 	if (-1 == h->codecs[DS_RG]->store(h->codecs[DS_RG], map, "RG",
282 					  fd->version))
283 	    return NULL;
284 	mc++;
285     }
286     if (h->codecs[DS_MF]) {
287 	if (-1 == h->codecs[DS_MF]->store(h->codecs[DS_MF], map, "MF",
288 					  fd->version))
289 	    return NULL;
290 	mc++;
291     }
292     if (h->codecs[DS_NS]) {
293 	if (-1 == h->codecs[DS_NS]->store(h->codecs[DS_NS], map, "NS",
294 					  fd->version))
295 	    return NULL;
296 	mc++;
297     }
298     if (h->codecs[DS_NP]) {
299 	if (-1 == h->codecs[DS_NP]->store(h->codecs[DS_NP], map, "NP",
300 					  fd->version))
301 	    return NULL;
302 	mc++;
303     }
304     if (h->codecs[DS_TS]) {
305 	if (-1 == h->codecs[DS_TS]->store(h->codecs[DS_TS], map, "TS",
306 					  fd->version))
307 	    return NULL;
308 	mc++;
309     }
310     if (h->codecs[DS_NF]) {
311 	if (-1 == h->codecs[DS_NF]->store(h->codecs[DS_NF], map, "NF",
312 					  fd->version))
313 	    return NULL;
314 	mc++;
315     }
316     if (h->codecs[DS_TC]) {
317 	if (-1 == h->codecs[DS_TC]->store(h->codecs[DS_TC], map, "TC",
318 					  fd->version))
319 	    return NULL;
320 	mc++;
321     }
322     if (h->codecs[DS_TN]) {
323 	if (-1 == h->codecs[DS_TN]->store(h->codecs[DS_TN], map, "TN",
324 					  fd->version))
325 	    return NULL;
326 	mc++;
327     }
328     if (h->codecs[DS_TL]) {
329 	if (-1 == h->codecs[DS_TL]->store(h->codecs[DS_TL], map, "TL",
330 					  fd->version))
331 	    return NULL;
332 	mc++;
333     }
334     if (h->codecs[DS_FN]) {
335 	if (-1 == h->codecs[DS_FN]->store(h->codecs[DS_FN], map, "FN",
336 					  fd->version))
337 	    return NULL;
338 	mc++;
339     }
340     if (h->codecs[DS_FC]) {
341 	if (-1 == h->codecs[DS_FC]->store(h->codecs[DS_FC], map, "FC",
342 					  fd->version))
343 	    return NULL;
344 	mc++;
345     }
346     if (h->codecs[DS_FP]) {
347 	if (-1 == h->codecs[DS_FP]->store(h->codecs[DS_FP], map, "FP",
348 					  fd->version))
349 	    return NULL;
350 	mc++;
351     }
352     if (h->codecs[DS_BS]) {
353 	if (-1 == h->codecs[DS_BS]->store(h->codecs[DS_BS], map, "BS",
354 					  fd->version))
355 	    return NULL;
356 	mc++;
357     }
358     if (h->codecs[DS_IN]) {
359 	if (-1 == h->codecs[DS_IN]->store(h->codecs[DS_IN], map, "IN",
360 					  fd->version))
361 	    return NULL;
362 	mc++;
363     }
364     if (h->codecs[DS_DL]) {
365 	if (-1 == h->codecs[DS_DL]->store(h->codecs[DS_DL], map, "DL",
366 					  fd->version))
367 	    return NULL;
368 	mc++;
369     }
370     if (h->codecs[DS_BA]) {
371 	if (-1 == h->codecs[DS_BA]->store(h->codecs[DS_BA], map, "BA",
372 					  fd->version))
373 	    return NULL;
374 	mc++;
375     }
376     if (h->codecs[DS_BB]) {
377 	if (-1 == h->codecs[DS_BB]->store(h->codecs[DS_BB], map, "BB",
378 					  fd->version))
379 	    return NULL;
380 	mc++;
381     }
382     if (h->codecs[DS_MQ]) {
383 	if (-1 == h->codecs[DS_MQ]->store(h->codecs[DS_MQ], map, "MQ",
384 					  fd->version))
385 	    return NULL;
386 	mc++;
387     }
388     if (h->codecs[DS_RN]) {
389 	if (-1 == h->codecs[DS_RN]->store(h->codecs[DS_RN], map, "RN",
390 					  fd->version))
391 	    return NULL;
392 	mc++;
393     }
394     if (h->codecs[DS_QS]) {
395 	if (-1 == h->codecs[DS_QS]->store(h->codecs[DS_QS], map, "QS",
396 					  fd->version))
397 	    return NULL;
398 	mc++;
399     }
400     if (h->codecs[DS_QQ]) {
401 	if (-1 == h->codecs[DS_QQ]->store(h->codecs[DS_QQ], map, "QQ",
402 					  fd->version))
403 	    return NULL;
404 	mc++;
405     }
406     if (h->codecs[DS_RI]) {
407 	if (-1 == h->codecs[DS_RI]->store(h->codecs[DS_RI], map, "RI",
408 					  fd->version))
409 	    return NULL;
410 	mc++;
411     }
412     if (CRAM_MAJOR_VERS(fd->version) != 1) {
413 	if (h->codecs[DS_SC]) {
414 	    if (-1 == h->codecs[DS_SC]->store(h->codecs[DS_SC], map, "SC",
415 					      fd->version))
416 		return NULL;
417 	    mc++;
418 	}
419 	if (h->codecs[DS_RS]) {
420 	    if (-1 == h->codecs[DS_RS]->store(h->codecs[DS_RS], map, "RS",
421 					      fd->version))
422 		return NULL;
423 	    mc++;
424 	}
425 	if (h->codecs[DS_PD]) {
426 	    if (-1 == h->codecs[DS_PD]->store(h->codecs[DS_PD], map, "PD",
427 					      fd->version))
428 		return NULL;
429 	    mc++;
430 	}
431 	if (h->codecs[DS_HC]) {
432 	    if (-1 == h->codecs[DS_HC]->store(h->codecs[DS_HC], map, "HC",
433 					      fd->version))
434 		return NULL;
435 	    mc++;
436 	}
437     }
438     if (h->codecs[DS_TM]) {
439 	if (-1 == h->codecs[DS_TM]->store(h->codecs[DS_TM], map, "TM",
440 					  fd->version))
441 	    return NULL;
442 	mc++;
443     }
444     if (h->codecs[DS_TV]) {
445 	if (-1 == h->codecs[DS_TV]->store(h->codecs[DS_TV], map, "TV",
446 					  fd->version))
447 	    return NULL;
448 	mc++;
449     }
450     itf8_put_blk(cb, BLOCK_SIZE(map) + itf8_size(mc));
451     itf8_put_blk(cb, mc);
452     BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map));
453 
454     /* tag encoding map */
455 #if 0
456     mp = map; mc = 0;
457     if (h->tag_encoding_map) {
458         HashItem *hi;
459         HashIter *iter = HashTableIterCreate();
460 	if (!iter)
461 	    return NULL;
462 
463         while ((hi = HashTableIterNext(h->tag_encoding_map, iter))) {
464             cram_map *m = hi->data.p;
465 	    int sz;
466 
467 	    mp += itf8_put(mp, (hi->key[0]<<16)|(hi->key[1]<<8)|hi->key[2]);
468 	    if (-1 == (sz = m->codec->store(m->codec, mp, NULL, fd->version)))
469 		return NULL;
470 	    mp += sz;
471 	    mc++;
472         }
473 
474         HashTableIterDestroy(iter);
475     }
476 #else
477     mc = 0;
478     BLOCK_SIZE(map) = 0;
479     if (c->tags_used) {
480 	khint_t k;
481 
482 	for (k = kh_begin(c->tags_used); k != kh_end(c->tags_used); k++) {
483 	    int key;
484 	    if (!kh_exist(c->tags_used, k))
485 		continue;
486 
487 	    key = kh_key(c->tags_used, k);
488 	    cram_codec *cd = kh_val(c->tags_used, k)->codec;
489 
490 	    itf8_put_blk(map, key);
491 	    if (-1 == cd->store(cd, map, NULL, fd->version))
492 		return NULL;
493 
494 	    mc++;
495 	}
496     }
497 #endif
498     itf8_put_blk(cb, BLOCK_SIZE(map) + itf8_size(mc));
499     itf8_put_blk(cb, mc);
500     BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map));
501 
502     if (fd->verbose)
503 	fprintf(stderr, "Wrote compression block header in %d bytes\n",
504 		(int)BLOCK_SIZE(cb));
505 
506     BLOCK_UPLEN(cb);
507 
508     cram_free_block(map);
509 
510     return cb;
511 }
512 
513 
514 /*
515  * Encodes a slice compression header.
516  *
517  * Returns cram_block on success
518  *         NULL on failure
519  */
cram_encode_slice_header(cram_fd * fd,cram_slice * s)520 cram_block *cram_encode_slice_header(cram_fd *fd, cram_slice *s) {
521     char *buf;
522     char *cp;
523     cram_block *b = cram_new_block(MAPPED_SLICE, 0);
524     int j;
525 
526     if (!b)
527 	return NULL;
528 
529     if (NULL == (cp = buf = malloc(16+5*(8+s->hdr->num_blocks)))) {
530 	cram_free_block(b);
531 	return NULL;
532     }
533 
534     cp += itf8_put(cp, s->hdr->ref_seq_id);
535     cp += itf8_put(cp, s->hdr->ref_seq_start);
536     cp += itf8_put(cp, s->hdr->ref_seq_span);
537     cp += itf8_put(cp, s->hdr->num_records);
538     if (CRAM_MAJOR_VERS(fd->version) == 2)
539 	cp += itf8_put(cp, s->hdr->record_counter);
540     else if (CRAM_MAJOR_VERS(fd->version) >= 3)
541 	cp += ltf8_put(cp, s->hdr->record_counter);
542     cp += itf8_put(cp, s->hdr->num_blocks);
543     cp += itf8_put(cp, s->hdr->num_content_ids);
544     for (j = 0; j < s->hdr->num_content_ids; j++) {
545 	cp += itf8_put(cp, s->hdr->block_content_ids[j]);
546     }
547     if (s->hdr->content_type == MAPPED_SLICE)
548 	cp += itf8_put(cp, s->hdr->ref_base_id);
549 
550     if (CRAM_MAJOR_VERS(fd->version) != 1) {
551 	memcpy(cp, s->hdr->md5, 16); cp += 16;
552     }
553 
554     assert(cp-buf <= 16+5*(8+s->hdr->num_blocks));
555 
556     b->data = (unsigned char *)buf;
557     b->comp_size = b->uncomp_size = cp-buf;
558 
559     return b;
560 }
561 
562 
563 /*
564  * Encodes a single read.
565  *
566  * Returns 0 on success
567  *        -1 on failure
568  */
cram_encode_slice_read(cram_fd * fd,cram_container * c,cram_block_compression_hdr * h,cram_slice * s,cram_record * cr,int * last_pos)569 static int cram_encode_slice_read(cram_fd *fd,
570 				  cram_container *c,
571 				  cram_block_compression_hdr *h,
572 				  cram_slice *s,
573 				  cram_record *cr,
574 				  int *last_pos) {
575     int r = 0;
576     int32_t i32;
577     unsigned char uc;
578 
579     //fprintf(stderr, "Encode seq %d, %d/%d FN=%d, %s\n", rec, core->byte, core->bit, cr->nfeature, s->name_ds->str + cr->name);
580 
581     //printf("BF=0x%x\n", cr->flags);
582     //	    bf = cram_flag_swap[cr->flags];
583     i32 = fd->cram_flag_swap[cr->flags & 0xfff];
584     r |= h->codecs[DS_BF]->encode(s, h->codecs[DS_BF], (char *)&i32, 1);
585 
586     i32 = cr->cram_flags & CRAM_FLAG_MASK;
587     r |= h->codecs[DS_CF]->encode(s, h->codecs[DS_CF], (char *)&i32, 1);
588 
589     if (CRAM_MAJOR_VERS(fd->version) != 1 && s->hdr->ref_seq_id == -2)
590 	r |= h->codecs[DS_RI]->encode(s, h->codecs[DS_RI], (char *)&cr->ref_id, 1);
591 
592     r |= h->codecs[DS_RL]->encode(s, h->codecs[DS_RL], (char *)&cr->len, 1);
593 
594     if (c->pos_sorted) {
595 	i32 = cr->apos - *last_pos;
596 	r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i32, 1);
597 	*last_pos = cr->apos;
598     } else {
599 	i32 = cr->apos;
600 	r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i32, 1);
601     }
602 
603     r |= h->codecs[DS_RG]->encode(s, h->codecs[DS_RG], (char *)&cr->rg, 1);
604 
605     if (cr->cram_flags & CRAM_FLAG_DETACHED) {
606 	i32 = cr->mate_flags;
607 	r |= h->codecs[DS_MF]->encode(s, h->codecs[DS_MF], (char *)&i32, 1);
608 
609 	r |= h->codecs[DS_NS]->encode(s, h->codecs[DS_NS],
610 				      (char *)&cr->mate_ref_id, 1);
611 
612 	r |= h->codecs[DS_NP]->encode(s, h->codecs[DS_NP],
613 				      (char *)&cr->mate_pos, 1);
614 
615 	r |= h->codecs[DS_TS]->encode(s, h->codecs[DS_TS],
616 				      (char *)&cr->tlen, 1);
617     } else if (cr->cram_flags & CRAM_FLAG_MATE_DOWNSTREAM) {
618 	r |= h->codecs[DS_NF]->encode(s, h->codecs[DS_NF],
619 				      (char *)&cr->mate_line, 1);
620     }
621 
622     /* Aux tags */
623     if (CRAM_MAJOR_VERS(fd->version) == 1) {
624 	int j;
625 	uc = cr->ntags;
626 	r |= h->codecs[DS_TC]->encode(s, h->codecs[DS_TC], (char *)&uc, 1);
627 
628 	for (j = 0; j < cr->ntags; j++) {
629 	    uint32_t i32 = s->TN[cr->TN_idx + j]; // id
630 	    r |= h->codecs[DS_TN]->encode(s, h->codecs[DS_TN], (char *)&i32, 1);
631 	}
632     } else {
633 	r |= h->codecs[DS_TL]->encode(s, h->codecs[DS_TL], (char *)&cr->TL, 1);
634     }
635 
636     // qual
637     // QS codec : Already stored in block[2].
638 
639     // features (diffs)
640     if (!(cr->flags & BAM_FUNMAP)) {
641 	int prev_pos = 0, j;
642 
643 	r |= h->codecs[DS_FN]->encode(s, h->codecs[DS_FN],
644 				      (char *)&cr->nfeature, 1);
645 	for (j = 0; j < cr->nfeature; j++) {
646 	    cram_feature *f = &s->features[cr->feature + j];
647 
648 	    uc = f->X.code;
649 	    r |= h->codecs[DS_FC]->encode(s, h->codecs[DS_FC], (char *)&uc, 1);
650 	    i32 = f->X.pos - prev_pos;
651 	    r |= h->codecs[DS_FP]->encode(s, h->codecs[DS_FP], (char *)&i32, 1);
652 	    prev_pos = f->X.pos;
653 
654 	    switch(f->X.code) {
655 		//char *seq;
656 
657 	    case 'X':
658 		//fprintf(stderr, "    FC=%c FP=%d base=%d\n", f->X.code, i32, f->X.base);
659 
660 		uc = f->X.base;
661 		r |= h->codecs[DS_BS]->encode(s, h->codecs[DS_BS],
662 					      (char *)&uc, 1);
663 		break;
664 	    case 'S':
665 		// Already done
666 //		r |= h->codecs[DS_SC]->encode(s, h->codecs[DS_SC],
667 //					      BLOCK_DATA(s->soft_blk) + f->S.seq_idx,
668 //					      f->S.len);
669 
670 //		if (IS_CRAM_3_VERS(fd)) {
671 //		    r |= h->codecs[DS_BB]->encode(s, h->codecs[DS_BB],
672 //						  BLOCK_DATA(s->seqs_blk) + f->S.seq_idx,
673 //						  f->S.len);
674 //		}
675 		break;
676 	    case 'I':
677 		//seq = DSTRING_STR(s->seqs_ds) + f->S.seq_idx;
678 		//r |= h->codecs[DS_IN]->encode(s, h->codecs[DS_IN],
679 		//			     seq, f->S.len);
680 //		if (IS_CRAM_3_VERS(fd)) {
681 //		    r |= h->codecs[DS_BB]->encode(s, h->codecs[DS_BB],
682 //						  BLOCK_DATA(s->seqs_blk) + f->I.seq_idx,
683 //						  f->I.len);
684 //		}
685 		break;
686 	    case 'i':
687 		uc = f->i.base;
688 		r |= h->codecs[DS_BA]->encode(s, h->codecs[DS_BA],
689 					      (char *)&uc, 1);
690 		//seq = DSTRING_STR(s->seqs_ds) + f->S.seq_idx;
691 		//r |= h->codecs[DS_IN]->encode(s, h->codecs[DS_IN],
692 		//			     seq, 1);
693 		break;
694 	    case 'D':
695 		i32 = f->D.len;
696 		r |= h->codecs[DS_DL]->encode(s, h->codecs[DS_DL],
697 					      (char *)&i32, 1);
698 		break;
699 
700 	    case 'B':
701 		//		    // Used when we try to store a non ACGTN base or an N
702 		//		    // that aligns against a non ACGTN reference
703 
704 		uc  = f->B.base;
705 		r |= h->codecs[DS_BA]->encode(s, h->codecs[DS_BA],
706 					      (char *)&uc, 1);
707 
708 		//                  Already added
709 		//		    uc  = f->B.qual;
710 		//		    r |= h->codecs[DS_QS]->encode(s, h->codecs[DS_QS],
711 		//					     (char *)&uc, 1);
712 		break;
713 
714 	    case 'b':
715 		// string of bases
716 		r |= h->codecs[DS_BB]->encode(s, h->codecs[DS_BB],
717 					      (char *)BLOCK_DATA(s->seqs_blk)
718 					              + f->b.seq_idx,
719 					      f->b.len);
720 		break;
721 
722 	    case 'Q':
723 		//                  Already added
724 		//		    uc  = f->B.qual;
725 		//		    r |= h->codecs[DS_QS]->encode(s, h->codecs[DS_QS],
726 		//					     (char *)&uc, 1);
727 		break;
728 
729 	    case 'N':
730 		i32 = f->N.len;
731 		r |= h->codecs[DS_RS]->encode(s, h->codecs[DS_RS],
732 					      (char *)&i32, 1);
733 		break;
734 
735 	    case 'P':
736 		i32 = f->P.len;
737 		r |= h->codecs[DS_PD]->encode(s, h->codecs[DS_PD],
738 					      (char *)&i32, 1);
739 		break;
740 
741 	    case 'H':
742 		i32 = f->H.len;
743 		r |= h->codecs[DS_HC]->encode(s, h->codecs[DS_HC],
744 					      (char *)&i32, 1);
745 		break;
746 
747 
748 	    default:
749 		fprintf(stderr, "unhandled feature code %c\n",
750 			f->X.code);
751 		return -1;
752 	    }
753 	}
754 
755 	r |= h->codecs[DS_MQ]->encode(s, h->codecs[DS_MQ],
756 				      (char *)&cr->mqual, 1);
757     } else {
758 	char *seq = (char *)BLOCK_DATA(s->seqs_blk) + cr->seq;
759 	if (cr->len)
760 	    r |= h->codecs[DS_BA]->encode(s, h->codecs[DS_BA], seq, cr->len);
761     }
762 
763     return r ? -1 : 0;
764 }
765 
766 
767 /*
768  * Applies various compression methods to specific blocks, depending on
769  * known observations of how data series compress.
770  *
771  * Returns 0 on success
772  *        -1 on failure
773  */
cram_compress_slice(cram_fd * fd,cram_container * c,cram_slice * s)774 static int cram_compress_slice(cram_fd *fd, cram_container *c, cram_slice *s) {
775     int level = fd->level, i;
776     int method = 1<<GZIP | 1<<GZIP_RLE, methodF = method;
777 
778     /* Compress the CORE Block too, with minimal zlib level */
779     if (level > 5 && s->block[0]->uncomp_size > 500)
780 	cram_compress_block(fd, s->block[0], NULL, 1<<GZIP, 1);
781 
782     if (fd->use_bz2)
783 	method |= 1<<BZIP2;
784 
785     if (fd->use_rans)
786 	method |= (1<<RANS0) | (1<<RANS1);
787 
788     if (fd->use_lzma)
789 	method |= (1<<LZMA);
790 
791     /* Faster method for data series we only need entropy encoding on */
792     methodF = method & ~(1<<GZIP | 1<<BZIP2 | 1<<LZMA);
793     if (level >= 6)
794 	methodF = method;
795 
796 
797     /* Specific compression methods for certain block types */
798     if (cram_compress_block(fd, s->block[DS_IN], fd->m[DS_IN], //IN (seq)
799 			    method, level))
800 	return -1;
801 
802     if (fd->level == 0) {
803 	/* Do nothing */
804     } else if (fd->level == 1) {
805 	if (cram_compress_block(fd, s->block[DS_QS], fd->m[DS_QS],
806 				methodF, 1))
807 	    return -1;
808 	for (i = DS_aux; i <= DS_aux_oz; i++) {
809 	    if (s->block[i])
810 		if (cram_compress_block(fd, s->block[i], fd->m[i],
811 					method, 1))
812 		    return -1;
813 	}
814     } else if (fd->level < 3) {
815 	if (cram_compress_block(fd, s->block[DS_QS], fd->m[DS_QS],
816 				method, 1))
817 	    return -1;
818 	if (cram_compress_block(fd, s->block[DS_BA], fd->m[DS_BA],
819 				method, 1))
820 	    return -1;
821 	if (s->block[DS_BB])
822 	    if (cram_compress_block(fd, s->block[DS_BB], fd->m[DS_BB],
823 				    method, 1))
824 	    return -1;
825 	for (i = DS_aux; i <= DS_aux_oz; i++) {
826 	    if (s->block[i])
827 		if (cram_compress_block(fd, s->block[i], fd->m[i],
828 					method, level))
829 		    return -1;
830 	}
831     } else {
832 	if (cram_compress_block(fd, s->block[DS_QS], fd->m[DS_QS],
833 				method, level))
834 	    return -1;
835 	if (cram_compress_block(fd, s->block[DS_BA], fd->m[DS_BA],
836 				method, level))
837 	    return -1;
838 	if (s->block[DS_BB])
839 	    if (cram_compress_block(fd, s->block[DS_BB], fd->m[DS_BB],
840 				    method, level))
841 	    return -1;
842 	for (i = DS_aux; i <= DS_aux_oz; i++) {
843 	    if (s->block[i])
844 		if (cram_compress_block(fd, s->block[i], fd->m[i],
845 					method, level))
846 		    return -1;
847 	}
848     }
849 
850     // NAME: best is generally xz, bzip2, zlib then rans1
851     // It benefits well from a little bit extra compression level.
852     if (cram_compress_block(fd, s->block[DS_RN], fd->m[DS_RN],
853 			    method & ~(1<<RANS0 | 1<<GZIP_RLE),
854 			    MIN(9,level)))
855 	return -1;
856 
857     // NS shows strong local correlation as rearrangements are localised
858     if (s->block[DS_NS] != s->block[0])
859 	if (cram_compress_block(fd, s->block[DS_NS], fd->m[DS_NS],
860 				method, level))
861 	    return -1;
862 
863 
864     /*
865      * Compress any auxiliary tags with their own per-tag metrics
866      */
867     {
868 	int i;
869 	for (i = 0; i < s->naux_block; i++) {
870 	    if (!s->aux_block[i] || s->aux_block[i] == s->block[0])
871 		continue;
872 
873 	    if (s->aux_block[i]->method != RAW)
874 		continue;
875 
876 	    if (cram_compress_block(fd, s->aux_block[i], s->aux_block[i]->m,
877 				    method, level))
878 		return -1;
879 	}
880     }
881 
882     /*
883      * Minimal compression of any block still uncompressed, bar CORE
884      */
885     {
886 	int i;
887 	for (i = 1; i < s->hdr->num_blocks && i < DS_END; i++) {
888 	    if (!s->block[i] || s->block[i] == s->block[0])
889 		continue;
890 
891 	    if (s->block[i]->method != RAW)
892 		continue;
893 
894 	    if (cram_compress_block(fd, s->block[i], fd->m[i],
895 				    methodF, level))
896 		return -1;
897 	}
898     }
899 
900     return 0;
901 }
902 
903 /*
904  * Encodes a single slice from a container
905  *
906  * Returns 0 on success
907  *        -1 on failure
908  */
cram_encode_slice(cram_fd * fd,cram_container * c,cram_block_compression_hdr * h,cram_slice * s)909 static int cram_encode_slice(cram_fd *fd, cram_container *c,
910 			     cram_block_compression_hdr *h, cram_slice *s) {
911     int rec, r = 0, last_pos;
912     int embed_ref;
913     enum cram_DS_ID id;
914 
915     embed_ref = fd->embed_ref && s->hdr->ref_seq_id != -1 ? 1 : 0;
916 
917     /*
918      * Slice external blocks:
919      * ID 0 => base calls (insertions, soft-clip)
920      * ID 1 => qualities
921      * ID 2 => names
922      * ID 3 => TS (insert size), NP (next frag)
923      * ID 4 => tag values
924      * ID 6 => tag IDs (TN), if CRAM_V1.0
925      * ID 7 => TD tag dictionary, if !CRAM_V1.0
926      */
927 
928     /* Create cram slice header */
929     s->hdr->ref_base_id = embed_ref ? DS_ref : -1;
930     s->hdr->record_counter = c->num_records + c->record_counter;
931     c->num_records += s->hdr->num_records;
932 
933     int ntags = c->tags_used ? c->tags_used->n_occupied : 0;
934     s->block = calloc(DS_END + ntags, sizeof(s->block[0]));
935     s->hdr->block_content_ids = malloc(DS_END * sizeof(int32_t));
936     if (!s->block || !s->hdr->block_content_ids)
937 	return -1;
938 
939     // Create first fixed blocks, always external.
940     // CORE
941     if (!(s->block[0] = cram_new_block(CORE, 0)))
942 	return -1;
943 
944     // TN block for CRAM v1
945     if (CRAM_MAJOR_VERS(fd->version) == 1) {
946 	if (h->codecs[DS_TN]->codec == E_EXTERNAL) {
947 	    if (!(s->block[DS_TN] = cram_new_block(EXTERNAL,DS_TN))) return -1;
948 	    h->codecs[DS_TN]->external.content_id = DS_TN;
949 	} else {
950 	    s->block[DS_TN] = s->block[0];
951 	}
952 	s->block[DS_TN] = s->block[DS_TN];
953     }
954 
955     // Embedded reference
956     if (embed_ref) {
957 	if (!(s->block[DS_ref] = cram_new_block(EXTERNAL, DS_ref)))
958 	    return -1;
959 	s->ref_id = DS_ref; // needed?
960 	BLOCK_APPEND(s->block[DS_ref],
961 		     c->ref + c->first_base - c->ref_start,
962 		     c->last_base - c->first_base + 1);
963     }
964 
965     /*
966      * All the data-series blocks if appropriate.
967      */
968     for (id = DS_BF; id < DS_TN; id++) {
969 	if (h->codecs[id] && (h->codecs[id]->codec == E_EXTERNAL ||
970 			      h->codecs[id]->codec == E_BYTE_ARRAY_STOP ||
971 			      h->codecs[id]->codec == E_BYTE_ARRAY_LEN)) {
972 	    switch (h->codecs[id]->codec) {
973 	    case E_EXTERNAL:
974 		if (!(s->block[id] = cram_new_block(EXTERNAL, id)))
975 		    return -1;
976 		h->codecs[id]->external.content_id = id;
977 		break;
978 
979 	    case E_BYTE_ARRAY_STOP:
980 		if (!(s->block[id] = cram_new_block(EXTERNAL, id)))
981 		    return -1;
982 		h->codecs[id]->byte_array_stop.content_id = id;
983 		break;
984 
985 	    case E_BYTE_ARRAY_LEN: {
986 		cram_codec *cc;
987 
988 		cc = h->codecs[id]->e_byte_array_len.len_codec;
989 		if (cc->codec == E_EXTERNAL) {
990 		    int eid = cc->external.content_id;
991 		    if (!(s->block[eid] = cram_new_block(EXTERNAL, eid)))
992 			return -1;
993 		    cc->external.content_id = eid;
994 		    cc->out = s->block[eid];
995 		}
996 
997 		cc = h->codecs[id]->e_byte_array_len.val_codec;
998 		if (cc->codec == E_EXTERNAL) {
999 		    int eid = cc->external.content_id;
1000 		    if (!s->block[eid])
1001 			if (!(s->block[eid] = cram_new_block(EXTERNAL, eid)))
1002 			    return -1;
1003 		    cc->external.content_id = eid;
1004 		    cc->out = s->block[eid];
1005 		}
1006 		break;
1007 	    }
1008 	    default:
1009 		break;
1010 	    }
1011 	} else {
1012 	    if (!(id == DS_BB && !h->codecs[DS_BB]))
1013 		s->block[id] = s->block[0];
1014 	}
1015 	if (h->codecs[id])
1016 	    h->codecs[id]->out = s->block[id];
1017     }
1018 
1019     /*
1020      * Add in the external tag blocks too.
1021      */
1022     if (c->tags_used) {
1023 	int n;
1024 	s->hdr->num_blocks = DS_END;
1025 	for (n = 0; n < s->naux_block; n++)
1026 	    s->block[s->hdr->num_blocks++] = s->aux_block[n];
1027     }
1028 
1029     /* Encode reads */
1030     last_pos = s->hdr->ref_seq_start;
1031     for (rec = 0; rec < s->hdr->num_records; rec++) {
1032 	cram_record *cr = &s->crecs[rec];
1033 	if (cram_encode_slice_read(fd, c, h, s, cr, &last_pos) == -1)
1034 	    return -1;
1035     }
1036 
1037     s->block[0]->uncomp_size = s->block[0]->byte + (s->block[0]->bit < 7);
1038     s->block[0]->comp_size = s->block[0]->uncomp_size;
1039 
1040     // Make sure the fixed blocks point to the correct sources
1041     s->block[DS_IN] = s->base_blk; s->base_blk = NULL;
1042     s->block[DS_QS] = s->qual_blk; s->qual_blk = NULL;
1043     s->block[DS_RN] = s->name_blk; s->name_blk = NULL;
1044     s->block[DS_SC] = s->soft_blk; s->soft_blk = NULL;
1045 
1046     // Ensure block sizes are up to date.
1047     for (id = 1; id < s->hdr->num_blocks; id++) {
1048 	if (!s->block[id] || s->block[id] == s->block[0])
1049 	    continue;
1050 
1051 	if (s->block[id]->uncomp_size == 0)
1052 	    BLOCK_UPLEN(s->block[id]);
1053     }
1054 
1055     // Compress it all
1056     if (cram_compress_slice(fd, c, s) == -1)
1057 	return -1;
1058 
1059     // Collapse empty blocks and create hdr_block
1060     {
1061 	int i, j;
1062 
1063 	s->hdr->block_content_ids = realloc(s->hdr->block_content_ids,
1064 					    s->hdr->num_blocks * sizeof(int32_t));
1065 	if (!s->hdr->block_content_ids)
1066 	    return -1;
1067 
1068 	for (i = j = 1; i < s->hdr->num_blocks; i++) {
1069 	    if (!s->block[i] || s->block[i] == s->block[0])
1070 		continue;
1071 	    if (s->block[i]->uncomp_size == 0) {
1072 		cram_free_block(s->block[i]);
1073 		s->block[i] = NULL;
1074 		continue;
1075 	    }
1076 	    s->block[j] = s->block[i];
1077 	    s->hdr->block_content_ids[j-1] = s->block[i]->content_id;
1078 	    j++;
1079 	}
1080 	s->hdr->num_content_ids = j-1;
1081 	s->hdr->num_blocks = j;
1082 
1083 	if (!(s->hdr_block = cram_encode_slice_header(fd, s)))
1084 	    return -1;
1085     }
1086 
1087     return r ? -1 : 0;
1088 }
1089 
1090  /*
1091  * Returns the number of expected read names for this record.
1092  */
expected_template_count(bam_seq_t * b)1093 static int expected_template_count(bam_seq_t *b) {
1094     int expected = bam_flag(b) & BAM_FPAIRED ? 2 : 1;
1095 
1096     uint8_t *TC = (uint8_t *)bam_aux_get(b, "TC");
1097     if (TC) {
1098 	int n = bam_aux2i(TC);
1099 	if (expected < n)
1100 	    expected = n;
1101     }
1102 
1103     if (!TC && bam_aux_get(b, "SA")) {
1104 	// We could count the semicolons, but we'd have to do this for
1105 	// read1, read2 and read(not-1-or-2) combining the results
1106 	// together.  This is a cheap and safe alternative for now.
1107 	expected = INT_MAX;
1108     }
1109 
1110     return expected;
1111 }
1112 
1113 /*
1114  * Lossily reject read names.
1115  *
1116  * The rule here is that if all reads for this template reside in the
1117  * same slice then we can lose the name.  Otherwise we keep them as we
1118  * do not know when (or if) the other reads will turn up.
1119  *
1120  * Note there may be only 1 read (non-paired library) or more than 2
1121  * reads (paired library with supplementary reads), or other weird
1122  * setups.  We need to know how many are expected.  Ways to guess:
1123  *
1124  * - Flags (0x1 - has > 1 read)
1125  * - TC aux field (not mandatory)
1126  * - SA tags (count semicolons, NB per fragment so sum - hard)
1127  * - RNEXT/PNEXT uniqueness count. (not implemented, tricky)
1128  *
1129  * Returns 0 on success
1130  *        -1 on failure
1131  */
lossy_read_names(cram_fd * fd,cram_container * c,cram_slice * s,int bam_start)1132 static int lossy_read_names(cram_fd *fd, cram_container *c, cram_slice *s,
1133 			    int bam_start) {
1134     int r1, r2, ret = -1;
1135 
1136     // Initialise cram_flags
1137     for (r2 = 0; r2 < s->hdr->num_records; r2++)
1138 	s->crecs[r2].cram_flags = 0;
1139 
1140     if (!fd->lossy_read_names)
1141 	return 0;
1142 
1143     khash_t(m_s2u64) *names = kh_init(m_s2u64);
1144     if (!names)
1145 	goto fail;
1146 
1147     // 1: Iterate through names to count frequency
1148     for (r1 = bam_start, r2 = 0; r2 < s->hdr->num_records; r1++, r2++) {
1149 	//cram_record *cr = &s->crecs[r2];
1150 	bam_seq_t *b = c->bams[r1];
1151 	khint_t k;
1152 	int n;
1153 	uint64_t e;
1154 	union {
1155 	    uint64_t i64;
1156 	    struct {
1157 		int32_t e,c; // expected & observed counts.
1158 	    };
1159 	} u;
1160 
1161 	e = expected_template_count(b);
1162 	u.e = e; u.c = 1;
1163 
1164 	k = kh_put(m_s2u64, names, bam_name(b), &n);
1165 	if (n == -1)
1166 	    goto fail;
1167 
1168 	if (n == 0) {
1169 	    // not a new name
1170 	    u.i64 = kh_val(names, k);
1171 	    if (u.e != e) {
1172 		// different expectation or already hit the max
1173 		//fprintf(stderr, "Err computing no. %s recs\n", bam_name(b));
1174 		kh_val(names, k) = 0;
1175 	    } else {
1176 		u.c++;
1177 		if (u.e == u.c) {
1178 		    // Reached expected count.
1179 		    kh_val(names, k) = -1;
1180 		} else {
1181 		    kh_val(names, k) = u.i64;
1182 		}
1183 	    }
1184 	} else {
1185 	    // new name
1186 	    kh_val(names, k) = u.i64;
1187 	}
1188     }
1189 
1190     // 2: Remove names if all present (hd.i == -1)
1191     for (r1 = bam_start, r2 = 0; r2 < s->hdr->num_records; r1++, r2++) {
1192 	cram_record *cr = &s->crecs[r2];
1193 	bam_seq_t *b = c->bams[r1];
1194 	khint_t k;
1195 
1196 	k = kh_get(m_s2u64, names, bam_name(b));
1197 
1198 	if (k == kh_end(names))
1199 	    goto fail;
1200 
1201 	if (kh_val(names, k) == -1)
1202 	    cr->cram_flags = CRAM_FLAG_DISCARD_NAME;
1203     }
1204 
1205     ret = 0;
1206  fail: // ret==-1
1207 
1208     if (names)
1209 	kh_destroy(m_s2u64, names);
1210 
1211     return ret;
1212 }
1213 
1214 /*
1215  * Adds the reading names.  We do this here as a separate pass rather
1216  * than per record in the process_one_read calls as that function can
1217  * go back and change the CRAM_FLAG_DETACHED status of a previously
1218  * processed read if it subsequently determines the TLEN field is
1219  * incorrect.  Given DETACHED reads always try to decode read names,
1220  * we need to know their status before generating the read-name block.
1221  *
1222  * Output is an update s->name_blk, and cr->name / cr->name_len
1223  * fields.
1224  */
add_read_names(cram_fd * fd,cram_container * c,cram_slice * s,int bam_start)1225 static void add_read_names(cram_fd *fd, cram_container *c, cram_slice *s,
1226 			   int bam_start) {
1227     int r1, r2;
1228     int keep_names = !fd->lossy_read_names;
1229 
1230     for (r1 = bam_start, r2 = 0;
1231 	 r1 < c->curr_c_rec && r2 < s->hdr->num_records;
1232 	 r1++, r2++) {
1233 	cram_record *cr = &s->crecs[r2];
1234 	bam_seq_t *b = c->bams[r1];
1235 
1236 	cr->name        = BLOCK_SIZE(s->name_blk);
1237 	if ((cr->cram_flags & CRAM_FLAG_DETACHED) || keep_names) {
1238 	    BLOCK_APPEND(s->name_blk, bam_name(b), bam_name_len(b));
1239 	    cr->name_len    = bam_name_len(b);
1240 	} else {
1241 	    // Can only discard duplicate names if not detached
1242 	    cr->name_len = 0;
1243 	}
1244 	cram_stats_add(c->stats[DS_RN], cr->name_len);
1245     }
1246 }
1247 
1248 /*
1249  * Encodes all slices in a container into blocks.
1250  * Returns 0 on success
1251  *        -1 on failure
1252  */
cram_encode_container(cram_fd * fd,cram_container * c)1253 int cram_encode_container(cram_fd *fd, cram_container *c) {
1254     int i, j, slice_offset;
1255     cram_block_compression_hdr *h = c->comp_hdr;
1256     cram_block *c_hdr;
1257     int multi_ref = 0;
1258     int r1, r2, sn, nref;
1259     spare_bams *spares;
1260 
1261     /* Cache references up-front if we have unsorted access patterns */
1262     pthread_mutex_lock(&fd->ref_lock);
1263     nref = fd->refs->nref;
1264     pthread_mutex_unlock(&fd->ref_lock);
1265 
1266     if (!fd->no_ref && c->refs_used) {
1267 	for (i = 0; i < nref; i++) {
1268 	    if (c->refs_used[i])
1269 		cram_get_ref(fd, i, 1, 0);
1270 	}
1271     }
1272 
1273     /* To create M5 strings */
1274     /* Fetch reference sequence */
1275     if (!fd->no_ref) {
1276 	bam_seq_t *b = c->bams[0];
1277 	char *ref;
1278 
1279 	ref = cram_get_ref(fd, bam_ref(b), 1, 0);
1280 	if (!ref && bam_ref(b) >= 0) {
1281 	    fprintf(stderr, "Failed to load reference #%d\n", bam_ref(b));
1282 	    return -1;
1283 	}
1284 	if ((c->ref_id = bam_ref(b)) >= 0) {
1285 	    c->ref_seq_id = c->ref_id;
1286 	    c->ref       = fd->refs->ref_id[c->ref_seq_id]->seq;
1287 	    c->ref_start = 1;
1288 	    c->ref_end   = fd->refs->ref_id[c->ref_seq_id]->length;
1289 	} else {
1290 	    c->ref_seq_id = c->ref_id; // FIXME remove one var!
1291 	}
1292     } else {
1293 	c->ref_id = bam_ref(c->bams[0]);
1294 	cram_ref_incr(fd->refs, c->ref_id);
1295 	c->ref_seq_id = c->ref_id;
1296     }
1297 
1298     /* Turn bams into cram_records and gather basic stats */
1299     for (r1 = sn = 0; r1 < c->curr_c_rec; sn++) {
1300 	cram_slice *s = c->slices[sn];
1301 	int first_base = INT_MAX, last_base = INT_MIN;
1302 
1303 	int r1_start = r1;
1304 
1305 	assert(sn < c->curr_slice);
1306 
1307 	// Discover which read names *may* be safely removed.
1308 	// Ie which ones have all their records in this slice.
1309 	if (lossy_read_names(fd, c, s, r1_start) != 0)
1310 	    return -1;
1311 
1312 	// Iterate through records creating the cram blocks for some
1313 	// fields and just gathering stats for others.
1314 	for (r2 = 0; r1 < c->curr_c_rec && r2 < s->hdr->num_records; r1++, r2++) {
1315 	    cram_record *cr = &s->crecs[r2];
1316 	    bam_seq_t *b = c->bams[r1];
1317 
1318 	    /* If multi-ref we need to cope with changing reference per seq */
1319 	    if (c->multi_seq && !fd->no_ref) {
1320 		if (bam_ref(b) != c->ref_seq_id && bam_ref(b) >= 0) {
1321 		    if (c->ref_seq_id >= 0)
1322 			cram_ref_decr(fd->refs, c->ref_seq_id);
1323 
1324 		    if (!cram_get_ref(fd, bam_ref(b), 1, 0)) {
1325 			fprintf(stderr, "Failed to load reference #%d\n",
1326 				bam_ref(b));
1327 			return -1;
1328 		    }
1329 
1330 		    c->ref_seq_id = bam_ref(b); // overwritten later by -2
1331 		    if (!fd->refs->ref_id[c->ref_seq_id]->seq)
1332 			return -1;
1333 		    c->ref       = fd->refs->ref_id[c->ref_seq_id]->seq;
1334 		    c->ref_start = 1;
1335 		    c->ref_end   = fd->refs->ref_id[c->ref_seq_id]->length;
1336 		}
1337 	    }
1338 
1339 	    if (process_one_read(fd, c, s, cr, b, r2) != 0)
1340 		return -1;
1341 
1342 	    if (first_base > cr->apos)
1343 		first_base = cr->apos;
1344 
1345 	    if (last_base < cr->aend)
1346 		last_base = cr->aend;
1347 	}
1348 
1349 	// Process_one_read doesn't add read names as it can change
1350 	// its mind during the loop on the CRAM_FLAG_DETACHED setting
1351 	// of earlier records (if it detects the auto-generation of
1352 	// TLEN is incorrect).  This affects which read-names can be
1353 	// lossily compressed, so we do these in another pass.
1354 	add_read_names(fd, c, s, r1_start);
1355 
1356 	if (c->multi_seq) {
1357 	    s->hdr->ref_seq_id    = -2;
1358 	    s->hdr->ref_seq_start = 0;
1359 	    s->hdr->ref_seq_span  = 0;
1360 	} else {
1361 	    s->hdr->ref_seq_id    = c->ref_id;
1362 	    s->hdr->ref_seq_start = first_base;
1363 	    s->hdr->ref_seq_span  = MAX(0, last_base - first_base + 1);
1364 	}
1365 	s->hdr->num_records = r2;
1366 
1367 	// Processed a slice, now stash the aux blocks so the next
1368 	// slice can start aggregating them from the start again.
1369 	if (c->tags_used->n_occupied) {
1370 	    int ntags = c->tags_used->n_occupied;
1371 	    s->aux_block = calloc(ntags, sizeof(*s->aux_block));
1372 	    if (!s->aux_block)
1373 		return -1;
1374 
1375 	    khint_t k;
1376 
1377 	    s->naux_block = 0;
1378 	    for (k = kh_begin(c->tags_used); k != kh_end(c->tags_used); k++) {
1379 		if (!kh_exist(c->tags_used, k))
1380 		    continue;
1381 
1382 		cram_tag_map *tm = kh_val(c->tags_used, k);
1383 		if (!tm->blk) continue;
1384 		s->aux_block[s->naux_block++] = tm->blk;
1385 		tm->blk = NULL;
1386 	    }
1387 	    assert(s->naux_block <= c->tags_used->n_occupied);
1388 	}
1389     }
1390 
1391     if (c->multi_seq && !fd->no_ref) {
1392 	if (c->ref_seq_id >= 0)
1393 	    cram_ref_decr(fd->refs, c->ref_seq_id);
1394     }
1395 
1396     /* Link our bams[] array onto the spare bam list for reuse */
1397     spares = malloc(sizeof(*spares));
1398     pthread_mutex_lock(&fd->bam_list_lock);
1399     spares->bams = c->bams;
1400     spares->next = fd->bl;
1401     fd->bl = spares;
1402     pthread_mutex_unlock(&fd->bam_list_lock);
1403     c->bams = NULL;
1404 
1405     /* Detect if a multi-seq container */
1406     cram_stats_encoding(fd, c->stats[DS_RI]);
1407     multi_ref = c->stats[DS_RI]->nvals > 1;
1408 
1409     if (multi_ref) {
1410 	if (fd->verbose)
1411 	    fprintf(stderr, "Multi-ref container\n");
1412 	c->ref_seq_id = -2;
1413 	c->ref_seq_start = 0;
1414 	c->ref_seq_span = 0;
1415     }
1416 
1417 
1418     /* Compute MD5s */
1419     for (i = 0; i < c->curr_slice; i++) {
1420 	cram_slice *s = c->slices[i];
1421 
1422 	if (CRAM_MAJOR_VERS(fd->version) != 1) {
1423 	    if (s->hdr->ref_seq_id >= 0 && c->multi_seq == 0 && !fd->no_ref) {
1424 		hts_md5_context *md5 = hts_md5_init();
1425 		if (!md5)
1426 		    return -1;
1427 		hts_md5_update(md5,
1428 			   c->ref + s->hdr->ref_seq_start - c->ref_start,
1429 			   s->hdr->ref_seq_span);
1430 		hts_md5_final(s->hdr->md5, md5);
1431 		hts_md5_destroy(md5);
1432 	    } else {
1433 		memset(s->hdr->md5, 0, 16);
1434 	    }
1435 	}
1436     }
1437 
1438     c->num_records = 0;
1439     c->num_blocks = 1; // cram_block_compression_hdr
1440     c->length = 0;
1441 
1442     //fprintf(stderr, "=== BF ===\n");
1443     h->codecs[DS_BF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BF]),
1444 					 c->stats[DS_BF], E_INT, NULL,
1445 					 fd->version);
1446 
1447     //fprintf(stderr, "=== CF ===\n");
1448     h->codecs[DS_CF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_CF]),
1449 					 c->stats[DS_CF], E_INT, NULL,
1450 					 fd->version);
1451 //    fprintf(stderr, "=== RN ===\n");
1452 //    h->codecs[DS_RN] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RN]),
1453 //				    c->stats[DS_RN], E_BYTE_ARRAY, NULL,
1454 //				    fd->version);
1455 
1456     //fprintf(stderr, "=== AP ===\n");
1457     if (c->pos_sorted) {
1458 	h->codecs[DS_AP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_AP]),
1459 					     c->stats[DS_AP], E_INT, NULL,
1460 					     fd->version);
1461     } else {
1462 	int p[2] = {0, c->max_apos};
1463 	h->codecs[DS_AP] = cram_encoder_init(E_BETA, NULL, E_INT, p,
1464 					     fd->version);
1465     }
1466 
1467     //fprintf(stderr, "=== RG ===\n");
1468     h->codecs[DS_RG] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RG]),
1469 					 c->stats[DS_RG], E_INT, NULL,
1470 					 fd->version);
1471 
1472     //fprintf(stderr, "=== MQ ===\n");
1473     h->codecs[DS_MQ] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_MQ]),
1474 					 c->stats[DS_MQ], E_INT, NULL,
1475 					 fd->version);
1476 
1477     //fprintf(stderr, "=== NS ===\n");
1478     h->codecs[DS_NS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NS]),
1479 					 c->stats[DS_NS], E_INT, NULL,
1480 					 fd->version);
1481 
1482     //fprintf(stderr, "=== MF ===\n");
1483     h->codecs[DS_MF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_MF]),
1484 					 c->stats[DS_MF], E_INT, NULL,
1485 					 fd->version);
1486 
1487     //fprintf(stderr, "=== TS ===\n");
1488     h->codecs[DS_TS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TS]),
1489 					 c->stats[DS_TS], E_INT, NULL,
1490 					 fd->version);
1491     //fprintf(stderr, "=== NP ===\n");
1492     h->codecs[DS_NP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NP]),
1493 					 c->stats[DS_NP], E_INT, NULL,
1494 					 fd->version);
1495     //fprintf(stderr, "=== NF ===\n");
1496     h->codecs[DS_NF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NF]),
1497 					 c->stats[DS_NF], E_INT, NULL,
1498 					 fd->version);
1499 
1500     //fprintf(stderr, "=== RL ===\n");
1501     h->codecs[DS_RL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RL]),
1502 					 c->stats[DS_RL], E_INT, NULL,
1503 					 fd->version);
1504 
1505     //fprintf(stderr, "=== FN ===\n");
1506     h->codecs[DS_FN] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FN]),
1507 					 c->stats[DS_FN], E_INT, NULL,
1508 					 fd->version);
1509 
1510     //fprintf(stderr, "=== FC ===\n");
1511     h->codecs[DS_FC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FC]),
1512 					 c->stats[DS_FC], E_BYTE, NULL,
1513 					 fd->version);
1514 
1515     //fprintf(stderr, "=== FP ===\n");
1516     h->codecs[DS_FP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FP]),
1517 					 c->stats[DS_FP], E_INT, NULL,
1518 					 fd->version);
1519 
1520     //fprintf(stderr, "=== DL ===\n");
1521     h->codecs[DS_DL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_DL]),
1522 					 c->stats[DS_DL], E_INT, NULL,
1523 					 fd->version);
1524 
1525     //fprintf(stderr, "=== BA ===\n");
1526     h->codecs[DS_BA] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BA]),
1527 					 c->stats[DS_BA], E_BYTE, NULL,
1528 					 fd->version);
1529 
1530     if (CRAM_MAJOR_VERS(fd->version) >= 3) {
1531 	cram_byte_array_len_encoder e;
1532 
1533 	e.len_encoding = E_EXTERNAL;
1534 	e.len_dat = (void *)DS_BB_len;
1535 	//e.len_dat = (void *)DS_BB;
1536 
1537 	e.val_encoding = E_EXTERNAL;
1538 	e.val_dat = (void *)DS_BB;
1539 
1540 	h->codecs[DS_BB] = cram_encoder_init(E_BYTE_ARRAY_LEN, NULL,
1541 					     E_BYTE_ARRAY, (void *)&e,
1542 					     fd->version);
1543     } else {
1544 	h->codecs[DS_BB] = NULL;
1545     }
1546 
1547     //fprintf(stderr, "=== BS ===\n");
1548     h->codecs[DS_BS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BS]),
1549 					 c->stats[DS_BS], E_BYTE, NULL,
1550 					 fd->version);
1551 
1552     if (CRAM_MAJOR_VERS(fd->version) == 1) {
1553 	h->codecs[DS_TL] = NULL;
1554 	h->codecs[DS_RI] = NULL;
1555 	h->codecs[DS_RS] = NULL;
1556 	h->codecs[DS_PD] = NULL;
1557 	h->codecs[DS_HC] = NULL;
1558 	h->codecs[DS_SC] = NULL;
1559 
1560 	//fprintf(stderr, "=== TC ===\n");
1561 	h->codecs[DS_TC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TC]),
1562 					     c->stats[DS_TC], E_BYTE, NULL,
1563 					     fd->version);
1564 
1565     //fprintf(stderr, "=== TN ===\n");
1566 	h->codecs[DS_TN] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TN]),
1567 					     c->stats[DS_TN], E_INT, NULL,
1568 					     fd->version);
1569     } else {
1570 	h->codecs[DS_TC] = NULL;
1571 	h->codecs[DS_TN] = NULL;
1572 
1573 	//fprintf(stderr, "=== TL ===\n");
1574 	h->codecs[DS_TL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TL]),
1575 					     c->stats[DS_TL], E_INT, NULL,
1576 					     fd->version);
1577 
1578 
1579 	//fprintf(stderr, "=== RI ===\n");
1580 	h->codecs[DS_RI] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RI]),
1581 					     c->stats[DS_RI], E_INT, NULL,
1582 					     fd->version);
1583 
1584 	//fprintf(stderr, "=== RS ===\n");
1585 	h->codecs[DS_RS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RS]),
1586 					     c->stats[DS_RS], E_INT, NULL,
1587 					     fd->version);
1588 
1589 	//fprintf(stderr, "=== PD ===\n");
1590 	h->codecs[DS_PD] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_PD]),
1591 					     c->stats[DS_PD], E_INT, NULL,
1592 					     fd->version);
1593 
1594 	//fprintf(stderr, "=== HC ===\n");
1595 	h->codecs[DS_HC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_HC]),
1596 					     c->stats[DS_HC], E_INT, NULL,
1597 					     fd->version);
1598 
1599 	//fprintf(stderr, "=== SC ===\n");
1600 	if (1) {
1601 	    int i2[2] = {0, DS_SC};
1602 
1603 	    h->codecs[DS_SC] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL,
1604 						 E_BYTE_ARRAY, (void *)i2,
1605 						 fd->version);
1606 	} else {
1607 	    // Appears to be no practical benefit to using this method,
1608 	    // but it may work better if we start mixing SC, IN and BB
1609 	    // elements into the same external block.
1610 	    cram_byte_array_len_encoder e;
1611 
1612 	    e.len_encoding = E_EXTERNAL;
1613 	    e.len_dat = (void *)DS_SC_len;
1614 
1615 	    e.val_encoding = E_EXTERNAL;
1616 	    e.val_dat = (void *)DS_SC;
1617 
1618 	    h->codecs[DS_SC] = cram_encoder_init(E_BYTE_ARRAY_LEN, NULL,
1619 						 E_BYTE_ARRAY, (void *)&e,
1620 						 fd->version);
1621 	}
1622     }
1623 
1624     //fprintf(stderr, "=== IN ===\n");
1625     {
1626 	int i2[2] = {0, DS_IN};
1627 	h->codecs[DS_IN] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL,
1628 					     E_BYTE_ARRAY, (void *)i2,
1629 					     fd->version);
1630     }
1631 
1632     h->codecs[DS_QS] = cram_encoder_init(E_EXTERNAL, NULL, E_BYTE,
1633 					 (void *)DS_QS,
1634 					 fd->version);
1635     {
1636 	int i2[2] = {0, DS_RN};
1637 	h->codecs[DS_RN] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL,
1638 					     E_BYTE_ARRAY, (void *)i2,
1639 					     fd->version);
1640     }
1641 
1642 
1643     /* Encode slices */
1644     for (i = 0; i < c->curr_slice; i++) {
1645 	if (fd->verbose)
1646 	    fprintf(stderr, "Encode slice %d\n", i);
1647 
1648 	if (cram_encode_slice(fd, c, h, c->slices[i]) != 0)
1649 	    return -1;
1650     }
1651 
1652     /* Create compression header */
1653     {
1654 	h->ref_seq_id    = c->ref_seq_id;
1655 	h->ref_seq_start = c->ref_seq_start;
1656 	h->ref_seq_span  = c->ref_seq_span;
1657 	h->num_records   = c->num_records;
1658 
1659 	h->mapped_qs_included = 0;   // fixme
1660 	h->unmapped_qs_included = 0; // fixme
1661 	h->AP_delta = c->pos_sorted;
1662 	// h->...  fixme
1663 	memcpy(h->substitution_matrix, CRAM_SUBST_MATRIX, 20);
1664 
1665 	if (!(c_hdr = cram_encode_compression_header(fd, c, h)))
1666 	    return -1;
1667     }
1668 
1669     /* Compute landmarks */
1670     /* Fill out slice landmarks */
1671     c->num_landmarks = c->curr_slice;
1672     c->landmark = malloc(c->num_landmarks * sizeof(*c->landmark));
1673     if (!c->landmark)
1674 	return -1;
1675 
1676     /*
1677      * Slice offset starts after the first block, so we need to simulate
1678      * writing it to work out the correct offset
1679      */
1680     {
1681 	slice_offset = c_hdr->method == RAW
1682 	    ? c_hdr->uncomp_size
1683 	    : c_hdr->comp_size;
1684 	slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
1685 	    itf8_size(c_hdr->content_id) +
1686 	    itf8_size(c_hdr->comp_size) +
1687 	    itf8_size(c_hdr->uncomp_size);
1688     }
1689 
1690     c->ref_seq_id    = c->slices[0]->hdr->ref_seq_id;
1691     c->ref_seq_start = c->slices[0]->hdr->ref_seq_start;
1692     c->ref_seq_span  = c->slices[0]->hdr->ref_seq_span;
1693     for (i = 0; i < c->curr_slice; i++) {
1694 	cram_slice *s = c->slices[i];
1695 
1696 	c->num_blocks += s->hdr->num_blocks + 1; // slice header
1697 	c->landmark[i] = slice_offset;
1698 
1699 	if (s->hdr->ref_seq_start + s->hdr->ref_seq_span >
1700 	    c->ref_seq_start + c->ref_seq_span) {
1701 	    c->ref_seq_span = s->hdr->ref_seq_start + s->hdr->ref_seq_span
1702 		- c->ref_seq_start;
1703 	}
1704 
1705 	slice_offset += s->hdr_block->method == RAW
1706 	    ? s->hdr_block->uncomp_size
1707 	    : s->hdr_block->comp_size;
1708 
1709 	slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
1710 	    itf8_size(s->hdr_block->content_id) +
1711 	    itf8_size(s->hdr_block->comp_size) +
1712 	    itf8_size(s->hdr_block->uncomp_size);
1713 
1714 	for (j = 0; j < s->hdr->num_blocks; j++) {
1715 	    slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
1716 		itf8_size(s->block[j]->content_id) +
1717 		itf8_size(s->block[j]->comp_size) +
1718 		itf8_size(s->block[j]->uncomp_size);
1719 
1720 	    slice_offset += s->block[j]->method == RAW
1721 		? s->block[j]->uncomp_size
1722 		: s->block[j]->comp_size;
1723 	}
1724     }
1725     c->length += slice_offset; // just past the final slice
1726 
1727     c->comp_hdr_block = c_hdr;
1728 
1729     if (c->ref_seq_id >= 0) {
1730 	cram_ref_decr(fd->refs, c->ref_seq_id);
1731     }
1732 
1733     /* Cache references up-front if we have unsorted access patterns */
1734     if (!fd->no_ref && c->refs_used) {
1735 	for (i = 0; i < fd->refs->nref; i++) {
1736 	    if (c->refs_used[i])
1737 		cram_ref_decr(fd->refs, i);
1738 	}
1739     }
1740 
1741     return 0;
1742 }
1743 
1744 
1745 /*
1746  * Adds a feature code to a read within a slice. For purposes of minimising
1747  * memory allocations and fragmentation we have one array of features for all
1748  * reads within the slice. We return the index into this array for this new
1749  * feature.
1750  *
1751  * Returns feature index on success
1752  *         -1 on failure.
1753  */
cram_add_feature(cram_container * c,cram_slice * s,cram_record * r,cram_feature * f)1754 static int cram_add_feature(cram_container *c, cram_slice *s,
1755 			    cram_record *r, cram_feature *f) {
1756     if (s->nfeatures >= s->afeatures) {
1757 	s->afeatures = s->afeatures ? s->afeatures*2 : 1024;
1758 	s->features = realloc(s->features, s->afeatures*sizeof(*s->features));
1759 	if (!s->features)
1760 	    return -1;
1761     }
1762 
1763     if (!r->nfeature++) {
1764 	r->feature = s->nfeatures;
1765 	cram_stats_add(c->stats[DS_FP], f->X.pos);
1766     } else {
1767 	cram_stats_add(c->stats[DS_FP],
1768 		       f->X.pos - s->features[r->feature + r->nfeature-2].X.pos);
1769     }
1770     cram_stats_add(c->stats[DS_FC], f->X.code);
1771 
1772     s->features[s->nfeatures++] = *f;
1773 
1774     return 0;
1775 }
1776 
cram_add_substitution(cram_fd * fd,cram_container * c,cram_slice * s,cram_record * r,int pos,char base,char qual,char ref)1777 static int cram_add_substitution(cram_fd *fd, cram_container *c,
1778 				 cram_slice *s, cram_record *r,
1779 				 int pos, char base, char qual, char ref) {
1780     cram_feature f;
1781 
1782     // seq=ACGTN vs ref=ACGT or seq=ACGT vs ref=ACGTN
1783     if (fd->L2[(uc)base]<4 || (fd->L2[(uc)base]<5 && fd->L2[(uc)ref]<4)) {
1784 	f.X.pos = pos+1;
1785 	f.X.code = 'X';
1786 	f.X.base = fd->cram_sub_matrix[ref&0x1f][base&0x1f];
1787 	cram_stats_add(c->stats[DS_BS], f.X.base);
1788     } else {
1789 	f.B.pos = pos+1;
1790 	f.B.code = 'B';
1791 	f.B.base = base;
1792 	f.B.qual = qual;
1793 	cram_stats_add(c->stats[DS_BA], f.B.base);
1794 	cram_stats_add(c->stats[DS_QS], f.B.qual);
1795 	BLOCK_APPEND_CHAR(s->qual_blk, qual);
1796     }
1797     return cram_add_feature(c, s, r, &f);
1798 }
1799 
cram_add_bases(cram_fd * fd,cram_container * c,cram_slice * s,cram_record * r,int pos,int len,char * base)1800 static int cram_add_bases(cram_fd *fd, cram_container *c,
1801 			  cram_slice *s, cram_record *r,
1802 			  int pos, int len, char *base) {
1803     cram_feature f;
1804 
1805     f.b.pos = pos+1;
1806     f.b.code = 'b';
1807     f.b.seq_idx = base - (char *)BLOCK_DATA(s->seqs_blk);
1808     f.b.len = len;
1809 
1810     return cram_add_feature(c, s, r, &f);
1811 }
1812 
cram_add_base(cram_fd * fd,cram_container * c,cram_slice * s,cram_record * r,int pos,char base,char qual)1813 static int cram_add_base(cram_fd *fd, cram_container *c,
1814 			 cram_slice *s, cram_record *r,
1815 			 int pos, char base, char qual) {
1816     cram_feature f;
1817     f.B.pos = pos+1;
1818     f.B.code = 'B';
1819     f.B.base = base;
1820     f.B.qual = qual;
1821     cram_stats_add(c->stats[DS_BA], base);
1822     cram_stats_add(c->stats[DS_QS], qual);
1823     BLOCK_APPEND_CHAR(s->qual_blk, qual);
1824     return cram_add_feature(c, s, r, &f);
1825 }
1826 
cram_add_quality(cram_fd * fd,cram_container * c,cram_slice * s,cram_record * r,int pos,char qual)1827 static int cram_add_quality(cram_fd *fd, cram_container *c,
1828 			    cram_slice *s, cram_record *r,
1829 			    int pos, char qual) {
1830     cram_feature f;
1831     f.Q.pos = pos+1;
1832     f.Q.code = 'Q';
1833     f.Q.qual = qual;
1834     cram_stats_add(c->stats[DS_QS], qual);
1835     BLOCK_APPEND_CHAR(s->qual_blk, qual);
1836     return cram_add_feature(c, s, r, &f);
1837 }
1838 
cram_add_deletion(cram_container * c,cram_slice * s,cram_record * r,int pos,int len,char * base)1839 static int cram_add_deletion(cram_container *c, cram_slice *s, cram_record *r,
1840 			     int pos, int len, char *base) {
1841     cram_feature f;
1842     f.D.pos = pos+1;
1843     f.D.code = 'D';
1844     f.D.len = len;
1845     cram_stats_add(c->stats[DS_DL], len);
1846     return cram_add_feature(c, s, r, &f);
1847 }
1848 
cram_add_softclip(cram_container * c,cram_slice * s,cram_record * r,int pos,int len,char * base,int version)1849 static int cram_add_softclip(cram_container *c, cram_slice *s, cram_record *r,
1850 			     int pos, int len, char *base, int version) {
1851     cram_feature f;
1852     f.S.pos = pos+1;
1853     f.S.code = 'S';
1854     f.S.len = len;
1855     switch (CRAM_MAJOR_VERS(version)) {
1856     case 1:
1857 	f.S.seq_idx = BLOCK_SIZE(s->base_blk);
1858 	BLOCK_APPEND(s->base_blk, base, len);
1859 	BLOCK_APPEND_CHAR(s->base_blk, '\0');
1860 	break;
1861 
1862     case 2:
1863     default:
1864 	f.S.seq_idx = BLOCK_SIZE(s->soft_blk);
1865 	if (base) {
1866 	    BLOCK_APPEND(s->soft_blk, base, len);
1867 	} else {
1868 	    int i;
1869 	    for (i = 0; i < len; i++)
1870 		BLOCK_APPEND_CHAR(s->soft_blk, 'N');
1871 	}
1872 	BLOCK_APPEND_CHAR(s->soft_blk, '\0');
1873 	break;
1874 
1875 //    default:
1876 //	// v3.0 onwards uses BB data-series
1877 //	f.S.seq_idx = BLOCK_SIZE(s->soft_blk);
1878     }
1879     return cram_add_feature(c, s, r, &f);
1880 }
1881 
cram_add_hardclip(cram_container * c,cram_slice * s,cram_record * r,int pos,int len,char * base)1882 static int cram_add_hardclip(cram_container *c, cram_slice *s, cram_record *r,
1883 			     int pos, int len, char *base) {
1884     cram_feature f;
1885     f.S.pos = pos+1;
1886     f.S.code = 'H';
1887     f.S.len = len;
1888     cram_stats_add(c->stats[DS_HC], len);
1889     return cram_add_feature(c, s, r, &f);
1890 }
1891 
cram_add_skip(cram_container * c,cram_slice * s,cram_record * r,int pos,int len,char * base)1892 static int cram_add_skip(cram_container *c, cram_slice *s, cram_record *r,
1893 			     int pos, int len, char *base) {
1894     cram_feature f;
1895     f.S.pos = pos+1;
1896     f.S.code = 'N';
1897     f.S.len = len;
1898     cram_stats_add(c->stats[DS_RS], len);
1899     return cram_add_feature(c, s, r, &f);
1900 }
1901 
cram_add_pad(cram_container * c,cram_slice * s,cram_record * r,int pos,int len,char * base)1902 static int cram_add_pad(cram_container *c, cram_slice *s, cram_record *r,
1903 			     int pos, int len, char *base) {
1904     cram_feature f;
1905     f.S.pos = pos+1;
1906     f.S.code = 'P';
1907     f.S.len = len;
1908     cram_stats_add(c->stats[DS_PD], len);
1909     return cram_add_feature(c, s, r, &f);
1910 }
1911 
cram_add_insertion(cram_container * c,cram_slice * s,cram_record * r,int pos,int len,char * base)1912 static int cram_add_insertion(cram_container *c, cram_slice *s, cram_record *r,
1913 			      int pos, int len, char *base) {
1914     cram_feature f;
1915     f.I.pos = pos+1;
1916     if (len == 1) {
1917 	char b = base ? *base : 'N';
1918 	f.i.code = 'i';
1919 	f.i.base = b;
1920 	cram_stats_add(c->stats[DS_BA], b);
1921     } else {
1922 	f.I.code = 'I';
1923 	f.I.len = len;
1924 	f.S.seq_idx = BLOCK_SIZE(s->base_blk);
1925 	if (base) {
1926 	    BLOCK_APPEND(s->base_blk, base, len);
1927 	} else {
1928 	    int i;
1929 	    for (i = 0; i < len; i++)
1930 		BLOCK_APPEND_CHAR(s->base_blk, 'N');
1931 	}
1932 	BLOCK_APPEND_CHAR(s->base_blk, '\0');
1933     }
1934     return cram_add_feature(c, s, r, &f);
1935 }
1936 
1937 /*
1938  * Encodes auxiliary data.
1939  * Returns the read-group parsed out of the BAM aux fields on success
1940  *         NULL on failure or no rg present (FIXME)
1941  */
cram_encode_aux_1_0(cram_fd * fd,bam_seq_t * b,cram_container * c,cram_slice * s,cram_record * cr)1942 static char *cram_encode_aux_1_0(cram_fd *fd, bam_seq_t *b, cram_container *c,
1943 				 cram_slice *s, cram_record *cr) {
1944     char *aux, *tmp, *rg = NULL;
1945     int aux_size = bam_blk_size(b) -
1946 	((char *)bam_aux(b) - (char *)&bam_ref(b));
1947 
1948     /* Worst case is 1 nul char on every ??:Z: string, so +33% */
1949     BLOCK_GROW(s->aux_blk, aux_size*1.34+1);
1950     tmp = (char *)BLOCK_END(s->aux_blk);
1951 
1952     aux = (char *)bam_aux(b);
1953     cr->TN_idx = s->nTN;
1954 
1955     while (aux[0] != 0) {
1956 	int32_t i32;
1957 	int r;
1958 
1959 	if (aux[0] == 'R' && aux[1] == 'G' && aux[2] == 'Z') {
1960 	    rg = &aux[3];
1961 	    while (*aux++);
1962 	    continue;
1963 	}
1964 	if (aux[0] == 'M' && aux[1] == 'D' && aux[2] == 'Z') {
1965 	    while (*aux++);
1966 	    continue;
1967 	}
1968 	if (aux[0] == 'N' && aux[1] == 'M') {
1969 	    switch(aux[2]) {
1970 	    case 'A': case 'C': case 'c': aux+=4; break;
1971 	    case 'I': case 'i': case 'f': aux+=7; break;
1972 	    default:
1973 		fprintf(stderr, "Unhandled type code for NM tag\n");
1974 		return NULL;
1975 	    }
1976 	    continue;
1977 	}
1978 
1979 	cr->ntags++;
1980 
1981 	i32 = (aux[0]<<16) | (aux[1]<<8) | aux[2];
1982 	kh_put(m_tagmap, c->tags_used, i32, &r);
1983 	if (-1 == r)
1984 	    return NULL;
1985 
1986 	if (s->nTN >= s->aTN) {
1987 	    s->aTN = s->aTN ? s->aTN*2 : 1024;
1988 	    if (!(s->TN = realloc(s->TN, s->aTN * sizeof(*s->TN))))
1989 		return NULL;
1990 	}
1991 	s->TN[s->nTN++] = i32;
1992 	cram_stats_add(c->stats[DS_TN], i32);
1993 
1994 	switch(aux[2]) {
1995 	case 'A': case 'C': case 'c':
1996 	    aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
1997 	    *tmp++=*aux++;
1998 	    break;
1999 
2000 	case 'S': case 's':
2001 	    aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
2002 	    *tmp++=*aux++; *tmp++=*aux++;
2003 	    break;
2004 
2005 	case 'I': case 'i': case 'f':
2006 	    aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
2007 	    *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
2008 	    break;
2009 
2010 	case 'd':
2011 	    aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
2012 	    *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
2013 	    *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
2014 	    break;
2015 
2016 	case 'Z': case 'H':
2017 	    aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
2018 	    while ((*tmp++=*aux++));
2019 	    *tmp++ = '\t'; // stop byte
2020 	    break;
2021 
2022 	case 'B': {
2023 	    int type = aux[3], blen;
2024 	    uint32_t count = (uint32_t)((((unsigned char *)aux)[4]<< 0) +
2025 					(((unsigned char *)aux)[5]<< 8) +
2026 					(((unsigned char *)aux)[6]<<16) +
2027 					(((unsigned char *)aux)[7]<<24));
2028 	    // skip TN field
2029 	    aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
2030 
2031 	    // We use BYTE_ARRAY_LEN with external length, so store that first
2032 	    switch (type) {
2033 	    case 'c': case 'C':
2034 		blen = count;
2035 		break;
2036 	    case 's': case 'S':
2037 		blen = 2*count;
2038 		break;
2039 	    case 'i': case 'I': case 'f':
2040 		blen = 4*count;
2041 		break;
2042 	    default:
2043 		fprintf(stderr, "Unknown sub-type '%c' for aux type 'B'\n",
2044 			type);
2045 		return NULL;
2046 
2047 	    }
2048 
2049 	    tmp += itf8_put(tmp, blen+5);
2050 
2051 	    *tmp++=*aux++; // sub-type & length
2052 	    *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
2053 
2054 	    // The tag data itself
2055 	    memcpy(tmp, aux, blen); tmp += blen; aux += blen;
2056 
2057 	    //cram_stats_add(c->aux_B_stats, blen);
2058 	    break;
2059 	}
2060 	default:
2061 	    fprintf(stderr, "Unknown aux type '%c'\n", aux[2]);
2062 	    return NULL;
2063 	}
2064     }
2065     cram_stats_add(c->stats[DS_TC], cr->ntags);
2066 
2067     cr->aux = BLOCK_SIZE(s->aux_blk);
2068     cr->aux_size = (uc *)tmp - (BLOCK_DATA(s->aux_blk) + cr->aux);
2069     BLOCK_SIZE(s->aux_blk) = (uc *)tmp - BLOCK_DATA(s->aux_blk);
2070     assert(s->aux_blk->byte <= s->aux_blk->alloc);
2071 
2072     return rg;
2073 }
2074 
2075 /*
2076  * Encodes auxiliary data. Largely duplicated from above, but done so to
2077  * keep it simple and avoid a myriad of version ifs.
2078  *
2079  * Returns the read-group parsed out of the BAM aux fields on success
2080  *         NULL on failure or no rg present (FIXME)
2081  */
cram_encode_aux(cram_fd * fd,bam_seq_t * b,cram_container * c,cram_slice * s,cram_record * cr)2082 static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c,
2083 			     cram_slice *s, cram_record *cr) {
2084     char *aux, *orig, *rg = NULL;
2085     int aux_size = bam_get_l_aux(b);
2086     cram_block *td_b = c->comp_hdr->TD_blk;
2087     int TD_blk_size = BLOCK_SIZE(td_b), new;
2088     char *key;
2089     khint_t k;
2090 
2091     orig = aux = (char *)bam_aux(b);
2092 
2093     // Copy aux keys to td_b and aux values to slice aux blocks
2094     while (aux - orig < aux_size && aux[0] != 0) {
2095 	int r;
2096 
2097 	// RG:Z
2098 	if (aux[0] == 'R' && aux[1] == 'G' && aux[2] == 'Z') {
2099 	    rg = &aux[3];
2100 	    while (*aux++);
2101 	    continue;
2102 	}
2103 
2104 	// MD:Z
2105 	if (aux[0] == 'M' && aux[1] == 'D' && aux[2] == 'Z') {
2106 	    if (cr->len && !fd->no_ref && !(cr->flags & BAM_FUNMAP)) {
2107 		while (*aux++);
2108 		continue;
2109 	    }
2110 	}
2111 
2112 	// NM:i
2113 	if (aux[0] == 'N' && aux[1] == 'M') {
2114 	    if (cr->len && !fd->no_ref && !(cr->flags & BAM_FUNMAP)) {
2115 		switch(aux[2]) {
2116 		case 'A': case 'C': case 'c': aux+=4; break;
2117 		case 'S': case 's':           aux+=5; break;
2118 		case 'I': case 'i': case 'f': aux+=7; break;
2119 		default:
2120 		    fprintf(stderr, "Unhandled type code for NM tag\n");
2121 		    return NULL;
2122 		}
2123 		continue;
2124 	    }
2125 	}
2126 
2127 	BLOCK_APPEND(td_b, aux, 3);
2128 
2129 	// Container level tags_used, for TD series
2130 	// Maps integer key ('X0i') to cram_tag_map struct.
2131 	int key = (aux[0]<<16)|(aux[1]<<8)|aux[2];
2132 	k = kh_put(m_tagmap, c->tags_used, key, &r);
2133 	if (-1 == r)
2134 	    return NULL;
2135 
2136 	if (r == 1) {
2137 	    khint_t k_global;
2138 
2139 	    // Global tags_used for cram_metrics support
2140 	    pthread_mutex_lock(&fd->metrics_lock);
2141 	    k_global = kh_put(m_metrics, fd->tags_used, key, &r);
2142 	    if (-1 == r)
2143 		return NULL;
2144 	    if (r == 1)
2145 		kh_val(fd->tags_used, k_global) = cram_new_metrics();
2146 
2147 	    pthread_mutex_unlock(&fd->metrics_lock);
2148 
2149 	    int i2[2] = {'\t',key};
2150 	    size_t sk = key;
2151 	    cram_tag_map *m = calloc(1, sizeof(*m));
2152 	    kh_val(c->tags_used, k) = m;
2153 
2154 	    cram_codec *c;
2155 
2156 	    // Use a block content id based on the tag id.
2157 	    // Codec type depends on tag data type.
2158 	    switch(aux[2]) {
2159 	    case 'Z': case 'H':
2160 		// string as byte_array_stop
2161 		c = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL,
2162 				      E_BYTE_ARRAY, (void *)i2,
2163 				      fd->version);
2164 		break;
2165 
2166 	    case 'A': case 'c': case 'C': {
2167 		// byte array len, 1 byte
2168 		cram_byte_array_len_encoder e;
2169 		cram_stats st;
2170 
2171 		e.len_encoding = E_HUFFMAN;
2172 		e.len_dat = NULL;
2173 		memset(&st, 0, sizeof(st));
2174 		cram_stats_add(&st, 1);
2175 		cram_stats_encoding(fd, &st);
2176 
2177 		e.val_encoding = E_EXTERNAL;
2178 		e.val_dat = (void *)sk;
2179 
2180 		c = cram_encoder_init(E_BYTE_ARRAY_LEN, &st,
2181 				      E_BYTE_ARRAY, (void *)&e,
2182 				      fd->version);
2183 		break;
2184 	    }
2185 
2186 	    case 's': case 'S': {
2187 		// byte array len, 2 byte
2188 		cram_byte_array_len_encoder e;
2189 		cram_stats st;
2190 
2191 		e.len_encoding = E_HUFFMAN;
2192 		e.len_dat = NULL;
2193 		memset(&st, 0, sizeof(st));
2194 		cram_stats_add(&st, 2);
2195 		cram_stats_encoding(fd, &st);
2196 
2197 		e.val_encoding = E_EXTERNAL;
2198 		e.val_dat = (void *)sk;
2199 
2200 		c = cram_encoder_init(E_BYTE_ARRAY_LEN, &st,
2201 				      E_BYTE_ARRAY, (void *)&e,
2202 				      fd->version);
2203 		break;
2204 	    }
2205 	    case 'i': case 'I': case 'f': {
2206 		// byte array len, 4 byte
2207 		cram_byte_array_len_encoder e;
2208 		cram_stats st;
2209 
2210 		e.len_encoding = E_HUFFMAN;
2211 		e.len_dat = NULL;
2212 		memset(&st, 0, sizeof(st));
2213 		cram_stats_add(&st, 4);
2214 		cram_stats_encoding(fd, &st);
2215 
2216 		e.val_encoding = E_EXTERNAL;
2217 		e.val_dat = (void *)sk;
2218 
2219 		c = cram_encoder_init(E_BYTE_ARRAY_LEN, &st,
2220 				      E_BYTE_ARRAY, (void *)&e,
2221 				      fd->version);
2222 		break;
2223 	    }
2224 
2225 	    case 'B': {
2226 		// Byte array of variable size, but we generate our tag
2227 		// byte stream at the wrong stage (during reading and not
2228 		// after slice header construction). So we use
2229 		// BYTE_ARRAY_LEN with the length codec being external
2230 		// too.
2231 		cram_byte_array_len_encoder e;
2232 
2233 		e.len_encoding = E_EXTERNAL;
2234 		e.len_dat = (void *)sk; // or key+128 for len?
2235 
2236 		e.val_encoding = E_EXTERNAL;
2237 		e.val_dat = (void *)sk;
2238 
2239 		c = cram_encoder_init(E_BYTE_ARRAY_LEN, NULL,
2240 				      E_BYTE_ARRAY, (void *)&e,
2241 				      fd->version);
2242 		break;
2243 	    }
2244 
2245 	    default:
2246 		fprintf(stderr, "Unsupported SAM aux type '%c'\n",
2247 			aux[2]);
2248 		c = NULL;
2249 	    }
2250 
2251 	    m->codec = c;
2252 
2253 	    // Link to fd-global tag metrics
2254 	    m->m = k_global ? (cram_metrics *)kh_val(fd->tags_used, k_global) : NULL;
2255 	}
2256 
2257 	cram_tag_map *tm = (cram_tag_map *)kh_val(c->tags_used, k);
2258 	cram_codec *codec = tm->codec;
2259 
2260 	switch(aux[2]) {
2261 	case 'A': case 'C': case 'c':
2262 	    if (!tm->blk) {
2263 		if (!(tm->blk = cram_new_block(EXTERNAL, key)))
2264 		    return NULL;
2265 		codec->e_byte_array_len.val_codec->out = tm->blk;
2266 	    }
2267 
2268 	    aux+=3;
2269 	    //codec->encode(s, codec, aux, 1);
2270 	    // Functionally equivalent, but less code.
2271 	    BLOCK_APPEND_CHAR(tm->blk, *aux);
2272 	    aux++;
2273 	    break;
2274 
2275 	case 'S': case 's':
2276 	    if (!tm->blk) {
2277 		if (!(tm->blk = cram_new_block(EXTERNAL, key)))
2278 		    return NULL;
2279 		codec->e_byte_array_len.val_codec->out = tm->blk;
2280 	    }
2281 
2282 	    aux+=3;
2283 	    //codec->encode(s, codec, aux, 2);
2284 	    BLOCK_APPEND(tm->blk, aux, 2);
2285 	    aux+=2;
2286 	    break;
2287 
2288 	case 'I': case 'i': case 'f':
2289 	    if (!tm->blk) {
2290 		if (!(tm->blk = cram_new_block(EXTERNAL, key)))
2291 		    return NULL;
2292 		codec->e_byte_array_len.val_codec->out = tm->blk;
2293 	    }
2294 
2295 	    aux+=3;
2296 	    //codec->encode(s, codec, aux, 4);
2297 	    BLOCK_APPEND(tm->blk, aux, 4);
2298 	    aux+=4;
2299 	    break;
2300 
2301 	case 'd':
2302 	    if (!tm->blk) {
2303 		if (!(tm->blk = cram_new_block(EXTERNAL, key)))
2304 		    return NULL;
2305 		codec->e_byte_array_len.val_codec->out = tm->blk;
2306 	    }
2307 
2308 	    aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
2309 	    //codec->encode(s, codec, aux, 8);
2310 	    BLOCK_APPEND(tm->blk, aux, 8);
2311 	    aux+=8;
2312 	    break;
2313 
2314 	case 'Z': case 'H':
2315 	    {
2316 		if (!tm->blk) {
2317 		    if (!(tm->blk = cram_new_block(EXTERNAL, key)))
2318 			return NULL;
2319 		    codec->out = tm->blk;
2320 		}
2321 
2322 		char *aux_s;
2323 		aux += 3;
2324 		aux_s = aux;
2325 		while (*aux++);
2326 		codec->encode(s, codec, aux_s, aux - aux_s);
2327 	    }
2328 	    break;
2329 
2330 	case 'B': {
2331 	    int type = aux[3], blen;
2332 	    uint32_t count = (uint32_t)((((unsigned char *)aux)[4]<< 0) +
2333 					(((unsigned char *)aux)[5]<< 8) +
2334 					(((unsigned char *)aux)[6]<<16) +
2335 					(((unsigned char *)aux)[7]<<24));
2336 	    if (!tm->blk) {
2337 		if (!(tm->blk = cram_new_block(EXTERNAL, key)))
2338 		    return NULL;
2339 		codec->e_byte_array_len.len_codec->out = tm->blk;
2340 		codec->e_byte_array_len.val_codec->out = tm->blk;
2341 	    }
2342 
2343 	    // skip TN field
2344 	    aux+=3;
2345 
2346 	    // We use BYTE_ARRAY_LEN with external length, so store that first
2347 	    switch (type) {
2348 	    case 'c': case 'C':
2349 		blen = count;
2350 		break;
2351 	    case 's': case 'S':
2352 		blen = 2*count;
2353 		break;
2354 	    case 'i': case 'I': case 'f':
2355 		blen = 4*count;
2356 		break;
2357 	    default:
2358 		fprintf(stderr, "Unknown sub-type '%c' for aux type 'B'\n",
2359 			type);
2360 		return NULL;
2361 
2362 	    }
2363 
2364 	    blen += 5; // sub-type & length
2365 
2366 	    codec->encode(s, codec, aux, blen);
2367 	    aux += blen;
2368 	    break;
2369 	}
2370 	default:
2371 	    fprintf(stderr, "Unknown aux type '%c'\n", aux[2]);
2372 	    return NULL;
2373 	}
2374 	tm->blk->m = tm->m;
2375     }
2376 
2377     // FIXME: sort BLOCK_DATA(td_b) by char[3] triples
2378 
2379     // And and increment TD hash entry
2380     BLOCK_APPEND_CHAR(td_b, 0);
2381 
2382     // Duplicate key as BLOCK_DATA() can be realloced to a new pointer.
2383     key = string_ndup(c->comp_hdr->TD_keys,
2384 		      (char *)BLOCK_DATA(td_b) + TD_blk_size,
2385 		      BLOCK_SIZE(td_b) - TD_blk_size);
2386     k = kh_put(m_s2i, c->comp_hdr->TD_hash, key, &new);
2387     if (new < 0) {
2388 	return NULL;
2389     } else if (new == 0) {
2390 	BLOCK_SIZE(td_b) = TD_blk_size;
2391     } else {
2392 	kh_val(c->comp_hdr->TD_hash, k) = c->comp_hdr->nTL;
2393 	c->comp_hdr->nTL++;
2394     }
2395 
2396     cr->TL = kh_val(c->comp_hdr->TD_hash, k);
2397     cram_stats_add(c->stats[DS_TL], cr->TL);
2398 
2399     return rg;
2400 }
2401 
2402 /*
2403  * During cram_next_container or before the final flush at end of
2404  * file, we update the current slice headers and increment the slice
2405  * number to the next slice.
2406  *
2407  * See cram_next_container() and cram_close().
2408  */
cram_update_curr_slice(cram_container * c)2409 void cram_update_curr_slice(cram_container *c) {
2410     cram_slice *s = c->slice;
2411     if (c->multi_seq) {
2412 	s->hdr->ref_seq_id    = -2;
2413 	s->hdr->ref_seq_start = 0;
2414 	s->hdr->ref_seq_span  = 0;
2415     } else {
2416 	s->hdr->ref_seq_id    = c->curr_ref;
2417 	s->hdr->ref_seq_start = c->first_base;
2418 	s->hdr->ref_seq_span  = MAX(0, c->last_base - c->first_base + 1);
2419     }
2420     s->hdr->num_records   = c->curr_rec;
2421 
2422     if (c->curr_slice == 0) {
2423 	if (c->ref_seq_id != s->hdr->ref_seq_id)
2424 	    c->ref_seq_id  = s->hdr->ref_seq_id;
2425 	c->ref_seq_start = c->first_base;
2426     }
2427 
2428     c->curr_slice++;
2429 }
2430 
2431 /*
2432  * Handles creation of a new container or new slice, flushing any
2433  * existing containers when appropriate.
2434  *
2435  * Really this is next slice, which may or may not lead to a new container.
2436  *
2437  * Returns cram_container pointer on success
2438  *         NULL on failure.
2439  */
cram_next_container(cram_fd * fd,bam_seq_t * b)2440 static cram_container *cram_next_container(cram_fd *fd, bam_seq_t *b) {
2441     cram_container *c = fd->ctr;
2442     int i;
2443 
2444     /* First occurence */
2445     if (c->curr_ref == -2)
2446 	c->curr_ref = bam_ref(b);
2447 
2448     if (c->slice)
2449 	cram_update_curr_slice(c);
2450 
2451     /* Flush container */
2452     if (c->curr_slice == c->max_slice ||
2453 	(bam_ref(b) != c->curr_ref && !c->multi_seq)) {
2454 	c->ref_seq_span = fd->last_base - c->ref_seq_start + 1;
2455 	if (fd->verbose)
2456 	    fprintf(stderr, "Flush container %d/%d..%d\n",
2457 		    c->ref_seq_id, c->ref_seq_start,
2458 		    c->ref_seq_start + c->ref_seq_span -1);
2459 
2460 	/* Encode slices */
2461 	if (fd->pool) {
2462 	    if (-1 == cram_flush_container_mt(fd, c))
2463 		return NULL;
2464 	} else {
2465 	    if (-1 == cram_flush_container(fd, c))
2466 		return NULL;
2467 
2468 	    // Move to sep func, as we need cram_flush_container for
2469 	    // the closing phase to flush the partial container.
2470 	    for (i = 0; i < c->max_slice; i++) {
2471 		cram_free_slice(c->slices[i]);
2472 		c->slices[i] = NULL;
2473 	    }
2474 
2475 	    c->slice = NULL;
2476 	    c->curr_slice = 0;
2477 
2478 	    /* Easy approach for purposes of freeing stats */
2479 	    cram_free_container(c);
2480 	}
2481 
2482 	c = fd->ctr = cram_new_container(fd->seqs_per_slice,
2483 					 fd->slices_per_container);
2484 	if (!c)
2485 	    return NULL;
2486 	c->record_counter = fd->record_counter;
2487 	c->curr_ref = bam_ref(b);
2488     }
2489 
2490     c->last_pos = c->first_base = c->last_base = bam_pos(b)+1;
2491 
2492     /* New slice */
2493     c->slice = c->slices[c->curr_slice] =
2494 	cram_new_slice(MAPPED_SLICE, c->max_rec);
2495     if (!c->slice)
2496 	return NULL;
2497 
2498     if (c->multi_seq) {
2499 	c->slice->hdr->ref_seq_id = -2;
2500 	c->slice->hdr->ref_seq_start = 0;
2501 	c->slice->last_apos = 1;
2502     } else {
2503 	c->slice->hdr->ref_seq_id = bam_ref(b);
2504 	// wrong for unsorted data, will fix during encoding.
2505 	c->slice->hdr->ref_seq_start = bam_pos(b)+1;
2506 	c->slice->last_apos = bam_pos(b)+1;
2507     }
2508 
2509     c->curr_rec = 0;
2510     c->s_num_bases = 0;
2511 
2512     return c;
2513 }
2514 
2515 /*
2516  * Converts a single bam record into a cram record.
2517  * Possibly used within a thread.
2518  *
2519  * Returns 0 on success;
2520  *        -1 on failure
2521  */
process_one_read(cram_fd * fd,cram_container * c,cram_slice * s,cram_record * cr,bam_seq_t * b,int rnum)2522 static int process_one_read(cram_fd *fd, cram_container *c,
2523 			    cram_slice *s, cram_record *cr,
2524 			    bam_seq_t *b, int rnum) {
2525     int i, fake_qual = -1;
2526     char *cp, *rg;
2527     char *ref, *seq, *qual;
2528 
2529     // FIXME: multi-ref containers
2530 
2531     ref = c->ref;
2532     cr->flags       = bam_flag(b);
2533     cr->len         = bam_seq_len(b);
2534 
2535     //fprintf(stderr, "%s => %d\n", rg ? rg : "\"\"", cr->rg);
2536 
2537     // Fields to resolve later
2538     //cr->mate_line;    // index to another cram_record
2539     //cr->mate_flags;   // MF
2540     //cr->ntags;        // TC
2541     cr->ntags      = 0; //cram_stats_add(c->stats[DS_TC], cr->ntags);
2542     if (CRAM_MAJOR_VERS(fd->version) == 1)
2543 	rg = cram_encode_aux_1_0(fd, b, c, s, cr);
2544     else
2545 	rg = cram_encode_aux(fd, b, c, s, cr);
2546 
2547     //cr->aux_size = b->blk_size - ((char *)bam_aux(b) - (char *)&bam_ref(b));
2548     //cr->aux = DSTRING_LEN(s->aux_ds);
2549     //dstring_nappend(s->aux_ds, bam_aux(b), cr->aux_size);
2550 
2551     /* Read group, identified earlier */
2552     if (rg) {
2553 	SAM_RG *brg = sam_hdr_find_rg(fd->header, rg);
2554 	cr->rg = brg ? brg->id : -1;
2555     } else if (CRAM_MAJOR_VERS(fd->version) == 1) {
2556 	SAM_RG *brg = sam_hdr_find_rg(fd->header, "UNKNOWN");
2557 	assert(brg);
2558     } else {
2559 	cr->rg = -1;
2560     }
2561     cram_stats_add(c->stats[DS_RG], cr->rg);
2562 
2563 
2564     cr->ref_id      = bam_ref(b);  cram_stats_add(c->stats[DS_RI], cr->ref_id);
2565     cram_stats_add(c->stats[DS_BF], fd->cram_flag_swap[cr->flags & 0xfff]);
2566 
2567     // Non reference based encoding means storing the bases verbatim as features, which in
2568     // turn means every base also has a quality already stored.
2569     if (!fd->no_ref || CRAM_MAJOR_VERS(fd->version) >= 3)
2570 	cr->cram_flags |= CRAM_FLAG_PRESERVE_QUAL_SCORES;
2571 
2572     if (cr->len <= 0 && CRAM_MAJOR_VERS(fd->version) >= 3)
2573 	cr->cram_flags |= CRAM_FLAG_NO_SEQ;
2574     //cram_stats_add(c->stats[DS_CF], cr->cram_flags & CRAM_FLAG_MASK);
2575 
2576     c->num_bases   += cr->len;
2577     cr->apos        = bam_pos(b)+1;
2578     if (c->pos_sorted) {
2579 	if (cr->apos < s->last_apos) {
2580 	    c->pos_sorted = 0;
2581 	} else {
2582 	    cram_stats_add(c->stats[DS_AP], cr->apos - s->last_apos);
2583 	    s->last_apos = cr->apos;
2584 	}
2585     } else {
2586 	//cram_stats_add(c->stats[DS_AP], cr->apos);
2587     }
2588     c->max_apos += (cr->apos > c->max_apos) * (cr->apos - c->max_apos);
2589 
2590     /*
2591      * This seqs_ds is largely pointless and it could reuse the same memory
2592      * over and over.
2593      * s->base_blk is what we need for encoding.
2594      */
2595     cr->seq         = BLOCK_SIZE(s->seqs_blk);
2596     cr->qual        = BLOCK_SIZE(s->qual_blk);
2597     BLOCK_GROW(s->seqs_blk, cr->len+1);
2598     BLOCK_GROW(s->qual_blk, cr->len);
2599     seq = cp = (char *)BLOCK_END(s->seqs_blk);
2600 
2601     *seq = 0;
2602 #if HTS_ALLOW_UNALIGNED != 0
2603     {
2604 	// Convert seq 2 bases at a time for speed.
2605 	static const uint16_t code2base[256] = {
2606 	    15677, 16701, 17213, 19773, 18237, 21053, 21309, 22077,
2607 	    21565, 22333, 22845, 18493, 19261, 17469, 16957, 20029,
2608 	    15681, 16705, 17217, 19777, 18241, 21057, 21313, 22081,
2609 	    21569, 22337, 22849, 18497, 19265, 17473, 16961, 20033,
2610 	    15683, 16707, 17219, 19779, 18243, 21059, 21315, 22083,
2611 	    21571, 22339, 22851, 18499, 19267, 17475, 16963, 20035,
2612 	    15693, 16717, 17229, 19789, 18253, 21069, 21325, 22093,
2613 	    21581, 22349, 22861, 18509, 19277, 17485, 16973, 20045,
2614 	    15687, 16711, 17223, 19783, 18247, 21063, 21319, 22087,
2615 	    21575, 22343, 22855, 18503, 19271, 17479, 16967, 20039,
2616 	    15698, 16722, 17234, 19794, 18258, 21074, 21330, 22098,
2617 	    21586, 22354, 22866, 18514, 19282, 17490, 16978, 20050,
2618 	    15699, 16723, 17235, 19795, 18259, 21075, 21331, 22099,
2619 	    21587, 22355, 22867, 18515, 19283, 17491, 16979, 20051,
2620 	    15702, 16726, 17238, 19798, 18262, 21078, 21334, 22102,
2621 	    21590, 22358, 22870, 18518, 19286, 17494, 16982, 20054,
2622 	    15700, 16724, 17236, 19796, 18260, 21076, 21332, 22100,
2623 	    21588, 22356, 22868, 18516, 19284, 17492, 16980, 20052,
2624 	    15703, 16727, 17239, 19799, 18263, 21079, 21335, 22103,
2625 	    21591, 22359, 22871, 18519, 19287, 17495, 16983, 20055,
2626 	    15705, 16729, 17241, 19801, 18265, 21081, 21337, 22105,
2627 	    21593, 22361, 22873, 18521, 19289, 17497, 16985, 20057,
2628 	    15688, 16712, 17224, 19784, 18248, 21064, 21320, 22088,
2629 	    21576, 22344, 22856, 18504, 19272, 17480, 16968, 20040,
2630 	    15691, 16715, 17227, 19787, 18251, 21067, 21323, 22091,
2631 	    21579, 22347, 22859, 18507, 19275, 17483, 16971, 20043,
2632 	    15684, 16708, 17220, 19780, 18244, 21060, 21316, 22084,
2633 	    21572, 22340, 22852, 18500, 19268, 17476, 16964, 20036,
2634 	    15682, 16706, 17218, 19778, 18242, 21058, 21314, 22082,
2635 	    21570, 22338, 22850, 18498, 19266, 17474, 16962, 20034,
2636 	    15694, 16718, 17230, 19790, 18254, 21070, 21326, 22094,
2637 	    21582, 22350, 22862, 18510, 19278, 17486, 16974, 20046
2638 	};
2639 
2640 	int l2 = cr->len / 2;
2641 	unsigned char *from = (unsigned char *)bam_seq(b);
2642 	uint16_t *cpi = (uint16_t *)cp;
2643 	cp[0] = 0;
2644 	for (i = 0; i < l2; i++)
2645 	    cpi[i] = le_int2(code2base[from[i]]);
2646 	if ((i *= 2) < cr->len)
2647 	    cp[i] = seq_nt16_str[bam_seqi(bam_seq(b), i)];
2648     }
2649 #else
2650     for (i = 0; i < cr->len; i++)
2651 	cp[i] = seq_nt16_str[bam_seqi(bam_seq(b), i)];
2652 #endif
2653     BLOCK_SIZE(s->seqs_blk) += cr->len;
2654 
2655     qual = cp = (char *)bam_qual(b);
2656 
2657     /* Copy and parse */
2658     if (!(cr->flags & BAM_FUNMAP)) {
2659 	uint32_t *cig_to, *cig_from;
2660 	int apos = cr->apos-1, spos = 0;
2661 
2662 	cr->cigar       = s->ncigar;
2663 	cr->ncigar      = bam_cigar_len(b);
2664 	while (cr->cigar + cr->ncigar >= s->cigar_alloc) {
2665 	    s->cigar_alloc = s->cigar_alloc ? s->cigar_alloc*2 : 1024;
2666 	    s->cigar = realloc(s->cigar, s->cigar_alloc * sizeof(*s->cigar));
2667 	    if (!s->cigar)
2668 		return -1;
2669 	}
2670 
2671 	cig_to = (uint32_t *)s->cigar;
2672 	cig_from = (uint32_t *)bam_cigar(b);
2673 
2674 	cr->feature = 0;
2675 	cr->nfeature = 0;
2676 	for (i = 0; i < cr->ncigar; i++) {
2677 	    enum cigar_op cig_op = cig_from[i] & BAM_CIGAR_MASK;
2678 	    uint32_t cig_len = cig_from[i] >> BAM_CIGAR_SHIFT;
2679 	    cig_to[i] = cig_from[i];
2680 
2681 	    /* Can also generate events from here for CRAM diffs */
2682 
2683 	    switch (cig_op) {
2684 		int l;
2685 
2686 		// Don't trust = and X ops to be correct.
2687 	    case BAM_CMATCH:
2688 	    case BAM_CBASE_MATCH:
2689 	    case BAM_CBASE_MISMATCH:
2690 		//fprintf(stderr, "\nBAM_CMATCH\nR: %.*s\nS: %.*s\n",
2691 		//	cig_len, &ref[apos], cig_len, &seq[spos]);
2692 		l = 0;
2693 		if (!fd->no_ref && cr->len) {
2694 		    int end = cig_len+apos < c->ref_end
2695 			? cig_len : c->ref_end - apos;
2696 		    char *sp = &seq[spos];
2697 		    char *rp = &ref[apos];
2698 		    char *qp = &qual[spos];
2699 		    if (end > cr->len) {
2700 			fprintf(stderr, "CIGAR and query sequence are of "
2701 				"different length\n");
2702 			return -1;
2703 		    }
2704 		    for (l = 0; l < end; l++) {
2705 			if (rp[l] != sp[l]) {
2706 			    if (!sp[l])
2707 				break;
2708 			    if (0 && CRAM_MAJOR_VERS(fd->version) >= 3) {
2709 				// Disabled for the time being as it doesn't
2710 				// seem to gain us much.
2711 				int ol=l;
2712 				while (l<end && rp[l] != sp[l])
2713 				    l++;
2714 				if (l-ol > 1) {
2715 				    if (cram_add_bases(fd, c, s, cr, spos+ol,
2716 						       l-ol, &seq[spos+ol]))
2717 					return -1;
2718 				    l--;
2719 				} else {
2720 				    l = ol;
2721 				    if (cram_add_substitution(fd, c, s, cr,
2722 							      spos+l, sp[l],
2723 							      qp[l], rp[l]))
2724 					return -1;
2725 				}
2726 			    } else {
2727 				if (cram_add_substitution(fd, c, s, cr, spos+l,
2728 							  sp[l], qp[l], rp[l]))
2729 				    return -1;
2730 			    }
2731 			}
2732 		    }
2733 		    spos += l;
2734 		    apos += l;
2735 		}
2736 
2737 		if (l < cig_len && cr->len) {
2738 		    if (fd->no_ref) {
2739 			if (CRAM_MAJOR_VERS(fd->version) == 3) {
2740 			    if (cram_add_bases(fd, c, s, cr, spos,
2741 					       cig_len-l, &seq[spos]))
2742 				return -1;
2743 			    spos += cig_len-l;
2744 			} else {
2745 			    for (; l < cig_len && seq[spos]; l++, spos++) {
2746 				if (cram_add_base(fd, c, s, cr, spos,
2747 						  seq[spos], qual[spos]))
2748 				    return -1;
2749 			    }
2750 			}
2751 		    } else {
2752 			/* off end of sequence or non-ref based output */
2753 			for (; l < cig_len && seq[spos]; l++, spos++) {
2754 			    if (cram_add_base(fd, c, s, cr, spos,
2755 					      seq[spos], qual[spos]))
2756 				return -1;
2757 			}
2758 		    }
2759 		    apos += cig_len;
2760 		} else if (!cr->len) {
2761 		    /* Seq "*" */
2762 		    apos += cig_len;
2763 		    spos += cig_len;
2764 		}
2765 		break;
2766 
2767 	    case BAM_CDEL:
2768 		if (cram_add_deletion(c, s, cr, spos, cig_len, &seq[spos]))
2769 		    return -1;
2770 		apos += cig_len;
2771 		break;
2772 
2773 	    case BAM_CREF_SKIP:
2774 		if (cram_add_skip(c, s, cr, spos, cig_len, &seq[spos]))
2775 		    return -1;
2776 		apos += cig_len;
2777 		break;
2778 
2779 	    case BAM_CINS:
2780 		if (cram_add_insertion(c, s, cr, spos, cig_len,
2781 				       cr->len ? &seq[spos] : NULL))
2782 		    return -1;
2783 		if (fd->no_ref && cr->len) {
2784 		    for (l = 0; l < cig_len; l++, spos++) {
2785 			cram_add_quality(fd, c, s, cr, spos, qual[spos]);
2786 		    }
2787 		} else {
2788 		    spos += cig_len;
2789 		}
2790 		break;
2791 
2792 	    case BAM_CSOFT_CLIP:
2793 		if (cram_add_softclip(c, s, cr, spos, cig_len,
2794 				      cr->len ? &seq[spos] : NULL,
2795 				      fd->version))
2796 		    return -1;
2797 
2798 		if (fd->no_ref &&
2799 		    !(cr->cram_flags & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
2800 		    if (cr->len) {
2801 			for (l = 0; l < cig_len; l++, spos++) {
2802 			    cram_add_quality(fd, c, s, cr, spos, qual[spos]);
2803 			}
2804 		    } else {
2805 			for (l = 0; l < cig_len; l++, spos++) {
2806 			    cram_add_quality(fd, c, s, cr, spos, -1);
2807 			}
2808 		    }
2809 		} else {
2810 		    spos += cig_len;
2811 		}
2812 		break;
2813 
2814 	    case BAM_CHARD_CLIP:
2815 		if (cram_add_hardclip(c, s, cr, spos, cig_len, &seq[spos]))
2816 		    return -1;
2817 		break;
2818 
2819 	    case BAM_CPAD:
2820 		if (cram_add_pad(c, s, cr, spos, cig_len, &seq[spos]))
2821 		    return -1;
2822 		break;
2823 
2824 	    default:
2825 		fprintf(stderr, "Unknown CIGAR op code %d\n", cig_op);
2826 		return -1;
2827 	    }
2828 	}
2829 	if (cr->len && spos != cr->len) {
2830 	    fprintf(stderr, "CIGAR and query sequence are of different "
2831 		    "length\n");
2832 	    return -1;
2833 	}
2834 	fake_qual = spos;
2835 	cr->aend = fd->no_ref ? apos : MIN(apos, c->ref_end);
2836 	cram_stats_add(c->stats[DS_FN], cr->nfeature);
2837     } else {
2838 	// Unmapped
2839 	cr->cram_flags |= CRAM_FLAG_PRESERVE_QUAL_SCORES;
2840 	cr->cigar  = 0;
2841 	cr->ncigar = 0;
2842 	cr->nfeature = 0;
2843 	cr->aend = cr->apos;
2844 	for (i = 0; i < cr->len; i++)
2845 	    cram_stats_add(c->stats[DS_BA], seq[i]);
2846 	fake_qual = 0;
2847     }
2848 
2849     /*
2850      * Append to the qual block now. We do this here as
2851      * cram_add_substitution() can generate BA/QS events which need to
2852      * be in the qual block before we append the rest of the data.
2853      */
2854     if (cr->cram_flags & CRAM_FLAG_PRESERVE_QUAL_SCORES) {
2855 	/* Special case of seq "*" */
2856 	if (cr->len == 0) {
2857 	    cr->len = fake_qual;
2858 	    BLOCK_GROW(s->qual_blk, cr->len);
2859 	    cp = (char *)BLOCK_END(s->qual_blk);
2860 	    memset(cp, 255, cr->len);
2861 	} else {
2862 	    BLOCK_GROW(s->qual_blk, cr->len);
2863 	    cp = (char *)BLOCK_END(s->qual_blk);
2864 	    char *from = (char *)&bam_qual(b)[0];
2865 	    char *to = &cp[0];
2866 	    memcpy(to, from, cr->len);
2867 	    //for (i = 0; i < cr->len; i++) cp[i] = from[i];
2868 	}
2869 	BLOCK_SIZE(s->qual_blk) += cr->len;
2870     } else {
2871 	if (cr->len == 0)
2872 	    cr->len = fake_qual >= 0 ? fake_qual : cr->aend - cr->apos + 1;
2873     }
2874 
2875     cram_stats_add(c->stats[DS_RL], cr->len);
2876 
2877     /* Now we know apos and aend both, update mate-pair information */
2878     {
2879 	int new;
2880 	khint_t k;
2881 	int sec = (cr->flags & BAM_FSECONDARY) ? 1 : 0;
2882 
2883 	//fprintf(stderr, "Checking %"PRId64"/%.*s\t", rnum,
2884 	//	cr->name_len, DSTRING_STR(s->name_ds)+cr->name);
2885 	if (cr->flags & BAM_FPAIRED) {
2886 	    char *key = string_ndup(s->pair_keys, bam_name(b), bam_name_len(b));
2887 	    if (!key)
2888 		return -1;
2889 
2890 	    k = kh_put(m_s2i, s->pair[sec], key, &new);
2891 	    if (-1 == new)
2892 		return -1;
2893 	    else if (new > 0)
2894 		kh_val(s->pair[sec], k) = rnum;
2895 	} else {
2896 	    new = 1;
2897 	}
2898 
2899 	if (new == 0) {
2900 	    cram_record *p = &s->crecs[kh_val(s->pair[sec], k)];
2901 	    int aleft, aright, sign;
2902 
2903 	    aleft = MIN(cr->apos, p->apos);
2904 	    aright = MAX(cr->aend, p->aend);
2905 	    if (cr->apos < p->apos) {
2906 		sign = 1;
2907 	    } else if (cr->apos > p->apos) {
2908 		sign = -1;
2909 	    } else if (cr->flags & BAM_FREAD1) {
2910 		sign = 1;
2911 	    } else {
2912 		sign = -1;
2913 	    }
2914 
2915 	    // This vs p: tlen, matepos, flags. Permit TLEN 0 and/or TLEN +/-
2916 	    // a small amount, if appropriate options set.
2917 	    if ((bam_ins_size(b) &&
2918 		 abs(bam_ins_size(b) - sign*(aright-aleft+1)) > fd->tlen_approx) ||
2919 		(!bam_ins_size(b) && !fd->tlen_zero))
2920 		goto detached;
2921 
2922 	    if ((!fd->tlen_zero && MAX(bam_mate_pos(b)+1, 0) != p->apos) &&
2923 		!(fd->tlen_zero && bam_mate_pos(b) == 0))
2924 		goto detached;
2925 
2926 	    if (((bam_flag(b) & BAM_FMUNMAP) != 0) !=
2927 		((p->flags & BAM_FUNMAP) != 0))
2928 		goto detached;
2929 
2930 	    if (((bam_flag(b) & BAM_FMREVERSE) != 0) !=
2931 		((p->flags & BAM_FREVERSE) != 0))
2932 		goto detached;
2933 
2934 
2935 	    // p vs this: tlen, matepos, flags
2936 	    if (p->ref_id != cr->ref_id &&
2937 		!(fd->tlen_zero && p->ref_id == -1))
2938 		goto detached;
2939 
2940 	    if ((p->tlen && abs(p->tlen - -sign*(aright-aleft+1)) > fd->tlen_approx) ||
2941 		(!p->tlen && !fd->tlen_zero))
2942 		goto detached;
2943 
2944 	    if (p->mate_pos != cr->apos &&
2945 		!(fd->tlen_zero && p->mate_pos == 0))
2946 		goto detached;
2947 
2948 	    if (((p->flags & BAM_FMUNMAP) != 0) !=
2949 		((p->mate_flags & CRAM_M_UNMAP) != 0))
2950 		goto detached;
2951 
2952 	    if (((p->flags & BAM_FMREVERSE) != 0) !=
2953 		((p->mate_flags & CRAM_M_REVERSE) != 0))
2954 		goto detached;
2955 
2956 	    // Supplementary reads are just too ill defined
2957 	    if ((cr->flags & BAM_FSUPPLEMENTARY) ||
2958 		(p->flags & BAM_FSUPPLEMENTARY))
2959 		goto detached;
2960 
2961 	    // When in lossy name mode, if a read isn't detached we
2962 	    // cannot store the name.  The corollary is that when we
2963 	    // must store the name, it must be detached (inefficient).
2964 	    if (fd->lossy_read_names &&
2965 		(!(cr->cram_flags & CRAM_FLAG_DISCARD_NAME) ||
2966 		 !((p->cram_flags & CRAM_FLAG_DISCARD_NAME))))
2967 		goto detached;
2968 
2969 	    /*
2970 	     * The fields below are unused when encoding this read as it is
2971 	     * no longer detached.  In theory they may get referred to when
2972 	     * processing a 3rd or 4th read in this template?, so we set them
2973 	     * here just to be sure.
2974 	     *
2975 	     * They do not need cram_stats_add() calls those as they are
2976 	     * not emitted.
2977 	     */
2978 	    cr->mate_pos = p->apos;
2979 	    cr->tlen = sign*(aright-aleft+1);
2980 	    cr->mate_flags =
2981 	    	((p->flags & BAM_FMUNMAP)   == BAM_FMUNMAP)   * CRAM_M_UNMAP +
2982 	    	((p->flags & BAM_FMREVERSE) == BAM_FMREVERSE) * CRAM_M_REVERSE;
2983 
2984 	    // Decrement statistics aggregated earlier
2985 	    if (p->cram_flags & CRAM_FLAG_STATS_ADDED) {
2986 		cram_stats_del(c->stats[DS_NP], p->mate_pos);
2987 		cram_stats_del(c->stats[DS_MF], p->mate_flags);
2988 		cram_stats_del(c->stats[DS_TS], p->tlen);
2989 		cram_stats_del(c->stats[DS_NS], p->mate_ref_id);
2990 	    }
2991 
2992 	    /* Similarly we could correct the p-> values too, but these will no
2993 	     * longer have any code that refers back to them as the new 'p'
2994 	     * for this template is our current 'cr'.
2995 	     */
2996 	    //p->mate_pos = cr->apos;
2997 	    //p->mate_flags =
2998 	    //	((cr->flags & BAM_FMUNMAP)   == BAM_FMUNMAP)  * CRAM_M_UNMAP +
2999 	    //	((cr->flags & BAM_FMREVERSE) == BAM_FMREVERSE)* CRAM_M_REVERSE;
3000 	    //p->tlen = p->apos - cr->aend;
3001 
3002 	    // Clear detached from cr flags
3003 	    cr->cram_flags &= ~CRAM_FLAG_DETACHED;
3004 	    cram_stats_add(c->stats[DS_CF], cr->cram_flags & CRAM_FLAG_MASK);
3005 
3006 	    // Clear detached from p flags and set downstream
3007 	    if (p->cram_flags & CRAM_FLAG_STATS_ADDED) {
3008 		cram_stats_del(c->stats[DS_CF], p->cram_flags & CRAM_FLAG_MASK);
3009 		p->cram_flags &= ~CRAM_FLAG_STATS_ADDED;
3010 	    }
3011 
3012 	    p->cram_flags  &= ~CRAM_FLAG_DETACHED;
3013 	    p->cram_flags  |=  CRAM_FLAG_MATE_DOWNSTREAM;
3014 	    cram_stats_add(c->stats[DS_CF], p->cram_flags & CRAM_FLAG_MASK);
3015 
3016 	    p->mate_line = rnum - (kh_val(s->pair[sec], k) + 1);
3017 	    cram_stats_add(c->stats[DS_NF], p->mate_line);
3018 
3019 	    kh_val(s->pair[sec], k) = rnum;
3020 	} else {
3021 	detached:
3022 	    //fprintf(stderr, "unpaired\n");
3023 
3024 	    /* Derive mate flags from this flag */
3025 	    cr->mate_flags = 0;
3026 	    if (bam_flag(b) & BAM_FMUNMAP)
3027 		cr->mate_flags |= CRAM_M_UNMAP;
3028 	    if (bam_flag(b) & BAM_FMREVERSE)
3029 		cr->mate_flags |= CRAM_M_REVERSE;
3030 
3031 	    cram_stats_add(c->stats[DS_MF], cr->mate_flags);
3032 
3033 	    cr->mate_pos    = MAX(bam_mate_pos(b)+1, 0);
3034 	    cram_stats_add(c->stats[DS_NP], cr->mate_pos);
3035 
3036 	    cr->tlen        = bam_ins_size(b);
3037 	    cram_stats_add(c->stats[DS_TS], cr->tlen);
3038 
3039 	    cr->cram_flags |= CRAM_FLAG_DETACHED;
3040 	    cram_stats_add(c->stats[DS_CF], cr->cram_flags & CRAM_FLAG_MASK);
3041 	    cram_stats_add(c->stats[DS_NS], bam_mate_ref(b));
3042 
3043 	    cr->cram_flags |= CRAM_FLAG_STATS_ADDED;
3044 	}
3045     }
3046 
3047     cr->mqual       = bam_map_qual(b);
3048     cram_stats_add(c->stats[DS_MQ], cr->mqual);
3049 
3050     cr->mate_ref_id = bam_mate_ref(b);
3051 
3052     if (!(bam_flag(b) & BAM_FUNMAP)) {
3053 	if (c->first_base > cr->apos)
3054 	    c->first_base = cr->apos;
3055 
3056 	if (c->last_base < cr->aend)
3057 	    c->last_base = cr->aend;
3058     }
3059 
3060     return 0;
3061 }
3062 
3063 /*
3064  * Write iterator: put BAM format sequences into a CRAM file.
3065  * We buffer up a containers worth of data at a time.
3066  *
3067  * Returns 0 on success
3068  *        -1 on failure
3069  */
cram_put_bam_seq(cram_fd * fd,bam_seq_t * b)3070 int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) {
3071     cram_container *c;
3072 
3073     if (!fd->ctr) {
3074 	fd->ctr = cram_new_container(fd->seqs_per_slice,
3075 				     fd->slices_per_container);
3076 	if (!fd->ctr)
3077 	    return -1;
3078 	fd->ctr->record_counter = fd->record_counter;
3079     }
3080     c = fd->ctr;
3081 
3082     if (!c->slice || c->curr_rec == c->max_rec ||
3083 	(bam_ref(b) != c->curr_ref && c->curr_ref >= -1) ||
3084 	(c->s_num_bases >= fd->bases_per_slice)) {
3085 	int slice_rec, curr_rec, multi_seq = fd->multi_seq == 1;
3086 	int curr_ref = c->slice ? c->curr_ref : bam_ref(b);
3087 
3088 	/*
3089 	 * Start packing slices when we routinely have under 1/4tr full.
3090 	 *
3091 	 * This option isn't available if we choose to embed references
3092 	 * since we can only have one per slice.
3093 	 */
3094 	if (fd->multi_seq == -1 && c->curr_rec < c->max_rec/4+10 &&
3095 	    fd->last_slice && fd->last_slice < c->max_rec/4+10 &&
3096 	    !fd->embed_ref) {
3097 	    if (fd->verbose && !c->multi_seq)
3098 		fprintf(stderr, "Multi-ref enabled for this container\n");
3099 	    multi_seq = 1;
3100 	}
3101 
3102 	slice_rec = c->slice_rec;
3103 	curr_rec  = c->curr_rec;
3104 
3105 	if (CRAM_MAJOR_VERS(fd->version) == 1 ||
3106 	    c->curr_rec == c->max_rec || fd->multi_seq != 1 || !c->slice ||
3107 	    c->s_num_bases >= fd->bases_per_slice) {
3108 	    if (NULL == (c = cram_next_container(fd, b))) {
3109 		if (fd->ctr) {
3110 		    // prevent cram_close attempting to flush
3111 		    cram_free_container(fd->ctr);
3112 		    fd->ctr = NULL;
3113 		}
3114 		return -1;
3115 	    }
3116 	}
3117 
3118 	/*
3119 	 * Due to our processing order, some things we've already done we
3120 	 * cannot easily undo. So when we first notice we should be packing
3121 	 * multiple sequences per container we emit the small partial
3122 	 * container as-is and then start a fresh one in a different mode.
3123 	 */
3124 	if (multi_seq) {
3125 	    fd->multi_seq = 1;
3126 	    c->multi_seq = 1;
3127 	    c->pos_sorted = 0; // required atm for multi_seq slices
3128 
3129 	    if (!c->refs_used) {
3130 		pthread_mutex_lock(&fd->ref_lock);
3131 		c->refs_used = calloc(fd->refs->nref, sizeof(int));
3132 		pthread_mutex_unlock(&fd->ref_lock);
3133 		if (!c->refs_used)
3134 		    return -1;
3135 	    }
3136 	}
3137 
3138 	fd->last_slice = curr_rec - slice_rec;
3139 	c->slice_rec = c->curr_rec;
3140 
3141 	// Have we seen this reference before?
3142 	if (bam_ref(b) >= 0 && curr_ref >= 0 && bam_ref(b) != curr_ref && !fd->embed_ref &&
3143 	    !fd->unsorted && multi_seq) {
3144 
3145 	    if (!c->refs_used) {
3146 		pthread_mutex_lock(&fd->ref_lock);
3147 		c->refs_used = calloc(fd->refs->nref, sizeof(int));
3148 		pthread_mutex_unlock(&fd->ref_lock);
3149 		if (!c->refs_used)
3150 		    return -1;
3151 	    } else if (c->refs_used && c->refs_used[bam_ref(b)]) {
3152 		pthread_mutex_lock(&fd->ref_lock);
3153 		fd->unsorted = 1;
3154 		pthread_mutex_unlock(&fd->ref_lock);
3155 		fd->multi_seq = 1;
3156 	    }
3157 	}
3158 
3159 	c->curr_ref = bam_ref(b);
3160 	if (c->refs_used && c->curr_ref >= 0) c->refs_used[c->curr_ref]++;
3161     }
3162 
3163     if (!c->bams) {
3164 	/* First time through, allocate a set of bam pointers */
3165 	pthread_mutex_lock(&fd->bam_list_lock);
3166 	if (fd->bl) {
3167 	    spare_bams *spare = fd->bl;
3168 	    c->bams = spare->bams;
3169 	    fd->bl = spare->next;
3170 	    free(spare);
3171 	} else {
3172 	    c->bams = calloc(c->max_c_rec, sizeof(bam_seq_t *));
3173 	    if (!c->bams)
3174 		return -1;
3175 	}
3176 	pthread_mutex_unlock(&fd->bam_list_lock);
3177     }
3178 
3179     /* Copy or alloc+copy the bam record, for later encoding */
3180     if (c->bams[c->curr_c_rec])
3181 	bam_copy1(c->bams[c->curr_c_rec], b);
3182     else
3183 	c->bams[c->curr_c_rec] = bam_dup(b);
3184 
3185     c->curr_rec++;
3186     c->curr_c_rec++;
3187     c->s_num_bases += bam_seq_len(b);
3188     fd->record_counter++;
3189 
3190     return 0;
3191 }
3192